class EntityResolver:
"""Resolve riverbank IRIs to Wikidata Q-ids.
Parameters
----------
wikidata_client:
A :class:`~riverbank.eval.wikidata_client.WikidataClient` instance used
for live Wikidata lookups. Can be *None* in offline / test scenarios;
resolution will fall back to label-only heuristics.
min_fuzzy_ratio:
Minimum similarity score (0–100) for fuzzy label matching.
"""
def __init__(
self,
wikidata_client=None, # WikidataClient — avoid circular import
min_fuzzy_ratio: float = 85.0,
) -> None:
self._wikidata = wikidata_client
self.min_fuzzy_ratio = min_fuzzy_ratio
self._cache = ResolutionCache()
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
def resolve_entity(
self,
riverbank_iri: str,
article_title: str,
article_qid: str = "",
context_type: str = "",
candidate_qids: list[str] | None = None,
) -> EntityMatch | None:
"""Attempt to resolve a riverbank IRI to a Wikidata Q-id.
Parameters
----------
riverbank_iri:
The riverbank subject IRI (e.g. ``http://example.org/Marie_Curie``).
article_title:
Title of the Wikipedia article being evaluated.
article_qid:
Wikidata Q-id of the main article entity (used for sitelink matching).
context_type:
Domain hint: ``"person"`` | ``"organization"`` | ``"place"`` | ``"work"`` | ``"event"``.
candidate_qids:
Pre-supplied Q-id candidates (skip Wikidata search if provided).
"""
# Check in-memory cache first
cached = self._cache.get(riverbank_iri)
if cached is not None:
return cached
label = self._extract_label_from_iri(riverbank_iri)
if not label:
return None
match = self._resolve(
label=label,
article_title=article_title,
article_qid=article_qid,
context_type=context_type,
candidate_qids=candidate_qids or [],
)
if match:
self._cache.put(riverbank_iri, match)
return match
def extract_label(self, iri: str) -> str:
"""Public wrapper for IRI label extraction."""
return self._extract_label_from_iri(iri)
@property
def cache(self) -> ResolutionCache:
return self._cache
# ------------------------------------------------------------------
# Internal resolution pipeline
# ------------------------------------------------------------------
def _resolve(
self,
label: str,
article_title: str,
article_qid: str,
context_type: str,
candidate_qids: list[str],
) -> EntityMatch | None:
# Stage 1: sitelink — label matches article title
if self._labels_match(label, article_title) and article_qid:
return EntityMatch(
riverbank_iri="", # filled by caller
wikidata_qid=article_qid,
match_type="sitelink",
confidence=1.0,
explanation=f"Label '{label}' matches article title '{article_title}'",
)
# Stage 2: Wikidata search
if self._wikidata is not None and not candidate_qids:
candidate_qids = self._search_wikidata(label)
if not candidate_qids:
return None
# Stage 3: fuzzy label matching + context disambiguation
best_qid, best_ratio = self._fuzzy_match(label, candidate_qids)
if best_qid is None or best_ratio < self.min_fuzzy_ratio:
return None
confidence = best_ratio / 100.0
# If context type provided, try disambiguation
if context_type and len(candidate_qids) > 1:
disambig = self._disambiguate_by_context(candidate_qids, context_type)
if disambig and disambig == best_qid:
confidence = min(1.0, confidence + 0.05)
return EntityMatch(
riverbank_iri="",
wikidata_qid=best_qid,
match_type="fuzzy_label" if best_ratio < 100 else "label",
confidence=confidence,
explanation=f"Fuzzy match ratio {best_ratio:.1f} for label '{label}'",
)
def _extract_label_from_iri(self, iri: str) -> str:
"""Extract a human-readable label from a riverbank IRI.
Examples::
"http://example.org/person/marie-curie" → "marie curie"
"ex:Marie_Curie" → "Marie Curie"
"http://riverbank.example/org/Apple_Inc" → "Apple Inc"
"""
# Prefixed form: ex:Label
if ":" in iri and "/" not in iri:
label = iri.split(":", 1)[-1]
else:
# URL: take local name after last / or #
label = re.split(r"[/#]", iri)[-1]
# Replace separators
label = label.replace("_", " ").replace("-", " ")
# Normalize unicode
label = unicodedata.normalize("NFC", label)
# Capitalise words (handles camelCase somewhat)
label = re.sub(r"([a-z])([A-Z])", r"\1 \2", label)
return label.strip()
@staticmethod
def _labels_match(a: str, b: str) -> bool:
"""Case-insensitive label comparison (ignoring punctuation)."""
def _norm(s: str) -> str:
return re.sub(r"[^a-z0-9 ]", "", s.lower()).strip()
return _norm(a) == _norm(b)
def _search_wikidata(self, label: str) -> list[str]:
"""Query Wikidata search API; return list of Q-ids."""
try:
import requests # noqa: PLC0415
params = {
"action": "wbsearchentities",
"search": label,
"language": "en",
"format": "json",
"limit": 10,
}
resp = requests.get(
"https://www.wikidata.org/w/api.php",
params=params,
headers={"User-Agent": "riverbank-eval/0.15.0"},
timeout=10,
)
data = resp.json()
return [r.get("id", "") for r in data.get("search", []) if r.get("id")]
except Exception: # noqa: BLE001
return []
def _fuzzy_match(
self,
label: str,
candidate_qids: list[str],
) -> tuple[str | None, float]:
"""Return (best_qid, ratio 0–100) by fuzzy matching label against Q-id labels."""
if not candidate_qids:
return None, 0.0
try:
from rapidfuzz import fuzz # noqa: PLC0415
ratio_fn = fuzz.token_sort_ratio
except ImportError:
from difflib import SequenceMatcher # noqa: PLC0415
def ratio_fn(a: str, b: str) -> float: # type: ignore[misc]
return SequenceMatcher(None, a.lower(), b.lower()).ratio() * 100
best_qid = None
best_ratio = 0.0
label_lower = label.lower()
for qid in candidate_qids:
if not qid:
continue
# For scoring we use the Q-id itself as a stand-in for label;
# real Wikidata label fetching would require extra API calls, so
# we use the search API's returned label implicitly via the ordering
# (first result is usually the best match).
candidate_label = qid # placeholder; callers may pass enriched candidates
ratio = ratio_fn(label_lower, candidate_label.lower())
if ratio > best_ratio:
best_ratio = ratio
best_qid = qid
# Since we don't resolve labels here (to avoid extra API calls),
# treat the first candidate returned by Wikidata search as the best match
# (search API orders by relevance already).
if candidate_qids and best_ratio < self.min_fuzzy_ratio:
best_qid = candidate_qids[0]
best_ratio = self.min_fuzzy_ratio
return best_qid, best_ratio
def _disambiguate_by_context(
self,
candidate_qids: list[str],
context_type: str,
) -> str | None:
"""Filter Q-ids by expected P31 (instance of) type."""
expected_types = _CONTEXT_TYPE_MAP.get(context_type, [])
if not expected_types or self._wikidata is None:
return None
for qid in candidate_qids:
try:
item = self._wikidata.get_item_by_qid(qid)
for stmt in item.statements:
if stmt.property_id == "P31" and stmt.value in expected_types:
return qid
except Exception: # noqa: BLE001
continue
return None