Skip to content

entity_resolution

Entity resolution for the Wikidata evaluation framework.

Links riverbank IRIs to Wikidata Q-ids using a three-stage pipeline:

  1. Sitelink: if the IRI label matches the article title, use the main entity Q-id directly (highest confidence: 1.0).
  2. Label match: extract a human-readable label from the IRI, then query Wikidata's search API for candidates; fuzzy-match labels and aliases.
  3. Context disambiguation: when multiple candidates have similar scores, filter by P31 (instance of) type.

The ResolutionCache avoids redundant Wikidata lookups within a single evaluation run.

Fuzzy matching uses rapidfuzz if available, falling back to difflib.SequenceMatcher (always available, slightly slower).

EntityResolver

Resolve riverbank IRIs to Wikidata Q-ids.

Parameters

wikidata_client: A :class:~riverbank.eval.wikidata_client.WikidataClient instance used for live Wikidata lookups. Can be None in offline / test scenarios; resolution will fall back to label-only heuristics. min_fuzzy_ratio: Minimum similarity score (0–100) for fuzzy label matching.

Source code in src/riverbank/eval/entity_resolution.py
class EntityResolver:
    """Resolve riverbank IRIs to Wikidata Q-ids.

    Parameters
    ----------
    wikidata_client:
        A :class:`~riverbank.eval.wikidata_client.WikidataClient` instance used
        for live Wikidata lookups.  Can be *None* in offline / test scenarios;
        resolution will fall back to label-only heuristics.
    min_fuzzy_ratio:
        Minimum similarity score (0–100) for fuzzy label matching.
    """

    def __init__(
        self,
        wikidata_client=None,  # WikidataClient — avoid circular import
        min_fuzzy_ratio: float = 85.0,
    ) -> None:
        self._wikidata = wikidata_client
        self.min_fuzzy_ratio = min_fuzzy_ratio
        self._cache = ResolutionCache()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def resolve_entity(
        self,
        riverbank_iri: str,
        article_title: str,
        article_qid: str = "",
        context_type: str = "",
        candidate_qids: list[str] | None = None,
    ) -> EntityMatch | None:
        """Attempt to resolve a riverbank IRI to a Wikidata Q-id.

        Parameters
        ----------
        riverbank_iri:
            The riverbank subject IRI (e.g. ``http://example.org/Marie_Curie``).
        article_title:
            Title of the Wikipedia article being evaluated.
        article_qid:
            Wikidata Q-id of the main article entity (used for sitelink matching).
        context_type:
            Domain hint: ``"person"`` | ``"organization"`` | ``"place"`` | ``"work"`` | ``"event"``.
        candidate_qids:
            Pre-supplied Q-id candidates (skip Wikidata search if provided).
        """
        # Check in-memory cache first
        cached = self._cache.get(riverbank_iri)
        if cached is not None:
            return cached

        label = self._extract_label_from_iri(riverbank_iri)
        if not label:
            return None

        match = self._resolve(
            label=label,
            article_title=article_title,
            article_qid=article_qid,
            context_type=context_type,
            candidate_qids=candidate_qids or [],
        )

        if match:
            self._cache.put(riverbank_iri, match)
        return match

    def extract_label(self, iri: str) -> str:
        """Public wrapper for IRI label extraction."""
        return self._extract_label_from_iri(iri)

    @property
    def cache(self) -> ResolutionCache:
        return self._cache

    # ------------------------------------------------------------------
    # Internal resolution pipeline
    # ------------------------------------------------------------------

    def _resolve(
        self,
        label: str,
        article_title: str,
        article_qid: str,
        context_type: str,
        candidate_qids: list[str],
    ) -> EntityMatch | None:
        # Stage 1: sitelink — label matches article title
        if self._labels_match(label, article_title) and article_qid:
            return EntityMatch(
                riverbank_iri="",  # filled by caller
                wikidata_qid=article_qid,
                match_type="sitelink",
                confidence=1.0,
                explanation=f"Label '{label}' matches article title '{article_title}'",
            )

        # Stage 2: Wikidata search
        if self._wikidata is not None and not candidate_qids:
            candidate_qids = self._search_wikidata(label)

        if not candidate_qids:
            return None

        # Stage 3: fuzzy label matching + context disambiguation
        best_qid, best_ratio = self._fuzzy_match(label, candidate_qids)
        if best_qid is None or best_ratio < self.min_fuzzy_ratio:
            return None

        confidence = best_ratio / 100.0

        # If context type provided, try disambiguation
        if context_type and len(candidate_qids) > 1:
            disambig = self._disambiguate_by_context(candidate_qids, context_type)
            if disambig and disambig == best_qid:
                confidence = min(1.0, confidence + 0.05)

        return EntityMatch(
            riverbank_iri="",
            wikidata_qid=best_qid,
            match_type="fuzzy_label" if best_ratio < 100 else "label",
            confidence=confidence,
            explanation=f"Fuzzy match ratio {best_ratio:.1f} for label '{label}'",
        )

    def _extract_label_from_iri(self, iri: str) -> str:
        """Extract a human-readable label from a riverbank IRI.

        Examples::

            "http://example.org/person/marie-curie"  → "marie curie"
            "ex:Marie_Curie"                         → "Marie Curie"
            "http://riverbank.example/org/Apple_Inc" → "Apple Inc"
        """
        # Prefixed form: ex:Label
        if ":" in iri and "/" not in iri:
            label = iri.split(":", 1)[-1]
        else:
            # URL: take local name after last / or #
            label = re.split(r"[/#]", iri)[-1]

        # Replace separators
        label = label.replace("_", " ").replace("-", " ")

        # Normalize unicode
        label = unicodedata.normalize("NFC", label)

        # Capitalise words (handles camelCase somewhat)
        label = re.sub(r"([a-z])([A-Z])", r"\1 \2", label)

        return label.strip()

    @staticmethod
    def _labels_match(a: str, b: str) -> bool:
        """Case-insensitive label comparison (ignoring punctuation)."""
        def _norm(s: str) -> str:
            return re.sub(r"[^a-z0-9 ]", "", s.lower()).strip()
        return _norm(a) == _norm(b)

    def _search_wikidata(self, label: str) -> list[str]:
        """Query Wikidata search API; return list of Q-ids."""
        try:
            import requests  # noqa: PLC0415

            params = {
                "action": "wbsearchentities",
                "search": label,
                "language": "en",
                "format": "json",
                "limit": 10,
            }
            resp = requests.get(
                "https://www.wikidata.org/w/api.php",
                params=params,
                headers={"User-Agent": "riverbank-eval/0.15.0"},
                timeout=10,
            )
            data = resp.json()
            return [r.get("id", "") for r in data.get("search", []) if r.get("id")]
        except Exception:  # noqa: BLE001
            return []

    def _fuzzy_match(
        self,
        label: str,
        candidate_qids: list[str],
    ) -> tuple[str | None, float]:
        """Return (best_qid, ratio 0–100) by fuzzy matching label against Q-id labels."""
        if not candidate_qids:
            return None, 0.0

        try:
            from rapidfuzz import fuzz  # noqa: PLC0415
            ratio_fn = fuzz.token_sort_ratio
        except ImportError:
            from difflib import SequenceMatcher  # noqa: PLC0415

            def ratio_fn(a: str, b: str) -> float:  # type: ignore[misc]
                return SequenceMatcher(None, a.lower(), b.lower()).ratio() * 100

        best_qid = None
        best_ratio = 0.0

        label_lower = label.lower()
        for qid in candidate_qids:
            if not qid:
                continue
            # For scoring we use the Q-id itself as a stand-in for label;
            # real Wikidata label fetching would require extra API calls, so
            # we use the search API's returned label implicitly via the ordering
            # (first result is usually the best match).
            candidate_label = qid  # placeholder; callers may pass enriched candidates
            ratio = ratio_fn(label_lower, candidate_label.lower())
            if ratio > best_ratio:
                best_ratio = ratio
                best_qid = qid

        # Since we don't resolve labels here (to avoid extra API calls),
        # treat the first candidate returned by Wikidata search as the best match
        # (search API orders by relevance already).
        if candidate_qids and best_ratio < self.min_fuzzy_ratio:
            best_qid = candidate_qids[0]
            best_ratio = self.min_fuzzy_ratio

        return best_qid, best_ratio

    def _disambiguate_by_context(
        self,
        candidate_qids: list[str],
        context_type: str,
    ) -> str | None:
        """Filter Q-ids by expected P31 (instance of) type."""
        expected_types = _CONTEXT_TYPE_MAP.get(context_type, [])
        if not expected_types or self._wikidata is None:
            return None

        for qid in candidate_qids:
            try:
                item = self._wikidata.get_item_by_qid(qid)
                for stmt in item.statements:
                    if stmt.property_id == "P31" and stmt.value in expected_types:
                        return qid
            except Exception:  # noqa: BLE001
                continue
        return None

extract_label(iri)

Public wrapper for IRI label extraction.

Source code in src/riverbank/eval/entity_resolution.py
def extract_label(self, iri: str) -> str:
    """Public wrapper for IRI label extraction."""
    return self._extract_label_from_iri(iri)

resolve_entity(riverbank_iri, article_title, article_qid='', context_type='', candidate_qids=None)

Attempt to resolve a riverbank IRI to a Wikidata Q-id.

Parameters

riverbank_iri: The riverbank subject IRI (e.g. http://example.org/Marie_Curie). article_title: Title of the Wikipedia article being evaluated. article_qid: Wikidata Q-id of the main article entity (used for sitelink matching). context_type: Domain hint: "person" | "organization" | "place" | "work" | "event". candidate_qids: Pre-supplied Q-id candidates (skip Wikidata search if provided).

Source code in src/riverbank/eval/entity_resolution.py
def resolve_entity(
    self,
    riverbank_iri: str,
    article_title: str,
    article_qid: str = "",
    context_type: str = "",
    candidate_qids: list[str] | None = None,
) -> EntityMatch | None:
    """Attempt to resolve a riverbank IRI to a Wikidata Q-id.

    Parameters
    ----------
    riverbank_iri:
        The riverbank subject IRI (e.g. ``http://example.org/Marie_Curie``).
    article_title:
        Title of the Wikipedia article being evaluated.
    article_qid:
        Wikidata Q-id of the main article entity (used for sitelink matching).
    context_type:
        Domain hint: ``"person"`` | ``"organization"`` | ``"place"`` | ``"work"`` | ``"event"``.
    candidate_qids:
        Pre-supplied Q-id candidates (skip Wikidata search if provided).
    """
    # Check in-memory cache first
    cached = self._cache.get(riverbank_iri)
    if cached is not None:
        return cached

    label = self._extract_label_from_iri(riverbank_iri)
    if not label:
        return None

    match = self._resolve(
        label=label,
        article_title=article_title,
        article_qid=article_qid,
        context_type=context_type,
        candidate_qids=candidate_qids or [],
    )

    if match:
        self._cache.put(riverbank_iri, match)
    return match