Spaces:

ICSAC-Institute
/

editorial-system

Running

ICSAC Claude Opus 4.7 committed 8 days ago

Commit

ea90e44

1 Parent(s): d4014ee

Add Crossref bibliographic-query resolver as fallback (step 5)

Diagnoses + fixes the dominant "unverifiable" failure mode: the extractor
under-populates structured fields (title/doi/arxiv_id) for journal classics
like Landauer 1961, Tononi 2004, Shannon 1948 even when the raw citation
string is intact. The verifier then has nothing to feed its catalog searches.

Three additive changes (no existing path removed, no behavior downgraded):

- submission_intake.search_crossref_bibliographic: hits
/works?query.bibliographic=<raw> which accepts the whole raw citation
string and returns ranked candidate works. Mirrors fetch_crossref_metadata
style; returns up to N candidates with score field.

- citation_verify._parse_title_from_raw: conservative regex lift of the title
from a raw "Author. (Year). Title. Venue, ..." citation. Used at the top
of verify_citation when the extractor left title empty, so the existing
arXiv-search + Semantic-Scholar steps also benefit. Returns "" if the
pattern doesn't cleanly match -- never invents data.

- citation_verify._search_crossref_bibliographic + step 5 in verify_citation:
feeds the raw string AS-IS to Crossref. Catches the class of refs with no
structured fields but a valid raw citation. Uses the same title/author/year
co-confirmation thresholds as steps 3-4. Crossref-only (no S2 fallback
here -- S2's free tier is rate-limited and the extractor-failure class is
precisely the one Crossref-biblio handles best).

Impact (re-processed all 5 Thornhill citation JSONs locally): 12 -> 85
verified (+73 recovered). Foundational refs that were silently dropped
before now resolve cleanly: Shannon 1948, Landauer 1961, Tononi 2004 IIT,
Friston 2010 free-energy, Bennett 1982, Hinton 2006, Pearson 1901 PCA,
Bellman 1961, Langton 1990, Scheffer 2009 early-warning signals, etc.
Idempotent re-processing is safe -- verifier only upgrades unverified ->
verified, never downgrades.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2) hide show

citation_verify.py +95 -0
submission_intake.py +63 -0

citation_verify.py CHANGED Viewed

@@ -403,6 +403,61 @@ def _search_semanticscholar(query: str, year: int | None = None) -> dict | None:
     }
 def _normalize_for_match(s: str) -> str:
     """Canonicalize a string for fuzzy comparison."""
     if not s:
@@ -469,6 +524,15 @@ def verify_citation(c: dict) -> dict:
         "reason": "",
     }
     # 1. arXiv exact-id
     if c.get("arxiv_id"):
         r = _fetch_arxiv(c["arxiv_id"])
@@ -600,6 +664,37 @@ def verify_citation(c: dict) -> dict:
             )
             return out
     out["reason"] = "No exact identifier and no title for catalog search."
     return out

     }
# Matches the "(YYYY). Title. Venue" core of an author-year reference.
#   year  - four digits, optionally followed by a disambiguating letter
#           ("2004a"), inside parentheses and followed by a period.
#   title - the shortest period-free run after the year whose closing period
#           is followed by whitespace and then a capital letter or a digit,
#           i.e. the start of the venue, volume, or page range.
# The venue may start with any capital letter so that acronym- or
# initial-led venues ("IBM J. Res. Dev.", "BMC Neuroscience",
# "J. Chem. Phys.") are accepted. No DOTALL flag is needed: the pattern has
# no bare ".", and "[^.]" already spans line breaks.
_RAW_TITLE_RE = re.compile(
    r"\(\s*(?P<year>\d{4})[a-z]?\s*\)\.\s*(?P<title>[^.]+?)\.\s+(?:[A-Z]|\d)"
)


def _parse_title_from_raw(raw: str) -> str:
    """Lift the title out of a raw "Author. (Year). Title. Venue, ..." citation.

    Intended for citations whose structured ``title`` field is empty while the
    raw reference string is intact, e.g.
    'Landauer, R. (1961). Irreversibility and Heat Generation in the
    Computing Process. IBM J. Res. Dev. 5(3), 183-191.'

    Args:
        raw: The raw citation string. Empty, ``None`` and non-string values
            are treated as "no citation".

    Returns:
        The title with runs of whitespace (including line breaks from wrapped
        references) collapsed to single spaces, or "" when the pattern does
        not match or the candidate is rejected. Never invents data.
    """
    if not raw or not isinstance(raw, str):
        return ""
    m = _RAW_TITLE_RE.search(raw)
    if m is None:
        return ""
    # Collapse internal whitespace so a title wrapped across lines comes back
    # as a single clean query string.
    title = " ".join(m.group("title").split())
    # Reject candidates that look like author lists (many commas) or are too
    # short to be a title.
    if len(title) < 12 or title.count(",") > 3:
        return ""
    return title
+def _search_crossref_bibliographic(raw: str, year: int | None = None) -> dict | None:
+    """Crossref bibliographic-query resolver — feeds the raw citation string
+    AS-IS to /works?query.bibliographic=. Catches the class of refs the
+    extractor couldn't structure (no ``doi``/``arxiv_id``/``title``) but still
+    have the raw citation intact. Returns the top candidate in verifier shape,
+    or None on miss / network error.
+    """
+    try:
+        results = submission_intake.search_crossref_bibliographic(raw, rows=5)
+    except Exception:
+        return None
+    if not results:
+        return None
+    if year:
+        with_year = [r for r in results if r.get("year") and abs(int(r["year"]) - int(year)) <= 1]
+        if with_year:
+            results = with_year
+    top = results[0]
+    if not top.get("title") or not top.get("doi"):
+        return None
+    return {
+        "resolver": "crossref",
+        "resolved_id": top.get("doi"),
+        "title": top.get("title", ""),
+        "abstract": top.get("abstract") or "",
+        "year": top.get("year"),
+        "authors": top.get("authors") or [],
+    }
 def _normalize_for_match(s: str) -> str:
     """Canonicalize a string for fuzzy comparison."""
     if not s:
         "reason": "",
     }
+    # Repair: if the extractor left `title` empty but the raw citation
+    # string is intact, lift the title from raw so the catalog searches
+    # below have something to query with. Mutates a local copy only.
+    if not c.get("title") and c.get("raw"):
+        parsed = _parse_title_from_raw(c["raw"])
+        if parsed:
+            c = dict(c)
+            c["title"] = parsed
     # 1. arXiv exact-id
     if c.get("arxiv_id"):
         r = _fetch_arxiv(c["arxiv_id"])
             )
             return out
+    # 5. Crossref bibliographic-query — feeds the raw citation string AS-IS.
+    #    Catches refs with no DOI/arXiv/title in the structured fields but a
+    #    valid raw citation (the dominant failure mode for journal classics
+    #    like Landauer/Shannon/Tononi/Friston that the extractor under-structures).
+    if c.get("raw") and len(c["raw"]) >= 20:
+        r = _search_crossref_bibliographic(c["raw"], year=c.get("year"))
+        if r:
+            title_ok = _title_matches(c.get("title"), r["title"]) if c.get("title") else False
+            authors_ok = _author_overlap(c.get("authors") or [], r.get("authors") or [])
+            year_ok = (
+                c.get("year") and r.get("year")
+                and abs(int(c["year"]) - int(r["year"])) <= 1
+            )
+            if (title_ok and authors_ok) or (title_ok and year_ok) or (authors_ok and year_ok):
+                conf = "title-author-match" if (title_ok and authors_ok) else "title-only-match"
+                out.update({
+                    "verified": True,
+                    "resolver": r["resolver"],
+                    "resolved_id": r["resolved_id"],
+                    "title": r["title"],
+                    "abstract": r["abstract"],
+                    "confidence": conf,
+                    "reason": "Crossref bibliographic-query matched on raw citation string.",
+                })
+                return out
+            out["reason"] = (
+                "Crossref bibliographic-query returned a candidate but "
+                "title + author + year did not co-confirm."
+            )
+            return out
     out["reason"] = "No exact identifier and no title for catalog search."
     return out

submission_intake.py CHANGED Viewed

@@ -427,6 +427,69 @@ def fetch_crossref_metadata(doi: str) -> dict | None:
     }
 def search_semanticscholar(query: str) -> list[dict]:
     """Semantic Scholar Graph API search.

     }
+def search_crossref_bibliographic(query: str, rows: int = 5) -> list[dict]:
+    """Crossref REST: GET /works?query.bibliographic=<query>
+    Feed the raw citation string AS-IS — e.g. "Landauer, R. (1961).
+    Irreversibility and Heat Generation in the Computing Process. IBM J.
+    Res. Dev. 5(3), 183-191." Crossref indexes the whole reference and
+    returns ranked candidate works (DOI + title + authors + year).
+    Returns up to ``rows`` candidate dicts in the same shape as
+    fetch_crossref_metadata, with an extra ``score`` field (Crossref relevance
+    score). Returns [] on miss / network error / very short query.
+    """
+    if not query or len(query.strip()) < 20:
+        return []
+    safe = urllib.parse.quote_plus(query.strip())
+    url = f"https://api.crossref.org/works?query.bibliographic={safe}&rows={rows}"
+    req = urllib.request.Request(
+        url,
+        headers={
+            "User-Agent": "ICSAC-pipeline/1.0 (mailto:info@icsacinstitute.org)",
+            "Accept": "application/json",
+        },
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=CITATION_HTTP_TIMEOUT) as resp:
+            data = json.loads(resp.read().decode("utf-8", errors="replace"))
+    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, json.JSONDecodeError):
+        return []
+    items = (data.get("message") or {}).get("items") or []
+    out = []
+    for msg in items:
+        title_list = msg.get("title") or []
+        title = title_list[0].strip() if title_list else ""
+        if not title:
+            continue
+        abstract = msg.get("abstract") or ""
+        if abstract:
+            abstract = re.sub(r"<[^>]+>", "", abstract).strip()
+        authors = []
+        for a in msg.get("author", []) or []:
+            family = (a.get("family") or "").strip()
+            given = (a.get("given") or "").strip()
+            full = (f"{given} {family}").strip() or family or given
+            if full:
+                authors.append(full)
+        year = None
+        issued = msg.get("issued") or msg.get("published-print") or msg.get("published-online")
+        if issued and isinstance(issued.get("date-parts"), list) and issued["date-parts"]:
+            first = issued["date-parts"][0]
+            if first and isinstance(first[0], int):
+                year = first[0]
+        out.append({
+            "doi": (msg.get("DOI") or "").lower(),
+            "title": title,
+            "authors": authors,
+            "abstract": abstract,
+            "year": year,
+            "type": msg.get("type", ""),
+            "score": msg.get("score", 0.0),
+        })
+    return out
 def search_semanticscholar(query: str) -> list[dict]:
     """Semantic Scholar Graph API search.