Spaces:

4waiz
/

CiteGuard-RAG

Running

App Files Files Community

CiteGuard-RAG / src /citeguard /preprocess /citation_parser.py

4waiz

chore: flatten repo layout

910eb59 14 days ago

raw

history blame contribute delete

3.12 kB

	"""Citation parsing.

	Supports bracketed citations such as ``[doc1]``, ``[1]``, ``[doc1, doc2]``.
	Numeric citations like ``[1]`` are normalized to ``doc1`` so they line up
	with context ``doc_id``s when authors number their sources.
	"""
	from __future__ import annotations

	import re
	from dataclasses import dataclass


	# Default citation patterns: a single regex matching one ``[...]`` pair that
	# contains no nested brackets. Group 1 captures the text between the brackets.
	_DEFAULT_PATTERNS = [r"\[([^\]\[]*)\]"]
	# Matches a string made up of one or more digits from start to end
	# (``$`` also tolerates a single trailing newline).
	_NUMERIC_RE = re.compile(r"^\d+$")


	@dataclass
	class CitationParser:
	    """Parse cited doc ids from bracketed citations in a sentence.

	    Attributes:
	        patterns: Regex sources used to locate citations. ``None`` or an
	            empty list selects the module default. When a pattern has a
	            capture group, group 1 is taken as the citation text; a pattern
	            without any group contributes its whole match instead.
	        numeric_prefix: Prefix put in front of purely numeric citations.
	            With the default ``"doc"``, ``[1]`` becomes ``"doc1"``.
	    """

	    patterns: list[str] | None = None
	    numeric_prefix: str = "doc"

	    def __post_init__(self) -> None:
	        """Compile the configured patterns once so ``parse`` can reuse them."""
	        patterns = self.patterns or _DEFAULT_PATTERNS
	        self._compiled = [re.compile(p) for p in patterns]

	    def parse(self, sentence: str) -> tuple[list[str], bool]:
	        """Return ``(cited_doc_ids, parser_uncertain)`` for ``sentence``.

	        ``cited_doc_ids`` lists each distinct id once, in order of first
	        appearance. ``parser_uncertain`` is True when at least one pattern
	        matched but no usable doc id came out of it (e.g. ``[?]``, ``[ ]``,
	        ``[see fig 1]``). Empty input or input without any match is not
	        uncertain, just uncited.
	        """
	        if not sentence:
	            return [], False

	        found_any_bracket = False
	        produced_ids: list[str] = []
	        seen: set[str] = set()

	        for regex in self._compiled:
	            for match in regex.finditer(sentence):
	                found_any_bracket = True
	                # Custom patterns may lack a capture group, or the group may
	                # not have participated in the match; neither should raise.
	                inner = match.group(1) if regex.groups else match.group(0)
	                # One bracket may hold several ids separated by "," or ";".
	                # Blank pieces are rejected by _normalize.
	                for raw in re.split(r"[,;]", inner or ""):
	                    doc_id = self._normalize(raw)
	                    if doc_id and doc_id not in seen:
	                        seen.add(doc_id)
	                        produced_ids.append(doc_id)

	        parser_uncertain = found_any_bracket and not produced_ids
	        return produced_ids, parser_uncertain

	    def _normalize(self, token: str) -> str | None:
	        """Turn one raw citation token into a doc id, or None if unusable.

	        Surrounding punctuation is removed first, so ``"1."`` is treated as
	        the number ``1``. A purely numeric token gets ``numeric_prefix``
	        prepended. A token that still contains whitespace (``"see fig"``)
	        is rejected as non-citation content.
	        """
	        token = token.strip()
	        # Drop trailing punctuation like "doc1." but keep underscores/hyphens,
	        # then drop any leading non-word characters.
	        token = re.sub(r"[^\w\-]+$", "", token)
	        token = re.sub(r"^[^\w]+", "", token)
	        if not token:
	            return None
	        # Numeric check runs after the trimming above so "[1.]" and "[1]"
	        # normalize to the same id.
	        if token.isdecimal():
	            return f"{self.numeric_prefix}{token}"
	        # Allow "Smith2020" but reject "see fig": any whitespace (space, tab,
	        # newline, ...) inside the token marks it as prose, not an id.
	        if any(ch.isspace() for ch in token):
	            return None
	        return token


	def parse_citations_in_sentence(
	    sentence: str,
	    patterns: list[str] | None = None,
	    numeric_prefix: str = "doc",
	) -> tuple[list[str], bool]:
	    """Parse ``sentence`` with a freshly constructed ``CitationParser``.

	    Convenience wrapper: a new parser is built on every call from
	    ``patterns`` and ``numeric_prefix``, and the result of its ``parse``
	    method is returned unchanged. Callers parsing many sentences with the
	    same settings can construct one ``CitationParser`` and reuse it.
	    """
	    parser = CitationParser(patterns=patterns, numeric_prefix=numeric_prefix)
	    return parser.parse(sentence)