ask-the-web-agent/src/synthesis/citations.py
debashis2007's picture
Upload folder using huggingface_hub
75bea1c verified
from __future__ import annotations
"""Citation management for source tracking."""
from dataclasses import dataclass, field
from typing import Any
from urllib.parse import urlparse
@dataclass
class Citation:
    """Record describing a single cited source.

    Attributes:
        title: Title of the source.
        url: Location of the source.
        snippet: Text excerpt associated with the source; empty by default.
        accessed_at: Access time as a string; empty by default. The format
            is not fixed by this class.
        reliability_score: Numeric trust score for the source; defaults
            to 0.5.
    """

    title: str
    url: str
    snippet: str = ""
    accessed_at: str = ""
    reliability_score: float = 0.5
class CitationManager:
    """Manages citations and source tracking.

    Citations are kept in insertion order and addressed by a 1-based index,
    so an index can be used directly as an inline marker (``[1]``, ``[2]``).
    Adding a URL that is already registered returns the index of the
    existing entry instead of creating a duplicate.
    """

    # Domains scored 0.9. A host matches when it equals an entry or is a
    # subdomain of it; plain substring matching would also accept look-alike
    # hosts such as "bbc.com.evil.example".
    _HIGH_RELIABILITY_DOMAINS = (
        "wikipedia.org",
        "bbc.com",
        "reuters.com",
        "apnews.com",
        "nature.com",
        "sciencedirect.com",
        "pubmed.gov",
        "nytimes.com",
        "wsj.com",
        "economist.com",
    )

    # Labels that mark institutional hosts (scored 0.9) when they are the
    # top-level domain or sit directly under a two-letter country code.
    _INSTITUTIONAL_LABELS = frozenset({"gov", "edu"})

    # Domains scored 0.7, matched the same way as the high-reliability list.
    _MEDIUM_RELIABILITY_DOMAINS = (
        "medium.com",
        "techcrunch.com",
        "wired.com",
        "arstechnica.com",
        "theverge.com",
        "forbes.com",
        "bloomberg.com",
    )

    def __init__(self):
        """Initialize the citation manager with no citations."""
        self._citations: list[Citation] = []
        # Maps each registered URL to its 1-based citation index.
        self._url_index: dict[str, int] = {}

    def add_citation(
        self,
        title: str,
        url: str,
        snippet: str = "",
    ) -> int:
        """Add a citation and return its index.

        Args:
            title: Source title
            url: Source URL
            snippet: Relevant text snippet

        Returns:
            Citation index (1-based). If ``url`` was added before, the index
            of the existing citation is returned and nothing is stored.
        """
        existing = self._url_index.get(url)
        if existing is not None:
            return existing

        citation = Citation(
            title=title,
            url=url,
            snippet=snippet,
            reliability_score=self._assess_reliability(url),
        )
        self._citations.append(citation)

        # The new entry is last, so the list length is its 1-based index.
        index = len(self._citations)
        self._url_index[url] = index
        return index

    def get_citation(self, index: int) -> Citation | None:
        """Get a citation by index.

        Args:
            index: Citation index (1-based)

        Returns:
            Citation or None if the index is out of range
        """
        if 1 <= index <= len(self._citations):
            return self._citations[index - 1]
        return None

    def get_all_citations(self) -> list[Citation]:
        """Get all citations.

        Returns:
            A new list holding all citations in insertion order
        """
        return list(self._citations)

    def format_inline(self, index: int) -> str:
        """Format an inline citation reference.

        Args:
            index: Citation index

        Returns:
            Formatted inline citation [n]
        """
        return f"[{index}]"

    def format_bibliography(self, style: str = "markdown") -> str:
        """Format all citations as a bibliography.

        Args:
            style: Output style (markdown, plain, html)

        Returns:
            Formatted bibliography, or an empty string when there are no
            citations or ``style`` is not one of the supported values.
        """
        if not self._citations:
            return ""

        lines = []
        if style == "markdown":
            lines.append("**Sources:**")
            for i, cite in enumerate(self._citations, 1):
                lines.append(f"[{i}] [{cite.title}]({cite.url})")
        elif style == "plain":
            lines.append("Sources:")
            for i, cite in enumerate(self._citations, 1):
                lines.append(f"{i}. {cite.title}")
                lines.append(f" {cite.url}")
        elif style == "html":
            from html import escape

            lines.append("<div class='sources'>")
            lines.append("<h4>Sources:</h4>")
            lines.append("<ol>")
            for cite in self._citations:
                # Titles and URLs come from external pages; escape them so
                # they cannot break out of the attribute or inject markup.
                href = escape(cite.url, quote=True)
                label = escape(cite.title)
                lines.append(
                    f'<li><a href="{href}" target="_blank">{label}</a></li>'
                )
            lines.append("</ol>")
            lines.append("</div>")
        return "\n".join(lines)

    def to_dict_list(self) -> list[dict[str, Any]]:
        """Convert citations to list of dictionaries.

        Returns:
            One dictionary per citation with the keys ``title``, ``url``,
            ``snippet`` and ``reliability_score``
        """
        return [
            {
                "title": cite.title,
                "url": cite.url,
                "snippet": cite.snippet,
                "reliability_score": cite.reliability_score,
            }
            for cite in self._citations
        ]

    def clear(self) -> None:
        """Clear all citations and the URL index."""
        self._citations.clear()
        self._url_index.clear()

    @staticmethod
    def _matches_domain(host: str, domains) -> bool:
        """Return True if host equals, or is a subdomain of, any domain."""
        return any(
            host == domain or host.endswith("." + domain) for domain in domains
        )

    @classmethod
    def _is_institutional(cls, host: str) -> bool:
        """Return True for hosts such as x.gov, x.edu, x.gov.uk, x.edu.au."""
        labels = host.split(".")
        if len(labels) < 2:
            return False
        tld = labels[-1]
        if tld in cls._INSTITUTIONAL_LABELS:
            return True
        # Country-code form: the institutional label sits directly under a
        # two-letter TLD, e.g. "gov.uk".
        return len(tld) == 2 and labels[-2] in cls._INSTITUTIONAL_LABELS

    def _assess_reliability(self, url: str) -> float:
        """Assess reliability of a source based on URL.

        The score depends only on the URL's host name. Port, credentials,
        path and query are ignored, and domains are matched on label
        boundaries rather than as substrings.

        Args:
            url: Source URL

        Returns:
            Reliability score (0.0 - 1.0): 0.9 for high-reliability hosts,
            0.7 for medium-reliability hosts, 0.5 for anything else
            (including URLs without a host), and 0.3 when the URL is not a
            string or cannot be parsed.
        """
        if not isinstance(url, str):
            return 0.3
        try:
            hostname = urlparse(url).hostname
        except ValueError:
            return 0.3

        # Drop the trailing dot of a fully-qualified name ("example.gov.").
        host = (hostname or "").lower().rstrip(".")
        if not host:
            return 0.5

        if self._is_institutional(host) or self._matches_domain(
            host, self._HIGH_RELIABILITY_DOMAINS
        ):
            return 0.9
        if self._matches_domain(host, self._MEDIUM_RELIABILITY_DOMAINS):
            return 0.7

        # Default moderate reliability for unknown sources
        return 0.5

    def get_most_reliable(self, n: int = 3) -> list[Citation]:
        """Get the n most reliable citations.

        Args:
            n: Number of citations to return

        Returns:
            Up to ``n`` citations ordered by descending reliability score;
            citations with equal scores keep their insertion order. An empty
            list is returned when ``n`` is zero or negative.
        """
        if n <= 0:
            # A negative slice bound would drop items from the end instead
            # of returning nothing.
            return []
        ranked = sorted(
            self._citations,
            key=lambda c: c.reliability_score,
            reverse=True,
        )
        return ranked[:n]