Spaces:

debashis2007
/

ask-the-web-agent

Sleeping

App Files Files Community

ask-the-web-agent / src /synthesis /citations.py

debashis2007

Upload folder using huggingface_hub

75bea1c verified 2 months ago

raw

history blame contribute delete

5.85 kB

	"""Citation management for source tracking."""

	from __future__ import annotations

	import html
	from dataclasses import dataclass, field
	from typing import Any
	from urllib.parse import urlparse


	@dataclass
	class Citation:
	    """A single cited source.

	    Attributes:
	        title: Human-readable title of the source.
	        url: Location of the source.
	        snippet: Relevant excerpt from the source; empty when none was captured.
	        accessed_at: Free-form access timestamp; defaults to an empty string.
	        reliability_score: Heuristic trust score, intended range 0.0 - 1.0
	            (0.5 means "unknown").
	    """

	    title: str
	    url: str
	    snippet: str = ""
	    accessed_at: str = ""
	    reliability_score: float = 0.5


	class CitationManager:
	    """Collects citations, de-duplicates them by URL, and renders bibliographies.

	    Citations are numbered from 1 in insertion order. Adding a URL that is
	    already registered returns the existing number instead of creating a
	    second entry.
	    """

	    # Final host labels that mark a source as highly reliable, either as the
	    # TLD itself ("nasa.gov") or as a country-code second level ("gov.uk").
	    _HIGH_RELIABILITY_SUFFIXES = ("gov", "edu")

	    # Domains (and their subdomains) scored as highly reliable.
	    _HIGH_RELIABILITY_DOMAINS = (
	        "wikipedia.org",
	        "bbc.com",
	        "reuters.com",
	        "apnews.com",
	        "nature.com",
	        "sciencedirect.com",
	        "pubmed.gov",
	        "nytimes.com",
	        "wsj.com",
	        "economist.com",
	    )

	    # Domains (and their subdomains) scored as moderately reliable.
	    _MEDIUM_RELIABILITY_DOMAINS = (
	        "medium.com",
	        "techcrunch.com",
	        "wired.com",
	        "arstechnica.com",
	        "theverge.com",
	        "forbes.com",
	        "bloomberg.com",
	    )

	    def __init__(self):
	        """Initialize an empty citation manager."""
	        self._citations: list[Citation] = []
	        # Maps URL -> 1-based citation index, for O(1) duplicate detection.
	        self._url_index: dict[str, int] = {}

	    def add_citation(
	        self,
	        title: str,
	        url: str,
	        snippet: str = "",
	    ) -> int:
	        """Add a citation and return its index.

	        If ``url`` was added before, the existing index is returned and the
	        new ``title``/``snippet`` are ignored.

	        Args:
	            title: Source title
	            url: Source URL
	            snippet: Relevant text snippet

	        Returns:
	            Citation index (1-based)
	        """
	        existing = self._url_index.get(url)
	        if existing is not None:
	            return existing

	        citation = Citation(
	            title=title,
	            url=url,
	            snippet=snippet,
	            reliability_score=self._assess_reliability(url),
	        )
	        self._citations.append(citation)

	        # Indices are 1-based, so the new index equals the new length.
	        index = len(self._citations)
	        self._url_index[url] = index
	        return index

	    def get_citation(self, index: int) -> Citation | None:
	        """Get a citation by index.

	        Args:
	            index: Citation index (1-based)

	        Returns:
	            Citation or None if not found
	        """
	        if 1 <= index <= len(self._citations):
	            return self._citations[index - 1]
	        return None

	    def get_all_citations(self) -> list[Citation]:
	        """Get all citations.

	        Returns:
	            A new list of all citations in insertion order
	        """
	        return list(self._citations)

	    def format_inline(self, index: int) -> str:
	        """Format an inline citation reference.

	        Args:
	            index: Citation index

	        Returns:
	            Formatted inline citation [n]
	        """
	        return f"[{index}]"

	    def format_bibliography(self, style: str = "markdown") -> str:
	        """Format all citations as a bibliography.

	        Args:
	            style: Output style (markdown, plain, html)

	        Returns:
	            Formatted bibliography. Empty string when there are no
	            citations or when ``style`` is not one of the known styles.
	        """
	        if not self._citations:
	            return ""

	        lines: list[str] = []

	        if style == "markdown":
	            lines.append("Sources:")
	            lines.extend(
	                f"[{i}] [{cite.title}]({cite.url})"
	                for i, cite in enumerate(self._citations, 1)
	            )

	        elif style == "plain":
	            lines.append("Sources:")
	            for i, cite in enumerate(self._citations, 1):
	                lines.append(f"{i}. {cite.title}")
	                lines.append(f" {cite.url}")

	        elif style == "html":
	            lines.extend(["<div class='sources'>", "<h4>Sources:</h4>", "<ol>"])
	            for cite in self._citations:
	                # Titles and URLs originate from web results, so escape them
	                # to prevent markup/attribute injection in the rendered HTML.
	                href = html.escape(cite.url)
	                label = html.escape(cite.title)
	                lines.append(
	                    f'<li><a href="{href}" target="_blank">{label}</a></li>'
	                )
	            lines.extend(["</ol>", "</div>"])

	        return "\n".join(lines)

	    def to_dict_list(self) -> list[dict[str, Any]]:
	        """Convert citations to list of dictionaries.

	        Returns:
	            List of citation dictionaries with the keys ``title``, ``url``,
	            ``snippet`` and ``reliability_score``
	        """
	        return [
	            {
	                "title": cite.title,
	                "url": cite.url,
	                "snippet": cite.snippet,
	                "reliability_score": cite.reliability_score,
	            }
	            for cite in self._citations
	        ]

	    def clear(self) -> None:
	        """Clear all citations."""
	        self._citations.clear()
	        self._url_index.clear()

	    @staticmethod
	    def _host_in_domains(host: str, domains: tuple[str, ...]) -> bool:
	        """Return True if host equals, or is a subdomain of, any listed domain."""
	        return any(host == d or host.endswith("." + d) for d in domains)

	    def _assess_reliability(self, url: str) -> float:
	        """Assess reliability of a source based on URL.

	        Matching is done on the parsed hostname at label boundaries, so
	        look-alike hosts ("signature.com" vs "nature.com") and userinfo
	        tricks ("nytimes.com@evil.example") do not inherit a trusted score.

	        Args:
	            url: Source URL

	        Returns:
	            Reliability score (0.0 - 1.0): 0.9 for high-reliability hosts,
	            0.7 for medium, 0.5 for unknown hosts, 0.3 if the URL cannot be
	            parsed.
	        """
	        try:
	            # ``hostname`` is lower-cased and excludes port and userinfo;
	            # a trailing root dot ("example.gov.") is dropped before matching.
	            host = (urlparse(url).hostname or "").rstrip(".")
	        except (ValueError, TypeError, AttributeError):
	            return 0.3

	        if host:
	            labels = host.split(".")
	            suffixes = self._HIGH_RELIABILITY_SUFFIXES

	            # "x.gov" / "x.edu", or a country-code form such as "x.gov.uk".
	            is_trusted_tld = labels[-1] in suffixes
	            is_trusted_second_level = (
	                len(labels) >= 2
	                and labels[-2] in suffixes
	                and len(labels[-1]) == 2
	            )
	            if is_trusted_tld or is_trusted_second_level:
	                return 0.9

	            if self._host_in_domains(host, self._HIGH_RELIABILITY_DOMAINS):
	                return 0.9

	            if self._host_in_domains(host, self._MEDIUM_RELIABILITY_DOMAINS):
	                return 0.7

	        # Default moderate reliability for unknown sources
	        return 0.5

	    def get_most_reliable(self, n: int = 3) -> list[Citation]:
	        """Get the n most reliable citations.

	        Args:
	            n: Number of citations to return; values below 1 yield an
	                empty list

	        Returns:
	            List of most reliable citations, highest score first (ties keep
	            insertion order)
	        """
	        sorted_citations = sorted(
	            self._citations,
	            key=lambda c: c.reliability_score,
	            reverse=True,
	        )
	        # Clamp so a negative n cannot turn into a "drop from the end" slice.
	        return sorted_citations[: max(n, 0)]