"""Citation management for source tracking."""

from __future__ import annotations

import html
from dataclasses import dataclass, field
from typing import Any
from urllib.parse import urlparse

# Reliability scores handed out by CitationManager._assess_reliability.
_SCORE_HIGH = 0.9
_SCORE_MEDIUM = 0.7
_SCORE_DEFAULT = 0.5
_SCORE_UNPARSEABLE = 0.3

# Top-level labels treated as high reliability. They are also accepted as
# the second-level label under a two-letter country code (e.g. "gov.uk").
_HIGH_RELIABILITY_TLDS = frozenset({"gov", "edu"})

# Registered domains treated as high reliability (subdomains included).
_HIGH_RELIABILITY_DOMAINS = (
    "wikipedia.org",
    "bbc.com",
    "reuters.com",
    "apnews.com",
    "nature.com",
    "sciencedirect.com",
    "pubmed.gov",
    "nytimes.com",
    "wsj.com",
    "economist.com",
)

# Registered domains treated as medium reliability (subdomains included).
_MEDIUM_RELIABILITY_DOMAINS = (
    "medium.com",
    "techcrunch.com",
    "wired.com",
    "arstechnica.com",
    "theverge.com",
    "forbes.com",
    "bloomberg.com",
)


def _host_matches(host: str, domains: tuple[str, ...]) -> bool:
    """Return True if *host* equals, or is a subdomain of, any of *domains*."""
    return any(host == d or host.endswith("." + d) for d in domains)


def _has_trusted_tld(host: str) -> bool:
    """Return True if *host* sits under a trusted TLD such as .gov or .edu."""
    labels = host.split(".")
    if len(labels) < 2:
        return False
    if labels[-1] in _HIGH_RELIABILITY_TLDS:
        return True
    # Country-code form: "<name>.gov.uk", "<name>.edu.au", ...
    country_code = labels[-1]
    return (
        len(country_code) == 2
        and country_code.isalpha()
        and labels[-2] in _HIGH_RELIABILITY_TLDS
    )


@dataclass
class Citation:
    """A source citation.

    Attributes:
        title: Source title.
        url: Source URL.
        snippet: Relevant text snippet from the source.
        accessed_at: Access timestamp; empty unless the caller sets it.
        reliability_score: Reliability estimate in the range 0.0 - 1.0.
    """

    title: str
    url: str
    snippet: str = ""
    accessed_at: str = ""
    reliability_score: float = 0.5


class CitationManager:
    """Manages citations and source tracking.

    Citations are stored in insertion order and addressed by a 1-based
    index. Adding the same URL twice yields the index of the first entry.
    """

    def __init__(self) -> None:
        """Initialize the citation manager."""
        self._citations: list[Citation] = []
        # Maps URL -> 1-based citation index, used for de-duplication.
        self._url_index: dict[str, int] = {}

    def add_citation(
        self,
        title: str,
        url: str,
        snippet: str = "",
    ) -> int:
        """Add a citation and return its index.

        If the URL was already added, the existing index is returned and
        the stored citation is left untouched.

        Args:
            title: Source title
            url: Source URL
            snippet: Relevant text snippet

        Returns:
            Citation index (1-based)
        """
        existing = self._url_index.get(url)
        if existing is not None:
            return existing

        citation = Citation(
            title=title,
            url=url,
            snippet=snippet,
            reliability_score=self._assess_reliability(url),
        )
        self._citations.append(citation)

        index = len(self._citations)
        self._url_index[url] = index
        return index

    def get_citation(self, index: int) -> Citation | None:
        """Get a citation by index.

        Args:
            index: Citation index (1-based)

        Returns:
            Citation or None if not found
        """
        if 1 <= index <= len(self._citations):
            return self._citations[index - 1]
        return None

    def get_all_citations(self) -> list[Citation]:
        """Get all citations.

        Returns:
            A new list holding all citations in insertion order
        """
        return list(self._citations)

    def format_inline(self, index: int) -> str:
        """Format an inline citation reference.

        Args:
            index: Citation index

        Returns:
            Formatted inline citation [n]
        """
        return f"[{index}]"

    def format_bibliography(self, style: str = "markdown") -> str:
        """Format all citations as a bibliography.

        Args:
            style: Output style (markdown, plain, html)

        Returns:
            Formatted bibliography, or an empty string when there are no
            citations or the style is not recognized
        """
        if not self._citations:
            return ""

        lines: list[str] = []

        if style == "markdown":
            lines.append("**Sources:**")
            for i, cite in enumerate(self._citations, 1):
                lines.append(f"[{i}] [{cite.title}]({cite.url})")

        elif style == "plain":
            lines.append("Sources:")
            for i, cite in enumerate(self._citations, 1):
                lines.append(f"{i}. {cite.title}")
                lines.append(f" {cite.url}")

        elif style == "html":
            # NOTE(review): the markup in this branch was reconstructed
            # (wrapper, heading, ordered list of links); confirm the exact
            # tags/attributes against any consumer that styles the output.
            lines.append("<div>")
            lines.append("<h3>Sources:</h3>")
            lines.append("<ol>")
            for cite in self._citations:
                # Titles and URLs come from external sources: escape them
                # so they cannot break out of the attribute or inject tags.
                href = html.escape(cite.url, quote=True)
                label = html.escape(cite.title, quote=True)
                lines.append(f'<li><a href="{href}">{label}</a></li>')
            lines.append("</ol>")
            lines.append("</div>")

        return "\n".join(lines)

    def to_dict_list(self) -> list[dict[str, Any]]:
        """Convert citations to list of dictionaries.

        Returns:
            List of citation dictionaries with the keys ``title``, ``url``,
            ``snippet`` and ``reliability_score``
        """
        return [
            {
                "title": cite.title,
                "url": cite.url,
                "snippet": cite.snippet,
                "reliability_score": cite.reliability_score,
            }
            for cite in self._citations
        ]

    def clear(self) -> None:
        """Clear all citations."""
        self._citations.clear()
        self._url_index.clear()

    def _assess_reliability(self, url: str) -> float:
        """Assess reliability of a source based on URL.

        The host name is compared against the known domains at DNS-label
        boundaries, so ``en.wikipedia.org`` matches ``wikipedia.org`` while
        ``notwikipedia.org`` and ``wikipedia.org.example.com`` do not.

        Args:
            url: Source URL

        Returns:
            Reliability score (0.0 - 1.0): 0.9 for high-reliability hosts,
            0.7 for medium, 0.5 for unknown hosts or URLs without a host,
            0.3 when the URL cannot be parsed
        """
        try:
            # .hostname is lower-cased and excludes userinfo and port.
            hostname = urlparse(url).hostname
        except (AttributeError, TypeError, ValueError):
            return _SCORE_UNPARSEABLE

        if not isinstance(hostname, str) or not hostname:
            return _SCORE_DEFAULT

        # A fully-qualified name may carry a trailing dot.
        host = hostname.rstrip(".")

        if _has_trusted_tld(host) or _host_matches(
            host, _HIGH_RELIABILITY_DOMAINS
        ):
            return _SCORE_HIGH
        if _host_matches(host, _MEDIUM_RELIABILITY_DOMAINS):
            return _SCORE_MEDIUM

        # Default moderate reliability for unknown sources
        return _SCORE_DEFAULT

    def get_most_reliable(self, n: int = 3) -> list[Citation]:
        """Get the n most reliable citations.

        Citations with equal scores keep their insertion order.

        Args:
            n: Number of citations to return

        Returns:
            List of most reliable citations; empty when ``n`` is not positive
        """
        if n <= 0:
            # A negative slice bound would otherwise drop items from the end.
            return []
        ranked = sorted(
            self._citations,
            key=lambda c: c.reliability_score,
            reverse=True,
        )
        return ranked[:n]