"""
LaTeX file parser for citation extraction.
"""
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class CitationContext:
    """Represents a single occurrence of a citation key with its context."""
    key: str
    line_number: int  # 1-indexed line of the citation in the parsed content
    command: str  # e.g., \cite, \citep, \citet
    context_before: str  # Cleaned sentences preceding the citation line
    context_after: str  # Cleaned sentences following the citation line
    full_context: str  # before + cleaned citation line + after
    raw_line: str  # The raw line containing the citation
    file_path: Optional[str] = None  # Path given to parse_file, if any


class TexParser:
    """Parser for .tex files.

    Scans LaTeX source line by line, records every citation key it finds
    and keeps a cleaned-up textual context for each occurrence.
    """

    # Citation command patterns (kept for reference / external users;
    # the parser itself matches with CITE_REGEX below).
    CITE_PATTERNS = [
        # Standard citation commands
        r'\\cite(?:p|t|alp|alt|author|year|yearpar)?\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # natbib commands
        r'\\citep?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\citet?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # biblatex commands
        r'\\(?:auto|text|paren|foot|super)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\(?:full|short)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
    ]

    # Compiled pattern for finding any citation.
    # Group 1: command name without the backslash; group 2: the key list.
    # The optional prefix covers the biblatex commands listed in
    # CITE_PATTERNS (\autocite, \textcite, ...), which a bare "\cite..."
    # prefix would silently miss.
    CITE_REGEX = re.compile(
        r'\\((?:auto|text|paren|foot|super|full|short)?cite[a-z]*)\*?'
        r'\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        re.IGNORECASE
    )

    # An unescaped % starts a comment that runs to the end of the line.
    _COMMENT_REGEX = re.compile(r'(?<!\\)%.*$')

    # Sentence boundary: whitespace that follows ., ! or ?
    _SENTENCE_SPLIT_REGEX = re.compile(r'(?<=[.!?])\s+')

    def __init__(self):
        self.citations: dict[str, list[CitationContext]] = {}
        self.all_keys: set[str] = set()
        self.lines: list[str] = []
        self.content: str = ""
        self.current_filepath: Optional[str] = None

    def parse_file(self, filepath: str) -> dict[str, list[CitationContext]]:
        """Parse a .tex file and extract all citations.

        Args:
            filepath: Path of the .tex file to read (decoded as UTF-8,
                undecodable bytes are replaced).

        Returns:
            Mapping of citation key to the contexts in which it occurs.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"TeX file not found: {filepath}")

        content = path.read_text(encoding='utf-8', errors='replace')

        self.current_filepath = filepath
        return self.parse_content(content)

    def parse_content(self, content: str) -> dict[str, list[CitationContext]]:
        """Parse tex content and extract citations.

        Resets any previously collected citations, then scans ``content``
        line by line. Full-line comments are skipped and inline comments
        are ignored when matching.

        Args:
            content: Raw LaTeX source.

        Returns:
            Mapping of citation key to the contexts in which it occurs.
        """
        self.content = content
        self.lines = content.split('\n')
        self.citations = {}
        self.all_keys = set()

        # NOTE(review): the matching loop below was reconstructed from a
        # truncated source; confirm details (e.g. the leading backslash
        # stored in `command`) against version control.
        for line_num, line in enumerate(self.lines, 1):
            # Skip comment lines
            if line.strip().startswith('%'):
                continue

            # Remove inline comments for matching
            searchable = self._COMMENT_REGEX.sub('', line)

            # The context depends only on the line, so compute it at most
            # once per line, and only if the line actually cites something.
            context = None

            for match in self.CITE_REGEX.finditer(searchable):
                command = f'\\{match.group(1)}'

                # One command may cite several comma-separated keys.
                for key in (part.strip() for part in match.group(2).split(',')):
                    if not key:
                        continue

                    if context is None:
                        context = self._extract_context(line_num)

                    self.all_keys.add(key)
                    self.citations.setdefault(key, []).append(
                        CitationContext(
                            key=key,
                            line_number=line_num,
                            command=command,
                            context_before=context['before'],
                            context_after=context['after'],
                            full_context=context['full'],
                            raw_line=line,
                            file_path=self.current_filepath,
                        )
                    )

        return self.citations

    def _remove_comments(self, content: str) -> str:
        """Remove LaTeX comments from content.

        Everything from an unescaped ``%`` to the end of its line is
        dropped; escaped percent signs are kept. Line structure is
        preserved.
        """
        # NOTE(review): the tail of this method was reconstructed from a
        # truncated source; confirm against version control.
        return '\n'.join(
            self._COMMENT_REGEX.sub('', line) for line in content.split('\n')
        )

    def _extract_context(self, line_num: int, context_sentences: int = 2) -> dict:
        """Extract surrounding context for a citation (sentences).

        Args:
            line_num: 1-indexed line number of the citation in ``self.lines``.
            context_sentences: Maximum number of sentences to keep on each
                side of the citation line; values <= 0 yield no
                surrounding context.
                NOTE(review): the default value was not visible in the
                truncated source and is assumed to be 2 - confirm.

        Returns:
            Dict with keys ``'before'``, ``'after'`` and ``'full'``.
        """
        # Look at a window of nearby lines so whole sentences are captured.
        start_line = max(0, line_num - 10)
        end_line = min(len(self.lines), line_num + 10)

        current_raw_line = self.lines[line_num - 1]
        before_lines = self.lines[start_line:line_num - 1]
        after_lines = self.lines[line_num:end_line]

        current_clean = self._clean_text(current_raw_line)
        before_clean = self._clean_text(' '.join(before_lines))
        after_clean = self._clean_text(' '.join(after_lines))

        if context_sentences > 0:
            # Simple sentence splitting on ., ! and ?
            before_sentences = self._SENTENCE_SPLIT_REGEX.split(before_clean)
            after_sentences = self._SENTENCE_SPLIT_REGEX.split(after_clean)
            context_before = ' '.join(before_sentences[-context_sentences:])
            context_after = ' '.join(after_sentences[:context_sentences])
        else:
            # Guard explicitly: a slice of [-0:] would return *every*
            # preceding sentence instead of none.
            context_before = ""
            context_after = ""

        # Join only the non-empty parts so no doubled spaces appear.
        full_context = ' '.join(
            part for part in (context_before, current_clean, context_after) if part
        )

        return {
            'before': context_before,
            'after': context_after,
            'full': full_context
        }

    def _clean_text(self, text: str) -> str:
        """Clean LaTeX text for readability.

        Drops command names (with any bracketed options) and braces while
        keeping the brace contents, then collapses whitespace.
        """
        # Remove common LaTeX commands but keep text content
        text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])*\s*', ' ', text)
        # Remove braces
        text = re.sub(r'[{}]', '', text)
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def is_cited(self, key: str) -> bool:
        """Check if a key is cited in the document."""
        return key in self.all_keys

    def get_citation_contexts(self, key: str) -> list[CitationContext]:
        """Get all citation contexts for a key (empty list if uncited)."""
        return self.citations.get(key, [])

    def get_all_cited_keys(self) -> set[str]:
        """Get a copy of all citation keys found in the document."""
        return self.all_keys.copy()