|
|
"""
LaTeX file parser for citation extraction.
"""
|
|
import re |
|
|
from dataclasses import dataclass |
|
|
from pathlib import Path |
|
|
from typing import Optional |
|
|
|
|
|
|
|
|
@dataclass
class CitationContext:
    """A single occurrence of a citation key together with its surroundings.

    One instance is created per key per citation command, so a command
    that cites several keys yields several instances sharing the same
    line number, command and context.
    """

    # Citation key exactly as written between the braces (whitespace-trimmed).
    key: str
    # 1-based line number of the line containing the citation command.
    line_number: int
    # Citation command including its leading backslash.
    command: str
    # Cleaned text preceding the citing line.
    context_before: str
    # Cleaned text following the citing line.
    context_after: str
    # Preceding context, the cleaned citing line, and following context joined.
    full_context: str
    # The citing line as it appears in the source, unmodified.
    raw_line: str
    # Path of the file the citation came from, when known.
    file_path: Optional[str] = None
|
|
|
|
|
|
|
|
class TexParser:
    """Parser for .tex files that collects citation keys and their context.

    The parser works line by line: comments are stripped, every citation
    command on the line is located with ``CITE_REGEX``, and each key in the
    command's brace argument is recorded as a ``CitationContext``.

    Attributes:
        citations: Maps each citation key to the list of its occurrences.
        all_keys: Every citation key seen in the most recent parse.
        lines: The parsed content split on ``'\\n'``.
        content: The raw content handed to the most recent parse.
        current_filepath: Path given to the most recent ``parse_file`` call
            (``None`` until then). ``parse_content`` does not reset it.
    """

    # Reference list of the citation command families the parser is meant to
    # support. Matching itself is done with CITE_REGEX below; this list is
    # kept for callers that may read it.
    CITE_PATTERNS = [
        # \cite and suffixed variants with one optional argument.
        r'\\cite(?:p|t|alp|alt|author|year|yearpar)?\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # \cite/\citep and \cite/\citet with up to two optional arguments.
        r'\\citep?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\citet?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # Prefixed variants such as \autocite or \fullcite.
        r'\\(?:auto|text|paren|foot|super)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\(?:full|short)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
    ]

    # Single pattern used for extraction. Group 1 is the command name without
    # the backslash, group 2 the raw (comma-separated) key list. The optional
    # prefix covers the prefixed commands listed in CITE_PATTERNS, which a
    # bare ``cite[a-z]*`` would never match.
    CITE_REGEX = re.compile(
        r'\\((?:auto|text|paren|foot|super|full|short)?cite[a-z]*)\*?'
        r'\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        re.IGNORECASE
    )

    # A '%' starts a comment unless it is escaped. An even run of backslashes
    # before it (e.g. the line break ``\\``) does not escape it, so that run
    # is captured in group 1 and put back by the substitution.
    _COMMENT_REGEX = re.compile(r'(?<!\\)((?:\\\\)*)%.*$')

    def __init__(self):
        """Create a parser with empty results."""
        self.citations: dict[str, list[CitationContext]] = {}
        self.all_keys: set[str] = set()
        self.lines: list[str] = []
        self.content: str = ""
        self.current_filepath: Optional[str] = None

    def parse_file(self, filepath: str) -> dict[str, list[CitationContext]]:
        """Parse a .tex file and extract all citations.

        Args:
            filepath: Path of the file to read. It is decoded as UTF-8 and
                undecodable bytes are replaced rather than raising.

        Returns:
            Mapping from citation key to its occurrences (``self.citations``).

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"TeX file not found: {filepath}")

        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        # Set before parsing so each CitationContext records its source file.
        self.current_filepath = filepath
        return self.parse_content(content)

    def parse_content(self, content: str) -> dict[str, list[CitationContext]]:
        """Parse tex content and extract citations.

        Results of any previous parse are discarded. Citation commands are
        matched within a single line only.

        Args:
            content: Full LaTeX source text.

        Returns:
            Mapping from citation key to its occurrences (``self.citations``).
        """
        self.content = content
        self.lines = content.split('\n')
        self.citations = {}
        self.all_keys = set()

        for line_num, line in enumerate(self.lines, 1):
            # Fast path: the whole line is a comment.
            if line.strip().startswith('%'):
                continue

            code = self._strip_comment(line)

            # The context depends only on the line, so build it at most once
            # per line and only if the line actually cites something.
            context = None

            for match in self.CITE_REGEX.finditer(code):
                command = match.group(1)

                # One command may cite several comma-separated keys.
                for key in (k.strip() for k in match.group(2).split(',')):
                    if not key:
                        continue

                    if context is None:
                        context = self._extract_context(line_num)

                    self.all_keys.add(key)
                    self.citations.setdefault(key, []).append(
                        CitationContext(
                            key=key,
                            line_number=line_num,
                            command=f'\\{command}',
                            context_before=context['before'],
                            context_after=context['after'],
                            full_context=context['full'],
                            raw_line=line,
                            file_path=self.current_filepath
                        )
                    )

        return self.citations

    def _strip_comment(self, line: str) -> str:
        """Return a single line with its trailing LaTeX comment removed."""
        return self._COMMENT_REGEX.sub(r'\1', line)

    def _remove_comments(self, content: str) -> str:
        """Remove LaTeX comments from content.

        Each line is processed independently; line breaks are preserved, so
        line numbers in the result match those of ``content``.
        """
        return '\n'.join(
            self._strip_comment(line) for line in content.split('\n')
        )

    def _extract_context(self, line_num: int, context_sentences: int = 2) -> dict:
        """Extract surrounding context for a citation (sentences).

        Looks at up to 9 lines before and 10 lines after the citing line in
        ``self.lines``, cleans them with ``_clean_text`` and keeps the
        nearest sentences on each side.

        Args:
            line_num: 1-based number of the citing line.
            context_sentences: How many sentences to keep on each side.
                Values of zero or less yield empty before/after context.

        Returns:
            Dict with keys ``'before'``, ``'after'`` and ``'full'``; ``'full'``
            is before + cleaned citing line + after, stripped.
        """
        start_line = max(0, line_num - 10)
        end_line = min(len(self.lines), line_num + 10)

        current_clean = self._clean_text(self.lines[line_num - 1])
        before_clean = self._clean_text(
            ' '.join(self.lines[start_line:line_num - 1])
        )
        after_clean = self._clean_text(
            ' '.join(self.lines[line_num:end_line])
        )

        def split_sentences(text):
            # Split after sentence-ending punctuation followed by whitespace.
            return re.split(r'(?<=[.!?])\s+', text)

        if context_sentences > 0:
            before_sentences = split_sentences(before_clean)[-context_sentences:]
            after_sentences = split_sentences(after_clean)[:context_sentences]
        else:
            # Guard explicitly: seq[-0:] would return the whole list.
            before_sentences = []
            after_sentences = []

        context_before = ' '.join(before_sentences)
        context_after = ' '.join(after_sentences)
        full_context = f"{context_before} {current_clean} {context_after}".strip()

        return {
            'before': context_before,
            'after': context_after,
            'full': full_context
        }

    def _clean_text(self, text: str) -> str:
        """Clean LaTeX text for readability.

        Command names (with any bracketed optional arguments) are dropped,
        braces are removed so brace arguments stay as plain text, and runs
        of whitespace collapse to a single space.
        """
        # Drop "\command", an optional "*", and any "[...]" arguments.
        text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])*\s*', ' ', text)
        # Keep the content of brace arguments but not the braces.
        text = re.sub(r'[{}]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def is_cited(self, key: str) -> bool:
        """Check if a key is cited in the document."""
        return key in self.all_keys

    def get_citation_contexts(self, key: str) -> list[CitationContext]:
        """Get all citation contexts for a key (empty list if never cited)."""
        return self.citations.get(key, [])

    def get_all_cited_keys(self) -> set[str]:
        """Get all citation keys found in the document.

        Returns a copy, so mutating the result does not affect the parser.
        """
        return self.all_keys.copy()
|
|
|