# Snapshot metadata: commit 46df5f0, file size 7,691 bytes.
"""
LaTeX file parser for citation extraction.
"""
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
@dataclass
class CitationContext:
    """Represents a single occurrence of a citation together with its context.

    Attributes:
        key: The citation key as written inside the braces of the command.
        line_number: 1-based line number of the line holding the citation.
        command: The citation command, including its leading backslash.
        context_before: Cleaned text preceding the citation line.
        context_after: Cleaned text following the citation line.
        full_context: Cleaned text before, on and after the citation line.
        raw_line: The unmodified source line containing the citation.
        file_path: Path of the file the citation came from, if known.
    """

    key: str
    line_number: int
    command: str  # e.g., \cite, \citep, \citet
    context_before: str
    context_after: str
    full_context: str
    raw_line: str
    file_path: Optional[str] = None
class TexParser:
    """Parser for .tex files.

    Scans LaTeX source line by line for cite-family commands and records,
    for every cited key, where it occurs and a cleaned excerpt of the
    surrounding text. The results of the most recent parse are kept on the
    instance (``citations``, ``all_keys``, ``lines`` and ``content``).
    """

    # Citation command patterns.
    # NOTE: not referenced by the methods of this class; matching is done
    # with CITE_REGEX below. Kept unchanged for any external users.
    CITE_PATTERNS = [
        # Standard citation commands
        r'\\cite(?:p|t|alp|alt|author|year|yearpar)?\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # natbib commands
        r'\\citep?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\citet?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # biblatex commands
        r'\\(?:auto|text|paren|foot|super)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\(?:full|short)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
    ]

    # Compiled pattern for finding any citation.
    # Group 1: command name without backslash or star; group 2: the raw,
    # comma-separated key list. The optional prefix covers the biblatex
    # commands listed in CITE_PATTERNS (\autocite, \parencite, ...), which a
    # bare "cite..." pattern would miss. Up to two optional [...] arguments
    # are skipped before the braces.
    CITE_REGEX = re.compile(
        r'\\((?:auto|text|paren|foot|super|full|short)?cite[a-z]*)\*?'
        r'\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        re.IGNORECASE
    )

    # A '%' starts a comment unless it is escaped, i.e. preceded by an odd
    # number of backslashes. Group 1 captures the (even-length) backslash run
    # directly before the '%' so that it can be put back: in "\\% note" the
    # "\\" is a line break and must survive, only "% note" is the comment.
    _COMMENT_REGEX = re.compile(r'(?<!\\)((?:\\\\)*)%.*$')

    def __init__(self):
        # Maps each citation key to all of its occurrences, in document order.
        self.citations: "dict[str, list[CitationContext]]" = {}
        # Every distinct key seen in the last parse.
        self.all_keys: set[str] = set()
        # The last parsed content, split on '\n' (index 0 is line 1).
        self.lines: list[str] = []
        # The last parsed content, verbatim.
        self.content: str = ""
        # Set by parse_file(); copied into each CitationContext.file_path.
        self.current_filepath: Optional[str] = None

    def parse_file(self, filepath: str) -> "dict[str, list[CitationContext]]":
        """Parse a .tex file and extract all citations.

        The file is read as UTF-8; undecodable bytes are replaced rather
        than raising.

        Args:
            filepath: Path of the .tex file to read.

        Returns:
            Mapping from citation key to the list of its occurrences.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"TeX file not found: {filepath}")

        content = path.read_text(encoding='utf-8', errors='replace')
        self.current_filepath = filepath
        return self.parse_content(content)

    def parse_content(self, content: str) -> "dict[str, list[CitationContext]]":
        """Parse tex content and extract citations.

        Resets ``content``, ``lines``, ``citations`` and ``all_keys``, then
        records one CitationContext per key per citation command found
        outside of comments. ``current_filepath`` is not modified here.

        Args:
            content: Complete LaTeX source text.

        Returns:
            Mapping from citation key to the list of its occurrences (the
            same dict object that is stored in ``self.citations``).
        """
        self.content = content
        self.lines = content.split('\n')
        self.citations = {}
        self.all_keys = set()

        for line_num, line in enumerate(self.lines, 1):
            found = self._find_citations(line)
            if not found:
                continue

            # The context depends only on the line, so compute it once and
            # share it between all keys cited on this line.
            context = self._extract_context(line_num)
            for command, key in found:
                self.all_keys.add(key)
                self.citations.setdefault(key, []).append(
                    CitationContext(
                        key=key,
                        line_number=line_num,
                        command=command,
                        context_before=context['before'],
                        context_after=context['after'],
                        full_context=context['full'],
                        raw_line=line,
                        file_path=self.current_filepath,
                    )
                )

        return self.citations

    def _find_citations(self, line: str) -> list[tuple[str, str]]:
        """Return ``(command, key)`` pairs for every citation on one line.

        Comments are stripped first, so citations inside them are ignored.
        A command citing several comma-separated keys yields one pair per
        non-empty key; ``command`` carries its leading backslash.
        """
        found = []
        for match in self.CITE_REGEX.finditer(self._strip_comment(line)):
            command = f'\\{match.group(1)}'
            for raw_key in match.group(2).split(','):
                key = raw_key.strip()
                if key:
                    found.append((command, key))
        return found

    def _strip_comment(self, line: str) -> str:
        """Return a single line with its trailing LaTeX comment removed."""
        return self._COMMENT_REGEX.sub(r'\1', line)

    def _remove_comments(self, content: str) -> str:
        """Remove LaTeX comments from content.

        Works line by line; escaped percent signs are kept and the line
        structure is preserved (a comment-only line becomes an empty line).
        """
        return '\n'.join(
            self._strip_comment(line) for line in content.split('\n')
        )

    def _extract_context(self, line_num: int, context_sentences: int = 2) -> dict:
        """Extract surrounding context for a citation (sentences).

        Looks at a window of up to 9 lines before and 10 lines after the
        given line in ``self.lines``, strips comments and LaTeX markup, and
        keeps the nearest sentences on each side.

        Args:
            line_num: 1-based number of the line containing the citation.
            context_sentences: Maximum number of sentences to keep on each
                side; values <= 0 yield no surrounding context.

        Returns:
            Dict with keys ``'before'``, ``'after'`` and ``'full'``, where
            ``'full'`` is the before-text, the cleaned citation line and the
            after-text joined by single spaces.
        """
        start_line = max(0, line_num - 10)
        end_line = min(len(self.lines), line_num + 10)

        def clean(lines):
            # Comments are not part of the rendered text, so drop them
            # before removing the markup.
            stripped = (self._strip_comment(item) for item in lines)
            return self._clean_text(' '.join(stripped))

        def split_sentences(text):
            # Simple split after ., ! or ? followed by whitespace.
            return re.split(r'(?<=[.!?])\s+', text)

        current_clean = clean(self.lines[line_num - 1:line_num])
        before_clean = clean(self.lines[start_line:line_num - 1])
        after_clean = clean(self.lines[line_num:end_line])

        if context_sentences > 0:
            # Nearest sentences: the last N before, the first N after.
            context_before = ' '.join(
                split_sentences(before_clean)[-context_sentences:]
            )
            context_after = ' '.join(
                split_sentences(after_clean)[:context_sentences]
            )
        else:
            # Guard: a slice like [-0:] would return every sentence.
            context_before = ""
            context_after = ""

        # Skip empty parts so no doubled or dangling spaces appear.
        full_context = ' '.join(
            part for part in (context_before, current_clean, context_after)
            if part
        )

        return {
            'before': context_before,
            'after': context_after,
            'full': full_context
        }

    def _clean_text(self, text: str) -> str:
        """Clean LaTeX text for readability.

        Removes command names (with an optional star and any bracketed
        optional arguments) while keeping the content of their brace
        arguments, drops all braces and collapses whitespace.
        """
        # Remove common LaTeX commands but keep text content
        text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])*\s*', ' ', text)
        # Remove braces
        text = re.sub(r'[{}]', '', text)
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def is_cited(self, key: str) -> bool:
        """Check if a key is cited in the document."""
        return key in self.all_keys

    def get_citation_contexts(self, key: str) -> "list[CitationContext]":
        """Get all citation contexts for a key (empty list if not cited)."""
        return self.citations.get(key, [])

    def get_all_cited_keys(self) -> set[str]:
        """Get all citation keys found in the document (as a copy)."""
        return self.all_keys.copy()
|