File size: 7,691 Bytes
46df5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""
LaTeX file parser for citation extraction.
"""
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class CitationContext:
    r"""One occurrence of a citation key together with its surrounding text.

    Attributes:
        key: The citation key.
        line_number: Line number on which the citation appears.
        command: Citation command used, e.g. \cite, \citep, \citet.
        context_before: Text before the citation.
        context_after: Text after the citation.
        full_context: Full surrounding context.
        raw_line: The raw line containing the citation.
        file_path: Optional path of the file the citation came from;
            defaults to None.
    """

    key: str
    line_number: int
    command: str
    context_before: str
    context_after: str
    full_context: str
    raw_line: str
    file_path: Optional[str] = None


class TexParser:
    """Parser for .tex files that collects citation keys and their context.

    After a call to parse_file() or parse_content() the instance holds:

    * citations: mapping of citation key -> list of CitationContext, one
      entry per occurrence of the key.
    * all_keys: set of every citation key found.
    * lines / content: the most recently parsed text (split on newlines /
      as a single string).
    * current_filepath: the path given to the last parse_file() call; it is
      attached to every CitationContext that is created.
    """

    # Citation command patterns (kept for reference / external users; the
    # parser itself matches with CITE_REGEX below).
    CITE_PATTERNS = [
        # Standard citation commands
        r'\\cite(?:p|t|alp|alt|author|year|yearpar)?\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # natbib commands
        r'\\citep?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\citet?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # biblatex commands
        r'\\(?:auto|text|paren|foot|super)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\(?:full|short)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
    ]

    # Compiled pattern for finding any citation.
    # Group 1 is the command name (without backslash), group 2 the raw key
    # list.  The optional prefix covers the biblatex commands listed in
    # CITE_PATTERNS (\autocite, \textcite, \parencite, \footcite, \supercite,
    # \fullcite, \shortcite), which a bare "cite..." match would miss.
    CITE_REGEX = re.compile(
        r'\\((?:auto|text|paren|foot|super|full|short)?cite[a-z]*)'
        r'\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        re.IGNORECASE
    )

    # A '%' that is not preceded by a backslash starts a comment that runs
    # to the end of the line.
    _COMMENT_REGEX = re.compile(r'(?<!\\)%.*$')
    # Sentence boundary: whitespace that follows '.', '!' or '?'.
    _SENTENCE_SPLIT_REGEX = re.compile(r'(?<=[.!?])\s+')
    # A LaTeX command name, optional star and any [..] optional arguments.
    _COMMAND_REGEX = re.compile(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])*\s*')
    _BRACE_REGEX = re.compile(r'[{}]')
    _WHITESPACE_REGEX = re.compile(r'\s+')

    def __init__(self):
        self.citations: dict[str, list["CitationContext"]] = {}
        self.all_keys: set[str] = set()
        self.lines: list[str] = []
        self.content: str = ""
        self.current_filepath: Optional[str] = None

    def parse_file(self, filepath: str) -> dict[str, list["CitationContext"]]:
        """Parse a .tex file and extract all citations.

        Args:
            filepath: Path of the .tex file to read (decoded as UTF-8,
                undecodable bytes are replaced).

        Returns:
            Mapping of citation key -> list of CitationContext.

        Raises:
            FileNotFoundError: If filepath does not exist.
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"TeX file not found: {filepath}")

        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        # Recorded before parsing so every CitationContext carries the path.
        self.current_filepath = filepath
        return self.parse_content(content)

    def parse_content(self, content: str) -> dict[str, list["CitationContext"]]:
        """Parse tex content and extract citations.

        Resets citations / all_keys, then scans the content line by line.
        Lines that start with '%' are skipped and inline comments are
        ignored when matching.  current_filepath is left untouched.

        Args:
            content: Full text of a LaTeX document.

        Returns:
            Mapping of citation key -> list of CitationContext (the same
            dict that is stored on self.citations).
        """
        self.content = content
        self.lines = content.split('\n')
        self.citations = {}
        self.all_keys = set()

        for line_num, line in enumerate(self.lines, 1):
            # Skip lines that are entirely a comment
            if line.lstrip().startswith('%'):
                continue

            # Drop any inline comment so commented-out citations are ignored
            line_no_comment = self._COMMENT_REGEX.sub('', line)

            # The context depends only on the line, so it is computed at
            # most once per line (lazily, on the first key found).
            context = None

            for match in self.CITE_REGEX.finditer(line_no_comment):
                command = f'\\{match.group(1)}'

                # A single command may cite several keys: \cite{key1, key2}
                for key in (k.strip() for k in match.group(2).split(',')):
                    if not key:
                        continue

                    if context is None:
                        context = self._extract_context(line_num)

                    self.all_keys.add(key)
                    self.citations.setdefault(key, []).append(
                        CitationContext(
                            key=key,
                            line_number=line_num,
                            command=command,
                            context_before=context['before'],
                            context_after=context['after'],
                            full_context=context['full'],
                            raw_line=line,
                            file_path=self.current_filepath,
                        )
                    )

        return self.citations

    def _remove_comments(self, content: str) -> str:
        """Remove LaTeX comments from content.

        Everything from an unescaped '%' to the end of each line is removed;
        an escaped percent sign (backslash + '%') is kept.  Line count is
        preserved.
        """
        return '\n'.join(
            self._COMMENT_REGEX.sub('', line) for line in content.split('\n')
        )

    def _extract_context(self, line_num: int, context_sentences: int = 2) -> dict:
        """Extract surrounding context for a citation (sentences).

        Args:
            line_num: 1-based index into self.lines of the citing line.
            context_sentences: Number of sentences to keep on each side of
                the citing line.  Zero or a negative value keeps none.

        Returns:
            Dict with keys 'before', 'after' and 'full': the cleaned text
            preceding the line, following the line, and both joined around
            the cleaned citing line.
        """
        # Look at a window of neighbouring lines (up to 9 before and 10
        # after the citing line) so that whole sentences can be recovered.
        start_line = max(0, line_num - 10)
        end_line = min(len(self.lines), line_num + 10)

        current_clean = self._clean_text(self.lines[line_num - 1])
        before_clean = self._clean_text(' '.join(self.lines[start_line:line_num - 1]))
        after_clean = self._clean_text(' '.join(self.lines[line_num:end_line]))

        if context_sentences > 0:
            before_sentences = self._SENTENCE_SPLIT_REGEX.split(before_clean)
            after_sentences = self._SENTENCE_SPLIT_REGEX.split(after_clean)
            context_before = ' '.join(before_sentences[-context_sentences:])
            context_after = ' '.join(after_sentences[:context_sentences])
        else:
            # Guard: seq[-0:] would return *every* sentence, not none.
            context_before = ""
            context_after = ""

        # Join only the non-empty parts to avoid stray double spaces.
        full_context = ' '.join(
            part for part in (context_before, current_clean, context_after) if part
        )

        return {
            'before': context_before,
            'after': context_after,
            'full': full_context
        }

    def _clean_text(self, text: str) -> str:
        """Clean LaTeX text for readability.

        Replaces command names (with optional star and [..] arguments) by a
        space, drops braces while keeping their content, and collapses runs
        of whitespace.
        """
        text = self._COMMAND_REGEX.sub(' ', text)
        text = self._BRACE_REGEX.sub('', text)
        text = self._WHITESPACE_REGEX.sub(' ', text)
        return text.strip()

    def is_cited(self, key: str) -> bool:
        """Check if a key is cited in the most recently parsed document."""
        return key in self.all_keys

    def get_citation_contexts(self, key: str) -> list["CitationContext"]:
        """Get all citation contexts for a key (empty list if not cited)."""
        return self.citations.get(key, [])

    def get_all_cited_keys(self) -> set[str]:
        """Get a copy of the set of citation keys found in the document."""
        return self.all_keys.copy()