Spaces:

thinkwee
/

BibGuard

Running

thinkwee

init

46df5f0 10 days ago

7.69 kB

	"""
	LaTeX file parser for citation extraction.
	"""
	import re
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Optional


	@dataclass
	class CitationContext:
	    """One occurrence of a citation key together with its surrounding text.

	    Field order is part of the public interface (the dataclass-generated
	    ``__init__`` accepts the fields positionally in this order).
	    """

	    # Citation key as written inside the braces of the citation command.
	    key: str
	    # Line number of the citing line; the parser in this file counts from 1.
	    line_number: int
	    # Citation command including the leading backslash,
	    # e.g., \cite, \citep, \citet
	    command: str
	    # Text before the citation.
	    context_before: str
	    # Text after the citation.
	    context_after: str
	    # Full surrounding context (before + citing line + after).
	    full_context: str
	    # The raw, unmodified line containing the citation.
	    raw_line: str
	    # Path of the .tex file the citation came from; None when the source
	    # file is not known (e.g. content parsed directly from a string).
	    file_path: Optional[str] = None


	class TexParser:
	    """Parser that extracts citation commands and their context from .tex text.

	    Call ``parse_file`` or ``parse_content`` first; afterwards the results can
	    be queried with ``is_cited``, ``get_citation_contexts`` and
	    ``get_all_cited_keys``. Each parse call replaces the results of the
	    previous one.
	    """

	    # Citation command patterns. Group 1 of every pattern is the
	    # comma-separated key list between the braces.
	    # NOTE(review): nothing in this class consults CITE_PATTERNS; matching is
	    # done with CITE_REGEX below, which only covers commands that start with
	    # "\cite" (so biblatex forms such as \autocite are not picked up by
	    # parse_content) -- confirm whether that is intended.
	    CITE_PATTERNS = [
	        # Standard citation commands
	        r'\\cite(?:p|t|alp|alt|author|year|yearpar)?\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
	        # natbib commands (up to two optional arguments)
	        r'\\citep?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
	        r'\\citet?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
	        # biblatex commands
	        r'\\(?:auto|text|paren|foot|super)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
	        r'\\(?:full|short)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
	    ]

	    # Compiled pattern for finding any \cite-family command.
	    # Group 1: command name without backslash or star; group 2: key list.
	    # Accepts an optional star and up to two optional [..] arguments.
	    CITE_REGEX = re.compile(
	        r'\\(cite[a-z]*)\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
	        re.IGNORECASE
	    )

	    # An unescaped % starts a comment that runs to the end of the line;
	    # the lookbehind keeps the escaped form (backslash + %) intact.
	    _COMMENT_REGEX = re.compile(r'(?<!\\)%.*$')

	    # A command name, optional star and any number of [..] arguments,
	    # plus trailing whitespace. Brace arguments are left in place so their
	    # text content survives cleaning.
	    _COMMAND_REGEX = re.compile(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])*\s*')

	    # Sentence boundary: whitespace that follows ., ! or ?
	    _SENTENCE_SPLIT_REGEX = re.compile(r'(?<=[.!?])\s+')

	    def __init__(self):
	        # key -> every CitationContext recorded for that key
	        self.citations: dict[str, list[CitationContext]] = {}
	        # every distinct key seen in the last parse
	        self.all_keys: set[str] = set()
	        # raw lines of the last parsed content (index 0 is line 1)
	        self.lines: list[str] = []
	        self.content: str = ""
	        # path handed to the last parse_file call, if any
	        self.current_filepath: Optional[str] = None

	    def parse_file(self, filepath: str) -> "dict[str, list[CitationContext]]":
	        """Parse a .tex file and extract all citations.

	        The file is read as UTF-8; undecodable bytes are replaced rather than
	        raising.

	        Args:
	            filepath: Path of the .tex file to read.

	        Returns:
	            Mapping of citation key to the contexts it was cited in.

	        Raises:
	            FileNotFoundError: If ``filepath`` does not exist.
	        """
	        path = Path(filepath)
	        if not path.exists():
	            raise FileNotFoundError(f"TeX file not found: {filepath}")

	        with open(path, 'r', encoding='utf-8', errors='replace') as f:
	            content = f.read()

	        # Recorded before parsing so every CitationContext carries the path.
	        self.current_filepath = filepath
	        return self.parse_content(content)

	    def parse_content(self, content: str) -> "dict[str, list[CitationContext]]":
	        """Parse tex content and extract citations.

	        Resets ``citations``, ``all_keys``, ``lines`` and ``content``, then
	        scans the text line by line. Lines that are entirely a comment are
	        skipped and inline comments are ignored when matching.

	        Args:
	            content: Full text of a .tex document.

	        Returns:
	            Mapping of citation key to the contexts it was cited in (the same
	            dict that is stored on ``self.citations``).
	        """
	        self.content = content
	        self.lines = content.split('\n')
	        self.citations = {}
	        self.all_keys = set()

	        for line_num, line in enumerate(self.lines, 1):
	            # Skip comment lines
	            if line.strip().startswith('%'):
	                continue

	            # Remove inline comments for matching
	            line_no_comment = self._COMMENT_REGEX.sub('', line)

	            # The context depends only on the line, so it is computed at
	            # most once per line and shared by every key found on it.
	            context = None

	            for match in self.CITE_REGEX.finditer(line_no_comment):
	                command = match.group(1)

	                # Split multiple keys (e.g., \cite{key1, key2})
	                for key in (k.strip() for k in match.group(2).split(',')):
	                    if not key:
	                        continue

	                    if context is None:
	                        context = self._extract_context(line_num)

	                    self.all_keys.add(key)
	                    self.citations.setdefault(key, []).append(
	                        CitationContext(
	                            key=key,
	                            line_number=line_num,
	                            command=f'\\{command}',
	                            context_before=context['before'],
	                            context_after=context['after'],
	                            full_context=context['full'],
	                            raw_line=line,
	                            file_path=self.current_filepath,
	                        )
	                    )

	        return self.citations

	    def _remove_comments(self, content: str) -> str:
	        """Remove LaTeX comments from content.

	        Everything from an unescaped % to the end of its line is dropped;
	        line breaks are preserved, so line numbers do not shift.
	        """
	        return '\n'.join(
	            self._COMMENT_REGEX.sub('', line) for line in content.split('\n')
	        )

	    def _extract_context(self, line_num: int, context_sentences: int = 2) -> dict:
	        """Extract surrounding context for a citation (sentences).

	        Args:
	            line_num: 1-based number of the citing line in ``self.lines``.
	            context_sentences: How many sentences to keep on each side of the
	                citing line. Zero or a negative value keeps none.

	        Returns:
	            Dict with keys ``'before'``, ``'after'`` and ``'full'``; ``'full'``
	            is before + cleaned citing line + after, joined by single spaces.
	        """
	        # Look at a larger window of lines than strictly needed so whole
	        # sentences are available. Because line_num is 1-based, these bounds
	        # give up to 9 lines before the citing line and up to 10 after it.
	        start_line = max(0, line_num - 10)
	        end_line = min(len(self.lines), line_num + 10)

	        current_clean = self._clean_text(self.lines[line_num - 1])
	        before_clean = self._clean_text(' '.join(self.lines[start_line:line_num - 1]))
	        after_clean = self._clean_text(' '.join(self.lines[line_num:end_line]))

	        # Simple sentence splitting on ., ! and ?
	        before_sentences = self._SENTENCE_SPLIT_REGEX.split(before_clean)
	        after_sentences = self._SENTENCE_SPLIT_REGEX.split(after_clean)

	        # Guard the zero case explicitly: a slice of [-0:] would return the
	        # whole list instead of nothing.
	        count = max(context_sentences, 0)
	        context_before = ' '.join(before_sentences[-count:]) if count else ""
	        context_after = ' '.join(after_sentences[:count]) if count else ""

	        # Join only the non-empty parts so no doubled spaces appear.
	        full_context = ' '.join(
	            part for part in (context_before, current_clean, context_after) if part
	        )

	        return {
	            'before': context_before,
	            'after': context_after,
	            'full': full_context
	        }

	    def _clean_text(self, text: str) -> str:
	        """Clean LaTeX text for readability.

	        Command names (with optional star and [..] arguments) are replaced by
	        a space, braces are removed so their content remains, and runs of
	        whitespace are collapsed to single spaces.
	        """
	        # Remove common LaTeX commands but keep text content
	        text = self._COMMAND_REGEX.sub(' ', text)
	        # Remove braces
	        text = re.sub(r'[{}]', '', text)
	        # Normalize whitespace
	        text = re.sub(r'\s+', ' ', text)
	        return text.strip()

	    def is_cited(self, key: str) -> bool:
	        """Check if a key is cited in the document."""
	        return key in self.all_keys

	    def get_citation_contexts(self, key: str) -> "list[CitationContext]":
	        """Get all citation contexts for a key (empty list if never cited)."""
	        return self.citations.get(key, [])

	    def get_all_cited_keys(self) -> set[str]:
	        """Get all citation keys found in the document.

	        Returns a copy, so callers may modify it freely.
	        """
	        return self.all_keys.copy()