diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..db76649358c20964e976f0e2ba42b6d0a123d256 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/*.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1da09a88cfa0ea5df2e0601f6fd830c9c78ce209
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,62 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual Environments
+venv/
+env/
+.env
+.venv/
+
+# IDEs
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# macOS
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Project Specific Outputs
+*.txt
+*.md
+!README.md
+*_only_used_entry.bib
+
+# LaTeX and Bibliography (User Data)
+# Ignoring these to prevent committing personal paper content
+*.tex
+*.bib
+*.pdf
+*.log
+*.aux
+*.out
+*.bbl
+*.blg
+*.synctex.gz
+*.fls
+*.fdb_latexmk
+
+# cache
+.cache
\ No newline at end of file
diff --git a/README.md b/README.md
index ef3acefd1324f42b9925f5caaceb95456768b90c..855a15eddcb5509cf25fd9766415d966a7dcc9c8 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,198 @@
----
-title: BibGuard
-emoji: ⚡
-colorFrom: pink
-colorTo: purple
-sdk: gradio
-sdk_version: 6.3.0
-app_file: app.py
-pinned: false
-short_description: Automated bibliography verification and LaTeX quality auditi
+# BibGuard: Bibliography & LaTeX Quality Auditor
+
+**BibGuard** is your comprehensive quality assurance tool for academic papers. It validates bibliography entries against real-world databases and checks LaTeX submission quality to catch errors before you submit.
+
+AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and uses advanced LLMs to ensure cited papers actually support your claims.
+
+## 🛡 Why BibGuard?
+
+- **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata
+- **📋 LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, and submission compliance problems
+- **🔒 Safe & Non-Destructive**: Your original files are **never modified** - only detailed reports are generated
+- **🧠 Contextual Relevance**: Ensure cited papers actually discuss what you claim (with LLM)
+- **⚡ Efficiency Boost**: Drastically reduce time needed to manually verify hundreds of citations
+
+## 🚀 Features
+
+### Bibliography Validation
+- **🔍 Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
+- **🤖 AI Relevance Check**: Uses LLMs to verify citations match their context (optional)
+- **📊 Preprint Detection**: Warns if >50% of references are preprints (arXiv, bioRxiv, etc.)
+- **👀 Usage Analysis**: Highlights missing citations and unused bib entries
+- **👯 Duplicate Detector**: Identifies duplicate entries with fuzzy matching
+
+### LaTeX Quality Checks
+- **📐 Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation
+- **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases
+- **🔤 Consistency**: Spelling variants (US/UK English), hyphenation, terminology
+- **🤖 AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants
+- **🔠 Acronym Validation**: Ensures acronyms are defined before use (smart matching)
+- **🎭 Anonymization**: Checks for identity leaks in double-blind submissions
+- **📅 Citation Age**: Flags references older than 30 years
+
+## 📦 Installation
+
+```bash
+git clone git@github.com:thinkwee/BibGuard.git
+cd BibGuard
+pip install -r requirements.txt
+```
+
+## ⚡ Quick Start
+
+### 1. Initialize Configuration
+
+```bash
+python main.py --init
+```
+
+This creates `config.yaml`. Edit it to set your file paths. You have two modes:
+
+#### Option A: Single File Mode
+Best for individual papers.
+```yaml
+files:
+ bib: "paper.bib"
+ tex: "paper.tex"
+ output_dir: "bibguard_output"
+```
+
+#### Option B: Directory Scan Mode
+Best for large projects or a collection of papers. BibGuard will recursively search for all `.tex` and `.bib` files.
+```yaml
+files:
+ input_dir: "./my_project_dir"
+ output_dir: "bibguard_output"
+```
+
+### 2. Run Full Check
+
+```bash
+python main.py
+```
+
+**Output** (in `bibguard_output/`):
+- `bibliography_report.md` - Bibliography validation results
+- `latex_quality_report.md` - Writing and formatting issues
+- `line_by_line_report.md` - All issues sorted by line number
+- `*_only_used.bib` - Clean bibliography (used entries only)
+
+## 🛠 Configuration
+
+Edit `config.yaml` to customize checks:
+
+```yaml
+bibliography:
+ check_metadata: true # Validate against online databases (takes time)
+ check_usage: true # Find unused/missing entries
+ check_duplicates: true # Detect duplicate entries
+ check_preprint_ratio: true # Warn if >50% are preprints
+ check_relevance: false # LLM-based relevance check (requires API key)
+
+submission:
+ # Format checks
+ caption: true # Table/figure caption placement
+ reference: true # Cross-reference integrity
+ formatting: true # Citation spacing, blank lines
+ equation: true # Equation punctuation, numbering
+
+ # Writing quality
+ sentence: true # Weak starters, hedging language
+ consistency: true # Spelling, hyphenation, terminology
+ acronym: true # Acronym definitions (3+ letters)
+
+ # Submission compliance
+ ai_artifacts: true # AI-generated text detection
+ anonymization: true # Double-blind compliance
+ citation_quality: true # Old citations (>30 years)
+ number: true # Percentage formatting
+```
+
+## 🤖 LLM-Based Relevance Check
+
+To verify citations match their context using AI:
+
+```yaml
+bibliography:
+ check_relevance: true
+
+llm:
+ backend: "gemini" # Options: gemini, openai, anthropic, deepseek, ollama, vllm
+ api_key: "" # Or use environment variable (e.g., GEMINI_API_KEY)
+```
+
+**Supported Backends:**
+- **Gemini** (Google): `GEMINI_API_KEY`
+- **OpenAI**: `OPENAI_API_KEY`
+- **Anthropic**: `ANTHROPIC_API_KEY`
+- **DeepSeek**: `DEEPSEEK_API_KEY` (recommended for cost/performance)
+- **Ollama**: Local models (no API key needed)
+- **vLLM**: Custom endpoint
+
+Then run:
+```bash
+python main.py
+```
+
+## 📝 Understanding Reports
+
+### Bibliography Report
+Shows for each entry:
+- ✅ **Verified**: Metadata matches online databases
+- ⚠️ **Issues**: Mismatches, missing entries, duplicates
+- 📊 **Statistics**: Usage, duplicates, preprint ratio
+
+### LaTeX Quality Report
+Organized by severity:
+- 🔴 **Errors**: Critical issues (e.g., undefined references)
+- 🟡 **Warnings**: Important issues (e.g., inconsistent spelling)
+- 🔵 **Suggestions**: Style improvements (e.g., weak sentence starters)
+
+### Line-by-Line Report
+All LaTeX issues sorted by line number for easy fixing.
+
+## 🧐 Understanding Mismatches
+
+BibGuard is strict, but false positives happen:
+
+1. **Year Discrepancy (±1 Year)**:
+ - *Reason*: Delay between preprint (arXiv) and official publication
+ - *Action*: Verify which version you intend to cite
+
+2. **Author List Variations**:
+ - *Reason*: Different databases handle large author lists differently
+ - *Action*: Check if primary authors match
+
+3. **Venue Name Differences**:
+ - *Reason*: Abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems")
+ - *Action*: Both are usually correct
+
+4. **Non-Academic Sources**:
+ - *Reason*: Blogs, documentation not indexed by academic databases
+ - *Action*: Manually verify URL and title
+
+## 🔧 Advanced Options
+
+```bash
+python main.py --help # Show all options
+python main.py --list-templates # List conference templates
+python main.py --config my.yaml # Use custom config file
+```
+
+## 🤝 Contributing
+
+Contributions welcome! Please open an issue or pull request.
+
+## 🙏 Acknowledgments
+
+BibGuard uses multiple data sources:
+- arXiv API
+- CrossRef API
+- Semantic Scholar API
+- DBLP API
+- OpenAlex API
+- Google Scholar (via scholarly)
+
---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+**Made with ❤️ for researchers who care about their submission**
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc9c83565bb4530858992e45d1ccb01ecb3e482
--- /dev/null
+++ b/app.py
@@ -0,0 +1,922 @@
+#!/usr/bin/env python3
+"""
+BibGuard Gradio Web Application
+
+A web interface for checking bibliography and LaTeX quality.
+"""
+import gradio as gr
+import tempfile
+import shutil
+from pathlib import Path
+from typing import Optional, Tuple
+import base64
+
+from src.parsers import BibParser, TexParser
+from src.fetchers import ArxivFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher
+from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector
+from src.report.generator import ReportGenerator, EntryReport
+from src.config.yaml_config import BibGuardConfig, FilesConfig, BibliographyConfig, SubmissionConfig, OutputConfig, WorkflowStep
+from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
+from src.checkers import CHECKER_REGISTRY
+from src.report.line_report import LineByLineReportGenerator
+from app_helper import fetch_and_compare_with_workflow
+
+
+# Custom CSS for better Markdown rendering
+CUSTOM_CSS = """
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
+
+* {
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
+}
+"""
+
+WELCOME_HTML = """
+
+"""
+
+CUSTOM_CSS += """
+/* Global Reset */
+body, gradio-app {
+ overflow: hidden !important; /* Prevent double scrollbars on the page */
+}
+
+.gradio-container {
+ max-width: none !important;
+ width: 100% !important;
+ height: 100vh !important;
+ padding: 0 !important;
+ margin: 0 !important;
+}
+
+/* Header Styling */
+.app-header {
+ padding: 20px;
+ background: white;
+ border-bottom: 1px solid #e5e7eb;
+}
+
+/* Sidebar Styling */
+.app-sidebar {
+ height: calc(100vh - 100px) !important;
+ overflow-y: auto !important;
+ padding: 20px !important;
+ border-right: 1px solid #e5e7eb;
+}
+
+/* Main Content Area */
+.app-content {
+ height: calc(100vh - 100px) !important;
+ padding: 0 !important;
+}
+
+/* The Magic Scroll Container - Clean and Explicit */
+.scrollable-report-area {
+ height: calc(100vh - 180px) !important; /* Fixed height relative to viewport */
+ overflow-y: auto !important;
+ padding: 24px;
+ background-color: #f9fafb;
+ border: 1px solid #e5e7eb;
+ border-radius: 8px;
+ margin-top: 10px;
+}
+
+/* Report Card Styling */
+.report-card {
+ background: white;
+ border-radius: 12px;
+ padding: 24px;
+ margin-bottom: 16px; /* Spacing between cards */
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
+ border: 1px solid #e5e7eb;
+ transition: transform 0.2s, box-shadow 0.2s;
+}
+
+.report-card:hover {
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
+ transform: translateY(-2px);
+}
+
+/* Card Internals */
+.card-header {
+ display: flex;
+ justify-content: space-between;
+ align-items: flex-start;
+ margin-bottom: 16px;
+ padding-bottom: 16px;
+ border-bottom: 1px solid #f3f4f6;
+}
+
+.card-title {
+ font-size: 1.1em;
+ font-weight: 600;
+ color: #111827;
+ margin: 0 0 4px 0;
+}
+
+.card-subtitle {
+ font-size: 0.9em;
+ color: #6b7280;
+ font-family: monospace;
+}
+
+.card-content {
+ font-size: 0.95em;
+ color: #374151;
+ line-height: 1.5;
+}
+
+/* Badges */
+.badge {
+ display: inline-flex;
+ align-items: center;
+ padding: 4px 10px;
+ border-radius: 9999px;
+ font-size: 0.8em;
+ font-weight: 500;
+}
+
+.badge-success { background-color: #dcfce7; color: #166534; }
+.badge-warning { background-color: #fef9c3; color: #854d0e; }
+.badge-error { background-color: #fee2e2; color: #991b1b; }
+.badge-info { background-color: #dbeafe; color: #1e40af; }
+.badge-neutral { background-color: #f3f4f6; color: #4b5563; }
+
+/* Stats Grid */
+.stats-container {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
+ gap: 16px;
+ margin-bottom: 24px;
+}
+
+.stat-card {
+ padding: 16px;
+ border-radius: 12px;
+ color: white;
+ text-align: center;
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+}
+
+.stat-value { font-size: 1.8em; font-weight: 700; }
+.stat-label { font-size: 0.9em; opacity: 0.9; }
+
+/* Detail Grid - Flexbox for better filling */
+.detail-grid {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 12px;
+ margin-bottom: 16px;
+ width: 100%;
+}
+
+.detail-item {
+ background: #f9fafb;
+ padding: 10px 12px;
+ border-radius: 8px;
+ border: 1px solid #f3f4f6;
+
+ /* Flex sizing: grow, shrink, min-basis */
+ flex: 1 1 160px;
+ min-width: 0; /* Important for word-break to work in flex children */
+
+ /* Layout control */
+ display: flex;
+ flex-direction: column;
+
+ /* Height constraint to prevent one huge card from stretching the row */
+ max-height: 100px;
+ overflow-y: auto;
+}
+
+/* Custom scrollbar for detail items */
+.detail-item::-webkit-scrollbar {
+ width: 4px;
+}
+.detail-item::-webkit-scrollbar-thumb {
+ background-color: #d1d5db;
+ border-radius: 4px;
+}
+
+.detail-label {
+ font-size: 0.75em;
+ color: #6b7280;
+ text-transform: uppercase;
+ letter-spacing: 0.05em;
+ margin-bottom: 2px;
+ position: sticky;
+ top: 0;
+ background: #f9fafb; /* Maintain bg on scroll */
+ z-index: 1;
+}
+
+.detail-value {
+ font-weight: 500;
+ color: #1f2937;
+ font-size: 0.9em;
+ line-height: 1.4;
+ word-break: break-word; /* Fix overflow */
+ overflow-wrap: break-word;
+}
+.report-card { border: 1px solid #e5e7eb; /* selector line lost in diff; declarations reattached */
+    transition: all 0.2s; }
+
+.report-card:hover {
+ box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
+}
+
+/* Card Header */
+.card-header {
+ display: flex;
+ justify-content: space-between;
+ align-items: flex-start;
+ margin-bottom: 12px;
+ border-bottom: 1px solid #f3f4f6;
+ padding-bottom: 12px;
+}
+
+.card-title {
+ font-size: 1.1em;
+ font-weight: 600;
+ color: #1f2937;
+ margin: 0;
+}
+
+.card-subtitle {
+ font-size: 0.9em;
+ color: #6b7280;
+ margin-top: 4px;
+}
+
+/* Status Badges */
+.badge {
+ display: inline-flex;
+ align-items: center;
+ padding: 4px 10px;
+ border-radius: 9999px;
+ font-size: 0.8em;
+ font-weight: 500;
+}
+
+.badge-success { background-color: #dcfce7; color: #166534; }
+.badge-warning { background-color: #fef9c3; color: #854d0e; }
+.badge-error { background-color: #fee2e2; color: #991b1b; }
+.badge-info { background-color: #dbeafe; color: #1e40af; }
+.badge-neutral { background-color: #f3f4f6; color: #374151; }
+
+/* Content Styling */
+.card-content {
+ font-size: 15px;
+ color: #374151;
+ line-height: 1.6;
+}
+
+.card-content code {
+ background-color: #f3f4f6;
+ padding: 2px 6px;
+ border-radius: 4px;
+ font-family: monospace;
+ font-size: 0.9em;
+ color: #c2410c;
+}
+
+/* Grid for details */
+.detail-grid {
+ display: grid;
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
+ gap: 12px;
+ margin-top: 12px;
+}
+
+.detail-item {
+ background: #f9fafb;
+ padding: 10px;
+ border-radius: 6px;
+}
+
+.detail-label {
+ font-size: 0.8em;
+ color: #6b7280;
+ text-transform: uppercase;
+ letter-spacing: 0.05em;
+}
+
+.detail-value {
+ font-weight: 500;
+ color: #111827;
+}
+
+/* Summary Stats */
+.stats-container {
+ display: grid;
+ grid-template-columns: repeat(3, 1fr);
+ gap: 16px;
+ margin-bottom: 24px;
+}
+
+.stat-card {
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+ color: white;
+ padding: 20px;
+ border-radius: 12px;
+ text-align: center;
+ box-shadow: 0 4px 6px rgba(102, 126, 234, 0.25);
+}
+
+.stat-value {
+ font-size: 2em;
+ font-weight: 700;
+}
+
+.stat-label {
+ font-size: 0.9em;
+ opacity: 0.9;
+ margin-top: 4px;
+}
+
+/* Button styling */
+.primary-btn {
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+ border: none !important;
+ font-weight: 600 !important;
+}
+
+/* Tab styling */
+.tab-nav button {
+ font-weight: 500 !important;
+ font-size: 15px !important;
+}
+"""
+
+
+def create_config_from_ui(
+ check_metadata: bool,
+ check_usage: bool,
+ check_duplicates: bool,
+ check_preprint_ratio: bool,
+ caption: bool,
+ reference: bool,
+ formatting: bool,
+ equation: bool,
+ ai_artifacts: bool,
+ sentence: bool,
+ consistency: bool,
+ acronym: bool,
+ number: bool,
+ citation_quality: bool,
+ anonymization: bool
+) -> BibGuardConfig:
+ """Create a BibGuardConfig from UI settings."""
+ config = BibGuardConfig()
+
+ config.bibliography = BibliographyConfig(
+ check_metadata=check_metadata,
+ check_usage=check_usage,
+ check_duplicates=check_duplicates,
+ check_preprint_ratio=check_preprint_ratio,
+ check_relevance=False # Disabled for web
+ )
+
+ config.submission = SubmissionConfig(
+ caption=caption,
+ reference=reference,
+ formatting=formatting,
+ equation=equation,
+ ai_artifacts=ai_artifacts,
+ sentence=sentence,
+ consistency=consistency,
+ acronym=acronym,
+ number=number,
+ citation_quality=citation_quality,
+ anonymization=anonymization
+ )
+
+ config.output = OutputConfig(quiet=True, minimal_verified=False)
+
+ return config
+
+
+def generate_bibliography_html(report_gen: ReportGenerator, entries: list) -> str:
+ """Generate HTML content for bibliography report."""
+    html = ['']  # FIXME: original HTML-building body appears to have been stripped from this diff
+ return "".join(html)
+
+def generate_latex_html(results: list) -> str:
+ """Generate HTML for LaTeX quality check."""
+ from src.checkers import CheckSeverity
+
+    html = ['']  # FIXME: original HTML-building body appears to have been stripped from this diff
+ return "".join(html)
+
+def generate_line_html(content: str, results: list) -> str:
+ """Generate HTML for Line-by-Line report."""
+ # Build a dictionary of line_number -> list of issues
+ issues_by_line = {}
+ for r in results:
+ if r.line_number not in issues_by_line:
+ issues_by_line[r.line_number] = []
+ issues_by_line[r.line_number].append(r)
+
+ lines = content.split('\n')
+
+    html = ['']  # FIXME: original HTML-building body appears to have been stripped from this diff
+ return "".join(html)
+
+
+
+
+def run_check(
+ bib_file,
+ tex_file,
+ check_metadata: bool,
+ check_usage: bool,
+ check_duplicates: bool,
+ check_preprint_ratio: bool,
+ caption: bool,
+ reference: bool,
+ formatting: bool,
+ equation: bool,
+ ai_artifacts: bool,
+ sentence: bool,
+ consistency: bool,
+ acronym: bool,
+ number: bool,
+ citation_quality: bool,
+ anonymization: bool,
+ progress=gr.Progress()
+) -> Tuple[str, str, str]:
+ """Run BibGuard checks and return three reports."""
+
+ if bib_file is None or tex_file is None:
+ return (
+ "⚠️ Please upload both `.bib` and `.tex` files.",
+ "⚠️ Please upload both `.bib` and `.tex` files.",
+ "⚠️ Please upload both `.bib` and `.tex` files."
+ )
+
+ try:
+ # Create config from UI
+ config = create_config_from_ui(
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
+ caption, reference, formatting, equation, ai_artifacts,
+ sentence, consistency, acronym, number, citation_quality, anonymization
+ )
+
+ # Get file paths from uploaded files
+ bib_path = bib_file.name
+ tex_path = tex_file.name
+
+ # Read tex content for checkers
+ tex_content = Path(tex_path).read_text(encoding='utf-8', errors='replace')
+
+ # Parse files
+ bib_parser = BibParser()
+ entries = bib_parser.parse_file(bib_path)
+
+ tex_parser = TexParser()
+ tex_parser.parse_file(tex_path)
+
+ bib_config = config.bibliography
+
+ # Initialize components
+ arxiv_fetcher = None
+ crossref_fetcher = None
+ semantic_scholar_fetcher = None
+ openalex_fetcher = None
+ dblp_fetcher = None
+ comparator = None
+ usage_checker = None
+ duplicate_detector = None
+
+ if bib_config.check_metadata:
+ arxiv_fetcher = ArxivFetcher()
+ semantic_scholar_fetcher = SemanticScholarFetcher()
+ openalex_fetcher = OpenAlexFetcher()
+ dblp_fetcher = DBLPFetcher()
+ crossref_fetcher = CrossRefFetcher()
+ comparator = MetadataComparator()
+
+ if bib_config.check_usage:
+ usage_checker = UsageChecker(tex_parser)
+
+ if bib_config.check_duplicates:
+ duplicate_detector = DuplicateDetector()
+
+ # Initialize report generator
+ report_gen = ReportGenerator(
+ minimal_verified=False,
+ check_preprint_ratio=bib_config.check_preprint_ratio,
+ preprint_warning_threshold=bib_config.preprint_warning_threshold
+ )
+ report_gen.set_metadata([bib_file.name], [tex_file.name])
+
+ # Run submission quality checks
+ progress(0.2, desc="Running LaTeX quality checks...")
+ submission_results = []
+ enabled_checkers = config.submission.get_enabled_checkers()
+
+ for checker_name in enabled_checkers:
+ if checker_name in CHECKER_REGISTRY:
+ checker = CHECKER_REGISTRY[checker_name]()
+ results = checker.check(tex_content, {})
+ for r in results:
+ r.file_path = tex_file.name
+ submission_results.extend(results)
+
+ report_gen.set_submission_results(submission_results, None)
+
+ # Check for duplicates
+ if bib_config.check_duplicates and duplicate_detector:
+ duplicate_groups = duplicate_detector.find_duplicates(entries)
+ report_gen.set_duplicate_groups(duplicate_groups)
+
+ # Check missing citations
+ if bib_config.check_usage and usage_checker:
+ missing = usage_checker.get_missing_entries(entries)
+ report_gen.set_missing_citations(missing)
+
+ # Build workflow
+ workflow_config = get_default_workflow()
+
+ # Process entries
+ progress(0.3, desc="Processing bibliography entries...")
+ total_entries = len(entries)
+
+ for i, entry in enumerate(entries):
+ progress(0.3 + 0.5 * (i / total_entries), desc=f"Checking: {entry.key}")
+
+ # Check usage
+ usage_result = None
+ if usage_checker:
+ usage_result = usage_checker.check_usage(entry)
+
+ # Fetch and compare metadata
+ comparison_result = None
+ if bib_config.check_metadata and comparator:
+ comparison_result = fetch_and_compare_with_workflow(
+ entry, workflow_config, arxiv_fetcher, crossref_fetcher,
+ semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
+ )
+
+ # Create entry report
+ entry_report = EntryReport(
+ entry=entry,
+ comparison=comparison_result,
+ usage=usage_result,
+ evaluations=[]
+ )
+ report_gen.add_entry_report(entry_report)
+
+ progress(0.85, desc="Generating structured reports...")
+
+ # Generate Bibliography HTML Report
+ bib_report = generate_bibliography_html(report_gen, entries)
+
+ # Generate LaTeX Quality HTML Report
+ latex_report = generate_latex_html(submission_results)
+
+ # Generate Line-by-Line HTML Report
+ line_report = ""
+ if submission_results:
+ line_report = generate_line_html(tex_content, submission_results)
+ else:
+            line_report = ('No issues to display line-by-line.'
+                           )  # NOTE: surrounding HTML markup was stripped from this diff
+
+ progress(1.0, desc="Done!")
+
+ return bib_report, latex_report, line_report
+
+ except Exception as e:
+ error_msg = f"❌ Error: {str(e)}"
+ import traceback
+ error_msg += f"\n\n```\n{traceback.format_exc()}\n```"
+ return error_msg, error_msg, error_msg
+
+
+
+def create_app():
+ """Create and configure the Gradio app."""
+
+ # Load icon as base64
+ icon_html = ""
+ try:
+ icon_path = Path("assets/icon-192.png")
+ if icon_path.exists():
+ with open(icon_path, "rb") as f:
+ encoding = base64.b64encode(f.read()).decode()
+            icon_html = (f'<img src="data:image/png;base64,{encoding}" '
+                         f'alt="BibGuard" width="48" height="48">')  # reconstructed; original tag stripped from diff
+ else:
+ icon_html = '📚'
+ except Exception:
+ icon_html = '📚'
+
+ with gr.Blocks(title="BibGuard - Bibliography & LaTeX Quality Checker") as app:
+
+ # Header with icon
+ with gr.Row(elem_classes=["app-header"]):
+        gr.HTML(f"""
+        <!-- reconstructed header markup; original tags were stripped from this diff -->
+        <div style="display: flex; align-items: center; gap: 12px;">
+            {icon_html}
+            <div>
+                <h1 style="margin: 0;">BibGuard</h1>
+                <p style="margin: 0; color: #6b7280;">Bibliography & LaTeX Quality Checker</p>
+            </div>
+        </div>
+
+        """)
+
+ with gr.Row(elem_classes=["app-body"]):
+ # Left column: Upload & Settings
+ with gr.Column(scale=1, min_width=280, elem_classes=["app-sidebar"]):
+ gr.Markdown("### 📁 Upload Files")
+
+ bib_file = gr.File(
+ label="Bibliography (.bib)",
+ file_types=[".bib"],
+ file_count="single"
+ )
+
+ tex_file = gr.File(
+ label="LaTeX Source (.tex)",
+ file_types=[".tex"],
+ file_count="single"
+ )
+
+ # Check options in grid layout
+ gr.Markdown("#### ⚙️ Options")
+
+ with gr.Row():
+ check_metadata = gr.Checkbox(label="🔍 Metadata", value=False)
+ check_usage = gr.Checkbox(label="📊 Usage", value=True)
+
+ with gr.Row():
+ check_duplicates = gr.Checkbox(label="👯 Duplicates", value=True)
+ check_preprint_ratio = gr.Checkbox(label="📄 Preprints", value=True)
+
+ with gr.Row():
+ caption = gr.Checkbox(label="🖼️ Captions", value=True)
+ reference = gr.Checkbox(label="🔗 References", value=True)
+
+ with gr.Row():
+ formatting = gr.Checkbox(label="✨ Formatting", value=True)
+ equation = gr.Checkbox(label="🔢 Equations", value=True)
+
+ with gr.Row():
+ ai_artifacts = gr.Checkbox(label="🤖 AI Artifacts", value=True)
+ sentence = gr.Checkbox(label="📝 Sentences", value=True)
+
+ with gr.Row():
+ consistency = gr.Checkbox(label="🔄 Consistency", value=True)
+ acronym = gr.Checkbox(label="🔤 Acronyms", value=True)
+
+ with gr.Row():
+ number = gr.Checkbox(label="🔢 Numbers", value=True)
+ citation_quality = gr.Checkbox(label="📚 Citations", value=True)
+
+ with gr.Row():
+ anonymization = gr.Checkbox(label="🎭 Anonymization", value=True)
+
+ run_btn = gr.Button("🔍 Check Now", variant="primary", size="lg")
+
+ gr.HTML("""
+
+ """)
+
+ # Right column: Reports
+ with gr.Column(scale=4, elem_classes=["app-content"]):
+ with gr.Tabs():
+ with gr.Tab("📚 Bibliography Report"):
+ bib_report = gr.HTML(
+ value=WELCOME_HTML,
+ elem_classes=["report-panel"]
+ )
+
+ with gr.Tab("📝 LaTeX Quality"):
+ latex_report = gr.HTML(
+ value=WELCOME_HTML,
+ elem_classes=["report-panel"]
+ )
+
+ with gr.Tab("📋 Line-by-Line"):
+ line_report = gr.HTML(
+ value=WELCOME_HTML,
+ elem_classes=["report-panel"]
+ )
+
+ # Event handling
+ run_btn.click(
+ fn=run_check,
+ inputs=[
+ bib_file, tex_file,
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
+ caption, reference, formatting, equation, ai_artifacts,
+ sentence, consistency, acronym, number, citation_quality, anonymization
+ ],
+ outputs=[bib_report, latex_report, line_report]
+ )
+
+ return app
+
+
+# Create the app
+app = create_app()
+
+if __name__ == "__main__":
+ app.launch(
+ favicon_path="assets/icon-192.png",
+ show_error=True,
+ css=CUSTOM_CSS,
+ theme=gr.themes.Soft()
+ )
diff --git a/app_helper.py b/app_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..4780a3d5533081b245c538f8394b23192786ad2b
--- /dev/null
+++ b/app_helper.py
@@ -0,0 +1,98 @@
+def fetch_and_compare_with_workflow(
+ entry, workflow_steps, arxiv_fetcher, crossref_fetcher,
+ semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
+):
+ """Fetch metadata from online sources using the configured workflow."""
+ from src.utils.normalizer import TextNormalizer
+
+ best_result = None
+
+ # If no steps provided, use default order
+ if not workflow_steps:
+ # Create a default list of steps if needed, or simply handle logic here
+ pass
+
+    # Simplified workflow execution: run through the enabled steps.
+    # We manually iterate through sources in a preferred order when the
+    # workflow is not fully configured, or iterate through the steps list.
+
+    # Since extracting the WorkflowConfig logic is complex, implement a
+    # robust default search strategy here instead.
+
+ results = []
+
+ # 1. DBLP (High quality for CS)
+ if dblp_fetcher and entry.title:
+ try:
+ dblp_result = dblp_fetcher.search_by_title(entry.title)
+ if dblp_result:
+ res = comparator.compare_with_dblp(entry, dblp_result)
+ if res.is_match: return res
+ results.append(res)
+ except Exception: pass
+
+ # 2. Semantic Scholar (Comprehensive)
+ if semantic_scholar_fetcher and entry.title:
+ try:
+ ss_result = None
+ if entry.doi:
+ ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
+ if not ss_result:
+ ss_result = semantic_scholar_fetcher.search_by_title(entry.title)
+
+ if ss_result:
+ res = comparator.compare_with_semantic_scholar(entry, ss_result)
+ if res.is_match: return res
+ results.append(res)
+ except Exception: pass
+
+ # 3. OpenAlex
+ if openalex_fetcher and entry.title:
+ try:
+ oa_result = None
+ if entry.doi:
+ oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
+ if not oa_result:
+ oa_result = openalex_fetcher.search_by_title(entry.title)
+
+ if oa_result:
+ res = comparator.compare_with_openalex(entry, oa_result)
+ if res.is_match: return res
+ results.append(res)
+ except Exception: pass
+
+ # 4. CrossRef (Official metadata)
+ if crossref_fetcher and entry.doi:
+ try:
+ crossref_result = crossref_fetcher.search_by_doi(entry.doi)
+ if crossref_result:
+ res = comparator.compare_with_crossref(entry, crossref_result)
+ if res.is_match: return res
+ results.append(res)
+ except Exception: pass
+
+ # 5. ArXiv
+ if arxiv_fetcher:
+ try:
+ arxiv_meta = None
+ if entry.has_arxiv:
+ arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
+ elif entry.title:
+ # Search by title
+ search_results = arxiv_fetcher.search_by_title(entry.title, max_results=1)
+ if search_results:
+ arxiv_meta = search_results[0]
+
+ if arxiv_meta:
+ res = comparator.compare_with_arxiv(entry, arxiv_meta)
+ if res.is_match: return res
+ results.append(res)
+ except Exception: pass
+
+ # Return the best result (highest confidence) if no perfect match found
+ if results:
+ results.sort(key=lambda x: x.confidence, reverse=True)
+ return results[0]
+
+    # Nothing matched in any source: return an explicit 'unable to verify' result
+    return comparator.create_unable_result(entry, "No metadata found in any source")
diff --git a/assets/icon-192.png b/assets/icon-192.png
new file mode 100644
index 0000000000000000000000000000000000000000..087848a7f545297b6b770b6dd80b273af062402d
--- /dev/null
+++ b/assets/icon-192.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:158c7c199e8e4978d2e8d6da90c4896022bf83436b0ab2c9b6285078cad60863
+size 339925
diff --git a/assets/icon-512.png b/assets/icon-512.png
new file mode 100644
index 0000000000000000000000000000000000000000..744430c35a3ebdc6d0d56725806a4fae7bf0390f
--- /dev/null
+++ b/assets/icon-512.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da47e48d79d2aae7f81cd1b04b39f0b7a66e760ee2338dfcdde36f66293f3ccf
+size 312990
diff --git a/bibguard.yaml b/bibguard.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7dc574f5a798d32b96cd2b2e5a307e0473318e97
--- /dev/null
+++ b/bibguard.yaml
@@ -0,0 +1,197 @@
+# ==============================================================================
+# BibGuard Configuration File
+# ==============================================================================
+#
+# Usage: python main.py --config bibguard.yaml
+# python main.py (auto-detect bibguard.yaml in current/parent directories)
+#
+# All paths are relative to this configuration file's directory.
+
+# ==============================================================================
+# 📁 File Settings
+# ==============================================================================
+files:
+ # Required: Path to your .bib bibliography file
+ bib: "test.bib"
+
+ # Required: Path to your .tex LaTeX source file
+ tex: "test.tex"
+
+ # Optional: Directory path for recursive scanning (Experimental)
+ # When set, BibGuard will recursively search for all .tex and .bib files in this directory.
+ # This mode is an alternative to 'bib' and 'tex': use either this OR bib/tex, not both.
+ # input_dir: "./paper_project"
+
+ # Output directory for all generated reports and files (default: bibguard_output)
+ # All outputs including reports, cleaned .bib, and input file copies will be saved here
+ output_dir: "test"
+
+
+# ==============================================================================
+# 🎓 Conference Template
+# ==============================================================================
+# Specify a conference template for venue-specific checks and formatting rules.
+# Available templates: acl, emnlp, naacl, cvpr, iccv, eccv, neurips, icml, iclr
+# Leave empty ("") to skip template-specific checks.
+template: ""
+
+# ==============================================================================
+# 📚 Bibliography Checks
+# ==============================================================================
+bibliography:
+ # Metadata Validation - Verify bib entries against online databases (arXiv, CrossRef, etc.)
+ # Detects incorrect titles, authors, venues, and publication years
+ # ⚠️ Metadata checking queries multiple online sources and can take some time. Set it to false if you don't need it.
+ check_metadata: true
+
+ # Usage Check - Detect unused bib entries and missing citations
+ # Identifies entries in .bib not cited in .tex, and citations without bib entries
+ check_usage: true
+
+ # Duplicate Detection - Find duplicate entries with different keys
+ # Uses fuzzy matching on titles and DOIs to identify the same paper cited multiple times
+ check_duplicates: true
+
+ # Preprint Ratio Check - Warn if too many references are preprints
+ # Detects arXiv, bioRxiv, and other preprints. Warns if ratio exceeds threshold.
+ check_preprint_ratio: true
+ preprint_warning_threshold: 0.50 # Warn if more than 50% of used entries are preprints
+
+ # Relevance Assessment - Use LLM to evaluate if citations match their context
+ # Requires LLM configuration (see llm section below). Disabled by default due to API costs.
+ check_relevance: false
+
+# ==============================================================================
+# 📋 Submission Quality Checks
+# ==============================================================================
+submission:
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Format Checks
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ # Caption Position - Ensure table captions are above, figure captions below
+ # Checks \caption placement relative to \begin{table}/\begin{figure}
+ caption: true
+
+ # Cross-References - Verify all figures/tables/sections are referenced in text
+ # Detects orphaned floats that are never mentioned
+ reference: true
+
+ # Formatting Standards - Check citation format, spacing, special characters
+ # Validates \cite{} usage, non-breaking spaces, proper quotation marks, etc.
+ formatting: true
+
+ # Equation Checks - Verify equation punctuation and numbering consistency
+ # Ensures equations end with proper punctuation and labels are used correctly
+ equation: true
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Writing Quality
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ # AI Artifacts - Detect traces of AI-generated text
+ # Flags phrases like "Sure, here is...", "As an AI...", "It's important to note..."
+ ai_artifacts: true
+
+ # Sentence Quality - Identify overly long sentences, weak openings, redundant phrases
+ # Helps improve readability and academic writing style
+ sentence: true
+
+ # Terminology Consistency - Check for inconsistent spelling, hyphenation, US/UK variants
+ # Examples: "deep learning" vs "deep-learning", "color" vs "colour"
+ consistency: true
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Academic Standards
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ # Acronym Definitions - Ensure acronyms are defined on first use
+ # Example: "Natural Language Processing (NLP)" before using "NLP" alone
+ acronym: true
+
+ # Number Formatting - Check percentage formatting consistency
+ # Ensures no space before % sign and consistent use of '%' vs 'percent'
+ number: true
+
+ # Citation Quality - Flag outdated references and citation formatting issues
+ # Warns about papers older than 30 years and checks citation formatting (et al., hardcoded citations)
+ citation_quality: true
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Review Compliance
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ # Anonymization - Check double-blind review compliance
+ # Detects GitHub links, acknowledgments, self-citations that may reveal author identity
+ anonymization: true
+
+# ==============================================================================
+# 🔍 Metadata Check Workflow
+# ==============================================================================
+# Define the data sources and order for metadata validation.
+# BibGuard will try each enabled source in sequence until a match is found.
+# Set enabled: false to skip a particular source.
+workflow:
+ - name: arxiv_id
+ enabled: true
+ description: "Lookup by arXiv ID (fastest, most reliable for preprints)"
+
+ - name: crossref_doi
+ enabled: true
+ description: "Lookup by DOI via CrossRef (authoritative for published papers)"
+
+ - name: semantic_scholar
+ enabled: true
+ description: "Semantic Scholar API (good coverage, includes citations)"
+
+ - name: dblp
+ enabled: true
+ description: "DBLP database (comprehensive for computer science papers)"
+
+ - name: openalex
+ enabled: true
+ description: "OpenAlex API (broad coverage across disciplines)"
+
+ - name: arxiv_title
+ enabled: true
+ description: "Search arXiv by title (fallback when ID unavailable)"
+
+ - name: crossref_title
+ enabled: true
+ description: "Search CrossRef by title (fallback when DOI unavailable)"
+
+ - name: google_scholar
+ enabled: false # May be rate-limited, disabled by default
+ description: "Google Scholar web scraping (use as last resort)"
+
+# ==============================================================================
+# 🤖 LLM Configuration (for Relevance Checking)
+# ==============================================================================
+llm:
+ # Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek
+ # Each backend requires different setup (API keys, local installation, etc.)
+ backend: "gemini"
+
+ # Model name (leave empty to use backend default)
+ # Examples: "gpt-4", "claude-3-opus", "gemini-pro", "llama3"
+ model: ""
+
+ # API endpoint (leave empty to use backend default)
+ # Only needed for self-hosted models (vllm, ollama) or custom endpoints
+ endpoint: ""
+
+ # API key (recommended to use environment variables instead)
+ # Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. in your environment
+ api_key: ""
+
+# ==============================================================================
+# 📊 Output Settings
+# ==============================================================================
+output:
+ # Quiet mode - Suppress progress messages, only output final reports
+ # Useful for CI/CD pipelines or batch processing
+ quiet: false
+
+ # Minimal verified entries - Hide detailed info for entries that passed all checks
+ # Reduces report size when you only care about issues
+ minimal_verified: false
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..f77d35e1cd9fd687e074ffd493835a0c80fd1e36
--- /dev/null
+++ b/main.py
@@ -0,0 +1,564 @@
+#!/usr/bin/env python3
+"""
+BibGuard - Bibliography Checker & Paper Submission Quality Tool
+
+Usage:
+ python main.py # Use bibguard.yaml in current directory
+ python main.py --config my.yaml # Use specified config file
+ python main.py --init # Create default config file
+ python main.py --list-templates # List available templates
+"""
+import argparse
+import sys
+from pathlib import Path
+from typing import Optional, List
+
+from src.parsers import BibParser, TexParser
+from src.fetchers import ArxivFetcher, ScholarFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher
+from src.analyzers import MetadataComparator, UsageChecker, LLMEvaluator, DuplicateDetector
+from src.analyzers.llm_evaluator import LLMBackend
+from src.report.generator import ReportGenerator, EntryReport
+from src.utils.progress import ProgressDisplay
+from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config
+from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
+from src.templates.base_template import get_template, get_all_templates
+from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity
+
+
def main():
    """CLI entry point.

    Parses command-line arguments, resolves and validates the YAML
    config (including the bib/tex input files or the experimental
    directory mode), loads the optional conference template, and runs
    the checker. Exits with a non-zero status on any validation error.
    """
    parser = argparse.ArgumentParser(
        description="BibGuard: Bibliography Checker & Paper Submission Quality Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Usage Examples:
  python main.py                    # Auto-detect config.yaml in current directory
  python main.py --config my.yaml   # Use specified config file
  python main.py --init             # Create default config.yaml
  python main.py --list-templates   # List available conference templates
    """
    )

    parser.add_argument(
        "--config", "-c",
        help="Config file path (default: auto-detect config.yaml)"
    )
    parser.add_argument(
        "--init",
        action="store_true",
        help="Create default config.yaml in current directory"
    )
    parser.add_argument(
        "--list-templates",
        action="store_true",
        help="List all available conference templates"
    )

    args = parser.parse_args()

    # NOTE(review): messages below say 'config.yaml' while the module
    # docstring and bibguard.yaml describe auto-detection of
    # 'bibguard.yaml' — confirm which name find_config_file() and
    # create_default_config() actually use and unify the wording.

    # Handle --init: write a default config file and exit.
    if args.init:
        output = create_default_config()
        print(f"✓ Created configuration file: {output}")
        print("")
        print("  Next steps:")
        print("  1. Edit the 'bib' and 'tex' paths in config.yaml")
        print("  2. Run: python main.py --config config.yaml")
        print("")
        sys.exit(0)

    # Handle --list-templates: print known templates and exit.
    if args.list_templates:
        from src.ui.template_selector import list_templates
        list_templates()
        sys.exit(0)

    # Resolve the config file: an explicit --config wins, otherwise
    # search the current/parent directories.
    config_path = args.config
    if not config_path:
        found = find_config_file()
        if found:
            config_path = str(found)
        else:
            print("Error: Config file not found")
            print("")
            print("Please run 'python main.py --init' to create config.yaml")
            # Fixed: the original message read "--config '" with no
            # placeholder for the path argument.
            print("Or use 'python main.py --config <path>' to specify a config file")
            print("")
            sys.exit(1)

    try:
        config = load_config(config_path)
    except FileNotFoundError:
        print(f"Error: Config file does not exist: {config_path}")
        sys.exit(1)
    except Exception as e:
        print(f"Error: Failed to parse config file: {e}")
        sys.exit(1)

    # Validate required fields. Directory mode (input_dir) and explicit
    # bib/tex mode are mutually exclusive alternatives.
    mode_dir = bool(config.files.input_dir)

    if mode_dir:
        input_dir = config.input_dir_path
        if not input_dir.exists() or not input_dir.is_dir():
            print(f"Error: Input directory does not exist or is not a directory: {input_dir}")
            sys.exit(1)

        tex_files = list(input_dir.rglob("*.tex"))
        bib_files = list(input_dir.rglob("*.bib"))

        if not tex_files:
            print(f"Error: No .tex files found in {input_dir}")
            sys.exit(1)
        if not bib_files:
            print(f"Error: No .bib files found in {input_dir}")
            sys.exit(1)

        config._tex_files = tex_files
        config._bib_files = bib_files
    else:
        if not config.files.bib:
            print("Error: bib file path not specified in config")
            sys.exit(1)
        if not config.files.tex:
            print("Error: tex file path not specified in config")
            sys.exit(1)

        # Validate that both input files exist.
        if not config.bib_path.exists():
            print(f"Error: Bib file does not exist: {config.bib_path}")
            sys.exit(1)
        if not config.tex_path.exists():
            print(f"Error: TeX file does not exist: {config.tex_path}")
            sys.exit(1)

        config._tex_files = [config.tex_path]
        config._bib_files = [config.bib_path]

    # Load the conference template if one was configured.
    template = None
    if config.template:
        template = get_template(config.template)
        if not template:
            print(f"Error: Unknown template: {config.template}")
            print("Use --list-templates to see available templates")
            sys.exit(1)

    # Run the checker; exit 130 on Ctrl-C per shell convention.
    try:
        run_checker(config, template)
    except KeyboardInterrupt:
        print("\n\nCancelled")
        sys.exit(130)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
+
+
def run_checker(config: BibGuardConfig, template=None):
    """Run all configured bibliography and submission checks.

    Parses the configured .bib/.tex files, runs the enabled checks
    (metadata, usage, duplicates, relevance, submission quality), and
    writes all reports into the configured output directory.

    Args:
        config: Loaded configuration. main() has already validated paths
            and populated config._bib_files / config._tex_files.
        template: Optional conference template for venue-specific checks,
            or None to skip template-specific rules.
    """
    progress = ProgressDisplay()

    # Parse bibliography files (silent).
    bib_parser = BibParser()
    entries = []
    for bib_path in config._bib_files:
        entries.extend(bib_parser.parse_file(str(bib_path)))

    # Parse every tex file and merge their citation data.
    tex_parser = TexParser()
    tex_contents = {}
    merged_citations = {}
    merged_all_keys = set()

    for tex_path in config._tex_files:
        cits = tex_parser.parse_file(str(tex_path))
        # Accumulate citations across files.
        for k, v in cits.items():
            merged_citations.setdefault(k, []).extend(v)
        # Accumulate cited keys.
        merged_all_keys.update(tex_parser.get_all_cited_keys())
        # Keep raw content for the submission checkers and line reports.
        tex_contents[str(tex_path)] = tex_path.read_text(encoding='utf-8', errors='replace')

    # Inject merged data back into the parser for components that use it.
    tex_parser.citations = merged_citations
    tex_parser.all_keys = merged_all_keys

    # Initialize components based on config.
    bib_config = config.bibliography

    arxiv_fetcher = None
    crossref_fetcher = None
    scholar_fetcher = None
    semantic_scholar_fetcher = None
    openalex_fetcher = None
    dblp_fetcher = None
    comparator = None
    usage_checker = None
    llm_evaluator = None
    duplicate_detector = None

    if bib_config.check_metadata or bib_config.check_relevance:
        arxiv_fetcher = ArxivFetcher()

    if bib_config.check_metadata:
        semantic_scholar_fetcher = SemanticScholarFetcher()
        openalex_fetcher = OpenAlexFetcher()
        dblp_fetcher = DBLPFetcher()
        crossref_fetcher = CrossRefFetcher()
        scholar_fetcher = ScholarFetcher()
        comparator = MetadataComparator()

    if bib_config.check_usage:
        usage_checker = UsageChecker(tex_parser)

    if bib_config.check_duplicates:
        duplicate_detector = DuplicateDetector()

    if bib_config.check_relevance:
        llm_config = config.llm
        backend = LLMBackend(llm_config.backend)
        llm_evaluator = LLMEvaluator(
            backend=backend,
            endpoint=llm_config.endpoint or None,
            model=llm_config.model or None,
            api_key=llm_config.api_key or None
        )

        # Test LLM connection (silent).
        llm_evaluator.test_connection()

        # Relevance evaluation needs citation contexts, so a usage
        # checker is required even when check_usage is disabled.
        if not usage_checker:
            usage_checker = UsageChecker(tex_parser)

    # Initialize the report generator.
    report_gen = ReportGenerator(
        minimal_verified=config.output.minimal_verified,
        check_preprint_ratio=config.bibliography.check_preprint_ratio,
        preprint_warning_threshold=config.bibliography.preprint_warning_threshold
    )
    report_gen.set_metadata(
        [str(f) for f in config._bib_files],
        [str(f) for f in config._tex_files]
    )

    # Run submission quality checks over every tex file.
    submission_results = []
    enabled_checkers = config.submission.get_enabled_checkers()

    for checker_name in enabled_checkers:
        if checker_name in CHECKER_REGISTRY:
            checker = CHECKER_REGISTRY[checker_name]()
            for tex_path_str, content in tex_contents.items():
                results = checker.check(content, {})
                # Tag results with their source file for per-file reports.
                for r in results:
                    r.file_path = tex_path_str
                submission_results.extend(results)

    # Set results in the report generator for summary calculation.
    report_gen.set_submission_results(submission_results, template)

    # Check for duplicates (silent).
    if bib_config.check_duplicates and duplicate_detector:
        duplicate_groups = duplicate_detector.find_duplicates(entries)
        report_gen.set_duplicate_groups(duplicate_groups)

    # Check missing citations (silent).
    if bib_config.check_usage and usage_checker:
        missing = usage_checker.get_missing_entries(entries)
        report_gen.set_missing_citations(missing)

    # Build the metadata-lookup workflow from config, falling back to
    # the default source order. (WorkflowConfig/WFStep/get_default_workflow
    # are already imported at module level.)
    workflow_config = get_default_workflow()
    if config.workflow:
        workflow_config = WorkflowConfig(
            steps=[
                WFStep(
                    name=step.name,
                    display_name=step.name,
                    description=step.description,
                    enabled=step.enabled,
                    priority=i
                )
                for i, step in enumerate(config.workflow)
            ]
        )

    # Process entries in parallel for metadata checks.
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import threading

    # Serializes report updates and progress rendering across workers.
    progress_lock = threading.Lock()

    def process_single_entry(entry):
        """Run usage, metadata, and relevance checks for one entry (thread-safe)."""
        # Check usage.
        usage_result = None
        if usage_checker:
            usage_result = usage_checker.check_usage(entry)

        # Fetch and compare metadata.
        comparison_result = None
        if bib_config.check_metadata and comparator:
            comparison_result = fetch_and_compare_with_workflow(
                entry, workflow_config, arxiv_fetcher, crossref_fetcher,
                scholar_fetcher, semantic_scholar_fetcher, openalex_fetcher,
                dblp_fetcher, comparator
            )

        # LLM evaluation (kept sequential per entry).
        evaluations = []
        if bib_config.check_relevance and llm_evaluator:
            if usage_result and usage_result.is_used:
                abstract = get_abstract(entry, comparison_result, arxiv_fetcher)
                if abstract:
                    for ctx in usage_result.contexts:
                        eval_result = llm_evaluator.evaluate(
                            entry.key, ctx.full_context, abstract
                        )
                        eval_result.line_number = ctx.line_number
                        eval_result.file_path = ctx.file_path
                        evaluations.append(eval_result)

        # Create the per-entry report.
        entry_report = EntryReport(
            entry=entry,
            comparison=comparison_result,
            usage=usage_result,
            evaluations=evaluations
        )

        return entry_report, comparison_result

    # Cap workers to avoid overwhelming the APIs; clamp to at least 1
    # because ThreadPoolExecutor(max_workers=0) raises ValueError when
    # the bibliography is empty.
    max_workers = max(1, min(10, len(entries)))

    with progress.progress_context(len(entries), "Processing bibliography") as prog:
        # Use a ThreadPoolExecutor for parallel processing.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks.
            future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries}

            # Process completed tasks as they finish.
            for future in as_completed(future_to_entry):
                entry = future_to_entry[future]
                try:
                    entry_report, comparison_result = future.result()

                    # Thread-safe report/progress update.
                    with progress_lock:
                        report_gen.add_entry_report(entry_report)

                        # Update progress indicators.
                        if comparison_result and comparison_result.is_match:
                            prog.mark_success()
                        elif comparison_result and comparison_result.has_issues:
                            prog.mark_warning()
                        else:
                            prog.mark_error()

                        prog.update(entry.key, "Done", 1)

                except Exception as e:
                    with progress_lock:
                        prog.mark_error()
                        progress.print_error(f"Error processing {entry.key}: {e}")
                        prog.update(entry.key, "Failed", 1)

    # Generate reports and organize outputs (silent).

    # Create the output directory.
    output_dir = config.output_dir_path
    output_dir.mkdir(parents=True, exist_ok=True)

    # Copy input files to the output directory.
    import shutil
    for bib_path in config._bib_files:
        shutil.copy2(bib_path, output_dir / bib_path.name)
    for tex_path in config._tex_files:
        shutil.copy2(tex_path, output_dir / tex_path.name)

    # 1. Bibliography report.
    bib_report_path = output_dir / "bibliography_report.md"
    report_gen.save_bibliography_report(str(bib_report_path))

    # 2. LaTeX quality report.
    if submission_results:
        latex_report_path = output_dir / "latex_quality_report.md"
        report_gen.save_latex_quality_report(
            str(latex_report_path),
            submission_results,
            template
        )

    # 3. Line-by-line report: one document with a section per tex file.
    from src.report.line_report import LineByLineReportGenerator
    line_report_path = output_dir / "line_by_line_report.md"

    all_line_reports = []
    for tex_path_str, content in tex_contents.items():
        file_results = [r for r in submission_results if r.file_path == tex_path_str]
        if not file_results:
            continue

        gen = LineByLineReportGenerator(content, tex_path_str)
        gen.add_results(file_results)
        all_line_reports.append(gen.generate())

    if all_line_reports:
        with open(line_report_path, 'w', encoding='utf-8') as f:
            f.write("\n\n".join(all_line_reports))

    # 4. Cleaned bib file containing only the entries actually cited.
    if bib_config.check_usage and usage_checker:
        used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used]
        if used_entries:
            try:
                keys_to_keep = {entry.key for entry in used_entries}
                if len(config._bib_files) == 1:
                    clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib"
                    bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep)
                else:
                    # Multiple bib inputs: merge the used entries into one
                    # file (BibParser.filter_file handles a single input).
                    clean_bib_path = output_dir / "merged_only_used.bib"
                    with open(clean_bib_path, 'w', encoding='utf-8') as f:
                        for entry in used_entries:
                            f.write(entry.raw + "\n\n")
            except Exception:
                # Best-effort: a failure here must not break report output.
                pass

    # Print the console summary.
    if not config.output.quiet:
        bib_stats, latex_stats = report_gen.get_summary_stats()
        progress.print_detailed_summary(bib_stats, latex_stats, str(output_dir.absolute()))
+
+
def fetch_and_compare_with_workflow(
    entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher,
    semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
):
    """Fetch metadata from online sources using the configured workflow.

    Each enabled workflow step is tried in order; the first result whose
    is_match flag is set is returned immediately. If no step produces a
    confident match, the highest-confidence partial result is returned,
    and when no source returns anything at all an 'unable' result is
    produced via the comparator.

    A failure inside a single source (network error, rate limiting,
    unexpected payload) only skips that step instead of aborting the
    whole lookup, matching the per-source error handling used elsewhere
    in this codebase.
    """
    all_results = []

    for step in workflow_config.get_enabled_steps():
        result = None
        try:
            if step.name == "arxiv_id" and entry.has_arxiv and arxiv_fetcher:
                arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
                if arxiv_meta:
                    result = comparator.compare_with_arxiv(entry, arxiv_meta)

            elif step.name == "crossref_doi" and entry.doi and crossref_fetcher:
                crossref_result = crossref_fetcher.search_by_doi(entry.doi)
                if crossref_result:
                    result = comparator.compare_with_crossref(entry, crossref_result)

            elif step.name == "semantic_scholar" and entry.title and semantic_scholar_fetcher:
                # Prefer the DOI lookup, fall back to a title search.
                ss_result = None
                if entry.doi:
                    ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
                if not ss_result:
                    ss_result = semantic_scholar_fetcher.search_by_title(entry.title)
                if ss_result:
                    result = comparator.compare_with_semantic_scholar(entry, ss_result)

            elif step.name == "dblp" and entry.title and dblp_fetcher:
                dblp_result = dblp_fetcher.search_by_title(entry.title)
                if dblp_result:
                    result = comparator.compare_with_dblp(entry, dblp_result)

            elif step.name == "openalex" and entry.title and openalex_fetcher:
                # Prefer the DOI lookup, fall back to a title search.
                oa_result = None
                if entry.doi:
                    oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
                if not oa_result:
                    oa_result = openalex_fetcher.search_by_title(entry.title)
                if oa_result:
                    result = comparator.compare_with_openalex(entry, oa_result)

            elif step.name == "arxiv_title" and entry.title and arxiv_fetcher:
                # Imported lazily so the other steps do not require it.
                from src.utils.normalizer import TextNormalizer

                candidates = arxiv_fetcher.search_by_title(entry.title, max_results=3)
                if candidates:
                    best_result = None
                    best_sim = 0.0
                    norm1 = TextNormalizer.normalize_for_comparison(entry.title)

                    for cand in candidates:
                        norm2 = TextNormalizer.normalize_for_comparison(cand.title)
                        sim = TextNormalizer.similarity_ratio(norm1, norm2)
                        if sim > best_sim:
                            best_sim = sim
                            best_result = cand

                    # Only accept reasonably similar titles.
                    if best_result and best_sim > 0.5:
                        result = comparator.compare_with_arxiv(entry, best_result)

            elif step.name == "crossref_title" and entry.title and crossref_fetcher:
                crossref_result = crossref_fetcher.search_by_title(entry.title)
                if crossref_result:
                    result = comparator.compare_with_crossref(entry, crossref_result)

            elif step.name == "google_scholar" and entry.title and scholar_fetcher:
                scholar_result = scholar_fetcher.search_by_title(entry.title)
                if scholar_result:
                    result = comparator.compare_with_scholar(entry, scholar_result)
        except Exception:
            # Best-effort: skip this source and keep trying the rest.
            continue

        if result:
            all_results.append(result)
            if result.is_match:
                return result

    if all_results:
        all_results.sort(key=lambda r: r.confidence, reverse=True)
        return all_results[0]

    return comparator.create_unable_result(entry, "Unable to find this paper in any data source")
+
+
def get_abstract(entry, comparison_result, arxiv_fetcher):
    """Return the best available abstract for *entry*.

    Preference order: the entry's own abstract field, then an arXiv
    lookup by ID (when the entry has one), then an arXiv title search.
    Returns "" when no abstract can be found or no fetcher is available.
    """
    if entry.abstract:
        return entry.abstract

    if arxiv_fetcher:
        # Try the direct arXiv-ID lookup first.
        meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id) if entry.has_arxiv else None
        if meta and meta.abstract:
            return meta.abstract

        # Fall back to a title search, taking only the top hit.
        if entry.title:
            hits = arxiv_fetcher.search_by_title(entry.title, max_results=1)
            if hits and hits[0].abstract:
                return hits[0].abstract

    return ""
+
+
# Script entry point: run the CLI when executed as `python main.py`.
if __name__ == "__main__":
    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9daac4b5e5b83ea62c2a379a6d8b6b94969dbec2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+gradio>=4.0.0
+bibtexparser>=1.4.0
+requests>=2.31.0
+beautifulsoup4>=4.12.0
+rich>=13.7.0
+Unidecode>=1.3.0
+lxml>=5.0.0
+PyYAML>=6.0
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0117b5587d0af7197827f9cd247130558430ddc1
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+"""Bibliography Checker Package"""
diff --git a/src/__pycache__/__init__.cpython-311.pyc b/src/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f7f93299a13eacb2dde71c5682e2d6728c5e004
Binary files /dev/null and b/src/__pycache__/__init__.cpython-311.pyc differ
diff --git a/src/__pycache__/__init__.cpython-313.pyc b/src/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf00bb1d45f8e27a17b606bff18240ef972d3884
Binary files /dev/null and b/src/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/analyzers/__init__.py b/src/analyzers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a37a5742021697f75381d6065ac1bce2ea13eb5
--- /dev/null
+++ b/src/analyzers/__init__.py
@@ -0,0 +1,7 @@
+"""Analyzers package"""
+from .metadata_comparator import MetadataComparator
+from .usage_checker import UsageChecker
+from .llm_evaluator import LLMEvaluator
+from .duplicate_detector import DuplicateDetector
+
+__all__ = ['MetadataComparator', 'UsageChecker', 'LLMEvaluator', 'DuplicateDetector']
diff --git a/src/analyzers/__pycache__/__init__.cpython-313.pyc b/src/analyzers/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..509937c364b785deb44375acb3aaba5013361e3d
Binary files /dev/null and b/src/analyzers/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc b/src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c7bb39c9efd0e7d77bd19c7f34e62d025ab5192
Binary files /dev/null and b/src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc differ
diff --git a/src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc b/src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf74a8d1d49fa44054e0e72b9f877dbd49b4f95e
Binary files /dev/null and b/src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc differ
diff --git a/src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc b/src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe9762b1cde28ad6e29ba3f4c3adc11d15564b76
Binary files /dev/null and b/src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc differ
diff --git a/src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc b/src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b19844c8e6b1dcc1954bf4ab5afdf1df5bb02a9b
Binary files /dev/null and b/src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc differ
diff --git a/src/analyzers/__pycache__/retraction_checker.cpython-313.pyc b/src/analyzers/__pycache__/retraction_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d2cb0813a973d1cf3460c4535633e00d0a04efa
Binary files /dev/null and b/src/analyzers/__pycache__/retraction_checker.cpython-313.pyc differ
diff --git a/src/analyzers/__pycache__/url_validator.cpython-313.pyc b/src/analyzers/__pycache__/url_validator.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63eec7a7977c4938f9e48a79bee40a00c45d3e4d
Binary files /dev/null and b/src/analyzers/__pycache__/url_validator.cpython-313.pyc differ
diff --git a/src/analyzers/__pycache__/usage_checker.cpython-313.pyc b/src/analyzers/__pycache__/usage_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7184ac66348bfe9886e264fa93a73cbfe9821a1
Binary files /dev/null and b/src/analyzers/__pycache__/usage_checker.cpython-313.pyc differ
diff --git a/src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc b/src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8adc6c39e805bbc56e450ca5f561022b393e89a8
Binary files /dev/null and b/src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc differ
diff --git a/src/analyzers/duplicate_detector.py b/src/analyzers/duplicate_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b4a7e13e846fc57b9014204993e6b7e7743734f
--- /dev/null
+++ b/src/analyzers/duplicate_detector.py
@@ -0,0 +1,204 @@
+"""
+Duplicate entry detector for bibliography files.
+Uses fuzzy matching to find potential duplicates.
+"""
+from dataclasses import dataclass
+from typing import List, Tuple
+
+from ..parsers.bib_parser import BibEntry
+from ..utils.normalizer import TextNormalizer
+
+
@dataclass
class DuplicateGroup:
    """A set of bibliography entries suspected to be duplicates of one another.

    Attributes:
        entries: the entries grouped together as likely duplicates.
        similarity_score: average pairwise similarity within the group.
        reason: human-readable explanation of why the group was formed.
    """
    entries: List[BibEntry]
    similarity_score: float
    reason: str

    @property
    def entry_keys(self) -> List[str]:
        """Citation keys of every entry in this group, in group order."""
        keys = []
        for entry in self.entries:
            keys.append(entry.key)
        return keys
+
+
class DuplicateDetector:
    """Detects duplicate bibliography entries using fuzzy matching.

    An entry pair is flagged when either the titles alone are nearly
    identical, or a weighted blend of title and author similarity is high.
    """

    # Title similarity at/above this flags a duplicate on the title alone.
    TITLE_SIMILARITY_THRESHOLD = 0.85
    # Weighted title+author similarity must reach this to flag a duplicate.
    COMBINED_SIMILARITY_THRESHOLD = 0.80

    def __init__(self):
        # TextNormalizer is used as a plain namespace of helpers, so the
        # class object itself is stored rather than an instance.
        self.normalizer = TextNormalizer

    def find_duplicates(self, entries: List[BibEntry]) -> List[DuplicateGroup]:
        """
        Find all duplicate groups in the bibliography.

        Greedy single pass: each not-yet-grouped entry collects every later
        entry similar to it; entries already assigned to a group are skipped
        (tracked by entry key).

        Returns:
            List of DuplicateGroup objects, each containing 2+ similar
            entries, sorted by similarity score (highest first).
        """
        duplicates = []
        processed = set()

        for i, entry1 in enumerate(entries):
            if entry1.key in processed:
                continue

            # Collect all later entries similar to this one
            similar_entries = [entry1]

            for entry2 in entries[i + 1:]:
                if entry2.key in processed:
                    continue

                # Per-pair reason is discarded; a group-level reason is
                # generated once the whole group is known.
                similarity, _ = self._calculate_similarity(entry1, entry2)

                if similarity >= self.COMBINED_SIMILARITY_THRESHOLD:
                    similar_entries.append(entry2)
                    processed.add(entry2.key)

            # If we found duplicates, create a group
            if len(similar_entries) > 1:
                processed.add(entry1.key)

                # Average pairwise similarity describes the whole group
                avg_similarity = self._calculate_group_similarity(similar_entries)
                reason = self._generate_reason(similar_entries)

                duplicates.append(DuplicateGroup(
                    entries=similar_entries,
                    similarity_score=avg_similarity,
                    reason=reason
                ))

        # Sort by similarity score (highest first)
        duplicates.sort(key=lambda g: g.similarity_score, reverse=True)

        return duplicates

    def _calculate_similarity(self, entry1: BibEntry, entry2: BibEntry) -> Tuple[float, str]:
        """
        Calculate similarity between two entries.

        Returns:
            (similarity_score, reason_string); the reason is empty when the
            score is below COMBINED_SIMILARITY_THRESHOLD.
        """
        # Normalize titles so case/punctuation differences don't matter
        title1 = self.normalizer.normalize_for_comparison(entry1.title)
        title2 = self.normalizer.normalize_for_comparison(entry2.title)

        title_sim = self.normalizer.similarity_ratio(title1, title2)

        # Very similar titles are sufficient evidence on their own
        if title_sim >= self.TITLE_SIMILARITY_THRESHOLD:
            return title_sim, "Very similar titles"

        author_sim = self._calculate_author_similarity(entry1, entry2)

        # Weighted blend: title (70%) dominates authors (30%)
        combined_sim = 0.7 * title_sim + 0.3 * author_sim

        if combined_sim >= self.COMBINED_SIMILARITY_THRESHOLD:
            return combined_sim, f"Similar title ({title_sim:.0%}) and authors ({author_sim:.0%})"

        return combined_sim, ""

    def _calculate_author_similarity(self, entry1: BibEntry, entry2: BibEntry) -> float:
        """Calculate similarity between the two entries' author lists."""
        authors1 = self._parse_authors(entry1.author)
        authors2 = self._parse_authors(entry2.author)

        if not authors1 or not authors2:
            return 0.0

        norm_authors1 = [self.normalizer.normalize_for_comparison(a) for a in authors1]
        norm_authors2 = [self.normalizer.normalize_for_comparison(a) for a in authors2]

        # Count authors from list 1 that have a match somewhere in list 2
        matches = 0
        for a1 in norm_authors1:
            for a2 in norm_authors2:
                if self._authors_match(a1, a2):
                    matches += 1
                    break

        # Jaccard-style ratio: matched names over total unique names
        total_unique = len(set(norm_authors1) | set(norm_authors2))
        if total_unique == 0:
            return 0.0

        return matches / total_unique

    def _parse_authors(self, author_string: str) -> List[str]:
        """Parse a BibTeX author string into a list of individual names."""
        if not author_string:
            return []

        # BibTeX separates authors with ' and '
        authors = author_string.split(' and ')

        cleaned = []
        for author in authors:
            # Collapse internal runs of whitespace
            author = ' '.join(author.split())
            if author:
                cleaned.append(author)

        return cleaned

    def _authors_match(self, name1: str, name2: str) -> bool:
        """Check if two (normalized) author names match, tolerating initials."""
        if name1 == name2:
            return True

        # Substring containment handles abbreviated forms (e.g. initials)
        if name1 in name2 or name2 in name1:
            return True

        # Fall back to fuzzy string similarity
        sim = self.normalizer.similarity_ratio(name1, name2)
        return sim >= 0.8

    def _calculate_group_similarity(self, entries: List[BibEntry]) -> float:
        """Calculate the average pairwise similarity within a group."""
        if len(entries) < 2:
            return 1.0

        total_sim = 0.0
        count = 0

        for i, entry1 in enumerate(entries):
            for entry2 in entries[i + 1:]:
                sim, _ = self._calculate_similarity(entry1, entry2)
                total_sim += sim
                count += 1

        return total_sim / count if count > 0 else 0.0

    def _generate_reason(self, entries: List[BibEntry]) -> str:
        """Generate a human-readable reason for the duplicate group."""
        titles = [self.normalizer.normalize_for_comparison(e.title) for e in entries]

        # Average pairwise title similarity determines the wording
        title_sims = []
        for i, t1 in enumerate(titles):
            for t2 in titles[i + 1:]:
                title_sims.append(self.normalizer.similarity_ratio(t1, t2))

        avg_title_sim = sum(title_sims) / len(title_sims) if title_sims else 0.0

        if avg_title_sim >= 0.95:
            return "Nearly identical titles"
        elif avg_title_sim >= 0.85:
            return "Very similar titles"
        else:
            return "Similar titles and authors"
diff --git a/src/analyzers/llm_evaluator.py b/src/analyzers/llm_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..285a0fcda794d9e220de2b9ebdee12d9049ef46c
--- /dev/null
+++ b/src/analyzers/llm_evaluator.py
@@ -0,0 +1,376 @@
+"""
+LLM-based citation relevance evaluator.
+Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama backends.
+"""
+import json
+import re
+from dataclasses import dataclass
+from typing import Optional, Dict, Any
+from enum import Enum
+import os
+
+import requests
+
+
class LLMBackend(Enum):
    """Supported LLM API backends.

    Values are the lowercase backend identifiers; member names are also
    used to build the ``<NAME>_API_KEY`` environment-variable lookup in
    ``LLMEvaluator.__init__``.
    """
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GEMINI = "gemini"
    VLLM = "vllm"
    OLLAMA = "ollama"
    DEEPSEEK = "deepseek"
+
+
@dataclass
class EvaluationResult:
    """Outcome of an LLM relevance evaluation for a single citation.

    A ``relevance_score`` of 0 together with a non-None ``error`` means
    the evaluation could not be completed.
    """
    entry_key: str
    relevance_score: int  # 1-5 (0 when evaluation failed)
    is_relevant: bool
    explanation: str
    context_used: str
    abstract_used: str
    line_number: Optional[int] = None
    file_path: Optional[str] = None
    error: Optional[str] = None

    # Human-readable labels for each valid relevance score.
    _SCORE_LABELS = {
        1: "Not Relevant",
        2: "Marginally Relevant",
        3: "Somewhat Relevant",
        4: "Relevant",
        5: "Highly Relevant",
    }

    @property
    def score_label(self) -> str:
        """Label for ``relevance_score``; "Unknown" for out-of-range scores."""
        return self._SCORE_LABELS.get(self.relevance_score, "Unknown")
+
+
class LLMEvaluator:
    """Evaluates citation relevance using an LLM.

    Sends the manuscript's citation context plus the cited paper's abstract
    to the configured backend and parses a JSON verdict from the reply.
    """

    # NOTE: literal braces in the JSON example are doubled because this
    # template is rendered with str.format().
    # FIX: the example previously showed `"is_relevant": ,` and an empty
    # explanation (the <...> placeholders had been stripped), producing an
    # invalid JSON example for the model to imitate.
    PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant.

## Citation Context (from the manuscript):
{context}

## Cited Paper's Abstract:
{abstract}

## Task:
Evaluate the relevance and appropriateness of this citation. Consider:
1. Does the citation support the claim being made in the context?
2. Is the cited paper's topic related to the discussion?
3. Is this citation necessary, or could it be replaced with a more relevant one?

## Response Format:
Provide your response in the following JSON format:
{{
    "relevance_score": <1-5 integer>,
    "is_relevant": <true or false>,
    "explanation": "<brief explanation of your assessment>"
}}

Score guide:
- 1: Not relevant at all
- 2: Marginally relevant
- 3: Somewhat relevant
- 4: Relevant and appropriate
- 5: Highly relevant and essential

STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other text."""

    def __init__(
        self,
        backend: LLMBackend = LLMBackend.GEMINI,
        endpoint: Optional[str] = None,
        model: Optional[str] = None,
        api_key: Optional[str] = None
    ):
        """
        Args:
            backend: which LLM API to call.
            endpoint: override of the backend's default API URL.
            model: override of the backend's default model name.
            api_key: API key; falls back to the ``<BACKEND>_API_KEY``
                environment variable (e.g. ``GEMINI_API_KEY``).
        """
        self.backend = backend
        self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY")

        # Per-backend default endpoint and model.
        if backend == LLMBackend.OPENAI:
            self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions"
            self.model = model or "gpt-5-mini"
        elif backend == LLMBackend.ANTHROPIC:
            self.endpoint = endpoint or "https://api.anthropic.com/v1/messages"
            self.model = model or "claude-4.5-haiku"
        elif backend == LLMBackend.DEEPSEEK:
            self.endpoint = endpoint or "https://api.deepseek.com/chat/completions"
            self.model = model or "deepseek-chat"
        elif backend == LLMBackend.OLLAMA:
            self.endpoint = endpoint or "http://localhost:11434/api/generate"
            self.model = model or "Qwen/qwen3-4B-Instruct-2507"
        elif backend == LLMBackend.VLLM:
            self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions"
            self.model = model or "Qwen/qwen3-4B-Instruct-2507"
        elif backend == LLMBackend.GEMINI:
            self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models"
            self.model = model or "gemini-2.5-flash-lite"

    def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult:
        """Evaluate citation relevance.

        Never raises: backend/parsing failures are reported via the
        ``error`` field of the returned EvaluationResult (score 0).
        """
        if not context or not abstract:
            return EvaluationResult(
                entry_key=entry_key,
                relevance_score=0,
                is_relevant=False,
                explanation="Missing context or abstract",
                context_used=context,
                abstract_used=abstract,
                error="Missing context or abstract for evaluation"
            )

        # Don't truncate - preserve full context and abstract
        prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract)

        try:
            if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
                response = self._call_openai_compatible(prompt)
            elif self.backend == LLMBackend.ANTHROPIC:
                response = self._call_anthropic(prompt)
            elif self.backend == LLMBackend.OLLAMA:
                response = self._call_ollama(prompt)
            elif self.backend == LLMBackend.GEMINI:
                response = self._call_gemini(prompt)
            else:
                raise ValueError(f"Unknown backend: {self.backend}")

            return self._parse_response(entry_key, response, context, abstract)

        except Exception as e:
            return EvaluationResult(
                entry_key=entry_key,
                relevance_score=0,
                is_relevant=False,
                explanation="",
                context_used=context,
                abstract_used=abstract,
                error=str(e)
            )

    def _call_openai_compatible(self, prompt: str) -> str:
        """Call OpenAI-compatible API (OpenAI, DeepSeek, vLLM)."""
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        payload = {
            "model": self.model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.1,
            "max_tokens": 2000
        }
        # FIX: only OpenAI gets response_format; previously the key was
        # sent with an explicit null to DeepSeek/vLLM, which strict
        # OpenAI-compatible servers can reject.
        if self.backend == LLMBackend.OPENAI:
            payload["response_format"] = {"type": "json_object"}

        response = requests.post(
            self.endpoint,
            json=payload,
            headers=headers,
            timeout=60
        )
        response.raise_for_status()

        data = response.json()
        choices = data.get("choices", [])
        if choices:
            return choices[0].get("message", {}).get("content", "")
        return ""

    def _call_anthropic(self, prompt: str) -> str:
        """Call Anthropic API."""
        headers = {
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json"
        }

        payload = {
            "model": self.model,
            "max_tokens": 2000,
            "temperature": 0.1,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }

        response = requests.post(
            self.endpoint,
            json=payload,
            headers=headers,
            timeout=60
        )
        response.raise_for_status()

        data = response.json()
        # Anthropic returns a list of content parts; take the first text part.
        content = data.get("content", [])
        if content and content[0].get("type") == "text":
            return content[0].get("text", "")
        return ""

    def _call_ollama(self, prompt: str) -> str:
        """Call Ollama API."""
        payload = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 2000
            },
            # Ask Ollama to constrain output to JSON.
            "format": "json"
        }

        response = requests.post(
            self.endpoint,
            json=payload,
            timeout=60
        )
        response.raise_for_status()

        return response.json().get("response", "")

    def _call_gemini(self, prompt: str) -> str:
        """Call Gemini API."""
        # Build URL with model; the API key is passed as a query parameter.
        url = f"{self.endpoint}/{self.model}:generateContent"
        if self.api_key:
            url += f"?key={self.api_key}"

        payload = {
            "contents": [
                {
                    "parts": [
                        {"text": prompt}
                    ]
                }
            ],
            "generationConfig": {
                "temperature": 0.1,
                "maxOutputTokens": 2000,
                "responseMimeType": "application/json"
            }
        }

        response = requests.post(
            url,
            json=payload,
            timeout=60
        )
        response.raise_for_status()

        candidates = response.json().get("candidates", [])
        if candidates:
            content = candidates[0].get("content", {})
            parts = content.get("parts", [])
            if parts:
                return parts[0].get("text", "")
        return ""

    def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult:
        """Parse the LLM's reply into an EvaluationResult.

        Tolerates extra text around the JSON object and string-typed
        score/boolean fields; parse failures are reported via ``error``.
        """
        # Try to extract a flat JSON object embedded in the response.
        json_match = re.search(r'\{[^{}]*\}', response, re.DOTALL)

        data = {}
        if not json_match:
            # Try to parse the whole response as JSON
            try:
                data = json.loads(response.strip())
            except json.JSONDecodeError:
                pass
        else:
            try:
                data = json.loads(json_match.group())
            except json.JSONDecodeError:
                pass

        if not data:
            return EvaluationResult(
                entry_key=entry_key,
                relevance_score=0,
                is_relevant=False,
                explanation=response,
                context_used=context,
                abstract_used=abstract,
                error="Failed to parse LLM response as JSON"
            )

        # Extract fields, coercing string-typed values the model may emit.
        relevance_score = data.get("relevance_score", 0)
        if isinstance(relevance_score, str):
            try:
                relevance_score = int(relevance_score)
            except ValueError:
                relevance_score = 0

        is_relevant = data.get("is_relevant", False)
        if isinstance(is_relevant, str):
            is_relevant = is_relevant.lower() in ("true", "yes", "1")

        explanation = data.get("explanation", "")

        return EvaluationResult(
            entry_key=entry_key,
            relevance_score=relevance_score,
            is_relevant=is_relevant,
            explanation=explanation,
            context_used=context,
            abstract_used=abstract
        )

    def test_connection(self) -> bool:
        """Test if the LLM backend is accessible (cheap probe, never raises)."""
        try:
            if self.backend == LLMBackend.OLLAMA:
                # /api/tags lists local models without generating anything.
                response = requests.get(
                    self.endpoint.replace("/api/generate", "/api/tags"),
                    timeout=5
                )
                return response.status_code == 200
            elif self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
                headers = {"Authorization": f"Bearer {self.api_key}"}
                if "chat/completions" in self.endpoint:
                    # Minimal one-token completion as a liveness probe.
                    payload = {
                        "model": self.model,
                        "messages": [{"role": "user", "content": "hi"}],
                        "max_tokens": 1
                    }
                    response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10)
                    return response.status_code == 200
                else:
                    return False
            elif self.backend == LLMBackend.ANTHROPIC:
                headers = {
                    "x-api-key": self.api_key,
                    "anthropic-version": "2023-06-01",
                    "content-type": "application/json"
                }
                payload = {
                    "model": self.model,
                    "max_tokens": 1,
                    "messages": [{"role": "user", "content": "hi"}]
                }
                response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10)
                return response.status_code == 200
            elif self.backend == LLMBackend.GEMINI:
                if not self.api_key:
                    return False
                url = f"{self.endpoint}/{self.model}:generateContent?key={self.api_key}"
                payload = {
                    "contents": [{"parts": [{"text": "test"}]}],
                    "generationConfig": {"maxOutputTokens": 10}
                }
                response = requests.post(url, json=payload, timeout=10)
                return response.status_code == 200
        except Exception:
            return False
        return False
diff --git a/src/analyzers/metadata_comparator.py b/src/analyzers/metadata_comparator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec53a23dc4f5fec944995e6a704f88e4533f20e8
--- /dev/null
+++ b/src/analyzers/metadata_comparator.py
@@ -0,0 +1,474 @@
+"""
+Metadata comparison between bib entries and fetched metadata.
+"""
+from dataclasses import dataclass
+from typing import Optional
+
+from ..parsers.bib_parser import BibEntry
+from ..fetchers.arxiv_fetcher import ArxivMetadata
+from ..fetchers.scholar_fetcher import ScholarResult
+from ..fetchers.crossref_fetcher import CrossRefResult
+from ..fetchers.semantic_scholar_fetcher import SemanticScholarResult
+from ..fetchers.openalex_fetcher import OpenAlexResult
+from ..fetchers.dblp_fetcher import DBLPResult
+from ..utils.normalizer import TextNormalizer
+
+
@dataclass
class ComparisonResult:
    """Outcome of matching one .bib entry against one fetched record.

    Fields are grouped by aspect: title, authors, year, then the overall
    verdict and provenance.
    """
    entry_key: str

    # -- Title comparison --
    title_match: bool
    title_similarity: float
    bib_title: str
    fetched_title: str

    # -- Author comparison --
    author_match: bool
    author_similarity: float
    bib_authors: list[str]
    fetched_authors: list[str]

    # -- Year comparison --
    year_match: bool
    bib_year: str
    fetched_year: str

    # -- Overall assessment --
    is_match: bool
    confidence: float
    issues: list[str]
    source: str  # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable'

    @property
    def has_issues(self) -> bool:
        """True when at least one issue was recorded during comparison."""
        return bool(self.issues)
+
+
class MetadataComparator:
    """Compares bibliography entries with fetched metadata.

    All public ``compare_with_*`` methods run the same pipeline
    (``_compare``); they differ only in how the fetched record's fields
    are extracted and in the source label used in issue messages.
    """

    # Thresholds for matching
    TITLE_THRESHOLD = 0.8
    AUTHOR_THRESHOLD = 0.6

    def __init__(self):
        # TextNormalizer is used as a plain namespace of helpers, so the
        # class object itself is stored rather than an instance.
        self.normalizer = TextNormalizer

    def _compare(self, bib_entry: BibEntry, fetched_title: str,
                 fetched_authors: list[str], fetched_year: str, source: str) -> ComparisonResult:
        """Shared comparison pipeline for all metadata sources.

        Args:
            bib_entry: entry from the .bib file.
            fetched_title: title reported by the external source.
            fetched_authors: raw author name strings from the external source.
            fetched_year: publication year string from the external source.
            source: short source label; also used in issue messages and
                stored on the result.
        """
        issues = []

        # Compare titles
        bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
        fetched_title_norm = self.normalizer.normalize_for_comparison(fetched_title)

        title_similarity = self.normalizer.similarity_ratio(bib_title_norm, fetched_title_norm)
        # Also try Levenshtein for short titles
        if len(bib_title_norm) < 100:
            lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, fetched_title_norm)
            title_similarity = max(title_similarity, lev_sim)

        title_match = title_similarity >= self.TITLE_THRESHOLD

        if not title_match:
            issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")

        # Compare authors
        bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
        norm_fetched_authors = [self.normalizer.normalize_author_name(a) for a in fetched_authors]

        author_similarity = self._compare_author_lists(bib_authors, norm_fetched_authors)
        author_match = author_similarity >= self.AUTHOR_THRESHOLD

        if not author_match:
            issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")

        # Compare years
        bib_year = bib_entry.year.strip()
        year_match = bib_year == fetched_year

        if not year_match and bib_year and fetched_year:
            issues.append(f"Year mismatch: bib={bib_year}, {source}={fetched_year}")

        # Overall assessment: title dominates, then authors, then year.
        is_match = title_match and author_match
        confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2)

        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=title_match,
            title_similarity=title_similarity,
            bib_title=bib_entry.title,
            fetched_title=fetched_title,
            author_match=author_match,
            author_similarity=author_similarity,
            bib_authors=bib_authors,
            fetched_authors=norm_fetched_authors,
            year_match=year_match,
            bib_year=bib_year,
            fetched_year=fetched_year,
            is_match=is_match,
            confidence=confidence,
            issues=issues,
            source=source
        )

    def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult:
        """Compare bib entry with arXiv metadata."""
        return self._compare(bib_entry, arxiv_meta.title, arxiv_meta.authors, arxiv_meta.year, "arxiv")

    def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> ComparisonResult:
        """Compare bib entry with Scholar search result."""
        # Scholar returns authors as a single comma-separated string.
        scholar_authors = [a.strip() for a in scholar_result.authors.split(',')]
        return self._compare(bib_entry, scholar_result.title, scholar_authors, scholar_result.year, "scholar")

    def compare_with_crossref(self, bib_entry: BibEntry, crossref_result: CrossRefResult) -> ComparisonResult:
        """Compare bib entry with CrossRef search result."""
        return self._compare(bib_entry, crossref_result.title, crossref_result.authors, crossref_result.year, "crossref")

    def compare_with_semantic_scholar(self, bib_entry: BibEntry, ss_result: SemanticScholarResult) -> ComparisonResult:
        """Compare bib entry with Semantic Scholar result."""
        return self._compare(bib_entry, ss_result.title, ss_result.authors, ss_result.year, "semantic_scholar")

    def compare_with_openalex(self, bib_entry: BibEntry, oa_result: OpenAlexResult) -> ComparisonResult:
        """Compare bib entry with OpenAlex result."""
        return self._compare(bib_entry, oa_result.title, oa_result.authors, oa_result.year, "openalex")

    def compare_with_dblp(self, bib_entry: BibEntry, dblp_result: DBLPResult) -> ComparisonResult:
        """Compare bib entry with DBLP result."""
        return self._compare(bib_entry, dblp_result.title, dblp_result.authors, dblp_result.year, "dblp")

    def create_unable_result(self, bib_entry: BibEntry, reason: str = "Unable to fetch metadata") -> ComparisonResult:
        """Create result when metadata couldn't be fetched."""
        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=False,
            title_similarity=0.0,
            bib_title=bib_entry.title,
            fetched_title="",
            author_match=False,
            author_similarity=0.0,
            bib_authors=self.normalizer.normalize_author_list(bib_entry.author),
            fetched_authors=[],
            year_match=False,
            bib_year=bib_entry.year,
            fetched_year="",
            is_match=False,
            confidence=0.0,
            issues=[reason],
            source="unable"
        )

    def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float:
        """Compare two author lists: average best-match score for list1's authors."""
        if not list1 and not list2:
            return 1.0
        if not list1 or not list2:
            return 0.0

        # Find best matches for each author in list1
        total_similarity = 0.0
        for author1 in list1:
            best_match = 0.0
            for author2 in list2:
                # Check if one name contains the other (handle abbreviated names)
                if self._names_match(author1, author2):
                    best_match = 1.0
                    break
                sim = self.normalizer.similarity_ratio(author1, author2)
                best_match = max(best_match, sim)
            total_similarity += best_match

        return total_similarity / len(list1)

    def _names_match(self, name1: str, name2: str) -> bool:
        """Check if two names match (handles abbreviated names)."""
        words1 = name1.split()
        words2 = name2.split()

        if not words1 or not words2:
            return False

        # Check if last names match; some sources list the family name
        # first, so also try first-vs-last word in both directions.
        if words1[-1] != words2[-1]:
            if words1[0] != words2[-1] and words1[-1] != words2[0]:
                return False

        return True
diff --git a/src/analyzers/usage_checker.py b/src/analyzers/usage_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..68c4d82da9aedb140dddf454e3a6f43af4741d7f
--- /dev/null
+++ b/src/analyzers/usage_checker.py
@@ -0,0 +1,82 @@
+"""
+Usage checker for bibliography entries in TeX files.
+"""
+from dataclasses import dataclass
+from typing import Optional
+
+from ..parsers.bib_parser import BibEntry
+from ..parsers.tex_parser import TexParser, CitationContext
+
+
@dataclass
class UsageResult:
    """Outcome of looking up one bibliography entry in the TeX sources.

    Mirrors what UsageChecker.check_usage collects: whether the entry's key
    appears in any citation command, how often, and where.
    """
    entry_key: str                   # BibTeX key that was looked up
    is_used: bool                    # True if the key is cited anywhere
    usage_count: int                 # number of citation contexts found
    contexts: list[CitationContext]  # surrounding text of each citation
    line_numbers: list[int]          # line of each citation, same order as contexts

    @property
    def first_usage_line(self) -> Optional[int]:
        """Line number of the earliest citation, or None when never cited."""
        if not self.line_numbers:
            return None
        return self.line_numbers[0]
+
+
class UsageChecker:
    """Answers "is this bibliography entry actually cited?" questions.

    The set of cited keys is snapshotted once at construction time, so the
    per-entry queries below are cheap set lookups.
    """

    def __init__(self, tex_parser: TexParser):
        self.tex_parser = tex_parser
        # Every key appearing in a citation command anywhere in the document.
        self._cited_keys = tex_parser.get_all_cited_keys()

    def check_usage(self, entry: BibEntry) -> UsageResult:
        """Build a UsageResult describing where (if anywhere) *entry* is cited."""
        contexts = self.tex_parser.get_citation_contexts(entry.key)
        return UsageResult(
            entry_key=entry.key,
            is_used=entry.key in self._cited_keys,
            usage_count=len(contexts),
            contexts=contexts,
            line_numbers=[c.line_number for c in contexts],
        )

    def get_unused_entries(self, entries: list[BibEntry]) -> list[BibEntry]:
        """Return the entries whose keys never appear in any citation."""
        return [e for e in entries if e.key not in self._cited_keys]

    def get_missing_entries(self, entries: list[BibEntry]) -> list[str]:
        """Return cited keys that have no corresponding bibliography entry."""
        known = {e.key for e in entries}
        return [k for k in self._cited_keys if k not in known]

    def get_combined_context(self, key: str, max_chars: int = 1000) -> str:
        """Concatenate all citation contexts for *key*, capped at *max_chars*.

        Contexts are joined with a "---" separator; when the budget runs
        out, a truncated tail is appended only if a meaningful amount
        (>100 characters) of room remains.
        """
        contexts = self.tex_parser.get_citation_contexts(key)
        if not contexts:
            return ""

        pieces = []
        used = 0
        for ctx in contexts:
            text = ctx.full_context
            if used + len(text) > max_chars:
                room = max_chars - used
                if room > 100:
                    pieces.append(text[:room] + "...")
                break
            pieces.append(text)
            used += len(text)

        return "\n---\n".join(pieces)
diff --git a/src/checkers/__init__.py b/src/checkers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dfa6f6c745d324ef8d6b1a4909050ae5fd07987
--- /dev/null
+++ b/src/checkers/__init__.py
@@ -0,0 +1,66 @@
+"""Checkers module for paper submission quality checks."""
+from .base import BaseChecker, CheckResult, CheckSeverity
+from .caption_checker import CaptionChecker
+from .reference_checker import ReferenceChecker
+from .ai_artifacts_checker import AIArtifactsChecker
+from .formatting_checker import FormattingChecker
+from .anonymization_checker import AnonymizationChecker
+from .number_checker import NumberChecker
+from .sentence_checker import SentenceChecker
+from .consistency_checker import ConsistencyChecker
+from .citation_quality_checker import CitationQualityChecker
+from .equation_checker import EquationChecker
+from .acronym_checker import AcronymChecker
+
# Public API of the checkers package: the result types plus every
# concrete checker class re-exported above.
__all__ = [
    'BaseChecker',
    'CheckResult',
    'CheckSeverity',
    'CaptionChecker',
    'ReferenceChecker',
    'AIArtifactsChecker',
    'FormattingChecker',
    'AnonymizationChecker',
    'NumberChecker',
    'SentenceChecker',
    'ConsistencyChecker',
    'CitationQualityChecker',
    'EquationChecker',
    'AcronymChecker',
]


# Registry of all available checkers
# Maps a short machine name (as used in config/CLI) to the checker CLASS,
# not an instance; instantiation happens in get_checker() and
# run_all_checkers(). Dict order determines the order checkers run in.
CHECKER_REGISTRY = {
    'caption': CaptionChecker,
    'reference': ReferenceChecker,
    'ai_artifacts': AIArtifactsChecker,
    'formatting': FormattingChecker,
    'anonymization': AnonymizationChecker,
    'number': NumberChecker,
    'sentence': SentenceChecker,
    'consistency': ConsistencyChecker,
    'citation_quality': CitationQualityChecker,
    'equation': EquationChecker,
    'acronym': AcronymChecker,
}
+
+
def get_checker(name: str) -> BaseChecker:
    """Instantiate the checker registered under *name*.

    Raises:
        ValueError: if *name* is not a key of CHECKER_REGISTRY.
    """
    try:
        checker_cls = CHECKER_REGISTRY[name]
    except KeyError:
        raise ValueError(f"Unknown checker: {name}") from None
    return checker_cls()
+
+
def run_all_checkers(tex_content: str, config: dict = None) -> list:
    """Run every registered checker over *tex_content*.

    Args:
        tex_content: Full text of the TeX document to audit.
        config: Optional shared configuration dict passed to each checker;
            an empty dict is substituted when omitted.

    Returns:
        Flat list of CheckResult objects from all checkers, in registry order.
    """
    results = []
    config = config or {}

    # Only the classes are needed here, so iterate values() — the previous
    # .items() loop fetched the registry name and never used it.
    for checker_class in CHECKER_REGISTRY.values():
        checker = checker_class()
        results.extend(checker.check(tex_content, config))

    return results
diff --git a/src/checkers/__pycache__/__init__.cpython-313.pyc b/src/checkers/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dfbadc876a0bde939483c7b0f82bf06d24750cbd
Binary files /dev/null and b/src/checkers/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/acronym_checker.cpython-313.pyc b/src/checkers/__pycache__/acronym_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ac22b6f83d4ca40144e2133b2c42c6080d78275
Binary files /dev/null and b/src/checkers/__pycache__/acronym_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc b/src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b174f02eae34f07ad56e5e64a30638db94aa0a39
Binary files /dev/null and b/src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/anonymization_checker.cpython-313.pyc b/src/checkers/__pycache__/anonymization_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..235ac14399e902e0cd1888e78a3ec1aca756822b
Binary files /dev/null and b/src/checkers/__pycache__/anonymization_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/base.cpython-313.pyc b/src/checkers/__pycache__/base.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f8a48378cd6d7d880c2ea0c1635ab2d266402df
Binary files /dev/null and b/src/checkers/__pycache__/base.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/caption_checker.cpython-313.pyc b/src/checkers/__pycache__/caption_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ded8600e65ad18259c7d969ba6d3a6a88b64c677
Binary files /dev/null and b/src/checkers/__pycache__/caption_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc b/src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0403cd564b3cb9927f6b0f05c6c51afb8e213a54
Binary files /dev/null and b/src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/consistency_checker.cpython-313.pyc b/src/checkers/__pycache__/consistency_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c64657741db59e4627caae647a1ff9e8a725492
Binary files /dev/null and b/src/checkers/__pycache__/consistency_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/equation_checker.cpython-313.pyc b/src/checkers/__pycache__/equation_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d598e60c925c8e5d0c4505d3f97909dbe9e4e818
Binary files /dev/null and b/src/checkers/__pycache__/equation_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/formatting_checker.cpython-313.pyc b/src/checkers/__pycache__/formatting_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2aaa1357a5a6800b184ae2f5b6dcae7c2df51cb
Binary files /dev/null and b/src/checkers/__pycache__/formatting_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/number_checker.cpython-313.pyc b/src/checkers/__pycache__/number_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af09ad160e7cf0d63d4cd3685ca98973e9175fa0
Binary files /dev/null and b/src/checkers/__pycache__/number_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/reference_checker.cpython-313.pyc b/src/checkers/__pycache__/reference_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c02ac2fa5d31c58c4e9909fecfb0616cd0fca9d9
Binary files /dev/null and b/src/checkers/__pycache__/reference_checker.cpython-313.pyc differ
diff --git a/src/checkers/__pycache__/sentence_checker.cpython-313.pyc b/src/checkers/__pycache__/sentence_checker.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00cf2c808194adcccaa032ee526542c5715e25ca
Binary files /dev/null and b/src/checkers/__pycache__/sentence_checker.cpython-313.pyc differ
diff --git a/src/checkers/acronym_checker.py b/src/checkers/acronym_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..a79955af212923f0476d3c4de853aa05a3b1933c
--- /dev/null
+++ b/src/checkers/acronym_checker.py
@@ -0,0 +1,284 @@
+"""
+Acronym and abbreviation checker.
+
+Validates that:
+- Acronyms found in text have corresponding full forms defined
+- Acronyms are used after their definition
+- Only checks acronyms that have matching full forms in the document
+"""
+import re
+from typing import List, Dict, Set, Tuple
+from collections import defaultdict
+
+from .base import BaseChecker, CheckResult, CheckSeverity
+
+
class AcronymChecker(BaseChecker):
    """Check acronym definitions and consistency.

    Flags two situations:
      * an acronym whose plausible full form appears somewhere in the
        document but which is never formally defined as "Full Form (ACRONYM)";
      * an acronym used at a position before its definition.

    Acronyms with no matching full form anywhere in the text, and a large
    list of well-known abbreviations, are deliberately ignored to keep the
    false-positive rate low.
    """

    name = "acronym"
    display_name = "Acronyms"
    description = "Check acronym definitions and consistent usage"

    # Enhanced pattern to find defined acronyms with LaTeX formatting support
    # Matches: "Full Name (ACRONYM)", "(ACRONYM; Full Name)", "Full Name (\textbf{ACRONYM})", etc.
    DEFINITION_PATTERN = re.compile(
        r'([A-Z][a-zA-Z\s\-]+)\s*\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?\)|'  # Full Name (ABC) or Full Name (\textbf{ABC})
        r'\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?;\s*([A-Za-z\s\-]+)\)',  # (ABC; Full Name) or (\textbf{ABC}; Full Name)
        re.MULTILINE
    )

    # Pattern to find standalone acronyms (3+ capital letters, optional plural 's')
    ACRONYM_PATTERN = re.compile(r'\b([A-Z]{3,}s?)\b')

    # Filler words permitted between the content words of a full form,
    # e.g. "Bag of Words (BOW)".
    FILLER_WORDS = ('of', 'the', 'and', 'for', 'in', 'on', 'with', 'to')

    # Words that mark a candidate expansion as ordinary prose rather than a
    # real full form. Hoisted to class level so the set is built once, not
    # once per regex match.
    EXCLUDED_WORDS = frozenset({
        'that', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'or', 'not', 'no', 'yes', 'if', 'but', 'as', 'at', 'by', 'from',
        'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
        'can', 'could', 'may', 'might', 'must', 'shall',
        'this', 'these', 'those', 'such', 'which', 'what', 'who', 'when', 'where',
        'how', 'why', 'all', 'each', 'every', 'some', 'any', 'many', 'much',
        'more', 'most', 'less', 'few', 'several', 'other', 'another'
    })

    # Comprehensive list of common acronyms that don't need definition.
    # (Duplicate literals from the original list — SSD, GPU, EM, MAP, AAAI,
    # IJCAI — were removed; the resulting set is unchanged.)
    COMMON_ACRONYMS = {
        # Hardware & Computing
        'GPU', 'CPU', 'TPU', 'RAM', 'ROM', 'SSD', 'HDD', 'USB', 'BIOS', 'OS',
        'API', 'SDK', 'IDE', 'GUI', 'CLI', 'URL', 'URI', 'DNS', 'IP', 'TCP',
        'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS', 'VPN', 'LAN', 'WAN',

        # File Formats & Standards
        'PDF', 'HTML', 'CSS', 'XML', 'JSON', 'YAML', 'CSV', 'TSV', 'SQL',
        'UTF', 'ASCII', 'JPEG', 'PNG', 'GIF', 'SVG', 'MP3', 'MP4', 'ZIP',

        # AI & Machine Learning (General)
        'AI', 'ML', 'DL', 'NN', 'ANN', 'DNN', 'CNN', 'RNN', 'LSTM', 'GRU',
        'GAN', 'VAE', 'MLP', 'SVM', 'KNN', 'PCA', 'ICA', 'LDA', 'EM',
        'SGD', 'ADAM', 'RMSPROP', 'ADAGRAD', 'LBFGS',

        # NLP & Language Models
        'NLP', 'LLM', 'GPT', 'BERT', 'BART', 'T5', 'ELECTRA', 'ROBERTA',
        'NER', 'POS', 'QA', 'MT', 'ASR', 'TTS', 'NMT', 'SMT',
        'BLEU', 'ROUGE', 'METEOR', 'CIDEr', 'SPICE', 'WER', 'CER',

        # Computer Vision
        'CV', 'OCR', 'YOLO', 'RCNN', 'FCN', 'UNET', 'RESNET', 'VGG',
        'RGB', 'HSV', 'YUV', 'SIFT', 'SURF', 'ORB', 'HOG', 'SSIM', 'PSNR',

        # Reinforcement Learning
        'RL', 'DQN', 'DDPG', 'PPO', 'A3C', 'TRPO', 'SAC', 'TD3', 'MDP',
        'POMDP', 'RLHF', 'RLAIF',

        # Metrics & Evaluation
        'F1', 'AUC', 'ROC', 'PR', 'MAP', 'NDCG', 'MRR', 'MSE', 'MAE', 'RMSE',
        'MAPE', 'R2', 'IoU', 'AP', 'mAP', 'FPS', 'FLOPs', 'FLOPS',

        # Data & Statistics
        'IID', 'OOD', 'KL', 'JS', 'EMD', 'MMD', 'ELBO', 'VI', 'MCMC',
        'MLE', 'GMM', 'HMM', 'CRF', 'MRF',

        # Academic & Organizations
        'IEEE', 'ACM', 'AAAI', 'IJCAI', 'ICML', 'ICLR', 'NEURIPS', 'NIPS',
        'ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL', 'CVPR', 'ICCV', 'ECCV',
        'SIGIR', 'KDD', 'WWW', 'CIKM', 'WSDM', 'ICDE', 'VLDB', 'SIGMOD',
        'AISTATS', 'UAI', 'COLT', 'ALT',

        # Methods & Techniques (Common in ML papers)
        'SOTA', 'E2E', 'RAG', 'CoT', 'ToT', 'GoT', 'ICL', 'FSL', 'ZSL',
        'PEFT', 'LORA', 'QLORA', 'SFT', 'DPO', 'SPIN', 'URPO', 'SPELL',
        'STaR', 'ReST', 'RRHF', 'RAFT', 'LIMA', 'ORPO',

        # Misc
        'USD', 'EUR', 'GBP', 'EU', 'US', 'UK', 'UN', 'NATO', 'NASA',
        'ID', 'UID', 'UUID', 'MD5', 'SHA', 'AES', 'RSA', 'JWT',
        'CRUD', 'REST', 'SOAP', 'RPC', 'AJAX', 'DOM', 'OOP', 'MVC',
        'CI', 'CD', 'DevOps', 'AWS', 'GCP', 'NPU', 'ASIC', 'FPGA',
    }

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan *tex_content* and report undefined / prematurely used acronyms.

        Returns a list of CheckResult objects (all WARNING severity).
        """
        results = []

        # Work on comment-free text so commented-out drafts are ignored.
        content = self._remove_comments(tex_content)

        # Definition position per acronym, every usage position, and a
        # plausible full form (when one exists in the document).
        defined_acronyms = self._find_definitions(content)
        all_usages = self._find_all_usages(content)
        acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())

        for acronym, positions in all_usages.items():
            if acronym in self.COMMON_ACRONYMS:
                continue

            # Only report acronyms whose full form actually occurs in the
            # document; anything else is likely a product/dataset name.
            if acronym not in acronym_full_forms:
                continue

            if acronym not in defined_acronyms:
                # Never formally defined: flag the first usage.
                first_pos = positions[0]
                line_num = self._find_line_number(content, first_pos)
                full_form = acronym_full_forms[acronym]

                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Acronym '{acronym}' used without definition (possible full form: '{full_form}')",
                    line_number=line_num,
                    suggestion=f"Define on first use: '{full_form} ({acronym})'"
                ))
            else:
                # Defined somewhere: flag the first usage preceding the
                # definition, if any (one report per acronym).
                def_pos = defined_acronyms[acronym]
                for pos in positions:
                    if pos < def_pos:
                        line_num = self._find_line_number(content, pos)
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"Acronym '{acronym}' used before definition",
                            line_number=line_num,
                            suggestion="Move definition before first use"
                        ))
                        break

        return results

    def _find_potential_full_forms(self, content: str, acronyms: Set[str]) -> Dict[str, str]:
        """Find potential full forms for acronyms by matching capital letters.

        For "ABC" we look for a word sequence whose initials spell A-B-C,
        optionally separated by filler words ("of", "for", ...).
        """
        full_forms = {}
        filler_alt = '|'.join(self.FILLER_WORDS)

        for acronym in acronyms:
            if acronym in self.COMMON_ACRONYMS:
                continue

            acronym_clean = acronym.rstrip('s')  # drop plural (lowercase) 's'
            if len(acronym_clean) < 3:
                continue

            # Build a pattern whose word initials spell the acronym.
            # BUGFIX: the previous per-letter fragment was
            # "(?:\s+word\s+)?\s+X[a-z]+" — the optional filler group
            # consumed the whitespace *after* the filler and then the
            # mandatory "\s+" demanded a second run of whitespace, so
            # expansions containing a filler word could never match.
            # Fillers are now zero-or-more "\s+word" units followed by a
            # single "\s+" before each content word.
            pattern_parts = [f'{acronym_clean[0]}[a-z]+']
            for letter in acronym_clean[1:]:
                pattern_parts.append(f'(?:\\s+(?:{filler_alt}))*\\s+{letter}[a-z]+')

            full_pattern = r'\b' + ''.join(pattern_parts) + r'\b'

            try:
                for match in re.finditer(full_pattern, content, re.IGNORECASE):
                    candidate = match.group(0)

                    # Reject candidates containing everyday prose words —
                    # those matches are sentence fragments, not expansions.
                    candidate_words = re.findall(r'\b[A-Za-z]+\b', candidate.lower())
                    if any(word in self.EXCLUDED_WORDS for word in candidate_words):
                        continue

                    # Verify: initials of the meaningful (non-filler) words
                    # must spell the acronym exactly.
                    words = re.findall(r'\b[A-Z][a-z]+', candidate, re.IGNORECASE)
                    filler_words = {'of', 'and', 'for', 'in', 'on', 'with', 'to', 'a', 'an'}
                    meaningful_words = [w for w in words if w.lower() not in filler_words]

                    if len(meaningful_words) >= len(acronym_clean):
                        first_letters = ''.join(w[0].upper() for w in meaningful_words[:len(acronym_clean)])
                        if first_letters == acronym_clean:
                            full_forms[acronym] = candidate
                            break  # first verified match wins
            except re.error:
                # Acronym produced an invalid pattern; skip it.
                continue

        return full_forms

    def _find_definitions(self, content: str) -> Dict[str, int]:
        """Find all acronym definitions and their character positions."""
        definitions = {}

        for match in self.DEFINITION_PATTERN.finditer(content):
            # The acronym is in group 2 ("Full Name (ABC)" form) or
            # group 3 ("(ABC; Full Name)" form).
            acronym = match.group(2) or match.group(3)
            if acronym:
                acronym = acronym.rstrip('s')  # normalize away plural 's'
                definitions[acronym] = match.start()

        return definitions

    def _find_all_usages(self, content: str) -> Dict[str, List[int]]:
        """Find all acronym usages, excluding special contexts (labels, math, URLs...)."""
        usages = defaultdict(list)

        for match in self.ACRONYM_PATTERN.finditer(content):
            acronym = match.group(1).rstrip('s')  # same normalization as definitions
            pos = match.start()

            if self._is_in_special_context(content, pos, acronym):
                continue

            usages[acronym].append(pos)

        return usages

    def _is_in_special_context(self, content: str, pos: int, acronym: str) -> bool:
        """Check if acronym at position is in a context that should be ignored."""
        # Examine a small window around the occurrence.
        start = max(0, pos - 50)
        end = min(len(content), pos + len(acronym) + 50)
        before = content[start:pos]
        after = content[pos + len(acronym):end]

        # Inside definition parentheses: (ACRONYM) — handled by DEFINITION_PATTERN.
        if before.endswith('(') and after.startswith(')'):
            return True

        # Part of a LaTeX command: \ACRONYM
        if before.rstrip().endswith('\\'):
            return True

        # Inside a label: \label{...ACRONYM...}
        if r'\label{' in before[-20:] and '}' in after[:20]:
            return True

        # Inside a reference/citation argument: \ref{...ACRONYM
        if re.search(r'\\(?:ref|cite|autoref|cref|eqref)\{[^}]*$', before[-30:]):
            return True

        # Inside a URL: \url{...} or http://...
        if r'\url{' in before[-20:] or 'http' in before[-20:]:
            return True

        # Inside inline math: odd number of unescaped '$' before us.
        dollar_count = before.count('$') - before.count(r'\$')
        if dollar_count % 2 == 1:
            return True

        # Inside a display-math environment that opened within the window.
        if re.search(r'\\begin\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
            if not re.search(r'\\end\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
                return True

        # LaTeX optional argument: \command[ACRONYM]
        if before.endswith('[') and after.startswith(']'):
            return True

        # File path or extension (e.g. "style.CLS", "dir/NAME").
        # BUGFIX: previously any '.' within the preceding five characters
        # triggered this, so acronyms starting a new sentence
        # ("... end. NASA launched ...") were silently skipped; now only a
        # period directly attached to the acronym counts.
        if before.endswith('.') or '/' in before[-10:]:
            return True

        return False
diff --git a/src/checkers/ai_artifacts_checker.py b/src/checkers/ai_artifacts_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..209af561a41e25d1d9f617bfc3e7ee9ad1ba9205
--- /dev/null
+++ b/src/checkers/ai_artifacts_checker.py
@@ -0,0 +1,176 @@
+"""
+AI artifacts checker.
+
+Detects leftover text from AI writing assistants that should be removed
+before submission, such as:
+- Conversational responses ("Sure, here is...")
+- Placeholder text
+- Markdown formatting artifacts
+- Common AI response patterns
+"""
+import re
+from typing import List, Tuple
+
+from .base import BaseChecker, CheckResult, CheckSeverity
+
+
class AIArtifactsChecker(BaseChecker):
    """Detect AI-generated text artifacts that should be removed.

    Scans line by line (outside verbatim-like environments and comments)
    for three categories: conversational AI phrases (ERROR), placeholder
    text (WARNING), and Markdown syntax left over from AI output (INFO).
    """

    name = "ai_artifacts"
    display_name = "AI Artifacts"
    description = "Detect leftover AI assistant text and placeholders"

    # Conversational AI patterns (case insensitive)
    # These are phrases that clearly indicate a dialogue between user and AI assistant
    AI_CONVERSATION_PATTERNS = [
        # Responses to requests
        (r'\bsure[,!]?\s*(here\s+is|i\'ll|i\s+will|let\s+me)\b', "Conversational AI response"),
        (r'\bi\'?d\s+be\s+happy\s+to\b', "Conversational AI response"),
        (r'\bi\'?m\s+happy\s+to\s+help\b', "Conversational AI response"),
        (r'\bcertainly[!,]\s*here\b', "Conversational AI response"),
        (r'\bof\s+course[!,]\s*(here|i)\b', "Conversational AI response"),
        (r'\babsolutely[!,]\s*(here|let\s+me)\b', "Conversational AI response"),

        # Self-identification
        (r'\bas\s+an?\s+ai\s+(language\s+)?model\b', "AI self-reference"),
        (r'\bas\s+a\s+large\s+language\s+model\b', "AI self-reference"),
        (r'\bmy\s+knowledge\s+cutoff\b', "AI knowledge cutoff reference"),

        # Explanatory transitions typical of chat
        (r'\blet\s+me\s+(explain|help|clarify|break\s+this\s+down)\b', "Conversational AI response"),
        (r'\bhere\'?s\s+(a|an|the|my)\s+(revised|updated|improved|rewrite)\b', "Conversational AI response"),
        (r'\bhere\s+is\s+(the|a|an)\s+(summary|breakdown|explanation|code|example)\b', "Conversational AI response"),

        # Closing/Politeness
        (r'\bhope\s+this\s+helps\b', "Conversational AI closing"),
        (r'\bfeel\s+free\s+to\s+ask\b', "Conversational AI closing"),
        (r'\blet\s+me\s+know\s+if\b', "Conversational AI closing"),
        (r'\bthank\s+you\s+for\s+(asking|your\s+question)\b', "Conversational AI response"),
        (r'\bgreat\s+question[!,]?\b', "Conversational AI response"),
        (r'\b(excellent|good|great)\s+point\b', "Conversational AI response"),

        # Instructions/Meta-commentary
        (r'\bbased\s+on\s+the\s+information\s+provided\b', "Conversational AI response"),
        (r'\b(remember|note)\s+that\b', "Conversational AI instruction"),
        (r'\bplease\s+note\s+that\b', "Conversational AI instruction"),
    ]

    # Placeholder patterns
    PLACEHOLDER_PATTERNS = [
        (r'\[insert\s+[^\]]+\s*here\]', "Placeholder text"),
        (r'\[add\s+[^\]]+\]', "Placeholder text"),
        (r'\[todo[:\s][^\]]*\]', "TODO placeholder"),
        (r'\btodo\s*:\s*.{0,50}', "TODO comment"),
        (r'\bfixme\s*:\s*.{0,50}', "FIXME comment"),
        (r'\bxxx\b', "XXX placeholder"),
        (r'\byour[\s_-]*(name|email|institution|university)\b', "Placeholder for personal info"),
        (r'author[\s_-]*name', "Author name placeholder"),
        (r'your\.?email@example\.com', "Email placeholder"),
        (r'example@(example\.com|university\.edu)', "Email placeholder"),
        (r'\[citation\s+needed\]', "Citation needed placeholder"),
    ]

    # Markdown artifacts (should not appear in LaTeX).
    # NOTE(review): the entries after "Markdown bold" were RECONSTRUCTED —
    # the original list was corrupted in transit (truncated at an invalid
    # "(?" fragment). check() special-cases descriptions containing
    # "bullet point" and "italic", so those two entries must exist;
    # confirm the exact regexes against the original source if available.
    MARKDOWN_PATTERNS = [
        (r'^\s*#{1,6}\s+\w', "Markdown header"),
        (r'\*\*[^*]+\*\*', "Markdown bold"),
        (r'(?<!\*)\*[^*\s][^*]*\*(?!\*)', "Markdown italic"),
        (r'^\s*[-*+]\s+\w', "Markdown bullet point"),
        (r'\[[^\]]+\]\([^)]+\)', "Markdown link"),
    ]

    # Environments whose content is raw text and must not be linted.
    VERBATIM_ENVS = ['verbatim', 'lstlisting', 'minted', 'comment', 'raw', 'filecontents', 'tcolorbox']
    # Precompiled boundary patterns — previously rebuilt for every line.
    _VERBATIM_BEGIN_RE = re.compile(r'\\begin\{(?:' + '|'.join(VERBATIM_ENVS) + r')\*?\}')
    _VERBATIM_END_RE = re.compile(r'\\end\{(?:' + '|'.join(VERBATIM_ENVS) + r')\*?\}')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan each non-verbatim, non-comment line for AI artifacts."""
        results = []
        lines = tex_content.split('\n')

        # Track if we are inside a verbatim-like environment
        in_verbatim = False

        for line_num, line in enumerate(lines, 1):
            # Environment boundaries: skip the \begin/\end lines themselves.
            if self._VERBATIM_BEGIN_RE.search(line):
                in_verbatim = True
                continue
            if self._VERBATIM_END_RE.search(line):
                in_verbatim = False
                continue

            # Skip checks if inside verbatim environment
            if in_verbatim:
                continue

            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments for checking using base class method
            line_to_check = self._remove_line_comment(line)

            # Check AI conversation patterns
            for pattern, description in self.AI_CONVERSATION_PATTERNS:
                if re.search(pattern, line_to_check, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{description} detected",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Remove AI-generated conversational text"
                    ))
                    break  # One match per line for this category

            # Check placeholder patterns
            for pattern, description in self.PLACEHOLDER_PATTERNS:
                match = re.search(pattern, line_to_check, re.IGNORECASE)
                if match:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"{description}: '{match.group(0)[:50]}'",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace placeholder with actual content or remove"
                    ))

            # Check Markdown patterns (less strict - might be intentional in some cases)
            for pattern, description in self.MARKDOWN_PATTERNS:
                # Skip if line looks like a LaTeX command (starts with \)
                if line_to_check.strip().startswith('\\'):
                    continue

                # Special handling for bullet points: avoid math false positives.
                if "bullet point" in description:
                    # Skip if it looks like a math subtraction or negative number
                    if re.search(r'[-+]\d', line_to_check):
                        continue
                    # Skip if inside math mode (simple heuristic)
                    if '$' in line_to_check:
                        continue

                # Special handling for italics: avoid matching math mode like $x*y$
                if "italic" in description:
                    if '$' in line_to_check:
                        continue

                if re.search(pattern, line_to_check):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Possible {description} in LaTeX",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Convert to LaTeX formatting or remove if unintentional"
                    ))

        return results
diff --git a/src/checkers/anonymization_checker.py b/src/checkers/anonymization_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c17e53510ffc4954b128a355b79ec95c37b727c
--- /dev/null
+++ b/src/checkers/anonymization_checker.py
@@ -0,0 +1,216 @@
+"""
+Anonymization checker.
+
+For double-blind review submissions, checks for:
+- Author name leaks in acknowledgments
+- Personal URLs (GitHub, personal pages)
+- Self-citations that reveal identity
+- Institutional information in comments
+"""
+import re
+from typing import List
+
+from .base import BaseChecker, CheckResult, CheckSeverity
+
+
class AnonymizationChecker(BaseChecker):
    """Check for anonymization issues in double-blind submissions.

    The checker first guesses whether the document is a review (anonymous)
    version; camera-ready documents are skipped entirely. For review
    versions it scans for personal URLs (including inside comments, at
    reduced severity), a non-commented acknowledgments section,
    self-revealing citation phrasing, and a non-anonymous \\author field.
    """

    name = "anonymization"
    display_name = "Anonymization"
    description = "Detect potential identity leaks in double-blind submissions"

    # Patterns for identity-revealing content: (regex, human-readable label).
    PERSONAL_URL_PATTERNS = [
        (r'github\.com/[a-zA-Z0-9_-]+', "GitHub profile URL"),
        (r'gitlab\.com/[a-zA-Z0-9_-]+', "GitLab profile URL"),
        (r'twitter\.com/[a-zA-Z0-9_]+', "Twitter profile URL"),
        (r'linkedin\.com/in/[a-zA-Z0-9_-]+', "LinkedIn profile URL"),
        (r'huggingface\.co/[a-zA-Z0-9_-]+', "HuggingFace profile URL"),
        (r'~[a-zA-Z]+/', "Personal university page"),
        (r'people\.[a-zA-Z]+\.edu', "Academic personal page"),
        (r'homes\.[a-zA-Z]+\.(edu|ac\.[a-z]+)', "Academic home page"),
    ]

    # Anonymous submission indicators (should be present).
    # NOTE(review): not referenced by check() below — presumably reserved
    # for a future "is properly anonymized" positive check; confirm intent.
    ANONYMOUS_MARKERS = [
        r'\\author\{[^}]*anonymous[^}]*\}',
        r'anonymous\s+submission',
        r'\\runningauthor\{[^}]*\}',  # Should be empty or generic
    ]

    # Potentially revealing patterns: phrasing that implies a cited work is
    # the authors' own.
    SELF_CITE_PATTERNS = [
        r'\\cite[pt]?\{[^}]*\}\s*(?:show|demonstrate|propose|present|introduce)',
        r'(?:our|we)\s+(?:previous|prior|earlier)\s+(?:work|paper|study)',
        r'(?:as\s+)?(?:we|the\s+authors?)\s+(?:have\s+)?(?:shown|demonstrated|proved)',
    ]

    # Start of an acknowledgments section: \section{Acknowledgment...} or
    # \begin{ack...} (matches both US/UK spellings via the common prefix).
    ACK_PATTERN = re.compile(
        r'\\(?:section\*?\{acknowledgment|begin\{ack)',
        re.IGNORECASE
    )

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all anonymization checks on *tex_content*.

        Returns a single INFO result for camera-ready documents; otherwise
        a list of ERROR/WARNING results for each potential identity leak.
        """
        results = []
        lines = tex_content.split('\n')

        # Check if this is a review submission (look for anonymous author)
        is_review_version = self._is_review_version(tex_content)

        if not is_review_version:
            # If camera-ready, skip anonymization checks
            results.append(self._create_result(
                passed=True,
                severity=CheckSeverity.INFO,
                message="Document appears to be camera-ready version (not checking anonymization)"
            ))
            return results

        # Check for personal URLs
        for line_num, line in enumerate(lines, 1):
            # Skip comments, but still check for leaks in comments!
            # (Comment text ships with the submitted source, so a URL there
            # is still a leak — reported at WARNING instead of ERROR.)
            if self._is_comment_line(line):
                for pattern, desc in self.PERSONAL_URL_PATTERNS:
                    if re.search(pattern, line, re.IGNORECASE):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"{desc} in comment (could be revealed when compiling)",
                            line_number=line_num,
                            line_content=line.strip()[:100],
                            suggestion="Remove or anonymize URL even in comments"
                        ))
                continue

            for pattern, desc in self.PERSONAL_URL_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{desc} may reveal author identity",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace with anonymized URL or remove for review"
                    ))

        # Check acknowledgments section
        ack_results = self._check_acknowledgments(tex_content, lines)
        results.extend(ack_results)

        # Check for self-revealing citations
        for line_num, line in enumerate(lines, 1):
            # Skip comments using base class method
            if self._is_comment_line(line):
                continue

            for pattern in self.SELF_CITE_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Potentially self-revealing citation pattern",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
                    ))

        # Check for \author content
        author_results = self._check_author_field(tex_content)
        results.extend(author_results)

        return results

    def _is_review_version(self, content: str) -> bool:
        """Detect if this is a review (anonymous) version.

        Only the first 2000 characters are examined — the assumption is
        that documentclass options / titlepage markers appear early.
        """
        # Check for common anonymous submission markers
        review_indicators = [
            r'review',
            r'submitted\s+to',
            r'under\s+review',
            r'anonymous',
            r'\\usepackage\[review\]',
        ]

        for indicator in review_indicators:
            if re.search(indicator, content[:2000], re.IGNORECASE):
                return True

        # Check for camera-ready indicators (negative)
        camera_indicators = [
            r'\\usepackage\[accepted\]',
            r'\\usepackage\[final\]',
            r'camera[\s-]?ready',
        ]

        for indicator in camera_indicators:
            if re.search(indicator, content[:2000], re.IGNORECASE):
                return False

        # Default to review version (safer)
        return True

    def _check_acknowledgments(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check acknowledgments section for identity leaks.

        Only reports the section when its opening line is NOT commented out.
        """
        results = []

        # Find acknowledgment section
        ack_match = self.ACK_PATTERN.search(content)
        if not ack_match:
            return results

        # Find the line number
        ack_line = self._find_line_number(content, ack_match.start())

        # Check if it's commented out (guard against out-of-range index)
        actual_line = lines[ack_line - 1] if ack_line <= len(lines) else ""
        if not actual_line.lstrip().startswith('%'):
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Acknowledgments section found - should be commented out for review",
                line_number=ack_line,
                suggestion="Comment out acknowledgments with % for anonymous submission"
            ))

        return results

    def _check_author_field(self, content: str) -> List[CheckResult]:
        """Check \\author{} field for revealing content.

        Extracts the (possibly multi-line, nested-brace) argument of the
        first \\author command and flags it when it neither mentions
        "anonymous" nor uses an anonymization command but does contain
        something shaped like a personal name (Two Capitalized Words).
        """
        results = []

        # Find \author{...} - handle multiline
        author_pattern = re.compile(r'\\author\s*\{', re.DOTALL)
        match = author_pattern.search(content)

        if match:
            # Extract author content (handle nested braces by counting depth)
            start = match.end()
            brace_count = 1
            i = start
            while i < len(content) and brace_count > 0:
                if content[i] == '{':
                    brace_count += 1
                elif content[i] == '}':
                    brace_count -= 1
                i += 1

            author_content = content[start:i-1]
            line_num = self._find_line_number(content, match.start())

            # Check if author content looks anonymous
            if not re.search(r'anonymous|author\s*names?\s*hidden', author_content, re.IGNORECASE):
                # Check if it's not using \Anonymous or similar
                if not re.search(r'\\(Anonymous|blindauthor)', author_content):
                    # Might contain real author info
                    if re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', author_content):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.ERROR,
                            message="Author field may contain real names",
                            line_number=line_num,
                            suggestion="Replace with 'Anonymous' or use anonymization command"
                        ))

        return results
diff --git a/src/checkers/base.py b/src/checkers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..de8caf1d73fc485eb2a92532bd41ce8efd9539fc
--- /dev/null
+++ b/src/checkers/base.py
@@ -0,0 +1,193 @@
+"""
+Base checker class for paper submission quality checks.
+
+All specific checkers inherit from BaseChecker and implement
+the check() method to validate specific aspects of the TeX document.
+"""
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from enum import Enum
+from typing import List, Optional, Tuple
+
+
+class CheckSeverity(Enum):
+ """Severity levels for check results."""
+ ERROR = "error" # Must fix before submission
+ WARNING = "warning" # Strongly recommended to fix
+ INFO = "info" # Suggestion or best practice
+
+
+@dataclass
+class CheckResult:
+ """Result of a single check."""
+ checker_name: str
+ passed: bool
+ severity: CheckSeverity
+ message: str
+ line_number: Optional[int] = None
+ line_content: Optional[str] = None
+ suggestion: Optional[str] = None
+ file_path: Optional[str] = None
+
+ def to_dict(self) -> dict:
+ return {
+ 'checker': self.checker_name,
+ 'passed': self.passed,
+ 'severity': self.severity.value,
+ 'message': self.message,
+ 'line': self.line_number,
+ 'content': self.line_content,
+ 'suggestion': self.suggestion,
+ 'file_path': self.file_path
+ }
+
+
+class BaseChecker(ABC):
+ """
+ Abstract base class for all paper submission checkers.
+
+ Each checker validates a specific aspect of the paper,
+ such as caption placement, reference integrity, or formatting.
+ """
+
+ # Checker metadata - override in subclasses
+ name: str = "base"
+ display_name: str = "Base Checker"
+ description: str = "Base checker class"
+
+ @abstractmethod
+ def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
+ """
+ Run the check on the given TeX content.
+
+ Args:
+ tex_content: The full content of the TeX file
+ config: Optional configuration dict (e.g., conference-specific settings)
+
+ Returns:
+ List of CheckResult objects describing found issues
+ """
+ pass
+
+ def _remove_comments(self, content: str) -> str:
+ """
+ Remove all LaTeX comments from content.
+
+ Preserves line structure (replaces comment with empty string on same line).
+ Handles escaped percent signs (\\%) correctly.
+ """
+ lines = content.split('\n')
+ result = []
+
+ for line in lines:
+ # Find first unescaped %
+ cleaned = self._remove_line_comment(line)
+ result.append(cleaned)
+
+ return '\n'.join(result)
+
+ def _remove_line_comment(self, line: str) -> str:
+ """Remove comment from a single line, preserving content before %."""
+ i = 0
+ while i < len(line):
+ if line[i] == '%':
+ # Check if escaped
+ num_backslashes = 0
+ j = i - 1
+ while j >= 0 and line[j] == '\\':
+ num_backslashes += 1
+ j -= 1
+ if num_backslashes % 2 == 0:
+ # Not escaped, this is a comment start
+ return line[:i]
+ i += 1
+ return line
+
+ def _is_comment_line(self, line: str) -> bool:
+ """Check if a line is entirely a comment (starts with %)."""
+ stripped = line.lstrip()
+ if not stripped:
+ return False
+ return stripped[0] == '%'
+
+ def _get_non_comment_lines(self, content: str) -> List[Tuple[int, str]]:
+ """
+ Get all non-comment lines with their line numbers.
+
+ Returns:
+ List of (line_number, line_content) tuples for non-comment lines.
+ Line content has inline comments removed.
+ """
+ lines = content.split('\n')
+ result = []
+
+ for line_num, line in enumerate(lines, 1):
+ # Skip pure comment lines
+ if self._is_comment_line(line):
+ continue
+
+ # Remove inline comments
+ cleaned = self._remove_line_comment(line)
+
+ # Skip if nothing left after removing comment
+ if not cleaned.strip():
+ continue
+
+ result.append((line_num, cleaned))
+
+ return result
+
+ def _find_line_number(self, content: str, position: int) -> int:
+ """Find line number for a character position in content."""
+ return content[:position].count('\n') + 1
+
+ def _get_line_content(self, content: str, line_number: int) -> str:
+ """Get the content of a specific line."""
+ lines = content.split('\n')
+ if 1 <= line_number <= len(lines):
+ return lines[line_number - 1].strip()
+ return ""
+
+ def _is_commented(self, content: str, position: int) -> bool:
+ """Check if a position is within a LaTeX comment."""
+ # Find the start of the current line
+ line_start = content.rfind('\n', 0, position) + 1
+ line_before = content[line_start:position]
+
+ # Check for unescaped % before this position on the same line
+ i = 0
+ while i < len(line_before):
+ if line_before[i] == '%':
+ # Check if escaped
+ num_backslashes = 0
+ j = i - 1
+ while j >= 0 and line_before[j] == '\\':
+ num_backslashes += 1
+ j -= 1
+ if num_backslashes % 2 == 0:
+ # Not escaped, this is a comment
+ return True
+ i += 1
+ return False
+
+ def _create_result(
+ self,
+ passed: bool,
+ severity: CheckSeverity,
+ message: str,
+ line_number: Optional[int] = None,
+ line_content: Optional[str] = None,
+ suggestion: Optional[str] = None
+ ) -> CheckResult:
+ """Helper to create a CheckResult with this checker's name."""
+ return CheckResult(
+ checker_name=self.name,
+ passed=passed,
+ severity=severity,
+ message=message,
+ line_number=line_number,
+ line_content=line_content,
+ suggestion=suggestion
+ )
+
diff --git a/src/checkers/caption_checker.py b/src/checkers/caption_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..9238c27db234afbc844a44976cf43d8f9834d49b
--- /dev/null
+++ b/src/checkers/caption_checker.py
@@ -0,0 +1,136 @@
"""
Caption placement checker.

Validates that:
- Table captions appear ABOVE the table content
- Figure captions appear BELOW the figure content
"""
import re
from typing import List, Optional

from .base import BaseChecker, CheckResult, CheckSeverity


class CaptionChecker(BaseChecker):
    """Check for correct caption placement in tables and figures."""

    name = "caption"
    display_name = "Caption Placement"
    description = "Verify table captions are above and figure captions are below"

    # Patterns for environments (group 1 captures the environment body)
    TABLE_ENV_PATTERN = re.compile(
        r'\\begin\{table\*?\}(.*?)\\end\{table\*?\}',
        re.DOTALL | re.IGNORECASE
    )
    FIGURE_ENV_PATTERN = re.compile(
        r'\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}',
        re.DOTALL | re.IGNORECASE
    )

    # Content patterns
    CAPTION_PATTERN = re.compile(r'\\caption\s*[\[{]')
    TABULAR_PATTERN = re.compile(r'\\begin\{tabular')
    INCLUDEGRAPHICS_PATTERN = re.compile(r'\\includegraphics')
    TIKZ_PATTERN = re.compile(r'\\begin\{tikzpicture\}')

    def check(self, tex_content: str, config: Optional[dict] = None) -> List[CheckResult]:
        """Scan all table/figure environments and flag misplaced captions."""
        results = []

        # Check table environments
        for match in self.TABLE_ENV_PATTERN.finditer(tex_content):
            # Skip if the environment itself is commented out
            if self._is_commented(tex_content, match.start()):
                continue
            result = self._check_table_caption(
                match.group(1), tex_content, match.start(), match.start(1))
            if result:
                results.append(result)

        # Check figure environments
        for match in self.FIGURE_ENV_PATTERN.finditer(tex_content):
            if self._is_commented(tex_content, match.start()):
                continue
            result = self._check_figure_caption(
                match.group(1), tex_content, match.start(), match.start(1))
            if result:
                results.append(result)

        return results

    def _check_table_caption(self, env_content: str, full_content: str,
                             env_start: int, content_start: int) -> Optional[CheckResult]:
        """Check that the table caption is above the tabular content.

        Args:
            env_content: text captured inside \\begin{table}...\\end{table}
            full_content: whole document, used for line-number lookup
            env_start: absolute offset of \\begin{table}
            content_start: absolute offset where env_content begins

        Returns:
            A CheckResult describing the problem, or None if the table is fine.
        """
        caption_match = self.CAPTION_PATTERN.search(env_content)
        tabular_match = self.TABULAR_PATTERN.search(env_content)

        if not caption_match:
            line_num = self._find_line_number(full_content, env_start)
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Table environment missing caption",
                line_number=line_num,
                suggestion="Add \\caption{} before \\begin{tabular}"
            )

        if not tabular_match:
            # Table without tabular content - skip
            return None

        # Caption should come BEFORE tabular
        if caption_match.start() > tabular_match.start():
            # BUG FIX: offsets inside env_content are relative to the captured
            # group, which begins AFTER "\begin{table}"; anchoring on env_start
            # undercounted by the header length. Anchor on content_start.
            line_num = self._find_line_number(
                full_content, content_start + caption_match.start())
            return self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message="Table caption should be placed ABOVE the table content",
                line_number=line_num,
                line_content=self._get_line_content(full_content, line_num),
                suggestion="Move \\caption{} before \\begin{tabular}"
            )

        return None

    def _check_figure_caption(self, env_content: str, full_content: str,
                              env_start: int, content_start: int) -> Optional[CheckResult]:
        """Check that the figure caption is below the image content.

        Args mirror _check_table_caption. Returns None when the figure is fine
        or contains no recognizable graphics/tikz content.
        """
        caption_match = self.CAPTION_PATTERN.search(env_content)
        graphics_match = self.INCLUDEGRAPHICS_PATTERN.search(env_content)
        tikz_match = self.TIKZ_PATTERN.search(env_content)

        # Find the actual content (either graphics or tikz)
        content_match = graphics_match or tikz_match

        if not caption_match:
            line_num = self._find_line_number(full_content, env_start)
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Figure environment missing caption",
                line_number=line_num,
                suggestion="Add \\caption{} after \\includegraphics"
            )

        if not content_match:
            # Figure without graphics/tikz - could be custom content, skip
            return None

        # Caption should come AFTER content
        if caption_match.start() < content_match.start():
            # Same offset fix as in _check_table_caption
            line_num = self._find_line_number(
                full_content, content_start + caption_match.start())
            return self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message="Figure caption should be placed BELOW the figure content",
                line_number=line_num,
                line_content=self._get_line_content(full_content, line_num),
                suggestion="Move \\caption{} after \\includegraphics"
            )

        return None
diff --git a/src/checkers/citation_quality_checker.py b/src/checkers/citation_quality_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..de29ca2900bfd242adf368f33c1026595c10eb8f
--- /dev/null
+++ b/src/checkers/citation_quality_checker.py
@@ -0,0 +1,131 @@
"""
Citation quality checker.

Validates:
- Old citations (>30 years) that might need updating
- Citation formatting patterns (et al., hardcoded citations, etc.)
"""
import re
from typing import List, Dict, Optional
from datetime import datetime
from collections import defaultdict

from .base import BaseChecker, CheckResult, CheckSeverity


class CitationQualityChecker(BaseChecker):
    """Check citation quality and balance."""

    name = "citation_quality"
    display_name = "Citation Quality"
    description = "Check citation age, balance, and formatting"

    # Thresholds
    OLD_CITATION_YEARS = 30  # Citations older than this get flagged

    # Captured once at import time; acceptable for a process-lifetime tool
    CURRENT_YEAR = datetime.now().year

    def check(self, tex_content: str, config: Optional[dict] = None) -> List[CheckResult]:
        """Run citation-age and citation-formatting heuristics on the TeX source."""
        results = []

        # This checker works best with bib content, but we can do some analysis
        # on the tex file alone by looking at citation patterns
        results.extend(self._check_old_citations_in_text(tex_content))
        results.extend(self._check_citation_formatting(tex_content))

        return results

    def _check_old_citations_in_text(self, content: str) -> List[CheckResult]:
        """Look for citations with old years visible in text."""
        results = []
        lines = content.split('\n')

        # Pattern for citations with year, like "Smith et al. (2010)" or "(Smith, 2010)"
        year_pattern = re.compile(
            r'(?:\([^)]*(?:19[89]\d|20[01]\d)[^)]*\)|'  # Parenthetical
            r'\b(?:19[89]\d|20[01]\d)\b)',              # Standalone year
            re.IGNORECASE
        )

        # Report each old year at most once per document
        old_years_found = set()

        for line_num, line in enumerate(lines, 1):
            # Skip comments using base class method
            if self._is_comment_line(line):
                continue
            # BUG FIX: also strip inline comments so years mentioned after an
            # unescaped % are not flagged as citations
            cleaned = self._remove_line_comment(line)

            for match in year_pattern.finditer(cleaned):
                year_str = re.search(r'(19[89]\d|20[01]\d)', match.group())
                if year_str:
                    year = int(year_str.group())
                    age = self.CURRENT_YEAR - year

                    if age >= self.OLD_CITATION_YEARS and year not in old_years_found:
                        old_years_found.add(year)
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.INFO,
                            message=f"Citation from {year} ({age} years old)",
                            line_number=line_num,
                            suggestion="Consider if there's more recent work on this topic"
                        ))

        return results

    def _check_citation_formatting(self, content: str) -> List[CheckResult]:
        """Check for common citation formatting issues."""
        results = []
        lines = content.split('\n')

        for line_num, line in enumerate(lines, 1):
            # Use the shared comment helpers for consistency with other checkers
            if self._is_comment_line(line):
                continue
            cleaned = self._remove_line_comment(line)

            # Check for "et al" without period
            if re.search(r'\bet al\b(?!\.)', cleaned):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="'et al' should be 'et al.'",
                    line_number=line_num,
                    suggestion="Add period after 'et al.'"
                ))

            # Check for "[1]" style citations (might want natbib style)
            # Skip if it's a command definition or argument
            if re.search(r'\[\d+\]', cleaned):
                # Skip if in command definition (skips remaining checks for
                # this line, matching the original control flow)
                if '\\newcommand' in cleaned or '\\renewcommand' in cleaned or '\\def' in cleaned:
                    continue
                # Skip if it's clearly a command argument like [1] in \newcommand{\foo}[1]
                if re.search(r'\\[a-zA-Z]+\[\d+\]', cleaned):
                    continue
                # Only flag if it looks like actual citation in text
                if '\\cite' not in cleaned and not re.search(r'\\[a-zA-Z]+\{', cleaned[:20]):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message="Numeric citation style detected",
                        line_number=line_num,
                        suggestion="Consider author-year style for better readability"
                    ))

            # Check for hardcoded citations instead of \cite
            if re.search(r'\([A-Z][a-z]+(?:\s+et\s+al\.?)?,?\s*\d{4}\)', cleaned):
                if '\\cite' not in cleaned:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Appears to be hardcoded citation instead of \\cite",
                        line_number=line_num,
                        line_content=line.strip()[:80],
                        suggestion="Use \\cite{} for proper bibliography management"
                    ))

        return results
diff --git a/src/checkers/consistency_checker.py b/src/checkers/consistency_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..14849b9b71ed715864b33b0fcda2edfd0f99935d
--- /dev/null
+++ b/src/checkers/consistency_checker.py
@@ -0,0 +1,254 @@
"""
Terminology consistency checker.

Validates:
- Consistent spelling of the same term
- Consistent hyphenation
- Consistent capitalization of technical terms
"""
import re
from typing import List, Dict, Set, Optional
from collections import defaultdict

from .base import BaseChecker, CheckResult, CheckSeverity


class ConsistencyChecker(BaseChecker):
    """Check terminology and spelling consistency."""

    name = "consistency"
    display_name = "Consistency"
    description = "Check for inconsistent terminology and spelling"

    # Known variant pairs (canonical -> variants)
    KNOWN_VARIANTS = {
        # Hyphenation variants
        'self-supervised': ['self supervised', 'selfsupervised'],
        'pre-trained': ['pre trained', 'pretrained'],
        'fine-tuned': ['fine tuned', 'finetuned'],
        'state-of-the-art': ['state of the art', 'stateoftheart'],
        'real-world': ['real world', 'realworld'],
        'end-to-end': ['end to end', 'endtoend', 'e2e'],
        'large-scale': ['large scale', 'largescale'],
        'long-term': ['long term', 'longterm'],
        'short-term': ['short term', 'shortterm'],
        'multi-task': ['multi task', 'multitask'],
        'multi-modal': ['multi modal', 'multimodal'],
        'cross-lingual': ['cross lingual', 'crosslingual'],
        'zero-shot': ['zero shot', 'zeroshot'],
        'few-shot': ['few shot', 'fewshot'],
        'in-context': ['in context', 'incontext'],

        # American vs British English (comprehensive list)
        # -or/-our endings
        'color': ['colour'],
        'behavior': ['behaviour'],
        'favor': ['favour'],
        'honor': ['honour'],
        'labor': ['labour'],
        'neighbor': ['neighbour'],
        'rumor': ['rumour'],
        'vapor': ['vapour'],

        # -ize/-ise endings
        'analyze': ['analyse'],
        'characterize': ['characterise'],
        'generalize': ['generalise'],
        'initialize': ['initialise'],
        'maximize': ['maximise'],
        'minimize': ['minimise'],
        'normalize': ['normalise'],
        'optimize': ['optimise'],
        'organize': ['organise'],
        'realize': ['realise'],
        'recognize': ['recognise'],
        'specialize': ['specialise'],
        'standardize': ['standardise'],
        'summarize': ['summarise'],
        'utilize': ['utilise'],
        'visualize': ['visualise'],
        'categorize': ['categorise'],
        'emphasize': ['emphasise'],
        'hypothesize': ['hypothesise'],
        'prioritize': ['prioritise'],
        'synchronize': ['synchronise'],

        # -ization/-isation endings
        'generalization': ['generalisation'],
        'initialization': ['initialisation'],
        'maximization': ['maximisation'],
        'minimization': ['minimisation'],
        'normalization': ['normalisation'],
        'optimization': ['optimisation'],
        'organization': ['organisation'],
        'realization': ['realisation'],
        'regularization': ['regularisation'],
        'specialization': ['specialisation'],
        'standardization': ['standardisation'],
        'summarization': ['summarisation'],
        'utilization': ['utilisation'],
        'visualization': ['visualisation'],
        'categorization': ['categorisation'],
        'characterization': ['characterisation'],
        'parametrization': ['parametrisation'],
        'quantization': ['quantisation'],

        # -er/-re endings
        'center': ['centre'],
        'fiber': ['fibre'],
        'meter': ['metre'],
        'liter': ['litre'],

        # -l-/-ll- (American single, British double)
        'modeling': ['modelling'],
        'labeled': ['labelled'],
        'labeling': ['labelling'],
        'traveled': ['travelled'],
        'traveling': ['travelling'],
        'canceled': ['cancelled'],
        'canceling': ['cancelling'],
        'signaled': ['signalled'],
        'signaling': ['signalling'],

        # -og/-ogue endings
        'analog': ['analogue'],
        'catalog': ['catalogue'],
        'dialog': ['dialogue'],

        # -ense/-ence endings
        'defense': ['defence'],
        'license': ['licence'],
        'offense': ['offence'],

        # Other common differences
        'gray': ['grey'],
        'artifact': ['artefact'],
        'program': ['programme'],  # Note: 'program' is standard in computing
        'skeptical': ['sceptical'],
        'aluminum': ['aluminium'],

        # Verb forms
        'learned': ['learnt'],
        'burned': ['burnt'],
        'spelled': ['spelt'],

        # Common term variants
        'dataset': ['data set', 'data-set'],
        'benchmark': ['bench mark', 'bench-mark'],
        'baseline': ['base line', 'base-line'],
        'downstream': ['down stream', 'down-stream'],
        'upstream': ['up stream', 'up-stream'],
        'encoder': ['en-coder'],
        'decoder': ['de-coder'],
    }

    # Capitalization variants to track
    CAPITALIZATION_TERMS = [
        'transformer', 'attention', 'bert', 'gpt', 'lstm', 'cnn', 'rnn',
        'encoder', 'decoder', 'embedding', 'softmax', 'sigmoid', 'relu',
    ]

    def check(self, tex_content: str, config: Optional[dict] = None) -> List[CheckResult]:
        """Run spelling-, hyphenation- and capitalization-consistency checks.

        NOTE(review): the middle of this method was damaged in the patch this
        file arrived in; the comment-stripping and variant-collection logic
        below is reconstructed from the surviving fragments — confirm against
        the original source.
        """
        results = []

        # Remove comments (everything after an unescaped %)
        content = re.sub(r'(?<!\\)%.*', '', tex_content)

        # Check known variant pairs: flag when more than one form of the
        # same term appears in the document
        for canonical, variants in self.KNOWN_VARIANTS.items():
            found_forms = []
            if re.search(rf'\b{re.escape(canonical)}\b', content, re.IGNORECASE):
                found_forms.append(canonical)
            for variant in variants:
                if re.search(rf'\b{re.escape(variant)}\b', content, re.IGNORECASE):
                    found_forms.append(variant)

            if len(found_forms) > 1:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Inconsistent spelling: {', '.join(found_forms)}",
                    suggestion=f"Use '{canonical}' consistently throughout"
                ))

        # Check hyphenated word consistency
        results.extend(self._check_hyphenation_consistency(content))

        # Check capitalization consistency
        results.extend(self._check_capitalization_consistency(content))

        return results

    def _check_hyphenation_consistency(self, content: str) -> List[CheckResult]:
        """Find words that appear both hyphenated and non-hyphenated."""
        results = []

        # Common terms that should always be hyphenated (exceptions)
        ALWAYS_HYPHENATED = {
            'state-of-the-art', 'end-to-end', 'real-time', 'real-world',
            'fine-tuning', 'fine-grained', 'large-scale', 'small-scale',
            'multi-task', 'multi-modal', 'cross-domain', 'cross-lingual',
            'self-supervised', 'self-attention', 'co-training', 'pre-training',
            'post-processing', 'pre-processing', 'well-known', 'well-defined',
            'high-quality', 'low-quality', 'long-term', 'short-term'
        }

        # Find all hyphenated words
        hyphenated = set(re.findall(r'\b([a-z]+-[a-z]+(?:-[a-z]+)*)\b', content, re.IGNORECASE))

        for hyph_word in hyphenated:
            # Skip if it's a known compound that should always be hyphenated
            if hyph_word.lower() in ALWAYS_HYPHENATED:
                continue

            # Create non-hyphenated versions (spaced and fused)
            non_hyph = hyph_word.replace('-', ' ')
            combined = hyph_word.replace('-', '')

            # Check if a non-hyphenated version also appears
            if re.search(rf'\b{re.escape(non_hyph)}\b', content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{non_hyph}'",
                    suggestion="Choose one form and use it consistently"
                ))
            elif re.search(rf'\b{re.escape(combined)}\b', content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{combined}'",
                    suggestion="Choose one form and use it consistently"
                ))

        return results

    def _check_capitalization_consistency(self, content: str) -> List[CheckResult]:
        """Check if technical terms have consistent capitalization."""
        results = []

        for term in self.CAPITALIZATION_TERMS:
            # Find all case variations of the term
            pattern = re.compile(rf'\b{term}\b', re.IGNORECASE)
            matches = pattern.findall(content)

            if len(matches) > 1:
                # Check if there are mixed capitalizations
                unique_forms = set(matches)
                if len(unique_forms) > 1:
                    forms_str = ', '.join(f"'{f}'" for f in unique_forms)
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Inconsistent capitalization: {forms_str}",
                        suggestion="Use consistent capitalization for technical terms"
                    ))

        return results
diff --git a/src/checkers/equation_checker.py b/src/checkers/equation_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..84e97e5901bebe56a8349ff499136802dc7f5735
--- /dev/null
+++ b/src/checkers/equation_checker.py
@@ -0,0 +1,134 @@
"""
Equation formatting checker.

Validates:
- Punctuation after equations (based on grammar)
- Equation numbering consistency
- Variable definitions
"""
import re
from typing import List, Set, Optional

from .base import BaseChecker, CheckResult, CheckSeverity


class EquationChecker(BaseChecker):
    """Check equation formatting and consistency."""

    name = "equation"
    display_name = "Equations"
    description = "Check equation formatting and punctuation"

    # Equation environments (starred forms are unnumbered)
    EQUATION_ENVS = [
        'equation', 'align', 'gather', 'multline', 'eqnarray',
        'equation*', 'align*', 'gather*', 'multline*', 'eqnarray*'
    ]

    def check(self, tex_content: str, config: Optional[dict] = None) -> List[CheckResult]:
        """Run punctuation, numbering and inline-delimiter checks."""
        results = []

        # Check equation punctuation
        results.extend(self._check_equation_punctuation(tex_content))

        # Check for numbered vs unnumbered consistency
        results.extend(self._check_numbering_consistency(tex_content))

        # Check inline math consistency ($...$ vs \(...\))
        results.extend(self._check_inline_math_consistency(tex_content))

        return results

    def _check_equation_punctuation(self, content: str) -> List[CheckResult]:
        """Check if equations end with appropriate punctuation."""
        results = []

        for env in self.EQUATION_ENVS:
            # re.escape handles the '*' in starred environment names
            pattern = re.compile(
                rf'\\begin\{{{re.escape(env)}\}}(.*?)\\end\{{{re.escape(env)}\}}',
                re.DOTALL
            )

            for match in pattern.finditer(content):
                eq_content = match.group(1).strip()

                # Look at what follows the equation
                after_pos = match.end()
                after_text = content[after_pos:after_pos + 50].strip()

                # Ignore trailing \label{...} when checking the final character
                eq_content_clean = re.sub(r'\\label\{[^}]+\}', '', eq_content).strip()

                if eq_content_clean and not re.search(r'[.,;]$', eq_content_clean):
                    # A lowercase continuation suggests the sentence goes on,
                    # so the equation likely needs trailing punctuation
                    if after_text and after_text[0].islower():
                        line_num = self._find_line_number(content, match.end())
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.INFO,
                            message="Equation may need punctuation (sentence continues after)",
                            line_number=line_num,
                            suggestion="Add comma or period inside equation if it ends a clause"
                        ))

        return results

    def _check_numbering_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed numbered and unnumbered equations."""
        results = []

        numbered = 0
        unnumbered = 0

        for env in self.EQUATION_ENVS:
            # BUG FIX: the environment name must be regex-escaped — an
            # unescaped 'align*' pattern matched '\begin{align}' (and never
            # the starred form). Also, only starred environments are
            # unnumbered; the old "or 'nonumber' in content" clause counted
            # EVERY environment as unnumbered as soon as a single \nonumber
            # appeared anywhere in the document.
            count = len(re.findall(rf'\\begin\{{{re.escape(env)}\}}', content))
            if env.endswith('*'):
                unnumbered += count
            else:
                numbered += count

        # Also count \nonumber and \notag usage
        unnumbered += len(re.findall(r'\\nonumber|\\notag', content))

        # If there's a significant mix, warn
        total = numbered + unnumbered
        if total > 3 and numbered > 0 and unnumbered > 0:
            ratio = min(numbered, unnumbered) / total
            if ratio > 0.2:  # More than 20% in minority
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Mixed equation numbering: {numbered} numbered, {unnumbered} unnumbered",
                    suggestion="Consider consistent numbering strategy"
                ))

        return results

    def _check_inline_math_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed inline math delimiters.

        NOTE(review): the counting logic was damaged in the patch this file
        arrived in; reconstructed from the surviving fragments — confirm.
        """
        results = []

        # Count unescaped single $ delimiters (pairs of them bound one
        # inline formula), and \( openers for the LaTeX-style delimiter
        dollar_count = len(re.findall(r'(?<!\\)\$(?!\$)', content)) // 2
        paren_count = len(re.findall(r'\\\(', content))

        if dollar_count > 0 and paren_count > 0:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed inline math: {dollar_count} $...$ and {paren_count} \\(...\\)",
                suggestion="Use consistent inline math delimiters throughout"
            ))

        return results
diff --git a/src/checkers/formatting_checker.py b/src/checkers/formatting_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffcc3e9df645e870322ab1879e6a70bcb1c4ae1b
--- /dev/null
+++ b/src/checkers/formatting_checker.py
@@ -0,0 +1,204 @@
"""
Formatting checker.

Validates common LaTeX formatting issues:
- Citation formatting consistency
- Non-breaking spaces before citations
- Special character escaping
- Whitespace issues
"""
import re
from typing import List, Optional, Tuple

from .base import BaseChecker, CheckResult, CheckSeverity


class FormattingChecker(BaseChecker):
    """Check for common LaTeX formatting issues."""

    name = "formatting"
    display_name = "Formatting"
    description = "Check citation style, spacing, and special characters"

    # Citation commands
    CITE_COMMANDS = ['cite', 'citep', 'citet', 'citealt', 'citealp',
                     'citeauthor', 'citeyear', 'autocite', 'textcite',
                     'parencite', 'footcite']

    # Pattern for citations without non-breaking space
    # Matches: "word \cite" but not "word~\cite"
    CITE_NO_NBSP_PATTERN = re.compile(r'(\w)\s+(\\cite\w*\{)')

    # NOTE(review): the two patterns below were damaged in the patch this
    # file arrived in and are reconstructed from how check() uses them —
    # confirm against the original source.
    # Pattern for multiple consecutive spaces after non-space text
    MULTI_SPACE_PATTERN = re.compile(r'(?<=\S)  +')
    # Runs of 3+ blank lines (4+ consecutive newlines, optional whitespace)
    MULTI_BLANK_PATTERN = re.compile(r'\n[ \t]*(?:\n[ \t]*){3,}')

    def check(self, tex_content: str, config: Optional[dict] = None) -> List[CheckResult]:
        """Run citation-spacing, blank-line and special-character checks."""
        results = []
        lines = tex_content.split('\n')

        # Track citation style consistency
        cite_styles = {'parenthetical': 0, 'textual': 0, 'plain': 0}

        for line_num, line in enumerate(lines, 1):
            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method
            line_content = self._remove_line_comment(line)

            # Check citation non-breaking space
            for match in self.CITE_NO_NBSP_PATTERN.finditer(line_content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Citation without non-breaking space",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
                ))

            # Track citation styles
            for cmd in self.CITE_COMMANDS:
                if re.search(rf'\\{cmd}\b', line_content):
                    if cmd in ['citep', 'parencite', 'autocite']:
                        cite_styles['parenthetical'] += 1
                    elif cmd in ['citet', 'textcite']:
                        cite_styles['textual'] += 1
                    elif cmd == 'cite':
                        cite_styles['plain'] += 1

        # Check citation style consistency
        styles_used = [s for s, count in cite_styles.items() if count > 0]
        if len(styles_used) > 1:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed citation styles detected: {', '.join(styles_used)}",
                suggestion="Consider using consistent citation style throughout"
            ))

        # Check for multiple blank lines (3 or more)
        for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
            line_num = self._find_line_number(tex_content, match.start())
            # Count how many blank lines
            blank_count = match.group(0).count('\n') - 1

            # Get context: the line before, blank lines, and the line after
            start_pos = match.start()
            end_pos = match.end()

            # Find the line before the blank lines
            prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
            prev_line = tex_content[prev_line_start:start_pos].rstrip()

            # Find the line after the blank lines
            next_line_end = tex_content.find('\n', end_pos)
            if next_line_end == -1:
                next_line_end = len(tex_content)
            next_line = tex_content[end_pos:next_line_end].rstrip()

            # Create visual representation with warning markers
            blank_lines = '\n'.join(["> blank line ⚠️"] * blank_count)
            line_content = f"{prev_line}\n{blank_lines}\n{next_line}"

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
                line_number=line_num,
                line_content=line_content,
                suggestion="Reduce to single blank line or use \\vspace"
            ))

        # Check for common issues with special characters
        results.extend(self._check_special_chars(tex_content, lines))

        return results

    def _check_special_chars(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check for unescaped special characters (currently: bare '&').

        NOTE(review): the body of the '&' check was damaged in the patch;
        reconstructed from the helper methods below — confirm.
        """
        results = []

        # Find math environments to skip
        math_regions = self._find_math_regions(content)

        for line_num, line in enumerate(lines, 1):
            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method
            line_content = self._remove_line_comment(line)

            # Absolute offset of this line within the full content
            line_start = sum(len(l) + 1 for l in lines[:line_num - 1])

            # Check for unescaped & (common error)
            for match in re.finditer(r'(?<!\\)&', line_content):
                pos = line_start + match.start()
                # & is legitimate in math and in alignment/table environments
                if self._in_math_region(pos, math_regions):
                    continue
                if self._in_environment(content, pos,
                                        ['tabular', 'array', 'align', 'aligned',
                                         'cases', 'matrix', 'pmatrix', 'bmatrix',
                                         'eqnarray']):
                    continue
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="Unescaped & character in text",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Escape as \\& unless inside a table or alignment environment"
                ))

        return results

    def _find_math_regions(self, content: str) -> List[tuple]:
        """Find (start, end) spans that are inside math mode.

        NOTE(review): reconstructed — the original span was damaged in the
        patch; confirm the exact set of delimiters against the original.
        """
        regions = []

        # Inline math $ ... $
        for match in re.finditer(r'(?<!\\)\$[^$]*\$', content):
            regions.append((match.start(), match.end()))

        # Display math \[ ... \]
        for match in re.finditer(r'\\\[.*?\\\]', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # Math environments
        for env in ('equation', 'align', 'gather', 'multline', 'eqnarray'):
            pattern = rf'\\begin\{{{env}\*?\}}.*?\\end\{{{env}\*?\}}'
            for match in re.finditer(pattern, content, re.DOTALL):
                regions.append((match.start(), match.end()))

        return regions

    def _in_math_region(self, pos: int, regions: List[tuple]) -> bool:
        """Check if position is inside a math region."""
        return any(start <= pos <= end for start, end in regions)

    def _in_environment(self, content: str, pos: int, env_names: List[str]) -> bool:
        """Check if position is inside any of the given environments."""
        for env in env_names:
            # Find all instances of this environment
            pattern = rf'\\begin\{{{env}\*?\}}(.*?)\\end\{{{env}\*?\}}'
            for match in re.finditer(pattern, content, re.DOTALL):
                if match.start() <= pos <= match.end():
                    return True
        return False
diff --git a/src/checkers/number_checker.py b/src/checkers/number_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..d64ba9f7321d6787175d1969257c62e3d0536320
--- /dev/null
+++ b/src/checkers/number_checker.py
@@ -0,0 +1,88 @@
+"""
+Number and unit formatting checker.
+
+Validates:
+- Percentage format consistency (no space before %, consistent use of % vs 'percent')
+"""
+import re
+from typing import List
+
+from .base import BaseChecker, CheckResult, CheckSeverity
+
+
+class NumberChecker(BaseChecker):
+ """Check percentage formatting."""
+
+ name = "number"
+ display_name = "Numbers & Units"
+ description = "Check percentage formatting"
+
+ # Percentage patterns
+ PERCENT_WITH_SPACE = re.compile(r'\d\s+%') # "50 %" is wrong
+
+ # Inconsistent percentage usage
+ PERCENT_WORD = re.compile(r'\d+\s+percent\b', re.IGNORECASE)
+ PERCENT_SYMBOL = re.compile(r'\d+%')
+
+ def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
+ results = []
+ lines = tex_content.split('\n')
+
+ # Track percentage style for consistency check
+ uses_symbol = False
+ uses_word = False
+
+ for line_num, line in enumerate(lines, 1):
+ # Skip comments using base class method
+ if self._is_comment_line(line):
+ continue
+
+ # Skip lines that are likely in math/tables
+ if self._in_special_context(line):
+ continue
+
+ # Skip lines that look like math formulas (contain common math commands)
+ if re.search(r'\\(frac|sum|prod|int|partial|nabla|approx|neq|leq|geq|log|ln|exp|sin|cos|tan|alpha|beta|gamma|delta|theta|sigma|omega|left|right)', line):
+ continue
+
+ line_content = re.sub(r'(? bool:
+ """Check if line is in a context where number rules don't apply."""
+ special_patterns = [
+ r'\\begin\{(tabular|array|equation|align|gather)',
+ r'\\includegraphics',
+ r'\\caption',
+ r'\\label',
+ r'\\ref',
+ r'^\s*&', # Table cell
+ r'\$.*\$', # Inline math
+ ]
+ return any(re.search(p, line) for p in special_patterns)
diff --git a/src/checkers/reference_checker.py b/src/checkers/reference_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e8bf234c5f11d679aa2ca40995631235dfe25cb
--- /dev/null
+++ b/src/checkers/reference_checker.py
@@ -0,0 +1,196 @@
+"""
+Cross-reference checker.
+
+Validates that:
+- All figures and tables are referenced in text
+- All labels have corresponding references
+- Appendix sections are referenced in main text
+"""
+import re
+from typing import List, Set, Tuple
+
+from .base import BaseChecker, CheckResult, CheckSeverity
+
+
class ReferenceChecker(BaseChecker):
    """Check cross-reference integrity in the document.

    Reports three kinds of problems:
    - labels that are never referenced anywhere,
    - references whose target label does not exist,
    - appendix sections that the main text never points the reader to.
    """

    name = "reference"
    display_name = "Cross-References"
    description = "Verify all figures, tables, and sections are properly referenced"

    # Label pattern: \label{prefix:name}
    LABEL_PATTERN = re.compile(r'\\label\{([^}]+)\}')

    # All reference commands we recognize; group(1) is the key list.
    REF_PATTERNS = [
        re.compile(r'\\ref\{([^}]+)\}'),
        re.compile(r'\\autoref\{([^}]+)\}'),
        re.compile(r'\\cref\{([^}]+)\}'),
        re.compile(r'\\Cref\{([^}]+)\}'),
        re.compile(r'\\eqref\{([^}]+)\}'),
        re.compile(r'\\pageref\{([^}]+)\}'),
        re.compile(r'\\nameref\{([^}]+)\}'),
    ]

    # Appendix detection
    APPENDIX_START_PATTERN = re.compile(r'\\appendix\b|\\begin\{appendix\}')
    SECTION_PATTERN = re.compile(r'\\section\*?\{([^}]+)\}')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all cross-reference checks and return the collected results.

        Args:
            tex_content: Full LaTeX source to inspect.
            config: Unused; present for checker-interface uniformity.
        """
        results = []

        labels = self._extract_labels(tex_content)
        references = self._extract_references(tex_content)

        # Labels that nothing ever references.
        for label, (line_num, line_content) in labels.items():
            if label not in references:
                # Severity depends on the label type (figures/tables matter more).
                severity = self._get_severity_for_label(label)
                label_type = self._get_label_type(label)

                results.append(self._create_result(
                    passed=False,
                    severity=severity,
                    message=f"Unreferenced {label_type}: '{label}'",
                    line_number=line_num,
                    line_content=line_content,
                    suggestion=f"Add \\ref{{{label}}} or \\autoref{{{label}}} where appropriate"
                ))

        # References that point at labels that do not exist.
        for ref, (line_num, line_content) in references.items():
            if ref not in labels:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.ERROR,
                    message=f"Reference to undefined label: '{ref}'",
                    line_number=line_num,
                    line_content=line_content,
                    suggestion=f"Add \\label{{{ref}}} to the target element or fix the reference"
                ))

        results.extend(self._check_appendix_references(tex_content, labels, references))

        return results

    def _extract_labels(self, content: str) -> dict:
        """Map each non-commented label to its (line number, line content)."""
        labels = {}
        for match in self.LABEL_PATTERN.finditer(content):
            if not self._is_commented(content, match.start()):
                label = match.group(1)
                line_num = self._find_line_number(content, match.start())
                line_content = self._get_line_content(content, line_num)
                labels[label] = (line_num, line_content)
        return labels

    def _extract_references(self, content: str) -> dict:
        """Map each referenced key to the (line number, line content) of its
        first occurrence.

        Comma-separated key lists (e.g. \\cref{fig:a,fig:b}) are split; keys
        that are macro parameters (#1) or appear inside \\newcommand-style
        definitions are skipped.
        """
        references = {}
        for pattern in self.REF_PATTERNS:
            for match in pattern.finditer(content):
                if not self._is_commented(content, match.start()):
                    refs_str = match.group(1)
                    for ref in refs_str.split(','):
                        ref = ref.strip()
                        if ref and ref not in references:
                            # Skip macro parameters like #1, #2.
                            if ref.startswith('#') and len(ref) == 2 and ref[1].isdigit():
                                continue

                            # Skip refs inside command definitions; they are
                            # templates, not real references.
                            line_num = self._find_line_number(content, match.start())
                            line_content = self._get_line_content(content, line_num)
                            if re.search(r'\\(new|renew|provide)command', line_content):
                                continue

                            references[ref] = (line_num, line_content)
        return references

    def _referenced_keys(self, content: str) -> Set[str]:
        """Return the exact set of keys referenced in *content*.

        Comma lists are split and commented-out references are ignored,
        mirroring _extract_references (without per-line bookkeeping).
        """
        keys: Set[str] = set()
        for pattern in self.REF_PATTERNS:
            for match in pattern.finditer(content):
                if self._is_commented(content, match.start()):
                    continue
                for ref in match.group(1).split(','):
                    ref = ref.strip()
                    if ref:
                        keys.add(ref)
        return keys

    def _get_label_type(self, label: str) -> str:
        """Classify a label by its conventional 'prefix:' (fig, tab, sec, ...)."""
        if ':' in label:
            prefix = label.split(':')[0].lower()
            type_map = {
                'fig': 'figure',
                'tab': 'table',
                'sec': 'section',
                'eq': 'equation',
                'alg': 'algorithm',
                'lst': 'listing',
                'app': 'appendix',
            }
            return type_map.get(prefix, 'label')
        return 'label'

    def _get_severity_for_label(self, label: str) -> CheckSeverity:
        """Severity for an unreferenced label, based on its type."""
        label_type = self._get_label_type(label)

        # Figures and tables should always be referenced.
        if label_type in ('figure', 'table'):
            return CheckSeverity.WARNING

        # Equations might not always need explicit reference.
        if label_type == 'equation':
            return CheckSeverity.INFO

        return CheckSeverity.INFO

    def _check_appendix_references(
        self,
        content: str,
        labels: dict,
        references: dict
    ) -> List[CheckResult]:
        """Check that appendix sections are referenced in the main text.

        Fix over the previous version: reference keys in the main text are
        collected once and matched EXACTLY. The old substring test
        (``label in m.group(1)``) wrongly treated e.g. 'sec:a' as referenced
        whenever 'sec:ab' was, and also counted commented-out references.
        """
        results = []

        # Find where appendix starts; nothing to do without one.
        appendix_match = self.APPENDIX_START_PATTERN.search(content)
        if not appendix_match:
            return results

        appendix_start = appendix_match.start()
        main_content = content[:appendix_start]
        appendix_content = content[appendix_start:]

        # Exact set of keys the main text references (built once, O(n)).
        main_refs = self._referenced_keys(main_content)

        # Walk labels defined inside the appendix.
        for match in self.LABEL_PATTERN.finditer(appendix_content):
            if self._is_commented(appendix_content, match.start()):
                continue

            label = match.group(1)

            # Heuristic: only section/appendix-style labels are expected to
            # be pointed at from the main text.
            if 'sec' in label.lower() or 'app' in label.lower():
                if label not in main_refs:
                    line_num = self._find_line_number(content, appendix_start + match.start())
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"Appendix section '{label}' is not referenced in main text",
                        line_number=line_num,
                        suggestion="Add a reference to this appendix section in the main text"
                    ))

        return results
diff --git a/src/checkers/sentence_checker.py b/src/checkers/sentence_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5dd952cd6e852ad08675237fce20d9c85cf2c32
--- /dev/null
+++ b/src/checkers/sentence_checker.py
@@ -0,0 +1,107 @@
+"""
+Sentence quality checker.
+
+Validates:
+- Weak sentence starters
+- Common writing issues
+"""
+import re
+from typing import List
+
+from .base import BaseChecker, CheckResult, CheckSeverity
+
+
class SentenceChecker(BaseChecker):
    """Flag weak, hedging, and wordy sentence patterns in LaTeX prose."""

    name = "sentence"
    display_name = "Sentence Quality"
    description = "Check weak patterns and writing issues"

    # (regex, advice) pairs for weak ways to open a sentence.
    WEAK_STARTERS = [
        (r'^There\s+(is|are|was|were|has been|have been)\s+',
         "Weak start with 'There is/are'"),
        (r'^It\s+(is|was|has been|should be noted)\s+',
         "Weak start with 'It is'"),
        (r'^This\s+(is|was|shows|demonstrates)\s+',
         "Vague 'This' without clear antecedent"),
        (r'^As\s+(mentioned|discussed|shown|noted)\s+(above|before|earlier|previously)',
         "Consider being more specific about what was mentioned"),
    ]

    # (regex, advice) pairs for weasel words and unsupported hedging.
    WEASEL_PATTERNS = [
        (r'\b(many|some|most|several)\s+(researchers?|studies|papers?|works?)\s+(have\s+)?(shown?|demonstrated?|suggested?|believe)',
         "Vague attribution - consider citing specific work"),
        (r'\b(obviously|clearly|of course|needless to say|it is well known)\b',
         "Unsupported assertion - consider citing or removing"),
        (r'\b(very|really|quite|extremely|highly)\s+(important|significant|good|effective)',
         "Consider more precise language"),
        (r'\bit\s+is\s+(important|crucial|essential|necessary)\s+to\s+note\s+that',
         "Wordy phrase - consider simplifying"),
    ]

    # (regex, advice) pairs for phrases with shorter equivalents.
    REDUNDANT_PATTERNS = [
        (r'\bin order to\b', "Use 'to' instead of 'in order to'"),
        (r'\bdue to the fact that\b', "Use 'because' instead"),
        (r'\bat this point in time\b', "Use 'now' or 'currently'"),
        (r'\bin the event that\b', "Use 'if' instead"),
        (r'\bdespite the fact that\b', "Use 'although' instead"),
        (r'\bfor the purpose of\b', "Use 'to' or 'for' instead"),
        (r'\bwith the exception of\b', "Use 'except' instead"),
        (r'\bin close proximity to\b', "Use 'near' instead"),
        (r'\ba large number of\b', "Use 'many' instead"),
        (r'\bthe vast majority of\b', "Use 'most' instead"),
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan each non-comment line for the three pattern families above."""
        findings = []

        for idx, raw_line in enumerate(tex_content.split('\n'), 1):
            # Fully-commented lines are invisible to the reader.
            if self._is_comment_line(raw_line):
                continue

            # Strip trailing inline comments before matching.
            text = self._remove_line_comment(raw_line)

            # Weak openers: report at most one per line.
            for starter_re, advice in self.WEAK_STARTERS:
                if re.search(starter_re, text, re.IGNORECASE):
                    findings.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=advice,
                        line_number=idx,
                        line_content=raw_line.strip()[:80]
                    ))
                    break

            # Hedging language: every distinct pattern hit is reported.
            for weasel_re, advice in self.WEASEL_PATTERNS:
                hit = re.search(weasel_re, text, re.IGNORECASE)
                if hit is None:
                    continue
                findings.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Hedging language: '{hit.group(0)[:30]}'",
                    line_number=idx,
                    suggestion=advice
                ))

            # Redundant phrasing: likewise, one result per matching pattern.
            for redundant_re, advice in self.REDUNDANT_PATTERNS:
                hit = re.search(redundant_re, text, re.IGNORECASE)
                if hit is None:
                    continue
                findings.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Redundant phrase: '{hit.group(0)}'",
                    line_number=idx,
                    suggestion=advice
                ))

        return findings
diff --git a/src/config/__init__.py b/src/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dc11c5c27190e3c255ffa26799a1e7e3d99eae0
--- /dev/null
+++ b/src/config/__init__.py
@@ -0,0 +1,4 @@
+"""Configuration module for BibGuard."""
+from .workflow import WorkflowConfig, WorkflowStep, DEFAULT_WORKFLOW, get_default_workflow
+
+__all__ = ['WorkflowConfig', 'WorkflowStep', 'DEFAULT_WORKFLOW', 'get_default_workflow']
diff --git a/src/config/__pycache__/__init__.cpython-313.pyc b/src/config/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f34a85f65061f1f6901383f2c3427098a0c19e75
Binary files /dev/null and b/src/config/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/config/__pycache__/workflow.cpython-313.pyc b/src/config/__pycache__/workflow.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b55e19e08f6821df1f5d3550a2332b5ce0226ca
Binary files /dev/null and b/src/config/__pycache__/workflow.cpython-313.pyc differ
diff --git a/src/config/__pycache__/yaml_config.cpython-313.pyc b/src/config/__pycache__/yaml_config.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..439b18be73390a7bb9137472d739bbc5a36de835
Binary files /dev/null and b/src/config/__pycache__/yaml_config.cpython-313.pyc differ
diff --git a/src/config/workflow.py b/src/config/workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..47d7f962df8873a977a76c69fc8b0b45b835ad4b
--- /dev/null
+++ b/src/config/workflow.py
@@ -0,0 +1,174 @@
+"""
+Workflow configuration for reference checking.
+
+Allows users to customize the order and enable/disable individual fetchers
+in the reference verification workflow.
+"""
+import json
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import List, Optional
+
+
@dataclass
class WorkflowStep:
    """A single step in the reference checking workflow."""
    name: str
    display_name: str
    description: str
    enabled: bool = True
    priority: int = 0
    search_type: str = 'by_title'  # one of: 'by_id', 'by_doi', 'by_title'

    def to_dict(self) -> dict:
        """Serialize this step to a plain dict (JSON-friendly)."""
        return {
            'name': self.name,
            'display_name': self.display_name,
            'description': self.description,
            'enabled': self.enabled,
            'priority': self.priority,
            'search_type': self.search_type,
        }

    @classmethod
    def from_dict(cls, data: dict) -> 'WorkflowStep':
        """Rebuild a step from a dict produced by to_dict()."""
        return cls(**data)
+
+
@dataclass
class WorkflowConfig:
    """Ordered, toggleable collection of reference-checking steps."""
    steps: List[WorkflowStep] = field(default_factory=list)
    name: str = "default"
    description: str = "Default workflow configuration"

    def get_enabled_steps(self) -> List[WorkflowStep]:
        """Return only the enabled steps, ordered by ascending priority."""
        active = [step for step in self.steps if step.enabled]
        active.sort(key=lambda step: step.priority)
        return active

    def move_step_up(self, index: int) -> bool:
        """Swap step *index* with its predecessor; False when at the top/invalid."""
        if not 0 < index < len(self.steps):
            return False
        self.steps[index - 1], self.steps[index] = self.steps[index], self.steps[index - 1]
        self._update_priorities()
        return True

    def move_step_down(self, index: int) -> bool:
        """Swap step *index* with its successor; False when at the bottom/invalid."""
        if not 0 <= index < len(self.steps) - 1:
            return False
        self.steps[index + 1], self.steps[index] = self.steps[index], self.steps[index + 1]
        self._update_priorities()
        return True

    def toggle_step(self, index: int) -> bool:
        """Flip the enabled flag of step *index*; False for an invalid index."""
        if index < 0 or index >= len(self.steps):
            return False
        target = self.steps[index]
        target.enabled = not target.enabled
        return True

    def _update_priorities(self):
        """Renumber priorities so they mirror the current list order."""
        for position, step in enumerate(self.steps):
            step.priority = position

    def to_dict(self) -> dict:
        """Serialize the workflow (and its steps) to a plain dict."""
        return {
            'name': self.name,
            'description': self.description,
            'steps': [step.to_dict() for step in self.steps]
        }

    @classmethod
    def from_dict(cls, data: dict) -> 'WorkflowConfig':
        """Rebuild a workflow from a dict produced by to_dict()."""
        parsed_steps = [WorkflowStep.from_dict(raw) for raw in data.get('steps', [])]
        return cls(
            steps=parsed_steps,
            name=data.get('name', 'custom'),
            description=data.get('description', '')
        )

    def save(self, filepath: str):
        """Write this workflow as indented JSON, creating parent dirs as needed."""
        target = Path(filepath)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(self.to_dict(), indent=2), encoding='utf-8')

    @classmethod
    def load(cls, filepath: str) -> 'WorkflowConfig':
        """Read a workflow previously written by save()."""
        payload = json.loads(Path(filepath).read_text(encoding='utf-8'))
        return cls.from_dict(payload)
+
+
+# Default workflow matching current implementation order
# Default workflow matching current implementation order.
# Priorities mirror list position; official/structured APIs come first and
# the scraping fallback (Google Scholar) last. Callers should take copies
# via get_default_workflow() rather than mutating this shared constant.
DEFAULT_WORKFLOW = WorkflowConfig(
    name="default",
    description="Default reference checking workflow prioritizing reliable APIs",
    steps=[
        WorkflowStep(
            name="arxiv_id",
            display_name="arXiv by ID",
            description="Look up paper by arXiv ID (highest priority for arXiv papers)",
            priority=0,
            search_type="by_id"
        ),
        WorkflowStep(
            name="crossref_doi",
            display_name="CrossRef by DOI",
            description="Look up paper by DOI (authoritative for DOIs)",
            priority=1,
            search_type="by_doi"
        ),
        WorkflowStep(
            name="semantic_scholar",
            display_name="Semantic Scholar",
            description="Official API with high quality metadata",
            priority=2,
            search_type="by_title"
        ),
        WorkflowStep(
            name="dblp",
            display_name="DBLP",
            description="Official API, especially good for CS publications",
            priority=3,
            search_type="by_title"
        ),
        WorkflowStep(
            name="openalex",
            display_name="OpenAlex",
            description="Official API with broad coverage",
            priority=4,
            search_type="by_title"
        ),
        WorkflowStep(
            name="arxiv_title",
            display_name="arXiv by Title",
            description="Search arXiv by title (fallback for non-ID lookups)",
            priority=5,
            search_type="by_title"
        ),
        WorkflowStep(
            name="crossref_title",
            display_name="CrossRef by Title",
            description="Search CrossRef by title",
            priority=6,
            search_type="by_title"
        ),
        WorkflowStep(
            name="google_scholar",
            display_name="Google Scholar",
            description="Web scraping fallback (may be rate-limited or blocked)",
            priority=7,
            search_type="by_title",
            enabled=True  # Still enabled but lowest priority
        ),
    ]
)
+
+
def get_default_workflow() -> WorkflowConfig:
    """Return an independent copy of DEFAULT_WORKFLOW.

    The copy is produced by a dict round-trip so callers may reorder or
    toggle steps without mutating the shared module-level default.
    """
    snapshot = DEFAULT_WORKFLOW.to_dict()
    return WorkflowConfig.from_dict(snapshot)
diff --git a/src/config/yaml_config.py b/src/config/yaml_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe34964e7917246d6487797c95f802c42a9bc1f1
--- /dev/null
+++ b/src/config/yaml_config.py
@@ -0,0 +1,300 @@
+"""
+YAML configuration loader for BibGuard.
+
+Loads configuration from YAML file and provides defaults.
+"""
+import yaml
+from pathlib import Path
+from dataclasses import dataclass, field
+from typing import Optional, List, Dict, Any
+
+
@dataclass
class FilesConfig:
    """File path configuration.

    Paths may be relative; BibGuardConfig.resolve_path resolves them
    against the directory containing the loaded config file.
    """
    bib: str = ""  # Path to the .bib bibliography file
    tex: str = ""  # Path to the main .tex file
    input_dir: str = ""  # Directory to recursive search for .tex and .bib files
    output_dir: str = "bibguard_output"  # Output directory for all generated files
+
+
@dataclass
class BibliographyConfig:
    """Bibliography check configuration.

    Each flag toggles one bibliography audit. All checks default to on
    except check_relevance (the LLM-backed relevance check).
    """
    check_metadata: bool = True
    check_usage: bool = True
    check_duplicates: bool = True
    check_preprint_ratio: bool = True
    # Presumably the preprint fraction above which a warning fires —
    # TODO(review): confirm against the checker that consumes this value.
    preprint_warning_threshold: float = 0.50
    check_relevance: bool = False
+
+
@dataclass
class SubmissionConfig:
    """Submission quality check configuration.

    Each boolean field enables the checker of the same name. The former
    implementation enumerated all eleven flags in a hand-written if-chain,
    which was duplication-prone (adding a field required touching two
    places); the list is now derived from a single canonical name tuple.
    """

    # Format checks
    caption: bool = True
    reference: bool = True
    formatting: bool = True
    equation: bool = True

    # Writing quality
    ai_artifacts: bool = True
    sentence: bool = True
    consistency: bool = True

    # Academic standards
    acronym: bool = True
    number: bool = True
    citation_quality: bool = True

    # Review compliance
    anonymization: bool = True

    # Canonical checker order. Each entry is simultaneously a field name and
    # a checker name; kept explicit (not derived from __annotations__) so the
    # output order is stable and obvious. Unannotated, so the dataclass
    # machinery does not treat it as a field.
    _CHECKER_NAMES = (
        'caption', 'reference', 'formatting', 'equation',
        'ai_artifacts', 'sentence', 'consistency',
        'acronym', 'number', 'citation_quality', 'anonymization',
    )

    def get_enabled_checkers(self) -> List[str]:
        """Get list of enabled checker names, in canonical order."""
        return [name for name in self._CHECKER_NAMES if getattr(self, name)]
+
+
@dataclass
class WorkflowStep:
    """Single step in the reference check workflow."""
    name: str  # Step identifier (the YAML 'name' key)
    enabled: bool = True  # Whether the step is active
    description: str = ""  # Free-text description from the YAML file
+
+
@dataclass
class LLMConfig:
    """LLM configuration for relevance checking."""
    backend: str = "gemini"  # Backend name; defaults to Gemini
    model: str = ""  # Model identifier; empty presumably selects a backend default — TODO confirm
    endpoint: str = ""  # Custom API endpoint, if any
    api_key: str = ""  # API key for the backend
+
+
@dataclass
class OutputConfig:
    """Output configuration."""
    quiet: bool = False  # NOTE(review): presumably suppresses console chatter — confirm in CLI
    minimal_verified: bool = False  # NOTE(review): semantics defined by the report writer — confirm
+
+
@dataclass
class BibGuardConfig:
    """Complete BibGuard configuration assembled from the YAML file."""
    files: FilesConfig = field(default_factory=FilesConfig)
    template: str = ""
    bibliography: BibliographyConfig = field(default_factory=BibliographyConfig)
    submission: SubmissionConfig = field(default_factory=SubmissionConfig)
    workflow: List[WorkflowStep] = field(default_factory=list)
    llm: LLMConfig = field(default_factory=LLMConfig)
    output: OutputConfig = field(default_factory=OutputConfig)

    # Files discovered when running in directory mode (populated elsewhere).
    _bib_files: List[Path] = field(default_factory=list)
    _tex_files: List[Path] = field(default_factory=list)

    # Directory containing the loaded config file; relative paths in `files`
    # are resolved against it.
    _config_dir: Path = field(default_factory=lambda: Path.cwd())

    def resolve_path(self, path: str) -> Path:
        """Return *path* unchanged if absolute, else joined onto the config dir."""
        candidate = Path(path)
        return candidate if candidate.is_absolute() else self._config_dir / candidate

    @property
    def bib_path(self) -> Path:
        """Resolved path to the .bib file."""
        return self.resolve_path(self.files.bib)

    @property
    def tex_path(self) -> Path:
        """Resolved path to the main .tex file."""
        return self.resolve_path(self.files.tex)

    @property
    def input_dir_path(self) -> Path:
        """Resolved path to the directory-mode input root."""
        return self.resolve_path(self.files.input_dir)

    @property
    def output_dir_path(self) -> Path:
        """Resolved path to the output directory."""
        return self.resolve_path(self.files.output_dir)
+
+
def _section_to_dataclass(cls_, section):
    """Build *cls_* from a YAML mapping, falling back to dataclass defaults.

    Unknown keys are ignored and a missing/empty (None) section yields all
    defaults — the per-key ``.get(key, default)`` defaults of the previous
    implementation matched the dataclass defaults exactly, so behavior for
    well-formed input is unchanged.
    """
    # Local import: the module top only pulls in `dataclass` and `field`.
    from dataclasses import fields as dc_fields
    section = section or {}  # an empty YAML section parses as None
    known = {f.name for f in dc_fields(cls_)}
    return cls_(**{key: value for key, value in section.items() if key in known})


def load_config(config_path: str) -> BibGuardConfig:
    """Load configuration from YAML file.

    Args:
        config_path: Path to the YAML config file.

    Returns:
        A populated BibGuardConfig; any section absent from (or left empty
        in) the file keeps its dataclass defaults. Previously an empty
        section such as a bare ``files:`` line crashed with AttributeError.

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    path = Path(config_path)

    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}

    config = BibGuardConfig()
    # Remember where the config lives so relative paths resolve against it.
    config._config_dir = path.parent.absolute()

    if 'files' in data:
        config.files = _section_to_dataclass(FilesConfig, data['files'])

    config.template = data.get('template', '')

    if 'bibliography' in data:
        config.bibliography = _section_to_dataclass(BibliographyConfig, data['bibliography'])

    if 'submission' in data:
        config.submission = _section_to_dataclass(SubmissionConfig, data['submission'])

    # Workflow needs explicit handling: 'name' is required by the dataclass
    # but defaulted to '' here, matching the original behavior.
    if 'workflow' in data:
        config.workflow = [
            WorkflowStep(
                name=step.get('name', ''),
                enabled=step.get('enabled', True),
                description=step.get('description', '')
            )
            for step in (data['workflow'] or [])
        ]

    if 'llm' in data:
        config.llm = _section_to_dataclass(LLMConfig, data['llm'])

    if 'output' in data:
        config.output = _section_to_dataclass(OutputConfig, data['output'])

    return config
+
+
def find_config_file() -> Optional[Path]:
    """Search the cwd and up to four ancestor directories for a config file.

    Returns the first match (checked in candidate-name order per directory),
    or None when nothing is found within five levels.
    """
    candidate_names = ('config.yaml', 'bibguard.yaml', 'bibguard.yml',
                       '.bibguard.yaml', '.bibguard.yml')

    directory = Path.cwd()
    for _level in range(5):  # Check up to 5 levels
        for candidate_name in candidate_names:
            candidate = directory / candidate_name
            if candidate.exists():
                return candidate
        if directory.parent == directory:  # reached filesystem root
            break
        directory = directory.parent

    return None
+
+
def create_default_config(output_path: str = "config.yaml"):
    """Write a fully-commented default config file and return its path."""
    default_yaml = """# BibGuard Configuration File

files:
  bib: "paper.bib"
  tex: "paper.tex"
  output_dir: "bibguard_output"

template: ""

bibliography:
  check_metadata: true
  check_usage: true
  check_duplicates: true
  check_preprint_ratio: true
  preprint_warning_threshold: 0.50
  check_relevance: false

submission:
  caption: true
  reference: true
  formatting: true
  equation: true
  ai_artifacts: true
  sentence: true
  consistency: true
  acronym: true
  number: true
  citation_quality: true
  anonymization: true

llm:
  backend: "gemini"
  model: ""
  api_key: ""

output:
  quiet: false
  minimal_verified: false
"""
    Path(output_path).write_text(default_yaml, encoding='utf-8')
    return output_path
diff --git a/src/fetchers/__init__.py b/src/fetchers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..10c2bb70a42976b4f0eb2b1901b17611a9eec26c
--- /dev/null
+++ b/src/fetchers/__init__.py
@@ -0,0 +1,16 @@
+"""Fetchers package"""
+from .arxiv_fetcher import ArxivFetcher
+from .scholar_fetcher import ScholarFetcher
+from .crossref_fetcher import CrossRefFetcher
+from .semantic_scholar_fetcher import SemanticScholarFetcher
+from .openalex_fetcher import OpenAlexFetcher
+from .dblp_fetcher import DBLPFetcher
+
+__all__ = [
+ 'ArxivFetcher',
+ 'ScholarFetcher',
+ 'CrossRefFetcher',
+ 'SemanticScholarFetcher',
+ 'OpenAlexFetcher',
+ 'DBLPFetcher'
+]
diff --git a/src/fetchers/__pycache__/__init__.cpython-313.pyc b/src/fetchers/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2c8c41f22daf3f9ebe4906fbf377db97bdda5b9
Binary files /dev/null and b/src/fetchers/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/fetchers/__pycache__/arxiv_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/arxiv_fetcher.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6c0f6d2bea0f0fc0d4eaa6cc21db0bf88d23ea0
Binary files /dev/null and b/src/fetchers/__pycache__/arxiv_fetcher.cpython-313.pyc differ
diff --git a/src/fetchers/__pycache__/crossref_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/crossref_fetcher.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..308f1616aefb91ef39445c7ffa769df645c0278d
Binary files /dev/null and b/src/fetchers/__pycache__/crossref_fetcher.cpython-313.pyc differ
diff --git a/src/fetchers/__pycache__/dblp_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/dblp_fetcher.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e6c4c3048b8dbe934a6601fd1d5db5ab66f94e1
Binary files /dev/null and b/src/fetchers/__pycache__/dblp_fetcher.cpython-313.pyc differ
diff --git a/src/fetchers/__pycache__/openalex_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/openalex_fetcher.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c22735c52179987deacb4a0647d9c24b78709ed7
Binary files /dev/null and b/src/fetchers/__pycache__/openalex_fetcher.cpython-313.pyc differ
diff --git a/src/fetchers/__pycache__/scholar_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/scholar_fetcher.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a5aef1df4d19d55bf6834cf0e7f368896ea8c9b
Binary files /dev/null and b/src/fetchers/__pycache__/scholar_fetcher.cpython-313.pyc differ
diff --git a/src/fetchers/__pycache__/semantic_scholar_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/semantic_scholar_fetcher.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..089ed166dfaaf59e5fc7b54b7f45a9f3e2b2b744
Binary files /dev/null and b/src/fetchers/__pycache__/semantic_scholar_fetcher.cpython-313.pyc differ
diff --git a/src/fetchers/arxiv_fetcher.py b/src/fetchers/arxiv_fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..22a8a064ebe5cf07f60a641d41c7fa47193684a4
--- /dev/null
+++ b/src/fetchers/arxiv_fetcher.py
@@ -0,0 +1,228 @@
+"""
+arXiv metadata fetcher using the public API.
+"""
+import re
+import time
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from typing import Optional
+from urllib.parse import quote
+
+import requests
+
+
@dataclass
class ArxivMetadata:
    """Metadata for one arXiv entry, as parsed from the Atom API response."""
    arxiv_id: str
    title: str
    authors: list[str]
    abstract: str
    published: str
    updated: str
    categories: list[str]
    primary_category: str
    doi: str
    journal_ref: str
    comment: str
    pdf_url: str
    abs_url: str

    @property
    def year(self) -> str:
        """Publication year taken from the leading digits of *published*.

        Returns "" when the date is empty or does not start with four digits.
        """
        found = re.match(r'(\d{4})', self.published) if self.published else None
        return found.group(1) if found else ""
+
+
class ArxivFetcher:
    """Fetches metadata from the arXiv Atom API (http://export.arxiv.org).

    All network failures are swallowed and reported as "no result"
    (None / empty list) — lookups are best-effort by design.
    """

    API_BASE = "http://export.arxiv.org/api/query"
    RATE_LIMIT_DELAY = 3.0  # seconds between requests
    # Shared request headers; previously this dict was duplicated verbatim
    # in both request sites. Value unchanged.
    REQUEST_HEADERS = {'User-Agent': 'BibChecker/1.0 (mailto:user@example.com)'}

    def __init__(self):
        # Timestamp of the last outgoing request, for client-side throttling.
        self._last_request_time = 0.0

    def _rate_limit(self):
        """Sleep as needed so requests are at least RATE_LIMIT_DELAY apart."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.RATE_LIMIT_DELAY:
            time.sleep(self.RATE_LIMIT_DELAY - elapsed)
        self._last_request_time = time.time()

    def fetch_by_id(self, arxiv_id: str) -> Optional[ArxivMetadata]:
        """Fetch metadata by arXiv ID.

        Args:
            arxiv_id: ID with or without the leading "arXiv:" prefix.

        Returns:
            Parsed metadata, or None on network error / no match.
        """
        # Normalize: trim whitespace and a case-insensitive "arXiv:" prefix.
        arxiv_id = arxiv_id.strip()
        arxiv_id = re.sub(r'^arXiv:', '', arxiv_id, flags=re.IGNORECASE)

        self._rate_limit()

        params = {
            'id_list': arxiv_id,
            'max_results': 1
        }

        try:
            response = requests.get(
                self.API_BASE,
                params=params,
                timeout=30,
                headers=self.REQUEST_HEADERS
            )
            response.raise_for_status()
        except requests.RequestException:
            # Best-effort: any transport/HTTP error reads as "not found".
            return None

        return self._parse_response(response.text)

    def search_by_title(self, title: str, max_results: int = 5) -> list[ArxivMetadata]:
        """Search arXiv by title, returning up to *max_results* candidates.

        The title is stripped of punctuation before being quoted into the
        `ti:` field query; results come back relevance-sorted.
        """
        self._rate_limit()

        # Clean up title for search (punctuation confuses the field query).
        clean_title = re.sub(r'[^\w\s]', ' ', title)
        clean_title = re.sub(r'\s+', ' ', clean_title).strip()

        search_query = f'ti:"{clean_title}"'

        params = {
            'search_query': search_query,
            'max_results': max_results,
            'sortBy': 'relevance',
            'sortOrder': 'descending'
        }

        try:
            response = requests.get(
                self.API_BASE,
                params=params,
                timeout=30,
                headers=self.REQUEST_HEADERS
            )
            response.raise_for_status()
        except requests.RequestException:
            # Best-effort: network problems yield an empty result list.
            return []

        return self._parse_response_multiple(response.text)

    def _parse_response(self, xml_content: str) -> Optional[ArxivMetadata]:
        """Parse a single-entry response; None when nothing parses."""
        results = self._parse_response_multiple(xml_content)
        return results[0] if results else None

    def _parse_response_multiple(self, xml_content: str) -> list[ArxivMetadata]:
        """Parse every <entry> in an Atom response; unparseable XML yields []."""
        results = []

        try:
            root = ET.fromstring(xml_content)
        except ET.ParseError:
            return results

        # Atom + arXiv extension namespaces used by the API feed.
        ns = {
            'atom': 'http://www.w3.org/2005/Atom',
            'arxiv': 'http://arxiv.org/schemas/atom'
        }

        for entry in root.findall('atom:entry', ns):
            try:
                metadata = self._parse_entry(entry, ns)
                if metadata:
                    results.append(metadata)
            except Exception:
                # A malformed entry should not sink the whole batch.
                continue

        return results

    def _parse_entry(self, entry: ET.Element, ns: dict) -> Optional[ArxivMetadata]:
        """Build an ArxivMetadata from one Atom <entry>; None without an ID."""
        # The <id> element carries the abs-page URL; it is mandatory.
        id_elem = entry.find('atom:id', ns)
        if id_elem is None or id_elem.text is None:
            return None

        abs_url = id_elem.text.strip()

        # Extract the arXiv ID from the abs URL.
        match = re.search(r'arxiv\.org/abs/(.+)$', abs_url)
        arxiv_id = match.group(1) if match else ""

        title_elem = entry.find('atom:title', ns)
        title = self._clean_text(title_elem.text) if title_elem is not None and title_elem.text else ""

        summary_elem = entry.find('atom:summary', ns)
        abstract = self._clean_text(summary_elem.text) if summary_elem is not None and summary_elem.text else ""

        authors = []
        for author_elem in entry.findall('atom:author', ns):
            name_elem = author_elem.find('atom:name', ns)
            if name_elem is not None and name_elem.text:
                authors.append(name_elem.text.strip())

        published_elem = entry.find('atom:published', ns)
        published = published_elem.text.strip() if published_elem is not None and published_elem.text else ""

        updated_elem = entry.find('atom:updated', ns)
        updated = updated_elem.text.strip() if updated_elem is not None and updated_elem.text else ""

        categories = []
        for cat_elem in entry.findall('atom:category', ns):
            term = cat_elem.get('term')
            if term:
                categories.append(term)

        primary_cat_elem = entry.find('arxiv:primary_category', ns)
        primary_category = primary_cat_elem.get('term', '') if primary_cat_elem is not None else ""

        doi_elem = entry.find('arxiv:doi', ns)
        doi = doi_elem.text.strip() if doi_elem is not None and doi_elem.text else ""

        journal_elem = entry.find('arxiv:journal_ref', ns)
        journal_ref = journal_elem.text.strip() if journal_elem is not None and journal_elem.text else ""

        comment_elem = entry.find('arxiv:comment', ns)
        comment = comment_elem.text.strip() if comment_elem is not None and comment_elem.text else ""

        # Derive the PDF URL from the abs URL.
        pdf_url = abs_url.replace('/abs/', '/pdf/') + '.pdf'

        return ArxivMetadata(
            arxiv_id=arxiv_id,
            title=title,
            authors=authors,
            abstract=abstract,
            published=published,
            updated=updated,
            categories=categories,
            primary_category=primary_category,
            doi=doi,
            journal_ref=journal_ref,
            comment=comment,
            pdf_url=pdf_url,
            abs_url=abs_url
        )

    def _clean_text(self, text: str) -> str:
        """Collapse runs of whitespace (the feed wraps text) and trim."""
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
diff --git a/src/fetchers/crossref_fetcher.py b/src/fetchers/crossref_fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a65cc55b07c2209eb26c8c66958dfc987355643
--- /dev/null
+++ b/src/fetchers/crossref_fetcher.py
@@ -0,0 +1,197 @@
+"""
+CrossRef API fetcher for bibliography metadata.
+
+CrossRef provides free, reliable access to metadata for academic publications.
+No API key required, no rate limiting for reasonable use.
+"""
+import requests
+from dataclasses import dataclass
+from typing import Optional, List
+import time
+
+
@dataclass
class CrossRefResult:
    """Metadata result from CrossRef API (one work item, already flattened)."""
    title: str  # first entry of CrossRef's list-wrapped title
    authors: List[str]  # "Given Family" full names, family-only when given is absent
    year: str  # publication year as a string; "" when no date field was found
    doi: str
    publisher: str
    container_title: str  # Journal/conference name
    abstract: str = ""  # often empty; CrossRef abstracts are publisher-supplied
    url: str = ""  # https://doi.org/<doi> when a DOI is present
+
+
class CrossRefFetcher:
    """
    Fetcher for CrossRef API.

    CrossRef is a reliable, free API for academic metadata.
    Much more reliable than Google Scholar scraping.
    All lookups are best-effort: network, HTTP, and JSON-decode failures
    return None instead of raising.
    """

    BASE_URL = "https://api.crossref.org/works"
    RATE_LIMIT_DELAY = 1.0  # seconds between requests; be polite

    def __init__(self, mailto: str = "bibguard@example.com"):
        """
        Initialize CrossRef fetcher.

        Args:
            mailto: Email for polite pool (gets better rate limits)
        """
        self.mailto = mailto
        self._last_request_time = 0.0
        self._session = requests.Session()

    def _rate_limit(self):
        """Sleep just long enough to space requests by RATE_LIMIT_DELAY."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.RATE_LIMIT_DELAY:
            time.sleep(self.RATE_LIMIT_DELAY - elapsed)
        self._last_request_time = time.time()

    def _get_headers(self) -> dict:
        """Request headers; the mailto in User-Agent opts into CrossRef's polite pool."""
        return {
            'User-Agent': f'BibGuard/1.0 (mailto:{self.mailto})',
            'Accept': 'application/json',
        }

    def search_by_title(self, title: str, max_results: int = 5) -> Optional[CrossRefResult]:
        """
        Search for a paper by title.

        Args:
            title: Paper title to search for
            max_results: Maximum number of results to retrieve

        Returns:
            Best matching CrossRefResult or None if not found
        """
        self._rate_limit()

        params = {
            'query.title': title,
            'rows': max_results,
            # Restrict the payload to exactly the fields _parse_item consumes
            'select': 'title,author,published-print,published-online,DOI,publisher,container-title,abstract'
        }

        try:
            response = self._session.get(
                self.BASE_URL,
                params=params,
                headers=self._get_headers(),
                timeout=30
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError):
            # ValueError covers malformed JSON: older requests versions raise a
            # plain ValueError from response.json(), which previously escaped
            # past `except requests.RequestException` to the caller.
            return None

        if data.get('status') != 'ok':
            return None

        items = data.get('message', {}).get('items', [])
        if not items:
            return None

        # Return best match (first result, as CrossRef ranks by relevance)
        return self._parse_item(items[0])

    def search_by_doi(self, doi: str) -> Optional[CrossRefResult]:
        """
        Fetch metadata by DOI.

        Args:
            doi: DOI of the paper (bare "10.x/..." or a full doi.org URL)

        Returns:
            CrossRefResult or None if not found
        """
        self._rate_limit()

        # Clean DOI (remove https://doi.org/ prefix if present)
        doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')

        try:
            response = self._session.get(
                f"{self.BASE_URL}/{doi}",
                headers=self._get_headers(),
                timeout=30
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError):
            # See search_by_title: ValueError = malformed JSON body
            return None

        if data.get('status') != 'ok':
            return None

        return self._parse_item(data.get('message', {}))

    def _parse_item(self, item: dict) -> Optional[CrossRefResult]:
        """Parse one CrossRef work item; returns None when it has no usable title."""
        try:
            # CrossRef wraps the title in a list
            titles = item.get('title', [])
            title = titles[0] if titles else ""
            if not title:
                return None

            # Authors as "Given Family"; family name alone when given is missing
            authors = []
            for author in item.get('author', []):
                given = author.get('given', '')
                family = author.get('family', '')
                if family:
                    authors.append(f"{given} {family}" if given else family)

            # Year: prefer print date, then online, then record creation
            year = ""
            for date_field in ['published-print', 'published-online', 'created']:
                date_parts = item.get(date_field, {}).get('date-parts', [[]])
                if date_parts and date_parts[0]:
                    year = str(date_parts[0][0])
                    break

            doi = item.get('DOI', '')
            publisher = item.get('publisher', '')

            # Container title (journal/conference name) is also list-wrapped
            container_titles = item.get('container-title', [])
            container_title = container_titles[0] if container_titles else ""

            # Abstract is only supplied by some publishers (often as JATS XML)
            abstract = item.get('abstract', '')

            url = f"https://doi.org/{doi}" if doi else ""

            return CrossRefResult(
                title=title,
                authors=authors,
                year=year,
                doi=doi,
                publisher=publisher,
                container_title=container_title,
                abstract=abstract,
                url=url
            )

        except (KeyError, IndexError, TypeError):
            return None
diff --git a/src/fetchers/dblp_fetcher.py b/src/fetchers/dblp_fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb493d95f5fe049d2a6fc5a18e437534239be783
--- /dev/null
+++ b/src/fetchers/dblp_fetcher.py
@@ -0,0 +1,121 @@
+import requests
+import time
+import logging
+from typing import Optional, List, Dict, Any
+from dataclasses import dataclass
+
@dataclass
class DBLPResult:
    """A single publication record parsed from a DBLP search hit."""
    title: str  # trailing "." stripped by the parser
    authors: List[str]
    year: str
    venue: str  # journal/conference abbreviation as DBLP reports it
    url: str  # DBLP record URL
    doi: Optional[str] = None  # None when DBLP has no DOI for the record
+
class DBLPFetcher:
    """
    Fetcher for DBLP's publication search API.

    Lookups are best-effort: any network, HTTP, or parsing failure is logged
    and surfaced to the caller as None.
    """

    BASE_URL = "https://dblp.org/search/publ/api"

    def __init__(self):
        self.last_request_time = 0
        # DBLP asks for 1-2 seconds between requests. We'll use 1.5s to be safe.
        self.rate_limit_delay = 1.5
        self.logger = logging.getLogger(__name__)

    def _wait_for_rate_limit(self):
        """Ensure we don't exceed rate limits."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.rate_limit_delay:
            time.sleep(self.rate_limit_delay - elapsed)
        self.last_request_time = time.time()

    def search_by_title(self, title: str) -> Optional[DBLPResult]:
        """
        Search DBLP by title.

        Args:
            title: Paper title to search for

        Returns:
            DBLPResult if found, None otherwise
        """
        self._wait_for_rate_limit()

        params = {
            "q": title,
            "format": "json",
            "h": 3  # Limit to top 3 hits
        }

        try:
            response = requests.get(self.BASE_URL, params=params, timeout=10)

            if response.status_code == 429:
                # Back off, but don't retry here; caller treats None as "not found"
                self.logger.warning("DBLP rate limit exceeded. Waiting longer...")
                time.sleep(5)
                return None

            if response.status_code != 200:
                self.logger.warning(f"DBLP API error: {response.status_code}")
                return None

            data = response.json()
            return self._parse_response(data, title)

        except Exception as e:
            # Broad catch keeps lookups best-effort (network errors, bad JSON, ...)
            self.logger.error(f"Error fetching from DBLP: {e}")
            return None

    def _parse_response(self, data: Dict[str, Any], query_title: str) -> Optional[DBLPResult]:
        """Parse DBLP JSON into a DBLPResult; the first hit is taken as best match."""
        try:
            hits = data.get("result", {}).get("hits", {}).get("hit", [])
            if not hits:
                return None

            # The search/publ/api endpoint only returns publications and ranks
            # them by relevance, so the first hit is our best match.
            # (A dead `best_hit = None` pre-assignment was removed here.)
            info = hits[0].get("info", {})

            # Authors: DBLP returns a list of dicts for multi-author papers and
            # a single dict for one author. Tolerate bare strings as well —
            # presumably some records flatten the author to plain text (TODO
            # confirm against live responses); a.get() would crash on those.
            authors_data = info.get("authors", {}).get("author", [])
            if isinstance(authors_data, dict):
                authors_data = [authors_data]
            authors = []
            for author in authors_data:
                if isinstance(author, dict):
                    authors.append(author.get("text", ""))
                elif isinstance(author, str):
                    authors.append(author)

            title = info.get("title", "")
            year = info.get("year", "")
            venue = info.get("venue", "")
            url = info.get("url", "")
            doi = info.get("doi", "")

            # Clean title (DBLP titles often end with a dot)
            if title.endswith("."):
                title = title[:-1]

            return DBLPResult(
                title=title,
                authors=authors,
                year=year,
                venue=venue,
                url=url,
                doi=doi if doi else None
            )

        except Exception as e:
            self.logger.error(f"Error parsing DBLP response: {e}")
            return None
diff --git a/src/fetchers/openalex_fetcher.py b/src/fetchers/openalex_fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..f978cc8d9e5a59e0918b3240804fbb04e6c6c105
--- /dev/null
+++ b/src/fetchers/openalex_fetcher.py
@@ -0,0 +1,196 @@
+"""
+OpenAlex API fetcher.
+Free and open API for scholarly metadata.
+"""
+import time
+from dataclasses import dataclass
+from typing import Optional
+
+import requests
+
+
@dataclass
class OpenAlexResult:
    """Search result from OpenAlex API, flattened from one work record."""
    title: str
    authors: list[str]  # author display names, in authorship order
    year: str  # publication year as a string; "" when unknown
    abstract: str  # reconstructed from OpenAlex's inverted index; may be ""
    doi: str  # bare DOI with the https://doi.org/ prefix stripped
    citation_count: int
    url: str  # canonical OpenAlex work URL (the record's 'id')
+
+
class OpenAlexFetcher:
    """
    Fetcher using OpenAlex's free API.

    API Docs: https://docs.openalex.org/
    Rate Limits:
        - 100,000 requests per day
        - 10 requests per second (very generous)
        - No API key required (but polite pool recommended)
    All lookups are best-effort: network, HTTP, and JSON-decode failures
    return None instead of raising.
    """

    BASE_URL = "https://api.openalex.org"
    RATE_LIMIT_DELAY = 0.1  # 10 req/sec max

    def __init__(self, email: Optional[str] = None):
        """
        Initialize OpenAlex fetcher.

        Args:
            email: Optional email for polite pool (faster rate limits)
        """
        self.email = email
        self._last_request_time = 0.0
        self._session = requests.Session()

        # Set user agent (required by OpenAlex)
        self._session.headers.update({
            'User-Agent': 'BibGuard/1.0 (https://github.com/thinkwee/BibGuard; mailto:bibguard@example.com)'
        })

        # Add email to polite pool if provided
        if email:
            self._session.headers.update({'From': email})

    def _rate_limit(self):
        """Sleep just long enough to space requests by RATE_LIMIT_DELAY."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.RATE_LIMIT_DELAY:
            time.sleep(self.RATE_LIMIT_DELAY - elapsed)
        self._last_request_time = time.time()

    def search_by_title(self, title: str, max_results: int = 5) -> Optional[OpenAlexResult]:
        """
        Search for a paper by title.

        Args:
            title: Paper title to search for
            max_results: Maximum number of results to fetch (default: 5)

        Returns:
            OpenAlexResult if found, None otherwise
        """
        self._rate_limit()

        url = f"{self.BASE_URL}/works"
        params = {
            'search': title,
            'per-page': max_results
        }

        try:
            response = self._session.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError):
            # ValueError covers malformed JSON bodies: on older requests
            # versions response.json() raises a plain ValueError, which the
            # previous code let escape to the caller.
            return None

        results = data.get('results', [])
        if not results:
            return None

        # First result is the most relevant (OpenAlex ranks by relevance)
        return self._parse_work(results[0])

    def fetch_by_doi(self, doi: str) -> Optional[OpenAlexResult]:
        """
        Fetch paper metadata by DOI.

        Args:
            doi: DOI of the paper (bare "10.x/..." or a full doi.org URL)

        Returns:
            OpenAlexResult if found, None otherwise
        """
        self._rate_limit()

        # Accept full doi.org URLs too (consistent with CrossRefFetcher);
        # previously a full URL produced a doubled
        # ".../works/https://doi.org/https://doi.org/..." request path.
        doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')

        # OpenAlex accepts DOI URLs as work identifiers
        doi_url = f"https://doi.org/{doi}"
        url = f"{self.BASE_URL}/works/{doi_url}"

        try:
            response = self._session.get(url, timeout=10)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError):
            return None

        return self._parse_work(data)

    def _parse_work(self, work_data: dict) -> Optional[OpenAlexResult]:
        """Parse a single OpenAlex work record into an OpenAlexResult."""
        try:
            # OpenAlex can return an explicit null title; coerce None to ""
            # so the declared `title: str` field type holds.
            title = work_data.get('title') or ''

            # Author display names, in authorship order
            authors = []
            for authorship in work_data.get('authorships', []):
                name = authorship.get('author', {}).get('display_name', '')
                if name:
                    authors.append(name)

            # Publication year (may be None)
            year = work_data.get('publication_year')
            year_str = str(year) if year else ""

            # Abstract is stored as an inverted index; rebuild plain text
            abstract = ""
            abstract_inverted = work_data.get('abstract_inverted_index')
            if abstract_inverted:
                abstract = self._reconstruct_abstract(abstract_inverted)

            # DOI comes back as a full URL; strip to the bare identifier
            doi = work_data.get('doi', '')
            if doi and doi.startswith('https://doi.org/'):
                doi = doi.replace('https://doi.org/', '')

            citation_count = work_data.get('cited_by_count', 0)

            # 'id' is the canonical OpenAlex URL for the work
            url = work_data.get('id', '')

            return OpenAlexResult(
                title=title,
                authors=authors,
                year=year_str,
                abstract=abstract,
                doi=doi,
                citation_count=citation_count,
                url=url
            )
        except (KeyError, TypeError):
            return None

    def _reconstruct_abstract(self, inverted_index: dict) -> str:
        """
        Reconstruct abstract text from inverted index.

        OpenAlex stores abstracts in inverted index format:
        {"word": [position1, position2, ...], ...}
        """
        if not inverted_index:
            return ""

        try:
            # Size the word slots to the highest position present
            max_pos = max(max(positions) for positions in inverted_index.values())
            words = [''] * (max_pos + 1)

            # Place each word at all of its positions
            for word, positions in inverted_index.items():
                for pos in positions:
                    words[pos] = word

            # Join words with spaces, skipping unfilled slots
            return ' '.join(word for word in words if word)
        except (ValueError, TypeError):
            # ValueError: an empty positions list makes the inner max() fail
            return ""
diff --git a/src/fetchers/scholar_fetcher.py b/src/fetchers/scholar_fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..47a3cd36fb8a9c88f3890e9f09cfd26e93582f2a
--- /dev/null
+++ b/src/fetchers/scholar_fetcher.py
@@ -0,0 +1,218 @@
+"""
+Google Scholar search (scraping-based fallback).
+"""
+import re
+import time
+import random
+from dataclasses import dataclass
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup
+
+
@dataclass
class ScholarResult:
    """Search result scraped from one Google Scholar results entry."""
    title: str  # [PDF]/[HTML]/[BOOK]/[CITATION] markers stripped
    authors: str  # raw author portion of the green meta line (single string)
    year: str  # first 19xx/20xx match in the meta line; "" when absent
    snippet: str  # result snippet text
    url: str  # title link target; "" for citation-only results
    cited_by: int  # "Cited by N" count; 0 when the link is absent
+
+
class ScholarFetcher:
    """
    Fallback fetcher using Google Scholar search.

    Note: This uses scraping and may be blocked.
    Use rate limiting and respect robots.txt.
    Once a block is detected, all further queries short-circuit to [].
    """

    SEARCH_URL = "https://scholar.google.com/scholar"
    RATE_LIMIT_DELAY = 10.0  # Conservative delay to avoid blocking (was 5.0)
    MAX_RETRIES = 2  # NOTE(review): not referenced by any code visible here

    USER_AGENTS = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
    ]

    def __init__(self):
        self._last_request_time = 0.0
        self._session = requests.Session()
        self._request_count = 0  # total queries issued this session
        self._blocked = False  # set once Google serves a block/CAPTCHA response

    def _rate_limit(self):
        """Sleep so consecutive requests are spaced by RATE_LIMIT_DELAY plus jitter."""
        elapsed = time.time() - self._last_request_time
        # Add more randomness to avoid detection (3-5 seconds extra)
        delay = self.RATE_LIMIT_DELAY + random.uniform(3, 5)
        if elapsed < delay:
            time.sleep(delay - elapsed)
        self._last_request_time = time.time()

    def _get_headers(self) -> dict:
        """Browser-like request headers with a randomly chosen user agent."""
        return {
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def search(self, query: str, max_results: int = 5) -> list[ScholarResult]:
        """
        Search Google Scholar.

        Returns list of search results (empty on any failure).
        Note: This may fail if blocked by Google.
        """
        # If we've been blocked, don't waste time
        if self._blocked:
            return []

        self._rate_limit()
        self._request_count += 1

        params = {
            'q': query,
            'hl': 'en',
            'num': min(max_results, 10)  # Scholar max is 10 per page
        }

        try:
            response = self._session.get(
                self.SEARCH_URL,
                params=params,
                headers=self._get_headers(),
                timeout=30
            )
        except requests.RequestException:
            return []

        # BUG FIX: detect blocking BEFORE the HTTP status check. The old code
        # called raise_for_status() first, so a 429 raised inside the try and
        # returned [] without ever setting self._blocked, and the block check
        # after the try was unreachable for non-2xx responses — we kept
        # hammering Scholar after being blocked.
        if response.status_code == 429 or 'unusual traffic' in response.text.lower():
            self._blocked = True
            print(f"⚠️ Google Scholar blocked after {self._request_count} requests. Skipping further Scholar queries.")
            return []

        if not response.ok:
            # Other HTTP errors: give up on this query, as before
            return []

        return self._parse_results(response.text, max_results)

    def search_by_title(self, title: str) -> Optional[ScholarResult]:
        """Search for a specific paper by title; returns the top hit or None."""
        # Use quotes for exact title match
        query = f'"{title}"'
        results = self.search(query, max_results=3)

        if not results:
            # Try without quotes
            results = self.search(title, max_results=5)

        return results[0] if results else None

    def _parse_results(self, html: str, max_results: int) -> list[ScholarResult]:
        """Parse up to max_results entries from a Scholar results page."""
        results = []
        soup = BeautifulSoup(html, 'lxml')

        # Each organic result body is a div.gs_ri
        for entry in soup.find_all('div', class_='gs_ri')[:max_results]:
            try:
                result = self._parse_entry(entry)
                if result:
                    results.append(result)
            except Exception:
                # Malformed entry: skip it, keep parsing the rest
                continue

        return results

    def _parse_entry(self, entry) -> Optional[ScholarResult]:
        """Parse a single search result entry (a div.gs_ri soup node)."""
        # Get title
        title_elem = entry.find('h3', class_='gs_rt')
        if not title_elem:
            return None

        # Get title text and URL
        title_link = title_elem.find('a')
        if title_link:
            title = title_link.get_text(strip=True)
            url = title_link.get('href', '')
        else:
            # Citation-only results have no link
            title = title_elem.get_text(strip=True)
            url = ''

        # Clean title (remove [PDF], [HTML] markers)
        title = re.sub(r'^\[(PDF|HTML|BOOK|CITATION)\]\s*', '', title)

        # Get authors and year from the green line
        meta_elem = entry.find('div', class_='gs_a')
        authors = ""
        year = ""

        if meta_elem:
            meta_text = meta_elem.get_text(strip=True)

            # Extract year first
            year_match = re.search(r'\b(19|20)\d{2}\b', meta_text)
            if year_match:
                year = year_match.group(0)

            # Parse authors more carefully
            # Format is usually: "Author1, Author2 - Journal, Year - Publisher"
            # or sometimes: "Author1, Author2 - Journal/Conference - Year"
            parts = meta_text.split(' - ')
            if parts:
                author_part = parts[0].strip()

                # Clean up author field - remove year if it leaked in
                if year:
                    author_part = re.sub(r',?\s*' + re.escape(year) + r'.*$', '', author_part)

                # Remove common journal/venue keywords that might have leaked,
                # with and without a leading "the"
                author_part = re.sub(r'\s+the\s+(journal|proceedings|conference|symposium|workshop|transactions|magazine|review|annals)\s+.*$', '', author_part, flags=re.IGNORECASE)
                author_part = re.sub(r'\s+(journal|proceedings|conference|symposium|workshop|transactions|magazine|review|annals)\s+.*$', '', author_part, flags=re.IGNORECASE)

                # Remove standalone "the" at the end (in case it's left over)
                author_part = re.sub(r'\s+the\s*$', '', author_part, flags=re.IGNORECASE)

                # Remove trailing commas and whitespace
                authors = author_part.rstrip(', ').strip()

        # Get snippet
        snippet_elem = entry.find('div', class_='gs_rs')
        snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""

        # Get cited by count
        cited_by = 0
        cited_elem = entry.find('a', string=re.compile(r'Cited by \d+'))
        if cited_elem:
            match = re.search(r'Cited by (\d+)', cited_elem.get_text())
            if match:
                cited_by = int(match.group(1))

        return ScholarResult(
            title=title,
            authors=authors,
            year=year,
            snippet=snippet,
            url=url,
            cited_by=cited_by
        )
diff --git a/src/fetchers/semantic_scholar_fetcher.py b/src/fetchers/semantic_scholar_fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..8170a2e9a658d00bf9e5d82a165a5183365b316b
--- /dev/null
+++ b/src/fetchers/semantic_scholar_fetcher.py
@@ -0,0 +1,172 @@
+"""
+Semantic Scholar API fetcher.
+Official API with high quality metadata and generous rate limits.
+"""
+import time
+from dataclasses import dataclass
+from typing import Optional
+
+import requests
+
+
@dataclass
class SemanticScholarResult:
    """Search result from Semantic Scholar API (one paper record, flattened)."""
    title: str
    authors: list[str]  # author display names
    year: str  # publication year as a string; "" when the API returns null
    abstract: str
    paper_id: str  # Semantic Scholar paperId
    citation_count: int
    url: str  # Semantic Scholar paper page URL
+
+
class SemanticScholarFetcher:
    """
    Fetcher using Semantic Scholar's official API.

    API Docs: https://api.semanticscholar.org/
    Rate Limits:
        - Without API key: 100 requests per 5 minutes
        - With API key: 5,000 requests per 5 minutes (free)
    All lookups are best-effort: network, HTTP, and JSON-decode failures
    return None instead of raising.
    """

    BASE_URL = "https://api.semanticscholar.org/graph/v1"
    # NOTE(review): 0.5s spacing allows ~600 requests per 5 minutes, which
    # still exceeds the documented unkeyed limit above under sustained use;
    # the API answers 429 in that case and the lookup simply returns None.
    RATE_LIMIT_DELAY = 0.5

    # Fields requested from every endpoint (previously duplicated in 3 methods)
    _FIELDS = 'title,authors,year,abstract,paperId,citationCount,url'

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize Semantic Scholar fetcher.

        Args:
            api_key: Optional API key for higher rate limits (free from semanticscholar.org)
        """
        self.api_key = api_key
        self._last_request_time = 0.0
        self._session = requests.Session()

        if api_key:
            self._session.headers.update({'x-api-key': api_key})

    def _rate_limit(self):
        """Sleep just long enough to space requests by RATE_LIMIT_DELAY."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.RATE_LIMIT_DELAY:
            time.sleep(self.RATE_LIMIT_DELAY - elapsed)
        self._last_request_time = time.time()

    def _get_json(self, url: str, params: dict) -> Optional[dict]:
        """Rate-limited GET returning parsed JSON, or None on any failure."""
        self._rate_limit()
        try:
            response = self._session.get(url, params=params, timeout=10)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError):
            # ValueError covers malformed JSON: older requests versions raise
            # a plain ValueError from response.json(), which previously
            # escaped to the caller.
            return None

    def search_by_title(self, title: str, max_results: int = 5) -> Optional[SemanticScholarResult]:
        """
        Search for a paper by title.

        Args:
            title: Paper title to search for
            max_results: Maximum number of results to fetch (default: 5)

        Returns:
            SemanticScholarResult if found, None otherwise
        """
        data = self._get_json(
            f"{self.BASE_URL}/paper/search",
            {'query': title, 'limit': max_results, 'fields': self._FIELDS}
        )
        if data is None:
            return None

        papers = data.get('data', [])
        if not papers:
            return None

        # Return the first (most relevant) result
        return self._parse_paper(papers[0])

    def fetch_by_doi(self, doi: str) -> Optional[SemanticScholarResult]:
        """
        Fetch paper metadata by DOI.

        Args:
            doi: DOI of the paper

        Returns:
            SemanticScholarResult if found, None otherwise
        """
        data = self._get_json(
            f"{self.BASE_URL}/paper/DOI:{doi}",
            {'fields': self._FIELDS}
        )
        return self._parse_paper(data) if data is not None else None

    def fetch_by_arxiv_id(self, arxiv_id: str) -> Optional[SemanticScholarResult]:
        """
        Fetch paper metadata by arXiv ID.

        Args:
            arxiv_id: arXiv ID (e.g., "2301.12345" or "arXiv:2301.12345")

        Returns:
            SemanticScholarResult if found, None otherwise
        """
        # Clean arXiv ID (remove "arXiv:" prefix if present)
        clean_id = arxiv_id.replace('arXiv:', '')

        data = self._get_json(
            f"{self.BASE_URL}/paper/ARXIV:{clean_id}",
            {'fields': self._FIELDS}
        )
        return self._parse_paper(data) if data is not None else None

    def _parse_paper(self, paper_data: dict) -> Optional[SemanticScholarResult]:
        """Parse paper data from API response; None on malformed records."""
        try:
            # Extract author names
            authors = []
            for author in paper_data.get('authors', []):
                name = author.get('name', '')
                if name:
                    authors.append(name)

            # Get year (may be None in the API response)
            year = paper_data.get('year')
            year_str = str(year) if year else ""

            return SemanticScholarResult(
                title=paper_data.get('title', ''),
                authors=authors,
                year=year_str,
                abstract=paper_data.get('abstract', ''),
                paper_id=paper_data.get('paperId', ''),
                citation_count=paper_data.get('citationCount', 0),
                url=paper_data.get('url', '')
            )
        except (KeyError, TypeError):
            return None
diff --git a/src/parsers/__init__.py b/src/parsers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6d67cd8290224189a418e2ba5fdd3b4f9c6f283
--- /dev/null
+++ b/src/parsers/__init__.py
@@ -0,0 +1,5 @@
+"""Parsers package"""
+from .bib_parser import BibParser
+from .tex_parser import TexParser
+
+__all__ = ['BibParser', 'TexParser']
diff --git a/src/parsers/__pycache__/__init__.cpython-311.pyc b/src/parsers/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f017711d2f52991009793611123488f899922d17
Binary files /dev/null and b/src/parsers/__pycache__/__init__.cpython-311.pyc differ
diff --git a/src/parsers/__pycache__/__init__.cpython-313.pyc b/src/parsers/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a15dd1f751a12d6f5b7ff993742bb1678d571ec6
Binary files /dev/null and b/src/parsers/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/parsers/__pycache__/bib_parser.cpython-311.pyc b/src/parsers/__pycache__/bib_parser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4282597b28154a0a8adb3ce685336f6c9d67131
Binary files /dev/null and b/src/parsers/__pycache__/bib_parser.cpython-311.pyc differ
diff --git a/src/parsers/__pycache__/bib_parser.cpython-313.pyc b/src/parsers/__pycache__/bib_parser.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b404d8ab5625452ced6d60b2ddd63a2c1603767
Binary files /dev/null and b/src/parsers/__pycache__/bib_parser.cpython-313.pyc differ
diff --git a/src/parsers/__pycache__/tex_parser.cpython-313.pyc b/src/parsers/__pycache__/tex_parser.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2fd788571f2145b5b053a9de4091a48329fa1e9
Binary files /dev/null and b/src/parsers/__pycache__/tex_parser.cpython-313.pyc differ
diff --git a/src/parsers/bib_parser.py b/src/parsers/bib_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..48853ed09ed811d0c500a5e5790fbbdb86a270d4
--- /dev/null
+++ b/src/parsers/bib_parser.py
@@ -0,0 +1,273 @@
+"""
+BibTeX file parser.
+"""
+import re
+from dataclasses import dataclass, field
+from typing import Optional
+from pathlib import Path
+
+import bibtexparser
+from bibtexparser.bparser import BibTexParser
+from bibtexparser.customization import convert_to_unicode
+
+
@dataclass
class BibEntry:
    """A single parsed bibliography entry with its commonly used fields."""
    key: str  # citation key
    entry_type: str  # e.g. article, inproceedings
    title: str = ""
    author: str = ""
    year: str = ""
    abstract: str = ""
    url: str = ""
    doi: str = ""
    arxiv_id: str = ""  # filled in by BibParser when an arXiv ID is detected
    journal: str = ""
    booktitle: str = ""
    publisher: str = ""
    pages: str = ""
    volume: str = ""
    number: str = ""
    raw_entry: dict = field(default_factory=dict)  # original parsed fields

    @property
    def has_arxiv(self) -> bool:
        """True when an arXiv identifier was extracted for this entry."""
        return self.arxiv_id != ""

    @property
    def search_query(self) -> str:
        """Query string for online lookups: the title, or the key as fallback."""
        if self.title:
            return self.title
        return self.key
+
+
class BibParser:
    """Parser for .bib files.

    Wraps bibtexparser for reading entries and adds arXiv-ID extraction plus
    a formatting-preserving filter for writing reduced .bib files.
    """

    # Patterns for extracting arXiv IDs
    # NOTE(review): the two "arXiv:"-prefixed patterns are effectively
    # redundant — the bare patterns above them already match the ID inside an
    # "arXiv:..." string and capture the same group. Confirm before pruning.
    ARXIV_PATTERNS = [
        # New format: 2301.00001 or 2301.00001v1
        r'(\d{4}\.\d{4,5}(?:v\d+)?)',
        # Old format: hep-th/9901001 or math.GT/0309136
        r'([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
        # arXiv: prefix
        r'arXiv:(\d{4}\.\d{4,5}(?:v\d+)?)',
        r'arXiv:([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
    ]

    # URL patterns for arXiv (abs and pdf links, new and old ID formats)
    ARXIV_URL_PATTERNS = [
        r'arxiv\.org/abs/(\d{4}\.\d{4,5}(?:v\d+)?)',
        r'arxiv\.org/abs/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
        r'arxiv\.org/pdf/(\d{4}\.\d{4,5}(?:v\d+)?)(?:\.pdf)?',
        r'arxiv\.org/pdf/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)(?:\.pdf)?',
    ]

    def __init__(self):
        # Entries from the most recent parse_file/parse_content call
        self.entries: list[BibEntry] = []

    def parse_file(self, filepath: str) -> list[BibEntry]:
        """Parse a .bib file and return list of entries.

        Raises:
            FileNotFoundError: if filepath does not exist.
            ValueError: if the content cannot be parsed (via parse_content).
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"Bib file not found: {filepath}")

        # errors='replace' keeps parsing alive on files with broken encodings
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        return self.parse_content(content)

    def parse_content(self, content: str) -> list[BibEntry]:
        """Parse bib content string; stores and returns the entry list."""
        parser = BibTexParser(common_strings=True)
        # Fold LaTeX escapes (\'e etc.) into plain unicode text
        parser.customization = convert_to_unicode

        try:
            bib_database = bibtexparser.loads(content, parser=parser)
        except Exception as e:
            raise ValueError(f"Failed to parse bib content: {e}")

        self.entries = []
        for entry in bib_database.entries:
            bib_entry = self._convert_entry(entry)
            self.entries.append(bib_entry)

        return self.entries

    def _convert_entry(self, entry: dict) -> BibEntry:
        """Convert a bibtexparser entry to BibEntry."""
        # Extract basic fields (bibtexparser uses 'ID' and 'ENTRYTYPE'
        # for the key and the @type; all other keys are lowercased fields)
        bib_entry = BibEntry(
            key=entry.get('ID', ''),
            entry_type=entry.get('ENTRYTYPE', ''),
            title=entry.get('title', ''),
            author=entry.get('author', ''),
            year=entry.get('year', ''),
            abstract=entry.get('abstract', ''),
            url=entry.get('url', ''),
            doi=entry.get('doi', ''),
            journal=entry.get('journal', ''),
            booktitle=entry.get('booktitle', ''),
            publisher=entry.get('publisher', ''),
            pages=entry.get('pages', ''),
            volume=entry.get('volume', ''),
            number=entry.get('number', ''),
            raw_entry=entry.copy()
        )

        # Extract arXiv ID
        bib_entry.arxiv_id = self._extract_arxiv_id(entry)

        return bib_entry

    def _extract_arxiv_id(self, entry: dict) -> str:
        """Extract arXiv ID from entry, checking fields in priority order:
        eprint, arxiv, url, journal (only when it mentions arXiv), note."""
        # Check eprint field first
        eprint = entry.get('eprint', '')
        if eprint:
            arxiv_id = self._parse_arxiv_id(eprint)
            if arxiv_id:
                return arxiv_id

        # Check arxiv field
        arxiv = entry.get('arxiv', '')
        if arxiv:
            arxiv_id = self._parse_arxiv_id(arxiv)
            if arxiv_id:
                return arxiv_id

        # Check URL field
        url = entry.get('url', '')
        if url:
            for pattern in self.ARXIV_URL_PATTERNS:
                match = re.search(pattern, url, re.IGNORECASE)
                if match:
                    return match.group(1)

        # Check journal field for "arXiv preprint arXiv:XXXX.XXXXX" format
        journal = entry.get('journal', '')
        if journal and 'arxiv' in journal.lower():
            arxiv_id = self._parse_arxiv_id(journal)
            if arxiv_id:
                return arxiv_id

        # Check note field
        # NOTE(review): unlike the journal check, this is not guarded by an
        # 'arxiv' substring test, so any bare "2301.12345"-looking number in
        # a note is picked up as an arXiv ID — confirm this is intended.
        note = entry.get('note', '')
        if note:
            arxiv_id = self._parse_arxiv_id(note)
            if arxiv_id:
                return arxiv_id

        return ""

    def _parse_arxiv_id(self, text: str) -> str:
        """Parse arXiv ID from text; returns "" when no pattern matches."""
        for pattern in self.ARXIV_PATTERNS:
            match = re.search(pattern, text)
            if match:
                return match.group(1)
        return ""

    def get_entry_by_key(self, key: str) -> Optional[BibEntry]:
        """Get entry by citation key (linear scan; None when absent)."""
        for entry in self.entries:
            if entry.key == key:
                return entry
        return None

    def filter_file(self, input_path: str, output_path: str, keys_to_keep: set[str]):
        """
        Create a new bib file containing only specified keys.
        Preserves original formatting, comments, and strings.
        """
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()

        filtered_content = self._filter_content(content, keys_to_keep)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(filtered_content)

    def _filter_content(self, content: str, keys_to_keep: set[str]) -> str:
        """Filter content string keeping only specified keys.

        Scans for '@type{...}' spans with a brace-matching pass (so the bib
        text is never re-serialized), records the spans whose key is not in
        keys_to_keep, then splices the remaining text back together.
        """
        ranges_to_remove = []
        i = 0
        length = len(content)

        while i < length:
            if content[i] == '@':
                start = i
                # Find opening brace
                brace_open = content.find('{', i)
                if brace_open == -1:
                    i += 1
                    continue

                # Get entry type
                entry_type = content[i+1:brace_open].strip().lower()

                # Skip comments
                if entry_type == 'comment':
                    i = brace_open + 1
                    continue

                # Find matching closing brace to determine entry end
                balance = 1
                j = brace_open + 1
                in_quote = False

                while j < length and balance > 0:
                    char = content[j]

                    # Handle escaped characters: skip the char after '\'
                    # so escaped braces/quotes don't affect the balance
                    if char == '\\':
                        j += 2
                        continue

                    if char == '"':
                        in_quote = not in_quote
                    elif not in_quote:
                        if char == '{':
                            balance += 1
                        elif char == '}':
                            balance -= 1
                    j += 1

                end = j

                # Extract key (between { and ,)
                # Only for standard entries, not @string or @preamble
                if entry_type not in ('string', 'preamble'):
                    # Find comma or end of entry
                    # Key is usually the first token after {
                    key_part = content[brace_open+1:end]
                    comma_pos = key_part.find(',')

                    if comma_pos != -1:
                        key = key_part[:comma_pos].strip()

                        # If key is NOT in keep list, mark for removal.
                        # NOTE(review): field-less entries like '@misc{key}'
                        # have no comma and are always kept — confirm intended.
                        if key not in keys_to_keep:
                            ranges_to_remove.append((start, end))

                i = end
            else:
                i += 1

        # Reconstruct content by concatenating the spans between removals
        new_content = []
        last_pos = 0
        for start, end in ranges_to_remove:
            new_content.append(content[last_pos:start])

            # Clean up whitespace after removed entry (trailing spaces plus
            # at most one newline, so surrounding blank lines survive)
            last_pos = end
            while last_pos < length and content[last_pos] in ' \t\r':
                last_pos += 1
            if last_pos < length and content[last_pos] == '\n':
                last_pos += 1

        new_content.append(content[last_pos:])
        return "".join(new_content)
+
diff --git a/src/parsers/tex_parser.py b/src/parsers/tex_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f487b9d207636914ca92f5efcb1f98223b13272
--- /dev/null
+++ b/src/parsers/tex_parser.py
@@ -0,0 +1,200 @@
+"""
+LaTeX file parser for citation extraction.
+"""
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+
@dataclass
class CitationContext:
    """Represents a citation with its context.

    One instance is created per occurrence of a citation key in a .tex
    file, capturing where it appeared and the surrounding text.
    """
    key: str  # citation key as written inside the \cite{...} argument
    line_number: int  # line number in the source .tex file
    command: str  # e.g., \cite, \citep, \citet
    context_before: str  # Text before citation
    context_after: str  # Text after citation
    full_context: str  # Full surrounding context
    raw_line: str  # The raw line containing the citation
    file_path: Optional[str] = None  # originating .tex file, if known (added later)
+
+
+class TexParser:
+ """Parser for .tex files."""
+
    # Citation command patterns.
    # NOTE(review): CITE_REGEX below is the compiled catch-all actually used
    # for matching; confirm whether CITE_PATTERNS is still referenced anywhere.
    CITE_PATTERNS = [
        # Standard citation commands (\cite, \citep, \citet, \citeauthor, ...)
        r'\\cite(?:p|t|alp|alt|author|year|yearpar)?\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # natbib commands, allowing up to two optional [...] arguments
        r'\\citep?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\citet?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        # biblatex commands (\autocite, \textcite, \parencite, \footcite, ...)
        r'\\(?:auto|text|paren|foot|super)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        r'\\(?:full|short)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
    ]

    # Compiled pattern for finding any citation: group 1 is the command name
    # (anything starting with "cite"), group 2 is the brace-delimited key
    # list; tolerates an optional star and up to two [...] arguments.
    CITE_REGEX = re.compile(
        r'\\(cite[a-z]*)\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
        re.IGNORECASE
    )
+
    def __init__(self):
        """Create an empty parser; state is populated by parse_file/parse_content."""
        # citation key -> all contexts in which that key is cited
        self.citations: dict[str, list[CitationContext]] = {}
        # every citation key seen in the document
        self.all_keys: set[str] = set()
        # raw lines of the most recently parsed content
        self.lines: list[str] = []
        # full text of the most recently parsed content
        self.content: str = ""
        # path of the file currently being parsed, if any
        self.current_filepath: Optional[str] = None
+
+ def parse_file(self, filepath: str) -> dict[str, list[CitationContext]]:
+ """Parse a .tex file and extract all citations."""
+ path = Path(filepath)
+ if not path.exists():
+ raise FileNotFoundError(f"TeX file not found: {filepath}")
+
+ with open(path, 'r', encoding='utf-8', errors='replace') as f:
+ content = f.read()
+
+ self.current_filepath = filepath
+ return self.parse_content(content)
+
+ def parse_content(self, content: str) -> dict[str, list[CitationContext]]:
+ """Parse tex content and extract citations."""
+ self.content = content
+ self.lines = content.split('\n')
+ self.citations = {}
+ self.all_keys = set()
+
+ # Remove comments
+ content_no_comments = self._remove_comments(content)
+
+ # Find all citations line by line
+ for line_num, line in enumerate(self.lines, 1):
+ # Skip comment lines
+ if line.strip().startswith('%'):
+ continue
+
+ # Remove inline comments for matching
+ line_no_comment = re.sub(r'(? str:
+ """Remove LaTeX comments from content."""
+ # Remove line comments (but keep escaped %)
+ lines = content.split('\n')
+ cleaned = []
+ for line in lines:
+ # Remove inline comments
+ result = re.sub(r'(? dict:
+ """Extract surrounding context for a citation (sentences)."""
+ # Get a larger window of lines first to ensure we capture full sentences
+ start_line = max(0, line_num - 10)
+ end_line = min(len(self.lines), line_num + 10)
+
+ # Combine lines into a single text block
+ raw_block = ' '.join(self.lines[start_line:end_line])
+
+ # Clean the block first to make sentence splitting easier
+ clean_block = self._clean_text(raw_block)
+
+ # Find the citation in the clean block (approximation)
+ # Since we cleaned the text, we can't find the exact \cite command easily.
+ # Instead, we'll use the raw lines to find the citation index, then map to clean text.
+ # However, a simpler approach for LLM context is to just return the cleaned text
+ # centered around the line.
+
+ # Better approach:
+ # 1. Get the raw line content
+ current_raw_line = self.lines[line_num - 1]
+
+ # 2. Get surrounding lines
+ before_lines = self.lines[start_line:line_num - 1]
+ after_lines = self.lines[line_num:end_line]
+
+ # 3. Clean everything
+ current_clean = self._clean_text(current_raw_line)
+ before_clean = self._clean_text(' '.join(before_lines))
+ after_clean = self._clean_text(' '.join(after_lines))
+
+ # 4. Split into sentences (simple splitting by .!?)
+ def split_sentences(text):
+ return re.split(r'(?<=[.!?])\s+', text)
+
+ before_sentences = split_sentences(before_clean)
+ after_sentences = split_sentences(after_clean)
+
+ # Take last N sentences from before
+ context_before = ' '.join(before_sentences[-context_sentences:]) if before_sentences else ""
+
+ # Take first N sentences from after
+ context_after = ' '.join(after_sentences[:context_sentences]) if after_sentences else ""
+
+ # Combine
+ full_context = f"{context_before} {current_clean} {context_after}".strip()
+
+ return {
+ 'before': context_before,
+ 'after': context_after,
+ 'full': full_context
+ }
+
+ def _clean_text(self, text: str) -> str:
+ """Clean LaTeX text for readability."""
+ # Remove common LaTeX commands but keep text content
+ text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])*\s*', ' ', text)
+ # Remove braces
+ text = re.sub(r'[{}]', '', text)
+ # Normalize whitespace
+ text = re.sub(r'\s+', ' ', text)
+ return text.strip()
+
    def is_cited(self, key: str) -> bool:
        """Check if a key is cited in the document."""
        return key in self.all_keys

    def get_citation_contexts(self, key: str) -> list[CitationContext]:
        """Get all citation contexts for a key (empty list if never cited)."""
        return self.citations.get(key, [])

    def get_all_cited_keys(self) -> set[str]:
        """Get all citation keys found in the document (returns a copy)."""
        return self.all_keys.copy()
diff --git a/src/report/__init__.py b/src/report/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..078cdff2c97c5e94121741baa5a0a270660d45ba
--- /dev/null
+++ b/src/report/__init__.py
@@ -0,0 +1,4 @@
+"""Report package"""
+from .generator import ReportGenerator
+
+__all__ = ['ReportGenerator']
diff --git a/src/report/__pycache__/__init__.cpython-313.pyc b/src/report/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bec69e5de0fd5315c6a0c642dc565d685207de90
Binary files /dev/null and b/src/report/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/report/__pycache__/generator.cpython-313.pyc b/src/report/__pycache__/generator.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e7df3494844c80e1581b16a5359eba1bd6653a2
Binary files /dev/null and b/src/report/__pycache__/generator.cpython-313.pyc differ
diff --git a/src/report/__pycache__/line_report.cpython-313.pyc b/src/report/__pycache__/line_report.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38464dba5318b23773c196083cfaece9c6dcd917
Binary files /dev/null and b/src/report/__pycache__/line_report.cpython-313.pyc differ
diff --git a/src/report/generator.py b/src/report/generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..73fe74c12642a85d0b6e1a1b04122daebeb1815c
--- /dev/null
+++ b/src/report/generator.py
@@ -0,0 +1,785 @@
+"""
+Report generator for bibliography check results.
+"""
+import re
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Optional, List
+from pathlib import Path
+
+from ..parsers.bib_parser import BibEntry
+from ..analyzers.metadata_comparator import ComparisonResult
+from ..analyzers.usage_checker import UsageResult
+from ..analyzers.llm_evaluator import EvaluationResult
+from ..analyzers.duplicate_detector import DuplicateGroup
+from ..checkers.base import CheckResult, CheckSeverity
+
+
@dataclass
class EntryReport:
    """Complete report for a single bib entry.

    Aggregates the results of every per-entry check that was run.
    """
    entry: BibEntry  # the parsed bibliography entry
    comparison: Optional[ComparisonResult]  # metadata check result (None if not run)
    usage: Optional[UsageResult]  # usage check result (None if not run)
    evaluations: list[EvaluationResult]  # LLM relevance evaluations (may be empty)
+
+
+class ReportGenerator:
+ """Generates formatted markdown reports."""
+
    def __init__(self, minimal_verified: bool = False, check_preprint_ratio: bool = True, preprint_warning_threshold: float = 0.50):
        """Initialize an empty report generator.

        Args:
            minimal_verified: show minimal info for verified entries.
            check_preprint_ratio: whether to compute the used-preprint ratio.
            preprint_warning_threshold: ratio above which a warning is shown.
        """
        self.entries: list[EntryReport] = []
        self.missing_citations: list[str] = []
        self.duplicate_groups: list[DuplicateGroup] | None = None  # None means check not run
        self.bib_files: list[str] = []
        self.tex_files: list[str] = []
        self.bib_file: str = ""  # Keep for backward compatibility/single file
        self.tex_file: str = ""  # Keep for backward compatibility/single file
        self.minimal_verified = minimal_verified  # Whether to show minimal info for verified entries
        self.submission_results: List[CheckResult] = []  # Submission quality check results
        self.template = None  # Conference template if used
        self.check_preprint_ratio = check_preprint_ratio  # Whether to check preprint ratio
        self.preprint_warning_threshold = preprint_warning_threshold  # Threshold for preprint warning
+
+
+ def add_entry_report(self, report: EntryReport):
+ """Add an entry report."""
+ self.entries.append(report)
+
+ def set_metadata(self, bib_files: str | list[str], tex_files: str | list[str]):
+ """Set source file information."""
+ if isinstance(bib_files, str):
+ self.bib_files = [bib_files]
+ self.bib_file = bib_files
+ else:
+ self.bib_files = bib_files
+ self.bib_file = bib_files[0] if bib_files else ""
+
+ if isinstance(tex_files, str):
+ self.tex_files = [tex_files]
+ self.tex_file = tex_files
+ else:
+ self.tex_files = tex_files
+ self.tex_file = tex_files[0] if tex_files else ""
+
+ def set_missing_citations(self, missing: list[str]):
+ """Set list of citations without bib entries."""
+ self.missing_citations = missing
+
+ def set_duplicate_groups(self, groups: list[DuplicateGroup]):
+ """Set list of duplicate entry groups."""
+ self.duplicate_groups = groups
+
+ def set_submission_results(self, results: List[CheckResult], template=None):
+ """Set submission quality check results."""
+ self.submission_results = results
+ self.template = template
+
+ def generate(self) -> str:
+ """Generate the full markdown report."""
+ lines = []
+
+ # Header
+ lines.extend(self._generate_header())
+ lines.append("")
+
+ # Disclaimer
+ lines.extend(self._generate_disclaimer())
+ lines.append("")
+
+ # Summary statistics
+ lines.extend(self._generate_summary())
+ lines.append("")
+
+ # ⚠️ Critical Issues (Detailed) - Bibliography-related issues
+ lines.extend(self._generate_issues_section())
+ lines.append("")
+
+ # ✅ Verified Entries (Clean)
+ lines.extend(self._generate_verified_section())
+ lines.append("")
+
+ # 📋 Submission Quality Checks (LaTeX quality checks)
+ if self.submission_results:
+ lines.extend(self._generate_submission_section())
+ lines.append("")
+
+ # Footer
+ lines.extend(self._generate_footer())
+
+ return "\n".join(lines)
+
+ def get_summary_stats(self) -> tuple[dict, dict]:
+ """Get summary statistics as dictionaries for console display (Issues only)."""
+ total = len(self.entries)
+
+ # Bibliography issues breakdown
+ title_mismatches = 0
+ author_mismatches = 0
+ year_mismatches = 0
+ low_relevance = 0
+ unable_to_verify = 0
+
+ for e in self.entries:
+ # Metadata issues
+ if e.comparison:
+ if e.comparison.has_issues:
+ # Categorize issues
+ has_title = False
+ has_author = False
+ has_year = False
+
+ for issue in e.comparison.issues:
+ if "Title mismatch" in issue: has_title = True
+ elif "Author mismatch" in issue: has_author = True
+ elif "Year mismatch" in issue: has_year = True
+ elif "Unable to find" in issue: unable_to_verify += 1
+
+ if has_title: title_mismatches += 1
+ if has_author: author_mismatches += 1
+ if has_year: year_mismatches += 1
+
+ # Relevance issues
+ if any(ev.relevance_score <= 2 for ev in e.evaluations):
+ low_relevance += 1
+
+ bib_stats = {}
+ if title_mismatches > 0: bib_stats["Title Mismatches"] = title_mismatches
+ if author_mismatches > 0: bib_stats["Author Mismatches"] = author_mismatches
+ if year_mismatches > 0: bib_stats["Year Mismatches"] = year_mismatches
+ if low_relevance > 0: bib_stats["Low Relevance"] = low_relevance
+ if unable_to_verify > 0: bib_stats["Unable to Verify"] = unable_to_verify
+
+ if self.duplicate_groups:
+ bib_stats["Duplicate Groups"] = len(self.duplicate_groups)
+
+ if self.missing_citations:
+ bib_stats["Missing Bib Entries"] = len(self.missing_citations)
+
+ unused = [e for e in self.entries if e.usage and not e.usage.is_used]
+ if unused:
+ bib_stats["Unused Entries"] = len(unused)
+
+ # LaTeX stats - Group by precise Rule Names
+ latex_stats = {}
+
+ # Rule mapping for professional display names
+ RULE_MAPPING = {
+ "Very long sentence": "Sentence Length (Critical)",
+ "Long sentence": "Sentence Length (Warning)",
+ "Possible Markdown bullet point": "Markdown Bullet Point",
+ "Possible Markdown numbered list": "Markdown Numbered List",
+ "Possible Markdown italic": "Markdown Italic",
+ "Possible Markdown bold": "Markdown Bold",
+ "Inconsistent hyphenation": "Hyphenation Inconsistency",
+ "Inconsistent spelling": "Spelling Inconsistency",
+ "Unreferenced figure": "Unreferenced Figure",
+ "Unreferenced table": "Unreferenced Table",
+ "Unreferenced section": "Unreferenced Section",
+ "Unreferenced label": "Unreferenced Label",
+ "Multiple blank lines": "Multiple Blank Lines",
+ "Citation from": "Old Citation (10+ years)",
+ "Hedging language": "Hedging/Vague Language",
+ "Redundant phrase": "Redundant Phrasing",
+ "Weak start with": "Weak Sentence Starter",
+ "Unescaped &": "Unescaped Special Character",
+ "Citation without non-breaking space": "Missing Non-breaking Space (~)",
+ "Mixed citation styles": "Mixed Citation Styles",
+ "Mixed inline math": "Mixed Math Notation",
+ "Appendix section": "Unreferenced Appendix",
+ "Missing space before unit": "Unit Spacing Issue"
+ }
+
+ for r in self.submission_results:
+ if r.passed:
+ continue
+
+ raw_msg = r.message
+ rule_name = "Unknown Rule"
+
+ # Match against our professional rule names
+ matched = False
+ for pattern, official_name in RULE_MAPPING.items():
+ if pattern in raw_msg:
+ rule_name = official_name
+ matched = True
+ break
+
+ if not matched:
+ # Fallback: Clean the message (remove dynamic parts)
+ clean_msg = re.sub(r"\(.*?\)", "", raw_msg)
+ clean_msg = re.sub(r"'.*?'", "", clean_msg)
+ clean_msg = re.sub(r"\d+", "", clean_msg)
+ rule_name = clean_msg.split(":")[0].strip()
+
+ if rule_name not in latex_stats:
+ latex_stats[rule_name] = 0
+ latex_stats[rule_name] += 1
+
+ return bib_stats, latex_stats
+
+ def generate_console_output(self) -> str:
+ """Generate console-friendly output (Summary + Issues only)."""
+ lines = []
+
+ # Summary statistics
+ lines.extend(self._generate_summary())
+ lines.append("")
+
+ # Critical Issues
+ lines.extend(self._generate_issues_section())
+ lines.append("")
+
+ return "\n".join(lines)
+
+ def _generate_header(self) -> list[str]:
+ """Generate report header."""
+ bib_names = ", ".join([f"`{Path(f).name}`" for f in self.bib_files]) if self.bib_files else "N/A"
+ tex_names = ", ".join([f"`{Path(f).name}`" for f in self.tex_files]) if self.tex_files else "N/A"
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+ return [
+ "# Bibliography Validation Report",
+ "",
+ f"**Generated:** {timestamp}",
+ "",
+ "| File Type | Filename |",
+ "|-----------|----------|",
+ f"| **Bib File(s)** | {bib_names} |",
+ f"| **TeX File(s)** | {tex_names} |"
+ ]
+
+ def _generate_disclaimer(self) -> list[str]:
+ """Generate disclaimer section."""
+ return [
+ "> **⚠️ Disclaimer:** This report is generated by an automated tool. While BibGuard strives for accuracy, it may produce false positives or miss certain issues. **This tool cannot replace human review.** Please manually verify all reported issues before making changes to your bibliography."
+ ]
+
+ def _generate_summary(self) -> list[str]:
+ """Generate summary statistics."""
+ total = len(self.entries)
+
+ # Check availability of results
+ has_metadata = any(e.comparison is not None for e in self.entries)
+ has_usage = any(e.usage is not None for e in self.entries)
+ has_eval = any(len(e.evaluations) > 0 for e in self.entries)
+
+ # Calculate Verified/Issues
+ # Note: _is_verified depends on _has_issues.
+ # If a check wasn't run, it won't contribute to issues.
+ verified = sum(1 for e in self.entries if self._is_verified(e))
+ issues = sum(1 for e in self.entries if self._has_issues(e))
+
+ # Usage stats
+ if has_usage:
+ used = sum(1 for e in self.entries if e.usage and e.usage.is_used)
+ unused = total - used
+ used_str = str(used)
+ unused_str = str(unused)
+ missing_str = str(len(self.missing_citations))
+ else:
+ used_str = "N/A"
+ unused_str = "N/A"
+ missing_str = "N/A"
+
+ # Duplicate stats - show N/A if check wasn't run (duplicate_groups is None means not checked)
+ if self.duplicate_groups is None:
+ dup_str = "N/A"
+ else:
+ dup_str = str(len(self.duplicate_groups))
+
+ # Preprint detection (only if enabled)
+ preprint_str = "N/A"
+ preprint_warning = []
+ if self.check_preprint_ratio and has_usage:
+ used_entries = [e for e in self.entries if e.usage and e.usage.is_used]
+ if used_entries:
+ preprint_count = sum(1 for e in used_entries if self._is_preprint(e.entry))
+ preprint_ratio = preprint_count / len(used_entries)
+ preprint_str = f"{preprint_count} ({preprint_ratio:.1%})"
+
+ # Warning if exceeds threshold
+ if preprint_ratio > self.preprint_warning_threshold:
+ preprint_warning = [
+ "",
+ f"> ⚠️ **High Preprint Ratio Warning:** {preprint_ratio:.1%} of your used references are preprints (arXiv, bioRxiv, etc.). Consider replacing some with peer-reviewed publications if available."
+ ]
+
+ summary_lines = [
+ "## 📊 Summary",
+ "",
+ "### 📚 Bibliography Statistics",
+ "",
+ "| Metric | Count |",
+ "|--------|-------|",
+ f"| **Total Entries** | {total} |",
+ f"| ✅ **Verified (Clean)** | {verified} |",
+ f"| ⚠️ **With Issues** | {issues} |",
+ f"| 📝 **Used in TeX** | {used_str} |",
+ f"| 🗑️ **Unused** | {unused_str} |",
+ f"| 🔄 **Duplicate Groups** | {dup_str} |",
+ f"| ❌ **Missing Bib Entries** | {missing_str} |",
+ f"| 📄 **Preprints (Used)** | {preprint_str} |",
+ ]
+
+ # Add warning if needed
+ if preprint_warning:
+ summary_lines.extend(preprint_warning)
+
+ summary_lines.extend([
+ "",
+ "### 📋 LaTeX Quality Checks",
+ "",
+ self._get_submission_summary()
+ ])
+
+ return summary_lines
+
+ def _is_preprint(self, entry: BibEntry) -> bool:
+ """Check if an entry is a preprint."""
+ # Preprint indicators
+ preprint_keywords = [
+ 'arxiv', 'biorxiv', 'medrxiv', 'ssrn', 'preprint',
+ 'openreview', 'techreport', 'technical report', 'working paper',
+ 'tech report', 'tech. report'
+ ]
+
+ # Check entry type
+ if entry.entry_type.lower() in ['techreport', 'unpublished', 'misc']:
+ # Further check if it's actually a preprint
+ text_to_check = ' '.join([
+ entry.journal.lower(),
+ entry.booktitle.lower(),
+ entry.publisher.lower(),
+ entry.entry_type.lower()
+ ])
+
+ if any(keyword in text_to_check for keyword in preprint_keywords):
+ return True
+
+ # Check if arXiv ID exists
+ if entry.has_arxiv:
+ return True
+
+ # Check journal/booktitle/publisher fields
+ venue_text = ' '.join([
+ entry.journal.lower(),
+ entry.booktitle.lower(),
+ entry.publisher.lower()
+ ])
+
+ return any(keyword in venue_text for keyword in preprint_keywords)
+
+ def _get_submission_summary(self) -> str:
+ """Generate submission quality summary table."""
+ if not self.submission_results:
+ return "*No quality checks were performed.*"
+
+ # Count by severity
+ error_count = sum(1 for r in self.submission_results if r.severity == CheckSeverity.ERROR)
+ warning_count = sum(1 for r in self.submission_results if r.severity == CheckSeverity.WARNING)
+ info_count = sum(1 for r in self.submission_results if r.severity == CheckSeverity.INFO)
+
+ lines = [
+ "| Severity | Count |",
+ "|----------|-------|",
+ f"| 🔴 **Errors** | {error_count} |",
+ f"| 🟡 **Warnings** | {warning_count} |",
+ f"| 🔵 **Suggestions** | {info_count} |"
+ ]
+ return "\n".join(lines)
+
+ def _is_verified(self, entry: EntryReport) -> bool:
+ """Check if entry is clean (no issues)."""
+ return not self._has_issues(entry)
+
+ def _has_issues(self, entry: EntryReport) -> bool:
+ """Check if entry has any issues."""
+ # Metadata issues
+ if entry.comparison and entry.comparison.has_issues:
+ return True
+ # LLM issues (low relevance)
+ if any(ev.relevance_score <= 2 for ev in entry.evaluations):
+ return True
+ # NOTE: We don't include usage issues (unused) here because
+ # unused entries are already shown in the "Unused Entries" section
+ return False
+
+ def _has_metadata_or_relevance_issues(self, entry: EntryReport) -> bool:
+ """Check if entry has metadata or relevance issues (excluding duplicate/unused)."""
+ # Metadata issues
+ if entry.comparison and entry.comparison.has_issues:
+ return True
+ # LLM issues (low relevance)
+ if any(ev.relevance_score <= 2 for ev in entry.evaluations):
+ return True
+ return False
+
    def _generate_issues_section(self) -> list[str]:
        """Generate detailed section for entries with issues.

        Order: missing bib entries, duplicate groups, unused entries, then
        metadata/relevance issues.  Emits a success line when none apply.
        """
        lines = ["## ⚠️ Critical Issues Detected", ""]

        has_any_issues = False

        # 1. Missing Citations: keys cited in TeX but absent from the .bib
        if self.missing_citations:
            has_any_issues = True
            lines.append("### ❌ Missing Bibliography Entries")
            lines.append("The following keys are cited in the TeX file but missing from the .bib file:")
            lines.append("")
            for key in self.missing_citations:
                lines.append(f"- `{key}`")
            lines.append("")

        # 2. Duplicate Entries: one table per near-duplicate group
        if self.duplicate_groups:
            has_any_issues = True
            lines.append("### 🔄 Duplicate Entries")
            for i, group in enumerate(self.duplicate_groups, 1):
                lines.append(f"#### Group {i} (Similarity: {group.similarity_score:.0%})")
                lines.append(f"**Reason:** {group.reason}")
                lines.append("")
                lines.append("| Key | Title | Year |")
                lines.append("|-----|-------|------|")
                for entry in group.entries:
                    lines.append(f"| `{entry.key}` | {entry.title} | {entry.year} |")
                lines.append("")

        # 3. Unused Entries: present in .bib but never cited
        unused = [e for e in self.entries if e.usage and not e.usage.is_used]
        if unused:
            has_any_issues = True
            lines.append("### 🗑️ Unused Entries")
            lines.append("The following entries are in the .bib file but NOT cited in the TeX file:")
            lines.append("")
            for e in unused:
                lines.append(f"- `{e.entry.key}`: *{e.entry.title}*")
            lines.append("")

        # 4. Metadata Mismatches & Low Relevance (per-entry detail blocks)
        issue_entries = [e for e in self.entries if self._has_metadata_or_relevance_issues(e)]

        if issue_entries:
            has_any_issues = True
            lines.append("### ⚠️ Metadata & Relevance Issues")

            for entry_report in issue_entries:
                lines.extend(self._format_entry_detail(entry_report, is_verified=False))

        if not has_any_issues:
            lines.append("🎉 **No critical issues found!**")

        return lines
+
+ def _generate_verified_section(self) -> list[str]:
+ """Generate section for verified entries."""
+ lines = ["## ✅ Verified Entries", ""]
+
+ verified = [e for e in self.entries if self._is_verified(e)]
+
+ if not verified:
+ lines.append("_No verified entries found._")
+ return lines
+
+ lines.append(f"Found **{len(verified)}** entries with correct metadata.")
+ lines.append("")
+
+ # Use a collapsible details block for clean UI
+ lines.append("")
+ lines.append("Click to view verified entries
")
+ lines.append("")
+
+ for entry_report in verified:
+ lines.extend(self._format_entry_detail(entry_report, minimal=self.minimal_verified, is_verified=True))
+
+ lines.append(" ")
+ return lines
+
    def _format_entry_detail(self, report: EntryReport, minimal: bool = False, is_verified: bool = False) -> list[str]:
        """Format a single entry report in Markdown.

        Args:
            report: the entry report to render.
            minimal: suppress discrepancy and relevance details.
            is_verified: selects the header icon (✅ vs ⚠️).

        Returns:
            Markdown lines for this entry, terminated by a "---" separator.
        """
        entry = report.entry
        comp = report.comparison
        lines = []

        # Title header - checkmark for verified entries, warning otherwise
        icon = "✅" if is_verified else "⚠️"
        lines.append(f"#### {icon} `{entry.key}`")
        lines.append(f"**Title:** {entry.title}")
        lines.append("")

        # Metadata Status (only when the comparison check was run)
        if comp:
            status_icon = "✅" if comp.is_match else "❌"
            lines.append(f"- **Metadata Status:** {status_icon} {comp.source.upper()} (Confidence: {comp.confidence:.1%})")

            if comp.has_issues and not minimal:
                lines.append("  - **Discrepancies:**")
                for issue in comp.issues:
                    # Mismatches get a red marker plus a bib-vs-fetched diff
                    if "Mismatch" in issue or "mismatch" in issue:
                        lines.append(f"    - 🔴 {issue}")
                        if "Title" in issue:
                            lines.append(f"      - **Bib:** `{comp.bib_title}`")
                            lines.append(f"      - **Fetched:** `{comp.fetched_title}`")
                        elif "Author" in issue:
                            lines.append(f"      - **Bib:** `{', '.join(comp.bib_authors)}`")
                            lines.append(f"      - **Fetched:** `{', '.join(comp.fetched_authors)}`")
                    else:
                        lines.append(f"    - 🔸 {issue}")

        # Relevance Status (LLM evaluations, skipped in minimal mode)
        if report.evaluations and not minimal:
            lines.append("- **Relevance Analysis:**")
            for eval_res in report.evaluations:
                # Green >= 4, yellow == 3, red <= 2
                score_icon = "🟢" if eval_res.relevance_score >= 4 else ("🟡" if eval_res.relevance_score == 3 else "🔴")
                lines.append(f"  - {score_icon} **Score {eval_res.relevance_score}/5** ({eval_res.score_label})")
                loc = []
                if eval_res.file_path:
                    loc.append(f"File: `{Path(eval_res.file_path).name}`")
                if eval_res.line_number:
                    loc.append(f"Line {eval_res.line_number}")
                if loc:
                    lines.append(f"    - {' | '.join(loc)}")
                lines.append(f"    - *\"{eval_res.explanation}\"*")

        lines.append("")
        lines.append("---")
        lines.append("")
        return lines
+
+ def _generate_submission_section(self) -> list[str]:
+ """Generate section for submission quality check results."""
+ lines = ["## 📋 Submission Quality Checks", ""]
+
+ # Template info
+ if self.template:
+ lines.append(f"**Conference Template:** {self.template.name}")
+ lines.append(f"**Page Limit:** {self.template.page_limit_review} (review) / {self.template.page_limit_camera} (camera-ready)")
+ if self.template.mandatory_sections:
+ lines.append(f"**Required Sections:** {', '.join(self.template.mandatory_sections)}")
+ lines.append("")
+
+ # Count by severity
+ errors = [r for r in self.submission_results if r.severity == CheckSeverity.ERROR and not r.passed]
+ warnings = [r for r in self.submission_results if r.severity == CheckSeverity.WARNING and not r.passed]
+ infos = [r for r in self.submission_results if r.severity == CheckSeverity.INFO and not r.passed]
+
+ # Summary
+ if errors or warnings or infos:
+ lines.append("| Severity | Count |")
+ lines.append("|----------|-------|")
+ if errors:
+ lines.append(f"| 🔴 **Errors** | {len(errors)} |")
+ if warnings:
+ lines.append(f"| 🟡 **Warnings** | {len(warnings)} |")
+ if infos:
+ lines.append(f"| 🔵 **Suggestions** | {len(infos)} |")
+ lines.append("")
+ else:
+ lines.append("🎉 **No submission issues found!**")
+ lines.append("")
+ return lines
+
+ # Group by checker
+ by_checker = {}
+ for result in self.submission_results:
+ if result.passed:
+ continue
+ if result.checker_name not in by_checker:
+ by_checker[result.checker_name] = []
+ by_checker[result.checker_name].append(result)
+
+ # Display errors first
+ if errors:
+ lines.append("### 🔴 Critical Errors")
+ lines.append("")
+ for result in errors:
+ lines.append(f"- **{result.message}**")
+ loc = []
+ if result.file_path:
+ loc.append(f"File: `{Path(result.file_path).name}`")
+ if result.line_number:
+ loc.append(f"Line {result.line_number}")
+ if loc:
+ lines.append(f" - {' | '.join(loc)}")
+ if result.line_content:
+ lines.append(f" - `{result.line_content[:80]}`")
+ if result.suggestion:
+ lines.append(f" - 💡 *{result.suggestion}*")
+ lines.append("")
+
+ # Display warnings
+ if warnings:
+ lines.append("### 🟡 Warnings")
+ lines.append("")
+ for result in warnings:
+ lines.append(f"- {result.message}")
+ loc = []
+ if result.file_path:
+ loc.append(f"File: `{Path(result.file_path).name}`")
+ if result.line_number:
+ loc.append(f"Line {result.line_number}")
+ if loc:
+ lines.append(f" - {' | '.join(loc)}")
+ if result.suggestion:
+ lines.append(f" - 💡 *{result.suggestion}*")
+ lines.append("")
+
+ # Display suggestions (collapsible)
+ if infos:
+ lines.append("### 🔵 Suggestions")
+ lines.append("")
+ lines.append("Click to view suggestions
")
+ lines.append("")
+ for result in infos:
+ lines.append(f"- {result.message}")
+ loc = []
+ if result.file_path:
+ loc.append(f"File: `{Path(result.file_path).name}`")
+ if result.line_number:
+ loc.append(f"Line {result.line_number}")
+ if loc:
+ lines.append(f" - {' | '.join(loc)}")
+ if result.suggestion:
+ lines.append(f" - 💡 *{result.suggestion}*")
+ lines.append("")
+ lines.append(" ")
+ lines.append("")
+
+ return lines
+
+ def _generate_footer(self) -> list[str]:
+ """Generate report footer."""
+ return [
+ "",
+ "---",
+ f"Report generated by **BibGuard** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+ ]
+
+ def save(self, filepath: str):
+ """Save report to file."""
+ content = self.generate()
+ with open(filepath, 'w', encoding='utf-8') as f:
+ f.write(content)
+
    def save_bibliography_report(self, filepath: str):
        """Generate and save bibliography-only report (all bib-related checks).

        Unlike generate(), this omits the LaTeX quality section entirely
        and writes the result straight to *filepath*.
        """
        lines = []

        # Header: title, timestamp, source-file table
        lines.append("# Bibliography Validation Report")
        lines.append("")
        lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        lines.append("")
        bib_names = ", ".join([f"`{Path(f).name}`" for f in self.bib_files]) if self.bib_files else "N/A"
        tex_names = ", ".join([f"`{Path(f).name}`" for f in self.tex_files]) if self.tex_files else "N/A"
        lines.append("| File Type | Filename |")
        lines.append("|-----------|----------|")
        lines.append(f"| **Bib File(s)** | {bib_names} |")
        lines.append(f"| **TeX File(s)** | {tex_names} |")
        lines.append("")

        # Disclaimer
        lines.extend(self._generate_disclaimer())
        lines.append("")

        # Summary - Bibliography only (no LaTeX quality table)
        total = len(self.entries)
        verified = sum(1 for e in self.entries if self._is_verified(e))
        issues = sum(1 for e in self.entries if self._has_issues(e))

        # Usage stats show N/A when the usage check never ran
        has_usage = any(e.usage is not None for e in self.entries)
        if has_usage:
            used = sum(1 for e in self.entries if e.usage and e.usage.is_used)
            unused = total - used
            used_str = str(used)
            unused_str = str(unused)
            missing_str = str(len(self.missing_citations))
        else:
            used_str = "N/A"
            unused_str = "N/A"
            missing_str = "N/A"

        # None means the duplicate check was not run at all
        if self.duplicate_groups is None:
            dup_str = "N/A"
        else:
            dup_str = str(len(self.duplicate_groups))

        lines.append("## 📊 Summary")
        lines.append("")
        lines.append("| Metric | Count |")
        lines.append("|--------|-------|")
        lines.append(f"| **Total Entries** | {total} |")
        lines.append(f"| ✅ **Verified (Clean)** | {verified} |")
        lines.append(f"| ⚠️ **With Issues** | {issues} |")
        lines.append(f"| 📝 **Used in TeX** | {used_str} |")
        lines.append(f"| 🗑️ **Unused** | {unused_str} |")
        lines.append(f"| 🔄 **Duplicate Groups** | {dup_str} |")
        lines.append(f"| ❌ **Missing Bib Entries** | {missing_str} |")
        lines.append("")

        # Issues section
        lines.extend(self._generate_issues_section())
        lines.append("")

        # Verified entries
        lines.extend(self._generate_verified_section())
        lines.append("")

        # Footer
        lines.extend(self._generate_footer())

        content = "\n".join(lines)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
+
    def save_latex_quality_report(self, filepath: str, submission_results: List[CheckResult], template=None):
        """Generate and save LaTeX quality report (all tex-related quality checks).

        NOTE: this method also overwrites self.submission_results and
        self.template with the given arguments before rendering.
        """
        lines = []

        # Header: title, timestamp, tex file list, optional template name
        lines.append("# LaTeX Quality Report")
        lines.append("")
        lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        lines.append("")
        tex_names = ", ".join([f"`{Path(f).name}`" for f in self.tex_files]) if self.tex_files else "N/A"
        lines.append(f"**TeX File(s):** {tex_names}")
        lines.append("")

        if template:
            lines.append(f"**Template:** {template.name}")
            lines.append("")

        # Disclaimer
        lines.append("> **⚠️ Note:** This report contains automated quality checks for your LaTeX document. Please review all suggestions carefully before making changes.")
        lines.append("")

        # Summary: counts over ALL results, regardless of pass/fail
        error_count = sum(1 for r in submission_results if r.severity == CheckSeverity.ERROR)
        warning_count = sum(1 for r in submission_results if r.severity == CheckSeverity.WARNING)
        info_count = sum(1 for r in submission_results if r.severity == CheckSeverity.INFO)

        lines.append("## 📊 Summary")
        lines.append("")
        lines.append("| Severity | Count |")
        lines.append("|----------|-------|")
        lines.append(f"| 🔴 **Errors** | {error_count} |")
        lines.append(f"| 🟡 **Warnings** | {warning_count} |")
        lines.append(f"| 🔵 **Suggestions** | {info_count} |")
        lines.append("")

        # Detailed issues (rendered via the shared submission section)
        self.submission_results = submission_results
        self.template = template
        lines.extend(self._generate_submission_section())
        lines.append("")

        # Footer
        lines.append("---")
        lines.append("")
        lines.append(f"Report generated by **BibGuard** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        content = "\n".join(lines)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
+
diff --git a/src/report/line_report.py b/src/report/line_report.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9bb3a84437284c284d6a21fa5ac2353c79705a1
--- /dev/null
+++ b/src/report/line_report.py
@@ -0,0 +1,254 @@
+"""
+Line-by-line report generator.
+
+Generates a report that follows the TeX file structure,
+showing issues in order of appearance in the document.
+"""
+import re
+from typing import List, Dict, Tuple, Optional
+from dataclasses import dataclass, field
+from collections import defaultdict
+from datetime import datetime
+from pathlib import Path
+
+from ..checkers.base import CheckResult, CheckSeverity
+
+
@dataclass
class LineIssue:
    """An issue associated with a specific line or range.

    Groups one or more check results with the 1-indexed line span they
    refer to, plus the raw text of that span.
    """
    start_line: int  # first line of the span (1-indexed, inclusive)
    end_line: int  # last line of the span (1-indexed, inclusive)
    line_content: str  # raw TeX text of the affected line(s)
    issues: List[CheckResult] = field(default_factory=list)  # results reported for this span
    block_type: Optional[str] = None  # 'figure', 'table', 'equation', etc.
+
+
class LineByLineReportGenerator:
    """
    Generates a report organized by TeX file line order.

    Groups consecutive lines and special environments into blocks,
    then outputs issues in the order they appear in the document.
    """

    # LaTeX environments that should be grouped as blocks
    BLOCK_ENVIRONMENTS = [
        'figure', 'figure*', 'table', 'table*', 'tabular', 'tabular*',
        'equation', 'equation*', 'align', 'align*', 'gather', 'gather*',
        'algorithm', 'algorithm2e', 'algorithmic', 'lstlisting',
        'verbatim', 'minted', 'tikzpicture', 'minipage', 'subfigure',
    ]

    def __init__(self, tex_content: str, tex_path: str):
        """Initialize from raw TeX source.

        Args:
            tex_content: Full contents of the TeX file.
            tex_path: Path to the TeX file (used for display only).
        """
        self.tex_content = tex_content
        self.tex_path = tex_path
        self.lines = tex_content.split('\n')
        # 1-indexed line number -> failed check results reported on that line.
        self.line_issues: Dict[int, List[CheckResult]] = defaultdict(list)
        self.blocks: List[Tuple[int, int, str]] = []  # (start, end, env_type)

        # Parse block environments
        self._parse_blocks()

    def _parse_blocks(self):
        """Find all block environments in the TeX content."""
        for env in self.BLOCK_ENVIRONMENTS:
            env_escaped = env.replace('*', r'\*')
            # DOTALL so multi-line environments match; non-greedy so sibling
            # environments of the same type are matched separately.
            pattern = re.compile(
                rf'\\begin\{{{env_escaped}\}}.*?\\end\{{{env_escaped}\}}',
                re.DOTALL
            )

            for match in pattern.finditer(self.tex_content):
                start_line = self._pos_to_line(match.start())
                end_line = self._pos_to_line(match.end())
                self.blocks.append((start_line, end_line, env))

        # Sort blocks by start line
        self.blocks.sort(key=lambda x: x[0])

    def _pos_to_line(self, pos: int) -> int:
        """Convert character position to line number (1-indexed)."""
        return self.tex_content[:pos].count('\n') + 1

    def add_results(self, results: List[CheckResult]):
        """Record failed check results, keyed by their 1-indexed line number.

        Results that passed, or that carry no positive line number, are ignored.
        """
        for result in results:
            if result.passed:
                continue

            line_num = result.line_number or 0
            if line_num > 0:
                self.line_issues[line_num].append(result)

    def _get_block_for_line(self, line_num: int) -> Optional[Tuple[int, int, str]]:
        """Return the first block (lowest start line) containing line_num, if any."""
        for start, end, env_type in self.blocks:
            if start <= line_num <= end:
                return (start, end, env_type)
        return None

    def _get_block_content(self, start: int, end: int) -> str:
        """Get content for a block of lines, truncating very long blocks."""
        block_lines = self.lines[start-1:end]
        if len(block_lines) > 10:
            # Truncate long blocks: first 5 lines, ellipsis, last 3 lines.
            return '\n'.join(block_lines[:5]) + '\n [...]\n' + '\n'.join(block_lines[-3:])
        return '\n'.join(block_lines)

    def _severity_icon(self, severity: CheckSeverity) -> str:
        """Get icon for severity level (white circle for unknown severities)."""
        icons = {
            CheckSeverity.ERROR: '🔴',
            CheckSeverity.WARNING: '🟡',
            CheckSeverity.INFO: '🔵',
        }
        return icons.get(severity, '⚪')

    def _format_issues(self, issues: List[CheckResult]) -> List[str]:
        """Render issues as Markdown bullets, each with an optional suggestion.

        Shared by the block-environment and single-line paths of generate().
        """
        rendered = []
        for issue in issues:
            icon = self._severity_icon(issue.severity)
            rendered.append(f"- {icon} **{issue.message}**")
            if issue.suggestion:
                rendered.append(f"  - 💡 {issue.suggestion}")
        return rendered

    def generate(self) -> str:
        """Generate the line-by-line report as a Markdown string."""
        lines = []

        # Header
        lines.append("# BibGuard Line-by-Line Report")
        lines.append("")
        lines.append(f"**File:** `{Path(self.tex_path).name}`")
        lines.append(f"**Generated at:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        lines.append("")
        lines.append("---")
        lines.append("")

        # Summary counts: one pass over all issues instead of one scan per severity.
        error_count = warning_count = info_count = 0
        for issues in self.line_issues.values():
            for r in issues:
                if r.severity == CheckSeverity.ERROR:
                    error_count += 1
                elif r.severity == CheckSeverity.WARNING:
                    warning_count += 1
                elif r.severity == CheckSeverity.INFO:
                    info_count += 1

        lines.append("## 📊 Overview")
        lines.append("")
        # Plain strings: these two header rows contain no placeholders.
        lines.append("| 🔴 Errors | 🟡 Warnings | 🔵 Suggestions |")
        lines.append("|:---------:|:-----------:|:--------------:|")
        lines.append(f"| {error_count} | {warning_count} | {info_count} |")
        lines.append("")
        lines.append("---")
        lines.append("")

        if not self.line_issues:
            lines.append("🎉 **No issues found!**")
            return '\n'.join(lines)

        # Process lines in document order
        lines.append("## 📝 Line-by-Line Details")
        lines.append("")

        processed_lines = set()
        sorted_line_nums = sorted(self.line_issues.keys())

        for line_num in sorted_line_nums:
            if line_num in processed_lines:
                continue

            issues = self.line_issues[line_num]
            if not issues:
                continue

            # Check if this line is part of a block environment
            block = self._get_block_for_line(line_num)

            if block:
                start, end, env_type = block

                # Mark all lines in the block as processed so the block is
                # reported only once even when several of its lines have issues.
                for ln in range(start, end + 1):
                    processed_lines.add(ln)

                # Collect all issues in this block, in line order.
                block_issues = []
                for ln in range(start, end + 1):
                    if ln in self.line_issues:
                        block_issues.extend(self.line_issues[ln])

                if block_issues:
                    lines.append(f"### 📦 `{env_type}` Environment (Lines {start}-{end})")
                    lines.append("")
                    lines.append("```latex")
                    lines.append(self._get_block_content(start, end))
                    lines.append("```")
                    lines.append("")
                    lines.extend(self._format_issues(block_issues))
                    lines.append("")
            else:
                # Single line
                processed_lines.add(line_num)

                # Use custom line_content from CheckResult if available, otherwise get from file
                custom_content = None
                for issue in issues:
                    if issue.line_content:
                        custom_content = issue.line_content
                        break

                line_content = custom_content if custom_content else (
                    self.lines[line_num - 1] if line_num <= len(self.lines) else ""
                )

                lines.append(f"### Line {line_num}")
                lines.append("")
                lines.append("```latex")
                lines.append(line_content)
                lines.append("```")
                lines.append("")
                lines.extend(self._format_issues(issues))
                lines.append("")

        # Footer
        lines.append("---")
        lines.append("")
        lines.append("*Report generated by BibGuard*")

        return '\n'.join(lines)

    def save(self, filepath: str):
        """Save report to file."""
        content = self.generate()
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
+
+
def generate_line_report(
    tex_content: str,
    tex_path: str,
    results: List[CheckResult],
    output_path: str
) -> str:
    """
    Generate a line-by-line report from check results.

    Convenience wrapper: builds a LineByLineReportGenerator, feeds it the
    results, and writes the rendered report to disk.

    Args:
        tex_content: The TeX file content
        tex_path: Path to the TeX file
        results: List of check results from all checkers
        output_path: Where to save the report

    Returns:
        Path to the generated report
    """
    report = LineByLineReportGenerator(tex_content, tex_path)
    report.add_results(results)
    report.save(output_path)
    return output_path
diff --git a/src/templates/__init__.py b/src/templates/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff0977111a09516f9941f54b60c44126d7f499e5
--- /dev/null
+++ b/src/templates/__init__.py
@@ -0,0 +1,4 @@
+"""Templates module for conference-specific submission requirements."""
+from .base_template import ConferenceTemplate, get_template, get_all_templates
+
+__all__ = ['ConferenceTemplate', 'get_template', 'get_all_templates']
diff --git a/src/templates/__pycache__/__init__.cpython-313.pyc b/src/templates/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4bea3b9a05696703f73715f8a425c228cecc7cb1
Binary files /dev/null and b/src/templates/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/templates/__pycache__/base_template.cpython-313.pyc b/src/templates/__pycache__/base_template.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c144a736268828d8c269b73827a944ae4bb49105
Binary files /dev/null and b/src/templates/__pycache__/base_template.cpython-313.pyc differ
diff --git a/src/templates/base_template.py b/src/templates/base_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e186628afdb647ac9c437f0f33b5a7b7abc2f34
--- /dev/null
+++ b/src/templates/base_template.py
@@ -0,0 +1,263 @@
+"""
+Conference template definitions.
+
+Each template contains conference-specific formatting requirements
+and rules for paper submission quality checking.
+"""
+from dataclasses import dataclass, field
+from typing import List, Dict, Optional
+from enum import Enum
+
+
class ConferenceField(Enum):
    """Research field categories used to group conference templates."""
    NLP = "Natural Language Processing"
    CV = "Computer Vision"
    ML = "Machine Learning"


@dataclass
class ConferenceTemplate:
    """Conference-specific submission requirements for quality checking."""
    name: str  # full conference name, e.g. "ACL 2025"
    short_name: str  # short identifier, e.g. "acl"
    field: ConferenceField  # research field category
    page_limit_review: int  # page limit for review submission (main content only)
    page_limit_camera: int  # page limit for camera-ready (main content only)
    double_blind: bool = True  # whether the conference uses double-blind review
    caption_table_above: bool = True  # table captions should sit above the table
    caption_figure_below: bool = True  # figure captions should sit below the figure
    mandatory_sections: List[str] = field(default_factory=list)  # required sections, e.g. ["Limitations"]
    optional_sections: List[str] = field(default_factory=list)  # encouraged sections
    style_package: str = ""  # name of the LaTeX style package
    checkers: List[str] = field(default_factory=lambda: [
        'caption', 'reference', 'ai_artifacts', 'formatting', 'anonymization'
    ])  # checker names to run for this template
    extra_rules: Dict[str, str] = field(default_factory=dict)  # additional conference-specific rules

    def to_dict(self) -> dict:
        """Serialize the summary fields to plain built-in types."""
        summary = {
            'name': self.name,
            'short_name': self.short_name,
            'field': self.field.value,  # enum -> human-readable string
        }
        # Remaining summary attributes are copied verbatim, in display order.
        for attr in ('page_limit_review', 'page_limit_camera', 'double_blind',
                     'mandatory_sections', 'optional_sections', 'checkers'):
            summary[attr] = getattr(self, attr)
        return summary
+
+
# ============================================================================
# NLP Conferences (ACL, EMNLP, NAACL)
# ============================================================================
# Page limits are review -> camera-ready counts for main content only; the
# *CL venues grant one extra camera-ready page and require a Limitations
# section.

ACL_TEMPLATE = ConferenceTemplate(
    name="ACL 2025",
    short_name="acl",
    field=ConferenceField.NLP,
    page_limit_review=8,
    page_limit_camera=9,
    double_blind=True,
    mandatory_sections=["Limitations"],
    optional_sections=["Ethical Considerations"],
    style_package="acl2025",
    extra_rules={
        "format": "Two-column, A4 paper",
        "references": "Unlimited pages for references",
        "appendix": "Allowed after references, two-column format",
    }
)

EMNLP_TEMPLATE = ConferenceTemplate(
    name="EMNLP 2024",
    short_name="emnlp",
    field=ConferenceField.NLP,
    page_limit_review=8,
    page_limit_camera=9,
    double_blind=True,
    mandatory_sections=["Limitations"],
    optional_sections=["Ethics Statement"],
    style_package="emnlp2024",
    extra_rules={
        "format": "Two-column, single-spaced",
        "short_paper": "4 pages for short papers (5 camera-ready)",
    }
)

NAACL_TEMPLATE = ConferenceTemplate(
    name="NAACL 2025",
    short_name="naacl",
    field=ConferenceField.NLP,
    page_limit_review=8,
    page_limit_camera=9,
    double_blind=True,
    mandatory_sections=["Limitations"],
    optional_sections=["Ethics Statement"],
    style_package="naacl2025",
    extra_rules={
        "review_system": "ACL Rolling Review (ARR)",
        "format": "Two-column, A4 paper",
    }
)
+
# ============================================================================
# Computer Vision Conferences (CVPR, ICCV, ECCV)
# ============================================================================
# CVPR/ICCV give no extra camera-ready page; ECCV uses the longer Springer
# LNCS single-column format, hence the higher limit.

CVPR_TEMPLATE = ConferenceTemplate(
    name="CVPR 2025",
    short_name="cvpr",
    field=ConferenceField.CV,
    page_limit_review=8,
    page_limit_camera=8,  # No extra page for camera-ready
    double_blind=True,
    mandatory_sections=[],
    optional_sections=[],
    style_package="cvpr",
    extra_rules={
        "strict_anonymity": "No links to websites that reveal identity",
        "supplementary": "Separate PDF allowed, no page limit",
        "references": "No limit on references",
    }
)

ICCV_TEMPLATE = ConferenceTemplate(
    name="ICCV 2025",
    short_name="iccv",
    field=ConferenceField.CV,
    page_limit_review=8,
    page_limit_camera=8,
    double_blind=True,
    mandatory_sections=[],
    optional_sections=[],
    style_package="iccv",
    extra_rules={
        "format": "Two-column, 10pt Times font",
        "supplementary": "Optional PDF for extra material",
    }
)

ECCV_TEMPLATE = ConferenceTemplate(
    name="ECCV 2024",
    short_name="eccv",
    field=ConferenceField.CV,
    page_limit_review=14,
    page_limit_camera=14,
    double_blind=True,
    mandatory_sections=[],
    optional_sections=[],
    style_package="eccv",
    extra_rules={
        "format": "Springer LNCS format",
        "template": "Do not use TIMES font, use default template font",
        "headings": "Capitalize except articles/prepositions/conjunctions",
    }
)
+
# ============================================================================
# Machine Learning Conferences (NeurIPS, ICML, ICLR)
# ============================================================================
# NeurIPS treats a missing paper checklist as a desk reject, so it is listed
# as a mandatory section rather than an extra rule.

NEURIPS_TEMPLATE = ConferenceTemplate(
    name="NeurIPS 2025",
    short_name="neurips",
    field=ConferenceField.ML,
    page_limit_review=9,
    page_limit_camera=10,
    double_blind=True,
    mandatory_sections=["Paper Checklist"],
    optional_sections=["Broader Impact"],
    style_package="neurips_2025",
    extra_rules={
        "checklist": "NeurIPS paper checklist is MANDATORY, desk reject without it",
        "appendix": "Technical appendix after checklist, no page limit",
        "format": "Single PDF including main content, references, and checklist",
    }
)

ICML_TEMPLATE = ConferenceTemplate(
    name="ICML 2025",
    short_name="icml",
    field=ConferenceField.ML,
    page_limit_review=8,
    page_limit_camera=9,
    double_blind=True,
    mandatory_sections=["Impact Statement"],  # Required for camera-ready
    optional_sections=["Acknowledgments"],
    style_package="icml2025",
    extra_rules={
        "font": "10 point Times, embedded Type-1 fonts only",
        "lay_summary": "Plain language summary required for accepted papers",
        "format": "Use pdflatex for best results",
    }
)

ICLR_TEMPLATE = ConferenceTemplate(
    name="ICLR 2025",
    short_name="iclr",
    field=ConferenceField.ML,
    page_limit_review=10,
    page_limit_camera=10,
    double_blind=True,
    mandatory_sections=[],
    optional_sections=["Ethics Statement", "Reproducibility Statement"],
    style_package="iclr2025_conference",
    extra_rules={
        "format": "10pt Times New Roman, 11pt vertical spacing",
        "submission": "Via OpenReview",
        "min_pages": "Main text must be between 6 and 10 pages",
    }
)
+
# ============================================================================
# Template Registry
# ============================================================================

# Maps short_name -> template. Keys must stay lowercase: get_template()
# lowercases its argument before performing the lookup.
TEMPLATE_REGISTRY: Dict[str, ConferenceTemplate] = {
    # NLP
    'acl': ACL_TEMPLATE,
    'emnlp': EMNLP_TEMPLATE,
    'naacl': NAACL_TEMPLATE,
    # CV
    'cvpr': CVPR_TEMPLATE,
    'iccv': ICCV_TEMPLATE,
    'eccv': ECCV_TEMPLATE,
    # ML
    'neurips': NEURIPS_TEMPLATE,
    'icml': ICML_TEMPLATE,
    'iclr': ICLR_TEMPLATE,
}
+
+
def get_template(name: str) -> Optional[ConferenceTemplate]:
    """Look up a conference template by short name (case-insensitive).

    Returns None when no template is registered under that name.
    """
    key = name.lower()
    return TEMPLATE_REGISTRY.get(key)
+
+
def get_all_templates() -> Dict[str, ConferenceTemplate]:
    """Return a shallow copy of the full template registry, keyed by short name."""
    return dict(TEMPLATE_REGISTRY)
+
+
def get_templates_by_field(field: ConferenceField) -> List[ConferenceTemplate]:
    """Return every registered template belonging to the given research field."""
    return [tpl for tpl in TEMPLATE_REGISTRY.values() if tpl.field is field]
diff --git a/src/ui/__init__.py b/src/ui/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97ae04a37c5633477b761f41c3d289bde104d670
--- /dev/null
+++ b/src/ui/__init__.py
@@ -0,0 +1,5 @@
+"""UI module for BibGuard terminal interfaces."""
+from .workflow_editor import WorkflowEditor
+from .template_selector import TemplateSelector
+
+__all__ = ['WorkflowEditor', 'TemplateSelector']
diff --git a/src/ui/__pycache__/__init__.cpython-313.pyc b/src/ui/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bd2ea8343637c994e457e05040eda25af3cb0fe
Binary files /dev/null and b/src/ui/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/ui/__pycache__/template_selector.cpython-313.pyc b/src/ui/__pycache__/template_selector.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae6532abad95d6e03be3fb6976bf1110fe17326d
Binary files /dev/null and b/src/ui/__pycache__/template_selector.cpython-313.pyc differ
diff --git a/src/ui/__pycache__/workflow_editor.cpython-313.pyc b/src/ui/__pycache__/workflow_editor.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81b4ec5be6d7d659ee63902d7ebef17fc82c2fd6
Binary files /dev/null and b/src/ui/__pycache__/workflow_editor.cpython-313.pyc differ
diff --git a/src/ui/template_selector.py b/src/ui/template_selector.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a011b66ca21b1e91daeb632fdbe3872edea6e88
--- /dev/null
+++ b/src/ui/template_selector.py
@@ -0,0 +1,186 @@
+"""
+Interactive template selector for conference presets.
+
+Provides a terminal UI for selecting a conference template
+with information about requirements and rules.
+"""
+from typing import Optional
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from rich.prompt import Prompt
+from rich.columns import Columns
+from rich.text import Text
+
+from ..templates.base_template import (
+ ConferenceTemplate,
+ get_template,
+ get_all_templates,
+ ConferenceField
+)
+
+
class TemplateSelector:
    """Interactive terminal selector for conference templates.

    Displays the registry grouped by research field and lets the user pick
    a template by short name, inspect details ('d <id>'), or quit ('q').
    """

    def __init__(self):
        self.console = Console()
        self.templates = get_all_templates()

    def display_templates(self):
        """Display all available templates grouped by field."""
        self.console.clear()

        # Header
        self.console.print(Panel(
            "[bold blue]🎓 Conference Template Selector[/bold blue]\n"
            "[dim]Choose a conference template for submission checks[/dim]",
            border_style="blue"
        ))
        self.console.print()

        # Group templates by research field; dict order fixes display order.
        fields = {
            ConferenceField.NLP: ("🗣️ NLP Conferences", []),
            ConferenceField.CV: ("👁️ Computer Vision Conferences", []),
            ConferenceField.ML: ("🧠 Machine Learning Conferences", []),
        }

        for template in self.templates.values():
            fields[template.field][1].append(template)

        # Display each field (the enum key itself is not needed here)
        for title, templates in fields.values():
            self.console.print(f"[bold cyan]{title}[/bold cyan]")

            table = Table(show_header=True, header_style="bold", box=None, padding=(0, 2))
            table.add_column("ID", style="yellow", width=10)
            table.add_column("Conference", width=15)
            table.add_column("Pages", width=12)
            table.add_column("Key Requirements", style="dim")

            for template in templates:
                pages = f"{template.page_limit_review}→{template.page_limit_camera}"
                requirements = []
                if template.mandatory_sections:
                    requirements.append(f"Required: {', '.join(template.mandatory_sections)}")
                if template.extra_rules:
                    # Show only the first rule, truncated, to keep the row short.
                    first_rule = list(template.extra_rules.values())[0]
                    requirements.append(first_rule[:50])

                table.add_row(
                    template.short_name,
                    template.name,
                    pages,
                    " | ".join(requirements) if requirements else "Standard format"
                )

            self.console.print(table)
            self.console.print()

    def display_template_details(self, template: ConferenceTemplate):
        """Display detailed information about a template."""
        self.console.print()
        self.console.print(Panel(
            f"[bold]{template.name}[/bold]",
            border_style="cyan"
        ))

        # Basic info
        info = Table(show_header=False, box=None, padding=(0, 2))
        info.add_column("Label", style="dim")
        info.add_column("Value")

        info.add_row("Style Package", f"[cyan]{template.style_package}[/cyan]")
        info.add_row("Page Limit (Review)", f"[yellow]{template.page_limit_review}[/yellow] pages")
        info.add_row("Page Limit (Camera)", f"[green]{template.page_limit_camera}[/green] pages")
        info.add_row("Double-Blind", "✓ Yes" if template.double_blind else "✗ No")

        if template.mandatory_sections:
            info.add_row("Mandatory Sections", ", ".join(template.mandatory_sections))
        if template.optional_sections:
            info.add_row("Optional Sections", ", ".join(template.optional_sections))

        self.console.print(info)

        # Extra rules
        if template.extra_rules:
            self.console.print()
            self.console.print("[bold]Special Requirements:[/bold]")
            for key, value in template.extra_rules.items():
                self.console.print(f"  • [dim]{key}:[/dim] {value}")

        self.console.print()

    def run(self) -> Optional[ConferenceTemplate]:
        """Run the interactive selector; return the chosen template or None on quit."""
        while True:
            self.display_templates()

            # Get user input. Prompt spells out the details syntax ('d <id>')
            # instead of the ambiguous 'd '.
            choice = Prompt.ask(
                "[bold]Enter template ID (or 'q' to quit, 'd <id>' for details)[/bold]",
                default="q"
            )

            if choice.lower() == 'q':
                return None

            # Handle details command ("d <id>")
            if choice.lower().startswith('d '):
                template_id = choice[2:].strip().lower()
                template = get_template(template_id)
                if template:
                    self.display_template_details(template)
                else:
                    self.console.print(f"[red]Unknown template: {template_id}[/red]")
                # Single pause point for both the found and not-found cases.
                Prompt.ask("Press Enter to continue")
                continue

            # Try to resolve the choice as a template ID
            template = get_template(choice)
            if template:
                self.console.print(f"[green]✓ Selected: {template.name}[/green]")
                return template
            else:
                self.console.print(f"[red]Unknown template: {choice}[/red]")
                self.console.print("[dim]Available: " + ", ".join(self.templates.keys()) + "[/dim]")
                Prompt.ask("Press Enter to continue")
+
+
def launch_template_selector() -> Optional[ConferenceTemplate]:
    """Launch the template selector and return the chosen template.

    Returns None when the user quits without selecting.
    """
    return TemplateSelector().run()
+
+
def list_templates(console: Optional[Console] = None):
    """Print a simple grouped list of available templates.

    Args:
        console: Target rich console; a new one is created when omitted.
          (Annotation fixed to Optional[Console] to match the None default.)
    """
    if console is None:
        console = Console()

    console.print("\n[bold]Available Conference Templates:[/bold]\n")

    templates = get_all_templates()

    # Group templates by research field, preserving registry order.
    by_field = {}
    for t in templates.values():
        by_field.setdefault(t.field, []).append(t)

    field_names = {
        ConferenceField.NLP: "NLP",
        ConferenceField.CV: "Computer Vision",
        ConferenceField.ML: "Machine Learning",
    }

    for field, field_templates in by_field.items():
        console.print(f"[cyan]{field_names[field]}:[/cyan]")
        for t in field_templates:
            console.print(f"  • [yellow]{t.short_name:8}[/yellow] - {t.name} ({t.page_limit_review}/{t.page_limit_camera} pages)")

    console.print()
diff --git a/src/ui/workflow_editor.py b/src/ui/workflow_editor.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a5e832d373da660e49cd7353e6fb35d67e9d7dc
--- /dev/null
+++ b/src/ui/workflow_editor.py
@@ -0,0 +1,164 @@
+"""
+Interactive workflow editor for reference checking configuration.
+
+Provides a terminal-based UI using rich for customizing the order
+and enabled state of fetchers in the verification workflow.
+"""
+from typing import Optional
+from pathlib import Path
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from rich.prompt import Prompt, Confirm
+from rich.text import Text
+
+from ..config.workflow import WorkflowConfig, get_default_workflow
+
+
class WorkflowEditor:
    """Interactive terminal editor for workflow configuration.

    Lets the user reorder, toggle, save, and reset fetcher steps with
    single-key commands; digit keys jump directly to a step.
    """

    def __init__(self, config: Optional[WorkflowConfig] = None):
        """Create an editor over *config*, or the default workflow when None."""
        self.console = Console()
        self.config = config or get_default_workflow()
        self.selected_index = 0  # row currently highlighted in the table
        self.modified = False  # True once there are unsaved changes

    def display_workflow(self):
        """Display current workflow configuration as a table."""
        self.console.clear()

        # Header
        self.console.print(Panel(
            "[bold blue]📋 Reference Check Workflow Editor[/bold blue]\n"
            "[dim]Customize the order and sources for metadata verification[/dim]",
            border_style="blue"
        ))

        # Instructions
        self.console.print()
        self.console.print("[dim]Commands: [cyan]u[/cyan]=move up, [cyan]d[/cyan]=move down, "
                           "[cyan]t[/cyan]=toggle, [cyan]s[/cyan]=save, [cyan]r[/cyan]=reset, [cyan]q[/cyan]=quit[/dim]")
        self.console.print()

        # Workflow table
        table = Table(show_header=True, header_style="bold magenta", box=None)
        table.add_column("#", style="dim", width=3)
        table.add_column("Status", width=8)
        table.add_column("Source", width=25)
        table.add_column("Description", style="dim")

        for i, step in enumerate(self.config.steps):
            # Highlight selected row
            row_style = "reverse" if i == self.selected_index else ""

            # Status indicator
            if step.enabled:
                status = "[green]✓ ON[/green]"
            else:
                status = "[red]✗ OFF[/red]"

            # Priority number (1-based, matching the digit jump commands)
            priority = f"{i + 1}"

            table.add_row(
                priority,
                status,
                step.display_name,
                step.description,
                style=row_style
            )

        self.console.print(table)
        self.console.print()

        # Current selection info
        if 0 <= self.selected_index < len(self.config.steps):
            step = self.config.steps[self.selected_index]
            info = Text()
            info.append("Selected: ", style="dim")
            info.append(step.display_name, style="cyan bold")
            info.append(f" (search type: {step.search_type})", style="dim")
            self.console.print(info)

        if self.modified:
            self.console.print("[yellow]* Unsaved changes[/yellow]")

    def run(self) -> WorkflowConfig:
        """Run the interactive editor loop and return the edited configuration."""
        while True:
            self.display_workflow()

            # Get user input. Digit choices are derived from the actual step
            # count instead of a hard-coded 1-8, so workflows with more (or
            # fewer) steps remain fully selectable.
            step_choices = [str(i) for i in range(1, len(self.config.steps) + 1)]
            try:
                cmd = Prompt.ask(
                    "\n[bold]Enter command[/bold]",
                    choices=["u", "d", "t", "s", "r", "q"] + step_choices,
                    default="q",
                    show_choices=False
                )
            except KeyboardInterrupt:
                cmd = "q"

            if cmd == "q":
                # Confirm before discarding unsaved edits.
                if self.modified:
                    if Confirm.ask("Discard unsaved changes?", default=False):
                        break
                else:
                    break
            elif cmd == "u":
                if self.config.move_step_up(self.selected_index):
                    self.selected_index -= 1  # keep the same step highlighted
                    self.modified = True
            elif cmd == "d":
                if self.config.move_step_down(self.selected_index):
                    self.selected_index += 1  # keep the same step highlighted
                    self.modified = True
            elif cmd == "t":
                self.config.toggle_step(self.selected_index)
                self.modified = True
            elif cmd == "s":
                self._save_workflow()
            elif cmd == "r":
                if Confirm.ask("Reset to default workflow?", default=False):
                    self.config = get_default_workflow()
                    self.selected_index = 0
                    self.modified = True
            elif cmd.isdigit():
                # Jump directly to a step by its 1-based priority number.
                num = int(cmd)
                if 1 <= num <= len(self.config.steps):
                    self.selected_index = num - 1

        return self.config

    def _save_workflow(self):
        """Save workflow configuration to a user-chosen file."""
        default_path = Path.home() / ".bibguard" / "workflow.json"

        path_str = Prompt.ask(
            "Save to",
            default=str(default_path)
        )

        try:
            self.config.save(path_str)
            self.console.print(f"[green]✓ Saved to {path_str}[/green]")
            self.modified = False
        except Exception as e:
            # Best-effort save: report the failure and keep the editor running.
            self.console.print(f"[red]✗ Failed to save: {e}[/red]")

        Prompt.ask("Press Enter to continue")
+
+
def launch_workflow_editor(config_path: Optional[str] = None) -> WorkflowConfig:
    """Launch the workflow editor and return the resulting configuration.

    A missing config file is not an error: the editor then starts from the
    default workflow.
    """
    loaded = None
    if config_path:
        try:
            loaded = WorkflowConfig.load(config_path)
        except FileNotFoundError:
            loaded = None

    return WorkflowEditor(loaded).run()
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4598f4578edc81e5e74eaccddea165bd356d5be3
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1,5 @@
+"""Utilities package"""
+from .normalizer import TextNormalizer
+from .progress import ProgressDisplay
+
+__all__ = ['TextNormalizer', 'ProgressDisplay']
diff --git a/src/utils/__pycache__/__init__.cpython-313.pyc b/src/utils/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fa3d7bac9268b90e0eb75a748c60a6968a18742
Binary files /dev/null and b/src/utils/__pycache__/__init__.cpython-313.pyc differ
diff --git a/src/utils/__pycache__/cache.cpython-313.pyc b/src/utils/__pycache__/cache.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f38d16fda753fc16d22b7ff926ce8ba1bdd40cb
Binary files /dev/null and b/src/utils/__pycache__/cache.cpython-313.pyc differ
diff --git a/src/utils/__pycache__/logger.cpython-313.pyc b/src/utils/__pycache__/logger.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb8aab83bbd5eb9dd620dcef4ecbde88bb73e480
Binary files /dev/null and b/src/utils/__pycache__/logger.cpython-313.pyc differ
diff --git a/src/utils/__pycache__/normalizer.cpython-313.pyc b/src/utils/__pycache__/normalizer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbddfbdb1c6abb30fffd5a6007d5c39cdcb1e4c4
Binary files /dev/null and b/src/utils/__pycache__/normalizer.cpython-313.pyc differ
diff --git a/src/utils/__pycache__/progress.cpython-313.pyc b/src/utils/__pycache__/progress.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..149e039a4e27ed25266d6b6c9558e5d01c1d8b4d
Binary files /dev/null and b/src/utils/__pycache__/progress.cpython-313.pyc differ
diff --git a/src/utils/__pycache__/source_manager.cpython-313.pyc b/src/utils/__pycache__/source_manager.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c47a85cfe68646c85c70cb7ba4f906a1652d9b2
Binary files /dev/null and b/src/utils/__pycache__/source_manager.cpython-313.pyc differ
diff --git a/src/utils/normalizer.py b/src/utils/normalizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c9018f44a97deea650ebda98221d1163be57c7e
--- /dev/null
+++ b/src/utils/normalizer.py
@@ -0,0 +1,236 @@
+"""
+Text normalization utilities for comparing bibliography entries.
+"""
+import re
+import unicodedata
+from unidecode import unidecode
+
+
class TextNormalizer:
    """Utility class for normalizing text for comparison.

    All methods are classmethods operating on plain strings. Main entry
    points: ``normalize_for_comparison`` for titles/free text and
    ``normalize_author_name`` / ``normalize_author_list`` for author fields.
    """

    # LaTeX formatting commands: strip the wrapper, keep the argument text.
    LATEX_COMMANDS = [
        (r'\\textbf\{([^}]*)\}', r'\1'),
        (r'\\textit\{([^}]*)\}', r'\1'),
        (r'\\emph\{([^}]*)\}', r'\1'),
        (r'\\textrm\{([^}]*)\}', r'\1'),
        (r'\\texttt\{([^}]*)\}', r'\1'),
        (r'\\textsf\{([^}]*)\}', r'\1'),
        (r'\\textsc\{([^}]*)\}', r'\1'),
        (r'\\text\{([^}]*)\}', r'\1'),
        (r'\\mathrm\{([^}]*)\}', r'\1'),
        (r'\\mathbf\{([^}]*)\}', r'\1'),
        (r'\\mathit\{([^}]*)\}', r'\1'),
        (r'\\url\{([^}]*)\}', r'\1'),
        (r'\\href\{[^}]*\}\{([^}]*)\}', r'\1'),
    ]

    # LaTeX special character mappings, applied via str.replace in dict
    # insertion order.  Ordering matters for overlapping keys:
    # '---' must precede '--' (otherwise '--' eats the first two dashes of
    # an em-dash and '---' can never match), and the double-quote ligatures
    # must precede the single backtick/apostrophe entries.
    LATEX_CHARS = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'\~': '~',
        r'\^': '^',
        r'``': '"',
        r"''": '"',
        r'`': "'",
        r"'": "'",
        r'---': '—',  # em dash (fixed: must come before '--')
        r'--': '–',   # en dash
    }

    # LaTeX accent commands: drop the accent, keep the base letter.
    # The circumflex patterns escape the caret (r'\\\^'): an unescaped '^'
    # is a start-of-string anchor in a regex and would never match after
    # the leading backslash, so r'\^o' was silently left untouched.
    LATEX_ACCENTS = [
        (r"\\'([aeiouAEIOU])", r'\1'),   # acute
        (r'\\`([aeiouAEIOU])', r'\1'),   # grave
        (r'\\\^([aeiouAEIOU])', r'\1'),  # circumflex (fixed: caret escaped)
        (r'\\"([aeiouAEIOU])', r'\1'),   # umlaut
        (r'\\~([nNaAoO])', r'\1'),       # tilde
        (r'\\c\{([cC])\}', r'\1'),       # cedilla
        (r"\\'{([aeiouAEIOU])}", r'\1'),
        (r'\\`{([aeiouAEIOU])}', r'\1'),
        (r'\\\^{([aeiouAEIOU])}', r'\1'),  # fixed: caret escaped
        (r'\\"{([aeiouAEIOU])}', r'\1'),
        (r'\\~{([nNaAoO])}', r'\1'),
    ]

    @classmethod
    def normalize_latex(cls, text: str) -> str:
        """Remove LaTeX formatting commands, accents and escapes.

        Returns plain text; any braces remaining after command removal are
        stripped (note: this also removes braces produced by unescaping
        ``\\{`` / ``\\}``, which is acceptable for comparison purposes).
        """
        if not text:
            return ""

        result = text

        # Strip formatting commands first so their braces don't survive.
        for pattern, replacement in cls.LATEX_COMMANDS:
            result = re.sub(pattern, replacement, result)

        # Collapse accent commands to their base letter.
        for pattern, replacement in cls.LATEX_ACCENTS:
            result = re.sub(pattern, replacement, result)

        # Unescape special characters / typographic ligatures.
        for latex_char, normal_char in cls.LATEX_CHARS.items():
            result = result.replace(latex_char, normal_char)

        # Remove any remaining grouping braces.
        result = re.sub(r'[{}]', '', result)

        return result

    @classmethod
    def normalize_unicode(cls, text: str) -> str:
        """Normalize Unicode characters to their closest ASCII form."""
        if not text:
            return ""

        # Decompose (NFKD) so combining marks can be transliterated away.
        text = unicodedata.normalize('NFKD', text)
        # Transliterate anything non-ASCII (e.g. 'é' -> 'e').
        text = unidecode(text)
        return text

    @classmethod
    def normalize_whitespace(cls, text: str) -> str:
        """Collapse runs of whitespace to single spaces and strip the ends."""
        if not text:
            return ""

        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        return text

    @classmethod
    def remove_punctuation(cls, text: str) -> str:
        """Remove punctuation, keeping only word characters and spaces."""
        if not text:
            return ""

        return re.sub(r'[^\w\s]', '', text)

    @classmethod
    def normalize_for_comparison(cls, text: str) -> str:
        """
        Full normalization pipeline for text comparison.

        Steps:
        1. Remove LaTeX formatting
        2. Normalize Unicode to ASCII
        3. Convert to lowercase
        4. Normalize whitespace
        5. Remove punctuation
        """
        if not text:
            return ""

        text = cls.normalize_latex(text)
        text = cls.normalize_unicode(text)
        text = text.lower()
        text = cls.normalize_whitespace(text)
        text = cls.remove_punctuation(text)
        return text

    @classmethod
    def normalize_author_name(cls, name: str) -> str:
        """
        Normalize author name format.
        Handles: "Last, First" and "First Last" formats.
        Returns: normalized "first last" format.
        """
        if not name:
            return ""

        name = cls.normalize_latex(name)
        name = cls.normalize_unicode(name)
        name = cls.normalize_whitespace(name)

        # Rewrite "Last, First" as "First Last" (split on first comma only,
        # so suffixes like "Jr." stay attached to the surname part).
        if ',' in name:
            parts = name.split(',', 1)
            if len(parts) == 2:
                name = f"{parts[1].strip()} {parts[0].strip()}"

        name = name.lower()
        name = cls.remove_punctuation(name)
        return name

    @classmethod
    def normalize_author_list(cls, authors: str) -> list[str]:
        """
        Parse and normalize a list of authors.
        Handles "and" as separator and "Last, First" format.
        """
        if not authors:
            return []

        # BibTeX joins authors with " and " (case-insensitive here).
        author_list = re.split(r'\s+and\s+', authors, flags=re.IGNORECASE)

        # Normalize each author; drop entries that normalize to nothing.
        normalized = []
        for author in author_list:
            normalized_name = cls.normalize_author_name(author.strip())
            if normalized_name:
                normalized.append(normalized_name)

        return normalized

    @classmethod
    def similarity_ratio(cls, text1: str, text2: str) -> float:
        """
        Calculate similarity ratio between two strings.
        Uses simple word-based Jaccard similarity (|A∩B| / |A∪B|).
        """
        if not text1 or not text2:
            return 0.0

        words1 = set(text1.split())
        words2 = set(text2.split())

        if not words1 and not words2:
            return 1.0
        if not words1 or not words2:
            return 0.0

        intersection = words1 & words2
        union = words1 | words2

        return len(intersection) / len(union)

    @classmethod
    def levenshtein_similarity(cls, s1: str, s2: str) -> float:
        """Calculate normalized Levenshtein similarity in [0.0, 1.0].

        1.0 means identical strings; 0.0 means one string is empty and the
        other is not. O(len(s1) * len(s2)) time and space.
        """
        if not s1 and not s2:
            return 1.0
        if not s1 or not s2:
            return 0.0

        # Classic dynamic-programming edit distance.
        m, n = len(s1), len(s2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(m + 1):
            dp[i][0] = i
        for j in range(n + 1):
            dp[0][j] = j

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if s1[i-1] == s2[j-1]:
                    dp[i][j] = dp[i-1][j-1]
                else:
                    dp[i][j] = min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1]) + 1

        max_len = max(m, n)
        distance = dp[m][n]
        return 1.0 - (distance / max_len)
diff --git a/src/utils/progress.py b/src/utils/progress.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8a6b66098113b6eccb4a9d014b47894bc157945
--- /dev/null
+++ b/src/utils/progress.py
@@ -0,0 +1,202 @@
+"""
+Rich progress display for terminal output.
+"""
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeElapsedColumn
+from rich.panel import Panel
+from rich.table import Table
+from rich.live import Live
+from rich.layout import Layout
+from rich.text import Text
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from typing import Optional
+
+
@dataclass
class ProgressStats:
    """Mutable counters backing the terminal progress display."""

    # Workload size and per-outcome tallies.
    total_entries: int = field(default=0)
    processed: int = field(default=0)
    success: int = field(default=0)
    warnings: int = field(default=0)
    errors: int = field(default=0)
    # Human-readable labels for what is being worked on right now.
    current_entry: str = field(default="")
    current_task: str = field(default="")
+
+
class ProgressDisplay:
    """Rich terminal progress display.

    Owns a rich ``Console`` and a ``ProgressStats`` counter object, and
    renders a live progress bar (via ``progress_context``) plus formatted
    status and summary output for a checker run.
    """

    def __init__(self):
        self.console = Console()
        self.stats = ProgressStats()
        # Set only while a progress_context() is active.
        self._progress: Optional[Progress] = None
        # NOTE(review): _live is initialized but never assigned anywhere in
        # this class — looks like a leftover from a Live-layout design;
        # confirm with external callers before removing.
        self._live: Optional[Live] = None
        self._main_task = None

    def _create_stats_table(self) -> Table:
        """Create a statistics table of outcome counters.

        NOTE(review): no caller inside this class — presumably intended for
        external rendering code; confirm before removing.
        """
        table = Table(show_header=False, box=None, padding=(0, 2))
        table.add_column("Label", style="dim")
        table.add_column("Value", style="bold")

        table.add_row("📚 Total Entries", str(self.stats.total_entries))
        table.add_row("✅ Success", f"[green]{self.stats.success}[/green]")
        table.add_row("⚠️ Warnings", f"[yellow]{self.stats.warnings}[/yellow]")
        table.add_row("❌ Errors", f"[red]{self.stats.errors}[/red]")

        return table

    def _create_display(self) -> Panel:
        """Create the main display panel showing the current entry and task."""
        # Fix: dropped an unused `Layout()` instantiation that was created
        # and immediately discarded here.
        status_text = Text()
        status_text.append("Current: ", style="dim")
        status_text.append(self.stats.current_entry or "N/A", style="cyan bold")
        status_text.append("\n")
        status_text.append("Task: ", style="dim")
        status_text.append(self.stats.current_task or "Initializing...", style="white")

        return Panel(
            status_text,
            title="[bold blue]📖 Bibliography Checker[/bold blue]",
            border_style="blue"
        )

    @contextmanager
    def progress_context(self, total: int, description: str = "Processing"):
        """Context manager that shows a progress bar for `total` steps.

        Yields this ProgressDisplay; the bar is torn down (and the internal
        progress handles cleared) when the context exits, even on error.
        """
        self.stats.total_entries = total

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(bar_width=40),
            TaskProgressColumn(),
            TimeElapsedColumn(),
            console=self.console,
            transient=False
        ) as progress:
            self._progress = progress
            self._main_task = progress.add_task(description, total=total)
            try:
                yield self
            finally:
                # Clear handles so update() becomes a stats-only no-op
                # once the bar is gone.
                self._progress = None
                self._main_task = None

    def update(self, entry_key: str = "", task: str = "", advance: int = 0):
        """Update the progress bar and current entry/task labels.

        Args:
            entry_key: Bib key of the entry being processed (kept if empty).
            task: Description of the current sub-task (kept if empty).
            advance: Number of completed steps to add.
        """
        if entry_key:
            self.stats.current_entry = entry_key
        if task:
            self.stats.current_task = task

        # Fix: count processed steps even when no progress bar is active,
        # so stats stay consistent outside progress_context().
        self.stats.processed += advance

        if self._progress and self._main_task is not None:
            desc = f"[cyan]{entry_key}[/cyan] - {task}" if entry_key else task
            self._progress.update(self._main_task, description=desc, advance=advance)

    def mark_success(self):
        """Mark current entry as successful."""
        self.stats.success += 1

    def mark_warning(self):
        """Mark current entry with warning."""
        self.stats.warnings += 1

    def mark_error(self):
        """Mark current entry as error."""
        self.stats.errors += 1

    def print_header(self, title: str):
        """Print a section header inside a bordered panel."""
        self.console.print()
        self.console.print(Panel(
            f"[bold]{title}[/bold]",
            border_style="blue",
            expand=False
        ))

    def print_status(self, message: str, style: str = ""):
        """Print an indented status message with an optional rich style."""
        self.console.print(f"  {message}", style=style)

    def print_success(self, message: str):
        """Print a success message."""
        self.console.print(f"  [green]✓[/green] {message}")

    def print_warning(self, message: str):
        """Print a warning message."""
        self.console.print(f"  [yellow]⚠[/yellow] {message}")

    def print_error(self, message: str):
        """Print an error message."""
        self.console.print(f"  [red]✗[/red] {message}")

    def print_info(self, message: str):
        """Print an info message."""
        self.console.print(f"  [blue]ℹ[/blue] {message}")

    def print_detailed_summary(self, bib_stats: dict, latex_stats: dict, output_dir: str):
        """Print a beautiful detailed summary table (issues only).

        Args:
            bib_stats: Mapping of bibliography-issue label -> count.
            latex_stats: Mapping of LaTeX-issue category -> count.
            output_dir: Directory path shown in the output-guide subtitle.
        """
        self.console.print()

        # Bibliography issues table.
        bib_table = Table(show_header=True, header_style="bold cyan", box=None, padding=(0, 1))
        bib_table.add_column("📚 Bibliography Issues", style="white")
        bib_table.add_column("Count", justify="right", style="bold red")

        for label, value in bib_stats.items():
            bib_table.add_row(label, str(value))

        # LaTeX issues table — fine-grained breakdown.
        latex_table = Table(show_header=True, header_style="bold magenta", box=None, padding=(0, 1))
        latex_table.add_column("📋 LaTeX Quality Issues (Fine-grained)", style="white")
        latex_table.add_column("Count", justify="right", style="bold yellow")

        if not latex_stats:
            latex_table.add_row("[green]No issues found[/green]", "0")
        else:
            # Sort by count descending so the worst offenders appear first.
            for category, count in sorted(latex_stats.items(), key=lambda x: x[1], reverse=True):
                latex_table.add_row(category, str(count))

        # Local import keeps the rich.columns dependency out of the module
        # header (only this method needs it).
        from rich.columns import Columns

        # If there are no bib issues, show only the LaTeX table.
        content = []
        if bib_stats:
            content.append(bib_table)
        content.append(latex_table)

        summary_panel = Panel(
            Columns(content, expand=True),
            title="[bold red]⚠️ Issue Summary (Action Required)[/bold red]",
            border_style="red",
            padding=(1, 2)
        )

        self.console.print(summary_panel)

        # Guide explaining what each generated report file contains.
        guide_table = Table(show_header=True, header_style="bold green", box=None, padding=(0, 2))
        guide_table.add_column("File Name", style="cyan")
        guide_table.add_column("Description", style="dim")

        guide_table.add_row("bibliography_report.md", "Detailed metadata and usage issues for each bib entry")
        guide_table.add_row("latex_quality_report.md", "Summary of all LaTeX writing and formatting issues")
        guide_table.add_row("line_by_line_report.md", "All LaTeX issues sorted by line number for easy fixing")
        guide_table.add_row("*_only_used.bib", "A cleaned version of your bib file containing only cited entries")

        self.console.print(Panel(
            guide_table,
            title="[bold green]Output Directory Guide[/bold green]",
            subtitle=f"Location: [blue underline]{output_dir}[/blue underline]",
            border_style="green",
            padding=(1, 1)
        ))