thinkwee commited on
Commit
46df5f0
·
1 Parent(s): 6984298
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +62 -0
  3. README.md +196 -11
  4. app.py +922 -0
  5. app_helper.py +98 -0
  6. assets/icon-192.png +3 -0
  7. assets/icon-512.png +3 -0
  8. bibguard.yaml +197 -0
  9. main.py +564 -0
  10. requirements.txt +8 -0
  11. src/__init__.py +1 -0
  12. src/__pycache__/__init__.cpython-311.pyc +0 -0
  13. src/__pycache__/__init__.cpython-313.pyc +0 -0
  14. src/analyzers/__init__.py +7 -0
  15. src/analyzers/__pycache__/__init__.cpython-313.pyc +0 -0
  16. src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc +0 -0
  17. src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc +0 -0
  18. src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc +0 -0
  19. src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc +0 -0
  20. src/analyzers/__pycache__/retraction_checker.cpython-313.pyc +0 -0
  21. src/analyzers/__pycache__/url_validator.cpython-313.pyc +0 -0
  22. src/analyzers/__pycache__/usage_checker.cpython-313.pyc +0 -0
  23. src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc +0 -0
  24. src/analyzers/duplicate_detector.py +204 -0
  25. src/analyzers/llm_evaluator.py +376 -0
  26. src/analyzers/metadata_comparator.py +474 -0
  27. src/analyzers/usage_checker.py +82 -0
  28. src/checkers/__init__.py +66 -0
  29. src/checkers/__pycache__/__init__.cpython-313.pyc +0 -0
  30. src/checkers/__pycache__/acronym_checker.cpython-313.pyc +0 -0
  31. src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc +0 -0
  32. src/checkers/__pycache__/anonymization_checker.cpython-313.pyc +0 -0
  33. src/checkers/__pycache__/base.cpython-313.pyc +0 -0
  34. src/checkers/__pycache__/caption_checker.cpython-313.pyc +0 -0
  35. src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc +0 -0
  36. src/checkers/__pycache__/consistency_checker.cpython-313.pyc +0 -0
  37. src/checkers/__pycache__/equation_checker.cpython-313.pyc +0 -0
  38. src/checkers/__pycache__/formatting_checker.cpython-313.pyc +0 -0
  39. src/checkers/__pycache__/number_checker.cpython-313.pyc +0 -0
  40. src/checkers/__pycache__/reference_checker.cpython-313.pyc +0 -0
  41. src/checkers/__pycache__/sentence_checker.cpython-313.pyc +0 -0
  42. src/checkers/acronym_checker.py +284 -0
  43. src/checkers/ai_artifacts_checker.py +176 -0
  44. src/checkers/anonymization_checker.py +216 -0
  45. src/checkers/base.py +193 -0
  46. src/checkers/caption_checker.py +136 -0
  47. src/checkers/citation_quality_checker.py +131 -0
  48. src/checkers/consistency_checker.py +254 -0
  49. src/checkers/equation_checker.py +134 -0
  50. src/checkers/formatting_checker.py +204 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/*.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+
24
+ # Virtual Environments
25
+ venv/
26
+ env/
27
+ .env
28
+ .venv/
29
+
30
+ # IDEs
31
+ .idea/
32
+ .vscode/
33
+ *.swp
34
+ *.swo
35
+
36
+ # macOS
37
+ .DS_Store
38
+ .AppleDouble
39
+ .LSOverride
40
+
41
+ # Project Specific Outputs
42
+ *.txt
43
+ *.md
44
+ !README.md
45
+ *_only_used_entry.bib
46
+
47
+ # LaTeX and Bibliography (User Data)
48
+ # Ignoring these to prevent committing personal paper content
49
+ *.tex
50
+ *.bib
51
+ *.pdf
52
+ *.log
53
+ *.aux
54
+ *.out
55
+ *.bbl
56
+ *.blg
57
+ *.synctex.gz
58
+ *.fls
59
+ *.fdb_latexmk
60
+
61
+ # cache
62
+ .cache
README.md CHANGED
@@ -1,13 +1,198 @@
1
- ---
2
- title: BibGuard
3
- emoji:
4
- colorFrom: pink
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 6.3.0
8
- app_file: app.py
9
- pinned: false
10
- short_description: Automated bibliography verification and LaTeX quality auditing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # BibGuard: Bibliography & LaTeX Quality Auditor
2
+
3
+ **BibGuard** is your comprehensive quality assurance tool for academic papers. It validates bibliography entries against real-world databases and checks LaTeX submission quality to catch errors before you submit.
4
+
5
+ AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and uses advanced LLMs to ensure cited papers actually support your claims.
6
+
7
+ ## 🛡 Why BibGuard?
8
+
9
+ - **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata
10
+ - **📋 LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, and submission compliance problems
11
+ - **🔒 Safe & Non-Destructive**: Your original files are **never modified** - only detailed reports are generated
12
+ - **🧠 Contextual Relevance**: Ensure cited papers actually discuss what you claim (with LLM)
13
+ - **⚡ Efficiency Boost**: Drastically reduce time needed to manually verify hundreds of citations
14
+
15
+ ## 🚀 Features
16
+
17
+ ### Bibliography Validation
18
+ - **🔍 Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
19
+ - **🤖 AI Relevance Check**: Uses LLMs to verify citations match their context (optional)
20
+ - **📊 Preprint Detection**: Warns if >50% of references are preprints (arXiv, bioRxiv, etc.)
21
+ - **👀 Usage Analysis**: Highlights missing citations and unused bib entries
22
+ - **👯 Duplicate Detector**: Identifies duplicate entries with fuzzy matching
23
+
24
+ ### LaTeX Quality Checks
25
+ - **📐 Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation
26
+ - **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases
27
+ - **🔤 Consistency**: Spelling variants (US/UK English), hyphenation, terminology
28
+ - **🤖 AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants
29
+ - **🔠 Acronym Validation**: Ensures acronyms are defined before use (smart matching)
30
+ - **🎭 Anonymization**: Checks for identity leaks in double-blind submissions
31
+ - **📅 Citation Age**: Flags references older than 30 years
32
+
33
+ ## 📦 Installation
34
+
35
+ ```bash
36
+ git clone git@github.com:thinkwee/BibGuard.git
37
+ cd BibGuard
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ ## ⚡ Quick Start
42
+
43
+ ### 1. Initialize Configuration
44
+
45
+ ```bash
46
+ python main.py --init
47
+ ```
48
+
49
+ This creates `config.yaml`. Edit it to set your file paths. You have two modes:
50
+
51
+ #### Option A: Single File Mode
52
+ Best for individual papers.
53
+ ```yaml
54
+ files:
55
+ bib: "paper.bib"
56
+ tex: "paper.tex"
57
+ output_dir: "bibguard_output"
58
+ ```
59
+
60
+ #### Option B: Directory Scan Mode
61
+ Best for large projects or a collection of papers. BibGuard will recursively search for all `.tex` and `.bib` files.
62
+ ```yaml
63
+ files:
64
+ input_dir: "./my_project_dir"
65
+ output_dir: "bibguard_output"
66
+ ```
67
+
68
+ ### 2. Run Full Check
69
+
70
+ ```bash
71
+ python main.py
72
+ ```
73
+
74
+ **Output** (in `bibguard_output/`):
75
+ - `bibliography_report.md` - Bibliography validation results
76
+ - `latex_quality_report.md` - Writing and formatting issues
77
+ - `line_by_line_report.md` - All issues sorted by line number
78
+ - `*_only_used.bib` - Clean bibliography (used entries only)
79
+
80
+ ## 🛠 Configuration
81
+
82
+ Edit `config.yaml` to customize checks:
83
+
84
+ ```yaml
85
+ bibliography:
86
+ check_metadata: true # Validate against online databases (takes time)
87
+ check_usage: true # Find unused/missing entries
88
+ check_duplicates: true # Detect duplicate entries
89
+ check_preprint_ratio: true # Warn if >50% are preprints
90
+ check_relevance: false # LLM-based relevance check (requires API key)
91
+
92
+ submission:
93
+ # Format checks
94
+ caption: true # Table/figure caption placement
95
+ reference: true # Cross-reference integrity
96
+ formatting: true # Citation spacing, blank lines
97
+ equation: true # Equation punctuation, numbering
98
+
99
+ # Writing quality
100
+ sentence: true # Weak starters, hedging language
101
+ consistency: true # Spelling, hyphenation, terminology
102
+ acronym: true # Acronym definitions (3+ letters)
103
+
104
+ # Submission compliance
105
+ ai_artifacts: true # AI-generated text detection
106
+ anonymization: true # Double-blind compliance
107
+ citation_quality: true # Old citations (>30 years)
108
+ number: true # Percentage formatting
109
+ ```
110
+
111
+ ## 🤖 LLM-Based Relevance Check
112
+
113
+ To verify citations match their context using AI:
114
+
115
+ ```yaml
116
+ bibliography:
117
+ check_relevance: true
118
+
119
+ llm:
120
+ backend: "gemini" # Options: gemini, openai, anthropic, deepseek, ollama, vllm
121
+ api_key: "" # Or use environment variable (e.g., GEMINI_API_KEY)
122
+ ```
123
+
124
+ **Supported Backends:**
125
+ - **Gemini** (Google): `GEMINI_API_KEY`
126
+ - **OpenAI**: `OPENAI_API_KEY`
127
+ - **Anthropic**: `ANTHROPIC_API_KEY`
128
+ - **DeepSeek**: `DEEPSEEK_API_KEY` (recommended for cost/performance)
129
+ - **Ollama**: Local models (no API key needed)
130
+ - **vLLM**: Custom endpoint
131
+
132
+ Then run:
133
+ ```bash
134
+ python main.py
135
+ ```
136
+
137
+ ## 📝 Understanding Reports
138
+
139
+ ### Bibliography Report
140
+ Shows for each entry:
141
+ - ✅ **Verified**: Metadata matches online databases
142
+ - ⚠️ **Issues**: Mismatches, missing entries, duplicates
143
+ - 📊 **Statistics**: Usage, duplicates, preprint ratio
144
+
145
+ ### LaTeX Quality Report
146
+ Organized by severity:
147
+ - 🔴 **Errors**: Critical issues (e.g., undefined references)
148
+ - 🟡 **Warnings**: Important issues (e.g., inconsistent spelling)
149
+ - 🔵 **Suggestions**: Style improvements (e.g., weak sentence starters)
150
+
151
+ ### Line-by-Line Report
152
+ All LaTeX issues sorted by line number for easy fixing.
153
+
154
+ ## 🧐 Understanding Mismatches
155
+
156
+ BibGuard is strict, but false positives happen:
157
+
158
+ 1. **Year Discrepancy (±1 Year)**:
159
+ - *Reason*: Delay between preprint (arXiv) and official publication
160
+ - *Action*: Verify which version you intend to cite
161
+
162
+ 2. **Author List Variations**:
163
+ - *Reason*: Different databases handle large author lists differently
164
+ - *Action*: Check if primary authors match
165
+
166
+ 3. **Venue Name Differences**:
167
+ - *Reason*: Abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems")
168
+ - *Action*: Both are usually correct
169
+
170
+ 4. **Non-Academic Sources**:
171
+ - *Reason*: Blogs, documentation not indexed by academic databases
172
+ - *Action*: Manually verify URL and title
173
+
174
+ ## 🔧 Advanced Options
175
+
176
+ ```bash
177
+ python main.py --help # Show all options
178
+ python main.py --list-templates # List conference templates
179
+ python main.py --config my.yaml # Use custom config file
180
+ ```
181
+
182
+ ## 🤝 Contributing
183
+
184
+ Contributions welcome! Please open an issue or pull request.
185
+
186
+ ## 🙏 Acknowledgments
187
+
188
+ BibGuard uses multiple data sources:
189
+ - arXiv API
190
+ - CrossRef API
191
+ - Semantic Scholar API
192
+ - DBLP API
193
+ - OpenAlex API
194
+ - Google Scholar (via scholarly)
195
+
196
  ---
197
 
198
+ **Made with ❤️ for researchers who care about their submissions**
app.py ADDED
@@ -0,0 +1,922 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ BibGuard Gradio Web Application
4
+
5
+ A web interface for checking bibliography and LaTeX quality.
6
+ """
7
+ import gradio as gr
8
+ import tempfile
9
+ import shutil
10
+ from pathlib import Path
11
+ from typing import Optional, Tuple
12
+ import base64
13
+
14
+ from src.parsers import BibParser, TexParser
15
+ from src.fetchers import ArxivFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher
16
+ from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector
17
+ from src.report.generator import ReportGenerator, EntryReport
18
+ from src.config.yaml_config import BibGuardConfig, FilesConfig, BibliographyConfig, SubmissionConfig, OutputConfig, WorkflowStep
19
+ from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
20
+ from src.checkers import CHECKER_REGISTRY
21
+ from src.report.line_report import LineByLineReportGenerator
22
+ from app_helper import fetch_and_compare_with_workflow
23
+
24
+
25
+ # Custom CSS for better Markdown rendering
26
+ CUSTOM_CSS = """
27
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
28
+
29
+ * {
30
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
31
+ }
32
+ """
33
+
34
+ WELCOME_HTML = """
35
+ <div class="scrollable-report-area">
36
+ <div class="report-card" style="max-width: 800px; margin: 0 auto;">
37
+ <div class="card-header">
38
+ <h3 class="card-title" style="font-size: 1.5em;">👋 Welcome to BibGuard</h3>
39
+ </div>
40
+ <div class="card-content" style="line-height: 1.6; color: #374151;">
41
+ <p style="font-size: 1.1em; margin-bottom: 24px;">
42
+ Ensure your academic paper is flawless. Upload your <code>.bib</code> and <code>.tex</code> files on the left and click <strong>"Check Now"</strong>.
43
+ </p>
44
+
45
+ <div style="display: grid; gap: 20px; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));">
46
+ <div style="background: #fefce8; padding: 16px; border-radius: 8px; border: 1px solid #fde047;">
47
+ <strong style="color: #854d0e; display: block; margin-bottom: 8px;">⚠️ Metadata Check Defaults</strong>
48
+ "🔍 Metadata" is <strong>disabled by default</strong>. It verifies your entries against ArXiv/DBLP/Crossref but takes time (1-3 mins) to fetch data. Enable it if you want strict verification.
49
+ </div>
50
+
51
+ <div style="background: #eff6ff; padding: 16px; border-radius: 8px; border: 1px solid #bfdbfe;">
52
+ <strong style="color: #1e40af; display: block; margin-bottom: 8px;">🚀 Go Pro with Local Version</strong>
53
+ LLM-based context relevance checking (is this citation actually relevant?) is excluded here. Clone the <a href="https://github.com/thinkwee/BibGuard" target="_blank" style="color: #2563eb; text-decoration: underline; font-weight: 600;">GitHub repo</a> to use the full power with your API key.
54
+ </div>
55
+ </div>
56
+
57
+ <h4 style="margin: 24px 0 12px 0; color: #111827; font-size: 1.1em;">📊 Understanding Your Reports</h4>
58
+ <div style="display: grid; gap: 12px;">
59
+ <div style="display: flex; gap: 12px; align-items: baseline;">
60
+ <span style="background: #e0e7ff; color: #3730a3; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📚 Bibliography</span>
61
+ <span>Validates metadata fields, detects duplicates, and checks citation counts.</span>
62
+ </div>
63
+ <div style="display: flex; gap: 12px; align-items: baseline;">
64
+ <span style="background: #dcfce7; color: #166534; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📝 LaTeX Quality</span>
65
+ <span>Syntax check, caption validation, acronym consistency, and style suggestions.</span>
66
+ </div>
67
+ <div style="display: flex; gap: 12px; align-items: baseline;">
68
+ <span style="background: #f3f4f6; color: #4b5563; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📋 Line-by-Line</span>
69
+ <span>Maps every issue found directly to the line number in your source file.</span>
70
+ </div>
71
+ </div>
72
+ </div>
73
+ </div>
74
+ </div>
75
+ """
76
+
77
+ CUSTOM_CSS += """
78
+ /* Global Reset */
79
+ body, gradio-app {
80
+ overflow: hidden !important; /* Prevent double scrollbars on the page */
81
+ }
82
+
83
+ .gradio-container {
84
+ max-width: none !important;
85
+ width: 100% !important;
86
+ height: 100vh !important;
87
+ padding: 0 !important;
88
+ margin: 0 !important;
89
+ }
90
+
91
+ /* Header Styling */
92
+ .app-header {
93
+ padding: 20px;
94
+ background: white;
95
+ border-bottom: 1px solid #e5e7eb;
96
+ }
97
+
98
+ /* Sidebar Styling */
99
+ .app-sidebar {
100
+ height: calc(100vh - 100px) !important;
101
+ overflow-y: auto !important;
102
+ padding: 20px !important;
103
+ border-right: 1px solid #e5e7eb;
104
+ }
105
+
106
+ /* Main Content Area */
107
+ .app-content {
108
+ height: calc(100vh - 100px) !important;
109
+ padding: 0 !important;
110
+ }
111
+
112
+ /* The Magic Scroll Container - Clean and Explicit */
113
+ .scrollable-report-area {
114
+ height: calc(100vh - 180px) !important; /* Fixed height relative to viewport */
115
+ overflow-y: auto !important;
116
+ padding: 24px;
117
+ background-color: #f9fafb;
118
+ border: 1px solid #e5e7eb;
119
+ border-radius: 8px;
120
+ margin-top: 10px;
121
+ }
122
+
123
+ /* Report Card Styling */
124
+ .report-card {
125
+ background: white;
126
+ border-radius: 12px;
127
+ padding: 24px;
128
+ margin-bottom: 16px; /* Spacing between cards */
129
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
130
+ border: 1px solid #e5e7eb;
131
+ transition: transform 0.2s, box-shadow 0.2s;
132
+ }
133
+
134
+ .report-card:hover {
135
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
136
+ transform: translateY(-2px);
137
+ }
138
+
139
+ /* Card Internals */
140
+ .card-header {
141
+ display: flex;
142
+ justify-content: space-between;
143
+ align-items: flex-start;
144
+ margin-bottom: 16px;
145
+ padding-bottom: 16px;
146
+ border-bottom: 1px solid #f3f4f6;
147
+ }
148
+
149
+ .card-title {
150
+ font-size: 1.1em;
151
+ font-weight: 600;
152
+ color: #111827;
153
+ margin: 0 0 4px 0;
154
+ }
155
+
156
+ .card-subtitle {
157
+ font-size: 0.9em;
158
+ color: #6b7280;
159
+ font-family: monospace;
160
+ }
161
+
162
+ .card-content {
163
+ font-size: 0.95em;
164
+ color: #374151;
165
+ line-height: 1.5;
166
+ }
167
+
168
+ /* Badges */
169
+ .badge {
170
+ display: inline-flex;
171
+ align-items: center;
172
+ padding: 4px 10px;
173
+ border-radius: 9999px;
174
+ font-size: 0.8em;
175
+ font-weight: 500;
176
+ }
177
+
178
+ .badge-success { background-color: #dcfce7; color: #166534; }
179
+ .badge-warning { background-color: #fef9c3; color: #854d0e; }
180
+ .badge-error { background-color: #fee2e2; color: #991b1b; }
181
+ .badge-info { background-color: #dbeafe; color: #1e40af; }
182
+ .badge-neutral { background-color: #f3f4f6; color: #4b5563; }
183
+
184
+ /* Stats Grid */
185
+ .stats-container {
186
+ display: grid;
187
+ grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
188
+ gap: 16px;
189
+ margin-bottom: 24px;
190
+ }
191
+
192
+ .stat-card {
193
+ padding: 16px;
194
+ border-radius: 12px;
195
+ color: white;
196
+ text-align: center;
197
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
198
+ }
199
+
200
+ .stat-value { font-size: 1.8em; font-weight: 700; }
201
+ .stat-label { font-size: 0.9em; opacity: 0.9; }
202
+
203
+ /* Detail Grid - Flexbox for better filling */
204
+ .detail-grid {
205
+ display: flex;
206
+ flex-wrap: wrap;
207
+ gap: 12px;
208
+ margin-bottom: 16px;
209
+ width: 100%;
210
+ }
211
+
212
+ .detail-item {
213
+ background: #f9fafb;
214
+ padding: 10px 12px;
215
+ border-radius: 8px;
216
+ border: 1px solid #f3f4f6;
217
+
218
+ /* Flex sizing: grow, shrink, min-basis */
219
+ flex: 1 1 160px;
220
+ min-width: 0; /* Important for word-break to work in flex children */
221
+
222
+ /* Layout control */
223
+ display: flex;
224
+ flex-direction: column;
225
+
226
+ /* Height constraint to prevent one huge card from stretching the row */
227
+ max-height: 100px;
228
+ overflow-y: auto;
229
+ }
230
+
231
+ /* Custom scrollbar for detail items */
232
+ .detail-item::-webkit-scrollbar {
233
+ width: 4px;
234
+ }
235
+ .detail-item::-webkit-scrollbar-thumb {
236
+ background-color: #d1d5db;
237
+ border-radius: 4px;
238
+ }
239
+
240
+ .detail-label {
241
+ font-size: 0.75em;
242
+ color: #6b7280;
243
+ text-transform: uppercase;
244
+ letter-spacing: 0.05em;
245
+ margin-bottom: 2px;
246
+ position: sticky;
247
+ top: 0;
248
+ background: #f9fafb; /* Maintain bg on scroll */
249
+ z-index: 1;
250
+ }
251
+
252
+ .detail-value {
253
+ font-weight: 500;
254
+ color: #1f2937;
255
+ font-size: 0.9em;
256
+ line-height: 1.4;
257
+ word-break: break-word; /* Fix overflow */
258
+ overflow-wrap: break-word;
259
+ } border: 1px solid #e5e7eb;
260
+ transition: all 0.2s;
261
+ }
262
+
263
+ .report-card:hover {
264
+ box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
265
+ }
266
+
267
+ /* Card Header */
268
+ .card-header {
269
+ display: flex;
270
+ justify-content: space-between;
271
+ align-items: flex-start;
272
+ margin-bottom: 12px;
273
+ border-bottom: 1px solid #f3f4f6;
274
+ padding-bottom: 12px;
275
+ }
276
+
277
+ .card-title {
278
+ font-size: 1.1em;
279
+ font-weight: 600;
280
+ color: #1f2937;
281
+ margin: 0;
282
+ }
283
+
284
+ .card-subtitle {
285
+ font-size: 0.9em;
286
+ color: #6b7280;
287
+ margin-top: 4px;
288
+ }
289
+
290
+ /* Status Badges */
291
+ .badge {
292
+ display: inline-flex;
293
+ align-items: center;
294
+ padding: 4px 10px;
295
+ border-radius: 9999px;
296
+ font-size: 0.8em;
297
+ font-weight: 500;
298
+ }
299
+
300
+ .badge-success { background-color: #dcfce7; color: #166534; }
301
+ .badge-warning { background-color: #fef9c3; color: #854d0e; }
302
+ .badge-error { background-color: #fee2e2; color: #991b1b; }
303
+ .badge-info { background-color: #dbeafe; color: #1e40af; }
304
+ .badge-neutral { background-color: #f3f4f6; color: #374151; }
305
+
306
+ /* Content Styling */
307
+ .card-content {
308
+ font-size: 15px;
309
+ color: #374151;
310
+ line-height: 1.6;
311
+ }
312
+
313
+ .card-content code {
314
+ background-color: #f3f4f6;
315
+ padding: 2px 6px;
316
+ border-radius: 4px;
317
+ font-family: monospace;
318
+ font-size: 0.9em;
319
+ color: #c2410c;
320
+ }
321
+
322
+ /* Grid for details */
323
+ .detail-grid {
324
+ display: grid;
325
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
326
+ gap: 12px;
327
+ margin-top: 12px;
328
+ }
329
+
330
+ .detail-item {
331
+ background: #f9fafb;
332
+ padding: 10px;
333
+ border-radius: 6px;
334
+ }
335
+
336
+ .detail-label {
337
+ font-size: 0.8em;
338
+ color: #6b7280;
339
+ text-transform: uppercase;
340
+ letter-spacing: 0.05em;
341
+ }
342
+
343
+ .detail-value {
344
+ font-weight: 500;
345
+ color: #111827;
346
+ }
347
+
348
+ /* Summary Stats */
349
+ .stats-container {
350
+ display: grid;
351
+ grid-template-columns: repeat(3, 1fr);
352
+ gap: 16px;
353
+ margin-bottom: 24px;
354
+ }
355
+
356
+ .stat-card {
357
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
358
+ color: white;
359
+ padding: 20px;
360
+ border-radius: 12px;
361
+ text-align: center;
362
+ box-shadow: 0 4px 6px rgba(102, 126, 234, 0.25);
363
+ }
364
+
365
+ .stat-value {
366
+ font-size: 2em;
367
+ font-weight: 700;
368
+ }
369
+
370
+ .stat-label {
371
+ font-size: 0.9em;
372
+ opacity: 0.9;
373
+ margin-top: 4px;
374
+ }
375
+
376
+ /* Button styling */
377
+ .primary-btn {
378
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
379
+ border: none !important;
380
+ font-weight: 600 !important;
381
+ }
382
+
383
+ /* Tab styling */
384
+ .tab-nav button {
385
+ font-weight: 500 !important;
386
+ font-size: 15px !important;
387
+ }
388
+ """
389
+
390
+
391
+ def create_config_from_ui(
392
+ check_metadata: bool,
393
+ check_usage: bool,
394
+ check_duplicates: bool,
395
+ check_preprint_ratio: bool,
396
+ caption: bool,
397
+ reference: bool,
398
+ formatting: bool,
399
+ equation: bool,
400
+ ai_artifacts: bool,
401
+ sentence: bool,
402
+ consistency: bool,
403
+ acronym: bool,
404
+ number: bool,
405
+ citation_quality: bool,
406
+ anonymization: bool
407
+ ) -> BibGuardConfig:
408
+ """Create a BibGuardConfig from UI settings."""
409
+ config = BibGuardConfig()
410
+
411
+ config.bibliography = BibliographyConfig(
412
+ check_metadata=check_metadata,
413
+ check_usage=check_usage,
414
+ check_duplicates=check_duplicates,
415
+ check_preprint_ratio=check_preprint_ratio,
416
+ check_relevance=False # Disabled for web
417
+ )
418
+
419
+ config.submission = SubmissionConfig(
420
+ caption=caption,
421
+ reference=reference,
422
+ formatting=formatting,
423
+ equation=equation,
424
+ ai_artifacts=ai_artifacts,
425
+ sentence=sentence,
426
+ consistency=consistency,
427
+ acronym=acronym,
428
+ number=number,
429
+ citation_quality=citation_quality,
430
+ anonymization=anonymization
431
+ )
432
+
433
+ config.output = OutputConfig(quiet=True, minimal_verified=False)
434
+
435
+ return config
436
+
437
+
438
+ def generate_bibliography_html(report_gen: ReportGenerator, entries: list) -> str:
439
+ """Generate HTML content for bibliography report."""
440
+ html = ['<div class="scrollable-report-area">']
441
+
442
+ # 1. Summary Stats
443
+ total = len(entries)
444
+ verified = sum(1 for e in report_gen.entries if e.comparison and e.comparison.is_match)
445
+ used = sum(1 for e in report_gen.entries if e.usage and e.usage.is_used)
446
+
447
+ html.append('<div class="stats-container">')
448
+ html.append(f'<div class="stat-card"><div class="stat-value">{total}</div><div class="stat-label">Total Entries</div></div>')
449
+ html.append(f'<div class="stat-card"><div class="stat-value">{verified}</div><div class="stat-label">Verified</div></div>')
450
+ html.append(f'<div class="stat-card"><div class="stat-value">{used}</div><div class="stat-label">Used in Text</div></div>')
451
+ html.append('</div>')
452
+
453
+ # 2. Entries
454
+ for report in report_gen.entries:
455
+ entry = report.entry
456
+ status_badges = []
457
+
458
+ # Metadata Status
459
+ if report.comparison:
460
+ if report.comparison.is_match:
461
+ status_badges.append('<span class="badge badge-success">✓ Verified</span>')
462
+ if report.comparison.source:
463
+ status_badges.append(f'<span class="badge badge-info">{report.comparison.source.upper()}</span>')
464
+ else:
465
+ status_badges.append('<span class="badge badge-error">⚠ Metadata Mismatch</span>')
466
+ else:
467
+ status_badges.append('<span class="badge badge-neutral">No Metadata Check</span>')
468
+
469
+ # Usage Status
470
+ if report.usage:
471
+ if report.usage.is_used:
472
+ status_badges.append(f'<span class="badge badge-success">Used: {report.usage.usage_count}x</span>')
473
+ else:
474
+ status_badges.append('<span class="badge badge-warning">Unused</span>')
475
+
476
+ # Build Card
477
+ html.append(f'''
478
+ <div class="report-card">
479
+ <div class="card-header">
480
+ <div>
481
+ <h3 class="card-title">{entry.title or "No Title"}</h3>
482
+ <div class="card-subtitle">{entry.key} • {entry.year} • {entry.entry_type}</div>
483
+ </div>
484
+ <div style="display: flex; gap: 8px;">
485
+ {" ".join(status_badges)}
486
+ </div>
487
+ </div>
488
+
489
+ <div class="card-content">
490
+ <div class="detail-grid">
491
+ {
492
+ (lambda e: "".join([
493
+ f'<div class="detail-item"><div class="detail-label">{k}</div><div class="detail-value">{v}</div></div>'
494
+ for k, v in filter(None, [
495
+ ("Authors", e.author or "N/A"),
496
+ ("Venue", e.journal or e.booktitle or e.publisher or "N/A"),
497
+ ("DOI", e.doi) if e.doi else None,
498
+ ("ArXiv", e.arxiv_id) if e.arxiv_id and not e.doi else None,
499
+ ("Volume/Pages", f"{'Vol.'+e.volume if e.volume else ''} {'pp.'+e.pages if e.pages else ''}".strip()) if e.volume or e.pages else None,
500
+ ("URL", f'<a href="{e.url}" target="_blank" style="text-decoration:underline;">Link</a>') if e.url else None
501
+ ])
502
+ ]))(entry)
503
+ }
504
+ </div>
505
+ ''')
506
+
507
+ # Add issues if any
508
+ issues = []
509
+ if report.comparison and not report.comparison.is_match:
510
+ # Add main message derived from match status
511
+ if report.comparison.issues:
512
+ for issue in report.comparison.issues:
513
+ issues.append(f'<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• {issue}</div>')
514
+ else:
515
+ issues.append(f'<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• Verification failed</div>')
516
+
517
+ if issues:
518
+ html.append('<div style="margin-top: 16px; padding-top: 12px; border-top: 1px solid #eee;">')
519
+ html.append("".join(issues))
520
+ html.append('</div>')
521
+
522
+ html.append('</div></div>') # Close card-content and report-card
523
+
524
+ html.append('</div>') # Close container
525
+ return "".join(html)
526
+
527
+ def generate_latex_html(results: list) -> str:
528
+ """Generate HTML for LaTeX quality check."""
529
+ from src.checkers import CheckSeverity
530
+
531
+ html = ['<div class="scrollable-report-area">']
532
+
533
+ # Stats
534
+ errors = sum(1 for r in results if r.severity == CheckSeverity.ERROR)
535
+ warnings = sum(1 for r in results if r.severity == CheckSeverity.WARNING)
536
+ infos = sum(1 for r in results if r.severity == CheckSeverity.INFO)
537
+
538
+ html.append('<div class="stats-container">')
539
+ html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #ef4444 0%, #b91c1c 100%);"><div class="stat-value">{errors}</div><div class="stat-label">Errors</div></div>')
540
+ html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%);"><div class="stat-value">{warnings}</div><div class="stat-label">Warnings</div></div>')
541
+ html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%);"><div class="stat-value">{infos}</div><div class="stat-label">Suggestions</div></div>')
542
+ html.append('</div>')
543
+
544
+ if not results:
545
+ html.append('<div class="report-card"><div class="card-content" style="text-align: center; padding: 40px; color: #166534; font-size: 1.2em;">✅ No issues found in LaTeX code!</div></div>')
546
+ else:
547
+ # Group by Checker
548
+ results.sort(key=lambda x: x.checker_name)
549
+ current_checker = None
550
+
551
+ for result in results:
552
+ badge_class = "badge-neutral"
553
+ if result.severity == CheckSeverity.ERROR: badge_class = "badge-error"
554
+ elif result.severity == CheckSeverity.WARNING: badge_class = "badge-warning"
555
+ elif result.severity == CheckSeverity.INFO: badge_class = "badge-info"
556
+
557
+ html.append(f'''
558
+ <div class="report-card">
559
+ <div class="card-header">
560
+ <div>
561
+ <h3 class="card-title">{result.checker_name}</h3>
562
+ <div class="card-subtitle">Line {result.line_number}</div>
563
+ </div>
564
+ <span class="badge {badge_class}">{result.severity.name}</span>
565
+ </div>
566
+ <div class="card-content">
567
+ {result.message}
568
+ {f'<div style="margin-top: 8px; background: #f3f4f6; padding: 8px; border-radius: 4px; font-family: monospace;">{result.line_content}</div>' if result.line_content else ''}
569
+ {f'<div style="margin-top: 8px; color: #166534;">💡 Suggestion: {result.suggestion}</div>' if result.suggestion else ''}
570
+ </div>
571
+ </div>
572
+ ''')
573
+
574
+ html.append('</div>')
575
+ return "".join(html)
576
+
577
+ def generate_line_html(content: str, results: list) -> str:
578
+ """Generate HTML for Line-by-Line report."""
579
+ # Build a dictionary of line_number -> list of issues
580
+ issues_by_line = {}
581
+ for r in results:
582
+ if r.line_number not in issues_by_line:
583
+ issues_by_line[r.line_number] = []
584
+ issues_by_line[r.line_number].append(r)
585
+
586
+ lines = content.split('\n')
587
+
588
+ html = ['<div class="scrollable-report-area">']
589
+
590
+ html.append('<div class="report-card"><div class="card-content">Issues are mapped to specific lines below.</div></div>')
591
+
592
+ for i, line in enumerate(lines, 1):
593
+ if i in issues_by_line:
594
+ # Highlight this line
595
+ line_issues = issues_by_line[i]
596
+
597
+ html.append(f'''
598
+ <div class="report-card" style="border-left: 4px solid #ef4444; padding: 12px;">
599
+ <div style="font-family: monospace; color: #6b7280; font-size: 0.9em; margin-bottom: 4px;">Line {i}</div>
600
+ <div style="font-family: monospace; background: #fee2e2; padding: 4px; border-radius: 4px; overflow-x: auto; white-space: pre;">{line}</div>
601
+ <div style="margin-top: 8px;">
602
+ ''')
603
+
604
+ for issue in line_issues:
605
+ html.append(f'<div style="color: #991b1b; font-size: 0.95em; margin-top: 4px;">• {issue.message}</div>')
606
+
607
+ html.append('</div></div>')
608
+
609
+ html.append('</div>')
610
+ return "".join(html)
611
+
612
+
613
+
614
+
615
+ def run_check(
616
+ bib_file,
617
+ tex_file,
618
+ check_metadata: bool,
619
+ check_usage: bool,
620
+ check_duplicates: bool,
621
+ check_preprint_ratio: bool,
622
+ caption: bool,
623
+ reference: bool,
624
+ formatting: bool,
625
+ equation: bool,
626
+ ai_artifacts: bool,
627
+ sentence: bool,
628
+ consistency: bool,
629
+ acronym: bool,
630
+ number: bool,
631
+ citation_quality: bool,
632
+ anonymization: bool,
633
+ progress=gr.Progress()
634
+ ) -> Tuple[str, str, str]:
635
+ """Run BibGuard checks and return three reports."""
636
+
637
+ if bib_file is None or tex_file is None:
638
+ return (
639
+ "⚠️ Please upload both `.bib` and `.tex` files.",
640
+ "⚠️ Please upload both `.bib` and `.tex` files.",
641
+ "⚠️ Please upload both `.bib` and `.tex` files."
642
+ )
643
+
644
+ try:
645
+ # Create config from UI
646
+ config = create_config_from_ui(
647
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
648
+ caption, reference, formatting, equation, ai_artifacts,
649
+ sentence, consistency, acronym, number, citation_quality, anonymization
650
+ )
651
+
652
+ # Get file paths from uploaded files
653
+ bib_path = bib_file.name
654
+ tex_path = tex_file.name
655
+
656
+ # Read tex content for checkers
657
+ tex_content = Path(tex_path).read_text(encoding='utf-8', errors='replace')
658
+
659
+ # Parse files
660
+ bib_parser = BibParser()
661
+ entries = bib_parser.parse_file(bib_path)
662
+
663
+ tex_parser = TexParser()
664
+ tex_parser.parse_file(tex_path)
665
+
666
+ bib_config = config.bibliography
667
+
668
+ # Initialize components
669
+ arxiv_fetcher = None
670
+ crossref_fetcher = None
671
+ semantic_scholar_fetcher = None
672
+ openalex_fetcher = None
673
+ dblp_fetcher = None
674
+ comparator = None
675
+ usage_checker = None
676
+ duplicate_detector = None
677
+
678
+ if bib_config.check_metadata:
679
+ arxiv_fetcher = ArxivFetcher()
680
+ semantic_scholar_fetcher = SemanticScholarFetcher()
681
+ openalex_fetcher = OpenAlexFetcher()
682
+ dblp_fetcher = DBLPFetcher()
683
+ crossref_fetcher = CrossRefFetcher()
684
+ comparator = MetadataComparator()
685
+
686
+ if bib_config.check_usage:
687
+ usage_checker = UsageChecker(tex_parser)
688
+
689
+ if bib_config.check_duplicates:
690
+ duplicate_detector = DuplicateDetector()
691
+
692
+ # Initialize report generator
693
+ report_gen = ReportGenerator(
694
+ minimal_verified=False,
695
+ check_preprint_ratio=bib_config.check_preprint_ratio,
696
+ preprint_warning_threshold=bib_config.preprint_warning_threshold
697
+ )
698
+ report_gen.set_metadata([bib_file.name], [tex_file.name])
699
+
700
+ # Run submission quality checks
701
+ progress(0.2, desc="Running LaTeX quality checks...")
702
+ submission_results = []
703
+ enabled_checkers = config.submission.get_enabled_checkers()
704
+
705
+ for checker_name in enabled_checkers:
706
+ if checker_name in CHECKER_REGISTRY:
707
+ checker = CHECKER_REGISTRY[checker_name]()
708
+ results = checker.check(tex_content, {})
709
+ for r in results:
710
+ r.file_path = tex_file.name
711
+ submission_results.extend(results)
712
+
713
+ report_gen.set_submission_results(submission_results, None)
714
+
715
+ # Check for duplicates
716
+ if bib_config.check_duplicates and duplicate_detector:
717
+ duplicate_groups = duplicate_detector.find_duplicates(entries)
718
+ report_gen.set_duplicate_groups(duplicate_groups)
719
+
720
+ # Check missing citations
721
+ if bib_config.check_usage and usage_checker:
722
+ missing = usage_checker.get_missing_entries(entries)
723
+ report_gen.set_missing_citations(missing)
724
+
725
+ # Build workflow
726
+ workflow_config = get_default_workflow()
727
+
728
+ # Process entries
729
+ progress(0.3, desc="Processing bibliography entries...")
730
+ total_entries = len(entries)
731
+
732
+ for i, entry in enumerate(entries):
733
+ progress(0.3 + 0.5 * (i / total_entries), desc=f"Checking: {entry.key}")
734
+
735
+ # Check usage
736
+ usage_result = None
737
+ if usage_checker:
738
+ usage_result = usage_checker.check_usage(entry)
739
+
740
+ # Fetch and compare metadata
741
+ comparison_result = None
742
+ if bib_config.check_metadata and comparator:
743
+ comparison_result = fetch_and_compare_with_workflow(
744
+ entry, workflow_config, arxiv_fetcher, crossref_fetcher,
745
+ semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
746
+ )
747
+
748
+ # Create entry report
749
+ entry_report = EntryReport(
750
+ entry=entry,
751
+ comparison=comparison_result,
752
+ usage=usage_result,
753
+ evaluations=[]
754
+ )
755
+ report_gen.add_entry_report(entry_report)
756
+
757
+ progress(0.85, desc="Generating structured reports...")
758
+
759
+ # Generate Bibliography HTML Report
760
+ bib_report = generate_bibliography_html(report_gen, entries)
761
+
762
+ # Generate LaTeX Quality HTML Report
763
+ latex_report = generate_latex_html(submission_results)
764
+
765
+ # Generate Line-by-Line HTML Report
766
+ line_report = ""
767
+ if submission_results:
768
+ line_report = generate_line_html(tex_content, submission_results)
769
+ else:
770
+ line_report = '<div class="report-container"><div class="report-card"><div class="card-content">No issues to display line-by-line.</div></div></div>'
771
+
772
+ progress(1.0, desc="Done!")
773
+
774
+ return bib_report, latex_report, line_report
775
+
776
+ except Exception as e:
777
+ error_msg = f"❌ Error: {str(e)}"
778
+ import traceback
779
+ error_msg += f"\n\n```\n{traceback.format_exc()}\n```"
780
+ return error_msg, error_msg, error_msg
781
+
782
+
783
+
784
+ def create_app():
785
+ """Create and configure the Gradio app."""
786
+
787
+ # Load icon as base64
788
+ icon_html = ""
789
+ try:
790
+ icon_path = Path("assets/icon-192.png")
791
+ if icon_path.exists():
792
+ with open(icon_path, "rb") as f:
793
+ encoding = base64.b64encode(f.read()).decode()
794
+ icon_html = f'<img src="data:image/png;base64,{encoding}" style="width: 48px; height: 48px; border-radius: 8px;" alt="BibGuard">'
795
+ else:
796
+ icon_html = '<span style="font-size: 48px;">📚</span>'
797
+ except Exception:
798
+ icon_html = '<span style="font-size: 48px;">📚</span>'
799
+
800
+ with gr.Blocks(title="BibGuard - Bibliography & LaTeX Quality Checker") as app:
801
+
802
+ # Header with icon
803
+ with gr.Row(elem_classes=["app-header"]):
804
+ gr.HTML(f"""
805
+ <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 16px;">
806
+ {icon_html}
807
+ <div>
808
+ <h1 style="margin: 0; font-size: 1.8em;">BibGuard</h1>
809
+ <p style="margin: 0; color: #666; font-size: 14px;">Bibliography & LaTeX Quality Checker</p>
810
+ </div>
811
+ </div>
812
+ """)
813
+
814
+ with gr.Row(elem_classes=["app-body"]):
815
+ # Left column: Upload & Settings
816
+ with gr.Column(scale=1, min_width=280, elem_classes=["app-sidebar"]):
817
+ gr.Markdown("### 📁 Upload Files")
818
+
819
+ bib_file = gr.File(
820
+ label="Bibliography (.bib)",
821
+ file_types=[".bib"],
822
+ file_count="single"
823
+ )
824
+
825
+ tex_file = gr.File(
826
+ label="LaTeX Source (.tex)",
827
+ file_types=[".tex"],
828
+ file_count="single"
829
+ )
830
+
831
+ # Check options in grid layout
832
+ gr.Markdown("#### ⚙️ Options")
833
+
834
+ with gr.Row():
835
+ check_metadata = gr.Checkbox(label="🔍 Metadata", value=False)
836
+ check_usage = gr.Checkbox(label="📊 Usage", value=True)
837
+
838
+ with gr.Row():
839
+ check_duplicates = gr.Checkbox(label="👯 Duplicates", value=True)
840
+ check_preprint_ratio = gr.Checkbox(label="📄 Preprints", value=True)
841
+
842
+ with gr.Row():
843
+ caption = gr.Checkbox(label="🖼️ Captions", value=True)
844
+ reference = gr.Checkbox(label="🔗 References", value=True)
845
+
846
+ with gr.Row():
847
+ formatting = gr.Checkbox(label="✨ Formatting", value=True)
848
+ equation = gr.Checkbox(label="🔢 Equations", value=True)
849
+
850
+ with gr.Row():
851
+ ai_artifacts = gr.Checkbox(label="🤖 AI Artifacts", value=True)
852
+ sentence = gr.Checkbox(label="📝 Sentences", value=True)
853
+
854
+ with gr.Row():
855
+ consistency = gr.Checkbox(label="🔄 Consistency", value=True)
856
+ acronym = gr.Checkbox(label="🔤 Acronyms", value=True)
857
+
858
+ with gr.Row():
859
+ number = gr.Checkbox(label="🔢 Numbers", value=True)
860
+ citation_quality = gr.Checkbox(label="📚 Citations", value=True)
861
+
862
+ with gr.Row():
863
+ anonymization = gr.Checkbox(label="🎭 Anonymization", value=True)
864
+
865
+ run_btn = gr.Button("🔍 Check Now", variant="primary", size="lg")
866
+
867
+ gr.HTML("""
868
+ <div style="text-align: center; margin-top: 16px;">
869
+ <a href="https://github.com/thinkwee/BibGuard" target="_blank" style="text-decoration: none; color: #666; display: inline-flex; align-items: center; gap: 6px;">
870
+ <svg height="20" width="20" viewBox="0 0 16 16"><path fill="currentColor" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"></path></svg>
871
+ GitHub
872
+ </a>
873
+ <p style="margin: 8px 0 0 0; color: #999; font-size: 12px;">Developed with ❤️ for researchers</p>
874
+ </div>
875
+ """)
876
+
877
+ # Right column: Reports
878
+ with gr.Column(scale=4, elem_classes=["app-content"]):
879
+ with gr.Tabs():
880
+ with gr.Tab("📚 Bibliography Report"):
881
+ bib_report = gr.HTML(
882
+ value=WELCOME_HTML,
883
+ elem_classes=["report-panel"]
884
+ )
885
+
886
+ with gr.Tab("📝 LaTeX Quality"):
887
+ latex_report = gr.HTML(
888
+ value=WELCOME_HTML,
889
+ elem_classes=["report-panel"]
890
+ )
891
+
892
+ with gr.Tab("📋 Line-by-Line"):
893
+ line_report = gr.HTML(
894
+ value=WELCOME_HTML,
895
+ elem_classes=["report-panel"]
896
+ )
897
+
898
+ # Event handling
899
+ run_btn.click(
900
+ fn=run_check,
901
+ inputs=[
902
+ bib_file, tex_file,
903
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
904
+ caption, reference, formatting, equation, ai_artifacts,
905
+ sentence, consistency, acronym, number, citation_quality, anonymization
906
+ ],
907
+ outputs=[bib_report, latex_report, line_report]
908
+ )
909
+
910
+ return app
911
+
912
+
913
+ # Create the app
914
+ app = create_app()
915
+
916
+ if __name__ == "__main__":
917
+ app.launch(
918
+ favicon_path="assets/icon-192.png",
919
+ show_error=True,
920
+ css=CUSTOM_CSS,
921
+ theme=gr.themes.Soft()
922
+ )
app_helper.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def fetch_and_compare_with_workflow(
    entry, workflow_steps, arxiv_fetcher, crossref_fetcher,
    semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
):
    """Fetch metadata from online sources and compare against a bib entry.

    Sources are tried in a fixed priority order — DBLP, Semantic Scholar,
    OpenAlex, CrossRef, arXiv. The first perfect match short-circuits and is
    returned immediately; otherwise the highest-confidence partial result
    wins. A source is skipped when its fetcher is None, and any exception
    from a single source is deliberately swallowed so one flaky service
    cannot abort the whole lookup (best-effort semantics).

    Args:
        entry: Parsed bibliography entry (reads ``title``, ``doi``,
            ``has_arxiv`` and ``arxiv_id``).
        workflow_steps: Configured workflow steps. NOTE(review): currently
            unused — the hard-coded order above is always applied; wire this
            up or drop the parameter once the workflow config stabilizes.
        arxiv_fetcher, crossref_fetcher, semantic_scholar_fetcher,
        openalex_fetcher, dblp_fetcher: Fetcher instances, or None to skip.
        comparator: MetadataComparator used to score each candidate.

    Returns:
        The best comparison result found, or the comparator's "unable"
        result when no source produced anything.
    """
    results = []

    # 1. DBLP (high-quality metadata for computer-science venues).
    if dblp_fetcher and entry.title:
        try:
            dblp_result = dblp_fetcher.search_by_title(entry.title)
            if dblp_result:
                res = comparator.compare_with_dblp(entry, dblp_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass  # best-effort: keep trying the remaining sources

    # 2. Semantic Scholar (broad coverage; DOI lookup first, then title).
    if semantic_scholar_fetcher and entry.title:
        try:
            ss_result = None
            if entry.doi:
                ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
            if not ss_result:
                ss_result = semantic_scholar_fetcher.search_by_title(entry.title)

            if ss_result:
                res = comparator.compare_with_semantic_scholar(entry, ss_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # 3. OpenAlex (cross-discipline coverage; DOI lookup first, then title).
    if openalex_fetcher and entry.title:
        try:
            oa_result = None
            if entry.doi:
                oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
            if not oa_result:
                oa_result = openalex_fetcher.search_by_title(entry.title)

            if oa_result:
                res = comparator.compare_with_openalex(entry, oa_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # 4. CrossRef (authoritative when the entry carries a DOI).
    if crossref_fetcher and entry.doi:
        try:
            crossref_result = crossref_fetcher.search_by_doi(entry.doi)
            if crossref_result:
                res = comparator.compare_with_crossref(entry, crossref_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # 5. arXiv (by ID when available, otherwise a single-result title search).
    if arxiv_fetcher:
        try:
            arxiv_meta = None
            if entry.has_arxiv:
                arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
            elif entry.title:
                search_results = arxiv_fetcher.search_by_title(entry.title, max_results=1)
                if search_results:
                    arxiv_meta = search_results[0]

            if arxiv_meta:
                res = comparator.compare_with_arxiv(entry, arxiv_meta)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # No perfect match anywhere: return the highest-confidence partial result
    # (max() keeps the first-encountered item on ties, matching the previous
    # stable sort-then-[0] behavior without sorting the whole list).
    if results:
        return max(results, key=lambda x: x.confidence)

    # Absolutely nothing found in any source.
    return comparator.create_unable_result(entry, "No metadata found in any source")
assets/icon-192.png ADDED

Git LFS Details

  • SHA256: 158c7c199e8e4978d2e8d6da90c4896022bf83436b0ab2c9b6285078cad60863
  • Pointer size: 131 Bytes
  • Size of remote file: 340 kB
assets/icon-512.png ADDED

Git LFS Details

  • SHA256: da47e48d79d2aae7f81cd1b04b39f0b7a66e760ee2338dfcdde36f66293f3ccf
  • Pointer size: 131 Bytes
  • Size of remote file: 313 kB
bibguard.yaml ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # BibGuard Configuration File
3
+ # ==============================================================================
4
+ #
5
+ # Usage: python main.py --config bibguard.yaml
6
+ # python main.py (auto-detect bibguard.yaml in current/parent directories)
7
+ #
8
+ # All paths are relative to this configuration file's directory.
9
+
10
+ # ==============================================================================
11
+ # 📁 File Settings
12
+ # ==============================================================================
13
+ files:
14
+ # Required: Path to your .bib bibliography file
15
+ bib: "test.bib"
16
+
17
+ # Required: Path to your .tex LaTeX source file
18
+ tex: "test.tex"
19
+
20
+ # Optional: Directory path for recursive scanning (Experimental)
21
+ # When set, BibGuard will recursively search for all .tex and .bib files in this directory.
22
+ # This mode is parallel to 'bib' and 'tex'. Use either this OR bib/tex.
23
+ # input_dir: "./paper_project"
24
+
25
+ # Output directory for all generated reports and files (default: bibguard_output)
26
+ # All outputs including reports, cleaned .bib, and input file copies will be saved here
27
+ output_dir: "test"
28
+
29
+
30
+ # ==============================================================================
31
+ # 🎓 Conference Template
32
+ # ==============================================================================
33
+ # Specify a conference template for venue-specific checks and formatting rules.
34
+ # Available templates: acl, emnlp, naacl, cvpr, iccv, eccv, neurips, icml, iclr
35
+ # Leave empty ("") to skip template-specific checks.
36
+ template: ""
37
+
38
+ # ==============================================================================
39
+ # 📚 Bibliography Checks
40
+ # ==============================================================================
41
+ bibliography:
42
+ # Metadata Validation - Verify bib entries against online databases (arXiv, CrossRef, etc.)
43
+ # Detects incorrect titles, authors, venues, and publication years
44
+ # ⚠️ It will take some time to check metadata since it needs to query multiple online sources. Make it to false if you don't need to check metadata.
45
+ check_metadata: true
46
+
47
+ # Usage Check - Detect unused bib entries and missing citations
48
+ # Identifies entries in .bib not cited in .tex, and citations without bib entries
49
+ check_usage: true
50
+
51
+ # Duplicate Detection - Find duplicate entries with different keys
52
+ # Uses fuzzy matching on titles and DOIs to identify the same paper cited multiple times
53
+ check_duplicates: true
54
+
55
+ # Preprint Ratio Check - Warn if too many references are preprints
56
+ # Detects arXiv, bioRxiv, and other preprints. Warns if ratio exceeds threshold.
57
+ check_preprint_ratio: true
58
+ preprint_warning_threshold: 0.50 # Warn if more than 50% of used entries are preprints
59
+
60
+ # Relevance Assessment - Use LLM to evaluate if citations match their context
61
+ # Requires LLM configuration (see llm section below). Disabled by default due to API costs.
62
+ check_relevance: false
63
+
64
+ # ==============================================================================
65
+ # 📋 Submission Quality Checks
66
+ # ==============================================================================
67
+ submission:
68
+ # ─────────────────────────────────────────────────────────────────────────────
69
+ # Format Checks
70
+ # ─────────────────────────────────────────────────────────────────────────────
71
+
72
+ # Caption Position - Ensure table captions are above, figure captions below
73
+ # Checks \caption placement relative to \begin{table}/\begin{figure}
74
+ caption: true
75
+
76
+ # Cross-References - Verify all figures/tables/sections are referenced in text
77
+ # Detects orphaned floats that are never mentioned
78
+ reference: true
79
+
80
+ # Formatting Standards - Check citation format, spacing, special characters
81
+ # Validates \cite{} usage, non-breaking spaces, proper quotation marks, etc.
82
+ formatting: true
83
+
84
+ # Equation Checks - Verify equation punctuation and numbering consistency
85
+ # Ensures equations end with proper punctuation and labels are used correctly
86
+ equation: true
87
+
88
+ # ─────────────────────────────────────────────────────────────────────────────
89
+ # Writing Quality
90
+ # ─────────────────────────────────────────────────────────────────────────────
91
+
92
+ # AI Artifacts - Detect traces of AI-generated text
93
+ # Flags phrases like "Sure, here is...", "As an AI...", "It's important to note..."
94
+ ai_artifacts: true
95
+
96
+ # Sentence Quality - Identify overly long sentences, weak openings, redundant phrases
97
+ # Helps improve readability and academic writing style
98
+ sentence: true
99
+
100
+ # Terminology Consistency - Check for inconsistent spelling, hyphenation, US/UK variants
101
+ # Examples: "deep learning" vs "deep-learning", "color" vs "colour"
102
+ consistency: true
103
+
104
+ # ─────────────────────────────────────────────────────────────────────────────
105
+ # Academic Standards
106
+ # ─────────────────────────────────────────────────────────────────────────────
107
+
108
+ # Acronym Definitions - Ensure acronyms are defined on first use
109
+ # Example: "Natural Language Processing (NLP)" before using "NLP" alone
110
+ acronym: true
111
+
112
+ # Number Formatting - Check percentage formatting consistency
113
+ # Ensures no space before % sign and consistent use of '%' vs 'percent'
114
+ number: true
115
+
116
+ # Citation Quality - Flag outdated references and citation formatting issues
117
+ # Warns about papers older than 30 years and checks citation formatting (et al., hardcoded citations)
118
+ citation_quality: true
119
+
120
+ # ─────────────────────────────────────────────────────────────────────────────
121
+ # Review Compliance
122
+ # ─────────────────────────────────────────────────────────────────────────────
123
+
124
+ # Anonymization - Check double-blind review compliance
125
+ # Detects GitHub links, acknowledgments, self-citations that may reveal author identity
126
+ anonymization: true
127
+
128
+ # ==============================================================================
129
+ # 🔍 Metadata Check Workflow
130
+ # ==============================================================================
131
+ # Define the data sources and order for metadata validation.
132
+ # BibGuard will try each enabled source in sequence until a match is found.
133
+ # Set enabled: false to skip a particular source.
134
+ workflow:
135
+ - name: arxiv_id
136
+ enabled: true
137
+ description: "Lookup by arXiv ID (fastest, most reliable for preprints)"
138
+
139
+ - name: crossref_doi
140
+ enabled: true
141
+ description: "Lookup by DOI via CrossRef (authoritative for published papers)"
142
+
143
+ - name: semantic_scholar
144
+ enabled: true
145
+ description: "Semantic Scholar API (good coverage, includes citations)"
146
+
147
+ - name: dblp
148
+ enabled: true
149
+ description: "DBLP database (comprehensive for computer science papers)"
150
+
151
+ - name: openalex
152
+ enabled: true
153
+ description: "OpenAlex API (broad coverage across disciplines)"
154
+
155
+ - name: arxiv_title
156
+ enabled: true
157
+ description: "Search arXiv by title (fallback when ID unavailable)"
158
+
159
+ - name: crossref_title
160
+ enabled: true
161
+ description: "Search CrossRef by title (fallback when DOI unavailable)"
162
+
163
+ - name: google_scholar
164
+ enabled: false # May be rate-limited, disabled by default
165
+ description: "Google Scholar web scraping (use as last resort)"
166
+
167
+ # ==============================================================================
168
+ # 🤖 LLM Configuration (for Relevance Checking)
169
+ # ==============================================================================
170
+ llm:
171
+ # Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek
172
+ # Each backend requires different setup (API keys, local installation, etc.)
173
+ backend: "gemini"
174
+
175
+ # Model name (leave empty to use backend default)
176
+ # Examples: "gpt-4", "claude-3-opus", "gemini-pro", "llama3"
177
+ model: ""
178
+
179
+ # API endpoint (leave empty to use backend default)
180
+ # Only needed for self-hosted models (vllm, ollama) or custom endpoints
181
+ endpoint: ""
182
+
183
+ # API key (recommended to use environment variables instead)
184
+ # Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. in your environment
185
+ api_key: ""
186
+
187
+ # ==============================================================================
188
+ # 📊 Output Settings
189
+ # ==============================================================================
190
+ output:
191
+ # Quiet mode - Suppress progress messages, only output final reports
192
+ # Useful for CI/CD pipelines or batch processing
193
+ quiet: false
194
+
195
+ # Minimal verified entries - Hide detailed info for entries that passed all checks
196
+ # Reduces report size when you only care about issues
197
+ minimal_verified: false
main.py ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ BibGuard - Bibliography Checker & Paper Submission Quality Tool
4
+
5
+ Usage:
6
+ python main.py # Use bibguard.yaml in current directory
7
+ python main.py --config my.yaml # Use specified config file
8
+ python main.py --init # Create default config file
9
+ python main.py --list-templates # List available templates
10
+ """
11
+ import argparse
12
+ import sys
13
+ from pathlib import Path
14
+ from typing import Optional, List
15
+
16
+ from src.parsers import BibParser, TexParser
17
+ from src.fetchers import ArxivFetcher, ScholarFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher
18
+ from src.analyzers import MetadataComparator, UsageChecker, LLMEvaluator, DuplicateDetector
19
+ from src.analyzers.llm_evaluator import LLMBackend
20
+ from src.report.generator import ReportGenerator, EntryReport
21
+ from src.utils.progress import ProgressDisplay
22
+ from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config
23
+ from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
24
+ from src.templates.base_template import get_template, get_all_templates
25
+ from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity
26
+
27
+
28
def main():
    """CLI entry point for BibGuard.

    Parses command-line arguments, handles the --init / --list-templates
    shortcuts, locates and loads the YAML config, validates the configured
    input files (directory-scan mode or explicit bib/tex mode), resolves the
    optional conference template, and finally delegates to run_checker().

    Exits with status 0 on success paths, 1 on any configuration/validation
    error, and 130 on Ctrl-C (conventional SIGINT exit code).
    """
    parser = argparse.ArgumentParser(
        description="BibGuard: Bibliography Checker & Paper Submission Quality Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Usage Examples:
  python main.py                     # Auto-detect config.yaml in current directory
  python main.py --config my.yaml    # Use specified config file
  python main.py --init              # Create default config.yaml
  python main.py --list-templates    # List available conference templates
"""
    )

    parser.add_argument(
        "--config", "-c",
        help="Config file path (default: auto-detect config.yaml)"
    )
    parser.add_argument(
        "--init",
        action="store_true",
        help="Create default config.yaml in current directory"
    )
    parser.add_argument(
        "--list-templates",
        action="store_true",
        help="List all available conference templates"
    )

    args = parser.parse_args()

    # Handle --init: write a starter config file and exit successfully.
    if args.init:
        output = create_default_config()
        print(f"✓ Created configuration file: {output}")
        print("")
        print(" Next steps:")
        print(" 1. Edit the 'bib' and 'tex' paths in config.yaml")
        print(" 2. Run: python main.py --config config.yaml")
        print("")
        sys.exit(0)

    # Handle --list-templates: print the known templates and exit.
    # Imported lazily so the UI module is only loaded when needed.
    if args.list_templates:
        from src.ui.template_selector import list_templates
        list_templates()
        sys.exit(0)

    # Resolve the config path: explicit --config wins, otherwise auto-detect.
    config_path = args.config
    if not config_path:
        found = find_config_file()
        if found:
            config_path = str(found)
        else:
            print("Error: Config file not found")
            print("")
            print("Please run 'python main.py --init' to create config.yaml")
            print("Or use 'python main.py --config <path>' to specify a config file")
            print("")
            sys.exit(1)

    try:
        config = load_config(config_path)
    except FileNotFoundError:
        print(f"Error: Config file does not exist: {config_path}")
        sys.exit(1)
    except Exception as e:
        # Any parse/validation failure in the YAML layer is fatal.
        print(f"Error: Failed to parse config file: {e}")
        sys.exit(1)

    # Two input modes: directory scan (files.input_dir set) or explicit
    # single bib/tex file paths.
    mode_dir = bool(config.files.input_dir)

    if mode_dir:
        input_dir = config.input_dir_path
        if not input_dir.exists() or not input_dir.is_dir():
            print(f"Error: Input directory does not exist or is not a directory: {input_dir}")
            sys.exit(1)

        # Recursively collect all .tex/.bib files under the input directory.
        tex_files = list(input_dir.rglob("*.tex"))
        bib_files = list(input_dir.rglob("*.bib"))

        if not tex_files:
            print(f"Error: No .tex files found in {input_dir}")
            sys.exit(1)
        if not bib_files:
            print(f"Error: No .bib files found in {input_dir}")
            sys.exit(1)

        # Stash resolved file lists on the config object; run_checker()
        # reads these private attributes.
        config._tex_files = tex_files
        config._bib_files = bib_files
    else:
        if not config.files.bib:
            print("Error: bib file path not specified in config")
            sys.exit(1)
        if not config.files.tex:
            print("Error: tex file path not specified in config")
            sys.exit(1)

        # Validate files exist
        if not config.bib_path.exists():
            print(f"Error: Bib file does not exist: {config.bib_path}")
            sys.exit(1)
        if not config.tex_path.exists():
            print(f"Error: TeX file does not exist: {config.tex_path}")
            sys.exit(1)

        config._tex_files = [config.tex_path]
        config._bib_files = [config.bib_path]

    # Load template if specified; an unknown template name is a hard error.
    template = None
    if config.template:
        template = get_template(config.template)
        if not template:
            print(f"Error: Unknown template: {config.template}")
            print("Use --list-templates to see available templates")
            sys.exit(1)

    # Run the checker. KeyboardInterrupt maps to exit code 130; any other
    # failure prints a traceback for debugging and exits 1.
    try:
        run_checker(config, template)
    except KeyboardInterrupt:
        print("\n\nCancelled")
        sys.exit(130)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
158
+
159
+
160
def run_checker(config: BibGuardConfig, template=None):
    """Run the bibliography checker with the given configuration.

    Pipeline (in order):
      1. Parse all configured .bib and .tex files, merging citations/keys
         from multiple TeX files into a single TexParser view.
      2. Conditionally construct fetchers/analyzers based on the
         `bibliography` config flags (metadata, usage, duplicates, relevance).
      3. Run enabled submission-quality checkers over each TeX file.
      4. Process every bib entry in a thread pool (usage check, online
         metadata comparison, per-context LLM relevance evaluation).
      5. Write reports and a cleaned "only used" .bib into the output dir.

    Args:
        config: Fully validated BibGuardConfig; main() must have populated
            config._tex_files and config._bib_files before calling.
        template: Optional conference template object (or None).
    """
    progress = ProgressDisplay()

    # Show config info (minimal)
    if template:
        pass  # Skip printing header/info here to keep output clean

    # Parse files (silent). Entries from all bib files are concatenated.
    bib_parser = BibParser()
    entries = []
    for bib_path in config._bib_files:
        entries.extend(bib_parser.parse_file(str(bib_path)))

    tex_parser = TexParser()
    tex_contents = {}        # str path -> raw file content
    merged_citations = {}    # citation key -> list of occurrences across files
    merged_all_keys = set()  # union of cited keys across all tex files

    for tex_path in config._tex_files:
        cits = tex_parser.parse_file(str(tex_path))
        # Accumulate citations per key across files.
        for k, v in cits.items():
            if k not in merged_citations:
                merged_citations[k] = []
            merged_citations[k].extend(v)
        # Accumulate cited keys.
        merged_all_keys.update(tex_parser.get_all_cited_keys())
        # Store raw content for the submission checkers / line reports.
        # errors='replace' keeps going on bad encodings instead of raising.
        tex_contents[str(tex_path)] = tex_path.read_text(encoding='utf-8', errors='replace')

    # Inject merged data back into parser for components that use it
    # (UsageChecker reads these attributes).
    tex_parser.citations = merged_citations
    tex_parser.all_keys = merged_all_keys

    # Initialize components based on config; anything left as None is
    # treated as "feature disabled" downstream.
    bib_config = config.bibliography

    arxiv_fetcher = None
    crossref_fetcher = None
    scholar_fetcher = None
    semantic_scholar_fetcher = None
    openalex_fetcher = None
    dblp_fetcher = None
    comparator = None
    usage_checker = None
    llm_evaluator = None
    duplicate_detector = None

    # arXiv is needed both for metadata comparison and for abstract
    # retrieval during relevance evaluation.
    if bib_config.check_metadata or bib_config.check_relevance:
        arxiv_fetcher = ArxivFetcher()

    if bib_config.check_metadata:
        semantic_scholar_fetcher = SemanticScholarFetcher()
        openalex_fetcher = OpenAlexFetcher()
        dblp_fetcher = DBLPFetcher()
        crossref_fetcher = CrossRefFetcher()
        scholar_fetcher = ScholarFetcher()
        comparator = MetadataComparator()

    if bib_config.check_usage:
        usage_checker = UsageChecker(tex_parser)

    if bib_config.check_duplicates:
        duplicate_detector = DuplicateDetector()

    if bib_config.check_relevance:
        llm_config = config.llm
        # `or None` normalizes empty-string config values to None so the
        # evaluator falls back to its backend defaults.
        backend = LLMBackend(llm_config.backend)
        llm_evaluator = LLMEvaluator(
            backend=backend,
            endpoint=llm_config.endpoint or None,
            model=llm_config.model or None,
            api_key=llm_config.api_key or None
        )

        # Test LLM connection (silent)
        llm_evaluator.test_connection()

        # Relevance evaluation needs citation contexts, so ensure a usage
        # checker exists even when check_usage itself is disabled.
        if not usage_checker:
            usage_checker = UsageChecker(tex_parser)

    # Initialize report generator
    report_gen = ReportGenerator(
        minimal_verified=config.output.minimal_verified,
        check_preprint_ratio=config.bibliography.check_preprint_ratio,
        preprint_warning_threshold=config.bibliography.preprint_warning_threshold
    )
    report_gen.set_metadata(
        [str(f) for f in config._bib_files],
        [str(f) for f in config._tex_files]
    )

    # Run submission quality checks: every enabled checker over every file.
    submission_results = []
    enabled_checkers = config.submission.get_enabled_checkers()

    for checker_name in enabled_checkers:
        if checker_name in CHECKER_REGISTRY:
            checker = CHECKER_REGISTRY[checker_name]()
            for tex_path_str, content in tex_contents.items():
                results = checker.check(content, {})
                # Tag results with file path so multi-file reports can
                # attribute findings to the right source.
                for r in results:
                    r.file_path = tex_path_str
                submission_results.extend(results)

    # Set results in report generator for summary calculation
    report_gen.set_submission_results(submission_results, template)

    # Check for duplicates (silent)
    if bib_config.check_duplicates and duplicate_detector:
        duplicate_groups = duplicate_detector.find_duplicates(entries)
        report_gen.set_duplicate_groups(duplicate_groups)

    # Check missing citations (silent)
    if bib_config.check_usage and usage_checker:
        missing = usage_checker.get_missing_entries(entries)
        report_gen.set_missing_citations(missing)

    # Process entries

    # Build workflow from config.
    # NOTE(review): these names are already imported at module level; this
    # local re-import is redundant but harmless.
    from src.config.workflow import WorkflowConfig, get_default_workflow, WorkflowStep as WFStep
    workflow_config = get_default_workflow()
    if config.workflow:
        # Translate user-defined workflow steps; list position becomes the
        # step priority.
        workflow_config = WorkflowConfig(
            steps=[
                WFStep(
                    name=step.name,
                    display_name=step.name,
                    description=step.description,
                    enabled=step.enabled,
                    priority=i
                )
                for i, step in enumerate(config.workflow)
            ]
        )

    # Process entries in parallel for metadata checks
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import threading

    # Thread-safe progress tracking
    progress_lock = threading.Lock()
    completed_count = [0]  # Use list for mutability in closure

    def process_single_entry(entry):
        """Process a single entry (thread-safe).

        Returns (EntryReport, comparison_result-or-None). Runs inside a
        worker thread; only touches per-entry state plus read-only
        checkers/fetchers.
        """
        # Check usage
        usage_result = None
        if usage_checker:
            usage_result = usage_checker.check_usage(entry)

        # Fetch and compare metadata
        comparison_result = None
        if bib_config.check_metadata and comparator:
            comparison_result = fetch_and_compare_with_workflow(
                entry, workflow_config, arxiv_fetcher, crossref_fetcher,
                scholar_fetcher, semantic_scholar_fetcher, openalex_fetcher,
                dblp_fetcher, comparator
            )

        # LLM evaluation (keep sequential per entry): one evaluation per
        # citation context, but only when an abstract could be found.
        evaluations = []
        if bib_config.check_relevance and llm_evaluator:
            if usage_result and usage_result.is_used:
                abstract = get_abstract(entry, comparison_result, arxiv_fetcher)
                if abstract:
                    for ctx in usage_result.contexts:
                        eval_result = llm_evaluator.evaluate(
                            entry.key, ctx.full_context, abstract
                        )
                        eval_result.line_number = ctx.line_number
                        eval_result.file_path = ctx.file_path
                        evaluations.append(eval_result)

        # Create entry report
        entry_report = EntryReport(
            entry=entry,
            comparison=comparison_result,
            usage=usage_result,
            evaluations=evaluations
        )

        return entry_report, comparison_result

    # Determine number of workers (max 10 to avoid overwhelming APIs)
    max_workers = min(10, len(entries))

    with progress.progress_context(len(entries), "Processing bibliography") as prog:
        # Use ThreadPoolExecutor for parallel processing
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries}

            # Process completed tasks as they finish (order is arbitrary).
            for future in as_completed(future_to_entry):
                entry = future_to_entry[future]
                try:
                    entry_report, comparison_result = future.result()

                    # Thread-safe progress update: report accumulation and
                    # progress marks happen under one lock.
                    with progress_lock:
                        report_gen.add_entry_report(entry_report)

                        # Update progress: match -> success, issues ->
                        # warning, anything else (incl. no comparison) -> error.
                        if comparison_result and comparison_result.is_match:
                            prog.mark_success()
                        elif comparison_result and comparison_result.has_issues:
                            prog.mark_warning()
                        else:
                            prog.mark_error()

                        completed_count[0] += 1
                        prog.update(entry.key, "Done", 1)

                except Exception as e:
                    # A worker crashed for this entry; record the failure
                    # but keep processing the rest.
                    with progress_lock:
                        prog.mark_error()
                        progress.print_error(f"Error processing {entry.key}: {e}")
                        completed_count[0] += 1
                        prog.update(entry.key, "Failed", 1)

    # Summary will be printed at the very end

    # Generate reports and organize outputs (silent)

    # Create output directory
    output_dir = config.output_dir_path
    output_dir.mkdir(parents=True, exist_ok=True)

    # Copy input files to output directory so the report bundle is
    # self-contained.
    import shutil
    for bib_path in config._bib_files:
        shutil.copy2(bib_path, output_dir / bib_path.name)
    for tex_path in config._tex_files:
        shutil.copy2(tex_path, output_dir / tex_path.name)
    # 1. Bibliography Report
    bib_report_path = output_dir / "bibliography_report.md"
    report_gen.save_bibliography_report(str(bib_report_path))

    # 2. LaTeX Quality Report (only when any checker produced findings)
    if submission_results:
        latex_report_path = output_dir / "latex_quality_report.md"
        report_gen.save_latex_quality_report(
            str(latex_report_path),
            submission_results,
            template
        )

    # 3. Line-by-Line Report
    # NOTE(review): generate_line_report is imported but never called here;
    # the per-file LineByLineReportGenerator below is used instead.
    from src.report.line_report import generate_line_report
    line_report_path = output_dir / "line_by_line_report.md"

    # For multiple files, we generate one big report with sections
    all_line_reports = []
    for tex_path_str, content in tex_contents.items():
        file_results = [r for r in submission_results if r.file_path == tex_path_str]
        if not file_results:
            continue

        from src.report.line_report import LineByLineReportGenerator
        gen = LineByLineReportGenerator(content, tex_path_str)
        gen.add_results(file_results)
        all_line_reports.append(gen.generate())

    if all_line_reports:
        with open(line_report_path, 'w', encoding='utf-8') as f:
            f.write("\n\n".join(all_line_reports))

    # 4. Clean bib file (if generated earlier): a .bib containing only the
    # entries actually cited in the TeX sources.
    if bib_config.check_usage and usage_checker:
        used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used]
        if used_entries:
            try:
                keys_to_keep = {entry.key for entry in used_entries}
                # If multiple bibs, we merge them into one cleaned file
                # or just use the first one if it's single mode.
                # For now, let's just use a default name if multiple.
                if len(config._bib_files) == 1:
                    clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib"
                    bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep)
                else:
                    clean_bib_path = output_dir / "merged_only_used.bib"
                    # We need a way to filter multiple files into one.
                    # BibParser.filter_file currently takes one input.
                    # Let's just write all used entries to a new file.
                    with open(clean_bib_path, 'w', encoding='utf-8') as f:
                        for entry in used_entries:
                            f.write(entry.raw + "\n\n")
            except Exception as e:
                # NOTE(review): deliberate best-effort — a failure to write
                # the cleaned bib must not abort the run, but this silently
                # discards the error; consider at least logging it.
                pass

    # Print beautiful console summary
    if not config.output.quiet:
        bib_stats, latex_stats = report_gen.get_summary_stats()
        progress.print_detailed_summary(bib_stats, latex_stats, str(output_dir.absolute()))
458
+
459
+
460
def fetch_and_compare_with_workflow(
    entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher,
    semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
):
    """Fetch metadata from online sources using the configured workflow.

    Tries each enabled workflow step in order. A step only runs when the
    entry has the data it needs (arXiv id, DOI, or title) and the matching
    fetcher was constructed. The first comparison whose `is_match` is true
    is returned immediately; otherwise the highest-confidence non-matching
    result is returned, and if no source produced anything, an explicit
    "unable" result is created.

    Args:
        entry: Bib entry to verify (provides arxiv_id/doi/title attributes).
        workflow_config: Ordered step configuration; only enabled steps run.
        *_fetcher: Per-source fetchers; any may be None (step is skipped).
        comparator: MetadataComparator used to diff entry vs fetched data.

    Returns:
        A comparison result object from the comparator (never None).
    """
    from src.utils.normalizer import TextNormalizer

    all_results = []  # every non-None comparison, kept for the fallback pick
    enabled_steps = workflow_config.get_enabled_steps()

    for step in enabled_steps:
        result = None

        # Direct arXiv lookup by identifier (strongest signal when present).
        if step.name == "arxiv_id" and entry.has_arxiv and arxiv_fetcher:
            arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
            if arxiv_meta:
                result = comparator.compare_with_arxiv(entry, arxiv_meta)

        # CrossRef lookup by DOI.
        elif step.name == "crossref_doi" and entry.doi and crossref_fetcher:
            crossref_result = crossref_fetcher.search_by_doi(entry.doi)
            if crossref_result:
                result = comparator.compare_with_crossref(entry, crossref_result)

        # Semantic Scholar: prefer DOI lookup, fall back to title search.
        elif step.name == "semantic_scholar" and entry.title and semantic_scholar_fetcher:
            ss_result = None
            if entry.doi:
                ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
            if not ss_result:
                ss_result = semantic_scholar_fetcher.search_by_title(entry.title)
            if ss_result:
                result = comparator.compare_with_semantic_scholar(entry, ss_result)

        # DBLP title search.
        elif step.name == "dblp" and entry.title and dblp_fetcher:
            dblp_result = dblp_fetcher.search_by_title(entry.title)
            if dblp_result:
                result = comparator.compare_with_dblp(entry, dblp_result)

        # OpenAlex: prefer DOI lookup, fall back to title search.
        elif step.name == "openalex" and entry.title and openalex_fetcher:
            oa_result = None
            if entry.doi:
                oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
            if not oa_result:
                oa_result = openalex_fetcher.search_by_title(entry.title)
            if oa_result:
                result = comparator.compare_with_openalex(entry, oa_result)

        # arXiv title search: fetch a few candidates and keep the one whose
        # normalized title is most similar; require similarity > 0.5 to
        # avoid comparing against an unrelated paper.
        elif step.name == "arxiv_title" and entry.title and arxiv_fetcher:
            results = arxiv_fetcher.search_by_title(entry.title, max_results=3)
            if results:
                best_result = None
                best_sim = 0.0
                norm1 = TextNormalizer.normalize_for_comparison(entry.title)

                for r in results:
                    norm2 = TextNormalizer.normalize_for_comparison(r.title)
                    sim = TextNormalizer.similarity_ratio(norm1, norm2)
                    if sim > best_sim:
                        best_sim = sim
                        best_result = r

                if best_result and best_sim > 0.5:
                    result = comparator.compare_with_arxiv(entry, best_result)

        # CrossRef title search (weaker than the DOI path above).
        elif step.name == "crossref_title" and entry.title and crossref_fetcher:
            crossref_result = crossref_fetcher.search_by_title(entry.title)
            if crossref_result:
                result = comparator.compare_with_crossref(entry, crossref_result)

        # Google Scholar title search (last resort).
        elif step.name == "google_scholar" and entry.title and scholar_fetcher:
            scholar_result = scholar_fetcher.search_by_title(entry.title)
            if scholar_result:
                result = comparator.compare_with_scholar(entry, scholar_result)

        if result:
            all_results.append(result)
            # Early exit on the first confirmed match; later sources are
            # only consulted when earlier ones disagree or fail.
            if result.is_match:
                return result

    # No source matched: return the most confident partial result, if any.
    if all_results:
        all_results.sort(key=lambda r: r.confidence, reverse=True)
        return all_results[0]

    return comparator.create_unable_result(entry, "Unable to find this paper in any data source")
543
+
544
+
545
def get_abstract(entry, comparison_result, arxiv_fetcher):
    """Return the best available abstract for *entry*, or "" if none found.

    Lookup order: the abstract stored on the bib entry itself, then an
    arXiv fetch by identifier (when the entry carries one), then a single
    arXiv title search. The *comparison_result* parameter is accepted for
    interface compatibility and is not consulted.
    """
    # 1) Abstract already present on the entry — cheapest path.
    stored = entry.abstract
    if stored:
        return stored

    if arxiv_fetcher:
        # 2) Direct arXiv lookup via the entry's identifier.
        if entry.has_arxiv:
            meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
            if meta and meta.abstract:
                return meta.abstract

        # 3) Fall back to an arXiv title search, taking only the top hit.
        if entry.title:
            hits = arxiv_fetcher.search_by_title(entry.title, max_results=1)
            if hits and hits[0].abstract:
                return hits[0].abstract

    return ""
561
+
562
+
563
+ if __name__ == "__main__":
564
+ main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ bibtexparser>=1.4.0
3
+ requests>=2.31.0
4
+ beautifulsoup4>=4.12.0
5
+ rich>=13.7.0
6
+ Unidecode>=1.3.0
7
+ lxml>=5.0.0
8
+ PyYAML>=6.0
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Bibliography Checker Package"""
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (202 Bytes). View file
 
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (190 Bytes). View file
 
src/analyzers/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Analyzers package"""
2
+ from .metadata_comparator import MetadataComparator
3
+ from .usage_checker import UsageChecker
4
+ from .llm_evaluator import LLMEvaluator
5
+ from .duplicate_detector import DuplicateDetector
6
+
7
+ __all__ = ['MetadataComparator', 'UsageChecker', 'LLMEvaluator', 'DuplicateDetector']
src/analyzers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (464 Bytes). View file
 
src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc ADDED
Binary file (8.29 kB). View file
 
src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc ADDED
Binary file (5.4 kB). View file
 
src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc ADDED
Binary file (14.3 kB). View file
 
src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc ADDED
Binary file (18.9 kB). View file
 
src/analyzers/__pycache__/retraction_checker.cpython-313.pyc ADDED
Binary file (4.94 kB). View file
 
src/analyzers/__pycache__/url_validator.cpython-313.pyc ADDED
Binary file (8.3 kB). View file
 
src/analyzers/__pycache__/usage_checker.cpython-313.pyc ADDED
Binary file (4.4 kB). View file
 
src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc ADDED
Binary file (13.3 kB). View file
 
src/analyzers/duplicate_detector.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Duplicate entry detector for bibliography files.
3
+ Uses fuzzy matching to find potential duplicates.
4
+ """
5
+ from dataclasses import dataclass
6
+ from typing import List, Tuple
7
+
8
+ from ..parsers.bib_parser import BibEntry
9
+ from ..utils.normalizer import TextNormalizer
10
+
11
+
12
+ @dataclass
13
+ class DuplicateGroup:
14
+ """A group of potentially duplicate entries."""
15
+ entries: List[BibEntry]
16
+ similarity_score: float
17
+ reason: str
18
+
19
+ @property
20
+ def entry_keys(self) -> List[str]:
21
+ return [e.key for e in self.entries]
22
+
23
+
24
+ class DuplicateDetector:
25
+ """Detects duplicate bibliography entries using fuzzy matching."""
26
+
27
+ # Thresholds for duplicate detection
28
+ TITLE_SIMILARITY_THRESHOLD = 0.85
29
+ COMBINED_SIMILARITY_THRESHOLD = 0.80
30
+
31
+ def __init__(self):
32
+ self.normalizer = TextNormalizer
33
+
34
+ def find_duplicates(self, entries: List[BibEntry]) -> List[DuplicateGroup]:
35
+ """
36
+ Find all duplicate groups in the bibliography.
37
+
38
+ Returns:
39
+ List of DuplicateGroup objects, each containing 2+ similar entries.
40
+ """
41
+ duplicates = []
42
+ processed = set()
43
+
44
+ for i, entry1 in enumerate(entries):
45
+ if entry1.key in processed:
46
+ continue
47
+
48
+ # Find all entries similar to this one
49
+ similar_entries = [entry1]
50
+
51
+ for j, entry2 in enumerate(entries[i+1:], start=i+1):
52
+ if entry2.key in processed:
53
+ continue
54
+
55
+ similarity, reason = self._calculate_similarity(entry1, entry2)
56
+
57
+ if similarity >= self.COMBINED_SIMILARITY_THRESHOLD:
58
+ similar_entries.append(entry2)
59
+ processed.add(entry2.key)
60
+
61
+ # If we found duplicates, create a group
62
+ if len(similar_entries) > 1:
63
+ processed.add(entry1.key)
64
+
65
+ # Calculate average similarity for the group
66
+ avg_similarity = self._calculate_group_similarity(similar_entries)
67
+ reason = self._generate_reason(similar_entries)
68
+
69
+ duplicates.append(DuplicateGroup(
70
+ entries=similar_entries,
71
+ similarity_score=avg_similarity,
72
+ reason=reason
73
+ ))
74
+
75
+ # Sort by similarity score (highest first)
76
+ duplicates.sort(key=lambda g: g.similarity_score, reverse=True)
77
+
78
+ return duplicates
79
+
80
+ def _calculate_similarity(self, entry1: BibEntry, entry2: BibEntry) -> Tuple[float, str]:
81
+ """
82
+ Calculate similarity between two entries.
83
+
84
+ Returns:
85
+ (similarity_score, reason_string)
86
+ """
87
+ # Normalize titles
88
+ title1 = self.normalizer.normalize_for_comparison(entry1.title)
89
+ title2 = self.normalizer.normalize_for_comparison(entry2.title)
90
+
91
+ # Calculate title similarity
92
+ title_sim = self.normalizer.similarity_ratio(title1, title2)
93
+
94
+ # If titles are very similar, likely a duplicate
95
+ if title_sim >= self.TITLE_SIMILARITY_THRESHOLD:
96
+ return title_sim, "Very similar titles"
97
+
98
+ # Check author similarity
99
+ author_sim = self._calculate_author_similarity(entry1, entry2)
100
+
101
+ # Combined score: weighted average
102
+ # Title is more important (70%) than authors (30%)
103
+ combined_sim = 0.7 * title_sim + 0.3 * author_sim
104
+
105
+ if combined_sim >= self.COMBINED_SIMILARITY_THRESHOLD:
106
+ return combined_sim, f"Similar title ({title_sim:.0%}) and authors ({author_sim:.0%})"
107
+
108
+ return combined_sim, ""
109
+
110
+ def _calculate_author_similarity(self, entry1: BibEntry, entry2: BibEntry) -> float:
111
+ """Calculate similarity between author lists."""
112
+ # Parse author strings
113
+ authors1 = self._parse_authors(entry1.author)
114
+ authors2 = self._parse_authors(entry2.author)
115
+
116
+ if not authors1 or not authors2:
117
+ return 0.0
118
+
119
+ # Normalize author names
120
+ norm_authors1 = [self.normalizer.normalize_for_comparison(a) for a in authors1]
121
+ norm_authors2 = [self.normalizer.normalize_for_comparison(a) for a in authors2]
122
+
123
+ # Count matching authors
124
+ matches = 0
125
+ for a1 in norm_authors1:
126
+ for a2 in norm_authors2:
127
+ if self._authors_match(a1, a2):
128
+ matches += 1
129
+ break
130
+
131
+ # Calculate Jaccard similarity
132
+ total_unique = len(set(norm_authors1) | set(norm_authors2))
133
+ if total_unique == 0:
134
+ return 0.0
135
+
136
+ return matches / total_unique
137
+
138
+ def _parse_authors(self, author_string: str) -> List[str]:
139
+ """Parse author string into list of names."""
140
+ if not author_string:
141
+ return []
142
+
143
+ # Split by 'and'
144
+ authors = author_string.split(' and ')
145
+
146
+ # Clean up each author
147
+ cleaned = []
148
+ for author in authors:
149
+ # Remove extra whitespace
150
+ author = ' '.join(author.split())
151
+ if author:
152
+ cleaned.append(author)
153
+
154
+ return cleaned
155
+
156
+ def _authors_match(self, name1: str, name2: str) -> bool:
157
+ """Check if two author names match (handles initials)."""
158
+ # Simple exact match after normalization
159
+ if name1 == name2:
160
+ return True
161
+
162
+ # Check if one is a substring of the other (handles initials)
163
+ if name1 in name2 or name2 in name1:
164
+ return True
165
+
166
+ # Calculate string similarity
167
+ sim = self.normalizer.similarity_ratio(name1, name2)
168
+ return sim >= 0.8
169
+
170
+ def _calculate_group_similarity(self, entries: List[BibEntry]) -> float:
171
+ """Calculate average similarity within a group."""
172
+ if len(entries) < 2:
173
+ return 1.0
174
+
175
+ total_sim = 0.0
176
+ count = 0
177
+
178
+ for i, entry1 in enumerate(entries):
179
+ for entry2 in entries[i+1:]:
180
+ sim, _ = self._calculate_similarity(entry1, entry2)
181
+ total_sim += sim
182
+ count += 1
183
+
184
+ return total_sim / count if count > 0 else 0.0
185
+
186
+ def _generate_reason(self, entries: List[BibEntry]) -> str:
187
+ """Generate a human-readable reason for the duplicate group."""
188
+ # Check if all titles are very similar
189
+ titles = [self.normalizer.normalize_for_comparison(e.title) for e in entries]
190
+
191
+ # Calculate pairwise title similarities
192
+ title_sims = []
193
+ for i, t1 in enumerate(titles):
194
+ for t2 in titles[i+1:]:
195
+ title_sims.append(self.normalizer.similarity_ratio(t1, t2))
196
+
197
+ avg_title_sim = sum(title_sims) / len(title_sims) if title_sims else 0.0
198
+
199
+ if avg_title_sim >= 0.95:
200
+ return "Nearly identical titles"
201
+ elif avg_title_sim >= 0.85:
202
+ return "Very similar titles"
203
+ else:
204
+ return "Similar titles and authors"
src/analyzers/llm_evaluator.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM-based citation relevance evaluator.
3
+ Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama backends.
4
+ """
5
+ import json
6
+ import re
7
+ from dataclasses import dataclass
8
+ from typing import Optional, Dict, Any
9
+ from enum import Enum
10
+ import os
11
+
12
+ import requests
13
+
14
+
15
class LLMBackend(Enum):
    """Supported LLM API backends for citation relevance evaluation.

    The enum value is the string used in the YAML config; the enum *name*
    is also used to derive the API-key environment variable
    (``<NAME>_API_KEY``) in LLMEvaluator.
    """
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GEMINI = "gemini"
    VLLM = "vllm"          # local server speaking the OpenAI chat API
    OLLAMA = "ollama"      # local Ollama server (native /api/generate)
    DEEPSEEK = "deepseek"  # hosted, OpenAI-compatible chat API
22
+
23
+
24
@dataclass
class EvaluationResult:
    """Outcome of one LLM relevance judgement for a single citation.

    A score of 0 together with a non-None ``error`` marks a failed or
    skipped evaluation.
    """
    entry_key: str
    relevance_score: int  # 1-5 (0 when evaluation failed)
    is_relevant: bool
    explanation: str
    context_used: str     # the citation context sent to the LLM
    abstract_used: str    # the abstract sent to the LLM
    line_number: Optional[int] = None
    file_path: Optional[str] = None
    error: Optional[str] = None

    @property
    def score_label(self) -> str:
        """Human-readable label for relevance_score; 'Unknown' when the
        score falls outside the 1-5 range."""
        names = (
            "Not Relevant",
            "Marginally Relevant",
            "Somewhat Relevant",
            "Relevant",
            "Highly Relevant",
        )
        if 1 <= self.relevance_score <= 5:
            return names[self.relevance_score - 1]
        return "Unknown"
47
+
48
+
49
+ class LLMEvaluator:
50
+ """Evaluates citation relevance using LLM."""
51
+
52
+ PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant.
53
+
54
+ ## Citation Context (from the manuscript):
55
+ {context}
56
+
57
+ ## Cited Paper's Abstract:
58
+ {abstract}
59
+
60
+ ## Task:
61
+ Evaluate the relevance and appropriateness of this citation. Consider:
62
+ 1. Does the citation support the claim being made in the context?
63
+ 2. Is the cited paper's topic related to the discussion?
64
+ 3. Is this citation necessary, or could it be replaced with a more relevant one?
65
+
66
+ ## Response Format:
67
+ Provide your response in the following JSON format:
68
+ {{
69
+ "relevance_score": <1-5 integer>,
70
+ "is_relevant": <true/false>,
71
+ "explanation": "<brief explanation in 1-2 sentences>"
72
+ }}
73
+
74
+ Score guide:
75
+ - 1: Not relevant at all
76
+ - 2: Marginally relevant
77
+ - 3: Somewhat relevant
78
+ - 4: Relevant and appropriate
79
+ - 5: Highly relevant and essential
80
+
81
+ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other text."""
82
+
83
+ def __init__(
84
+ self,
85
+ backend: LLMBackend = LLMBackend.GEMINI,
86
+ endpoint: Optional[str] = None,
87
+ model: Optional[str] = None,
88
+ api_key: Optional[str] = None
89
+ ):
90
+ self.backend = backend
91
+ self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY")
92
+
93
+ # Set defaults based on backend
94
+ if backend == LLMBackend.OPENAI:
95
+ self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions"
96
+ self.model = model or "gpt-5-mini"
97
+ elif backend == LLMBackend.ANTHROPIC:
98
+ self.endpoint = endpoint or "https://api.anthropic.com/v1/messages"
99
+ self.model = model or "claude-4.5-haiku"
100
+ elif backend == LLMBackend.DEEPSEEK:
101
+ self.endpoint = endpoint or "https://api.deepseek.com/chat/completions"
102
+ self.model = model or "deepseek-chat"
103
+ elif backend == LLMBackend.OLLAMA:
104
+ self.endpoint = endpoint or "http://localhost:11434/api/generate"
105
+ self.model = model or "Qwen/qwen3-4B-Instruct-2507"
106
+ elif backend == LLMBackend.VLLM:
107
+ self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions"
108
+ self.model = model or "Qwen/qwen3-4B-Instruct-2507"
109
+ elif backend == LLMBackend.GEMINI:
110
+ self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models"
111
+ self.model = model or "gemini-2.5-flash-lite"
112
+
113
+ def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult:
114
+ """Evaluate citation relevance."""
115
+ if not context or not abstract:
116
+ return EvaluationResult(
117
+ entry_key=entry_key,
118
+ relevance_score=0,
119
+ is_relevant=False,
120
+ explanation="Missing context or abstract",
121
+ context_used=context,
122
+ abstract_used=abstract,
123
+ error="Missing context or abstract for evaluation"
124
+ )
125
+
126
+ # Don't truncate - preserve full context and abstract
127
+ prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract)
128
+
129
+ try:
130
+ if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
131
+ response = self._call_openai_compatible(prompt)
132
+ elif self.backend == LLMBackend.ANTHROPIC:
133
+ response = self._call_anthropic(prompt)
134
+ elif self.backend == LLMBackend.OLLAMA:
135
+ response = self._call_ollama(prompt)
136
+ elif self.backend == LLMBackend.GEMINI:
137
+ response = self._call_gemini(prompt)
138
+ else:
139
+ raise ValueError(f"Unknown backend: {self.backend}")
140
+
141
+ return self._parse_response(entry_key, response, context, abstract)
142
+
143
+ except Exception as e:
144
+ return EvaluationResult(
145
+ entry_key=entry_key,
146
+ relevance_score=0,
147
+ is_relevant=False,
148
+ explanation="",
149
+ context_used=context,
150
+ abstract_used=abstract,
151
+ error=str(e)
152
+ )
153
+
154
+ def _call_openai_compatible(self, prompt: str) -> str:
155
+ """Call OpenAI-compatible API (OpenAI, DeepSeek, vLLM)."""
156
+ headers = {
157
+ "Content-Type": "application/json",
158
+ "Authorization": f"Bearer {self.api_key}"
159
+ }
160
+
161
+ payload = {
162
+ "model": self.model,
163
+ "messages": [
164
+ {"role": "user", "content": prompt}
165
+ ],
166
+ "temperature": 0.1,
167
+ "max_tokens": 2000,
168
+ "response_format": {"type": "json_object"} if self.backend == LLMBackend.OPENAI else None
169
+ }
170
+
171
+ response = requests.post(
172
+ self.endpoint,
173
+ json=payload,
174
+ headers=headers,
175
+ timeout=60
176
+ )
177
+ response.raise_for_status()
178
+
179
+ data = response.json()
180
+ choices = data.get("choices", [])
181
+ if choices:
182
+ return choices[0].get("message", {}).get("content", "")
183
+ return ""
184
+
185
+ def _call_anthropic(self, prompt: str) -> str:
186
+ """Call Anthropic API."""
187
+ headers = {
188
+ "x-api-key": self.api_key,
189
+ "anthropic-version": "2023-06-01",
190
+ "content-type": "application/json"
191
+ }
192
+
193
+ payload = {
194
+ "model": self.model,
195
+ "max_tokens": 2000,
196
+ "temperature": 0.1,
197
+ "messages": [
198
+ {"role": "user", "content": prompt}
199
+ ]
200
+ }
201
+
202
+ response = requests.post(
203
+ self.endpoint,
204
+ json=payload,
205
+ headers=headers,
206
+ timeout=60
207
+ )
208
+ response.raise_for_status()
209
+
210
+ data = response.json()
211
+ content = data.get("content", [])
212
+ if content and content[0].get("type") == "text":
213
+ return content[0].get("text", "")
214
+ return ""
215
+
216
+ def _call_ollama(self, prompt: str) -> str:
217
+ """Call Ollama API."""
218
+ payload = {
219
+ "model": self.model,
220
+ "prompt": prompt,
221
+ "stream": False,
222
+ "options": {
223
+ "temperature": 0.1,
224
+ "num_predict": 2000
225
+ },
226
+ "format": "json"
227
+ }
228
+
229
+ response = requests.post(
230
+ self.endpoint,
231
+ json=payload,
232
+ timeout=60
233
+ )
234
+ response.raise_for_status()
235
+
236
+ return response.json().get("response", "")
237
+
238
+ def _call_gemini(self, prompt: str) -> str:
239
+ """Call Gemini API."""
240
+ # Build URL with model
241
+ url = f"{self.endpoint}/{self.model}:generateContent"
242
+ if self.api_key:
243
+ url += f"?key={self.api_key}"
244
+
245
+ payload = {
246
+ "contents": [
247
+ {
248
+ "parts": [
249
+ {"text": prompt}
250
+ ]
251
+ }
252
+ ],
253
+ "generationConfig": {
254
+ "temperature": 0.1,
255
+ "maxOutputTokens": 2000,
256
+ "responseMimeType": "application/json"
257
+ }
258
+ }
259
+
260
+ response = requests.post(
261
+ url,
262
+ json=payload,
263
+ timeout=60
264
+ )
265
+ response.raise_for_status()
266
+
267
+ candidates = response.json().get("candidates", [])
268
+ if candidates:
269
+ content = candidates[0].get("content", {})
270
+ parts = content.get("parts", [])
271
+ if parts:
272
+ return parts[0].get("text", "")
273
+ return ""
274
+
275
+ def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult:
276
+ """Parse LLM response."""
277
+ # Try to extract JSON from response
278
+ json_match = re.search(r'\{[^{}]*\}', response, re.DOTALL)
279
+
280
+ data = {}
281
+ if not json_match:
282
+ # Try to parse the whole response as JSON
283
+ try:
284
+ data = json.loads(response.strip())
285
+ except json.JSONDecodeError:
286
+ pass
287
+ else:
288
+ try:
289
+ data = json.loads(json_match.group())
290
+ except json.JSONDecodeError:
291
+ pass
292
+
293
+ if not data:
294
+ return EvaluationResult(
295
+ entry_key=entry_key,
296
+ relevance_score=0,
297
+ is_relevant=False,
298
+ explanation=response,
299
+ context_used=context,
300
+ abstract_used=abstract,
301
+ error="Failed to parse LLM response as JSON"
302
+ )
303
+
304
+ # Extract fields
305
+ relevance_score = data.get("relevance_score", 0)
306
+ if isinstance(relevance_score, str):
307
+ try:
308
+ relevance_score = int(relevance_score)
309
+ except ValueError:
310
+ relevance_score = 0
311
+
312
+ is_relevant = data.get("is_relevant", False)
313
+ if isinstance(is_relevant, str):
314
+ is_relevant = is_relevant.lower() in ("true", "yes", "1")
315
+
316
+ explanation = data.get("explanation", "")
317
+
318
+ return EvaluationResult(
319
+ entry_key=entry_key,
320
+ relevance_score=relevance_score,
321
+ is_relevant=is_relevant,
322
+ explanation=explanation,
323
+ context_used=context,
324
+ abstract_used=abstract
325
+ )
326
+
327
+ def test_connection(self) -> bool:
328
+ """Test if LLM backend is accessible."""
329
+ try:
330
+ if self.backend == LLMBackend.OLLAMA:
331
+ response = requests.get(
332
+ self.endpoint.replace("/api/generate", "/api/tags"),
333
+ timeout=5
334
+ )
335
+ return response.status_code == 200
336
+ elif self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
337
+ # Test with a simple model list or empty completion
338
+ headers = {"Authorization": f"Bearer {self.api_key}"}
339
+ # Try listing models if possible, otherwise simple completion
340
+ if "chat/completions" in self.endpoint:
341
+ # Try a minimal completion
342
+ payload = {
343
+ "model": self.model,
344
+ "messages": [{"role": "user", "content": "hi"}],
345
+ "max_tokens": 1
346
+ }
347
+ response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10)
348
+ return response.status_code == 200
349
+ else:
350
+ return False
351
+ elif self.backend == LLMBackend.ANTHROPIC:
352
+ headers = {
353
+ "x-api-key": self.api_key,
354
+ "anthropic-version": "2023-06-01",
355
+ "content-type": "application/json"
356
+ }
357
+ payload = {
358
+ "model": self.model,
359
+ "max_tokens": 1,
360
+ "messages": [{"role": "user", "content": "hi"}]
361
+ }
362
+ response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10)
363
+ return response.status_code == 200
364
+ elif self.backend == LLMBackend.GEMINI:
365
+ if not self.api_key:
366
+ return False
367
+ url = f"{self.endpoint}/{self.model}:generateContent?key={self.api_key}"
368
+ payload = {
369
+ "contents": [{"parts": [{"text": "test"}]}],
370
+ "generationConfig": {"maxOutputTokens": 10}
371
+ }
372
+ response = requests.post(url, json=payload, timeout=10)
373
+ return response.status_code == 200
374
+ except Exception:
375
+ return False
376
+ return False
src/analyzers/metadata_comparator.py ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Metadata comparison between bib entries and fetched metadata.
3
+ """
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ from ..parsers.bib_parser import BibEntry
8
+ from ..fetchers.arxiv_fetcher import ArxivMetadata
9
+ from ..fetchers.scholar_fetcher import ScholarResult
10
+ from ..fetchers.crossref_fetcher import CrossRefResult
11
+ from ..fetchers.semantic_scholar_fetcher import SemanticScholarResult
12
+ from ..fetchers.openalex_fetcher import OpenAlexResult
13
+ from ..fetchers.dblp_fetcher import DBLPResult
14
+ from ..utils.normalizer import TextNormalizer
15
+
16
+
17
@dataclass
class ComparisonResult:
    """Outcome of matching one bibliography entry against externally fetched metadata."""
    entry_key: str

    # --- Title comparison ---
    title_match: bool
    title_similarity: float
    bib_title: str
    fetched_title: str

    # --- Author comparison ---
    author_match: bool
    author_similarity: float
    bib_authors: list[str]
    fetched_authors: list[str]

    # --- Year comparison ---
    year_match: bool
    bib_year: str
    fetched_year: str

    # --- Overall assessment ---
    is_match: bool
    confidence: float
    issues: list[str]
    source: str  # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable'

    @property
    def has_issues(self) -> bool:
        """True when at least one discrepancy was recorded."""
        return bool(self.issues)
48
+
49
+
50
class MetadataComparator:
    """Compares bibliography entries with fetched metadata.

    All public ``compare_with_*`` methods delegate to one shared core
    (:meth:`_compare_metadata`); previously each of the six methods duplicated
    ~40 lines of identical title/author/year comparison logic. The public
    method names, signatures, issue messages and confidence formula are
    unchanged.
    """

    # Similarity thresholds above which a field is considered a match.
    TITLE_THRESHOLD = 0.8
    AUTHOR_THRESHOLD = 0.6

    def __init__(self):
        # The normalizer is used as a namespace of static helpers.
        self.normalizer = TextNormalizer

    def _compare_metadata(self, bib_entry: BibEntry, fetched_title: str,
                          fetched_authors: list[str], fetched_year: str,
                          source: str) -> ComparisonResult:
        """Core comparison shared by every source-specific wrapper.

        Args:
            bib_entry: parsed .bib entry.
            fetched_title: title string from the external source.
            fetched_authors: author names already normalized by the caller.
            fetched_year: year string from the external source (may be "").
            source: source tag stored in the result and used in issue messages.
        """
        issues = []

        # Titles: token-based similarity, plus Levenshtein for short titles
        # where token overlap alone is too coarse.
        bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
        fetched_title_norm = self.normalizer.normalize_for_comparison(fetched_title)

        title_similarity = self.normalizer.similarity_ratio(bib_title_norm, fetched_title_norm)
        if len(bib_title_norm) < 100:
            lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, fetched_title_norm)
            title_similarity = max(title_similarity, lev_sim)

        title_match = title_similarity >= self.TITLE_THRESHOLD
        if not title_match:
            issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")

        # Authors: best-match average of bib authors against the fetched list.
        bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
        author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
        author_match = author_similarity >= self.AUTHOR_THRESHOLD
        if not author_match:
            issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")

        # Years: exact string equality; only flagged when both sides have a year.
        bib_year = bib_entry.year.strip()
        year_match = bib_year == fetched_year
        if not year_match and bib_year and fetched_year:
            issues.append(f"Year mismatch: bib={bib_year}, {source}={fetched_year}")

        # Weighted confidence: title 50%, authors 30%, year 20%
        # (a mismatching or absent year still contributes half its weight).
        is_match = title_match and author_match
        confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2)

        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=title_match,
            title_similarity=title_similarity,
            bib_title=bib_entry.title,
            fetched_title=fetched_title,
            author_match=author_match,
            author_similarity=author_similarity,
            bib_authors=bib_authors,
            fetched_authors=fetched_authors,
            year_match=year_match,
            bib_year=bib_year,
            fetched_year=fetched_year,
            is_match=is_match,
            confidence=confidence,
            issues=issues,
            source=source,
        )

    def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult:
        """Compare bib entry with arXiv metadata."""
        authors = [self.normalizer.normalize_author_name(a) for a in arxiv_meta.authors]
        return self._compare_metadata(bib_entry, arxiv_meta.title, authors, arxiv_meta.year, "arxiv")

    def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> ComparisonResult:
        """Compare bib entry with Scholar search result."""
        # Scholar returns authors as one comma-separated string, not a list.
        authors = [self.normalizer.normalize_author_name(a.strip())
                   for a in scholar_result.authors.split(',')]
        return self._compare_metadata(bib_entry, scholar_result.title, authors, scholar_result.year, "scholar")

    def compare_with_crossref(self, bib_entry: BibEntry, crossref_result: CrossRefResult) -> ComparisonResult:
        """Compare bib entry with CrossRef search result."""
        authors = [self.normalizer.normalize_author_name(a) for a in crossref_result.authors]
        return self._compare_metadata(bib_entry, crossref_result.title, authors, crossref_result.year, "crossref")

    def compare_with_semantic_scholar(self, bib_entry: BibEntry, ss_result: SemanticScholarResult) -> ComparisonResult:
        """Compare bib entry with Semantic Scholar result."""
        authors = [self.normalizer.normalize_author_name(a) for a in ss_result.authors]
        return self._compare_metadata(bib_entry, ss_result.title, authors, ss_result.year, "semantic_scholar")

    def compare_with_openalex(self, bib_entry: BibEntry, oa_result: OpenAlexResult) -> ComparisonResult:
        """Compare bib entry with OpenAlex result."""
        authors = [self.normalizer.normalize_author_name(a) for a in oa_result.authors]
        return self._compare_metadata(bib_entry, oa_result.title, authors, oa_result.year, "openalex")

    def compare_with_dblp(self, bib_entry: BibEntry, dblp_result: DBLPResult) -> ComparisonResult:
        """Compare bib entry with DBLP result."""
        authors = [self.normalizer.normalize_author_name(a) for a in dblp_result.authors]
        return self._compare_metadata(bib_entry, dblp_result.title, authors, dblp_result.year, "dblp")

    def create_unable_result(self, bib_entry: BibEntry, reason: str = "Unable to fetch metadata") -> ComparisonResult:
        """Create result when metadata couldn't be fetched."""
        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=False,
            title_similarity=0.0,
            bib_title=bib_entry.title,
            fetched_title="",
            author_match=False,
            author_similarity=0.0,
            bib_authors=self.normalizer.normalize_author_list(bib_entry.author),
            fetched_authors=[],
            year_match=False,
            bib_year=bib_entry.year,
            fetched_year="",
            is_match=False,
            confidence=0.0,
            issues=[reason],
            source="unable",
        )

    def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float:
        """Average, over list1, of each author's best similarity against list2."""
        if not list1 and not list2:
            return 1.0
        if not list1 or not list2:
            return 0.0

        total_similarity = 0.0
        for author1 in list1:
            best_match = 0.0
            for author2 in list2:
                # Exact structural match (handles abbreviated first names) wins outright.
                if self._names_match(author1, author2):
                    best_match = 1.0
                    break
                sim = self.normalizer.similarity_ratio(author1, author2)
                best_match = max(best_match, sim)
            total_similarity += best_match

        return total_similarity / len(list1)

    def _names_match(self, name1: str, name2: str) -> bool:
        """Check if two names match (handles abbreviated names).

        Primarily compares last words (surnames); also accepts first-word vs
        last-word matches to tolerate "Surname, First" orderings.
        """
        words1 = name1.split()
        words2 = name2.split()

        if not words1 or not words2:
            return False

        if words1[-1] != words2[-1]:
            # Try first word as last name too (name-order differences).
            if words1[0] != words2[-1] and words1[-1] != words2[0]:
                return False

        return True
+ )
src/analyzers/usage_checker.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage checker for bibliography entries in TeX files.
3
+ """
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ from ..parsers.bib_parser import BibEntry
8
+ from ..parsers.tex_parser import TexParser, CitationContext
9
+
10
+
11
@dataclass
class UsageResult:
    """Where and how often a single bibliography entry is cited."""
    entry_key: str            # bib key the result refers to
    is_used: bool             # whether the key appears in any \cite command
    usage_count: int          # number of citation contexts found
    contexts: list[CitationContext]
    line_numbers: list[int]   # one line number per context, in document order

    @property
    def first_usage_line(self) -> Optional[int]:
        """Line of the earliest citation, or None when the entry is never cited."""
        if self.line_numbers:
            return self.line_numbers[0]
        return None
23
+
24
+
25
class UsageChecker:
    """Checks which bibliography entries are actually cited in the TeX files."""

    def __init__(self, tex_parser: TexParser):
        self.tex_parser = tex_parser
        # Snapshot of every key cited anywhere in the document, taken once.
        self._cited_keys = tex_parser.get_all_cited_keys()

    def check_usage(self, entry: BibEntry) -> UsageResult:
        """Check if a bib entry is used in the TeX document."""
        key = entry.key
        contexts = self.tex_parser.get_citation_contexts(key)
        return UsageResult(
            entry_key=key,
            is_used=key in self._cited_keys,
            usage_count=len(contexts),
            contexts=contexts,
            line_numbers=[ctx.line_number for ctx in contexts],
        )

    def get_unused_entries(self, entries: list[BibEntry]) -> list[BibEntry]:
        """Get list of entries that are not cited in the document."""
        return [entry for entry in entries if entry.key not in self._cited_keys]

    def get_missing_entries(self, entries: list[BibEntry]) -> list[str]:
        """Get list of citation keys that don't have corresponding bib entries."""
        entry_keys = {e.key for e in entries}
        return [key for key in self._cited_keys if key not in entry_keys]

    def get_combined_context(self, key: str, max_chars: int = 1000) -> str:
        """Concatenate every usage context for ``key``, capped at ``max_chars``.

        Contexts are joined with a "\\n---\\n" separator. The context that
        would overflow the budget is truncated with a trailing "..." only when
        at least 100 characters of budget remain; otherwise it is dropped.
        """
        contexts = self.tex_parser.get_citation_contexts(key)
        if not contexts:
            return ""

        combined = []
        total_chars = 0
        for ctx in contexts:
            if total_chars + len(ctx.full_context) > max_chars:
                remaining = max_chars - total_chars
                # Only truncate when a meaningful amount still fits.
                if remaining > 100:
                    combined.append(ctx.full_context[:remaining] + "...")
                break
            combined.append(ctx.full_context)
            total_chars += len(ctx.full_context)

        return "\n---\n".join(combined)
src/checkers/__init__.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Checkers module for paper submission quality checks."""
2
+ from .base import BaseChecker, CheckResult, CheckSeverity
3
+ from .caption_checker import CaptionChecker
4
+ from .reference_checker import ReferenceChecker
5
+ from .ai_artifacts_checker import AIArtifactsChecker
6
+ from .formatting_checker import FormattingChecker
7
+ from .anonymization_checker import AnonymizationChecker
8
+ from .number_checker import NumberChecker
9
+ from .sentence_checker import SentenceChecker
10
+ from .consistency_checker import ConsistencyChecker
11
+ from .citation_quality_checker import CitationQualityChecker
12
+ from .equation_checker import EquationChecker
13
+ from .acronym_checker import AcronymChecker
14
+
15
+ __all__ = [
16
+ 'BaseChecker',
17
+ 'CheckResult',
18
+ 'CheckSeverity',
19
+ 'CaptionChecker',
20
+ 'ReferenceChecker',
21
+ 'AIArtifactsChecker',
22
+ 'FormattingChecker',
23
+ 'AnonymizationChecker',
24
+ 'NumberChecker',
25
+ 'SentenceChecker',
26
+ 'ConsistencyChecker',
27
+ 'CitationQualityChecker',
28
+ 'EquationChecker',
29
+ 'AcronymChecker',
30
+ ]
31
+
32
+
33
+ # Registry of all available checkers
34
+ CHECKER_REGISTRY = {
35
+ 'caption': CaptionChecker,
36
+ 'reference': ReferenceChecker,
37
+ 'ai_artifacts': AIArtifactsChecker,
38
+ 'formatting': FormattingChecker,
39
+ 'anonymization': AnonymizationChecker,
40
+ 'number': NumberChecker,
41
+ 'sentence': SentenceChecker,
42
+ 'consistency': ConsistencyChecker,
43
+ 'citation_quality': CitationQualityChecker,
44
+ 'equation': EquationChecker,
45
+ 'acronym': AcronymChecker,
46
+ }
47
+
48
+
49
+ def get_checker(name: str) -> BaseChecker:
50
+ """Get a checker instance by name."""
51
+ if name not in CHECKER_REGISTRY:
52
+ raise ValueError(f"Unknown checker: {name}")
53
+ return CHECKER_REGISTRY[name]()
54
+
55
+
56
+ def run_all_checkers(tex_content: str, config: dict = None) -> list:
57
+ """Run all checkers and return combined results."""
58
+ results = []
59
+ config = config or {}
60
+
61
+ for name, checker_class in CHECKER_REGISTRY.items():
62
+ checker = checker_class()
63
+ checker_results = checker.check(tex_content, config)
64
+ results.extend(checker_results)
65
+
66
+ return results
src/checkers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (2.2 kB). View file
 
src/checkers/__pycache__/acronym_checker.cpython-313.pyc ADDED
Binary file (10.8 kB). View file
 
src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc ADDED
Binary file (6.14 kB). View file
 
src/checkers/__pycache__/anonymization_checker.cpython-313.pyc ADDED
Binary file (8.38 kB). View file
 
src/checkers/__pycache__/base.cpython-313.pyc ADDED
Binary file (7.68 kB). View file
 
src/checkers/__pycache__/caption_checker.cpython-313.pyc ADDED
Binary file (5.63 kB). View file
 
src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc ADDED
Binary file (5.41 kB). View file
 
src/checkers/__pycache__/consistency_checker.cpython-313.pyc ADDED
Binary file (11 kB). View file
 
src/checkers/__pycache__/equation_checker.cpython-313.pyc ADDED
Binary file (5.62 kB). View file
 
src/checkers/__pycache__/formatting_checker.cpython-313.pyc ADDED
Binary file (9.45 kB). View file
 
src/checkers/__pycache__/number_checker.cpython-313.pyc ADDED
Binary file (3.8 kB). View file
 
src/checkers/__pycache__/reference_checker.cpython-313.pyc ADDED
Binary file (8.3 kB). View file
 
src/checkers/__pycache__/sentence_checker.cpython-313.pyc ADDED
Binary file (4.36 kB). View file
 
src/checkers/acronym_checker.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Acronym and abbreviation checker.
3
+
4
+ Validates that:
5
+ - Acronyms found in text have corresponding full forms defined
6
+ - Acronyms are used after their definition
7
+ - Only checks acronyms that have matching full forms in the document
8
+ """
9
+ import re
10
+ from typing import List, Dict, Set, Tuple
11
+ from collections import defaultdict
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class AcronymChecker(BaseChecker):
    """Check acronym definitions and consistency.

    Flags acronyms that (a) have a plausible spelled-out form somewhere in
    the document but are never formally defined, or (b) are used before the
    position of their definition. Acronyms in COMMON_ACRONYMS are exempt.
    """

    name = "acronym"
    display_name = "Acronyms"
    description = "Check acronym definitions and consistent usage"

    # Enhanced pattern to find defined acronyms with LaTeX formatting support
    # Matches: "Full Name (ACRONYM)", "(ACRONYM; Full Name)", "Full Name (\textbf{ACRONYM})", etc.
    # Group 2 holds the acronym for the first alternative, group 3 for the second.
    DEFINITION_PATTERN = re.compile(
        r'([A-Z][a-zA-Z\s\-]+)\s*\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?\)|'  # Full Name (ABC) or Full Name (\textbf{ABC})
        r'\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?;\s*([A-Za-z\s\-]+)\)',  # (ABC; Full Name) or (\textbf{ABC}; Full Name)
        re.MULTILINE
    )

    # Pattern to find standalone acronyms (3+ capital letters, optional plural 's')
    ACRONYM_PATTERN = re.compile(r'\b([A-Z]{3,}s?)\b')

    # Comprehensive list of common acronyms that don't need definition.
    # NOTE(review): a few entries appear twice ('SSD', 'EM', 'MAP', 'AAAI',
    # 'IJCAI', 'GPU') — harmless in a set literal, but could be deduplicated.
    COMMON_ACRONYMS = {
        # Hardware & Computing
        'GPU', 'CPU', 'TPU', 'RAM', 'ROM', 'SSD', 'HDD', 'USB', 'BIOS', 'OS',
        'API', 'SDK', 'IDE', 'GUI', 'CLI', 'URL', 'URI', 'DNS', 'IP', 'TCP',
        'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS', 'VPN', 'LAN', 'WAN',

        # File Formats & Standards
        'PDF', 'HTML', 'CSS', 'XML', 'JSON', 'YAML', 'CSV', 'TSV', 'SQL',
        'UTF', 'ASCII', 'JPEG', 'PNG', 'GIF', 'SVG', 'MP3', 'MP4', 'ZIP',

        # AI & Machine Learning (General)
        'AI', 'ML', 'DL', 'NN', 'ANN', 'DNN', 'CNN', 'RNN', 'LSTM', 'GRU',
        'GAN', 'VAE', 'MLP', 'SVM', 'KNN', 'PCA', 'ICA', 'LDA', 'EM',
        'SGD', 'ADAM', 'RMSPROP', 'ADAGRAD', 'LBFGS',

        # NLP & Language Models
        'NLP', 'LLM', 'GPT', 'BERT', 'BART', 'T5', 'ELECTRA', 'ROBERTA',
        'NER', 'POS', 'QA', 'MT', 'ASR', 'TTS', 'NMT', 'SMT',
        'BLEU', 'ROUGE', 'METEOR', 'CIDEr', 'SPICE', 'WER', 'CER',

        # Computer Vision
        'CV', 'OCR', 'YOLO', 'RCNN', 'SSD', 'FCN', 'UNET', 'RESNET', 'VGG',
        'RGB', 'HSV', 'YUV', 'SIFT', 'SURF', 'ORB', 'HOG', 'SSIM', 'PSNR',

        # Reinforcement Learning
        'RL', 'DQN', 'DDPG', 'PPO', 'A3C', 'TRPO', 'SAC', 'TD3', 'MDP',
        'POMDP', 'RLHF', 'RLAIF',

        # Metrics & Evaluation
        'F1', 'AUC', 'ROC', 'PR', 'MAP', 'NDCG', 'MRR', 'MSE', 'MAE', 'RMSE',
        'MAPE', 'R2', 'IoU', 'AP', 'mAP', 'FPS', 'FLOPs', 'FLOPS',

        # Data & Statistics
        'IID', 'OOD', 'KL', 'JS', 'EMD', 'MMD', 'ELBO', 'VI', 'MCMC',
        'MLE', 'MAP', 'EM', 'GMM', 'HMM', 'CRF', 'MRF',

        # Academic & Organizations
        'IEEE', 'ACM', 'AAAI', 'IJCAI', 'ICML', 'ICLR', 'NEURIPS', 'NIPS',
        'ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL', 'CVPR', 'ICCV', 'ECCV',
        'SIGIR', 'KDD', 'WWW', 'CIKM', 'WSDM', 'ICDE', 'VLDB', 'SIGMOD',
        'AAAI', 'IJCAI', 'AISTATS', 'UAI', 'COLT', 'ALT',

        # Methods & Techniques (Common in ML papers)
        'SOTA', 'E2E', 'RAG', 'CoT', 'ToT', 'GoT', 'ICL', 'FSL', 'ZSL',
        'PEFT', 'LORA', 'QLORA', 'SFT', 'DPO', 'SPIN', 'URPO', 'SPELL',
        'STaR', 'ReST', 'RRHF', 'RAFT', 'LIMA', 'ORPO',

        # Misc
        'USD', 'EUR', 'GBP', 'EU', 'US', 'UK', 'UN', 'NATO', 'NASA',
        'ID', 'UID', 'UUID', 'MD5', 'SHA', 'AES', 'RSA', 'JWT',
        'CRUD', 'REST', 'SOAP', 'RPC', 'AJAX', 'DOM', 'OOP', 'MVC',
        'CI', 'CD', 'DevOps', 'AWS', 'GCP', 'GPU', 'NPU', 'ASIC', 'FPGA',
    }

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run the acronym checks and return one CheckResult per issue found.

        Only acronyms for which a plausible full form exists in the document
        are reported, to keep the false-positive rate low.
        """
        results = []

        # Remove comments using base class method
        content = self._remove_comments(tex_content)

        # Find all defined acronyms with their positions
        defined_acronyms = self._find_definitions(content)

        # Find all acronym usages (excluding special contexts)
        all_usages = self._find_all_usages(content)

        # NEW: Find potential full forms for each acronym
        acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())

        # Check for undefined acronyms (only those with matching full forms)
        for acronym, positions in all_usages.items():
            if acronym in self.COMMON_ACRONYMS:
                continue

            # Skip if no matching full form found in document
            if acronym not in acronym_full_forms:
                continue

            if acronym not in defined_acronyms:
                # First usage should define it
                first_pos = positions[0]
                line_num = self._find_line_number(content, first_pos)
                full_form = acronym_full_forms[acronym]

                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Acronym '{acronym}' used without definition (possible full form: '{full_form}')",
                    line_number=line_num,
                    suggestion=f"Define on first use: '{full_form} ({acronym})'"
                ))
            else:
                # Check if used before definition; report only the first
                # offending occurrence per acronym (hence the break).
                def_pos = defined_acronyms[acronym]
                for pos in positions:
                    if pos < def_pos:
                        line_num = self._find_line_number(content, pos)
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"Acronym '{acronym}' used before definition",
                            line_number=line_num,
                            suggestion="Move definition before first use"
                        ))
                        break

        return results

    def _find_potential_full_forms(self, content: str, acronyms: Set[str]) -> Dict[str, str]:
        """Find potential full forms for acronyms by matching capital letters.

        For each acronym "ABC", builds a regex for a word sequence whose
        initial letters spell A-B-C and returns the first non-sentence-like
        match per acronym.
        """
        full_forms = {}

        for acronym in acronyms:
            if acronym in self.COMMON_ACRONYMS:
                continue

            # Build regex pattern to match full form
            # For "ABC", match words starting with A, B, C
            acronym_clean = acronym.rstrip('s')  # Remove plural
            if len(acronym_clean) < 3:
                continue

            # Create pattern: match sequence of words where first letters spell the acronym
            # Allow optional words in between (like "of", "the", "and")
            pattern_parts = []
            for i, letter in enumerate(acronym_clean):
                if i == 0:
                    # First word must start with the letter
                    pattern_parts.append(f'{letter}[a-z]+')
                else:
                    # Subsequent words: allow optional filler words.
                    # NOTE(review): both the optional filler group and the
                    # mandatory \s+ after it require whitespace, so a filler
                    # word with normal single spacing ("Model of X") cannot
                    # match — confirm whether the trailing \s+ should be
                    # inside the alternative instead.
                    pattern_parts.append(f'(?:\\s+(?:of|the|and|for|in|on|with|to)\\s+)?\\s+{letter}[a-z]+')

            full_pattern = r'\b' + ''.join(pattern_parts) + r'\b'

            try:
                matches = re.finditer(full_pattern, content, re.IGNORECASE)
                for match in matches:
                    candidate = match.group(0)

                    # Skip if candidate contains common non-content words
                    # These words indicate the match is part of a sentence, not an acronym full form
                    excluded_words = {
                        'that', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                        'or', 'not', 'no', 'yes', 'if', 'but', 'as', 'at', 'by', 'from',
                        'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
                        'can', 'could', 'may', 'might', 'must', 'shall',
                        'this', 'these', 'those', 'such', 'which', 'what', 'who', 'when', 'where',
                        'how', 'why', 'all', 'each', 'every', 'some', 'any', 'many', 'much',
                        'more', 'most', 'less', 'few', 'several', 'other', 'another'
                    }

                    candidate_words = re.findall(r'\b[A-Za-z]+\b', candidate.lower())
                    if any(word in excluded_words for word in candidate_words):
                        continue

                    # Verify: extract first letters and check if they match acronym
                    words = re.findall(r'\b[A-Z][a-z]+', candidate, re.IGNORECASE)
                    # Filter out filler words (allowed in between but not counted)
                    filler_words = {'of', 'and', 'for', 'in', 'on', 'with', 'to', 'a', 'an'}
                    meaningful_words = [w for w in words if w.lower() not in filler_words]

                    if len(meaningful_words) >= len(acronym_clean):
                        first_letters = ''.join(w[0].upper() for w in meaningful_words[:len(acronym_clean)])
                        if first_letters == acronym_clean:
                            full_forms[acronym] = candidate
                            break  # Found a match, use the first one
            except re.error:
                # Invalid regex, skip this acronym
                continue

        return full_forms

    def _find_definitions(self, content: str) -> Dict[str, int]:
        """Find all acronym definitions and their character positions.

        Returns a mapping of acronym (plural 's' stripped) -> start offset
        of its definition; a later definition overwrites an earlier one.
        """
        definitions = {}

        for match in self.DEFINITION_PATTERN.finditer(content):
            # Get acronym from either pattern alternative (group 2 or 3)
            acronym = match.group(2) or match.group(3)
            if acronym:
                acronym = acronym.rstrip('s')  # Remove plural
                definitions[acronym] = match.start()

        return definitions

    def _find_all_usages(self, content: str) -> Dict[str, List[int]]:
        """Find all acronym usages, excluding special contexts.

        Returns acronym -> list of character offsets, in document order.
        """
        usages = defaultdict(list)

        for match in self.ACRONYM_PATTERN.finditer(content):
            acronym = match.group(1).rstrip('s')
            pos = match.start()

            # Skip if in special context (math, labels, URLs, commands, ...)
            if self._is_in_special_context(content, pos, acronym):
                continue

            usages[acronym].append(pos)

        return usages

    def _is_in_special_context(self, content: str, pos: int, acronym: str) -> bool:
        """Check if acronym at position is in a special context that should be ignored.

        Uses a +/-50 character window of surrounding text plus cheap
        heuristics; these are approximations, not a LaTeX parser.
        """
        # Get surrounding context
        start = max(0, pos - 50)
        end = min(len(content), pos + len(acronym) + 50)
        before = content[start:pos]
        after = content[pos + len(acronym):end]

        # Skip if inside definition parentheses: (ACRONYM)
        if before.endswith('(') and after.startswith(')'):
            return True

        # Skip if inside LaTeX command: \ACRONYM or \command{ACRONYM}
        if before.rstrip().endswith('\\'):
            return True

        # Skip if inside label: \label{...:ACRONYM...}
        if r'\label{' in before[-20:] and '}' in after[:20]:
            return True

        # Skip if inside ref: \ref{...:ACRONYM...}
        if re.search(r'\\(?:ref|cite|autoref|cref|eqref)\{[^}]*$', before[-30:]):
            return True

        # Skip if inside URL: \url{...ACRONYM...} or http://...ACRONYM...
        if r'\url{' in before[-20:] or 'http' in before[-20:]:
            return True

        # Skip if inside math mode (simple heuristic)
        # Count unescaped $ signs before position; odd count => inside math.
        # NOTE(review): 'before' is only a 50-char window, so the parity test
        # can misjudge math mode spanning a longer distance.
        dollar_count = before.count('$') - before.count(r'\$')
        if dollar_count % 2 == 1:  # Odd number means we're inside math mode
            return True

        # Skip if inside \begin{equation} or similar (within the last 100 chars)
        if re.search(r'\\begin\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
            if not re.search(r'\\end\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
                return True

        # Skip if it looks like a LaTeX command argument: \command[ACRONYM]
        if before.endswith('[') and after.startswith(']'):
            return True

        # Skip if part of a file path or extension
        if '.' in before[-5:] or '/' in before[-10:]:
            return True

        return False
src/checkers/ai_artifacts_checker.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI artifacts checker.
3
+
4
+ Detects leftover text from AI writing assistants that should be removed
5
+ before submission, such as:
6
+ - Conversational responses ("Sure, here is...")
7
+ - Placeholder text
8
+ - Markdown formatting artifacts
9
+ - Common AI response patterns
10
+ """
11
+ import re
12
+ from typing import List, Tuple
13
+
14
+ from .base import BaseChecker, CheckResult, CheckSeverity
15
+
16
+
17
class AIArtifactsChecker(BaseChecker):
    """Detect AI-generated text artifacts that should be removed.

    Scans line by line for three classes of residue: conversational AI
    phrases (ERROR), placeholder text (WARNING), and Markdown formatting
    that leaked into LaTeX (INFO). Verbatim-like environments are skipped.
    """

    name = "ai_artifacts"
    display_name = "AI Artifacts"
    description = "Detect leftover AI assistant text and placeholders"

    # Conversational AI patterns (case insensitive)
    # These are phrases that clearly indicate a dialogue between user and AI assistant
    AI_CONVERSATION_PATTERNS = [
        # Responses to requests
        (r'\bsure[,!]?\s*(here\s+is|i\'ll|i\s+will|let\s+me)\b', "Conversational AI response"),
        (r'\bi\'?d\s+be\s+happy\s+to\b', "Conversational AI response"),
        (r'\bi\'?m\s+happy\s+to\s+help\b', "Conversational AI response"),
        (r'\bcertainly[!,]\s*here\b', "Conversational AI response"),
        (r'\bof\s+course[!,]\s*(here|i)\b', "Conversational AI response"),
        (r'\babsolutely[!,]\s*(here|let\s+me)\b', "Conversational AI response"),

        # Self-identification
        (r'\bas\s+an?\s+ai\s+(language\s+)?model\b', "AI self-reference"),
        (r'\bas\s+a\s+large\s+language\s+model\b', "AI self-reference"),
        (r'\bmy\s+knowledge\s+cutoff\b', "AI knowledge cutoff reference"),

        # Explanatory transitions typical of chat
        (r'\blet\s+me\s+(explain|help|clarify|break\s+this\s+down)\b', "Conversational AI response"),
        (r'\bhere\'?s\s+(a|an|the|my)\s+(revised|updated|improved|rewrite)\b', "Conversational AI response"),
        (r'\bhere\s+is\s+(the|a|an)\s+(summary|breakdown|explanation|code|example)\b', "Conversational AI response"),

        # Closing/Politeness
        (r'\bhope\s+this\s+helps\b', "Conversational AI closing"),
        (r'\bfeel\s+free\s+to\s+ask\b', "Conversational AI closing"),
        (r'\blet\s+me\s+know\s+if\b', "Conversational AI closing"),
        (r'\bthank\s+you\s+for\s+(asking|your\s+question)\b', "Conversational AI response"),
        (r'\bgreat\s+question[!,]?\b', "Conversational AI response"),
        (r'\b(excellent|good|great)\s+point\b', "Conversational AI response"),

        # Instructions/Meta-commentary
        (r'\bbased\s+on\s+the\s+information\s+provided\b', "Conversational AI response"),
        # NOTE(review): "(remember|note) that" also matches standard academic
        # phrasing such as "Note that ...", so this ERROR-severity pattern is
        # prone to false positives — confirm intended.
        (r'\b(remember|note)\s+that\b', "Conversational AI instruction"),
        (r'\bplease\s+note\s+that\b', "Conversational AI instruction"),
    ]

    # Placeholder patterns
    PLACEHOLDER_PATTERNS = [
        (r'\[insert\s+[^\]]+\s*here\]', "Placeholder text"),
        (r'\[add\s+[^\]]+\]', "Placeholder text"),
        (r'\[todo[:\s][^\]]*\]', "TODO placeholder"),
        (r'\btodo\s*:\s*.{0,50}', "TODO comment"),
        (r'\bfixme\s*:\s*.{0,50}', "FIXME comment"),
        (r'\bxxx\b', "XXX placeholder"),
        (r'\byour[\s_-]*(name|email|institution|university)\b', "Placeholder for personal info"),
        (r'author[\s_-]*name', "Author name placeholder"),
        (r'your\.?email@example\.com', "Email placeholder"),
        (r'example@(example\.com|university\.edu)', "Email placeholder"),
        (r'\[citation\s+needed\]', "Citation needed placeholder"),
    ]

    # Markdown artifacts (should not appear in LaTeX)
    MARKDOWN_PATTERNS = [
        (r'^\s*#{1,6}\s+\w', "Markdown header"),
        (r'\*\*[^*]+\*\*', "Markdown bold"),
        (r'(?<!\*)\*[^*\s][^*]*[^*\s]\*(?!\*)', "Markdown italic"),
        (r'(?<!`)`[^`\n]+`(?!`)', "Markdown inline code"),
        (r'```[\s\S]*?```', "Markdown code block"),
        (r'^\s*[-*+]\s+\w', "Markdown bullet point"),
        (r'^\s*\d+\.\s+\w', "Markdown numbered list"),
        (r'\[([^\]]+)\]\(([^)]+)\)', "Markdown link"),
    ]



    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan each line for AI artifacts; returns one CheckResult per hit.

        Lines inside verbatim-like environments and pure comment lines are
        skipped; inline comments are stripped before matching.
        """
        results = []
        lines = tex_content.split('\n')

        # Track if we are inside a verbatim-like environment.
        # NOTE(review): a single boolean flag — nested or same-line
        # begin/end pairs are not tracked.
        in_verbatim = False
        verbatim_envs = ['verbatim', 'lstlisting', 'minted', 'comment', 'raw', 'filecontents', 'tcolorbox']

        # Check each line
        for line_num, line in enumerate(lines, 1):
            # Check for environment boundaries
            # Handle \begin{env}
            if re.search(r'\\begin\{(' + '|'.join(verbatim_envs) + r')\*?\}', line):
                in_verbatim = True
                continue  # Skip the begin line itself

            # Handle \end{env}
            if re.search(r'\\end\{(' + '|'.join(verbatim_envs) + r')\*?\}', line):
                in_verbatim = False
                continue  # Skip the end line itself

            # Skip checks if inside verbatim environment
            if in_verbatim:
                continue

            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments for checking using base class method
            line_to_check = self._remove_line_comment(line)

            # Check AI conversation patterns
            for pattern, description in self.AI_CONVERSATION_PATTERNS:
                if re.search(pattern, line_to_check, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{description} detected",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Remove AI-generated conversational text"
                    ))
                    break  # One match per line for this category

            # Check placeholder patterns (every matching pattern is reported)
            for pattern, description in self.PLACEHOLDER_PATTERNS:
                match = re.search(pattern, line_to_check, re.IGNORECASE)
                if match:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"{description}: '{match.group(0)[:50]}'",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace placeholder with actual content or remove"
                    ))

            # Check Markdown patterns (less strict - might be intentional in some cases)
            for pattern, description in self.MARKDOWN_PATTERNS:
                # Skip if line looks like a LaTeX command (starts with \)
                if line_to_check.strip().startswith('\\'):
                    continue

                # Special handling for bullet points: ensure space after
                if "bullet point" in description:
                    # Skip if it looks like a math subtraction or negative number
                    if re.search(r'[-+]\d', line_to_check):
                        continue
                    # Skip if inside math mode (simple heuristic)
                    if '$' in line_to_check:
                        continue

                # Special handling for italics: avoid matching math mode like $x*y$
                if "italic" in description:
                    if '$' in line_to_check:
                        continue

                if re.search(pattern, line_to_check):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Possible {description} in LaTeX",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Convert to LaTeX formatting or remove if unintentional"
                    ))

        return results
src/checkers/anonymization_checker.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Anonymization checker.
3
+
4
+ For double-blind review submissions, checks for:
5
+ - Author name leaks in acknowledgments
6
+ - Personal URLs (GitHub, personal pages)
7
+ - Self-citations that reveal identity
8
+ - Institutional information in comments
9
+ """
10
+ import re
11
+ from typing import List
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class AnonymizationChecker(BaseChecker):
    """Check for anonymization issues in double-blind submissions.

    If the document looks like a camera-ready version, all checks are
    skipped; otherwise it flags personal URLs (including those inside
    comments), un-commented acknowledgments, self-revealing citation
    phrasing, and a non-anonymous \\author field.
    """

    name = "anonymization"
    display_name = "Anonymization"
    description = "Detect potential identity leaks in double-blind submissions"

    # Patterns for identity-revealing content: (regex, human-readable label)
    PERSONAL_URL_PATTERNS = [
        (r'github\.com/[a-zA-Z0-9_-]+', "GitHub profile URL"),
        (r'gitlab\.com/[a-zA-Z0-9_-]+', "GitLab profile URL"),
        (r'twitter\.com/[a-zA-Z0-9_]+', "Twitter profile URL"),
        (r'linkedin\.com/in/[a-zA-Z0-9_-]+', "LinkedIn profile URL"),
        (r'huggingface\.co/[a-zA-Z0-9_-]+', "HuggingFace profile URL"),
        (r'~[a-zA-Z]+/', "Personal university page"),
        (r'people\.[a-zA-Z]+\.edu', "Academic personal page"),
        (r'homes\.[a-zA-Z]+\.(edu|ac\.[a-z]+)', "Academic home page"),
    ]

    # Anonymous submission indicators (should be present)
    # NOTE(review): this list is not referenced anywhere in this class —
    # confirm whether it was meant to feed _is_review_version().
    ANONYMOUS_MARKERS = [
        r'\\author\{[^}]*anonymous[^}]*\}',
        r'anonymous\s+submission',
        r'\\runningauthor\{[^}]*\}',  # Should be empty or generic
    ]

    # Potentially revealing patterns
    SELF_CITE_PATTERNS = [
        r'\\cite[pt]?\{[^}]*\}\s*(?:show|demonstrate|propose|present|introduce)',
        r'(?:our|we)\s+(?:previous|prior|earlier)\s+(?:work|paper|study)',
        r'(?:as\s+)?(?:we|the\s+authors?)\s+(?:have\s+)?(?:shown|demonstrated|proved)',
    ]

    # Acknowledgment patterns: matches \section{Acknowledgment...} (starred
    # or not) and \begin{ack...} environments
    ACK_PATTERN = re.compile(
        r'\\(?:section\*?\{acknowledgment|begin\{ack)',
        re.IGNORECASE
    )

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all anonymization checks and return the collected results."""
        results = []
        lines = tex_content.split('\n')

        # Check if this is a review submission (look for anonymous author)
        is_review_version = self._is_review_version(tex_content)

        if not is_review_version:
            # If camera-ready, skip anonymization checks
            results.append(self._create_result(
                passed=True,
                severity=CheckSeverity.INFO,
                message="Document appears to be camera-ready version (not checking anonymization)"
            ))
            return results

        # Check for personal URLs
        for line_num, line in enumerate(lines, 1):
            # Comment lines are still scanned (WARNING instead of ERROR):
            # the URL could be revealed when the source is shared/compiled.
            if self._is_comment_line(line):
                for pattern, desc in self.PERSONAL_URL_PATTERNS:
                    if re.search(pattern, line, re.IGNORECASE):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"{desc} in comment (could be revealed when compiling)",
                            line_number=line_num,
                            line_content=line.strip()[:100],
                            suggestion="Remove or anonymize URL even in comments"
                        ))
                continue

            for pattern, desc in self.PERSONAL_URL_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{desc} may reveal author identity",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace with anonymized URL or remove for review"
                    ))

        # Check acknowledgments section
        ack_results = self._check_acknowledgments(tex_content, lines)
        results.extend(ack_results)

        # Check for self-revealing citations
        for line_num, line in enumerate(lines, 1):
            # Skip comments using base class method
            if self._is_comment_line(line):
                continue

            for pattern in self.SELF_CITE_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Potentially self-revealing citation pattern",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
                    ))

        # Check for \author content
        author_results = self._check_author_field(tex_content)
        results.extend(author_results)

        return results

    def _is_review_version(self, content: str) -> bool:
        """Detect if this is a review (anonymous) version.

        Only the first 2000 characters (the preamble region) are scanned.
        Returns True by default when no indicator is found — the safer
        assumption for an anonymization check.
        """
        # Check for common anonymous submission markers
        review_indicators = [
            r'review',
            r'submitted\s+to',
            r'under\s+review',
            r'anonymous',
            r'\\usepackage\[review\]',
        ]

        for indicator in review_indicators:
            if re.search(indicator, content[:2000], re.IGNORECASE):
                return True

        # Check for camera-ready indicators (negative)
        camera_indicators = [
            r'\\usepackage\[accepted\]',
            r'\\usepackage\[final\]',
            r'camera[\s-]?ready',
        ]

        for indicator in camera_indicators:
            if re.search(indicator, content[:2000], re.IGNORECASE):
                return False

        # Default to review version (safer)
        return True

    def _check_acknowledgments(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check acknowledgments section for identity leaks.

        Flags the first acknowledgments section found unless its line is
        commented out.
        """
        results = []

        # Find acknowledgment section
        ack_match = self.ACK_PATTERN.search(content)
        if not ack_match:
            return results

        # Find the line number
        ack_line = self._find_line_number(content, ack_match.start())

        # Check if it's commented out
        actual_line = lines[ack_line - 1] if ack_line <= len(lines) else ""
        if not actual_line.lstrip().startswith('%'):
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Acknowledgments section found - should be commented out for review",
                line_number=ack_line,
                suggestion="Comment out acknowledgments with % for anonymous submission"
            ))

        return results

    def _check_author_field(self, content: str) -> List[CheckResult]:
        """Check \\author{} field for revealing content.

        Extracts the (possibly multiline, brace-nested) author argument and
        flags it when it contains what looks like a real "First Last" name
        and no anonymization marker.
        """
        results = []

        # Find \author{...} - handle multiline
        author_pattern = re.compile(r'\\author\s*\{', re.DOTALL)
        match = author_pattern.search(content)

        if match:
            # Extract author content (handle nested braces)
            start = match.end()
            brace_count = 1
            i = start
            while i < len(content) and brace_count > 0:
                if content[i] == '{':
                    brace_count += 1
                elif content[i] == '}':
                    brace_count -= 1
                i += 1

            author_content = content[start:i-1]
            line_num = self._find_line_number(content, match.start())

            # Check if author content looks anonymous
            if not re.search(r'anonymous|author\s*names?\s*hidden', author_content, re.IGNORECASE):
                # Check if it's not using \Anonymous or similar
                if not re.search(r'\\(Anonymous|blindauthor)', author_content):
                    # Might contain real author info: two capitalized words in a row
                    if re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', author_content):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.ERROR,
                            message="Author field may contain real names",
                            line_number=line_num,
                            suggestion="Replace with 'Anonymous' or use anonymization command"
                        ))

        return results
src/checkers/base.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base checker class for paper submission quality checks.
3
+
4
+ All specific checkers inherit from BaseChecker and implement
5
+ the check() method to validate specific aspects of the TeX document.
6
+ """
7
+ import re
8
+ from abc import ABC, abstractmethod
9
+ from dataclasses import dataclass
10
+ from enum import Enum
11
+ from typing import List, Optional, Tuple
12
+
13
+
14
class CheckSeverity(Enum):
    """Severity levels for check results, from blocking to advisory."""

    # Must fix before submission.
    ERROR = "error"
    # Strongly recommended to fix.
    WARNING = "warning"
    # Suggestion or best practice.
    INFO = "info"
19
+
20
+
21
@dataclass
class CheckResult:
    """Result of a single check performed by one checker."""

    checker_name: str                      # machine name of the originating checker
    passed: bool                           # True when no issue was found
    severity: CheckSeverity                # how serious the finding is
    message: str                           # human-readable description
    line_number: Optional[int] = None      # 1-based line of the finding, if known
    line_content: Optional[str] = None     # offending line excerpt, if captured
    suggestion: Optional[str] = None       # recommended fix, if any
    file_path: Optional[str] = None        # source file, if known

    def to_dict(self) -> dict:
        """Serialize this result to a plain, JSON-friendly dict."""
        return dict(
            checker=self.checker_name,
            passed=self.passed,
            severity=self.severity.value,
            message=self.message,
            line=self.line_number,
            content=self.line_content,
            suggestion=self.suggestion,
            file_path=self.file_path,
        )
44
+
45
+
46
class BaseChecker(ABC):
    """
    Abstract base class for all paper submission checkers.

    Each checker validates a specific aspect of the paper,
    such as caption placement, reference integrity, or formatting.
    Subclasses override the metadata attributes and implement check().
    """

    # Checker metadata - override in subclasses
    name: str = "base"
    display_name: str = "Base Checker"
    description: str = "Base checker class"

    @abstractmethod
    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """
        Run the check on the given TeX content.

        Args:
            tex_content: The full content of the TeX file
            config: Optional configuration dict (e.g., conference-specific settings)

        Returns:
            List of CheckResult objects describing found issues
        """
        pass

    @staticmethod
    def _percent_is_escaped(text: str, idx: int) -> bool:
        """Return True if the '%' at *idx* in *text* is escaped.

        A '%' is escaped when it is preceded by an odd-length run of
        backslashes (e.g. '\\%' is escaped, '\\\\%' is not).
        """
        run = 0
        k = idx - 1
        while k >= 0 and text[k] == '\\':
            run += 1
            k -= 1
        return run % 2 == 1

    def _remove_line_comment(self, line: str) -> str:
        """Return *line* truncated at its first unescaped '%', if any."""
        for idx, ch in enumerate(line):
            if ch == '%' and not self._percent_is_escaped(line, idx):
                return line[:idx]
        return line

    def _remove_comments(self, content: str) -> str:
        """
        Remove all LaTeX comments from content.

        Preserves line structure (each line keeps only the text before its
        comment). Escaped percent signs (\\%) are left intact.
        """
        return '\n'.join(
            self._remove_line_comment(raw) for raw in content.split('\n')
        )

    def _is_comment_line(self, line: str) -> bool:
        """True if the line's first non-whitespace character is '%'."""
        stripped = line.lstrip()
        return bool(stripped) and stripped.startswith('%')

    def _get_non_comment_lines(self, content: str) -> List[Tuple[int, str]]:
        """
        Get all non-comment lines with their line numbers.

        Returns:
            List of (line_number, line_content) tuples for non-comment lines.
            Line numbers are 1-based; inline comments are stripped and lines
            that become empty after stripping are dropped.
        """
        kept = []
        for number, raw in enumerate(content.split('\n'), 1):
            # Drop lines that are comments in their entirety
            if self._is_comment_line(raw):
                continue
            cleaned = self._remove_line_comment(raw)
            # Drop lines with no remaining content either
            if cleaned.strip():
                kept.append((number, cleaned))
        return kept

    def _find_line_number(self, content: str, position: int) -> int:
        """Return the 1-based line number of character *position*."""
        return content.count('\n', 0, position) + 1

    def _get_line_content(self, content: str, line_number: int) -> str:
        """Return the stripped text of 1-based *line_number* ('' if out of range)."""
        all_lines = content.split('\n')
        if not 1 <= line_number <= len(all_lines):
            return ""
        return all_lines[line_number - 1].strip()

    def _is_commented(self, content: str, position: int) -> bool:
        """Check whether *position* lies after an unescaped '%' on its line."""
        # Text from the start of the current line up to the position
        line_start = content.rfind('\n', 0, position) + 1
        prefix = content[line_start:position]
        return any(
            ch == '%' and not self._percent_is_escaped(prefix, idx)
            for idx, ch in enumerate(prefix)
        )

    def _create_result(
        self,
        passed: bool,
        severity: CheckSeverity,
        message: str,
        line_number: Optional[int] = None,
        line_content: Optional[str] = None,
        suggestion: Optional[str] = None
    ) -> CheckResult:
        """Helper to create a CheckResult stamped with this checker's name."""
        return CheckResult(
            checker_name=self.name,
            passed=passed,
            severity=severity,
            message=message,
            line_number=line_number,
            line_content=line_content,
            suggestion=suggestion,
        )
193
+
src/checkers/caption_checker.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Caption placement checker.
3
+
4
+ Validates that:
5
+ - Table captions appear ABOVE the table content
6
+ - Figure captions appear BELOW the figure content
7
+ """
8
+ import re
9
+ from typing import List
10
+
11
+ from .base import BaseChecker, CheckResult, CheckSeverity
12
+
13
+
14
class CaptionChecker(BaseChecker):
    """Check for correct caption placement in tables and figures.

    Convention enforced:
    - table captions appear ABOVE the tabular content
    - figure captions appear BELOW the graphics/tikz content
    """

    name = "caption"
    display_name = "Caption Placement"
    description = "Verify table captions are above and figure captions are below"

    # Patterns for environments. DOTALL lets bodies span lines; the \*?
    # also matches the starred (table*/figure*) two-column variants.
    TABLE_ENV_PATTERN = re.compile(
        r'\\begin\{table\*?\}(.*?)\\end\{table\*?\}',
        re.DOTALL | re.IGNORECASE
    )
    FIGURE_ENV_PATTERN = re.compile(
        r'\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}',
        re.DOTALL | re.IGNORECASE
    )

    # Content patterns. The [\[{] after \caption accepts both the
    # short-caption form \caption[...]{...} and the plain \caption{...}.
    CAPTION_PATTERN = re.compile(r'\\caption\s*[\[{]')
    TABULAR_PATTERN = re.compile(r'\\begin\{tabular')
    INCLUDEGRAPHICS_PATTERN = re.compile(r'\\includegraphics')
    TIKZ_PATTERN = re.compile(r'\\begin\{tikzpicture\}')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan every table/figure environment and report caption problems."""
        results = []

        # Check table environments
        for match in self.TABLE_ENV_PATTERN.finditer(tex_content):
            env_content = match.group(1)
            env_start = match.start()

            # Skip if commented
            if self._is_commented(tex_content, env_start):
                continue

            result = self._check_table_caption(env_content, tex_content, env_start)
            if result:
                results.append(result)

        # Check figure environments
        for match in self.FIGURE_ENV_PATTERN.finditer(tex_content):
            env_content = match.group(1)
            env_start = match.start()

            # Skip if commented
            if self._is_commented(tex_content, env_start):
                continue

            result = self._check_figure_caption(env_content, tex_content, env_start)
            if result:
                results.append(result)

        return results

    def _check_table_caption(self, env_content: str, full_content: str, env_start: int) -> CheckResult:
        """Check that table caption is above tabular content.

        Returns a failing CheckResult, or None when placement is fine or the
        environment contains no tabular block to compare against.
        """
        caption_match = self.CAPTION_PATTERN.search(env_content)
        tabular_match = self.TABULAR_PATTERN.search(env_content)

        if not caption_match:
            line_num = self._find_line_number(full_content, env_start)
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Table environment missing caption",
                line_number=line_num,
                suggestion="Add \\caption{} before \\begin{tabular}"
            )

        if not tabular_match:
            # Table without tabular content - skip
            return None

        # Caption should come BEFORE tabular
        if caption_match.start() > tabular_match.start():
            # env_start + offset maps the within-environment match position
            # back to a position in the full document for line reporting.
            line_num = self._find_line_number(full_content, env_start + caption_match.start())
            return self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message="Table caption should be placed ABOVE the table content",
                line_number=line_num,
                line_content=self._get_line_content(full_content, line_num),
                suggestion="Move \\caption{} before \\begin{tabular}"
            )

        return None

    def _check_figure_caption(self, env_content: str, full_content: str, env_start: int) -> CheckResult:
        """Check that figure caption is below image content.

        Returns a failing CheckResult, or None when placement is fine or the
        figure has neither \\includegraphics nor a tikzpicture.
        """
        caption_match = self.CAPTION_PATTERN.search(env_content)
        graphics_match = self.INCLUDEGRAPHICS_PATTERN.search(env_content)
        tikz_match = self.TIKZ_PATTERN.search(env_content)

        # Find the actual content (either graphics or tikz)
        content_match = graphics_match or tikz_match

        if not caption_match:
            line_num = self._find_line_number(full_content, env_start)
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Figure environment missing caption",
                line_number=line_num,
                suggestion="Add \\caption{} after \\includegraphics"
            )

        if not content_match:
            # Figure without graphics/tikz - could be custom content, skip
            return None

        # Caption should come AFTER content
        if caption_match.start() < content_match.start():
            line_num = self._find_line_number(full_content, env_start + caption_match.start())
            return self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message="Figure caption should be placed BELOW the figure content",
                line_number=line_num,
                line_content=self._get_line_content(full_content, line_num),
                suggestion="Move \\caption{} after \\includegraphics"
            )

        return None
src/checkers/citation_quality_checker.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Citation quality checker.
3
+
4
+ Validates:
5
+ - Old citations (>30 years) that might need updating
6
+ - Citation formatting patterns (et al., hardcoded citations, etc.)
7
+ """
8
+ import re
9
+ from typing import List, Dict
10
+ from datetime import datetime
11
+ from collections import defaultdict
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class CitationQualityChecker(BaseChecker):
    """Check citation quality and balance.

    Works on the raw .tex source only (no .bib access): flags visibly old
    publication years and common citation-formatting mistakes.
    """

    name = "citation_quality"
    display_name = "Citation Quality"
    description = "Check citation age, balance, and formatting"

    # Thresholds
    OLD_CITATION_YEARS = 30  # Citations older than this get flagged

    # Evaluated once at import time, so a very long-lived process could
    # lag by a year at most.
    CURRENT_YEAR = datetime.now().year

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all citation-quality heuristics over *tex_content*."""
        results = []

        # This checker works best with bib content, but we can do some analysis
        # on the tex file alone by looking at citation patterns

        # Check for inline year citations that are old
        old_cite_results = self._check_old_citations_in_text(tex_content)
        results.extend(old_cite_results)

        # Check for citation formatting issues
        format_results = self._check_citation_formatting(tex_content)
        results.extend(format_results)

        return results

    def _check_old_citations_in_text(self, content: str) -> List[CheckResult]:
        """Look for citations with old years visible in text.

        Each distinct old year is reported once, at its first occurrence.
        NOTE(review): the regex only matches 1980-2019; years 2020+ cannot
        be "old" under the 30-year threshold until 2050, but the range will
        need widening eventually - confirm intent.
        """
        results = []
        lines = content.split('\n')

        # Pattern for citations with year, like "Smith et al. (2010)" or "(Smith, 2010)"
        year_pattern = re.compile(
            r'(?:\([^)]*(?:19[89]\d|20[01]\d)[^)]*\)|'  # Parenthetical
            r'\b(?:19[89]\d|20[01]\d)\b)',  # Standalone year
            re.IGNORECASE
        )

        old_years_found = set()

        for line_num, line in enumerate(lines, 1):
            # Skip comments using base class method
            if self._is_comment_line(line):
                continue

            for match in year_pattern.finditer(line):
                # Extract the bare year from the (possibly parenthetical) match
                year_str = re.search(r'(19[89]\d|20[01]\d)', match.group())
                if year_str:
                    year = int(year_str.group())
                    age = self.CURRENT_YEAR - year

                    if age >= self.OLD_CITATION_YEARS and year not in old_years_found:
                        old_years_found.add(year)
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.INFO,
                            message=f"Citation from {year} ({age} years old)",
                            line_number=line_num,
                            suggestion=f"Consider if there's more recent work on this topic"
                        ))

        return results

    def _check_citation_formatting(self, content: str) -> List[CheckResult]:
        """Check for common citation formatting issues.

        Flags: 'et al' without a period, bracketed numeric citations typed
        as literal text, and author-year citations hardcoded instead of
        produced by \\cite.
        """
        results = []
        lines = content.split('\n')

        for line_num, line in enumerate(lines, 1):
            # Skip full-line comments
            if line.lstrip().startswith('%'):
                continue

            # Check for "et al" without period
            if re.search(r'\bet al\b(?!\.)', line):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="'et al' should be 'et al.'",
                    line_number=line_num,
                    suggestion="Add period after 'et al.'"
                ))

            # Check for "[1]" style citations (might want natbib style)
            # Skip if it's a command definition or argument
            if re.search(r'\[\d+\]', line):
                # Skip if in command definition
                if '\\newcommand' in line or '\\renewcommand' in line or '\\def' in line:
                    continue
                # Skip if it's clearly a command argument like [1] in \newcommand{\foo}[1]
                if re.search(r'\\[a-zA-Z]+\[\d+\]', line):
                    continue
                # Only flag if it looks like actual citation in text
                # (no \cite on the line and the line does not open with a command)
                if '\\cite' not in line and not re.search(r'\\[a-zA-Z]+\{', line[:20]):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message="Numeric citation style detected",
                        line_number=line_num,
                        suggestion="Consider author-year style for better readability"
                    ))

            # Check for hardcoded citations instead of \cite,
            # e.g. a literal "(Smith et al., 2020)" typed into the text
            if re.search(r'\([A-Z][a-z]+(?:\s+et\s+al\.?)?,?\s*\d{4}\)', line):
                if '\\cite' not in line:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Appears to be hardcoded citation instead of \\cite",
                        line_number=line_num,
                        line_content=line.strip()[:80],
                        suggestion="Use \\cite{} for proper bibliography management"
                    ))

        return results
src/checkers/consistency_checker.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Terminology consistency checker.
3
+
4
+ Validates:
5
+ - Consistent spelling of the same term
6
+ - Consistent hyphenation
7
+ - Consistent capitalization of technical terms
8
+ """
9
+ import re
10
+ from typing import List, Dict, Set
11
+ from collections import defaultdict
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class ConsistencyChecker(BaseChecker):
    """Check terminology and spelling consistency.

    Flags three families of inconsistency across the document:
    - known variant pairs (hyphenation, American vs British spelling)
    - ad-hoc hyphenation drift ("pre-trained" vs "pre trained"/"pretrained")
    - mixed capitalization of technical terms ("BERT" vs "bert")
    """

    name = "consistency"
    display_name = "Consistency"
    description = "Check for inconsistent terminology and spelling"

    # Known variant pairs (canonical -> variants)
    KNOWN_VARIANTS = {
        # Hyphenation variants
        'self-supervised': ['self supervised', 'selfsupervised'],
        'pre-trained': ['pre trained', 'pretrained'],
        'fine-tuned': ['fine tuned', 'finetuned'],
        'state-of-the-art': ['state of the art', 'stateoftheart'],
        'real-world': ['real world', 'realworld'],
        'end-to-end': ['end to end', 'endtoend', 'e2e'],
        'large-scale': ['large scale', 'largescale'],
        'long-term': ['long term', 'longterm'],
        'short-term': ['short term', 'shortterm'],
        'multi-task': ['multi task', 'multitask'],
        'multi-modal': ['multi modal', 'multimodal'],
        'cross-lingual': ['cross lingual', 'crosslingual'],
        'zero-shot': ['zero shot', 'zeroshot'],
        'few-shot': ['few shot', 'fewshot'],
        'in-context': ['in context', 'incontext'],

        # American vs British English (comprehensive list)
        # -or/-our endings
        'color': ['colour'],
        'behavior': ['behaviour'],
        'favor': ['favour'],
        'honor': ['honour'],
        'labor': ['labour'],
        'neighbor': ['neighbour'],
        'rumor': ['rumour'],
        'vapor': ['vapour'],

        # -ize/-ise endings
        'analyze': ['analyse'],
        'characterize': ['characterise'],
        'generalize': ['generalise'],
        'initialize': ['initialise'],
        'maximize': ['maximise'],
        'minimize': ['minimise'],
        'normalize': ['normalise'],
        'optimize': ['optimise'],
        'organize': ['organise'],
        'realize': ['realise'],
        'recognize': ['recognise'],
        'specialize': ['specialise'],
        'standardize': ['standardise'],
        'summarize': ['summarise'],
        'utilize': ['utilise'],
        'visualize': ['visualise'],
        'categorize': ['categorise'],
        'emphasize': ['emphasise'],
        'hypothesize': ['hypothesise'],
        'prioritize': ['prioritise'],
        'synchronize': ['synchronise'],

        # -ization/-isation endings
        'generalization': ['generalisation'],
        'initialization': ['initialisation'],
        'maximization': ['maximisation'],
        'minimization': ['minimisation'],
        'normalization': ['normalisation'],
        'optimization': ['optimisation'],
        'organization': ['organisation'],
        'realization': ['realisation'],
        'regularization': ['regularisation'],
        'specialization': ['specialisation'],
        'standardization': ['standardisation'],
        'summarization': ['summarisation'],
        'utilization': ['utilisation'],
        'visualization': ['visualisation'],
        'categorization': ['categorisation'],
        'characterization': ['characterisation'],
        'parametrization': ['parametrisation'],
        'quantization': ['quantisation'],

        # -er/-re endings
        'center': ['centre'],
        'fiber': ['fibre'],
        'meter': ['metre'],
        'liter': ['litre'],

        # -l-/-ll- (American single, British double)
        'modeling': ['modelling'],
        'labeled': ['labelled'],
        'labeling': ['labelling'],
        'traveled': ['travelled'],
        'traveling': ['travelling'],
        'canceled': ['cancelled'],
        'canceling': ['cancelling'],
        'signaled': ['signalled'],
        'signaling': ['signalling'],

        # -og/-ogue endings
        'analog': ['analogue'],
        'catalog': ['catalogue'],
        'dialog': ['dialogue'],

        # -ense/-ence endings
        'defense': ['defence'],
        'license': ['licence'],
        'offense': ['offence'],

        # Other common differences
        'gray': ['grey'],
        'artifact': ['artefact'],
        'program': ['programme'],  # Note: 'program' is standard in computing
        'skeptical': ['sceptical'],
        'aluminum': ['aluminium'],

        # Verb forms
        'learned': ['learnt'],
        'burned': ['burnt'],
        'spelled': ['spelt'],

        # Common term variants
        'dataset': ['data set', 'data-set'],
        'benchmark': ['bench mark', 'bench-mark'],
        'baseline': ['base line', 'base-line'],
        'downstream': ['down stream', 'down-stream'],
        'upstream': ['up stream', 'up-stream'],
        'encoder': ['en-coder'],
        'decoder': ['de-coder'],
    }

    # Capitalization variants to track (compared case-insensitively, then
    # flagged if more than one surface form appears)
    CAPITALIZATION_TERMS = [
        'transformer', 'attention', 'bert', 'gpt', 'lstm', 'cnn', 'rnn',
        'encoder', 'decoder', 'embedding', 'softmax', 'sigmoid', 'relu',
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all consistency heuristics over *tex_content*."""
        results = []

        # Strip LaTeX comments so commented-out text doesn't trigger flags.
        # (?<!\\)% leaves escaped \% intact.
        content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)

        # Check for known variant inconsistencies: flag whenever two or more
        # forms of the same term appear in the document.
        for canonical, variants in self.KNOWN_VARIANTS.items():
            found_forms = []

            # Check canonical form
            if re.search(rf'\b{re.escape(canonical)}\b', content, re.IGNORECASE):
                found_forms.append(canonical)

            # Check variants
            for variant in variants:
                if re.search(rf'\b{re.escape(variant)}\b', content, re.IGNORECASE):
                    found_forms.append(variant)

            if len(found_forms) > 1:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Inconsistent spelling: {', '.join(found_forms)}",
                    suggestion=f"Use '{canonical}' consistently throughout"
                ))

        # Check hyphenated word consistency
        hyphen_results = self._check_hyphenation_consistency(content)
        results.extend(hyphen_results)

        # Check capitalization consistency
        cap_results = self._check_capitalization_consistency(content)
        results.extend(cap_results)

        return results

    def _check_hyphenation_consistency(self, content: str) -> List[CheckResult]:
        """Find words that appear both hyphenated and non-hyphenated."""
        results = []

        # Common terms that should always be hyphenated (exceptions)
        ALWAYS_HYPHENATED = {
            'state-of-the-art', 'end-to-end', 'real-time', 'real-world',
            'fine-tuning', 'fine-grained', 'large-scale', 'small-scale',
            'multi-task', 'multi-modal', 'cross-domain', 'cross-lingual',
            'self-supervised', 'self-attention', 'co-training', 'pre-training',
            'post-processing', 'pre-processing', 'well-known', 'well-defined',
            'high-quality', 'low-quality', 'long-term', 'short-term'
        }

        # Find all hyphenated words (two or more hyphen-joined parts)
        hyphenated = set(re.findall(r'\b([a-z]+-[a-z]+(?:-[a-z]+)*)\b', content, re.IGNORECASE))

        for hyph_word in hyphenated:
            # Skip if it's a known compound that should always be hyphenated
            if hyph_word.lower() in ALWAYS_HYPHENATED:
                continue

            # Create non-hyphenated version
            non_hyph = hyph_word.replace('-', ' ')
            combined = hyph_word.replace('-', '')

            # Check if the spaced or the fused variant also exists
            if re.search(rf'\b{re.escape(non_hyph)}\b', content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{non_hyph}'",
                    suggestion="Choose one form and use it consistently"
                ))
            elif re.search(rf'\b{re.escape(combined)}\b', content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{combined}'",
                    suggestion="Choose one form and use it consistently"
                ))

        return results

    def _check_capitalization_consistency(self, content: str) -> List[CheckResult]:
        """Check if technical terms have consistent capitalization."""
        results = []

        for term in self.CAPITALIZATION_TERMS:
            # Find all case variations of the term
            pattern = re.compile(rf'\b{term}\b', re.IGNORECASE)
            matches = pattern.findall(content)

            if len(matches) > 1:
                # Check if there are mixed capitalizations
                unique_forms = set(matches)
                if len(unique_forms) > 1:
                    forms_str = ', '.join(f"'{f}'" for f in unique_forms)
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Inconsistent capitalization: {forms_str}",
                        suggestion="Use consistent capitalization for technical terms"
                    ))

        return results
src/checkers/equation_checker.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Equation formatting checker.
3
+
4
+ Validates:
5
+ - Punctuation after equations (based on grammar)
6
+ - Equation numbering consistency
7
+ - Variable definitions
8
+ """
9
+ import re
10
+ from typing import List, Set
11
+
12
+ from .base import BaseChecker, CheckResult, CheckSeverity
13
+
14
+
15
class EquationChecker(BaseChecker):
    """Check equation formatting and consistency.

    Heuristics:
    - equations followed by a lowercase word probably need trailing punctuation
    - mixing numbered and unnumbered display equations
    - mixing $...$ and \\(...\\) inline math delimiters
    """

    name = "equation"
    display_name = "Equations"
    description = "Check equation formatting and punctuation"

    # Display-math environments to inspect; starred forms are unnumbered.
    EQUATION_ENVS = [
        'equation', 'align', 'gather', 'multline', 'eqnarray',
        'equation*', 'align*', 'gather*', 'multline*', 'eqnarray*'
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all equation heuristics over *tex_content*."""
        results = []

        # Check equation punctuation
        results.extend(self._check_equation_punctuation(tex_content))

        # Check for numbered vs unnumbered consistency
        results.extend(self._check_numbering_consistency(tex_content))

        # Check inline math consistency ($...$ vs \(...\))
        results.extend(self._check_inline_math_consistency(tex_content))

        return results

    def _check_equation_punctuation(self, content: str) -> List[CheckResult]:
        """Check if equations end with appropriate punctuation.

        Flags an equation whose body lacks trailing . , or ; when the text
        immediately after \\end{...} starts lowercase (sentence continues).
        """
        results = []

        for env in self.EQUATION_ENVS:
            # re.escape handles the '*' in starred environment names, which
            # would otherwise be a regex quantifier.
            env_escaped = re.escape(env)

            # Find equation content
            pattern = re.compile(
                rf'\\begin\{{{env_escaped}\}}(.*?)\\end\{{{env_escaped}\}}',
                re.DOTALL
            )

            for match in pattern.finditer(content):
                eq_content = match.group(1).strip()

                # Check what comes after the equation
                after_pos = match.end()
                after_text = content[after_pos:after_pos + 50].strip()

                # Labels are invisible in output, so ignore them when
                # looking for trailing punctuation.
                eq_content_clean = re.sub(r'\\label\{[^}]+\}', '', eq_content).strip()

                if eq_content_clean and not re.search(r'[.,;]$', eq_content_clean):
                    # Lowercase continuation implies the equation ends a clause
                    if after_text and after_text[0].islower():
                        line_num = self._find_line_number(content, match.end())
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.INFO,
                            message="Equation may need punctuation (sentence continues after)",
                            line_number=line_num,
                            suggestion="Add comma or period inside equation if it ends a clause"
                        ))

        return results

    def _check_numbering_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed numbered and unnumbered equations.

        Fixes vs. the original implementation:
        - environment names are regex-escaped, so 'equation*' no longer
          compiles to 'equatio' + 'n*' (which never matched the starred
          form and wrongly matched the plain one);
        - a starred environment counts as unnumbered and a plain one as
          numbered, instead of a single \\nonumber anywhere in the file
          flipping every environment to "unnumbered".
        \\nonumber/\\notag occurrences are still added to the unnumbered
        tally, as they suppress numbers on individual lines.
        """
        results = []

        numbered = 0
        unnumbered = 0

        for env in self.EQUATION_ENVS:
            count = len(re.findall(rf'\\begin\{{{re.escape(env)}\}}', content))
            if env.endswith('*'):
                unnumbered += count
            else:
                numbered += count

        # Per-line number suppression inside numbered environments
        unnumbered += len(re.findall(r'\\nonumber|\\notag', content))

        # If there's a significant mix, warn
        total = numbered + unnumbered
        if total > 3 and numbered > 0 and unnumbered > 0:
            ratio = min(numbered, unnumbered) / total
            if ratio > 0.2:  # More than 20% in minority
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Mixed equation numbering: {numbered} numbered, {unnumbered} unnumbered",
                    suggestion="Consider consistent numbering strategy"
                ))

        return results

    def _check_inline_math_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed inline math delimiters ($...$ vs \\(...\\))."""
        results = []

        # Count different inline math styles; the lookarounds keep $$...$$
        # display math from being counted as inline.
        dollar_count = len(re.findall(r'(?<!\$)\$(?!\$)[^$]+\$(?!\$)', content))
        paren_count = len(re.findall(r'\\\(.*?\\\)', content))

        if dollar_count > 0 and paren_count > 0:
            # Fixed message: the original f-string emitted a stray '$'
            # before the count ("${dollar_count}").
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed inline math: {dollar_count} $...$ and {paren_count} \\(...\\)",
                suggestion="Use consistent inline math delimiters throughout"
            ))

        return results
src/checkers/formatting_checker.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Formatting checker.
3
+
4
+ Validates common LaTeX formatting issues:
5
+ - Citation formatting consistency
6
+ - Non-breaking spaces before citations
7
+ - Special character escaping
8
+ - Whitespace issues
9
+ """
10
+ import re
11
+ from typing import List
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class FormattingChecker(BaseChecker):
    """Check for common LaTeX formatting issues.

    Covers: citations missing a non-breaking space, mixed citation command
    styles, runs of 3+ blank lines, and unescaped '&' outside math/tabular.
    """

    name = "formatting"
    display_name = "Formatting"
    description = "Check citation style, spacing, and special characters"

    # Citation commands
    CITE_COMMANDS = ['cite', 'citep', 'citet', 'citealt', 'citealp',
                     'citeauthor', 'citeyear', 'autocite', 'textcite',
                     'parencite', 'footcite']

    # Pattern for citations without non-breaking space
    # Matches: "word \cite" but not "word~\cite"
    CITE_NO_NBSP_PATTERN = re.compile(r'(\w)\s+(\\cite\w*\{)')

    # Pattern for multiple consecutive spaces (currently unused; kept for
    # future checks / external callers)
    MULTI_SPACE_PATTERN = re.compile(r'(?<!\\) +')

    # Patterns for unescaped special characters outside math mode
    # (currently only '&' is enforced in _check_special_chars)
    SPECIAL_CHARS = {
        '%': r'(?<!\\)%',  # Unescaped %
        '&': r'(?<!\\)&(?![a-zA-Z]+;)',  # Unescaped & (not HTML entities)
        '#': r'(?<!\\)#',  # Unescaped #
        '_': r'(?<![\\$])_(?![^$]*\$)',  # Unescaped _ outside math
        '^': r'(?<![\\$])\^(?![^$]*\$)',  # Unescaped ^ outside math
    }

    # Multiple blank lines pattern (3 or more blank lines)
    MULTI_BLANK_PATTERN = re.compile(r'\n\s*\n\s*\n\s*\n')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all formatting heuristics over *tex_content*."""
        results = []
        lines = tex_content.split('\n')

        # Track citation style consistency
        cite_styles = {'parenthetical': 0, 'textual': 0, 'plain': 0}

        for line_num, line in enumerate(lines, 1):
            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method
            line_content = self._remove_line_comment(line)

            # Check citation non-breaking space (one report per occurrence)
            for match in self.CITE_NO_NBSP_PATTERN.finditer(line_content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Citation without non-breaking space",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
                ))

            # Track which citation command families the document uses
            for cmd in self.CITE_COMMANDS:
                if re.search(rf'\\{cmd}\b', line_content):
                    if cmd in ['citep', 'parencite', 'autocite']:
                        cite_styles['parenthetical'] += 1
                    elif cmd in ['citet', 'textcite']:
                        cite_styles['textual'] += 1
                    elif cmd == 'cite':
                        cite_styles['plain'] += 1

        # Check citation style consistency
        styles_used = [s for s, count in cite_styles.items() if count > 0]
        if len(styles_used) > 1:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed citation styles detected: {', '.join(styles_used)}",
                suggestion="Consider using consistent citation style throughout"
            ))

        # Check for multiple blank lines (3 or more)
        for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
            line_num = self._find_line_number(tex_content, match.start())
            # Count how many blank lines
            blank_count = match.group(0).count('\n') - 1

            # Get context: the line before, blank lines, and the line after
            start_pos = match.start()
            end_pos = match.end()

            # Find the line before the blank lines
            prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
            prev_line = tex_content[prev_line_start:start_pos].rstrip()

            # Find the line after the blank lines
            next_line_end = tex_content.find('\n', end_pos)
            if next_line_end == -1:
                next_line_end = len(tex_content)
            next_line = tex_content[end_pos:next_line_end].rstrip()

            # Create visual representation with warning markers
            blank_lines = '\n'.join([f"> blank line ⚠️"] * blank_count)
            line_content = f"{prev_line}\n{blank_lines}\n{next_line}"

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
                line_number=line_num,
                line_content=line_content,
                suggestion="Reduce to single blank line or use \\vspace"
            ))

        # Check for common issues with special characters
        results.extend(self._check_special_chars(tex_content, lines))

        return results

    def _check_special_chars(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check for unescaped special characters (currently only '&')."""
        results = []

        # Find math environments to skip
        math_regions = self._find_math_regions(content)

        # Precompute every line's starting offset once. The previous
        # implementation recomputed it with sum() per line, making this
        # loop O(n^2) in the number of lines.
        line_offsets = []
        offset = 0
        for raw in lines:
            line_offsets.append(offset)
            offset += len(raw) + 1  # +1 for the '\n' removed by split

        for line_num, line in enumerate(lines, 1):
            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method
            line_content = self._remove_line_comment(line)

            # Position of this line's first character in the full content
            line_start = line_offsets[line_num - 1]

            # Check for unescaped & (common error); the lookahead skips
            # HTML-entity-like sequences such as &amp;
            for match in re.finditer(r'(?<!\\)&(?![a-zA-Z]+;)', line_content):
                pos = line_start + match.start()
                # Skip if in math
                if not self._in_math_region(pos, math_regions):
                    # Also skip if inside tabular-like environments where & is a separator
                    if not self._in_environment(content, pos, ['tabular', 'array', 'align', 'matrix']):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message="Unescaped & outside tabular/math environment",
                            line_number=line_num,
                            line_content=line.strip()[:100],
                            suggestion="Use \\& to escape"
                        ))

        return results

    def _find_math_regions(self, content: str) -> List[tuple]:
        """Return (start, end) character spans that are inside math mode."""
        regions = []

        # Inline math $ ... $
        for match in re.finditer(r'(?<!\\)\$(?!\$)(.*?)(?<!\\)\$', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # Display math $$ ... $$
        for match in re.finditer(r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # \[ ... \]
        for match in re.finditer(r'\\\[(.*?)\\\]', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # Math environments (including starred variants)
        for env in ['equation', 'align', 'gather', 'multline', 'displaymath']:
            pattern = rf'\\begin\{{{env}\*?\}}(.*?)\\end\{{{env}\*?\}}'
            for match in re.finditer(pattern, content, re.DOTALL):
                regions.append((match.start(), match.end()))

        return regions

    def _in_math_region(self, pos: int, regions: List[tuple]) -> bool:
        """Check if *pos* falls inside any math region."""
        return any(start <= pos <= end for start, end in regions)

    def _in_environment(self, content: str, pos: int, env_names: List[str]) -> bool:
        """Check if *pos* is inside any of the given LaTeX environments."""
        for env in env_names:
            # Find all instances of this environment (incl. starred form)
            pattern = rf'\\begin\{{{env}\*?\}}(.*?)\\end\{{{env}\*?\}}'
            for match in re.finditer(pattern, content, re.DOTALL):
                if match.start() <= pos <= match.end():
                    return True
        return False