thinkwee commited on
Commit
46df5f0
·
1 Parent(s): 6984298
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +62 -0
  3. README.md +196 -11
  4. app.py +922 -0
  5. app_helper.py +98 -0
  6. assets/icon-192.png +3 -0
  7. assets/icon-512.png +3 -0
  8. bibguard.yaml +197 -0
  9. main.py +564 -0
  10. requirements.txt +8 -0
  11. src/__init__.py +1 -0
  12. src/__pycache__/__init__.cpython-311.pyc +0 -0
  13. src/__pycache__/__init__.cpython-313.pyc +0 -0
  14. src/analyzers/__init__.py +7 -0
  15. src/analyzers/__pycache__/__init__.cpython-313.pyc +0 -0
  16. src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc +0 -0
  17. src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc +0 -0
  18. src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc +0 -0
  19. src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc +0 -0
  20. src/analyzers/__pycache__/retraction_checker.cpython-313.pyc +0 -0
  21. src/analyzers/__pycache__/url_validator.cpython-313.pyc +0 -0
  22. src/analyzers/__pycache__/usage_checker.cpython-313.pyc +0 -0
  23. src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc +0 -0
  24. src/analyzers/duplicate_detector.py +204 -0
  25. src/analyzers/llm_evaluator.py +376 -0
  26. src/analyzers/metadata_comparator.py +474 -0
  27. src/analyzers/usage_checker.py +82 -0
  28. src/checkers/__init__.py +66 -0
  29. src/checkers/__pycache__/__init__.cpython-313.pyc +0 -0
  30. src/checkers/__pycache__/acronym_checker.cpython-313.pyc +0 -0
  31. src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc +0 -0
  32. src/checkers/__pycache__/anonymization_checker.cpython-313.pyc +0 -0
  33. src/checkers/__pycache__/base.cpython-313.pyc +0 -0
  34. src/checkers/__pycache__/caption_checker.cpython-313.pyc +0 -0
  35. src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc +0 -0
  36. src/checkers/__pycache__/consistency_checker.cpython-313.pyc +0 -0
  37. src/checkers/__pycache__/equation_checker.cpython-313.pyc +0 -0
  38. src/checkers/__pycache__/formatting_checker.cpython-313.pyc +0 -0
  39. src/checkers/__pycache__/number_checker.cpython-313.pyc +0 -0
  40. src/checkers/__pycache__/reference_checker.cpython-313.pyc +0 -0
  41. src/checkers/__pycache__/sentence_checker.cpython-313.pyc +0 -0
  42. src/checkers/acronym_checker.py +284 -0
  43. src/checkers/ai_artifacts_checker.py +176 -0
  44. src/checkers/anonymization_checker.py +216 -0
  45. src/checkers/base.py +193 -0
  46. src/checkers/caption_checker.py +136 -0
  47. src/checkers/citation_quality_checker.py +131 -0
  48. src/checkers/consistency_checker.py +254 -0
  49. src/checkers/equation_checker.py +134 -0
  50. src/checkers/formatting_checker.py +204 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/*.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+
24
+ # Virtual Environments
25
+ venv/
26
+ env/
27
+ .env
28
+ .venv/
29
+
30
+ # IDEs
31
+ .idea/
32
+ .vscode/
33
+ *.swp
34
+ *.swo
35
+
36
+ # macOS
37
+ .DS_Store
38
+ .AppleDouble
39
+ .LSOverride
40
+
41
+ # Project Specific Outputs
42
+ *.txt
43
+ *.md
44
+ !README.md
45
+ *_only_used_entry.bib
46
+
47
+ # LaTeX and Bibliography (User Data)
48
+ # Ignoring these to prevent committing personal paper content
49
+ *.tex
50
+ *.bib
51
+ *.pdf
52
+ *.log
53
+ *.aux
54
+ *.out
55
+ *.bbl
56
+ *.blg
57
+ *.synctex.gz
58
+ *.fls
59
+ *.fdb_latexmk
60
+
61
+ # cache
62
+ .cache
README.md CHANGED
@@ -1,13 +1,198 @@
1
- ---
2
- title: BibGuard
3
- emoji:
4
- colorFrom: pink
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 6.3.0
8
- app_file: app.py
9
- pinned: false
10
- short_description: Automated bibliography verification and LaTeX quality auditing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # BibGuard: Bibliography & LaTeX Quality Auditor
2
+
3
+ **BibGuard** is your comprehensive quality assurance tool for academic papers. It validates bibliography entries against real-world databases and checks LaTeX submission quality to catch errors before you submit.
4
+
5
+ AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and uses advanced LLMs to ensure cited papers actually support your claims.
6
+
7
+ ## 🛡 Why BibGuard?
8
+
9
+ - **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata
10
+ - **📋 LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, and submission compliance problems
11
+ - **🔒 Safe & Non-Destructive**: Your original files are **never modified** - only detailed reports are generated
12
+ - **🧠 Contextual Relevance**: Ensure cited papers actually discuss what you claim (with LLM)
13
+ - **⚡ Efficiency Boost**: Drastically reduce time needed to manually verify hundreds of citations
14
+
15
+ ## 🚀 Features
16
+
17
+ ### Bibliography Validation
18
+ - **🔍 Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
19
+ - **🤖 AI Relevance Check**: Uses LLMs to verify citations match their context (optional)
20
+ - **📊 Preprint Detection**: Warns if >50% of references are preprints (arXiv, bioRxiv, etc.)
21
+ - **👀 Usage Analysis**: Highlights missing citations and unused bib entries
22
+ - **👯 Duplicate Detector**: Identifies duplicate entries with fuzzy matching
23
+
24
+ ### LaTeX Quality Checks
25
+ - **📐 Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation
26
+ - **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases
27
+ - **🔤 Consistency**: Spelling variants (US/UK English), hyphenation, terminology
28
+ - **🤖 AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants
29
+ - **🔠 Acronym Validation**: Ensures acronyms are defined before use (smart matching)
30
+ - **🎭 Anonymization**: Checks for identity leaks in double-blind submissions
31
+ - **📅 Citation Age**: Flags references older than 30 years
32
+
33
+ ## 📦 Installation
34
+
35
+ ```bash
36
+ git clone git@github.com:thinkwee/BibGuard.git
37
+ cd BibGuard
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ ## ⚡ Quick Start
42
+
43
+ ### 1. Initialize Configuration
44
+
45
+ ```bash
46
+ python main.py --init
47
+ ```
48
+
49
+ This creates `config.yaml`. Edit it to set your file paths. You have two modes:
50
+
51
+ #### Option A: Single File Mode
52
+ Best for individual papers.
53
+ ```yaml
54
+ files:
55
+ bib: "paper.bib"
56
+ tex: "paper.tex"
57
+ output_dir: "bibguard_output"
58
+ ```
59
+
60
+ #### Option B: Directory Scan Mode
61
+ Best for large projects or a collection of papers. BibGuard will recursively search for all `.tex` and `.bib` files.
62
+ ```yaml
63
+ files:
64
+ input_dir: "./my_project_dir"
65
+ output_dir: "bibguard_output"
66
+ ```
67
+
68
+ ### 2. Run Full Check
69
+
70
+ ```bash
71
+ python main.py
72
+ ```
73
+
74
+ **Output** (in `bibguard_output/`):
75
+ - `bibliography_report.md` - Bibliography validation results
76
+ - `latex_quality_report.md` - Writing and formatting issues
77
+ - `line_by_line_report.md` - All issues sorted by line number
78
+ - `*_only_used.bib` - Clean bibliography (used entries only)
79
+
80
+ ## 🛠 Configuration
81
+
82
+ Edit `config.yaml` to customize checks:
83
+
84
+ ```yaml
85
+ bibliography:
86
+ check_metadata: true # Validate against online databases (takes time)
87
+ check_usage: true # Find unused/missing entries
88
+ check_duplicates: true # Detect duplicate entries
89
+ check_preprint_ratio: true # Warn if >50% are preprints
90
+ check_relevance: false # LLM-based relevance check (requires API key)
91
+
92
+ submission:
93
+ # Format checks
94
+ caption: true # Table/figure caption placement
95
+ reference: true # Cross-reference integrity
96
+ formatting: true # Citation spacing, blank lines
97
+ equation: true # Equation punctuation, numbering
98
+
99
+ # Writing quality
100
+ sentence: true # Weak starters, hedging language
101
+ consistency: true # Spelling, hyphenation, terminology
102
+ acronym: true # Acronym definitions (3+ letters)
103
+
104
+ # Submission compliance
105
+ ai_artifacts: true # AI-generated text detection
106
+ anonymization: true # Double-blind compliance
107
+ citation_quality: true # Old citations (>30 years)
108
+ number: true # Percentage formatting
109
+ ```
110
+
111
+ ## 🤖 LLM-Based Relevance Check
112
+
113
+ To verify citations match their context using AI:
114
+
115
+ ```yaml
116
+ bibliography:
117
+ check_relevance: true
118
+
119
+ llm:
120
+ backend: "gemini" # Options: gemini, openai, anthropic, deepseek, ollama, vllm
121
+ api_key: "" # Or use environment variable (e.g., GEMINI_API_KEY)
122
+ ```
123
+
124
+ **Supported Backends:**
125
+ - **Gemini** (Google): `GEMINI_API_KEY`
126
+ - **OpenAI**: `OPENAI_API_KEY`
127
+ - **Anthropic**: `ANTHROPIC_API_KEY`
128
+ - **DeepSeek**: `DEEPSEEK_API_KEY` (recommended for cost/performance)
129
+ - **Ollama**: Local models (no API key needed)
130
+ - **vLLM**: Custom endpoint
131
+
132
+ Then run:
133
+ ```bash
134
+ python main.py
135
+ ```
136
+
137
+ ## 📝 Understanding Reports
138
+
139
+ ### Bibliography Report
140
+ Shows for each entry:
141
+ - ✅ **Verified**: Metadata matches online databases
142
+ - ⚠️ **Issues**: Mismatches, missing entries, duplicates
143
+ - 📊 **Statistics**: Usage, duplicates, preprint ratio
144
+
145
+ ### LaTeX Quality Report
146
+ Organized by severity:
147
+ - 🔴 **Errors**: Critical issues (e.g., undefined references)
148
+ - 🟡 **Warnings**: Important issues (e.g., inconsistent spelling)
149
+ - 🔵 **Suggestions**: Style improvements (e.g., weak sentence starters)
150
+
151
+ ### Line-by-Line Report
152
+ All LaTeX issues sorted by line number for easy fixing.
153
+
154
+ ## 🧐 Understanding Mismatches
155
+
156
+ BibGuard is strict, but false positives happen:
157
+
158
+ 1. **Year Discrepancy (±1 Year)**:
159
+ - *Reason*: Delay between preprint (arXiv) and official publication
160
+ - *Action*: Verify which version you intend to cite
161
+
162
+ 2. **Author List Variations**:
163
+ - *Reason*: Different databases handle large author lists differently
164
+ - *Action*: Check if primary authors match
165
+
166
+ 3. **Venue Name Differences**:
167
+ - *Reason*: Abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems")
168
+ - *Action*: Both are usually correct
169
+
170
+ 4. **Non-Academic Sources**:
171
+ - *Reason*: Blogs, documentation not indexed by academic databases
172
+ - *Action*: Manually verify URL and title
173
+
174
+ ## 🔧 Advanced Options
175
+
176
+ ```bash
177
+ python main.py --help # Show all options
178
+ python main.py --list-templates # List conference templates
179
+ python main.py --config my.yaml # Use custom config file
180
+ ```
181
+
182
+ ## 🤝 Contributing
183
+
184
+ Contributions welcome! Please open an issue or pull request.
185
+
186
+ ## 🙏 Acknowledgments
187
+
188
+ BibGuard uses multiple data sources:
189
+ - arXiv API
190
+ - CrossRef API
191
+ - Semantic Scholar API
192
+ - DBLP API
193
+ - OpenAlex API
194
+ - Google Scholar (via scholarly)
195
+
196
  ---
197
 
198
+ **Made with ❤️ for researchers who care about their submissions**
app.py ADDED
@@ -0,0 +1,922 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ BibGuard Gradio Web Application
4
+
5
+ A web interface for checking bibliography and LaTeX quality.
6
+ """
7
+ import gradio as gr
8
+ import tempfile
9
+ import shutil
10
+ from pathlib import Path
11
+ from typing import Optional, Tuple
12
+ import base64
13
+
14
+ from src.parsers import BibParser, TexParser
15
+ from src.fetchers import ArxivFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher
16
+ from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector
17
+ from src.report.generator import ReportGenerator, EntryReport
18
+ from src.config.yaml_config import BibGuardConfig, FilesConfig, BibliographyConfig, SubmissionConfig, OutputConfig, WorkflowStep
19
+ from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
20
+ from src.checkers import CHECKER_REGISTRY
21
+ from src.report.line_report import LineByLineReportGenerator
22
+ from app_helper import fetch_and_compare_with_workflow
23
+
24
+
25
+ # Custom CSS for better Markdown rendering
26
+ CUSTOM_CSS = """
27
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
28
+
29
+ * {
30
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
31
+ }
32
+ """
33
+
34
+ WELCOME_HTML = """
35
+ <div class="scrollable-report-area">
36
+ <div class="report-card" style="max-width: 800px; margin: 0 auto;">
37
+ <div class="card-header">
38
+ <h3 class="card-title" style="font-size: 1.5em;">👋 Welcome to BibGuard</h3>
39
+ </div>
40
+ <div class="card-content" style="line-height: 1.6; color: #374151;">
41
+ <p style="font-size: 1.1em; margin-bottom: 24px;">
42
+ Ensure your academic paper is flawless. Upload your <code>.bib</code> and <code>.tex</code> files on the left and click <strong>"Check Now"</strong>.
43
+ </p>
44
+
45
+ <div style="display: grid; gap: 20px; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));">
46
+ <div style="background: #fefce8; padding: 16px; border-radius: 8px; border: 1px solid #fde047;">
47
+ <strong style="color: #854d0e; display: block; margin-bottom: 8px;">⚠️ Metadata Check Defaults</strong>
48
+ "🔍 Metadata" is <strong>disabled by default</strong>. It verifies your entries against ArXiv/DBLP/Crossref but takes time (1-3 mins) to fetch data. Enable it if you want strict verification.
49
+ </div>
50
+
51
+ <div style="background: #eff6ff; padding: 16px; border-radius: 8px; border: 1px solid #bfdbfe;">
52
+ <strong style="color: #1e40af; display: block; margin-bottom: 8px;">🚀 Go Pro with Local Version</strong>
53
+ LLM-based context relevance checking (is this citation actually relevant?) is excluded here. Clone the <a href="https://github.com/thinkwee/BibGuard" target="_blank" style="color: #2563eb; text-decoration: underline; font-weight: 600;">GitHub repo</a> to use the full power with your API key.
54
+ </div>
55
+ </div>
56
+
57
+ <h4 style="margin: 24px 0 12px 0; color: #111827; font-size: 1.1em;">📊 Understanding Your Reports</h4>
58
+ <div style="display: grid; gap: 12px;">
59
+ <div style="display: flex; gap: 12px; align-items: baseline;">
60
+ <span style="background: #e0e7ff; color: #3730a3; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📚 Bibliography</span>
61
+ <span>Validates metadata fields, detects duplicates, and checks citation counts.</span>
62
+ </div>
63
+ <div style="display: flex; gap: 12px; align-items: baseline;">
64
+ <span style="background: #dcfce7; color: #166534; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📝 LaTeX Quality</span>
65
+ <span>Syntax check, caption validation, acronym consistency, and style suggestions.</span>
66
+ </div>
67
+ <div style="display: flex; gap: 12px; align-items: baseline;">
68
+ <span style="background: #f3f4f6; color: #4b5563; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📋 Line-by-Line</span>
69
+ <span>Maps every issue found directly to the line number in your source file.</span>
70
+ </div>
71
+ </div>
72
+ </div>
73
+ </div>
74
+ </div>
75
+ """
76
+
77
+ CUSTOM_CSS += """
78
+ /* Global Reset */
79
+ body, gradio-app {
80
+ overflow: hidden !important; /* Prevent double scrollbars on the page */
81
+ }
82
+
83
+ .gradio-container {
84
+ max-width: none !important;
85
+ width: 100% !important;
86
+ height: 100vh !important;
87
+ padding: 0 !important;
88
+ margin: 0 !important;
89
+ }
90
+
91
+ /* Header Styling */
92
+ .app-header {
93
+ padding: 20px;
94
+ background: white;
95
+ border-bottom: 1px solid #e5e7eb;
96
+ }
97
+
98
+ /* Sidebar Styling */
99
+ .app-sidebar {
100
+ height: calc(100vh - 100px) !important;
101
+ overflow-y: auto !important;
102
+ padding: 20px !important;
103
+ border-right: 1px solid #e5e7eb;
104
+ }
105
+
106
+ /* Main Content Area */
107
+ .app-content {
108
+ height: calc(100vh - 100px) !important;
109
+ padding: 0 !important;
110
+ }
111
+
112
+ /* The Magic Scroll Container - Clean and Explicit */
113
+ .scrollable-report-area {
114
+ height: calc(100vh - 180px) !important; /* Fixed height relative to viewport */
115
+ overflow-y: auto !important;
116
+ padding: 24px;
117
+ background-color: #f9fafb;
118
+ border: 1px solid #e5e7eb;
119
+ border-radius: 8px;
120
+ margin-top: 10px;
121
+ }
122
+
123
+ /* Report Card Styling */
124
+ .report-card {
125
+ background: white;
126
+ border-radius: 12px;
127
+ padding: 24px;
128
+ margin-bottom: 16px; /* Spacing between cards */
129
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
130
+ border: 1px solid #e5e7eb;
131
+ transition: transform 0.2s, box-shadow 0.2s;
132
+ }
133
+
134
+ .report-card:hover {
135
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
136
+ transform: translateY(-2px);
137
+ }
138
+
139
+ /* Card Internals */
140
+ .card-header {
141
+ display: flex;
142
+ justify-content: space-between;
143
+ align-items: flex-start;
144
+ margin-bottom: 16px;
145
+ padding-bottom: 16px;
146
+ border-bottom: 1px solid #f3f4f6;
147
+ }
148
+
149
+ .card-title {
150
+ font-size: 1.1em;
151
+ font-weight: 600;
152
+ color: #111827;
153
+ margin: 0 0 4px 0;
154
+ }
155
+
156
+ .card-subtitle {
157
+ font-size: 0.9em;
158
+ color: #6b7280;
159
+ font-family: monospace;
160
+ }
161
+
162
+ .card-content {
163
+ font-size: 0.95em;
164
+ color: #374151;
165
+ line-height: 1.5;
166
+ }
167
+
168
+ /* Badges */
169
+ .badge {
170
+ display: inline-flex;
171
+ align-items: center;
172
+ padding: 4px 10px;
173
+ border-radius: 9999px;
174
+ font-size: 0.8em;
175
+ font-weight: 500;
176
+ }
177
+
178
+ .badge-success { background-color: #dcfce7; color: #166534; }
179
+ .badge-warning { background-color: #fef9c3; color: #854d0e; }
180
+ .badge-error { background-color: #fee2e2; color: #991b1b; }
181
+ .badge-info { background-color: #dbeafe; color: #1e40af; }
182
+ .badge-neutral { background-color: #f3f4f6; color: #4b5563; }
183
+
184
+ /* Stats Grid */
185
+ .stats-container {
186
+ display: grid;
187
+ grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
188
+ gap: 16px;
189
+ margin-bottom: 24px;
190
+ }
191
+
192
+ .stat-card {
193
+ padding: 16px;
194
+ border-radius: 12px;
195
+ color: white;
196
+ text-align: center;
197
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
198
+ }
199
+
200
+ .stat-value { font-size: 1.8em; font-weight: 700; }
201
+ .stat-label { font-size: 0.9em; opacity: 0.9; }
202
+
203
+ /* Detail Grid - Flexbox for better filling */
204
+ .detail-grid {
205
+ display: flex;
206
+ flex-wrap: wrap;
207
+ gap: 12px;
208
+ margin-bottom: 16px;
209
+ width: 100%;
210
+ }
211
+
212
+ .detail-item {
213
+ background: #f9fafb;
214
+ padding: 10px 12px;
215
+ border-radius: 8px;
216
+ border: 1px solid #f3f4f6;
217
+
218
+ /* Flex sizing: grow, shrink, min-basis */
219
+ flex: 1 1 160px;
220
+ min-width: 0; /* Important for word-break to work in flex children */
221
+
222
+ /* Layout control */
223
+ display: flex;
224
+ flex-direction: column;
225
+
226
+ /* Height constraint to prevent one huge card from stretching the row */
227
+ max-height: 100px;
228
+ overflow-y: auto;
229
+ }
230
+
231
+ /* Custom scrollbar for detail items */
232
+ .detail-item::-webkit-scrollbar {
233
+ width: 4px;
234
+ }
235
+ .detail-item::-webkit-scrollbar-thumb {
236
+ background-color: #d1d5db;
237
+ border-radius: 4px;
238
+ }
239
+
240
+ .detail-label {
241
+ font-size: 0.75em;
242
+ color: #6b7280;
243
+ text-transform: uppercase;
244
+ letter-spacing: 0.05em;
245
+ margin-bottom: 2px;
246
+ position: sticky;
247
+ top: 0;
248
+ background: #f9fafb; /* Maintain bg on scroll */
249
+ z-index: 1;
250
+ }
251
+
252
+ .detail-value {
253
+ font-weight: 500;
254
+ color: #1f2937;
255
+ font-size: 0.9em;
256
+ line-height: 1.4;
257
+ word-break: break-word; /* Fix overflow */
258
+ overflow-wrap: break-word;
259
+ } border: 1px solid #e5e7eb;
260
+ transition: all 0.2s;
261
+ }
262
+
263
+ .report-card:hover {
264
+ box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
265
+ }
266
+
267
+ /* Card Header */
268
+ .card-header {
269
+ display: flex;
270
+ justify-content: space-between;
271
+ align-items: flex-start;
272
+ margin-bottom: 12px;
273
+ border-bottom: 1px solid #f3f4f6;
274
+ padding-bottom: 12px;
275
+ }
276
+
277
+ .card-title {
278
+ font-size: 1.1em;
279
+ font-weight: 600;
280
+ color: #1f2937;
281
+ margin: 0;
282
+ }
283
+
284
+ .card-subtitle {
285
+ font-size: 0.9em;
286
+ color: #6b7280;
287
+ margin-top: 4px;
288
+ }
289
+
290
+ /* Status Badges */
291
+ .badge {
292
+ display: inline-flex;
293
+ align-items: center;
294
+ padding: 4px 10px;
295
+ border-radius: 9999px;
296
+ font-size: 0.8em;
297
+ font-weight: 500;
298
+ }
299
+
300
+ .badge-success { background-color: #dcfce7; color: #166534; }
301
+ .badge-warning { background-color: #fef9c3; color: #854d0e; }
302
+ .badge-error { background-color: #fee2e2; color: #991b1b; }
303
+ .badge-info { background-color: #dbeafe; color: #1e40af; }
304
+ .badge-neutral { background-color: #f3f4f6; color: #374151; }
305
+
306
+ /* Content Styling */
307
+ .card-content {
308
+ font-size: 15px;
309
+ color: #374151;
310
+ line-height: 1.6;
311
+ }
312
+
313
+ .card-content code {
314
+ background-color: #f3f4f6;
315
+ padding: 2px 6px;
316
+ border-radius: 4px;
317
+ font-family: monospace;
318
+ font-size: 0.9em;
319
+ color: #c2410c;
320
+ }
321
+
322
+ /* Grid for details */
323
+ .detail-grid {
324
+ display: grid;
325
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
326
+ gap: 12px;
327
+ margin-top: 12px;
328
+ }
329
+
330
+ .detail-item {
331
+ background: #f9fafb;
332
+ padding: 10px;
333
+ border-radius: 6px;
334
+ }
335
+
336
+ .detail-label {
337
+ font-size: 0.8em;
338
+ color: #6b7280;
339
+ text-transform: uppercase;
340
+ letter-spacing: 0.05em;
341
+ }
342
+
343
+ .detail-value {
344
+ font-weight: 500;
345
+ color: #111827;
346
+ }
347
+
348
+ /* Summary Stats */
349
+ .stats-container {
350
+ display: grid;
351
+ grid-template-columns: repeat(3, 1fr);
352
+ gap: 16px;
353
+ margin-bottom: 24px;
354
+ }
355
+
356
+ .stat-card {
357
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
358
+ color: white;
359
+ padding: 20px;
360
+ border-radius: 12px;
361
+ text-align: center;
362
+ box-shadow: 0 4px 6px rgba(102, 126, 234, 0.25);
363
+ }
364
+
365
+ .stat-value {
366
+ font-size: 2em;
367
+ font-weight: 700;
368
+ }
369
+
370
+ .stat-label {
371
+ font-size: 0.9em;
372
+ opacity: 0.9;
373
+ margin-top: 4px;
374
+ }
375
+
376
+ /* Button styling */
377
+ .primary-btn {
378
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
379
+ border: none !important;
380
+ font-weight: 600 !important;
381
+ }
382
+
383
+ /* Tab styling */
384
+ .tab-nav button {
385
+ font-weight: 500 !important;
386
+ font-size: 15px !important;
387
+ }
388
+ """
389
+
390
+
391
+ def create_config_from_ui(
392
+ check_metadata: bool,
393
+ check_usage: bool,
394
+ check_duplicates: bool,
395
+ check_preprint_ratio: bool,
396
+ caption: bool,
397
+ reference: bool,
398
+ formatting: bool,
399
+ equation: bool,
400
+ ai_artifacts: bool,
401
+ sentence: bool,
402
+ consistency: bool,
403
+ acronym: bool,
404
+ number: bool,
405
+ citation_quality: bool,
406
+ anonymization: bool
407
+ ) -> BibGuardConfig:
408
+ """Create a BibGuardConfig from UI settings."""
409
+ config = BibGuardConfig()
410
+
411
+ config.bibliography = BibliographyConfig(
412
+ check_metadata=check_metadata,
413
+ check_usage=check_usage,
414
+ check_duplicates=check_duplicates,
415
+ check_preprint_ratio=check_preprint_ratio,
416
+ check_relevance=False # Disabled for web
417
+ )
418
+
419
+ config.submission = SubmissionConfig(
420
+ caption=caption,
421
+ reference=reference,
422
+ formatting=formatting,
423
+ equation=equation,
424
+ ai_artifacts=ai_artifacts,
425
+ sentence=sentence,
426
+ consistency=consistency,
427
+ acronym=acronym,
428
+ number=number,
429
+ citation_quality=citation_quality,
430
+ anonymization=anonymization
431
+ )
432
+
433
+ config.output = OutputConfig(quiet=True, minimal_verified=False)
434
+
435
+ return config
436
+
437
+
438
+ def generate_bibliography_html(report_gen: ReportGenerator, entries: list) -> str:
439
+ """Generate HTML content for bibliography report."""
440
+ html = ['<div class="scrollable-report-area">']
441
+
442
+ # 1. Summary Stats
443
+ total = len(entries)
444
+ verified = sum(1 for e in report_gen.entries if e.comparison and e.comparison.is_match)
445
+ used = sum(1 for e in report_gen.entries if e.usage and e.usage.is_used)
446
+
447
+ html.append('<div class="stats-container">')
448
+ html.append(f'<div class="stat-card"><div class="stat-value">{total}</div><div class="stat-label">Total Entries</div></div>')
449
+ html.append(f'<div class="stat-card"><div class="stat-value">{verified}</div><div class="stat-label">Verified</div></div>')
450
+ html.append(f'<div class="stat-card"><div class="stat-value">{used}</div><div class="stat-label">Used in Text</div></div>')
451
+ html.append('</div>')
452
+
453
+ # 2. Entries
454
+ for report in report_gen.entries:
455
+ entry = report.entry
456
+ status_badges = []
457
+
458
+ # Metadata Status
459
+ if report.comparison:
460
+ if report.comparison.is_match:
461
+ status_badges.append('<span class="badge badge-success">✓ Verified</span>')
462
+ if report.comparison.source:
463
+ status_badges.append(f'<span class="badge badge-info">{report.comparison.source.upper()}</span>')
464
+ else:
465
+ status_badges.append('<span class="badge badge-error">⚠ Metadata Mismatch</span>')
466
+ else:
467
+ status_badges.append('<span class="badge badge-neutral">No Metadata Check</span>')
468
+
469
+ # Usage Status
470
+ if report.usage:
471
+ if report.usage.is_used:
472
+ status_badges.append(f'<span class="badge badge-success">Used: {report.usage.usage_count}x</span>')
473
+ else:
474
+ status_badges.append('<span class="badge badge-warning">Unused</span>')
475
+
476
+ # Build Card
477
+ html.append(f'''
478
+ <div class="report-card">
479
+ <div class="card-header">
480
+ <div>
481
+ <h3 class="card-title">{entry.title or "No Title"}</h3>
482
+ <div class="card-subtitle">{entry.key} • {entry.year} • {entry.entry_type}</div>
483
+ </div>
484
+ <div style="display: flex; gap: 8px;">
485
+ {" ".join(status_badges)}
486
+ </div>
487
+ </div>
488
+
489
+ <div class="card-content">
490
+ <div class="detail-grid">
491
+ {
492
+ (lambda e: "".join([
493
+ f'<div class="detail-item"><div class="detail-label">{k}</div><div class="detail-value">{v}</div></div>'
494
+ for k, v in filter(None, [
495
+ ("Authors", e.author or "N/A"),
496
+ ("Venue", e.journal or e.booktitle or e.publisher or "N/A"),
497
+ ("DOI", e.doi) if e.doi else None,
498
+ ("ArXiv", e.arxiv_id) if e.arxiv_id and not e.doi else None,
499
+ ("Volume/Pages", f"{'Vol.'+e.volume if e.volume else ''} {'pp.'+e.pages if e.pages else ''}".strip()) if e.volume or e.pages else None,
500
+ ("URL", f'<a href="{e.url}" target="_blank" style="text-decoration:underline;">Link</a>') if e.url else None
501
+ ])
502
+ ]))(entry)
503
+ }
504
+ </div>
505
+ ''')
506
+
507
+ # Add issues if any
508
+ issues = []
509
+ if report.comparison and not report.comparison.is_match:
510
+ # Add main message derived from match status
511
+ if report.comparison.issues:
512
+ for issue in report.comparison.issues:
513
+ issues.append(f'<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• {issue}</div>')
514
+ else:
515
+ issues.append(f'<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• Verification failed</div>')
516
+
517
+ if issues:
518
+ html.append('<div style="margin-top: 16px; padding-top: 12px; border-top: 1px solid #eee;">')
519
+ html.append("".join(issues))
520
+ html.append('</div>')
521
+
522
+ html.append('</div></div>') # Close card-content and report-card
523
+
524
+ html.append('</div>') # Close container
525
+ return "".join(html)
526
+
527
+ def generate_latex_html(results: list) -> str:
528
+ """Generate HTML for LaTeX quality check."""
529
+ from src.checkers import CheckSeverity
530
+
531
+ html = ['<div class="scrollable-report-area">']
532
+
533
+ # Stats
534
+ errors = sum(1 for r in results if r.severity == CheckSeverity.ERROR)
535
+ warnings = sum(1 for r in results if r.severity == CheckSeverity.WARNING)
536
+ infos = sum(1 for r in results if r.severity == CheckSeverity.INFO)
537
+
538
+ html.append('<div class="stats-container">')
539
+ html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #ef4444 0%, #b91c1c 100%);"><div class="stat-value">{errors}</div><div class="stat-label">Errors</div></div>')
540
+ html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%);"><div class="stat-value">{warnings}</div><div class="stat-label">Warnings</div></div>')
541
+ html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%);"><div class="stat-value">{infos}</div><div class="stat-label">Suggestions</div></div>')
542
+ html.append('</div>')
543
+
544
+ if not results:
545
+ html.append('<div class="report-card"><div class="card-content" style="text-align: center; padding: 40px; color: #166534; font-size: 1.2em;">✅ No issues found in LaTeX code!</div></div>')
546
+ else:
547
+ # Group by Checker
548
+ results.sort(key=lambda x: x.checker_name)
549
+ current_checker = None
550
+
551
+ for result in results:
552
+ badge_class = "badge-neutral"
553
+ if result.severity == CheckSeverity.ERROR: badge_class = "badge-error"
554
+ elif result.severity == CheckSeverity.WARNING: badge_class = "badge-warning"
555
+ elif result.severity == CheckSeverity.INFO: badge_class = "badge-info"
556
+
557
+ html.append(f'''
558
+ <div class="report-card">
559
+ <div class="card-header">
560
+ <div>
561
+ <h3 class="card-title">{result.checker_name}</h3>
562
+ <div class="card-subtitle">Line {result.line_number}</div>
563
+ </div>
564
+ <span class="badge {badge_class}">{result.severity.name}</span>
565
+ </div>
566
+ <div class="card-content">
567
+ {result.message}
568
+ {f'<div style="margin-top: 8px; background: #f3f4f6; padding: 8px; border-radius: 4px; font-family: monospace;">{result.line_content}</div>' if result.line_content else ''}
569
+ {f'<div style="margin-top: 8px; color: #166534;">💡 Suggestion: {result.suggestion}</div>' if result.suggestion else ''}
570
+ </div>
571
+ </div>
572
+ ''')
573
+
574
+ html.append('</div>')
575
+ return "".join(html)
576
+
577
+ def generate_line_html(content: str, results: list) -> str:
578
+ """Generate HTML for Line-by-Line report."""
579
+ # Build a dictionary of line_number -> list of issues
580
+ issues_by_line = {}
581
+ for r in results:
582
+ if r.line_number not in issues_by_line:
583
+ issues_by_line[r.line_number] = []
584
+ issues_by_line[r.line_number].append(r)
585
+
586
+ lines = content.split('\n')
587
+
588
+ html = ['<div class="scrollable-report-area">']
589
+
590
+ html.append('<div class="report-card"><div class="card-content">Issues are mapped to specific lines below.</div></div>')
591
+
592
+ for i, line in enumerate(lines, 1):
593
+ if i in issues_by_line:
594
+ # Highlight this line
595
+ line_issues = issues_by_line[i]
596
+
597
+ html.append(f'''
598
+ <div class="report-card" style="border-left: 4px solid #ef4444; padding: 12px;">
599
+ <div style="font-family: monospace; color: #6b7280; font-size: 0.9em; margin-bottom: 4px;">Line {i}</div>
600
+ <div style="font-family: monospace; background: #fee2e2; padding: 4px; border-radius: 4px; overflow-x: auto; white-space: pre;">{line}</div>
601
+ <div style="margin-top: 8px;">
602
+ ''')
603
+
604
+ for issue in line_issues:
605
+ html.append(f'<div style="color: #991b1b; font-size: 0.95em; margin-top: 4px;">• {issue.message}</div>')
606
+
607
+ html.append('</div></div>')
608
+
609
+ html.append('</div>')
610
+ return "".join(html)
611
+
612
+
613
+
614
+
615
+ def run_check(
616
+ bib_file,
617
+ tex_file,
618
+ check_metadata: bool,
619
+ check_usage: bool,
620
+ check_duplicates: bool,
621
+ check_preprint_ratio: bool,
622
+ caption: bool,
623
+ reference: bool,
624
+ formatting: bool,
625
+ equation: bool,
626
+ ai_artifacts: bool,
627
+ sentence: bool,
628
+ consistency: bool,
629
+ acronym: bool,
630
+ number: bool,
631
+ citation_quality: bool,
632
+ anonymization: bool,
633
+ progress=gr.Progress()
634
+ ) -> Tuple[str, str, str]:
635
+ """Run BibGuard checks and return three reports."""
636
+
637
+ if bib_file is None or tex_file is None:
638
+ return (
639
+ "⚠️ Please upload both `.bib` and `.tex` files.",
640
+ "⚠️ Please upload both `.bib` and `.tex` files.",
641
+ "⚠️ Please upload both `.bib` and `.tex` files."
642
+ )
643
+
644
+ try:
645
+ # Create config from UI
646
+ config = create_config_from_ui(
647
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
648
+ caption, reference, formatting, equation, ai_artifacts,
649
+ sentence, consistency, acronym, number, citation_quality, anonymization
650
+ )
651
+
652
+ # Get file paths from uploaded files
653
+ bib_path = bib_file.name
654
+ tex_path = tex_file.name
655
+
656
+ # Read tex content for checkers
657
+ tex_content = Path(tex_path).read_text(encoding='utf-8', errors='replace')
658
+
659
+ # Parse files
660
+ bib_parser = BibParser()
661
+ entries = bib_parser.parse_file(bib_path)
662
+
663
+ tex_parser = TexParser()
664
+ tex_parser.parse_file(tex_path)
665
+
666
+ bib_config = config.bibliography
667
+
668
+ # Initialize components
669
+ arxiv_fetcher = None
670
+ crossref_fetcher = None
671
+ semantic_scholar_fetcher = None
672
+ openalex_fetcher = None
673
+ dblp_fetcher = None
674
+ comparator = None
675
+ usage_checker = None
676
+ duplicate_detector = None
677
+
678
+ if bib_config.check_metadata:
679
+ arxiv_fetcher = ArxivFetcher()
680
+ semantic_scholar_fetcher = SemanticScholarFetcher()
681
+ openalex_fetcher = OpenAlexFetcher()
682
+ dblp_fetcher = DBLPFetcher()
683
+ crossref_fetcher = CrossRefFetcher()
684
+ comparator = MetadataComparator()
685
+
686
+ if bib_config.check_usage:
687
+ usage_checker = UsageChecker(tex_parser)
688
+
689
+ if bib_config.check_duplicates:
690
+ duplicate_detector = DuplicateDetector()
691
+
692
+ # Initialize report generator
693
+ report_gen = ReportGenerator(
694
+ minimal_verified=False,
695
+ check_preprint_ratio=bib_config.check_preprint_ratio,
696
+ preprint_warning_threshold=bib_config.preprint_warning_threshold
697
+ )
698
+ report_gen.set_metadata([bib_file.name], [tex_file.name])
699
+
700
+ # Run submission quality checks
701
+ progress(0.2, desc="Running LaTeX quality checks...")
702
+ submission_results = []
703
+ enabled_checkers = config.submission.get_enabled_checkers()
704
+
705
+ for checker_name in enabled_checkers:
706
+ if checker_name in CHECKER_REGISTRY:
707
+ checker = CHECKER_REGISTRY[checker_name]()
708
+ results = checker.check(tex_content, {})
709
+ for r in results:
710
+ r.file_path = tex_file.name
711
+ submission_results.extend(results)
712
+
713
+ report_gen.set_submission_results(submission_results, None)
714
+
715
+ # Check for duplicates
716
+ if bib_config.check_duplicates and duplicate_detector:
717
+ duplicate_groups = duplicate_detector.find_duplicates(entries)
718
+ report_gen.set_duplicate_groups(duplicate_groups)
719
+
720
+ # Check missing citations
721
+ if bib_config.check_usage and usage_checker:
722
+ missing = usage_checker.get_missing_entries(entries)
723
+ report_gen.set_missing_citations(missing)
724
+
725
+ # Build workflow
726
+ workflow_config = get_default_workflow()
727
+
728
+ # Process entries
729
+ progress(0.3, desc="Processing bibliography entries...")
730
+ total_entries = len(entries)
731
+
732
+ for i, entry in enumerate(entries):
733
+ progress(0.3 + 0.5 * (i / total_entries), desc=f"Checking: {entry.key}")
734
+
735
+ # Check usage
736
+ usage_result = None
737
+ if usage_checker:
738
+ usage_result = usage_checker.check_usage(entry)
739
+
740
+ # Fetch and compare metadata
741
+ comparison_result = None
742
+ if bib_config.check_metadata and comparator:
743
+ comparison_result = fetch_and_compare_with_workflow(
744
+ entry, workflow_config, arxiv_fetcher, crossref_fetcher,
745
+ semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
746
+ )
747
+
748
+ # Create entry report
749
+ entry_report = EntryReport(
750
+ entry=entry,
751
+ comparison=comparison_result,
752
+ usage=usage_result,
753
+ evaluations=[]
754
+ )
755
+ report_gen.add_entry_report(entry_report)
756
+
757
+ progress(0.85, desc="Generating structured reports...")
758
+
759
+ # Generate Bibliography HTML Report
760
+ bib_report = generate_bibliography_html(report_gen, entries)
761
+
762
+ # Generate LaTeX Quality HTML Report
763
+ latex_report = generate_latex_html(submission_results)
764
+
765
+ # Generate Line-by-Line HTML Report
766
+ line_report = ""
767
+ if submission_results:
768
+ line_report = generate_line_html(tex_content, submission_results)
769
+ else:
770
+ line_report = '<div class="report-container"><div class="report-card"><div class="card-content">No issues to display line-by-line.</div></div></div>'
771
+
772
+ progress(1.0, desc="Done!")
773
+
774
+ return bib_report, latex_report, line_report
775
+
776
+ except Exception as e:
777
+ error_msg = f"❌ Error: {str(e)}"
778
+ import traceback
779
+ error_msg += f"\n\n```\n{traceback.format_exc()}\n```"
780
+ return error_msg, error_msg, error_msg
781
+
782
+
783
+
784
+ def create_app():
785
+ """Create and configure the Gradio app."""
786
+
787
+ # Load icon as base64
788
+ icon_html = ""
789
+ try:
790
+ icon_path = Path("assets/icon-192.png")
791
+ if icon_path.exists():
792
+ with open(icon_path, "rb") as f:
793
+ encoding = base64.b64encode(f.read()).decode()
794
+ icon_html = f'<img src="data:image/png;base64,{encoding}" style="width: 48px; height: 48px; border-radius: 8px;" alt="BibGuard">'
795
+ else:
796
+ icon_html = '<span style="font-size: 48px;">📚</span>'
797
+ except Exception:
798
+ icon_html = '<span style="font-size: 48px;">📚</span>'
799
+
800
+ with gr.Blocks(title="BibGuard - Bibliography & LaTeX Quality Checker") as app:
801
+
802
+ # Header with icon
803
+ with gr.Row(elem_classes=["app-header"]):
804
+ gr.HTML(f"""
805
+ <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 16px;">
806
+ {icon_html}
807
+ <div>
808
+ <h1 style="margin: 0; font-size: 1.8em;">BibGuard</h1>
809
+ <p style="margin: 0; color: #666; font-size: 14px;">Bibliography & LaTeX Quality Checker</p>
810
+ </div>
811
+ </div>
812
+ """)
813
+
814
+ with gr.Row(elem_classes=["app-body"]):
815
+ # Left column: Upload & Settings
816
+ with gr.Column(scale=1, min_width=280, elem_classes=["app-sidebar"]):
817
+ gr.Markdown("### 📁 Upload Files")
818
+
819
+ bib_file = gr.File(
820
+ label="Bibliography (.bib)",
821
+ file_types=[".bib"],
822
+ file_count="single"
823
+ )
824
+
825
+ tex_file = gr.File(
826
+ label="LaTeX Source (.tex)",
827
+ file_types=[".tex"],
828
+ file_count="single"
829
+ )
830
+
831
+ # Check options in grid layout
832
+ gr.Markdown("#### ⚙️ Options")
833
+
834
+ with gr.Row():
835
+ check_metadata = gr.Checkbox(label="🔍 Metadata", value=False)
836
+ check_usage = gr.Checkbox(label="📊 Usage", value=True)
837
+
838
+ with gr.Row():
839
+ check_duplicates = gr.Checkbox(label="👯 Duplicates", value=True)
840
+ check_preprint_ratio = gr.Checkbox(label="📄 Preprints", value=True)
841
+
842
+ with gr.Row():
843
+ caption = gr.Checkbox(label="🖼️ Captions", value=True)
844
+ reference = gr.Checkbox(label="🔗 References", value=True)
845
+
846
+ with gr.Row():
847
+ formatting = gr.Checkbox(label="✨ Formatting", value=True)
848
+ equation = gr.Checkbox(label="🔢 Equations", value=True)
849
+
850
+ with gr.Row():
851
+ ai_artifacts = gr.Checkbox(label="🤖 AI Artifacts", value=True)
852
+ sentence = gr.Checkbox(label="📝 Sentences", value=True)
853
+
854
+ with gr.Row():
855
+ consistency = gr.Checkbox(label="🔄 Consistency", value=True)
856
+ acronym = gr.Checkbox(label="🔤 Acronyms", value=True)
857
+
858
+ with gr.Row():
859
+ number = gr.Checkbox(label="🔢 Numbers", value=True)
860
+ citation_quality = gr.Checkbox(label="📚 Citations", value=True)
861
+
862
+ with gr.Row():
863
+ anonymization = gr.Checkbox(label="🎭 Anonymization", value=True)
864
+
865
+ run_btn = gr.Button("🔍 Check Now", variant="primary", size="lg")
866
+
867
+ gr.HTML("""
868
+ <div style="text-align: center; margin-top: 16px;">
869
+ <a href="https://github.com/thinkwee/BibGuard" target="_blank" style="text-decoration: none; color: #666; display: inline-flex; align-items: center; gap: 6px;">
870
+ <svg height="20" width="20" viewBox="0 0 16 16"><path fill="currentColor" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"></path></svg>
871
+ GitHub
872
+ </a>
873
+ <p style="margin: 8px 0 0 0; color: #999; font-size: 12px;">Developed with ❤️ for researchers</p>
874
+ </div>
875
+ """)
876
+
877
+ # Right column: Reports
878
+ with gr.Column(scale=4, elem_classes=["app-content"]):
879
+ with gr.Tabs():
880
+ with gr.Tab("📚 Bibliography Report"):
881
+ bib_report = gr.HTML(
882
+ value=WELCOME_HTML,
883
+ elem_classes=["report-panel"]
884
+ )
885
+
886
+ with gr.Tab("📝 LaTeX Quality"):
887
+ latex_report = gr.HTML(
888
+ value=WELCOME_HTML,
889
+ elem_classes=["report-panel"]
890
+ )
891
+
892
+ with gr.Tab("📋 Line-by-Line"):
893
+ line_report = gr.HTML(
894
+ value=WELCOME_HTML,
895
+ elem_classes=["report-panel"]
896
+ )
897
+
898
+ # Event handling
899
+ run_btn.click(
900
+ fn=run_check,
901
+ inputs=[
902
+ bib_file, tex_file,
903
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
904
+ caption, reference, formatting, equation, ai_artifacts,
905
+ sentence, consistency, acronym, number, citation_quality, anonymization
906
+ ],
907
+ outputs=[bib_report, latex_report, line_report]
908
+ )
909
+
910
+ return app
911
+
912
+
913
+ # Create the app
914
+ app = create_app()
915
+
916
+ if __name__ == "__main__":
917
+ app.launch(
918
+ favicon_path="assets/icon-192.png",
919
+ show_error=True,
920
+ css=CUSTOM_CSS,
921
+ theme=gr.themes.Soft()
922
+ )
app_helper.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def fetch_and_compare_with_workflow(
    entry, workflow_steps, arxiv_fetcher, crossref_fetcher,
    semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
):
    """Fetch metadata from online sources and compare against a bib entry.

    Sources are tried in a fixed priority order — DBLP, Semantic Scholar,
    OpenAlex, CrossRef, arXiv. The first perfect match short-circuits and is
    returned immediately; otherwise the highest-confidence partial result
    wins. A source is skipped when its fetcher is None, and any exception
    from a single source is deliberately swallowed so one flaky service
    cannot abort the whole lookup (best-effort semantics).

    Args:
        entry: Parsed bibliography entry (reads ``title``, ``doi``,
            ``has_arxiv`` and ``arxiv_id``).
        workflow_steps: Configured workflow steps. NOTE(review): currently
            unused — the hard-coded order above is always applied; wire this
            up or drop the parameter once the workflow config stabilizes.
        arxiv_fetcher, crossref_fetcher, semantic_scholar_fetcher,
        openalex_fetcher, dblp_fetcher: Fetcher instances, or None to skip.
        comparator: MetadataComparator used to score each candidate.

    Returns:
        The best comparison result found, or the comparator's "unable"
        result when no source produced anything.
    """
    results = []

    # 1. DBLP (high-quality metadata for computer-science venues).
    if dblp_fetcher and entry.title:
        try:
            dblp_result = dblp_fetcher.search_by_title(entry.title)
            if dblp_result:
                res = comparator.compare_with_dblp(entry, dblp_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass  # best-effort: keep trying the remaining sources

    # 2. Semantic Scholar (broad coverage; DOI lookup first, then title).
    if semantic_scholar_fetcher and entry.title:
        try:
            ss_result = None
            if entry.doi:
                ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
            if not ss_result:
                ss_result = semantic_scholar_fetcher.search_by_title(entry.title)

            if ss_result:
                res = comparator.compare_with_semantic_scholar(entry, ss_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # 3. OpenAlex (cross-discipline coverage; DOI lookup first, then title).
    if openalex_fetcher and entry.title:
        try:
            oa_result = None
            if entry.doi:
                oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
            if not oa_result:
                oa_result = openalex_fetcher.search_by_title(entry.title)

            if oa_result:
                res = comparator.compare_with_openalex(entry, oa_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # 4. CrossRef (authoritative when the entry carries a DOI).
    if crossref_fetcher and entry.doi:
        try:
            crossref_result = crossref_fetcher.search_by_doi(entry.doi)
            if crossref_result:
                res = comparator.compare_with_crossref(entry, crossref_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # 5. arXiv (by ID when available, otherwise a single-result title search).
    if arxiv_fetcher:
        try:
            arxiv_meta = None
            if entry.has_arxiv:
                arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
            elif entry.title:
                search_results = arxiv_fetcher.search_by_title(entry.title, max_results=1)
                if search_results:
                    arxiv_meta = search_results[0]

            if arxiv_meta:
                res = comparator.compare_with_arxiv(entry, arxiv_meta)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # No perfect match anywhere: return the highest-confidence partial result
    # (max() keeps the first-encountered item on ties, matching the previous
    # stable sort-then-[0] behavior without sorting the whole list).
    if results:
        return max(results, key=lambda x: x.confidence)

    # Absolutely nothing found in any source.
    return comparator.create_unable_result(entry, "No metadata found in any source")
assets/icon-192.png ADDED

Git LFS Details

  • SHA256: 158c7c199e8e4978d2e8d6da90c4896022bf83436b0ab2c9b6285078cad60863
  • Pointer size: 131 Bytes
  • Size of remote file: 340 kB
assets/icon-512.png ADDED

Git LFS Details

  • SHA256: da47e48d79d2aae7f81cd1b04b39f0b7a66e760ee2338dfcdde36f66293f3ccf
  • Pointer size: 131 Bytes
  • Size of remote file: 313 kB
bibguard.yaml ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # BibGuard Configuration File
3
+ # ==============================================================================
4
+ #
5
+ # Usage: python main.py --config bibguard.yaml
6
+ # python main.py (auto-detect bibguard.yaml in current/parent directories)
7
+ #
8
+ # All paths are relative to this configuration file's directory.
9
+
10
+ # ==============================================================================
11
+ # 📁 File Settings
12
+ # ==============================================================================
13
+ files:
14
+ # Required: Path to your .bib bibliography file
15
+ bib: "test.bib"
16
+
17
+ # Required: Path to your .tex LaTeX source file
18
+ tex: "test.tex"
19
+
20
+ # Optional: Directory path for recursive scanning (Experimental)
21
+ # When set, BibGuard will recursively search for all .tex and .bib files in this directory.
22
+ # This mode is parallel to 'bib' and 'tex'. Use either this OR bib/tex.
23
+ # input_dir: "./paper_project"
24
+
25
+ # Output directory for all generated reports and files (default: bibguard_output)
26
+ # All outputs including reports, cleaned .bib, and input file copies will be saved here
27
+ output_dir: "test"
28
+
29
+
30
+ # ==============================================================================
31
+ # 🎓 Conference Template
32
+ # ==============================================================================
33
+ # Specify a conference template for venue-specific checks and formatting rules.
34
+ # Available templates: acl, emnlp, naacl, cvpr, iccv, eccv, neurips, icml, iclr
35
+ # Leave empty ("") to skip template-specific checks.
36
+ template: ""
37
+
38
+ # ==============================================================================
39
+ # 📚 Bibliography Checks
40
+ # ==============================================================================
41
+ bibliography:
42
+ # Metadata Validation - Verify bib entries against online databases (arXiv, CrossRef, etc.)
43
+ # Detects incorrect titles, authors, venues, and publication years
44
+ # ⚠️ It will take some time to check metadata since it needs to query multiple online sources. Make it to false if you don't need to check metadata.
45
+ check_metadata: true
46
+
47
+ # Usage Check - Detect unused bib entries and missing citations
48
+ # Identifies entries in .bib not cited in .tex, and citations without bib entries
49
+ check_usage: true
50
+
51
+ # Duplicate Detection - Find duplicate entries with different keys
52
+ # Uses fuzzy matching on titles and DOIs to identify the same paper cited multiple times
53
+ check_duplicates: true
54
+
55
+ # Preprint Ratio Check - Warn if too many references are preprints
56
+ # Detects arXiv, bioRxiv, and other preprints. Warns if ratio exceeds threshold.
57
+ check_preprint_ratio: true
58
+ preprint_warning_threshold: 0.50 # Warn if more than 50% of used entries are preprints
59
+
60
+ # Relevance Assessment - Use LLM to evaluate if citations match their context
61
+ # Requires LLM configuration (see llm section below). Disabled by default due to API costs.
62
+ check_relevance: false
63
+
64
+ # ==============================================================================
65
+ # 📋 Submission Quality Checks
66
+ # ==============================================================================
67
+ submission:
68
+ # ─────────────────────────────────────────────────────────────────────────────
69
+ # Format Checks
70
+ # ─────────────────────────────────────────────────────────────────────────────
71
+
72
+ # Caption Position - Ensure table captions are above, figure captions below
73
+ # Checks \caption placement relative to \begin{table}/\begin{figure}
74
+ caption: true
75
+
76
+ # Cross-References - Verify all figures/tables/sections are referenced in text
77
+ # Detects orphaned floats that are never mentioned
78
+ reference: true
79
+
80
+ # Formatting Standards - Check citation format, spacing, special characters
81
+ # Validates \cite{} usage, non-breaking spaces, proper quotation marks, etc.
82
+ formatting: true
83
+
84
+ # Equation Checks - Verify equation punctuation and numbering consistency
85
+ # Ensures equations end with proper punctuation and labels are used correctly
86
+ equation: true
87
+
88
+ # ─────────────────────────────────────────────────────────────────────────────
89
+ # Writing Quality
90
+ # ─────────────────────────────────────────────────────────────────────────────
91
+
92
+ # AI Artifacts - Detect traces of AI-generated text
93
+ # Flags phrases like "Sure, here is...", "As an AI...", "It's important to note..."
94
+ ai_artifacts: true
95
+
96
+ # Sentence Quality - Identify overly long sentences, weak openings, redundant phrases
97
+ # Helps improve readability and academic writing style
98
+ sentence: true
99
+
100
+ # Terminology Consistency - Check for inconsistent spelling, hyphenation, US/UK variants
101
+ # Examples: "deep learning" vs "deep-learning", "color" vs "colour"
102
+ consistency: true
103
+
104
+ # ─────────────────────────────────────────────────────────────────────────────
105
+ # Academic Standards
106
+ # ─────────────────────────────────────────────────────────────────────────────
107
+
108
+ # Acronym Definitions - Ensure acronyms are defined on first use
109
+ # Example: "Natural Language Processing (NLP)" before using "NLP" alone
110
+ acronym: true
111
+
112
+ # Number Formatting - Check percentage formatting consistency
113
+ # Ensures no space before % sign and consistent use of '%' vs 'percent'
114
+ number: true
115
+
116
+ # Citation Quality - Flag outdated references and citation formatting issues
117
+ # Warns about papers older than 30 years and checks citation formatting (et al., hardcoded citations)
118
+ citation_quality: true
119
+
120
+ # ─────────────────────────────────────────────────────────────────────────────
121
+ # Review Compliance
122
+ # ─────────────────────────────────────────────────────────────────────────────
123
+
124
+ # Anonymization - Check double-blind review compliance
125
+ # Detects GitHub links, acknowledgments, self-citations that may reveal author identity
126
+ anonymization: true
127
+
128
+ # ==============================================================================
129
+ # 🔍 Metadata Check Workflow
130
+ # ==============================================================================
131
+ # Define the data sources and order for metadata validation.
132
+ # BibGuard will try each enabled source in sequence until a match is found.
133
+ # Set enabled: false to skip a particular source.
134
+ workflow:
135
+ - name: arxiv_id
136
+ enabled: true
137
+ description: "Lookup by arXiv ID (fastest, most reliable for preprints)"
138
+
139
+ - name: crossref_doi
140
+ enabled: true
141
+ description: "Lookup by DOI via CrossRef (authoritative for published papers)"
142
+
143
+ - name: semantic_scholar
144
+ enabled: true
145
+ description: "Semantic Scholar API (good coverage, includes citations)"
146
+
147
+ - name: dblp
148
+ enabled: true
149
+ description: "DBLP database (comprehensive for computer science papers)"
150
+
151
+ - name: openalex
152
+ enabled: true
153
+ description: "OpenAlex API (broad coverage across disciplines)"
154
+
155
+ - name: arxiv_title
156
+ enabled: true
157
+ description: "Search arXiv by title (fallback when ID unavailable)"
158
+
159
+ - name: crossref_title
160
+ enabled: true
161
+ description: "Search CrossRef by title (fallback when DOI unavailable)"
162
+
163
+ - name: google_scholar
164
+ enabled: false # May be rate-limited, disabled by default
165
+ description: "Google Scholar web scraping (use as last resort)"
166
+
167
+ # ==============================================================================
168
+ # 🤖 LLM Configuration (for Relevance Checking)
169
+ # ==============================================================================
170
+ llm:
171
+ # Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek
172
+ # Each backend requires different setup (API keys, local installation, etc.)
173
+ backend: "gemini"
174
+
175
+ # Model name (leave empty to use backend default)
176
+ # Examples: "gpt-4", "claude-3-opus", "gemini-pro", "llama3"
177
+ model: ""
178
+
179
+ # API endpoint (leave empty to use backend default)
180
+ # Only needed for self-hosted models (vllm, ollama) or custom endpoints
181
+ endpoint: ""
182
+
183
+ # API key (recommended to use environment variables instead)
184
+ # Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. in your environment
185
+ api_key: ""
186
+
187
+ # ==============================================================================
188
+ # 📊 Output Settings
189
+ # ==============================================================================
190
+ output:
191
+ # Quiet mode - Suppress progress messages, only output final reports
192
+ # Useful for CI/CD pipelines or batch processing
193
+ quiet: false
194
+
195
+ # Minimal verified entries - Hide detailed info for entries that passed all checks
196
+ # Reduces report size when you only care about issues
197
+ minimal_verified: false
main.py ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ BibGuard - Bibliography Checker & Paper Submission Quality Tool
4
+
5
+ Usage:
6
+ python main.py # Use bibguard.yaml in current directory
7
+ python main.py --config my.yaml # Use specified config file
8
+ python main.py --init # Create default config file
9
+ python main.py --list-templates # List available templates
10
+ """
11
+ import argparse
12
+ import sys
13
+ from pathlib import Path
14
+ from typing import Optional, List
15
+
16
+ from src.parsers import BibParser, TexParser
17
+ from src.fetchers import ArxivFetcher, ScholarFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher
18
+ from src.analyzers import MetadataComparator, UsageChecker, LLMEvaluator, DuplicateDetector
19
+ from src.analyzers.llm_evaluator import LLMBackend
20
+ from src.report.generator import ReportGenerator, EntryReport
21
+ from src.utils.progress import ProgressDisplay
22
+ from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config
23
+ from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
24
+ from src.templates.base_template import get_template, get_all_templates
25
+ from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity
26
+
27
+
28
def main():
    """CLI entry point for BibGuard.

    Parses command-line arguments, handles the --init / --list-templates
    shortcuts, locates and loads the YAML config, validates the configured
    input files (directory-scan mode or explicit bib/tex mode), resolves the
    optional conference template, and finally delegates to run_checker().

    Exits with status 0 on success paths, 1 on any configuration/validation
    error, and 130 on Ctrl-C (conventional SIGINT exit code).
    """
    parser = argparse.ArgumentParser(
        description="BibGuard: Bibliography Checker & Paper Submission Quality Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Usage Examples:
  python main.py                     # Auto-detect config.yaml in current directory
  python main.py --config my.yaml    # Use specified config file
  python main.py --init              # Create default config.yaml
  python main.py --list-templates    # List available conference templates
"""
    )

    parser.add_argument(
        "--config", "-c",
        help="Config file path (default: auto-detect config.yaml)"
    )
    parser.add_argument(
        "--init",
        action="store_true",
        help="Create default config.yaml in current directory"
    )
    parser.add_argument(
        "--list-templates",
        action="store_true",
        help="List all available conference templates"
    )

    args = parser.parse_args()

    # Handle --init: write a starter config file and exit successfully.
    if args.init:
        output = create_default_config()
        print(f"✓ Created configuration file: {output}")
        print("")
        print(" Next steps:")
        print(" 1. Edit the 'bib' and 'tex' paths in config.yaml")
        print(" 2. Run: python main.py --config config.yaml")
        print("")
        sys.exit(0)

    # Handle --list-templates: print the known templates and exit.
    # Imported lazily so the UI module is only loaded when needed.
    if args.list_templates:
        from src.ui.template_selector import list_templates
        list_templates()
        sys.exit(0)

    # Resolve the config path: explicit --config wins, otherwise auto-detect.
    config_path = args.config
    if not config_path:
        found = find_config_file()
        if found:
            config_path = str(found)
        else:
            print("Error: Config file not found")
            print("")
            print("Please run 'python main.py --init' to create config.yaml")
            print("Or use 'python main.py --config <path>' to specify a config file")
            print("")
            sys.exit(1)

    try:
        config = load_config(config_path)
    except FileNotFoundError:
        print(f"Error: Config file does not exist: {config_path}")
        sys.exit(1)
    except Exception as e:
        # Any parse/validation failure in the YAML layer is fatal.
        print(f"Error: Failed to parse config file: {e}")
        sys.exit(1)

    # Two input modes: directory scan (files.input_dir set) or explicit
    # single bib/tex file paths.
    mode_dir = bool(config.files.input_dir)

    if mode_dir:
        input_dir = config.input_dir_path
        if not input_dir.exists() or not input_dir.is_dir():
            print(f"Error: Input directory does not exist or is not a directory: {input_dir}")
            sys.exit(1)

        # Recursively collect all .tex/.bib files under the input directory.
        tex_files = list(input_dir.rglob("*.tex"))
        bib_files = list(input_dir.rglob("*.bib"))

        if not tex_files:
            print(f"Error: No .tex files found in {input_dir}")
            sys.exit(1)
        if not bib_files:
            print(f"Error: No .bib files found in {input_dir}")
            sys.exit(1)

        # Stash resolved file lists on the config object; run_checker()
        # reads these private attributes.
        config._tex_files = tex_files
        config._bib_files = bib_files
    else:
        if not config.files.bib:
            print("Error: bib file path not specified in config")
            sys.exit(1)
        if not config.files.tex:
            print("Error: tex file path not specified in config")
            sys.exit(1)

        # Validate files exist
        if not config.bib_path.exists():
            print(f"Error: Bib file does not exist: {config.bib_path}")
            sys.exit(1)
        if not config.tex_path.exists():
            print(f"Error: TeX file does not exist: {config.tex_path}")
            sys.exit(1)

        config._tex_files = [config.tex_path]
        config._bib_files = [config.bib_path]

    # Load template if specified; an unknown template name is a hard error.
    template = None
    if config.template:
        template = get_template(config.template)
        if not template:
            print(f"Error: Unknown template: {config.template}")
            print("Use --list-templates to see available templates")
            sys.exit(1)

    # Run the checker. KeyboardInterrupt maps to exit code 130; any other
    # failure prints a traceback for debugging and exits 1.
    try:
        run_checker(config, template)
    except KeyboardInterrupt:
        print("\n\nCancelled")
        sys.exit(130)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
158
+
159
+
160
def run_checker(config: BibGuardConfig, template=None):
    """Run the bibliography checker with the given configuration.

    Pipeline (in order):
      1. Parse all configured .bib and .tex files, merging citations/keys
         from multiple TeX files into a single TexParser view.
      2. Conditionally construct fetchers/analyzers based on the
         `bibliography` config flags (metadata, usage, duplicates, relevance).
      3. Run enabled submission-quality checkers over each TeX file.
      4. Process every bib entry in a thread pool (usage check, online
         metadata comparison, per-context LLM relevance evaluation).
      5. Write reports and a cleaned "only used" .bib into the output dir.

    Args:
        config: Fully validated BibGuardConfig; main() must have populated
            config._tex_files and config._bib_files before calling.
        template: Optional conference template object (or None).
    """
    progress = ProgressDisplay()

    # Show config info (minimal)
    if template:
        pass  # Skip printing header/info here to keep output clean

    # Parse files (silent). Entries from all bib files are concatenated.
    bib_parser = BibParser()
    entries = []
    for bib_path in config._bib_files:
        entries.extend(bib_parser.parse_file(str(bib_path)))

    tex_parser = TexParser()
    tex_contents = {}        # str path -> raw file content
    merged_citations = {}    # citation key -> list of occurrences across files
    merged_all_keys = set()  # union of cited keys across all tex files

    for tex_path in config._tex_files:
        cits = tex_parser.parse_file(str(tex_path))
        # Accumulate citations per key across files.
        for k, v in cits.items():
            if k not in merged_citations:
                merged_citations[k] = []
            merged_citations[k].extend(v)
        # Accumulate cited keys.
        merged_all_keys.update(tex_parser.get_all_cited_keys())
        # Store raw content for the submission checkers / line reports.
        # errors='replace' keeps going on bad encodings instead of raising.
        tex_contents[str(tex_path)] = tex_path.read_text(encoding='utf-8', errors='replace')

    # Inject merged data back into parser for components that use it
    # (UsageChecker reads these attributes).
    tex_parser.citations = merged_citations
    tex_parser.all_keys = merged_all_keys

    # Initialize components based on config; anything left as None is
    # treated as "feature disabled" downstream.
    bib_config = config.bibliography

    arxiv_fetcher = None
    crossref_fetcher = None
    scholar_fetcher = None
    semantic_scholar_fetcher = None
    openalex_fetcher = None
    dblp_fetcher = None
    comparator = None
    usage_checker = None
    llm_evaluator = None
    duplicate_detector = None

    # arXiv is needed both for metadata comparison and for abstract
    # retrieval during relevance evaluation.
    if bib_config.check_metadata or bib_config.check_relevance:
        arxiv_fetcher = ArxivFetcher()

    if bib_config.check_metadata:
        semantic_scholar_fetcher = SemanticScholarFetcher()
        openalex_fetcher = OpenAlexFetcher()
        dblp_fetcher = DBLPFetcher()
        crossref_fetcher = CrossRefFetcher()
        scholar_fetcher = ScholarFetcher()
        comparator = MetadataComparator()

    if bib_config.check_usage:
        usage_checker = UsageChecker(tex_parser)

    if bib_config.check_duplicates:
        duplicate_detector = DuplicateDetector()

    if bib_config.check_relevance:
        llm_config = config.llm
        # `or None` normalizes empty-string config values to None so the
        # evaluator falls back to its backend defaults.
        backend = LLMBackend(llm_config.backend)
        llm_evaluator = LLMEvaluator(
            backend=backend,
            endpoint=llm_config.endpoint or None,
            model=llm_config.model or None,
            api_key=llm_config.api_key or None
        )

        # Test LLM connection (silent)
        llm_evaluator.test_connection()

        # Relevance evaluation needs citation contexts, so ensure a usage
        # checker exists even when check_usage itself is disabled.
        if not usage_checker:
            usage_checker = UsageChecker(tex_parser)

    # Initialize report generator
    report_gen = ReportGenerator(
        minimal_verified=config.output.minimal_verified,
        check_preprint_ratio=config.bibliography.check_preprint_ratio,
        preprint_warning_threshold=config.bibliography.preprint_warning_threshold
    )
    report_gen.set_metadata(
        [str(f) for f in config._bib_files],
        [str(f) for f in config._tex_files]
    )

    # Run submission quality checks: every enabled checker over every file.
    submission_results = []
    enabled_checkers = config.submission.get_enabled_checkers()

    for checker_name in enabled_checkers:
        if checker_name in CHECKER_REGISTRY:
            checker = CHECKER_REGISTRY[checker_name]()
            for tex_path_str, content in tex_contents.items():
                results = checker.check(content, {})
                # Tag results with file path so multi-file reports can
                # attribute findings to the right source.
                for r in results:
                    r.file_path = tex_path_str
                submission_results.extend(results)

    # Set results in report generator for summary calculation
    report_gen.set_submission_results(submission_results, template)

    # Check for duplicates (silent)
    if bib_config.check_duplicates and duplicate_detector:
        duplicate_groups = duplicate_detector.find_duplicates(entries)
        report_gen.set_duplicate_groups(duplicate_groups)

    # Check missing citations (silent)
    if bib_config.check_usage and usage_checker:
        missing = usage_checker.get_missing_entries(entries)
        report_gen.set_missing_citations(missing)

    # Process entries

    # Build workflow from config.
    # NOTE(review): these names are already imported at module level; this
    # local re-import is redundant but harmless.
    from src.config.workflow import WorkflowConfig, get_default_workflow, WorkflowStep as WFStep
    workflow_config = get_default_workflow()
    if config.workflow:
        # Translate user-defined workflow steps; list position becomes the
        # step priority.
        workflow_config = WorkflowConfig(
            steps=[
                WFStep(
                    name=step.name,
                    display_name=step.name,
                    description=step.description,
                    enabled=step.enabled,
                    priority=i
                )
                for i, step in enumerate(config.workflow)
            ]
        )

    # Process entries in parallel for metadata checks
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import threading

    # Thread-safe progress tracking
    progress_lock = threading.Lock()
    completed_count = [0]  # Use list for mutability in closure

    def process_single_entry(entry):
        """Process a single entry (thread-safe).

        Returns (EntryReport, comparison_result-or-None). Runs inside a
        worker thread; only touches per-entry state plus read-only
        checkers/fetchers.
        """
        # Check usage
        usage_result = None
        if usage_checker:
            usage_result = usage_checker.check_usage(entry)

        # Fetch and compare metadata
        comparison_result = None
        if bib_config.check_metadata and comparator:
            comparison_result = fetch_and_compare_with_workflow(
                entry, workflow_config, arxiv_fetcher, crossref_fetcher,
                scholar_fetcher, semantic_scholar_fetcher, openalex_fetcher,
                dblp_fetcher, comparator
            )

        # LLM evaluation (keep sequential per entry): one evaluation per
        # citation context, but only when an abstract could be found.
        evaluations = []
        if bib_config.check_relevance and llm_evaluator:
            if usage_result and usage_result.is_used:
                abstract = get_abstract(entry, comparison_result, arxiv_fetcher)
                if abstract:
                    for ctx in usage_result.contexts:
                        eval_result = llm_evaluator.evaluate(
                            entry.key, ctx.full_context, abstract
                        )
                        eval_result.line_number = ctx.line_number
                        eval_result.file_path = ctx.file_path
                        evaluations.append(eval_result)

        # Create entry report
        entry_report = EntryReport(
            entry=entry,
            comparison=comparison_result,
            usage=usage_result,
            evaluations=evaluations
        )

        return entry_report, comparison_result

    # Determine number of workers (max 10 to avoid overwhelming APIs)
    max_workers = min(10, len(entries))

    with progress.progress_context(len(entries), "Processing bibliography") as prog:
        # Use ThreadPoolExecutor for parallel processing
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries}

            # Process completed tasks as they finish (order is arbitrary).
            for future in as_completed(future_to_entry):
                entry = future_to_entry[future]
                try:
                    entry_report, comparison_result = future.result()

                    # Thread-safe progress update: report accumulation and
                    # progress marks happen under one lock.
                    with progress_lock:
                        report_gen.add_entry_report(entry_report)

                        # Update progress: match -> success, issues ->
                        # warning, anything else (incl. no comparison) -> error.
                        if comparison_result and comparison_result.is_match:
                            prog.mark_success()
                        elif comparison_result and comparison_result.has_issues:
                            prog.mark_warning()
                        else:
                            prog.mark_error()

                        completed_count[0] += 1
                        prog.update(entry.key, "Done", 1)

                except Exception as e:
                    # A worker crashed for this entry; record the failure
                    # but keep processing the rest.
                    with progress_lock:
                        prog.mark_error()
                        progress.print_error(f"Error processing {entry.key}: {e}")
                        completed_count[0] += 1
                        prog.update(entry.key, "Failed", 1)

    # Summary will be printed at the very end

    # Generate reports and organize outputs (silent)

    # Create output directory
    output_dir = config.output_dir_path
    output_dir.mkdir(parents=True, exist_ok=True)

    # Copy input files to output directory so the report bundle is
    # self-contained.
    import shutil
    for bib_path in config._bib_files:
        shutil.copy2(bib_path, output_dir / bib_path.name)
    for tex_path in config._tex_files:
        shutil.copy2(tex_path, output_dir / tex_path.name)
    # 1. Bibliography Report
    bib_report_path = output_dir / "bibliography_report.md"
    report_gen.save_bibliography_report(str(bib_report_path))

    # 2. LaTeX Quality Report (only when any checker produced findings)
    if submission_results:
        latex_report_path = output_dir / "latex_quality_report.md"
        report_gen.save_latex_quality_report(
            str(latex_report_path),
            submission_results,
            template
        )

    # 3. Line-by-Line Report
    # NOTE(review): generate_line_report is imported but never called here;
    # the per-file LineByLineReportGenerator below is used instead.
    from src.report.line_report import generate_line_report
    line_report_path = output_dir / "line_by_line_report.md"

    # For multiple files, we generate one big report with sections
    all_line_reports = []
    for tex_path_str, content in tex_contents.items():
        file_results = [r for r in submission_results if r.file_path == tex_path_str]
        if not file_results:
            continue

        from src.report.line_report import LineByLineReportGenerator
        gen = LineByLineReportGenerator(content, tex_path_str)
        gen.add_results(file_results)
        all_line_reports.append(gen.generate())

    if all_line_reports:
        with open(line_report_path, 'w', encoding='utf-8') as f:
            f.write("\n\n".join(all_line_reports))

    # 4. Clean bib file (if generated earlier): a .bib containing only the
    # entries actually cited in the TeX sources.
    if bib_config.check_usage and usage_checker:
        used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used]
        if used_entries:
            try:
                keys_to_keep = {entry.key for entry in used_entries}
                # If multiple bibs, we merge them into one cleaned file
                # or just use the first one if it's single mode.
                # For now, let's just use a default name if multiple.
                if len(config._bib_files) == 1:
                    clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib"
                    bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep)
                else:
                    clean_bib_path = output_dir / "merged_only_used.bib"
                    # We need a way to filter multiple files into one.
                    # BibParser.filter_file currently takes one input.
                    # Let's just write all used entries to a new file.
                    with open(clean_bib_path, 'w', encoding='utf-8') as f:
                        for entry in used_entries:
                            f.write(entry.raw + "\n\n")
            except Exception as e:
                # NOTE(review): deliberate best-effort — a failure to write
                # the cleaned bib must not abort the run, but this silently
                # discards the error; consider at least logging it.
                pass

    # Print beautiful console summary
    if not config.output.quiet:
        bib_stats, latex_stats = report_gen.get_summary_stats()
        progress.print_detailed_summary(bib_stats, latex_stats, str(output_dir.absolute()))
458
+
459
+
460
def fetch_and_compare_with_workflow(
    entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher,
    semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
):
    """Fetch metadata from online sources using the configured workflow.

    Tries each enabled workflow step in order. A step only runs when the
    entry has the data it needs (arXiv id, DOI, or title) and the matching
    fetcher was constructed. The first comparison whose `is_match` is true
    is returned immediately; otherwise the highest-confidence non-matching
    result is returned, and if no source produced anything, an explicit
    "unable" result is created.

    Args:
        entry: Bib entry to verify (provides arxiv_id/doi/title attributes).
        workflow_config: Ordered step configuration; only enabled steps run.
        *_fetcher: Per-source fetchers; any may be None (step is skipped).
        comparator: MetadataComparator used to diff entry vs fetched data.

    Returns:
        A comparison result object from the comparator (never None).
    """
    from src.utils.normalizer import TextNormalizer

    all_results = []  # every non-None comparison, kept for the fallback pick
    enabled_steps = workflow_config.get_enabled_steps()

    for step in enabled_steps:
        result = None

        # Direct arXiv lookup by identifier (strongest signal when present).
        if step.name == "arxiv_id" and entry.has_arxiv and arxiv_fetcher:
            arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
            if arxiv_meta:
                result = comparator.compare_with_arxiv(entry, arxiv_meta)

        # CrossRef lookup by DOI.
        elif step.name == "crossref_doi" and entry.doi and crossref_fetcher:
            crossref_result = crossref_fetcher.search_by_doi(entry.doi)
            if crossref_result:
                result = comparator.compare_with_crossref(entry, crossref_result)

        # Semantic Scholar: prefer DOI lookup, fall back to title search.
        elif step.name == "semantic_scholar" and entry.title and semantic_scholar_fetcher:
            ss_result = None
            if entry.doi:
                ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
            if not ss_result:
                ss_result = semantic_scholar_fetcher.search_by_title(entry.title)
            if ss_result:
                result = comparator.compare_with_semantic_scholar(entry, ss_result)

        # DBLP title search.
        elif step.name == "dblp" and entry.title and dblp_fetcher:
            dblp_result = dblp_fetcher.search_by_title(entry.title)
            if dblp_result:
                result = comparator.compare_with_dblp(entry, dblp_result)

        # OpenAlex: prefer DOI lookup, fall back to title search.
        elif step.name == "openalex" and entry.title and openalex_fetcher:
            oa_result = None
            if entry.doi:
                oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
            if not oa_result:
                oa_result = openalex_fetcher.search_by_title(entry.title)
            if oa_result:
                result = comparator.compare_with_openalex(entry, oa_result)

        # arXiv title search: fetch a few candidates and keep the one whose
        # normalized title is most similar; require similarity > 0.5 to
        # avoid comparing against an unrelated paper.
        elif step.name == "arxiv_title" and entry.title and arxiv_fetcher:
            results = arxiv_fetcher.search_by_title(entry.title, max_results=3)
            if results:
                best_result = None
                best_sim = 0.0
                norm1 = TextNormalizer.normalize_for_comparison(entry.title)

                for r in results:
                    norm2 = TextNormalizer.normalize_for_comparison(r.title)
                    sim = TextNormalizer.similarity_ratio(norm1, norm2)
                    if sim > best_sim:
                        best_sim = sim
                        best_result = r

                if best_result and best_sim > 0.5:
                    result = comparator.compare_with_arxiv(entry, best_result)

        # CrossRef title search (weaker than the DOI path above).
        elif step.name == "crossref_title" and entry.title and crossref_fetcher:
            crossref_result = crossref_fetcher.search_by_title(entry.title)
            if crossref_result:
                result = comparator.compare_with_crossref(entry, crossref_result)

        # Google Scholar title search (last resort).
        elif step.name == "google_scholar" and entry.title and scholar_fetcher:
            scholar_result = scholar_fetcher.search_by_title(entry.title)
            if scholar_result:
                result = comparator.compare_with_scholar(entry, scholar_result)

        if result:
            all_results.append(result)
            # Early exit on the first confirmed match; later sources are
            # only consulted when earlier ones disagree or fail.
            if result.is_match:
                return result

    # No source matched: return the most confident partial result, if any.
    if all_results:
        all_results.sort(key=lambda r: r.confidence, reverse=True)
        return all_results[0]

    return comparator.create_unable_result(entry, "Unable to find this paper in any data source")
543
+
544
+
545
def get_abstract(entry, comparison_result, arxiv_fetcher):
    """Return the best available abstract for *entry*, or "" if none found.

    Lookup order: the abstract stored on the bib entry itself, then an
    arXiv fetch by identifier (when the entry carries one), then a single
    arXiv title search. The *comparison_result* parameter is accepted for
    interface compatibility and is not consulted.
    """
    # 1) Abstract already present on the entry — cheapest path.
    stored = entry.abstract
    if stored:
        return stored

    if arxiv_fetcher:
        # 2) Direct arXiv lookup via the entry's identifier.
        if entry.has_arxiv:
            meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
            if meta and meta.abstract:
                return meta.abstract

        # 3) Fall back to an arXiv title search, taking only the top hit.
        if entry.title:
            hits = arxiv_fetcher.search_by_title(entry.title, max_results=1)
            if hits and hits[0].abstract:
                return hits[0].abstract

    return ""
561
+
562
+
563
+ if __name__ == "__main__":
564
+ main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ bibtexparser>=1.4.0
3
+ requests>=2.31.0
4
+ beautifulsoup4>=4.12.0
5
+ rich>=13.7.0
6
+ Unidecode>=1.3.0
7
+ lxml>=5.0.0
8
+ PyYAML>=6.0
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Bibliography Checker Package"""
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (202 Bytes). View file
 
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (190 Bytes). View file
 
src/analyzers/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Analyzers package"""
2
+ from .metadata_comparator import MetadataComparator
3
+ from .usage_checker import UsageChecker
4
+ from .llm_evaluator import LLMEvaluator
5
+ from .duplicate_detector import DuplicateDetector
6
+
7
+ __all__ = ['MetadataComparator', 'UsageChecker', 'LLMEvaluator', 'DuplicateDetector']
src/analyzers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (464 Bytes). View file
 
src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc ADDED
Binary file (8.29 kB). View file
 
src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc ADDED
Binary file (5.4 kB). View file
 
src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc ADDED
Binary file (14.3 kB). View file
 
src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc ADDED
Binary file (18.9 kB). View file
 
src/analyzers/__pycache__/retraction_checker.cpython-313.pyc ADDED
Binary file (4.94 kB). View file
 
src/analyzers/__pycache__/url_validator.cpython-313.pyc ADDED
Binary file (8.3 kB). View file
 
src/analyzers/__pycache__/usage_checker.cpython-313.pyc ADDED
Binary file (4.4 kB). View file
 
src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc ADDED
Binary file (13.3 kB). View file
 
src/analyzers/duplicate_detector.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Duplicate entry detector for bibliography files.
3
+ Uses fuzzy matching to find potential duplicates.
4
+ """
5
+ from dataclasses import dataclass
6
+ from typing import List, Tuple
7
+
8
+ from ..parsers.bib_parser import BibEntry
9
+ from ..utils.normalizer import TextNormalizer
10
+
11
+
12
+ @dataclass
13
+ class DuplicateGroup:
14
+ """A group of potentially duplicate entries."""
15
+ entries: List[BibEntry]
16
+ similarity_score: float
17
+ reason: str
18
+
19
+ @property
20
+ def entry_keys(self) -> List[str]:
21
+ return [e.key for e in self.entries]
22
+
23
+
24
+ class DuplicateDetector:
25
+ """Detects duplicate bibliography entries using fuzzy matching."""
26
+
27
+ # Thresholds for duplicate detection
28
+ TITLE_SIMILARITY_THRESHOLD = 0.85
29
+ COMBINED_SIMILARITY_THRESHOLD = 0.80
30
+
31
+ def __init__(self):
32
+ self.normalizer = TextNormalizer
33
+
34
+ def find_duplicates(self, entries: List[BibEntry]) -> List[DuplicateGroup]:
35
+ """
36
+ Find all duplicate groups in the bibliography.
37
+
38
+ Returns:
39
+ List of DuplicateGroup objects, each containing 2+ similar entries.
40
+ """
41
+ duplicates = []
42
+ processed = set()
43
+
44
+ for i, entry1 in enumerate(entries):
45
+ if entry1.key in processed:
46
+ continue
47
+
48
+ # Find all entries similar to this one
49
+ similar_entries = [entry1]
50
+
51
+ for j, entry2 in enumerate(entries[i+1:], start=i+1):
52
+ if entry2.key in processed:
53
+ continue
54
+
55
+ similarity, reason = self._calculate_similarity(entry1, entry2)
56
+
57
+ if similarity >= self.COMBINED_SIMILARITY_THRESHOLD:
58
+ similar_entries.append(entry2)
59
+ processed.add(entry2.key)
60
+
61
+ # If we found duplicates, create a group
62
+ if len(similar_entries) > 1:
63
+ processed.add(entry1.key)
64
+
65
+ # Calculate average similarity for the group
66
+ avg_similarity = self._calculate_group_similarity(similar_entries)
67
+ reason = self._generate_reason(similar_entries)
68
+
69
+ duplicates.append(DuplicateGroup(
70
+ entries=similar_entries,
71
+ similarity_score=avg_similarity,
72
+ reason=reason
73
+ ))
74
+
75
+ # Sort by similarity score (highest first)
76
+ duplicates.sort(key=lambda g: g.similarity_score, reverse=True)
77
+
78
+ return duplicates
79
+
80
+ def _calculate_similarity(self, entry1: BibEntry, entry2: BibEntry) -> Tuple[float, str]:
81
+ """
82
+ Calculate similarity between two entries.
83
+
84
+ Returns:
85
+ (similarity_score, reason_string)
86
+ """
87
+ # Normalize titles
88
+ title1 = self.normalizer.normalize_for_comparison(entry1.title)
89
+ title2 = self.normalizer.normalize_for_comparison(entry2.title)
90
+
91
+ # Calculate title similarity
92
+ title_sim = self.normalizer.similarity_ratio(title1, title2)
93
+
94
+ # If titles are very similar, likely a duplicate
95
+ if title_sim >= self.TITLE_SIMILARITY_THRESHOLD:
96
+ return title_sim, "Very similar titles"
97
+
98
+ # Check author similarity
99
+ author_sim = self._calculate_author_similarity(entry1, entry2)
100
+
101
+ # Combined score: weighted average
102
+ # Title is more important (70%) than authors (30%)
103
+ combined_sim = 0.7 * title_sim + 0.3 * author_sim
104
+
105
+ if combined_sim >= self.COMBINED_SIMILARITY_THRESHOLD:
106
+ return combined_sim, f"Similar title ({title_sim:.0%}) and authors ({author_sim:.0%})"
107
+
108
+ return combined_sim, ""
109
+
110
+ def _calculate_author_similarity(self, entry1: BibEntry, entry2: BibEntry) -> float:
111
+ """Calculate similarity between author lists."""
112
+ # Parse author strings
113
+ authors1 = self._parse_authors(entry1.author)
114
+ authors2 = self._parse_authors(entry2.author)
115
+
116
+ if not authors1 or not authors2:
117
+ return 0.0
118
+
119
+ # Normalize author names
120
+ norm_authors1 = [self.normalizer.normalize_for_comparison(a) for a in authors1]
121
+ norm_authors2 = [self.normalizer.normalize_for_comparison(a) for a in authors2]
122
+
123
+ # Count matching authors
124
+ matches = 0
125
+ for a1 in norm_authors1:
126
+ for a2 in norm_authors2:
127
+ if self._authors_match(a1, a2):
128
+ matches += 1
129
+ break
130
+
131
+ # Calculate Jaccard similarity
132
+ total_unique = len(set(norm_authors1) | set(norm_authors2))
133
+ if total_unique == 0:
134
+ return 0.0
135
+
136
+ return matches / total_unique
137
+
138
+ def _parse_authors(self, author_string: str) -> List[str]:
139
+ """Parse author string into list of names."""
140
+ if not author_string:
141
+ return []
142
+
143
+ # Split by 'and'
144
+ authors = author_string.split(' and ')
145
+
146
+ # Clean up each author
147
+ cleaned = []
148
+ for author in authors:
149
+ # Remove extra whitespace
150
+ author = ' '.join(author.split())
151
+ if author:
152
+ cleaned.append(author)
153
+
154
+ return cleaned
155
+
156
+ def _authors_match(self, name1: str, name2: str) -> bool:
157
+ """Check if two author names match (handles initials)."""
158
+ # Simple exact match after normalization
159
+ if name1 == name2:
160
+ return True
161
+
162
+ # Check if one is a substring of the other (handles initials)
163
+ if name1 in name2 or name2 in name1:
164
+ return True
165
+
166
+ # Calculate string similarity
167
+ sim = self.normalizer.similarity_ratio(name1, name2)
168
+ return sim >= 0.8
169
+
170
+ def _calculate_group_similarity(self, entries: List[BibEntry]) -> float:
171
+ """Calculate average similarity within a group."""
172
+ if len(entries) < 2:
173
+ return 1.0
174
+
175
+ total_sim = 0.0
176
+ count = 0
177
+
178
+ for i, entry1 in enumerate(entries):
179
+ for entry2 in entries[i+1:]:
180
+ sim, _ = self._calculate_similarity(entry1, entry2)
181
+ total_sim += sim
182
+ count += 1
183
+
184
+ return total_sim / count if count > 0 else 0.0
185
+
186
+ def _generate_reason(self, entries: List[BibEntry]) -> str:
187
+ """Generate a human-readable reason for the duplicate group."""
188
+ # Check if all titles are very similar
189
+ titles = [self.normalizer.normalize_for_comparison(e.title) for e in entries]
190
+
191
+ # Calculate pairwise title similarities
192
+ title_sims = []
193
+ for i, t1 in enumerate(titles):
194
+ for t2 in titles[i+1:]:
195
+ title_sims.append(self.normalizer.similarity_ratio(t1, t2))
196
+
197
+ avg_title_sim = sum(title_sims) / len(title_sims) if title_sims else 0.0
198
+
199
+ if avg_title_sim >= 0.95:
200
+ return "Nearly identical titles"
201
+ elif avg_title_sim >= 0.85:
202
+ return "Very similar titles"
203
+ else:
204
+ return "Similar titles and authors"
src/analyzers/llm_evaluator.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM-based citation relevance evaluator.
3
+ Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama backends.
4
+ """
5
+ import json
6
+ import re
7
+ from dataclasses import dataclass
8
+ from typing import Optional, Dict, Any
9
+ from enum import Enum
10
+ import os
11
+
12
+ import requests
13
+
14
+
15
class LLMBackend(Enum):
    """Supported LLM API backends for citation relevance evaluation.

    The enum value is the string used in the YAML config; the enum *name*
    is also used to derive the API-key environment variable
    (``<NAME>_API_KEY``) in LLMEvaluator.
    """
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GEMINI = "gemini"
    VLLM = "vllm"          # local server speaking the OpenAI chat API
    OLLAMA = "ollama"      # local Ollama server (native /api/generate)
    DEEPSEEK = "deepseek"  # hosted, OpenAI-compatible chat API
22
+
23
+
24
@dataclass
class EvaluationResult:
    """Outcome of one LLM relevance judgement for a single citation.

    A score of 0 together with a non-None ``error`` marks a failed or
    skipped evaluation.
    """
    entry_key: str
    relevance_score: int  # 1-5 (0 when evaluation failed)
    is_relevant: bool
    explanation: str
    context_used: str     # the citation context sent to the LLM
    abstract_used: str    # the abstract sent to the LLM
    line_number: Optional[int] = None
    file_path: Optional[str] = None
    error: Optional[str] = None

    @property
    def score_label(self) -> str:
        """Human-readable label for relevance_score; 'Unknown' when the
        score falls outside the 1-5 range."""
        names = (
            "Not Relevant",
            "Marginally Relevant",
            "Somewhat Relevant",
            "Relevant",
            "Highly Relevant",
        )
        if 1 <= self.relevance_score <= 5:
            return names[self.relevance_score - 1]
        return "Unknown"
47
+
48
+
49
+ class LLMEvaluator:
50
+ """Evaluates citation relevance using LLM."""
51
+
52
+ PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant.
53
+
54
+ ## Citation Context (from the manuscript):
55
+ {context}
56
+
57
+ ## Cited Paper's Abstract:
58
+ {abstract}
59
+
60
+ ## Task:
61
+ Evaluate the relevance and appropriateness of this citation. Consider:
62
+ 1. Does the citation support the claim being made in the context?
63
+ 2. Is the cited paper's topic related to the discussion?
64
+ 3. Is this citation necessary, or could it be replaced with a more relevant one?
65
+
66
+ ## Response Format:
67
+ Provide your response in the following JSON format:
68
+ {{
69
+ "relevance_score": <1-5 integer>,
70
+ "is_relevant": <true/false>,
71
+ "explanation": "<brief explanation in 1-2 sentences>"
72
+ }}
73
+
74
+ Score guide:
75
+ - 1: Not relevant at all
76
+ - 2: Marginally relevant
77
+ - 3: Somewhat relevant
78
+ - 4: Relevant and appropriate
79
+ - 5: Highly relevant and essential
80
+
81
+ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other text."""
82
+
83
+ def __init__(
84
+ self,
85
+ backend: LLMBackend = LLMBackend.GEMINI,
86
+ endpoint: Optional[str] = None,
87
+ model: Optional[str] = None,
88
+ api_key: Optional[str] = None
89
+ ):
90
+ self.backend = backend
91
+ self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY")
92
+
93
+ # Set defaults based on backend
94
+ if backend == LLMBackend.OPENAI:
95
+ self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions"
96
+ self.model = model or "gpt-5-mini"
97
+ elif backend == LLMBackend.ANTHROPIC:
98
+ self.endpoint = endpoint or "https://api.anthropic.com/v1/messages"
99
+ self.model = model or "claude-4.5-haiku"
100
+ elif backend == LLMBackend.DEEPSEEK:
101
+ self.endpoint = endpoint or "https://api.deepseek.com/chat/completions"
102
+ self.model = model or "deepseek-chat"
103
+ elif backend == LLMBackend.OLLAMA:
104
+ self.endpoint = endpoint or "http://localhost:11434/api/generate"
105
+ self.model = model or "Qwen/qwen3-4B-Instruct-2507"
106
+ elif backend == LLMBackend.VLLM:
107
+ self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions"
108
+ self.model = model or "Qwen/qwen3-4B-Instruct-2507"
109
+ elif backend == LLMBackend.GEMINI:
110
+ self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models"
111
+ self.model = model or "gemini-2.5-flash-lite"
112
+
113
+ def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult:
114
+ """Evaluate citation relevance."""
115
+ if not context or not abstract:
116
+ return EvaluationResult(
117
+ entry_key=entry_key,
118
+ relevance_score=0,
119
+ is_relevant=False,
120
+ explanation="Missing context or abstract",
121
+ context_used=context,
122
+ abstract_used=abstract,
123
+ error="Missing context or abstract for evaluation"
124
+ )
125
+
126
+ # Don't truncate - preserve full context and abstract
127
+ prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract)
128
+
129
+ try:
130
+ if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
131
+ response = self._call_openai_compatible(prompt)
132
+ elif self.backend == LLMBackend.ANTHROPIC:
133
+ response = self._call_anthropic(prompt)
134
+ elif self.backend == LLMBackend.OLLAMA:
135
+ response = self._call_ollama(prompt)
136
+ elif self.backend == LLMBackend.GEMINI:
137
+ response = self._call_gemini(prompt)
138
+ else:
139
+ raise ValueError(f"Unknown backend: {self.backend}")
140
+
141
+ return self._parse_response(entry_key, response, context, abstract)
142
+
143
+ except Exception as e:
144
+ return EvaluationResult(
145
+ entry_key=entry_key,
146
+ relevance_score=0,
147
+ is_relevant=False,
148
+ explanation="",
149
+ context_used=context,
150
+ abstract_used=abstract,
151
+ error=str(e)
152
+ )
153
+
154
+ def _call_openai_compatible(self, prompt: str) -> str:
155
+ """Call OpenAI-compatible API (OpenAI, DeepSeek, vLLM)."""
156
+ headers = {
157
+ "Content-Type": "application/json",
158
+ "Authorization": f"Bearer {self.api_key}"
159
+ }
160
+
161
+ payload = {
162
+ "model": self.model,
163
+ "messages": [
164
+ {"role": "user", "content": prompt}
165
+ ],
166
+ "temperature": 0.1,
167
+ "max_tokens": 2000,
168
+ "response_format": {"type": "json_object"} if self.backend == LLMBackend.OPENAI else None
169
+ }
170
+
171
+ response = requests.post(
172
+ self.endpoint,
173
+ json=payload,
174
+ headers=headers,
175
+ timeout=60
176
+ )
177
+ response.raise_for_status()
178
+
179
+ data = response.json()
180
+ choices = data.get("choices", [])
181
+ if choices:
182
+ return choices[0].get("message", {}).get("content", "")
183
+ return ""
184
+
185
+ def _call_anthropic(self, prompt: str) -> str:
186
+ """Call Anthropic API."""
187
+ headers = {
188
+ "x-api-key": self.api_key,
189
+ "anthropic-version": "2023-06-01",
190
+ "content-type": "application/json"
191
+ }
192
+
193
+ payload = {
194
+ "model": self.model,
195
+ "max_tokens": 2000,
196
+ "temperature": 0.1,
197
+ "messages": [
198
+ {"role": "user", "content": prompt}
199
+ ]
200
+ }
201
+
202
+ response = requests.post(
203
+ self.endpoint,
204
+ json=payload,
205
+ headers=headers,
206
+ timeout=60
207
+ )
208
+ response.raise_for_status()
209
+
210
+ data = response.json()
211
+ content = data.get("content", [])
212
+ if content and content[0].get("type") == "text":
213
+ return content[0].get("text", "")
214
+ return ""
215
+
216
+ def _call_ollama(self, prompt: str) -> str:
217
+ """Call Ollama API."""
218
+ payload = {
219
+ "model": self.model,
220
+ "prompt": prompt,
221
+ "stream": False,
222
+ "options": {
223
+ "temperature": 0.1,
224
+ "num_predict": 2000
225
+ },
226
+ "format": "json"
227
+ }
228
+
229
+ response = requests.post(
230
+ self.endpoint,
231
+ json=payload,
232
+ timeout=60
233
+ )
234
+ response.raise_for_status()
235
+
236
+ return response.json().get("response", "")
237
+
238
+ def _call_gemini(self, prompt: str) -> str:
239
+ """Call Gemini API."""
240
+ # Build URL with model
241
+ url = f"{self.endpoint}/{self.model}:generateContent"
242
+ if self.api_key:
243
+ url += f"?key={self.api_key}"
244
+
245
+ payload = {
246
+ "contents": [
247
+ {
248
+ "parts": [
249
+ {"text": prompt}
250
+ ]
251
+ }
252
+ ],
253
+ "generationConfig": {
254
+ "temperature": 0.1,
255
+ "maxOutputTokens": 2000,
256
+ "responseMimeType": "application/json"
257
+ }
258
+ }
259
+
260
+ response = requests.post(
261
+ url,
262
+ json=payload,
263
+ timeout=60
264
+ )
265
+ response.raise_for_status()
266
+
267
+ candidates = response.json().get("candidates", [])
268
+ if candidates:
269
+ content = candidates[0].get("content", {})
270
+ parts = content.get("parts", [])
271
+ if parts:
272
+ return parts[0].get("text", "")
273
+ return ""
274
+
275
+ def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult:
276
+ """Parse LLM response."""
277
+ # Try to extract JSON from response
278
+ json_match = re.search(r'\{[^{}]*\}', response, re.DOTALL)
279
+
280
+ data = {}
281
+ if not json_match:
282
+ # Try to parse the whole response as JSON
283
+ try:
284
+ data = json.loads(response.strip())
285
+ except json.JSONDecodeError:
286
+ pass
287
+ else:
288
+ try:
289
+ data = json.loads(json_match.group())
290
+ except json.JSONDecodeError:
291
+ pass
292
+
293
+ if not data:
294
+ return EvaluationResult(
295
+ entry_key=entry_key,
296
+ relevance_score=0,
297
+ is_relevant=False,
298
+ explanation=response,
299
+ context_used=context,
300
+ abstract_used=abstract,
301
+ error="Failed to parse LLM response as JSON"
302
+ )
303
+
304
+ # Extract fields
305
+ relevance_score = data.get("relevance_score", 0)
306
+ if isinstance(relevance_score, str):
307
+ try:
308
+ relevance_score = int(relevance_score)
309
+ except ValueError:
310
+ relevance_score = 0
311
+
312
+ is_relevant = data.get("is_relevant", False)
313
+ if isinstance(is_relevant, str):
314
+ is_relevant = is_relevant.lower() in ("true", "yes", "1")
315
+
316
+ explanation = data.get("explanation", "")
317
+
318
+ return EvaluationResult(
319
+ entry_key=entry_key,
320
+ relevance_score=relevance_score,
321
+ is_relevant=is_relevant,
322
+ explanation=explanation,
323
+ context_used=context,
324
+ abstract_used=abstract
325
+ )
326
+
327
+ def test_connection(self) -> bool:
328
+ """Test if LLM backend is accessible."""
329
+ try:
330
+ if self.backend == LLMBackend.OLLAMA:
331
+ response = requests.get(
332
+ self.endpoint.replace("/api/generate", "/api/tags"),
333
+ timeout=5
334
+ )
335
+ return response.status_code == 200
336
+ elif self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
337
+ # Test with a simple model list or empty completion
338
+ headers = {"Authorization": f"Bearer {self.api_key}"}
339
+ # Try listing models if possible, otherwise simple completion
340
+ if "chat/completions" in self.endpoint:
341
+ # Try a minimal completion
342
+ payload = {
343
+ "model": self.model,
344
+ "messages": [{"role": "user", "content": "hi"}],
345
+ "max_tokens": 1
346
+ }
347
+ response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10)
348
+ return response.status_code == 200
349
+ else:
350
+ return False
351
+ elif self.backend == LLMBackend.ANTHROPIC:
352
+ headers = {
353
+ "x-api-key": self.api_key,
354
+ "anthropic-version": "2023-06-01",
355
+ "content-type": "application/json"
356
+ }
357
+ payload = {
358
+ "model": self.model,
359
+ "max_tokens": 1,
360
+ "messages": [{"role": "user", "content": "hi"}]
361
+ }
362
+ response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10)
363
+ return response.status_code == 200
364
+ elif self.backend == LLMBackend.GEMINI:
365
+ if not self.api_key:
366
+ return False
367
+ url = f"{self.endpoint}/{self.model}:generateContent?key={self.api_key}"
368
+ payload = {
369
+ "contents": [{"parts": [{"text": "test"}]}],
370
+ "generationConfig": {"maxOutputTokens": 10}
371
+ }
372
+ response = requests.post(url, json=payload, timeout=10)
373
+ return response.status_code == 200
374
+ except Exception:
375
+ return False
376
+ return False
src/analyzers/metadata_comparator.py ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Metadata comparison between bib entries and fetched metadata.
3
+ """
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ from ..parsers.bib_parser import BibEntry
8
+ from ..fetchers.arxiv_fetcher import ArxivMetadata
9
+ from ..fetchers.scholar_fetcher import ScholarResult
10
+ from ..fetchers.crossref_fetcher import CrossRefResult
11
+ from ..fetchers.semantic_scholar_fetcher import SemanticScholarResult
12
+ from ..fetchers.openalex_fetcher import OpenAlexResult
13
+ from ..fetchers.dblp_fetcher import DBLPResult
14
+ from ..utils.normalizer import TextNormalizer
15
+
16
+
17
@dataclass
class ComparisonResult:
    """Outcome of matching one bibliography entry against externally fetched metadata."""
    entry_key: str

    # --- Title comparison ---
    title_match: bool
    title_similarity: float
    bib_title: str
    fetched_title: str

    # --- Author comparison ---
    author_match: bool
    author_similarity: float
    bib_authors: list[str]
    fetched_authors: list[str]

    # --- Year comparison ---
    year_match: bool
    bib_year: str
    fetched_year: str

    # --- Overall assessment ---
    is_match: bool
    confidence: float
    issues: list[str]
    source: str  # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable'

    @property
    def has_issues(self) -> bool:
        """True when at least one discrepancy was recorded."""
        return bool(self.issues)
48
+
49
+
50
class MetadataComparator:
    """Compares bibliography entries with fetched metadata.

    All public ``compare_with_*`` methods delegate to one shared core
    (:meth:`_compare_metadata`); previously each of the six methods duplicated
    ~40 lines of identical title/author/year comparison logic. The public
    method names, signatures, issue messages and confidence formula are
    unchanged.
    """

    # Similarity thresholds above which a field is considered a match.
    TITLE_THRESHOLD = 0.8
    AUTHOR_THRESHOLD = 0.6

    def __init__(self):
        # The normalizer is used as a namespace of static helpers.
        self.normalizer = TextNormalizer

    def _compare_metadata(self, bib_entry: BibEntry, fetched_title: str,
                          fetched_authors: list[str], fetched_year: str,
                          source: str) -> ComparisonResult:
        """Core comparison shared by every source-specific wrapper.

        Args:
            bib_entry: parsed .bib entry.
            fetched_title: title string from the external source.
            fetched_authors: author names already normalized by the caller.
            fetched_year: year string from the external source (may be "").
            source: source tag stored in the result and used in issue messages.
        """
        issues = []

        # Titles: token-based similarity, plus Levenshtein for short titles
        # where token overlap alone is too coarse.
        bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
        fetched_title_norm = self.normalizer.normalize_for_comparison(fetched_title)

        title_similarity = self.normalizer.similarity_ratio(bib_title_norm, fetched_title_norm)
        if len(bib_title_norm) < 100:
            lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, fetched_title_norm)
            title_similarity = max(title_similarity, lev_sim)

        title_match = title_similarity >= self.TITLE_THRESHOLD
        if not title_match:
            issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")

        # Authors: best-match average of bib authors against the fetched list.
        bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
        author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
        author_match = author_similarity >= self.AUTHOR_THRESHOLD
        if not author_match:
            issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")

        # Years: exact string equality; only flagged when both sides have a year.
        bib_year = bib_entry.year.strip()
        year_match = bib_year == fetched_year
        if not year_match and bib_year and fetched_year:
            issues.append(f"Year mismatch: bib={bib_year}, {source}={fetched_year}")

        # Weighted confidence: title 50%, authors 30%, year 20%
        # (a mismatching or absent year still contributes half its weight).
        is_match = title_match and author_match
        confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2)

        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=title_match,
            title_similarity=title_similarity,
            bib_title=bib_entry.title,
            fetched_title=fetched_title,
            author_match=author_match,
            author_similarity=author_similarity,
            bib_authors=bib_authors,
            fetched_authors=fetched_authors,
            year_match=year_match,
            bib_year=bib_year,
            fetched_year=fetched_year,
            is_match=is_match,
            confidence=confidence,
            issues=issues,
            source=source,
        )

    def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult:
        """Compare bib entry with arXiv metadata."""
        authors = [self.normalizer.normalize_author_name(a) for a in arxiv_meta.authors]
        return self._compare_metadata(bib_entry, arxiv_meta.title, authors, arxiv_meta.year, "arxiv")

    def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> ComparisonResult:
        """Compare bib entry with Scholar search result."""
        # Scholar returns authors as one comma-separated string, not a list.
        authors = [self.normalizer.normalize_author_name(a.strip())
                   for a in scholar_result.authors.split(',')]
        return self._compare_metadata(bib_entry, scholar_result.title, authors, scholar_result.year, "scholar")

    def compare_with_crossref(self, bib_entry: BibEntry, crossref_result: CrossRefResult) -> ComparisonResult:
        """Compare bib entry with CrossRef search result."""
        authors = [self.normalizer.normalize_author_name(a) for a in crossref_result.authors]
        return self._compare_metadata(bib_entry, crossref_result.title, authors, crossref_result.year, "crossref")

    def compare_with_semantic_scholar(self, bib_entry: BibEntry, ss_result: SemanticScholarResult) -> ComparisonResult:
        """Compare bib entry with Semantic Scholar result."""
        authors = [self.normalizer.normalize_author_name(a) for a in ss_result.authors]
        return self._compare_metadata(bib_entry, ss_result.title, authors, ss_result.year, "semantic_scholar")

    def compare_with_openalex(self, bib_entry: BibEntry, oa_result: OpenAlexResult) -> ComparisonResult:
        """Compare bib entry with OpenAlex result."""
        authors = [self.normalizer.normalize_author_name(a) for a in oa_result.authors]
        return self._compare_metadata(bib_entry, oa_result.title, authors, oa_result.year, "openalex")

    def compare_with_dblp(self, bib_entry: BibEntry, dblp_result: DBLPResult) -> ComparisonResult:
        """Compare bib entry with DBLP result."""
        authors = [self.normalizer.normalize_author_name(a) for a in dblp_result.authors]
        return self._compare_metadata(bib_entry, dblp_result.title, authors, dblp_result.year, "dblp")

    def create_unable_result(self, bib_entry: BibEntry, reason: str = "Unable to fetch metadata") -> ComparisonResult:
        """Create result when metadata couldn't be fetched."""
        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=False,
            title_similarity=0.0,
            bib_title=bib_entry.title,
            fetched_title="",
            author_match=False,
            author_similarity=0.0,
            bib_authors=self.normalizer.normalize_author_list(bib_entry.author),
            fetched_authors=[],
            year_match=False,
            bib_year=bib_entry.year,
            fetched_year="",
            is_match=False,
            confidence=0.0,
            issues=[reason],
            source="unable",
        )

    def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float:
        """Average, over list1, of each author's best similarity against list2."""
        if not list1 and not list2:
            return 1.0
        if not list1 or not list2:
            return 0.0

        total_similarity = 0.0
        for author1 in list1:
            best_match = 0.0
            for author2 in list2:
                # Exact structural match (handles abbreviated first names) wins outright.
                if self._names_match(author1, author2):
                    best_match = 1.0
                    break
                sim = self.normalizer.similarity_ratio(author1, author2)
                best_match = max(best_match, sim)
            total_similarity += best_match

        return total_similarity / len(list1)

    def _names_match(self, name1: str, name2: str) -> bool:
        """Check if two names match (handles abbreviated names).

        Primarily compares last words (surnames); also accepts first-word vs
        last-word matches to tolerate "Surname, First" orderings.
        """
        words1 = name1.split()
        words2 = name2.split()

        if not words1 or not words2:
            return False

        if words1[-1] != words2[-1]:
            # Try first word as last name too (name-order differences).
            if words1[0] != words2[-1] and words1[-1] != words2[0]:
                return False

        return True
+ )
src/analyzers/usage_checker.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage checker for bibliography entries in TeX files.
3
+ """
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ from ..parsers.bib_parser import BibEntry
8
+ from ..parsers.tex_parser import TexParser, CitationContext
9
+
10
+
11
@dataclass
class UsageResult:
    """Where and how often a single bibliography entry is cited."""
    entry_key: str            # bib key the result refers to
    is_used: bool             # whether the key appears in any \cite command
    usage_count: int          # number of citation contexts found
    contexts: list[CitationContext]
    line_numbers: list[int]   # one line number per context, in document order

    @property
    def first_usage_line(self) -> Optional[int]:
        """Line of the earliest citation, or None when the entry is never cited."""
        if self.line_numbers:
            return self.line_numbers[0]
        return None
23
+
24
+
25
class UsageChecker:
    """Checks which bibliography entries are actually cited in the TeX files."""

    def __init__(self, tex_parser: TexParser):
        self.tex_parser = tex_parser
        # Snapshot of every key cited anywhere in the document, taken once.
        self._cited_keys = tex_parser.get_all_cited_keys()

    def check_usage(self, entry: BibEntry) -> UsageResult:
        """Check if a bib entry is used in the TeX document."""
        key = entry.key
        contexts = self.tex_parser.get_citation_contexts(key)
        return UsageResult(
            entry_key=key,
            is_used=key in self._cited_keys,
            usage_count=len(contexts),
            contexts=contexts,
            line_numbers=[ctx.line_number for ctx in contexts],
        )

    def get_unused_entries(self, entries: list[BibEntry]) -> list[BibEntry]:
        """Get list of entries that are not cited in the document."""
        return [entry for entry in entries if entry.key not in self._cited_keys]

    def get_missing_entries(self, entries: list[BibEntry]) -> list[str]:
        """Get list of citation keys that don't have corresponding bib entries."""
        entry_keys = {e.key for e in entries}
        return [key for key in self._cited_keys if key not in entry_keys]

    def get_combined_context(self, key: str, max_chars: int = 1000) -> str:
        """Concatenate every usage context for ``key``, capped at ``max_chars``.

        Contexts are joined with a "\\n---\\n" separator. The context that
        would overflow the budget is truncated with a trailing "..." only when
        at least 100 characters of budget remain; otherwise it is dropped.
        """
        contexts = self.tex_parser.get_citation_contexts(key)
        if not contexts:
            return ""

        combined = []
        total_chars = 0
        for ctx in contexts:
            if total_chars + len(ctx.full_context) > max_chars:
                remaining = max_chars - total_chars
                # Only truncate when a meaningful amount still fits.
                if remaining > 100:
                    combined.append(ctx.full_context[:remaining] + "...")
                break
            combined.append(ctx.full_context)
            total_chars += len(ctx.full_context)

        return "\n---\n".join(combined)
src/checkers/__init__.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Checkers module for paper submission quality checks."""
2
+ from .base import BaseChecker, CheckResult, CheckSeverity
3
+ from .caption_checker import CaptionChecker
4
+ from .reference_checker import ReferenceChecker
5
+ from .ai_artifacts_checker import AIArtifactsChecker
6
+ from .formatting_checker import FormattingChecker
7
+ from .anonymization_checker import AnonymizationChecker
8
+ from .number_checker import NumberChecker
9
+ from .sentence_checker import SentenceChecker
10
+ from .consistency_checker import ConsistencyChecker
11
+ from .citation_quality_checker import CitationQualityChecker
12
+ from .equation_checker import EquationChecker
13
+ from .acronym_checker import AcronymChecker
14
+
15
+ __all__ = [
16
+ 'BaseChecker',
17
+ 'CheckResult',
18
+ 'CheckSeverity',
19
+ 'CaptionChecker',
20
+ 'ReferenceChecker',
21
+ 'AIArtifactsChecker',
22
+ 'FormattingChecker',
23
+ 'AnonymizationChecker',
24
+ 'NumberChecker',
25
+ 'SentenceChecker',
26
+ 'ConsistencyChecker',
27
+ 'CitationQualityChecker',
28
+ 'EquationChecker',
29
+ 'AcronymChecker',
30
+ ]
31
+
32
+
33
+ # Registry of all available checkers
34
+ CHECKER_REGISTRY = {
35
+ 'caption': CaptionChecker,
36
+ 'reference': ReferenceChecker,
37
+ 'ai_artifacts': AIArtifactsChecker,
38
+ 'formatting': FormattingChecker,
39
+ 'anonymization': AnonymizationChecker,
40
+ 'number': NumberChecker,
41
+ 'sentence': SentenceChecker,
42
+ 'consistency': ConsistencyChecker,
43
+ 'citation_quality': CitationQualityChecker,
44
+ 'equation': EquationChecker,
45
+ 'acronym': AcronymChecker,
46
+ }
47
+
48
+
49
+ def get_checker(name: str) -> BaseChecker:
50
+ """Get a checker instance by name."""
51
+ if name not in CHECKER_REGISTRY:
52
+ raise ValueError(f"Unknown checker: {name}")
53
+ return CHECKER_REGISTRY[name]()
54
+
55
+
56
+ def run_all_checkers(tex_content: str, config: dict = None) -> list:
57
+ """Run all checkers and return combined results."""
58
+ results = []
59
+ config = config or {}
60
+
61
+ for name, checker_class in CHECKER_REGISTRY.items():
62
+ checker = checker_class()
63
+ checker_results = checker.check(tex_content, config)
64
+ results.extend(checker_results)
65
+
66
+ return results
src/checkers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (2.2 kB). View file
 
src/checkers/__pycache__/acronym_checker.cpython-313.pyc ADDED
Binary file (10.8 kB). View file
 
src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc ADDED
Binary file (6.14 kB). View file
 
src/checkers/__pycache__/anonymization_checker.cpython-313.pyc ADDED
Binary file (8.38 kB). View file
 
src/checkers/__pycache__/base.cpython-313.pyc ADDED
Binary file (7.68 kB). View file
 
src/checkers/__pycache__/caption_checker.cpython-313.pyc ADDED
Binary file (5.63 kB). View file
 
src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc ADDED
Binary file (5.41 kB). View file
 
src/checkers/__pycache__/consistency_checker.cpython-313.pyc ADDED
Binary file (11 kB). View file
 
src/checkers/__pycache__/equation_checker.cpython-313.pyc ADDED
Binary file (5.62 kB). View file
 
src/checkers/__pycache__/formatting_checker.cpython-313.pyc ADDED
Binary file (9.45 kB). View file
 
src/checkers/__pycache__/number_checker.cpython-313.pyc ADDED
Binary file (3.8 kB). View file
 
src/checkers/__pycache__/reference_checker.cpython-313.pyc ADDED
Binary file (8.3 kB). View file
 
src/checkers/__pycache__/sentence_checker.cpython-313.pyc ADDED
Binary file (4.36 kB). View file
 
src/checkers/acronym_checker.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Acronym and abbreviation checker.
3
+
4
+ Validates that:
5
+ - Acronyms found in text have corresponding full forms defined
6
+ - Acronyms are used after their definition
7
+ - Only checks acronyms that have matching full forms in the document
8
+ """
9
+ import re
10
+ from typing import List, Dict, Set, Tuple
11
+ from collections import defaultdict
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class AcronymChecker(BaseChecker):
    """Check acronym definitions and consistency.

    Flags acronyms that (a) have a plausible spelled-out form somewhere in
    the document but are never formally defined, or (b) are used before the
    position of their definition. Acronyms in COMMON_ACRONYMS are exempt.
    """

    name = "acronym"
    display_name = "Acronyms"
    description = "Check acronym definitions and consistent usage"

    # Enhanced pattern to find defined acronyms with LaTeX formatting support
    # Matches: "Full Name (ACRONYM)", "(ACRONYM; Full Name)", "Full Name (\textbf{ACRONYM})", etc.
    # Group 2 holds the acronym for the first alternative, group 3 for the second.
    DEFINITION_PATTERN = re.compile(
        r'([A-Z][a-zA-Z\s\-]+)\s*\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?\)|'  # Full Name (ABC) or Full Name (\textbf{ABC})
        r'\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?;\s*([A-Za-z\s\-]+)\)',  # (ABC; Full Name) or (\textbf{ABC}; Full Name)
        re.MULTILINE
    )

    # Pattern to find standalone acronyms (3+ capital letters, optional plural 's')
    ACRONYM_PATTERN = re.compile(r'\b([A-Z]{3,}s?)\b')

    # Comprehensive list of common acronyms that don't need definition.
    # NOTE(review): a few entries appear twice ('SSD', 'EM', 'MAP', 'AAAI',
    # 'IJCAI', 'GPU') — harmless in a set literal, but could be deduplicated.
    COMMON_ACRONYMS = {
        # Hardware & Computing
        'GPU', 'CPU', 'TPU', 'RAM', 'ROM', 'SSD', 'HDD', 'USB', 'BIOS', 'OS',
        'API', 'SDK', 'IDE', 'GUI', 'CLI', 'URL', 'URI', 'DNS', 'IP', 'TCP',
        'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS', 'VPN', 'LAN', 'WAN',

        # File Formats & Standards
        'PDF', 'HTML', 'CSS', 'XML', 'JSON', 'YAML', 'CSV', 'TSV', 'SQL',
        'UTF', 'ASCII', 'JPEG', 'PNG', 'GIF', 'SVG', 'MP3', 'MP4', 'ZIP',

        # AI & Machine Learning (General)
        'AI', 'ML', 'DL', 'NN', 'ANN', 'DNN', 'CNN', 'RNN', 'LSTM', 'GRU',
        'GAN', 'VAE', 'MLP', 'SVM', 'KNN', 'PCA', 'ICA', 'LDA', 'EM',
        'SGD', 'ADAM', 'RMSPROP', 'ADAGRAD', 'LBFGS',

        # NLP & Language Models
        'NLP', 'LLM', 'GPT', 'BERT', 'BART', 'T5', 'ELECTRA', 'ROBERTA',
        'NER', 'POS', 'QA', 'MT', 'ASR', 'TTS', 'NMT', 'SMT',
        'BLEU', 'ROUGE', 'METEOR', 'CIDEr', 'SPICE', 'WER', 'CER',

        # Computer Vision
        'CV', 'OCR', 'YOLO', 'RCNN', 'SSD', 'FCN', 'UNET', 'RESNET', 'VGG',
        'RGB', 'HSV', 'YUV', 'SIFT', 'SURF', 'ORB', 'HOG', 'SSIM', 'PSNR',

        # Reinforcement Learning
        'RL', 'DQN', 'DDPG', 'PPO', 'A3C', 'TRPO', 'SAC', 'TD3', 'MDP',
        'POMDP', 'RLHF', 'RLAIF',

        # Metrics & Evaluation
        'F1', 'AUC', 'ROC', 'PR', 'MAP', 'NDCG', 'MRR', 'MSE', 'MAE', 'RMSE',
        'MAPE', 'R2', 'IoU', 'AP', 'mAP', 'FPS', 'FLOPs', 'FLOPS',

        # Data & Statistics
        'IID', 'OOD', 'KL', 'JS', 'EMD', 'MMD', 'ELBO', 'VI', 'MCMC',
        'MLE', 'MAP', 'EM', 'GMM', 'HMM', 'CRF', 'MRF',

        # Academic & Organizations
        'IEEE', 'ACM', 'AAAI', 'IJCAI', 'ICML', 'ICLR', 'NEURIPS', 'NIPS',
        'ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL', 'CVPR', 'ICCV', 'ECCV',
        'SIGIR', 'KDD', 'WWW', 'CIKM', 'WSDM', 'ICDE', 'VLDB', 'SIGMOD',
        'AAAI', 'IJCAI', 'AISTATS', 'UAI', 'COLT', 'ALT',

        # Methods & Techniques (Common in ML papers)
        'SOTA', 'E2E', 'RAG', 'CoT', 'ToT', 'GoT', 'ICL', 'FSL', 'ZSL',
        'PEFT', 'LORA', 'QLORA', 'SFT', 'DPO', 'SPIN', 'URPO', 'SPELL',
        'STaR', 'ReST', 'RRHF', 'RAFT', 'LIMA', 'ORPO',

        # Misc
        'USD', 'EUR', 'GBP', 'EU', 'US', 'UK', 'UN', 'NATO', 'NASA',
        'ID', 'UID', 'UUID', 'MD5', 'SHA', 'AES', 'RSA', 'JWT',
        'CRUD', 'REST', 'SOAP', 'RPC', 'AJAX', 'DOM', 'OOP', 'MVC',
        'CI', 'CD', 'DevOps', 'AWS', 'GCP', 'GPU', 'NPU', 'ASIC', 'FPGA',
    }

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run the acronym checks and return one CheckResult per issue found.

        Only acronyms for which a plausible full form exists in the document
        are reported, to keep the false-positive rate low.
        """
        results = []

        # Remove comments using base class method
        content = self._remove_comments(tex_content)

        # Find all defined acronyms with their positions
        defined_acronyms = self._find_definitions(content)

        # Find all acronym usages (excluding special contexts)
        all_usages = self._find_all_usages(content)

        # NEW: Find potential full forms for each acronym
        acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())

        # Check for undefined acronyms (only those with matching full forms)
        for acronym, positions in all_usages.items():
            if acronym in self.COMMON_ACRONYMS:
                continue

            # Skip if no matching full form found in document
            if acronym not in acronym_full_forms:
                continue

            if acronym not in defined_acronyms:
                # First usage should define it
                first_pos = positions[0]
                line_num = self._find_line_number(content, first_pos)
                full_form = acronym_full_forms[acronym]

                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Acronym '{acronym}' used without definition (possible full form: '{full_form}')",
                    line_number=line_num,
                    suggestion=f"Define on first use: '{full_form} ({acronym})'"
                ))
            else:
                # Check if used before definition; report only the first
                # offending occurrence per acronym (hence the break).
                def_pos = defined_acronyms[acronym]
                for pos in positions:
                    if pos < def_pos:
                        line_num = self._find_line_number(content, pos)
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"Acronym '{acronym}' used before definition",
                            line_number=line_num,
                            suggestion="Move definition before first use"
                        ))
                        break

        return results

    def _find_potential_full_forms(self, content: str, acronyms: Set[str]) -> Dict[str, str]:
        """Find potential full forms for acronyms by matching capital letters.

        For each acronym "ABC", builds a regex for a word sequence whose
        initial letters spell A-B-C and returns the first non-sentence-like
        match per acronym.
        """
        full_forms = {}

        for acronym in acronyms:
            if acronym in self.COMMON_ACRONYMS:
                continue

            # Build regex pattern to match full form
            # For "ABC", match words starting with A, B, C
            acronym_clean = acronym.rstrip('s')  # Remove plural
            if len(acronym_clean) < 3:
                continue

            # Create pattern: match sequence of words where first letters spell the acronym
            # Allow optional words in between (like "of", "the", "and")
            pattern_parts = []
            for i, letter in enumerate(acronym_clean):
                if i == 0:
                    # First word must start with the letter
                    pattern_parts.append(f'{letter}[a-z]+')
                else:
                    # Subsequent words: allow optional filler words.
                    # NOTE(review): both the optional filler group and the
                    # mandatory \s+ after it require whitespace, so a filler
                    # word with normal single spacing ("Model of X") cannot
                    # match — confirm whether the trailing \s+ should be
                    # inside the alternative instead.
                    pattern_parts.append(f'(?:\\s+(?:of|the|and|for|in|on|with|to)\\s+)?\\s+{letter}[a-z]+')

            full_pattern = r'\b' + ''.join(pattern_parts) + r'\b'

            try:
                matches = re.finditer(full_pattern, content, re.IGNORECASE)
                for match in matches:
                    candidate = match.group(0)

                    # Skip if candidate contains common non-content words
                    # These words indicate the match is part of a sentence, not an acronym full form
                    excluded_words = {
                        'that', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                        'or', 'not', 'no', 'yes', 'if', 'but', 'as', 'at', 'by', 'from',
                        'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
                        'can', 'could', 'may', 'might', 'must', 'shall',
                        'this', 'these', 'those', 'such', 'which', 'what', 'who', 'when', 'where',
                        'how', 'why', 'all', 'each', 'every', 'some', 'any', 'many', 'much',
                        'more', 'most', 'less', 'few', 'several', 'other', 'another'
                    }

                    candidate_words = re.findall(r'\b[A-Za-z]+\b', candidate.lower())
                    if any(word in excluded_words for word in candidate_words):
                        continue

                    # Verify: extract first letters and check if they match acronym
                    words = re.findall(r'\b[A-Z][a-z]+', candidate, re.IGNORECASE)
                    # Filter out filler words (allowed in between but not counted)
                    filler_words = {'of', 'and', 'for', 'in', 'on', 'with', 'to', 'a', 'an'}
                    meaningful_words = [w for w in words if w.lower() not in filler_words]

                    if len(meaningful_words) >= len(acronym_clean):
                        first_letters = ''.join(w[0].upper() for w in meaningful_words[:len(acronym_clean)])
                        if first_letters == acronym_clean:
                            full_forms[acronym] = candidate
                            break  # Found a match, use the first one
            except re.error:
                # Invalid regex, skip this acronym
                continue

        return full_forms

    def _find_definitions(self, content: str) -> Dict[str, int]:
        """Find all acronym definitions and their character positions.

        Returns a mapping of acronym (plural 's' stripped) -> start offset
        of its definition; a later definition overwrites an earlier one.
        """
        definitions = {}

        for match in self.DEFINITION_PATTERN.finditer(content):
            # Get acronym from either pattern alternative (group 2 or 3)
            acronym = match.group(2) or match.group(3)
            if acronym:
                acronym = acronym.rstrip('s')  # Remove plural
                definitions[acronym] = match.start()

        return definitions

    def _find_all_usages(self, content: str) -> Dict[str, List[int]]:
        """Find all acronym usages, excluding special contexts.

        Returns acronym -> list of character offsets, in document order.
        """
        usages = defaultdict(list)

        for match in self.ACRONYM_PATTERN.finditer(content):
            acronym = match.group(1).rstrip('s')
            pos = match.start()

            # Skip if in special context (math, labels, URLs, commands, ...)
            if self._is_in_special_context(content, pos, acronym):
                continue

            usages[acronym].append(pos)

        return usages

    def _is_in_special_context(self, content: str, pos: int, acronym: str) -> bool:
        """Check if acronym at position is in a special context that should be ignored.

        Uses a +/-50 character window of surrounding text plus cheap
        heuristics; these are approximations, not a LaTeX parser.
        """
        # Get surrounding context
        start = max(0, pos - 50)
        end = min(len(content), pos + len(acronym) + 50)
        before = content[start:pos]
        after = content[pos + len(acronym):end]

        # Skip if inside definition parentheses: (ACRONYM)
        if before.endswith('(') and after.startswith(')'):
            return True

        # Skip if inside LaTeX command: \ACRONYM or \command{ACRONYM}
        if before.rstrip().endswith('\\'):
            return True

        # Skip if inside label: \label{...:ACRONYM...}
        if r'\label{' in before[-20:] and '}' in after[:20]:
            return True

        # Skip if inside ref: \ref{...:ACRONYM...}
        if re.search(r'\\(?:ref|cite|autoref|cref|eqref)\{[^}]*$', before[-30:]):
            return True

        # Skip if inside URL: \url{...ACRONYM...} or http://...ACRONYM...
        if r'\url{' in before[-20:] or 'http' in before[-20:]:
            return True

        # Skip if inside math mode (simple heuristic)
        # Count unescaped $ signs before position; odd count => inside math.
        # NOTE(review): 'before' is only a 50-char window, so the parity test
        # can misjudge math mode spanning a longer distance.
        dollar_count = before.count('$') - before.count(r'\$')
        if dollar_count % 2 == 1:  # Odd number means we're inside math mode
            return True

        # Skip if inside \begin{equation} or similar (within the last 100 chars)
        if re.search(r'\\begin\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
            if not re.search(r'\\end\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
                return True

        # Skip if it looks like a LaTeX command argument: \command[ACRONYM]
        if before.endswith('[') and after.startswith(']'):
            return True

        # Skip if part of a file path or extension
        if '.' in before[-5:] or '/' in before[-10:]:
            return True

        return False
src/checkers/ai_artifacts_checker.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI artifacts checker.
3
+
4
+ Detects leftover text from AI writing assistants that should be removed
5
+ before submission, such as:
6
+ - Conversational responses ("Sure, here is...")
7
+ - Placeholder text
8
+ - Markdown formatting artifacts
9
+ - Common AI response patterns
10
+ """
11
+ import re
12
+ from typing import List, Tuple
13
+
14
+ from .base import BaseChecker, CheckResult, CheckSeverity
15
+
16
+
17
class AIArtifactsChecker(BaseChecker):
    """Detect AI-generated text artifacts that should be removed.

    Scans line by line for three classes of residue: conversational AI
    phrases (ERROR), placeholder text (WARNING), and Markdown formatting
    that leaked into LaTeX (INFO). Verbatim-like environments are skipped.
    """

    name = "ai_artifacts"
    display_name = "AI Artifacts"
    description = "Detect leftover AI assistant text and placeholders"

    # Conversational AI patterns (case insensitive)
    # These are phrases that clearly indicate a dialogue between user and AI assistant
    AI_CONVERSATION_PATTERNS = [
        # Responses to requests
        (r'\bsure[,!]?\s*(here\s+is|i\'ll|i\s+will|let\s+me)\b', "Conversational AI response"),
        (r'\bi\'?d\s+be\s+happy\s+to\b', "Conversational AI response"),
        (r'\bi\'?m\s+happy\s+to\s+help\b', "Conversational AI response"),
        (r'\bcertainly[!,]\s*here\b', "Conversational AI response"),
        (r'\bof\s+course[!,]\s*(here|i)\b', "Conversational AI response"),
        (r'\babsolutely[!,]\s*(here|let\s+me)\b', "Conversational AI response"),

        # Self-identification
        (r'\bas\s+an?\s+ai\s+(language\s+)?model\b', "AI self-reference"),
        (r'\bas\s+a\s+large\s+language\s+model\b', "AI self-reference"),
        (r'\bmy\s+knowledge\s+cutoff\b', "AI knowledge cutoff reference"),

        # Explanatory transitions typical of chat
        (r'\blet\s+me\s+(explain|help|clarify|break\s+this\s+down)\b', "Conversational AI response"),
        (r'\bhere\'?s\s+(a|an|the|my)\s+(revised|updated|improved|rewrite)\b', "Conversational AI response"),
        (r'\bhere\s+is\s+(the|a|an)\s+(summary|breakdown|explanation|code|example)\b', "Conversational AI response"),

        # Closing/Politeness
        (r'\bhope\s+this\s+helps\b', "Conversational AI closing"),
        (r'\bfeel\s+free\s+to\s+ask\b', "Conversational AI closing"),
        (r'\blet\s+me\s+know\s+if\b', "Conversational AI closing"),
        (r'\bthank\s+you\s+for\s+(asking|your\s+question)\b', "Conversational AI response"),
        (r'\bgreat\s+question[!,]?\b', "Conversational AI response"),
        (r'\b(excellent|good|great)\s+point\b', "Conversational AI response"),

        # Instructions/Meta-commentary
        (r'\bbased\s+on\s+the\s+information\s+provided\b', "Conversational AI response"),
        # NOTE(review): "(remember|note) that" also matches standard academic
        # phrasing such as "Note that ...", so this ERROR-severity pattern is
        # prone to false positives — confirm intended.
        (r'\b(remember|note)\s+that\b', "Conversational AI instruction"),
        (r'\bplease\s+note\s+that\b', "Conversational AI instruction"),
    ]

    # Placeholder patterns
    PLACEHOLDER_PATTERNS = [
        (r'\[insert\s+[^\]]+\s*here\]', "Placeholder text"),
        (r'\[add\s+[^\]]+\]', "Placeholder text"),
        (r'\[todo[:\s][^\]]*\]', "TODO placeholder"),
        (r'\btodo\s*:\s*.{0,50}', "TODO comment"),
        (r'\bfixme\s*:\s*.{0,50}', "FIXME comment"),
        (r'\bxxx\b', "XXX placeholder"),
        (r'\byour[\s_-]*(name|email|institution|university)\b', "Placeholder for personal info"),
        (r'author[\s_-]*name', "Author name placeholder"),
        (r'your\.?email@example\.com', "Email placeholder"),
        (r'example@(example\.com|university\.edu)', "Email placeholder"),
        (r'\[citation\s+needed\]', "Citation needed placeholder"),
    ]

    # Markdown artifacts (should not appear in LaTeX)
    MARKDOWN_PATTERNS = [
        (r'^\s*#{1,6}\s+\w', "Markdown header"),
        (r'\*\*[^*]+\*\*', "Markdown bold"),
        (r'(?<!\*)\*[^*\s][^*]*[^*\s]\*(?!\*)', "Markdown italic"),
        (r'(?<!`)`[^`\n]+`(?!`)', "Markdown inline code"),
        (r'```[\s\S]*?```', "Markdown code block"),
        (r'^\s*[-*+]\s+\w', "Markdown bullet point"),
        (r'^\s*\d+\.\s+\w', "Markdown numbered list"),
        (r'\[([^\]]+)\]\(([^)]+)\)', "Markdown link"),
    ]



    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan each line for AI artifacts; returns one CheckResult per hit.

        Lines inside verbatim-like environments and pure comment lines are
        skipped; inline comments are stripped before matching.
        """
        results = []
        lines = tex_content.split('\n')

        # Track if we are inside a verbatim-like environment.
        # NOTE(review): a single boolean flag — nested or same-line
        # begin/end pairs are not tracked.
        in_verbatim = False
        verbatim_envs = ['verbatim', 'lstlisting', 'minted', 'comment', 'raw', 'filecontents', 'tcolorbox']

        # Check each line
        for line_num, line in enumerate(lines, 1):
            # Check for environment boundaries
            # Handle \begin{env}
            if re.search(r'\\begin\{(' + '|'.join(verbatim_envs) + r')\*?\}', line):
                in_verbatim = True
                continue  # Skip the begin line itself

            # Handle \end{env}
            if re.search(r'\\end\{(' + '|'.join(verbatim_envs) + r')\*?\}', line):
                in_verbatim = False
                continue  # Skip the end line itself

            # Skip checks if inside verbatim environment
            if in_verbatim:
                continue

            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments for checking using base class method
            line_to_check = self._remove_line_comment(line)

            # Check AI conversation patterns
            for pattern, description in self.AI_CONVERSATION_PATTERNS:
                if re.search(pattern, line_to_check, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{description} detected",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Remove AI-generated conversational text"
                    ))
                    break  # One match per line for this category

            # Check placeholder patterns (every matching pattern is reported)
            for pattern, description in self.PLACEHOLDER_PATTERNS:
                match = re.search(pattern, line_to_check, re.IGNORECASE)
                if match:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"{description}: '{match.group(0)[:50]}'",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace placeholder with actual content or remove"
                    ))

            # Check Markdown patterns (less strict - might be intentional in some cases)
            for pattern, description in self.MARKDOWN_PATTERNS:
                # Skip if line looks like a LaTeX command (starts with \)
                if line_to_check.strip().startswith('\\'):
                    continue

                # Special handling for bullet points: ensure space after
                if "bullet point" in description:
                    # Skip if it looks like a math subtraction or negative number
                    if re.search(r'[-+]\d', line_to_check):
                        continue
                    # Skip if inside math mode (simple heuristic)
                    if '$' in line_to_check:
                        continue

                # Special handling for italics: avoid matching math mode like $x*y$
                if "italic" in description:
                    if '$' in line_to_check:
                        continue

                if re.search(pattern, line_to_check):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Possible {description} in LaTeX",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Convert to LaTeX formatting or remove if unintentional"
                    ))

        return results
src/checkers/anonymization_checker.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Anonymization checker.
3
+
4
+ For double-blind review submissions, checks for:
5
+ - Author name leaks in acknowledgments
6
+ - Personal URLs (GitHub, personal pages)
7
+ - Self-citations that reveal identity
8
+ - Institutional information in comments
9
+ """
10
+ import re
11
+ from typing import List
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class AnonymizationChecker(BaseChecker):
    """Check for anonymization issues in double-blind submissions.

    If the document looks like a camera-ready version, all checks are
    skipped; otherwise it flags personal URLs (including those inside
    comments), un-commented acknowledgments, self-revealing citation
    phrasing, and a non-anonymous \\author field.
    """

    name = "anonymization"
    display_name = "Anonymization"
    description = "Detect potential identity leaks in double-blind submissions"

    # Patterns for identity-revealing content: (regex, human-readable label)
    PERSONAL_URL_PATTERNS = [
        (r'github\.com/[a-zA-Z0-9_-]+', "GitHub profile URL"),
        (r'gitlab\.com/[a-zA-Z0-9_-]+', "GitLab profile URL"),
        (r'twitter\.com/[a-zA-Z0-9_]+', "Twitter profile URL"),
        (r'linkedin\.com/in/[a-zA-Z0-9_-]+', "LinkedIn profile URL"),
        (r'huggingface\.co/[a-zA-Z0-9_-]+', "HuggingFace profile URL"),
        (r'~[a-zA-Z]+/', "Personal university page"),
        (r'people\.[a-zA-Z]+\.edu', "Academic personal page"),
        (r'homes\.[a-zA-Z]+\.(edu|ac\.[a-z]+)', "Academic home page"),
    ]

    # Anonymous submission indicators (should be present)
    # NOTE(review): this list is not referenced anywhere in this class —
    # confirm whether it was meant to feed _is_review_version().
    ANONYMOUS_MARKERS = [
        r'\\author\{[^}]*anonymous[^}]*\}',
        r'anonymous\s+submission',
        r'\\runningauthor\{[^}]*\}',  # Should be empty or generic
    ]

    # Potentially revealing patterns
    SELF_CITE_PATTERNS = [
        r'\\cite[pt]?\{[^}]*\}\s*(?:show|demonstrate|propose|present|introduce)',
        r'(?:our|we)\s+(?:previous|prior|earlier)\s+(?:work|paper|study)',
        r'(?:as\s+)?(?:we|the\s+authors?)\s+(?:have\s+)?(?:shown|demonstrated|proved)',
    ]

    # Acknowledgment patterns: matches \section{Acknowledgment...} (starred
    # or not) and \begin{ack...} environments
    ACK_PATTERN = re.compile(
        r'\\(?:section\*?\{acknowledgment|begin\{ack)',
        re.IGNORECASE
    )

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all anonymization checks and return the collected results."""
        results = []
        lines = tex_content.split('\n')

        # Check if this is a review submission (look for anonymous author)
        is_review_version = self._is_review_version(tex_content)

        if not is_review_version:
            # If camera-ready, skip anonymization checks
            results.append(self._create_result(
                passed=True,
                severity=CheckSeverity.INFO,
                message="Document appears to be camera-ready version (not checking anonymization)"
            ))
            return results

        # Check for personal URLs
        for line_num, line in enumerate(lines, 1):
            # Comment lines are still scanned (WARNING instead of ERROR):
            # the URL could be revealed when the source is shared/compiled.
            if self._is_comment_line(line):
                for pattern, desc in self.PERSONAL_URL_PATTERNS:
                    if re.search(pattern, line, re.IGNORECASE):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"{desc} in comment (could be revealed when compiling)",
                            line_number=line_num,
                            line_content=line.strip()[:100],
                            suggestion="Remove or anonymize URL even in comments"
                        ))
                continue

            for pattern, desc in self.PERSONAL_URL_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{desc} may reveal author identity",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace with anonymized URL or remove for review"
                    ))

        # Check acknowledgments section
        ack_results = self._check_acknowledgments(tex_content, lines)
        results.extend(ack_results)

        # Check for self-revealing citations
        for line_num, line in enumerate(lines, 1):
            # Skip comments using base class method
            if self._is_comment_line(line):
                continue

            for pattern in self.SELF_CITE_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Potentially self-revealing citation pattern",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
                    ))

        # Check for \author content
        author_results = self._check_author_field(tex_content)
        results.extend(author_results)

        return results

    def _is_review_version(self, content: str) -> bool:
        """Detect if this is a review (anonymous) version.

        Only the first 2000 characters (the preamble region) are scanned.
        Returns True by default when no indicator is found — the safer
        assumption for an anonymization check.
        """
        # Check for common anonymous submission markers
        review_indicators = [
            r'review',
            r'submitted\s+to',
            r'under\s+review',
            r'anonymous',
            r'\\usepackage\[review\]',
        ]

        for indicator in review_indicators:
            if re.search(indicator, content[:2000], re.IGNORECASE):
                return True

        # Check for camera-ready indicators (negative)
        camera_indicators = [
            r'\\usepackage\[accepted\]',
            r'\\usepackage\[final\]',
            r'camera[\s-]?ready',
        ]

        for indicator in camera_indicators:
            if re.search(indicator, content[:2000], re.IGNORECASE):
                return False

        # Default to review version (safer)
        return True

    def _check_acknowledgments(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check acknowledgments section for identity leaks.

        Flags the first acknowledgments section found unless its line is
        commented out.
        """
        results = []

        # Find acknowledgment section
        ack_match = self.ACK_PATTERN.search(content)
        if not ack_match:
            return results

        # Find the line number
        ack_line = self._find_line_number(content, ack_match.start())

        # Check if it's commented out
        actual_line = lines[ack_line - 1] if ack_line <= len(lines) else ""
        if not actual_line.lstrip().startswith('%'):
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Acknowledgments section found - should be commented out for review",
                line_number=ack_line,
                suggestion="Comment out acknowledgments with % for anonymous submission"
            ))

        return results

    def _check_author_field(self, content: str) -> List[CheckResult]:
        """Check \\author{} field for revealing content.

        Extracts the (possibly multiline, brace-nested) author argument and
        flags it when it contains what looks like a real "First Last" name
        and no anonymization marker.
        """
        results = []

        # Find \author{...} - handle multiline
        author_pattern = re.compile(r'\\author\s*\{', re.DOTALL)
        match = author_pattern.search(content)

        if match:
            # Extract author content (handle nested braces)
            start = match.end()
            brace_count = 1
            i = start
            while i < len(content) and brace_count > 0:
                if content[i] == '{':
                    brace_count += 1
                elif content[i] == '}':
                    brace_count -= 1
                i += 1

            author_content = content[start:i-1]
            line_num = self._find_line_number(content, match.start())

            # Check if author content looks anonymous
            if not re.search(r'anonymous|author\s*names?\s*hidden', author_content, re.IGNORECASE):
                # Check if it's not using \Anonymous or similar
                if not re.search(r'\\(Anonymous|blindauthor)', author_content):
                    # Might contain real author info: two capitalized words in a row
                    if re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', author_content):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.ERROR,
                            message="Author field may contain real names",
                            line_number=line_num,
                            suggestion="Replace with 'Anonymous' or use anonymization command"
                        ))

        return results
src/checkers/base.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base checker class for paper submission quality checks.
3
+
4
+ All specific checkers inherit from BaseChecker and implement
5
+ the check() method to validate specific aspects of the TeX document.
6
+ """
7
+ import re
8
+ from abc import ABC, abstractmethod
9
+ from dataclasses import dataclass
10
+ from enum import Enum
11
+ from typing import List, Optional, Tuple
12
+
13
+
14
class CheckSeverity(Enum):
    """Severity levels for check results, from blocking to advisory."""

    # Must fix before submission.
    ERROR = "error"
    # Strongly recommended to fix.
    WARNING = "warning"
    # Suggestion or best practice.
    INFO = "info"
19
+
20
+
21
@dataclass
class CheckResult:
    """Result of a single check performed by one checker."""

    checker_name: str                      # machine name of the originating checker
    passed: bool                           # True when no issue was found
    severity: CheckSeverity                # how serious the finding is
    message: str                           # human-readable description
    line_number: Optional[int] = None      # 1-based line of the finding, if known
    line_content: Optional[str] = None     # offending line excerpt, if captured
    suggestion: Optional[str] = None       # recommended fix, if any
    file_path: Optional[str] = None        # source file, if known

    def to_dict(self) -> dict:
        """Serialize this result to a plain, JSON-friendly dict."""
        return dict(
            checker=self.checker_name,
            passed=self.passed,
            severity=self.severity.value,
            message=self.message,
            line=self.line_number,
            content=self.line_content,
            suggestion=self.suggestion,
            file_path=self.file_path,
        )
44
+
45
+
46
class BaseChecker(ABC):
    """
    Abstract base class for all paper submission checkers.

    Each checker validates a specific aspect of the paper,
    such as caption placement, reference integrity, or formatting.
    Subclasses override the metadata attributes and implement check().
    """

    # Checker metadata - override in subclasses
    name: str = "base"
    display_name: str = "Base Checker"
    description: str = "Base checker class"

    @abstractmethod
    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """
        Run the check on the given TeX content.

        Args:
            tex_content: The full content of the TeX file
            config: Optional configuration dict (e.g., conference-specific settings)

        Returns:
            List of CheckResult objects describing found issues
        """
        pass

    @staticmethod
    def _percent_is_escaped(text: str, idx: int) -> bool:
        """Return True if the '%' at *idx* in *text* is escaped.

        A '%' is escaped when it is preceded by an odd-length run of
        backslashes (e.g. '\\%' is escaped, '\\\\%' is not).
        """
        run = 0
        k = idx - 1
        while k >= 0 and text[k] == '\\':
            run += 1
            k -= 1
        return run % 2 == 1

    def _remove_line_comment(self, line: str) -> str:
        """Return *line* truncated at its first unescaped '%', if any."""
        for idx, ch in enumerate(line):
            if ch == '%' and not self._percent_is_escaped(line, idx):
                return line[:idx]
        return line

    def _remove_comments(self, content: str) -> str:
        """
        Remove all LaTeX comments from content.

        Preserves line structure (each line keeps only the text before its
        comment). Escaped percent signs (\\%) are left intact.
        """
        return '\n'.join(
            self._remove_line_comment(raw) for raw in content.split('\n')
        )

    def _is_comment_line(self, line: str) -> bool:
        """True if the line's first non-whitespace character is '%'."""
        stripped = line.lstrip()
        return bool(stripped) and stripped.startswith('%')

    def _get_non_comment_lines(self, content: str) -> List[Tuple[int, str]]:
        """
        Get all non-comment lines with their line numbers.

        Returns:
            List of (line_number, line_content) tuples for non-comment lines.
            Line numbers are 1-based; inline comments are stripped and lines
            that become empty after stripping are dropped.
        """
        kept = []
        for number, raw in enumerate(content.split('\n'), 1):
            # Drop lines that are comments in their entirety
            if self._is_comment_line(raw):
                continue
            cleaned = self._remove_line_comment(raw)
            # Drop lines with no remaining content either
            if cleaned.strip():
                kept.append((number, cleaned))
        return kept

    def _find_line_number(self, content: str, position: int) -> int:
        """Return the 1-based line number of character *position*."""
        return content.count('\n', 0, position) + 1

    def _get_line_content(self, content: str, line_number: int) -> str:
        """Return the stripped text of 1-based *line_number* ('' if out of range)."""
        all_lines = content.split('\n')
        if not 1 <= line_number <= len(all_lines):
            return ""
        return all_lines[line_number - 1].strip()

    def _is_commented(self, content: str, position: int) -> bool:
        """Check whether *position* lies after an unescaped '%' on its line."""
        # Text from the start of the current line up to the position
        line_start = content.rfind('\n', 0, position) + 1
        prefix = content[line_start:position]
        return any(
            ch == '%' and not self._percent_is_escaped(prefix, idx)
            for idx, ch in enumerate(prefix)
        )

    def _create_result(
        self,
        passed: bool,
        severity: CheckSeverity,
        message: str,
        line_number: Optional[int] = None,
        line_content: Optional[str] = None,
        suggestion: Optional[str] = None
    ) -> CheckResult:
        """Helper to create a CheckResult stamped with this checker's name."""
        return CheckResult(
            checker_name=self.name,
            passed=passed,
            severity=severity,
            message=message,
            line_number=line_number,
            line_content=line_content,
            suggestion=suggestion,
        )
193
+
src/checkers/caption_checker.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Caption placement checker.
3
+
4
+ Validates that:
5
+ - Table captions appear ABOVE the table content
6
+ - Figure captions appear BELOW the figure content
7
+ """
8
+ import re
9
+ from typing import List
10
+
11
+ from .base import BaseChecker, CheckResult, CheckSeverity
12
+
13
+
14
class CaptionChecker(BaseChecker):
    """Check for correct caption placement in tables and figures.

    Convention enforced:
    - table captions appear ABOVE the tabular content
    - figure captions appear BELOW the graphics/tikz content
    """

    name = "caption"
    display_name = "Caption Placement"
    description = "Verify table captions are above and figure captions are below"

    # Patterns for environments. DOTALL lets bodies span lines; the \*?
    # also matches the starred (table*/figure*) two-column variants.
    TABLE_ENV_PATTERN = re.compile(
        r'\\begin\{table\*?\}(.*?)\\end\{table\*?\}',
        re.DOTALL | re.IGNORECASE
    )
    FIGURE_ENV_PATTERN = re.compile(
        r'\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}',
        re.DOTALL | re.IGNORECASE
    )

    # Content patterns. The [\[{] after \caption accepts both the
    # short-caption form \caption[...]{...} and the plain \caption{...}.
    CAPTION_PATTERN = re.compile(r'\\caption\s*[\[{]')
    TABULAR_PATTERN = re.compile(r'\\begin\{tabular')
    INCLUDEGRAPHICS_PATTERN = re.compile(r'\\includegraphics')
    TIKZ_PATTERN = re.compile(r'\\begin\{tikzpicture\}')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan every table/figure environment and report caption problems."""
        results = []

        # Check table environments
        for match in self.TABLE_ENV_PATTERN.finditer(tex_content):
            env_content = match.group(1)
            env_start = match.start()

            # Skip if commented
            if self._is_commented(tex_content, env_start):
                continue

            result = self._check_table_caption(env_content, tex_content, env_start)
            if result:
                results.append(result)

        # Check figure environments
        for match in self.FIGURE_ENV_PATTERN.finditer(tex_content):
            env_content = match.group(1)
            env_start = match.start()

            # Skip if commented
            if self._is_commented(tex_content, env_start):
                continue

            result = self._check_figure_caption(env_content, tex_content, env_start)
            if result:
                results.append(result)

        return results

    def _check_table_caption(self, env_content: str, full_content: str, env_start: int) -> CheckResult:
        """Check that table caption is above tabular content.

        Returns a failing CheckResult, or None when placement is fine or the
        environment contains no tabular block to compare against.
        """
        caption_match = self.CAPTION_PATTERN.search(env_content)
        tabular_match = self.TABULAR_PATTERN.search(env_content)

        if not caption_match:
            line_num = self._find_line_number(full_content, env_start)
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Table environment missing caption",
                line_number=line_num,
                suggestion="Add \\caption{} before \\begin{tabular}"
            )

        if not tabular_match:
            # Table without tabular content - skip
            return None

        # Caption should come BEFORE tabular
        if caption_match.start() > tabular_match.start():
            # env_start + offset maps the within-environment match position
            # back to a position in the full document for line reporting.
            line_num = self._find_line_number(full_content, env_start + caption_match.start())
            return self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message="Table caption should be placed ABOVE the table content",
                line_number=line_num,
                line_content=self._get_line_content(full_content, line_num),
                suggestion="Move \\caption{} before \\begin{tabular}"
            )

        return None

    def _check_figure_caption(self, env_content: str, full_content: str, env_start: int) -> CheckResult:
        """Check that figure caption is below image content.

        Returns a failing CheckResult, or None when placement is fine or the
        figure has neither \\includegraphics nor a tikzpicture.
        """
        caption_match = self.CAPTION_PATTERN.search(env_content)
        graphics_match = self.INCLUDEGRAPHICS_PATTERN.search(env_content)
        tikz_match = self.TIKZ_PATTERN.search(env_content)

        # Find the actual content (either graphics or tikz)
        content_match = graphics_match or tikz_match

        if not caption_match:
            line_num = self._find_line_number(full_content, env_start)
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Figure environment missing caption",
                line_number=line_num,
                suggestion="Add \\caption{} after \\includegraphics"
            )

        if not content_match:
            # Figure without graphics/tikz - could be custom content, skip
            return None

        # Caption should come AFTER content
        if caption_match.start() < content_match.start():
            line_num = self._find_line_number(full_content, env_start + caption_match.start())
            return self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message="Figure caption should be placed BELOW the figure content",
                line_number=line_num,
                line_content=self._get_line_content(full_content, line_num),
                suggestion="Move \\caption{} after \\includegraphics"
            )

        return None
src/checkers/citation_quality_checker.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Citation quality checker.
3
+
4
+ Validates:
5
+ - Old citations (>30 years) that might need updating
6
+ - Citation formatting patterns (et al., hardcoded citations, etc.)
7
+ """
8
+ import re
9
+ from typing import List, Dict
10
+ from datetime import datetime
11
+ from collections import defaultdict
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class CitationQualityChecker(BaseChecker):
    """Check citation quality and balance.

    Works on the raw .tex source only (no .bib access): flags visibly old
    publication years and common citation-formatting mistakes.
    """

    name = "citation_quality"
    display_name = "Citation Quality"
    description = "Check citation age, balance, and formatting"

    # Thresholds
    OLD_CITATION_YEARS = 30  # Citations older than this get flagged

    # Evaluated once at import time, so a very long-lived process could
    # lag by a year at most.
    CURRENT_YEAR = datetime.now().year

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all citation-quality heuristics over *tex_content*."""
        results = []

        # This checker works best with bib content, but we can do some analysis
        # on the tex file alone by looking at citation patterns

        # Check for inline year citations that are old
        old_cite_results = self._check_old_citations_in_text(tex_content)
        results.extend(old_cite_results)

        # Check for citation formatting issues
        format_results = self._check_citation_formatting(tex_content)
        results.extend(format_results)

        return results

    def _check_old_citations_in_text(self, content: str) -> List[CheckResult]:
        """Look for citations with old years visible in text.

        Each distinct old year is reported once, at its first occurrence.
        NOTE(review): the regex only matches 1980-2019; years 2020+ cannot
        be "old" under the 30-year threshold until 2050, but the range will
        need widening eventually - confirm intent.
        """
        results = []
        lines = content.split('\n')

        # Pattern for citations with year, like "Smith et al. (2010)" or "(Smith, 2010)"
        year_pattern = re.compile(
            r'(?:\([^)]*(?:19[89]\d|20[01]\d)[^)]*\)|'  # Parenthetical
            r'\b(?:19[89]\d|20[01]\d)\b)',  # Standalone year
            re.IGNORECASE
        )

        old_years_found = set()

        for line_num, line in enumerate(lines, 1):
            # Skip comments using base class method
            if self._is_comment_line(line):
                continue

            for match in year_pattern.finditer(line):
                # Extract the bare year from the (possibly parenthetical) match
                year_str = re.search(r'(19[89]\d|20[01]\d)', match.group())
                if year_str:
                    year = int(year_str.group())
                    age = self.CURRENT_YEAR - year

                    if age >= self.OLD_CITATION_YEARS and year not in old_years_found:
                        old_years_found.add(year)
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.INFO,
                            message=f"Citation from {year} ({age} years old)",
                            line_number=line_num,
                            suggestion=f"Consider if there's more recent work on this topic"
                        ))

        return results

    def _check_citation_formatting(self, content: str) -> List[CheckResult]:
        """Check for common citation formatting issues.

        Flags: 'et al' without a period, bracketed numeric citations typed
        as literal text, and author-year citations hardcoded instead of
        produced by \\cite.
        """
        results = []
        lines = content.split('\n')

        for line_num, line in enumerate(lines, 1):
            # Skip full-line comments
            if line.lstrip().startswith('%'):
                continue

            # Check for "et al" without period
            if re.search(r'\bet al\b(?!\.)', line):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="'et al' should be 'et al.'",
                    line_number=line_num,
                    suggestion="Add period after 'et al.'"
                ))

            # Check for "[1]" style citations (might want natbib style)
            # Skip if it's a command definition or argument
            if re.search(r'\[\d+\]', line):
                # Skip if in command definition
                if '\\newcommand' in line or '\\renewcommand' in line or '\\def' in line:
                    continue
                # Skip if it's clearly a command argument like [1] in \newcommand{\foo}[1]
                if re.search(r'\\[a-zA-Z]+\[\d+\]', line):
                    continue
                # Only flag if it looks like actual citation in text
                # (no \cite on the line and the line does not open with a command)
                if '\\cite' not in line and not re.search(r'\\[a-zA-Z]+\{', line[:20]):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message="Numeric citation style detected",
                        line_number=line_num,
                        suggestion="Consider author-year style for better readability"
                    ))

            # Check for hardcoded citations instead of \cite,
            # e.g. a literal "(Smith et al., 2020)" typed into the text
            if re.search(r'\([A-Z][a-z]+(?:\s+et\s+al\.?)?,?\s*\d{4}\)', line):
                if '\\cite' not in line:
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Appears to be hardcoded citation instead of \\cite",
                        line_number=line_num,
                        line_content=line.strip()[:80],
                        suggestion="Use \\cite{} for proper bibliography management"
                    ))

        return results
src/checkers/consistency_checker.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Terminology consistency checker.
3
+
4
+ Validates:
5
+ - Consistent spelling of the same term
6
+ - Consistent hyphenation
7
+ - Consistent capitalization of technical terms
8
+ """
9
+ import re
10
+ from typing import List, Dict, Set
11
+ from collections import defaultdict
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class ConsistencyChecker(BaseChecker):
    """Check terminology and spelling consistency.

    Flags three families of inconsistency across the document:
    - known variant pairs (hyphenation, American vs British spelling)
    - ad-hoc hyphenation drift ("pre-trained" vs "pre trained"/"pretrained")
    - mixed capitalization of technical terms ("BERT" vs "bert")
    """

    name = "consistency"
    display_name = "Consistency"
    description = "Check for inconsistent terminology and spelling"

    # Known variant pairs (canonical -> variants)
    KNOWN_VARIANTS = {
        # Hyphenation variants
        'self-supervised': ['self supervised', 'selfsupervised'],
        'pre-trained': ['pre trained', 'pretrained'],
        'fine-tuned': ['fine tuned', 'finetuned'],
        'state-of-the-art': ['state of the art', 'stateoftheart'],
        'real-world': ['real world', 'realworld'],
        'end-to-end': ['end to end', 'endtoend', 'e2e'],
        'large-scale': ['large scale', 'largescale'],
        'long-term': ['long term', 'longterm'],
        'short-term': ['short term', 'shortterm'],
        'multi-task': ['multi task', 'multitask'],
        'multi-modal': ['multi modal', 'multimodal'],
        'cross-lingual': ['cross lingual', 'crosslingual'],
        'zero-shot': ['zero shot', 'zeroshot'],
        'few-shot': ['few shot', 'fewshot'],
        'in-context': ['in context', 'incontext'],

        # American vs British English (comprehensive list)
        # -or/-our endings
        'color': ['colour'],
        'behavior': ['behaviour'],
        'favor': ['favour'],
        'honor': ['honour'],
        'labor': ['labour'],
        'neighbor': ['neighbour'],
        'rumor': ['rumour'],
        'vapor': ['vapour'],

        # -ize/-ise endings
        'analyze': ['analyse'],
        'characterize': ['characterise'],
        'generalize': ['generalise'],
        'initialize': ['initialise'],
        'maximize': ['maximise'],
        'minimize': ['minimise'],
        'normalize': ['normalise'],
        'optimize': ['optimise'],
        'organize': ['organise'],
        'realize': ['realise'],
        'recognize': ['recognise'],
        'specialize': ['specialise'],
        'standardize': ['standardise'],
        'summarize': ['summarise'],
        'utilize': ['utilise'],
        'visualize': ['visualise'],
        'categorize': ['categorise'],
        'emphasize': ['emphasise'],
        'hypothesize': ['hypothesise'],
        'prioritize': ['prioritise'],
        'synchronize': ['synchronise'],

        # -ization/-isation endings
        'generalization': ['generalisation'],
        'initialization': ['initialisation'],
        'maximization': ['maximisation'],
        'minimization': ['minimisation'],
        'normalization': ['normalisation'],
        'optimization': ['optimisation'],
        'organization': ['organisation'],
        'realization': ['realisation'],
        'regularization': ['regularisation'],
        'specialization': ['specialisation'],
        'standardization': ['standardisation'],
        'summarization': ['summarisation'],
        'utilization': ['utilisation'],
        'visualization': ['visualisation'],
        'categorization': ['categorisation'],
        'characterization': ['characterisation'],
        'parametrization': ['parametrisation'],
        'quantization': ['quantisation'],

        # -er/-re endings
        'center': ['centre'],
        'fiber': ['fibre'],
        'meter': ['metre'],
        'liter': ['litre'],

        # -l-/-ll- (American single, British double)
        'modeling': ['modelling'],
        'labeled': ['labelled'],
        'labeling': ['labelling'],
        'traveled': ['travelled'],
        'traveling': ['travelling'],
        'canceled': ['cancelled'],
        'canceling': ['cancelling'],
        'signaled': ['signalled'],
        'signaling': ['signalling'],

        # -og/-ogue endings
        'analog': ['analogue'],
        'catalog': ['catalogue'],
        'dialog': ['dialogue'],

        # -ense/-ence endings
        'defense': ['defence'],
        'license': ['licence'],
        'offense': ['offence'],

        # Other common differences
        'gray': ['grey'],
        'artifact': ['artefact'],
        'program': ['programme'],  # Note: 'program' is standard in computing
        'skeptical': ['sceptical'],
        'aluminum': ['aluminium'],

        # Verb forms
        'learned': ['learnt'],
        'burned': ['burnt'],
        'spelled': ['spelt'],

        # Common term variants
        'dataset': ['data set', 'data-set'],
        'benchmark': ['bench mark', 'bench-mark'],
        'baseline': ['base line', 'base-line'],
        'downstream': ['down stream', 'down-stream'],
        'upstream': ['up stream', 'up-stream'],
        'encoder': ['en-coder'],
        'decoder': ['de-coder'],
    }

    # Capitalization variants to track (compared case-insensitively, then
    # flagged if more than one surface form appears)
    CAPITALIZATION_TERMS = [
        'transformer', 'attention', 'bert', 'gpt', 'lstm', 'cnn', 'rnn',
        'encoder', 'decoder', 'embedding', 'softmax', 'sigmoid', 'relu',
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all consistency heuristics over *tex_content*."""
        results = []

        # Strip LaTeX comments so commented-out text doesn't trigger flags.
        # (?<!\\)% leaves escaped \% intact.
        content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)

        # Check for known variant inconsistencies: flag whenever two or more
        # forms of the same term appear in the document.
        for canonical, variants in self.KNOWN_VARIANTS.items():
            found_forms = []

            # Check canonical form
            if re.search(rf'\b{re.escape(canonical)}\b', content, re.IGNORECASE):
                found_forms.append(canonical)

            # Check variants
            for variant in variants:
                if re.search(rf'\b{re.escape(variant)}\b', content, re.IGNORECASE):
                    found_forms.append(variant)

            if len(found_forms) > 1:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Inconsistent spelling: {', '.join(found_forms)}",
                    suggestion=f"Use '{canonical}' consistently throughout"
                ))

        # Check hyphenated word consistency
        hyphen_results = self._check_hyphenation_consistency(content)
        results.extend(hyphen_results)

        # Check capitalization consistency
        cap_results = self._check_capitalization_consistency(content)
        results.extend(cap_results)

        return results

    def _check_hyphenation_consistency(self, content: str) -> List[CheckResult]:
        """Find words that appear both hyphenated and non-hyphenated."""
        results = []

        # Common terms that should always be hyphenated (exceptions)
        ALWAYS_HYPHENATED = {
            'state-of-the-art', 'end-to-end', 'real-time', 'real-world',
            'fine-tuning', 'fine-grained', 'large-scale', 'small-scale',
            'multi-task', 'multi-modal', 'cross-domain', 'cross-lingual',
            'self-supervised', 'self-attention', 'co-training', 'pre-training',
            'post-processing', 'pre-processing', 'well-known', 'well-defined',
            'high-quality', 'low-quality', 'long-term', 'short-term'
        }

        # Find all hyphenated words (two or more hyphen-joined parts)
        hyphenated = set(re.findall(r'\b([a-z]+-[a-z]+(?:-[a-z]+)*)\b', content, re.IGNORECASE))

        for hyph_word in hyphenated:
            # Skip if it's a known compound that should always be hyphenated
            if hyph_word.lower() in ALWAYS_HYPHENATED:
                continue

            # Create non-hyphenated version
            non_hyph = hyph_word.replace('-', ' ')
            combined = hyph_word.replace('-', '')

            # Check if the spaced or the fused variant also exists
            if re.search(rf'\b{re.escape(non_hyph)}\b', content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{non_hyph}'",
                    suggestion="Choose one form and use it consistently"
                ))
            elif re.search(rf'\b{re.escape(combined)}\b', content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{combined}'",
                    suggestion="Choose one form and use it consistently"
                ))

        return results

    def _check_capitalization_consistency(self, content: str) -> List[CheckResult]:
        """Check if technical terms have consistent capitalization."""
        results = []

        for term in self.CAPITALIZATION_TERMS:
            # Find all case variations of the term
            pattern = re.compile(rf'\b{term}\b', re.IGNORECASE)
            matches = pattern.findall(content)

            if len(matches) > 1:
                # Check if there are mixed capitalizations
                unique_forms = set(matches)
                if len(unique_forms) > 1:
                    forms_str = ', '.join(f"'{f}'" for f in unique_forms)
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Inconsistent capitalization: {forms_str}",
                        suggestion="Use consistent capitalization for technical terms"
                    ))

        return results
src/checkers/equation_checker.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Equation formatting checker.
3
+
4
+ Validates:
5
+ - Punctuation after equations (based on grammar)
6
+ - Equation numbering consistency
7
+ - Variable definitions
8
+ """
9
+ import re
10
+ from typing import List, Set
11
+
12
+ from .base import BaseChecker, CheckResult, CheckSeverity
13
+
14
+
15
class EquationChecker(BaseChecker):
    """Check equation formatting and consistency.

    Heuristics:
    - equations followed by a lowercase word probably need trailing punctuation
    - mixing numbered and unnumbered display equations
    - mixing $...$ and \\(...\\) inline math delimiters
    """

    name = "equation"
    display_name = "Equations"
    description = "Check equation formatting and punctuation"

    # Display-math environments to inspect; starred forms are unnumbered.
    EQUATION_ENVS = [
        'equation', 'align', 'gather', 'multline', 'eqnarray',
        'equation*', 'align*', 'gather*', 'multline*', 'eqnarray*'
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all equation heuristics over *tex_content*."""
        results = []

        # Check equation punctuation
        results.extend(self._check_equation_punctuation(tex_content))

        # Check for numbered vs unnumbered consistency
        results.extend(self._check_numbering_consistency(tex_content))

        # Check inline math consistency ($...$ vs \(...\))
        results.extend(self._check_inline_math_consistency(tex_content))

        return results

    def _check_equation_punctuation(self, content: str) -> List[CheckResult]:
        """Check if equations end with appropriate punctuation.

        Flags an equation whose body lacks trailing . , or ; when the text
        immediately after \\end{...} starts lowercase (sentence continues).
        """
        results = []

        for env in self.EQUATION_ENVS:
            # re.escape handles the '*' in starred environment names, which
            # would otherwise be a regex quantifier.
            env_escaped = re.escape(env)

            # Find equation content
            pattern = re.compile(
                rf'\\begin\{{{env_escaped}\}}(.*?)\\end\{{{env_escaped}\}}',
                re.DOTALL
            )

            for match in pattern.finditer(content):
                eq_content = match.group(1).strip()

                # Check what comes after the equation
                after_pos = match.end()
                after_text = content[after_pos:after_pos + 50].strip()

                # Labels are invisible in output, so ignore them when
                # looking for trailing punctuation.
                eq_content_clean = re.sub(r'\\label\{[^}]+\}', '', eq_content).strip()

                if eq_content_clean and not re.search(r'[.,;]$', eq_content_clean):
                    # Lowercase continuation implies the equation ends a clause
                    if after_text and after_text[0].islower():
                        line_num = self._find_line_number(content, match.end())
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.INFO,
                            message="Equation may need punctuation (sentence continues after)",
                            line_number=line_num,
                            suggestion="Add comma or period inside equation if it ends a clause"
                        ))

        return results

    def _check_numbering_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed numbered and unnumbered equations.

        Fixes vs. the original implementation:
        - environment names are regex-escaped, so 'equation*' no longer
          compiles to 'equatio' + 'n*' (which never matched the starred
          form and wrongly matched the plain one);
        - a starred environment counts as unnumbered and a plain one as
          numbered, instead of a single \\nonumber anywhere in the file
          flipping every environment to "unnumbered".
        \\nonumber/\\notag occurrences are still added to the unnumbered
        tally, as they suppress numbers on individual lines.
        """
        results = []

        numbered = 0
        unnumbered = 0

        for env in self.EQUATION_ENVS:
            count = len(re.findall(rf'\\begin\{{{re.escape(env)}\}}', content))
            if env.endswith('*'):
                unnumbered += count
            else:
                numbered += count

        # Per-line number suppression inside numbered environments
        unnumbered += len(re.findall(r'\\nonumber|\\notag', content))

        # If there's a significant mix, warn
        total = numbered + unnumbered
        if total > 3 and numbered > 0 and unnumbered > 0:
            ratio = min(numbered, unnumbered) / total
            if ratio > 0.2:  # More than 20% in minority
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Mixed equation numbering: {numbered} numbered, {unnumbered} unnumbered",
                    suggestion="Consider consistent numbering strategy"
                ))

        return results

    def _check_inline_math_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed inline math delimiters ($...$ vs \\(...\\))."""
        results = []

        # Count different inline math styles; the lookarounds keep $$...$$
        # display math from being counted as inline.
        dollar_count = len(re.findall(r'(?<!\$)\$(?!\$)[^$]+\$(?!\$)', content))
        paren_count = len(re.findall(r'\\\(.*?\\\)', content))

        if dollar_count > 0 and paren_count > 0:
            # Fixed message: the original f-string emitted a stray '$'
            # before the count ("${dollar_count}").
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed inline math: {dollar_count} $...$ and {paren_count} \\(...\\)",
                suggestion="Use consistent inline math delimiters throughout"
            ))

        return results
src/checkers/formatting_checker.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Formatting checker.
3
+
4
+ Validates common LaTeX formatting issues:
5
+ - Citation formatting consistency
6
+ - Non-breaking spaces before citations
7
+ - Special character escaping
8
+ - Whitespace issues
9
+ """
10
+ import re
11
+ from typing import List
12
+
13
+ from .base import BaseChecker, CheckResult, CheckSeverity
14
+
15
+
16
class FormattingChecker(BaseChecker):
    """Check for common LaTeX formatting issues.

    Covers: citations missing a non-breaking space, mixed citation command
    styles, runs of 3+ blank lines, and unescaped '&' outside math/tabular.
    """

    name = "formatting"
    display_name = "Formatting"
    description = "Check citation style, spacing, and special characters"

    # Citation commands
    CITE_COMMANDS = ['cite', 'citep', 'citet', 'citealt', 'citealp',
                     'citeauthor', 'citeyear', 'autocite', 'textcite',
                     'parencite', 'footcite']

    # Pattern for citations without non-breaking space
    # Matches: "word \cite" but not "word~\cite"
    CITE_NO_NBSP_PATTERN = re.compile(r'(\w)\s+(\\cite\w*\{)')

    # Pattern for multiple consecutive spaces (currently unused; kept for
    # future checks / external callers)
    MULTI_SPACE_PATTERN = re.compile(r'(?<!\\) +')

    # Patterns for unescaped special characters outside math mode
    # (currently only '&' is enforced in _check_special_chars)
    SPECIAL_CHARS = {
        '%': r'(?<!\\)%',  # Unescaped %
        '&': r'(?<!\\)&(?![a-zA-Z]+;)',  # Unescaped & (not HTML entities)
        '#': r'(?<!\\)#',  # Unescaped #
        '_': r'(?<![\\$])_(?![^$]*\$)',  # Unescaped _ outside math
        '^': r'(?<![\\$])\^(?![^$]*\$)',  # Unescaped ^ outside math
    }

    # Multiple blank lines pattern (3 or more blank lines)
    MULTI_BLANK_PATTERN = re.compile(r'\n\s*\n\s*\n\s*\n')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all formatting heuristics over *tex_content*."""
        results = []
        lines = tex_content.split('\n')

        # Track citation style consistency
        cite_styles = {'parenthetical': 0, 'textual': 0, 'plain': 0}

        for line_num, line in enumerate(lines, 1):
            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method
            line_content = self._remove_line_comment(line)

            # Check citation non-breaking space (one report per occurrence)
            for match in self.CITE_NO_NBSP_PATTERN.finditer(line_content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Citation without non-breaking space",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
                ))

            # Track which citation command families the document uses
            for cmd in self.CITE_COMMANDS:
                if re.search(rf'\\{cmd}\b', line_content):
                    if cmd in ['citep', 'parencite', 'autocite']:
                        cite_styles['parenthetical'] += 1
                    elif cmd in ['citet', 'textcite']:
                        cite_styles['textual'] += 1
                    elif cmd == 'cite':
                        cite_styles['plain'] += 1

        # Check citation style consistency
        styles_used = [s for s, count in cite_styles.items() if count > 0]
        if len(styles_used) > 1:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed citation styles detected: {', '.join(styles_used)}",
                suggestion="Consider using consistent citation style throughout"
            ))

        # Check for multiple blank lines (3 or more)
        for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
            line_num = self._find_line_number(tex_content, match.start())
            # Count how many blank lines
            blank_count = match.group(0).count('\n') - 1

            # Get context: the line before, blank lines, and the line after
            start_pos = match.start()
            end_pos = match.end()

            # Find the line before the blank lines
            prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
            prev_line = tex_content[prev_line_start:start_pos].rstrip()

            # Find the line after the blank lines
            next_line_end = tex_content.find('\n', end_pos)
            if next_line_end == -1:
                next_line_end = len(tex_content)
            next_line = tex_content[end_pos:next_line_end].rstrip()

            # Create visual representation with warning markers
            blank_lines = '\n'.join([f"> blank line ⚠️"] * blank_count)
            line_content = f"{prev_line}\n{blank_lines}\n{next_line}"

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
                line_number=line_num,
                line_content=line_content,
                suggestion="Reduce to single blank line or use \\vspace"
            ))

        # Check for common issues with special characters
        results.extend(self._check_special_chars(tex_content, lines))

        return results

    def _check_special_chars(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check for unescaped special characters (currently only '&')."""
        results = []

        # Find math environments to skip
        math_regions = self._find_math_regions(content)

        # Precompute every line's starting offset once. The previous
        # implementation recomputed it with sum() per line, making this
        # loop O(n^2) in the number of lines.
        line_offsets = []
        offset = 0
        for raw in lines:
            line_offsets.append(offset)
            offset += len(raw) + 1  # +1 for the '\n' removed by split

        for line_num, line in enumerate(lines, 1):
            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method
            line_content = self._remove_line_comment(line)

            # Position of this line's first character in the full content
            line_start = line_offsets[line_num - 1]

            # Check for unescaped & (common error); the lookahead skips
            # HTML-entity-like sequences such as &amp;
            for match in re.finditer(r'(?<!\\)&(?![a-zA-Z]+;)', line_content):
                pos = line_start + match.start()
                # Skip if in math
                if not self._in_math_region(pos, math_regions):
                    # Also skip if inside tabular-like environments where & is a separator
                    if not self._in_environment(content, pos, ['tabular', 'array', 'align', 'matrix']):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message="Unescaped & outside tabular/math environment",
                            line_number=line_num,
                            line_content=line.strip()[:100],
                            suggestion="Use \\& to escape"
                        ))

        return results

    def _find_math_regions(self, content: str) -> List[tuple]:
        """Return (start, end) character spans that are inside math mode."""
        regions = []

        # Inline math $ ... $
        for match in re.finditer(r'(?<!\\)\$(?!\$)(.*?)(?<!\\)\$', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # Display math $$ ... $$
        for match in re.finditer(r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # \[ ... \]
        for match in re.finditer(r'\\\[(.*?)\\\]', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # Math environments (including starred variants)
        for env in ['equation', 'align', 'gather', 'multline', 'displaymath']:
            pattern = rf'\\begin\{{{env}\*?\}}(.*?)\\end\{{{env}\*?\}}'
            for match in re.finditer(pattern, content, re.DOTALL):
                regions.append((match.start(), match.end()))

        return regions

    def _in_math_region(self, pos: int, regions: List[tuple]) -> bool:
        """Check if *pos* falls inside any math region."""
        return any(start <= pos <= end for start, end in regions)

    def _in_environment(self, content: str, pos: int, env_names: List[str]) -> bool:
        """Check if *pos* is inside any of the given LaTeX environments."""
        for env in env_names:
            # Find all instances of this environment (incl. starred form)
            pattern = rf'\\begin\{{{env}\*?\}}(.*?)\\end\{{{env}\*?\}}'
            for match in re.finditer(pattern, content, re.DOTALL):
                if match.start() <= pos <= match.end():
                    return True
        return False