thinkwee
commited on
Commit
·
46df5f0
1
Parent(s):
6984298
init
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .gitignore +62 -0
- README.md +196 -11
- app.py +922 -0
- app_helper.py +98 -0
- assets/icon-192.png +3 -0
- assets/icon-512.png +3 -0
- bibguard.yaml +197 -0
- main.py +564 -0
- requirements.txt +8 -0
- src/__init__.py +1 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/analyzers/__init__.py +7 -0
- src/analyzers/__pycache__/__init__.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/retraction_checker.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/url_validator.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/usage_checker.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc +0 -0
- src/analyzers/duplicate_detector.py +204 -0
- src/analyzers/llm_evaluator.py +376 -0
- src/analyzers/metadata_comparator.py +474 -0
- src/analyzers/usage_checker.py +82 -0
- src/checkers/__init__.py +66 -0
- src/checkers/__pycache__/__init__.cpython-313.pyc +0 -0
- src/checkers/__pycache__/acronym_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/anonymization_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/base.cpython-313.pyc +0 -0
- src/checkers/__pycache__/caption_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/consistency_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/equation_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/formatting_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/number_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/reference_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/sentence_checker.cpython-313.pyc +0 -0
- src/checkers/acronym_checker.py +284 -0
- src/checkers/ai_artifacts_checker.py +176 -0
- src/checkers/anonymization_checker.py +216 -0
- src/checkers/base.py +193 -0
- src/checkers/caption_checker.py +136 -0
- src/checkers/citation_quality_checker.py +131 -0
- src/checkers/consistency_checker.py +254 -0
- src/checkers/equation_checker.py +134 -0
- src/checkers/formatting_checker.py +204 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/*.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
MANIFEST
|
| 23 |
+
|
| 24 |
+
# Virtual Environments
|
| 25 |
+
venv/
|
| 26 |
+
env/
|
| 27 |
+
.env
|
| 28 |
+
.venv/
|
| 29 |
+
|
| 30 |
+
# IDEs
|
| 31 |
+
.idea/
|
| 32 |
+
.vscode/
|
| 33 |
+
*.swp
|
| 34 |
+
*.swo
|
| 35 |
+
|
| 36 |
+
# macOS
|
| 37 |
+
.DS_Store
|
| 38 |
+
.AppleDouble
|
| 39 |
+
.LSOverride
|
| 40 |
+
|
| 41 |
+
# Project Specific Outputs
|
| 42 |
+
*.txt
|
| 43 |
+
*.md
|
| 44 |
+
!README.md
|
| 45 |
+
*_only_used_entry.bib
|
| 46 |
+
|
| 47 |
+
# LaTeX and Bibliography (User Data)
|
| 48 |
+
# Ignoring these to prevent committing personal paper content
|
| 49 |
+
*.tex
|
| 50 |
+
*.bib
|
| 51 |
+
*.pdf
|
| 52 |
+
*.log
|
| 53 |
+
*.aux
|
| 54 |
+
*.out
|
| 55 |
+
*.bbl
|
| 56 |
+
*.blg
|
| 57 |
+
*.synctex.gz
|
| 58 |
+
*.fls
|
| 59 |
+
*.fdb_latexmk
|
| 60 |
+
|
| 61 |
+
# cache
|
| 62 |
+
.cache
|
README.md
CHANGED
|
@@ -1,13 +1,198 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
| 1 |
+
# BibGuard: Bibliography & LaTeX Quality Auditor
|
| 2 |
+
|
| 3 |
+
**BibGuard** is your comprehensive quality assurance tool for academic papers. It validates bibliography entries against real-world databases and checks LaTeX submission quality to catch errors before you submit.
|
| 4 |
+
|
| 5 |
+
AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and uses advanced LLMs to ensure cited papers actually support your claims.
|
| 6 |
+
|
| 7 |
+
## 🛡 Why BibGuard?
|
| 8 |
+
|
| 9 |
+
- **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata
|
| 10 |
+
- **📋 LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, and submission compliance problems
|
| 11 |
+
- **🔒 Safe & Non-Destructive**: Your original files are **never modified** - only detailed reports are generated
|
| 12 |
+
- **🧠 Contextual Relevance**: Ensure cited papers actually discuss what you claim (with LLM)
|
| 13 |
+
- **⚡ Efficiency Boost**: Drastically reduce time needed to manually verify hundreds of citations
|
| 14 |
+
|
| 15 |
+
## 🚀 Features
|
| 16 |
+
|
| 17 |
+
### Bibliography Validation
|
| 18 |
+
- **🔍 Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
|
| 19 |
+
- **🤖 AI Relevance Check**: Uses LLMs to verify citations match their context (optional)
|
| 20 |
+
- **📊 Preprint Detection**: Warns if >50% of references are preprints (arXiv, bioRxiv, etc.)
|
| 21 |
+
- **👀 Usage Analysis**: Highlights missing citations and unused bib entries
|
| 22 |
+
- **👯 Duplicate Detector**: Identifies duplicate entries with fuzzy matching
|
| 23 |
+
|
| 24 |
+
### LaTeX Quality Checks
|
| 25 |
+
- **📐 Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation
|
| 26 |
+
- **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases
|
| 27 |
+
- **🔤 Consistency**: Spelling variants (US/UK English), hyphenation, terminology
|
| 28 |
+
- **🤖 AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants
|
| 29 |
+
- **🔠 Acronym Validation**: Ensures acronyms are defined before use (smart matching)
|
| 30 |
+
- **🎭 Anonymization**: Checks for identity leaks in double-blind submissions
|
| 31 |
+
- **📅 Citation Age**: Flags references older than 30 years
|
| 32 |
+
|
| 33 |
+
## 📦 Installation
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
git clone git@github.com:thinkwee/BibGuard.git
|
| 37 |
+
cd BibGuard
|
| 38 |
+
pip install -r requirements.txt
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
## ⚡ Quick Start
|
| 42 |
+
|
| 43 |
+
### 1. Initialize Configuration
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
python main.py --init
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
This creates `config.yaml`. Edit it to set your file paths. You have two modes:
|
| 50 |
+
|
| 51 |
+
#### Option A: Single File Mode
|
| 52 |
+
Best for individual papers.
|
| 53 |
+
```yaml
|
| 54 |
+
files:
|
| 55 |
+
bib: "paper.bib"
|
| 56 |
+
tex: "paper.tex"
|
| 57 |
+
output_dir: "bibguard_output"
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
#### Option B: Directory Scan Mode
|
| 61 |
+
Best for large projects or a collection of papers. BibGuard will recursively search for all `.tex` and `.bib` files.
|
| 62 |
+
```yaml
|
| 63 |
+
files:
|
| 64 |
+
input_dir: "./my_project_dir"
|
| 65 |
+
output_dir: "bibguard_output"
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
### 2. Run Full Check
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
python main.py
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
**Output** (in `bibguard_output/`):
|
| 75 |
+
- `bibliography_report.md` - Bibliography validation results
|
| 76 |
+
- `latex_quality_report.md` - Writing and formatting issues
|
| 77 |
+
- `line_by_line_report.md` - All issues sorted by line number
|
| 78 |
+
- `*_only_used.bib` - Clean bibliography (used entries only)
|
| 79 |
+
|
| 80 |
+
## 🛠 Configuration
|
| 81 |
+
|
| 82 |
+
Edit `config.yaml` to customize checks:
|
| 83 |
+
|
| 84 |
+
```yaml
|
| 85 |
+
bibliography:
|
| 86 |
+
check_metadata: true # Validate against online databases (takes time)
|
| 87 |
+
check_usage: true # Find unused/missing entries
|
| 88 |
+
check_duplicates: true # Detect duplicate entries
|
| 89 |
+
check_preprint_ratio: true # Warn if >50% are preprints
|
| 90 |
+
check_relevance: false # LLM-based relevance check (requires API key)
|
| 91 |
+
|
| 92 |
+
submission:
|
| 93 |
+
# Format checks
|
| 94 |
+
caption: true # Table/figure caption placement
|
| 95 |
+
reference: true # Cross-reference integrity
|
| 96 |
+
formatting: true # Citation spacing, blank lines
|
| 97 |
+
equation: true # Equation punctuation, numbering
|
| 98 |
+
|
| 99 |
+
# Writing quality
|
| 100 |
+
sentence: true # Weak starters, hedging language
|
| 101 |
+
consistency: true # Spelling, hyphenation, terminology
|
| 102 |
+
acronym: true # Acronym definitions (3+ letters)
|
| 103 |
+
|
| 104 |
+
# Submission compliance
|
| 105 |
+
ai_artifacts: true # AI-generated text detection
|
| 106 |
+
anonymization: true # Double-blind compliance
|
| 107 |
+
citation_quality: true # Old citations (>30 years)
|
| 108 |
+
number: true # Percentage formatting
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
## 🤖 LLM-Based Relevance Check
|
| 112 |
+
|
| 113 |
+
To verify citations match their context using AI:
|
| 114 |
+
|
| 115 |
+
```yaml
|
| 116 |
+
bibliography:
|
| 117 |
+
check_relevance: true
|
| 118 |
+
|
| 119 |
+
llm:
|
| 120 |
+
backend: "gemini" # Options: gemini, openai, anthropic, deepseek, ollama, vllm
|
| 121 |
+
api_key: "" # Or use environment variable (e.g., GEMINI_API_KEY)
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
**Supported Backends:**
|
| 125 |
+
- **Gemini** (Google): `GEMINI_API_KEY`
|
| 126 |
+
- **OpenAI**: `OPENAI_API_KEY`
|
| 127 |
+
- **Anthropic**: `ANTHROPIC_API_KEY`
|
| 128 |
+
- **DeepSeek**: `DEEPSEEK_API_KEY` (recommended for cost/performance)
|
| 129 |
+
- **Ollama**: Local models (no API key needed)
|
| 130 |
+
- **vLLM**: Custom endpoint
|
| 131 |
+
|
| 132 |
+
Then run:
|
| 133 |
+
```bash
|
| 134 |
+
python main.py
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
## 📝 Understanding Reports
|
| 138 |
+
|
| 139 |
+
### Bibliography Report
|
| 140 |
+
Shows for each entry:
|
| 141 |
+
- ✅ **Verified**: Metadata matches online databases
|
| 142 |
+
- ⚠️ **Issues**: Mismatches, missing entries, duplicates
|
| 143 |
+
- 📊 **Statistics**: Usage, duplicates, preprint ratio
|
| 144 |
+
|
| 145 |
+
### LaTeX Quality Report
|
| 146 |
+
Organized by severity:
|
| 147 |
+
- 🔴 **Errors**: Critical issues (e.g., undefined references)
|
| 148 |
+
- 🟡 **Warnings**: Important issues (e.g., inconsistent spelling)
|
| 149 |
+
- 🔵 **Suggestions**: Style improvements (e.g., weak sentence starters)
|
| 150 |
+
|
| 151 |
+
### Line-by-Line Report
|
| 152 |
+
All LaTeX issues sorted by line number for easy fixing.
|
| 153 |
+
|
| 154 |
+
## 🧐 Understanding Mismatches
|
| 155 |
+
|
| 156 |
+
BibGuard is strict, but false positives happen:
|
| 157 |
+
|
| 158 |
+
1. **Year Discrepancy (±1 Year)**:
|
| 159 |
+
- *Reason*: Delay between preprint (arXiv) and official publication
|
| 160 |
+
- *Action*: Verify which version you intend to cite
|
| 161 |
+
|
| 162 |
+
2. **Author List Variations**:
|
| 163 |
+
- *Reason*: Different databases handle large author lists differently
|
| 164 |
+
- *Action*: Check if primary authors match
|
| 165 |
+
|
| 166 |
+
3. **Venue Name Differences**:
|
| 167 |
+
- *Reason*: Abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems")
|
| 168 |
+
- *Action*: Both are usually correct
|
| 169 |
+
|
| 170 |
+
4. **Non-Academic Sources**:
|
| 171 |
+
- *Reason*: Blogs, documentation not indexed by academic databases
|
| 172 |
+
- *Action*: Manually verify URL and title
|
| 173 |
+
|
| 174 |
+
## 🔧 Advanced Options
|
| 175 |
+
|
| 176 |
+
```bash
|
| 177 |
+
python main.py --help # Show all options
|
| 178 |
+
python main.py --list-templates # List conference templates
|
| 179 |
+
python main.py --config my.yaml # Use custom config file
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
## 🤝 Contributing
|
| 183 |
+
|
| 184 |
+
Contributions welcome! Please open an issue or pull request.
|
| 185 |
+
|
| 186 |
+
## 🙏 Acknowledgments
|
| 187 |
+
|
| 188 |
+
BibGuard uses multiple data sources:
|
| 189 |
+
- arXiv API
|
| 190 |
+
- CrossRef API
|
| 191 |
+
- Semantic Scholar API
|
| 192 |
+
- DBLP API
|
| 193 |
+
- OpenAlex API
|
| 194 |
+
- Google Scholar (via scholarly)
|
| 195 |
+
|
| 196 |
---
|
| 197 |
|
| 198 |
+
**Made with ❤️ for researchers who care about their submission**
|
app.py
ADDED
|
@@ -0,0 +1,922 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
BibGuard Gradio Web Application
|
| 4 |
+
|
| 5 |
+
A web interface for checking bibliography and LaTeX quality.
|
| 6 |
+
"""
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import tempfile
|
| 9 |
+
import shutil
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional, Tuple
|
| 12 |
+
import base64
|
| 13 |
+
|
| 14 |
+
from src.parsers import BibParser, TexParser
|
| 15 |
+
from src.fetchers import ArxivFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher
|
| 16 |
+
from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector
|
| 17 |
+
from src.report.generator import ReportGenerator, EntryReport
|
| 18 |
+
from src.config.yaml_config import BibGuardConfig, FilesConfig, BibliographyConfig, SubmissionConfig, OutputConfig, WorkflowStep
|
| 19 |
+
from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
|
| 20 |
+
from src.checkers import CHECKER_REGISTRY
|
| 21 |
+
from src.report.line_report import LineByLineReportGenerator
|
| 22 |
+
from app_helper import fetch_and_compare_with_workflow
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Custom CSS for better Markdown rendering
|
| 26 |
+
CUSTOM_CSS = """
|
| 27 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
|
| 28 |
+
|
| 29 |
+
* {
|
| 30 |
+
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
|
| 31 |
+
}
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
WELCOME_HTML = """
|
| 35 |
+
<div class="scrollable-report-area">
|
| 36 |
+
<div class="report-card" style="max-width: 800px; margin: 0 auto;">
|
| 37 |
+
<div class="card-header">
|
| 38 |
+
<h3 class="card-title" style="font-size: 1.5em;">👋 Welcome to BibGuard</h3>
|
| 39 |
+
</div>
|
| 40 |
+
<div class="card-content" style="line-height: 1.6; color: #374151;">
|
| 41 |
+
<p style="font-size: 1.1em; margin-bottom: 24px;">
|
| 42 |
+
Ensure your academic paper is flawless. Upload your <code>.bib</code> and <code>.tex</code> files on the left and click <strong>"Check Now"</strong>.
|
| 43 |
+
</p>
|
| 44 |
+
|
| 45 |
+
<div style="display: grid; gap: 20px; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));">
|
| 46 |
+
<div style="background: #fefce8; padding: 16px; border-radius: 8px; border: 1px solid #fde047;">
|
| 47 |
+
<strong style="color: #854d0e; display: block; margin-bottom: 8px;">⚠️ Metadata Check Defaults</strong>
|
| 48 |
+
"🔍 Metadata" is <strong>disabled by default</strong>. It verifies your entries against ArXiv/DBLP/Crossref but takes time (1-3 mins) to fetch data. Enable it if you want strict verification.
|
| 49 |
+
</div>
|
| 50 |
+
|
| 51 |
+
<div style="background: #eff6ff; padding: 16px; border-radius: 8px; border: 1px solid #bfdbfe;">
|
| 52 |
+
<strong style="color: #1e40af; display: block; margin-bottom: 8px;">🚀 Go Pro with Local Version</strong>
|
| 53 |
+
LLM-based context relevance checking (is this citation actually relevant?) is excluded here. Clone the <a href="https://github.com/thinkwee/BibGuard" target="_blank" style="color: #2563eb; text-decoration: underline; font-weight: 600;">GitHub repo</a> to use the full power with your API key.
|
| 54 |
+
</div>
|
| 55 |
+
</div>
|
| 56 |
+
|
| 57 |
+
<h4 style="margin: 24px 0 12px 0; color: #111827; font-size: 1.1em;">📊 Understanding Your Reports</h4>
|
| 58 |
+
<div style="display: grid; gap: 12px;">
|
| 59 |
+
<div style="display: flex; gap: 12px; align-items: baseline;">
|
| 60 |
+
<span style="background: #e0e7ff; color: #3730a3; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📚 Bibliography</span>
|
| 61 |
+
<span>Validates metadata fields, detects duplicates, and checks citation counts.</span>
|
| 62 |
+
</div>
|
| 63 |
+
<div style="display: flex; gap: 12px; align-items: baseline;">
|
| 64 |
+
<span style="background: #dcfce7; color: #166534; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📝 LaTeX Quality</span>
|
| 65 |
+
<span>Syntax check, caption validation, acronym consistency, and style suggestions.</span>
|
| 66 |
+
</div>
|
| 67 |
+
<div style="display: flex; gap: 12px; align-items: baseline;">
|
| 68 |
+
<span style="background: #f3f4f6; color: #4b5563; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📋 Line-by-Line</span>
|
| 69 |
+
<span>Maps every issue found directly to the line number in your source file.</span>
|
| 70 |
+
</div>
|
| 71 |
+
</div>
|
| 72 |
+
</div>
|
| 73 |
+
</div>
|
| 74 |
+
</div>
|
| 75 |
+
"""
|
| 76 |
+
|
| 77 |
+
CUSTOM_CSS += """
|
| 78 |
+
/* Global Reset */
|
| 79 |
+
body, gradio-app {
|
| 80 |
+
overflow: hidden !important; /* Prevent double scrollbars on the page */
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
.gradio-container {
|
| 84 |
+
max-width: none !important;
|
| 85 |
+
width: 100% !important;
|
| 86 |
+
height: 100vh !important;
|
| 87 |
+
padding: 0 !important;
|
| 88 |
+
margin: 0 !important;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
/* Header Styling */
|
| 92 |
+
.app-header {
|
| 93 |
+
padding: 20px;
|
| 94 |
+
background: white;
|
| 95 |
+
border-bottom: 1px solid #e5e7eb;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
/* Sidebar Styling */
|
| 99 |
+
.app-sidebar {
|
| 100 |
+
height: calc(100vh - 100px) !important;
|
| 101 |
+
overflow-y: auto !important;
|
| 102 |
+
padding: 20px !important;
|
| 103 |
+
border-right: 1px solid #e5e7eb;
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
/* Main Content Area */
|
| 107 |
+
.app-content {
|
| 108 |
+
height: calc(100vh - 100px) !important;
|
| 109 |
+
padding: 0 !important;
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
/* The Magic Scroll Container - Clean and Explicit */
|
| 113 |
+
.scrollable-report-area {
|
| 114 |
+
height: calc(100vh - 180px) !important; /* Fixed height relative to viewport */
|
| 115 |
+
overflow-y: auto !important;
|
| 116 |
+
padding: 24px;
|
| 117 |
+
background-color: #f9fafb;
|
| 118 |
+
border: 1px solid #e5e7eb;
|
| 119 |
+
border-radius: 8px;
|
| 120 |
+
margin-top: 10px;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
/* Report Card Styling */
|
| 124 |
+
.report-card {
|
| 125 |
+
background: white;
|
| 126 |
+
border-radius: 12px;
|
| 127 |
+
padding: 24px;
|
| 128 |
+
margin-bottom: 16px; /* Spacing between cards */
|
| 129 |
+
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
|
| 130 |
+
border: 1px solid #e5e7eb;
|
| 131 |
+
transition: transform 0.2s, box-shadow 0.2s;
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
.report-card:hover {
|
| 135 |
+
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
|
| 136 |
+
transform: translateY(-2px);
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
/* Card Internals */
|
| 140 |
+
.card-header {
|
| 141 |
+
display: flex;
|
| 142 |
+
justify-content: space-between;
|
| 143 |
+
align-items: flex-start;
|
| 144 |
+
margin-bottom: 16px;
|
| 145 |
+
padding-bottom: 16px;
|
| 146 |
+
border-bottom: 1px solid #f3f4f6;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
.card-title {
|
| 150 |
+
font-size: 1.1em;
|
| 151 |
+
font-weight: 600;
|
| 152 |
+
color: #111827;
|
| 153 |
+
margin: 0 0 4px 0;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
.card-subtitle {
|
| 157 |
+
font-size: 0.9em;
|
| 158 |
+
color: #6b7280;
|
| 159 |
+
font-family: monospace;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
.card-content {
|
| 163 |
+
font-size: 0.95em;
|
| 164 |
+
color: #374151;
|
| 165 |
+
line-height: 1.5;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
/* Badges */
|
| 169 |
+
.badge {
|
| 170 |
+
display: inline-flex;
|
| 171 |
+
align-items: center;
|
| 172 |
+
padding: 4px 10px;
|
| 173 |
+
border-radius: 9999px;
|
| 174 |
+
font-size: 0.8em;
|
| 175 |
+
font-weight: 500;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
.badge-success { background-color: #dcfce7; color: #166534; }
|
| 179 |
+
.badge-warning { background-color: #fef9c3; color: #854d0e; }
|
| 180 |
+
.badge-error { background-color: #fee2e2; color: #991b1b; }
|
| 181 |
+
.badge-info { background-color: #dbeafe; color: #1e40af; }
|
| 182 |
+
.badge-neutral { background-color: #f3f4f6; color: #4b5563; }
|
| 183 |
+
|
| 184 |
+
/* Stats Grid */
|
| 185 |
+
.stats-container {
|
| 186 |
+
display: grid;
|
| 187 |
+
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
|
| 188 |
+
gap: 16px;
|
| 189 |
+
margin-bottom: 24px;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
.stat-card {
|
| 193 |
+
padding: 16px;
|
| 194 |
+
border-radius: 12px;
|
| 195 |
+
color: white;
|
| 196 |
+
text-align: center;
|
| 197 |
+
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
.stat-value { font-size: 1.8em; font-weight: 700; }
|
| 201 |
+
.stat-label { font-size: 0.9em; opacity: 0.9; }
|
| 202 |
+
|
| 203 |
+
/* Detail Grid - Flexbox for better filling */
|
| 204 |
+
.detail-grid {
|
| 205 |
+
display: flex;
|
| 206 |
+
flex-wrap: wrap;
|
| 207 |
+
gap: 12px;
|
| 208 |
+
margin-bottom: 16px;
|
| 209 |
+
width: 100%;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
.detail-item {
|
| 213 |
+
background: #f9fafb;
|
| 214 |
+
padding: 10px 12px;
|
| 215 |
+
border-radius: 8px;
|
| 216 |
+
border: 1px solid #f3f4f6;
|
| 217 |
+
|
| 218 |
+
/* Flex sizing: grow, shrink, min-basis */
|
| 219 |
+
flex: 1 1 160px;
|
| 220 |
+
min-width: 0; /* Important for word-break to work in flex children */
|
| 221 |
+
|
| 222 |
+
/* Layout control */
|
| 223 |
+
display: flex;
|
| 224 |
+
flex-direction: column;
|
| 225 |
+
|
| 226 |
+
/* Height constraint to prevent one huge card from stretching the row */
|
| 227 |
+
max-height: 100px;
|
| 228 |
+
overflow-y: auto;
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
/* Custom scrollbar for detail items */
|
| 232 |
+
.detail-item::-webkit-scrollbar {
|
| 233 |
+
width: 4px;
|
| 234 |
+
}
|
| 235 |
+
.detail-item::-webkit-scrollbar-thumb {
|
| 236 |
+
background-color: #d1d5db;
|
| 237 |
+
border-radius: 4px;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
.detail-label {
|
| 241 |
+
font-size: 0.75em;
|
| 242 |
+
color: #6b7280;
|
| 243 |
+
text-transform: uppercase;
|
| 244 |
+
letter-spacing: 0.05em;
|
| 245 |
+
margin-bottom: 2px;
|
| 246 |
+
position: sticky;
|
| 247 |
+
top: 0;
|
| 248 |
+
background: #f9fafb; /* Maintain bg on scroll */
|
| 249 |
+
z-index: 1;
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
.detail-value {
|
| 253 |
+
font-weight: 500;
|
| 254 |
+
color: #1f2937;
|
| 255 |
+
font-size: 0.9em;
|
| 256 |
+
line-height: 1.4;
|
| 257 |
+
word-break: break-word; /* Fix overflow */
|
| 258 |
+
overflow-wrap: break-word;
|
| 259 |
+
} border: 1px solid #e5e7eb;
|
| 260 |
+
transition: all 0.2s;
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
.report-card:hover {
|
| 264 |
+
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
/* Card Header */
|
| 268 |
+
.card-header {
|
| 269 |
+
display: flex;
|
| 270 |
+
justify-content: space-between;
|
| 271 |
+
align-items: flex-start;
|
| 272 |
+
margin-bottom: 12px;
|
| 273 |
+
border-bottom: 1px solid #f3f4f6;
|
| 274 |
+
padding-bottom: 12px;
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
.card-title {
|
| 278 |
+
font-size: 1.1em;
|
| 279 |
+
font-weight: 600;
|
| 280 |
+
color: #1f2937;
|
| 281 |
+
margin: 0;
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
.card-subtitle {
|
| 285 |
+
font-size: 0.9em;
|
| 286 |
+
color: #6b7280;
|
| 287 |
+
margin-top: 4px;
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
/* Status Badges */
|
| 291 |
+
.badge {
|
| 292 |
+
display: inline-flex;
|
| 293 |
+
align-items: center;
|
| 294 |
+
padding: 4px 10px;
|
| 295 |
+
border-radius: 9999px;
|
| 296 |
+
font-size: 0.8em;
|
| 297 |
+
font-weight: 500;
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
+
.badge-success { background-color: #dcfce7; color: #166534; }
|
| 301 |
+
.badge-warning { background-color: #fef9c3; color: #854d0e; }
|
| 302 |
+
.badge-error { background-color: #fee2e2; color: #991b1b; }
|
| 303 |
+
.badge-info { background-color: #dbeafe; color: #1e40af; }
|
| 304 |
+
.badge-neutral { background-color: #f3f4f6; color: #374151; }
|
| 305 |
+
|
| 306 |
+
/* Content Styling */
|
| 307 |
+
.card-content {
|
| 308 |
+
font-size: 15px;
|
| 309 |
+
color: #374151;
|
| 310 |
+
line-height: 1.6;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
.card-content code {
|
| 314 |
+
background-color: #f3f4f6;
|
| 315 |
+
padding: 2px 6px;
|
| 316 |
+
border-radius: 4px;
|
| 317 |
+
font-family: monospace;
|
| 318 |
+
font-size: 0.9em;
|
| 319 |
+
color: #c2410c;
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
/* Grid for details */
|
| 323 |
+
.detail-grid {
|
| 324 |
+
display: grid;
|
| 325 |
+
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
|
| 326 |
+
gap: 12px;
|
| 327 |
+
margin-top: 12px;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
.detail-item {
|
| 331 |
+
background: #f9fafb;
|
| 332 |
+
padding: 10px;
|
| 333 |
+
border-radius: 6px;
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
.detail-label {
|
| 337 |
+
font-size: 0.8em;
|
| 338 |
+
color: #6b7280;
|
| 339 |
+
text-transform: uppercase;
|
| 340 |
+
letter-spacing: 0.05em;
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
.detail-value {
|
| 344 |
+
font-weight: 500;
|
| 345 |
+
color: #111827;
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
/* Summary Stats */
|
| 349 |
+
.stats-container {
|
| 350 |
+
display: grid;
|
| 351 |
+
grid-template-columns: repeat(3, 1fr);
|
| 352 |
+
gap: 16px;
|
| 353 |
+
margin-bottom: 24px;
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
.stat-card {
|
| 357 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 358 |
+
color: white;
|
| 359 |
+
padding: 20px;
|
| 360 |
+
border-radius: 12px;
|
| 361 |
+
text-align: center;
|
| 362 |
+
box-shadow: 0 4px 6px rgba(102, 126, 234, 0.25);
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
.stat-value {
|
| 366 |
+
font-size: 2em;
|
| 367 |
+
font-weight: 700;
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
.stat-label {
|
| 371 |
+
font-size: 0.9em;
|
| 372 |
+
opacity: 0.9;
|
| 373 |
+
margin-top: 4px;
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
/* Button styling */
|
| 377 |
+
.primary-btn {
|
| 378 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
| 379 |
+
border: none !important;
|
| 380 |
+
font-weight: 600 !important;
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
/* Tab styling */
|
| 384 |
+
.tab-nav button {
|
| 385 |
+
font-weight: 500 !important;
|
| 386 |
+
font-size: 15px !important;
|
| 387 |
+
}
|
| 388 |
+
"""
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
def create_config_from_ui(
    check_metadata: bool,
    check_usage: bool,
    check_duplicates: bool,
    check_preprint_ratio: bool,
    caption: bool,
    reference: bool,
    formatting: bool,
    equation: bool,
    ai_artifacts: bool,
    sentence: bool,
    consistency: bool,
    acronym: bool,
    number: bool,
    citation_quality: bool,
    anonymization: bool
) -> BibGuardConfig:
    """Build a BibGuardConfig reflecting the checkbox state of the web UI."""
    bibliography = BibliographyConfig(
        check_metadata=check_metadata,
        check_usage=check_usage,
        check_duplicates=check_duplicates,
        check_preprint_ratio=check_preprint_ratio,
        check_relevance=False,  # LLM relevance check is disabled for the web UI
    )

    submission = SubmissionConfig(
        caption=caption,
        reference=reference,
        formatting=formatting,
        equation=equation,
        ai_artifacts=ai_artifacts,
        sentence=sentence,
        consistency=consistency,
        acronym=acronym,
        number=number,
        citation_quality=citation_quality,
        anonymization=anonymization,
    )

    config = BibGuardConfig()
    config.bibliography = bibliography
    config.submission = submission
    config.output = OutputConfig(quiet=True, minimal_verified=False)
    return config
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
def generate_bibliography_html(report_gen: "ReportGenerator", entries: list) -> str:
    """Generate the HTML bibliography report.

    Args:
        report_gen: ReportGenerator whose ``entries`` holds per-entry reports
            (each exposing ``entry``, ``comparison``, ``usage``).
        entries: Parsed bib entries; used only for the "Total Entries" stat.

    Returns:
        A single HTML string: three summary stat cards followed by one card
        per entry. All bib-supplied text is HTML-escaped before interpolation
        (titles/authors can legitimately contain ``<``, ``&`` etc.).
    """
    from html import escape  # local import; `html` is reused as the buffer below

    html = ['<div class="scrollable-report-area">']

    # 1. Summary stats
    total = len(entries)
    verified = sum(1 for e in report_gen.entries if e.comparison and e.comparison.is_match)
    used = sum(1 for e in report_gen.entries if e.usage and e.usage.is_used)

    html.append('<div class="stats-container">')
    html.append(f'<div class="stat-card"><div class="stat-value">{total}</div><div class="stat-label">Total Entries</div></div>')
    html.append(f'<div class="stat-card"><div class="stat-value">{verified}</div><div class="stat-label">Verified</div></div>')
    html.append(f'<div class="stat-card"><div class="stat-value">{used}</div><div class="stat-label">Used in Text</div></div>')
    html.append('</div>')

    # 2. One card per entry
    for report in report_gen.entries:
        entry = report.entry
        status_badges = []

        # Metadata verification badge(s)
        if report.comparison:
            if report.comparison.is_match:
                status_badges.append('<span class="badge badge-success">✓ Verified</span>')
                if report.comparison.source:
                    status_badges.append(f'<span class="badge badge-info">{escape(report.comparison.source.upper())}</span>')
            else:
                status_badges.append('<span class="badge badge-error">⚠ Metadata Mismatch</span>')
        else:
            status_badges.append('<span class="badge badge-neutral">No Metadata Check</span>')

        # Usage badge
        if report.usage:
            if report.usage.is_used:
                status_badges.append(f'<span class="badge badge-success">Used: {report.usage.usage_count}x</span>')
            else:
                status_badges.append('<span class="badge badge-warning">Unused</span>')

        # Detail grid: build (label, value_html) pairs explicitly instead of
        # the previous lambda-IIFE + filter(None, ...) construct.
        details = [
            ("Authors", escape(entry.author or "N/A")),
            ("Venue", escape(entry.journal or entry.booktitle or entry.publisher or "N/A")),
        ]
        if entry.doi:
            details.append(("DOI", escape(entry.doi)))
        if entry.arxiv_id and not entry.doi:
            details.append(("ArXiv", escape(entry.arxiv_id)))
        if entry.volume or entry.pages:
            vol = f"Vol.{entry.volume}" if entry.volume else ""
            pages = f"pp.{entry.pages}" if entry.pages else ""
            details.append(("Volume/Pages", escape(f"{vol} {pages}".strip())))
        if entry.url:
            # Only the href attribute needs quoting; the anchor markup itself
            # is intentional HTML and must not be escaped.
            details.append(("URL", f'<a href="{escape(entry.url, quote=True)}" target="_blank" style="text-decoration:underline;">Link</a>'))

        detail_html = "".join(
            f'<div class="detail-item"><div class="detail-label">{label}</div><div class="detail-value">{value}</div></div>'
            for label, value in details
        )

        html.append(f'''
        <div class="report-card">
            <div class="card-header">
                <div>
                    <h3 class="card-title">{escape(entry.title or "No Title")}</h3>
                    <div class="card-subtitle">{escape(str(entry.key))} • {escape(str(entry.year))} • {escape(str(entry.entry_type))}</div>
                </div>
                <div style="display: flex; gap: 8px;">
                    {" ".join(status_badges)}
                </div>
            </div>

            <div class="card-content">
                <div class="detail-grid">
                {detail_html}
                </div>
        ''')

        # Issues (only rendered when metadata verification failed)
        issues = []
        if report.comparison and not report.comparison.is_match:
            if report.comparison.issues:
                for issue in report.comparison.issues:
                    issues.append(f'<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• {escape(str(issue))}</div>')
            else:
                issues.append('<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• Verification failed</div>')

        if issues:
            html.append('<div style="margin-top: 16px; padding-top: 12px; border-top: 1px solid #eee;">')
            html.append("".join(issues))
            html.append('</div>')

        html.append('</div></div>')  # Close card-content and report-card

    html.append('</div>')  # Close container
    return "".join(html)
|
| 526 |
+
|
| 527 |
+
def generate_latex_html(results: list) -> str:
    """Generate the HTML report for LaTeX quality-check results.

    Args:
        results: Check results exposing ``severity``, ``checker_name``,
            ``line_number``, ``message`` and optional ``line_content`` /
            ``suggestion``. The list is sorted in place by checker name.

    Returns:
        HTML string: three severity stat cards followed by one card per
        issue, or a success card when there are no issues. Checker-supplied
        text is HTML-escaped (LaTeX snippets often contain ``<``, ``&``).
    """
    from html import escape  # local import; `html` is reused as the buffer below
    from src.checkers import CheckSeverity

    html = ['<div class="scrollable-report-area">']

    # Severity tallies for the stat cards
    errors = sum(1 for r in results if r.severity == CheckSeverity.ERROR)
    warnings = sum(1 for r in results if r.severity == CheckSeverity.WARNING)
    infos = sum(1 for r in results if r.severity == CheckSeverity.INFO)

    html.append('<div class="stats-container">')
    html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #ef4444 0%, #b91c1c 100%);"><div class="stat-value">{errors}</div><div class="stat-label">Errors</div></div>')
    html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%);"><div class="stat-value">{warnings}</div><div class="stat-label">Warnings</div></div>')
    html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%);"><div class="stat-value">{infos}</div><div class="stat-label">Suggestions</div></div>')
    html.append('</div>')

    if not results:
        html.append('<div class="report-card"><div class="card-content" style="text-align: center; padding: 40px; color: #166534; font-size: 1.2em;">✅ No issues found in LaTeX code!</div></div>')
    else:
        # Group cards by checker (in-place sort, as before)
        results.sort(key=lambda x: x.checker_name)

        for result in results:
            badge_class = "badge-neutral"
            if result.severity == CheckSeverity.ERROR:
                badge_class = "badge-error"
            elif result.severity == CheckSeverity.WARNING:
                badge_class = "badge-warning"
            elif result.severity == CheckSeverity.INFO:
                badge_class = "badge-info"

            # Optional blocks, escaped before interpolation into the card.
            line_block = (
                f'<div style="margin-top: 8px; background: #f3f4f6; padding: 8px; border-radius: 4px; font-family: monospace;">{escape(result.line_content)}</div>'
                if result.line_content else ''
            )
            suggestion_block = (
                f'<div style="margin-top: 8px; color: #166534;">💡 Suggestion: {escape(result.suggestion)}</div>'
                if result.suggestion else ''
            )

            html.append(f'''
            <div class="report-card">
                <div class="card-header">
                    <div>
                        <h3 class="card-title">{escape(result.checker_name)}</h3>
                        <div class="card-subtitle">Line {result.line_number}</div>
                    </div>
                    <span class="badge {badge_class}">{result.severity.name}</span>
                </div>
                <div class="card-content">
                    {escape(result.message)}
                    {line_block}
                    {suggestion_block}
                </div>
            </div>
            ''')

    html.append('</div>')
    return "".join(html)
|
| 576 |
+
|
| 577 |
+
def generate_line_html(content: str, results: list) -> str:
    """Generate an HTML report mapping each issue onto its source line.

    Args:
        content: Raw LaTeX source text (split on ``\\n`` for 1-based lines).
        results: Check results exposing ``line_number`` and ``message``.

    Returns:
        HTML string; only lines with at least one issue are rendered. Source
        lines and messages are HTML-escaped so TeX markup (``<``, ``&``)
        cannot break the report layout.
    """
    from html import escape  # local import; `html` is reused as the buffer below

    # line_number -> list of issues on that line
    issues_by_line = {}
    for r in results:
        issues_by_line.setdefault(r.line_number, []).append(r)

    lines = content.split('\n')

    html = ['<div class="scrollable-report-area">']
    html.append('<div class="report-card"><div class="card-content">Issues are mapped to specific lines below.</div></div>')

    for i, line in enumerate(lines, 1):
        if i not in issues_by_line:
            continue  # only flagged lines are rendered

        html.append(f'''
        <div class="report-card" style="border-left: 4px solid #ef4444; padding: 12px;">
            <div style="font-family: monospace; color: #6b7280; font-size: 0.9em; margin-bottom: 4px;">Line {i}</div>
            <div style="font-family: monospace; background: #fee2e2; padding: 4px; border-radius: 4px; overflow-x: auto; white-space: pre;">{escape(line)}</div>
            <div style="margin-top: 8px;">
        ''')

        for issue in issues_by_line[i]:
            html.append(f'<div style="color: #991b1b; font-size: 0.95em; margin-top: 4px;">• {escape(str(issue.message))}</div>')

        html.append('</div></div>')

    html.append('</div>')
    return "".join(html)
|
| 611 |
+
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
def run_check(
    bib_file,
    tex_file,
    check_metadata: bool,
    check_usage: bool,
    check_duplicates: bool,
    check_preprint_ratio: bool,
    caption: bool,
    reference: bool,
    formatting: bool,
    equation: bool,
    ai_artifacts: bool,
    sentence: bool,
    consistency: bool,
    acronym: bool,
    number: bool,
    citation_quality: bool,
    anonymization: bool,
    progress=gr.Progress()
) -> Tuple[str, str, str]:
    """Run all BibGuard checks on the uploaded files.

    Args:
        bib_file: Gradio file object for the uploaded ``.bib`` file (or None).
        tex_file: Gradio file object for the uploaded ``.tex`` file (or None).
        check_*/caption/.../anonymization: Checkbox states from the UI,
            forwarded to :func:`create_config_from_ui`.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        Tuple of three HTML strings: (bibliography report, LaTeX quality
        report, line-by-line report). On failure, all three carry the same
        HTML-formatted error message.
    """

    if bib_file is None or tex_file is None:
        return (
            "⚠️ Please upload both `.bib` and `.tex` files.",
            "⚠️ Please upload both `.bib` and `.tex` files.",
            "⚠️ Please upload both `.bib` and `.tex` files."
        )

    try:
        # Create config from UI
        config = create_config_from_ui(
            check_metadata, check_usage, check_duplicates, check_preprint_ratio,
            caption, reference, formatting, equation, ai_artifacts,
            sentence, consistency, acronym, number, citation_quality, anonymization
        )

        # Get file paths from uploaded files
        bib_path = bib_file.name
        tex_path = tex_file.name

        # Read tex content for checkers (replace undecodable bytes rather than crash)
        tex_content = Path(tex_path).read_text(encoding='utf-8', errors='replace')

        # Parse files
        bib_parser = BibParser()
        entries = bib_parser.parse_file(bib_path)

        tex_parser = TexParser()
        tex_parser.parse_file(tex_path)

        bib_config = config.bibliography

        # Initialize components lazily; each stays None unless its check is enabled
        arxiv_fetcher = None
        crossref_fetcher = None
        semantic_scholar_fetcher = None
        openalex_fetcher = None
        dblp_fetcher = None
        comparator = None
        usage_checker = None
        duplicate_detector = None

        if bib_config.check_metadata:
            arxiv_fetcher = ArxivFetcher()
            semantic_scholar_fetcher = SemanticScholarFetcher()
            openalex_fetcher = OpenAlexFetcher()
            dblp_fetcher = DBLPFetcher()
            crossref_fetcher = CrossRefFetcher()
            comparator = MetadataComparator()

        if bib_config.check_usage:
            usage_checker = UsageChecker(tex_parser)

        if bib_config.check_duplicates:
            duplicate_detector = DuplicateDetector()

        # Initialize report generator
        report_gen = ReportGenerator(
            minimal_verified=False,
            check_preprint_ratio=bib_config.check_preprint_ratio,
            preprint_warning_threshold=bib_config.preprint_warning_threshold
        )
        report_gen.set_metadata([bib_file.name], [tex_file.name])

        # Run submission quality checks
        progress(0.2, desc="Running LaTeX quality checks...")
        submission_results = []
        enabled_checkers = config.submission.get_enabled_checkers()

        for checker_name in enabled_checkers:
            if checker_name in CHECKER_REGISTRY:
                checker = CHECKER_REGISTRY[checker_name]()
                results = checker.check(tex_content, {})
                # Tag each result with its source file for the report
                for r in results:
                    r.file_path = tex_file.name
                submission_results.extend(results)

        report_gen.set_submission_results(submission_results, None)

        # Check for duplicates
        if bib_config.check_duplicates and duplicate_detector:
            duplicate_groups = duplicate_detector.find_duplicates(entries)
            report_gen.set_duplicate_groups(duplicate_groups)

        # Check missing citations
        if bib_config.check_usage and usage_checker:
            missing = usage_checker.get_missing_entries(entries)
            report_gen.set_missing_citations(missing)

        # Build workflow
        workflow_config = get_default_workflow()

        # Process entries (progress spans 0.3 -> 0.8; loop is skipped for empty bibs)
        progress(0.3, desc="Processing bibliography entries...")
        total_entries = len(entries)

        for i, entry in enumerate(entries):
            progress(0.3 + 0.5 * (i / total_entries), desc=f"Checking: {entry.key}")

            # Check usage
            usage_result = None
            if usage_checker:
                usage_result = usage_checker.check_usage(entry)

            # Fetch and compare metadata (network-bound, hence optional)
            comparison_result = None
            if bib_config.check_metadata and comparator:
                comparison_result = fetch_and_compare_with_workflow(
                    entry, workflow_config, arxiv_fetcher, crossref_fetcher,
                    semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
                )

            # Create entry report
            entry_report = EntryReport(
                entry=entry,
                comparison=comparison_result,
                usage=usage_result,
                evaluations=[]
            )
            report_gen.add_entry_report(entry_report)

        progress(0.85, desc="Generating structured reports...")

        # Generate Bibliography HTML Report
        bib_report = generate_bibliography_html(report_gen, entries)

        # Generate LaTeX Quality HTML Report
        latex_report = generate_latex_html(submission_results)

        # Generate Line-by-Line HTML Report
        if submission_results:
            line_report = generate_line_html(tex_content, submission_results)
        else:
            line_report = '<div class="report-container"><div class="report-card"><div class="card-content">No issues to display line-by-line.</div></div></div>'

        progress(1.0, desc="Done!")

        return bib_report, latex_report, line_report

    except Exception as e:
        import traceback
        from html import escape
        # The outputs feed gr.HTML components, so render the failure as HTML
        # (markdown code fences would show up as literal backticks here).
        error_msg = (
            '<div class="report-card"><div class="card-content">'
            f'❌ Error: {escape(str(e))}'
            f'<pre style="white-space: pre-wrap;">{escape(traceback.format_exc())}</pre>'
            '</div></div>'
        )
        return error_msg, error_msg, error_msg
|
| 781 |
+
|
| 782 |
+
|
| 783 |
+
|
| 784 |
+
def create_app():
    """Create and configure the Gradio app.

    Returns:
        The configured ``gr.Blocks`` instance (launched by the caller).
    """

    # Load the icon as an inline base64 <img>; fall back to an emoji on any failure.
    try:
        icon_path = Path("assets/icon-192.png")
        if icon_path.exists():
            encoding = base64.b64encode(icon_path.read_bytes()).decode()
            icon_html = f'<img src="data:image/png;base64,{encoding}" style="width: 48px; height: 48px; border-radius: 8px;" alt="BibGuard">'
        else:
            icon_html = '<span style="font-size: 48px;">📚</span>'
    except Exception:
        icon_html = '<span style="font-size: 48px;">📚</span>'

    # css/theme must be set on gr.Blocks itself — Blocks.launch() does not
    # accept them, so passing them at launch time never applied the styling.
    with gr.Blocks(
        title="BibGuard - Bibliography & LaTeX Quality Checker",
        css=CUSTOM_CSS,
        theme=gr.themes.Soft(),
    ) as app:

        # Header with icon
        with gr.Row(elem_classes=["app-header"]):
            gr.HTML(f"""
                <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 16px;">
                    {icon_html}
                    <div>
                        <h1 style="margin: 0; font-size: 1.8em;">BibGuard</h1>
                        <p style="margin: 0; color: #666; font-size: 14px;">Bibliography & LaTeX Quality Checker</p>
                    </div>
                </div>
            """)

        with gr.Row(elem_classes=["app-body"]):
            # Left column: Upload & Settings
            with gr.Column(scale=1, min_width=280, elem_classes=["app-sidebar"]):
                gr.Markdown("### 📁 Upload Files")

                bib_file = gr.File(
                    label="Bibliography (.bib)",
                    file_types=[".bib"],
                    file_count="single"
                )

                tex_file = gr.File(
                    label="LaTeX Source (.tex)",
                    file_types=[".tex"],
                    file_count="single"
                )

                # Check options in grid layout
                gr.Markdown("#### ⚙️ Options")

                with gr.Row():
                    check_metadata = gr.Checkbox(label="🔍 Metadata", value=False)
                    check_usage = gr.Checkbox(label="📊 Usage", value=True)

                with gr.Row():
                    check_duplicates = gr.Checkbox(label="👯 Duplicates", value=True)
                    check_preprint_ratio = gr.Checkbox(label="📄 Preprints", value=True)

                with gr.Row():
                    caption = gr.Checkbox(label="🖼️ Captions", value=True)
                    reference = gr.Checkbox(label="🔗 References", value=True)

                with gr.Row():
                    formatting = gr.Checkbox(label="✨ Formatting", value=True)
                    equation = gr.Checkbox(label="🔢 Equations", value=True)

                with gr.Row():
                    ai_artifacts = gr.Checkbox(label="🤖 AI Artifacts", value=True)
                    sentence = gr.Checkbox(label="📝 Sentences", value=True)

                with gr.Row():
                    consistency = gr.Checkbox(label="🔄 Consistency", value=True)
                    acronym = gr.Checkbox(label="🔤 Acronyms", value=True)

                with gr.Row():
                    number = gr.Checkbox(label="🔢 Numbers", value=True)
                    citation_quality = gr.Checkbox(label="📚 Citations", value=True)

                with gr.Row():
                    anonymization = gr.Checkbox(label="🎭 Anonymization", value=True)

                run_btn = gr.Button("🔍 Check Now", variant="primary", size="lg")

                gr.HTML("""
                    <div style="text-align: center; margin-top: 16px;">
                        <a href="https://github.com/thinkwee/BibGuard" target="_blank" style="text-decoration: none; color: #666; display: inline-flex; align-items: center; gap: 6px;">
                            <svg height="20" width="20" viewBox="0 0 16 16"><path fill="currentColor" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"></path></svg>
                            GitHub
                        </a>
                        <p style="margin: 8px 0 0 0; color: #999; font-size: 12px;">Developed with ❤️ for researchers</p>
                    </div>
                """)

            # Right column: Reports
            with gr.Column(scale=4, elem_classes=["app-content"]):
                with gr.Tabs():
                    with gr.Tab("📚 Bibliography Report"):
                        bib_report = gr.HTML(
                            value=WELCOME_HTML,
                            elem_classes=["report-panel"]
                        )

                    with gr.Tab("📝 LaTeX Quality"):
                        latex_report = gr.HTML(
                            value=WELCOME_HTML,
                            elem_classes=["report-panel"]
                        )

                    with gr.Tab("📋 Line-by-Line"):
                        line_report = gr.HTML(
                            value=WELCOME_HTML,
                            elem_classes=["report-panel"]
                        )

        # Event handling: wire the button to run_check; input order must match
        # run_check's parameter order exactly.
        run_btn.click(
            fn=run_check,
            inputs=[
                bib_file, tex_file,
                check_metadata, check_usage, check_duplicates, check_preprint_ratio,
                caption, reference, formatting, equation, ai_artifacts,
                sentence, consistency, acronym, number, citation_quality, anonymization
            ],
            outputs=[bib_report, latex_report, line_report]
        )

    return app
|
| 911 |
+
|
| 912 |
+
|
| 913 |
+
# Create the module-level app instance (Hugging Face Spaces imports `app`).
app = create_app()

if __name__ == "__main__":
    # NOTE: `css` and `theme` are gr.Blocks() constructor arguments, not
    # launch() arguments — passing them here raises TypeError on current
    # Gradio versions, so they are intentionally omitted.
    app.launch(
        favicon_path="assets/icon-192.png",
        show_error=True,
    )
|
app_helper.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def fetch_and_compare_with_workflow(
    entry, workflow_steps, arxiv_fetcher, crossref_fetcher,
    semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
):
    """Fetch metadata from online sources and compare it against a bib entry.

    Sources are tried in a fixed preference order (DBLP, Semantic Scholar,
    OpenAlex, CrossRef, arXiv); any fetcher passed as None is skipped. The
    first exact match is returned immediately; otherwise the highest-
    confidence partial result wins. Each source is best-effort: network or
    parsing errors simply fall through to the next source.

    Args:
        entry: Parsed bib entry (title/doi/arxiv_id attributes are consulted).
        workflow_steps: Reserved for a configurable workflow; currently
            unused — the hard-coded source order above is always applied.
        *_fetcher: Source-specific fetchers, or None to disable a source.
        comparator: MetadataComparator used to score each fetched record.

    Returns:
        The first matching comparison result, the highest-confidence partial
        result, or an "unable" result when no source returned anything.
    """
    results = []

    # 1. DBLP (high quality for CS venues)
    if dblp_fetcher and entry.title:
        try:
            dblp_result = dblp_fetcher.search_by_title(entry.title)
            if dblp_result:
                res = comparator.compare_with_dblp(entry, dblp_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass  # best-effort: fall through to the next source

    # 2. Semantic Scholar (comprehensive; prefer DOI lookup when available)
    if semantic_scholar_fetcher and entry.title:
        try:
            ss_result = None
            if entry.doi:
                ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
            if not ss_result:
                ss_result = semantic_scholar_fetcher.search_by_title(entry.title)

            if ss_result:
                res = comparator.compare_with_semantic_scholar(entry, ss_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # 3. OpenAlex (DOI lookup first, then title search)
    if openalex_fetcher and entry.title:
        try:
            oa_result = None
            if entry.doi:
                oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
            if not oa_result:
                oa_result = openalex_fetcher.search_by_title(entry.title)

            if oa_result:
                res = comparator.compare_with_openalex(entry, oa_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # 4. CrossRef (official publisher metadata; requires a DOI)
    if crossref_fetcher and entry.doi:
        try:
            crossref_result = crossref_fetcher.search_by_doi(entry.doi)
            if crossref_result:
                res = comparator.compare_with_crossref(entry, crossref_result)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # 5. ArXiv (direct id lookup when the entry carries one, else title search)
    if arxiv_fetcher:
        try:
            arxiv_meta = None
            if entry.has_arxiv:
                arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
            elif entry.title:
                search_results = arxiv_fetcher.search_by_title(entry.title, max_results=1)
                if search_results:
                    arxiv_meta = search_results[0]

            if arxiv_meta:
                res = comparator.compare_with_arxiv(entry, arxiv_meta)
                if res.is_match:
                    return res
                results.append(res)
        except Exception:
            pass

    # No exact match: return the highest-confidence partial result, if any
    if results:
        return max(results, key=lambda r: r.confidence)

    # Absolutely nothing found in any source
    return comparator.create_unable_result(entry, "No metadata found in any source")
|
assets/icon-192.png
ADDED
|
|
Git LFS Details
|
assets/icon-512.png
ADDED
|
|
Git LFS Details
|
bibguard.yaml
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# BibGuard Configuration File
|
| 3 |
+
# ==============================================================================
|
| 4 |
+
#
|
| 5 |
+
# Usage: python main.py --config bibguard.yaml
|
| 6 |
+
# python main.py (auto-detect bibguard.yaml in current/parent directories)
|
| 7 |
+
#
|
| 8 |
+
# All paths are relative to this configuration file's directory.
|
| 9 |
+
|
| 10 |
+
# ==============================================================================
|
| 11 |
+
# 📁 File Settings
|
| 12 |
+
# ==============================================================================
|
| 13 |
+
files:
|
| 14 |
+
# Required: Path to your .bib bibliography file
|
| 15 |
+
bib: "test.bib"
|
| 16 |
+
|
| 17 |
+
# Required: Path to your .tex LaTeX source file
|
| 18 |
+
tex: "test.tex"
|
| 19 |
+
|
| 20 |
+
# Optional: Directory path for recursive scanning (Experimental)
|
| 21 |
+
# When set, BibGuard will recursively search for all .tex and .bib files in this directory.
|
| 22 |
+
# This mode is an alternative to 'bib' and 'tex'. Use either this OR bib/tex, not both.
|
| 23 |
+
# input_dir: "./paper_project"
|
| 24 |
+
|
| 25 |
+
# Output directory for all generated reports and files (default: bibguard_output)
|
| 26 |
+
# All outputs including reports, cleaned .bib, and input file copies will be saved here
|
| 27 |
+
output_dir: "test"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# ==============================================================================
|
| 31 |
+
# 🎓 Conference Template
|
| 32 |
+
# ==============================================================================
|
| 33 |
+
# Specify a conference template for venue-specific checks and formatting rules.
|
| 34 |
+
# Available templates: acl, emnlp, naacl, cvpr, iccv, eccv, neurips, icml, iclr
|
| 35 |
+
# Leave empty ("") to skip template-specific checks.
|
| 36 |
+
template: ""
|
| 37 |
+
|
| 38 |
+
# ==============================================================================
|
| 39 |
+
# 📚 Bibliography Checks
|
| 40 |
+
# ==============================================================================
|
| 41 |
+
bibliography:
|
| 42 |
+
# Metadata Validation - Verify bib entries against online databases (arXiv, CrossRef, etc.)
|
| 43 |
+
# Detects incorrect titles, authors, venues, and publication years
|
| 44 |
+
# ⚠️ It will take some time to check metadata since it needs to query multiple online sources. Set it to false if you don't need to check metadata.
|
| 45 |
+
check_metadata: true
|
| 46 |
+
|
| 47 |
+
# Usage Check - Detect unused bib entries and missing citations
|
| 48 |
+
# Identifies entries in .bib not cited in .tex, and citations without bib entries
|
| 49 |
+
check_usage: true
|
| 50 |
+
|
| 51 |
+
# Duplicate Detection - Find duplicate entries with different keys
|
| 52 |
+
# Uses fuzzy matching on titles and DOIs to identify the same paper cited multiple times
|
| 53 |
+
check_duplicates: true
|
| 54 |
+
|
| 55 |
+
# Preprint Ratio Check - Warn if too many references are preprints
|
| 56 |
+
# Detects arXiv, bioRxiv, and other preprints. Warns if ratio exceeds threshold.
|
| 57 |
+
check_preprint_ratio: true
|
| 58 |
+
preprint_warning_threshold: 0.50 # Warn if more than 50% of used entries are preprints
|
| 59 |
+
|
| 60 |
+
# Relevance Assessment - Use LLM to evaluate if citations match their context
|
| 61 |
+
# Requires LLM configuration (see llm section below). Disabled by default due to API costs.
|
| 62 |
+
check_relevance: false
|
| 63 |
+
|
| 64 |
+
# ==============================================================================
|
| 65 |
+
# 📋 Submission Quality Checks
|
| 66 |
+
# ==============================================================================
|
| 67 |
+
submission:
|
| 68 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 69 |
+
# Format Checks
|
| 70 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 71 |
+
|
| 72 |
+
# Caption Position - Ensure table captions are above, figure captions below
|
| 73 |
+
# Checks \caption placement relative to \begin{table}/\begin{figure}
|
| 74 |
+
caption: true
|
| 75 |
+
|
| 76 |
+
# Cross-References - Verify all figures/tables/sections are referenced in text
|
| 77 |
+
# Detects orphaned floats that are never mentioned
|
| 78 |
+
reference: true
|
| 79 |
+
|
| 80 |
+
# Formatting Standards - Check citation format, spacing, special characters
|
| 81 |
+
# Validates \cite{} usage, non-breaking spaces, proper quotation marks, etc.
|
| 82 |
+
formatting: true
|
| 83 |
+
|
| 84 |
+
# Equation Checks - Verify equation punctuation and numbering consistency
|
| 85 |
+
# Ensures equations end with proper punctuation and labels are used correctly
|
| 86 |
+
equation: true
|
| 87 |
+
|
| 88 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 89 |
+
# Writing Quality
|
| 90 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 91 |
+
|
| 92 |
+
# AI Artifacts - Detect traces of AI-generated text
|
| 93 |
+
# Flags phrases like "Sure, here is...", "As an AI...", "It's important to note..."
|
| 94 |
+
ai_artifacts: true
|
| 95 |
+
|
| 96 |
+
# Sentence Quality - Identify overly long sentences, weak openings, redundant phrases
|
| 97 |
+
# Helps improve readability and academic writing style
|
| 98 |
+
sentence: true
|
| 99 |
+
|
| 100 |
+
# Terminology Consistency - Check for inconsistent spelling, hyphenation, US/UK variants
|
| 101 |
+
# Examples: "deep learning" vs "deep-learning", "color" vs "colour"
|
| 102 |
+
consistency: true
|
| 103 |
+
|
| 104 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 105 |
+
# Academic Standards
|
| 106 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 107 |
+
|
| 108 |
+
# Acronym Definitions - Ensure acronyms are defined on first use
|
| 109 |
+
# Example: "Natural Language Processing (NLP)" before using "NLP" alone
|
| 110 |
+
acronym: true
|
| 111 |
+
|
| 112 |
+
# Number Formatting - Check percentage formatting consistency
|
| 113 |
+
# Ensures no space before % sign and consistent use of '%' vs 'percent'
|
| 114 |
+
number: true
|
| 115 |
+
|
| 116 |
+
# Citation Quality - Flag outdated references and citation formatting issues
|
| 117 |
+
# Warns about papers older than 30 years and checks citation formatting (et al., hardcoded citations)
|
| 118 |
+
citation_quality: true
|
| 119 |
+
|
| 120 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 121 |
+
# Review Compliance
|
| 122 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 123 |
+
|
| 124 |
+
# Anonymization - Check double-blind review compliance
|
| 125 |
+
# Detects GitHub links, acknowledgments, self-citations that may reveal author identity
|
| 126 |
+
anonymization: true
|
| 127 |
+
|
| 128 |
+
# ==============================================================================
|
| 129 |
+
# 🔍 Metadata Check Workflow
|
| 130 |
+
# ==============================================================================
|
| 131 |
+
# Define the data sources and order for metadata validation.
|
| 132 |
+
# BibGuard will try each enabled source in sequence until a match is found.
|
| 133 |
+
# Set enabled: false to skip a particular source.
|
| 134 |
+
workflow:
|
| 135 |
+
- name: arxiv_id
|
| 136 |
+
enabled: true
|
| 137 |
+
description: "Lookup by arXiv ID (fastest, most reliable for preprints)"
|
| 138 |
+
|
| 139 |
+
- name: crossref_doi
|
| 140 |
+
enabled: true
|
| 141 |
+
description: "Lookup by DOI via CrossRef (authoritative for published papers)"
|
| 142 |
+
|
| 143 |
+
- name: semantic_scholar
|
| 144 |
+
enabled: true
|
| 145 |
+
description: "Semantic Scholar API (good coverage, includes citations)"
|
| 146 |
+
|
| 147 |
+
- name: dblp
|
| 148 |
+
enabled: true
|
| 149 |
+
description: "DBLP database (comprehensive for computer science papers)"
|
| 150 |
+
|
| 151 |
+
- name: openalex
|
| 152 |
+
enabled: true
|
| 153 |
+
description: "OpenAlex API (broad coverage across disciplines)"
|
| 154 |
+
|
| 155 |
+
- name: arxiv_title
|
| 156 |
+
enabled: true
|
| 157 |
+
description: "Search arXiv by title (fallback when ID unavailable)"
|
| 158 |
+
|
| 159 |
+
- name: crossref_title
|
| 160 |
+
enabled: true
|
| 161 |
+
description: "Search CrossRef by title (fallback when DOI unavailable)"
|
| 162 |
+
|
| 163 |
+
- name: google_scholar
|
| 164 |
+
enabled: false # May be rate-limited, disabled by default
|
| 165 |
+
description: "Google Scholar web scraping (use as last resort)"
|
| 166 |
+
|
| 167 |
+
# ==============================================================================
|
| 168 |
+
# 🤖 LLM Configuration (for Relevance Checking)
|
| 169 |
+
# ==============================================================================
|
| 170 |
+
llm:
|
| 171 |
+
# Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek
|
| 172 |
+
# Each backend requires different setup (API keys, local installation, etc.)
|
| 173 |
+
backend: "gemini"
|
| 174 |
+
|
| 175 |
+
# Model name (leave empty to use backend default)
|
| 176 |
+
# Examples: "gpt-4", "claude-3-opus", "gemini-pro", "llama3"
|
| 177 |
+
model: ""
|
| 178 |
+
|
| 179 |
+
# API endpoint (leave empty to use backend default)
|
| 180 |
+
# Only needed for self-hosted models (vllm, ollama) or custom endpoints
|
| 181 |
+
endpoint: ""
|
| 182 |
+
|
| 183 |
+
# API key (recommended to use environment variables instead)
|
| 184 |
+
# Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. in your environment
|
| 185 |
+
api_key: ""
|
| 186 |
+
|
| 187 |
+
# ==============================================================================
|
| 188 |
+
# 📊 Output Settings
|
| 189 |
+
# ==============================================================================
|
| 190 |
+
output:
|
| 191 |
+
# Quiet mode - Suppress progress messages, only output final reports
|
| 192 |
+
# Useful for CI/CD pipelines or batch processing
|
| 193 |
+
quiet: false
|
| 194 |
+
|
| 195 |
+
# Minimal verified entries - Hide detailed info for entries that passed all checks
|
| 196 |
+
# Reduces report size when you only care about issues
|
| 197 |
+
minimal_verified: false
|
main.py
ADDED
|
@@ -0,0 +1,564 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
BibGuard - Bibliography Checker & Paper Submission Quality Tool
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python main.py # Use bibguard.yaml in current directory
|
| 7 |
+
python main.py --config my.yaml # Use specified config file
|
| 8 |
+
python main.py --init # Create default config file
|
| 9 |
+
python main.py --list-templates # List available templates
|
| 10 |
+
"""
|
| 11 |
+
import argparse
|
| 12 |
+
import sys
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Optional, List
|
| 15 |
+
|
| 16 |
+
from src.parsers import BibParser, TexParser
|
| 17 |
+
from src.fetchers import ArxivFetcher, ScholarFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher
|
| 18 |
+
from src.analyzers import MetadataComparator, UsageChecker, LLMEvaluator, DuplicateDetector
|
| 19 |
+
from src.analyzers.llm_evaluator import LLMBackend
|
| 20 |
+
from src.report.generator import ReportGenerator, EntryReport
|
| 21 |
+
from src.utils.progress import ProgressDisplay
|
| 22 |
+
from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config
|
| 23 |
+
from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
|
| 24 |
+
from src.templates.base_template import get_template, get_all_templates
|
| 25 |
+
from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def main() -> None:
    """CLI entry point for BibGuard.

    Parses command-line arguments and dispatches to one of three modes:
      * ``--init``            — write a default config file and exit(0).
      * ``--list-templates``  — print the available conference templates and exit(0).
      * default               — locate/load the YAML config, validate the declared
                                input files, then hand off to ``run_checker``.

    Every validation failure prints a human-readable error and exits with a
    non-zero status; nothing is raised to the caller.
    """
    parser = argparse.ArgumentParser(
        description="BibGuard: Bibliography Checker & Paper Submission Quality Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Usage Examples:
  python main.py                    # Auto-detect config.yaml in current directory
  python main.py --config my.yaml   # Use specified config file
  python main.py --init             # Create default config.yaml
  python main.py --list-templates   # List available conference templates
"""
    )

    parser.add_argument(
        "--config", "-c",
        help="Config file path (default: auto-detect config.yaml)"
    )
    parser.add_argument(
        "--init",
        action="store_true",
        help="Create default config.yaml in current directory"
    )
    parser.add_argument(
        "--list-templates",
        action="store_true",
        help="List all available conference templates"
    )

    args = parser.parse_args()

    # Handle --init: create a starter config and exit immediately.
    # NOTE(review): the messages here say "config.yaml" while the shipped
    # bibguard.yaml header says auto-detection looks for "bibguard.yaml" —
    # confirm which filename create_default_config()/find_config_file() use
    # and make the help text consistent.
    if args.init:
        output = create_default_config()
        print(f"✓ Created configuration file: {output}")
        print("")
        print("  Next steps:")
        print("  1. Edit the 'bib' and 'tex' paths in config.yaml")
        print("  2. Run: python main.py --config config.yaml")
        print("")
        sys.exit(0)

    # Handle --list-templates: import lazily so the UI module is only loaded
    # when this mode is requested.
    if args.list_templates:
        from src.ui.template_selector import list_templates
        list_templates()
        sys.exit(0)

    # Find and load config: explicit --config wins, otherwise auto-detect.
    config_path = args.config
    if not config_path:
        found = find_config_file()
        if found:
            config_path = str(found)
        else:
            print("Error: Config file not found")
            print("")
            print("Please run 'python main.py --init' to create config.yaml")
            print("Or use 'python main.py --config <path>' to specify a config file")
            print("")
            sys.exit(1)

    try:
        config = load_config(config_path)
    except FileNotFoundError:
        print(f"Error: Config file does not exist: {config_path}")
        sys.exit(1)
    except Exception as e:
        # Any parse/validation failure from the YAML loader is fatal.
        print(f"Error: Failed to parse config file: {e}")
        sys.exit(1)

    # Validate required fields. Two mutually exclusive input modes:
    # directory mode (files.input_dir set) vs explicit bib/tex file mode.
    mode_dir = bool(config.files.input_dir)

    if mode_dir:
        input_dir = config.input_dir_path
        if not input_dir.exists() or not input_dir.is_dir():
            print(f"Error: Input directory does not exist or is not a directory: {input_dir}")
            sys.exit(1)

        # Recursive scan for sources (experimental mode per bibguard.yaml).
        tex_files = list(input_dir.rglob("*.tex"))
        bib_files = list(input_dir.rglob("*.bib"))

        if not tex_files:
            print(f"Error: No .tex files found in {input_dir}")
            sys.exit(1)
        if not bib_files:
            print(f"Error: No .bib files found in {input_dir}")
            sys.exit(1)

        # Private attributes injected onto the config object; consumed later
        # by run_checker() (which reads config._bib_files / config._tex_files).
        config._tex_files = tex_files
        config._bib_files = bib_files
    else:
        if not config.files.bib:
            print("Error: bib file path not specified in config")
            sys.exit(1)
        if not config.files.tex:
            print("Error: tex file path not specified in config")
            sys.exit(1)

        # Validate files exist
        if not config.bib_path.exists():
            print(f"Error: Bib file does not exist: {config.bib_path}")
            sys.exit(1)
        if not config.tex_path.exists():
            print(f"Error: TeX file does not exist: {config.tex_path}")
            sys.exit(1)

        # Normalize single-file mode to the same list-shaped attributes
        # directory mode produces, so run_checker has one code path.
        config._tex_files = [config.tex_path]
        config._bib_files = [config.bib_path]

    # Load template if specified (empty string in config means "no template").
    template = None
    if config.template:
        template = get_template(config.template)
        if not template:
            print(f"Error: Unknown template: {config.template}")
            print("Use --list-templates to see available templates")
            sys.exit(1)

    # Run the checker
    try:
        run_checker(config, template)
    except KeyboardInterrupt:
        # 130 = conventional exit status for termination by SIGINT (128 + 2).
        print("\n\nCancelled")
        sys.exit(130)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def run_checker(config: BibGuardConfig, template=None):
    """Run the bibliography checker with the given configuration.

    Pipeline:
      1. Parse all .bib entries and all .tex files (merging citations across files).
      2. Instantiate only the fetchers/analyzers the config enables.
      3. Run the LaTeX submission-quality checkers over each .tex file.
      4. Process every bib entry in a thread pool (usage, metadata, LLM relevance).
      5. Write reports, a cleaned .bib, and copies of the inputs to output_dir.

    Args:
        config: Loaded BibGuardConfig; must already carry the ``_bib_files`` and
            ``_tex_files`` lists that ``main()`` injects after validation.
        template: Optional conference template object (from ``get_template``),
            forwarded to the report generator for venue-specific checks.

    Returns:
        None. All results are written to ``config.output_dir_path``; a console
        summary is printed unless ``config.output.quiet`` is set.
    """
    progress = ProgressDisplay()

    # Show config info (minimal)
    if template:
        pass  # Skip printing header/info here to keep output clean

    # Parse files (silent)
    bib_parser = BibParser()
    entries = []
    for bib_path in config._bib_files:
        entries.extend(bib_parser.parse_file(str(bib_path)))

    tex_parser = TexParser()
    tex_contents = {}
    merged_citations = {}
    merged_all_keys = set()

    for tex_path in config._tex_files:
        cits = tex_parser.parse_file(str(tex_path))
        # Accumulate citations: the same key may be cited in several files.
        for k, v in cits.items():
            if k not in merged_citations:
                merged_citations[k] = []
            merged_citations[k].extend(v)
        # Accumulate keys
        merged_all_keys.update(tex_parser.get_all_cited_keys())
        # Store content; errors='replace' so one badly-encoded file can't abort the run.
        tex_contents[str(tex_path)] = tex_path.read_text(encoding='utf-8', errors='replace')

    # Inject merged data back into parser for components that use it
    # (UsageChecker below receives this parser and reads these attributes).
    tex_parser.citations = merged_citations
    tex_parser.all_keys = merged_all_keys

    # Initialize components based on config — only build what is enabled,
    # since several fetchers open network sessions.
    bib_config = config.bibliography

    arxiv_fetcher = None
    crossref_fetcher = None
    scholar_fetcher = None
    semantic_scholar_fetcher = None
    openalex_fetcher = None
    dblp_fetcher = None
    comparator = None
    usage_checker = None
    llm_evaluator = None
    duplicate_detector = None

    # arXiv is needed both for metadata comparison and for fetching
    # abstracts during LLM relevance evaluation.
    if bib_config.check_metadata or bib_config.check_relevance:
        arxiv_fetcher = ArxivFetcher()

    if bib_config.check_metadata:
        semantic_scholar_fetcher = SemanticScholarFetcher()
        openalex_fetcher = OpenAlexFetcher()
        dblp_fetcher = DBLPFetcher()
        crossref_fetcher = CrossRefFetcher()
        scholar_fetcher = ScholarFetcher()
        comparator = MetadataComparator()

    if bib_config.check_usage:
        usage_checker = UsageChecker(tex_parser)

    if bib_config.check_duplicates:
        duplicate_detector = DuplicateDetector()

    if bib_config.check_relevance:
        llm_config = config.llm
        backend = LLMBackend(llm_config.backend)
        llm_evaluator = LLMEvaluator(
            backend=backend,
            endpoint=llm_config.endpoint or None,
            model=llm_config.model or None,
            api_key=llm_config.api_key or None
        )

        # Test LLM connection (silent)
        llm_evaluator.test_connection()

        # Relevance evaluation needs citation contexts, so force a usage
        # checker even when check_usage itself is disabled.
        if not usage_checker:
            usage_checker = UsageChecker(tex_parser)

    # Initialize report generator
    report_gen = ReportGenerator(
        minimal_verified=config.output.minimal_verified,
        check_preprint_ratio=config.bibliography.check_preprint_ratio,
        preprint_warning_threshold=config.bibliography.preprint_warning_threshold
    )
    report_gen.set_metadata(
        [str(f) for f in config._bib_files],
        [str(f) for f in config._tex_files]
    )

    # Run submission quality checks: each enabled checker runs over every
    # .tex file; results are tagged with the file they came from.
    submission_results = []
    enabled_checkers = config.submission.get_enabled_checkers()

    for checker_name in enabled_checkers:
        if checker_name in CHECKER_REGISTRY:
            checker = CHECKER_REGISTRY[checker_name]()
            for tex_path_str, content in tex_contents.items():
                results = checker.check(content, {})
                # Tag results with file path
                for r in results:
                    r.file_path = tex_path_str
                submission_results.extend(results)

    # Set results in report generator for summary calculation
    report_gen.set_submission_results(submission_results, template)

    # Check for duplicates (silent)
    if bib_config.check_duplicates and duplicate_detector:
        duplicate_groups = duplicate_detector.find_duplicates(entries)
        report_gen.set_duplicate_groups(duplicate_groups)

    # Check missing citations (silent)
    if bib_config.check_usage and usage_checker:
        missing = usage_checker.get_missing_entries(entries)
        report_gen.set_missing_citations(missing)

    # Process entries

    # Build workflow from config. WorkflowConfig / WFStep / get_default_workflow
    # are already imported at module top; no function-level re-import needed.
    workflow_config = get_default_workflow()
    if config.workflow:
        # Mirror the YAML step order: list position becomes the priority.
        workflow_config = WorkflowConfig(
            steps=[
                WFStep(
                    name=step.name,
                    display_name=step.name,
                    description=step.description,
                    enabled=step.enabled,
                    priority=i
                )
                for i, step in enumerate(config.workflow)
            ]
        )

    # Process entries in parallel for metadata checks
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import threading

    # Thread-safe progress tracking
    progress_lock = threading.Lock()
    completed_count = [0]  # Use list for mutability in closure

    def process_single_entry(entry):
        """Process a single entry (thread-safe): usage, metadata, LLM relevance."""
        # Check usage
        usage_result = None
        if usage_checker:
            usage_result = usage_checker.check_usage(entry)

        # Fetch and compare metadata
        comparison_result = None
        if bib_config.check_metadata and comparator:
            comparison_result = fetch_and_compare_with_workflow(
                entry, workflow_config, arxiv_fetcher, crossref_fetcher,
                scholar_fetcher, semantic_scholar_fetcher, openalex_fetcher,
                dblp_fetcher, comparator
            )

        # LLM evaluation (keep sequential per entry)
        evaluations = []
        if bib_config.check_relevance and llm_evaluator:
            # Only evaluate entries actually cited, and only when an
            # abstract could be obtained to compare the context against.
            if usage_result and usage_result.is_used:
                abstract = get_abstract(entry, comparison_result, arxiv_fetcher)
                if abstract:
                    for ctx in usage_result.contexts:
                        eval_result = llm_evaluator.evaluate(
                            entry.key, ctx.full_context, abstract
                        )
                        eval_result.line_number = ctx.line_number
                        eval_result.file_path = ctx.file_path
                        evaluations.append(eval_result)

        # Create entry report
        entry_report = EntryReport(
            entry=entry,
            comparison=comparison_result,
            usage=usage_result,
            evaluations=evaluations
        )

        return entry_report, comparison_result

    # Determine number of workers (max 10 to avoid overwhelming APIs).
    # Clamp to at least 1: ThreadPoolExecutor(max_workers=0) raises
    # ValueError, which previously crashed the run on an empty .bib file.
    max_workers = max(1, min(10, len(entries)))

    with progress.progress_context(len(entries), "Processing bibliography") as prog:
        # Use ThreadPoolExecutor for parallel processing
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries}

            # Process completed tasks
            for future in as_completed(future_to_entry):
                entry = future_to_entry[future]
                try:
                    entry_report, comparison_result = future.result()

                    # Thread-safe progress update: the report generator and the
                    # progress display are shared mutable state.
                    with progress_lock:
                        report_gen.add_entry_report(entry_report)

                        # Update progress
                        if comparison_result and comparison_result.is_match:
                            prog.mark_success()
                        elif comparison_result and comparison_result.has_issues:
                            prog.mark_warning()
                        else:
                            prog.mark_error()

                        completed_count[0] += 1
                        prog.update(entry.key, "Done", 1)

                except Exception as e:
                    # A failure on one entry must not abort the whole run.
                    with progress_lock:
                        prog.mark_error()
                        progress.print_error(f"Error processing {entry.key}: {e}")
                        completed_count[0] += 1
                        prog.update(entry.key, "Failed", 1)

    # Summary will be printed at the very end

    # Generate reports and organize outputs (silent)

    # Create output directory
    output_dir = config.output_dir_path
    output_dir.mkdir(parents=True, exist_ok=True)

    # Copy input files to output directory so the report bundle is self-contained.
    import shutil
    for bib_path in config._bib_files:
        shutil.copy2(bib_path, output_dir / bib_path.name)
    for tex_path in config._tex_files:
        shutil.copy2(tex_path, output_dir / tex_path.name)

    # 1. Bibliography Report
    bib_report_path = output_dir / "bibliography_report.md"
    report_gen.save_bibliography_report(str(bib_report_path))

    # 2. LaTeX Quality Report
    if submission_results:
        latex_report_path = output_dir / "latex_quality_report.md"
        report_gen.save_latex_quality_report(
            str(latex_report_path),
            submission_results,
            template
        )

    # 3. Line-by-Line Report (import hoisted out of the per-file loop)
    from src.report.line_report import LineByLineReportGenerator
    line_report_path = output_dir / "line_by_line_report.md"

    # For multiple files, we generate one big report with sections
    all_line_reports = []
    for tex_path_str, content in tex_contents.items():
        file_results = [r for r in submission_results if r.file_path == tex_path_str]
        if not file_results:
            continue

        gen = LineByLineReportGenerator(content, tex_path_str)
        gen.add_results(file_results)
        all_line_reports.append(gen.generate())

    if all_line_reports:
        with open(line_report_path, 'w', encoding='utf-8') as f:
            f.write("\n\n".join(all_line_reports))

    # 4. Clean bib file (if generated earlier)
    if bib_config.check_usage and usage_checker:
        used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used]
        if used_entries:
            try:
                keys_to_keep = {entry.key for entry in used_entries}
                # If multiple bibs, we merge them into one cleaned file
                # or just use the first one if it's single mode.
                # For now, let's just use a default name if multiple.
                if len(config._bib_files) == 1:
                    clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib"
                    bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep)
                else:
                    clean_bib_path = output_dir / "merged_only_used.bib"
                    # We need a way to filter multiple files into one.
                    # BibParser.filter_file currently takes one input.
                    # Let's just write all used entries to a new file.
                    with open(clean_bib_path, 'w', encoding='utf-8') as f:
                        for entry in used_entries:
                            f.write(entry.raw + "\n\n")
            except Exception:
                # Best-effort convenience output: the main reports were already
                # written above, so a failure here is deliberately non-fatal.
                pass

    # Print beautiful console summary
    if not config.output.quiet:
        bib_stats, latex_stats = report_gen.get_summary_stats()
        progress.print_detailed_summary(bib_stats, latex_stats, str(output_dir.absolute()))
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
def fetch_and_compare_with_workflow(
    entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher,
    semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
):
    """Fetch metadata from online sources using the configured workflow.

    Runs each enabled workflow step in order, comparing the bib entry
    against the metadata each source returns. Stops early on the first
    confident match; otherwise returns the highest-confidence comparison
    seen, or an "unable" result when no source found the paper.

    Args:
        entry: Parsed bibliography entry to verify.
        workflow_config: Provides ``get_enabled_steps()`` yielding steps
            with a ``name`` attribute.
        arxiv_fetcher, crossref_fetcher, scholar_fetcher,
        semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher:
            Source fetchers; any may be None to disable that source.
        comparator: MetadataComparator used to score entry vs. metadata.

    Returns:
        A comparison result object (has ``is_match`` and ``confidence``).
    """
    fetchers = {
        "arxiv": arxiv_fetcher,
        "crossref": crossref_fetcher,
        "scholar": scholar_fetcher,
        "semantic_scholar": semantic_scholar_fetcher,
        "openalex": openalex_fetcher,
        "dblp": dblp_fetcher,
    }

    all_results = []
    for step in workflow_config.get_enabled_steps():
        result = _run_workflow_step(step.name, entry, fetchers, comparator)
        if result:
            all_results.append(result)
            # A confident match short-circuits the remaining steps.
            if result.is_match:
                return result

    if all_results:
        # No definitive match: fall back to the best partial comparison.
        all_results.sort(key=lambda r: r.confidence, reverse=True)
        return all_results[0]

    return comparator.create_unable_result(entry, "Unable to find this paper in any data source")


def _run_workflow_step(step_name, entry, fetchers, comparator):
    """Execute one workflow step; return a comparison result or None.

    Each step is skipped (returns None) when the entry lacks the field
    the step needs (arXiv id, DOI, or title) or its fetcher is disabled.
    """
    if step_name == "arxiv_id" and entry.has_arxiv and fetchers["arxiv"]:
        arxiv_meta = fetchers["arxiv"].fetch_by_id(entry.arxiv_id)
        return comparator.compare_with_arxiv(entry, arxiv_meta) if arxiv_meta else None

    if step_name == "crossref_doi" and entry.doi and fetchers["crossref"]:
        found = fetchers["crossref"].search_by_doi(entry.doi)
        return comparator.compare_with_crossref(entry, found) if found else None

    if step_name == "semantic_scholar" and entry.title and fetchers["semantic_scholar"]:
        found = _fetch_by_doi_or_title(fetchers["semantic_scholar"], entry)
        return comparator.compare_with_semantic_scholar(entry, found) if found else None

    if step_name == "dblp" and entry.title and fetchers["dblp"]:
        found = fetchers["dblp"].search_by_title(entry.title)
        return comparator.compare_with_dblp(entry, found) if found else None

    if step_name == "openalex" and entry.title and fetchers["openalex"]:
        found = _fetch_by_doi_or_title(fetchers["openalex"], entry)
        return comparator.compare_with_openalex(entry, found) if found else None

    if step_name == "arxiv_title" and entry.title and fetchers["arxiv"]:
        best = _best_arxiv_title_match(fetchers["arxiv"], entry.title)
        return comparator.compare_with_arxiv(entry, best) if best else None

    if step_name == "crossref_title" and entry.title and fetchers["crossref"]:
        found = fetchers["crossref"].search_by_title(entry.title)
        return comparator.compare_with_crossref(entry, found) if found else None

    if step_name == "google_scholar" and entry.title and fetchers["scholar"]:
        found = fetchers["scholar"].search_by_title(entry.title)
        return comparator.compare_with_scholar(entry, found) if found else None

    return None


def _fetch_by_doi_or_title(fetcher, entry):
    """Fetch metadata by DOI when available, falling back to title search."""
    found = fetcher.fetch_by_doi(entry.doi) if entry.doi else None
    if not found:
        found = fetcher.search_by_title(entry.title)
    return found


def _best_arxiv_title_match(arxiv_fetcher, title, min_similarity=0.5):
    """Search arXiv by title and return the most similar hit, or None.

    Candidates below ``min_similarity`` (normalized-title ratio) are
    rejected to avoid matching an unrelated paper.
    """
    from src.utils.normalizer import TextNormalizer

    candidates = arxiv_fetcher.search_by_title(title, max_results=3)
    if not candidates:
        return None

    target = TextNormalizer.normalize_for_comparison(title)
    best_result = None
    best_sim = 0.0
    for candidate in candidates:
        sim = TextNormalizer.similarity_ratio(
            target, TextNormalizer.normalize_for_comparison(candidate.title)
        )
        if sim > best_sim:
            best_sim = sim
            best_result = candidate

    return best_result if best_result and best_sim > min_similarity else None
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
def get_abstract(entry, comparison_result, arxiv_fetcher):
    """Return an abstract for *entry*, trying local data then arXiv.

    Lookup order: the entry's own ``abstract`` field, then an arXiv
    fetch by id, then an arXiv title search (first hit only). Returns
    "" when nothing is found. ``comparison_result`` is accepted for
    interface compatibility but not consulted here.
    """
    if entry.abstract:
        return entry.abstract

    if arxiv_fetcher:
        if entry.has_arxiv:
            metadata = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
            if metadata and metadata.abstract:
                return metadata.abstract

        if entry.title:
            hits = arxiv_fetcher.search_by_title(entry.title, max_results=1)
            if hits and hits[0].abstract:
                return hits[0].abstract

    return ""
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
# Script entry point: delegate to the CLI driver when run directly.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
bibtexparser>=1.4.0
|
| 3 |
+
requests>=2.31.0
|
| 4 |
+
beautifulsoup4>=4.12.0
|
| 5 |
+
rich>=13.7.0
|
| 6 |
+
Unidecode>=1.3.0
|
| 7 |
+
lxml>=5.0.0
|
| 8 |
+
PyYAML>=6.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Bibliography Checker Package"""
|
src/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (202 Bytes). View file
|
|
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (190 Bytes). View file
|
|
|
src/analyzers/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyzers package"""
|
| 2 |
+
from .metadata_comparator import MetadataComparator
|
| 3 |
+
from .usage_checker import UsageChecker
|
| 4 |
+
from .llm_evaluator import LLMEvaluator
|
| 5 |
+
from .duplicate_detector import DuplicateDetector
|
| 6 |
+
|
| 7 |
+
__all__ = ['MetadataComparator', 'UsageChecker', 'LLMEvaluator', 'DuplicateDetector']
|
src/analyzers/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (464 Bytes). View file
|
|
|
src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc
ADDED
|
Binary file (8.29 kB). View file
|
|
|
src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc
ADDED
|
Binary file (5.4 kB). View file
|
|
|
src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc
ADDED
|
Binary file (14.3 kB). View file
|
|
|
src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc
ADDED
|
Binary file (18.9 kB). View file
|
|
|
src/analyzers/__pycache__/retraction_checker.cpython-313.pyc
ADDED
|
Binary file (4.94 kB). View file
|
|
|
src/analyzers/__pycache__/url_validator.cpython-313.pyc
ADDED
|
Binary file (8.3 kB). View file
|
|
|
src/analyzers/__pycache__/usage_checker.cpython-313.pyc
ADDED
|
Binary file (4.4 kB). View file
|
|
|
src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc
ADDED
|
Binary file (13.3 kB). View file
|
|
|
src/analyzers/duplicate_detector.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Duplicate entry detector for bibliography files.
|
| 3 |
+
Uses fuzzy matching to find potential duplicates.
|
| 4 |
+
"""
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import List, Tuple
|
| 7 |
+
|
| 8 |
+
from ..parsers.bib_parser import BibEntry
|
| 9 |
+
from ..utils.normalizer import TextNormalizer
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class DuplicateGroup:
    """A group of potentially duplicate entries."""
    # Two or more bib entries judged to refer to the same work.
    entries: List[BibEntry]
    # Average pairwise similarity across the group (0.0-1.0).
    similarity_score: float
    # Human-readable explanation of why the group was formed.
    reason: str

    @property
    def entry_keys(self) -> List[str]:
        """Return the citation keys of all entries in this group."""
        return [e.key for e in self.entries]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class DuplicateDetector:
    """Detects duplicate bibliography entries using fuzzy matching.

    Entries are compared pairwise on normalized titles and author lists;
    groups of mutually similar entries are reported as DuplicateGroup
    objects, ordered by decreasing similarity.
    """

    # A pair whose title similarity reaches this is a duplicate
    # regardless of authors.
    TITLE_SIMILARITY_THRESHOLD = 0.85
    # Threshold for the weighted title+author combined score.
    COMBINED_SIMILARITY_THRESHOLD = 0.80

    def __init__(self):
        # TextNormalizer is used via its class-level helpers, so the
        # class object itself is stored rather than an instance.
        self.normalizer = TextNormalizer

    def find_duplicates(self, entries: List[BibEntry]) -> List[DuplicateGroup]:
        """
        Find all duplicate groups in the bibliography.

        Args:
            entries: All parsed bibliography entries.

        Returns:
            List of DuplicateGroup objects, each containing 2+ similar
            entries, sorted by similarity score (highest first).
        """
        duplicates = []
        processed = set()  # keys already assigned to some group

        for i, entry1 in enumerate(entries):
            if entry1.key in processed:
                continue

            # Greedily collect every later, unassigned entry similar to
            # this one.
            similar_entries = [entry1]

            for entry2 in entries[i + 1:]:
                if entry2.key in processed:
                    continue

                similarity, _ = self._calculate_similarity(entry1, entry2)

                if similarity >= self.COMBINED_SIMILARITY_THRESHOLD:
                    similar_entries.append(entry2)
                    processed.add(entry2.key)

            # Only emit a group when at least one duplicate was found.
            if len(similar_entries) > 1:
                processed.add(entry1.key)

                avg_similarity = self._calculate_group_similarity(similar_entries)
                reason = self._generate_reason(similar_entries)

                duplicates.append(DuplicateGroup(
                    entries=similar_entries,
                    similarity_score=avg_similarity,
                    reason=reason
                ))

        # Most confident duplicate groups first.
        duplicates.sort(key=lambda g: g.similarity_score, reverse=True)

        return duplicates

    def _calculate_similarity(self, entry1: BibEntry, entry2: BibEntry) -> Tuple[float, str]:
        """
        Calculate similarity between two entries.

        Returns:
            (similarity_score, reason_string) — reason is "" when the
            pair does not cross the duplicate threshold.
        """
        title1 = self.normalizer.normalize_for_comparison(entry1.title)
        title2 = self.normalizer.normalize_for_comparison(entry2.title)

        title_sim = self.normalizer.similarity_ratio(title1, title2)

        # Near-identical titles alone are enough to flag a duplicate.
        if title_sim >= self.TITLE_SIMILARITY_THRESHOLD:
            return title_sim, "Very similar titles"

        author_sim = self._calculate_author_similarity(entry1, entry2)

        # Weighted average: title (70%) matters more than authors (30%).
        combined_sim = 0.7 * title_sim + 0.3 * author_sim

        if combined_sim >= self.COMBINED_SIMILARITY_THRESHOLD:
            return combined_sim, f"Similar title ({title_sim:.0%}) and authors ({author_sim:.0%})"

        return combined_sim, ""

    def _calculate_author_similarity(self, entry1: BibEntry, entry2: BibEntry) -> float:
        """Return a fuzzy Jaccard similarity of the two author lists (0-1)."""
        authors1 = self._parse_authors(entry1.author)
        authors2 = self._parse_authors(entry2.author)

        if not authors1 or not authors2:
            return 0.0

        norm_authors1 = [self.normalizer.normalize_for_comparison(a) for a in authors1]
        norm_authors2 = [self.normalizer.normalize_for_comparison(a) for a in authors2]

        # Count authors from list 1 with a fuzzy counterpart in list 2.
        matches = 0
        for a1 in norm_authors1:
            if any(self._authors_match(a1, a2) for a2 in norm_authors2):
                matches += 1

        # Fuzzy Jaccard: every fuzzy match represents one shared author,
        # so the union size is |A| + |B| - matches. An exact-string set
        # union would double-count pairs like "J. Smith" / "John Smith"
        # that _authors_match deliberately treats as the same person.
        total_unique = len(norm_authors1) + len(norm_authors2) - matches
        if total_unique <= 0:
            return 0.0

        return matches / total_unique

    def _parse_authors(self, author_string: str) -> List[str]:
        """Parse a BibTeX author string into a list of cleaned names."""
        if not author_string:
            return []

        # BibTeX separates authors with the literal word 'and'.
        authors = author_string.split(' and ')

        cleaned = []
        for author in authors:
            # Collapse internal runs of whitespace.
            author = ' '.join(author.split())
            if author:
                cleaned.append(author)

        return cleaned

    def _authors_match(self, name1: str, name2: str) -> bool:
        """Check if two normalized author names match (handles initials)."""
        if name1 == name2:
            return True

        # Substring containment covers abbreviated forms and initials.
        if name1 in name2 or name2 in name1:
            return True

        # Fall back to fuzzy string similarity for spelling variants.
        sim = self.normalizer.similarity_ratio(name1, name2)
        return sim >= 0.8

    def _calculate_group_similarity(self, entries: List[BibEntry]) -> float:
        """Calculate the average pairwise similarity within a group."""
        if len(entries) < 2:
            return 1.0

        total_sim = 0.0
        count = 0

        for i, entry1 in enumerate(entries):
            for entry2 in entries[i + 1:]:
                sim, _ = self._calculate_similarity(entry1, entry2)
                total_sim += sim
                count += 1

        return total_sim / count if count > 0 else 0.0

    def _generate_reason(self, entries: List[BibEntry]) -> str:
        """Generate a human-readable reason for the duplicate group."""
        titles = [self.normalizer.normalize_for_comparison(e.title) for e in entries]

        # Average pairwise title similarity decides the wording.
        title_sims = []
        for i, t1 in enumerate(titles):
            for t2 in titles[i + 1:]:
                title_sims.append(self.normalizer.similarity_ratio(t1, t2))

        avg_title_sim = sum(title_sims) / len(title_sims) if title_sims else 0.0

        if avg_title_sim >= 0.95:
            return "Nearly identical titles"
        elif avg_title_sim >= 0.85:
            return "Very similar titles"
        else:
            return "Similar titles and authors"
|
src/analyzers/llm_evaluator.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM-based citation relevance evaluator.
|
| 3 |
+
Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama backends.
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Optional, Dict, Any
|
| 9 |
+
from enum import Enum
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
import requests
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class LLMBackend(Enum):
    """Supported LLM API backends for citation relevance evaluation."""
    OPENAI = "openai"        # OpenAI chat-completions API
    ANTHROPIC = "anthropic"  # Anthropic Messages API
    GEMINI = "gemini"        # Google Gemini generateContent API
    VLLM = "vllm"            # self-hosted vLLM (OpenAI-compatible endpoint)
    OLLAMA = "ollama"        # local Ollama server
    DEEPSEEK = "deepseek"    # DeepSeek (OpenAI-compatible endpoint)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
class EvaluationResult:
    """Result of LLM citation evaluation for a single bib entry."""
    entry_key: str                      # BibTeX key of the evaluated entry
    relevance_score: int                # 1-5 scale; 0 when evaluation failed
    is_relevant: bool                   # overall relevance verdict
    explanation: str                    # short rationale from the LLM
    context_used: str                   # citation context sent to the LLM
    abstract_used: str                  # abstract sent to the LLM
    line_number: Optional[int] = None   # citation location, if known
    file_path: Optional[str] = None     # source file, if known
    error: Optional[str] = None         # error message when the call failed

    @property
    def score_label(self) -> str:
        """Human-readable name for the 1-5 relevance score."""
        names = (
            "Not Relevant",
            "Marginally Relevant",
            "Somewhat Relevant",
            "Relevant",
            "Highly Relevant",
        )
        lookup = dict(zip(range(1, 6), names))
        return lookup.get(self.relevance_score, "Unknown")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class LLMEvaluator:
    """Evaluates citation relevance using an LLM backend.

    Builds a fixed prompt from a citation context and the cited paper's
    abstract, calls the configured backend over HTTP, and parses the
    JSON reply into an EvaluationResult.
    """

    PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant.

## Citation Context (from the manuscript):
{context}

## Cited Paper's Abstract:
{abstract}

## Task:
Evaluate the relevance and appropriateness of this citation. Consider:
1. Does the citation support the claim being made in the context?
2. Is the cited paper's topic related to the discussion?
3. Is this citation necessary, or could it be replaced with a more relevant one?

## Response Format:
Provide your response in the following JSON format:
{{
    "relevance_score": <1-5 integer>,
    "is_relevant": <true/false>,
    "explanation": "<brief explanation in 1-2 sentences>"
}}

Score guide:
- 1: Not relevant at all
- 2: Marginally relevant
- 3: Somewhat relevant
- 4: Relevant and appropriate
- 5: Highly relevant and essential

STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other text."""

    def __init__(
        self,
        backend: LLMBackend = LLMBackend.GEMINI,
        endpoint: Optional[str] = None,
        model: Optional[str] = None,
        api_key: Optional[str] = None
    ):
        """Configure the evaluator.

        Args:
            backend: Which API family to talk to.
            endpoint: Override the backend's default URL.
            model: Override the backend's default model name.
            api_key: API key; falls back to the ``<BACKEND>_API_KEY``
                environment variable (e.g. ``OPENAI_API_KEY``).
        """
        self.backend = backend
        self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY")

        # Per-backend default endpoint and model.
        if backend == LLMBackend.OPENAI:
            self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions"
            self.model = model or "gpt-5-mini"
        elif backend == LLMBackend.ANTHROPIC:
            self.endpoint = endpoint or "https://api.anthropic.com/v1/messages"
            self.model = model or "claude-4.5-haiku"
        elif backend == LLMBackend.DEEPSEEK:
            self.endpoint = endpoint or "https://api.deepseek.com/chat/completions"
            self.model = model or "deepseek-chat"
        elif backend == LLMBackend.OLLAMA:
            self.endpoint = endpoint or "http://localhost:11434/api/generate"
            self.model = model or "Qwen/qwen3-4B-Instruct-2507"
        elif backend == LLMBackend.VLLM:
            self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions"
            self.model = model or "Qwen/qwen3-4B-Instruct-2507"
        elif backend == LLMBackend.GEMINI:
            self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models"
            self.model = model or "gemini-2.5-flash-lite"
        else:
            # Defensive: keep attribute errors from surfacing later.
            raise ValueError(f"Unknown backend: {backend}")

    def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult:
        """Evaluate citation relevance; never raises, errors go in ``error``."""
        if not context or not abstract:
            return EvaluationResult(
                entry_key=entry_key,
                relevance_score=0,
                is_relevant=False,
                explanation="Missing context or abstract",
                context_used=context,
                abstract_used=abstract,
                error="Missing context or abstract for evaluation"
            )

        # Don't truncate - preserve full context and abstract.
        prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract)

        try:
            if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
                response = self._call_openai_compatible(prompt)
            elif self.backend == LLMBackend.ANTHROPIC:
                response = self._call_anthropic(prompt)
            elif self.backend == LLMBackend.OLLAMA:
                response = self._call_ollama(prompt)
            elif self.backend == LLMBackend.GEMINI:
                response = self._call_gemini(prompt)
            else:
                raise ValueError(f"Unknown backend: {self.backend}")

            return self._parse_response(entry_key, response, context, abstract)

        except Exception as e:
            # Network / HTTP / parsing failures are reported, not raised.
            return EvaluationResult(
                entry_key=entry_key,
                relevance_score=0,
                is_relevant=False,
                explanation="",
                context_used=context,
                abstract_used=abstract,
                error=str(e)
            )

    def _call_openai_compatible(self, prompt: str) -> str:
        """Call an OpenAI-compatible API (OpenAI, DeepSeek, vLLM).

        Returns the assistant message text, or "" when absent.
        """
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        payload = {
            "model": self.model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.1,
            "max_tokens": 2000,
        }
        # Only OpenAI is known to accept response_format; sending an
        # explicit null to other OpenAI-compatible servers can be
        # rejected, so the key is omitted entirely for them.
        if self.backend == LLMBackend.OPENAI:
            payload["response_format"] = {"type": "json_object"}

        response = requests.post(
            self.endpoint,
            json=payload,
            headers=headers,
            timeout=60
        )
        response.raise_for_status()

        data = response.json()
        choices = data.get("choices", [])
        if choices:
            return choices[0].get("message", {}).get("content", "")
        return ""

    def _call_anthropic(self, prompt: str) -> str:
        """Call the Anthropic Messages API; return the text block or ""."""
        headers = {
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json"
        }

        payload = {
            "model": self.model,
            "max_tokens": 2000,
            "temperature": 0.1,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }

        response = requests.post(
            self.endpoint,
            json=payload,
            headers=headers,
            timeout=60
        )
        response.raise_for_status()

        data = response.json()
        content = data.get("content", [])
        if content and content[0].get("type") == "text":
            return content[0].get("text", "")
        return ""

    def _call_ollama(self, prompt: str) -> str:
        """Call a local Ollama server; return the generated text or ""."""
        payload = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 2000
            },
            # Ask Ollama to constrain output to valid JSON.
            "format": "json"
        }

        response = requests.post(
            self.endpoint,
            json=payload,
            timeout=60
        )
        response.raise_for_status()

        return response.json().get("response", "")

    def _call_gemini(self, prompt: str) -> str:
        """Call the Gemini generateContent API; return the text or ""."""
        # The model name is part of the URL path for Gemini.
        url = f"{self.endpoint}/{self.model}:generateContent"
        if self.api_key:
            url += f"?key={self.api_key}"

        payload = {
            "contents": [
                {
                    "parts": [
                        {"text": prompt}
                    ]
                }
            ],
            "generationConfig": {
                "temperature": 0.1,
                "maxOutputTokens": 2000,
                "responseMimeType": "application/json"
            }
        }

        response = requests.post(
            url,
            json=payload,
            timeout=60
        )
        response.raise_for_status()

        candidates = response.json().get("candidates", [])
        if candidates:
            content = candidates[0].get("content", {})
            parts = content.get("parts", [])
            if parts:
                return parts[0].get("text", "")
        return ""

    @staticmethod
    def _extract_json(response: str) -> Dict[str, Any]:
        """Best-effort extraction of a JSON object from an LLM reply.

        Tries the whole reply first (handles nested objects), then falls
        back to the first flat brace-delimited fragment. Returns {} on
        failure.
        """
        try:
            parsed = json.loads(response.strip())
            if isinstance(parsed, dict):
                return parsed
        except json.JSONDecodeError:
            pass

        match = re.search(r'\{[^{}]*\}', response, re.DOTALL)
        if match:
            try:
                parsed = json.loads(match.group())
                if isinstance(parsed, dict):
                    return parsed
            except json.JSONDecodeError:
                pass

        return {}

    def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult:
        """Parse the LLM's reply into an EvaluationResult."""
        data = self._extract_json(response)

        if not data:
            return EvaluationResult(
                entry_key=entry_key,
                relevance_score=0,
                is_relevant=False,
                explanation=response,
                context_used=context,
                abstract_used=abstract,
                error="Failed to parse LLM response as JSON"
            )

        # Coerce fields defensively: some models return strings.
        relevance_score = data.get("relevance_score", 0)
        if isinstance(relevance_score, str):
            try:
                relevance_score = int(relevance_score)
            except ValueError:
                relevance_score = 0

        is_relevant = data.get("is_relevant", False)
        if isinstance(is_relevant, str):
            is_relevant = is_relevant.lower() in ("true", "yes", "1")

        explanation = data.get("explanation", "")

        return EvaluationResult(
            entry_key=entry_key,
            relevance_score=relevance_score,
            is_relevant=is_relevant,
            explanation=explanation,
            context_used=context,
            abstract_used=abstract
        )

    def test_connection(self) -> bool:
        """Return True when the configured backend answers a trivial request."""
        try:
            if self.backend == LLMBackend.OLLAMA:
                # Ollama exposes a cheap model-list endpoint.
                response = requests.get(
                    self.endpoint.replace("/api/generate", "/api/tags"),
                    timeout=5
                )
                return response.status_code == 200
            elif self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
                headers = {"Authorization": f"Bearer {self.api_key}"}
                if "chat/completions" in self.endpoint:
                    # Minimal one-token completion as a liveness probe.
                    payload = {
                        "model": self.model,
                        "messages": [{"role": "user", "content": "hi"}],
                        "max_tokens": 1
                    }
                    response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10)
                    return response.status_code == 200
                else:
                    return False
            elif self.backend == LLMBackend.ANTHROPIC:
                headers = {
                    "x-api-key": self.api_key,
                    "anthropic-version": "2023-06-01",
                    "content-type": "application/json"
                }
                payload = {
                    "model": self.model,
                    "max_tokens": 1,
                    "messages": [{"role": "user", "content": "hi"}]
                }
                response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10)
                return response.status_code == 200
            elif self.backend == LLMBackend.GEMINI:
                if not self.api_key:
                    return False
                url = f"{self.endpoint}/{self.model}:generateContent?key={self.api_key}"
                payload = {
                    "contents": [{"parts": [{"text": "test"}]}],
                    "generationConfig": {"maxOutputTokens": 10}
                }
                response = requests.post(url, json=payload, timeout=10)
                return response.status_code == 200
        except Exception:
            return False
        return False
|
src/analyzers/metadata_comparator.py
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Metadata comparison between bib entries and fetched metadata.
|
| 3 |
+
"""
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
from ..parsers.bib_parser import BibEntry
|
| 8 |
+
from ..fetchers.arxiv_fetcher import ArxivMetadata
|
| 9 |
+
from ..fetchers.scholar_fetcher import ScholarResult
|
| 10 |
+
from ..fetchers.crossref_fetcher import CrossRefResult
|
| 11 |
+
from ..fetchers.semantic_scholar_fetcher import SemanticScholarResult
|
| 12 |
+
from ..fetchers.openalex_fetcher import OpenAlexResult
|
| 13 |
+
from ..fetchers.dblp_fetcher import DBLPResult
|
| 14 |
+
from ..utils.normalizer import TextNormalizer
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class ComparisonResult:
    """Result of comparing a bib entry with metadata fetched from one source."""
    # Citation key of the bibliography entry that was compared.
    entry_key: str

    # Title comparison
    title_match: bool        # True when title_similarity cleared the comparator's threshold
    title_similarity: float  # best score among the comparator's similarity measures
    bib_title: str           # title as written in the .bib entry
    fetched_title: str       # title reported by the external source

    # Author comparison
    author_match: bool
    author_similarity: float
    bib_authors: list[str]      # normalized author names from the bib entry
    fetched_authors: list[str]  # normalized author names from the source

    # Year comparison
    year_match: bool  # exact string equality of the two years
    bib_year: str
    fetched_year: str

    # Overall assessment
    is_match: bool     # title AND authors both matched
    confidence: float  # weighted blend of title/author similarity and year match
    issues: list[str]  # human-readable mismatch descriptions (empty when clean)
    source: str  # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable'

    @property
    def has_issues(self) -> bool:
        """True when at least one mismatch was recorded."""
        return len(self.issues) > 0
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class MetadataComparator:
    """Compares bibliography entries with metadata fetched from external sources.

    Every public ``compare_with_*`` method delegates to the shared
    ``_compare`` pipeline (title similarity, author-list matching, year
    equality) and differs only in how the fetched record's fields are read
    and in the ``source`` tag stamped on the result.
    """

    # Minimum similarity ratios required to declare the titles / author
    # lists a match.
    TITLE_THRESHOLD = 0.8
    AUTHOR_THRESHOLD = 0.6

    def __init__(self):
        # TextNormalizer is used as a namespace of helpers; it is
        # deliberately not instantiated.
        self.normalizer = TextNormalizer

    # ------------------------------------------------------------------
    # Public per-source entry points (signatures kept for compatibility)
    # ------------------------------------------------------------------

    def compare_with_arxiv(self, bib_entry: "BibEntry", arxiv_meta: "ArxivMetadata") -> "ComparisonResult":
        """Compare bib entry with arXiv metadata."""
        return self._compare(bib_entry, arxiv_meta.title, arxiv_meta.authors,
                             arxiv_meta.year, "arxiv")

    def compare_with_scholar(self, bib_entry: "BibEntry", scholar_result: "ScholarResult") -> "ComparisonResult":
        """Compare bib entry with Scholar search result."""
        # Scholar authors arrive as a single comma-separated string.
        raw_authors = [a.strip() for a in scholar_result.authors.split(',')]
        return self._compare(bib_entry, scholar_result.title, raw_authors,
                             scholar_result.year, "scholar")

    def compare_with_crossref(self, bib_entry: "BibEntry", crossref_result: "CrossRefResult") -> "ComparisonResult":
        """Compare bib entry with CrossRef search result."""
        return self._compare(bib_entry, crossref_result.title, crossref_result.authors,
                             crossref_result.year, "crossref")

    def compare_with_semantic_scholar(self, bib_entry: "BibEntry", ss_result: "SemanticScholarResult") -> "ComparisonResult":
        """Compare bib entry with Semantic Scholar result."""
        return self._compare(bib_entry, ss_result.title, ss_result.authors,
                             ss_result.year, "semantic_scholar")

    def compare_with_openalex(self, bib_entry: "BibEntry", oa_result: "OpenAlexResult") -> "ComparisonResult":
        """Compare bib entry with OpenAlex result."""
        return self._compare(bib_entry, oa_result.title, oa_result.authors,
                             oa_result.year, "openalex")

    def compare_with_dblp(self, bib_entry: "BibEntry", dblp_result: "DBLPResult") -> "ComparisonResult":
        """Compare bib entry with DBLP result."""
        return self._compare(bib_entry, dblp_result.title, dblp_result.authors,
                             dblp_result.year, "dblp")

    def create_unable_result(self, bib_entry: "BibEntry", reason: str = "Unable to fetch metadata") -> "ComparisonResult":
        """Create result when metadata couldn't be fetched."""
        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=False,
            title_similarity=0.0,
            bib_title=bib_entry.title,
            fetched_title="",
            author_match=False,
            author_similarity=0.0,
            bib_authors=self.normalizer.normalize_author_list(bib_entry.author),
            fetched_authors=[],
            year_match=False,
            bib_year=bib_entry.year,
            fetched_year="",
            is_match=False,
            confidence=0.0,
            issues=[reason],
            source="unable"
        )

    # ------------------------------------------------------------------
    # Shared pipeline and helpers
    # ------------------------------------------------------------------

    def _compare(self, bib_entry: "BibEntry", fetched_title: str,
                 fetched_authors: "list[str]", fetched_year: str,
                 source: str) -> "ComparisonResult":
        """Generic comparison of a bib entry against one fetched record.

        Args:
            bib_entry: The bibliography entry under scrutiny.
            fetched_title: Title reported by the external source.
            fetched_authors: Raw (un-normalized) author names from the source.
            fetched_year: Publication year reported by the source.
            source: Source tag recorded on the result and used in the
                year-mismatch message (e.g. ``"arxiv"``).
        """
        issues = []

        # --- Title comparison ---
        bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
        fetched_title_norm = self.normalizer.normalize_for_comparison(fetched_title)

        title_similarity = self.normalizer.similarity_ratio(bib_title_norm, fetched_title_norm)
        # For short titles, Levenshtein can score higher; keep the best.
        if len(bib_title_norm) < 100:
            lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, fetched_title_norm)
            title_similarity = max(title_similarity, lev_sim)

        title_match = title_similarity >= self.TITLE_THRESHOLD
        if not title_match:
            issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")

        # --- Author comparison ---
        bib_authors = self.normalizer.normalize_author_list(bib_entry.author)
        norm_authors = [self.normalizer.normalize_author_name(a) for a in fetched_authors]

        author_similarity = self._compare_author_lists(bib_authors, norm_authors)
        author_match = author_similarity >= self.AUTHOR_THRESHOLD
        if not author_match:
            issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")

        # --- Year comparison ---
        bib_year = bib_entry.year.strip()
        year_match = bib_year == fetched_year
        # Only report a mismatch when both sides actually provide a year.
        if not year_match and bib_year and fetched_year:
            issues.append(f"Year mismatch: bib={bib_year}, {source}={fetched_year}")

        # --- Overall assessment ---
        is_match = title_match and author_match
        # Weighted confidence: title 50%, authors 30%, year 20% (a year
        # mismatch halves the year contribution rather than zeroing it).
        confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2)

        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=title_match,
            title_similarity=title_similarity,
            bib_title=bib_entry.title,
            fetched_title=fetched_title,
            author_match=author_match,
            author_similarity=author_similarity,
            bib_authors=bib_authors,
            fetched_authors=norm_authors,
            year_match=year_match,
            bib_year=bib_year,
            fetched_year=fetched_year,
            is_match=is_match,
            confidence=confidence,
            issues=issues,
            source=source
        )

    def _compare_author_lists(self, list1: "list[str]", list2: "list[str]") -> float:
        """Average, over list1, of each author's best match in list2 (0.0-1.0)."""
        if not list1 and not list2:
            return 1.0
        if not list1 or not list2:
            return 0.0

        total_similarity = 0.0
        for author1 in list1:
            best_match = 0.0
            for author2 in list2:
                # A structural match (handles abbreviated names) wins outright.
                if self._names_match(author1, author2):
                    best_match = 1.0
                    break
                sim = self.normalizer.similarity_ratio(author1, author2)
                best_match = max(best_match, sim)
            total_similarity += best_match

        return total_similarity / len(list1)

    def _names_match(self, name1: str, name2: str) -> bool:
        """Check if two names match (handles abbreviated / reordered names)."""
        words1 = name1.split()
        words2 = name2.split()

        if not words1 or not words2:
            return False

        # Last names must agree; also tolerate "Last First" ordering by
        # matching either side's first word against the other's last word.
        if words1[-1] != words2[-1]:
            if words1[0] != words2[-1] and words1[-1] != words2[0]:
                return False

        return True
|
src/analyzers/usage_checker.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Usage checker for bibliography entries in TeX files.
|
| 3 |
+
"""
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
from ..parsers.bib_parser import BibEntry
|
| 8 |
+
from ..parsers.tex_parser import TexParser, CitationContext
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class UsageResult:
    """Result of checking if a bib entry is used in the TeX document."""
    # Citation key of the bibliography entry that was checked.
    entry_key: str
    # True when the key appears among the TeX parser's cited keys.
    is_used: bool
    # Number of citation contexts found for the key.
    usage_count: int
    # Surrounding text snippets in which the key is cited.
    contexts: list[CitationContext]
    # Source line number of each citation, parallel to contexts.
    line_numbers: list[int]

    @property
    def first_usage_line(self) -> Optional[int]:
        """Line number of the first citation, or None when never cited."""
        return self.line_numbers[0] if self.line_numbers else None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class UsageChecker:
    """Checks if bibliography entries are used in TeX files."""

    def __init__(self, tex_parser: "TexParser"):
        self.tex_parser = tex_parser
        # Snapshot of every key cited anywhere in the parsed TeX sources.
        self._cited_keys = tex_parser.get_all_cited_keys()

    def check_usage(self, entry: "BibEntry") -> "UsageResult":
        """Check if a bib entry is used in the TeX document.

        Returns a UsageResult carrying every citation context found for the
        entry's key together with the contexts' line numbers.
        """
        key = entry.key
        contexts = self.tex_parser.get_citation_contexts(key)

        return UsageResult(
            entry_key=key,
            is_used=key in self._cited_keys,
            usage_count=len(contexts),
            contexts=contexts,
            line_numbers=[ctx.line_number for ctx in contexts]
        )

    def get_unused_entries(self, entries: "list[BibEntry]") -> "list[BibEntry]":
        """Get list of entries that are not cited in the document."""
        return [entry for entry in entries if entry.key not in self._cited_keys]

    def get_missing_entries(self, entries: "list[BibEntry]") -> "list[str]":
        """Get list of citation keys that don't have corresponding bib entries."""
        entry_keys = {e.key for e in entries}
        return [key for key in self._cited_keys if key not in entry_keys]

    def get_combined_context(self, key: str, max_chars: int = 1000) -> str:
        """Get combined context for all usages of a key, capped at max_chars.

        Contexts are joined with "---" separators; the first context that
        would exceed the budget is truncated (only when at least 100 chars
        of budget remain) and iteration stops there.
        """
        contexts = self.tex_parser.get_citation_contexts(key)
        if not contexts:
            return ""

        combined = []
        total_chars = 0

        for ctx in contexts:
            if total_chars + len(ctx.full_context) > max_chars:
                remaining = max_chars - total_chars
                # Only keep a truncated tail when it is long enough to be useful.
                if remaining > 100:
                    combined.append(ctx.full_context[:remaining] + "...")
                break
            combined.append(ctx.full_context)
            total_chars += len(ctx.full_context)

        return "\n---\n".join(combined)
|
src/checkers/__init__.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Checkers module for paper submission quality checks."""
|
| 2 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 3 |
+
from .caption_checker import CaptionChecker
|
| 4 |
+
from .reference_checker import ReferenceChecker
|
| 5 |
+
from .ai_artifacts_checker import AIArtifactsChecker
|
| 6 |
+
from .formatting_checker import FormattingChecker
|
| 7 |
+
from .anonymization_checker import AnonymizationChecker
|
| 8 |
+
from .number_checker import NumberChecker
|
| 9 |
+
from .sentence_checker import SentenceChecker
|
| 10 |
+
from .consistency_checker import ConsistencyChecker
|
| 11 |
+
from .citation_quality_checker import CitationQualityChecker
|
| 12 |
+
from .equation_checker import EquationChecker
|
| 13 |
+
from .acronym_checker import AcronymChecker
|
| 14 |
+
|
| 15 |
+
# Public names exported when the package is star-imported.
__all__ = [
    'BaseChecker',
    'CheckResult',
    'CheckSeverity',
    'CaptionChecker',
    'ReferenceChecker',
    'AIArtifactsChecker',
    'FormattingChecker',
    'AnonymizationChecker',
    'NumberChecker',
    'SentenceChecker',
    'ConsistencyChecker',
    'CitationQualityChecker',
    'EquationChecker',
    'AcronymChecker',
]


# Registry of all available checkers: short name -> checker class.
# get_checker() instantiates from this mapping and run_all_checkers()
# iterates it in insertion order.
CHECKER_REGISTRY = {
    'caption': CaptionChecker,
    'reference': ReferenceChecker,
    'ai_artifacts': AIArtifactsChecker,
    'formatting': FormattingChecker,
    'anonymization': AnonymizationChecker,
    'number': NumberChecker,
    'sentence': SentenceChecker,
    'consistency': ConsistencyChecker,
    'citation_quality': CitationQualityChecker,
    'equation': EquationChecker,
    'acronym': AcronymChecker,
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def get_checker(name: str) -> BaseChecker:
    """Return a fresh instance of the checker registered under ``name``.

    Raises:
        ValueError: if no checker is registered under that name.
    """
    checker_cls = CHECKER_REGISTRY.get(name)
    if checker_cls is None:
        raise ValueError(f"Unknown checker: {name}")
    return checker_cls()
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def run_all_checkers(tex_content: str, config: "dict | None" = None) -> list:
    """Run all registered checkers and return their combined results.

    Args:
        tex_content: Raw TeX source to check.
        config: Optional configuration passed to every checker; an empty
            dict is substituted when None (or any falsy value) is given.

    Returns:
        Flat list of results from all checkers, in registry order.
    """
    config = config or {}
    results = []

    # Names in the registry are not needed here; iterate the classes only.
    for checker_class in CHECKER_REGISTRY.values():
        results.extend(checker_class().check(tex_content, config))

    return results
|
src/checkers/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (2.2 kB). View file
|
|
|
src/checkers/__pycache__/acronym_checker.cpython-313.pyc
ADDED
|
Binary file (10.8 kB). View file
|
|
|
src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc
ADDED
|
Binary file (6.14 kB). View file
|
|
|
src/checkers/__pycache__/anonymization_checker.cpython-313.pyc
ADDED
|
Binary file (8.38 kB). View file
|
|
|
src/checkers/__pycache__/base.cpython-313.pyc
ADDED
|
Binary file (7.68 kB). View file
|
|
|
src/checkers/__pycache__/caption_checker.cpython-313.pyc
ADDED
|
Binary file (5.63 kB). View file
|
|
|
src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc
ADDED
|
Binary file (5.41 kB). View file
|
|
|
src/checkers/__pycache__/consistency_checker.cpython-313.pyc
ADDED
|
Binary file (11 kB). View file
|
|
|
src/checkers/__pycache__/equation_checker.cpython-313.pyc
ADDED
|
Binary file (5.62 kB). View file
|
|
|
src/checkers/__pycache__/formatting_checker.cpython-313.pyc
ADDED
|
Binary file (9.45 kB). View file
|
|
|
src/checkers/__pycache__/number_checker.cpython-313.pyc
ADDED
|
Binary file (3.8 kB). View file
|
|
|
src/checkers/__pycache__/reference_checker.cpython-313.pyc
ADDED
|
Binary file (8.3 kB). View file
|
|
|
src/checkers/__pycache__/sentence_checker.cpython-313.pyc
ADDED
|
Binary file (4.36 kB). View file
|
|
|
src/checkers/acronym_checker.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Acronym and abbreviation checker.
|
| 3 |
+
|
| 4 |
+
Validates that:
|
| 5 |
+
- Acronyms found in text have corresponding full forms defined
|
| 6 |
+
- Acronyms are used after their definition
|
| 7 |
+
- Only checks acronyms that have matching full forms in the document
|
| 8 |
+
"""
|
| 9 |
+
import re
|
| 10 |
+
from typing import List, Dict, Set, Tuple
|
| 11 |
+
from collections import defaultdict
|
| 12 |
+
|
| 13 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class AcronymChecker(BaseChecker):
    """Check acronym definitions and consistency."""

    name = "acronym"
    display_name = "Acronyms"
    description = "Check acronym definitions and consistent usage"

    # Enhanced pattern to find defined acronyms with LaTeX formatting support
    # Matches: "Full Name (ACRONYM)", "(ACRONYM; Full Name)", "Full Name (\textbf{ACRONYM})", etc.
    DEFINITION_PATTERN = re.compile(
        r'([A-Z][a-zA-Z\s\-]+)\s*\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?\)|'  # Full Name (ABC) or Full Name (\textbf{ABC})
        r'\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?;\s*([A-Za-z\s\-]+)\)',  # (ABC; Full Name) or (\textbf{ABC}; Full Name)
        re.MULTILINE
    )

    # Pattern to find standalone acronyms (3+ capital letters)
    ACRONYM_PATTERN = re.compile(r'\b([A-Z]{3,}s?)\b')

    # Comprehensive list of common acronyms that don't need definition
    COMMON_ACRONYMS = {
        # Hardware & Computing
        'GPU', 'CPU', 'TPU', 'RAM', 'ROM', 'SSD', 'HDD', 'USB', 'BIOS', 'OS',
        'API', 'SDK', 'IDE', 'GUI', 'CLI', 'URL', 'URI', 'DNS', 'IP', 'TCP',
        'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS', 'VPN', 'LAN', 'WAN',

        # File Formats & Standards
        'PDF', 'HTML', 'CSS', 'XML', 'JSON', 'YAML', 'CSV', 'TSV', 'SQL',
        'UTF', 'ASCII', 'JPEG', 'PNG', 'GIF', 'SVG', 'MP3', 'MP4', 'ZIP',

        # AI & Machine Learning (General)
        'AI', 'ML', 'DL', 'NN', 'ANN', 'DNN', 'CNN', 'RNN', 'LSTM', 'GRU',
        'GAN', 'VAE', 'MLP', 'SVM', 'KNN', 'PCA', 'ICA', 'LDA', 'EM',
        'SGD', 'ADAM', 'RMSPROP', 'ADAGRAD', 'LBFGS',

        # NLP & Language Models
        'NLP', 'LLM', 'GPT', 'BERT', 'BART', 'T5', 'ELECTRA', 'ROBERTA',
        'NER', 'POS', 'QA', 'MT', 'ASR', 'TTS', 'NMT', 'SMT',
        'BLEU', 'ROUGE', 'METEOR', 'CIDEr', 'SPICE', 'WER', 'CER',

        # Computer Vision
        'CV', 'OCR', 'YOLO', 'RCNN', 'SSD', 'FCN', 'UNET', 'RESNET', 'VGG',
        'RGB', 'HSV', 'YUV', 'SIFT', 'SURF', 'ORB', 'HOG', 'SSIM', 'PSNR',

        # Reinforcement Learning
        'RL', 'DQN', 'DDPG', 'PPO', 'A3C', 'TRPO', 'SAC', 'TD3', 'MDP',
        'POMDP', 'RLHF', 'RLAIF',

        # Metrics & Evaluation
        'F1', 'AUC', 'ROC', 'PR', 'MAP', 'NDCG', 'MRR', 'MSE', 'MAE', 'RMSE',
        'MAPE', 'R2', 'IoU', 'AP', 'mAP', 'FPS', 'FLOPs', 'FLOPS',

        # Data & Statistics
        'IID', 'OOD', 'KL', 'JS', 'EMD', 'MMD', 'ELBO', 'VI', 'MCMC',
        'MLE', 'MAP', 'EM', 'GMM', 'HMM', 'CRF', 'MRF',

        # Academic & Organizations
        'IEEE', 'ACM', 'AAAI', 'IJCAI', 'ICML', 'ICLR', 'NEURIPS', 'NIPS',
        'ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL', 'CVPR', 'ICCV', 'ECCV',
        'SIGIR', 'KDD', 'WWW', 'CIKM', 'WSDM', 'ICDE', 'VLDB', 'SIGMOD',
        'AAAI', 'IJCAI', 'AISTATS', 'UAI', 'COLT', 'ALT',

        # Methods & Techniques (Common in ML papers)
        'SOTA', 'E2E', 'RAG', 'CoT', 'ToT', 'GoT', 'ICL', 'FSL', 'ZSL',
        'PEFT', 'LORA', 'QLORA', 'SFT', 'DPO', 'SPIN', 'URPO', 'SPELL',
        'STaR', 'ReST', 'RRHF', 'RAFT', 'LIMA', 'ORPO',

        # Misc
        'USD', 'EUR', 'GBP', 'EU', 'US', 'UK', 'UN', 'NATO', 'NASA',
        'ID', 'UID', 'UUID', 'MD5', 'SHA', 'AES', 'RSA', 'JWT',
        'CRUD', 'REST', 'SOAP', 'RPC', 'AJAX', 'DOM', 'OOP', 'MVC',
        'CI', 'CD', 'DevOps', 'AWS', 'GCP', 'GPU', 'NPU', 'ASIC', 'FPGA',
    }

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Report acronyms that are used without (or before) a definition.

        Only acronyms whose plausible full form also appears in the document
        are flagged, to keep false positives low.
        """
        results = []

        # Remove comments using base class method
        content = self._remove_comments(tex_content)

        # Find all defined acronyms with their positions
        defined_acronyms = self._find_definitions(content)

        # Find all acronym usages (excluding special contexts)
        all_usages = self._find_all_usages(content)

        # Find potential full forms for each acronym
        acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())

        # Check for undefined acronyms (only those with matching full forms)
        for acronym, positions in all_usages.items():
            if acronym in self.COMMON_ACRONYMS:
                continue

            # Skip if no matching full form found in document
            if acronym not in acronym_full_forms:
                continue

            if acronym not in defined_acronyms:
                # First usage should define it
                first_pos = positions[0]
                line_num = self._find_line_number(content, first_pos)
                full_form = acronym_full_forms[acronym]

                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Acronym '{acronym}' used without definition (possible full form: '{full_form}')",
                    line_number=line_num,
                    suggestion=f"Define on first use: '{full_form} ({acronym})'"
                ))
            else:
                # Check if used before definition
                def_pos = defined_acronyms[acronym]
                for pos in positions:
                    if pos < def_pos:
                        line_num = self._find_line_number(content, pos)
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"Acronym '{acronym}' used before definition",
                            line_number=line_num,
                            suggestion="Move definition before first use"
                        ))
                        break

        return results

    def _find_potential_full_forms(self, content: str, acronyms: Set[str]) -> Dict[str, str]:
        """Find potential full forms for acronyms by matching capital letters."""
        full_forms = {}

        for acronym in acronyms:
            if acronym in self.COMMON_ACRONYMS:
                continue

            # Build regex pattern to match full form
            # For "ABC", match words starting with A, B, C
            acronym_clean = acronym.rstrip('s')  # Remove plural
            if len(acronym_clean) < 3:
                continue

            # Create pattern: match sequence of words where first letters spell the acronym
            # Allow optional filler words in between (like "of", "the", "and")
            pattern_parts = []
            for i, letter in enumerate(acronym_clean):
                if i == 0:
                    # First word must start with the letter
                    pattern_parts.append(f'{letter}[a-z]+')
                else:
                    # Subsequent words: allow an optional filler word first.
                    # FIX: the filler group previously carried its own trailing
                    # `\s+` in addition to the mandatory `\s+` before the next
                    # initial, so it only matched with two consecutive runs of
                    # whitespace and was effectively dead in normal prose. The
                    # filler now ends right before the single mandatory `\s+`.
                    pattern_parts.append(f'(?:\\s+(?:of|the|and|for|in|on|with|to))?\\s+{letter}[a-z]+')

            full_pattern = r'\b' + ''.join(pattern_parts) + r'\b'

            try:
                matches = re.finditer(full_pattern, content, re.IGNORECASE)
                for match in matches:
                    candidate = match.group(0)

                    # Skip if candidate contains common non-content words
                    # These words indicate the match is part of a sentence, not an acronym full form
                    excluded_words = {
                        'that', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                        'or', 'not', 'no', 'yes', 'if', 'but', 'as', 'at', 'by', 'from',
                        'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
                        'can', 'could', 'may', 'might', 'must', 'shall',
                        'this', 'these', 'those', 'such', 'which', 'what', 'who', 'when', 'where',
                        'how', 'why', 'all', 'each', 'every', 'some', 'any', 'many', 'much',
                        'more', 'most', 'less', 'few', 'several', 'other', 'another'
                    }

                    candidate_words = re.findall(r'\b[A-Za-z]+\b', candidate.lower())
                    if any(word in excluded_words for word in candidate_words):
                        continue

                    # Verify: extract first letters and check if they match acronym
                    words = re.findall(r'\b[A-Z][a-z]+', candidate, re.IGNORECASE)
                    # Filter out filler words (allowed in between but not counted)
                    filler_words = {'of', 'and', 'for', 'in', 'on', 'with', 'to', 'a', 'an'}
                    meaningful_words = [w for w in words if w.lower() not in filler_words]

                    if len(meaningful_words) >= len(acronym_clean):
                        first_letters = ''.join(w[0].upper() for w in meaningful_words[:len(acronym_clean)])
                        if first_letters == acronym_clean:
                            full_forms[acronym] = candidate
                            break  # Found a match, use the first one
            except re.error:
                # Invalid regex, skip this acronym
                continue

        return full_forms

    def _find_definitions(self, content: str) -> Dict[str, int]:
        """Find all acronym definitions and their positions."""
        definitions = {}

        for match in self.DEFINITION_PATTERN.finditer(content):
            # Get acronym from either pattern
            acronym = match.group(2) or match.group(3)
            if acronym:
                acronym = acronym.rstrip('s')  # Remove plural
                definitions[acronym] = match.start()

        return definitions

    def _find_all_usages(self, content: str) -> Dict[str, List[int]]:
        """Find all acronym usages, excluding special contexts."""
        usages = defaultdict(list)

        for match in self.ACRONYM_PATTERN.finditer(content):
            acronym = match.group(1).rstrip('s')
            pos = match.start()

            # Skip if in special context
            if self._is_in_special_context(content, pos, acronym):
                continue

            usages[acronym].append(pos)

        return usages

    def _is_in_special_context(self, content: str, pos: int, acronym: str) -> bool:
        """Check if acronym at position is in a special context that should be ignored."""
        # Get surrounding context
        start = max(0, pos - 50)
        end = min(len(content), pos + len(acronym) + 50)
        before = content[start:pos]
        after = content[pos + len(acronym):end]

        # Skip if inside definition parentheses: (ACRONYM)
        if before.endswith('(') and after.startswith(')'):
            return True

        # Skip if inside LaTeX command: \ACRONYM or \command{ACRONYM}
        if before.rstrip().endswith('\\'):
            return True

        # Skip if inside label: \label{...:ACRONYM...}
        if r'\label{' in before[-20:] and '}' in after[:20]:
            return True

        # Skip if inside ref: \ref{...:ACRONYM...}
        if re.search(r'\\(?:ref|cite|autoref|cref|eqref)\{[^}]*$', before[-30:]):
            return True

        # Skip if inside URL: \url{...ACRONYM...} or http://...ACRONYM...
        if r'\url{' in before[-20:] or 'http' in before[-20:]:
            return True

        # Skip if inside math mode (simple heuristic)
        # Count $ signs before position
        dollar_count = before.count('$') - before.count(r'\$')
        if dollar_count % 2 == 1:  # Odd number means we're inside math mode
            return True

        # Skip if inside \begin{equation} or similar
        if re.search(r'\\begin\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
            if not re.search(r'\\end\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
                return True

        # Skip if it looks like a LaTeX command argument: \command[ACRONYM]
        if before.endswith('[') and after.startswith(']'):
            return True

        # Skip if part of a file path or extension
        if '.' in before[-5:] or '/' in before[-10:]:
            return True

        return False
|
src/checkers/ai_artifacts_checker.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AI artifacts checker.
|
| 3 |
+
|
| 4 |
+
Detects leftover text from AI writing assistants that should be removed
|
| 5 |
+
before submission, such as:
|
| 6 |
+
- Conversational responses ("Sure, here is...")
|
| 7 |
+
- Placeholder text
|
| 8 |
+
- Markdown formatting artifacts
|
| 9 |
+
- Common AI response patterns
|
| 10 |
+
"""
|
| 11 |
+
import re
|
| 12 |
+
from typing import List, Tuple
|
| 13 |
+
|
| 14 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class AIArtifactsChecker(BaseChecker):
    """Detect AI-generated text artifacts that should be removed."""

    name = "ai_artifacts"
    display_name = "AI Artifacts"
    description = "Detect leftover AI assistant text and placeholders"

    # Conversational AI patterns (case insensitive)
    # These are phrases that clearly indicate a dialogue between user and AI assistant
    AI_CONVERSATION_PATTERNS = [
        # Responses to requests
        (r'\bsure[,!]?\s*(here\s+is|i\'ll|i\s+will|let\s+me)\b', "Conversational AI response"),
        (r'\bi\'?d\s+be\s+happy\s+to\b', "Conversational AI response"),
        (r'\bi\'?m\s+happy\s+to\s+help\b', "Conversational AI response"),
        (r'\bcertainly[!,]\s*here\b', "Conversational AI response"),
        (r'\bof\s+course[!,]\s*(here|i)\b', "Conversational AI response"),
        (r'\babsolutely[!,]\s*(here|let\s+me)\b', "Conversational AI response"),

        # Self-identification
        (r'\bas\s+an?\s+ai\s+(language\s+)?model\b', "AI self-reference"),
        (r'\bas\s+a\s+large\s+language\s+model\b', "AI self-reference"),
        (r'\bmy\s+knowledge\s+cutoff\b', "AI knowledge cutoff reference"),

        # Explanatory transitions typical of chat
        (r'\blet\s+me\s+(explain|help|clarify|break\s+this\s+down)\b', "Conversational AI response"),
        (r'\bhere\'?s\s+(a|an|the|my)\s+(revised|updated|improved|rewrite)\b', "Conversational AI response"),
        (r'\bhere\s+is\s+(the|a|an)\s+(summary|breakdown|explanation|code|example)\b', "Conversational AI response"),

        # Closing/Politeness
        (r'\bhope\s+this\s+helps\b', "Conversational AI closing"),
        (r'\bfeel\s+free\s+to\s+ask\b', "Conversational AI closing"),
        (r'\blet\s+me\s+know\s+if\b', "Conversational AI closing"),
        (r'\bthank\s+you\s+for\s+(asking|your\s+question)\b', "Conversational AI response"),
        (r'\bgreat\s+question[!,]?\b', "Conversational AI response"),
        (r'\b(excellent|good|great)\s+point\b', "Conversational AI response"),

        # Instructions/Meta-commentary
        (r'\bbased\s+on\s+the\s+information\s+provided\b', "Conversational AI response"),
        (r'\b(remember|note)\s+that\b', "Conversational AI instruction"),
        (r'\bplease\s+note\s+that\b', "Conversational AI instruction"),
    ]

    # Placeholder patterns
    PLACEHOLDER_PATTERNS = [
        (r'\[insert\s+[^\]]+\s*here\]', "Placeholder text"),
        (r'\[add\s+[^\]]+\]', "Placeholder text"),
        (r'\[todo[:\s][^\]]*\]', "TODO placeholder"),
        (r'\btodo\s*:\s*.{0,50}', "TODO comment"),
        (r'\bfixme\s*:\s*.{0,50}', "FIXME comment"),
        (r'\bxxx\b', "XXX placeholder"),
        (r'\byour[\s_-]*(name|email|institution|university)\b', "Placeholder for personal info"),
        (r'author[\s_-]*name', "Author name placeholder"),
        (r'your\.?email@example\.com', "Email placeholder"),
        (r'example@(example\.com|university\.edu)', "Email placeholder"),
        (r'\[citation\s+needed\]', "Citation needed placeholder"),
    ]

    # Markdown artifacts (should not appear in LaTeX)
    MARKDOWN_PATTERNS = [
        (r'^\s*#{1,6}\s+\w', "Markdown header"),
        (r'\*\*[^*]+\*\*', "Markdown bold"),
        (r'(?<!\*)\*[^*\s][^*]*[^*\s]\*(?!\*)', "Markdown italic"),
        (r'(?<!`)`[^`\n]+`(?!`)', "Markdown inline code"),
        (r'```[\s\S]*?```', "Markdown code block"),
        (r'^\s*[-*+]\s+\w', "Markdown bullet point"),
        (r'^\s*\d+\.\s+\w', "Markdown numbered list"),
        (r'\[([^\]]+)\]\(([^)]+)\)', "Markdown link"),
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan the document line by line for leftover AI-assistant text.

        Emits ERROR for conversational phrases, WARNING for placeholders,
        and INFO for suspected Markdown formatting. Comment lines and the
        contents of verbatim-like environments are never inspected.
        """
        findings = []

        # Verbatim-like environments whose contents must not be checked.
        verbatim_envs = ['verbatim', 'lstlisting', 'minted', 'comment', 'raw', 'filecontents', 'tcolorbox']
        env_alt = '|'.join(verbatim_envs)
        begin_re = re.compile(r'\\begin\{(' + env_alt + r')\*?\}')
        end_re = re.compile(r'\\end\{(' + env_alt + r')\*?\}')

        inside_verbatim = False
        for idx, raw_line in enumerate(tex_content.split('\n'), 1):
            # Environment boundaries toggle the verbatim state; the
            # \begin/\end lines themselves are never inspected.
            if begin_re.search(raw_line):
                inside_verbatim = True
                continue
            if end_re.search(raw_line):
                inside_verbatim = False
                continue
            if inside_verbatim:
                continue

            # Fully-commented lines are skipped outright.
            if self._is_comment_line(raw_line):
                continue

            # Strip any trailing inline comment before matching.
            checkable = self._remove_line_comment(raw_line)

            # Conversational AI phrases: at most one finding per line.
            for regex, label in self.AI_CONVERSATION_PATTERNS:
                if re.search(regex, checkable, re.IGNORECASE):
                    findings.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{label} detected",
                        line_number=idx,
                        line_content=raw_line.strip()[:100],
                        suggestion="Remove AI-generated conversational text"
                    ))
                    break

            # Placeholder text: every matching pattern on the line is reported.
            for regex, label in self.PLACEHOLDER_PATTERNS:
                hit = re.search(regex, checkable, re.IGNORECASE)
                if hit:
                    findings.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"{label}: '{hit.group(0)[:50]}'",
                        line_number=idx,
                        line_content=raw_line.strip()[:100],
                        suggestion="Replace placeholder with actual content or remove"
                    ))

            # Markdown leftovers (less strict - might be intentional).
            # Lines that begin with a LaTeX command are skipped entirely.
            if checkable.strip().startswith('\\'):
                continue
            for regex, label in self.MARKDOWN_PATTERNS:
                # Bullet points: a "-"/"+" next to a digit or any math on the
                # line is more likely arithmetic than a Markdown list.
                if "bullet point" in label:
                    if re.search(r'[-+]\d', checkable):
                        continue
                    if '$' in checkable:
                        continue

                # Italics: avoid matching math mode such as $x*y$.
                if "italic" in label:
                    if '$' in checkable:
                        continue

                if re.search(regex, checkable):
                    findings.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Possible {label} in LaTeX",
                        line_number=idx,
                        line_content=raw_line.strip()[:100],
                        suggestion="Convert to LaTeX formatting or remove if unintentional"
                    ))

        return findings
|
src/checkers/anonymization_checker.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Anonymization checker.
|
| 3 |
+
|
| 4 |
+
For double-blind review submissions, checks for:
|
| 5 |
+
- Author name leaks in acknowledgments
|
| 6 |
+
- Personal URLs (GitHub, personal pages)
|
| 7 |
+
- Self-citations that reveal identity
|
| 8 |
+
- Institutional information in comments
|
| 9 |
+
"""
|
| 10 |
+
import re
|
| 11 |
+
from typing import List
|
| 12 |
+
|
| 13 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class AnonymizationChecker(BaseChecker):
|
| 17 |
+
"""Check for anonymization issues in double-blind submissions."""
|
| 18 |
+
|
| 19 |
+
name = "anonymization"
|
| 20 |
+
display_name = "Anonymization"
|
| 21 |
+
description = "Detect potential identity leaks in double-blind submissions"
|
| 22 |
+
|
| 23 |
+
# Patterns for identity-revealing content
|
| 24 |
+
PERSONAL_URL_PATTERNS = [
|
| 25 |
+
(r'github\.com/[a-zA-Z0-9_-]+', "GitHub profile URL"),
|
| 26 |
+
(r'gitlab\.com/[a-zA-Z0-9_-]+', "GitLab profile URL"),
|
| 27 |
+
(r'twitter\.com/[a-zA-Z0-9_]+', "Twitter profile URL"),
|
| 28 |
+
(r'linkedin\.com/in/[a-zA-Z0-9_-]+', "LinkedIn profile URL"),
|
| 29 |
+
(r'huggingface\.co/[a-zA-Z0-9_-]+', "HuggingFace profile URL"),
|
| 30 |
+
(r'~[a-zA-Z]+/', "Personal university page"),
|
| 31 |
+
(r'people\.[a-zA-Z]+\.edu', "Academic personal page"),
|
| 32 |
+
(r'homes\.[a-zA-Z]+\.(edu|ac\.[a-z]+)', "Academic home page"),
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
# Anonymous submission indicators (should be present)
|
| 36 |
+
ANONYMOUS_MARKERS = [
|
| 37 |
+
r'\\author\{[^}]*anonymous[^}]*\}',
|
| 38 |
+
r'anonymous\s+submission',
|
| 39 |
+
r'\\runningauthor\{[^}]*\}', # Should be empty or generic
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
# Potentially revealing patterns
|
| 43 |
+
SELF_CITE_PATTERNS = [
|
| 44 |
+
r'\\cite[pt]?\{[^}]*\}\s*(?:show|demonstrate|propose|present|introduce)',
|
| 45 |
+
r'(?:our|we)\s+(?:previous|prior|earlier)\s+(?:work|paper|study)',
|
| 46 |
+
r'(?:as\s+)?(?:we|the\s+authors?)\s+(?:have\s+)?(?:shown|demonstrated|proved)',
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
# Acknowledgment patterns
|
| 50 |
+
ACK_PATTERN = re.compile(
|
| 51 |
+
r'\\(?:section\*?\{acknowledgment|begin\{ack)',
|
| 52 |
+
re.IGNORECASE
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan the document for identity leaks in an anonymous submission.

        If the document looks like a camera-ready version, anonymization
        checks are skipped entirely and a single INFO result is returned.
        Otherwise, four passes run: personal URLs (including inside
        comments), the acknowledgments section, self-revealing citation
        phrasing, and the \\author field.
        """
        results = []
        lines = tex_content.split('\n')

        # Check if this is a review submission (look for anonymous author)
        is_review_version = self._is_review_version(tex_content)

        if not is_review_version:
            # If camera-ready, skip anonymization checks
            results.append(self._create_result(
                passed=True,
                severity=CheckSeverity.INFO,
                message="Document appears to be camera-ready version (not checking anonymization)"
            ))
            return results

        # Check for personal URLs
        for line_num, line in enumerate(lines, 1):
            # Skip comments, but still check for leaks in comments!
            # Commented-out URLs get a WARNING (milder than the ERROR below)
            # because they could still be revealed when sources are shared.
            if self._is_comment_line(line):
                for pattern, desc in self.PERSONAL_URL_PATTERNS:
                    if re.search(pattern, line, re.IGNORECASE):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"{desc} in comment (could be revealed when compiling)",
                            line_number=line_num,
                            line_content=line.strip()[:100],
                            suggestion="Remove or anonymize URL even in comments"
                        ))
                continue

            # Non-comment lines: a personal URL here is a hard ERROR.
            for pattern, desc in self.PERSONAL_URL_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{desc} may reveal author identity",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace with anonymized URL or remove for review"
                    ))

        # Check acknowledgments section
        ack_results = self._check_acknowledgments(tex_content, lines)
        results.extend(ack_results)

        # Check for self-revealing citations
        for line_num, line in enumerate(lines, 1):
            # Skip comments using base class method
            if self._is_comment_line(line):
                continue

            for pattern in self.SELF_CITE_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Potentially self-revealing citation pattern",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
                    ))

        # Check for \author content
        author_results = self._check_author_field(tex_content)
        results.extend(author_results)

        return results
|
| 124 |
+
|
| 125 |
+
def _is_review_version(self, content: str) -> bool:
|
| 126 |
+
"""Detect if this is a review (anonymous) version."""
|
| 127 |
+
# Check for common anonymous submission markers
|
| 128 |
+
review_indicators = [
|
| 129 |
+
r'review',
|
| 130 |
+
r'submitted\s+to',
|
| 131 |
+
r'under\s+review',
|
| 132 |
+
r'anonymous',
|
| 133 |
+
r'\\usepackage\[review\]',
|
| 134 |
+
]
|
| 135 |
+
|
| 136 |
+
for indicator in review_indicators:
|
| 137 |
+
if re.search(indicator, content[:2000], re.IGNORECASE):
|
| 138 |
+
return True
|
| 139 |
+
|
| 140 |
+
# Check for camera-ready indicators (negative)
|
| 141 |
+
camera_indicators = [
|
| 142 |
+
r'\\usepackage\[accepted\]',
|
| 143 |
+
r'\\usepackage\[final\]',
|
| 144 |
+
r'camera[\s-]?ready',
|
| 145 |
+
]
|
| 146 |
+
|
| 147 |
+
for indicator in camera_indicators:
|
| 148 |
+
if re.search(indicator, content[:2000], re.IGNORECASE):
|
| 149 |
+
return False
|
| 150 |
+
|
| 151 |
+
# Default to review version (safer)
|
| 152 |
+
return True
|
| 153 |
+
|
| 154 |
+
def _check_acknowledgments(self, content: str, lines: List[str]) -> List[CheckResult]:
    """Flag an acknowledgments section that is still active (not commented out)."""
    findings = []

    match = self.ACK_PATTERN.search(content)
    if match is None:
        # No acknowledgments section at all - nothing to report.
        return findings

    ack_line = self._find_line_number(content, match.start())

    # Fetch the raw source line; an out-of-range index means we cannot
    # inspect it, so treat it as empty (and therefore not commented out).
    raw_line = lines[ack_line - 1] if ack_line <= len(lines) else ""
    if raw_line.lstrip().startswith('%'):
        # Already commented out - fine for anonymous review.
        return findings

    findings.append(self._create_result(
        passed=False,
        severity=CheckSeverity.WARNING,
        message="Acknowledgments section found - should be commented out for review",
        line_number=ack_line,
        suggestion="Comment out acknowledgments with % for anonymous submission"
    ))
    return findings
|
| 178 |
+
|
| 179 |
+
def _check_author_field(self, content: str) -> List[CheckResult]:
    """Check \\author{} field for revealing content."""
    findings = []

    match = re.search(r'\\author\s*\{', content, re.DOTALL)
    if match is None:
        return findings

    # Scan forward from the opening brace, tracking nesting depth,
    # to locate the matching closing brace of \author{...}.
    body_start = match.end()
    depth = 1
    pos = body_start
    while pos < len(content) and depth:
        ch = content[pos]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
        pos += 1

    author_content = content[body_start:pos - 1]
    line_num = self._find_line_number(content, match.start())

    # Consider the field safe when it carries an explicit anonymity marker
    # or uses an anonymization command.
    looks_anonymous = (
        re.search(r'anonymous|author\s*names?\s*hidden', author_content, re.IGNORECASE)
        or re.search(r'\\(Anonymous|blindauthor)', author_content)
    )

    # A "Firstname Lastname" shaped token suggests real author names leaked in.
    if not looks_anonymous and re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', author_content):
        findings.append(self._create_result(
            passed=False,
            severity=CheckSeverity.ERROR,
            message="Author field may contain real names",
            line_number=line_num,
            suggestion="Replace with 'Anonymous' or use anonymization command"
        ))

    return findings
|
src/checkers/base.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Base checker class for paper submission quality checks.
|
| 3 |
+
|
| 4 |
+
All specific checkers inherit from BaseChecker and implement
|
| 5 |
+
the check() method to validate specific aspects of the TeX document.
|
| 6 |
+
"""
|
| 7 |
+
import re
|
| 8 |
+
from abc import ABC, abstractmethod
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from enum import Enum
|
| 11 |
+
from typing import List, Optional, Tuple
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class CheckSeverity(Enum):
    """Severity levels for check results."""

    # Must fix before submission.
    ERROR = "error"
    # Strongly recommended to fix.
    WARNING = "warning"
    # Suggestion or best practice.
    INFO = "info"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class CheckResult:
    """Result of a single check.

    Location fields (line_number, line_content, file_path) and the
    suggestion are optional: not every finding can be tied to a line.
    """

    checker_name: str
    passed: bool
    severity: CheckSeverity
    message: str
    line_number: Optional[int] = None
    line_content: Optional[str] = None
    suggestion: Optional[str] = None
    file_path: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize to a plain dict, flattening severity to its string value."""
        keys = ('checker', 'passed', 'severity', 'message',
                'line', 'content', 'suggestion', 'file_path')
        values = (self.checker_name, self.passed, self.severity.value, self.message,
                  self.line_number, self.line_content, self.suggestion, self.file_path)
        return dict(zip(keys, values))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class BaseChecker(ABC):
    """
    Abstract base class for all paper submission checkers.

    Each checker validates a specific aspect of the paper,
    such as caption placement, reference integrity, or formatting.

    Subclasses implement check() and describe themselves via the
    name / display_name / description class attributes.
    """

    # Checker metadata - override in subclasses
    name: str = "base"
    display_name: str = "Base Checker"
    description: str = "Base checker class"

    @abstractmethod
    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """
        Run the check on the given TeX content.

        Args:
            tex_content: The full content of the TeX file
            config: Optional configuration dict (e.g., conference-specific settings)

        Returns:
            List of CheckResult objects describing found issues
        """
        pass

    @staticmethod
    def _find_comment_start(line: str) -> int:
        """
        Return the index of the first unescaped '%' in *line*, or -1 if none.

        A '%' preceded by an odd number of backslashes is escaped (\\%) and
        does not start a comment.  This is the single source of truth for
        comment detection, shared by _remove_line_comment and _is_commented
        (which previously duplicated this scan).
        """
        for i, ch in enumerate(line):
            if ch != '%':
                continue
            backslashes = 0
            j = i - 1
            while j >= 0 and line[j] == '\\':
                backslashes += 1
                j -= 1
            if backslashes % 2 == 0:
                # Even number of backslashes: the '%' is not escaped.
                return i
        return -1

    def _remove_comments(self, content: str) -> str:
        """
        Remove all LaTeX comments from content.

        Preserves line structure (replaces comment with empty string on same line).
        Handles escaped percent signs (\\%) correctly.
        """
        return '\n'.join(self._remove_line_comment(line)
                         for line in content.split('\n'))

    def _remove_line_comment(self, line: str) -> str:
        """Remove comment from a single line, preserving content before %."""
        idx = self._find_comment_start(line)
        return line if idx < 0 else line[:idx]

    def _is_comment_line(self, line: str) -> bool:
        """Check if a line is entirely a comment (starts with %).

        Blank/whitespace-only lines are NOT considered comments.
        """
        stripped = line.lstrip()
        return bool(stripped) and stripped[0] == '%'

    def _get_non_comment_lines(self, content: str) -> List[Tuple[int, str]]:
        """
        Get all non-comment lines with their line numbers.

        Returns:
            List of (line_number, line_content) tuples for non-comment lines.
            Line content has inline comments removed; lines that are empty
            after comment removal are dropped.
        """
        result = []
        for line_num, line in enumerate(content.split('\n'), 1):
            # Skip pure comment lines.
            if self._is_comment_line(line):
                continue
            # Remove inline comments; skip if nothing substantive remains.
            cleaned = self._remove_line_comment(line)
            if not cleaned.strip():
                continue
            result.append((line_num, cleaned))
        return result

    def _find_line_number(self, content: str, position: int) -> int:
        """Find the 1-based line number for a character position in content."""
        return content[:position].count('\n') + 1

    def _get_line_content(self, content: str, line_number: int) -> str:
        """Get the stripped content of a specific 1-based line ('' if out of range)."""
        lines = content.split('\n')
        if 1 <= line_number <= len(lines):
            return lines[line_number - 1].strip()
        return ""

    def _is_commented(self, content: str, position: int) -> bool:
        """Check if a position is within a LaTeX comment.

        True when an unescaped '%' occurs earlier on the same line.
        """
        line_start = content.rfind('\n', 0, position) + 1
        return self._find_comment_start(content[line_start:position]) != -1

    def _create_result(
        self,
        passed: bool,
        severity: CheckSeverity,
        message: str,
        line_number: Optional[int] = None,
        line_content: Optional[str] = None,
        suggestion: Optional[str] = None
    ) -> CheckResult:
        """Helper to create a CheckResult with this checker's name."""
        return CheckResult(
            checker_name=self.name,
            passed=passed,
            severity=severity,
            message=message,
            line_number=line_number,
            line_content=line_content,
            suggestion=suggestion
        )
|
| 193 |
+
|
src/checkers/caption_checker.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Caption placement checker.
|
| 3 |
+
|
| 4 |
+
Validates that:
|
| 5 |
+
- Table captions appear ABOVE the table content
|
| 6 |
+
- Figure captions appear BELOW the figure content
|
| 7 |
+
"""
|
| 8 |
+
import re
|
| 9 |
+
from typing import List
|
| 10 |
+
|
| 11 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class CaptionChecker(BaseChecker):
    """Check for correct caption placement in tables and figures."""

    name = "caption"
    display_name = "Caption Placement"
    description = "Verify table captions are above and figure captions are below"

    # Environment patterns (starred variants included)
    TABLE_ENV_PATTERN = re.compile(
        r'\\begin\{table\*?\}(.*?)\\end\{table\*?\}',
        re.DOTALL | re.IGNORECASE
    )
    FIGURE_ENV_PATTERN = re.compile(
        r'\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}',
        re.DOTALL | re.IGNORECASE
    )

    # Content patterns
    CAPTION_PATTERN = re.compile(r'\\caption\s*[\[{]')
    TABULAR_PATTERN = re.compile(r'\\begin\{tabular')
    INCLUDEGRAPHICS_PATTERN = re.compile(r'\\includegraphics')
    TIKZ_PATTERN = re.compile(r'\\begin\{tikzpicture\}')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan every (non-commented) table/figure environment for caption issues."""
        findings = []
        scans = (
            (self.TABLE_ENV_PATTERN, self._check_table_caption),
            (self.FIGURE_ENV_PATTERN, self._check_figure_caption),
        )
        for pattern, handler in scans:
            for match in pattern.finditer(tex_content):
                env_start = match.start()
                # Commented-out environments are not part of the document.
                if self._is_commented(tex_content, env_start):
                    continue
                issue = handler(match.group(1), tex_content, env_start)
                if issue is not None:
                    findings.append(issue)
        return findings

    def _check_table_caption(self, env_content: str, full_content: str, env_start: int) -> CheckResult:
        """Table captions must precede the tabular body."""
        caption = self.CAPTION_PATTERN.search(env_content)
        tabular = self.TABULAR_PATTERN.search(env_content)

        if caption is None:
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Table environment missing caption",
                line_number=self._find_line_number(full_content, env_start),
                suggestion="Add \\caption{} before \\begin{tabular}"
            )

        if tabular is None or caption.start() < tabular.start():
            # No tabular body to compare against, or caption already above it.
            return None

        bad_line = self._find_line_number(full_content, env_start + caption.start())
        return self._create_result(
            passed=False,
            severity=CheckSeverity.ERROR,
            message="Table caption should be placed ABOVE the table content",
            line_number=bad_line,
            line_content=self._get_line_content(full_content, bad_line),
            suggestion="Move \\caption{} before \\begin{tabular}"
        )

    def _check_figure_caption(self, env_content: str, full_content: str, env_start: int) -> CheckResult:
        """Figure captions must follow the graphics/tikz body."""
        caption = self.CAPTION_PATTERN.search(env_content)
        # The figure body is either an included graphic or a tikzpicture.
        body = (self.INCLUDEGRAPHICS_PATTERN.search(env_content)
                or self.TIKZ_PATTERN.search(env_content))

        if caption is None:
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Figure environment missing caption",
                line_number=self._find_line_number(full_content, env_start),
                suggestion="Add \\caption{} after \\includegraphics"
            )

        if body is None or caption.start() > body.start():
            # No recognizable figure body (custom content), or caption already below it.
            return None

        bad_line = self._find_line_number(full_content, env_start + caption.start())
        return self._create_result(
            passed=False,
            severity=CheckSeverity.ERROR,
            message="Figure caption should be placed BELOW the figure content",
            line_number=bad_line,
            line_content=self._get_line_content(full_content, bad_line),
            suggestion="Move \\caption{} after \\includegraphics"
        )
|
src/checkers/citation_quality_checker.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Citation quality checker.
|
| 3 |
+
|
| 4 |
+
Validates:
|
| 5 |
+
- Old citations (>30 years) that might need updating
|
| 6 |
+
- Citation formatting patterns (et al., hardcoded citations, etc.)
|
| 7 |
+
"""
|
| 8 |
+
import re
|
| 9 |
+
from typing import List, Dict
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from collections import defaultdict
|
| 12 |
+
|
| 13 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CitationQualityChecker(BaseChecker):
    """Check citation quality and balance."""

    name = "citation_quality"
    display_name = "Citation Quality"
    description = "Check citation age, balance, and formatting"

    # Citations older than this many years get flagged.
    OLD_CITATION_YEARS = 30

    CURRENT_YEAR = datetime.now().year

    # Any plausible 4-digit publication year (1800-2099).  The previous
    # pattern (19[89]\d|20[01]\d) only covered 1980-2019, so genuinely old
    # pre-1980 citations were never flagged and the window was frozen in
    # time; the age comparison below now does the filtering.
    YEAR_PATTERN = re.compile(r'\b(1[89]\d{2}|20\d{2})\b')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """
        Analyze citation patterns visible in the TeX source.

        This checker works best with bib content, but it can flag old years
        appearing in text and common citation formatting issues on its own.

        Args:
            tex_content: The full content of the TeX file
            config: Optional configuration dict (unused)

        Returns:
            List of CheckResult objects describing found issues
        """
        results = []
        results.extend(self._check_old_citations_in_text(tex_content))
        results.extend(self._check_citation_formatting(tex_content))
        return results

    def _check_old_citations_in_text(self, content: str) -> List[CheckResult]:
        """Look for citations with old years visible in text (one report per year)."""
        results = []
        old_years_found = set()

        for line_num, line in enumerate(content.split('\n'), 1):
            # Skip comments using base class method.
            if self._is_comment_line(line):
                continue

            for match in self.YEAR_PATTERN.finditer(line):
                year = int(match.group(1))
                age = self.CURRENT_YEAR - year
                if age >= self.OLD_CITATION_YEARS and year not in old_years_found:
                    old_years_found.add(year)
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Citation from {year} ({age} years old)",
                        line_number=line_num,
                        suggestion="Consider if there's more recent work on this topic"
                    ))

        return results

    def _check_citation_formatting(self, content: str) -> List[CheckResult]:
        """Check for common citation formatting issues.

        The three checks below are independent per line; previously a
        `continue` inside the numeric-style check also suppressed the
        hardcoded-citation check on \\newcommand lines.
        """
        results = []

        for line_num, line in enumerate(content.split('\n'), 1):
            # Skip comments using base class method (consistent with the
            # other checks instead of an ad-hoc startswith test).
            if self._is_comment_line(line):
                continue

            # "et al" without the trailing period.
            if re.search(r'\bet al\b(?!\.)', line):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="'et al' should be 'et al.'",
                    line_number=line_num,
                    suggestion="Add period after 'et al.'"
                ))

            # "[1]" style citations (might want natbib style).  Skip command
            # definitions and bracket arguments like \newcommand{\foo}[1].
            if (re.search(r'\[\d+\]', line)
                    and '\\newcommand' not in line
                    and '\\renewcommand' not in line
                    and '\\def' not in line
                    and not re.search(r'\\[a-zA-Z]+\[\d+\]', line)
                    and '\\cite' not in line
                    and not re.search(r'\\[a-zA-Z]+\{', line[:20])):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Numeric citation style detected",
                    line_number=line_num,
                    suggestion="Consider author-year style for better readability"
                ))

            # Hardcoded "(Author, 2020)" citations instead of \cite.
            if ('\\cite' not in line
                    and re.search(r'\([A-Z][a-z]+(?:\s+et\s+al\.?)?,?\s*\d{4}\)', line)):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="Appears to be hardcoded citation instead of \\cite",
                    line_number=line_num,
                    line_content=line.strip()[:80],
                    suggestion="Use \\cite{} for proper bibliography management"
                ))

        return results
|
src/checkers/consistency_checker.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Terminology consistency checker.
|
| 3 |
+
|
| 4 |
+
Validates:
|
| 5 |
+
- Consistent spelling of the same term
|
| 6 |
+
- Consistent hyphenation
|
| 7 |
+
- Consistent capitalization of technical terms
|
| 8 |
+
"""
|
| 9 |
+
import re
|
| 10 |
+
from typing import List, Dict, Set
|
| 11 |
+
from collections import defaultdict
|
| 12 |
+
|
| 13 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ConsistencyChecker(BaseChecker):
|
| 17 |
+
"""Check terminology and spelling consistency."""
|
| 18 |
+
|
| 19 |
+
name = "consistency"
|
| 20 |
+
display_name = "Consistency"
|
| 21 |
+
description = "Check for inconsistent terminology and spelling"
|
| 22 |
+
|
| 23 |
+
# Known variant pairs (canonical -> variants)
|
| 24 |
+
KNOWN_VARIANTS = {
|
| 25 |
+
# Hyphenation variants
|
| 26 |
+
'self-supervised': ['self supervised', 'selfsupervised'],
|
| 27 |
+
'pre-trained': ['pre trained', 'pretrained'],
|
| 28 |
+
'fine-tuned': ['fine tuned', 'finetuned'],
|
| 29 |
+
'state-of-the-art': ['state of the art', 'stateoftheart'],
|
| 30 |
+
'real-world': ['real world', 'realworld'],
|
| 31 |
+
'end-to-end': ['end to end', 'endtoend', 'e2e'],
|
| 32 |
+
'large-scale': ['large scale', 'largescale'],
|
| 33 |
+
'long-term': ['long term', 'longterm'],
|
| 34 |
+
'short-term': ['short term', 'shortterm'],
|
| 35 |
+
'multi-task': ['multi task', 'multitask'],
|
| 36 |
+
'multi-modal': ['multi modal', 'multimodal'],
|
| 37 |
+
'cross-lingual': ['cross lingual', 'crosslingual'],
|
| 38 |
+
'zero-shot': ['zero shot', 'zeroshot'],
|
| 39 |
+
'few-shot': ['few shot', 'fewshot'],
|
| 40 |
+
'in-context': ['in context', 'incontext'],
|
| 41 |
+
|
| 42 |
+
# American vs British English (comprehensive list)
|
| 43 |
+
# -or/-our endings
|
| 44 |
+
'color': ['colour'],
|
| 45 |
+
'behavior': ['behaviour'],
|
| 46 |
+
'favor': ['favour'],
|
| 47 |
+
'honor': ['honour'],
|
| 48 |
+
'labor': ['labour'],
|
| 49 |
+
'neighbor': ['neighbour'],
|
| 50 |
+
'rumor': ['rumour'],
|
| 51 |
+
'vapor': ['vapour'],
|
| 52 |
+
|
| 53 |
+
# -ize/-ise endings
|
| 54 |
+
'analyze': ['analyse'],
|
| 55 |
+
'characterize': ['characterise'],
|
| 56 |
+
'generalize': ['generalise'],
|
| 57 |
+
'initialize': ['initialise'],
|
| 58 |
+
'maximize': ['maximise'],
|
| 59 |
+
'minimize': ['minimise'],
|
| 60 |
+
'normalize': ['normalise'],
|
| 61 |
+
'optimize': ['optimise'],
|
| 62 |
+
'organize': ['organise'],
|
| 63 |
+
'realize': ['realise'],
|
| 64 |
+
'recognize': ['recognise'],
|
| 65 |
+
'specialize': ['specialise'],
|
| 66 |
+
'standardize': ['standardise'],
|
| 67 |
+
'summarize': ['summarise'],
|
| 68 |
+
'utilize': ['utilise'],
|
| 69 |
+
'visualize': ['visualise'],
|
| 70 |
+
'categorize': ['categorise'],
|
| 71 |
+
'emphasize': ['emphasise'],
|
| 72 |
+
'hypothesize': ['hypothesise'],
|
| 73 |
+
'prioritize': ['prioritise'],
|
| 74 |
+
'synchronize': ['synchronise'],
|
| 75 |
+
|
| 76 |
+
# -ization/-isation endings
|
| 77 |
+
'generalization': ['generalisation'],
|
| 78 |
+
'initialization': ['initialisation'],
|
| 79 |
+
'maximization': ['maximisation'],
|
| 80 |
+
'minimization': ['minimisation'],
|
| 81 |
+
'normalization': ['normalisation'],
|
| 82 |
+
'optimization': ['optimisation'],
|
| 83 |
+
'organization': ['organisation'],
|
| 84 |
+
'realization': ['realisation'],
|
| 85 |
+
'regularization': ['regularisation'],
|
| 86 |
+
'specialization': ['specialisation'],
|
| 87 |
+
'standardization': ['standardisation'],
|
| 88 |
+
'summarization': ['summarisation'],
|
| 89 |
+
'utilization': ['utilisation'],
|
| 90 |
+
'visualization': ['visualisation'],
|
| 91 |
+
'categorization': ['categorisation'],
|
| 92 |
+
'characterization': ['characterisation'],
|
| 93 |
+
'parametrization': ['parametrisation'],
|
| 94 |
+
'quantization': ['quantisation'],
|
| 95 |
+
|
| 96 |
+
# -er/-re endings
|
| 97 |
+
'center': ['centre'],
|
| 98 |
+
'fiber': ['fibre'],
|
| 99 |
+
'meter': ['metre'],
|
| 100 |
+
'liter': ['litre'],
|
| 101 |
+
|
| 102 |
+
# -l-/-ll- (American single, British double)
|
| 103 |
+
'modeling': ['modelling'],
|
| 104 |
+
'labeled': ['labelled'],
|
| 105 |
+
'labeling': ['labelling'],
|
| 106 |
+
'traveled': ['travelled'],
|
| 107 |
+
'traveling': ['travelling'],
|
| 108 |
+
'canceled': ['cancelled'],
|
| 109 |
+
'canceling': ['cancelling'],
|
| 110 |
+
'signaled': ['signalled'],
|
| 111 |
+
'signaling': ['signalling'],
|
| 112 |
+
|
| 113 |
+
# -og/-ogue endings
|
| 114 |
+
'analog': ['analogue'],
|
| 115 |
+
'catalog': ['catalogue'],
|
| 116 |
+
'dialog': ['dialogue'],
|
| 117 |
+
|
| 118 |
+
# -ense/-ence endings
|
| 119 |
+
'defense': ['defence'],
|
| 120 |
+
'license': ['licence'],
|
| 121 |
+
'offense': ['offence'],
|
| 122 |
+
|
| 123 |
+
# Other common differences
|
| 124 |
+
'gray': ['grey'],
|
| 125 |
+
'artifact': ['artefact'],
|
| 126 |
+
'program': ['programme'], # Note: 'program' is standard in computing
|
| 127 |
+
'skeptical': ['sceptical'],
|
| 128 |
+
'aluminum': ['aluminium'],
|
| 129 |
+
|
| 130 |
+
# Verb forms
|
| 131 |
+
'learned': ['learnt'],
|
| 132 |
+
'burned': ['burnt'],
|
| 133 |
+
'spelled': ['spelt'],
|
| 134 |
+
|
| 135 |
+
# Common term variants
|
| 136 |
+
'dataset': ['data set', 'data-set'],
|
| 137 |
+
'benchmark': ['bench mark', 'bench-mark'],
|
| 138 |
+
'baseline': ['base line', 'base-line'],
|
| 139 |
+
'downstream': ['down stream', 'down-stream'],
|
| 140 |
+
'upstream': ['up stream', 'up-stream'],
|
| 141 |
+
'encoder': ['en-coder'],
|
| 142 |
+
'decoder': ['de-coder'],
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
# Capitalization variants to track
|
| 146 |
+
CAPITALIZATION_TERMS = [
|
| 147 |
+
'transformer', 'attention', 'bert', 'gpt', 'lstm', 'cnn', 'rnn',
|
| 148 |
+
'encoder', 'decoder', 'embedding', 'softmax', 'sigmoid', 'relu',
|
| 149 |
+
]
|
| 150 |
+
|
| 151 |
+
def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
    """
    Report terminology inconsistencies in the document.

    Looks for (1) known spelling/hyphenation variant pairs used together,
    (2) ad-hoc hyphenation inconsistencies, and (3) inconsistent
    capitalization of technical terms.

    Args:
        tex_content: The full content of the TeX file
        config: Optional configuration dict (unused)

    Returns:
        List of CheckResult objects describing found issues
    """
    results = []

    # Strip comments (unescaped % to end of line) before matching.
    # NOTE: the previous version also computed an unused lowercased copy
    # of the content; that dead code has been removed.
    content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)

    # Flag any canonical term that co-occurs with one of its known variants.
    for canonical, variants in self.KNOWN_VARIANTS.items():
        found_forms = [
            form
            for form in (canonical, *variants)
            if re.search(rf'\b{re.escape(form)}\b', content, re.IGNORECASE)
        ]
        if len(found_forms) > 1:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message=f"Inconsistent spelling: {', '.join(found_forms)}",
                suggestion=f"Use '{canonical}' consistently throughout"
            ))

    # Hyphenation and capitalization checks are delegated to helpers.
    results.extend(self._check_hyphenation_consistency(content))
    results.extend(self._check_capitalization_consistency(content))

    return results
|
| 188 |
+
|
| 189 |
+
def _check_hyphenation_consistency(self, content: str) -> List[CheckResult]:
|
| 190 |
+
"""Find words that appear both hyphenated and non-hyphenated."""
|
| 191 |
+
results = []
|
| 192 |
+
|
| 193 |
+
# Common terms that should always be hyphenated (exceptions)
|
| 194 |
+
ALWAYS_HYPHENATED = {
|
| 195 |
+
'state-of-the-art', 'end-to-end', 'real-time', 'real-world',
|
| 196 |
+
'fine-tuning', 'fine-grained', 'large-scale', 'small-scale',
|
| 197 |
+
'multi-task', 'multi-modal', 'cross-domain', 'cross-lingual',
|
| 198 |
+
'self-supervised', 'self-attention', 'co-training', 'pre-training',
|
| 199 |
+
'post-processing', 'pre-processing', 'well-known', 'well-defined',
|
| 200 |
+
'high-quality', 'low-quality', 'long-term', 'short-term'
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
# Find all hyphenated words
|
| 204 |
+
hyphenated = set(re.findall(r'\b([a-z]+-[a-z]+(?:-[a-z]+)*)\b', content, re.IGNORECASE))
|
| 205 |
+
|
| 206 |
+
for hyph_word in hyphenated:
|
| 207 |
+
# Skip if it's a known compound that should always be hyphenated
|
| 208 |
+
if hyph_word.lower() in ALWAYS_HYPHENATED:
|
| 209 |
+
continue
|
| 210 |
+
|
| 211 |
+
# Create non-hyphenated version
|
| 212 |
+
non_hyph = hyph_word.replace('-', ' ')
|
| 213 |
+
combined = hyph_word.replace('-', '')
|
| 214 |
+
|
| 215 |
+
# Check if non-hyphenated version exists
|
| 216 |
+
if re.search(rf'\b{re.escape(non_hyph)}\b', content, re.IGNORECASE):
|
| 217 |
+
results.append(self._create_result(
|
| 218 |
+
passed=False,
|
| 219 |
+
severity=CheckSeverity.INFO,
|
| 220 |
+
message=f"Inconsistent hyphenation: '{hyph_word}' vs '{non_hyph}'",
|
| 221 |
+
suggestion="Choose one form and use it consistently"
|
| 222 |
+
))
|
| 223 |
+
elif re.search(rf'\b{re.escape(combined)}\b', content, re.IGNORECASE):
|
| 224 |
+
results.append(self._create_result(
|
| 225 |
+
passed=False,
|
| 226 |
+
severity=CheckSeverity.INFO,
|
| 227 |
+
message=f"Inconsistent hyphenation: '{hyph_word}' vs '{combined}'",
|
| 228 |
+
suggestion="Choose one form and use it consistently"
|
| 229 |
+
))
|
| 230 |
+
|
| 231 |
+
return results
|
| 232 |
+
|
| 233 |
+
def _check_capitalization_consistency(self, content: str) -> List[CheckResult]:
|
| 234 |
+
"""Check if technical terms have consistent capitalization."""
|
| 235 |
+
results = []
|
| 236 |
+
|
| 237 |
+
for term in self.CAPITALIZATION_TERMS:
|
| 238 |
+
# Find all case variations
|
| 239 |
+
pattern = re.compile(rf'\b{term}\b', re.IGNORECASE)
|
| 240 |
+
matches = pattern.findall(content)
|
| 241 |
+
|
| 242 |
+
if len(matches) > 1:
|
| 243 |
+
# Check if there are mixed capitalizations
|
| 244 |
+
unique_forms = set(matches)
|
| 245 |
+
if len(unique_forms) > 1:
|
| 246 |
+
forms_str = ', '.join(f"'{f}'" for f in unique_forms)
|
| 247 |
+
results.append(self._create_result(
|
| 248 |
+
passed=False,
|
| 249 |
+
severity=CheckSeverity.INFO,
|
| 250 |
+
message=f"Inconsistent capitalization: {forms_str}",
|
| 251 |
+
suggestion="Use consistent capitalization for technical terms"
|
| 252 |
+
))
|
| 253 |
+
|
| 254 |
+
return results
|
src/checkers/equation_checker.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Equation formatting checker.
|
| 3 |
+
|
| 4 |
+
Validates:
|
| 5 |
+
- Punctuation after equations (based on grammar)
|
| 6 |
+
- Equation numbering consistency
|
| 7 |
+
- Variable definitions
|
| 8 |
+
"""
|
| 9 |
+
import re
|
| 10 |
+
from typing import List, Set
|
| 11 |
+
|
| 12 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class EquationChecker(BaseChecker):
    """Check equation formatting and consistency.

    Runs three checks over raw LaTeX source:
      * trailing punctuation on display equations whose sentence continues,
      * consistency of numbered vs. unnumbered equations,
      * consistency of inline math delimiters ($...$ vs \\(...\\)).
    """

    name = "equation"
    display_name = "Equations"
    description = "Check equation formatting and punctuation"

    # Display-equation environments; starred variants are unnumbered.
    EQUATION_ENVS = [
        'equation', 'align', 'gather', 'multline', 'eqnarray',
        'equation*', 'align*', 'gather*', 'multline*', 'eqnarray*'
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all equation checks.

        Args:
            tex_content: Raw LaTeX source.
            config: Optional checker configuration (currently unused).

        Returns:
            List of CheckResult objects for every issue found.
        """
        results = []

        # Punctuation at the end of display equations.
        results.extend(self._check_equation_punctuation(tex_content))

        # Numbered vs. unnumbered equation consistency.
        results.extend(self._check_numbering_consistency(tex_content))

        # Inline math delimiter consistency ($...$ vs \(...\)).
        results.extend(self._check_inline_math_consistency(tex_content))

        return results

    def _check_equation_punctuation(self, content: str) -> List[CheckResult]:
        """Check if equations end with appropriate punctuation."""
        results = []

        for env in self.EQUATION_ENVS:
            # re.escape handles the '*' in starred environment names.
            env_escaped = re.escape(env)

            # Capture the body of each environment instance.
            pattern = re.compile(
                rf'\\begin\{{{env_escaped}\}}(.*?)\\end\{{{env_escaped}\}}',
                re.DOTALL
            )

            for match in pattern.finditer(content):
                eq_content = match.group(1).strip()

                # Text immediately following the environment.
                after_pos = match.end()
                after_text = content[after_pos:after_pos + 50].strip()

                # Ignore \label{...} when judging how the equation ends.
                eq_content_clean = re.sub(r'\\label\{[^}]+\}', '', eq_content).strip()

                if eq_content_clean and not re.search(r'[.,;]$', eq_content_clean):
                    # A lowercase continuation suggests the sentence goes on,
                    # so the equation likely needs trailing punctuation.
                    if after_text and after_text[0].islower():
                        line_num = self._find_line_number(content, match.end())
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.INFO,
                            message="Equation may need punctuation (sentence continues after)",
                            line_number=line_num,
                            suggestion="Add comma or period inside equation if it ends a clause"
                        ))

        return results

    def _check_numbering_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed numbered and unnumbered equations."""
        results = []

        numbered = 0
        unnumbered = 0

        for env in self.EQUATION_ENVS:
            # BUGFIX: re.escape prevents the trailing '*' in starred env names
            # from acting as a regex quantifier (previously 'equation*' also
            # matched '\begin{equation}', double-counting every equation).
            count = len(re.findall(rf'\\begin\{{{re.escape(env)}\}}', content))
            # BUGFIX: classification now depends only on the environment name.
            # The old condition "or 'nonumber' in content" flipped ALL
            # environments to unnumbered whenever \nonumber appeared anywhere,
            # and then double-counted with the \nonumber tally below.
            if '*' in env:
                unnumbered += count
            else:
                numbered += count

        # \nonumber / \notag suppress numbers on individual equation lines.
        unnumbered += len(re.findall(r'\\nonumber|\\notag', content))

        # Warn only when both styles are used in non-trivial proportion.
        total = numbered + unnumbered
        if total > 3 and numbered > 0 and unnumbered > 0:
            ratio = min(numbered, unnumbered) / total
            if ratio > 0.2:  # minority style exceeds 20% of equations
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Mixed equation numbering: {numbered} numbered, {unnumbered} unnumbered",
                    suggestion="Consider consistent numbering strategy"
                ))

        return results

    def _check_inline_math_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed inline math delimiters."""
        results = []

        # Count the two inline math styles (single $...$ pairs and \(...\)).
        dollar_count = len(re.findall(r'(?<!\$)\$(?!\$)[^$]+\$(?!\$)', content))
        paren_count = len(re.findall(r'\\\(.*?\\\)', content))

        if dollar_count > 0 and paren_count > 0:
            # BUGFIX: removed a stray literal '$' that garbled the reported
            # count (previously rendered as e.g. "Mixed inline math: $12 ...").
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed inline math: {dollar_count} \\$...\\$ and {paren_count} \\(...\\)",
                suggestion="Use consistent inline math delimiters throughout"
            ))

        return results
|
src/checkers/formatting_checker.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Formatting checker.
|
| 3 |
+
|
| 4 |
+
Validates common LaTeX formatting issues:
|
| 5 |
+
- Citation formatting consistency
|
| 6 |
+
- Non-breaking spaces before citations
|
| 7 |
+
- Special character escaping
|
| 8 |
+
- Whitespace issues
|
| 9 |
+
"""
|
| 10 |
+
import re
|
| 11 |
+
from typing import List
|
| 12 |
+
|
| 13 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class FormattingChecker(BaseChecker):
    """Check for common LaTeX formatting issues.

    Covers citation spacing and style consistency, runs of blank lines,
    and unescaped special characters outside math/tabular environments.
    """

    name = "formatting"
    display_name = "Formatting"
    description = "Check citation style, spacing, and special characters"

    # Citation commands recognized when classifying citation style.
    CITE_COMMANDS = ['cite', 'citep', 'citet', 'citealt', 'citealp',
                     'citeauthor', 'citeyear', 'autocite', 'textcite',
                     'parencite', 'footcite']

    # Pattern for citations without non-breaking space.
    # Matches: "word \cite" but not "word~\cite"
    CITE_NO_NBSP_PATTERN = re.compile(r'(\w)\s+(\\cite\w*\{)')

    # Pattern for multiple consecutive spaces.
    MULTI_SPACE_PATTERN = re.compile(r'(?<!\\) +')

    # Patterns for unescaped special characters (outside math mode).
    # NOTE(review): only '&' is currently enforced in _check_special_chars;
    # the rest are kept for future use.
    SPECIAL_CHARS = {
        '%': r'(?<!\\)%',  # Unescaped %
        '&': r'(?<!\\)&(?![a-zA-Z]+;)',  # Unescaped & (not HTML entities)
        '#': r'(?<!\\)#',  # Unescaped #
        '_': r'(?<![\\$])_(?![^$]*\$)',  # Unescaped _ outside math
        '^': r'(?<![\\$])\^(?![^$]*\$)',  # Unescaped ^ outside math
    }

    # Multiple blank lines pattern (3 or more blank lines).
    MULTI_BLANK_PATTERN = re.compile(r'\n\s*\n\s*\n\s*\n')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all formatting checks on the LaTeX source.

        Args:
            tex_content: Raw LaTeX source.
            config: Optional checker configuration (currently unused).

        Returns:
            List of CheckResult objects for every issue found.
        """
        results = []
        lines = tex_content.split('\n')

        # Track citation style usage to detect mixed styles.
        cite_styles = {'parenthetical': 0, 'textual': 0, 'plain': 0}

        for line_num, line in enumerate(lines, 1):
            # Skip commented lines using base class method.
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method.
            line_content = self._remove_line_comment(line)

            # Citations should be preceded by a non-breaking space (~).
            for match in self.CITE_NO_NBSP_PATTERN.finditer(line_content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Citation without non-breaking space",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
                ))

            # Classify each citation command into a style bucket.
            for cmd in self.CITE_COMMANDS:
                if re.search(rf'\\{cmd}\b', line_content):
                    if cmd in ['citep', 'parencite', 'autocite']:
                        cite_styles['parenthetical'] += 1
                    elif cmd in ['citet', 'textcite']:
                        cite_styles['textual'] += 1
                    elif cmd == 'cite':
                        cite_styles['plain'] += 1

        # More than one citation style in use -> flag once for the document.
        styles_used = [s for s, count in cite_styles.items() if count > 0]
        if len(styles_used) > 1:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed citation styles detected: {', '.join(styles_used)}",
                suggestion="Consider using consistent citation style throughout"
            ))

        # Check for multiple blank lines (3 or more).
        for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
            line_num = self._find_line_number(tex_content, match.start())
            # Count how many blank lines the match spans.
            blank_count = match.group(0).count('\n') - 1

            # Build context: the line before, the blank run, the line after.
            start_pos = match.start()
            end_pos = match.end()

            prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
            prev_line = tex_content[prev_line_start:start_pos].rstrip()

            next_line_end = tex_content.find('\n', end_pos)
            if next_line_end == -1:
                next_line_end = len(tex_content)
            next_line = tex_content[end_pos:next_line_end].rstrip()

            # Visual representation with warning markers.
            blank_lines = '\n'.join([f"> blank line ⚠️"] * blank_count)
            line_content = f"{prev_line}\n{blank_lines}\n{next_line}"

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
                line_number=line_num,
                line_content=line_content,
                suggestion="Reduce to single blank line or use \\vspace"
            ))

        # Unescaped special characters.
        results.extend(self._check_special_chars(tex_content, lines))

        return results

    def _check_special_chars(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check for unescaped special characters."""
        results = []

        # Find math environments to skip.
        math_regions = self._find_math_regions(content)

        # Hoisted out of the loop: the '&' pattern is loop-invariant.
        amp_pattern = re.compile(r'(?<!\\)&(?![a-zA-Z]+;)')

        # PERF FIX: maintain a running character offset instead of re-summing
        # all preceding line lengths for every line (was O(n^2) per document).
        offset = 0
        for line_num, line in enumerate(lines, 1):
            line_start = offset
            offset += len(line) + 1  # +1 for the '\n' removed by split()

            # Skip commented lines using base class method.
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method.
            line_content = self._remove_line_comment(line)

            # Check for unescaped & (common error).
            for match in amp_pattern.finditer(line_content):
                pos = line_start + match.start()
                # Skip if in math or inside a tabular-like environment.
                if not self._in_math_region(pos, math_regions):
                    if not self._in_environment(content, pos, ['tabular', 'array', 'align', 'matrix']):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message="Unescaped & outside tabular/math environment",
                            line_number=line_num,
                            line_content=line.strip()[:100],
                            suggestion="Use \\& to escape"
                        ))

        return results

    def _find_math_regions(self, content: str) -> List[tuple]:
        """Find (start, end) character regions that are inside math mode."""
        regions = []

        # Inline math $ ... $
        for match in re.finditer(r'(?<!\\)\$(?!\$)(.*?)(?<!\\)\$', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # Display math $$ ... $$
        for match in re.finditer(r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # \[ ... \]
        for match in re.finditer(r'\\\[(.*?)\\\]', content, re.DOTALL):
            regions.append((match.start(), match.end()))

        # Math environments (starred or not).
        for env in ['equation', 'align', 'gather', 'multline', 'displaymath']:
            pattern = rf'\\begin\{{{env}\*?\}}(.*?)\\end\{{{env}\*?\}}'
            for match in re.finditer(pattern, content, re.DOTALL):
                regions.append((match.start(), match.end()))

        return regions

    def _in_math_region(self, pos: int, regions: List[tuple]) -> bool:
        """Check if position is inside a math region."""
        return any(start <= pos <= end for start, end in regions)

    def _in_environment(self, content: str, pos: int, env_names: List[str]) -> bool:
        """Check if position is inside any of the given environments."""
        for env in env_names:
            # Find all instances of this environment (starred or not).
            pattern = rf'\\begin\{{{env}\*?\}}(.*?)\\end\{{{env}\*?\}}'
            for match in re.finditer(pattern, content, re.DOTALL):
                if match.start() <= pos <= match.end():
                    return True
        return False
|