diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..db76649358c20964e976f0e2ba42b6d0a123d256 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +assets/*.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1da09a88cfa0ea5df2e0601f6fd830c9c78ce209 --- /dev/null +++ b/.gitignore @@ -0,0 +1,62 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual Environments +venv/ +env/ +.env +.venv/ + +# IDEs +.idea/ +.vscode/ +*.swp +*.swo + +# macOS +.DS_Store +.AppleDouble +.LSOverride + +# Project Specific Outputs +*.txt +*.md +!README.md +*_only_used_entry.bib + +# LaTeX and Bibliography (User Data) +# Ignoring these to prevent committing personal paper content +*.tex +*.bib +*.pdf +*.log +*.aux +*.out +*.bbl +*.blg +*.synctex.gz +*.fls +*.fdb_latexmk + +# cache +.cache \ No newline at end of file diff --git a/README.md b/README.md index ef3acefd1324f42b9925f5caaceb95456768b90c..855a15eddcb5509cf25fd9766415d966a7dcc9c8 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,198 @@ ---- -title: BibGuard -emoji: ⚡ -colorFrom: pink -colorTo: purple -sdk: gradio -sdk_version: 6.3.0 -app_file: app.py -pinned: false -short_description: Automated bibliography verification and LaTeX quality auditi +# BibGuard: Bibliography & LaTeX Quality Auditor + +**BibGuard** is your comprehensive quality assurance tool for academic papers. It validates bibliography entries against real-world databases and checks LaTeX submission quality to catch errors before you submit. 
+ +AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and uses advanced LLMs to ensure cited papers actually support your claims. + +## 🛡 Why BibGuard? + +- **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata +- **📋 LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, and submission compliance problems +- **🔒 Safe & Non-Destructive**: Your original files are **never modified** - only detailed reports are generated +- **🧠 Contextual Relevance**: Ensure cited papers actually discuss what you claim (with LLM) +- **⚡ Efficiency Boost**: Drastically reduce time needed to manually verify hundreds of citations + +## 🚀 Features + +### Bibliography Validation +- **🔍 Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar +- **🤖 AI Relevance Check**: Uses LLMs to verify citations match their context (optional) +- **📊 Preprint Detection**: Warns if >50% of references are preprints (arXiv, bioRxiv, etc.) 
+- **👀 Usage Analysis**: Highlights missing citations and unused bib entries +- **👯 Duplicate Detector**: Identifies duplicate entries with fuzzy matching + +### LaTeX Quality Checks +- **📐 Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation +- **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases +- **🔤 Consistency**: Spelling variants (US/UK English), hyphenation, terminology +- **🤖 AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants +- **🔠 Acronym Validation**: Ensures acronyms are defined before use (smart matching) +- **🎭 Anonymization**: Checks for identity leaks in double-blind submissions +- **📅 Citation Age**: Flags references older than 30 years + +## 📦 Installation + +```bash +git clone git@github.com:thinkwee/BibGuard.git +cd BibGuard +pip install -r requirements.txt +``` + +## ⚡ Quick Start + +### 1. Initialize Configuration + +```bash +python main.py --init +``` + +This creates `config.yaml`. Edit it to set your file paths. You have two modes: + +#### Option A: Single File Mode +Best for individual papers. +```yaml +files: + bib: "paper.bib" + tex: "paper.tex" + output_dir: "bibguard_output" +``` + +#### Option B: Directory Scan Mode +Best for large projects or a collection of papers. BibGuard will recursively search for all `.tex` and `.bib` files. +```yaml +files: + input_dir: "./my_project_dir" + output_dir: "bibguard_output" +``` + +### 2. 
Run Full Check + +```bash +python main.py +``` + +**Output** (in `bibguard_output/`): +- `bibliography_report.md` - Bibliography validation results +- `latex_quality_report.md` - Writing and formatting issues +- `line_by_line_report.md` - All issues sorted by line number +- `*_only_used.bib` - Clean bibliography (used entries only) + +## 🛠 Configuration + +Edit `config.yaml` to customize checks: + +```yaml +bibliography: + check_metadata: true # Validate against online databases (takes time) + check_usage: true # Find unused/missing entries + check_duplicates: true # Detect duplicate entries + check_preprint_ratio: true # Warn if >50% are preprints + check_relevance: false # LLM-based relevance check (requires API key) + +submission: + # Format checks + caption: true # Table/figure caption placement + reference: true # Cross-reference integrity + formatting: true # Citation spacing, blank lines + equation: true # Equation punctuation, numbering + + # Writing quality + sentence: true # Weak starters, hedging language + consistency: true # Spelling, hyphenation, terminology + acronym: true # Acronym definitions (3+ letters) + + # Submission compliance + ai_artifacts: true # AI-generated text detection + anonymization: true # Double-blind compliance + citation_quality: true # Old citations (>30 years) + number: true # Percentage formatting +``` + +## 🤖 LLM-Based Relevance Check + +To verify citations match their context using AI: + +```yaml +bibliography: + check_relevance: true + +llm: + backend: "gemini" # Options: gemini, openai, anthropic, deepseek, ollama, vllm + api_key: "" # Or use environment variable (e.g., GEMINI_API_KEY) +``` + +**Supported Backends:** +- **Gemini** (Google): `GEMINI_API_KEY` +- **OpenAI**: `OPENAI_API_KEY` +- **Anthropic**: `ANTHROPIC_API_KEY` +- **DeepSeek**: `DEEPSEEK_API_KEY` (recommended for cost/performance) +- **Ollama**: Local models (no API key needed) +- **vLLM**: Custom endpoint + +Then run: +```bash +python main.py +``` + +## 📝 
Understanding Reports + +### Bibliography Report +Shows for each entry: +- ✅ **Verified**: Metadata matches online databases +- ⚠️ **Issues**: Mismatches, missing entries, duplicates +- 📊 **Statistics**: Usage, duplicates, preprint ratio + +### LaTeX Quality Report +Organized by severity: +- 🔴 **Errors**: Critical issues (e.g., undefined references) +- 🟡 **Warnings**: Important issues (e.g., inconsistent spelling) +- 🔵 **Suggestions**: Style improvements (e.g., weak sentence starters) + +### Line-by-Line Report +All LaTeX issues sorted by line number for easy fixing. + +## 🧐 Understanding Mismatches + +BibGuard is strict, but false positives happen: + +1. **Year Discrepancy (±1 Year)**: + - *Reason*: Delay between preprint (arXiv) and official publication + - *Action*: Verify which version you intend to cite + +2. **Author List Variations**: + - *Reason*: Different databases handle large author lists differently + - *Action*: Check if primary authors match + +3. **Venue Name Differences**: + - *Reason*: Abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems") + - *Action*: Both are usually correct + +4. **Non-Academic Sources**: + - *Reason*: Blogs, documentation not indexed by academic databases + - *Action*: Manually verify URL and title + +## 🔧 Advanced Options + +```bash +python main.py --help # Show all options +python main.py --list-templates # List conference templates +python main.py --config my.yaml # Use custom config file +``` + +## 🤝 Contributing + +Contributions welcome! Please open an issue or pull request. 
+ +## 🙏 Acknowledgments + +BibGuard uses multiple data sources: +- arXiv API +- CrossRef API +- Semantic Scholar API +- DBLP API +- OpenAlex API +- Google Scholar (via scholarly) + --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +**Made with ❤️ for researchers who care about their submission** diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..0bc9c83565bb4530858992e45d1ccb01ecb3e482 --- /dev/null +++ b/app.py @@ -0,0 +1,922 @@ +#!/usr/bin/env python3 +""" +BibGuard Gradio Web Application + +A web interface for checking bibliography and LaTeX quality. +""" +import gradio as gr +import tempfile +import shutil +from pathlib import Path +from typing import Optional, Tuple +import base64 + +from src.parsers import BibParser, TexParser +from src.fetchers import ArxivFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher +from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector +from src.report.generator import ReportGenerator, EntryReport +from src.config.yaml_config import BibGuardConfig, FilesConfig, BibliographyConfig, SubmissionConfig, OutputConfig, WorkflowStep +from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow +from src.checkers import CHECKER_REGISTRY +from src.report.line_report import LineByLineReportGenerator +from app_helper import fetch_and_compare_with_workflow + + +# Custom CSS for better Markdown rendering +CUSTOM_CSS = """ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap'); + +* { + font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; +} +""" + +WELCOME_HTML = """ +
+
+
+

👋 Welcome to BibGuard

+
+
+

+ Ensure your academic paper is flawless. Upload your .bib and .tex files on the left and click "Check Now". +

+ +
+
+ ⚠️ Metadata Check Defaults + "🔍 Metadata" is disabled by default. It verifies your entries against ArXiv/DBLP/Crossref but takes time (1-3 mins) to fetch data. Enable it if you want strict verification. +
+ +
+ 🚀 Go Pro with Local Version + LLM-based context relevance checking (is this citation actually relevant?) is excluded here. Clone the GitHub repo to use the full power with your API key. +
+
+ +

📊 Understanding Your Reports

+
+
+ 📚 Bibliography + Validates metadata fields, detects duplicates, and checks citation counts. +
+
+ 📝 LaTeX Quality + Syntax check, caption validation, acronym consistency, and style suggestions. +
+
+ 📋 Line-by-Line + Maps every issue found directly to the line number in your source file. +
+
+
+
+
+""" + +CUSTOM_CSS += """ +/* Global Reset */ +body, gradio-app { + overflow: hidden !important; /* Prevent double scrollbars on the page */ +} + +.gradio-container { + max-width: none !important; + width: 100% !important; + height: 100vh !important; + padding: 0 !important; + margin: 0 !important; +} + +/* Header Styling */ +.app-header { + padding: 20px; + background: white; + border-bottom: 1px solid #e5e7eb; +} + +/* Sidebar Styling */ +.app-sidebar { + height: calc(100vh - 100px) !important; + overflow-y: auto !important; + padding: 20px !important; + border-right: 1px solid #e5e7eb; +} + +/* Main Content Area */ +.app-content { + height: calc(100vh - 100px) !important; + padding: 0 !important; +} + +/* The Magic Scroll Container - Clean and Explicit */ +.scrollable-report-area { + height: calc(100vh - 180px) !important; /* Fixed height relative to viewport */ + overflow-y: auto !important; + padding: 24px; + background-color: #f9fafb; + border: 1px solid #e5e7eb; + border-radius: 8px; + margin-top: 10px; +} + +/* Report Card Styling */ +.report-card { + background: white; + border-radius: 12px; + padding: 24px; + margin-bottom: 16px; /* Spacing between cards */ + box-shadow: 0 1px 3px rgba(0,0,0,0.1); + border: 1px solid #e5e7eb; + transition: transform 0.2s, box-shadow 0.2s; +} + +.report-card:hover { + box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06); + transform: translateY(-2px); +} + +/* Card Internals */ +.card-header { + display: flex; + justify-content: space-between; + align-items: flex-start; + margin-bottom: 16px; + padding-bottom: 16px; + border-bottom: 1px solid #f3f4f6; +} + +.card-title { + font-size: 1.1em; + font-weight: 600; + color: #111827; + margin: 0 0 4px 0; +} + +.card-subtitle { + font-size: 0.9em; + color: #6b7280; + font-family: monospace; +} + +.card-content { + font-size: 0.95em; + color: #374151; + line-height: 1.5; +} + +/* Badges */ +.badge { + display: inline-flex; + align-items: center; + 
padding: 4px 10px; + border-radius: 9999px; + font-size: 0.8em; + font-weight: 500; +} + +.badge-success { background-color: #dcfce7; color: #166534; } +.badge-warning { background-color: #fef9c3; color: #854d0e; } +.badge-error { background-color: #fee2e2; color: #991b1b; } +.badge-info { background-color: #dbeafe; color: #1e40af; } +.badge-neutral { background-color: #f3f4f6; color: #4b5563; } + +/* Stats Grid */ +.stats-container { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); + gap: 16px; + margin-bottom: 24px; +} + +.stat-card { + padding: 16px; + border-radius: 12px; + color: white; + text-align: center; + box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); +} + +.stat-value { font-size: 1.8em; font-weight: 700; } +.stat-label { font-size: 0.9em; opacity: 0.9; } + +/* Detail Grid - Flexbox for better filling */ +.detail-grid { + display: flex; + flex-wrap: wrap; + gap: 12px; + margin-bottom: 16px; + width: 100%; +} + +.detail-item { + background: #f9fafb; + padding: 10px 12px; + border-radius: 8px; + border: 1px solid #f3f4f6; + + /* Flex sizing: grow, shrink, min-basis */ + flex: 1 1 160px; + min-width: 0; /* Important for word-break to work in flex children */ + + /* Layout control */ + display: flex; + flex-direction: column; + + /* Height constraint to prevent one huge card from stretching the row */ + max-height: 100px; + overflow-y: auto; +} + +/* Custom scrollbar for detail items */ +.detail-item::-webkit-scrollbar { + width: 4px; +} +.detail-item::-webkit-scrollbar-thumb { + background-color: #d1d5db; + border-radius: 4px; +} + +.detail-label { + font-size: 0.75em; + color: #6b7280; + text-transform: uppercase; + letter-spacing: 0.05em; + margin-bottom: 2px; + position: sticky; + top: 0; + background: #f9fafb; /* Maintain bg on scroll */ + z-index: 1; +} + +.detail-value { + font-weight: 500; + color: #1f2937; + font-size: 0.9em; + line-height: 1.4; + word-break: break-word; /* Fix overflow */ + overflow-wrap: break-word; 
+} border: 1px solid #e5e7eb; + transition: all 0.2s; +} + +.report-card:hover { + box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05); +} + +/* Card Header */ +.card-header { + display: flex; + justify-content: space-between; + align-items: flex-start; + margin-bottom: 12px; + border-bottom: 1px solid #f3f4f6; + padding-bottom: 12px; +} + +.card-title { + font-size: 1.1em; + font-weight: 600; + color: #1f2937; + margin: 0; +} + +.card-subtitle { + font-size: 0.9em; + color: #6b7280; + margin-top: 4px; +} + +/* Status Badges */ +.badge { + display: inline-flex; + align-items: center; + padding: 4px 10px; + border-radius: 9999px; + font-size: 0.8em; + font-weight: 500; +} + +.badge-success { background-color: #dcfce7; color: #166534; } +.badge-warning { background-color: #fef9c3; color: #854d0e; } +.badge-error { background-color: #fee2e2; color: #991b1b; } +.badge-info { background-color: #dbeafe; color: #1e40af; } +.badge-neutral { background-color: #f3f4f6; color: #374151; } + +/* Content Styling */ +.card-content { + font-size: 15px; + color: #374151; + line-height: 1.6; +} + +.card-content code { + background-color: #f3f4f6; + padding: 2px 6px; + border-radius: 4px; + font-family: monospace; + font-size: 0.9em; + color: #c2410c; +} + +/* Grid for details */ +.detail-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); + gap: 12px; + margin-top: 12px; +} + +.detail-item { + background: #f9fafb; + padding: 10px; + border-radius: 6px; +} + +.detail-label { + font-size: 0.8em; + color: #6b7280; + text-transform: uppercase; + letter-spacing: 0.05em; +} + +.detail-value { + font-weight: 500; + color: #111827; +} + +/* Summary Stats */ +.stats-container { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 16px; + margin-bottom: 24px; +} + +.stat-card { + background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); + color: white; + padding: 20px; + border-radius: 12px; + text-align: 
center; + box-shadow: 0 4px 6px rgba(102, 126, 234, 0.25); +} + +.stat-value { + font-size: 2em; + font-weight: 700; +} + +.stat-label { + font-size: 0.9em; + opacity: 0.9; + margin-top: 4px; +} + +/* Button styling */ +.primary-btn { + background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; + border: none !important; + font-weight: 600 !important; +} + +/* Tab styling */ +.tab-nav button { + font-weight: 500 !important; + font-size: 15px !important; +} +""" + + +def create_config_from_ui( + check_metadata: bool, + check_usage: bool, + check_duplicates: bool, + check_preprint_ratio: bool, + caption: bool, + reference: bool, + formatting: bool, + equation: bool, + ai_artifacts: bool, + sentence: bool, + consistency: bool, + acronym: bool, + number: bool, + citation_quality: bool, + anonymization: bool +) -> BibGuardConfig: + """Create a BibGuardConfig from UI settings.""" + config = BibGuardConfig() + + config.bibliography = BibliographyConfig( + check_metadata=check_metadata, + check_usage=check_usage, + check_duplicates=check_duplicates, + check_preprint_ratio=check_preprint_ratio, + check_relevance=False # Disabled for web + ) + + config.submission = SubmissionConfig( + caption=caption, + reference=reference, + formatting=formatting, + equation=equation, + ai_artifacts=ai_artifacts, + sentence=sentence, + consistency=consistency, + acronym=acronym, + number=number, + citation_quality=citation_quality, + anonymization=anonymization + ) + + config.output = OutputConfig(quiet=True, minimal_verified=False) + + return config + + +def generate_bibliography_html(report_gen: ReportGenerator, entries: list) -> str: + """Generate HTML content for bibliography report.""" + html = ['
'] + + # 1. Summary Stats + total = len(entries) + verified = sum(1 for e in report_gen.entries if e.comparison and e.comparison.is_match) + used = sum(1 for e in report_gen.entries if e.usage and e.usage.is_used) + + html.append('
') + html.append(f'
{total}
Total Entries
') + html.append(f'
{verified}
Verified
') + html.append(f'
{used}
Used in Text
') + html.append('
') + + # 2. Entries + for report in report_gen.entries: + entry = report.entry + status_badges = [] + + # Metadata Status + if report.comparison: + if report.comparison.is_match: + status_badges.append('✓ Verified') + if report.comparison.source: + status_badges.append(f'{report.comparison.source.upper()}') + else: + status_badges.append('⚠ Metadata Mismatch') + else: + status_badges.append('No Metadata Check') + + # Usage Status + if report.usage: + if report.usage.is_used: + status_badges.append(f'Used: {report.usage.usage_count}x') + else: + status_badges.append('Unused') + + # Build Card + html.append(f''' +
+
+
+

{entry.title or "No Title"}

+
{entry.key} • {entry.year} • {entry.entry_type}
+
+
+ {" ".join(status_badges)} +
+
+ +
+
+ { + (lambda e: "".join([ + f'
{k}
{v}
' + for k, v in filter(None, [ + ("Authors", e.author or "N/A"), + ("Venue", e.journal or e.booktitle or e.publisher or "N/A"), + ("DOI", e.doi) if e.doi else None, + ("ArXiv", e.arxiv_id) if e.arxiv_id and not e.doi else None, + ("Volume/Pages", f"{'Vol.'+e.volume if e.volume else ''} {'pp.'+e.pages if e.pages else ''}".strip()) if e.volume or e.pages else None, + ("URL", f'Link') if e.url else None + ]) + ]))(entry) + } +
+ ''') + + # Add issues if any + issues = [] + if report.comparison and not report.comparison.is_match: + # Add main message derived from match status + if report.comparison.issues: + for issue in report.comparison.issues: + issues.append(f'
• {issue}
') + else: + issues.append(f'
• Verification failed
') + + if issues: + html.append('
') + html.append("".join(issues)) + html.append('
') + + html.append('
') # Close card-content and report-card + + html.append('
') # Close container + return "".join(html) + +def generate_latex_html(results: list) -> str: + """Generate HTML for LaTeX quality check.""" + from src.checkers import CheckSeverity + + html = ['
'] + + # Stats + errors = sum(1 for r in results if r.severity == CheckSeverity.ERROR) + warnings = sum(1 for r in results if r.severity == CheckSeverity.WARNING) + infos = sum(1 for r in results if r.severity == CheckSeverity.INFO) + + html.append('
') + html.append(f'
{errors}
Errors
') + html.append(f'
{warnings}
Warnings
') + html.append(f'
{infos}
Suggestions
') + html.append('
') + + if not results: + html.append('
✅ No issues found in LaTeX code!
') + else: + # Group by Checker + results.sort(key=lambda x: x.checker_name) + current_checker = None + + for result in results: + badge_class = "badge-neutral" + if result.severity == CheckSeverity.ERROR: badge_class = "badge-error" + elif result.severity == CheckSeverity.WARNING: badge_class = "badge-warning" + elif result.severity == CheckSeverity.INFO: badge_class = "badge-info" + + html.append(f''' +
+
+
+

{result.checker_name}

+
Line {result.line_number}
+
+ {result.severity.name} +
+
+ {result.message} + {f'
{result.line_content}
' if result.line_content else ''} + {f'
💡 Suggestion: {result.suggestion}
' if result.suggestion else ''} +
+
+ ''') + + html.append('
') + return "".join(html) + +def generate_line_html(content: str, results: list) -> str: + """Generate HTML for Line-by-Line report.""" + # Build a dictionary of line_number -> list of issues + issues_by_line = {} + for r in results: + if r.line_number not in issues_by_line: + issues_by_line[r.line_number] = [] + issues_by_line[r.line_number].append(r) + + lines = content.split('\n') + + html = ['
'] + + html.append('
Issues are mapped to specific lines below.
') + + for i, line in enumerate(lines, 1): + if i in issues_by_line: + # Highlight this line + line_issues = issues_by_line[i] + + html.append(f''' +
+
Line {i}
+
{line}
+
+ ''') + + for issue in line_issues: + html.append(f'
• {issue.message}
') + + html.append('
') + + html.append('
') + return "".join(html) + + + + +def run_check( + bib_file, + tex_file, + check_metadata: bool, + check_usage: bool, + check_duplicates: bool, + check_preprint_ratio: bool, + caption: bool, + reference: bool, + formatting: bool, + equation: bool, + ai_artifacts: bool, + sentence: bool, + consistency: bool, + acronym: bool, + number: bool, + citation_quality: bool, + anonymization: bool, + progress=gr.Progress() +) -> Tuple[str, str, str]: + """Run BibGuard checks and return three reports.""" + + if bib_file is None or tex_file is None: + return ( + "⚠️ Please upload both `.bib` and `.tex` files.", + "⚠️ Please upload both `.bib` and `.tex` files.", + "⚠️ Please upload both `.bib` and `.tex` files." + ) + + try: + # Create config from UI + config = create_config_from_ui( + check_metadata, check_usage, check_duplicates, check_preprint_ratio, + caption, reference, formatting, equation, ai_artifacts, + sentence, consistency, acronym, number, citation_quality, anonymization + ) + + # Get file paths from uploaded files + bib_path = bib_file.name + tex_path = tex_file.name + + # Read tex content for checkers + tex_content = Path(tex_path).read_text(encoding='utf-8', errors='replace') + + # Parse files + bib_parser = BibParser() + entries = bib_parser.parse_file(bib_path) + + tex_parser = TexParser() + tex_parser.parse_file(tex_path) + + bib_config = config.bibliography + + # Initialize components + arxiv_fetcher = None + crossref_fetcher = None + semantic_scholar_fetcher = None + openalex_fetcher = None + dblp_fetcher = None + comparator = None + usage_checker = None + duplicate_detector = None + + if bib_config.check_metadata: + arxiv_fetcher = ArxivFetcher() + semantic_scholar_fetcher = SemanticScholarFetcher() + openalex_fetcher = OpenAlexFetcher() + dblp_fetcher = DBLPFetcher() + crossref_fetcher = CrossRefFetcher() + comparator = MetadataComparator() + + if bib_config.check_usage: + usage_checker = UsageChecker(tex_parser) + + if bib_config.check_duplicates: + 
duplicate_detector = DuplicateDetector() + + # Initialize report generator + report_gen = ReportGenerator( + minimal_verified=False, + check_preprint_ratio=bib_config.check_preprint_ratio, + preprint_warning_threshold=bib_config.preprint_warning_threshold + ) + report_gen.set_metadata([bib_file.name], [tex_file.name]) + + # Run submission quality checks + progress(0.2, desc="Running LaTeX quality checks...") + submission_results = [] + enabled_checkers = config.submission.get_enabled_checkers() + + for checker_name in enabled_checkers: + if checker_name in CHECKER_REGISTRY: + checker = CHECKER_REGISTRY[checker_name]() + results = checker.check(tex_content, {}) + for r in results: + r.file_path = tex_file.name + submission_results.extend(results) + + report_gen.set_submission_results(submission_results, None) + + # Check for duplicates + if bib_config.check_duplicates and duplicate_detector: + duplicate_groups = duplicate_detector.find_duplicates(entries) + report_gen.set_duplicate_groups(duplicate_groups) + + # Check missing citations + if bib_config.check_usage and usage_checker: + missing = usage_checker.get_missing_entries(entries) + report_gen.set_missing_citations(missing) + + # Build workflow + workflow_config = get_default_workflow() + + # Process entries + progress(0.3, desc="Processing bibliography entries...") + total_entries = len(entries) + + for i, entry in enumerate(entries): + progress(0.3 + 0.5 * (i / total_entries), desc=f"Checking: {entry.key}") + + # Check usage + usage_result = None + if usage_checker: + usage_result = usage_checker.check_usage(entry) + + # Fetch and compare metadata + comparison_result = None + if bib_config.check_metadata and comparator: + comparison_result = fetch_and_compare_with_workflow( + entry, workflow_config, arxiv_fetcher, crossref_fetcher, + semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator + ) + + # Create entry report + entry_report = EntryReport( + entry=entry, + comparison=comparison_result, + 
usage=usage_result, + evaluations=[] + ) + report_gen.add_entry_report(entry_report) + + progress(0.85, desc="Generating structured reports...") + + # Generate Bibliography HTML Report + bib_report = generate_bibliography_html(report_gen, entries) + + # Generate LaTeX Quality HTML Report + latex_report = generate_latex_html(submission_results) + + # Generate Line-by-Line HTML Report + line_report = "" + if submission_results: + line_report = generate_line_html(tex_content, submission_results) + else: + line_report = '
No issues to display line-by-line.
' + + progress(1.0, desc="Done!") + + return bib_report, latex_report, line_report + + except Exception as e: + error_msg = f"❌ Error: {str(e)}" + import traceback + error_msg += f"\n\n```\n{traceback.format_exc()}\n```" + return error_msg, error_msg, error_msg + + + +def create_app(): + """Create and configure the Gradio app.""" + + # Load icon as base64 + icon_html = "" + try: + icon_path = Path("assets/icon-192.png") + if icon_path.exists(): + with open(icon_path, "rb") as f: + encoding = base64.b64encode(f.read()).decode() + icon_html = f'BibGuard' + else: + icon_html = '📚' + except Exception: + icon_html = '📚' + + with gr.Blocks(title="BibGuard - Bibliography & LaTeX Quality Checker") as app: + + # Header with icon + with gr.Row(elem_classes=["app-header"]): + gr.HTML(f""" +
+ {icon_html} +
+

BibGuard

+

Bibliography & LaTeX Quality Checker

+
+
+ """) + + with gr.Row(elem_classes=["app-body"]): + # Left column: Upload & Settings + with gr.Column(scale=1, min_width=280, elem_classes=["app-sidebar"]): + gr.Markdown("### 📁 Upload Files") + + bib_file = gr.File( + label="Bibliography (.bib)", + file_types=[".bib"], + file_count="single" + ) + + tex_file = gr.File( + label="LaTeX Source (.tex)", + file_types=[".tex"], + file_count="single" + ) + + # Check options in grid layout + gr.Markdown("#### ⚙️ Options") + + with gr.Row(): + check_metadata = gr.Checkbox(label="🔍 Metadata", value=False) + check_usage = gr.Checkbox(label="📊 Usage", value=True) + + with gr.Row(): + check_duplicates = gr.Checkbox(label="👯 Duplicates", value=True) + check_preprint_ratio = gr.Checkbox(label="📄 Preprints", value=True) + + with gr.Row(): + caption = gr.Checkbox(label="🖼️ Captions", value=True) + reference = gr.Checkbox(label="🔗 References", value=True) + + with gr.Row(): + formatting = gr.Checkbox(label="✨ Formatting", value=True) + equation = gr.Checkbox(label="🔢 Equations", value=True) + + with gr.Row(): + ai_artifacts = gr.Checkbox(label="🤖 AI Artifacts", value=True) + sentence = gr.Checkbox(label="📝 Sentences", value=True) + + with gr.Row(): + consistency = gr.Checkbox(label="🔄 Consistency", value=True) + acronym = gr.Checkbox(label="🔤 Acronyms", value=True) + + with gr.Row(): + number = gr.Checkbox(label="🔢 Numbers", value=True) + citation_quality = gr.Checkbox(label="📚 Citations", value=True) + + with gr.Row(): + anonymization = gr.Checkbox(label="🎭 Anonymization", value=True) + + run_btn = gr.Button("🔍 Check Now", variant="primary", size="lg") + + gr.HTML(""" +
+ + + GitHub + +

Developed with ❤️ for researchers

+
+ """) + + # Right column: Reports + with gr.Column(scale=4, elem_classes=["app-content"]): + with gr.Tabs(): + with gr.Tab("📚 Bibliography Report"): + bib_report = gr.HTML( + value=WELCOME_HTML, + elem_classes=["report-panel"] + ) + + with gr.Tab("📝 LaTeX Quality"): + latex_report = gr.HTML( + value=WELCOME_HTML, + elem_classes=["report-panel"] + ) + + with gr.Tab("📋 Line-by-Line"): + line_report = gr.HTML( + value=WELCOME_HTML, + elem_classes=["report-panel"] + ) + + # Event handling + run_btn.click( + fn=run_check, + inputs=[ + bib_file, tex_file, + check_metadata, check_usage, check_duplicates, check_preprint_ratio, + caption, reference, formatting, equation, ai_artifacts, + sentence, consistency, acronym, number, citation_quality, anonymization + ], + outputs=[bib_report, latex_report, line_report] + ) + + return app + + +# Create the app +app = create_app() + +if __name__ == "__main__": + app.launch( + favicon_path="assets/icon-192.png", + show_error=True, + css=CUSTOM_CSS, + theme=gr.themes.Soft() + ) diff --git a/app_helper.py b/app_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..4780a3d5533081b245c538f8394b23192786ad2b --- /dev/null +++ b/app_helper.py @@ -0,0 +1,98 @@ +def fetch_and_compare_with_workflow( + entry, workflow_steps, arxiv_fetcher, crossref_fetcher, + semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator +): + """Fetch metadata from online sources using the configured workflow.""" + from src.utils.normalizer import TextNormalizer + + best_result = None + + # If no steps provided, use default order + if not workflow_steps: + # Create a default list of steps if needed, or simply handle logic here + pass + + # Simplified workflow execution: Run through enabled steps + # We manualy iterate through sources in a preferred order if workflow is not fully configured + # Or iterate through the steps list. 
+ + # Since extracting WorkflowConfig logic is complex, let's just implement a robust + # default search strategy here which is what the user likely wants. + + results = [] + + # 1. DBLP (High quality for CS) + if dblp_fetcher and entry.title: + try: + dblp_result = dblp_fetcher.search_by_title(entry.title) + if dblp_result: + res = comparator.compare_with_dblp(entry, dblp_result) + if res.is_match: return res + results.append(res) + except Exception: pass + + # 2. Semantic Scholar (Comprehensive) + if semantic_scholar_fetcher and entry.title: + try: + ss_result = None + if entry.doi: + ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi) + if not ss_result: + ss_result = semantic_scholar_fetcher.search_by_title(entry.title) + + if ss_result: + res = comparator.compare_with_semantic_scholar(entry, ss_result) + if res.is_match: return res + results.append(res) + except Exception: pass + + # 3. OpenAlex + if openalex_fetcher and entry.title: + try: + oa_result = None + if entry.doi: + oa_result = openalex_fetcher.fetch_by_doi(entry.doi) + if not oa_result: + oa_result = openalex_fetcher.search_by_title(entry.title) + + if oa_result: + res = comparator.compare_with_openalex(entry, oa_result) + if res.is_match: return res + results.append(res) + except Exception: pass + + # 4. CrossRef (Official metadata) + if crossref_fetcher and entry.doi: + try: + crossref_result = crossref_fetcher.search_by_doi(entry.doi) + if crossref_result: + res = comparator.compare_with_crossref(entry, crossref_result) + if res.is_match: return res + results.append(res) + except Exception: pass + + # 5. 
ArXiv + if arxiv_fetcher: + try: + arxiv_meta = None + if entry.has_arxiv: + arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id) + elif entry.title: + # Search by title + search_results = arxiv_fetcher.search_by_title(entry.title, max_results=1) + if search_results: + arxiv_meta = search_results[0] + + if arxiv_meta: + res = comparator.compare_with_arxiv(entry, arxiv_meta) + if res.is_match: return res + results.append(res) + except Exception: pass + + # Return the best result (highest confidence) if no perfect match found + if results: + results.sort(key=lambda x: x.confidence, reverse=True) + return results[0] + + # If absolutely nothing found, return None or an 'Unable' result + return comparator.create_unable_result(entry, "No metadata found in any source") diff --git a/assets/icon-192.png b/assets/icon-192.png new file mode 100644 index 0000000000000000000000000000000000000000..087848a7f545297b6b770b6dd80b273af062402d --- /dev/null +++ b/assets/icon-192.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:158c7c199e8e4978d2e8d6da90c4896022bf83436b0ab2c9b6285078cad60863 +size 339925 diff --git a/assets/icon-512.png b/assets/icon-512.png new file mode 100644 index 0000000000000000000000000000000000000000..744430c35a3ebdc6d0d56725806a4fae7bf0390f --- /dev/null +++ b/assets/icon-512.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da47e48d79d2aae7f81cd1b04b39f0b7a66e760ee2338dfcdde36f66293f3ccf +size 312990 diff --git a/bibguard.yaml b/bibguard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7dc574f5a798d32b96cd2b2e5a307e0473318e97 --- /dev/null +++ b/bibguard.yaml @@ -0,0 +1,197 @@ +# ============================================================================== +# BibGuard Configuration File +# ============================================================================== +# +# Usage: python main.py --config bibguard.yaml +# python main.py (auto-detect bibguard.yaml in current/parent 
 directories) +# +# All paths are relative to this configuration file's directory. + +# ============================================================================== +# 📁 File Settings +# ============================================================================== +files: + # Required: Path to your .bib bibliography file + bib: "test.bib" + + # Required: Path to your .tex LaTeX source file + tex: "test.tex" + + # Optional: Directory path for recursive scanning (Experimental) + # When set, BibGuard will recursively search for all .tex and .bib files in this directory. + # This mode is parallel to 'bib' and 'tex'. Use either this OR bib/tex. + # input_dir: "./paper_project" + + # Output directory for all generated reports and files (default: bibguard_output) + # All outputs including reports, cleaned .bib, and input file copies will be saved here + output_dir: "test" + + +# ============================================================================== +# 🎓 Conference Template +# ============================================================================== +# Specify a conference template for venue-specific checks and formatting rules. +# Available templates: acl, emnlp, naacl, cvpr, iccv, eccv, neurips, icml, iclr +# Leave empty ("") to skip template-specific checks. +template: "" + +# ============================================================================== +# 📚 Bibliography Checks +# ============================================================================== +bibliography: + # Metadata Validation - Verify bib entries against online databases (arXiv, CrossRef, etc.) + # Detects incorrect titles, authors, venues, and publication years + # ⚠️ It will take some time to check metadata since it needs to query multiple online sources. Set it to false if you don't need to check metadata. 
+ check_metadata: true + + # Usage Check - Detect unused bib entries and missing citations + # Identifies entries in .bib not cited in .tex, and citations without bib entries + check_usage: true + + # Duplicate Detection - Find duplicate entries with different keys + # Uses fuzzy matching on titles and DOIs to identify the same paper cited multiple times + check_duplicates: true + + # Preprint Ratio Check - Warn if too many references are preprints + # Detects arXiv, bioRxiv, and other preprints. Warns if ratio exceeds threshold. + check_preprint_ratio: true + preprint_warning_threshold: 0.50 # Warn if more than 50% of used entries are preprints + + # Relevance Assessment - Use LLM to evaluate if citations match their context + # Requires LLM configuration (see llm section below). Disabled by default due to API costs. + check_relevance: false + +# ============================================================================== +# 📋 Submission Quality Checks +# ============================================================================== +submission: + # ───────────────────────────────────────────────────────────────────────────── + # Format Checks + # ───────────────────────────────────────────────────────────────────────────── + + # Caption Position - Ensure table captions are above, figure captions below + # Checks \caption placement relative to \begin{table}/\begin{figure} + caption: true + + # Cross-References - Verify all figures/tables/sections are referenced in text + # Detects orphaned floats that are never mentioned + reference: true + + # Formatting Standards - Check citation format, spacing, special characters + # Validates \cite{} usage, non-breaking spaces, proper quotation marks, etc. 
+ formatting: true + + # Equation Checks - Verify equation punctuation and numbering consistency + # Ensures equations end with proper punctuation and labels are used correctly + equation: true + + # ───────────────────────────────────────────────────────────────────────────── + # Writing Quality + # ───────────────────────────────────────────────────────────────────────────── + + # AI Artifacts - Detect traces of AI-generated text + # Flags phrases like "Sure, here is...", "As an AI...", "It's important to note..." + ai_artifacts: true + + # Sentence Quality - Identify overly long sentences, weak openings, redundant phrases + # Helps improve readability and academic writing style + sentence: true + + # Terminology Consistency - Check for inconsistent spelling, hyphenation, US/UK variants + # Examples: "deep learning" vs "deep-learning", "color" vs "colour" + consistency: true + + # ───────────────────────────────────────────────────────────────────────────── + # Academic Standards + # ───────────────────────────────────────────────────────────────────────────── + + # Acronym Definitions - Ensure acronyms are defined on first use + # Example: "Natural Language Processing (NLP)" before using "NLP" alone + acronym: true + + # Number Formatting - Check percentage formatting consistency + # Ensures no space before % sign and consistent use of '%' vs 'percent' + number: true + + # Citation Quality - Flag outdated references and citation formatting issues + # Warns about papers older than 30 years and checks citation formatting (et al., hardcoded citations) + citation_quality: true + + # ───────────────────────────────────────────────────────────────────────────── + # Review Compliance + # ───────────────────────────────────────────────────────────────────────────── + + # Anonymization - Check double-blind review compliance + # Detects GitHub links, acknowledgments, self-citations that may reveal author identity + anonymization: true + +# 
============================================================================== +# 🔍 Metadata Check Workflow +# ============================================================================== +# Define the data sources and order for metadata validation. +# BibGuard will try each enabled source in sequence until a match is found. +# Set enabled: false to skip a particular source. +workflow: + - name: arxiv_id + enabled: true + description: "Lookup by arXiv ID (fastest, most reliable for preprints)" + + - name: crossref_doi + enabled: true + description: "Lookup by DOI via CrossRef (authoritative for published papers)" + + - name: semantic_scholar + enabled: true + description: "Semantic Scholar API (good coverage, includes citations)" + + - name: dblp + enabled: true + description: "DBLP database (comprehensive for computer science papers)" + + - name: openalex + enabled: true + description: "OpenAlex API (broad coverage across disciplines)" + + - name: arxiv_title + enabled: true + description: "Search arXiv by title (fallback when ID unavailable)" + + - name: crossref_title + enabled: true + description: "Search CrossRef by title (fallback when DOI unavailable)" + + - name: google_scholar + enabled: false # May be rate-limited, disabled by default + description: "Google Scholar web scraping (use as last resort)" + +# ============================================================================== +# 🤖 LLM Configuration (for Relevance Checking) +# ============================================================================== +llm: + # Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek + # Each backend requires different setup (API keys, local installation, etc.) 
+ backend: "gemini" + + # Model name (leave empty to use backend default) + # Examples: "gpt-4", "claude-3-opus", "gemini-pro", "llama3" + model: "" + + # API endpoint (leave empty to use backend default) + # Only needed for self-hosted models (vllm, ollama) or custom endpoints + endpoint: "" + + # API key (recommended to use environment variables instead) + # Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. in your environment + api_key: "" + +# ============================================================================== +# 📊 Output Settings +# ============================================================================== +output: + # Quiet mode - Suppress progress messages, only output final reports + # Useful for CI/CD pipelines or batch processing + quiet: false + + # Minimal verified entries - Hide detailed info for entries that passed all checks + # Reduces report size when you only care about issues + minimal_verified: false diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..f77d35e1cd9fd687e074ffd493835a0c80fd1e36 --- /dev/null +++ b/main.py @@ -0,0 +1,564 @@ +#!/usr/bin/env python3 +""" +BibGuard - Bibliography Checker & Paper Submission Quality Tool + +Usage: + python main.py # Use bibguard.yaml in current directory + python main.py --config my.yaml # Use specified config file + python main.py --init # Create default config file + python main.py --list-templates # List available templates +""" +import argparse +import sys +from pathlib import Path +from typing import Optional, List + +from src.parsers import BibParser, TexParser +from src.fetchers import ArxivFetcher, ScholarFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher +from src.analyzers import MetadataComparator, UsageChecker, LLMEvaluator, DuplicateDetector +from src.analyzers.llm_evaluator import LLMBackend +from src.report.generator import ReportGenerator, EntryReport +from src.utils.progress import 
ProgressDisplay +from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config +from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow +from src.templates.base_template import get_template, get_all_templates +from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity + + +def main(): + parser = argparse.ArgumentParser( + description="BibGuard: Bibliography Checker & Paper Submission Quality Tool", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Usage Examples: + python main.py # Auto-detect config.yaml in current directory + python main.py --config my.yaml # Use specified config file + python main.py --init # Create default config.yaml + python main.py --list-templates # List available conference templates + """ + ) + + parser.add_argument( + "--config", "-c", + help="Config file path (default: auto-detect config.yaml)" + ) + parser.add_argument( + "--init", + action="store_true", + help="Create default config.yaml in current directory" + ) + parser.add_argument( + "--list-templates", + action="store_true", + help="List all available conference templates" + ) + + args = parser.parse_args() + + # Handle --init + if args.init: + output = create_default_config() + print(f"✓ Created configuration file: {output}") + print("") + print(" Next steps:") + print(" 1. Edit the 'bib' and 'tex' paths in config.yaml") + print(" 2. 
Run: python main.py --config config.yaml") + print("") + sys.exit(0) + + # Handle --list-templates + if args.list_templates: + from src.ui.template_selector import list_templates + list_templates() + sys.exit(0) + + # Find and load config + config_path = args.config + if not config_path: + found = find_config_file() + if found: + config_path = str(found) + else: + print("Error: Config file not found") + print("") + print("Please run 'python main.py --init' to create config.yaml") + print("Or use 'python main.py --config ' to specify a config file") + print("") + sys.exit(1) + + try: + config = load_config(config_path) + except FileNotFoundError: + print(f"Error: Config file does not exist: {config_path}") + sys.exit(1) + except Exception as e: + print(f"Error: Failed to parse config file: {e}") + sys.exit(1) + + # Validate required fields + mode_dir = bool(config.files.input_dir) + + if mode_dir: + input_dir = config.input_dir_path + if not input_dir.exists() or not input_dir.is_dir(): + print(f"Error: Input directory does not exist or is not a directory: {input_dir}") + sys.exit(1) + + tex_files = list(input_dir.rglob("*.tex")) + bib_files = list(input_dir.rglob("*.bib")) + + if not tex_files: + print(f"Error: No .tex files found in {input_dir}") + sys.exit(1) + if not bib_files: + print(f"Error: No .bib files found in {input_dir}") + sys.exit(1) + + config._tex_files = tex_files + config._bib_files = bib_files + else: + if not config.files.bib: + print("Error: bib file path not specified in config") + sys.exit(1) + if not config.files.tex: + print("Error: tex file path not specified in config") + sys.exit(1) + + # Validate files exist + if not config.bib_path.exists(): + print(f"Error: Bib file does not exist: {config.bib_path}") + sys.exit(1) + if not config.tex_path.exists(): + print(f"Error: TeX file does not exist: {config.tex_path}") + sys.exit(1) + + config._tex_files = [config.tex_path] + config._bib_files = [config.bib_path] + + # Load template if 
specified + template = None + if config.template: + template = get_template(config.template) + if not template: + print(f"Error: Unknown template: {config.template}") + print("Use --list-templates to see available templates") + sys.exit(1) + + # Run the checker + try: + run_checker(config, template) + except KeyboardInterrupt: + print("\n\nCancelled") + sys.exit(130) + except Exception as e: + print(f"\nError: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +def run_checker(config: BibGuardConfig, template=None): + """Run the bibliography checker with the given configuration.""" + progress = ProgressDisplay() + + # Show config info (minimal) + if template: + pass # Skip printing header/info here to keep output clean + + # Parse files (silent) + bib_parser = BibParser() + entries = [] + for bib_path in config._bib_files: + entries.extend(bib_parser.parse_file(str(bib_path))) + + tex_parser = TexParser() + tex_contents = {} + merged_citations = {} + merged_all_keys = set() + + for tex_path in config._tex_files: + cits = tex_parser.parse_file(str(tex_path)) + # Accumulate citations + for k, v in cits.items(): + if k not in merged_citations: + merged_citations[k] = [] + merged_citations[k].extend(v) + # Accumulate keys + merged_all_keys.update(tex_parser.get_all_cited_keys()) + # Store content + tex_contents[str(tex_path)] = tex_path.read_text(encoding='utf-8', errors='replace') + + # Inject merged data back into parser for components that use it + tex_parser.citations = merged_citations + tex_parser.all_keys = merged_all_keys + + # Initialize components based on config + bib_config = config.bibliography + + arxiv_fetcher = None + crossref_fetcher = None + scholar_fetcher = None + semantic_scholar_fetcher = None + openalex_fetcher = None + dblp_fetcher = None + comparator = None + usage_checker = None + llm_evaluator = None + duplicate_detector = None + + if bib_config.check_metadata or bib_config.check_relevance: + arxiv_fetcher = ArxivFetcher() + + 
if bib_config.check_metadata: + semantic_scholar_fetcher = SemanticScholarFetcher() + openalex_fetcher = OpenAlexFetcher() + dblp_fetcher = DBLPFetcher() + crossref_fetcher = CrossRefFetcher() + scholar_fetcher = ScholarFetcher() + comparator = MetadataComparator() + + if bib_config.check_usage: + usage_checker = UsageChecker(tex_parser) + + if bib_config.check_duplicates: + duplicate_detector = DuplicateDetector() + + if bib_config.check_relevance: + llm_config = config.llm + backend = LLMBackend(llm_config.backend) + llm_evaluator = LLMEvaluator( + backend=backend, + endpoint=llm_config.endpoint or None, + model=llm_config.model or None, + api_key=llm_config.api_key or None + ) + + # Test LLM connection (silent) + llm_evaluator.test_connection() + + if not usage_checker: + usage_checker = UsageChecker(tex_parser) + + # Initialize report generator + report_gen = ReportGenerator( + minimal_verified=config.output.minimal_verified, + check_preprint_ratio=config.bibliography.check_preprint_ratio, + preprint_warning_threshold=config.bibliography.preprint_warning_threshold + ) + report_gen.set_metadata( + [str(f) for f in config._bib_files], + [str(f) for f in config._tex_files] + ) + + # Run submission quality checks + submission_results = [] + enabled_checkers = config.submission.get_enabled_checkers() + + for checker_name in enabled_checkers: + if checker_name in CHECKER_REGISTRY: + checker = CHECKER_REGISTRY[checker_name]() + for tex_path_str, content in tex_contents.items(): + results = checker.check(content, {}) + # Tag results with file path + for r in results: + r.file_path = tex_path_str + submission_results.extend(results) + + # Set results in report generator for summary calculation + report_gen.set_submission_results(submission_results, template) + + # Check for duplicates (silent) + if bib_config.check_duplicates and duplicate_detector: + duplicate_groups = duplicate_detector.find_duplicates(entries) + report_gen.set_duplicate_groups(duplicate_groups) + + # 
Check missing citations (silent) + if bib_config.check_usage and usage_checker: + missing = usage_checker.get_missing_entries(entries) + report_gen.set_missing_citations(missing) + + # Process entries + + # Build workflow from config + from src.config.workflow import WorkflowConfig, get_default_workflow, WorkflowStep as WFStep + workflow_config = get_default_workflow() + if config.workflow: + workflow_config = WorkflowConfig( + steps=[ + WFStep( + name=step.name, + display_name=step.name, + description=step.description, + enabled=step.enabled, + priority=i + ) + for i, step in enumerate(config.workflow) + ] + ) + + # Process entries in parallel for metadata checks + from concurrent.futures import ThreadPoolExecutor, as_completed + import threading + + # Thread-safe progress tracking + progress_lock = threading.Lock() + completed_count = [0] # Use list for mutability in closure + + def process_single_entry(entry): + """Process a single entry (thread-safe).""" + # Check usage + usage_result = None + if usage_checker: + usage_result = usage_checker.check_usage(entry) + + # Fetch and compare metadata + comparison_result = None + if bib_config.check_metadata and comparator: + comparison_result = fetch_and_compare_with_workflow( + entry, workflow_config, arxiv_fetcher, crossref_fetcher, + scholar_fetcher, semantic_scholar_fetcher, openalex_fetcher, + dblp_fetcher, comparator + ) + + # LLM evaluation (keep sequential per entry) + evaluations = [] + if bib_config.check_relevance and llm_evaluator: + if usage_result and usage_result.is_used: + abstract = get_abstract(entry, comparison_result, arxiv_fetcher) + if abstract: + for ctx in usage_result.contexts: + eval_result = llm_evaluator.evaluate( + entry.key, ctx.full_context, abstract + ) + eval_result.line_number = ctx.line_number + eval_result.file_path = ctx.file_path + evaluations.append(eval_result) + + # Create entry report + entry_report = EntryReport( + entry=entry, + comparison=comparison_result, + 
usage=usage_result, + evaluations=evaluations + ) + + return entry_report, comparison_result + + # Determine number of workers (max 10 to avoid overwhelming APIs) + max_workers = min(10, len(entries)) + + with progress.progress_context(len(entries), "Processing bibliography") as prog: + # Use ThreadPoolExecutor for parallel processing + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all tasks + future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries} + + # Process completed tasks + for future in as_completed(future_to_entry): + entry = future_to_entry[future] + try: + entry_report, comparison_result = future.result() + + # Thread-safe progress update + with progress_lock: + report_gen.add_entry_report(entry_report) + + # Update progress + if comparison_result and comparison_result.is_match: + prog.mark_success() + elif comparison_result and comparison_result.has_issues: + prog.mark_warning() + else: + prog.mark_error() + + completed_count[0] += 1 + prog.update(entry.key, "Done", 1) + + except Exception as e: + with progress_lock: + prog.mark_error() + progress.print_error(f"Error processing {entry.key}: {e}") + completed_count[0] += 1 + prog.update(entry.key, "Failed", 1) + + # Summary will be printed at the very end + + # Generate reports and organize outputs (silent) + + # Create output directory + output_dir = config.output_dir_path + output_dir.mkdir(parents=True, exist_ok=True) + + # Copy input files to output directory + import shutil + for bib_path in config._bib_files: + shutil.copy2(bib_path, output_dir / bib_path.name) + for tex_path in config._tex_files: + shutil.copy2(tex_path, output_dir / tex_path.name) + # 1. Bibliography Report + bib_report_path = output_dir / "bibliography_report.md" + report_gen.save_bibliography_report(str(bib_report_path)) + + # 2. 
LaTeX Quality Report + if submission_results: + latex_report_path = output_dir / "latex_quality_report.md" + report_gen.save_latex_quality_report( + str(latex_report_path), + submission_results, + template + ) + + # 3. Line-by-Line Report + from src.report.line_report import generate_line_report + line_report_path = output_dir / "line_by_line_report.md" + + # For multiple files, we generate one big report with sections + all_line_reports = [] + for tex_path_str, content in tex_contents.items(): + file_results = [r for r in submission_results if r.file_path == tex_path_str] + if not file_results: + continue + + from src.report.line_report import LineByLineReportGenerator + gen = LineByLineReportGenerator(content, tex_path_str) + gen.add_results(file_results) + all_line_reports.append(gen.generate()) + + if all_line_reports: + with open(line_report_path, 'w', encoding='utf-8') as f: + f.write("\n\n".join(all_line_reports)) + + # 4. Clean bib file (if generated earlier) + if bib_config.check_usage and usage_checker: + used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used] + if used_entries: + try: + keys_to_keep = {entry.key for entry in used_entries} + # If multiple bibs, we merge them into one cleaned file + # or just use the first one if it's single mode. + # For now, let's just use a default name if multiple. + if len(config._bib_files) == 1: + clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib" + bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep) + else: + clean_bib_path = output_dir / "merged_only_used.bib" + # We need a way to filter multiple files into one. + # BibParser.filter_file currently takes one input. + # Let's just write all used entries to a new file. 
+ with open(clean_bib_path, 'w', encoding='utf-8') as f: + for entry in used_entries: + f.write(entry.raw + "\n\n") + except Exception as e: + pass + + # Print beautiful console summary + if not config.output.quiet: + bib_stats, latex_stats = report_gen.get_summary_stats() + progress.print_detailed_summary(bib_stats, latex_stats, str(output_dir.absolute())) + + +def fetch_and_compare_with_workflow( + entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher, + semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator +): + """Fetch metadata from online sources using the configured workflow.""" + from src.utils.normalizer import TextNormalizer + + all_results = [] + enabled_steps = workflow_config.get_enabled_steps() + + for step in enabled_steps: + result = None + + if step.name == "arxiv_id" and entry.has_arxiv and arxiv_fetcher: + arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id) + if arxiv_meta: + result = comparator.compare_with_arxiv(entry, arxiv_meta) + + elif step.name == "crossref_doi" and entry.doi and crossref_fetcher: + crossref_result = crossref_fetcher.search_by_doi(entry.doi) + if crossref_result: + result = comparator.compare_with_crossref(entry, crossref_result) + + elif step.name == "semantic_scholar" and entry.title and semantic_scholar_fetcher: + ss_result = None + if entry.doi: + ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi) + if not ss_result: + ss_result = semantic_scholar_fetcher.search_by_title(entry.title) + if ss_result: + result = comparator.compare_with_semantic_scholar(entry, ss_result) + + elif step.name == "dblp" and entry.title and dblp_fetcher: + dblp_result = dblp_fetcher.search_by_title(entry.title) + if dblp_result: + result = comparator.compare_with_dblp(entry, dblp_result) + + elif step.name == "openalex" and entry.title and openalex_fetcher: + oa_result = None + if entry.doi: + oa_result = openalex_fetcher.fetch_by_doi(entry.doi) + if not oa_result: + oa_result = 
openalex_fetcher.search_by_title(entry.title) + if oa_result: + result = comparator.compare_with_openalex(entry, oa_result) + + elif step.name == "arxiv_title" and entry.title and arxiv_fetcher: + results = arxiv_fetcher.search_by_title(entry.title, max_results=3) + if results: + best_result = None + best_sim = 0.0 + norm1 = TextNormalizer.normalize_for_comparison(entry.title) + + for r in results: + norm2 = TextNormalizer.normalize_for_comparison(r.title) + sim = TextNormalizer.similarity_ratio(norm1, norm2) + if sim > best_sim: + best_sim = sim + best_result = r + + if best_result and best_sim > 0.5: + result = comparator.compare_with_arxiv(entry, best_result) + + elif step.name == "crossref_title" and entry.title and crossref_fetcher: + crossref_result = crossref_fetcher.search_by_title(entry.title) + if crossref_result: + result = comparator.compare_with_crossref(entry, crossref_result) + + elif step.name == "google_scholar" and entry.title and scholar_fetcher: + scholar_result = scholar_fetcher.search_by_title(entry.title) + if scholar_result: + result = comparator.compare_with_scholar(entry, scholar_result) + + if result: + all_results.append(result) + if result.is_match: + return result + + if all_results: + all_results.sort(key=lambda r: r.confidence, reverse=True) + return all_results[0] + + return comparator.create_unable_result(entry, "Unable to find this paper in any data source") + + +def get_abstract(entry, comparison_result, arxiv_fetcher): + """Get abstract for an entry from various sources.""" + if entry.abstract: + return entry.abstract + + if entry.has_arxiv and arxiv_fetcher: + arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id) + if arxiv_meta and arxiv_meta.abstract: + return arxiv_meta.abstract + + if entry.title and arxiv_fetcher: + results = arxiv_fetcher.search_by_title(entry.title, max_results=1) + if results and results[0].abstract: + return results[0].abstract + + return "" + + +if __name__ == "__main__": + main() diff --git 
a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9daac4b5e5b83ea62c2a379a6d8b6b94969dbec2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +gradio>=4.0.0 +bibtexparser>=1.4.0 +requests>=2.31.0 +beautifulsoup4>=4.12.0 +rich>=13.7.0 +Unidecode>=1.3.0 +lxml>=5.0.0 +PyYAML>=6.0 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0117b5587d0af7197827f9cd247130558430ddc1 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +"""Bibliography Checker Package""" diff --git a/src/__pycache__/__init__.cpython-311.pyc b/src/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f7f93299a13eacb2dde71c5682e2d6728c5e004 Binary files /dev/null and b/src/__pycache__/__init__.cpython-311.pyc differ diff --git a/src/__pycache__/__init__.cpython-313.pyc b/src/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf00bb1d45f8e27a17b606bff18240ef972d3884 Binary files /dev/null and b/src/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/analyzers/__init__.py b/src/analyzers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0a37a5742021697f75381d6065ac1bce2ea13eb5 --- /dev/null +++ b/src/analyzers/__init__.py @@ -0,0 +1,7 @@ +"""Analyzers package""" +from .metadata_comparator import MetadataComparator +from .usage_checker import UsageChecker +from .llm_evaluator import LLMEvaluator +from .duplicate_detector import DuplicateDetector + +__all__ = ['MetadataComparator', 'UsageChecker', 'LLMEvaluator', 'DuplicateDetector'] diff --git a/src/analyzers/__pycache__/__init__.cpython-313.pyc b/src/analyzers/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..509937c364b785deb44375acb3aaba5013361e3d Binary files /dev/null and b/src/analyzers/__pycache__/__init__.cpython-313.pyc 
differ diff --git a/src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc b/src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c7bb39c9efd0e7d77bd19c7f34e62d025ab5192 Binary files /dev/null and b/src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc differ diff --git a/src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc b/src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf74a8d1d49fa44054e0e72b9f877dbd49b4f95e Binary files /dev/null and b/src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc differ diff --git a/src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc b/src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe9762b1cde28ad6e29ba3f4c3adc11d15564b76 Binary files /dev/null and b/src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc differ diff --git a/src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc b/src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b19844c8e6b1dcc1954bf4ab5afdf1df5bb02a9b Binary files /dev/null and b/src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc differ diff --git a/src/analyzers/__pycache__/retraction_checker.cpython-313.pyc b/src/analyzers/__pycache__/retraction_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d2cb0813a973d1cf3460c4535633e00d0a04efa Binary files /dev/null and b/src/analyzers/__pycache__/retraction_checker.cpython-313.pyc differ diff --git a/src/analyzers/__pycache__/url_validator.cpython-313.pyc b/src/analyzers/__pycache__/url_validator.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63eec7a7977c4938f9e48a79bee40a00c45d3e4d Binary files /dev/null 
@dataclass
class DuplicateGroup:
    """A group of potentially duplicate bibliography entries."""
    entries: List[BibEntry]      # entries judged to be duplicates of each other
    similarity_score: float      # average pairwise similarity within the group
    reason: str                  # human-readable explanation of the grouping

    @property
    def entry_keys(self) -> List[str]:
        """Citation keys of every entry in this group."""
        return [e.key for e in self.entries]


class DuplicateDetector:
    """Detects duplicate bibliography entries using fuzzy matching."""

    # Thresholds for duplicate detection
    TITLE_SIMILARITY_THRESHOLD = 0.85
    COMBINED_SIMILARITY_THRESHOLD = 0.80

    def __init__(self):
        # NOTE(review): this stores the TextNormalizer *class*, not an
        # instance — presumably its methods are static/class methods; confirm.
        self.normalizer = TextNormalizer

    def find_duplicates(self, entries: List[BibEntry]) -> List[DuplicateGroup]:
        """
        Find all duplicate groups in the bibliography.

        Returns:
            List of DuplicateGroup objects, each containing 2+ similar
            entries, sorted by similarity score (highest first).
        """
        duplicates: List[DuplicateGroup] = []
        processed = set()  # keys already assigned to some group

        for i, entry1 in enumerate(entries):
            if entry1.key in processed:
                continue

            # Collect every later, not-yet-grouped entry similar to entry1.
            similar_entries = [entry1]
            for entry2 in entries[i + 1:]:
                if entry2.key in processed:
                    continue

                similarity, _ = self._calculate_similarity(entry1, entry2)
                if similarity >= self.COMBINED_SIMILARITY_THRESHOLD:
                    similar_entries.append(entry2)
                    processed.add(entry2.key)

            # If we found duplicates, create a group.
            if len(similar_entries) > 1:
                processed.add(entry1.key)
                duplicates.append(DuplicateGroup(
                    entries=similar_entries,
                    similarity_score=self._calculate_group_similarity(similar_entries),
                    reason=self._generate_reason(similar_entries),
                ))

        # Sort by similarity score (highest first).
        duplicates.sort(key=lambda g: g.similarity_score, reverse=True)
        return duplicates

    def _calculate_similarity(self, entry1: BibEntry, entry2: BibEntry) -> Tuple[float, str]:
        """
        Calculate similarity between two entries.

        Returns:
            (similarity_score, reason_string); the reason is empty when the
            pair does not cross the duplicate threshold.
        """
        title1 = self.normalizer.normalize_for_comparison(entry1.title)
        title2 = self.normalizer.normalize_for_comparison(entry2.title)
        title_sim = self.normalizer.similarity_ratio(title1, title2)

        # Near-identical titles are decisive on their own.
        if title_sim >= self.TITLE_SIMILARITY_THRESHOLD:
            return title_sim, "Very similar titles"

        author_sim = self._calculate_author_similarity(entry1, entry2)

        # Weighted combination: title matters more (70%) than authors (30%).
        combined_sim = 0.7 * title_sim + 0.3 * author_sim

        if combined_sim >= self.COMBINED_SIMILARITY_THRESHOLD:
            return combined_sim, f"Similar title ({title_sim:.0%}) and authors ({author_sim:.0%})"

        return combined_sim, ""

    def _calculate_author_similarity(self, entry1: BibEntry, entry2: BibEntry) -> float:
        """
        Return a Jaccard-style similarity (0.0-1.0) between the two entries'
        author lists.

        Fix: matching is performed over the *unique* normalized names. The
        previous version counted matches per raw name but divided by the
        unique-set union size, so duplicated author names could yield a
        "similarity" greater than 1.0.
        """
        authors1 = self._parse_authors(entry1.author)
        authors2 = self._parse_authors(entry2.author)

        if not authors1 or not authors2:
            return 0.0

        unique1 = {self.normalizer.normalize_for_comparison(a) for a in authors1}
        unique2 = {self.normalizer.normalize_for_comparison(a) for a in authors2}

        # Count names in the first set that fuzzily match something in the second.
        matches = sum(
            1 for a1 in unique1
            if any(self._authors_match(a1, a2) for a2 in unique2)
        )

        total_unique = len(unique1 | unique2)
        if total_unique == 0:
            return 0.0

        # Clamp defensively; fuzzy matching is not a strict intersection.
        return min(1.0, matches / total_unique)

    def _parse_authors(self, author_string: str) -> List[str]:
        """Parse a BibTeX-style author string into a list of individual names."""
        if not author_string:
            return []

        cleaned = []
        for author in author_string.split(' and '):
            author = ' '.join(author.split())  # collapse runs of whitespace
            if author:
                cleaned.append(author)
        return cleaned

    def _authors_match(self, name1: str, name2: str) -> bool:
        """Check if two (normalized) author names match, tolerating initials."""
        # Exact match after normalization.
        if name1 == name2:
            return True

        # One name contained in the other handles abbreviated forms/initials.
        if name1 in name2 or name2 in name1:
            return True

        # Fall back to fuzzy string similarity.
        sim = self.normalizer.similarity_ratio(name1, name2)
        return sim >= 0.8

    def _calculate_group_similarity(self, entries: List[BibEntry]) -> float:
        """Return the average pairwise similarity within a group."""
        if len(entries) < 2:
            return 1.0

        total_sim = 0.0
        count = 0
        for i, entry1 in enumerate(entries):
            for entry2 in entries[i + 1:]:
                sim, _ = self._calculate_similarity(entry1, entry2)
                total_sim += sim
                count += 1

        return total_sim / count if count > 0 else 0.0

    def _generate_reason(self, entries: List[BibEntry]) -> str:
        """Generate a human-readable reason for the duplicate group."""
        titles = [self.normalizer.normalize_for_comparison(e.title) for e in entries]

        # Average pairwise title similarity drives the wording.
        title_sims = []
        for i, t1 in enumerate(titles):
            for t2 in titles[i + 1:]:
                title_sims.append(self.normalizer.similarity_ratio(t1, t2))

        avg_title_sim = sum(title_sims) / len(title_sims) if title_sims else 0.0

        if avg_title_sim >= 0.95:
            return "Nearly identical titles"
        elif avg_title_sim >= 0.85:
            return "Very similar titles"
        else:
            return "Similar titles and authors"
+""" +import json +import re +from dataclasses import dataclass +from typing import Optional, Dict, Any +from enum import Enum +import os + +import requests + + +class LLMBackend(Enum): + OPENAI = "openai" + ANTHROPIC = "anthropic" + GEMINI = "gemini" + VLLM = "vllm" + OLLAMA = "ollama" + DEEPSEEK = "deepseek" + + +@dataclass +class EvaluationResult: + """Result of LLM citation evaluation.""" + entry_key: str + relevance_score: int # 1-5 + is_relevant: bool + explanation: str + context_used: str + abstract_used: str + line_number: Optional[int] = None + file_path: Optional[str] = None + error: Optional[str] = None + + @property + def score_label(self) -> str: + labels = { + 1: "Not Relevant", + 2: "Marginally Relevant", + 3: "Somewhat Relevant", + 4: "Relevant", + 5: "Highly Relevant" + } + return labels.get(self.relevance_score, "Unknown") + + +class LLMEvaluator: + """Evaluates citation relevance using LLM.""" + + PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant. + +## Citation Context (from the manuscript): +{context} + +## Cited Paper's Abstract: +{abstract} + +## Task: +Evaluate the relevance and appropriateness of this citation. Consider: +1. Does the citation support the claim being made in the context? +2. Is the cited paper's topic related to the discussion? +3. Is this citation necessary, or could it be replaced with a more relevant one? + +## Response Format: +Provide your response in the following JSON format: +{{ + "relevance_score": <1-5 integer>, + "is_relevant": , + "explanation": "" +}} + +Score guide: +- 1: Not relevant at all +- 2: Marginally relevant +- 3: Somewhat relevant +- 4: Relevant and appropriate +- 5: Highly relevant and essential + +STRICTLY FOLLOW THE JSON FORMAT. 
Respond ONLY with the JSON object, no other text.""" + + def __init__( + self, + backend: LLMBackend = LLMBackend.GEMINI, + endpoint: Optional[str] = None, + model: Optional[str] = None, + api_key: Optional[str] = None + ): + self.backend = backend + self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY") + + # Set defaults based on backend + if backend == LLMBackend.OPENAI: + self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions" + self.model = model or "gpt-5-mini" + elif backend == LLMBackend.ANTHROPIC: + self.endpoint = endpoint or "https://api.anthropic.com/v1/messages" + self.model = model or "claude-4.5-haiku" + elif backend == LLMBackend.DEEPSEEK: + self.endpoint = endpoint or "https://api.deepseek.com/chat/completions" + self.model = model or "deepseek-chat" + elif backend == LLMBackend.OLLAMA: + self.endpoint = endpoint or "http://localhost:11434/api/generate" + self.model = model or "Qwen/qwen3-4B-Instruct-2507" + elif backend == LLMBackend.VLLM: + self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions" + self.model = model or "Qwen/qwen3-4B-Instruct-2507" + elif backend == LLMBackend.GEMINI: + self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models" + self.model = model or "gemini-2.5-flash-lite" + + def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult: + """Evaluate citation relevance.""" + if not context or not abstract: + return EvaluationResult( + entry_key=entry_key, + relevance_score=0, + is_relevant=False, + explanation="Missing context or abstract", + context_used=context, + abstract_used=abstract, + error="Missing context or abstract for evaluation" + ) + + # Don't truncate - preserve full context and abstract + prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract) + + try: + if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM): + response = self._call_openai_compatible(prompt) + elif self.backend 
== LLMBackend.ANTHROPIC: + response = self._call_anthropic(prompt) + elif self.backend == LLMBackend.OLLAMA: + response = self._call_ollama(prompt) + elif self.backend == LLMBackend.GEMINI: + response = self._call_gemini(prompt) + else: + raise ValueError(f"Unknown backend: {self.backend}") + + return self._parse_response(entry_key, response, context, abstract) + + except Exception as e: + return EvaluationResult( + entry_key=entry_key, + relevance_score=0, + is_relevant=False, + explanation="", + context_used=context, + abstract_used=abstract, + error=str(e) + ) + + def _call_openai_compatible(self, prompt: str) -> str: + """Call OpenAI-compatible API (OpenAI, DeepSeek, vLLM).""" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}" + } + + payload = { + "model": self.model, + "messages": [ + {"role": "user", "content": prompt} + ], + "temperature": 0.1, + "max_tokens": 2000, + "response_format": {"type": "json_object"} if self.backend == LLMBackend.OPENAI else None + } + + response = requests.post( + self.endpoint, + json=payload, + headers=headers, + timeout=60 + ) + response.raise_for_status() + + data = response.json() + choices = data.get("choices", []) + if choices: + return choices[0].get("message", {}).get("content", "") + return "" + + def _call_anthropic(self, prompt: str) -> str: + """Call Anthropic API.""" + headers = { + "x-api-key": self.api_key, + "anthropic-version": "2023-06-01", + "content-type": "application/json" + } + + payload = { + "model": self.model, + "max_tokens": 2000, + "temperature": 0.1, + "messages": [ + {"role": "user", "content": prompt} + ] + } + + response = requests.post( + self.endpoint, + json=payload, + headers=headers, + timeout=60 + ) + response.raise_for_status() + + data = response.json() + content = data.get("content", []) + if content and content[0].get("type") == "text": + return content[0].get("text", "") + return "" + + def _call_ollama(self, prompt: str) -> str: + """Call 
Ollama API.""" + payload = { + "model": self.model, + "prompt": prompt, + "stream": False, + "options": { + "temperature": 0.1, + "num_predict": 2000 + }, + "format": "json" + } + + response = requests.post( + self.endpoint, + json=payload, + timeout=60 + ) + response.raise_for_status() + + return response.json().get("response", "") + + def _call_gemini(self, prompt: str) -> str: + """Call Gemini API.""" + # Build URL with model + url = f"{self.endpoint}/{self.model}:generateContent" + if self.api_key: + url += f"?key={self.api_key}" + + payload = { + "contents": [ + { + "parts": [ + {"text": prompt} + ] + } + ], + "generationConfig": { + "temperature": 0.1, + "maxOutputTokens": 2000, + "responseMimeType": "application/json" + } + } + + response = requests.post( + url, + json=payload, + timeout=60 + ) + response.raise_for_status() + + candidates = response.json().get("candidates", []) + if candidates: + content = candidates[0].get("content", {}) + parts = content.get("parts", []) + if parts: + return parts[0].get("text", "") + return "" + + def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult: + """Parse LLM response.""" + # Try to extract JSON from response + json_match = re.search(r'\{[^{}]*\}', response, re.DOTALL) + + data = {} + if not json_match: + # Try to parse the whole response as JSON + try: + data = json.loads(response.strip()) + except json.JSONDecodeError: + pass + else: + try: + data = json.loads(json_match.group()) + except json.JSONDecodeError: + pass + + if not data: + return EvaluationResult( + entry_key=entry_key, + relevance_score=0, + is_relevant=False, + explanation=response, + context_used=context, + abstract_used=abstract, + error="Failed to parse LLM response as JSON" + ) + + # Extract fields + relevance_score = data.get("relevance_score", 0) + if isinstance(relevance_score, str): + try: + relevance_score = int(relevance_score) + except ValueError: + relevance_score = 0 + + is_relevant = 
data.get("is_relevant", False) + if isinstance(is_relevant, str): + is_relevant = is_relevant.lower() in ("true", "yes", "1") + + explanation = data.get("explanation", "") + + return EvaluationResult( + entry_key=entry_key, + relevance_score=relevance_score, + is_relevant=is_relevant, + explanation=explanation, + context_used=context, + abstract_used=abstract + ) + + def test_connection(self) -> bool: + """Test if LLM backend is accessible.""" + try: + if self.backend == LLMBackend.OLLAMA: + response = requests.get( + self.endpoint.replace("/api/generate", "/api/tags"), + timeout=5 + ) + return response.status_code == 200 + elif self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM): + # Test with a simple model list or empty completion + headers = {"Authorization": f"Bearer {self.api_key}"} + # Try listing models if possible, otherwise simple completion + if "chat/completions" in self.endpoint: + # Try a minimal completion + payload = { + "model": self.model, + "messages": [{"role": "user", "content": "hi"}], + "max_tokens": 1 + } + response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10) + return response.status_code == 200 + else: + return False + elif self.backend == LLMBackend.ANTHROPIC: + headers = { + "x-api-key": self.api_key, + "anthropic-version": "2023-06-01", + "content-type": "application/json" + } + payload = { + "model": self.model, + "max_tokens": 1, + "messages": [{"role": "user", "content": "hi"}] + } + response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10) + return response.status_code == 200 + elif self.backend == LLMBackend.GEMINI: + if not self.api_key: + return False + url = f"{self.endpoint}/{self.model}:generateContent?key={self.api_key}" + payload = { + "contents": [{"parts": [{"text": "test"}]}], + "generationConfig": {"maxOutputTokens": 10} + } + response = requests.post(url, json=payload, timeout=10) + return response.status_code == 200 + except Exception: + return 
False + return False diff --git a/src/analyzers/metadata_comparator.py b/src/analyzers/metadata_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..ec53a23dc4f5fec944995e6a704f88e4533f20e8 --- /dev/null +++ b/src/analyzers/metadata_comparator.py @@ -0,0 +1,474 @@ +""" +Metadata comparison between bib entries and fetched metadata. +""" +from dataclasses import dataclass +from typing import Optional + +from ..parsers.bib_parser import BibEntry +from ..fetchers.arxiv_fetcher import ArxivMetadata +from ..fetchers.scholar_fetcher import ScholarResult +from ..fetchers.crossref_fetcher import CrossRefResult +from ..fetchers.semantic_scholar_fetcher import SemanticScholarResult +from ..fetchers.openalex_fetcher import OpenAlexResult +from ..fetchers.dblp_fetcher import DBLPResult +from ..utils.normalizer import TextNormalizer + + +@dataclass +class ComparisonResult: + """Result of comparing bib entry with fetched metadata.""" + entry_key: str + + # Title comparison + title_match: bool + title_similarity: float + bib_title: str + fetched_title: str + + # Author comparison + author_match: bool + author_similarity: float + bib_authors: list[str] + fetched_authors: list[str] + + # Year comparison + year_match: bool + bib_year: str + fetched_year: str + + # Overall assessment + is_match: bool + confidence: float + issues: list[str] + source: str # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable' + + @property + def has_issues(self) -> bool: + return len(self.issues) > 0 + + +class MetadataComparator: + """Compares bibliography entries with fetched metadata.""" + + # Thresholds for matching + TITLE_THRESHOLD = 0.8 + AUTHOR_THRESHOLD = 0.6 + + def __init__(self): + self.normalizer = TextNormalizer + + def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult: + """Compare bib entry with arXiv metadata.""" + issues = [] + + # Compare titles + bib_title_norm = 
self.normalizer.normalize_for_comparison(bib_entry.title) + arxiv_title_norm = self.normalizer.normalize_for_comparison(arxiv_meta.title) + + title_similarity = self.normalizer.similarity_ratio(bib_title_norm, arxiv_title_norm) + # Also try Levenshtein for short titles + if len(bib_title_norm) < 100: + lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, arxiv_title_norm) + title_similarity = max(title_similarity, lev_sim) + + title_match = title_similarity >= self.TITLE_THRESHOLD + + if not title_match: + issues.append(f"Title mismatch (similarity: {title_similarity:.2%})") + + # Compare authors + bib_authors = self.normalizer.normalize_author_list(bib_entry.author) + arxiv_authors = [self.normalizer.normalize_author_name(a) for a in arxiv_meta.authors] + + author_similarity = self._compare_author_lists(bib_authors, arxiv_authors) + author_match = author_similarity >= self.AUTHOR_THRESHOLD + + if not author_match: + issues.append(f"Author mismatch (similarity: {author_similarity:.2%})") + + # Compare years + bib_year = bib_entry.year.strip() + arxiv_year = arxiv_meta.year + year_match = bib_year == arxiv_year + + if not year_match and bib_year and arxiv_year: + issues.append(f"Year mismatch: bib={bib_year}, arxiv={arxiv_year}") + + # Overall assessment + is_match = title_match and author_match + confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) + + return ComparisonResult( + entry_key=bib_entry.key, + title_match=title_match, + title_similarity=title_similarity, + bib_title=bib_entry.title, + fetched_title=arxiv_meta.title, + author_match=author_match, + author_similarity=author_similarity, + bib_authors=bib_authors, + fetched_authors=arxiv_authors, + year_match=year_match, + bib_year=bib_year, + fetched_year=arxiv_year, + is_match=is_match, + confidence=confidence, + issues=issues, + source="arxiv" + ) + + def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> 
ComparisonResult: + """Compare bib entry with Scholar search result.""" + issues = [] + + # Compare titles + bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) + scholar_title_norm = self.normalizer.normalize_for_comparison(scholar_result.title) + + title_similarity = self.normalizer.similarity_ratio(bib_title_norm, scholar_title_norm) + if len(bib_title_norm) < 100: + lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, scholar_title_norm) + title_similarity = max(title_similarity, lev_sim) + + title_match = title_similarity >= self.TITLE_THRESHOLD + + if not title_match: + issues.append(f"Title mismatch (similarity: {title_similarity:.2%})") + + # Compare authors (Scholar format is less structured) + bib_authors = self.normalizer.normalize_author_list(bib_entry.author) + # Scholar authors are comma-separated + scholar_authors_raw = scholar_result.authors.split(',') + scholar_authors = [self.normalizer.normalize_author_name(a.strip()) for a in scholar_authors_raw] + + author_similarity = self._compare_author_lists(bib_authors, scholar_authors) + author_match = author_similarity >= self.AUTHOR_THRESHOLD + + if not author_match: + issues.append(f"Author mismatch (similarity: {author_similarity:.2%})") + + # Compare years + bib_year = bib_entry.year.strip() + scholar_year = scholar_result.year + year_match = bib_year == scholar_year + + if not year_match and bib_year and scholar_year: + issues.append(f"Year mismatch: bib={bib_year}, scholar={scholar_year}") + + # Overall assessment + is_match = title_match and author_match + confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) + + return ComparisonResult( + entry_key=bib_entry.key, + title_match=title_match, + title_similarity=title_similarity, + bib_title=bib_entry.title, + fetched_title=scholar_result.title, + author_match=author_match, + author_similarity=author_similarity, + bib_authors=bib_authors, + 
fetched_authors=scholar_authors, + year_match=year_match, + bib_year=bib_year, + fetched_year=scholar_year, + is_match=is_match, + confidence=confidence, + issues=issues, + source="scholar" + ) + + def compare_with_crossref(self, bib_entry: BibEntry, crossref_result: CrossRefResult) -> ComparisonResult: + """Compare bib entry with CrossRef search result.""" + issues = [] + + # Compare titles + bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) + crossref_title_norm = self.normalizer.normalize_for_comparison(crossref_result.title) + + title_similarity = self.normalizer.similarity_ratio(bib_title_norm, crossref_title_norm) + if len(bib_title_norm) < 100: + lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, crossref_title_norm) + title_similarity = max(title_similarity, lev_sim) + + title_match = title_similarity >= self.TITLE_THRESHOLD + + if not title_match: + issues.append(f"Title mismatch (similarity: {title_similarity:.2%})") + + # Compare authors + bib_authors = self.normalizer.normalize_author_list(bib_entry.author) + crossref_authors = [self.normalizer.normalize_author_name(a) for a in crossref_result.authors] + + author_similarity = self._compare_author_lists(bib_authors, crossref_authors) + author_match = author_similarity >= self.AUTHOR_THRESHOLD + + if not author_match: + issues.append(f"Author mismatch (similarity: {author_similarity:.2%})") + + # Compare years + bib_year = bib_entry.year.strip() + crossref_year = crossref_result.year + year_match = bib_year == crossref_year + + if not year_match and bib_year and crossref_year: + issues.append(f"Year mismatch: bib={bib_year}, crossref={crossref_year}") + + # Overall assessment + is_match = title_match and author_match + confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) + + return ComparisonResult( + entry_key=bib_entry.key, + title_match=title_match, + title_similarity=title_similarity, + bib_title=bib_entry.title, + 
fetched_title=crossref_result.title, + author_match=author_match, + author_similarity=author_similarity, + bib_authors=bib_authors, + fetched_authors=crossref_authors, + year_match=year_match, + bib_year=bib_year, + fetched_year=crossref_year, + is_match=is_match, + confidence=confidence, + issues=issues, + source="crossref" + ) + + def create_unable_result(self, bib_entry: BibEntry, reason: str = "Unable to fetch metadata") -> ComparisonResult: + """Create result when metadata couldn't be fetched.""" + return ComparisonResult( + entry_key=bib_entry.key, + title_match=False, + title_similarity=0.0, + bib_title=bib_entry.title, + fetched_title="", + author_match=False, + author_similarity=0.0, + bib_authors=self.normalizer.normalize_author_list(bib_entry.author), + fetched_authors=[], + year_match=False, + bib_year=bib_entry.year, + fetched_year="", + is_match=False, + confidence=0.0, + issues=[reason], + source="unable" + ) + + def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float: + """Compare two author lists.""" + if not list1 and not list2: + return 1.0 + if not list1 or not list2: + return 0.0 + + # Find best matches for each author in list1 + total_similarity = 0.0 + for author1 in list1: + best_match = 0.0 + for author2 in list2: + # Check if one name contains the other (handle abbreviated names) + if self._names_match(author1, author2): + best_match = 1.0 + break + sim = self.normalizer.similarity_ratio(author1, author2) + best_match = max(best_match, sim) + total_similarity += best_match + + return total_similarity / len(list1) + + def _names_match(self, name1: str, name2: str) -> bool: + """Check if two names match (handles abbreviated names).""" + words1 = name1.split() + words2 = name2.split() + + if not words1 or not words2: + return False + + # Check if last names match + if words1[-1] != words2[-1]: + # Try first word as last name too + if words1[0] != words2[-1] and words1[-1] != words2[0]: + return False + + return True + + 
def compare_with_semantic_scholar(self, bib_entry: BibEntry, ss_result: SemanticScholarResult) -> ComparisonResult: + """Compare bib entry with Semantic Scholar result.""" + issues = [] + + # Compare titles + bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) + ss_title_norm = self.normalizer.normalize_for_comparison(ss_result.title) + + title_similarity = self.normalizer.similarity_ratio(bib_title_norm, ss_title_norm) + if len(bib_title_norm) < 100: + lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, ss_title_norm) + title_similarity = max(title_similarity, lev_sim) + + title_match = title_similarity >= self.TITLE_THRESHOLD + + if not title_match: + issues.append(f"Title mismatch (similarity: {title_similarity:.2%})") + + # Compare authors + bib_authors = self.normalizer.normalize_author_list(bib_entry.author) + ss_authors = [self.normalizer.normalize_author_name(a) for a in ss_result.authors] + + author_similarity = self._compare_author_lists(bib_authors, ss_authors) + author_match = author_similarity >= self.AUTHOR_THRESHOLD + + if not author_match: + issues.append(f"Author mismatch (similarity: {author_similarity:.2%})") + + # Compare years + bib_year = bib_entry.year.strip() + ss_year = ss_result.year + year_match = bib_year == ss_year + + if not year_match and bib_year and ss_year: + issues.append(f"Year mismatch: bib={bib_year}, semantic_scholar={ss_year}") + + # Overall assessment + is_match = title_match and author_match + confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) + + return ComparisonResult( + entry_key=bib_entry.key, + title_match=title_match, + title_similarity=title_similarity, + bib_title=bib_entry.title, + fetched_title=ss_result.title, + author_match=author_match, + author_similarity=author_similarity, + bib_authors=bib_authors, + fetched_authors=ss_authors, + year_match=year_match, + bib_year=bib_year, + fetched_year=ss_year, + is_match=is_match, + 
confidence=confidence, + issues=issues, + source="semantic_scholar" + ) + + def compare_with_openalex(self, bib_entry: BibEntry, oa_result: OpenAlexResult) -> ComparisonResult: + """Compare bib entry with OpenAlex result.""" + issues = [] + + # Compare titles + bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) + oa_title_norm = self.normalizer.normalize_for_comparison(oa_result.title) + + title_similarity = self.normalizer.similarity_ratio(bib_title_norm, oa_title_norm) + if len(bib_title_norm) < 100: + lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, oa_title_norm) + title_similarity = max(title_similarity, lev_sim) + + title_match = title_similarity >= self.TITLE_THRESHOLD + + if not title_match: + issues.append(f"Title mismatch (similarity: {title_similarity:.2%})") + + # Compare authors + bib_authors = self.normalizer.normalize_author_list(bib_entry.author) + oa_authors = [self.normalizer.normalize_author_name(a) for a in oa_result.authors] + + author_similarity = self._compare_author_lists(bib_authors, oa_authors) + author_match = author_similarity >= self.AUTHOR_THRESHOLD + + if not author_match: + issues.append(f"Author mismatch (similarity: {author_similarity:.2%})") + + # Compare years + bib_year = bib_entry.year.strip() + oa_year = oa_result.year + year_match = bib_year == oa_year + + if not year_match and bib_year and oa_year: + issues.append(f"Year mismatch: bib={bib_year}, openalex={oa_year}") + + # Overall assessment + is_match = title_match and author_match + confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) + + return ComparisonResult( + entry_key=bib_entry.key, + title_match=title_match, + title_similarity=title_similarity, + bib_title=bib_entry.title, + fetched_title=oa_result.title, + author_match=author_match, + author_similarity=author_similarity, + bib_authors=bib_authors, + fetched_authors=oa_authors, + year_match=year_match, + bib_year=bib_year, + 
fetched_year=oa_year, + is_match=is_match, + confidence=confidence, + issues=issues, + source="openalex" + ) + + def compare_with_dblp(self, bib_entry: BibEntry, dblp_result: DBLPResult) -> ComparisonResult: + """Compare bib entry with DBLP result.""" + issues = [] + + # Compare titles + bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) + dblp_title_norm = self.normalizer.normalize_for_comparison(dblp_result.title) + + title_similarity = self.normalizer.similarity_ratio(bib_title_norm, dblp_title_norm) + if len(bib_title_norm) < 100: + lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, dblp_title_norm) + title_similarity = max(title_similarity, lev_sim) + + title_match = title_similarity >= self.TITLE_THRESHOLD + + if not title_match: + issues.append(f"Title mismatch (similarity: {title_similarity:.2%})") + + # Compare authors + bib_authors = self.normalizer.normalize_author_list(bib_entry.author) + dblp_authors = [self.normalizer.normalize_author_name(a) for a in dblp_result.authors] + + author_similarity = self._compare_author_lists(bib_authors, dblp_authors) + author_match = author_similarity >= self.AUTHOR_THRESHOLD + + if not author_match: + issues.append(f"Author mismatch (similarity: {author_similarity:.2%})") + + # Compare years + bib_year = bib_entry.year.strip() + dblp_year = dblp_result.year + year_match = bib_year == dblp_year + + if not year_match and bib_year and dblp_year: + issues.append(f"Year mismatch: bib={bib_year}, dblp={dblp_year}") + + # Overall assessment + is_match = title_match and author_match + confidence = (title_similarity * 0.5 + author_similarity * 0.3 + (1.0 if year_match else 0.5) * 0.2) + + return ComparisonResult( + entry_key=bib_entry.key, + title_match=title_match, + title_similarity=title_similarity, + bib_title=bib_entry.title, + fetched_title=dblp_result.title, + author_match=author_match, + author_similarity=author_similarity, + bib_authors=bib_authors, + fetched_authors=dblp_authors, 
@dataclass
class UsageResult:
    """Result of checking whether a single bib entry is cited."""
    entry_key: str
    is_used: bool
    usage_count: int
    contexts: list[CitationContext]
    line_numbers: list[int]

    @property
    def first_usage_line(self) -> Optional[int]:
        """Line of the first citation, or None if the entry is never cited."""
        return self.line_numbers[0] if self.line_numbers else None


class UsageChecker:
    """Checks whether bibliography entries are cited in the TeX document."""

    def __init__(self, tex_parser: TexParser):
        self.tex_parser = tex_parser
        # Cache the cited-key set once; it is consulted for every entry.
        self._cited_keys = tex_parser.get_all_cited_keys()

    def check_usage(self, entry: BibEntry) -> UsageResult:
        """Return usage details (count, contexts, line numbers) for *entry*."""
        key = entry.key
        contexts = self.tex_parser.get_citation_contexts(key)

        return UsageResult(
            entry_key=key,
            is_used=key in self._cited_keys,
            usage_count=len(contexts),
            contexts=contexts,
            line_numbers=[ctx.line_number for ctx in contexts],
        )

    def get_unused_entries(self, entries: list[BibEntry]) -> list[BibEntry]:
        """Return entries that are never cited in the document."""
        return [entry for entry in entries if entry.key not in self._cited_keys]

    def get_missing_entries(self, entries: list[BibEntry]) -> list[str]:
        """Return citation keys that have no corresponding bib entry."""
        entry_keys = {e.key for e in entries}
        return [key for key in self._cited_keys if key not in entry_keys]

    def get_combined_context(self, key: str, max_chars: int = 1000) -> str:
        """Join all citation contexts for *key*, capped at *max_chars*.

        A context that would exceed the cap is truncated with "..." — but
        only if at least ~100 chars of it fit; otherwise it is dropped.
        """
        contexts = self.tex_parser.get_citation_contexts(key)
        if not contexts:
            return ""

        combined: list[str] = []
        total_chars = 0

        for ctx in contexts:
            if total_chars + len(ctx.full_context) > max_chars:
                remaining = max_chars - total_chars
                if remaining > 100:  # only include a meaningfully long tail
                    combined.append(ctx.full_context[:remaining] + "...")
                break
            combined.append(ctx.full_context)
            total_chars += len(ctx.full_context)

        return "\n---\n".join(combined)
# Registry of all available checkers, keyed by their short name.
CHECKER_REGISTRY = {
    'caption': CaptionChecker,
    'reference': ReferenceChecker,
    'ai_artifacts': AIArtifactsChecker,
    'formatting': FormattingChecker,
    'anonymization': AnonymizationChecker,
    'number': NumberChecker,
    'sentence': SentenceChecker,
    'consistency': ConsistencyChecker,
    'citation_quality': CitationQualityChecker,
    'equation': EquationChecker,
    'acronym': AcronymChecker,
}


def get_checker(name: str) -> BaseChecker:
    """Instantiate and return the checker registered under *name*.

    Raises ValueError if no checker is registered under that name.
    """
    checker_cls = CHECKER_REGISTRY.get(name)
    if checker_cls is None:
        raise ValueError(f"Unknown checker: {name}")
    return checker_cls()


def run_all_checkers(tex_content: str, config: dict = None) -> list:
    """Run every registered checker over *tex_content*.

    Returns the concatenation of all checkers' results, in registry order.
    """
    effective_config = config or {}
    return [
        result
        for checker_cls in CHECKER_REGISTRY.values()
        for result in checker_cls().check(tex_content, effective_config)
    ]
a/src/checkers/__pycache__/anonymization_checker.cpython-313.pyc b/src/checkers/__pycache__/anonymization_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..235ac14399e902e0cd1888e78a3ec1aca756822b Binary files /dev/null and b/src/checkers/__pycache__/anonymization_checker.cpython-313.pyc differ diff --git a/src/checkers/__pycache__/base.cpython-313.pyc b/src/checkers/__pycache__/base.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f8a48378cd6d7d880c2ea0c1635ab2d266402df Binary files /dev/null and b/src/checkers/__pycache__/base.cpython-313.pyc differ diff --git a/src/checkers/__pycache__/caption_checker.cpython-313.pyc b/src/checkers/__pycache__/caption_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ded8600e65ad18259c7d969ba6d3a6a88b64c677 Binary files /dev/null and b/src/checkers/__pycache__/caption_checker.cpython-313.pyc differ diff --git a/src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc b/src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0403cd564b3cb9927f6b0f05c6c51afb8e213a54 Binary files /dev/null and b/src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc differ diff --git a/src/checkers/__pycache__/consistency_checker.cpython-313.pyc b/src/checkers/__pycache__/consistency_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c64657741db59e4627caae647a1ff9e8a725492 Binary files /dev/null and b/src/checkers/__pycache__/consistency_checker.cpython-313.pyc differ diff --git a/src/checkers/__pycache__/equation_checker.cpython-313.pyc b/src/checkers/__pycache__/equation_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d598e60c925c8e5d0c4505d3f97909dbe9e4e818 Binary files /dev/null and 
b/src/checkers/__pycache__/equation_checker.cpython-313.pyc differ diff --git a/src/checkers/__pycache__/formatting_checker.cpython-313.pyc b/src/checkers/__pycache__/formatting_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2aaa1357a5a6800b184ae2f5b6dcae7c2df51cb Binary files /dev/null and b/src/checkers/__pycache__/formatting_checker.cpython-313.pyc differ diff --git a/src/checkers/__pycache__/number_checker.cpython-313.pyc b/src/checkers/__pycache__/number_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af09ad160e7cf0d63d4cd3685ca98973e9175fa0 Binary files /dev/null and b/src/checkers/__pycache__/number_checker.cpython-313.pyc differ diff --git a/src/checkers/__pycache__/reference_checker.cpython-313.pyc b/src/checkers/__pycache__/reference_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c02ac2fa5d31c58c4e9909fecfb0616cd0fca9d9 Binary files /dev/null and b/src/checkers/__pycache__/reference_checker.cpython-313.pyc differ diff --git a/src/checkers/__pycache__/sentence_checker.cpython-313.pyc b/src/checkers/__pycache__/sentence_checker.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00cf2c808194adcccaa032ee526542c5715e25ca Binary files /dev/null and b/src/checkers/__pycache__/sentence_checker.cpython-313.pyc differ diff --git a/src/checkers/acronym_checker.py b/src/checkers/acronym_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..a79955af212923f0476d3c4de853aa05a3b1933c --- /dev/null +++ b/src/checkers/acronym_checker.py @@ -0,0 +1,284 @@ +""" +Acronym and abbreviation checker. 
import re
from typing import List, Dict, Set, Tuple
from collections import defaultdict

from .base import BaseChecker, CheckResult, CheckSeverity


class AcronymChecker(BaseChecker):
    """Check acronym definitions and consistency.

    Flags acronyms that have a plausible full form in the document but are
    either never defined or are used before their definition. Acronyms in
    COMMON_ACRONYMS, or without a matching full form, are ignored.
    """

    name = "acronym"
    display_name = "Acronyms"
    description = "Check acronym definitions and consistent usage"

    # Matches "Full Name (ACRONYM)" or "(ACRONYM; Full Name)", with optional
    # LaTeX formatting (\textbf{...}, \emph{...}, ...) around the acronym.
    DEFINITION_PATTERN = re.compile(
        r'([A-Z][a-zA-Z\s\-]+)\s*\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?\)|'
        r'\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?;\s*([A-Za-z\s\-]+)\)',
        re.MULTILINE
    )

    # Standalone acronyms: three or more capital letters, optional plural "s".
    ACRONYM_PATTERN = re.compile(r'\b([A-Z]{3,}s?)\b')

    # Well-known acronyms that need no in-text definition.
    # (Duplicated entries from the original list were removed; a set literal
    # silently dedupes, so behavior is unchanged.)
    COMMON_ACRONYMS = {
        # Hardware & Computing
        'GPU', 'CPU', 'TPU', 'RAM', 'ROM', 'SSD', 'HDD', 'USB', 'BIOS', 'OS',
        'API', 'SDK', 'IDE', 'GUI', 'CLI', 'URL', 'URI', 'DNS', 'IP', 'TCP',
        'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS', 'VPN', 'LAN', 'WAN',

        # File Formats & Standards
        'PDF', 'HTML', 'CSS', 'XML', 'JSON', 'YAML', 'CSV', 'TSV', 'SQL',
        'UTF', 'ASCII', 'JPEG', 'PNG', 'GIF', 'SVG', 'MP3', 'MP4', 'ZIP',

        # AI & Machine Learning (General)
        'AI', 'ML', 'DL', 'NN', 'ANN', 'DNN', 'CNN', 'RNN', 'LSTM', 'GRU',
        'GAN', 'VAE', 'MLP', 'SVM', 'KNN', 'PCA', 'ICA', 'LDA', 'EM',
        'SGD', 'ADAM', 'RMSPROP', 'ADAGRAD', 'LBFGS',

        # NLP & Language Models
        'NLP', 'LLM', 'GPT', 'BERT', 'BART', 'T5', 'ELECTRA', 'ROBERTA',
        'NER', 'POS', 'QA', 'MT', 'ASR', 'TTS', 'NMT', 'SMT',
        'BLEU', 'ROUGE', 'METEOR', 'CIDEr', 'SPICE', 'WER', 'CER',

        # Computer Vision
        'CV', 'OCR', 'YOLO', 'RCNN', 'FCN', 'UNET', 'RESNET', 'VGG',
        'RGB', 'HSV', 'YUV', 'SIFT', 'SURF', 'ORB', 'HOG', 'SSIM', 'PSNR',

        # Reinforcement Learning
        'RL', 'DQN', 'DDPG', 'PPO', 'A3C', 'TRPO', 'SAC', 'TD3', 'MDP',
        'POMDP', 'RLHF', 'RLAIF',

        # Metrics & Evaluation
        'F1', 'AUC', 'ROC', 'PR', 'MAP', 'NDCG', 'MRR', 'MSE', 'MAE', 'RMSE',
        'MAPE', 'R2', 'IoU', 'AP', 'mAP', 'FPS', 'FLOPs', 'FLOPS',

        # Data & Statistics
        'IID', 'OOD', 'KL', 'JS', 'EMD', 'MMD', 'ELBO', 'VI', 'MCMC',
        'MLE', 'GMM', 'HMM', 'CRF', 'MRF',

        # Academic & Organizations
        'IEEE', 'ACM', 'AAAI', 'IJCAI', 'ICML', 'ICLR', 'NEURIPS', 'NIPS',
        'ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL', 'CVPR', 'ICCV', 'ECCV',
        'SIGIR', 'KDD', 'WWW', 'CIKM', 'WSDM', 'ICDE', 'VLDB', 'SIGMOD',
        'AISTATS', 'UAI', 'COLT', 'ALT',

        # Methods & Techniques (Common in ML papers)
        'SOTA', 'E2E', 'RAG', 'CoT', 'ToT', 'GoT', 'ICL', 'FSL', 'ZSL',
        'PEFT', 'LORA', 'QLORA', 'SFT', 'DPO', 'SPIN', 'URPO', 'SPELL',
        'STaR', 'ReST', 'RRHF', 'RAFT', 'LIMA', 'ORPO',

        # Misc
        'USD', 'EUR', 'GBP', 'EU', 'US', 'UK', 'UN', 'NATO', 'NASA',
        'ID', 'UID', 'UUID', 'MD5', 'SHA', 'AES', 'RSA', 'JWT',
        'CRUD', 'REST', 'SOAP', 'RPC', 'AJAX', 'DOM', 'OOP', 'MVC',
        'CI', 'CD', 'DevOps', 'AWS', 'GCP', 'NPU', 'ASIC', 'FPGA',
    }

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Audit acronym usage in *tex_content* and return any issues found."""
        results = []

        # Comments never count as usages or definitions.
        content = self._remove_comments(tex_content)

        defined_acronyms = self._find_definitions(content)
        all_usages = self._find_all_usages(content)

        # Only acronyms with a plausible spelled-out form are checked.
        acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())

        for acronym, positions in all_usages.items():
            if acronym in self.COMMON_ACRONYMS:
                continue

            # No matching full form anywhere in the document: ignore.
            if acronym not in acronym_full_forms:
                continue

            if acronym not in defined_acronyms:
                # Used but never introduced as "Full Form (ACRONYM)".
                first_pos = positions[0]
                line_num = self._find_line_number(content, first_pos)
                full_form = acronym_full_forms[acronym]

                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Acronym '{acronym}' used without definition (possible full form: '{full_form}')",
                    line_number=line_num,
                    suggestion=f"Define on first use: '{full_form} ({acronym})'"
                ))
            else:
                # Defined, but possibly used before that definition.
                def_pos = defined_acronyms[acronym]
                for pos in positions:
                    if pos < def_pos:
                        line_num = self._find_line_number(content, pos)
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"Acronym '{acronym}' used before definition",
                            line_number=line_num,
                            suggestion="Move definition before first use"
                        ))
                        break  # one warning per acronym is enough

        return results

    def _find_potential_full_forms(self, content: str, acronyms: Set[str]) -> Dict[str, str]:
        """Find a plausible spelled-out form for each acronym, if any.

        Builds, for acronym "ABC", a regex matching a word sequence whose
        initials spell A-B-C, allowing filler words ("of", "the", ...) in
        between; the candidate is then verified by re-extracting initials.
        """
        full_forms = {}

        for acronym in acronyms:
            if acronym in self.COMMON_ACRONYMS:
                continue

            acronym_clean = acronym.rstrip('s')  # drop plural "s"
            if len(acronym_clean) < 3:
                continue

            pattern_parts = []
            for i, letter in enumerate(acronym_clean):
                if i == 0:
                    # First word must start with the first letter.
                    pattern_parts.append(f'{letter}[a-z]+')
                else:
                    # BUGFIX: the original filler group was
                    # (?:\s+(?:of|the|...)\s+)?\s+X[a-z]+ — the filler's own
                    # trailing \s+ plus the mandatory \s+ demanded *two*
                    # whitespace runs, so full forms containing fillers
                    # ("United States of America") could never match.
                    pattern_parts.append(
                        f'(?:\\s+(?:of|the|and|for|in|on|with|to))?\\s+{letter}[a-z]+'
                    )

            full_pattern = r'\b' + ''.join(pattern_parts) + r'\b'

            try:
                matches = re.finditer(full_pattern, content, re.IGNORECASE)
                for match in matches:
                    candidate = match.group(0)

                    # Reject candidates containing sentence-function words —
                    # those indicate a chance match inside running text.
                    excluded_words = {
                        'that', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                        'or', 'not', 'no', 'yes', 'if', 'but', 'as', 'at', 'by', 'from',
                        'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
                        'can', 'could', 'may', 'might', 'must', 'shall',
                        'this', 'these', 'those', 'such', 'which', 'what', 'who', 'when', 'where',
                        'how', 'why', 'all', 'each', 'every', 'some', 'any', 'many', 'much',
                        'more', 'most', 'less', 'few', 'several', 'other', 'another'
                    }

                    candidate_words = re.findall(r'\b[A-Za-z]+\b', candidate.lower())
                    if any(word in excluded_words for word in candidate_words):
                        continue

                    # Verify: initials of the non-filler words must spell the acronym.
                    words = re.findall(r'\b[A-Z][a-z]+', candidate, re.IGNORECASE)
                    filler_words = {'of', 'and', 'for', 'in', 'on', 'with', 'to', 'a', 'an'}
                    meaningful_words = [w for w in words if w.lower() not in filler_words]

                    if len(meaningful_words) >= len(acronym_clean):
                        first_letters = ''.join(
                            w[0].upper() for w in meaningful_words[:len(acronym_clean)]
                        )
                        if first_letters == acronym_clean:
                            full_forms[acronym] = candidate
                            break  # first verified match wins
            except re.error:
                # Acronym produced an invalid pattern; skip it.
                continue

        return full_forms

    def _find_definitions(self, content: str) -> Dict[str, int]:
        """Map each defined acronym to the position of its definition."""
        definitions = {}

        for match in self.DEFINITION_PATTERN.finditer(content):
            # Group 2: "Full Name (ABC)" form; group 3: "(ABC; Full Name)" form.
            acronym = match.group(2) or match.group(3)
            if acronym:
                acronym = acronym.rstrip('s')  # drop plural "s"
                definitions[acronym] = match.start()

        return definitions

    def _find_all_usages(self, content: str) -> Dict[str, List[int]]:
        """Map each acronym to all its usage positions, skipping special contexts."""
        usages = defaultdict(list)

        for match in self.ACRONYM_PATTERN.finditer(content):
            acronym = match.group(1).rstrip('s')
            pos = match.start()

            if self._is_in_special_context(content, pos, acronym):
                continue

            usages[acronym].append(pos)

        return usages

    def _is_in_special_context(self, content: str, pos: int, acronym: str) -> bool:
        """True if the acronym occurrence should not count as a usage.

        Skips definitions "(ABC)", LaTeX commands/labels/refs/URLs, math
        mode, command arguments "[ABC]", and file paths.
        """
        start = max(0, pos - 50)
        end = min(len(content), pos + len(acronym) + 50)
        before = content[start:pos]
        after = content[pos + len(acronym):end]

        # Inside definition parentheses: (ACRONYM)
        if before.endswith('(') and after.startswith(')'):
            return True

        # Inside a LaTeX command: \ACRONYM
        if before.rstrip().endswith('\\'):
            return True

        # Inside a label: \label{...:ACRONYM...}
        if r'\label{' in before[-20:] and '}' in after[:20]:
            return True

        # Inside a ref/cite argument: \ref{...ACRONYM
        if re.search(r'\\(?:ref|cite|autoref|cref|eqref)\{[^}]*$', before[-30:]):
            return True

        # Inside a URL: \url{...} or a raw http link
        if r'\url{' in before[-20:] or 'http' in before[-20:]:
            return True

        # Inside inline math: an odd number of unescaped '$' before us.
        dollar_count = before.count('$') - before.count(r'\$')
        if dollar_count % 2 == 1:
            return True

        # Inside a recently-opened (and not yet closed) display-math environment.
        if re.search(r'\\begin\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
            if not re.search(r'\\end\{(?:equation|align|gather|math|displaymath)\*?\}', before[-100:]):
                return True

        # Inside an optional command argument: [ACRONYM]
        if before.endswith('[') and after.startswith(']'):
            return True

        # Part of a file path or extension.
        if '.' in before[-5:] or '/' in before[-10:]:
            return True

        return False
+ +Detects leftover text from AI writing assistants that should be removed +before submission, such as: +- Conversational responses ("Sure, here is...") +- Placeholder text +- Markdown formatting artifacts +- Common AI response patterns +""" +import re +from typing import List, Tuple + +from .base import BaseChecker, CheckResult, CheckSeverity + + +class AIArtifactsChecker(BaseChecker): + """Detect AI-generated text artifacts that should be removed.""" + + name = "ai_artifacts" + display_name = "AI Artifacts" + description = "Detect leftover AI assistant text and placeholders" + + # Conversational AI patterns (case insensitive) + # These are phrases that clearly indicate a dialogue between user and AI assistant + AI_CONVERSATION_PATTERNS = [ + # Responses to requests + (r'\bsure[,!]?\s*(here\s+is|i\'ll|i\s+will|let\s+me)\b', "Conversational AI response"), + (r'\bi\'?d\s+be\s+happy\s+to\b', "Conversational AI response"), + (r'\bi\'?m\s+happy\s+to\s+help\b', "Conversational AI response"), + (r'\bcertainly[!,]\s*here\b', "Conversational AI response"), + (r'\bof\s+course[!,]\s*(here|i)\b', "Conversational AI response"), + (r'\babsolutely[!,]\s*(here|let\s+me)\b', "Conversational AI response"), + + # Self-identification + (r'\bas\s+an?\s+ai\s+(language\s+)?model\b', "AI self-reference"), + (r'\bas\s+a\s+large\s+language\s+model\b', "AI self-reference"), + (r'\bmy\s+knowledge\s+cutoff\b', "AI knowledge cutoff reference"), + + # Explanatory transitions typical of chat + (r'\blet\s+me\s+(explain|help|clarify|break\s+this\s+down)\b', "Conversational AI response"), + (r'\bhere\'?s\s+(a|an|the|my)\s+(revised|updated|improved|rewrite)\b', "Conversational AI response"), + (r'\bhere\s+is\s+(the|a|an)\s+(summary|breakdown|explanation|code|example)\b', "Conversational AI response"), + + # Closing/Politeness + (r'\bhope\s+this\s+helps\b', "Conversational AI closing"), + (r'\bfeel\s+free\s+to\s+ask\b', "Conversational AI closing"), + (r'\blet\s+me\s+know\s+if\b', "Conversational AI 
closing"), + (r'\bthank\s+you\s+for\s+(asking|your\s+question)\b', "Conversational AI response"), + (r'\bgreat\s+question[!,]?\b', "Conversational AI response"), + (r'\b(excellent|good|great)\s+point\b', "Conversational AI response"), + + # Instructions/Meta-commentary + (r'\bbased\s+on\s+the\s+information\s+provided\b', "Conversational AI response"), + (r'\b(remember|note)\s+that\b', "Conversational AI instruction"), + (r'\bplease\s+note\s+that\b', "Conversational AI instruction"), + ] + + # Placeholder patterns + PLACEHOLDER_PATTERNS = [ + (r'\[insert\s+[^\]]+\s*here\]', "Placeholder text"), + (r'\[add\s+[^\]]+\]', "Placeholder text"), + (r'\[todo[:\s][^\]]*\]', "TODO placeholder"), + (r'\btodo\s*:\s*.{0,50}', "TODO comment"), + (r'\bfixme\s*:\s*.{0,50}', "FIXME comment"), + (r'\bxxx\b', "XXX placeholder"), + (r'\byour[\s_-]*(name|email|institution|university)\b', "Placeholder for personal info"), + (r'author[\s_-]*name', "Author name placeholder"), + (r'your\.?email@example\.com', "Email placeholder"), + (r'example@(example\.com|university\.edu)', "Email placeholder"), + (r'\[citation\s+needed\]', "Citation needed placeholder"), + ] + + # Markdown artifacts (should not appear in LaTeX) + MARKDOWN_PATTERNS = [ + (r'^\s*#{1,6}\s+\w', "Markdown header"), + (r'\*\*[^*]+\*\*', "Markdown bold"), + (r'(? 
List[CheckResult]: + results = [] + lines = tex_content.split('\n') + + # Track if we are inside a verbatim-like environment + in_verbatim = False + verbatim_envs = ['verbatim', 'lstlisting', 'minted', 'comment', 'raw', 'filecontents', 'tcolorbox'] + + # Check each line + for line_num, line in enumerate(lines, 1): + # Check for environment boundaries + # Handle \begin{env} + if re.search(r'\\begin\{(' + '|'.join(verbatim_envs) + r')\*?\}', line): + in_verbatim = True + continue # Skip the begin line itself + + # Handle \end{env} + if re.search(r'\\end\{(' + '|'.join(verbatim_envs) + r')\*?\}', line): + in_verbatim = False + continue # Skip the end line itself + + # Skip checks if inside verbatim environment + if in_verbatim: + continue + + # Skip commented lines using base class method + if self._is_comment_line(line): + continue + + # Remove inline comments for checking using base class method + line_to_check = self._remove_line_comment(line) + + # Check AI conversation patterns + for pattern, description in self.AI_CONVERSATION_PATTERNS: + if re.search(pattern, line_to_check, re.IGNORECASE): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.ERROR, + message=f"{description} detected", + line_number=line_num, + line_content=line.strip()[:100], + suggestion="Remove AI-generated conversational text" + )) + break # One match per line for this category + + # Check placeholder patterns + for pattern, description in self.PLACEHOLDER_PATTERNS: + match = re.search(pattern, line_to_check, re.IGNORECASE) + if match: + results.append(self._create_result( + passed=False, + severity=CheckSeverity.WARNING, + message=f"{description}: '{match.group(0)[:50]}'", + line_number=line_num, + line_content=line.strip()[:100], + suggestion="Replace placeholder with actual content or remove" + )) + + # Check Markdown patterns (less strict - might be intentional in some cases) + for pattern, description in self.MARKDOWN_PATTERNS: + # Skip if line looks like a 
LaTeX command (starts with \) + if line_to_check.strip().startswith('\\'): + continue + + # Special handling for bullet points: ensure space after + if "bullet point" in description: + # Skip if it looks like a math subtraction or negative number + if re.search(r'[-+]\d', line_to_check): + continue + # Skip if inside math mode (simple heuristic) + if '$' in line_to_check: + continue + + # Special handling for italics: avoid matching math mode like $x*y$ + if "italic" in description: + if '$' in line_to_check: + continue + + if re.search(pattern, line_to_check): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message=f"Possible {description} in LaTeX", + line_number=line_num, + line_content=line.strip()[:100], + suggestion="Convert to LaTeX formatting or remove if unintentional" + )) + + return results diff --git a/src/checkers/anonymization_checker.py b/src/checkers/anonymization_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..8c17e53510ffc4954b128a355b79ec95c37b727c --- /dev/null +++ b/src/checkers/anonymization_checker.py @@ -0,0 +1,216 @@ +""" +Anonymization checker. 
"""
Anonymization checker.

For double-blind review submissions, checks for:
- Author name leaks in acknowledgments
- Personal URLs (GitHub, personal pages)
- Self-citations that reveal identity
- Institutional information in comments
"""
import re
from typing import List

from .base import BaseChecker, CheckResult, CheckSeverity


class AnonymizationChecker(BaseChecker):
    """Check for anonymization issues in double-blind submissions.

    The checker first decides whether the document looks like a review
    (anonymous) version; if it looks camera-ready, all anonymization
    checks are skipped and a single informational result is returned.
    """

    name = "anonymization"
    display_name = "Anonymization"
    description = "Detect potential identity leaks in double-blind submissions"

    # Patterns for identity-revealing content.
    # Each entry is (regex, human-readable description); matched case-insensitively.
    PERSONAL_URL_PATTERNS = [
        (r'github\.com/[a-zA-Z0-9_-]+', "GitHub profile URL"),
        (r'gitlab\.com/[a-zA-Z0-9_-]+', "GitLab profile URL"),
        (r'twitter\.com/[a-zA-Z0-9_]+', "Twitter profile URL"),
        (r'linkedin\.com/in/[a-zA-Z0-9_-]+', "LinkedIn profile URL"),
        (r'huggingface\.co/[a-zA-Z0-9_-]+', "HuggingFace profile URL"),
        (r'~[a-zA-Z]+/', "Personal university page"),
        (r'people\.[a-zA-Z]+\.edu', "Academic personal page"),
        (r'homes\.[a-zA-Z]+\.(edu|ac\.[a-z]+)', "Academic home page"),
    ]

    # Anonymous submission indicators (should be present)
    # NOTE(review): currently unused by check(); kept for future use — confirm.
    ANONYMOUS_MARKERS = [
        r'\\author\{[^}]*anonymous[^}]*\}',
        r'anonymous\s+submission',
        r'\\runningauthor\{[^}]*\}',  # Should be empty or generic
    ]

    # Potentially revealing patterns (first-person references to prior work).
    SELF_CITE_PATTERNS = [
        r'\\cite[pt]?\{[^}]*\}\s*(?:show|demonstrate|propose|present|introduce)',
        r'(?:our|we)\s+(?:previous|prior|earlier)\s+(?:work|paper|study)',
        r'(?:as\s+)?(?:we|the\s+authors?)\s+(?:have\s+)?(?:shown|demonstrated|proved)',
    ]

    # Acknowledgment patterns: matches \section{Acknowledgment...} (optionally
    # starred) or a \begin{ack...} environment, case-insensitively.
    ACK_PATTERN = re.compile(
        r'\\(?:section\*?\{acknowledgment|begin\{ack)',
        re.IGNORECASE
    )

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all anonymization checks on the TeX source.

        Args:
            tex_content: Full TeX file content.
            config: Unused; accepted for interface compatibility.

        Returns:
            List of CheckResult; a single passing INFO result if the
            document looks camera-ready.
        """
        results = []
        lines = tex_content.split('\n')

        # Check if this is a review submission (look for anonymous author)
        is_review_version = self._is_review_version(tex_content)

        if not is_review_version:
            # If camera-ready, skip anonymization checks
            results.append(self._create_result(
                passed=True,
                severity=CheckSeverity.INFO,
                message="Document appears to be camera-ready version (not checking anonymization)"
            ))
            return results

        # Check for personal URLs
        for line_num, line in enumerate(lines, 1):
            # Skip comments, but still check for leaks in comments!
            # (Sources are often uploaded with the submission, so even
            # commented-out URLs can leak identity — reported as WARNING.)
            if self._is_comment_line(line):
                for pattern, desc in self.PERSONAL_URL_PATTERNS:
                    if re.search(pattern, line, re.IGNORECASE):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.WARNING,
                            message=f"{desc} in comment (could be revealed when compiling)",
                            line_number=line_num,
                            line_content=line.strip()[:100],
                            suggestion="Remove or anonymize URL even in comments"
                        ))
                continue

            # Non-comment lines: same URL patterns, but at ERROR severity.
            for pattern, desc in self.PERSONAL_URL_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.ERROR,
                        message=f"{desc} may reveal author identity",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Replace with anonymized URL or remove for review"
                    ))

        # Check acknowledgments section
        ack_results = self._check_acknowledgments(tex_content, lines)
        results.extend(ack_results)

        # Check for self-revealing citations
        for line_num, line in enumerate(lines, 1):
            # Skip comments using base class method
            if self._is_comment_line(line):
                continue

            for pattern in self.SELF_CITE_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message="Potentially self-revealing citation pattern",
                        line_number=line_num,
                        line_content=line.strip()[:100],
                        suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
                    ))

        # Check for \author content
        author_results = self._check_author_field(tex_content)
        results.extend(author_results)

        return results

    def _is_review_version(self, content: str) -> bool:
        """Detect if this is a review (anonymous) version.

        Only the first 2000 characters are scanned, on the assumption that
        class options and \\author appear in the preamble. The bare
        r'review' indicator is intentionally broad and may over-trigger.
        Defaults to True (review) when no indicator matches — the safer
        choice, since false positives merely add warnings.
        """
        # Check for common anonymous submission markers
        review_indicators = [
            r'review',
            r'submitted\s+to',
            r'under\s+review',
            r'anonymous',
            r'\\usepackage\[review\]',
        ]

        for indicator in review_indicators:
            if re.search(indicator, content[:2000], re.IGNORECASE):
                return True

        # Check for camera-ready indicators (negative)
        camera_indicators = [
            r'\\usepackage\[accepted\]',
            r'\\usepackage\[final\]',
            r'camera[\s-]?ready',
        ]

        for indicator in camera_indicators:
            if re.search(indicator, content[:2000], re.IGNORECASE):
                return False

        # Default to review version (safer)
        return True

    def _check_acknowledgments(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check acknowledgments section for identity leaks.

        Flags an acknowledgments section only if the line introducing it
        is not commented out. Only the first occurrence is examined.
        """
        results = []

        # Find acknowledgment section
        ack_match = self.ACK_PATTERN.search(content)
        if not ack_match:
            return results

        # Find the line number
        ack_line = self._find_line_number(content, ack_match.start())

        # Check if it's commented out
        actual_line = lines[ack_line - 1] if ack_line <= len(lines) else ""
        if not actual_line.lstrip().startswith('%'):
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Acknowledgments section found - should be commented out for review",
                line_number=ack_line,
                suggestion="Comment out acknowledgments with % for anonymous submission"
            ))

        return results

    def _check_author_field(self, content: str) -> List[CheckResult]:
        """Check \\author{} field for revealing content.

        Extracts the brace-balanced argument of the FIRST \\author command
        only, and reports at most one ERROR if it contains neither an
        'anonymous' marker nor an anonymization command but does contain
        something shaped like a personal name (two capitalized words).
        """
        results = []

        # Find \author{...} - handle multiline
        author_pattern = re.compile(r'\\author\s*\{', re.DOTALL)
        match = author_pattern.search(content)

        if match:
            # Extract author content (handle nested braces)
            # Manual brace counting: regex alone cannot match nested {...}.
            start = match.end()
            brace_count = 1
            i = start
            while i < len(content) and brace_count > 0:
                if content[i] == '{':
                    brace_count += 1
                elif content[i] == '}':
                    brace_count -= 1
                i += 1

            # i now sits one past the closing brace (or at EOF if unbalanced).
            author_content = content[start:i-1]
            line_num = self._find_line_number(content, match.start())

            # Check if author content looks anonymous
            if not re.search(r'anonymous|author\s*names?\s*hidden', author_content, re.IGNORECASE):
                # Check if it's not using \Anonymous or similar
                if not re.search(r'\\(Anonymous|blindauthor)', author_content):
                    # Might contain real author info
                    # (two adjacent capitalized words, e.g. "Jane Smith")
                    if re.search(r'[A-Z][a-z]+\s+[A-Z][a-z]+', author_content):
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.ERROR,
                            message="Author field may contain real names",
                            line_number=line_num,
                            suggestion="Replace with 'Anonymous' or use anonymization command"
                        ))

        return results
"""
Base checker class for paper submission quality checks.

All specific checkers inherit from BaseChecker and implement
the check() method to validate specific aspects of the TeX document.
"""
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Tuple


class CheckSeverity(Enum):
    """Severity levels for check results."""
    ERROR = "error"      # Must fix before submission
    WARNING = "warning"  # Strongly recommended to fix
    INFO = "info"        # Suggestion or best practice


@dataclass
class CheckResult:
    """Result of a single check."""
    checker_name: str
    passed: bool
    severity: CheckSeverity
    message: str
    line_number: Optional[int] = None
    line_content: Optional[str] = None
    suggestion: Optional[str] = None
    file_path: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize to a plain dict; severity is flattened to its string value."""
        return {
            'checker': self.checker_name,
            'passed': self.passed,
            'severity': self.severity.value,
            'message': self.message,
            'line': self.line_number,
            'content': self.line_content,
            'suggestion': self.suggestion,
            'file_path': self.file_path
        }


class BaseChecker(ABC):
    """
    Abstract base class for all paper submission checkers.

    Each checker validates a specific aspect of the paper,
    such as caption placement, reference integrity, or formatting.
    """

    # Checker metadata - override in subclasses
    name: str = "base"
    display_name: str = "Base Checker"
    description: str = "Base checker class"

    @abstractmethod
    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """
        Run the check on the given TeX content.

        Args:
            tex_content: The full content of the TeX file
            config: Optional configuration dict (e.g., conference-specific settings)

        Returns:
            List of CheckResult objects describing found issues
        """
        pass

    def _remove_comments(self, content: str) -> str:
        """
        Remove all LaTeX comments from content.

        Preserves line structure (replaces comment with empty string on same line).
        Handles escaped percent signs (\\%) correctly.
        """
        # Delegate per-line so escape handling lives in one place.
        return '\n'.join(
            self._remove_line_comment(line) for line in content.split('\n')
        )

    def _remove_line_comment(self, line: str) -> str:
        """Remove comment from a single line, preserving content before %.

        A '%' counts as a comment start only when preceded by an even
        number of backslashes (i.e., it is not escaped as \\%).
        """
        i = 0
        while i < len(line):
            if line[i] == '%':
                # Count consecutive backslashes immediately before the %
                num_backslashes = 0
                j = i - 1
                while j >= 0 and line[j] == '\\':
                    num_backslashes += 1
                    j -= 1
                if num_backslashes % 2 == 0:
                    # Not escaped, this is a comment start
                    return line[:i]
            i += 1
        return line

    def _is_comment_line(self, line: str) -> bool:
        """Check if a line is entirely a comment (starts with %).

        Blank lines are NOT comment lines (startswith on '' is False).
        """
        return line.lstrip().startswith('%')

    def _get_non_comment_lines(self, content: str) -> List[Tuple[int, str]]:
        """
        Get all non-comment lines with their line numbers.

        Returns:
            List of (line_number, line_content) tuples for non-comment lines.
            Line content has inline comments removed.
        """
        lines = content.split('\n')
        result = []

        for line_num, line in enumerate(lines, 1):
            # Skip pure comment lines
            if self._is_comment_line(line):
                continue

            # Remove inline comments
            cleaned = self._remove_line_comment(line)

            # Skip if nothing left after removing comment
            if not cleaned.strip():
                continue

            result.append((line_num, cleaned))

        return result

    def _find_line_number(self, content: str, position: int) -> int:
        """Find line number (1-based) for a character position in content."""
        return content[:position].count('\n') + 1

    def _get_line_content(self, content: str, line_number: int) -> str:
        """Get the (stripped) content of a specific 1-based line, '' if out of range."""
        lines = content.split('\n')
        if 1 <= line_number <= len(lines):
            return lines[line_number - 1].strip()
        return ""

    def _is_commented(self, content: str, position: int) -> bool:
        """Check if a position is within a LaTeX comment.

        True iff an unescaped % occurs earlier on the same line.
        """
        # Find the start of the current line
        line_start = content.rfind('\n', 0, position) + 1
        line_before = content[line_start:position]

        # Reuse the single escape-aware scanner: if removing the comment
        # shortens the prefix, an unescaped % precedes this position.
        return self._remove_line_comment(line_before) != line_before

    def _create_result(
        self,
        passed: bool,
        severity: CheckSeverity,
        message: str,
        line_number: Optional[int] = None,
        line_content: Optional[str] = None,
        suggestion: Optional[str] = None
    ) -> CheckResult:
        """Helper to create a CheckResult with this checker's name."""
        return CheckResult(
            checker_name=self.name,
            passed=passed,
            severity=severity,
            message=message,
            line_number=line_number,
            line_content=line_content,
            suggestion=suggestion
        )
"""
Caption placement checker.

Validates that:
- Table captions appear ABOVE the table content
- Figure captions appear BELOW the figure content
"""
import re
from typing import List, Optional

from .base import BaseChecker, CheckResult, CheckSeverity


class CaptionChecker(BaseChecker):
    """Check for correct caption placement in tables and figures."""

    name = "caption"
    display_name = "Caption Placement"
    description = "Verify table captions are above and figure captions are below"

    # Patterns for environments (group 1 captures the environment body)
    TABLE_ENV_PATTERN = re.compile(
        r'\\begin\{table\*?\}(.*?)\\end\{table\*?\}',
        re.DOTALL | re.IGNORECASE
    )
    FIGURE_ENV_PATTERN = re.compile(
        r'\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}',
        re.DOTALL | re.IGNORECASE
    )

    # Content patterns
    CAPTION_PATTERN = re.compile(r'\\caption\s*[\[{]')
    TABULAR_PATTERN = re.compile(r'\\begin\{tabular')
    INCLUDEGRAPHICS_PATTERN = re.compile(r'\\includegraphics')
    TIKZ_PATTERN = re.compile(r'\\begin\{tikzpicture\}')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Check caption placement in all table and figure environments.

        Args:
            tex_content: The full content of the TeX file
            config: Unused; accepted for interface compatibility.

        Returns:
            List of CheckResult objects (one per misplaced/missing caption).
        """
        results = []

        # Check table environments
        for match in self.TABLE_ENV_PATTERN.finditer(tex_content):
            # Skip if the \begin{table} itself is commented out
            if self._is_commented(tex_content, match.start()):
                continue

            # match.start(1) is where the environment BODY begins; offsets
            # found inside group(1) must be added to it (not to match.start())
            # to get correct absolute positions / line numbers.
            result = self._check_table_caption(
                match.group(1), tex_content, match.start(), match.start(1)
            )
            if result:
                results.append(result)

        # Check figure environments
        for match in self.FIGURE_ENV_PATTERN.finditer(tex_content):
            # Skip if commented
            if self._is_commented(tex_content, match.start()):
                continue

            result = self._check_figure_caption(
                match.group(1), tex_content, match.start(), match.start(1)
            )
            if result:
                results.append(result)

        return results

    def _check_table_caption(
        self, env_content: str, full_content: str, env_start: int, content_start: int
    ) -> Optional[CheckResult]:
        """Check that table caption is above tabular content.

        Args:
            env_content: Body of the table environment (regex group 1).
            full_content: Whole TeX file (for line-number lookup).
            env_start: Absolute position of \\begin{table}.
            content_start: Absolute position where env_content begins.
        """
        caption_match = self.CAPTION_PATTERN.search(env_content)
        tabular_match = self.TABULAR_PATTERN.search(env_content)

        if not caption_match:
            line_num = self._find_line_number(full_content, env_start)
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Table environment missing caption",
                line_number=line_num,
                suggestion="Add \\caption{} before \\begin{tabular}"
            )

        if not tabular_match:
            # Table without tabular content - skip
            return None

        # Caption should come BEFORE tabular
        if caption_match.start() > tabular_match.start():
            # Bug fix: offsets inside env_content are relative to
            # content_start, not env_start (which points at \begin{table}).
            line_num = self._find_line_number(
                full_content, content_start + caption_match.start()
            )
            return self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message="Table caption should be placed ABOVE the table content",
                line_number=line_num,
                line_content=self._get_line_content(full_content, line_num),
                suggestion="Move \\caption{} before \\begin{tabular}"
            )

        return None

    def _check_figure_caption(
        self, env_content: str, full_content: str, env_start: int, content_start: int
    ) -> Optional[CheckResult]:
        """Check that figure caption is below image content.

        Args mirror _check_table_caption. Content is the first
        \\includegraphics or tikzpicture found in the environment.
        """
        caption_match = self.CAPTION_PATTERN.search(env_content)
        graphics_match = self.INCLUDEGRAPHICS_PATTERN.search(env_content)
        tikz_match = self.TIKZ_PATTERN.search(env_content)

        # Find the actual content (either graphics or tikz)
        content_match = graphics_match or tikz_match

        if not caption_match:
            line_num = self._find_line_number(full_content, env_start)
            return self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message="Figure environment missing caption",
                line_number=line_num,
                suggestion="Add \\caption{} after \\includegraphics"
            )

        if not content_match:
            # Figure without graphics/tikz - could be custom content, skip
            return None

        # Caption should come AFTER content
        if caption_match.start() < content_match.start():
            # Same offset fix as for tables: anchor at content_start.
            line_num = self._find_line_number(
                full_content, content_start + caption_match.start()
            )
            return self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message="Figure caption should be placed BELOW the figure content",
                line_number=line_num,
                line_content=self._get_line_content(full_content, line_num),
                suggestion="Move \\caption{} after \\includegraphics"
            )

        return None
+ def _check_old_citations_in_text(self, content: str) -> List[CheckResult]: + """Look for citations with old years visible in text.""" + results = [] + lines = content.split('\n') + + # Pattern for citations with year, like "Smith et al. (2010)" or "(Smith, 2010)" + year_pattern = re.compile( + r'(?:\([^)]*(?:19[89]\d|20[01]\d)[^)]*\)|' # Parenthetical + r'\b(?:19[89]\d|20[01]\d)\b)', # Standalone year + re.IGNORECASE + ) + + old_years_found = set() + + for line_num, line in enumerate(lines, 1): + # Skip comments using base class method + if self._is_comment_line(line): + continue + + for match in year_pattern.finditer(line): + year_str = re.search(r'(19[89]\d|20[01]\d)', match.group()) + if year_str: + year = int(year_str.group()) + age = self.CURRENT_YEAR - year + + if age >= self.OLD_CITATION_YEARS and year not in old_years_found: + old_years_found.add(year) + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message=f"Citation from {year} ({age} years old)", + line_number=line_num, + suggestion=f"Consider if there's more recent work on this topic" + )) + + return results + + def _check_citation_formatting(self, content: str) -> List[CheckResult]: + """Check for common citation formatting issues.""" + results = [] + lines = content.split('\n') + + for line_num, line in enumerate(lines, 1): + if line.lstrip().startswith('%'): + continue + + # Check for "et al" without period + if re.search(r'\bet al\b(?!\.)', line): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.WARNING, + message="'et al' should be 'et al.'", + line_number=line_num, + suggestion="Add period after 'et al.'" + )) + + # Check for "[1]" style citations (might want natbib style) + # Skip if it's a command definition or argument + if re.search(r'\[\d+\]', line): + # Skip if in command definition + if '\\newcommand' in line or '\\renewcommand' in line or '\\def' in line: + continue + # Skip if it's clearly a command argument like [1] 
in \newcommand{\foo}[1] + if re.search(r'\\[a-zA-Z]+\[\d+\]', line): + continue + # Only flag if it looks like actual citation in text + if '\\cite' not in line and not re.search(r'\\[a-zA-Z]+\{', line[:20]): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message="Numeric citation style detected", + line_number=line_num, + suggestion="Consider author-year style for better readability" + )) + + # Check for hardcoded citations instead of \cite + if re.search(r'\([A-Z][a-z]+(?:\s+et\s+al\.?)?,?\s*\d{4}\)', line): + if '\\cite' not in line: + results.append(self._create_result( + passed=False, + severity=CheckSeverity.WARNING, + message="Appears to be hardcoded citation instead of \\cite", + line_number=line_num, + line_content=line.strip()[:80], + suggestion="Use \\cite{} for proper bibliography management" + )) + + return results diff --git a/src/checkers/consistency_checker.py b/src/checkers/consistency_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..14849b9b71ed715864b33b0fcda2edfd0f99935d --- /dev/null +++ b/src/checkers/consistency_checker.py @@ -0,0 +1,254 @@ +""" +Terminology consistency checker. 
"""
Terminology consistency checker.

Validates:
- Consistent spelling of the same term
- Consistent hyphenation
- Consistent capitalization of technical terms
"""
import re
from typing import List, Dict, Set
from collections import defaultdict

from .base import BaseChecker, CheckResult, CheckSeverity


class ConsistencyChecker(BaseChecker):
    """Check terminology and spelling consistency."""

    name = "consistency"
    display_name = "Consistency"
    description = "Check for inconsistent terminology and spelling"

    # Known variant pairs (canonical -> variants)
    KNOWN_VARIANTS = {
        # Hyphenation variants
        'self-supervised': ['self supervised', 'selfsupervised'],
        'pre-trained': ['pre trained', 'pretrained'],
        'fine-tuned': ['fine tuned', 'finetuned'],
        'state-of-the-art': ['state of the art', 'stateoftheart'],
        'real-world': ['real world', 'realworld'],
        'end-to-end': ['end to end', 'endtoend', 'e2e'],
        'large-scale': ['large scale', 'largescale'],
        'long-term': ['long term', 'longterm'],
        'short-term': ['short term', 'shortterm'],
        'multi-task': ['multi task', 'multitask'],
        'multi-modal': ['multi modal', 'multimodal'],
        'cross-lingual': ['cross lingual', 'crosslingual'],
        'zero-shot': ['zero shot', 'zeroshot'],
        'few-shot': ['few shot', 'fewshot'],
        'in-context': ['in context', 'incontext'],

        # American vs British English (comprehensive list)
        # -or/-our endings
        'color': ['colour'],
        'behavior': ['behaviour'],
        'favor': ['favour'],
        'honor': ['honour'],
        'labor': ['labour'],
        'neighbor': ['neighbour'],
        'rumor': ['rumour'],
        'vapor': ['vapour'],

        # -ize/-ise endings
        'analyze': ['analyse'],
        'characterize': ['characterise'],
        'generalize': ['generalise'],
        'initialize': ['initialise'],
        'maximize': ['maximise'],
        'minimize': ['minimise'],
        'normalize': ['normalise'],
        'optimize': ['optimise'],
        'organize': ['organise'],
        'realize': ['realise'],
        'recognize': ['recognise'],
        'specialize': ['specialise'],
        'standardize': ['standardise'],
        'summarize': ['summarise'],
        'utilize': ['utilise'],
        'visualize': ['visualise'],
        'categorize': ['categorise'],
        'emphasize': ['emphasise'],
        'hypothesize': ['hypothesise'],
        'prioritize': ['prioritise'],
        'synchronize': ['synchronise'],

        # -ization/-isation endings
        'generalization': ['generalisation'],
        'initialization': ['initialisation'],
        'maximization': ['maximisation'],
        'minimization': ['minimisation'],
        'normalization': ['normalisation'],
        'optimization': ['optimisation'],
        'organization': ['organisation'],
        'realization': ['realisation'],
        'regularization': ['regularisation'],
        'specialization': ['specialisation'],
        'standardization': ['standardisation'],
        'summarization': ['summarisation'],
        'utilization': ['utilisation'],
        'visualization': ['visualisation'],
        'categorization': ['categorisation'],
        'characterization': ['characterisation'],
        'parametrization': ['parametrisation'],
        'quantization': ['quantisation'],

        # -er/-re endings
        'center': ['centre'],
        'fiber': ['fibre'],
        'meter': ['metre'],
        'liter': ['litre'],

        # -l-/-ll- (American single, British double)
        'modeling': ['modelling'],
        'labeled': ['labelled'],
        'labeling': ['labelling'],
        'traveled': ['travelled'],
        'traveling': ['travelling'],
        'canceled': ['cancelled'],
        'canceling': ['cancelling'],
        'signaled': ['signalled'],
        'signaling': ['signalling'],

        # -og/-ogue endings
        'analog': ['analogue'],
        'catalog': ['catalogue'],
        'dialog': ['dialogue'],

        # -ense/-ence endings
        'defense': ['defence'],
        'license': ['licence'],
        'offense': ['offence'],

        # Other common differences
        'gray': ['grey'],
        'artifact': ['artefact'],
        'program': ['programme'],  # Note: 'program' is standard in computing
        'skeptical': ['sceptical'],
        'aluminum': ['aluminium'],

        # Verb forms
        'learned': ['learnt'],
        'burned': ['burnt'],
        'spelled': ['spelt'],

        # Common term variants
        'dataset': ['data set', 'data-set'],
        'benchmark': ['bench mark', 'bench-mark'],
        'baseline': ['base line', 'base-line'],
        'downstream': ['down stream', 'down-stream'],
        'upstream': ['up stream', 'up-stream'],
        'encoder': ['en-coder'],
        'decoder': ['de-coder'],
    }

    # Capitalization variants to track
    CAPITALIZATION_TERMS = [
        'transformer', 'attention', 'bert', 'gpt', 'lstm', 'cnn', 'rnn',
        'encoder', 'decoder', 'embedding', 'softmax', 'sigmoid', 'relu',
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all consistency checks on the TeX source.

        Args:
            tex_content: Full TeX file content.
            config: Unused; accepted for interface compatibility.

        Returns:
            List of CheckResult (document-level; no line numbers).
        """
        results = []

        # Remove comments so commented-out text is not analyzed.
        # Simple heuristic: an unescaped % starts a comment. (Does not
        # handle the rare '\\%' double-backslash-then-comment case.)
        content = re.sub(r'(?<!\\)%.*', '', tex_content)

        # Flag any known term whose canonical form and variant(s) co-occur.
        for canonical, variants in self.KNOWN_VARIANTS.items():
            found_forms = [
                form for form in [canonical] + variants
                if re.search(rf'\b{re.escape(form)}\b', content, re.IGNORECASE)
            ]
            if len(found_forms) > 1:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Inconsistent spelling: {', '.join(found_forms)}",
                    suggestion=f"Use '{canonical}' consistently throughout"
                ))

        # Check hyphenated word consistency
        results.extend(self._check_hyphenation_consistency(content))

        # Check capitalization consistency
        results.extend(self._check_capitalization_consistency(content))

        return results

    def _check_hyphenation_consistency(self, content: str) -> List[CheckResult]:
        """Find words that appear both hyphenated and non-hyphenated."""
        results = []

        # Common terms that should always be hyphenated (exceptions)
        ALWAYS_HYPHENATED = {
            'state-of-the-art', 'end-to-end', 'real-time', 'real-world',
            'fine-tuning', 'fine-grained', 'large-scale', 'small-scale',
            'multi-task', 'multi-modal', 'cross-domain', 'cross-lingual',
            'self-supervised', 'self-attention', 'co-training', 'pre-training',
            'post-processing', 'pre-processing', 'well-known', 'well-defined',
            'high-quality', 'low-quality', 'long-term', 'short-term'
        }

        # Find all hyphenated words.
        # Bug fix: normalize to lowercase so case variants of the same word
        # (e.g. 'Multi-head' vs 'multi-head') are not reported twice.
        hyphenated = {
            w.lower()
            for w in re.findall(r'\b([a-z]+-[a-z]+(?:-[a-z]+)*)\b', content, re.IGNORECASE)
        }

        for hyph_word in hyphenated:
            # Skip if it's a known compound that should always be hyphenated
            if hyph_word in ALWAYS_HYPHENATED:
                continue

            # Create non-hyphenated versions (spaced and joined)
            non_hyph = hyph_word.replace('-', ' ')
            combined = hyph_word.replace('-', '')

            # Check if a non-hyphenated version also exists in the text
            if re.search(rf'\b{re.escape(non_hyph)}\b', content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{non_hyph}'",
                    suggestion="Choose one form and use it consistently"
                ))
            elif re.search(rf'\b{re.escape(combined)}\b', content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{combined}'",
                    suggestion="Choose one form and use it consistently"
                ))

        return results

    def _check_capitalization_consistency(self, content: str) -> List[CheckResult]:
        """Check if technical terms have consistent capitalization."""
        results = []

        for term in self.CAPITALIZATION_TERMS:
            # Find all case variations of the term
            pattern = re.compile(rf'\b{term}\b', re.IGNORECASE)
            matches = pattern.findall(content)

            if len(matches) > 1:
                # Check if there are mixed capitalizations
                unique_forms = set(matches)
                if len(unique_forms) > 1:
                    forms_str = ', '.join(f"'{f}'" for f in unique_forms)
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Inconsistent capitalization: {forms_str}",
                        suggestion="Use consistent capitalization for technical terms"
                    ))

        return results
"""
Equation formatting checker.

Validates:
- Punctuation after equations (based on grammar)
- Equation numbering consistency
- Variable definitions
"""
import re
from typing import List, Set

from .base import BaseChecker, CheckResult, CheckSeverity


class EquationChecker(BaseChecker):
    """Check equation formatting and consistency."""

    name = "equation"
    display_name = "Equations"
    description = "Check equation formatting and punctuation"

    # Equation environments (starred forms are unnumbered)
    EQUATION_ENVS = [
        'equation', 'align', 'gather', 'multline', 'eqnarray',
        'equation*', 'align*', 'gather*', 'multline*', 'eqnarray*'
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all equation checks on the TeX source.

        Args:
            tex_content: Full TeX file content.
            config: Unused; accepted for interface compatibility.

        Returns:
            List of CheckResult objects.
        """
        results = []

        # Check equation punctuation
        results.extend(self._check_equation_punctuation(tex_content))

        # Check for numbered vs unnumbered consistency
        results.extend(self._check_numbering_consistency(tex_content))

        # Check inline math consistency ($...$ vs \(...\))
        results.extend(self._check_inline_math_consistency(tex_content))

        return results

    def _check_equation_punctuation(self, content: str) -> List[CheckResult]:
        """Check if equations end with appropriate punctuation.

        Heuristic: if the text right after the environment starts with a
        lowercase letter, the sentence continues, so the equation itself
        should usually end with a comma or period.
        """
        results = []

        for env in self.EQUATION_ENVS:
            # re.escape handles the '*' in starred environment names.
            env_escaped = re.escape(env)

            # Find equation content
            pattern = re.compile(
                rf'\\begin\{{{env_escaped}\}}(.*?)\\end\{{{env_escaped}\}}',
                re.DOTALL
            )

            for match in pattern.finditer(content):
                eq_content = match.group(1).strip()

                # Check what comes after the equation
                after_pos = match.end()
                after_text = content[after_pos:after_pos + 50].strip()

                # Ignore trailing \label{} when checking the final character.
                eq_content_clean = re.sub(r'\\label\{[^}]+\}', '', eq_content).strip()

                if eq_content_clean and not re.search(r'[.,;]$', eq_content_clean):
                    # Check if next text starts lowercase (sentence continues)
                    if after_text and after_text[0].islower():
                        line_num = self._find_line_number(content, match.end())
                        results.append(self._create_result(
                            passed=False,
                            severity=CheckSeverity.INFO,
                            message="Equation may need punctuation (sentence continues after)",
                            line_number=line_num,
                            suggestion="Add comma or period inside equation if it ends a clause"
                        ))

        return results

    def _check_numbering_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed numbered and unnumbered equations.

        Bug fixes vs. previous version:
        - The env name was interpolated unescaped, so the '*' in starred
          names acted as a regex quantifier: starred environments were
          never counted and unstarred ones were counted twice.
        - 'nonumber' in content made EVERY environment count as
          unnumbered as soon as \\nonumber appeared anywhere; now only
          starred environments are unnumbered, and \\nonumber/\\notag
          occurrences are added separately (heuristic: each suppresses
          one equation number).
        """
        results = []

        numbered = 0
        unnumbered = 0

        for env in self.EQUATION_ENVS:
            count = len(re.findall(rf'\\begin\{{{re.escape(env)}\}}', content))
            if env.endswith('*'):
                unnumbered += count
            else:
                numbered += count

        # Also count \nonumber and \notag usage
        unnumbered += len(re.findall(r'\\nonumber|\\notag', content))

        # If there's a significant mix, warn
        total = numbered + unnumbered
        if total > 3 and numbered > 0 and unnumbered > 0:
            ratio = min(numbered, unnumbered) / total
            if ratio > 0.2:  # More than 20% in minority
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Mixed equation numbering: {numbered} numbered, {unnumbered} unnumbered",
                    suggestion="Consider consistent numbering strategy"
                ))

        return results

    def _check_inline_math_consistency(self, content: str) -> List[CheckResult]:
        """Check for mixed inline math delimiters ($...$ vs \\(...\\))."""
        results = []

        # Count single unescaped '$' (excluding '$$' display math) and
        # halve, since they come in open/close pairs.
        dollar_count = len(re.findall(r'(?<!\\)(?<!\$)\$(?!\$)', content)) // 2
        paren_count = len(re.findall(r'\\\(', content))

        if dollar_count > 0 and paren_count > 0:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed inline math: {dollar_count} \\$...\\$ and {paren_count} \\(...\\)",
                suggestion="Use consistent inline math delimiters throughout"
            ))

        return results
List[CheckResult]: + results = [] + lines = tex_content.split('\n') + + # Track citation style consistency + cite_styles = {'parenthetical': 0, 'textual': 0, 'plain': 0} + + for line_num, line in enumerate(lines, 1): + # Skip commented lines using base class method + if self._is_comment_line(line): + continue + + # Remove inline comments using base class method + line_content = self._remove_line_comment(line) + + # Check citation non-breaking space + for match in self.CITE_NO_NBSP_PATTERN.finditer(line_content): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message="Citation without non-breaking space", + line_number=line_num, + line_content=line.strip()[:100], + suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')" + )) + + # Track citation styles + for cmd in self.CITE_COMMANDS: + if re.search(rf'\\{cmd}\b', line_content): + if cmd in ['citep', 'parencite', 'autocite']: + cite_styles['parenthetical'] += 1 + elif cmd in ['citet', 'textcite']: + cite_styles['textual'] += 1 + elif cmd == 'cite': + cite_styles['plain'] += 1 + + # Check citation style consistency + styles_used = [s for s, count in cite_styles.items() if count > 0] + if len(styles_used) > 1: + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message=f"Mixed citation styles detected: {', '.join(styles_used)}", + suggestion="Consider using consistent citation style throughout" + )) + + # Check for multiple blank lines (3 or more) + for match in self.MULTI_BLANK_PATTERN.finditer(tex_content): + line_num = self._find_line_number(tex_content, match.start()) + # Count how many blank lines + blank_count = match.group(0).count('\n') - 1 + + # Get context: the line before, blank lines, and the line after + start_pos = match.start() + end_pos = match.end() + + # Find the line before the blank lines + prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1 + prev_line_end = start_pos + prev_line = 
tex_content[prev_line_start:prev_line_end].rstrip() + + # Find the line after the blank lines + next_line_end = tex_content.find('\n', end_pos) + if next_line_end == -1: + next_line_end = len(tex_content) + next_line = tex_content[end_pos:next_line_end].rstrip() + + # Create visual representation with warning markers + blank_lines = '\n'.join([f"> blank line ⚠️"] * blank_count) + line_content = f"{prev_line}\n{blank_lines}\n{next_line}" + + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message=f"Multiple blank lines ({blank_count} consecutive blank lines)", + line_number=line_num, + line_content=line_content, + suggestion="Reduce to single blank line or use \\vspace" + )) + + # Check for common issues with special characters + results.extend(self._check_special_chars(tex_content, lines)) + + return results + + def _check_special_chars(self, content: str, lines: List[str]) -> List[CheckResult]: + """Check for unescaped special characters.""" + results = [] + + # Find math environments to skip + math_regions = self._find_math_regions(content) + + for line_num, line in enumerate(lines, 1): + # Skip commented lines using base class method + if self._is_comment_line(line): + continue + + # Remove inline comments using base class method + line_content = self._remove_line_comment(line) + + # Get position of this line in full content + line_start = sum(len(l) + 1 for l in lines[:line_num-1]) + + # Check for unescaped & (common error) + for match in re.finditer(r'(? List[tuple]: + """Find regions that are inside math mode.""" + regions = [] + + # Inline math $ ... $ + for match in re.finditer(r'(? 
bool: + """Check if position is inside a math region.""" + return any(start <= pos <= end for start, end in regions) + + def _in_environment(self, content: str, pos: int, env_names: List[str]) -> bool: + """Check if position is inside any of the given environments.""" + for env in env_names: + # Find all instances of this environment + pattern = rf'\\begin\{{{env}\*?\}}(.*?)\\end\{{{env}\*?\}}' + for match in re.finditer(pattern, content, re.DOTALL): + if match.start() <= pos <= match.end(): + return True + return False diff --git a/src/checkers/number_checker.py b/src/checkers/number_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..d64ba9f7321d6787175d1969257c62e3d0536320 --- /dev/null +++ b/src/checkers/number_checker.py @@ -0,0 +1,88 @@ +""" +Number and unit formatting checker. + +Validates: +- Percentage format consistency (no space before %, consistent use of % vs 'percent') +""" +import re +from typing import List + +from .base import BaseChecker, CheckResult, CheckSeverity + + +class NumberChecker(BaseChecker): + """Check percentage formatting.""" + + name = "number" + display_name = "Numbers & Units" + description = "Check percentage formatting" + + # Percentage patterns + PERCENT_WITH_SPACE = re.compile(r'\d\s+%') # "50 %" is wrong + + # Inconsistent percentage usage + PERCENT_WORD = re.compile(r'\d+\s+percent\b', re.IGNORECASE) + PERCENT_SYMBOL = re.compile(r'\d+%') + + def check(self, tex_content: str, config: dict = None) -> List[CheckResult]: + results = [] + lines = tex_content.split('\n') + + # Track percentage style for consistency check + uses_symbol = False + uses_word = False + + for line_num, line in enumerate(lines, 1): + # Skip comments using base class method + if self._is_comment_line(line): + continue + + # Skip lines that are likely in math/tables + if self._in_special_context(line): + continue + + # Skip lines that look like math formulas (contain common math commands) + if 
class ReferenceChecker(BaseChecker):
    """Check cross-reference integrity in the document.

    Reports three classes of problems:
    - labels (figures, tables, sections, …) that are never referenced,
    - references whose target label does not exist,
    - appendix sections that the main text never points to.
    """

    name = "reference"
    display_name = "Cross-References"
    description = "Verify all figures, tables, and sections are properly referenced"

    # Label pattern: \label{prefix:name}
    LABEL_PATTERN = re.compile(r'\\label\{([^}]+)\}')

    # All supported referencing commands; each captures the key list.
    REF_PATTERNS = [
        re.compile(r'\\ref\{([^}]+)\}'),
        re.compile(r'\\autoref\{([^}]+)\}'),
        re.compile(r'\\cref\{([^}]+)\}'),
        re.compile(r'\\Cref\{([^}]+)\}'),
        re.compile(r'\\eqref\{([^}]+)\}'),
        re.compile(r'\\pageref\{([^}]+)\}'),
        re.compile(r'\\nameref\{([^}]+)\}'),
    ]

    # Appendix detection
    APPENDIX_START_PATTERN = re.compile(r'\\appendix\b|\\begin\{appendix\}')
    SECTION_PATTERN = re.compile(r'\\section\*?\{([^}]+)\}')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all cross-reference checks and return the findings."""
        results = []

        # Extract all labels and all references with their positions.
        labels = self._extract_labels(tex_content)
        references = self._extract_references(tex_content)

        # Labels that are never referenced anywhere in the document.
        for label, (line_num, line_content) in labels.items():
            if label not in references:
                # Severity depends on the label type (figures/tables are
                # expected to be referenced; equations may legitimately not be).
                severity = self._get_severity_for_label(label)
                label_type = self._get_label_type(label)

                results.append(self._create_result(
                    passed=False,
                    severity=severity,
                    message=f"Unreferenced {label_type}: '{label}'",
                    line_number=line_num,
                    line_content=line_content,
                    suggestion=f"Add \\ref{{{label}}} or \\autoref{{{label}}} where appropriate"
                ))

        # References whose target label is missing: always an error, since
        # LaTeX would render these as '??'.
        for ref, (line_num, line_content) in references.items():
            if ref not in labels:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.ERROR,
                    message=f"Reference to undefined label: '{ref}'",
                    line_number=line_num,
                    line_content=line_content,
                    suggestion=f"Add \\label{{{ref}}} to the target element or fix the reference"
                ))

        # Appendix sections should be pointed to from the main text.
        results.extend(self._check_appendix_references(tex_content, labels, references))

        return results

    def _extract_labels(self, content: str) -> dict:
        """Extract all labels with their line numbers.

        Returns a mapping ``label -> (line_number, line_content)``; commented
        occurrences are skipped.
        """
        labels = {}
        for match in self.LABEL_PATTERN.finditer(content):
            if not self._is_commented(content, match.start()):
                label = match.group(1)
                line_num = self._find_line_number(content, match.start())
                line_content = self._get_line_content(content, line_num)
                labels[label] = (line_num, line_content)
        return labels

    def _extract_references(self, content: str) -> dict:
        """Extract all referenced keys with their line numbers.

        Returns a mapping ``key -> (line_number, line_content)`` recording
        the first occurrence of each key.  Handles comma-separated key lists
        (e.g. ``\\cref{fig:a,fig:b}``) and skips macro-definition artifacts.
        """
        references = {}
        for pattern in self.REF_PATTERNS:
            for match in pattern.finditer(content):
                if not self._is_commented(content, match.start()):
                    # Handle comma-separated refs like \ref{fig:a,fig:b}
                    refs_str = match.group(1)
                    for ref in refs_str.split(','):
                        ref = ref.strip()
                        if ref and ref not in references:
                            # Skip if ref looks like command parameter (#1, #2)
                            if ref.startswith('#') and len(ref) == 2 and ref[1].isdigit():
                                continue

                            # Skip if inside \newcommand or \renewcommand definition
                            line_num = self._find_line_number(content, match.start())
                            line_content = self._get_line_content(content, line_num)
                            if re.search(r'\\(new|renew|provide)command', line_content):
                                continue

                            references[ref] = (line_num, line_content)
        return references

    def _get_label_type(self, label: str) -> str:
        """Determine the type of a label from its ``prefix:`` convention."""
        if ':' in label:
            prefix = label.split(':')[0].lower()
            type_map = {
                'fig': 'figure',
                'tab': 'table',
                'sec': 'section',
                'eq': 'equation',
                'alg': 'algorithm',
                'lst': 'listing',
                'app': 'appendix',
            }
            return type_map.get(prefix, 'label')
        return 'label'

    def _get_severity_for_label(self, label: str) -> CheckSeverity:
        """Determine severity for an unreferenced label based on its type."""
        label_type = self._get_label_type(label)

        # Figures and tables should always be referenced.
        if label_type in ('figure', 'table'):
            return CheckSeverity.WARNING

        # Equations might not always need an explicit reference.
        if label_type == 'equation':
            return CheckSeverity.INFO

        return CheckSeverity.INFO

    def _check_appendix_references(
        self,
        content: str,
        labels: dict,
        references: dict
    ) -> List[CheckResult]:
        """Check that appendix sections are referenced in the main text.

        Only the text *before* ``\\appendix`` counts as main text; a
        reference from one appendix section to another does not satisfy
        this check.
        """
        results = []

        # Find where the appendix starts; nothing to do without one.
        appendix_match = self.APPENDIX_START_PATTERN.search(content)
        if not appendix_match:
            return results

        appendix_start = appendix_match.start()
        main_content = content[:appendix_start]
        appendix_content = content[appendix_start:]

        # Examine section-like labels defined in the appendix.
        for match in self.LABEL_PATTERN.finditer(appendix_content):
            if self._is_commented(appendix_content, match.start()):
                continue

            label = match.group(1)

            # Use the label's prefix, not a substring scan: 'fig:secondary'
            # contains 'sec' but is not a section label.
            prefix = label.split(':', 1)[0].lower()
            if not (prefix.startswith('sec') or prefix.startswith('app')):
                continue

            # Is this key referenced anywhere in the main text?  Compare
            # whole keys after splitting comma lists: 'sec:a' must not be
            # satisfied by a reference to 'sec:ab'.
            is_referenced = False
            for pattern in self.REF_PATTERNS:
                for m in pattern.finditer(main_content):
                    keys = [k.strip() for k in m.group(1).split(',')]
                    if label in keys:
                        is_referenced = True
                        break
                if is_referenced:
                    break

            if not is_referenced:
                line_num = self._find_line_number(content, appendix_start + match.start())
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Appendix section '{label}' is not referenced in main text",
                    line_number=line_num,
                    suggestion="Add a reference to this appendix section in the main text"
                ))

        return results
class SentenceChecker(BaseChecker):
    """Check sentence quality and readability.

    Scans each non-comment line against three pattern tables: weak
    sentence starters, hedging/weasel wording, and redundant phrases.
    """

    name = "sentence"
    display_name = "Sentence Quality"
    description = "Check weak patterns and writing issues"

    # Weak sentence starters (avoid these)
    WEAK_STARTERS = [
        (r'^There\s+(is|are|was|were|has been|have been)\s+',
         "Weak start with 'There is/are'"),
        (r'^It\s+(is|was|has been|should be noted)\s+',
         "Weak start with 'It is'"),
        (r'^This\s+(is|was|shows|demonstrates)\s+',
         "Vague 'This' without clear antecedent"),
        (r'^As\s+(mentioned|discussed|shown|noted)\s+(above|before|earlier|previously)',
         "Consider being more specific about what was mentioned"),
    ]

    # Weasel words and hedging
    WEASEL_PATTERNS = [
        (r'\b(many|some|most|several)\s+(researchers?|studies|papers?|works?)\s+(have\s+)?(shown?|demonstrated?|suggested?|believe)',
         "Vague attribution - consider citing specific work"),
        (r'\b(obviously|clearly|of course|needless to say|it is well known)\b',
         "Unsupported assertion - consider citing or removing"),
        (r'\b(very|really|quite|extremely|highly)\s+(important|significant|good|effective)',
         "Consider more precise language"),
        (r'\bit\s+is\s+(important|crucial|essential|necessary)\s+to\s+note\s+that',
         "Wordy phrase - consider simplifying"),
    ]

    # Redundant phrases
    REDUNDANT_PATTERNS = [
        (r'\bin order to\b', "Use 'to' instead of 'in order to'"),
        (r'\bdue to the fact that\b', "Use 'because' instead"),
        (r'\bat this point in time\b', "Use 'now' or 'currently'"),
        (r'\bin the event that\b', "Use 'if' instead"),
        (r'\bdespite the fact that\b', "Use 'although' instead"),
        (r'\bfor the purpose of\b', "Use 'to' or 'for' instead"),
        (r'\bwith the exception of\b', "Use 'except' instead"),
        (r'\bin close proximity to\b', "Use 'near' instead"),
        (r'\ba large number of\b', "Use 'many' instead"),
        (r'\bthe vast majority of\b', "Use 'most' instead"),
    ]

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Scan every non-comment line and collect all findings."""
        findings = []

        for idx, raw_line in enumerate(tex_content.split('\n'), 1):
            # Skip fully commented lines using the base class helper.
            if self._is_comment_line(raw_line):
                continue

            # Strip inline comments before matching.
            text = self._remove_line_comment(raw_line)

            findings.extend(self._scan_weak_starters(idx, raw_line, text))
            findings.extend(self._scan_weasel_words(idx, text))
            findings.extend(self._scan_redundant_phrases(idx, text))

        return findings

    def _scan_weak_starters(self, line_num, raw_line, text):
        """Report at most one weak-starter finding for this line."""
        for pattern, message in self.WEAK_STARTERS:
            if re.search(pattern, text, re.IGNORECASE):
                return [self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=message,
                    line_number=line_num,
                    line_content=raw_line.strip()[:80]
                )]
        return []

    def _scan_weasel_words(self, line_num, text):
        """Report every hedging/weasel pattern matched on this line."""
        found = []
        for pattern, message in self.WEASEL_PATTERNS:
            hit = re.search(pattern, text, re.IGNORECASE)
            if hit:
                found.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Hedging language: '{hit.group(0)[:30]}'",
                    line_number=line_num,
                    suggestion=message
                ))
        return found

    def _scan_redundant_phrases(self, line_num, text):
        """Report every redundant phrase matched on this line."""
        found = []
        for pattern, message in self.REDUNDANT_PATTERNS:
            hit = re.search(pattern, text, re.IGNORECASE)
            if hit:
                found.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Redundant phrase: '{hit.group(0)}'",
                    line_number=line_num,
                    suggestion=message
                ))
        return found
b/src/config/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f34a85f65061f1f6901383f2c3427098a0c19e75 Binary files /dev/null and b/src/config/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/config/__pycache__/workflow.cpython-313.pyc b/src/config/__pycache__/workflow.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b55e19e08f6821df1f5d3550a2332b5ce0226ca Binary files /dev/null and b/src/config/__pycache__/workflow.cpython-313.pyc differ diff --git a/src/config/__pycache__/yaml_config.cpython-313.pyc b/src/config/__pycache__/yaml_config.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..439b18be73390a7bb9137472d739bbc5a36de835 Binary files /dev/null and b/src/config/__pycache__/yaml_config.cpython-313.pyc differ diff --git a/src/config/workflow.py b/src/config/workflow.py new file mode 100644 index 0000000000000000000000000000000000000000..47d7f962df8873a977a76c69fc8b0b45b835ad4b --- /dev/null +++ b/src/config/workflow.py @@ -0,0 +1,174 @@ +""" +Workflow configuration for reference checking. + +Allows users to customize the order and enable/disable individual fetchers +in the reference verification workflow. 
+""" +import json +from dataclasses import dataclass, field, asdict +from pathlib import Path +from typing import List, Optional + + +@dataclass +class WorkflowStep: + """A single step in the reference checking workflow.""" + name: str + display_name: str + description: str + enabled: bool = True + priority: int = 0 + + # Step type: 'by_id', 'by_doi', 'by_title' + search_type: str = 'by_title' + + def to_dict(self) -> dict: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict) -> 'WorkflowStep': + return cls(**data) + + +@dataclass +class WorkflowConfig: + """Configuration for the reference checking workflow.""" + steps: List[WorkflowStep] = field(default_factory=list) + name: str = "default" + description: str = "Default workflow configuration" + + def get_enabled_steps(self) -> List[WorkflowStep]: + """Get only enabled steps, sorted by priority.""" + return sorted( + [s for s in self.steps if s.enabled], + key=lambda x: x.priority + ) + + def move_step_up(self, index: int) -> bool: + """Move a step up in priority (swap with previous).""" + if index <= 0 or index >= len(self.steps): + return False + self.steps[index], self.steps[index - 1] = self.steps[index - 1], self.steps[index] + self._update_priorities() + return True + + def move_step_down(self, index: int) -> bool: + """Move a step down in priority (swap with next).""" + if index < 0 or index >= len(self.steps) - 1: + return False + self.steps[index], self.steps[index + 1] = self.steps[index + 1], self.steps[index] + self._update_priorities() + return True + + def toggle_step(self, index: int) -> bool: + """Toggle enabled status of a step.""" + if 0 <= index < len(self.steps): + self.steps[index].enabled = not self.steps[index].enabled + return True + return False + + def _update_priorities(self): + """Update priority values based on current order.""" + for i, step in enumerate(self.steps): + step.priority = i + + def to_dict(self) -> dict: + return { + 'name': self.name, + 'description': 
self.description, + 'steps': [s.to_dict() for s in self.steps] + } + + @classmethod + def from_dict(cls, data: dict) -> 'WorkflowConfig': + steps = [WorkflowStep.from_dict(s) for s in data.get('steps', [])] + return cls( + steps=steps, + name=data.get('name', 'custom'), + description=data.get('description', '') + ) + + def save(self, filepath: str): + """Save workflow configuration to JSON file.""" + path = Path(filepath) + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, 'w', encoding='utf-8') as f: + json.dump(self.to_dict(), f, indent=2) + + @classmethod + def load(cls, filepath: str) -> 'WorkflowConfig': + """Load workflow configuration from JSON file.""" + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + return cls.from_dict(data) + + +# Default workflow matching current implementation order +DEFAULT_WORKFLOW = WorkflowConfig( + name="default", + description="Default reference checking workflow prioritizing reliable APIs", + steps=[ + WorkflowStep( + name="arxiv_id", + display_name="arXiv by ID", + description="Look up paper by arXiv ID (highest priority for arXiv papers)", + priority=0, + search_type="by_id" + ), + WorkflowStep( + name="crossref_doi", + display_name="CrossRef by DOI", + description="Look up paper by DOI (authoritative for DOIs)", + priority=1, + search_type="by_doi" + ), + WorkflowStep( + name="semantic_scholar", + display_name="Semantic Scholar", + description="Official API with high quality metadata", + priority=2, + search_type="by_title" + ), + WorkflowStep( + name="dblp", + display_name="DBLP", + description="Official API, especially good for CS publications", + priority=3, + search_type="by_title" + ), + WorkflowStep( + name="openalex", + display_name="OpenAlex", + description="Official API with broad coverage", + priority=4, + search_type="by_title" + ), + WorkflowStep( + name="arxiv_title", + display_name="arXiv by Title", + description="Search arXiv by title (fallback for non-ID lookups)", + 
@dataclass
class SubmissionConfig:
    """Submission quality check configuration.

    Each boolean field toggles one checker.  Field names are identical to
    the checker names returned by :meth:`get_enabled_checkers`, which is
    what lets the accessor be data-driven.
    """

    # Format checks
    caption: bool = True
    reference: bool = True
    formatting: bool = True
    equation: bool = True

    # Writing quality
    ai_artifacts: bool = True
    sentence: bool = True
    consistency: bool = True

    # Academic standards
    acronym: bool = True
    number: bool = True
    citation_quality: bool = True

    # Review compliance
    anonymization: bool = True

    # Canonical checker order.  Deliberately left unannotated so the
    # dataclass machinery does not treat it as an instance field.
    _CHECKER_ORDER = (
        'caption', 'reference', 'formatting', 'equation',
        'ai_artifacts', 'sentence', 'consistency',
        'acronym', 'number', 'citation_quality', 'anonymization',
    )

    def get_enabled_checkers(self) -> List[str]:
        """Get list of enabled checker names, in canonical order.

        Field names and checker names coincide by design, so the former
        eleven-branch if-chain collapses to a single comprehension.
        """
        return [name for name in self._CHECKER_ORDER if getattr(self, name)]
Path = field(default_factory=lambda: Path.cwd()) + + def resolve_path(self, path: str) -> Path: + """Resolve a path relative to the config file directory.""" + p = Path(path) + if p.is_absolute(): + return p + return self._config_dir / p + + @property + def bib_path(self) -> Path: + return self.resolve_path(self.files.bib) + + @property + def tex_path(self) -> Path: + return self.resolve_path(self.files.tex) + + @property + def input_dir_path(self) -> Path: + return self.resolve_path(self.files.input_dir) + + @property + def output_dir_path(self) -> Path: + return self.resolve_path(self.files.output_dir) + + +def load_config(config_path: str) -> BibGuardConfig: + """Load configuration from YAML file.""" + path = Path(config_path) + + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) or {} + + config = BibGuardConfig() + config._config_dir = path.parent.absolute() + + # Parse files section + if 'files' in data: + files = data['files'] + config.files = FilesConfig( + bib=files.get('bib', ''), + tex=files.get('tex', ''), + input_dir=files.get('input_dir', ''), + output_dir=files.get('output_dir', 'bibguard_output') + ) + + # Parse template + config.template = data.get('template', '') + + # Parse bibliography section + if 'bibliography' in data: + bib = data['bibliography'] + config.bibliography = BibliographyConfig( + check_metadata=bib.get('check_metadata', True), + check_usage=bib.get('check_usage', True), + check_duplicates=bib.get('check_duplicates', True), + check_preprint_ratio=bib.get('check_preprint_ratio', True), + preprint_warning_threshold=bib.get('preprint_warning_threshold', 0.50), + check_relevance=bib.get('check_relevance', False) + ) + + # Parse submission section + if 'submission' in data: + sub = data['submission'] + config.submission = SubmissionConfig( + caption=sub.get('caption', True), + reference=sub.get('reference', True), + 
def find_config_file() -> Optional[Path]:
    """Search the working directory and its ancestors for a config file.

    At each level the candidate names are tried in priority order and the
    first existing file wins.  At most five directory levels are examined
    (the current directory plus up to four parents); returns ``None`` when
    nothing is found.
    """
    candidate_names = ('config.yaml', 'bibguard.yaml', 'bibguard.yml',
                       '.bibguard.yaml', '.bibguard.yml')

    directory = Path.cwd()
    for _level in range(5):
        found = next(
            (directory / name for name in candidate_names
             if (directory / name).exists()),
            None
        )
        if found is not None:
            return found

        # Stop at the filesystem root, where parent == self.
        if directory.parent == directory:
            return None
        directory = directory.parent

    return None
caption: true + reference: true + formatting: true + equation: true + ai_artifacts: true + sentence: true + consistency: true + acronym: true + number: true + citation_quality: true + anonymization: true + +llm: + backend: "gemini" + model: "" + api_key: "" + +output: + quiet: false + minimal_verified: false +""" + with open(output_path, 'w', encoding='utf-8') as f: + f.write(default) + + return output_path diff --git a/src/fetchers/__init__.py b/src/fetchers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..10c2bb70a42976b4f0eb2b1901b17611a9eec26c --- /dev/null +++ b/src/fetchers/__init__.py @@ -0,0 +1,16 @@ +"""Fetchers package""" +from .arxiv_fetcher import ArxivFetcher +from .scholar_fetcher import ScholarFetcher +from .crossref_fetcher import CrossRefFetcher +from .semantic_scholar_fetcher import SemanticScholarFetcher +from .openalex_fetcher import OpenAlexFetcher +from .dblp_fetcher import DBLPFetcher + +__all__ = [ + 'ArxivFetcher', + 'ScholarFetcher', + 'CrossRefFetcher', + 'SemanticScholarFetcher', + 'OpenAlexFetcher', + 'DBLPFetcher' +] diff --git a/src/fetchers/__pycache__/__init__.cpython-313.pyc b/src/fetchers/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2c8c41f22daf3f9ebe4906fbf377db97bdda5b9 Binary files /dev/null and b/src/fetchers/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/fetchers/__pycache__/arxiv_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/arxiv_fetcher.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6c0f6d2bea0f0fc0d4eaa6cc21db0bf88d23ea0 Binary files /dev/null and b/src/fetchers/__pycache__/arxiv_fetcher.cpython-313.pyc differ diff --git a/src/fetchers/__pycache__/crossref_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/crossref_fetcher.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..308f1616aefb91ef39445c7ffa769df645c0278d Binary files /dev/null and 
b/src/fetchers/__pycache__/crossref_fetcher.cpython-313.pyc differ diff --git a/src/fetchers/__pycache__/dblp_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/dblp_fetcher.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e6c4c3048b8dbe934a6601fd1d5db5ab66f94e1 Binary files /dev/null and b/src/fetchers/__pycache__/dblp_fetcher.cpython-313.pyc differ diff --git a/src/fetchers/__pycache__/openalex_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/openalex_fetcher.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c22735c52179987deacb4a0647d9c24b78709ed7 Binary files /dev/null and b/src/fetchers/__pycache__/openalex_fetcher.cpython-313.pyc differ diff --git a/src/fetchers/__pycache__/scholar_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/scholar_fetcher.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a5aef1df4d19d55bf6834cf0e7f368896ea8c9b Binary files /dev/null and b/src/fetchers/__pycache__/scholar_fetcher.cpython-313.pyc differ diff --git a/src/fetchers/__pycache__/semantic_scholar_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/semantic_scholar_fetcher.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..089ed166dfaaf59e5fc7b54b7f45a9f3e2b2b744 Binary files /dev/null and b/src/fetchers/__pycache__/semantic_scholar_fetcher.cpython-313.pyc differ diff --git a/src/fetchers/arxiv_fetcher.py b/src/fetchers/arxiv_fetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..22a8a064ebe5cf07f60a641d41c7fa47193684a4 --- /dev/null +++ b/src/fetchers/arxiv_fetcher.py @@ -0,0 +1,228 @@ +""" +arXiv metadata fetcher using the public API. 
+""" +import re +import time +import xml.etree.ElementTree as ET +from dataclasses import dataclass +from typing import Optional +from urllib.parse import quote + +import requests + + +@dataclass +class ArxivMetadata: + """Metadata fetched from arXiv.""" + arxiv_id: str + title: str + authors: list[str] + abstract: str + published: str + updated: str + categories: list[str] + primary_category: str + doi: str + journal_ref: str + comment: str + pdf_url: str + abs_url: str + + @property + def year(self) -> str: + """Extract year from published date.""" + if self.published: + match = re.match(r'(\d{4})', self.published) + if match: + return match.group(1) + return "" + + +class ArxivFetcher: + """Fetches metadata from arXiv API.""" + + API_BASE = "http://export.arxiv.org/api/query" + RATE_LIMIT_DELAY = 3.0 # seconds between requests + + def __init__(self): + self._last_request_time = 0.0 + + def _rate_limit(self): + """Ensure rate limiting between requests.""" + elapsed = time.time() - self._last_request_time + if elapsed < self.RATE_LIMIT_DELAY: + time.sleep(self.RATE_LIMIT_DELAY - elapsed) + self._last_request_time = time.time() + + def fetch_by_id(self, arxiv_id: str) -> Optional[ArxivMetadata]: + """Fetch metadata by arXiv ID.""" + # Clean up ID + arxiv_id = arxiv_id.strip() + arxiv_id = re.sub(r'^arXiv:', '', arxiv_id, flags=re.IGNORECASE) + + self._rate_limit() + + params = { + 'id_list': arxiv_id, + 'max_results': 1 + } + + try: + response = requests.get( + self.API_BASE, + params=params, + timeout=30, + headers={'User-Agent': 'BibChecker/1.0 (mailto:user@example.com)'} + ) + response.raise_for_status() + except requests.RequestException as e: + return None + + return self._parse_response(response.text) + + def search_by_title(self, title: str, max_results: int = 5) -> list[ArxivMetadata]: + """Search arXiv by title.""" + self._rate_limit() + + # Clean up title for search + clean_title = re.sub(r'[^\w\s]', ' ', title) + clean_title = re.sub(r'\s+', ' ', 
clean_title).strip() + + # Build search query + search_query = f'ti:"{clean_title}"' + + params = { + 'search_query': search_query, + 'max_results': max_results, + 'sortBy': 'relevance', + 'sortOrder': 'descending' + } + + try: + response = requests.get( + self.API_BASE, + params=params, + timeout=30, + headers={'User-Agent': 'BibChecker/1.0 (mailto:user@example.com)'} + ) + response.raise_for_status() + except requests.RequestException as e: + return [] + + return self._parse_response_multiple(response.text) + + def _parse_response(self, xml_content: str) -> Optional[ArxivMetadata]: + """Parse single entry response.""" + results = self._parse_response_multiple(xml_content) + return results[0] if results else None + + def _parse_response_multiple(self, xml_content: str) -> list[ArxivMetadata]: + """Parse multiple entries from response.""" + results = [] + + try: + root = ET.fromstring(xml_content) + except ET.ParseError: + return results + + # Define namespaces + ns = { + 'atom': 'http://www.w3.org/2005/Atom', + 'arxiv': 'http://arxiv.org/schemas/atom' + } + + entries = root.findall('atom:entry', ns) + + for entry in entries: + try: + metadata = self._parse_entry(entry, ns) + if metadata: + results.append(metadata) + except Exception: + continue + + return results + + def _parse_entry(self, entry: ET.Element, ns: dict) -> Optional[ArxivMetadata]: + """Parse a single entry element.""" + # Get ID + id_elem = entry.find('atom:id', ns) + if id_elem is None or id_elem.text is None: + return None + + abs_url = id_elem.text.strip() + + # Extract arXiv ID from URL + match = re.search(r'arxiv\.org/abs/(.+)$', abs_url) + arxiv_id = match.group(1) if match else "" + + # Get title + title_elem = entry.find('atom:title', ns) + title = self._clean_text(title_elem.text) if title_elem is not None and title_elem.text else "" + + # Get abstract + summary_elem = entry.find('atom:summary', ns) + abstract = self._clean_text(summary_elem.text) if summary_elem is not None and 
summary_elem.text else "" + + # Get authors + authors = [] + for author_elem in entry.findall('atom:author', ns): + name_elem = author_elem.find('atom:name', ns) + if name_elem is not None and name_elem.text: + authors.append(name_elem.text.strip()) + + # Get dates + published_elem = entry.find('atom:published', ns) + published = published_elem.text.strip() if published_elem is not None and published_elem.text else "" + + updated_elem = entry.find('atom:updated', ns) + updated = updated_elem.text.strip() if updated_elem is not None and updated_elem.text else "" + + # Get categories + categories = [] + for cat_elem in entry.findall('atom:category', ns): + term = cat_elem.get('term') + if term: + categories.append(term) + + primary_cat_elem = entry.find('arxiv:primary_category', ns) + primary_category = primary_cat_elem.get('term', '') if primary_cat_elem is not None else "" + + # Get DOI + doi_elem = entry.find('arxiv:doi', ns) + doi = doi_elem.text.strip() if doi_elem is not None and doi_elem.text else "" + + # Get journal reference + journal_elem = entry.find('arxiv:journal_ref', ns) + journal_ref = journal_elem.text.strip() if journal_elem is not None and journal_elem.text else "" + + # Get comment + comment_elem = entry.find('arxiv:comment', ns) + comment = comment_elem.text.strip() if comment_elem is not None and comment_elem.text else "" + + # Build PDF URL + pdf_url = abs_url.replace('/abs/', '/pdf/') + '.pdf' + + return ArxivMetadata( + arxiv_id=arxiv_id, + title=title, + authors=authors, + abstract=abstract, + published=published, + updated=updated, + categories=categories, + primary_category=primary_category, + doi=doi, + journal_ref=journal_ref, + comment=comment, + pdf_url=pdf_url, + abs_url=abs_url + ) + + def _clean_text(self, text: str) -> str: + """Clean up text from XML.""" + if not text: + return "" + # Normalize whitespace + text = re.sub(r'\s+', ' ', text) + return text.strip() diff --git a/src/fetchers/crossref_fetcher.py 
@dataclass
class CrossRefResult:
    """Metadata result from CrossRef API."""
    title: str
    authors: List[str]
    year: str
    doi: str
    publisher: str
    container_title: str  # Journal/conference name
    abstract: str = ""
    url: str = ""


class CrossRefFetcher:
    """
    Fetcher for the CrossRef REST API.

    CrossRef is a reliable, free API for academic metadata — much more
    dependable than scraping Google Scholar. No API key is required.
    """

    BASE_URL = "https://api.crossref.org/works"
    RATE_LIMIT_DELAY = 1.0  # Be polite

    def __init__(self, mailto: str = "bibguard@example.com"):
        """
        Initialize CrossRef fetcher.

        Args:
            mailto: Email for the "polite pool" (gets better rate limits)
        """
        self.mailto = mailto
        self._last_request_time = 0.0
        self._session = requests.Session()

    def _rate_limit(self):
        """Ensure rate limiting between requests."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.RATE_LIMIT_DELAY:
            time.sleep(self.RATE_LIMIT_DELAY - elapsed)
        self._last_request_time = time.time()

    def _get_headers(self) -> dict:
        """Request headers advertising the mailto for the polite pool."""
        return {
            'User-Agent': f'BibGuard/1.0 (mailto:{self.mailto})',
            'Accept': 'application/json',
        }

    def search_by_title(self, title: str, max_results: int = 5) -> Optional[CrossRefResult]:
        """
        Search for a paper by title.

        Args:
            title: Paper title to search for
            max_results: Maximum number of results to retrieve

        Returns:
            Best matching CrossRefResult or None if not found
        """
        self._rate_limit()

        params = {
            'query.title': title,
            'rows': max_results,
            # Restrict the payload to the fields _parse_item consumes.
            'select': 'title,author,published-print,published-online,DOI,publisher,container-title,abstract'
        }

        try:
            response = self._session.get(
                self.BASE_URL,
                params=params,
                headers=self._get_headers(),
                timeout=30
            )
            response.raise_for_status()

            data = response.json()
            if data.get('status') != 'ok':
                return None

            items = data.get('message', {}).get('items', [])
            if not items:
                return None

            # CrossRef ranks by relevance, so the first item is the best match.
            return self._parse_item(items[0])

        except requests.RequestException:
            return None

    def search_by_doi(self, doi: str) -> Optional[CrossRefResult]:
        """
        Fetch metadata by DOI.

        Args:
            doi: DOI of the paper (a doi.org URL prefix is tolerated)

        Returns:
            CrossRefResult or None if not found
        """
        self._rate_limit()

        # Clean DOI (remove https://doi.org/ prefix if present)
        doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')

        try:
            response = self._session.get(
                f"{self.BASE_URL}/{doi}",
                headers=self._get_headers(),
                timeout=30
            )
            response.raise_for_status()

            data = response.json()
            if data.get('status') != 'ok':
                return None

            return self._parse_item(data.get('message', {}))

        except requests.RequestException:
            return None

    def _parse_item(self, item: dict) -> Optional[CrossRefResult]:
        """Parse a CrossRef API item into a CrossRefResult (None if untitled)."""
        try:
            titles = item.get('title', [])
            title = titles[0] if titles else ""
            if not title:
                return None

            authors = []
            for author in item.get('author', []):
                given = author.get('given', '')
                family = author.get('family', '')
                if family:
                    authors.append(f"{given} {family}" if given else family)

            # Year: prefer print date, then online date, then record creation.
            year = ""
            for date_field in ['published-print', 'published-online', 'created']:
                date_parts = item.get(date_field, {}).get('date-parts', [[]])
                # BUG FIX: also guard the leading element — CrossRef can
                # return [[None]], which previously produced year == "None".
                if date_parts and date_parts[0] and date_parts[0][0] is not None:
                    year = str(date_parts[0][0])
                    break

            doi = item.get('DOI', '')
            publisher = item.get('publisher', '')

            # Container title = journal or conference name.
            container_titles = item.get('container-title', [])
            container_title = container_titles[0] if container_titles else ""

            # Abstract is only present for some records.
            abstract = item.get('abstract', '')

            url = f"https://doi.org/{doi}" if doi else ""

            return CrossRefResult(
                title=title,
                authors=authors,
                year=year,
                doi=doi,
                publisher=publisher,
                container_title=container_title,
                abstract=abstract,
                url=url
            )

        except (KeyError, IndexError, TypeError):
            return None
@dataclass
class DBLPResult:
    """A single publication record returned by the DBLP search API."""
    title: str
    authors: List[str]
    year: str
    venue: str
    url: str
    doi: Optional[str] = None


class DBLPFetcher:
    """Fetcher for DBLP API."""

    BASE_URL = "https://dblp.org/search/publ/api"

    def __init__(self):
        self.last_request_time = 0
        # DBLP asks for 1-2 seconds between requests. We'll use 1.5s to be safe.
        self.rate_limit_delay = 1.5
        self.logger = logging.getLogger(__name__)

    def _wait_for_rate_limit(self):
        """Block until at least `rate_limit_delay` has passed since the last call."""
        remaining = self.rate_limit_delay - (time.time() - self.last_request_time)
        if remaining > 0:
            time.sleep(remaining)
        self.last_request_time = time.time()

    def search_by_title(self, title: str) -> Optional[DBLPResult]:
        """
        Search DBLP by title.

        Args:
            title: Paper title to search for

        Returns:
            DBLPResult if found, None otherwise
        """
        self._wait_for_rate_limit()

        query = {
            "q": title,
            "format": "json",
            "h": 3,  # Limit to top 3 hits
        }

        try:
            response = requests.get(self.BASE_URL, params=query, timeout=10)

            if response.status_code == 429:
                self.logger.warning("DBLP rate limit exceeded. Waiting longer...")
                time.sleep(5)
                return None

            if response.status_code != 200:
                self.logger.warning(f"DBLP API error: {response.status_code}")
                return None

            return self._parse_response(response.json(), title)

        except Exception as e:
            self.logger.error(f"Error fetching from DBLP: {e}")
            return None

    def _parse_response(self, data: Dict[str, Any], query_title: str) -> Optional[DBLPResult]:
        """Convert the DBLP JSON payload into a DBLPResult (first hit wins)."""
        try:
            hit_list = data.get("result", {}).get("hits", {}).get("hit", [])
            if not hit_list:
                return None

            # The publ search endpoint only returns publications, and DBLP
            # ranks exact title matches first, so the top hit is taken.
            info = hit_list[0].get("info", {})

            # The "author" field is a list for multi-author papers but a
            # bare object when there is a single author.
            raw_authors = info.get("authors", {}).get("author", [])
            if isinstance(raw_authors, dict):
                names = [raw_authors.get("text", "")]
            elif isinstance(raw_authors, list):
                names = [entry.get("text", "") for entry in raw_authors]
            else:
                names = []

            # DBLP titles frequently carry a trailing period — drop it.
            clean_title = info.get("title", "")
            if clean_title.endswith("."):
                clean_title = clean_title[:-1]

            doi = info.get("doi", "")

            return DBLPResult(
                title=clean_title,
                authors=names,
                year=info.get("year", ""),
                venue=info.get("venue", ""),
                url=info.get("url", ""),
                doi=doi if doi else None,
            )

        except Exception as e:
            self.logger.error(f"Error parsing DBLP response: {e}")
            return None
@dataclass
class OpenAlexResult:
    """Search result from OpenAlex API."""
    title: str
    authors: list[str]
    year: str
    abstract: str
    doi: str
    citation_count: int
    url: str


class OpenAlexFetcher:
    """
    Fetcher using OpenAlex's free API.

    API Docs: https://docs.openalex.org/
    Rate Limits:
        - 100,000 requests per day
        - 10 requests per second (very generous)
        - No API key required (but polite pool recommended)
    """

    BASE_URL = "https://api.openalex.org"
    RATE_LIMIT_DELAY = 0.1  # stay under the 10 req/sec cap

    def __init__(self, email: Optional[str] = None):
        """
        Initialize OpenAlex fetcher.

        Args:
            email: Optional email for polite pool (faster rate limits)
        """
        self.email = email
        self._last_request_time = 0.0
        self._session = requests.Session()

        # OpenAlex requires an identifying User-Agent on every request.
        self._session.headers.update({
            'User-Agent': 'BibGuard/1.0 (https://github.com/thinkwee/BibGuard; mailto:bibguard@example.com)'
        })

        # Joining the polite pool is done via the From header.
        if email:
            self._session.headers.update({'From': email})

    def _rate_limit(self):
        """Sleep so consecutive requests stay RATE_LIMIT_DELAY apart."""
        remaining = self.RATE_LIMIT_DELAY - (time.time() - self._last_request_time)
        if remaining > 0:
            time.sleep(remaining)
        self._last_request_time = time.time()

    def search_by_title(self, title: str, max_results: int = 5) -> Optional[OpenAlexResult]:
        """
        Search for a paper by title.

        Args:
            title: Paper title to search for
            max_results: Maximum number of results to fetch (default: 5)

        Returns:
            OpenAlexResult if found, None otherwise
        """
        self._rate_limit()

        try:
            response = self._session.get(
                f"{self.BASE_URL}/works",
                params={'search': title, 'per-page': max_results},
                timeout=10,
            )
            response.raise_for_status()
            hits = response.json().get('results', [])
            if not hits:
                return None
            # OpenAlex orders by relevance; take the top hit.
            return self._parse_work(hits[0])
        except requests.RequestException:
            return None

    def fetch_by_doi(self, doi: str) -> Optional[OpenAlexResult]:
        """
        Fetch paper metadata by DOI.

        Args:
            doi: DOI of the paper

        Returns:
            OpenAlexResult if found, None otherwise
        """
        self._rate_limit()

        # OpenAlex addresses works by their full doi.org URL.
        try:
            response = self._session.get(
                f"{self.BASE_URL}/works/https://doi.org/{doi}",
                timeout=10,
            )
            response.raise_for_status()
            return self._parse_work(response.json())
        except requests.RequestException:
            return None

    def _parse_work(self, work_data: dict) -> Optional[OpenAlexResult]:
        """Map a raw OpenAlex 'work' object onto an OpenAlexResult."""
        try:
            names = []
            for entry in work_data.get('authorships', []):
                display = entry.get('author', {}).get('display_name', '')
                if display:
                    names.append(display)

            pub_year = work_data.get('publication_year')

            # Abstracts arrive as an inverted index and must be rebuilt.
            inverted = work_data.get('abstract_inverted_index')
            text = self._reconstruct_abstract(inverted) if inverted else ""

            doi_value = work_data.get('doi', '')
            if doi_value and doi_value.startswith('https://doi.org/'):
                doi_value = doi_value.replace('https://doi.org/', '')

            return OpenAlexResult(
                title=work_data.get('title', ''),
                authors=names,
                year=str(pub_year) if pub_year else "",
                abstract=text,
                doi=doi_value,
                citation_count=work_data.get('cited_by_count', 0),
                url=work_data.get('id', ''),  # the OpenAlex ID is itself a URL
            )
        except (KeyError, TypeError):
            return None

    def _reconstruct_abstract(self, inverted_index: dict) -> str:
        """
        Reconstruct abstract text from inverted index.

        OpenAlex stores abstracts in inverted index format:
        {"word": [position1, position2, ...], ...}
        """
        if not inverted_index:
            return ""

        try:
            length = 1 + max(max(positions) for positions in inverted_index.values())
            slots = [''] * length
            for token, positions in inverted_index.items():
                for position in positions:
                    slots[position] = token
            return ' '.join(token for token in slots if token)
        except (ValueError, TypeError):
            return ""
+ """ + + SEARCH_URL = "https://scholar.google.com/scholar" + RATE_LIMIT_DELAY = 10.0 # Conservative delay to avoid blocking (was 5.0) + MAX_RETRIES = 2 # Retry on failures + + USER_AGENTS = [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0', + ] + + def __init__(self): + self._last_request_time = 0.0 + self._session = requests.Session() + self._request_count = 0 + self._blocked = False # Track if we've been blocked + + def _rate_limit(self): + """Ensure rate limiting between requests.""" + elapsed = time.time() - self._last_request_time + # Add more randomness to avoid detection (3-5 seconds extra) + delay = self.RATE_LIMIT_DELAY + random.uniform(3, 5) + if elapsed < delay: + time.sleep(delay - elapsed) + self._last_request_time = time.time() + + def _get_headers(self) -> dict: + """Get request headers with random user agent.""" + return { + 'User-Agent': random.choice(self.USER_AGENTS), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + } + + def search(self, query: str, max_results: int = 5) -> list[ScholarResult]: + """ + Search Google Scholar. + + Returns list of search results. + Note: This may fail if blocked by Google. 
+ """ + # If we've been blocked, don't waste time + if self._blocked: + return [] + + self._rate_limit() + self._request_count += 1 + + params = { + 'q': query, + 'hl': 'en', + 'num': min(max_results, 10) # Scholar max is 10 per page + } + + try: + response = self._session.get( + self.SEARCH_URL, + params=params, + headers=self._get_headers(), + timeout=30 + ) + response.raise_for_status() + except requests.RequestException as e: + return [] + + # Check if we're blocked + if 'unusual traffic' in response.text.lower() or response.status_code == 429: + self._blocked = True + print(f"⚠️ Google Scholar blocked after {self._request_count} requests. Skipping further Scholar queries.") + return [] + + return self._parse_results(response.text, max_results) + + def search_by_title(self, title: str) -> Optional[ScholarResult]: + """Search for a specific paper by title.""" + # Use quotes for exact title match + query = f'"{title}"' + results = self.search(query, max_results=3) + + if not results: + # Try without quotes + results = self.search(title, max_results=5) + + return results[0] if results else None + + def _parse_results(self, html: str, max_results: int) -> list[ScholarResult]: + """Parse search results from HTML.""" + results = [] + soup = BeautifulSoup(html, 'lxml') + + # Find all result entries + entries = soup.find_all('div', class_='gs_ri') + + for entry in entries[:max_results]: + try: + result = self._parse_entry(entry) + if result: + results.append(result) + except Exception: + continue + + return results + + def _parse_entry(self, entry) -> Optional[ScholarResult]: + """Parse a single search result entry.""" + # Get title + title_elem = entry.find('h3', class_='gs_rt') + if not title_elem: + return None + + # Get title text and URL + title_link = title_elem.find('a') + if title_link: + title = title_link.get_text(strip=True) + url = title_link.get('href', '') + else: + title = title_elem.get_text(strip=True) + url = '' + + # Clean title (remove [PDF], [HTML] 
markers) + title = re.sub(r'^\[(PDF|HTML|BOOK|CITATION)\]\s*', '', title) + + # Get authors and year from the green line + meta_elem = entry.find('div', class_='gs_a') + authors = "" + year = "" + + if meta_elem: + meta_text = meta_elem.get_text(strip=True) + + # Extract year first + year_match = re.search(r'\b(19|20)\d{2}\b', meta_text) + if year_match: + year = year_match.group(0) + + # Parse authors more carefully + # Format is usually: "Author1, Author2 - Journal, Year - Publisher" + # or sometimes: "Author1, Author2 - Journal/Conference - Year" + parts = meta_text.split(' - ') + if parts: + author_part = parts[0].strip() + + # Clean up author field - remove year if it leaked in + if year: + # Remove year and anything after it from author field + author_part = re.sub(r',?\s*' + re.escape(year) + r'.*$', '', author_part) + + # Remove common journal/venue keywords that might have leaked + # Handle patterns like "the journal of", "the proceedings", etc. + author_part = re.sub(r'\s+the\s+(journal|proceedings|conference|symposium|workshop|transactions|magazine|review|annals)\s+.*$', '', author_part, flags=re.IGNORECASE) + + # Also handle without "the" prefix + author_part = re.sub(r'\s+(journal|proceedings|conference|symposium|workshop|transactions|magazine|review|annals)\s+.*$', '', author_part, flags=re.IGNORECASE) + + # Remove standalone "the" at the end (in case it's left over) + author_part = re.sub(r'\s+the\s*$', '', author_part, flags=re.IGNORECASE) + + # Remove trailing commas and whitespace + author_part = author_part.rstrip(', ').strip() + + authors = author_part + + # Get snippet + snippet_elem = entry.find('div', class_='gs_rs') + snippet = snippet_elem.get_text(strip=True) if snippet_elem else "" + + # Get cited by count + cited_by = 0 + cited_elem = entry.find('a', string=re.compile(r'Cited by \d+')) + if cited_elem: + match = re.search(r'Cited by (\d+)', cited_elem.get_text()) + if match: + cited_by = int(match.group(1)) + + return ScholarResult( + 
title=title, + authors=authors, + year=year, + snippet=snippet, + url=url, + cited_by=cited_by + ) diff --git a/src/fetchers/semantic_scholar_fetcher.py b/src/fetchers/semantic_scholar_fetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..8170a2e9a658d00bf9e5d82a165a5183365b316b --- /dev/null +++ b/src/fetchers/semantic_scholar_fetcher.py @@ -0,0 +1,172 @@ +""" +Semantic Scholar API fetcher. +Official API with high quality metadata and generous rate limits. +""" +import time +from dataclasses import dataclass +from typing import Optional + +import requests + + +@dataclass +class SemanticScholarResult: + """Search result from Semantic Scholar API.""" + title: str + authors: list[str] + year: str + abstract: str + paper_id: str + citation_count: int + url: str + + +class SemanticScholarFetcher: + """ + Fetcher using Semantic Scholar's official API. + + API Docs: https://api.semanticscholar.org/ + Rate Limits: + - Without API key: 100 requests per 5 minutes + - With API key: 5,000 requests per 5 minutes (free) + """ + + BASE_URL = "https://api.semanticscholar.org/graph/v1" + RATE_LIMIT_DELAY = 0.5 # Conservative delay (120 req/min max) + + def __init__(self, api_key: Optional[str] = None): + """ + Initialize Semantic Scholar fetcher. + + Args: + api_key: Optional API key for higher rate limits (free from semanticscholar.org) + """ + self.api_key = api_key + self._last_request_time = 0.0 + self._session = requests.Session() + + if api_key: + self._session.headers.update({'x-api-key': api_key}) + + def _rate_limit(self): + """Ensure rate limiting between requests.""" + elapsed = time.time() - self._last_request_time + if elapsed < self.RATE_LIMIT_DELAY: + time.sleep(self.RATE_LIMIT_DELAY - elapsed) + self._last_request_time = time.time() + + def search_by_title(self, title: str, max_results: int = 5) -> Optional[SemanticScholarResult]: + """ + Search for a paper by title. 
@dataclass
class SemanticScholarResult:
    """Search result from Semantic Scholar API."""
    title: str
    authors: list[str]
    year: str
    abstract: str
    paper_id: str
    citation_count: int
    url: str


class SemanticScholarFetcher:
    """
    Fetcher using Semantic Scholar's official API.

    API Docs: https://api.semanticscholar.org/
    Rate Limits:
        - Without API key: 100 requests per 5 minutes
        - With API key: 5,000 requests per 5 minutes (free)
    """

    BASE_URL = "https://api.semanticscholar.org/graph/v1"
    RATE_LIMIT_DELAY = 0.5  # Conservative delay (120 req/min max)

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize Semantic Scholar fetcher.

        Args:
            api_key: Optional API key for higher rate limits (free from semanticscholar.org)
        """
        self.api_key = api_key
        self._last_request_time = 0.0
        self._session = requests.Session()
        if api_key:
            self._session.headers.update({'x-api-key': api_key})

    def _rate_limit(self):
        """Sleep so consecutive requests stay RATE_LIMIT_DELAY apart."""
        remaining = self.RATE_LIMIT_DELAY - (time.time() - self._last_request_time)
        if remaining > 0:
            time.sleep(remaining)
        self._last_request_time = time.time()

    def search_by_title(self, title: str, max_results: int = 5) -> Optional[SemanticScholarResult]:
        """
        Search for a paper by title.

        Args:
            title: Paper title to search for
            max_results: Maximum number of results to fetch (default: 5)

        Returns:
            SemanticScholarResult if found, None otherwise
        """
        self._rate_limit()

        try:
            response = self._session.get(
                f"{self.BASE_URL}/paper/search",
                params={
                    'query': title,
                    'limit': max_results,
                    'fields': 'title,authors,year,abstract,paperId,citationCount,url'
                },
                timeout=10,
            )
            response.raise_for_status()
            hits = response.json().get('data', [])
            if not hits:
                return None
            # The API orders by relevance; take the top hit.
            return self._parse_paper(hits[0])
        except requests.RequestException:
            return None

    def fetch_by_doi(self, doi: str) -> Optional[SemanticScholarResult]:
        """
        Fetch paper metadata by DOI.

        Args:
            doi: DOI of the paper

        Returns:
            SemanticScholarResult if found, None otherwise
        """
        self._rate_limit()

        try:
            response = self._session.get(
                f"{self.BASE_URL}/paper/DOI:{doi}",
                params={'fields': 'title,authors,year,abstract,paperId,citationCount,url'},
                timeout=10,
            )
            response.raise_for_status()
            return self._parse_paper(response.json())
        except requests.RequestException:
            return None

    def fetch_by_arxiv_id(self, arxiv_id: str) -> Optional[SemanticScholarResult]:
        """
        Fetch paper metadata by arXiv ID.

        Args:
            arxiv_id: arXiv ID (e.g., "2301.12345" or "arXiv:2301.12345")

        Returns:
            SemanticScholarResult if found, None otherwise
        """
        self._rate_limit()

        # Clean arXiv ID (remove "arXiv:" prefix if present)
        clean_id = arxiv_id.replace('arXiv:', '')

        try:
            response = self._session.get(
                f"{self.BASE_URL}/paper/ARXIV:{clean_id}",
                params={'fields': 'title,authors,year,abstract,paperId,citationCount,url'},
                timeout=10,
            )
            response.raise_for_status()
            return self._parse_paper(response.json())
        except requests.RequestException:
            return None

    def _parse_paper(self, paper_data: dict) -> Optional[SemanticScholarResult]:
        """Convert a raw API paper object into a SemanticScholarResult."""
        try:
            # Keep only non-empty author names.
            names = [a.get('name', '') for a in paper_data.get('authors', []) if a.get('name', '')]

            raw_year = paper_data.get('year')  # may be None in the API

            return SemanticScholarResult(
                title=paper_data.get('title', ''),
                authors=names,
                year=str(raw_year) if raw_year else "",
                abstract=paper_data.get('abstract', ''),
                paper_id=paper_data.get('paperId', ''),
                citation_count=paper_data.get('citationCount', 0),
                url=paper_data.get('url', ''),
            )
        except (KeyError, TypeError):
            return None
diff --git a/src/parsers/__pycache__/__init__.cpython-313.pyc b/src/parsers/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a15dd1f751a12d6f5b7ff993742bb1678d571ec6 Binary files /dev/null and b/src/parsers/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/parsers/__pycache__/bib_parser.cpython-311.pyc b/src/parsers/__pycache__/bib_parser.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4282597b28154a0a8adb3ce685336f6c9d67131 Binary files /dev/null and b/src/parsers/__pycache__/bib_parser.cpython-311.pyc differ diff --git a/src/parsers/__pycache__/bib_parser.cpython-313.pyc b/src/parsers/__pycache__/bib_parser.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b404d8ab5625452ced6d60b2ddd63a2c1603767 Binary files /dev/null and b/src/parsers/__pycache__/bib_parser.cpython-313.pyc differ diff --git a/src/parsers/__pycache__/tex_parser.cpython-313.pyc b/src/parsers/__pycache__/tex_parser.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2fd788571f2145b5b053a9de4091a48329fa1e9 Binary files /dev/null and b/src/parsers/__pycache__/tex_parser.cpython-313.pyc differ diff --git a/src/parsers/bib_parser.py b/src/parsers/bib_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..48853ed09ed811d0c500a5e5790fbbdb86a270d4 --- /dev/null +++ b/src/parsers/bib_parser.py @@ -0,0 +1,273 @@ +""" +BibTeX file parser. 
+""" +import re +from dataclasses import dataclass, field +from typing import Optional +from pathlib import Path + +import bibtexparser +from bibtexparser.bparser import BibTexParser +from bibtexparser.customization import convert_to_unicode + + +@dataclass +class BibEntry: + """Represents a parsed bibliography entry.""" + key: str + entry_type: str + title: str = "" + author: str = "" + year: str = "" + abstract: str = "" + url: str = "" + doi: str = "" + arxiv_id: str = "" + journal: str = "" + booktitle: str = "" + publisher: str = "" + pages: str = "" + volume: str = "" + number: str = "" + raw_entry: dict = field(default_factory=dict) + + @property + def has_arxiv(self) -> bool: + """Check if entry has arXiv information.""" + return bool(self.arxiv_id) + + @property + def search_query(self) -> str: + """Get search query for this entry.""" + return self.title or self.key + + +class BibParser: + """Parser for .bib files.""" + + # Patterns for extracting arXiv IDs + ARXIV_PATTERNS = [ + # New format: 2301.00001 or 2301.00001v1 + r'(\d{4}\.\d{4,5}(?:v\d+)?)', + # Old format: hep-th/9901001 or math.GT/0309136 + r'([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)', + # arXiv: prefix + r'arXiv:(\d{4}\.\d{4,5}(?:v\d+)?)', + r'arXiv:([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)', + ] + + # URL patterns for arXiv + ARXIV_URL_PATTERNS = [ + r'arxiv\.org/abs/(\d{4}\.\d{4,5}(?:v\d+)?)', + r'arxiv\.org/abs/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)', + r'arxiv\.org/pdf/(\d{4}\.\d{4,5}(?:v\d+)?)(?:\.pdf)?', + r'arxiv\.org/pdf/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)(?:\.pdf)?', + ] + + def __init__(self): + self.entries: list[BibEntry] = [] + + def parse_file(self, filepath: str) -> list[BibEntry]: + """Parse a .bib file and return list of entries.""" + path = Path(filepath) + if not path.exists(): + raise FileNotFoundError(f"Bib file not found: {filepath}") + + with open(path, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + + return self.parse_content(content) + + def 
parse_content(self, content: str) -> list[BibEntry]: + """Parse bib content string.""" + parser = BibTexParser(common_strings=True) + parser.customization = convert_to_unicode + + try: + bib_database = bibtexparser.loads(content, parser=parser) + except Exception as e: + raise ValueError(f"Failed to parse bib content: {e}") + + self.entries = [] + for entry in bib_database.entries: + bib_entry = self._convert_entry(entry) + self.entries.append(bib_entry) + + return self.entries + + def _convert_entry(self, entry: dict) -> BibEntry: + """Convert a bibtexparser entry to BibEntry.""" + # Extract basic fields + bib_entry = BibEntry( + key=entry.get('ID', ''), + entry_type=entry.get('ENTRYTYPE', ''), + title=entry.get('title', ''), + author=entry.get('author', ''), + year=entry.get('year', ''), + abstract=entry.get('abstract', ''), + url=entry.get('url', ''), + doi=entry.get('doi', ''), + journal=entry.get('journal', ''), + booktitle=entry.get('booktitle', ''), + publisher=entry.get('publisher', ''), + pages=entry.get('pages', ''), + volume=entry.get('volume', ''), + number=entry.get('number', ''), + raw_entry=entry.copy() + ) + + # Extract arXiv ID + bib_entry.arxiv_id = self._extract_arxiv_id(entry) + + return bib_entry + + def _extract_arxiv_id(self, entry: dict) -> str: + """Extract arXiv ID from entry.""" + # Check eprint field first + eprint = entry.get('eprint', '') + if eprint: + arxiv_id = self._parse_arxiv_id(eprint) + if arxiv_id: + return arxiv_id + + # Check arxiv field + arxiv = entry.get('arxiv', '') + if arxiv: + arxiv_id = self._parse_arxiv_id(arxiv) + if arxiv_id: + return arxiv_id + + # Check URL field + url = entry.get('url', '') + if url: + for pattern in self.ARXIV_URL_PATTERNS: + match = re.search(pattern, url, re.IGNORECASE) + if match: + return match.group(1) + + # Check journal field for "arXiv preprint arXiv:XXXX.XXXXX" format + journal = entry.get('journal', '') + if journal and 'arxiv' in journal.lower(): + arxiv_id = 
self._parse_arxiv_id(journal) + if arxiv_id: + return arxiv_id + + # Check note field + note = entry.get('note', '') + if note: + arxiv_id = self._parse_arxiv_id(note) + if arxiv_id: + return arxiv_id + + return "" + + def _parse_arxiv_id(self, text: str) -> str: + """Parse arXiv ID from text.""" + for pattern in self.ARXIV_PATTERNS: + match = re.search(pattern, text) + if match: + return match.group(1) + return "" + + def get_entry_by_key(self, key: str) -> Optional[BibEntry]: + """Get entry by citation key.""" + for entry in self.entries: + if entry.key == key: + return entry + return None + + def filter_file(self, input_path: str, output_path: str, keys_to_keep: set[str]): + """ + Create a new bib file containing only specified keys. + Preserves original formatting, comments, and strings. + """ + with open(input_path, 'r', encoding='utf-8') as f: + content = f.read() + + filtered_content = self._filter_content(content, keys_to_keep) + + with open(output_path, 'w', encoding='utf-8') as f: + f.write(filtered_content) + + def _filter_content(self, content: str, keys_to_keep: set[str]) -> str: + """Filter content string keeping only specified keys.""" + ranges_to_remove = [] + i = 0 + length = len(content) + + while i < length: + if content[i] == '@': + start = i + # Find opening brace + brace_open = content.find('{', i) + if brace_open == -1: + i += 1 + continue + + # Get entry type + entry_type = content[i+1:brace_open].strip().lower() + + # Skip comments + if entry_type == 'comment': + i = brace_open + 1 + continue + + # Find matching closing brace to determine entry end + balance = 1 + j = brace_open + 1 + in_quote = False + + while j < length and balance > 0: + char = content[j] + + # Handle escaped characters + if char == '\\': + j += 2 + continue + + if char == '"': + in_quote = not in_quote + elif not in_quote: + if char == '{': + balance += 1 + elif char == '}': + balance -= 1 + j += 1 + + end = j + + # Extract key (between { and ,) + # Only for standard 
entries, not @string or @preamble + if entry_type not in ('string', 'preamble'): + # Find comma or end of entry + # Key is usually the first token after { + key_part = content[brace_open+1:end] + comma_pos = key_part.find(',') + + if comma_pos != -1: + key = key_part[:comma_pos].strip() + + # If key is NOT in keep list, mark for removal + if key not in keys_to_keep: + ranges_to_remove.append((start, end)) + + i = end + else: + i += 1 + + # Reconstruct content + new_content = [] + last_pos = 0 + for start, end in ranges_to_remove: + new_content.append(content[last_pos:start]) + + # Clean up whitespace after removed entry + last_pos = end + while last_pos < length and content[last_pos] in ' \t\r': + last_pos += 1 + if last_pos < length and content[last_pos] == '\n': + last_pos += 1 + + new_content.append(content[last_pos:]) + return "".join(new_content) + diff --git a/src/parsers/tex_parser.py b/src/parsers/tex_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..3f487b9d207636914ca92f5efcb1f98223b13272 --- /dev/null +++ b/src/parsers/tex_parser.py @@ -0,0 +1,200 @@ +""" +LaTeX file parser for citation extraction. 
+""" +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +@dataclass +class CitationContext: + """Represents a citation with its context.""" + key: str + line_number: int + command: str # e.g., \cite, \citep, \citet + context_before: str # Text before citation + context_after: str # Text after citation + full_context: str # Full surrounding context + raw_line: str # The raw line containing the citation + file_path: Optional[str] = None # Added + + +class TexParser: + """Parser for .tex files.""" + + # Citation command patterns + CITE_PATTERNS = [ + # Standard citation commands + r'\\cite(?:p|t|alp|alt|author|year|yearpar)?\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}', + # natbib commands + r'\\citep?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}', + r'\\citet?\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}', + # biblatex commands + r'\\(?:auto|text|paren|foot|super)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}', + r'\\(?:full|short)cite\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}', + ] + + # Compiled pattern for finding any citation + CITE_REGEX = re.compile( + r'\\(cite[a-z]*)\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}', + re.IGNORECASE + ) + + def __init__(self): + self.citations: dict[str, list[CitationContext]] = {} + self.all_keys: set[str] = set() + self.lines: list[str] = [] + self.content: str = "" + self.current_filepath: Optional[str] = None + + def parse_file(self, filepath: str) -> dict[str, list[CitationContext]]: + """Parse a .tex file and extract all citations.""" + path = Path(filepath) + if not path.exists(): + raise FileNotFoundError(f"TeX file not found: {filepath}") + + with open(path, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + + self.current_filepath = filepath + return self.parse_content(content) + + def parse_content(self, content: str) -> dict[str, list[CitationContext]]: + """Parse tex content and extract citations.""" + self.content = content + self.lines = 
content.split('\n') + self.citations = {} + self.all_keys = set() + + # Remove comments + content_no_comments = self._remove_comments(content) + + # Find all citations line by line + for line_num, line in enumerate(self.lines, 1): + # Skip comment lines + if line.strip().startswith('%'): + continue + + # Remove inline comments for matching + line_no_comment = re.sub(r'(? str: + """Remove LaTeX comments from content.""" + # Remove line comments (but keep escaped %) + lines = content.split('\n') + cleaned = [] + for line in lines: + # Remove inline comments + result = re.sub(r'(? dict: + """Extract surrounding context for a citation (sentences).""" + # Get a larger window of lines first to ensure we capture full sentences + start_line = max(0, line_num - 10) + end_line = min(len(self.lines), line_num + 10) + + # Combine lines into a single text block + raw_block = ' '.join(self.lines[start_line:end_line]) + + # Clean the block first to make sentence splitting easier + clean_block = self._clean_text(raw_block) + + # Find the citation in the clean block (approximation) + # Since we cleaned the text, we can't find the exact \cite command easily. + # Instead, we'll use the raw lines to find the citation index, then map to clean text. + # However, a simpler approach for LLM context is to just return the cleaned text + # centered around the line. + + # Better approach: + # 1. Get the raw line content + current_raw_line = self.lines[line_num - 1] + + # 2. Get surrounding lines + before_lines = self.lines[start_line:line_num - 1] + after_lines = self.lines[line_num:end_line] + + # 3. Clean everything + current_clean = self._clean_text(current_raw_line) + before_clean = self._clean_text(' '.join(before_lines)) + after_clean = self._clean_text(' '.join(after_lines)) + + # 4. Split into sentences (simple splitting by .!?) 
+ def split_sentences(text): + return re.split(r'(?<=[.!?])\s+', text) + + before_sentences = split_sentences(before_clean) + after_sentences = split_sentences(after_clean) + + # Take last N sentences from before + context_before = ' '.join(before_sentences[-context_sentences:]) if before_sentences else "" + + # Take first N sentences from after + context_after = ' '.join(after_sentences[:context_sentences]) if after_sentences else "" + + # Combine + full_context = f"{context_before} {current_clean} {context_after}".strip() + + return { + 'before': context_before, + 'after': context_after, + 'full': full_context + } + + def _clean_text(self, text: str) -> str: + """Clean LaTeX text for readability.""" + # Remove common LaTeX commands but keep text content + text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])*\s*', ' ', text) + # Remove braces + text = re.sub(r'[{}]', '', text) + # Normalize whitespace + text = re.sub(r'\s+', ' ', text) + return text.strip() + + def is_cited(self, key: str) -> bool: + """Check if a key is cited in the document.""" + return key in self.all_keys + + def get_citation_contexts(self, key: str) -> list[CitationContext]: + """Get all citation contexts for a key.""" + return self.citations.get(key, []) + + def get_all_cited_keys(self) -> set[str]: + """Get all citation keys found in the document.""" + return self.all_keys.copy() diff --git a/src/report/__init__.py b/src/report/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..078cdff2c97c5e94121741baa5a0a270660d45ba --- /dev/null +++ b/src/report/__init__.py @@ -0,0 +1,4 @@ +"""Report package""" +from .generator import ReportGenerator + +__all__ = ['ReportGenerator'] diff --git a/src/report/__pycache__/__init__.cpython-313.pyc b/src/report/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bec69e5de0fd5315c6a0c642dc565d685207de90 Binary files /dev/null and b/src/report/__pycache__/__init__.cpython-313.pyc differ 
diff --git a/src/report/__pycache__/generator.cpython-313.pyc b/src/report/__pycache__/generator.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e7df3494844c80e1581b16a5359eba1bd6653a2 Binary files /dev/null and b/src/report/__pycache__/generator.cpython-313.pyc differ diff --git a/src/report/__pycache__/line_report.cpython-313.pyc b/src/report/__pycache__/line_report.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38464dba5318b23773c196083cfaece9c6dcd917 Binary files /dev/null and b/src/report/__pycache__/line_report.cpython-313.pyc differ diff --git a/src/report/generator.py b/src/report/generator.py new file mode 100644 index 0000000000000000000000000000000000000000..73fe74c12642a85d0b6e1a1b04122daebeb1815c --- /dev/null +++ b/src/report/generator.py @@ -0,0 +1,785 @@ +""" +Report generator for bibliography check results. +""" +import re +from dataclasses import dataclass +from datetime import datetime +from typing import Optional, List +from pathlib import Path + +from ..parsers.bib_parser import BibEntry +from ..analyzers.metadata_comparator import ComparisonResult +from ..analyzers.usage_checker import UsageResult +from ..analyzers.llm_evaluator import EvaluationResult +from ..analyzers.duplicate_detector import DuplicateGroup +from ..checkers.base import CheckResult, CheckSeverity + + +@dataclass +class EntryReport: + """Complete report for a single bib entry.""" + entry: BibEntry + comparison: Optional[ComparisonResult] + usage: Optional[UsageResult] + evaluations: list[EvaluationResult] + + +class ReportGenerator: + """Generates formatted markdown reports.""" + + def __init__(self, minimal_verified: bool = False, check_preprint_ratio: bool = True, preprint_warning_threshold: float = 0.50): + self.entries: list[EntryReport] = [] + self.missing_citations: list[str] = [] + self.duplicate_groups: list[DuplicateGroup] | None = None # None means check not run + self.bib_files: list[str] = 
[] + self.tex_files: list[str] = [] + self.bib_file: str = "" # Keep for backward compatibility/single file + self.tex_file: str = "" # Keep for backward compatibility/single file + self.minimal_verified = minimal_verified # Whether to show minimal info for verified entries + self.submission_results: List[CheckResult] = [] # Submission quality check results + self.template = None # Conference template if used + self.check_preprint_ratio = check_preprint_ratio # Whether to check preprint ratio + self.preprint_warning_threshold = preprint_warning_threshold # Threshold for preprint warning + + + def add_entry_report(self, report: EntryReport): + """Add an entry report.""" + self.entries.append(report) + + def set_metadata(self, bib_files: str | list[str], tex_files: str | list[str]): + """Set source file information.""" + if isinstance(bib_files, str): + self.bib_files = [bib_files] + self.bib_file = bib_files + else: + self.bib_files = bib_files + self.bib_file = bib_files[0] if bib_files else "" + + if isinstance(tex_files, str): + self.tex_files = [tex_files] + self.tex_file = tex_files + else: + self.tex_files = tex_files + self.tex_file = tex_files[0] if tex_files else "" + + def set_missing_citations(self, missing: list[str]): + """Set list of citations without bib entries.""" + self.missing_citations = missing + + def set_duplicate_groups(self, groups: list[DuplicateGroup]): + """Set list of duplicate entry groups.""" + self.duplicate_groups = groups + + def set_submission_results(self, results: List[CheckResult], template=None): + """Set submission quality check results.""" + self.submission_results = results + self.template = template + + def generate(self) -> str: + """Generate the full markdown report.""" + lines = [] + + # Header + lines.extend(self._generate_header()) + lines.append("") + + # Disclaimer + lines.extend(self._generate_disclaimer()) + lines.append("") + + # Summary statistics + lines.extend(self._generate_summary()) + lines.append("") + + # 
⚠️ Critical Issues (Detailed) - Bibliography-related issues + lines.extend(self._generate_issues_section()) + lines.append("") + + # ✅ Verified Entries (Clean) + lines.extend(self._generate_verified_section()) + lines.append("") + + # 📋 Submission Quality Checks (LaTeX quality checks) + if self.submission_results: + lines.extend(self._generate_submission_section()) + lines.append("") + + # Footer + lines.extend(self._generate_footer()) + + return "\n".join(lines) + + def get_summary_stats(self) -> tuple[dict, dict]: + """Get summary statistics as dictionaries for console display (Issues only).""" + total = len(self.entries) + + # Bibliography issues breakdown + title_mismatches = 0 + author_mismatches = 0 + year_mismatches = 0 + low_relevance = 0 + unable_to_verify = 0 + + for e in self.entries: + # Metadata issues + if e.comparison: + if e.comparison.has_issues: + # Categorize issues + has_title = False + has_author = False + has_year = False + + for issue in e.comparison.issues: + if "Title mismatch" in issue: has_title = True + elif "Author mismatch" in issue: has_author = True + elif "Year mismatch" in issue: has_year = True + elif "Unable to find" in issue: unable_to_verify += 1 + + if has_title: title_mismatches += 1 + if has_author: author_mismatches += 1 + if has_year: year_mismatches += 1 + + # Relevance issues + if any(ev.relevance_score <= 2 for ev in e.evaluations): + low_relevance += 1 + + bib_stats = {} + if title_mismatches > 0: bib_stats["Title Mismatches"] = title_mismatches + if author_mismatches > 0: bib_stats["Author Mismatches"] = author_mismatches + if year_mismatches > 0: bib_stats["Year Mismatches"] = year_mismatches + if low_relevance > 0: bib_stats["Low Relevance"] = low_relevance + if unable_to_verify > 0: bib_stats["Unable to Verify"] = unable_to_verify + + if self.duplicate_groups: + bib_stats["Duplicate Groups"] = len(self.duplicate_groups) + + if self.missing_citations: + bib_stats["Missing Bib Entries"] = len(self.missing_citations) 
+ + unused = [e for e in self.entries if e.usage and not e.usage.is_used] + if unused: + bib_stats["Unused Entries"] = len(unused) + + # LaTeX stats - Group by precise Rule Names + latex_stats = {} + + # Rule mapping for professional display names + RULE_MAPPING = { + "Very long sentence": "Sentence Length (Critical)", + "Long sentence": "Sentence Length (Warning)", + "Possible Markdown bullet point": "Markdown Bullet Point", + "Possible Markdown numbered list": "Markdown Numbered List", + "Possible Markdown italic": "Markdown Italic", + "Possible Markdown bold": "Markdown Bold", + "Inconsistent hyphenation": "Hyphenation Inconsistency", + "Inconsistent spelling": "Spelling Inconsistency", + "Unreferenced figure": "Unreferenced Figure", + "Unreferenced table": "Unreferenced Table", + "Unreferenced section": "Unreferenced Section", + "Unreferenced label": "Unreferenced Label", + "Multiple blank lines": "Multiple Blank Lines", + "Citation from": "Old Citation (10+ years)", + "Hedging language": "Hedging/Vague Language", + "Redundant phrase": "Redundant Phrasing", + "Weak start with": "Weak Sentence Starter", + "Unescaped &": "Unescaped Special Character", + "Citation without non-breaking space": "Missing Non-breaking Space (~)", + "Mixed citation styles": "Mixed Citation Styles", + "Mixed inline math": "Mixed Math Notation", + "Appendix section": "Unreferenced Appendix", + "Missing space before unit": "Unit Spacing Issue" + } + + for r in self.submission_results: + if r.passed: + continue + + raw_msg = r.message + rule_name = "Unknown Rule" + + # Match against our professional rule names + matched = False + for pattern, official_name in RULE_MAPPING.items(): + if pattern in raw_msg: + rule_name = official_name + matched = True + break + + if not matched: + # Fallback: Clean the message (remove dynamic parts) + clean_msg = re.sub(r"\(.*?\)", "", raw_msg) + clean_msg = re.sub(r"'.*?'", "", clean_msg) + clean_msg = re.sub(r"\d+", "", clean_msg) + rule_name = 
clean_msg.split(":")[0].strip() + + if rule_name not in latex_stats: + latex_stats[rule_name] = 0 + latex_stats[rule_name] += 1 + + return bib_stats, latex_stats + + def generate_console_output(self) -> str: + """Generate console-friendly output (Summary + Issues only).""" + lines = [] + + # Summary statistics + lines.extend(self._generate_summary()) + lines.append("") + + # Critical Issues + lines.extend(self._generate_issues_section()) + lines.append("") + + return "\n".join(lines) + + def _generate_header(self) -> list[str]: + """Generate report header.""" + bib_names = ", ".join([f"`{Path(f).name}`" for f in self.bib_files]) if self.bib_files else "N/A" + tex_names = ", ".join([f"`{Path(f).name}`" for f in self.tex_files]) if self.tex_files else "N/A" + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + return [ + "# Bibliography Validation Report", + "", + f"**Generated:** {timestamp}", + "", + "| File Type | Filename |", + "|-----------|----------|", + f"| **Bib File(s)** | {bib_names} |", + f"| **TeX File(s)** | {tex_names} |" + ] + + def _generate_disclaimer(self) -> list[str]: + """Generate disclaimer section.""" + return [ + "> **⚠️ Disclaimer:** This report is generated by an automated tool. While BibGuard strives for accuracy, it may produce false positives or miss certain issues. **This tool cannot replace human review.** Please manually verify all reported issues before making changes to your bibliography." + ] + + def _generate_summary(self) -> list[str]: + """Generate summary statistics.""" + total = len(self.entries) + + # Check availability of results + has_metadata = any(e.comparison is not None for e in self.entries) + has_usage = any(e.usage is not None for e in self.entries) + has_eval = any(len(e.evaluations) > 0 for e in self.entries) + + # Calculate Verified/Issues + # Note: _is_verified depends on _has_issues. + # If a check wasn't run, it won't contribute to issues. 
+ verified = sum(1 for e in self.entries if self._is_verified(e)) + issues = sum(1 for e in self.entries if self._has_issues(e)) + + # Usage stats + if has_usage: + used = sum(1 for e in self.entries if e.usage and e.usage.is_used) + unused = total - used + used_str = str(used) + unused_str = str(unused) + missing_str = str(len(self.missing_citations)) + else: + used_str = "N/A" + unused_str = "N/A" + missing_str = "N/A" + + # Duplicate stats - show N/A if check wasn't run (duplicate_groups is None means not checked) + if self.duplicate_groups is None: + dup_str = "N/A" + else: + dup_str = str(len(self.duplicate_groups)) + + # Preprint detection (only if enabled) + preprint_str = "N/A" + preprint_warning = [] + if self.check_preprint_ratio and has_usage: + used_entries = [e for e in self.entries if e.usage and e.usage.is_used] + if used_entries: + preprint_count = sum(1 for e in used_entries if self._is_preprint(e.entry)) + preprint_ratio = preprint_count / len(used_entries) + preprint_str = f"{preprint_count} ({preprint_ratio:.1%})" + + # Warning if exceeds threshold + if preprint_ratio > self.preprint_warning_threshold: + preprint_warning = [ + "", + f"> ⚠️ **High Preprint Ratio Warning:** {preprint_ratio:.1%} of your used references are preprints (arXiv, bioRxiv, etc.). Consider replacing some with peer-reviewed publications if available." 
+ ] + + summary_lines = [ + "## 📊 Summary", + "", + "### 📚 Bibliography Statistics", + "", + "| Metric | Count |", + "|--------|-------|", + f"| **Total Entries** | {total} |", + f"| ✅ **Verified (Clean)** | {verified} |", + f"| ⚠️ **With Issues** | {issues} |", + f"| 📝 **Used in TeX** | {used_str} |", + f"| 🗑️ **Unused** | {unused_str} |", + f"| 🔄 **Duplicate Groups** | {dup_str} |", + f"| ❌ **Missing Bib Entries** | {missing_str} |", + f"| 📄 **Preprints (Used)** | {preprint_str} |", + ] + + # Add warning if needed + if preprint_warning: + summary_lines.extend(preprint_warning) + + summary_lines.extend([ + "", + "### 📋 LaTeX Quality Checks", + "", + self._get_submission_summary() + ]) + + return summary_lines + + def _is_preprint(self, entry: BibEntry) -> bool: + """Check if an entry is a preprint.""" + # Preprint indicators + preprint_keywords = [ + 'arxiv', 'biorxiv', 'medrxiv', 'ssrn', 'preprint', + 'openreview', 'techreport', 'technical report', 'working paper', + 'tech report', 'tech. 
report' + ] + + # Check entry type + if entry.entry_type.lower() in ['techreport', 'unpublished', 'misc']: + # Further check if it's actually a preprint + text_to_check = ' '.join([ + entry.journal.lower(), + entry.booktitle.lower(), + entry.publisher.lower(), + entry.entry_type.lower() + ]) + + if any(keyword in text_to_check for keyword in preprint_keywords): + return True + + # Check if arXiv ID exists + if entry.has_arxiv: + return True + + # Check journal/booktitle/publisher fields + venue_text = ' '.join([ + entry.journal.lower(), + entry.booktitle.lower(), + entry.publisher.lower() + ]) + + return any(keyword in venue_text for keyword in preprint_keywords) + + def _get_submission_summary(self) -> str: + """Generate submission quality summary table.""" + if not self.submission_results: + return "*No quality checks were performed.*" + + # Count by severity + error_count = sum(1 for r in self.submission_results if r.severity == CheckSeverity.ERROR) + warning_count = sum(1 for r in self.submission_results if r.severity == CheckSeverity.WARNING) + info_count = sum(1 for r in self.submission_results if r.severity == CheckSeverity.INFO) + + lines = [ + "| Severity | Count |", + "|----------|-------|", + f"| 🔴 **Errors** | {error_count} |", + f"| 🟡 **Warnings** | {warning_count} |", + f"| 🔵 **Suggestions** | {info_count} |" + ] + return "\n".join(lines) + + def _is_verified(self, entry: EntryReport) -> bool: + """Check if entry is clean (no issues).""" + return not self._has_issues(entry) + + def _has_issues(self, entry: EntryReport) -> bool: + """Check if entry has any issues.""" + # Metadata issues + if entry.comparison and entry.comparison.has_issues: + return True + # LLM issues (low relevance) + if any(ev.relevance_score <= 2 for ev in entry.evaluations): + return True + # NOTE: We don't include usage issues (unused) here because + # unused entries are already shown in the "Unused Entries" section + return False + + def _has_metadata_or_relevance_issues(self, 
entry: EntryReport) -> bool: + """Check if entry has metadata or relevance issues (excluding duplicate/unused).""" + # Metadata issues + if entry.comparison and entry.comparison.has_issues: + return True + # LLM issues (low relevance) + if any(ev.relevance_score <= 2 for ev in entry.evaluations): + return True + return False + + def _generate_issues_section(self) -> list[str]: + """Generate detailed section for entries with issues.""" + lines = ["## ⚠️ Critical Issues Detected", ""] + + has_any_issues = False + + # 1. Missing Citations + if self.missing_citations: + has_any_issues = True + lines.append("### ❌ Missing Bibliography Entries") + lines.append("The following keys are cited in the TeX file but missing from the .bib file:") + lines.append("") + for key in self.missing_citations: + lines.append(f"- `{key}`") + lines.append("") + + # 2. Duplicate Entries + if self.duplicate_groups: + has_any_issues = True + lines.append("### 🔄 Duplicate Entries") + for i, group in enumerate(self.duplicate_groups, 1): + lines.append(f"#### Group {i} (Similarity: {group.similarity_score:.0%})") + lines.append(f"**Reason:** {group.reason}") + lines.append("") + lines.append("| Key | Title | Year |") + lines.append("|-----|-------|------|") + for entry in group.entries: + lines.append(f"| `{entry.key}` | {entry.title} | {entry.year} |") + lines.append("") + + # 3. Unused Entries + unused = [e for e in self.entries if e.usage and not e.usage.is_used] + if unused: + has_any_issues = True + lines.append("### 🗑️ Unused Entries") + lines.append("The following entries are in the .bib file but NOT cited in the TeX file:") + lines.append("") + for e in unused: + lines.append(f"- `{e.entry.key}`: *{e.entry.title}*") + lines.append("") + + # 4. 
Metadata Mismatches & Low Relevance + issue_entries = [e for e in self.entries if self._has_metadata_or_relevance_issues(e)] + + if issue_entries: + has_any_issues = True + lines.append("### ⚠️ Metadata & Relevance Issues") + + for entry_report in issue_entries: + lines.extend(self._format_entry_detail(entry_report, is_verified=False)) + + if not has_any_issues: + lines.append("🎉 **No critical issues found!**") + + return lines + + def _generate_verified_section(self) -> list[str]: + """Generate section for verified entries.""" + lines = ["## ✅ Verified Entries", ""] + + verified = [e for e in self.entries if self._is_verified(e)] + + if not verified: + lines.append("_No verified entries found._") + return lines + + lines.append(f"Found **{len(verified)}** entries with correct metadata.") + lines.append("") + + # Use a collapsible details block for clean UI + lines.append("
") + lines.append("Click to view verified entries") + lines.append("") + + for entry_report in verified: + lines.extend(self._format_entry_detail(entry_report, minimal=self.minimal_verified, is_verified=True)) + + lines.append("
") + return lines + + def _format_entry_detail(self, report: EntryReport, minimal: bool = False, is_verified: bool = False) -> list[str]: + """Format a single entry report in Markdown.""" + entry = report.entry + comp = report.comparison + lines = [] + + # Title header - use checkmark for verified entries, warning for issues + icon = "✅" if is_verified else "⚠️" + lines.append(f"#### {icon} `{entry.key}`") + lines.append(f"**Title:** {entry.title}") + lines.append("") + + # Metadata Status + if comp: + status_icon = "✅" if comp.is_match else "❌" + lines.append(f"- **Metadata Status:** {status_icon} {comp.source.upper()} (Confidence: {comp.confidence:.1%})") + + if comp.has_issues and not minimal: + lines.append(" - **Discrepancies:**") + for issue in comp.issues: + # Format mismatch details nicely + if "Mismatch" in issue or "mismatch" in issue: + lines.append(f" - 🔴 {issue}") + if "Title" in issue: + lines.append(f" - **Bib:** `{comp.bib_title}`") + lines.append(f" - **Fetched:** `{comp.fetched_title}`") + elif "Author" in issue: + lines.append(f" - **Bib:** `{', '.join(comp.bib_authors)}`") + lines.append(f" - **Fetched:** `{', '.join(comp.fetched_authors)}`") + else: + lines.append(f" - 🔸 {issue}") + + # Relevance Status + if report.evaluations and not minimal: + lines.append("- **Relevance Analysis:**") + for eval_res in report.evaluations: + score_icon = "🟢" if eval_res.relevance_score >= 4 else ("🟡" if eval_res.relevance_score == 3 else "🔴") + lines.append(f" - {score_icon} **Score {eval_res.relevance_score}/5** ({eval_res.score_label})") + loc = [] + if eval_res.file_path: + loc.append(f"File: `{Path(eval_res.file_path).name}`") + if eval_res.line_number: + loc.append(f"Line {eval_res.line_number}") + if loc: + lines.append(f" - {' | '.join(loc)}") + lines.append(f" - *\"{eval_res.explanation}\"*") + + lines.append("") + lines.append("---") + lines.append("") + return lines + + def _generate_submission_section(self) -> list[str]: + """Generate section for 
submission quality check results.""" + lines = ["## 📋 Submission Quality Checks", ""] + + # Template info + if self.template: + lines.append(f"**Conference Template:** {self.template.name}") + lines.append(f"**Page Limit:** {self.template.page_limit_review} (review) / {self.template.page_limit_camera} (camera-ready)") + if self.template.mandatory_sections: + lines.append(f"**Required Sections:** {', '.join(self.template.mandatory_sections)}") + lines.append("") + + # Count by severity + errors = [r for r in self.submission_results if r.severity == CheckSeverity.ERROR and not r.passed] + warnings = [r for r in self.submission_results if r.severity == CheckSeverity.WARNING and not r.passed] + infos = [r for r in self.submission_results if r.severity == CheckSeverity.INFO and not r.passed] + + # Summary + if errors or warnings or infos: + lines.append("| Severity | Count |") + lines.append("|----------|-------|") + if errors: + lines.append(f"| 🔴 **Errors** | {len(errors)} |") + if warnings: + lines.append(f"| 🟡 **Warnings** | {len(warnings)} |") + if infos: + lines.append(f"| 🔵 **Suggestions** | {len(infos)} |") + lines.append("") + else: + lines.append("🎉 **No submission issues found!**") + lines.append("") + return lines + + # Group by checker + by_checker = {} + for result in self.submission_results: + if result.passed: + continue + if result.checker_name not in by_checker: + by_checker[result.checker_name] = [] + by_checker[result.checker_name].append(result) + + # Display errors first + if errors: + lines.append("### 🔴 Critical Errors") + lines.append("") + for result in errors: + lines.append(f"- **{result.message}**") + loc = [] + if result.file_path: + loc.append(f"File: `{Path(result.file_path).name}`") + if result.line_number: + loc.append(f"Line {result.line_number}") + if loc: + lines.append(f" - {' | '.join(loc)}") + if result.line_content: + lines.append(f" - `{result.line_content[:80]}`") + if result.suggestion: + lines.append(f" - 💡 
*{result.suggestion}*") + lines.append("") + + # Display warnings + if warnings: + lines.append("### 🟡 Warnings") + lines.append("") + for result in warnings: + lines.append(f"- {result.message}") + loc = [] + if result.file_path: + loc.append(f"File: `{Path(result.file_path).name}`") + if result.line_number: + loc.append(f"Line {result.line_number}") + if loc: + lines.append(f" - {' | '.join(loc)}") + if result.suggestion: + lines.append(f" - 💡 *{result.suggestion}*") + lines.append("") + + # Display suggestions (collapsible) + if infos: + lines.append("### 🔵 Suggestions") + lines.append("
") + lines.append("Click to view suggestions") + lines.append("") + for result in infos: + lines.append(f"- {result.message}") + loc = [] + if result.file_path: + loc.append(f"File: `{Path(result.file_path).name}`") + if result.line_number: + loc.append(f"Line {result.line_number}") + if loc: + lines.append(f" - {' | '.join(loc)}") + if result.suggestion: + lines.append(f" - 💡 *{result.suggestion}*") + lines.append("") + lines.append("
") + lines.append("") + + return lines + + def _generate_footer(self) -> list[str]: + """Generate report footer.""" + return [ + "", + "---", + f"Report generated by **BibGuard** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" + ] + + def save(self, filepath: str): + """Save report to file.""" + content = self.generate() + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + + def save_bibliography_report(self, filepath: str): + """Generate and save bibliography-only report (all bib-related checks).""" + lines = [] + + # Header + lines.append("# Bibliography Validation Report") + lines.append("") + lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + lines.append("") + bib_names = ", ".join([f"`{Path(f).name}`" for f in self.bib_files]) if self.bib_files else "N/A" + tex_names = ", ".join([f"`{Path(f).name}`" for f in self.tex_files]) if self.tex_files else "N/A" + lines.append("| File Type | Filename |") + lines.append("|-----------|----------|") + lines.append(f"| **Bib File(s)** | {bib_names} |") + lines.append(f"| **TeX File(s)** | {tex_names} |") + lines.append("") + + # Disclaimer + lines.extend(self._generate_disclaimer()) + lines.append("") + + # Summary - Bibliography only + total = len(self.entries) + verified = sum(1 for e in self.entries if self._is_verified(e)) + issues = sum(1 for e in self.entries if self._has_issues(e)) + + has_usage = any(e.usage is not None for e in self.entries) + if has_usage: + used = sum(1 for e in self.entries if e.usage and e.usage.is_used) + unused = total - used + used_str = str(used) + unused_str = str(unused) + missing_str = str(len(self.missing_citations)) + else: + used_str = "N/A" + unused_str = "N/A" + missing_str = "N/A" + + if self.duplicate_groups is None: + dup_str = "N/A" + else: + dup_str = str(len(self.duplicate_groups)) + + lines.append("## 📊 Summary") + lines.append("") + lines.append("| Metric | Count |") + lines.append("|--------|-------|") + lines.append(f"| 
**Total Entries** | {total} |") + lines.append(f"| ✅ **Verified (Clean)** | {verified} |") + lines.append(f"| ⚠️ **With Issues** | {issues} |") + lines.append(f"| 📝 **Used in TeX** | {used_str} |") + lines.append(f"| 🗑️ **Unused** | {unused_str} |") + lines.append(f"| 🔄 **Duplicate Groups** | {dup_str} |") + lines.append(f"| ❌ **Missing Bib Entries** | {missing_str} |") + lines.append("") + + # Issues section + lines.extend(self._generate_issues_section()) + lines.append("") + + # Verified entries + lines.extend(self._generate_verified_section()) + lines.append("") + + # Footer + lines.extend(self._generate_footer()) + + content = "\n".join(lines) + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + + def save_latex_quality_report(self, filepath: str, submission_results: List[CheckResult], template=None): + """Generate and save LaTeX quality report (all tex-related quality checks).""" + lines = [] + + # Header + lines.append("# LaTeX Quality Report") + lines.append("") + lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + lines.append("") + tex_names = ", ".join([f"`{Path(f).name}`" for f in self.tex_files]) if self.tex_files else "N/A" + lines.append(f"**TeX File(s):** {tex_names}") + lines.append("") + + if template: + lines.append(f"**Template:** {template.name}") + lines.append("") + + # Disclaimer + lines.append("> **⚠️ Note:** This report contains automated quality checks for your LaTeX document. 
Please review all suggestions carefully before making changes.") + lines.append("") + + # Summary + error_count = sum(1 for r in submission_results if r.severity == CheckSeverity.ERROR) + warning_count = sum(1 for r in submission_results if r.severity == CheckSeverity.WARNING) + info_count = sum(1 for r in submission_results if r.severity == CheckSeverity.INFO) + + lines.append("## 📊 Summary") + lines.append("") + lines.append("| Severity | Count |") + lines.append("|----------|-------|") + lines.append(f"| 🔴 **Errors** | {error_count} |") + lines.append(f"| 🟡 **Warnings** | {warning_count} |") + lines.append(f"| 🔵 **Suggestions** | {info_count} |") + lines.append("") + + # Detailed issues + self.submission_results = submission_results + self.template = template + lines.extend(self._generate_submission_section()) + lines.append("") + + # Footer + lines.append("---") + lines.append("") + lines.append(f"Report generated by **BibGuard** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + content = "\n".join(lines) + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + diff --git a/src/report/line_report.py b/src/report/line_report.py new file mode 100644 index 0000000000000000000000000000000000000000..c9bb3a84437284c284d6a21fa5ac2353c79705a1 --- /dev/null +++ b/src/report/line_report.py @@ -0,0 +1,254 @@ +""" +Line-by-line report generator. + +Generates a report that follows the TeX file structure, +showing issues in order of appearance in the document. 
+""" +import re +from typing import List, Dict, Tuple, Optional +from dataclasses import dataclass, field +from collections import defaultdict +from datetime import datetime +from pathlib import Path + +from ..checkers.base import CheckResult, CheckSeverity + + +@dataclass +class LineIssue: + """An issue associated with a specific line or range.""" + start_line: int + end_line: int + line_content: str + issues: List[CheckResult] = field(default_factory=list) + block_type: Optional[str] = None # 'figure', 'table', 'equation', etc. + + +class LineByLineReportGenerator: + """ + Generates a report organized by TeX file line order. + + Groups consecutive lines and special environments into blocks, + then outputs issues in the order they appear in the document. + """ + + # LaTeX environments that should be grouped as blocks + BLOCK_ENVIRONMENTS = [ + 'figure', 'figure*', 'table', 'table*', 'tabular', 'tabular*', + 'equation', 'equation*', 'align', 'align*', 'gather', 'gather*', + 'algorithm', 'algorithm2e', 'algorithmic', 'lstlisting', + 'verbatim', 'minted', 'tikzpicture', 'minipage', 'subfigure', + ] + + def __init__(self, tex_content: str, tex_path: str): + self.tex_content = tex_content + self.tex_path = tex_path + self.lines = tex_content.split('\n') + self.line_issues: Dict[int, List[CheckResult]] = defaultdict(list) + self.blocks: List[Tuple[int, int, str]] = [] # (start, end, env_type) + + # Parse block environments + self._parse_blocks() + + def _parse_blocks(self): + """Find all block environments in the TeX content.""" + for env in self.BLOCK_ENVIRONMENTS: + env_escaped = env.replace('*', r'\*') + pattern = re.compile( + rf'\\begin\{{{env_escaped}\}}.*?\\end\{{{env_escaped}\}}', + re.DOTALL + ) + + for match in pattern.finditer(self.tex_content): + start_line = self._pos_to_line(match.start()) + end_line = self._pos_to_line(match.end()) + self.blocks.append((start_line, end_line, env)) + + # Sort blocks by start line + self.blocks.sort(key=lambda x: x[0]) + + 
def _pos_to_line(self, pos: int) -> int: + """Convert character position to line number (1-indexed).""" + return self.tex_content[:pos].count('\n') + 1 + + def add_results(self, results: List[CheckResult]): + """Add check results to the line-by-line mapping.""" + for result in results: + if result.passed: + continue + + line_num = result.line_number or 0 + if line_num > 0: + self.line_issues[line_num].append(result) + + def _get_block_for_line(self, line_num: int) -> Optional[Tuple[int, int, str]]: + """Check if a line is part of a block environment.""" + for start, end, env_type in self.blocks: + if start <= line_num <= end: + return (start, end, env_type) + return None + + def _get_block_content(self, start: int, end: int) -> str: + """Get content for a block of lines.""" + block_lines = self.lines[start-1:end] + if len(block_lines) > 10: + # Truncate long blocks + return '\n'.join(block_lines[:5]) + '\n [...]\n' + '\n'.join(block_lines[-3:]) + return '\n'.join(block_lines) + + def _severity_icon(self, severity: CheckSeverity) -> str: + """Get icon for severity level.""" + icons = { + CheckSeverity.ERROR: '🔴', + CheckSeverity.WARNING: '🟡', + CheckSeverity.INFO: '🔵', + } + return icons.get(severity, '⚪') + + def generate(self) -> str: + """Generate the line-by-line report.""" + lines = [] + + # Header + lines.append("# BibGuard Line-by-Line Report") + lines.append("") + lines.append(f"**File:** `{Path(self.tex_path).name}`") + lines.append(f"**Generated at:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + lines.append("") + lines.append("---") + lines.append("") + + # Summary counts + error_count = sum(1 for issues in self.line_issues.values() + for r in issues if r.severity == CheckSeverity.ERROR) + warning_count = sum(1 for issues in self.line_issues.values() + for r in issues if r.severity == CheckSeverity.WARNING) + info_count = sum(1 for issues in self.line_issues.values() + for r in issues if r.severity == CheckSeverity.INFO) + + lines.append("## 📊 
Overview") + lines.append("") + lines.append(f"| 🔴 Errors | 🟡 Warnings | 🔵 Suggestions |") + lines.append(f"|:---------:|:-----------:|:--------------:|") + lines.append(f"| {error_count} | {warning_count} | {info_count} |") + lines.append("") + lines.append("---") + lines.append("") + + if not self.line_issues: + lines.append("🎉 **No issues found!**") + return '\n'.join(lines) + + # Process lines in order + lines.append("## 📝 Line-by-Line Details") + lines.append("") + + processed_lines = set() + sorted_line_nums = sorted(self.line_issues.keys()) + + for line_num in sorted_line_nums: + if line_num in processed_lines: + continue + + issues = self.line_issues[line_num] + if not issues: + continue + + # Check if this line is part of a block + block = self._get_block_for_line(line_num) + + if block: + start, end, env_type = block + + # Mark all lines in block as processed + for ln in range(start, end + 1): + processed_lines.add(ln) + + # Collect all issues in this block + block_issues = [] + for ln in range(start, end + 1): + if ln in self.line_issues: + block_issues.extend(self.line_issues[ln]) + + if block_issues: + lines.append(f"### 📦 `{env_type}` Environment (Lines {start}-{end})") + lines.append("") + lines.append("```latex") + lines.append(self._get_block_content(start, end)) + lines.append("```") + lines.append("") + + # Group issues by type + for issue in block_issues: + icon = self._severity_icon(issue.severity) + lines.append(f"- {icon} **{issue.message}**") + if issue.suggestion: + lines.append(f" - 💡 {issue.suggestion}") + + lines.append("") + else: + # Single line + processed_lines.add(line_num) + + # Use custom line_content from CheckResult if available, otherwise get from file + custom_content = None + for issue in issues: + if issue.line_content: + custom_content = issue.line_content + break + + line_content = custom_content if custom_content else ( + self.lines[line_num - 1] if line_num <= len(self.lines) else "" + ) + + lines.append(f"### Line 
{line_num}") + lines.append("") + lines.append("```latex") + lines.append(line_content) + lines.append("```") + lines.append("") + + for issue in issues: + icon = self._severity_icon(issue.severity) + lines.append(f"- {icon} **{issue.message}**") + if issue.suggestion: + lines.append(f" - 💡 {issue.suggestion}") + + lines.append("") + + # Footer + lines.append("---") + lines.append("") + lines.append("*Report generated by BibGuard*") + + return '\n'.join(lines) + + def save(self, filepath: str): + """Save report to file.""" + content = self.generate() + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + + +def generate_line_report( + tex_content: str, + tex_path: str, + results: List[CheckResult], + output_path: str +) -> str: + """ + Generate a line-by-line report from check results. + + Args: + tex_content: The TeX file content + tex_path: Path to the TeX file + results: List of check results from all checkers + output_path: Where to save the report + + Returns: + Path to the generated report + """ + generator = LineByLineReportGenerator(tex_content, tex_path) + generator.add_results(results) + generator.save(output_path) + return output_path diff --git a/src/templates/__init__.py b/src/templates/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ff0977111a09516f9941f54b60c44126d7f499e5 --- /dev/null +++ b/src/templates/__init__.py @@ -0,0 +1,4 @@ +"""Templates module for conference-specific submission requirements.""" +from .base_template import ConferenceTemplate, get_template, get_all_templates + +__all__ = ['ConferenceTemplate', 'get_template', 'get_all_templates'] diff --git a/src/templates/__pycache__/__init__.cpython-313.pyc b/src/templates/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bea3b9a05696703f73715f8a425c228cecc7cb1 Binary files /dev/null and b/src/templates/__pycache__/__init__.cpython-313.pyc differ diff --git 
# --- (next file in patch) src/templates/base_template.py ---
"""
Conference template definitions.

Each template contains conference-specific formatting requirements
and rules for paper submission quality checking.
"""
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from enum import Enum


class ConferenceField(Enum):
    """Research field categories."""
    NLP = "Natural Language Processing"
    CV = "Computer Vision"
    ML = "Machine Learning"


@dataclass
class ConferenceTemplate:
    """
    Template containing conference-specific submission requirements.

    Attributes:
        name: Full conference name (e.g., "ACL 2025")
        short_name: Short identifier (e.g., "acl")
        field: Research field category
        page_limit_review: Page limit for review submission (main content only)
        page_limit_camera: Page limit for camera-ready (main content only)
        double_blind: Whether the conference uses double-blind review
        caption_table_above: Whether table captions should be above
        caption_figure_below: Whether figure captions should be below
        mandatory_sections: List of required sections (e.g., ["Limitations"])
        optional_sections: List of encouraged sections
        style_package: Name of the LaTeX style package
        checkers: List of checker names to run for this template
        extra_rules: Additional conference-specific rules
    """
    name: str
    short_name: str
    field: ConferenceField
    page_limit_review: int
    page_limit_camera: int
    double_blind: bool = True
    caption_table_above: bool = True
    caption_figure_below: bool = True
    mandatory_sections: List[str] = field(default_factory=list)
    optional_sections: List[str] = field(default_factory=list)
    style_package: str = ""
    checkers: List[str] = field(default_factory=lambda: [
        'caption', 'reference', 'ai_artifacts', 'formatting', 'anonymization'
    ])
    extra_rules: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize the subset of fields used for display/export.

        Caption flags, style_package and extra_rules are intentionally
        not included here.
        """
        return {
            'name': self.name,
            'short_name': self.short_name,
            'field': self.field.value,
            'page_limit_review': self.page_limit_review,
            'page_limit_camera': self.page_limit_camera,
            'double_blind': self.double_blind,
            'mandatory_sections': self.mandatory_sections,
            'optional_sections': self.optional_sections,
            'checkers': self.checkers,
        }


# ============================================================================
# NLP Conferences (ACL, EMNLP, NAACL)
# ============================================================================

ACL_TEMPLATE = ConferenceTemplate(
    name="ACL 2025",
    short_name="acl",
    field=ConferenceField.NLP,
    page_limit_review=8,
    page_limit_camera=9,
    double_blind=True,
    mandatory_sections=["Limitations"],
    optional_sections=["Ethical Considerations"],
    style_package="acl2025",
    extra_rules={
        "format": "Two-column, A4 paper",
        "references": "Unlimited pages for references",
        "appendix": "Allowed after references, two-column format",
    }
)

EMNLP_TEMPLATE = ConferenceTemplate(
    name="EMNLP 2024",
    short_name="emnlp",
    field=ConferenceField.NLP,
    page_limit_review=8,
    page_limit_camera=9,
    double_blind=True,
    mandatory_sections=["Limitations"],
    optional_sections=["Ethics Statement"],
    style_package="emnlp2024",
    extra_rules={
        "format": "Two-column, single-spaced",
        "short_paper": "4 pages for short papers (5 camera-ready)",
    }
)

NAACL_TEMPLATE = ConferenceTemplate(
    name="NAACL 2025",
    short_name="naacl",
    field=ConferenceField.NLP,
    page_limit_review=8,
    page_limit_camera=9,
    double_blind=True,
    mandatory_sections=["Limitations"],
    optional_sections=["Ethics Statement"],
    style_package="naacl2025",
    extra_rules={
        "review_system": "ACL Rolling Review (ARR)",
        "format": "Two-column, A4 paper",
    }
)

# ============================================================================
# Computer Vision Conferences (CVPR, ICCV, ECCV)
# ============================================================================

CVPR_TEMPLATE = ConferenceTemplate(
    name="CVPR 2025",
    short_name="cvpr",
    field=ConferenceField.CV,
    page_limit_review=8,
    page_limit_camera=8,  # No extra page for camera-ready
    double_blind=True,
    mandatory_sections=[],
    optional_sections=[],
    style_package="cvpr",
    extra_rules={
        "strict_anonymity": "No links to websites that reveal identity",
        "supplementary": "Separate PDF allowed, no page limit",
        "references": "No limit on references",
    }
)

ICCV_TEMPLATE = ConferenceTemplate(
    name="ICCV 2025",
    short_name="iccv",
    field=ConferenceField.CV,
    page_limit_review=8,
    page_limit_camera=8,
    double_blind=True,
    mandatory_sections=[],
    optional_sections=[],
    style_package="iccv",
    extra_rules={
        "format": "Two-column, 10pt Times font",
        "supplementary": "Optional PDF for extra material",
    }
)

ECCV_TEMPLATE = ConferenceTemplate(
    name="ECCV 2024",
    short_name="eccv",
    field=ConferenceField.CV,
    page_limit_review=14,
    page_limit_camera=14,
    double_blind=True,
    mandatory_sections=[],
    optional_sections=[],
    style_package="eccv",
    extra_rules={
        "format": "Springer LNCS format",
        "template": "Do not use TIMES font, use default template font",
        "headings": "Capitalize except articles/prepositions/conjunctions",
    }
)

# ============================================================================
# Machine Learning Conferences (NeurIPS, ICML, ICLR)
# ============================================================================

NEURIPS_TEMPLATE = ConferenceTemplate(
    name="NeurIPS 2025",
    short_name="neurips",
    field=ConferenceField.ML,
    page_limit_review=9,
    page_limit_camera=10,
    double_blind=True,
    mandatory_sections=["Paper Checklist"],
    optional_sections=["Broader Impact"],
    style_package="neurips_2025",
    extra_rules={
        "checklist": "NeurIPS paper checklist is MANDATORY, desk reject without it",
        "appendix": "Technical appendix after checklist, no page limit",
        "format": "Single PDF including main content, references, and checklist",
    }
)

ICML_TEMPLATE = ConferenceTemplate(
    name="ICML 2025",
    short_name="icml",
    field=ConferenceField.ML,
    page_limit_review=8,
    page_limit_camera=9,
    double_blind=True,
    mandatory_sections=["Impact Statement"],  # Required for camera-ready
    optional_sections=["Acknowledgments"],
    style_package="icml2025",
    extra_rules={
        "font": "10 point Times, embedded Type-1 fonts only",
        "lay_summary": "Plain language summary required for accepted papers",
        "format": "Use pdflatex for best results",
    }
)

ICLR_TEMPLATE = ConferenceTemplate(
    name="ICLR 2025",
    short_name="iclr",
    field=ConferenceField.ML,
    page_limit_review=10,
    page_limit_camera=10,
    double_blind=True,
    mandatory_sections=[],
    optional_sections=["Ethics Statement", "Reproducibility Statement"],
    style_package="iclr2025_conference",
    extra_rules={
        "format": "10pt Times New Roman, 11pt vertical spacing",
        "submission": "Via OpenReview",
        "min_pages": "Main text must be between 6 and 10 pages",
    }
)

# ============================================================================
# Template Registry
# ============================================================================

TEMPLATE_REGISTRY: Dict[str, ConferenceTemplate] = {
    # NLP
    'acl': ACL_TEMPLATE,
    'emnlp': EMNLP_TEMPLATE,
    'naacl': NAACL_TEMPLATE,
    # CV
    'cvpr': CVPR_TEMPLATE,
    'iccv': ICCV_TEMPLATE,
    'eccv': ECCV_TEMPLATE,
    # ML
    'neurips': NEURIPS_TEMPLATE,
    'icml': ICML_TEMPLATE,
    'iclr': ICLR_TEMPLATE,
}


def get_template(name: str) -> Optional[ConferenceTemplate]:
    """Get a conference template by short name.

    Lookup is case-insensitive and tolerates surrounding whitespace
    (e.g. from user input); returns None for unknown names.
    """
    return TEMPLATE_REGISTRY.get(name.strip().lower())


def get_all_templates() -> Dict[str, ConferenceTemplate]:
    """Get all available templates (a shallow copy of the registry)."""
    return TEMPLATE_REGISTRY.copy()


def get_templates_by_field(field: ConferenceField) -> List[ConferenceTemplate]:
    """Get templates filtered by research field, in registry order."""
    return [t for t in TEMPLATE_REGISTRY.values() if t.field == field]


# --- (next file in patch) src/ui/__init__.py ---
# Preserved as comments so this reconstructed chunk stays importable in
# isolation (the relative imports require the full package):
#   """UI module for BibGuard terminal interfaces."""
#   from .workflow_editor import WorkflowEditor
#   from .template_selector import TemplateSelector
#
#   __all__ = ['WorkflowEditor', 'TemplateSelector']
#
# (binary file in patch, not reproduced: src/ui/__pycache__/__init__.cpython-313.pyc)
0000000000000000000000000000000000000000..9bd2ea8343637c994e457e05040eda25af3cb0fe Binary files /dev/null and b/src/ui/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/ui/__pycache__/template_selector.cpython-313.pyc b/src/ui/__pycache__/template_selector.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae6532abad95d6e03be3fb6976bf1110fe17326d Binary files /dev/null and b/src/ui/__pycache__/template_selector.cpython-313.pyc differ diff --git a/src/ui/__pycache__/workflow_editor.cpython-313.pyc b/src/ui/__pycache__/workflow_editor.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81b4ec5be6d7d659ee63902d7ebef17fc82c2fd6 Binary files /dev/null and b/src/ui/__pycache__/workflow_editor.cpython-313.pyc differ diff --git a/src/ui/template_selector.py b/src/ui/template_selector.py new file mode 100644 index 0000000000000000000000000000000000000000..0a011b66ca21b1e91daeb632fdbe3872edea6e88 --- /dev/null +++ b/src/ui/template_selector.py @@ -0,0 +1,186 @@ +""" +Interactive template selector for conference presets. + +Provides a terminal UI for selecting a conference template +with information about requirements and rules. 
+""" +from typing import Optional + +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from rich.prompt import Prompt +from rich.columns import Columns +from rich.text import Text + +from ..templates.base_template import ( + ConferenceTemplate, + get_template, + get_all_templates, + ConferenceField +) + + +class TemplateSelector: + """Interactive terminal selector for conference templates.""" + + def __init__(self): + self.console = Console() + self.templates = get_all_templates() + + def display_templates(self): + """Display all available templates grouped by field.""" + self.console.clear() + + # Header + self.console.print(Panel( + "[bold blue]🎓 Conference Template Selector[/bold blue]\n" + "[dim]Choose a conference template for submission checks[/dim]", + border_style="blue" + )) + self.console.print() + + # Group by field + fields = { + ConferenceField.NLP: ("🗣️ NLP Conferences", []), + ConferenceField.CV: ("👁️ Computer Vision Conferences", []), + ConferenceField.ML: ("🧠 Machine Learning Conferences", []), + } + + for template in self.templates.values(): + fields[template.field][1].append(template) + + # Display each field + for field_enum, (title, templates) in fields.items(): + self.console.print(f"[bold cyan]{title}[/bold cyan]") + + table = Table(show_header=True, header_style="bold", box=None, padding=(0, 2)) + table.add_column("ID", style="yellow", width=10) + table.add_column("Conference", width=15) + table.add_column("Pages", width=12) + table.add_column("Key Requirements", style="dim") + + for template in templates: + pages = f"{template.page_limit_review}→{template.page_limit_camera}" + requirements = [] + if template.mandatory_sections: + requirements.append(f"Required: {', '.join(template.mandatory_sections)}") + if template.extra_rules: + first_rule = list(template.extra_rules.values())[0] + requirements.append(first_rule[:50]) + + table.add_row( + template.short_name, + template.name, + pages, + " | 
".join(requirements) if requirements else "Standard format" + ) + + self.console.print(table) + self.console.print() + + def display_template_details(self, template: ConferenceTemplate): + """Display detailed information about a template.""" + self.console.print() + self.console.print(Panel( + f"[bold]{template.name}[/bold]", + border_style="cyan" + )) + + # Basic info + info = Table(show_header=False, box=None, padding=(0, 2)) + info.add_column("Label", style="dim") + info.add_column("Value") + + info.add_row("Style Package", f"[cyan]{template.style_package}[/cyan]") + info.add_row("Page Limit (Review)", f"[yellow]{template.page_limit_review}[/yellow] pages") + info.add_row("Page Limit (Camera)", f"[green]{template.page_limit_camera}[/green] pages") + info.add_row("Double-Blind", "✓ Yes" if template.double_blind else "✗ No") + + if template.mandatory_sections: + info.add_row("Mandatory Sections", ", ".join(template.mandatory_sections)) + if template.optional_sections: + info.add_row("Optional Sections", ", ".join(template.optional_sections)) + + self.console.print(info) + + # Extra rules + if template.extra_rules: + self.console.print() + self.console.print("[bold]Special Requirements:[/bold]") + for key, value in template.extra_rules.items(): + self.console.print(f" • [dim]{key}:[/dim] {value}") + + self.console.print() + + def run(self) -> Optional[ConferenceTemplate]: + """Run the interactive selector and return the chosen template.""" + while True: + self.display_templates() + + # Get user input + choice = Prompt.ask( + "[bold]Enter template ID (or 'q' to quit, 'd ' for details)[/bold]", + default="q" + ) + + if choice.lower() == 'q': + return None + + # Handle details command + if choice.lower().startswith('d '): + template_id = choice[2:].strip().lower() + template = get_template(template_id) + if template: + self.display_template_details(template) + Prompt.ask("Press Enter to continue") + else: + self.console.print(f"[red]Unknown template: 
{template_id}[/red]") + Prompt.ask("Press Enter to continue") + continue + + # Try to get template + template = get_template(choice) + if template: + self.console.print(f"[green]✓ Selected: {template.name}[/green]") + return template + else: + self.console.print(f"[red]Unknown template: {choice}[/red]") + self.console.print("[dim]Available: " + ", ".join(self.templates.keys()) + "[/dim]") + Prompt.ask("Press Enter to continue") + + +def launch_template_selector() -> Optional[ConferenceTemplate]: + """Launch the template selector and return the chosen template.""" + selector = TemplateSelector() + return selector.run() + + +def list_templates(console: Console = None): + """Print a simple list of available templates.""" + if console is None: + console = Console() + + console.print("\n[bold]Available Conference Templates:[/bold]\n") + + templates = get_all_templates() + + # Group by field + by_field = {} + for t in templates.values(): + if t.field not in by_field: + by_field[t.field] = [] + by_field[t.field].append(t) + + field_names = { + ConferenceField.NLP: "NLP", + ConferenceField.CV: "Computer Vision", + ConferenceField.ML: "Machine Learning", + } + + for field, field_templates in by_field.items(): + console.print(f"[cyan]{field_names[field]}:[/cyan]") + for t in field_templates: + console.print(f" • [yellow]{t.short_name:8}[/yellow] - {t.name} ({t.page_limit_review}/{t.page_limit_camera} pages)") + + console.print() diff --git a/src/ui/workflow_editor.py b/src/ui/workflow_editor.py new file mode 100644 index 0000000000000000000000000000000000000000..0a5e832d373da660e49cd7353e6fb35d67e9d7dc --- /dev/null +++ b/src/ui/workflow_editor.py @@ -0,0 +1,164 @@ +""" +Interactive workflow editor for reference checking configuration. + +Provides a terminal-based UI using rich for customizing the order +and enabled state of fetchers in the verification workflow. 
+""" +from typing import Optional +from pathlib import Path + +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from rich.prompt import Prompt, Confirm +from rich.text import Text + +from ..config.workflow import WorkflowConfig, get_default_workflow + + +class WorkflowEditor: + """Interactive terminal editor for workflow configuration.""" + + def __init__(self, config: Optional[WorkflowConfig] = None): + self.console = Console() + self.config = config or get_default_workflow() + self.selected_index = 0 + self.modified = False + + def display_workflow(self): + """Display current workflow configuration as a table.""" + self.console.clear() + + # Header + self.console.print(Panel( + "[bold blue]📋 Reference Check Workflow Editor[/bold blue]\n" + "[dim]Customize the order and sources for metadata verification[/dim]", + border_style="blue" + )) + + # Instructions + self.console.print() + self.console.print("[dim]Commands: [cyan]u[/cyan]=move up, [cyan]d[/cyan]=move down, " + "[cyan]t[/cyan]=toggle, [cyan]s[/cyan]=save, [cyan]r[/cyan]=reset, [cyan]q[/cyan]=quit[/dim]") + self.console.print() + + # Workflow table + table = Table(show_header=True, header_style="bold magenta", box=None) + table.add_column("#", style="dim", width=3) + table.add_column("Status", width=8) + table.add_column("Source", width=25) + table.add_column("Description", style="dim") + + for i, step in enumerate(self.config.steps): + # Highlight selected row + row_style = "reverse" if i == self.selected_index else "" + + # Status indicator + if step.enabled: + status = "[green]✓ ON[/green]" + else: + status = "[red]✗ OFF[/red]" + + # Priority number + priority = f"{i + 1}" + + table.add_row( + priority, + status, + step.display_name, + step.description, + style=row_style + ) + + self.console.print(table) + self.console.print() + + # Current selection info + if 0 <= self.selected_index < len(self.config.steps): + step = self.config.steps[self.selected_index] + 
info = Text() + info.append("Selected: ", style="dim") + info.append(step.display_name, style="cyan bold") + info.append(f" (search type: {step.search_type})", style="dim") + self.console.print(info) + + if self.modified: + self.console.print("[yellow]* Unsaved changes[/yellow]") + + def run(self) -> WorkflowConfig: + """Run the interactive editor loop.""" + while True: + self.display_workflow() + + # Get user input + try: + cmd = Prompt.ask( + "\n[bold]Enter command[/bold]", + choices=["u", "d", "t", "s", "r", "q", "1", "2", "3", "4", "5", "6", "7", "8"], + default="q", + show_choices=False + ) + except KeyboardInterrupt: + cmd = "q" + + if cmd == "q": + if self.modified: + if Confirm.ask("Discard unsaved changes?", default=False): + break + else: + break + elif cmd == "u": + if self.config.move_step_up(self.selected_index): + self.selected_index -= 1 + self.modified = True + elif cmd == "d": + if self.config.move_step_down(self.selected_index): + self.selected_index += 1 + self.modified = True + elif cmd == "t": + self.config.toggle_step(self.selected_index) + self.modified = True + elif cmd == "s": + self._save_workflow() + elif cmd == "r": + if Confirm.ask("Reset to default workflow?", default=False): + self.config = get_default_workflow() + self.selected_index = 0 + self.modified = True + elif cmd.isdigit(): + num = int(cmd) + if 1 <= num <= len(self.config.steps): + self.selected_index = num - 1 + + return self.config + + def _save_workflow(self): + """Save workflow configuration to file.""" + default_path = Path.home() / ".bibguard" / "workflow.json" + + path_str = Prompt.ask( + "Save to", + default=str(default_path) + ) + + try: + self.config.save(path_str) + self.console.print(f"[green]✓ Saved to {path_str}[/green]") + self.modified = False + except Exception as e: + self.console.print(f"[red]✗ Failed to save: {e}[/red]") + + Prompt.ask("Press Enter to continue") + + +def launch_workflow_editor(config_path: Optional[str] = None) -> WorkflowConfig: + 
"""Launch the workflow editor and return the resulting configuration.""" + config = None + if config_path: + try: + config = WorkflowConfig.load(config_path) + except FileNotFoundError: + pass + + editor = WorkflowEditor(config) + return editor.run() diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4598f4578edc81e5e74eaccddea165bd356d5be3 --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1,5 @@ +"""Utilities package""" +from .normalizer import TextNormalizer +from .progress import ProgressDisplay + +__all__ = ['TextNormalizer', 'ProgressDisplay'] diff --git a/src/utils/__pycache__/__init__.cpython-313.pyc b/src/utils/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9fa3d7bac9268b90e0eb75a748c60a6968a18742 Binary files /dev/null and b/src/utils/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/utils/__pycache__/cache.cpython-313.pyc b/src/utils/__pycache__/cache.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f38d16fda753fc16d22b7ff926ce8ba1bdd40cb Binary files /dev/null and b/src/utils/__pycache__/cache.cpython-313.pyc differ diff --git a/src/utils/__pycache__/logger.cpython-313.pyc b/src/utils/__pycache__/logger.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb8aab83bbd5eb9dd620dcef4ecbde88bb73e480 Binary files /dev/null and b/src/utils/__pycache__/logger.cpython-313.pyc differ diff --git a/src/utils/__pycache__/normalizer.cpython-313.pyc b/src/utils/__pycache__/normalizer.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbddfbdb1c6abb30fffd5a6007d5c39cdcb1e4c4 Binary files /dev/null and b/src/utils/__pycache__/normalizer.cpython-313.pyc differ diff --git a/src/utils/__pycache__/progress.cpython-313.pyc b/src/utils/__pycache__/progress.cpython-313.pyc new file mode 100644 index 
"""
Text normalization utilities for comparing bibliography entries.
"""
import re
import unicodedata


class TextNormalizer:
    """Utility class for normalizing text for comparison.

    Stateless namespace of classmethods for stripping LaTeX markup,
    folding Unicode to ASCII, normalizing author names, and computing
    simple string-similarity scores.
    """

    # LaTeX formatting commands whose argument should be kept verbatim.
    LATEX_COMMANDS = [
        (r'\\textbf\{([^}]*)\}', r'\1'),
        (r'\\textit\{([^}]*)\}', r'\1'),
        (r'\\emph\{([^}]*)\}', r'\1'),
        (r'\\textrm\{([^}]*)\}', r'\1'),
        (r'\\texttt\{([^}]*)\}', r'\1'),
        (r'\\textsf\{([^}]*)\}', r'\1'),
        (r'\\textsc\{([^}]*)\}', r'\1'),
        (r'\\text\{([^}]*)\}', r'\1'),
        (r'\\mathrm\{([^}]*)\}', r'\1'),
        (r'\\mathbf\{([^}]*)\}', r'\1'),
        (r'\\mathit\{([^}]*)\}', r'\1'),
        (r'\\url\{([^}]*)\}', r'\1'),
        (r'\\href\{[^}]*\}\{([^}]*)\}', r'\1'),
    ]

    # LaTeX special-character escapes mapped to plain equivalents.
    # NOTE: replacements are applied in insertion order via str.replace,
    # so longer sequences must precede their prefixes: '---' (em dash)
    # must come before '--' (en dash), and "''" before "'".
    LATEX_CHARS = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'\~': '~',
        r'\^': '^',
        r'``': '"',
        r"''": '"',
        r'`': "'",
        r"'": "'",
        r'---': '—',  # em dash (must precede '--')
        r'--': '–',   # en dash
    }

    # LaTeX accent commands: the accent is dropped, keeping the base letter.
    # The circumflex patterns escape '^' (r'\\\^'), because a bare '^' in a
    # regular expression is a start-of-string anchor and would never match
    # after the leading backslash.
    LATEX_ACCENTS = [
        (r"\\'([aeiouAEIOU])", r'\1'),   # acute
        (r'\\`([aeiouAEIOU])', r'\1'),   # grave
        (r'\\\^([aeiouAEIOU])', r'\1'),  # circumflex
        (r'\\"([aeiouAEIOU])', r'\1'),   # umlaut
        (r'\\~([nNaAoO])', r'\1'),       # tilde
        (r'\\c\{([cC])\}', r'\1'),       # cedilla
        (r"\\'{([aeiouAEIOU])}", r'\1'),
        (r'\\`{([aeiouAEIOU])}', r'\1'),
        (r'\\\^{([aeiouAEIOU])}', r'\1'),
        (r'\\"{([aeiouAEIOU])}', r'\1'),
        (r'\\~{([nNaAoO])}', r'\1'),
    ]

    @classmethod
    def normalize_latex(cls, text: str) -> str:
        """Remove LaTeX formatting commands, accents, and escapes."""
        if not text:
            return ""

        result = text

        # Remove LaTeX commands (keep their arguments)
        for pattern, replacement in cls.LATEX_COMMANDS:
            result = re.sub(pattern, replacement, result)

        # Handle LaTeX accents (must run before LATEX_CHARS strips '\^'/'\~')
        for pattern, replacement in cls.LATEX_ACCENTS:
            result = re.sub(pattern, replacement, result)

        # Replace LaTeX special characters
        for latex_char, normal_char in cls.LATEX_CHARS.items():
            result = result.replace(latex_char, normal_char)

        # Remove remaining braces
        result = re.sub(r'[{}]', '', result)

        return result

    @classmethod
    def normalize_unicode(cls, text: str) -> str:
        """Normalize Unicode characters to ASCII."""
        if not text:
            return ""

        # Decompose accented characters first
        text = unicodedata.normalize('NFKD', text)
        try:
            # Imported lazily so the module stays usable when the optional
            # unidecode package is absent (degrades to plain ASCII folding).
            from unidecode import unidecode
        except ImportError:
            return text.encode('ascii', 'ignore').decode('ascii')
        return unidecode(text)

    @classmethod
    def normalize_whitespace(cls, text: str) -> str:
        """Collapse runs of whitespace to single spaces and strip the ends."""
        if not text:
            return ""

        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @classmethod
    def remove_punctuation(cls, text: str) -> str:
        """Remove punctuation for comparison (keep word chars and spaces)."""
        if not text:
            return ""

        return re.sub(r'[^\w\s]', '', text)

    @classmethod
    def normalize_for_comparison(cls, text: str) -> str:
        """
        Full normalization pipeline for text comparison.

        Steps:
        1. Remove LaTeX formatting
        2. Normalize Unicode to ASCII
        3. Convert to lowercase
        4. Normalize whitespace
        5. Remove punctuation
        """
        if not text:
            return ""

        text = cls.normalize_latex(text)
        text = cls.normalize_unicode(text)
        text = text.lower()
        text = cls.normalize_whitespace(text)
        text = cls.remove_punctuation(text)
        return text

    @classmethod
    def normalize_author_name(cls, name: str) -> str:
        """
        Normalize author name format.
        Handles: "Last, First" and "First Last" formats.
        Returns: normalized "first last" format.
        """
        if not name:
            return ""

        name = cls.normalize_latex(name)
        name = cls.normalize_unicode(name)
        name = cls.normalize_whitespace(name)

        # Handle "Last, First" format by swapping around the first comma
        if ',' in name:
            parts = name.split(',', 1)
            if len(parts) == 2:
                name = f"{parts[1].strip()} {parts[0].strip()}"

        name = name.lower()
        name = cls.remove_punctuation(name)
        return name

    @classmethod
    def normalize_author_list(cls, authors: str) -> list[str]:
        """
        Parse and normalize a list of authors.
        Handles "and" as separator and "Last, First" format.
        """
        if not authors:
            return []

        # Split by " and " (case-insensitive), the BibTeX author separator
        author_list = re.split(r'\s+and\s+', authors, flags=re.IGNORECASE)

        normalized = []
        for author in author_list:
            normalized_name = cls.normalize_author_name(author.strip())
            if normalized_name:
                normalized.append(normalized_name)

        return normalized

    @classmethod
    def similarity_ratio(cls, text1: str, text2: str) -> float:
        """
        Calculate similarity ratio between two strings.
        Uses simple word-based Jaccard similarity (|A∩B| / |A∪B|).
        """
        if not text1 or not text2:
            return 0.0

        words1 = set(text1.split())
        words2 = set(text2.split())

        # Whitespace-only inputs produce empty word sets; guard both cases
        if not words1 and not words2:
            return 1.0
        if not words1 or not words2:
            return 0.0

        intersection = words1 & words2
        union = words1 | words2

        return len(intersection) / len(union)

    @classmethod
    def levenshtein_similarity(cls, s1: str, s2: str) -> float:
        """Calculate normalized Levenshtein similarity in [0.0, 1.0]."""
        if not s1 and not s2:
            return 1.0
        if not s1 or not s2:
            return 0.0

        # Classic O(m*n) dynamic-programming edit distance
        m, n = len(s1), len(s2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(m + 1):
            dp[i][0] = i
        for j in range(n + 1):
            dp[0][j] = j

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if s1[i-1] == s2[j-1]:
                    dp[i][j] = dp[i-1][j-1]
                else:
                    dp[i][j] = min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1]) + 1

        max_len = max(m, n)
        distance = dp[m][n]
        return 1.0 - (distance / max_len)
+""" +from rich.console import Console +from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeElapsedColumn +from rich.panel import Panel +from rich.table import Table +from rich.live import Live +from rich.layout import Layout +from rich.text import Text +from contextlib import contextmanager +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class ProgressStats: + """Statistics for progress display.""" + total_entries: int = 0 + processed: int = 0 + success: int = 0 + warnings: int = 0 + errors: int = 0 + current_entry: str = "" + current_task: str = "" + + +class ProgressDisplay: + """Rich terminal progress display.""" + + def __init__(self): + self.console = Console() + self.stats = ProgressStats() + self._progress: Optional[Progress] = None + self._live: Optional[Live] = None + self._main_task = None + + def _create_stats_table(self) -> Table: + """Create a statistics table.""" + table = Table(show_header=False, box=None, padding=(0, 2)) + table.add_column("Label", style="dim") + table.add_column("Value", style="bold") + + table.add_row("📚 Total Entries", str(self.stats.total_entries)) + table.add_row("✅ Success", f"[green]{self.stats.success}[/green]") + table.add_row("⚠️ Warnings", f"[yellow]{self.stats.warnings}[/yellow]") + table.add_row("❌ Errors", f"[red]{self.stats.errors}[/red]") + + return table + + def _create_display(self) -> Panel: + """Create the main display panel.""" + layout = Layout() + + # Status text + status_text = Text() + status_text.append("Current: ", style="dim") + status_text.append(self.stats.current_entry or "N/A", style="cyan bold") + status_text.append("\n") + status_text.append("Task: ", style="dim") + status_text.append(self.stats.current_task or "Initializing...", style="white") + + return Panel( + status_text, + title="[bold blue]📖 Bibliography Checker[/bold blue]", + border_style="blue" + ) + + @contextmanager + def progress_context(self, total: 
int, description: str = "Processing"): + """Context manager for progress display.""" + self.stats.total_entries = total + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(bar_width=40), + TaskProgressColumn(), + TimeElapsedColumn(), + console=self.console, + transient=False + ) as progress: + self._progress = progress + self._main_task = progress.add_task(description, total=total) + try: + yield self + finally: + self._progress = None + self._main_task = None + + def update(self, entry_key: str = "", task: str = "", advance: int = 0): + """Update progress display.""" + if entry_key: + self.stats.current_entry = entry_key + if task: + self.stats.current_task = task + + if self._progress and self._main_task is not None: + desc = f"[cyan]{entry_key}[/cyan] - {task}" if entry_key else task + self._progress.update(self._main_task, description=desc, advance=advance) + self.stats.processed += advance + + def mark_success(self): + """Mark current entry as successful.""" + self.stats.success += 1 + + def mark_warning(self): + """Mark current entry with warning.""" + self.stats.warnings += 1 + + def mark_error(self): + """Mark current entry as error.""" + self.stats.errors += 1 + + def print_header(self, title: str): + """Print a section header.""" + self.console.print() + self.console.print(Panel( + f"[bold]{title}[/bold]", + border_style="blue", + expand=False + )) + + def print_status(self, message: str, style: str = ""): + """Print a status message.""" + self.console.print(f" {message}", style=style) + + def print_success(self, message: str): + """Print a success message.""" + self.console.print(f" [green]✓[/green] {message}") + + def print_warning(self, message: str): + """Print a warning message.""" + self.console.print(f" [yellow]⚠[/yellow] {message}") + + def print_error(self, message: str): + """Print an error message.""" + self.console.print(f" [red]✗[/red] {message}") + + def print_info(self, message: str): 
+ """Print an info message.""" + self.console.print(f" [blue]ℹ[/blue] {message}") + + def print_detailed_summary(self, bib_stats: dict, latex_stats: dict, output_dir: str): + """Print a beautiful detailed summary table (Issues only).""" + self.console.print() + + # Create Bibliography Issues Table + bib_table = Table(show_header=True, header_style="bold cyan", box=None, padding=(0, 1)) + bib_table.add_column("📚 Bibliography Issues", style="white") + bib_table.add_column("Count", justify="right", style="bold red") + + for label, value in bib_stats.items(): + bib_table.add_row(label, str(value)) + + # Create LaTeX Issues Table - Fine-grained Breakdown + latex_table = Table(show_header=True, header_style="bold magenta", box=None, padding=(0, 1)) + latex_table.add_column("📋 LaTeX Quality Issues (Fine-grained)", style="white") + latex_table.add_column("Count", justify="right", style="bold yellow") + + if not latex_stats: + latex_table.add_row("[green]No issues found[/green]", "0") + else: + # Sort by count descending + for category, count in sorted(latex_stats.items(), key=lambda x: x[1], reverse=True): + latex_table.add_row(category, str(count)) + + # Combine into a single panel + from rich.columns import Columns + + # If no bib issues, only show latex table + content = [] + if bib_stats: + content.append(bib_table) + content.append(latex_table) + + summary_panel = Panel( + Columns(content, expand=True), + title="[bold red]⚠️ Issue Summary (Action Required)[/bold red]", + border_style="red", + padding=(1, 2) + ) + + self.console.print(summary_panel) + + # File meaning guide + guide_table = Table(show_header=True, header_style="bold green", box=None, padding=(0, 2)) + guide_table.add_column("File Name", style="cyan") + guide_table.add_column("Description", style="dim") + + guide_table.add_row("bibliography_report.md", "Detailed metadata and usage issues for each bib entry") + guide_table.add_row("latex_quality_report.md", "Summary of all LaTeX writing and formatting 
issues") + guide_table.add_row("line_by_line_report.md", "All LaTeX issues sorted by line number for easy fixing") + guide_table.add_row("*_only_used.bib", "A cleaned version of your bib file containing only cited entries") + + self.console.print(Panel( + guide_table, + title="[bold green]Output Directory Guide[/bold green]", + subtitle=f"Location: [blue underline]{output_dir}[/blue underline]", + border_style="green", + padding=(1, 1) + ))