File size: 9,787 Bytes
46df5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# ==============================================================================
# BibGuard Configuration File
# ==============================================================================
# 
# Usage: python main.py --config bibguard.yaml
#        python main.py  (auto-detect bibguard.yaml in current/parent directories)
#
# All paths are relative to this configuration file's directory.

# ==============================================================================
# πŸ“ File Settings
# ==============================================================================
files:
  # Bibliography (.bib) file to check. Required.
  bib: 'test.bib'

  # LaTeX (.tex) source file to check. Required.
  tex: 'test.tex'

  # Optional, experimental: a directory to scan recursively. When set,
  # BibGuard recursively searches it for all .tex and .bib files. This mode is
  # an alternative to 'bib'/'tex' above; use one approach or the other.
  # input_dir: "./paper_project"

  # Directory that receives every generated report and file, including
  # reports, the cleaned .bib, and copies of the input files
  # (default: bibguard_output).
  output_dir: 'test'


# ==============================================================================
# 🎓 Conference Template
# ==============================================================================
# Conference template used for venue-specific checks and formatting rules.
# Available templates: acl, emnlp, naacl, cvpr, iccv, eccv, neurips, icml, iclr
# An empty string ('') skips the template-specific checks.
template: ''

# ==============================================================================
# 📚 Bibliography Checks
# ==============================================================================
bibliography:
  # Metadata validation: verify bib entries against online databases
  # (arXiv, CrossRef, etc.) to detect incorrect titles, authors, venues, and
  # publication years.
  # ⚠️ This check takes some time because it queries multiple online sources.
  # Set it to false if you do not need metadata checking.
  check_metadata: true

  # Usage check: identify entries in the .bib that are not cited in the .tex,
  # and citations that have no bib entry.
  check_usage: true

  # Duplicate detection: find the same paper entered under different keys,
  # using fuzzy matching on titles and DOIs.
  check_duplicates: true

  # Preprint ratio check: detect arXiv, bioRxiv, and other preprints, and warn
  # when their ratio exceeds the threshold below.
  check_preprint_ratio: true
  # Warn if more than 50% of used entries are preprints.
  preprint_warning_threshold: 0.50

  # Relevance assessment: use an LLM to evaluate whether citations match their
  # context. Requires LLM configuration (see the llm section below).
  # Disabled by default due to API costs.
  check_relevance: false

# ==============================================================================
# 📋 Submission Quality Checks
# ==============================================================================
submission:
  # ─────────────────────────────────────────────────────────────────────────────
  # Format Checks
  # ─────────────────────────────────────────────────────────────────────────────

  # Caption position: table captions should sit above, figure captions below.
  # Checks \caption placement relative to \begin{table}/\begin{figure}.
  caption: true

  # Cross-references: verify that all figures/tables/sections are referenced
  # in the text; detects orphaned floats that are never mentioned.
  reference: true

  # Formatting standards: check citation format, spacing, and special
  # characters. Validates \cite{} usage, non-breaking spaces, proper quotation
  # marks, etc.
  formatting: true

  # Equation checks: verify equation punctuation and numbering consistency,
  # i.e. equations end with proper punctuation and labels are used correctly.
  equation: true

  # ─────────────────────────────────────────────────────────────────────────────
  # Writing Quality
  # ─────────────────────────────────────────────────────────────────────────────

  # AI artifacts: detect traces of AI-generated text. Flags phrases like
  # "Sure, here is...", "As an AI...", "It's important to note...".
  ai_artifacts: true

  # Sentence quality: identify overly long sentences, weak openings, and
  # redundant phrases to improve readability and academic writing style.
  sentence: true

  # Terminology consistency: check for inconsistent spelling, hyphenation, and
  # US/UK variants, e.g. "deep learning" vs "deep-learning", "color" vs "colour".
  consistency: true

  # ─────────────────────────────────────────────────────────────────────────────
  # Academic Standards
  # ─────────────────────────────────────────────────────────────────────────────

  # Acronym definitions: ensure acronyms are defined on first use, e.g.
  # "Natural Language Processing (NLP)" before using "NLP" alone.
  acronym: true

  # Number formatting: check percentage formatting consistency, i.e. no space
  # before the % sign and consistent use of '%' vs 'percent'.
  number: true

  # Citation quality: flag outdated references and citation formatting issues.
  # Warns about papers older than 30 years and checks citation formatting
  # (et al., hardcoded citations).
  citation_quality: true

  # ─────────────────────────────────────────────────────────────────────────────
  # Review Compliance
  # ─────────────────────────────────────────────────────────────────────────────

  # Anonymization: check double-blind review compliance. Detects GitHub links,
  # acknowledgments, and self-citations that may reveal author identity.
  anonymization: true

# ==============================================================================
# πŸ” Metadata Check Workflow
# ==============================================================================
# Data sources for metadata validation, listed in the order they are tried.
# BibGuard tries each enabled source in sequence until a match is found.
# Set `enabled: false` on a source to skip it.
workflow:
  - name: arxiv_id
    enabled: true
    description: 'Lookup by arXiv ID (fastest, most reliable for preprints)'

  - name: crossref_doi
    enabled: true
    description: 'Lookup by DOI via CrossRef (authoritative for published papers)'

  - name: semantic_scholar
    enabled: true
    description: 'Semantic Scholar API (good coverage, includes citations)'

  - name: dblp
    enabled: true
    description: 'DBLP database (comprehensive for computer science papers)'

  - name: openalex
    enabled: true
    description: 'OpenAlex API (broad coverage across disciplines)'

  - name: arxiv_title
    enabled: true
    description: 'Search arXiv by title (fallback when ID unavailable)'

  - name: crossref_title
    enabled: true
    description: 'Search CrossRef by title (fallback when DOI unavailable)'

  - name: google_scholar
    # May be rate-limited, so this source is disabled by default.
    enabled: false
    description: 'Google Scholar web scraping (use as last resort)'

# ==============================================================================
# 🤖 LLM Configuration (for Relevance Checking)
# ==============================================================================
llm:
  # Backend provider. One of: ollama, vllm, gemini, openai, anthropic, deepseek
  # Each backend requires different setup (API keys, local installation, etc.).
  backend: 'gemini'

  # Model name; leave empty to use the backend default.
  # Examples: "gpt-4", "claude-3-opus", "gemini-pro", "llama3"
  model: ''

  # API endpoint; leave empty to use the backend default. Only needed for
  # self-hosted models (vllm, ollama) or custom endpoints.
  endpoint: ''

  # API key. Using environment variables instead is recommended: set
  # GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. in your environment.
  api_key: ''

# ==============================================================================
# 📊 Output Settings
# ==============================================================================
output:
  # Quiet mode: suppress progress messages and only output the final reports.
  # Useful for CI/CD pipelines or batch processing.
  quiet: false

  # Minimal verified entries: hide detailed info for entries that passed all
  # checks. Reduces report size when you only care about issues.
  minimal_verified: false