# Spaces: thinkwee / BibGuard (Running)
# File size: 9,787 Bytes
# 46df5f0

# ==============================================================================
# BibGuard Configuration File
# ==============================================================================
# 
# Usage: python main.py --config bibguard.yaml
#        python main.py  (auto-detect bibguard.yaml in current/parent directories)
#
# All paths are relative to this configuration file's directory.

# ==============================================================================
# 📁 File Settings
# ==============================================================================
files:
  # Bibliography (.bib) file to check. Required.
  bib: 'test.bib'

  # LaTeX (.tex) source file to check. Required.
  tex: 'test.tex'

  # Experimental, optional: directory to scan recursively. When set, BibGuard
  # searches it for every .tex and .bib file. This mode is an alternative to
  # the 'bib' and 'tex' keys above: use either this OR bib/tex.
  # input_dir: "./paper_project"

  # Destination directory for everything BibGuard generates: reports, the
  # cleaned .bib, and copies of the input files (default: bibguard_output).
  output_dir: 'test'


# ==============================================================================
# 🎓 Conference Template
# ==============================================================================
# Conference template that enables venue-specific checks and formatting rules.
# Supported values: acl, emnlp, naacl, cvpr, iccv, eccv, neurips, icml, iclr
# An empty string skips the template-specific checks.
template: ''

# ==============================================================================
# 📚 Bibliography Checks
# ==============================================================================
bibliography:
  # Metadata validation: compare each bib entry against online databases
  # (arXiv, CrossRef, etc.) to catch incorrect titles, authors, venues, and
  # publication years.
  # ⚠️ This is slow because several online sources have to be queried. Set it
  # to false if you do not need metadata checking.
  check_metadata: true

  # Usage check: report entries in the .bib that are never cited in the .tex,
  # and citations in the .tex that have no bib entry.
  check_usage: true

  # Duplicate detection: find the same paper entered under different keys,
  # using fuzzy matching on titles and DOIs.
  check_duplicates: true

  # Preprint ratio check: detect preprints (arXiv, bioRxiv, and others) and
  # warn when their ratio exceeds the threshold below.
  check_preprint_ratio: true
  # Warn if more than 50% of used entries are preprints.
  preprint_warning_threshold: 0.50

  # Relevance assessment: use an LLM to evaluate whether citations match their
  # context. Requires the llm section below to be configured. Disabled by
  # default due to API costs.
  check_relevance: false

# ==============================================================================
# 📋 Submission Quality Checks
# ==============================================================================
submission:
  # ---------------------------------------------------------------------------
  # Format checks
  # ---------------------------------------------------------------------------

  # Caption position: table captions should be above, figure captions below.
  # Looks at where \caption is placed relative to \begin{table} and
  # \begin{figure}.
  caption: true

  # Cross-references: every figure, table, and section should be referenced in
  # the text. Reports orphaned floats that are never mentioned.
  reference: true

  # Formatting standards: citation format, spacing, and special characters.
  # Covers \cite{} usage, non-breaking spaces, proper quotation marks, etc.
  formatting: true

  # Equation checks: punctuation and numbering consistency. Equations should
  # end with proper punctuation and labels should be used correctly.
  equation: true

  # ---------------------------------------------------------------------------
  # Writing quality
  # ---------------------------------------------------------------------------

  # AI artifacts: traces of AI-generated text, for example phrases like
  # "Sure, here is...", "As an AI...", "It's important to note...".
  ai_artifacts: true

  # Sentence quality: overly long sentences, weak openings, and redundant
  # phrases. Aimed at readability and academic writing style.
  sentence: true

  # Terminology consistency: inconsistent spelling, hyphenation, and US/UK
  # variants, e.g. "deep learning" vs "deep-learning", "color" vs "colour".
  consistency: true

  # ---------------------------------------------------------------------------
  # Academic standards
  # ---------------------------------------------------------------------------

  # Acronym definitions: acronyms should be defined on first use, e.g.
  # "Natural Language Processing (NLP)" before "NLP" appears alone.
  acronym: true

  # Number formatting: percentage formatting consistency. There should be no
  # space before the % sign, and '%' vs 'percent' should be used consistently.
  number: true

  # Citation quality: outdated references and citation formatting issues.
  # Warns about papers older than 30 years and checks citation formatting
  # (et al., hardcoded citations).
  citation_quality: true

  # ---------------------------------------------------------------------------
  # Review compliance
  # ---------------------------------------------------------------------------

  # Anonymization: double-blind review compliance. Detects GitHub links,
  # acknowledgments, and self-citations that may reveal author identity.
  anonymization: true

# ==============================================================================
# 🔍 Metadata Check Workflow
# ==============================================================================
# Data sources for metadata validation, listed in the order they are tried.
# BibGuard goes through the enabled sources in sequence until a match is found.
# Set `enabled: false` on a source to skip it.
workflow:
  - name: arxiv_id
    enabled: true
    description: 'Lookup by arXiv ID (fastest, most reliable for preprints)'

  - name: crossref_doi
    enabled: true
    description: 'Lookup by DOI via CrossRef (authoritative for published papers)'

  - name: semantic_scholar
    enabled: true
    description: 'Semantic Scholar API (good coverage, includes citations)'

  - name: dblp
    enabled: true
    description: 'DBLP database (comprehensive for computer science papers)'

  - name: openalex
    enabled: true
    description: 'OpenAlex API (broad coverage across disciplines)'

  - name: arxiv_title
    enabled: true
    description: 'Search arXiv by title (fallback when ID unavailable)'

  - name: crossref_title
    enabled: true
    description: 'Search CrossRef by title (fallback when DOI unavailable)'

  - name: google_scholar
    # Disabled by default: may be rate-limited.
    enabled: false
    description: 'Google Scholar web scraping (use as last resort)'

# ==============================================================================
# 🤖 LLM Configuration (for Relevance Checking)
# ==============================================================================
llm:
  # Backend provider, one of: ollama, vllm, gemini, openai, anthropic, deepseek
  # Setup differs per backend (API keys, local installation, etc.).
  backend: 'gemini'

  # Model name; leave empty to use the backend default.
  # Examples: "gpt-4", "claude-3-opus", "gemini-pro", "llama3"
  model: ''

  # API endpoint; leave empty to use the backend default. Only needed for
  # self-hosted models (vllm, ollama) or custom endpoints.
  endpoint: ''

  # API key. Using environment variables instead is recommended: set
  # GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. in your environment.
  api_key: ''

# ==============================================================================
# 📊 Output Settings
# ==============================================================================
output:
  # Quiet mode: suppress progress messages and output only the final reports.
  # Handy for CI/CD pipelines or batch processing.
  quiet: false

  # Minimal verified entries: hide the detailed info for entries that passed
  # all checks. Keeps the report small when only the issues matter.
  minimal_verified: false