|
|
""" |
|
|
YAML configuration loader for BibGuard. |
|
|
|
|
|
Loads configuration from YAML file and provides defaults. |
|
|
""" |
|
|
import yaml |
|
|
from pathlib import Path |
|
|
from dataclasses import dataclass, field |
|
|
from typing import Optional, List, Dict, Any |
|
|
|
|
|
|
|
|
@dataclass
class FilesConfig:
    """Locations of the input files and of the output directory.

    Values are stored as the plain strings supplied by the user; the three
    input entries default to an empty string.
    """

    # Bibliography (.bib) file.
    bib: str = ""
    # LaTeX (.tex) source file.
    tex: str = ""
    # Input directory — presumably an alternative to naming ``bib``/``tex``
    # individually; confirm against the caller.
    input_dir: str = ""
    # Directory for generated output.
    output_dir: str = "bibguard_output"
|
|
|
|
|
|
|
|
@dataclass
class BibliographyConfig:
    """Switches and thresholds for the bibliography checks.

    Every check except ``check_relevance`` is enabled by default.
    """

    # Toggle for the metadata check.
    check_metadata: bool = True
    # Toggle for the usage check.
    check_usage: bool = True
    # Toggle for the duplicate check.
    check_duplicates: bool = True
    # Toggle for the preprint-ratio check.
    check_preprint_ratio: bool = True
    # Threshold used by the preprint-ratio check — presumably the fraction
    # of preprints above which a warning is raised; confirm with the checker.
    preprint_warning_threshold: float = 0.50
    # Toggle for the relevance check (off by default).
    check_relevance: bool = False
|
|
|
|
|
|
|
|
@dataclass
class SubmissionConfig:
    """On/off switches for the submission quality checkers.

    Each boolean field carries the name that ``get_enabled_checkers``
    reports for it; all switches default to ``True``.
    """

    caption: bool = True
    reference: bool = True
    formatting: bool = True
    equation: bool = True

    ai_artifacts: bool = True
    sentence: bool = True
    consistency: bool = True

    acronym: bool = True
    number: bool = True
    citation_quality: bool = True

    anonymization: bool = True

    def get_enabled_checkers(self) -> List[str]:
        """Return the names of all switched-on checkers.

        Returns:
            Checker names in the fixed order listed below, skipping every
            switch whose value is falsy.
        """
        # The order here defines the order of the returned list.
        ordered_names = (
            'caption',
            'reference',
            'formatting',
            'equation',
            'ai_artifacts',
            'sentence',
            'consistency',
            'acronym',
            'number',
            'citation_quality',
            'anonymization',
        )
        return [name for name in ordered_names if getattr(self, name)]
|
|
|
|
|
|
|
|
@dataclass
class WorkflowStep:
    """One entry of the reference-check workflow.

    ``name`` is the only required value; a step is enabled unless stated
    otherwise and has an empty description by default.
    """

    # Identifier of the step (required).
    name: str
    # Whether the step is switched on.
    enabled: bool = True
    # Free-form description of the step.
    description: str = ""
|
|
|
|
|
|
|
|
@dataclass
class LLMConfig:
    """Settings for the LLM used by relevance checking.

    The API key is excluded from the generated ``repr`` so that printing
    or logging a configuration object does not expose the credential.
    It still takes part in construction and equality comparison.
    """

    # Name of the LLM backend.
    backend: str = "gemini"
    # Model identifier; empty by default.
    model: str = ""
    # Service endpoint; empty by default.
    endpoint: str = ""
    # Credential for the backend. repr=False keeps it out of repr()/logs.
    api_key: str = field(default="", repr=False)
|
|
|
|
|
|
|
|
@dataclass
class OutputConfig:
    """Flags controlling the produced output; both are off by default."""

    # Quiet mode — presumably reduces console output; confirm with the caller.
    quiet: bool = False
    # NOTE(review): exact effect is not visible in this file; looks like it
    # trims reporting for verified entries — confirm with the report writer.
    minimal_verified: bool = False
|
|
|
|
|
|
|
|
@dataclass
class BibGuardConfig:
    """Top-level container bundling every BibGuard configuration section.

    Besides the user-facing sections it keeps the directory that relative
    paths are resolved against and exposes the configured file locations
    as resolved ``Path`` properties.
    """

    files: FilesConfig = field(default_factory=FilesConfig)
    template: str = ""
    bibliography: BibliographyConfig = field(default_factory=BibliographyConfig)
    submission: SubmissionConfig = field(default_factory=SubmissionConfig)
    workflow: List[WorkflowStep] = field(default_factory=list)
    llm: LLMConfig = field(default_factory=LLMConfig)
    output: OutputConfig = field(default_factory=OutputConfig)

    # Internal file lists; nothing inside this class populates them.
    _bib_files: List[Path] = field(default_factory=list)
    _tex_files: List[Path] = field(default_factory=list)

    # Base directory for relative paths; the working directory at
    # construction time unless replaced afterwards.
    _config_dir: Path = field(default_factory=Path.cwd)

    def resolve_path(self, path: str) -> Path:
        """Turn *path* into a ``Path`` anchored at the config directory.

        Absolute paths are returned untouched; anything else is joined
        onto ``_config_dir``.
        """
        candidate = Path(path)
        return candidate if candidate.is_absolute() else self._config_dir / candidate

    @property
    def bib_path(self) -> Path:
        """Resolved location of ``files.bib``."""
        return self.resolve_path(self.files.bib)

    @property
    def tex_path(self) -> Path:
        """Resolved location of ``files.tex``."""
        return self.resolve_path(self.files.tex)

    @property
    def input_dir_path(self) -> Path:
        """Resolved location of ``files.input_dir``."""
        return self.resolve_path(self.files.input_dir)

    @property
    def output_dir_path(self) -> Path:
        """Resolved location of ``files.output_dir``."""
        return self.resolve_path(self.files.output_dir)
|
|
|
|
|
|
|
|
def _config_section(data: Dict[str, Any], key: str) -> Dict[str, Any]:
    """Return the mapping stored under *key*, or ``{}`` when absent or null."""
    section = data.get(key)
    if section is None:
        # Covers both a missing key and a bare ``key:`` line (YAML null).
        return {}
    if not isinstance(section, dict):
        raise ValueError(
            f"Config section '{key}' must be a mapping, "
            f"got {type(section).__name__}"
        )
    return section


def load_config(config_path: str) -> BibGuardConfig:
    """Load configuration from a YAML file.

    Missing sections, sections left empty (YAML null) and missing keys all
    fall back to the defaults. Relative paths in the resulting config are
    resolved against the directory that contains the config file.

    Args:
        config_path: Location of the YAML configuration file.

    Returns:
        A populated ``BibGuardConfig``.

    Raises:
        FileNotFoundError: If *config_path* does not exist.
        ValueError: If the document root or one of the sections is not a
            mapping.
    """
    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    with open(path, 'r', encoding='utf-8') as f:
        # safe_load yields None for an empty document.
        data = yaml.safe_load(f) or {}

    if not isinstance(data, dict):
        raise ValueError(
            f"Config file must contain a mapping at the top level, "
            f"got {type(data).__name__}"
        )

    config = BibGuardConfig()
    config._config_dir = path.parent.absolute()

    files = _config_section(data, 'files')
    config.files = FilesConfig(
        bib=files.get('bib', ''),
        tex=files.get('tex', ''),
        input_dir=files.get('input_dir', ''),
        output_dir=files.get('output_dir', 'bibguard_output'),
    )

    # A bare ``template:`` line parses as None; keep the attribute a string.
    config.template = data.get('template') or ''

    bib = _config_section(data, 'bibliography')
    config.bibliography = BibliographyConfig(
        check_metadata=bib.get('check_metadata', True),
        check_usage=bib.get('check_usage', True),
        check_duplicates=bib.get('check_duplicates', True),
        check_preprint_ratio=bib.get('check_preprint_ratio', True),
        preprint_warning_threshold=bib.get('preprint_warning_threshold', 0.50),
        check_relevance=bib.get('check_relevance', False),
    )

    sub = _config_section(data, 'submission')
    config.submission = SubmissionConfig(
        caption=sub.get('caption', True),
        reference=sub.get('reference', True),
        formatting=sub.get('formatting', True),
        equation=sub.get('equation', True),
        ai_artifacts=sub.get('ai_artifacts', True),
        sentence=sub.get('sentence', True),
        consistency=sub.get('consistency', True),
        acronym=sub.get('acronym', True),
        number=sub.get('number', True),
        citation_quality=sub.get('citation_quality', True),
        anonymization=sub.get('anonymization', True),
    )

    # ``workflow:`` with no entries parses as None; treat it as no steps.
    config.workflow = [
        WorkflowStep(
            name=step.get('name', ''),
            enabled=step.get('enabled', True),
            description=step.get('description', ''),
        )
        for step in (data.get('workflow') or [])
    ]

    llm = _config_section(data, 'llm')
    config.llm = LLMConfig(
        backend=llm.get('backend', 'gemini'),
        model=llm.get('model', ''),
        endpoint=llm.get('endpoint', ''),
        api_key=llm.get('api_key', ''),
    )

    out = _config_section(data, 'output')
    config.output = OutputConfig(
        quiet=out.get('quiet', False),
        minimal_verified=out.get('minimal_verified', False),
    )

    return config
|
|
|
|
|
|
|
|
def find_config_file(start_dir: Optional[Path] = None, max_levels: int = 5) -> Optional[Path]:
    """Find a config file in a directory or its parent directories.

    The search looks at the start directory first and then walks upwards,
    stopping at the filesystem root or after *max_levels* directories have
    been inspected, whichever comes first. Within one directory the
    candidate names are tried in the order listed below.

    Args:
        start_dir: Directory to start from; defaults to the current working
            directory.
        max_levels: Number of directories to inspect, counting the start
            directory itself.

    Returns:
        Path of the first matching regular file, or ``None`` if none is found.
    """
    config_names = ['config.yaml', 'bibguard.yaml', 'bibguard.yml', '.bibguard.yaml', '.bibguard.yml']

    current = Path.cwd() if start_dir is None else Path(start_dir).absolute()

    for _ in range(max_levels):
        for name in config_names:
            candidate = current / name
            # is_file() rather than exists(): a directory that happens to
            # carry a config name cannot be loaded and must not be returned.
            if candidate.is_file():
                return candidate

        parent = current.parent
        if parent == current:
            # Reached the filesystem root.
            break
        current = parent

    return None
|
|
|
|
|
|
|
|
def create_default_config(output_path: str = "config.yaml"):
    """Write a starter configuration file and return its path.

    An existing file at *output_path* is overwritten.

    Args:
        output_path: Destination of the generated YAML file.

    Returns:
        The *output_path* value that was passed in.
    """
    template_text = """# BibGuard Configuration File

files:
  bib: "paper.bib"
  tex: "paper.tex"
  output_dir: "bibguard_output"

template: ""

bibliography:
  check_metadata: true
  check_usage: true
  check_duplicates: true
  check_preprint_ratio: true
  preprint_warning_threshold: 0.50
  check_relevance: false

submission:
  caption: true
  reference: true
  formatting: true
  equation: true
  ai_artifacts: true
  sentence: true
  consistency: true
  acronym: true
  number: true
  citation_quality: true
  anonymization: true

llm:
  backend: "gemini"
  model: ""
  api_key: ""

output:
  quiet: false
  minimal_verified: false
"""
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(template_text)

    return output_path
|
|
|