File size: 8,536 Bytes
46df5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
"""
YAML configuration loader for BibGuard.

Loads configuration from YAML file and provides defaults.
"""
import yaml
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any


@dataclass
class FilesConfig:
    """Locations of the input files and of the output directory.

    Every setting is a plain string. The three input settings default to
    the empty string; only ``output_dir`` has a non-empty default.
    """
    # Path of the bibliography (.bib) file.
    bib: str = ""
    # Path of the LaTeX (.tex) file.
    tex: str = ""
    # Directory to search recursively for .tex and .bib files.
    input_dir: str = ""
    # Directory that receives all generated files.
    output_dir: str = "bibguard_output"


@dataclass
class BibliographyConfig:
    """Switches and threshold for the bibliography checks.

    All checks except ``check_relevance`` are switched on by default.
    """
    # Toggle for the metadata check.
    check_metadata: bool = True
    # Toggle for the usage check.
    check_usage: bool = True
    # Toggle for the duplicate check.
    check_duplicates: bool = True
    # Toggle for the preprint-ratio check.
    check_preprint_ratio: bool = True
    # Threshold for the preprint warning (presumably a fraction between
    # 0 and 1 -- confirm against the code that consumes it).
    preprint_warning_threshold: float = 0.5
    # Toggle for the relevance check; the only check that is off by default.
    check_relevance: bool = False


@dataclass
class SubmissionConfig:
    """Per-checker switches for the submission quality checks.

    Each field is a boolean flag named after one checker; every checker
    is enabled by default.
    """

    # Format checks
    caption: bool = True
    reference: bool = True
    formatting: bool = True
    equation: bool = True

    # Writing quality
    ai_artifacts: bool = True
    sentence: bool = True
    consistency: bool = True

    # Academic standards
    acronym: bool = True
    number: bool = True
    citation_quality: bool = True

    # Review compliance
    anonymization: bool = True

    def get_enabled_checkers(self) -> List[str]:
        """Return the names of the checkers whose flag is truthy.

        Names are reported in the same order in which the flags are
        declared on the class.
        """
        # Spelled out explicitly so the reported order is fixed here and
        # does not depend on introspection of the dataclass.
        ordered_names = (
            'caption',
            'reference',
            'formatting',
            'equation',
            'ai_artifacts',
            'sentence',
            'consistency',
            'acronym',
            'number',
            'citation_quality',
            'anonymization',
        )
        return [name for name in ordered_names if getattr(self, name)]


@dataclass
class WorkflowStep:
    """One entry of the reference check workflow."""
    # Name of the step; the only field without a default.
    name: str
    # Whether the step is switched on.
    enabled: bool = True
    # Free-form description of the step.
    description: str = ""


@dataclass
class LLMConfig:
    """Settings for the LLM used by relevance checking."""
    # Name of the LLM backend.
    backend: str = "gemini"
    # Model name; empty by default.
    model: str = ""
    # Endpoint setting; empty by default.
    endpoint: str = ""
    # API key; empty by default.
    api_key: str = ""


@dataclass
class OutputConfig:
    """Settings that control the generated output."""
    # Off by default.
    quiet: bool = False
    # Off by default.
    minimal_verified: bool = False


@dataclass
class BibGuardConfig:
    """Top-level BibGuard configuration aggregating every section.

    Each section defaults to a freshly constructed instance of its own
    config class, so a bare ``BibGuardConfig()`` is fully populated.
    """
    files: FilesConfig = field(default_factory=FilesConfig)
    template: str = ""
    bibliography: BibliographyConfig = field(default_factory=BibliographyConfig)
    submission: SubmissionConfig = field(default_factory=SubmissionConfig)
    workflow: List[WorkflowStep] = field(default_factory=list)
    llm: LLMConfig = field(default_factory=LLMConfig)
    output: OutputConfig = field(default_factory=OutputConfig)

    # Internal: files discovered when running in directory mode.
    _bib_files: List[Path] = field(default_factory=list)
    _tex_files: List[Path] = field(default_factory=list)

    # Internal: directory that relative paths are resolved against.
    # Defaults to the working directory at construction time.
    _config_dir: Path = field(default_factory=Path.cwd)

    def resolve_path(self, path: str) -> Path:
        """Return *path* as a ``Path``, anchored at the config directory.

        Absolute paths are returned as they are; relative ones are joined
        onto ``_config_dir``.
        """
        candidate = Path(path)
        return candidate if candidate.is_absolute() else self._config_dir / candidate

    @property
    def bib_path(self) -> Path:
        """Resolved form of ``files.bib``."""
        return self.resolve_path(self.files.bib)

    @property
    def tex_path(self) -> Path:
        """Resolved form of ``files.tex``."""
        return self.resolve_path(self.files.tex)

    @property
    def input_dir_path(self) -> Path:
        """Resolved form of ``files.input_dir``."""
        return self.resolve_path(self.files.input_dir)

    @property
    def output_dir_path(self) -> Path:
        """Resolved form of ``files.output_dir``."""
        return self.resolve_path(self.files.output_dir)


def load_config(config_path: str) -> BibGuardConfig:
    """Load a BibGuard configuration from a YAML file.

    Keys missing from the file fall back to the defaults of the
    corresponding config class. A section key that is present but left
    empty in the YAML (which loads as ``None``) is treated like a missing
    section instead of crashing.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The populated configuration. Its ``_config_dir`` is set to the
        absolute directory of ``config_path`` so relative paths can be
        resolved against the config file's location.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        ValueError: If the document, or one of its sections, is not a
            YAML mapping.
    """
    path = Path(config_path)

    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    with open(path, 'r', encoding='utf-8') as f:
        # An empty file loads as None; treat it as an empty mapping.
        data = yaml.safe_load(f) or {}

    if not isinstance(data, dict):
        raise ValueError(
            f"Config file must contain a YAML mapping at the top level: {config_path}"
        )

    def section(name: str) -> Dict[str, Any]:
        """Return the mapping stored under *name*, or {} if absent/empty."""
        value = data.get(name)
        if value is None:
            # Covers both a missing key and a bare "name:" line.
            return {}
        if not isinstance(value, dict):
            raise ValueError(
                f"Config section '{name}' must be a mapping, "
                f"got {type(value).__name__}"
            )
        return value

    config = BibGuardConfig()
    config._config_dir = path.parent.absolute()

    # Parse files section
    files = section('files')
    config.files = FilesConfig(
        bib=files.get('bib', ''),
        tex=files.get('tex', ''),
        input_dir=files.get('input_dir', ''),
        output_dir=files.get('output_dir', 'bibguard_output')
    )

    # Parse template; a bare "template:" line loads as None, keep it a str
    config.template = data.get('template') or ''

    # Parse bibliography section
    bib = section('bibliography')
    config.bibliography = BibliographyConfig(
        check_metadata=bib.get('check_metadata', True),
        check_usage=bib.get('check_usage', True),
        check_duplicates=bib.get('check_duplicates', True),
        check_preprint_ratio=bib.get('check_preprint_ratio', True),
        preprint_warning_threshold=bib.get('preprint_warning_threshold', 0.50),
        check_relevance=bib.get('check_relevance', False)
    )

    # Parse submission section
    sub = section('submission')
    config.submission = SubmissionConfig(
        caption=sub.get('caption', True),
        reference=sub.get('reference', True),
        formatting=sub.get('formatting', True),
        equation=sub.get('equation', True),
        ai_artifacts=sub.get('ai_artifacts', True),
        sentence=sub.get('sentence', True),
        consistency=sub.get('consistency', True),
        acronym=sub.get('acronym', True),
        number=sub.get('number', True),
        citation_quality=sub.get('citation_quality', True),
        anonymization=sub.get('anonymization', True)
    )

    # Parse workflow section; an empty "workflow:" yields no steps
    config.workflow = [
        WorkflowStep(
            name=step.get('name', ''),
            enabled=step.get('enabled', True),
            description=step.get('description', '')
        )
        for step in (data.get('workflow') or [])
    ]

    # Parse LLM section
    llm = section('llm')
    config.llm = LLMConfig(
        backend=llm.get('backend', 'gemini'),
        model=llm.get('model', ''),
        endpoint=llm.get('endpoint', ''),
        api_key=llm.get('api_key', '')
    )

    # Parse output section
    out = section('output')
    config.output = OutputConfig(
        quiet=out.get('quiet', False),
        minimal_verified=out.get('minimal_verified', False)
    )

    return config


def find_config_file() -> Optional[Path]:
    """Locate a config file in the working directory or its ancestors.

    The working directory is inspected first, followed by its parents,
    for at most five directories in total. Within one directory the
    candidate file names are tried in a fixed order.

    Returns:
        The path of the first match, or ``None`` when nothing is found.
    """
    candidate_names = (
        'config.yaml',
        'bibguard.yaml',
        'bibguard.yml',
        '.bibguard.yaml',
        '.bibguard.yml',
    )
    max_levels = 5

    start = Path.cwd()
    # ``parents`` ends at the filesystem root, so the slice alone bounds the walk.
    search_dirs = [start, *start.parents][:max_levels]

    for directory in search_dirs:
        for filename in candidate_names:
            candidate = directory / filename
            if candidate.exists():
                return candidate

    return None


def create_default_config(output_path: str = "config.yaml"):
    """Write the default configuration template to *output_path*.

    The file is written as UTF-8 text; an existing file at that location
    is overwritten.

    Returns:
        ``output_path`` exactly as it was passed in.
    """
    template_text = """# BibGuard Configuration File

files:
  bib: "paper.bib"
  tex: "paper.tex"
  output_dir: "bibguard_output"

template: ""

bibliography:
  check_metadata: true
  check_usage: true
  check_duplicates: true
  check_preprint_ratio: true
  preprint_warning_threshold: 0.50
  check_relevance: false

submission:
  caption: true
  reference: true
  formatting: true
  equation: true
  ai_artifacts: true
  sentence: true
  consistency: true
  acronym: true
  number: true
  citation_quality: true
  anonymization: true

llm:
  backend: "gemini"
  model: ""
  api_key: ""

output:
  quiet: false
  minimal_verified: false
"""
    Path(output_path).write_text(template_text, encoding='utf-8')
    return output_path