|
|
"""
Workflow configuration for reference checking.

Allows users to customize the order and enable/disable individual fetchers
in the reference verification workflow.
"""
|
|
import json
from dataclasses import asdict, dataclass, field, fields
from pathlib import Path
from typing import List, Optional
|
|
|
|
|
|
|
|
@dataclass
class WorkflowStep:
    """A single step in the reference checking workflow.

    Attributes:
        name: Identifier of the step (e.g. ``"arxiv_id"``).
        display_name: Human-readable label for the step.
        description: Short explanation of what the step does.
        enabled: Whether the step is active; defaults to ``True``.
        priority: Sort key for the step; ``WorkflowConfig.get_enabled_steps``
            sorts ascending, so lower values come first. Defaults to ``0``.
        search_type: Lookup mode of the step, e.g. ``'by_title'`` (the
            default), ``'by_id'`` or ``'by_doi'``.
    """

    name: str
    display_name: str
    description: str
    enabled: bool = True
    priority: int = 0
    search_type: str = 'by_title'

    def to_dict(self) -> dict:
        """Return the step as a plain dict keyed by field name."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> 'WorkflowStep':
        """Build a step from a dict such as the one produced by ``to_dict``.

        Keys that are not declared fields are ignored, so a config that
        carries extra keys still loads instead of failing in the constructor.

        Raises:
            TypeError: If a required field (``name``, ``display_name`` or
                ``description``) is missing from ``data``.
        """
        known = {f.name for f in fields(cls)}
        return cls(**{key: value for key, value in data.items() if key in known})
|
|
|
|
|
|
|
|
@dataclass
class WorkflowConfig:
    """Configuration for the reference checking workflow.

    Attributes:
        steps: The workflow steps. The index-based methods below
            (``move_step_up``, ``move_step_down``, ``toggle_step``) address
            positions in this list.
        name: Name of this configuration.
        description: Free-text description of this configuration.
    """

    steps: List['WorkflowStep'] = field(default_factory=list)
    name: str = "default"
    description: str = "Default workflow configuration"

    def get_enabled_steps(self) -> List['WorkflowStep']:
        """Get only enabled steps, sorted by ascending priority."""
        return sorted(
            [s for s in self.steps if s.enabled],
            key=lambda x: x.priority,
        )

    def move_step_up(self, index: int) -> bool:
        """Move a step up in priority (swap with previous).

        Returns:
            ``True`` if the swap happened, ``False`` if ``index`` is the
            first position or outside the list.
        """
        if index <= 0 or index >= len(self.steps):
            return False
        self.steps[index], self.steps[index - 1] = self.steps[index - 1], self.steps[index]
        self._update_priorities()
        return True

    def move_step_down(self, index: int) -> bool:
        """Move a step down in priority (swap with next).

        Returns:
            ``True`` if the swap happened, ``False`` if ``index`` is the
            last position or outside the list.
        """
        if index < 0 or index >= len(self.steps) - 1:
            return False
        self.steps[index], self.steps[index + 1] = self.steps[index + 1], self.steps[index]
        self._update_priorities()
        return True

    def toggle_step(self, index: int) -> bool:
        """Toggle enabled status of a step.

        Returns:
            ``True`` if a step was toggled, ``False`` if ``index`` is out of
            range.
        """
        if 0 <= index < len(self.steps):
            self.steps[index].enabled = not self.steps[index].enabled
            return True
        return False

    def _update_priorities(self) -> None:
        """Rewrite every step's priority to its current list position."""
        for i, step in enumerate(self.steps):
            step.priority = i

    def to_dict(self) -> dict:
        """Return the configuration as a plain dict with serialized steps."""
        return {
            'name': self.name,
            'description': self.description,
            'steps': [s.to_dict() for s in self.steps],
        }

    @classmethod
    def from_dict(cls, data: dict) -> 'WorkflowConfig':
        """Build a configuration from a dict such as ``to_dict`` produces.

        A missing ``name`` falls back to ``'custom'`` and a missing
        ``description`` to an empty string. A missing or ``None`` ``steps``
        entry yields an empty step list.
        """
        # ``or []`` also covers an explicit null in the JSON, which
        # ``data.get('steps', [])`` would pass through as None.
        raw_steps = data.get('steps') or []
        steps = [WorkflowStep.from_dict(s) for s in raw_steps]
        return cls(
            steps=steps,
            name=data.get('name', 'custom'),
            description=data.get('description', ''),
        )

    def save(self, filepath: str) -> None:
        """Save workflow configuration to JSON file.

        Parent directories are created if they do not exist.
        """
        path = Path(filepath)
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            # The file is UTF-8, so write non-ASCII text as-is instead of
            # \uXXXX escapes; json.load reads both forms identically.
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)

    @classmethod
    def load(cls, filepath: str) -> 'WorkflowConfig':
        """Load workflow configuration from JSON file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls.from_dict(data)
|
|
|
|
|
|
|
|
|
|
|
# Built-in workflow. Each row is (name, display_name, description,
# search_type); a row's position in the table becomes the step's priority,
# so identifier lookups come first and the Google Scholar scraper last.
DEFAULT_WORKFLOW = WorkflowConfig(
    name="default",
    description="Default reference checking workflow prioritizing reliable APIs",
    steps=[
        WorkflowStep(
            name=step_name,
            display_name=label,
            description=summary,
            enabled=True,
            priority=position,
            search_type=lookup,
        )
        for position, (step_name, label, summary, lookup) in enumerate([
            (
                "arxiv_id",
                "arXiv by ID",
                "Look up paper by arXiv ID (highest priority for arXiv papers)",
                "by_id",
            ),
            (
                "crossref_doi",
                "CrossRef by DOI",
                "Look up paper by DOI (authoritative for DOIs)",
                "by_doi",
            ),
            (
                "semantic_scholar",
                "Semantic Scholar",
                "Official API with high quality metadata",
                "by_title",
            ),
            (
                "dblp",
                "DBLP",
                "Official API, especially good for CS publications",
                "by_title",
            ),
            (
                "openalex",
                "OpenAlex",
                "Official API with broad coverage",
                "by_title",
            ),
            (
                "arxiv_title",
                "arXiv by Title",
                "Search arXiv by title (fallback for non-ID lookups)",
                "by_title",
            ),
            (
                "crossref_title",
                "CrossRef by Title",
                "Search CrossRef by title",
                "by_title",
            ),
            (
                "google_scholar",
                "Google Scholar",
                "Web scraping fallback (may be rate-limited or blocked)",
                "by_title",
            ),
        ])
    ],
)
|
|
|
|
|
|
|
|
def get_default_workflow() -> WorkflowConfig:
    """Build a new WorkflowConfig equal to DEFAULT_WORKFLOW.

    The default is serialized to plain dicts and rebuilt from them, so the
    returned config and its steps are new objects; changing them does not
    modify DEFAULT_WORKFLOW.
    """
    snapshot = DEFAULT_WORKFLOW.to_dict()
    return WorkflowConfig.from_dict(snapshot)
|
|
|