# Hugging Face Space: thinkwee/BibGuard (status: Running)
# File size: 5,708 Bytes
# Revision: 46df5f0

"""
Workflow configuration for reference checking.

Allows users to customize the order and enable/disable individual fetchers
in the reference verification workflow.
"""
import json
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import List, Optional


@dataclass
class WorkflowStep:
    """A single step in the reference checking workflow.

    Instances are plain records that round-trip through ``to_dict`` /
    ``from_dict`` so a workflow can be persisted as JSON.
    """
    # Identifier of the step (e.g. "arxiv_id").
    name: str
    # Human-readable label for the step.
    display_name: str
    # Short explanation of what the step does.
    description: str
    # Whether the step takes part in the workflow.
    enabled: bool = True
    # Sort key; WorkflowConfig.get_enabled_steps orders steps by ascending priority.
    priority: int = 0

    # Step type: 'by_id', 'by_doi', 'by_title'
    search_type: str = 'by_title'

    def to_dict(self) -> dict:
        """Return the step's fields as a plain dictionary keyed by field name."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> 'WorkflowStep':
        """Build a step from a dictionary such as one produced by ``to_dict``.

        Keys that do not correspond to a declared field are ignored, so a
        configuration containing extra entries (for example one written by a
        newer version of the tool) still loads instead of failing with an
        "unexpected keyword argument" error.

        Args:
            data: Mapping of field names to values.

        Returns:
            A new ``WorkflowStep``.

        Raises:
            TypeError: If a field without a default (``name``,
                ``display_name``, ``description``) is missing from ``data``.
        """
        # Local import: the module only imports dataclass/field/asdict at the top.
        from dataclasses import fields

        accepted = {f.name for f in fields(cls) if f.init}
        return cls(**{key: value for key, value in data.items() if key in accepted})


@dataclass
class WorkflowConfig:
    """Named, ordered collection of steps for the reference checking workflow.

    The move operations act on positions in ``steps``; after a successful
    move every step's ``priority`` is rewritten to its position in the list.
    """
    steps: List[WorkflowStep] = field(default_factory=list)
    name: str = "default"
    description: str = "Default workflow configuration"

    def get_enabled_steps(self) -> List[WorkflowStep]:
        """Return a new list of the enabled steps, ordered by ascending priority."""
        active = [step for step in self.steps if step.enabled]
        active.sort(key=lambda step: step.priority)
        return active

    def move_step_up(self, index: int) -> bool:
        """Swap the step at ``index`` with the one before it.

        Returns:
            True if the swap happened; False when ``index`` refers to the
            first step or lies outside the list.
        """
        count = len(self.steps)
        if index >= count or index <= 0:
            return False
        above = index - 1
        self.steps[above], self.steps[index] = self.steps[index], self.steps[above]
        self._update_priorities()
        return True

    def move_step_down(self, index: int) -> bool:
        """Swap the step at ``index`` with the one after it.

        Returns:
            True if the swap happened; False when ``index`` refers to the
            last step or lies outside the list.
        """
        last = len(self.steps) - 1
        if index >= last or index < 0:
            return False
        below = index + 1
        self.steps[below], self.steps[index] = self.steps[index], self.steps[below]
        self._update_priorities()
        return True

    def toggle_step(self, index: int) -> bool:
        """Flip the ``enabled`` flag of the step at ``index``.

        Returns:
            True if a step was toggled; False when ``index`` is out of range.
        """
        if not (0 <= index < len(self.steps)):
            return False
        target = self.steps[index]
        target.enabled = not target.enabled
        return True

    def _update_priorities(self):
        """Set each step's priority to its current position in ``steps``."""
        for position, step in enumerate(self.steps):
            step.priority = position

    def to_dict(self) -> dict:
        """Return the configuration as a dictionary with serialized steps."""
        serialized_steps = [step.to_dict() for step in self.steps]
        return dict(
            name=self.name,
            description=self.description,
            steps=serialized_steps,
        )

    @classmethod
    def from_dict(cls, data: dict) -> 'WorkflowConfig':
        """Build a configuration from a dictionary.

        Missing entries fall back to: no steps, the name ``'custom'`` and an
        empty description.
        """
        parsed_steps = list(map(WorkflowStep.from_dict, data.get('steps', [])))
        return cls(
            steps=parsed_steps,
            name=data.get('name', 'custom'),
            description=data.get('description', ''),
        )

    def save(self, filepath: str):
        """Write the configuration to ``filepath`` as indented JSON.

        Parent directories are created when they do not exist yet.
        """
        target = Path(filepath)
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open('w', encoding='utf-8') as handle:
            json.dump(self.to_dict(), handle, indent=2)

    @classmethod
    def load(cls, filepath: str) -> 'WorkflowConfig':
        """Read a JSON file at ``filepath`` and build a configuration from it."""
        with open(filepath, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        return cls.from_dict(payload)


# Default workflow matching current implementation order.
# Each row below is (name, display_name, description, search_type); a step's
# priority is its row position, and every step starts out enabled.
DEFAULT_WORKFLOW = WorkflowConfig(
    name="default",
    description="Default reference checking workflow prioritizing reliable APIs",
    steps=[
        WorkflowStep(
            name=step_name,
            display_name=label,
            description=summary,
            enabled=True,
            priority=rank,
            search_type=lookup,
        )
        for rank, (step_name, label, summary, lookup) in enumerate((
            (
                "arxiv_id",
                "arXiv by ID",
                "Look up paper by arXiv ID (highest priority for arXiv papers)",
                "by_id",
            ),
            (
                "crossref_doi",
                "CrossRef by DOI",
                "Look up paper by DOI (authoritative for DOIs)",
                "by_doi",
            ),
            (
                "semantic_scholar",
                "Semantic Scholar",
                "Official API with high quality metadata",
                "by_title",
            ),
            (
                "dblp",
                "DBLP",
                "Official API, especially good for CS publications",
                "by_title",
            ),
            (
                "openalex",
                "OpenAlex",
                "Official API with broad coverage",
                "by_title",
            ),
            (
                "arxiv_title",
                "arXiv by Title",
                "Search arXiv by title (fallback for non-ID lookups)",
                "by_title",
            ),
            (
                "crossref_title",
                "CrossRef by Title",
                "Search CrossRef by title",
                "by_title",
            ),
            # Still enabled but lowest priority
            (
                "google_scholar",
                "Google Scholar",
                "Web scraping fallback (may be rate-limited or blocked)",
                "by_title",
            ),
        ))
    ],
)


def get_default_workflow() -> WorkflowConfig:
    """Return a new WorkflowConfig rebuilt from DEFAULT_WORKFLOW.

    The default is serialized to a dictionary and parsed back, so the result
    has its own step objects rather than sharing them with DEFAULT_WORKFLOW.
    """
    snapshot = DEFAULT_WORKFLOW.to_dict()
    return WorkflowConfig.from_dict(snapshot)