# File size: 5,708 Bytes
# 46df5f0
"""
Workflow configuration for reference checking.
Allows users to customize the order and enable/disable individual fetchers
in the reference verification workflow.
"""
import json
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import List, Optional
@dataclass
class WorkflowStep:
    """One lookup stage of the reference checking workflow.

    A step is a plain record. It converts to and from dictionaries whose keys
    are exactly the field names declared below.
    """

    # Identifier of the step.
    name: str
    # Human-readable label for the step.
    display_name: str
    # Free-text explanation of what the step does.
    description: str
    # Whether the step is switched on.
    enabled: bool = True
    # Ordering value for the step.
    priority: int = 0
    # Step type: 'by_id', 'by_doi', 'by_title' (the value is not validated here).
    search_type: str = 'by_title'

    def to_dict(self) -> dict:
        """Return the step's fields as a plain dictionary keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping

    @classmethod
    def from_dict(cls, data: dict) -> 'WorkflowStep':
        """Build a step from a mapping of field names to values.

        The mapping is expanded directly into the constructor, so a missing
        required field or an unknown key raises ``TypeError``.
        """
        step = cls(**data)
        return step
@dataclass
class WorkflowConfig:
    """Configuration for the reference checking workflow.

    Holds the workflow's steps together with a name and a description. The
    index-based methods (``move_step_up``, ``move_step_down``,
    ``toggle_step``) address steps by their position in ``steps``. After a
    move, every step's ``priority`` is rewritten to equal its list position.
    """

    steps: List[WorkflowStep] = field(default_factory=list)
    name: str = "default"
    description: str = "Default workflow configuration"

    def get_enabled_steps(self) -> List[WorkflowStep]:
        """Return the enabled steps, ordered by ascending priority.

        A new list is returned; ``self.steps`` itself is not reordered.
        The sort is stable, so steps with equal priority keep their list order.
        """
        enabled = [step for step in self.steps if step.enabled]
        return sorted(enabled, key=lambda step: step.priority)

    def move_step_up(self, index: int) -> bool:
        """Swap the step at ``index`` with the one before it.

        Returns:
            True if the swap happened; False if ``index`` is the first
            position or lies outside the list (negative values included).
        """
        if index <= 0 or index >= len(self.steps):
            return False
        steps = self.steps
        steps[index - 1], steps[index] = steps[index], steps[index - 1]
        self._update_priorities()
        return True

    def move_step_down(self, index: int) -> bool:
        """Swap the step at ``index`` with the one after it.

        Returns:
            True if the swap happened; False if ``index`` is the last
            position or lies outside the list (negative values included).
        """
        if index < 0 or index >= len(self.steps) - 1:
            return False
        steps = self.steps
        steps[index], steps[index + 1] = steps[index + 1], steps[index]
        self._update_priorities()
        return True

    def toggle_step(self, index: int) -> bool:
        """Flip the ``enabled`` flag of the step at ``index``.

        Returns:
            True if a step was toggled; False if ``index`` is out of range.
        """
        if not 0 <= index < len(self.steps):
            return False
        step = self.steps[index]
        step.enabled = not step.enabled
        return True

    def _update_priorities(self) -> None:
        """Rewrite each step's priority to match its position in ``steps``."""
        for position, step in enumerate(self.steps):
            step.priority = position

    def to_dict(self) -> dict:
        """Return the configuration as a dictionary.

        The result has the keys ``'name'``, ``'description'`` and ``'steps'``
        (a list with one dictionary per step, in list order).
        """
        return {
            'name': self.name,
            'description': self.description,
            'steps': [step.to_dict() for step in self.steps],
        }

    @classmethod
    def from_dict(cls, data: dict) -> 'WorkflowConfig':
        """Build a configuration from a dictionary such as ``to_dict`` produces.

        Args:
            data: Mapping with the optional keys ``'name'`` (default
                ``'custom'``), ``'description'`` (default ``''``) and
                ``'steps'`` (a list of step dictionaries; a missing or null
                value yields no steps).

        Returns:
            A new configuration whose steps are ordered by ascending priority.
        """
        # `or []` also covers an explicit null, which `.get(key, [])` passes
        # through as None.
        raw_steps = data.get('steps') or []
        steps = [WorkflowStep.from_dict(raw) for raw in raw_steps]
        # The move methods treat list position as the canonical order and
        # overwrite priorities from it. Put the list into priority order up
        # front so a later move cannot reshuffle steps whose stored priority
        # disagreed with their position. The sort is stable, so ties keep
        # their original order.
        steps.sort(key=lambda step: step.priority)
        return cls(
            steps=steps,
            name=data.get('name', 'custom'),
            description=data.get('description', ''),
        )

    def save(self, filepath: str):
        """Save the workflow configuration to a JSON file.

        Missing parent directories are created. The file is written as UTF-8
        with two-space indentation.
        """
        path = Path(filepath)
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> 'WorkflowConfig':
        """Load a workflow configuration from a UTF-8 JSON file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls.from_dict(data)
# Default workflow matching current implementation order.
# Each row is (name, display_name, search_type, description); a row's position
# in the table becomes that step's priority (first row -> priority 0).
_DEFAULT_STEP_TABLE = (
    ("arxiv_id", "arXiv by ID", "by_id",
     "Look up paper by arXiv ID (highest priority for arXiv papers)"),
    ("crossref_doi", "CrossRef by DOI", "by_doi",
     "Look up paper by DOI (authoritative for DOIs)"),
    ("semantic_scholar", "Semantic Scholar", "by_title",
     "Official API with high quality metadata"),
    ("dblp", "DBLP", "by_title",
     "Official API, especially good for CS publications"),
    ("openalex", "OpenAlex", "by_title",
     "Official API with broad coverage"),
    ("arxiv_title", "arXiv by Title", "by_title",
     "Search arXiv by title (fallback for non-ID lookups)"),
    ("crossref_title", "CrossRef by Title", "by_title",
     "Search CrossRef by title"),
    # Still enabled but lowest priority.
    ("google_scholar", "Google Scholar", "by_title",
     "Web scraping fallback (may be rate-limited or blocked)"),
)

DEFAULT_WORKFLOW = WorkflowConfig(
    name="default",
    description="Default reference checking workflow prioritizing reliable APIs",
    steps=[
        WorkflowStep(
            name=step_name,
            display_name=label,
            description=summary,
            enabled=True,
            priority=position,
            search_type=lookup_kind,
        )
        for position, (step_name, label, lookup_kind, summary)
        in enumerate(_DEFAULT_STEP_TABLE)
    ],
)
def get_default_workflow() -> WorkflowConfig:
    """Return a new WorkflowConfig rebuilt from DEFAULT_WORKFLOW.

    The default is serialized to a dictionary and reconstructed from it, so
    the result does not share step objects with DEFAULT_WORKFLOW.
    """
    snapshot = DEFAULT_WORKFLOW.to_dict()
    return WorkflowConfig.from_dict(snapshot)
|