BibGuard

Sleeping

thinkwee

init

46df5f0 4 months ago

5.71 kB

	"""
	Workflow configuration for reference checking.

	Allows users to customize the order and enable/disable individual fetchers
	in the reference verification workflow.
	"""
	import json
	from dataclasses import dataclass, field, asdict
	from pathlib import Path
	from typing import List, Optional


	@dataclass
	class WorkflowStep:
	    """One stage of the reference checking workflow.

	    A plain record; ``to_dict`` and ``from_dict`` convert it to and from a
	    dictionary keyed by field name.
	    """

	    # Identifier of the step.
	    name: str
	    # Human-readable label for the step.
	    display_name: str
	    # Free-text explanation of the step.
	    description: str
	    # Whether the step is switched on; on unless stated otherwise.
	    enabled: bool = True
	    # Ordering value; WorkflowConfig.get_enabled_steps sorts ascending on it.
	    priority: int = 0
	    # Step type: 'by_id', 'by_doi', 'by_title'
	    search_type: str = "by_title"

	    def to_dict(self) -> dict:
	        """Return every field of this step as a dictionary keyed by field name."""
	        serialized = asdict(self)
	        return serialized

	    @classmethod
	    def from_dict(cls, data: dict) -> "WorkflowStep":
	        """Create a step by passing each entry of ``data`` as a keyword argument.

	        Keys must therefore be field names; ``name``, ``display_name`` and
	        ``description`` are required, the remaining fields fall back to their
	        defaults.
	        """
	        step = cls(**data)
	        return step


	@dataclass
	class WorkflowConfig:
	    """Named, reorderable collection of steps for the reference checking workflow.

	    ``steps`` holds every step, enabled or not. The reordering methods work on
	    positions in ``steps`` and then renumber each step's ``priority`` to its
	    position, whereas ``get_enabled_steps`` orders by ``priority``. To keep
	    both views in agreement, ``from_dict`` arranges loaded steps in priority
	    order. Steps passed straight to the constructor are stored as given.
	    """

	    steps: List[WorkflowStep] = field(default_factory=list)
	    name: str = "default"
	    description: str = "Default workflow configuration"

	    def get_enabled_steps(self) -> List[WorkflowStep]:
	        """Return the enabled steps in ascending ``priority`` order.

	        The sort is stable, so steps sharing a priority keep their relative
	        order from ``steps``. A new list is returned; ``steps`` is unchanged.
	        """
	        return sorted(
	            (step for step in self.steps if step.enabled),
	            key=lambda step: step.priority,
	        )

	    def move_step_up(self, index: int) -> bool:
	        """Swap the step at ``index`` with the one before it.

	        All priorities are renumbered to list positions afterwards.

	        Returns:
	            True if the swap happened; False (nothing changed) when ``index``
	            is 0, negative, or not a valid position in ``steps``.
	        """
	        if index <= 0 or index >= len(self.steps):
	            return False
	        self.steps[index], self.steps[index - 1] = self.steps[index - 1], self.steps[index]
	        self._update_priorities()
	        return True

	    def move_step_down(self, index: int) -> bool:
	        """Swap the step at ``index`` with the one after it.

	        All priorities are renumbered to list positions afterwards.

	        Returns:
	            True if the swap happened; False (nothing changed) when ``index``
	            is negative or refers to the last position or beyond.
	        """
	        if index < 0 or index >= len(self.steps) - 1:
	            return False
	        self.steps[index], self.steps[index + 1] = self.steps[index + 1], self.steps[index]
	        self._update_priorities()
	        return True

	    def toggle_step(self, index: int) -> bool:
	        """Flip the ``enabled`` flag of the step at ``index``.

	        Returns:
	            True if ``index`` is a valid non-negative position and the flag
	            was flipped; False otherwise.
	        """
	        if 0 <= index < len(self.steps):
	            self.steps[index].enabled = not self.steps[index].enabled
	            return True
	        return False

	    def _update_priorities(self):
	        """Set each step's ``priority`` to its current position in ``steps``."""
	        for position, step in enumerate(self.steps):
	            step.priority = position

	    def to_dict(self) -> dict:
	        """Return the configuration as a dictionary of plain values.

	        The result has the keys ``'name'``, ``'description'`` and ``'steps'``
	        (a list with one dictionary per step, in list order).
	        """
	        return {
	            'name': self.name,
	            'description': self.description,
	            'steps': [step.to_dict() for step in self.steps]
	        }

	    @classmethod
	    def from_dict(cls, data: dict) -> 'WorkflowConfig':
	        """Build a configuration from a dictionary such as ``to_dict`` produces.

	        A missing ``'name'`` becomes ``'custom'`` and a missing
	        ``'description'`` becomes an empty string. A missing or null
	        ``'steps'`` entry yields a configuration without steps.

	        Steps are stored in ascending ``priority`` order (stable for ties),
	        whatever their order in ``data``. Without this, a dictionary whose
	        list order disagrees with its priorities would run in priority order
	        until the first move, which renumbers priorities from list order and
	        silently discards the stored ordering.
	        """
	        raw_steps = data.get('steps') or []
	        steps = sorted(
	            (WorkflowStep.from_dict(raw) for raw in raw_steps),
	            key=lambda step: step.priority,
	        )
	        return cls(
	            steps=steps,
	            name=data.get('name', 'custom'),
	            description=data.get('description', '')
	        )

	    def save(self, filepath: str):
	        """Write the configuration to ``filepath`` as indented UTF-8 JSON.

	        Missing parent directories are created first. An existing file at
	        ``filepath`` is overwritten.
	        """
	        path = Path(filepath)
	        path.parent.mkdir(parents=True, exist_ok=True)
	        with open(path, 'w', encoding='utf-8') as f:
	            json.dump(self.to_dict(), f, indent=2)

	    @classmethod
	    def load(cls, filepath: str) -> 'WorkflowConfig':
	        """Read a UTF-8 JSON file and build a configuration via ``from_dict``.

	        Errors raised while opening or parsing the file are not caught here
	        and propagate to the caller.
	        """
	        with open(filepath, 'r', encoding='utf-8') as f:
	            data = json.load(f)
	        return cls.from_dict(data)


	# Default workflow matching current implementation order.
	# Each row is (name, display_name, description, search_type); a row's position
	# in the table becomes its priority, so earlier rows sort first.
	DEFAULT_WORKFLOW = WorkflowConfig(
	    steps=[
	        WorkflowStep(
	            name=step_name,
	            display_name=label,
	            description=summary,
	            priority=position,
	            search_type=lookup_kind,
	        )
	        for position, (step_name, label, summary, lookup_kind) in enumerate([
	            (
	                "arxiv_id",
	                "arXiv by ID",
	                "Look up paper by arXiv ID (highest priority for arXiv papers)",
	                "by_id",
	            ),
	            (
	                "crossref_doi",
	                "CrossRef by DOI",
	                "Look up paper by DOI (authoritative for DOIs)",
	                "by_doi",
	            ),
	            (
	                "semantic_scholar",
	                "Semantic Scholar",
	                "Official API with high quality metadata",
	                "by_title",
	            ),
	            (
	                "dblp",
	                "DBLP",
	                "Official API, especially good for CS publications",
	                "by_title",
	            ),
	            (
	                "openalex",
	                "OpenAlex",
	                "Official API with broad coverage",
	                "by_title",
	            ),
	            (
	                "arxiv_title",
	                "arXiv by Title",
	                "Search arXiv by title (fallback for non-ID lookups)",
	                "by_title",
	            ),
	            (
	                "crossref_title",
	                "CrossRef by Title",
	                "Search CrossRef by title",
	                "by_title",
	            ),
	            # Still enabled (the field default) but lowest priority.
	            (
	                "google_scholar",
	                "Google Scholar",
	                "Web scraping fallback (may be rate-limited or blocked)",
	                "by_title",
	            ),
	        ])
	    ],
	    name="default",
	    description="Default reference checking workflow prioritizing reliable APIs",
	)


	def get_default_workflow() -> WorkflowConfig:
	    """Return a new configuration rebuilt from DEFAULT_WORKFLOW's dictionary form.

	    The result is built from freshly created objects, so changing it leaves
	    DEFAULT_WORKFLOW itself untouched.
	    """
	    snapshot = DEFAULT_WORKFLOW.to_dict()
	    return WorkflowConfig.from_dict(snapshot)