Spaces:

thinkwee
/

BibGuard

Running

BibGuard / src /fetchers /dblp_fetcher.py

thinkwee

init

46df5f0 3 months ago

3.97 kB

	import requests
	import time
	import logging
	from typing import Optional, List, Dict, Any
	from dataclasses import dataclass

	@dataclass
	class DBLPResult:
	    """One publication record extracted from a DBLP search hit.

	    Attributes:
	        title: Publication title (the fetcher strips one trailing dot).
	        authors: Author names in the order DBLP lists them.
	        year: Publication year, kept as the string DBLP returns.
	        venue: Venue name as reported by DBLP.
	        url: URL of the DBLP record.
	        doi: DOI of the publication, or None when DBLP reports none.
	    """

	    title: str
	    authors: List[str]
	    year: str
	    venue: str
	    url: str
	    doi: Optional[str] = None

	class DBLPFetcher:
	    """Fetcher for the DBLP publication search API.

	    Requests are spaced at least ``rate_limit_delay`` seconds apart. Every
	    failure mode (blank query, network error, non-200 status, malformed
	    payload, no hits) is reported as ``None`` rather than raised, so callers
	    can treat a lookup as best-effort.
	    """

	    BASE_URL = "https://dblp.org/search/publ/api"

	    # Extra attempts made after an HTTP 429 before giving up on a query.
	    MAX_RATE_LIMIT_RETRIES = 2
	    # Pause (seconds) after a 429 when the server gives no usable Retry-After.
	    RATE_LIMIT_BACKOFF = 5.0
	    # Upper bound (seconds) on a server-requested pause, so a single lookup
	    # cannot stall the caller indefinitely.
	    MAX_RETRY_AFTER = 60.0

	    def __init__(self):
	        """Set up rate-limit bookkeeping and the module logger."""
	        # 0 means "no request made yet", so the first call never waits.
	        self.last_request_time = 0
	        # DBLP asks for 1-2 seconds between requests. We'll use 1.5s to be safe.
	        self.rate_limit_delay = 1.5
	        self.logger = logging.getLogger(__name__)

	    def _wait_for_rate_limit(self):
	        """Sleep until ``rate_limit_delay`` has passed since the last request.

	        Records the current time as the new last-request timestamp afterwards.
	        """
	        elapsed = time.time() - self.last_request_time
	        if elapsed < self.rate_limit_delay:
	            time.sleep(self.rate_limit_delay - elapsed)
	        self.last_request_time = time.time()

	    @classmethod
	    def _retry_delay(cls, retry_after: Optional[str]) -> float:
	        """Return how many seconds to pause after an HTTP 429.

	        Args:
	            retry_after: Raw ``Retry-After`` header value, or None if absent.

	        Returns:
	            The header value when it is a non-negative number of seconds,
	            clamped to [RATE_LIMIT_BACKOFF, MAX_RETRY_AFTER]; otherwise
	            RATE_LIMIT_BACKOFF (this includes HTTP-date values, which are
	            not parsed).
	        """
	        try:
	            seconds = float(retry_after)
	        except (TypeError, ValueError):
	            return cls.RATE_LIMIT_BACKOFF
	        # Written as a negated comparison so NaN is rejected as well.
	        if not seconds >= 0:
	            return cls.RATE_LIMIT_BACKOFF
	        return min(max(seconds, cls.RATE_LIMIT_BACKOFF), cls.MAX_RETRY_AFTER)

	    def search_by_title(self, title: str) -> Optional["DBLPResult"]:
	        """
	        Search DBLP by title.

	        A rate-limited (HTTP 429) request is retried up to
	        ``MAX_RATE_LIMIT_RETRIES`` times, pausing between attempts.

	        Args:
	            title: Paper title to search for

	        Returns:
	            DBLPResult if found, None otherwise (blank title, network or
	            HTTP error, unparseable body, or no hits)
	        """
	        # A blank query cannot match anything; skip the network round trip.
	        if not title or not title.strip():
	            return None

	        params = {
	            "q": title,
	            "format": "json",
	            "h": 3,  # Limit to top 3 hits
	        }

	        for _ in range(self.MAX_RATE_LIMIT_RETRIES + 1):
	            self._wait_for_rate_limit()

	            try:
	                response = requests.get(self.BASE_URL, params=params, timeout=10)
	            except (requests.RequestException, ValueError) as e:
	                self.logger.error("Error fetching from DBLP: %s", e)
	                return None

	            if response.status_code == 429:
	                self.logger.warning("DBLP rate limit exceeded. Waiting longer...")
	                # Pause even on the final attempt so the caller's next
	                # lookup does not immediately hit the limit again.
	                time.sleep(self._retry_delay(response.headers.get("Retry-After")))
	                continue

	            if response.status_code != 200:
	                self.logger.warning("DBLP API error: %s", response.status_code)
	                return None

	            try:
	                data = response.json()
	            except ValueError as e:
	                # JSON decoding errors are ValueError subclasses.
	                self.logger.error("Error fetching from DBLP: %s", e)
	                return None

	            return self._parse_response(data, title)

	        # Still rate-limited after every retry.
	        return None

	    @staticmethod
	    def _normalize_title(title: str) -> str:
	        """Reduce a title to case-folded alphanumerics for loose comparison."""
	        return "".join(ch for ch in title.casefold() if ch.isalnum())

	    @staticmethod
	    def _select_best_hit(hits: List[Any], query_title: str) -> Any:
	        """Pick the hit whose title matches the query, else the first hit.

	        Args:
	            hits: Non-empty list of DBLP hit objects.
	            query_title: Title the caller searched for.

	        Returns:
	            The first hit whose normalized title equals the normalized
	            query; ``hits[0]`` when no hit matches or the query is blank.
	        """
	        wanted = DBLPFetcher._normalize_title(query_title or "")
	        if wanted:
	            for hit in hits:
	                if not isinstance(hit, dict):
	                    continue
	                info = hit.get("info")
	                candidate = info.get("title") if isinstance(info, dict) else None
	                if (
	                    isinstance(candidate, str)
	                    and DBLPFetcher._normalize_title(candidate) == wanted
	                ):
	                    return hit
	        return hits[0]

	    @staticmethod
	    def _extract_authors(info: Dict[str, Any]) -> List[str]:
	        """Return author names from a hit's ``info`` object.

	        Accepts ``info["authors"]["author"]`` as a list of entries or a
	        single entry; each entry may be a dict with a ``"text"`` key or a
	        plain string. Anything else yields an empty list.
	        """
	        authors_field = info.get("authors") or {}
	        if not isinstance(authors_field, dict):
	            return []

	        raw = authors_field.get("author", [])
	        if isinstance(raw, (dict, str)):
	            # A single author is not wrapped in a list.
	            raw = [raw]
	        if not isinstance(raw, list):
	            return []

	        names = []
	        for entry in raw:
	            if isinstance(entry, dict):
	                names.append(entry.get("text", ""))
	            elif isinstance(entry, str):
	                names.append(entry)
	        return names

	    def _parse_response(self, data: Dict[str, Any], query_title: str) -> Optional["DBLPResult"]:
	        """Parse DBLP JSON response.

	        Args:
	            data: Decoded JSON body; hits are read from
	                ``data["result"]["hits"]["hit"]``.
	            query_title: Title that was searched for, used to prefer an
	                exactly matching hit over the first one.

	        Returns:
	            DBLPResult built from the selected hit, or None when there are
	            no hits or the payload does not have the expected structure.
	        """
	        try:
	            hits = data.get("result", {}).get("hits", {}).get("hit", [])
	            if isinstance(hits, dict):
	                # Tolerate a single hit that is not wrapped in a list.
	                hits = [hits]
	            if not hits:
	                return None

	            info = self._select_best_hit(hits, query_title).get("info", {})

	            # Clean title (DBLP titles often end with a dot)
	            title = info.get("title", "") or ""
	            if title.endswith("."):
	                title = title[:-1]

	            venue = info.get("venue", "") or ""
	            if isinstance(venue, list):
	                # NOTE(review): DBLP appears to return a list when a record
	                # has several venues; flatten it to keep the field a str.
	                venue = ", ".join(str(v) for v in venue)

	            doi = info.get("doi", "")

	            return DBLPResult(
	                title=title,
	                authors=self._extract_authors(info),
	                year=str(info.get("year", "") or ""),
	                venue=venue,
	                url=info.get("url", "") or "",
	                doi=doi if doi else None,
	            )

	        except (AttributeError, IndexError, KeyError, TypeError) as e:
	            # Raised when the payload does not have the expected structure.
	            self.logger.error("Error parsing DBLP response: %s", e)
	            return None