import logging
import pathlib
import random
import re
import time

import requests
from tqdm import trange

from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import dump_json, load_json

logger = logging.getLogger("uvicorn.default")
class DblpPaperList(SearchAPI):
    """DBLP paper list.

    Inputs:
        cache_filepath: Filepath to save the cached results to.
        use_cache: Use the cached file if `True`, otherwise download again.
        query: Query string, essentially the title you would type into a
            search box. See the reference below for the special logical
            grammar DBLP supports.
        max_results: Maximum number of papers to return.
        request_time_interval: Base number of seconds to sleep between
            DBLP API calls.

    References:
        https://dblp.org/faq/How+to+use+the+dblp+search+API.html
    """
    API_URL = "https://dblp.org/search/publ/api"

    def __init__(
        self,
        cache_filepath: pathlib.Path,
        use_cache: bool = False,
        query: str = "",
        max_results: int = 5000,
        request_time_interval: float = 3,
    ) -> None:
        super().__init__()
        if isinstance(cache_filepath, str):
            cache_filepath = pathlib.Path(cache_filepath)
        if (not cache_filepath.exists()) or (not use_cache):
            # Normalize the query: drop spaces around "|" and join the
            # remaining words with "+" before putting it into the URL.
            query = query.strip()
            query = re.sub(r"\s+?\|\s+?", "|", query)
            query = re.sub(r"\s+", "+", query)
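            # For illustration (hypothetical input): a query such as
            #   "graph neural network | transformer"
            # becomes
            #   "graph+neural+network|transformer"
            # after the two substitutions above; per the FAQ referenced in the
            # docstring, DBLP treats "|" as a logical OR between terms.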
            searched_results = []
            # DBLP caps the number of hits per request (h) at 1000, so page
            # through the results using the first-hit offset f.
            h = 1000
            for f in trange(0, max_results, h, desc="DBLP Downloading"):
                url = f"{self.API_URL}?q={query}&format=json&c=0&f={f}&h={h}"
                try:
                    # A timeout keeps the loop from hanging on a stalled connection.
                    response = requests.get(url, timeout=30)
                    response.raise_for_status()
                    page = response.json()
                    page_data = page["result"]["hits"]["hit"]
                    if page_data:
                        searched_results.extend(page_data)
                    else:
                        break
                except KeyboardInterrupt:
                    raise
                except Exception as err:
                    logger.info(err)
                    break
                # Jittered pause between requests to stay polite to the API.
                time.sleep((random.random() + 0.5) * request_time_interval)
            dump_json(searched_results, cache_filepath)
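        # Each cached hit is a dict whose "info" entry carries the fields read
        # below; roughly (only the keys used here, values abbreviated):
        #   {"info": {"title": ..., "authors": {"author": {...} or [...]},
        #             "venue": str or list, "ee": ..., "url": ..., "doi": ...,
        #             "year": ...}}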
        data = load_json(cache_filepath)
        for d in data:
            # dblp does not provide abstract and month data
            authors = []
            if "authors" in d["info"]:
                if isinstance(d["info"]["authors"]["author"], dict):
                    authors.append(d["info"]["authors"]["author"]["text"])
                else:
                    authors = [a["text"] for a in d["info"]["authors"]["author"]]
            venues = []
            if "venue" in d["info"]:
                if isinstance(d["info"]["venue"], str):
                    venues.append(d["info"]["venue"])
                else:
                    for venue in d["info"]["venue"]:
                        venues.append(venue)
            paper = Paper(
                d["info"]["title"],
                " , ".join(authors),
                "",
                d["info"].get("ee", d["info"].get("url", "")),
                d["info"].get("doi", ""),
                " , ".join(venues),
                d["info"].get("year", "9999"),
                "99",
            )
            self.papers.append(paper)
    @classmethod
    def build_paper_list(
        cls, cache_filepath: str, query: dict, max_results: int = 1000
    ):
        title = query.get("title", [])
        abstract = query.get("abstract", [])
        cls_q = ""
        for t in title:
            cls_q += " ".join(t)
        for a in abstract:
            cls_q += " ".join(a)
        return cls(
            cache_filepath,
            use_cache=False,
            query=cls_q,
            max_results=max_results,
        )
    @classmethod
    def build_and_search(
        cls, cache_filepath: str, query: dict, max_results: int = 1000
    ) -> list[Paper]:
        obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
        return obj.search(query)[:max_results]
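

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only). The query dict shape mirrors
    # what build_paper_list expects above: lists of keyword groups under
    # "title" and "abstract". The cache path is a hypothetical example, and
    # SearchAPI.search / Paper come from the src modules imported above, which
    # are assumed to accept this same query dict.
    example_query = {
        "title": [["graph", "neural", "network"]],
        "abstract": [],
    }
    papers = DblpPaperList.build_and_search(
        "cache/dblp_gnn.json", example_query, max_results=100
    )
    for p in papers:
        print(p)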