import time, arxiv from query_builder import build_query from db import get_conn from config import MAX_RESULTS import os, pathlib, tempfile,uuid, shutil # set-up code for huggingface spaces CACHE_DIR = pathlib.Path(tempfile.gettempdir()) / "hf_cache" CACHE_DIR.mkdir(parents=True, exist_ok=True) # guaranteed writable for var in ( "HF_HOME", "HF_HUB_CACHE", "TRANSFORMERS_CACHE", "SENTENCE_TRANSFORMERS_HOME", ): os.environ[var] = str(CACHE_DIR) os.environ["XDG_CACHE_HOME"] = str(pathlib.Path(tempfile.gettempdir()) / ".cache") from sentence_transformers import SentenceTransformer from keybert import KeyBERT st_model = SentenceTransformer( "sentence-transformers/all-MiniLM-L6-v2", cache_folder=str(CACHE_DIR) # explicit path ) kw_model = KeyBERT(st_model) """ # For my Mac from sentence_transformers import SentenceTransformer from keybert import KeyBERT # Use a writable cache directory on macOS cache_dir = os.path.expanduser("~/cache") os.makedirs(cache_dir, exist_ok=True) os.environ["HF_HOME"] = cache_dir os.environ["HF_HUB_CACHE"] = cache_dir os.environ["TRANSFORMERS_CACHE"] = cache_dir os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir st_model = SentenceTransformer( "sentence-transformers/all-MiniLM-L6-v2", cache_folder=cache_dir # <- writable ) kw_model = KeyBERT(st_model) """ def make_tags(title, abstract, top_n=5): """ Extract keywords from the title and abstract using KeyBERT. """ phrases = kw_model.extract_keywords(f"{title}. {abstract}", top_n=top_n, stop_words="english", use_mmr=True) return ", ".join(p for p, _ in phrases) def scrape(max_results=MAX_RESULTS, **criteria): query = build_query(**criteria) search = arxiv.Search(query=query, max_results=max_results * 3, # Get more results to filter from sort_by=arxiv.SortCriterion.SubmittedDate) conn = get_conn() search_results = [] # Track papers from current search that aren't in database papers_added = 0 for p in search.results(): # Check if paper already exists in database existing = conn.execute("SELECT id FROM papers WHERE id=?", (p.entry_id,)).fetchone() if not existing and papers_added < max_results: # Paper doesn't exist, add it tags = make_tags(p.title, p.summary) conn.execute( "INSERT INTO papers VALUES (?,?,?,?,?,?,?)", ( p.entry_id, p.title, ", ".join(a.name for a in p.authors), p.summary, p.published.isoformat(), None, # ummary placeholder tags ), ) # Add to search results search_results.append({ 'title': p.title, 'authors': ", ".join(a.name for a in p.authors), 'abstract': p.summary, 'published': p.published.isoformat() }) papers_added += 1 # Stop if enough papers have been added if papers_added >= max_results: break time.sleep(1) conn.commit() return search_results