Spaces:
Sleeping
Sleeping
File size: 3,486 Bytes
0b51cd7 b9e7b32 06018df 2441327 b9e7b32 0b51cd7 b9e7b32 6b2fd2d 60af45b b9e7b32 06018df b9e7b32 6b2fd2d e469e5f c1d68a4 b9e7b32 c1d68a4 e469e5f b9e7b32 e469e5f 0b51cd7 b9e7b32 0b51cd7 792575c 0b51cd7 792575c fd2e156 0b51cd7 792575c fd2e156 792575c fd2e156 792575c fd2e156 792575c 0b51cd7 fd2e156 792575c 0b51cd7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import time, arxiv
from query_builder import build_query
from db import get_conn
from config import MAX_RESULTS
import os, pathlib, tempfile,uuid, shutil
# Set-up for Hugging Face Spaces: keep every model/cache directory under the
# system temp dir, which the original author notes is guaranteed writable.
CACHE_DIR = pathlib.Path(tempfile.gettempdir(), "hf_cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Point each Hugging Face cache variable at CACHE_DIR. This is done before the
# libraries below are imported (presumably so they see the overrides when they
# are first loaded -- keep this ordering).
for var in ("HF_HOME", "HF_HUB_CACHE", "TRANSFORMERS_CACHE", "SENTENCE_TRANSFORMERS_HOME"):
    os.environ[var] = os.fspath(CACHE_DIR)
os.environ["XDG_CACHE_HOME"] = os.fspath(pathlib.Path(tempfile.gettempdir(), ".cache"))

from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

# Shared embedding model; the cache folder is passed explicitly as well as
# through the environment variables above.
st_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder=os.fspath(CACHE_DIR),
)
# Keyword extractor built on top of the shared embedding model.
kw_model = KeyBERT(st_model)
"""
# For my Mac
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
# Use a writable cache directory on macOS
cache_dir = os.path.expanduser("~/cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ["HF_HOME"] = cache_dir
os.environ["HF_HUB_CACHE"] = cache_dir
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
st_model = SentenceTransformer(
"sentence-transformers/all-MiniLM-L6-v2",
cache_folder=cache_dir # <- writable
)
kw_model = KeyBERT(st_model)
"""
def make_tags(title, abstract, top_n=5):
    """
    Build a comma-separated tag string for a paper.

    The title and abstract are combined into one text ("<title>. <abstract>")
    and passed to the module-level KeyBERT model with English stop words and
    MMR enabled. Only the phrase of each returned (phrase, score) pair is
    kept; the phrases are joined with ", " in the order KeyBERT returned them.

    Args:
        title: Paper title.
        abstract: Paper abstract.
        top_n: Number of keywords requested from KeyBERT.

    Returns:
        The extracted phrases as a single ", "-separated string.
    """
    text = f"{title}. {abstract}"
    scored_phrases = kw_model.extract_keywords(
        text,
        top_n=top_n,
        stop_words="english",
        use_mmr=True,
    )
    keywords = [phrase for phrase, _score in scored_phrases]
    return ", ".join(keywords)
def scrape(max_results=MAX_RESULTS, **criteria):
    """
    Search arXiv and store papers that are not yet in the database.

    The query string comes from ``build_query(**criteria)`` and results are
    requested sorted by submission date. Three times ``max_results`` results
    are requested so that papers already present in the ``papers`` table can
    be skipped while still collecting up to ``max_results`` new ones.

    Args:
        max_results: Maximum number of new papers to insert and return.
            Zero or a negative value returns an empty list without querying
            arXiv or opening the database.
        **criteria: Keyword arguments forwarded unchanged to ``build_query``.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``abstract``
        and ``published`` -- one per paper inserted by this call, in the
        order the search returned them.
    """
    if max_results <= 0:
        # Nothing could be added; avoid issuing a search with a
        # non-positive result limit.
        return []

    query = build_query(**criteria)
    search = arxiv.Search(
        query=query,
        max_results=max_results * 3,  # over-fetch so known papers can be skipped
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    conn = get_conn()
    search_results = []  # papers from this search that were not in the database

    # Client.results(search) is the supported replacement for the deprecated
    # Search.results() helper.
    for p in arxiv.Client().results(search):
        existing = conn.execute(
            "SELECT id FROM papers WHERE id=?", (p.entry_id,)
        ).fetchone()
        if existing:
            # Already stored; move on to the next result.
            continue

        # Computed once and reused for both the row and the returned dict.
        authors = ", ".join(a.name for a in p.authors)
        published = p.published.isoformat()
        tags = make_tags(p.title, p.summary)

        conn.execute(
            "INSERT INTO papers VALUES (?,?,?,?,?,?,?)",
            (
                p.entry_id,
                p.title,
                authors,
                p.summary,
                published,
                None,  # summary placeholder
                tags,
            ),
        )
        search_results.append({
            'title': p.title,
            'authors': authors,
            'abstract': p.summary,
            'published': published,
        })

        # Stop as soon as enough new papers have been added.
        if len(search_results) >= max_results:
            break
        # Throttle: wait one second before handling the next paper.
        time.sleep(1)

    conn.commit()
    return search_results
|