Spaces:
Sleeping
Sleeping
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline | |
| from db import get_conn | |
| from config import MODEL_NAME | |
| from helpers import rows_by_tag | |
| import os | |
| import tempfile | |
| import pathlib | |
"""
Summarise the abstract of a paper using an LLM. Further versions should instead summarise the full paper.
"""
# Prompt template fed to the text-generation pipeline; `{abstract}` is filled
# per paper. The ===ABSTRACT=== / ===SUMMARY=== sentinels delimit the input so
# the model's continuation after ===SUMMARY=== is the summary we keep.
PROMPT = (
    "You are a research assistant. Summarise the abstract below in 5 or less bullet points, "
    "highlighting method and key findings.\n"
    "===ABSTRACT===\n{abstract}\n"
    "===SUMMARY===\n"
)
| # ---------------------------------------------------------------------- # | |
def load_pipe():
    """Build a deterministic text-generation pipeline for MODEL_NAME.

    Model weights and tokenizer files are cached under a per-machine temp
    directory so repeated runs avoid re-downloading.
    """
    hf_cache = pathlib.Path(tempfile.gettempdir()) / "hf_cache"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=hf_cache)
    # Causal LMs frequently ship without a pad token; reuse EOS so the
    # pipeline can pad batches without erroring.
    tokenizer.pad_token = tokenizer.eos_token

    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        cache_dir=hf_cache,
        device_map="auto",
    )

    return pipeline(
        "text-generation",
        model=lm,
        tokenizer=tokenizer,
        do_sample=False,         # greedy decoding -> reproducible summaries
        return_full_text=False,  # emit only the generated continuation, not the prompt
    )
def summarise_by_tag(keyword: str, limit: int = 10) -> int:
    """Summarise up to `limit` un-summarised papers whose tags match `keyword`.

    Only rows where `summary IS NULL` and `tags` contains `keyword`
    (case-insensitively) are processed. Each generated summary is committed
    immediately, so partial progress survives a crash mid-batch.

    Args:
        keyword: Substring matched case-insensitively against the tags column.
        limit: Maximum number of rows to process in this call.

    Returns:
        The number of rows that were summarised and updated.
    """
    pipe = load_pipe()
    conn = get_conn()
    try:
        # Only the %...% wildcard framing is built in Python; the value itself
        # is bound as a parameter, so no SQL injection is possible.
        pattern = f"%{keyword.lower()}%"
        rows = conn.execute(
            "SELECT id, abstract FROM papers "
            "WHERE summary IS NULL AND LOWER(tags) LIKE ? "
            "ORDER BY published DESC LIMIT ?",
            (pattern, limit),
        ).fetchall()

        # `paper_id` instead of `id`: avoid shadowing the builtin.
        for paper_id, abstract in rows:
            out = pipe(PROMPT.format(abstract=abstract), max_new_tokens=150)[0][
                "generated_text"
            ]
            conn.execute(
                "UPDATE papers SET summary=? WHERE id=?", (out.strip(), paper_id)
            )
            # Commit per row on purpose: generation is slow, so persist each
            # summary as soon as it exists rather than risking the whole batch.
            conn.commit()
        return len(rows)
    finally:
        # Fix: the connection was previously never closed (resource leak).
        conn.close()