Spaces:

Heit39
/

LLM_Screener

Running

App Files Files

diogo.rodrigues.silva committed on Mar 11

Commit

b156781

1 Parent(s): e52b6da

Adding .ris download functionality

Browse files

Files changed (1) hide show

app.py +93 -3

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ import threading
 import time
 import hmac
 import hashlib
 from datetime import datetime
 from pathlib import Path
 from tempfile import mkdtemp
@@ -32,7 +33,7 @@ ALLOWED_CRITERIA_SUFFIXES = {".yml", ".yaml", ".json"}
 APP_ROOT_DIR = Path(__file__).resolve().parent
 APP_TMP_DIR = Path(os.getenv("APP_TMP_DIR", str(APP_ROOT_DIR / "artifacts"))).resolve()
 APP_RUNS_DIR = APP_TMP_DIR / "runs"
-APP_TMP_FILE_EXTENSIONS = {".xlsx", ".json"}
 MAX_UPLOAD_FILES = int(os.getenv("MAX_UPLOAD_FILES", "20"))
 MAX_UPLOAD_FILE_MB = int(os.getenv("MAX_UPLOAD_FILE_MB", "25"))
 MAX_CRITERIA_FILE_MB = int(os.getenv("MAX_CRITERIA_FILE_MB", "2"))
@@ -156,6 +157,77 @@ def _screening_verdict_counts(screened_excel_path: Path) -> dict[str, int]:
     }
 def _auth_credentials() -> tuple[str, str]:
     username = (os.getenv(APP_USERNAME_ENV) or "").strip()
     password = (os.getenv(APP_PASSWORD_ENV) or "").strip()
@@ -359,20 +431,38 @@ def screen_excel(
         return
     progress(1, desc="Screening complete.")
     try:
         verdict_counts = _screening_verdict_counts(screened_output)
         completed_status = (
             "Screening complete: "
             f"Included {verdict_counts['include']} | "
             f"Excluded {verdict_counts['exclude']} | "
-            f"Unclear {verdict_counts['unclear']}"
         )
     except Exception:
         completed_status = "Screening complete."
     yield (
         completed_status,
-        _download_markdown(screened_output, "Download Screened Excel"),
     )

 import time
 import hmac
 import hashlib
+import re
 from datetime import datetime
 from pathlib import Path
 from tempfile import mkdtemp
 APP_ROOT_DIR = Path(__file__).resolve().parent
 APP_TMP_DIR = Path(os.getenv("APP_TMP_DIR", str(APP_ROOT_DIR / "artifacts"))).resolve()
 APP_RUNS_DIR = APP_TMP_DIR / "runs"
+APP_TMP_FILE_EXTENSIONS = {".xlsx", ".json", ".ris"}
 MAX_UPLOAD_FILES = int(os.getenv("MAX_UPLOAD_FILES", "20"))
 MAX_UPLOAD_FILE_MB = int(os.getenv("MAX_UPLOAD_FILE_MB", "25"))
 MAX_CRITERIA_FILE_MB = int(os.getenv("MAX_CRITERIA_FILE_MB", "2"))
     }
+def _write_included_unclear_ris(screened_excel_path: Path, output_ris_path: Path) -> int:
+    """Write a RIS file containing only include/unclear references.
+
+    Reads the screened workbook, keeps the rows whose ``LLM_verdict`` is
+    ``include`` or ``unclear`` (compared case- and whitespace-insensitively)
+    and writes one ``TY  - JOUR`` record per kept row as UTF-8 text.
+
+    Args:
+        screened_excel_path: Screened ``.xlsx`` workbook to read.
+        output_ris_path: Destination file; overwritten if it already exists.
+
+    Returns:
+        The number of records written (0 yields an empty file).
+
+    Raises:
+        KeyError: If the workbook has no ``LLM_verdict`` column.
+    """
+    df = pd.read_excel(screened_excel_path, engine="openpyxl")
+    if "LLM_verdict" not in df.columns:
+        raise KeyError("Expected 'LLM_verdict' column was not found in screening output.")
+    selected = df[
+        df["LLM_verdict"]
+        .astype(str)
+        .str.strip()
+        .str.lower()
+        .isin({"include", "unclear"})
+    ]
+    def _stringify(value) -> str:
+        """Return the cell as stripped text ('' for None/NaN), line breaks kept."""
+        if pd.isna(value):
+            return ""
+        # A numeric column containing blanks is read back as float, so an
+        # identifier such as 12345678 would otherwise be emitted as "12345678.0".
+        if isinstance(value, float) and value.is_integer():
+            value = int(value)
+        return str(value).strip()
+    def _clean_value(value) -> str:
+        """Return the cell as single-line text with whitespace runs collapsed."""
+        return re.sub(r"\s+", " ", _stringify(value))
+    def _authors_from_row(row: pd.Series) -> list[str]:
+        """Return the author names from the first non-empty author column."""
+        # Test emptiness on the converted text: a NaN cell is truthy, so an
+        # ``or`` chain over the raw cells would never reach "FullAuthors".
+        for column in ("Authors", "FullAuthors"):
+            raw = _stringify(row.get(column))
+            if raw:
+                # Split before collapsing whitespace, otherwise newline
+                # separators have already been turned into spaces.
+                names = [_clean_value(part) for part in re.split(r";|\n", raw)]
+                return [name for name in names if name]
+        return []
+    lines: list[str] = []
+    for _, row in selected.iterrows():
+        title = _clean_value(row.get("Title", ""))
+        abstract = _clean_value(row.get("Abstract", ""))
+        journal = _clean_value(row.get("Journal", ""))
+        doi = _clean_value(row.get("DOI", ""))
+        pmid = _clean_value(row.get("PMID", ""))
+        url = _clean_value(row.get("URL", ""))
+        verdict = _clean_value(row.get("LLM_verdict", "")).lower()
+        rationale = _clean_value(row.get("LLM_rationale", ""))
+        # Prefer "Year"; otherwise take the leading four characters of
+        # "PublicationDate". Either way the value is capped at four characters.
+        year = _clean_value(row.get("Year", ""))
+        if not year:
+            year = _clean_value(row.get("PublicationDate", ""))
+        year = year[:4]
+        lines.append("TY  - JOUR")
+        if title:
+            lines.append(f"TI  - {title}")
+        lines.extend(f"AU  - {author}" for author in _authors_from_row(row))
+        if journal:
+            lines.append(f"JO  - {journal}")
+        if year:
+            lines.append(f"PY  - {year}")
+        if abstract:
+            lines.append(f"AB  - {abstract}")
+        if doi:
+            lines.append(f"DO  - {doi}")
+        if pmid:
+            lines.append(f"ID  - PMID:{pmid}")
+        if url:
+            lines.append(f"UR  - {url}")
+        if verdict:
+            lines.append(f"N1  - LLM verdict: {verdict}")
+        if rationale:
+            lines.append(f"N1  - LLM rationale: {rationale}")
+        lines.append("ER  -")
+        # Blank separator line between records (and a trailing newline at EOF).
+        lines.append("")
+    output_ris_path.write_text("\n".join(lines), encoding="utf-8")
+    return len(selected)
 def _auth_credentials() -> tuple[str, str]:
     username = (os.getenv(APP_USERNAME_ENV) or "").strip()
     password = (os.getenv(APP_PASSWORD_ENV) or "").strip()
         return
     progress(1, desc="Screening complete.")
+    screened_ris_output = parsed_path.parent / f"screened_included_unclear_{_timestamp_slug()}.ris"
+    ris_count = 0
+    ris_error = None
+    try:
+        ris_count = _write_included_unclear_ris(screened_output, screened_ris_output)
+    except Exception as exc:
+        ris_error = str(exc)
     try:
         verdict_counts = _screening_verdict_counts(screened_output)
         completed_status = (
             "Screening complete: "
             f"Included {verdict_counts['include']} | "
             f"Excluded {verdict_counts['exclude']} | "
+            f"Unclear {verdict_counts['unclear']} | "
+            f"RIS references {ris_count}"
         )
+        if ris_error:
+            completed_status += f" | RIS export failed: {ris_error}"
     except Exception:
         completed_status = "Screening complete."
+        if ris_error:
+            completed_status += f" RIS export failed: {ris_error}"
+    downloads = [_download_markdown(screened_output, "Download Screened Excel")]
+    if screened_ris_output.exists():
+        downloads.append(
+            _download_markdown(screened_ris_output, "Download Included + Unclear RIS")
+        )
     yield (
         completed_status,
+        " | ".join(downloads),
     )