Spaces:
Running
Running
diogo.rodrigues.silva committed on
Commit ·
b156781
1
Parent(s): e52b6da
Adding .ris download functionality
Browse files
app.py
CHANGED
|
@@ -12,6 +12,7 @@ import threading
|
|
| 12 |
import time
|
| 13 |
import hmac
|
| 14 |
import hashlib
|
|
|
|
| 15 |
from datetime import datetime
|
| 16 |
from pathlib import Path
|
| 17 |
from tempfile import mkdtemp
|
|
@@ -32,7 +33,7 @@ ALLOWED_CRITERIA_SUFFIXES = {".yml", ".yaml", ".json"}
|
|
| 32 |
APP_ROOT_DIR = Path(__file__).resolve().parent
|
| 33 |
APP_TMP_DIR = Path(os.getenv("APP_TMP_DIR", str(APP_ROOT_DIR / "artifacts"))).resolve()
|
| 34 |
APP_RUNS_DIR = APP_TMP_DIR / "runs"
|
| 35 |
-
APP_TMP_FILE_EXTENSIONS = {".xlsx", ".json"}
|
| 36 |
MAX_UPLOAD_FILES = int(os.getenv("MAX_UPLOAD_FILES", "20"))
|
| 37 |
MAX_UPLOAD_FILE_MB = int(os.getenv("MAX_UPLOAD_FILE_MB", "25"))
|
| 38 |
MAX_CRITERIA_FILE_MB = int(os.getenv("MAX_CRITERIA_FILE_MB", "2"))
|
|
@@ -156,6 +157,77 @@ def _screening_verdict_counts(screened_excel_path: Path) -> dict[str, int]:
|
|
| 156 |
}
|
| 157 |
|
| 158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
def _auth_credentials() -> tuple[str, str]:
|
| 160 |
username = (os.getenv(APP_USERNAME_ENV) or "").strip()
|
| 161 |
password = (os.getenv(APP_PASSWORD_ENV) or "").strip()
|
|
@@ -359,20 +431,38 @@ def screen_excel(
|
|
| 359 |
return
|
| 360 |
|
| 361 |
progress(1, desc="Screening complete.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
try:
|
| 363 |
verdict_counts = _screening_verdict_counts(screened_output)
|
| 364 |
completed_status = (
|
| 365 |
"Screening complete: "
|
| 366 |
f"Included {verdict_counts['include']} | "
|
| 367 |
f"Excluded {verdict_counts['exclude']} | "
|
| 368 |
-
f"Unclear {verdict_counts['unclear']}"
|
|
|
|
| 369 |
)
|
|
|
|
|
|
|
| 370 |
except Exception:
|
| 371 |
completed_status = "Screening complete."
|
|
|
|
|
|
|
| 372 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
yield (
|
| 374 |
completed_status,
|
| 375 |
-
|
| 376 |
)
|
| 377 |
|
| 378 |
|
|
|
|
| 12 |
import time
|
| 13 |
import hmac
|
| 14 |
import hashlib
|
| 15 |
+
import re
|
| 16 |
from datetime import datetime
|
| 17 |
from pathlib import Path
|
| 18 |
from tempfile import mkdtemp
|
|
|
|
| 33 |
APP_ROOT_DIR = Path(__file__).resolve().parent
|
| 34 |
APP_TMP_DIR = Path(os.getenv("APP_TMP_DIR", str(APP_ROOT_DIR / "artifacts"))).resolve()
|
| 35 |
APP_RUNS_DIR = APP_TMP_DIR / "runs"
|
| 36 |
+
APP_TMP_FILE_EXTENSIONS = {".xlsx", ".json", ".ris"}
|
| 37 |
MAX_UPLOAD_FILES = int(os.getenv("MAX_UPLOAD_FILES", "20"))
|
| 38 |
MAX_UPLOAD_FILE_MB = int(os.getenv("MAX_UPLOAD_FILE_MB", "25"))
|
| 39 |
MAX_CRITERIA_FILE_MB = int(os.getenv("MAX_CRITERIA_FILE_MB", "2"))
|
|
|
|
| 157 |
}
|
| 158 |
|
| 159 |
|
| 160 |
+
def _write_included_unclear_ris(screened_excel_path: Path, output_ris_path: Path) -> int:
    """Write a RIS file containing only include/unclear references.

    Reads the screening-output spreadsheet, keeps the rows whose
    ``LLM_verdict`` is "include" or "unclear" (case- and
    whitespace-insensitive), and serialises each one as a RIS
    journal-article (``TY - JOUR``) record.

    Args:
        screened_excel_path: Path to the screened ``.xlsx`` workbook.
        output_ris_path: Destination path for the generated ``.ris`` file.

    Returns:
        The number of references written.

    Raises:
        KeyError: If the spreadsheet has no ``LLM_verdict`` column.
    """
    df = pd.read_excel(screened_excel_path, engine="openpyxl")
    if "LLM_verdict" not in df.columns:
        raise KeyError("Expected 'LLM_verdict' column was not found in screening output.")

    selected = df[
        df["LLM_verdict"].astype(str).str.strip().str.lower().isin({"include", "unclear"})
    ]

    def _clean_value(value) -> str:
        # NaN/None cells become empty strings; runs of whitespace collapse
        # to a single space so multi-line cells stay on one RIS line.
        if pd.isna(value):
            return ""
        return re.sub(r"\s+", " ", str(value).strip())

    def _authors_from_row(row: pd.Series) -> list[str]:
        # Prefer "Authors", falling back to "FullAuthors" when the first is
        # missing or blank. Each column is cleaned separately: a missing
        # Excel cell is NaN, which is *truthy*, so the previous
        # `row.get("Authors") or row.get("FullAuthors")` expression never
        # reached the fallback column.
        raw = _clean_value(row.get("Authors"))
        if not raw:
            raw = _clean_value(row.get("FullAuthors"))
        if not raw:
            return []
        return [a.strip() for a in re.split(r";|\n", raw) if a.strip()]

    lines: list[str] = []
    for _, row in selected.iterrows():
        title = _clean_value(row.get("Title", ""))
        abstract = _clean_value(row.get("Abstract", ""))
        journal = _clean_value(row.get("Journal", ""))
        doi = _clean_value(row.get("DOI", ""))
        pmid = _clean_value(row.get("PMID", ""))
        url = _clean_value(row.get("URL", ""))
        verdict = _clean_value(row.get("LLM_verdict", "")).lower()
        rationale = _clean_value(row.get("LLM_rationale", ""))

        # Best-effort 4-digit year: "Year" first, otherwise the leading
        # four characters of "PublicationDate" (e.g. "2021-05-03" -> "2021").
        year = _clean_value(row.get("Year", ""))
        if not year:
            year = _clean_value(row.get("PublicationDate", ""))[:4]
        elif len(year) > 4:
            year = year[:4]

        # NOTE(review): the strict RIS specification separates tag and value
        # with two spaces before the hyphen ("TY  - "); the single-space form
        # below is preserved as-is, but confirm the target reference manager
        # accepts it -- TODO verify against the RIS spec / importer.
        lines.append("TY - JOUR")
        if title:
            lines.append(f"TI - {title}")
        for author in _authors_from_row(row):
            lines.append(f"AU - {author}")
        if journal:
            lines.append(f"JO - {journal}")
        if year:
            lines.append(f"PY - {year}")
        if abstract:
            lines.append(f"AB - {abstract}")
        if doi:
            lines.append(f"DO - {doi}")
        if pmid:
            lines.append(f"ID - PMID:{pmid}")
        if url:
            lines.append(f"UR - {url}")
        if verdict:
            lines.append(f"N1 - LLM verdict: {verdict}")
        if rationale:
            lines.append(f"N1 - LLM rationale: {rationale}")
        lines.append("ER -")
        lines.append("")  # blank separator line between records

    output_ris_path.write_text("\n".join(lines), encoding="utf-8")
    return int(len(selected))
|
| 229 |
+
|
| 230 |
+
|
| 231 |
def _auth_credentials() -> tuple[str, str]:
|
| 232 |
username = (os.getenv(APP_USERNAME_ENV) or "").strip()
|
| 233 |
password = (os.getenv(APP_PASSWORD_ENV) or "").strip()
|
|
|
|
| 431 |
return
|
| 432 |
|
| 433 |
progress(1, desc="Screening complete.")
|
| 434 |
+
screened_ris_output = parsed_path.parent / f"screened_included_unclear_{_timestamp_slug()}.ris"
|
| 435 |
+
ris_count = 0
|
| 436 |
+
ris_error = None
|
| 437 |
+
try:
|
| 438 |
+
ris_count = _write_included_unclear_ris(screened_output, screened_ris_output)
|
| 439 |
+
except Exception as exc:
|
| 440 |
+
ris_error = str(exc)
|
| 441 |
+
|
| 442 |
try:
|
| 443 |
verdict_counts = _screening_verdict_counts(screened_output)
|
| 444 |
completed_status = (
|
| 445 |
"Screening complete: "
|
| 446 |
f"Included {verdict_counts['include']} | "
|
| 447 |
f"Excluded {verdict_counts['exclude']} | "
|
| 448 |
+
f"Unclear {verdict_counts['unclear']} | "
|
| 449 |
+
f"RIS references {ris_count}"
|
| 450 |
)
|
| 451 |
+
if ris_error:
|
| 452 |
+
completed_status += f" | RIS export failed: {ris_error}"
|
| 453 |
except Exception:
|
| 454 |
completed_status = "Screening complete."
|
| 455 |
+
if ris_error:
|
| 456 |
+
completed_status += f" RIS export failed: {ris_error}"
|
| 457 |
|
| 458 |
+
downloads = [_download_markdown(screened_output, "Download Screened Excel")]
|
| 459 |
+
if screened_ris_output.exists():
|
| 460 |
+
downloads.append(
|
| 461 |
+
_download_markdown(screened_ris_output, "Download Included + Unclear RIS")
|
| 462 |
+
)
|
| 463 |
yield (
|
| 464 |
completed_status,
|
| 465 |
+
" | ".join(downloads),
|
| 466 |
)
|
| 467 |
|
| 468 |
|