diogo.rodrigues.silva committed on
Commit
b156781
·
1 Parent(s): e52b6da

Adding .ris download functionality

Browse files
Files changed (1) hide show
  1. app.py +93 -3
app.py CHANGED
@@ -12,6 +12,7 @@ import threading
12
  import time
13
  import hmac
14
  import hashlib
 
15
  from datetime import datetime
16
  from pathlib import Path
17
  from tempfile import mkdtemp
@@ -32,7 +33,7 @@ ALLOWED_CRITERIA_SUFFIXES = {".yml", ".yaml", ".json"}
32
  APP_ROOT_DIR = Path(__file__).resolve().parent
33
  APP_TMP_DIR = Path(os.getenv("APP_TMP_DIR", str(APP_ROOT_DIR / "artifacts"))).resolve()
34
  APP_RUNS_DIR = APP_TMP_DIR / "runs"
35
- APP_TMP_FILE_EXTENSIONS = {".xlsx", ".json"}
36
  MAX_UPLOAD_FILES = int(os.getenv("MAX_UPLOAD_FILES", "20"))
37
  MAX_UPLOAD_FILE_MB = int(os.getenv("MAX_UPLOAD_FILE_MB", "25"))
38
  MAX_CRITERIA_FILE_MB = int(os.getenv("MAX_CRITERIA_FILE_MB", "2"))
@@ -156,6 +157,77 @@ def _screening_verdict_counts(screened_excel_path: Path) -> dict[str, int]:
156
  }
157
 
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  def _auth_credentials() -> tuple[str, str]:
160
  username = (os.getenv(APP_USERNAME_ENV) or "").strip()
161
  password = (os.getenv(APP_PASSWORD_ENV) or "").strip()
@@ -359,20 +431,38 @@ def screen_excel(
359
  return
360
 
361
  progress(1, desc="Screening complete.")
 
 
 
 
 
 
 
 
362
  try:
363
  verdict_counts = _screening_verdict_counts(screened_output)
364
  completed_status = (
365
  "Screening complete: "
366
  f"Included {verdict_counts['include']} | "
367
  f"Excluded {verdict_counts['exclude']} | "
368
- f"Unclear {verdict_counts['unclear']}"
 
369
  )
 
 
370
  except Exception:
371
  completed_status = "Screening complete."
 
 
372
 
 
 
 
 
 
373
  yield (
374
  completed_status,
375
- _download_markdown(screened_output, "Download Screened Excel"),
376
  )
377
 
378
 
 
12
  import time
13
  import hmac
14
  import hashlib
15
+ import re
16
  from datetime import datetime
17
  from pathlib import Path
18
  from tempfile import mkdtemp
 
33
  APP_ROOT_DIR = Path(__file__).resolve().parent
34
  APP_TMP_DIR = Path(os.getenv("APP_TMP_DIR", str(APP_ROOT_DIR / "artifacts"))).resolve()
35
  APP_RUNS_DIR = APP_TMP_DIR / "runs"
36
+ APP_TMP_FILE_EXTENSIONS = {".xlsx", ".json", ".ris"}
37
  MAX_UPLOAD_FILES = int(os.getenv("MAX_UPLOAD_FILES", "20"))
38
  MAX_UPLOAD_FILE_MB = int(os.getenv("MAX_UPLOAD_FILE_MB", "25"))
39
  MAX_CRITERIA_FILE_MB = int(os.getenv("MAX_CRITERIA_FILE_MB", "2"))
 
157
  }
158
 
159
 
160
# Pre-compiled patterns shared by the RIS export helpers below.
_RIS_WHITESPACE_RE = re.compile(r"\s+")
_RIS_AUTHOR_SEPARATOR_RE = re.compile(r"[;\n]")

# Verdicts (lower-cased, stripped) whose rows are exported to the RIS file.
_RIS_KEPT_VERDICTS = ("include", "unclear")

# Columns tried, in order, when looking for a row's author list.
_RIS_AUTHOR_COLUMNS = ("Authors", "FullAuthors")


def _ris_clean_value(value) -> str:
    """Return a spreadsheet cell as single-line text ("" for missing cells)."""
    if pd.isna(value):
        return ""
    # Numeric columns that contain blanks are read back as floats, so an
    # identifier such as 12345 arrives as 12345.0; drop the spurious ".0".
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return _RIS_WHITESPACE_RE.sub(" ", str(value).strip())


def _ris_authors(row) -> list[str]:
    """Return the author names of *row*, taken from the first usable column."""
    for column in _RIS_AUTHOR_COLUMNS:
        raw = row.get(column)
        # NaN is truthy, so an ``a or b`` fallback would stop at an empty
        # "Authors" cell; test for missing values explicitly instead.
        if pd.isna(raw):
            continue
        # Split BEFORE collapsing whitespace, otherwise newline-separated
        # authors would be merged into a single name.
        pieces = _RIS_AUTHOR_SEPARATOR_RE.split(str(raw))
        names = [name for name in map(_ris_clean_value, pieces) if name]
        if names:
            return names
    return []


def _ris_export_lines(df) -> tuple[list[str], int]:
    """Build the RIS lines for the include/unclear rows of *df*.

    Returns the output lines and the number of exported references.
    Raises ``KeyError`` when *df* has no ``LLM_verdict`` column.
    """
    if "LLM_verdict" not in df.columns:
        raise KeyError("Expected 'LLM_verdict' column was not found in screening output.")

    verdicts = df["LLM_verdict"].astype(str).str.strip().str.lower()
    selected = df[verdicts.isin(_RIS_KEPT_VERDICTS)]

    lines: list[str] = []
    for _, row in selected.iterrows():
        # Prefer the explicit year; otherwise use the leading four characters
        # of the publication date. Either way keep at most four characters.
        year = _ris_clean_value(row.get("Year", ""))
        if not year:
            year = _ris_clean_value(row.get("PublicationDate", ""))
        year = year[:4]

        pmid = _ris_clean_value(row.get("PMID", ""))
        verdict = _ris_clean_value(row.get("LLM_verdict", "")).lower()
        rationale = _ris_clean_value(row.get("LLM_rationale", ""))

        # (tag, value) pairs in output order; empty values are skipped.
        fields: list[tuple[str, str]] = [
            ("TI", _ris_clean_value(row.get("Title", ""))),
            *(("AU", author) for author in _ris_authors(row)),
            ("JO", _ris_clean_value(row.get("Journal", ""))),
            ("PY", year),
            ("AB", _ris_clean_value(row.get("Abstract", ""))),
            ("DO", _ris_clean_value(row.get("DOI", ""))),
            ("ID", f"PMID:{pmid}" if pmid else ""),
            ("UR", _ris_clean_value(row.get("URL", ""))),
            ("N1", f"LLM verdict: {verdict}" if verdict else ""),
            ("N1", f"LLM rationale: {rationale}" if rationale else ""),
        ]

        # RIS tag layout: two-letter tag, two spaces, hyphen, space, value.
        lines.append("TY  - JOUR")
        lines.extend(f"{tag}  - {value}" for tag, value in fields if value)
        lines.append("ER  - ")
        lines.append("")  # blank line between records

    return lines, int(len(selected))


def _write_included_unclear_ris(screened_excel_path: Path, output_ris_path: Path) -> int:
    """Write a RIS file containing only include/unclear references.

    Args:
        screened_excel_path: Screened workbook; it must contain an
            ``LLM_verdict`` column. Other columns (``Title``, ``Authors`` /
            ``FullAuthors``, ``Journal``, ``Year`` / ``PublicationDate``,
            ``Abstract``, ``DOI``, ``PMID``, ``URL``, ``LLM_rationale``) are
            used when present and non-empty.
        output_ris_path: Destination file, written as UTF-8 (overwritten if
            it already exists).

    Returns:
        The number of references written.

    Raises:
        KeyError: If the workbook has no ``LLM_verdict`` column.
    """
    df = pd.read_excel(screened_excel_path, engine="openpyxl")
    lines, count = _ris_export_lines(df)
    output_ris_path.write_text("\n".join(lines), encoding="utf-8")
    return count
229
+
230
+
231
  def _auth_credentials() -> tuple[str, str]:
232
  username = (os.getenv(APP_USERNAME_ENV) or "").strip()
233
  password = (os.getenv(APP_PASSWORD_ENV) or "").strip()
 
431
  return
432
 
433
  progress(1, desc="Screening complete.")
434
+ screened_ris_output = parsed_path.parent / f"screened_included_unclear_{_timestamp_slug()}.ris"
435
+ ris_count = 0
436
+ ris_error = None
437
+ try:
438
+ ris_count = _write_included_unclear_ris(screened_output, screened_ris_output)
439
+ except Exception as exc:
440
+ ris_error = str(exc)
441
+
442
  try:
443
  verdict_counts = _screening_verdict_counts(screened_output)
444
  completed_status = (
445
  "Screening complete: "
446
  f"Included {verdict_counts['include']} | "
447
  f"Excluded {verdict_counts['exclude']} | "
448
+ f"Unclear {verdict_counts['unclear']} | "
449
+ f"RIS references {ris_count}"
450
  )
451
+ if ris_error:
452
+ completed_status += f" | RIS export failed: {ris_error}"
453
  except Exception:
454
  completed_status = "Screening complete."
455
+ if ris_error:
456
+ completed_status += f" RIS export failed: {ris_error}"
457
 
458
+ downloads = [_download_markdown(screened_output, "Download Screened Excel")]
459
+ if screened_ris_output.exists():
460
+ downloads.append(
461
+ _download_markdown(screened_ris_output, "Download Included + Unclear RIS")
462
+ )
463
  yield (
464
  completed_status,
465
+ " | ".join(downloads),
466
  )
467
 
468