"""
Internal Medicine Discharge Letter Error-Check — Streamlit App
Prospective study: AI-assisted error detection in ED discharge letters
"""
import html
import json
import logging
import os
import tempfile
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path

import streamlit as st

from backend import ModelResult, translate_to_english, call_model_a, call_model_b
# Local JSON file holding the collected feedback entries, next to this script.
FEEDBACK_FILE = Path(__file__).parent / "feedback_data.json"
# Hugging Face dataset repo and file name the feedback list is synced with.
HF_DATASET_REPO = "Vrda/im-error-check-data"
HF_DATASET_FILE = "feedback_data.json"
@st.cache_resource
def get_deepseek_job_manager():
    """Return the shared registry of background DeepSeek jobs.

    The result is cached with ``st.cache_resource`` so that every call, on
    every script rerun, gets the same object. Without the cache each call
    would build a fresh executor and an empty ``jobs`` dict, and a job stored
    by one call could never be found by the next.

    Returns:
        A dict with three keys:
        ``"executor"`` - a ``ThreadPoolExecutor`` with two workers,
        ``"jobs"`` - a dict mapping job id to its job record,
        ``"lock"`` - a ``threading.Lock`` guarding ``"jobs"``.
    """
    return {
        "executor": ThreadPoolExecutor(max_workers=2),
        "jobs": {},
        "lock": threading.Lock(),
    }
def cleanup_deepseek_jobs(max_age_seconds: int = 1800):
    """Forget registered DeepSeek jobs older than ``max_age_seconds``.

    Only the registry entry is removed; the underlying future is left alone.
    """
    registry = get_deepseek_job_manager()
    current = time.time()
    with registry["lock"]:
        jobs = registry["jobs"]
        # Collect first, then delete: the dict must not change while iterated.
        expired = [
            jid
            for jid, record in jobs.items()
            if current - record["created_at"] > max_age_seconds
        ]
        for jid in expired:
            jobs.pop(jid, None)
def submit_deepseek_job(job_id: str, english_text: str):
    """Run ``call_model_a(english_text)`` in the background under ``job_id``.

    The future and its creation time are stored in the shared job registry.
    """
    registry = get_deepseek_job_manager()
    pending = registry["executor"].submit(call_model_a, english_text)
    with registry["lock"]:
        record = {"future": pending, "created_at": time.time()}
        registry["jobs"][job_id] = record
def get_deepseek_job_info(job_id: str):
    """Describe a registered job without consuming it.

    Returns:
        ``None`` when ``job_id`` is empty or not registered, otherwise a dict
        with ``"created_at"`` (registration time) and ``"done"`` (whether the
        future has finished).
    """
    if job_id:
        registry = get_deepseek_job_manager()
        with registry["lock"]:
            record = registry["jobs"].get(job_id)
        if record:
            return {
                "created_at": record["created_at"],
                "done": record["future"].done(),
            }
    return None
def consume_deepseek_job_result(job_id: str) -> ModelResult | None:
    """Return the result of a finished job and remove it from the registry.

    Returns:
        ``None`` when ``job_id`` is empty, unknown, or still running.
        Otherwise the job's result; if the job raised, a failed
        ``ModelResult`` carrying the exception text is returned instead.
    """
    if not job_id:
        return None
    registry = get_deepseek_job_manager()
    with registry["lock"]:
        record = registry["jobs"].get(job_id)
    if not record:
        return None
    pending = record["future"]
    if not pending.done():
        return None
    try:
        outcome = pending.result()
    except Exception as exc:
        # Turn a crashed background call into a regular failed result.
        outcome = ModelResult(
            model_name="DeepSeek Reasoner",
            raw_response="",
            success=False,
            error_message=f"Background job failed: {exc}",
            latency_seconds=0.0,
        )
    with registry["lock"]:
        registry["jobs"].pop(job_id, None)
    return outcome
# -------------------------------------------------------------------------
# Feedback persistence (local + HF Hub sync)
# -------------------------------------------------------------------------
def _sync_from_hub() -> list:
    """Pull the latest feedback_data.json from HF Hub if it exists.

    This is best effort and never raises.

    Returns:
        The list stored in the hub file. An empty list is returned when no
        token is configured (``HF_TOKEN`` / ``HUGGINGFACE_TOKEN``), when the
        download or JSON parsing fails, or when the file does not contain a
        JSON list.
    """
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    if not token:
        return []
    logger = logging.getLogger(__name__)
    try:
        from huggingface_hub import HfApi

        api = HfApi(token=token)
        local_path = api.hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename=HF_DATASET_FILE,
            repo_type="dataset",
        )
        with open(local_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as exc:
        # Keep the best-effort contract, but leave a trace of the failure.
        logger.warning("Could not load feedback data from the hub: %s", exc)
        return []
    if not isinstance(data, list):
        # Callers take len() of and append to the result, so it must be a list.
        logger.warning(
            "Hub feedback file holds %s instead of a list; ignoring it",
            type(data).__name__,
        )
        return []
    return data
def _sync_to_hub(data: list):
    """Push feedback_data.json to the HF dataset repo.

    Does nothing when neither ``HF_TOKEN`` nor ``HUGGINGFACE_TOKEN`` is set.
    Any failure is reported to the user as a toast instead of being raised.

    Args:
        data: The complete feedback list to upload.
    """
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    if not token:
        return
    tmp_path = None
    try:
        from huggingface_hub import HfApi

        api = HfApi(token=token)
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".json", delete=False, encoding="utf-8"
        ) as tmp:
            # Remember the path first so cleanup works even if dumping fails.
            tmp_path = tmp.name
            json.dump(data, tmp, ensure_ascii=False, indent=2)
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo=HF_DATASET_FILE,
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            commit_message=f"feedback entry #{len(data)}",
        )
    except Exception as e:
        st.toast(f"Hub sync warning: {e}", icon="\u26A0\uFE0F")
    finally:
        # The file was created with delete=False, so remove it on every path,
        # including a failed upload.
        if tmp_path is not None:
            Path(tmp_path).unlink(missing_ok=True)
def save_feedback(entry: dict) -> int:
    """Append ``entry`` to the feedback store and return the new entry count.

    The hub copy and the local file are merged before appending. Keeping only
    the longer of the two would drop entries that exist solely in the shorter
    one, for example entries saved locally while the hub upload was failing.
    The merged list is written to ``FEEDBACK_FILE`` and pushed to the hub.

    Args:
        entry: JSON-serializable feedback record.

    Returns:
        Number of entries stored after the append.
    """
    hub_data = _sync_from_hub()
    if FEEDBACK_FILE.exists():
        with open(FEEDBACK_FILE, "r", encoding="utf-8") as f:
            local_data = json.load(f)
    else:
        local_data = []

    def fingerprint(item) -> str:
        # Canonical JSON text identifies an entry independent of key order.
        return json.dumps(item, sort_keys=True, ensure_ascii=False)

    # Start from the hub copy, then add local entries the hub does not have.
    data = list(hub_data)
    on_hub = {fingerprint(item) for item in data}
    data.extend(item for item in local_data if fingerprint(item) not in on_hub)

    data.append(entry)
    with open(FEEDBACK_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    _sync_to_hub(data)
    return len(data)
# -------------------------------------------------------------------------
# Page config & CSS
# -------------------------------------------------------------------------
# Page title, icon and wide layout for the whole app.
st.set_page_config(
    page_title="IM Error-Check",
    page_icon="\U0001FA7A",
    layout="wide",
)
# Inject the stylesheet for the HTML snippets rendered further down:
# error/suggestion cards, the two model headers, severity colours and the
# category badge.
st.markdown("""
<style>
.error-card {
background: #2d1f1f; border-left: 4px solid #e53e3e;
border-radius: 8px; padding: 0.8rem 1rem; margin: 0.5rem 0;
color: #fde8e8;
}
.error-card strong { color: #feb2b2; }
.error-card em { color: #fbd38d; }
.suggestion-card {
background: #1a2e1a; border-left: 4px solid #38a169;
border-radius: 8px; padding: 0.8rem 1rem; margin: 0.5rem 0;
color: #c6f6d5;
}
.suggestion-card strong { color: #9ae6b4; }
.model-header-a {
background: #1a2332; border-left: 4px solid #63b3ed;
border-radius: 8px; padding: 0.6rem 1rem; margin-bottom: 0.5rem;
}
.model-header-b {
background: #2d1f3d; border-left: 4px solid #b794f4;
border-radius: 8px; padding: 0.6rem 1rem; margin-bottom: 0.5rem;
}
.severity-critical { color: #fc8181; font-weight: bold; }
.severity-high { color: #f6ad55; font-weight: bold; }
.severity-medium { color: #f6e05e; }
.severity-low { color: #68d391; }
.category-badge {
display: inline-block; background: #2d3748; color: #e2e8f0;
padding: 2px 8px; border-radius: 12px; font-size: 0.8em; margin-right: 4px;
}
</style>
""", unsafe_allow_html=True)
# Example discharge letter (Croatian) that `load_sample` copies into the
# input box.
SAMPLE = """Adresa: VUKOVARSKA 45, SPLIT
Datum dolaska: 10.03.2026. 14:22
Datum rođenja: 15.05.1958.
Datum otpusta: 10.03.2026. 18:45
Trijažna kategorija: 3
Dijagnoze
I21.0 Akutni transmuralni infarkt miokarda prednje stijenke
Podaci s trijaže
Trijaž.kat:3; Puls:92/min; RR:155/95 mmHg; SpO2:94%; Tax: 36.8C; GCS:15;
Razlog dolaska
Bolovi u prsištu od jutros, stezajućeg karaktera s propagacijom u lijevu ruku. Trajanje > 30 min. Uzeo 2x NTG sprej bez učinka.
Anamneza
Osobna: arterijska hipertenzija, DM tip 2, dislipidemija. Terapija: Ramipril 5mg, Metformin 1000mg 2x1, Atorvastatin 20mg.
Status
Pri svijesti, blijed, znojav. Auskultatorno: srčana akcija ritmična, tonovi tiši, bez šumova. Pluća: bazalno obostrano oslabljen šum disanja.
Laboratorij
Troponin I: 2.8 ng/mL (ref <0.04), CK-MB: 45 U/L, L: 12.3, CRP: 8.5
Na: 138, K: 4.2, Kreatinin: 128 umol/L (eGFR 52), GUK: 14.2 mmol/L
EKG: ST elevacija V1-V4, recipročne promjene II, III, aVF
Terapija
Aspirin 300mg stat, zatim 100mg 1x1
Klopidogrel 300mg stat, zatim 75mg 1x1
Heparin 5000 IU i.v. bolus
Morphin 4mg i.v.
Metformin 1000mg nastaviti 2x1
Atorvastatin 40mg 1x1
Zaključak
Pacijent s akutnim STEMI prednje stijenke. Transportiran u Kath lab.
Preporučen kontrolni pregled za 14 dana."""
# -------------------------------------------------------------------------
# Session state
# -------------------------------------------------------------------------
# Seed every session-state slot the app reads, without touching values that
# already exist from an earlier run of the script.
session_defaults = {
    "input_text": "",
    "translated_text": None,
    "model_a_result": None,
    "model_b_result": None,
    "translation_latency": 0,
    "total_elapsed": 0,
    "analysis_started_at": 0.0,
    "deepseek_job_id": None,
    "run_analysis": False,
    "physician_id": "",
}
for state_key, initial in session_defaults.items():
    if state_key not in st.session_state:
        st.session_state[state_key] = initial

# Random per-session identifier, later used as the prefix of job ids.
if "session_key" not in st.session_state:
    from uuid import uuid4

    st.session_state.session_key = str(uuid4())

# Drop stale background jobs on every script run.
cleanup_deepseek_jobs()
def poll_deepseek_job():
    """Move a finished DeepSeek result from the job registry into session state.

    Does nothing when no job is pending, when a DeepSeek result is already
    stored, or when the job has not produced a result yet. On success it also
    records the total time elapsed since the analysis started.
    """
    state = st.session_state
    pending_id = state.deepseek_job_id
    if pending_id and state.model_a_result is None:
        finished = consume_deepseek_job_result(pending_id)
        if finished is not None:
            state.model_a_result = finished
            state.deepseek_job_id = None
            state.total_elapsed = round(
                time.time() - state.analysis_started_at, 2
            )
def load_sample():
    """Button callback: put the bundled sample letter into the input box."""
    st.session_state["input_text"] = SAMPLE
def trigger_analysis():
    """Button callback: flag that an analysis should start on this run."""
    st.session_state["run_analysis"] = True
# -------------------------------------------------------------------------
# Header
# -------------------------------------------------------------------------
# Page title, tagline and the research-use disclaimer shown on every run.
st.title("\U0001FA7A Internal Medicine — Discharge Letter Error-Check")
st.markdown("*AI-assisted error detection for Internal Medicine Emergency Department*")
st.warning(
    "\u26A0\uFE0F **RESEARCH TOOL**: AI-generated findings require physician verification. "
    "Do not use as sole basis for clinical decisions."
)
# Sidebar: short description, workflow steps, physician ID and case counter.
with st.sidebar:
    st.header("About")
    st.markdown(
        "Compares **DeepSeek Reasoner** and **GPT-OSS-120B** for detecting errors "
        "in discharge letters."
    )
    st.markdown("---")
    st.markdown("**Steps:** Paste letter \u2192 Analyze \u2192 Review \u2192 Rate")
    st.markdown("---")
    # Bound to session state; required before feedback can be submitted.
    st.text_input(
        "Physician ID (anonymous):",
        placeholder="e.g. Physician A",
        key="physician_id",
    )
    # Show how many feedback entries the local file currently holds.
    if FEEDBACK_FILE.exists():
        with open(FEEDBACK_FILE, "r", encoding="utf-8") as f:
            count = len(json.load(f))
        st.metric("Cases collected", count)
# -------------------------------------------------------------------------
# Input
# -------------------------------------------------------------------------
st.header("Discharge Letter Input")
# Callback fills the text area below with the bundled sample letter.
st.button("Load Sample Case", on_click=load_sample)
# The letter text lives in st.session_state.input_text via the widget key.
st.text_area(
    "Paste discharge letter (Croatian):",
    height=220,
    placeholder="Zalijepite otpusno pismo ovdje...",
    key="input_text",
)
# Callback only sets the run_analysis flag; the work happens further down.
st.button("Analyze", type="primary", on_click=trigger_analysis)
# -------------------------------------------------------------------------
# Run analysis (progressive: show GPT-OSS first, DeepSeek when ready)
# -------------------------------------------------------------------------
# One-shot trigger set by the Analyze button callback.
if st.session_state.run_analysis:
    # Clear the flag even when there is nothing to analyze. Otherwise a click
    # on an empty box stays armed and starts the analysis on a later rerun,
    # as soon as text appears in the input.
    st.session_state.run_analysis = False
    if not st.session_state.input_text.strip():
        st.warning("Please paste a discharge letter before clicking Analyze.")
    else:
        # Reset everything left over from a previous analysis.
        st.session_state.model_a_result = None
        st.session_state.model_b_result = None
        st.session_state.total_elapsed = 0
        st.session_state.analysis_started_at = time.time()
        st.session_state.deepseek_job_id = None
        with st.spinner("Translating discharge letter..."):
            t0 = time.time()
            st.session_state.translated_text = translate_to_english(
                st.session_state.input_text
            )
            st.session_state.translation_latency = round(time.time() - t0, 2)
        english = st.session_state.translated_text
        # DeepSeek runs in the background; the id combines the session key
        # with a millisecond timestamp.
        job_id = f"{st.session_state.session_key}:{int(time.time() * 1000)}"
        submit_deepseek_job(job_id, english)
        st.session_state.deepseek_job_id = job_id
        # GPT-OSS is called inline so its result can be shown first.
        with st.spinner("GPT-OSS-120B responding (~5s)..."):
            st.session_state.model_b_result = call_model_b(english)
        st.rerun()
# Pick up a finished DeepSeek result, if any, before rendering.
poll_deepseek_job()
# -------------------------------------------------------------------------
# Helper: render a model's output
# -------------------------------------------------------------------------
# Display text (coloured circle emoji + name) for each severity value.
# Values missing from the dict are shown unchanged by the callers.
SEVERITY_LABELS = {
    "critical": "\U0001F534 Critical",
    "high": "\U0001F7E0 High",
    "medium": "\U0001F7E1 Medium",
    "low": "\U0001F7E2 Low",
}
# Display text for the category identifiers used on errors and suggestions.
CATEGORY_LABELS = {
    "medication_error": "Medication",
    "diagnostic_error": "Diagnostic",
    "dosing_error": "Dosing",
    "documentation_error": "Documentation",
    "lab_interpretation_error": "Lab Interpretation",
    "contraindication": "Contraindication",
    "omission": "Omission",
    "other": "Other",
    "documentation_quality": "Documentation Quality",
    "clinical_workflow": "Clinical Workflow",
    "patient_safety": "Patient Safety",
    "completeness": "Completeness",
}
def render_model_output(result, header_class: str):
    """Render one model's summary, error cards and suggestion cards.

    All model-supplied text placed inside the HTML cards is escaped first.
    The cards are rendered with ``unsafe_allow_html=True``, so unescaped text
    could inject markup or break the layout (for example a ``<`` in a lab
    reference range).

    Args:
        result: Model result exposing ``success``, ``error_message``,
            ``latency_seconds``, ``summary``, ``errors`` and ``suggestions``.
        header_class: CSS class of the model header. Not used by this
            function; kept so existing callers keep working.
    """
    if not result.success:
        st.error(f"Model error: {result.error_message}")
        return
    st.caption(f"Response time: {result.latency_seconds}s")
    if result.summary:
        st.markdown(f"**Summary:** {result.summary}")

    def esc(value) -> str:
        # str() first: unknown severities/categories may not be strings.
        return html.escape(str(value))

    # Errors
    if result.errors:
        for i, err in enumerate(result.errors, 1):
            sev = esc(SEVERITY_LABELS.get(err.severity, err.severity))
            cat = esc(CATEGORY_LABELS.get(err.category, err.category))
            # Built outside the card f-string: a backslash inside an f-string
            # expression is a syntax error before Python 3.12.
            quote_html = (
                f'<br><em>Quote: "{esc(err.quote)}"</em>' if err.quote else ""
            )
            st.markdown(
                '<div class="error-card">'
                f"<strong>Error {i}</strong> — {sev} "
                f'<span class="category-badge">{cat}</span><br>'
                f"{esc(err.description)}"
                f"{quote_html}"
                "</div>",
                unsafe_allow_html=True,
            )
    else:
        st.info("No errors identified.")

    # Suggestions
    if result.suggestions:
        for i, sug in enumerate(result.suggestions, 1):
            cat = esc(CATEGORY_LABELS.get(sug.category, sug.category))
            details = ""
            quote = getattr(sug, "quote", "")
            if quote:
                details += f'<br><em>Original: "{esc(quote)}"</em>'
            rewrite = getattr(sug, "suggested_rewrite", "")
            if rewrite:
                details += (
                    "<br><strong>Suggested rewrite:</strong> "
                    f"{esc(rewrite)}"
                )
            st.markdown(
                '<div class="suggestion-card">'
                f"<strong>Suggestion {i}</strong> "
                f'<span class="category-badge">{cat}</span><br>'
                f"{esc(sug.description)}"
                f"{details}"
                "</div>",
                unsafe_allow_html=True,
            )
# -------------------------------------------------------------------------
# Display results (progressive: GPT-OSS first, DeepSeek when ready)
# -------------------------------------------------------------------------
# True once the GPT-OSS result (model B) is stored in session state.
has_any_result = st.session_state.model_b_result is not None
# True only when the DeepSeek result (model A) is stored as well.
both_ready = has_any_result and st.session_state.model_a_result is not None
if has_any_result:
    st.markdown("---")
    st.header("Analysis Results")
    # Status banner: timing summary when both results are in, otherwise a
    # note that DeepSeek is still pending.
    if both_ready:
        st.success(
            f"Both models complete (total: {st.session_state.total_elapsed}s | "
            f"translation: {st.session_state.translation_latency}s | "
            f"DeepSeek: {st.session_state.model_a_result.latency_seconds}s | "
            f"GPT-OSS: {st.session_state.model_b_result.latency_seconds}s)"
        )
    else:
        st.info(
            f"GPT-OSS-120B ready ({st.session_state.model_b_result.latency_seconds}s). "
            "DeepSeek Reasoner is still thinking — review and rate GPT-OSS results below. "
            "Click `Refresh DeepSeek status` below when you want to check if it has finished."
        )
    with st.expander("English Translation"):
        st.markdown(st.session_state.translated_text)
    st.subheader("Model Comparison")
    # col_a is the first column (DeepSeek), col_b the second (GPT-OSS).
    col_a, col_b = st.columns(2, gap="large")
    with col_b:
        st.markdown(
            '<div class="model-header-b"><h4 style="color:#805ad5; margin:0">'
            "GPT-OSS-120B</h4></div>",
            unsafe_allow_html=True,
        )
        render_model_output(st.session_state.model_b_result, "model-header-b")
    with col_a:
        st.markdown(
            '<div class="model-header-a"><h4 style="color:#3182ce; margin:0">'
            "DeepSeek Reasoner</h4></div>",
            unsafe_allow_html=True,
        )
        if st.session_state.model_a_result is not None:
            render_model_output(st.session_state.model_a_result, "model-header-a")
        else:
            # No DeepSeek result yet: look up the background job.
            job_info = get_deepseek_job_info(st.session_state.deepseek_job_id)
            if job_info is None:
                # The job id is missing or no longer registered.
                st.warning(
                    "DeepSeek job is no longer active. Click `Analyze` to run it again."
                )
            else:
                # Placeholder card with the seconds since the job was created.
                elapsed = round(time.time() - job_info["created_at"])
                st.markdown(
                    '<div style="background:#1e293b; border:2px dashed #475569; '
                    'border-radius:8px; padding:2rem; text-align:center; color:#e2e8f0;">'
                    f"<strong>DeepSeek Reasoner</strong> is still processing... ({elapsed}s)<br>"
                    "This typically takes 60-90 seconds.<br>"
                    "Review and rate GPT-OSS results below while you wait."
                    "</div>",
                    unsafe_allow_html=True,
                )
                # A rerun executes poll_deepseek_job() again earlier in the script.
                if st.button("Refresh DeepSeek status", key="refresh_deepseek_status"):
                    st.rerun()
    # -----------------------------------------------------------------
    # Feedback
    # -----------------------------------------------------------------
    st.markdown("---")
    st.subheader("Physician Feedback (Research)")
    st.markdown(
        "*Rate each model's output. Your feedback is essential for evaluating "
        "AI error-detection performance.*"
    )
    # Choices offered by the rating widgets below.
    VALIDITY_OPTIONS = ["Valid", "Partially Valid", "Invalid"]
    RATING_OPTIONS = ["1 - Poor", "2 - Fair", "3 - Good", "4 - Very Good", "5 - Excellent"]
    SAFETY_OPTIONS = [
        "1 - No concern",
        "2 - Mild concern",
        "3 - Moderate concern",
        "4 - Serious concern",
        "5 - Critical risk",
    ]
    # Ratings collected on this run, keyed by "model_a" / "model_b".
    feedback_data = {}
    # GPT-OSS is always rateable here; DeepSeek is added in front once ready.
    available_models = [("model_b", "GPT-OSS-120B", st.session_state.model_b_result)]
    if st.session_state.model_a_result is not None:
        available_models.insert(0, ("model_a", "DeepSeek Reasoner", st.session_state.model_a_result))
    for model_key, model_label, res in available_models:
        st.markdown(f"#### {model_label}")
        error_ratings = []
        if res.success and res.errors:
            st.markdown("**Rate each error:**")
            for i, err in enumerate(res.errors):
                c1, c2 = st.columns([3, 1])
                with c1:
                    # Description preview, cut to 120 characters.
                    st.markdown(
                        f"*Error {i+1}:* {err.description[:120]}{'...' if len(err.description) > 120 else ''}"
                    )
                with c2:
                    # Widget keys combine model and error index so they stay unique.
                    validity = st.selectbox(
                        f"Validity",
                        VALIDITY_OPTIONS,
                        key=f"{model_key}_err_{i}_validity",
                        label_visibility="collapsed",
                    )
                    cat_correct = st.checkbox(
                        f"Category correct ({CATEGORY_LABELS.get(err.category, err.category)})?",
                        value=True,
                        key=f"{model_key}_err_{i}_cat",
                    )
                error_ratings.append({
                    "error_text": err.description,
                    "model_category": err.category,
                    "model_severity": err.severity,
                    # Stored as e.g. "partially_valid".
                    "validity": validity.lower().replace(" ", "_"),
                    "category_correct": cat_correct,
                })
        elif res.success:
            st.info("Model found no errors — rate the overall output below.")
        suggestions_useful = st.select_slider(
            f"**Suggestions usefulness:**",
            options=RATING_OPTIONS,
            value="3 - Good",
            key=f"{model_key}_sug_useful",
        )
        overall_usefulness = st.select_slider(
            f"**Overall usefulness:**",
            options=RATING_OPTIONS,
            value="3 - Good",
            key=f"{model_key}_overall",
        )
        safety_severity = st.select_slider(
            f"**Safety concern severity** (1=no concern, 5=critical risk):",
            options=SAFETY_OPTIONS,
            value="1 - No concern",
            key=f"{model_key}_safety",
        )
        st.caption(
            "Use this as a risk scale: 1 means no safety concern, 5 means critical patient risk."
        )
        # Leading number of the chosen option, e.g. "3 - Moderate concern" -> 3.
        safety_score = int(safety_severity.split(" - ", 1)[0])
        feedback_data[model_key] = {
            "errors": error_ratings,
            "suggestions_useful": suggestions_useful,
            "overall_usefulness": overall_usefulness,
            "safety_concern_severity": safety_severity,
            "safety_concern_score": safety_score,
        }
        st.markdown("---")
    if not both_ready:
        st.warning(
            "DeepSeek Reasoner has not finished yet. You can submit partial feedback now "
            "(GPT-OSS only), or refresh once later to load the DeepSeek result."
        )
    # Missed errors
    st.markdown("#### Missed Errors")
    missed_errors = st.text_area(
        "Did either model miss errors that should have been found? Describe them here:",
        placeholder="e.g. Both models missed that Metformin is contraindicated with eGFR < 30...",
        key="missed_errors",
        height=80,
    )
    # General comments
    comments = st.text_area(
        "Additional comments (optional):",
        placeholder="Any other observations about the models' performance?",
        key="fb_comments",
        height=80,
    )
    if st.button("Submit Feedback", type="secondary"):
        # A physician ID (entered in the sidebar) is required to submit.
        if not st.session_state.physician_id.strip():
            st.warning("Please enter a Physician ID in the sidebar before submitting.")
        else:
            model_a_res = st.session_state.model_a_result
            model_b_res = st.session_state.model_b_result
            # One record per submission; model A fields are empty/None when
            # DeepSeek has not finished.
            entry = {
                "timestamp": datetime.now().isoformat(),
                "physician_id": st.session_state.physician_id.strip(),
                "clinical_input": st.session_state.input_text,
                "translation": st.session_state.translated_text,
                "model_a_output": model_a_res.raw_response if model_a_res else "",
                "model_b_output": model_b_res.raw_response if model_b_res else "",
                "model_a_latency": model_a_res.latency_seconds if model_a_res else None,
                "model_b_latency": model_b_res.latency_seconds if model_b_res else None,
                "translation_latency": st.session_state.translation_latency,
                "total_latency": st.session_state.total_elapsed,
                "both_models_complete": both_ready,
                "ratings": feedback_data,
                "missed_errors": missed_errors,
                "comments": comments,
            }
            count = save_feedback(entry)
            st.success(f"Feedback saved! (Total entries: {count})")
            st.balloons()
# Footer shown on every run, whether or not results are present.
st.markdown("---")
st.caption(
    "Internal Medicine Error-Check | Prospective Research Study 2026 | "
    "Requires physician verification"
)