MCG-Assist-Question-Answering

Sleeping

App Files Community

abjasrees committed on Sep 3, 2025

Commit

bdd2e75

verified ·

1 Parent(s): 6baa083

Update files_process.py

Browse files

Files changed (1) hide show

files_process.py +102 -85

files_process.py CHANGED Viewed

@@ -1,90 +1,107 @@
 # files_process.py
-import pathlib
-from typing import Union
-from pypdf import PdfReader
-from docx import Document
-def _read_file_by_ext(p: pathlib.Path) -> str:  # Return the text of file p, picking a reader from its lowercased suffix.
-    ext = p.suffix.lower()
-    if ext == ".txt":
-        return p.read_text(encoding="utf-8", errors="ignore")  # bytes that are not valid UTF-8 are dropped, not raised
-    if ext == ".docx":
-        doc = Document(str(p))
-        return "\n".join(paragraph.text for paragraph in doc.paragraphs)  # paragraph texts joined with newlines
-    if ext == ".pdf":
-        reader = PdfReader(str(p))
-        pages = []
-        for page in reader.pages:
-            t = page.extract_text()
-            if t:  # skip pages for which no text was extracted
-                pages.append(t)
-        return "\n".join(pages)
-    raise ValueError(f"Unsupported file extension: {ext}. Use .txt / .docx / .pdf.")  # any other suffix is rejected
-def load_input_text(input_arg: Union[str, pathlib.Path]) -> str:
     """
-    Return the text for input_arg, which may be raw text or a file path (.txt, .docx, .pdf).
-    - A pathlib.Path that exists is read by extension; otherwise its string form is returned.
-    - A str containing a newline or carriage return, or longer than 512 characters, is returned as-is.
-    - Any other str is probed as a path: an existing path is read by extension, else the str is returned.
-    - On any OSError from filesystem probing or reading (e.g., Errno 36), treat the input as raw text.
-    Raises ValueError if input_arg is None, or if an existing path has an unsupported extension.
     """
-    if input_arg is None:
-        raise ValueError("input_arg is required")
-    if isinstance(input_arg, pathlib.Path):  # explicit Path objects bypass the plain-text heuristic below
-        try:
-            if input_arg.exists():
-                return _read_file_by_ext(input_arg)
-            return str(input_arg)  # nonexistent path: hand back its string form
-        except OSError:
-            return str(input_arg)
-    s = str(input_arg)
-    if ("\n" in s) or ("\r" in s) or (len(s) > 512):  # multi-line or long input is never probed as a path
-        return s
-    p = pathlib.Path(s)
-    try:
-        if p.exists():
-            return _read_file_by_ext(p)
-        return s
-    except OSError:  # OS-level failure while probing/reading: fall back to the raw string
-        return s
-def prepare_input_arg(text_value: str | None, file_obj) -> str:
     """
-    Combine textbox text and a single uploaded file (.txt/.docx/.pdf) into one input argument.
-    - Text only: return the stripped text.
-    - File only: return the uploaded file's path as a string.
-    - Both: write the text, a blank line, and the result of load_input_text() on the upload
-      to /tmp/_concat_input.txt and return that path.
-    Compatible with Gradio/Scripts where file_obj may have a .name attribute or be a dict;
-    an object offering only .read() has its content written to /tmp/upload.bin.
-    Raises ValueError if neither text nor file is given, or if file_obj has no usable .name / "name" / .read().
     """
-    text = (text_value or "").strip()
-    if file_obj is None and not text:
-        raise ValueError("Provide either text or upload a .txt/.docx/.pdf")
-    # If only text
-    if file_obj is None:
-        return text
-    # Best-effort path extraction
-    if hasattr(file_obj, "name") and isinstance(file_obj.name, str):
-        up_path = pathlib.Path(file_obj.name)
-    elif isinstance(file_obj, dict) and "name" in file_obj:
-        up_path = pathlib.Path(file_obj["name"])
-    else:
-        # As a fallback, write bytes if available
-        data = getattr(file_obj, "read", None)
-        if callable(data):
-            content = file_obj.read()  # assumes read() yields bytes, as write_bytes() below requires - TODO confirm
-            up_path = pathlib.Path("/tmp/upload.bin")  # NOTE(review): fixed path; ".bin" is rejected by _read_file_by_ext when text is also given
-            up_path.write_bytes(content)
-        else:
-            raise ValueError("Unsupported uploaded file object; missing .name or .read()")
-    if text:
-        tmp = pathlib.Path("/tmp/_concat_input.txt")  # NOTE(review): fixed path, overwritten on every call
-        tmp.write_text(text + "\n\n", encoding="utf-8")
-        appended = load_input_text(str(up_path))
-        tmp.write_text(tmp.read_text(encoding="utf-8") + appended, encoding="utf-8")  # re-reads the file just written, then rewrites it with the upload's text appended
-        return str(tmp)
-    return str(up_path)

 # files_process.py
+import os
+from typing import List, Dict
+import gradio as gr
+import pandas as pd
+from agents import QuestionExtractionEngine, AnsweringEngine
+# ---- Gradio handlers ----
+def extract_questions_handler(guideline_pdf, surgery_type, suggestions):
     """
+    Run QuestionExtractionEngine on the uploaded guideline PDF and tabulate the questions it returns.
+    surgery_type falls back to "Procedure" and suggestions to "" when empty or None.
+    Returns (questions_list, questions_dataframe); the DataFrame has columns "#" (1-based) and "Question".
+    Raises gr.Error if no PDF is uploaded or the engine returns no questions.
     """
+    if guideline_pdf is None:
+        raise gr.Error("Please upload a surgical guideline PDF (e.g., Knee Arthroplasty guideline).")
+    pdf_path = guideline_pdf.name if hasattr(guideline_pdf, "name") else str(guideline_pdf)  # accept an object with .name or a plain path value
+    q_engine = QuestionExtractionEngine(pdf_path, surgery_type or "Procedure", user_suggestions=suggestions or "")
+    questions: List[str] = q_engine.run()
+    if not questions:
+        raise gr.Error("I couldn't extract any questions. Try adding suggestions or another guideline PDF.")
+    df = pd.DataFrame({"#": list(range(1, len(questions) + 1)), "Question": questions})  # "#" numbers the questions starting at 1
+    return questions, df
+def answer_questions_handler(patient_pdf, questions_state):
     """
+    Answer each question in questions_state against the uploaded patient chart PDF via AnsweringEngine.
+    Returns a DataFrame: Question | Answer | Rationale
+    Missing fields default to "Not Found" (answer) and "—" (rationale).
+    Raises gr.Error if no PDF is uploaded or questions_state is empty.
     """
+    if patient_pdf is None:
+        raise gr.Error("Please upload a patient chart PDF.")
+    if not questions_state:
+        raise gr.Error("No questions available. Extract questions first.")
+    pdf_path = patient_pdf.name if hasattr(patient_pdf, "name") else str(patient_pdf)  # accept an object with .name or a plain path value
+    a_engine = AnsweringEngine(pdf_path)  # one engine instance is reused for every question
+    rows = []
+    for q in questions_state:
+        res: Dict[str, Dict[str, str]] = a_engine.answer_one(q)
+        ans = res.get(q, {})  # presumably the result is keyed by the question text - verify against agents.AnsweringEngine
+        rows.append({
+            "Question": q,
+            "Answer": ans.get("answer", "Not Found"),
+            "Rationale": ans.get("rationale", "—")
+        })
+    df = pd.DataFrame(rows, columns=["Question", "Answer", "Rationale"])  # explicit columns fix the column order
+    return df
+with gr.Blocks(title="Guideline → Questions → Chart Answers", theme="soft") as demo:  # two-tab UI: extract questions, then answer them from a chart
+    gr.Markdown("# 🏥 Guideline Q&A (Yes/No) — with Iterative Feedback")
+    with gr.Tabs():
+        with gr.Tab("1) Extract Questions"):
+            with gr.Row():
+                guideline_pdf = gr.File(label="Upload Guideline PDF (e.g., Knee_Arthroplasty.pdf)", file_count="single")
+                surgery_type = gr.Textbox(label="Surgery Type", value="Knee Arthroplasty")
+            suggestions = gr.Textbox(
+                label="Suggestions (optional)",
+                placeholder="E.g., 'Focus on contraindications and pre-op lab thresholds; keep questions short.'"
+            )
+            extract_btn = gr.Button("Extract Yes/No Questions", variant="primary")
+            questions_state = gr.State([])  # list[str]; written by the extract click, read by the answer click in tab 2
+            questions_df = gr.Dataframe(headers=["#", "Question"], interactive=False, wrap=True)
+            gr.Markdown("If the questions need refinement, update 'Suggestions' and click the button again.")
+            def _extract_and_store(g_pdf, s_type, sugg):  # thin wrapper: forwards to extract_questions_handler and returns its pair unchanged
+                q_list, df = extract_questions_handler(g_pdf, s_type, sugg)
+                return q_list, df
+            extract_btn.click(
+                fn=_extract_and_store,
+                inputs=[guideline_pdf, surgery_type, suggestions],
+                outputs=[questions_state, questions_df],  # same order as the (q_list, df) return value
+            )
+        with gr.Tab("2) Answer from Patient Chart"):
+            patient_pdf = gr.File(label="Upload Patient Chart PDF (e.g., JAY_MORGAN.pdf)", file_count="single")
+            answer_btn = gr.Button("Answer All Questions", variant="primary")
+            answers_df = gr.Dataframe(headers=["Question", "Answer", "Rationale"], interactive=False, wrap=True)
+            answer_btn.click(
+                fn=answer_questions_handler,
+                inputs=[patient_pdf, questions_state],  # questions come from the state filled in tab 1
+                outputs=answers_df,
+            )
+    gr.Markdown(
+        "### Notes\n"
+        "- Embeddings are cached under `./embeddings/<PDF-name>/` to avoid recomputation.\n"
+        "- Set your OpenAI key: `export OPENAI_API_KEY=...` before running.\n"
+        "- If a PDF is scanned/image-only, text extraction may be poor (consider OCR pre-processing).\n"
+    )
+if __name__ == "__main__":  # launch the UI only when this file is run as a script
+    # Optional: set server params via env (Arch-friendly)
+    # os.environ.setdefault("GRADIO_SERVER_NAME", "0.0.0.0")
+    # os.environ.setdefault("GRADIO_SERVER_PORT", "7860")
+    demo.launch()  # called with no arguments; the env overrides above are left commented out