Update files_process.py
Browse files- files_process.py +102 -85
files_process.py
CHANGED
|
@@ -1,90 +1,107 @@
|
|
| 1 |
# files_process.py
|
| 2 |
-
import
|
| 3 |
-
from typing import
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
if ext == ".pdf":
|
| 15 |
-
reader = PdfReader(str(p))
|
| 16 |
-
pages = []
|
| 17 |
-
for page in reader.pages:
|
| 18 |
-
t = page.extract_text()
|
| 19 |
-
if t:
|
| 20 |
-
pages.append(t)
|
| 21 |
-
return "\n".join(pages)
|
| 22 |
-
raise ValueError(f"Unsupported file extension: {ext}. Use .txt / .docx / .pdf.")
|
| 23 |
-
|
| 24 |
-
def load_input_text(input_arg: Union[str, pathlib.Path]) -> str:
|
| 25 |
"""
|
| 26 |
-
|
| 27 |
-
- If the argument looks like plain text (contains newlines or is very long), return it as-is.
|
| 28 |
-
- Otherwise, if it resolves to an existing file, read it by extension.
|
| 29 |
-
- On any OSError from filesystem probing (e.g., Errno 36), treat as raw text.
|
| 30 |
"""
|
| 31 |
-
if
|
| 32 |
-
raise
|
| 33 |
-
|
| 34 |
-
if
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
p = pathlib.Path(s)
|
| 47 |
-
try:
|
| 48 |
-
if p.exists():
|
| 49 |
-
return _read_file_by_ext(p)
|
| 50 |
-
return s
|
| 51 |
-
except OSError:
|
| 52 |
-
return s
|
| 53 |
-
|
| 54 |
-
def prepare_input_arg(text_value: str | None, file_obj) -> str:
|
| 55 |
"""
|
| 56 |
-
|
| 57 |
-
If both present, concatenate into a temp text file and return its path.
|
| 58 |
-
Compatible with Gradio/Scripts where file_obj may have a .name attribute or be a dict.
|
| 59 |
"""
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
if
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# files_process.py
|
| 2 |
+
import os
|
| 3 |
+
from typing import List, Dict
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from agents import QuestionExtractionEngine, AnsweringEngine
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# ---- Gradio handlers ----
|
| 12 |
+
|
| 13 |
+
def extract_questions_handler(guideline_pdf, surgery_type, suggestions):
    """
    Extract questions from an uploaded guideline PDF.

    Args:
        guideline_pdf: Uploaded file value. If it exposes a ``name``
            attribute, that attribute is used as the path; otherwise
            ``str(guideline_pdf)`` is used.
        surgery_type: Procedure label; "Procedure" is used when falsy.
        suggestions: Optional free-text hints; "" is used when falsy.

    Returns (questions_list, questions_dataframe), where the DataFrame has
    the columns "#" (numbered from 1) and "Question".

    Raises:
        gr.Error: when no PDF was supplied, or when the engine returns no
            questions.
    """
    if guideline_pdf is None:
        raise gr.Error("Please upload a surgical guideline PDF (e.g., Knee Arthroplasty guideline).")

    # Resolve the on-disk path of the upload.
    if hasattr(guideline_pdf, "name"):
        source_path = guideline_pdf.name
    else:
        source_path = str(guideline_pdf)

    procedure_label = surgery_type or "Procedure"
    hint_text = suggestions or ""
    extractor = QuestionExtractionEngine(source_path, procedure_label, user_suggestions=hint_text)
    questions: List[str] = extractor.run()

    if not questions:
        raise gr.Error("I couldn't extract any questions. Try adding suggestions or another guideline PDF.")

    # 1-based numbering column shown next to each question.
    numbering = list(range(1, len(questions) + 1))
    table = pd.DataFrame({"#": numbering, "Question": questions})
    return questions, table
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def answer_questions_handler(patient_pdf, questions_state):
    """
    Answer each stored question against an uploaded patient chart PDF.

    Args:
        patient_pdf: Uploaded file value. If it exposes a ``name``
            attribute, that attribute is used as the path; otherwise
            ``str(patient_pdf)`` is used.
        questions_state: Iterable of question strings to answer.

    Returns a DataFrame: Question | Answer | Rationale
    A question with no entry in the engine result gets "Not Found" as its
    answer and "—" as its rationale.

    Raises:
        gr.Error: when no PDF was supplied, or when ``questions_state`` is
            empty/falsy.
    """
    if patient_pdf is None:
        raise gr.Error("Please upload a patient chart PDF.")
    if not questions_state:
        raise gr.Error("No questions available. Extract questions first.")

    # Resolve the on-disk path of the upload.
    if hasattr(patient_pdf, "name"):
        chart_path = patient_pdf.name
    else:
        chart_path = str(patient_pdf)
    responder = AnsweringEngine(chart_path)

    def _as_row(question):
        # The engine result is looked up by the question text itself.
        outcome: Dict[str, Dict[str, str]] = responder.answer_one(question)
        entry = outcome.get(question, {})
        return {
            "Question": question,
            "Answer": entry.get("answer", "Not Found"),
            "Rationale": entry.get("rationale", "—"),
        }

    records = [_as_row(question) for question in questions_state]
    return pd.DataFrame(records, columns=["Question", "Answer", "Rationale"])
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# Two-tab Gradio UI: tab 1 extracts questions from a guideline PDF and stores
# them in session state; tab 2 answers those questions from a patient chart.
# NOTE(review): the original indentation was not available, so the nesting of
# components under Row/Tab/Tabs below is reconstructed from statement order --
# confirm the layout against the running app.
with gr.Blocks(title="Guideline → Questions → Chart Answers", theme="soft") as demo:
    gr.Markdown("# 🏥 Guideline Q&A (Yes/No) — with Iterative Feedback")

    with gr.Tabs():
        with gr.Tab("1) Extract Questions"):
            with gr.Row():
                guideline_pdf = gr.File(
                    label="Upload Guideline PDF (e.g., Knee_Arthroplasty.pdf)",
                    file_count="single",
                )
                surgery_type = gr.Textbox(label="Surgery Type", value="Knee Arthroplasty")
            suggestions = gr.Textbox(
                label="Suggestions (optional)",
                placeholder="E.g., 'Focus on contraindications and pre-op lab thresholds; keep questions short.'",
            )
            extract_btn = gr.Button("Extract Yes/No Questions", variant="primary")

            # Holds the extracted questions (list[str]) between the two tabs.
            questions_state = gr.State([])
            questions_df = gr.Dataframe(headers=["#", "Question"], interactive=False, wrap=True)
            gr.Markdown("If the questions need refinement, update 'Suggestions' and click the button again.")

            def _extract_and_store(g_pdf, s_type, sugg):
                # Returns (question list, table) to fill the state and the
                # displayed dataframe in one click.
                return extract_questions_handler(g_pdf, s_type, sugg)

            extract_btn.click(
                fn=_extract_and_store,
                inputs=[guideline_pdf, surgery_type, suggestions],
                outputs=[questions_state, questions_df],
            )

        with gr.Tab("2) Answer from Patient Chart"):
            patient_pdf = gr.File(
                label="Upload Patient Chart PDF (e.g., JAY_MORGAN.pdf)",
                file_count="single",
            )
            answer_btn = gr.Button("Answer All Questions", variant="primary")
            answers_df = gr.Dataframe(
                headers=["Question", "Answer", "Rationale"],
                interactive=False,
                wrap=True,
            )

            answer_btn.click(
                fn=answer_questions_handler,
                inputs=[patient_pdf, questions_state],
                outputs=answers_df,
            )

    gr.Markdown(
        "### Notes\n"
        "- Embeddings are cached under `./embeddings/<PDF-name>/` to avoid recomputation.\n"
        "- Set your OpenAI key: `export OPENAI_API_KEY=...` before running.\n"
        "- If a PDF is scanned/image-only, text extraction may be poor (consider OCR pre-processing).\n"
    )
|
| 102 |
+
|
| 103 |
+
if __name__ == "__main__":
    # Start the Gradio app only when this file is run as a script.
    # Optional: set server params via env (Arch-friendly)
    # os.environ.setdefault("GRADIO_SERVER_NAME", "0.0.0.0")
    # os.environ.setdefault("GRADIO_SERVER_PORT", "7860")
    demo.launch()
|