abjasrees committed on
Commit
bdd2e75
·
verified ·
1 Parent(s): 6baa083

Update files_process.py

Browse files
Files changed (1) hide show
  1. files_process.py +102 -85
files_process.py CHANGED
@@ -1,90 +1,107 @@
1
  # files_process.py
2
- import pathlib
3
- from typing import Union
4
- from pypdf import PdfReader
5
- from docx import Document
6
-
7
def _read_file_by_ext(p: pathlib.Path) -> str:
    """Return the text content of *p*, picking a reader from its suffix.

    The suffix is compared case-insensitively:

    - ``.pdf``  -> text of every page (via ``PdfReader``), joined with
      newlines; pages that yield no text are skipped.
    - ``.docx`` -> text of every paragraph (via ``Document``), joined with
      newlines.
    - ``.txt``  -> file decoded as UTF-8, undecodable bytes dropped.

    Raises:
        ValueError: if the suffix is none of the above.
    """
    suffix = p.suffix.lower()

    if suffix == ".pdf":
        page_texts = (page.extract_text() for page in PdfReader(str(p)).pages)
        # Keep only pages that actually produced text.
        return "\n".join(chunk for chunk in page_texts if chunk)

    if suffix == ".docx":
        paragraphs = Document(str(p)).paragraphs
        return "\n".join(para.text for para in paragraphs)

    if suffix == ".txt":
        return p.read_text(encoding="utf-8", errors="ignore")

    raise ValueError(f"Unsupported file extension: {suffix}. Use .txt / .docx / .pdf.")
23
-
24
def load_input_text(input_arg: Union[str, pathlib.Path]) -> str:
    """
    Load text from a string, or from a file path (.txt, .docx, .pdf).

    - A ``pathlib.Path`` is read by extension when it points at a regular
      file; otherwise its string form is returned.
    - A string that looks like plain text (contains a newline / carriage
      return, or is longer than 512 characters) is returned as-is without
      touching the filesystem.
    - Any other string is read by extension when it names an existing regular
      file; otherwise it is returned unchanged.
    - On any OSError (e.g., Errno 36, file name too long), the input is
      treated as raw text.

    Raises:
        ValueError: if ``input_arg`` is None, or if the file exists but has an
            unsupported extension (raised by ``_read_file_by_ext``).
    """
    if input_arg is None:
        raise ValueError("input_arg is required")

    if isinstance(input_arg, pathlib.Path):
        candidate = input_arg
        raw = str(input_arg)
    else:
        raw = str(input_arg)
        if ("\n" in raw) or ("\r" in raw) or (len(raw) > 512):
            return raw
        candidate = pathlib.Path(raw)

    try:
        # is_file() rather than exists(): directories also "exist", and
        # Path("") normalises to ".", so an empty string or a short word that
        # matches a directory name must stay raw text instead of being handed
        # to the extension-based reader (which would raise ValueError).
        if candidate.is_file():
            return _read_file_by_ext(candidate)
    except OSError:
        return raw
    return raw
53
-
54
def prepare_input_arg(text_value: str | None, file_obj) -> str:
    """
    Combine textbox text and a single uploaded file (.txt/.docx/.pdf).

    Returns:
        - the stripped text when no file is given;
        - the uploaded file's path when no text is given;
        - otherwise the path of a freshly created temp ``.txt`` file holding
          the text, a blank line, and the text loaded from the upload.

    ``file_obj`` may be a path (``str`` / ``os.PathLike``), an object with a
    string ``.name`` attribute, a dict with a ``"name"`` key, or a nameless
    object with ``.read()`` (its content is spooled to a unique temp file).

    Raises:
        ValueError: if neither text nor file is provided, or if ``file_obj``
            is none of the supported shapes.
    """
    import os
    import tempfile

    text = (text_value or "").strip()
    if file_obj is None and not text:
        raise ValueError("Provide either text or upload a .txt/.docx/.pdf")

    # If only text
    if file_obj is None:
        return text

    # Best-effort path extraction. Plain paths are checked first because
    # pathlib.Path also has a string ``.name`` (the basename only), which
    # would otherwise silently drop the directory part.
    if isinstance(file_obj, (str, os.PathLike)):
        up_path = pathlib.Path(file_obj)
    elif isinstance(getattr(file_obj, "name", None), str):
        up_path = pathlib.Path(file_obj.name)
    elif isinstance(file_obj, dict) and "name" in file_obj:
        up_path = pathlib.Path(file_obj["name"])
    elif callable(getattr(file_obj, "read", None)):
        # As a fallback, spool the stream's content to disk.
        content = file_obj.read()
        content = content.encode("utf-8") if isinstance(content, str) else bytes(content)

        # The downstream reader dispatches on the extension, so give the
        # spooled file one it can handle: sniff PDF / ZIP (docx) signatures
        # and fall back to plain text.
        if content.startswith(b"%PDF-"):
            suffix = ".pdf"
        elif content.startswith(b"PK\x03\x04"):
            suffix = ".docx"
        else:
            suffix = ".txt"

        # Unique name per call so concurrent uploads cannot clobber each other.
        with tempfile.NamedTemporaryFile(prefix="upload_", suffix=suffix, delete=False) as spool:
            spool.write(content)
        up_path = pathlib.Path(spool.name)
    else:
        raise ValueError("Unsupported uploaded file object; missing .name or .read()")

    if text:
        # Load first so a failure does not leave an orphaned temp file behind.
        appended = load_input_text(str(up_path))
        with tempfile.NamedTemporaryFile(
            "w", encoding="utf-8", prefix="_concat_input_", suffix=".txt", delete=False
        ) as combined:
            combined.write(text + "\n\n" + appended)
        return combined.name

    return str(up_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # files_process.py
2
+ import os
3
+ from typing import List, Dict
4
+
5
+ import gradio as gr
6
+ import pandas as pd
7
+
8
+ from agents import QuestionExtractionEngine, AnsweringEngine
9
+
10
+
11
+ # ---- Gradio handlers ----
12
+
13
def extract_questions_handler(guideline_pdf, surgery_type, suggestions):
    """
    Gradio handler: extract questions from an uploaded guideline PDF.

    Builds a ``QuestionExtractionEngine`` for the PDF (surgery type defaults
    to "Procedure", suggestions to an empty string) and runs it.

    Returns (questions_list, questions_dataframe), where the DataFrame has a
    1-based "#" column and a "Question" column.

    Raises ``gr.Error`` when no PDF was uploaded or no questions came back.
    """
    if guideline_pdf is None:
        raise gr.Error("Please upload a surgical guideline PDF (e.g., Knee Arthroplasty guideline).")

    # The upload may be an object exposing ``.name`` or something path-like.
    try:
        pdf_path = guideline_pdf.name
    except AttributeError:
        pdf_path = str(guideline_pdf)

    engine = QuestionExtractionEngine(
        pdf_path,
        surgery_type or "Procedure",
        user_suggestions=suggestions or "",
    )
    questions: List[str] = engine.run()
    if not questions:
        raise gr.Error("I couldn't extract any questions. Try adding suggestions or another guideline PDF.")

    numbering = [position for position, _ in enumerate(questions, start=1)]
    table = pd.DataFrame({"#": numbering, "Question": questions})
    return questions, table
29
+
30
+
31
def answer_questions_handler(patient_pdf, questions_state):
    """
    Gradio handler: answer every stored question against a patient chart PDF.

    Each question is passed to ``AnsweringEngine.answer_one``; the result is
    looked up by the question text, with "Not Found" / "—" used when the
    answer or rationale is missing.

    Returns a DataFrame: Question | Answer | Rationale

    Raises ``gr.Error`` when no PDF was uploaded or no questions are stored.
    """
    if patient_pdf is None:
        raise gr.Error("Please upload a patient chart PDF.")
    if not questions_state:
        raise gr.Error("No questions available. Extract questions first.")

    # The upload may be an object exposing ``.name`` or something path-like.
    try:
        pdf_path = patient_pdf.name
    except AttributeError:
        pdf_path = str(patient_pdf)

    engine = AnsweringEngine(pdf_path)

    def _as_row(question):
        # The engine's result is keyed by the question text itself.
        result: Dict[str, Dict[str, str]] = engine.answer_one(question)
        entry = result.get(question, {})
        return {
            "Question": question,
            "Answer": entry.get("answer", "Not Found"),
            "Rationale": entry.get("rationale", "—"),
        }

    records = [_as_row(question) for question in questions_state]
    return pd.DataFrame(records, columns=["Question", "Answer", "Rationale"])
55
+
56
+
57
# ---- UI definition ----
# NOTE(review): the container nesting below (what sits inside the Row, and the
# Notes block living outside the Tabs) was reconstructed from flattened
# source -- confirm against the intended layout.
with gr.Blocks(title="Guideline → Questions → Chart Answers", theme="soft") as demo:
    gr.Markdown("# 🏥 Guideline Q&A (Yes/No) — with Iterative Feedback")

    with gr.Tabs():
        # Step 1: turn a guideline PDF into a list of questions.
        with gr.Tab("1) Extract Questions"):
            with gr.Row():
                guideline_pdf = gr.File(
                    label="Upload Guideline PDF (e.g., Knee_Arthroplasty.pdf)",
                    file_count="single",
                )
                surgery_type = gr.Textbox(
                    label="Surgery Type",
                    value="Knee Arthroplasty",
                )
            suggestions = gr.Textbox(
                label="Suggestions (optional)",
                placeholder="E.g., 'Focus on contraindications and pre-op lab thresholds; keep questions short.'",
            )
            extract_btn = gr.Button("Extract Yes/No Questions", variant="primary")

            # Holds the extracted questions (list[str]) so the second tab can
            # reuse them.
            questions_state = gr.State([])
            questions_df = gr.Dataframe(
                headers=["#", "Question"],
                interactive=False,
                wrap=True,
            )
            gr.Markdown("If the questions need refinement, update 'Suggestions' and click the button again.")

            def _extract_and_store(g_pdf, s_type, sugg):
                # Forward to the handler; its (list, DataFrame) pair feeds the
                # state and the table, in that order.
                return extract_questions_handler(g_pdf, s_type, sugg)

            extract_btn.click(
                fn=_extract_and_store,
                inputs=[guideline_pdf, surgery_type, suggestions],
                outputs=[questions_state, questions_df],
            )

        # Step 2: answer the stored questions from a patient chart PDF.
        with gr.Tab("2) Answer from Patient Chart"):
            patient_pdf = gr.File(
                label="Upload Patient Chart PDF (e.g., JAY_MORGAN.pdf)",
                file_count="single",
            )
            answer_btn = gr.Button("Answer All Questions", variant="primary")
            answers_df = gr.Dataframe(
                headers=["Question", "Answer", "Rationale"],
                interactive=False,
                wrap=True,
            )

            answer_btn.click(
                fn=answer_questions_handler,
                inputs=[patient_pdf, questions_state],
                outputs=answers_df,
            )

    gr.Markdown(
        "### Notes\n"
        "- Embeddings are cached under `./embeddings/<PDF-name>/` to avoid recomputation.\n"
        "- Set your OpenAI key: `export OPENAI_API_KEY=...` before running.\n"
        "- If a PDF is scanned/image-only, text extraction may be poor (consider OCR pre-processing).\n"
    )


if __name__ == "__main__":
    # Optional: server parameters can be supplied through the environment
    # before launching, e.g. GRADIO_SERVER_NAME="0.0.0.0" and
    # GRADIO_SERVER_PORT="7860".
    demo.launch()