vikramvasudevan committed on
Commit
0cfb077
·
verified ·
1 Parent(s): ed2dc57

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .github/workflows/update_space.yml +28 -28
  2. app.py +58 -58
  3. config.py +8 -8
  4. graph.py +448 -448
  5. models.py +14 -14
  6. pdf_reader.py +16 -16
.github/workflows/update_space.yml CHANGED
@@ -1,28 +1,28 @@
1
- name: Run Python script
2
-
3
- on:
4
- push:
5
- branches:
6
- - main
7
-
8
- jobs:
9
- build:
10
- runs-on: ubuntu-latest
11
-
12
- steps:
13
- - name: Checkout
14
- uses: actions/checkout@v2
15
-
16
- - name: Set up Python
17
- uses: actions/setup-python@v2
18
- with:
19
- python-version: '3.9'
20
-
21
- - name: Install Gradio
22
- run: python -m pip install gradio
23
-
24
- - name: Log in to Hugging Face
25
- run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
26
-
27
- - name: Deploy to Spaces
28
- run: gradio deploy
 
1
name: Run Python script

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        # v2 runs on a deprecated Node runtime; v4 is the supported release.
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install Gradio
        run: python -m pip install gradio

      - name: Log in to Hugging Face
        # Pass the secret through the environment instead of interpolating it
        # into the command line, so it never becomes part of the shell string.
        env:
          HF_TOKEN: ${{ secrets.hf_token }}
        run: python -c 'import os, huggingface_hub; huggingface_hub.login(token=os.environ["HF_TOKEN"])'

      - name: Deploy to Spaces
        run: gradio deploy
app.py CHANGED
@@ -1,58 +1,58 @@
1
- import gradio as gr
2
- import uuid
3
- import os
4
- from graph import create_graph, SheamiState, HealthReport
5
- from pdf_reader import read_pdf
6
-
7
-
8
- def process_reports(files):
9
- if not files:
10
- return "Please upload at least one PDF file."
11
-
12
- thread_id = str(uuid.uuid4())
13
- # Create workflow
14
- workflow = create_graph(thread_id=thread_id)
15
-
16
- # Convert uploaded PDFs into HealthReport objects
17
- uploaded_reports = []
18
- for file in files:
19
- file_path = file.name
20
- contents = read_pdf(file_path)
21
- uploaded_reports.append(
22
- HealthReport(
23
- report_file_name=os.path.basename(file_path), report_contents=contents
24
- )
25
- )
26
-
27
- # Run workflow
28
- # Create initial state
29
- state = SheamiState(uploaded_reports=uploaded_reports, thread_id=thread_id)
30
-
31
- config = {"configurable": {"thread_id": thread_id}}
32
- response = workflow.invoke(state, config=config)
33
- return (
34
- f"✅ Processed {len(files)} reports.\n"
35
- "Please download the output file from below within 5 min."
36
- ), response["interpreted_report"]
37
-
38
-
39
- # Build Gradio UI
40
- with gr.Blocks() as demo:
41
- gr.Markdown("## 🩺 Sheami - Smart Healthcare Engine for Artificial Medical Intelligence")
42
-
43
- with gr.Row():
44
- file_input = gr.File(
45
- file_types=[".pdf"],
46
- type="filepath",
47
- file_count="multiple",
48
- label="Upload your Lab Reports (PDF)",
49
- )
50
-
51
- run_btn = gr.Button("Process Reports", variant="primary")
52
- output_box = gr.Textbox(label="Processing Output", lines=2)
53
- pdf_output = gr.File(label="Generated Report")
54
-
55
- run_btn.click(process_reports, inputs=file_input, outputs=[output_box, pdf_output])
56
-
57
- if __name__ == "__main__":
58
- demo.launch()
 
1
+ import gradio as gr
2
+ import uuid
3
+ import os
4
+ from graph import create_graph, SheamiState, HealthReport
5
+ from pdf_reader import read_pdf
6
+
7
+
8
def process_reports(files):
    """Run the Sheami pipeline over uploaded PDF lab reports.

    Args:
        files: Uploads from gr.File — plain path strings when the component
            uses type="filepath", or file-like wrappers exposing .name.

    Returns:
        A 2-tuple of (status message, path to the generated PDF or None),
        matching the two Gradio output components bound to the click handler.
    """
    if not files:
        # Must still be a 2-tuple: the click handler binds two outputs
        # (output_box, pdf_output); a bare string would make Gradio error out.
        return "Please upload at least one PDF file.", None

    thread_id = str(uuid.uuid4())
    # Create workflow (checkpointed per thread_id)
    workflow = create_graph(thread_id=thread_id)

    # Convert uploaded PDFs into HealthReport objects
    uploaded_reports = []
    for file in files:
        # gr.File(type="filepath") yields str paths; other configurations
        # yield tempfile wrappers exposing .name — accept both.
        file_path = file if isinstance(file, str) else file.name
        contents = read_pdf(file_path)
        uploaded_reports.append(
            HealthReport(
                report_file_name=os.path.basename(file_path), report_contents=contents
            )
        )

    # Build the initial state and run the workflow
    state = SheamiState(uploaded_reports=uploaded_reports, thread_id=thread_id)
    config = {"configurable": {"thread_id": thread_id}}
    response = workflow.invoke(state, config=config)
    return (
        f"✅ Processed {len(files)} reports.\n"
        "Please download the output file from below within 5 min."
    ), response["interpreted_report"]
37
+
38
+
39
# Build Gradio UI.
# NOTE: component creation order inside the Blocks context defines the layout,
# so statements here must stay in this order.
with gr.Blocks() as demo:
    gr.Markdown("## 🩺 Sheami - Smart Healthcare Engine for Artificial Medical Intelligence")

    with gr.Row():
        # Multiple PDF uploads; type="filepath" hands the handler file paths.
        file_input = gr.File(
            file_types=[".pdf"],
            type="filepath",
            file_count="multiple",
            label="Upload your Lab Reports (PDF)",
        )

    run_btn = gr.Button("Process Reports", variant="primary")
    output_box = gr.Textbox(label="Processing Output", lines=2)
    pdf_output = gr.File(label="Generated Report")

    # process_reports is expected to return (status message, path to PDF),
    # matching the two output components bound here.
    run_btn.click(process_reports, inputs=file_input, outputs=[output_box, pdf_output])

if __name__ == "__main__":
    demo.launch()
config.py CHANGED
@@ -1,9 +1,9 @@
1
- import os
2
-
3
-
4
- class SheamiConfig:
5
- _output_dir = "./output"
6
- data_dir = "./data"
7
-
8
- def get_output_dir(thread_id:str):
9
  return os.path.join(SheamiConfig._output_dir, thread_id)
 
1
+ import os
2
+
3
+
4
class SheamiConfig:
    """Central location for the filesystem paths used by the app."""

    _output_dir = "./output"  # root for per-thread generated artifacts
    data_dir = "./data"

    @staticmethod
    def get_output_dir(thread_id: str) -> str:
        """Return the output directory for one pipeline run (thread).

        Declared @staticmethod: the original bare function only worked when
        accessed via the class; this also makes instance access safe.
        """
        return os.path.join(SheamiConfig._output_dir, thread_id)
graph.py CHANGED
@@ -1,448 +1,448 @@
1
- import threading
2
- import time
3
- from numpy import number
4
- import pandas as pd
5
- from langchain_core.prompts import ChatPromptTemplate
6
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
7
- from reportlab.lib.pagesizes import A4
8
- from reportlab.lib.styles import getSampleStyleSheet
9
- from reportlab.lib.units import inch
10
- import matplotlib.pyplot as plt
11
- from dataclasses import dataclass
12
- from typing import Dict, List, Literal, Optional, TypedDict
13
- import os, json
14
- from pydantic import BaseModel
15
- from langchain_core.messages import HumanMessage, SystemMessage
16
- from langgraph.checkpoint.memory import InMemorySaver
17
- from langgraph.graph.message import StateGraph
18
- from langgraph.graph.state import START, END
19
- from langchain_openai import ChatOpenAI
20
- from dotenv import load_dotenv
21
- from config import SheamiConfig
22
- import logging
23
-
24
# Root logging config; module logger at INFO so per-thread progress is visible.
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# override=True: values from .env win over pre-existing environment variables.
load_dotenv(override=True)
# NOTE(review): os.getenv("MODEL") is None when MODEL is unset, which would
# make ChatOpenAI fail at import time — consider a default or explicit check.
llm = ChatOpenAI(model=os.getenv("MODEL"), temperature=0.3)
30
-
31
- # -----------------------------
32
- # SCHEMA DEFINITIONS
33
- # -----------------------------
34
-
35
-
36
- from typing import Optional, List
37
- from pydantic import BaseModel, Field
38
-
39
-
40
class PatientInfo(BaseModel):
    """Patient demographics extracted from a report; every field is optional."""

    name: Optional[str] = Field(None, description="Patient's full name")
    age: Optional[int] = Field(None, description="Patient's age in years")
    sex: Optional[str] = Field(None, description="Male/Female/Other")
    medical_record_number: Optional[str] = None

    class Config:
        # extra="forbid" keeps the generated JSON schema closed, which
        # OpenAI's strict structured-output mode requires.
        extra = "forbid"
48
-
49
-
50
class TestResultReferenceRange(BaseModel):
    """Numeric reference interval for a lab test; either bound may be absent."""

    min: Optional[float] = None
    max: Optional[float] = None
53
-
54
-
55
class LabResult(BaseModel):
    """One lab measurement extracted from a single report."""

    test_name: str
    # Kept as a string: raw reports can contain non-numeric results.
    result_value: str
    test_unit: str
    test_reference_range: Optional[TestResultReferenceRange] = None
    test_date: Optional[str] = None  # raw date text; parsed later by parse_any_date
    # The LLM's judgement of where the value falls relative to the range.
    inferred_range: Literal["low", "normal", "high"]

    class Config:
        extra = "forbid"  # closed schema for OpenAI strict structured output
65
-
66
-
67
class StandardizedReport(BaseModel):
    """Canonical, schema-validated form of one uploaded health report."""

    patient_info: PatientInfo  # typed model rather than a raw dict
    lab_results: List[LabResult]
    diagnosis: List[str]
    recommendations: List[str]

    class Config:
        extra = "forbid"  # closed schema for OpenAI strict structured output
75
-
76
-
77
@dataclass
class HealthReport:
    """Raw uploaded report: original file name plus extracted text contents."""

    report_file_name: str
    report_contents: str
81
-
82
-
83
class SheamiState(TypedDict):
    """Mutable pipeline state passed between the LangGraph nodes."""

    thread_id: str  # unique per run; keys the output directory and checkpoints
    uploaded_reports: List[HealthReport]
    standardized_reports: List[StandardizedReport]  # filled by fn_standardizer_node
    trends_json: dict  # built by fn_trends_aggregator_node
    interpreted_report: str  # path of the generated final_report.pdf
89
-
90
-
91
- import re
92
-
93
-
94
def safe_filename(name: str) -> str:
    """Turn an arbitrary test name into a filesystem-safe identifier.

    Spaces and any character outside [A-Za-z0-9_-] become underscores,
    runs of underscores collapse to one, and edge underscores are trimmed.
    """
    candidate = name.replace(" ", "_")
    candidate = re.sub(r"[^A-Za-z0-9_\-]", "_", candidate)
    candidate = re.sub(r"_{2,}", "_", candidate)
    return candidate.strip("_")
102
-
103
-
104
- import dateutil.parser
105
-
106
-
107
def parse_any_date(date_str):
    """Best-effort parse of an arbitrary date string into a datetime.

    Returns pd.NaT for empty/NaN input or anything dateutil cannot parse,
    so callers can rely on pandas' missing-date semantics downstream.
    dayfirst=False biases ambiguous dates like 01/02/2024 toward MM/DD.
    """
    if not date_str or pd.isna(date_str):
        return pd.NaT
    try:
        # fuzzy=True lets dateutil skip surrounding noise text in the string
        return dateutil.parser.parse(str(date_str), dayfirst=False, fuzzy=True)
    except Exception:
        return pd.NaT
114
-
115
-
116
# Prompt template for normalizing lab test names. The system message is built
# from adjacent string literals; each fragment now ends with a space so the
# concatenated prompt does not run sentences together ("names.All outputs").
testname_standardizer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a medical assistant. Normalize lab test names. "
            "All outputs must use **title case** (e.g., 'Hemoglobin', 'Blood Glucose'). "
            "Return ONLY valid JSON where keys are original names and values are standardized names. DO NOT return markdown formatting like backquotes etc.",
        ),
        (
            "human",
            """Normalize the following lab test names to their standard medical equivalents.
    Test names: {test_names}
    """,
        ),
    ]
)

# chain = prompt → LLM (the response is a message; .content holds the JSON text)
testname_standardizer_chain = testname_standardizer_prompt | llm
136
-
137
- # -----------------------------
138
- # GRAPH NODES
139
- # -----------------------------
140
-
141
-
142
def fn_init_node(state: SheamiState):
    """Ensure the per-thread output directory exists and reset derived fields."""
    output_dir = SheamiConfig.get_output_dir(state["thread_id"])
    os.makedirs(output_dir, exist_ok=True)
    state.update(standardized_reports=[], trends_json={}, interpreted_report="")
    return state
148
-
149
-
150
def fn_standardizer_node(state: SheamiState):
    """Convert each uploaded raw report into a StandardizedReport via the LLM.

    Each result is appended to state["standardized_reports"] and also written
    to the thread's output directory as report_<idx>.json for inspection.
    """
    logger.info("%s| Standardizing reports", state["thread_id"])
    # The structured-output wrapper forces replies into the pydantic schema.
    llm_structured = llm.with_structured_output(StandardizedReport)
    for idx, report in enumerate(state["uploaded_reports"]):
        logger.info("%s| Standardizing report %s", state["thread_id"], report.report_file_name)

        messages = [
            SystemMessage(content="Standardize this medical report into the schema."),
            # SystemMessage(
            #     content="Populate the `inferred_range` field as 'low', 'normal', or 'high' by comparing the result value with the reference range. If both min and max are missing, set 'normal' unless the value is clearly out of usual medical ranges."
            # ),
            HumanMessage(content=report.report_contents),
        ]
        result: StandardizedReport = llm_structured.invoke(messages)
        state["standardized_reports"].append(result)
        # save each standardized report to disk for debugging/traceability
        with open(
            os.path.join(SheamiConfig.get_output_dir(state["thread_id"]), f"report_{idx}.json"), "w"
        ) as f:
            f.write(result.model_dump_json(indent=2))
    logger.info("%s| Standardizing Reports: finished", state["thread_id"])
    return state
172
-
173
-
174
def fn_testname_standardizer_node(state: SheamiState):
    """Normalize lab-test names across reports to canonical, title-cased forms.

    Collects the unique test names, asks the LLM for an {original: standardized}
    JSON mapping, then rewrites every LabResult.test_name through that mapping.
    Falls back to an identity mapping if the reply is not valid JSON.
    """
    logger.info("%s| Standardizing Test Names: started", state["thread_id"])
    # collect unique names (set comprehension de-duplicates across reports)
    unique_names = list(
        {
            result.test_name
            for report in state["standardized_reports"]
            for result in report.lab_results
        }
    )

    # run through LLM
    response = testname_standardizer_chain.invoke({"test_names": unique_names})
    raw_text = response.content

    try:
        normalization_map: Dict[str, str] = json.loads(raw_text)
    except (ValueError, TypeError) as e:
        # Use the module logger (not print) so failures land in the same
        # per-thread log stream as every other node; keep original names.
        logger.warning(
            "%s| Test-name normalization failed (%s); keeping original names",
            state["thread_id"],
            e,
        )
        normalization_map = {name: name for name in unique_names}

    # apply mapping back; unknown names pass through unchanged
    for report in state["standardized_reports"]:
        for result in report.lab_results:
            result.test_name = normalization_map.get(result.test_name, result.test_name)
    logger.info("%s| Standardizing Test Names: finished", state["thread_id"])
    return state
203
-
204
-
205
def fn_unit_normalizer_node(state: SheamiState):
    """Normalize units for lab test values across all standardized reports.

    Example: 'gms/dL', 'gm%', 'G/DL' → 'g/dL'. Unknown units are kept as-is.
    (In the original code this text sat after the first statement, making it a
    no-op string expression instead of the function docstring.)
    """
    logger.info("%s| Standardizing Units : started", state["thread_id"])

    # Keys are lower-cased, space-stripped spellings; values are canonical units.
    unit_map = {
        "g/dl": "g/dL",
        "gms/dl": "g/dL",
        "gm%": "g/dL",
        "g/dl.": "g/dL",
    }

    for report in state["standardized_reports"]:
        for lr in report.lab_results:
            if not lr.test_unit:
                continue  # nothing to normalize
            normalized = lr.test_unit.lower().replace(" ", "")
            lr.test_unit = unit_map.get(
                normalized, lr.test_unit
            )  # fallback: keep original

    logger.info("%s| Standardizing Units : finished", state["thread_id"])
    return state
229
-
230
-
231
def fn_trends_aggregator_node(state: SheamiState):
    """Group lab results from all standardized reports into per-test time series.

    Builds state["trends_json"] = {"parameter_trends": [...]} where each entry
    holds the test name, its dated values, and (when seen) a reference range,
    then persists the structure to <output_dir>/trends.json.
    """
    logger.info("%s| Aggregating Trends : started", state["thread_id"])
    import re  # NOTE(review): unused in this function — candidate for removal

    # group results by test_name
    trends = {}
    ref_ranges = {}

    def try_parse_float(value: str):
        # Return None (rather than raising) for non-numeric result values.
        try:
            return float(value)
        except (ValueError, TypeError):
            return None

    for idx, report in enumerate(state["standardized_reports"]):
        logger.info("%s| Aggregating Trends for report-%d", state["thread_id"], idx)
        for lr in report.lab_results:
            numeric_value = try_parse_float(lr.result_value)

            trends.setdefault(lr.test_name, []).append(
                {
                    "date": lr.test_date or "unknown",
                    "value": (
                        numeric_value if numeric_value is not None else lr.result_value
                    ),
                    "is_numeric": numeric_value is not None,
                    "unit": lr.test_unit,
                }
            )

            # Capture reference range if available (assumed the same for all
            # entries of a test_name; the last report seen wins).
            if lr.test_reference_range:
                ref_ranges[lr.test_name] = {
                    "min": lr.test_reference_range.min,
                    "max": lr.test_reference_range.max,
                }

    # combine into parameter_trends
    state["trends_json"] = {
        "parameter_trends": [
            {
                "test_name": k,
                "values": v,
                "reference_range": ref_ranges.get(k),  # thresholds, may be None
            }
            for k, v in trends.items()
        ]
    }

    with open(os.path.join(SheamiConfig.get_output_dir(state["thread_id"]), "trends.json"), "w") as f:
        json.dump(state["trends_json"], f, indent=2)
    logger.info("%s| Aggregating Trends : finished", state["thread_id"])
    return state
285
-
286
-
287
def fn_interpreter_node(state: SheamiState):
    """Produce the final PDF: LLM narrative plus one trend chart per test.

    Steps: (1) ask the LLM for a textual interpretation of trends_json,
    (2) render a matplotlib chart per numeric parameter into <output>/plots,
    (3) assemble text + charts into final_report.pdf with reportlab, store
    its path in state["interpreted_report"], and schedule output cleanup.
    """
    logger.info("%s| Interpreting Trends : started", state["thread_id"])
    # 1. LLM narrative
    messages = [
        SystemMessage(
            content="Interpret the following medical trends and produce a report with patient summary, trend summaries, and clinical insights. "
            "Do not include charts, they will be programmatically added."
        ),
        HumanMessage(content=json.dumps(state["trends_json"], indent=2)),
    ]
    response = llm.invoke(messages)
    interpretation_text = response.content

    # 2. Generate plots for each parameter (sorted for stable chart order)
    plots_dir = os.path.join(SheamiConfig.get_output_dir(state["thread_id"]), "plots")
    os.makedirs(plots_dir, exist_ok=True)
    plot_files = []

    for param in sorted(
        state["trends_json"].get("parameter_trends", []), key=lambda x: x["test_name"]
    ):
        test_name = param["test_name"]
        values = param["values"]

        # Parse free-form date strings; unparseable dates become NaT.
        x = [v["date"] for v in values]
        # print("original dates for ", test_name, "= ", x)
        x = [parse_any_date(d) for d in x]
        x = pd.to_datetime(x, errors="coerce")
        # print("formatted dates for ", test_name, "= ", x)

        try:
            y = [float(v["value"]) for v in values]
        except ValueError:
            continue  # skip non-numeric parameters entirely

        ## sort the data by date
        # Zip into a DataFrame for easy sorting
        df_plot = pd.DataFrame({"x": x, "y": y})

        # Drop invalid dates if any
        df_plot = df_plot.dropna(subset=["x"])

        # Sort by date
        df_plot = df_plot.sort_values("x")

        # Extract sorted arrays
        x = df_plot["x"].to_numpy()
        y = df_plot["y"].to_numpy()

        # print("formatted + sorted dates for", test_name, "=", x)

        plt.figure(figsize=(6, 4))
        plt.plot(x, y, marker="o", linestyle="-", label="Observed values")

        # Overlay thresholds if available: a shaded band when both bounds
        # exist, otherwise a single dashed line for the one known bound.
        ref = param.get("reference_range")
        if ref:
            ymin, ymax = ref.get("min"), ref.get("max")
            if ymin is not None and ymax is not None:
                plt.axhspan(
                    ymin, ymax, color="green", alpha=0.2, label="Reference range"
                )
            elif ymax is not None:
                plt.axhline(
                    y=ymax, color="red", linestyle="--", label="Upper threshold"
                )
            elif ymin is not None:
                plt.axhline(
                    y=ymin, color="blue", linestyle="--", label="Lower threshold"
                )

        plt.title(f"{test_name} Trend")
        plt.xlabel("Date")
        plt.ylabel(values[0]["unit"] if values and "unit" in values[0] else "")
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.legend()
        plt.tight_layout()

        filename = f"{safe_filename(test_name).replace(' ', '_')}_trend.png"
        filepath = os.path.join(plots_dir, filename)
        plt.savefig(filepath)
        plt.close()
        plot_files.append((test_name, filepath))

    # 3. Build PDF
    pdf_path = os.path.join(SheamiConfig.get_output_dir(state["thread_id"]), "final_report.pdf")
    doc = SimpleDocTemplate(pdf_path, pagesize=A4)
    styles = getSampleStyleSheet()
    story = []

    # Add title
    story.append(Paragraph("<b>Medical Report Interpretation</b>", styles["Title"]))
    story.append(Spacer(1, 0.3 * inch))

    # Add interpretation text (LLM output), one paragraph per non-blank line
    for line in interpretation_text.split("\n"):
        if line.strip():
            story.append(Paragraph(line.strip(), styles["Normal"]))
            story.append(Spacer(1, 0.15 * inch))

    # Add charts
    story.append(Spacer(1, 0.5 * inch))
    story.append(Paragraph("<b>Trends</b>", styles["Heading2"]))
    story.append(Spacer(1, 0.2 * inch))

    for test_name, plotfile in plot_files:
        story.append(Paragraph(f"<b>{test_name}</b>", styles["Heading3"]))
        story.append(Image(plotfile, width=5 * inch, height=3 * inch))
        story.append(Spacer(1, 0.3 * inch))

    doc.build(story)

    state["interpreted_report"] = pdf_path
    ###### Schedule Cleanup of output dir after 5 min.
    schedule_cleanup(file_path=SheamiConfig.get_output_dir(state["thread_id"]))
    logger.info("%s| Interpreting Trends : finished", state["thread_id"])
    return state
406
-
407
-
408
def schedule_cleanup(file_path, delay=300):  # 300 sec = 5 min
    """Delete *file_path* (file or directory tree) after *delay* seconds.

    Runs in a daemon thread so it never blocks shutdown; failures are logged
    and swallowed — the cleanup is strictly best-effort.

    Args:
        file_path: Path to a file or directory to remove.
        delay: Seconds to wait before removal (default 300 = 5 minutes).
    """
    def cleanup():
        import shutil  # hoisted out of the branch so it runs once per thread

        time.sleep(delay)
        if os.path.exists(file_path):
            try:
                if os.path.isdir(file_path):
                    shutil.rmtree(file_path)
                else:
                    os.remove(file_path)
                # Log (not print) so output joins the app's logging stream.
                logging.getLogger(__name__).info("Cleaned up: %s", file_path)
            except Exception as e:
                logging.getLogger(__name__).warning(
                    "Cleanup failed for %s: %s", file_path, e
                )

    threading.Thread(target=cleanup, daemon=True).start()
422
-
423
- # -----------------------------
424
- # GRAPH CREATION
425
- # -----------------------------
426
-
427
-
428
def create_graph(thread_id : str):
    """Compile the Sheami processing pipeline as a linear LangGraph workflow.

    Order: init -> standardizer -> testname_standardizer -> unit_normalizer
    -> trends -> interpreter, checkpointed with an in-memory saver.
    """
    logger.info("%s| Creating Graph : started", thread_id)
    memory = InMemorySaver()
    workflow = StateGraph(SheamiState)

    # Declare the pipeline once; register nodes and chain edges from it.
    pipeline = [
        ("init", fn_init_node),
        ("standardizer", fn_standardizer_node),
        ("testname_standardizer", fn_testname_standardizer_node),
        ("unit_normalizer", fn_unit_normalizer_node),
        ("trends", fn_trends_aggregator_node),
        ("interpreter", fn_interpreter_node),
    ]
    for node_name, node_fn in pipeline:
        workflow.add_node(node_name, node_fn)

    stages = [START] + [node_name for node_name, _ in pipeline] + [END]
    for src, dst in zip(stages, stages[1:]):
        workflow.add_edge(src, dst)

    logger.info("%s| Creating Graph : finished", thread_id)
    return workflow.compile(checkpointer=memory)
 
1
+ import threading
2
+ import time
3
+ from numpy import number
4
+ import pandas as pd
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
7
+ from reportlab.lib.pagesizes import A4
8
+ from reportlab.lib.styles import getSampleStyleSheet
9
+ from reportlab.lib.units import inch
10
+ import matplotlib.pyplot as plt
11
+ from dataclasses import dataclass
12
+ from typing import Dict, List, Literal, Optional, TypedDict
13
+ import os, json
14
+ from pydantic import BaseModel
15
+ from langchain_core.messages import HumanMessage, SystemMessage
16
+ from langgraph.checkpoint.memory import InMemorySaver
17
+ from langgraph.graph.message import StateGraph
18
+ from langgraph.graph.state import START, END
19
+ from langchain_openai import ChatOpenAI
20
+ from dotenv import load_dotenv
21
+ from config import SheamiConfig
22
+ import logging
23
+
24
+ logging.basicConfig()
25
+ logger = logging.getLogger(__name__)
26
+ logger.setLevel(logging.INFO)
27
+
28
+ load_dotenv(override=True)
29
+ llm = ChatOpenAI(model=os.getenv("MODEL"), temperature=0.3)
30
+
31
+ # -----------------------------
32
+ # SCHEMA DEFINITIONS
33
+ # -----------------------------
34
+
35
+
36
+ from typing import Optional, List
37
+ from pydantic import BaseModel, Field
38
+
39
+
40
+ class PatientInfo(BaseModel):
41
+ name: Optional[str] = Field(None, description="Patient's full name")
42
+ age: Optional[int] = Field(None, description="Patient's age in years")
43
+ sex: Optional[str] = Field(None, description="Male/Female/Other")
44
+ medical_record_number: Optional[str] = None
45
+
46
+ class Config:
47
+ extra = "forbid" # 🚨 ensures schema matches OpenAI’s strict rules
48
+
49
+
50
+ class TestResultReferenceRange(BaseModel):
51
+ min: Optional[float] = None
52
+ max: Optional[float] = None
53
+
54
+
55
+ class LabResult(BaseModel):
56
+ test_name: str
57
+ result_value: str
58
+ test_unit: str
59
+ test_reference_range: Optional[TestResultReferenceRange] = None
60
+ test_date: Optional[str] = None
61
+ inferred_range: Literal["low", "normal", "high"]
62
+
63
+ class Config:
64
+ extra = "forbid"
65
+
66
+
67
+ class StandardizedReport(BaseModel):
68
+ patient_info: PatientInfo # 🚨 no longer a raw dict
69
+ lab_results: List[LabResult]
70
+ diagnosis: List[str]
71
+ recommendations: List[str]
72
+
73
+ class Config:
74
+ extra = "forbid"
75
+
76
+
77
+ @dataclass
78
+ class HealthReport:
79
+ report_file_name: str
80
+ report_contents: str
81
+
82
+
83
+ class SheamiState(TypedDict):
84
+ thread_id: str
85
+ uploaded_reports: List[HealthReport]
86
+ standardized_reports: List[StandardizedReport]
87
+ trends_json: dict
88
+ interpreted_report: str
89
+
90
+
91
+ import re
92
+
93
+
94
+ def safe_filename(name: str) -> str:
95
+ # Replace spaces with underscores
96
+ name = name.replace(" ", "_")
97
+ # Replace any non-alphanumeric / dash / underscore with "_"
98
+ name = re.sub(r"[^A-Za-z0-9_\-]", "_", name)
99
+ # Collapse multiple underscores
100
+ name = re.sub(r"_+", "_", name)
101
+ return name.strip("_")
102
+
103
+
104
+ import dateutil.parser
105
+
106
+
107
+ def parse_any_date(date_str):
108
+ if not date_str or pd.isna(date_str):
109
+ return pd.NaT
110
+ try:
111
+ return dateutil.parser.parse(str(date_str), dayfirst=False, fuzzy=True)
112
+ except Exception:
113
+ return pd.NaT
114
+
115
+
116
+ # prompt template
117
+ testname_standardizer_prompt = ChatPromptTemplate.from_messages(
118
+ [
119
+ (
120
+ "system",
121
+ "You are a medical assistant. Normalize lab test names."
122
+ "All outputs must use **title case** (e.g., 'Hemoglobin', 'Blood Glucose')."
123
+ "Return ONLY valid JSON where keys are original names and values are standardized names. DO NOT return markdown formatting like backquotes etc.",
124
+ ),
125
+ (
126
+ "human",
127
+ """Normalize the following lab test names to their standard medical equivalents.
128
+ Test names: {test_names}
129
+ """,
130
+ ),
131
+ ]
132
+ )
133
+
134
+ # chain = prompt → LLM → string
135
+ testname_standardizer_chain = testname_standardizer_prompt | llm
136
+
137
+ # -----------------------------
138
+ # GRAPH NODES
139
+ # -----------------------------
140
+
141
+
142
+ def fn_init_node(state: SheamiState):
143
+ os.makedirs(SheamiConfig.get_output_dir(state["thread_id"]), exist_ok=True)
144
+ state["standardized_reports"] = []
145
+ state["trends_json"] = {}
146
+ state["interpreted_report"] = ""
147
+ return state
148
+
149
+
150
+ def fn_standardizer_node(state: SheamiState):
151
+ logger.info("%s| Standardizing reports", state["thread_id"])
152
+ llm_structured = llm.with_structured_output(StandardizedReport)
153
+ for idx, report in enumerate(state["uploaded_reports"]):
154
+ logger.info("%s| Standardizing report %s", state["thread_id"], report.report_file_name)
155
+
156
+ messages = [
157
+ SystemMessage(content="Standardize this medical report into the schema."),
158
+ # SystemMessage(
159
+ # content="Populate the `inferred_range` field as 'low', 'normal', or 'high' by comparing the result value with the reference range. If both min and max are missing, set 'normal' unless the value is clearly out of usual medical ranges."
160
+ # ),
161
+ HumanMessage(content=report.report_contents),
162
+ ]
163
+ result: StandardizedReport = llm_structured.invoke(messages)
164
+ state["standardized_reports"].append(result)
165
+ # save to disk
166
+ with open(
167
+ os.path.join(SheamiConfig.get_output_dir(state["thread_id"]), f"report_{idx}.json"), "w"
168
+ ) as f:
169
+ f.write(result.model_dump_json(indent=2))
170
+ logger.info("%s| Standardizing Reports: finished", state["thread_id"])
171
+ return state
172
+
173
+
174
+ def fn_testname_standardizer_node(state: SheamiState):
175
+ logger.info("%s| Standardizing Test Names: started", state["thread_id"])
176
+ # collect unique names
177
+ unique_names = list(
178
+ {
179
+ result.test_name
180
+ for report in state["standardized_reports"]
181
+ for result in report.lab_results
182
+ }
183
+ )
184
+
185
+ # run through LLM
186
+ response = testname_standardizer_chain.invoke({"test_names": unique_names})
187
+ raw_text = response.content
188
+
189
+ try:
190
+ normalization_map: Dict[str, str] = json.loads(raw_text)
191
+ except Exception as e:
192
+ print("Exception in normalization: ", e)
193
+ normalization_map = {
194
+ name: name for name in unique_names
195
+ } # fallback: identity mapping
196
+
197
+ # apply mapping back
198
+ for report in state["standardized_reports"]:
199
+ for result in report.lab_results:
200
+ result.test_name = normalization_map.get(result.test_name, result.test_name)
201
+ logger.info("%s| Standardizing Test Names: finished", state["thread_id"])
202
+ return state
203
+
204
+
205
+ def fn_unit_normalizer_node(state: SheamiState):
206
+ logger.info("%s| Standardizing Units : started", state["thread_id"])
207
+ """
208
+ Normalize units for lab test values across all standardized reports.
209
+ Example: 'gms/dL', 'gm%', 'G/DL' → 'g/dL'
210
+ """
211
+ unit_map = {
212
+ "g/dl": "g/dL",
213
+ "gms/dl": "g/dL",
214
+ "gm%": "g/dL",
215
+ "g/dl.": "g/dL",
216
+ }
217
+
218
+ for report in state["standardized_reports"]:
219
+ for lr in report.lab_results:
220
+ if not lr.test_unit:
221
+ continue
222
+ normalized = lr.test_unit.lower().replace(" ", "")
223
+ lr.test_unit = unit_map.get(
224
+ normalized, lr.test_unit
225
+ ) # fallback: keep original
226
+
227
+ logger.info("%s| Standardizing Units : finished", state["thread_id"])
228
+ return state
229
+
230
+
231
+ def fn_trends_aggregator_node(state: SheamiState):
232
+ logger.info("%s| Aggregating Trends : started", state["thread_id"])
233
+ import re
234
+
235
+ # group results by test_name
236
+ trends = {}
237
+ ref_ranges = {}
238
+
239
+ def try_parse_float(value: str):
240
+ try:
241
+ return float(value)
242
+ except (ValueError, TypeError):
243
+ # return None if not numeric
244
+ return None
245
+
246
+ for idx, report in enumerate(state["standardized_reports"]):
247
+ logger.info("%s| Aggregating Trends for report-%d", state["thread_id"], idx)
248
+ for lr in report.lab_results:
249
+ numeric_value = try_parse_float(lr.result_value)
250
+
251
+ trends.setdefault(lr.test_name, []).append(
252
+ {
253
+ "date": lr.test_date or "unknown",
254
+ "value": (
255
+ numeric_value if numeric_value is not None else lr.result_value
256
+ ),
257
+ "is_numeric": numeric_value is not None,
258
+ "unit": lr.test_unit,
259
+ }
260
+ )
261
+
262
+ # Capture reference range if available (assuming same for all entries of a test_name)
263
+ if lr.test_reference_range:
264
+ ref_ranges[lr.test_name] = {
265
+ "min": lr.test_reference_range.min,
266
+ "max": lr.test_reference_range.max,
267
+ }
268
+
269
+ # combine into parameter_trends
270
+ state["trends_json"] = {
271
+ "parameter_trends": [
272
+ {
273
+ "test_name": k,
274
+ "values": v,
275
+ "reference_range": ref_ranges.get(k), # attach thresholds
276
+ }
277
+ for k, v in trends.items()
278
+ ]
279
+ }
280
+
281
+ with open(os.path.join(SheamiConfig.get_output_dir(state["thread_id"]), "trends.json"), "w") as f:
282
+ json.dump(state["trends_json"], f, indent=2)
283
+ logger.info("%s| Aggregating Trends : finished", state["thread_id"])
284
+ return state
285
+
286
+
287
def fn_interpreter_node(state: SheamiState):
    """Turn aggregated trends into the final PDF report.

    Steps: (1) ask the LLM for a narrative interpretation of
    state["trends_json"], (2) render one time-series plot per numeric
    parameter, (3) assemble narrative + charts into final_report.pdf.

    Writes the PDF path to state["interpreted_report"] and schedules
    cleanup of the per-thread output directory after 5 minutes.
    """
    logger.info("%s| Interpreting Trends : started", state["thread_id"])

    # 1. LLM narrative — charts are appended programmatically below, so the
    # prompt explicitly forbids the model from emitting its own.
    messages = [
        SystemMessage(
            content="Interpret the following medical trends and produce a report with patient summary, trend summaries, and clinical insights. "
            "Do not include charts, they will be programmatically added."
        ),
        HumanMessage(content=json.dumps(state["trends_json"], indent=2)),
    ]
    response = llm.invoke(messages)
    interpretation_text = response.content

    # 2. Generate one plot per parameter, sorted by test name for a
    # deterministic report layout.
    plots_dir = os.path.join(SheamiConfig.get_output_dir(state["thread_id"]), "plots")
    os.makedirs(plots_dir, exist_ok=True)
    plot_files = []

    for param in sorted(
        state["trends_json"].get("parameter_trends", []), key=lambda x: x["test_name"]
    ):
        test_name = param["test_name"]
        values = param["values"]

        # Normalize heterogeneous date strings; unparseable ones become NaT
        # and are dropped before plotting.
        x = [parse_any_date(v["date"]) for v in values]
        x = pd.to_datetime(x, errors="coerce")

        try:
            y = [float(v["value"]) for v in values]
        except (ValueError, TypeError):
            # skip non-numeric parameters; TypeError covers None/odd types,
            # which float() raises instead of ValueError
            continue

        # Sort observations chronologically so the line plot reads correctly.
        df_plot = pd.DataFrame({"x": x, "y": y})
        df_plot = df_plot.dropna(subset=["x"])
        df_plot = df_plot.sort_values("x")
        x = df_plot["x"].to_numpy()
        y = df_plot["y"].to_numpy()

        plt.figure(figsize=(6, 4))
        plt.plot(x, y, marker="o", linestyle="-", label="Observed values")

        # Overlay the reference range (band) or a single threshold (line)
        # when the trend data provides one.
        ref = param.get("reference_range")
        if ref:
            ymin, ymax = ref.get("min"), ref.get("max")
            if ymin is not None and ymax is not None:
                plt.axhspan(
                    ymin, ymax, color="green", alpha=0.2, label="Reference range"
                )
            elif ymax is not None:
                plt.axhline(
                    y=ymax, color="red", linestyle="--", label="Upper threshold"
                )
            elif ymin is not None:
                plt.axhline(
                    y=ymin, color="blue", linestyle="--", label="Lower threshold"
                )

        plt.title(f"{test_name} Trend")
        plt.xlabel("Date")
        plt.ylabel(values[0]["unit"] if values and "unit" in values[0] else "")
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.legend()
        plt.tight_layout()

        filename = f"{safe_filename(test_name).replace(' ', '_')}_trend.png"
        filepath = os.path.join(plots_dir, filename)
        plt.savefig(filepath)
        plt.close()
        plot_files.append((test_name, filepath))

    # 3. Build the PDF: title, LLM narrative paragraphs, then one chart
    # per test name.
    pdf_path = os.path.join(SheamiConfig.get_output_dir(state["thread_id"]), "final_report.pdf")
    doc = SimpleDocTemplate(pdf_path, pagesize=A4)
    styles = getSampleStyleSheet()
    story = []

    story.append(Paragraph("<b>Medical Report Interpretation</b>", styles["Title"]))
    story.append(Spacer(1, 0.3 * inch))

    # Add interpretation text (LLM output), one paragraph per non-empty line.
    for line in interpretation_text.split("\n"):
        if line.strip():
            story.append(Paragraph(line.strip(), styles["Normal"]))
            story.append(Spacer(1, 0.15 * inch))

    # Add charts.
    story.append(Spacer(1, 0.5 * inch))
    story.append(Paragraph("<b>Trends</b>", styles["Heading2"]))
    story.append(Spacer(1, 0.2 * inch))

    for test_name, plotfile in plot_files:
        story.append(Paragraph(f"<b>{test_name}</b>", styles["Heading3"]))
        story.append(Image(plotfile, width=5 * inch, height=3 * inch))
        story.append(Spacer(1, 0.3 * inch))

    doc.build(story)

    state["interpreted_report"] = pdf_path
    # Best-effort housekeeping: output dir is removed 5 minutes from now.
    schedule_cleanup(file_path=SheamiConfig.get_output_dir(state["thread_id"]))
    logger.info("%s| Interpreting Trends : finished", state["thread_id"])
    return state
406
+
407
+
408
def schedule_cleanup(file_path, delay=300):  # 300 sec = 5 min
    """Delete *file_path* (a file or a directory tree) after *delay* seconds.

    The wait and removal run in a daemon thread so they never block the
    caller or process shutdown. Cleanup is best-effort: failures are
    reported to stdout, never raised.
    """

    def cleanup():
        import shutil  # deferred, as in module style: only needed for dir removal

        time.sleep(delay)
        try:
            if os.path.isdir(file_path):
                shutil.rmtree(file_path)
            else:
                os.remove(file_path)
        except FileNotFoundError:
            # Already gone (or lost a race with another cleanup) — fine.
            # No pre-flight exists() check: EAFP avoids the TOCTOU window.
            return
        except Exception as e:
            print(f"Cleanup failed for {file_path}: {e}")
            return
        print(f"Cleaned up: {file_path}")

    threading.Thread(target=cleanup, daemon=True).start()
422
+
423
+ # -----------------------------
424
+ # GRAPH CREATION
425
+ # -----------------------------
426
+
427
+
428
def create_graph(thread_id : str):
    """Compile the Sheami pipeline as a linear LangGraph workflow.

    Order: init -> standardizer -> testname_standardizer ->
    unit_normalizer -> trends -> interpreter, checkpointed in memory.
    """
    logger.info("%s| Creating Graph : started", thread_id)

    checkpointer = InMemorySaver()
    builder = StateGraph(SheamiState)

    # Nodes listed once, in execution order; edges are derived from it.
    pipeline = [
        ("init", fn_init_node),
        ("standardizer", fn_standardizer_node),
        ("testname_standardizer", fn_testname_standardizer_node),
        ("unit_normalizer", fn_unit_normalizer_node),
        ("trends", fn_trends_aggregator_node),
        ("interpreter", fn_interpreter_node),
    ]
    for node_name, node_fn in pipeline:
        builder.add_node(node_name, node_fn)

    # Wire a straight line: START -> first node -> ... -> last node -> END.
    stages = [START] + [name for name, _ in pipeline] + [END]
    for src, dst in zip(stages, stages[1:]):
        builder.add_edge(src, dst)

    logger.info("%s| Creating Graph : finished", thread_id)
    return builder.compile(checkpointer=checkpointer)
models.py CHANGED
@@ -1,14 +1,14 @@
1
from pydantic import BaseModel
from typing import List, Optional


class SheamiLabResult(BaseModel):
    """One lab test result as extracted from a medical report."""

    test_name: str
    result_value: str
    unit: str
    # Default to None: in Pydantic v2 an Optional annotation without a
    # default is still *required*, which would reject reports that omit
    # the printed reference range.
    reference_range: Optional[str] = None


class SheamiStandardizedReport(BaseModel):
    """Standardized representation of a single medical report."""

    patient_info: dict
    lab_results: List[SheamiLabResult]
    diagnosis: List[str]
    recommendations: List[str]
pdf_reader.py CHANGED
@@ -1,17 +1,17 @@
1
from pypdf import PdfReader


def read_pdf(file_name: str) -> str:
    """Extract text from every page of a PDF.

    Returns the concatenation of per-page blocks, each prefixed with a
    "--- Page N ---" marker (1-based). Pages where pypdf extracts nothing
    contribute an empty body: extract_text() may return None, which the
    original code would have crashed on when concatenating.
    """
    reader = PdfReader(file_name)

    parts = []
    for page_num, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""  # guard: extract_text() can be None
        parts.append(f"--- Page {page_num} ---\n\n{text}")
    # join() builds the result in one pass instead of quadratic +=
    return "".join(parts)