Spaces:

bohraanuj23
/

SeverityAnalysis

Runtime error

App Files Files Community

bohraanuj23 committed on May 1, 2025

Commit

14f41f0

1 Parent(s): f27e986

Changes in the model architecture implemented

Browse files

Files changed (7) hide show

README.md +11 -0
app.py +28 -25
model.py +61 -58
augmented_lab_data.csv → model/augmented_lab_data.csv +0 -0
dual_encoder_model.pth → model/dual_encoder_model.pth +0 -0
requirements.txt +3 -1
utils.py +19 -78

README.md CHANGED Viewed

@@ -10,3 +10,14 @@ pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# ICD Code Predictor from Lab Reports
+Upload a medical lab report (PDF). The app extracts the lab test values from it using LangChain + GPT-4o, then a PyTorch dual-encoder model predicts the top three ICD codes with confidence scores.
+Built with:
+- PyTorch Lightning
+- LangChain + GPT-4o
+- FAISS
+- Gradio

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import torch
 import torch.nn.functional as F
-import pandas as pd
 from model import DualEncoderModel
 from utils import (
     extract_text_from_pdf,
@@ -15,7 +14,6 @@ from langchain.embeddings import OpenAIEmbeddings
 from langchain.chains import RetrievalQA
 from langchain.chat_models import ChatOpenAI
-# Load model and ICD mapping once
 lab_cont_features_list = [
     "ALT (SGPT)",
     "AST (SGOT)",
@@ -37,6 +35,9 @@ lab_cont_features_list = [
     "Hematocrit",
 ]
 model = DualEncoderModel(
     lab_cont_dim=len(lab_cont_features_list),
     lab_cat_dims=[],
@@ -45,16 +46,14 @@ model = DualEncoderModel(
     embedding_dim=16,
     num_classes=18,
 )
-model.load_state_dict(
-    torch.load("dual_encoder_model.pth", map_location=torch.device("cpu"))
-)
 model.eval()
-icd_mapping = load_icd_mapping("cleaned_lab_data.csv")
-def predict_from_pdf(pdf_file):
-    text = extract_text_from_pdf(pdf_file.name)
     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     docs = splitter.create_documents([text])
@@ -64,10 +63,9 @@ def predict_from_pdf(pdf_file):
     llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
     qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="refine")
-    lab_tests_response = qa.run(
-        "List lab test names and values only with units (no suggestions). Format: Test: Value Unit"
-    )
-    lab_data = extract_lab_tests_dict(lab_tests_response)
     lab_cont_tensor = prepare_lab_tensor(lab_data, lab_cont_features_list)
     lab_cat_tensor = torch.zeros((1, 0), dtype=torch.int64)
@@ -79,24 +77,29 @@ def predict_from_pdf(pdf_file):
             lab_cont_tensor, lab_cat_tensor, conv_cont_tensor, conv_cat_tensor
         )
         probs = F.softmax(logits, dim=1)
-        top_k_probs = torch.topk(probs, 3, dim=1)
-    diagnoses = []
-    for idx, prob in zip(top_k_probs.indices[0], top_k_probs.values[0]):
         icd_code, icd_label, diagnosis = icd_mapping.get(
-            idx.item(), ("Unknown", "Unknown", "No Description Available")
         )
-        diagnoses.append(f"{diagnosis} (ICD: {icd_code}) - {prob.item()*100:.2f}%")
-    return "\n".join(diagnoses)
-interface = gr.Interface(
-    fn=predict_from_pdf,
-    inputs=gr.File(label="Upload Lab Report (PDF)"),
-    outputs=gr.Textbox(label="Top 3 Predicted Diagnoses with ICD Codes"),
-    title="Medical ICD Code Predictor",
 )
 if __name__ == "__main__":
-    interface.launch()

 import gradio as gr
 import torch
 import torch.nn.functional as F
 from model import DualEncoderModel
 from utils import (
     extract_text_from_pdf,
 from langchain.chains import RetrievalQA
 from langchain.chat_models import ChatOpenAI
 lab_cont_features_list = [
     "ALT (SGPT)",
     "AST (SGOT)",
     "Hematocrit",
 ]
+model_path = "dual_encoder_model.pth"
+icd_csv_path = "augmented_lab_data.csv"
 model = DualEncoderModel(
     lab_cont_dim=len(lab_cont_features_list),
     lab_cat_dims=[],
     embedding_dim=16,
     num_classes=18,
 )
+model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
 model.eval()
+icd_mapping = load_icd_mapping(icd_csv_path)
+def predict_icd(pdf):
+    text = extract_text_from_pdf(pdf.name)
     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     docs = splitter.create_documents([text])
     llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
     qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="refine")
+    query = "List lab test names and values only with units (no suggestions). Format: Test: Value Unit"
+    lab_response = qa.run(query)
+    lab_data = extract_lab_tests_dict(lab_response)
     lab_cont_tensor = prepare_lab_tensor(lab_data, lab_cont_features_list)
     lab_cat_tensor = torch.zeros((1, 0), dtype=torch.int64)
             lab_cont_tensor, lab_cat_tensor, conv_cont_tensor, conv_cat_tensor
         )
         probs = F.softmax(logits, dim=1)
+        top_probs = torch.topk(probs, 3, dim=1)
+    output = ""
+    for i, (label_idx, prob) in enumerate(
+        zip(top_probs.indices[0], top_probs.values[0])
+    ):
         icd_code, icd_label, diagnosis = icd_mapping.get(
+            label_idx.item(), ("Unknown", "Unknown", "No Description Available")
         )
+        confidence = (
+            "🔵 High" if prob > 0.6 else "🟡 Medium" if prob > 0.3 else "🔴 Low"
+        )
+        output += f"{i+1}. **{diagnosis}**\nICD Code: {icd_code}\nConfidence: {confidence} ({prob:.2%})\n\n"
+    return output.strip()
+iface = gr.Interface(
+    fn=predict_icd,
+    inputs=gr.File(label="Upload PDF Lab Report"),
+    outputs=gr.Markdown(label="Predicted Diagnoses (ICD Codes)"),
+    title="ICD Code Predictor from Lab Report",
+    description="Upload a lab report PDF to predict possible diagnoses with ICD codes.",
 )
 if __name__ == "__main__":
+    iface.launch()

model.py CHANGED Viewed

@@ -1,20 +1,28 @@
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 class ResidualBlock(nn.Module):
-    def __init__(self, input_dim, hidden_dim):
         super().__init__()
-        self.fc1 = nn.Linear(input_dim, hidden_dim)
         self.relu = nn.ReLU()
-        self.fc2 = nn.Linear(hidden_dim, input_dim)
     def forward(self, x):
-        return x + self.fc2(self.relu(self.fc1(x)))
-class DualEncoderModel(nn.Module):
     def __init__(
         self,
         lab_cont_dim,
@@ -23,72 +31,67 @@ class DualEncoderModel(nn.Module):
         conv_cat_dims,
         embedding_dim,
         num_classes,
     ):
         super().__init__()
-        # Lab encoder
-        self.lab_cont_dim = lab_cont_dim
-        self.lab_cat_dims = lab_cat_dims
-        self.lab_cat_embeds = nn.ModuleList(
-            [nn.Embedding(cat_dim, embedding_dim) for cat_dim in lab_cat_dims]
         )
-        lab_cat_total_dim = embedding_dim * len(lab_cat_dims)
-        lab_total_input_dim = lab_cont_dim + lab_cat_total_dim
-        self.lab_encoder = nn.Sequential(
-            nn.Linear(lab_total_input_dim, 64),
-            nn.ReLU(),
-            ResidualBlock(64, 32),
-            nn.ReLU(),
         )
-        # Conversation encoder
-        self.conv_cont_dim = conv_cont_dim
-        self.conv_cat_dims = conv_cat_dims
-        self.conv_cat_embeds = nn.ModuleList(
-            [nn.Embedding(cat_dim, embedding_dim) for cat_dim in conv_cat_dims]
         )
-        conv_cat_total_dim = embedding_dim * len(conv_cat_dims)
-        conv_total_input_dim = conv_cont_dim + conv_cat_total_dim
-        self.conv_encoder = nn.Sequential(
-            nn.Linear(conv_total_input_dim, 64),
-            nn.ReLU(),
-            ResidualBlock(64, 32),
-            nn.ReLU(),
         )
-        # Fusion + Classifier
         self.classifier = nn.Sequential(
-            nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, num_classes)
         )
     def forward(self, lab_cont, lab_cat, conv_cont, conv_cat):
-        # Process lab categorical features
-        if self.lab_cat_embeds:
-            lab_cat_embeds = [
-                embed(lab_cat[:, i]) for i, embed in enumerate(self.lab_cat_embeds)
             ]
-            lab_cat_encoded = torch.cat(lab_cat_embeds, dim=1)
-        else:
-            lab_cat_encoded = torch.empty((lab_cont.size(0), 0), device=lab_cont.device)
-        lab_input = torch.cat([lab_cont, lab_cat_encoded], dim=1)
-        lab_repr = self.lab_encoder(lab_input)
-        # Process conversation categorical features
-        if self.conv_cat_embeds:
-            conv_cat_embeds = [
-                embed(conv_cat[:, i]) for i, embed in enumerate(self.conv_cat_embeds)
             ]
-            conv_cat_encoded = torch.cat(conv_cat_embeds, dim=1)
-        else:
-            conv_cat_encoded = torch.empty(
-                (conv_cont.size(0), 0), device=conv_cont.device
-            )
-        conv_input = torch.cat([conv_cont, conv_cat_encoded], dim=1)
-        conv_repr = self.conv_encoder(conv_input)
-        # Concatenate and classify
-        fused = torch.cat([lab_repr, conv_repr], dim=1)
-        output = self.classifier(fused)
-        return output

 import torch
 import torch.nn as nn
+import pytorch_lightning as pl
 class ResidualBlock(nn.Module):
+    def __init__(self, in_features, out_features, dropout=0.2):
         super().__init__()
+        self.fc1 = nn.Linear(in_features, out_features)
         self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(dropout)
+        self.fc2 = nn.Linear(out_features, out_features)
     def forward(self, x):
+        residual = x
+        out = self.fc1(x)
+        out = self.relu(out)
+        out = self.dropout(out)
+        out = self.fc2(out)
+        if residual.shape == out.shape:
+            out += residual
+        return out
+class DualEncoderModel(pl.LightningModule):
     def __init__(
         self,
         lab_cont_dim,
         conv_cat_dims,
         embedding_dim,
         num_classes,
+        lr=1e-3,
     ):
         super().__init__()
+        self.save_hyperparameters()
+        self.lab_cont_encoder = (
+            nn.Sequential(ResidualBlock(lab_cont_dim, 64), ResidualBlock(64, 64))
+            if lab_cont_dim > 0
+            else None
         )
+        self.lab_cat_embeddings = nn.ModuleList(
+            [nn.Embedding(dim + 1, embedding_dim) for dim in lab_cat_dims]
         )
+        self.conv_cont_encoder = (
+            nn.Sequential(ResidualBlock(conv_cont_dim, 64), ResidualBlock(64, 64))
+            if conv_cont_dim > 0
+            else None
         )
+        self.conv_cat_embeddings = nn.ModuleList(
+            [nn.Embedding(dim + 1, embedding_dim) for dim in conv_cat_dims]
         )
+        total_dim = 0
+        if self.lab_cont_encoder:
+            total_dim += 64
+        if lab_cat_dims:
+            total_dim += embedding_dim * len(lab_cat_dims)
+        if self.conv_cont_encoder:
+            total_dim += 64
+        if conv_cat_dims:
+            total_dim += embedding_dim * len(conv_cat_dims)
         self.classifier = nn.Sequential(
+            nn.Linear(total_dim, 128),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(128, num_classes),
         )
     def forward(self, lab_cont, lab_cat, conv_cont, conv_cat):
+        embeddings = []
+        if self.lab_cont_encoder and lab_cont.nelement() > 0:
+            embeddings.append(self.lab_cont_encoder(lab_cont))
+        embeddings.extend(
+            [
+                emb(torch.clamp(lab_cat[:, i], min=0))
+                for i, emb in enumerate(self.lab_cat_embeddings)
             ]
+        )
+        if self.conv_cont_encoder and conv_cont.nelement() > 0:
+            embeddings.append(self.conv_cont_encoder(conv_cont))
+        embeddings.extend(
+            [
+                emb(torch.clamp(conv_cat[:, i], min=0))
+                for i, emb in enumerate(self.conv_cat_embeddings)
             ]
+        )
+        fused = torch.cat(embeddings, dim=1)
+        return self.classifier(fused)

augmented_lab_data.csv → model/augmented_lab_data.csv RENAMED Viewed

File without changes

dual_encoder_model.pth → model/dual_encoder_model.pth RENAMED Viewed

File without changes

requirements.txt CHANGED Viewed

@@ -5,4 +5,6 @@ gradio
 langchain
 openai
 faiss-cpu
-langchain-community

 langchain
 openai
 faiss-cpu
+langchain-community>=0.0.3
+pytorch-lightning

utils.py CHANGED Viewed

@@ -1,99 +1,40 @@
-import torch
-import torch.nn as nn
-import pdfplumber
 import re
 import pandas as pd
-import torch.nn.functional as F
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import FAISS
-from langchain.embeddings import OpenAIEmbeddings
-from langchain.chains import RetrievalQA
-from langchain.chat_models import ChatOpenAI
-# ---- PDF Extraction ----
 def extract_text_from_pdf(pdf_path):
-    """
-    Extract text from a PDF file.
-    Args:
-        pdf_path (str): Path to the PDF file.
-    Returns:
-        str: Extracted text from the PDF.
-    """
-    try:
-        with pdfplumber.open(pdf_path) as pdf:
-            return "\n".join(
-                page.extract_text() for page in pdf.pages if page.extract_text()
-            )
-    except Exception as e:
-        print(f"Error extracting text from PDF: {e}")
-        return ""
-def extract_lab_tests_dict(response_text):
-    """
-    Extract lab test names and values from the response text.
-    Args:
-        response_text (str): The text containing lab test information.
-    Returns:
-        dict: A dictionary where keys are lab test names and values are their corresponding numeric values.
-    """
-    pattern = r"[-•]?\s*([\w\s/()%.-]+?):\s*([\d.,-]+)\s*(\w+/?.*)?"
-    matches = re.findall(pattern, response_text)
     lab_dict = {}
     for test, value, unit in matches:
         test = test.strip()
         try:
-            lab_dict[test] = float(value.replace(",", ""))  # Handle commas in numbers
         except ValueError:
-            continue  # Skip invalid values
     return lab_dict
 def prepare_lab_tensor(lab_data, feature_list):
-    """
-    Prepare a tensor for the lab data to be fed into the model.
-    Args:
-        lab_data (dict): A dictionary of lab test names and their values.
-        feature_list (list): A list of expected lab test names.
-    Returns:
-        torch.Tensor: A tensor containing the lab values.
-    """
     values = [lab_data.get(feature, -1) for feature in feature_list]
     return torch.tensor([values], dtype=torch.float32)
-# ---- Updated ICD Mapping Loader ----
 def load_icd_mapping(csv_path):
-    """
-    Load and process the ICD mapping from a CSV file.
-    Args:
-        csv_path (str): Path to the CSV file containing ICD mappings.
-    Returns:
-        dict: A dictionary mapping ICD labels to (ICD Code, ICD Label, Diagnosis).
-    """
-    try:
-        df = pd.read_csv(csv_path)
-        # Defensive check for required columns
-        if not {"ICD_Label", "ICD Code", "Diagnosis"}.issubset(df.columns):
-            raise ValueError(
-                "Required columns missing from CSV: ICD_Label, ICD Code, Diagnosis"
-            )
-        df = df.drop_duplicates(subset="ICD_Label")
-        return {
-            int(row["ICD_Label"]): (row["ICD Code"], row["ICD_Label"], row["Diagnosis"])
-            for _, row in df.iterrows()
-        }
-    except Exception as e:
-        print(f"Error loading ICD mapping: {e}")
-        return {}

 import re
+import pdfplumber
 import pandas as pd
+import torch
 def extract_text_from_pdf(pdf_path):
+    with pdfplumber.open(pdf_path) as pdf:
+        return "\n".join(
+            page.extract_text() for page in pdf.pages if page.extract_text()
+        )
def extract_lab_tests_dict(text):
    """Parse ``Test: Value Unit`` entries into a ``{test name: value}`` dict.

    Args:
        text: Free text containing entries such as ``"ALT (SGPT): 45 U/L"``,
            optionally prefixed with a ``-`` or ``•`` bullet.

    Returns:
        dict: Test name (whitespace and bullet stripped) mapped to its value
        as a float. Entries whose value cannot be converted are skipped; a
        test name seen more than once keeps its last value.
    """
    # The optional bullet may follow leading whitespace or a newline, so it is
    # consumed here rather than leaking into the captured test name.
    # The value accepts a leading minus sign and commas; commas are treated as
    # thousands separators ("1,200" -> 1200.0), not as decimal commas.
    pattern = r"(?:\s*[-•])?\s*([\w\s/()%.-]+?):\s*(-?[\d.,]+)\s*(\w+/?.*)?"
    lab_dict = {}
    for test, value, _unit in re.findall(pattern, text):
        try:
            lab_dict[test.strip()] = float(value.replace(",", ""))
        except ValueError:
            # Not a single number (e.g. "1.2.3" or a lone "."): ignore it.
            continue
    return lab_dict
 def prepare_lab_tensor(lab_data, feature_list):
     values = [lab_data.get(feature, -1) for feature in feature_list]
     return torch.tensor([values], dtype=torch.float32)
 def load_icd_mapping(csv_path):
+    df = pd.read_csv(csv_path)
+    if not {"ICD_Label", "ICD Code", "Diagnosis"}.issubset(df.columns):
+        raise ValueError("CSV must include ICD_Label, ICD Code, Diagnosis columns.")
+    df = df.drop_duplicates(subset="ICD_Label")
+    return {
+        int(row["ICD_Label"]): (row["ICD Code"], row["ICD_Label"], row["Diagnosis"])
+        for _, row in df.iterrows()
+    }