File size: 5,007 Bytes
f4e6fe7
f38d135
f4e6fe7
f3745ec
380f9f6
f3745ec
 
 
380f9f6
de56d6b
f3745ec
de56d6b
1c5ae39
de56d6b
 
 
1c5ae39
380f9f6
 
 
1e007e5
 
 
380f9f6
 
cab3630
fba8c3b
f38d135
43a83e7
f38d135
f3745ec
de56d6b
 
 
 
 
 
 
 
 
 
 
f38d135
de56d6b
 
f38d135
 
 
 
 
f3745ec
f38d135
f3745ec
 
 
 
 
 
 
 
de56d6b
 
 
 
 
 
 
f3745ec
 
 
de56d6b
f3745ec
 
 
380f9f6
f31ea58
380f9f6
de56d6b
380f9f6
f31ea58
380f9f6
e3c8421
380f9f6
e3c8421
 
 
 
f31ea58
e3c8421
de56d6b
f31ea58
f3745ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f31ea58
f3745ec
380f9f6
f3745ec
 
 
de56d6b
f3745ec
de56d6b
f3745ec
 
 
f4e6fe7
de56d6b
70a4f0d
f3745ec
de56d6b
 
f4e6fe7
 
 
f3745ec
de56d6b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import time
import gradio as gr
import re
import requests
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import PyPDF2

# Install required tools
# NOTE(review): `apt-get update` only refreshes package lists and nothing is
# installed afterwards, so this call appears vestigial — confirm and remove.
os.system("apt-get update")

# Ensure Hugging Face Authentication
# NOTE(review): `huggingface-cli login` is interactive and will block in a
# non-TTY environment; the token passed explicitly below should make this
# unnecessary — verify before removing.
os.system("huggingface-cli login")

# Load the Llama-3.2-3B-Instruct model and tokenizer
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
print("Loading Llama model...")
# Token is read from the environment and forwarded to from_pretrained for
# access to the gated Llama repository.
API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=API_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=API_TOKEN)

# Load the encoder model for FAISS
# (the same MiniLM embeddings are used both to build and to query the index)
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Paths for required files
INDEX_PATH = "blood_test_index.faiss"  # persisted FAISS index (built on first run)
CSV_PATH = "rag_documents.csv"  # RAG source documents; must have a 'Content' column

def generate_faiss_index():
    """Build and persist a flat-L2 FAISS index over the RAG documents.

    Reads CSV_PATH, encodes every row of its 'Content' column with the
    module-level sentence-transformer encoder, and writes the resulting
    index to INDEX_PATH.

    Raises:
        ValueError: if the CSV has no 'Content' column.
    """
    documents = pd.read_csv(CSV_PATH)
    if "Content" not in documents.columns:
        raise ValueError("The CSV file must contain a 'Content' column.")
    print("Encoding sentences...")
    vectors = encoder.encode(documents["Content"].tolist()).astype("float32")
    print("Creating FAISS index...")
    # Exact (non-quantized) L2 index; dimensionality comes from the encoder.
    faiss_index = faiss.IndexFlatL2(vectors.shape[1])
    faiss_index.add(vectors)
    faiss.write_index(faiss_index, INDEX_PATH)
    print("FAISS index generated successfully!")

# Build the FAISS index on first run; later runs reuse the persisted file.
if not os.path.exists(INDEX_PATH):
    print("Generating FAISS index...")
    if not os.path.exists(CSV_PATH):
        raise FileNotFoundError(f"The required file '{CSV_PATH}' is missing!")
    generate_faiss_index()

print("Loading FAISS index...")
# Module-level globals consumed by retrieve_context()/analyze_blood_report().
index = faiss.read_index(INDEX_PATH)
rag_df = pd.read_csv(CSV_PATH)

def load_thresholds(file_path="blood_test_thresholds.csv"):
    """Load per-parameter reference ranges from a thresholds CSV.

    Args:
        file_path: Path to a CSV with columns 'Parameter', 'Low',
            'High', and 'Unit'.

    Returns:
        dict mapping each parameter name to
        {"low": <number>, "high": <number>, "unit": <str>}.

    Raises:
        ValueError: if any required column is missing (mirrors the
            column validation done for the RAG documents CSV).
    """
    df = pd.read_csv(file_path)
    # Validate up front so a malformed CSV fails with a clear message
    # instead of a KeyError mid-iteration.
    required = {"Parameter", "Low", "High", "Unit"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Thresholds CSV is missing columns: {sorted(missing)}")
    return {
        row["Parameter"]: {"low": row["Low"], "high": row["High"], "unit": row["Unit"]}
        for _, row in df.iterrows()
    }

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages for which PyPDF2 yields no text (extract_text() returns None)
    contribute an empty string.
    """
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts)

def retrieve_context(query, index, document_df):
    """Return the three nearest document snippets for *query*, space-joined.

    The query is embedded with the module-level sentence-transformer
    encoder and searched against the supplied FAISS index; matching rows
    are looked up by position in *document_df*.
    """
    embedded_query = encoder.encode([query]).astype("float32")
    _, neighbor_ids = index.search(embedded_query, k=3)
    snippets = []
    for doc_position in neighbor_ids[0]:
        snippets.append(document_df.iloc[doc_position]["Content"])
    return " ".join(snippets)

def generate_response_with_llama(flagged_abnormality, context):
    """Generate medical advice for one flagged abnormality with the Llama model.

    Args:
        flagged_abnormality: Human-readable status, e.g. "Low (10.2 g/dL)".
        context: Retrieved RAG context; only the first 300 characters are
            included to keep the prompt short.

    Returns:
        The decoded output string (includes the prompt text, since the whole
        generated sequence is decoded).
    """
    prompt = (
        f"Flagged Abnormality: {flagged_abnormality}\n"
        f"Context: {context[:300]}.\n"
        f"Provide specific and actionable medical advice for the abnormality:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        # Bug fix: max_length counts prompt tokens too — with a prompt of up
        # to 512 tokens, max_length=150 could leave no room for a reply.
        # max_new_tokens bounds only the generated continuation.
        max_new_tokens=150,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # Bug fix: transformers ignores top_p/temperature under the default
        # greedy decoding; do_sample=True is required for them to take effect.
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Matches lines like "Hemoglobin : 13.5 g/dL" -> (parameter, numeric value, unit).
# Compiled once at module level instead of being re-passed as a string per line.
_RESULT_PATTERN = re.compile(r"([A-Za-z\s\#]+)\s*[:\-]?\s*([\d.]+)\s*([a-zA-Z/\s]*)")


def _parse_blood_values(extracted_text):
    """Parse 'parameter value unit' lines into {param: {"value", "unit"}}."""
    blood_data = {}
    for line in extracted_text.split("\n"):
        line = line.strip()
        if not line:
            continue
        match = _RESULT_PATTERN.search(line)
        if not match:
            continue
        param = match.group(1).strip()
        try:
            value = float(match.group(2).strip())
        except ValueError:
            # Number-looking token that isn't a valid float (e.g. "..") — skip.
            continue
        blood_data[param] = {"value": value, "unit": match.group(3).strip()}
    return blood_data


def _flag_abnormalities(blood_data, thresholds):
    """Return {param: "Low (...)"/"High (...)"} for out-of-range values."""
    flagged = {}
    for parameter, data in blood_data.items():
        limits = thresholds.get(parameter)
        if limits is None:
            # Parameters without a known reference range are ignored.
            continue
        value, unit = data["value"], data["unit"]
        if value < limits["low"]:
            flagged[parameter] = f"Low ({value} {unit})"
        elif value > limits["high"]:
            flagged[parameter] = f"High ({value} {unit})"
    return flagged


def analyze_blood_report(extracted_text, thresholds):
    """Analyze raw report text against reference ranges.

    Args:
        extracted_text: Plain text of the blood report (one result per line).
        thresholds: Mapping as produced by load_thresholds().

    Returns:
        Tuple (flagged, recommendations): flagged maps parameter name to a
        status string ("Low (...)"/"High (...)"); recommendations maps the
        same parameters to advice generated via RAG + Llama.
    """
    blood_data = _parse_blood_values(extracted_text)
    flagged = _flag_abnormalities(blood_data, thresholds)

    recommendations = {}
    for param, status in flagged.items():
        query = f"The blood test result for {param} is {status}."
        context = retrieve_context(query, index, rag_df)
        recommendations[param] = generate_response_with_llama(flagged[param], context)

    return flagged, recommendations

def process_pdf(pdf_path):
    """End-to-end pipeline: PDF path -> flagged abnormalities + advice dict."""
    thresholds = load_thresholds()
    report_text = extract_text_from_pdf(pdf_path)
    abnormalities, advice = analyze_blood_report(report_text, thresholds)
    return {"Flagged Abnormalities": abnormalities, "Recommendations": advice}

# Gradio wiring: a single file-upload input, analysis returned as JSON.
interface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(type="filepath", label="Upload Blood Test PDF"),
    outputs="json",
    title="Blood Test Analyzer (PDF)",
    description="Upload a PDF blood test report. This tool extracts data, flags abnormalities, and provides medical recommendations."
)

if __name__ == "__main__":
    # share=True additionally exposes a temporary public Gradio link
    # alongside the local server.
    interface.launch(share=True)