|
|
import os |
|
|
import json |
|
|
import numpy as np |
|
|
import faiss |
|
|
import gradio as gr |
|
|
from PyPDF2 import PdfReader |
|
|
import re |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
|
|
|
|
|
|
# --- Knowledge base ---------------------------------------------------------
# Each KB entry is expected to carry "Component", "Range", and "Advice" keys
# (used below to build the retrieval corpus).
with open("knowledge_base.json", "r", encoding="utf-8") as file:
    kb = json.load(file)

# --- Hugging Face authentication --------------------------------------------
# Read the token first and only fall back to the interactive CLI login when no
# token is configured. (The original ran `huggingface-cli login` unconditionally,
# which blocks headless/CI runs even when a token is already available.)
API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not API_TOKEN:
    os.system("huggingface-cli login")

# --- Embeddings + FAISS retrieval index --------------------------------------
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# One retrieval document per KB entry: component name, reference range, advice.
kb_texts = [f"{item['Component']} {item['Range']} {item['Advice']}" for item in kb]
# FAISS requires float32 input.
kb_embeddings = np.array(embedding_model.encode(kb_texts), dtype="float32")

# Exact L2 (Euclidean) nearest-neighbour index; dimensionality comes from the
# encoder output width.
index = faiss.IndexFlatL2(kb_embeddings.shape[1])
index.add(kb_embeddings)

# --- Generative model ---------------------------------------------------------
llama_model_name = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(llama_model_name, token=API_TOKEN)
llm = AutoModelForCausalLM.from_pretrained(llama_model_name, token=API_TOKEN)
|
|
|
|
|
|
|
|
def generate_advice(extracted_data):
    """Generate per-test medical advice for a list of extracted lab results.

    For each item, the nearest knowledge-base entry is retrieved via the FAISS
    index and combined with the lab values into a chat prompt for the LLM.

    Args:
        extracted_data: list of dicts with "Component", "Value", "Units",
            and "Status" keys (as produced by ``pdf_to_text``).

    Returns:
        list of ``{"Component": ..., "Advice": ...}`` dicts on success, or a
        single-element ``[{"error": ...}]`` list when anything fails.
    """
    try:
        recommendations = []

        for item in extracted_data:
            query = f"{item['Component']} {item['Status']}"
            print(f"Processing Query: {query}")

            # Embed the query and retrieve the single closest KB entry.
            query_embedding = embedding_model.encode([query])
            query_embedding = np.array(query_embedding, dtype="float32").reshape(1, -1)
            _, idx = index.search(query_embedding, 1)
            best_match = kb[idx[0][0]]

            role = "Medical expert providing advice based on lab results."
            prompt = f"""
Lab Test: {item['Component']}
Value: {item['Value']} {item['Units']}
Status: {item['Status']}

Medical Guidelines: {best_match['Advice']}

Provide additional insights or recommendations.
"""

            message = [
                {"role": "system", "content": role},
                {"role": "user", "content": prompt},
            ]

            # return_dict=True is required to index the result with
            # ["input_ids"]: without it apply_chat_template returns a bare
            # tensor and the original ["input_ids"] lookup raised at runtime.
            inputs = tokenizer.apply_chat_template(
                message,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                return_dict=True,
            )

            # max_new_tokens (not max_length) so the generation budget applies
            # to the new text regardless of prompt length; max_length=150
            # could be consumed entirely by the prompt itself.
            output = llm.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=150,
                num_return_sequences=1,
            )

            # Decode only the newly generated tokens; output[0] also contains
            # the echoed prompt, which is not advice.
            prompt_len = inputs["input_ids"].shape[-1]
            advice = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()
            recommendations.append({"Component": item["Component"], "Advice": advice})

        return recommendations

    except Exception as e:
        print(f"Error: {e}")
        return [{"error": f"Exception occurred: {str(e)}"}]
|
|
|
|
|
|
|
|
|
|
|
def pdf_to_text(pdf_file):
    """Extract structured lab-result rows from an uploaded PDF.

    Args:
        pdf_file: path to the PDF as a plain string (``gr.File`` with
            ``type="filepath"`` passes a str), or any object exposing a
            ``.name`` path attribute.

    Returns:
        list of dicts with Component/Value/Min/Max/Units/Status keys when
        rows are found; otherwise an explanatory string.
    """
    try:
        # gr.File(type="filepath") hands us a plain string path; the original
        # unconditionally dereferenced .name, which raises AttributeError on str.
        path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        reader = PdfReader(path)

        # extract_text() may return None for pages without a text layer;
        # the original `text += None` raised TypeError in that case.
        text = "".join(page.extract_text() or "" for page in reader.pages)

        # Expected row layout: NAME VALUE MIN MAX UNITS STATUS
        pattern = r"(\w+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\w/%]+)\s+(\w+)"
        matches = re.findall(pattern, text)

        if not matches:
            return "No structured data found in the PDF."

        return [
            {
                "Component": m[0],
                "Value": float(m[1]),
                "Min": float(m[2]),
                "Max": float(m[3]),
                "Units": m[4],
                "Status": m[5],
            }
            for m in matches
        ]

    except Exception as e:
        return f"Error: {e}"
|
|
|
|
|
|
|
|
def main():
    """Assemble the Gradio front-end and start serving it."""
    with gr.Blocks() as demo:
        gr.Markdown("## Medical Test Interpreter with RAG and LLM")

        with gr.Row():
            upload_box = gr.File(label="Upload PDF", type="filepath")
            parsed_json = gr.JSON(label="Extracted Structured Data")

        advice_json = gr.JSON(label="Generated Advice")
        btn_extract = gr.Button("Extract Data")
        btn_advise = gr.Button("Get Advice")

        # Wire buttons: PDF -> structured rows, structured rows -> advice.
        btn_extract.click(pdf_to_text, inputs=upload_box, outputs=parsed_json)
        btn_advise.click(generate_advice, inputs=parsed_json, outputs=advice_json)

    demo.launch()


if __name__ == "__main__":
    main()