import os
import json
import numpy as np
import faiss
import gradio as gr
from PyPDF2 import PdfReader
import re
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
# Load the knowledge base
with open("knowledge_base.json", "r") as file:
    kb = json.load(file)
# Authenticate with Hugging Face (non-interactive; reads the token from the environment)
login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN"))
# Initialize the embedding model and FAISS index
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
kb_texts = [f"{item['Component']} {item['Range']} {item['Advice']}" for item in kb]
kb_embeddings = embedding_model.encode(kb_texts)
kb_embeddings = np.array(kb_embeddings, dtype="float32")
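# IndexFlatL2 performs exact (brute-force) nearest-neighbor search with L2 distance,
# which is perfectly adequate for a small knowledge base like this one.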
index = faiss.IndexFlatL2(kb_embeddings.shape[1])
index.add(kb_embeddings)
# Load the Hugging Face LLM (LLaMA)
llama_model_name = "meta-llama/Llama-3.2-3B-Instruct"
API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(llama_model_name, token=API_TOKEN)
llm = AutoModelForCausalLM.from_pretrained(llama_model_name, token=API_TOKEN)
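# Note: Llama-3.2-3B-Instruct is a gated model, so the token must belong to an account
# that has been granted access. By default from_pretrained loads the weights in full
# precision on CPU; pass torch_dtype / device_map to from_pretrained if a GPU or lower
# memory usage is needed.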
# Generate advice using FAISS + LLM
def generate_advice(extracted_data):
    try:
        recommendations = []
        for item in extracted_data:
            query = f"{item['Component']} {item['Status']}"
            print(f"Processing Query: {query}")  # Debugging step
            # Generate the query embedding as float32 (FAISS expects float32)
            query_embedding = embedding_model.encode([query])
            query_embedding = np.array(query_embedding, dtype="float32").reshape(1, -1)
            # Search FAISS for the closest knowledge-base entry
            _, idx = index.search(query_embedding, 1)
            best_match = kb[idx[0][0]]
            # Prepare LLM prompt
            role = "Medical expert providing advice based on lab results."
            prompt = f"""
            Lab Test: {item['Component']}
            Value: {item['Value']} {item['Units']}
            Status: {item['Status']}
            Medical Guidelines: {best_match['Advice']}
            Provide additional insights or recommendations.
            """
            # Generate advice with LLaMA
            messages = [
                {"role": "system", "content": role},
                {"role": "user", "content": prompt},
            ]
            # apply_chat_template with tokenize=True returns the input_ids tensor directly
            input_ids = tokenizer.apply_chat_template(
                messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
            )
            output = llm.generate(
                input_ids=input_ids,
                max_new_tokens=150,
                num_return_sequences=1,
            )
            # Decode only the newly generated tokens, not the echoed prompt
            advice = tokenizer.decode(
                output[0][input_ids.shape[-1]:], skip_special_tokens=True
            ).strip()
            recommendations.append({"Component": item["Component"], "Advice": advice})
        return recommendations
    except Exception as e:
        print(f"Error: {e}")  # Debugging any unexpected issues
        return [{"error": f"Exception occurred: {str(e)}"}]
# Extract structured data from the PDF
def pdf_to_text(pdf_file):
    try:
        # gr.File with type="filepath" passes the uploaded file's path as a string
        reader = PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""  # extract_text() may return None
        # Regex to extract structured lab results:
        # component, value, reference min, reference max, units, status
        pattern = r"(\w+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\w/%]+)\s+(\w+)"
        matches = re.findall(pattern, text)
        # Structure data into a list of dictionaries
        if matches:
            results = [
                {"Component": m[0], "Value": float(m[1]), "Min": float(m[2]), "Max": float(m[3]), "Units": m[4], "Status": m[5]}
                for m in matches
            ]
            return results
        else:
            return "No structured data found in the PDF."
    except Exception as e:
        return f"Error: {e}"
# Gradio Interface
def main():
    with gr.Blocks() as app:
        gr.Markdown("## Medical Test Interpreter with RAG and LLM")
        with gr.Row():
            pdf_input = gr.File(label="Upload PDF", type="filepath")
            structured_data = gr.JSON(label="Extracted Structured Data")
            advice_output = gr.JSON(label="Generated Advice")
        extract_button = gr.Button("Extract Data")
        interpret_button = gr.Button("Get Advice")
        extract_button.click(pdf_to_text, inputs=pdf_input, outputs=structured_data)
        interpret_button.click(generate_advice, inputs=structured_data, outputs=advice_output)
    app.launch()
# Run the app
if __name__ == "__main__":
    main()