# PDFtoText / app.py
# Author: goldrode — commit f1125fe (verified): "Update app.py"
import os
import json
import numpy as np
import faiss
import gradio as gr
from PyPDF2 import PdfReader
import re
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
# --- Knowledge base -----------------------------------------------------------
# Each KB entry is expected to carry 'Component', 'Range', and 'Advice' keys
# (see kb_texts below) — TODO confirm against knowledge_base.json.
with open("knowledge_base.json", "r") as file:
    kb = json.load(file)

# Hugging Face auth: read the token from the environment once and pass it
# explicitly to from_pretrained below. The previous interactive
# `os.system("huggingface-cli login")` call blocks (or fails) in a
# non-interactive deployment and is redundant when a token is supplied.
API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# --- Embeddings + FAISS index over the knowledge base -------------------------
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
kb_texts = [f"{item['Component']} {item['Range']} {item['Advice']}" for item in kb]
# FAISS requires contiguous float32 vectors.
kb_embeddings = np.array(embedding_model.encode(kb_texts), dtype="float32")
index = faiss.IndexFlatL2(kb_embeddings.shape[1])  # exact L2 search
index.add(kb_embeddings)

# --- Generator LLM (gated model: token required) ------------------------------
llama_model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(llama_model_name, token=API_TOKEN)
llm = AutoModelForCausalLM.from_pretrained(llama_model_name, token=API_TOKEN)
# Generate advice using FAISS + LLM
def generate_advice(extracted_data):
    """Generate per-component advice via FAISS retrieval + the LLaMA model.

    Args:
        extracted_data: list of dicts produced by ``pdf_to_text`` with keys
            'Component', 'Value', 'Units', 'Status' (a plain string means
            extraction failed upstream and is reported as an error).

    Returns:
        A list of ``{"Component": ..., "Advice": ...}`` dicts, or a
        single-element ``[{"error": ...}]`` list on failure.
    """
    # pdf_to_text returns a string (not a list) when extraction fails;
    # surface that instead of iterating over its characters.
    if not isinstance(extracted_data, list):
        return [{"error": f"No structured data to interpret: {extracted_data}"}]
    try:
        recommendations = []
        for item in extracted_data:
            query = f"{item['Component']} {item['Status']}"
            print(f"Processing Query: {query}")  # Debugging step
            # FAISS requires a 2-D float32 array: shape (1, dim).
            query_embedding = np.array(
                embedding_model.encode([query]), dtype="float32"
            ).reshape(1, -1)
            # Retrieve the single closest knowledge-base entry.
            _, idx = index.search(query_embedding, 1)
            best_match = kb[idx[0][0]]
            # Prepare LLM prompt
            role = "Medical expert providing advice based on lab results."
            prompt = f"""
            Lab Test: {item['Component']}
            Value: {item['Value']} {item['Units']}
            Status: {item['Status']}
            Medical Guidelines: {best_match['Advice']}
            Provide additional insights or recommendations.
            """
            message = [
                {"role": "system", "content": role},
                {"role": "user", "content": prompt},
            ]
            # With tokenize=True and return_tensors="pt",
            # apply_chat_template returns the input_ids tensor directly
            # (NOT a dict, so the old input_text["input_ids"] raised TypeError).
            input_ids = tokenizer.apply_chat_template(
                message, tokenize=True, add_generation_prompt=True, return_tensors="pt"
            )
            # max_new_tokens bounds only the generated continuation;
            # max_length=150 counted the prompt too and truncated output.
            output = llm.generate(
                input_ids=input_ids,
                max_new_tokens=150,
                num_return_sequences=1,
            )
            # Decode only the newly generated tokens — output[0] also
            # contains the echoed prompt, which is not advice.
            advice = tokenizer.decode(
                output[0][input_ids.shape[-1]:], skip_special_tokens=True
            ).strip()
            recommendations.append({"Component": item["Component"], "Advice": advice})
        return recommendations
    except Exception as e:
        # Top-level boundary for the Gradio callback: report, don't crash.
        print(f"Error: {e}")  # Debugging any unexpected issues
        return [{"error": f"Exception occurred: {str(e)}"}]
# Extract structured data from the PDF
def pdf_to_text(pdf_file):
    """Extract structured lab results from an uploaded PDF.

    Args:
        pdf_file: path string (gr.File with type="filepath" passes a str)
            or a file-like object exposing ``.name``.

    Returns:
        A list of dicts with keys Component/Value/Min/Max/Units/Status,
        or a string describing the failure.
    """
    try:
        # gr.File(type="filepath") hands the callback a plain path string;
        # the old unconditional `pdf_file.name` raised AttributeError on it.
        path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        reader = PdfReader(path)
        # extract_text() may return None for image-only pages; treat as "".
        text = "".join(page.extract_text() or "" for page in reader.pages)
        # Regex to extract rows shaped like:
        #   <name> <value> <min> <max> <units> <status>
        pattern = r"(\w+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\w/%]+)\s+(\w+)"
        matches = re.findall(pattern, text)
        # Structure data into a list of dictionaries
        if matches:
            return [
                {
                    "Component": m[0],
                    "Value": float(m[1]),
                    "Min": float(m[2]),
                    "Max": float(m[3]),
                    "Units": m[4],
                    "Status": m[5],
                }
                for m in matches
            ]
        return "No structured data found in the PDF."
    except Exception as e:
        # Boundary for the Gradio callback: return the error as text.
        return f"Error: {e}"
# Gradio Interface
def main():
    """Build and launch the Gradio UI: upload a PDF, extract lab values,
    then ask the RAG+LLM pipeline for advice on each component."""
    with gr.Blocks() as app:
        gr.Markdown("## Medical Test Interpreter with RAG and LLM")

        # Input + two JSON panes (raw extraction, generated advice).
        with gr.Row():
            pdf_input = gr.File(label="Upload PDF", type="filepath")
            structured_data = gr.JSON(label="Extracted Structured Data")
            advice_output = gr.JSON(label="Generated Advice")

        extract_button = gr.Button("Extract Data")
        interpret_button = gr.Button("Get Advice")

        # Stage 1: PDF -> structured rows; Stage 2: rows -> advice.
        extract_button.click(
            pdf_to_text, inputs=pdf_input, outputs=structured_data
        )
        interpret_button.click(
            generate_advice, inputs=structured_data, outputs=advice_output
        )

    app.launch()
# Run the app
if __name__ == "__main__":
main()