Update app.py
app.py
CHANGED
@@ -1,59 +1,62 @@
+import os
 import json
 import numpy as np
-import os
 import faiss
+import gradio as gr
+from PyPDF2 import PdfReader
+import re
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
 
 # Load the knowledge base
 with open("knowledge_base.json", "r") as file:
     kb = json.load(file)
 
+# Authenticate with Hugging Face
 os.system("huggingface-cli login")
-
+
+# Initialize the embedding model and FAISS index
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 kb_texts = [f"{item['Component']} {item['Range']} {item['Advice']}" for item in kb]
 kb_embeddings = embedding_model.encode(kb_texts)
+kb_embeddings = np.array(kb_embeddings, dtype="float32")  # FAISS expects float32
 
-# Create FAISS index
 index = faiss.IndexFlatL2(kb_embeddings.shape[1])
 index.add(kb_embeddings)
 
-# Load the Hugging Face LLM
+# Load the Hugging Face LLM (LLaMA)
 llama_model_name = "meta-llama/Llama-3.2-3B-Instruct"
 API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
 tokenizer = AutoTokenizer.from_pretrained(llama_model_name, token=API_TOKEN)
 llm = AutoModelForCausalLM.from_pretrained(llama_model_name, token=API_TOKEN)
 
-# Generate advice using
+# Generate advice using FAISS + LLM
 def generate_advice(extracted_data):
     try:
         recommendations = []
 
         for item in extracted_data:
-            # Validate input keys
-            if not all(k in item for k in ["Component", "Status"]):
-                raise ValueError("Each input item must have 'Component' and 'Status' keys.")
-
-            # Prepare the query string
             query = f"{item['Component']} {item['Status']}"
-            print(f"
+            print(f"Query: {query}")  # Debugging step
 
             # Generate query embedding
             query_embedding = embedding_model.encode([query])
             query_embedding = np.array(query_embedding, dtype="float32").reshape(1, -1)
 
-            #
+            # Validate embedding shape
+            if query_embedding.shape[1] != index.d:
+                raise ValueError(f"Embedding dimension mismatch: FAISS expects {index.d}, but got {query_embedding.shape[1]}")
+
+            # Search FAISS for the closest match
             _, idx = index.search(query_embedding, 1)
             best_match = kb[idx[0][0]]
 
-            #
+            # LLM prompt
             role = "Medical expert providing advice based on lab results."
             prompt = f"""
             Lab Test: {item['Component']}
-            Value: {item
+            Value: {item['Value']} {item['Units']}
             Status: {item['Status']}
 
             Medical Guidelines: {best_match['Advice']}
@@ -61,49 +64,31 @@ def generate_advice(extracted_data):
             Provide additional insights or recommendations.
             """
 
-            #
-
+            # Generate advice
+            message = [
                 {"role": "system", "content": role},
                 {"role": "user", "content": prompt},
             ]
 
-
-
-                message_yours,
-                tokenize=True,  # Must tokenize to return input_ids
-                add_generation_prompt=True,
-                return_tensors="pt",
+            # tokenize=True so generate() receives input_ids (a tensor), not a raw string
+            input_text = tokenizer.apply_chat_template(
+                message, tokenize=True, add_generation_prompt=True, return_tensors="pt"
             )
 
-            # Move tensor to appropriate device (CPU/GPU)
-            input_text_with_your_role = input_text_with_your_role.to(torch.device("cpu"))
-
-            # Generate advice
             output = llm.generate(
-                input_ids=
+                input_ids=input_text,
                 max_new_tokens=150,  # was max_length=150, which a long prompt would overrun
                 num_return_sequences=1
             )
-            advice = tokenizer.decode(output[0], skip_special_tokens=True).strip()
 
-
-            recommendations.append({
-                "Component": item["Component"],
-                "Advice": advice
-            })
+            # Decode only the newly generated tokens, not the echoed prompt
+            advice = tokenizer.decode(output[0][input_text.shape[1]:], skip_special_tokens=True).strip()
+            recommendations.append({"Component": item["Component"], "Advice": advice})
 
         return recommendations
 
     except Exception as e:
-        print(f"Error occurred: {str(e)}")
        return [{"error": f"Exception occurred: {str(e)}"}]
 
-#
-import gradio as gr
-from PyPDF2 import PdfReader
-import re
-
-# Function to extract structured data from PDF
+# Extract structured data from the PDF
 def pdf_to_text(pdf_file):
     try:
         reader = PdfReader(pdf_file.name)
@@ -111,19 +96,20 @@ def pdf_to_text(pdf_file):
         for page in reader.pages:
             text += page.extract_text()
 
-        # Regex to
+        # Regex to extract structured lab results
        pattern = r"(\w+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\w/%]+)\s+(\w+)"
         matches = re.findall(pattern, text)
 
-        #
+        # Structure data into a list of dictionaries
         if matches:
             results = [
-                {"Component": m[0], "Value": m[1], "Min": m[2], "Max": m[3], "Units": m[4], "Status": m[5]}
+                {"Component": m[0], "Value": float(m[1]), "Min": float(m[2]), "Max": float(m[3]), "Units": m[4], "Status": m[5]}
                 for m in matches
             ]
             return results
         else:
             return "No structured data found in the PDF."
+
     except Exception as e:
         return f"Error: {e}"
 
@@ -143,6 +129,6 @@ def main():
 
     app.launch()
 
-# Run the
-if
-    main()
+# Run the app
+if __name__ == "__main__":
+    main()
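For reference, the code above assumes two record shapes that the diff itself never shows: knowledge_base.json entries with Component, Range, and Advice keys, and extracted lab rows with Component, Value, Min, Max, Units, and Status keys (the shape pdf_to_text returns). A minimal sketch with illustrative values only; the real knowledge base is not part of this commit:

# Hypothetical knowledge_base.json entry (illustrative values, not from the commit)
sample_kb_entry = {
    "Component": "Hemoglobin",
    "Range": "13.5-17.5 g/dL",
    "Advice": "Values below range may indicate anemia; follow up with a physician.",
}

# One row in the shape pdf_to_text returns and generate_advice consumes
sample_item = {
    "Component": "Hemoglobin",
    "Value": 11.2,
    "Min": 13.5,
    "Max": 17.5,
    "Units": "g/dL",
    "Status": "Low",
}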
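The body of main() is elided from this diff, so the exact Gradio wiring is unknown; still, the two helpers compose directly. A hedged sketch of the handler main() would plug into the app (analyze_report is a hypothetical name, not from the commit):

def analyze_report(pdf_file):
    # pdf_to_text returns a plain string on failure or when the regex matches nothing
    extracted = pdf_to_text(pdf_file)
    if isinstance(extracted, str):
        return [{"error": extracted}]
    return generate_advice(extracted)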