Spaces:

goldrode
/

PDFtoText

Runtime error

App Files Community

goldrode committed on Dec 16, 2024

Commit

dd0af55

verified ·

1 Parent(s): 7ac2672

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -8

app.py CHANGED Viewed

@@ -1,16 +1,75 @@
 import gradio as gr
-from PyPDF2 import PdfReader
-import re
 # Function to extract structured data from the PDF text
 def pdf_to_text(pdf_file):
     try:
         reader = PdfReader(pdf_file.name)
         text = ""
         for page in reader.pages:
             text += page.extract_text()
         # Regex to match lab results (e.g., WBC 4.4 4.8 10.8 K/ul Low)
         pattern = r"(\w+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\w/%]+)\s+(\w+)"
         matches = re.findall(pattern, text)
@@ -27,19 +86,23 @@ def pdf_to_text(pdf_file):
         return f"Error: {e}"
-# Gradio Interface
 def main():
     with gr.Blocks() as app:
-        gr.Markdown("## PDF to Structured Data")
         with gr.Row():
             pdf_input = gr.File(label="Upload PDF", type="filepath")
-            output_text = gr.JSON(label="Extracted Structured Data")
-        convert_button = gr.Button("Extract Data")
-        convert_button.click(pdf_to_text, inputs=pdf_input, outputs=output_text)
     app.launch()
 # Run the Gradio app
 if __name__ == "__main__":
     main()

+import json
+import faiss
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 import gradio as gr
+# Load the knowledge base
+with open("knowledge_base.json", "r") as file:
+    kb = json.load(file)
+# Initialize the embedding model
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+# Generate embeddings for the knowledge base
+kb_texts = [f"{item['Component']} {item['Range']} {item['Advice']}" for item in kb]
+kb_embeddings = embedding_model.encode(kb_texts)
+# Create a FAISS index
+index = faiss.IndexFlatL2(kb_embeddings.shape[1])
+index.add(kb_embeddings)
+# Load Hugging Face LLM (flan-t5 model as an example)
+model_name = "google/flan-t5-large"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+llm = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+text_generator = pipeline("text2text-generation", model=llm, tokenizer=tokenizer)
+# Function to generate advice using RAG
+def generate_advice(extracted_data):
+    try:
+        recommendations = []
+        for item in extracted_data:
+            query = f"{item['Component']} {item['Status']}"
+            query_embedding = embedding_model.encode([query])
+            # Retrieve nearest knowledge base entry
+            _, idx = index.search(query_embedding, 1)
+            best_match = kb[idx[0][0]]
+            # Use Hugging Face LLM to generate detailed advice
+            prompt = f"""
+            Lab Test: {item['Component']}
+            Value: {item['Value']} {item['Units']}
+            Status: {item['Status']}
+            Advice based on guidelines:
+            {best_match['Advice']}
+            """
+            response = text_generator(prompt, max_length=150, num_return_sequences=1)
+            recommendations.append({
+                "Component": item["Component"],
+                "Advice": response[0]["generated_text"]
+            })
+        return recommendations
+    except Exception as e:
+        return f"Error: {e}"
 # Function to extract structured data from the PDF text
 def pdf_to_text(pdf_file):
     try:
+        from PyPDF2 import PdfReader
         reader = PdfReader(pdf_file.name)
         text = ""
         for page in reader.pages:
             text += page.extract_text()
         # Regex to match lab results (e.g., WBC 4.4 4.8 10.8 K/ul Low)
+        import re
         pattern = r"(\w+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\w/%]+)\s+(\w+)"
         matches = re.findall(pattern, text)
         return f"Error: {e}"
+# Gradio Interface with Hugging Face LLM Integration
 def main():
     with gr.Blocks() as app:
+        gr.Markdown("## Medical Test Interpreter with RAG (Hugging Face)")
         with gr.Row():
             pdf_input = gr.File(label="Upload PDF", type="filepath")
+            structured_data = gr.JSON(label="Extracted Structured Data")
+            advice_output = gr.JSON(label="Generated Advice")
+        extract_button = gr.Button("Extract Data")
+        interpret_button = gr.Button("Get Advice")
+        extract_button.click(pdf_to_text, inputs=pdf_input, outputs=structured_data)
+        interpret_button.click(generate_advice, inputs=structured_data, outputs=advice_output)
     app.launch()
 # Run the Gradio app
 if __name__ == "__main__":
     main()