goldrode committed on
Commit aeefb97 · verified · 1 Parent(s): 5d7671d

Update app.py

Files changed (1)
  1. app.py +33 -47
app.py CHANGED
@@ -1,59 +1,62 @@
+import os
 import json
 import numpy as np
-import os
 import faiss
+import gradio as gr
+from PyPDF2 import PdfReader
+import re
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
 
 # Load the knowledge base
 with open("knowledge_base.json", "r") as file:
     kb = json.load(file)
 
+# Authenticate with Hugging Face
 os.system("huggingface-cli login")
-# Initialize the embedding model
+
+# Initialize the embedding model and FAISS index
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 kb_texts = [f"{item['Component']} {item['Range']} {item['Advice']}" for item in kb]
 kb_embeddings = embedding_model.encode(kb_texts)
+kb_embeddings = np.array(kb_embeddings, dtype="float32")
 
-# Create FAISS index
 index = faiss.IndexFlatL2(kb_embeddings.shape[1])
 index.add(kb_embeddings)
 
-# Load the Hugging Face LLM
+# Load the Hugging Face LLM (LLaMA)
 llama_model_name = "meta-llama/Llama-3.2-3B-Instruct"
 API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
 tokenizer = AutoTokenizer.from_pretrained(llama_model_name, token=API_TOKEN)
 llm = AutoModelForCausalLM.from_pretrained(llama_model_name, token=API_TOKEN)
 
-# Generate advice using RAG
+# Generate advice using FAISS + LLM
 def generate_advice(extracted_data):
     try:
         recommendations = []
 
         for item in extracted_data:
-            # Validate input keys
-            if not all(k in item for k in ["Component", "Status"]):
-                raise ValueError("Each input item must have 'Component' and 'Status' keys.")
-
-            # Prepare the query string
             query = f"{item['Component']} {item['Status']}"
-            print(f"Processing query: {query}")
+            print(f"Query: {query}")  # Debugging step
 
             # Generate query embedding
             query_embedding = embedding_model.encode([query])
             query_embedding = np.array(query_embedding, dtype="float32").reshape(1, -1)
 
-            # Search for the closest match in FAISS
+            # Validate embedding shape
+            if query_embedding.shape[1] != index.d:
+                raise ValueError(f"Embedding dimension mismatch: FAISS expects {index.d}, but got {query_embedding.shape[1]}")
+
+            # Search FAISS for the closest match
             _, idx = index.search(query_embedding, 1)
             best_match = kb[idx[0][0]]
 
-            # Prepare the LLM prompt
+            # LLM prompt
             role = "Medical expert providing advice based on lab results."
            prompt = f"""
             Lab Test: {item['Component']}
-            Value: {item.get('Value', 'Unknown')} {item.get('Units', '')}
+            Value: {item['Value']} {item['Units']}
             Status: {item['Status']}
 
             Medical Guidelines: {best_match['Advice']}
@@ -61,49 +64,31 @@ def generate_advice(extracted_data):
             Provide additional insights or recommendations.
             """
 
-            # Tokenize input properly for LLaMA
-            message_yours = [
+            # Generate advice
+            message = [
                 {"role": "system", "content": role},
                 {"role": "user", "content": prompt},
             ]
 
-            # Properly tokenize to return a PyTorch tensor
-            input_text_with_your_role = tokenizer.apply_chat_template(
-                message_yours,
-                tokenize=True,  # Must tokenize to return input_ids
-                add_generation_prompt=True,
-                return_tensors="pt",
+            input_text = tokenizer.apply_chat_template(
+                message, tokenize=False, add_generation_prompt=True, return_tensors="pt"
             )
 
-            # Move tensor to appropriate device (CPU/GPU)
-            input_text_with_your_role = input_text_with_your_role.to(torch.device("cpu"))
-
-            # Generate advice
             output = llm.generate(
-                input_ids=input_text_with_your_role["input_ids"],
+                input_ids=input_text,
                 max_length=150,
                 num_return_sequences=1
             )
-            advice = tokenizer.decode(output[0], skip_special_tokens=True).strip()
 
-            # Append the result
-            recommendations.append({
-                "Component": item["Component"],
-                "Advice": advice
-            })
+            advice = tokenizer.decode(output[0], skip_special_tokens=True).strip()
+            recommendations.append({"Component": item["Component"], "Advice": advice})
 
         return recommendations
 
     except Exception as e:
-        print(f"Error occurred: {str(e)}")
         return [{"error": f"Exception occurred: {str(e)}"}]
 
-# Gradio app with LLM integration
-import gradio as gr
-from PyPDF2 import PdfReader
-import re
-
-# Function to extract structured data from PDF
+# Extract structured data from the PDF
 def pdf_to_text(pdf_file):
     try:
         reader = PdfReader(pdf_file.name)
@@ -111,19 +96,20 @@ def pdf_to_text(pdf_file):
         for page in reader.pages:
             text += page.extract_text()
 
-        # Regex to match lab results (e.g., WBC 4.4 4.8 10.8 K/ul Low)
+        # Regex to extract structured lab results
        pattern = r"(\w+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\w/%]+)\s+(\w+)"
         matches = re.findall(pattern, text)
 
-        # Format the results into a list of dictionaries
+        # Structure data into a list of dictionaries
         if matches:
             results = [
-                {"Component": m[0], "Value": m[1], "Min": m[2], "Max": m[3], "Units": m[4], "Status": m[5]}
+                {"Component": m[0], "Value": float(m[1]), "Min": float(m[2]), "Max": float(m[3]), "Units": m[4], "Status": m[5]}
                 for m in matches
             ]
             return results
         else:
             return "No structured data found in the PDF."
+
     except Exception as e:
         return f"Error: {e}"
@@ -143,6 +129,6 @@ def main():
 
     app.launch()
 
-# Run the Gradio app
-if __name__ == "__main__":
-    main()
+# Run the app
+if __name__ == "__main__":
+    main()
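
For context on the chat-template step in this diff: with tokenize=False, apply_chat_template returns a plain prompt string rather than token ids, so transformers needs a tokenization step before generate. A minimal sketch of that step, assuming the tokenizer and llm objects defined above; the sample messages and the max_new_tokens value are illustrative, not from the commit:

# Sketch only: reuses the tokenizer/llm loaded in app.py.
messages = [
    {"role": "system", "content": "Medical expert providing advice based on lab results."},
    {"role": "user", "content": "Lab Test: WBC\nValue: 4.4 K/ul\nStatus: Low"},
]
# tokenize=True with return_tensors="pt" yields input_ids directly as a tensor.
input_ids = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
)
output = llm.generate(input_ids, max_new_tokens=150)
# Decode only the newly generated tokens, skipping the echoed prompt.
advice = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True).strip()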
 
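As a quick check of the lab-result regex used in pdf_to_text, here it is run against the sample line format cited in the old comment ("WBC 4.4 4.8 10.8 K/ul Low"); the sample string is illustrative:

import re

# The pattern from app.py: component, value, min, max, units, status.
pattern = r"(\w+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\w/%]+)\s+(\w+)"
sample = "WBC 4.4 4.8 10.8 K/ul Low"
print(re.findall(pattern, sample))
# [('WBC', '4.4', '4.8', '10.8', 'K/ul', 'Low')]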