Muhammad Haris
committed on
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,12 +8,7 @@ import torch
|
|
| 8 |
import gdown
|
| 9 |
import os
|
| 10 |
|
| 11 |
-
|
| 12 |
-
# file_id = '1P3Nz6f3KG0m0kO_2pEfnVIhgP8Bvkl4v'
|
| 13 |
-
# url = f'https://drive.google.com/uc?id={file_id}'
|
| 14 |
-
# excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')
|
| 15 |
-
# Download the file from Hugging Face Spaces
|
| 16 |
-
|
| 17 |
url = 'https://huggingface.co/datasets/HEHEBOIBOT/PharmEvoDiabetesData/raw/main/medical_data.csv'
|
| 18 |
excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')
|
| 19 |
|
|
@@ -27,7 +22,7 @@ except UnicodeDecodeError:
|
|
| 27 |
|
| 28 |
# TF-IDF Vectorization
|
| 29 |
vectorizer = TfidfVectorizer(stop_words='english')
|
| 30 |
-
X_tfidf = vectorizer.fit_transform(medical_df[
|
| 31 |
|
| 32 |
# Load pre-trained GPT-2 model and tokenizer
|
| 33 |
model_name = "sshleifer/tiny-gpt2"
|
|
@@ -47,11 +42,11 @@ def get_medical_response(question, vectorizer, X_tfidf, model, tokenizer, sbert_
|
|
| 47 |
|
| 48 |
# Find the most similar question using semantic similarity
|
| 49 |
question_embedding = sbert_model.encode(question, convert_to_tensor=True)
|
| 50 |
-
similarities = util.pytorch_cos_sim(question_embedding, sbert_model.encode(medical_df[
|
| 51 |
max_sim_index = similarities.argmax().item()
|
| 52 |
|
| 53 |
# LLM response generation
|
| 54 |
-
input_text = "DiBot: " + medical_df.iloc[max_sim_index][
|
| 55 |
input_ids = tokenizer.encode(input_text, return_tensors="pt")
|
| 56 |
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
|
| 57 |
pad_token_id = tokenizer.eos_token_id
|
|
@@ -61,7 +56,7 @@ def get_medical_response(question, vectorizer, X_tfidf, model, tokenizer, sbert_
|
|
| 61 |
# Compare similarities and choose the best response
|
| 62 |
if tfidf_similarities.max() > 0.5:
|
| 63 |
tfidf_index = tfidf_similarities.argmax()
|
| 64 |
-
return medical_df.iloc[tfidf_index]['Answers'
|
| 65 |
else:
|
| 66 |
return lm_generated_response
|
| 67 |
|
|
@@ -85,4 +80,4 @@ if user_input:
|
|
| 85 |
# Display the chat messages
|
| 86 |
for message in st.session_state.messages:
|
| 87 |
with st.chat_message(message["role"]):
|
| 88 |
-
st.markdown(message["content"])
|
|
|
|
| 8 |
import gdown
|
| 9 |
import os
|
| 10 |
|
| 11 |
+
# Download the CSV file from the Hugging Face dataset repository
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
url = 'https://huggingface.co/datasets/HEHEBOIBOT/PharmEvoDiabetesData/raw/main/medical_data.csv'
|
| 13 |
excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')
|
| 14 |
|
|
|
|
| 22 |
|
| 23 |
# TF-IDF Vectorization
|
| 24 |
vectorizer = TfidfVectorizer(stop_words='english')
|
| 25 |
+
X_tfidf = vectorizer.fit_transform(medical_df.iloc[:, 0]) # Accessing first column by index
|
| 26 |
|
| 27 |
# Load pre-trained GPT-2 model and tokenizer
|
| 28 |
model_name = "sshleifer/tiny-gpt2"
|
|
|
|
| 42 |
|
| 43 |
# Find the most similar question using semantic similarity
|
| 44 |
question_embedding = sbert_model.encode(question, convert_to_tensor=True)
|
| 45 |
+
similarities = util.pytorch_cos_sim(question_embedding, sbert_model.encode(medical_df.iloc[:, 0].tolist(), convert_to_tensor=True)).flatten()
|
| 46 |
max_sim_index = similarities.argmax().item()
|
| 47 |
|
| 48 |
# LLM response generation
|
| 49 |
+
input_text = "DiBot: " + medical_df.iloc[max_sim_index][0]
|
| 50 |
input_ids = tokenizer.encode(input_text, return_tensors="pt")
|
| 51 |
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
|
| 52 |
pad_token_id = tokenizer.eos_token_id
|
|
|
|
| 56 |
# Compare similarities and choose the best response
|
| 57 |
if tfidf_similarities.max() > 0.5:
|
| 58 |
tfidf_index = tfidf_similarities.argmax()
|
| 59 |
+
return medical_df.iloc[tfidf_index][1] # Assuming 'Answers' is in the second column (index 1)
|
| 60 |
else:
|
| 61 |
return lm_generated_response
|
| 62 |
|
|
|
|
| 80 |
# Display the chat messages
|
| 81 |
for message in st.session_state.messages:
|
| 82 |
with st.chat_message(message["role"]):
|
| 83 |
+
st.markdown(message["content"])
|