lyimo committed on
Commit
7a45593
·
verified ·
1 Parent(s): 7bf9779

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -37
app.py CHANGED
@@ -23,35 +23,54 @@ def compressed_length(s):
23
  return len(gzip.compress(s.encode('utf-8')))
24
 
25
  def ncd(x, y):
 
 
 
26
  Cx = compressed_length(x)
27
  Cy = compressed_length(y)
28
  Cxy = compressed_length(x + " " + y)
29
  return (Cxy - min(Cx, Cy)) / max(Cx, Cy)
30
 
31
  def normalize_scores(scores, reverse=False):
 
 
 
32
  min_score = min(scores)
33
  max_score = max(scores)
 
 
34
  if reverse:
35
  return [(max_score - x) / (max_score - min_score) for x in scores]
36
  return [(x - min_score) / (max_score - min_score) for x in scores]
37
 
38
  def hybrid_retrieval(query, passages, embeddings, alpha=0.7, beta=0.3):
 
 
 
 
39
  query_embedding = model.encode(query)
40
  cosine_similarities = cosine_similarity([query_embedding], embeddings)[0]
41
 
 
42
  normalized_cosine_similarities = normalize_scores(cosine_similarities)
43
 
 
44
  ncd_values = [ncd(query, passage) for passage in passages]
45
  normalized_ncd_values = normalize_scores(ncd_values, reverse=True)
46
 
47
- final_scores = [alpha * cos_sim + beta * ncd_sim
48
- for cos_sim, ncd_sim in zip(normalized_cosine_similarities, normalized_ncd_values)]
 
 
 
49
 
50
  most_similar_index = np.argmax(final_scores)
51
-
52
  return most_similar_index, cosine_similarities[most_similar_index], ncd_values[most_similar_index], final_scores[most_similar_index]
53
 
54
  def llama_query(prompt, system_content):
 
 
 
55
  response = client.chat.completions.create(
56
  model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
57
  messages=[
@@ -69,80 +88,125 @@ def llama_query(prompt, system_content):
69
  return response.choices[0].message.content
70
 
71
  def check_blood_donation_relevance(question):
72
- prompt = f"Is the following question related to blood donation? Answer with 'Yes' or 'No': {question}"
 
 
 
73
  system_content = "You are an assistant that determines if a question is related to blood donation."
74
- response = llama_query(prompt, system_content)
75
- return response.strip().lower() == 'yes'
76
 
77
  def detect_language(text):
78
- prompt = f"Detect the language of this text. If it's Swahili, return 'Swahili'. If it's English, return 'English'. Here's the text: {text}"
 
 
 
 
 
 
 
 
79
  system_content = "You are a language detection assistant."
80
- response = llama_query(prompt, system_content)
81
- return response.strip().lower()
 
 
 
 
 
 
82
 
83
  def translate_to_english(text):
 
 
 
84
  prompt = f"Translate the following Swahili text to English: {text}"
85
  system_content = "You are a translation assistant that translates from Swahili to English."
86
  response = llama_query(prompt, system_content)
87
- return response
88
 
89
  def translate_to_swahili(text):
 
 
 
90
  prompt = f"Translate the following text to simple Swahili, avoiding difficult words: {text}"
91
  system_content = "You are a translation assistant that translates to simple Swahili."
92
  response = llama_query(prompt, system_content)
93
- return response
94
 
95
  def refine_answer(question, retrieved_answer):
96
- prompt = f"Question: {question}\nRetrieved Answer: {retrieved_answer}\nPlease refine the retrieved answer according to the question asked, ensuring it's clear and concise."
 
 
 
 
 
 
 
97
  system_content = "You are an assistant that refines answers to make them more relevant and natural."
98
- return llama_query(prompt, system_content)
99
 
100
  def get_answer(user_question, threshold=0.3):
 
101
  language = detect_language(user_question)
102
-
 
103
  if language == 'swahili':
104
  english_question = translate_to_english(user_question)
105
  else:
106
  english_question = user_question
107
 
108
- index, cosine_sim, ncd_value, final_score = hybrid_retrieval(english_question, df['Question'].tolist(), question_embeddings)
109
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  if final_score > threshold:
111
  retrieved_answer = df.iloc[index]['Answer']
112
- refined_answer = refine_answer(english_question, retrieved_answer)
113
-
114
  if language == 'swahili':
115
- refined_answer = translate_to_swahili(refined_answer)
116
-
117
- return refined_answer, final_score
118
  else:
119
- # Use LLaMa to determine if the question is related to blood donation
120
- if check_blood_donation_relevance(english_question):
121
- llama_response = llama_query(
122
- f"Please provide a brief, general answer to this blood donation related question: {english_question}",
123
- "You are an assistant knowledgeable about blood donation. Provide brief, accurate answers."
124
- )
125
- if language == 'swahili':
126
- llama_response = translate_to_swahili(llama_response)
127
- return llama_response, 0
 
128
  else:
129
- off_topic_message = "I'm sorry, but your question doesn't seem to be related to blood donation. Could you please ask a question about blood donation?"
130
- if language == 'swahili':
131
- off_topic_message = translate_to_swahili(off_topic_message)
132
- return off_topic_message, 0
133
 
134
  # Gradio app
135
  def gradio_app(user_question):
136
  answer, similarity = get_answer(user_question)
137
  return f"Similarity: {similarity:.2f}\nAnswer: {answer}"
138
 
139
- # Launch the Gradio app
140
  iface = gr.Interface(
141
  fn=gradio_app,
142
  inputs=gr.Textbox(label="Enter your question"),
143
  outputs=gr.Textbox(label="Answer"),
144
  title="Blood Donation Q&A",
145
- description="Ask questions related to blood donation and get answers in English or Swahili.",
146
  )
147
 
148
- iface.launch()
 
23
  return len(gzip.compress(s.encode('utf-8')))
24
 
25
  def ncd(x, y):
26
+ """
27
+ Normalized Compression Distance for strings x and y.
28
+ """
29
  Cx = compressed_length(x)
30
  Cy = compressed_length(y)
31
  Cxy = compressed_length(x + " " + y)
32
  return (Cxy - min(Cx, Cy)) / max(Cx, Cy)
33
 
34
  def normalize_scores(scores, reverse=False):
35
+ """
36
+ Scale a list of scores to [0,1], optionally reversing (1 - x).
37
+ """
38
  min_score = min(scores)
39
  max_score = max(scores)
40
+ if max_score == min_score:
41
+ return [0] * len(scores)
42
  if reverse:
43
  return [(max_score - x) / (max_score - min_score) for x in scores]
44
  return [(x - min_score) / (max_score - min_score) for x in scores]
45
 
46
  def hybrid_retrieval(query, passages, embeddings, alpha=0.7, beta=0.3):
47
+ """
48
+ Combine cosine similarity (SentenceTransformer) and
49
+ Normalized Compression Distance (NCD) for retrieval.
50
+ """
51
  query_embedding = model.encode(query)
52
  cosine_similarities = cosine_similarity([query_embedding], embeddings)[0]
53
 
54
+ # Normalize
55
  normalized_cosine_similarities = normalize_scores(cosine_similarities)
56
 
57
+ # Calculate NCD
58
  ncd_values = [ncd(query, passage) for passage in passages]
59
  normalized_ncd_values = normalize_scores(ncd_values, reverse=True)
60
 
61
+ # Combine
62
+ final_scores = [
63
+ alpha * cos_sim + beta * ncd_sim
64
+ for cos_sim, ncd_sim in zip(normalized_cosine_similarities, normalized_ncd_values)
65
+ ]
66
 
67
  most_similar_index = np.argmax(final_scores)
 
68
  return most_similar_index, cosine_similarities[most_similar_index], ncd_values[most_similar_index], final_scores[most_similar_index]
69
 
70
  def llama_query(prompt, system_content):
71
+ """
72
+ Send a prompt to the Together LLaMa model and return the response.
73
+ """
74
  response = client.chat.completions.create(
75
  model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
76
  messages=[
 
88
  return response.choices[0].message.content
89
 
90
  def check_blood_donation_relevance(question):
91
+ """
92
+ Use LLaMa to check whether 'question' is about blood donation.
93
+ """
94
+ prompt = f"Is the following question related to blood donation? Answer ONLY with 'Yes' or 'No': {question}"
95
  system_content = "You are an assistant that determines if a question is related to blood donation."
96
+ response = llama_query(prompt, system_content).strip().lower()
97
+ return response == 'yes'
98
 
99
  def detect_language(text):
100
+ """
101
+ Use LLaMa to detect language (English or Swahili).
102
+ Returns 'swahili' or 'english'.
103
+ """
104
+ prompt = (
105
+ "Detect the language of this text. If it's Swahili, return 'Swahili'. "
106
+ "If it's English, return 'English'. Here's the text:\n\n"
107
+ f"{text}"
108
+ )
109
  system_content = "You are a language detection assistant."
110
+ response = llama_query(prompt, system_content).strip().lower()
111
+ # Attempt to match strictly 'swahili' or 'english' from the response
112
+ if "swahili" in response:
113
+ return "swahili"
114
+ if "english" in response:
115
+ return "english"
116
+ # Fallback: default to English
117
+ return "english"
118
 
119
  def translate_to_english(text):
120
+ """
121
+ Translate Swahili text to English using LLaMa.
122
+ """
123
  prompt = f"Translate the following Swahili text to English: {text}"
124
  system_content = "You are a translation assistant that translates from Swahili to English."
125
  response = llama_query(prompt, system_content)
126
+ return response.strip()
127
 
128
  def translate_to_swahili(text):
129
+ """
130
+ Translate any text to simple Swahili using LLaMa.
131
+ """
132
  prompt = f"Translate the following text to simple Swahili, avoiding difficult words: {text}"
133
  system_content = "You are a translation assistant that translates to simple Swahili."
134
  response = llama_query(prompt, system_content)
135
+ return response.strip()
136
 
137
  def refine_answer(question, retrieved_answer):
138
+ """
139
+ Refine the retrieved answer, making it more relevant and natural.
140
+ """
141
+ prompt = (
142
+ f"Question: {question}\n\n"
143
+ f"Retrieved Answer: {retrieved_answer}\n\n"
144
+ "Please refine the retrieved answer so it's direct, clear, and specifically addresses the question."
145
+ )
146
  system_content = "You are an assistant that refines answers to make them more relevant and natural."
147
+ return llama_query(prompt, system_content).strip()
148
 
149
  def get_answer(user_question, threshold=0.3):
150
+ # 1) Detect user language
151
  language = detect_language(user_question)
152
+
153
+ # 2) Convert user question to English for checking & retrieval
154
  if language == 'swahili':
155
  english_question = translate_to_english(user_question)
156
  else:
157
  english_question = user_question
158
 
159
+ # 3) Check if the question is about blood donation using LLaMa
160
+ is_blood_related = check_blood_donation_relevance(english_question)
161
+
162
+ if not is_blood_related:
163
+ # Off-topic response
164
+ off_topic_message = "I'm sorry, but your question doesn't seem to be related to blood donation. Could you please ask a question about blood donation?"
165
+ if language == 'swahili':
166
+ off_topic_message = translate_to_swahili(off_topic_message)
167
+ return off_topic_message, 0.0
168
+
169
+ # If it is about blood donation, proceed with hybrid retrieval
170
+ index, cosine_sim, ncd_value, final_score = hybrid_retrieval(
171
+ english_question,
172
+ df['Question'].tolist(),
173
+ question_embeddings
174
+ )
175
+
176
+ # 4) If retrieval confidence is high enough, refine the CSV answer
177
  if final_score > threshold:
178
  retrieved_answer = df.iloc[index]['Answer']
179
+ refined_answer_english = refine_answer(english_question, retrieved_answer)
180
+ # Translate back to user language if needed
181
  if language == 'swahili':
182
+ return translate_to_swahili(refined_answer_english), final_score
183
+ else:
184
+ return refined_answer_english, final_score
185
  else:
186
+ # 5) If retrieval is below threshold, ask LLaMa for a general blood-donation-related answer
187
+ llama_response_english = llama_query(
188
+ f"Please provide a concise, accurate answer about blood donation for the question: {english_question}",
189
+ "You are an assistant knowledgeable about blood donation. Provide concise, accurate answers."
190
+ )
191
+ llama_response_english = llama_response_english.strip()
192
+
193
+ # Translate back to user language if needed
194
+ if language == 'swahili':
195
+ return translate_to_swahili(llama_response_english), final_score
196
  else:
197
+ return llama_response_english, final_score
 
 
 
198
 
199
  # Gradio app
200
  def gradio_app(user_question):
201
  answer, similarity = get_answer(user_question)
202
  return f"Similarity: {similarity:.2f}\nAnswer: {answer}"
203
 
 
204
  iface = gr.Interface(
205
  fn=gradio_app,
206
  inputs=gr.Textbox(label="Enter your question"),
207
  outputs=gr.Textbox(label="Answer"),
208
  title="Blood Donation Q&A",
209
+ description="Ask questions about blood donation in English or Swahili. The system first checks if it's related to blood donation."
210
  )
211
 
212
+ iface.launch()