YoniFriedman committed on
Commit
74c3f81
·
verified ·
1 Parent(s): 2b33f3e

improving lang detection

Browse files
Files changed (1) hide show
  1. app.py +109 -69
app.py CHANGED
@@ -14,6 +14,7 @@ from langdetect import detect
14
  from langdetect import DetectorFactory
15
  DetectorFactory.seed = 0
16
  from deep_translator import GoogleTranslator
 
17
 
18
  # Load index
19
  from llama_index.core import VectorStoreIndex
@@ -27,11 +28,13 @@ retriever = index.as_retriever(similarity_top_k = 3)
27
  import gradio as gr
28
 
29
  import re
 
 
30
 
31
  acknowledgment_keywords_sw = ["sawa", "ndiyo", "naam", "hakika", "asante", "nimeelewa", "nimekupata", "ni kweli",
32
  "kwa hakika", "nimesikia"]
33
  acknowledgment_keywords_en = ["thanks", "thank you", "thx", "ok", "okay", "great", "got it", "appreciate", "good", "makes sense"]
34
- follow_up_keywords = ["but", "also", "and", "what", "how", "why", "when",
35
  "lakini", "pia", "na", "nini", "vipi", "kwanini", "wakati"]
36
  greeting_keywords_sw = ["sasa", "niaje", "habari", "mambo", "jambo", "shikamoo", "marahaba", "hujambo", "hamjambo", "salama", "vipi"]
37
  greeting_keywords_en = ["hi", "hello", "hey", "how's it", "what's up", "yo", "howdy"]
@@ -45,84 +48,122 @@ def contains_exact_word_or_phrase(text, keywords):
45
 
46
  def contains_greeting_sw(question):
47
  # Check if the question contains a Swahili greeting keyword
48
- # words = question.lower().split()
49
- # return any(keyword in words for keyword in greeting_keywords_sw)
50
  return contains_exact_word_or_phrase(question, greeting_keywords_sw)
51
 
52
  def contains_greeting_en(question):
53
  # Check if the question contains an English greeting keyword
54
- # words = question.lower().split()
55
- # return any(keyword in words for keyword in greeting_keywords_en)
56
  return contains_exact_word_or_phrase(question, greeting_keywords_en)
57
 
58
  def contains_acknowledgment_sw(question):
59
  # Check if the question contains acknowledgment keywords
60
- # words = question.lower().split()
61
- # return any(keyword in words for keyword in acknowledgment_keywords_sw)
62
  return contains_exact_word_or_phrase(question, acknowledgment_keywords_sw)
63
 
64
  def contains_acknowledgment_en(question):
65
  # Check if the question contains acknowledgment keywords
66
- # words = question.lower().split()
67
- # return any(keyword in words for keyword in acknowledgment_keywords_en)
68
  return contains_exact_word_or_phrase(question, acknowledgment_keywords_en)
69
 
70
  def contains_follow_up(question):
71
  # Check if the question contains follow-up indicators
72
  return contains_exact_word_or_phrase(question, follow_up_keywords)
73
 
74
- def process_acknowledgment_response(question):
75
- # Handle simple acknowledgment
76
- if contains_acknowledgment_en(question) and not contains_follow_up(question):
77
- return "You're welcome! Is there anything else I can help with?"
78
- elif contains_acknowledgment_sw(question) and not contains_follow_up(question):
79
- return "Karibu! Kuna kitu kingine chochote ninachoweza kusaidia?"
80
- return None
81
-
82
- def process_greeting_response(question):
83
- # Handle simple acknowledgment
84
- if contains_greeting_en(question):
85
- return "Hi! Can I assist with any question related to HIV?"
86
- elif contains_greeting_sw(question):
87
- return "Habari! Je, ninaweza kusaidia kwa swali lolote linalohusiana na virusi vya ukimwe?"
88
- return None
 
 
 
 
 
 
89
 
90
  def nishauri(question: str, conversation_history: list[str]):
91
 
92
- ## If a greeting, then respond accordingly and do not proceed with RAG
93
- # Process greeting
94
- greet_response = process_greeting_response(question)
95
- if greet_response:
96
- conversation_history.append({"user": question, "chatbot": greet_response})
97
- return greet_response, conversation_history
98
-
99
- ## If user is acknowledging chatbot's response and not asking a follow up, then respond accordingly
100
- # Process acknowledgment
101
- ack_response = process_acknowledgment_response(question)
102
- if ack_response:
103
- conversation_history.append({"user": question, "chatbot": ack_response})
104
- return ack_response, conversation_history
105
-
106
- ## Otherwise, proceed with RAG
107
-
108
- # Create user history
109
  context = " ".join([item["user"] + " " + item["chatbot"] for item in conversation_history])
110
-
111
- ## Language detection - we want to run the pipeline in English since our sources are in English
112
- # Split the string into words
113
- words = question.split()
114
-
115
- # Count the number of words
116
- num_words = len(words)
117
-
118
- # By default, let's assume the language is English
119
- lang_question = "en"
120
-
121
- # Language detection is unreliable with fewer than five words, so only check if at least five words
122
- if num_words > 4:
123
- lang_question = detect(question)
124
-
125
- # If language is swahili, then translate question to english
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  if lang_question=="sw":
127
  question = GoogleTranslator(source='sw', target='en').translate(question)
128
 
@@ -133,18 +174,17 @@ def nishauri(question: str, conversation_history: list[str]):
133
  source2 = sources[2].text
134
 
135
  background = ("The person who asked the question is a person living with HIV."
136
- " If the person says sasa or niaje, that is swahili slang for hello."
137
- " They are asking questions about HIV. Do not talk about anything that is not related to HIV. "
138
- " Recognize that they already have HIV and do not suggest that they have to get tested"
139
- " for HIV or take post-exposure prophylaxis, as that is not relevant, though their partners perhaps should."
140
- " Do not suggest anything that is not relevant to someone who already has HIV."
141
- " Do not mention in the response that the person is living with HIV."
142
- " The following information about viral loads is authoritative for any question about viral loads:"
143
- " Under 50 copies/ml is low detectable level,"
144
- " 50 - 199 copies/ml is low level viremia, 200 - 999 is high level viremia, and "
145
- " 1000 and above is suspected treatment failure."
146
- " A high viral load or non-suppressed viral load is any viral load above 200 copies/ml."
147
- " A suppressed viral load is one below 200 copies / ml.")
148
 
149
  question_final = (
150
  f" The user previously asked and answered the following: {context}. "
 
14
  from langdetect import DetectorFactory
15
  DetectorFactory.seed = 0
16
  from deep_translator import GoogleTranslator
17
+ from lingua import Language, LanguageDetectorBuilder
18
 
19
  # Load index
20
  from llama_index.core import VectorStoreIndex
 
28
  import gradio as gr
29
 
30
  import re
31
+ import json
32
+ from datetime import datetime
33
 
34
  acknowledgment_keywords_sw = ["sawa", "ndiyo", "naam", "hakika", "asante", "nimeelewa", "nimekupata", "ni kweli",
35
  "kwa hakika", "nimesikia"]
36
  acknowledgment_keywords_en = ["thanks", "thank you", "thx", "ok", "okay", "great", "got it", "appreciate", "good", "makes sense"]
37
+ follow_up_keywords = ["but", "also", "and", "what", "how", "why", "when", "is", "?",
38
  "lakini", "pia", "na", "nini", "vipi", "kwanini", "wakati"]
39
  greeting_keywords_sw = ["sasa", "niaje", "habari", "mambo", "jambo", "shikamoo", "marahaba", "hujambo", "hamjambo", "salama", "vipi"]
40
  greeting_keywords_en = ["hi", "hello", "hey", "how's it", "what's up", "yo", "howdy"]
 
48
 
49
  def contains_greeting_sw(question):
50
  # Check if the question contains a Swahili greeting keyword
 
 
51
  return contains_exact_word_or_phrase(question, greeting_keywords_sw)
52
 
53
  def contains_greeting_en(question):
54
  # Check if the question contains an English greeting keyword
 
 
55
  return contains_exact_word_or_phrase(question, greeting_keywords_en)
56
 
57
  def contains_acknowledgment_sw(question):
58
  # Check if the question contains acknowledgment keywords
 
 
59
  return contains_exact_word_or_phrase(question, acknowledgment_keywords_sw)
60
 
61
  def contains_acknowledgment_en(question):
62
  # Check if the question contains acknowledgment keywords
 
 
63
  return contains_exact_word_or_phrase(question, acknowledgment_keywords_en)
64
 
65
  def contains_follow_up(question):
66
  # Check if the question contains follow-up indicators
67
  return contains_exact_word_or_phrase(question, follow_up_keywords)
68
 
69
+ def convert_to_date(date_str):
70
+ return datetime.strptime(date_str, "%Y%m%d")
71
+
72
+ def detect_language(question):
73
+ # Short texts (fewer than 5 words) are checked with lingua, restricted to English and Swahili
74
+ if len(question.split()) < 5:
75
+ languages = [Language.ENGLISH, Language.SWAHILI] # Add more languages as needed
76
+ detector = LanguageDetectorBuilder.from_languages(*languages).build()
77
+ detected_language = detector.detect_language_of(question)
78
+ # Return language code for consistency
79
+ if detected_language == Language.SWAHILI:
80
+ return "sw"
81
+ elif detected_language == Language.ENGLISH:
82
+ return "en"
83
+ else:
84
+ try:
85
+ lang_detect = detect(question)
86
+ return lang_detect
87
+ except Exception as e:
88
+ print(f"Error with langdetect: {e}")
89
+ return "unknown"
90
 
91
  def nishauri(question: str, conversation_history: list[str]):
92
 
93
+ # Get conversation history
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  context = " ".join([item["user"] + " " + item["chatbot"] for item in conversation_history])
95
+
96
+ ## Process greeting
97
+ # greet_response = process_greeting_response(question)
98
+ if contains_greeting_en(question) and not contains_follow_up(question):
99
+ greeting = (
100
+ f" The user previously asked and answered the following: {context}. "
101
+ f" The user just provided the following greeting: {question}. "
102
+ "Please respond accordingly in English."
103
+ )
104
+ completion = client.chat.completions.create(
105
+ model="gpt-4o",
106
+ messages=[
107
+ {"role": "user", "content": greeting}
108
+ ]
109
+ )
110
+ reply_to_user = completion.choices[0].message.content
111
+ conversation_history.append({"user": question, "chatbot": reply_to_user})
112
+ return reply_to_user, conversation_history
113
+
114
+ if contains_greeting_sw(question) and not contains_follow_up(question):
115
+ greeting = (
116
+ f" The user previously asked and answered the following: {context}. "
117
+ f" The user just provided the following greeting: {question}. "
118
+ "Please respond accordingly in Swahili."
119
+ )
120
+ completion = client.chat.completions.create(
121
+ model="gpt-4o",
122
+ messages=[
123
+ {"role": "user", "content": greeting}
124
+ ]
125
+ )
126
+ reply_to_user = completion.choices[0].message.content
127
+ conversation_history.append({"user": question, "chatbot": reply_to_user})
128
+ return reply_to_user, conversation_history
129
+
130
+ ## Process acknowledgment
131
+ if contains_acknowledgment_en(question) and not contains_follow_up(question):
132
+ acknowledgment = (
133
+ f" The user previously asked and answered the following: {context}. "
134
+ f" The user just provided the following acknowledgement: {question}. "
135
+ "Please respond accordingly in English."
136
+ )
137
+ completion = client.chat.completions.create(
138
+ model="gpt-4o",
139
+ messages=[
140
+ {"role": "user", "content": acknowledgment}
141
+ ]
142
+ )
143
+ reply_to_user = completion.choices[0].message.content
144
+ conversation_history.append({"user": question, "chatbot": reply_to_user})
145
+ return reply_to_user, conversation_history
146
+
147
+ if contains_acknowledgment_sw(question) and not contains_follow_up(question):
148
+ acknowledgment = (
149
+ f" The user previously asked and answered the following: {context}. "
150
+ f" The user just provided the following acknowledgment: {question}. "
151
+ "Please respond accordingly in Swahili."
152
+ )
153
+ completion = client.chat.completions.create(
154
+ model="gpt-4o",
155
+ messages=[
156
+ {"role": "user", "content": acknowledgment}
157
+ ]
158
+ )
159
+ reply_to_user = completion.choices[0].message.content
160
+ conversation_history.append({"user": question, "chatbot": reply_to_user})
161
+ return reply_to_user, conversation_history
162
+
163
+ ## If not greeting or acknowledgement, then proceed with RAG
164
+
165
+ ## Detect language of question - if Swahili, translate to English
166
+ lang_question = detect_language(question)
167
  if lang_question=="sw":
168
  question = GoogleTranslator(source='sw', target='en').translate(question)
169
 
 
174
  source2 = sources[2].text
175
 
176
  background = ("The person who asked the question is a person living with HIV."
177
+ " They are asking questions about HIV. Do not talk about anything that is not related to HIV. "
178
+ " Recognize that they already have HIV and do not suggest that they have to get tested"
179
+ " for HIV or take post-exposure prophylaxis, as that is not relevant, though their partners perhaps should."
180
+ " Do not suggest anything that is not relevant to someone who already has HIV."
181
+ " Do not mention in the response that the person is living with HIV."
182
+ " The following information about viral loads is authoritative for any question about viral loads:"
183
+ " Under 50 copies/ml is low detectable level,"
184
+ " 50 - 199 copies/ml is low level viremia, 200 - 999 is high level viremia, and "
185
+ " 1000 and above is suspected treatment failure."
186
+ " A high viral load or non-suppressed viral load is any viral load above 200 copies/ml."
187
+ " A suppressed viral load is one below 200 copies / ml.")
 
188
 
189
  question_final = (
190
  f" The user previously asked and answered the following: {context}. "