Spaces:

Nishauri
/

ChatBot

Sleeping

App Files Files Community

YoniFriedman committed on Jul 17, 2024

Commit

74c3f81

verified ·

1 Parent(s): 2b33f3e

improving lang detection

Browse files

Files changed (1) hide show

app.py +109 -69

app.py CHANGED Viewed

@@ -14,6 +14,7 @@ from langdetect import detect
 from langdetect import DetectorFactory
 DetectorFactory.seed = 0
 from deep_translator import GoogleTranslator
 # Load index
 from llama_index.core import VectorStoreIndex
@@ -27,11 +28,13 @@ retriever = index.as_retriever(similarity_top_k = 3)
 import gradio as gr
 import re
 acknowledgment_keywords_sw = ["sawa", "ndiyo", "naam", "hakika", "asante", "nimeelewa", "nimekupata", "ni kweli",
                              "kwa hakika", "nimesikia"]
 acknowledgment_keywords_en = ["thanks", "thank you", "thx", "ok", "okay", "great", "got it", "appreciate", "good", "makes sense"]
-follow_up_keywords = ["but", "also", "and", "what", "how", "why", "when",
                      "lakini", "pia", "na", "nini", "vipi", "kwanini", "wakati"]
 greeting_keywords_sw = ["sasa", "niaje", "habari", "mambo", "jambo", "shikamoo", "marahaba", "hujambo", "hamjambo", "salama", "vipi"]
 greeting_keywords_en = ["hi", "hello", "hey", "how's it", "what's up", "yo", "howdy"]
@@ -45,84 +48,122 @@ def contains_exact_word_or_phrase(text, keywords):
 def contains_greeting_sw(question):
     # Check if the question contains acknowledgment keywords
-    # words = question.lower().split()
-    # return any(keyword in words for keyword in greeting_keywords_sw)
     return contains_exact_word_or_phrase(question, greeting_keywords_sw)
 def contains_greeting_en(question):
     # Check if the question contains acknowledgment keywords
-    # words = question.lower().split()
-    # return any(keyword in words for keyword in greeting_keywords_en)
     return contains_exact_word_or_phrase(question, greeting_keywords_en)
 def contains_acknowledgment_sw(question):
     # Check if the question contains acknowledgment keywords
-    # words = question.lower().split()
-    # return any(keyword in words for keyword in acknowledgment_keywords_sw)
     return contains_exact_word_or_phrase(question, acknowledgment_keywords_sw)
 def contains_acknowledgment_en(question):
     # Check if the question contains acknowledgment keywords
-    # words = question.lower().split()
-    # return any(keyword in words for keyword in acknowledgment_keywords_en)
     return contains_exact_word_or_phrase(question, acknowledgment_keywords_en)
 def contains_follow_up(question):
     # Check if the question contains follow-up indicators
     return contains_exact_word_or_phrase(question, follow_up_keywords)
-def process_acknowledgment_response(question):
-    # Handle simple acknowledgment
-    if contains_acknowledgment_en(question) and not contains_follow_up(question):
-        return "You're welcome! Is there anything else I can help with?"
-    elif contains_acknowledgment_sw(question) and not contains_follow_up(question):
-        return "Karibu! Kuna kitu kingine chochote ninachoweza kusaidia?"
-    return None
-def process_greeting_response(question):
-    # Handle simple acknowledgment
-    if contains_greeting_en(question):
-        return "Hi! Can I assist with any question related to HIV?"
-    elif contains_greeting_sw(question):
-        return "Habari! Je, ninaweza kusaidia kwa swali lolote linalohusiana na virusi vya ukimwe?"
-    return None
 def nishauri(question: str, conversation_history: list[str]):
-    ## If a greeting, then respond accordingly and do not proceed with RAG
-    # Process greeting
-    greet_response = process_greeting_response(question)
-    if greet_response:
-        conversation_history.append({"user": question, "chatbot": greet_response})
-        return greet_response, conversation_history
-    ## If user is acknowledging chatbot's response and not asking a follow up, then respond accordingly
-    # Process acknowledgment
-    ack_response = process_acknowledgment_response(question)
-    if ack_response:
-        conversation_history.append({"user": question, "chatbot": ack_response})
-        return ack_response, conversation_history
-    ## Otherwise, proceed with RAG
-    # Create user history
     context = " ".join([item["user"] + " " + item["chatbot"] for item in conversation_history])
-    ## Language detection - we want to run the pipeline in English since our sources are in English
-    # Split the string into words
-    words = question.split()
-    # Count the number of words
-    num_words = len(words)
-    # By default, let's assume the language is English
-    lang_question = "en"
-    # Language detection is unreliable with fewer that five words, so only check if at least five words
-    if num_words > 4:
-        lang_question = detect(question)
-    # If language is swahili, then translate question to english
     if lang_question=="sw":
         question = GoogleTranslator(source='sw', target='en').translate(question)
@@ -133,18 +174,17 @@ def nishauri(question: str, conversation_history: list[str]):
     source2 = sources[2].text
     background = ("The person who asked the question is a person living with HIV."
-                  " If the person says sasa or niaje, that is swahili slang for hello."
-                  " They are asking questions about HIV. Do not talk about anything that is not related to HIV. "
-                  " Recognize that they already have HIV and do not suggest that they have to get tested"
-                  " for HIV or take post-exposure prophylaxis, as that is not relevant, though their partners perhaps should."
-                  " Do not suggest anything that is not relevant to someone who already has HIV."
-                  " Do not mention in the response that the person is living with HIV."
-                  " The following information about viral loads is authoritative for any question about viral loads:"
-                  " Under 50 copies/ml is low detectable level,"
-                  " 50 - 199 copies/ml is low level viremia, 200 - 999 is high level viremia, and "
-                  " 1000 and above is suspected treatment failure."
-                  " A high viral load or non-suppressed viral load is any viral load above 200 copies/ml."
-                  " A suppressed viral load is one below 200 copies / ml.")
     question_final = (
     f" The user previously asked and answered the following: {context}. "

 from langdetect import DetectorFactory
 DetectorFactory.seed = 0
 from deep_translator import GoogleTranslator
+from lingua import Language, LanguageDetectorBuilder
 # Load index
 from llama_index.core import VectorStoreIndex
 import gradio as gr
 import re
+import json
+from datetime import datetime
 # Swahili acknowledgment words/phrases, checked by contains_acknowledgment_sw.
 acknowledgment_keywords_sw = ["sawa", "ndiyo", "naam", "hakika", "asante", "nimeelewa", "nimekupata", "ni kweli",
                              "kwa hakika", "nimesikia"]
 # English acknowledgment words/phrases, checked by contains_acknowledgment_en.
 acknowledgment_keywords_en = ["thanks", "thank you", "thx", "ok", "okay", "great", "got it", "appreciate", "good", "makes sense"]
+# English and Swahili follow-up indicators, checked by contains_follow_up.
+# nishauri only takes its greeting/acknowledgment paths when none of these match.
+# NOTE(review): "vipi" appears both here and in greeting_keywords_sw, so a message
+# matched on "vipi" presumably never takes the Swahili greeting path - confirm intent.
+follow_up_keywords = ["but", "also", "and", "what", "how", "why", "when", "is", "?",
                      "lakini", "pia", "na", "nini", "vipi", "kwanini", "wakati"]
 # Swahili greeting words, checked by contains_greeting_sw.
 greeting_keywords_sw = ["sasa", "niaje", "habari", "mambo", "jambo", "shikamoo", "marahaba", "hujambo", "hamjambo", "salama", "vipi"]
 # English greeting words/phrases, checked by contains_greeting_en.
 greeting_keywords_en = ["hi", "hello", "hey", "how's it", "what's up", "yo", "howdy"]
 def contains_greeting_sw(question):
     # Check if the question contains a Swahili greeting keyword.
     # Matching is delegated to contains_exact_word_or_phrase with greeting_keywords_sw.
     return contains_exact_word_or_phrase(question, greeting_keywords_sw)
 def contains_greeting_en(question):
     # Check if the question contains an English greeting keyword.
     # Matching is delegated to contains_exact_word_or_phrase with greeting_keywords_en.
     return contains_exact_word_or_phrase(question, greeting_keywords_en)
 def contains_acknowledgment_sw(question):
     # Check if the question contains a Swahili acknowledgment keyword.
     # Matching is delegated to contains_exact_word_or_phrase with acknowledgment_keywords_sw.
     return contains_exact_word_or_phrase(question, acknowledgment_keywords_sw)
 def contains_acknowledgment_en(question):
     # Check if the question contains an English acknowledgment keyword.
     # Matching is delegated to contains_exact_word_or_phrase with acknowledgment_keywords_en.
     return contains_exact_word_or_phrase(question, acknowledgment_keywords_en)
 def contains_follow_up(question):
     # Check if the question contains a follow-up indicator (English or Swahili).
     # Matching is delegated to contains_exact_word_or_phrase with follow_up_keywords.
     return contains_exact_word_or_phrase(question, follow_up_keywords)
+def convert_to_date(date_str):
+    # Parse a date string in YYYYMMDD form (e.g. "20240717") into a datetime.
+    # datetime.strptime raises ValueError if date_str does not match that format.
+    return datetime.strptime(date_str, "%Y%m%d")
+def detect_language(question):
+    # Check if the text has less than 5 words
+    if len(question.split()) < 5:
+        languages = [Language.ENGLISH, Language.SWAHILI]  # Add more languages as needed
+        detector = LanguageDetectorBuilder.from_languages(*languages).build()
+        detected_language = detector.detect_language_of(question)
+        # Return language code for consistency
+        if detected_language == Language.SWAHILI:
+            return "sw"
+        elif detected_language == Language.ENGLISH:
+            return "en"
+    else:
+        try:
+            lang_detect = detect(question)
+            return lang_detect
+        except Exception as e:
+            print(f"Error with langdetect: {e}")
+            return "unknown"
 def nishauri(question: str, conversation_history: list[str]):
+    # Get conversation history
     context = " ".join([item["user"] + " " + item["chatbot"] for item in conversation_history])
+    ## Process greeting
+    # greet_response = process_greeting_response(question)
+    if contains_greeting_en(question) and not contains_follow_up(question):
+            greeting = (
+                f" The user previously asked and answered the following: {context}. "
+                f" The user just provided the following greeting: {question}. "
+                "Please respond accordingly in English."
+            )
+            completion = client.chat.completions.create(
+              model="gpt-4o",
+                messages=[
+                {"role": "user", "content": greeting}
+              ]
+            )
+            reply_to_user = completion.choices[0].message.content
+            conversation_history.append({"user": question, "chatbot": reply_to_user})
+            return reply_to_user, conversation_history
+    if contains_greeting_sw(question) and not contains_follow_up(question):
+            greeting = (
+                f" The user previously asked and answered the following: {context}. "
+                f" The user just provided the following greeting: {question}. "
+                "Please respond accordingly in Swahili."
+            )
+            completion = client.chat.completions.create(
+              model="gpt-4o",
+                messages=[
+                {"role": "user", "content": greeting}
+              ]
+            )
+            reply_to_user = completion.choices[0].message.content
+            conversation_history.append({"user": question, "chatbot": reply_to_user})
+            return reply_to_user, conversation_history
+    ## Process acknowledgment
+    if contains_acknowledgment_en(question) and not contains_follow_up(question):
+            acknowledgment = (
+                f" The user previously asked and answered the following: {context}. "
+                f" The user just provided the following acknowledgement: {question}. "
+                "Please respond accordingly in English."
+            )
+            completion = client.chat.completions.create(
+              model="gpt-4o",
+                messages=[
+                {"role": "user", "content": acknowledgment}
+              ]
+            )
+            reply_to_user = completion.choices[0].message.content
+            conversation_history.append({"user": question, "chatbot": reply_to_user})
+            return reply_to_user, conversation_history
+    if contains_acknowledgment_sw(question) and not contains_follow_up(question):
+            acknowledgment = (
+                f" The user previously asked and answered the following: {context}. "
+                f" The user just provided the following acknowledgment: {question}. "
+                "Please respond accordingly in Swahili."
+            )
+            completion = client.chat.completions.create(
+              model="gpt-4o",
+                messages=[
+                {"role": "user", "content": acknowledgment}
+              ]
+            )
+            reply_to_user = completion.choices[0].message.content
+            conversation_history.append({"user": question, "chatbot": reply_to_user})
+            return reply_to_user, conversation_history
+    ## If not greeting or acknowledgement, then proceed with RAG
+    ## Detect language of question - if Swahili, translate to English
+    lang_question = detect_language(question)
     if lang_question=="sw":
         question = GoogleTranslator(source='sw', target='en').translate(question)
     source2 = sources[2].text
     background = ("The person who asked the question is a person living with HIV."
+          " They are asking questions about HIV. Do not talk about anything that is not related to HIV. "
+          " Recognize that they already have HIV and do not suggest that they have to get tested"
+          " for HIV or take post-exposure prophylaxis, as that is not relevant, though their partners perhaps should."
+          " Do not suggest anything that is not relevant to someone who already has HIV."
+          " Do not mention in the response that the person is living with HIV."
+          " The following information about viral loads is authoritative for any question about viral loads:"
+          " Under 50 copies/ml is low detectable level,"
+          " 50 - 199 copies/ml is low level viremia, 200 - 999 is high level viremia, and "
+          " 1000 and above is suspected treatment failure."
+          " A high viral load or non-suppressed viral load is any viral load above 200 copies/ml."
+          " A suppressed viral load is one below 200 copies / ml.")
     question_final = (
     f" The user previously asked and answered the following: {context}. "