JDFPalladium committed on
Commit
fe4eab2
·
1 Parent(s): daec31b

updating language and intention detection

Browse files
Files changed (5) hide show
  1. Makefile +5 -1
  2. app.py +42 -87
  3. requirements.txt +2 -1
  4. utils/__init__.py +0 -0
  5. utils/helpers.py +103 -0
Makefile CHANGED
@@ -1,3 +1,7 @@
1
  install:
2
  pip install --upgrade pip &&\
3
- pip install -r requirements.txt
 
 
 
 
 
1
  install:
2
  pip install --upgrade pip &&\
3
+ pip install -r requirements.txt
4
+
5
+
6
+ lint:
7
+ pylint --disable=R,C app.py
app.py CHANGED
@@ -11,8 +11,10 @@ import gradio as gr
11
  from openai import OpenAI as OpenAIOG
12
  from llama_index.llms.openai import OpenAI
13
  from llama_index.core import StorageContext, load_index_from_storage
 
14
  from deep_translator import GoogleTranslator
15
  from dotenv import load_dotenv
 
16
 
17
  # Load environment variables from .env file
18
  load_dotenv()
@@ -28,105 +30,59 @@ client = OpenAIOG()
28
  # Load index for retrieval
29
  storage_context = StorageContext.from_defaults(persist_dir="arv_metadata")
30
  index = load_index_from_storage(storage_context)
31
- retriever = index.as_retriever(similarity_top_k=3)
32
-
33
- # Define keyword lists
34
- acknowledgment_keywords_sw = ["sawa", "ndiyo", "naam", "hakika", "asante", "nimeelewa", "nimekupata", "ni kweli", "kwa hakika", "nimesikia", "ahsante"]
35
- acknowledgment_keywords_en = ["thanks", "thank you", "thx", "ok", "okay", "great", "got it", "appreciate", "good", "makes sense"]
36
- follow_up_keywords = ["but", "also", "and", "what", "how", "why", "when", "is", "?", "lakini", "pia", "na", "nini", "vipi", "kwanini", "wakati"]
37
- greeting_keywords_sw = ["sasa", "niaje", "habari", "mambo", "jambo", "shikamoo", "marahaba", "hujambo", "hamjambo", "salama", "vipi"]
38
- greeting_keywords_en = ["hi", "hello", "hey", "how's it", "what's up", "yo", "howdy"]
39
- #%%
40
- # Define helper functions
41
-
42
- def contains_exact_word_or_phrase(text, keywords):
43
- """Check if the given text contains any exact keyword from the list."""
44
- text = text.lower()
45
- return any(re.search(r'\b' + re.escape(keyword) + r'\b', text) for keyword in keywords)
46
-
47
- def contains_greeting_sw(text):
48
- return contains_exact_word_or_phrase(text, greeting_keywords_sw)
49
-
50
- def contains_greeting_en(text):
51
- return contains_exact_word_or_phrase(text, greeting_keywords_en)
52
-
53
- def contains_acknowledgment_sw(text):
54
- return contains_exact_word_or_phrase(text, acknowledgment_keywords_sw)
55
-
56
- def contains_acknowledgment_en(text):
57
- return contains_exact_word_or_phrase(text, acknowledgment_keywords_en)
58
-
59
- def contains_follow_up(text):
60
- return contains_exact_word_or_phrase(text, follow_up_keywords)
61
-
62
- def detect_language(text):
63
- """Detect language of a given text using Lingua for short texts and langdetect for longer ones."""
64
- if len(text.split()) < 5:
65
- languages = [Language.ENGLISH, Language.SWAHILI]
66
- detector = LanguageDetectorBuilder.from_languages(*languages).build()
67
- detected_language = detector.detect_language_of(text)
68
- return "sw" if detected_language == Language.SWAHILI else "en"
69
- try:
70
- return detect(text)
71
- except Exception as e:
72
- logging.warning(f"Language detection error: {e}")
73
- return "unknown"
74
  #%%
75
  # Define Gradio function
76
  def nishauri(question, conversation_history: list[str]):
77
 
78
  """Process user query, detect language, handle greetings, acknowledgments, and retrieve relevant information."""
79
  context = " ".join([item["user"] + " " + item["chatbot"] for item in conversation_history])
80
-
81
- # Process greetings and acknowledgments
82
- for lang, contains_greeting, contains_acknowledgment in [("en", contains_greeting_en, contains_acknowledgment_en), ("sw", contains_greeting_sw, contains_acknowledgment_sw)]:
83
- if contains_greeting(question) and not contains_follow_up(question):
84
- prompt = f"The user said: {question}. Respond accordingly in {lang}."
85
- elif contains_acknowledgment(question) and not contains_follow_up(question):
86
- prompt = f"The user acknowledged: {question}. Respond accordingly in {lang}."
87
- else:
88
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  completion = client.chat.completions.create(
90
- model="gpt-4o",
91
  messages=[{"role": "user", "content": prompt}]
92
  )
93
  reply_to_user = completion.choices[0].message.content
94
  conversation_history.append({"user": question, "chatbot": reply_to_user})
95
- source1return = ""
96
- source2return = ""
97
- source3return = ""
98
- return reply_to_user, source1return, source2return, source3return, conversation_history
99
 
100
- # Detect language and translate if needed
101
- lang_question = detect_language(question)
102
  if lang_question == "sw":
103
  question = GoogleTranslator(source='sw', target='en').translate(question)
104
 
105
  # Retrieve relevant sources
106
  sources = retriever.retrieve(question)
107
- retrieved_text = "\n\n".join([f"Source {i+1}: {source.text}" for i, source in enumerate(sources[:5])])
108
-
109
- source1return = ("File Name: " +
110
- sources[0].metadata["file_name"] +
111
- "\nPage Number: " +
112
- sources[0].metadata["page_label"] +
113
- "\n Source Text: " +
114
- sources[0].text)
115
-
116
- source2return = ("File Name: " +
117
- sources[1].metadata["file_name"] +
118
- "\nPage Number: " +
119
- sources[1].metadata["page_label"] +
120
- "\n Source Text: " +
121
- sources[1].text)
122
-
123
- source3return = ("File Name: " +
124
- sources[2].metadata["file_name"] +
125
- "\nPage Number: " +
126
- sources[2].metadata["page_label"] +
127
- "\n Source Text: " +
128
- sources[2].text)
129
-
130
 
131
  # Combine into new user question - conversation history, new question, retrieved sources
132
  question_final = (
@@ -141,7 +97,9 @@ def nishauri(question, conversation_history: list[str]):
141
  # Set LLM instructions. If user consented, add user parameters, otherwise proceed without
142
  system_prompt = (
143
  "You are a helpful assistant who only answers questions about HIV.\n"
144
- "- Only answers questions about HIV (Human Immunodeficiency Virus). Recognize that users may type 'HIV' with any capitalization (e.g., HIV, hiv, Hiv, etc.) or make minor typos (e.g., hvi, hiv/aids). Use your best judgment to understand when a user intends to refer to HIV. Politely correct any significant misunderstandings, but otherwise proceed to answer normally.\n"
 
 
145
  "- Do not answer questions about other topics (e.g., malaria or tuberculosis).\n"
146
  "- If a question is unrelated to HIV, politely respond that you can only answer HIV-related questions.\n\n"
147
 
@@ -188,7 +146,7 @@ def nishauri(question, conversation_history: list[str]):
188
  reply_to_user = GoogleTranslator(source='auto', target='sw').translate(reply_to_user)
189
 
190
  # return system_prompt, conversation_history
191
- return reply_to_user, source1return, source2return, source3return, conversation_history
192
 
193
  #%%
194
  demo = gr.Interface(
@@ -198,9 +156,6 @@ demo = gr.Interface(
198
  inputs=["text", gr.State(value=[])],
199
  outputs=[
200
  gr.Textbox(label = "Nuru Response", type = "text"),
201
- gr.Textbox(label = "Source 1", max_lines = 10, autoscroll = False, type = "text"),
202
- gr.Textbox(label = "Source 2", max_lines = 10, autoscroll = False, type = "text"),
203
- gr.Textbox(label = "Source 3", max_lines = 10, autoscroll = False, type = "text"),
204
  gr.State()
205
  ],
206
  )
 
11
  from openai import OpenAI as OpenAIOG
12
  from llama_index.llms.openai import OpenAI
13
  from llama_index.core import StorageContext, load_index_from_storage
14
+ from llama_index.core.postprocessor.llm_rerank import LLMRerank
15
  from deep_translator import GoogleTranslator
16
  from dotenv import load_dotenv
17
+ import utils.helpers as helpers
18
 
19
  # Load environment variables from .env file
20
  load_dotenv()
 
30
  # Load index for retrieval
31
  storage_context = StorageContext.from_defaults(persist_dir="arv_metadata")
32
  index = load_index_from_storage(storage_context)
33
+ retriever = index.as_retriever(similarity_top_k=10,
34
+ # Similarity threshold for filtering
35
+ similarity_threshold=0.5,
36
+ # Use LLM reranking to filter results
37
+ reranker=LLMRerank(top_n=3))
38
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  #%%
40
  # Define Gradio function
41
  def nishauri(question, conversation_history: list[str]):
42
 
43
  """Process user query, detect language, handle greetings, acknowledgments, and retrieve relevant information."""
44
  context = " ".join([item["user"] + " " + item["chatbot"] for item in conversation_history])
45
+ # formatted_history = convert_conversation_format(conversation_history)
46
+ # summary = summarize_conversation(formatted_history)
47
+
48
+ # detect language of user
49
+ lang_question = helpers.detect_language(question, Language, LanguageDetectorBuilder, client)
50
+ print(lang_question)
51
+
52
+ # If user is making a greeting or acknowledgement, address that accordingly
53
+ intent = helpers.detect_intention(question, client = client)
54
+ if intent == "greeting":
55
+ prompt = f"""
56
+ The user greeted you as follows: {question}.
57
+ Respond by asking if they have any questions about HIV.
58
+ Respond in {"Swahili" if lang_question == "sw" else "English"}.
59
+ """
60
+ elif intent == "acknowledgment":
61
+ prompt = f"""
62
+ The user acknowledged a response you gave to a prior question as follows {question}.
63
+ Respond by saying you are ready to help if they have any more questions.
64
+ Respond in {"Swahili" if lang_question == "sw" else "English"}.
65
+ """
66
+ else:
67
+ prompt = None
68
+
69
+ if prompt:
70
  completion = client.chat.completions.create(
71
+ model="gpt-3.5-turbo",
72
  messages=[{"role": "user", "content": prompt}]
73
  )
74
  reply_to_user = completion.choices[0].message.content
75
  conversation_history.append({"user": question, "chatbot": reply_to_user})
76
+ return reply_to_user, conversation_history
 
 
 
77
 
78
+ # If the user is asking a question, proceed with the RAG pipeline
79
+ # Translate if needed
80
  if lang_question == "sw":
81
  question = GoogleTranslator(source='sw', target='en').translate(question)
82
 
83
  # Retrieve relevant sources
84
  sources = retriever.retrieve(question)
85
+ retrieved_text = "\n\n".join([f"Source {i+1}: {source.text}" for i, source in enumerate(sources[:3])])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  # Combine into new user question - conversation history, new question, retrieved sources
88
  question_final = (
 
97
  # Set LLM instructions. If user consented, add user parameters, otherwise proceed without
98
  system_prompt = (
99
  "You are a helpful assistant who only answers questions about HIV.\n"
100
+ "- Only answers questions about HIV (Human Immunodeficiency Virus).\n"
101
+ "- Recognize that users may type 'HIV' with any capitalization (e.g., HIV, hiv, Hiv, etc.) or make minor typos (e.g., hvi, hiv/aids).\n"
102
+ "- Use your best judgment to understand when a user intends to refer to HIV. Politely correct any significant misunderstandings, but otherwise proceed to answer normally.\n"
103
  "- Do not answer questions about other topics (e.g., malaria or tuberculosis).\n"
104
  "- If a question is unrelated to HIV, politely respond that you can only answer HIV-related questions.\n\n"
105
 
 
146
  reply_to_user = GoogleTranslator(source='auto', target='sw').translate(reply_to_user)
147
 
148
  # return system_prompt, conversation_history
149
+ return reply_to_user, conversation_history
150
 
151
  #%%
152
  demo = gr.Interface(
 
156
  inputs=["text", gr.State(value=[])],
157
  outputs=[
158
  gr.Textbox(label = "Nuru Response", type = "text"),
 
 
 
159
  gr.State()
160
  ],
161
  )
requirements.txt CHANGED
@@ -3,4 +3,5 @@ llama_index==0.10.51
3
  langdetect==1.0.9
4
  deep-translator==1.11.4
5
  lingua-language-detector==2.0.2
6
- dotenv==0.9.9
 
 
3
  langdetect==1.0.9
4
  deep-translator==1.11.4
5
  lingua-language-detector==2.0.2
6
+ dotenv==0.9.9
7
+ pylint
utils/__init__.py ADDED
File without changes
utils/helpers.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def detect_language(text, Language, LanguageDetectorBuilder, client):
    """Detect whether *text* is English ("en") or Swahili ("sw").

    Texts shorter than 5 words are classified with an LLM, because
    statistical detectors are unreliable on a handful of words; longer
    texts use the Lingua detector, which is cheaper and deterministic.

    Args:
        text: The user message to classify.
        Language: The lingua ``Language`` enum, injected by the caller.
        LanguageDetectorBuilder: The lingua builder class, injected by the caller.
        client: An OpenAI-compatible client exposing ``chat.completions.create``.

    Returns:
        "en", "sw", or "unknown" when detection fails or the model
        reports some other language.
    """
    # Local import: helpers.py has no module-level imports, and the original
    # code raised NameError on `logging` in both exception handlers.
    import logging

    text = text.lower().strip()

    # Use LLM for short texts
    if len(text.split()) < 5:
        system_prompt = """
        You are a language detection assistant. Identify the language of the given text.
        Return only the language code: "en" for English or "sw" for Swahili.
        If the language is neither English nor Swahili, return "unknown".
        """

        user_message = f"Text: \"{text}\""

        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message}
                ],
                temperature=0  # Deterministic output
            )
            # Strip whitespace the model may add around the language code.
            detected_language = completion.choices[0].message.content.strip()
            return detected_language
        except Exception as e:
            logging.warning(f"Language detection error (LLM): {e}")
            return "unknown"

    # Use Lingua for longer texts
    try:
        languages = [Language.ENGLISH, Language.SWAHILI]
        detector = LanguageDetectorBuilder.from_languages(*languages).build()
        detected_language = detector.detect_language_of(text)
        return "sw" if detected_language == Language.SWAHILI else "en"
    except Exception as e:
        logging.warning(f"Language detection error (Lingua): {e}")
        return "unknown"
39
+
40
def summarize_conversation(conversation, system_prompt=None, client=None):
    """
    Summarizes a conversation using GPT-4o.

    Args:
        conversation (list): A list of dicts with 'role' and 'content'.
        system_prompt (str): Optional custom system instruction for summarization.
        client: An OpenAI-compatible client exposing ``chat.completions.create``.
            Required. It defaults to None only to keep the signature
            backward-compatible; the original body referenced a global
            ``client`` that is never defined in this module (NameError).

    Returns:
        str: The summary of the conversation.

    Raises:
        ValueError: If *client* is not supplied.
    """
    if client is None:
        raise ValueError("summarize_conversation requires an OpenAI client")

    # Default system prompt
    if system_prompt is None:
        system_prompt = "You are a helpful assistant that summarizes conversations clearly and concisely."

    # Compose messages: system instruction, then the conversation, then the ask.
    messages = [{"role": "system", "content": system_prompt}]
    messages += conversation
    messages.append({
        "role": "user",
        "content": "Please summarize this conversation in a concise and clear paragraph."
    })

    # Call GPT-4o deterministically (temperature 0).
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.0
    )

    return completion.choices[0].message.content
71
+
72
def convert_conversation_format(conversation_history):
    """Flatten chatbot history into OpenAI chat-message dicts.

    Each ``{"user": ..., "chatbot": ...}`` turn becomes two messages in
    order: one with role "user", one with role "assistant".
    """
    return [
        message
        for turn in conversation_history
        for message in (
            {"role": "user", "content": turn["user"]},
            {"role": "assistant", "content": turn["chatbot"]},
        )
    ]
78
+
79
def detect_intention(user_input, client):
    """Classify a user message as "greeting", "acknowledgment", or "message".

    Args:
        user_input: The raw user message (English or Swahili).
        client: An OpenAI-compatible client exposing ``chat.completions.create``.

    Returns:
        str: One of "greeting", "acknowledgment", or "message". On any API
        failure, falls back to "message" so the caller's RAG pipeline still
        runs (mirrors detect_language's graceful degradation).
    """
    system_prompt = """
    You are an intent classification assistant. Classify the user's message into one of the following categories:

    - "greeting" for messages like "hi", "hello", or similar
    - "acknowledgment" for messages like "thanks", "okay", or similar
    - "message" for anything else that may require a response, including health concerns or information requests

    The user may speak in English or Swahili. Be aware that they might not use proper punctuation or grammar.

    Return only the label: "greeting", "acknowledgment", or "message".
    """

    user_message = f"Message: \"{user_input}\""

    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            temperature=0  # for deterministic output
        )
        # Strip whitespace the model may add around the label: the caller
        # compares with `==` against the exact strings, so an unstripped
        # "greeting\n" would silently be treated as a regular message.
        return completion.choices[0].message.content.strip()
    except Exception as e:
        import logging  # helpers.py has no module-level imports
        logging.warning(f"Intent detection error: {e}")
        return "message"