Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -527,7 +527,7 @@ class PersonalizedKazakhAssistant:
|
|
| 527 |
|
| 528 |
try:
|
| 529 |
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 530 |
-
bold_pattern = r'\*\*([
|
| 531 |
bold_matches = re.findall(bold_pattern, response)
|
| 532 |
|
| 533 |
for term in bold_matches:
|
|
@@ -536,31 +536,72 @@ class PersonalizedKazakhAssistant:
|
|
| 536 |
print(f"Skipped term {normalized_term}: Invalid length or already seen")
|
| 537 |
continue
|
| 538 |
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
break
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 564 |
|
| 565 |
return terms
|
| 566 |
|
|
@@ -605,8 +646,8 @@ class PersonalizedKazakhAssistant:
|
|
| 605 |
memory=memory
|
| 606 |
)
|
| 607 |
|
| 608 |
-
def process_message(self, message: str, user_id: str = "default_user", session_token: str = None,
|
| 609 |
-
"""Process user message with proper user session management
|
| 610 |
|
| 611 |
if session_token and not self.tracker.validate_session(user_id, session_token):
|
| 612 |
return f"Session expired. Please login again in {target_language}."
|
|
@@ -646,9 +687,6 @@ class PersonalizedKazakhAssistant:
|
|
| 646 |
elif message.lower().startswith('/help'):
|
| 647 |
return self.get_help_message()
|
| 648 |
|
| 649 |
-
if use_direct_gemini:
|
| 650 |
-
return self.process_direct_gemini(message, user_id, target_language)
|
| 651 |
-
|
| 652 |
# Retrieve relevant documents from vectorstore
|
| 653 |
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 654 |
context = "\n".join([doc.page_content for doc in retrieved_docs])
|
|
@@ -686,7 +724,7 @@ class PersonalizedKazakhAssistant:
|
|
| 686 |
words_to_review=''.join([f" - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
|
| 687 |
for word in words_to_review]),
|
| 688 |
mastered_words=''.join([f" - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
|
| 689 |
-
|
| 690 |
)
|
| 691 |
|
| 692 |
# Construct prompt with context, history, and progress
|
|
@@ -917,56 +955,14 @@ Start learning by asking about any Kazakh term! 🌟
|
|
| 917 |
session_token = self.tracker.create_user_session(user_id)
|
| 918 |
return session_token
|
| 919 |
|
| 920 |
-
def process_direct_gemini(self, message: str, user_id: str, target_language: str = "English") -> str:
|
| 921 |
-
"""Process message using direct Gemini with conversation memory for context."""
|
| 922 |
-
try:
|
| 923 |
-
memory = self.get_user_memory(user_id)
|
| 924 |
-
chat_history = ""
|
| 925 |
-
for msg in memory.chat_memory.messages[-10:]:
|
| 926 |
-
if isinstance(msg, HumanMessage):
|
| 927 |
-
chat_history += f"User: {msg.content}\n"
|
| 928 |
-
elif isinstance(msg, AIMessage):
|
| 929 |
-
chat_history += f"Assistant: {msg.content}\n"
|
| 930 |
-
|
| 931 |
-
direct_prompt = """
|
| 932 |
-
You are a friendly and supportive Kazakh language learning assistant. Your role is to help users learn Kazakh vocabulary, grammar, and idioms in a clear, concise, and engaging way. Respond in the user's primary language, inferred from their input, unless a specific language (English, Kazakh, or Russian) is requested. Provide practical examples and explanations tailored to language learners. Keep responses concise (under 200 words) and encouraging. Use your internal knowledge to ensure accuracy and relevance, focusing exclusively on Kazakh language learning.
|
| 933 |
-
|
| 934 |
-
Previous conversation context:
|
| 935 |
-
{chat_history}
|
| 936 |
-
"""
|
| 937 |
-
|
| 938 |
-
if target_language != "English" and not any(
|
| 939 |
-
keyword in message.lower() for keyword in ['kazakh', 'қазақша', 'қазақ тілінде', 'russian', 'русский', 'орысша']
|
| 940 |
-
):
|
| 941 |
-
modified_message = f"Explain in {target_language}: {message}"
|
| 942 |
-
else:
|
| 943 |
-
modified_message = message
|
| 944 |
-
|
| 945 |
-
direct_model = genai.GenerativeModel(
|
| 946 |
-
model_name=self.MODEL,
|
| 947 |
-
system_instruction=direct_prompt.format(chat_history=chat_history),
|
| 948 |
-
generation_config={
|
| 949 |
-
"temperature": 0.7,
|
| 950 |
-
"max_output_tokens": 200
|
| 951 |
-
}
|
| 952 |
-
)
|
| 953 |
-
|
| 954 |
-
response = direct_model.generate_content(modified_message).text
|
| 955 |
-
|
| 956 |
-
memory.chat_memory.add_user_message(message)
|
| 957 |
-
memory.chat_memory.add_ai_message(response)
|
| 958 |
-
|
| 959 |
-
return response
|
| 960 |
-
except Exception as e:
|
| 961 |
-
return f"Error processing direct Gemini request: {str(e)}"
|
| 962 |
|
| 963 |
assistant = PersonalizedKazakhAssistant()
|
| 964 |
|
| 965 |
-
def chat_interface(message, history,
|
| 966 |
-
"""Chat interface for Gradio
|
| 967 |
try:
|
| 968 |
web_user_id = "web_user_default"
|
| 969 |
-
response = assistant.process_message(message, web_user_id,
|
| 970 |
return response
|
| 971 |
except Exception as e:
|
| 972 |
return f"Sorry, I encountered an error: {str(e)}. Please try again."
|
|
@@ -987,10 +983,10 @@ def api_login(user_id: str) -> dict:
|
|
| 987 |
"error": str(e)
|
| 988 |
}
|
| 989 |
|
| 990 |
-
def api_chat(message: str, user_id: str, session_token: str = None,
|
| 991 |
-
"""API endpoint for chat functionality with proper user session
|
| 992 |
try:
|
| 993 |
-
response = assistant.process_message(message, user_id, session_token,
|
| 994 |
return {
|
| 995 |
"success": True,
|
| 996 |
"response": response,
|
|
@@ -1186,9 +1182,8 @@ with gr.Blocks(title="🇰🇿 Kazakh Learning API") as demo:
|
|
| 1186 |
gr.Markdown("### Multi-User Chat Interface + API Endpoints for Mobile Integration")
|
| 1187 |
|
| 1188 |
with gr.Tab("💬 Chat Interface"):
|
| 1189 |
-
gr.Markdown("
|
| 1190 |
with gr.Row():
|
| 1191 |
-
use_direct_gemini = gr.Checkbox(label="Direct Gemini Mode (No RAG/Tracking)", value=False)
|
| 1192 |
target_language = gr.Dropdown(
|
| 1193 |
label="Explanation Language",
|
| 1194 |
choices=["English", "Kazakh", "Russian"],
|
|
@@ -1196,17 +1191,17 @@ with gr.Blocks(title="🇰🇿 Kazakh Learning API") as demo:
|
|
| 1196 |
)
|
| 1197 |
chat_interface_component = gr.ChatInterface(
|
| 1198 |
fn=chat_interface,
|
| 1199 |
-
additional_inputs=[
|
| 1200 |
type="messages",
|
| 1201 |
examples=[
|
| 1202 |
-
["сәлем деген не?",
|
| 1203 |
-
["күләпара не үшін керек?",
|
| 1204 |
-
["/progress",
|
| 1205 |
-
["/recommendations",
|
| 1206 |
-
["/review",
|
| 1207 |
-
["/mastered",
|
| 1208 |
-
["Explain Kazakh noun cases in Russian",
|
| 1209 |
-
["Teach me Kazakh verb conjugation in English",
|
| 1210 |
]
|
| 1211 |
)
|
| 1212 |
|
|
|
|
| 527 |
|
| 528 |
try:
|
| 529 |
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 530 |
+
bold_pattern = r'\*\*([^\*]+)\*\*' # Match any bolded text
|
| 531 |
bold_matches = re.findall(bold_pattern, response)
|
| 532 |
|
| 533 |
for term in bold_matches:
|
|
|
|
| 536 |
print(f"Skipped term {normalized_term}: Invalid length or already seen")
|
| 537 |
continue
|
| 538 |
|
| 539 |
+
# Initialize category and definition
|
| 540 |
+
category = "word" # Default to word
|
| 541 |
+
definition = ""
|
| 542 |
+
term_matched = False
|
| 543 |
+
original_term = term # Preserve original case for tracking
|
| 544 |
+
|
| 545 |
+
# Check if term is multi-word (likely an idiom)
|
| 546 |
+
if len(term.split()) > 1:
|
| 547 |
+
category = "idiom"
|
| 548 |
+
|
| 549 |
+
# Check for exact match in known terms
|
| 550 |
+
for known_term in self.known_terms:
|
| 551 |
+
if normalized_term == self.normalize_term(known_term):
|
| 552 |
+
term_matched = True
|
| 553 |
+
original_term = known_term
|
| 554 |
+
for doc in retrieved_docs:
|
| 555 |
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 556 |
+
if normalized_term in self.normalize_term(doc.page_content):
|
| 557 |
+
if 'idioms' in doc_type or 'тіркес' in doc_type:
|
| 558 |
+
category = "idiom"
|
| 559 |
+
elif 'words' in doc_type:
|
| 560 |
+
category = "word"
|
| 561 |
+
elif 'grammar' in doc_type:
|
| 562 |
+
category = "grammar"
|
| 563 |
+
definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
|
| 564 |
+
break
|
| 565 |
+
break
|
| 566 |
+
|
| 567 |
+
# If no exact match, try fuzzy matching for idioms with suffixes
|
| 568 |
+
if not term_matched:
|
| 569 |
+
for known_term in self.known_terms:
|
| 570 |
+
normalized_known = self.normalize_term(known_term)
|
| 571 |
+
if (normalized_term.startswith(normalized_known) and
|
| 572 |
+
len(normalized_term) <= len(normalized_known) + 4):
|
| 573 |
+
term_matched = True
|
| 574 |
+
normalized_term = normalized_known
|
| 575 |
+
original_term = known_term
|
| 576 |
+
for doc in retrieved_docs:
|
| 577 |
+
if normalized_known in self.normalize_term(doc.page_content):
|
| 578 |
+
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 579 |
+
if 'idioms' in doc_type or 'тіркес' in doc_type:
|
| 580 |
+
category = "idiom"
|
| 581 |
+
elif 'words' in doc_type:
|
| 582 |
+
category = "word"
|
| 583 |
+
elif 'grammar' in doc_type:
|
| 584 |
+
category = "grammar"
|
| 585 |
+
definition = self.extract_clean_definition(normalized_known, doc.page_content, response)
|
| 586 |
+
break
|
| 587 |
break
|
| 588 |
+
|
| 589 |
+
# If term is multi-word and not matched, assume it's an idiom
|
| 590 |
+
if not term_matched and len(term.split()) > 1:
|
| 591 |
+
category = "idiom"
|
| 592 |
+
definition = self.extract_clean_definition(normalized_term, "", response)
|
| 593 |
+
|
| 594 |
+
# Single-word terms from words folder should be categorized as words
|
| 595 |
+
if term_matched and len(original_term.split()) == 1:
|
| 596 |
+
for doc in retrieved_docs:
|
| 597 |
+
if 'words' in doc.metadata.get('doc_type', '').lower():
|
| 598 |
+
category = "word"
|
| 599 |
+
break
|
| 600 |
+
|
| 601 |
+
if definition:
|
| 602 |
+
terms.append((original_term, category, definition))
|
| 603 |
+
seen_terms.add(normalized_term)
|
| 604 |
+
print(f"Added bolded term: {original_term}, category: {category}, definition: {definition}")
|
| 605 |
|
| 606 |
return terms
|
| 607 |
|
|
|
|
| 646 |
memory=memory
|
| 647 |
)
|
| 648 |
|
| 649 |
+
def process_message(self, message: str, user_id: str = "default_user", session_token: str = None, target_language: str = "English") -> str:
|
| 650 |
+
"""Process user message with proper user session management"""
|
| 651 |
|
| 652 |
if session_token and not self.tracker.validate_session(user_id, session_token):
|
| 653 |
return f"Session expired. Please login again in {target_language}."
|
|
|
|
| 687 |
elif message.lower().startswith('/help'):
|
| 688 |
return self.get_help_message()
|
| 689 |
|
|
|
|
|
|
|
|
|
|
| 690 |
# Retrieve relevant documents from vectorstore
|
| 691 |
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 692 |
context = "\n".join([doc.page_content for doc in retrieved_docs])
|
|
|
|
| 724 |
words_to_review=''.join([f" - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
|
| 725 |
for word in words_to_review]),
|
| 726 |
mastered_words=''.join([f" - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
|
| 727 |
+
for word in mastered_words])
|
| 728 |
)
|
| 729 |
|
| 730 |
# Construct prompt with context, history, and progress
|
|
|
|
| 955 |
session_token = self.tracker.create_user_session(user_id)
|
| 956 |
return session_token
|
| 957 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 958 |
|
| 959 |
assistant = PersonalizedKazakhAssistant()
|
| 960 |
|
| 961 |
+
def chat_interface(message, history, target_language):
|
| 962 |
+
"""Chat interface for Gradio"""
|
| 963 |
try:
|
| 964 |
web_user_id = "web_user_default"
|
| 965 |
+
response = assistant.process_message(message, web_user_id, target_language=target_language)
|
| 966 |
return response
|
| 967 |
except Exception as e:
|
| 968 |
return f"Sorry, I encountered an error: {str(e)}. Please try again."
|
|
|
|
| 983 |
"error": str(e)
|
| 984 |
}
|
| 985 |
|
| 986 |
+
def api_chat(message: str, user_id: str, session_token: str = None, target_language: str = "English") -> dict:
|
| 987 |
+
"""API endpoint for chat functionality with proper user session"""
|
| 988 |
try:
|
| 989 |
+
response = assistant.process_message(message, user_id, session_token, target_language)
|
| 990 |
return {
|
| 991 |
"success": True,
|
| 992 |
"response": response,
|
|
|
|
| 1182 |
gr.Markdown("### Multi-User Chat Interface + API Endpoints for Mobile Integration")
|
| 1183 |
|
| 1184 |
with gr.Tab("💬 Chat Interface"):
|
| 1185 |
+
gr.Markdown("Select the language for explanations.")
|
| 1186 |
with gr.Row():
|
|
|
|
| 1187 |
target_language = gr.Dropdown(
|
| 1188 |
label="Explanation Language",
|
| 1189 |
choices=["English", "Kazakh", "Russian"],
|
|
|
|
| 1191 |
)
|
| 1192 |
chat_interface_component = gr.ChatInterface(
|
| 1193 |
fn=chat_interface,
|
| 1194 |
+
additional_inputs=[target_language],
|
| 1195 |
type="messages",
|
| 1196 |
examples=[
|
| 1197 |
+
["сәлем деген не?", "English"],
|
| 1198 |
+
["күләпара не үшін керек?", "English"],
|
| 1199 |
+
["/progress", "English"],
|
| 1200 |
+
["/recommendations", "English"],
|
| 1201 |
+
["/review", "English"],
|
| 1202 |
+
["/mastered", "English"],
|
| 1203 |
+
["Explain Kazakh noun cases in Russian", "Russian"],
|
| 1204 |
+
["Teach me Kazakh verb conjugation in English", "English"]
|
| 1205 |
]
|
| 1206 |
)
|
| 1207 |
|