Spaces:

GuestUser33
/

kazakh-learning-api

Running

App Files Community

GuestUser33 committed on May 29, 2025

Commit

51a5da5

verified ·

1 Parent(s): 45ea6b5

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -89

app.py CHANGED Viewed

@@ -527,7 +527,7 @@ class PersonalizedKazakhAssistant:
         try:
             retrieved_docs = self.vectorstore.similarity_search(message, k=5)
-            bold_pattern = r'\*\*([А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*)\*\*'
             bold_matches = re.findall(bold_pattern, response)
             for term in bold_matches:
@@ -536,31 +536,72 @@ class PersonalizedKazakhAssistant:
                     print(f"Skipped term {normalized_term}: Invalid length or already seen")
                     continue
-                if normalized_term in self.known_terms:
-                    category = "word"  # Default to word
-                    definition = ""
-                    for doc in retrieved_docs:
-                        if normalized_term in self.normalize_term(doc.page_content):
                             doc_type = doc.metadata.get('doc_type', '').lower()
-                            # Prioritize folder structure
-                            if 'idioms' in doc_type.lower() or 'тіркес' in doc_type.lower():
-                                category = "idiom"
-                            elif 'words' in doc_type.lower():
-                                category = "word"  # Ensure terms from words folder are words
-                            elif 'grammar' in doc_type.lower():
-                                category = "grammar"
-                            definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
                             break
-                    # Additional check: single-word terms from words folder should not be idioms
-                    if len(term.split()) == 1 and any('words' in doc.metadata.get('doc_type', '').lower() for doc in retrieved_docs):
-                        category = "word"
-                    if not definition:
-                        definition = self.extract_clean_definition(normalized_term, "", response)
-                    if definition:
-                        terms.append((term, category, definition))
-                        seen_terms.add(normalized_term)
-                        print(f"Added bolded term: {term}, category: {category}, definition: {definition}")
             return terms
@@ -605,8 +646,8 @@ class PersonalizedKazakhAssistant:
             memory=memory
         )
-    def process_message(self, message: str, user_id: str = "default_user", session_token: str = None, use_direct_gemini: bool = False, target_language: str = "English") -> str:
-        """Process user message with proper user session management and toggle for direct Gemini"""
         if session_token and not self.tracker.validate_session(user_id, session_token):
             return f"Session expired. Please login again in {target_language}."
@@ -646,9 +687,6 @@ class PersonalizedKazakhAssistant:
         elif message.lower().startswith('/help'):
             return self.get_help_message()
-        if use_direct_gemini:
-            return self.process_direct_gemini(message, user_id, target_language)
         # Retrieve relevant documents from vectorstore
         retrieved_docs = self.vectorstore.similarity_search(message, k=5)
         context = "\n".join([doc.page_content for doc in retrieved_docs])
@@ -686,7 +724,7 @@ class PersonalizedKazakhAssistant:
                 words_to_review=''.join([f"  - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
                                         for word in words_to_review]),
                 mastered_words=''.join([f"  - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
-                                    for word in mastered_words])
             )
         # Construct prompt with context, history, and progress
@@ -917,56 +955,14 @@ Start learning by asking about any Kazakh term! 🌟
         session_token = self.tracker.create_user_session(user_id)
         return session_token
-    def process_direct_gemini(self, message: str, user_id: str, target_language: str = "English") -> str:
-        """Process message using direct Gemini with conversation memory for context."""
-        try:
-            memory = self.get_user_memory(user_id)
-            chat_history = ""
-            for msg in memory.chat_memory.messages[-10:]:
-                if isinstance(msg, HumanMessage):
-                    chat_history += f"User: {msg.content}\n"
-                elif isinstance(msg, AIMessage):
-                    chat_history += f"Assistant: {msg.content}\n"
-            direct_prompt = """
-            You are a friendly and supportive Kazakh language learning assistant. Your role is to help users learn Kazakh vocabulary, grammar, and idioms in a clear, concise, and engaging way. Respond in the user's primary language, inferred from their input, unless a specific language (English, Kazakh, or Russian) is requested. Provide practical examples and explanations tailored to language learners. Keep responses concise (under 200 words) and encouraging. Use your internal knowledge to ensure accuracy and relevance, focusing exclusively on Kazakh language learning.
-            Previous conversation context:
-            {chat_history}
-            """
-            if target_language != "English" and not any(
-                keyword in message.lower() for keyword in ['kazakh', 'қазақша', 'қазақ тілінде', 'russian', 'русский', 'орысша']
-            ):
-                modified_message = f"Explain in {target_language}: {message}"
-            else:
-                modified_message = message
-            direct_model = genai.GenerativeModel(
-                model_name=self.MODEL,
-                system_instruction=direct_prompt.format(chat_history=chat_history),
-                generation_config={
-                    "temperature": 0.7,
-                    "max_output_tokens": 200
-                }
-            )
-            response = direct_model.generate_content(modified_message).text
-            memory.chat_memory.add_user_message(message)
-            memory.chat_memory.add_ai_message(response)
-            return response
-        except Exception as e:
-            return f"Error processing direct Gemini request: {str(e)}"
 assistant = PersonalizedKazakhAssistant()
-def chat_interface(message, history, use_direct_gemini, target_language):
-    """Chat interface for Gradio with toggle for direct Gemini mode"""
     try:
         web_user_id = "web_user_default"
-        response = assistant.process_message(message, web_user_id, use_direct_gemini=use_direct_gemini, target_language=target_language)
         return response
     except Exception as e:
         return f"Sorry, I encountered an error: {str(e)}. Please try again."
@@ -987,10 +983,10 @@ def api_login(user_id: str) -> dict:
             "error": str(e)
         }
-def api_chat(message: str, user_id: str, session_token: str = None, use_direct_gemini: bool = False, target_language: str = "English") -> dict:
-    """API endpoint for chat functionality with proper user session and direct Gemini toggle"""
     try:
-        response = assistant.process_message(message, user_id, session_token, use_direct_gemini, target_language)
         return {
             "success": True,
             "response": response,
@@ -1186,9 +1182,8 @@ with gr.Blocks(title="🇰🇿 Kazakh Learning API") as demo:
     gr.Markdown("### Multi-User Chat Interface + API Endpoints for Mobile Integration")
     with gr.Tab("💬 Chat Interface"):
-        gr.Markdown("Toggle **Direct Gemini Mode** to learn Kazakh grammar without RAG. Select the language for explanations.")
         with gr.Row():
-            use_direct_gemini = gr.Checkbox(label="Direct Gemini Mode (No RAG/Tracking)", value=False)
             target_language = gr.Dropdown(
                 label="Explanation Language",
                 choices=["English", "Kazakh", "Russian"],
@@ -1196,17 +1191,17 @@ with gr.Blocks(title="🇰🇿 Kazakh Learning API") as demo:
             )
         chat_interface_component = gr.ChatInterface(
             fn=chat_interface,
-            additional_inputs=[use_direct_gemini, target_language],
             type="messages",
             examples=[
-                ["сәлем деген не?", False, "English"],
-                ["күләпара не үшін керек?", False, "English"],
-                ["/progress", False, "English"],
-                ["/recommendations", False, "English"],
-                ["/review", False, "English"],
-                ["/mastered", False, "English"],
-                ["Explain Kazakh noun cases in Russian", True, "Russian"],
-                ["Teach me Kazakh verb conjugation in English", True, "English"]
             ]
         )

         try:
             retrieved_docs = self.vectorstore.similarity_search(message, k=5)
+            bold_pattern = r'\*\*([^\*]+)\*\*'  # Match any bolded text
             bold_matches = re.findall(bold_pattern, response)
             for term in bold_matches:
                     print(f"Skipped term {normalized_term}: Invalid length or already seen")
                     continue
+                # Initialize category and definition
+                category = "word"  # Default to word
+                definition = ""
+                term_matched = False
+                original_term = term  # Preserve original case for tracking
+                # Check if term is multi-word (likely an idiom)
+                if len(term.split()) > 1:
+                    category = "idiom"
+                # Check for exact match in known terms
+                for known_term in self.known_terms:
+                    if normalized_term == self.normalize_term(known_term):
+                        term_matched = True
+                        original_term = known_term
+                        for doc in retrieved_docs:
                             doc_type = doc.metadata.get('doc_type', '').lower()
+                            if normalized_term in self.normalize_term(doc.page_content):
+                                if 'idioms' in doc_type or 'тіркес' in doc_type:
+                                    category = "idiom"
+                                elif 'words' in doc_type:
+                                    category = "word"
+                                elif 'grammar' in doc_type:
+                                    category = "grammar"
+                                definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
+                                break
+                        break
+                # If no exact match, try fuzzy matching for idioms with suffixes
+                if not term_matched:
+                    for known_term in self.known_terms:
+                        normalized_known = self.normalize_term(known_term)
+                        if (normalized_term.startswith(normalized_known) and
+                            len(normalized_term) <= len(normalized_known) + 4):
+                            term_matched = True
+                            normalized_term = normalized_known
+                            original_term = known_term
+                            for doc in retrieved_docs:
+                                if normalized_known in self.normalize_term(doc.page_content):
+                                    doc_type = doc.metadata.get('doc_type', '').lower()
+                                    if 'idioms' in doc_type or 'тіркес' in doc_type:
+                                        category = "idiom"
+                                    elif 'words' in doc_type:
+                                        category = "word"
+                                    elif 'grammar' in doc_type:
+                                        category = "grammar"
+                                    definition = self.extract_clean_definition(normalized_known, doc.page_content, response)
+                                    break
                             break
+                # If term is multi-word and not matched, assume it's an idiom
+                if not term_matched and len(term.split()) > 1:
+                    category = "idiom"
+                    definition = self.extract_clean_definition(normalized_term, "", response)
+                # Single-word terms from words folder should be categorized as words
+                if term_matched and len(original_term.split()) == 1:
+                    for doc in retrieved_docs:
+                        if 'words' in doc.metadata.get('doc_type', '').lower():
+                            category = "word"
+                            break
+                if definition:
+                    terms.append((original_term, category, definition))
+                    seen_terms.add(normalized_term)
+                    print(f"Added bolded term: {original_term}, category: {category}, definition: {definition}")
             return terms
             memory=memory
         )
+    def process_message(self, message: str, user_id: str = "default_user", session_token: str = None, target_language: str = "English") -> str:
+        """Process user message with proper user session management"""
         if session_token and not self.tracker.validate_session(user_id, session_token):
             return f"Session expired. Please login again in {target_language}."
         elif message.lower().startswith('/help'):
             return self.get_help_message()
         # Retrieve relevant documents from vectorstore
         retrieved_docs = self.vectorstore.similarity_search(message, k=5)
         context = "\n".join([doc.page_content for doc in retrieved_docs])
                 words_to_review=''.join([f"  - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
                                         for word in words_to_review]),
                 mastered_words=''.join([f"  - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
+                                        for word in mastered_words])
             )
         # Construct prompt with context, history, and progress
         session_token = self.tracker.create_user_session(user_id)
         return session_token
 assistant = PersonalizedKazakhAssistant()
+def chat_interface(message, history, target_language):
+    """Chat interface for Gradio"""
     try:
         web_user_id = "web_user_default"
+        response = assistant.process_message(message, web_user_id, target_language=target_language)
         return response
     except Exception as e:
         return f"Sorry, I encountered an error: {str(e)}. Please try again."
             "error": str(e)
         }
+def api_chat(message: str, user_id: str, session_token: str = None, target_language: str = "English") -> dict:
+    """API endpoint for chat functionality with proper user session"""
     try:
+        response = assistant.process_message(message, user_id, session_token, target_language)
         return {
             "success": True,
             "response": response,
     gr.Markdown("### Multi-User Chat Interface + API Endpoints for Mobile Integration")
     with gr.Tab("💬 Chat Interface"):
+        gr.Markdown("Select the language for explanations.")
         with gr.Row():
             target_language = gr.Dropdown(
                 label="Explanation Language",
                 choices=["English", "Kazakh", "Russian"],
             )
         chat_interface_component = gr.ChatInterface(
             fn=chat_interface,
+            additional_inputs=[target_language],
             type="messages",
             examples=[
+                ["сәлем деген не?", "English"],
+                ["күләпара не үшін керек?", "English"],
+                ["/progress", "English"],
+                ["/recommendations", "English"],
+                ["/review", "English"],
+                ["/mastered", "English"],
+                ["Explain Kazakh noun cases in Russian", "Russian"],
+                ["Teach me Kazakh verb conjugation in English", "English"]
             ]
         )