Spaces:

GuestUser33
/

kazakh-learning-api

Sleeping

App Files Files Community

GuestUser33 committed on May 29, 2025

Commit

90c3bf9

verified ·

1 Parent(s): 51a5da5

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -37

app.py CHANGED Viewed

@@ -490,30 +490,35 @@ class PersonalizedKazakhAssistant:
         You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations. Respond in {target_language}.
         Key capabilities:
-        1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms
-        2. **Track Learning Progress**: Identify and track when users learn new words or idioms
-        3. **Personalized Responses**: Adapt responses based on user's learning history
-        4. **Progress Reporting**: Provide detailed progress reports when asked
-        5. **Learning Recommendations**: Suggest words/idioms to review or learn next
         Response Guidelines:
-        - For word/idiom queries: Provide definition, usage examples, and related information in {target_language}
-        - When explaining a Kazakh word or idiom retrieved from the knowledge base, **bold** the term (e.g., **күләпара**) in the response to highlight it
-        - Only bold the main term or idiom being explained, not other Kazakh words
-        - Always identify the main Kazakh word/idiom for progress tracking
-        - Be encouraging and supportive
-        - Use simple, clear explanations
-        - When discussing progress, be specific and motivating
-        - Avoid storing definitions as terms
-        - Normalize terms to lowercase
-        - Respond in conversational style
         """
         self.llm = genai.GenerativeModel(
             model_name=self.MODEL,
             system_instruction=self.system_prompt,
             generation_config={
                 "temperature": 0.7,
-                "max_output_tokens": 500
             }
         )
@@ -527,11 +532,11 @@ class PersonalizedKazakhAssistant:
         try:
             retrieved_docs = self.vectorstore.similarity_search(message, k=5)
-            bold_pattern = r'\*\*([^\*]+)\*\*'  # Match any bolded text
             bold_matches = re.findall(bold_pattern, response)
             for term in bold_matches:
-                normalized_term = self.normalize_term(term)
                 if normalized_term in seen_terms or len(normalized_term) <= 2 or len(normalized_term) > 100:
                     print(f"Skipped term {normalized_term}: Invalid length or already seen")
                     continue
@@ -542,15 +547,12 @@ class PersonalizedKazakhAssistant:
                 term_matched = False
                 original_term = term  # Preserve original case for tracking
-                # Check if term is multi-word (likely an idiom)
-                if len(term.split()) > 1:
-                    category = "idiom"
-                # Check for exact match in known terms
                 for known_term in self.known_terms:
                     if normalized_term == self.normalize_term(known_term):
                         term_matched = True
-                        original_term = known_term
                         for doc in retrieved_docs:
                             doc_type = doc.metadata.get('doc_type', '').lower()
                             if normalized_term in self.normalize_term(doc.page_content):
@@ -562,17 +564,23 @@ class PersonalizedKazakhAssistant:
                                     category = "grammar"
                                 definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
                                 break
                         break
                 # If no exact match, try fuzzy matching for idioms with suffixes
                 if not term_matched:
                     for known_term in self.known_terms:
                         normalized_known = self.normalize_term(known_term)
                         if (normalized_term.startswith(normalized_known) and
                             len(normalized_term) <= len(normalized_known) + 4):
                             term_matched = True
-                            normalized_term = normalized_known
-                            original_term = known_term
                             for doc in retrieved_docs:
                                 if normalized_known in self.normalize_term(doc.page_content):
                                     doc_type = doc.metadata.get('doc_type', '').lower()
@@ -584,21 +592,20 @@ class PersonalizedKazakhAssistant:
                                         category = "grammar"
                                     definition = self.extract_clean_definition(normalized_known, doc.page_content, response)
                                     break
                             break
-                # If term is multi-word and not matched, assume it's an idiom
-                if not term_matched and len(term.split()) > 1:
-                    category = "idiom"
                     definition = self.extract_clean_definition(normalized_term, "", response)
-                # Single-word terms from words folder should be categorized as words
-                if term_matched and len(original_term.split()) == 1:
-                    for doc in retrieved_docs:
-                        if 'words' in doc.metadata.get('doc_type', '').lower():
-                            category = "word"
-                            break
-                if definition:
                     terms.append((original_term, category, definition))
                     seen_terms.add(normalized_term)
                     print(f"Added bolded term: {original_term}, category: {category}, definition: {definition}")

         You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations. Respond in {target_language}.
         Key capabilities:
+        1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms.
+        2. **Track Learning Progress**: Identify and track when users learn new words or idioms.
+        3. **Personalized Responses**: Adapt responses based on user's learning history.
+        4. **Progress Reporting**: Provide detailed progress reports when asked.
+        5. **Learning Recommendations**: Suggest words/idioms to review or learn next.
         Response Guidelines:
+        - For word/idiom queries: Provide definition, usage examples, and related information in {target_language}.
+        - When explaining a Kazakh word or idiom retrieved from the knowledge base, **bold** the term (e.g., **күләпара**) in the response to highlight it.
+        - Only bold the main term or idiom being explained, not other Kazakh words.
+        - Always identify the main Kazakh word/idiom for progress tracking.
+        - **RAG Usage**:
+        - Use Retrieval-Augmented Generation (RAG) only when the query explicitly asks for explanations of specific Kazakh terms or idioms (e.g., "What does сәлем mean?") or when the context strongly suggests a need for knowledge base information (e.g., queries about specific words or idioms).
+        - When using RAG to explain terms (e.g., nouns, idioms), limit examples to 3-4 relevant ones. Do not list all or many examples or all matches from the knowledge base if not explicitly asked (only 3,4).
+        - For general queries (e.g., greetings, procedural questions, or commands like /progress) or grammar-related queries (e.g., "explain me nouns"), rely on your general knowledge and do not use RAG unless the knowledge base contains relevant information.
+        - Since the knowledge base contains only words and idioms, grammar explanations (e.g., about nouns, verbs) should be provided using your own knowledge, without relying on RAG, unless the query specifically involves terms in the knowledge base.
+        - Be encouraging and supportive.
+        - Use simple, clear explanations.
+        - When discussing progress, be specific and motivating.
+        - Avoid storing definitions as terms.
+        - Normalize terms to lowercase for consistency.
+        - Respond in a conversational style.
         """
         self.llm = genai.GenerativeModel(
             model_name=self.MODEL,
             system_instruction=self.system_prompt,
             generation_config={
                 "temperature": 0.7,
+                "max_output_tokens": 700
             }
         )
         try:
             retrieved_docs = self.vectorstore.similarity_search(message, k=5)
+            bold_pattern = r'\*\*([А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*)\*\*'
             bold_matches = re.findall(bold_pattern, response)
             for term in bold_matches:
+                normalized_term = self.normalize_term(term)  # Normalize to lowercase
                 if normalized_term in seen_terms or len(normalized_term) <= 2 or len(normalized_term) > 100:
                     print(f"Skipped term {normalized_term}: Invalid length or already seen")
                     continue
                 term_matched = False
                 original_term = term  # Preserve original case for tracking
+                # Check for exact match in known terms (case-insensitive)
                 for known_term in self.known_terms:
                     if normalized_term == self.normalize_term(known_term):
                         term_matched = True
+                        original_term = known_term  # Use the known term's original case
+                        # Determine category based on known term's source
                         for doc in retrieved_docs:
                             doc_type = doc.metadata.get('doc_type', '').lower()
                             if normalized_term in self.normalize_term(doc.page_content):
                                     category = "grammar"
                                 definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
                                 break
+                        # If no document match, check term length for idiom likelihood
+                        if not definition and len(known_term.split()) > 1:
+                            category = "idiom"
+                            definition = self.extract_clean_definition(normalized_term, "", response)
                         break
                 # If no exact match, try fuzzy matching for idioms with suffixes
                 if not term_matched:
                     for known_term in self.known_terms:
                         normalized_known = self.normalize_term(known_term)
+                        # Check if the bolded term is a close match to a known term
+                        # Allow up to 4 extra characters (e.g., grammatical endings)
                         if (normalized_term.startswith(normalized_known) and
                             len(normalized_term) <= len(normalized_known) + 4):
                             term_matched = True
+                            normalized_term = normalized_known  # Use the base known term
+                            original_term = known_term  # Use the original known term for tracking
                             for doc in retrieved_docs:
                                 if normalized_known in self.normalize_term(doc.page_content):
                                     doc_type = doc.metadata.get('doc_type', '').lower()
                                         category = "grammar"
                                     definition = self.extract_clean_definition(normalized_known, doc.page_content, response)
                                     break
+                            # If no document match, assume idiom for multi-word terms
+                            if not definition and len(known_term.split()) > 1:
+                                category = "idiom"
+                                definition = self.extract_clean_definition(normalized_known, "", response)
                             break
+                # Additional check: single-word terms from words folder should not be idioms
+                if term_matched and len(original_term.split()) == 1 and any('words' in doc.metadata.get('doc_type', '').lower() for doc in retrieved_docs):
+                    category = "word"
+                if not definition and term_matched:
                     definition = self.extract_clean_definition(normalized_term, "", response)
+                if term_matched and definition:
                     terms.append((original_term, category, definition))
                     seen_terms.add(normalized_term)
                     print(f"Added bolded term: {original_term}, category: {category}, definition: {definition}")