Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -32,6 +32,7 @@ class LearningSession:
|
|
| 32 |
end_time: Optional[datetime] = None
|
| 33 |
words_learned: int = 0
|
| 34 |
idioms_learned: int = 0
|
|
|
|
| 35 |
questions_asked: int = 0
|
| 36 |
|
| 37 |
@dataclass
|
|
@@ -57,16 +58,17 @@ class PersonalizedLearningTracker:
|
|
| 57 |
cursor = conn.cursor()
|
| 58 |
|
| 59 |
cursor.execute('''
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
| 70 |
|
| 71 |
cursor.execute('''
|
| 72 |
CREATE TABLE IF NOT EXISTS word_progress (
|
|
@@ -214,12 +216,25 @@ class PersonalizedLearningTracker:
|
|
| 214 |
WHERE user_id = ? AND word = ? AND category = ?
|
| 215 |
''', (now, user_id, word, category))
|
| 216 |
else:
|
| 217 |
-
cursor.execute('''
|
| 218 |
INSERT INTO word_progress
|
| 219 |
(user_id, word, definition, category, first_encountered, last_reviewed)
|
| 220 |
VALUES (?, ?, ?, ?, ?, ?)
|
| 221 |
''', (user_id, word, definition, category, now, now))
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
conn.commit()
|
| 224 |
conn.close()
|
| 225 |
|
|
@@ -327,6 +342,32 @@ class PersonalizedLearningTracker:
|
|
| 327 |
conn.close()
|
| 328 |
return words
|
| 329 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
def get_learning_recommendations(self, user_id: str) -> List[str]:
|
| 331 |
"""Get personalized learning recommendations"""
|
| 332 |
progress = self.get_user_progress(user_id)
|
|
@@ -349,6 +390,7 @@ class PersonalizedLearningTracker:
|
|
| 349 |
|
| 350 |
class PersonalizedKazakhAssistant:
|
| 351 |
def __init__(self):
|
|
|
|
| 352 |
self.setup_environment()
|
| 353 |
self.setup_vectorstore()
|
| 354 |
self.setup_llm()
|
|
@@ -358,19 +400,18 @@ class PersonalizedKazakhAssistant:
|
|
| 358 |
|
| 359 |
def setup_environment(self):
|
| 360 |
"""Setup environment and configuration"""
|
| 361 |
-
|
| 362 |
self.google_api_key = os.getenv("GOOGLE_API_KEY")
|
| 363 |
self.MODEL = "gemini-1.5-flash"
|
| 364 |
self.db_name = "vector_db"
|
| 365 |
|
| 366 |
def setup_vectorstore(self):
|
| 367 |
"""Setup document loading and vector store"""
|
| 368 |
-
folders = glob.glob("knowledge-base/*")
|
| 369 |
text_loader_kwargs = {'encoding': 'utf-8'}
|
| 370 |
documents = []
|
| 371 |
|
| 372 |
for folder in folders:
|
| 373 |
-
doc_type = os.path.basename(folder)
|
| 374 |
loader = DirectoryLoader(
|
| 375 |
folder,
|
| 376 |
glob="**/*.txt",
|
|
@@ -382,6 +423,25 @@ class PersonalizedKazakhAssistant:
|
|
| 382 |
doc.metadata["doc_type"] = doc_type
|
| 383 |
documents.append(doc)
|
| 384 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
text_splitter = CharacterTextSplitter(separator=r'\n', chunk_size=2000, chunk_overlap=0)
|
| 386 |
chunks = text_splitter.split_documents(documents)
|
| 387 |
|
|
@@ -394,122 +454,192 @@ class PersonalizedKazakhAssistant:
|
|
| 394 |
|
| 395 |
self.vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=self.db_name)
|
| 396 |
print(f"Vectorstore created with {self.vectorstore._collection.count()} documents")
|
| 397 |
-
|
| 398 |
def setup_llm(self):
|
| 399 |
"""Setup LLM with enhanced system prompt"""
|
| 400 |
system_prompt = """
|
| 401 |
-
You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations.
|
| 402 |
-
|
| 403 |
-
Key capabilities:
|
| 404 |
-
1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms from your knowledge base
|
| 405 |
-
2. **Track Learning Progress**: Identify and track when users learn new words or idioms
|
| 406 |
-
3. **Personalized Responses**: Adapt responses based on user's learning history and progress
|
| 407 |
-
4. **Progress Reporting**: Provide detailed progress reports when asked
|
| 408 |
-
5. **Learning Recommendations**: Suggest words/idioms to review or learn next
|
| 409 |
-
|
| 410 |
-
Response Guidelines:
|
| 411 |
-
- For word/idiom queries: Provide definition, usage examples, and related information
|
| 412 |
-
- Always identify the main Kazakh word/idiom being discussed for progress tracking
|
| 413 |
-
- Be encouraging and supportive of the user's learning journey
|
| 414 |
-
- Use simple, clear explanations appropriate for language learners
|
| 415 |
-
- When discussing progress, be specific and motivating
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
|
|
|
|
|
|
| 419 |
|
| 420 |
self.llm = ChatGoogleGenerativeAI(
|
| 421 |
model="models/gemini-1.5-flash",
|
| 422 |
temperature=0.7,
|
| 423 |
-
|
| 424 |
)
|
| 425 |
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
if user_id not in self.user_memories:
|
| 430 |
-
self.user_memories[user_id] = ConversationBufferMemory(
|
| 431 |
-
memory_key='chat_history',
|
| 432 |
-
return_messages=True,
|
| 433 |
-
max_token_limit=10000
|
| 434 |
-
)
|
| 435 |
-
return self.user_memories[user_id]
|
| 436 |
-
|
| 437 |
-
def get_user_chain(self, user_id: str):
|
| 438 |
-
"""Get or create conversation chain for a specific user"""
|
| 439 |
-
memory = self.get_user_memory(user_id)
|
| 440 |
-
retriever = self.vectorstore.as_retriever()
|
| 441 |
-
return ConversationalRetrievalChain.from_llm(
|
| 442 |
-
llm=self.llm,
|
| 443 |
-
retriever=retriever,
|
| 444 |
-
memory=memory
|
| 445 |
-
)
|
| 446 |
|
| 447 |
def extract_kazakh_terms(self, message: str, response: str) -> List[Tuple[str, str, str]]:
|
| 448 |
"""Extract meaningful Kazakh terms using document metadata to determine category"""
|
| 449 |
terms = []
|
|
|
|
| 450 |
|
| 451 |
try:
|
| 452 |
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 453 |
|
| 454 |
-
|
|
|
|
| 455 |
|
| 456 |
-
for
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
'түсіндірілген', 'келтірілген', 'болып', 'табылады', 'ауруы',
|
| 464 |
-
'мынадай', 'тақырыбына', 'тіркестер', 'арналған', 'байланысты']
|
| 465 |
-
|
| 466 |
-
if any(skip in word.lower() for skip in skip_words):
|
| 467 |
-
continue
|
| 468 |
-
|
| 469 |
-
category = "word"
|
| 470 |
-
definition = ""
|
| 471 |
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
except Exception as e:
|
| 488 |
print(f"Error extracting terms: {e}")
|
| 489 |
-
|
| 490 |
return terms
|
| 491 |
|
| 492 |
def extract_clean_definition(self, term: str, doc_content: str, response: str) -> str:
|
| 493 |
-
"""Extract clean definition for a term"""
|
|
|
|
|
|
|
| 494 |
sentences = response.split('.')
|
| 495 |
for sentence in sentences:
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
if
|
| 499 |
-
|
| 500 |
-
return clean_sentence
|
| 501 |
|
| 502 |
doc_sentences = doc_content.split('.')
|
| 503 |
for sentence in doc_sentences:
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
return clean_sentence
|
| 508 |
|
| 509 |
return f"Definition for {term}"
|
| 510 |
|
| 511 |
-
def
|
| 512 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
|
| 514 |
if session_token and not self.tracker.validate_session(user_id, session_token):
|
| 515 |
return "Session expired. Please login again."
|
|
@@ -526,9 +656,14 @@ Format responses naturally in conversational style, not JSON unless specifically
|
|
| 526 |
return self.get_recommendations(user_id)
|
| 527 |
elif message.lower().startswith('/review'):
|
| 528 |
return self.get_review_words(user_id)
|
|
|
|
|
|
|
| 529 |
elif message.lower().startswith('/help'):
|
| 530 |
return self.get_help_message()
|
| 531 |
|
|
|
|
|
|
|
|
|
|
| 532 |
conversation_chain = self.get_user_chain(user_id)
|
| 533 |
result = conversation_chain.invoke({"question": message})
|
| 534 |
response = result["answer"]
|
|
@@ -607,6 +742,25 @@ Format responses naturally in conversational style, not JSON unless specifically
|
|
| 607 |
|
| 608 |
return response
|
| 609 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
def get_help_message(self) -> str:
|
| 611 |
"""Get help message with available commands"""
|
| 612 |
return """
|
|
@@ -616,6 +770,7 @@ Format responses naturally in conversational style, not JSON unless specifically
|
|
| 616 |
- `/progress` - View your detailed learning progress
|
| 617 |
- `/recommendations` - Get personalized learning suggestions
|
| 618 |
- `/review` - See words that need review
|
|
|
|
| 619 |
- `/help` - Show this help message
|
| 620 |
|
| 621 |
**How to Use**:
|
|
@@ -636,22 +791,43 @@ Start learning by asking about any Kazakh term! 🌟
|
|
| 636 |
"""Create a session token for user authentication"""
|
| 637 |
session_token = self.tracker.create_user_session(user_id)
|
| 638 |
return session_token
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
|
| 640 |
assistant = PersonalizedKazakhAssistant()
|
| 641 |
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
"""Chat interface for Gradio - uses consistent user for web interface"""
|
| 645 |
try:
|
| 646 |
-
# Use a consistent user_id for the web interface session
|
| 647 |
-
# In a real app, you'd use proper session management
|
| 648 |
web_user_id = "web_user_default" # Consistent ID
|
| 649 |
-
response = assistant.process_message(message, web_user_id)
|
| 650 |
return response
|
| 651 |
except Exception as e:
|
| 652 |
return f"Sorry, I encountered an error: {str(e)}. Please try again."
|
| 653 |
-
|
| 654 |
-
|
| 655 |
def api_login(user_id: str) -> dict:
|
| 656 |
"""API endpoint for user login/session creation"""
|
| 657 |
try:
|
|
@@ -668,10 +844,10 @@ def api_login(user_id: str) -> dict:
|
|
| 668 |
"error": str(e)
|
| 669 |
}
|
| 670 |
|
| 671 |
-
def api_chat(message: str, user_id: str, session_token: str = None) -> dict:
|
| 672 |
-
"""API endpoint for chat functionality with proper user session"""
|
| 673 |
try:
|
| 674 |
-
response = assistant.process_message(message, user_id, session_token)
|
| 675 |
return {
|
| 676 |
"success": True,
|
| 677 |
"response": response,
|
|
@@ -747,230 +923,313 @@ def api_review_words(user_id: str, session_token: str = None) -> dict:
|
|
| 747 |
"error": str(e)
|
| 748 |
}
|
| 749 |
|
| 750 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 751 |
with gr.Blocks(title="🇰🇿 Kazakh Learning API") as demo:
|
| 752 |
-
|
| 753 |
gr.Markdown("# 🇰🇿 Personalized Kazakh Learning Assistant")
|
| 754 |
gr.Markdown("### Multi-User Chat Interface + API Endpoints for Mobile Integration")
|
| 755 |
|
| 756 |
with gr.Tab("💬 Chat Interface"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
chat_interface = gr.ChatInterface(
|
| 758 |
-
chat_interface,
|
|
|
|
| 759 |
type="messages",
|
| 760 |
examples=[
|
| 761 |
-
"сәлем деген не?",
|
| 762 |
-
"күләпара не үшін керек?",
|
| 763 |
-
"/progress",
|
| 764 |
-
"/recommendations",
|
| 765 |
-
"/review"
|
|
|
|
|
|
|
|
|
|
| 766 |
]
|
| 767 |
)
|
| 768 |
-
|
| 769 |
-
with gr.Tab("🔌 API Testing"):
|
| 770 |
-
gr.Markdown("## Test API Endpoints")
|
| 771 |
-
|
| 772 |
-
with gr.Row():
|
| 773 |
-
with gr.Column():
|
| 774 |
-
user_id_input = gr.Textbox(label="User ID", value="test_user", placeholder="Enter unique user ID")
|
| 775 |
-
session_token_input = gr.Textbox(label="Session Token", placeholder="Session token (get from login)")
|
| 776 |
-
message_input = gr.Textbox(label="Message", placeholder="Enter your message in Kazakh or English")
|
| 777 |
-
|
| 778 |
-
with gr.Row():
|
| 779 |
-
login_btn = gr.Button("🔑 Test Login API")
|
| 780 |
-
chat_btn = gr.Button("💬 Test Chat API")
|
| 781 |
-
progress_btn = gr.Button("📊 Test Progress API")
|
| 782 |
-
recommendations_btn = gr.Button("💡 Test Recommendations API")
|
| 783 |
-
review_btn = gr.Button("📚 Test Review API")
|
| 784 |
-
|
| 785 |
-
api_output = gr.JSON(label="API Response")
|
| 786 |
-
|
| 787 |
-
login_btn.click(
|
| 788 |
-
fn=lambda uid: api_login(uid),
|
| 789 |
-
inputs=user_id_input,
|
| 790 |
-
outputs=api_output
|
| 791 |
-
)
|
| 792 |
-
|
| 793 |
-
chat_btn.click(
|
| 794 |
-
fn=lambda msg, uid, token: api_chat(msg, uid, token),
|
| 795 |
-
inputs=[message_input, user_id_input, session_token_input],
|
| 796 |
-
outputs=api_output
|
| 797 |
-
)
|
| 798 |
-
|
| 799 |
-
progress_btn.click(
|
| 800 |
-
fn=lambda uid, token: api_progress(uid, token),
|
| 801 |
-
inputs=[user_id_input, session_token_input],
|
| 802 |
-
outputs=api_output
|
| 803 |
-
)
|
| 804 |
-
|
| 805 |
-
recommendations_btn.click(
|
| 806 |
-
fn=lambda uid, token: api_recommendations(uid, token),
|
| 807 |
-
inputs=[user_id_input, session_token_input],
|
| 808 |
-
outputs=api_output
|
| 809 |
-
)
|
| 810 |
-
|
| 811 |
-
review_btn.click(
|
| 812 |
-
fn=lambda uid, token: api_review_words(uid, token),
|
| 813 |
-
inputs=[user_id_input, session_token_input],
|
| 814 |
-
outputs=api_output
|
| 815 |
-
)
|
| 816 |
-
|
| 817 |
with gr.Tab("📖 API Documentation"):
|
| 818 |
gr.Markdown("""
|
| 819 |
## API Endpoints for Flutter Integration
|
| 820 |
-
|
| 821 |
### Base URL: `https://huggingface.co/spaces/GuestUser33/kazakh-learning-api`
|
| 822 |
-
|
| 823 |
### Authentication Flow:
|
| 824 |
-
1. **Login** to get session token
|
| 825 |
2. **Use session token** for subsequent API calls
|
| 826 |
3. **Session tokens expire** after inactivity
|
| 827 |
-
|
| 828 |
### Available Endpoints:
|
| 829 |
-
|
| 830 |
#### 1. Login API
|
| 831 |
```
|
| 832 |
POST /api/predict
|
| 833 |
Content-Type: application/json
|
| 834 |
-
|
| 835 |
{
|
| 836 |
-
|
| 837 |
-
|
| 838 |
}
|
| 839 |
```
|
| 840 |
-
**Response**: `{"success": true, "session_token": "uuid", "user_id": "user_id"}`
|
| 841 |
-
|
| 842 |
#### 2. Chat API
|
| 843 |
```
|
| 844 |
POST /api/predict
|
| 845 |
Content-Type: application/json
|
| 846 |
-
|
| 847 |
{
|
| 848 |
-
|
| 849 |
-
|
| 850 |
}
|
| 851 |
```
|
| 852 |
-
|
| 853 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 854 |
```
|
| 855 |
POST /api/predict
|
| 856 |
Content-Type: application/json
|
| 857 |
-
|
| 858 |
{
|
| 859 |
-
|
| 860 |
-
|
| 861 |
}
|
| 862 |
```
|
| 863 |
-
|
|
|
|
| 864 |
#### 4. Recommendations API
|
| 865 |
```
|
| 866 |
POST /api/predict
|
| 867 |
Content-Type: application/json
|
| 868 |
-
|
| 869 |
{
|
| 870 |
-
|
| 871 |
-
|
| 872 |
}
|
| 873 |
```
|
| 874 |
-
|
|
|
|
| 875 |
#### 5. Review Words API
|
| 876 |
```
|
| 877 |
POST /api/predict
|
| 878 |
Content-Type: application/json
|
| 879 |
-
|
| 880 |
{
|
| 881 |
-
|
| 882 |
-
|
| 883 |
}
|
| 884 |
```
|
| 885 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 886 |
### Flutter Integration Example:
|
| 887 |
```dart
|
|
|
|
|
|
|
|
|
|
| 888 |
class KazakhLearningAPI {
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
|
| 895 |
final response = await http.post(
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
'data': [userId],
|
| 900 |
'fn_index': 0
|
| 901 |
-
|
| 902 |
);
|
| 903 |
-
|
| 904 |
if (response.statusCode == 200) {
|
| 905 |
-
|
| 906 |
-
|
| 907 |
this.userId = userId;
|
| 908 |
this.sessionToken = result['data'][0]['session_token'];
|
| 909 |
return true;
|
| 910 |
-
|
| 911 |
}
|
| 912 |
return false;
|
| 913 |
-
|
| 914 |
-
|
| 915 |
-
|
| 916 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 917 |
if (sessionToken == null) return null;
|
| 918 |
-
|
| 919 |
final response = await http.post(
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
'data': [message, userId, sessionToken],
|
| 924 |
'fn_index': 1
|
| 925 |
-
|
| 926 |
);
|
| 927 |
-
|
| 928 |
if (response.statusCode == 200) {
|
| 929 |
-
|
| 930 |
-
|
| 931 |
return result['data'][0]['response'];
|
| 932 |
-
|
| 933 |
}
|
| 934 |
return null;
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
if (sessionToken == null) return null;
|
| 940 |
-
|
| 941 |
final response = await http.post(
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
'data': [userId, sessionToken],
|
| 946 |
'fn_index': 2
|
| 947 |
-
|
| 948 |
);
|
| 949 |
-
|
| 950 |
if (response.statusCode == 200) {
|
| 951 |
-
|
| 952 |
-
|
| 953 |
return result['data'][0]['progress_data'];
|
| 954 |
-
|
| 955 |
}
|
| 956 |
return null;
|
| 957 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 958 |
}
|
| 959 |
```
|
| 960 |
-
|
| 961 |
### Key Features:
|
| 962 |
- ✅ **Multi-User Support**: Each user has separate learning progress
|
| 963 |
- ✅ **Session Management**: Secure session tokens for authentication
|
| 964 |
-
- ✅ **Personalized Tracking**: Individual progress tracking per user
|
|
|
|
|
|
|
| 965 |
- ✅ **API Ready**: All endpoints ready for mobile app integration
|
| 966 |
- ✅ **Session Validation**: Automatic session validation and expiry
|
| 967 |
-
|
| 968 |
### Usage Notes:
|
| 969 |
- Always call **login** first to get a session token
|
| 970 |
- Include **session_token** in all subsequent API calls
|
|
|
|
|
|
|
| 971 |
- Handle **session expiry** by re-logging in
|
| 972 |
-
- Use **unique user_id** for each user (
|
| 973 |
-
|
|
|
|
| 974 |
|
| 975 |
if __name__ == "__main__":
|
| 976 |
demo.launch()
|
|
|
|
| 32 |
end_time: Optional[datetime] = None
|
| 33 |
words_learned: int = 0
|
| 34 |
idioms_learned: int = 0
|
| 35 |
+
grammar_learned: int = 0
|
| 36 |
questions_asked: int = 0
|
| 37 |
|
| 38 |
@dataclass
|
|
|
|
| 58 |
cursor = conn.cursor()
|
| 59 |
|
| 60 |
cursor.execute('''
|
| 61 |
+
CREATE TABLE IF NOT EXISTS learning_sessions (
|
| 62 |
+
session_id TEXT PRIMARY KEY,
|
| 63 |
+
user_id TEXT NOT NULL,
|
| 64 |
+
start_time TEXT NOT NULL,
|
| 65 |
+
end_time TEXT,
|
| 66 |
+
words_learned INTEGER DEFAULT 0,
|
| 67 |
+
idioms_learned INTEGER DEFAULT 0,
|
| 68 |
+
grammar_learned INTEGER DEFAULT 0,
|
| 69 |
+
questions_asked INTEGER DEFAULT 0
|
| 70 |
+
)
|
| 71 |
+
''')
|
| 72 |
|
| 73 |
cursor.execute('''
|
| 74 |
CREATE TABLE IF NOT EXISTS word_progress (
|
|
|
|
| 216 |
WHERE user_id = ? AND word = ? AND category = ?
|
| 217 |
''', (now, user_id, word, category))
|
| 218 |
else:
|
| 219 |
+
cursor.execute ('''
|
| 220 |
INSERT INTO word_progress
|
| 221 |
(user_id, word, definition, category, first_encountered, last_reviewed)
|
| 222 |
VALUES (?, ?, ?, ?, ?, ?)
|
| 223 |
''', (user_id, word, definition, category, now, now))
|
| 224 |
|
| 225 |
+
cursor.execute('''
|
| 226 |
+
SELECT encounter_count FROM word_progress
|
| 227 |
+
WHERE user_id = ? AND word = ? AND category = ?
|
| 228 |
+
''', (user_id, word, category))
|
| 229 |
+
encounter_count = cursor.fetchone()[0]
|
| 230 |
+
|
| 231 |
+
if encounter_count >= 3:
|
| 232 |
+
cursor.execute('''
|
| 233 |
+
UPDATE word_progress
|
| 234 |
+
SET mastery_level = ?
|
| 235 |
+
WHERE user_id = ? AND word = ? AND category = ?
|
| 236 |
+
''', (3, user_id, word, category))
|
| 237 |
+
|
| 238 |
conn.commit()
|
| 239 |
conn.close()
|
| 240 |
|
|
|
|
| 342 |
conn.close()
|
| 343 |
return words
|
| 344 |
|
| 345 |
+
def get_mastered_words(self, user_id: str, limit: int = 10) -> List[Dict]:
|
| 346 |
+
"""Get words with mastery level greater than 0"""
|
| 347 |
+
conn = sqlite3.connect(self.db_path)
|
| 348 |
+
cursor = conn.cursor()
|
| 349 |
+
|
| 350 |
+
cursor.execute('''
|
| 351 |
+
SELECT word, definition, category, mastery_level, encounter_count
|
| 352 |
+
FROM word_progress
|
| 353 |
+
WHERE user_id = ? AND mastery_level > 0
|
| 354 |
+
ORDER BY mastery_level DESC, encounter_count DESC
|
| 355 |
+
LIMIT ?
|
| 356 |
+
''', (user_id, limit))
|
| 357 |
+
|
| 358 |
+
words = []
|
| 359 |
+
for word, definition, category, mastery, encounter_count in cursor.fetchall():
|
| 360 |
+
words.append({
|
| 361 |
+
'word': word,
|
| 362 |
+
'definition': definition,
|
| 363 |
+
'category': category,
|
| 364 |
+
'mastery_level': mastery,
|
| 365 |
+
'encounter_count': encounter_count
|
| 366 |
+
})
|
| 367 |
+
|
| 368 |
+
conn.close()
|
| 369 |
+
return words
|
| 370 |
+
|
| 371 |
def get_learning_recommendations(self, user_id: str) -> List[str]:
|
| 372 |
"""Get personalized learning recommendations"""
|
| 373 |
progress = self.get_user_progress(user_id)
|
|
|
|
| 390 |
|
| 391 |
class PersonalizedKazakhAssistant:
|
| 392 |
def __init__(self):
|
| 393 |
+
self.known_terms = set()
|
| 394 |
self.setup_environment()
|
| 395 |
self.setup_vectorstore()
|
| 396 |
self.setup_llm()
|
|
|
|
| 400 |
|
| 401 |
def setup_environment(self):
|
| 402 |
"""Setup environment and configuration"""
|
|
|
|
| 403 |
self.google_api_key = os.getenv("GOOGLE_API_KEY")
|
| 404 |
self.MODEL = "gemini-1.5-flash"
|
| 405 |
self.db_name = "vector_db"
|
| 406 |
|
| 407 |
def setup_vectorstore(self):
|
| 408 |
"""Setup document loading and vector store"""
|
| 409 |
+
folders = glob.glob("knowledge-base/*")
|
| 410 |
text_loader_kwargs = {'encoding': 'utf-8'}
|
| 411 |
documents = []
|
| 412 |
|
| 413 |
for folder in folders:
|
| 414 |
+
doc_type = os.path.basename(folder).lower()
|
| 415 |
loader = DirectoryLoader(
|
| 416 |
folder,
|
| 417 |
glob="**/*.txt",
|
|
|
|
| 423 |
doc.metadata["doc_type"] = doc_type
|
| 424 |
documents.append(doc)
|
| 425 |
|
| 426 |
+
self.known_terms.clear()
|
| 427 |
+
common_words = {'бас', 'сөз', 'адам', 'жол', 'күн', 'су', 'жер', 'қол', 'тұр', 'бер'}
|
| 428 |
+
for doc in documents:
|
| 429 |
+
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 430 |
+
lines = doc.page_content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
|
| 431 |
+
for line in lines:
|
| 432 |
+
line = line.strip()
|
| 433 |
+
if line and " - " in line:
|
| 434 |
+
term = line.split(" - ")[0].strip().lower()
|
| 435 |
+
|
| 436 |
+
if term and (
|
| 437 |
+
doc_type in ['idioms', 'grammar'] or
|
| 438 |
+
(doc_type == 'words' and len(term.split()) > 1) or
|
| 439 |
+
term not in common_words
|
| 440 |
+
):
|
| 441 |
+
self.known_terms.add(term)
|
| 442 |
+
|
| 443 |
+
print(f"Loaded {len(self.known_terms)} known terms: {list(self.known_terms)[:10]}")
|
| 444 |
+
|
| 445 |
text_splitter = CharacterTextSplitter(separator=r'\n', chunk_size=2000, chunk_overlap=0)
|
| 446 |
chunks = text_splitter.split_documents(documents)
|
| 447 |
|
|
|
|
| 454 |
|
| 455 |
self.vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=self.db_name)
|
| 456 |
print(f"Vectorstore created with {self.vectorstore._collection.count()} documents")
|
| 457 |
+
|
| 458 |
def setup_llm(self):
|
| 459 |
"""Setup LLM with enhanced system prompt"""
|
| 460 |
system_prompt = """
|
| 461 |
+
You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations.
|
| 462 |
+
|
| 463 |
+
Key capabilities:
|
| 464 |
+
1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms from your knowledge base
|
| 465 |
+
2. **Track Learning Progress**: Identify and track when users learn new words or idioms
|
| 466 |
+
3. **Personalized Responses**: Adapt responses based on user's learning history and progress
|
| 467 |
+
4. **Progress Reporting**: Provide detailed progress reports when asked
|
| 468 |
+
5. **Learning Recommendations**: Suggest words/idioms to review or learn next
|
| 469 |
+
|
| 470 |
+
Response Guidelines:
|
| 471 |
+
- For word/idiom queries: Provide definition, usage examples, and related information
|
| 472 |
+
- Always identify the main Kazakh word/idiom being discussed for progress tracking
|
| 473 |
+
- Be encouraging and supportive of the user's learning journey
|
| 474 |
+
- Use simple, clear explanations appropriate for language learners
|
| 475 |
+
- When discussing progress, be specific and motivating
|
| 476 |
+
- Avoid storing definitions as terms; only track the word/idiom itself
|
| 477 |
+
- Normalize terms to lowercase to avoid duplicates due to case differences
|
| 478 |
+
|
| 479 |
+
Format responses naturally in conversational style, not JSON unless specifically requested.
|
| 480 |
+
"""
|
| 481 |
|
| 482 |
self.llm = ChatGoogleGenerativeAI(
|
| 483 |
model="models/gemini-1.5-flash",
|
| 484 |
temperature=0.7,
|
| 485 |
+
model_kwargs={"system_instruction": system_prompt}
|
| 486 |
)
|
| 487 |
|
| 488 |
+
def normalize_term(self, term: str) -> str:
|
| 489 |
+
"""Normalize term by converting to lowercase and removing extra spaces"""
|
| 490 |
+
return ' '.join(term.lower().strip().split())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
|
| 492 |
def extract_kazakh_terms(self, message: str, response: str) -> List[Tuple[str, str, str]]:
|
| 493 |
"""Extract meaningful Kazakh terms using document metadata to determine category"""
|
| 494 |
terms = []
|
| 495 |
+
seen_terms = set()
|
| 496 |
|
| 497 |
try:
|
| 498 |
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 499 |
|
| 500 |
+
response_normalized = self.normalize_term(response)
|
| 501 |
+
message_normalized = self.normalize_term(message)
|
| 502 |
|
| 503 |
+
is_multi_term_query = any(keyword in message_normalized for keyword in ['мысал', 'тіркестер', 'пример'])
|
| 504 |
+
|
| 505 |
+
common_words = {'бас', 'сөз', 'адам', 'жол', 'күн', 'су', 'жер', 'қол', 'тұр', 'бер'}
|
| 506 |
+
|
| 507 |
+
for known_term in self.known_terms:
|
| 508 |
+
normalized_known_term = self.normalize_term(known_term)
|
| 509 |
+
if normalized_known_term in response_normalized and normalized_known_term not in seen_terms:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
|
| 511 |
+
if normalized_known_term in common_words and not (
|
| 512 |
+
normalized_known_term in message_normalized or is_multi_term_query
|
| 513 |
+
):
|
| 514 |
+
print(f"Skipped common term: {known_term}")
|
| 515 |
+
continue
|
| 516 |
+
|
| 517 |
+
if normalized_known_term in message_normalized or any(
|
| 518 |
+
normalized_known_term in self.normalize_term(doc.page_content) for doc in retrieved_docs
|
| 519 |
+
):
|
| 520 |
+
category = "idiom"
|
| 521 |
+
definition = ""
|
| 522 |
|
| 523 |
+
for doc in retrieved_docs:
|
| 524 |
+
if normalized_known_term in self.normalize_term(doc.page_content):
|
| 525 |
+
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 526 |
+
if 'idiom' in doc_type or 'тіркес' in doc_type:
|
| 527 |
+
category = "idiom"
|
| 528 |
+
elif 'grammar' in doc_type:
|
| 529 |
+
category = "grammar"
|
| 530 |
+
else:
|
| 531 |
+
category = "word"
|
| 532 |
+
definition = self.extract_clean_definition(normalized_known_term, doc.page_content, response)
|
| 533 |
+
break
|
| 534 |
+
|
| 535 |
+
if definition and len(normalized_known_term.split()) <= 10:
|
| 536 |
+
terms.append((known_term, category, definition))
|
| 537 |
+
seen_terms.add(normalized_known_term)
|
| 538 |
+
print(f"Added term: {known_term}, category: {category}, definition: {definition}")
|
| 539 |
+
|
| 540 |
+
if not is_multi_term_query and normalized_known_term not in message_normalized:
|
| 541 |
+
return terms
|
| 542 |
|
| 543 |
+
if not terms and not is_multi_term_query:
|
| 544 |
+
kazakh_phrases = re.findall(
|
| 545 |
+
r'[А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s\-]+[А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*',
|
| 546 |
+
response
|
| 547 |
+
)
|
| 548 |
+
|
| 549 |
+
for phrase in kazakh_phrases:
|
| 550 |
+
normalized_phrase = self.normalize_term(phrase)
|
| 551 |
+
|
| 552 |
+
if normalized_phrase in seen_terms:
|
| 553 |
+
continue
|
| 554 |
+
|
| 555 |
+
if len(normalized_phrase) <= 2 or len(normalized_phrase) > 100:
|
| 556 |
+
print(f"Skipped phrase {normalized_phrase}: Invalid length")
|
| 557 |
+
continue
|
| 558 |
+
|
| 559 |
+
skip_words = ['деген', 'деп', 'берілген', 'мәтінде', 'мағынасы', 'дегеннің',
|
| 560 |
+
'түсіндірілген', 'келтірілген', 'болып', 'табылады', 'ауруы',
|
| 561 |
+
'мынадай', 'тақырыбына', 'тіркестер', 'арналған', 'байланысты']
|
| 562 |
+
|
| 563 |
+
if any(skip in normalized_phrase for skip in skip_words):
|
| 564 |
+
print(f"Skipped phrase {normalized_phrase}: Contains skip word")
|
| 565 |
+
continue
|
| 566 |
+
|
| 567 |
+
if normalized_phrase in common_words and normalized_phrase not in message_normalized:
|
| 568 |
+
print(f"Skipped common phrase: {normalized_phrase}")
|
| 569 |
+
continue
|
| 570 |
+
|
| 571 |
+
if normalized_phrase not in self.known_terms:
|
| 572 |
+
print(f"Warning: {normalized_phrase} not in known_terms, but processing anyway")
|
| 573 |
+
|
| 574 |
+
category = "word"
|
| 575 |
+
definition = ""
|
| 576 |
+
|
| 577 |
+
for doc in retrieved_docs:
|
| 578 |
+
if normalized_phrase in self.normalize_term(doc.page_content):
|
| 579 |
+
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 580 |
+
if 'idiom' in doc_type or 'тіркес' in doc_type:
|
| 581 |
+
category = "idiom"
|
| 582 |
+
elif 'grammar' in doc_type:
|
| 583 |
+
category = "grammar"
|
| 584 |
+
else:
|
| 585 |
+
category = "word"
|
| 586 |
+
|
| 587 |
+
definition = self.extract_clean_definition(normalized_phrase, doc.page_content, response)
|
| 588 |
+
break
|
| 589 |
+
|
| 590 |
+
if definition and len(normalized_phrase.split()) <= 6:
|
| 591 |
+
if not any(normalized_phrase.startswith(q) for q in ['қалай', 'қандай', 'қайда', 'неше', 'қашан']):
|
| 592 |
+
terms.append((phrase, category, definition))
|
| 593 |
+
seen_terms.add(normalized_phrase)
|
| 594 |
+
print(f"Added term: {phrase}, category: {category}, definition: {definition}")
|
| 595 |
+
break
|
| 596 |
+
|
| 597 |
except Exception as e:
|
| 598 |
print(f"Error extracting terms: {e}")
|
| 599 |
+
|
| 600 |
return terms
|
| 601 |
|
| 602 |
def extract_clean_definition(self, term: str, doc_content: str, response: str) -> str:
|
| 603 |
+
"""Extract clean definition for a term, avoiding storing definitions as terms"""
|
| 604 |
+
normalized_term = self.normalize_term(term)
|
| 605 |
+
|
| 606 |
sentences = response.split('.')
|
| 607 |
for sentence in sentences:
|
| 608 |
+
sentence = sentence.strip()
|
| 609 |
+
if normalized_term in self.normalize_term(sentence) and len(sentence) > 10 and len(sentence) < 150:
|
| 610 |
+
if not any(word in sentence.lower() for word in ['деген не', 'қалай аталады', 'нені білдіреді']):
|
| 611 |
+
return sentence
|
|
|
|
| 612 |
|
| 613 |
doc_sentences = doc_content.split('.')
|
| 614 |
for sentence in doc_sentences:
|
| 615 |
+
sentence = sentence.strip()
|
| 616 |
+
if normalized_term in self.normalize_term(sentence) and len(sentence) > 10 and len(sentence) < 150:
|
| 617 |
+
return sentence
|
|
|
|
| 618 |
|
| 619 |
return f"Definition for {term}"
|
| 620 |
|
| 621 |
+
def get_user_memory(self, user_id: str):
    """Return the per-user conversation memory, creating it lazily.

    Each user gets an independent ConversationBufferMemory so chat
    histories never bleed between sessions.
    """
    memory = self.user_memories.get(user_id)
    if memory is None:
        memory = ConversationBufferMemory(
            memory_key='chat_history',
            return_messages=True,
            max_token_limit=10000
        )
        self.user_memories[user_id] = memory
    return memory
|
| 630 |
+
|
| 631 |
+
def get_user_chain(self, user_id: str):
    """Build a ConversationalRetrievalChain bound to *user_id*'s memory.

    NOTE(review): only the memory is cached (see get_user_memory); the
    chain and retriever are reconstructed on every call.
    """
    return ConversationalRetrievalChain.from_llm(
        llm=self.llm,
        retriever=self.vectorstore.as_retriever(),
        memory=self.get_user_memory(user_id)
    )
|
| 640 |
+
|
| 641 |
+
def process_message(self, message: str, user_id: str = "default_user", session_token: str = None, use_direct_gemini: bool = False, target_language: str = "English") -> str:
|
| 642 |
+
"""Process user message with proper user session management and toggle for direct Gemini"""
|
| 643 |
|
| 644 |
if session_token and not self.tracker.validate_session(user_id, session_token):
|
| 645 |
return "Session expired. Please login again."
|
|
|
|
| 656 |
return self.get_recommendations(user_id)
|
| 657 |
elif message.lower().startswith('/review'):
|
| 658 |
return self.get_review_words(user_id)
|
| 659 |
+
elif message.lower().startswith('/mastered'):
|
| 660 |
+
return self.get_mastered_words(user_id)
|
| 661 |
elif message.lower().startswith('/help'):
|
| 662 |
return self.get_help_message()
|
| 663 |
|
| 664 |
+
if use_direct_gemini:
|
| 665 |
+
return self.process_direct_gemini(message, user_id, target_language)
|
| 666 |
+
|
| 667 |
conversation_chain = self.get_user_chain(user_id)
|
| 668 |
result = conversation_chain.invoke({"question": message})
|
| 669 |
response = result["answer"]
|
|
|
|
| 742 |
|
| 743 |
return response
|
| 744 |
|
| 745 |
+
def get_mastered_words(self, user_id: str) -> str:
    """Format the user's mastered words (mastery level > 0) as a bilingual report.

    Pulls up to 10 records from the tracker; each entry shows a category
    icon, a 5-slot star bar, the encounter count, and a definition preview
    truncated to 80 characters.
    """
    entries = self.tracker.get_mastered_words(user_id, 10)

    if not entries:
        return ("Сізде әзірге меңгерілген сөздер жоқ. Терминдерді қайталауды жалғастырыңыз, сонда олар осында пайда болады! 🌟\n\n"
                "You haven't mastered any words yet. Keep reviewing terms, and they'll appear here! 🌟")

    parts = ["🏆 **Меңгерілген сөздер / Mastered Words**:\n\n"]
    for entry in entries:
        icon = "📝" if entry['category'] == "word" else "🎭"
        level = entry['mastery_level']
        # Filled stars for the mastery level, empty squares up to 5 slots.
        stars = "🟊" * level + "⬜" * (5 - level)
        parts.append(f"{icon} **{entry['word']}** - {stars} (Кездесу саны / Encounters: {entry['encounter_count']})\n")
        definition = entry['definition']
        preview = definition[:80] + "..." if len(definition) > 80 else definition
        parts.append(f" {preview}\n\n")
    return "".join(parts)
|
| 763 |
+
|
| 764 |
def get_help_message(self) -> str:
|
| 765 |
"""Get help message with available commands"""
|
| 766 |
return """
|
|
|
|
| 770 |
- `/progress` - View your detailed learning progress
|
| 771 |
- `/recommendations` - Get personalized learning suggestions
|
| 772 |
- `/review` - See words that need review
|
| 773 |
+
- `/mastered` - See words you've mastered (mastery level > 0)
|
| 774 |
- `/help` - Show this help message
|
| 775 |
|
| 776 |
**How to Use**:
|
|
|
|
| 791 |
"""Create a session token for user authentication"""
|
| 792 |
session_token = self.tracker.create_user_session(user_id)
|
| 793 |
return session_token
|
| 794 |
+
|
| 795 |
+
def process_direct_gemini(self, message: str, user_id: str, target_language: str = "English") -> str:
    """Answer via a standalone Gemini call (no RAG, no progress tracking).

    The reply language defaults to *target_language*, but an explicit
    in-message request for Kazakh or Russian overrides it. Any failure is
    reported back to the caller as an error string rather than raised.
    """
    try:
        # System prompt steering Gemini toward grammar/vocabulary teaching.
        direct_prompt = """
        You are a Kazakh language teacher specializing in grammar and vocabulary. Your role is to teach Kazakh grammar and words in the user's requested language (Kazakh, Russian, or English). Provide clear, concise explanations tailored to language learners, including examples and practical usage. If the user doesn't specify a language, default to English. Do not rely on external knowledge bases; use your internal knowledge to generate accurate and educational responses. Be encouraging and supportive, and adapt explanations to the user's proficiency level if known.
        """
        teacher_llm = ChatGoogleGenerativeAI(
            model="models/gemini-1.5-flash",
            temperature=0.7,
            model_kwargs={"system_instruction": direct_prompt}
        )

        # Let an explicit in-message language request win over the dropdown.
        lowered = message.lower()
        kazakh_markers = ('kazakh', 'қазақша', 'қазақ тілінде')
        russian_markers = ('russian', 'русский', 'орысша')
        if any(marker in lowered for marker in kazakh_markers):
            target_language = "Kazakh"
        elif any(marker in lowered for marker in russian_markers):
            target_language = "Russian"

        prompt = f"Explain in {target_language}: {message}"
        return teacher_llm.invoke(prompt).content
    except Exception as e:
        return f"Error processing direct Gemini request: {str(e)}"
|
| 819 |
|
| 820 |
assistant = PersonalizedKazakhAssistant()
|
| 821 |
|
| 822 |
+
def chat_interface(message, history, use_direct_gemini, target_language):
    """Gradio chat handler: route the message to the shared assistant.

    *history* is supplied by gr.ChatInterface but unused here; the two
    extra inputs come from the UI toggle and language dropdown.
    """
    try:
        # All anonymous web sessions share one consistent tracking identity.
        response = assistant.process_message(
            message,
            "web_user_default",
            use_direct_gemini=use_direct_gemini,
            target_language=target_language,
        )
    except Exception as e:
        return f"Sorry, I encountered an error: {str(e)}. Please try again."
    return response
|
| 830 |
+
|
|
|
|
| 831 |
def api_login(user_id: str) -> dict:
|
| 832 |
"""API endpoint for user login/session creation"""
|
| 833 |
try:
|
|
|
|
| 844 |
"error": str(e)
|
| 845 |
}
|
| 846 |
|
| 847 |
+
def api_chat(message: str, user_id: str, session_token: str = None, use_direct_gemini: bool = False, target_language: str = "English") -> dict:
|
| 848 |
+
"""API endpoint for chat functionality with proper user session and direct Gemini toggle"""
|
| 849 |
try:
|
| 850 |
+
response = assistant.process_message(message, user_id, session_token, use_direct_gemini, target_language)
|
| 851 |
return {
|
| 852 |
"success": True,
|
| 853 |
"response": response,
|
|
|
|
| 923 |
"error": str(e)
|
| 924 |
}
|
| 925 |
|
| 926 |
+
def api_mastered_words(user_id: str, session_token: str = None) -> dict:
    """API endpoint returning the user's mastered words.

    Validates *session_token* when one is supplied; the payload carries
    both a formatted text report and the raw per-word records. Errors are
    returned as {"success": False, "error": ...} rather than raised.
    """
    try:
        if session_token and not assistant.tracker.validate_session(user_id, session_token):
            return {"success": False, "error": "Invalid session"}

        return {
            "success": True,
            "mastered_text": assistant.get_mastered_words(user_id),
            "mastered_data": assistant.tracker.get_mastered_words(user_id, 10),
            "user_id": user_id
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
|
| 946 |
+
|
| 947 |
with gr.Blocks(title="🇰🇿 Kazakh Learning API") as demo:
|
|
|
|
| 948 |
gr.Markdown("# 🇰🇿 Personalized Kazakh Learning Assistant")
|
| 949 |
gr.Markdown("### Multi-User Chat Interface + API Endpoints for Mobile Integration")
|
| 950 |
|
| 951 |
with gr.Tab("💬 Chat Interface"):
|
| 952 |
+
gr.Markdown("Toggle **Direct Gemini Mode** to learn Kazakh grammar without RAG. Select the language for explanations.")
|
| 953 |
+
with gr.Row():
|
| 954 |
+
use_direct_gemini = gr.Checkbox(label="Direct Gemini Mode (No RAG/Tracking)", value=False)
|
| 955 |
+
target_language = gr.Dropdown(
|
| 956 |
+
label="Explanation Language",
|
| 957 |
+
choices=["English", "Kazakh", "Russian"],
|
| 958 |
+
value="English"
|
| 959 |
+
)
|
| 960 |
chat_interface = gr.ChatInterface(
|
| 961 |
+
fn=chat_interface,
|
| 962 |
+
additional_inputs=[use_direct_gemini, target_language],
|
| 963 |
type="messages",
|
| 964 |
examples=[
|
| 965 |
+
["сәлем деген не?", False, "English"],
|
| 966 |
+
["күләпара не үшін керек?", False, "English"],
|
| 967 |
+
["/progress", False, "English"],
|
| 968 |
+
["/recommendations", False, "English"],
|
| 969 |
+
["/review", False, "English"],
|
| 970 |
+
["/mastered", False, "English"],
|
| 971 |
+
["Explain Kazakh noun cases in Russian", True, "Russian"],
|
| 972 |
+
["Teach me Kazakh verb conjugation in English", True, "English"]
|
| 973 |
]
|
| 974 |
)
|
| 975 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 976 |
with gr.Tab("📖 API Documentation"):
|
| 977 |
gr.Markdown("""
|
| 978 |
## API Endpoints for Flutter Integration
|
|
|
|
| 979 |
### Base URL: `https://huggingface.co/spaces/GuestUser33/kazakh-learning-api`
|
| 980 |
+
|
| 981 |
### Authentication Flow:
|
| 982 |
+
1. **Login** to get a session token
|
| 983 |
2. **Use session token** for subsequent API calls
|
| 984 |
3. **Session tokens expire** after inactivity
|
| 985 |
+
|
| 986 |
### Available Endpoints:
|
| 987 |
+
|
| 988 |
#### 1. Login API
|
| 989 |
```
|
| 990 |
POST /api/predict
|
| 991 |
Content-Type: application/json
|
| 992 |
+
|
| 993 |
{
|
| 994 |
+
"data": ["user_id"],
|
| 995 |
+
"fn_index": 0
|
| 996 |
}
|
| 997 |
```
|
| 998 |
+
**Response**: `{"success": true, "session_token": "uuid", "user_id": "user_id", "message": "Login successful"}`
|
| 999 |
+
|
| 1000 |
#### 2. Chat API
|
| 1001 |
```
|
| 1002 |
POST /api/predict
|
| 1003 |
Content-Type: application/json
|
| 1004 |
+
|
| 1005 |
{
|
| 1006 |
+
"data": ["message", "user_id", "session_token", use_direct_gemini, "target_language"],
|
| 1007 |
+
"fn_index": 1
|
| 1008 |
}
|
| 1009 |
```
|
| 1010 |
+
**Parameters**:
|
| 1011 |
+
- `message`: The user's query (e.g., "сәлем деген не?" or "Explain Kazakh noun cases")
|
| 1012 |
+
- `user_id`: Unique identifier for the user
|
| 1013 |
+
- `session_token`: Session token from login (optional, but required for authenticated sessions)
|
| 1014 |
+
- `use_direct_gemini`: Boolean (`true`/`false`) to toggle Direct Gemini mode for grammar-focused responses without RAG/tracking
|
| 1015 |
+
- `target_language`: Language for responses (`English`, `Kazakh`, or `Russian`)
|
| 1016 |
+
|
| 1017 |
+
**Response**: `{"success": true, "response": "response_text", "user_id": "user_id"}`
|
| 1018 |
+
|
| 1019 |
+
#### 3. Progress API
|
| 1020 |
```
|
| 1021 |
POST /api/predict
|
| 1022 |
Content-Type: application/json
|
| 1023 |
+
|
| 1024 |
{
|
| 1025 |
+
"data": ["user_id", "session_token"],
|
| 1026 |
+
"fn_index": 2
|
| 1027 |
}
|
| 1028 |
```
|
| 1029 |
+
**Response**: `{"success": true, "progress_text": "progress_report", "progress_data": {...}, "user_id": "user_id"}`
|
| 1030 |
+
|
| 1031 |
#### 4. Recommendations API
|
| 1032 |
```
|
| 1033 |
POST /api/predict
|
| 1034 |
Content-Type: application/json
|
| 1035 |
+
|
| 1036 |
{
|
| 1037 |
+
"data": ["user_id", "session_token"],
|
| 1038 |
+
"fn_index": 3
|
| 1039 |
}
|
| 1040 |
```
|
| 1041 |
+
**Response**: `{"success": true, "recommendations_text": "recommendations", "recommendations_list": [...], "user_id": "user_id"}`
|
| 1042 |
+
|
| 1043 |
#### 5. Review Words API
|
| 1044 |
```
|
| 1045 |
POST /api/predict
|
| 1046 |
Content-Type: application/json
|
| 1047 |
+
|
| 1048 |
{
|
| 1049 |
+
"data": ["user_id", "session_token"],
|
| 1050 |
+
"fn_index": 4
|
| 1051 |
}
|
| 1052 |
```
|
| 1053 |
+
**Response**: `{"success": true, "review_text": "review_words", "review_data": [...], "user_id": "user_id"}`
|
| 1054 |
+
|
| 1055 |
+
#### 6. Mastered Words API
|
| 1056 |
+
```
|
| 1057 |
+
POST /api/predict
|
| 1058 |
+
Content-Type: application/json
|
| 1059 |
+
|
| 1060 |
+
{
|
| 1061 |
+
"data": ["user_id", "session_token"],
|
| 1062 |
+
"fn_index": 5
|
| 1063 |
+
}
|
| 1064 |
+
```
|
| 1065 |
+
**Response**: `{"success": true, "mastered_text": "mastered_words", "mastered_data": [...], "user_id": "user_id"}`
|
| 1066 |
+
|
| 1067 |
### Flutter Integration Example:
|
| 1068 |
```dart
|
| 1069 |
+
import 'dart:convert';
|
| 1070 |
+
import 'package:http/http.dart' as http;
|
| 1071 |
+
|
| 1072 |
class KazakhLearningAPI {
|
| 1073 |
+
static const String baseUrl = 'https://huggingface.co/spaces/GuestUser33/kazakh-learning-api';
|
| 1074 |
+
String? sessionToken;
|
| 1075 |
+
String? userId;
|
| 1076 |
+
|
| 1077 |
+
// Login and get session token
|
| 1078 |
+
Future<bool> login(String userId) async {
|
| 1079 |
final response = await http.post(
|
| 1080 |
+
Uri.parse('$baseUrl/api/predict'),
|
| 1081 |
+
headers: {'Content-Type': 'application/json'},
|
| 1082 |
+
body: jsonEncode({
|
| 1083 |
'data': [userId],
|
| 1084 |
'fn_index': 0
|
| 1085 |
+
}),
|
| 1086 |
);
|
| 1087 |
+
|
| 1088 |
if (response.statusCode == 200) {
|
| 1089 |
+
final result = jsonDecode(response.body);
|
| 1090 |
+
if (result['data'][0]['success']) {
|
| 1091 |
this.userId = userId;
|
| 1092 |
this.sessionToken = result['data'][0]['session_token'];
|
| 1093 |
return true;
|
| 1094 |
+
}
|
| 1095 |
}
|
| 1096 |
return false;
|
| 1097 |
+
}
|
| 1098 |
+
|
| 1099 |
+
// Send chat message
|
| 1100 |
+
Future<String?> sendMessage(
|
| 1101 |
+
String message, {
|
| 1102 |
+
bool useDirectGemini = false,
|
| 1103 |
+
String targetLanguage = 'English',
|
| 1104 |
+
}) async {
|
| 1105 |
if (sessionToken == null) return null;
|
| 1106 |
+
|
| 1107 |
final response = await http.post(
|
| 1108 |
+
Uri.parse('$baseUrl/api/predict'),
|
| 1109 |
+
headers: {'Content-Type': 'application/json'},
|
| 1110 |
+
body: jsonEncode({
|
| 1111 |
+
'data': [message, userId, sessionToken, useDirectGemini, targetLanguage],
|
| 1112 |
'fn_index': 1
|
| 1113 |
+
}),
|
| 1114 |
);
|
| 1115 |
+
|
| 1116 |
if (response.statusCode == 200) {
|
| 1117 |
+
final result = jsonDecode(response.body);
|
| 1118 |
+
if (result['data'][0]['success']) {
|
| 1119 |
return result['data'][0]['response'];
|
| 1120 |
+
}
|
| 1121 |
}
|
| 1122 |
return null;
|
| 1123 |
+
}
|
| 1124 |
+
|
| 1125 |
+
// Get user progress
|
| 1126 |
+
Future<Map<String, dynamic>?> getProgress() async {
|
| 1127 |
if (sessionToken == null) return null;
|
| 1128 |
+
|
| 1129 |
final response = await http.post(
|
| 1130 |
+
Uri.parse('$baseUrl/api/predict'),
|
| 1131 |
+
headers: {'Content-Type': 'application/json'},
|
| 1132 |
+
body: jsonEncode({
|
| 1133 |
'data': [userId, sessionToken],
|
| 1134 |
'fn_index': 2
|
| 1135 |
+
}),
|
| 1136 |
);
|
| 1137 |
+
|
| 1138 |
if (response.statusCode == 200) {
|
| 1139 |
+
final result = jsonDecode(response.body);
|
| 1140 |
+
if (result['data'][0]['success']) {
|
| 1141 |
return result['data'][0]['progress_data'];
|
| 1142 |
+
}
|
| 1143 |
}
|
| 1144 |
return null;
|
| 1145 |
+
}
|
| 1146 |
+
|
| 1147 |
+
// Get recommendations
|
| 1148 |
+
Future<List<String>?> getRecommendations() async {
|
| 1149 |
+
if (sessionToken == null) return null;
|
| 1150 |
+
|
| 1151 |
+
final response = await http.post(
|
| 1152 |
+
Uri.parse('$baseUrl/api/predict'),
|
| 1153 |
+
headers: {'Content-Type': 'application/json'},
|
| 1154 |
+
body: jsonEncode({
|
| 1155 |
+
'data': [userId, sessionToken],
|
| 1156 |
+
'fn_index': 3
|
| 1157 |
+
}),
|
| 1158 |
+
);
|
| 1159 |
+
|
| 1160 |
+
if (response.statusCode == 200) {
|
| 1161 |
+
final result = jsonDecode(response.body);
|
| 1162 |
+
if (result['data'][0]['success']) {
|
| 1163 |
+
return List<String>.from(result['data'][0]['recommendations_list']);
|
| 1164 |
+
}
|
| 1165 |
+
}
|
| 1166 |
+
return null;
|
| 1167 |
+
}
|
| 1168 |
+
|
| 1169 |
+
// Get words to review
|
| 1170 |
+
Future<List<dynamic>?> getReviewWords() async {
|
| 1171 |
+
if (sessionToken == null) return null;
|
| 1172 |
+
|
| 1173 |
+
final response = await http.post(
|
| 1174 |
+
Uri.parse('$baseUrl/api/predict'),
|
| 1175 |
+
headers: {'Content-Type': 'application/json'},
|
| 1176 |
+
body: jsonEncode({
|
| 1177 |
+
'data': [userId, sessionToken],
|
| 1178 |
+
'fn_index': 4
|
| 1179 |
+
}),
|
| 1180 |
+
);
|
| 1181 |
+
|
| 1182 |
+
if (response.statusCode == 200) {
|
| 1183 |
+
final result = jsonDecode(response.body);
|
| 1184 |
+
if (result['data'][0]['success']) {
|
| 1185 |
+
return result['data'][0]['review_data'];
|
| 1186 |
+
}
|
| 1187 |
+
}
|
| 1188 |
+
return null;
|
| 1189 |
+
}
|
| 1190 |
+
|
| 1191 |
+
// Get mastered words
|
| 1192 |
+
Future<List<dynamic>?> getMasteredWords() async {
|
| 1193 |
+
if (sessionToken == null) return null;
|
| 1194 |
+
|
| 1195 |
+
final response = await http.post(
|
| 1196 |
+
Uri.parse('$baseUrl/api/predict'),
|
| 1197 |
+
headers: {'Content-Type': 'application/json'},
|
| 1198 |
+
body: jsonEncode({
|
| 1199 |
+
'data': [userId, sessionToken],
|
| 1200 |
+
'fn_index': 5
|
| 1201 |
+
}),
|
| 1202 |
+
);
|
| 1203 |
+
|
| 1204 |
+
if (response.statusCode == 200) {
|
| 1205 |
+
final result = jsonDecode(response.body);
|
| 1206 |
+
if (result['data'][0]['success']) {
|
| 1207 |
+
return result['data'][0]['mastered_data'];
|
| 1208 |
+
}
|
| 1209 |
+
}
|
| 1210 |
+
return null;
|
| 1211 |
+
}
|
| 1212 |
}
|
| 1213 |
```
|
| 1214 |
+
|
| 1215 |
### Key Features:
|
| 1216 |
- ✅ **Multi-User Support**: Each user has separate learning progress
|
| 1217 |
- ✅ **Session Management**: Secure session tokens for authentication
|
| 1218 |
+
- ✅ **Personalized Tracking**: Individual progress tracking per user (in RAG mode)
|
| 1219 |
+
- ✅ **Direct Gemini Mode**: Toggle for grammar-focused responses without RAG/tracking
|
| 1220 |
+
- ✅ **Multi-Language Support**: Responses in English, Kazakh, or Russian
|
| 1221 |
- ✅ **API Ready**: All endpoints ready for mobile app integration
|
| 1222 |
- ✅ **Session Validation**: Automatic session validation and expiry
|
| 1223 |
+
|
| 1224 |
### Usage Notes:
|
| 1225 |
- Always call **login** first to get a session token
|
| 1226 |
- Include **session_token** in all subsequent API calls
|
| 1227 |
+
- Use `use_direct_gemini: true` for grammar/vocabulary lessons without tracking
|
| 1228 |
+
- Specify `target_language` (`English`, `Kazakh`, `Russian`) for Direct Gemini mode
|
| 1229 |
- Handle **session expiry** by re-logging in
|
| 1230 |
+
- Use **unique user_id** for each user (e.g., email, username)
|
| 1231 |
+
- Commands like `/progress`, `/recommendations`, `/review`, `/mastered` are only available in RAG mode (`use_direct_gemini: false`)
|
| 1232 |
+
""")
|
| 1233 |
|
| 1234 |
if __name__ == "__main__":
|
| 1235 |
demo.launch()
|