GuestUser33 committed on
Commit
5b6aa7a
·
verified ·
1 Parent(s): 78f88a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -304
app.py CHANGED
@@ -10,15 +10,14 @@ from collections import defaultdict
10
  import re
11
  import uuid
12
  import hashlib
 
13
 
14
  from dotenv import load_dotenv
15
  import gradio as gr
16
 
17
  from langchain_community.document_loaders import DirectoryLoader, TextLoader
18
  from langchain.text_splitter import CharacterTextSplitter
19
- from langchain.schema import Document
20
  from langchain_chroma import Chroma
21
- from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
22
  from langchain_huggingface import HuggingFaceEmbeddings
23
  from langchain.memory import ConversationBufferMemory
24
  from langchain.chains import ConversationalRetrievalChain
@@ -423,20 +422,19 @@ class PersonalizedLearningTracker:
423
  return words
424
 
425
  class PersonalizedKazakhAssistant:
426
- def __init__(self):
427
  self.known_terms = set()
428
  self.setup_environment()
429
  self.setup_vectorstore()
430
- self.setup_llm()
431
  self.tracker = PersonalizedLearningTracker()
432
  self.user_sessions = {}
433
  self.user_memories = {}
434
 
435
  def setup_environment(self):
436
  """Setup environment and configuration"""
437
- # self.google_api_key = os.getenv("GOOGLE_API_KEY")
438
  load_dotenv()
439
- os.environ['GOOGLE_API_KEY'] = os.getenv("GOOGLE_API_KEY")
440
  self.MODEL = "gemini-1.5-flash"
441
  self.db_name = "vector_db"
442
 
@@ -486,34 +484,37 @@ class PersonalizedKazakhAssistant:
486
  self.vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=self.db_name)
487
  print(f"Vectorstore created with {self.vectorstore._collection.count()} documents")
488
 
489
- def setup_llm(self):
490
- """Setup LLM with enhanced system prompt"""
491
- system_prompt = """
492
- You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations.
493
 
494
- Key capabilities:
495
- 1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms from your knowledge base
496
- 2. **Track Learning Progress**: Identify and track when users learn new words or idioms
497
- 3. **Personalized Responses**: Adapt responses based on user's learning history and progress
498
- 4. **Progress Reporting**: Provide detailed progress reports when asked
499
- 5. **Learning Recommendations**: Suggest words/idioms to review or learn next
500
 
501
- Response Guidelines:
502
- - For word/idiom queries: Provide definition, usage examples, and related information
503
- - Always identify the main Kazakh word/idiom being discussed for progress tracking
504
- - Be encouraging and supportive of the user's learning journey
505
- - Use simple, clear explanations appropriate for language learners
506
- - When discussing progress, be specific and motivating
507
- - Avoid storing definitions as terms; only track the word/idiom itself
508
- - Normalize terms to lowercase to avoid duplicates due to case differences
509
-
510
- Format responses naturally in conversational style, not JSON unless specifically requested.
511
- """
512
-
513
- self.llm = ChatGoogleGenerativeAI(
514
- model="models/gemini-1.5-flash",
515
- temperature=0.7,
516
- model_kwargs={"system_instruction": system_prompt}
 
 
 
517
  )
518
 
519
  def normalize_term(self, term: str) -> str:
@@ -521,253 +522,52 @@ class PersonalizedKazakhAssistant:
521
  return ' '.join(term.lower().strip().split())
522
 
523
  def extract_kazakh_terms(self, message: str, response: str) -> List[Tuple[str, str, str]]:
524
- """Extract meaningful Kazakh terms, prioritizing response terms and full idioms."""
525
  terms = []
526
  seen_terms = set()
527
 
528
  try:
529
  retrieved_docs = self.vectorstore.similarity_search(message, k=5)
530
 
531
- response_normalized = self.normalize_term(response)
532
- message_normalized = self.normalize_term(message)
 
533
 
534
- is_multi_term_query = any(keyword in message_normalized for keyword in ['мысал', 'тіркестер', 'пример'])
535
- is_definition_query = any(keyword in message_normalized for keyword in ['деген не', 'мағынасы', 'қалай аталады'])
536
-
537
- # Step 1: For definition queries, prioritize response's primary term
538
- if is_definition_query and not is_multi_term_query:
539
- # Check if response is a single word
540
- response_words = response_normalized.split()
541
- if len(response_words) == 1:
542
- term = response.strip()
543
- normalized_term = self.normalize_term(term)
544
- if normalized_term in self.known_terms and normalized_term not in seen_terms and len(normalized_term) > 2 and len(normalized_term) <= 100:
545
- category = "word"
546
- definition = ""
547
- for doc in retrieved_docs:
548
- if normalized_term in self.normalize_term(doc.page_content):
549
- doc_type = doc.metadata.get('doc_type', '').lower()
550
- if 'idiom' in doc_type or 'тіркес' in doc_type:
551
- category = "idiom"
552
- elif 'grammar' in doc_type:
553
- category = "grammar"
554
- else:
555
- category = "word"
556
- definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
557
- break
558
- if not definition:
559
- definition = self.extract_clean_definition(normalized_term, "", response)
560
- if definition:
561
- terms.append((term, category, definition))
562
- seen_terms.add(normalized_term)
563
- print(f"Added single response term: {term}, category: {category}, definition: {definition}")
564
- return terms
565
-
566
- # Look for quoted term in response (e.g., "басыр" in "Берілген мәтін бойынша, 'басыр' - көз ауруы")
567
- quoted_pattern = r'[\'\"]([А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*)[\'\"]'
568
- quoted_matches = re.findall(quoted_pattern, response)
569
- if quoted_matches:
570
- term = quoted_matches[0]
571
- normalized_term = self.normalize_term(term)
572
- if normalized_term in self.known_terms and normalized_term not in seen_terms and len(normalized_term) > 2 and len(normalized_term) <= 100:
573
- category = "word"
574
- definition = ""
575
- for doc in retrieved_docs:
576
- if normalized_term in self.normalize_term(doc.page_content):
577
- doc_type = doc.metadata.get('doc_type', '').lower()
578
- if 'idiom' in doc_type or 'тіркес' in doc_type:
579
- category = "idiom"
580
- elif 'grammar' in doc_type:
581
- category = "grammar"
582
- else:
583
- category = "word"
584
- definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
585
- break
586
- if not definition:
587
- definition = self.extract_clean_definition(normalized_term, "", response)
588
- if definition:
589
- terms.append((term, category, definition))
590
- seen_terms.add(normalized_term)
591
- print(f"Added quoted term: {term}, category: {category}, definition: {definition}")
592
- return terms
593
-
594
- # Look for term before hyphen (e.g., "басыр — көз ауруы")
595
- hyphen_pattern = r'^([А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*)\s*[-–—]\s*(.+)$'
596
- hyphen_matches = re.match(hyphen_pattern, response.strip(), re.MULTILINE)
597
- if hyphen_matches:
598
- term = hyphen_matches.group(1).strip()
599
- definition_part = hyphen_matches.group(2).strip()
600
- normalized_term = self.normalize_term(term)
601
- if normalized_term in self.known_terms and normalized_term not in seen_terms and len(normalized_term) > 2 and len(normalized_term) <= 100:
602
- category = "word"
603
- definition = definition_part
604
- for doc in retrieved_docs:
605
- if normalized_term in self.normalize_term(doc.page_content):
606
- doc_type = doc.metadata.get('doc_type', '').lower()
607
- if 'idiom' in doc_type or 'тіркес' in doc_type:
608
- category = "idiom"
609
- elif 'grammar' in doc_type:
610
- category = "grammar"
611
- else:
612
- category = "word"
613
- definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
614
- break
615
- if not definition:
616
- definition = definition_part
617
- if definition:
618
- terms.append((term, category, definition))
619
- seen_terms.add(normalized_term)
620
- print(f"Added hyphen term: {term}, category: {category}, definition: {definition}")
621
- return terms
622
-
623
- # Check query term, but only if it’s the primary term in the response
624
- query_words = message_normalized.split()
625
- for word in query_words:
626
- normalized_word = self.normalize_term(word)
627
- if normalized_word in self.known_terms and normalized_word not in seen_terms:
628
- # Ensure the query term is the primary term in the response
629
- sentences = response.split('.')
630
- for sentence in sentences:
631
- sentence = sentence.strip()
632
- if not sentence:
633
- continue
634
- if normalized_word in self.normalize_term(sentence):
635
  category = "word"
636
- definition = ""
637
- for doc in retrieved_docs:
638
- if normalized_word in self.normalize_term(doc.page_content):
639
- doc_type = doc.metadata.get('doc_type', '').lower()
640
- if 'idiom' in doc_type or 'тіркес' in doc_type:
641
- category = "idiom"
642
- elif 'grammar' in doc_type:
643
- category = "grammar"
644
- else:
645
- category = "word"
646
- definition = self.extract_clean_definition(normalized_word, doc.page_content, response)
647
- break
648
- if not definition:
649
- definition = self.extract_clean_definition(normalized_word, "", response)
650
- if definition:
651
- terms.append((word, category, definition))
652
- seen_terms.add(normalized_word)
653
- print(f"Added query term: {word}, category: {category}, definition: {definition}")
654
- return terms
655
-
656
- # Fallback to primary term in response (e.g., "абыз" in "Ел атасы данагөйді абыз деп атайды")
657
- sentences = response.split('.')
658
- for sentence in sentences:
659
- sentence = sentence.strip()
660
- if not sentence:
661
- continue
662
- kazakh_phrases = re.findall(
663
- r'[А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+){0,2}',
664
- sentence
665
- )
666
- for phrase in kazakh_phrases:
667
- normalized_phrase = self.normalize_term(phrase)
668
- if normalized_phrase in seen_terms or len(normalized_phrase) <= 2 or len(normalized_phrase) > 100:
669
- print(f"Skipped phrase {normalized_phrase}: Invalid length or already seen")
670
- continue
671
- if normalized_phrase in self.known_terms and any(
672
- normalized_phrase in self.normalize_term(doc.page_content) for doc in retrieved_docs
673
- ):
674
- category = "word"
675
- definition = ""
676
- for doc in retrieved_docs:
677
- if normalized_phrase in self.normalize_term(doc.page_content):
678
- doc_type = doc.metadata.get('doc_type', '').lower()
679
- if 'idiom' in doc_type or 'тіркес' in doc_type:
680
- category = "idiom"
681
- elif 'grammar' in doc_type:
682
- category = "grammar"
683
- else:
684
- category = "word"
685
- definition = self.extract_clean_definition(normalized_phrase, doc.page_content, response)
686
- break
687
- if not definition:
688
- definition = self.extract_clean_definition(normalized_phrase, "", response)
689
- if definition:
690
- terms.append((phrase, category, definition))
691
- seen_terms.add(normalized_phrase)
692
- print(f"Added phrase: {phrase}, category: {category}, definition: {definition}")
693
- return terms
694
-
695
- # Step 2: For multi-term queries, prioritize full idioms from response
696
- if is_multi_term_query:
697
- kazakh_phrases = re.findall(
698
- r'[А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s,-]+[А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*',
699
- response
700
- )
701
- for phrase in kazakh_phrases:
702
- normalized_phrase = self.normalize_term(phrase)
703
- if normalized_phrase in seen_terms or len(normalized_phrase) <= 2 or len(normalized_phrase) > 100:
704
- print(f"Skipped phrase {normalized_phrase}: Invalid length or already seen")
705
- continue
706
- if normalized_phrase in self.known_terms or any(
707
- normalized_phrase in self.normalize_term(doc.page_content) for doc in retrieved_docs
708
- ):
709
- category = "word"
710
- definition = ""
711
- for doc in retrieved_docs:
712
- if normalized_phrase in self.normalize_term(doc.page_content):
713
- doc_type = doc.metadata.get('doc_type', '').lower()
714
- if 'idiom' in doc_type or 'тіркес' in doc_type:
715
- category = "idiom"
716
- elif 'grammar' in doc_type:
717
- category = "grammar"
718
- else:
719
- category = "word"
720
- definition = self.extract_clean_definition(normalized_phrase, doc.page_content, response)
721
- break
722
- if not definition:
723
- definition = self.extract_clean_definition(normalized_phrase, "", response)
724
- if definition and len(normalized_phrase.split()) <= 6:
725
- terms.append((phrase, category, definition))
726
- seen_terms.add(normalized_phrase)
727
- print(f"Added phrase: {phrase}, category: {category}, definition: {definition}")
728
- return terms
729
-
730
- for known_term in self.known_terms:
731
- normalized_known_term = self.normalize_term(known_term)
732
- if normalized_known_term in response_normalized and normalized_known_term not in seen_terms:
733
-
734
- is_part_of_idiom = any(
735
- normalized_known_term in self.normalize_term(idiom) and len(idiom.split()) > 1
736
- for idiom in self.known_terms
737
- if idiom != normalized_known_term
738
- )
739
- if is_part_of_idiom:
740
- print(f"Skipped term {known_term}: Part of a larger idiom")
741
- continue
742
- if normalized_known_term in self.known_terms and any(
743
- normalized_known_term in self.normalize_term(doc.page_content) for doc in retrieved_docs
744
- ):
745
- category = "word"
746
- definition = ""
747
- for doc in retrieved_docs:
748
- if normalized_known_term in self.normalize_term(doc.page_content):
749
- doc_type = doc.metadata.get('doc_type', '').lower()
750
- if 'idiom' in doc_type or 'тіркес' in doc_type:
751
- category = "idiom"
752
- elif 'grammar' in doc_type:
753
- category = "grammar"
754
- else:
755
- category = "word"
756
- definition = self.extract_clean_definition(normalized_known_term, doc.page_content, response)
757
- break
758
- if not definition:
759
- definition = self.extract_clean_definition(normalized_known_term, "", response)
760
- if definition and len(normalized_known_term.split()) <= 10:
761
- terms.append((known_term, category, definition))
762
- seen_terms.add(normalized_known_term)
763
- print(f"Added known term: {known_term}, category: {category}, definition: {definition}")
764
- if not is_multi_term_query:
765
- return terms
766
-
767
  except Exception as e:
768
  print(f"Error extracting terms: {e}")
769
-
770
- return terms
771
 
772
  def extract_clean_definition(self, term: str, doc_content: str, response: str) -> str:
773
  """Extract a clean definition for a term from the knowledge base."""
@@ -810,14 +610,18 @@ class PersonalizedKazakhAssistant:
810
  """Process user message with proper user session management and toggle for direct Gemini"""
811
 
812
  if session_token and not self.tracker.validate_session(user_id, session_token):
813
- return "Session expired. Please login again."
814
-
815
  if session_token:
816
  self.tracker.update_session_activity(user_id, session_token)
817
 
818
  if user_id not in self.user_sessions:
819
  self.user_sessions[user_id] = self.tracker.start_session(user_id)
 
 
 
820
 
 
821
  if message.lower().startswith('/progress'):
822
  return self.get_progress_report(user_id)
823
  elif message.lower().startswith('/recommendations'):
@@ -831,35 +635,90 @@ class PersonalizedKazakhAssistant:
831
  elif message.lower().startswith('/newword'):
832
  new_word = self.get_new_word(user_id)
833
  if not new_word:
834
- return "Қазір жаңа сөздер жоқ. Басқа сөздерді қайталаңыз! 🌟\n\nNo new words available right now. Review other words! 🌟"
835
  self.tracker.track_word_encounter(user_id, new_word['word'], new_word['definition'], new_word['category'])
836
  return f"📝 **Жаңа сөз / New Word**: {new_word['word']}\n\nМағынасы / Meaning: {new_word['definition']}"
837
  elif message.lower().startswith('/newidiom'):
838
  new_idiom = self.get_new_idiom(user_id)
839
  if not new_idiom:
840
- return "Қазір жаңа тіркестер жоқ. Басқа тіркестерді қайталаңыз! 🌟\n\nNo new idioms available right now. Review other idioms! 🌟"
841
  self.tracker.track_word_encounter(user_id, new_idiom['word'], new_idiom['definition'], new_idiom['category'])
842
  return f"🎭 **Жаңа тіркес / New Idiom**: {new_idiom['word']}\n\nМағынасы / Meaning: {new_idiom['definition']}"
843
  elif message.lower().startswith('/help'):
844
  return self.get_help_message()
845
-
846
  if use_direct_gemini:
847
  return self.process_direct_gemini(message, user_id, target_language)
848
 
849
- conversation_chain = self.get_user_chain(user_id)
850
- result = conversation_chain.invoke({"question": message})
851
- response = result["answer"]
852
 
853
- extracted_terms = self.extract_kazakh_terms(message, response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
854
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
855
  for term, category, definition in extracted_terms:
856
- if definition and term:
857
- self.tracker.track_word_encounter(
858
- user_id,
859
- term,
860
- definition,
861
- category
862
- )
863
 
864
  return response
865
 
@@ -1058,38 +917,41 @@ Start learning by asking about any Kazakh term! 🌟
1058
  """Process message using direct Gemini with conversation memory for context."""
1059
  try:
1060
  memory = self.get_user_memory(user_id)
1061
-
1062
- direct_prompt = """
1063
- You are a friendly and supportive Kazakh language learning assistant. Your role is to help users learn Kazakh vocabulary, grammar, and idioms in a clear, concise, and engaging way. Respond in the user's primary language, inferred from their input, unless a specific language (English, Kazakh, or Russian) is requested. Provide practical examples and explanations tailored to language learners. Keep responses concise (under 200 words) and encouraging. Use your internal knowledge to ensure accuracy and relevance, focusing exclusively on Kazakh language learning.
1064
-
1065
- Previous conversation context:
1066
- {chat_history}
1067
- """
1068
  chat_history = ""
1069
- for msg in memory.chat_memory.messages[-10:]:
1070
  if isinstance(msg, HumanMessage):
1071
  chat_history += f"User: {msg.content}\n"
1072
  elif isinstance(msg, AIMessage):
1073
  chat_history += f"Assistant: {msg.content}\n"
1074
-
1075
- direct_llm = ChatGoogleGenerativeAI(
1076
- model="models/gemini-1.5-flash",
1077
- temperature=0.7,
1078
- model_kwargs={"system_instruction": direct_prompt.format(chat_history=chat_history)}
1079
- )
1080
-
 
1081
  if target_language != "English" and not any(
1082
  keyword in message.lower() for keyword in ['kazakh', 'қазақша', 'қазақ тілінде', 'russian', 'русский', 'орысша']
1083
  ):
1084
  modified_message = f"Explain in {target_language}: {message}"
1085
  else:
1086
  modified_message = message
1087
-
1088
- response = direct_llm.invoke(modified_message).content
1089
-
 
 
 
 
 
 
 
 
 
1090
  memory.chat_memory.add_user_message(message)
1091
  memory.chat_memory.add_ai_message(response)
1092
-
1093
  return response
1094
  except Exception as e:
1095
  return f"Error processing direct Gemini request: {str(e)}"
 
10
  import re
11
  import uuid
12
  import hashlib
13
+ import google.generativeai as genai
14
 
15
  from dotenv import load_dotenv
16
  import gradio as gr
17
 
18
  from langchain_community.document_loaders import DirectoryLoader, TextLoader
19
  from langchain.text_splitter import CharacterTextSplitter
 
20
  from langchain_chroma import Chroma
 
21
  from langchain_huggingface import HuggingFaceEmbeddings
22
  from langchain.memory import ConversationBufferMemory
23
  from langchain.chains import ConversationalRetrievalChain
 
422
  return words
423
 
424
  class PersonalizedKazakhAssistant:
425
+ def __init__(self, target_language: str = "English"):
426
  self.known_terms = set()
427
  self.setup_environment()
428
  self.setup_vectorstore()
429
+ self.setup_llm(target_language)
430
  self.tracker = PersonalizedLearningTracker()
431
  self.user_sessions = {}
432
  self.user_memories = {}
433
 
434
  def setup_environment(self):
435
  """Setup environment and configuration"""
 
436
  load_dotenv()
437
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
438
  self.MODEL = "gemini-1.5-flash"
439
  self.db_name = "vector_db"
440
 
 
484
  self.vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=self.db_name)
485
  print(f"Vectorstore created with {self.vectorstore._collection.count()} documents")
486
 
487
+ def setup_llm(self, target_language: str = "English"):
488
+ """Setup Gemini model with system prompt formatted with target language"""
489
+ self.system_prompt = f"""
490
+ You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations. Respond in {target_language}.
491
 
492
+ Key capabilities:
493
+ 1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms
494
+ 2. **Track Learning Progress**: Identify and track when users learn new words or idioms
495
+ 3. **Personalized Responses**: Adapt responses based on user's learning history
496
+ 4. **Progress Reporting**: Provide detailed progress reports when asked
497
+ 5. **Learning Recommendations**: Suggest words/idioms to review or learn next
498
 
499
+ Response Guidelines:
500
+ - For word/idiom queries: Provide definition, usage examples, and related information in {target_language}
501
+ - When explaining a Kazakh word or idiom retrieved from the knowledge base, **bold** the term (e.g., **күләпара**) in the response to highlight it
502
+ - Only bold the main term or idiom being explained, not other Kazakh words
503
+ - Always identify the main Kazakh word/idiom for progress tracking
504
+ - Be encouraging and supportive
505
+ - Use simple, clear explanations
506
+ - When discussing progress, be specific and motivating
507
+ - Avoid storing definitions as terms
508
+ - Normalize terms to lowercase
509
+ - Respond in conversational style
510
+ """
511
+ self.llm = genai.GenerativeModel(
512
+ model_name=self.MODEL,
513
+ system_instruction=self.system_prompt,
514
+ generation_config={
515
+ "temperature": 0.7,
516
+ "max_output_tokens": 500
517
+ }
518
  )
519
 
520
  def normalize_term(self, term: str) -> str:
 
522
  return ' '.join(term.lower().strip().split())
523
 
524
  def extract_kazakh_terms(self, message: str, response: str) -> List[Tuple[str, str, str]]:
525
+ """Extract bolded Kazakh terms from response and verify against known terms."""
526
  terms = []
527
  seen_terms = set()
528
 
529
  try:
530
  retrieved_docs = self.vectorstore.similarity_search(message, k=5)
531
 
532
+ # Pattern to match bolded terms (e.g., **күләпара**)
533
+ bold_pattern = r'\*\*([А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*)\*\*'
534
+ bold_matches = re.findall(bold_pattern, response)
535
 
536
+ for term in bold_matches:
537
+ normalized_term = self.normalize_term(term)
538
+ if normalized_term in seen_terms or len(normalized_term) <= 2 or len(normalized_term) > 100:
539
+ print(f"Skipped term {normalized_term}: Invalid length or already seen")
540
+ continue
541
+
542
+ # Check if term is in known_terms
543
+ if normalized_term in self.known_terms:
544
+ category = "word"
545
+ definition = ""
546
+ # Determine category and definition from retrieved docs
547
+ for doc in retrieved_docs:
548
+ if normalized_term in self.normalize_term(doc.page_content):
549
+ doc_type = doc.metadata.get('doc_type', '').lower()
550
+ if 'idiom' in doc_type or 'тіркес' in doc_type:
551
+ category = "idiom"
552
+ elif 'grammar' in doc_type:
553
+ category = "grammar"
554
+ else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
555
  category = "word"
556
+ definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
557
+ break
558
+ if not definition:
559
+ definition = self.extract_clean_definition(normalized_term, "", response)
560
+
561
+ if definition:
562
+ terms.append((term, category, definition))
563
+ seen_terms.add(normalized_term)
564
+ print(f"Added bolded term: {term}, category: {category}, definition: {definition}")
565
+
566
+ return terms
567
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
  except Exception as e:
569
  print(f"Error extracting terms: {e}")
570
+ return terms
 
571
 
572
  def extract_clean_definition(self, term: str, doc_content: str, response: str) -> str:
573
  """Extract a clean definition for a term from the knowledge base."""
 
610
  """Process user message with proper user session management and toggle for direct Gemini"""
611
 
612
  if session_token and not self.tracker.validate_session(user_id, session_token):
613
+ return f"Session expired. Please login again in {target_language}."
614
+
615
  if session_token:
616
  self.tracker.update_session_activity(user_id, session_token)
617
 
618
  if user_id not in self.user_sessions:
619
  self.user_sessions[user_id] = self.tracker.start_session(user_id)
620
+
621
+ # Set up LLM with the specified target language
622
+ self.setup_llm(target_language)
623
 
624
+ # Handle special commands
625
  if message.lower().startswith('/progress'):
626
  return self.get_progress_report(user_id)
627
  elif message.lower().startswith('/recommendations'):
 
635
  elif message.lower().startswith('/newword'):
636
  new_word = self.get_new_word(user_id)
637
  if not new_word:
638
+ return f"Қазір жаңа сөздер жоқ. Басқа сөздерді қайталаңыз! 🌟\n\nNo new words available right now. Review other words! 🌟"
639
  self.tracker.track_word_encounter(user_id, new_word['word'], new_word['definition'], new_word['category'])
640
  return f"📝 **Жаңа сөз / New Word**: {new_word['word']}\n\nМағынасы / Meaning: {new_word['definition']}"
641
  elif message.lower().startswith('/newidiom'):
642
  new_idiom = self.get_new_idiom(user_id)
643
  if not new_idiom:
644
+ return f"Қазір жаңа тіркестер жоқ. Басқа тіркестерді қайталаңыз! 🌟\n\nNo new idioms available right now. Review other idioms! 🌟"
645
  self.tracker.track_word_encounter(user_id, new_idiom['word'], new_idiom['definition'], new_idiom['category'])
646
  return f"🎭 **Жаңа тіркес / New Idiom**: {new_idiom['word']}\n\nМағынасы / Meaning: {new_idiom['definition']}"
647
  elif message.lower().startswith('/help'):
648
  return self.get_help_message()
649
+
650
  if use_direct_gemini:
651
  return self.process_direct_gemini(message, user_id, target_language)
652
 
653
+ # Retrieve relevant documents from vectorstore
654
+ retrieved_docs = self.vectorstore.similarity_search(message, k=5)
655
+ context = "\n".join([doc.page_content for doc in retrieved_docs])
656
 
657
+ # Get conversation history
658
+ memory = self.get_user_memory(user_id)
659
+ chat_history = ""
660
+ for msg in memory.chat_memory.messages[-10:]:
661
+ if isinstance(msg, HumanMessage):
662
+ chat_history += f"User: {msg.content}\n"
663
+ elif isinstance(msg, AIMessage):
664
+ chat_history += f"Assistant: {msg.content}\n"
665
+
666
+ # Retrieve user progress from SQLite database
667
+ progress = self.tracker.get_user_progress(user_id)
668
+ words_to_review = self.tracker.get_words_to_review(user_id, 5)
669
+ mastered_words = self.tracker.get_mastered_words(user_id, page=1, page_size=5)
670
+
671
+ progress_summary = """
672
+ User Learning Progress (in {target_language}):
673
+ - Total Terms Learned: {total_words}
674
+ - Category Statistics:
675
+ {category_stats}
676
+ - Recent Activity: {recent_activity} terms reviewed in the last 7 days
677
+ - Words to Review:
678
+ {words_to_review}
679
+ - Mastered Words:
680
+ {mastered_words}
681
+ """.format(
682
+ target_language=target_language,
683
+ total_words=progress['total_words'],
684
+ category_stats=''.join([f" - {category}: {stats['count']} terms, Average Mastery: {stats['average_mastery']}/5\n"
685
+ for category, stats in progress['category_stats'].items()]),
686
+ recent_activity=progress['recent_activity'],
687
+ words_to_review=''.join([f" - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
688
+ for word in words_to_review]),
689
+ mastered_words=''.join([f" - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
690
+ for word in mastered_words])
691
+ )
692
+
693
+ # Construct prompt with context, history, and progress
694
+ full_prompt = f"""
695
+ {self.system_prompt}
696
+
697
+ Previous conversation:
698
+ {chat_history}
699
 
700
+ Context from knowledge base:
701
+ {context}
702
+
703
+ {progress_summary}
704
+
705
+ User question: {message}
706
+
707
+ Respond in {target_language}. If explaining a Kazakh word or idiom retrieved from the context, **bold** the term (e.g., **күләпара**) in your response to highlight it. Only bold the main term being explained.
708
+ """
709
+
710
+ # Call Gemini API
711
+ response = self.llm.generate_content(full_prompt).text
712
+
713
+ # Add to conversation memory
714
+ memory.chat_memory.add_user_message(message)
715
+ memory.chat_memory.add_ai_message(response)
716
+
717
+ # Extract and track terms
718
+ extracted_terms = self.extract_kazakh_terms(message, response)
719
  for term, category, definition in extracted_terms:
720
+ if definition and term:
721
+ self.tracker.track_word_encounter(user_id, term, definition, category)
 
 
 
 
 
722
 
723
  return response
724
 
 
917
  """Process message using direct Gemini with conversation memory for context."""
918
  try:
919
  memory = self.get_user_memory(user_id)
 
 
 
 
 
 
 
920
  chat_history = ""
921
+ for msg in memory.chat_memory.messages[-10:]:
922
  if isinstance(msg, HumanMessage):
923
  chat_history += f"User: {msg.content}\n"
924
  elif isinstance(msg, AIMessage):
925
  chat_history += f"Assistant: {msg.content}\n"
926
+
927
+ direct_prompt = """
928
+ You are a friendly and supportive Kazakh language learning assistant. Your role is to help users learn Kazakh vocabulary, grammar, and idioms in a clear, concise, and engaging way. Respond in the user's primary language, inferred from their input, unless a specific language (English, Kazakh, or Russian) is requested. Provide practical examples and explanations tailored to language learners. Keep responses concise (under 200 words) and encouraging. Use your internal knowledge to ensure accuracy and relevance, focusing exclusively on Kazakh language learning.
929
+
930
+ Previous conversation context:
931
+ {chat_history}
932
+ """
933
+
934
  if target_language != "English" and not any(
935
  keyword in message.lower() for keyword in ['kazakh', 'қазақша', 'қазақ тілінде', 'russian', 'русский', 'орысша']
936
  ):
937
  modified_message = f"Explain in {target_language}: {message}"
938
  else:
939
  modified_message = message
940
+
941
+ direct_model = genai.GenerativeModel(
942
+ model_name=self.MODEL,
943
+ system_instruction=direct_prompt.format(chat_history=chat_history),
944
+ generation_config={
945
+ "temperature": 0.7,
946
+ "max_output_tokens": 200
947
+ }
948
+ )
949
+
950
+ response = direct_model.generate_content(modified_message).text
951
+
952
  memory.chat_memory.add_user_message(message)
953
  memory.chat_memory.add_ai_message(response)
954
+
955
  return response
956
  except Exception as e:
957
  return f"Error processing direct Gemini request: {str(e)}"