GuestUser33 committed on
Commit
90c3bf9
·
verified ·
1 Parent(s): 51a5da5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -37
app.py CHANGED
@@ -490,30 +490,35 @@ class PersonalizedKazakhAssistant:
490
  You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations. Respond in {target_language}.
491
 
492
  Key capabilities:
493
- 1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms
494
- 2. **Track Learning Progress**: Identify and track when users learn new words or idioms
495
- 3. **Personalized Responses**: Adapt responses based on user's learning history
496
- 4. **Progress Reporting**: Provide detailed progress reports when asked
497
- 5. **Learning Recommendations**: Suggest words/idioms to review or learn next
498
 
499
  Response Guidelines:
500
- - For word/idiom queries: Provide definition, usage examples, and related information in {target_language}
501
- - When explaining a Kazakh word or idiom retrieved from the knowledge base, **bold** the term (e.g., **күләпара**) in the response to highlight it
502
- - Only bold the main term or idiom being explained, not other Kazakh words
503
- - Always identify the main Kazakh word/idiom for progress tracking
504
- - Be encouraging and supportive
505
- - Use simple, clear explanations
506
- - When discussing progress, be specific and motivating
507
- - Avoid storing definitions as terms
508
- - Normalize terms to lowercase
509
- - Respond in conversational style
 
 
 
 
 
510
  """
511
  self.llm = genai.GenerativeModel(
512
  model_name=self.MODEL,
513
  system_instruction=self.system_prompt,
514
  generation_config={
515
  "temperature": 0.7,
516
- "max_output_tokens": 500
517
  }
518
  )
519
 
@@ -527,11 +532,11 @@ class PersonalizedKazakhAssistant:
527
 
528
  try:
529
  retrieved_docs = self.vectorstore.similarity_search(message, k=5)
530
- bold_pattern = r'\*\*([^\*]+)\*\*' # Match any bolded text
531
  bold_matches = re.findall(bold_pattern, response)
532
 
533
  for term in bold_matches:
534
- normalized_term = self.normalize_term(term)
535
  if normalized_term in seen_terms or len(normalized_term) <= 2 or len(normalized_term) > 100:
536
  print(f"Skipped term {normalized_term}: Invalid length or already seen")
537
  continue
@@ -542,15 +547,12 @@ class PersonalizedKazakhAssistant:
542
  term_matched = False
543
  original_term = term # Preserve original case for tracking
544
 
545
- # Check if term is multi-word (likely an idiom)
546
- if len(term.split()) > 1:
547
- category = "idiom"
548
-
549
- # Check for exact match in known terms
550
  for known_term in self.known_terms:
551
  if normalized_term == self.normalize_term(known_term):
552
  term_matched = True
553
- original_term = known_term
 
554
  for doc in retrieved_docs:
555
  doc_type = doc.metadata.get('doc_type', '').lower()
556
  if normalized_term in self.normalize_term(doc.page_content):
@@ -562,17 +564,23 @@ class PersonalizedKazakhAssistant:
562
  category = "grammar"
563
  definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
564
  break
 
 
 
 
565
  break
566
 
567
  # If no exact match, try fuzzy matching for idioms with suffixes
568
  if not term_matched:
569
  for known_term in self.known_terms:
570
  normalized_known = self.normalize_term(known_term)
 
 
571
  if (normalized_term.startswith(normalized_known) and
572
  len(normalized_term) <= len(normalized_known) + 4):
573
  term_matched = True
574
- normalized_term = normalized_known
575
- original_term = known_term
576
  for doc in retrieved_docs:
577
  if normalized_known in self.normalize_term(doc.page_content):
578
  doc_type = doc.metadata.get('doc_type', '').lower()
@@ -584,21 +592,20 @@ class PersonalizedKazakhAssistant:
584
  category = "grammar"
585
  definition = self.extract_clean_definition(normalized_known, doc.page_content, response)
586
  break
 
 
 
 
587
  break
588
 
589
- # If term is multi-word and not matched, assume it's an idiom
590
- if not term_matched and len(term.split()) > 1:
591
- category = "idiom"
 
 
592
  definition = self.extract_clean_definition(normalized_term, "", response)
593
 
594
- # Single-word terms from words folder should be categorized as words
595
- if term_matched and len(original_term.split()) == 1:
596
- for doc in retrieved_docs:
597
- if 'words' in doc.metadata.get('doc_type', '').lower():
598
- category = "word"
599
- break
600
-
601
- if definition:
602
  terms.append((original_term, category, definition))
603
  seen_terms.add(normalized_term)
604
  print(f"Added bolded term: {original_term}, category: {category}, definition: {definition}")
 
490
  You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations. Respond in {target_language}.
491
 
492
  Key capabilities:
493
+ 1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms.
494
+ 2. **Track Learning Progress**: Identify and track when users learn new words or idioms.
495
+ 3. **Personalized Responses**: Adapt responses based on user's learning history.
496
+ 4. **Progress Reporting**: Provide detailed progress reports when asked.
497
+ 5. **Learning Recommendations**: Suggest words/idioms to review or learn next.
498
 
499
  Response Guidelines:
500
+ - For word/idiom queries: Provide definition, usage examples, and related information in {target_language}.
501
+ - When explaining a Kazakh word or idiom retrieved from the knowledge base, **bold** the term (e.g., **күләпара**) in the response to highlight it.
502
+ - Only bold the main term or idiom being explained, not other Kazakh words.
503
+ - Always identify the main Kazakh word/idiom for progress tracking.
504
+ - **RAG Usage**:
505
+ - Use Retrieval-Augmented Generation (RAG) only when the query explicitly asks for explanations of specific Kazakh terms or idioms (e.g., "What does сәлем mean?") or when the context strongly suggests a need for knowledge base information (e.g., queries about specific words or idioms).
506
+ - When using RAG to explain terms (e.g., nouns, idioms), limit examples to 3-4 relevant ones. Do not list all or many examples or all matches from the knowledge base if not explicitly asked (only 3,4).
507
+ - For general queries (e.g., greetings, procedural questions, or commands like /progress) or grammar-related queries (e.g., "explain me nouns"), rely on your general knowledge and do not use RAG unless the knowledge base contains relevant information.
508
+ - Since the knowledge base contains only words and idioms, grammar explanations (e.g., about nouns, verbs) should be provided using your own knowledge, without relying on RAG, unless the query specifically involves terms in the knowledge base.
509
+ - Be encouraging and supportive.
510
+ - Use simple, clear explanations.
511
+ - When discussing progress, be specific and motivating.
512
+ - Avoid storing definitions as terms.
513
+ - Normalize terms to lowercase for consistency.
514
+ - Respond in a conversational style.
515
  """
516
  self.llm = genai.GenerativeModel(
517
  model_name=self.MODEL,
518
  system_instruction=self.system_prompt,
519
  generation_config={
520
  "temperature": 0.7,
521
+ "max_output_tokens": 700
522
  }
523
  )
524
 
 
532
 
533
  try:
534
  retrieved_docs = self.vectorstore.similarity_search(message, k=5)
535
+ bold_pattern = r'\*\*([А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*)\*\*'
536
  bold_matches = re.findall(bold_pattern, response)
537
 
538
  for term in bold_matches:
539
+ normalized_term = self.normalize_term(term) # Normalize to lowercase
540
  if normalized_term in seen_terms or len(normalized_term) <= 2 or len(normalized_term) > 100:
541
  print(f"Skipped term {normalized_term}: Invalid length or already seen")
542
  continue
 
547
  term_matched = False
548
  original_term = term # Preserve original case for tracking
549
 
550
+ # Check for exact match in known terms (case-insensitive)
 
 
 
 
551
  for known_term in self.known_terms:
552
  if normalized_term == self.normalize_term(known_term):
553
  term_matched = True
554
+ original_term = known_term # Use the known term's original case
555
+ # Determine category based on known term's source
556
  for doc in retrieved_docs:
557
  doc_type = doc.metadata.get('doc_type', '').lower()
558
  if normalized_term in self.normalize_term(doc.page_content):
 
564
  category = "grammar"
565
  definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
566
  break
567
+ # If no document match, check term length for idiom likelihood
568
+ if not definition and len(known_term.split()) > 1:
569
+ category = "idiom"
570
+ definition = self.extract_clean_definition(normalized_term, "", response)
571
  break
572
 
573
  # If no exact match, try fuzzy matching for idioms with suffixes
574
  if not term_matched:
575
  for known_term in self.known_terms:
576
  normalized_known = self.normalize_term(known_term)
577
+ # Check if the bolded term is a close match to a known term
578
+ # Allow up to 4 extra characters (e.g., grammatical endings)
579
  if (normalized_term.startswith(normalized_known) and
580
  len(normalized_term) <= len(normalized_known) + 4):
581
  term_matched = True
582
+ normalized_term = normalized_known # Use the base known term
583
+ original_term = known_term # Use the original known term for tracking
584
  for doc in retrieved_docs:
585
  if normalized_known in self.normalize_term(doc.page_content):
586
  doc_type = doc.metadata.get('doc_type', '').lower()
 
592
  category = "grammar"
593
  definition = self.extract_clean_definition(normalized_known, doc.page_content, response)
594
  break
595
+ # If no document match, assume idiom for multi-word terms
596
+ if not definition and len(known_term.split()) > 1:
597
+ category = "idiom"
598
+ definition = self.extract_clean_definition(normalized_known, "", response)
599
  break
600
 
601
+ # Additional check: single-word terms from words folder should not be idioms
602
+ if term_matched and len(original_term.split()) == 1 and any('words' in doc.metadata.get('doc_type', '').lower() for doc in retrieved_docs):
603
+ category = "word"
604
+
605
+ if not definition and term_matched:
606
  definition = self.extract_clean_definition(normalized_term, "", response)
607
 
608
+ if term_matched and definition:
 
 
 
 
 
 
 
609
  terms.append((original_term, category, definition))
610
  seen_terms.add(normalized_term)
611
  print(f"Added bolded term: {original_term}, category: {category}, definition: {definition}")