Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -490,30 +490,35 @@ class PersonalizedKazakhAssistant:
|
|
| 490 |
You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations. Respond in {target_language}.
|
| 491 |
|
| 492 |
Key capabilities:
|
| 493 |
-
1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms
|
| 494 |
-
2. **Track Learning Progress**: Identify and track when users learn new words or idioms
|
| 495 |
-
3. **Personalized Responses**: Adapt responses based on user's learning history
|
| 496 |
-
4. **Progress Reporting**: Provide detailed progress reports when asked
|
| 497 |
-
5. **Learning Recommendations**: Suggest words/idioms to review or learn next
|
| 498 |
|
| 499 |
Response Guidelines:
|
| 500 |
-
- For word/idiom queries: Provide definition, usage examples, and related information in {target_language}
|
| 501 |
-
- When explaining a Kazakh word or idiom retrieved from the knowledge base, **bold** the term (e.g., **күләпара**) in the response to highlight it
|
| 502 |
-
- Only bold the main term or idiom being explained, not other Kazakh words
|
| 503 |
-
- Always identify the main Kazakh word/idiom for progress tracking
|
| 504 |
-
-
|
| 505 |
-
- Use
|
| 506 |
-
- When
|
| 507 |
-
-
|
| 508 |
-
-
|
| 509 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
"""
|
| 511 |
self.llm = genai.GenerativeModel(
|
| 512 |
model_name=self.MODEL,
|
| 513 |
system_instruction=self.system_prompt,
|
| 514 |
generation_config={
|
| 515 |
"temperature": 0.7,
|
| 516 |
-
"max_output_tokens":
|
| 517 |
}
|
| 518 |
)
|
| 519 |
|
|
@@ -527,11 +532,11 @@ class PersonalizedKazakhAssistant:
|
|
| 527 |
|
| 528 |
try:
|
| 529 |
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 530 |
-
bold_pattern = r'\*\*([
|
| 531 |
bold_matches = re.findall(bold_pattern, response)
|
| 532 |
|
| 533 |
for term in bold_matches:
|
| 534 |
-
normalized_term = self.normalize_term(term)
|
| 535 |
if normalized_term in seen_terms or len(normalized_term) <= 2 or len(normalized_term) > 100:
|
| 536 |
print(f"Skipped term {normalized_term}: Invalid length or already seen")
|
| 537 |
continue
|
|
@@ -542,15 +547,12 @@ class PersonalizedKazakhAssistant:
|
|
| 542 |
term_matched = False
|
| 543 |
original_term = term # Preserve original case for tracking
|
| 544 |
|
| 545 |
-
# Check
|
| 546 |
-
if len(term.split()) > 1:
|
| 547 |
-
category = "idiom"
|
| 548 |
-
|
| 549 |
-
# Check for exact match in known terms
|
| 550 |
for known_term in self.known_terms:
|
| 551 |
if normalized_term == self.normalize_term(known_term):
|
| 552 |
term_matched = True
|
| 553 |
-
original_term = known_term
|
|
|
|
| 554 |
for doc in retrieved_docs:
|
| 555 |
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 556 |
if normalized_term in self.normalize_term(doc.page_content):
|
|
@@ -562,17 +564,23 @@ class PersonalizedKazakhAssistant:
|
|
| 562 |
category = "grammar"
|
| 563 |
definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
|
| 564 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
break
|
| 566 |
|
| 567 |
# If no exact match, try fuzzy matching for idioms with suffixes
|
| 568 |
if not term_matched:
|
| 569 |
for known_term in self.known_terms:
|
| 570 |
normalized_known = self.normalize_term(known_term)
|
|
|
|
|
|
|
| 571 |
if (normalized_term.startswith(normalized_known) and
|
| 572 |
len(normalized_term) <= len(normalized_known) + 4):
|
| 573 |
term_matched = True
|
| 574 |
-
normalized_term = normalized_known
|
| 575 |
-
original_term = known_term
|
| 576 |
for doc in retrieved_docs:
|
| 577 |
if normalized_known in self.normalize_term(doc.page_content):
|
| 578 |
doc_type = doc.metadata.get('doc_type', '').lower()
|
|
@@ -584,21 +592,20 @@ class PersonalizedKazakhAssistant:
|
|
| 584 |
category = "grammar"
|
| 585 |
definition = self.extract_clean_definition(normalized_known, doc.page_content, response)
|
| 586 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
break
|
| 588 |
|
| 589 |
-
#
|
| 590 |
-
if
|
| 591 |
-
category = "
|
|
|
|
|
|
|
| 592 |
definition = self.extract_clean_definition(normalized_term, "", response)
|
| 593 |
|
| 594 |
-
|
| 595 |
-
if term_matched and len(original_term.split()) == 1:
|
| 596 |
-
for doc in retrieved_docs:
|
| 597 |
-
if 'words' in doc.metadata.get('doc_type', '').lower():
|
| 598 |
-
category = "word"
|
| 599 |
-
break
|
| 600 |
-
|
| 601 |
-
if definition:
|
| 602 |
terms.append((original_term, category, definition))
|
| 603 |
seen_terms.add(normalized_term)
|
| 604 |
print(f"Added bolded term: {original_term}, category: {category}, definition: {definition}")
|
|
|
|
| 490 |
You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations. Respond in {target_language}.
|
| 491 |
|
| 492 |
Key capabilities:
|
| 493 |
+
1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms.
|
| 494 |
+
2. **Track Learning Progress**: Identify and track when users learn new words or idioms.
|
| 495 |
+
3. **Personalized Responses**: Adapt responses based on user's learning history.
|
| 496 |
+
4. **Progress Reporting**: Provide detailed progress reports when asked.
|
| 497 |
+
5. **Learning Recommendations**: Suggest words/idioms to review or learn next.
|
| 498 |
|
| 499 |
Response Guidelines:
|
| 500 |
+
- For word/idiom queries: Provide definition, usage examples, and related information in {target_language}.
|
| 501 |
+
- When explaining a Kazakh word or idiom retrieved from the knowledge base, **bold** the term (e.g., **күләпара**) in the response to highlight it.
|
| 502 |
+
- Only bold the main term or idiom being explained, not other Kazakh words.
|
| 503 |
+
- Always identify the main Kazakh word/idiom for progress tracking.
|
| 504 |
+
- **RAG Usage**:
|
| 505 |
+
- Use Retrieval-Augmented Generation (RAG) only when the query explicitly asks for explanations of specific Kazakh terms or idioms (e.g., "What does сәлем mean?") or when the context strongly suggests a need for knowledge base information (e.g., queries about specific words or idioms).
|
| 506 |
+
- When using RAG to explain terms (e.g., nouns, idioms), limit examples to 3-4 relevant ones. Do not list all or many examples or all matches from the knowledge base if not explicitly asked (only 3,4).
|
| 507 |
+
- For general queries (e.g., greetings, procedural questions, or commands like /progress) or grammar-related queries (e.g., "explain me nouns"), rely on your general knowledge and do not use RAG unless the knowledge base contains relevant information.
|
| 508 |
+
- Since the knowledge base contains only words and idioms, grammar explanations (e.g., about nouns, verbs) should be provided using your own knowledge, without relying on RAG, unless the query specifically involves terms in the knowledge base.
|
| 509 |
+
- Be encouraging and supportive.
|
| 510 |
+
- Use simple, clear explanations.
|
| 511 |
+
- When discussing progress, be specific and motivating.
|
| 512 |
+
- Avoid storing definitions as terms.
|
| 513 |
+
- Normalize terms to lowercase for consistency.
|
| 514 |
+
- Respond in a conversational style.
|
| 515 |
"""
|
| 516 |
self.llm = genai.GenerativeModel(
|
| 517 |
model_name=self.MODEL,
|
| 518 |
system_instruction=self.system_prompt,
|
| 519 |
generation_config={
|
| 520 |
"temperature": 0.7,
|
| 521 |
+
"max_output_tokens": 700
|
| 522 |
}
|
| 523 |
)
|
| 524 |
|
|
|
|
| 532 |
|
| 533 |
try:
|
| 534 |
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 535 |
+
bold_pattern = r'\*\*([А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*)\*\*'
|
| 536 |
bold_matches = re.findall(bold_pattern, response)
|
| 537 |
|
| 538 |
for term in bold_matches:
|
| 539 |
+
normalized_term = self.normalize_term(term) # Normalize to lowercase
|
| 540 |
if normalized_term in seen_terms or len(normalized_term) <= 2 or len(normalized_term) > 100:
|
| 541 |
print(f"Skipped term {normalized_term}: Invalid length or already seen")
|
| 542 |
continue
|
|
|
|
| 547 |
term_matched = False
|
| 548 |
original_term = term # Preserve original case for tracking
|
| 549 |
|
| 550 |
+
# Check for exact match in known terms (case-insensitive)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
for known_term in self.known_terms:
|
| 552 |
if normalized_term == self.normalize_term(known_term):
|
| 553 |
term_matched = True
|
| 554 |
+
original_term = known_term # Use the known term's original case
|
| 555 |
+
# Determine category based on known term's source
|
| 556 |
for doc in retrieved_docs:
|
| 557 |
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 558 |
if normalized_term in self.normalize_term(doc.page_content):
|
|
|
|
| 564 |
category = "grammar"
|
| 565 |
definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
|
| 566 |
break
|
| 567 |
+
# If no definition was extracted from the retrieved docs, treat a multi-word known term as an idiom
|
| 568 |
+
if not definition and len(known_term.split()) > 1:
|
| 569 |
+
category = "idiom"
|
| 570 |
+
definition = self.extract_clean_definition(normalized_term, "", response)
|
| 571 |
break
|
| 572 |
|
| 573 |
# If no exact match, try fuzzy matching for idioms with suffixes
|
| 574 |
if not term_matched:
|
| 575 |
for known_term in self.known_terms:
|
| 576 |
normalized_known = self.normalize_term(known_term)
|
| 577 |
+
# Check if the bolded term is a close match to a known term
|
| 578 |
+
# Allow up to 4 extra characters (e.g., grammatical endings)
|
| 579 |
if (normalized_term.startswith(normalized_known) and
|
| 580 |
len(normalized_term) <= len(normalized_known) + 4):
|
| 581 |
term_matched = True
|
| 582 |
+
normalized_term = normalized_known # Use the base known term
|
| 583 |
+
original_term = known_term # Use the original known term for tracking
|
| 584 |
for doc in retrieved_docs:
|
| 585 |
if normalized_known in self.normalize_term(doc.page_content):
|
| 586 |
doc_type = doc.metadata.get('doc_type', '').lower()
|
|
|
|
| 592 |
category = "grammar"
|
| 593 |
definition = self.extract_clean_definition(normalized_known, doc.page_content, response)
|
| 594 |
break
|
| 595 |
+
# If no document match, assume idiom for multi-word terms
|
| 596 |
+
if not definition and len(known_term.split()) > 1:
|
| 597 |
+
category = "idiom"
|
| 598 |
+
definition = self.extract_clean_definition(normalized_known, "", response)
|
| 599 |
break
|
| 600 |
|
| 601 |
+
# Additional check: a matched single-word term is categorized as "word" whenever any retrieved doc has a 'words' doc_type
|
| 602 |
+
if term_matched and len(original_term.split()) == 1 and any('words' in doc.metadata.get('doc_type', '').lower() for doc in retrieved_docs):
|
| 603 |
+
category = "word"
|
| 604 |
+
|
| 605 |
+
if not definition and term_matched:
|
| 606 |
definition = self.extract_clean_definition(normalized_term, "", response)
|
| 607 |
|
| 608 |
+
if term_matched and definition:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 609 |
terms.append((original_term, category, definition))
|
| 610 |
seen_terms.add(normalized_term)
|
| 611 |
print(f"Added bolded term: {original_term}, category: {category}, definition: {definition}")
|