Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,15 +10,14 @@ from collections import defaultdict
|
|
| 10 |
import re
|
| 11 |
import uuid
|
| 12 |
import hashlib
|
|
|
|
| 13 |
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
import gradio as gr
|
| 16 |
|
| 17 |
from langchain_community.document_loaders import DirectoryLoader, TextLoader
|
| 18 |
from langchain.text_splitter import CharacterTextSplitter
|
| 19 |
-
from langchain.schema import Document
|
| 20 |
from langchain_chroma import Chroma
|
| 21 |
-
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
|
| 22 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 23 |
from langchain.memory import ConversationBufferMemory
|
| 24 |
from langchain.chains import ConversationalRetrievalChain
|
|
@@ -423,20 +422,19 @@ class PersonalizedLearningTracker:
|
|
| 423 |
return words
|
| 424 |
|
| 425 |
class PersonalizedKazakhAssistant:
|
| 426 |
-
def __init__(self):
|
| 427 |
self.known_terms = set()
|
| 428 |
self.setup_environment()
|
| 429 |
self.setup_vectorstore()
|
| 430 |
-
self.setup_llm()
|
| 431 |
self.tracker = PersonalizedLearningTracker()
|
| 432 |
self.user_sessions = {}
|
| 433 |
self.user_memories = {}
|
| 434 |
|
| 435 |
def setup_environment(self):
|
| 436 |
"""Setup environment and configuration"""
|
| 437 |
-
# self.google_api_key = os.getenv("GOOGLE_API_KEY")
|
| 438 |
load_dotenv()
|
| 439 |
-
|
| 440 |
self.MODEL = "gemini-1.5-flash"
|
| 441 |
self.db_name = "vector_db"
|
| 442 |
|
|
@@ -486,34 +484,37 @@ class PersonalizedKazakhAssistant:
|
|
| 486 |
self.vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=self.db_name)
|
| 487 |
print(f"Vectorstore created with {self.vectorstore._collection.count()} documents")
|
| 488 |
|
| 489 |
-
def setup_llm(self):
|
| 490 |
-
"""Setup
|
| 491 |
-
system_prompt = """
|
| 492 |
-
|
| 493 |
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
self.llm =
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
|
|
|
|
|
|
|
|
|
| 517 |
)
|
| 518 |
|
| 519 |
def normalize_term(self, term: str) -> str:
|
|
@@ -521,253 +522,52 @@ class PersonalizedKazakhAssistant:
|
|
| 521 |
return ' '.join(term.lower().strip().split())
|
| 522 |
|
| 523 |
def extract_kazakh_terms(self, message: str, response: str) -> List[Tuple[str, str, str]]:
|
| 524 |
-
"""Extract
|
| 525 |
terms = []
|
| 526 |
seen_terms = set()
|
| 527 |
|
| 528 |
try:
|
| 529 |
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 530 |
|
| 531 |
-
|
| 532 |
-
|
|
|
|
| 533 |
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
if
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
if
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
category = "grammar"
|
| 554 |
-
else:
|
| 555 |
-
category = "word"
|
| 556 |
-
definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
|
| 557 |
-
break
|
| 558 |
-
if not definition:
|
| 559 |
-
definition = self.extract_clean_definition(normalized_term, "", response)
|
| 560 |
-
if definition:
|
| 561 |
-
terms.append((term, category, definition))
|
| 562 |
-
seen_terms.add(normalized_term)
|
| 563 |
-
print(f"Added single response term: {term}, category: {category}, definition: {definition}")
|
| 564 |
-
return terms
|
| 565 |
-
|
| 566 |
-
# Look for quoted term in response (e.g., "басыр" in "Берілген мәтін бойынша, 'басыр' - көз ауруы")
|
| 567 |
-
quoted_pattern = r'[\'\"]([А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*)[\'\"]'
|
| 568 |
-
quoted_matches = re.findall(quoted_pattern, response)
|
| 569 |
-
if quoted_matches:
|
| 570 |
-
term = quoted_matches[0]
|
| 571 |
-
normalized_term = self.normalize_term(term)
|
| 572 |
-
if normalized_term in self.known_terms and normalized_term not in seen_terms and len(normalized_term) > 2 and len(normalized_term) <= 100:
|
| 573 |
-
category = "word"
|
| 574 |
-
definition = ""
|
| 575 |
-
for doc in retrieved_docs:
|
| 576 |
-
if normalized_term in self.normalize_term(doc.page_content):
|
| 577 |
-
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 578 |
-
if 'idiom' in doc_type or 'тіркес' in doc_type:
|
| 579 |
-
category = "idiom"
|
| 580 |
-
elif 'grammar' in doc_type:
|
| 581 |
-
category = "grammar"
|
| 582 |
-
else:
|
| 583 |
-
category = "word"
|
| 584 |
-
definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
|
| 585 |
-
break
|
| 586 |
-
if not definition:
|
| 587 |
-
definition = self.extract_clean_definition(normalized_term, "", response)
|
| 588 |
-
if definition:
|
| 589 |
-
terms.append((term, category, definition))
|
| 590 |
-
seen_terms.add(normalized_term)
|
| 591 |
-
print(f"Added quoted term: {term}, category: {category}, definition: {definition}")
|
| 592 |
-
return terms
|
| 593 |
-
|
| 594 |
-
# Look for term before hyphen (e.g., "басыр — көз ауруы")
|
| 595 |
-
hyphen_pattern = r'^([А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*)\s*[-–—]\s*(.+)$'
|
| 596 |
-
hyphen_matches = re.match(hyphen_pattern, response.strip(), re.MULTILINE)
|
| 597 |
-
if hyphen_matches:
|
| 598 |
-
term = hyphen_matches.group(1).strip()
|
| 599 |
-
definition_part = hyphen_matches.group(2).strip()
|
| 600 |
-
normalized_term = self.normalize_term(term)
|
| 601 |
-
if normalized_term in self.known_terms and normalized_term not in seen_terms and len(normalized_term) > 2 and len(normalized_term) <= 100:
|
| 602 |
-
category = "word"
|
| 603 |
-
definition = definition_part
|
| 604 |
-
for doc in retrieved_docs:
|
| 605 |
-
if normalized_term in self.normalize_term(doc.page_content):
|
| 606 |
-
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 607 |
-
if 'idiom' in doc_type or 'тіркес' in doc_type:
|
| 608 |
-
category = "idiom"
|
| 609 |
-
elif 'grammar' in doc_type:
|
| 610 |
-
category = "grammar"
|
| 611 |
-
else:
|
| 612 |
-
category = "word"
|
| 613 |
-
definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
|
| 614 |
-
break
|
| 615 |
-
if not definition:
|
| 616 |
-
definition = definition_part
|
| 617 |
-
if definition:
|
| 618 |
-
terms.append((term, category, definition))
|
| 619 |
-
seen_terms.add(normalized_term)
|
| 620 |
-
print(f"Added hyphen term: {term}, category: {category}, definition: {definition}")
|
| 621 |
-
return terms
|
| 622 |
-
|
| 623 |
-
# Check query term, but only if it’s the primary term in the response
|
| 624 |
-
query_words = message_normalized.split()
|
| 625 |
-
for word in query_words:
|
| 626 |
-
normalized_word = self.normalize_term(word)
|
| 627 |
-
if normalized_word in self.known_terms and normalized_word not in seen_terms:
|
| 628 |
-
# Ensure the query term is the primary term in the response
|
| 629 |
-
sentences = response.split('.')
|
| 630 |
-
for sentence in sentences:
|
| 631 |
-
sentence = sentence.strip()
|
| 632 |
-
if not sentence:
|
| 633 |
-
continue
|
| 634 |
-
if normalized_word in self.normalize_term(sentence):
|
| 635 |
category = "word"
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
if not definition:
|
| 649 |
-
definition = self.extract_clean_definition(normalized_word, "", response)
|
| 650 |
-
if definition:
|
| 651 |
-
terms.append((word, category, definition))
|
| 652 |
-
seen_terms.add(normalized_word)
|
| 653 |
-
print(f"Added query term: {word}, category: {category}, definition: {definition}")
|
| 654 |
-
return terms
|
| 655 |
-
|
| 656 |
-
# Fallback to primary term in response (e.g., "абыз" in "Ел атасы данагөйді абыз деп атайды")
|
| 657 |
-
sentences = response.split('.')
|
| 658 |
-
for sentence in sentences:
|
| 659 |
-
sentence = sentence.strip()
|
| 660 |
-
if not sentence:
|
| 661 |
-
continue
|
| 662 |
-
kazakh_phrases = re.findall(
|
| 663 |
-
r'[А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s-][А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+){0,2}',
|
| 664 |
-
sentence
|
| 665 |
-
)
|
| 666 |
-
for phrase in kazakh_phrases:
|
| 667 |
-
normalized_phrase = self.normalize_term(phrase)
|
| 668 |
-
if normalized_phrase in seen_terms or len(normalized_phrase) <= 2 or len(normalized_phrase) > 100:
|
| 669 |
-
print(f"Skipped phrase {normalized_phrase}: Invalid length or already seen")
|
| 670 |
-
continue
|
| 671 |
-
if normalized_phrase in self.known_terms and any(
|
| 672 |
-
normalized_phrase in self.normalize_term(doc.page_content) for doc in retrieved_docs
|
| 673 |
-
):
|
| 674 |
-
category = "word"
|
| 675 |
-
definition = ""
|
| 676 |
-
for doc in retrieved_docs:
|
| 677 |
-
if normalized_phrase in self.normalize_term(doc.page_content):
|
| 678 |
-
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 679 |
-
if 'idiom' in doc_type or 'тіркес' in doc_type:
|
| 680 |
-
category = "idiom"
|
| 681 |
-
elif 'grammar' in doc_type:
|
| 682 |
-
category = "grammar"
|
| 683 |
-
else:
|
| 684 |
-
category = "word"
|
| 685 |
-
definition = self.extract_clean_definition(normalized_phrase, doc.page_content, response)
|
| 686 |
-
break
|
| 687 |
-
if not definition:
|
| 688 |
-
definition = self.extract_clean_definition(normalized_phrase, "", response)
|
| 689 |
-
if definition:
|
| 690 |
-
terms.append((phrase, category, definition))
|
| 691 |
-
seen_terms.add(normalized_phrase)
|
| 692 |
-
print(f"Added phrase: {phrase}, category: {category}, definition: {definition}")
|
| 693 |
-
return terms
|
| 694 |
-
|
| 695 |
-
# Step 2: For multi-term queries, prioritize full idioms from response
|
| 696 |
-
if is_multi_term_query:
|
| 697 |
-
kazakh_phrases = re.findall(
|
| 698 |
-
r'[А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+(?:[\s,-]+[А-Яа-яӘәҒғҚқҢңӨөҰұҮүҺһІі]+)*',
|
| 699 |
-
response
|
| 700 |
-
)
|
| 701 |
-
for phrase in kazakh_phrases:
|
| 702 |
-
normalized_phrase = self.normalize_term(phrase)
|
| 703 |
-
if normalized_phrase in seen_terms or len(normalized_phrase) <= 2 or len(normalized_phrase) > 100:
|
| 704 |
-
print(f"Skipped phrase {normalized_phrase}: Invalid length or already seen")
|
| 705 |
-
continue
|
| 706 |
-
if normalized_phrase in self.known_terms or any(
|
| 707 |
-
normalized_phrase in self.normalize_term(doc.page_content) for doc in retrieved_docs
|
| 708 |
-
):
|
| 709 |
-
category = "word"
|
| 710 |
-
definition = ""
|
| 711 |
-
for doc in retrieved_docs:
|
| 712 |
-
if normalized_phrase in self.normalize_term(doc.page_content):
|
| 713 |
-
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 714 |
-
if 'idiom' in doc_type or 'тіркес' in doc_type:
|
| 715 |
-
category = "idiom"
|
| 716 |
-
elif 'grammar' in doc_type:
|
| 717 |
-
category = "grammar"
|
| 718 |
-
else:
|
| 719 |
-
category = "word"
|
| 720 |
-
definition = self.extract_clean_definition(normalized_phrase, doc.page_content, response)
|
| 721 |
-
break
|
| 722 |
-
if not definition:
|
| 723 |
-
definition = self.extract_clean_definition(normalized_phrase, "", response)
|
| 724 |
-
if definition and len(normalized_phrase.split()) <= 6:
|
| 725 |
-
terms.append((phrase, category, definition))
|
| 726 |
-
seen_terms.add(normalized_phrase)
|
| 727 |
-
print(f"Added phrase: {phrase}, category: {category}, definition: {definition}")
|
| 728 |
-
return terms
|
| 729 |
-
|
| 730 |
-
for known_term in self.known_terms:
|
| 731 |
-
normalized_known_term = self.normalize_term(known_term)
|
| 732 |
-
if normalized_known_term in response_normalized and normalized_known_term not in seen_terms:
|
| 733 |
-
|
| 734 |
-
is_part_of_idiom = any(
|
| 735 |
-
normalized_known_term in self.normalize_term(idiom) and len(idiom.split()) > 1
|
| 736 |
-
for idiom in self.known_terms
|
| 737 |
-
if idiom != normalized_known_term
|
| 738 |
-
)
|
| 739 |
-
if is_part_of_idiom:
|
| 740 |
-
print(f"Skipped term {known_term}: Part of a larger idiom")
|
| 741 |
-
continue
|
| 742 |
-
if normalized_known_term in self.known_terms and any(
|
| 743 |
-
normalized_known_term in self.normalize_term(doc.page_content) for doc in retrieved_docs
|
| 744 |
-
):
|
| 745 |
-
category = "word"
|
| 746 |
-
definition = ""
|
| 747 |
-
for doc in retrieved_docs:
|
| 748 |
-
if normalized_known_term in self.normalize_term(doc.page_content):
|
| 749 |
-
doc_type = doc.metadata.get('doc_type', '').lower()
|
| 750 |
-
if 'idiom' in doc_type or 'тіркес' in doc_type:
|
| 751 |
-
category = "idiom"
|
| 752 |
-
elif 'grammar' in doc_type:
|
| 753 |
-
category = "grammar"
|
| 754 |
-
else:
|
| 755 |
-
category = "word"
|
| 756 |
-
definition = self.extract_clean_definition(normalized_known_term, doc.page_content, response)
|
| 757 |
-
break
|
| 758 |
-
if not definition:
|
| 759 |
-
definition = self.extract_clean_definition(normalized_known_term, "", response)
|
| 760 |
-
if definition and len(normalized_known_term.split()) <= 10:
|
| 761 |
-
terms.append((known_term, category, definition))
|
| 762 |
-
seen_terms.add(normalized_known_term)
|
| 763 |
-
print(f"Added known term: {known_term}, category: {category}, definition: {definition}")
|
| 764 |
-
if not is_multi_term_query:
|
| 765 |
-
return terms
|
| 766 |
-
|
| 767 |
except Exception as e:
|
| 768 |
print(f"Error extracting terms: {e}")
|
| 769 |
-
|
| 770 |
-
return terms
|
| 771 |
|
| 772 |
def extract_clean_definition(self, term: str, doc_content: str, response: str) -> str:
|
| 773 |
"""Extract a clean definition for a term from the knowledge base."""
|
|
@@ -810,14 +610,18 @@ class PersonalizedKazakhAssistant:
|
|
| 810 |
"""Process user message with proper user session management and toggle for direct Gemini"""
|
| 811 |
|
| 812 |
if session_token and not self.tracker.validate_session(user_id, session_token):
|
| 813 |
-
return "Session expired. Please login again."
|
| 814 |
-
|
| 815 |
if session_token:
|
| 816 |
self.tracker.update_session_activity(user_id, session_token)
|
| 817 |
|
| 818 |
if user_id not in self.user_sessions:
|
| 819 |
self.user_sessions[user_id] = self.tracker.start_session(user_id)
|
|
|
|
|
|
|
|
|
|
| 820 |
|
|
|
|
| 821 |
if message.lower().startswith('/progress'):
|
| 822 |
return self.get_progress_report(user_id)
|
| 823 |
elif message.lower().startswith('/recommendations'):
|
|
@@ -831,35 +635,90 @@ class PersonalizedKazakhAssistant:
|
|
| 831 |
elif message.lower().startswith('/newword'):
|
| 832 |
new_word = self.get_new_word(user_id)
|
| 833 |
if not new_word:
|
| 834 |
-
return "Қазір жаңа сөздер жоқ. Басқа сөздерді қайталаңыз! 🌟\n\nNo new words available right now. Review other words! 🌟"
|
| 835 |
self.tracker.track_word_encounter(user_id, new_word['word'], new_word['definition'], new_word['category'])
|
| 836 |
return f"📝 **Жаңа сөз / New Word**: {new_word['word']}\n\nМағынасы / Meaning: {new_word['definition']}"
|
| 837 |
elif message.lower().startswith('/newidiom'):
|
| 838 |
new_idiom = self.get_new_idiom(user_id)
|
| 839 |
if not new_idiom:
|
| 840 |
-
return "Қазір жаңа тіркестер жоқ. Басқа тіркестерді қайталаңыз! 🌟\n\nNo new idioms available right now. Review other idioms! 🌟"
|
| 841 |
self.tracker.track_word_encounter(user_id, new_idiom['word'], new_idiom['definition'], new_idiom['category'])
|
| 842 |
return f"🎭 **Жаңа тіркес / New Idiom**: {new_idiom['word']}\n\nМағынасы / Meaning: {new_idiom['definition']}"
|
| 843 |
elif message.lower().startswith('/help'):
|
| 844 |
return self.get_help_message()
|
| 845 |
-
|
| 846 |
if use_direct_gemini:
|
| 847 |
return self.process_direct_gemini(message, user_id, target_language)
|
| 848 |
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
|
| 853 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 854 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 855 |
for term, category, definition in extracted_terms:
|
| 856 |
-
if definition and term:
|
| 857 |
-
self.tracker.track_word_encounter(
|
| 858 |
-
user_id,
|
| 859 |
-
term,
|
| 860 |
-
definition,
|
| 861 |
-
category
|
| 862 |
-
)
|
| 863 |
|
| 864 |
return response
|
| 865 |
|
|
@@ -1058,38 +917,41 @@ Start learning by asking about any Kazakh term! 🌟
|
|
| 1058 |
"""Process message using direct Gemini with conversation memory for context."""
|
| 1059 |
try:
|
| 1060 |
memory = self.get_user_memory(user_id)
|
| 1061 |
-
|
| 1062 |
-
direct_prompt = """
|
| 1063 |
-
You are a friendly and supportive Kazakh language learning assistant. Your role is to help users learn Kazakh vocabulary, grammar, and idioms in a clear, concise, and engaging way. Respond in the user's primary language, inferred from their input, unless a specific language (English, Kazakh, or Russian) is requested. Provide practical examples and explanations tailored to language learners. Keep responses concise (under 200 words) and encouraging. Use your internal knowledge to ensure accuracy and relevance, focusing exclusively on Kazakh language learning.
|
| 1064 |
-
|
| 1065 |
-
Previous conversation context:
|
| 1066 |
-
{chat_history}
|
| 1067 |
-
"""
|
| 1068 |
chat_history = ""
|
| 1069 |
-
for msg in memory.chat_memory.messages[-10:]:
|
| 1070 |
if isinstance(msg, HumanMessage):
|
| 1071 |
chat_history += f"User: {msg.content}\n"
|
| 1072 |
elif isinstance(msg, AIMessage):
|
| 1073 |
chat_history += f"Assistant: {msg.content}\n"
|
| 1074 |
-
|
| 1075 |
-
|
| 1076 |
-
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
|
|
|
| 1081 |
if target_language != "English" and not any(
|
| 1082 |
keyword in message.lower() for keyword in ['kazakh', 'қазақша', 'қазақ тілінде', 'russian', 'русский', 'орысша']
|
| 1083 |
):
|
| 1084 |
modified_message = f"Explain in {target_language}: {message}"
|
| 1085 |
else:
|
| 1086 |
modified_message = message
|
| 1087 |
-
|
| 1088 |
-
|
| 1089 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1090 |
memory.chat_memory.add_user_message(message)
|
| 1091 |
memory.chat_memory.add_ai_message(response)
|
| 1092 |
-
|
| 1093 |
return response
|
| 1094 |
except Exception as e:
|
| 1095 |
return f"Error processing direct Gemini request: {str(e)}"
|
|
|
|
| 10 |
import re
|
| 11 |
import uuid
|
| 12 |
import hashlib
|
| 13 |
+
import google.generativeai as genai
|
| 14 |
|
| 15 |
from dotenv import load_dotenv
|
| 16 |
import gradio as gr
|
| 17 |
|
| 18 |
from langchain_community.document_loaders import DirectoryLoader, TextLoader
|
| 19 |
from langchain.text_splitter import CharacterTextSplitter
|
|
|
|
| 20 |
from langchain_chroma import Chroma
|
|
|
|
| 21 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 22 |
from langchain.memory import ConversationBufferMemory
|
| 23 |
from langchain.chains import ConversationalRetrievalChain
|
|
|
|
| 422 |
return words
|
| 423 |
|
| 424 |
class PersonalizedKazakhAssistant:
|
| 425 |
+
def __init__(self, target_language: str = "English"):
    """Build the assistant and all of its collaborators.

    Runs the three setup steps in a fixed order (environment, vector
    store, LLM), then creates the learning tracker and the empty
    per-user lookup tables.

    Args:
        target_language: Language name forwarded to ``setup_llm``.
    """
    # Created before the setup calls, exactly as in the original order.
    # NOTE(review): presumably populated by setup_vectorstore -- confirm.
    self.known_terms = set()

    # The setup order is preserved; do not reorder these calls.
    self.setup_environment()
    self.setup_vectorstore()
    self.setup_llm(target_language)

    self.tracker = PersonalizedLearningTracker()

    # Per-user state, both starting empty.
    self.user_sessions, self.user_memories = {}, {}
|
| 433 |
|
| 434 |
def setup_environment(self):
|
| 435 |
"""Setup environment and configuration"""
|
|
|
|
| 436 |
load_dotenv()
|
| 437 |
+
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
| 438 |
self.MODEL = "gemini-1.5-flash"
|
| 439 |
self.db_name = "vector_db"
|
| 440 |
|
|
|
|
| 484 |
self.vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=self.db_name)
|
| 485 |
print(f"Vectorstore created with {self.vectorstore._collection.count()} documents")
|
| 486 |
|
| 487 |
+
def setup_llm(self, target_language: str = "English"):
    """Create the Gemini model used for knowledge-base answers.

    Formats the system prompt with ``target_language``, stores it on
    ``self.system_prompt`` and stores a ``genai.GenerativeModel`` built
    from ``self.MODEL`` and that prompt on ``self.llm``.

    Args:
        target_language: Language the prompt tells the model to respond in.
    """
    # NOTE(review): the prompt lines are reproduced flush-left, as they
    # appear in this view; confirm against the real file's indentation.
    prompt = f"""
You are a personalized Kazakh language learning assistant with access to a comprehensive knowledge base and user learning history. Your role is to help users learn Kazakh words and idioms while tracking their progress and providing personalized recommendations. Respond in {target_language}.

Key capabilities:
1. **Answer Queries**: Provide accurate definitions and examples for Kazakh words and idioms
2. **Track Learning Progress**: Identify and track when users learn new words or idioms
3. **Personalized Responses**: Adapt responses based on user's learning history
4. **Progress Reporting**: Provide detailed progress reports when asked
5. **Learning Recommendations**: Suggest words/idioms to review or learn next

Response Guidelines:
- For word/idiom queries: Provide definition, usage examples, and related information in {target_language}
- When explaining a Kazakh word or idiom retrieved from the knowledge base, **bold** the term (e.g., **күләпара**) in the response to highlight it
- Only bold the main term or idiom being explained, not other Kazakh words
- Always identify the main Kazakh word/idiom for progress tracking
- Be encouraging and supportive
- Use simple, clear explanations
- When discussing progress, be specific and motivating
- Avoid storing definitions as terms
- Normalize terms to lowercase
- Respond in conversational style
"""
    # Sampling settings passed to the model as generation_config.
    sampling = {
        "temperature": 0.7,
        "max_output_tokens": 500,
    }

    self.system_prompt = prompt
    self.llm = genai.GenerativeModel(
        model_name=self.MODEL,
        system_instruction=prompt,
        generation_config=sampling,
    )
|
| 519 |
|
| 520 |
def normalize_term(self, term: str) -> str:
|
|
|
|
| 522 |
return ' '.join(term.lower().strip().split())
|
| 523 |
|
| 524 |
def extract_kazakh_terms(self, message: str, response: str) -> List[Tuple[str, str, str]]:
    """Extract bolded Kazakh terms from ``response`` that are known terms.

    A term is a ``**...**`` span made only of Cyrillic/Kazakh letters
    (words may be joined by a single whitespace character or hyphen).
    Each match is normalized with ``self.normalize_term`` and kept only
    if it is 3-100 characters long, not already collected, and present
    in ``self.known_terms``.

    For a kept term, the documents returned by
    ``self.vectorstore.similarity_search(message, k=5)`` are scanned in
    order; the first one whose normalized content contains the term
    decides the category from its ``doc_type`` metadata and supplies the
    text handed to ``self.extract_clean_definition``. If that yields no
    definition, the definition is extracted from ``response`` alone.
    Terms without a definition are dropped.

    Args:
        message: The user's message, used as the similarity-search query.
        response: The model's reply that is scanned for bolded terms.

    Returns:
        A list of ``(term, category, definition)`` tuples, where ``term``
        is the text as it appeared in the response and ``category`` is
        ``"word"``, ``"idiom"`` or ``"grammar"``. Any exception is
        printed and whatever was collected so far is returned.
    """
    terms = []
    seen_terms = set()

    # Ё/ё lie outside the А-Я / а-я ranges, so they are listed explicitly;
    # without them a bolded term containing that letter never matches.
    letters = 'А-Яа-яЁёӘәҒғҚқҢңӨөҰұҮүҺһІі'
    # Pattern to match bolded terms (e.g., **күләпара**)
    bold_pattern = rf'\*\*([{letters}]+(?:[\s-][{letters}]+)*)\*\*'

    try:
        bold_matches = re.findall(bold_pattern, response)
        if not bold_matches:
            # Nothing to classify: skip the vector-store query entirely.
            return terms

        retrieved_docs = self.vectorstore.similarity_search(message, k=5)
        # Normalize each document once instead of once per candidate term.
        normalized_docs = [
            (doc, self.normalize_term(doc.page_content))
            for doc in retrieved_docs
        ]

        for term in bold_matches:
            normalized_term = self.normalize_term(term)
            if normalized_term in seen_terms or not 2 < len(normalized_term) <= 100:
                print(f"Skipped term {normalized_term}: Invalid length or already seen")
                continue

            # Only terms present in the known-terms set are tracked.
            if normalized_term not in self.known_terms:
                continue

            category = "word"
            definition = ""
            # The first retrieved document mentioning the term decides both
            # the category and the source text for the definition.
            for doc, normalized_content in normalized_docs:
                if normalized_term not in normalized_content:
                    continue
                # Tolerate a missing or None doc_type instead of raising.
                doc_type = str(doc.metadata.get('doc_type') or '').lower()
                if 'idiom' in doc_type or 'тіркес' in doc_type:
                    category = "idiom"
                elif 'grammar' in doc_type:
                    category = "grammar"
                definition = self.extract_clean_definition(normalized_term, doc.page_content, response)
                break

            if not definition:
                # Fall back to extracting the definition from the response only.
                definition = self.extract_clean_definition(normalized_term, "", response)

            if definition:
                terms.append((term, category, definition))
                seen_terms.add(normalized_term)
                print(f"Added bolded term: {term}, category: {category}, definition: {definition}")

    except Exception as e:
        # Best-effort boundary: term tracking must not break the chat reply.
        print(f"Error extracting terms: {e}")

    return terms
|
|
|
|
| 571 |
|
| 572 |
def extract_clean_definition(self, term: str, doc_content: str, response: str) -> str:
|
| 573 |
"""Extract a clean definition for a term from the knowledge base."""
|
|
|
|
| 610 |
"""Process user message with proper user session management and toggle for direct Gemini"""
|
| 611 |
|
| 612 |
if session_token and not self.tracker.validate_session(user_id, session_token):
|
| 613 |
+
return f"Session expired. Please login again in {target_language}."
|
| 614 |
+
|
| 615 |
if session_token:
|
| 616 |
self.tracker.update_session_activity(user_id, session_token)
|
| 617 |
|
| 618 |
if user_id not in self.user_sessions:
|
| 619 |
self.user_sessions[user_id] = self.tracker.start_session(user_id)
|
| 620 |
+
|
| 621 |
+
# Set up LLM with the specified target language
|
| 622 |
+
self.setup_llm(target_language)
|
| 623 |
|
| 624 |
+
# Handle special commands
|
| 625 |
if message.lower().startswith('/progress'):
|
| 626 |
return self.get_progress_report(user_id)
|
| 627 |
elif message.lower().startswith('/recommendations'):
|
|
|
|
| 635 |
elif message.lower().startswith('/newword'):
|
| 636 |
new_word = self.get_new_word(user_id)
|
| 637 |
if not new_word:
|
| 638 |
+
return f"Қазір жаңа сөздер жоқ. Басқа сөздерді қайталаңыз! 🌟\n\nNo new words available right now. Review other words! 🌟"
|
| 639 |
self.tracker.track_word_encounter(user_id, new_word['word'], new_word['definition'], new_word['category'])
|
| 640 |
return f"📝 **Жаңа сөз / New Word**: {new_word['word']}\n\nМағынасы / Meaning: {new_word['definition']}"
|
| 641 |
elif message.lower().startswith('/newidiom'):
|
| 642 |
new_idiom = self.get_new_idiom(user_id)
|
| 643 |
if not new_idiom:
|
| 644 |
+
return f"Қазір жаңа тіркестер жоқ. Басқа тіркестерді қайталаңыз! 🌟\n\nNo new idioms available right now. Review other idioms! 🌟"
|
| 645 |
self.tracker.track_word_encounter(user_id, new_idiom['word'], new_idiom['definition'], new_idiom['category'])
|
| 646 |
return f"🎭 **Жаңа тіркес / New Idiom**: {new_idiom['word']}\n\nМағынасы / Meaning: {new_idiom['definition']}"
|
| 647 |
elif message.lower().startswith('/help'):
|
| 648 |
return self.get_help_message()
|
| 649 |
+
|
| 650 |
if use_direct_gemini:
|
| 651 |
return self.process_direct_gemini(message, user_id, target_language)
|
| 652 |
|
| 653 |
+
# Retrieve relevant documents from vectorstore
|
| 654 |
+
retrieved_docs = self.vectorstore.similarity_search(message, k=5)
|
| 655 |
+
context = "\n".join([doc.page_content for doc in retrieved_docs])
|
| 656 |
|
| 657 |
+
# Get conversation history
|
| 658 |
+
memory = self.get_user_memory(user_id)
|
| 659 |
+
chat_history = ""
|
| 660 |
+
for msg in memory.chat_memory.messages[-10:]:
|
| 661 |
+
if isinstance(msg, HumanMessage):
|
| 662 |
+
chat_history += f"User: {msg.content}\n"
|
| 663 |
+
elif isinstance(msg, AIMessage):
|
| 664 |
+
chat_history += f"Assistant: {msg.content}\n"
|
| 665 |
+
|
| 666 |
+
# Retrieve user progress from SQLite database
|
| 667 |
+
progress = self.tracker.get_user_progress(user_id)
|
| 668 |
+
words_to_review = self.tracker.get_words_to_review(user_id, 5)
|
| 669 |
+
mastered_words = self.tracker.get_mastered_words(user_id, page=1, page_size=5)
|
| 670 |
+
|
| 671 |
+
progress_summary = """
|
| 672 |
+
User Learning Progress (in {target_language}):
|
| 673 |
+
- Total Terms Learned: {total_words}
|
| 674 |
+
- Category Statistics:
|
| 675 |
+
{category_stats}
|
| 676 |
+
- Recent Activity: {recent_activity} terms reviewed in the last 7 days
|
| 677 |
+
- Words to Review:
|
| 678 |
+
{words_to_review}
|
| 679 |
+
- Mastered Words:
|
| 680 |
+
{mastered_words}
|
| 681 |
+
""".format(
|
| 682 |
+
target_language=target_language,
|
| 683 |
+
total_words=progress['total_words'],
|
| 684 |
+
category_stats=''.join([f" - {category}: {stats['count']} terms, Average Mastery: {stats['average_mastery']}/5\n"
|
| 685 |
+
for category, stats in progress['category_stats'].items()]),
|
| 686 |
+
recent_activity=progress['recent_activity'],
|
| 687 |
+
words_to_review=''.join([f" - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
|
| 688 |
+
for word in words_to_review]),
|
| 689 |
+
mastered_words=''.join([f" - {word['word']} (Category: {word['category']}, Mastery: {word['mastery_level']}/5, Encounters: {word['encounter_count']})\n"
|
| 690 |
+
for word in mastered_words])
|
| 691 |
+
)
|
| 692 |
+
|
| 693 |
+
# Construct prompt with context, history, and progress
|
| 694 |
+
full_prompt = f"""
|
| 695 |
+
{self.system_prompt}
|
| 696 |
+
|
| 697 |
+
Previous conversation:
|
| 698 |
+
{chat_history}
|
| 699 |
|
| 700 |
+
Context from knowledge base:
|
| 701 |
+
{context}
|
| 702 |
+
|
| 703 |
+
{progress_summary}
|
| 704 |
+
|
| 705 |
+
User question: {message}
|
| 706 |
+
|
| 707 |
+
Respond in {target_language}. If explaining a Kazakh word or idiom retrieved from the context, **bold** the term (e.g., **күләпара**) in your response to highlight it. Only bold the main term being explained.
|
| 708 |
+
"""
|
| 709 |
+
|
| 710 |
+
# Call Gemini API
|
| 711 |
+
response = self.llm.generate_content(full_prompt).text
|
| 712 |
+
|
| 713 |
+
# Add to conversation memory
|
| 714 |
+
memory.chat_memory.add_user_message(message)
|
| 715 |
+
memory.chat_memory.add_ai_message(response)
|
| 716 |
+
|
| 717 |
+
# Extract and track terms
|
| 718 |
+
extracted_terms = self.extract_kazakh_terms(message, response)
|
| 719 |
for term, category, definition in extracted_terms:
|
| 720 |
+
if definition and term:
|
| 721 |
+
self.tracker.track_word_encounter(user_id, term, definition, category)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
|
| 723 |
return response
|
| 724 |
|
|
|
|
| 917 |
"""Process message using direct Gemini with conversation memory for context."""
|
| 918 |
try:
|
| 919 |
memory = self.get_user_memory(user_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 920 |
chat_history = ""
|
| 921 |
+
for msg in memory.chat_memory.messages[-10:]:
|
| 922 |
if isinstance(msg, HumanMessage):
|
| 923 |
chat_history += f"User: {msg.content}\n"
|
| 924 |
elif isinstance(msg, AIMessage):
|
| 925 |
chat_history += f"Assistant: {msg.content}\n"
|
| 926 |
+
|
| 927 |
+
direct_prompt = """
|
| 928 |
+
You are a friendly and supportive Kazakh language learning assistant. Your role is to help users learn Kazakh vocabulary, grammar, and idioms in a clear, concise, and engaging way. Respond in the user's primary language, inferred from their input, unless a specific language (English, Kazakh, or Russian) is requested. Provide practical examples and explanations tailored to language learners. Keep responses concise (under 200 words) and encouraging. Use your internal knowledge to ensure accuracy and relevance, focusing exclusively on Kazakh language learning.
|
| 929 |
+
|
| 930 |
+
Previous conversation context:
|
| 931 |
+
{chat_history}
|
| 932 |
+
"""
|
| 933 |
+
|
| 934 |
if target_language != "English" and not any(
|
| 935 |
keyword in message.lower() for keyword in ['kazakh', 'қазақша', 'қазақ тілінде', 'russian', 'русский', 'орысша']
|
| 936 |
):
|
| 937 |
modified_message = f"Explain in {target_language}: {message}"
|
| 938 |
else:
|
| 939 |
modified_message = message
|
| 940 |
+
|
| 941 |
+
direct_model = genai.GenerativeModel(
|
| 942 |
+
model_name=self.MODEL,
|
| 943 |
+
system_instruction=direct_prompt.format(chat_history=chat_history),
|
| 944 |
+
generation_config={
|
| 945 |
+
"temperature": 0.7,
|
| 946 |
+
"max_output_tokens": 200
|
| 947 |
+
}
|
| 948 |
+
)
|
| 949 |
+
|
| 950 |
+
response = direct_model.generate_content(modified_message).text
|
| 951 |
+
|
| 952 |
memory.chat_memory.add_user_message(message)
|
| 953 |
memory.chat_memory.add_ai_message(response)
|
| 954 |
+
|
| 955 |
return response
|
| 956 |
except Exception as e:
|
| 957 |
return f"Error processing direct Gemini request: {str(e)}"
|