Spaces:
Runtime error
Runtime error
File size: 43,567 Bytes
0815850 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 | from flask import 
Blueprint, request, jsonify, current_app
import json
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import string
import tempfile
from datetime import datetime
# Optional speech-to-text dependency (whisper).
# Both the import and the model load are wrapped in try/except so that a
# missing package or a failed model load leaves the module importable; the
# outcome is recorded in MODEL_AVAILABLE / model.
# NOTE: the load is attempted eagerly, while this module is being imported.
MODEL_NAME = "base"  # name passed to whisper.load_model
model = None  # loaded whisper model, or None when unavailable
MODEL_AVAILABLE = False  # True only after load_model succeeded
try:
    import whisper
    try:
        model = whisper.load_model(MODEL_NAME)
        MODEL_AVAILABLE = True
        print(f"Whisper model '{MODEL_NAME}' loaded successfully")
    except Exception as ex:
        # Package is importable but the model itself could not be loaded.
        print(f"Whisper installed but failed to load model '{MODEL_NAME}': {ex}")
        model = None
        MODEL_AVAILABLE = False
except Exception as ex:
    # The whisper import itself failed.
    print(f"Whisper not available: {ex}")
    model = None
    MODEL_AVAILABLE = False
# Optional spell-checking dependency (SymSpell).
# SYMSPELL_AVAILABLE records whether both symspellpy and pkg_resources could
# be imported; an ImportError from either one disables spell checking.
try:
    from symspellpy import SymSpell, Verbosity
    import pkg_resources
    SYMSPELL_AVAILABLE = True
except ImportError:
    print("SymSpell not available. Please install: pip install symspellpy")
    SYMSPELL_AVAILABLE = False
# Blueprint that carries every route defined in this module.
staticchat_bp = Blueprint("staticchat", __name__)
# NOTE: Blueprints do not have a config dict. MAX_CONTENT_LENGTH must be set on the Flask app.
# If you want to enforce max content size, set app.config["MAX_CONTENT_LENGTH"] when creating the Flask app.
# Initialize SymSpell if available.
# sym_spell stays None when the import guard above disabled spell checking.
sym_spell = None
if SYMSPELL_AVAILABLE:
    try:
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        # Locate the frequency dictionaries inside the symspellpy package.
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt"
        )
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
        )
        # Load dictionaries
        sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=1)
        print("SymSpell spell checker initialized successfully")
    except Exception as e:
        # Any failure turns spell checking off via the flag. sym_spell itself
        # is not reset here, so callers must check SYMSPELL_AVAILABLE as well.
        print(f"Failed to initialize SymSpell: {e}")
        SYMSPELL_AVAILABLE = False
# Try to import NLTK with fallback.
# NLTK_AVAILABLE tells preprocess_text whether the tokenizer, stopword list
# and lemmatizer imported below may be used.
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer

    # Make sure every resource used by preprocess_text is present, downloading
    # whatever is missing. 'punkt_tab' is what word_tokenize loads on recent
    # NLTK releases (it replaced the pickled 'punkt' models); without it the
    # tokenizer raises LookupError on every call and the NLP branch of
    # preprocess_text never succeeds. 'punkt' is kept for older releases.
    for _resource_path, _package_name in (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
    ):
        try:
            nltk.data.find(_resource_path)
        except LookupError:
            nltk.download(_package_name, quiet=True)
    NLTK_AVAILABLE = True
except Exception as e:
    print(f"NLTK not available, using simple text processing: {e}")
    NLTK_AVAILABLE = False
# Enhanced Scenario configurations.
# Canned replies keyed by scenario name. Every entry carries a "message",
# media URLs ("audio_url", "video_url", "story_url", "detail_url",
# "example_url") and "type". Entries that are detected from user text also
# carry "keywords"; "not_available" and "not_understandable" carry
# "suggestions" instead.
SCENARIOS = {
    "greeting": {
        "keywords": ["good morning", "good afternoon", "good evening", "hello", "hi", "hey", "greetings"],
        # The greeting message is a dict keyed by part of day rather than a
        # plain string like the other scenarios.
        "message": {
            "morning": "Good morning! Let's begin our lesson on tenses. You can ask me any question about tenses",
            "afternoon": "Good afternoon! Let's begin our lesson on tenses. You can ask me any question about tenses",
            "evening": "Good evening! Let's begin our lesson on tenses. You can ask me any question about tenses",
            "general": "Hello! Welcome to the English Tenses Learning Assistant. How can I help you with tenses today?"
        },
        "audio_url": "assets/staticchat/intro.mp3",
        "video_url": "assets/staticchat/intro.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    "thanks": {
        "keywords": ["thank you", "thanks", "thank you very much", "appreciate it", "thanks a lot"],
        "message": "You're welcome! Do you have any other questions?",
        "audio_url": "assets/staticchat/you_are_welcome.mp3",
        "video_url": "assets/staticchat/you_are_welcome.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    "farewell": {
        "keywords": ["bye", "goodbye", "see you", "farewell", "take care", "bye bye"],
        "message": "Goodbye! Keep practicing your English tenses. Remember, practice makes perfect!",
        "audio_url": "assets/staticchat/bye.mp3",
        "video_url": "assets/staticchat/bye.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    # Returned when a tense-related question has no good match in the database.
    "not_available": {
        # NOTE(review): the second sentence reads oddly ("Let's not available");
        # confirm the intended wording against the recorded audio/video before
        # changing it.
        "message": "I don't have the answer for that. Let's not available in my lesson today.",
        "suggestions": [
            "Try asking about common tenses like present simple or past perfect",
            "Ask me about tense structures or examples",
            "Check if your question is specifically about English verb tenses"
        ],
        "audio_url": "assets/staticchat/no_db.mp3",
        "video_url": "assets/staticchat/no_db.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    "out_of_syllabus": {
        "keywords": [
            # sports
            "sports", "sport", "cricket", "ipl", "match", "score", "wicket", "runs", "bat", "bowling",
            "football", "basketball", "tennis", "hockey",
            # other non-tense topics
            "weather", "rain", "sunny", "temperature",
            "food", "pizza", "burger", "restaurant", "cooking",
            "movie", "music", "song", "artist", "film",
            "history", "science", "math", "politics", "geography", "economics", "physics",
            # general grammar (NOT tenses)
            "noun", "pronoun", "adjective", "adverb", "preposition", "conjunction",
            "punctuation", "comma", "full stop", "spelling", "vocabulary", "synonym", "antonym",
            "phonetics", "pronunciation"
        ],
        "message": "That's not part of our tense lesson. Let's stay on our topic.",
        "audio_url": "assets/staticchat/out_of_topic.mp3",
        "video_url": "assets/staticchat/out_of_topic.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    # Returned for empty-looking or gibberish input.
    "not_understandable": {
        "message": "I don't understand your question. Can you ask it again more simply?",
        "suggestions": [
            "Try using simpler words",
            "Ask about specific tenses like 'What is present tense?'",
            "Ask for examples of tenses",
            "Check your spelling and grammar"
        ],
        "audio_url": "assets/staticchat/not_understand.mp3",
        "video_url": "assets/staticchat/not_understand.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    }
}
# Load questions from JSON file
def load_questions(path='assets/qa.json'):
    """Load the question/answer records from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to ``'assets/qa.json'``,
            the path this function has always read, so existing callers are
            unaffected. A relative path is resolved by ``open`` against the
            current working directory.

    Returns:
        The parsed JSON document (each item is read as a dict with a
        ``'question'`` key), or ``[]`` when the file is missing or is not
        valid JSON.
    """
    # Only the file access and parsing are guarded; the statistics below are
    # pure bookkeeping on the loaded data.
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: {path} not found")
        return []
    except json.JSONDecodeError as e:
        print(f"Error parsing {path}: {e}")
        return []

    print(f"Loaded {len(data)} questions from {path}")

    # Debug: count how many questions mention each flavour of present tense.
    tense_categories = {}
    for item in data:
        q = item['question'].lower()
        if 'present' not in q:
            continue
        if 'continuous' in q or 'progressive' in q:
            category = 'present_continuous'
        elif 'perfect' in q:
            category = 'present_perfect'
        elif 'simple' in q:
            category = 'present_simple'
        else:
            category = 'present_general'
        tense_categories[category] = tense_categories.get(category, 0) + 1
    print(f"Tense categories in database: {tense_categories}")
    return data
# Spell correction function
def correct_spelling(text):
    """Return *text* with spelling fixed by SymSpell when it is usable.

    Each whitespace-separated word longer than two characters is looked up
    individually; afterwards a compound lookup is run on the original text and
    its top suggestion is returned whenever it differs from the word-by-word
    result. The input is returned unchanged when SymSpell is unavailable or
    any error occurs.
    """
    if not SYMSPELL_AVAILABLE or sym_spell is None:
        return text

    def fix_word(token):
        # Very short words are passed through untouched.
        if len(token) <= 2:
            return token
        candidates = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=2)
        if not candidates or candidates[0].term == token:
            return token
        replacement = candidates[0].term
        print(f"Corrected '{token}' to '{replacement}'")
        return replacement

    try:
        per_word_result = ' '.join([fix_word(token) for token in text.split()])

        # Compound lookup works on the raw input and may also repair
        # word-boundary mistakes.
        compound = sym_spell.lookup_compound(text, max_edit_distance=2)
        if compound and compound[0].term != per_word_result:
            print(f"Bigram correction: '{text}' -> '{compound[0].term}'")
            return compound[0].term
        return per_word_result
    except Exception as e:
        print(f"Spell correction error: {e}")
        return text
# Enhanced text preprocessing
def preprocess_text(text):
    """Normalise *text* for matching.

    Steps: optional spelling correction, lowercasing, replacing every
    character that is neither a word character nor whitespace with a space,
    and collapsing whitespace. With NLTK available the result is tokenised,
    stripped of stopwords (auxiliary verbs are kept) and lemmatised as verbs;
    if that fails, the normalised text is returned as is. Without NLTK a
    small built-in stopword list is applied instead.
    """
    if SYMSPELL_AVAILABLE:
        text = correct_spelling(text)

    normalized = re.sub(r'[^\w\s]', ' ', text.lower())
    normalized = ' '.join(normalized.split())

    if not NLTK_AVAILABLE:
        # Words that always survive, even when they are listed as stopwords.
        keep = {'tense', 'tenses', 'present', 'past', 'future',
                'continuous', 'perfect', 'simple', 'progressive',
                'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had',
                'do', 'does', 'did', 'will', 'shall', 'would', 'could', 'should'}
        drop = {'a', 'an', 'the', 'of', 'in', 'on', 'at', 'by', 'for',
                'with', 'about', 'against', 'between', 'into', 'through',
                'during', 'before', 'after', 'above', 'below', 'to', 'from',
                'up', 'down', 'out', 'off', 'over', 'under', 'again',
                'further', 'then', 'once', 'here', 'there', 'when', 'where',
                'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
                'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
                'own', 'same', 'so', 'than', 'too', 'very', 'can', 'may',
                'might', 'must', 'ought', 'shall', 'should', 'will', 'would'}
        return ' '.join(
            token for token in normalized.split()
            if token in keep or token not in drop
        )

    try:
        tokens = word_tokenize(normalized)
        # Auxiliaries carry tense information, so they are exempt from
        # stopword removal.
        auxiliaries = {'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had',
                       'do', 'does', 'did', 'will', 'shall', 'would', 'could', 'should'}
        ignored = set(stopwords.words('english')).difference(auxiliaries)
        kept_tokens = [token for token in tokens if token not in ignored]
        lemmatizer = WordNetLemmatizer()
        lemmas = [lemmatizer.lemmatize(token, pos='v') for token in kept_tokens]
        return ' '.join(lemmas)
    except Exception as e:
        print(f"Error in NLP processing: {e}")
        # Fall back to the normalised text without stopword handling.
        return normalized
def _scenario_response(name, message=None):
    """Build the detect_scenario result dict for the SCENARIOS entry *name*.

    *message* overrides the configured message (needed for the greeting, whose
    configured message is a dict keyed by part of day).
    """
    config = SCENARIOS[name]
    return {
        "scenario": name,
        "message": config["message"] if message is None else message,
        "audio_url": config["audio_url"],
        "video_url": config["video_url"],
        "story_url": config.get("story_url", ""),
        "detail_url": config.get("detail_url", ""),
        "example_url": config.get("example_url", ""),
    }


def _matched_keywords(text, keywords):
    """Return the keywords that occur in *text* as whole words or phrases."""
    return [
        keyword for keyword in keywords
        if re.search(rf"\b{re.escape(keyword)}\b", text)
    ]


def detect_scenario(user_question):
    """Detect if the user input matches any special scenario.

    Checks run in priority order: greeting, thanks, farewell, out-of-syllabus
    topics, then unintelligible input.

    Keywords are matched on word boundaries. Plain substring tests would let
    "hi" fire inside "this", "hey" inside "they", "ipl" inside "multiple" or
    "is" inside "this", turning genuine tense questions into greetings or
    out-of-syllabus replies.

    Args:
        user_question: Raw text typed by the user.

    Returns:
        A dict with "scenario", "message" and the media URL keys when a
        scenario applies, otherwise None.
    """
    question_lower = user_question.lower().strip()

    # Greetings have the highest priority; the reply is chosen from the
    # server's local time of day.
    if _matched_keywords(question_lower, SCENARIOS["greeting"]["keywords"]):
        current_hour = datetime.now().hour
        if current_hour < 12:
            greeting_type = "morning"
        elif current_hour < 17:
            greeting_type = "afternoon"
        else:
            greeting_type = "evening"
        return _scenario_response(
            "greeting", SCENARIOS["greeting"]["message"][greeting_type]
        )

    for name in ("thanks", "farewell"):
        if _matched_keywords(question_lower, SCENARIOS[name]["keywords"]):
            return _scenario_response(name)

    # Out of syllabus: only when off-topic keywords outnumber tense keywords.
    out_hits = _matched_keywords(
        question_lower, set(SCENARIOS["out_of_syllabus"]["keywords"])
    )
    if out_hits:
        tense_keywords = ['tense', 'tenses', 'present', 'past', 'future',
                          'continuous', 'perfect', 'simple', 'progressive',
                          'verb', 'verbs', 'grammar', 'am', 'is', 'are',
                          'was', 'were', 'have', 'has', 'had']
        tense_hits = _matched_keywords(question_lower, tense_keywords)
        # At least as many tense words as off-topic words: treat it as a
        # tense question and let the normal matching pipeline handle it.
        if tense_hits and len(tense_hits) >= len(out_hits):
            return None
        return _scenario_response("out_of_syllabus")

    # Not understandable: nothing but punctuation, or a single character.
    clean_text = re.sub(r'[^\w\s]', '', question_lower)
    if len(clean_text.strip()) < 2:
        return _scenario_response("not_understandable")

    # Gibberish heuristic: implausibly long average word length.
    words = clean_text.split()
    if words:
        avg_word_len = sum(len(word) for word in words) / len(words)
        if avg_word_len > 15:
            return _scenario_response("not_understandable")

    return None
def check_topic_relevance(user_question):
    """Decide whether the question concerns English tenses.

    Returns False when an out-of-syllabus keyword appears without the word
    "tense(s)". Returns True when the text mentions "tense(s)", names a tense,
    combines a time word with an aspect word, or combines a helping verb with
    a usage/intent word. Anything else is treated as off-topic (False).
    """
    q = user_question.lower().strip()

    def mentions_any(words):
        # Whole-word search for any of the given plain words.
        return any(re.search(rf"\b{w}\b", q) for w in words)

    says_tense = re.search(r"\btense(s)?\b", q)

    # Off-topic vocabulary wins unless the user explicitly says "tense".
    off_topic_words = SCENARIOS["out_of_syllabus"].get("keywords", [])
    has_off_topic = any(
        re.search(rf"\b{re.escape(w)}\b", q) for w in off_topic_words
    )
    if has_off_topic and not says_tense:
        return False

    # Strong tense intent.
    if says_tense:
        return True

    # Explicit tense names.
    tense_names = [
        "present simple", "past simple", "future simple",
        "present continuous", "past continuous", "future continuous",
        "present perfect", "past perfect", "future perfect",
        "present perfect continuous", "past perfect continuous", "future perfect continuous",
    ]
    for tense_name in tense_names:
        if tense_name in q:
            return True

    # A time word together with an aspect word.
    if mentions_any(["present", "past", "future"]) and mentions_any(
        ["simple", "continuous", "perfect", "progressive"]
    ):
        return True

    # A helping verb together with a usage/rule style word.
    helping_verbs = ["am", "is", "are", "was", "were", "have", "has", "had", "do", "does", "did", "will", "shall", "would", "could", "should"]
    usage_words = ["use", "using", "when", "rule", "rules", "structure", "form", "difference", "between", "meaning", "example", "examples"]
    if mentions_any(helping_verbs) and mentions_any(usage_words):
        return True

    return False
# Initialize questions data (runs once, when the module is imported).
questions_data = load_questions()  # records loaded from the JSON file, [] on failure
question_texts = [item['question'] for item in questions_data]
# Same order as questions_data, so indices into one are valid for the other.
preprocessed_questions = [preprocess_text(q) for q in question_texts]
# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Use unigrams and bigrams
if preprocessed_questions:  # Only fit if we have questions
    tfidf_matrix = vectorizer.fit_transform(preprocessed_questions)
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
else:
    # No questions: the vectorizer stays unfitted and there is no matrix.
    tfidf_matrix = None
def calculate_similarity(user_question):
    """Score *user_question* against every stored question.

    Returns a 1-D array of cosine similarities between the TF-IDF vector of
    the preprocessed question and each row of ``tfidf_matrix``, or an empty
    array when no questions were loaded.
    """
    if not preprocessed_questions:
        # Nothing to compare against.
        return np.array([])

    query_vector = vectorizer.transform([preprocess_text(user_question)])
    return cosine_similarity(query_vector, tfidf_matrix)[0]
def keyword_match(user_question, questions):
    """Rank *questions* by word overlap with *user_question*.

    Both sides are run through ``preprocess_text``. Every question sharing at
    least one word yields a dict with its ``index`` in *questions*, a
    ``score`` (shared words divided by the size of the larger word set) and
    the ``common_words``. The list is returned best score first.
    """
    query_terms = set(preprocess_text(user_question).split())

    candidates = []
    for position, record in enumerate(questions):
        record_terms = set(preprocess_text(record['question']).split())
        shared = query_terms & record_terms
        if not shared:
            continue
        candidates.append({
            'index': position,
            'score': len(shared) / max(len(query_terms), len(record_terms)),
            'common_words': list(shared)
        })

    return sorted(candidates, key=lambda entry: entry['score'], reverse=True)
def verify_match_relevance(user_q, matched_q, matched_answer):
    """Check that a candidate match really addresses the user's question.

    A match is rejected (False) in three situations:

    * the user asks for a difference/comparison but the answer does not
      mention "difference", "compare" or "vs";
    * the user asks "how to"/"how do" but the answer neither contains "how"
      nor any instructional word (step, first, then, ...);
    * the user asks something specific ("when to use ...") while the matched
      question is a generic "what is ..." one, and none of the first three
      words following the specific phrase appear in the answer.

    Everything else is accepted (True).

    Args:
        user_q: The question as typed by the user.
        matched_q: The stored question selected by the matcher.
        matched_answer: The stored answer belonging to *matched_q*.

    Returns:
        bool: True when the match should be served, False to discard it.
    """
    user_q_lower = user_q.lower()
    matched_q_lower = matched_q.lower()
    matched_answer_lower = matched_answer.lower()

    # Diagnostic only: report when both questions open the same way.
    question_starters = {
        'what': ['what is', 'what are', 'what does', 'what do'],
        'how': ['how to', 'how do', 'how does'],
        'when': ['when to', 'when do', 'when does'],
        'why': ['why do', 'why does', 'why is']
    }
    user_starter = None
    matched_starter = None
    for starter_type, starters in question_starters.items():
        if any(starter in user_q_lower for starter in starters):
            user_starter = starter_type
        if any(starter in matched_q_lower for starter in starters):
            matched_starter = starter_type
    if user_starter and user_starter == matched_starter:
        print(f"Both are {user_starter} questions - accepting match")

    # If user asks for differences but answer doesn't compare, reject
    if 'difference' in user_q_lower or 'compare' in user_q_lower or 'versus' in user_q_lower:
        if not ('difference' in matched_answer_lower
                or 'compare' in matched_answer_lower
                or 'vs' in matched_answer_lower):
            print("User asked for differences but answer doesn't compare - rejecting")
            return False

    # If user asks "how to" but answer is just a definition, reject
    if ('how to' in user_q_lower or 'how do' in user_q_lower) and 'how' not in matched_answer_lower:
        instruction_words = ['step', 'first', 'second', 'then', 'next', 'finally', 'process']
        if not any(word in matched_answer_lower for word in instruction_words):
            print("User asked 'how to' but answer is not instructional - rejecting")
            return False

    # Check if the match is just generic when user asks for specific
    generic_questions = ['what is', 'what are', 'what does', 'what do']
    specific_questions = ['difference between', 'how to use', 'when to use',
                          'compare', 'explain the difference', 'give example of']
    user_is_specific = any(phrase in user_q_lower for phrase in specific_questions)
    match_is_generic = any(phrase in matched_q_lower for phrase in generic_questions)
    if user_is_specific and match_is_generic:
        user_specific_terms = []
        for phrase in specific_questions:
            if phrase in user_q_lower:
                # Up to three words that follow the first specific phrase found.
                idx = user_q_lower.find(phrase) + len(phrase)
                user_specific_terms = user_q_lower[idx:].strip().split()[:3]
                break
        if user_specific_terms:
            if not any(term in matched_answer_lower
                       for term in user_specific_terms if len(term) > 2):
                print("User asked specific, match is generic - likely wrong")
                return False

    # Diagnostic only: meaningful words shared by the two questions.
    stopwords_set = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
    common_core = {
        word for word in set(user_q_lower.split()) & set(matched_q_lower.split())
        if word not in stopwords_set and len(word) > 2
    }
    if len(common_core) >= 2:
        print(f"Common core words: {common_core} - accepting match")

    # Nothing argued against the match, so trust the matcher's score.
    return True
def verify_tense_specificity(user_q, matched_q, matched_answer):
    """Redirect a general tense question away from an over-specific match.

    When the user asks about "present tense", "past tense" or "future tense"
    without mentioning "continuous" or "perfect", but the matched question is
    about the continuous or perfect form of that tense, look through
    ``questions_data`` for a general question on the same tense.

    Returns the index of that general question, or None when no override is
    needed or none exists. ``matched_answer`` is accepted but not used.
    """
    asked = user_q.lower()
    matched = matched_q.lower()

    # A user who already names an aspect never triggers an override.
    if 'continuous' in asked or 'perfect' in asked:
        return None

    for tense in ('present', 'past', 'future'):
        general_phrase = f'{tense} tense'
        if general_phrase not in asked:
            continue

        # Only the first tense named by the user is considered.
        too_specific = (
            f'{tense} continuous' in matched or f'{tense} perfect' in matched
        )
        if too_specific:
            for position, record in enumerate(questions_data):
                candidate = record['question'].lower()
                if (general_phrase in candidate
                        and 'continuous' not in candidate
                        and 'perfect' not in candidate):
                    return position
        return None

    return None  # No need to override
def _scenario_payload(name, source, user_question):
    """Build the JSON body for a canned scenario reply.

    *source* supplies the message and media URLs: either a SCENARIOS entry or
    the dict returned by detect_scenario.
    """
    return {
        'success': True,
        'scenario': name,
        'message': source['message'],
        'audio_url': source.get('audio_url', ''),
        'video_url': source.get('video_url', ''),
        'story_url': source.get('story_url', ''),
        'detail_url': source.get('detail_url', ''),
        'example_url': source.get('example_url', ''),
        'user_question': user_question,
        'matching_method': 'scenario'
    }


def _not_available_payload(user_question, debug_info=None):
    """Build the 'not_available' reply, including its suggestions."""
    payload = _scenario_payload(
        'not_available', SCENARIOS['not_available'], user_question
    )
    payload['suggestions'] = SCENARIOS['not_available']['suggestions']
    if debug_info is not None:
        payload['debug_info'] = debug_info
    return payload


def _answer_payload(entry, user_question, score, method, **extra):
    """Build the JSON body for a matched question/answer record."""
    payload = {
        'success': True,
        'matched_question': entry['question'],
        'answer': entry['answer'],
        'sno': entry['sno'],
        'audio_url': entry.get('audio_url', ''),
        'video_url': entry.get('video_url', ''),
        'story_url': entry.get('story_url', ''),
        'detail_url': entry.get('detail_url', ''),
        'example_url': entry.get('example_url', ''),
        'confidence_score': float(score),
        'user_question': user_question,
        'matching_method': method
    }
    payload.update(extra)
    return payload


@staticchat_bp.route('/search', methods=['POST'])
def search_question():
    """Answer a user question posted as JSON ``{"question": "..."}``.

    Pipeline: special scenarios (greeting, thanks, ...) -> topic relevance ->
    TF-IDF match (with optional override towards the general tense question)
    -> keyword-overlap fallback -> 'not_available'.

    Returns:
        A JSON response. 400 when no usable question text is supplied
        (missing body, non-object JSON, non-string or blank "question"),
        500 on unexpected errors, otherwise 200.
    """
    try:
        # silent=True yields None instead of raising for a missing or
        # malformed JSON body, so bad requests get a 400 rather than
        # surfacing as a 500 from the generic handler below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        raw_question = data.get('question', '')
        original_question = raw_question.strip() if isinstance(raw_question, str) else ''
        if not original_question:
            return jsonify({
                'success': False,
                'message': 'Please provide a question'
            }), 400

        print(f"\n=== Processing: '{original_question}' ===")

        # First, check for special scenarios
        scenario_result = detect_scenario(original_question)
        if scenario_result:
            print(f"Detected scenario: {scenario_result['scenario']}")  # Debug log
            return jsonify(_scenario_payload(
                scenario_result['scenario'], scenario_result, original_question
            ))

        print("No scenario detected, checking topic relevance...")  # Debug log
        is_topic_relevant = check_topic_relevance(original_question)
        print(f"Topic relevant: {is_topic_relevant}")  # Debug log
        if not is_topic_relevant:
            # Not relevant and not caught by the out_of_syllabus scenario
            return jsonify(_scenario_payload(
                'out_of_syllabus', SCENARIOS['out_of_syllabus'], original_question
            ))

        # Calculate similarity if we have questions
        if not preprocessed_questions:
            return jsonify(_not_available_payload(original_question))
        similarity_scores = calculate_similarity(original_question)
        if len(similarity_scores) == 0:  # No questions loaded
            return jsonify(_not_available_payload(original_question))

        # Get the best match
        best_match_idx = int(similarity_scores.argmax())
        best_score = similarity_scores[best_match_idx]
        print(f"Best TF-IDF score: {best_score:.3f}")  # Debug log
        print(f"Matched to question #{best_match_idx + 1}: {questions_data[best_match_idx]['question']}")  # Debug log

        # Check if we need to override for tense specificity
        override_idx = verify_tense_specificity(
            original_question,
            questions_data[best_match_idx]['question'],
            questions_data[best_match_idx]['answer']
        )
        if override_idx is not None:
            best_match_idx = override_idx
            best_score = 0.9  # Set high score for exact match
            print(f"Overriding to general tense question: {questions_data[best_match_idx]['question']}")

        # Thresholds are deliberately high to prevent wrong matches
        tfidf_threshold = 0.35
        keyword_threshold = 0.25

        if best_score > tfidf_threshold:
            # Verify the match is actually relevant
            matched_question = questions_data[best_match_idx]
            if verify_match_relevance(original_question,
                                      matched_question['question'],
                                      matched_question['answer']):
                # Good match found with TF-IDF
                return jsonify(_answer_payload(
                    matched_question, original_question, best_score, 'tfidf',
                    spell_corrected=original_question if SYMSPELL_AVAILABLE else 'not_available'
                ))
            # Match is not actually relevant; fall through to the fallback
            print(f"Match verification failed. Score: {best_score:.3f}")
        else:
            print(f"Score below threshold. Score: {best_score:.3f}, Threshold: {tfidf_threshold}")

        # Try keyword matching as fallback (with higher threshold)
        keyword_matches = keyword_match(original_question, questions_data)
        print(f"Keyword matches found: {len(keyword_matches)}")  # Debug log
        if keyword_matches:
            print(f"Best keyword score: {keyword_matches[0]['score']:.3f}")  # Debug log
        if keyword_matches and keyword_matches[0]['score'] > keyword_threshold:
            best_keyword_match = keyword_matches[0]
            matched_question = questions_data[best_keyword_match['index']]
            # Verify keyword match too
            if verify_match_relevance(original_question,
                                      matched_question['question'],
                                      matched_question['answer']):
                return jsonify(_answer_payload(
                    matched_question, original_question,
                    best_keyword_match['score'], 'keyword',
                    common_words=best_keyword_match['common_words']
                ))
            print("Keyword match verification failed")

        # No good match found but question is tense-related
        return jsonify(_not_available_payload(
            original_question,
            debug_info={
                'best_tfidf_score': float(best_score),
                'best_keyword_score': keyword_matches[0]['score'] if keyword_matches else 0
            }
        ))
    except Exception as e:
        print(f"Error in search_question: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({
            'success': False,
            'message': f'Error processing request: {str(e)}'
        }), 500
@staticchat_bp.route('/questions', methods=['GET'])
def get_all_questions():
    """Return every question as a slim ``{sno, question}`` record.

    Only the serial number and the question text are included (answers and
    media URLs are left out) so the response stays small enough to feed an
    autocomplete widget.

    Returns:
        A JSON response with ``success``, ``questions`` and ``count`` on
        success, or ``(JSON error body, 500)`` if anything raises while the
        questions are loaded or serialized.
    """
    try:
        # Project each record down to the two fields the client needs.
        slim_entries = [
            {'sno': entry['sno'], 'question': entry['question']}
            for entry in load_questions()
        ]
        payload = {
            'success': True,
            'questions': slim_entries,
            'count': len(slim_entries),
        }
        return jsonify(payload)
    except Exception as e:
        failure = {
            'success': False,
            'message': str(e),
        }
        return jsonify(failure), 500
@staticchat_bp.route('/question/<int:sno>', methods=['GET'])
def get_question_by_sno(sno):
    """Look up a single question record by its serial number.

    Args:
        sno: Serial number taken from the URL (converted to ``int`` by the
            route's ``<int:sno>`` converter).

    Returns:
        A JSON response carrying the full record under ``question`` when a
        match exists, ``(JSON error body, 404)`` when none does, or
        ``(JSON error body, 500)`` if loading or serializing raises.
    """
    try:
        # Linear scan; stop at the first record whose 'sno' matches.
        found = None
        for candidate in load_questions():
            if candidate['sno'] == sno:
                found = candidate
                break

        if not found:
            not_found = {
                'success': False,
                'message': f'Question with SNO {sno} not found',
            }
            return jsonify(not_found), 404

        return jsonify({
            'success': True,
            'question': found,
        })
    except Exception as e:
        failure = {
            'success': False,
            'message': str(e),
        }
        return jsonify(failure), 500
@staticchat_bp.route('/suggestions', methods=['GET'])
def get_suggestions():
    """Return a random sample of question texts from ``questions_data``.

    Query parameters:
        count: Desired number of suggestions (default 5). A value that does
            not parse as an integer falls back to the default. The value is
            clamped to the range ``0 .. len(questions_data)``, so a negative
            or oversized request yields an empty or full sample instead of
            an error.

    Returns:
        A JSON response with ``success``, ``suggestions`` and ``count``.
        When ``questions_data`` is empty the body reports ``success: False``
        with an empty list (status code left at the default). Any unexpected
        exception produces ``(JSON error body, 500)``.
    """
    try:
        if not questions_data:
            return jsonify({
                'success': False,
                'message': "No questions available.",
                'suggestions': []
            })

        # Get parameter for number of suggestions
        count = request.args.get('count', default=5, type=int)

        # random.sample raises ValueError for a negative size or one larger
        # than the population; clamp so bad client input cannot cause a 500.
        sample_size = max(0, min(count, len(questions_data)))

        # Get random questions for suggestions
        import random
        random_questions = random.sample(questions_data, sample_size)
        suggestions = [q['question'] for q in random_questions]

        return jsonify({
            'success': True,
            'suggestions': suggestions,
            'count': len(suggestions)
        })
    except Exception as e:
        print(f"Error in get_suggestions: {str(e)}")
        return jsonify({
            'success': False,
            'message': str(e),
            'suggestions': []
        }), 500
@staticchat_bp.route('/scenarios', methods=['GET'])
def get_scenarios():
    """Summarize each entry of ``SCENARIOS`` for client inspection.

    For every scenario the summary exposes its ``type`` (defaulting to
    ``"scenario"``), whether non-empty audio and video URLs are configured,
    and its ``keywords`` list (defaulting to an empty list).

    Returns:
        A JSON response with ``success``, ``scenarios`` (name -> summary)
        and ``count``, or ``(JSON error body, 500)`` if anything raises.
    """
    try:
        summary_by_name = {
            name: {
                "type": details.get("type", "scenario"),
                # Report only presence of media, not the URLs themselves.
                "has_audio": bool(details.get("audio_url")),
                "has_video": bool(details.get("video_url")),
                "keywords": details.get("keywords", []),
            }
            for name, details in SCENARIOS.items()
        }
        payload = {
            'success': True,
            'scenarios': summary_by_name,
            'count': len(summary_by_name),
        }
        return jsonify(payload)
    except Exception as e:
        failure = {
            'success': False,
            'message': str(e),
        }
        return jsonify(failure), 500
@staticchat_bp.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe an uploaded audio file using the module-level ``model``.

    Expects a multipart form with a ``file`` part and an optional
    ``language`` field. The upload is written to a temporary file, passed
    to ``model.transcribe``, and the temporary file is removed afterwards
    whether or not transcription succeeded.

    Returns:
        ``{"text": ...}`` with the stripped transcription on success,
        ``({"error": ...}, 400)`` when the ``file`` part is missing or
        empty, or ``({"error": ...}, 500)`` if saving or transcribing
        raises.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file field named 'file'"}), 400

    f = request.files["file"]
    if not f:
        return jsonify({"error": "No file uploaded"}), 400

    # Optional language from client: en / hi / ta
    language = request.form.get("language")  # may be None

    tmp_path = None
    try:
        # Keep a suffix so ffmpeg/whisper detects it better;
        # ".webm" is a safe default for browser uploads.
        suffix = os.path.splitext(f.filename or "")[1].lower() or ".webm"

        # Only reserve a unique path here. The handle is closed before the
        # upload is written by name, because reopening a still-open
        # NamedTemporaryFile is not possible on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
        f.save(tmp_path)

        # Run local whisper
        result = model.transcribe(
            tmp_path,
            language=language or None,
            fp16=False  # CPU-only: must be False
        )
        text = (result.get("text") or "").strip()
        return jsonify({"text": text})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # Best-effort cleanup: never let a failed delete mask the response,
        # but do not swallow non-filesystem errors the way a bare except does.
        if tmp_path:
            try:
                os.remove(tmp_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Could not remove temp file {tmp_path}: {cleanup_error}")
|