Spaces:

pykara
/

py-learn-backend

Runtime error

File size: 43,567 Bytes
from flask import Blueprint, request, jsonify, current_app
import json
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import string
import tempfile
from datetime import datetime

# Whisper is an optional dependency. Neither a missing package nor a failed
# model load may stop this module from importing; MODEL_AVAILABLE tells the
# rest of the code whether `model` holds a usable Whisper model.
MODEL_NAME = "base"
model, MODEL_AVAILABLE = None, False
try:
    import whisper
except Exception as import_err:
    print(f"Whisper not available: {import_err}")
else:
    try:
        model = whisper.load_model(MODEL_NAME)
    except Exception as load_err:
        print(f"Whisper installed but failed to load model '{MODEL_NAME}': {load_err}")
    else:
        MODEL_AVAILABLE = True
        print(f"Whisper model '{MODEL_NAME}' loaded successfully")

# Optional spell-checking support: SYMSPELL_AVAILABLE records whether
# symspellpy and pkg_resources could both be imported.
try:
    from symspellpy import SymSpell, Verbosity
    import pkg_resources
except ImportError:
    SYMSPELL_AVAILABLE = False
    print("SymSpell not available. Please install: pip install symspellpy")
else:
    SYMSPELL_AVAILABLE = True


# Flask blueprint named "staticchat"; the route handlers in this module register on it.
staticchat_bp = Blueprint("staticchat", __name__)

# NOTE: Blueprints do not have a config dict. MAX_CONTENT_LENGTH must be set on the Flask app.
# If you want to enforce max content size, set app.config["MAX_CONTENT_LENGTH"] when creating the Flask app.

# Initialize SymSpell if available.
# On any failure the checker is disabled completely: `sym_spell` is reset to
# None and SYMSPELL_AVAILABLE to False, so callers never see a half-loaded
# instance.
sym_spell = None
if SYMSPELL_AVAILABLE:
    try:
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt"
        )
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
        )

        # The loaders report a missing file by returning False instead of
        # raising, so the result has to be checked explicitly; otherwise an
        # empty dictionary would be reported as a successful initialization.
        if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
            raise FileNotFoundError(f"dictionary file not found: {dictionary_path}")
        if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=1):
            raise FileNotFoundError(f"bigram dictionary file not found: {bigram_path}")
        print("SymSpell spell checker initialized successfully")
    except Exception as e:
        print(f"Failed to initialize SymSpell: {e}")
        sym_spell = None
        SYMSPELL_AVAILABLE = False

# Try to import NLTK with fallback.
# NLTK_AVAILABLE is True when the package imports and the resource checks run
# without raising; otherwise the module falls back to simple text processing.
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer

    # (lookup path, downloader id) for every resource the text pipeline uses.
    # 'punkt_tab' is included because newer NLTK releases load the tokenizer
    # data from it instead of 'punkt'; without it word_tokenize would fail on
    # every call. Fetching both keeps old and new NLTK versions working.
    for _resource_path, _package_id in (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
    ):
        try:
            nltk.data.find(_resource_path)
        except LookupError:
            # Resource not installed locally: fetch it quietly.
            nltk.download(_package_id, quiet=True)

    NLTK_AVAILABLE = True
except Exception as e:
    print(f"NLTK not available, using simple text processing: {e}")
    NLTK_AVAILABLE = False

# Canned chatbot scenarios, keyed by scenario name.
def _scenario(media_stem, message, keywords=None, suggestions=None):
    """Build one scenario entry.

    `media_stem` is the file name (without extension) shared by the scenario's
    mp3/mp4 assets. `keywords` and `suggestions` are only stored when given,
    and the keys are inserted in the order keywords, message, suggestions,
    then the media/url fields and the type marker.
    """
    entry = {}
    if keywords is not None:
        entry["keywords"] = keywords
    entry["message"] = message
    if suggestions is not None:
        entry["suggestions"] = suggestions
    entry["audio_url"] = f"assets/staticchat/{media_stem}.mp3"
    entry["video_url"] = f"assets/staticchat/{media_stem}.mp4"
    entry["story_url"] = ""
    entry["detail_url"] = ""
    entry["example_url"] = ""
    entry["type"] = "scenario"
    return entry


SCENARIOS = {
    "greeting": _scenario(
        "intro",
        # One greeting per time of day, plus a generic fallback text.
        {
            "morning": "Good morning! Let's begin our lesson on tenses. You can ask me any question about tenses",
            "afternoon": "Good afternoon! Let's begin our lesson on tenses. You can ask me any question about tenses",
            "evening": "Good evening! Let's begin our lesson on tenses. You can ask me any question about tenses",
            "general": "Hello! Welcome to the English Tenses Learning Assistant. How can I help you with tenses today?",
        },
        keywords=["good morning", "good afternoon", "good evening", "hello", "hi", "hey", "greetings"],
    ),
    "thanks": _scenario(
        "you_are_welcome",
        "You're welcome! Do you have any other questions?",
        keywords=["thank you", "thanks", "thank you very much", "appreciate it", "thanks a lot"],
    ),
    "farewell": _scenario(
        "bye",
        "Goodbye! Keep practicing your English tenses. Remember, practice makes perfect!",
        keywords=["bye", "goodbye", "see you", "farewell", "take care", "bye bye"],
    ),
    "not_available": _scenario(
        "no_db",
        # NOTE(review): the wording of this message looks garbled; it is kept
        # verbatim because the recorded audio/video may speak the same text.
        "I don't have the answer for that. Let's not available in my lesson today.",
        suggestions=[
            "Try asking about common tenses like present simple or past perfect",
            "Ask me about tense structures or examples",
            "Check if your question is specifically about English verb tenses",
        ],
    ),
    "out_of_syllabus": _scenario(
        "out_of_topic",
        "That's not part of our tense lesson. Let's stay on our topic.",
        keywords=[
            # sports
            "sports", "sport", "cricket", "ipl", "match", "score", "wicket", "runs", "bat", "bowling",
            "football", "basketball", "tennis", "hockey",
            # other non-tense topics
            "weather", "rain", "sunny", "temperature",
            "food", "pizza", "burger", "restaurant", "cooking",
            "movie", "music", "song", "artist", "film",
            "history", "science", "math", "politics", "geography", "economics", "physics",
            # general grammar (NOT tenses)
            "noun", "pronoun", "adjective", "adverb", "preposition", "conjunction",
            "punctuation", "comma", "full stop", "spelling", "vocabulary", "synonym", "antonym",
            "phonetics", "pronunciation",
        ],
    ),
    "not_understandable": _scenario(
        "not_understand",
        "I don't understand your question. Can you ask it again more simply?",
        suggestions=[
            "Try using simpler words",
            "Ask about specific tenses like 'What is present tense?'",
            "Ask for examples of tenses",
            "Check your spelling and grammar",
        ],
    ),
}

# Load questions from JSON file
def load_questions():
    """Load the Q&A entries from ``assets/qa.json``.

    The path is relative to the current working directory.

    Returns:
        list: the entries that are dicts with a string ``'question'`` value,
        in file order. An empty list is returned when the file is missing,
        is not valid JSON, or its top-level value is not a list.

    Malformed entries are skipped (and counted in a log line) rather than
    raising, so one bad record cannot crash module import through the debug
    tally below or through code that indexes ``item['question']``.
    """
    try:
        with open('assets/qa.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print("Error: qa.json not found")
        return []
    except json.JSONDecodeError as e:
        print(f"Error parsing qa.json: {e}")
        return []

    if not isinstance(data, list):
        print("Error: qa.json must contain a list of question entries")
        return []

    entries = [
        item for item in data
        if isinstance(item, dict) and isinstance(item.get('question'), str)
    ]
    skipped = len(data) - len(entries)
    if skipped:
        print(f"Skipped {skipped} malformed entries in qa.json")
    print(f"Loaded {len(entries)} questions from qa.json")

    # Debug: tally the questions that mention "present" by aspect. The first
    # matching aspect wins, checked in the order continuous/progressive,
    # perfect, simple.
    tense_categories = {}
    for item in entries:
        q = item['question'].lower()
        if 'present' not in q:
            continue
        if 'continuous' in q or 'progressive' in q:
            category = 'present_continuous'
        elif 'perfect' in q:
            category = 'present_perfect'
        elif 'simple' in q:
            category = 'present_simple'
        else:
            category = 'present_general'
        tense_categories[category] = tense_categories.get(category, 0) + 1

    print(f"Tense categories in database: {tense_categories}")
    return entries

# Spell correction function
def correct_spelling(text):
    """Return `text` with SymSpell spelling corrections applied.

    The text is returned unchanged when SymSpell is disabled or uninitialised,
    or when anything goes wrong during correction. Otherwise each
    whitespace-separated token longer than two characters is replaced by its
    closest dictionary suggestion (edit distance <= 2). A compound lookup is
    then run on the *original* text; when its result differs from the
    per-token result, the compound result is returned instead.
    """
    if not SYMSPELL_AVAILABLE or sym_spell is None:
        return text

    try:
        fixed_tokens = []
        for token in text.split():
            replacement = token
            # Very short tokens are passed through untouched.
            if len(token) > 2:
                candidates = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=2)
                if candidates and candidates[0].term != token:
                    replacement = candidates[0].term
                    print(f"Corrected '{token}' to '{replacement}'")
            fixed_tokens.append(replacement)
        per_token_result = ' '.join(fixed_tokens)

        # Whole-phrase pass; it wins whenever it disagrees with the per-token pass.
        compound = sym_spell.lookup_compound(text, max_edit_distance=2)
        if compound and compound[0].term != per_token_result:
            print(f"Bigram correction: '{text}' -> '{compound[0].term}'")
            return compound[0].term

        return per_token_result
    except Exception as e:
        print(f"Spell correction error: {e}")
        return text

# Enhanced text preprocessing
def preprocess_text(text):
    """Normalise `text` into a space-joined string of tokens for matching.

    Steps: optional spell correction, lowercasing, punctuation replaced by
    spaces, whitespace collapsed. With NLTK available the text is then
    tokenised, stripped of English stopwords (auxiliary verbs are kept) and
    lemmatised as verbs; if that fails, the cleaned text is returned as-is.
    Without NLTK a small built-in stopword list is removed instead.
    """
    if SYMSPELL_AVAILABLE:
        text = correct_spelling(text)

    # lowercase -> punctuation to spaces -> single spaces
    text = ' '.join(re.sub(r'[^\w\s]', ' ', text.lower()).split())

    if not NLTK_AVAILABLE:
        # Words that always survive, even if they are also listed as stopwords.
        always_keep = {'tense', 'tenses', 'present', 'past', 'future',
                       'continuous', 'perfect', 'simple', 'progressive',
                       'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had',
                       'do', 'does', 'did', 'will', 'shall', 'would', 'could', 'should'}
        drop = {'a', 'an', 'the', 'of', 'in', 'on', 'at', 'by', 'for',
                'with', 'about', 'against', 'between', 'into', 'through',
                'during', 'before', 'after', 'above', 'below', 'to', 'from',
                'up', 'down', 'out', 'off', 'over', 'under', 'again',
                'further', 'then', 'once', 'here', 'there', 'when', 'where',
                'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
                'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
                'own', 'same', 'so', 'than', 'too', 'very', 'can', 'may',
                'might', 'must', 'ought', 'shall', 'should', 'will', 'would'}
        return ' '.join(
            word for word in text.split()
            if word in always_keep or word not in drop
        )

    try:
        tokens = word_tokenize(text)

        # Auxiliaries carry tense information, so they are excluded from the
        # stopword set before filtering.
        auxiliaries = {'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had',
                       'do', 'does', 'did', 'will', 'shall', 'would', 'could', 'should'}
        removable = set(stopwords.words('english')) - auxiliaries
        kept = [token for token in tokens if token not in removable]

        verb_lemmatizer = WordNetLemmatizer()
        return ' '.join(verb_lemmatizer.lemmatize(token, pos='v') for token in kept)
    except Exception as e:
        print(f"Error in NLP processing: {e}")
        # Fall back to the cleaned (lowercased, de-punctuated) text.
        return text

def _contains_term(term, text):
    """Return True if `term` occurs in `text` as a whole word or phrase."""
    return re.search(rf"\b{re.escape(term)}\b", text) is not None


def _scenario_response(name, message=None):
    """Build the response payload for the SCENARIOS entry called `name`.

    `message` overrides the entry's own message (used for the time-of-day
    greeting, whose configured message is a dict of variants).
    """
    config = SCENARIOS[name]
    return {
        "scenario": name,
        "message": config["message"] if message is None else message,
        "audio_url": config["audio_url"],
        "video_url": config["video_url"],
        "story_url": config.get("story_url", ""),
        "detail_url": config.get("detail_url", ""),
        "example_url": config.get("example_url", "")
    }


def detect_scenario(user_question):
    """Detect if the user input matches any special scenario.

    Checks run in priority order: greeting, thanks, farewell, out-of-syllabus
    topic, then "not understandable" (too short, or very long average word
    length).

    Keywords are matched as whole words/phrases, not raw substrings, so that
    e.g. "hi" does not fire on "this", "hey" on "they", or "is" on "this".
    This is the same matching style check_topic_relevance uses for the
    out-of-syllabus keyword list.

    Args:
        user_question: raw question text from the user.

    Returns:
        dict with keys scenario, message, audio_url, video_url, story_url,
        detail_url and example_url when a scenario applies; None otherwise.
    """
    question_lower = user_question.lower().strip()

    # Greetings, thanks and farewell have the highest priority.
    if any(_contains_term(kw, question_lower) for kw in SCENARIOS["greeting"]["keywords"]):
        # Pick the greeting text from the server's local hour.
        current_hour = datetime.now().hour
        if current_hour < 12:
            greeting_type = "morning"
        elif current_hour < 17:
            greeting_type = "afternoon"
        else:
            greeting_type = "evening"
        return _scenario_response("greeting", SCENARIOS["greeting"]["message"][greeting_type])

    for name in ("thanks", "farewell"):
        if any(_contains_term(kw, question_lower) for kw in SCENARIOS[name]["keywords"]):
            return _scenario_response(name)

    # Out of syllabus: fires when off-topic keywords outnumber tense-related ones.
    out_hits = [
        kw for kw in SCENARIOS["out_of_syllabus"]["keywords"]
        if _contains_term(kw, question_lower)
    ]
    if out_hits:
        tense_keywords = ['tense', 'tenses', 'present', 'past', 'future',
                          'continuous', 'perfect', 'simple', 'progressive',
                          'verb', 'verbs', 'grammar', 'am', 'is', 'are',
                          'was', 'were', 'have', 'has', 'had']
        tense_hits = [kw for kw in tense_keywords if _contains_term(kw, question_lower)]

        # At least as many tense words as off-topic words: treat it as a
        # tense question and let the normal matching pipeline handle it.
        if len(tense_hits) >= len(out_hits):
            return None
        return _scenario_response("out_of_syllabus")

    # Not understandable: fewer than two characters once punctuation is removed.
    clean_text = re.sub(r'[^\w\s]', '', question_lower)
    if len(clean_text.strip()) < 2:
        return _scenario_response("not_understandable")

    # Gibberish heuristic: an average word length above 15 characters.
    words = clean_text.split()
    if words:
        avg_word_len = sum(len(word) for word in words) / len(words)
        if avg_word_len > 15:
            return _scenario_response("not_understandable")

    return None

def check_topic_relevance(user_question):
    """Decide whether `user_question` is about English tenses.

    Returns True when the question names "tense(s)", a tense phrase, a time
    word together with an aspect word, or a helping verb together with an
    intent word (use, rule, example, ...). Returns False when it contains an
    out-of-syllabus keyword without the word "tense(s)", or when none of the
    above applies.
    """
    q = user_question.lower().strip()
    out_words = SCENARIOS["out_of_syllabus"].get("keywords", [])

    def has_word(word):
        # Whole-word (or whole-phrase) match inside the question.
        return re.search(rf"\b{re.escape(word)}\b", q) is not None

    def has_any(words):
        return any(has_word(word) for word in words)

    # The word "tense"/"tenses" settles it, even next to off-topic words.
    if re.search(r"\btense(s)?\b", q):
        return True

    # Off-topic keyword and no explicit "tense": out of syllabus.
    if has_any(out_words):
        return False

    # Named tenses, matched as plain substrings.
    tense_phrases = (
        "present simple", "past simple", "future simple",
        "present continuous", "past continuous", "future continuous",
        "present perfect", "past perfect", "future perfect",
        "present perfect continuous", "past perfect continuous", "future perfect continuous",
    )
    for phrase in tense_phrases:
        if phrase in q:
            return True

    # A time word together with an aspect word is likely a tense question.
    time_words = ["present", "past", "future"]
    aspect_words = ["simple", "continuous", "perfect", "progressive"]
    if has_any(time_words) and has_any(aspect_words):
        return True

    # Usage/rule/structure questions about helping verbs still count.
    helpers = ["am", "is", "are", "was", "were", "have", "has", "had", "do", "does", "did", "will", "shall", "would", "could", "should"]
    intent_words = ["use", "using", "when", "rule", "rules", "structure", "form", "difference", "between", "meaning", "example", "examples"]
    return has_any(helpers) and has_any(intent_words)

# Build the searchable corpus once, at import time: the raw entries, their
# question strings, and the preprocessed form of each question.
questions_data = load_questions()
question_texts = [entry['question'] for entry in questions_data]
preprocessed_questions = list(map(preprocess_text, question_texts))

# TF-IDF model over unigrams and bigrams. The matrix stays None when no
# questions were loaded, because there is nothing to fit.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = None
if preprocessed_questions:
    tfidf_matrix = vectorizer.fit_transform(preprocessed_questions)
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

def calculate_similarity(user_question):
    """Return the cosine similarity of `user_question` to every stored question.

    The question is preprocessed, vectorised with the module-level TF-IDF
    vectorizer and compared against `tfidf_matrix`. An empty array is returned
    when no questions were loaded.
    """
    if not preprocessed_questions:
        return np.array([])

    query_vector = vectorizer.transform([preprocess_text(user_question)])
    # cosine_similarity yields one row per query; there is exactly one query.
    return cosine_similarity(query_vector, tfidf_matrix)[0]

def keyword_match(user_question, questions):
    """Rank `questions` by preprocessed-word overlap with `user_question`.

    Each entry of `questions` must have a 'question' key. Entries sharing at
    least one preprocessed word with the user question are returned as dicts
    with 'index' (position in `questions`), 'score' (shared words divided by
    the larger of the two word-set sizes) and 'common_words', sorted by score
    in descending order.
    """
    query_terms = set(preprocess_text(user_question).split())
    scored = []

    for position, entry in enumerate(questions):
        candidate_terms = set(preprocess_text(entry['question']).split())
        shared = query_terms.intersection(candidate_terms)
        if not shared:
            continue

        larger_size = max(len(query_terms), len(candidate_terms))
        scored.append({
            'index': position,
            'score': len(shared) / larger_size,
            'common_words': list(shared)
        })

    return sorted(scored, key=lambda match: match['score'], reverse=True)

def verify_match_relevance(user_q, matched_q, matched_answer):
    """Heuristically check that a matched Q&A entry answers the user's question.

    The match is rejected (False) in three cases:
      * the user asks for a comparison ("difference", "compare", "versus")
        but the answer contains no comparison wording;
      * the user asks "how to"/"how do" but the answer contains neither "how"
        nor any instructional word (step, first, then, ...);
      * the user asks something specific ("difference between", "when to
        use", ...), the matched question is a generic "what is/are/does/do"
        question, and none of the first three words following the specific
        phrase (longer than two characters) appear in the answer.
    Everything else is accepted (True).

    Args:
        user_q: the user's question.
        matched_q: the stored question that was matched.
        matched_answer: the stored answer of that entry.

    Returns:
        bool: True if the match is considered relevant.
    """
    user_q_lower = user_q.lower()
    matched_q_lower = matched_q.lower()
    matched_answer_lower = matched_answer.lower()

    # Debug aid only: log when both questions use the same kind of starter.
    question_starters = {
        'what': ['what is', 'what are', 'what does', 'what do'],
        'how': ['how to', 'how do', 'how does'],
        'when': ['when to', 'when do', 'when does'],
        'why': ['why do', 'why does', 'why is']
    }

    def starter_type(text):
        # The last starter group found in `text` wins.
        found = None
        for kind, starters in question_starters.items():
            if any(starter in text for starter in starters):
                found = kind
        return found

    user_starter = starter_type(user_q_lower)
    if user_starter and user_starter == starter_type(matched_q_lower):
        print(f"Both are {user_starter} questions - accepting match")

    # Comparison request must be met by an answer that actually compares.
    # "versus" is accepted in the answer as well as "vs": the question check
    # already treats "versus" as a comparison cue, so an answer phrased with
    # the same word must not be rejected.
    if any(cue in user_q_lower for cue in ('difference', 'compare', 'versus')):
        if not any(cue in matched_answer_lower
                   for cue in ('difference', 'compare', 'vs', 'versus')):
            print("User asked for differences but answer doesn't compare - rejecting")
            return False

    # "How to" request must be met by an instructional answer.
    if ('how to' in user_q_lower or 'how do' in user_q_lower) and 'how' not in matched_answer_lower:
        instruction_words = ['step', 'first', 'second', 'then', 'next', 'finally', 'process']
        if not any(word in matched_answer_lower for word in instruction_words):
            print("User asked 'how to' but answer is not instructional - rejecting")
            return False

    # Specific request answered by a generic "what is ..." entry.
    generic_questions = ['what is', 'what are', 'what does', 'what do']
    specific_questions = ['difference between', 'how to use', 'when to use',
                          'compare', 'explain the difference', 'give example of']

    user_is_specific = any(phrase in user_q_lower for phrase in specific_questions)
    match_is_generic = any(phrase in matched_q_lower for phrase in generic_questions)

    if user_is_specific and match_is_generic:
        user_specific_terms = []
        for phrase in specific_questions:
            if phrase in user_q_lower:
                # Up to three words that follow the first specific phrase found.
                idx = user_q_lower.find(phrase) + len(phrase)
                user_specific_terms = user_q_lower[idx:].strip().split()[:3]
                break

        if user_specific_terms:
            if not any(term in matched_answer_lower
                       for term in user_specific_terms if len(term) > 2):
                print("User asked specific, match is generic - likely wrong")
                return False

    # Informational only: log the meaningful words both questions share.
    stopwords_set = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
    common_core = {
        word for word in set(user_q_lower.split()) & set(matched_q_lower.split())
        if word not in stopwords_set and len(word) > 2
    }
    if len(common_core) >= 2:
        print(f"Common core words: {common_core} - accepting match")

    # Nothing argued against the match, so trust the similarity ranking.
    return True

def verify_tense_specificity(user_q, matched_q, matched_answer):
    """Redirect a general tense question away from an over-specific match.

    When the user asks about "<time> tense" in general (present, past or
    future, with no "continuous"/"perfect" in the question) but the matched
    question is about "<time> continuous" or "<time> perfect", return the
    index in `questions_data` of the first stored question that mentions
    "<time> tense" without "continuous" or "perfect". Return None when no
    override is needed or no such question exists. `matched_answer` is
    accepted but not used.
    """
    asked = user_q.lower()
    matched = matched_q.lower()

    # A question that names an aspect itself is already specific.
    if 'continuous' in asked or 'perfect' in asked:
        return None

    for time_word in ('present', 'past', 'future'):
        general_phrase = f'{time_word} tense'
        if general_phrase not in asked:
            continue

        # Only the first time word named in the question is considered.
        too_specific = (
            f'{time_word} continuous' in matched
            or f'{time_word} perfect' in matched
        )
        if too_specific:
            for idx, entry in enumerate(questions_data):
                stored = entry['question'].lower()
                if (general_phrase in stored
                        and 'continuous' not in stored
                        and 'perfect' not in stored):
                    return idx
        break

    return None  # No need to override

def _not_available_response(user_question, debug_info=None):
    """Build the JSON response for the 'not_available' scenario.

    Args:
        user_question: The question text echoed back to the client.
        debug_info: Optional dict attached under 'debug_info' when given.

    Returns:
        The Flask JSON response built from ``SCENARIOS['not_available']``.
    """
    scenario = SCENARIOS['not_available']
    payload = {
        'success': True,
        'scenario': 'not_available',
        'message': scenario['message'],
        'suggestions': scenario['suggestions'],
        'audio_url': scenario['audio_url'],
        'video_url': scenario['video_url'],
        'story_url': scenario.get('story_url', ''),
        'detail_url': scenario.get('detail_url', ''),
        'example_url': scenario.get('example_url', ''),
        'user_question': user_question,
        'matching_method': 'scenario',
    }
    if debug_info is not None:
        payload['debug_info'] = debug_info
    return jsonify(payload)


def _matched_question_response(matched_question, user_question, score, method, **extra):
    """Build the JSON response for a successfully matched question.

    Args:
        matched_question: The matched entry from ``questions_data``.
        user_question: The question text echoed back to the client.
        score: Confidence score of the match (converted to ``float``).
        method: Value reported as 'matching_method' ('tfidf' or 'keyword').
        **extra: Additional method-specific fields merged into the payload.

    Returns:
        The Flask JSON response describing the match.
    """
    payload = {
        'success': True,
        'matched_question': matched_question['question'],
        'answer': matched_question['answer'],
        'sno': matched_question['sno'],
        'audio_url': matched_question.get('audio_url', ''),
        'video_url': matched_question.get('video_url', ''),
        'story_url': matched_question.get('story_url', ''),
        'detail_url': matched_question.get('detail_url', ''),
        'example_url': matched_question.get('example_url', ''),
        'confidence_score': float(score),
        'user_question': user_question,
        'matching_method': method,
    }
    payload.update(extra)
    return jsonify(payload)


@staticchat_bp.route('/search', methods=['POST'])
def search_question():
    """Answer a user question posted as JSON ``{"question": "..."}``.

    Processing order:
      1. Special scenarios detected by ``detect_scenario``.
      2. Topic relevance check; irrelevant questions get 'out_of_syllabus'.
      3. TF-IDF similarity match (with tense-specificity override), verified
         by ``verify_match_relevance``.
      4. Keyword matching fallback, verified the same way.
      5. 'not_available' scenario when nothing matched well enough.

    Returns:
        A JSON response. HTTP 400 when the body is missing, is not a JSON
        object, or carries no non-empty string 'question'; HTTP 500 on
        unexpected errors; HTTP 200 otherwise.
    """
    try:
        # silent=True: a missing or malformed JSON body yields None instead of
        # raising, so bad client input is reported as 400 rather than 500.
        data = request.get_json(silent=True)
        raw_question = data.get('question', '') if isinstance(data, dict) else None
        original_question = raw_question.strip() if isinstance(raw_question, str) else ''

        if not original_question:
            return jsonify({
                'success': False,
                'message': 'Please provide a question'
            }), 400

        print(f"\n=== Processing: '{original_question}' ===")

        # First, check for special scenarios
        scenario_result = detect_scenario(original_question)
        if scenario_result:
            print(f"Detected scenario: {scenario_result['scenario']}")  # Debug log
            return jsonify({
                'success': True,
                'scenario': scenario_result['scenario'],
                'message': scenario_result['message'],
                'audio_url': scenario_result.get('audio_url', ''),
                'video_url': scenario_result.get('video_url', ''),
                'story_url': scenario_result.get('story_url', ''),
                'detail_url': scenario_result.get('detail_url', ''),
                'example_url': scenario_result.get('example_url', ''),
                'user_question': original_question,
                'matching_method': 'scenario'
            })

        print("No scenario detected, checking topic relevance...")  # Debug log

        # Check if question is related to tenses
        is_topic_relevant = check_topic_relevance(original_question)
        print(f"Topic relevant: {is_topic_relevant}")  # Debug log

        if not is_topic_relevant:
            # If not relevant and not caught by out_of_syllabus scenario
            out_of_syllabus = SCENARIOS['out_of_syllabus']
            return jsonify({
                'success': True,
                'scenario': 'out_of_syllabus',
                'message': out_of_syllabus['message'],
                'audio_url': out_of_syllabus['audio_url'],
                'video_url': out_of_syllabus['video_url'],
                'story_url': out_of_syllabus.get('story_url', ''),
                'detail_url': out_of_syllabus.get('detail_url', ''),
                'example_url': out_of_syllabus.get('example_url', ''),
                'user_question': original_question,
                'matching_method': 'scenario'
            })

        # Calculate similarity if we have questions
        if not preprocessed_questions:
            return _not_available_response(original_question)

        similarity_scores = calculate_similarity(original_question)

        # len() rather than truthiness: argmax() below implies an array-like
        # result, whose truth value may be ambiguous.
        if len(similarity_scores) == 0:  # No questions loaded
            return _not_available_response(original_question)

        # Get the best match
        best_match_idx = similarity_scores.argmax()
        best_score = similarity_scores[best_match_idx]

        print(f"Best TF-IDF score: {best_score:.3f}")  # Debug log
        print(f"Matched to question #{best_match_idx + 1}: {questions_data[best_match_idx]['question']}")  # Debug log

        # Check if we need to override for tense specificity
        override_idx = verify_tense_specificity(
            original_question,
            questions_data[best_match_idx]['question'],
            questions_data[best_match_idx]['answer']
        )

        if override_idx is not None:
            best_match_idx = override_idx
            best_score = 0.9  # Set high score for exact match
            print(f"Overriding to general tense question: {questions_data[best_match_idx]['question']}")

        # Set higher threshold for matching - INCREASED to prevent wrong matches
        tfidf_threshold = 0.35  # Increased from 0.2 to 0.35
        keyword_threshold = 0.25  # Increased from 0.1 to 0.25

        if best_score > tfidf_threshold:
            # Verify the match is actually relevant
            matched_question = questions_data[best_match_idx]
            is_relevant = verify_match_relevance(original_question,
                                                 matched_question['question'],
                                                 matched_question['answer'])

            if is_relevant:
                # Good match found with TF-IDF
                return _matched_question_response(
                    matched_question,
                    original_question,
                    best_score,
                    'tfidf',
                    spell_corrected=original_question if SYMSPELL_AVAILABLE else 'not_available',
                )

            # Match is not actually relevant: fall through to the fallback
            print(f"Match verification failed. Score: {best_score:.3f}")
        else:
            # Score below threshold
            print(f"Score below threshold. Score: {best_score:.3f}, Threshold: {tfidf_threshold}")

        # Try keyword matching as fallback (with higher threshold)
        keyword_matches = keyword_match(original_question, questions_data)

        print(f"Keyword matches found: {len(keyword_matches)}")  # Debug log
        if keyword_matches:
            print(f"Best keyword score: {keyword_matches[0]['score']:.3f}")  # Debug log

        if keyword_matches and keyword_matches[0]['score'] > keyword_threshold:
            best_keyword_match = keyword_matches[0]
            matched_question = questions_data[best_keyword_match['index']]

            # Verify keyword match too
            is_relevant = verify_match_relevance(original_question,
                                                 matched_question['question'],
                                                 matched_question['answer'])

            if is_relevant:
                return _matched_question_response(
                    matched_question,
                    original_question,
                    best_keyword_match['score'],
                    'keyword',
                    common_words=best_keyword_match['common_words'],
                )

            print("Keyword match verification failed")

        # No good match found but question is tense-related
        return _not_available_response(
            original_question,
            debug_info={
                'best_tfidf_score': float(best_score),
                'best_keyword_score': keyword_matches[0]['score'] if keyword_matches else 0
            },
        )

    except Exception as e:
        # Top-level boundary for the route: log the traceback and report 500.
        print(f"Error in search_question: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({
            'success': False,
            'message': f'Error processing request: {str(e)}'
        }), 500

@staticchat_bp.route('/questions', methods=['GET'])
def get_all_questions():
    """Return the serial number and text of every loaded question.

    Responds with ``{'success': True, 'questions': [...], 'count': N}``, or
    with HTTP 500 and the error message if anything raises.
    """
    try:
        # Keep only the fields needed for autocomplete
        summaries = []
        for entry in load_questions():
            summaries.append({'sno': entry['sno'], 'question': entry['question']})

        body = {
            'success': True,
            'questions': summaries,
            'count': len(summaries),
        }
        return jsonify(body)
    except Exception as e:
        failure = {'success': False, 'message': str(e)}
        return jsonify(failure), 500

@staticchat_bp.route('/question/<int:sno>', methods=['GET'])
def get_question_by_sno(sno):
    """Look up a single question by its serial number.

    Responds with the full question record on success, HTTP 404 when no
    record has the given ``sno``, and HTTP 500 if anything raises.
    """
    try:
        all_questions = load_questions()

        # First record whose serial number matches, if any
        found = None
        for record in all_questions:
            if record['sno'] == sno:
                found = record
                break

        if not found:
            not_found = {
                'success': False,
                'message': f'Question with SNO {sno} not found',
            }
            return jsonify(not_found), 404

        return jsonify({'success': True, 'question': found})
    except Exception as e:
        failure = {'success': False, 'message': str(e)}
        return jsonify(failure), 500

@staticchat_bp.route('/suggestions', methods=['GET'])
def get_suggestions():
    """Return a random selection of question texts as suggestions.

    Query parameters:
        count: Desired number of suggestions (default 5). Values that do not
            parse as an integer fall back to the default; the value is
            clamped to the range 0..len(questions_data).

    Responds with ``{'success': True, 'suggestions': [...], 'count': N}``.
    When no questions are loaded, responds with ``success: False`` and an
    empty list. HTTP 500 is returned if anything raises.
    """
    try:
        if not questions_data:
            return jsonify({
                'success': False,
                'message': "No questions available.",
                'suggestions': []
            })

        # Get parameter for number of suggestions
        requested = request.args.get('count', default=5, type=int)

        # random.sample raises ValueError for a negative sample size or one
        # larger than the population, so clamp to what is available instead
        # of turning a bad query parameter into a 500.
        sample_size = max(0, min(requested, len(questions_data)))

        # Get random questions for suggestions
        import random
        random_questions = random.sample(questions_data, sample_size)
        suggestions = [q['question'] for q in random_questions]

        return jsonify({
            'success': True,
            'suggestions': suggestions,
            'count': len(suggestions)
        })
    except Exception as e:
        print(f"Error in get_suggestions: {str(e)}")
        return jsonify({
            'success': False,
            'message': str(e),
            'suggestions': []
        }), 500

@staticchat_bp.route('/scenarios', methods=['GET'])
def get_scenarios():
    """Summarise every entry in ``SCENARIOS``.

    For each scenario the response reports its type (defaulting to
    "scenario"), whether audio and video URLs are set, and its keyword list.
    HTTP 500 with the error message is returned if anything raises.
    """
    try:
        summary = {
            name: {
                "type": details.get("type", "scenario"),
                "has_audio": bool(details.get("audio_url")),
                "has_video": bool(details.get("video_url")),
                "keywords": details.get("keywords", []),
            }
            for name, details in SCENARIOS.items()
        }

        body = {
            'success': True,
            'scenarios': summary,
            'count': len(summary),
        }
        return jsonify(body)
    except Exception as e:
        failure = {'success': False, 'message': str(e)}
        return jsonify(failure), 500

@staticchat_bp.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe an uploaded audio file with the local Whisper model.

    Expects a multipart form with a 'file' field and an optional 'language'
    field. The upload is written to a temporary file, transcribed, and the
    temporary file is removed afterwards.

    Returns:
        ``{"text": ...}`` on success; HTTP 400 when no file was uploaded;
        HTTP 503 when the Whisper model is not loaded; HTTP 500 with the
        error message if saving or transcription fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file field named 'file'"}), 400

    f = request.files["file"]
    if not f:
        return jsonify({"error": "No file uploaded"}), 400

    # Whisper is loaded optionally at import time; without it `model` is None
    # and calling model.transcribe would fail with an opaque AttributeError.
    if not MODEL_AVAILABLE or model is None:
        return jsonify({"error": "Speech-to-text model is not available on this server"}), 503

    # Optional language from client: en / hi / ta
    language = request.form.get("language")  # may be None

    tmp_path = None
    try:
        # Keep a suffix so ffmpeg/whisper detects it better
        suffix = os.path.splitext(f.filename or "")[1].lower()
        if not suffix:
            suffix = ".webm"  # safe default for browser uploads

        # Reserve a unique path, then close the handle before writing by name:
        # a NamedTemporaryFile that is still open cannot be reopened by name
        # on every platform (notably Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
        f.save(tmp_path)

        # Run local whisper
        result = model.transcribe(
            tmp_path,
            language=language or None,
            fp16=False  # CPU-only: must be False
        )

        text = (result.get("text") or "").strip()
        return jsonify({"text": text})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

    finally:
        # Best-effort cleanup: never let a failed delete mask the response.
        if tmp_path:
            try:
                os.remove(tmp_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Could not remove temporary file '{tmp_path}': {cleanup_error}")