File size: 43,567 Bytes
0815850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
from flask import Blueprint, request, jsonify, current_app
import json
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import string
import tempfile
from datetime import datetime

# Optional speech-to-text backend (whisper).
# Both the import and the model load are attempted right here, at module import
# time. Every failure is caught so that importing this module never crashes the
# app; in that case `model` stays None and MODEL_AVAILABLE stays False.
MODEL_NAME = "base"  # model name handed to whisper.load_model()
model = None  # loaded whisper model object, or None when unavailable
MODEL_AVAILABLE = False  # flipped to True only after load_model() succeeded
try:
    import whisper
    try:
        model = whisper.load_model(MODEL_NAME)
        MODEL_AVAILABLE = True
        print(f"Whisper model '{MODEL_NAME}' loaded successfully")
    except Exception as ex:
        # The package imported, but loading the model itself failed
        print(f"Whisper installed but failed to load model '{MODEL_NAME}': {ex}")
        model = None
        MODEL_AVAILABLE = False
except Exception as ex:
    # The package is missing or raised while being imported
    print(f"Whisper not available: {ex}")
    model = None
    MODEL_AVAILABLE = False

# Optional spell-checking dependency (symspellpy).
# SYMSPELL_AVAILABLE records whether the import worked; the code that uses the
# spell checker tests this flag and falls back to the uncorrected text.
try:
    from symspellpy import SymSpell, Verbosity
    import pkg_resources  # used below to locate the dictionaries shipped with symspellpy
    SYMSPELL_AVAILABLE = True
except ImportError:
    print("SymSpell not available. Please install: pip install symspellpy")
    SYMSPELL_AVAILABLE = False


# Flask blueprint that the route handlers in this module are registered on.
staticchat_bp = Blueprint("staticchat", __name__)

# NOTE: Blueprints do not have a config dict. MAX_CONTENT_LENGTH must be set on the Flask app.
# If you want to enforce max content size, set app.config["MAX_CONTENT_LENGTH"] when creating the Flask app.

# Initialize SymSpell if available.
# `sym_spell` is the shared spell-checker instance used by correct_spelling().
# It stays None when symspellpy could not be imported.
sym_spell = None
if SYMSPELL_AVAILABLE:
    try:
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        # Resolve the paths of the English frequency dictionaries bundled
        # inside the symspellpy package.
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt"
        )
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
        )

        # Load dictionaries
        sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=1)
        print("SymSpell spell checker initialized successfully")
    except Exception as e:
        # Any failure disables spell checking. Note that `sym_spell` may still
        # hold a partially initialised object here, which is why callers must
        # check SYMSPELL_AVAILABLE and not only `sym_spell is None`.
        print(f"Failed to initialize SymSpell: {e}")
        SYMSPELL_AVAILABLE = False

# Try to import NLTK with fallback.
# NLTK_AVAILABLE ends up True only if the imports and all three resource
# checks below complete without raising; preprocess_text() uses a simpler
# stop-word filter when it is False.
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer

    # Download required NLTK resources.
    # Each resource is looked up first; the download is attempted only when
    # the lookup raises LookupError (resource not found).
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)

    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords', quiet=True)

    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet', quiet=True)

    NLTK_AVAILABLE = True
except Exception as e:
    # Covers both a missing package and any error raised during the checks above
    print(f"NLTK not available, using simple text processing: {e}")
    NLTK_AVAILABLE = False

# Enhanced Scenario configurations.
# Canned responses for inputs that are not looked up in the Q&A database.
# Every entry carries a "message" plus media/resource URLs ("audio_url",
# "video_url", "story_url", "detail_url", "example_url") and a "type" marker.
# Entries with a "keywords" list are triggered by detect_scenario() when one of
# those keywords appears in the user's text; "not_available" and
# "not_understandable" have no keywords and also provide "suggestions".
SCENARIOS = {
    # "message" here is a dict keyed by time of day; detect_scenario() picks
    # "morning"/"afternoon"/"evening" from the current hour.
    # NOTE(review): the "general" variant is not referenced by the code visible
    # in this part of the file - confirm whether it is used elsewhere.
    "greeting": {
        "keywords": ["good morning", "good afternoon", "good evening", "hello", "hi", "hey", "greetings"],
        "message": {
            "morning": "Good morning! Let's begin our lesson on tenses. You can ask me any question about tenses",
            "afternoon": "Good afternoon! Let's begin our lesson on tenses. You can ask me any question about tenses",
            "evening": "Good evening! Let's begin our lesson on tenses. You can ask me any question about tenses",
            "general": "Hello! Welcome to the English Tenses Learning Assistant. How can I help you with tenses today?"
        },
        "audio_url": "assets/staticchat/intro.mp3",
        "video_url": "assets/staticchat/intro.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    "thanks": {
        "keywords": ["thank you", "thanks", "thank you very much", "appreciate it", "thanks a lot"],
        "message": "You're welcome! Do you have any other questions?",
        "audio_url": "assets/staticchat/you_are_welcome.mp3",
        "video_url": "assets/staticchat/you_are_welcome.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    "farewell": {
        "keywords": ["bye", "goodbye", "see you", "farewell", "take care", "bye bye"],
        "message": "Goodbye! Keep practicing your English tenses. Remember, practice makes perfect!",
        "audio_url": "assets/staticchat/bye.mp3",
        "video_url": "assets/staticchat/bye.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    # No keywords: not selected by detect_scenario().
    "not_available": {
        "message": "I don't have the answer for that. Let's not available in my lesson today.",
        "suggestions": [
            "Try asking about common tenses like present simple or past perfect",
            "Ask me about tense structures or examples",
            "Check if your question is specifically about English verb tenses"
        ],
        "audio_url": "assets/staticchat/no_db.mp3",
        "video_url": "assets/staticchat/no_db.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    # These keywords are also read by check_topic_relevance().
    "out_of_syllabus": {
        "keywords": [
            # sports
            "sports", "sport", "cricket", "ipl", "match", "score", "wicket", "runs", "bat", "bowling",
            "football", "basketball", "tennis", "hockey",
            # other non-tense topics
            "weather", "rain", "sunny", "temperature",
            "food", "pizza", "burger", "restaurant", "cooking",
            "movie", "music", "song", "artist", "film",
            "history", "science", "math", "politics", "geography", "economics", "physics",
            # general grammar (NOT tenses)
            "noun", "pronoun", "adjective", "adverb", "preposition", "conjunction",
            "punctuation", "comma", "full stop", "spelling", "vocabulary", "synonym", "antonym",
            "phonetics", "pronunciation"
        ],
        "message": "That's not part of our tense lesson. Let's stay on our topic.",
        "audio_url": "assets/staticchat/out_of_topic.mp3",
        "video_url": "assets/staticchat/out_of_topic.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    },
    # Returned by detect_scenario() for near-empty input or very long "words".
    "not_understandable": {
        "message": "I don't understand your question. Can you ask it again more simply?",
        "suggestions": [
            "Try using simpler words",
            "Ask about specific tenses like 'What is present tense?'",
            "Ask for examples of tenses",
            "Check your spelling and grammar"
        ],
        "audio_url": "assets/staticchat/not_understand.mp3",
        "video_url": "assets/staticchat/not_understand.mp4",
        "story_url": "",
        "detail_url": "",
        "example_url": "",
        "type": "scenario"
    }
}

# Read the question/answer database from disk
def load_questions():
    """Load the Q&A entries stored in ``assets/qa.json``.

    The path is relative to the current working directory. A summary of the
    entries (total count and a breakdown of the present-tense questions) is
    printed for debugging.

    Returns:
        The parsed JSON content, or an empty list when the file does not
        exist or is not valid JSON.
    """
    try:
        with open('assets/qa.json', 'r', encoding='utf-8') as qa_file:
            entries = json.load(qa_file)
    except FileNotFoundError:
        print("Error: qa.json not found")
        return []
    except json.JSONDecodeError as e:
        print(f"Error parsing qa.json: {e}")
        return []

    print(f"Loaded {len(entries)} questions from qa.json")

    # Debug aid: tally the questions that mention "present" by sub-type.
    # Only present-tense questions are categorised; everything else is ignored.
    tense_categories = {}
    for entry in entries:
        lowered = entry['question'].lower()
        if 'present' not in lowered:
            continue

        if 'continuous' in lowered or 'progressive' in lowered:
            bucket = 'present_continuous'
        elif 'perfect' in lowered:
            bucket = 'present_perfect'
        elif 'simple' in lowered:
            bucket = 'present_simple'
        else:
            bucket = 'present_general'
        tense_categories[bucket] = tense_categories.get(bucket, 0) + 1

    print(f"Tense categories in database: {tense_categories}")
    return entries

# Spelling correction backed by the shared SymSpell instance
def correct_spelling(text):
    """Return ``text`` with spelling corrected by SymSpell.

    Words longer than two characters are looked up one by one and replaced by
    the closest dictionary term. The original text is then also run through
    SymSpell's compound lookup; when that produces something different from
    the word-by-word result, the compound result is returned instead.

    The input is returned unchanged when SymSpell is unavailable or when any
    error occurs during correction.
    """
    spell_checker_ready = SYMSPELL_AVAILABLE and sym_spell is not None
    if not spell_checker_ready:
        return text

    try:
        fixed_tokens = []
        for token in text.split():
            replacement = token
            # Very short words are left alone
            if len(token) > 2:
                candidates = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=2)
                if candidates and candidates[0].term != token:
                    replacement = candidates[0].term
                    print(f"Corrected '{token}' to '{replacement}'")
            fixed_tokens.append(replacement)

        word_level_result = ' '.join(fixed_tokens)

        # Compound lookup works on the whole original text
        compound = sym_spell.lookup_compound(text, max_edit_distance=2)
        if not compound or compound[0].term == word_level_result:
            return word_level_result

        print(f"Bigram correction: '{text}' -> '{compound[0].term}'")
        return compound[0].term
    except Exception as e:
        print(f"Spell correction error: {e}")
        return text

# Text normalisation used before vectorising and keyword comparison
def preprocess_text(text):
    """Normalise ``text`` for matching and return it as a space-joined string.

    Steps: optional spelling correction, lowercasing, punctuation replaced by
    spaces, whitespace collapsed, stop words removed. With NLTK available the
    remaining tokens are also lemmatized as verbs; if the NLTK pipeline raises,
    the cleaned (but unfiltered) text is returned. Auxiliary verbs that signal
    tense are never treated as stop words.
    """
    if SYMSPELL_AVAILABLE:
        text = correct_spelling(text)

    # Lowercase, swap every non-word/non-space character for a space,
    # then squeeze runs of whitespace down to single spaces.
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    text = ' '.join(text.split())

    if not NLTK_AVAILABLE:
        # Simple fallback: drop a fixed stop-word list, except for words
        # that matter for tense questions.
        protected = {'tense', 'tenses', 'present', 'past', 'future',
                     'continuous', 'perfect', 'simple', 'progressive',
                     'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had',
                     'do', 'does', 'did', 'will', 'shall', 'would', 'could', 'should'}
        basic_stopwords = {'a', 'an', 'the', 'of', 'in', 'on', 'at', 'by', 'for',
                           'with', 'about', 'against', 'between', 'into', 'through',
                           'during', 'before', 'after', 'above', 'below', 'to', 'from',
                           'up', 'down', 'out', 'off', 'over', 'under', 'again',
                           'further', 'then', 'once', 'here', 'there', 'when', 'where',
                           'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
                           'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
                           'own', 'same', 'so', 'than', 'too', 'very', 'can', 'may',
                           'might', 'must', 'ought', 'shall', 'should', 'will', 'would'}
        removable = basic_stopwords - protected
        return ' '.join(word for word in text.split() if word not in removable)

    try:
        tokens = word_tokenize(text)

        # English stop words, minus the auxiliaries that carry tense meaning
        auxiliaries = {'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had',
                       'do', 'does', 'did', 'will', 'shall', 'would', 'could', 'should'}
        stop_words = set(stopwords.words('english')) - auxiliaries
        kept = [token for token in tokens if token not in stop_words]

        # Lemmatize every remaining token as a verb
        lemmatizer = WordNetLemmatizer()
        return ' '.join(lemmatizer.lemmatize(token, pos='v') for token in kept)
    except Exception as e:
        print(f"Error in NLP processing: {e}")
        # Fall back to the cleaned text without stop-word removal
        return text

def _scenario_response(name, message=None):
    """Build the response payload for the SCENARIOS entry called ``name``.

    ``message`` overrides the configured message; it is needed for the
    greeting scenario, whose configured message is a dict of variants.
    """
    config = SCENARIOS[name]
    return {
        "scenario": name,
        "message": config["message"] if message is None else message,
        "audio_url": config["audio_url"],
        "video_url": config["video_url"],
        "story_url": config.get("story_url", ""),
        "detail_url": config.get("detail_url", ""),
        "example_url": config.get("example_url", ""),
    }


def _contains_term(term, text):
    """Return True when ``term`` occurs in ``text`` as a whole word or phrase."""
    return re.search(rf"\b{re.escape(term)}\b", text) is not None


def detect_scenario(user_question):
    """Detect whether the user input matches one of the special scenarios.

    Checks run in priority order: greeting, thanks, farewell, out-of-syllabus
    topic, then "not understandable" (near-empty input or very long words).

    Keywords are matched as whole words/phrases, the same way
    check_topic_relevance() matches them. Plain substring matching would make
    "hi" fire inside "this"/"which", "hey" inside "they" and "rain" inside
    "train", misclassifying ordinary tense questions.

    Args:
        user_question: raw text typed (or transcribed) from the user.

    Returns:
        A response dict with the keys "scenario", "message", "audio_url",
        "video_url", "story_url", "detail_url" and "example_url", or None
        when no special scenario applies.
    """
    question_lower = user_question.lower().strip()

    def mentions_any(keywords):
        return any(_contains_term(keyword, question_lower) for keyword in keywords)

    # Greetings, thanks and farewells have the highest priority
    if mentions_any(SCENARIOS["greeting"]["keywords"]):
        # The greeting variant follows the server's local clock
        current_hour = datetime.now().hour
        if current_hour < 12:
            greeting_type = "morning"
        elif current_hour < 17:
            greeting_type = "afternoon"
        else:
            greeting_type = "evening"
        return _scenario_response(
            "greeting", SCENARIOS["greeting"]["message"][greeting_type]
        )

    for scenario_name in ("thanks", "farewell"):
        if mentions_any(SCENARIOS[scenario_name]["keywords"]):
            return _scenario_response(scenario_name)

    # Out-of-syllabus: off-topic keywords must outnumber tense keywords
    out_of_syllabus_keywords = set(SCENARIOS["out_of_syllabus"]["keywords"])
    out_count = sum(
        1 for keyword in out_of_syllabus_keywords
        if _contains_term(keyword, question_lower)
    )
    if out_count:
        tense_keywords = ['tense', 'tenses', 'present', 'past', 'future',
                          'continuous', 'perfect', 'simple', 'progressive',
                          'verb', 'verbs', 'grammar', 'am', 'is', 'are',
                          'was', 'were', 'have', 'has', 'had']
        tense_count = sum(
            1 for keyword in tense_keywords
            if _contains_term(keyword, question_lower)
        )

        # At least as many tense words as off-topic words: let the normal
        # question matching handle it (out_count >= 1 here, so this also
        # requires at least one tense word).
        if tense_count >= out_count:
            return None

        return _scenario_response("out_of_syllabus")

    # Not understandable: (almost) nothing left once punctuation is removed
    clean_text = re.sub(r'[^\w\s]', '', question_lower)
    if len(clean_text.strip()) < 2:
        return _scenario_response("not_understandable")

    # Not understandable: very long average word length suggests gibberish
    words = clean_text.split()
    if words:
        avg_word_len = sum(len(word) for word in words) / len(words)
        if avg_word_len > 15:
            return _scenario_response("not_understandable")

    return None

def check_topic_relevance(user_question):
    """Decide whether a question is about English tenses.

    Returns True when the question names "tense(s)", contains a tense name,
    combines a time word with an aspect word, or asks about the usage of a
    helping verb. Returns False for everything else, including questions that
    contain an out-of-syllabus keyword without the word "tense".
    """
    q = user_question.lower().strip()

    def mentions(word):
        # Whole-word (or whole-phrase) test against the normalised question
        return re.search(rf"\b{re.escape(word)}\b", q) is not None

    def mentions_any(words):
        return any(mentions(word) for word in words)

    out_words = SCENARIOS["out_of_syllabus"].get("keywords", [])
    off_topic = mentions_any(out_words)
    names_tense = re.search(r"\btense(s)?\b", q) is not None

    # An off-topic word only disqualifies the question when "tense" is absent;
    # an explicit mention of "tense(s)" is always accepted.
    if off_topic and not names_tense:
        return False
    if names_tense:
        return True

    # Full tense names, matched as plain substrings
    tense_phrases = [
        "present simple", "past simple", "future simple",
        "present continuous", "past continuous", "future continuous",
        "present perfect", "past perfect", "future perfect",
        "present perfect continuous", "past perfect continuous", "future perfect continuous",
    ]
    for phrase in tense_phrases:
        if phrase in q:
            return True

    # A time word together with an aspect word is treated as a tense question
    time_words = ["present", "past", "future"]
    aspect_words = ["simple", "continuous", "perfect", "progressive"]
    if mentions_any(time_words) and mentions_any(aspect_words):
        return True

    # A helping verb together with a usage/rule/structure word is the last
    # accepted pattern; anything else is not a tense question.
    helpers = ["am", "is", "are", "was", "were", "have", "has", "had", "do", "does", "did", "will", "shall", "would", "could", "should"]
    intent_words = ["use", "using", "when", "rule", "rules", "structure", "form", "difference", "between", "meaning", "example", "examples"]
    return mentions_any(helpers) and mentions_any(intent_words)

# Initialize questions data.
# This search index is built once, at module import time.
# questions_data holds the raw entries from load_questions(), which returns []
# when qa.json is missing or invalid.
questions_data = load_questions()
# Every entry must provide a 'question' key; a missing key raises KeyError here.
question_texts = [item['question'] for item in questions_data]
preprocessed_questions = [preprocess_text(q) for q in question_texts]

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Use unigrams and bigrams
if preprocessed_questions:  # Only fit if we have questions
    tfidf_matrix = vectorizer.fit_transform(preprocessed_questions)
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
else:
    # Nothing to index: the vectorizer stays unfitted, and
    # calculate_similarity() returns an empty array in this case.
    tfidf_matrix = None

def calculate_similarity(user_question):
    """Score ``user_question`` against every stored question.

    The question is preprocessed, projected with the module-level TF-IDF
    vectorizer and compared with the stored question matrix by cosine
    similarity.

    Returns:
        A 1-D array with one score per stored question, or an empty array
        when no questions were loaded.
    """
    if preprocessed_questions:
        query_vector = vectorizer.transform([preprocess_text(user_question)])
        # cosine_similarity yields one row per query; there is a single query
        return cosine_similarity(query_vector, tfidf_matrix)[0]

    # No questions loaded, so there is nothing to compare against
    return np.array([])

def keyword_match(user_question, questions):
    """Rank ``questions`` by the preprocessed words they share with the user.

    Args:
        user_question: raw user text.
        questions: sequence of dicts that each provide a 'question' key.

    Returns:
        A list of dicts ('index', 'score', 'common_words'), best score first.
        Entries with no shared word are left out. The score is the number of
        shared words divided by the size of the larger of the two word sets.
    """
    query_terms = set(preprocess_text(user_question).split())

    scored = []
    for position, entry in enumerate(questions):
        entry_terms = set(preprocess_text(entry['question']).split())
        shared = query_terms.intersection(entry_terms)
        if not shared:
            continue

        larger_set_size = max(len(query_terms), len(entry_terms))
        scored.append({
            'index': position,
            'score': len(shared) / larger_set_size,
            'common_words': list(shared)
        })

    # Highest score first; ties keep their original relative order
    return sorted(scored, key=lambda match: match['score'], reverse=True)

def verify_match_relevance(user_q, matched_q, matched_answer):
    """Sanity-check a candidate match before it is returned to the user.

    The match is rejected (False) in exactly three situations:

    1. The user asks for a difference/comparison but the answer contains
       none of "difference", "compare" or "vs".
    2. The user asks "how to"/"how do" but the answer contains neither
       "how" nor any instruction word (step, first, then, ...).
    3. The user asks a specific question ("when to use X", ...), the matched
       question is a generic "what is/are/does/do" one, and none of the first
       three words following the specific phrase (longer than two characters)
       appears in the answer.

    Every other match is accepted (True). The question-type and word-overlap
    checks only print diagnostics; they never change the result.

    Args:
        user_q: the user's question.
        matched_q: the stored question that was matched.
        matched_answer: the stored answer belonging to ``matched_q``.

    Returns:
        bool: False when one of the rejection rules fires, True otherwise.
    """
    user_q_lower = user_q.lower()
    matched_q_lower = matched_q.lower()
    matched_answer_lower = matched_answer.lower()

    # Diagnostic only: report when both questions are of the same type.
    # When several starters occur, the last matching type wins.
    question_starters = {
        'what': ['what is', 'what are', 'what does', 'what do'],
        'how': ['how to', 'how do', 'how does'],
        'when': ['when to', 'when do', 'when does'],
        'why': ['why do', 'why does', 'why is']
    }
    user_starter = None
    matched_starter = None
    for starter_type, starters in question_starters.items():
        for starter in starters:
            if starter in user_q_lower:
                user_starter = starter_type
            if starter in matched_q_lower:
                matched_starter = starter_type

    if user_starter and user_starter == matched_starter:
        print(f"Both are {user_starter} questions - accepting match")

    # Rule 1: comparison requested, but the answer does not compare
    comparison_requests = ('difference', 'compare', 'versus')
    if any(word in user_q_lower for word in comparison_requests):
        comparison_markers = ('difference', 'compare', 'vs')
        if not any(marker in matched_answer_lower for marker in comparison_markers):
            print("User asked for differences but answer doesn't compare - rejecting")
            return False

    # Rule 2: instructions requested, but the answer is not instructional
    asks_how = 'how to' in user_q_lower or 'how do' in user_q_lower
    if asks_how and 'how' not in matched_answer_lower:
        instruction_words = ['step', 'first', 'second', 'then', 'next', 'finally', 'process']
        if not any(word in matched_answer_lower for word in instruction_words):
            print("User asked 'how to' but answer is not instructional - rejecting")
            return False

    # Rule 3: specific question answered by a generic definition
    generic_questions = ['what is', 'what are', 'what does', 'what do']
    specific_questions = ['difference between', 'how to use', 'when to use',
                          'compare', 'explain the difference', 'give example of']

    user_is_specific = any(phrase in user_q_lower for phrase in specific_questions)
    match_is_generic = any(phrase in matched_q_lower for phrase in generic_questions)

    if user_is_specific and match_is_generic:
        # Take up to three words that follow the first specific phrase found
        user_specific_terms = []
        for phrase in specific_questions:
            if phrase in user_q_lower:
                idx = user_q_lower.find(phrase) + len(phrase)
                user_specific_terms = user_q_lower[idx:].strip().split()[:3]
                break

        if user_specific_terms:
            if not any(term in matched_answer_lower
                       for term in user_specific_terms if len(term) > 2):
                print("User asked specific, match is generic - likely wrong")
                return False

    # Diagnostic only: meaningful words shared by both questions
    stopwords_set = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
    shared_words = set(user_q_lower.split()).intersection(matched_q_lower.split())
    common_core = {word for word in shared_words
                   if word not in stopwords_set and len(word) > 2}
    if len(common_core) >= 2:
        print(f"Common core words: {common_core} - accepting match")

    # No rejection rule fired: the similarity ranking is trusted
    return True

def verify_tense_specificity(user_q, matched_q, matched_answer):
    """Redirect a general tense question away from an over-specific match.

    When the user asks about "<present|past|future> tense" without naming
    an aspect ("continuous" / "perfect"), but the matched question is about
    the continuous or perfect form of that same tense, look through
    ``questions_data`` for a general question on that tense.

    Args:
        user_q: The question typed by the user.
        matched_q: The text of the question that was matched.
        matched_answer: Answer of the matched question (not used here).

    Returns:
        Index into ``questions_data`` of the first general question for the
        tense, or None when no override applies or none is found.
    """
    asked = user_q.lower()
    matched = matched_q.lower()

    # A question that already names an aspect is specific; never override.
    if 'continuous' in asked or 'perfect' in asked:
        return None

    # Only the first tense family mentioned (in this fixed order) is used.
    family = next(
        (name for name in ('present', 'past', 'future') if f'{name} tense' in asked),
        None,
    )
    if family is None:
        return None

    # The match must be an aspect-specific variant of that same family.
    if f'{family} continuous' not in matched and f'{family} perfect' not in matched:
        return None

    general_phrase = f'{family} tense'
    for position, entry in enumerate(questions_data):
        text = entry['question'].lower()
        if general_phrase in text and 'continuous' not in text and 'perfect' not in text:
            return position

    return None  # No need to override

def _not_available_response(user_question, debug_info=None):
    """Build the JSON response for the 'not_available' scenario.

    Args:
        user_question: The question text echoed back to the client.
        debug_info: Optional dict of matching scores; added to the payload
            under 'debug_info' only when provided.

    Returns:
        The response object produced by ``jsonify``.
    """
    scenario = SCENARIOS['not_available']
    payload = {
        'success': True,
        'scenario': 'not_available',
        'message': scenario['message'],
        'suggestions': scenario['suggestions'],
        'audio_url': scenario['audio_url'],
        'video_url': scenario['video_url'],
        'story_url': scenario.get('story_url', ''),
        'detail_url': scenario.get('detail_url', ''),
        'example_url': scenario.get('example_url', ''),
        'user_question': user_question,
        'matching_method': 'scenario'
    }
    if debug_info is not None:
        payload['debug_info'] = debug_info
    return jsonify(payload)


@staticchat_bp.route('/search', methods=['POST'])
def search_question():
    """Answer a user question posted as JSON with a 'question' field.

    Resolution order:
      1. ``detect_scenario``: a detected scenario is returned directly.
      2. ``check_topic_relevance``: irrelevant questions receive the
         'out_of_syllabus' scenario.
      3. Best score from ``calculate_similarity`` (possibly redirected by
         ``verify_tense_specificity``), accepted when above the threshold
         and confirmed by ``verify_match_relevance``.
      4. ``keyword_match`` fallback, with the same relevance verification.
      5. Otherwise the 'not_available' scenario, including debug scores.

    Returns:
        A JSON response. Status 400 when no usable question is supplied,
        500 when an unexpected exception occurs, 200 otherwise.
    """
    try:
        # silent=True makes a missing or malformed JSON body come back as
        # None instead of raising, so bad requests get the 400 below rather
        # than being reported as a server error.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        # A non-string 'question' (number, list, null) is treated as absent.
        raw_question = data.get('question', '')
        original_question = raw_question.strip() if isinstance(raw_question, str) else ''

        if not original_question:
            return jsonify({
                'success': False,
                'message': 'Please provide a question'
            }), 400

        print(f"\n=== Processing: '{original_question}' ===")

        # First, check for special scenarios
        scenario_result = detect_scenario(original_question)
        if scenario_result:
            print(f"Detected scenario: {scenario_result['scenario']}")  # Debug log
            return jsonify({
                'success': True,
                'scenario': scenario_result['scenario'],
                'message': scenario_result['message'],
                'audio_url': scenario_result.get('audio_url', ''),
                'video_url': scenario_result.get('video_url', ''),
                'story_url': scenario_result.get('story_url', ''),
                'detail_url': scenario_result.get('detail_url', ''),
                'example_url': scenario_result.get('example_url', ''),
                'user_question': original_question,
                'matching_method': 'scenario'
            })

        print("No scenario detected, checking topic relevance...")  # Debug log

        # Check if question is related to tenses
        is_topic_relevant = check_topic_relevance(original_question)
        print(f"Topic relevant: {is_topic_relevant}")  # Debug log

        if not is_topic_relevant:
            # If not relevant and not caught by out_of_syllabus scenario
            out_of_syllabus = SCENARIOS['out_of_syllabus']
            return jsonify({
                'success': True,
                'scenario': 'out_of_syllabus',
                'message': out_of_syllabus['message'],
                'audio_url': out_of_syllabus['audio_url'],
                'video_url': out_of_syllabus['video_url'],
                'story_url': out_of_syllabus.get('story_url', ''),
                'detail_url': out_of_syllabus.get('detail_url', ''),
                'example_url': out_of_syllabus.get('example_url', ''),
                'user_question': original_question,
                'matching_method': 'scenario'
            })

        # Calculate similarity if we have questions
        if not preprocessed_questions:
            return _not_available_response(original_question)

        similarity_scores = calculate_similarity(original_question)

        if len(similarity_scores) == 0:  # No questions loaded
            return _not_available_response(original_question)

        # Get the best match
        best_match_idx = similarity_scores.argmax()
        best_score = similarity_scores[best_match_idx]

        print(f"Best TF-IDF score: {best_score:.3f}")  # Debug log
        print(f"Matched to question #{best_match_idx + 1}: {questions_data[best_match_idx]['question']}")  # Debug log

        # Check if we need to override for tense specificity
        override_idx = verify_tense_specificity(
            original_question,
            questions_data[best_match_idx]['question'],
            questions_data[best_match_idx]['answer']
        )

        if override_idx is not None:
            best_match_idx = override_idx
            best_score = 0.9  # Set high score for exact match
            print(f"Overriding to general tense question: {questions_data[best_match_idx]['question']}")

        # Set higher threshold for matching - INCREASED to prevent wrong matches
        tfidf_threshold = 0.35  # Increased from 0.2 to 0.35
        keyword_threshold = 0.25  # Increased from 0.1 to 0.25

        if best_score > tfidf_threshold:
            # Verify the match is actually relevant
            matched_question = questions_data[best_match_idx]
            is_relevant = verify_match_relevance(original_question,
                                                 matched_question['question'],
                                                 matched_question['answer'])

            if is_relevant:
                # Good match found with TF-IDF
                return jsonify({
                    'success': True,
                    'matched_question': matched_question['question'],
                    'answer': matched_question['answer'],
                    'sno': matched_question['sno'],
                    'audio_url': matched_question.get('audio_url', ''),
                    'video_url': matched_question.get('video_url', ''),
                    'story_url': matched_question.get('story_url', ''),
                    'detail_url': matched_question.get('detail_url', ''),
                    'example_url': matched_question.get('example_url', ''),
                    'confidence_score': float(best_score),
                    'user_question': original_question,
                    'matching_method': 'tfidf',
                    'spell_corrected': original_question if SYMSPELL_AVAILABLE else 'not_available'
                })

            # Match is not actually relevant; fall through to the fallback
            print(f"Match verification failed. Score: {best_score:.3f}")
        else:
            # Score below threshold
            print(f"Score below threshold. Score: {best_score:.3f}, Threshold: {tfidf_threshold}")

        # Try keyword matching as fallback (with higher threshold)
        keyword_matches = keyword_match(original_question, questions_data)

        print(f"Keyword matches found: {len(keyword_matches)}")  # Debug log
        if keyword_matches:
            print(f"Best keyword score: {keyword_matches[0]['score']:.3f}")  # Debug log

        if keyword_matches and keyword_matches[0]['score'] > keyword_threshold:
            best_keyword_match = keyword_matches[0]
            matched_question = questions_data[best_keyword_match['index']]

            # Verify keyword match too
            is_relevant = verify_match_relevance(original_question,
                                                 matched_question['question'],
                                                 matched_question['answer'])

            if is_relevant:
                return jsonify({
                    'success': True,
                    'matched_question': matched_question['question'],
                    'answer': matched_question['answer'],
                    'sno': matched_question['sno'],
                    'audio_url': matched_question.get('audio_url', ''),
                    'video_url': matched_question.get('video_url', ''),
                    'story_url': matched_question.get('story_url', ''),
                    'detail_url': matched_question.get('detail_url', ''),
                    'example_url': matched_question.get('example_url', ''),
                    'confidence_score': float(best_keyword_match['score']),
                    'user_question': original_question,
                    'matching_method': 'keyword',
                    'common_words': best_keyword_match['common_words']
                })

            print("Keyword match verification failed")

        # No good match found but question is tense-related
        return _not_available_response(
            original_question,
            debug_info={
                # similarity_scores is known to be non-empty at this point.
                'best_tfidf_score': float(best_score),
                # Cast to float, consistent with 'confidence_score' in the
                # keyword success path above.
                'best_keyword_score': float(keyword_matches[0]['score']) if keyword_matches else 0
            }
        )

    except Exception as e:
        print(f"Error in search_question: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({
            'success': False,
            'message': f'Error processing request: {str(e)}'
        }), 500

@staticchat_bp.route('/questions', methods=['GET'])
def get_all_questions():
    """List the serial number and text of every question.

    Questions are obtained from ``load_questions()``. Any exception is
    reported as a JSON error with status 500.
    """
    try:
        entries = load_questions()
        # Expose only sno + question text (used for autocomplete).
        summaries = [
            {'sno': entry['sno'], 'question': entry['question']}
            for entry in entries
        ]
        payload = {
            'success': True,
            'questions': summaries,
            'count': len(summaries)
        }
        return jsonify(payload)
    except Exception as err:
        failure = {'success': False, 'message': str(err)}
        return jsonify(failure), 500

@staticchat_bp.route('/question/<int:sno>', methods=['GET'])
def get_question_by_sno(sno):
    """Look up a single question record by its serial number.

    Responds with the full record of the first question whose 'sno' equals
    ``sno``, with a 404 when none matches, and with a 500 on any exception.
    """
    try:
        found = None
        for record in load_questions():
            if record['sno'] == sno:
                found = record
                break

        # Guard clause: nothing matched the requested serial number.
        if not found:
            not_found = {
                'success': False,
                'message': f'Question with SNO {sno} not found'
            }
            return jsonify(not_found), 404

        return jsonify({'success': True, 'question': found})
    except Exception as err:
        return jsonify({'success': False, 'message': str(err)}), 500

@staticchat_bp.route('/suggestions', methods=['GET'])
def get_suggestions():
    """Return a random selection of question texts from ``questions_data``.

    Query parameters:
        count: Desired number of suggestions (integer, default 5). The
            value is clamped to the range 0..len(questions_data).

    Returns:
        JSON with 'suggestions' and 'count' on success; a JSON body with
        'success': False when no questions are available; status 500 with
        the error message when an exception occurs.
    """
    try:
        if not questions_data:
            return jsonify({
                'success': False,
                'message': "No questions available.",
                'suggestions': []
            })

        # Get parameter for number of suggestions
        requested = request.args.get('count', default=5, type=int)

        # random.sample raises ValueError for a negative sample size, so a
        # negative 'count' is clamped to 0 instead of producing a 500.
        sample_size = max(0, min(requested, len(questions_data)))

        # Get random questions for suggestions
        import random
        random_questions = random.sample(questions_data, sample_size)
        suggestions = [q['question'] for q in random_questions]

        return jsonify({
            'success': True,
            'suggestions': suggestions,
            'count': len(suggestions)
        })
    except Exception as e:
        print(f"Error in get_suggestions: {str(e)}")
        return jsonify({
            'success': False,
            'message': str(e),
            'suggestions': []
        }), 500

@staticchat_bp.route('/scenarios', methods=['GET'])
def get_scenarios():
    """Summarise every entry of ``SCENARIOS``.

    For each scenario the response reports its type (default "scenario"),
    whether audio/video URLs are set, and its keyword list. Any exception
    is reported as a JSON error with status 500.
    """
    try:
        overview = {
            name: {
                "type": details.get("type", "scenario"),
                "has_audio": bool(details.get("audio_url")),
                "has_video": bool(details.get("video_url")),
                "keywords": details.get("keywords", [])
            }
            for name, details in SCENARIOS.items()
        }
        body = {
            'success': True,
            'scenarios': overview,
            'count': len(overview)
        }
        return jsonify(body)
    except Exception as err:
        return jsonify({'success': False, 'message': str(err)}), 500

@staticchat_bp.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe an uploaded audio file using the module-level ``model``.

    Expects a multipart form with a 'file' field and an optional
    'language' field. The upload is saved to a temporary file, passed to
    ``model.transcribe``, and the temporary file is removed afterwards.

    Returns:
        JSON {"text": ...} on success; {"error": ...} with status 400 when
        no file is supplied, or status 500 when an exception occurs.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file field named 'file'"}), 400

    f = request.files["file"]
    if not f:
        return jsonify({"error": "No file uploaded"}), 400

    # Optional language from client: en / hi / ta
    language = request.form.get("language")  # may be None

    tmp_path = None
    try:
        # Keep a suffix so ffmpeg/whisper detects it better
        suffix = os.path.splitext(f.filename or "")[1].lower()
        if not suffix:
            suffix = ".webm"  # safe default for browser uploads

        # delete=False: the file must outlive the `with` so the model can
        # read it by path; it is removed in the `finally` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            f.save(tmp_path)

        # Run local whisper
        result = model.transcribe(
            tmp_path,
            language=language if language else None,
            fp16=False  # CPU-only: must be False
        )

        text = (result.get("text") or "").strip()
        return jsonify({"text": text})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

    finally:
        if tmp_path:
            # Best-effort cleanup: ignore filesystem errors (including the
            # file already being gone) without swallowing KeyboardInterrupt
            # or SystemExit as a bare `except:` would.
            try:
                os.remove(tmp_path)
            except OSError:
                pass