File size: 44,138 Bytes
1bbe15b
 
 
 
 
 
 
 
 
 
 
 
19e2a5e
1bbe15b
 
 
 
 
 
 
19e2a5e
 
 
 
 
1bbe15b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2101b97
 
 
 
 
 
1bbe15b
19e2a5e
 
 
 
 
1bbe15b
 
2101b97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1bbe15b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19e2a5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
"""
Data Analyzer for structured files (Excel / CSV).

When users upload spreadsheets and ask analytical questions
(highest, lowest, average, total, count, etc.), this module
computes the answer directly from the data rather than relying
on text-similarity retrieval.
"""

import os
import re
import csv
import json
from typing import Dict, List, Optional, Any

try:
    import openpyxl
except ImportError:
    openpyxl = None

try:
    from groq import Groq
except ImportError:
    Groq = None


# ── Keyword patterns that signal an analytical question ──────────────────────
AGGREGATE_PATTERNS = [
    (r"\b(highest|maximum|max|most|top|greatest|best)\b", "max"),
    (r"\b(lowest|minimum|min|least|worst|bottom|fewest)\b", "min"),
    (r"\b(average|mean|avg)\b", "avg"),
    (r"\b(total|sum|overall)\b", "sum"),
    (r"\b(count|how many|number of)\b", "count"),
    (r"\b(sort|rank|order|list all)\b", "sort"),
]

# Patterns for filter/conditional queries
FILTER_PATTERNS = [
    # "greater than 80", "above 90", "more than 75", "over 80", "at least 80"
    (r"(greater than|above|more than|over|at least|>=?|exceeds?)\s*(\d+\.?\d*)", "gte"),
    # "less than 80", "below 70", "under 60", "at most 50"
    (r"(less than|below|under|at most|<=?)\s*(\d+\.?\d*)", "lte"),
    # "equal to 80", "exactly 80"
    (r"(equal to|exactly|equals?)\s*(\d+\.?\d*)", "eq"),
    # "between 70 and 90"
    (r"between\s+(\d+\.?\d*)\s*(?:and|to|-)\s*(\d+\.?\d*)", "between"),
]


class StructuredDataStore:
    """Keeps in-memory tables from uploaded Excel / CSV files."""

    def __init__(self):
        # { filename: [ {col: val, …}, … ] }
        self.tables: Dict[str, List[Dict[str, Any]]] = {}
        # { filename: [col_names] }
        self.headers: Dict[str, List[str]] = {}

    # ── Loading ──────────────────────────────────────────────────────────────
    def load_excel(self, file_path: str) -> int:
        """Load all sheets from an Excel file. Returns row count."""
        if openpyxl is None:
            return 0

        wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
        total = 0
        fname = os.path.basename(file_path)

        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = list(ws.iter_rows(values_only=True))
            if len(rows) < 2:
                continue

            # Auto-detect real header row (skip merged title rows)
            header_idx = self._find_header_row(rows)
            headers = [str(h).strip() if h is not None else f"Col{i}"
                       for i, h in enumerate(rows[header_idx])]

            records = []
            for row in rows[header_idx + 1:]:
                cells = list(row)
                filled = [c for c in cells if c is not None and str(c).strip()]
                if len(filled) < 2:
                    continue
                # Skip rows without a text name (totals / max-marks)
                has_name = any(
                    isinstance(c, str) and len(c.strip()) > 3 and not c.strip().replace('.', '').isdigit()
                    for c in cells
                )
                if not has_name:
                    continue
                record = {}
                for h, cell in zip(headers, cells):
                    record[h] = cell
                records.append(record)

            if records:
                key = f"{fname}::{sheet_name}" if len(wb.sheetnames) > 1 else fname
                self.tables[key] = records
                self.headers[key] = headers
                total += len(records)

        wb.close()
        return total

    @staticmethod
    def _find_header_row(rows) -> int:
        """Find the real header row by looking for keyword matches."""
        kw = {'name', 'no', 'roll', 'sl', 'sno', 'total', 'id',
              'section', 'subject', 'marks', 'grade', 'percentage',
              'attendance', 'date', 'class', 'student'}
        best_idx, best_score = 0, 0
        for i, row in enumerate(rows[:20]):
            cells = [str(c).strip().lower() for c in row if c is not None and str(c).strip()]
            if len(cells) < 3:
                continue
            hits = sum(1 for c in cells if any(k in c for k in kw))
            short = sum(1 for c in cells if len(c) < 30)
            score = hits * 3 + short
            if score > best_score:
                best_score = score
                best_idx = i
        return best_idx

    def load_csv(self, file_path: str) -> int:
        """Load a CSV file. Returns row count."""
        fname = os.path.basename(file_path)
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            reader = csv.DictReader(f)
            records = list(reader)

        if not records:
            return 0

        self.tables[fname] = records
        self.headers[fname] = list(records[0].keys())
        return len(records)

    def clear(self):
        self.tables.clear()
        self.headers.clear()

    @property
    def has_data(self) -> bool:
        return bool(self.tables)

    # ── Analysis ─────────────────────────────────────────────────────────────
    def _query_mentions_specific_entity(self, query: str) -> bool:
        """Check if the query references a specific ID/roll number or known name."""
        # Check for roll number patterns
        if self._ID_PATTERN.search(query) or self._GENERIC_ID.search(query):
            return True
        # Check if any known cell value (name/ID) appears in the query
        q_lower = query.lower()
        for tkey, rows in self.tables.items():
            for row in rows:
                for val in row.values():
                    if val is None:
                        continue
                    val_str = str(val).strip()
                    if len(val_str) >= 3 and val_str.lower() in q_lower:
                        return True
        return False

    def answer_query(self, query: str) -> Optional[str]:
        """
        Try to answer a query by analysing the stored structured data.
        Returns an answer string, or None if the query isn't analytical.
        """
        if not self.has_data:
            return None

        # 0) Try comparison first ("compare X and Y", "who is better X or Y")
        ans = self._try_comparison(query)
        if ans:
            return ans

        # If query mentions a specific student/ID, try row lookup FIRST
        if self._query_mentions_specific_entity(query):
            ans = self._try_row_lookup(query)
            if ans:
                return ans

        # 1) Try filter + count ("how many students have attendance > 80%")
        ans = self._try_filter_query(query)
        if ans:
            return ans

        # 2) Try aggregate (highest, lowest, avg, total, count, rank)
        op = self._detect_operation(query)
        if op is not None:
            table_key, column = self._match_column(query, op)
            if op == "count" and table_key is None:
                table_key = next(iter(self.tables))
                column = None
            if table_key is not None:
                rows = self.tables[table_key]
                result = self._compute(rows, column, op, query)
                if result:
                    return result

        # 3) Try row lookup ONLY if query looks like a person/ID lookup
        #    (not for general knowledge questions about PDF content)
        if self._is_entity_query(query):
            ans = self._try_row_lookup(query)
            if ans:
                return ans

        # 4) Fallback: Use Groq LLM to analyze the data for complex questions
        ans = self._try_llm_analysis(query)
        if ans:
            return ans

        return None

    def _is_entity_query(self, query: str) -> bool:
        """Check if the query is asking about a specific person/ID/record,
        not a general knowledge question."""
        # Has a roll number / ID pattern
        if self._ID_PATTERN.search(query) or self._GENERIC_ID.search(query):
            return True
        # Has a name in ALL CAPS (like student names)
        if re.search(r'\b[A-Z][A-Z ]{4,}\b', query):
            return True
        # Query patterns that suggest a person lookup
        person_patterns = (
            r'\bwho is\b', r'\btell me about\b', r'\bdetails of\b',
            r'\battendance of\b', r'\bmarks of\b', r'\bscore of\b',
        )
        q_lower = query.lower()
        if any(re.search(p, q_lower) for p in person_patterns):
            # But only if the query is short (likely a name lookup, not a concept question)
            # "who is mahesh babu" = name lookup
            # "what is hallucination firewall" = concept question
            words = query.split()
            if len(words) <= 8:
                return True
        return False

    # ── Row Lookup ────────────────────────────────────────────────────────────

    # Patterns that look like IDs / roll numbers (alphanumeric codes)
    _ID_PATTERN = re.compile(r'\b(\d{2}[A-Za-z]{2}\d[A-Za-z]\d{4})\b')  # e.g. 22PA1A0504
    _GENERIC_ID = re.compile(r'\b([A-Z]{2,}\d{3,}[A-Z]*\d*)\b', re.IGNORECASE)  # broader

    def _try_row_lookup(self, query: str) -> Optional[str]:
        """Answer queries like 'How many UHV classes attended by 22PA1A0501?'
        or 'What is the attendance of 22PA1A0504?' or 'Tell me about Alice'.

        If the query mentions a specific column, returns only that value.
        Otherwise returns the full row.
        If the query mentions an ID/roll number that doesn't exist, flags it
        as hallucinated.
        """
        q_lower = query.lower()

        for tkey, rows in self.tables.items():
            headers = self.headers[tkey]
            label_col = self._find_label_column(rows)

            for row in rows:
                # Check every cell value in the row for a match with the query
                matched_id = None
                for col in headers:
                    val = row.get(col)
                    if val is None:
                        continue
                    val_str = str(val).strip()
                    if len(val_str) < 3:
                        continue
                    if val_str.lower() in q_lower:
                        matched_id = val_str
                        break

                if matched_id is None:
                    continue

                # Found the row β€” now figure out what the user is asking
                name_val = str(row.get(label_col, matched_id)).strip()

                # ── Check if the query is a verification/claim question ────
                # e.g. "is 22PA1A0501 has attendance percentage of 90%"
                claimed_value = self._extract_claimed_value(query)
                asked_cols = self._find_asked_columns(query, headers, tkey)

                if claimed_value is not None and asked_cols:
                    # User is claiming a specific value β€” verify it
                    for ac in asked_cols:
                        actual = self._to_float(row.get(ac))
                        if actual is not None:
                            if abs(actual - claimed_value) < 0.5:
                                return (
                                    f"Yes, that is correct. The {ac} of {name_val} is {actual}, "
                                    f"which matches the claimed value of {claimed_value}."
                                )
                            else:
                                return (
                                    f"HALLUCINATION DETECTED: No, that is incorrect. "
                                    f"The claimed {ac} of {name_val} is {claimed_value}, "
                                    f"but the actual value is {actual}. "
                                    f"The claim does not match the uploaded data."
                                )
                elif claimed_value is not None:
                    # User claimed a value but no specific column detected β€” check all numeric columns
                    for h in headers:
                        actual = self._to_float(row.get(h))
                        if actual is not None and abs(actual - claimed_value) < 0.5:
                            return (
                                f"Yes, that is correct. The {h} of {name_val} is {actual}, "
                                f"which matches the claimed value of {claimed_value}."
                            )
                    # No column matched the claimed value
                    # Find the most likely column (e.g. % or total)
                    likely_cols = [h for h in headers if h.strip() in ('%', 'TOTAL', 'Percentage')]
                    if likely_cols:
                        ac = likely_cols[0]
                        actual = self._to_float(row.get(ac))
                        if actual is not None:
                            return (
                                f"HALLUCINATION DETECTED: No, that is incorrect. "
                                f"The claimed value for {name_val} is {claimed_value}, "
                                f"but the actual {ac} is {actual}. "
                                f"The claim does not match the uploaded data."
                            )

                if asked_cols:
                    # Return only the requested fields
                    parts = []
                    for ac in asked_cols:
                        cell = row.get(ac)
                        if cell is not None:
                            parts.append(f"{ac}: {cell}")

                    if len(parts) == 1:
                        col_name, col_val = parts[0].split(": ", 1)
                        return f"The {col_name} of {name_val} is {col_val}."
                    else:
                        return f"For {name_val}:\n" + "\n".join(f"  - {p}" for p in parts)
                else:
                    # No specific column detected β€” return full row
                    parts = []
                    for h in headers:
                        cell = row.get(h)
                        if cell is not None and str(cell).strip():
                            parts.append(f"{h}: {cell}")
                    return f"Details for {name_val}:\n" + "\n".join(f"  - {p}" for p in parts)

        # ── No row matched β€” check if the query contains an ID that looks
        #    like it *should* be in the data but isn't (hallucination) ────────
        return self._check_hallucinated_id(query)

    # ── Student Comparison ───────────────────────────────────────────────────
    _COMPARE_PATTERNS = re.compile(
        r'(compare|versus|vs\.?|difference between|who.*(better|higher|more|greater|lower|less|worse))',
        re.IGNORECASE
    )

    def _try_comparison(self, query: str) -> Optional[str]:
        """Handle queries like 'compare 22PA1A0501 and 22PA1A0502' or
        'who has better attendance 22PA1A0501 or 22PA1A0502'."""
        if not self._COMPARE_PATTERNS.search(query):
            return None

        # Find all entity matches (IDs or names) in the query
        matched_rows = []
        q_lower = query.lower()

        for tkey, rows in self.tables.items():
            headers = self.headers[tkey]
            label_col = self._find_label_column(rows)
            for row in rows:
                for col in headers:
                    val = row.get(col)
                    if val is None:
                        continue
                    val_str = str(val).strip()
                    if len(val_str) < 3:
                        continue
                    if val_str.lower() in q_lower:
                        name_val = str(row.get(label_col, val_str)).strip()
                        if not any(r[0] == name_val for r in matched_rows):
                            matched_rows.append((name_val, row, headers, tkey))
                        break

        if len(matched_rows) < 2:
            # Extract all IDs/names mentioned in the query
            requested_ids = self._ID_PATTERN.findall(query)
            requested_ids += self._GENERIC_ID.findall(query)
            # Also check for full names in caps
            requested_ids += re.findall(r'\b([A-Z][A-Z ]{4,})\b', query)

            if len(matched_rows) == 1 and len(requested_ids) >= 2:
                # One student found, one not β€” partial verification
                found_name = matched_rows[0][0]
                found_row = matched_rows[0][1]
                found_headers = matched_rows[0][2]
                # Figure out which ID is missing
                missing_ids = []
                for rid in requested_ids:
                    rid_lower = rid.strip().lower()
                    is_found = False
                    for val in found_row.values():
                        if val is not None and rid_lower == str(val).strip().lower():
                            is_found = True
                            break
                    if not is_found:
                        missing_ids.append(rid)
                missing = missing_ids[0] if missing_ids else requested_ids[-1]
                # Build partial result with found student's data
                parts = []
                for h in found_headers:
                    cell = found_row.get(h)
                    if cell is not None and str(cell).strip():
                        parts.append(f"  - {h}: {cell}")
                return (
                    f"PARTIAL VERIFICATION: Cannot fully compare because '{missing}' "
                    f"does not exist in the uploaded data.\n\n"
                    f"Found data for {found_name}:\n" + "\n".join(parts) + "\n\n"
                    f"The student/ID '{missing}' was not found in any of the uploaded documents. "
                    f"This comparison is only partially verified."
                )
            return None

        # Use first two matched students
        (name1, row1, headers1, tkey1) = matched_rows[0]
        (name2, row2, headers2, tkey2) = matched_rows[1]

        # Check if a specific column is asked for comparison
        asked_cols = self._find_asked_columns(query, headers1, tkey1)

        if asked_cols:
            # Compare specific columns
            lines = [f"Comparison between {name1} and {name2}:\n"]
            for col in asked_cols:
                val1 = row1.get(col)
                val2 = row2.get(col)
                v1_f = self._to_float(val1)
                v2_f = self._to_float(val2)
                lines.append(f"  {col}:")
                lines.append(f"    {name1}: {val1}")
                lines.append(f"    {name2}: {val2}")
                if v1_f is not None and v2_f is not None:
                    diff = v1_f - v2_f
                    if diff > 0:
                        lines.append(f"    β†’ {name1} is higher by {abs(diff):.2f}")
                    elif diff < 0:
                        lines.append(f"    β†’ {name2} is higher by {abs(diff):.2f}")
                    else:
                        lines.append(f"    β†’ Both are equal")
            return "\n".join(lines)
        else:
            # Compare all numeric columns
            lines = [f"Comparison between {name1} and {name2}:\n"]
            wins1, wins2 = 0, 0
            for col in headers1:
                v1 = self._to_float(row1.get(col))
                v2 = self._to_float(row2.get(col))
                if v1 is None or v2 is None:
                    continue
                diff = v1 - v2
                marker = ""
                if diff > 0:
                    marker = f" βœ“ (+{diff:.1f})"
                    wins1 += 1
                elif diff < 0:
                    marker = f" βœ— ({diff:.1f})"
                    wins2 += 1
                lines.append(f"  {col}: {v1} vs {v2}{marker}")

            lines.append(f"\nSummary: {name1} leads in {wins1} subjects, {name2} leads in {wins2} subjects.")
            total1 = self._to_float(row1.get('TOTAL'))
            total2 = self._to_float(row2.get('TOTAL'))
            pct1 = self._to_float(row1.get('%'))
            pct2 = self._to_float(row2.get('%'))
            if pct1 is not None and pct2 is not None:
                if pct1 > pct2:
                    lines.append(f"Overall: {name1} has higher attendance ({pct1}% vs {pct2}%).")
                elif pct2 > pct1:
                    lines.append(f"Overall: {name2} has higher attendance ({pct2}% vs {pct1}%).")
                else:
                    lines.append(f"Overall: Both have the same attendance percentage ({pct1}%).")
            return "\n".join(lines)

    # Words to strip when extracting a potential name from a query
    _STOP_WORDS = {
        'what', 'is', 'the', 'of', 'tell', 'me', 'about', 'who', 'how',
        'many', 'much', 'give', 'show', 'get', 'find', 'details', 'detail',
        'info', 'information', 'attendance', 'marks', 'score', 'total',
        'percentage', 'classes', 'attended', 'for', 'by', 'a', 'an', 'and',
        'in', 'to', 'does', 'did', 'has', 'have', 'had', 'can', 'do',
        'please', 'sir', 'student', 'roll', 'number', 'name',
    }

    def _check_hallucinated_id(self, query: str) -> Optional[str]:
        """If the query mentions an ID / roll number / name that doesn't exist
        in any table, return a hallucination warning."""

        # Collect all known IDs and names from every table
        known_values = set()
        known_names = []  # list of (lowercase_name, original_name)
        for tkey, rows in self.tables.items():
            for row in rows:
                for val in row.values():
                    if val is not None:
                        val_str = str(val).strip()
                        known_values.add(val_str.lower())
                        # Collect all text values as potential names
                        if isinstance(val, str) and len(val_str) > 2 and self._to_float(val) is None:
                            known_names.append((val_str.lower(), val_str))

        # Look for ID-like patterns in the query
        candidates = []
        for pattern in (self._ID_PATTERN, self._GENERIC_ID):
            candidates.extend(pattern.findall(query))

        # Also check for quoted or capitalized multi-word names
        name_matches = re.findall(r'\b([A-Z][A-Z ]{4,})\b', query)
        candidates.extend(name_matches)

        for candidate in candidates:
            c_lower = candidate.strip().lower()
            if c_lower and c_lower not in known_values:
                return (
                    f"HALLUCINATION DETECTED: '{candidate}' does not exist in the uploaded data. "
                    f"This identifier was not found in any of the loaded documents. "
                    f"The information about '{candidate}' cannot be verified and is likely fabricated."
                )

        # ── Extract a potential name from the query (even lowercase) ────────
        # Strip stop words and see if what remains looks like a person's name
        q_words = re.findall(r'[a-zA-Z]+', query)
        name_words = [w for w in q_words if w.lower() not in self._STOP_WORDS and len(w) > 1]
        extracted_name = " ".join(name_words).strip()

        if len(name_words) >= 1 and extracted_name:
            extracted_lower = extracted_name.lower()
            # Only match if the extracted name is an EXACT full match of a known name
            for known_lower, known_original in known_names:
                if extracted_lower == known_lower:
                    return None  # Exact full name match, not hallucinated

            # Name was extracted but no exact match found
            return (
                f"HALLUCINATION DETECTED: '{extracted_name}' does not exist in the uploaded data. "
                f"No matching student or record was found in the uploaded documents. "
                f"Please use the full name exactly as it appears in the data."
            )

        return None

    def _find_asked_columns(self, query: str, headers: List[str], table_key: str) -> List[str]:
        """Detect which columns the user is asking about in a lookup query.

        Returns a list of matching column names, or empty list if the query
        is generic (e.g. 'tell me about X').
        """
        q_lower = query.lower()
        q_words = set(re.findall(r'\w+', q_lower))
        q_stems = {self._stem(w) for w in q_words if len(w) > 2}

        # If the query is generic ("tell me about X", "details of X"), return empty
        generic_patterns = [r'\btell\b.*\babout\b', r'\bdetails?\b.*\bof\b',
                            r'\binfo\b.*\babout\b', r'\ball\b.*\bdetails?\b',
                            r'\bshow\b.*\bdata\b', r'\bfull\b.*\bdata\b']
        if any(re.search(p, q_lower) for p in generic_patterns):
            return []

        # Skip these generic words that don't refer to columns
        skip_words = {'what', 'how', 'many', 'the', 'who', 'which', 'tell',
                      'about', 'give', 'show', 'get', 'find', 'is', 'are',
                      'was', 'were', 'has', 'have', 'had', 'does', 'did',
                      'classes', 'attended', 'scored', 'marks', 'score',
                      'value', 'number', 'much', 'detail', 'info',
                      'student', 'name', 'roll', 'sir', 'please', 'of', 'by'}

        # First: check if a full column name appears verbatim in the query
        # e.g. "fml lab" in "How many FML LAB classes attended by X?"
        # Sort by length descending so "FML LAB" matches before "FML"
        exact_matches = []
        for col in sorted(headers, key=lambda c: len(c), reverse=True):
            col_lower = col.lower().strip()
            # Check aliases first (even for single-char columns like '%')
            aliases = set()
            for alias_key, alias_set in self.COLUMN_ALIASES.items():
                if col_lower == alias_key or col_lower in alias_set:
                    aliases = alias_set
                    break
            if aliases and (q_words & aliases):
                exact_matches.append(col)
                continue
            if len(col_lower) < 2:
                continue
            # For short column names (<=3 chars like "SE", "OS"), use word boundary
            # to avoid matching inside other words ("se" in "classes")
            if len(col_lower) <= 3:
                if re.search(r'\b' + re.escape(col_lower) + r'\b', q_lower):
                    exact_matches.append(col)
            else:
                # Longer names: verbatim substring is fine
                if col_lower in q_lower:
                    exact_matches.append(col)

        if exact_matches:
            # Filter out columns whose names are substrings of already-matched longer names
            # e.g. if "FML LAB" matched, don't also return "FML"
            filtered = []
            for col in exact_matches:
                cl = col.lower().strip()
                is_substring = any(
                    cl != other.lower().strip() and cl in other.lower().strip()
                    for other in exact_matches
                )
                if not is_substring:
                    filtered.append(col)
            return filtered

        # Fallback: stem/substring matching for partial names
        matched = []
        for col in headers:
            col_lower = col.lower().strip()
            col_words = set(re.findall(r'\w+', col_lower))
            col_stems = {self._stem(w) for w in col_words}

            if not col_stems:
                continue

            stem_hits = len(q_stems & col_stems)
            sub_hits = sum(
                1 for qw in q_words - skip_words
                if len(qw) > 1 and any(
                    (qw == cw or (len(qw) > 2 and len(cw) > 2 and (qw in cw or cw in qw)))
                    for cw in col_words
                )
            )

            if stem_hits > 0 or sub_hits > 0:
                matched.append(col)

        return matched

    # ── Filter / Conditional Queries ─────────────────────────────────────────
    def _try_filter_query(self, query: str) -> Optional[str]:
        """Answer queries like 'how many students have attendance > 80%' or
        'list students with percentage above 90'."""
        q_lower = query.lower()

        # Detect a filter condition
        filter_op = None
        threshold = None
        threshold2 = None  # for 'between'

        for pattern, op in FILTER_PATTERNS:
            m = re.search(pattern, q_lower)
            if m:
                filter_op = op
                if op == "between":
                    threshold = float(m.group(1))
                    threshold2 = float(m.group(2))
                else:
                    threshold = float(m.group(2))
                break

        if filter_op is None:
            return None

        # Find the column to filter on
        table_key, column = self._match_column(query, "max")
        if table_key is None or column is None:
            # Try first table, % column
            table_key = next(iter(self.tables), None)
            if table_key is None:
                return None
            # Look for a % or percentage column
            for h in self.headers[table_key]:
                if h.strip() in ('%', 'Percentage', 'percentage', 'Attendance'):
                    column = h
                    break
            if column is None:
                return None

        rows = self.tables[table_key]
        label_col = self._find_label_column(rows)

        # Apply the filter
        matching = []
        for r in rows:
            val = self._to_float(r.get(column))
            if val is None:
                continue
            label = str(r.get(label_col, "?")).strip() if label_col else "?"

            if filter_op == "gte" and val >= threshold:
                matching.append((label, val))
            elif filter_op == "lte" and val <= threshold:
                matching.append((label, val))
            elif filter_op == "eq" and abs(val - threshold) < 0.01:
                matching.append((label, val))
            elif filter_op == "between" and threshold <= val <= threshold2:
                matching.append((label, val))

        matching.sort(key=lambda x: x[1], reverse=True)
        col_clean = column.strip()

        # Detect if query asks "how many" (count) or "list/who" (list names)
        wants_count = bool(re.search(r"(how many|count|number of)", q_lower))
        op_label = {
            "gte": f"greater than or equal to {threshold}",
            "lte": f"less than or equal to {threshold}",
            "eq": f"equal to {threshold}",
            "between": f"between {threshold} and {threshold2}",
        }[filter_op]

        if wants_count:
            answer = f"{len(matching)} students have {col_clean} {op_label}."
            if matching and len(matching) <= 20:
                names = ", ".join(f"{lbl} ({v})" for lbl, v in matching[:10])
                answer += f"\n\nThey are: {names}"
                if len(matching) > 10:
                    answer += f" ... and {len(matching) - 10} more."
            return answer
        else:
            # List them
            if not matching:
                return f"No students found with {col_clean} {op_label}."
            lines = [f"Students with {col_clean} {op_label} ({len(matching)} found):"]
            for i, (lbl, v) in enumerate(matching[:20], 1):
                lines.append(f"  {i}. {lbl} β€” {v}")
            if len(matching) > 20:
                lines.append(f"  ... and {len(matching) - 20} more.")
            return "\n".join(lines)

    # ── Internal helpers ─────────────────────────────────────────────────────
    @staticmethod
    def _extract_claimed_value(query: str) -> Optional[float]:
        """Extract a numeric value the user is claiming/asserting in the query.
        e.g. 'is 22PA1A0501 has attendance percentage of 90%' β†’ 90.0
             'does X have 85 marks' β†’ 85.0
        Only triggers for verification-style queries (is/does/has/did/correct/true).
        """
        q_lower = query.lower()
        # Only look for claimed values in verification-style queries
        verification_words = ('is ', 'does ', 'has ', 'did ', 'had ', 'correct', 'true', 'right')
        if not any(q_lower.startswith(w) or w in q_lower for w in verification_words):
            return None
        # Extract numbers from the query (skip roll-number-like patterns)
        numbers = re.findall(r'(?<!\w)(\d+\.?\d*)%?(?!\w*[A-Za-z])', query)
        # Filter out roll-number-like values (long alphanumeric codes)
        roll_pattern = re.compile(r'\d{2}[A-Za-z]{2}\d[A-Za-z]\d{4}')
        roll_numbers = roll_pattern.findall(query)
        roll_digits = set()
        for rn in roll_numbers:
            roll_digits.update(re.findall(r'\d+', rn))
        # Return the last number that isn't part of a roll number
        for num_str in reversed(numbers):
            if num_str not in roll_digits:
                try:
                    return float(num_str)
                except ValueError:
                    continue
        return None

    @staticmethod
    def _stem(word: str) -> str:
        """Cheap suffix stripping so 'students' matches 'student' etc."""
        w = word.lower()
        for suffix in ("ing", "tion", "ness", "ment", "ies", "es", "ed", "ly", "s"):
            if len(w) > len(suffix) + 2 and w.endswith(suffix):
                return w[: -len(suffix)]
        return w

    def _detect_operation(self, query: str) -> Optional[str]:
        q = query.lower()
        for pattern, op in AGGREGATE_PATTERNS:
            if re.search(pattern, q):
                return op
        return None

    # Map short / symbolic column names to query-friendly aliases
    COLUMN_ALIASES = {
        '%': {'percentage', 'percent', 'attendance', 'rate'},
        'total': {'total', 'overall', 'sum', 'aggregate'},
        'p&s': {'p&s', 'ps', 'p and s', 'probability', 'p s'},
    }

    def _match_column(self, query: str, op: str = None):
        """Find which table + column the query is about.

        Uses stemming, substring matching, and alias expansion so that
        e.g. 'students' matches 'Student Name', 'attendance percentage'
        matches the '%' column, etc.
        """
        q_lower = query.lower()
        q_stems = {self._stem(w) for w in re.findall(r'\w+', q_lower) if len(w) > 2}
        q_words = set(re.findall(r'\w+', q_lower))

        best_score = 0.0
        best_table = None
        best_col = None

        for tkey, headers in self.headers.items():
            for col in headers:
                col_lower = col.lower().strip()
                col_words = set(re.findall(r'\w+', col_lower))
                col_stems = {self._stem(w) for w in col_words}

                # --- Check aliases for short/symbolic column names ---
                aliases = set()
                for alias_key, alias_set in self.COLUMN_ALIASES.items():
                    if col_lower == alias_key or col_lower in alias_set:
                        aliases = alias_set
                        break

                alias_hits = len(q_words & aliases) if aliases else 0

                if alias_hits > 0:
                    # Strong match via alias
                    score = 0.9 + alias_hits * 0.05
                elif not col_stems:
                    continue
                else:
                    # Method 1: stem-based overlap
                    stem_overlap = len(q_stems & col_stems)
                    score1 = stem_overlap / len(col_stems) if col_stems else 0

                    # Method 2: substring match (skip 1-char stems to avoid false positives)
                    sub_hits = 0
                    for qw in q_stems:
                        if any(
                            (qw in cw or cw in qw) and len(cw) > 1 and len(qw) > 1
                            for cw in col_stems
                        ):
                            sub_hits += 1
                    score2 = sub_hits / len(col_stems) if col_stems else 0

                    score = max(score1, score2)

                # For numeric aggregations, prefer numeric columns
                if op in ("max", "min", "avg", "sum", "sort") and score > 0:
                    rows = self.tables[tkey]
                    sample_val = rows[0].get(col) if rows else None
                    if self._to_float(sample_val) is not None:
                        score += 0.1  # small boost for numeric cols

                if score > best_score:
                    best_score = score
                    best_table = tkey
                    best_col = col

        if best_score < 0.25:
            return None, None
        return best_table, best_col

    def _to_float(self, val) -> Optional[float]:
        """Try to parse a cell value as float."""
        if val is None:
            return None
        s = str(val).strip().replace("%", "").replace(",", "").replace("$", "")
        try:
            return float(s)
        except (ValueError, TypeError):
            return None

    def _find_label_column(self, rows: List[Dict]) -> Optional[str]:
        """Find the column that likely contains names/labels."""
        if not rows:
            return None
        # Prefer columns with 'name' in the header
        for col in rows[0]:
            if 'name' in col.lower():
                return col
        # Fallback: first column whose values are mostly non-numeric strings
        for col in rows[0]:
            non_num = sum(1 for r in rows[:10] if r.get(col) and self._to_float(r[col]) is None)
            if non_num > len(rows[:10]) * 0.5:
                return col
        return list(rows[0].keys())[0]

    def _compute(self, rows: List[Dict], column: Optional[str], op: str, query: str) -> Optional[str]:
        """Run the aggregate and build a natural-language answer."""
        label_col = self._find_label_column(rows)

        # For count, we can work without a numeric column
        if op == "count":
            total = len(rows)
            if column and column != label_col:
                # Count non-empty values in that column
                filled = sum(1 for r in rows if r.get(column) is not None and str(r.get(column)).strip())
                return f"There are {filled} entries with {column} values (out of {total} total rows)."
            return f"There are {total} entries/rows in the data."

        if column is None:
            return None

        # Extract numeric values paired with their labels
        pairs = []
        for r in rows:
            val = self._to_float(r.get(column))
            label = str(r.get(label_col, "?")).strip() if label_col else "?"
            if val is not None:
                pairs.append((label, val))

        if not pairs:
            return None

        col_clean = column.strip()

        if op == "max":
            pairs.sort(key=lambda x: x[1], reverse=True)
            winner = pairs[0]
            answer = f"{winner[0]} has the highest {col_clean} with a value of {winner[1]}."
            if len(pairs) > 1:
                answer += f" Followed by {pairs[1][0]} ({pairs[1][1]})"
                if len(pairs) > 2:
                    answer += f" and {pairs[2][0]} ({pairs[2][1]})"
                answer += "."
            return answer

        if op == "min":
            pairs.sort(key=lambda x: x[1])
            winner = pairs[0]
            answer = f"{winner[0]} has the lowest {col_clean} with a value of {winner[1]}."
            if len(pairs) > 1:
                answer += f" Followed by {pairs[1][0]} ({pairs[1][1]})"
                if len(pairs) > 2:
                    answer += f" and {pairs[2][0]} ({pairs[2][1]})"
                answer += "."
            return answer

        if op == "avg":
            vals = [v for _, v in pairs]
            avg = sum(vals) / len(vals)
            return f"The average {col_clean} is {avg:.2f} (across {len(vals)} entries)."

        if op == "sum":
            total = sum(v for _, v in pairs)
            return f"The total {col_clean} is {total:.2f} (across {len(pairs)} entries)."

        if op == "count":
            return f"There are {len(pairs)} entries with numeric {col_clean} values."

        if op == "sort":
            pairs.sort(key=lambda x: x[1], reverse=True)
            lines = [f"Ranking by {col_clean} (highest to lowest):"]
            for i, (lbl, val) in enumerate(pairs[:15], 1):
                lines.append(f"  {i}. {lbl} β€” {val}")
            if len(pairs) > 15:
                lines.append(f"  ... and {len(pairs) - 15} more.")
            return "\n".join(lines)

        return None

    # ── LLM-powered Data Analysis ────────────────────────────────────────────
    def _try_llm_analysis(self, query: str) -> Optional[str]:
        """Use Groq LLM to analyze structured data for complex questions
        that the pattern-based methods can't handle."""
        if Groq is None:
            return None

        from config.settings import GROQ_API_KEY, LLM_MODEL
        if not GROQ_API_KEY:
            return None

        # Build a compact data summary for the LLM
        data_context = self._build_data_context()
        if not data_context:
            return None

        prompt = f"""You are a data analyst. Answer the following question using ONLY the data provided below.
Be precise and use actual numbers from the data. If the answer cannot be determined from the data, say so.
Do not include file paths, source references, or [Source: ...] tags.
Give a clear, natural response.

DATA:
{data_context}

QUESTION: {query}

ANSWER:"""

        try:
            client = Groq(api_key=GROQ_API_KEY)
            response = client.chat.completions.create(
                model=LLM_MODEL,
                messages=[
                    {"role": "system", "content": "You are a precise data analyst. Answer only from the given data. Be concise and accurate."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1000,
                temperature=0.1
            )
            answer = response.choices[0].message.content.strip()
            if answer:
                return answer
        except Exception as e:
            print(f"LLM analysis error: {e}")

        return None

    def _build_data_context(self, max_rows: int = 80) -> str:
        """Convert stored tables into a compact text format for LLM context."""
        parts = []
        for tkey, rows in self.tables.items():
            headers = self.headers.get(tkey, [])
            if not rows:
                continue

            parts.append(f"Table: {tkey}")
            parts.append(f"Columns: {', '.join(headers)}")
            parts.append(f"Total rows: {len(rows)}")

            # Include data as CSV-like format (compact)
            parts.append("Data:")
            parts.append(" | ".join(headers))
            for r in rows[:max_rows]:
                vals = [str(r.get(h, "")) for h in headers]
                parts.append(" | ".join(vals))

            if len(rows) > max_rows:
                parts.append(f"... ({len(rows) - max_rows} more rows)")
            parts.append("")

        return "\n".join(parts)