import json
import re
import os
import sys
from typing import Dict, List, Any
from collections import defaultdict
from fuzzywuzzy import fuzz  # For fuzzy deduplication; pip install fuzzywuzzy python-Levenshtein
import anthropic
from dotenv import load_dotenv

# Add the project root to the path
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from database.supabase_manager import supabase_manager

# Load environment variables
load_dotenv()
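
# Illustrative driver (a minimal sketch, not the real entrypoint; the file
# naming convention "input_jsons/<user_id>.json" is an assumption based on
# reprocess_rejected_fact below, and the phone-number user_id is hypothetical):
#
#   import asyncio
#   extractor = JSONContextExtractor(chunk_size=100, overlap_size=20)
#   with open("input_jsons/+15551234567.json", encoding="utf-8") as f:
#       conversations = json.load(f)
#   new_facts = asyncio.run(
#       extractor.process_json(conversations, user_id="+15551234567")
#   )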

class JSONContextExtractor:
    def __init__(self, chunk_size: int = 100, overlap_size: int = 20):
        self.chunk_size = chunk_size
        self.overlap_size = overlap_size
        
        # Initialize Anthropic client
        self.anthropic_client = anthropic.Anthropic(
            api_key=os.getenv("ANTHROPIC_API_KEY")
        )
        
        # Set default model name
        self.model_name = "claude-3-5-sonnet-20240620"
        
        # Test Supabase connection
        if not supabase_manager.is_connected():
            print("⚠️  Supabase connection failed. Operations will be logged only.")
            self.db_enabled = False
        else:
            print("βœ… Supabase connection successful.")
            self.db_enabled = True

    def chunk_messages(self, messages: List[Dict[str, str]]) -> List[List[Dict[str, str]]]:
        """Split messages into overlapping chunks for LLM processing"""
        if not messages:
            return []
            
        chunks = []
        step = max(1, self.chunk_size - self.overlap_size)  # Guard against a non-positive step when overlap >= chunk size
        for i in range(0, len(messages), step):
            chunk = messages[i:i + self.chunk_size]
            if chunk:
                chunks.append(chunk)
            
            # Stop if we've reached the end
            if i + self.chunk_size >= len(messages):
                break
                
        return chunks
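
    # Worked example (illustrative): with chunk_size=100 and overlap_size=20 the
    # step is 80, so 250 messages produce chunks [0:100], [80:180], and
    # [160:250]; consecutive chunks share 20 messages of context.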

    def deduplicate_messages(self, messages: List[Dict]) -> List[Dict]:
        """Remove duplicate messages within a context using fuzzy matching"""
        seen = set()
        deduped = []
        for msg in messages:
            text = msg.get("message", "").strip().lower()
            if text:
                is_duplicate = any(fuzz.ratio(text, s) > 95 for s in seen)
                if not is_duplicate:
                    seen.add(text)
                    deduped.append(msg)
        return deduped
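
    # Note: fuzz.ratio is scored 0-100, so the >95 cutoff only collapses
    # near-identical resends; for a ~40-character message, a one-character
    # difference still scores roughly 99.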

    def prepare_comprehensive_llm_prompt(self, chunk: List[Dict[str, str]], user_id: str) -> str:
        """Prepare prompt for comprehensive LLM extraction from message chunk with ownership validation"""
        deduped_chunk = self.deduplicate_messages(chunk)
        context_text = "\n".join([
            f"[{msg.get('id', i)}] {msg.get('sender', 'Unknown')}: {msg.get('message', '')}" 
            for i, msg in enumerate(deduped_chunk)
        ])
        
        prompt = f"""
You are an expert information extractor for a personal assistant memory system. Extract ONLY personal information that specifically belongs to or is claimed by the target user "{user_id}".

TARGET USER: {user_id}

CONVERSATION CHUNK:
{context_text}

CRITICAL OWNERSHIP VALIDATION RULES:
1. ONLY extract information that is specifically ABOUT {user_id} or claimed BY {user_id}
2. REJECT information about other people, temporary locations, or general mentions
3. REQUIRE clear ownership indicators like "my", "I am", "I live", "my name is", etc.
4. REJECT casual mentions that don't indicate personal ownership
5. If unclear who the information belongs to, DO NOT extract it

VALID OWNERSHIP PATTERNS:
βœ… "{user_id} says: My address is..."
βœ… "I live at..." (when {user_id} is speaking)
βœ… "My phone number is..." (when {user_id} is speaking)  
βœ… "I was born in..." (when {user_id} is speaking)
βœ… "My wife/husband is..." (when {user_id} is speaking)

INVALID PATTERNS TO REJECT:
❌ "Let's meet at Mumbai" (temporary location, not personal address)
❌ "The office is in Bandra" (not personal address)
❌ "She lives in..." (someone else's information)
❌ Any location mentioned without clear personal ownership
❌ Business addresses, meeting locations, casual place mentions

MEMORY LAYERS TO EXTRACT (ONLY IF CLEARLY OWNED BY {user_id}):

LAYER 1 - Basic Personal Information (HIGHEST PRIORITY):
- Full names, ages, date of birth, nationality, gender, blood group (ONLY if about {user_id})
- Phone numbers, email addresses (ONLY if stated as THEIR contact info)
- Home addresses (ONLY if clearly stated as THEIR home/personal address)
- Relationship status, occupation, company details (ONLY if {user_id}'s info)
- Work location, car model, and work address also come under LAYER 1

LAYER 2 - Document Information (HIGH PRIORITY):
- DO NOT EXTRACT information that is not clearly specified. For example, if the user mentions an Aadhaar card but provides no card number or details, DO NOT EXTRACT it
- Government IDs, certificates (ONLY if {user_id}'s documents)
- Document numbers, policies, licenses (ONLY if belonging to {user_id})
- Credit cards, bank details (ONLY if {user_id}'s financial info)

LAYER 3 - Relationships & Contacts (MEDIUM PRIORITY):
- Family members, friends, colleagues (ONLY if {user_id}'s relationships AND names are mentioned)
- CONTACT DETAILS OF FAMILY/FRIENDS (ONLY if {user_id} is sharing about THEIR contacts)
- IMPORTANT: Only extract relationship if SPECIFIC NAMES are mentioned
- REJECT: Generic "my wife", "my husband" without names
- ACCEPT: "my wife Sarah", "my husband John"

LAYER 4 - Preferences & Instructions (LOWER PRIORITY):
- Food preferences, allergies (ONLY if {user_id}'s preferences)
- Favorite places, vendors (ONLY if {user_id}'s preferences)
- Habits, routines (ONLY if {user_id}'s personal habits)
- DO NOT put travel plans here unless they are long-term preferences
- DO NOT TREAT EVERYTHING AS A PREFERENCE; only extract one if {user_id} explicitly states or endorses it.

STRICT LAYER 3 RELATIONSHIP RULES:
- ONLY extract relationships if SPECIFIC NAMES are mentioned.
βœ… "My wife Sarah" β†’ Extract: relationship="spouse", name="Sarah"
βœ… "My husband John works at..." β†’ Extract: relationship="spouse", name="John"
βœ… "My brother Mike lives in..." β†’ Extract: relationship="brother", name="Mike"
❌ "My wife will come" β†’ REJECT (no name mentioned)
❌ "Thanks guys" β†’ REJECT (not a relationship statement)
❌ "I want to carry something for my 9 year old nephew" β†’ REJECT (no name mentioned)


OWNERSHIP VALIDATION EXAMPLES:
βœ… "My address is 402, Pinnacle Gold, Bandra" β†’ Extract as {user_id}'s address
❌ "Let's meet at Pinnacle Gold, Bandra" β†’ REJECT (not personal address)
βœ… "I live in Mumbai" β†’ Extract as {user_id}'s address  
❌ "Mumbai has good restaurants" β†’ REJECT (general comment, not personal info)
βœ… "My wife Sarah" β†’ Extract as {user_id}'s spouse with name "Sarah"
❌ "My wife will come" β†’ REJECT (no specific name mentioned)
❌ "Priya is coming" β†’ REJECT (unclear relationship/ownership)

CRITICAL EXTRACTION RULES:
1. NEVER extract relationship information without specific names.
2. NEVER extract information from casual conversation phrases
3. ALWAYS ensure the information directly belongs to {user_id}
4. FOCUS ON Layer 1, 2, 3 - avoid putting travel plans in Layer 1
5. For relationships: ONLY extract if both relationship type AND name are clear
6. IF THE USER SHARES THE CONTACT DETAILS OF A FAMILY MEMBER/FRIEND, ONLY EXTRACT THEM IF IT IS CLEAR THE DETAILS BELONG TO {user_id}'s contacts (e.g., "My wife's number is..." → save it as the spouse's phone number)

CONFIDENCE SCORING BASED ON OWNERSHIP:
- 0.9-1.0: Direct personal claims with clear ownership ("My address is...")
- 0.8-0.9: Clear attribution to {user_id} ("I live at...")  
- Below 0.8: REJECT (unclear ownership)

RESPONSE FORMAT (JSON only):
{{
  "Layer1": [
    {{
      "detail": {{"type": "field_type", "value": "extracted_value"}},
      "confidence": 0.95,
      "evidence": [
        {{"message_id": "id", "message_snippet": "relevant_part"}}
      ],
      "timestamp": "2025-09-13 00:00:00",
      "ownership_reason": "Clear personal claim by {user_id}"
    }}
  ],
  "Layer2": [...],
  "Layer3": [...],
  "Layer4": [...]
}}

FIELD VALUE SPECIFICATIONS:
- relationship_status: Use "married" or "single". If the user mentions "wife"/"husband", use "married"
- gender: Use "Male" or "Female"
- address: ONLY extract if clearly stated as the user's personal/home address, NOT a business or meeting location.
- spouse/spouse_name: Use ONLY if a specific name is mentioned (e.g., "My wife Sarah" → spouse_name: "Sarah")
- family_member: Use ONLY if a specific name is mentioned (e.g., "My brother Mike" → family_member: "Mike")
- phone_number: Use the exact format from the message.
- email: Use the exact email address as written.

PROHIBITED EXTRACTIONS:
❌ Do NOT extract relationships (e.g., "wife", "husband", "nephew") as standalone values without names
❌ Do NOT extract meeting locations as personal addresses
❌ Do NOT extract third-party information as user's information

REMEMBER: When in doubt about ownership or clarity, DO NOT extract. It's better to miss information than to incorrectly attribute someone else's details to {user_id}.

Only return valid JSON. If no clearly owned information found for a layer, return empty array for that layer.
"""
        return prompt
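
    # Each message is rendered for the LLM as "[<message_id>] <sender>: <text>",
    # e.g. "[msg_42] User: My address is 402, Pinnacle Gold, Bandra"
    # (the id "msg_42" here is hypothetical).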

    def call_llm_for_extraction(self, prompt: str) -> Dict:
        """Call Claude LLM for information extraction"""
        try:
            response = self.anthropic_client.messages.create(
                model=self.model_name,
                max_tokens=3000,  # Increased for comprehensive extraction
                temperature=0.1,
                messages=[{"role": "user", "content": prompt}]
            )
            
            # Extract text content from response
            response_text = response.content[0].text
            
            # Try to parse JSON response
            try:
                raw_data = json.loads(response_text)
                # Apply validation to filter out bad extractions
                validated_data = self._validate_extractions(raw_data)
                return validated_data
            except json.JSONDecodeError:
                print(f"Failed to parse LLM response as JSON: {response_text[:200]}...")
                return {}
                
        except Exception as e:
            print(f"LLM API error: {e}")
            return {}

    def _validate_extractions(self, raw_data: Dict) -> Dict:
        """Validate and filter out bad extractions"""
        validated_data = {}
        
        # Invalid values that should never be extracted
        invalid_relationship_values = {
            'wife', 'husband', 'nephew', 'niece', 'son', 'daughter', 'brother', 'sister',
            'mother', 'father', 'aunt', 'uncle', 'cousin', 'friend', 'colleague'
        }
        
        # Invalid evidence snippets that indicate bad extraction
        invalid_evidence_patterns = [
            'thanks guys', 'thank you', 'thanks', 'ok', 'okay', 'yes', 'no',
            'sure', 'great', 'perfect', 'sounds good', 'alright'
        ]
        
        for layer, nodes in raw_data.items():
            if not isinstance(nodes, list):
                continue  # Guard against malformed LLM output where a layer is not a list
            validated_nodes = []
            
            for node in nodes:
                if not isinstance(node, dict) or 'detail' not in node:
                    continue
                    
                detail = node['detail']
                fact_type = detail.get('type', '')
                fact_value = str(detail.get('value', '')).lower().strip()  # str() guards against non-string LLM values (e.g., numeric ages)
                evidence = node.get('evidence', [])
                
                # Skip invalid extractions
                skip_node = False
                
                # Check for invalid relationship extractions
                if fact_type in ['relationship', 'family_member', 'spouse'] and fact_value in invalid_relationship_values:
                    print(f"⚠️  Skipping invalid {fact_type}: '{fact_value}' (no specific name)")
                    skip_node = True
                
                # Check for invalid evidence snippets
                for ev in evidence:
                    snippet = ev.get('message_snippet', '').lower().strip()
                    if snippet in invalid_evidence_patterns:
                        print(f"⚠️  Skipping extraction from invalid evidence: '{snippet}'")
                        skip_node = True
                        break
                
                # Check for travel plans in Layer 1
                if layer == 'Layer1' and fact_type in ['travel_plan', 'flight_number', 'travel_date']:
                    print(f"⚠️  Moving {fact_type} from Layer1 to Layer4 (travel plans don't belong in basic info)")
                    # Move to Layer4 instead of rejecting
                    if 'Layer4' not in validated_data:
                        validated_data['Layer4'] = []
                    validated_data['Layer4'].append(node)
                    continue
                
                # Check confidence threshold
                confidence = node.get('confidence', 0)
                if confidence < 0.75:
                    print(f"⚠️  Skipping low confidence ({confidence}) extraction: {fact_type} = {fact_value}")
                    skip_node = True
                
                if not skip_node:
                    validated_nodes.append(node)
            
            if validated_nodes:
                validated_data[layer] = validated_nodes
        
        return validated_data
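
    # Illustrative pass/fail: {"detail": {"type": "spouse", "value": "wife"},
    # "confidence": 0.9} is dropped as a generic relationship with no name, while
    # the same node with value "Sarah" passes through unchanged.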

    async def store_in_db(self, extracted_nodes: Dict, user_id: str):
        """Store extracted nodes in Supabase DB using SupabaseManager
        Returns dict with newly_stored_nodes containing only facts that were actually added in this iteration"""
        if not self.db_enabled:
            print(f"\n=== EXTRACTED DATA FOR USER: {user_id} (DB DISABLED) ===")
            for layer, nodes in extracted_nodes.items():
                print(f"\n{layer}:")
                for i, node in enumerate(nodes, 1):
                    print(f"  {i}. Type: {node['detail']['type']}")
                    print(f"     Value: {node['detail']['value']}")
                    print(f"     Confidence: {node['confidence']}")
                    print(f"     Evidence: {len(node['evidence'])} message(s)")
            print("=" * 50)
            # Return all nodes as "newly stored" when DB is disabled (for testing), but filter empty layers
            filtered_extracted = {layer: nodes for layer, nodes in extracted_nodes.items() if nodes}
            return {"newly_stored_nodes": filtered_extracted, "stats": {"stored": 0, "duplicate": 0, "low_confidence_rejected": 0}}
        
        try:
            stored_nodes = 0
            duplicate_nodes = 0
            low_confidence_rejected = 0
            newly_stored_nodes = {}  # Track only nodes that were actually stored in this iteration
            
            # Minimum confidence threshold for ownership validation
            CONFIDENCE_THRESHOLD = 0.75
            
            for layer, nodes in extracted_nodes.items():
                layer_number = int(layer.replace("Layer", ""))
                newly_stored_nodes[layer] = []  # Initialize layer in newly stored nodes
                
                for node in nodes:
                    detail = node["detail"]
                    confidence = node["confidence"]
                    evidence = node["evidence"]
                    
                    # Apply confidence threshold - reject low confidence extractions
                    if confidence < CONFIDENCE_THRESHOLD:
                        low_confidence_rejected += 1
                        print(f"  ⚠️  Rejected low confidence {detail['type']}: {detail['value']} (confidence: {confidence:.2f})")
                        continue
                    
                    # Safely extract required fields
                    try:
                        fact_type = detail.get("type", "unknown")
                        fact_value = detail.get("value", "N/A")
                        
                        if fact_value == "N/A":
                            print(f"⚠️  Warning: Missing 'value' field in detail: {detail}")
                        
                        # Format concluded fact for human readability
                        concluded_fact = self._format_concluded_fact(user_id, fact_type, fact_value)
                        
                        # Store memory node directly in Supabase
                        node_id = await supabase_manager.store_memory_node(
                            user_phone=user_id,
                            layer=layer_number,
                            fact_type=fact_type,
                            content=fact_value,
                            concluded_fact=concluded_fact,
                            confidence=confidence,
                            evidence=evidence
                        )
                        
                        if node_id:
                            stored_nodes += 1
                            ownership_reason = node.get('ownership_reason', 'Ownership validated')
                            print(f"  βœ… Stored {fact_type}: {fact_value} (confidence: {confidence:.2f}) - {ownership_reason}")
                            # Add to newly stored nodes for this iteration
                            newly_stored_nodes[layer].append(node)
                        else:
                            duplicate_nodes += 1
                            print(f"  ⚠️  Failed to store {fact_type}: {fact_value}")
                            
                    except Exception as detail_error:
                        print(f"❌ Error processing detail: {str(detail_error)}")
                        print(f"❌ Detail structure: {detail}")
                        continue  # Skip this detail and continue with next one
            
            # Print summary
            print(f"\nπŸ“Š Database Summary for {user_id}:")
            print(f"   Stored: {stored_nodes} new nodes")
            print(f"   Failed: {duplicate_nodes} nodes")
            print(f"   Rejected (low confidence): {low_confidence_rejected} nodes")
            
            # Get user summary from database
            summary = await supabase_manager.get_user_summary(user_id)
            if summary:
                print(f"   Total nodes in DB: {summary.get('total_nodes', 0)}")
                print(f"   By status: Approved={summary.get('approved_nodes', 0)}, Pending={summary.get('pending_nodes', 0)}, Rejected={summary.get('rejected_nodes', 0)}")
                if 'layers' in summary:
                    print(f"   By layers: {summary['layers']}")
            
            # Return only the newly stored nodes from this iteration
            # Filter out empty layers for cleaner UI display
            filtered_newly_stored = {layer: nodes for layer, nodes in newly_stored_nodes.items() if nodes}
            
            return {
                "newly_stored_nodes": filtered_newly_stored,
                "stats": {
                    "stored": stored_nodes,
                    "duplicate": duplicate_nodes,
                    "low_confidence_rejected": low_confidence_rejected
                }
            }
            
        except Exception as e:
            print(f"❌ Database error: {e}")
            # Fallback to logging
            print(f"\n=== EXTRACTED DATA FOR USER: {user_id} (DB ERROR FALLBACK) ===")
            for layer, nodes in extracted_nodes.items():
                print(f"\n{layer}:")
                for i, node in enumerate(nodes, 1):
                    print(f"  {i}. Type: {node['detail']['type']}")
                    print(f"     Value: {node['detail']['value']}")
                    print(f"     Confidence: {node['confidence']}")
                    print(f"     Evidence: {len(node['evidence'])} message(s)")
            print("=" * 50)
            # Return all nodes as fallback when DB has errors, but filter empty layers
            filtered_extracted = {layer: nodes for layer, nodes in extracted_nodes.items() if nodes}
            return {"newly_stored_nodes": filtered_extracted, "stats": {"stored": 0, "duplicate": 0, "low_confidence_rejected": 0}}

    def _format_concluded_fact(self, user_id: str, fact_type: str, value: str) -> str:
        """Format a concluded fact in human-readable form"""
        
        # Common fact type mappings with ownership emphasis
        fact_formatters = {
            'phone': lambda u, v: f"📱 Phone number of {u} is {v}",
            'phone_number': lambda u, v: f"📱 Phone number of {u} is {v}",
            'email': lambda u, v: f"📧 Email address of {u} is {v}",
            'address': lambda u, v: f"🏠 Home address of {u} is {v}",
            'dob': lambda u, v: f"🎂 Date of birth of {u} is {v}",
            'name': lambda u, v: f"👤 Name of {u} is {v}",
            'age': lambda u, v: f"📅 Age of {u} is {v}",
            'occupation': lambda u, v: f"💼 Occupation of {u} is {v}",
            'company': lambda u, v: f"🏢 Company of {u} is {v}",
            'relationship_status': lambda u, v: f"💑 Relationship status of {u} is {v}",
            'gender': lambda u, v: f"⚧ Gender of {u} is {v}",
            'nationality': lambda u, v: f"🌍 Nationality of {u} is {v}",
            'document_type': lambda u, v: f"📄 {u} shared a document of type {v}",
            'aadhaar_number': lambda u, v: f"🆔 Aadhaar number of {u} is {v}",
            'pan_number': lambda u, v: f"📇 PAN number of {u} is {v}",
            'family_member': lambda u, v: f"👨‍👩‍👧‍👦 Family member of {u}: {v}",
            'friend': lambda u, v: f"👫 Friend of {u}: {v}",
            'colleague': lambda u, v: f"🤝 Colleague of {u}: {v}",
            'contact_name': lambda u, v: f"📞 Contact of {u}: {v}",
            'relationship_type': lambda u, v: f"💕 {v} of {u}",
            'food_preference': lambda u, v: f"🍽️ Food preference of {u}: {v}",
            'restaurant_preference': lambda u, v: f"🏪 Preferred restaurant of {u}: {v}",
            'allergy': lambda u, v: f"⚠️ {u} is allergic to {v}",
            'service_provider': lambda u, v: f"🔧 Service provider of {u}: {v}",
            'vendor_name': lambda u, v: f"🛒 Vendor used by {u}: {v}",
            'habit': lambda u, v: f"🔄 Personal habit of {u}: {v}",
            'routine': lambda u, v: f"📋 Routine of {u}: {v}",
        }
        
        # Use specific formatter or fallback to generic
        formatter = fact_formatters.get(fact_type, lambda u, v: f"{fact_type.replace('_', ' ').title()} of {u} is {v}")
        return formatter(user_id, value)
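
    # Example (hypothetical values): _format_concluded_fact("Ravi", "email",
    # "ravi@example.com") returns "📧 Email address of Ravi is ravi@example.com";
    # an unmapped type such as "blood_group" falls back to the generic form
    # "Blood Group of Ravi is ...".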

    def merge_extracted_data(self, all_extracted: List[Dict]) -> Dict:
        """Merge extracted data from multiple chunks, removing duplicates"""
        merged = {"Layer1": [], "Layer2": [], "Layer3": [], "Layer4": []}
        
        for extracted in all_extracted:
            for layer in ["Layer1", "Layer2", "Layer3", "Layer4"]:
                if layer in extracted:
                    merged[layer].extend(extracted[layer])
        
        # Deduplicate based on fact type and value similarity
        for layer in merged:
            merged[layer] = self._deduplicate_facts(merged[layer])
        
        return merged

    def _deduplicate_facts(self, facts: List[Dict]) -> List[Dict]:
        """Remove duplicate facts based on type and value similarity"""
        if not facts:
            return facts
            
        deduped = []
        for fact in facts:
            is_duplicate = False
            fact_type = fact["detail"]["type"]
            fact_value = fact["detail"]["value"].lower().strip()
            
            for existing in deduped:
                existing_type = existing["detail"]["type"]
                existing_value = existing["detail"]["value"].lower().strip()
                
                # Same type and high similarity
                if (fact_type == existing_type and 
                    fuzz.ratio(fact_value, existing_value) > 85):
                    is_duplicate = True
                    # Keep the one with higher confidence
                    if fact["confidence"] > existing["confidence"]:
                        deduped.remove(existing)
                        deduped.append(fact)
                    break
            
            if not is_duplicate:
                deduped.append(fact)
        
        return deduped
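
    # Illustrative: two "address" facts "402 Pinnacle Gold, Bandra" (0.90) and
    # "402, pinnacle gold bandra" (0.95) normalize to near-identical strings,
    # score above the 85 cutoff, and only the 0.95 entry survives.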
        
    def call_llm_for_deduplication(self, prompt: str) -> Dict:
        """Call LLM specifically for deduplication tasks"""
        try:
            response = self.anthropic_client.messages.create(
                model=self.model_name,
                max_tokens=4000,
                temperature=0.1,  # Low temperature for consistency
                messages=[
                    {"role": "user", "content": prompt}
                ],
                system="""You are a memory system deduplication and validation expert. Your job is to return a valid JSON with deduplicated facts AND validate that all facts make sense.

DEDUPLICATION RULES:
- REMOVE ANY DUPLICATE VALUES THAT ARE PRESENT, OR ANYTHING THAT LOOKS REPEATED.
- IF THERE ARE 2 SAME KEYS, WITH DIFFERENT VALUES, MAKE SURE YOU ONLY ALLOW ONE INSTANCE WITH THE HIGHEST CONFIDENCE SCORE.
For example, if the same phone number/address is present multiple times, keep only one instance with the highest confidence.

VALIDATION & SENSE-CHECKING RULES:
- REMOVE any facts that don't make logical sense (e.g., "age": "blue", "phone": "happy", "name": "12345")
- REMOVE any facts with impossible values (e.g., age over 150, phone numbers with letters, invalid email formats)
- REMOVE any facts that are clearly nonsensical or corrupted data
- REMOVE any facts that contain only special characters, random symbols, or gibberish
- REMOVE any facts where the value doesn't match the fact type (e.g., an address in a phone number field)
- REMOVE any facts with extremely low confidence scores (below 0.3) that also seem questionable
- KEEP facts that make sense even if they seem unusual (e.g., uncommon names, foreign addresses)

FORMATTING RULES:
1. Return ONLY the JSON data with NO explanatory text, NO markdown formatting, and NO code blocks
2. Your entire response must be parseable as JSON
3. Maintain the exact same structure as the input JSON
4. CRITICAL: Preserve ALL evidence snippets exactly as given in the input. DO NOT change snippets to "No snippet available" or any placeholder text.
5. Never add new facts or modify existing valid data beyond merging duplicates and removing nonsensical entries
6. If duplicates have different confidence scores, keep the one with higher confidence
7. If duplicates have the same confidence, merge their evidence lists completely with no loss of information"""
            )
            
            # Extract the response content as text
            if not response or not response.content:
                print("⚠️ Empty response from LLM")
                return {}
                
            content = response.content[0].text
            print(f"Raw LLM response (first 100 chars): {content[:100]}...")
            
            # Try multiple methods to extract JSON
            
            # Method 1: Try to find JSON in code blocks
            json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', content)
            if json_match:
                json_str = json_match.group(1)
                print("Found JSON in code block")
            else:
                # Method 2: Find first { and last }
                first_brace = content.find('{')
                last_brace = content.rfind('}')
                
                if first_brace != -1 and last_brace != -1 and first_brace < last_brace:
                    json_str = content[first_brace:last_brace+1]
                    print(f"Extracted JSON using brace matching: chars {first_brace} to {last_brace}")
                else:
                    # Method 3: Use the whole response
                    json_str = content
                    print("Using full response content for JSON parsing")
            
            # Clean up the JSON string
            json_str = re.sub(r'(?m)^//.*$', '', json_str)  # Remove comment lines
            json_str = json_str.strip()
            
            try:
                print(f"Attempting to parse JSON (length: {len(json_str)})")
                result = json.loads(json_str)
                print("βœ… Successfully parsed JSON")
                return result
            except json.JSONDecodeError as e:
                print(f"⚠️ Failed to decode JSON from LLM response: {e}")
                print(f"JSON string start: {json_str[:200]}...")
                print(f"JSON string end: ...{json_str[-200:]}")
                
                # Fallback to raw extraction
                try:
                    # Attempt to rescue the response using regex
                    layer_pattern = r'"(Layer\d+)":\s*\[([\s\S]*?)\]'
                    layers = re.findall(layer_pattern, json_str)
                    
                    if layers:
                        print("πŸ› οΈ Attempting fallback regex extraction")
                        result = {}
                        for layer_name, layer_content in layers:
                            result[layer_name] = []
                        return result
                except Exception as regex_error:
                    print(f"Regex fallback failed: {regex_error}")
                    
                return {}
                
        except Exception as e:
            print(f"⚠️ Error calling LLM for deduplication: {e}")
            return {}

    async def deduplicate_with_llm(self, extracted_data: Dict, user_id: str) -> Dict:
        """Use LLM to deduplicate facts and merge evidence when appropriate
        
        This takes the raw extraction results and has the LLM analyze them to:
        1. Remove exact duplicates, keeping the highest confidence version
        2. Merge duplicates with same confidence by combining their evidence
        3. Maintain the 4-layer memory structure
        """
        if not extracted_data:
            print(f"No data to deduplicate for user {user_id}")
            return {}
            
        # Convert extracted data to a format suitable for LLM processing
        facts_by_layer = {}
        for layer, nodes in extracted_data.items():
            facts_by_layer[layer] = []
            for node in nodes:
                # Create a simplified representation for LLM
                evidence_snippets = []
                for ev in node['evidence']:
                    evidence_snippets.append({
                        'message_id': ev.get('message_id', 'unknown'),
                        'snippet': ev.get('message_snippet', ev.get('snippet', ''))  # Handle both formats
                    })
                    
                fact = {
                    'type': node['detail']['type'],
                    'value': node['detail']['value'],
                    'confidence': node['confidence'],
                    'evidence': evidence_snippets
                }
                facts_by_layer[layer].append(fact)
        
        # Create prompt for LLM deduplication and validation
        prompt = f"""
Review, validate, and deduplicate the following memory facts for user {user_id}.

DEDUPLICATION INSTRUCTIONS:
1. PHONE NUMBER DEDUPLICATION: If there are multiple phone numbers for the same person, keep ONLY the one with the highest confidence score. Different formats of the same number (+91 prefix vs without) should be considered duplicates.
2. ADDRESS DEDUPLICATION: If there are multiple addresses for the same person, keep ONLY the one with the highest confidence score. Similar addresses should be considered duplicates.
3. EMAIL DEDUPLICATION: If there are multiple emails, keep all distinct email addresses as people often have multiple emails.
4. RELATIONSHIP DEDUPLICATION: Remove any relationship entries that are just generic terms like "wife", "husband", "nephew" without specific names. Only keep relationships with actual names.
5. GENERAL RULE: For all other fact types, remove duplicates keeping only the highest confidence version for each unique fact.

VALIDATION & SENSE-CHECKING INSTRUCTIONS:
6. DATA TYPE VALIDATION: Remove facts where the value doesn't match the expected data type:
   - Names should be text, not numbers or symbols
   - Phone numbers should contain only digits, spaces, +, -, (, )
   - Ages should be reasonable numbers (0-150)
   - Emails should have valid email format
   - Addresses should be coherent location text
7. LOGICAL SENSE CHECK: Remove facts that are clearly nonsensical:
   - Random character combinations (e.g., "asdfgh", "123!@#")
   - Values that are clearly system errors or corrupted data
   - Facts with impossible combinations (e.g., age: "red", name: "999999")
8. CONFIDENCE FILTERING: Remove facts with very low confidence (<0.3) that also seem questionable or corrupted
9. PRESERVE UNUSUAL BUT VALID: Keep facts that seem unusual but are logically valid (foreign names, uncommon addresses, etc.)

STRUCTURAL RULES:
10. If duplicates have identical confidence scores, merge their evidence lists into one comprehensive fact.
11. Maintain the 4-layer structure (Layer1: Basic Personal Info, Layer2: Documents, Layer3: Relationships, Layer4: Preferences).
12. Do not add, remove, or modify any actual data values that are valid - simply reorganize, deduplicate, and remove nonsensical entries.
13. CRITICAL: Preserve ALL evidence snippets exactly as given in the input. DO NOT replace snippets with "No snippet available" or any placeholder text.
14. Return ONLY a valid JSON with no additional text or explanation.

IMPORTANT: Your response must be ONLY the cleaned and deduplicated JSON data with nothing else. Do not include any explanations, markdown formatting, or text outside the JSON. Return the JSON directly with all evidence snippets preserved exactly as provided.

When processing evidence, maintain the exact field names as provided in input (message_id and either 'snippet' or 'message_snippet').

RELATIONSHIP FILTERING RULES:
- REMOVE: "relationship": "wife" (no name specified)
- REMOVE: "family_member": "nephew" (no name specified)  
- KEEP: "spouse": "Sarah" (specific name provided)
- KEEP: "family_member": "Mike Johnson" (specific name provided)

PHONE NUMBER DEDUPLICATION EXAMPLE:
If the same person has multiple phone number entries, e.g. the same number "9870781578" at confidences 0.95, 0.90, and 0.85, keep ONLY the single entry with the highest confidence (0.95).

Input facts: {json.dumps(facts_by_layer, indent=2)}
"""

        try:
            # Call LLM for deduplication
            deduplicated_data = self.call_llm_for_deduplication(prompt)
            
            # Verify we got a valid response
            if not deduplicated_data:
                print("⚠️ No valid data from LLM deduplication. Applying fallback deduplication.")
                return self._apply_fallback_deduplication(extracted_data, user_id)
                
            # Verify that we have all the expected layers
            missing_layers = set(extracted_data.keys()) - set(deduplicated_data.keys())
            if missing_layers:
                print(f"⚠️ LLM response is missing layers: {missing_layers}. Applying fallback deduplication.")
                return self._apply_fallback_deduplication(extracted_data, user_id)
            
            print(f"\nπŸ” LLM deduplication complete for {user_id}")
            
            # Count before/after
            before_count = sum(len(nodes) for nodes in extracted_data.values())
            after_count = sum(len(nodes) for nodes in deduplicated_data.values())
            
            # Safety check - if LLM removed all facts, something went wrong
            if after_count == 0 and before_count > 0:
                print("⚠️ LLM deduplication removed ALL facts! Applying fallback deduplication.")
                return self._apply_fallback_deduplication(extracted_data, user_id)
                
            print(f"   Facts before: {before_count}, after: {after_count}, removed: {before_count - after_count}")
            
            # Convert back to the original format structure
            result = {}
            for layer, facts in deduplicated_data.items():
                result[layer] = []
                for fact in facts:
                    # Handle potential missing keys with defaults
                    if 'type' not in fact or 'value' not in fact:
                        print(f"⚠️ Skipping malformed fact (missing type/value): {fact}")
                        continue
                        
                    # Reconstruct evidence in the expected format - strictly preserve original snippets
                    evidence = []
                    if 'evidence' in fact and isinstance(fact['evidence'], list):
                        for ev in fact['evidence']:
                            message_id = ev.get('message_id', 'unknown')
                            # Handle both 'snippet' and 'message_snippet' from LLM responses
                            snippet = ev.get('snippet', ev.get('message_snippet', ''))
                            
                            # Ensure snippet is preserved exactly as in the original
                            if not snippet or snippet == "No snippet available":
                                print(f"⚠️ Warning: Missing or placeholder snippet for message {message_id}")
                                
                            evidence.append({
                                'message_id': message_id,
                                'snippet': snippet
                            })
                    else:
                        print(f"⚠️ Warning: Missing evidence list in fact: {fact['type']}: {fact['value']}")
                        # Create minimal evidence to avoid UI errors
                        evidence = [{'message_id': 'unknown', 'snippet': 'Evidence missing during deduplication'}]
                    
                    # Reconstruct node in the expected format
                    node = {
                        'detail': {
                            'type': fact['type'],
                            'value': fact['value']
                        },
                        'confidence': fact.get('confidence', 0.8),  # Default if missing
                        'evidence': evidence
                    }
                    result[layer].append(node)
            
            return result
        except Exception as e:
            print(f"⚠️ Error during LLM deduplication: {e}")
            print("Applying fallback deduplication to ensure duplicates are removed")
            return self._apply_fallback_deduplication(extracted_data, user_id)

    def _apply_fallback_deduplication(self, extracted_data: Dict, user_id: str) -> Dict:
        """Apply rule-based deduplication when LLM deduplication fails
        
        This ensures that deduplication ALWAYS happens, even if LLM fails.
        """
        print(f"\nπŸ”§ Applying fallback deduplication for {user_id}")
        
        result = {}
        total_removed = 0
        
        for layer, nodes in extracted_data.items():
            if not nodes:
                result[layer] = []
                continue
                
            # Apply rule-based deduplication with special handling for phone numbers
            deduplicated_nodes = self._deduplicate_facts_enhanced(nodes, layer)
            result[layer] = deduplicated_nodes
            
            removed_count = len(nodes) - len(deduplicated_nodes)
            total_removed += removed_count
            
            if removed_count > 0:
                print(f"   {layer}: {len(nodes)} β†’ {len(deduplicated_nodes)} facts (removed {removed_count} duplicates)")
        
        before_count = sum(len(nodes) for nodes in extracted_data.values())
        after_count = sum(len(nodes) for nodes in result.values())
        
        print(f"   Facts before: {before_count}, after: {after_count}, removed: {total_removed}")
        print(f"βœ… Fallback deduplication complete for {user_id}")
        
        return result
    
    def _deduplicate_facts_enhanced(self, facts: List[Dict], layer: str) -> List[Dict]:
        """Enhanced rule-based deduplication with special handling for different fact types"""
        if not facts:
            return []
        
        deduped = []
        
        # Group facts by type for specialized deduplication
        facts_by_type = {}
        for fact in facts:
            fact_type = fact['detail']['type']
            if fact_type not in facts_by_type:
                facts_by_type[fact_type] = []
            facts_by_type[fact_type].append(fact)
        
        # Apply type-specific deduplication rules
        for fact_type, type_facts in facts_by_type.items():
            if fact_type == 'phone_number':
                # For phone numbers: keep only the highest confidence one
                deduplicated = self._deduplicate_phone_numbers(type_facts)
            elif fact_type in ['address', 'email']:
                # For addresses and emails: more sophisticated deduplication
                deduplicated = self._deduplicate_contact_info(type_facts, fact_type)
            else:
                # For other types: use standard fuzzy matching
                deduplicated = self._deduplicate_facts(type_facts)
            
            deduped.extend(deduplicated)
        
        return deduped
    
    def _deduplicate_phone_numbers(self, phone_facts: List[Dict]) -> List[Dict]:
        """Keep only the highest confidence phone number"""
        if not phone_facts:
            return []
        
        # Sort by confidence (highest first)
        sorted_phones = sorted(phone_facts, key=lambda x: x['confidence'], reverse=True)
        
        # Keep only the highest confidence phone number
        best_phone = sorted_phones[0]
        
        print(f"      Phone deduplication: {len(phone_facts)} β†’ 1 (kept highest confidence: {best_phone['detail']['value']})")
        
        return [best_phone]
    
    def _deduplicate_contact_info(self, contact_facts: List[Dict], fact_type: str) -> List[Dict]:
        """Deduplicate addresses/emails with fuzzy matching"""
        if not contact_facts:
            return []
        
        deduped = []
        
        for fact in contact_facts:
            fact_value = fact['detail']['value'].lower().strip()
            is_duplicate = False
            
            for existing in deduped:
                existing_value = existing['detail']['value'].lower().strip()
                
                # For emails: exact match after normalization
                if fact_type == 'email':
                    if fact_value == existing_value:
                        is_duplicate = True
                        break
                
                # For addresses: fuzzy matching
                elif fact_type == 'address':
                    if (fact_value == existing_value or 
                        fuzz.ratio(fact_value, existing_value) > 85):
                        is_duplicate = True
                        # Keep the one with higher confidence
                        if fact["confidence"] > existing["confidence"]:
                            deduped.remove(existing)
                            deduped.append(fact)
                        break
            
            if not is_duplicate:
                deduped.append(fact)
        
        if len(contact_facts) != len(deduped):
            print(f"      {fact_type} deduplication: {len(contact_facts)} β†’ {len(deduped)}")
        
        return deduped

    async def process_json(self, input_json: List[Dict], user_id: str, force_reprocess: bool = False) -> Dict:
        """Process the JSON: chunk messages, call LLM for each chunk, merge and store results"""
        
        # Check if file has already been processed (unless force reprocess)
        if not force_reprocess and self.db_enabled:
            already_processed = await supabase_manager.is_file_processed(user_id)
            if already_processed:
                print(f"⏭️  {user_id} already processed. Skipping. Use force_reprocess=True to reprocess.")
                return {"message": "Already processed", "skipped": True}
        
        # Flatten messages, preserving order and adding metadata
        all_messages = []
        for conv_idx, conv in enumerate(input_json):
            for query in conv.get("user_queries", []):
                query = query.copy()
                query["sender"] = "User"
                query["conversation_id"] = conv_idx
                # Preserve original message_id, use as id for processing
                query["id"] = query.get("message_id", f"unknown_user_{conv_idx}")
                all_messages.append(query)
            for reply in conv.get("team_replies", []):
                reply = reply.copy()
                reply["sender"] = "Team"
                reply["conversation_id"] = conv_idx
                # Preserve original message_id, use as id for processing
                reply["id"] = reply.get("message_id", f"unknown_team_{conv_idx}")
                all_messages.append(reply)

        print(f"Processing {len(all_messages)} messages for user {user_id}")
        
        # Create chunks for processing
        chunks = self.chunk_messages(all_messages)
        print(f"Split into {len(chunks)} chunks (chunk_size={self.chunk_size}, overlap={self.overlap_size})")
        
        # Process each chunk
        all_extracted_data = []
        for i, chunk in enumerate(chunks):
            print(f"\nProcessing chunk {i+1}/{len(chunks)} ({len(chunk)} messages)")
            
            prompt = self.prepare_comprehensive_llm_prompt(chunk, user_id)
            extracted_data = self.call_llm_for_extraction(prompt)
            
            if extracted_data:
                # Count nodes in this chunk
                total_nodes = sum(len(nodes) for nodes in extracted_data.values() if isinstance(nodes, list))
                print(f"  Extracted {total_nodes} nodes from chunk {i+1}")
                all_extracted_data.append(extracted_data)
            else:
                print(f"  No data extracted from chunk {i+1}")
        
        # Merge all extracted data and remove duplicates
        if all_extracted_data:
            merged_data = self.merge_extracted_data(all_extracted_data)
            
            # Print summary
            total_nodes = sum(len(nodes) for nodes in merged_data.values())
            print(f"\nMerged Results for user {user_id}: {total_nodes} total nodes")
            for layer, nodes in merged_data.items():
                if nodes:
                    print(f"  {layer}: {len(nodes)} nodes")
            
            # Apply LLM deduplication to get a refined version with no duplicates
            print("\n🧹 Applying LLM deduplication to remove duplicates and merge evidence...")
            try:
                deduplicated_data = await self.deduplicate_with_llm(merged_data, user_id)
                print("βœ… LLM deduplication completed successfully")
            except Exception as dedup_error:
                print(f"❌ Error during LLM deduplication: {str(dedup_error)}")
                raise dedup_error
            
            # Print deduplication summary
            dedup_total_nodes = sum(len(nodes) for nodes in deduplicated_data.values())
            print(f"\nDeduplicated Results for user {user_id}: {dedup_total_nodes} total nodes")
            for layer, nodes in deduplicated_data.items():
                if nodes:
                    print(f"  {layer}: {len(nodes)} nodes")
            
            # Store deduplicated data in database
            print(f"\nπŸ’Ύ Storing {dedup_total_nodes} deduplicated nodes in database...")
            try:
                storage_result = await self.store_in_db(deduplicated_data, user_id)
                print("βœ… Database storage completed successfully")
            except Exception as storage_error:
                print(f"❌ Error during database storage: {str(storage_error)}")
                raise storage_error
            
            # Mark file as processed
            if self.db_enabled:
                print(f"\nπŸ“ Marking file as processed for user {user_id}...")
                try:
                    await supabase_manager.mark_file_processed(user_id, dedup_total_nodes)
                    print("βœ… File marked as processed successfully")
                except Exception as mark_error:
                    print(f"❌ Error marking file as processed: {str(mark_error)}")
                    raise mark_error
            
            # Return only the newly stored facts from this iteration
            newly_stored_facts = storage_result.get("newly_stored_nodes", {})
            print(f"\n🎯 Returning {sum(len(nodes) for nodes in newly_stored_facts.values())} newly stored facts for this iteration")
            
            return newly_stored_facts
        else:
            print(f"No data extracted for user {user_id}")
            return {}
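
    # The returned dict mirrors the extraction shape (values illustrative only):
    # {"Layer1": [{"detail": {"type": "phone_number", "value": "98..."},
    #              "confidence": 0.95, "evidence": [...]}], "Layer3": [...]}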

    async def reprocess_rejected_fact(self, user_id: str, fact_type: str, layer: str, 
                               excluded_message_ids: List[str], original_node_id: str) -> Dict:
        """Reprocess a specific rejected fact type by re-analyzing with focused prompt"""
        
        # Load the user's JSON file
        input_path = os.path.join("input_jsons", f"{user_id}.json")
        if not os.path.exists(input_path):
            print(f"Warning: {input_path} not found for reprocessing.")
            return {}
        
        with open(input_path, 'r', encoding='utf-8') as f:
            input_data = json.load(f)
        
        # Flatten messages, preserving order
        all_messages = []
        for conv in input_data:
            for query in conv.get("user_queries", []):
                query = query.copy()
                query["sender"] = "User"
                # Preserve original message_id, use as id for processing
                query["id"] = query.get("message_id", "unknown_user")
                all_messages.append(query)
            for reply in conv.get("team_replies", []):
                reply = reply.copy()
                reply["sender"] = "Team"
                # Preserve original message_id, use as id for processing
                reply["id"] = reply.get("message_id", "unknown_team")
                all_messages.append(reply)
        
        # Filter out messages that were used in the original (rejected) extraction
        filtered_messages = [
            msg for msg in all_messages 
            if msg.get("id") not in excluded_message_ids
        ]
        
        print(f"Reprocessing {fact_type} for {user_id}")
        print(f"Original context had {len(excluded_message_ids)} messages, searching in {len(filtered_messages)} remaining messages")
        
        # Create chunks from filtered messages
        chunks = self.chunk_messages(filtered_messages)
        
        # Normalize layer identifiers such as "layer_1", "Layer1", or "layer1" to the digit used in the "LayerN" key
        layer_number = layer.split("_")[-1] if "_" in layer else layer.replace("layer", "").replace("Layer", "")
        layer_key = f"Layer{layer_number}"
        extracted_data = {layer_key: []}
        
        print(f"Processing {len(chunks)} chunks for reprocessing")
        
        for i, chunk in enumerate(chunks):
            print(f"  Reprocessing chunk {i+1}: {len(chunk)} messages")
            
            # Create a specialized prompt for reprocessing
            prompt = self.prepare_reprocessing_prompt(chunk, layer_key, fact_type)
            llm_output = self.call_llm_for_extraction(prompt) or {}  # guard against a failed or empty extraction
            nodes = llm_output.get(layer_key, [])
            
            if nodes:
                print(f"    Extracted {len(nodes)} nodes on reprocessing")
                extracted_data[layer_key].extend(nodes)
            else:
                print(f"    No nodes extracted on reprocessing")
        
        # Deduplicate the reprocessed results
        if extracted_data[layer_key]:
            extracted_data[layer_key] = self._deduplicate_facts(extracted_data[layer_key])
            await self.store_reprocessed_nodes(extracted_data, user_id, original_node_id)
        else:
            print(f"Reprocessing complete: No {fact_type} found in alternative contexts")
        
        return extracted_data

    def prepare_reprocessing_prompt(self, chunk: List[Dict[str, str]], layer: str, fact_type: str) -> str:
        """Prepare specialized prompt for reprocessing rejected facts"""
        deduped_chunk = self.deduplicate_messages(chunk)
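        # Render each message as "[message_id] Sender: text" so the LLM can cite message IDs as evidence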
        context_text = "\n".join([
            f"[{msg.get('id', i)}] {msg.get('sender', 'Unknown')}: {msg.get('message', '')}" 
            for i, msg in enumerate(deduped_chunk)
        ])
        layer_number = layer.replace("Layer", "")
        
        layer_descriptions = {
            "1": "Basic Personal Information (name, age, address, phone, email, DOB, nationality, gender, blood group, relationship status)",
            "2": "Information from Documents Shared (Aadhaar card, PAN card, driving license, voter ID, birth certificate, insurance, rent agreement, utility bills)",
            "3": "Loved Ones & Relations (family members, friends, colleagues, roommates, partners with their names, relationships, and contact details)",
            "4": "Preferences, Vendors, Standing Instructions (food preferences, favorite restaurants, service providers, vendors, habits, routines, standing orders)"
        }
        
        prompt = f"""
You are reprocessing a REJECTED extraction for a personal assistant memory system. A previous extraction of "{fact_type}" was rejected by reviewers.

LAYER {layer_number}: {layer_descriptions[layer_number]}

SPECIFIC TASK: Find "{fact_type}" information in this conversation chunk with extra care and precision.

CONVERSATION CHUNK:
{context_text}

INSTRUCTIONS:
1. Focus ONLY on finding clear, unambiguous "{fact_type}" information
2. This is a REPROCESSING attempt - be more careful and precise than usual
3. Look for casual conversational patterns, not just formal statements
4. Only extract if you find very clear evidence with high confidence (0.8+)
5. If not found clearly, return empty results
6. Reference specific message IDs as evidence

RESPONSE FORMAT (JSON only):
{{
  "{layer}": [
    {{
      "detail": {{"type": "{fact_type}", "value": "extracted_value"}},
      "confidence": 0.85,
      "evidence": [
        {{"message_id": "id", "message_snippet": "relevant_part_of_message"}}
      ],
      "timestamp": "2025-09-13 00:00:00"
    }}
  ]
}}

Only return JSON. If no clear "{fact_type}" information is found, return empty array.
"""
        return prompt

    async def store_reprocessed_nodes(self, extracted_nodes: Dict, user_id: str, original_node_id: str):
        """Store reprocessed nodes with link to original rejected node"""
        if not self.db_enabled:
            print(f"\n=== REPROCESSED DATA FOR USER: {user_id} (DB DISABLED) ===")
            for layer, nodes in extracted_nodes.items():
                print(f"\n{layer}:")
                for i, node in enumerate(nodes, 1):
                    print(f"  {i}. Type: {node['detail']['type']}")
                    print(f"     Value: {node['detail']['value']}")
                    print(f"     Confidence: {node['confidence']}")
                    print(f"     Evidence: {len(node['evidence'])} message(s)")
                    print(f"     Original Node ID: {original_node_id}")
            print("=" * 50)
            return
        
        try:
            stored_nodes = 0
            
            for layer, nodes in extracted_nodes.items():
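                # Convert "LayerN" keys (e.g. "Layer3") into the integer layer number expected by storage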
                layer_number = int(layer.replace("Layer", ""))
                
                for node in nodes:
                    detail = node["detail"]
                    confidence = node["confidence"]
                    evidence = node["evidence"]
                    
                    # Safely extract required fields  
                    try:
                        fact_type = detail.get("type", "unknown")
                        fact_value = detail.get("value", "N/A")
                        
                        if fact_value == "N/A":
                            print(f"⚠️  Warning: Missing 'value' field in detail: {detail}")
                    
                        # Format concluded fact for human readability
                        concluded_fact = self._format_concluded_fact(user_id, fact_type, fact_value)
                        
                        # Store reprocessed memory node with link to original
                        node_id = await supabase_manager.store_memory_node(
                            user_phone=user_id,
                            layer=layer_number,
                            fact_type=fact_type,
                            content=fact_value,
                            concluded_fact=concluded_fact,
                            confidence=confidence,
                            evidence=evidence,
                            extraction_method='reprocess',
                            parent_update_id=original_node_id
                        )
                        
                        if node_id:
                            stored_nodes += 1
                            print(f"  βœ… Stored reprocessed {fact_type}: {fact_value} (Node ID: {node_id})")
                        else:
                            print(f"  ⚠️  Failed to store reprocessed {fact_type}: {fact_value}")
                            
                    except Exception as detail_error:
                        print(f"❌ Error processing detail: {str(detail_error)}")
                        print(f"❌ Detail structure: {detail}")
                        continue  # Skip this detail and continue with next one
            
            # Mark original node as reprocessed
            if stored_nodes > 0:
                await supabase_manager.mark_reprocessing_complete(original_node_id)
                print(f"\nπŸ“Š Reprocessing Summary for {user_id}:")
                print(f"   Stored: {stored_nodes} reprocessed nodes")
                print(f"   Linked to original node: {original_node_id}")
            
        except Exception as e:
            print(f"❌ Database error during reprocessing: {e}")

    def main(self):
        """Standalone batch processing for all JSON files"""
        input_folder = "input_jsons"
        input_files = [
            "RahulSingh.json",
            "MonarkMoolchandani.json",
            "ManuJain.json",
            "Gaurav-Sherlocksai.json",
            "Anurag.json",
            "AdityaShetty.json"
        ]
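        # Each filename (minus the .json extension) doubles as the user ID passed to process_json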

        import asyncio  # used by asyncio.run below; imported once rather than inside the loop

        for filename in input_files:
            input_path = os.path.join(input_folder, filename)
            if not os.path.exists(input_path):
                print(f"Warning: {input_path} not found, skipping.")
                continue

            print(f"\n{'='*50}")
            print(f"Processing {input_path}")
            print(f"{'='*50}")

            with open(input_path, 'r', encoding='utf-8') as f:
                input_data = json.load(f)

            user_id = filename.split(".")[0]

            asyncio.run(self.process_json(input_data, user_id))
            print(f"Finished processing {input_path}")

if __name__ == "__main__":
    extractor = JSONContextExtractor(chunk_size=100, overlap_size=20)  # Configurable chunk sizes
    extractor.main()
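# Assumed invocation for the batch run above (the module filename is illustrative, not confirmed by the source):
#   python json_context_extractor.py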