jashdoshi77 committed on
Commit
abc646e
·
1 Parent(s): 60ff586

Update UI styling and message formatting improvements

Browse files
check_chroma_data.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quick script to check what's stored in ChromaDB metadata collection."""
2
+
3
+ from services.chroma_service import ChromaService
4
+
5
def check_metadata():
    """Print a summary of the document_metadata collection, highlighting Feb 2026 renewals."""
    print("Connecting to ChromaDB...")
    service = ChromaService()

    records = service.metadata_collection.get()
    entry_count = len(records['ids'])
    print(f"Total entries in document_metadata: {entry_count}")

    if entry_count == 0:
        print("No data found!")
        return

    print("\n--- Sample entry (first) ---")
    for key, value in sorted(records['metadatas'][0].items()):
        print(f"  {key}: {value}")

    # Scan every entry for renewals falling in February 2026.
    print("\n--- Entries with Feb 2026 renewal ---")
    feb_count = 0
    for entry in records['metadatas']:
        renewal = str(entry.get('renewal_date', ''))
        year = entry.get('renewal_year', 0)
        if year == 2026 and '-02-' in renewal:
            feb_count += 1
            print(f"  {entry.get('document_title')}: renewal_date={renewal}, renewal_year={year}")

    print(f"\nTotal Feb 2026 renewals found: {feb_count}")

    # Report the full set of renewal years present in the collection.
    years = {entry.get('renewal_year', 0) for entry in records['metadatas']}
    print(f"\nAll renewal years in data: {sorted(years)}")

if __name__ == "__main__":
    check_metadata()
check_users.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Check what users exist in the system."""
2
+
3
+ from services.chroma_service import ChromaService
4
+
5
def check_users():
    """List every user in ChromaDB, then the buckets owned by the first user."""
    print("Connecting to ChromaDB...")
    service = ChromaService()

    all_users = service.users_collection.get()
    print(f"\nTotal users: {len(all_users['ids'])}")

    for idx, user_id in enumerate(all_users['ids']):
        info = all_users['metadatas'][idx]
        print(f"  User ID: {user_id}")
        print(f"  Username: {info.get('username')}")
        print(f"  Role: {info.get('role')}")
        print()

    # Show which buckets belong to the first user, if any users exist.
    if all_users['ids']:
        first_user = all_users['ids'][0]
        buckets = service.buckets_collection.get(where={"user_id": first_user})
        print(f"\nBuckets for user {first_user}:")
        for idx, bucket_id in enumerate(buckets['ids']):
            print(f"  Bucket ID: {bucket_id}")
            print(f"  Name: {buckets['metadatas'][idx].get('name')}")

if __name__ == "__main__":
    check_users()
clear_and_remigrate.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to clear document_metadata collection and re-run migration
3
+ with a different user_id and no bucket_id
4
+ """
5
+
6
+ import re
7
+ import hashlib
8
+ import time
9
+ from services.chroma_service import ChromaService
10
+
11
+
12
def parse_markdown_table(filepath: str) -> tuple[list[str], list[dict]]:
    """Parse a pipe-delimited markdown table file.

    The first line is treated as the header row and the second as the
    alignment/separator row (skipped). Returns (headers, rows) where each
    row is a dict keyed by column name; missing trailing cells become "".
    """
    with open(filepath, 'r', encoding='utf-8') as fh:
        lines = fh.read().strip().split('\n')

    # Header row: split on pipes, strip whitespace/tabs, drop empties and
    # any stray alignment marker that leaked into the header line.
    header_cells = [part.strip().replace('\t', '').strip() for part in lines[0].split('|')]
    headers = [cell for cell in header_cells if cell and cell != ':--------:']

    print(f"Found {len(headers)} columns in header:")
    for idx, name in enumerate(headers):
        print(f"  {idx + 1}. {name}")

    rows = []
    # Data rows start after the header and separator lines.
    for line in lines[2:]:
        if not line.strip():
            continue

        stripped = [part.strip().replace('\t', '').strip() for part in line.split('|')]
        cells = [cell for cell in stripped if cell != '']
        if not cells:
            continue

        row = {}
        for idx, name in enumerate(headers):
            if idx < len(cells):
                cell = cells[idx]
                # "..'" is a PDF/OCR extraction artifact meaning "no value".
                if cell == "..'":
                    cell = ""
                row[name] = cell
            else:
                row[name] = ""
        rows.append(row)

    print(f"\nParsed {len(rows)} data rows from table")
    return headers, rows
65
+
66
+
67
def parse_date_to_iso(date_str: str) -> str:
    """Normalize a DD-MM-YYYY date string to YYYY-MM-DD.

    Empty values and the "..'" extraction artifact yield "". Strings that
    do not start with a DD-MM-YYYY pattern are returned unchanged (after
    stripping), so already-ISO dates pass through as-is.
    """
    if not date_str or date_str == "..'":
        return ""

    cleaned = date_str.strip()

    # Leading DD-MM-YYYY (day/month may be 1 or 2 digits).
    parts = re.match(r'(\d{1,2})-(\d{1,2})-(\d{4})', cleaned)
    if not parts:
        return cleaned
    day, month, year = parts.groups()
    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
84
+
85
+
86
def extract_year_from_date(date_str: str) -> int:
    """Return the first 4-digit year found in *date_str*, or 0 if none."""
    if not date_str:
        return 0
    found = re.search(r'(\d{4})', date_str)
    return int(found.group(1)) if found else 0
97
+
98
+
99
def parse_premium(value: str) -> float:
    """Parse a currency/premium string into a float.

    Strips currency symbols, commas and any other non-numeric characters
    before conversion. Returns 0.0 for empty values, the "..'" extraction
    artifact, or anything that still fails to parse (e.g. "1.2.3").
    """
    if not value or value == "..'":
        return 0.0

    # Keep only digits and decimal points; this also drops commas and
    # currency symbols in one pass (the previous replace(',', '') was
    # redundant with this substitution).
    cleaned = re.sub(r'[^\d.]', '', str(value))
    if not cleaned:
        return 0.0
    try:
        return float(cleaned)
    except ValueError:
        # Multiple decimal points etc. -- treat as unparseable. The
        # previous bare `except:` also swallowed KeyboardInterrupt.
        return 0.0
113
+
114
+
115
def parse_int_value(value: str) -> int:
    """Parse a string into an int by keeping only its digit characters.

    Returns 0 for empty values, the "..'" extraction artifact, or strings
    that contain no digits at all.
    """
    if not value or value == "..'":
        return 0

    digits = re.sub(r'[^\d]', '', str(value))
    if not digits:
        return 0
    try:
        return int(digits)
    except ValueError:
        # Defensive only: a non-empty digits-only string converts cleanly.
        # The previous bare `except:` also swallowed KeyboardInterrupt.
        return 0
128
+
129
+
130
def generate_doc_id(row: dict) -> str:
    """Derive a stable 16-hex-char document ID from a row's identifying fields."""
    key_parts = (
        row.get('PL/EN #', row.get('PL/EN', '')),
        row.get('Policy Number', ''),
        row.get('Client Name', ''),
    )
    # Same fields always produce the same ID, so re-running the migration
    # targets the same documents rather than creating new ones.
    fingerprint = "_".join(str(part) for part in key_parts)
    return hashlib.sha256(fingerprint.encode()).hexdigest()[:16]
138
+
139
+
140
def convert_row_to_metadata(row: dict, all_headers: list[str]) -> dict:
    """Map one parsed table row onto the document-metadata schema.

    Known columns are mapped to the fixed schema fields below; every other
    column is preserved verbatim under a sanitized ``col_<name>`` key so no
    table data is lost.
    """
    start_date = parse_date_to_iso(row.get('Policy Start Date', ''))
    end_date = parse_date_to_iso(row.get('Policy End Date', ''))

    # First non-zero premium wins (paid > gross > net); 0.0 when all are zero.
    amount_candidates = (
        parse_premium(row.get('Premium Paid', '')),
        parse_premium(row.get('Gross Premium', '')),
        parse_premium(row.get('Premium', '')),
    )
    premium = next((amount for amount in amount_candidates if amount), 0.0)

    metadata = {
        # Basic document info
        "document_type": str(row.get('Type', 'Policy')),
        "document_title": str(row.get('PL/EN #', '')),

        # Policy details
        "policy_number": str(row.get('Policy Number', '')),
        "policy_type": str(row.get('Policy', '')),

        # Parties involved
        "insurer_name": str(row.get('Insurer', '')),
        "insured_name": str(row.get('Insured Name', row.get('Client Name', ''))),
        "broker_name": "",

        # Financial
        "sum_insured": 0.0,
        "premium_amount": premium,

        # Dates -- the renewal date mirrors the policy end date.
        "policy_start_date": start_date,
        "policy_end_date": end_date,
        "renewal_date": end_date,
        "renewal_year": extract_year_from_date(end_date),

        # Location (not available in the table)
        "city": "",
        "state": "",
        "pincode": "",
        "property_address": "",

        # Classification
        "industry": "",
        "is_manufacturing": False,

        # NOTE(review): lists are not primitive metadata values; presumably
        # store_document_metadata JSON-encodes them -- confirm.
        "coverage_type": [],
        "keywords": [],

        # Tracking
        "created_at": time.time(),
    }

    # Column names already captured by the fixed schema above (in their
    # sanitized-key form).
    already_mapped = {
        'policy_number', 'policy', 'insurer', 'insured_name',
        'client_name', 'premium_paid', 'gross_premium', 'premium',
        'policy_start_date', 'policy_end_date', 'type', 'plen_num',
    }

    for header in all_headers:
        # Sanitize the column name into a metadata-safe key.
        key = header.replace(' ', '_').replace('.', '').replace('#', 'num')
        key = re.sub(r'[^a-zA-Z0-9_]', '', key).lower()

        if key in already_mapped:
            continue

        cell = row.get(header, '')
        # Skip empties and the "..'" extraction artifact; cap the length
        # since metadata values should stay small primitives.
        if cell and cell != "..'":
            metadata[f"col_{key}"] = str(cell)[:500]

    return metadata
226
+
227
+
228
def print_progress_bar(current: int, total: int, bar_length: int = 50):
    """Render an in-place ASCII progress bar on stdout.

    Args:
        current: number of completed items.
        total: total number of items; a non-positive total now renders an
            empty 0% bar instead of raising ZeroDivisionError.
        bar_length: width of the bar in characters.
    """
    # Guard against division by zero when there is nothing to process.
    percent = current / total if total > 0 else 0.0
    filled = int(bar_length * percent)
    bar = '#' * filled + '-' * (bar_length - filled)
    print(f'\r Progress: |{bar}| {current}/{total} ({percent*100:.1f}%)', end='', flush=True)
234
+
235
+
236
def clear_and_remigrate():
    """
    Clear all data from document_metadata collection and re-run migration
    with new user_id and no bucket_id.

    DESTRUCTIVE: deletes every entry in document_metadata (regardless of
    owner) before re-importing table.md -- only run this deliberately.
    """
    print("=" * 60)
    print("Clearing document_metadata and re-running migration")
    print("=" * 60)

    # Initialize ChromaDB service
    print("\n[1] Connecting to ChromaDB...")
    chroma = ChromaService()

    # Check current state
    current_data = chroma.metadata_collection.get()
    print(f"Current document_metadata collection has {len(current_data['ids'])} entries")

    # Delete all data from document_metadata collection
    print("\n[2] Deleting all data from document_metadata collection...")
    if current_data['ids']:
        chroma.metadata_collection.delete(ids=current_data['ids'])
        print(f"Deleted {len(current_data['ids'])} entries")
    else:
        print("Collection was already empty")

    # Verify deletion
    verify_data = chroma.metadata_collection.get()
    print(f"After deletion: {len(verify_data['ids'])} entries remain")

    # Parse the markdown table (path is relative to the working directory)
    print("\n[3] Parsing table.md...")
    table_path = "table.md"
    headers, rows = parse_markdown_table(table_path)

    if not rows:
        print("ERROR: No data rows found in table.md")
        return

    print(f"\nSample row data (first row, first 10 columns):")
    sample = rows[0]
    for key in list(sample.keys())[:10]:
        print(f"  {key}: {sample[key][:50] if len(sample[key]) > 50 else sample[key]}")

    # NEW: Use different user_id (Nishant) and NO bucket_id
    user_id = "55c0893720ef38eb"  # Nishant's user ID
    bucket_id = ""  # No bucket ID specified

    print(f"\n[4] Migrating {len(rows)} rows to ChromaDB...")
    print(f"Using user_id: {user_id} (Nishant)")
    print(f"Using bucket_id: (empty - no bucket specified)")
    print(f"Batch size: 10 entries")
    print()

    success_count = 0
    error_count = 0

    BATCH_SIZE = 10
    # Ceiling division so a trailing partial batch is still processed.
    total_batches = (len(rows) + BATCH_SIZE - 1) // BATCH_SIZE

    for batch_num in range(total_batches):
        start_idx = batch_num * BATCH_SIZE
        end_idx = min(start_idx + BATCH_SIZE, len(rows))
        batch_rows = rows[start_idx:end_idx]

        for row in batch_rows:
            try:
                # Generate unique doc_id (deterministic, so identical key
                # fields map to the same document on re-runs)
                doc_id = generate_doc_id(row)

                # Convert row to metadata format (include all headers)
                metadata = convert_row_to_metadata(row, headers)

                # Store in ChromaDB
                result = chroma.store_document_metadata(
                    doc_id=doc_id,
                    user_id=user_id,
                    bucket_id=bucket_id,
                    metadata=metadata
                )

                if result.get("status") == "stored":
                    success_count += 1
                else:
                    error_count += 1

            except Exception as e:
                # Per-row failures are counted but do not abort the batch.
                print(f"\n  Error on row: {str(e)[:100]}")
                error_count += 1

        # Update progress bar
        print_progress_bar(end_idx, len(rows))

        # Small delay between batches to not overload the API
        if batch_num < total_batches - 1:
            time.sleep(0.1)

    print()  # New line after progress bar

    # Final status
    print("\n" + "=" * 60)
    print("Migration Complete!")
    print("=" * 60)
    print(f"Successfully migrated: {success_count} entries")
    print(f"Errors: {error_count}")

    # Verify final state
    final_data = chroma.metadata_collection.get()
    print(f"\nFinal document_metadata collection has {len(final_data['ids'])} entries")

    # Show sample of stored data
    if final_data['ids']:
        print("\nSample stored metadata (first entry):")
        sample_meta = final_data['metadatas'][0]
        for key, value in list(sample_meta.items())[:15]:
            print(f"  {key}: {value}")
        # NOTE(review): prints a negative count when fewer than 15 fields
        # exist -- harmless in a one-off debug/migration script.
        print(f"  ... and {len(sample_meta) - 15} more fields")


if __name__ == "__main__":
    clear_and_remigrate()
clear_old_migration.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Clear all entries with wrong user_id from document_metadata collection."""
2
+
3
+ from services.chroma_service import ChromaService
4
+
5
def clear_old_data():
    """Delete document_metadata entries left over from the old 'jashdoshi' migration."""
    print("Connecting to ChromaDB...")
    service = ChromaService()

    # Entries written by the old migration used a plain username as user_id.
    stale = service.metadata_collection.get(where={"user_id": "jashdoshi"})
    stale_ids = stale['ids']
    print(f"Found {len(stale_ids)} entries with user_id='jashdoshi' (old migration)")

    if stale_ids:
        print("Deleting these entries...")
        service.metadata_collection.delete(ids=stale_ids)
        print(f"Deleted {len(stale_ids)} entries.")

    # Confirm what is left after the purge.
    remaining = service.metadata_collection.get()
    print(f"Remaining entries in collection: {len(remaining['ids'])}")

if __name__ == "__main__":
    clear_old_data()
find_buckets.py CHANGED
@@ -1,25 +1,38 @@
1
- """Helper script to find buckets for a user"""
2
- import sys
3
- sys.path.insert(0, '.')
4
 
5
- from services.chroma_service import chroma_service
6
 
7
- # Check both users
8
- user_ids = ['55c0893720ef38eb', '7ac2ed69d52d2010']
9
-
10
- for user_id in user_ids:
11
- print(f"\nUser: {user_id}")
12
- print("-" * 40)
 
13
 
14
- # Get documents
15
- docs = chroma_service.get_user_documents(user_id)
16
- print(f"Documents: {len(docs)}")
 
 
 
 
 
 
 
17
 
18
- # Get buckets
19
- buckets = chroma_service.get_user_buckets(user_id)
20
- if buckets:
21
- print("Buckets:")
22
- for b in buckets:
23
- print(f" - {b['name']} (ID: {b['bucket_id']}, Docs: {b.get('doc_count', 0)})")
24
- else:
25
- print("No buckets found")
 
 
 
 
 
 
 
 
1
+ """Check buckets for all users."""
 
 
2
 
3
+ from services.chroma_service import ChromaService
4
 
5
def check_all_buckets():
    """Print every user's buckets, then the user/bucket combos present in metadata."""
    print("Connecting to ChromaDB...")
    service = ChromaService()

    all_users = service.users_collection.get()
    print(f"\nTotal users: {len(all_users['ids'])}")

    for idx, user_id in enumerate(all_users['ids']):
        info = all_users['metadatas'][idx]
        print(f"\n=== User: {info.get('username')} (ID: {user_id}) ===")

        # Buckets owned by this user.
        owned = service.buckets_collection.get(where={"user_id": user_id})
        print(f"Buckets: {len(owned['ids'])}")
        for j, bucket_id in enumerate(owned['ids']):
            print(f"  - {owned['metadatas'][j].get('name')} (ID: {bucket_id})")

    # Cross-check: which (user_id, bucket_id) pairs actually appear in the
    # metadata collection, and how many entries each pair owns.
    print("\n=== Metadata collection user/bucket combos ===")
    records = service.metadata_collection.get()
    if records['ids']:
        combos = {(m.get('user_id'), m.get('bucket_id')) for m in records['metadatas']}
        for user_id, bucket_id in combos:
            count = sum(
                1 for m in records['metadatas']
                if m.get('user_id') == user_id and m.get('bucket_id') == bucket_id
            )
            print(f"  user_id={user_id}, bucket_id={bucket_id}: {count} entries")

if __name__ == "__main__":
    check_all_buckets()
migrate_table_to_chroma.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Migration script to populate document_metadata collection from table.md
3
+
4
+ This script reads the markdown table and migrates data to ChromaDB's document_metadata
5
+ collection. No AI or API keys are used - just straightforward data parsing.
6
+
7
+ All columns from the table are preserved in the metadata.
8
+ """
9
+
10
+ import re
11
+ import hashlib
12
+ import time
13
+ import json
14
+ from services.chroma_service import ChromaService
15
+
16
+
17
def parse_markdown_table(filepath: str) -> tuple[list[str], list[dict]]:
    """Parse a pipe-delimited markdown table file.

    The first line is treated as the header row and the second as the
    alignment/separator row (skipped). Returns (headers, rows) where each
    row is a dict keyed by column name; missing trailing cells become "".
    """
    with open(filepath, 'r', encoding='utf-8') as fh:
        lines = fh.read().strip().split('\n')

    # Header row: split on pipes, strip whitespace/tabs, drop empties and
    # any stray alignment marker that leaked into the header line.
    header_cells = [part.strip().replace('\t', '').strip() for part in lines[0].split('|')]
    headers = [cell for cell in header_cells if cell and cell != ':--------:']

    print(f"Found {len(headers)} columns in header:")
    for idx, name in enumerate(headers):
        print(f"  {idx + 1}. {name}")

    rows = []
    # Data rows start after the header and separator lines.
    for line in lines[2:]:
        if not line.strip():
            continue

        stripped = [part.strip().replace('\t', '').strip() for part in line.split('|')]
        cells = [cell for cell in stripped if cell != '']
        if not cells:
            continue

        row = {}
        for idx, name in enumerate(headers):
            if idx < len(cells):
                cell = cells[idx]
                # "..'" is a PDF/OCR extraction artifact meaning "no value".
                if cell == "..'":
                    cell = ""
                row[name] = cell
            else:
                row[name] = ""
        rows.append(row)

    print(f"\nParsed {len(rows)} data rows from table")
    return headers, rows
70
+
71
+
72
def parse_date_to_iso(date_str: str) -> str:
    """Normalize a DD-MM-YYYY date string to YYYY-MM-DD.

    Empty values and the "..'" extraction artifact yield "". Strings that
    do not start with a DD-MM-YYYY pattern are returned unchanged (after
    stripping), so already-ISO dates pass through as-is.
    """
    if not date_str or date_str == "..'":
        return ""

    cleaned = date_str.strip()

    # Leading DD-MM-YYYY (day/month may be 1 or 2 digits).
    parts = re.match(r'(\d{1,2})-(\d{1,2})-(\d{4})', cleaned)
    if not parts:
        return cleaned
    day, month, year = parts.groups()
    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
89
+
90
+
91
def extract_year_from_date(date_str: str) -> int:
    """Return the first 4-digit year found in *date_str*, or 0 if none."""
    if not date_str:
        return 0
    found = re.search(r'(\d{4})', date_str)
    return int(found.group(1)) if found else 0
102
+
103
+
104
def parse_premium(value: str) -> float:
    """Parse a currency/premium string into a float.

    Strips currency symbols, commas and any other non-numeric characters
    before conversion. Returns 0.0 for empty values, the "..'" extraction
    artifact, or anything that still fails to parse (e.g. "1.2.3").
    """
    if not value or value == "..'":
        return 0.0

    # Keep only digits and decimal points; this also drops commas and
    # currency symbols in one pass (the previous replace(',', '') was
    # redundant with this substitution).
    cleaned = re.sub(r'[^\d.]', '', str(value))
    if not cleaned:
        return 0.0
    try:
        return float(cleaned)
    except ValueError:
        # Multiple decimal points etc. -- treat as unparseable. The
        # previous bare `except:` also swallowed KeyboardInterrupt.
        return 0.0
118
+
119
+
120
def parse_int_value(value: str) -> int:
    """Parse a string into an int by keeping only its digit characters.

    Returns 0 for empty values, the "..'" extraction artifact, or strings
    that contain no digits at all.
    """
    if not value or value == "..'":
        return 0

    digits = re.sub(r'[^\d]', '', str(value))
    if not digits:
        return 0
    try:
        return int(digits)
    except ValueError:
        # Defensive only: a non-empty digits-only string converts cleanly.
        # The previous bare `except:` also swallowed KeyboardInterrupt.
        return 0
133
+
134
+
135
def generate_doc_id(row: dict) -> str:
    """Derive a stable 16-hex-char document ID from a row's identifying fields."""
    key_parts = (
        row.get('PL/EN #', row.get('PL/EN', '')),
        row.get('Policy Number', ''),
        row.get('Client Name', ''),
    )
    # Same fields always produce the same ID, so re-running the migration
    # targets the same documents rather than creating new ones.
    fingerprint = "_".join(str(part) for part in key_parts)
    return hashlib.sha256(fingerprint.encode()).hexdigest()[:16]
143
+
144
+
145
def convert_row_to_metadata(row: dict, all_headers: list[str]) -> dict:
    """Map one parsed table row onto the document-metadata schema.

    Known columns are mapped to the fixed schema fields below; every other
    column is preserved verbatim under a sanitized ``col_<name>`` key so no
    table data is lost.
    """
    start_date = parse_date_to_iso(row.get('Policy Start Date', ''))
    end_date = parse_date_to_iso(row.get('Policy End Date', ''))

    # First non-zero premium wins (paid > gross > net); 0.0 when all are zero.
    amount_candidates = (
        parse_premium(row.get('Premium Paid', '')),
        parse_premium(row.get('Gross Premium', '')),
        parse_premium(row.get('Premium', '')),
    )
    premium = next((amount for amount in amount_candidates if amount), 0.0)

    metadata = {
        # Basic document info
        "document_type": str(row.get('Type', 'Policy')),
        "document_title": str(row.get('PL/EN #', '')),

        # Policy details
        "policy_number": str(row.get('Policy Number', '')),
        "policy_type": str(row.get('Policy', '')),

        # Parties involved
        "insurer_name": str(row.get('Insurer', '')),
        "insured_name": str(row.get('Insured Name', row.get('Client Name', ''))),
        "broker_name": "",

        # Financial
        "sum_insured": 0.0,
        "premium_amount": premium,

        # Dates -- the renewal date mirrors the policy end date.
        "policy_start_date": start_date,
        "policy_end_date": end_date,
        "renewal_date": end_date,
        "renewal_year": extract_year_from_date(end_date),

        # Location (not available in the table)
        "city": "",
        "state": "",
        "pincode": "",
        "property_address": "",

        # Classification
        "industry": "",
        "is_manufacturing": False,

        # NOTE(review): lists are not primitive metadata values; presumably
        # store_document_metadata JSON-encodes them -- confirm.
        "coverage_type": [],
        "keywords": [],

        # Tracking
        "created_at": time.time(),
    }

    # Column names already captured by the fixed schema above (in their
    # sanitized-key form).
    already_mapped = {
        'policy_number', 'policy', 'insurer', 'insured_name',
        'client_name', 'premium_paid', 'gross_premium', 'premium',
        'policy_start_date', 'policy_end_date', 'type', 'plen_num',
    }

    for header in all_headers:
        # Sanitize the column name into a metadata-safe key.
        key = header.replace(' ', '_').replace('.', '').replace('#', 'num')
        key = re.sub(r'[^a-zA-Z0-9_]', '', key).lower()

        if key in already_mapped:
            continue

        cell = row.get(header, '')
        # Skip empties and the "..'" extraction artifact; cap the length
        # since metadata values should stay small primitives.
        if cell and cell != "..'":
            metadata[f"col_{key}"] = str(cell)[:500]

    return metadata
231
+
232
+
233
def print_progress_bar(current: int, total: int, bar_length: int = 50):
    """Render an in-place ASCII progress bar on stdout.

    Args:
        current: number of completed items.
        total: total number of items; a non-positive total now renders an
            empty 0% bar instead of raising ZeroDivisionError.
        bar_length: width of the bar in characters.
    """
    # Guard against division by zero when there is nothing to process.
    percent = current / total if total > 0 else 0.0
    filled = int(bar_length * percent)
    bar = '#' * filled + '-' * (bar_length - filled)
    print(f'\r Progress: |{bar}| {current}/{total} ({percent*100:.1f}%)', end='', flush=True)
239
+
240
+
241
def migrate_table_to_chroma():
    """
    Main migration function.

    Reads table.md, parses all rows, and inserts them into ChromaDB in batches.

    Every entry is written under a single fixed user_id/bucket_id pair
    (see the hard-coded IDs below); doc_ids come from generate_doc_id(),
    so rows with identical key fields target the same document.
    """
    print("=" * 60)
    print("Starting migration from table.md to ChromaDB")
    print("=" * 60)

    # Initialize ChromaDB service
    print("\n[1] Connecting to ChromaDB...")
    chroma = ChromaService()

    # Check current state of the metadata collection
    current_data = chroma.metadata_collection.get()
    print(f"Current document_metadata collection has {len(current_data['ids'])} entries")

    # Parse the markdown table (path is relative to the working directory)
    print("\n[2] Parsing table.md...")
    table_path = "table.md"
    headers, rows = parse_markdown_table(table_path)

    if not rows:
        print("ERROR: No data rows found in table.md")
        return

    print(f"\nSample row data (first row, first 10 columns):")
    sample = rows[0]
    for key in list(sample.keys())[:10]:
        print(f"  {key}: {sample[key][:50] if len(sample[key]) > 50 else sample[key]}")

    # Fixed user_id and bucket_id for all entries
    # IMPORTANT: These should match the actual hashed IDs in the system
    # Check with: python check_users.py
    # User "jash" has ID: 7ac2ed69d52d2010
    # Bucket "2025 policy sibro" has ID: ee449d7c04e92039
    user_id = "7ac2ed69d52d2010"  # jash's user ID
    bucket_id = "ee449d7c04e92039"  # 2025 policy sibro bucket

    print(f"\n[3] Migrating {len(rows)} rows to ChromaDB...")
    print(f"Using user_id: {user_id}, bucket_id: {bucket_id}")
    print(f"Batch size: 10 entries")
    print()

    success_count = 0
    error_count = 0

    BATCH_SIZE = 10
    # Ceiling division so a trailing partial batch is still processed.
    total_batches = (len(rows) + BATCH_SIZE - 1) // BATCH_SIZE

    for batch_num in range(total_batches):
        start_idx = batch_num * BATCH_SIZE
        end_idx = min(start_idx + BATCH_SIZE, len(rows))
        batch_rows = rows[start_idx:end_idx]

        for row in batch_rows:
            try:
                # Generate unique doc_id (deterministic across runs)
                doc_id = generate_doc_id(row)

                # Convert row to metadata format (include all headers)
                metadata = convert_row_to_metadata(row, headers)

                # Store in ChromaDB
                result = chroma.store_document_metadata(
                    doc_id=doc_id,
                    user_id=user_id,
                    bucket_id=bucket_id,
                    metadata=metadata
                )

                if result.get("status") == "stored":
                    success_count += 1
                else:
                    error_count += 1

            except Exception as e:
                # Per-row failures are counted but do not abort the batch.
                print(f"\n  Error on row: {str(e)[:100]}")
                error_count += 1

        # Update progress bar
        print_progress_bar(end_idx, len(rows))

        # Small delay between batches to not overload the API
        if batch_num < total_batches - 1:
            time.sleep(0.1)

    print()  # New line after progress bar

    # Final status
    print("\n" + "=" * 60)
    print("Migration Complete!")
    print("=" * 60)
    print(f"Successfully migrated: {success_count} entries")
    print(f"Errors: {error_count}")

    # Verify final state
    final_data = chroma.metadata_collection.get()
    print(f"\nFinal document_metadata collection has {len(final_data['ids'])} entries")

    # Show sample of stored data
    if final_data['ids']:
        print("\nSample stored metadata (first entry):")
        sample_meta = final_data['metadatas'][0]
        for key, value in list(sample_meta.items())[:15]:
            print(f"  {key}: {value}")
        # NOTE(review): prints a negative count when fewer than 15 fields
        # exist -- harmless in a one-off debug/migration script.
        print(f"  ... and {len(sample_meta) - 15} more fields")


if __name__ == "__main__":
    migrate_table_to_chroma()
services/chroma_service.py CHANGED
@@ -410,25 +410,23 @@ class ChromaService:
410
 
411
  IMPORTANT: When bucket_id is provided, ONLY chunks from that bucket are returned.
412
  This ensures strict bucket isolation for multi-bucket deployments.
 
413
  """
414
- # Build where clause with strict bucket isolation
 
 
415
  if bucket_id:
416
- where_clause = {
417
- "$and": [
418
- {"user_id": user_id},
419
- {"bucket_id": bucket_id}
420
- ]
421
- }
422
  print(f"[CHROMA] Strict bucket isolation: searching only bucket '{bucket_id}'")
423
- elif doc_ids:
424
- where_clause = {
425
- "$and": [
426
- {"user_id": user_id},
427
- {"doc_id": {"$in": doc_ids}}
428
- ]
429
- }
430
  else:
431
- where_clause = {"user_id": user_id}
432
 
433
  results = self.chunks_collection.query(
434
  query_texts=[query],
 
410
 
411
  IMPORTANT: When bucket_id is provided, ONLY chunks from that bucket are returned.
412
  This ensures strict bucket isolation for multi-bucket deployments.
413
+ When doc_ids is also provided, it filters to specific documents within the bucket.
414
  """
415
+ # Build where clause with strict bucket isolation and optional doc_id filtering
416
+ conditions = [{"user_id": user_id}]
417
+
418
  if bucket_id:
419
+ conditions.append({"bucket_id": bucket_id})
 
 
 
 
 
420
  print(f"[CHROMA] Strict bucket isolation: searching only bucket '{bucket_id}'")
421
+
422
+ if doc_ids:
423
+ conditions.append({"doc_id": {"$in": doc_ids}})
424
+ print(f"[CHROMA] Filtering to {len(doc_ids)} specific documents")
425
+
426
+ if len(conditions) > 1:
427
+ where_clause = {"$and": conditions}
428
  else:
429
+ where_clause = conditions[0]
430
 
431
  results = self.chunks_collection.query(
432
  query_texts=[query],
services/rag_service.py CHANGED
@@ -234,6 +234,9 @@ CRITICAL RULES:
234
  2. When multiple industries are mentioned (e.g., "manufacturing and healthcare"), combine them with comma: "manufacturing, healthcare"
235
  3. When user asks for "top N" of something, set both limit AND sort_by appropriately
236
  4. Keywords like "manufacturing", "healthcare", "retail", "IT", "construction" are INDUSTRIES - put them in filters
 
 
 
237
 
238
  FORMAT DETECTION (NEW):
239
  1. Detect if user explicitly asks for a specific format:
@@ -290,7 +293,22 @@ Query: "list all fire policies in bullet points"
290
  {"intent":"list","needs_metadata":true,"filters":{"policy_type":"fire"},"sort_by":null,"sort_order":"desc","limit":null,"calculation":null,"calculation_field":null,"format_preference":"bullets","is_format_change":false}
291
 
292
  Query: "top 5 health policies by sum insured as a table"
293
- {"intent":"rank","needs_metadata":true,"filters":{"policy_type":"health"},"sort_by":"sum_insured","sort_order":"desc","limit":5,"calculation":null,"calculation_field":null,"format_preference":"table","is_format_change":false}"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
  messages = [
296
  {"role": "system", "content": system_prompt},
@@ -457,25 +475,32 @@ Query: "top 5 health policies by sum insured as a table"
457
  Returns markdown-compatible formatting guidance.
458
  """
459
  format_map = {
460
- "table": """FORMAT: Present data in a markdown table.
461
- - Use | column | headers | with |---| separator line
462
- - Keep columns aligned and consistent
463
- - Include all requested data in table rows""",
 
 
 
464
 
465
- "list": """FORMAT: Present as a numbered list.
466
- 1. Each item on its own line with number prefix
467
- 2. Include key details after the number
468
- 3. Use consistent formatting for all items""",
 
 
469
 
470
- "bullets": """FORMAT: Use bullet points.
471
- - Each item as a bullet point
472
- - Sub-details can be indented bullets
473
- - Keep bullets concise and scannable""",
 
 
474
 
475
  "paragraph": """FORMAT: Write in flowing prose paragraphs.
476
  - Use complete sentences and natural language
477
  - Group related information into paragraphs
478
- - Avoid lists or tables unless absolutely necessary"""
479
  }
480
 
481
  return format_map.get(format_preference, "")
@@ -928,9 +953,91 @@ Summary: {summary[:300] if summary else 'No summary available'}
928
 
929
  elif field in ['city', 'state', 'insurer_name', 'insured_name', 'broker_name']:
930
  # Handle comma-separated values (OR logic)
 
931
  filter_values = [v.strip().lower() for v in str(value).split(',')]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
932
  all_metadata = [m for m in all_metadata
933
- if any(fv in str(m.get(field, '')).lower() for fv in filter_values)]
934
  print(f"[METADATA QUERY] Filtered by {field} {filter_values}: {len(all_metadata)} remaining")
935
 
936
  elif field == 'renewal_year':
@@ -1041,8 +1148,8 @@ Summary: {summary[:300] if summary else 'No summary available'}
1041
  - Location: {meta.get('city', '')}, {meta.get('state', '')}
1042
  """
1043
  else:
1044
- # Compact format for large sets
1045
- entry = f"{i}. {meta.get('document_title', 'Unknown')} | {meta.get('insured_name', 'N/A')} | ₹{meta.get('premium_amount', 0):,.0f} | {meta.get('policy_type', 'N/A')}"
1046
 
1047
  context_parts.append(entry)
1048
 
@@ -1059,6 +1166,397 @@ Summary: {summary[:300] if summary else 'No summary available'}
1059
  'sources': {m.get('doc_id'): m.get('document_title') for m in all_metadata}
1060
  }
1061
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1062
  def _stream_metadata_query(self, user_id: str, bucket_id: str,
1063
  query: str, parsed: dict, chat_id: str = ""):
1064
  """
@@ -1116,13 +1614,34 @@ Summary: {summary[:300] if summary else 'No summary available'}
1116
  total_before = result.get('total_before_filter', 0)
1117
  calculation = result.get('calculation')
1118
 
1119
- # Check if we have any data
1120
  if not context or total_docs == 0:
1121
- yield {
1122
- "type": "error",
1123
- "content": "No document metadata found. Please run the migration script to extract metadata from your documents."
1124
- }
1125
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1126
 
1127
  # Send sources first
1128
  yield {
@@ -1139,7 +1658,30 @@ Summary: {summary[:300] if summary else 'No summary available'}
1139
  conciseness_directive = "\n\nIMPORTANT: Be concise and direct. No preambles or verbose explanations. Get straight to the formatted answer." if format_preference else ""
1140
 
1141
  if intent == 'count':
1142
- system_prompt = f"""You are Iribl AI, a document analysis assistant answering a COUNT query.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1143
 
1144
  CRITICAL INSTRUCTIONS:
1145
  1. The count has been computed: {total_docs} documents match the criteria.
@@ -1191,18 +1733,31 @@ CRITICAL INSTRUCTIONS:
1191
  {format_instructions if format_instructions else "FORMAT: Use tables or side-by-side format where helpful."}"""
1192
 
1193
  else: # list, summarize, or other
1194
- system_prompt = f"""You are Iribl AI, a document analysis assistant. You are answering a query that requires information from {total_docs} documents.
1195
-
1196
- CRITICAL INSTRUCTIONS:
1197
- 1. You have been given metadata for {total_docs} documents (from {total_before} total).
1198
- 2. Your answer must be COMPREHENSIVE - include ALL relevant items from the data provided.
1199
- 3. For "list" queries, actually list ALL matching items with key details.
1200
- 4. Organize information logically (by type, by company, by date, etc.).
1201
- 5. For "summarize" queries, provide a concise overview with key statistics.{conciseness_directive}
 
 
 
 
 
 
 
 
 
 
 
 
 
1202
 
1203
  {format_instructions if format_instructions else "FORMAT: Use headers, bullet points, and bold text for clarity."}
1204
 
1205
- Do NOT say information is missing - you have the filtered list. Do NOT ask for more documents."""
1206
 
1207
  # Step 3: Load conversation history for memory (CRITICAL FOR CONTEXT)
1208
  stored_history = []
@@ -1900,13 +2455,25 @@ Instructions: Synthesize from multiple documents if relevant. Be detailed but co
1900
  print(f"[QUERY ROUTING] AI-parsed query: {parsed}")
1901
 
1902
  # Route based on AI-parsed intent
1903
- # needs_metadata = True means query requires aggregate data across all documents
1904
- if parsed.get('needs_metadata', False):
 
 
 
 
 
 
 
 
1905
  yield from self._stream_metadata_query(user_id, bucket_id, query, parsed, chat_id)
1906
  return
1907
 
1908
- # For all other query types (specific, comparison, followup, general),
1909
- # continue with existing top-K chunk retrieval logic
 
 
 
 
1910
 
1911
  # Step 1: Expand query for better retrieval (handles "module 5" -> "module five", etc.)
1912
  expanded_queries = self._expand_query(query)
 
234
  2. When multiple industries are mentioned (e.g., "manufacturing and healthcare"), combine them with comma: "manufacturing, healthcare"
235
  3. When user asks for "top N" of something, set both limit AND sort_by appropriately
236
  4. Keywords like "manufacturing", "healthcare", "retail", "IT", "construction" are INDUSTRIES - put them in filters
237
+ 5. COMPANY NAME EXTRACTION: When user mentions a company name (e.g., "ABC Corp", "XYZ Industries", "Company Name"), extract it to insured_name filter. Extract the company name as mentioned in the query, even if it's partial. The system will handle name variations (case, spacing, suffixes like "Pvt Ltd", singular/plural) automatically.
238
+ 6. TYPO HANDLING: If user makes typos (e.g., "policie" -> "policies", "polciy" -> "policy"), still extract the correct intent and filters. The system is forgiving of spelling errors.
239
+ 7. COMPANY vs INDIVIDUAL: When user mentions a company name with business keywords (e.g., "ABC Chemical", "XYZ Industries", "Company Corp"), they want COMPANY policies, not individual person policies. The system will automatically filter out individual person names when company keywords are detected.
240
 
241
  FORMAT DETECTION (NEW):
242
  1. Detect if user explicitly asks for a specific format:
 
293
  {"intent":"list","needs_metadata":true,"filters":{"policy_type":"fire"},"sort_by":null,"sort_order":"desc","limit":null,"calculation":null,"calculation_field":null,"format_preference":"bullets","is_format_change":false}
294
 
295
  Query: "top 5 health policies by sum insured as a table"
296
+ {"intent":"rank","needs_metadata":true,"filters":{"policy_type":"health"},"sort_by":"sum_insured","sort_order":"desc","limit":5,"calculation":null,"calculation_field":null,"format_preference":"table","is_format_change":false}
297
+
298
+ Query: "renewals in march 2026"
299
+ {"intent":"list","needs_metadata":true,"filters":{"renewal_year":2026,"renewal_month":"march"},"sort_by":"renewal_date","sort_order":"asc","limit":null,"calculation":null,"calculation_field":null,"format_preference":null,"is_format_change":false}
300
+
301
+ Query: "renewals in march 2026 also list the renewal date"
302
+ {"intent":"list","needs_metadata":true,"filters":{"renewal_year":2026,"renewal_month":"march"},"sort_by":"renewal_date","sort_order":"asc","limit":null,"calculation":null,"calculation_field":null,"format_preference":"table","is_format_change":false}
303
+
304
+ Query: "policies expiring in april 2025 with premium details"
305
+ {"intent":"list","needs_metadata":true,"filters":{"renewal_year":2025,"renewal_month":"april"},"sort_by":"renewal_date","sort_order":"asc","limit":null,"calculation":null,"calculation_field":null,"format_preference":null,"is_format_change":false}
306
+
307
+ Query: "list all ABC Corp policies"
308
+ {"intent":"list","needs_metadata":true,"filters":{"insured_name":"ABC Corp"},"sort_by":null,"sort_order":"desc","limit":null,"calculation":null,"calculation_field":null,"format_preference":null,"is_format_change":false}
309
+
310
+ Query: "show me policies for XYZ Industries"
311
+ {"intent":"list","needs_metadata":true,"filters":{"insured_name":"XYZ Industries"},"sort_by":null,"sort_order":"desc","limit":null,"calculation":null,"calculation_field":null,"format_preference":null,"is_format_change":false}"""
312
 
313
  messages = [
314
  {"role": "system", "content": system_prompt},
 
475
  Returns markdown-compatible formatting guidance.
476
  """
477
  format_map = {
478
+ "table": """FORMAT: Present data as a complete markdown table.
479
+ CRITICAL TABLE RULES:
480
+ 1. Include EVERY SINGLE item from the data - do NOT truncate or summarize
481
+ 2. Use these standard columns: | S.No | Document/Policy Name | Insured Name | Policy Type | Sum Insured | Premium | Renewal Date |
482
+ 3. Use | column | headers | with |---| separator line
483
+ 4. If there are 37 items, the table MUST have 37 rows (plus header)
484
+ 5. Use ₹ symbol for currency values with proper formatting""",
485
 
486
+ "list": """FORMAT: Present as a complete numbered list.
487
+ CRITICAL LIST RULES:
488
+ 1. Include EVERY SINGLE item - do NOT skip any
489
+ 2. Number each item starting from 1
490
+ 3. Include key details: name, policy type, amounts, dates
491
+ 4. If there are 37 items, list ALL 37 items""",
492
 
493
+ "bullets": """FORMAT: Use bullet points for all items.
494
+ CRITICAL BULLET RULES:
495
+ 1. Include EVERY SINGLE item as a bullet point
496
+ 2. Sub-details can be indented bullets
497
+ 3. Do NOT summarize or truncate the list
498
+ 4. If there are 37 items, show ALL 37 bullets""",
499
 
500
  "paragraph": """FORMAT: Write in flowing prose paragraphs.
501
  - Use complete sentences and natural language
502
  - Group related information into paragraphs
503
+ - Still mention ALL items, just in prose form"""
504
  }
505
 
506
  return format_map.get(format_preference, "")
 
953
 
954
  elif field in ['city', 'state', 'insurer_name', 'insured_name', 'broker_name']:
955
  # Handle comma-separated values (OR logic)
956
+ # For name fields, use flexible matching to handle variations
957
  filter_values = [v.strip().lower() for v in str(value).split(',')]
958
+
959
+ def matches_name(metadata_value, filter_value):
960
+ """Flexible name matching that handles variations"""
961
+ if not metadata_value or not filter_value:
962
+ return False
963
+
964
+ meta_lower = str(metadata_value).lower()
965
+ filter_lower = filter_value.lower()
966
+
967
+ # Detect if filter is looking for a company (contains business keywords)
968
+ # vs individual person (just a name)
969
+ filter_is_company = any(keyword in filter_lower for keyword in
970
+ ['chemical', 'chemicals', 'industries', 'industry',
971
+ 'corp', 'corporation', 'ltd', 'limited', 'pvt',
972
+ 'private', 'inc', 'incorporated', 'llc', 'company',
973
+ 'enterprises', 'group', 'holdings'])
974
+
975
+ # Detect if metadata is a company (has business suffixes or keywords)
976
+ meta_is_company = any(keyword in meta_lower for keyword in
977
+ [' pvt ltd', ' pvt. ltd', ' ltd', ' ltd.', ' limited',
978
+ ' inc', ' inc.', ' incorporated', ' llc', ' llc.',
979
+ ' corporation', ' corp', ' corp.', ' industries',
980
+ ' industry', ' company', ' enterprises', ' group'])
981
+
982
+ # If filter is for a company but metadata is individual, skip
983
+ # This prevents matching "Choksey Chemical" company with "Bharat Choksey" person
984
+ if filter_is_company and not meta_is_company:
985
+ # Check if metadata is clearly a person name (has first/middle/last name pattern)
986
+ # Person names typically have 2-4 words, company names are usually longer or have suffixes
987
+ meta_words = meta_lower.split()
988
+ if len(meta_words) <= 4 and not any(char.isdigit() for char in meta_lower):
989
+ # Likely a person name, skip if filter is for company
990
+ return False
991
+
992
+ # Remove common suffixes/prefixes for better matching
993
+ # Remove "pvt ltd", "ltd", "inc", "llc", etc.
994
+ meta_clean = meta_lower
995
+ filter_clean = filter_lower
996
+
997
+ for suffix in [' pvt ltd', ' pvt. ltd', ' pvt ltd.', ' pvt. ltd.',
998
+ ' ltd', ' ltd.', ' limited', ' inc', ' inc.',
999
+ ' incorporated', ' llc', ' llc.', ' corporation', ' corp', ' corp.',
1000
+ ' industries', ' industry', ' company', ' enterprises', ' group']:
1001
+ meta_clean = meta_clean.replace(suffix, '')
1002
+ filter_clean = filter_clean.replace(suffix, '')
1003
+
1004
+ # Remove extra spaces and punctuation
1005
+ import re
1006
+ meta_clean = re.sub(r'[^\w\s]', ' ', meta_clean)
1007
+ filter_clean = re.sub(r'[^\w\s]', ' ', filter_clean)
1008
+ meta_clean = ' '.join(meta_clean.split())
1009
+ filter_clean = ' '.join(filter_clean.split())
1010
+
1011
+ # Check if filter value is a substring of metadata value
1012
+ if filter_clean in meta_clean:
1013
+ return True
1014
+
1015
+ # Also check if all significant words from filter are in metadata
1016
+ # Handle singular/plural variations
1017
+ filter_words = [w for w in filter_clean.split() if len(w) > 2]
1018
+ if filter_words:
1019
+ meta_words = set(meta_clean.split())
1020
+ for word in filter_words:
1021
+ # Check exact match
1022
+ if word in meta_words:
1023
+ continue
1024
+ # Check singular/plural variations
1025
+ word_singular = word.rstrip('s') if word.endswith('s') else word
1026
+ word_plural = word + 's' if not word.endswith('s') else word
1027
+ if word_singular in meta_words or word_plural in meta_words:
1028
+ continue
1029
+ # Check if word is a substring of any metadata word
1030
+ if any(word in mw or mw in word for mw in meta_words if len(mw) > 3):
1031
+ continue
1032
+ # If none match, this word doesn't match
1033
+ return False
1034
+ return True
1035
+
1036
+ return False
1037
+
1038
+ # Apply flexible matching
1039
  all_metadata = [m for m in all_metadata
1040
+ if any(matches_name(m.get(field, ''), fv) for fv in filter_values)]
1041
  print(f"[METADATA QUERY] Filtered by {field} {filter_values}: {len(all_metadata)} remaining")
1042
 
1043
  elif field == 'renewal_year':
 
1148
  - Location: {meta.get('city', '')}, {meta.get('state', '')}
1149
  """
1150
  else:
1151
+ # Compact format for large sets - includes renewal date
1152
+ entry = f"{i}. {meta.get('document_title', 'Unknown')} | {meta.get('insured_name', 'N/A')} | ₹{meta.get('premium_amount', 0):,.0f} | {meta.get('policy_type', 'N/A')} | Renewal: {meta.get('renewal_date', 'N/A')}"
1153
 
1154
  context_parts.append(entry)
1155
 
 
1166
  'sources': {m.get('doc_id'): m.get('document_title') for m in all_metadata}
1167
  }
1168
 
1169
+ def _get_rag_context_for_query(self, user_id: str, bucket_id: str, query: str,
1170
+ filters: dict = None, is_fallback: bool = False,
1171
+ doc_ids: list[str] = None) -> dict:
1172
+ """
1173
+ Get RAG context from chunk retrieval for a query.
1174
+ Used as fallback when metadata filtering returns 0 results,
1175
+ or to supplement metadata with detailed document content.
1176
+
1177
+ Args:
1178
+ user_id: User ID
1179
+ bucket_id: Bucket ID
1180
+ query: The search query
1181
+ filters: Optional filters from parsed query (used to build search query)
1182
+ is_fallback: If True, use more aggressive search (higher top_k, better query construction)
1183
+ doc_ids: Optional list of specific document IDs to search (from document name detection)
1184
+
1185
+ Returns:
1186
+ dict with:
1187
+ - context: Combined text from retrieved chunks
1188
+ - sources: dict of doc_id -> filename
1189
+ - chunk_count: Number of chunks retrieved
1190
+ - chunks: Raw chunk data
1191
+ """
1192
+ print(f"[HYBRID RAG] Getting RAG context for query: {query[:50]}... (fallback={is_fallback})")
1193
+
1194
+ # Step 0: Detect if user mentioned a specific document name in the query
1195
+ if doc_ids is None:
1196
+ user_docs = chroma_service.get_user_documents(user_id, bucket_id)
1197
+ referenced_doc_ids = self._detect_document_reference(query, user_docs)
1198
+ if referenced_doc_ids:
1199
+ doc_ids = referenced_doc_ids
1200
+ print(f"[HYBRID RAG] Detected document reference in query: {len(doc_ids)} documents")
1201
+
1202
+ # Build enhanced search query from filters if available
1203
+ search_query = query
1204
+ if filters:
1205
+ # Add filter values to improve semantic search
1206
+ filter_terms = []
1207
+ for field, value in filters.items():
1208
+ if value and field in ['insured_name', 'insurer_name', 'broker_name',
1209
+ 'policy_type', 'industry', 'city', 'state']:
1210
+ filter_terms.append(str(value))
1211
+ if filter_terms:
1212
+ search_query = f"{query} {' '.join(filter_terms)}"
1213
+ print(f"[HYBRID RAG] Enhanced search query: {search_query[:80]}...")
1214
+
1215
+ # For fallback searches, use more aggressive parameters
1216
+ if is_fallback:
1217
+ # Extract key terms from original query for better matching
1218
+ # Split query into words and keep important terms
1219
+ query_words = query.lower().split()
1220
+ # Remove common stop words but keep entity names and numbers
1221
+ stop_words = {'how', 'many', 'are', 'is', 'the', 'a', 'an', 'for', 'in', 'on', 'at', 'to', 'of'}
1222
+ key_terms = [w for w in query_words if w not in stop_words and len(w) > 2]
1223
+
1224
+ # If we have filters, prioritize those terms
1225
+ if filters:
1226
+ for field, value in filters.items():
1227
+ if value and field in ['insured_name', 'insurer_name', 'broker_name']:
1228
+ # Add the filter value as a key term
1229
+ value_words = str(value).lower().split()
1230
+ key_terms.extend([w for w in value_words if len(w) > 2])
1231
+
1232
+ # Build a more focused search query for fallback
1233
+ if key_terms:
1234
+ # Use original query + key terms for better semantic matching
1235
+ enhanced_query = f"{query} {' '.join(set(key_terms))}"
1236
+ print(f"[HYBRID RAG] Fallback enhanced query: {enhanced_query[:100]}...")
1237
+ search_query = enhanced_query
1238
+
1239
+ # Perform semantic chunk search with higher top_k for fallback
1240
+ # IMPORTANT: ChromaDB Cloud has a quota limit of 300 results per query
1241
+ # Cap top_k to respect this limit
1242
+ CHROMADB_MAX_RESULTS = 300
1243
+ if is_fallback:
1244
+ top_k_value = min(self.top_k * 4, CHROMADB_MAX_RESULTS)
1245
+ else:
1246
+ top_k_value = min(self.top_k * 2, CHROMADB_MAX_RESULTS)
1247
+ print(f"[HYBRID RAG] Using top_k={top_k_value} for search (capped at {CHROMADB_MAX_RESULTS} for ChromaDB quota)")
1248
+
1249
+ chunks = chroma_service.search_chunks(
1250
+ user_id=user_id,
1251
+ query=search_query,
1252
+ bucket_id=bucket_id,
1253
+ doc_ids=doc_ids, # Pass doc_ids to filter by specific documents if detected
1254
+ top_k=top_k_value
1255
+ )
1256
+
1257
+ if not chunks:
1258
+ print("[HYBRID RAG] No chunks found from RAG search")
1259
+ return {
1260
+ 'context': '',
1261
+ 'sources': {},
1262
+ 'chunk_count': 0,
1263
+ 'chunks': []
1264
+ }
1265
+
1266
+ print(f"[HYBRID RAG] Found {len(chunks)} chunks via semantic search")
1267
+
1268
+ # Build context from chunks
1269
+ context_parts = []
1270
+ sources = {}
1271
+
1272
+ for i, chunk in enumerate(chunks, 1):
1273
+ doc_id = chunk['doc_id']
1274
+
1275
+ # Get filename from chroma if not cached
1276
+ if doc_id not in sources:
1277
+ doc_info = chroma_service.get_document(doc_id, user_id)
1278
+ filename = doc_info.get('filename', 'Document') if doc_info else 'Document'
1279
+ sources[doc_id] = filename
1280
+
1281
+ # Build context entry with document label
1282
+ section = f"=== DOCUMENT: {sources[doc_id]} (Section {i}) ===\n{chunk['text']}"
1283
+ context_parts.append(section)
1284
+
1285
+ context = "\n\n".join(context_parts)
1286
+ print(f"[HYBRID RAG] Built context: {len(context)} chars from {len(chunks)} chunks")
1287
+
1288
+ return {
1289
+ 'context': context,
1290
+ 'sources': sources,
1291
+ 'chunk_count': len(chunks),
1292
+ 'chunks': chunks
1293
+ }
1294
+
1295
+ def _combine_metadata_and_rag(self, metadata_result: dict, rag_result: dict) -> dict:
1296
+ """
1297
+ Combine metadata and RAG contexts for hybrid queries.
1298
+ Provides structured metadata summary + detailed RAG content.
1299
+
1300
+ Args:
1301
+ metadata_result: Result from _handle_metadata_query
1302
+ rag_result: Result from _get_rag_context_for_query
1303
+
1304
+ Returns:
1305
+ Combined context dict with merged sources
1306
+ """
1307
+ combined_parts = []
1308
+
1309
+ # Add metadata summary section if available
1310
+ if metadata_result.get('context') and metadata_result.get('total_documents', 0) > 0:
1311
+ combined_parts.append("=== DOCUMENT METADATA (Structured Fields) ===")
1312
+ combined_parts.append(metadata_result['context'])
1313
+ combined_parts.append("")
1314
+
1315
+ # Add RAG context section if available
1316
+ if rag_result.get('context') and rag_result.get('chunk_count', 0) > 0:
1317
+ combined_parts.append("=== DETAILED DOCUMENT CONTENT (From Text Search) ===")
1318
+ combined_parts.append(rag_result['context'])
1319
+
1320
+ # Merge sources
1321
+ all_sources = {}
1322
+ all_sources.update(metadata_result.get('sources', {}))
1323
+ all_sources.update(rag_result.get('sources', {}))
1324
+
1325
+ combined_context = "\n".join(combined_parts)
1326
+
1327
+ print(f"[HYBRID] Combined context: metadata={metadata_result.get('total_documents', 0)} docs, "
1328
+ f"rag={rag_result.get('chunk_count', 0)} chunks, total sources={len(all_sources)}")
1329
+
1330
+ return {
1331
+ 'context': combined_context,
1332
+ 'sources': all_sources,
1333
+ 'total_documents': metadata_result.get('total_documents', 0),
1334
+ 'chunk_count': rag_result.get('chunk_count', 0),
1335
+ 'calculation': metadata_result.get('calculation'),
1336
+ 'total_before_filter': metadata_result.get('total_before_filter', 0)
1337
+ }
1338
+
1339
+ def _stream_hybrid_query(self, user_id: str, bucket_id: str,
1340
+ query: str, parsed: dict, chat_id: str = ""):
1341
+ """
1342
+ Stream responses for HYBRID queries.
1343
+ Combines metadata (structured fields) with RAG (detailed content) for comprehensive answers.
1344
+
1345
+ Works for all query types: specific, compare, general, summarize, followup.
1346
+ """
1347
+ print(f"[HYBRID STREAM] Handling {parsed.get('intent')} query with metadata+RAG")
1348
+
1349
+ # Get format preference
1350
+ format_preference = parsed.get('format_preference')
1351
+ format_instructions = self._get_format_instructions(format_preference) if format_preference else ""
1352
+
1353
+ # Step 1: Get metadata context (may return 0 if filters don't match exactly)
1354
+ metadata_result = self._handle_metadata_query(user_id, bucket_id, query, parsed)
1355
+ print(f"[HYBRID STREAM] Metadata returned {metadata_result.get('total_documents', 0)} docs")
1356
+
1357
+ # Step 2: Always get RAG context for detailed content
1358
+ # If metadata returned 0, use fallback mode for more aggressive search
1359
+ # Also detect document names in query for targeted search
1360
+ metadata_has_results = metadata_result.get('total_documents', 0) > 0
1361
+ rag_result = self._get_rag_context_for_query(
1362
+ user_id, bucket_id, query,
1363
+ filters=parsed.get('filters'),
1364
+ is_fallback=not metadata_has_results, # Use fallback mode if metadata failed
1365
+ doc_ids=None # Document name detection happens inside the method
1366
+ )
1367
+ print(f"[HYBRID STREAM] RAG returned {rag_result.get('chunk_count', 0)} chunks")
1368
+
1369
+ # Step 3: Combine contexts
1370
+ if metadata_result.get('total_documents', 0) > 0:
1371
+ # Have metadata - combine with RAG
1372
+ combined = self._combine_metadata_and_rag(metadata_result, rag_result)
1373
+ elif rag_result.get('chunk_count', 0) > 0:
1374
+ # No metadata match but RAG found content - use RAG only
1375
+ print("[HYBRID STREAM] No metadata match, using RAG-only context")
1376
+ combined = {
1377
+ 'context': rag_result['context'],
1378
+ 'sources': rag_result['sources'],
1379
+ 'total_documents': 0,
1380
+ 'chunk_count': rag_result['chunk_count'],
1381
+ 'calculation': None,
1382
+ 'total_before_filter': 0
1383
+ }
1384
+ else:
1385
+ # Neither found anything
1386
+ yield {
1387
+ "type": "error",
1388
+ "content": "No matching documents found. The document may not exist or try rephrasing your query."
1389
+ }
1390
+ return
1391
+
1392
+ context = combined['context']
1393
+ sources = combined['sources']
1394
+ total_docs = combined.get('total_documents', 0) + combined.get('chunk_count', 0)
1395
+
1396
+ # Send sources first
1397
+ yield {
1398
+ "type": "sources",
1399
+ "sources": list(sources.keys()),
1400
+ "source_files": list(sources.values())
1401
+ }
1402
+
1403
+ # Step 4: Build AI prompt based on intent
1404
+ intent = parsed.get('intent', 'specific')
1405
+
1406
+ if intent == 'compare':
1407
+ system_prompt = f"""You are Iribl AI, a document analysis assistant answering a COMPARISON query.
1408
+
1409
+ CRITICAL INSTRUCTIONS:
1410
+ 1. You have BOTH structured metadata AND detailed document content.
1411
+ 2. Use metadata for key fields: policy numbers, amounts, dates, companies.
1412
+ 3. Use detailed content for specifics not in metadata.
1413
+ 4. Create a clear comparison highlighting differences and similarities.
1414
+ 5. Use a table format if comparing multiple attributes.
1415
+
1416
+ {format_instructions}
1417
+
1418
+ Do NOT say information is missing if it's in the provided context."""
1419
+
1420
+ elif intent == 'summarize':
1421
+ system_prompt = f"""You are Iribl AI, a document analysis assistant providing a SUMMARY.
1422
+
1423
+ CRITICAL INSTRUCTIONS:
1424
+ 1. You have BOTH structured metadata AND detailed document content.
1425
+ 2. Provide a concise but comprehensive summary.
1426
+ 3. Include key facts: insured name, policy type, coverage, premium, dates.
1427
+ 4. Highlight important terms or conditions from detailed content.
1428
+ 5. Format with clear headers and bullet points.
1429
+
1430
+ {format_instructions}
1431
+
1432
+ Do NOT say information is missing - search through ALL provided context thoroughly."""
1433
+
1434
+ elif intent == 'specific':
1435
+ system_prompt = f"""You are Iribl AI, a document analysis assistant answering a SPECIFIC query about a particular document or entity.
1436
+
1437
+ CRITICAL INSTRUCTIONS:
1438
+ 1. You have BOTH structured metadata AND detailed document content.
1439
+ 2. Use metadata for: policy number, insured name, sum insured, premium, dates.
1440
+ 3. Use detailed content for: coverage details, terms, conditions, exclusions.
1441
+ 4. Provide a comprehensive answer covering all relevant information.
1442
+ 5. Format clearly with headers and bullet points.
1443
+
1444
+ {format_instructions}
1445
+
1446
+ Do NOT say information is missing - search through ALL provided context thoroughly."""
1447
+
1448
+ else: # general, followup, or any other
1449
+ system_prompt = f"""You are Iribl AI, a document analysis assistant.
1450
+
1451
+ CRITICAL INSTRUCTIONS:
1452
+ 1. You have BOTH structured metadata AND detailed document content.
1453
+ 2. Search thoroughly through ALL provided context before answering.
1454
+ 3. Use metadata for structured fields like names, amounts, dates.
1455
+ 4. Use detailed content for explanations, terms, conditions.
1456
+ 5. Provide a complete and accurate answer based on the documents.
1457
+ 6. Format clearly with headers and bullet points where appropriate.
1458
+
1459
+ {format_instructions}
1460
+
1461
+ Do NOT say information is missing - search through ALL provided context thoroughly."""
1462
+
1463
+ # Step 5: Load conversation history
1464
+ stored_history = []
1465
+ if chat_id:
1466
+ try:
1467
+ all_history = chroma_service.get_conversation_history(
1468
+ user_id=user_id,
1469
+ bucket_id=bucket_id,
1470
+ limit=50
1471
+ )
1472
+ stored_history = [msg for msg in all_history if msg.get('chat_id', '') == chat_id]
1473
+ stored_history = stored_history[-self.max_history:]
1474
+ except Exception as e:
1475
+ print(f"[HYBRID STREAM] Failed to load history: {e}")
1476
+
1477
+ # Step 6: Build messages
1478
+ messages = [{"role": "system", "content": system_prompt}]
1479
+
1480
+ for msg in stored_history:
1481
+ messages.append({
1482
+ "role": msg['role'],
1483
+ "content": msg['content']
1484
+ })
1485
+
1486
+ format_reminder = f"\n\nRemember: Format response as {format_preference}." if format_preference else ""
1487
+
1488
+ user_message = f"""Based on the following document data, answer my question comprehensively.
1489
+
1490
+ DOCUMENT DATA:
1491
+ {context}
1492
+
1493
+ QUESTION: {query}
1494
+
1495
+ Instructions: Use both the structured metadata AND detailed content to provide a complete answer.{format_reminder}"""
1496
+
1497
+ messages.append({"role": "user", "content": user_message})
1498
+
1499
+ # Step 7: Stream response
1500
+ full_response = ""
1501
+ chunk_count = 0
1502
+
1503
+ if self.use_deepseek:
1504
+ print("[HYBRID STREAM] Using DeepSeek for response")
1505
+ for chunk in self._call_deepseek_streaming(messages):
1506
+ if "error" in chunk:
1507
+ break
1508
+ if "chunk" in chunk:
1509
+ full_response += chunk["chunk"]
1510
+ chunk_count += 1
1511
+ yield {"type": "content", "content": chunk["chunk"]}
1512
+
1513
+ # Fallback to OpenRouter if needed
1514
+ if not full_response:
1515
+ print("[HYBRID STREAM] Falling back to OpenRouter")
1516
+ for model_key in self.fallback_order:
1517
+ try:
1518
+ for chunk in self._call_ai_model_streaming(model_key, messages):
1519
+ if "error" in chunk:
1520
+ continue
1521
+ if "chunk" in chunk:
1522
+ full_response += chunk["chunk"]
1523
+ chunk_count += 1
1524
+ yield {"type": "content", "content": chunk["chunk"]}
1525
+ if full_response:
1526
+ break
1527
+ except Exception as e:
1528
+ print(f"[HYBRID STREAM] Model {model_key} failed: {e}")
1529
+ continue
1530
+
1531
+ # Step 8: Store conversation
1532
+ if full_response and chat_id:
1533
+ try:
1534
+ chroma_service.store_conversation(
1535
+ user_id=user_id,
1536
+ role="user",
1537
+ content=query,
1538
+ bucket_id=bucket_id or "",
1539
+ chat_id=chat_id
1540
+ )
1541
+ chroma_service.store_conversation(
1542
+ user_id=user_id,
1543
+ role="assistant",
1544
+ content=full_response,
1545
+ bucket_id=bucket_id or "",
1546
+ chat_id=chat_id,
1547
+ format_preference=format_preference
1548
+ )
1549
+ except Exception as e:
1550
+ print(f"[HYBRID STREAM] Failed to store conversation: {e}")
1551
+
1552
+ yield {
1553
+ "type": "done",
1554
+ "query_type": "hybrid",
1555
+ "intent": intent,
1556
+ "metadata_docs": combined.get('total_documents', 0),
1557
+ "rag_chunks": combined.get('chunk_count', 0)
1558
+ }
1559
+
1560
  def _stream_metadata_query(self, user_id: str, bucket_id: str,
1561
  query: str, parsed: dict, chat_id: str = ""):
1562
  """
 
1614
  total_before = result.get('total_before_filter', 0)
1615
  calculation = result.get('calculation')
1616
 
1617
+ # Check if we have any data - if not, try RAG fallback
1618
  if not context or total_docs == 0:
1619
+ print(f"[HYBRID] Metadata returned 0 results, attempting RAG fallback...")
1620
+
1621
+ # Try RAG fallback with the filters as search enhancement
1622
+ # Use is_fallback=True for more aggressive search
1623
+ # Also detect document names in query for targeted search
1624
+ rag_result = self._get_rag_context_for_query(
1625
+ user_id, bucket_id, query,
1626
+ filters=parsed.get('filters'),
1627
+ is_fallback=True, # Use more aggressive search parameters
1628
+ doc_ids=None # Document name detection happens inside the method
1629
+ )
1630
+
1631
+ if rag_result.get('context') and rag_result.get('chunk_count', 0) > 0:
1632
+ # Use RAG context instead
1633
+ context = rag_result['context']
1634
+ sources = rag_result['sources']
1635
+ total_docs = rag_result['chunk_count']
1636
+ total_before = 0 # Not applicable for RAG
1637
+ print(f"[HYBRID] RAG fallback successful: found {total_docs} chunks")
1638
+ else:
1639
+ # Both metadata and RAG failed
1640
+ yield {
1641
+ "type": "error",
1642
+ "content": "No matching documents found. The document may not exist in this collection, or try rephrasing your query."
1643
+ }
1644
+ return
1645
 
1646
  # Send sources first
1647
  yield {
 
1658
  conciseness_directive = "\n\nIMPORTANT: Be concise and direct. No preambles or verbose explanations. Get straight to the formatted answer." if format_preference else ""
1659
 
1660
  if intent == 'count':
1661
+ # Check if we're using RAG fallback (metadata returned 0)
1662
+ is_rag_fallback = (total_before == 0 and total_docs > 0)
1663
+
1664
+ if is_rag_fallback:
1665
+ # Using RAG content - need to extract count from document text
1666
+ system_prompt = f"""You are Iribl AI, a document analysis assistant answering a COUNT query.
1667
+
1668
+ CRITICAL INSTRUCTIONS:
1669
+ 1. The user is asking for a COUNT/NUMBER that may not be in structured metadata.
1670
+ 2. You have been provided with detailed document content from RAG search.
1671
+ 3. CAREFULLY read through the document content to find the specific number/count requested.
1672
+ 4. Look for numbers, counts, totals, or quantities related to the query.
1673
+ 5. If the query asks "how many students", search for phrases like:
1674
+ - "total students", "number of students", "students insured", "X students"
1675
+ - Look for explicit numbers in the context
1676
+ 6. State the count clearly and directly. If you find the number, present it confidently.
1677
+ 7. If the count is not explicitly stated, say so clearly.{conciseness_directive}
1678
+
1679
+ {format_instructions}
1680
+
1681
+ IMPORTANT: The answer is in the provided document content. Read it carefully to extract the exact number."""
1682
+ else:
1683
+ # Using metadata - count is pre-computed
1684
+ system_prompt = f"""You are Iribl AI, a document analysis assistant answering a COUNT query.
1685
 
1686
  CRITICAL INSTRUCTIONS:
1687
  1. The count has been computed: {total_docs} documents match the criteria.
 
1733
  {format_instructions if format_instructions else "FORMAT: Use tables or side-by-side format where helpful."}"""
1734
 
1735
  else: # list, summarize, or other
1736
+ system_prompt = f"""You are Iribl AI, a document analysis assistant. You are answering a query about {total_docs} documents.
1737
+
1738
+ ABSOLUTELY CRITICAL - READ CAREFULLY:
1739
+ 1. You have been given metadata for EXACTLY {total_docs} documents.
1740
+ 2. When asked to list or format as table, you MUST include ALL {total_docs} items.
1741
+ 3. Do NOT truncate, summarize, or skip ANY items.
1742
+ 4. If there are {total_docs} documents, your response MUST contain {total_docs} entries.
1743
+ 5. For TABLES: Include EVERY row - the table must have exactly {total_docs} data rows.
1744
+ 6. For LISTS: Number 1 through {total_docs} - include every single one.
1745
+
1746
+ METADATA COLUMNS AVAILABLE:
1747
+ - document_title (Policy/Document Name)
1748
+ - insured_name (Insured Company)
1749
+ - insurer_name (Insurance Company)
1750
+ - policy_type (Type of Policy)
1751
+ - sum_insured (Coverage Amount)
1752
+ - premium_amount (Premium)
1753
+ - renewal_date (Renewal Date)
1754
+ - renewal_year (Renewal Year)
1755
+ - policy_start_date, policy_end_date
1756
+ - city, state (Location)
1757
 
1758
  {format_instructions if format_instructions else "FORMAT: Use headers, bullet points, and bold text for clarity."}
1759
 
1760
+ FAILURE TO INCLUDE ALL {total_docs} ITEMS IS UNACCEPTABLE. Do NOT say 'and X more' or truncate the list."""
1761
 
1762
  # Step 3: Load conversation history for memory (CRITICAL FOR CONTEXT)
1763
  stored_history = []
 
2455
  print(f"[QUERY ROUTING] AI-parsed query: {parsed}")
2456
 
2457
  # Route based on AI-parsed intent
2458
+ intent = parsed.get('intent', 'specific')
2459
+ needs_metadata = parsed.get('needs_metadata', False)
2460
+
2461
+ # HYBRID ROUTING LOGIC:
2462
+ # 1. For aggregate/list/count/rank queries: Use metadata (with RAG fallback)
2463
+ # 2. For ALL other queries: Use HYBRID (metadata + RAG together) for comprehensive answers
2464
+
2465
+ if intent in ['list', 'count', 'rank', 'calculate'] and needs_metadata:
2466
+ # Aggregate queries - metadata is primary, RAG is fallback (handled inside)
2467
+ print(f"[QUERY ROUTING] Using METADATA path for {intent} query")
2468
  yield from self._stream_metadata_query(user_id, bucket_id, query, parsed, chat_id)
2469
  return
2470
 
2471
+ else:
2472
+ # ALL other queries (specific, compare, general, summarize, followup)
2473
+ # Use HYBRID approach - both metadata AND RAG for comprehensive answers
2474
+ print(f"[QUERY ROUTING] Using HYBRID path for {intent} query")
2475
+ yield from self._stream_hybrid_query(user_id, bucket_id, query, parsed, chat_id)
2476
+ return
2477
 
2478
  # Step 1: Expand query for better retrieval (handles "module 5" -> "module five", etc.)
2479
  expanded_queries = self._expand_query(query)
static/css/styles.css CHANGED
@@ -671,6 +671,7 @@ body {
671
  overflow: hidden;
672
  height: 100%;
673
  /* Ensure it takes full height */
 
674
  }
675
 
676
  /* ==================== Chat Bucket Filter ==================== */
@@ -807,6 +808,21 @@ body {
807
  gap: 1rem;
808
  min-height: 0;
809
  /* Critical: allows scrolling to work */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
810
  }
811
 
812
  /* Custom scrollbar for chat messages */
@@ -830,8 +846,10 @@ body {
830
  .message {
831
  display: flex;
832
  gap: 0.75rem;
833
- max-width: 85%;
 
834
  animation: messageSlide 0.3s ease-out;
 
835
  }
836
 
837
  @keyframes messageSlide {
@@ -849,6 +867,12 @@ body {
849
  .message.user {
850
  align-self: flex-end;
851
  flex-direction: row-reverse;
 
 
 
 
 
 
852
  }
853
 
854
  .message-avatar {
@@ -868,20 +892,29 @@ body {
868
  padding: 1rem 1.25rem;
869
  border-radius: var(--radius-lg);
870
  font-size: 0.9rem;
871
- line-height: 1.6;
 
 
 
 
 
 
872
  }
873
 
874
  .message.user .message-content {
875
  background: var(--accent-primary);
876
- color: var(--bg-darkest);
877
  border-bottom-right-radius: 4px;
 
 
 
878
  }
879
 
880
  .message.assistant .message-content {
881
  background: linear-gradient(135deg, var(--bg-light) 0%, var(--bg-medium) 100%);
882
  border: 1px solid var(--glass-border);
883
  border-bottom-left-radius: 4px;
884
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.15);
885
  }
886
 
887
  .message-sources {
@@ -905,54 +938,84 @@ body {
905
  .message-content h2,
906
  .message-content h3,
907
  .message-content h4,
 
 
908
  .message-content .msg-header {
909
- font-weight: 600;
910
  color: var(--text-primary);
911
- margin: 1.25rem 0 0.6rem 0;
912
- line-height: 1.4;
 
913
  }
914
 
915
  .message-content h1 {
916
- font-size: 1.25rem;
917
- background: linear-gradient(90deg, var(--accent-primary), var(--accent-secondary));
918
  -webkit-background-clip: text;
919
  -webkit-text-fill-color: transparent;
920
  background-clip: text;
921
- padding-bottom: 0.5rem;
922
- border-bottom: 2px solid rgba(168, 85, 247, 0.3);
 
 
923
  }
924
 
925
  .message-content h2 {
926
- font-size: 1.1rem;
927
- color: var(--accent-secondary);
928
- border-bottom: 1px solid rgba(168, 85, 247, 0.2);
929
- padding-bottom: 0.4rem;
 
 
930
  }
931
 
932
  .message-content h3 {
933
- font-size: 1rem;
934
  color: var(--info);
 
 
 
935
  }
936
 
937
  .message-content h4 {
938
- font-size: 0.95rem;
939
  font-weight: 600;
940
- color: var(--text-secondary);
941
- margin: 0.9rem 0 0.4rem 0;
 
 
 
 
 
 
 
 
942
  }
943
 
944
  .message-content h1:first-child,
945
  .message-content h2:first-child,
946
  .message-content h3:first-child,
947
  .message-content h4:first-child,
 
 
948
  .message-content .msg-header:first-child {
949
  margin-top: 0;
950
  }
951
 
952
  .message-content p,
953
  .message-content .msg-para {
954
- margin: 0.75rem 0;
955
  line-height: 1.75;
 
 
 
 
 
 
 
 
 
 
956
  }
957
 
958
  .message-content p:first-child,
@@ -962,7 +1025,7 @@ body {
962
 
963
  /* ==================== Enhanced Lists ==================== */
964
  .message-content .formatted-list {
965
- margin: 1rem 0;
966
  padding-left: 0;
967
  list-style: none;
968
  }
@@ -973,17 +1036,20 @@ body {
973
 
974
  .message-content .formatted-list li {
975
  position: relative;
976
- padding: 0.5rem 0.75rem 0.5rem 2.25rem;
977
- margin: 0.35rem 0;
978
- background: rgba(255, 255, 255, 0.02);
979
  border-radius: var(--radius-md);
980
  border-left: 3px solid transparent;
981
- line-height: 1.65;
982
  transition: all 0.2s ease;
 
 
983
  }
984
 
985
  .message-content .formatted-list li:hover {
986
- background: rgba(255, 255, 255, 0.04);
 
987
  }
988
 
989
  .message-content .formatted-list li.numbered {
@@ -1032,14 +1098,16 @@ body {
1032
  /* Legacy list support */
1033
  .message-content ul,
1034
  .message-content ol {
1035
- margin: 0.75rem 0;
1036
- padding-left: 1.5rem;
1037
  }
1038
 
1039
  .message-content li {
1040
- margin: 0.4rem 0;
1041
  padding-left: 0.5rem;
1042
- line-height: 1.6;
 
 
1043
  }
1044
 
1045
  .message-content ul li::marker {
@@ -1066,12 +1134,15 @@ body {
1066
  overflow-x: auto;
1067
  overflow-y: hidden;
1068
  max-width: 100%;
 
1069
  box-shadow: 0 2px 12px rgba(0, 0, 0, 0.2);
1070
  border: 1px solid rgba(255, 255, 255, 0.08);
 
1071
  }
1072
 
1073
  .message-content table {
1074
  width: 100%;
 
1075
  border-collapse: collapse;
1076
  font-size: 0.8rem;
1077
  background: rgba(0, 0, 0, 0.2);
@@ -1083,23 +1154,27 @@ body {
1083
  }
1084
 
1085
  .message-content th {
1086
- padding: 0.6rem 0.75rem;
1087
- font-weight: 600;
1088
  color: var(--text-primary);
1089
  text-align: left;
1090
- border-bottom: 2px solid rgba(168, 85, 247, 0.3);
1091
  text-transform: uppercase;
1092
- font-size: 0.7rem;
1093
- letter-spacing: 0.3px;
1094
  white-space: nowrap;
1095
  }
1096
 
1097
  .message-content td {
1098
- padding: 0.5rem 0.75rem;
1099
- border-bottom: 1px solid rgba(255, 255, 255, 0.05);
1100
  color: var(--text-secondary);
1101
  word-break: break-word;
1102
- max-width: 200px;
 
 
 
 
1103
  }
1104
 
1105
  .message-content tbody tr {
@@ -1159,12 +1234,17 @@ body {
1159
  .message-content b {
1160
  font-weight: 700;
1161
  color: var(--text-primary);
 
 
 
 
1162
  }
1163
 
1164
  .message-content em,
1165
  .message-content i {
1166
  font-style: italic;
1167
- color: var(--text-secondary);
 
1168
  }
1169
 
1170
  /* ==================== Dividers ==================== */
@@ -1173,17 +1253,21 @@ body {
1173
  height: 1px;
1174
  background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.4), transparent);
1175
  margin: 1.5rem 0;
 
1176
  }
1177
 
1178
  /* ==================== Blockquotes ==================== */
1179
  .message-content blockquote {
1180
  border-left: 4px solid var(--accent-primary);
1181
- margin: 1rem 0;
1182
- padding: 0.75rem 1.25rem;
1183
- background: linear-gradient(135deg, rgba(168, 85, 247, 0.08) 0%, rgba(96, 165, 250, 0.05) 100%);
1184
  border-radius: 0 var(--radius-md) var(--radius-md) 0;
1185
  font-style: italic;
1186
  color: var(--text-secondary);
 
 
 
1187
  }
1188
 
1189
  /* ==================== Typing Indicator ==================== */
 
671
  overflow: hidden;
672
  height: 100%;
673
  /* Ensure it takes full height */
674
+ transition: all var(--transition-smooth);
675
  }
676
 
677
  /* ==================== Chat Bucket Filter ==================== */
 
808
  gap: 1rem;
809
  min-height: 0;
810
  /* Critical: allows scrolling to work */
811
+ transition: padding var(--transition-smooth);
812
+ }
813
+
814
+ /* Adjust chat when both sidebars are open for better space utilization */
815
+ .main-content:has(.sidebar-left:not(.collapsed)):has(.sidebar-right:not(.collapsed)) .chat-messages {
816
+ padding: 0.875rem;
817
+ gap: 0.875rem;
818
+ }
819
+
820
+ .main-content:has(.sidebar-left:not(.collapsed)):has(.sidebar-right:not(.collapsed)) .message.assistant {
821
+ max-width: 85%;
822
+ }
823
+
824
+ .main-content:has(.sidebar-left:not(.collapsed)):has(.sidebar-right:not(.collapsed)) .message.user {
825
+ max-width: 75%;
826
  }
827
 
828
  /* Custom scrollbar for chat messages */
 
846
  .message {
847
  display: flex;
848
  gap: 0.75rem;
849
+ max-width: 100%;
850
+ width: 100%;
851
  animation: messageSlide 0.3s ease-out;
852
+ min-width: 0; /* Allow flex item to shrink below content size */
853
  }
854
 
855
  @keyframes messageSlide {
 
867
  .message.user {
868
  align-self: flex-end;
869
  flex-direction: row-reverse;
870
+ max-width: 80%; /* User messages can be narrower */
871
+ }
872
+
873
+ .message.assistant {
874
+ align-self: flex-start;
875
+ max-width: 90%; /* Assistant messages use most of the width, leaving small margin */
876
  }
877
 
878
  .message-avatar {
 
892
  padding: 1rem 1.25rem;
893
  border-radius: var(--radius-lg);
894
  font-size: 0.9rem;
895
+ line-height: 1.7;
896
+ max-width: 100%;
897
+ min-width: 0; /* Allow content to shrink */
898
+ word-wrap: break-word;
899
+ overflow-wrap: break-word;
900
+ word-break: break-word; /* Break long words if needed */
901
+ color: var(--text-secondary);
902
  }
903
 
904
  .message.user .message-content {
905
  background: var(--accent-primary);
906
+ color: var(--bg-darkest) !important; /* Ensure dark text on white background */
907
  border-bottom-right-radius: 4px;
908
+ padding: 1rem 1.25rem; /* Slightly less padding for user messages */
909
+ font-weight: 500; /* Slightly bolder for better readability */
910
+ line-height: 1.7;
911
  }
912
 
913
  .message.assistant .message-content {
914
  background: linear-gradient(135deg, var(--bg-light) 0%, var(--bg-medium) 100%);
915
  border: 1px solid var(--glass-border);
916
  border-bottom-left-radius: 4px;
917
+ box-shadow: 0 4px 16px rgba(0, 0, 0, 0.2);
918
  }
919
 
920
  .message-sources {
 
938
  .message-content h2,
939
  .message-content h3,
940
  .message-content h4,
941
+ .message-content h5,
942
+ .message-content h6,
943
  .message-content .msg-header {
944
+ font-weight: 700;
945
  color: var(--text-primary);
946
+ margin: 2rem 0 1rem 0;
947
+ line-height: 1.3;
948
+ letter-spacing: -0.01em;
949
  }
950
 
951
  .message-content h1 {
952
+ font-size: 1.3rem;
953
+ background: linear-gradient(135deg, var(--accent-primary) 0%, #a855f7 100%);
954
  -webkit-background-clip: text;
955
  -webkit-text-fill-color: transparent;
956
  background-clip: text;
957
+ padding-bottom: 0.6rem;
958
+ border-bottom: 2px solid rgba(168, 85, 247, 0.4);
959
+ margin-top: 0;
960
+ margin-bottom: 1.25rem;
961
  }
962
 
963
  .message-content h2 {
964
+ font-size: 1.15rem;
965
+ color: var(--accent-primary);
966
+ border-bottom: 1px solid rgba(168, 85, 247, 0.3);
967
+ padding-bottom: 0.45rem;
968
+ margin-top: 1.5rem;
969
+ margin-bottom: 0.875rem;
970
  }
971
 
972
  .message-content h3 {
973
+ font-size: 1.05rem;
974
  color: var(--info);
975
+ margin-top: 1.5rem;
976
+ margin-bottom: 0.75rem;
977
+ font-weight: 600;
978
  }
979
 
980
  .message-content h4 {
981
+ font-size: 1rem;
982
  font-weight: 600;
983
+ color: var(--text-primary);
984
+ margin: 1.25rem 0 0.625rem 0;
985
+ }
986
+
987
+ .message-content h5,
988
+ .message-content h6 {
989
+ font-size: 1rem;
990
+ font-weight: 600;
991
+ color: var(--text-primary);
992
+ margin: 1.25rem 0 0.625rem 0;
993
  }
994
 
995
  .message-content h1:first-child,
996
  .message-content h2:first-child,
997
  .message-content h3:first-child,
998
  .message-content h4:first-child,
999
+ .message-content h5:first-child,
1000
+ .message-content h6:first-child,
1001
  .message-content .msg-header:first-child {
1002
  margin-top: 0;
1003
  }
1004
 
1005
  .message-content p,
1006
  .message-content .msg-para {
1007
+ margin: 0.875rem 0;
1008
  line-height: 1.75;
1009
+ color: var(--text-secondary);
1010
+ font-size: 0.9rem;
1011
+ }
1012
+
1013
+ .message-content p:first-child {
1014
+ margin-top: 0;
1015
+ }
1016
+
1017
+ .message-content p:last-child {
1018
+ margin-bottom: 0;
1019
  }
1020
 
1021
  .message-content p:first-child,
 
1025
 
1026
  /* ==================== Enhanced Lists ==================== */
1027
  .message-content .formatted-list {
1028
+ margin: 1.25rem 0;
1029
  padding-left: 0;
1030
  list-style: none;
1031
  }
 
1036
 
1037
  .message-content .formatted-list li {
1038
  position: relative;
1039
+ padding: 0.625rem 0.875rem 0.625rem 2.25rem;
1040
+ margin: 0.4rem 0;
1041
+ background: rgba(255, 255, 255, 0.03);
1042
  border-radius: var(--radius-md);
1043
  border-left: 3px solid transparent;
1044
+ line-height: 1.7;
1045
  transition: all 0.2s ease;
1046
+ font-size: 0.9rem;
1047
+ color: var(--text-secondary);
1048
  }
1049
 
1050
  .message-content .formatted-list li:hover {
1051
+ background: rgba(255, 255, 255, 0.06);
1052
+ transform: translateX(2px);
1053
  }
1054
 
1055
  .message-content .formatted-list li.numbered {
 
1098
  /* Legacy list support */
1099
  .message-content ul,
1100
  .message-content ol {
1101
+ margin: 1.25rem 0;
1102
+ padding-left: 1.75rem;
1103
  }
1104
 
1105
  .message-content li {
1106
+ margin: 0.5rem 0;
1107
  padding-left: 0.5rem;
1108
+ line-height: 1.7;
1109
+ font-size: 0.9rem;
1110
+ color: var(--text-secondary);
1111
  }
1112
 
1113
  .message-content ul li::marker {
 
1134
  overflow-x: auto;
1135
  overflow-y: hidden;
1136
  max-width: 100%;
1137
+ width: 100%;
1138
  box-shadow: 0 2px 12px rgba(0, 0, 0, 0.2);
1139
  border: 1px solid rgba(255, 255, 255, 0.08);
1140
+ -webkit-overflow-scrolling: touch; /* Smooth scrolling on iOS */
1141
  }
1142
 
1143
  .message-content table {
1144
  width: 100%;
1145
+ min-width: 100%; /* Ensure table takes full width of wrapper */
1146
  border-collapse: collapse;
1147
  font-size: 0.8rem;
1148
  background: rgba(0, 0, 0, 0.2);
 
1154
  }
1155
 
1156
  .message-content th {
1157
+ padding: 0.75rem 1rem;
1158
+ font-weight: 700;
1159
  color: var(--text-primary);
1160
  text-align: left;
1161
+ border-bottom: 2px solid rgba(168, 85, 247, 0.4);
1162
  text-transform: uppercase;
1163
+ font-size: 0.75rem;
1164
+ letter-spacing: 0.5px;
1165
  white-space: nowrap;
1166
  }
1167
 
1168
  .message-content td {
1169
+ padding: 0.75rem 1rem;
1170
+ border-bottom: 1px solid rgba(255, 255, 255, 0.08);
1171
  color: var(--text-secondary);
1172
  word-break: break-word;
1173
+ overflow-wrap: break-word;
1174
+ max-width: none; /* Remove max-width restriction */
1175
+ min-width: 100px; /* Minimum width for readability */
1176
+ line-height: 1.6;
1177
+ font-size: 0.9rem;
1178
  }
1179
 
1180
  .message-content tbody tr {
 
1234
  .message-content b {
1235
  font-weight: 700;
1236
  color: var(--text-primary);
1237
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.1) 0%, rgba(168, 85, 247, 0.1) 100%);
1238
+ padding: 0.1rem 0.3rem;
1239
+ border-radius: 4px;
1240
+ font-weight: 600;
1241
  }
1242
 
1243
  .message-content em,
1244
  .message-content i {
1245
  font-style: italic;
1246
+ color: var(--accent-secondary);
1247
+ font-weight: 500;
1248
  }
1249
 
1250
  /* ==================== Dividers ==================== */
 
1253
  height: 1px;
1254
  background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.4), transparent);
1255
  margin: 1.5rem 0;
1256
+ border-radius: 2px;
1257
  }
1258
 
1259
  /* ==================== Blockquotes ==================== */
1260
  .message-content blockquote {
1261
  border-left: 4px solid var(--accent-primary);
1262
+ margin: 1.25rem 0;
1263
+ padding: 0.875rem 1.25rem;
1264
+ background: linear-gradient(135deg, rgba(168, 85, 247, 0.12) 0%, rgba(96, 165, 250, 0.08) 100%);
1265
  border-radius: 0 var(--radius-md) var(--radius-md) 0;
1266
  font-style: italic;
1267
  color: var(--text-secondary);
1268
+ font-size: 0.9rem;
1269
+ line-height: 1.75;
1270
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
1271
  }
1272
 
1273
  /* ==================== Typing Indicator ==================== */
static/js/app.js CHANGED
@@ -1326,7 +1326,8 @@ function renderMessages() {
1326
 
1327
  const html = state.messages.map((msg, i) => {
1328
  const avatar = msg.role === 'user' ? (state.user?.username?.charAt(0).toUpperCase() || 'U') : '🧠';
1329
- return `<div class="message ${msg.role}"><div class="message-avatar">${avatar}</div><div class="message-content">${formatContent(msg.content)}</div></div>`;
 
1330
  }).join('');
1331
 
1332
  // Build full content with summary panel and welcome screen
@@ -1354,10 +1355,24 @@ function renderMessages() {
1354
  elements.summaryClose.addEventListener('click', hideSummary);
1355
  }
1356
 
1357
- function formatContent(content) {
1358
  // Enhanced markdown parsing for beautiful formatting
1359
  let html = content;
1360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1361
  // Escape HTML special characters first (except for already parsed markdown)
1362
  // Skip this if content looks like it's already HTML
1363
  if (!html.includes('<table') && !html.includes('<div')) {
 
1326
 
1327
  const html = state.messages.map((msg, i) => {
1328
  const avatar = msg.role === 'user' ? (state.user?.username?.charAt(0).toUpperCase() || 'U') : '🧠';
1329
+ const isUserMessage = msg.role === 'user';
1330
+ return `<div class="message ${msg.role}"><div class="message-avatar">${avatar}</div><div class="message-content">${formatContent(msg.content, isUserMessage)}</div></div>`;
1331
  }).join('');
1332
 
1333
  // Build full content with summary panel and welcome screen
 
1355
  elements.summaryClose.addEventListener('click', hideSummary);
1356
  }
1357
 
1358
+ function formatContent(content, isUserMessage = false) {
1359
  // Enhanced markdown parsing for beautiful formatting
1360
  let html = content;
1361
 
1362
+ // For user messages, escape HTML and preserve line breaks
1363
+ if (isUserMessage) {
1364
+ // Escape HTML to prevent XSS
1365
+ html = html
1366
+ .replace(/&/g, '&amp;')
1367
+ .replace(/</g, '&lt;')
1368
+ .replace(/>/g, '&gt;')
1369
+ .replace(/"/g, '&quot;')
1370
+ .replace(/'/g, '&#039;');
1371
+ // Convert line breaks to <br>
1372
+ html = html.replace(/\n/g, '<br>');
1373
+ return html;
1374
+ }
1375
+
1376
  // Escape HTML special characters first (except for already parsed markdown)
1377
  // Skip this if content looks like it's already HTML
1378
  if (!html.includes('<table') && !html.includes('<div')) {
table.md ADDED
The diff for this file is too large to render. See raw diff