Spaces:
Running
Running
jashdoshi77
commited on
Commit
·
abc646e
1
Parent(s):
60ff586
Update UI styling and message formatting improvements
Browse files- check_chroma_data.py +39 -0
- check_users.py +30 -0
- clear_and_remigrate.py +355 -0
- clear_old_migration.py +25 -0
- find_buckets.py +34 -21
- migrate_table_to_chroma.py +352 -0
- services/chroma_service.py +13 -15
- services/rag_service.py +604 -37
- static/css/styles.css +126 -42
- static/js/app.js +17 -2
- table.md +0 -0
check_chroma_data.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quick script to check what's stored in ChromaDB metadata collection."""
|
| 2 |
+
|
| 3 |
+
from services.chroma_service import ChromaService
|
| 4 |
+
|
| 5 |
+
def check_metadata():
|
| 6 |
+
print("Connecting to ChromaDB...")
|
| 7 |
+
c = ChromaService()
|
| 8 |
+
|
| 9 |
+
data = c.metadata_collection.get()
|
| 10 |
+
total = len(data['ids'])
|
| 11 |
+
print(f"Total entries in document_metadata: {total}")
|
| 12 |
+
|
| 13 |
+
if total > 0:
|
| 14 |
+
print("\n--- Sample entry (first) ---")
|
| 15 |
+
meta = data['metadatas'][0]
|
| 16 |
+
for key, value in sorted(meta.items()):
|
| 17 |
+
print(f" {key}: {value}")
|
| 18 |
+
|
| 19 |
+
# Find Feb 2026 renewals
|
| 20 |
+
print("\n--- Entries with Feb 2026 renewal ---")
|
| 21 |
+
feb_count = 0
|
| 22 |
+
for meta in data['metadatas']:
|
| 23 |
+
rd = str(meta.get('renewal_date', ''))
|
| 24 |
+
ry = meta.get('renewal_year', 0)
|
| 25 |
+
# Check for Feb 2026
|
| 26 |
+
if ry == 2026 and '-02-' in rd:
|
| 27 |
+
feb_count += 1
|
| 28 |
+
print(f" {meta.get('document_title')}: renewal_date={rd}, renewal_year={ry}")
|
| 29 |
+
|
| 30 |
+
print(f"\nTotal Feb 2026 renewals found: {feb_count}")
|
| 31 |
+
|
| 32 |
+
# Show all unique renewal years
|
| 33 |
+
years = set(meta.get('renewal_year', 0) for meta in data['metadatas'])
|
| 34 |
+
print(f"\nAll renewal years in data: {sorted(years)}")
|
| 35 |
+
else:
|
| 36 |
+
print("No data found!")
|
| 37 |
+
|
| 38 |
+
if __name__ == "__main__":
|
| 39 |
+
check_metadata()
|
check_users.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Check what users exist in the system."""
|
| 2 |
+
|
| 3 |
+
from services.chroma_service import ChromaService
|
| 4 |
+
|
| 5 |
+
def check_users():
|
| 6 |
+
print("Connecting to ChromaDB...")
|
| 7 |
+
c = ChromaService()
|
| 8 |
+
|
| 9 |
+
# Get all users
|
| 10 |
+
users = c.users_collection.get()
|
| 11 |
+
print(f"\nTotal users: {len(users['ids'])}")
|
| 12 |
+
|
| 13 |
+
for i, uid in enumerate(users['ids']):
|
| 14 |
+
meta = users['metadatas'][i]
|
| 15 |
+
print(f" User ID: {uid}")
|
| 16 |
+
print(f" Username: {meta.get('username')}")
|
| 17 |
+
print(f" Role: {meta.get('role')}")
|
| 18 |
+
print()
|
| 19 |
+
|
| 20 |
+
# Check buckets for first user
|
| 21 |
+
if users['ids']:
|
| 22 |
+
first_user = users['ids'][0]
|
| 23 |
+
buckets = c.buckets_collection.get(where={"user_id": first_user})
|
| 24 |
+
print(f"\nBuckets for user {first_user}:")
|
| 25 |
+
for i, bid in enumerate(buckets['ids']):
|
| 26 |
+
print(f" Bucket ID: {bid}")
|
| 27 |
+
print(f" Name: {buckets['metadatas'][i].get('name')}")
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
|
| 30 |
+
check_users()
|
clear_and_remigrate.py
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Script to clear document_metadata collection and re-run migration
|
| 3 |
+
with a different user_id and no bucket_id
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
import hashlib
|
| 8 |
+
import time
|
| 9 |
+
from services.chroma_service import ChromaService
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def parse_markdown_table(filepath: str) -> tuple[list[str], list[dict]]:
|
| 13 |
+
"""
|
| 14 |
+
Parse the markdown table from table.md and extract all rows.
|
| 15 |
+
|
| 16 |
+
Returns:
|
| 17 |
+
- headers: list of column names
|
| 18 |
+
- rows: list of dicts with column names as keys
|
| 19 |
+
"""
|
| 20 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 21 |
+
content = f.read()
|
| 22 |
+
|
| 23 |
+
lines = content.strip().split('\n')
|
| 24 |
+
|
| 25 |
+
# Parse header row (line 1)
|
| 26 |
+
header_line = lines[0]
|
| 27 |
+
# Extract column names from header
|
| 28 |
+
raw_headers = [h.strip().replace('\t', '').strip() for h in header_line.split('|')]
|
| 29 |
+
headers = [h for h in raw_headers if h and h != ':--------:']
|
| 30 |
+
|
| 31 |
+
print(f"Found {len(headers)} columns in header:")
|
| 32 |
+
for i, h in enumerate(headers):
|
| 33 |
+
print(f" {i+1}. {h}")
|
| 34 |
+
|
| 35 |
+
rows = []
|
| 36 |
+
|
| 37 |
+
# Parse data rows (skip header and separator lines)
|
| 38 |
+
for line_num, line in enumerate(lines[2:], start=3):
|
| 39 |
+
if not line.strip():
|
| 40 |
+
continue
|
| 41 |
+
|
| 42 |
+
# Split by pipe and clean values
|
| 43 |
+
raw_cells = [c.strip().replace('\t', '').strip() for c in line.split('|')]
|
| 44 |
+
cells = [c for c in raw_cells if c != '']
|
| 45 |
+
|
| 46 |
+
if not cells:
|
| 47 |
+
continue
|
| 48 |
+
|
| 49 |
+
# Create dict with header keys and cell values
|
| 50 |
+
row = {}
|
| 51 |
+
for i, header in enumerate(headers):
|
| 52 |
+
if i < len(cells):
|
| 53 |
+
value = cells[i]
|
| 54 |
+
# Clean up special characters and artifacts
|
| 55 |
+
if value == "..'" or value == "..'":
|
| 56 |
+
value = ""
|
| 57 |
+
row[header] = value
|
| 58 |
+
else:
|
| 59 |
+
row[header] = ""
|
| 60 |
+
|
| 61 |
+
rows.append(row)
|
| 62 |
+
|
| 63 |
+
print(f"\nParsed {len(rows)} data rows from table")
|
| 64 |
+
return headers, rows
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def parse_date_to_iso(date_str: str) -> str:
|
| 68 |
+
"""
|
| 69 |
+
Convert date from DD-MM-YYYY format to YYYY-MM-DD format.
|
| 70 |
+
Returns empty string if parsing fails.
|
| 71 |
+
"""
|
| 72 |
+
if not date_str or date_str in ["..'" , "..'", ""]:
|
| 73 |
+
return ""
|
| 74 |
+
|
| 75 |
+
date_str = date_str.strip()
|
| 76 |
+
|
| 77 |
+
# Handle DD-MM-YYYY format
|
| 78 |
+
match = re.match(r'(\d{1,2})-(\d{1,2})-(\d{4})', date_str)
|
| 79 |
+
if match:
|
| 80 |
+
day, month, year = match.groups()
|
| 81 |
+
return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
|
| 82 |
+
|
| 83 |
+
return date_str
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def extract_year_from_date(date_str: str) -> int:
|
| 87 |
+
"""Extract year from a date string."""
|
| 88 |
+
if not date_str:
|
| 89 |
+
return 0
|
| 90 |
+
|
| 91 |
+
# Try to find a 4-digit year
|
| 92 |
+
match = re.search(r'(\d{4})', date_str)
|
| 93 |
+
if match:
|
| 94 |
+
return int(match.group(1))
|
| 95 |
+
|
| 96 |
+
return 0
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def parse_premium(value: str) -> float:
|
| 100 |
+
"""Parse premium value to float."""
|
| 101 |
+
if not value or value in ["..'" , "..'", ""]:
|
| 102 |
+
return 0.0
|
| 103 |
+
|
| 104 |
+
try:
|
| 105 |
+
# Remove commas and currency symbols
|
| 106 |
+
cleaned = re.sub(r'[^\d.]', '', str(value).replace(',', ''))
|
| 107 |
+
if cleaned:
|
| 108 |
+
return float(cleaned)
|
| 109 |
+
except:
|
| 110 |
+
pass
|
| 111 |
+
|
| 112 |
+
return 0.0
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def parse_int_value(value: str) -> int:
|
| 116 |
+
"""Parse integer value."""
|
| 117 |
+
if not value or value in ["..'" , "..'", ""]:
|
| 118 |
+
return 0
|
| 119 |
+
|
| 120 |
+
try:
|
| 121 |
+
cleaned = re.sub(r'[^\d]', '', str(value))
|
| 122 |
+
if cleaned:
|
| 123 |
+
return int(cleaned)
|
| 124 |
+
except:
|
| 125 |
+
pass
|
| 126 |
+
|
| 127 |
+
return 0
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def generate_doc_id(row: dict) -> str:
|
| 131 |
+
"""Generate a unique document ID based on row data."""
|
| 132 |
+
pl_number = row.get('PL/EN #', row.get('PL/EN', ''))
|
| 133 |
+
policy_number = row.get('Policy Number', '')
|
| 134 |
+
client_name = row.get('Client Name', '')
|
| 135 |
+
|
| 136 |
+
unique_string = f"{pl_number}_{policy_number}_{client_name}"
|
| 137 |
+
return hashlib.sha256(unique_string.encode()).hexdigest()[:16]
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def convert_row_to_metadata(row: dict, all_headers: list[str]) -> dict:
|
| 141 |
+
"""
|
| 142 |
+
Convert a parsed table row into the metadata format expected by ChromaDB.
|
| 143 |
+
|
| 144 |
+
ALL columns from the table are preserved.
|
| 145 |
+
"""
|
| 146 |
+
# Parse dates
|
| 147 |
+
policy_start = parse_date_to_iso(row.get('Policy Start Date', ''))
|
| 148 |
+
policy_end = parse_date_to_iso(row.get('Policy End Date', ''))
|
| 149 |
+
|
| 150 |
+
# Use policy_end_date as renewal_date
|
| 151 |
+
renewal_date = policy_end
|
| 152 |
+
renewal_year = extract_year_from_date(policy_end)
|
| 153 |
+
|
| 154 |
+
# Parse premium amounts
|
| 155 |
+
premium_paid = parse_premium(row.get('Premium Paid', ''))
|
| 156 |
+
gross_premium = parse_premium(row.get('Gross Premium', ''))
|
| 157 |
+
net_premium = parse_premium(row.get('Premium', ''))
|
| 158 |
+
|
| 159 |
+
# Use the best available premium value
|
| 160 |
+
premium_amount = premium_paid or gross_premium or net_premium
|
| 161 |
+
|
| 162 |
+
# Build metadata dict matching ChromaDB structure (required fields)
|
| 163 |
+
metadata = {
|
| 164 |
+
# Basic document info
|
| 165 |
+
"document_type": str(row.get('Type', 'Policy')),
|
| 166 |
+
"document_title": str(row.get('PL/EN #', '')),
|
| 167 |
+
|
| 168 |
+
# Policy details
|
| 169 |
+
"policy_number": str(row.get('Policy Number', '')),
|
| 170 |
+
"policy_type": str(row.get('Policy', '')),
|
| 171 |
+
|
| 172 |
+
# Parties involved
|
| 173 |
+
"insurer_name": str(row.get('Insurer', '')),
|
| 174 |
+
"insured_name": str(row.get('Insured Name', row.get('Client Name', ''))),
|
| 175 |
+
"broker_name": "",
|
| 176 |
+
|
| 177 |
+
# Financial
|
| 178 |
+
"sum_insured": 0.0,
|
| 179 |
+
"premium_amount": premium_amount,
|
| 180 |
+
|
| 181 |
+
# Dates
|
| 182 |
+
"policy_start_date": policy_start,
|
| 183 |
+
"policy_end_date": policy_end,
|
| 184 |
+
"renewal_date": renewal_date,
|
| 185 |
+
"renewal_year": renewal_year,
|
| 186 |
+
|
| 187 |
+
# Location
|
| 188 |
+
"city": "",
|
| 189 |
+
"state": "",
|
| 190 |
+
"pincode": "",
|
| 191 |
+
"property_address": "",
|
| 192 |
+
|
| 193 |
+
# Classification
|
| 194 |
+
"industry": "",
|
| 195 |
+
"is_manufacturing": False,
|
| 196 |
+
|
| 197 |
+
# Arrays stored as JSON
|
| 198 |
+
"coverage_type": [],
|
| 199 |
+
"keywords": [],
|
| 200 |
+
|
| 201 |
+
# Tracking
|
| 202 |
+
"created_at": time.time()
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
# Now add ALL columns from the table as additional metadata
|
| 206 |
+
# These will be stored as extra_* fields
|
| 207 |
+
for header in all_headers:
|
| 208 |
+
# Create a safe key name
|
| 209 |
+
safe_key = header.replace(' ', '_').replace('.', '').replace('#', 'num')
|
| 210 |
+
safe_key = re.sub(r'[^a-zA-Z0-9_]', '', safe_key)
|
| 211 |
+
safe_key = safe_key.lower()
|
| 212 |
+
|
| 213 |
+
# Skip if already mapped above
|
| 214 |
+
if safe_key in ['policy_number', 'policy', 'insurer', 'insured_name',
|
| 215 |
+
'client_name', 'premium_paid', 'gross_premium', 'premium',
|
| 216 |
+
'policy_start_date', 'policy_end_date', 'type', 'plen_num']:
|
| 217 |
+
continue
|
| 218 |
+
|
| 219 |
+
value = row.get(header, '')
|
| 220 |
+
|
| 221 |
+
# Store the raw value as a string (ChromaDB requires primitive types)
|
| 222 |
+
if value and value not in ["..'" , "..'", ""]:
|
| 223 |
+
metadata[f"col_{safe_key}"] = str(value)[:500] # Limit string length
|
| 224 |
+
|
| 225 |
+
return metadata
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def print_progress_bar(current: int, total: int, bar_length: int = 50):
|
| 229 |
+
"""Print a progress bar using ASCII characters."""
|
| 230 |
+
percent = current / total
|
| 231 |
+
filled = int(bar_length * percent)
|
| 232 |
+
bar = '#' * filled + '-' * (bar_length - filled)
|
| 233 |
+
print(f'\r Progress: |{bar}| {current}/{total} ({percent*100:.1f}%)', end='', flush=True)
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def clear_and_remigrate():
|
| 237 |
+
"""
|
| 238 |
+
Clear all data from document_metadata collection and re-run migration
|
| 239 |
+
with new user_id and no bucket_id.
|
| 240 |
+
"""
|
| 241 |
+
print("=" * 60)
|
| 242 |
+
print("Clearing document_metadata and re-running migration")
|
| 243 |
+
print("=" * 60)
|
| 244 |
+
|
| 245 |
+
# Initialize ChromaDB service
|
| 246 |
+
print("\n[1] Connecting to ChromaDB...")
|
| 247 |
+
chroma = ChromaService()
|
| 248 |
+
|
| 249 |
+
# Check current state
|
| 250 |
+
current_data = chroma.metadata_collection.get()
|
| 251 |
+
print(f"Current document_metadata collection has {len(current_data['ids'])} entries")
|
| 252 |
+
|
| 253 |
+
# Delete all data from document_metadata collection
|
| 254 |
+
print("\n[2] Deleting all data from document_metadata collection...")
|
| 255 |
+
if current_data['ids']:
|
| 256 |
+
chroma.metadata_collection.delete(ids=current_data['ids'])
|
| 257 |
+
print(f"Deleted {len(current_data['ids'])} entries")
|
| 258 |
+
else:
|
| 259 |
+
print("Collection was already empty")
|
| 260 |
+
|
| 261 |
+
# Verify deletion
|
| 262 |
+
verify_data = chroma.metadata_collection.get()
|
| 263 |
+
print(f"After deletion: {len(verify_data['ids'])} entries remain")
|
| 264 |
+
|
| 265 |
+
# Parse the markdown table
|
| 266 |
+
print("\n[3] Parsing table.md...")
|
| 267 |
+
table_path = "table.md"
|
| 268 |
+
headers, rows = parse_markdown_table(table_path)
|
| 269 |
+
|
| 270 |
+
if not rows:
|
| 271 |
+
print("ERROR: No data rows found in table.md")
|
| 272 |
+
return
|
| 273 |
+
|
| 274 |
+
print(f"\nSample row data (first row, first 10 columns):")
|
| 275 |
+
sample = rows[0]
|
| 276 |
+
for key in list(sample.keys())[:10]:
|
| 277 |
+
print(f" {key}: {sample[key][:50] if len(sample[key]) > 50 else sample[key]}")
|
| 278 |
+
|
| 279 |
+
# NEW: Use different user_id (Nishant) and NO bucket_id
|
| 280 |
+
user_id = "55c0893720ef38eb" # Nishant's user ID
|
| 281 |
+
bucket_id = "" # No bucket ID specified
|
| 282 |
+
|
| 283 |
+
print(f"\n[4] Migrating {len(rows)} rows to ChromaDB...")
|
| 284 |
+
print(f"Using user_id: {user_id} (Nishant)")
|
| 285 |
+
print(f"Using bucket_id: (empty - no bucket specified)")
|
| 286 |
+
print(f"Batch size: 10 entries")
|
| 287 |
+
print()
|
| 288 |
+
|
| 289 |
+
success_count = 0
|
| 290 |
+
error_count = 0
|
| 291 |
+
|
| 292 |
+
BATCH_SIZE = 10
|
| 293 |
+
total_batches = (len(rows) + BATCH_SIZE - 1) // BATCH_SIZE
|
| 294 |
+
|
| 295 |
+
for batch_num in range(total_batches):
|
| 296 |
+
start_idx = batch_num * BATCH_SIZE
|
| 297 |
+
end_idx = min(start_idx + BATCH_SIZE, len(rows))
|
| 298 |
+
batch_rows = rows[start_idx:end_idx]
|
| 299 |
+
|
| 300 |
+
for row in batch_rows:
|
| 301 |
+
try:
|
| 302 |
+
# Generate unique doc_id
|
| 303 |
+
doc_id = generate_doc_id(row)
|
| 304 |
+
|
| 305 |
+
# Convert row to metadata format (include all headers)
|
| 306 |
+
metadata = convert_row_to_metadata(row, headers)
|
| 307 |
+
|
| 308 |
+
# Store in ChromaDB
|
| 309 |
+
result = chroma.store_document_metadata(
|
| 310 |
+
doc_id=doc_id,
|
| 311 |
+
user_id=user_id,
|
| 312 |
+
bucket_id=bucket_id,
|
| 313 |
+
metadata=metadata
|
| 314 |
+
)
|
| 315 |
+
|
| 316 |
+
if result.get("status") == "stored":
|
| 317 |
+
success_count += 1
|
| 318 |
+
else:
|
| 319 |
+
error_count += 1
|
| 320 |
+
|
| 321 |
+
except Exception as e:
|
| 322 |
+
print(f"\n Error on row: {str(e)[:100]}")
|
| 323 |
+
error_count += 1
|
| 324 |
+
|
| 325 |
+
# Update progress bar
|
| 326 |
+
print_progress_bar(end_idx, len(rows))
|
| 327 |
+
|
| 328 |
+
# Small delay between batches to not overload the API
|
| 329 |
+
if batch_num < total_batches - 1:
|
| 330 |
+
time.sleep(0.1)
|
| 331 |
+
|
| 332 |
+
print() # New line after progress bar
|
| 333 |
+
|
| 334 |
+
# Final status
|
| 335 |
+
print("\n" + "=" * 60)
|
| 336 |
+
print("Migration Complete!")
|
| 337 |
+
print("=" * 60)
|
| 338 |
+
print(f"Successfully migrated: {success_count} entries")
|
| 339 |
+
print(f"Errors: {error_count}")
|
| 340 |
+
|
| 341 |
+
# Verify final state
|
| 342 |
+
final_data = chroma.metadata_collection.get()
|
| 343 |
+
print(f"\nFinal document_metadata collection has {len(final_data['ids'])} entries")
|
| 344 |
+
|
| 345 |
+
# Show sample of stored data
|
| 346 |
+
if final_data['ids']:
|
| 347 |
+
print("\nSample stored metadata (first entry):")
|
| 348 |
+
sample_meta = final_data['metadatas'][0]
|
| 349 |
+
for key, value in list(sample_meta.items())[:15]:
|
| 350 |
+
print(f" {key}: {value}")
|
| 351 |
+
print(f" ... and {len(sample_meta) - 15} more fields")
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
if __name__ == "__main__":
|
| 355 |
+
clear_and_remigrate()
|
clear_old_migration.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Clear all entries with wrong user_id from document_metadata collection."""
|
| 2 |
+
|
| 3 |
+
from services.chroma_service import ChromaService
|
| 4 |
+
|
| 5 |
+
def clear_old_data():
|
| 6 |
+
print("Connecting to ChromaDB...")
|
| 7 |
+
c = ChromaService()
|
| 8 |
+
|
| 9 |
+
# Get all entries with the wrong user_id
|
| 10 |
+
data = c.metadata_collection.get(where={"user_id": "jashdoshi"})
|
| 11 |
+
count = len(data['ids'])
|
| 12 |
+
|
| 13 |
+
print(f"Found {count} entries with user_id='jashdoshi' (old migration)")
|
| 14 |
+
|
| 15 |
+
if count > 0 and data['ids']:
|
| 16 |
+
print("Deleting these entries...")
|
| 17 |
+
c.metadata_collection.delete(ids=data['ids'])
|
| 18 |
+
print(f"Deleted {count} entries.")
|
| 19 |
+
|
| 20 |
+
# Verify
|
| 21 |
+
remaining = c.metadata_collection.get()
|
| 22 |
+
print(f"Remaining entries in collection: {len(remaining['ids'])}")
|
| 23 |
+
|
| 24 |
+
if __name__ == "__main__":
|
| 25 |
+
clear_old_data()
|
find_buckets.py
CHANGED
|
@@ -1,25 +1,38 @@
|
|
| 1 |
-
"""
|
| 2 |
-
import sys
|
| 3 |
-
sys.path.insert(0, '.')
|
| 4 |
|
| 5 |
-
from services.chroma_service import
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Check buckets for all users."""
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from services.chroma_service import ChromaService
|
| 4 |
|
| 5 |
+
def check_all_buckets():
|
| 6 |
+
print("Connecting to ChromaDB...")
|
| 7 |
+
c = ChromaService()
|
| 8 |
+
|
| 9 |
+
# Get all users
|
| 10 |
+
users = c.users_collection.get()
|
| 11 |
+
print(f"\nTotal users: {len(users['ids'])}")
|
| 12 |
|
| 13 |
+
for i, uid in enumerate(users['ids']):
|
| 14 |
+
meta = users['metadatas'][i]
|
| 15 |
+
print(f"\n=== User: {meta.get('username')} (ID: {uid}) ===")
|
| 16 |
+
|
| 17 |
+
# Get buckets for this user
|
| 18 |
+
buckets = c.buckets_collection.get(where={"user_id": uid})
|
| 19 |
+
print(f"Buckets: {len(buckets['ids'])}")
|
| 20 |
+
|
| 21 |
+
for j, bid in enumerate(buckets['ids']):
|
| 22 |
+
print(f" - {buckets['metadatas'][j].get('name')} (ID: {bid})")
|
| 23 |
|
| 24 |
+
# Also check what the metadata is stored with
|
| 25 |
+
print("\n=== Metadata collection user/bucket combos ===")
|
| 26 |
+
data = c.metadata_collection.get()
|
| 27 |
+
if data['ids']:
|
| 28 |
+
# Get unique user_id/bucket_id combinations
|
| 29 |
+
combos = set()
|
| 30 |
+
for meta in data['metadatas']:
|
| 31 |
+
combos.add((meta.get('user_id'), meta.get('bucket_id')))
|
| 32 |
+
|
| 33 |
+
for user_id, bucket_id in combos:
|
| 34 |
+
count = sum(1 for m in data['metadatas'] if m.get('user_id') == user_id and m.get('bucket_id') == bucket_id)
|
| 35 |
+
print(f" user_id={user_id}, bucket_id={bucket_id}: {count} entries")
|
| 36 |
+
|
| 37 |
+
if __name__ == "__main__":
|
| 38 |
+
check_all_buckets()
|
migrate_table_to_chroma.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Migration script to populate document_metadata collection from table.md
|
| 3 |
+
|
| 4 |
+
This script reads the markdown table and migrates data to ChromaDB's document_metadata
|
| 5 |
+
collection. No AI or API keys are used - just straightforward data parsing.
|
| 6 |
+
|
| 7 |
+
All columns from the table are preserved in the metadata.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
import hashlib
|
| 12 |
+
import time
|
| 13 |
+
import json
|
| 14 |
+
from services.chroma_service import ChromaService
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def parse_markdown_table(filepath: str) -> tuple[list[str], list[dict]]:
|
| 18 |
+
"""
|
| 19 |
+
Parse the markdown table from table.md and extract all rows.
|
| 20 |
+
|
| 21 |
+
Returns:
|
| 22 |
+
- headers: list of column names
|
| 23 |
+
- rows: list of dicts with column names as keys
|
| 24 |
+
"""
|
| 25 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 26 |
+
content = f.read()
|
| 27 |
+
|
| 28 |
+
lines = content.strip().split('\n')
|
| 29 |
+
|
| 30 |
+
# Parse header row (line 1)
|
| 31 |
+
header_line = lines[0]
|
| 32 |
+
# Extract column names from header
|
| 33 |
+
raw_headers = [h.strip().replace('\t', '').strip() for h in header_line.split('|')]
|
| 34 |
+
headers = [h for h in raw_headers if h and h != ':--------:']
|
| 35 |
+
|
| 36 |
+
print(f"Found {len(headers)} columns in header:")
|
| 37 |
+
for i, h in enumerate(headers):
|
| 38 |
+
print(f" {i+1}. {h}")
|
| 39 |
+
|
| 40 |
+
rows = []
|
| 41 |
+
|
| 42 |
+
# Parse data rows (skip header and separator lines)
|
| 43 |
+
for line_num, line in enumerate(lines[2:], start=3):
|
| 44 |
+
if not line.strip():
|
| 45 |
+
continue
|
| 46 |
+
|
| 47 |
+
# Split by pipe and clean values
|
| 48 |
+
raw_cells = [c.strip().replace('\t', '').strip() for c in line.split('|')]
|
| 49 |
+
cells = [c for c in raw_cells if c != '']
|
| 50 |
+
|
| 51 |
+
if not cells:
|
| 52 |
+
continue
|
| 53 |
+
|
| 54 |
+
# Create dict with header keys and cell values
|
| 55 |
+
row = {}
|
| 56 |
+
for i, header in enumerate(headers):
|
| 57 |
+
if i < len(cells):
|
| 58 |
+
value = cells[i]
|
| 59 |
+
# Clean up special characters and artifacts
|
| 60 |
+
if value == "..'" or value == "..'":
|
| 61 |
+
value = ""
|
| 62 |
+
row[header] = value
|
| 63 |
+
else:
|
| 64 |
+
row[header] = ""
|
| 65 |
+
|
| 66 |
+
rows.append(row)
|
| 67 |
+
|
| 68 |
+
print(f"\nParsed {len(rows)} data rows from table")
|
| 69 |
+
return headers, rows
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def parse_date_to_iso(date_str: str) -> str:
|
| 73 |
+
"""
|
| 74 |
+
Convert date from DD-MM-YYYY format to YYYY-MM-DD format.
|
| 75 |
+
Returns empty string if parsing fails.
|
| 76 |
+
"""
|
| 77 |
+
if not date_str or date_str in ["..'" , "..'", ""]:
|
| 78 |
+
return ""
|
| 79 |
+
|
| 80 |
+
date_str = date_str.strip()
|
| 81 |
+
|
| 82 |
+
# Handle DD-MM-YYYY format
|
| 83 |
+
match = re.match(r'(\d{1,2})-(\d{1,2})-(\d{4})', date_str)
|
| 84 |
+
if match:
|
| 85 |
+
day, month, year = match.groups()
|
| 86 |
+
return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
|
| 87 |
+
|
| 88 |
+
return date_str
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def extract_year_from_date(date_str: str) -> int:
|
| 92 |
+
"""Extract year from a date string."""
|
| 93 |
+
if not date_str:
|
| 94 |
+
return 0
|
| 95 |
+
|
| 96 |
+
# Try to find a 4-digit year
|
| 97 |
+
match = re.search(r'(\d{4})', date_str)
|
| 98 |
+
if match:
|
| 99 |
+
return int(match.group(1))
|
| 100 |
+
|
| 101 |
+
return 0
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def parse_premium(value: str) -> float:
|
| 105 |
+
"""Parse premium value to float."""
|
| 106 |
+
if not value or value in ["..'" , "..'", ""]:
|
| 107 |
+
return 0.0
|
| 108 |
+
|
| 109 |
+
try:
|
| 110 |
+
# Remove commas and currency symbols
|
| 111 |
+
cleaned = re.sub(r'[^\d.]', '', str(value).replace(',', ''))
|
| 112 |
+
if cleaned:
|
| 113 |
+
return float(cleaned)
|
| 114 |
+
except:
|
| 115 |
+
pass
|
| 116 |
+
|
| 117 |
+
return 0.0
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def parse_int_value(value: str) -> int:
|
| 121 |
+
"""Parse integer value."""
|
| 122 |
+
if not value or value in ["..'" , "..'", ""]:
|
| 123 |
+
return 0
|
| 124 |
+
|
| 125 |
+
try:
|
| 126 |
+
cleaned = re.sub(r'[^\d]', '', str(value))
|
| 127 |
+
if cleaned:
|
| 128 |
+
return int(cleaned)
|
| 129 |
+
except:
|
| 130 |
+
pass
|
| 131 |
+
|
| 132 |
+
return 0
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def generate_doc_id(row: dict) -> str:
|
| 136 |
+
"""Generate a unique document ID based on row data."""
|
| 137 |
+
pl_number = row.get('PL/EN #', row.get('PL/EN', ''))
|
| 138 |
+
policy_number = row.get('Policy Number', '')
|
| 139 |
+
client_name = row.get('Client Name', '')
|
| 140 |
+
|
| 141 |
+
unique_string = f"{pl_number}_{policy_number}_{client_name}"
|
| 142 |
+
return hashlib.sha256(unique_string.encode()).hexdigest()[:16]
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def convert_row_to_metadata(row: dict, all_headers: list[str]) -> dict:
|
| 146 |
+
"""
|
| 147 |
+
Convert a parsed table row into the metadata format expected by ChromaDB.
|
| 148 |
+
|
| 149 |
+
ALL columns from the table are preserved.
|
| 150 |
+
"""
|
| 151 |
+
# Parse dates
|
| 152 |
+
policy_start = parse_date_to_iso(row.get('Policy Start Date', ''))
|
| 153 |
+
policy_end = parse_date_to_iso(row.get('Policy End Date', ''))
|
| 154 |
+
|
| 155 |
+
# Use policy_end_date as renewal_date
|
| 156 |
+
renewal_date = policy_end
|
| 157 |
+
renewal_year = extract_year_from_date(policy_end)
|
| 158 |
+
|
| 159 |
+
# Parse premium amounts
|
| 160 |
+
premium_paid = parse_premium(row.get('Premium Paid', ''))
|
| 161 |
+
gross_premium = parse_premium(row.get('Gross Premium', ''))
|
| 162 |
+
net_premium = parse_premium(row.get('Premium', ''))
|
| 163 |
+
|
| 164 |
+
# Use the best available premium value
|
| 165 |
+
premium_amount = premium_paid or gross_premium or net_premium
|
| 166 |
+
|
| 167 |
+
# Build metadata dict matching ChromaDB structure (required fields)
|
| 168 |
+
metadata = {
|
| 169 |
+
# Basic document info
|
| 170 |
+
"document_type": str(row.get('Type', 'Policy')),
|
| 171 |
+
"document_title": str(row.get('PL/EN #', '')),
|
| 172 |
+
|
| 173 |
+
# Policy details
|
| 174 |
+
"policy_number": str(row.get('Policy Number', '')),
|
| 175 |
+
"policy_type": str(row.get('Policy', '')),
|
| 176 |
+
|
| 177 |
+
# Parties involved
|
| 178 |
+
"insurer_name": str(row.get('Insurer', '')),
|
| 179 |
+
"insured_name": str(row.get('Insured Name', row.get('Client Name', ''))),
|
| 180 |
+
"broker_name": "",
|
| 181 |
+
|
| 182 |
+
# Financial
|
| 183 |
+
"sum_insured": 0.0,
|
| 184 |
+
"premium_amount": premium_amount,
|
| 185 |
+
|
| 186 |
+
# Dates
|
| 187 |
+
"policy_start_date": policy_start,
|
| 188 |
+
"policy_end_date": policy_end,
|
| 189 |
+
"renewal_date": renewal_date,
|
| 190 |
+
"renewal_year": renewal_year,
|
| 191 |
+
|
| 192 |
+
# Location
|
| 193 |
+
"city": "",
|
| 194 |
+
"state": "",
|
| 195 |
+
"pincode": "",
|
| 196 |
+
"property_address": "",
|
| 197 |
+
|
| 198 |
+
# Classification
|
| 199 |
+
"industry": "",
|
| 200 |
+
"is_manufacturing": False,
|
| 201 |
+
|
| 202 |
+
# Arrays stored as JSON
|
| 203 |
+
"coverage_type": [],
|
| 204 |
+
"keywords": [],
|
| 205 |
+
|
| 206 |
+
# Tracking
|
| 207 |
+
"created_at": time.time()
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
# Now add ALL columns from the table as additional metadata
|
| 211 |
+
# These will be stored as extra_* fields
|
| 212 |
+
for header in all_headers:
|
| 213 |
+
# Create a safe key name
|
| 214 |
+
safe_key = header.replace(' ', '_').replace('.', '').replace('#', 'num')
|
| 215 |
+
safe_key = re.sub(r'[^a-zA-Z0-9_]', '', safe_key)
|
| 216 |
+
safe_key = safe_key.lower()
|
| 217 |
+
|
| 218 |
+
# Skip if already mapped above
|
| 219 |
+
if safe_key in ['policy_number', 'policy', 'insurer', 'insured_name',
|
| 220 |
+
'client_name', 'premium_paid', 'gross_premium', 'premium',
|
| 221 |
+
'policy_start_date', 'policy_end_date', 'type', 'plen_num']:
|
| 222 |
+
continue
|
| 223 |
+
|
| 224 |
+
value = row.get(header, '')
|
| 225 |
+
|
| 226 |
+
# Store the raw value as a string (ChromaDB requires primitive types)
|
| 227 |
+
if value and value not in ["..'" , "..'", ""]:
|
| 228 |
+
metadata[f"col_{safe_key}"] = str(value)[:500] # Limit string length
|
| 229 |
+
|
| 230 |
+
return metadata
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def print_progress_bar(current: int, total: int, bar_length: int = 50):
    """Print an in-place ASCII progress bar to stdout.

    Uses a carriage return (no newline) so repeated calls overwrite the
    same terminal line; callers should print() once after the loop ends.

    Args:
        current: Number of items processed so far.
        total: Total number of items. A value of 0 is treated as 100%
            complete instead of raising ZeroDivisionError.
        bar_length: Width of the bar in characters.
    """
    # Guard against division by zero when there is nothing to migrate.
    percent = current / total if total else 1.0
    filled = int(bar_length * percent)
    bar = '#' * filled + '-' * (bar_length - filled)
    print(f'\r Progress: |{bar}| {current}/{total} ({percent*100:.1f}%)', end='', flush=True)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def migrate_table_to_chroma():
    """
    Main migration function.

    Reads table.md, parses all rows, and inserts them into ChromaDB in batches.

    Side effects:
        - Writes one metadata entry per table row into the ChromaDB
          `document_metadata` collection via ChromaService.
        - Prints progress and a final summary to stdout.

    All entries are written under a single hard-coded user_id/bucket_id
    (see comments below); adjust those before running for other accounts.
    """
    print("=" * 60)
    print("Starting migration from table.md to ChromaDB")
    print("=" * 60)

    # Initialize ChromaDB service
    print("\n[1] Connecting to ChromaDB...")
    chroma = ChromaService()

    # Check current state of the metadata collection
    current_data = chroma.metadata_collection.get()
    print(f"Current document_metadata collection has {len(current_data['ids'])} entries")

    # Parse the markdown table
    print("\n[2] Parsing table.md...")
    table_path = "table.md"
    headers, rows = parse_markdown_table(table_path)

    if not rows:
        print("ERROR: No data rows found in table.md")
        return

    print(f"\nSample row data (first row, first 10 columns):")
    sample = rows[0]
    for key in list(sample.keys())[:10]:
        print(f"  {key}: {sample[key][:50] if len(sample[key]) > 50 else sample[key]}")

    # Fixed user_id and bucket_id for all entries
    # IMPORTANT: These should match the actual hashed IDs in the system
    # Check with: python check_users.py
    # User "jash" has ID: 7ac2ed69d52d2010
    # Bucket "2025 policy sibro" has ID: ee449d7c04e92039
    user_id = "7ac2ed69d52d2010"   # jash's user ID
    bucket_id = "ee449d7c04e92039"  # 2025 policy sibro bucket

    # BATCH_SIZE is defined before any message that mentions it so the
    # printed value can never drift out of sync with the actual setting.
    BATCH_SIZE = 10

    print(f"\n[3] Migrating {len(rows)} rows to ChromaDB...")
    print(f"Using user_id: {user_id}, bucket_id: {bucket_id}")
    print(f"Batch size: {BATCH_SIZE} entries")
    print()

    success_count = 0
    error_count = 0

    # Ceiling division: the last batch may be smaller than BATCH_SIZE.
    total_batches = (len(rows) + BATCH_SIZE - 1) // BATCH_SIZE

    for batch_num in range(total_batches):
        start_idx = batch_num * BATCH_SIZE
        end_idx = min(start_idx + BATCH_SIZE, len(rows))
        batch_rows = rows[start_idx:end_idx]

        for row in batch_rows:
            try:
                # Generate unique doc_id
                doc_id = generate_doc_id(row)

                # Convert row to metadata format (include all headers)
                metadata = convert_row_to_metadata(row, headers)

                # Store in ChromaDB
                result = chroma.store_document_metadata(
                    doc_id=doc_id,
                    user_id=user_id,
                    bucket_id=bucket_id,
                    metadata=metadata
                )

                if result.get("status") == "stored":
                    success_count += 1
                else:
                    error_count += 1

            except Exception as e:
                # Best-effort migration: log the failure and keep going so a
                # single bad row cannot abort the whole run.
                print(f"\n  Error on row: {str(e)[:100]}")
                error_count += 1

        # Update progress bar
        print_progress_bar(end_idx, len(rows))

        # Small delay between batches to not overload the API
        if batch_num < total_batches - 1:
            time.sleep(0.1)

    print()  # New line after progress bar

    # Final status
    print("\n" + "=" * 60)
    print("Migration Complete!")
    print("=" * 60)
    print(f"Successfully migrated: {success_count} entries")
    print(f"Errors: {error_count}")

    # Verify final state
    final_data = chroma.metadata_collection.get()
    print(f"\nFinal document_metadata collection has {len(final_data['ids'])} entries")

    # Show sample of stored data
    if final_data['ids']:
        print("\nSample stored metadata (first entry):")
        sample_meta = final_data['metadatas'][0]
        for key, value in list(sample_meta.items())[:15]:
            print(f"  {key}: {value}")
        # Only mention remaining fields when there actually are more than 15;
        # the old unconditional print showed a negative count for small entries.
        if len(sample_meta) > 15:
            print(f"  ... and {len(sample_meta) - 15} more fields")
| 350 |
+
|
| 351 |
+
if __name__ == "__main__":
    # Entry point: run the one-off table.md -> ChromaDB migration script.
    migrate_table_to_chroma()
|
services/chroma_service.py
CHANGED
|
@@ -410,25 +410,23 @@ class ChromaService:
|
|
| 410 |
|
| 411 |
IMPORTANT: When bucket_id is provided, ONLY chunks from that bucket are returned.
|
| 412 |
This ensures strict bucket isolation for multi-bucket deployments.
|
|
|
|
| 413 |
"""
|
| 414 |
-
# Build where clause with strict bucket isolation
|
|
|
|
|
|
|
| 415 |
if bucket_id:
|
| 416 |
-
|
| 417 |
-
"$and": [
|
| 418 |
-
{"user_id": user_id},
|
| 419 |
-
{"bucket_id": bucket_id}
|
| 420 |
-
]
|
| 421 |
-
}
|
| 422 |
print(f"[CHROMA] Strict bucket isolation: searching only bucket '{bucket_id}'")
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
}
|
| 430 |
else:
|
| 431 |
-
where_clause =
|
| 432 |
|
| 433 |
results = self.chunks_collection.query(
|
| 434 |
query_texts=[query],
|
|
|
|
| 410 |
|
| 411 |
IMPORTANT: When bucket_id is provided, ONLY chunks from that bucket are returned.
|
| 412 |
This ensures strict bucket isolation for multi-bucket deployments.
|
| 413 |
+
When doc_ids is also provided, it filters to specific documents within the bucket.
|
| 414 |
"""
|
| 415 |
+
# Build where clause with strict bucket isolation and optional doc_id filtering
|
| 416 |
+
conditions = [{"user_id": user_id}]
|
| 417 |
+
|
| 418 |
if bucket_id:
|
| 419 |
+
conditions.append({"bucket_id": bucket_id})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
print(f"[CHROMA] Strict bucket isolation: searching only bucket '{bucket_id}'")
|
| 421 |
+
|
| 422 |
+
if doc_ids:
|
| 423 |
+
conditions.append({"doc_id": {"$in": doc_ids}})
|
| 424 |
+
print(f"[CHROMA] Filtering to {len(doc_ids)} specific documents")
|
| 425 |
+
|
| 426 |
+
if len(conditions) > 1:
|
| 427 |
+
where_clause = {"$and": conditions}
|
| 428 |
else:
|
| 429 |
+
where_clause = conditions[0]
|
| 430 |
|
| 431 |
results = self.chunks_collection.query(
|
| 432 |
query_texts=[query],
|
services/rag_service.py
CHANGED
|
@@ -234,6 +234,9 @@ CRITICAL RULES:
|
|
| 234 |
2. When multiple industries are mentioned (e.g., "manufacturing and healthcare"), combine them with comma: "manufacturing, healthcare"
|
| 235 |
3. When user asks for "top N" of something, set both limit AND sort_by appropriately
|
| 236 |
4. Keywords like "manufacturing", "healthcare", "retail", "IT", "construction" are INDUSTRIES - put them in filters
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
FORMAT DETECTION (NEW):
|
| 239 |
1. Detect if user explicitly asks for a specific format:
|
|
@@ -290,7 +293,22 @@ Query: "list all fire policies in bullet points"
|
|
| 290 |
{"intent":"list","needs_metadata":true,"filters":{"policy_type":"fire"},"sort_by":null,"sort_order":"desc","limit":null,"calculation":null,"calculation_field":null,"format_preference":"bullets","is_format_change":false}
|
| 291 |
|
| 292 |
Query: "top 5 health policies by sum insured as a table"
|
| 293 |
-
{"intent":"rank","needs_metadata":true,"filters":{"policy_type":"health"},"sort_by":"sum_insured","sort_order":"desc","limit":5,"calculation":null,"calculation_field":null,"format_preference":"table","is_format_change":false}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
messages = [
|
| 296 |
{"role": "system", "content": system_prompt},
|
|
@@ -457,25 +475,32 @@ Query: "top 5 health policies by sum insured as a table"
|
|
| 457 |
Returns markdown-compatible formatting guidance.
|
| 458 |
"""
|
| 459 |
format_map = {
|
| 460 |
-
"table": """FORMAT: Present data
|
| 461 |
-
|
| 462 |
-
-
|
| 463 |
-
|
|
|
|
|
|
|
|
|
|
| 464 |
|
| 465 |
-
"list": """FORMAT: Present as a numbered list.
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
|
|
|
|
|
|
| 469 |
|
| 470 |
-
"bullets": """FORMAT: Use bullet points.
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
-
|
|
|
|
|
|
|
| 474 |
|
| 475 |
"paragraph": """FORMAT: Write in flowing prose paragraphs.
|
| 476 |
- Use complete sentences and natural language
|
| 477 |
- Group related information into paragraphs
|
| 478 |
-
-
|
| 479 |
}
|
| 480 |
|
| 481 |
return format_map.get(format_preference, "")
|
|
@@ -928,9 +953,91 @@ Summary: {summary[:300] if summary else 'No summary available'}
|
|
| 928 |
|
| 929 |
elif field in ['city', 'state', 'insurer_name', 'insured_name', 'broker_name']:
|
| 930 |
# Handle comma-separated values (OR logic)
|
|
|
|
| 931 |
filter_values = [v.strip().lower() for v in str(value).split(',')]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 932 |
all_metadata = [m for m in all_metadata
|
| 933 |
-
if any(
|
| 934 |
print(f"[METADATA QUERY] Filtered by {field} {filter_values}: {len(all_metadata)} remaining")
|
| 935 |
|
| 936 |
elif field == 'renewal_year':
|
|
@@ -1041,8 +1148,8 @@ Summary: {summary[:300] if summary else 'No summary available'}
|
|
| 1041 |
- Location: {meta.get('city', '')}, {meta.get('state', '')}
|
| 1042 |
"""
|
| 1043 |
else:
|
| 1044 |
-
# Compact format for large sets
|
| 1045 |
-
entry = f"{i}. {meta.get('document_title', 'Unknown')} | {meta.get('insured_name', 'N/A')} | ₹{meta.get('premium_amount', 0):,.0f} | {meta.get('policy_type', 'N/A')}"
|
| 1046 |
|
| 1047 |
context_parts.append(entry)
|
| 1048 |
|
|
@@ -1059,6 +1166,397 @@ Summary: {summary[:300] if summary else 'No summary available'}
|
|
| 1059 |
'sources': {m.get('doc_id'): m.get('document_title') for m in all_metadata}
|
| 1060 |
}
|
| 1061 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1062 |
def _stream_metadata_query(self, user_id: str, bucket_id: str,
|
| 1063 |
query: str, parsed: dict, chat_id: str = ""):
|
| 1064 |
"""
|
|
@@ -1116,13 +1614,34 @@ Summary: {summary[:300] if summary else 'No summary available'}
|
|
| 1116 |
total_before = result.get('total_before_filter', 0)
|
| 1117 |
calculation = result.get('calculation')
|
| 1118 |
|
| 1119 |
-
# Check if we have any data
|
| 1120 |
if not context or total_docs == 0:
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1126 |
|
| 1127 |
# Send sources first
|
| 1128 |
yield {
|
|
@@ -1139,7 +1658,30 @@ Summary: {summary[:300] if summary else 'No summary available'}
|
|
| 1139 |
conciseness_directive = "\n\nIMPORTANT: Be concise and direct. No preambles or verbose explanations. Get straight to the formatted answer." if format_preference else ""
|
| 1140 |
|
| 1141 |
if intent == 'count':
|
| 1142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1143 |
|
| 1144 |
CRITICAL INSTRUCTIONS:
|
| 1145 |
1. The count has been computed: {total_docs} documents match the criteria.
|
|
@@ -1191,18 +1733,31 @@ CRITICAL INSTRUCTIONS:
|
|
| 1191 |
{format_instructions if format_instructions else "FORMAT: Use tables or side-by-side format where helpful."}"""
|
| 1192 |
|
| 1193 |
else: # list, summarize, or other
|
| 1194 |
-
system_prompt = f"""You are Iribl AI, a document analysis assistant. You are answering a query
|
| 1195 |
-
|
| 1196 |
-
CRITICAL
|
| 1197 |
-
1. You have been given metadata for {total_docs} documents
|
| 1198 |
-
2.
|
| 1199 |
-
3.
|
| 1200 |
-
4.
|
| 1201 |
-
5. For
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1202 |
|
| 1203 |
{format_instructions if format_instructions else "FORMAT: Use headers, bullet points, and bold text for clarity."}
|
| 1204 |
|
| 1205 |
-
|
| 1206 |
|
| 1207 |
# Step 3: Load conversation history for memory (CRITICAL FOR CONTEXT)
|
| 1208 |
stored_history = []
|
|
@@ -1900,13 +2455,25 @@ Instructions: Synthesize from multiple documents if relevant. Be detailed but co
|
|
| 1900 |
print(f"[QUERY ROUTING] AI-parsed query: {parsed}")
|
| 1901 |
|
| 1902 |
# Route based on AI-parsed intent
|
| 1903 |
-
|
| 1904 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1905 |
yield from self._stream_metadata_query(user_id, bucket_id, query, parsed, chat_id)
|
| 1906 |
return
|
| 1907 |
|
| 1908 |
-
|
| 1909 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1910 |
|
| 1911 |
# Step 1: Expand query for better retrieval (handles "module 5" -> "module five", etc.)
|
| 1912 |
expanded_queries = self._expand_query(query)
|
|
|
|
| 234 |
2. When multiple industries are mentioned (e.g., "manufacturing and healthcare"), combine them with comma: "manufacturing, healthcare"
|
| 235 |
3. When user asks for "top N" of something, set both limit AND sort_by appropriately
|
| 236 |
4. Keywords like "manufacturing", "healthcare", "retail", "IT", "construction" are INDUSTRIES - put them in filters
|
| 237 |
+
5. COMPANY NAME EXTRACTION: When user mentions a company name (e.g., "ABC Corp", "XYZ Industries", "Company Name"), extract it to insured_name filter. Extract the company name as mentioned in the query, even if it's partial. The system will handle name variations (case, spacing, suffixes like "Pvt Ltd", singular/plural) automatically.
|
| 238 |
+
6. TYPO HANDLING: If user makes typos (e.g., "policie" -> "policies", "polciy" -> "policy"), still extract the correct intent and filters. The system is forgiving of spelling errors.
|
| 239 |
+
7. COMPANY vs INDIVIDUAL: When user mentions a company name with business keywords (e.g., "ABC Chemical", "XYZ Industries", "Company Corp"), they want COMPANY policies, not individual person policies. The system will automatically filter out individual person names when company keywords are detected.
|
| 240 |
|
| 241 |
FORMAT DETECTION (NEW):
|
| 242 |
1. Detect if user explicitly asks for a specific format:
|
|
|
|
| 293 |
{"intent":"list","needs_metadata":true,"filters":{"policy_type":"fire"},"sort_by":null,"sort_order":"desc","limit":null,"calculation":null,"calculation_field":null,"format_preference":"bullets","is_format_change":false}
|
| 294 |
|
| 295 |
Query: "top 5 health policies by sum insured as a table"
|
| 296 |
+
{"intent":"rank","needs_metadata":true,"filters":{"policy_type":"health"},"sort_by":"sum_insured","sort_order":"desc","limit":5,"calculation":null,"calculation_field":null,"format_preference":"table","is_format_change":false}
|
| 297 |
+
|
| 298 |
+
Query: "renewals in march 2026"
|
| 299 |
+
{"intent":"list","needs_metadata":true,"filters":{"renewal_year":2026,"renewal_month":"march"},"sort_by":"renewal_date","sort_order":"asc","limit":null,"calculation":null,"calculation_field":null,"format_preference":null,"is_format_change":false}
|
| 300 |
+
|
| 301 |
+
Query: "renewals in march 2026 also list the renewal date"
|
| 302 |
+
{"intent":"list","needs_metadata":true,"filters":{"renewal_year":2026,"renewal_month":"march"},"sort_by":"renewal_date","sort_order":"asc","limit":null,"calculation":null,"calculation_field":null,"format_preference":"table","is_format_change":false}
|
| 303 |
+
|
| 304 |
+
Query: "policies expiring in april 2025 with premium details"
|
| 305 |
+
{"intent":"list","needs_metadata":true,"filters":{"renewal_year":2025,"renewal_month":"april"},"sort_by":"renewal_date","sort_order":"asc","limit":null,"calculation":null,"calculation_field":null,"format_preference":null,"is_format_change":false}
|
| 306 |
+
|
| 307 |
+
Query: "list all ABC Corp policies"
|
| 308 |
+
{"intent":"list","needs_metadata":true,"filters":{"insured_name":"ABC Corp"},"sort_by":null,"sort_order":"desc","limit":null,"calculation":null,"calculation_field":null,"format_preference":null,"is_format_change":false}
|
| 309 |
+
|
| 310 |
+
Query: "show me policies for XYZ Industries"
|
| 311 |
+
{"intent":"list","needs_metadata":true,"filters":{"insured_name":"XYZ Industries"},"sort_by":null,"sort_order":"desc","limit":null,"calculation":null,"calculation_field":null,"format_preference":null,"is_format_change":false}"""
|
| 312 |
|
| 313 |
messages = [
|
| 314 |
{"role": "system", "content": system_prompt},
|
|
|
|
| 475 |
Returns markdown-compatible formatting guidance.
|
| 476 |
"""
|
| 477 |
format_map = {
|
| 478 |
+
"table": """FORMAT: Present data as a complete markdown table.
|
| 479 |
+
CRITICAL TABLE RULES:
|
| 480 |
+
1. Include EVERY SINGLE item from the data - do NOT truncate or summarize
|
| 481 |
+
2. Use these standard columns: | S.No | Document/Policy Name | Insured Name | Policy Type | Sum Insured | Premium | Renewal Date |
|
| 482 |
+
3. Use | column | headers | with |---| separator line
|
| 483 |
+
4. If there are 37 items, the table MUST have 37 rows (plus header)
|
| 484 |
+
5. Use ₹ symbol for currency values with proper formatting""",
|
| 485 |
|
| 486 |
+
"list": """FORMAT: Present as a complete numbered list.
|
| 487 |
+
CRITICAL LIST RULES:
|
| 488 |
+
1. Include EVERY SINGLE item - do NOT skip any
|
| 489 |
+
2. Number each item starting from 1
|
| 490 |
+
3. Include key details: name, policy type, amounts, dates
|
| 491 |
+
4. If there are 37 items, list ALL 37 items""",
|
| 492 |
|
| 493 |
+
"bullets": """FORMAT: Use bullet points for all items.
|
| 494 |
+
CRITICAL BULLET RULES:
|
| 495 |
+
1. Include EVERY SINGLE item as a bullet point
|
| 496 |
+
2. Sub-details can be indented bullets
|
| 497 |
+
3. Do NOT summarize or truncate the list
|
| 498 |
+
4. If there are 37 items, show ALL 37 bullets""",
|
| 499 |
|
| 500 |
"paragraph": """FORMAT: Write in flowing prose paragraphs.
|
| 501 |
- Use complete sentences and natural language
|
| 502 |
- Group related information into paragraphs
|
| 503 |
+
- Still mention ALL items, just in prose form"""
|
| 504 |
}
|
| 505 |
|
| 506 |
return format_map.get(format_preference, "")
|
|
|
|
| 953 |
|
| 954 |
elif field in ['city', 'state', 'insurer_name', 'insured_name', 'broker_name']:
|
| 955 |
# Handle comma-separated values (OR logic)
|
| 956 |
+
# For name fields, use flexible matching to handle variations
|
| 957 |
filter_values = [v.strip().lower() for v in str(value).split(',')]
|
| 958 |
+
|
| 959 |
+
def matches_name(metadata_value, filter_value):
|
| 960 |
+
"""Flexible name matching that handles variations"""
|
| 961 |
+
if not metadata_value or not filter_value:
|
| 962 |
+
return False
|
| 963 |
+
|
| 964 |
+
meta_lower = str(metadata_value).lower()
|
| 965 |
+
filter_lower = filter_value.lower()
|
| 966 |
+
|
| 967 |
+
# Detect if filter is looking for a company (contains business keywords)
|
| 968 |
+
# vs individual person (just a name)
|
| 969 |
+
filter_is_company = any(keyword in filter_lower for keyword in
|
| 970 |
+
['chemical', 'chemicals', 'industries', 'industry',
|
| 971 |
+
'corp', 'corporation', 'ltd', 'limited', 'pvt',
|
| 972 |
+
'private', 'inc', 'incorporated', 'llc', 'company',
|
| 973 |
+
'enterprises', 'group', 'holdings'])
|
| 974 |
+
|
| 975 |
+
# Detect if metadata is a company (has business suffixes or keywords)
|
| 976 |
+
meta_is_company = any(keyword in meta_lower for keyword in
|
| 977 |
+
[' pvt ltd', ' pvt. ltd', ' ltd', ' ltd.', ' limited',
|
| 978 |
+
' inc', ' inc.', ' incorporated', ' llc', ' llc.',
|
| 979 |
+
' corporation', ' corp', ' corp.', ' industries',
|
| 980 |
+
' industry', ' company', ' enterprises', ' group'])
|
| 981 |
+
|
| 982 |
+
# If filter is for a company but metadata is individual, skip
|
| 983 |
+
# This prevents matching "Choksey Chemical" company with "Bharat Choksey" person
|
| 984 |
+
if filter_is_company and not meta_is_company:
|
| 985 |
+
# Check if metadata is clearly a person name (has first/middle/last name pattern)
|
| 986 |
+
# Person names typically have 2-4 words, company names are usually longer or have suffixes
|
| 987 |
+
meta_words = meta_lower.split()
|
| 988 |
+
if len(meta_words) <= 4 and not any(char.isdigit() for char in meta_lower):
|
| 989 |
+
# Likely a person name, skip if filter is for company
|
| 990 |
+
return False
|
| 991 |
+
|
| 992 |
+
# Remove common suffixes/prefixes for better matching
|
| 993 |
+
# Remove "pvt ltd", "ltd", "inc", "llc", etc.
|
| 994 |
+
meta_clean = meta_lower
|
| 995 |
+
filter_clean = filter_lower
|
| 996 |
+
|
| 997 |
+
for suffix in [' pvt ltd', ' pvt. ltd', ' pvt ltd.', ' pvt. ltd.',
|
| 998 |
+
' ltd', ' ltd.', ' limited', ' inc', ' inc.',
|
| 999 |
+
' incorporated', ' llc', ' llc.', ' corporation', ' corp', ' corp.',
|
| 1000 |
+
' industries', ' industry', ' company', ' enterprises', ' group']:
|
| 1001 |
+
meta_clean = meta_clean.replace(suffix, '')
|
| 1002 |
+
filter_clean = filter_clean.replace(suffix, '')
|
| 1003 |
+
|
| 1004 |
+
# Remove extra spaces and punctuation
|
| 1005 |
+
import re
|
| 1006 |
+
meta_clean = re.sub(r'[^\w\s]', ' ', meta_clean)
|
| 1007 |
+
filter_clean = re.sub(r'[^\w\s]', ' ', filter_clean)
|
| 1008 |
+
meta_clean = ' '.join(meta_clean.split())
|
| 1009 |
+
filter_clean = ' '.join(filter_clean.split())
|
| 1010 |
+
|
| 1011 |
+
# Check if filter value is a substring of metadata value
|
| 1012 |
+
if filter_clean in meta_clean:
|
| 1013 |
+
return True
|
| 1014 |
+
|
| 1015 |
+
# Also check if all significant words from filter are in metadata
|
| 1016 |
+
# Handle singular/plural variations
|
| 1017 |
+
filter_words = [w for w in filter_clean.split() if len(w) > 2]
|
| 1018 |
+
if filter_words:
|
| 1019 |
+
meta_words = set(meta_clean.split())
|
| 1020 |
+
for word in filter_words:
|
| 1021 |
+
# Check exact match
|
| 1022 |
+
if word in meta_words:
|
| 1023 |
+
continue
|
| 1024 |
+
# Check singular/plural variations
|
| 1025 |
+
word_singular = word.rstrip('s') if word.endswith('s') else word
|
| 1026 |
+
word_plural = word + 's' if not word.endswith('s') else word
|
| 1027 |
+
if word_singular in meta_words or word_plural in meta_words:
|
| 1028 |
+
continue
|
| 1029 |
+
# Check if word is a substring of any metadata word
|
| 1030 |
+
if any(word in mw or mw in word for mw in meta_words if len(mw) > 3):
|
| 1031 |
+
continue
|
| 1032 |
+
# If none match, this word doesn't match
|
| 1033 |
+
return False
|
| 1034 |
+
return True
|
| 1035 |
+
|
| 1036 |
+
return False
|
| 1037 |
+
|
| 1038 |
+
# Apply flexible matching
|
| 1039 |
all_metadata = [m for m in all_metadata
|
| 1040 |
+
if any(matches_name(m.get(field, ''), fv) for fv in filter_values)]
|
| 1041 |
print(f"[METADATA QUERY] Filtered by {field} {filter_values}: {len(all_metadata)} remaining")
|
| 1042 |
|
| 1043 |
elif field == 'renewal_year':
|
|
|
|
| 1148 |
- Location: {meta.get('city', '')}, {meta.get('state', '')}
|
| 1149 |
"""
|
| 1150 |
else:
|
| 1151 |
+
# Compact format for large sets - includes renewal date
|
| 1152 |
+
entry = f"{i}. {meta.get('document_title', 'Unknown')} | {meta.get('insured_name', 'N/A')} | ₹{meta.get('premium_amount', 0):,.0f} | {meta.get('policy_type', 'N/A')} | Renewal: {meta.get('renewal_date', 'N/A')}"
|
| 1153 |
|
| 1154 |
context_parts.append(entry)
|
| 1155 |
|
|
|
|
| 1166 |
'sources': {m.get('doc_id'): m.get('document_title') for m in all_metadata}
|
| 1167 |
}
|
| 1168 |
|
| 1169 |
+
def _get_rag_context_for_query(self, user_id: str, bucket_id: str, query: str,
|
| 1170 |
+
filters: dict = None, is_fallback: bool = False,
|
| 1171 |
+
doc_ids: list[str] = None) -> dict:
|
| 1172 |
+
"""
|
| 1173 |
+
Get RAG context from chunk retrieval for a query.
|
| 1174 |
+
Used as fallback when metadata filtering returns 0 results,
|
| 1175 |
+
or to supplement metadata with detailed document content.
|
| 1176 |
+
|
| 1177 |
+
Args:
|
| 1178 |
+
user_id: User ID
|
| 1179 |
+
bucket_id: Bucket ID
|
| 1180 |
+
query: The search query
|
| 1181 |
+
filters: Optional filters from parsed query (used to build search query)
|
| 1182 |
+
is_fallback: If True, use more aggressive search (higher top_k, better query construction)
|
| 1183 |
+
doc_ids: Optional list of specific document IDs to search (from document name detection)
|
| 1184 |
+
|
| 1185 |
+
Returns:
|
| 1186 |
+
dict with:
|
| 1187 |
+
- context: Combined text from retrieved chunks
|
| 1188 |
+
- sources: dict of doc_id -> filename
|
| 1189 |
+
- chunk_count: Number of chunks retrieved
|
| 1190 |
+
- chunks: Raw chunk data
|
| 1191 |
+
"""
|
| 1192 |
+
print(f"[HYBRID RAG] Getting RAG context for query: {query[:50]}... (fallback={is_fallback})")
|
| 1193 |
+
|
| 1194 |
+
# Step 0: Detect if user mentioned a specific document name in the query
|
| 1195 |
+
if doc_ids is None:
|
| 1196 |
+
user_docs = chroma_service.get_user_documents(user_id, bucket_id)
|
| 1197 |
+
referenced_doc_ids = self._detect_document_reference(query, user_docs)
|
| 1198 |
+
if referenced_doc_ids:
|
| 1199 |
+
doc_ids = referenced_doc_ids
|
| 1200 |
+
print(f"[HYBRID RAG] Detected document reference in query: {len(doc_ids)} documents")
|
| 1201 |
+
|
| 1202 |
+
# Build enhanced search query from filters if available
|
| 1203 |
+
search_query = query
|
| 1204 |
+
if filters:
|
| 1205 |
+
# Add filter values to improve semantic search
|
| 1206 |
+
filter_terms = []
|
| 1207 |
+
for field, value in filters.items():
|
| 1208 |
+
if value and field in ['insured_name', 'insurer_name', 'broker_name',
|
| 1209 |
+
'policy_type', 'industry', 'city', 'state']:
|
| 1210 |
+
filter_terms.append(str(value))
|
| 1211 |
+
if filter_terms:
|
| 1212 |
+
search_query = f"{query} {' '.join(filter_terms)}"
|
| 1213 |
+
print(f"[HYBRID RAG] Enhanced search query: {search_query[:80]}...")
|
| 1214 |
+
|
| 1215 |
+
# For fallback searches, use more aggressive parameters
|
| 1216 |
+
if is_fallback:
|
| 1217 |
+
# Extract key terms from original query for better matching
|
| 1218 |
+
# Split query into words and keep important terms
|
| 1219 |
+
query_words = query.lower().split()
|
| 1220 |
+
# Remove common stop words but keep entity names and numbers
|
| 1221 |
+
stop_words = {'how', 'many', 'are', 'is', 'the', 'a', 'an', 'for', 'in', 'on', 'at', 'to', 'of'}
|
| 1222 |
+
key_terms = [w for w in query_words if w not in stop_words and len(w) > 2]
|
| 1223 |
+
|
| 1224 |
+
# If we have filters, prioritize those terms
|
| 1225 |
+
if filters:
|
| 1226 |
+
for field, value in filters.items():
|
| 1227 |
+
if value and field in ['insured_name', 'insurer_name', 'broker_name']:
|
| 1228 |
+
# Add the filter value as a key term
|
| 1229 |
+
value_words = str(value).lower().split()
|
| 1230 |
+
key_terms.extend([w for w in value_words if len(w) > 2])
|
| 1231 |
+
|
| 1232 |
+
# Build a more focused search query for fallback
|
| 1233 |
+
if key_terms:
|
| 1234 |
+
# Use original query + key terms for better semantic matching
|
| 1235 |
+
enhanced_query = f"{query} {' '.join(set(key_terms))}"
|
| 1236 |
+
print(f"[HYBRID RAG] Fallback enhanced query: {enhanced_query[:100]}...")
|
| 1237 |
+
search_query = enhanced_query
|
| 1238 |
+
|
| 1239 |
+
# Perform semantic chunk search with higher top_k for fallback
|
| 1240 |
+
# IMPORTANT: ChromaDB Cloud has a quota limit of 300 results per query
|
| 1241 |
+
# Cap top_k to respect this limit
|
| 1242 |
+
CHROMADB_MAX_RESULTS = 300
|
| 1243 |
+
if is_fallback:
|
| 1244 |
+
top_k_value = min(self.top_k * 4, CHROMADB_MAX_RESULTS)
|
| 1245 |
+
else:
|
| 1246 |
+
top_k_value = min(self.top_k * 2, CHROMADB_MAX_RESULTS)
|
| 1247 |
+
print(f"[HYBRID RAG] Using top_k={top_k_value} for search (capped at {CHROMADB_MAX_RESULTS} for ChromaDB quota)")
|
| 1248 |
+
|
| 1249 |
+
chunks = chroma_service.search_chunks(
|
| 1250 |
+
user_id=user_id,
|
| 1251 |
+
query=search_query,
|
| 1252 |
+
bucket_id=bucket_id,
|
| 1253 |
+
doc_ids=doc_ids, # Pass doc_ids to filter by specific documents if detected
|
| 1254 |
+
top_k=top_k_value
|
| 1255 |
+
)
|
| 1256 |
+
|
| 1257 |
+
if not chunks:
|
| 1258 |
+
print("[HYBRID RAG] No chunks found from RAG search")
|
| 1259 |
+
return {
|
| 1260 |
+
'context': '',
|
| 1261 |
+
'sources': {},
|
| 1262 |
+
'chunk_count': 0,
|
| 1263 |
+
'chunks': []
|
| 1264 |
+
}
|
| 1265 |
+
|
| 1266 |
+
print(f"[HYBRID RAG] Found {len(chunks)} chunks via semantic search")
|
| 1267 |
+
|
| 1268 |
+
# Build context from chunks
|
| 1269 |
+
context_parts = []
|
| 1270 |
+
sources = {}
|
| 1271 |
+
|
| 1272 |
+
for i, chunk in enumerate(chunks, 1):
|
| 1273 |
+
doc_id = chunk['doc_id']
|
| 1274 |
+
|
| 1275 |
+
# Get filename from chroma if not cached
|
| 1276 |
+
if doc_id not in sources:
|
| 1277 |
+
doc_info = chroma_service.get_document(doc_id, user_id)
|
| 1278 |
+
filename = doc_info.get('filename', 'Document') if doc_info else 'Document'
|
| 1279 |
+
sources[doc_id] = filename
|
| 1280 |
+
|
| 1281 |
+
# Build context entry with document label
|
| 1282 |
+
section = f"=== DOCUMENT: {sources[doc_id]} (Section {i}) ===\n{chunk['text']}"
|
| 1283 |
+
context_parts.append(section)
|
| 1284 |
+
|
| 1285 |
+
context = "\n\n".join(context_parts)
|
| 1286 |
+
print(f"[HYBRID RAG] Built context: {len(context)} chars from {len(chunks)} chunks")
|
| 1287 |
+
|
| 1288 |
+
return {
|
| 1289 |
+
'context': context,
|
| 1290 |
+
'sources': sources,
|
| 1291 |
+
'chunk_count': len(chunks),
|
| 1292 |
+
'chunks': chunks
|
| 1293 |
+
}
|
| 1294 |
+
|
| 1295 |
+
def _combine_metadata_and_rag(self, metadata_result: dict, rag_result: dict) -> dict:
|
| 1296 |
+
"""
|
| 1297 |
+
Combine metadata and RAG contexts for hybrid queries.
|
| 1298 |
+
Provides structured metadata summary + detailed RAG content.
|
| 1299 |
+
|
| 1300 |
+
Args:
|
| 1301 |
+
metadata_result: Result from _handle_metadata_query
|
| 1302 |
+
rag_result: Result from _get_rag_context_for_query
|
| 1303 |
+
|
| 1304 |
+
Returns:
|
| 1305 |
+
Combined context dict with merged sources
|
| 1306 |
+
"""
|
| 1307 |
+
combined_parts = []
|
| 1308 |
+
|
| 1309 |
+
# Add metadata summary section if available
|
| 1310 |
+
if metadata_result.get('context') and metadata_result.get('total_documents', 0) > 0:
|
| 1311 |
+
combined_parts.append("=== DOCUMENT METADATA (Structured Fields) ===")
|
| 1312 |
+
combined_parts.append(metadata_result['context'])
|
| 1313 |
+
combined_parts.append("")
|
| 1314 |
+
|
| 1315 |
+
# Add RAG context section if available
|
| 1316 |
+
if rag_result.get('context') and rag_result.get('chunk_count', 0) > 0:
|
| 1317 |
+
combined_parts.append("=== DETAILED DOCUMENT CONTENT (From Text Search) ===")
|
| 1318 |
+
combined_parts.append(rag_result['context'])
|
| 1319 |
+
|
| 1320 |
+
# Merge sources
|
| 1321 |
+
all_sources = {}
|
| 1322 |
+
all_sources.update(metadata_result.get('sources', {}))
|
| 1323 |
+
all_sources.update(rag_result.get('sources', {}))
|
| 1324 |
+
|
| 1325 |
+
combined_context = "\n".join(combined_parts)
|
| 1326 |
+
|
| 1327 |
+
print(f"[HYBRID] Combined context: metadata={metadata_result.get('total_documents', 0)} docs, "
|
| 1328 |
+
f"rag={rag_result.get('chunk_count', 0)} chunks, total sources={len(all_sources)}")
|
| 1329 |
+
|
| 1330 |
+
return {
|
| 1331 |
+
'context': combined_context,
|
| 1332 |
+
'sources': all_sources,
|
| 1333 |
+
'total_documents': metadata_result.get('total_documents', 0),
|
| 1334 |
+
'chunk_count': rag_result.get('chunk_count', 0),
|
| 1335 |
+
'calculation': metadata_result.get('calculation'),
|
| 1336 |
+
'total_before_filter': metadata_result.get('total_before_filter', 0)
|
| 1337 |
+
}
|
| 1338 |
+
|
| 1339 |
+
def _stream_hybrid_query(self, user_id: str, bucket_id: str,
                         query: str, parsed: dict, chat_id: str = ""):
    """
    Stream responses for HYBRID queries.
    Combines metadata (structured fields) with RAG (detailed content) for comprehensive answers.

    Works for all query types: specific, compare, general, summarize, followup.

    Args:
        user_id: Owner of the documents being searched.
        bucket_id: Bucket to scope the search to.
        query: The user's natural-language question.
        parsed: AI-parsed query dict; reads 'intent', 'format_preference', 'filters'.
        chat_id: Conversation id; when non-empty, history is loaded and the
            exchange is persisted afterwards.

    Yields:
        Event dicts, in order: one "sources" event (or a single "error" event if
        nothing matched), zero or more "content" events with streamed text, and
        a final "done" event with routing stats.
    """
    print(f"[HYBRID STREAM] Handling {parsed.get('intent')} query with metadata+RAG")

    # Get format preference (e.g. table/list) and its prompt instructions
    format_preference = parsed.get('format_preference')
    format_instructions = self._get_format_instructions(format_preference) if format_preference else ""

    # Step 1: Get metadata context (may return 0 if filters don't match exactly)
    metadata_result = self._handle_metadata_query(user_id, bucket_id, query, parsed)
    print(f"[HYBRID STREAM] Metadata returned {metadata_result.get('total_documents', 0)} docs")

    # Step 2: Always get RAG context for detailed content
    # If metadata returned 0, use fallback mode for more aggressive search
    # Also detect document names in query for targeted search
    metadata_has_results = metadata_result.get('total_documents', 0) > 0
    rag_result = self._get_rag_context_for_query(
        user_id, bucket_id, query,
        filters=parsed.get('filters'),
        is_fallback=not metadata_has_results,  # Use fallback mode if metadata failed
        doc_ids=None  # Document name detection happens inside the method
    )
    print(f"[HYBRID STREAM] RAG returned {rag_result.get('chunk_count', 0)} chunks")

    # Step 3: Combine contexts (metadata+RAG, RAG-only, or bail out with error)
    if metadata_result.get('total_documents', 0) > 0:
        # Have metadata - combine with RAG
        combined = self._combine_metadata_and_rag(metadata_result, rag_result)
    elif rag_result.get('chunk_count', 0) > 0:
        # No metadata match but RAG found content - use RAG only
        print("[HYBRID STREAM] No metadata match, using RAG-only context")
        # Shape mirrors _combine_metadata_and_rag's return so downstream code is uniform
        combined = {
            'context': rag_result['context'],
            'sources': rag_result['sources'],
            'total_documents': 0,
            'chunk_count': rag_result['chunk_count'],
            'calculation': None,
            'total_before_filter': 0
        }
    else:
        # Neither found anything
        yield {
            "type": "error",
            "content": "No matching documents found. The document may not exist or try rephrasing your query."
        }
        return

    context = combined['context']
    sources = combined['sources']
    # NOTE: total_docs is computed but the prompts below interpolate per-branch
    # values; kept as-is for parity with the metadata path.
    total_docs = combined.get('total_documents', 0) + combined.get('chunk_count', 0)

    # Send sources first so the UI can render attributions before content streams
    yield {
        "type": "sources",
        "sources": list(sources.keys()),
        "source_files": list(sources.values())
    }

    # Step 4: Build AI prompt based on intent
    intent = parsed.get('intent', 'specific')

    if intent == 'compare':
        system_prompt = f"""You are Iribl AI, a document analysis assistant answering a COMPARISON query.

CRITICAL INSTRUCTIONS:
1. You have BOTH structured metadata AND detailed document content.
2. Use metadata for key fields: policy numbers, amounts, dates, companies.
3. Use detailed content for specifics not in metadata.
4. Create a clear comparison highlighting differences and similarities.
5. Use a table format if comparing multiple attributes.

{format_instructions}

Do NOT say information is missing if it's in the provided context."""

    elif intent == 'summarize':
        system_prompt = f"""You are Iribl AI, a document analysis assistant providing a SUMMARY.

CRITICAL INSTRUCTIONS:
1. You have BOTH structured metadata AND detailed document content.
2. Provide a concise but comprehensive summary.
3. Include key facts: insured name, policy type, coverage, premium, dates.
4. Highlight important terms or conditions from detailed content.
5. Format with clear headers and bullet points.

{format_instructions}

Do NOT say information is missing - search through ALL provided context thoroughly."""

    elif intent == 'specific':
        system_prompt = f"""You are Iribl AI, a document analysis assistant answering a SPECIFIC query about a particular document or entity.

CRITICAL INSTRUCTIONS:
1. You have BOTH structured metadata AND detailed document content.
2. Use metadata for: policy number, insured name, sum insured, premium, dates.
3. Use detailed content for: coverage details, terms, conditions, exclusions.
4. Provide a comprehensive answer covering all relevant information.
5. Format clearly with headers and bullet points.

{format_instructions}

Do NOT say information is missing - search through ALL provided context thoroughly."""

    else:  # general, followup, or any other
        system_prompt = f"""You are Iribl AI, a document analysis assistant.

CRITICAL INSTRUCTIONS:
1. You have BOTH structured metadata AND detailed document content.
2. Search thoroughly through ALL provided context before answering.
3. Use metadata for structured fields like names, amounts, dates.
4. Use detailed content for explanations, terms, conditions.
5. Provide a complete and accurate answer based on the documents.
6. Format clearly with headers and bullet points where appropriate.

{format_instructions}

Do NOT say information is missing - search through ALL provided context thoroughly."""

    # Step 5: Load conversation history (best-effort; failures are logged, not fatal)
    stored_history = []
    if chat_id:
        try:
            all_history = chroma_service.get_conversation_history(
                user_id=user_id,
                bucket_id=bucket_id,
                limit=50
            )
            # Keep only this chat's messages, then trim to the most recent max_history
            stored_history = [msg for msg in all_history if msg.get('chat_id', '') == chat_id]
            stored_history = stored_history[-self.max_history:]
        except Exception as e:
            print(f"[HYBRID STREAM] Failed to load history: {e}")

    # Step 6: Build messages (system prompt + prior turns + current question)
    messages = [{"role": "system", "content": system_prompt}]

    for msg in stored_history:
        messages.append({
            "role": msg['role'],
            "content": msg['content']
        })

    format_reminder = f"\n\nRemember: Format response as {format_preference}." if format_preference else ""

    user_message = f"""Based on the following document data, answer my question comprehensively.

DOCUMENT DATA:
{context}

QUESTION: {query}

Instructions: Use both the structured metadata AND detailed content to provide a complete answer.{format_reminder}"""

    messages.append({"role": "user", "content": user_message})

    # Step 7: Stream response (DeepSeek first if enabled, then OpenRouter fallback chain)
    full_response = ""
    chunk_count = 0

    if self.use_deepseek:
        print("[HYBRID STREAM] Using DeepSeek for response")
        for chunk in self._call_deepseek_streaming(messages):
            if "error" in chunk:
                # Abort this provider; the fallback below will retry
                break
            if "chunk" in chunk:
                full_response += chunk["chunk"]
                chunk_count += 1
                yield {"type": "content", "content": chunk["chunk"]}

    # Fallback to OpenRouter if needed
    if not full_response:
        print("[HYBRID STREAM] Falling back to OpenRouter")
        for model_key in self.fallback_order:
            try:
                for chunk in self._call_ai_model_streaming(model_key, messages):
                    if "error" in chunk:
                        # Skip error chunks but keep consuming this model's stream
                        continue
                    if "chunk" in chunk:
                        full_response += chunk["chunk"]
                        chunk_count += 1
                        yield {"type": "content", "content": chunk["chunk"]}
                if full_response:
                    break
            except Exception as e:
                print(f"[HYBRID STREAM] Model {model_key} failed: {e}")
                continue

    # Step 8: Store conversation (both turns; best-effort)
    if full_response and chat_id:
        try:
            chroma_service.store_conversation(
                user_id=user_id,
                role="user",
                content=query,
                bucket_id=bucket_id or "",
                chat_id=chat_id
            )
            chroma_service.store_conversation(
                user_id=user_id,
                role="assistant",
                content=full_response,
                bucket_id=bucket_id or "",
                chat_id=chat_id,
                format_preference=format_preference
            )
        except Exception as e:
            print(f"[HYBRID STREAM] Failed to store conversation: {e}")

    # Final event with routing stats for the client
    yield {
        "type": "done",
        "query_type": "hybrid",
        "intent": intent,
        "metadata_docs": combined.get('total_documents', 0),
        "rag_chunks": combined.get('chunk_count', 0)
    }
|
| 1559 |
+
|
| 1560 |
def _stream_metadata_query(self, user_id: str, bucket_id: str,
|
| 1561 |
query: str, parsed: dict, chat_id: str = ""):
|
| 1562 |
"""
|
|
|
|
| 1614 |
total_before = result.get('total_before_filter', 0)
|
| 1615 |
calculation = result.get('calculation')
|
| 1616 |
|
| 1617 |
+
# Check if we have any data - if not, try RAG fallback
|
| 1618 |
if not context or total_docs == 0:
|
| 1619 |
+
print(f"[HYBRID] Metadata returned 0 results, attempting RAG fallback...")
|
| 1620 |
+
|
| 1621 |
+
# Try RAG fallback with the filters as search enhancement
|
| 1622 |
+
# Use is_fallback=True for more aggressive search
|
| 1623 |
+
# Also detect document names in query for targeted search
|
| 1624 |
+
rag_result = self._get_rag_context_for_query(
|
| 1625 |
+
user_id, bucket_id, query,
|
| 1626 |
+
filters=parsed.get('filters'),
|
| 1627 |
+
is_fallback=True, # Use more aggressive search parameters
|
| 1628 |
+
doc_ids=None # Document name detection happens inside the method
|
| 1629 |
+
)
|
| 1630 |
+
|
| 1631 |
+
if rag_result.get('context') and rag_result.get('chunk_count', 0) > 0:
|
| 1632 |
+
# Use RAG context instead
|
| 1633 |
+
context = rag_result['context']
|
| 1634 |
+
sources = rag_result['sources']
|
| 1635 |
+
total_docs = rag_result['chunk_count']
|
| 1636 |
+
total_before = 0 # Not applicable for RAG
|
| 1637 |
+
print(f"[HYBRID] RAG fallback successful: found {total_docs} chunks")
|
| 1638 |
+
else:
|
| 1639 |
+
# Both metadata and RAG failed
|
| 1640 |
+
yield {
|
| 1641 |
+
"type": "error",
|
| 1642 |
+
"content": "No matching documents found. The document may not exist in this collection, or try rephrasing your query."
|
| 1643 |
+
}
|
| 1644 |
+
return
|
| 1645 |
|
| 1646 |
# Send sources first
|
| 1647 |
yield {
|
|
|
|
| 1658 |
conciseness_directive = "\n\nIMPORTANT: Be concise and direct. No preambles or verbose explanations. Get straight to the formatted answer." if format_preference else ""
|
| 1659 |
|
| 1660 |
if intent == 'count':
|
| 1661 |
+
# Check if we're using RAG fallback (metadata returned 0)
|
| 1662 |
+
is_rag_fallback = (total_before == 0 and total_docs > 0)
|
| 1663 |
+
|
| 1664 |
+
if is_rag_fallback:
|
| 1665 |
+
# Using RAG content - need to extract count from document text
|
| 1666 |
+
system_prompt = f"""You are Iribl AI, a document analysis assistant answering a COUNT query.
|
| 1667 |
+
|
| 1668 |
+
CRITICAL INSTRUCTIONS:
|
| 1669 |
+
1. The user is asking for a COUNT/NUMBER that may not be in structured metadata.
|
| 1670 |
+
2. You have been provided with detailed document content from RAG search.
|
| 1671 |
+
3. CAREFULLY read through the document content to find the specific number/count requested.
|
| 1672 |
+
4. Look for numbers, counts, totals, or quantities related to the query.
|
| 1673 |
+
5. If the query asks "how many students", search for phrases like:
|
| 1674 |
+
- "total students", "number of students", "students insured", "X students"
|
| 1675 |
+
- Look for explicit numbers in the context
|
| 1676 |
+
6. State the count clearly and directly. If you find the number, present it confidently.
|
| 1677 |
+
7. If the count is not explicitly stated, say so clearly.{conciseness_directive}
|
| 1678 |
+
|
| 1679 |
+
{format_instructions}
|
| 1680 |
+
|
| 1681 |
+
IMPORTANT: The answer is in the provided document content. Read it carefully to extract the exact number."""
|
| 1682 |
+
else:
|
| 1683 |
+
# Using metadata - count is pre-computed
|
| 1684 |
+
system_prompt = f"""You are Iribl AI, a document analysis assistant answering a COUNT query.
|
| 1685 |
|
| 1686 |
CRITICAL INSTRUCTIONS:
|
| 1687 |
1. The count has been computed: {total_docs} documents match the criteria.
|
|
|
|
| 1733 |
{format_instructions if format_instructions else "FORMAT: Use tables or side-by-side format where helpful."}"""
|
| 1734 |
|
| 1735 |
else: # list, summarize, or other
|
| 1736 |
+
system_prompt = f"""You are Iribl AI, a document analysis assistant. You are answering a query about {total_docs} documents.
|
| 1737 |
+
|
| 1738 |
+
ABSOLUTELY CRITICAL - READ CAREFULLY:
|
| 1739 |
+
1. You have been given metadata for EXACTLY {total_docs} documents.
|
| 1740 |
+
2. When asked to list or format as table, you MUST include ALL {total_docs} items.
|
| 1741 |
+
3. Do NOT truncate, summarize, or skip ANY items.
|
| 1742 |
+
4. If there are {total_docs} documents, your response MUST contain {total_docs} entries.
|
| 1743 |
+
5. For TABLES: Include EVERY row - the table must have exactly {total_docs} data rows.
|
| 1744 |
+
6. For LISTS: Number 1 through {total_docs} - include every single one.
|
| 1745 |
+
|
| 1746 |
+
METADATA COLUMNS AVAILABLE:
|
| 1747 |
+
- document_title (Policy/Document Name)
|
| 1748 |
+
- insured_name (Insured Company)
|
| 1749 |
+
- insurer_name (Insurance Company)
|
| 1750 |
+
- policy_type (Type of Policy)
|
| 1751 |
+
- sum_insured (Coverage Amount)
|
| 1752 |
+
- premium_amount (Premium)
|
| 1753 |
+
- renewal_date (Renewal Date)
|
| 1754 |
+
- renewal_year (Renewal Year)
|
| 1755 |
+
- policy_start_date, policy_end_date
|
| 1756 |
+
- city, state (Location)
|
| 1757 |
|
| 1758 |
{format_instructions if format_instructions else "FORMAT: Use headers, bullet points, and bold text for clarity."}
|
| 1759 |
|
| 1760 |
+
FAILURE TO INCLUDE ALL {total_docs} ITEMS IS UNACCEPTABLE. Do NOT say 'and X more' or truncate the list."""
|
| 1761 |
|
| 1762 |
# Step 3: Load conversation history for memory (CRITICAL FOR CONTEXT)
|
| 1763 |
stored_history = []
|
|
|
|
| 2455 |
print(f"[QUERY ROUTING] AI-parsed query: {parsed}")
|
| 2456 |
|
| 2457 |
# Route based on AI-parsed intent
|
| 2458 |
+
intent = parsed.get('intent', 'specific')
|
| 2459 |
+
needs_metadata = parsed.get('needs_metadata', False)
|
| 2460 |
+
|
| 2461 |
+
# HYBRID ROUTING LOGIC:
|
| 2462 |
+
# 1. For aggregate/list/count/rank queries: Use metadata (with RAG fallback)
|
| 2463 |
+
# 2. For ALL other queries: Use HYBRID (metadata + RAG together) for comprehensive answers
|
| 2464 |
+
|
| 2465 |
+
if intent in ['list', 'count', 'rank', 'calculate'] and needs_metadata:
|
| 2466 |
+
# Aggregate queries - metadata is primary, RAG is fallback (handled inside)
|
| 2467 |
+
print(f"[QUERY ROUTING] Using METADATA path for {intent} query")
|
| 2468 |
yield from self._stream_metadata_query(user_id, bucket_id, query, parsed, chat_id)
|
| 2469 |
return
|
| 2470 |
|
| 2471 |
+
else:
|
| 2472 |
+
# ALL other queries (specific, compare, general, summarize, followup)
|
| 2473 |
+
# Use HYBRID approach - both metadata AND RAG for comprehensive answers
|
| 2474 |
+
print(f"[QUERY ROUTING] Using HYBRID path for {intent} query")
|
| 2475 |
+
yield from self._stream_hybrid_query(user_id, bucket_id, query, parsed, chat_id)
|
| 2476 |
+
return
|
| 2477 |
|
| 2478 |
# Step 1: Expand query for better retrieval (handles "module 5" -> "module five", etc.)
|
| 2479 |
expanded_queries = self._expand_query(query)
|
static/css/styles.css
CHANGED
|
@@ -671,6 +671,7 @@ body {
|
|
| 671 |
overflow: hidden;
|
| 672 |
height: 100%;
|
| 673 |
/* Ensure it takes full height */
|
|
|
|
| 674 |
}
|
| 675 |
|
| 676 |
/* ==================== Chat Bucket Filter ==================== */
|
|
@@ -807,6 +808,21 @@ body {
|
|
| 807 |
gap: 1rem;
|
| 808 |
min-height: 0;
|
| 809 |
/* Critical: allows scrolling to work */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 810 |
}
|
| 811 |
|
| 812 |
/* Custom scrollbar for chat messages */
|
|
@@ -830,8 +846,10 @@ body {
|
|
| 830 |
.message {
|
| 831 |
display: flex;
|
| 832 |
gap: 0.75rem;
|
| 833 |
-
max-width:
|
|
|
|
| 834 |
animation: messageSlide 0.3s ease-out;
|
|
|
|
| 835 |
}
|
| 836 |
|
| 837 |
@keyframes messageSlide {
|
|
@@ -849,6 +867,12 @@ body {
|
|
| 849 |
.message.user {
|
| 850 |
align-self: flex-end;
|
| 851 |
flex-direction: row-reverse;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 852 |
}
|
| 853 |
|
| 854 |
.message-avatar {
|
|
@@ -868,20 +892,29 @@ body {
|
|
| 868 |
padding: 1rem 1.25rem;
|
| 869 |
border-radius: var(--radius-lg);
|
| 870 |
font-size: 0.9rem;
|
| 871 |
-
line-height: 1.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 872 |
}
|
| 873 |
|
| 874 |
.message.user .message-content {
|
| 875 |
background: var(--accent-primary);
|
| 876 |
-
color: var(--bg-darkest);
|
| 877 |
border-bottom-right-radius: 4px;
|
|
|
|
|
|
|
|
|
|
| 878 |
}
|
| 879 |
|
| 880 |
.message.assistant .message-content {
|
| 881 |
background: linear-gradient(135deg, var(--bg-light) 0%, var(--bg-medium) 100%);
|
| 882 |
border: 1px solid var(--glass-border);
|
| 883 |
border-bottom-left-radius: 4px;
|
| 884 |
-
box-shadow: 0
|
| 885 |
}
|
| 886 |
|
| 887 |
.message-sources {
|
|
@@ -905,54 +938,84 @@ body {
|
|
| 905 |
.message-content h2,
|
| 906 |
.message-content h3,
|
| 907 |
.message-content h4,
|
|
|
|
|
|
|
| 908 |
.message-content .msg-header {
|
| 909 |
-
font-weight:
|
| 910 |
color: var(--text-primary);
|
| 911 |
-
margin:
|
| 912 |
-
line-height: 1.
|
|
|
|
| 913 |
}
|
| 914 |
|
| 915 |
.message-content h1 {
|
| 916 |
-
font-size: 1.
|
| 917 |
-
background: linear-gradient(
|
| 918 |
-webkit-background-clip: text;
|
| 919 |
-webkit-text-fill-color: transparent;
|
| 920 |
background-clip: text;
|
| 921 |
-
padding-bottom: 0.
|
| 922 |
-
border-bottom: 2px solid rgba(168, 85, 247, 0.
|
|
|
|
|
|
|
| 923 |
}
|
| 924 |
|
| 925 |
.message-content h2 {
|
| 926 |
-
font-size: 1.
|
| 927 |
-
color: var(--accent-
|
| 928 |
-
border-bottom: 1px solid rgba(168, 85, 247, 0.
|
| 929 |
-
padding-bottom: 0.
|
|
|
|
|
|
|
| 930 |
}
|
| 931 |
|
| 932 |
.message-content h3 {
|
| 933 |
-
font-size:
|
| 934 |
color: var(--info);
|
|
|
|
|
|
|
|
|
|
| 935 |
}
|
| 936 |
|
| 937 |
.message-content h4 {
|
| 938 |
-
font-size:
|
| 939 |
font-weight: 600;
|
| 940 |
-
color: var(--text-
|
| 941 |
-
margin:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 942 |
}
|
| 943 |
|
| 944 |
.message-content h1:first-child,
|
| 945 |
.message-content h2:first-child,
|
| 946 |
.message-content h3:first-child,
|
| 947 |
.message-content h4:first-child,
|
|
|
|
|
|
|
| 948 |
.message-content .msg-header:first-child {
|
| 949 |
margin-top: 0;
|
| 950 |
}
|
| 951 |
|
| 952 |
.message-content p,
|
| 953 |
.message-content .msg-para {
|
| 954 |
-
margin: 0.
|
| 955 |
line-height: 1.75;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 956 |
}
|
| 957 |
|
| 958 |
.message-content p:first-child,
|
|
@@ -962,7 +1025,7 @@ body {
|
|
| 962 |
|
| 963 |
/* ==================== Enhanced Lists ==================== */
|
| 964 |
.message-content .formatted-list {
|
| 965 |
-
margin:
|
| 966 |
padding-left: 0;
|
| 967 |
list-style: none;
|
| 968 |
}
|
|
@@ -973,17 +1036,20 @@ body {
|
|
| 973 |
|
| 974 |
.message-content .formatted-list li {
|
| 975 |
position: relative;
|
| 976 |
-
padding: 0.
|
| 977 |
-
margin: 0.
|
| 978 |
-
background: rgba(255, 255, 255, 0.
|
| 979 |
border-radius: var(--radius-md);
|
| 980 |
border-left: 3px solid transparent;
|
| 981 |
-
line-height: 1.
|
| 982 |
transition: all 0.2s ease;
|
|
|
|
|
|
|
| 983 |
}
|
| 984 |
|
| 985 |
.message-content .formatted-list li:hover {
|
| 986 |
-
background: rgba(255, 255, 255, 0.
|
|
|
|
| 987 |
}
|
| 988 |
|
| 989 |
.message-content .formatted-list li.numbered {
|
|
@@ -1032,14 +1098,16 @@ body {
|
|
| 1032 |
/* Legacy list support */
|
| 1033 |
.message-content ul,
|
| 1034 |
.message-content ol {
|
| 1035 |
-
margin:
|
| 1036 |
-
padding-left: 1.
|
| 1037 |
}
|
| 1038 |
|
| 1039 |
.message-content li {
|
| 1040 |
-
margin: 0.
|
| 1041 |
padding-left: 0.5rem;
|
| 1042 |
-
line-height: 1.
|
|
|
|
|
|
|
| 1043 |
}
|
| 1044 |
|
| 1045 |
.message-content ul li::marker {
|
|
@@ -1066,12 +1134,15 @@ body {
|
|
| 1066 |
overflow-x: auto;
|
| 1067 |
overflow-y: hidden;
|
| 1068 |
max-width: 100%;
|
|
|
|
| 1069 |
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.2);
|
| 1070 |
border: 1px solid rgba(255, 255, 255, 0.08);
|
|
|
|
| 1071 |
}
|
| 1072 |
|
| 1073 |
.message-content table {
|
| 1074 |
width: 100%;
|
|
|
|
| 1075 |
border-collapse: collapse;
|
| 1076 |
font-size: 0.8rem;
|
| 1077 |
background: rgba(0, 0, 0, 0.2);
|
|
@@ -1083,23 +1154,27 @@ body {
|
|
| 1083 |
}
|
| 1084 |
|
| 1085 |
.message-content th {
|
| 1086 |
-
padding: 0.
|
| 1087 |
-
font-weight:
|
| 1088 |
color: var(--text-primary);
|
| 1089 |
text-align: left;
|
| 1090 |
-
border-bottom: 2px solid rgba(168, 85, 247, 0.
|
| 1091 |
text-transform: uppercase;
|
| 1092 |
-
font-size: 0.
|
| 1093 |
-
letter-spacing: 0.
|
| 1094 |
white-space: nowrap;
|
| 1095 |
}
|
| 1096 |
|
| 1097 |
.message-content td {
|
| 1098 |
-
padding: 0.
|
| 1099 |
-
border-bottom: 1px solid rgba(255, 255, 255, 0.
|
| 1100 |
color: var(--text-secondary);
|
| 1101 |
word-break: break-word;
|
| 1102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1103 |
}
|
| 1104 |
|
| 1105 |
.message-content tbody tr {
|
|
@@ -1159,12 +1234,17 @@ body {
|
|
| 1159 |
.message-content b {
|
| 1160 |
font-weight: 700;
|
| 1161 |
color: var(--text-primary);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1162 |
}
|
| 1163 |
|
| 1164 |
.message-content em,
|
| 1165 |
.message-content i {
|
| 1166 |
font-style: italic;
|
| 1167 |
-
color: var(--
|
|
|
|
| 1168 |
}
|
| 1169 |
|
| 1170 |
/* ==================== Dividers ==================== */
|
|
@@ -1173,17 +1253,21 @@ body {
|
|
| 1173 |
height: 1px;
|
| 1174 |
background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.4), transparent);
|
| 1175 |
margin: 1.5rem 0;
|
|
|
|
| 1176 |
}
|
| 1177 |
|
| 1178 |
/* ==================== Blockquotes ==================== */
|
| 1179 |
.message-content blockquote {
|
| 1180 |
border-left: 4px solid var(--accent-primary);
|
| 1181 |
-
margin:
|
| 1182 |
-
padding: 0.
|
| 1183 |
-
background: linear-gradient(135deg, rgba(168, 85, 247, 0.
|
| 1184 |
border-radius: 0 var(--radius-md) var(--radius-md) 0;
|
| 1185 |
font-style: italic;
|
| 1186 |
color: var(--text-secondary);
|
|
|
|
|
|
|
|
|
|
| 1187 |
}
|
| 1188 |
|
| 1189 |
/* ==================== Typing Indicator ==================== */
|
|
|
|
| 671 |
overflow: hidden;
|
| 672 |
height: 100%;
|
| 673 |
/* Ensure it takes full height */
|
| 674 |
+
transition: all var(--transition-smooth);
|
| 675 |
}
|
| 676 |
|
| 677 |
/* ==================== Chat Bucket Filter ==================== */
|
|
|
|
| 808 |
gap: 1rem;
|
| 809 |
min-height: 0;
|
| 810 |
/* Critical: allows scrolling to work */
|
| 811 |
+
transition: padding var(--transition-smooth);
|
| 812 |
+
}
|
| 813 |
+
|
| 814 |
+
/* Adjust chat when both sidebars are open for better space utilization */
|
| 815 |
+
.main-content:has(.sidebar-left:not(.collapsed)):has(.sidebar-right:not(.collapsed)) .chat-messages {
|
| 816 |
+
padding: 0.875rem;
|
| 817 |
+
gap: 0.875rem;
|
| 818 |
+
}
|
| 819 |
+
|
| 820 |
+
.main-content:has(.sidebar-left:not(.collapsed)):has(.sidebar-right:not(.collapsed)) .message.assistant {
|
| 821 |
+
max-width: 85%;
|
| 822 |
+
}
|
| 823 |
+
|
| 824 |
+
.main-content:has(.sidebar-left:not(.collapsed)):has(.sidebar-right:not(.collapsed)) .message.user {
|
| 825 |
+
max-width: 75%;
|
| 826 |
}
|
| 827 |
|
| 828 |
/* Custom scrollbar for chat messages */
|
|
|
|
| 846 |
.message {
|
| 847 |
display: flex;
|
| 848 |
gap: 0.75rem;
|
| 849 |
+
max-width: 100%;
|
| 850 |
+
width: 100%;
|
| 851 |
animation: messageSlide 0.3s ease-out;
|
| 852 |
+
min-width: 0; /* Allow flex item to shrink below content size */
|
| 853 |
}
|
| 854 |
|
| 855 |
@keyframes messageSlide {
|
|
|
|
| 867 |
.message.user {
|
| 868 |
align-self: flex-end;
|
| 869 |
flex-direction: row-reverse;
|
| 870 |
+
max-width: 80%; /* User messages can be narrower */
|
| 871 |
+
}
|
| 872 |
+
|
| 873 |
+
.message.assistant {
|
| 874 |
+
align-self: flex-start;
|
| 875 |
+
max-width: 90%; /* Assistant messages use most of the width, leaving small margin */
|
| 876 |
}
|
| 877 |
|
| 878 |
.message-avatar {
|
|
|
|
| 892 |
padding: 1rem 1.25rem;
|
| 893 |
border-radius: var(--radius-lg);
|
| 894 |
font-size: 0.9rem;
|
| 895 |
+
line-height: 1.7;
|
| 896 |
+
max-width: 100%;
|
| 897 |
+
min-width: 0; /* Allow content to shrink */
|
| 898 |
+
word-wrap: break-word;
|
| 899 |
+
overflow-wrap: break-word;
|
| 900 |
+
word-break: break-word; /* Break long words if needed */
|
| 901 |
+
color: var(--text-secondary);
|
| 902 |
}
|
| 903 |
|
| 904 |
.message.user .message-content {
|
| 905 |
background: var(--accent-primary);
|
| 906 |
+
color: var(--bg-darkest) !important; /* Ensure dark text on white background */
|
| 907 |
border-bottom-right-radius: 4px;
|
| 908 |
+
padding: 1rem 1.25rem; /* Slightly less padding for user messages */
|
| 909 |
+
font-weight: 500; /* Slightly bolder for better readability */
|
| 910 |
+
line-height: 1.7;
|
| 911 |
}
|
| 912 |
|
| 913 |
.message.assistant .message-content {
|
| 914 |
background: linear-gradient(135deg, var(--bg-light) 0%, var(--bg-medium) 100%);
|
| 915 |
border: 1px solid var(--glass-border);
|
| 916 |
border-bottom-left-radius: 4px;
|
| 917 |
+
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.2);
|
| 918 |
}
|
| 919 |
|
| 920 |
.message-sources {
|
|
|
|
| 938 |
.message-content h2,
|
| 939 |
.message-content h3,
|
| 940 |
.message-content h4,
|
| 941 |
+
.message-content h5,
|
| 942 |
+
.message-content h6,
|
| 943 |
.message-content .msg-header {
|
| 944 |
+
font-weight: 700;
|
| 945 |
color: var(--text-primary);
|
| 946 |
+
margin: 2rem 0 1rem 0;
|
| 947 |
+
line-height: 1.3;
|
| 948 |
+
letter-spacing: -0.01em;
|
| 949 |
}
|
| 950 |
|
| 951 |
.message-content h1 {
|
| 952 |
+
font-size: 1.3rem;
|
| 953 |
+
background: linear-gradient(135deg, var(--accent-primary) 0%, #a855f7 100%);
|
| 954 |
-webkit-background-clip: text;
|
| 955 |
-webkit-text-fill-color: transparent;
|
| 956 |
background-clip: text;
|
| 957 |
+
padding-bottom: 0.6rem;
|
| 958 |
+
border-bottom: 2px solid rgba(168, 85, 247, 0.4);
|
| 959 |
+
margin-top: 0;
|
| 960 |
+
margin-bottom: 1.25rem;
|
| 961 |
}
|
| 962 |
|
| 963 |
.message-content h2 {
|
| 964 |
+
font-size: 1.15rem;
|
| 965 |
+
color: var(--accent-primary);
|
| 966 |
+
border-bottom: 1px solid rgba(168, 85, 247, 0.3);
|
| 967 |
+
padding-bottom: 0.45rem;
|
| 968 |
+
margin-top: 1.5rem;
|
| 969 |
+
margin-bottom: 0.875rem;
|
| 970 |
}
|
| 971 |
|
| 972 |
.message-content h3 {
|
| 973 |
+
font-size: 1.05rem;
|
| 974 |
color: var(--info);
|
| 975 |
+
margin-top: 1.5rem;
|
| 976 |
+
margin-bottom: 0.75rem;
|
| 977 |
+
font-weight: 600;
|
| 978 |
}
|
| 979 |
|
| 980 |
.message-content h4 {
|
| 981 |
+
font-size: 1rem;
|
| 982 |
font-weight: 600;
|
| 983 |
+
color: var(--text-primary);
|
| 984 |
+
margin: 1.25rem 0 0.625rem 0;
|
| 985 |
+
}
|
| 986 |
+
|
| 987 |
+
.message-content h5,
|
| 988 |
+
.message-content h6 {
|
| 989 |
+
font-size: 1rem;
|
| 990 |
+
font-weight: 600;
|
| 991 |
+
color: var(--text-primary);
|
| 992 |
+
margin: 1.25rem 0 0.625rem 0;
|
| 993 |
}
|
| 994 |
|
| 995 |
.message-content h1:first-child,
|
| 996 |
.message-content h2:first-child,
|
| 997 |
.message-content h3:first-child,
|
| 998 |
.message-content h4:first-child,
|
| 999 |
+
.message-content h5:first-child,
|
| 1000 |
+
.message-content h6:first-child,
|
| 1001 |
.message-content .msg-header:first-child {
|
| 1002 |
margin-top: 0;
|
| 1003 |
}
|
| 1004 |
|
| 1005 |
.message-content p,
|
| 1006 |
.message-content .msg-para {
|
| 1007 |
+
margin: 0.875rem 0;
|
| 1008 |
line-height: 1.75;
|
| 1009 |
+
color: var(--text-secondary);
|
| 1010 |
+
font-size: 0.9rem;
|
| 1011 |
+
}
|
| 1012 |
+
|
| 1013 |
+
.message-content p:first-child {
|
| 1014 |
+
margin-top: 0;
|
| 1015 |
+
}
|
| 1016 |
+
|
| 1017 |
+
.message-content p:last-child {
|
| 1018 |
+
margin-bottom: 0;
|
| 1019 |
}
|
| 1020 |
|
| 1021 |
.message-content p:first-child,
|
|
|
|
| 1025 |
|
| 1026 |
/* ==================== Enhanced Lists ==================== */
|
| 1027 |
.message-content .formatted-list {
|
| 1028 |
+
margin: 1.25rem 0;
|
| 1029 |
padding-left: 0;
|
| 1030 |
list-style: none;
|
| 1031 |
}
|
|
|
|
| 1036 |
|
| 1037 |
.message-content .formatted-list li {
|
| 1038 |
position: relative;
|
| 1039 |
+
padding: 0.625rem 0.875rem 0.625rem 2.25rem;
|
| 1040 |
+
margin: 0.4rem 0;
|
| 1041 |
+
background: rgba(255, 255, 255, 0.03);
|
| 1042 |
border-radius: var(--radius-md);
|
| 1043 |
border-left: 3px solid transparent;
|
| 1044 |
+
line-height: 1.7;
|
| 1045 |
transition: all 0.2s ease;
|
| 1046 |
+
font-size: 0.9rem;
|
| 1047 |
+
color: var(--text-secondary);
|
| 1048 |
}
|
| 1049 |
|
| 1050 |
.message-content .formatted-list li:hover {
|
| 1051 |
+
background: rgba(255, 255, 255, 0.06);
|
| 1052 |
+
transform: translateX(2px);
|
| 1053 |
}
|
| 1054 |
|
| 1055 |
.message-content .formatted-list li.numbered {
|
|
|
|
| 1098 |
/* Legacy list support */
|
| 1099 |
.message-content ul,
|
| 1100 |
.message-content ol {
|
| 1101 |
+
margin: 1.25rem 0;
|
| 1102 |
+
padding-left: 1.75rem;
|
| 1103 |
}
|
| 1104 |
|
| 1105 |
.message-content li {
|
| 1106 |
+
margin: 0.5rem 0;
|
| 1107 |
padding-left: 0.5rem;
|
| 1108 |
+
line-height: 1.7;
|
| 1109 |
+
font-size: 0.9rem;
|
| 1110 |
+
color: var(--text-secondary);
|
| 1111 |
}
|
| 1112 |
|
| 1113 |
.message-content ul li::marker {
|
|
|
|
| 1134 |
overflow-x: auto;
|
| 1135 |
overflow-y: hidden;
|
| 1136 |
max-width: 100%;
|
| 1137 |
+
width: 100%;
|
| 1138 |
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.2);
|
| 1139 |
border: 1px solid rgba(255, 255, 255, 0.08);
|
| 1140 |
+
-webkit-overflow-scrolling: touch; /* Smooth scrolling on iOS */
|
| 1141 |
}
|
| 1142 |
|
| 1143 |
.message-content table {
|
| 1144 |
width: 100%;
|
| 1145 |
+
min-width: 100%; /* Ensure table takes full width of wrapper */
|
| 1146 |
border-collapse: collapse;
|
| 1147 |
font-size: 0.8rem;
|
| 1148 |
background: rgba(0, 0, 0, 0.2);
|
|
|
|
| 1154 |
}
|
| 1155 |
|
| 1156 |
.message-content th {
|
| 1157 |
+
padding: 0.75rem 1rem;
|
| 1158 |
+
font-weight: 700;
|
| 1159 |
color: var(--text-primary);
|
| 1160 |
text-align: left;
|
| 1161 |
+
border-bottom: 2px solid rgba(168, 85, 247, 0.4);
|
| 1162 |
text-transform: uppercase;
|
| 1163 |
+
font-size: 0.75rem;
|
| 1164 |
+
letter-spacing: 0.5px;
|
| 1165 |
white-space: nowrap;
|
| 1166 |
}
|
| 1167 |
|
| 1168 |
.message-content td {
|
| 1169 |
+
padding: 0.75rem 1rem;
|
| 1170 |
+
border-bottom: 1px solid rgba(255, 255, 255, 0.08);
|
| 1171 |
color: var(--text-secondary);
|
| 1172 |
word-break: break-word;
|
| 1173 |
+
overflow-wrap: break-word;
|
| 1174 |
+
max-width: none; /* Remove max-width restriction */
|
| 1175 |
+
min-width: 100px; /* Minimum width for readability */
|
| 1176 |
+
line-height: 1.6;
|
| 1177 |
+
font-size: 0.9rem;
|
| 1178 |
}
|
| 1179 |
|
| 1180 |
.message-content tbody tr {
|
|
|
|
| 1234 |
.message-content b {
|
| 1235 |
font-weight: 700;
|
| 1236 |
color: var(--text-primary);
|
| 1237 |
+
background: linear-gradient(135deg, rgba(255, 255, 255, 0.1) 0%, rgba(168, 85, 247, 0.1) 100%);
|
| 1238 |
+
padding: 0.1rem 0.3rem;
|
| 1239 |
+
border-radius: 4px;
|
| 1240 |
+
font-weight: 600;
|
| 1241 |
}
|
| 1242 |
|
| 1243 |
.message-content em,
|
| 1244 |
.message-content i {
|
| 1245 |
font-style: italic;
|
| 1246 |
+
color: var(--accent-secondary);
|
| 1247 |
+
font-weight: 500;
|
| 1248 |
}
|
| 1249 |
|
| 1250 |
/* ==================== Dividers ==================== */
|
|
|
|
| 1253 |
height: 1px;
|
| 1254 |
background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.4), transparent);
|
| 1255 |
margin: 1.5rem 0;
|
| 1256 |
+
border-radius: 2px;
|
| 1257 |
}
|
| 1258 |
|
| 1259 |
/* ==================== Blockquotes ==================== */
|
| 1260 |
.message-content blockquote {
|
| 1261 |
border-left: 4px solid var(--accent-primary);
|
| 1262 |
+
margin: 1.25rem 0;
|
| 1263 |
+
padding: 0.875rem 1.25rem;
|
| 1264 |
+
background: linear-gradient(135deg, rgba(168, 85, 247, 0.12) 0%, rgba(96, 165, 250, 0.08) 100%);
|
| 1265 |
border-radius: 0 var(--radius-md) var(--radius-md) 0;
|
| 1266 |
font-style: italic;
|
| 1267 |
color: var(--text-secondary);
|
| 1268 |
+
font-size: 0.9rem;
|
| 1269 |
+
line-height: 1.75;
|
| 1270 |
+
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
|
| 1271 |
}
|
| 1272 |
|
| 1273 |
/* ==================== Typing Indicator ==================== */
|
static/js/app.js
CHANGED
|
@@ -1326,7 +1326,8 @@ function renderMessages() {
|
|
| 1326 |
|
| 1327 |
const html = state.messages.map((msg, i) => {
|
| 1328 |
const avatar = msg.role === 'user' ? (state.user?.username?.charAt(0).toUpperCase() || 'U') : '🧠';
|
| 1329 |
-
|
|
|
|
| 1330 |
}).join('');
|
| 1331 |
|
| 1332 |
// Build full content with summary panel and welcome screen
|
|
@@ -1354,10 +1355,24 @@ function renderMessages() {
|
|
| 1354 |
elements.summaryClose.addEventListener('click', hideSummary);
|
| 1355 |
}
|
| 1356 |
|
| 1357 |
-
function formatContent(content) {
|
| 1358 |
// Enhanced markdown parsing for beautiful formatting
|
| 1359 |
let html = content;
|
| 1360 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1361 |
// Escape HTML special characters first (except for already parsed markdown)
|
| 1362 |
// Skip this if content looks like it's already HTML
|
| 1363 |
if (!html.includes('<table') && !html.includes('<div')) {
|
|
|
|
| 1326 |
|
| 1327 |
const html = state.messages.map((msg, i) => {
|
| 1328 |
const avatar = msg.role === 'user' ? (state.user?.username?.charAt(0).toUpperCase() || 'U') : '🧠';
|
| 1329 |
+
const isUserMessage = msg.role === 'user';
|
| 1330 |
+
return `<div class="message ${msg.role}"><div class="message-avatar">${avatar}</div><div class="message-content">${formatContent(msg.content, isUserMessage)}</div></div>`;
|
| 1331 |
}).join('');
|
| 1332 |
|
| 1333 |
// Build full content with summary panel and welcome screen
|
|
|
|
| 1355 |
elements.summaryClose.addEventListener('click', hideSummary);
|
| 1356 |
}
|
| 1357 |
|
| 1358 |
+
function formatContent(content, isUserMessage = false) {
|
| 1359 |
// Enhanced markdown parsing for beautiful formatting
|
| 1360 |
let html = content;
|
| 1361 |
|
| 1362 |
+
// For user messages, escape HTML and preserve line breaks
|
| 1363 |
+
if (isUserMessage) {
|
| 1364 |
+
// Escape HTML to prevent XSS
|
| 1365 |
+
html = html
|
| 1366 |
+
.replace(/&/g, '&')
|
| 1367 |
+
.replace(/</g, '<')
|
| 1368 |
+
.replace(/>/g, '>')
|
| 1369 |
+
.replace(/"/g, '"')
|
| 1370 |
+
.replace(/'/g, ''');
|
| 1371 |
+
// Convert line breaks to <br>
|
| 1372 |
+
html = html.replace(/\n/g, '<br>');
|
| 1373 |
+
return html;
|
| 1374 |
+
}
|
| 1375 |
+
|
| 1376 |
// Escape HTML special characters first (except for already parsed markdown)
|
| 1377 |
// Skip this if content looks like it's already HTML
|
| 1378 |
if (!html.includes('<table') && !html.includes('<div')) {
|
table.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|