Ara Yeroyan committed · de1d74a · Parent(s): 39edab4

Remove scripts and ignore local_* files

Files changed:
- .gitignore (+4 -1)
- upload_to_gemini_filestore.py (+0 -450)
- verify_qdrant_migration.py (+0 -438)
.gitignore CHANGED

@@ -109,4 +109,7 @@ pytest_cache/
 tmp/
 temp/
 *.tmp
-*.temp
+*.temp
+
+
+local_*
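The new local_* entry ignores any file or directory whose basename begins with local_. A minimal sketch of the matching semantics, using Python's fnmatch as a stand-in for git's glob rules; the filenames below are hypothetical:

# Sketch only: for a simple pattern like "local_*" (no slash), git matches
# the basename at any depth, much like fnmatch. Filenames are made up.
from fnmatch import fnmatch

for name in ["local_notes.md", "local_cache/", "locale.py", "reports/local_draft.pdf"]:
    basename = name.rstrip("/").split("/")[-1]
    print(name, "->", "ignored" if fnmatch(basename, "local_*") else "kept")

Note that locale.py is kept: the underscore in the pattern is literal, so only names with the exact local_ prefix are ignored.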
upload_to_gemini_filestore.py DELETED

@@ -1,450 +0,0 @@
#!/usr/bin/env python3
"""
Upload Documents to Google Gemini File Search Store

This script uploads PDF documents to a Gemini File Search store for RAG.
It processes documents from the reports directory and uploads them with metadata.
"""

import os
import sys
import json
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
from dotenv import load_dotenv

try:
    from google import genai
    from google.genai import types
    GEMINI_AVAILABLE = True
except ImportError:
    GEMINI_AVAILABLE = False
    print("❌ google-genai package not installed. Install with: pip install google-genai")

# Load .env file
load_dotenv()


def extract_metadata_from_path(file_path: Path) -> Dict[str, Any]:
    """Extract metadata from file path structure."""
    # Example: /path/to/reports/Annual Consolidated OAG audit reports 2018/Annual Consolidated OAG audit reports 2018.pdf
    parts = file_path.parts
    filename = file_path.stem  # Without extension

    metadata = {
        "filename": file_path.name,
        "filepath": str(file_path),
    }

    # Extract year
    year_match = None
    for part in parts:
        if any(year in part for year in ['2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025']):
            for year in ['2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025']:
                if year in part:
                    year_match = year
                    break
        if year_match:
            break

    if year_match:
        metadata["year"] = year_match

    # Extract source/district
    filename_lower = filename.lower()
    if "consolidated" in filename_lower or "oag" in filename_lower:
        metadata["source"] = "Consolidated"
    elif "gulu" in filename_lower:
        metadata["source"] = "Gulu DLG"
        metadata["district"] = "Gulu"
    elif "kalangala" in filename_lower:
        metadata["source"] = "Kalangala DLG"
        metadata["district"] = "Kalangala"
    elif "kcca" in filename_lower:
        metadata["source"] = "KCCA"
        metadata["district"] = "Kampala"
    elif "maaif" in filename_lower:
        metadata["source"] = "MAAIF"
    elif "mwts" in filename_lower:
        metadata["source"] = "MWTS"

    return metadata


def get_or_create_filestore(client: genai.Client, store_name: Optional[str] = None) -> str:
    """Get existing file search store or create a new one."""
    # First, try to list all existing stores
    try:
        stores = list(client.file_search_stores.list())
        print(f"   🔍 Found {len(stores)} existing store(s)")

        if stores:
            # If store_name is provided, try to match it
            if store_name:
                for store in stores:
                    # Check both name (full path like "fileSearchStores/xxx") and display_name
                    store_name_match = store.name == store_name or store.name.endswith(store_name)
                    display_name_match = store.display_name == store_name

                    # Also check if store_name is just the ID part
                    store_id = store.name.split("/")[-1] if "/" in store.name else store.name
                    id_match = store_id == store_name

                    if store_name_match or display_name_match or id_match:
                        print(f"   ✅ Using existing store: {store.display_name} ({store.name})")
                        print(f"   💡 Store ID: {store_id} (use this in GEMINI_FILESTORE_NAME)")
                        return store.name

            # If no store_name provided, use the most recent store
            if not store_name and stores:
                latest_store = stores[-1]  # Most recent
                store_id = latest_store.name.split("/")[-1] if "/" in latest_store.name else latest_store.name
                print(f"   ✅ Using most recent store: {latest_store.display_name} ({latest_store.name})")
                print(f"   💡 Store ID: {store_id} (use this in GEMINI_FILESTORE_NAME)")
                return latest_store.name
    except Exception as e:
        print(f"   ⚠️ Could not list stores: {e}")
        print(f"   📝 Will create new store...")

    # Create new store only if no existing store found
    display_name = store_name or "Audit Reports"
    print(f"   📝 Creating new file search store: '{display_name}'...")

    try:
        file_search_store = client.file_search_stores.create(
            config={'display_name': display_name}
        )
        store_id = file_search_store.name.split("/")[-1] if "/" in file_search_store.name else file_search_store.name
        print(f"   ✅ Created store: {file_search_store.display_name} ({file_search_store.name})")
        print(f"   💡 Store ID: {store_id} (use this in GEMINI_FILESTORE_NAME)")
        return file_search_store.name
    except Exception as e:
        print(f"   ❌ Failed to create store: {e}")
        raise


def format_metadata_for_gemini(metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Format metadata dictionary for Gemini API customMetadata format.

    Based on Gemini API, customMetadata should use:
    - string_value for string fields
    - numeric_value for numeric fields
    """
    custom_metadata = []

    # Add year if available (as numeric_value)
    if metadata.get('year'):
        try:
            year_int = int(metadata['year'])
            custom_metadata.append({
                'key': 'year',
                'numeric_value': year_int
            })
        except (ValueError, TypeError):
            # Fallback to string if not numeric
            custom_metadata.append({
                'key': 'year',
                'string_value': str(metadata['year'])
            })

    # Add source if available (as string_value)
    if metadata.get('source'):
        custom_metadata.append({
            'key': 'source',
            'string_value': str(metadata['source'])
        })

    # Add district if available (as string_value)
    if metadata.get('district'):
        custom_metadata.append({
            'key': 'district',
            'string_value': str(metadata['district'])
        })

    # Add filename for reference (as string_value)
    if metadata.get('filename'):
        custom_metadata.append({
            'key': 'filename',
            'string_value': str(metadata['filename'])
        })

    return custom_metadata


def check_file_exists(client: genai.Client, store_name: str, filename: str) -> bool:
    """Check if a file with the same name already exists in the store."""
    try:
        # List files in the store
        store = client.file_search_stores.get(name=store_name)
        # Note: The API might not have a direct list method, so we'll catch errors
        return False  # Assume not exists for now
    except Exception:
        return False  # If we can't check, assume it doesn't exist


def upload_file_to_store(
    client: genai.Client,
    file_path: Path,
    store_name: str,
    metadata: Dict[str, Any],
    skip_existing: bool = True
) -> Optional[bool]:
    """Upload a single file to the file search store with metadata."""
    try:
        print(f"   📤 Uploading: {file_path.name}...")

        # Format metadata for Gemini API
        custom_metadata = format_metadata_for_gemini(metadata)

        # Display metadata being uploaded
        if custom_metadata:
            metadata_parts = []
            for m in custom_metadata:
                if 'numeric_value' in m:
                    metadata_parts.append(f"{m['key']}={m['numeric_value']}")
                elif 'string_value' in m:
                    metadata_parts.append(f"{m['key']}={m['string_value']}")
            if metadata_parts:
                print(f"   📋 Metadata: {', '.join(metadata_parts)}")

        # Check if file already exists (if skip_existing is True)
        if skip_existing:
            # Note: We'll handle duplicates via error messages
            pass

        # Upload and import file with metadata
        # Note: Gemini API may not support customMetadata in upload_to_file_search_store
        # We'll try with metadata first, then fallback without it if it fails
        upload_params = {
            'file': str(file_path),
            'file_search_store_name': store_name,
        }

        # Build config
        config = {
            'display_name': metadata.get('filename', file_path.name),
        }

        # Upload file (metadata not supported in upload config per API)
        # Note: Gemini File Search API doesn't support customMetadata in upload_to_file_search_store
        # Metadata would need to be added via a separate API call after upload, if supported
        # For now, we upload without metadata - the filename in display_name contains the info
        upload_params['config'] = config

        try:
            operation = client.file_search_stores.upload_to_file_search_store(**upload_params)
        except Exception as upload_error:
            error_str = str(upload_error).lower()
            error_msg = str(upload_error)

            # Check if it's a "terminated" or "already exists" error
            if 'terminated' in error_str or 'already' in error_str or '400' in error_str:
                print(f"   ⚠️ Upload error: File may already exist or upload was interrupted")
                print(f"   💡 Error details: {error_msg}")
                print(f"   💡 Skipping this file")
                return None  # Return None to indicate "skipped"
            # Re-raise if it's a different error
            raise

        # Wait for import to complete
        max_wait = 300  # 5 minutes max per file
        start_time = time.time()

        while not operation.done:
            if time.time() - start_time > max_wait:
                print(f"   ⚠️ Timeout waiting for upload to complete")
                return False

            time.sleep(2)
            try:
                operation = client.operations.get(operation)
            except Exception as op_error:
                # Check if it's a "terminated" error (file might already exist)
                error_str = str(op_error).lower()
                if 'terminated' in error_str or 'already' in error_str:
                    print(f"   ⚠️ File may already exist or upload was interrupted")
                    print(f"   💡 Skipping this file")
                    return None  # Return None to indicate "skipped"
                raise

        # Check for errors in the operation result
        if hasattr(operation, 'error') and operation.error:
            error_msg = str(operation.error)
            if 'terminated' in error_msg.lower() or 'already' in error_msg.lower():
                print(f"   ⚠️ File may already exist in the store")
                print(f"   💡 Skipping this file")
                return None  # Return None to indicate "skipped" vs False for "failed"
            print(f"   ❌ Upload failed: {operation.error}")
            return False

        print(f"   ✅ Uploaded successfully")
        return True

    except Exception as e:
        error_str = str(e).lower()
        # Handle specific error cases
        if 'terminated' in error_str or 'already' in error_str or '400' in error_str:
            print(f"   ⚠️ Upload error: File may already exist or upload was interrupted")
            print(f"   💡 Error details: {e}")
            print(f"   💡 Skipping this file")
            return None  # Return None to indicate "skipped"
        print(f"   ❌ Error uploading {file_path.name}: {e}")
        import traceback
        traceback.print_exc()
        return False


def find_report_files(reports_dir: Path) -> List[Path]:
    """Find all PDF report files in the reports directory."""
    pdf_files = []

    if not reports_dir.exists():
        print(f"❌ Reports directory not found: {reports_dir}")
        return pdf_files

    # Find all PDF files
    for pdf_file in reports_dir.rglob("*.pdf"):
        pdf_files.append(pdf_file)

    return sorted(pdf_files)


def main():
    """Main function to upload documents to Gemini File Search store."""
    print("=" * 60)
    print("Gemini File Search Store Upload Tool")
    print("=" * 60)

    if not GEMINI_AVAILABLE:
        print("\n❌ Please install google-genai package:")
        print("   pip install google-genai")
        return 1

    # Get API key
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        print("\n❌ GEMINI_API_KEY not found in environment variables")
        print("   Please add GEMINI_API_KEY to your .env file")
        return 1

    # Get store name (optional)
    store_name = os.getenv("GEMINI_FILESTORE_NAME")

    # Get reports directory - try multiple possible locations
    reports_dir_str = os.getenv("REPORTS_DIR")
    if not reports_dir_str:
        # Try common locations
        possible_paths = [
            "/Users/ayeroyan/workspace/chatbot-rag/reports",
            Path(__file__).parent / "reports",
            Path.cwd() / "reports",
        ]
        for path in possible_paths:
            if Path(path).exists():
                reports_dir_str = str(path)
                break

    if not reports_dir_str:
        reports_dir_str = "/Users/ayeroyan/workspace/chatbot-rag/reports"  # Default fallback

    reports_dir = Path(reports_dir_str)

    # Initialize Gemini client
    print(f"\n🔌 Connecting to Gemini API...")
    try:
        client = genai.Client(api_key=api_key)
        print(f"   ✅ Connected")
    except Exception as e:
        print(f"   ❌ Failed to connect: {e}")
        return 1

    # Get or create file search store
    print(f"\n📦 Setting up file search store...")
    try:
        store_name_full = get_or_create_filestore(client, store_name)
        # Store the full name for upload, but also save just the ID for .env reference
        # The full name is like "fileSearchStores/audit-reports-xxx"
        # For API calls, we need just the ID part (after fileSearchStores/)
        if store_name_full.startswith("fileSearchStores/"):
            store_id = store_name_full.split("/", 1)[1]
            print(f"   💡 Store ID (for GEMINI_FILESTORE_NAME env var): {store_id}")
        else:
            store_id = store_name_full
        # Use full name for upload operations
        store_name = store_name_full
    except Exception as e:
        print(f"   ❌ Failed to setup store: {e}")
        return 1

    # Find all PDF files
    print(f"\n🔍 Scanning for PDF files in: {reports_dir}")
    pdf_files = find_report_files(reports_dir)

    if not pdf_files:
        print(f"   ❌ No PDF files found in {reports_dir}")
        return 1

    print(f"   ✅ Found {len(pdf_files)} PDF files")

    # Upload files
    print(f"\n📤 Uploading files to store...")
    print(f"   Store: {store_name}")
    print(f"   Files: {len(pdf_files)}")

    uploaded = 0
    failed = 0
    skipped = 0

    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] Processing: {pdf_file.name}")

        # Extract metadata
        metadata = extract_metadata_from_path(pdf_file)

        # Display extracted metadata
        metadata_info = []
        if metadata.get('year'):
            metadata_info.append(f"Year: {metadata['year']}")
        if metadata.get('source'):
            metadata_info.append(f"Source: {metadata['source']}")
        if metadata.get('district'):
            metadata_info.append(f"District: {metadata['district']}")

        if metadata_info:
            print(f"   📊 Extracted metadata: {', '.join(metadata_info)}")

        # Upload file with metadata
        result = upload_file_to_store(client, pdf_file, store_name, metadata, skip_existing=True)

        if result is True:
            uploaded += 1
        elif result is None:  # Skipped (already exists)
            skipped += 1
        else:  # Failed
            failed += 1

        # Small delay between uploads to avoid rate limits
        if i < len(pdf_files):
            time.sleep(1)

    # Summary
    print(f"\n" + "=" * 60)
    print(f"Upload Summary")
    print(f"=" * 60)
    print(f"   ✅ Uploaded: {uploaded}")
    if skipped > 0:
        print(f"   ⏭️ Skipped (already exists): {skipped}")
    print(f"   ❌ Failed: {failed}")
    print(f"   📦 Store: {store_name}")

    if uploaded > 0:
        print(f"\n✅ Successfully uploaded {uploaded} files to Gemini File Search store!")
        print(f"   You can now use this store in the beta version of the chatbot.")

    return 0 if failed == 0 else 1


if __name__ == "__main__":
    sys.exit(main())
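Distilled from the deleted script above, the core upload-and-poll flow against the google-genai File Search API reduces to the sketch below; the API key, store display name, and PDF path are placeholder values:

# Minimal sketch of the flow upload_to_gemini_filestore.py implemented;
# "YOUR_KEY", the display name, and report.pdf are placeholders.
import time
from google import genai

client = genai.Client(api_key="YOUR_KEY")
store = client.file_search_stores.create(config={"display_name": "Audit Reports"})
operation = client.file_search_stores.upload_to_file_search_store(
    file="report.pdf",
    file_search_store_name=store.name,
    config={"display_name": "report.pdf"},
)
while not operation.done:  # poll until the server-side import finishes
    time.sleep(2)
    operation = client.operations.get(operation)
print("error" if getattr(operation, "error", None) else "uploaded")

The full script wrapped this flow in error handling because a "terminated" or "already exists" error from either the upload call or the polling loop is treated as a skip rather than a failure.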
verify_qdrant_migration.py DELETED

@@ -1,438 +0,0 @@
#!/usr/bin/env python3
"""
Qdrant Migration Verification Script

This script compares the source and destination Qdrant collections to verify
that the migration was successful. It:
1. Compares collection configurations
2. Fetches sample points from source
3. Retrieves same points from destination using IDs
4. Compares vectors, metadata, and all attributes
"""

import os
import sys
from typing import List, Dict, Any, Optional
from pathlib import Path
from qdrant_client import QdrantClient
import json

# Try to import config loader and dotenv for automatic source detection
try:
    from src.config.loader import load_config
    CONFIG_AVAILABLE = True
except ImportError:
    CONFIG_AVAILABLE = False

try:
    from dotenv import load_dotenv
    DOTENV_AVAILABLE = True
except ImportError:
    DOTENV_AVAILABLE = False

# Load .env file automatically if available
if DOTENV_AVAILABLE:
    project_root = Path(__file__).parent
    env_file = project_root / ".env"
    if env_file.exists():
        load_dotenv(env_file, override=True)
    else:
        load_dotenv(override=True)


def get_collection_info(client: QdrantClient, collection_name: str) -> Optional[Dict[str, Any]]:
    """Get collection information including vector size and point count."""
    try:
        collection_info = client.get_collection(collection_name)

        # Handle different Qdrant client versions and response formats
        if hasattr(collection_info, 'config'):
            config = collection_info.config
            if hasattr(config, 'params') and hasattr(config.params, 'vectors'):
                vectors_config = config.params.vectors
                if isinstance(vectors_config, dict):
                    vector_size = vectors_config.get('size')
                    distance = vectors_config.get('distance')
                else:
                    vector_size = getattr(vectors_config, 'size', None)
                    distance = getattr(vectors_config, 'distance', None)
            else:
                vector_size = getattr(config, 'vector_size', None)
                distance = getattr(config, 'distance', None)
        else:
            vector_size = getattr(collection_info, 'vector_size', None)
            distance = getattr(collection_info, 'distance', None)

        points_count = getattr(collection_info, 'points_count', 0)
        indexed_vectors_count = getattr(collection_info, 'indexed_vectors_count', 0)

        if vector_size is None:
            try:
                result, _ = client.scroll(collection_name=collection_name, limit=1, with_vectors=True)
                if result and hasattr(result[0], 'vector') and result[0].vector:
                    vector_size = len(result[0].vector)
            except Exception:
                pass

        return {
            "vector_size": vector_size,
            "distance": distance or "Cosine",
            "points_count": points_count,
            "indexed_vectors_count": indexed_vectors_count,
        }
    except Exception as e:
        print(f"❌ Error getting collection info: {e}")
        return None


def fetch_points_by_ids(client: QdrantClient, collection_name: str, point_ids: List) -> Dict:
    """Fetch points by their IDs from a collection."""
    try:
        points = client.retrieve(
            collection_name=collection_name,
            ids=point_ids,
            with_payload=True,
            with_vectors=True
        )
        return {point.id: point for point in points}
    except Exception as e:
        print(f"❌ Error fetching points by IDs: {e}")
        return {}


def compare_points(source_point, dest_point, point_id) -> Dict[str, Any]:
    """Compare two points and return differences."""
    differences = []
    matches = []

    # Compare IDs
    if source_point.id == dest_point.id:
        matches.append("ID")
    else:
        differences.append(f"ID: source={source_point.id}, dest={dest_point.id}")

    # Compare vectors
    source_vec = getattr(source_point, 'vector', None)
    dest_vec = getattr(dest_point, 'vector', None)

    if source_vec is None and dest_vec is None:
        matches.append("Vector (both None)")
    elif source_vec is None or dest_vec is None:
        differences.append(f"Vector: source={'None' if source_vec is None else f'len={len(source_vec)}'}, dest={'None' if dest_vec is None else f'len={len(dest_vec)}'}")
    elif len(source_vec) != len(dest_vec):
        differences.append(f"Vector length: source={len(source_vec)}, dest={len(dest_vec)}")
    else:
        # Compare vector values (with tolerance for floating point)
        import numpy as np
        try:
            vec_diff = np.abs(np.array(source_vec) - np.array(dest_vec))
            max_diff = float(np.max(vec_diff))
            if max_diff < 1e-6:
                matches.append(f"Vector (max diff: {max_diff:.2e})")
            else:
                differences.append(f"Vector values differ (max diff: {max_diff:.2e})")
        except Exception as e:
            differences.append(f"Vector comparison error: {e}")

    # Compare payloads
    source_payload = getattr(source_point, 'payload', {}) or {}
    dest_payload = getattr(dest_point, 'payload', {}) or {}

    # Convert to dicts if needed
    if hasattr(source_payload, '__dict__'):
        source_payload = source_payload.__dict__
    if hasattr(dest_payload, '__dict__'):
        dest_payload = dest_payload.__dict__

    source_keys = set(source_payload.keys())
    dest_keys = set(dest_payload.keys())

    if source_keys != dest_keys:
        missing_in_dest = source_keys - dest_keys
        extra_in_dest = dest_keys - source_keys
        if missing_in_dest:
            differences.append(f"Payload keys missing in dest: {missing_in_dest}")
        if extra_in_dest:
            differences.append(f"Payload keys extra in dest: {extra_in_dest}")

    # Compare payload values
    common_keys = source_keys & dest_keys
    for key in common_keys:
        source_val = source_payload[key]
        dest_val = dest_payload[key]

        if source_val == dest_val:
            matches.append(f"Payload.{key}")
        else:
            # Handle nested structures
            if isinstance(source_val, dict) and isinstance(dest_val, dict):
                if source_val != dest_val:
                    differences.append(f"Payload.{key}: dicts differ")
            elif isinstance(source_val, list) and isinstance(dest_val, list):
                if source_val != dest_val:
                    differences.append(f"Payload.{key}: lists differ (len: {len(source_val)} vs {len(dest_val)})")
            else:
                differences.append(f"Payload.{key}: '{source_val}' != '{dest_val}'")

    return {
        "point_id": point_id,
        "matches": matches,
        "differences": differences,
        "match_count": len(matches),
        "diff_count": len(differences)
    }


def main():
    print("="*70)
    print("Qdrant Migration Verification Script")
    print("="*70)

    # Auto-detect source from config and .env file
    source_url = os.getenv('QDRANT_URL')
    source_key = os.getenv('QDRANT_API_KEY')
    source_collection = os.getenv('QDRANT_COLLECTION', 'docling')

    if CONFIG_AVAILABLE:
        try:
            config = load_config()
            qdrant_config = config.get('qdrant', {})
            if not source_url:
                source_url = qdrant_config.get('url')
            if not source_key:
                source_key = qdrant_config.get('api_key')
            if not source_collection:
                source_collection = qdrant_config.get('collection_name', 'docling')
        except Exception as e:
            print(f"⚠️ Could not load config: {e}")

    # Get destination from env
    dest_url = os.getenv('DEST_QDRANT_URL')
    dest_key = os.getenv('DEST_QDRANT_API_KEY')
    dest_collection = os.getenv('DEST_COLLECTION')  # Optional, will auto-detect

    # Validate
    if not source_url or not source_key:
        print("❌ Source Qdrant credentials missing!")
        print("   Set QDRANT_URL and QDRANT_API_KEY in .env or environment")
        return 1

    if not dest_url or not dest_key:
        print("❌ Destination Qdrant credentials missing!")
        print("   Set DEST_QDRANT_URL and DEST_QDRANT_API_KEY in .env or environment")
        return 1

    print(f"\n📋 Configuration:")
    print(f"   Source: {source_url}")
    print(f"   Source Collection: {source_collection}")
    print(f"   Destination: {dest_url}")
    if dest_collection:
        print(f"   Destination Collection: {dest_collection} (specified)")
    else:
        print(f"   Destination Collection: (auto-detect)")

    # Connect to Qdrant instances
    print(f"\n🔌 Connecting to Qdrant instances...")
    try:
        source_client = QdrantClient(url=source_url, api_key=source_key, timeout=120)
        print(f"   ✅ Connected to source")
    except Exception as e:
        print(f"   ❌ Failed to connect to source: {e}")
        return 1

    try:
        dest_client = QdrantClient(url=dest_url, api_key=dest_key, timeout=120)
        print(f"   ✅ Connected to destination")
    except Exception as e:
        print(f"   ❌ Failed to connect to destination: {e}")
        return 1

    # Auto-detect destination collection if not specified
    if not dest_collection:
        try:
            collections = dest_client.get_collections().collections
            collection_names = [c.name for c in collections]
            if len(collection_names) == 1:
                dest_collection = collection_names[0]
                print(f"\n📋 Auto-detected destination collection: '{dest_collection}'")
            elif len(collection_names) > 1:
                print(f"\n⚠️ Found {len(collection_names)} collections in destination:")
                for name in collection_names:
                    print(f"   - {name}")
                print(f"\n   Using first collection: '{collection_names[0]}'")
                dest_collection = collection_names[0]
            else:
                print("❌ No collections found in destination!")
                return 1
        except Exception as e:
            print(f"❌ Could not list destination collections: {e}")
            return 1

    # Get collection info
    print(f"\n📊 Collection Information Comparison")
    print("="*70)

    source_info = get_collection_info(source_client, source_collection)
    dest_info = get_collection_info(dest_client, dest_collection)

    if not source_info:
        print("❌ Could not get source collection info")
        return 1

    if not dest_info:
        print("❌ Could not get destination collection info")
        return 1

    print(f"\nSource Collection ('{source_collection}'):")
    print(f"   Vector size: {source_info['vector_size']}")
    print(f"   Distance: {source_info['distance']}")
    print(f"   Points: {source_info['points_count']:,}")
    print(f"   Indexed: {source_info['indexed_vectors_count']:,}")

    print(f"\nDestination Collection ('{dest_collection}'):")
    print(f"   Vector size: {dest_info['vector_size']}")
    print(f"   Distance: {dest_info['distance']}")
    print(f"   Points: {dest_info['points_count']:,}")
    print(f"   Indexed: {dest_info['indexed_vectors_count']:,}")

    # Compare configs
    print(f"\n🔍 Configuration Comparison:")
    config_matches = []
    config_diffs = []

    if source_info['vector_size'] == dest_info['vector_size']:
        config_matches.append(f"Vector size: {source_info['vector_size']}")
    else:
        config_diffs.append(f"Vector size: source={source_info['vector_size']}, dest={dest_info['vector_size']}")

    if str(source_info['distance']) == str(dest_info['distance']):
        config_matches.append(f"Distance: {source_info['distance']}")
    else:
        config_diffs.append(f"Distance: source={source_info['distance']}, dest={dest_info['distance']}")

    if source_info['points_count'] == dest_info['points_count']:
        config_matches.append(f"Points count: {source_info['points_count']:,}")
    else:
        config_diffs.append(f"Points count: source={source_info['points_count']:,}, dest={dest_info['points_count']:,}")

    if config_matches:
        print(f"   ✅ Matches: {len(config_matches)}")
        for match in config_matches:
            print(f"      - {match}")

    if config_diffs:
        print(f"   ❌ Differences: {len(config_diffs)}")
        for diff in config_diffs:
            print(f"      - {diff}")

    # Fetch sample points from source
    print(f"\n📥 Fetching sample points from source...")
    sample_size = 2000  # Fetch up to 2000 sample points

    try:
        source_points_result, _ = source_client.scroll(
            collection_name=source_collection,
            limit=sample_size,
            with_payload=True,
            with_vectors=True
        )

        if not source_points_result:
            print("❌ No points found in source collection!")
            return 1

        print(f"   ✅ Fetched {len(source_points_result)} points from source")

        # Extract point IDs
        source_point_ids = [point.id for point in source_points_result]
        print(f"   Point IDs: {source_point_ids[:5]}{'...' if len(source_point_ids) > 5 else ''}")

    except Exception as e:
        print(f"❌ Error fetching source points: {e}")
        import traceback
        traceback.print_exc()
        return 1

    # Fetch same points from destination
    print(f"\n📥 Fetching same points from destination by ID...")
    try:
        dest_points_dict = fetch_points_by_ids(dest_client, dest_collection, source_point_ids)
        print(f"   ✅ Fetched {len(dest_points_dict)} points from destination")

        missing_ids = set(source_point_ids) - set(dest_points_dict.keys())
        if missing_ids:
            print(f"   ⚠️ Missing {len(missing_ids)} points in destination: {list(missing_ids)[:5]}{'...' if len(missing_ids) > 5 else ''}")

    except Exception as e:
        print(f"❌ Error fetching destination points: {e}")
        import traceback
        traceback.print_exc()
        return 1

    # Compare points
    print(f"\n🔍 Point-by-Point Comparison")
    print("="*70)

    comparison_results = []
    for source_point in source_points_result:
        point_id = source_point.id
        dest_point = dest_points_dict.get(point_id)

        if dest_point is None:
            comparison_results.append({
                "point_id": point_id,
                "status": "MISSING",
                "matches": [],
                "differences": ["Point not found in destination"]
            })
        else:
            comparison = compare_points(source_point, dest_point, point_id)
            comparison["status"] = "MATCH" if comparison["diff_count"] == 0 else "DIFF"
            comparison_results.append(comparison)

    # Summary
    matches = [r for r in comparison_results if r["status"] == "MATCH"]
    diffs = [r for r in comparison_results if r["status"] == "DIFF"]
    missing = [r for r in comparison_results if r["status"] == "MISSING"]

    print(f"\n📊 Comparison Summary:")
    print(f"   Total points compared: {len(comparison_results)}")
    print(f"   ✅ Perfect matches: {len(matches)}")
    print(f"   ⚠️ Differences found: {len(diffs)}")
    print(f"   ❌ Missing in destination: {len(missing)}")

    # Show details for points with differences
    if diffs:
        print(f"\n⚠️ Points with differences:")
        for diff_result in diffs[:10]:  # Show first 10
            print(f"\n   Point ID: {diff_result['point_id']}")
            if diff_result['matches']:
                print(f"   ✅ Matches ({len(diff_result['matches'])}): {', '.join(diff_result['matches'][:5])}")
            if diff_result['differences']:
                print(f"   ❌ Differences ({len(diff_result['differences'])}):")
                for d in diff_result['differences'][:5]:
                    print(f"      - {d}")

    if missing:
        print(f"\n❌ Missing points in destination:")
        for missing_result in missing[:10]:
            print(f"   - Point ID: {missing_result['point_id']}")

    # Final verdict
    print(f"\n" + "="*70)
    if len(missing) == 0 and len(diffs) == 0:
        print("✅ VERIFICATION PASSED: All points match perfectly!")
        return 0
    elif len(missing) == 0:
        print(f"⚠️ VERIFICATION PARTIAL: All points present but {len(diffs)} have differences")
        return 1
    else:
        print(f"❌ VERIFICATION FAILED: {len(missing)} points missing, {len(diffs)} have differences")
        return 1


if __name__ == "__main__":
    sys.exit(main())
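The vector check in compare_points boils down to an element-wise absolute difference against a 1e-6 tolerance, so vectors that drifted slightly during serialization still count as matches. The same test in isolation, with made-up example vectors:

# Standalone version of the tolerance check used in compare_points;
# the two vectors here are made-up examples.
import numpy as np

source_vec = [0.10, 0.20, 0.30]
dest_vec = [0.10, 0.20, 0.30000001]  # tiny float drift, e.g. from round-tripping

max_diff = float(np.max(np.abs(np.array(source_vec) - np.array(dest_vec))))
print("match" if max_diff < 1e-6 else "differ", f"(max diff: {max_diff:.2e})")

Here the drift is about 1e-8, well under the threshold, so the points would be reported as a match.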