Ara Yeroyan committed on
Commit
de1d74a
·
1 Parent(s): 39edab4

Remove scripts and ignore local_* files

Browse files
.gitignore CHANGED
@@ -109,4 +109,7 @@ pytest_cache/
109
  tmp/
110
  temp/
111
  *.tmp
112
- *.temp
 
 
 
 
109
  tmp/
110
  temp/
111
  *.tmp
112
+ *.temp
113
+
114
+
115
+ local_*
upload_to_gemini_filestore.py DELETED
@@ -1,450 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Upload Documents to Google Gemini File Search Store
4
-
5
- This script uploads PDF documents to a Gemini File Search store for RAG.
6
- It processes documents from the reports directory and uploads them with metadata.
7
- """
8
-
9
- import os
10
- import sys
11
- import json
12
- import time
13
- from pathlib import Path
14
- from typing import List, Dict, Any, Optional
15
- from dotenv import load_dotenv
16
-
17
- try:
18
- from google import genai
19
- from google.genai import types
20
- GEMINI_AVAILABLE = True
21
- except ImportError:
22
- GEMINI_AVAILABLE = False
23
- print("❌ google-genai package not installed. Install with: pip install google-genai")
24
-
25
- # Load .env file
26
- load_dotenv()
27
-
28
-
29
def extract_metadata_from_path(file_path: Path) -> Dict[str, Any]:
    """Extract report metadata (year, source, district) from a file path.

    The reports tree encodes the audit year in directory/file names and the
    issuing entity in the filename, e.g.
    "reports/Annual Consolidated OAG audit reports 2018/... .pdf".

    Args:
        file_path: Path to a report PDF.

    Returns:
        Dict with "filename" and "filepath" always set, plus optional
        "year", "source" and "district" keys when they can be inferred.
    """
    # Years the corpus is known to cover; checked in this fixed order so the
    # earliest listed year wins when a path component mentions several.
    known_years = ('2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025')

    metadata: Dict[str, Any] = {
        "filename": file_path.name,
        "filepath": str(file_path),
    }

    # Single pass over the path components (previously each part was scanned
    # twice: once to test for any year, once to find which year).
    for part in file_path.parts:
        year = next((y for y in known_years if y in part), None)
        if year:
            metadata["year"] = year
            break

    # Map filename keywords to the issuing source (and district where known).
    filename_lower = file_path.stem.lower()
    if "consolidated" in filename_lower or "oag" in filename_lower:
        metadata["source"] = "Consolidated"
    elif "gulu" in filename_lower:
        metadata["source"] = "Gulu DLG"
        metadata["district"] = "Gulu"
    elif "kalangala" in filename_lower:
        metadata["source"] = "Kalangala DLG"
        metadata["district"] = "Kalangala"
    elif "kcca" in filename_lower:
        metadata["source"] = "KCCA"
        metadata["district"] = "Kampala"
    elif "maaif" in filename_lower:
        metadata["source"] = "MAAIF"
    elif "mwts" in filename_lower:
        metadata["source"] = "MWTS"

    return metadata
73
-
74
-
75
def get_or_create_filestore(client: genai.Client, store_name: Optional[str] = None) -> str:
    """Return the full resource name of a usable file search store.

    Prefers an existing store: if *store_name* is given it is matched against
    each store's full name, name suffix, display name, or bare ID; with no
    *store_name*, the most recently listed store is used. Only when nothing
    matches (or listing fails) is a new store created.
    """
    try:
        existing = list(client.file_search_stores.list())
        print(f" 🔍 Found {len(existing)} existing store(s)")

        if existing and store_name:
            for candidate in existing:
                candidate_id = candidate.name.split("/")[-1] if "/" in candidate.name else candidate.name
                matched = (
                    candidate.name == store_name
                    or candidate.name.endswith(store_name)
                    or candidate.display_name == store_name
                    or candidate_id == store_name
                )
                if matched:
                    print(f" ✅ Using existing store: {candidate.display_name} ({candidate.name})")
                    print(f" 💡 Store ID: {candidate_id} (use this in GEMINI_FILESTORE_NAME)")
                    return candidate.name

        if existing and not store_name:
            newest = existing[-1]
            newest_id = newest.name.split("/")[-1] if "/" in newest.name else newest.name
            print(f" ✅ Using most recent store: {newest.display_name} ({newest.name})")
            print(f" 💡 Store ID: {newest_id} (use this in GEMINI_FILESTORE_NAME)")
            return newest.name
    except Exception as e:
        print(f" ⚠️ Could not list stores: {e}")
        print(f" 📝 Will create new store...")

    # Nothing usable was found — create a fresh store.
    display_name = store_name or "Audit Reports"
    print(f" 📝 Creating new file search store: '{display_name}'...")

    try:
        new_store = client.file_search_stores.create(
            config={'display_name': display_name}
        )
        new_id = new_store.name.split("/")[-1] if "/" in new_store.name else new_store.name
        print(f" ✅ Created store: {new_store.display_name} ({new_store.name})")
        print(f" 💡 Store ID: {new_id} (use this in GEMINI_FILESTORE_NAME)")
        return new_store.name
    except Exception as e:
        print(f" ❌ Failed to create store: {e}")
        raise
125
-
126
-
127
def format_metadata_for_gemini(metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Format a metadata dict into the Gemini customMetadata list format.

    The Gemini API expects entries shaped like ``{'key': ..., 'numeric_value': ...}``
    or ``{'key': ..., 'string_value': ...}``. ``year`` is emitted numerically
    when it parses as an int (string fallback otherwise); ``source``,
    ``district`` and ``filename`` are emitted as strings, in that order.
    Missing or falsy fields are skipped.

    Args:
        metadata: Dict as produced by extract_metadata_from_path().

    Returns:
        List of Gemini customMetadata entry dicts (possibly empty).
    """
    custom_metadata: List[Dict[str, Any]] = []

    year = metadata.get('year')
    if year:
        try:
            custom_metadata.append({'key': 'year', 'numeric_value': int(year)})
        except (ValueError, TypeError):
            # Fallback to string if not numeric (e.g. "FY18/19").
            custom_metadata.append({'key': 'year', 'string_value': str(year)})

    # Remaining fields are plain strings; a single loop replaces the
    # previous copy-pasted per-field blocks and keeps the output order stable.
    for key in ('source', 'district', 'filename'):
        value = metadata.get(key)
        if value:
            custom_metadata.append({'key': key, 'string_value': str(value)})

    return custom_metadata
173
-
174
-
175
def check_file_exists(client: genai.Client, store_name: str, filename: str) -> bool:
    """Best-effort duplicate check for *filename* in the given store.

    The File Search API exposes no direct per-file listing here, so this
    currently always reports False; duplicates are instead detected from
    error messages at upload time.
    """
    try:
        # Probe the store; existence of the individual file cannot actually
        # be determined yet.
        client.file_search_stores.get(name=store_name)
    except Exception:
        pass
    return False
184
-
185
-
186
def upload_file_to_store(
    client: genai.Client,
    file_path: Path,
    store_name: str,
    metadata: Dict[str, Any],
    skip_existing: bool = True
) -> Optional[bool]:
    """Upload a single file to the file search store with metadata.

    Args:
        client: Connected Gemini API client.
        file_path: PDF to upload.
        store_name: Full resource name of the target file search store.
        metadata: Dict from extract_metadata_from_path(); shown to the
            operator and used for the display name.
        skip_existing: Kept for interface compatibility; duplicates are
            detected via API error messages rather than a pre-check.

    Returns:
        True on success, False on failure, None when the file was skipped
        (already exists in the store or the upload was interrupted).
    """

    def _looks_skippable(text: str, include_400: bool = False) -> bool:
        # The API reports duplicate/interrupted uploads with "terminated" or
        # "already" (and, at the initial upload call, sometimes a bare 400)
        # somewhere in the message. Centralized here instead of being
        # repeated at every error-handling site.
        lowered = text.lower()
        if 'terminated' in lowered or 'already' in lowered:
            return True
        return include_400 and '400' in lowered

    try:
        print(f" 📤 Uploading: {file_path.name}...")

        custom_metadata = format_metadata_for_gemini(metadata)

        # Echo the metadata we would attach, for operator visibility.
        if custom_metadata:
            metadata_parts = []
            for m in custom_metadata:
                if 'numeric_value' in m:
                    metadata_parts.append(f"{m['key']}={m['numeric_value']}")
                elif 'string_value' in m:
                    metadata_parts.append(f"{m['key']}={m['string_value']}")
            if metadata_parts:
                print(f" 📋 Metadata: {', '.join(metadata_parts)}")

        # NOTE: the File Search API does not accept customMetadata in
        # upload_to_file_search_store, so the display_name carries the
        # identifying info instead. Metadata would need a separate API call
        # after upload, if supported.
        upload_params = {
            'file': str(file_path),
            'file_search_store_name': store_name,
            'config': {
                'display_name': metadata.get('filename', file_path.name),
            },
        }

        try:
            operation = client.file_search_stores.upload_to_file_search_store(**upload_params)
        except Exception as upload_error:
            if _looks_skippable(str(upload_error), include_400=True):
                print(f" ⚠️ Upload error: File may already exist or upload was interrupted")
                print(f" 💡 Error details: {upload_error}")
                print(f" 💡 Skipping this file")
                return None
            raise

        # Poll until the server-side import finishes, with a bounded wait.
        max_wait = 300  # 5 minutes max per file
        start_time = time.time()

        while not operation.done:
            if time.time() - start_time > max_wait:
                print(f" ⚠️ Timeout waiting for upload to complete")
                return False

            time.sleep(2)
            try:
                operation = client.operations.get(operation)
            except Exception as op_error:
                if _looks_skippable(str(op_error)):
                    print(f" ⚠️ File may already exist or upload was interrupted")
                    print(f" 💡 Skipping this file")
                    return None
                raise

        # Distinguish "skipped because duplicate" (None) from "failed" (False).
        if hasattr(operation, 'error') and operation.error:
            if _looks_skippable(str(operation.error)):
                print(f" ⚠️ File may already exist in the store")
                print(f" 💡 Skipping this file")
                return None
            print(f" ❌ Upload failed: {operation.error}")
            return False

        print(f" ✅ Uploaded successfully")
        return True

    except Exception as e:
        if _looks_skippable(str(e), include_400=True):
            print(f" ⚠️ Upload error: File may already exist or upload was interrupted")
            print(f" 💡 Error details: {e}")
            print(f" 💡 Skipping this file")
            return None
        print(f" ❌ Error uploading {file_path.name}: {e}")
        import traceback
        traceback.print_exc()
        return False
296
-
297
-
298
def find_report_files(reports_dir: Path) -> List[Path]:
    """Recursively collect every PDF under *reports_dir*, sorted by path.

    Returns an empty list (after printing a warning) when the directory
    does not exist.
    """
    if not reports_dir.exists():
        print(f"❌ Reports directory not found: {reports_dir}")
        return []
    return sorted(reports_dir.rglob("*.pdf"))
311
-
312
-
313
def main():
    """Entry point: upload all report PDFs to a Gemini File Search store.

    Reads GEMINI_API_KEY (required), GEMINI_FILESTORE_NAME and REPORTS_DIR
    (optional) from the environment. Returns 0 when no upload failed,
    1 otherwise.
    """
    print("=" * 60)
    print("Gemini File Search Store Upload Tool")
    print("=" * 60)

    if not GEMINI_AVAILABLE:
        print("\n❌ Please install google-genai package:")
        print(" pip install google-genai")
        return 1

    # API key is mandatory; everything else has a fallback.
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        print("\n❌ GEMINI_API_KEY not found in environment variables")
        print(" Please add GEMINI_API_KEY to your .env file")
        return 1

    store_name = os.getenv("GEMINI_FILESTORE_NAME")

    # Resolve the reports directory: explicit env var first, then a few
    # conventional locations, then the historical hard-coded default.
    reports_dir_str = os.getenv("REPORTS_DIR")
    if not reports_dir_str:
        candidates = [
            "/Users/ayeroyan/workspace/chatbot-rag/reports",
            Path(__file__).parent / "reports",
            Path.cwd() / "reports",
        ]
        reports_dir_str = next(
            (str(c) for c in candidates if Path(c).exists()), None
        )
    if not reports_dir_str:
        reports_dir_str = "/Users/ayeroyan/workspace/chatbot-rag/reports"  # Default fallback

    reports_dir = Path(reports_dir_str)

    print(f"\n🔌 Connecting to Gemini API...")
    try:
        client = genai.Client(api_key=api_key)
        print(f" ✅ Connected")
    except Exception as e:
        print(f" ❌ Failed to connect: {e}")
        return 1

    print(f"\n📦 Setting up file search store...")
    try:
        store_name_full = get_or_create_filestore(client, store_name)
        # The full resource name looks like "fileSearchStores/<id>"; surface
        # just the <id> part so it can be pinned via GEMINI_FILESTORE_NAME.
        if store_name_full.startswith("fileSearchStores/"):
            store_id = store_name_full.split("/", 1)[1]
            print(f" 💡 Store ID (for GEMINI_FILESTORE_NAME env var): {store_id}")
        else:
            store_id = store_name_full
        store_name = store_name_full  # uploads need the full resource name
    except Exception as e:
        print(f" ❌ Failed to setup store: {e}")
        return 1

    print(f"\n🔍 Scanning for PDF files in: {reports_dir}")
    pdf_files = find_report_files(reports_dir)

    if not pdf_files:
        print(f" ❌ No PDF files found in {reports_dir}")
        return 1

    print(f" ✅ Found {len(pdf_files)} PDF files")

    print(f"\n📤 Uploading files to store...")
    print(f" Store: {store_name}")
    print(f" Files: {len(pdf_files)}")

    uploaded = failed = skipped = 0

    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] Processing: {pdf_file.name}")

        metadata = extract_metadata_from_path(pdf_file)

        # Show what was inferred from the path before uploading.
        metadata_info = [
            f"{label}: {metadata[key]}"
            for key, label in (("year", "Year"), ("source", "Source"), ("district", "District"))
            if metadata.get(key)
        ]
        if metadata_info:
            print(f" 📊 Extracted metadata: {', '.join(metadata_info)}")

        result = upload_file_to_store(client, pdf_file, store_name, metadata, skip_existing=True)

        if result is True:
            uploaded += 1
        elif result is None:  # Skipped (already exists)
            skipped += 1
        else:  # Failed
            failed += 1

        # Pause briefly between uploads to stay under rate limits.
        if i < len(pdf_files):
            time.sleep(1)

    print(f"\n" + "=" * 60)
    print(f"Upload Summary")
    print(f"=" * 60)
    print(f" ✅ Uploaded: {uploaded}")
    if skipped > 0:
        print(f" ⏭️ Skipped (already exists): {skipped}")
    print(f" ❌ Failed: {failed}")
    print(f" 📦 Store: {store_name}")

    if uploaded > 0:
        print(f"\n✅ Successfully uploaded {uploaded} files to Gemini File Search store!")
        print(f" You can now use this store in the beta version of the chatbot.")

    return 0 if failed == 0 else 1
446
-
447
-
448
if __name__ == "__main__":
    # SystemExit(main()) is exactly what sys.exit(main()) raises.
    raise SystemExit(main())
450
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
verify_qdrant_migration.py DELETED
@@ -1,438 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Qdrant Migration Verification Script
4
-
5
- This script compares the source and destination Qdrant collections to verify
6
- that the migration was successful. It:
7
- 1. Compares collection configurations
8
- 2. Fetches sample points from source
9
- 3. Retrieves same points from destination using IDs
10
- 4. Compares vectors, metadata, and all attributes
11
- """
12
-
13
- import os
14
- import sys
15
- from typing import List, Dict, Any, Optional
16
- from pathlib import Path
17
- from qdrant_client import QdrantClient
18
- import json
19
-
20
- # Try to import config loader and dotenv for automatic source detection
21
- try:
22
- from src.config.loader import load_config
23
- CONFIG_AVAILABLE = True
24
- except ImportError:
25
- CONFIG_AVAILABLE = False
26
-
27
- try:
28
- from dotenv import load_dotenv
29
- DOTENV_AVAILABLE = True
30
- except ImportError:
31
- DOTENV_AVAILABLE = False
32
-
33
- # Load .env file automatically if available
34
- if DOTENV_AVAILABLE:
35
- project_root = Path(__file__).parent
36
- env_file = project_root / ".env"
37
- if env_file.exists():
38
- load_dotenv(env_file, override=True)
39
- else:
40
- load_dotenv(override=True)
41
-
42
-
43
def get_collection_info(client: QdrantClient, collection_name: str) -> Dict[str, Any]:
    """Return vector size, distance metric and point counts for a collection.

    Tolerates several qdrant-client response shapes (nested config.params
    vs flat attributes); falls back to measuring one stored vector when the
    size is not reported. Returns None on any error.
    """
    try:
        info = client.get_collection(collection_name)

        size = metric = None
        if hasattr(info, 'config'):
            cfg = info.config
            if hasattr(cfg, 'params') and hasattr(cfg.params, 'vectors'):
                vec_cfg = cfg.params.vectors
                if isinstance(vec_cfg, dict):
                    size = vec_cfg.get('size')
                    metric = vec_cfg.get('distance')
                else:
                    size = getattr(vec_cfg, 'size', None)
                    metric = getattr(vec_cfg, 'distance', None)
            else:
                size = getattr(cfg, 'vector_size', None)
                metric = getattr(cfg, 'distance', None)
        else:
            size = getattr(info, 'vector_size', None)
            metric = getattr(info, 'distance', None)

        n_points = getattr(info, 'points_count', 0)
        n_indexed = getattr(info, 'indexed_vectors_count', 0)

        if size is None:
            # Last resort: peek at a single stored vector and measure it.
            try:
                sample, _ = client.scroll(collection_name=collection_name, limit=1, with_vectors=True)
                if sample and hasattr(sample[0], 'vector') and sample[0].vector:
                    size = len(sample[0].vector)
            except Exception:
                pass

        return {
            "vector_size": size,
            "distance": metric or "Cosine",
            "points_count": n_points,
            "indexed_vectors_count": n_indexed,
        }
    except Exception as e:
        print(f"❌ Error getting collection info: {e}")
        return None
- return None
86
-
87
-
88
def fetch_points_by_ids(client: QdrantClient, collection_name: str, point_ids: List) -> Dict:
    """Retrieve the given point IDs (with payloads and vectors), keyed by ID.

    Returns an empty dict (after printing the error) if retrieval fails.
    """
    try:
        retrieved = client.retrieve(
            collection_name=collection_name,
            ids=point_ids,
            with_payload=True,
            with_vectors=True,
        )
        return {p.id: p for p in retrieved}
    except Exception as e:
        print(f"❌ Error fetching points by IDs: {e}")
        return {}
101
-
102
-
103
def compare_points(source_point, dest_point, point_id) -> Dict[str, Any]:
    """Compare two Qdrant points field by field.

    Checks ID, vector (absolute tolerance 1e-6) and payload keys/values, and
    returns a dict with the point id, lists of matching and differing
    attributes, and their counts.
    """
    import numpy as np

    matches = []
    differences = []

    # IDs
    if source_point.id == dest_point.id:
        matches.append("ID")
    else:
        differences.append(f"ID: source={source_point.id}, dest={dest_point.id}")

    # Vectors: handle missing on either side, length mismatch, then values.
    src_vec = getattr(source_point, 'vector', None)
    dst_vec = getattr(dest_point, 'vector', None)

    if src_vec is None and dst_vec is None:
        matches.append("Vector (both None)")
    elif src_vec is None or dst_vec is None:
        src_desc = 'None' if src_vec is None else f'len={len(src_vec)}'
        dst_desc = 'None' if dst_vec is None else f'len={len(dst_vec)}'
        differences.append(f"Vector: source={src_desc}, dest={dst_desc}")
    elif len(src_vec) != len(dst_vec):
        differences.append(f"Vector length: source={len(src_vec)}, dest={len(dst_vec)}")
    else:
        try:
            # Elementwise compare with a tolerance for floating point noise.
            max_diff = float(np.max(np.abs(np.array(src_vec) - np.array(dst_vec))))
            if max_diff < 1e-6:
                matches.append(f"Vector (max diff: {max_diff:.2e})")
            else:
                differences.append(f"Vector values differ (max diff: {max_diff:.2e})")
        except Exception as e:
            differences.append(f"Vector comparison error: {e}")

    # Payloads: normalize to plain dicts first.
    src_payload = getattr(source_point, 'payload', {}) or {}
    dst_payload = getattr(dest_point, 'payload', {}) or {}
    if hasattr(src_payload, '__dict__'):
        src_payload = src_payload.__dict__
    if hasattr(dst_payload, '__dict__'):
        dst_payload = dst_payload.__dict__

    src_keys = set(src_payload.keys())
    dst_keys = set(dst_payload.keys())

    missing_in_dest = src_keys - dst_keys
    extra_in_dest = dst_keys - src_keys
    if missing_in_dest:
        differences.append(f"Payload keys missing in dest: {missing_in_dest}")
    if extra_in_dest:
        differences.append(f"Payload keys extra in dest: {extra_in_dest}")

    # Values present on both sides.
    for key in src_keys & dst_keys:
        src_val = src_payload[key]
        dst_val = dst_payload[key]
        if src_val == dst_val:
            matches.append(f"Payload.{key}")
        elif isinstance(src_val, dict) and isinstance(dst_val, dict):
            differences.append(f"Payload.{key}: dicts differ")
        elif isinstance(src_val, list) and isinstance(dst_val, list):
            differences.append(f"Payload.{key}: lists differ (len: {len(src_val)} vs {len(dst_val)})")
        else:
            differences.append(f"Payload.{key}: '{src_val}' != '{dst_val}'")

    return {
        "point_id": point_id,
        "matches": matches,
        "differences": differences,
        "match_count": len(matches),
        "diff_count": len(differences)
    }
184
-
185
-
186
def main():
    """Verify a Qdrant migration by comparing source and destination.

    Compares collection configuration, then samples points from the source
    and checks each one (ID, vector, payload) against the destination.
    Returns 0 only when every sampled point matches perfectly.
    """
    print("="*70)
    print("Qdrant Migration Verification Script")
    print("="*70)

    # --- Source credentials: environment first, optional config fallback ---
    source_url = os.getenv('QDRANT_URL')
    source_key = os.getenv('QDRANT_API_KEY')
    source_collection = os.getenv('QDRANT_COLLECTION', 'docling')

    if CONFIG_AVAILABLE:
        try:
            config = load_config()
            qdrant_config = config.get('qdrant', {})
            source_url = source_url or qdrant_config.get('url')
            source_key = source_key or qdrant_config.get('api_key')
            if not source_collection:
                source_collection = qdrant_config.get('collection_name', 'docling')
        except Exception as e:
            print(f"⚠️ Could not load config: {e}")

    # --- Destination credentials come from the environment only ---
    dest_url = os.getenv('DEST_QDRANT_URL')
    dest_key = os.getenv('DEST_QDRANT_API_KEY')
    dest_collection = os.getenv('DEST_COLLECTION')  # optional; auto-detected below

    if not source_url or not source_key:
        print("❌ Source Qdrant credentials missing!")
        print(" Set QDRANT_URL and QDRANT_API_KEY in .env or environment")
        return 1

    if not dest_url or not dest_key:
        print("❌ Destination Qdrant credentials missing!")
        print(" Set DEST_QDRANT_URL and DEST_QDRANT_API_KEY in .env or environment")
        return 1

    print(f"\n📋 Configuration:")
    print(f" Source: {source_url}")
    print(f" Source Collection: {source_collection}")
    print(f" Destination: {dest_url}")
    if dest_collection:
        print(f" Destination Collection: {dest_collection} (specified)")
    else:
        print(f" Destination Collection: (auto-detect)")

    # --- Connect to both instances ---
    print(f"\n🔌 Connecting to Qdrant instances...")
    try:
        source_client = QdrantClient(url=source_url, api_key=source_key, timeout=120)
        print(f" ✅ Connected to source")
    except Exception as e:
        print(f" ❌ Failed to connect to source: {e}")
        return 1

    try:
        dest_client = QdrantClient(url=dest_url, api_key=dest_key, timeout=120)
        print(f" ✅ Connected to destination")
    except Exception as e:
        print(f" ❌ Failed to connect to destination: {e}")
        return 1

    # --- Auto-detect the destination collection when not specified ---
    if not dest_collection:
        try:
            names = [c.name for c in dest_client.get_collections().collections]
            if len(names) == 1:
                dest_collection = names[0]
                print(f"\n📋 Auto-detected destination collection: '{dest_collection}'")
            elif len(names) > 1:
                print(f"\n⚠️ Found {len(names)} collections in destination:")
                for name in names:
                    print(f" - {name}")
                print(f"\n Using first collection: '{names[0]}'")
                dest_collection = names[0]
            else:
                print("❌ No collections found in destination!")
                return 1
        except Exception as e:
            print(f"❌ Could not list destination collections: {e}")
            return 1

    # --- Collection-level comparison ---
    print(f"\n📊 Collection Information Comparison")
    print("="*70)

    source_info = get_collection_info(source_client, source_collection)
    dest_info = get_collection_info(dest_client, dest_collection)

    if not source_info:
        print("❌ Could not get source collection info")
        return 1

    if not dest_info:
        print("❌ Could not get destination collection info")
        return 1

    print(f"\nSource Collection ('{source_collection}'):")
    print(f" Vector size: {source_info['vector_size']}")
    print(f" Distance: {source_info['distance']}")
    print(f" Points: {source_info['points_count']:,}")
    print(f" Indexed: {source_info['indexed_vectors_count']:,}")

    print(f"\nDestination Collection ('{dest_collection}'):")
    print(f" Vector size: {dest_info['vector_size']}")
    print(f" Distance: {dest_info['distance']}")
    print(f" Points: {dest_info['points_count']:,}")
    print(f" Indexed: {dest_info['indexed_vectors_count']:,}")

    print(f"\n🔍 Configuration Comparison:")
    config_matches = []
    config_diffs = []

    if source_info['vector_size'] == dest_info['vector_size']:
        config_matches.append(f"Vector size: {source_info['vector_size']}")
    else:
        config_diffs.append(f"Vector size: source={source_info['vector_size']}, dest={dest_info['vector_size']}")

    # Distance may be an enum or a string depending on client version.
    if str(source_info['distance']) == str(dest_info['distance']):
        config_matches.append(f"Distance: {source_info['distance']}")
    else:
        config_diffs.append(f"Distance: source={source_info['distance']}, dest={dest_info['distance']}")

    if source_info['points_count'] == dest_info['points_count']:
        config_matches.append(f"Points count: {source_info['points_count']:,}")
    else:
        config_diffs.append(f"Points count: source={source_info['points_count']:,}, dest={dest_info['points_count']:,}")

    if config_matches:
        print(f" ✅ Matches: {len(config_matches)}")
        for match in config_matches:
            print(f" - {match}")

    if config_diffs:
        print(f" ❌ Differences: {len(config_diffs)}")
        for diff in config_diffs:
            print(f" - {diff}")

    # --- Sample points from the source ---
    print(f"\n📥 Fetching sample points from source...")
    sample_size = 2000  # number of source points to spot-check

    try:
        source_points_result, _ = source_client.scroll(
            collection_name=source_collection,
            limit=sample_size,
            with_payload=True,
            with_vectors=True
        )

        if not source_points_result:
            print("❌ No points found in source collection!")
            return 1

        print(f" ✅ Fetched {len(source_points_result)} points from source")

        source_point_ids = [point.id for point in source_points_result]
        print(f" Point IDs: {source_point_ids[:5]}{'...' if len(source_point_ids) > 5 else ''}")

    except Exception as e:
        print(f"❌ Error fetching source points: {e}")
        import traceback
        traceback.print_exc()
        return 1

    # --- Retrieve the same IDs from the destination ---
    print(f"\n📥 Fetching same points from destination by ID...")
    try:
        dest_points_dict = fetch_points_by_ids(dest_client, dest_collection, source_point_ids)
        print(f" ✅ Fetched {len(dest_points_dict)} points from destination")

        missing_ids = set(source_point_ids) - set(dest_points_dict.keys())
        if missing_ids:
            print(f" ⚠️ Missing {len(missing_ids)} points in destination: {list(missing_ids)[:5]}{'...' if len(missing_ids) > 5 else ''}")

    except Exception as e:
        print(f"❌ Error fetching destination points: {e}")
        import traceback
        traceback.print_exc()
        return 1

    # --- Point-by-point comparison ---
    print(f"\n🔍 Point-by-Point Comparison")
    print("="*70)

    comparison_results = []
    for source_point in source_points_result:
        pid = source_point.id
        counterpart = dest_points_dict.get(pid)

        if counterpart is None:
            comparison_results.append({
                "point_id": pid,
                "status": "MISSING",
                "matches": [],
                "differences": [f"Point not found in destination"]
            })
        else:
            outcome = compare_points(source_point, counterpart, pid)
            outcome["status"] = "MATCH" if outcome["diff_count"] == 0 else "DIFF"
            comparison_results.append(outcome)

    matches = [r for r in comparison_results if r["status"] == "MATCH"]
    diffs = [r for r in comparison_results if r["status"] == "DIFF"]
    missing = [r for r in comparison_results if r["status"] == "MISSING"]

    print(f"\n📊 Comparison Summary:")
    print(f" Total points compared: {len(comparison_results)}")
    print(f" ✅ Perfect matches: {len(matches)}")
    print(f" ⚠️ Differences found: {len(diffs)}")
    print(f" ❌ Missing in destination: {len(missing)}")

    # Show a bounded amount of detail for problem points.
    if diffs:
        print(f"\n⚠️ Points with differences:")
        for diff_result in diffs[:10]:
            print(f"\n Point ID: {diff_result['point_id']}")
            if diff_result['matches']:
                print(f" ✅ Matches ({len(diff_result['matches'])}): {', '.join(diff_result['matches'][:5])}")
            if diff_result['differences']:
                print(f" ❌ Differences ({len(diff_result['differences'])}):")
                for d in diff_result['differences'][:5]:
                    print(f" - {d}")

    if missing:
        print(f"\n❌ Missing points in destination:")
        for missing_result in missing[:10]:
            print(f" - Point ID: {missing_result['point_id']}")

    # --- Final verdict ---
    print(f"\n" + "="*70)
    if not missing and not diffs:
        print("✅ VERIFICATION PASSED: All points match perfectly!")
        return 0
    if not missing:
        print(f"⚠️ VERIFICATION PARTIAL: All points present but {len(diffs)} have differences")
        return 1
    print(f"❌ VERIFICATION FAILED: {len(missing)} points missing, {len(diffs)} have differences")
    return 1
- return 1
432
-
433
-
434
if __name__ == "__main__":
    # SystemExit(main()) is exactly what sys.exit(main()) raises.
    raise SystemExit(main())
436
-
437
-
438
-