Ara Yeroyan committed on
Commit
de1d74a
·
1 Parent(s): 39edab4

Remove scripts and ignore local_* files

Browse files
.gitignore CHANGED
@@ -109,4 +109,7 @@ pytest_cache/
109
  tmp/
110
  temp/
111
  *.tmp
112
- *.temp
 
 
 
 
109
  tmp/
110
  temp/
111
  *.tmp
112
+ *.temp
113
+
114
+
115
+ local_*
upload_to_gemini_filestore.py DELETED
@@ -1,450 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Upload Documents to Google Gemini File Search Store
4
-
5
- This script uploads PDF documents to a Gemini File Search store for RAG.
6
- It processes documents from the reports directory and uploads them with metadata.
7
- """
8
-
9
- import os
10
- import sys
11
- import json
12
- import time
13
- from pathlib import Path
14
- from typing import List, Dict, Any, Optional
15
- from dotenv import load_dotenv
16
-
17
- try:
18
- from google import genai
19
- from google.genai import types
20
- GEMINI_AVAILABLE = True
21
- except ImportError:
22
- GEMINI_AVAILABLE = False
23
- print("❌ google-genai package not installed. Install with: pip install google-genai")
24
-
25
- # Load .env file
26
- load_dotenv()
27
-
28
-
29
def extract_metadata_from_path(file_path: Path) -> Dict[str, Any]:
    """Extract report metadata (year, source, district) from a file path.

    The reports tree encodes the audit year in directory/file names and the
    issuing entity in the filename, e.g.
    "reports/Annual Consolidated OAG audit reports 2018/... .pdf".

    Args:
        file_path: Path to a report PDF.

    Returns:
        Dict with "filename" and "filepath" always set, plus optional
        "year", "source" and "district" keys when they can be inferred.
    """
    # Years the corpus is known to cover; checked in this fixed order so the
    # earliest listed year wins when a path component mentions several.
    known_years = ('2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025')

    metadata: Dict[str, Any] = {
        "filename": file_path.name,
        "filepath": str(file_path),
    }

    # Single pass over the path components (previously each part was scanned
    # twice: once to test for any year, once to find which year).
    for part in file_path.parts:
        year = next((y for y in known_years if y in part), None)
        if year:
            metadata["year"] = year
            break

    # Map filename keywords to the issuing source (and district where known).
    filename_lower = file_path.stem.lower()
    if "consolidated" in filename_lower or "oag" in filename_lower:
        metadata["source"] = "Consolidated"
    elif "gulu" in filename_lower:
        metadata["source"] = "Gulu DLG"
        metadata["district"] = "Gulu"
    elif "kalangala" in filename_lower:
        metadata["source"] = "Kalangala DLG"
        metadata["district"] = "Kalangala"
    elif "kcca" in filename_lower:
        metadata["source"] = "KCCA"
        metadata["district"] = "Kampala"
    elif "maaif" in filename_lower:
        metadata["source"] = "MAAIF"
    elif "mwts" in filename_lower:
        metadata["source"] = "MWTS"

    return metadata
73
-
74
-
75
def get_or_create_filestore(client: genai.Client, store_name: Optional[str] = None) -> str:
    """Return the full resource name of a usable file search store.

    Prefers an existing store: if *store_name* is given it is matched against
    each store's full name, name suffix, display name, or bare ID; with no
    *store_name*, the most recently listed store is used. Only when nothing
    matches (or listing fails) is a new store created.
    """
    try:
        existing = list(client.file_search_stores.list())
        print(f" 🔍 Found {len(existing)} existing store(s)")

        if existing and store_name:
            for candidate in existing:
                candidate_id = candidate.name.split("/")[-1] if "/" in candidate.name else candidate.name
                matched = (
                    candidate.name == store_name
                    or candidate.name.endswith(store_name)
                    or candidate.display_name == store_name
                    or candidate_id == store_name
                )
                if matched:
                    print(f" ✅ Using existing store: {candidate.display_name} ({candidate.name})")
                    print(f" 💡 Store ID: {candidate_id} (use this in GEMINI_FILESTORE_NAME)")
                    return candidate.name

        if existing and not store_name:
            newest = existing[-1]
            newest_id = newest.name.split("/")[-1] if "/" in newest.name else newest.name
            print(f" ✅ Using most recent store: {newest.display_name} ({newest.name})")
            print(f" 💡 Store ID: {newest_id} (use this in GEMINI_FILESTORE_NAME)")
            return newest.name
    except Exception as e:
        print(f" ⚠️ Could not list stores: {e}")
        print(f" 📝 Will create new store...")

    # Nothing usable was found — create a fresh store.
    display_name = store_name or "Audit Reports"
    print(f" 📝 Creating new file search store: '{display_name}'...")

    try:
        new_store = client.file_search_stores.create(
            config={'display_name': display_name}
        )
        new_id = new_store.name.split("/")[-1] if "/" in new_store.name else new_store.name
        print(f" ✅ Created store: {new_store.display_name} ({new_store.name})")
        print(f" 💡 Store ID: {new_id} (use this in GEMINI_FILESTORE_NAME)")
        return new_store.name
    except Exception as e:
        print(f" ❌ Failed to create store: {e}")
        raise
125
-
126
-
127
def format_metadata_for_gemini(metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Format a metadata dict into the Gemini customMetadata list format.

    The Gemini API expects entries shaped like ``{'key': ..., 'numeric_value': ...}``
    or ``{'key': ..., 'string_value': ...}``. ``year`` is emitted numerically
    when it parses as an int (string fallback otherwise); ``source``,
    ``district`` and ``filename`` are emitted as strings, in that order.
    Missing or falsy fields are skipped.

    Args:
        metadata: Dict as produced by extract_metadata_from_path().

    Returns:
        List of Gemini customMetadata entry dicts (possibly empty).
    """
    custom_metadata: List[Dict[str, Any]] = []

    year = metadata.get('year')
    if year:
        try:
            custom_metadata.append({'key': 'year', 'numeric_value': int(year)})
        except (ValueError, TypeError):
            # Fallback to string if not numeric (e.g. "FY18/19").
            custom_metadata.append({'key': 'year', 'string_value': str(year)})

    # Remaining fields are plain strings; a single loop replaces the
    # previous copy-pasted per-field blocks and keeps the output order stable.
    for key in ('source', 'district', 'filename'):
        value = metadata.get(key)
        if value:
            custom_metadata.append({'key': key, 'string_value': str(value)})

    return custom_metadata
173
-
174
-
175
def check_file_exists(client: genai.Client, store_name: str, filename: str) -> bool:
    """Best-effort duplicate check for *filename* in the given store.

    The File Search API exposes no direct per-file listing here, so this
    currently always reports False; duplicates are instead detected from
    error messages at upload time.
    """
    try:
        # Probe the store; existence of the individual file cannot actually
        # be determined yet.
        client.file_search_stores.get(name=store_name)
    except Exception:
        pass
    return False
184
-
185
-
186
def upload_file_to_store(
    client: genai.Client,
    file_path: Path,
    store_name: str,
    metadata: Dict[str, Any],
    skip_existing: bool = True
) -> Optional[bool]:
    """Upload a single file to the file search store with metadata.

    Args:
        client: Connected Gemini API client.
        file_path: PDF to upload.
        store_name: Full resource name of the target file search store.
        metadata: Dict from extract_metadata_from_path(); shown to the
            operator and used for the display name.
        skip_existing: Kept for interface compatibility; duplicates are
            detected via API error messages rather than a pre-check.

    Returns:
        True on success, False on failure, None when the file was skipped
        (already exists in the store or the upload was interrupted).
    """

    def _looks_skippable(text: str, include_400: bool = False) -> bool:
        # The API reports duplicate/interrupted uploads with "terminated" or
        # "already" (and, at the initial upload call, sometimes a bare 400)
        # somewhere in the message. Centralized here instead of being
        # repeated at every error-handling site.
        lowered = text.lower()
        if 'terminated' in lowered or 'already' in lowered:
            return True
        return include_400 and '400' in lowered

    try:
        print(f" 📤 Uploading: {file_path.name}...")

        custom_metadata = format_metadata_for_gemini(metadata)

        # Echo the metadata we would attach, for operator visibility.
        if custom_metadata:
            metadata_parts = []
            for m in custom_metadata:
                if 'numeric_value' in m:
                    metadata_parts.append(f"{m['key']}={m['numeric_value']}")
                elif 'string_value' in m:
                    metadata_parts.append(f"{m['key']}={m['string_value']}")
            if metadata_parts:
                print(f" 📋 Metadata: {', '.join(metadata_parts)}")

        # NOTE: the File Search API does not accept customMetadata in
        # upload_to_file_search_store, so the display_name carries the
        # identifying info instead. Metadata would need a separate API call
        # after upload, if supported.
        upload_params = {
            'file': str(file_path),
            'file_search_store_name': store_name,
            'config': {
                'display_name': metadata.get('filename', file_path.name),
            },
        }

        try:
            operation = client.file_search_stores.upload_to_file_search_store(**upload_params)
        except Exception as upload_error:
            if _looks_skippable(str(upload_error), include_400=True):
                print(f" ⚠️ Upload error: File may already exist or upload was interrupted")
                print(f" 💡 Error details: {upload_error}")
                print(f" 💡 Skipping this file")
                return None
            raise

        # Poll until the server-side import finishes, with a bounded wait.
        max_wait = 300  # 5 minutes max per file
        start_time = time.time()

        while not operation.done:
            if time.time() - start_time > max_wait:
                print(f" ⚠️ Timeout waiting for upload to complete")
                return False

            time.sleep(2)
            try:
                operation = client.operations.get(operation)
            except Exception as op_error:
                if _looks_skippable(str(op_error)):
                    print(f" ⚠️ File may already exist or upload was interrupted")
                    print(f" 💡 Skipping this file")
                    return None
                raise

        # Distinguish "skipped because duplicate" (None) from "failed" (False).
        if hasattr(operation, 'error') and operation.error:
            if _looks_skippable(str(operation.error)):
                print(f" ⚠️ File may already exist in the store")
                print(f" 💡 Skipping this file")
                return None
            print(f" ❌ Upload failed: {operation.error}")
            return False

        print(f" ✅ Uploaded successfully")
        return True

    except Exception as e:
        if _looks_skippable(str(e), include_400=True):
            print(f" ⚠️ Upload error: File may already exist or upload was interrupted")
            print(f" 💡 Error details: {e}")
            print(f" 💡 Skipping this file")
            return None
        print(f" ❌ Error uploading {file_path.name}: {e}")
        import traceback
        traceback.print_exc()
        return False
296
-
297
-
298
def find_report_files(reports_dir: Path) -> List[Path]:
    """Recursively collect every PDF under *reports_dir*, sorted by path.

    Returns an empty list (after printing a warning) when the directory
    does not exist.
    """
    if not reports_dir.exists():
        print(f"❌ Reports directory not found: {reports_dir}")
        return []
    return sorted(reports_dir.rglob("*.pdf"))
311
-
312
-
313
def main():
    """Entry point: upload all report PDFs to a Gemini File Search store.

    Reads GEMINI_API_KEY (required), GEMINI_FILESTORE_NAME and REPORTS_DIR
    (optional) from the environment. Returns 0 when no upload failed,
    1 otherwise.
    """
    print("=" * 60)
    print("Gemini File Search Store Upload Tool")
    print("=" * 60)

    if not GEMINI_AVAILABLE:
        print("\n❌ Please install google-genai package:")
        print(" pip install google-genai")
        return 1

    # API key is mandatory; everything else has a fallback.
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        print("\n❌ GEMINI_API_KEY not found in environment variables")
        print(" Please add GEMINI_API_KEY to your .env file")
        return 1

    store_name = os.getenv("GEMINI_FILESTORE_NAME")

    # Resolve the reports directory: explicit env var first, then a few
    # conventional locations, then the historical hard-coded default.
    reports_dir_str = os.getenv("REPORTS_DIR")
    if not reports_dir_str:
        candidates = [
            "/Users/ayeroyan/workspace/chatbot-rag/reports",
            Path(__file__).parent / "reports",
            Path.cwd() / "reports",
        ]
        reports_dir_str = next(
            (str(c) for c in candidates if Path(c).exists()), None
        )
    if not reports_dir_str:
        reports_dir_str = "/Users/ayeroyan/workspace/chatbot-rag/reports"  # Default fallback

    reports_dir = Path(reports_dir_str)

    print(f"\n🔌 Connecting to Gemini API...")
    try:
        client = genai.Client(api_key=api_key)
        print(f" ✅ Connected")
    except Exception as e:
        print(f" ❌ Failed to connect: {e}")
        return 1

    print(f"\n📦 Setting up file search store...")
    try:
        store_name_full = get_or_create_filestore(client, store_name)
        # The full resource name looks like "fileSearchStores/<id>"; surface
        # just the <id> part so it can be pinned via GEMINI_FILESTORE_NAME.
        if store_name_full.startswith("fileSearchStores/"):
            store_id = store_name_full.split("/", 1)[1]
            print(f" 💡 Store ID (for GEMINI_FILESTORE_NAME env var): {store_id}")
        else:
            store_id = store_name_full
        store_name = store_name_full  # uploads need the full resource name
    except Exception as e:
        print(f" ❌ Failed to setup store: {e}")
        return 1

    print(f"\n🔍 Scanning for PDF files in: {reports_dir}")
    pdf_files = find_report_files(reports_dir)

    if not pdf_files:
        print(f" ❌ No PDF files found in {reports_dir}")
        return 1

    print(f" ✅ Found {len(pdf_files)} PDF files")

    print(f"\n📤 Uploading files to store...")
    print(f" Store: {store_name}")
    print(f" Files: {len(pdf_files)}")

    uploaded = failed = skipped = 0

    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] Processing: {pdf_file.name}")

        metadata = extract_metadata_from_path(pdf_file)

        # Show what was inferred from the path before uploading.
        metadata_info = [
            f"{label}: {metadata[key]}"
            for key, label in (("year", "Year"), ("source", "Source"), ("district", "District"))
            if metadata.get(key)
        ]
        if metadata_info:
            print(f" 📊 Extracted metadata: {', '.join(metadata_info)}")

        result = upload_file_to_store(client, pdf_file, store_name, metadata, skip_existing=True)

        if result is True:
            uploaded += 1
        elif result is None:  # Skipped (already exists)
            skipped += 1
        else:  # Failed
            failed += 1

        # Pause briefly between uploads to stay under rate limits.
        if i < len(pdf_files):
            time.sleep(1)

    print(f"\n" + "=" * 60)
    print(f"Upload Summary")
    print(f"=" * 60)
    print(f" ✅ Uploaded: {uploaded}")
    if skipped > 0:
        print(f" ⏭️ Skipped (already exists): {skipped}")
    print(f" ❌ Failed: {failed}")
    print(f" 📦 Store: {store_name}")

    if uploaded > 0:
        print(f"\n✅ Successfully uploaded {uploaded} files to Gemini File Search store!")
        print(f" You can now use this store in the beta version of the chatbot.")

    return 0 if failed == 0 else 1
446
-
447
-
448
if __name__ == "__main__":
    # SystemExit(main()) is exactly what sys.exit(main()) raises.
    raise SystemExit(main())
450
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
verify_qdrant_migration.py DELETED
@@ -1,438 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Qdrant Migration Verification Script
4
-
5
- This script compares the source and destination Qdrant collections to verify
6
- that the migration was successful. It:
7
- 1. Compares collection configurations
8
- 2. Fetches sample points from source
9
- 3. Retrieves same points from destination using IDs
10
- 4. Compares vectors, metadata, and all attributes
11
- """
12
-
13
- import os
14
- import sys
15
- from typing import List, Dict, Any, Optional
16
- from pathlib import Path
17
- from qdrant_client import QdrantClient
18
- import json
19
-
20
- # Try to import config loader and dotenv for automatic source detection
21
- try:
22
- from src.config.loader import load_config
23
- CONFIG_AVAILABLE = True
24
- except ImportError:
25
- CONFIG_AVAILABLE = False
26
-
27
- try:
28
- from dotenv import load_dotenv
29
- DOTENV_AVAILABLE = True
30
- except ImportError:
31
- DOTENV_AVAILABLE = False
32
-
33
- # Load .env file automatically if available
34
- if DOTENV_AVAILABLE:
35
- project_root = Path(__file__).parent
36
- env_file = project_root / ".env"
37
- if env_file.exists():
38
- load_dotenv(env_file, override=True)
39
- else:
40
- load_dotenv(override=True)
41
-
42
-
43
def get_collection_info(client: QdrantClient, collection_name: str) -> Dict[str, Any]:
    """Return vector size, distance metric and point counts for a collection.

    Tolerates several qdrant-client response shapes (nested config.params
    vs flat attributes); falls back to measuring one stored vector when the
    size is not reported. Returns None on any error.
    """
    try:
        info = client.get_collection(collection_name)

        size = metric = None
        if hasattr(info, 'config'):
            cfg = info.config
            if hasattr(cfg, 'params') and hasattr(cfg.params, 'vectors'):
                vec_cfg = cfg.params.vectors
                if isinstance(vec_cfg, dict):
                    size = vec_cfg.get('size')
                    metric = vec_cfg.get('distance')
                else:
                    size = getattr(vec_cfg, 'size', None)
                    metric = getattr(vec_cfg, 'distance', None)
            else:
                size = getattr(cfg, 'vector_size', None)
                metric = getattr(cfg, 'distance', None)
        else:
            size = getattr(info, 'vector_size', None)
            metric = getattr(info, 'distance', None)

        n_points = getattr(info, 'points_count', 0)
        n_indexed = getattr(info, 'indexed_vectors_count', 0)

        if size is None:
            # Last resort: peek at a single stored vector and measure it.
            try:
                sample, _ = client.scroll(collection_name=collection_name, limit=1, with_vectors=True)
                if sample and hasattr(sample[0], 'vector') and sample[0].vector:
                    size = len(sample[0].vector)
            except Exception:
                pass

        return {
            "vector_size": size,
            "distance": metric or "Cosine",
            "points_count": n_points,
            "indexed_vectors_count": n_indexed,
        }
    except Exception as e:
        print(f"❌ Error getting collection info: {e}")
        return None
- return None
86
-
87
-
88
def fetch_points_by_ids(client: QdrantClient, collection_name: str, point_ids: List) -> Dict:
    """Retrieve the given point IDs (with payloads and vectors), keyed by ID.

    Returns an empty dict (after printing the error) if retrieval fails.
    """
    try:
        retrieved = client.retrieve(
            collection_name=collection_name,
            ids=point_ids,
            with_payload=True,
            with_vectors=True,
        )
        return {p.id: p for p in retrieved}
    except Exception as e:
        print(f"❌ Error fetching points by IDs: {e}")
        return {}
101
-
102
-
103
def compare_points(source_point, dest_point, point_id) -> Dict[str, Any]:
    """Compare two Qdrant points field by field.

    Checks ID, vector (absolute tolerance 1e-6) and payload keys/values, and
    returns a dict with the point id, lists of matching and differing
    attributes, and their counts.
    """
    import numpy as np

    matches = []
    differences = []

    # IDs
    if source_point.id == dest_point.id:
        matches.append("ID")
    else:
        differences.append(f"ID: source={source_point.id}, dest={dest_point.id}")

    # Vectors: handle missing on either side, length mismatch, then values.
    src_vec = getattr(source_point, 'vector', None)
    dst_vec = getattr(dest_point, 'vector', None)

    if src_vec is None and dst_vec is None:
        matches.append("Vector (both None)")
    elif src_vec is None or dst_vec is None:
        src_desc = 'None' if src_vec is None else f'len={len(src_vec)}'
        dst_desc = 'None' if dst_vec is None else f'len={len(dst_vec)}'
        differences.append(f"Vector: source={src_desc}, dest={dst_desc}")
    elif len(src_vec) != len(dst_vec):
        differences.append(f"Vector length: source={len(src_vec)}, dest={len(dst_vec)}")
    else:
        try:
            # Elementwise compare with a tolerance for floating point noise.
            max_diff = float(np.max(np.abs(np.array(src_vec) - np.array(dst_vec))))
            if max_diff < 1e-6:
                matches.append(f"Vector (max diff: {max_diff:.2e})")
            else:
                differences.append(f"Vector values differ (max diff: {max_diff:.2e})")
        except Exception as e:
            differences.append(f"Vector comparison error: {e}")

    # Payloads: normalize to plain dicts first.
    src_payload = getattr(source_point, 'payload', {}) or {}
    dst_payload = getattr(dest_point, 'payload', {}) or {}
    if hasattr(src_payload, '__dict__'):
        src_payload = src_payload.__dict__
    if hasattr(dst_payload, '__dict__'):
        dst_payload = dst_payload.__dict__

    src_keys = set(src_payload.keys())
    dst_keys = set(dst_payload.keys())

    missing_in_dest = src_keys - dst_keys
    extra_in_dest = dst_keys - src_keys
    if missing_in_dest:
        differences.append(f"Payload keys missing in dest: {missing_in_dest}")
    if extra_in_dest:
        differences.append(f"Payload keys extra in dest: {extra_in_dest}")

    # Values present on both sides.
    for key in src_keys & dst_keys:
        src_val = src_payload[key]
        dst_val = dst_payload[key]
        if src_val == dst_val:
            matches.append(f"Payload.{key}")
        elif isinstance(src_val, dict) and isinstance(dst_val, dict):
            differences.append(f"Payload.{key}: dicts differ")
        elif isinstance(src_val, list) and isinstance(dst_val, list):
            differences.append(f"Payload.{key}: lists differ (len: {len(src_val)} vs {len(dst_val)})")
        else:
            differences.append(f"Payload.{key}: '{src_val}' != '{dst_val}'")

    return {
        "point_id": point_id,
        "matches": matches,
        "differences": differences,
        "match_count": len(matches),
        "diff_count": len(differences)
    }
184
-
185
-
186
def main():
    """Verify a Qdrant migration by comparing source and destination.

    Compares collection configuration, then samples points from the source
    and checks each one (ID, vector, payload) against the destination.
    Returns 0 only when every sampled point matches perfectly.
    """
    print("="*70)
    print("Qdrant Migration Verification Script")
    print("="*70)

    # --- Source credentials: environment first, optional config fallback ---
    source_url = os.getenv('QDRANT_URL')
    source_key = os.getenv('QDRANT_API_KEY')
    source_collection = os.getenv('QDRANT_COLLECTION', 'docling')

    if CONFIG_AVAILABLE:
        try:
            config = load_config()
            qdrant_config = config.get('qdrant', {})
            source_url = source_url or qdrant_config.get('url')
            source_key = source_key or qdrant_config.get('api_key')
            if not source_collection:
                source_collection = qdrant_config.get('collection_name', 'docling')
        except Exception as e:
            print(f"⚠️ Could not load config: {e}")

    # --- Destination credentials come from the environment only ---
    dest_url = os.getenv('DEST_QDRANT_URL')
    dest_key = os.getenv('DEST_QDRANT_API_KEY')
    dest_collection = os.getenv('DEST_COLLECTION')  # optional; auto-detected below

    if not source_url or not source_key:
        print("❌ Source Qdrant credentials missing!")
        print(" Set QDRANT_URL and QDRANT_API_KEY in .env or environment")
        return 1

    if not dest_url or not dest_key:
        print("❌ Destination Qdrant credentials missing!")
        print(" Set DEST_QDRANT_URL and DEST_QDRANT_API_KEY in .env or environment")
        return 1

    print(f"\n📋 Configuration:")
    print(f" Source: {source_url}")
    print(f" Source Collection: {source_collection}")
    print(f" Destination: {dest_url}")
    if dest_collection:
        print(f" Destination Collection: {dest_collection} (specified)")
    else:
        print(f" Destination Collection: (auto-detect)")

    # --- Connect to both instances ---
    print(f"\n🔌 Connecting to Qdrant instances...")
    try:
        source_client = QdrantClient(url=source_url, api_key=source_key, timeout=120)
        print(f" ✅ Connected to source")
    except Exception as e:
        print(f" ❌ Failed to connect to source: {e}")
        return 1

    try:
        dest_client = QdrantClient(url=dest_url, api_key=dest_key, timeout=120)
        print(f" ✅ Connected to destination")
    except Exception as e:
        print(f" ❌ Failed to connect to destination: {e}")
        return 1

    # --- Auto-detect the destination collection when not specified ---
    if not dest_collection:
        try:
            names = [c.name for c in dest_client.get_collections().collections]
            if len(names) == 1:
                dest_collection = names[0]
                print(f"\n📋 Auto-detected destination collection: '{dest_collection}'")
            elif len(names) > 1:
                print(f"\n⚠️ Found {len(names)} collections in destination:")
                for name in names:
                    print(f" - {name}")
                print(f"\n Using first collection: '{names[0]}'")
                dest_collection = names[0]
            else:
                print("❌ No collections found in destination!")
                return 1
        except Exception as e:
            print(f"❌ Could not list destination collections: {e}")
            return 1

    # --- Collection-level comparison ---
    print(f"\n📊 Collection Information Comparison")
    print("="*70)

    source_info = get_collection_info(source_client, source_collection)
    dest_info = get_collection_info(dest_client, dest_collection)

    if not source_info:
        print("❌ Could not get source collection info")
        return 1

    if not dest_info:
        print("❌ Could not get destination collection info")
        return 1

    print(f"\nSource Collection ('{source_collection}'):")
    print(f" Vector size: {source_info['vector_size']}")
    print(f" Distance: {source_info['distance']}")
    print(f" Points: {source_info['points_count']:,}")
    print(f" Indexed: {source_info['indexed_vectors_count']:,}")

    print(f"\nDestination Collection ('{dest_collection}'):")
    print(f" Vector size: {dest_info['vector_size']}")
    print(f" Distance: {dest_info['distance']}")
    print(f" Points: {dest_info['points_count']:,}")
    print(f" Indexed: {dest_info['indexed_vectors_count']:,}")

    print(f"\n🔍 Configuration Comparison:")
    config_matches = []
    config_diffs = []

    if source_info['vector_size'] == dest_info['vector_size']:
        config_matches.append(f"Vector size: {source_info['vector_size']}")
    else:
        config_diffs.append(f"Vector size: source={source_info['vector_size']}, dest={dest_info['vector_size']}")

    # Distance may be an enum or a string depending on client version.
    if str(source_info['distance']) == str(dest_info['distance']):
        config_matches.append(f"Distance: {source_info['distance']}")
    else:
        config_diffs.append(f"Distance: source={source_info['distance']}, dest={dest_info['distance']}")

    if source_info['points_count'] == dest_info['points_count']:
        config_matches.append(f"Points count: {source_info['points_count']:,}")
    else:
        config_diffs.append(f"Points count: source={source_info['points_count']:,}, dest={dest_info['points_count']:,}")

    if config_matches:
        print(f" ✅ Matches: {len(config_matches)}")
        for match in config_matches:
            print(f" - {match}")

    if config_diffs:
        print(f" ❌ Differences: {len(config_diffs)}")
        for diff in config_diffs:
            print(f" - {diff}")

    # --- Sample points from the source ---
    print(f"\n📥 Fetching sample points from source...")
    sample_size = 2000  # number of source points to spot-check

    try:
        source_points_result, _ = source_client.scroll(
            collection_name=source_collection,
            limit=sample_size,
            with_payload=True,
            with_vectors=True
        )

        if not source_points_result:
            print("❌ No points found in source collection!")
            return 1

        print(f" ✅ Fetched {len(source_points_result)} points from source")

        source_point_ids = [point.id for point in source_points_result]
        print(f" Point IDs: {source_point_ids[:5]}{'...' if len(source_point_ids) > 5 else ''}")

    except Exception as e:
        print(f"❌ Error fetching source points: {e}")
        import traceback
        traceback.print_exc()
        return 1

    # --- Retrieve the same IDs from the destination ---
    print(f"\n📥 Fetching same points from destination by ID...")
    try:
        dest_points_dict = fetch_points_by_ids(dest_client, dest_collection, source_point_ids)
        print(f" ✅ Fetched {len(dest_points_dict)} points from destination")

        missing_ids = set(source_point_ids) - set(dest_points_dict.keys())
        if missing_ids:
            print(f" ⚠️ Missing {len(missing_ids)} points in destination: {list(missing_ids)[:5]}{'...' if len(missing_ids) > 5 else ''}")

    except Exception as e:
        print(f"❌ Error fetching destination points: {e}")
        import traceback
        traceback.print_exc()
        return 1

    # --- Point-by-point comparison ---
    print(f"\n🔍 Point-by-Point Comparison")
    print("="*70)

    comparison_results = []
    for source_point in source_points_result:
        pid = source_point.id
        counterpart = dest_points_dict.get(pid)

        if counterpart is None:
            comparison_results.append({
                "point_id": pid,
                "status": "MISSING",
                "matches": [],
                "differences": [f"Point not found in destination"]
            })
        else:
            outcome = compare_points(source_point, counterpart, pid)
            outcome["status"] = "MATCH" if outcome["diff_count"] == 0 else "DIFF"
            comparison_results.append(outcome)

    matches = [r for r in comparison_results if r["status"] == "MATCH"]
    diffs = [r for r in comparison_results if r["status"] == "DIFF"]
    missing = [r for r in comparison_results if r["status"] == "MISSING"]

    print(f"\n📊 Comparison Summary:")
    print(f" Total points compared: {len(comparison_results)}")
    print(f" ✅ Perfect matches: {len(matches)}")
    print(f" ⚠️ Differences found: {len(diffs)}")
    print(f" ❌ Missing in destination: {len(missing)}")

    # Show a bounded amount of detail for problem points.
    if diffs:
        print(f"\n⚠️ Points with differences:")
        for diff_result in diffs[:10]:
            print(f"\n Point ID: {diff_result['point_id']}")
            if diff_result['matches']:
                print(f" ✅ Matches ({len(diff_result['matches'])}): {', '.join(diff_result['matches'][:5])}")
            if diff_result['differences']:
                print(f" ❌ Differences ({len(diff_result['differences'])}):")
                for d in diff_result['differences'][:5]:
                    print(f" - {d}")

    if missing:
        print(f"\n❌ Missing points in destination:")
        for missing_result in missing[:10]:
            print(f" - Point ID: {missing_result['point_id']}")

    # --- Final verdict ---
    print(f"\n" + "="*70)
    if not missing and not diffs:
        print("✅ VERIFICATION PASSED: All points match perfectly!")
        return 0
    if not missing:
        print(f"⚠️ VERIFICATION PARTIAL: All points present but {len(diffs)} have differences")
        return 1
    print(f"❌ VERIFICATION FAILED: {len(missing)} points missing, {len(diffs)} have differences")
    return 1
- return 1
432
-
433
-
434
if __name__ == "__main__":
    # SystemExit(main()) is exactly what sys.exit(main()) raises.
    raise SystemExit(main())
436
-
437
-
438
-