""" Database Migration Script: Add Metadata Support This script updates the documents table to add: - metadata (JSONB) column for storing extracted metadata - doc_id (TEXT) column for grouping chunks from the same document Run this script after deploying the metadata extraction feature. """ import os import sys from pathlib import Path # Add parent directory to path to import backend modules project_root = Path(__file__).parent.parent.parent sys.path.insert(0, str(project_root)) from dotenv import load_dotenv import psycopg2 load_dotenv() # Get database connection from environment DATABASE_URL = os.getenv("POSTGRESQL_URL") def get_connection(): """ Establish a direct PostgreSQL connection. """ if not DATABASE_URL: raise ValueError( "PostgreSQL connection string not configured. " "Set POSTGRESQL_URL in your .env file." ) return psycopg2.connect(DATABASE_URL) def migrate_database(): """ Add metadata and doc_id columns to the documents table. """ print("šŸ”„ Starting database migration: Adding metadata support...") try: conn = get_connection() cur = conn.cursor() # Check if columns already exist cur.execute(""" SELECT column_name FROM information_schema.columns WHERE table_name = 'documents' AND column_name IN ('metadata', 'doc_id'); """) existing_columns = {row[0] for row in cur.fetchall()} # Add metadata column if it doesn't exist if 'metadata' not in existing_columns: print(" āž• Adding 'metadata' JSONB column...") cur.execute(""" ALTER TABLE documents ADD COLUMN metadata JSONB; """) print(" āœ… 'metadata' column added successfully") else: print(" āœ“ 'metadata' column already exists") # Add doc_id column if it doesn't exist if 'doc_id' not in existing_columns: print(" āž• Adding 'doc_id' TEXT column...") cur.execute(""" ALTER TABLE documents ADD COLUMN doc_id TEXT; """) print(" āœ… 'doc_id' column added successfully") else: print(" āœ“ 'doc_id' column already exists") # Create index on doc_id for faster lookups (optional but recommended) try: print(" āž• Creating index on 'doc_id'...") cur.execute(""" CREATE INDEX IF NOT EXISTS documents_doc_id_idx ON documents (doc_id); """) print(" āœ… Index on 'doc_id' created successfully") except Exception as e: print(f" āš ļø Index creation skipped (may already exist): {e}") # Create GIN index on metadata for JSONB queries (optional but recommended) try: print(" āž• Creating GIN index on 'metadata'...") cur.execute(""" CREATE INDEX IF NOT EXISTS documents_metadata_idx ON documents USING GIN (metadata); """) print(" āœ… GIN index on 'metadata' created successfully") except Exception as e: print(f" āš ļø GIN index creation skipped (may already exist): {e}") conn.commit() cur.close() conn.close() print("\nāœ… Database migration completed successfully!") print("\nThe documents table now supports:") print(" - metadata (JSONB): Stores extracted metadata (title, summary, tags, topics, etc.)") print(" - doc_id (TEXT): Groups chunks from the same document") print("\nNew documents will automatically have metadata extracted during ingestion.") return True except Exception as e: print(f"\nāŒ Migration failed: {e}") print("\nTroubleshooting:") print(" 1. Ensure PostgreSQL is running") print(" 2. Check POSTGRESQL_URL in your .env file") print(" 3. Verify you have permissions to alter the table") print(" 4. Check if the documents table exists") return False def verify_migration(): """ Verify that the migration was successful by checking column existence. """ print("\nšŸ” Verifying migration...") try: conn = get_connection() cur = conn.cursor() cur.execute(""" SELECT column_name, data_type FROM information_schema.columns WHERE table_name = 'documents' AND column_name IN ('metadata', 'doc_id') ORDER BY column_name; """) columns = cur.fetchall() if len(columns) == 2: print(" āœ… Both columns exist:") for col_name, col_type in columns: print(f" - {col_name}: {col_type}") return True else: print(f" āš ļø Found {len(columns)} column(s), expected 2") for col_name, col_type in columns: print(f" - {col_name}: {col_type}") return False except Exception as e: print(f" āŒ Verification failed: {e}") return False finally: try: cur.close() conn.close() except: pass if __name__ == "__main__": print("=" * 60) print("Database Migration: Add Metadata Support") print("=" * 60) print() # Check if database connection is available try: conn = get_connection() conn.close() print("āœ“ Database connection successful\n") except Exception as e: print(f"āŒ Cannot connect to database: {e}") print("\nPlease check:") print(" 1. PostgreSQL is running") print(" 2. POSTGRESQL_URL is set in .env file") print(" 3. Database credentials are correct") sys.exit(1) # Run migration success = migrate_database() if success: # Verify migration verify_migration() print("\n" + "=" * 60) print("Migration completed! You can now use metadata extraction.") print("=" * 60) else: print("\n" + "=" * 60) print("Migration failed. Please check the errors above.") print("=" * 60) sys.exit(1)