# IntegraChat / backend / scripts / migrate_add_metadata.py
# feat: Add AI metadata extraction, latency prediction, context-aware routing,
# and tool output schemas (commit d1e5882, author: nothingworry)
"""
Database Migration Script: Add Metadata Support
This script updates the documents table to add:
- metadata (JSONB) column for storing extracted metadata
- doc_id (TEXT) column for grouping chunks from the same document
Run this script after deploying the metadata extraction feature.
"""
import os
import sys
from pathlib import Path
# Add parent directory to path to import backend modules
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from dotenv import load_dotenv
import psycopg2
load_dotenv()
# Get database connection from environment
DATABASE_URL = os.getenv("POSTGRESQL_URL")
def get_connection():
"""
Establish a direct PostgreSQL connection.
"""
if not DATABASE_URL:
raise ValueError(
"PostgreSQL connection string not configured. "
"Set POSTGRESQL_URL in your .env file."
)
return psycopg2.connect(DATABASE_URL)
def migrate_database():
"""
Add metadata and doc_id columns to the documents table.
"""
print("πŸ”„ Starting database migration: Adding metadata support...")
try:
conn = get_connection()
cur = conn.cursor()
# Check if columns already exist
cur.execute("""
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'documents'
AND column_name IN ('metadata', 'doc_id');
""")
existing_columns = {row[0] for row in cur.fetchall()}
# Add metadata column if it doesn't exist
if 'metadata' not in existing_columns:
print(" βž• Adding 'metadata' JSONB column...")
cur.execute("""
ALTER TABLE documents
ADD COLUMN metadata JSONB;
""")
print(" βœ… 'metadata' column added successfully")
else:
print(" βœ“ 'metadata' column already exists")
# Add doc_id column if it doesn't exist
if 'doc_id' not in existing_columns:
print(" βž• Adding 'doc_id' TEXT column...")
cur.execute("""
ALTER TABLE documents
ADD COLUMN doc_id TEXT;
""")
print(" βœ… 'doc_id' column added successfully")
else:
print(" βœ“ 'doc_id' column already exists")
# Create index on doc_id for faster lookups (optional but recommended)
try:
print(" βž• Creating index on 'doc_id'...")
cur.execute("""
CREATE INDEX IF NOT EXISTS documents_doc_id_idx
ON documents (doc_id);
""")
print(" βœ… Index on 'doc_id' created successfully")
except Exception as e:
print(f" ⚠️ Index creation skipped (may already exist): {e}")
# Create GIN index on metadata for JSONB queries (optional but recommended)
try:
print(" βž• Creating GIN index on 'metadata'...")
cur.execute("""
CREATE INDEX IF NOT EXISTS documents_metadata_idx
ON documents USING GIN (metadata);
""")
print(" βœ… GIN index on 'metadata' created successfully")
except Exception as e:
print(f" ⚠️ GIN index creation skipped (may already exist): {e}")
conn.commit()
cur.close()
conn.close()
print("\nβœ… Database migration completed successfully!")
print("\nThe documents table now supports:")
print(" - metadata (JSONB): Stores extracted metadata (title, summary, tags, topics, etc.)")
print(" - doc_id (TEXT): Groups chunks from the same document")
print("\nNew documents will automatically have metadata extracted during ingestion.")
return True
except Exception as e:
print(f"\n❌ Migration failed: {e}")
print("\nTroubleshooting:")
print(" 1. Ensure PostgreSQL is running")
print(" 2. Check POSTGRESQL_URL in your .env file")
print(" 3. Verify you have permissions to alter the table")
print(" 4. Check if the documents table exists")
return False
def verify_migration():
"""
Verify that the migration was successful by checking column existence.
"""
print("\nπŸ” Verifying migration...")
try:
conn = get_connection()
cur = conn.cursor()
cur.execute("""
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = 'documents'
AND column_name IN ('metadata', 'doc_id')
ORDER BY column_name;
""")
columns = cur.fetchall()
if len(columns) == 2:
print(" βœ… Both columns exist:")
for col_name, col_type in columns:
print(f" - {col_name}: {col_type}")
return True
else:
print(f" ⚠️ Found {len(columns)} column(s), expected 2")
for col_name, col_type in columns:
print(f" - {col_name}: {col_type}")
return False
except Exception as e:
print(f" ❌ Verification failed: {e}")
return False
finally:
try:
cur.close()
conn.close()
except:
pass
if __name__ == "__main__":
print("=" * 60)
print("Database Migration: Add Metadata Support")
print("=" * 60)
print()
# Check if database connection is available
try:
conn = get_connection()
conn.close()
print("βœ“ Database connection successful\n")
except Exception as e:
print(f"❌ Cannot connect to database: {e}")
print("\nPlease check:")
print(" 1. PostgreSQL is running")
print(" 2. POSTGRESQL_URL is set in .env file")
print(" 3. Database credentials are correct")
sys.exit(1)
# Run migration
success = migrate_database()
if success:
# Verify migration
verify_migration()
print("\n" + "=" * 60)
print("Migration completed! You can now use metadata extraction.")
print("=" * 60)
else:
print("\n" + "=" * 60)
print("Migration failed. Please check the errors above.")
print("=" * 60)
sys.exit(1)