File size: 6,441 Bytes
d1e5882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
Database Migration Script: Add Metadata Support

This script updates the documents table to add:
- metadata (JSONB) column for storing extracted metadata
- doc_id (TEXT) column for grouping chunks from the same document

Run this script after deploying the metadata extraction feature.
"""

import os
import sys
from pathlib import Path

# Add parent directory to path to import backend modules
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from dotenv import load_dotenv
import psycopg2

load_dotenv()

# Get database connection from environment
DATABASE_URL = os.getenv("POSTGRESQL_URL")

def get_connection():
    """
    Open and return a direct psycopg2 connection to PostgreSQL.

    Uses the POSTGRESQL_URL environment variable (loaded from .env).

    Returns:
        psycopg2 connection object.

    Raises:
        ValueError: if POSTGRESQL_URL is not configured.
    """
    # Guard clause: fail early with a clear message when the URL is missing.
    if DATABASE_URL:
        return psycopg2.connect(DATABASE_URL)
    raise ValueError(
        "PostgreSQL connection string not configured. "
        "Set POSTGRESQL_URL in your .env file."
    )


def migrate_database():
    """
    Add metadata and doc_id columns to the documents table.

    Adds (idempotently — existing columns are detected and skipped):
      - metadata (JSONB): stores extracted metadata for a chunk.
      - doc_id (TEXT): groups chunks originating from the same document.

    Also creates a btree index on doc_id and a GIN index on metadata,
    both with IF NOT EXISTS so reruns are safe.

    Returns:
        bool: True if the migration committed successfully, False otherwise.
    """
    print("πŸ”„ Starting database migration: Adding metadata support...")

    # Initialize to None so the finally-block can clean up safely even if
    # get_connection() itself raises (fixes a connection leak in the
    # original, which never closed conn/cur on an error path).
    conn = None
    cur = None
    try:
        conn = get_connection()
        cur = conn.cursor()

        # Check if columns already exist (makes the script rerunnable).
        cur.execute("""
            SELECT column_name 
            FROM information_schema.columns 
            WHERE table_name = 'documents' 
            AND column_name IN ('metadata', 'doc_id');
        """)
        existing_columns = {row[0] for row in cur.fetchall()}

        # Add metadata column if it doesn't exist
        if 'metadata' not in existing_columns:
            print("  βž• Adding 'metadata' JSONB column...")
            cur.execute("""
                ALTER TABLE documents 
                ADD COLUMN metadata JSONB;
            """)
            print("  βœ… 'metadata' column added successfully")
        else:
            print("  βœ“ 'metadata' column already exists")

        # Add doc_id column if it doesn't exist
        if 'doc_id' not in existing_columns:
            print("  βž• Adding 'doc_id' TEXT column...")
            cur.execute("""
                ALTER TABLE documents 
                ADD COLUMN doc_id TEXT;
            """)
            print("  βœ… 'doc_id' column added successfully")
        else:
            print("  βœ“ 'doc_id' column already exists")

        # Create index on doc_id for faster lookups (optional but recommended)
        try:
            print("  βž• Creating index on 'doc_id'...")
            cur.execute("""
                CREATE INDEX IF NOT EXISTS documents_doc_id_idx 
                ON documents (doc_id);
            """)
            print("  βœ… Index on 'doc_id' created successfully")
        except Exception as e:
            print(f"  ⚠️  Index creation skipped (may already exist): {e}")

        # Create GIN index on metadata for JSONB queries (optional but recommended)
        try:
            print("  βž• Creating GIN index on 'metadata'...")
            cur.execute("""
                CREATE INDEX IF NOT EXISTS documents_metadata_idx 
                ON documents USING GIN (metadata);
            """)
            print("  βœ… GIN index on 'metadata' created successfully")
        except Exception as e:
            print(f"  ⚠️  GIN index creation skipped (may already exist): {e}")

        conn.commit()

        print("\nβœ… Database migration completed successfully!")
        print("\nThe documents table now supports:")
        print("  - metadata (JSONB): Stores extracted metadata (title, summary, tags, topics, etc.)")
        print("  - doc_id (TEXT): Groups chunks from the same document")
        print("\nNew documents will automatically have metadata extracted during ingestion.")

        return True

    except Exception as e:
        print(f"\n❌ Migration failed: {e}")
        print("\nTroubleshooting:")
        print("  1. Ensure PostgreSQL is running")
        print("  2. Check POSTGRESQL_URL in your .env file")
        print("  3. Verify you have permissions to alter the table")
        print("  4. Check if the documents table exists")
        return False
    finally:
        # Best-effort cleanup on both success and failure paths; uncommitted
        # work is implicitly rolled back when the connection closes.
        if cur is not None:
            try:
                cur.close()
            except Exception:
                pass
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass


def verify_migration():
    """
    Verify that the migration was successful by checking column existence.

    Queries information_schema for the 'metadata' and 'doc_id' columns of
    the documents table and reports what was found.

    Returns:
        bool: True when both columns exist, False otherwise (including on
        any connection/query error).
    """
    print("\nπŸ” Verifying migration...")

    # Initialize to None so the finally-block never hits an unbound name
    # (the original's bare `except:` silently masked that NameError when
    # get_connection() failed before cur/conn were assigned).
    conn = None
    cur = None
    try:
        conn = get_connection()
        cur = conn.cursor()

        cur.execute("""
            SELECT column_name, data_type 
            FROM information_schema.columns 
            WHERE table_name = 'documents' 
            AND column_name IN ('metadata', 'doc_id')
            ORDER BY column_name;
        """)

        columns = cur.fetchall()

        if len(columns) == 2:
            print("  βœ… Both columns exist:")
            for col_name, col_type in columns:
                print(f"     - {col_name}: {col_type}")
            return True
        else:
            print(f"  ⚠️  Found {len(columns)} column(s), expected 2")
            for col_name, col_type in columns:
                print(f"     - {col_name}: {col_type}")
            return False

    except Exception as e:
        print(f"  ❌ Verification failed: {e}")
        return False
    finally:
        # Best-effort cleanup; only suppress close() errors, and only for
        # handles that were actually opened (no bare except).
        if cur is not None:
            try:
                cur.close()
            except Exception:
                pass
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass


if __name__ == "__main__":
    print("=" * 60)
    print("Database Migration: Add Metadata Support")
    print("=" * 60)
    print()
    
    # Check if database connection is available
    try:
        conn = get_connection()
        conn.close()
        print("βœ“ Database connection successful\n")
    except Exception as e:
        print(f"❌ Cannot connect to database: {e}")
        print("\nPlease check:")
        print("  1. PostgreSQL is running")
        print("  2. POSTGRESQL_URL is set in .env file")
        print("  3. Database credentials are correct")
        sys.exit(1)
    
    # Run migration
    success = migrate_database()
    
    if success:
        # Verify migration
        verify_migration()
        print("\n" + "=" * 60)
        print("Migration completed! You can now use metadata extraction.")
        print("=" * 60)
    else:
        print("\n" + "=" * 60)
        print("Migration failed. Please check the errors above.")
        print("=" * 60)
        sys.exit(1)