drift-detection-ui / create_database.py
Yoon-gu Hwang
데이터 5배 증가 및 모델 정보 테이블 추가
403ebf2
import sqlite3
from datetime import datetime, timedelta
import random
# Create database connection
conn = sqlite3.connect('drift_detection.db')
cursor = conn.cursor()
# Create tables
cursor.executescript('''
CREATE TABLE IF NOT EXISTS model_info (
model_id INTEGER PRIMARY KEY,
model_name VARCHAR(255) NOT NULL,
release_date DATE NOT NULL,
prediction_period INTEGER
);
CREATE TABLE IF NOT EXISTS total_result (
customer_id INTEGER NOT NULL,
model_id INTEGER NOT NULL,
score REAL NOT NULL CHECK(score > 0),
prediction_date DATE NOT NULL,
load_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (customer_id, model_id),
FOREIGN KEY (model_id) REFERENCES model_info(model_id)
);
CREATE TABLE IF NOT EXISTS delivered (
customer_id INTEGER NOT NULL,
model_id INTEGER NOT NULL,
rank INTEGER NOT NULL,
prediction_date DATE NOT NULL,
load_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (customer_id, model_id),
FOREIGN KEY (model_id) REFERENCES model_info(model_id)
);
CREATE TABLE IF NOT EXISTS drift_record (
model_id INTEGER NOT NULL,
precision REAL CHECK(precision >= 0 AND precision <= 1),
recall REAL CHECK(recall >= 0 AND recall <= 1),
sample_numbers INTEGER NOT NULL CHECK(sample_numbers > 0),
js_value REAL CHECK(js_value >= 0 AND js_value <= 1),
wd_value REAL CHECK(wd_value >= 0),
prediction_date TIMESTAMP NOT NULL,
PRIMARY KEY (model_id, prediction_date),
FOREIGN KEY (model_id) REFERENCES model_info(model_id)
);
''')
# Insert sample data into model_info
model_data = [
(1, 'Churn Prediction Model v1', '2024-11-15', 30),
(2, 'Churn Prediction Model v2', '2024-11-20', 30),
(3, 'Churn Prediction Model v3', '2024-12-10', 30),
(4, 'Customer Lifetime Value Model', '2024-12-01', 60),
(5, 'Product Recommendation Model', '2024-12-15', 15)
]
cursor.executemany(
'INSERT OR IGNORE INTO model_info (model_id, model_name, release_date, prediction_period) VALUES (?, ?, ?, ?)',
model_data
)
# Generate sample data for total_result
base_date = datetime(2025, 1, 1)
for customer_id in range(1, 101):
for model_id in range(1, 6):
score = random.uniform(0.1, 0.95)
pred_date = base_date + timedelta(days=random.randint(0, 240))
cursor.execute(
'INSERT OR IGNORE INTO total_result (customer_id, model_id, score, prediction_date) VALUES (?, ?, ?, ?)',
(customer_id, model_id, score, pred_date.strftime('%Y-%m-%d'))
)
# Generate sample data for delivered
for customer_id in range(1, 101):
for model_id in range(1, 6):
if random.random() > 0.3: # 70% of customers get delivery
rank = random.randint(1, 100)
pred_date = base_date + timedelta(days=random.randint(0, 240))
cursor.execute(
'INSERT OR IGNORE INTO delivered (customer_id, model_id, rank, prediction_date) VALUES (?, ?, ?, ?)',
(customer_id, model_id, rank, pred_date.strftime('%Y-%m-%d'))
)
# Generate sample data for drift_record (time series data)
# Create records every 3-4 days for each model (Jan 2025 - Aug 2025)
for model_id in range(1, 6):
# Generate data every 3-4 days for 8 months (about 60-70 records per model)
day_offset = 0
record_count = 0
while day_offset < 240: # 8 months = ~240 days
# Add random time component (hours, minutes, seconds)
pred_date = base_date + timedelta(
days=day_offset,
hours=random.randint(0, 23),
minutes=random.randint(0, 59),
seconds=random.randint(0, 59)
)
# Add some trend and noise to make it realistic
days_elapsed = day_offset / 30.0 # Convert to months for trend calculation
base_precision = 0.85 - (days_elapsed * 0.01) # Slight degradation over time
base_recall = 0.80 - (days_elapsed * 0.008)
precision = max(0.6, min(0.95, base_precision + random.uniform(-0.05, 0.05)))
recall = max(0.55, min(0.92, base_recall + random.uniform(-0.05, 0.05)))
sample_numbers = random.randint(1000, 5000)
js_value = min(0.3, 0.05 + (days_elapsed * 0.015) + random.uniform(-0.02, 0.02)) # Drift increases over time
wd_value = min(2.5, 0.2 + (days_elapsed * 0.1) + random.uniform(-0.1, 0.1))
cursor.execute(
'''INSERT OR IGNORE INTO drift_record
(model_id, precision, recall, sample_numbers, js_value, wd_value, prediction_date)
VALUES (?, ?, ?, ?, ?, ?, ?)''',
(model_id, precision, recall, sample_numbers, js_value, wd_value, pred_date.strftime('%Y-%m-%d %H:%M:%S'))
)
# Next record in 3-4 days
day_offset += random.randint(3, 4)
record_count += 1
conn.commit()
conn.close()
print("Database created successfully!")
print("Database file: drift_detection.db")