dataclerk-openenv / app /database.py
vedhanth66's picture
Initial commit: DataClerk OpenEnv
2a352e6
"""
DataClerk OpenEnv β€” SQLite database setup and seeding.
Uses a fixed random seed (42) so every run produces identical data
and task graders remain deterministic.
"""
from __future__ import annotations
import os
import random
import sqlite3
from datetime import datetime, timedelta
DB_PATH: str = os.environ.get("DB_PATH", "/tmp/dataclerk.db")
# ─────────────────────────────────────────────
# Schema
# ─────────────────────────────────────────────
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS customers (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
email TEXT UNIQUE NOT NULL,
city TEXT,
country TEXT,
tier TEXT DEFAULT 'standard',
created_at DATE NOT NULL
);
CREATE TABLE IF NOT EXISTS products (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
category TEXT NOT NULL,
base_price REAL NOT NULL,
stock_quantity INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS orders (
id INTEGER PRIMARY KEY,
customer_id INTEGER NOT NULL,
status TEXT NOT NULL DEFAULT 'completed',
total_amount REAL NOT NULL,
created_at DATE NOT NULL,
FOREIGN KEY (customer_id) REFERENCES customers(id)
);
CREATE TABLE IF NOT EXISTS order_items (
id INTEGER PRIMARY KEY,
order_id INTEGER NOT NULL,
product_id INTEGER NOT NULL,
quantity INTEGER NOT NULL,
unit_price REAL NOT NULL,
FOREIGN KEY (order_id) REFERENCES orders(id),
FOREIGN KEY (product_id) REFERENCES products(id)
);
CREATE TABLE IF NOT EXISTS support_tickets (
id INTEGER PRIMARY KEY,
customer_id INTEGER NOT NULL,
category TEXT NOT NULL,
priority TEXT NOT NULL,
status TEXT NOT NULL,
created_at DATE NOT NULL,
resolved_at DATE,
FOREIGN KEY (customer_id) REFERENCES customers(id)
);
"""
# ─────────────────────────────────────────────
# Seed data
# ─────────────────────────────────────────────
_CITIES = [
("New York", "US"), ("Los Angeles", "US"), ("Chicago", "US"),
("London", "UK"), ("Manchester", "UK"),
("Berlin", "DE"), ("Munich", "DE"),
("Paris", "FR"),
("Tokyo", "JP"), ("Osaka", "JP"),
("Sydney", "AU"),
("Toronto", "CA"),
("Mumbai", "IN"), ("Bangalore", "IN"),
]
_TIERS = ["standard", "premium", "enterprise"]
_TIER_WEIGHTS = [0.60, 0.30, 0.10]
_CATEGORIES_PRODUCTS: dict[str, list[tuple[str, float]]] = {
"Electronics": [
("Laptop Pro 15", 1299.99),
("Wireless Noise-Cancelling Headphones", 199.99),
("Smart Watch Series 5", 299.99),
("Tablet 10-inch", 499.99),
("Mechanical Keyboard", 149.99),
("Webcam 4K", 119.99),
("USB-C Hub 7-port", 59.99),
("Phone Case Premium", 29.99),
],
"Clothing": [
("Running Shoes Pro", 89.99),
("Insulated Winter Jacket", 179.99),
("Slim-Fit Denim Jeans", 69.99),
("Organic Cotton T-Shirt 3-pack", 34.99),
("High-Support Sports Bra", 49.99),
("Lightweight Casual Sneakers", 74.99),
("Merino Wool Sweater", 99.99),
("Waterproof Hiking Boots", 139.99),
],
"Food & Beverage": [
("Whey Protein Powder 2kg", 54.99),
("Single-Origin Coffee Beans 500g", 22.99),
("Premium Green Tea Set", 27.99),
("Protein Energy Bars 24-pack", 39.99),
("Daily Multivitamin Pack 90ct", 32.99),
("Raw Organic Honey 500g", 16.99),
("Collagen Supplement 60ct", 44.99),
],
"Sports": [
("Premium Yoga Mat 6mm", 44.99),
("Adjustable Dumbbell Set 40kg", 129.99),
("Speed Jump Rope", 24.99),
("Resistance Bands Set 5-level", 29.99),
("Insulated Water Bottle 1L", 34.99),
("Durable Gym Bag 40L", 59.99),
("Foam Roller High-Density", 39.99),
],
"Home & Garden": [
("HEPA Air Purifier Large Room", 249.99),
("Ceramic Plant Pot Set 3-pc", 39.99),
("LED Desk Lamp with USB", 54.99),
("Stackable Storage Organizer", 34.99),
("12-cup Programmable Coffee Maker", 89.99),
("High-Speed Blender 1200W", 79.99),
("Bamboo Cutting Board Set", 29.99),
],
}
_ORDER_STATUSES = ["completed"] * 8 + ["refunded"] * 1 + ["pending"] * 1
_TICKET_CATEGORIES = ["billing", "technical", "shipping", "returns", "general"]
_TICKET_PRIORITIES = ["low", "medium", "high", "urgent"]
_TICKET_PRI_WEIGHTS = [0.25, 0.40, 0.25, 0.10]
_TICKET_STATUSES = ["open", "in_progress", "resolved", "closed"]
_TICKET_STATUS_WEIGHTS = [0.15, 0.10, 0.45, 0.30]
# Resolution days per priority (used to seed resolved_at)
_RESOLUTION_DAYS: dict[str, tuple[int, int]] = {
"urgent": (1, 3),
"high": (2, 7),
"medium": (4, 14),
"low": (7, 21),
}
# ─────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────
def seed_database(db_path: str = DB_PATH) -> None:
"""Create the database schema and insert deterministic seed data."""
conn = sqlite3.connect(db_path)
conn.executescript(SCHEMA_SQL)
# Skip if already populated
if conn.execute("SELECT COUNT(*) FROM customers").fetchone()[0] > 0:
conn.close()
return
rng = random.Random(42) # fixed seed β†’ deterministic answers
today = datetime(2025, 6, 15) # fixed "today" for reproducibility
def days_ago(n: int) -> str:
return (today - timedelta(days=n)).strftime("%Y-%m-%d")
# ── Customers (200) ──────────────────────────────────────────────────────
customers = []
for i in range(1, 201):
city, country = rng.choice(_CITIES)
tier = rng.choices(_TIERS, weights=_TIER_WEIGHTS)[0]
created = days_ago(rng.randint(60, 900))
customers.append(
(i, f"Customer_{i:03d}", f"user{i}@example.com", city, country, tier, created)
)
conn.executemany(
"INSERT OR IGNORE INTO customers VALUES (?,?,?,?,?,?,?)", customers
)
# ── Products (37) ────────────────────────────────────────────────────────
products = []
pid = 1
for category, items in _CATEGORIES_PRODUCTS.items():
for name, price in items:
stock = rng.randint(0, 300)
products.append((pid, name, category, price, stock))
pid += 1
conn.executemany("INSERT OR IGNORE INTO products VALUES (?,?,?,?,?)", products)
# ── Orders + items (1 800 orders) ────────────────────────────────────────
orders: list[tuple] = []
order_items: list[tuple] = []
oid = 1
iid = 1
# Spread orders over last 400 days; heavier in recent 180 days
all_customer_ids = list(range(1, 201))
for _ in range(1800):
cid = rng.choice(all_customer_ids)
# ~60 % of orders in last 180 days
if rng.random() < 0.60:
days_back = rng.randint(0, 179)
else:
days_back = rng.randint(180, 400)
order_date = days_ago(days_back)
status = rng.choice(_ORDER_STATUSES)
n_items = rng.randint(1, 4)
selected = rng.sample(products, n_items)
total = 0.0
for prod in selected:
qty = rng.randint(1, 3)
price = round(prod[3] * rng.uniform(0.92, 1.08), 2)
total += qty * price
order_items.append((iid, oid, prod[0], qty, price))
iid += 1
orders.append((oid, cid, status, round(total, 2), order_date))
oid += 1
conn.executemany(
"INSERT OR IGNORE INTO orders VALUES (?,?,?,?,?)", orders
)
conn.executemany(
"INSERT OR IGNORE INTO order_items VALUES (?,?,?,?,?)", order_items
)
# ── Support tickets (600) ────────────────────────────────────────────────
tickets: list[tuple] = []
for tid in range(1, 601):
cid = rng.randint(1, 200)
cat = rng.choice(_TICKET_CATEGORIES)
pri = rng.choices(_TICKET_PRIORITIES, weights=_TICKET_PRI_WEIGHTS)[0]
status = rng.choices(_TICKET_STATUSES, weights=_TICKET_STATUS_WEIGHTS)[0]
created_days = rng.randint(0, 270)
created_str = days_ago(created_days)
resolved_str = None
if status in ("resolved", "closed"):
lo, hi = _RESOLUTION_DAYS[pri]
res_days = rng.randint(lo, hi)
resolved_dt = datetime.strptime(created_str, "%Y-%m-%d") + timedelta(days=res_days)
resolved_str = resolved_dt.strftime("%Y-%m-%d")
tickets.append((tid, cid, cat, pri, status, created_str, resolved_str))
conn.executemany(
"INSERT OR IGNORE INTO support_tickets VALUES (?,?,?,?,?,?,?)", tickets
)
conn.commit()
conn.close()
def get_schema_summary(db_path: str = DB_PATH) -> dict[str, list[str]]:
"""Return {table: ["col (TYPE)", …]} for all tables."""
conn = sqlite3.connect(db_path)
tables = ["customers", "products", "orders", "order_items", "support_tickets"]
summary: dict[str, list[str]] = {}
for table in tables:
rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
summary[table] = [f"{r[1]} ({r[2]})" for r in rows]
conn.close()
return summary