Spaces:
Sleeping
Sleeping
| """ | |
| DataClerk OpenEnv β SQLite database setup and seeding. | |
| Uses a fixed random seed (42) so every run produces identical data | |
| and task graders remain deterministic. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import random | |
| import sqlite3 | |
| from datetime import datetime, timedelta | |
| DB_PATH: str = os.environ.get("DB_PATH", "/tmp/dataclerk.db") | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # Schema | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| SCHEMA_SQL = """ | |
| CREATE TABLE IF NOT EXISTS customers ( | |
| id INTEGER PRIMARY KEY, | |
| name TEXT NOT NULL, | |
| email TEXT UNIQUE NOT NULL, | |
| city TEXT, | |
| country TEXT, | |
| tier TEXT DEFAULT 'standard', | |
| created_at DATE NOT NULL | |
| ); | |
| CREATE TABLE IF NOT EXISTS products ( | |
| id INTEGER PRIMARY KEY, | |
| name TEXT NOT NULL, | |
| category TEXT NOT NULL, | |
| base_price REAL NOT NULL, | |
| stock_quantity INTEGER NOT NULL DEFAULT 0 | |
| ); | |
| CREATE TABLE IF NOT EXISTS orders ( | |
| id INTEGER PRIMARY KEY, | |
| customer_id INTEGER NOT NULL, | |
| status TEXT NOT NULL DEFAULT 'completed', | |
| total_amount REAL NOT NULL, | |
| created_at DATE NOT NULL, | |
| FOREIGN KEY (customer_id) REFERENCES customers(id) | |
| ); | |
| CREATE TABLE IF NOT EXISTS order_items ( | |
| id INTEGER PRIMARY KEY, | |
| order_id INTEGER NOT NULL, | |
| product_id INTEGER NOT NULL, | |
| quantity INTEGER NOT NULL, | |
| unit_price REAL NOT NULL, | |
| FOREIGN KEY (order_id) REFERENCES orders(id), | |
| FOREIGN KEY (product_id) REFERENCES products(id) | |
| ); | |
| CREATE TABLE IF NOT EXISTS support_tickets ( | |
| id INTEGER PRIMARY KEY, | |
| customer_id INTEGER NOT NULL, | |
| category TEXT NOT NULL, | |
| priority TEXT NOT NULL, | |
| status TEXT NOT NULL, | |
| created_at DATE NOT NULL, | |
| resolved_at DATE, | |
| FOREIGN KEY (customer_id) REFERENCES customers(id) | |
| ); | |
| """ | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # Seed data | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| _CITIES = [ | |
| ("New York", "US"), ("Los Angeles", "US"), ("Chicago", "US"), | |
| ("London", "UK"), ("Manchester", "UK"), | |
| ("Berlin", "DE"), ("Munich", "DE"), | |
| ("Paris", "FR"), | |
| ("Tokyo", "JP"), ("Osaka", "JP"), | |
| ("Sydney", "AU"), | |
| ("Toronto", "CA"), | |
| ("Mumbai", "IN"), ("Bangalore", "IN"), | |
| ] | |
| _TIERS = ["standard", "premium", "enterprise"] | |
| _TIER_WEIGHTS = [0.60, 0.30, 0.10] | |
| _CATEGORIES_PRODUCTS: dict[str, list[tuple[str, float]]] = { | |
| "Electronics": [ | |
| ("Laptop Pro 15", 1299.99), | |
| ("Wireless Noise-Cancelling Headphones", 199.99), | |
| ("Smart Watch Series 5", 299.99), | |
| ("Tablet 10-inch", 499.99), | |
| ("Mechanical Keyboard", 149.99), | |
| ("Webcam 4K", 119.99), | |
| ("USB-C Hub 7-port", 59.99), | |
| ("Phone Case Premium", 29.99), | |
| ], | |
| "Clothing": [ | |
| ("Running Shoes Pro", 89.99), | |
| ("Insulated Winter Jacket", 179.99), | |
| ("Slim-Fit Denim Jeans", 69.99), | |
| ("Organic Cotton T-Shirt 3-pack", 34.99), | |
| ("High-Support Sports Bra", 49.99), | |
| ("Lightweight Casual Sneakers", 74.99), | |
| ("Merino Wool Sweater", 99.99), | |
| ("Waterproof Hiking Boots", 139.99), | |
| ], | |
| "Food & Beverage": [ | |
| ("Whey Protein Powder 2kg", 54.99), | |
| ("Single-Origin Coffee Beans 500g", 22.99), | |
| ("Premium Green Tea Set", 27.99), | |
| ("Protein Energy Bars 24-pack", 39.99), | |
| ("Daily Multivitamin Pack 90ct", 32.99), | |
| ("Raw Organic Honey 500g", 16.99), | |
| ("Collagen Supplement 60ct", 44.99), | |
| ], | |
| "Sports": [ | |
| ("Premium Yoga Mat 6mm", 44.99), | |
| ("Adjustable Dumbbell Set 40kg", 129.99), | |
| ("Speed Jump Rope", 24.99), | |
| ("Resistance Bands Set 5-level", 29.99), | |
| ("Insulated Water Bottle 1L", 34.99), | |
| ("Durable Gym Bag 40L", 59.99), | |
| ("Foam Roller High-Density", 39.99), | |
| ], | |
| "Home & Garden": [ | |
| ("HEPA Air Purifier Large Room", 249.99), | |
| ("Ceramic Plant Pot Set 3-pc", 39.99), | |
| ("LED Desk Lamp with USB", 54.99), | |
| ("Stackable Storage Organizer", 34.99), | |
| ("12-cup Programmable Coffee Maker", 89.99), | |
| ("High-Speed Blender 1200W", 79.99), | |
| ("Bamboo Cutting Board Set", 29.99), | |
| ], | |
| } | |
| _ORDER_STATUSES = ["completed"] * 8 + ["refunded"] * 1 + ["pending"] * 1 | |
| _TICKET_CATEGORIES = ["billing", "technical", "shipping", "returns", "general"] | |
| _TICKET_PRIORITIES = ["low", "medium", "high", "urgent"] | |
| _TICKET_PRI_WEIGHTS = [0.25, 0.40, 0.25, 0.10] | |
| _TICKET_STATUSES = ["open", "in_progress", "resolved", "closed"] | |
| _TICKET_STATUS_WEIGHTS = [0.15, 0.10, 0.45, 0.30] | |
| # Resolution days per priority (used to seed resolved_at) | |
| _RESOLUTION_DAYS: dict[str, tuple[int, int]] = { | |
| "urgent": (1, 3), | |
| "high": (2, 7), | |
| "medium": (4, 14), | |
| "low": (7, 21), | |
| } | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # Public API | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| def seed_database(db_path: str = DB_PATH) -> None: | |
| """Create the database schema and insert deterministic seed data.""" | |
| conn = sqlite3.connect(db_path) | |
| conn.executescript(SCHEMA_SQL) | |
| # Skip if already populated | |
| if conn.execute("SELECT COUNT(*) FROM customers").fetchone()[0] > 0: | |
| conn.close() | |
| return | |
| rng = random.Random(42) # fixed seed β deterministic answers | |
| today = datetime(2025, 6, 15) # fixed "today" for reproducibility | |
| def days_ago(n: int) -> str: | |
| return (today - timedelta(days=n)).strftime("%Y-%m-%d") | |
| # ββ Customers (200) ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| customers = [] | |
| for i in range(1, 201): | |
| city, country = rng.choice(_CITIES) | |
| tier = rng.choices(_TIERS, weights=_TIER_WEIGHTS)[0] | |
| created = days_ago(rng.randint(60, 900)) | |
| customers.append( | |
| (i, f"Customer_{i:03d}", f"user{i}@example.com", city, country, tier, created) | |
| ) | |
| conn.executemany( | |
| "INSERT OR IGNORE INTO customers VALUES (?,?,?,?,?,?,?)", customers | |
| ) | |
| # ββ Products (37) ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| products = [] | |
| pid = 1 | |
| for category, items in _CATEGORIES_PRODUCTS.items(): | |
| for name, price in items: | |
| stock = rng.randint(0, 300) | |
| products.append((pid, name, category, price, stock)) | |
| pid += 1 | |
| conn.executemany("INSERT OR IGNORE INTO products VALUES (?,?,?,?,?)", products) | |
| # ββ Orders + items (1 800 orders) ββββββββββββββββββββββββββββββββββββββββ | |
| orders: list[tuple] = [] | |
| order_items: list[tuple] = [] | |
| oid = 1 | |
| iid = 1 | |
| # Spread orders over last 400 days; heavier in recent 180 days | |
| all_customer_ids = list(range(1, 201)) | |
| for _ in range(1800): | |
| cid = rng.choice(all_customer_ids) | |
| # ~60 % of orders in last 180 days | |
| if rng.random() < 0.60: | |
| days_back = rng.randint(0, 179) | |
| else: | |
| days_back = rng.randint(180, 400) | |
| order_date = days_ago(days_back) | |
| status = rng.choice(_ORDER_STATUSES) | |
| n_items = rng.randint(1, 4) | |
| selected = rng.sample(products, n_items) | |
| total = 0.0 | |
| for prod in selected: | |
| qty = rng.randint(1, 3) | |
| price = round(prod[3] * rng.uniform(0.92, 1.08), 2) | |
| total += qty * price | |
| order_items.append((iid, oid, prod[0], qty, price)) | |
| iid += 1 | |
| orders.append((oid, cid, status, round(total, 2), order_date)) | |
| oid += 1 | |
| conn.executemany( | |
| "INSERT OR IGNORE INTO orders VALUES (?,?,?,?,?)", orders | |
| ) | |
| conn.executemany( | |
| "INSERT OR IGNORE INTO order_items VALUES (?,?,?,?,?)", order_items | |
| ) | |
| # ββ Support tickets (600) ββββββββββββββββββββββββββββββββββββββββββββββββ | |
| tickets: list[tuple] = [] | |
| for tid in range(1, 601): | |
| cid = rng.randint(1, 200) | |
| cat = rng.choice(_TICKET_CATEGORIES) | |
| pri = rng.choices(_TICKET_PRIORITIES, weights=_TICKET_PRI_WEIGHTS)[0] | |
| status = rng.choices(_TICKET_STATUSES, weights=_TICKET_STATUS_WEIGHTS)[0] | |
| created_days = rng.randint(0, 270) | |
| created_str = days_ago(created_days) | |
| resolved_str = None | |
| if status in ("resolved", "closed"): | |
| lo, hi = _RESOLUTION_DAYS[pri] | |
| res_days = rng.randint(lo, hi) | |
| resolved_dt = datetime.strptime(created_str, "%Y-%m-%d") + timedelta(days=res_days) | |
| resolved_str = resolved_dt.strftime("%Y-%m-%d") | |
| tickets.append((tid, cid, cat, pri, status, created_str, resolved_str)) | |
| conn.executemany( | |
| "INSERT OR IGNORE INTO support_tickets VALUES (?,?,?,?,?,?,?)", tickets | |
| ) | |
| conn.commit() | |
| conn.close() | |
| def get_schema_summary(db_path: str = DB_PATH) -> dict[str, list[str]]: | |
| """Return {table: ["col (TYPE)", β¦]} for all tables.""" | |
| conn = sqlite3.connect(db_path) | |
| tables = ["customers", "products", "orders", "order_items", "support_tickets"] | |
| summary: dict[str, list[str]] = {} | |
| for table in tables: | |
| rows = conn.execute(f"PRAGMA table_info({table})").fetchall() | |
| summary[table] = [f"{r[1]} ({r[2]})" for r in rows] | |
| conn.close() | |
| return summary | |