File size: 10,179 Bytes
2a352e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
"""
DataClerk OpenEnv β€” SQLite database setup and seeding.

Uses a fixed random seed (42) so every run produces identical data
and task graders remain deterministic.
"""

from __future__ import annotations

import os
import random
import sqlite3
from datetime import datetime, timedelta

DB_PATH: str = os.environ.get("DB_PATH", "/tmp/dataclerk.db")

# ─────────────────────────────────────────────
#  Schema
# ─────────────────────────────────────────────

SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS customers (
    id          INTEGER PRIMARY KEY,
    name        TEXT    NOT NULL,
    email       TEXT    UNIQUE NOT NULL,
    city        TEXT,
    country     TEXT,
    tier        TEXT    DEFAULT 'standard',
    created_at  DATE    NOT NULL
);

CREATE TABLE IF NOT EXISTS products (
    id             INTEGER PRIMARY KEY,
    name           TEXT    NOT NULL,
    category       TEXT    NOT NULL,
    base_price     REAL    NOT NULL,
    stock_quantity INTEGER NOT NULL DEFAULT 0
);

CREATE TABLE IF NOT EXISTS orders (
    id            INTEGER PRIMARY KEY,
    customer_id   INTEGER NOT NULL,
    status        TEXT    NOT NULL DEFAULT 'completed',
    total_amount  REAL    NOT NULL,
    created_at    DATE    NOT NULL,
    FOREIGN KEY (customer_id) REFERENCES customers(id)
);

CREATE TABLE IF NOT EXISTS order_items (
    id          INTEGER PRIMARY KEY,
    order_id    INTEGER NOT NULL,
    product_id  INTEGER NOT NULL,
    quantity    INTEGER NOT NULL,
    unit_price  REAL    NOT NULL,
    FOREIGN KEY (order_id)   REFERENCES orders(id),
    FOREIGN KEY (product_id) REFERENCES products(id)
);

CREATE TABLE IF NOT EXISTS support_tickets (
    id           INTEGER PRIMARY KEY,
    customer_id  INTEGER NOT NULL,
    category     TEXT    NOT NULL,
    priority     TEXT    NOT NULL,
    status       TEXT    NOT NULL,
    created_at   DATE    NOT NULL,
    resolved_at  DATE,
    FOREIGN KEY (customer_id) REFERENCES customers(id)
);
"""

# ─────────────────────────────────────────────
#  Seed data
# ─────────────────────────────────────────────

_CITIES = [
    ("New York", "US"), ("Los Angeles", "US"), ("Chicago", "US"),
    ("London", "UK"), ("Manchester", "UK"),
    ("Berlin", "DE"), ("Munich", "DE"),
    ("Paris", "FR"),
    ("Tokyo", "JP"), ("Osaka", "JP"),
    ("Sydney", "AU"),
    ("Toronto", "CA"),
    ("Mumbai", "IN"), ("Bangalore", "IN"),
]

_TIERS = ["standard", "premium", "enterprise"]
_TIER_WEIGHTS = [0.60, 0.30, 0.10]

_CATEGORIES_PRODUCTS: dict[str, list[tuple[str, float]]] = {
    "Electronics": [
        ("Laptop Pro 15", 1299.99),
        ("Wireless Noise-Cancelling Headphones", 199.99),
        ("Smart Watch Series 5", 299.99),
        ("Tablet 10-inch", 499.99),
        ("Mechanical Keyboard", 149.99),
        ("Webcam 4K", 119.99),
        ("USB-C Hub 7-port", 59.99),
        ("Phone Case Premium", 29.99),
    ],
    "Clothing": [
        ("Running Shoes Pro", 89.99),
        ("Insulated Winter Jacket", 179.99),
        ("Slim-Fit Denim Jeans", 69.99),
        ("Organic Cotton T-Shirt 3-pack", 34.99),
        ("High-Support Sports Bra", 49.99),
        ("Lightweight Casual Sneakers", 74.99),
        ("Merino Wool Sweater", 99.99),
        ("Waterproof Hiking Boots", 139.99),
    ],
    "Food & Beverage": [
        ("Whey Protein Powder 2kg", 54.99),
        ("Single-Origin Coffee Beans 500g", 22.99),
        ("Premium Green Tea Set", 27.99),
        ("Protein Energy Bars 24-pack", 39.99),
        ("Daily Multivitamin Pack 90ct", 32.99),
        ("Raw Organic Honey 500g", 16.99),
        ("Collagen Supplement 60ct", 44.99),
    ],
    "Sports": [
        ("Premium Yoga Mat 6mm", 44.99),
        ("Adjustable Dumbbell Set 40kg", 129.99),
        ("Speed Jump Rope", 24.99),
        ("Resistance Bands Set 5-level", 29.99),
        ("Insulated Water Bottle 1L", 34.99),
        ("Durable Gym Bag 40L", 59.99),
        ("Foam Roller High-Density", 39.99),
    ],
    "Home & Garden": [
        ("HEPA Air Purifier Large Room", 249.99),
        ("Ceramic Plant Pot Set 3-pc", 39.99),
        ("LED Desk Lamp with USB", 54.99),
        ("Stackable Storage Organizer", 34.99),
        ("12-cup Programmable Coffee Maker", 89.99),
        ("High-Speed Blender 1200W", 79.99),
        ("Bamboo Cutting Board Set", 29.99),
    ],
}

_ORDER_STATUSES = ["completed"] * 8 + ["refunded"] * 1 + ["pending"] * 1
_TICKET_CATEGORIES = ["billing", "technical", "shipping", "returns", "general"]
_TICKET_PRIORITIES = ["low", "medium", "high", "urgent"]
_TICKET_PRI_WEIGHTS = [0.25, 0.40, 0.25, 0.10]
_TICKET_STATUSES = ["open", "in_progress", "resolved", "closed"]
_TICKET_STATUS_WEIGHTS = [0.15, 0.10, 0.45, 0.30]

# Resolution days per priority (used to seed resolved_at)
_RESOLUTION_DAYS: dict[str, tuple[int, int]] = {
    "urgent": (1, 3),
    "high": (2, 7),
    "medium": (4, 14),
    "low": (7, 21),
}

# ─────────────────────────────────────────────
#  Public API
# ─────────────────────────────────────────────

def seed_database(db_path: str = DB_PATH) -> None:
    """Create the database schema and insert deterministic seed data."""
    conn = sqlite3.connect(db_path)
    conn.executescript(SCHEMA_SQL)

    # Skip if already populated
    if conn.execute("SELECT COUNT(*) FROM customers").fetchone()[0] > 0:
        conn.close()
        return

    rng = random.Random(42)  # fixed seed β†’ deterministic answers
    today = datetime(2025, 6, 15)  # fixed "today" for reproducibility

    def days_ago(n: int) -> str:
        return (today - timedelta(days=n)).strftime("%Y-%m-%d")

    # ── Customers (200) ──────────────────────────────────────────────────────
    customers = []
    for i in range(1, 201):
        city, country = rng.choice(_CITIES)
        tier = rng.choices(_TIERS, weights=_TIER_WEIGHTS)[0]
        created = days_ago(rng.randint(60, 900))
        customers.append(
            (i, f"Customer_{i:03d}", f"user{i}@example.com", city, country, tier, created)
        )
    conn.executemany(
        "INSERT OR IGNORE INTO customers VALUES (?,?,?,?,?,?,?)", customers
    )

    # ── Products (37) ────────────────────────────────────────────────────────
    products = []
    pid = 1
    for category, items in _CATEGORIES_PRODUCTS.items():
        for name, price in items:
            stock = rng.randint(0, 300)
            products.append((pid, name, category, price, stock))
            pid += 1
    conn.executemany("INSERT OR IGNORE INTO products VALUES (?,?,?,?,?)", products)

    # ── Orders + items (1 800 orders) ────────────────────────────────────────
    orders: list[tuple] = []
    order_items: list[tuple] = []
    oid = 1
    iid = 1

    # Spread orders over last 400 days; heavier in recent 180 days
    all_customer_ids = list(range(1, 201))

    for _ in range(1800):
        cid = rng.choice(all_customer_ids)
        # ~60 % of orders in last 180 days
        if rng.random() < 0.60:
            days_back = rng.randint(0, 179)
        else:
            days_back = rng.randint(180, 400)
        order_date = days_ago(days_back)
        status = rng.choice(_ORDER_STATUSES)

        n_items = rng.randint(1, 4)
        selected = rng.sample(products, n_items)

        total = 0.0
        for prod in selected:
            qty = rng.randint(1, 3)
            price = round(prod[3] * rng.uniform(0.92, 1.08), 2)
            total += qty * price
            order_items.append((iid, oid, prod[0], qty, price))
            iid += 1

        orders.append((oid, cid, status, round(total, 2), order_date))
        oid += 1

    conn.executemany(
        "INSERT OR IGNORE INTO orders VALUES (?,?,?,?,?)", orders
    )
    conn.executemany(
        "INSERT OR IGNORE INTO order_items VALUES (?,?,?,?,?)", order_items
    )

    # ── Support tickets (600) ────────────────────────────────────────────────
    tickets: list[tuple] = []
    for tid in range(1, 601):
        cid = rng.randint(1, 200)
        cat = rng.choice(_TICKET_CATEGORIES)
        pri = rng.choices(_TICKET_PRIORITIES, weights=_TICKET_PRI_WEIGHTS)[0]
        status = rng.choices(_TICKET_STATUSES, weights=_TICKET_STATUS_WEIGHTS)[0]
        created_days = rng.randint(0, 270)
        created_str = days_ago(created_days)

        resolved_str = None
        if status in ("resolved", "closed"):
            lo, hi = _RESOLUTION_DAYS[pri]
            res_days = rng.randint(lo, hi)
            resolved_dt = datetime.strptime(created_str, "%Y-%m-%d") + timedelta(days=res_days)
            resolved_str = resolved_dt.strftime("%Y-%m-%d")

        tickets.append((tid, cid, cat, pri, status, created_str, resolved_str))

    conn.executemany(
        "INSERT OR IGNORE INTO support_tickets VALUES (?,?,?,?,?,?,?)", tickets
    )

    conn.commit()
    conn.close()


def get_schema_summary(db_path: str = DB_PATH) -> dict[str, list[str]]:
    """Return {table: ["col (TYPE)", …]} for all tables."""
    conn = sqlite3.connect(db_path)
    tables = ["customers", "products", "orders", "order_items", "support_tickets"]
    summary: dict[str, list[str]] = {}
    for table in tables:
        rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
        summary[table] = [f"{r[1]} ({r[2]})" for r in rows]
    conn.close()
    return summary