nl2sql-bench / data_factory /schemas.py
ritvik360's picture
Upload folder using huggingface_hub
a39d8ef verified
"""
data_factory/schemas.py
========================
SQLite CREATE TABLE statements for all four domains.
Each schema is fully self-contained and has been verified to create
without errors in SQLite 3.x.
"""
from __future__ import annotations
import sqlite3
import random
from datetime import date, timedelta
from typing import Callable
# ─────────────────────────────────────────────────────────────────────────────
# SQL SCHEMAS
# ─────────────────────────────────────────────────────────────────────────────
ECOMMERCE_SCHEMA = """
CREATE TABLE IF NOT EXISTS categories (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS products (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
category_id INTEGER NOT NULL REFERENCES categories(id),
price REAL NOT NULL CHECK(price >= 0),
stock_quantity INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS customers (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
email TEXT NOT NULL UNIQUE,
country TEXT NOT NULL,
tier TEXT NOT NULL DEFAULT 'bronze'
CHECK(tier IN ('bronze', 'silver', 'gold')),
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS orders (
id INTEGER PRIMARY KEY,
customer_id INTEGER NOT NULL REFERENCES customers(id),
status TEXT NOT NULL DEFAULT 'pending'
CHECK(status IN ('pending','processing','shipped','delivered','cancelled')),
created_at TEXT NOT NULL,
total_amount REAL NOT NULL CHECK(total_amount >= 0)
);
CREATE TABLE IF NOT EXISTS order_items (
id INTEGER PRIMARY KEY,
order_id INTEGER NOT NULL REFERENCES orders(id),
product_id INTEGER NOT NULL REFERENCES products(id),
quantity INTEGER NOT NULL CHECK(quantity > 0),
unit_price REAL NOT NULL CHECK(unit_price >= 0)
);
CREATE TABLE IF NOT EXISTS reviews (
id INTEGER PRIMARY KEY,
product_id INTEGER NOT NULL REFERENCES products(id),
customer_id INTEGER NOT NULL REFERENCES customers(id),
rating INTEGER NOT NULL CHECK(rating BETWEEN 1 AND 5),
created_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_products_category ON products(category_id);
CREATE INDEX IF NOT EXISTS idx_orders_customer ON orders(customer_id);
CREATE INDEX IF NOT EXISTS idx_orders_status ON orders(status);
CREATE INDEX IF NOT EXISTS idx_orders_created ON orders(created_at);
CREATE INDEX IF NOT EXISTS idx_order_items_order ON order_items(order_id);
CREATE INDEX IF NOT EXISTS idx_order_items_product ON order_items(product_id);
CREATE INDEX IF NOT EXISTS idx_reviews_product ON reviews(product_id);
CREATE INDEX IF NOT EXISTS idx_customers_tier ON customers(tier);
"""
HEALTHCARE_SCHEMA = """
CREATE TABLE IF NOT EXISTS patients (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
date_of_birth TEXT NOT NULL,
gender TEXT NOT NULL CHECK(gender IN ('M','F','Other')),
blood_type TEXT NOT NULL,
country TEXT NOT NULL,
registered_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS doctors (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
specialization TEXT NOT NULL,
department TEXT NOT NULL,
experience_years INTEGER NOT NULL CHECK(experience_years >= 0),
consultation_fee REAL NOT NULL CHECK(consultation_fee >= 0)
);
CREATE TABLE IF NOT EXISTS appointments (
id INTEGER PRIMARY KEY,
patient_id INTEGER NOT NULL REFERENCES patients(id),
doctor_id INTEGER NOT NULL REFERENCES doctors(id),
scheduled_at TEXT NOT NULL,
status TEXT NOT NULL
CHECK(status IN ('scheduled','completed','cancelled','no_show')),
notes TEXT
);
CREATE TABLE IF NOT EXISTS diagnoses (
id INTEGER PRIMARY KEY,
appointment_id INTEGER NOT NULL REFERENCES appointments(id),
icd_code TEXT NOT NULL,
description TEXT NOT NULL,
severity TEXT NOT NULL CHECK(severity IN ('mild','moderate','severe'))
);
CREATE TABLE IF NOT EXISTS medications (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
category TEXT NOT NULL,
unit_price REAL NOT NULL CHECK(unit_price >= 0)
);
CREATE TABLE IF NOT EXISTS prescriptions (
id INTEGER PRIMARY KEY,
appointment_id INTEGER NOT NULL REFERENCES appointments(id),
medication_id INTEGER NOT NULL REFERENCES medications(id),
dosage TEXT NOT NULL,
duration_days INTEGER NOT NULL CHECK(duration_days > 0),
quantity INTEGER NOT NULL CHECK(quantity > 0)
);
CREATE INDEX IF NOT EXISTS idx_appt_patient ON appointments(patient_id);
CREATE INDEX IF NOT EXISTS idx_appt_doctor ON appointments(doctor_id);
CREATE INDEX IF NOT EXISTS idx_appt_status ON appointments(status);
CREATE INDEX IF NOT EXISTS idx_diag_appt ON diagnoses(appointment_id);
CREATE INDEX IF NOT EXISTS idx_presc_appt ON prescriptions(appointment_id);
CREATE INDEX IF NOT EXISTS idx_presc_med ON prescriptions(medication_id);
"""
FINANCE_SCHEMA = """
CREATE TABLE IF NOT EXISTS fin_customers (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
email TEXT NOT NULL UNIQUE,
country TEXT NOT NULL,
kyc_status TEXT NOT NULL CHECK(kyc_status IN ('pending','verified','rejected')),
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS accounts (
id INTEGER PRIMARY KEY,
customer_id INTEGER NOT NULL REFERENCES fin_customers(id),
account_type TEXT NOT NULL
CHECK(account_type IN ('savings','current','fixed_deposit','loan')),
balance REAL NOT NULL DEFAULT 0,
currency TEXT NOT NULL DEFAULT 'USD',
status TEXT NOT NULL CHECK(status IN ('active','dormant','closed')),
opened_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS transactions (
id INTEGER PRIMARY KEY,
account_id INTEGER NOT NULL REFERENCES accounts(id),
txn_type TEXT NOT NULL CHECK(txn_type IN ('credit','debit')),
amount REAL NOT NULL CHECK(amount > 0),
currency TEXT NOT NULL DEFAULT 'USD',
merchant TEXT,
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS loans (
id INTEGER PRIMARY KEY,
customer_id INTEGER NOT NULL REFERENCES fin_customers(id),
loan_type TEXT NOT NULL
CHECK(loan_type IN ('personal','home','auto','business')),
principal_amount REAL NOT NULL,
interest_rate REAL NOT NULL,
tenure_months INTEGER NOT NULL,
status TEXT NOT NULL CHECK(status IN ('active','closed','defaulted')),
disbursed_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS loan_payments (
id INTEGER PRIMARY KEY,
loan_id INTEGER NOT NULL REFERENCES loans(id),
amount_paid REAL NOT NULL CHECK(amount_paid > 0),
payment_date TEXT NOT NULL,
is_late INTEGER NOT NULL DEFAULT 0 CHECK(is_late IN (0,1))
);
CREATE INDEX IF NOT EXISTS idx_acct_customer ON accounts(customer_id);
CREATE INDEX IF NOT EXISTS idx_txn_account ON transactions(account_id);
CREATE INDEX IF NOT EXISTS idx_txn_type ON transactions(txn_type);
CREATE INDEX IF NOT EXISTS idx_loan_customer ON loans(customer_id);
CREATE INDEX IF NOT EXISTS idx_lp_loan ON loan_payments(loan_id);
"""
HR_SCHEMA = """
CREATE TABLE IF NOT EXISTS departments (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE,
location TEXT NOT NULL,
budget REAL NOT NULL CHECK(budget >= 0)
);
CREATE TABLE IF NOT EXISTS employees (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
email TEXT NOT NULL UNIQUE,
department_id INTEGER NOT NULL REFERENCES departments(id),
job_title TEXT NOT NULL,
hire_date TEXT NOT NULL,
salary REAL NOT NULL CHECK(salary >= 0),
status TEXT NOT NULL CHECK(status IN ('active','resigned','terminated'))
);
CREATE TABLE IF NOT EXISTS performance_reviews (
id INTEGER PRIMARY KEY,
employee_id INTEGER NOT NULL REFERENCES employees(id),
review_year INTEGER NOT NULL,
rating INTEGER NOT NULL CHECK(rating BETWEEN 1 AND 5),
reviewer_id INTEGER NOT NULL REFERENCES employees(id),
comments TEXT
);
CREATE TABLE IF NOT EXISTS projects (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
department_id INTEGER NOT NULL REFERENCES departments(id),
start_date TEXT NOT NULL,
end_date TEXT,
budget REAL NOT NULL,
status TEXT NOT NULL
CHECK(status IN ('planned','active','completed','cancelled'))
);
CREATE TABLE IF NOT EXISTS project_assignments (
id INTEGER PRIMARY KEY,
employee_id INTEGER NOT NULL REFERENCES employees(id),
project_id INTEGER NOT NULL REFERENCES projects(id),
role TEXT NOT NULL,
hours_allocated INTEGER NOT NULL CHECK(hours_allocated > 0)
);
CREATE INDEX IF NOT EXISTS idx_emp_dept ON employees(department_id);
CREATE INDEX IF NOT EXISTS idx_emp_status ON employees(status);
CREATE INDEX IF NOT EXISTS idx_pr_employee ON performance_reviews(employee_id);
CREATE INDEX IF NOT EXISTS idx_proj_dept ON projects(department_id);
CREATE INDEX IF NOT EXISTS idx_pa_employee ON project_assignments(employee_id);
CREATE INDEX IF NOT EXISTS idx_pa_project ON project_assignments(project_id);
"""
# ─────────────────────────────────────────────────────────────────────────────
# SCHEMA REGISTRY
# ─────────────────────────────────────────────────────────────────────────────
SCHEMA_MAP: dict[str, str] = {
"ecommerce": ECOMMERCE_SCHEMA,
"healthcare": HEALTHCARE_SCHEMA,
"finance": FINANCE_SCHEMA,
"hr": HR_SCHEMA,
}
# ─────────────────────────────────────────────────────────────────────────────
# COMPACT SCHEMA CONTEXT (injected into every training prompt)
# ─────────────────────────────────────────────────────────────────────────────
SCHEMA_CONTEXT: dict[str, str] = {
"ecommerce": """\
Database: ecommerce (SQLite, read-only)
TABLES
------
categories(id INTEGER PK, name TEXT)
products(id INTEGER PK, name TEXT, category_id INTEGER FK→categories.id, price REAL, stock_quantity INTEGER)
customers(id INTEGER PK, name TEXT, email TEXT, country TEXT, tier TEXT ∈ {bronze|silver|gold}, created_at TEXT ISO-8601)
orders(id INTEGER PK, customer_id INTEGER FKβ†’customers.id, status TEXT ∈ {pending|processing|shipped|delivered|cancelled}, created_at TEXT ISO-8601, total_amount REAL)
order_items(id INTEGER PK, order_id INTEGER FK→orders.id, product_id INTEGER FK→products.id, quantity INTEGER, unit_price REAL)
reviews(id INTEGER PK, product_id INTEGER FK→products.id, customer_id INTEGER FK→customers.id, rating INTEGER 1-5, created_at TEXT ISO-8601)
NOTES
-----
- Use created_at >= '2024-01-01' for date filtering (ISO text sort works)
- SQLite window functions: RANK, DENSE_RANK, ROW_NUMBER, LAG, LEAD
- strftime('%Y-%m', created_at) returns 'YYYY-MM'
- All monetary values in USD
""",
"healthcare": """\
Database: healthcare (SQLite, read-only)
TABLES
------
patients(id INTEGER PK, name TEXT, date_of_birth TEXT ISO-8601, gender TEXT ∈ {M|F|Other}, blood_type TEXT, country TEXT, registered_at TEXT ISO-8601)
doctors(id INTEGER PK, name TEXT, specialization TEXT, department TEXT, experience_years INTEGER, consultation_fee REAL)
appointments(id INTEGER PK, patient_id INTEGER FKβ†’patients.id, doctor_id INTEGER FKβ†’doctors.id, scheduled_at TEXT ISO-8601, status TEXT ∈ {scheduled|completed|cancelled|no_show}, notes TEXT nullable)
diagnoses(id INTEGER PK, appointment_id INTEGER FKβ†’appointments.id, icd_code TEXT, description TEXT, severity TEXT ∈ {mild|moderate|severe})
medications(id INTEGER PK, name TEXT, category TEXT, unit_price REAL)
prescriptions(id INTEGER PK, appointment_id INTEGER FK→appointments.id, medication_id INTEGER FK→medications.id, dosage TEXT, duration_days INTEGER, quantity INTEGER)
NOTES
-----
- consultation_fee is in USD per visit
- ICD codes follow WHO ICD-10 format (e.g. 'I10', 'E11')
- SQLite window functions available
""",
"finance": """\
Database: finance (SQLite, read-only)
TABLES
------
fin_customers(id INTEGER PK, name TEXT, email TEXT, country TEXT, kyc_status TEXT ∈ {pending|verified|rejected}, created_at TEXT ISO-8601)
accounts(id INTEGER PK, customer_id INTEGER FKβ†’fin_customers.id, account_type TEXT ∈ {savings|current|fixed_deposit|loan}, balance REAL, currency TEXT, status TEXT ∈ {active|dormant|closed}, opened_at TEXT ISO-8601)
transactions(id INTEGER PK, account_id INTEGER FKβ†’accounts.id, txn_type TEXT ∈ {credit|debit}, amount REAL, currency TEXT, merchant TEXT nullable, created_at TEXT ISO-8601)
loans(id INTEGER PK, customer_id INTEGER FKβ†’fin_customers.id, loan_type TEXT ∈ {personal|home|auto|business}, principal_amount REAL, interest_rate REAL, tenure_months INTEGER, status TEXT ∈ {active|closed|defaulted}, disbursed_at TEXT ISO-8601)
loan_payments(id INTEGER PK, loan_id INTEGER FKβ†’loans.id, amount_paid REAL, payment_date TEXT ISO-8601, is_late INTEGER ∈ {0|1})
NOTES
-----
- All monetary values in USD unless currency column specifies otherwise
- is_late = 1 means the payment was overdue
- SQLite window functions available
""",
"hr": """\
Database: hr (SQLite, read-only)
TABLES
------
departments(id INTEGER PK, name TEXT, location TEXT, budget REAL)
employees(id INTEGER PK, name TEXT, email TEXT, department_id INTEGER FKβ†’departments.id, job_title TEXT, hire_date TEXT ISO-8601, salary REAL, status TEXT ∈ {active|resigned|terminated})
performance_reviews(id INTEGER PK, employee_id INTEGER FK→employees.id, review_year INTEGER, rating INTEGER 1-5, reviewer_id INTEGER FK→employees.id, comments TEXT nullable)
projects(id INTEGER PK, name TEXT, department_id INTEGER FKβ†’departments.id, start_date TEXT ISO-8601, end_date TEXT nullable, budget REAL, status TEXT ∈ {planned|active|completed|cancelled})
project_assignments(id INTEGER PK, employee_id INTEGER FK→employees.id, project_id INTEGER FK→projects.id, role TEXT, hours_allocated INTEGER)
NOTES
-----
- salary is annual in USD
- performance rating: 1 (lowest) to 5 (highest)
- end_date is NULL for ongoing projects
- SQLite window functions available
""",
}
# ─────────────────────────────────────────────────────────────────────────────
# SEED FUNCTIONS (deterministic, SEED=42)
# ─────────────────────────────────────────────────────────────────────────────
def _rdate(rng: random.Random, start: str = "2022-01-01", end: str = "2024-12-31") -> str:
s = date.fromisoformat(start)
e = date.fromisoformat(end)
return (s + timedelta(days=rng.randint(0, (e - s).days))).isoformat()
def seed_ecommerce(conn: sqlite3.Connection, seed: int = 42) -> None:
rng = random.Random(seed)
cats = ["Electronics", "Clothing", "Books", "Home & Garden",
"Sports & Outdoors", "Toys & Games", "Beauty", "Automotive"]
conn.executemany("INSERT INTO categories(id,name) VALUES(?,?)", enumerate(cats, 1))
products = [
(1,"Wireless Headphones",1,149.99,50),(2,"Laptop Stand",1,59.99,120),
(3,"USB-C Hub",1,49.99,90),(4,"Webcam 4K",1,89.99,30),
(5,"Cotton T-Shirt",2,19.99,200),(6,"Winter Jacket",2,129.99,60),
(7,"Running Shorts",2,34.99,150),(8,"Clean Code",3,39.99,80),
(9,"Deep Learning Book",3,59.99,45),(10,"Coffee Maker",4,89.99,40),
(11,"Air Purifier",4,199.99,25),(12,"Yoga Mat",5,29.99,150),
(13,"Resistance Bands",5,14.99,200),(14,"Lego City Set",6,79.99,60),
(15,"Face Serum",7,34.99,100),(16,"Dash Cam",8,119.99,35),
]
conn.executemany("INSERT INTO products VALUES(?,?,?,?,?)", products)
countries = ["India","USA","Germany","UK","Canada","Australia","France","Brazil"]
tiers = ["bronze","silver","gold"]
customers = []
for i in range(1, 51):
customers.append((i, f"Customer {i}", f"cust{i}@shop.com",
rng.choice(countries), rng.choice(tiers), _rdate(rng)))
conn.executemany("INSERT INTO customers VALUES(?,?,?,?,?,?)", customers)
statuses = ["pending","processing","shipped","delivered","cancelled"]
orders = []
for i in range(1, 201):
orders.append((i, rng.randint(1, 50), rng.choice(statuses),
_rdate(rng), round(rng.uniform(20, 800), 2)))
conn.executemany("INSERT INTO orders VALUES(?,?,?,?,?)", orders)
items = []
for i in range(1, 301):
items.append((i, rng.randint(1, 200), rng.randint(1, 16),
rng.randint(1, 5), round(rng.uniform(10, 200), 2)))
conn.executemany("INSERT INTO order_items VALUES(?,?,?,?,?)", items)
reviews = []
for i in range(1, 151):
reviews.append((i, rng.randint(1, 16), rng.randint(1, 50),
rng.randint(1, 5), _rdate(rng)))
conn.executemany("INSERT INTO reviews VALUES(?,?,?,?,?)", reviews)
conn.commit()
def seed_healthcare(conn: sqlite3.Connection, seed: int = 42) -> None:
rng = random.Random(seed)
specs = [("Cardiology","Cardiology"), ("Neurology","Neurology"),
("Orthopedics","Orthopedics"), ("Dermatology","Dermatology"),
("Pediatrics","Pediatrics"), ("Oncology","Oncology"),
("Endocrinology","Endocrinology"), ("Gastroenterology","Gastroenterology")]
for i, (spec, dept) in enumerate(specs, 1):
conn.execute("INSERT INTO doctors VALUES(?,?,?,?,?,?)",
(i, f"Dr. {['Smith','Patel','Kim','MΓΌller','Okafor','Chen','Lopez','Roy'][i-1]}",
spec, dept, rng.randint(2, 25), round(rng.uniform(50, 350), 2)))
genders = ["M", "F", "Other"]
blood_types = ["A+","A-","B+","B-","O+","O-","AB+","AB-"]
countries = ["India","USA","Germany","UK","Canada","Australia"]
for i in range(1, 101):
conn.execute("INSERT INTO patients VALUES(?,?,?,?,?,?,?)",
(i, f"Patient {i}", _rdate(rng, "1950-01-01", "2010-01-01"),
rng.choice(genders), rng.choice(blood_types),
rng.choice(countries), _rdate(rng, "2020-01-01", "2024-12-31")))
appt_statuses = ["scheduled", "completed", "cancelled", "no_show"]
weights = [0.15, 0.60, 0.15, 0.10]
for i in range(1, 301):
conn.execute("INSERT INTO appointments VALUES(?,?,?,?,?,?)",
(i, rng.randint(1, 100), rng.randint(1, 8),
_rdate(rng, "2022-01-01", "2024-12-31"),
rng.choices(appt_statuses, weights)[0], None))
icd_codes = ["I10","E11","J45","M54","K21","F32","G43","L30","N39","R05",
"C50","Z87","I25","E78","J18"]
descs = ["Hypertension","Type 2 Diabetes","Asthma","Back Pain","GERD",
"Depression","Migraine","Dermatitis","UTI","Cough",
"Breast Cancer","Family History","Coronary Artery Disease",
"Hyperlipidemia","Pneumonia"]
severities = ["mild","moderate","severe"]
for i in range(1, 201):
conn.execute("INSERT INTO diagnoses VALUES(?,?,?,?,?)",
(i, rng.randint(1, 300), rng.choice(icd_codes),
rng.choice(descs), rng.choice(severities)))
meds = [("Metformin","Antidiabetic",0.15),("Lisinopril","Antihypertensive",0.20),
("Atorvastatin","Statin",0.25),("Amoxicillin","Antibiotic",0.30),
("Ibuprofen","NSAID",0.10),("Omeprazole","PPI",0.18),
("Sertraline","Antidepressant",0.35),("Cetirizine","Antihistamine",0.08),
("Paracetamol","Analgesic",0.05),("Aspirin","Antiplatelet",0.07)]
for i, (name, cat, price) in enumerate(meds, 1):
conn.execute("INSERT INTO medications VALUES(?,?,?,?)", (i, name, cat, price))
dosages = ["1x daily","2x daily","3x daily","once at night","as needed"]
for i in range(1, 251):
conn.execute("INSERT INTO prescriptions VALUES(?,?,?,?,?,?)",
(i, rng.randint(1, 300), rng.randint(1, 10),
rng.choice(dosages), rng.randint(5, 60), rng.randint(10, 90)))
conn.commit()
def seed_finance(conn: sqlite3.Connection, seed: int = 42) -> None:
rng = random.Random(seed)
countries = ["India","USA","Germany","UK","Singapore","UAE","Canada"]
kyc = ["pending","verified","verified","verified","rejected"]
for i in range(1, 51):
conn.execute("INSERT INTO fin_customers VALUES(?,?,?,?,?,?)",
(i, f"FinClient {i}", f"fincli{i}@bank.com",
rng.choice(countries), rng.choice(kyc), _rdate(rng)))
acct_types = ["savings","savings","current","fixed_deposit"]
statuses = ["active","active","active","dormant","closed"]
for i in range(1, 101):
conn.execute("INSERT INTO accounts VALUES(?,?,?,?,?,?,?)",
(i, rng.randint(1, 50), rng.choice(acct_types),
round(rng.uniform(100, 100000), 2), "USD",
rng.choice(statuses), _rdate(rng)))
merchants = [None, "Amazon", "Walmart", "Netflix", "Uber", "Apple",
"Google Pay", "Zomato", "Flipkart", "Airbnb"]
for i in range(1, 501):
conn.execute("INSERT INTO transactions VALUES(?,?,?,?,?,?,?)",
(i, rng.randint(1, 100), rng.choice(["credit","debit"]),
round(rng.uniform(5, 10000), 2), "USD",
rng.choice(merchants), _rdate(rng)))
loan_types = ["personal","home","auto","business"]
loan_statuses = ["active","active","closed","defaulted"]
for i in range(1, 51):
conn.execute("INSERT INTO loans VALUES(?,?,?,?,?,?,?,?)",
(i, rng.randint(1, 50), rng.choice(loan_types),
round(rng.uniform(5000, 500000), 2),
round(rng.uniform(5, 18), 2), rng.randint(12, 360),
rng.choice(loan_statuses), _rdate(rng)))
for i in range(1, 201):
conn.execute("INSERT INTO loan_payments VALUES(?,?,?,?,?)",
(i, rng.randint(1, 50), round(rng.uniform(500, 10000), 2),
_rdate(rng), rng.randint(0, 1)))
conn.commit()
def seed_hr(conn: sqlite3.Connection, seed: int = 42) -> None:
rng = random.Random(seed)
depts = [("Engineering","Bangalore",8000000),("Marketing","Mumbai",3000000),
("Finance","Delhi",2000000),("HR","Chennai",1500000),
("Sales","Hyderabad",5000000),("Product","Pune",4000000),
("Legal","Delhi",1000000),("Operations","Kolkata",2500000)]
for i, (name, loc, bud) in enumerate(depts, 1):
conn.execute("INSERT INTO departments VALUES(?,?,?,?)", (i, name, loc, bud))
titles = ["Software Engineer","Senior Engineer","Staff Engineer","Principal Engineer",
"Engineering Manager","Product Manager","Data Analyst","Data Scientist",
"Marketing Specialist","Sales Executive","HR Specialist","Finance Analyst",
"Director","VP","Legal Counsel"]
statuses = ["active","active","active","active","resigned","terminated"]
for i in range(1, 101):
conn.execute("INSERT INTO employees VALUES(?,?,?,?,?,?,?,?)",
(i, f"Employee {i}", f"emp{i}@corp.com",
rng.randint(1, 8), rng.choice(titles),
_rdate(rng, "2015-01-01", "2024-01-01"),
round(rng.uniform(25000, 200000), 2), rng.choice(statuses)))
for i in range(1, 201):
conn.execute("INSERT INTO performance_reviews VALUES(?,?,?,?,?,?)",
(i, rng.randint(1, 100), rng.randint(2019, 2024),
rng.randint(1, 5), rng.randint(1, 100),
rng.choice(["Excellent work","Good performance","Needs improvement",
"Outstanding","Meeting expectations"])))
proj_statuses = ["planned","active","active","completed","cancelled"]
for i in range(1, 51):
sd = _rdate(rng, "2021-01-01", "2024-01-01")
conn.execute("INSERT INTO projects VALUES(?,?,?,?,?,?,?)",
(i, f"Project {i}", rng.randint(1, 8), sd,
_rdate(rng, sd, "2025-06-01") if rng.random() > 0.25 else None,
round(rng.uniform(50000, 2000000), 2), rng.choice(proj_statuses)))
roles = ["Lead","Senior Developer","Developer","Tester","Analyst","DevOps"]
for i in range(1, 251):
conn.execute("INSERT INTO project_assignments VALUES(?,?,?,?,?)",
(i, rng.randint(1, 100), rng.randint(1, 50),
rng.choice(roles), rng.randint(20, 400)))
conn.commit()
# ─────────────────────────────────────────────────────────────────────────────
# REGISTRY
# ─────────────────────────────────────────────────────────────────────────────
SEED_MAP: dict[str, Callable] = {
"ecommerce": seed_ecommerce,
"healthcare": seed_healthcare,
"finance": seed_finance,
"hr": seed_hr,
}
def build_connection(domain: str, seed: int = 42) -> sqlite3.Connection:
"""Return a seeded in-memory SQLite connection for the given domain."""
conn = sqlite3.connect(":memory:", check_same_thread=False)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA foreign_keys = ON")
conn.executescript(SCHEMA_MAP[domain])
SEED_MAP[domain](conn, seed=seed)
return conn