MohitGupta41
Seeded Data Base Creation
95bb394
# app/tools/sql_tool.py
import os, sqlite3, random, math, string
from datetime import date, timedelta
from typing import Optional, Dict, Any, List, Tuple
REGIONS: List[Tuple[str, str]] = [
("NCR", "Delhi NCR"),
("BLR", "Bengaluru"),
("MUM", "Mumbai"),
("HYD", "Hyderabad"),
("CHN", "Chennai"),
("PUN", "Pune"),
]
PRODUCTS: List[Tuple[str, str, str, float]] = [
("ELEC-001", "Electronics", "Smartphone Alpha", 29999.0),
("ELEC-002", "Electronics", "Smartphone Pro", 49999.0),
("ELEC-003", "Electronics", "Laptop 14\"", 65999.0),
("ELEC-004", "Electronics", "Earbuds", 3999.0),
("APP-001", "Apparel", "Athleisure Tee", 999.0),
("APP-002", "Apparel", "Formal Shirt", 1599.0),
("APP-003", "Apparel", "Denim Jeans", 2499.0),
("GROC-001", "Grocery", "Olive Oil 1L", 799.0),
("GROC-002", "Grocery", "Basmati 5kg", 899.0),
("GROC-003", "Grocery", "Almonds 1kg", 1199.0),
("HOME-001", "Home", "Mixer Grinder", 3499.0),
("HOME-002", "Home", "Air Fryer", 6999.0),
]
def _rand_name(rnd: random.Random):
first = ["Ramesh","Suresh","Mahesh","Amit","Priya","Anita","Kiran","Sunil","Neha","Pooja","Ravi","Vijay","Anil","Meera","Tarun"]
last = ["Kumar","Sharma","Patel","Verma","Reddy","Iyer","Das","Ghosh","Yadav","Gupta","Singh","Menon"]
return f"{rnd.choice(first)} {rnd.choice(last)}"
class SQLTool:
"""
Enterprise-ish SQLite schema with dims/facts and safe read-only SQL execution.
dim_region(code, name)
dim_product(sku, category, name, price)
dim_employee(emp_id, name, region_code, role, hire_date)
fact_sales(day, region_code, sku, channel, units, revenue) -- daily aggregates
fact_sales_detail(day, region_code, sku, channel, employee_id, units, revenue) -- retail split by employee
inv_stock(day, region_code, sku, on_hand_qty)
"""
def __init__(self, db_path: Optional[str] = None):
path = db_path or os.getenv("SQLITE_PATH", ":memory:")
self.path = path
self.conn = sqlite3.connect(path, check_same_thread=False)
self.conn.execute("PRAGMA journal_mode=WAL;")
self.conn.execute("PRAGMA synchronous=NORMAL;")
# ---------------------- schema & seed ----------------------
def setup_demo_enterprise(self, start: str = "2025-08-10", end: str = "2025-09-10", seed: int = 42):
cur = self.conn.cursor()
cur.executescript(
"""
CREATE TABLE IF NOT EXISTS dim_region (
code TEXT PRIMARY KEY,
name TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS dim_product (
sku TEXT PRIMARY KEY,
category TEXT NOT NULL,
name TEXT NOT NULL,
price REAL NOT NULL
);
CREATE TABLE IF NOT EXISTS dim_employee (
emp_id TEXT PRIMARY KEY,
name TEXT NOT NULL,
region_code TEXT NOT NULL REFERENCES dim_region(code),
role TEXT NOT NULL,
hire_date TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS fact_sales (
day TEXT NOT NULL,
region_code TEXT NOT NULL REFERENCES dim_region(code),
sku TEXT NOT NULL REFERENCES dim_product(sku),
channel TEXT NOT NULL,
units INTEGER NOT NULL,
revenue REAL NOT NULL,
PRIMARY KEY (day, region_code, sku, channel)
);
CREATE TABLE IF NOT EXISTS fact_sales_detail (
day TEXT NOT NULL,
region_code TEXT NOT NULL REFERENCES dim_region(code),
sku TEXT NOT NULL REFERENCES dim_product(sku),
channel TEXT NOT NULL,
employee_id TEXT NULL REFERENCES dim_employee(emp_id),
units INTEGER NOT NULL,
revenue REAL NOT NULL
);
CREATE TABLE IF NOT EXISTS inv_stock (
day TEXT NOT NULL,
region_code TEXT NOT NULL REFERENCES dim_region(code),
sku TEXT NOT NULL REFERENCES dim_product(sku),
on_hand_qty INTEGER NOT NULL,
PRIMARY KEY (day, region_code, sku)
);
CREATE INDEX IF NOT EXISTS idx_sales_day ON fact_sales(day);
CREATE INDEX IF NOT EXISTS idx_sales_region ON fact_sales(region_code);
CREATE INDEX IF NOT EXISTS idx_sales_sku ON fact_sales(sku);
CREATE INDEX IF NOT EXISTS idx_sales_day_region ON fact_sales(day, region_code);
CREATE INDEX IF NOT EXISTS idx_detail_day_region ON fact_sales_detail(day, region_code);
CREATE INDEX IF NOT EXISTS idx_detail_emp ON fact_sales_detail(employee_id);
CREATE INDEX IF NOT EXISTS idx_stock_day_region ON inv_stock(day, region_code);
"""
)
existing = set(r[0] for r in cur.execute("SELECT code FROM dim_region"))
to_ins = [(c, n) for c, n in REGIONS if c not in existing]
if to_ins:
cur.executemany("INSERT INTO dim_region(code, name) VALUES (?,?)", to_ins)
existing_sku = set(r[0] for r in cur.execute("SELECT sku FROM dim_product"))
to_ins_p = [p for p in PRODUCTS if p[0] not in existing_sku]
if to_ins_p:
cur.executemany("INSERT INTO dim_product(sku, category, name, price) VALUES (?,?,?,?)", to_ins_p)
emp_count = cur.execute("SELECT COUNT(*) FROM dim_employee").fetchone()[0]
rnd = random.Random(seed)
if emp_count == 0:
rows = []
for code, _ in REGIONS:
n = 8
for _ in range(n):
emp_id = f"E{code}{rnd.randint(1000,9999)}"
rows.append((emp_id, _rand_name(rnd), code, rnd.choice(["AE","SE","AM"]), "2023-01-01"))
cur.executemany("INSERT INTO dim_employee(emp_id,name,region_code,role,hire_date) VALUES (?,?,?,?,?)", rows)
n_sales = cur.execute("SELECT COUNT(*) FROM fact_sales").fetchone()[0]
if n_sales == 0:
self._seed_fact_sales(cur, start, end, seed)
n_detail = cur.execute("SELECT COUNT(*) FROM fact_sales_detail").fetchone()[0]
if n_detail == 0:
self._seed_sales_detail(cur, seed)
n_stock = cur.execute("SELECT COUNT(*) FROM inv_stock").fetchone()[0]
if n_stock == 0:
self._seed_stock(cur, start, end, seed)
self.conn.commit()
def _seed_fact_sales(self, cur: sqlite3.Cursor, start: str, end: str, seed: int):
rnd = random.Random(seed)
start_d = date.fromisoformat(start)
end_d = date.fromisoformat(end)
days = (end_d - start_d).days + 1
region_factor = {"NCR":1.25,"MUM":1.15,"BLR":1.10,"HYD":0.95,"CHN":0.90,"PUN":0.85}
channels = ["Online","Retail"]
batch = []
for i in range(days):
d = (start_d + timedelta(days=i)).isoformat()
wknd = (start_d + timedelta(days=i)).weekday() >= 5
wknd_boost = 1.10 if wknd else 1.0
for code, _name in REGIONS:
rfac = region_factor[code]
for sku, category, _nm, price in PRODUCTS:
for ch in channels:
base = {"Electronics":12,"Apparel":25,"Grocery":40,"Home":9}[category]
ch_mult = 1.15 if ch == "Online" else 0.95
mu = base * rfac * wknd_boost * ch_mult
sigma = max(1.0, mu * 0.25)
units = max(0, int(rnd.gauss(mu, sigma)))
season = 1.0 + 0.08 * math.sin(i / 3.5)
revenue = round(units * price * season, 2)
batch.append((d, code, sku, ch, units, revenue))
cur.executemany(
"INSERT INTO fact_sales(day, region_code, sku, channel, units, revenue) VALUES (?,?,?,?,?,?)",
batch
)
def _seed_sales_detail(self, cur: sqlite3.Cursor, seed: int):
rnd = random.Random(seed+7)
rows = cur.execute(
"SELECT day, region_code, sku, units, revenue FROM fact_sales WHERE channel='Retail'"
).fetchall()
for d, region, sku, units, revenue in rows:
if units == 0:
continue
emp_ids = [r[0] for r in cur.execute("SELECT emp_id FROM dim_employee WHERE region_code=?", (region,))]
parts = rnd.randint(1, min(4, max(1, units)))
cuts = sorted(rnd.sample(range(1, units), parts-1)) if units > parts else []
splits = [b-a for a,b in zip([0]+cuts, cuts+[units])]
total_units = float(sum(splits))
rev_splits = [round(revenue * (u/total_units), 2) for u in splits]
if rev_splits:
drift = round(revenue - sum(rev_splits), 2)
rev_splits[0] += drift
for u, r in zip(splits, rev_splits):
emp = rnd.choice(emp_ids) if emp_ids else None
cur.execute(
"INSERT INTO fact_sales_detail(day, region_code, sku, channel, employee_id, units, revenue) "
"VALUES (?,?,?,?,?,?,?)",
(d, region, sku, "Retail", emp, u, r)
)
def _seed_stock(self, cur: sqlite3.Cursor, start: str, end: str, seed: int):
rnd = random.Random(seed+13)
start_d = date.fromisoformat(start)
end_d = date.fromisoformat(end)
days = (end_d - start_d).days + 1
for i in range(days):
d = (start_d + timedelta(days=i)).isoformat()
for code, _ in REGIONS:
for sku, category, _nm, _price in PRODUCTS:
base = {"Electronics":400,"Apparel":800,"Grocery":600,"Home":300}[category]
noise = rnd.randint(-30, 30)
on_hand = max(0, base + noise - i*2)
cur.execute(
"INSERT INTO inv_stock(day, region_code, sku, on_hand_qty) VALUES (?,?,?,?)",
(d, code, sku, on_hand)
)
# ---------------------- helpers + read-only executor ----------------------
def region_codes(self) -> List[str]:
rows = self.conn.execute("SELECT code FROM dim_region").fetchall()
return [r[0] for r in rows]
def execute_sql_readonly(self, sql: str) -> Dict[str, Any]:
"""Hard safety: allow only a single SELECT statement; no comments/CTEs with semicolons."""
s = sql.strip()
bad = (";", "--", "/*", "*/")
if not s.lower().startswith("select"):
raise ValueError("Only SELECT statements are allowed.")
if any(tok in s for tok in bad):
raise ValueError("Disallowed token in SQL.")
cur = self.conn.cursor()
cur.execute(s)
cols = [d[0] for d in cur.description] if cur.description else []
rows = cur.fetchall()
return {"columns": cols, "rows": rows, "rowcount": len(rows)}
if __name__ == "__main__":
import argparse
import os
import sys
from pathlib import Path
parser = argparse.ArgumentParser(
description="Create and persist the demo enterprise SQLite database."
)
parser.add_argument(
"--db",
default=os.getenv("SQLITE_PATH", "./demo_enterprise.sqlite"),
help="Path to the SQLite DB file to create (defaults to ./demo_enterprise.sqlite or $SQLITE_PATH).",
)
parser.add_argument(
"--start",
default="2025-08-10",
help="Start date (YYYY-MM-DD) for seeding data.",
)
parser.add_argument(
"--end",
default="2025-09-10",
help="End date (YYYY-MM-DD) for seeding data.",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="Random seed used for deterministic seeding.",
)
args = parser.parse_args()
# Ensure we persist to disk (avoid in-memory DB).
db_path = args.db
if db_path == ":memory:":
print("':memory:' was requested; switching to ./demo_enterprise.sqlite so the DB is saved to disk.")
db_path = "./demo_enterprise.db"
# Make sure the parent directory exists
parent = Path(db_path).expanduser().resolve().parent
parent.mkdir(parents=True, exist_ok=True)
tool = SQLTool(db_path=str(Path(db_path).expanduser()))
tool.setup_demo_enterprise(start=args.start, end=args.end, seed=args.seed)
# Flush and close
tool.conn.commit()
tool.conn.close()
print(f"✅ Database created at: {Path(db_path).expanduser().resolve()}")
print(f" Seed window: {args.start}{args.end} | seed={args.seed}")
# # Default location ./demo_enterprise.sqlite
# python app/tools/sql_tool.py
# # Custom location and date range
# python BI_Assistant_Backend/app/tools/sql_tool.py --db ./data/retail_demo.sqlite --start 2025-08-01 --end 2025-09-10 --seed 123