Spaces:
Sleeping
Sleeping
File size: 13,047 Bytes
bebb279 6f18489 2cef617 bebb279 2cef617 6f18489 bebb279 6f18489 bebb279 6f18489 2cef617 bebb279 2cef617 6f18489 2cef617 6f18489 bebb279 6f18489 bebb279 6f18489 bebb279 6f18489 bebb279 bd7254f bebb279 6f18489 bebb279 6f18489 2cef617 bebb279 6f18489 bebb279 6f18489 bebb279 6f18489 bebb279 6f18489 bebb279 bd7254f bebb279 6f18489 bebb279 95bb394 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 |
# app/tools/sql_tool.py
import os, sqlite3, random, math, string
from datetime import date, timedelta
from typing import Optional, Dict, Any, List, Tuple
REGIONS: List[Tuple[str, str]] = [
("NCR", "Delhi NCR"),
("BLR", "Bengaluru"),
("MUM", "Mumbai"),
("HYD", "Hyderabad"),
("CHN", "Chennai"),
("PUN", "Pune"),
]
PRODUCTS: List[Tuple[str, str, str, float]] = [
("ELEC-001", "Electronics", "Smartphone Alpha", 29999.0),
("ELEC-002", "Electronics", "Smartphone Pro", 49999.0),
("ELEC-003", "Electronics", "Laptop 14\"", 65999.0),
("ELEC-004", "Electronics", "Earbuds", 3999.0),
("APP-001", "Apparel", "Athleisure Tee", 999.0),
("APP-002", "Apparel", "Formal Shirt", 1599.0),
("APP-003", "Apparel", "Denim Jeans", 2499.0),
("GROC-001", "Grocery", "Olive Oil 1L", 799.0),
("GROC-002", "Grocery", "Basmati 5kg", 899.0),
("GROC-003", "Grocery", "Almonds 1kg", 1199.0),
("HOME-001", "Home", "Mixer Grinder", 3499.0),
("HOME-002", "Home", "Air Fryer", 6999.0),
]
def _rand_name(rnd: random.Random):
first = ["Ramesh","Suresh","Mahesh","Amit","Priya","Anita","Kiran","Sunil","Neha","Pooja","Ravi","Vijay","Anil","Meera","Tarun"]
last = ["Kumar","Sharma","Patel","Verma","Reddy","Iyer","Das","Ghosh","Yadav","Gupta","Singh","Menon"]
return f"{rnd.choice(first)} {rnd.choice(last)}"
class SQLTool:
"""
Enterprise-ish SQLite schema with dims/facts and safe read-only SQL execution.
dim_region(code, name)
dim_product(sku, category, name, price)
dim_employee(emp_id, name, region_code, role, hire_date)
fact_sales(day, region_code, sku, channel, units, revenue) -- daily aggregates
fact_sales_detail(day, region_code, sku, channel, employee_id, units, revenue) -- retail split by employee
inv_stock(day, region_code, sku, on_hand_qty)
"""
def __init__(self, db_path: Optional[str] = None):
path = db_path or os.getenv("SQLITE_PATH", ":memory:")
self.path = path
self.conn = sqlite3.connect(path, check_same_thread=False)
self.conn.execute("PRAGMA journal_mode=WAL;")
self.conn.execute("PRAGMA synchronous=NORMAL;")
# ---------------------- schema & seed ----------------------
def setup_demo_enterprise(self, start: str = "2025-08-10", end: str = "2025-09-10", seed: int = 42):
cur = self.conn.cursor()
cur.executescript(
"""
CREATE TABLE IF NOT EXISTS dim_region (
code TEXT PRIMARY KEY,
name TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS dim_product (
sku TEXT PRIMARY KEY,
category TEXT NOT NULL,
name TEXT NOT NULL,
price REAL NOT NULL
);
CREATE TABLE IF NOT EXISTS dim_employee (
emp_id TEXT PRIMARY KEY,
name TEXT NOT NULL,
region_code TEXT NOT NULL REFERENCES dim_region(code),
role TEXT NOT NULL,
hire_date TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS fact_sales (
day TEXT NOT NULL,
region_code TEXT NOT NULL REFERENCES dim_region(code),
sku TEXT NOT NULL REFERENCES dim_product(sku),
channel TEXT NOT NULL,
units INTEGER NOT NULL,
revenue REAL NOT NULL,
PRIMARY KEY (day, region_code, sku, channel)
);
CREATE TABLE IF NOT EXISTS fact_sales_detail (
day TEXT NOT NULL,
region_code TEXT NOT NULL REFERENCES dim_region(code),
sku TEXT NOT NULL REFERENCES dim_product(sku),
channel TEXT NOT NULL,
employee_id TEXT NULL REFERENCES dim_employee(emp_id),
units INTEGER NOT NULL,
revenue REAL NOT NULL
);
CREATE TABLE IF NOT EXISTS inv_stock (
day TEXT NOT NULL,
region_code TEXT NOT NULL REFERENCES dim_region(code),
sku TEXT NOT NULL REFERENCES dim_product(sku),
on_hand_qty INTEGER NOT NULL,
PRIMARY KEY (day, region_code, sku)
);
CREATE INDEX IF NOT EXISTS idx_sales_day ON fact_sales(day);
CREATE INDEX IF NOT EXISTS idx_sales_region ON fact_sales(region_code);
CREATE INDEX IF NOT EXISTS idx_sales_sku ON fact_sales(sku);
CREATE INDEX IF NOT EXISTS idx_sales_day_region ON fact_sales(day, region_code);
CREATE INDEX IF NOT EXISTS idx_detail_day_region ON fact_sales_detail(day, region_code);
CREATE INDEX IF NOT EXISTS idx_detail_emp ON fact_sales_detail(employee_id);
CREATE INDEX IF NOT EXISTS idx_stock_day_region ON inv_stock(day, region_code);
"""
)
existing = set(r[0] for r in cur.execute("SELECT code FROM dim_region"))
to_ins = [(c, n) for c, n in REGIONS if c not in existing]
if to_ins:
cur.executemany("INSERT INTO dim_region(code, name) VALUES (?,?)", to_ins)
existing_sku = set(r[0] for r in cur.execute("SELECT sku FROM dim_product"))
to_ins_p = [p for p in PRODUCTS if p[0] not in existing_sku]
if to_ins_p:
cur.executemany("INSERT INTO dim_product(sku, category, name, price) VALUES (?,?,?,?)", to_ins_p)
emp_count = cur.execute("SELECT COUNT(*) FROM dim_employee").fetchone()[0]
rnd = random.Random(seed)
if emp_count == 0:
rows = []
for code, _ in REGIONS:
n = 8
for _ in range(n):
emp_id = f"E{code}{rnd.randint(1000,9999)}"
rows.append((emp_id, _rand_name(rnd), code, rnd.choice(["AE","SE","AM"]), "2023-01-01"))
cur.executemany("INSERT INTO dim_employee(emp_id,name,region_code,role,hire_date) VALUES (?,?,?,?,?)", rows)
n_sales = cur.execute("SELECT COUNT(*) FROM fact_sales").fetchone()[0]
if n_sales == 0:
self._seed_fact_sales(cur, start, end, seed)
n_detail = cur.execute("SELECT COUNT(*) FROM fact_sales_detail").fetchone()[0]
if n_detail == 0:
self._seed_sales_detail(cur, seed)
n_stock = cur.execute("SELECT COUNT(*) FROM inv_stock").fetchone()[0]
if n_stock == 0:
self._seed_stock(cur, start, end, seed)
self.conn.commit()
def _seed_fact_sales(self, cur: sqlite3.Cursor, start: str, end: str, seed: int):
rnd = random.Random(seed)
start_d = date.fromisoformat(start)
end_d = date.fromisoformat(end)
days = (end_d - start_d).days + 1
region_factor = {"NCR":1.25,"MUM":1.15,"BLR":1.10,"HYD":0.95,"CHN":0.90,"PUN":0.85}
channels = ["Online","Retail"]
batch = []
for i in range(days):
d = (start_d + timedelta(days=i)).isoformat()
wknd = (start_d + timedelta(days=i)).weekday() >= 5
wknd_boost = 1.10 if wknd else 1.0
for code, _name in REGIONS:
rfac = region_factor[code]
for sku, category, _nm, price in PRODUCTS:
for ch in channels:
base = {"Electronics":12,"Apparel":25,"Grocery":40,"Home":9}[category]
ch_mult = 1.15 if ch == "Online" else 0.95
mu = base * rfac * wknd_boost * ch_mult
sigma = max(1.0, mu * 0.25)
units = max(0, int(rnd.gauss(mu, sigma)))
season = 1.0 + 0.08 * math.sin(i / 3.5)
revenue = round(units * price * season, 2)
batch.append((d, code, sku, ch, units, revenue))
cur.executemany(
"INSERT INTO fact_sales(day, region_code, sku, channel, units, revenue) VALUES (?,?,?,?,?,?)",
batch
)
def _seed_sales_detail(self, cur: sqlite3.Cursor, seed: int):
rnd = random.Random(seed+7)
rows = cur.execute(
"SELECT day, region_code, sku, units, revenue FROM fact_sales WHERE channel='Retail'"
).fetchall()
for d, region, sku, units, revenue in rows:
if units == 0:
continue
emp_ids = [r[0] for r in cur.execute("SELECT emp_id FROM dim_employee WHERE region_code=?", (region,))]
parts = rnd.randint(1, min(4, max(1, units)))
cuts = sorted(rnd.sample(range(1, units), parts-1)) if units > parts else []
splits = [b-a for a,b in zip([0]+cuts, cuts+[units])]
total_units = float(sum(splits))
rev_splits = [round(revenue * (u/total_units), 2) for u in splits]
if rev_splits:
drift = round(revenue - sum(rev_splits), 2)
rev_splits[0] += drift
for u, r in zip(splits, rev_splits):
emp = rnd.choice(emp_ids) if emp_ids else None
cur.execute(
"INSERT INTO fact_sales_detail(day, region_code, sku, channel, employee_id, units, revenue) "
"VALUES (?,?,?,?,?,?,?)",
(d, region, sku, "Retail", emp, u, r)
)
def _seed_stock(self, cur: sqlite3.Cursor, start: str, end: str, seed: int):
rnd = random.Random(seed+13)
start_d = date.fromisoformat(start)
end_d = date.fromisoformat(end)
days = (end_d - start_d).days + 1
for i in range(days):
d = (start_d + timedelta(days=i)).isoformat()
for code, _ in REGIONS:
for sku, category, _nm, _price in PRODUCTS:
base = {"Electronics":400,"Apparel":800,"Grocery":600,"Home":300}[category]
noise = rnd.randint(-30, 30)
on_hand = max(0, base + noise - i*2)
cur.execute(
"INSERT INTO inv_stock(day, region_code, sku, on_hand_qty) VALUES (?,?,?,?)",
(d, code, sku, on_hand)
)
# ---------------------- helpers + read-only executor ----------------------
def region_codes(self) -> List[str]:
rows = self.conn.execute("SELECT code FROM dim_region").fetchall()
return [r[0] for r in rows]
def execute_sql_readonly(self, sql: str) -> Dict[str, Any]:
"""Hard safety: allow only a single SELECT statement; no comments/CTEs with semicolons."""
s = sql.strip()
bad = (";", "--", "/*", "*/")
if not s.lower().startswith("select"):
raise ValueError("Only SELECT statements are allowed.")
if any(tok in s for tok in bad):
raise ValueError("Disallowed token in SQL.")
cur = self.conn.cursor()
cur.execute(s)
cols = [d[0] for d in cur.description] if cur.description else []
rows = cur.fetchall()
return {"columns": cols, "rows": rows, "rowcount": len(rows)}
if __name__ == "__main__":
import argparse
import os
import sys
from pathlib import Path
parser = argparse.ArgumentParser(
description="Create and persist the demo enterprise SQLite database."
)
parser.add_argument(
"--db",
default=os.getenv("SQLITE_PATH", "./demo_enterprise.sqlite"),
help="Path to the SQLite DB file to create (defaults to ./demo_enterprise.sqlite or $SQLITE_PATH).",
)
parser.add_argument(
"--start",
default="2025-08-10",
help="Start date (YYYY-MM-DD) for seeding data.",
)
parser.add_argument(
"--end",
default="2025-09-10",
help="End date (YYYY-MM-DD) for seeding data.",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="Random seed used for deterministic seeding.",
)
args = parser.parse_args()
# Ensure we persist to disk (avoid in-memory DB).
db_path = args.db
if db_path == ":memory:":
print("':memory:' was requested; switching to ./demo_enterprise.sqlite so the DB is saved to disk.")
db_path = "./demo_enterprise.db"
# Make sure the parent directory exists
parent = Path(db_path).expanduser().resolve().parent
parent.mkdir(parents=True, exist_ok=True)
tool = SQLTool(db_path=str(Path(db_path).expanduser()))
tool.setup_demo_enterprise(start=args.start, end=args.end, seed=args.seed)
# Flush and close
tool.conn.commit()
tool.conn.close()
print(f"✅ Database created at: {Path(db_path).expanduser().resolve()}")
print(f" Seed window: {args.start} → {args.end} | seed={args.seed}")
# # Default location ./demo_enterprise.sqlite
# python app/tools/sql_tool.py
# # Custom location and date range
# python BI_Assistant_Backend/app/tools/sql_tool.py --db ./data/retail_demo.sqlite --start 2025-08-01 --end 2025-09-10 --seed 123
|