# PowerBIguide_AB / app.py
# NOTE: the following page-header text was scraped from the Hugging Face Spaces
# file view and is kept as comments so the module remains importable:
#   "ajayinsac's picture" / "Update app.py" / "da360ae verified"
#!/usr/bin/env python3
"""
AI Agent for Power BI β€” Gradio app for Hugging Face Spaces
- Upload a spreadsheet (.csv/.xlsx) or a Word doc (.docx)
- Profile the data, recommend visuals & report patterns (with drilldown, field params, tooltips, decomposition tree)
- Generate "Power BI build steps" tailored to the dataset
- Offline RAG Q&A over a small allow-listed corpus (seeded at runtime) focused on Microsoft Learn topics
Notes:
- No outbound web calls are required at runtime.
- If sentence-transformers or faiss-cpu is not available, the app falls back to TF-IDF + cosine.
- In Spaces, do NOT use share=True in launch().
"""
import os
import io
import re
import gc
import csv
import json
import math
import time
import glob
import uuid
import shutil
import string
import random
import hashlib
import textwrap
import tempfile
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple
import numpy as np
import pandas as pd
import gradio as gr
# Optional dependencies.
# python-docx is only needed for the optional .docx context upload.
try:
    import docx  # python-docx
    HAS_DOCX = True
except Exception:
    HAS_DOCX = False
# Embeddings stack (robust fallbacks): prefer sentence-transformers + FAISS,
# degrade to TF-IDF + cosine when either is unavailable.
_EMBED_BACKEND = "st"
try:
    from sentence_transformers import SentenceTransformer
    import faiss  # type: ignore
    HAS_ST = True
    HAS_FAISS = True
except Exception:
    # NOTE(review): because SentenceTransformer and faiss share one try block,
    # a faiss import failure also forces HAS_ST = False even when
    # sentence-transformers IS installed — the "ST without FAISS" numpy path
    # in SimpleVectorStore therefore looks unreachable. Confirm intent.
    HAS_ST = False
    try:
        import faiss  # type: ignore
        HAS_FAISS = True
    except Exception:
        HAS_FAISS = False
    _EMBED_BACKEND = "tfidf"
# sklearn is imported unconditionally: it backs the TF-IDF fallback path.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# True when running inside a Hugging Face Space (either env var may be set).
IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
APP_DIR = os.path.dirname(os.path.abspath(__file__))
DOCS_DIR = os.path.join(APP_DIR, "docs")  # local allow-listed offline bundle
CACHE_DIR = os.path.join(APP_DIR, ".cache")
os.makedirs(CACHE_DIR, exist_ok=True)
# ---------------------------
# Utilities
# ---------------------------
def clean_text(s: str) -> str:
    """Normalize raw text: drop NUL escape residue and control chars, collapse whitespace."""
    if not s:
        return ""
    # Remove literal escaped-NUL sequences that sometimes survive text extraction.
    without_nul = s.replace("\\u0000", " ").replace("\\x00", " ")
    # Strip real ASCII control characters, then collapse whitespace runs.
    no_ctrl = re.sub(r"[\u0000-\u001F]", " ", without_nul)
    return re.sub(r"\s+", " ", no_ctrl).strip()
def to_markdown_table(rows: List[Dict[str, Any]]) -> str:
    """Render a list of dicts as a GitHub-flavored Markdown table.

    Headers come from the first row's keys; missing keys render as "".
    Cell text is sanitized so embedded '|' or newlines cannot break the
    row/column structure of the table.
    """
    if not rows:
        return "_No rows_"

    def _cell(value: Any) -> str:
        # Pipes delimit columns and newlines end rows — both must be neutralized.
        return str(value).replace("|", "\\|").replace("\n", " ")

    headers = list(rows[0].keys())
    out = ["| " + " | ".join(_cell(h) for h in headers) + " |",
           "| " + " | ".join(["---"] * len(headers)) + " |"]
    for r in rows:
        out.append("| " + " | ".join(_cell(r.get(h, "")) for h in headers) + " |")
    return "\n".join(out)
def save_temp_textfile(name: str, text: str) -> str:
    """Write *text* to <tempdir>/<name> as UTF-8 and return the full path."""
    dest = os.path.join(tempfile.gettempdir(), name)
    with open(dest, "w", encoding="utf-8") as fh:
        fh.write(text)
    return dest
# ---------------------------
# Seed trusted corpus (offline)
# ---------------------------
# Offline seed corpus: each entry becomes one Markdown file under ./docs on
# first run (see ensure_docs_directory). source_url is citation metadata only;
# no network call is ever made.
SEED_DOCS = [
    {
        "title": "Star Schema and Modeling Guidance",
        "source_url": "https://learn.microsoft.com/power-bi/guidance/star-schema",
        "text": """
Key ideas for Power BI modeling:
- Prefer a star schema: one or more fact tables at the center, surrounded by dimension tables.
- Use one-to-many relationships from dimensions to facts; default filter direction single-direction.
- Avoid many-to-many between fact tables. Use conformed dimensions to integrate subjects.
- Create and mark a proper Date table for time intelligence; avoid relying on auto date/time for complex models.
- Keep columns narrow and typed correctly; hide surrogate keys and technical columns from report view.
""".strip()
    },
    {
        "title": "Drillthrough Pages",
        "source_url": "https://learn.microsoft.com/power-bi/visuals/drillthrough",
        "text": """
Drillthrough pattern:
- Create a dedicated drillthrough page; add the target field to the Drillthrough well.
- Place visuals that explain the selected entity (e.g., Customer, Product).
- Add a Back button to return to the source page.
- Test by right-clicking a category value on a source visual and choosing Drillthrough.
""".strip()
    },
    {
        "title": "Report Page Tooltips",
        "source_url": "https://learn.microsoft.com/power-bi/create-reports/desktop-tooltips",
        "text": """
Report page tooltips:
- Create a small page sized for tooltip (e.g., 320x240 or default tooltip page size).
- Add detailed visuals or KPIs intended to show on hover.
- In the source visual, enable tooltip and assign the tooltip page.
- Keep tooltip visuals compact; avoid heavy interactions on hover.
""".strip()
    },
    {
        "title": "Field Parameters for Self-Service",
        "source_url": "https://learn.microsoft.com/power-bi/create-reports/field-parameters",
        "text": """
Field parameters:
- Create a field parameter to allow viewers to swap dimensions/measures on visuals.
- Add the parameter field to the Axis/Legend/Values well of visuals.
- Provide a slicer bound to the parameter to let users choose fields at runtime.
- Useful for building one flexible page instead of many near-duplicate pages.
""".strip()
    },
    {
        "title": "Incremental Refresh",
        "source_url": "https://learn.microsoft.com/power-bi/enterprise/incremental-refresh-overview",
        "text": """
Incremental refresh:
- Define RangeStart and RangeEnd parameters to partition by date/time.
- Configure policy: how much history to store and how much of the newer period to refresh.
- Only the most recent partitions are refreshed in the Service; historical partitions remain stable.
- Great for large tables; combine with aggregations for best performance.
""".strip()
    },
    {
        "title": "Aggregations",
        "source_url": "https://learn.microsoft.com/power-bi/transform-model/aggregations-advanced",
        "text": """
Aggregations pattern:
- Create an aggregate table with pre-summarized values (e.g., by Month/Region/Product).
- Map it to a detailed DirectQuery or Import table via manage aggregations.
- Power BI routes queries to the smaller aggregated table when possible.
- Especially valuable with DirectQuery/composite models.
""".strip()
    },
    {
        "title": "Row-Level Security (RLS)",
        "source_url": "https://learn.microsoft.com/power-bi/admin/rls",
        "text": """
RLS guidance:
- Define roles with filters on dimension tables (e.g., Region = USERPRINCIPALNAME() mapping table).
- Test roles in Power BI Desktop using 'View as'.
- In the Service, assign users/groups to roles; viewers see only permitted rows.
- Keep RLS logic simple; avoid complex bi-directional relationships.
""".strip()
    },
    {
        "title": "Decomposition Tree Visual",
        "source_url": "https://learn.microsoft.com/power-bi/visuals/power-bi-visualization-decomposition-tree",
        "text": """
Decomposition Tree:
- A visual to perform root-cause analysis by breaking down a measure across dimensions.
- Add dimensions in the 'Explain by' well; choose 'High value' or 'Low value' splits to guide analysis.
- Great for iterative 'why' exploration in operational datasets.
""".strip()
    }
]
def ensure_docs_directory():
    """Create ./docs and seed it with the bundled Markdown corpus when empty."""
    os.makedirs(DOCS_DIR, exist_ok=True)
    if glob.glob(os.path.join(DOCS_DIR, "*.md")):
        return  # corpus already present; never overwrite user-supplied docs
    # Write one numbered, slugified Markdown file per seed document.
    for num, doc in enumerate(SEED_DOCS, start=1):
        slug = re.sub(r'[^a-z0-9]+', '-', doc['title'].lower()).strip('-')
        path = os.path.join(DOCS_DIR, f"{num:02d}-{slug}.md")
        with open(path, "w", encoding="utf-8") as fh:
            fh.write(f"# {doc['title']}\n\n")
            fh.write(f"Source: {doc['source_url']}\n\n")
            fh.write(doc["text"].strip() + "\n")
# ---------------------------
# Simple offline RAG
# ---------------------------
@dataclass
class DocChunk:
    """One retrievable chunk of a corpus document."""
    doc_id: str       # source file identifier (basename of the .md file)
    title: str        # document title (taken from the first heading line)
    source_url: str   # citation URL from the "Source:" line, "" when absent
    text: str         # cleaned chunk text used for embedding / search
class SimpleVectorStore:
    """Tiny retrieval index with interchangeable backends.

    Backend is fixed at construction from module-level feature flags:
      1. "st" + FAISS:    MiniLM embeddings searched via an inner-product index.
      2. "st", no FAISS:  MiniLM embeddings searched via a NumPy dot product.
      3. "tfidf":         scikit-learn TF-IDF + cosine similarity.
    NOTE(review): sentence-transformers and faiss are imported in one try block
    at module top, so a faiss failure also clears HAS_ST — path 2 appears
    unreachable in practice; confirm before relying on it.
    """
    def __init__(self):
        self.backend = _EMBED_BACKEND  # "st" or "tfidf"
        self.model = None              # SentenceTransformer instance when backend == "st"
        self.index = None              # FAISS index, set by build() on path 1
        self.texts: List[str] = []
        self.meta: List[Tuple[str, str, str]] = []  # (doc_id, title, source_url)
        if self.backend == "st" and HAS_ST:
            # Smaller model is fine for HF CPU Spaces
            self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def _embed(self, texts: List[str]) -> np.ndarray:
        """Embed *texts*; on the TF-IDF path this fits the vectorizer as a side effect."""
        if self.backend == "st" and HAS_ST:
            emb = self.model.encode(texts, batch_size=32, convert_to_numpy=True, normalize_embeddings=True)
            return emb.astype("float32")
        else:
            # TF-IDF fallback (cosine will be computed via sklearn)
            self.vectorizer = TfidfVectorizer(min_df=1, max_df=0.95, ngram_range=(1,2))
            X = self.vectorizer.fit_transform(texts)
            self._tfidf_matrix = X
            return np.zeros((X.shape[0], 1), dtype="float32")  # dummy; we won't use FAISS path

    def build(self, chunks: List[DocChunk]) -> None:
        """Index *chunks*. Must be called before query()."""
        self.texts = [c.text for c in chunks]
        self.meta = [(c.doc_id, c.title, c.source_url) for c in chunks]
        if self.backend == "st" and HAS_ST and HAS_FAISS:
            embs = self._embed(self.texts)
            dim = embs.shape[1]
            # Embeddings are L2-normalized, so inner product equals cosine similarity.
            self.index = faiss.IndexFlatIP(dim)
            self.index.add(embs)
        else:
            # TF-IDF fallback doesn't build a FAISS index
            if self.backend == "st" and not HAS_FAISS:
                # Still have ST model but no FAISS -> cosine on numpy
                self.embs = self._embed(self.texts)
            elif self.backend == "tfidf":
                # vectorizer built in _embed
                self._embed(self.texts)

    def query(self, q: str, top_k: int = 4) -> List[Tuple[float, int]]:
        """Return [(score, idx), ...] sorted by score desc."""
        q = clean_text(q)
        if not q.strip() or not self.texts:
            return []
        if self.backend == "st" and HAS_ST and HAS_FAISS:
            q_emb = self.model.encode([q], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
            sims, idxs = self.index.search(q_emb, top_k)
            return [(float(sims[0][i]), int(idxs[0][i])) for i in range(len(idxs[0]))]
        elif self.backend == "st" and HAS_ST and not HAS_FAISS:
            # cosine on numpy; embeddings were normalized at build time
            q_emb = self.model.encode([q], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
            sims = (self.embs @ q_emb.T).ravel()
            idxs = np.argsort(-sims)[:top_k]
            return [(float(sims[i]), int(i)) for i in idxs]
        else:
            # TF-IDF: cosine of the query against the fitted document matrix
            q_vec = self.vectorizer.transform([q])
            sims = cosine_similarity(self._tfidf_matrix, q_vec).ravel()
            idxs = np.argsort(-sims)[:top_k]
            return [(float(sims[i]), int(i)) for i in idxs]
def load_corpus_chunks() -> List[DocChunk]:
    """Load ./docs/*.md and split each file into roughly 800-character chunks.

    The first non-empty line is the title; a second line starting with
    "Source:" (case-insensitive) provides the citation URL. Remaining lines
    form the body.

    Fixes two defects in the previous version:
    - clean_text() collapses all newlines, so splitting the cleaned body on
      blank lines could never produce more than one paragraph; lines are now
      cleaned individually so chunk boundaries actually occur.
    - When the second line was NOT a "Source:" line it was still dropped from
      the body; the body start index now depends on whether a source was found.
    """
    ensure_docs_directory()
    chunks: List[DocChunk] = []
    for path in sorted(glob.glob(os.path.join(DOCS_DIR, "*.md"))):
        with open(path, "r", encoding="utf-8") as f:
            raw = f.read()
        lines = [l.strip() for l in raw.splitlines() if l.strip()]
        if not lines:
            continue  # empty file yields no chunks (same as before)
        title = lines[0].lstrip("# ").strip()
        src = ""
        body_start = 1
        if len(lines) > 1 and lines[1].lower().startswith("source:"):
            src = lines[1].split(":", 1)[1].strip()
            body_start = 2
        doc_id = os.path.basename(path)
        # Accumulate cleaned lines into ~800-char chunks.
        buf = ""
        for para in lines[body_start:]:
            para = clean_text(para)
            if not para:
                continue
            if len(buf) + len(para) < 800:
                buf += para + "\n"
            else:
                if buf.strip():
                    chunks.append(DocChunk(doc_id=doc_id, title=title, source_url=src, text=buf.strip()))
                buf = para + "\n"
        if buf.strip():
            chunks.append(DocChunk(doc_id=doc_id, title=title, source_url=src, text=buf.strip()))
    return chunks
def compose_answer(question: str, hits: List[Tuple[float, int]], store: "SimpleVectorStore") -> Tuple[str, List[Dict[str, str]]]:
    """Build a Markdown answer plus a citation list from retrieval hits.

    Args:
        question: the user's question (currently unused in composition).
        hits: [(score, index), ...] pairs from SimpleVectorStore.query().
        store: the store whose .texts / .meta the indices refer to.
    Returns:
        (markdown_answer, citations) where citations is a de-duplicated,
        order-preserving list of {"title": ..., "source": ...} dicts.
    """
    if not hits:
        return ("I couldn't find this in the trusted corpus. Try rephrasing or add more Microsoft Learn pages to the ./docs folder.", [])
    pieces = []
    cites = []
    seen = set()
    for score, idx in hits:
        pieces.append(f"- {store.texts[idx]}")
        doc_id, title, src = store.meta[idx]
        # Several chunks can come from the same document; cite each doc once.
        if (title, src) not in seen:
            seen.add((title, src))
            cites.append({"title": title, "source": src})
    answer = "### Answer (from trusted sources)\n" + "\n".join(pieces[:3])
    answer += "\n\n**Citations:**\n" + "\n".join([f"- {c['title']} — {c['source']}" for c in cites[:3]])
    return answer, cites
# ---------------------------
# Data ingestion & profiling
# ---------------------------
# Accepted upload extensions: tabular data files and optional context documents.
READABLE_EXTS = [".csv", ".xlsx"]
DOC_EXTS = [".docx"]
def read_table_file(path: str) -> pd.DataFrame:
    """Load a .csv or .xlsx file into a DataFrame; raise ValueError otherwise."""
    extension = os.path.splitext(path)[1].lower()
    readers = {".csv": pd.read_csv, ".xlsx": pd.read_excel}
    if extension not in readers:
        raise ValueError(f"Unsupported table extension: {extension}")
    return readers[extension](path)
def read_docx_text(path: str) -> str:
    """Extract paragraph text from a .docx; returns "" when python-docx is unavailable."""
    if not HAS_DOCX:
        return ""
    document = docx.Document(path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
def infer_col_type(series: pd.Series) -> str:
    """Classify a column as datetime / boolean / numeric / categorical / text.

    Non-numeric columns are probed by parsing a small sample as dates; a
    column whose distinct-value ratio is under 20% is treated as categorical.

    Fix: pd.to_datetime on an EMPTY sample succeeds, so an all-NaN or empty
    object column used to be misclassified as "datetime". The probe is now
    skipped when the non-null sample is empty.
    """
    if pd.api.types.is_datetime64_any_dtype(series):
        return "datetime"
    if pd.api.types.is_bool_dtype(series):
        return "boolean"
    if pd.api.types.is_numeric_dtype(series):
        return "numeric"
    # Probe a small non-null sample for parseable dates.
    sample = series.dropna().head(20)
    if len(sample) > 0:
        try:
            pd.to_datetime(sample, errors="raise")  # raises for non-date text
            return "datetime"
        except Exception:
            pass
    # Few unique values relative to rows -> categorical.
    n = len(series)
    distinct = series.nunique(dropna=True)
    if n > 0 and distinct / max(n, 1) < 0.2:
        return "categorical"
    return "text"
def profile_dataframe(df: pd.DataFrame, max_sample: int = 10_000) -> Dict[str, Any]:
    """Profile each column (type, role, cardinality, missingness) plus dataset signals.

    Frames larger than *max_sample* rows are down-sampled (seeded, so
    deterministic) before profiling.
    """
    sample = df.copy()
    if len(sample) > max_sample:
        sample = sample.sample(max_sample, random_state=42).reset_index(drop=True)

    col_profiles = []
    for name in sample.columns:
        series = sample[name]
        kind = infer_col_type(series)
        row_count = len(series)
        col_profiles.append({
            "column": name,
            "pandas_dtype": str(series.dtype),
            "inferred_type": kind,
            "role": "measure" if kind == "numeric" else "dimension",
            "rows": row_count,
            "distinct_count": int(series.nunique(dropna=True)),
            "pct_missing": round(float(series.isna().sum()) / max(row_count, 1) * 100.0, 2),
            "sample_values": list(series.dropna().unique()[:5]),
        })

    # Dataset-level signals that drive the visual-recommendation heuristics.
    has_date = any(c["inferred_type"] == "datetime" for c in col_profiles)
    small_dims = [c for c in col_profiles
                  if c["role"] == "dimension" and c["distinct_count"] <= 20]
    two_small_categories = len(small_dims) >= 2
    return {
        "n_rows": int(len(sample)),
        "n_cols": int(len(sample.columns)),
        "has_date": bool(has_date),
        "measures": int(sum(1 for c in col_profiles if c["role"] == "measure")),
        "dimensions": int(sum(1 for c in col_profiles if c["role"] == "dimension")),
        "two_small_categories": bool(two_small_categories),
        "hierarchies_found": bool(has_date or two_small_categories),
        "columns": col_profiles,
    }
# ---------------------------
# Visual recommendation engine
# ---------------------------
def recommend_visuals(profile: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Score candidate Power BI visuals against the dataset profile.

    Returns a list of {visual, score, rationale, extras} dicts, best first.
    """
    suggestions: List[Dict[str, Any]] = []

    def propose(name: str, score: float, why: str, extras=None) -> None:
        suggestions.append({
            "visual": name,
            "score": score,
            "rationale": why,
            "extras": extras or [],
        })

    has_date = profile.get("has_date", False)
    measures = profile.get("measures", 0)
    two_small = profile.get("two_small_categories", False)

    # Trend over time needs a date axis plus something to plot.
    if has_date and measures >= 1:
        propose("Line chart (trend over time)", 0.96,
                "Date/time column detected + at least one measure → trend visualization with drill down (Y/Q/M).",
                ["Enable drill hierarchy on Date table.", "Consider YoY or rolling average in a small multiple."])

    # Ranked comparison across a small number of categories.
    if measures >= 1 and any(c["role"] == "dimension" and c["distinct_count"] <= 20
                             for c in profile["columns"]):
        propose("Clustered bar/column (ranked categories)", 0.90,
                "Low-cardinality categorical fields with one measure → ranked comparison.",
                ["Sort by measure desc.", "Consider report-page tooltip for extra detail."])

    # Part-to-whole composition.
    if measures == 1 and two_small:
        propose("100% stacked bar/column (part-to-whole)", 0.78,
                "Two small categorical fields + one measure → relative composition across segments.",
                ["Prefer 100% stacked for share emphasis.", "Avoid pies/donuts unless very few categories."])

    # Measure-vs-measure comparison.
    if measures >= 2:
        propose("Scatter (compare two measures)", 0.74,
                "Two or more measures → scatter for correlation; add category legend if helpful.",
                ["Enable data labels, add trendline where suitable."])

    # Distribution of any numeric column.
    if any(c["inferred_type"] == "numeric" for c in profile["columns"]):
        propose("Histogram/Box plot (distribution & outliers)", 0.66,
                "Numeric columns → reveal distribution shape and outliers.",
                ["Use custom bins for stability.", "Use box plot for skew/outlier awareness."])

    # Root-cause exploration when hierarchies exist.
    if profile.get("hierarchies_found", False):
        propose("Decomposition tree (root-cause)", 0.64,
                "Date hierarchy or multiple small categorical fields → iterative 'why' analysis.",
                ["Seed 'Explain by' with key dimensions.", "Use High/Low splits strategically."])

    return sorted(suggestions, key=lambda rec: rec["score"], reverse=True)
# ---------------------------
# Power BI steps generator
# ---------------------------
def generate_powerbi_steps(profile: Dict[str, Any], context_text: str = "") -> str:
    """Produce a Markdown build guide tailored to the profiled dataset."""
    lines: List[str] = []

    # 1) Modeling guidance (always applicable).
    lines.append("Modeling & relationships:")
    lines.append("- Use a star schema: fact table(s) with dimension tables related 1:*; default single-direction filters.")
    lines.append("- Create a dedicated Date table (one row per date) and Mark as date table; use it for all time axes.")
    lines.append("- Hide surrogate keys/technical columns; keep columns typed & narrow.")

    # 2) Top-ranked visual recommendations plus their follow-up tips.
    lines.append("\nVisuals to build (with interactivity):")
    for rec in recommend_visuals(profile)[:4]:
        lines.append(f"- **{rec['visual']}** — {rec['rationale']}")
        lines.extend(f" - {tip}" for tip in rec.get("extras", []))

    # 3) Interactivity patterns.
    lines.append("\nInteractivity patterns:")
    lines.append("- Drillthrough: create a detail page, drag the entity field to the Drillthrough well, add a Back button.")
    lines.append("- Report page tooltips: create a tooltip-sized page, assign as tooltip on dense visuals.")
    lines.append("- Field parameters: add a parameter to let users swap dimensions/measures at runtime.")
    lines.append("- Consider a Decomposition Tree for explorations of 'why' a metric changes.")

    # 4) Scale & governance.
    lines.append("\nScale & governance:")
    lines.append("- For large fact tables, configure Incremental Refresh (RangeStart/RangeEnd).")
    lines.append("- Consider Aggregations for DirectQuery/composite models.")
    lines.append("- Define RLS roles on dimensions; test with 'View as'. Keep logic simple.")

    # 5) Optional requirements context from the uploaded .docx (truncated to 600 chars).
    if context_text.strip():
        lines.append("\nContext captured from the provided .docx:")
        lines.append(f"> {context_text[:600]}{'...' if len(context_text) > 600 else ''}")

    return "# Power BI Build Steps\n\n" + "\n".join(lines)
# ---------------------------
# Gradio UI callbacks
# ---------------------------
def cb_profile(file_path: str, docx_path: str):
    """Gradio callback: profile an uploaded table and generate build steps.

    Returns a 5-tuple matching the outputs wired up in the UI:
    (sample head DataFrame, summary JSON string, status text,
     column-profile DataFrame, list of export file paths).
    """
    df, profile, notes = None, {}, ""
    if file_path:
        try:
            df = read_table_file(file_path)
        except Exception as e:
            # Unreadable table: surface the error in the status box and bail out.
            return gr.update(value=None), json.dumps({}, indent=2), f"Error reading table: {e}", None, None
    # Optional .docx is context only; parse failures are reported inline, never fatal.
    if docx_path and os.path.splitext(docx_path)[1].lower() in DOC_EXTS:
        try:
            notes = read_docx_text(docx_path) if HAS_DOCX else ""
        except Exception as e:
            notes = f"(Could not parse .docx: {e})"
    if df is None:
        return gr.update(value=None), json.dumps({}, indent=2), "Please upload a .csv or .xlsx file.", None, None
    profile = profile_dataframe(df)
    # Build a small dataframe for display
    cols_df = pd.DataFrame(profile["columns"])
    head_df = df.head(10)
    # Save profile CSV for export (random suffix avoids clashes between sessions)
    out_csv = os.path.join(tempfile.gettempdir(), f"profile_{uuid.uuid4().hex[:8]}.csv")
    cols_df.to_csv(out_csv, index=False)
    # Save generated build steps as Markdown for export
    steps_md = generate_powerbi_steps(profile, context_text=notes)
    out_md = os.path.join(tempfile.gettempdir(), f"pbi_steps_{uuid.uuid4().hex[:8]}.md")
    with open(out_md, "w", encoding="utf-8") as f:
        f.write(steps_md)
    # Compact dataset-level summary shown in the JSON widget.
    summary = {
        "rows": profile["n_rows"],
        "columns": profile["n_cols"],
        "has_date": profile["has_date"],
        "measures": profile["measures"],
        "dimensions": profile["dimensions"],
        "hierarchies_found": profile["hierarchies_found"]
    }
    return head_df, json.dumps(summary, indent=2), "Profile computed. See 'Column profile' tab for details.", cols_df, [out_md, out_csv]
# RAG global state: lazily-built SimpleVectorStore singleton.
_VECTOR_STORE = None

def build_vector_store():
    """(Re)build the module-level vector store from the ./docs corpus."""
    global _VECTOR_STORE
    chunks = load_corpus_chunks()
    store = SimpleVectorStore()
    store.build(chunks)
    _VECTOR_STORE = store
def cb_qa(question: str, restrict: str):
    """Gradio callback: answer *question* from the offline trusted corpus.

    *restrict* is currently informational only; a full implementation could
    use it to filter hits by domain/source.
    """
    global _VECTOR_STORE
    if _VECTOR_STORE is None:
        build_vector_store()  # lazy init in case the startup build failed
    top_hits = _VECTOR_STORE.query(question, top_k=4)
    answer, _citations = compose_answer(question, top_hits, _VECTOR_STORE)
    return answer
def cb_reset():
    """Gradio callback: clear all output widgets back to their initial state."""
    # Transient export files live in /tmp, so there is nothing else to clean up.
    cleared_outputs = (
        None,                               # sample-data dataframe
        "{}",                               # profile summary JSON
        "Session cleared.",                 # status message
        None,                               # column-profile dataframe
        None,                               # export file list
        "",                                 # Q&A answer markdown
        "Microsoft Learn (default only)",   # trusted-sources dropdown
    )
    return cleared_outputs
# ---------------------------
# Build the UI
# ---------------------------
# UI layout: three tabs (profile, Q&A, explainability) inside one Blocks app.
with gr.Blocks(title="AI Agent for Power BI") as demo:
    gr.Markdown("# AI Agent for Power BI\nUpload data → get visual/report recommendations, drilldown ideas, and Power BI steps.\nAsk about patterns from trusted docs (offline bundle).")
    with gr.Tabs():
        # Tab 1: upload, profiling, and build-step generation
        with gr.TabItem("Upload & Profile"):
            with gr.Row():
                file_in = gr.File(label="Upload .csv or .xlsx", file_types=[".csv", ".xlsx"], type="filepath")
                docx_in = gr.File(label="Optional: upload a .docx with requirements (context only)", file_types=[".docx"], type="filepath")
            with gr.Row():
                btn_profile = gr.Button("Profile Data & Generate Steps", variant="primary")
                btn_reset = gr.Button("Reset Session")
            with gr.Row():
                df_head = gr.Dataframe(label="Sample data (first 10 rows)", interactive=False)
                profile_summary = gr.Code(label="Profile summary (JSON)")
            status_box = gr.Textbox(label="Status", interactive=False)
            with gr.Row():
                gr.Markdown("### Column profile")
                cols_profile = gr.Dataframe(label="Columns", interactive=False)
            with gr.Row():
                exports = gr.File(label="Download build steps (.md) and column profile (.csv)", file_count="multiple")
        # Tab 2: offline RAG Q&A over the trusted corpus
        with gr.TabItem("Patterns & Best Practices (Q&A)"):
            q = gr.Textbox(label="Ask a Power BI question (modeling, drillthrough, tooltips, field parameters, RLS, incremental refresh, aggregations...)", lines=3, placeholder="e.g., When should I use a star schema vs. a snowflake?")
            restrict = gr.Dropdown(choices=["Microsoft Learn (default only)", "Microsoft Learn + curated community"], value="Microsoft Learn (default only)", label="Trusted sources")
            btn_ask = gr.Button("Answer from trusted corpus", variant="primary")
            answer_md = gr.Markdown()
        # Tab 3: static explanation of the recommendation heuristics
        with gr.TabItem("Explainability & Exports"):
            gr.Markdown("This app uses simple heuristics for visual recommendations and an offline RAG for Q&A.\n- Visuals: date/time → line; low-cardinality categories → bars; 2+ measures → scatter; numeric distribution → histogram/box; hierarchies → decomposition tree.\n- Interactivity: drillthrough pages, report page tooltips, field parameters.\n- Scale: incremental refresh & aggregations; security via RLS.")
    # Bind events: outputs must stay aligned with the callbacks' return tuples.
    btn_profile.click(cb_profile, inputs=[file_in, docx_in], outputs=[df_head, profile_summary, status_box, cols_profile, exports])
    btn_reset.click(cb_reset, inputs=None, outputs=[df_head, profile_summary, status_box, cols_profile, exports, answer_md, restrict])
    btn_ask.click(cb_qa, inputs=[q, restrict], outputs=[answer_md])
# Build vector store once on startup for snappy Q&A
try:
    build_vector_store()
except Exception as e:
    # Non-fatal: cb_qa rebuilds lazily on the first question if this failed.
    print(f"[WARN] Vector store build failed: {e}")

if __name__ == "__main__":
    # Do NOT pass share=True for Hugging Face Spaces
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))