#!/usr/bin/env python3 """ AI Agent for Power BI — Gradio app for Hugging Face Spaces - Upload a spreadsheet (.csv/.xlsx) or a Word doc (.docx) - Profile the data, recommend visuals & report patterns (with drilldown, field params, tooltips, decomposition tree) - Generate "Power BI build steps" tailored to the dataset - Offline RAG Q&A over a small allow-listed corpus (seeded at runtime) focused on Microsoft Learn topics Notes: - No outbound web calls are required at runtime. - If sentence-transformers or faiss-cpu is not available, the app falls back to TF-IDF + cosine. - In Spaces, do NOT use share=True in launch(). """ import os import io import re import gc import csv import json import math import time import glob import uuid import shutil import string import random import hashlib import textwrap import tempfile from dataclasses import dataclass from typing import List, Dict, Any, Tuple import numpy as np import pandas as pd import gradio as gr # Optional dependencies try: import docx # python-docx HAS_DOCX = True except Exception: HAS_DOCX = False # Embeddings stack (robust fallbacks) _EMBED_BACKEND = "st" try: from sentence_transformers import SentenceTransformer import faiss # type: ignore HAS_ST = True HAS_FAISS = True except Exception: HAS_ST = False try: import faiss # type: ignore HAS_FAISS = True except Exception: HAS_FAISS = False _EMBED_BACKEND = "tfidf" from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID")) APP_DIR = os.path.dirname(os.path.abspath(__file__)) DOCS_DIR = os.path.join(APP_DIR, "docs") # local allow-listed offline bundle CACHE_DIR = os.path.join(APP_DIR, ".cache") os.makedirs(CACHE_DIR, exist_ok=True) # --------------------------- # Utilities # --------------------------- def clean_text(s: str) -> str: if not s: return "" s = s.replace("\\u0000", " ").replace("\\x00", " ") s = re.sub(r"[\u0000-\u001F]", " ", s) # control chars s = re.sub(r"\s+", " ", s).strip() return s def to_markdown_table(rows: List[Dict[str, Any]]) -> str: if not rows: return "_No rows_" headers = list(rows[0].keys()) out = [] out.append("| " + " | ".join(headers) + " |") out.append("| " + " | ".join(["---"] * len(headers)) + " |") for r in rows: out.append("| " + " | ".join([str(r.get(h, "")) for h in headers]) + " |") return "\n".join(out) def save_temp_textfile(name: str, text: str) -> str: tmp = os.path.join(tempfile.gettempdir(), name) with open(tmp, "w", encoding="utf-8") as f: f.write(text) return tmp # --------------------------- # Seed trusted corpus (offline) # --------------------------- SEED_DOCS = [ { "title": "Star Schema and Modeling Guidance", "source_url": "https://learn.microsoft.com/power-bi/guidance/star-schema", "text": """ Key ideas for Power BI modeling: - Prefer a star schema: one or more fact tables at the center, surrounded by dimension tables. - Use one-to-many relationships from dimensions to facts; default filter direction single-direction. - Avoid many-to-many between fact tables. Use conformed dimensions to integrate subjects. - Create and mark a proper Date table for time intelligence; avoid relying on auto date/time for complex models. - Keep columns narrow and typed correctly; hide surrogate keys and technical columns from report view. """.strip() }, { "title": "Drillthrough Pages", "source_url": "https://learn.microsoft.com/power-bi/visuals/drillthrough", "text": """ Drillthrough pattern: - Create a dedicated drillthrough page; add the target field to the Drillthrough well. - Place visuals that explain the selected entity (e.g., Customer, Product). - Add a Back button to return to the source page. - Test by right-clicking a category value on a source visual and choosing Drillthrough. """.strip() }, { "title": "Report Page Tooltips", "source_url": "https://learn.microsoft.com/power-bi/create-reports/desktop-tooltips", "text": """ Report page tooltips: - Create a small page sized for tooltip (e.g., 320x240 or default tooltip page size). - Add detailed visuals or KPIs intended to show on hover. - In the source visual, enable tooltip and assign the tooltip page. - Keep tooltip visuals compact; avoid heavy interactions on hover. """.strip() }, { "title": "Field Parameters for Self-Service", "source_url": "https://learn.microsoft.com/power-bi/create-reports/field-parameters", "text": """ Field parameters: - Create a field parameter to allow viewers to swap dimensions/measures on visuals. - Add the parameter field to the Axis/Legend/Values well of visuals. - Provide a slicer bound to the parameter to let users choose fields at runtime. - Useful for building one flexible page instead of many near-duplicate pages. """.strip() }, { "title": "Incremental Refresh", "source_url": "https://learn.microsoft.com/power-bi/enterprise/incremental-refresh-overview", "text": """ Incremental refresh: - Define RangeStart and RangeEnd parameters to partition by date/time. - Configure policy: how much history to store and how much of the newer period to refresh. - Only the most recent partitions are refreshed in the Service; historical partitions remain stable. - Great for large tables; combine with aggregations for best performance. """.strip() }, { "title": "Aggregations", "source_url": "https://learn.microsoft.com/power-bi/transform-model/aggregations-advanced", "text": """ Aggregations pattern: - Create an aggregate table with pre-summarized values (e.g., by Month/Region/Product). - Map it to a detailed DirectQuery or Import table via manage aggregations. - Power BI routes queries to the smaller aggregated table when possible. - Especially valuable with DirectQuery/composite models. """.strip() }, { "title": "Row-Level Security (RLS)", "source_url": "https://learn.microsoft.com/power-bi/admin/rls", "text": """ RLS guidance: - Define roles with filters on dimension tables (e.g., Region = USERPRINCIPALNAME() mapping table). - Test roles in Power BI Desktop using 'View as'. - In the Service, assign users/groups to roles; viewers see only permitted rows. - Keep RLS logic simple; avoid complex bi-directional relationships. """.strip() }, { "title": "Decomposition Tree Visual", "source_url": "https://learn.microsoft.com/power-bi/visuals/power-bi-visualization-decomposition-tree", "text": """ Decomposition Tree: - A visual to perform root-cause analysis by breaking down a measure across dimensions. - Add dimensions in the 'Explain by' well; choose 'High value' or 'Low value' splits to guide analysis. - Great for iterative 'why' exploration in operational datasets. """.strip() } ] def ensure_docs_directory(): """Create ./docs with seed files if it doesn't exist or is empty.""" os.makedirs(DOCS_DIR, exist_ok=True) md_files = list(glob.glob(os.path.join(DOCS_DIR, "*.md"))) if md_files: return # Write seed docs for i, d in enumerate(SEED_DOCS, start=1): fn = os.path.join(DOCS_DIR, f"{i:02d}-{re.sub(r'[^a-z0-9]+', '-', d['title'].lower()).strip('-')}.md") with open(fn, "w", encoding="utf-8") as f: f.write(f"# {d['title']}\n\n") f.write(f"Source: {d['source_url']}\n\n") f.write(d["text"].strip() + "\n") # --------------------------- # Simple offline RAG # --------------------------- @dataclass class DocChunk: doc_id: str title: str source_url: str text: str class SimpleVectorStore: def __init__(self): self.backend = _EMBED_BACKEND # "st" or "tfidf" self.model = None self.index = None self.texts: List[str] = [] self.meta: List[Tuple[str, str, str]] = [] # (doc_id, title, source_url) if self.backend == "st" and HAS_ST: # Smaller model is fine for HF CPU Spaces self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") def _embed(self, texts: List[str]) -> np.ndarray: if self.backend == "st" and HAS_ST: emb = self.model.encode(texts, batch_size=32, convert_to_numpy=True, normalize_embeddings=True) return emb.astype("float32") else: # TF-IDF fallback (cosine will be computed via sklearn) self.vectorizer = TfidfVectorizer(min_df=1, max_df=0.95, ngram_range=(1,2)) X = self.vectorizer.fit_transform(texts) self._tfidf_matrix = X return np.zeros((X.shape[0], 1), dtype="float32") # dummy; we won't use FAISS path def build(self, chunks: List[DocChunk]) -> None: self.texts = [c.text for c in chunks] self.meta = [(c.doc_id, c.title, c.source_url) for c in chunks] if self.backend == "st" and HAS_ST and HAS_FAISS: embs = self._embed(self.texts) dim = embs.shape[1] self.index = faiss.IndexFlatIP(dim) self.index.add(embs) else: # TF-IDF fallback doesn't build a FAISS index if self.backend == "st" and not HAS_FAISS: # Still have ST model but no FAISS -> cosine on numpy self.embs = self._embed(self.texts) elif self.backend == "tfidf": # vectorizer built in _embed self._embed(self.texts) def query(self, q: str, top_k: int = 4) -> List[Tuple[float, int]]: """Return [(score, idx), ...] sorted by score desc.""" q = clean_text(q) if not q.strip() or not self.texts: return [] if self.backend == "st" and HAS_ST and HAS_FAISS: q_emb = self.model.encode([q], convert_to_numpy=True, normalize_embeddings=True).astype("float32") sims, idxs = self.index.search(q_emb, top_k) return [(float(sims[0][i]), int(idxs[0][i])) for i in range(len(idxs[0]))] elif self.backend == "st" and HAS_ST and not HAS_FAISS: # cosine on numpy q_emb = self.model.encode([q], convert_to_numpy=True, normalize_embeddings=True).astype("float32") sims = (self.embs @ q_emb.T).ravel() idxs = np.argsort(-sims)[:top_k] return [(float(sims[i]), int(i)) for i in idxs] else: # TF-IDF q_vec = self.vectorizer.transform([q]) sims = cosine_similarity(self._tfidf_matrix, q_vec).ravel() idxs = np.argsort(-sims)[:top_k] return [(float(sims[i]), int(i)) for i in idxs] def load_corpus_chunks() -> List[DocChunk]: ensure_docs_directory() chunks: List[DocChunk] = [] for path in sorted(glob.glob(os.path.join(DOCS_DIR, "*.md"))): with open(path, "r", encoding="utf-8") as f: raw = f.read() # first line is title, second may contain Source: URL lines = [l.strip() for l in raw.splitlines() if l.strip()] title = lines[0].lstrip("# ").strip() if lines else os.path.basename(path) src = "" if len(lines) > 1 and lines[1].lower().startswith("source:"): src = lines[1].split(":", 1)[1].strip() body = "\n".join(lines[2:]) if len(lines) > 2 else "\n".join(lines[1:]) body = clean_text(body) # Chunk by paragraphs ~600-900 chars paras = re.split(r"\n{2,}", body) buf = "" for p in paras: if len(buf) + len(p) < 800: buf += (p + "\n\n") else: if buf.strip(): chunks.append(DocChunk(doc_id=os.path.basename(path), title=title, source_url=src, text=buf.strip())) buf = p + "\n\n" if buf.strip(): chunks.append(DocChunk(doc_id=os.path.basename(path), title=title, source_url=src, text=buf.strip())) return chunks def compose_answer(question: str, hits: List[Tuple[float, int]], store: SimpleVectorStore) -> Tuple[str, List[Dict[str, str]]]: if not hits: return ("I couldn't find this in the trusted corpus. Try rephrasing or add more Microsoft Learn pages to the ./docs folder.", []) pieces = [] cites = [] for score, idx in hits: text = store.texts[idx] doc_id, title, src = store.meta[idx] pieces.append(f"- {text}") cites.append({"title": title, "source": src}) answer = "### Answer (from trusted sources)\n" + "\n".join(pieces[:3]) answer += "\n\n**Citations:**\n" + "\n".join([f"- {c['title']} — {c['source']}" for c in cites[:3]]) return answer, cites # --------------------------- # Data ingestion & profiling # --------------------------- READABLE_EXTS = [".csv", ".xlsx"] DOC_EXTS = [".docx"] def read_table_file(path: str) -> pd.DataFrame: ext = os.path.splitext(path)[1].lower() if ext == ".csv": return pd.read_csv(path) elif ext == ".xlsx": return pd.read_excel(path) else: raise ValueError(f"Unsupported table extension: {ext}") def read_docx_text(path: str) -> str: if not HAS_DOCX: return "" doc = docx.Document(path) return "\n".join([p.text for p in doc.paragraphs]) def infer_col_type(series: pd.Series) -> str: if pd.api.types.is_datetime64_any_dtype(series): return "datetime" if pd.api.types.is_bool_dtype(series): return "boolean" if pd.api.types.is_numeric_dtype(series): return "numeric" # treat everything else as text/categorical # attempt parse datetime small sample try: pd.to_datetime(series.dropna().head(20), errors="raise") # may raise return "datetime" except Exception: pass # if few unique values relative to rows -> categorical n = len(series) dc = series.nunique(dropna=True) if n > 0 and dc / max(n,1) < 0.2: return "categorical" return "text" def profile_dataframe(df: pd.DataFrame, max_sample: int = 10_000) -> Dict[str, Any]: work = df.copy() if len(work) > max_sample: work = work.sample(max_sample, random_state=42).reset_index(drop=True) cols = [] for col in work.columns: s = work[col] inferred = infer_col_type(s) role = "measure" if inferred == "numeric" else "dimension" n = len(s) miss = float(s.isna().sum()) / max(n, 1) distinct = int(s.nunique(dropna=True)) sample_vals = list(s.dropna().unique()[:5]) cols.append({ "column": col, "pandas_dtype": str(s.dtype), "inferred_type": inferred, "role": role, "rows": n, "distinct_count": distinct, "pct_missing": round(miss * 100.0, 2), "sample_values": sample_vals }) # high-level signals has_date = any(c["inferred_type"] == "datetime" for c in cols) measures = sum(1 for c in cols if c["role"] == "measure") dims = sum(1 for c in cols if c["role"] == "dimension") low_card_dims = [c for c in cols if c["role"] == "dimension" and c["distinct_count"] <= 20] two_small_categories = len(low_card_dims) >= 2 hierarchies_found = has_date or two_small_categories return { "n_rows": int(len(work)), "n_cols": int(len(work.columns)), "has_date": bool(has_date), "measures": int(measures), "dimensions": int(dims), "two_small_categories": bool(two_small_categories), "hierarchies_found": bool(hierarchies_found), "columns": cols } # --------------------------- # Visual recommendation engine # --------------------------- def recommend_visuals(profile: Dict[str, Any]) -> List[Dict[str, Any]]: recs = [] def add(name, score, why, extras=None): recs.append({ "visual": name, "score": score, "rationale": why, "extras": extras or [] }) has_date = profile.get("has_date", False) measures = profile.get("measures", 0) two_small = profile.get("two_small_categories", False) hier = profile.get("hierarchies_found", False) # Time series if has_date and measures >= 1: add("Line chart (trend over time)", 0.96, "Date/time column detected + at least one measure → trend visualization with drill down (Y/Q/M).", ["Enable drill hierarchy on Date table.", "Consider YoY or rolling average in a small multiple."]) # Ranked bars for categories if measures >= 1: low_card_dims = [c for c in profile["columns"] if c["role"] == "dimension" and c["distinct_count"] <= 20] if low_card_dims: add("Clustered bar/column (ranked categories)", 0.90, "Low-cardinality categorical fields with one measure → ranked comparison.", ["Sort by measure desc.", "Consider report-page tooltip for extra detail."]) # Part-to-whole if measures == 1 and two_small: add("100% stacked bar/column (part-to-whole)", 0.78, "Two small categorical fields + one measure → relative composition across segments.", ["Prefer 100% stacked for share emphasis.", "Avoid pies/donuts unless very few categories."]) # Compare measures if measures >= 2: add("Scatter (compare two measures)", 0.74, "Two or more measures → scatter for correlation; add category legend if helpful.", ["Enable data labels, add trendline where suitable."]) # Distribution numeric_cols = [c for c in profile["columns"] if c["inferred_type"] == "numeric"] if numeric_cols: add("Histogram/Box plot (distribution & outliers)", 0.66, "Numeric columns → reveal distribution shape and outliers.", ["Use custom bins for stability.", "Use box plot for skew/outlier awareness."]) # Root-cause analysis if hier: add("Decomposition tree (root-cause)", 0.64, "Date hierarchy or multiple small categorical fields → iterative 'why' analysis.", ["Seed 'Explain by' with key dimensions.", "Use High/Low splits strategically."]) # Rank by score recs.sort(key=lambda x: x["score"], reverse=True) return recs # --------------------------- # Power BI steps generator # --------------------------- def generate_powerbi_steps(profile: Dict[str, Any], context_text: str = "") -> str: bullets = [] # Modeling bullets.append("Modeling & relationships:") bullets.append("- Use a star schema: fact table(s) with dimension tables related 1:*; default single-direction filters.") bullets.append("- Create a dedicated Date table (one row per date) and Mark as date table; use it for all time axes.") bullets.append("- Hide surrogate keys/technical columns; keep columns typed & narrow.") # Visuals based on recs recs = recommend_visuals(profile) bullets.append("\nVisuals to build (with interactivity):") for r in recs[:4]: bullets.append(f"- **{r['visual']}** — {r['rationale']}") for ex in r.get("extras", []): bullets.append(f" - {ex}") # Interactivity patterns bullets.append("\nInteractivity patterns:") bullets.append("- Drillthrough: create a detail page, drag the entity field to the Drillthrough well, add a Back button.") bullets.append("- Report page tooltips: create a tooltip-sized page, assign as tooltip on dense visuals.") bullets.append("- Field parameters: add a parameter to let users swap dimensions/measures at runtime.") bullets.append("- Consider a Decomposition Tree for explorations of 'why' a metric changes.") # Scale & governance bullets.append("\nScale & governance:") bullets.append("- For large fact tables, configure Incremental Refresh (RangeStart/RangeEnd).") bullets.append("- Consider Aggregations for DirectQuery/composite models.") bullets.append("- Define RLS roles on dimensions; test with 'View as'. Keep logic simple.") if context_text.strip(): bullets.append("\nContext captured from the provided .docx:") bullets.append(f"> {context_text[:600]}{'...' if len(context_text) > 600 else ''}") md = "# Power BI Build Steps\n\n" + "\n".join([f"{line}" for line in bullets]) return md # --------------------------- # Gradio UI callbacks # --------------------------- def cb_profile(file_path: str, docx_path: str): df, profile, notes = None, {}, "" if file_path: try: df = read_table_file(file_path) except Exception as e: return gr.update(value=None), json.dumps({}, indent=2), f"Error reading table: {e}", None, None if docx_path and os.path.splitext(docx_path)[1].lower() in DOC_EXTS: try: notes = read_docx_text(docx_path) if HAS_DOCX else "" except Exception as e: notes = f"(Could not parse .docx: {e})" if df is None: return gr.update(value=None), json.dumps({}, indent=2), "Please upload a .csv or .xlsx file.", None, None profile = profile_dataframe(df) # Build a small dataframe for display cols_df = pd.DataFrame(profile["columns"]) head_df = df.head(10) # Save profile CSV for export out_csv = os.path.join(tempfile.gettempdir(), f"profile_{uuid.uuid4().hex[:8]}.csv") cols_df.to_csv(out_csv, index=False) # Save sample steps steps_md = generate_powerbi_steps(profile, context_text=notes) out_md = os.path.join(tempfile.gettempdir(), f"pbi_steps_{uuid.uuid4().hex[:8]}.md") with open(out_md, "w", encoding="utf-8") as f: f.write(steps_md) summary = { "rows": profile["n_rows"], "columns": profile["n_cols"], "has_date": profile["has_date"], "measures": profile["measures"], "dimensions": profile["dimensions"], "hierarchies_found": profile["hierarchies_found"] } return head_df, json.dumps(summary, indent=2), "Profile computed. See 'Column profile' tab for details.", cols_df, [out_md, out_csv] # RAG global state _VECTOR_STORE = None def build_vector_store(): global _VECTOR_STORE chunks = load_corpus_chunks() store = SimpleVectorStore() store.build(chunks) _VECTOR_STORE = store def cb_qa(question: str, restrict: str): global _VECTOR_STORE if _VECTOR_STORE is None: build_vector_store() # In a full implementation, 'restrict' could filter by domain/source. hits = _VECTOR_STORE.query(question, top_k=4) answer, cites = compose_answer(question, hits, _VECTOR_STORE) return answer def cb_reset(): # Nothing to deeply reset beyond UI; transient files are in /tmp return None, "{}", "Session cleared.", None, None, "", "Microsoft Learn (default only)" # --------------------------- # Build the UI # --------------------------- with gr.Blocks(title="AI Agent for Power BI") as demo: gr.Markdown("# AI Agent for Power BI\nUpload data → get visual/report recommendations, drilldown ideas, and Power BI steps.\nAsk about patterns from trusted docs (offline bundle).") with gr.Tabs(): with gr.TabItem("Upload & Profile"): with gr.Row(): file_in = gr.File(label="Upload .csv or .xlsx", file_types=[".csv", ".xlsx"], type="filepath") docx_in = gr.File(label="Optional: upload a .docx with requirements (context only)", file_types=[".docx"], type="filepath") with gr.Row(): btn_profile = gr.Button("Profile Data & Generate Steps", variant="primary") btn_reset = gr.Button("Reset Session") with gr.Row(): df_head = gr.Dataframe(label="Sample data (first 10 rows)", interactive=False) profile_summary = gr.Code(label="Profile summary (JSON)") status_box = gr.Textbox(label="Status", interactive=False) with gr.Row(): gr.Markdown("### Column profile") cols_profile = gr.Dataframe(label="Columns", interactive=False) with gr.Row(): exports = gr.File(label="Download build steps (.md) and column profile (.csv)", file_count="multiple") with gr.TabItem("Patterns & Best Practices (Q&A)"): q = gr.Textbox(label="Ask a Power BI question (modeling, drillthrough, tooltips, field parameters, RLS, incremental refresh, aggregations...)", lines=3, placeholder="e.g., When should I use a star schema vs. a snowflake?") restrict = gr.Dropdown(choices=["Microsoft Learn (default only)", "Microsoft Learn + curated community"], value="Microsoft Learn (default only)", label="Trusted sources") btn_ask = gr.Button("Answer from trusted corpus", variant="primary") answer_md = gr.Markdown() with gr.TabItem("Explainability & Exports"): gr.Markdown("This app uses simple heuristics for visual recommendations and an offline RAG for Q&A.\n- Visuals: date/time → line; low-cardinality categories → bars; 2+ measures → scatter; numeric distribution → histogram/box; hierarchies → decomposition tree.\n- Interactivity: drillthrough pages, report page tooltips, field parameters.\n- Scale: incremental refresh & aggregations; security via RLS.") # Bind events btn_profile.click(cb_profile, inputs=[file_in, docx_in], outputs=[df_head, profile_summary, status_box, cols_profile, exports]) btn_reset.click(cb_reset, inputs=None, outputs=[df_head, profile_summary, status_box, cols_profile, exports, answer_md, restrict]) btn_ask.click(cb_qa, inputs=[q, restrict], outputs=[answer_md]) # Build vector store once on startup for snappy Q&A try: build_vector_store() except Exception as e: print(f"[WARN] Vector store build failed: {e}") if __name__ == "__main__": # Do NOT pass share=True for Hugging Face Spaces demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))