#!/usr/bin/env python3
"""
AI Agent for Power BI - Gradio app for Hugging Face Spaces

- Upload a spreadsheet (.csv/.xlsx) or a Word doc (.docx)
- Profile the data, recommend visuals & report patterns (with drilldown, field params, tooltips, decomposition tree)
- Generate "Power BI build steps" tailored to the dataset
- Offline RAG Q&A over a small allow-listed corpus (seeded at runtime) focused on Microsoft Learn topics

Notes:
- No outbound web calls are required at runtime.
- If sentence-transformers or faiss-cpu is not available, the app falls back to TF-IDF + cosine.
- In Spaces, do NOT use share=True in launch().
"""
import os
import re
import json
import glob
import uuid
import tempfile
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd
import gradio as gr
# Optional dependencies
try:
    import docx  # python-docx
    HAS_DOCX = True
except Exception:
    HAS_DOCX = False

# Embeddings stack, probed independently so every fallback is reachable:
# sentence-transformers + FAISS, then sentence-transformers + numpy cosine,
# then TF-IDF + sklearn cosine as the last resort.
_EMBED_BACKEND = "st"
try:
    from sentence_transformers import SentenceTransformer
    HAS_ST = True
except Exception:
    HAS_ST = False
    _EMBED_BACKEND = "tfidf"
try:
    import faiss  # type: ignore
    HAS_FAISS = True
except Exception:
    HAS_FAISS = False

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
APP_DIR = os.path.dirname(os.path.abspath(__file__))
DOCS_DIR = os.path.join(APP_DIR, "docs")  # local allow-listed offline bundle
CACHE_DIR = os.path.join(APP_DIR, ".cache")
os.makedirs(CACHE_DIR, exist_ok=True)
# ---------------------------
# Utilities
# ---------------------------
def clean_text(s: str) -> str:
    if not s:
        return ""
    s = re.sub(r"[\u0000-\u001F]", " ", s)  # strip NULs and other control chars
    s = re.sub(r"\s+", " ", s).strip()      # collapse all whitespace runs
    return s


def to_markdown_table(rows: List[Dict[str, Any]]) -> str:
    if not rows:
        return "_No rows_"
    headers = list(rows[0].keys())
    out = []
    out.append("| " + " | ".join(headers) + " |")
    out.append("| " + " | ".join(["---"] * len(headers)) + " |")
    for r in rows:
        out.append("| " + " | ".join([str(r.get(h, "")) for h in headers]) + " |")
    return "\n".join(out)
def save_temp_textfile(name: str, text: str) -> str:
    tmp = os.path.join(tempfile.gettempdir(), name)
    with open(tmp, "w", encoding="utf-8") as f:
        f.write(text)
    return tmp
# ---------------------------
# Seed trusted corpus (offline)
# ---------------------------
SEED_DOCS = [
    {
        "title": "Star Schema and Modeling Guidance",
        "source_url": "https://learn.microsoft.com/power-bi/guidance/star-schema",
        "text": """
Key ideas for Power BI modeling:
- Prefer a star schema: one or more fact tables at the center, surrounded by dimension tables.
- Use one-to-many relationships from dimensions to facts; default filter direction single-direction.
- Avoid many-to-many between fact tables. Use conformed dimensions to integrate subjects.
- Create and mark a proper Date table for time intelligence; avoid relying on auto date/time for complex models.
- Keep columns narrow and typed correctly; hide surrogate keys and technical columns from report view.
""".strip()
    },
    {
        "title": "Drillthrough Pages",
        "source_url": "https://learn.microsoft.com/power-bi/visuals/drillthrough",
        "text": """
Drillthrough pattern:
- Create a dedicated drillthrough page; add the target field to the Drillthrough well.
- Place visuals that explain the selected entity (e.g., Customer, Product).
- Add a Back button to return to the source page.
- Test by right-clicking a category value on a source visual and choosing Drillthrough.
""".strip()
    },
    {
        "title": "Report Page Tooltips",
        "source_url": "https://learn.microsoft.com/power-bi/create-reports/desktop-tooltips",
        "text": """
Report page tooltips:
- Create a small page sized for tooltip (e.g., 320x240 or default tooltip page size).
- Add detailed visuals or KPIs intended to show on hover.
- In the source visual, enable tooltip and assign the tooltip page.
- Keep tooltip visuals compact; avoid heavy interactions on hover.
""".strip()
    },
    {
        "title": "Field Parameters for Self-Service",
        "source_url": "https://learn.microsoft.com/power-bi/create-reports/field-parameters",
        "text": """
Field parameters:
- Create a field parameter to allow viewers to swap dimensions/measures on visuals.
- Add the parameter field to the Axis/Legend/Values well of visuals.
- Provide a slicer bound to the parameter to let users choose fields at runtime.
- Useful for building one flexible page instead of many near-duplicate pages.
""".strip()
    },
    {
        "title": "Incremental Refresh",
        "source_url": "https://learn.microsoft.com/power-bi/enterprise/incremental-refresh-overview",
        "text": """
Incremental refresh:
- Define RangeStart and RangeEnd parameters to partition by date/time.
- Configure policy: how much history to store and how much of the newer period to refresh.
- Only the most recent partitions are refreshed in the Service; historical partitions remain stable.
- Great for large tables; combine with aggregations for best performance.
""".strip()
    },
    {
        "title": "Aggregations",
        "source_url": "https://learn.microsoft.com/power-bi/transform-model/aggregations-advanced",
        "text": """
Aggregations pattern:
- Create an aggregate table with pre-summarized values (e.g., by Month/Region/Product).
- Map it to a detailed DirectQuery or Import table via manage aggregations.
- Power BI routes queries to the smaller aggregated table when possible.
- Especially valuable with DirectQuery/composite models.
""".strip()
    },
    {
        "title": "Row-Level Security (RLS)",
        "source_url": "https://learn.microsoft.com/power-bi/admin/rls",
        "text": """
RLS guidance:
- Define roles with filters on dimension tables (e.g., Region = USERPRINCIPALNAME() mapping table).
- Test roles in Power BI Desktop using 'View as'.
- In the Service, assign users/groups to roles; viewers see only permitted rows.
- Keep RLS logic simple; avoid complex bi-directional relationships.
""".strip()
    },
    {
        "title": "Decomposition Tree Visual",
        "source_url": "https://learn.microsoft.com/power-bi/visuals/power-bi-visualization-decomposition-tree",
        "text": """
Decomposition Tree:
- A visual to perform root-cause analysis by breaking down a measure across dimensions.
- Add dimensions in the 'Explain by' well; choose 'High value' or 'Low value' splits to guide analysis.
- Great for iterative 'why' exploration in operational datasets.
""".strip()
    }
]
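
# Each corpus file under ./docs follows the layout written by
# ensure_docs_directory() below and parsed by load_corpus_chunks():
#   # <Title>
#   Source: <URL>
#   <body text...>
# Drop additional Microsoft Learn summaries into ./docs in the same shape to
# extend the trusted corpus; no code changes are needed.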
def ensure_docs_directory():
    """Create ./docs with seed files if it doesn't exist or is empty."""
    os.makedirs(DOCS_DIR, exist_ok=True)
    md_files = list(glob.glob(os.path.join(DOCS_DIR, "*.md")))
    if md_files:
        return
    # Write seed docs
    for i, d in enumerate(SEED_DOCS, start=1):
        slug = re.sub(r"[^a-z0-9]+", "-", d["title"].lower()).strip("-")
        fn = os.path.join(DOCS_DIR, f"{i:02d}-{slug}.md")
        with open(fn, "w", encoding="utf-8") as f:
            f.write(f"# {d['title']}\n\n")
            f.write(f"Source: {d['source_url']}\n\n")
            f.write(d["text"].strip() + "\n")
# ---------------------------
# Simple offline RAG
# ---------------------------
@dataclass
class DocChunk:
    doc_id: str
    title: str
    source_url: str
    text: str
class SimpleVectorStore:
    def __init__(self):
        self.backend = _EMBED_BACKEND  # "st" or "tfidf"
        self.model = None
        self.index = None
        self.embs = None            # numpy embeddings (ST without FAISS)
        self.vectorizer = None      # TF-IDF fallback
        self._tfidf_matrix = None
        self.texts: List[str] = []
        self.meta: List[Tuple[str, str, str]] = []  # (doc_id, title, source_url)
        if self.backend == "st" and HAS_ST:
            # A small model is fine for HF CPU Spaces
            self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def _embed(self, texts: List[str]) -> np.ndarray:
        if self.backend == "st" and HAS_ST:
            emb = self.model.encode(texts, batch_size=32, convert_to_numpy=True, normalize_embeddings=True)
            return emb.astype("float32")
        # TF-IDF fallback (cosine will be computed via sklearn)
        self.vectorizer = TfidfVectorizer(min_df=1, max_df=0.95, ngram_range=(1, 2))
        self._tfidf_matrix = self.vectorizer.fit_transform(texts)
        return np.zeros((self._tfidf_matrix.shape[0], 1), dtype="float32")  # dummy; the FAISS path is unused here

    def build(self, chunks: List[DocChunk]) -> None:
        self.texts = [c.text for c in chunks]
        self.meta = [(c.doc_id, c.title, c.source_url) for c in chunks]
        if self.backend == "st" and HAS_ST and HAS_FAISS:
            embs = self._embed(self.texts)
            self.index = faiss.IndexFlatIP(embs.shape[1])  # inner product == cosine on normalized vectors
            self.index.add(embs)
        elif self.backend == "st" and HAS_ST:
            # ST model available but no FAISS -> keep raw embeddings, cosine via numpy
            self.embs = self._embed(self.texts)
        else:
            # TF-IDF: the vectorizer and matrix are built inside _embed
            self._embed(self.texts)

    def query(self, q: str, top_k: int = 4) -> List[Tuple[float, int]]:
        """Return [(score, idx), ...] sorted by score desc."""
        q = clean_text(q)
        if not q or not self.texts:
            return []
        if self.backend == "st" and HAS_ST and HAS_FAISS:
            q_emb = self.model.encode([q], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
            sims, idxs = self.index.search(q_emb, top_k)
            # FAISS pads with -1 when fewer than top_k vectors are indexed
            return [(float(s), int(i)) for s, i in zip(sims[0], idxs[0]) if int(i) >= 0]
        elif self.backend == "st" and HAS_ST:
            # cosine on numpy (normalized embeddings -> dot product)
            q_emb = self.model.encode([q], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
            sims = (self.embs @ q_emb.T).ravel()
        else:
            # TF-IDF
            q_vec = self.vectorizer.transform([q])
            sims = cosine_similarity(self._tfidf_matrix, q_vec).ravel()
        idxs = np.argsort(-sims)[:top_k]
        return [(float(sims[i]), int(i)) for i in idxs]
def load_corpus_chunks() -> List[DocChunk]:
    ensure_docs_directory()
    chunks: List[DocChunk] = []
    for path in sorted(glob.glob(os.path.join(DOCS_DIR, "*.md"))):
        with open(path, "r", encoding="utf-8") as f:
            raw = f.read()
        # First non-empty line is the title; the next may contain "Source: <URL>"
        lines = [l.strip() for l in raw.splitlines() if l.strip()]
        title = lines[0].lstrip("# ").strip() if lines else os.path.basename(path)
        src = ""
        body_lines = lines[1:]
        if len(lines) > 1 and lines[1].lower().startswith("source:"):
            src = lines[1].split(":", 1)[1].strip()
            body_lines = lines[2:]
        # Accumulate lines into ~800-char chunks (blank lines were dropped
        # above, so there are no paragraph breaks left to split on)
        doc_id = os.path.basename(path)
        buf = ""
        for p in (clean_text(l) for l in body_lines):
            if not p:
                continue
            if len(buf) + len(p) < 800:
                buf += p + "\n"
            else:
                if buf.strip():
                    chunks.append(DocChunk(doc_id=doc_id, title=title, source_url=src, text=buf.strip()))
                buf = p + "\n"
        if buf.strip():
            chunks.append(DocChunk(doc_id=doc_id, title=title, source_url=src, text=buf.strip()))
    return chunks
def compose_answer(question: str, hits: List[Tuple[float, int]], store: SimpleVectorStore) -> Tuple[str, List[Dict[str, str]]]:
    if not hits:
        return ("I couldn't find this in the trusted corpus. Try rephrasing or add more Microsoft Learn pages to the ./docs folder.", [])
    pieces = []
    cites = []
    for score, idx in hits:
        text = store.texts[idx]
        doc_id, title, src = store.meta[idx]
        pieces.append(f"- {text}")
        cites.append({"title": title, "source": src})
    answer = "### Answer (from trusted sources)\n" + "\n".join(pieces[:3])
    answer += "\n\n**Citations:**\n" + "\n".join([f"- {c['title']} - {c['source']}" for c in cites[:3]])
    return answer, cites
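

# Illustrative sketch (a hypothetical helper, not called by the app): the full
# offline Q&A path, mirroring what cb_qa() does further below.
def _demo_offline_qa() -> None:
    store = SimpleVectorStore()
    store.build(load_corpus_chunks())
    question = "How do I configure incremental refresh?"
    hits = store.query(question, top_k=3)
    answer, cites = compose_answer(question, hits, store)
    print(answer)  # markdown answer with up to 3 bullets plus citations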
# ---------------------------
# Data ingestion & profiling
# ---------------------------
READABLE_EXTS = [".csv", ".xlsx"]
DOC_EXTS = [".docx"]


def read_table_file(path: str) -> pd.DataFrame:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".csv":
        return pd.read_csv(path)
    elif ext == ".xlsx":
        return pd.read_excel(path)
    else:
        raise ValueError(f"Unsupported table extension: {ext}")


def read_docx_text(path: str) -> str:
    if not HAS_DOCX:
        return ""
    doc = docx.Document(path)
    return "\n".join([p.text for p in doc.paragraphs])
def infer_col_type(series: pd.Series) -> str:
    if pd.api.types.is_datetime64_any_dtype(series):
        return "datetime"
    if pd.api.types.is_bool_dtype(series):
        return "boolean"
    if pd.api.types.is_numeric_dtype(series):
        return "numeric"
    # Everything else is text/categorical; first try parsing a small sample as datetime
    sample = series.dropna().head(20)
    if not sample.empty:
        try:
            pd.to_datetime(sample, errors="raise")  # may raise
            return "datetime"
        except Exception:
            pass
    # Few unique values relative to rows -> categorical
    n = len(series)
    dc = series.nunique(dropna=True)
    if n > 0 and dc / max(n, 1) < 0.2:
        return "categorical"
    return "text"
def profile_dataframe(df: pd.DataFrame, max_sample: int = 10_000) -> Dict[str, Any]:
    work = df.copy()
    if len(work) > max_sample:
        work = work.sample(max_sample, random_state=42).reset_index(drop=True)
    cols = []
    for col in work.columns:
        s = work[col]
        inferred = infer_col_type(s)
        role = "measure" if inferred == "numeric" else "dimension"
        n = len(s)
        miss = float(s.isna().sum()) / max(n, 1)
        distinct = int(s.nunique(dropna=True))
        sample_vals = list(s.dropna().unique()[:5])
        cols.append({
            "column": col,
            "pandas_dtype": str(s.dtype),
            "inferred_type": inferred,
            "role": role,
            "rows": n,
            "distinct_count": distinct,
            "pct_missing": round(miss * 100.0, 2),
            "sample_values": sample_vals,
        })
    # High-level signals
    has_date = any(c["inferred_type"] == "datetime" for c in cols)
    measures = sum(1 for c in cols if c["role"] == "measure")
    dims = sum(1 for c in cols if c["role"] == "dimension")
    low_card_dims = [c for c in cols if c["role"] == "dimension" and c["distinct_count"] <= 20]
    two_small_categories = len(low_card_dims) >= 2
    hierarchies_found = has_date or two_small_categories
    return {
        "n_rows": int(len(work)),
        "n_cols": int(len(work.columns)),
        "has_date": bool(has_date),
        "measures": int(measures),
        "dimensions": int(dims),
        "two_small_categories": bool(two_small_categories),
        "hierarchies_found": bool(hierarchies_found),
        "columns": cols,
    }
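

# Illustrative sketch (a hypothetical helper, not called by the app): profiling
# a tiny frame. Column names here are invented for the example.
def _demo_profile_dataframe() -> None:
    df = pd.DataFrame({
        "OrderDate": pd.date_range("2024-01-01", periods=6, freq="D"),
        "Region": ["East", "West", "East", "West", "East", "West"],
        "Sales": [100.0, 80.0, 120.0, 90.0, 110.0, 95.0],
    })
    p = profile_dataframe(df)
    # Expect: has_date=True, measures=1 (Sales), dimensions=2 (OrderDate, Region)
    print(json.dumps({k: p[k] for k in ("n_rows", "has_date", "measures", "dimensions")}, indent=2))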
# ---------------------------
# Visual recommendation engine
# ---------------------------
def recommend_visuals(profile: Dict[str, Any]) -> List[Dict[str, Any]]:
    recs = []

    def add(name, score, why, extras=None):
        recs.append({
            "visual": name,
            "score": score,
            "rationale": why,
            "extras": extras or [],
        })

    has_date = profile.get("has_date", False)
    measures = profile.get("measures", 0)
    two_small = profile.get("two_small_categories", False)
    hier = profile.get("hierarchies_found", False)

    # Time series
    if has_date and measures >= 1:
        add("Line chart (trend over time)",
            0.96,
            "Date/time column detected + at least one measure -> trend visualization with drill down (Y/Q/M).",
            ["Enable drill hierarchy on Date table.", "Consider YoY or rolling average in a small multiple."])
    # Ranked bars for categories
    if measures >= 1:
        low_card_dims = [c for c in profile["columns"] if c["role"] == "dimension" and c["distinct_count"] <= 20]
        if low_card_dims:
            add("Clustered bar/column (ranked categories)",
                0.90,
                "Low-cardinality categorical fields with one measure -> ranked comparison.",
                ["Sort by measure desc.", "Consider report-page tooltip for extra detail."])
    # Part-to-whole
    if measures == 1 and two_small:
        add("100% stacked bar/column (part-to-whole)",
            0.78,
            "Two small categorical fields + one measure -> relative composition across segments.",
            ["Prefer 100% stacked for share emphasis.", "Avoid pies/donuts unless very few categories."])
    # Compare measures
    if measures >= 2:
        add("Scatter (compare two measures)",
            0.74,
            "Two or more measures -> scatter for correlation; add category legend if helpful.",
            ["Enable data labels, add trendline where suitable."])
    # Distribution
    numeric_cols = [c for c in profile["columns"] if c["inferred_type"] == "numeric"]
    if numeric_cols:
        add("Histogram/Box plot (distribution & outliers)",
            0.66,
            "Numeric columns -> reveal distribution shape and outliers.",
            ["Use custom bins for stability.", "Use box plot for skew/outlier awareness."])
    # Root-cause analysis
    if hier:
        add("Decomposition tree (root-cause)",
            0.64,
            "Date hierarchy or multiple small categorical fields -> iterative 'why' analysis.",
            ["Seed 'Explain by' with key dimensions.", "Use High/Low splits strategically."])
    # Rank by score
    recs.sort(key=lambda x: x["score"], reverse=True)
    return recs
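

# Illustrative sketch (a hypothetical helper, not called by the app): with a
# date column and a single measure, the trend line chart should rank first
# (0.96), followed by ranked bars (0.90), distribution (0.66), and the
# decomposition tree (0.64).
def _demo_recommend_visuals() -> None:
    profile = {
        "has_date": True,
        "measures": 1,
        "two_small_categories": False,
        "hierarchies_found": True,
        "columns": [
            {"column": "OrderDate", "role": "dimension", "inferred_type": "datetime", "distinct_count": 365},
            {"column": "Region", "role": "dimension", "inferred_type": "text", "distinct_count": 4},
            {"column": "Sales", "role": "measure", "inferred_type": "numeric", "distinct_count": 300},
        ],
    }
    recs = recommend_visuals(profile)
    print([(r["visual"], r["score"]) for r in recs])  # line chart first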
# ---------------------------
# Power BI steps generator
# ---------------------------
def generate_powerbi_steps(profile: Dict[str, Any], context_text: str = "") -> str:
    bullets = []
    # Modeling
    bullets.append("Modeling & relationships:")
    bullets.append("- Use a star schema: fact table(s) with dimension tables related 1:*; default single-direction filters.")
    bullets.append("- Create a dedicated Date table (one row per date) and Mark as date table; use it for all time axes.")
    bullets.append("- Hide surrogate keys/technical columns; keep columns typed & narrow.")
    # Visuals based on recs
    recs = recommend_visuals(profile)
    bullets.append("\nVisuals to build (with interactivity):")
    for r in recs[:4]:
        bullets.append(f"- **{r['visual']}** - {r['rationale']}")
        for ex in r.get("extras", []):
            bullets.append(f"  - {ex}")
    # Interactivity patterns
    bullets.append("\nInteractivity patterns:")
    bullets.append("- Drillthrough: create a detail page, drag the entity field to the Drillthrough well, add a Back button.")
    bullets.append("- Report page tooltips: create a tooltip-sized page, assign as tooltip on dense visuals.")
    bullets.append("- Field parameters: add a parameter to let users swap dimensions/measures at runtime.")
    bullets.append("- Consider a Decomposition Tree for explorations of 'why' a metric changes.")
    # Scale & governance
    bullets.append("\nScale & governance:")
    bullets.append("- For large fact tables, configure Incremental Refresh (RangeStart/RangeEnd).")
    bullets.append("- Consider Aggregations for DirectQuery/composite models.")
    bullets.append("- Define RLS roles on dimensions; test with 'View as'. Keep logic simple.")
    if context_text.strip():
        bullets.append("\nContext captured from the provided .docx:")
        bullets.append(f"> {context_text[:600]}{'...' if len(context_text) > 600 else ''}")
    return "# Power BI Build Steps\n\n" + "\n".join(bullets)
# ---------------------------
# Gradio UI callbacks
# ---------------------------
def cb_profile(file_path: str, docx_path: str):
    df, notes = None, ""
    if file_path:
        try:
            df = read_table_file(file_path)
        except Exception as e:
            return gr.update(value=None), json.dumps({}, indent=2), f"Error reading table: {e}", None, None
    if docx_path and os.path.splitext(docx_path)[1].lower() in DOC_EXTS:
        try:
            notes = read_docx_text(docx_path) if HAS_DOCX else ""
        except Exception as e:
            notes = f"(Could not parse .docx: {e})"
    if df is None:
        return gr.update(value=None), json.dumps({}, indent=2), "Please upload a .csv or .xlsx file.", None, None
    profile = profile_dataframe(df)
    # Build a small dataframe for display
    cols_df = pd.DataFrame(profile["columns"])
    head_df = df.head(10)
    # Save profile CSV for export
    out_csv = os.path.join(tempfile.gettempdir(), f"profile_{uuid.uuid4().hex[:8]}.csv")
    cols_df.to_csv(out_csv, index=False)
    # Save generated build steps
    steps_md = generate_powerbi_steps(profile, context_text=notes)
    out_md = os.path.join(tempfile.gettempdir(), f"pbi_steps_{uuid.uuid4().hex[:8]}.md")
    with open(out_md, "w", encoding="utf-8") as f:
        f.write(steps_md)
    summary = {
        "rows": profile["n_rows"],
        "columns": profile["n_cols"],
        "has_date": profile["has_date"],
        "measures": profile["measures"],
        "dimensions": profile["dimensions"],
        "hierarchies_found": profile["hierarchies_found"],
    }
    return head_df, json.dumps(summary, indent=2), "Profile computed. See 'Column profile' tab for details.", cols_df, [out_md, out_csv]
# RAG global state
_VECTOR_STORE = None


def build_vector_store():
    global _VECTOR_STORE
    chunks = load_corpus_chunks()
    store = SimpleVectorStore()
    store.build(chunks)
    _VECTOR_STORE = store


def cb_qa(question: str, restrict: str):
    global _VECTOR_STORE
    if _VECTOR_STORE is None:
        build_vector_store()
    # In a full implementation, 'restrict' could filter by domain/source.
    hits = _VECTOR_STORE.query(question, top_k=4)
    answer, cites = compose_answer(question, hits, _VECTOR_STORE)
    return answer


def cb_reset():
    # Nothing to deeply reset beyond the UI; transient files live in /tmp
    return None, "{}", "Session cleared.", None, None, "", "Microsoft Learn (default only)"
# ---------------------------
# Build the UI
# ---------------------------
with gr.Blocks(title="AI Agent for Power BI") as demo:
    gr.Markdown(
        "# AI Agent for Power BI\n"
        "Upload data -> get visual/report recommendations, drilldown ideas, and Power BI steps.\n"
        "Ask about patterns from trusted docs (offline bundle)."
    )
    with gr.Tabs():
        with gr.TabItem("Upload & Profile"):
            with gr.Row():
                file_in = gr.File(label="Upload .csv or .xlsx", file_types=[".csv", ".xlsx"], type="filepath")
                docx_in = gr.File(label="Optional: upload a .docx with requirements (context only)", file_types=[".docx"], type="filepath")
            with gr.Row():
                btn_profile = gr.Button("Profile Data & Generate Steps", variant="primary")
                btn_reset = gr.Button("Reset Session")
            with gr.Row():
                df_head = gr.Dataframe(label="Sample data (first 10 rows)", interactive=False)
                profile_summary = gr.Code(label="Profile summary (JSON)")
            status_box = gr.Textbox(label="Status", interactive=False)
            with gr.Row():
                gr.Markdown("### Column profile")
                cols_profile = gr.Dataframe(label="Columns", interactive=False)
            with gr.Row():
                exports = gr.File(label="Download build steps (.md) and column profile (.csv)", file_count="multiple")
        with gr.TabItem("Patterns & Best Practices (Q&A)"):
            q = gr.Textbox(label="Ask a Power BI question (modeling, drillthrough, tooltips, field parameters, RLS, incremental refresh, aggregations...)", lines=3, placeholder="e.g., When should I use a star schema vs. a snowflake?")
            restrict = gr.Dropdown(choices=["Microsoft Learn (default only)", "Microsoft Learn + curated community"], value="Microsoft Learn (default only)", label="Trusted sources")
            btn_ask = gr.Button("Answer from trusted corpus", variant="primary")
            answer_md = gr.Markdown()
        with gr.TabItem("Explainability & Exports"):
            gr.Markdown(
                "This app uses simple heuristics for visual recommendations and an offline RAG for Q&A.\n"
                "- Visuals: date/time -> line; low-cardinality categories -> bars; 2+ measures -> scatter; numeric distribution -> histogram/box; hierarchies -> decomposition tree.\n"
                "- Interactivity: drillthrough pages, report page tooltips, field parameters.\n"
                "- Scale: incremental refresh & aggregations; security via RLS."
            )

    # Bind events
    btn_profile.click(cb_profile, inputs=[file_in, docx_in], outputs=[df_head, profile_summary, status_box, cols_profile, exports])
    btn_reset.click(cb_reset, inputs=None, outputs=[df_head, profile_summary, status_box, cols_profile, exports, answer_md, restrict])
    btn_ask.click(cb_qa, inputs=[q, restrict], outputs=[answer_md])
# Build vector store once on startup for snappy Q&A
try:
    build_vector_store()
except Exception as e:
    print(f"[WARN] Vector store build failed: {e}")

if __name__ == "__main__":
    # Do NOT pass share=True for Hugging Face Spaces
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
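
# Dependency sketch (an assumed requirements.txt, versions unpinned here):
#   gradio
#   pandas
#   numpy
#   scikit-learn
#   openpyxl               # needed by pd.read_excel for .xlsx uploads
#   python-docx            # optional: .docx context parsing
#   sentence-transformers  # optional: dense embeddings
#   faiss-cpu              # optional: fast vector search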