Spaces:

AshenH
/

ALM_LLM

Sleeping

App Files Files Community

AshenH committed on Oct 8

Commit

624e55c

verified ·

1 Parent(s): 6b8bffc

Delete space

Browse files

Files changed (13) hide show

space/Dockerfile +0 -7
space/README_SPACE.md +0 -12
space/app.py +0 -138
space/templates/report_styles.css +0 -6
space/templates/report_template.md +0 -26
space/tools/__init__.py +0 -0
space/tools/explain_tool.py +0 -44
space/tools/predict_tool.py +0 -32
space/tools/report_tool.py +0 -25
space/tools/sql_tool.py +0 -49
space/utils/config.py +0 -21
space/utils/hf_io.py +0 -0
space/utils/tracing.py +0 -30

space/Dockerfile DELETED Viewed

@@ -1,7 +0,0 @@
# Container image that runs the Gradio app (app.py) on Python 3.11.
FROM python:3.11-slim
WORKDIR /app
# COPY sources must live inside the build context, so "../requirements.txt"
# cannot be used; requirements.txt has to sit next to this Dockerfile.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
# Keep the Hugging Face download cache inside the application directory.
ENV HF_HOME=/app/.cache/hf_cache
# Gradio binds to 127.0.0.1 by default, which is unreachable from outside the
# container; listen on all interfaces on Gradio's default port instead.
ENV GRADIO_SERVER_NAME=0.0.0.0
EXPOSE 7860
CMD ["python", "app.py"]

space/README_SPACE.md DELETED Viewed

@@ -1,12 +0,0 @@
-# Deploying as a Hugging Face Space
-1. Create a new **Gradio** Space.
-2. Upload the **contents of `space/`** to the Space root.
-3. Add Space Secrets:
-   - `HF_TOKEN`
-   - For BigQuery: `GCP_SERVICE_ACCOUNT_JSON`, `GCP_PROJECT`
-   - For MotherDuck: `MOTHERDUCK_TOKEN`, `MOTHERDUCK_DB`
-   - Optional tracing: `LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`, `LANGFUSE_HOST`
-4. Set `SQL_BACKEND` to `bigquery` or `motherduck`.
-5. Set `HF_MODEL_REPO` to your private model repo id.
-6. (Optional) Set `ORCHESTRATOR_MODEL` for the tiny CPU LLM.

space/app.py DELETED Viewed

@@ -1,138 +0,0 @@
-import os
-import json
-import gradio as gr
-import pandas as pd
-from typing import Dict, Any
-from tools.sql_tool import SQLTool
-from tools.predict_tool import PredictTool
-from tools.explain_tool import ExplainTool
-from tools.report_tool import ReportTool
-from utils.tracing import Tracer
-from utils.config import AppConfig
# Optional orchestration LLM. It is deliberately tiny so it can run on CPU;
# the model id can be overridden with the ORCHESTRATOR_MODEL env variable.
try:
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

    LLM_ID = os.getenv("ORCHESTRATOR_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
    _tok = AutoTokenizer.from_pretrained(LLM_ID)
    _mdl = AutoModelForCausalLM.from_pretrained(LLM_ID)
    llm = pipeline(
        "text-generation",
        model=_mdl,
        tokenizer=_tok,
        max_new_tokens=512,
    )
except Exception:
    # Any import or model-loading failure disables the LLM; planning then
    # falls back to deterministic, rule-based tool routing.
    llm = None

# Shared configuration and tracer, both read from environment variables.
cfg = AppConfig.from_env()
tracer = Tracer.from_env()

# One long-lived instance of each tool, all sharing the same config and tracer.
sql_tool = SQLTool(cfg, tracer)
predict_tool = PredictTool(cfg, tracer)
explain_tool = ExplainTool(cfg, tracer)
report_tool = ReportTool(cfg, tracer)

# Instruction text prepended to the user's message when the LLM plans a run.
SYSTEM_PROMPT = (
    "You are an analytical assistant for tabular data. "
    "When the user asks a question, decide which tools to call in order: "
    "1) SQL (if data retrieval is needed) 2) Predict (if scoring is requested) "
    "3) Explain (if attributions or why-questions) 4) Report (if a document is requested). "
    "Always disclose the steps taken and include links to traces if available."
)
def plan_actions(message: str) -> Dict[str, Any]:
    """Decide which tools should run for a user message.

    When the module-level ``llm`` pipeline is available, it is asked for a JSON
    plan. The reply is accepted only if it is a JSON object whose ``steps`` is a
    list containing at least one known step name; unknown entries are dropped
    and a missing rationale is replaced by a placeholder. In every other case
    (no LLM, unparsable reply, unusable plan) keyword heuristics are used.

    Args:
        message: The raw user question.

    Returns:
        A dict with ``steps`` (list drawn from 'sql', 'predict', 'explain',
        'report', never empty) and ``rationale`` (a string).
    """
    allowed_steps = ("sql", "predict", "explain", "report")

    if llm is not None:
        prompt = (
            f"{SYSTEM_PROMPT}\nUser: {message}\n"
            "Return JSON with fields: steps (array, subset of ['sql','predict','explain','report']), "
            "and rationale (one sentence)."
        )
        # Only the last line of the generated text is treated as the JSON answer.
        out = llm(prompt)[0]["generated_text"].split("\n")[-1]
        try:
            plan = json.loads(out)
        except (ValueError, RecursionError):
            plan = None
        # Callers index plan["steps"] / plan["rationale"], so never hand back
        # free-form model output without checking its shape first.
        if isinstance(plan, dict) and isinstance(plan.get("steps"), list):
            valid = [s for s in plan["steps"] if isinstance(s, str) and s in allowed_steps]
            if valid:
                rationale = plan.get("rationale")
                return {
                    **plan,
                    "steps": valid,
                    "rationale": rationale if isinstance(rationale, str) and rationale else "LLM-generated plan.",
                }

    # Heuristic fallback: a step is selected when any of its keywords occurs
    # as a substring of the lower-cased message.
    keyword_table = (
        ("sql", ["show", "average", "count", "trend", "top", "sql", "query", "kpi"]),
        ("predict", ["predict", "score", "risk", "propensity", "probability"]),
        ("explain", ["why", "explain", "shap", "feature", "attribution"]),
        ("report", ["report", "download", "pdf", "summary"]),
    )
    m = message.lower()
    steps = [step for step, keywords in keyword_table if any(k in m for k in keywords)]
    if not steps:
        steps = ["sql"]
    return {"steps": steps, "rationale": "Rule-based plan."}
def run_agent(message: str, hitl_decision: str = "Approve", reviewer_note: str = ""):
    """Plan and execute the tool pipeline for one user message.

    Args:
        message: The user's question; also passed to the SQL tool.
        hitl_decision: Human-in-the-loop verdict recorded in the trace.
        reviewer_note: Optional free-text note recorded with the verdict.

    Returns:
        A ``(markdown_response, preview_df)`` tuple. ``preview_df`` is the
        prediction frame when one was produced, otherwise the SQL result,
        otherwise ``None``.
    """
    tracer.trace_event("user_message", {"message": message})
    plan = plan_actions(message)
    tracer.trace_event("plan", plan)

    # Tolerate a malformed plan instead of failing on a missing key.
    if not isinstance(plan, dict):
        plan = {"steps": [], "rationale": ""}
    steps = plan.get("steps") or []
    rationale = plan.get("rationale", "")

    sql_df = None
    predict_df = None
    explain_plots = {}
    artifacts = {}
    skipped = []  # steps that were planned but had no input data to work on

    if "sql" in steps:
        sql_df = sql_tool.run(message)
        artifacts["sql_rows"] = len(sql_df) if isinstance(sql_df, pd.DataFrame) else 0

    if "predict" in steps:
        if isinstance(sql_df, pd.DataFrame):
            predict_df = predict_tool.run(sql_df)
        else:
            skipped.append("predict")

    if "explain" in steps:
        # `a or b` cannot be used here: the truth value of a DataFrame is
        # ambiguous and pandas raises ValueError, so select explicitly.
        explain_input = predict_df if isinstance(predict_df, pd.DataFrame) else sql_df
        if isinstance(explain_input, pd.DataFrame):
            explain_plots = explain_tool.run(explain_input)
        else:
            skipped.append("explain")

    if skipped:
        artifacts["skipped_steps"] = skipped

    report_link = None
    if "report" in steps:
        report_link = report_tool.render_and_save(
            user_query=message,
            sql_preview=sql_df.head(50) if isinstance(sql_df, pd.DataFrame) else None,
            predict_preview=predict_df.head(50) if isinstance(predict_df, pd.DataFrame) else None,
            explain_images=explain_plots,
            plan=plan,
        )

    # HITL log (append-only). In production, push to a private HF dataset via API.
    hitl_record = {
        "message": message,
        "decision": hitl_decision,
        "reviewer_note": reviewer_note,
        # Timezone-aware UTC timestamp (replaces the deprecated utcnow()).
        "timestamp": pd.Timestamp.now(tz="UTC").isoformat(),
        "artifacts": artifacts,
        "plan": plan,
    }
    tracer.trace_event("hitl", hitl_record)

    response = f"**Plan:** {steps}\n**Rationale:** {rationale}\n"
    if isinstance(sql_df, pd.DataFrame):
        response += f"\n**SQL rows:** {len(sql_df)}"
    if isinstance(predict_df, pd.DataFrame):
        response += f"\n**Predictions rows:** {len(predict_df)}"
    if skipped:
        response += f"\n**Skipped (no input data):** {skipped}"
    if report_link:
        response += f"\n**Report:** {report_link}"
    if tracer.trace_url:
        response += f"\n**Trace:** {tracer.trace_url}"

    preview_df = predict_df if predict_df is not None else sql_df
    return response, preview_df
# Gradio UI: a question box, a human-review row, and the two result widgets.
with gr.Blocks() as demo:
    gr.Markdown("# Tabular Agentic XAI (Free‑Tier)")

    # User question.
    with gr.Row():
        msg = gr.Textbox(label="Ask your question")

    # Human-in-the-loop verdict plus an optional reviewer note.
    with gr.Row():
        hitl = gr.Radio(
            ["Approve", "Needs Changes"],
            value="Approve",
            label="Human Review",
        )
        note = gr.Textbox(label="Reviewer note (optional)")

    # Outputs: markdown summary and a read-only data preview.
    out_md = gr.Markdown()
    out_df = gr.Dataframe(interactive=False)

    # The button passes (question, verdict, note) to run_agent and shows its
    # two return values in the widgets above.
    ask = gr.Button("Run")
    ask.click(
        run_agent,
        inputs=[msg, hitl, note],
        outputs=[out_md, out_df],
    )

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()

space/templates/report_styles.css DELETED Viewed

@@ -1,6 +0,0 @@
-body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; padding: 24px; line-height: 1.5; }
-h1,h2,h3 { margin-top: 1.2em; }
-code, pre { background: #f6f8fa; padding: 2px 4px; border-radius: 4px; }
-table { border-collapse: collapse; width: 100%; }
-th, td { border: 1px solid #ddd; padding: 8px; }
-th { background: #fafafa; }

space/templates/report_template.md DELETED Viewed

@@ -1,26 +0,0 @@
-# Insight Report
-**User Query**: {{ user_query }}
-**Plan**: {{ plan.steps }}
-**Rationale**: {{ plan.rationale }}
-{% if sql_preview %}
-## SQL Preview
-{{ sql_preview }}
-{% endif %}
-{% if predict_preview %}
-## Predictions Preview
-{{ predict_preview }}
-{% endif %}
-{% if explain_images.global_bar %}
-## Global Feature Importance (SHAP)
-<img src="{{ explain_images.global_bar }}" style="max-width: 100%;" />
-{% endif %}
-{% if explain_images.beeswarm %}
-## SHAP Beeswarm
-<img src="{{ explain_images.beeswarm }}" style="max-width: 100%;" />
-{% endif %}

space/tools/__init__.py DELETED Viewed

File without changes

space/tools/explain_tool.py DELETED Viewed

@@ -1,44 +0,0 @@
-import os
-import io
-import shap
-import base64
-import pandas as pd
-from huggingface_hub import hf_hub_download
-from ..utils.config import AppConfig
-from ..utils.tracing import Tracer
class ExplainTool:
    """Build SHAP explanation plots for the model in the configured HF repo.

    The model is downloaded lazily on the first non-empty ``run`` call and the
    plots are returned as PNG data URIs.
    """

    def __init__(self, cfg: AppConfig, tracer: Tracer):
        self.cfg = cfg
        self.tracer = tracer
        self._model = None  # populated lazily by _ensure_model()

    def _ensure_model(self):
        """Download and load ``model.pkl`` once, caching it on the instance."""
        if self._model is None:
            import joblib

            path = hf_hub_download(
                repo_id=self.cfg.hf_model_repo,
                filename="model.pkl",
                token=os.getenv("HF_TOKEN"),
            )
            # NOTE(security): joblib.load unpickles the file, which can run
            # arbitrary code; hf_model_repo must point at a trusted repository.
            self._model = joblib.load(path)

    @staticmethod
    def _resolve_figure(plot):
        """Return the object that owns ``savefig`` for a plot result, or None.

        Accepts either a figure-like object (has ``savefig``) or an axes-like
        object exposing its parent through ``.figure``.
        """
        if hasattr(plot, "savefig"):
            return plot
        return getattr(plot, "figure", None)

    def _to_data_uri(self, fig) -> str:
        """Render a plot as a base64 PNG ``data:`` URI.

        Args:
            fig: A figure, or an axes whose ``.figure`` is the figure to save.

        Raises:
            TypeError: If no saveable figure can be derived from ``fig``.
        """
        figure = self._resolve_figure(fig)
        if figure is None:
            raise TypeError(
                f"Cannot render {type(fig).__name__!r}: expected a figure or an axes with a .figure"
            )
        buf = io.BytesIO()
        figure.savefig(buf, format="png", bbox_inches="tight")
        return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()

    def _clear_plot(self, plot) -> None:
        """Clear the figure behind ``plot`` so the next plot starts blank."""
        figure = self._resolve_figure(plot)
        if figure is not None and hasattr(figure, "clf"):
            figure.clf()

    def run(self, df: pd.DataFrame):
        """Compute SHAP values on a sample of ``df`` and return two plots.

        Args:
            df: Feature frame to explain.

        Returns:
            ``{"global_bar": <data URI>, "beeswarm": <data URI>}``, or an empty
            dict when ``df`` is ``None`` or has no rows.
        """
        if df is None or df.empty:
            # Nothing to sample from; report zero rows rather than failing.
            self.tracer.trace_event("explain", {"rows": 0})
            return {}

        self._ensure_model()
        # Use a small sample for speed on CPU Spaces
        sample = df.sample(min(len(df), 500), random_state=42)
        explainer = shap.Explainer(self._model, sample, feature_names=list(sample.columns))
        shap_values = explainer(sample)

        # Global summary plot. With show=False recent SHAP releases return the
        # Axes, not a Figure, which is why _to_data_uri resolves the figure.
        bar_plot = shap.plots.bar(shap_values, show=False)
        img1 = self._to_data_uri(bar_plot)
        # NOTE(review): SHAP appears to draw on the current pyplot figure, so
        # clear it to keep the beeswarm from being drawn over the bar chart.
        self._clear_plot(bar_plot)

        # Beeswarm (optional)
        swarm_plot = shap.plots.beeswarm(shap_values, show=False)
        img2 = self._to_data_uri(swarm_plot)
        self._clear_plot(swarm_plot)

        self.tracer.trace_event("explain", {"rows": len(sample)})
        return {"global_bar": img1, "beeswarm": img2}

space/tools/predict_tool.py DELETED Viewed

@@ -1,32 +0,0 @@
-import os
-import pandas as pd
-import joblib
-from huggingface_hub import hf_hub_download
-from ..utils.config import AppConfig
-from ..utils.tracing import Tracer
class PredictTool:
    """Score a DataFrame with the model stored in the configured HF repo.

    The model (``model.pkl``) and its ``feature_metadata.json`` are downloaded
    lazily on the first ``run`` call and cached on the instance.
    """

    def __init__(self, cfg: AppConfig, tracer: Tracer):
        self.cfg = cfg
        self.tracer = tracer
        self._model = None  # populated lazily by _ensure_loaded()
        self._feature_meta = None  # dict parsed from feature_metadata.json

    def _ensure_loaded(self):
        """Download the model and feature metadata once."""
        if self._model is None:
            path = hf_hub_download(repo_id=self.cfg.hf_model_repo, filename="model.pkl", token=os.getenv("HF_TOKEN"))
            # NOTE(security): joblib.load unpickles the file, which can run
            # arbitrary code; hf_model_repo must point at a trusted repository.
            self._model = joblib.load(path)
            meta = hf_hub_download(repo_id=self.cfg.hf_model_repo, filename="feature_metadata.json", token=os.getenv("HF_TOKEN"))
            import json

            # JSON is UTF-8 by specification; do not rely on the platform's
            # default text encoding.
            with open(meta, "r", encoding="utf-8") as f:
                self._feature_meta = json.load(f)

    def run(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with a prediction column appended.

        Feature columns come from the metadata's ``feature_order`` (default:
        all columns of ``df``). The output column is named by the metadata's
        ``prediction_column`` (default ``"prediction"``). When the model has
        ``predict_proba`` its second column is used, otherwise ``predict``.

        Raises:
            KeyError: If ``df`` lacks any column listed in ``feature_order``.
        """
        self._ensure_loaded()
        use_cols = self._feature_meta.get("feature_order", list(df.columns))

        # Fail with an actionable message instead of pandas' generic KeyError.
        missing = [c for c in use_cols if c not in df.columns]
        if missing:
            raise KeyError(f"Input data is missing feature columns required by the model: {missing}")

        X = df[use_cols].copy()
        if hasattr(self._model, "predict_proba"):
            # NOTE(review): column 1 is presumably the positive class of a
            # binary classifier — confirm for multi-class models.
            preds = self._model.predict_proba(X)[:, 1]
        else:
            preds = self._model.predict(X)

        out = df.copy()
        out[self._feature_meta.get("prediction_column", "prediction")] = preds
        self.tracer.trace_event("predict", {"rows": len(out)})
        return out

space/tools/report_tool.py DELETED Viewed

@@ -1,25 +0,0 @@
-import os
-from jinja2 import Environment, FileSystemLoader
-import pandas as pd
-from ..utils.tracing import Tracer
class ReportTool:
    """Render the report template and write it to a timestamped .html file."""

    def __init__(self, cfg, tracer: Tracer):
        self.cfg = cfg
        self.tracer = tracer
        # The rendered text is written to an .html file and includes the raw
        # user query, so escape interpolated values to prevent markup injection.
        # Templates are looked up in "templates" relative to the working directory.
        self.env = Environment(loader=FileSystemLoader("templates"), autoescape=True)

    def render_and_save(self, user_query: str, sql_preview: pd.DataFrame | None, predict_preview: pd.DataFrame | None, explain_images: dict, plan: dict):
        """Render ``report_template.md`` and save it in the working directory.

        Args:
            user_query: The user's original question.
            sql_preview: Rows to show as a Markdown table, or ``None``.
            predict_preview: Scored rows to show as a Markdown table, or ``None``.
            explain_images: Mapping of plot name to image source for the template.
            plan: The execution plan passed to the template.

        Returns:
            The path of the written file, ``report_<UTC timestamp>.html``.
        """
        tmpl = self.env.get_template("report_template.md")
        # Missing previews become empty strings so the template's `if` blocks
        # skip those sections.
        sql_md = sql_preview.to_markdown(index=False) if sql_preview is not None else ""
        predict_md = predict_preview.to_markdown(index=False) if predict_preview is not None else ""
        html = tmpl.render(
            user_query=user_query,
            plan=plan,
            sql_preview=sql_md,
            predict_preview=predict_md,
            explain_images=explain_images,
        )
        # Timezone-aware UTC timestamp (replaces the deprecated utcnow()).
        stamp = pd.Timestamp.now(tz="UTC").strftime("%Y%m%d_%H%M%S")
        out_path = f"report_{stamp}.html"
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("<link rel=\"stylesheet\" href=\"templates/report_styles.css\">\n" + html)
        self.tracer.trace_event("report", {"path": out_path})
        return out_path

space/tools/sql_tool.py DELETED Viewed

@@ -1,49 +0,0 @@
-import os
-import re
-import pandas as pd
-from typing import Optional
-from ..utils.config import AppConfig
-from ..utils.tracing import Tracer
class SQLTool:
    """Translate a user message into SQL and run it on the configured backend.

    Supported backends are ``"bigquery"`` and ``"motherduck"``; the client is
    created eagerly in ``__init__``.
    """

    def __init__(self, cfg: AppConfig, tracer: Tracer):
        """Connect to the backend named by ``cfg.sql_backend``.

        Raises:
            RuntimeError: If a required secret is missing or malformed, or the
                backend name is not recognised.
        """
        self.cfg = cfg
        self.tracer = tracer
        self.backend = cfg.sql_backend  # "bigquery" or "motherduck"
        if self.backend == "bigquery":
            from google.cloud import bigquery
            from google.oauth2 import service_account

            key_json = os.getenv("GCP_SERVICE_ACCOUNT_JSON")
            if not key_json:
                raise RuntimeError("Missing GCP_SERVICE_ACCOUNT_JSON secret")
            creds = service_account.Credentials.from_service_account_info(
                self._parse_service_account(key_json)
            )
            self.client = bigquery.Client(credentials=creds, project=cfg.gcp_project)
        elif self.backend == "motherduck":
            import duckdb

            token = self.cfg.motherduck_token or os.getenv("MOTHERDUCK_TOKEN")
            if not token:
                # Without this check the literal string "None" would be sent
                # as the token.
                raise RuntimeError("Missing MOTHERDUCK_TOKEN secret")
            db_name = self.cfg.motherduck_db or "default"
            self.client = duckdb.connect(f"md:/{db_name}?motherduck_token={token}")
        else:
            raise RuntimeError("Unknown SQL backend")

    @staticmethod
    def _parse_service_account(key_json: str) -> dict:
        """Parse the service-account secret as JSON.

        ``json.loads`` is used rather than ``eval`` so the secret can never
        execute code and JSON literals (``true``, ``false``, ``null``) parse.

        Raises:
            RuntimeError: If the value is not a JSON object.
        """
        import json

        try:
            info = json.loads(key_json)
        except ValueError as err:
            raise RuntimeError("GCP_SERVICE_ACCOUNT_JSON is not valid JSON") from err
        if not isinstance(info, dict):
            raise RuntimeError("GCP_SERVICE_ACCOUNT_JSON must be a JSON object")
        return info

    def _nl_to_sql(self, message: str) -> str:
        """Map a natural-language message to a SQL string (placeholder logic)."""
        # Minimal NL2SQL heuristic; replace with your own mapping or LLM prompt.
        # Expect users to include table names. Example: "avg revenue by month from dataset.sales"
        m = message.lower()
        if "avg" in m and " by " in m:
            return "-- Example template; edit me\nSELECT DATE_TRUNC(month, date_col) AS month, AVG(metric) AS avg_metric FROM dataset.table GROUP BY 1 ORDER BY 1;"
        # fallback: pass-through if user typed SQL explicitly
        # NOTE(security): the user's statement is executed verbatim; restrict
        # the backend credentials to read-only access.
        if re.match(r"^\s*select ", m):
            return message
        return "SELECT * FROM dataset.table LIMIT 100;"

    def run(self, message: str) -> pd.DataFrame:
        """Generate SQL for ``message``, trace it, execute it, return a DataFrame."""
        sql = self._nl_to_sql(message)
        self.tracer.trace_event("sql_query", {"sql": sql, "backend": self.backend})
        if self.backend == "bigquery":
            df = self.client.query(sql).to_dataframe()
        else:
            df = self.client.execute(sql).fetch_df()
        return df

space/utils/config.py DELETED Viewed

@@ -1,21 +0,0 @@
-import os
-from dataclasses import dataclass
@dataclass
class AppConfig:
    """Runtime settings for the app, normally populated from the environment."""

    hf_model_repo: str  # Hugging Face repo id that holds the model files
    sql_backend: str  # "bigquery" or "motherduck"
    gcp_project: str | None = None
    motherduck_db: str | None = None
    motherduck_token: str | None = None

    # This must be defined inside the class: at module level the decorator
    # yields a bare classmethod object and AppConfig.from_env() does not exist.
    @classmethod
    def from_env(cls):
        """Build a config from environment variables.

        Reads HF_MODEL_REPO, SQL_BACKEND (default ``"motherduck"``),
        GCP_PROJECT, MOTHERDUCK_DB (default ``"default"``) and MOTHERDUCK_TOKEN.
        """
        return cls(
            hf_model_repo=os.getenv("HF_MODEL_REPO", "your-username/your-private-tabular-model"),
            sql_backend=os.getenv("SQL_BACKEND", "motherduck"),
            gcp_project=os.getenv("GCP_PROJECT"),
            motherduck_db=os.getenv("MOTHERDUCK_DB", "default"),
            motherduck_token=os.getenv("MOTHERDUCK_TOKEN"),
        )

space/utils/hf_io.py DELETED Viewed

File without changes

space/utils/tracing.py DELETED Viewed

@@ -1,30 +0,0 @@
-import os
-import json
-from typing import Optional
class Tracer:
    """Best-effort event tracer; a no-op when no client is configured.

    Tracing never raises into the caller: failures are logged and ignored.
    """

    def __init__(self, client=None, trace_url: Optional[str] = None):
        self.client = client  # object exposing event(name=..., input=...), or None
        self.trace_url = trace_url  # link to the trace, when the backend provides one

    @classmethod
    def from_env(cls):
        """Create a Langfuse-backed tracer when credentials are present.

        Reads LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY and LANGFUSE_HOST.
        Returns a no-op tracer when the keys are missing, ``langfuse`` is not
        installed, or the client cannot be set up.
        """
        import logging

        try:
            from langfuse import Langfuse

            pk = os.getenv("LANGFUSE_PUBLIC_KEY")
            sk = os.getenv("LANGFUSE_SECRET_KEY")
            host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
            if pk and sk:
                client = Langfuse(public_key=pk, secret_key=sk, host=host)
                session = client.trace("tabular-agentic-xai")
                return cls(client=session, trace_url=session.get_url() if hasattr(session, "get_url") else None)
        except Exception:
            # Tracing is optional, so degrade to a no-op tracer but leave a
            # breadcrumb for anyone debugging missing traces.
            logging.getLogger(__name__).debug("Tracing disabled: Langfuse setup failed", exc_info=True)
        return cls()

    def trace_event(self, name: str, payload: dict):
        """Send one event to the tracing client, if any.

        Args:
            name: Event name.
            payload: Event data, serialized to JSON. Values JSON cannot encode
                are stringified so the event is still delivered.
        """
        if self.client:
            import logging

            try:
                # default=str is deliberate: a lossy trace beats a dropped one.
                self.client.event(name=name, input=json.dumps(payload, default=str))
            except Exception:
                # Never let tracing break the request; record the failure.
                logging.getLogger(__name__).warning("Failed to send trace event %r", name, exc_info=True)