subhamb04 commited on
Commit
9cccf74
·
verified ·
1 Parent(s): 41e5f2f

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------
2
+ # Python
3
+ # ------------------------------
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+ *.pyd
11
+ *.dll
12
+
13
+ # ------------------------------
14
+ # Environments
15
+ # ------------------------------
16
+ .venv/
17
+ venv/
18
+ env/
19
+ ENV/
20
+ .venv*/
21
+ venv*/
22
+ env*/
23
+ ENV*/
24
+ .python-version
25
+
26
+ # ------------------------------
27
+ # Distribution / packaging
28
+ # ------------------------------
29
+ .Python
30
+ build/
31
+ dist/
32
+ downloads/
33
+ eggs/
34
+ .eggs/
35
+ sdist/
36
+ wheels/
37
+ share/python-wheels/
38
+ *.egg-info/
39
+ .installed.cfg
40
+ *.egg
41
+ MANIFEST
42
+ pip-wheel-metadata/
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # ------------------------------
47
+ # Unit test / coverage reports
48
+ # ------------------------------
49
+ htmlcov/
50
+ .tox/
51
+ .nox/
52
+ .coverage
53
+ .coverage.*
54
+ .cache
55
+ nosetests.xml
56
+ coverage.xml
57
+ *.cover
58
+ *.py,cover
59
+ .pytest_cache/
60
+ junit*.xml
61
+
62
+ # ------------------------------
63
+ # Type checkers / linters
64
+ # ------------------------------
65
+ .mypy_cache/
66
+ .dmypy.json
67
+ dmypy.json
68
+ .pyre/
69
+ .pytype/
70
+ .ruff_cache/
71
+
72
+ # ------------------------------
73
+ # PyInstaller
74
+ # ------------------------------
75
+ *.manifest
76
+ *.spec
77
+
78
+ # ------------------------------
79
+ # Jupyter
80
+ # ------------------------------
81
+ .ipynb_checkpoints/
82
+
83
+ # ------------------------------
84
+ # Logs and runtime files
85
+ # ------------------------------
86
+ logs/
87
+ *.log
88
+ *.pid
89
+ *.pid.lock
90
+
91
+ # ------------------------------
92
+ # Local environment variables & secrets
93
+ # ------------------------------
94
+ .env
95
+ .env.*
96
+ !.env.example
97
+
98
+ # ------------------------------
99
+ # Editors / IDEs / Tooling
100
+ # ------------------------------
101
+ .idea/
102
+ *.iml
103
+ .vscode/
104
+ .history/
105
+ .cursor/
106
+ *.code-workspace
107
+
108
+ # ------------------------------
109
+ # OS-specific
110
+ # ------------------------------
111
+ .DS_Store
112
+ Thumbs.db
113
+ ehthumbs.db
114
+ Desktop.ini
115
+
116
+ # ------------------------------
117
+ # Optional local data & temp
118
+ # ------------------------------
119
+ tmp/
120
+ temp/
121
+ data/
122
+
README.md CHANGED
@@ -1,12 +1,152 @@
1
- ---
2
- title: Datum
3
- emoji: 🦀
4
- colorFrom: green
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.44.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: datum
3
+ app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.44.1
6
+ ---
7
+ # Datum - AI-Powered Data Analysis Agent
8
+
9
+ A simple yet powerful data analysis agent that uses AI to generate SQL queries, execute them against your data, and provide visualizations and insights through a web interface.
10
+
11
+ ## Features
12
+
13
+ - **Natural Language Queries**: Ask questions about your data in plain English
14
+ - **Auto Routing (Chat vs SQL)**: Agent decides between a quick chat reply or full SQL/database analysis
15
+ - **AI-Generated SQL**: Automatically converts questions into SQL queries
16
+ - **Data Visualization**: Generates charts and graphs from query results
17
+ - **Intelligent Insights**: Provides narrative analysis and recommendations
18
+ - **Web Interface**: Clean, user-friendly Gradio interface
19
+ - **DuckDB Integration**: Fast, in-memory SQL database for data analysis
20
+ - **LangSmith Tracing**: Built-in observability and debugging with LangSmith integration
21
+
22
+ ## Project Structure
23
+
24
+ ```
25
+ datum/
26
+ ├── app.py # Main application with LangGraph workflow
27
+ ├── builder/
28
+ │ ├── graph_builder.py # Graph with router + conditional edges
29
+ │ ├── nodes.py # Agent nodes (decider, chat, SQL, charting, narration)
30
+ │ ├── state.py # Typed state definition for the agent
31
+ │ └── ui.py # Gradio UI wiring
32
+ ├── clients/
33
+ │ └── llm.py # LLM configuration (Google Gemini)
34
+ ├── datastore/
35
+ │ └── db.py # DuckDB setup and data loading
36
+ ├── utils/
37
+ │ ├── charts.py # Chart generation utilities
38
+ │ ├── insight_utils.py # Insight helpers
39
+ │ └── tracer_utils.py # LangSmith tracing helpers
40
+ ├── data/ # Sample datasets
41
+ │ ├── sales.csv
42
+ │ ├── marketing_spend.csv
43
+ │ └── customers.csv
44
+ └── requirements.txt # Python dependencies
45
+ ```
46
+
47
+ ## Setup Instructions
48
+
49
+ ### Prerequisites
50
+
51
+ - Python 3.9 or higher (the code uses built-in generic type hints such as `list[tuple[str, str]]`)
52
+ - Google API key for Gemini AI
53
+
54
+ ### Installation
55
+
56
+ 1. **Clone the repository**
57
+ ```bash
58
+ git clone <repository-url>
59
+ cd datum
60
+ ```
61
+
62
+ 2. **Create a virtual environment**
63
+ ```bash
64
+ python -m venv venv
65
+ source venv/bin/activate # On Windows: venv\Scripts\activate
66
+ ```
67
+
68
+ 3. **Install dependencies**
69
+ ```bash
70
+ pip install -r requirements.txt
71
+ ```
72
+
73
+ 4. **Set up environment variables**
74
+ Create a `.env` file in the project root:
75
+ ```bash
76
+ GOOGLE_API_KEY=your_google_api_key_here
77
+ LANGCHAIN_PROJECT=datum-analysis # Optional: for LangSmith tracing
78
+ LANGCHAIN_API_KEY=your_langsmith_api_key # Optional: for LangSmith tracing
79
+ LANGCHAIN_TRACING_V2=true # Optional: enable LangSmith tracing
80
+ ```
81
+
82
+ 5. **Run the application**
83
+ ```bash
84
+ python app.py
85
+ ```
86
+
87
+ 6. **Access the web interface**
88
+ Open your browser and navigate to the URL shown in the terminal (typically `http://127.0.0.1:7860`)
89
+
90
+ ## Usage
91
+
92
+ 1. **Ask a question**: Type your data analysis question in natural language
93
+ - Example: "What are the top 3 regions by revenue?"
94
+ - Example: "Show me marketing spend by channel"
95
+ - Example: "Which products have the highest unit sales?"
96
+
97
+ 2. **Agent chooses the path automatically**
98
+ - **Chat route**: Direct conversational answer when no database analysis is needed
99
+ - **SQL route**: The agent generates SQL and provides:
100
+ - **Query Result** (table)
101
+ - **Chart** (visualization)
102
+ - **Insights** (narrative + recommendation)
103
+ - **SQL** (for transparency)
104
+
105
+ ### Routing at a Glance
106
+ The `decider` node analyzes your question and sets a `route` of `chat` or `sql`. The graph then either calls `general_chat` or runs the SQL flow (`sql_generator` → `sql_executor` → `chart_generator` + `narrator`).
107
+
108
+ ## Sample Data
109
+
110
+ The project includes sample datasets:
111
+ - **Sales**: Date, region, product, revenue, units sold
112
+ - **Marketing Spend**: Date, region, channel, spend amount
113
+ - **Customers**: Customer ID, region, age, income
114
+
115
+ ## Technology Stack
116
+
117
+ - **LangGraph**: Workflow orchestration
118
+ - **Google Gemini**: AI language model
119
+ - **DuckDB**: In-memory SQL database
120
+ - **Gradio**: Web interface
121
+ - **Matplotlib**: Chart generation
122
+ - **Pandas**: Data manipulation
123
+ - **LangSmith**: Observability and tracing platform
124
+
125
+ ## Customization
126
+
127
+ - **Add your own data**: Replace CSV files in the `data/` directory and update the schema in `nodes.py`
128
+ - **Modify the LLM**: Change the model or provider in `llm.py`
129
+ - **Customize charts**: Modify chart generation logic in `charts.py`
130
+ - **Extend the workflow**: Add new nodes in `builder/nodes.py` and wire them into the LangGraph workflow in `builder/graph_builder.py`
131
+
132
+ ## Observability & Debugging
133
+
134
+ The application includes built-in LangSmith tracing for monitoring and debugging:
135
+
136
+ - **Trace Execution**: All agent steps are automatically traced and logged
137
+ - **Performance Monitoring**: Track execution times and token usage
138
+ - **Debug Information**: View detailed logs of SQL generation, execution, and LLM calls
139
+ - **Project Organization**: Traces are organized by project name for easy filtering
140
+
141
+ To enable tracing, set the LangSmith environment variables in your `.env` file. Without these variables, the application will run normally but without tracing capabilities.
142
+
143
+ ## Troubleshooting
144
+
145
+ - **API Key Error**: Ensure your `GOOGLE_API_KEY` is set correctly in the `.env` file
146
+ - **Import Errors**: Make sure all dependencies are installed with `pip install -r requirements.txt`
147
+ - **Data Issues**: Verify your CSV files are in the correct format and location
148
+ - **Tracing Issues**: Check LangSmith credentials if you want to use the observability features
149
+
150
+ ## License
151
+
152
+ This project is open source and available under the MIT License.
app.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from builder.graph_builder import build_graph
from builder.ui import build_ui
from utils.tracer_utils import get_tracer, wait_for_tracers


def main() -> None:
    """Build the agent graph, wire it into the Gradio UI, and launch the app."""
    langsmith_tracer = get_tracer()
    agent_app = build_graph()
    interface = build_ui(agent_app, langsmith_tracer)

    try:
        # Open in the default browser; blocks until the server shuts down.
        interface.launch(inbrowser=True)
    finally:
        # Flush any pending LangSmith trace uploads before the process exits.
        wait_for_tracers()


if __name__ == "__main__":
    main()
builder/agent_runner.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from utils.insight_utils import df_to_html, pil_to_base64
3
+ from builder.state import AgentState
4
+
5
def run_agent(app, tracer, message, history):
    """Run one chat turn through the agent graph.

    Invokes the compiled LangGraph `app` with the user's `message` and prior
    `history`, formats the resulting state (narrative, SQL, chart, table) as a
    single Markdown/HTML message, and returns the values Gradio expects:
    (chatbot messages, new state, cleared textbox).
    """
    prior = history or []

    result: AgentState = app.invoke(
        {"question": message, "history": prior},
        config={"callbacks": [tracer]}
    )

    # Assemble the assistant reply from whichever pieces the graph produced.
    parts = []

    narrative = result.get("narrative")
    if narrative:
        parts.append(f"**Datum:**\n{narrative}\n\n")

    sql = result.get("sql")
    if sql:
        parts.append(f"**SQL:**\n```sql\n{sql}\n```\n")

    chart_html = pil_to_base64(result.get("chart_pil"))
    if chart_html:
        parts.append(chart_html + "\n")

    table_html = df_to_html(result.get("df", pd.DataFrame()))
    if table_html:
        parts.append(table_html)

    bot_message = "".join(parts)

    new_history = prior + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": bot_message}
    ]

    # (chatbot display, session state, textbox reset)
    return new_history, new_history, ""
builder/graph_builder.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import StateGraph
2
+ from builder.state import AgentState
3
+ from builder.nodes import sql_generator, sql_executor, chart_generator, narrator, decider, general_chat
4
+
5
def build_graph():
    """Compile the agent workflow: a router node that dispatches to either a
    conversational reply or the SQL analysis pipeline."""
    graph = StateGraph(AgentState)

    # Register every node under its routing name.
    for node_name, node_fn in (
        ("decider", decider),
        ("sql_generator", sql_generator),
        ("sql_executor", sql_executor),
        ("chart_generator", chart_generator),
        ("narrator", narrator),
        ("general_chat", general_chat),
    ):
        graph.add_node(node_name, node_fn)

    graph.set_entry_point("decider")

    # The decider writes state["route"]; that value picks the branch.
    graph.add_conditional_edges(
        "decider",
        lambda state: state["route"],
        {"sql": "sql_generator", "chat": "general_chat"},
    )

    # SQL branch: generate -> execute, then fan out to chart and narration.
    graph.add_edge("sql_generator", "sql_executor")
    graph.add_edge("sql_executor", "chart_generator")
    graph.add_edge("sql_executor", "narrator")

    return graph.compile()
builder/nodes.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from builder.state import AgentState
2
+ from datastore.db import conn
3
+ from clients.llm import complete
4
+ from utils.charts import df_to_pil_chart
5
+
6
def decider(state: dict) -> dict:
    """Route the request: 'sql' for database analysis, 'chat' for a direct reply."""

    history_text = "\n".join(
        f"{turn['role'].capitalize()}: {turn['content']}"
        for turn in state.get("history", [])
    )

    prompt = f"""
    You are a router. Decide whether the user question requires SQL/database analysis
    (tables: sales, marketing_spend, customers) OR if it can be answered directly
    as a general conversational reply.

    Chat history so far:
    {history_text}

    Current question: {state['question']}

    Answer ONLY with one word: "sql" or "chat".
    """

    verdict = complete(prompt).lower().strip()
    # Substring match tolerates answers like '"sql"' or 'sql.'; anything else
    # falls back to the chat route.
    return {"route": "sql" if "sql" in verdict else "chat"}
31
+
32
+
33
+ def sql_generator(state: AgentState) -> dict:
34
+ schema = """
35
+ Tables:
36
+ sales(date, region, product, revenue, units_sold)
37
+ marketing_spend(date, region, channel, spend)
38
+ customers(customer_id, region, age, income)
39
+ """
40
+ prompt = f"You are a helpful SQL expert. Schema: {schema}. Question: {state['question']}. Return only a SELECT SQL query and do not wrap it with ```sql tag."
41
+ sql = complete(prompt)
42
+ if not sql.lower().startswith("select"):
43
+ sql = "SELECT region, SUM(revenue) as total_revenue FROM sales GROUP BY region"
44
+ return {"sql": sql}
45
+
46
def sql_executor(state: AgentState) -> dict:
    """Run the generated SQL against the shared DuckDB connection.

    Returns {"df": result} with the query result as a pandas DataFrame.
    """
    result_frame = conn.execute(state["sql"]).df()
    return {"df": result_frame}
49
+
50
def chart_generator(state: AgentState) -> dict:
    """Render the query result as a chart image, titled with the question."""
    chart_image = df_to_pil_chart(state["df"], state["question"])
    return {"chart_pil": chart_image}
53
+
54
def narrator(state: AgentState) -> dict:
    """Ask the LLM for bullet-point insights about the query result."""
    records = state["df"].to_dict(orient="records")
    prompt = (
        f"Question: {state['question']}\n"
        f"Result: {records}\n"
        "Write 3-4 bullet point insights with one recommendation."
    )
    return {"narrative": complete(prompt)}
59
+
60
def general_chat(state: dict) -> dict:
    """Answer conversationally (no SQL), using the chat history for context."""

    history_text = "\n".join(
        f"{turn['role'].capitalize()}: {turn['content']}"
        for turn in state.get("history", [])
    )

    prompt = f"""
    You are a helpful assistant. Continue the conversation naturally.

    Chat history so far:
    {history_text}

    User question: {state['question']}
    """

    return {"narrative": complete(prompt)}
builder/state.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TypedDict, Literal
2
+ import pandas as pd
3
+ from PIL import Image
4
+
5
class AgentState(TypedDict, total=False):
    """State shared by all agent nodes; total=False makes every key optional,
    since each node populates only its own slice of the state."""

    question: str                    # current user question
    sql: str                         # generated SELECT/WITH query
    df: pd.DataFrame                 # query result from DuckDB
    chart_pil: Image.Image           # rendered chart image
    narrative: str                   # LLM-written insights or chat reply
    route: Literal["sql", "chat"]    # routing verdict from the decider node
    # Chat turns as {"role": "user"|"assistant", "content": ...} dicts —
    # this matches what run_agent appends and what the prompt builders read
    # (the previous list[tuple[str, str]] annotation did not).
    history: list[dict[str, str]]
builder/ui.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from builder.agent_runner import run_agent
3
+
4
def build_ui(app, tracer):
    """Assemble the Gradio chat interface around the compiled agent graph."""
    with gr.Blocks() as demo:
        gr.Markdown("# Datum : Autonomous Data Analysis Agent")
        chatbot = gr.Chatbot(type="messages")
        user_input = gr.Textbox(
            label="Ask a question",
            placeholder="Ex: Show me marketing spend by channel",
        )
        submit_btn = gr.Button("Send")

        # Per-session conversation history.
        state = gr.State([])

        # One handler shared by pressing Enter and clicking Send.
        def on_send(message, chat_history):
            return run_agent(app, tracer, message, chat_history)

        io_wiring = dict(
            inputs=[user_input, state],
            outputs=[chatbot, state, user_input],
        )
        user_input.submit(on_send, **io_wiring)
        submit_btn.click(on_send, **io_wiring)

    return demo
clients/llm.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Google Gemini client shared by every agent node."""
import os

from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI

# override=True lets the .env file win over stale shell variables.
load_dotenv(override=True)

api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    # Fail fast at import time rather than on the first LLM call.
    raise RuntimeError("Please set GOOGLE_API_KEY in your environment.")

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", google_api_key=api_key)


def complete(prompt: str) -> str:
    """Send *prompt* to Gemini and return the stripped text reply."""
    response = llm.invoke(prompt)
    return response.content.strip()
datastore/db.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import duckdb
import pandas as pd

# Single shared in-memory DuckDB connection for the whole application.
conn = duckdb.connect()

# Sample datasets; paths are relative to the project root (run from there).
sales = pd.read_csv("data/sales.csv")
marketing = pd.read_csv("data/marketing_spend.csv")
customers = pd.read_csv("data/customers.csv")

# Expose each DataFrame to SQL under a stable table name.
for _table_name, _frame in (
    ("sales", sales),
    ("marketing_spend", marketing),
    ("customers", customers),
):
    conn.register(_table_name, _frame)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ duckdb>=1.0.0
2
+ numpy>=1.25.2
3
+ pandas>=2.2.2
4
+ matplotlib>=3.8.4
5
+ gradio>=4.44.0
6
+ langgraph>=0.2.34
7
+ google-generativeai>=0.8.3
8
+ python-dotenv>=1.0.1
9
+ grandalf>=0.8
+ langchain-google-genai>=2.0.0
+ langchain-core>=0.3.0
+ pillow>=10.0.0
utils/charts.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import pandas as pd
3
+ import io
4
+ from PIL import Image
5
+
6
def df_to_pil_chart(df: pd.DataFrame, question: str) -> Image.Image:
    """Render *df* as a bar chart and return it as a PIL image.

    The first column supplies the x labels and the second the bar heights;
    with fewer than two columns a "No chart" placeholder is drawn instead.
    The *question* text becomes the chart title.

    NOTE: assumes the second column is numeric when present — true for the
    aggregate queries this app generates, but not enforced here.
    """
    fig, ax = plt.subplots()
    if df.shape[1] >= 2:
        labels = df.iloc[:, 0].astype(str)
        heights = df.iloc[:, 1]
        ax.bar(labels, heights)
        ax.set_xlabel(df.columns[0])
        ax.set_ylabel(df.columns[1])
        ax.set_title(question)
        # Operate on THIS axes explicitly. The previous plt.xticks(...) call
        # targeted the implicit "current" figure, which can be a different
        # figure when requests are handled on multiple threads.
        for tick in ax.get_xticklabels():
            tick.set_rotation(45)
            tick.set_horizontalalignment('right')
    else:
        ax.text(0.5, 0.5, "No chart", ha='center')
    buf = io.BytesIO()
    # Figure-level calls for the same reason: plt.tight_layout/plt.savefig
    # would act on whatever figure happens to be "current".
    fig.tight_layout()
    fig.savefig(buf, format='png', dpi=150)
    buf.seek(0)
    plt.close(fig)  # free the figure; matplotlib keeps figures alive otherwise
    return Image.open(buf)
utils/insight_utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from io import BytesIO
3
+ import base64
4
+
5
def df_to_html(df: pd.DataFrame):
    """Return *df* rendered as an HTML table (no index), or '' when empty."""
    return "" if df.empty else df.to_html(index=False)
9
+
10
def pil_to_base64(img):
    """Encode a PIL image as an inline HTML <img> tag; '' when img is None."""
    if img is None:
        return ""
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"<img src='data:image/png;base64,{encoded}' style='max-width:400px;'>"
utils/tracer_utils.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_core.tracers import LangChainTracer
3
+ from langchain_core.tracers.langchain import wait_for_all_tracers
4
+
5
def get_tracer():
    """Create a LangChainTracer bound to $LANGCHAIN_PROJECT (or a default name)."""
    project_name = os.getenv("LANGCHAIN_PROJECT", "default-project")
    return LangChainTracer(project_name=project_name)
8
+
9
def wait_for_tracers():
    """Block until all pending LangSmith trace uploads have been flushed."""
    wait_for_all_tracers()