subhamb04 commited on
Commit
9cccf74
·
verified ·
1 Parent(s): 41e5f2f

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------
2
+ # Python
3
+ # ------------------------------
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+ *.pyd
11
+ *.dll
12
+
13
+ # ------------------------------
14
+ # Environments
15
+ # ------------------------------
16
+ .venv/
17
+ venv/
18
+ env/
19
+ ENV/
20
+ .venv*/
21
+ venv*/
22
+ env*/
23
+ ENV*/
24
+ .python-version
25
+
26
+ # ------------------------------
27
+ # Distribution / packaging
28
+ # ------------------------------
29
+ .Python
30
+ build/
31
+ dist/
32
+ downloads/
33
+ eggs/
34
+ .eggs/
35
+ sdist/
36
+ wheels/
37
+ share/python-wheels/
38
+ *.egg-info/
39
+ .installed.cfg
40
+ *.egg
41
+ MANIFEST
42
+ pip-wheel-metadata/
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # ------------------------------
47
+ # Unit test / coverage reports
48
+ # ------------------------------
49
+ htmlcov/
50
+ .tox/
51
+ .nox/
52
+ .coverage
53
+ .coverage.*
54
+ .cache
55
+ nosetests.xml
56
+ coverage.xml
57
+ *.cover
58
+ *.py,cover
59
+ .pytest_cache/
60
+ junit*.xml
61
+
62
+ # ------------------------------
63
+ # Type checkers / linters
64
+ # ------------------------------
65
+ .mypy_cache/
66
+ .dmypy.json
67
+ dmypy.json
68
+ .pyre/
69
+ .pytype/
70
+ .ruff_cache/
71
+
72
+ # ------------------------------
73
+ # PyInstaller
74
+ # ------------------------------
75
+ *.manifest
76
+ *.spec
77
+
78
+ # ------------------------------
79
+ # Jupyter
80
+ # ------------------------------
81
+ .ipynb_checkpoints/
82
+
83
+ # ------------------------------
84
+ # Logs and runtime files
85
+ # ------------------------------
86
+ logs/
87
+ *.log
88
+ *.pid
89
+ *.pid.lock
90
+
91
+ # ------------------------------
92
+ # Local environment variables & secrets
93
+ # ------------------------------
94
+ .env
95
+ .env.*
96
+ !.env.example
97
+
98
+ # ------------------------------
99
+ # Editors / IDEs / Tooling
100
+ # ------------------------------
101
+ .idea/
102
+ *.iml
103
+ .vscode/
104
+ .history/
105
+ .cursor/
106
+ *.code-workspace
107
+
108
+ # ------------------------------
109
+ # OS-specific
110
+ # ------------------------------
111
+ .DS_Store
112
+ Thumbs.db
113
+ ehthumbs.db
114
+ Desktop.ini
115
+
116
+ # ------------------------------
117
+ # Optional local data & temp
118
+ # ------------------------------
119
+ tmp/
120
+ temp/
121
+ data/
122
+
README.md CHANGED
@@ -1,12 +1,152 @@
1
- ---
2
- title: Datum
3
- emoji: 🦀
4
- colorFrom: green
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.44.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: datum
3
+ app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.44.1
6
+ ---
7
+ # Datum - AI-Powered Data Analysis Agent
8
+
9
+ A simple yet powerful data analysis agent that uses AI to generate SQL queries, execute them against your data, and provide visualizations and insights through a web interface.
10
+
11
+ ## Features
12
+
13
+ - **Natural Language Queries**: Ask questions about your data in plain English
14
+ - **Auto Routing (Chat vs SQL)**: Agent decides between a quick chat reply or full SQL/database analysis
15
+ - **AI-Generated SQL**: Automatically converts questions into SQL queries
16
+ - **Data Visualization**: Generates charts and graphs from query results
17
+ - **Intelligent Insights**: Provides narrative analysis and recommendations
18
+ - **Web Interface**: Clean, user-friendly Gradio interface
19
+ - **DuckDB Integration**: Fast, in-memory SQL database for data analysis
20
+ - **LangSmith Tracing**: Built-in observability and debugging with LangSmith integration
21
+
22
+ ## Project Structure
23
+
24
+ ```
25
+ datum/
26
+ ├── app.py # Main application with LangGraph workflow
27
+ ├── builder/
28
+ │ ├── graph_builder.py # Graph with router + conditional edges
29
+ │ ├── nodes.py # Agent nodes (decider, chat, SQL, charting, narration)
30
+ │ ├── state.py # Typed state definition for the agent
31
+ │ └── ui.py # Gradio UI wiring
32
+ ├── clients/
33
+ │ └── llm.py # LLM configuration (Google Gemini)
34
+ ├── datastore/
35
+ │ └── db.py # DuckDB setup and data loading
36
+ ├── utils/
37
+ │ ├── charts.py # Chart generation utilities
38
+ │ ├── insight_utils.py # Insight helpers
39
+ │ └── tracer_utils.py # LangSmith tracing helpers
40
+ ├── data/ # Sample datasets
41
+ │ ├── sales.csv
42
+ │ ├── marketing_spend.csv
43
+ │ └── customers.csv
44
+ └── requirements.txt # Python dependencies
45
+ ```
46
+
47
+ ## Setup Instructions
48
+
49
+ ### Prerequisites
50
+
51
+ - Python 3.9 or higher (the code uses built-in generic type hints such as `list[tuple[str, str]]`)
52
+ - Google API key for Gemini AI
53
+
54
+ ### Installation
55
+
56
+ 1. **Clone the repository**
57
+ ```bash
58
+ git clone <repository-url>
59
+ cd datum
60
+ ```
61
+
62
+ 2. **Create a virtual environment**
63
+ ```bash
64
+ python -m venv venv
65
+ source venv/bin/activate # On Windows: venv\Scripts\activate
66
+ ```
67
+
68
+ 3. **Install dependencies**
69
+ ```bash
70
+ pip install -r requirements.txt
71
+ ```
72
+
73
+ 4. **Set up environment variables**
74
+ Create a `.env` file in the project root:
75
+ ```bash
76
+ GOOGLE_API_KEY=your_google_api_key_here
77
+ LANGCHAIN_PROJECT=datum-analysis # Optional: for LangSmith tracing
78
+ LANGCHAIN_API_KEY=your_langsmith_api_key # Optional: for LangSmith tracing
79
+ LANGCHAIN_TRACING_V2=true # Optional: enable LangSmith tracing
80
+ ```
81
+
82
+ 5. **Run the application**
83
+ ```bash
84
+ python app.py
85
+ ```
86
+
87
+ 6. **Access the web interface**
88
+ Open your browser and navigate to the URL shown in the terminal (typically `http://127.0.0.1:7860`)
89
+
90
+ ## Usage
91
+
92
+ 1. **Ask a question**: Type your data analysis question in natural language
93
+ - Example: "What are the top 3 regions by revenue?"
94
+ - Example: "Show me marketing spend by channel"
95
+ - Example: "Which products have the highest unit sales?"
96
+
97
+ 2. **Agent chooses the path automatically**
98
+ - **Chat route**: Direct conversational answer when no database analysis is needed
99
+ - **SQL route**: The agent generates SQL and provides:
100
+ - **Query Result** (table)
101
+ - **Chart** (visualization)
102
+ - **Insights** (narrative + recommendation)
103
+ - **SQL** (for transparency)
104
+
105
+ ### Routing at a Glance
106
+ The `decider` node analyzes your question and sets a `route` of `chat` or `sql`. The graph then either calls `general_chat` or runs the SQL flow (`sql_generator` → `sql_executor` → `chart_generator` + `narrator`).
107
+
108
+ ## Sample Data
109
+
110
+ The project includes sample datasets:
111
+ - **Sales**: Date, region, product, revenue, units sold
112
+ - **Marketing Spend**: Date, region, channel, spend amount
113
+ - **Customers**: Customer ID, region, age, income
114
+
115
+ ## Technology Stack
116
+
117
+ - **LangGraph**: Workflow orchestration
118
+ - **Google Gemini**: AI language model
119
+ - **DuckDB**: In-memory SQL database
120
+ - **Gradio**: Web interface
121
+ - **Matplotlib**: Chart generation
122
+ - **Pandas**: Data manipulation
123
+ - **LangSmith**: Observability and tracing platform
124
+
125
+ ## Customization
126
+
127
+ - **Add your own data**: Replace CSV files in the `data/` directory and update the schema in `nodes.py`
128
+ - **Modify the LLM**: Change the model or provider in `llm.py`
129
+ - **Customize charts**: Modify chart generation logic in `charts.py`
130
+ - **Extend the workflow**: Add new nodes in `builder/nodes.py` and wire them into the LangGraph workflow in `builder/graph_builder.py`
131
+
132
+ ## Observability & Debugging
133
+
134
+ The application includes built-in LangSmith tracing for monitoring and debugging:
135
+
136
+ - **Trace Execution**: All agent steps are automatically traced and logged
137
+ - **Performance Monitoring**: Track execution times and token usage
138
+ - **Debug Information**: View detailed logs of SQL generation, execution, and LLM calls
139
+ - **Project Organization**: Traces are organized by project name for easy filtering
140
+
141
+ To enable tracing, set the LangSmith environment variables in your `.env` file. Without these variables, the application will run normally but without tracing capabilities.
142
+
143
+ ## Troubleshooting
144
+
145
+ - **API Key Error**: Ensure your `GOOGLE_API_KEY` is set correctly in the `.env` file
146
+ - **Import Errors**: Make sure all dependencies are installed with `pip install -r requirements.txt`
147
+ - **Data Issues**: Verify your CSV files are in the correct format and location
148
+ - **Tracing Issues**: Check LangSmith credentials if you want to use the observability features
149
+
150
+ ## License
151
+
152
+ This project is open source and available under the MIT License.
app.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from builder.graph_builder import build_graph
from builder.ui import build_ui
from utils.tracer_utils import get_tracer, wait_for_tracers


def main() -> None:
    """Build the agent graph, wire it into the Gradio UI, and launch the app."""
    langsmith_tracer = get_tracer()
    agent_app = build_graph()
    interface = build_ui(agent_app, langsmith_tracer)

    try:
        # Open in the default browser; blocks until the server shuts down.
        interface.launch(inbrowser=True)
    finally:
        # Flush any pending LangSmith trace uploads before the process exits.
        wait_for_tracers()


if __name__ == "__main__":
    main()
builder/agent_runner.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from utils.insight_utils import df_to_html, pil_to_base64
3
+ from builder.state import AgentState
4
+
5
def run_agent(app, tracer, message, history):
    """Run one chat turn through the agent graph.

    Invokes the compiled LangGraph `app` with the user's `message` and prior
    `history`, formats the resulting state (narrative, SQL, chart, table) as a
    single Markdown/HTML message, and returns the values Gradio expects:
    (chatbot messages, new state, cleared textbox).
    """
    prior = history or []

    result: AgentState = app.invoke(
        {"question": message, "history": prior},
        config={"callbacks": [tracer]}
    )

    # Assemble the assistant reply from whichever pieces the graph produced.
    parts = []

    narrative = result.get("narrative")
    if narrative:
        parts.append(f"**Datum:**\n{narrative}\n\n")

    sql = result.get("sql")
    if sql:
        parts.append(f"**SQL:**\n```sql\n{sql}\n```\n")

    chart_html = pil_to_base64(result.get("chart_pil"))
    if chart_html:
        parts.append(chart_html + "\n")

    table_html = df_to_html(result.get("df", pd.DataFrame()))
    if table_html:
        parts.append(table_html)

    bot_message = "".join(parts)

    new_history = prior + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": bot_message}
    ]

    # (chatbot display, session state, textbox reset)
    return new_history, new_history, ""
builder/graph_builder.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import StateGraph
2
+ from builder.state import AgentState
3
+ from builder.nodes import sql_generator, sql_executor, chart_generator, narrator, decider, general_chat
4
+
5
def build_graph():
    """Compile the agent workflow: a router node that dispatches to either a
    conversational reply or the SQL analysis pipeline."""
    graph = StateGraph(AgentState)

    # Register every node under its routing name.
    for node_name, node_fn in (
        ("decider", decider),
        ("sql_generator", sql_generator),
        ("sql_executor", sql_executor),
        ("chart_generator", chart_generator),
        ("narrator", narrator),
        ("general_chat", general_chat),
    ):
        graph.add_node(node_name, node_fn)

    graph.set_entry_point("decider")

    # The decider writes state["route"]; that value picks the branch.
    graph.add_conditional_edges(
        "decider",
        lambda state: state["route"],
        {"sql": "sql_generator", "chat": "general_chat"},
    )

    # SQL branch: generate -> execute, then fan out to chart and narration.
    graph.add_edge("sql_generator", "sql_executor")
    graph.add_edge("sql_executor", "chart_generator")
    graph.add_edge("sql_executor", "narrator")

    return graph.compile()
builder/nodes.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from builder.state import AgentState
2
+ from datastore.db import conn
3
+ from clients.llm import complete
4
+ from utils.charts import df_to_pil_chart
5
+
6
def decider(state: dict) -> dict:
    """Route the request: 'sql' for database analysis, 'chat' for a direct reply."""

    history_text = "\n".join(
        f"{turn['role'].capitalize()}: {turn['content']}"
        for turn in state.get("history", [])
    )

    prompt = f"""
    You are a router. Decide whether the user question requires SQL/database analysis
    (tables: sales, marketing_spend, customers) OR if it can be answered directly
    as a general conversational reply.

    Chat history so far:
    {history_text}

    Current question: {state['question']}

    Answer ONLY with one word: "sql" or "chat".
    """

    verdict = complete(prompt).lower().strip()
    # Substring match tolerates answers like '"sql"' or 'sql.'; anything else
    # falls back to the chat route.
    return {"route": "sql" if "sql" in verdict else "chat"}
31
+
32
+
33
+ def sql_generator(state: AgentState) -> dict:
34
+ schema = """
35
+ Tables:
36
+ sales(date, region, product, revenue, units_sold)
37
+ marketing_spend(date, region, channel, spend)
38
+ customers(customer_id, region, age, income)
39
+ """
40
+ prompt = f"You are a helpful SQL expert. Schema: {schema}. Question: {state['question']}. Return only a SELECT SQL query and do not wrap it with ```sql tag."
41
+ sql = complete(prompt)
42
+ if not sql.lower().startswith("select"):
43
+ sql = "SELECT region, SUM(revenue) as total_revenue FROM sales GROUP BY region"
44
+ return {"sql": sql}
45
+
46
def sql_executor(state: AgentState) -> dict:
    """Run the generated SQL against the shared DuckDB connection.

    Returns {"df": result} with the query result as a pandas DataFrame.
    """
    result_frame = conn.execute(state["sql"]).df()
    return {"df": result_frame}
49
+
50
def chart_generator(state: AgentState) -> dict:
    """Render the query result as a chart image, titled with the question."""
    chart_image = df_to_pil_chart(state["df"], state["question"])
    return {"chart_pil": chart_image}
53
+
54
def narrator(state: AgentState) -> dict:
    """Ask the LLM for bullet-point insights about the query result."""
    records = state["df"].to_dict(orient="records")
    prompt = (
        f"Question: {state['question']}\n"
        f"Result: {records}\n"
        "Write 3-4 bullet point insights with one recommendation."
    )
    return {"narrative": complete(prompt)}
59
+
60
def general_chat(state: dict) -> dict:
    """Answer conversationally (no SQL), using the chat history for context."""

    history_text = "\n".join(
        f"{turn['role'].capitalize()}: {turn['content']}"
        for turn in state.get("history", [])
    )

    prompt = f"""
    You are a helpful assistant. Continue the conversation naturally.

    Chat history so far:
    {history_text}

    User question: {state['question']}
    """

    return {"narrative": complete(prompt)}
builder/state.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TypedDict, Literal
2
+ import pandas as pd
3
+ from PIL import Image
4
+
5
class AgentState(TypedDict, total=False):
    """State shared by all agent nodes; total=False makes every key optional,
    since each node populates only its own slice of the state."""

    question: str                    # current user question
    sql: str                         # generated SELECT/WITH query
    df: pd.DataFrame                 # query result from DuckDB
    chart_pil: Image.Image           # rendered chart image
    narrative: str                   # LLM-written insights or chat reply
    route: Literal["sql", "chat"]    # routing verdict from the decider node
    # Chat turns as {"role": "user"|"assistant", "content": ...} dicts —
    # this matches what run_agent appends and what the prompt builders read
    # (the previous list[tuple[str, str]] annotation did not).
    history: list[dict[str, str]]
builder/ui.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from builder.agent_runner import run_agent
3
+
4
def build_ui(app, tracer):
    """Assemble the Gradio chat interface around the compiled agent graph."""
    with gr.Blocks() as demo:
        gr.Markdown("# Datum : Autonomous Data Analysis Agent")
        chatbot = gr.Chatbot(type="messages")
        user_input = gr.Textbox(
            label="Ask a question",
            placeholder="Ex: Show me marketing spend by channel",
        )
        submit_btn = gr.Button("Send")

        # Per-session conversation history.
        state = gr.State([])

        # One handler shared by pressing Enter and clicking Send.
        def on_send(message, chat_history):
            return run_agent(app, tracer, message, chat_history)

        io_wiring = dict(
            inputs=[user_input, state],
            outputs=[chatbot, state, user_input],
        )
        user_input.submit(on_send, **io_wiring)
        submit_btn.click(on_send, **io_wiring)

    return demo
clients/llm.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Google Gemini client shared by every agent node."""
import os

from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI

# override=True lets the .env file win over stale shell variables.
load_dotenv(override=True)

api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    # Fail fast at import time rather than on the first LLM call.
    raise RuntimeError("Please set GOOGLE_API_KEY in your environment.")

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", google_api_key=api_key)


def complete(prompt: str) -> str:
    """Send *prompt* to Gemini and return the stripped text reply."""
    response = llm.invoke(prompt)
    return response.content.strip()
datastore/db.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import duckdb
import pandas as pd

# Single shared in-memory DuckDB connection for the whole application.
conn = duckdb.connect()

# Sample datasets; paths are relative to the project root (run from there).
sales = pd.read_csv("data/sales.csv")
marketing = pd.read_csv("data/marketing_spend.csv")
customers = pd.read_csv("data/customers.csv")

# Expose each DataFrame to SQL under a stable table name.
for _table_name, _frame in (
    ("sales", sales),
    ("marketing_spend", marketing),
    ("customers", customers),
):
    conn.register(_table_name, _frame)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ duckdb>=1.0.0
2
+ numpy>=1.25.2
3
+ pandas>=2.2.2
4
+ matplotlib>=3.8.4
5
+ gradio>=4.44.0
6
+ langgraph>=0.2.34
7
+ google-generativeai>=0.8.3
8
+ python-dotenv>=1.0.1
9
+ grandalf>=0.8
+ langchain-google-genai>=2.0.0
+ langchain-core>=0.3.0
+ pillow>=10.0.0
utils/charts.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import pandas as pd
3
+ import io
4
+ from PIL import Image
5
+
6
def df_to_pil_chart(df: pd.DataFrame, question: str) -> Image.Image:
    """Render *df* as a bar chart and return it as a PIL image.

    The first column supplies the x labels and the second the bar heights;
    with fewer than two columns a "No chart" placeholder is drawn instead.
    The *question* text becomes the chart title.

    NOTE: assumes the second column is numeric when present — true for the
    aggregate queries this app generates, but not enforced here.
    """
    fig, ax = plt.subplots()
    if df.shape[1] >= 2:
        labels = df.iloc[:, 0].astype(str)
        heights = df.iloc[:, 1]
        ax.bar(labels, heights)
        ax.set_xlabel(df.columns[0])
        ax.set_ylabel(df.columns[1])
        ax.set_title(question)
        # Operate on THIS axes explicitly. The previous plt.xticks(...) call
        # targeted the implicit "current" figure, which can be a different
        # figure when requests are handled on multiple threads.
        for tick in ax.get_xticklabels():
            tick.set_rotation(45)
            tick.set_horizontalalignment('right')
    else:
        ax.text(0.5, 0.5, "No chart", ha='center')
    buf = io.BytesIO()
    # Figure-level calls for the same reason: plt.tight_layout/plt.savefig
    # would act on whatever figure happens to be "current".
    fig.tight_layout()
    fig.savefig(buf, format='png', dpi=150)
    buf.seek(0)
    plt.close(fig)  # free the figure; matplotlib keeps figures alive otherwise
    return Image.open(buf)
utils/insight_utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from io import BytesIO
3
+ import base64
4
+
5
def df_to_html(df: pd.DataFrame):
    """Return *df* rendered as an HTML table (no index), or '' when empty."""
    return "" if df.empty else df.to_html(index=False)
9
+
10
def pil_to_base64(img):
    """Encode a PIL image as an inline HTML <img> tag; '' when img is None."""
    if img is None:
        return ""
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"<img src='data:image/png;base64,{encoded}' style='max-width:400px;'>"
utils/tracer_utils.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_core.tracers import LangChainTracer
3
+ from langchain_core.tracers.langchain import wait_for_all_tracers
4
+
5
def get_tracer():
    """Create a LangChainTracer bound to $LANGCHAIN_PROJECT (or a default name)."""
    project_name = os.getenv("LANGCHAIN_PROJECT", "default-project")
    return LangChainTracer(project_name=project_name)
8
+
9
def wait_for_tracers():
    """Block until all pending LangSmith trace uploads have been flushed."""
    wait_for_all_tracers()