Spaces:

NexDatawork
/

NexDatawork-Mini-Agent

Runtime error

App Files Files Community

andrealiao commited on Jan 12

Commit

edbfd50

1 Parent(s): 00d226e

update

Browse files

Files changed (2) hide show

README.md +1 -1
examples/app.py +531 -0

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🤖
 colorFrom: indigo
 colorTo: purple
 sdk: gradio
-app_file: app.py
 pinned: false
 ---

 colorFrom: indigo
 colorTo: purple
 sdk: gradio
+app_file: examples/app.py
 pinned: false
 ---

examples/app.py CHANGED Viewed

	@@ -0,0 +1,531 @@

+# -*- coding: utf-8 -*-
+"""data_agent_demo.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1DBkfSNSZIyONNTEgSILfCoOyAGrx13DY
+# Introduction
+NexDatawork is an AI data agent for data engineering and analytics without writing code.
+## Prerequisites
+- langchain
+- langgraph
+- sqlalchemy
+- pandas
+- gradio
+ Before starting your work install all the required tools:
+"""
+# Commented out IPython magic to ensure Python compatibility.
+# Clean out any mixed installs first
+# %pip uninstall -y langchain langchain-core langchain-community langchain-openai langchain-anthropic langchain-google-vertexai langchain-experimental langgraph langchain-scrapegraph
+# Install a consistent, modern set
+# %pip install -U \
+#  "langchain==0.3.*" \
+#  "langchain-core==0.3.*" \
+#  "langchain-community==0.3.*" \
+#  "langgraph>=0.2,<0.3" \
+#  "langchain-openai>=0.2.0" \
+#  "langchain-anthropic>=0.2.0" \
+#  "langchain-google-vertexai>=2.0.0" \
+#  "sqlalchemy>=2.0" \
+#  "pandas>=2.0" \
+#  "gradio>=4.0" \
+#  "langchain-experimental"\
+#  "langchain-scrapegraph"
+import sys, importlib.util, importlib.metadata as md
+def v(p):
+    try:
+        return md.version(p)
+    except md.PackageNotFoundError:
+        return "not installed"
+print("Kernel Python:", sys.executable)
+print("langchain:", v("langchain"))
+print("langchain-core:", v("langchain-core"))
+print("langchain-community:", v("langchain-community"))
+print("langgraph:", v("langgraph"))
+print("langchain-openai:", v("langchain-openai"))
+print("langchain-anthropic:", v("langchain-anthropic"))
+print("langchain-google-vertexai:", v("langchain-google-vertexai"))
+print("langchain-experimental:", v("langchain-experimental"))
+print("langchain-scrapegraph:", v("langchain-scrapegraph"))
+print("langgraph importable?", importlib.util.find_spec("langgraph") is not None)
+import os
+import io
+import contextlib
+import pandas as pd
+import gradio as gr
+from IPython.display import Markdown, HTML, display
+from sqlalchemy import (
+    Engine, create_engine, MetaData, Table, Column,
+    String, Integer, Float, insert, inspect, text
+)
+# LangChain 0.3.x import paths
+from langchain_openai import AzureChatOpenAI
+from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+from langchain.agents import initialize_agent
+from langchain.agents.agent_types import AgentType
+from langchain.tools import tool
+from langchain_scrapegraph.tools import SmartScraperTool
+from langchain.memory import ConversationTokenBufferMemory
+from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
+from langchain_community.agent_toolkits import SQLDatabaseToolkit
+from langchain_community.utilities import SQLDatabase
+from langchain_core.messages import HumanMessage
+# LangGraph
+from langgraph.prebuilt import create_react_agent
+print("✅ Imports OK")
+"""To access AzureOpenAI models you'll need to create an Azure account, create a deployment of an Azure OpenAI model, get the name and endpoint for your deployment, get an Azure OpenAI API key and install the langchain-openai integration package.
+To access SmartScraperTool you will need a ScrapeGraphAI (SGAI) account and get an API key to launch the agent.
+Replace the placeholders with the actual values.
+"""
+os.environ["AZURE_OPENAI_ENDPOINT"] = "INSERT THE AZURE OPENAI ENDPOINT"
+os.environ["AZURE_OPENAI_API_KEY"] = "INSERT YOUR AZURE OPENAI API KEY"
+os.environ["SGAI_API_KEY"] = "INSERT YOUR SGAI API KEY"
+"""To set up the Azure OpenAI model choose the name for ```AZURE_DEPLOYMENT_NAME``` and insert ```AZURE_API_VERSION``` (the latest supported version can be found here: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference)."""
+# Load your Azure environment variables
+AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+AZURE_DEPLOYMENT_NAME = "gpt-4.1"  # 👈 Change if needed
+AZURE_API_VERSION = "2025-01-01-preview"  # 👈 Use your correct version
+# Define Azure LLM with streaming enabled
+model = AzureChatOpenAI(
+    openai_api_version=AZURE_API_VERSION,
+    azure_deployment=AZURE_DEPLOYMENT_NAME,
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
+    streaming=True,
+    callbacks=[StreamingStdOutCallbackHandler()],
+)
+"""The following block contains prompts that define the agents behaviour.
+```CSV_PROMPT_PREFIX``` is responsible for the data agent logic, i.e. steps that it takes to complete a task. The prefix can be modified to change analytical methodology, add specific data processing steps, implement a certain data validation technique and more.
+```CSV_PROMPT_SUFFIX``` defines the structure and the content of the agent's output. Suffix can be modified to change the report structure, add sections, include additional insights and so on.
+```system_message``` is for creating SQL queries. It specifies the behaviour of the agent, making it certify its results and restricting it from changing the database.
+```SCRAPING_PROMPT_PREFIX``` is responsible for the web scraping agent logic. It specifies how the agent should behave and defines its chain of thought when asked to find data online.
+```SCRAPING_PROMPT_SUFFIX``` is responsible for the output of the web scraping agent. It can be changed to set up the format of the output.
+"""
+# Prompt prefix to set the tone for the agent.
+#By specifying the prompt prefix you may make the results of the agent more specific and consistent.
+#The following prompt can be substituted with an original one.
+CSV_PROMPT_PREFIX = """
+Set pandas to show all columns.
+Get the column names and infer data types.
+Then attempt to answer the question using multiple methods.
+Please provide only the Python code required to perform the action, and nothing else.
+"""
+#Prompt suffix describes the output format.
+#Modify this prompt to change the structure of the agent's answer.
+#You can also add more sections so that the agent touches more aspects.
+#The following prompt can be substituted with a personal one.
+CSV_PROMPT_SUFFIX = """
+- Try at least 2 different methods of calculation or filtering.
+- Reflect: Do they give the same result?
+- After performing all necessary actions and analysis with the dataframe, return the answer in clean **Markdown**, include summary table if needed.
+- Include **Execution Recommendation** and **Web Insight** in the final Markdown.
+- Always conclude the final Markdown with:
+### Final Answer
+Your conclusion here.
+---
+### Explanation
+Mention specific columns you used.
+Please provide only the Python code required to perform the action, and nothing else until the final Markdown output.
+"""
+#prompt for creating SQL queries
+#By secifying the pipeline you can make the agent's results more consistent.
+system_message = """
+You are an agent designed to interact with a SQL database.
+Given an input question, create a syntactically correct {dialect} query to run,
+then look at the results of the query and return the answer. Unless the user
+specifies a specific number of examples they wish to obtain, always limit your
+query to at most {top_k} results.
+You can order the results by a relevant column to return the most interesting
+examples in the database. Never query for all the columns from a specific table,
+only ask for the relevant columns given the question.
+You MUST double check your query before executing it. If you get an error while
+executing a query, rewrite the query and try again.
+DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the
+database.
+To start you should ALWAYS look at the tables in the database to see what you
+can query. Do NOT skip this step.
+Then you should query the schema of the most relevant tables.
+""".format(
+    dialect="SQLite",
+    top_k=5,
+)
+sql_suffix_prompt = '''
+ALWAYS end your answer as follows:
+### Final answer
+Your query here
+--
+The answer here
+'''
+SCRAPING_PROMPT_PREFIX = '''
+ROLE: Expert Data Scraper
+MISSION: Extract precise online data using systematic keyword analysis
+THINKING PROCESS:
+1. Keyword Analysis: Identify primary entities (X, Y) and quantifiers (n, m)
+2. Query Strategy: Formulate targeted search queries for each entity
+3. Data Extraction: Scrape exact quantities specified
+4. Validation: Verify results match request parameters
+EXAMPLE:
+User: "List first 5 startups and 3 investors in AI"
+Keywords: ["startups:5", "investors:3", "AI"]
+Action: Search "AI startups" → extract 5 instances → Search "AI investors" → extract 3 instances
+WORKFLOW:
+- Print identified keywords with quantities
+- Execute sequential searches per keyword group
+- Collect exactly specified instances
+- Present structured results
+READY FOR QUERY.
+'''
+SCRAPING_PROMPT_SUFFIX = '''
+ROLE: Data Extraction Agent
+MISSION: Structure all scraped data as valid pandas DataFrames
+OUTPUT REQUIREMENTS:
+- Format: pandas DataFrame
+- Columns: 1-2 word descriptive names
+- Content: Only strings or numerical values (no lists/dicts, no nested structures)
+- Validation: Must pass pd.DataFrame access tests
+VALIDATION CHECKLIST:
+✓ Each column contains only strings or numerics
+✓ No nested structures (lists/dicts) in cells
+✓ Column names are descriptive and concise
+✓ DataFrame is accessible via standard indexing
+✓ All columns MUST BE OF THE SAME LENGTH
+EXAMPLE OUTPUT:
+```python
+pd.DataFrame({
+    'Company': ['Startup A', 'Startup B'],
+    'Funding': [5000000, 7500000],
+    'Industry': 'Artificial Intelligence'
+})
+'''
+"""The following block is responsible for the logic of the agent and the output that it produces.
+```ask_agent``` function concatenates the dataframes into one and starts an AI agent for working with the concatenated dataframes. It uses the prompts from the previous blocks for its logic.
+"""
+# Replace this with your actual LLM setup
+# Example:
+# from langchain_openai import AzureChatOpenAI
+# model = AzureChatOpenAI(...)
+# --- Agent Logic ---
+def ask_agent(files, question, history):
+    try:
+        dfs = [pd.read_csv(f.name) for f in files]
+        df = pd.concat(dfs, ignore_index=True) #concatenation of all of the files uploaded into one
+    except Exception as e:
+        return f"❌ Could not read CSVs: {e}", ""
+    try:
+        agent = create_pandas_dataframe_agent(
+        llm=model, #sets the llm as the one specified earlier (Azure LLM)
+        df=df, #pandas dataframe or a list of pandas dataframes
+        verbose=True, #enables verbose logging for debugging
+        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION, #defines a specific type of agent that performs tasks without additional examples
+        allow_dangerous_code=True, #allows execution of Python code
+        handle_parsing_errors=True,  # 👈 this is the fix
+    ) #creates an agent for working with pandas dataframes
+        full_prompt = CSV_PROMPT_PREFIX + question + CSV_PROMPT_SUFFIX
+        buffer = io.StringIO()
+        with contextlib.redirect_stdout(buffer): #the output is redirected to the buffer
+            result = agent.invoke(full_prompt)
+        trace = buffer.getvalue() #retrieves the text created by the agent
+        output = result["output"] #retrieves the final answer
+        return history + output, output
+    except Exception as e:
+        return f"❌ Agent error: {e}", ""
+"""The block below deals with creating SQL code.
+```create_db``` creates a database where all the uploaded dataframes are stored for the data agent to work with.
+```start_llm``` starts a tool for working with SQL databases.
+```extract_code``` is used for extracting the SQL query from the agent's output.
+```sql_pipeline``` defines the pipeline, starting from creating a database with the uploaded dataframes, starting the agent for working with databases and creating the query according to the user's question.
+"""
+#function create_db receives a dictionary with table names as a key and tables as values
+def create_db(files):
+  print("="*10+"\nCREATE_DB\n"+"="*10)
+  try:
+      print("Attempting to create database...") # Added print statement here
+      engine = create_engine("sqlite:///database.db")
+      dataframes = dict()
+      print("="*10+f"CREATE_DB:\nfiles:{[f.name for f in files]}\n"+"="*10)
+      for f in files:
+        table_name = os.path.splitext(os.path.basename(f.name))[0]
+        dataframes[table_name] = pd.read_csv(f.name)
+      with engine.begin() as connection:
+        for name,table in zip(dataframes.keys(),dataframes.values()):
+          table.to_sql(name,connection,if_exists="replace",index=False) #writes the tables into a database
+      db = SQLDatabase.from_uri("sqlite:///database.db")
+      print("DATABASE database.db CREATED")
+  except Exception as e:
+      return f"Database error: {e}"
+  return db
+#Initialization of a LLM model for SQL queries
+def start_llm(database):
+  try:
+    print("="*10+"\nSTART_LLM\n"+"="*10)
+    toolkit = SQLDatabaseToolkit(db=database, llm=model) #creates a tool for working with SQL databases
+    tools = toolkit.get_tools()
+  except Exception as e:
+    return f"Couldn't retrieve SQLDatabaseToolkit: {e}"
+  print("\nSQLDatabaseToolkit CREATED\n")
+  return model, tools
+def extract_code(HumanMessage):
+    print("="*10+"\nEXTRACT_CODE\n"+"="*10)
+    try:
+        FRONT_INDENT = len('\n\n')
+        BACK_INDENT = len('\n')
+        p1 = HumanMessage.find('### Final answer')
+        print(p1,HumanMessage[p1:p1+50])
+        p2 = p1+FRONT_INDENT
+        return HumanMessage[p1:]
+    except Exception as e:
+      print(f'Extraction error: {e}')
+#Function that receives dataframes, puts them in a database and uses an AI agent to create quieries based on the user's question
+def sql_pipeline(tables,question,history):
+    print("="*10+"\nSQL_PIPELINE\n"+"="*10)
+    db = create_db(tables) #uploads the files added by the user and puts them in a database
+    if not os.path.exists("database.db"):
+      print("Database doesn't exist")
+      return "Database doesn't exist"
+    llm, tools = start_llm(db) #returns the agent and the tools for working with the database
+    try:
+        agent_executor = create_react_agent(llm, tools, prompt=system_message+sql_suffix_prompt)
+        output = ""
+        for step in agent_executor.stream(
+            {"messages": [{"role": "user", "content": question}]},
+            stream_mode="values",
+        ):
+            output += step["messages"][-1].content
+        #query = extract_code(output)
+        final_answer = extract_code(output)
+        return history + final_answer, final_answer
+    except Exception as e:
+        return f"SQL agent error: {e}"
+"""THe following block is responsible for creating a smart ETL pipeline"""
+@tool
+def preview_data(table: str) -> str:
+  "Reads and reviews a table"
+  df = pd.read_csv(table)
+  return df.head()
+@tool
+def suggest_transformation(column_summary: str) -> str:
+  "Suggests transformation based on column summary"
+  prompt = f"""
+  You are a data engineer assistant. Based on the following column summary, suggest simple, short ETL transformation steps.
+  Output format: each suggestion on a new line, without explanations or markdown.
+  Example:
+  Remove $ from revenue and cast to float
+  Column summary:
+  {column_summary}
+  """
+  return model.predict(prompt).strip()
+@tool
+def generate_python_code(transform_description: str) -> str:
+  "Generate pandas code from the transformation description"
+  prompt=f"""
+  You are a data engineer. Write pandas code to apply the following ETL transformation to a dataframe called 'df'.
+  Transformations:
+  {transform_description}
+  Only return pandas code. No explanation, no markdown.
+  """
+  return model.predict(prompt).strip()
+#llm is the agent that creates the etl pipeline
+#dataframe is a string with the name of the dataframe push through the etl process
+def etl_pipeline(dataframe,history):
+  tools = [preview_data, suggest_transformation, generate_python_code]
+  agent = initialize_agent(tools, model, agent='zero-shot-react-description',verbose=True)
+  input_prompt = f"""
+  Preview the table {dataframe} and \
+      generate Python code to read the table, clean it, and finally write the \
+      dataframe into a table called {'Cleaned_'+dataframe}]. \
+      Do not stop the Python session
+      """
+  # Preview + suggest + generate code in a single run
+  response = agent.run({
+      "input": input_prompt,
+      "chat_history": [],
+      "handle_parsing_errors": True
+  })
+  print("Generated Python Code:\n")
+  print(response)
+  response2 = response.strip('`').replace('python', '')
+  return history + response2, response2
+"""The following code is responsible for AI web scraping agent"""
+def web_scraping(question,history):
+  try:
+    tools = [
+        SmartScraperTool(),
+    ]
+    agent = initialize_agent(
+        tools=tools,
+        llm=model,
+        agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
+        verbose=True
+    )
+    buffer = io.StringIO()
+    print(SCRAPING_PROMPT_PREFIX + question + SCRAPING_PROMPT_SUFFIX)
+    with contextlib.redirect_stdout(buffer): #the output is redirected to the buffer
+        response = agent.run(SCRAPING_PROMPT_PREFIX + question + SCRAPING_PROMPT_SUFFIX)
+    trace = buffer.getvalue() #the trace of the agent is saved in the trace variable
+    return history + response, response
+  except Exception as e:
+    return f'Web scraping error {e}',f'Web scraping error {e}',""
+"""The next section creates a web interface using Gradio, providing a user-friendly way to analyze data and create SQL queries.
+```
+with gr.Blocks(
+    css='''
+      Change the code here to modify the styling of the UI
+    '''
+    ) as demo:
+```
+**Display Area**:
+- `result_display`: Markdown report output
+- `trace_display`: Agent reasoning trace
+**Input Section**:
+- `file_input`: Multiple CSV upload
+- `question_input`: User query box
+**Action Buttons**:
+- `sql_button`: Generate SQL queries → `sql_pipeline` function
+- `ask_button`: Run analysis → `ask_agent` function
+**Styling**
+- Light theme with rounded corners
+- Custom CSS for professional appearance
+**Launch**
+`demo.launch(share=True,debug=False)` - Public access enabled, debugging disabled
+For debugging use `debug=True` in order to see the messages in the console.
+"""
+# --- Gradio UI ---
+with gr.Blocks(
+    css="""
+    body, .gradio-container {
+        background: #ffffff !important;
+        color: #1f2937 !important;
+        font-family: 'Segoe UI', sans-serif;
+    }
+    #title {
+        color: #1f2937 !important;
+        font-size: 2rem;
+        font-weight: 600;
+        text-align: center;
+        padding-top: 20px;
+        padding-bottom: 10px;
+    }
+    .gr-box, .gr-input, .gr-output, .gr-markdown, .gr-textbox, .gr-file, textarea, input {
+        background: rgba(0, 0, 0, 0.04) !important;
+        border: 1px solid rgba(0, 0, 0, 0.1);
+        border-radius: 12px !important;
+        color: #1f2937 !important;
+    }
+    .trace-markdown {
+        height: 400px !important;
+        overflow-y: scroll;
+        resize: none;
+    }
+    textarea::placeholder, input::placeholder {
+        color: rgba(31, 41, 55, 0.6) !important;
+    }
+    button {
+        background: rgba(0, 0, 0, 0.07) !important;
+        color: #1f2937 !important;
+        border: 1px solid rgba(0, 0, 0, 0.15) !important;
+        border-radius: 8px !important;
+    }
+    button:hover {
+        background: rgba(0, 0, 0, 0.15) !important;
+    }
+    """
+) as demo:
+    gr.Markdown("<h2 id='title'>📊 NexDatawork Data Agent</h2>")
+    with gr.Column():
+      result_display = gr.Markdown(label="📌 Report Output (Markdown)")
+      with gr.Row():
+        trace_display = gr.Markdown(label="🛠️ Data Agent Reasoning - Your Explainable Agent", elem_classes=["trace-markdown"])
+        sql_display = gr.Markdown(label='SQL Process')
+      with gr.Row(equal_height=True):
+        file_input = gr.File(label="📁 Upload CSV(s)", file_types=[".csv"], file_count="multiple",height=120)
+        question_input = gr.Textbox(label="💬 Ask Your Agent",placeholder="e.g., What is the trend for revenue over time?",lines=2)
+      with gr.Row():
+        ask_button = gr.Button("💡 Analyze")
+        with gr.Row():
+          sql_button = gr.Button('Create Query')
+          scraping_button = gr.Button('Find the answer online')
+          history = gr.State(value="")
+        sql_button.click(fn=sql_pipeline,inputs=[file_input,question_input,history],outputs = [trace_display,history])
+        scraping_button.click(fn=web_scraping,inputs=[question_input,history],outputs = [trace_display,history])
+        ask_button.click(fn=ask_agent,inputs=[file_input, question_input,history],outputs=[trace_display,history])
+demo.launch(share=True,debug=False)