Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """data_agent_demo.ipynb | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1DBkfSNSZIyONNTEgSILfCoOyAGrx13DY | |
| # Introduction | |
| NexDatawork is an AI data agent for data engineering and analytics without writing code. | |
| ## Prerequisites | |
| - langchain | |
| - langgraph | |
| - sqlalchemy | |
| - pandas | |
| - gradio | |
| Before starting your work install all the required tools: | |
| """ | |
| # Commented out IPython magic to ensure Python compatibility. | |
| # Clean out any mixed installs first | |
| # %pip uninstall -y langchain langchain-core langchain-community langchain-openai langchain-anthropic langchain-google-vertexai langchain-experimental langgraph langchain-scrapegraph | |
| # Install a consistent, modern set | |
| # %pip install -U \ | |
| # "langchain==0.3.*" \ | |
| # "langchain-core==0.3.*" \ | |
| # "langchain-community==0.3.*" \ | |
| # "langgraph>=0.2,<0.3" \ | |
| # "langchain-openai>=0.2.0" \ | |
| # "langchain-anthropic>=0.2.0" \ | |
| # "langchain-google-vertexai>=2.0.0" \ | |
| # "sqlalchemy>=2.0" \ | |
| # "pandas>=2.0" \ | |
| # "gradio>=4.0" \ | |
| # "langchain-experimental"\ | |
| # "langchain-scrapegraph" | |
# Diagnostic report: which interpreter is running and which versions of the
# LangChain/LangGraph stack are installed, so mixed installs are easy to spot.
import sys, importlib.util, importlib.metadata as md

def v(p):
    """Return the installed version of distribution *p*, or 'not installed'."""
    try:
        return md.version(p)
    except md.PackageNotFoundError:
        return "not installed"

print("Kernel Python:", sys.executable)
# One line per package; output is identical to printing each call by hand.
for _pkg in (
    "langchain",
    "langchain-core",
    "langchain-community",
    "langgraph",
    "langchain-openai",
    "langchain-anthropic",
    "langchain-google-vertexai",
    "langchain-experimental",
    "langchain-scrapegraph",
):
    print(f"{_pkg}:", v(_pkg))
print("langgraph importable?", importlib.util.find_spec("langgraph") is not None)
| import os | |
| import io | |
| import contextlib | |
| import pandas as pd | |
| import gradio as gr | |
| from IPython.display import Markdown, HTML, display | |
| from sqlalchemy import ( | |
| Engine, create_engine, MetaData, Table, Column, | |
| String, Integer, Float, insert, inspect, text | |
| ) | |
| # LangChain 0.3.x import paths | |
| from langchain_openai import AzureChatOpenAI | |
| from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler | |
| from langchain.agents import initialize_agent | |
| from langchain.agents.agent_types import AgentType | |
| from langchain.tools import tool | |
| from langchain_scrapegraph.tools import SmartScraperTool | |
| from langchain.memory import ConversationTokenBufferMemory | |
| from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent | |
| from langchain_community.agent_toolkits import SQLDatabaseToolkit | |
| from langchain_community.utilities import SQLDatabase | |
| from langchain_core.messages import HumanMessage | |
| # LangGraph | |
| from langgraph.prebuilt import create_react_agent | |
| print("β Imports OK") | |
| """To access AzureOpenAI models you'll need to create an Azure account, create a deployment of an Azure OpenAI model, get the name and endpoint for your deployment, get an Azure OpenAI API key and install the langchain-openai integration package. | |
| To access SmartScraperTool you will need a ScrapeGraphAI (SGAI) account and get an API key to launch the agent. | |
| Replace the placeholders with the actual values. | |
| """ | |
| os.environ["AZURE_OPENAI_ENDPOINT"] = "INSERT THE AZURE OPENAI ENDPOINT" | |
| os.environ["AZURE_OPENAI_API_KEY"] = "INSERT YOUR AZURE OPENAI API KEY" | |
| os.environ["SGAI_API_KEY"] = "INSERT YOUR SGAI API KEY" | |
| """To set up the Azure OpenAI model choose the name for ```AZURE_DEPLOYMENT_NAME``` and insert ```AZURE_API_VERSION``` (the latest supported version can be found here: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference).""" | |
# Load the Azure endpoint set above; deployment name and API version are
# hard-coded here — change them to match your Azure OpenAI deployment.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_DEPLOYMENT_NAME = "gpt-4.1"  # change if your deployment is named differently
AZURE_API_VERSION = "2025-01-01-preview"  # use the API version supported by your resource

# Shared chat model used by every agent below, with token streaming enabled.
model = AzureChatOpenAI(
    openai_api_version=AZURE_API_VERSION,
    azure_deployment=AZURE_DEPLOYMENT_NAME,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    streaming=True,  # emit tokens as they are generated
    callbacks=[StreamingStdOutCallbackHandler()],  # echo streamed tokens to stdout
)
| """The following block contains prompts that define the agents behaviour. | |
| ```CSV_PROMPT_PREFIX``` is responsible for the data agent logic, i.e. steps that it takes to complete a task. The prefix can be modified to change analytical methodology, add specific data processing steps, implement a certain data validation technique and more. | |
| ```CSV_PROMPT_SUFFIX``` defines the structure and the content of the agent's output. Suffix can be modified to change the report structure, add sections, include additional insights and so on. | |
| ```system_message``` is for creating SQL queries. It specifies the behaviour of the agent, making it certify its results and restricting it from changing the database. | |
| ```SCRAPING_PROMPT_PREFIX``` is responsible for the web scraping agent logic. It specifies how the agent should behave and defines its chain of thought when asked to find data online. | |
| ```SCRAPING_PROMPT_SUFFIX``` is responsible for the output of the web scraping agent. It can be changed to set up the format of the output. | |
| """ | |
# Prompt prefix that sets the data agent's working method (prepended to the
# user question in ask_agent). Edit it to change the analytical methodology,
# add data-processing steps, or enforce validation techniques.
CSV_PROMPT_PREFIX = """
Set pandas to show all columns.
Get the column names and infer data types.
Then attempt to answer the question using multiple methods.
Please provide only the Python code required to perform the action, and nothing else.
"""

# Prompt suffix that describes the output format (appended after the user
# question in ask_agent). Modify it to change the structure of the agent's
# Markdown report or to add extra sections/insights.
CSV_PROMPT_SUFFIX = """
- Try at least 2 different methods of calculation or filtering.
- Reflect: Do they give the same result?
- After performing all necessary actions and analysis with the dataframe, return the answer in clean **Markdown**, include summary table if needed.
- Include **Execution Recommendation** and **Web Insight** in the final Markdown.
- Always conclude the final Markdown with:
### Final Answer
Your conclusion here.
---
### Explanation
Mention specific columns you used.
Please provide only the Python code required to perform the action, and nothing else until the final Markdown output.
"""
# System prompt for the SQL agent; .format() fills in the SQL dialect and the
# default row limit. Specifying the pipeline makes the agent's results more
# consistent and forbids DML so the database stays read-only.
system_message = """
You are an agent designed to interact with a SQL database.
Given an input question, create a syntactically correct {dialect} query to run,
then look at the results of the query and return the answer. Unless the user
specifies a specific number of examples they wish to obtain, always limit your
query to at most {top_k} results.
You can order the results by a relevant column to return the most interesting
examples in the database. Never query for all the columns from a specific table,
only ask for the relevant columns given the question.
You MUST double check your query before executing it. If you get an error while
executing a query, rewrite the query and try again.
DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the
database.
To start you should ALWAYS look at the tables in the database to see what you
can query. Do NOT skip this step.
Then you should query the schema of the most relevant tables.
""".format(
    dialect="SQLite",  # create_db writes database.db as SQLite
    top_k=5,  # default LIMIT applied to generated queries
)

# Suffix forcing a recognizable '### Final answer' section, which
# extract_code() later searches for in the agent's output.
sql_suffix_prompt = '''
ALWAYS end your answer as follows:
### Final answer
Your query here
--
The answer here
'''
# Prefix for the web-scraping agent: defines its role and the chain of
# thought it should follow when asked to find data online.
SCRAPING_PROMPT_PREFIX = '''
ROLE: Expert Data Scraper
MISSION: Extract precise online data using systematic keyword analysis
THINKING PROCESS:
1. Keyword Analysis: Identify primary entities (X, Y) and quantifiers (n, m)
2. Query Strategy: Formulate targeted search queries for each entity
3. Data Extraction: Scrape exact quantities specified
4. Validation: Verify results match request parameters
EXAMPLE:
User: "List first 5 startups and 3 investors in AI"
Keywords: ["startups:5", "investors:3", "AI"]
Action: Search "AI startups" β extract 5 instances β Search "AI investors" β extract 3 instances
WORKFLOW:
- Print identified keywords with quantities
- Execute sequential searches per keyword group
- Collect exactly specified instances
- Present structured results
READY FOR QUERY.
'''

# Suffix for the web-scraping agent: constrains the output to a flat pandas
# DataFrame so downstream code can consume it.
# NOTE(review): the ```python fence in the example below is never closed —
# harmless inside a prompt, but confirm this is intended.
SCRAPING_PROMPT_SUFFIX = '''
ROLE: Data Extraction Agent
MISSION: Structure all scraped data as valid pandas DataFrames
OUTPUT REQUIREMENTS:
- Format: pandas DataFrame
- Columns: 1-2 word descriptive names
- Content: Only strings or numerical values (no lists/dicts, no nested structures)
- Validation: Must pass pd.DataFrame access tests
VALIDATION CHECKLIST:
β Each column contains only strings or numerics
β No nested structures (lists/dicts) in cells
β Column names are descriptive and concise
β DataFrame is accessible via standard indexing
β All columns MUST BE OF THE SAME LENGTH
EXAMPLE OUTPUT:
```python
pd.DataFrame({
'Company': ['Startup A', 'Startup B'],
'Funding': [5000000, 7500000],
'Industry': 'Artificial Intelligence'
})
'''
| """The following block is responsible for the logic of the agent and the output that it produces. | |
| ```ask_agent``` function concatenates the dataframes into one and starts an AI agent for working with the concatenated dataframes. It uses the prompts from the previous blocks for its logic. | |
| """ | |
| # Replace this with your actual LLM setup | |
| # Example: | |
| # from langchain_openai import AzureChatOpenAI | |
| # model = AzureChatOpenAI(...) | |
| # --- Agent Logic --- | |
# --- Agent Logic ---
def ask_agent(files, question, history):
    """Concatenate the uploaded CSV files and answer *question* with a
    pandas dataframe agent.

    Args:
        files: Gradio file objects; each must expose a ``.name`` path.
        question: Natural-language question about the data.
        history: Accumulated chat transcript (string held in gr.State).

    Returns:
        A ``(new_history, answer)`` tuple; on failure the first element is
        an error message and the second is an empty string, so the Gradio
        outputs (trace_display, history) stay consistent.
    """
    # Guard the empty-upload case explicitly; pd.concat([]) would otherwise
    # raise a confusing "No objects to concatenate" error.
    if not files:
        return "β Could not read CSVs: no files uploaded", ""
    try:
        dfs = [pd.read_csv(f.name) for f in files]
        # All uploads are merged into a single frame for the agent.
        df = pd.concat(dfs, ignore_index=True)
    except Exception as e:
        return f"β Could not read CSVs: {e}", ""
    try:
        agent = create_pandas_dataframe_agent(
            llm=model,  # the Azure chat model configured above
            df=df,  # a dataframe or list of dataframes
            verbose=True,  # verbose logging aids debugging
            agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,  # ReAct without examples
            allow_dangerous_code=True,  # the agent executes generated Python
            handle_parsing_errors=True,  # retry on malformed LLM output
        )
        full_prompt = CSV_PROMPT_PREFIX + question + CSV_PROMPT_SUFFIX
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):  # capture the agent's trace
            result = agent.invoke(full_prompt)
        trace = buffer.getvalue()  # full reasoning trace (kept for debugging)
        output = result["output"]  # final Markdown answer
        return history + output, output
    except Exception as e:
        return f"β Agent error: {e}", ""
| """The block below deals with creating SQL code. | |
| ```create_db``` creates a database where all the uploaded dataframes are stored for the data agent to work with. | |
| ```start_llm``` starts a tool for working with SQL databases. | |
| ```extract_code``` is used for extracting the SQL query from the agent's output. | |
| ```sql_pipeline``` defines the pipeline, starting from creating a database with the uploaded dataframes, starting the agent for working with databases and creating the query according to the user's question. | |
| """ | |
| #function create_db receives a dictionary with table names as a key and tables as values | |
# create_db receives the uploaded files and loads each CSV into a table of a
# local SQLite database named after the file.
def create_db(files):
    """Build ``database.db`` from the uploaded CSV files.

    Each file becomes one table named after the file's basename (without
    extension); same-named tables are replaced on re-upload.

    Args:
        files: Gradio file objects; each must expose a ``.name`` path.

    Returns:
        A ``SQLDatabase`` wrapper on success, or an error *string* on
        failure — callers must check the return type before use.
    """
    print("="*10+"\nCREATE_DB\n"+"="*10)
    try:
        print("Attempting to create database...")
        engine = create_engine("sqlite:///database.db")
        dataframes = dict()
        print("="*10+f"CREATE_DB:\nfiles:{[f.name for f in files]}\n"+"="*10)
        for f in files:
            # Table name = file name without directory or extension.
            table_name = os.path.splitext(os.path.basename(f.name))[0]
            dataframes[table_name] = pd.read_csv(f.name)
        with engine.begin() as connection:
            # .items() replaces the redundant zip(keys(), values()) idiom.
            for name, table in dataframes.items():
                table.to_sql(name, connection, if_exists="replace", index=False)
        db = SQLDatabase.from_uri("sqlite:///database.db")
        print("DATABASE database.db CREATED")
    except Exception as e:
        return f"Database error: {e}"
    return db
| #Initialization of a LLM model for SQL queries | |
| def start_llm(database): | |
| try: | |
| print("="*10+"\nSTART_LLM\n"+"="*10) | |
| toolkit = SQLDatabaseToolkit(db=database, llm=model) #creates a tool for working with SQL databases | |
| tools = toolkit.get_tools() | |
| except Exception as e: | |
| return f"Couldn't retrieve SQLDatabaseToolkit: {e}" | |
| print("\nSQLDatabaseToolkit CREATED\n") | |
| return model, tools | |
def extract_code(text):
    """Return the portion of *text* starting at the '### Final answer' marker.

    The SQL agent is prompted (via ``sql_suffix_prompt``) to end its reply
    with a ``### Final answer`` section; this extracts that section for
    display.

    Args:
        text: The accumulated agent output. (The old parameter name
        ``HumanMessage`` shadowed the imported LangChain class.)

    Returns:
        The text from the marker onward, or *text* unchanged when the
        marker is absent. (Previously the absent-marker case returned only
        the last character, and exceptions returned ``None``, which crashed
        the caller's string concatenation.)
    """
    print("="*10+"\nEXTRACT_CODE\n"+"="*10)
    marker = '### Final answer'
    p1 = text.find(marker)
    if p1 == -1:
        # Marker missing: fall back to the full output so the caller still
        # receives a usable string.
        return text
    print(p1, text[p1:p1+50])
    return text[p1:]
# Receives dataframes, puts them in a database and uses an AI agent to create
# queries based on the user's question.
def sql_pipeline(tables, question, history):
    """Load the uploaded tables into SQLite and answer *question* with a
    ReAct SQL agent.

    Args:
        tables: Gradio file objects to load into the database.
        question: Natural-language request for a SQL query.
        history: Accumulated chat transcript.

    Returns:
        ``(new_history, final_answer)`` on success, or a single error
        string on failure (Gradio renders it in the trace panel).
    """
    print("="*10+"\nSQL_PIPELINE\n"+"="*10)
    db = create_db(tables)  # load the uploaded files into database.db
    if isinstance(db, str):
        # create_db signals failure by returning an error string; stop here
        # instead of handing the string to the SQL toolkit.
        print(db)
        return db
    if not os.path.exists("database.db"):
        print("Database doesn't exist")
        return "Database doesn't exist"
    llm, tools = start_llm(db)  # agent model + tools for the database
    try:
        agent_executor = create_react_agent(llm, tools, prompt=system_message+sql_suffix_prompt)
        output = ""
        # Stream intermediate steps, accumulating every message's content.
        for step in agent_executor.stream(
            {"messages": [{"role": "user", "content": question}]},
            stream_mode="values",
        ):
            output += step["messages"][-1].content
        final_answer = extract_code(output)
        if final_answer is None:
            # Defensive: keep the concatenation below safe even if
            # extract_code ever returns None.
            final_answer = output
        return history + final_answer, final_answer
    except Exception as e:
        return f"SQL agent error: {e}"
| """THe following block is responsible for creating a smart ETL pipeline""" | |
def preview_data(table: str) -> str:
    """Read the CSV file at path *table* and return its first rows as text.

    The annotation promises ``str`` (agent tools observe strings), but the
    previous version returned a DataFrame; the preview is now rendered
    explicitly with ``DataFrame.to_string()``.
    """
    df = pd.read_csv(table)
    return df.head().to_string()
def suggest_transformation(column_summary: str) -> str:
    """Ask the LLM for short ETL transformation suggestions.

    Args:
        column_summary: Textual description of the dataframe's columns.

    Returns:
        One suggestion per line, stripped of surrounding whitespace.
    """
    prompt = f"""
You are a data engineer assistant. Based on the following column summary, suggest simple, short ETL transformation steps.
Output format: each suggestion on a new line, without explanations or markdown.
Example:
Remove $ from revenue and cast to float
Column summary:
{column_summary}
"""
    # .predict() is deprecated on chat models in LangChain 0.3;
    # invoke() returns a message whose .content holds the text.
    return model.invoke(prompt).content.strip()
def generate_python_code(transform_description: str) -> str:
    """Generate pandas code implementing *transform_description*.

    Args:
        transform_description: Newline-separated ETL transformation steps.

    Returns:
        Raw pandas code (no markdown), stripped of surrounding whitespace.
    """
    prompt = f"""
You are a data engineer. Write pandas code to apply the following ETL transformation to a dataframe called 'df'.
Transformations:
{transform_description}
Only return pandas code. No explanation, no markdown.
"""
    # .predict() is deprecated on chat models in LangChain 0.3;
    # invoke() returns a message whose .content holds the text.
    return model.invoke(prompt).content.strip()
# etl_pipeline drives a zero-shot agent that previews a table, suggests
# transformations, and emits pandas code implementing them.
def etl_pipeline(dataframe, history):
    """Run the smart-ETL agent over the table named *dataframe*.

    Args:
        dataframe: Path/name of the CSV table to push through ETL.
        history: Accumulated chat transcript.

    Returns:
        ``(new_history, generated_code)`` where the code reads, cleans and
        rewrites the table as ``Cleaned_<name>``.
    """
    # Wrap the plain functions with the @tool decorator (imported above but
    # previously unused) so initialize_agent receives proper Tool objects.
    tools = [tool(preview_data), tool(suggest_transformation), tool(generate_python_code)]
    agent = initialize_agent(
        tools,
        model,
        agent='zero-shot-react-description',
        verbose=True,
        # handle_parsing_errors is an initialize_agent option; passing it as
        # a key of the run() payload (as before) silently did nothing.
        handle_parsing_errors=True,
    )
    # Stray ']' after the target table name removed from the prompt.
    input_prompt = f"""
    Preview the table {dataframe} and \
    generate Python code to read the table, clean it, and finally write the \
    dataframe into a table called {'Cleaned_'+dataframe}. \
    Do not stop the Python session
    """
    # Preview + suggest + generate code in a single run
    response = agent.run({
        "input": input_prompt,
        "chat_history": [],
    })
    print("Generated Python Code:\n")
    print(response)
    # Strip the markdown fence the model usually wraps the code in.
    response2 = response.strip('`').replace('python', '')
    return history + response2, response2
| """The following code is responsible for AI web scraping agent""" | |
def web_scraping(question, history):
    """Answer *question* by scraping the web with SmartScraperTool.

    Args:
        question: Natural-language query to resolve online.
        history: Accumulated chat transcript.

    Returns:
        ``(new_history, response)``; on failure both elements carry the
        error message. (The old error path returned a 3-tuple, which
        mismatched the two Gradio outputs this function is wired to.)
    """
    try:
        tools = [
            SmartScraperTool(),
        ]
        agent = initialize_agent(
            tools=tools,
            llm=model,
            agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True
        )
        buffer = io.StringIO()
        print(SCRAPING_PROMPT_PREFIX + question + SCRAPING_PROMPT_SUFFIX)
        with contextlib.redirect_stdout(buffer):  # capture the agent trace
            response = agent.run(SCRAPING_PROMPT_PREFIX + question + SCRAPING_PROMPT_SUFFIX)
        trace = buffer.getvalue()  # reasoning trace (kept for debugging)
        return history + response, response
    except Exception as e:
        return f'Web scraping error {e}', f'Web scraping error {e}'
| """The next section creates a web interface using Gradio, providing a user-friendly way to analyze data and create SQL queries. | |
| ``` | |
| with gr.Blocks( | |
| css=''' | |
| Change the code here to modify the styling of the UI | |
| ''' | |
| ) as demo: | |
| ``` | |
| **Display Area**: | |
| - `result_display`: Markdown report output | |
| - `trace_display`: Agent reasoning trace | |
| **Input Section**: | |
| - `file_input`: Multiple CSV upload | |
| - `question_input`: User query box | |
| **Action Buttons**: | |
| - `sql_button`: Generate SQL queries β `sql_pipeline` function | |
| - `ask_button`: Run analysis β `ask_agent` function | |
| **Styling** | |
| - Light theme with rounded corners | |
| - Custom CSS for professional appearance | |
| **Launch** | |
| `demo.launch(share=True,debug=False)` - Public access enabled, debugging disabled | |
| For debugging use `debug=True` in order to see the messages in the console. | |
| """ | |
# --- Gradio UI ---
# Layout: Markdown report panel on top, two side-by-side trace panels, then
# the CSV upload + question inputs, and three buttons wired to the agent
# entry points defined above. Custom CSS gives a light theme with rounded
# corners; edit the css string to restyle the UI.
with gr.Blocks(
    css="""
    body, .gradio-container {
        background: #ffffff !important;
        color: #1f2937 !important;
        font-family: 'Segoe UI', sans-serif;
    }
    #title {
        color: #1f2937 !important;
        font-size: 2rem;
        font-weight: 600;
        text-align: center;
        padding-top: 20px;
        padding-bottom: 10px;
    }
    .gr-box, .gr-input, .gr-output, .gr-markdown, .gr-textbox, .gr-file, textarea, input {
        background: rgba(0, 0, 0, 0.04) !important;
        border: 1px solid rgba(0, 0, 0, 0.1);
        border-radius: 12px !important;
        color: #1f2937 !important;
    }
    .trace-markdown {
        height: 400px !important;
        overflow-y: scroll;
        resize: none;
    }
    textarea::placeholder, input::placeholder {
        color: rgba(31, 41, 55, 0.6) !important;
    }
    button {
        background: rgba(0, 0, 0, 0.07) !important;
        color: #1f2937 !important;
        border: 1px solid rgba(0, 0, 0, 0.15) !important;
        border-radius: 8px !important;
    }
    button:hover {
        background: rgba(0, 0, 0, 0.15) !important;
    }
    """
) as demo:
    gr.Markdown("<h2 id='title'>π NexDatawork Data Agent</h2>")
    with gr.Column():
        # Final Markdown report produced by the agents.
        result_display = gr.Markdown(label="π Report Output (Markdown)")
        with gr.Row():
            # Side-by-side panels: agent reasoning trace and SQL process.
            trace_display = gr.Markdown(label="π οΈ Data Agent Reasoning - Your Explainable Agent", elem_classes=["trace-markdown"])
            sql_display = gr.Markdown(label='SQL Process')
        with gr.Row(equal_height=True):
            # Inputs: one or more CSV files plus the user's question.
            file_input = gr.File(label="π Upload CSV(s)", file_types=[".csv"], file_count="multiple", height=120)
            question_input = gr.Textbox(label="π¬ Ask Your Agent", placeholder="e.g., What is the trend for revenue over time?", lines=2)
        with gr.Row():
            ask_button = gr.Button("π‘ Analyze")
        with gr.Row():
            sql_button = gr.Button('Create Query')
            scraping_button = gr.Button('Find the answer online')
    # Conversation history string persisted across button clicks.
    history = gr.State(value="")
    # Wire each button to its agent function; all return (trace, history).
    sql_button.click(fn=sql_pipeline, inputs=[file_input, question_input, history], outputs=[trace_display, history])
    scraping_button.click(fn=web_scraping, inputs=[question_input, history], outputs=[trace_display, history])
    ask_button.click(fn=ask_agent, inputs=[file_input, question_input, history], outputs=[trace_display, history])

# share=True serves a public link; set debug=True to see console messages.
demo.launch(share=True, debug=False)