Nikhil-Murade committed on
Commit
3d51c0d
·
verified ·
1 Parent(s): 4d59c5e

Upload 3 files

Browse files
Files changed (3) hide show
  1. data_science_100k.db +0 -0
  2. requirements.txt +82 -0
  3. sql_agent_db.py +166 -0
data_science_100k.db ADDED
Binary file (209 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohappyeyeballs==2.4.0
2
+ aiohttp==3.10.5
3
+ aiosignal==1.3.1
4
+ altair==5.4.1
5
+ annotated-types==0.7.0
6
+ anyio==4.4.0
7
+ async-timeout==4.0.3
8
+ attrs==24.2.0
9
+ blinker==1.8.2
10
+ cachetools==5.5.0
11
+ certifi==2024.8.30
12
+ charset-normalizer==3.3.2
13
+ click==8.1.7
14
+ dataclasses-json==0.6.7
15
+ distro==1.9.0
16
+ dnspython==2.6.1
17
+ exceptiongroup==1.2.2
18
+ frozenlist==1.4.1
19
+ gitdb==4.0.11
20
+ GitPython==3.1.43
21
+ greenlet==3.1.0
22
+ h11==0.14.0
23
+ httpcore==1.0.5
24
+ httpx==0.27.2
25
+ idna==3.10
26
+ Jinja2==3.1.4
27
+ jsonpatch==1.33
28
+ jsonpointer==3.0.0
29
+ jsonschema==4.23.0
30
+ jsonschema-specifications==2023.12.1
31
+ langchain==0.1.6
32
+ langchain-community==0.0.20
33
+ langchain-core==0.1.23
34
+ langchain-experimental==0.0.49
35
+ langchain-openai==0.0.5
36
+ langsmith==0.0.87
37
+ markdown-it-py==3.0.0
38
+ MarkupSafe==2.1.5
39
+ marshmallow==3.22.0
40
+ mdurl==0.1.2
41
+ multidict==6.1.0
42
+ mypy-extensions==1.0.0
43
+ narwhals==1.8.1
44
+ numpy==1.26.4
45
+ openai==1.12.0
46
+ packaging==23.2
47
+ pandas==2.2.2
48
+ pillow==10.4.0
49
+ protobuf==5.28.1
50
+ pyarrow==17.0.0
51
+ pydantic==2.9.1
52
+ pydantic_core==2.23.3
53
+ pydeck==0.9.1
54
+ Pygments==2.18.0
55
+ pymongo==4.8.0
56
+ pyodbc==5.1.0
57
+ python-dateutil==2.9.0.post0
58
+ python-dotenv==1.0.1
59
+ pytz==2024.2
60
+ PyYAML==6.0.2
61
+ referencing==0.35.1
62
+ regex==2024.9.11
63
+ requests==2.32.3
64
+ rich==13.8.1
65
+ rpds-py==0.20.0
66
+ six==1.16.0
67
+ smmap==5.0.1
68
+ sniffio==1.3.1
69
+ SQLAlchemy==2.0.30
70
+ streamlit==1.38.0
71
+ tabulate==0.9.0
72
+ tenacity==8.5.0
73
+ tiktoken==0.5.2
74
+ toml==0.10.2
75
+ tornado==6.4.1
76
+ tqdm==4.66.5
77
+ typing-inspect==0.9.0
78
+ typing_extensions==4.12.2
79
+ tzdata==2024.1
80
+ urllib3==2.2.3
81
+ watchdog==4.0.2
82
+ yarl==1.11.1
sql_agent_db.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from langchain_openai import ChatOpenAI
4
+ import pandas as pd
5
+
6
+ from sqlalchemy import create_engine
7
+
8
+ # load environment variables from .env file
9
+ load_dotenv()
10
+
11
+ openai_key = os.getenv("OPENAI_API_KEY")
12
+
13
+ llm_name = "gpt-3.5-turbo"
14
+ model = ChatOpenAI(api_key=openai_key, model=llm_name)
15
+
16
+ # read csv file
17
+ df = pd.read_csv("Struct Data_Data Science 100K.csv")
18
+
19
+ from langchain.agents import create_sql_agent
20
+ from langchain_community.agent_toolkits.sql.toolkit import SQLDatabaseToolkit
21
+ from langchain_community.utilities import SQLDatabase
22
+
23
+ # Create db from csv file
24
+
25
+ # Path to your SQLite database file
26
+ database_file_path = "./db/data_science_100k.db"
27
+
28
+ # Create an engine to connect to the SQLite database
29
+ # SQLite only requires the path to the database file
30
+ engine = create_engine(f"sqlite:///{database_file_path}")
31
+ file_url = "./ds_salaries.csv"
32
+ os.makedirs(os.path.dirname(database_file_path), exist_ok=True)
33
+
34
+ df = pd.read_csv(file_url)
35
+ df.to_sql("DataScience100k", con=engine, if_exists="replace", index=False)
36
+
37
+ print(f"Database created successfully! {df}")
38
+
39
+
40
+ # db = SQLDatabase.from_uri(f"sqlite:///{database_file_path}")
41
+ # toolkit = SQLDatabaseToolkit(db=db, llm=model)
42
+
43
+ # QUESTION = """How many data scientists are there, what is their avg salary, and how many of them are from the US"""
44
+
45
+ # sql_agent = create_sql_agent(
46
+ # toolkit=toolkit,
47
+ # llm=model,
48
+ # verbose=True
49
+ # )
50
+
51
+ # sql_agent.invoke(QUESTION)
52
+
53
+ # res = sql_agent.invoke(QUESTION)
54
+
55
+ # # print(res)
56
+
57
+ # Part 2 : Prepare the sql prompt
58
+ MSSQL_AGENT_PREFIX = """
59
+ You are an agent designed to interact with a SQL database.
60
+ ## Instructions:
61
+ - Given an input question, create a syntactically correct {dialect} query
62
+ to run, then look at the results of the query and return the answer.
63
+ - Unless the user specifies a specific number of examples they wish to
64
+ obtain, **ALWAYS** limit your query to at most {top_k} results.
65
+ - You can order the results by a relevant column to return the most
66
+ interesting examples in the database.
67
+ - Never query for all the columns from a specific table, only ask for
68
+ the relevant columns given the question.
69
+ - You have access to tools for interacting with the database.
70
+ - You MUST double check your query before executing it. If you get an error
71
+ while executing a query, rewrite the query and try again.
72
+ - DO NOT make any DML or DDL statements (INSERT, UPDATE, DELETE, DROP etc.)
73
+ to the database.
74
+ - DO NOT MAKE UP AN ANSWER OR USE PRIOR KNOWLEDGE, ONLY USE THE RESULTS
75
+ OF THE CALCULATIONS YOU HAVE DONE.
76
+ - Your response should be in Markdown. However, **when running a SQL Query
77
+ in "Action Input", do not include the markdown backticks**.
78
+ Those are only for formatting the response, not for executing the command.
79
+ - ALWAYS, as part of your final answer, explain how you got to the answer
80
+ on a section that starts with: "Explanation:". Include the SQL query as
81
+ part of the explanation section.
82
+ - If the question does not seem related to the database, just return
83
+ "I don\'t know" as the answer.
84
+ - Only use the below tools. Only use the information returned by the
85
+ below tools to construct your query and final answer.
86
+ - Do not make up table names, only use the tables returned by any of the
87
+ tools below.
88
+ - as part of your final answer, please include the SQL query you used in json format or code format
89
+
90
+ ## Tools:
91
+
92
+ """
93
+
94
+ MSSQL_AGENT_FORMAT_INSTRUCTIONS = """
95
+
96
+ ## Use the following format:
97
+
98
+ Question: the input question you must answer.
99
+ Thought: you should always think about what to do.
100
+ Action: the action to take, should be one of [{tool_names}].
101
+ Action Input: the input to the action.
102
+ Observation: the result of the action.
103
+ ... (this Thought/Action/Action Input/Observation can repeat N times)
104
+ Thought: I now know the final answer.
105
+ Final Answer: the final answer to the original input question.
106
+
107
+ Example of Final Answer:
108
+ <=== Beginning of example
109
+
110
+ Action: sql_db_query
111
+ Action Input:
112
+ SELECT TOP (10) [base_salary], [grade]
113
+ FROM salaries_2023
114
+
115
+ WHERE state = 'Division'
116
+
117
+ Observation:
118
+ [(27437.0,), (27088.0,), (26762.0,), (26521.0,), (26472.0,), (26421.0,), (26408.0,)]
119
+ Thought:I now know the final answer
120
+ Final Answer: There were 27437 workers making 100,000.
121
+
122
+ Explanation:
123
+ I queried the `xyz` table for the `salary` column where the department
124
+ is 'IGM' and the date starts with '2020'. The query returned a list of tuples
125
+ with the base salary for each day in 2020. To answer the question,
126
+ I took the sum of all the salaries in the list, which is 27437.
127
+ I used the following query
128
+
129
+ ```sql
130
+ SELECT [salary] FROM xyztable WHERE department = 'IGM' AND date LIKE '2020%'
131
+ ```
132
+ ===> End of Example
133
+
134
+ """
135
+
136
+ db = SQLDatabase.from_uri(f"sqlite:///{database_file_path}")
137
+ toolkit = SQLDatabaseToolkit(db=db, llm=model)
138
+
139
+ # QUESTION = """How many data scientists are there, what is their avg salary, and how many of them are from the US"""
140
+
141
+ sql_agent = create_sql_agent(
142
+ prefix=MSSQL_AGENT_PREFIX,
143
+ format_instructions=MSSQL_AGENT_FORMAT_INSTRUCTIONS,
144
+ toolkit=toolkit,
145
+ llm=model,
146
+ top_k=30,
147
+ verbose=True
148
+ )
149
+
150
+ # res = sql_agent.invoke(QUESTION)
151
+
152
+
153
+ import streamlit as st
154
+
155
+ st.title("SQL Query AI Agent")
156
+
157
+ question = st.text_input("Enter your query:")
158
+
159
+ if st.button("Run Query"):
160
+ if question:
161
+ res = sql_agent.invoke(question)
162
+
163
+ st.markdown(res["output"])
164
+
165
+ else:
166
+ st.error("Please Enter a Query.")