## 1. UI/UX Enhancements

#21
by bvproperty - opened
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ samples/online_retail_data.csv filter=lfs diff=lfs merge=lfs -text
.gitignore DELETED
@@ -1,4 +0,0 @@
1
- __pycache__/
2
- .gradio/
3
- .env
4
- temp/
 
 
 
 
 
README.md CHANGED
@@ -4,10 +4,10 @@ emoji: 📈
4
  colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.29.0
8
  app_file: app.py
9
  pinned: true
10
- short_description: Queries, visualizations, stat analysis on your data
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.23.3
8
  app_file: app.py
9
  pinned: true
10
+ short_description: Queries, visualizations, stat analysis on data files
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,192 +1,121 @@
1
- from utils import TEMP_DIR, message_dict, api_key_store, model_store
2
- import gradio as gr
3
- import templates.data_file as data_file, templates.sql_db as sql_db, templates.doc_db as doc_db, templates.graphql as graphql
4
-
5
- import os
6
- from dotenv import load_dotenv
7
-
8
- load_dotenv()
9
-
10
- def delete_db(req: gr.Request):
11
- import shutil
12
- dir_path = TEMP_DIR / str(req.session_hash)
13
- if os.path.exists(dir_path):
14
- shutil.rmtree(dir_path)
15
- message_dict[req.session_hash] = {}
16
- api_key_store.pop(req.session_hash, None)
17
- model_store.pop(req.session_hash, None)
18
-
19
- def set_api_key(api_key, model, request: gr.Request):
20
- api_key = api_key.strip()
21
- if not api_key:
22
- return (
23
- gr.update(visible=True),
24
- gr.update(visible=True, value="<p style='color:#b91c1c;text-align:center;margin:6px 0;font-size:14px;'>Please enter your API key.</p>"),
25
- gr.update(visible=False),
26
- )
27
- api_key_store[request.session_hash] = api_key
28
- model_store[request.session_hash] = model
29
- provider = "Anthropic" if api_key.startswith("sk-ant-") else "OpenAI"
30
- provider_icon = "fa-a" if provider == "Anthropic" else "fa-o"
31
- badge_html = f"""
32
- <div style="display:flex;flex-direction:column;align-items:center;gap:6px;padding:10px 0 4px;">
33
- <div style="display:inline-flex;align-items:center;gap:10px;background:#f0fdf4;border:1px solid #86efac;
34
- padding:8px 20px;border-radius:9999px;font-size:13px;font-weight:500;color:#15803d;
35
- box-shadow:0 1px 3px rgba(0,0,0,0.06);">
36
- <i class="fas fa-circle-check" style="font-size:14px;"></i>
37
- <span>{provider}</span>
38
- <span style="color:#86efac;">·</span>
39
- <span style="font-weight:600;">{model}</span>
40
- </div>
41
- <p style="margin:0;font-size:11px;color:#9ca3af;letter-spacing:0.02em;">
42
- Session active use the button below to change
43
- </p>
44
- </div>
45
- """
46
- return gr.update(visible=False), gr.update(visible=True, value=badge_html), gr.update(visible=True)
47
-
48
- def show_api_form():
49
- return gr.update(visible=True), gr.update(visible=False, value=""), gr.update(visible=False)
50
-
51
- css = ".file_marker .large{min-height:50px !important;} .padding{padding:0;} .description_component{overflow:visible !important;}"
52
- head = """<meta charset="UTF-8">
53
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
54
- <title>Virtual Data Analyst</title>
55
- <!-- Tailwind CSS -->
56
- <script src="https://cdn.tailwindcss.com"></script>
57
- <!-- Google Fonts -->
58
- <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
59
- <!-- Font Awesome -->
60
- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
61
- <!-- Custom Styles -->
62
- <link rel="stylesheet" href="/gradio_api/file=assets/styles.css">
63
- """
64
-
65
- theme = gr.themes.Base(primary_hue="sky", secondary_hue="slate", font=[gr.themes.GoogleFont("Inter"), "Inter", "sans-serif"]).set(
66
- button_primary_background_fill="#3B82F6",
67
- button_secondary_background_fill="#6B7280",
68
- )
69
-
70
- from pathlib import Path
71
- gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
72
-
73
- _env_api_key = os.getenv("OPENAI_API_KEY", "")
74
-
75
- OPENAI_MODELS = [
76
- "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
77
- "gpt-4o", "gpt-4o-mini",
78
- "o3-mini", "o4-mini",
79
- "gpt-5.4-mini", "gpt-5.4", "gpt-5.5",
80
- ]
81
- ANTHROPIC_MODELS = [
82
- "claude-sonnet-4-6",
83
- "claude-opus-4-8",
84
- "claude-haiku-4-5-20251001",
85
- ]
86
-
87
- def update_models(api_key):
88
- if api_key.strip().startswith("sk-ant-"):
89
- return gr.update(choices=ANTHROPIC_MODELS, value=ANTHROPIC_MODELS[0])
90
- return gr.update(choices=OPENAI_MODELS, value=OPENAI_MODELS[0])
91
-
92
- with gr.Blocks(theme=theme, css=css, head=head, delete_cache=(3600, 3600)) as demo:
93
-
94
- with gr.Column(visible=True) as api_key_section:
95
- gr.HTML("""
96
- <div style="max-width:640px;margin:28px auto 12px;padding:22px 28px;
97
- background:linear-gradient(135deg,#eff6ff 0%,#e0f2fe 100%);
98
- border:1px solid #bfdbfe;border-radius:14px;
99
- box-shadow:0 2px 8px rgba(59,130,246,0.08);">
100
- <div style="display:flex;align-items:flex-start;gap:16px;">
101
- <div style="width:42px;height:42px;flex-shrink:0;background:#3B82F6;
102
- border-radius:10px;display:flex;align-items:center;
103
- justify-content:center;box-shadow:0 2px 6px rgba(59,130,246,0.35);">
104
- <i class="fas fa-key" style="color:white;font-size:16px;"></i>
105
- </div>
106
- <div>
107
- <h3 style="color:#1e40af;margin:0 0 6px;font-size:16px;font-weight:700;letter-spacing:-0.01em;">
108
- Get Started
109
- </h3>
110
- <p style="color:#3730a3;font-size:13.5px;margin:0;line-height:1.6;">
111
- Enter your <strong>OpenAI</strong>
112
- (<code style="background:rgba(255,255,255,0.7);padding:1px 6px;border-radius:4px;font-size:12px;">sk-...</code>)
113
- or <strong>Anthropic</strong>
114
- (<code style="background:rgba(255,255,255,0.7);padding:1px 6px;border-radius:4px;font-size:12px;">sk-ant-...</code>)
115
- API key. The model list updates automatically. Your key is held in memory only
116
- and cleared when you leave — never saved or shared.
117
- </p>
118
- </div>
119
- </div>
120
- </div>
121
- """)
122
- with gr.Row(equal_height=True):
123
- api_key_input = gr.Textbox(
124
- label="API Key",
125
- placeholder="sk-proj-... or sk-ant-api03-...",
126
- type="password",
127
- value=_env_api_key,
128
- scale=4,
129
- )
130
- model_dropdown = gr.Dropdown(
131
- label="Model",
132
- choices=OPENAI_MODELS,
133
- value=OPENAI_MODELS[0],
134
- scale=2,
135
- )
136
- api_key_btn = gr.Button("Set API Key", variant="primary", scale=1, min_width=120)
137
-
138
- api_key_status = gr.HTML("", visible=False)
139
- change_key_btn = gr.Button("🔑 Change Key / Model", variant="secondary", visible=False, size="sm")
140
-
141
- api_key_input.change(fn=update_models, inputs=api_key_input, outputs=model_dropdown)
142
- api_key_btn.click(
143
- fn=set_api_key,
144
- inputs=[api_key_input, model_dropdown],
145
- outputs=[api_key_section, api_key_status, change_key_btn],
146
- )
147
- change_key_btn.click(fn=show_api_form, outputs=[api_key_section, api_key_status, change_key_btn])
148
-
149
- header = gr.HTML("""
150
- <header class="max-w-4xl mx-auto mb-12 text-center">
151
- <h1 class="text-4xl font-bold text-gray-900 mb-4">Virtual Data Analyst</h1>
152
- <p class="text-lg text-gray-600 mb-6">
153
- A powerful tool for data analysis, visualizations, and insights
154
- </p>
155
- </header>
156
- <main class="max-w-4xl mx-auto">
157
- <div class="mt-12 grid md:grid-cols-3 gap-6" style="margin-bottom:3px !important;">
158
- <div class="feature-card bg-white p-6 rounded-lg shadow-md">
159
- <i class="feature-icon fas fa-chart-line text-primary text-2xl mb-4"></i>
160
- <h3 class="font-semibold text-gray-800 mb-2">Advanced Analytics</h3>
161
- <p class="text-gray-600 text-sm">Run SQL queries, perform regressions, and analyze results with ease</p>
162
- </div>
163
- <div class="feature-card bg-white p-6 rounded-lg shadow-md">
164
- <i class="feature-icon fas fa-chart-pie text-primary text-2xl mb-4"></i>
165
- <h3 class="font-semibold text-gray-800 mb-2">Rich Visualizations</h3>
166
- <p class="text-gray-600 text-sm">Create scatter plots, line charts, pie charts, and more</p>
167
- </div>
168
- <div class="feature-card bg-white p-6 rounded-lg shadow-md">
169
- <i class="feature-icon fas fa-magic text-primary text-2xl mb-4"></i>
170
- <h3 class="font-semibold text-gray-800 mb-2">Automated Insights</h3>
171
- <p class="text-gray-600 text-sm">Get instant insights and recommendations for your data</p>
172
- </div>
173
- </div>
174
- </main>""")
175
-
176
- with gr.Tab("📄 Data File"):
177
- data_file.demo.render()
178
- with gr.Tab("🗄 SQL Database"):
179
- sql_db.demo.render()
180
- with gr.Tab("🍃 MongoDB"):
181
- doc_db.demo.render()
182
- with gr.Tab("⚡ GraphQL API"):
183
- graphql.demo.render()
184
-
185
- footer = gr.HTML("""
186
- <footer class="max-w-4xl mx-auto mt-12 text-center text-gray-500 text-sm">
187
- <p>This application is under active development. For bugs or feedback, please open a discussion in the community tab.</p>
188
- </footer>""")
189
-
190
- demo.unload(delete_db)
191
-
192
- demo.launch(debug=True, allowed_paths=["temp/", "assets/"])
 
1
+ from data_sources import process_data_upload
2
+ from functions import example_question_generator, chatbot_with_fc
3
+ from utils import TEMP_DIR, message_dict
4
+ import gradio as gr
5
+
6
+ import ast
7
+ import os
8
+ from getpass import getpass
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+
13
+ if "OPENAI_API_KEY" not in os.environ:
14
+ os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
15
+
16
+ def delete_db(req: gr.Request):
17
+ import shutil
18
+ dir_path = TEMP_DIR / str(req.session_hash)
19
+ if os.path.exists(dir_path):
20
+ shutil.rmtree(dir_path)
21
+ message_dict[req.session_hash] = None
22
+
23
+ def run_example(input):
24
+ return input
25
+
26
+ def example_display(input):
27
+ if input == None:
28
+ display = True
29
+ else:
30
+ display = False
31
+ return [gr.update(visible=display),gr.update(visible=display)]
32
+
33
+ css= ".file_marker .large{min-height:50px !important;} .example_btn{max-width:300px;} .padding{padding:0;}"
34
+
35
+ with gr.Blocks(css=css, delete_cache=(3600,3600)) as demo:
36
+ title = gr.HTML("<h1 style='text-align:center;'>Virtual Data Analyst</h1>")
37
+ description = gr.HTML("""<p style='text-align:center;'>A helpful tool for data analysis, visualizations, regressions, and more.
38
+ Upload a data file and chat with our virtual data analyst to get insights on your data set.
39
+ Try a sample file to get started!</p>
40
+ <ul style="margin:auto;max-width: 500px;">
41
+ <li style="margin:0;line-height:1;">Currently accepts CSV, TSV, TXT, XLS, XLSX, XML, and JSON files.</li>
42
+ <li style="margin:0;line-height:1;">Can run SQL queries, linear regressions, and analyze the results.</li>
43
+ <li style="margin:0;line-height:1;">Can generate scatter plots, line charts, pie charts, bar graphs, histograms, time series, and more.
44
+ New visualizations types added regularly.</li>
45
+ </ul>
46
+ <p style='text-align:center;'>This application is under active development. If you experience bugs with use,
47
+ open a discussion in the community tab and I will respond.</p>""")
48
+ example_file_1 = gr.File(visible=False, value="samples/bank_marketing_campaign.csv")
49
+ example_file_2 = gr.File(visible=False, value="samples/online_retail_data.csv")
50
+ with gr.Row():
51
+ example_btn_1 = gr.Button(value="Try Me: bank_marketing_campaign.csv", elem_classes="example_btn", size="md", variant="primary")
52
+ example_btn_2 = gr.Button(value="Try Me: online_retail_data.csv", elem_classes="example_btn", size="md", variant="primary")
53
+
54
+ file_output = gr.File(label="Data File (CSV, TSV, TXT, XLS, XLSX, XML, JSON)", show_label=True, elem_classes="file_marker", file_types=['.csv','.xlsx','.txt','.json','.ndjson','.xml','.xls','.tsv'])
55
+ example_btn_1.click(fn=run_example, inputs=example_file_1, outputs=file_output)
56
+ example_btn_2.click(fn=run_example, inputs=example_file_2, outputs=file_output)
57
+ file_output.change(fn=example_display, inputs=file_output, outputs=[example_btn_1, example_btn_2])
58
+
59
+ @gr.render(inputs=file_output)
60
+ def data_options(filename, request: gr.Request):
61
+ print(filename)
62
+ message_dict[request.session_hash] = None
63
+ if filename:
64
+ process_message = process_upload(filename, request.session_hash)
65
+ gr.HTML(value=process_message[1], padding=False)
66
+ if process_message[0] == "success":
67
+ if "bank_marketing_campaign" in filename:
68
+ example_questions = [
69
+ ["Describe the dataset"],
70
+ ["What levels of education have the highest and lowest average balance?"],
71
+ ["What job is most and least common for a yes response from the individuals, not counting 'unknown'?"],
72
+ ["Can you generate a bar chart of education vs. average balance?"],
73
+ ["Can you generate a table of levels of education versus average balance, percent married, percent with a loan, and percent in default?"],
74
+ ["Can we predict the relationship between the number of contacts performed before this campaign and the average balance?"],
75
+ ["Can you plot the number of contacts performed before this campaign versus the duration and use balance as the size in a bubble chart?"]
76
+ ]
77
+ elif "online_retail_data" in filename:
78
+ example_questions = [
79
+ ["Describe the dataset"],
80
+ ["What month had the highest revenue?"],
81
+ ["Is revenue higher in the morning or afternoon?"],
82
+ ["Can you generate a line graph of revenue per month?"],
83
+ ["Can you generate a table of revenue per month?"],
84
+ ["Can we predict how time of day affects transaction value in this data set?"],
85
+ ["Can you plot revenue per month with size being the number of units sold that month in a bubble chart?"]
86
+ ]
87
+ else:
88
+ try:
89
+ generated_examples = ast.literal_eval(example_question_generator(request.session_hash))
90
+ example_questions = [
91
+ ["Describe the dataset"]
92
+ ]
93
+ for example in generated_examples:
94
+ example_questions.append([example])
95
+ except:
96
+ example_questions = [
97
+ ["Describe the dataset"],
98
+ ["List the columns in the dataset"],
99
+ ["What could this data be used for?"],
100
+ ]
101
+ parameters = gr.Textbox(visible=False, value=request.session_hash)
102
+ bot = gr.Chatbot(type='messages', label="CSV Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
103
+ chat = gr.ChatInterface(
104
+ fn=chatbot_with_fc,
105
+ type='messages',
106
+ chatbot=bot,
107
+ title="Chat with your data file",
108
+ concurrency_limit=None,
109
+ examples=example_questions,
110
+ additional_inputs=parameters
111
+ )
112
+
113
+ def process_upload(upload_value, session_hash):
114
+ if upload_value:
115
+ process_message = process_data_upload(upload_value, session_hash)
116
+ return process_message
117
+
118
+ demo.unload(delete_db)
119
+
120
+ ## Uncomment the line below to launch the chat app with UI
121
+ demo.launch(debug=True, allowed_paths=["temp/"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_sources/__init__.py CHANGED
@@ -1,6 +1,3 @@
1
  from .upload_file import process_data_upload
2
- from .connect_sql_db import connect_sql_db
3
- from .connect_doc_db import connect_doc_db
4
- from .connect_graphql import connect_graphql
5
 
6
- __all__ = ["process_data_upload","connect_sql_db","connect_doc_db","connect_graphql"]
 
1
  from .upload_file import process_data_upload
 
 
 
2
 
3
+ __all__ = ["process_data_upload"]
data_sources/connect_doc_db.py DELETED
@@ -1,36 +0,0 @@
1
- from pymongo import MongoClient
2
- import os
3
- from utils import TEMP_DIR
4
- from pymongo_schema.extract import extract_pymongo_client_schema
5
-
6
- def connect_doc_db(connection_string, nosql_db_name, session_hash):
7
- try:
8
- # Create a MongoClient object
9
- client = MongoClient(connection_string)
10
- print("Connected to NoSQL Mongo DB")
11
-
12
- # Access a database
13
- db = client[nosql_db_name]
14
-
15
- collection_names = db.list_collection_names()
16
-
17
- print(collection_names)
18
-
19
- schema = extract_pymongo_client_schema(client)
20
-
21
- # Close the connection
22
- if 'client' in locals() and client:
23
- client.close()
24
- print("MongoDB Connection closed.")
25
-
26
- session_path = 'doc_db'
27
-
28
- dir_path = TEMP_DIR / str(session_hash) / str(session_path)
29
- os.makedirs(dir_path, exist_ok=True)
30
-
31
- return ["success","<p style='color:green;text-align:center;font-size:18px;'>Document database connected successful</p>", collection_names, schema]
32
- except Exception as e:
33
- print("DocDB CONNECTION ERROR")
34
- print(e)
35
- return ["error",f"<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: {e}</p>"]
36
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_sources/connect_graphql.py DELETED
@@ -1,148 +0,0 @@
1
- import requests
2
- import certifi
3
- import os
4
- import json
5
- from utils import TEMP_DIR
6
-
7
- def connect_graphql(graphql_url, api_token, graphql_token_header, session_hash):
8
- try:
9
- # Create the GraphQL Introspection Query
10
- query = """
11
- query IntrospectionQuery {
12
- __schema {
13
- queryType { name }
14
- mutationType { name }
15
- subscriptionType { name }
16
- types {
17
- ...FullType
18
- }
19
- directives {
20
- name
21
- description
22
- locations
23
- args {
24
- ...InputValue
25
- }
26
- }
27
- }
28
- }
29
- fragment FullType on __Type {
30
- kind
31
- name
32
- description
33
- fields(includeDeprecated: true) {
34
- name
35
- description
36
- args {
37
- ...InputValue
38
- }
39
- type {
40
- ...TypeRef
41
- }
42
- isDeprecated
43
- deprecationReason
44
- }
45
- inputFields {
46
- ...InputValue
47
- }
48
- interfaces {
49
- ...TypeRef
50
- }
51
- enumValues(includeDeprecated: true) {
52
- name
53
- description
54
- isDeprecated
55
- deprecationReason
56
- }
57
- possibleTypes {
58
- ...TypeRef
59
- }
60
- }
61
- fragment InputValue on __InputValue {
62
- name
63
- description
64
- type { ...TypeRef }
65
- defaultValue
66
- }
67
- fragment TypeRef on __Type {
68
- kind
69
- name
70
- ofType {
71
- kind
72
- name
73
- ofType {
74
- kind
75
- name
76
- ofType {
77
- kind
78
- name
79
- ofType {
80
- kind
81
- name
82
- ofType {
83
- kind
84
- name
85
- ofType {
86
- kind
87
- name
88
- ofType {
89
- kind
90
- name
91
- }
92
- }
93
- }
94
- }
95
- }
96
- }
97
- }
98
- }
99
- """
100
- print("Connecting to GraphQL Endpoint")
101
-
102
- # Access a database
103
- headers = {"Content-Type": "application/json"}
104
- if graphql_token_header and api_token:
105
- headers[graphql_token_header] = api_token
106
- response = requests.post(graphql_url, headers=headers, json={"query": query},
107
- verify=certifi.where())
108
- response.raise_for_status()
109
-
110
- introspection_result = response.json()
111
-
112
- client_schema = introspection_result["data"]["__schema"]
113
-
114
- #Generate the list of types
115
- type_names_query = """
116
- query IntrospectionQuery {
117
- __schema {
118
- types {
119
- name
120
- }
121
- }
122
- }
123
- """
124
- types_response = requests.post(graphql_url, headers=headers, json={"query": type_names_query},
125
- verify=certifi.where())
126
-
127
- types_response_results =types_response.json()
128
-
129
- types_names = types_response_results["data"]
130
-
131
- type_names = []
132
- for name in types_names["__schema"]["types"]:
133
- type_names.append(name["name"])
134
-
135
- session_path = 'graphql'
136
-
137
- dir_path = TEMP_DIR / str(session_hash) / str(session_path)
138
- os.makedirs(dir_path, exist_ok=True)
139
-
140
- with open(f'{dir_path}/schema.json', 'w') as fp:
141
- json.dump(client_schema, fp, indent=2)
142
-
143
- return ["success","<p style='color:green;text-align:center;font-size:18px;'>GraphQL API connected successful</p>", type_names]
144
- except Exception as e:
145
- print("GraphQL CONNECTION ERROR")
146
- print(e)
147
- return ["error",f"<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: {e}</p>"]
148
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_sources/connect_sql_db.py DELETED
@@ -1,42 +0,0 @@
1
- import psycopg2
2
- import os
3
- from utils import TEMP_DIR
4
-
5
- def connect_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash):
6
- try:
7
- conn = psycopg2.connect(
8
- database=sql_db_name,
9
- user=sql_user,
10
- password=sql_pass,
11
- host=url, # e.g., "localhost" or an IP address
12
- port=sql_port # default is 5432
13
- )
14
- print("Connected to PostgreSQL")
15
-
16
- # Create a cursor object to execute SQL queries
17
- cur = conn.cursor()
18
- # Example: Execute a query
19
- cur.execute("""SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'""")
20
- table_tuples = cur.fetchall()
21
- table_names = []
22
- for table in table_tuples:
23
- table_names.append(table[0])
24
-
25
- print(table_names)
26
-
27
- # Close the cursor and connection
28
- cur.close()
29
- conn.close()
30
- print("Connection closed.")
31
-
32
- session_path = 'sql'
33
-
34
- dir_path = TEMP_DIR / str(session_hash) / str(session_path)
35
- os.makedirs(dir_path, exist_ok=True)
36
-
37
- return ["success","<p style='color:green;text-align:center;font-size:18px;'>SQL database connected successful</p>", table_names]
38
- except Exception as e:
39
- print("SQL DB CONNECTION ERROR")
40
- print(e)
41
- return ["error",f"<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: {e}</p>"]
42
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_sources/upload_file.py CHANGED
@@ -65,102 +65,28 @@ def process_data_upload(data_file, session_hash):
65
 
66
  for column in df.columns:
67
  if type(column) is str:
68
- if "date" in column.lower() or "time" in column.lower():
 
69
  try:
70
- df[column] = pd.to_datetime(df[column])
71
- except:
72
- pass
73
- if 'year' in column.lower():
74
- try:
75
- df[column] = pd.to_datetime(df[column], format='%Y')
76
  except:
77
  pass
78
  if df[column].dtype == 'object' and isinstance(df[column].iloc[0], list):
79
  df[column] = df[column].explode()
80
 
81
- session_path = 'file_upload'
82
-
83
- dir_path = TEMP_DIR / str(session_hash) / str(session_path)
84
  os.makedirs(dir_path, exist_ok=True)
85
 
86
  connection = sqlite3.connect(f'{dir_path}/data_source.db')
87
- print("Opened database successfully")
 
88
 
89
  df.to_sql('data_source', connection, if_exists='replace', index = False)
90
-
91
- cur=connection.execute('select * from data_source')
92
- columns = [i[0] for i in cur.description]
93
- print(columns)
94
 
95
  connection.commit()
96
  connection.close()
97
 
98
- missing_per_col = {col: int(df[col].isnull().sum()) for col in df.columns}
99
- total_missing = sum(missing_per_col.values())
100
-
101
- def _simplify_dtype(d):
102
- s = str(d)
103
- if 'int' in s: return 'Integer'
104
- if 'float' in s: return 'Float'
105
- if 'datetime' in s: return 'DateTime'
106
- if 'bool' in s: return 'Boolean'
107
- return 'Text'
108
-
109
- dtypes = {col: _simplify_dtype(df[col].dtype) for col in df.columns}
110
-
111
- preview = []
112
- for _, row in df.head(5).iterrows():
113
- row_vals = []
114
- for v in row:
115
- try:
116
- row_vals.append('' if pd.isna(v) else str(v)[:60])
117
- except Exception:
118
- row_vals.append(str(v)[:60])
119
- preview.append(row_vals)
120
-
121
- duplicate_rows = int(df.duplicated().sum())
122
- unique_counts = {col: int(df[col].nunique()) for col in df.columns}
123
-
124
- col_stats = {}
125
- for col in df.columns:
126
- dtype_str = str(df[col].dtype)
127
- try:
128
- if 'int' in dtype_str or 'float' in dtype_str:
129
- col_stats[col] = {
130
- 'type': 'numeric',
131
- 'min': float(df[col].min()),
132
- 'max': float(df[col].max()),
133
- 'mean': float(df[col].mean()),
134
- }
135
- elif 'datetime' in dtype_str:
136
- col_stats[col] = {
137
- 'type': 'datetime',
138
- 'min': str(df[col].min())[:10],
139
- 'max': str(df[col].max())[:10],
140
- }
141
- except Exception:
142
- pass
143
-
144
- try:
145
- file_size_bytes = os.path.getsize(data_file)
146
- except Exception:
147
- file_size_bytes = 0
148
-
149
- stats = {
150
- 'num_rows': len(df),
151
- 'num_cols': len(df.columns),
152
- 'total_missing': total_missing,
153
- 'missing_per_col': missing_per_col,
154
- 'dtypes': dtypes,
155
- 'preview_cols': list(df.columns),
156
- 'preview': preview,
157
- 'duplicate_rows': duplicate_rows,
158
- 'unique_counts': unique_counts,
159
- 'col_stats': col_stats,
160
- 'file_size_bytes': file_size_bytes,
161
- }
162
-
163
- return ["success","<p style='color:green;text-align:center;font-size:18px;'>Data upload successful</p>", columns, stats]
164
  except Exception as e:
165
  print("UPLOAD ERROR")
166
  print(e)
 
65
 
66
  for column in df.columns:
67
  if type(column) is str:
68
+ pattern = 'year|month|date|day|time'
69
+ if re.search(pattern, column.lower()):
70
  try:
71
+ df[column] = pd.to_datetime(df[column], infer_datetime_format=True)
 
 
 
 
 
72
  except:
73
  pass
74
  if df[column].dtype == 'object' and isinstance(df[column].iloc[0], list):
75
  df[column] = df[column].explode()
76
 
77
+ dir_path = TEMP_DIR / str(session_hash)
 
 
78
  os.makedirs(dir_path, exist_ok=True)
79
 
80
  connection = sqlite3.connect(f'{dir_path}/data_source.db')
81
+ print("Opened database successfully");
82
+ print(df.columns)
83
 
84
  df.to_sql('data_source', connection, if_exists='replace', index = False)
 
 
 
 
85
 
86
  connection.commit()
87
  connection.close()
88
 
89
+ return ["success","<p style='color:green;text-align:center;font-size:18px;'>Data upload successful</p>"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  except Exception as e:
91
  print("UPLOAD ERROR")
92
  print(e)
functions/__init__.py CHANGED
@@ -1,17 +1,9 @@
1
- from .query_functions import graphql_schema_query, graphql_csv_query, query_func
2
  from .chart_functions import table_generation_func, scatter_chart_generation_func, \
3
- line_chart_generation_func, bar_chart_generation_func, pie_chart_generation_func, \
4
- histogram_generation_func, box_chart_generation_func, correlation_heatmap_func, \
5
- scatter_chart_fig, rolling_stats_func
6
- from .chat_functions import example_question_generator, chatbot_func
7
- from .stat_functions import regression_func, descriptive_stats_func, \
8
- kmeans_clustering_func, hypothesis_test_func
9
 
10
- __all__ = [
11
- "query_func", "graphql_schema_query", "graphql_csv_query",
12
- "table_generation_func", "scatter_chart_generation_func", "line_chart_generation_func",
13
- "bar_chart_generation_func", "pie_chart_generation_func", "histogram_generation_func",
14
- "box_chart_generation_func", "correlation_heatmap_func", "rolling_stats_func",
15
- "regression_func", "descriptive_stats_func", "kmeans_clustering_func", "hypothesis_test_func",
16
- "scatter_chart_fig", "example_question_generator", "chatbot_func",
17
- ]
 
1
+ from .sqlite_functions import SQLiteQuery, sqlite_query_func
2
  from .chart_functions import table_generation_func, scatter_chart_generation_func, \
3
+ line_chart_generation_func, bar_chart_generation_func, pie_chart_generation_func, histogram_generation_func, scatter_chart_fig
4
+ from .chat_functions import example_question_generator, chatbot_with_fc
5
+ from .stat_functions import regression_func
 
 
 
6
 
7
+ __all__ = ["SQLiteQuery","sqlite_query_func","table_generation_func","scatter_chart_generation_func",
8
+ "line_chart_generation_func","bar_chart_generation_func","regression_func", "pie_chart_generation_func", "histogram_generation_func",
9
+ "scatter_chart_fig","example_question_generator","chatbot_with_fc"]
 
 
 
 
 
functions/chart_functions.py CHANGED
@@ -1,28 +1,17 @@
1
  from typing import List
 
2
  import plotly.io as pio
3
  import plotly.express as px
4
  import pandas as pd
5
  from utils import TEMP_DIR
6
  import os
7
  import ast
 
8
  from dotenv import load_dotenv
9
 
10
  load_dotenv()
11
 
12
- root_url = os.getenv("ROOT_URL", "")
13
-
14
-
15
- def _write_chart(fig, chart_path, chart_url):
16
- """Write a Plotly figure to disk and return a responsive iframe HTML string."""
17
- pio.write_html(fig, chart_path, full_html=False, config={"responsive": True})
18
- return (
19
- 'Please display this iframe: '
20
- '<div style="width:100%;overflow-x:auto;">'
21
- '<iframe style="width:100%;min-width:400px;" height="500" '
22
- f'src="{chart_url}" frameborder="0" allowfullscreen>'
23
- '</iframe></div>'
24
- )
25
-
26
 
27
  def llm_chart_data_scrub(data, layout):
28
  #Processing data to account for variation from LLM
@@ -103,11 +92,11 @@ def scatter_chart_fig(df, x_column: List[str], y_column: str, category: str="",
103
 
104
  return fig
105
 
106
- def scatter_chart_generation_func(x_column: List[str], y_column: str, session_hash, session_folder, data: List[dict]=[{}], layout: List[dict]=[{}],
107
  category: str="", trendline: str="", trendline_options: List[dict]=[{}], marginal_x: str="", marginal_y: str="",
108
- size: str="", **kwargs):
109
  try:
110
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
111
  chart_path = f'{dir_path}/chart.html'
112
  csv_query_path = f'{dir_path}/query.csv'
113
 
@@ -138,8 +127,13 @@ def scatter_chart_generation_func(x_column: List[str], y_column: str, session_ha
138
  for data_item in fig["data"]:
139
  data_item[key] = value
140
 
141
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
142
- return {"reply": _write_chart(fig, chart_path, chart_url)}
 
 
 
 
 
143
 
144
  except Exception as e:
145
  print("SCATTER PLOT ERROR")
@@ -150,10 +144,10 @@ def scatter_chart_generation_func(x_column: List[str], y_column: str, session_ha
150
  """
151
  return {"reply": reply}
152
 
153
- def line_chart_generation_func(x_column: str, y_column: str, session_hash, session_folder, data: List[dict]=[{}], layout: List[dict]=[{}],
154
- category: str="", **kwargs):
155
  try:
156
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
157
  chart_path = f'{dir_path}/chart.html'
158
  csv_query_path = f'{dir_path}/query.csv'
159
 
@@ -182,10 +176,15 @@ def line_chart_generation_func(x_column: str, y_column: str, session_hash, sessi
182
  for data_item in fig["data"]:
183
  data_item[key] = value
184
 
185
- print(fig)
 
 
186
 
187
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
188
- return {"reply": _write_chart(fig, chart_path, chart_url)}
 
 
 
189
 
190
  except Exception as e:
191
  print("LINE CHART ERROR")
@@ -196,10 +195,10 @@ def line_chart_generation_func(x_column: str, y_column: str, session_hash, sessi
196
  """
197
  return {"reply": reply}
198
 
199
- def bar_chart_generation_func(x_column: str, y_column: str, session_hash, session_folder, data: List[dict]=[{}], layout: List[dict]=[{}],
200
- category: str="", facet_row: str="", facet_col: str="", **kwargs):
201
  try:
202
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
203
  chart_path = f'{dir_path}/chart.html'
204
  csv_query_path = f'{dir_path}/query.csv'
205
 
@@ -232,10 +231,15 @@ def bar_chart_generation_func(x_column: str, y_column: str, session_hash, sessio
232
  for data_item in fig["data"]:
233
  data_item[key] = value
234
 
235
- print(fig)
 
 
 
 
 
 
236
 
237
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
238
- return {"reply": _write_chart(fig, chart_path, chart_url)}
239
 
240
  except Exception as e:
241
  print("BAR CHART ERROR")
@@ -246,9 +250,9 @@ def bar_chart_generation_func(x_column: str, y_column: str, session_hash, sessio
246
  """
247
  return {"reply": reply}
248
 
249
- def pie_chart_generation_func(values: str, names: str, session_hash, session_folder, data: List[dict]=[{}], layout: List[dict]=[{}], **kwargs):
250
  try:
251
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
252
  chart_path = f'{dir_path}/chart.html'
253
  csv_query_path = f'{dir_path}/query.csv'
254
 
@@ -274,10 +278,15 @@ def pie_chart_generation_func(values: str, names: str, session_hash, session_fol
274
  for data_item in fig["data"]:
275
  data_item[key] = value
276
 
277
- print(fig)
 
 
 
 
 
 
278
 
279
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
280
- return {"reply": _write_chart(fig, chart_path, chart_url)}
281
 
282
  except Exception as e:
283
  print("PIE CHART ERROR")
@@ -288,15 +297,16 @@ def pie_chart_generation_func(values: str, names: str, session_hash, session_fol
288
  """
289
  return {"reply": reply}
290
 
291
- def histogram_generation_func(x_column: str, session_hash, session_folder, y_column: str="", data: List[dict]=[{}], layout: List[dict]=[{}], histnorm: str="", category: str="",
292
- histfunc: str="", **kwargs):
293
  try:
294
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
295
  chart_path = f'{dir_path}/chart.html'
296
  csv_query_path = f'{dir_path}/query.csv'
297
 
298
  df = pd.read_csv(csv_query_path)
299
 
 
300
  print(x_column)
301
 
302
  function_args = {"data_frame":df, "x":x_column}
@@ -328,10 +338,15 @@ def histogram_generation_func(x_column: str, session_hash, session_folder, y_col
328
  for data_item in fig["data"]:
329
  data_item[key] = value
330
 
331
- print(fig)
 
 
 
 
332
 
333
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
334
- return {"reply": _write_chart(fig, chart_path, chart_url)}
 
335
 
336
  except Exception as e:
337
  print("HISTOGRAM ERROR")
@@ -342,185 +357,33 @@ def histogram_generation_func(x_column: str, session_hash, session_folder, y_col
342
  """
343
  return {"reply": reply}
344
 
345
- def box_chart_generation_func(y_column: str, session_hash, session_folder,
346
- x_column: str="", category: str="",
347
- layout: List[dict]=[{}], **kwargs):
348
- try:
349
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
350
- chart_path = f'{dir_path}/chart.html'
351
- csv_query_path = f'{dir_path}/query.csv'
352
-
353
- df = pd.read_csv(csv_query_path)
354
-
355
- function_args = {"data_frame": df, "y": y_column}
356
- if x_column:
357
- function_args["x"] = x_column
358
- if category:
359
- function_args["color"] = category
360
-
361
- initial_graph = px.box(**function_args)
362
- fig = initial_graph.to_dict()
363
-
364
- _, layout_dict = llm_chart_data_scrub({}, layout)
365
- if layout_dict:
366
- fig["layout"] = layout_dict
367
-
368
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
369
- return {"reply": _write_chart(fig, chart_path, chart_url)}
370
-
371
- except Exception as e:
372
- print("BOX CHART ERROR")
373
- print(e)
374
- return {"reply": f"There was an error generating the box plot. Error: {e}. You should probably try again."}
375
-
376
-
377
- def correlation_heatmap_func(session_hash, session_folder, columns: List[str]=[], **kwargs):
378
- try:
379
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
380
- chart_path = f'{dir_path}/chart.html'
381
- csv_query_path = f'{dir_path}/query.csv'
382
-
383
- df = pd.read_csv(csv_query_path)
384
-
385
- numeric_df = df[columns].select_dtypes(include='number') if columns else df.select_dtypes(include='number')
386
-
387
- if numeric_df.shape[1] < 2:
388
- return {"reply": "At least two numeric columns are needed for a correlation matrix. Please refine your query to include more numeric columns."}
389
-
390
- corr = numeric_df.corr().round(3)
391
-
392
- fig = px.imshow(
393
- corr,
394
- text_auto='.2f',
395
- color_continuous_scale='RdBu_r',
396
- zmin=-1,
397
- zmax=1,
398
- title='Correlation Matrix',
399
- aspect='auto',
400
- )
401
- fig.update_layout(font=dict(family='Inter, system-ui, sans-serif'))
402
-
403
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
404
- return {"reply": _write_chart(fig, chart_path, chart_url)}
405
-
406
- except Exception as e:
407
- print("CORRELATION HEATMAP ERROR")
408
- print(e)
409
- return {"reply": f"There was an error generating the correlation heatmap. Error: {e}. You should probably try again."}
410
-
411
-
412
- def rolling_stats_func(x_column: str, y_column: str, session_hash, session_folder,
413
- window: int = 7, stats: List[str] = ["mean"],
414
- layout: List[dict] = [{}], category: str = "", **kwargs):
415
- try:
416
- import plotly.graph_objects as go
417
-
418
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
419
- chart_path = f'{dir_path}/chart.html'
420
  csv_query_path = f'{dir_path}/query.csv'
 
421
 
422
  df = pd.read_csv(csv_query_path)
 
423
 
424
- try:
425
- df[x_column] = pd.to_datetime(df[x_column])
426
- except Exception:
427
- pass
428
- df = df.sort_values(x_column)
429
-
430
- valid_stats = {"mean", "std", "min", "max"}
431
- selected_stats = [s for s in stats if s in valid_stats] or ["mean"]
432
-
433
- fig = go.Figure()
434
-
435
- groups = df[category].unique().tolist() if category and category in df.columns else [None]
436
-
437
- for group in groups:
438
- group_df = df[df[category] == group] if group is not None else df
439
- prefix = f"{group} — " if group is not None else ""
440
-
441
- fig.add_trace(go.Scatter(
442
- x=group_df[x_column].values, y=group_df[y_column].values,
443
- mode="lines", name=f"{prefix}{y_column} (raw)",
444
- opacity=0.35, line=dict(width=1)
445
- ))
446
-
447
- rolling_obj = group_df[y_column].rolling(window)
448
- for stat in selected_stats:
449
- rolled = getattr(rolling_obj, stat)()
450
- fig.add_trace(go.Scatter(
451
- x=group_df[x_column].values, y=rolled.values,
452
- mode="lines", name=f"{prefix}Rolling {stat.capitalize()} (w={window})",
453
- line=dict(width=2.5)
454
- ))
455
-
456
- fig.update_layout(
457
- title=f"Rolling Statistics (window={window}) — {y_column}",
458
- xaxis_title=x_column,
459
- yaxis_title=y_column,
460
- font=dict(family="Inter, system-ui, sans-serif"),
461
- legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
462
- )
463
-
464
- _, layout_dict = llm_chart_data_scrub({}, layout)
465
- if layout_dict:
466
- fig.update_layout(**layout_dict)
467
-
468
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
469
- return {"reply": _write_chart(fig, chart_path, chart_url)}
470
-
471
- except Exception as e:
472
- print("ROLLING STATS ERROR")
473
- print(e)
474
- return {"reply": f"There was an error generating the rolling statistics chart. Error: {e}. You should probably try again."}
475
 
 
 
476
 
477
- def table_generation_func(session_hash, session_folder, **kwargs):
478
- print("TABLE GENERATION")
479
- try:
480
- from html import escape
481
-
482
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
483
- csv_query_path = f'{dir_path}/query.csv'
484
-
485
- df = pd.read_csv(csv_query_path)
486
-
487
- total_rows = len(df)
488
- max_rows = 200
489
- if total_rows > max_rows:
490
- df = df.head(max_rows)
491
- note = (f'<p class="vda-table-note">Showing first {max_rows} of {total_rows} rows'
492
- ' — refine your query to see more specific results.</p>')
493
- else:
494
- note = ''
495
-
496
- header_cells = ''.join(f'<th>{escape(str(col))}</th>' for col in df.columns)
497
- row_html = [
498
- '<tr>' + ''.join(f'<td>{escape(str(val))}</td>' for val in row) + '</tr>'
499
- for _, row in df.iterrows()
500
- ]
501
-
502
- style = (
503
- '<style>'
504
- '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
505
- '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
506
- '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
507
- '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
508
- '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
509
- '.vda-table tbody tr:last-child td{border-bottom:none;}'
510
- '.vda-table-note{font-size:12px;color:#6b7280;margin:4px 0 0;text-align:right;}'
511
- '</style>'
512
- )
513
-
514
- table = (
515
- '<div class="vda-table-wrap"><table class="vda-table">'
516
- f'<thead><tr>{header_cells}</tr></thead>'
517
- '<tbody>' + '\n'.join(row_html) + '</tbody>'
518
- '</table></div>'
519
- )
520
-
521
- return {"reply": style + table + note}
522
 
 
 
 
 
523
  except Exception as e:
524
- print("TABLE ERROR")
525
- print(e)
526
- return {"reply": f"There was an error generating the table. Error: {e}. You should probably try again."}
 
 
 
 
 
1
  from typing import List
2
+ from typing import Dict
3
  import plotly.io as pio
4
  import plotly.express as px
5
  import pandas as pd
6
  from utils import TEMP_DIR
7
  import os
8
  import ast
9
+ import json
10
  from dotenv import load_dotenv
11
 
12
  load_dotenv()
13
 
14
+ root_url = os.getenv("ROOT_URL")
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def llm_chart_data_scrub(data, layout):
17
  #Processing data to account for variation from LLM
 
92
 
93
  return fig
94
 
95
+ def scatter_chart_generation_func(x_column: List[str], y_column: str, session_hash, data: List[dict]=[{}], layout: List[dict]=[{}],
96
  category: str="", trendline: str="", trendline_options: List[dict]=[{}], marginal_x: str="", marginal_y: str="",
97
+ size: str=""):
98
  try:
99
+ dir_path = TEMP_DIR / str(session_hash)
100
  chart_path = f'{dir_path}/chart.html'
101
  csv_query_path = f'{dir_path}/query.csv'
102
 
 
127
  for data_item in fig["data"]:
128
  data_item[key] = value
129
 
130
+ pio.write_html(fig, chart_path, full_html=False)
131
+
132
+ chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
133
+
134
+ iframe = '<div style=overflow:auto;><iframe\n scrolling="yes"\n width="1000px"\n height="500px"\n src="' + chart_url + '"\n frameborder="0"\n allowfullscreen\n></iframe>\n</div>'
135
+
136
+ return {"reply": iframe}
137
 
138
  except Exception as e:
139
  print("SCATTER PLOT ERROR")
 
144
  """
145
  return {"reply": reply}
146
 
147
+ def line_chart_generation_func(x_column: str, y_column: str, session_hash, data: List[dict]=[{}], layout: List[dict]=[{}],
148
+ category: str=""):
149
  try:
150
+ dir_path = TEMP_DIR / str(session_hash)
151
  chart_path = f'{dir_path}/chart.html'
152
  csv_query_path = f'{dir_path}/query.csv'
153
 
 
176
  for data_item in fig["data"]:
177
  data_item[key] = value
178
 
179
+ print(fig)
180
+
181
+ pio.write_html(fig, chart_path, full_html=False)
182
 
183
+ chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
184
+
185
+ iframe = '<div style=overflow:auto;><iframe\n scrolling="yes"\n width="1000px"\n height="500px"\n src="' + chart_url + '"\n frameborder="0"\n allowfullscreen\n></iframe>\n</div>'
186
+
187
+ return {"reply": iframe}
188
 
189
  except Exception as e:
190
  print("LINE CHART ERROR")
 
195
  """
196
  return {"reply": reply}
197
 
198
+ def bar_chart_generation_func(x_column: str, y_column: str, session_hash, data: List[dict]=[{}], layout: List[dict]=[{}],
199
+ category: str="", facet_row: str="", facet_col: str=""):
200
  try:
201
+ dir_path = TEMP_DIR / str(session_hash)
202
  chart_path = f'{dir_path}/chart.html'
203
  csv_query_path = f'{dir_path}/query.csv'
204
 
 
231
  for data_item in fig["data"]:
232
  data_item[key] = value
233
 
234
+ print(fig)
235
+
236
+ pio.write_html(fig, chart_path, full_html=False)
237
+
238
+ chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
239
+
240
+ iframe = '<div style=overflow:auto;><iframe\n scrolling="yes"\n width="1000px"\n height="500px"\n src="' + chart_url + '"\n frameborder="0"\n allowfullscreen\n></iframe>\n</div>'
241
 
242
+ return {"reply": iframe}
 
243
 
244
  except Exception as e:
245
  print("BAR CHART ERROR")
 
250
  """
251
  return {"reply": reply}
252
 
253
+ def pie_chart_generation_func(values: str, names: str, session_hash, data: List[dict]=[{}], layout: List[dict]=[{}]):
254
  try:
255
+ dir_path = TEMP_DIR / str(session_hash)
256
  chart_path = f'{dir_path}/chart.html'
257
  csv_query_path = f'{dir_path}/query.csv'
258
 
 
278
  for data_item in fig["data"]:
279
  data_item[key] = value
280
 
281
+ print(fig)
282
+
283
+ pio.write_html(fig, chart_path, full_html=False)
284
+
285
+ chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
286
+
287
+ iframe = '<div style=overflow:auto;><iframe\n scrolling="yes"\n width="1000px"\n height="500px"\n src="' + chart_url + '"\n frameborder="0"\n allowfullscreen\n></iframe>\n</div>'
288
 
289
+ return {"reply": iframe}
 
290
 
291
  except Exception as e:
292
  print("PIE CHART ERROR")
 
297
  """
298
  return {"reply": reply}
299
 
300
+ def histogram_generation_func(x_column: str, session_hash, y_column: str="", data: List[dict]=[{}], layout: List[dict]=[{}], histnorm: str="", category: str="",
301
+ histfunc: str=""):
302
  try:
303
+ dir_path = TEMP_DIR / str(session_hash)
304
  chart_path = f'{dir_path}/chart.html'
305
  csv_query_path = f'{dir_path}/query.csv'
306
 
307
  df = pd.read_csv(csv_query_path)
308
 
309
+ print(df)
310
  print(x_column)
311
 
312
  function_args = {"data_frame":df, "x":x_column}
 
338
  for data_item in fig["data"]:
339
  data_item[key] = value
340
 
341
+ print(fig)
342
+
343
+ pio.write_html(fig, chart_path, full_html=False)
344
+
345
+ chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
346
 
347
+ iframe = '<div style=overflow:auto;><iframe\n scrolling="yes"\n width="1000px"\n height="500px"\n src="' + chart_url + '"\n frameborder="0"\n allowfullscreen\n></iframe>\n</div>'
348
+
349
+ return {"reply": iframe}
350
 
351
  except Exception as e:
352
  print("HISTOGRAM ERROR")
 
357
  """
358
  return {"reply": reply}
359
 
360
+ def table_generation_func(session_hash):
361
+ print("TABLE GENERATION")
362
+ try:
363
+ dir_path = TEMP_DIR / str(session_hash)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  csv_query_path = f'{dir_path}/query.csv'
365
+ table_path = f'{dir_path}/table.html'
366
 
367
  df = pd.read_csv(csv_query_path)
368
+ print(df)
369
 
370
+ html_table = df.to_html()
371
+ print(html_table)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
+ with open(table_path, "w") as file:
374
+ file.write(html_table)
375
 
376
+ table_url = f'{root_url}/gradio_api/file/temp/{session_hash}/table.html'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
 
378
+ iframe = '<div style=overflow:auto;><iframe\n scrolling="yes"\n width="1000px"\n height="500px"\n src="' + table_url + '"\n frameborder="0"\n allowfullscreen\n></iframe>\n</div>'
379
+ print(iframe)
380
+ return {"reply": iframe}
381
+
382
  except Exception as e:
383
+ print("TABLE ERROR")
384
+ print(e)
385
+ reply = f"""There was an error generating the Pandas DataFrame table results.
386
+ The error is {e},
387
+ You should probably try again.
388
+ """
389
+ return {"reply": reply}
functions/chat_functions.py CHANGED
@@ -1,184 +1,93 @@
1
- from utils import message_dict, api_key_store, model_store
2
 
3
  from haystack.dataclasses import ChatMessage
4
  from haystack.components.generators.chat import OpenAIChatGenerator
5
- from haystack.utils import Secret
6
-
7
- def _get_generator(session_hash):
8
- api_key = api_key_store.get(session_hash)
9
- if not api_key:
10
- raise ValueError("No API key found for this session. Please enter your API key at the top of the page.")
11
- model = model_store.get(session_hash, "gpt-4o")
12
- if api_key.startswith("sk-ant-"):
13
- from haystack_integrations.components.generators.chat import AnthropicChatGenerator
14
- return AnthropicChatGenerator(model=model, api_key=Secret.from_token(api_key))
15
- return OpenAIChatGenerator(model=model, api_key=Secret.from_token(api_key))
16
 
 
17
  response = None
18
 
19
- def example_question_message(data_source, name, titles, schema):
20
-
21
- example_message_dict = {
22
- 'file_upload' : ["You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source'.",
23
- f"""We have a SQLite database with the following {titles}.
24
- We also have an AI agent with access to the same database that will be performing data analysis.
25
- Please return an array of seven strings, each one being a question for our data analysis agent
26
- that we can suggest that you believe will be insightful or helpful to a data analyst looking for
27
- data insights. Return nothing more than the array of questions because I need that specific data structure
28
- to process your response. No other response type or data structure will work."""],
29
-
30
- 'sql' : [f"You are a helpful and knowledgeable agent who has access to a PostgreSQL database called {name}.",
31
- f"""We have a PostgreSQL database with the following tables: {titles}.
32
- We also have an AI agent with access to the same database that will be performing data analysis.
33
- Please return an array of seven strings, each one being a question for our data analysis agent
34
- that we can suggest that you believe will be insightful or helpful to a data analyst looking for
35
- data insights. Return nothing more than the array of questions because I need that specific data structure
36
- to process your response. No other response type or data structure will work."""],
37
-
38
- 'doc_db' : [f"You are a helpful and knowledgeable agent who has access to an MongoDB NoSQL document database called {name}.",
39
- f"""We have a MongoDB NoSQL document database with the following collections: {titles}.
40
- The schema of these collections is: {schema}.
41
- We also have an AI agent with access to the same database that will be performing data analysis.
42
- Please return an array of seven strings, each one being a question for our data analysis agent
43
- that we can suggest that you believe will be insightful or helpful to a data analyst looking for
44
- data insights. Return nothing more than the array of questions because I need that specific data structure
45
- to process your response. No other response type or data structure will work."""],
46
-
47
- 'graphql' : [f"You are a helpful and knowledgeable agent who has access to an GraphQL API endpoint called {name}.",
48
- f"""We have a GraphQL API endpoint with the following types: {titles}.
49
- We also have an AI agent with access to the same GraphQL API endpoint that will be performing data analysis.
50
- Please return an array of seven strings, each one being a question for our data analysis agent
51
- that we can suggest that you believe will be insightful or helpful to a data analyst looking for
52
- data insights. Return nothing more than the array of questions because I need that specific data structure
53
- to process your response. No other response type or data structure will work."""]
54
-
55
- }
56
-
57
- return example_message_dict[data_source]
58
-
59
- def example_question_generator(session_hash, data_source, name, titles, schema):
60
  example_response = None
61
- example_message_list = example_question_message(data_source, name, titles, schema)
62
  example_messages = [
63
  ChatMessage.from_system(
64
- example_message_list[0]
65
  )
66
  ]
67
-
68
- example_messages.append(ChatMessage.from_user(text=example_message_list[1]))
69
-
70
- example_response = _get_generator(session_hash).run(messages=example_messages)
71
-
72
- response_text = example_response["replies"][0].text
73
- start = response_text.index("[") + 1
74
- end = response_text.index("]")
75
- response_content = response_text[start:end]
76
- response_list = '[' + response_content + ']'
77
- print(response_list)
78
-
79
- return response_list
80
-
81
- def system_message(data_source, titles, schema=""):
82
- print("TITLES")
83
- print(titles)
84
-
85
- tools_desc = (
86
- " You have access to tools for querying the data source, generating charts and visualisations,"
87
- " and performing statistical analyses — use them proactively whenever they would help answer the user's question."
88
- " Always display any charts, tables, and visualisations inline in your responses by outputting the returned HTML verbatim."
89
- )
90
-
91
- system_message_dict = {
92
- 'file_upload': (
93
- f"You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source' that contains the following columns: {titles}."
94
- + tools_desc
95
- ),
96
- 'sql': (
97
- f"You are a helpful and knowledgeable agent who has access to a PostgreSQL database which has a series of tables called {titles}."
98
- + tools_desc
99
- ),
100
- 'doc_db': (
101
- f"You are a helpful and knowledgeable agent who has access to a NoSQL MongoDB Document database which has a series of collections called {titles}. "
102
- f"The schema of these collections is: {schema}."
103
- + tools_desc
104
- ),
105
- 'graphql': (
106
- f"You are a helpful and knowledgeable agent who has access to a GraphQL API which has the following types: {titles}. "
107
- "We have also saved a schema.json file that contains the entire introspection query that we can use to find out more about each type before making a query."
108
- + tools_desc
109
- ),
110
- }
111
-
112
- return system_message_dict[data_source]
113
-
114
- def chatbot_func(message, history, session_hash, data_source, titles, schema, *args):
115
- try:
116
- chat_generator = _get_generator(session_hash)
117
- except ValueError as e:
118
- return str(e)
119
-
120
- from functions import (
121
- table_generation_func, regression_func, descriptive_stats_func,
122
- scatter_chart_generation_func, line_chart_generation_func, bar_chart_generation_func,
123
- pie_chart_generation_func, histogram_generation_func,
124
- box_chart_generation_func, correlation_heatmap_func, rolling_stats_func,
125
- query_func, graphql_schema_query, graphql_csv_query,
126
- kmeans_clustering_func, hypothesis_test_func,
127
- )
128
- import tools.tools as tools
129
-
130
- available_functions = {
131
- "query_func": query_func,
132
- "graphql_schema_query": graphql_schema_query,
133
- "graphql_csv_query": graphql_csv_query,
134
- "table_generation_func": table_generation_func,
135
- "scatter_chart_generation_func": scatter_chart_generation_func,
136
- "line_chart_generation_func": line_chart_generation_func,
137
- "bar_chart_generation_func": bar_chart_generation_func,
138
- "pie_chart_generation_func": pie_chart_generation_func,
139
- "histogram_generation_func": histogram_generation_func,
140
- "box_chart_generation_func": box_chart_generation_func,
141
- "correlation_heatmap_func": correlation_heatmap_func,
142
- "rolling_stats_func": rolling_stats_func,
143
- "regression_func": regression_func,
144
- "descriptive_stats_func": descriptive_stats_func,
145
- "kmeans_clustering_func": kmeans_clustering_func,
146
- "hypothesis_test_func": hypothesis_test_func,
147
- }
148
-
149
- if message_dict[session_hash][data_source] != None:
150
- message_dict[session_hash][data_source].append(ChatMessage.from_user(message))
151
  else:
152
  messages = [
153
- ChatMessage.from_system(system_message(data_source, titles, schema))
 
 
 
 
 
 
 
 
 
154
  ]
155
  messages.append(ChatMessage.from_user(message))
156
- message_dict[session_hash][data_source] = messages
157
-
158
- active_tools = tools.tools_call(session_hash, data_source, titles)
159
- response = chat_generator.run(messages=message_dict[session_hash][data_source], tools=active_tools)
160
 
161
  while True:
162
- # if the response is a tool call
163
  if response and response["replies"][0].meta["finish_reason"] == "tool_calls" or response["replies"][0].tool_calls:
164
  function_calls = response["replies"][0].tool_calls
165
  for function_call in function_calls:
166
- message_dict[session_hash][data_source].append(ChatMessage.from_assistant(tool_calls=[function_call]))
167
  ## Parse function calling information
168
  function_name = function_call.tool_name
169
  function_args = function_call.arguments
170
 
171
  ## Find the corresponding function and call it with the given arguments
172
  function_to_call = available_functions[function_name]
173
- function_response = function_to_call(**function_args, session_hash=session_hash, session_folder=data_source, args=args)
174
  print(function_name)
175
  ## Append function response to the messages list using `ChatMessage.from_tool`
176
- message_dict[session_hash][data_source].append(ChatMessage.from_tool(tool_result=function_response['reply'], origin=function_call))
177
- response = chat_generator.run(messages=message_dict[session_hash][data_source], tools=active_tools)
178
 
179
  # Regular Conversation
180
  else:
181
- message_dict[session_hash][data_source].append(response["replies"][0])
182
  break
 
183
 
184
- return response["replies"][0].text
 
1
+ from utils import TEMP_DIR, message_dict
2
 
3
  from haystack.dataclasses import ChatMessage
4
  from haystack.components.generators.chat import OpenAIChatGenerator
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ chat_generator = OpenAIChatGenerator(model="gpt-4o")
7
  response = None
8
 
9
+ def example_question_generator(session_hash):
10
+ import sqlite3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  example_response = None
 
12
  example_messages = [
13
  ChatMessage.from_system(
14
+ "You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source'."
15
  )
16
  ]
17
+ dir_path = TEMP_DIR / str(session_hash)
18
+ connection = sqlite3.connect(f'{dir_path}/data_source.db')
19
+ print("Querying questions");
20
+ cur=connection.execute('select * from data_source')
21
+ columns = [i[0] for i in cur.description]
22
+ print("QUESTION COLUMNS")
23
+ print(columns)
24
+ cur.close()
25
+ connection.close()
26
+
27
+ example_messages.append(ChatMessage.from_user(text=f"""We have a SQLite database with the following {columns}.
28
+ We also have an AI agent with access to the same database that will be performing data analysis.
29
+ Please return an array of seven strings, each one being a question for our data analysis agent
30
+ that we can suggest that you believe will be insightful or helpful to a data analysis looking for
31
+ data insights. Return nothing more than the array of questions because I need that specific data structure
32
+ to process your response. No other response type or data structure will work."""))
33
+
34
+ example_response = chat_generator.run(messages=example_messages)
35
+
36
+ return example_response["replies"][0].text
37
+
38
+ def chatbot_with_fc(message, history, session_hash):
39
+ from functions import sqlite_query_func, table_generation_func, regression_func, scatter_chart_generation_func, \
40
+ line_chart_generation_func,bar_chart_generation_func,pie_chart_generation_func,histogram_generation_func
41
+ import tools
42
+
43
+ available_functions = {"sql_query_func": sqlite_query_func,"table_generation_func":table_generation_func,
44
+ "line_chart_generation_func":line_chart_generation_func,"bar_chart_generation_func":bar_chart_generation_func,
45
+ "scatter_chart_generation_func":scatter_chart_generation_func, "pie_chart_generation_func":pie_chart_generation_func,
46
+ "histogram_generation_func":histogram_generation_func,
47
+ "regression_func":regression_func }
48
+
49
+ if message_dict[session_hash] != None:
50
+ message_dict[session_hash].append(ChatMessage.from_user(message))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  else:
52
  messages = [
53
+ ChatMessage.from_system(
54
+ """You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source'.
55
+ You also have access to a function, called table_generation_func, that can take a query.csv file generated from our sql query and returns an iframe that we can display in our chat window.
56
+ You also have access to a scatter plot function, called scatter_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a scatter plot and returns an iframe that we can display in our chat window.
57
+ You also have access to a line chart function, called line_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a line chart and returns an iframe that we can display in our chat window.
58
+ You also have access to a bar graph function, called line_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a bar graph and returns an iframe that we can display in our chat window.
59
+ You also have access to a pie chart function, called pie_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a pie chart and returns an iframe that we can display in our chat window.
60
+ You also have access to a histogram function, called histogram_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a histogram and returns an iframe that we can display in our chat window.
61
+ You also have access to a linear regression function, called regression_func, that can take a query.csv file generated from our sql query and a list of column names for our independent and dependent variables and return a regression data string and a regression chart which is returned as an iframe."""
62
+ )
63
  ]
64
  messages.append(ChatMessage.from_user(message))
65
+ message_dict[session_hash] = messages
66
+
67
+ response = chat_generator.run(messages=message_dict[session_hash], generation_kwargs={"tools": tools.tools_call(session_hash)})
 
68
 
69
  while True:
70
+ # if OpenAI response is a tool call
71
  if response and response["replies"][0].meta["finish_reason"] == "tool_calls" or response["replies"][0].tool_calls:
72
  function_calls = response["replies"][0].tool_calls
73
  for function_call in function_calls:
74
+ message_dict[session_hash].append(ChatMessage.from_assistant(tool_calls=[function_call]))
75
  ## Parse function calling information
76
  function_name = function_call.tool_name
77
  function_args = function_call.arguments
78
 
79
  ## Find the corresponding function and call it with the given arguments
80
  function_to_call = available_functions[function_name]
81
+ function_response = function_to_call(**function_args, session_hash=session_hash)
82
  print(function_name)
83
  ## Append function response to the messages list using `ChatMessage.from_tool`
84
+ message_dict[session_hash].append(ChatMessage.from_tool(tool_result=function_response['reply'], origin=function_call))
85
+ response = chat_generator.run(messages=message_dict[session_hash], generation_kwargs={"tools": tools.tools_call(session_hash)})
86
 
87
  # Regular Conversation
88
  else:
89
+ message_dict[session_hash].append(response["replies"][0])
90
  break
91
+ return response["replies"][0].text
92
 
93
+
functions/query_functions.py DELETED
@@ -1,229 +0,0 @@
1
- from typing import List
2
- from typing import AnyStr
3
- from haystack import component
4
- import pandas as pd
5
- from pandasql import sqldf
6
- pd.set_option('display.max_rows', None)
7
- pd.set_option('display.max_columns', None)
8
- pd.set_option('display.width', None)
9
- pd.set_option('display.max_colwidth', None)
10
- import sqlite3
11
- import psycopg2
12
- from pymongo import MongoClient
13
- import pymongoarrow.monkey
14
- import json
15
- import pluck
16
- from utils import TEMP_DIR
17
- import ast
18
-
19
- @component
20
- class SQLiteQuery:
21
-
22
- def __init__(self, sql_database: str):
23
- self.connection = sqlite3.connect(sql_database, check_same_thread=False)
24
-
25
- @component.output_types(results=List[str], queries=List[str])
26
- def run(self, queries: AnyStr, session_hash):
27
- print("ATTEMPTING TO RUN SQLITE QUERY")
28
- dir_path = TEMP_DIR / str(session_hash)
29
- results = []
30
- result = pd.read_sql(queries, self.connection)
31
- result.to_csv(f'{dir_path}/file_upload/query.csv', index=False)
32
- column_names = list(result.columns)
33
- results.append(f"{result}")
34
- self.connection.close()
35
- return {"results": results, "queries": queries, "csv_columns": column_names}
36
-
37
- @component
38
- class PostgreSQLQuery:
39
-
40
- def __init__(self, url: str, sql_port: int, sql_user: str, sql_pass: str, sql_db_name: str):
41
- self.connection = psycopg2.connect(
42
- database=sql_db_name,
43
- user=sql_user,
44
- password=sql_pass,
45
- host=url, # e.g., "localhost" or an IP address
46
- port=sql_port # default is 5432
47
- )
48
-
49
- @component.output_types(results=List[str], queries=List[str])
50
- def run(self, queries: AnyStr, session_hash):
51
- print("ATTEMPTING TO RUN POSTGRESQL QUERY")
52
- dir_path = TEMP_DIR / str(session_hash)
53
- results = []
54
- result = pd.read_sql_query(queries, self.connection)
55
- result.to_csv(f'{dir_path}/sql/query.csv', index=False)
56
- column_names = list(result.columns)
57
- results.append(f"{result}")
58
- self.connection.close()
59
- return {"results": results, "queries": queries, "csv_columns": column_names}
60
-
61
- @component
62
- class DocDBQuery:
63
-
64
- def __init__(self, connection_string: str, doc_db_name: str):
65
- client = MongoClient(connection_string)
66
-
67
- self.client = client
68
- self.connection = client[doc_db_name]
69
-
70
- @component.output_types(results=List[str], queries=List[str])
71
- def run(self, aggregation_pipeline: List[str], db_collection, session_hash):
72
- pymongoarrow.monkey.patch_all()
73
- print("ATTEMPTING TO RUN MONGODB QUERY")
74
- dir_path = TEMP_DIR / str(session_hash)
75
- results = []
76
- print(aggregation_pipeline)
77
-
78
- aggregation_pipeline = aggregation_pipeline.replace(" ", "")
79
-
80
- false_replace = [':false', ': false']
81
- false_value = ':False'
82
- true_replace = [':true', ': true']
83
- true_value = ':True'
84
-
85
- for replace in false_replace:
86
- aggregation_pipeline = aggregation_pipeline.replace(replace, false_value)
87
- for replace in true_replace:
88
- aggregation_pipeline = aggregation_pipeline.replace(replace, true_value)
89
-
90
- query_list = ast.literal_eval(aggregation_pipeline)
91
-
92
- print("QUERY List")
93
- print(query_list)
94
- print(db_collection)
95
-
96
- db = self.connection
97
- collection = db[db_collection]
98
-
99
- print(collection)
100
- docs = collection.aggregate_pandas_all(query_list)
101
- print("DATA FRAME COMPLETE")
102
- docs.to_csv(f'{dir_path}/doc_db/query.csv', index=False)
103
- column_names = list(docs.columns)
104
- print("CSV COMPLETE")
105
- results.append(f"{docs}")
106
- self.client.close()
107
- return {"results": results, "queries": aggregation_pipeline, "csv_columns": column_names}
108
-
109
- @component
110
- class GraphQLQuery:
111
-
112
- def __init__(self):
113
-
114
- self.connection = pluck
115
-
116
- @component.output_types(results=List[str], queries=List[str])
117
- def run(self, graphql_query, graphql_api_string, graphql_api_token, graphql_token_header, session_hash):
118
- print("ATTEMPTING TO RUN GRAPHQL QUERY")
119
- dir_path = TEMP_DIR / str(session_hash)
120
- results = []
121
-
122
- headers = {"Content-Type": "application/json"}
123
- if graphql_token_header and graphql_api_token:
124
- headers[graphql_token_header] = graphql_api_token
125
-
126
- print(graphql_query)
127
-
128
- response = self.connection.execute(url=graphql_api_string, headers=headers, query=graphql_query, column_names="short")
129
-
130
- if response.errors:
131
- raise ValueError(response.errors)
132
- elif response.data:
133
- print("DATA FRAME COMPLETE")
134
- print(response)
135
- response_frame = response.frames['default']
136
- print("RESPONSE FRAME")
137
- #print(response_frame)
138
-
139
- response_frame.to_csv(f'{dir_path}/graphql/query.csv', index=False)
140
- column_names = list(response_frame.columns)
141
- print("CSV COMPLETE")
142
- results.append(f"{response_frame}")
143
- return {"results": results, "queries": graphql_query, "csv_columns": column_names}
144
-
145
- def query_func(queries:AnyStr, session_hash, session_folder, args, **kwargs):
146
- try:
147
- print("QUERY")
148
- print(queries)
149
- if session_folder == "file_upload":
150
- dir_path = TEMP_DIR / str(session_hash)
151
- sql_query = SQLiteQuery(f'{dir_path}/file_upload/data_source.db')
152
- result = sql_query.run(queries, session_hash)
153
- elif session_folder == "sql":
154
- sql_query = PostgreSQLQuery(args[0], args[1], args[2], args[3], args[4])
155
- result = sql_query.run(queries, session_hash)
156
- elif session_folder == 'doc_db':
157
- doc_db_query = DocDBQuery(args[0], args[1])
158
- result = doc_db_query.run(queries, kwargs['db_collection'], session_hash)
159
- elif session_folder == 'graphql':
160
- graphql_object = GraphQLQuery()
161
- result = graphql_object.run(queries, args[0], args[1], args[2], session_hash)
162
- print("RESULT")
163
- print(result["csv_columns"])
164
- if len(result["results"][0]) > 1000:
165
- print("QUERY TOO LARGE")
166
- return {"reply": f"""query result too large to be processed by llm, the query results are in our query.csv file.
167
- The column names of this query.csv file are: {result["csv_columns"]}.
168
- If you need to display the results directly, perhaps use the table_generation_func function."""}
169
- else:
170
- return {"reply": result["results"][0]}
171
-
172
- except Exception as e:
173
- reply = f"""There was an error running the {session_folder} Query = {queries}
174
- The error is {e},
175
- You should probably try again.
176
- """
177
- print(reply)
178
- return {"reply": reply}
179
-
180
- def graphql_schema_query(graphql_type: AnyStr, session_hash, **kwargs):
181
- dir_path = TEMP_DIR / str(session_hash)
182
- try:
183
- with open(f'{dir_path}/graphql/schema.json', 'r') as file:
184
- data = json.load(file)
185
-
186
- types_list = data["types"]
187
- result = list(filter(lambda item: item["name"] == graphql_type, types_list))
188
-
189
- print("SCHEMA RESULT")
190
- print(graphql_type)
191
- print(str(result))
192
-
193
- return {"reply": str(result)}
194
-
195
- except Exception as e:
196
- reply = f"""There was an error querying our schema.json file with the type:{graphql_type}
197
- The error is {e},
198
- You should probably try again.
199
- """
200
- print(reply)
201
- return {"reply": reply}
202
-
203
- def graphql_csv_query(csv_query: AnyStr, session_hash, **kwargs):
204
- dir_path = TEMP_DIR / str(session_hash)
205
- try:
206
- query = pd.read_csv(f'{dir_path}/graphql/query.csv')
207
- query.Name = 'query'
208
- print("GRAPHQL CSV QUERY")
209
- print(csv_query)
210
- queried_df = sqldf(csv_query, locals())
211
- print(queried_df)
212
- column_names = list(queried_df.columns)
213
- queried_df.to_csv(f'{dir_path}/graphql/query.csv', index=False)
214
-
215
- if len(queried_df) > 1000:
216
- print("CSV QUERY TOO LARGE")
217
- return {"reply": f"""The new query results are in our query.csv file.
218
- The column names of this query.csv file are: {column_names}.
219
- If you need to display the results directly, perhaps use the table_generation_func function."""}
220
- else:
221
- return {"reply": str(queried_df)}
222
-
223
- except Exception as e:
224
- reply = f"""There was an error querying our query.csv file with the query:{csv_query}
225
- The error is {e},
226
- You should probably try again.
227
- """
228
- print(reply)
229
- return {"reply": reply}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
functions/sqlite_functions.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from haystack import component
3
+ import pandas as pd
4
+ pd.set_option('display.max_rows', None)
5
+ pd.set_option('display.max_columns', None)
6
+ pd.set_option('display.width', None)
7
+ pd.set_option('display.max_colwidth', None)
8
+ import sqlite3
9
+ from utils import TEMP_DIR
10
+
11
+ @component
12
+ class SQLiteQuery:
13
+
14
+ def __init__(self, sql_database: str):
15
+ self.connection = sqlite3.connect(sql_database, check_same_thread=False)
16
+
17
+ @component.output_types(results=List[str], queries=List[str])
18
+ def run(self, queries: List[str], session_hash):
19
+ print("ATTEMPTING TO RUN QUERY")
20
+ dir_path = TEMP_DIR / str(session_hash)
21
+ results = []
22
+ for query in queries:
23
+ result = pd.read_sql(query, self.connection)
24
+ result.to_csv(f'{dir_path}/query.csv', index=False)
25
+ results.append(f"{result}")
26
+ self.connection.close()
27
+ return {"results": results, "queries": queries}
28
+
29
+
30
+
31
+ def sqlite_query_func(queries: List[str], session_hash):
32
+ dir_path = TEMP_DIR / str(session_hash)
33
+ sql_query = SQLiteQuery(f'{dir_path}/data_source.db')
34
+ try:
35
+ result = sql_query.run(queries, session_hash)
36
+ if len(result["results"][0]) > 1000:
37
+ print("QUERY TOO LARGE")
38
+ return {"reply": "query result too large to be processed by llm, the query results are in our query.csv file. If you need to display the results directly, perhaps use the table_generation_func function."}
39
+ else:
40
+ return {"reply": result["results"][0]}
41
+
42
+ except Exception as e:
43
+ reply = f"""There was an error running the SQL Query = {queries}
44
+ The error is {e},
45
+ You should probably try again.
46
+ """
47
+ return {"reply": reply}
functions/stat_functions.py CHANGED
@@ -5,251 +5,19 @@ from utils import TEMP_DIR
5
  import plotly.express as px
6
  import plotly.io as pio
7
  import os
8
- from functions.chart_functions import scatter_chart_fig, llm_chart_data_scrub, _write_chart
9
  from dotenv import load_dotenv
10
 
11
  load_dotenv()
12
 
13
- root_url = os.getenv("ROOT_URL", "")
14
 
15
- def descriptive_stats_func(session_hash, session_folder, columns: List[str]=[], **kwargs):
16
- print("DESCRIPTIVE STATISTICS")
17
- try:
18
- from html import escape
19
-
20
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
21
- csv_query_path = f'{dir_path}/query.csv'
22
-
23
- df = pd.read_csv(csv_query_path)
24
-
25
- if columns:
26
- df = df[[c for c in columns if c in df.columns]]
27
-
28
- desc = df.describe().round(4)
29
-
30
- header_cells = '<th style="background:#1e40af;">Statistic</th>' + ''.join(
31
- f'<th>{escape(str(col))}</th>' for col in desc.columns
32
- )
33
- row_html = [
34
- '<tr>'
35
- + f'<td style="font-weight:600;color:#1e40af;background:#eff6ff;white-space:nowrap;">{escape(str(idx))}</td>'
36
- + ''.join(f'<td>{escape(str(val))}</td>' for val in row)
37
- + '</tr>'
38
- for idx, row in desc.iterrows()
39
- ]
40
-
41
- style = (
42
- '<style>'
43
- '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
44
- '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
45
- '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
46
- '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
47
- '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
48
- '.vda-table tbody tr:last-child td{border-bottom:none;}'
49
- '</style>'
50
- )
51
- table = (
52
- '<div class="vda-table-wrap"><table class="vda-table">'
53
- f'<thead><tr>{header_cells}</tr></thead>'
54
- '<tbody>' + '\n'.join(row_html) + '</tbody>'
55
- '</table></div>'
56
- )
57
-
58
- return {"reply": style + table}
59
-
60
- except Exception as e:
61
- print("DESCRIPTIVE STATS ERROR")
62
- print(e)
63
- return {"reply": f"There was an error generating descriptive statistics. Error: {e}. You should probably try again."}
64
-
65
-
66
- def kmeans_clustering_func(feature_columns: List[str], x_column: str, y_column: str,
67
- session_hash, session_folder, n_clusters: int = 3,
68
- layout: List[dict] = [{}], **kwargs):
69
- print("KMEANS CLUSTERING")
70
- try:
71
- from sklearn.cluster import KMeans
72
- from sklearn.preprocessing import StandardScaler
73
- from html import escape
74
-
75
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
76
- chart_path = f'{dir_path}/chart.html'
77
- csv_query_path = f'{dir_path}/query.csv'
78
-
79
- df = pd.read_csv(csv_query_path)
80
-
81
- feature_df = df[feature_columns].select_dtypes(include='number').dropna()
82
- if feature_df.shape[1] < 1:
83
- return {"reply": "No numeric feature columns found for clustering. Please refine your query to include numeric columns."}
84
-
85
- X_scaled = StandardScaler().fit_transform(feature_df)
86
- labels = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit_predict(X_scaled)
87
-
88
- df_clustered = df.loc[feature_df.index].copy()
89
- df_clustered['Cluster'] = [f'Cluster {l}' for l in labels]
90
-
91
- fig = px.scatter(
92
- df_clustered, x=x_column, y=y_column, color='Cluster',
93
- title=f'K-Means Clustering (k={n_clusters})',
94
- )
95
- fig.update_layout(font=dict(family='Inter, system-ui, sans-serif'))
96
-
97
- _, layout_dict = llm_chart_data_scrub({}, layout)
98
- if layout_dict:
99
- fig.update_layout(**layout_dict)
100
-
101
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
102
- iframe = _write_chart(fig, chart_path, chart_url)
103
-
104
- cluster_summary = df_clustered.groupby('Cluster')[feature_columns].mean().round(3)
105
- header_cells = '<th style="background:#1e40af;">Cluster</th>' + ''.join(
106
- f'<th>{escape(str(col))}</th>' for col in cluster_summary.columns
107
- )
108
- row_html = [
109
- '<tr>'
110
- + f'<td style="font-weight:600;color:#1e40af;background:#eff6ff;white-space:nowrap;">{escape(str(idx))}</td>'
111
- + ''.join(f'<td>{escape(str(val))}</td>' for val in row)
112
- + '</tr>'
113
- for idx, row in cluster_summary.iterrows()
114
- ]
115
- style = (
116
- '<style>'
117
- '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
118
- '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
119
- '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
120
- '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
121
- '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
122
- '.vda-table tbody tr:last-child td{border-bottom:none;}'
123
- '</style>'
124
- )
125
- summary_table = (
126
- '<div class="vda-table-wrap"><table class="vda-table">'
127
- f'<thead><tr>{header_cells}</tr></thead>'
128
- '<tbody>' + '\n'.join(row_html) + '</tbody>'
129
- '</table></div>'
130
- )
131
-
132
- return {"reply": f'{iframe}\n\n**Cluster Centroids (feature means per cluster):**\n{style}{summary_table}'}
133
-
134
- except Exception as e:
135
- print("KMEANS CLUSTERING ERROR")
136
- print(e)
137
- return {"reply": f"There was an error running K-Means clustering. Error: {e}. You should probably try again."}
138
-
139
-
140
- def hypothesis_test_func(test_type: str, column: str, session_hash, session_folder,
141
- column2: str = "", group_column: str = "",
142
- group_values: List[str] = [], pop_mean: float = 0.0, **kwargs):
143
- print("HYPOTHESIS TEST")
144
- try:
145
- from scipy import stats
146
- from html import escape
147
-
148
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
149
- csv_query_path = f'{dir_path}/query.csv'
150
- df = pd.read_csv(csv_query_path)
151
-
152
- if test_type == "t_test_independent":
153
- if not group_column or group_column not in df.columns:
154
- return {"reply": "Please specify a valid group_column for the independent t-test."}
155
- unique_groups = df[group_column].dropna().unique().tolist()
156
- if group_values and len(group_values) == 2:
157
- g1_label, g2_label = group_values[0], group_values[1]
158
- elif len(unique_groups) == 2:
159
- g1_label, g2_label = unique_groups[0], unique_groups[1]
160
- else:
161
- return {"reply": f"For an independent t-test, exactly 2 groups are needed. Found: {unique_groups}. Specify group_values with 2 entries."}
162
-
163
- g1 = df[df[group_column] == g1_label][column].dropna()
164
- g2 = df[df[group_column] == g2_label][column].dropna()
165
- t_stat, p_value = stats.ttest_ind(g1, g2)
166
-
167
- result_rows = [
168
- ("Test", "Independent Samples T-Test"),
169
- ("Column", column),
170
- ("Group Column", group_column),
171
- (f"Group 1", str(g1_label)),
172
- (f"Group 2", str(g2_label)),
173
- (f"Group 1 Mean (n={len(g1)})", f"{g1.mean():.4f}"),
174
- (f"Group 2 Mean (n={len(g2)})", f"{g2.mean():.4f}"),
175
- ("T-Statistic", f"{t_stat:.4f}"),
176
- ("P-Value", f"{p_value:.6f}"),
177
- ("Significant at α=0.05", "Yes ✓" if p_value < 0.05 else "No ✗"),
178
- ]
179
- title = f"T-Test: {column} by {group_column}"
180
-
181
- elif test_type == "t_test_one_sample":
182
- sample = df[column].dropna()
183
- t_stat, p_value = stats.ttest_1samp(sample, pop_mean)
184
- result_rows = [
185
- ("Test", "One-Sample T-Test"),
186
- ("Column", column),
187
- ("Hypothesized Mean (μ₀)", f"{pop_mean:.4f}"),
188
- (f"Sample Mean (n={len(sample)})", f"{sample.mean():.4f}"),
189
- ("Sample Std Dev", f"{sample.std():.4f}"),
190
- ("T-Statistic", f"{t_stat:.4f}"),
191
- ("P-Value", f"{p_value:.6f}"),
192
- ("Significant at α=0.05", "Yes ✓" if p_value < 0.05 else "No ✗"),
193
- ]
194
- title = f"One-Sample T-Test: {column} vs μ={pop_mean}"
195
-
196
- elif test_type == "chi_square":
197
- if not column2 or column2 not in df.columns:
198
- return {"reply": "Please specify a valid column2 for the chi-square test."}
199
- contingency = pd.crosstab(df[column], df[column2])
200
- chi2, p_value, dof, _ = stats.chi2_contingency(contingency)
201
- result_rows = [
202
- ("Test", "Chi-Square Test of Independence"),
203
- ("Column 1", column),
204
- ("Column 2", column2),
205
- ("Chi-Square Statistic", f"{chi2:.4f}"),
206
- ("Degrees of Freedom", str(dof)),
207
- ("P-Value", f"{p_value:.6f}"),
208
- ("Significant at α=0.05", "Yes ✓" if p_value < 0.05 else "No ✗"),
209
- ]
210
- title = f"Chi-Square: {column} × {column2}"
211
-
212
- else:
213
- return {"reply": f"Unknown test_type '{test_type}'. Use one of: t_test_independent, t_test_one_sample, chi_square."}
214
-
215
- style = (
216
- '<style>'
217
- '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
218
- '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
219
- '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
220
- '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
221
- '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
222
- '.vda-table tbody tr:last-child td{border-bottom:none;}'
223
- '</style>'
224
- )
225
- header_cells = f'<th style="background:#1e40af;" colspan="2">{escape(title)}</th>'
226
- row_html = [
227
- '<tr>'
228
- + f'<td style="font-weight:600;color:#1e40af;background:#eff6ff;white-space:nowrap;">{escape(label)}</td>'
229
- + f'<td>{escape(value)}</td>'
230
- + '</tr>'
231
- for label, value in result_rows
232
- ]
233
- table = (
234
- '<div class="vda-table-wrap"><table class="vda-table">'
235
- f'<thead><tr>{header_cells}</tr></thead>'
236
- '<tbody>' + '\n'.join(row_html) + '</tbody>'
237
- '</table></div>'
238
- )
239
- return {"reply": style + table}
240
-
241
- except Exception as e:
242
- print("HYPOTHESIS TEST ERROR")
243
- print(e)
244
- return {"reply": f"There was an error running the hypothesis test. Error: {e}. You should probably try again."}
245
-
246
-
247
- def regression_func(independent_variables: List[str], dependent_variable: str, session_hash, session_folder, category: str='', **kwargs):
248
  print("LINEAR REGRESSION CALCULATION")
249
  print(independent_variables)
250
  print(dependent_variable)
251
  try:
252
- dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
253
  chart_path = f'{dir_path}/chart.html'
254
  csv_query_path = f'{dir_path}/query.csv'
255
 
@@ -262,8 +30,11 @@ def regression_func(independent_variables: List[str], dependent_variable: str, s
262
  fig = scatter_chart_fig(df=df,x_column=independent_variables,y_column=dependent_variable,
263
  trendline="ols")
264
 
265
- chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
266
- iframe = _write_chart(fig, chart_path, chart_url)
 
 
 
267
 
268
  results_frame = px.get_trendline_results(fig)
269
 
 
5
  import plotly.express as px
6
  import plotly.io as pio
7
  import os
8
+ from functions import scatter_chart_fig
9
  from dotenv import load_dotenv
10
 
11
  load_dotenv()
12
 
13
+ root_url = os.getenv("ROOT_URL")
14
 
15
+ def regression_func(independent_variables: List[str], dependent_variable: str, session_hash, category: str=''):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  print("LINEAR REGRESSION CALCULATION")
17
  print(independent_variables)
18
  print(dependent_variable)
19
  try:
20
+ dir_path = TEMP_DIR / str(session_hash)
21
  chart_path = f'{dir_path}/chart.html'
22
  csv_query_path = f'{dir_path}/query.csv'
23
 
 
30
  fig = scatter_chart_fig(df=df,x_column=independent_variables,y_column=dependent_variable,
31
  trendline="ols")
32
 
33
+ pio.write_html(fig, chart_path, full_html=False)
34
+
35
+ chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
36
+
37
+ iframe = '<div style=overflow:auto;><iframe\n scrolling="yes"\n width="1000px"\n height="500px"\n src="' + chart_url + '"\n frameborder="0"\n allowfullscreen\n></iframe>\n</div>'
38
 
39
  results_frame = px.get_trendline_results(fig)
40
 
index.html ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Virtual Data Analyst</title>
7
+ <!-- Tailwind CSS -->
8
+ <script src="https://cdn.tailwindcss.com"></script>
9
+ <!-- Google Fonts -->
10
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
11
+ <!-- Font Awesome -->
12
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
13
+ <!-- Custom Styles -->
14
+ <link rel="stylesheet" href="styles.css">
15
+ <script>
16
+ tailwind.config = {
17
+ theme: {
18
+ extend: {
19
+ fontFamily: {
20
+ 'sans': ['Inter', 'sans-serif'],
21
+ },
22
+ colors: {
23
+ primary: '#3B82F6',
24
+ secondary: '#6B7280',
25
+ },
26
+ }
27
+ }
28
+ }
29
+ </script>
30
+ </head>
31
+ <body class="bg-gray-50 font-sans">
32
+ <div class="min-h-screen p-6">
33
+ <!-- Header -->
34
+ <header class="max-w-4xl mx-auto mb-12 text-center">
35
+ <h1 class="text-4xl font-bold text-gray-900 mb-4">Virtual Data Analyst</h1>
36
+ <p class="text-lg text-gray-600 mb-6">
37
+ A powerful tool for data analysis, visualizations, and insights
38
+ </p>
39
+ <div class="bg-blue-50 border border-blue-200 rounded-lg p-4 max-w-2xl mx-auto">
40
+ <h2 class="font-semibold text-blue-800 mb-2">
41
+ <i class="fas fa-info-circle mr-2"></i>Supported Files
42
+ </h2>
43
+ <div class="flex flex-wrap justify-center gap-3 text-blue-700">
44
+ <span class="tooltip">
45
+ <i class="fas fa-file-csv mr-1"></i>CSV
46
+ <span class="tooltip-text">Comma-separated values</span>
47
+ </span>
48
+ <span class="tooltip">
49
+ <i class="fas fa-file-alt mr-1"></i>TSV
50
+ <span class="tooltip-text">Tab-separated values</span>
51
+ </span>
52
+ <span class="tooltip">
53
+ <i class="fas fa-file-alt mr-1"></i>TXT
54
+ <span class="tooltip-text">Text files</span>
55
+ </span>
56
+ <span class="tooltip">
57
+ <i class="fas fa-file-excel mr-1"></i>XLS/XLSX
58
+ <span class="tooltip-text">Excel spreadsheets</span>
59
+ </span>
60
+ <span class="tooltip">
61
+ <i class="fas fa-file-code mr-1"></i>XML
62
+ <span class="tooltip-text">XML documents</span>
63
+ </span>
64
+ <span class="tooltip">
65
+ <i class="fas fa-file-code mr-1"></i>JSON
66
+ <span class="tooltip-text">JSON data files</span>
67
+ </span>
68
+ </div>
69
+ </div>
70
+ </header>
71
+
72
+ <!-- Main Content -->
73
+ <main class="max-w-4xl mx-auto">
74
+ <!-- File Upload Section -->
75
+ <div class="bg-white rounded-xl shadow-lg p-8 mb-8">
76
+ <div class="drop-zone border-2 border-dashed border-gray-300 rounded-lg p-12 text-center hover:border-primary cursor-pointer bg-gray-50 hover:bg-blue-50 transition-colors duration-300">
77
+ <input type="file" id="fileInput" class="hidden" accept=".csv,.tsv,.txt,.xls,.xlsx,.xml,.json">
78
+
79
+ <!-- Upload Icon & Success Checkmark -->
80
+ <div class="relative inline-block">
81
+ <i class="fas fa-cloud-upload-alt text-5xl text-gray-400 mb-4 upload-icon"></i>
82
+ <i class="fas fa-check-circle text-5xl success-checkmark absolute top-0 left-0"></i>
83
+ </div>
84
+
85
+ <!-- Loading Spinner -->
86
+ <div class="loading-spinner mx-auto mb-4"></div>
87
+
88
+ <h3 class="text-xl font-semibold text-gray-700 mb-2">Drop your data file here</h3>
89
+ <p class="text-gray-500 mb-4">or</p>
90
+ <button onclick="document.getElementById('fileInput').click()" class="bg-primary text-white px-6 py-3 rounded-lg hover:bg-blue-600 transition-colors duration-300">
91
+ <i class="fas fa-folder-open mr-2"></i>Browse Files
92
+ </button>
93
+
94
+ <!-- Progress Bar -->
95
+ <div class="progress-bar mt-4">
96
+ <div class="progress-bar-fill"></div>
97
+ </div>
98
+
99
+ <!-- File Info -->
100
+ <div id="fileInfo" class="hidden mt-4 p-4 bg-gray-100 rounded-lg">
101
+ <div class="flex items-center justify-center">
102
+ <i class="file-type-icon fas"></i>
103
+ <span class="file-name font-medium"></span>
104
+ </div>
105
+ <div class="text-sm text-gray-500 mt-2">
106
+ <span class="file-size"></span>
107
+ </div>
108
+ </div>
109
+
110
+ <p class="text-sm text-gray-500 mt-4">Maximum file size: 100MB</p>
111
+ </div>
112
+ </div>
113
+
114
+ <!-- Sample Data Section -->
115
+ <div class="bg-white rounded-xl shadow-lg p-8">
116
+ <h2 class="text-2xl font-semibold text-gray-800 mb-6">
117
+ <i class="fas fa-flask mr-2"></i>Try Sample Datasets
118
+ </h2>
119
+ <div class="grid md:grid-cols-2 gap-4">
120
+ <!-- Marketing Campaign Sample -->
121
+ <button class="sample-btn bg-gradient-to-r from-purple-500 to-indigo-600 text-white p-6 rounded-lg text-left hover:shadow-lg">
122
+ <div class="flex items-center mb-3">
123
+ <i class="fas fa-bullhorn text-2xl mr-3"></i>
124
+ <div>
125
+ <h3 class="text-lg font-semibold">Marketing Campaign Data</h3>
126
+ <p class="text-sm opacity-90">10,000 records</p>
127
+ </div>
128
+ </div>
129
+ <p class="text-sm opacity-90">Analyze customer responses to marketing campaigns and identify key success factors</p>
130
+ <div class="mt-4 text-xs opacity-75">
131
+ <i class="fas fa-table mr-1"></i> CSV format
132
+ </div>
133
+ </button>
134
+
135
+ <!-- Retail Data Sample -->
136
+ <button class="sample-btn bg-gradient-to-r from-green-500 to-teal-600 text-white p-6 rounded-lg text-left hover:shadow-lg">
137
+ <div class="flex items-center mb-3">
138
+ <i class="fas fa-shopping-cart text-2xl mr-3"></i>
139
+ <div>
140
+ <h3 class="text-lg font-semibold">Online Retail Data</h3>
141
+ <p class="text-sm opacity-90">50,000 records</p>
142
+ </div>
143
+ </div>
144
+ <p class="text-sm opacity-90">Explore sales patterns, customer behavior, and product performance</p>
145
+ <div class="mt-4 text-xs opacity-75">
146
+ <i class="fas fa-file-excel mr-1"></i> XLSX format
147
+ </div>
148
+ </button>
149
+ </div>
150
+ </div>
151
+
152
+ <!-- Features Preview -->
153
+ <div class="mt-12 grid md:grid-cols-3 gap-6">
154
+ <div class="feature-card bg-white p-6 rounded-lg shadow-md">
155
+ <i class="feature-icon fas fa-chart-line text-primary text-2xl mb-4"></i>
156
+ <h3 class="font-semibold text-gray-800 mb-2">Advanced Analytics</h3>
157
+ <p class="text-gray-600 text-sm">Run SQL queries, perform regressions, and analyze results with ease</p>
158
+ </div>
159
+ <div class="feature-card bg-white p-6 rounded-lg shadow-md">
160
+ <i class="feature-icon fas fa-chart-pie text-primary text-2xl mb-4"></i>
161
+ <h3 class="font-semibold text-gray-800 mb-2">Rich Visualizations</h3>
162
+ <p class="text-gray-600 text-sm">Create scatter plots, line charts, pie charts, and more</p>
163
+ </div>
164
+ <div class="feature-card bg-white p-6 rounded-lg shadow-md">
165
+ <i class="feature-icon fas fa-magic text-primary text-2xl mb-4"></i>
166
+ <h3 class="font-semibold text-gray-800 mb-2">Automated Insights</h3>
167
+ <p class="text-gray-600 text-sm">Get instant insights and recommendations for your data</p>
168
+ </div>
169
+ </div>
170
+ </main>
171
+
172
+ <!-- Footer -->
173
+ <footer class="max-w-4xl mx-auto mt-12 text-center text-gray-500 text-sm">
174
+ <p>This application is under active development. For bugs or feedback, please open a discussion in the community tab.</p>
175
+ </footer>
176
+ </div>
177
+
178
+ <!-- Results Section -->
179
+ <div id="results" class="max-w-4xl mx-auto mt-12 hidden">
180
+ <div class="bg-white rounded-xl shadow-lg p-8">
181
+ <div class="flex items-center justify-between mb-6">
182
+ <h2 class="text-2xl font-semibold text-gray-800">
183
+ <i class="fas fa-chart-bar mr-2"></i>Analysis Results
184
+ </h2>
185
+ <button onclick="closeResults()" class="text-gray-500 hover:text-gray-700">
186
+ <i class="fas fa-times"></i>
187
+ </button>
188
+ </div>
189
+
190
+ <!-- Loading State -->
191
+ <div id="resultsLoading" class="text-center py-12">
192
+ <div class="loading-spinner mx-auto mb-4"></div>
193
+ <p class="text-gray-600">Analyzing your data...</p>
194
+ </div>
195
+
196
+ <!-- Error State -->
197
+ <div id="resultsError" class="hidden">
198
+ <div class="bg-red-50 border border-red-200 rounded-lg p-4 text-red-700">
199
+ <i class="fas fa-exclamation-circle mr-2"></i>
200
+ <span id="errorMessage">An error occurred</span>
201
+ </div>
202
+ </div>
203
+
204
+ <!-- Results Content -->
205
+ <div id="resultsContent" class="hidden">
206
+ <!-- Basic Statistics -->
207
+ <div class="mb-8">
208
+ <h3 class="text-lg font-semibold text-gray-700 mb-4">Basic Statistics</h3>
209
+ <div id="basicStats" class="grid grid-cols-2 md:grid-cols-4 gap-4">
210
+ <!-- Stats will be inserted here -->
211
+ </div>
212
+ </div>
213
+
214
+ <!-- Data Preview -->
215
+ <div class="mb-8">
216
+ <h3 class="text-lg font-semibold text-gray-700 mb-4">Data Preview</h3>
217
+ <div class="overflow-x-auto">
218
+ <table id="dataPreview" class="min-w-full divide-y divide-gray-200">
219
+ <!-- Table content will be inserted here -->
220
+ </table>
221
+ </div>
222
+ </div>
223
+
224
+ <!-- Visualizations -->
225
+ <div class="mb-8">
226
+ <h3 class="text-lg font-semibold text-gray-700 mb-4">Visualizations</h3>
227
+ <div id="visualizations" class="grid grid-cols-1 md:grid-cols-2 gap-6">
228
+ <!-- Visualization charts will be inserted here -->
229
+ </div>
230
+ </div>
231
+
232
+ <!-- Insights -->
233
+ <div>
234
+ <h3 class="text-lg font-semibold text-gray-700 mb-4">Key Insights</h3>
235
+ <ul id="insights" class="space-y-3">
236
+ <!-- Insights will be inserted here -->
237
+ </ul>
238
+ </div>
239
+ </div>
240
+ </div>
241
+ </div>
242
+
243
+ <script src="script.js"></script>
244
+ </body>
245
+ </html>
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
- haystack-ai>=2.7.0
2
- anthropic-haystack
3
  python-dotenv
4
  gradio
5
  pandas
@@ -7,12 +6,3 @@ plotly
7
  openpyxl
8
  statsmodels
9
  xlrd
10
- psycopg2-binary
11
- pymongo
12
- pymongoarrow
13
- pymongo_schema
14
- pandasql
15
- pluck-graphql
16
- certifi==2025.1.31
17
- scipy
18
- scikit-learn
 
1
+ haystack-ai
 
2
  python-dotenv
3
  gradio
4
  pandas
 
6
  openpyxl
7
  statsmodels
8
  xlrd
 
 
 
 
 
 
 
 
 
samples/online_retail_data.csv CHANGED
The diff for this file is too large to render. See raw diff
 
samples/tb_illness_data.csv DELETED
The diff for this file is too large to render. See raw diff
 
script.js ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // API Configuration
2
+ const API_URL = 'https://nolanzandi-virtual-data-analyst.hf.space';
3
+ const PREDICT_ENDPOINT = `${API_URL}/api/predict`;
4
+ const SAMPLE_ENDPOINT = `${API_URL}/api/sample`;
5
+
6
+ // File Upload and API Integration
7
+ async function handleFileUpload(file) {
8
+ try {
9
+ // Show loading state
10
+ document.querySelector('.upload-icon').style.display = 'none';
11
+ document.querySelector('.loading-spinner').style.display = 'block';
12
+ document.querySelector('.progress-bar').style.display = 'block';
13
+
14
+ // Create FormData
15
+ const formData = new FormData();
16
+ formData.append('file', file);
17
+
18
+ // Update file info
19
+ updateFileInfo(file);
20
+
21
+ // Simulate progress while actually uploading
22
+ const progressInterval = simulateProgress();
23
+
24
+ // Make API request
25
+ const response = await fetch(API_URL, {
26
+ method: 'POST',
27
+ body: formData
28
+ });
29
+
30
+ if (!response.ok) {
31
+ throw new Error('API request failed');
32
+ }
33
+
34
+ const data = await response.json();
35
+
36
+ // Clear progress simulation
37
+ clearInterval(progressInterval);
38
+
39
+ // Show success state
40
+ showSuccessState();
41
+
42
+ // Handle API response
43
+ handleApiResponse(data);
44
+
45
+ } catch (error) {
46
+ console.error('Error:', error);
47
+ showErrorState(error.message);
48
+ }
49
+ }
50
+
51
+ function updateFileInfo(file) {
52
+ const fileInfo = document.getElementById('fileInfo');
53
+ const fileName = fileInfo.querySelector('.file-name');
54
+ const fileSize = fileInfo.querySelector('.file-size');
55
+ const fileIcon = fileInfo.querySelector('.file-type-icon');
56
+
57
+ const fileType = file.name.split('.').pop().toLowerCase();
58
+ const iconClass = getFileTypeIcon(fileType);
59
+
60
+ fileIcon.className = `file-type-icon fas ${iconClass}`;
61
+ fileName.textContent = file.name;
62
+ fileSize.textContent = formatFileSize(file.size);
63
+ fileInfo.classList.remove('hidden');
64
+ }
65
+
66
+ function simulateProgress() {
67
+ const progressBar = document.querySelector('.progress-bar-fill');
68
+ let progress = 0;
69
+
70
+ return setInterval(() => {
71
+ if (progress < 90) { // Only go up to 90% until we get actual completion
72
+ progress += 5;
73
+ progressBar.style.width = `${progress}%`;
74
+ }
75
+ }, 100);
76
+ }
77
+
78
+ function showSuccessState() {
79
+ document.querySelector('.loading-spinner').style.display = 'none';
80
+ document.querySelector('.success-checkmark').style.display = 'block';
81
+ document.querySelector('.progress-bar-fill').style.width = '100%';
82
+
83
+ setTimeout(() => {
84
+ resetUploadState();
85
+ }, 2000);
86
+ }
87
+
88
+ function showErrorState(message) {
89
+ // Reset upload UI
90
+ resetUploadState();
91
+
92
+ // Show error message
93
+ const errorDiv = document.createElement('div');
94
+ errorDiv.className = 'text-red-500 mt-4';
95
+ errorDiv.innerHTML = `<i class="fas fa-exclamation-circle mr-2"></i>${message}`;
96
+ document.querySelector('.drop-zone').appendChild(errorDiv);
97
+
98
+ setTimeout(() => {
99
+ errorDiv.remove();
100
+ }, 5000);
101
+ }
102
+
103
+ function resetUploadState() {
104
+ document.querySelector('.success-checkmark').style.display = 'none';
105
+ document.querySelector('.upload-icon').style.display = 'block';
106
+ document.querySelector('.progress-bar').style.display = 'none';
107
+ document.querySelector('.progress-bar-fill').style.width = '0%';
108
+ document.getElementById('fileInfo').classList.add('hidden');
109
+ }
110
+
111
+ function handleSampleDataClick(datasetName) {
112
+ // Show loading state in results section
113
+ const resultsSection = document.getElementById('results');
114
+ const resultsLoading = document.getElementById('resultsLoading');
115
+ const resultsContent = document.getElementById('resultsContent');
116
+ const resultsError = document.getElementById('resultsError');
117
+
118
+ resultsSection.classList.remove('hidden');
119
+ resultsLoading.classList.remove('hidden');
120
+ resultsContent.classList.add('hidden');
121
+ resultsError.classList.add('hidden');
122
+
123
+ // Simulate API delay
124
+ setTimeout(() => {
125
+ try {
126
+ // Mock data based on dataset type
127
+ const mockData = datasetName === 'marketing_campaign' ? {
128
+ statistics: {
129
+ rows: 10000,
130
+ columns: 15,
131
+ missing_values: 120,
132
+ data_types: ['numeric', 'categorical', 'datetime']
133
+ },
134
+ preview: {
135
+ columns: ['Campaign ID', 'Customer ID', 'Response', 'Channel'],
136
+ data: [
137
+ ['CAM001', 'C001', 'Converted', 'Email'],
138
+ ['CAM001', 'C002', 'No Response', 'SMS'],
139
+ ['CAM002', 'C003', 'Converted', 'Social Media']
140
+ ]
141
+ },
142
+ visualizations: [
143
+ {
144
+ title: 'Response Rate by Channel',
145
+ description: 'Conversion rates across different marketing channels',
146
+ image_url: 'https://via.placeholder.com/400x300'
147
+ },
148
+ {
149
+ title: 'Campaign Performance',
150
+ description: 'Success metrics for each campaign',
151
+ image_url: 'https://via.placeholder.com/400x300'
152
+ }
153
+ ],
154
+ insights: [
155
+ {
156
+ title: 'Best Performing Channel',
157
+ description: 'Email campaigns show highest conversion rate at 28%'
158
+ },
159
+ {
160
+ title: 'Optimal Send Time',
161
+ description: 'Campaigns sent between 2 PM - 4 PM have better engagement'
162
+ }
163
+ ]
164
+ } : {
165
+ statistics: {
166
+ rows: 50000,
167
+ columns: 12,
168
+ missing_values: 85,
169
+ data_types: ['numeric', 'categorical', 'datetime']
170
+ },
171
+ preview: {
172
+ columns: ['Order ID', 'Product', 'Quantity', 'Price'],
173
+ data: [
174
+ ['ORD001', 'Laptop', '1', '$999.99'],
175
+ ['ORD002', 'Mouse', '2', '$29.99'],
176
+ ['ORD003', 'Monitor', '1', '$299.99']
177
+ ]
178
+ },
179
+ visualizations: [
180
+ {
181
+ title: 'Sales by Category',
182
+ description: 'Distribution of sales across product categories',
183
+ image_url: 'https://via.placeholder.com/400x300'
184
+ },
185
+ {
186
+ title: 'Monthly Revenue',
187
+ description: 'Revenue trends over the past 12 months',
188
+ image_url: 'https://via.placeholder.com/400x300'
189
+ }
190
+ ],
191
+ insights: [
192
+ {
193
+ title: 'Top Products',
194
+ description: 'Electronics category generates 45% of total revenue'
195
+ },
196
+ {
197
+ title: 'Customer Behavior',
198
+ description: 'Average order value increased by 15% in Q4'
199
+ }
200
+ ]
201
+ };
202
+
203
+ handleApiResponse(mockData);
204
+ } catch (error) {
205
+ console.error('Error:', error);
206
+ showErrorState('Failed to process sample dataset');
207
+ }
208
+ }, 1000); // 1 second delay to show loading state
209
+ }
210
+
211
+ function handleApiResponse(data) {
212
+ const resultsSection = document.getElementById('results');
213
+ const resultsLoading = document.getElementById('resultsLoading');
214
+ const resultsContent = document.getElementById('resultsContent');
215
+ const resultsError = document.getElementById('resultsError');
216
+
217
+ // Show results section
218
+ resultsSection.classList.remove('hidden');
219
+ resultsLoading.classList.add('hidden');
220
+ resultsError.classList.add('hidden');
221
+ resultsContent.classList.remove('hidden');
222
+
223
+ // Update Basic Statistics
224
+ updateBasicStats(data.statistics);
225
+
226
+ // Update Data Preview
227
+ updateDataPreview(data.preview);
228
+
229
+ // Update Visualizations
230
+ updateVisualizations(data.visualizations);
231
+
232
+ // Update Insights
233
+ updateInsights(data.insights);
234
+ }
235
+
236
+ function updateBasicStats(statistics) {
237
+ const statsContainer = document.getElementById('basicStats');
238
+ statsContainer.innerHTML = '';
239
+
240
+ const stats = [
241
+ { label: 'Rows', value: statistics.rows, icon: 'fa-list' },
242
+ { label: 'Columns', value: statistics.columns, icon: 'fa-columns' },
243
+ { label: 'Missing Values', value: statistics.missing_values, icon: 'fa-exclamation-triangle' },
244
+ { label: 'Data Types', value: statistics.data_types.length, icon: 'fa-code' }
245
+ ];
246
+
247
+ stats.forEach(stat => {
248
+ const statDiv = document.createElement('div');
249
+ statDiv.className = 'bg-gray-50 rounded-lg p-4';
250
+ statDiv.innerHTML = `
251
+ <div class="flex items-center">
252
+ <i class="fas ${stat.icon} text-primary text-xl mr-3"></i>
253
+ <div>
254
+ <div class="text-sm text-gray-500">${stat.label}</div>
255
+ <div class="text-lg font-semibold">${stat.value}</div>
256
+ </div>
257
+ </div>
258
+ `;
259
+ statsContainer.appendChild(statDiv);
260
+ });
261
+ }
262
+
263
+ function updateDataPreview(preview) {
264
+ const table = document.getElementById('dataPreview');
265
+ table.innerHTML = '';
266
+
267
+ // Add header
268
+ const thead = document.createElement('thead');
269
+ thead.className = 'bg-gray-50';
270
+ const headerRow = document.createElement('tr');
271
+ preview.columns.forEach(column => {
272
+ const th = document.createElement('th');
273
+ th.className = 'px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider';
274
+ th.textContent = column;
275
+ headerRow.appendChild(th);
276
+ });
277
+ thead.appendChild(headerRow);
278
+ table.appendChild(thead);
279
+
280
+ // Add body
281
+ const tbody = document.createElement('tbody');
282
+ tbody.className = 'bg-white divide-y divide-gray-200';
283
+ preview.data.forEach(row => {
284
+ const tr = document.createElement('tr');
285
+ row.forEach(cell => {
286
+ const td = document.createElement('td');
287
+ td.className = 'px-6 py-4 whitespace-nowrap text-sm text-gray-500';
288
+ td.textContent = cell;
289
+ tr.appendChild(td);
290
+ });
291
+ tbody.appendChild(tr);
292
+ });
293
+ table.appendChild(tbody);
294
+ }
295
+
296
+ function updateVisualizations(visualizations) {
297
+ const container = document.getElementById('visualizations');
298
+ container.innerHTML = '';
299
+
300
+ visualizations.forEach(viz => {
301
+ const vizDiv = document.createElement('div');
302
+ vizDiv.className = 'bg-white rounded-lg p-4 shadow';
303
+ vizDiv.innerHTML = `
304
+ <h4 class="text-lg font-medium text-gray-800 mb-4">${viz.title}</h4>
305
+ <div class="aspect-w-16 aspect-h-9">
306
+ <img src="${viz.image_url}" alt="${viz.title}" class="rounded-lg">
307
+ </div>
308
+ <p class="mt-2 text-sm text-gray-600">${viz.description}</p>
309
+ `;
310
+ container.appendChild(vizDiv);
311
+ });
312
+ }
313
+
314
+ function updateInsights(insights) {
315
+ const insightsList = document.getElementById('insights');
316
+ insightsList.innerHTML = '';
317
+
318
+ insights.forEach(insight => {
319
+ const li = document.createElement('li');
320
+ li.className = 'bg-blue-50 rounded-lg p-4';
321
+ li.innerHTML = `
322
+ <div class="flex items-start">
323
+ <i class="fas fa-lightbulb text-yellow-500 mt-1 mr-3"></i>
324
+ <div>
325
+ <div class="font-medium text-blue-900">${insight.title}</div>
326
+ <p class="mt-1 text-sm text-blue-700">${insight.description}</p>
327
+ </div>
328
+ </div>
329
+ `;
330
+ insightsList.appendChild(li);
331
+ });
332
+ }
333
+
334
+ function closeResults() {
335
+ document.getElementById('results').classList.add('hidden');
336
+ }
337
+
338
+ function showErrorState(message) {
339
+ const resultsSection = document.getElementById('results');
340
+ const resultsLoading = document.getElementById('resultsLoading');
341
+ const resultsContent = document.getElementById('resultsContent');
342
+ const resultsError = document.getElementById('resultsError');
343
+ const errorMessage = document.getElementById('errorMessage');
344
+
345
+ resultsSection.classList.remove('hidden');
346
+ resultsLoading.classList.add('hidden');
347
+ resultsContent.classList.add('hidden');
348
+ resultsError.classList.remove('hidden');
349
+ errorMessage.textContent = message;
350
+ }
351
+
352
+ // Event Listeners
353
+ document.addEventListener('DOMContentLoaded', () => {
354
+ // File Upload Handling
355
+ const dropZone = document.querySelector('.drop-zone');
356
+ const fileInput = document.getElementById('fileInput');
357
+
358
+ // Prevent default drag behaviors
359
+ ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
360
+ dropZone.addEventListener(eventName, preventDefaults, false);
361
+ document.body.addEventListener(eventName, preventDefaults, false);
362
+ });
363
+
364
+ // Highlight drop zone when dragging over it
365
+ ['dragenter', 'dragover'].forEach(eventName => {
366
+ dropZone.addEventListener(eventName, highlight, false);
367
+ });
368
+
369
+ ['dragleave', 'drop'].forEach(eventName => {
370
+ dropZone.addEventListener(eventName, unhighlight, false);
371
+ });
372
+
373
+ // Handle dropped files
374
+ dropZone.addEventListener('drop', (e) => {
375
+ const dt = e.dataTransfer;
376
+ const files = dt.files;
377
+ if (files.length > 0) {
378
+ handleFileUpload(files[0]);
379
+ }
380
+ });
381
+
382
+ fileInput.addEventListener('change', (e) => {
383
+ if (e.target.files.length > 0) {
384
+ handleFileUpload(e.target.files[0]);
385
+ }
386
+ });
387
+
388
+ // Sample Data Button Handlers
389
+ const marketingBtn = document.querySelector('.sample-btn:nth-child(1)');
390
+ const retailBtn = document.querySelector('.sample-btn:nth-child(2)');
391
+
392
+ if (marketingBtn) {
393
+ marketingBtn.addEventListener('click', () => {
394
+ console.log('Marketing campaign button clicked');
395
+ handleSampleDataClick('marketing_campaign');
396
+ });
397
+ }
398
+
399
+ if (retailBtn) {
400
+ retailBtn.addEventListener('click', () => {
401
+ console.log('Online retail button clicked');
402
+ handleSampleDataClick('online_retail');
403
+ });
404
+ }
405
+ });
406
+
407
+ // Utility Functions
408
+ function preventDefaults(e) {
409
+ e.preventDefault();
410
+ e.stopPropagation();
411
+ }
412
+
413
+ function highlight(e) {
414
+ document.querySelector('.drop-zone').classList.add('border-primary', 'bg-blue-50');
415
+ }
416
+
417
+ function unhighlight(e) {
418
+ document.querySelector('.drop-zone').classList.remove('border-primary', 'bg-blue-50');
419
+ }
420
+
421
+ function getFileTypeIcon(fileType) {
422
+ const icons = {
423
+ 'csv': 'fa-file-csv',
424
+ 'tsv': 'fa-file-alt',
425
+ 'txt': 'fa-file-alt',
426
+ 'xls': 'fa-file-excel',
427
+ 'xlsx': 'fa-file-excel',
428
+ 'xml': 'fa-file-code',
429
+ 'json': 'fa-file-code'
430
+ };
431
+ return icons[fileType] || 'fa-file';
432
+ }
433
+
434
+ function formatFileSize(bytes) {
435
+ if (bytes === 0) return '0 Bytes';
436
+ const k = 1024;
437
+ const sizes = ['Bytes', 'KB', 'MB', 'GB'];
438
+ const i = Math.floor(Math.log(bytes) / Math.log(k));
439
+ return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
440
+ }
assets/styles.css → styles.css RENAMED
@@ -1,198 +1,168 @@
1
- /* Loading Animation */
2
- .loading-spinner {
3
- display: none;
4
- width: 50px;
5
- height: 50px;
6
- border: 5px solid #f3f3f3;
7
- border-top: 5px solid #3B82F6;
8
- border-radius: 50%;
9
- animation: spin 1s linear infinite;
10
- margin: 0 auto;
11
- }
12
-
13
- @keyframes spin {
14
- 0% { transform: rotate(0deg); }
15
- 100% { transform: rotate(360deg); }
16
- }
17
-
18
- /* File Upload Progress */
19
- .progress-bar {
20
- width: 100%;
21
- height: 6px;
22
- background-color: #e5e7eb;
23
- border-radius: 3px;
24
- overflow: hidden;
25
- display: none;
26
- margin: 1rem auto;
27
- max-width: 300px;
28
- }
29
-
30
- .progress-bar-fill {
31
- height: 100%;
32
- background-color: #3B82F6;
33
- width: 0%;
34
- transition: width 0.3s ease;
35
- }
36
-
37
- /* Tooltip */
38
- .tooltip {
39
- position: relative;
40
- display: inline-block;
41
- }
42
-
43
- .tooltip .tooltip-text {
44
- visibility: hidden;
45
- background-color: #1f2937;
46
- color: white;
47
- text-align: center;
48
- padding: 8px 12px;
49
- border-radius: 6px;
50
- position: absolute;
51
- z-index: 1;
52
- bottom: 125%;
53
- left: 50%;
54
- transform: translateX(-50%);
55
- opacity: 0;
56
- transition: opacity 0.3s;
57
- font-size: 0.875rem;
58
- white-space: nowrap;
59
- box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
60
- }
61
-
62
- .tooltip:hover .tooltip-text {
63
- visibility: visible;
64
- opacity: 1;
65
- }
66
-
67
- /* File Type Icons */
68
- .file-type-icon {
69
- font-size: 1.5rem;
70
- margin-right: 0.5rem;
71
- color: #3B82F6;
72
- }
73
-
74
- /* Success Animation */
75
- @keyframes checkmark {
76
- 0% { transform: scale(0); opacity: 0; }
77
- 50% { transform: scale(1.2); opacity: 0.8; }
78
- 100% { transform: scale(1); opacity: 1; }
79
- }
80
-
81
- .success-checkmark {
82
- display: none;
83
- color: #10B981;
84
- animation: checkmark 0.5s ease-in-out forwards;
85
- }
86
-
87
- /* Sample Data Cards */
88
- .sample-btn {
89
- transition: all 0.3s ease;
90
- position: relative;
91
- overflow: hidden;
92
- background: linear-gradient(135deg, #3B82F6, #0ea5e9) !important;
93
- }
94
-
95
- .sample-btn::after {
96
- content: '';
97
- position: absolute;
98
- top: 0;
99
- left: 0;
100
- width: 100%;
101
- height: 100%;
102
- background: linear-gradient(rgba(255,255,255,0.12), rgba(255,255,255,0));
103
- transform: translateY(-100%);
104
- transition: transform 0.3s ease;
105
- }
106
-
107
- .sample-btn:hover::after {
108
- transform: translateY(0);
109
- }
110
-
111
- .sample-btn:hover {
112
- transform: translateY(-2px);
113
- box-shadow: 0 8px 20px rgba(59,130,246,0.3);
114
- }
115
-
116
- /* Status badge fade-in */
117
- @keyframes fadeSlideIn {
118
- from { opacity: 0; transform: translateY(-6px); }
119
- to { opacity: 1; transform: translateY(0); }
120
- }
121
-
122
- .api-status-badge {
123
- animation: fadeSlideIn 0.35s ease forwards;
124
- }
125
-
126
- /* Drop Zone Enhancements */
127
- .drop-zone {
128
- transition: all 0.3s ease;
129
- position: relative;
130
- overflow: hidden;
131
- }
132
-
133
- .drop-zone::before {
134
- position: absolute;
135
- top: 0;
136
- left: 0;
137
- right: 0;
138
- bottom: 0;
139
- border-radius: 8px;
140
- border: 2px dashed #3B82F6;
141
- opacity: 0;
142
- transition: opacity 0.3s ease;
143
- }
144
-
145
- .drop-zone:hover::before {
146
- opacity: 1;
147
- }
148
-
149
- /* File Info Card */
150
- #fileInfo {
151
- background: linear-gradient(to right, #f8fafc, #f1f5f9);
152
- border: 1px solid #e2e8f0;
153
- transition: all 0.3s ease;
154
- }
155
-
156
- #fileInfo:hover {
157
- transform: translateY(-2px);
158
- box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
159
- }
160
-
161
- /* Features Section */
162
- .feature-card {
163
- transition: all 0.3s ease;
164
- }
165
-
166
- .feature-card:hover {
167
- transform: translateY(-2px);
168
- box-shadow: 0 8px 15px rgba(0,0,0,0.1);
169
- }
170
-
171
- .feature-icon {
172
- transition: all 0.3s ease;
173
- }
174
-
175
- .feature-card:hover .feature-icon {
176
- transform: scale(1.1);
177
- color: #2563eb;
178
- }
179
-
180
- @media only screen and (max-width: 600px) {
181
- .feature-card p {grid-column: 1/3;}
182
- .feature-card i, .feature-card h3 {text-align: center;}
183
- .feature-card {
184
- display: grid;
185
- grid-template-columns: 1fr 2fr;
186
- align-items: baseline;
187
- }
188
- }
189
-
190
- dialog {
191
- margin: 10% auto;
192
- width: 80%;
193
- max-width: 350px;
194
- background-color: #fff;
195
- padding: 34px;
196
- border: 0;
197
- border-radius: 5px;
198
- }
 
1
+ /* Loading Animation */
2
+ .loading-spinner {
3
+ display: none;
4
+ width: 50px;
5
+ height: 50px;
6
+ border: 5px solid #f3f3f3;
7
+ border-top: 5px solid #3B82F6;
8
+ border-radius: 50%;
9
+ animation: spin 1s linear infinite;
10
+ margin: 0 auto;
11
+ }
12
+
13
+ @keyframes spin {
14
+ 0% { transform: rotate(0deg); }
15
+ 100% { transform: rotate(360deg); }
16
+ }
17
+
18
+ /* File Upload Progress */
19
+ .progress-bar {
20
+ width: 100%;
21
+ height: 6px;
22
+ background-color: #e5e7eb;
23
+ border-radius: 3px;
24
+ overflow: hidden;
25
+ display: none;
26
+ margin: 1rem auto;
27
+ max-width: 300px;
28
+ }
29
+
30
+ .progress-bar-fill {
31
+ height: 100%;
32
+ background-color: #3B82F6;
33
+ width: 0%;
34
+ transition: width 0.3s ease;
35
+ }
36
+
37
+ /* Tooltip */
38
+ .tooltip {
39
+ position: relative;
40
+ display: inline-block;
41
+ }
42
+
43
+ .tooltip .tooltip-text {
44
+ visibility: hidden;
45
+ background-color: #1f2937;
46
+ color: white;
47
+ text-align: center;
48
+ padding: 8px 12px;
49
+ border-radius: 6px;
50
+ position: absolute;
51
+ z-index: 1;
52
+ bottom: 125%;
53
+ left: 50%;
54
+ transform: translateX(-50%);
55
+ opacity: 0;
56
+ transition: opacity 0.3s;
57
+ font-size: 0.875rem;
58
+ white-space: nowrap;
59
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
60
+ }
61
+
62
+ .tooltip:hover .tooltip-text {
63
+ visibility: visible;
64
+ opacity: 1;
65
+ }
66
+
67
+ /* File Type Icons */
68
+ .file-type-icon {
69
+ font-size: 1.5rem;
70
+ margin-right: 0.5rem;
71
+ color: #3B82F6;
72
+ }
73
+
74
+ /* Success Animation */
75
+ @keyframes checkmark {
76
+ 0% { transform: scale(0); opacity: 0; }
77
+ 50% { transform: scale(1.2); opacity: 0.8; }
78
+ 100% { transform: scale(1); opacity: 1; }
79
+ }
80
+
81
+ .success-checkmark {
82
+ display: none;
83
+ color: #10B981;
84
+ animation: checkmark 0.5s ease-in-out forwards;
85
+ }
86
+
87
+ /* Sample Data Cards */
88
+ .sample-btn {
89
+ transition: all 0.3s ease;
90
+ position: relative;
91
+ overflow: hidden;
92
+ }
93
+
94
+ .sample-btn::after {
95
+ content: '';
96
+ position: absolute;
97
+ top: 0;
98
+ left: 0;
99
+ width: 100%;
100
+ height: 100%;
101
+ background: linear-gradient(rgba(255,255,255,0.1), rgba(255,255,255,0));
102
+ transform: translateY(-100%);
103
+ transition: transform 0.3s ease;
104
+ }
105
+
106
+ .sample-btn:hover::after {
107
+ transform: translateY(0);
108
+ }
109
+
110
+ .sample-btn:hover {
111
+ transform: translateY(-2px);
112
+ box-shadow: 0 8px 15px rgba(0,0,0,0.1);
113
+ }
114
+
115
+ /* Drop Zone Enhancements */
116
+ .drop-zone {
117
+ transition: all 0.3s ease;
118
+ position: relative;
119
+ overflow: hidden;
120
+ }
121
+
122
+ .drop-zone::before {
123
+ content: '';
124
+ position: absolute;
125
+ top: 0;
126
+ left: 0;
127
+ right: 0;
128
+ bottom: 0;
129
+ border-radius: 8px;
130
+ border: 2px dashed #3B82F6;
131
+ opacity: 0;
132
+ transition: opacity 0.3s ease;
133
+ }
134
+
135
+ .drop-zone:hover::before {
136
+ opacity: 1;
137
+ }
138
+
139
+ /* File Info Card */
140
+ #fileInfo {
141
+ background: linear-gradient(to right, #f8fafc, #f1f5f9);
142
+ border: 1px solid #e2e8f0;
143
+ transition: all 0.3s ease;
144
+ }
145
+
146
+ #fileInfo:hover {
147
+ transform: translateY(-2px);
148
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
149
+ }
150
+
151
+ /* Features Section */
152
+ .feature-card {
153
+ transition: all 0.3s ease;
154
+ }
155
+
156
+ .feature-card:hover {
157
+ transform: translateY(-2px);
158
+ box-shadow: 0 8px 15px rgba(0,0,0,0.1);
159
+ }
160
+
161
+ .feature-icon {
162
+ transition: all 0.3s ease;
163
+ }
164
+
165
+ .feature-card:hover .feature-icon {
166
+ transform: scale(1.1);
167
+ color: #2563eb;
168
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
temp/.gitignore DELETED
@@ -1,2 +0,0 @@
1
- *
2
- !.gitignore
 
 
 
templates/data_file.py DELETED
@@ -1,286 +0,0 @@
1
- import gradio as gr
2
- from functions import example_question_generator, chatbot_func
3
- from data_sources import process_data_upload
4
- from utils import message_dict
5
- import ast
6
- import html as _html
7
-
8
- def build_summary_modal(stats):
9
- num_rows = stats['num_rows']
10
- num_cols = stats['num_cols']
11
- total_missing = stats['total_missing']
12
- duplicate_rows = stats.get('duplicate_rows', 0)
13
- file_size_bytes = stats.get('file_size_bytes', 0)
14
-
15
- def _fmt_num(v):
16
- try:
17
- if v != v: return '—' # NaN
18
- abs_v = abs(v)
19
- if abs_v >= 1e9: return f"{v/1e9:.1f}B"
20
- if abs_v >= 1e6: return f"{v/1e6:.1f}M"
21
- if abs_v >= 1e3: return f"{v:,.0f}" if v == int(v) else f"{v:,.1f}"
22
- return f"{v:,.0f}" if v == int(v) else f"{v:.2f}"
23
- except Exception:
24
- return str(v)
25
-
26
- def _fmt_size(b):
27
- if not b: return ''
28
- if b < 1024: return f"{b} B"
29
- if b < 1024 ** 2: return f"{b / 1024:.1f} KB"
30
- if b < 1024 ** 3: return f"{b / 1024 ** 2:.1f} MB"
31
- return f"{b / 1024 ** 3:.2f} GB"
32
-
33
- file_size_label = _fmt_size(file_size_bytes)
34
- dup_color = "#ef4444" if duplicate_rows > 0 else "#a16207"
35
- dup_bg = "#fef2f2" if duplicate_rows > 0 else "#fefce8"
36
- dup_border = "#fecaca" if duplicate_rows > 0 else "#fde68a"
37
-
38
- dtype_rows_html = ""
39
- for i, (col, dtype) in enumerate(stats['dtypes'].items()):
40
- bg = "#ffffff" if i % 2 == 0 else "#f9fafb"
41
- missing = stats['missing_per_col'].get(col, 0)
42
- pct_missing = (missing / num_rows * 100) if num_rows > 0 else 0
43
- missing_color = "#ef4444" if missing > 0 else "#9ca3af"
44
- missing_weight = "600" if missing > 0 else "400"
45
- missing_cell = f'{missing:,} <span style="color:#9ca3af;font-size:0.7rem;">({pct_missing:.1f}%)</span>'
46
-
47
- unique = stats.get('unique_counts', {}).get(col, '—')
48
- is_id = isinstance(unique, int) and num_rows > 0 and (unique / num_rows) >= 0.95 and unique > 10
49
- id_badge = ' <span style="background:#fef3c7;color:#92400e;padding:1px 5px;border-radius:3px;font-size:0.65rem;vertical-align:middle;">ID?</span>' if is_id else ''
50
- unique_cell = f'{unique:,}{id_badge}' if isinstance(unique, int) else str(unique)
51
-
52
- cs = stats.get('col_stats', {}).get(col, {})
53
- if cs.get('type') == 'numeric':
54
- stats_cell = (
55
- f'<span style="font-size:0.74rem;color:#6b7280;line-height:1.6;">'
56
- f'{_fmt_num(cs["min"])} – {_fmt_num(cs["max"])}'
57
- f'<br><span style="color:#9ca3af;">avg {_fmt_num(cs["mean"])}</span></span>'
58
- )
59
- elif cs.get('type') == 'datetime':
60
- stats_cell = (
61
- f'<span style="font-size:0.74rem;color:#6b7280;line-height:1.6;">'
62
- f'{cs["min"]}<br>→ {cs["max"]}</span>'
63
- )
64
- else:
65
- stats_cell = '<span style="color:#d1d5db;">—</span>'
66
-
67
- dtype_rows_html += (
68
- f'<tr style="background:{bg}">'
69
- f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;color:#111827;white-space:nowrap;">{_html.escape(col)}</td>'
70
- f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;white-space:nowrap;"><span style="background:#dbeafe;color:#1e40af;padding:2px 8px;border-radius:4px;font-size:0.74rem;">{dtype}</span></td>'
71
- f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;text-align:right;color:{missing_color};font-weight:{missing_weight};white-space:nowrap;">{missing_cell}</td>'
72
- f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;text-align:right;white-space:nowrap;color:#374151;">{unique_cell}</td>'
73
- f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;">{stats_cell}</td>'
74
- f'</tr>'
75
- )
76
-
77
- preview_headers_html = "".join(
78
- f'<th style="padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;text-align:left;">{_html.escape(col)}</th>'
79
- for col in stats['preview_cols']
80
- )
81
-
82
- preview_rows_html = ""
83
- for i, row in enumerate(stats['preview']):
84
- bg = "#ffffff" if i % 2 == 0 else "#f9fafb"
85
- cells = "".join(
86
- f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;color:#374151;white-space:nowrap;">{_html.escape(str(cell))}</td>'
87
- for cell in row
88
- )
89
- preview_rows_html += f'<tr style="background:{bg}">{cells}</tr>'
90
-
91
- size_tag = f'<span style="background:rgba(255,255,255,0.2);color:#fff;padding:2px 10px;border-radius:12px;font-size:0.75rem;font-weight:400;">{file_size_label}</span>' if file_size_label else ''
92
-
93
- return f"""
94
- <div class="vda-modal-overlay" style="position:fixed;inset:0;background:rgba(0,0,0,0.55);z-index:9999;display:flex;align-items:center;justify-content:center;font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;">
95
- <div style="background:#fff;border-radius:14px;width:90%;max-width:800px;max-height:88vh;display:flex;flex-direction:column;box-shadow:0 25px 50px -12px rgba(0,0,0,0.35);overflow:hidden;">
96
- <div style="background:linear-gradient(135deg,#3B82F6,#0ea5e9);padding:16px 20px;display:flex;justify-content:space-between;align-items:center;flex-shrink:0;gap:12px;">
97
- <div style="display:flex;align-items:center;gap:10px;">
98
- <span style="color:#fff;font-weight:600;font-size:1rem;">Dataset Summary</span>
99
- {size_tag}
100
- </div>
101
- <button onclick="document.querySelectorAll('.vda-modal-overlay').forEach(function(e){{e.remove()}})" style="background:rgba(255,255,255,0.2);border:none;color:#fff;width:30px;height:30px;border-radius:50%;cursor:pointer;font-size:1rem;line-height:1;flex-shrink:0;">&#x2715;</button>
102
- </div>
103
- <div style="padding:20px;overflow-y:auto;flex:1;">
104
- <div style="display:grid;grid-template-columns:1fr 1fr 1fr 1fr;gap:10px;margin-bottom:20px;">
105
- <div style="background:#eff6ff;border:1px solid #bfdbfe;border-radius:8px;padding:12px;text-align:center;">
106
- <div style="font-size:1.4rem;font-weight:700;color:#1d4ed8;">{num_rows:,}</div>
107
- <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Rows</div>
108
- </div>
109
- <div style="background:#f0fdf4;border:1px solid #bbf7d0;border-radius:8px;padding:12px;text-align:center;">
110
- <div style="font-size:1.4rem;font-weight:700;color:#15803d;">{num_cols}</div>
111
- <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Columns</div>
112
- </div>
113
- <div style="background:#fefce8;border:1px solid #fde68a;border-radius:8px;padding:12px;text-align:center;">
114
- <div style="font-size:1.4rem;font-weight:700;color:#a16207;">{total_missing:,}</div>
115
- <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Missing Values</div>
116
- </div>
117
- <div style="background:{dup_bg};border:1px solid {dup_border};border-radius:8px;padding:12px;text-align:center;">
118
- <div style="font-size:1.4rem;font-weight:700;color:{dup_color};">{duplicate_rows:,}</div>
119
- <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Duplicate Rows</div>
120
- </div>
121
- </div>
122
- <div style="margin-bottom:20px;">
123
- <div style="font-size:0.78rem;font-weight:600;color:#374151;text-transform:uppercase;letter-spacing:0.06em;margin-bottom:10px;">Column Info</div>
124
- <div style="border:1px solid #e5e7eb;border-radius:8px;overflow:hidden;">
125
- <div style="max-height:210px;overflow:auto;">
126
- <table style="border-collapse:collapse;font-size:0.83rem;min-width:100%;">
127
- <thead style="background:#f9fafb;position:sticky;top:0;z-index:1;">
128
- <tr>
129
- <th style="text-align:left;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Column</th>
130
- <th style="text-align:left;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Type</th>
131
- <th style="text-align:right;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Missing</th>
132
- <th style="text-align:right;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Unique</th>
133
- <th style="text-align:left;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Stats / Range</th>
134
- </tr>
135
- </thead>
136
- <tbody>{dtype_rows_html}</tbody>
137
- </table>
138
- </div>
139
- </div>
140
- </div>
141
- <div>
142
- <div style="font-size:0.78rem;font-weight:600;color:#374151;text-transform:uppercase;letter-spacing:0.06em;margin-bottom:10px;">Data Preview (first 5 rows)</div>
143
- <div style="border:1px solid #e5e7eb;border-radius:8px;overflow:hidden;">
144
- <div style="overflow:auto;max-height:200px;">
145
- <table style="border-collapse:collapse;font-size:0.8rem;">
146
- <thead style="background:#f9fafb;position:sticky;top:0;z-index:1;">
147
- <tr>{preview_headers_html}</tr>
148
- </thead>
149
- <tbody>{preview_rows_html}</tbody>
150
- </table>
151
- </div>
152
- </div>
153
- </div>
154
- </div>
155
- </div>
156
- </div>
157
- """
158
-
159
- def run_example(input):
160
- return input
161
-
162
- def example_display(input):
163
- if input == None:
164
- display = True
165
- else:
166
- display = False
167
- return [gr.update(visible=display), gr.update(visible=display), gr.update(visible=display), gr.update(visible=display)]
168
-
169
- with gr.Blocks() as demo:
170
- description = gr.HTML("""
171
- <div class="max-w-4xl mx-auto mb-12 text-center">
172
- <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto">
173
- <h2 class="font-semibold text-blue-800 ">
174
- <i class="fas fa-info-circle mr-2"></i>Supported Files
175
- </h2>
176
- <div class="flex flex-wrap justify-center gap-3 pb-4 text-blue-700">
177
- <span class="tooltip">
178
- <i class="fas fa-file-csv mr-1"></i>CSV
179
- <span class="tooltip-text">Comma-separated values</span>
180
- </span>
181
- <span class="tooltip">
182
- <i class="fas fa-file-alt mr-1"></i>TSV
183
- <span class="tooltip-text">Tab-separated values</span>
184
- </span>
185
- <span class="tooltip">
186
- <i class="fas fa-file-alt mr-1"></i>TXT
187
- <span class="tooltip-text">Text files</span>
188
- </span>
189
- <span class="tooltip">
190
- <i class="fas fa-file-excel mr-1"></i>XLS/XLSX
191
- <span class="tooltip-text">Excel spreadsheets</span>
192
- </span>
193
- <span class="tooltip">
194
- <i class="fas fa-file-code mr-1"></i>XML
195
- <span class="tooltip-text">XML documents</span>
196
- </span>
197
- <span class="tooltip">
198
- <i class="fas fa-file-code mr-1"></i>JSON
199
- <span class="tooltip-text">JSON data files</span>
200
- </span>
201
- </div>
202
- </div>
203
- </div>
204
- """, elem_classes="description_component")
205
- example_file_1 = gr.File(visible=False, value="samples/bank_marketing_campaign.csv")
206
- example_file_2 = gr.File(visible=False, value="samples/online_retail_data.csv")
207
- example_file_3 = gr.File(visible=False, value="samples/tb_illness_data.csv")
208
- with gr.Row():
209
- example_btn_1 = gr.Button(value="Try Me: bank_marketing_campaign.csv", elem_classes="sample-btn bg-gradient-to-r from-blue-500 to-sky-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
210
- example_btn_2 = gr.Button(value="Try Me: online_retail_data.csv", elem_classes="sample-btn bg-gradient-to-r from-blue-500 to-sky-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
211
- example_btn_3 = gr.Button(value="Try Me: tb_illness_data.csv", elem_classes="sample-btn bg-gradient-to-r from-blue-500 to-sky-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
212
-
213
- file_output = gr.File(label="Data File (CSV, TSV, TXT, XLS, XLSX, XML, JSON)", show_label=True, elem_classes="file_marker drop-zone border-2 border-dashed border-gray-300 rounded-lg hover:border-primary cursor-pointer bg-gray-50 hover:bg-blue-50 transition-colors duration-300", file_types=['.csv', '.xlsx', '.txt', '.json', '.ndjson', '.xml', '.xls', '.tsv'])
214
- example_btn_1.click(fn=run_example, inputs=example_file_1, outputs=file_output)
215
- example_btn_2.click(fn=run_example, inputs=example_file_2, outputs=file_output)
216
- example_btn_3.click(fn=run_example, inputs=example_file_3, outputs=file_output)
217
- file_output.change(fn=example_display, inputs=file_output, outputs=[example_btn_1, example_btn_2, example_btn_3, description])
218
-
219
- @gr.render(inputs=file_output)
220
- def data_options(filename, request: gr.Request):
221
- print(filename)
222
- if request.session_hash not in message_dict:
223
- message_dict[request.session_hash] = {}
224
- message_dict[request.session_hash]['file_upload'] = None
225
- if filename:
226
- process_message = process_upload(filename, request.session_hash)
227
- gr.HTML(value=process_message[1], padding=False)
228
- if process_message[0] == "success":
229
- gr.HTML(value=build_summary_modal(process_message[3]), padding=False)
230
- if "bank_marketing_campaign" in filename:
231
- example_questions = [
232
- ["Describe the dataset"],
233
- ["What levels of education have the highest and lowest average balance?"],
234
- ["What job is most and least common for a yes response from the individuals, not counting 'unknown'?"],
235
- ["Can you generate a bar chart of education vs. average balance?"],
236
- ["Can you generate a table of levels of education versus average balance, percent married, percent with a loan, and percent in default?"],
237
- ["Can we predict the relationship between the number of contacts performed before this campaign and the average balance?"],
238
- ["Can you plot the number of contacts performed before this campaign versus the duration and use balance as the size in a bubble chart?"]
239
- ]
240
- elif "online_retail_data" in filename:
241
- example_questions = [
242
- ["Describe the dataset"],
243
- ["What month had the highest revenue?"],
244
- ["Is revenue higher in the morning or afternoon?"],
245
- ["Can you generate a line graph of revenue per month?"],
246
- ["Can you generate a table of revenue per month?"],
247
- ["Can we predict how time of day affects transaction value in this data set?"],
248
- ["Can you plot revenue per month with size being the number of units sold that month in a bubble chart?"]
249
- ]
250
- else:
251
- try:
252
- generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'file_upload', '', process_message[2], ''))
253
- example_questions = [["Describe the dataset"]]
254
- for example in generated_examples:
255
- example_questions.append([example])
256
- except Exception as e:
257
- print("DATA FILE QUESTION GENERATION ERROR")
258
- print(e)
259
- example_questions = [
260
- ["Describe the dataset"],
261
- ["List the columns in the dataset"],
262
- ["What could this data be used for?"],
263
- ]
264
- session_hash = gr.Textbox(visible=False, value=request.session_hash)
265
- data_source = gr.Textbox(visible=False, value='file_upload')
266
- schema = gr.Textbox(visible=False, value='')
267
- titles = gr.Textbox(value=process_message[2], interactive=False, visible=False)
268
- bot = gr.Chatbot(type='messages', label="CSV Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
269
- chat = gr.ChatInterface(
270
- fn=chatbot_func,
271
- type='messages',
272
- chatbot=bot,
273
- title="Chat with your data file",
274
- concurrency_limit=None,
275
- examples=example_questions,
276
- additional_inputs=[session_hash, data_source, titles, schema]
277
- )
278
-
279
- def process_upload(upload_value, session_hash):
280
- if upload_value:
281
- process_message = process_data_upload(upload_value, session_hash)
282
- return process_message
283
-
284
-
285
- if __name__ == "__main__":
286
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
templates/doc_db.py DELETED
@@ -1,105 +0,0 @@
1
- import ast
2
- import gradio as gr
3
- from functions import example_question_generator, chatbot_func
4
- from data_sources import connect_doc_db
5
- from utils import message_dict
6
-
7
- with gr.Blocks() as demo:
8
- with gr.Accordion("ℹ️ About the MongoDB Connector", open=False):
9
- gr.HTML("""
10
- <div class="max-w-4xl mx-auto text-center">
11
- <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto p-4">
12
- <p>Connect to a MongoDB database and query it using natural language.</p>
13
- <p style="font-weight:bold;">
14
- No credentials are retained — they are passed as session variables and disappear when you leave or refresh.
15
- Queries use PyMongoArrow's <code>aggregate_pandas_all</code>, which cannot delete, drop, or insert documents.
16
- Use caution connecting production databases to third-party tools.
17
- </p>
18
- <p>Contact me if you'd like this built for your organization with proper infrastructure and security controls.</p>
19
- </div>
20
- </div>
21
- """)
22
-
23
- gr.HTML("""
24
- <div style="max-width:560px;margin:8px auto 4px;padding:8px 14px;background:#f0f9ff;
25
- border:1px solid #bae6fd;border-radius:8px;text-align:center;">
26
- <p style="margin:0;font-size:13px;color:#0369a1;">
27
- <i class="fas fa-flask" style="margin-right:6px;"></i>
28
- <strong>Demo credentials pre-filled.</strong>
29
- &nbsp;Replace with your own database details to analyze your own data.
30
- </p>
31
- </div>
32
- """)
33
-
34
- connection_string = gr.Textbox(label="Connection String", value="dataanalyst0.l1klmww.mongodb.net/")
35
- with gr.Row():
36
- connection_user = gr.Textbox(label="Connection User", value="virtual-data-analyst")
37
- connection_password = gr.Textbox(label="Connection Password", value="zcpbmoGJ3mC8o", type="password")
38
- doc_db_name = gr.Textbox(label="Database Name", value="sample_mflix")
39
-
40
- gr.HTML("""
41
- <p style="text-align:center;font-size:13px;color:#6b7280;margin:4px 0 8px;">
42
- <i class="fas fa-circle-info" style="margin-right:4px;"></i>
43
- Schema analysis runs on connect — this may take 1–2 minutes for large databases.
44
- </p>
45
- """)
46
- submit = gr.Button(value="Connect", variant="primary")
47
-
48
- @gr.render(inputs=[connection_string, connection_user, connection_password, doc_db_name], triggers=[submit.click])
49
- def db_chat(request: gr.Request, connection_string=connection_string.value, connection_user=connection_user.value, connection_password=connection_password.value, doc_db_name=doc_db_name.value):
50
- if request.session_hash not in message_dict:
51
- message_dict[request.session_hash] = {}
52
- message_dict[request.session_hash]['doc_db'] = None
53
- connection_login_value = "mongodb+srv://" + connection_user + ":" + connection_password + "@" + connection_string
54
- if connection_login_value:
55
- print("MONGO APP")
56
- process_message = process_doc_db(connection_login_value, doc_db_name, request.session_hash)
57
- gr.HTML(value=process_message[1], padding=False)
58
- if process_message[0] == "success":
59
- if "dataanalyst0.l1klmww.mongodb.net" in connection_login_value:
60
- example_questions = [
61
- ["Describe the dataset"],
62
- ["What are the top 5 most common movie genres?"],
63
- ["How do user comment counts on a movie correlate with the movie award wins?"],
64
- ["Can you generate a pie chart showing the top 10 states with the most movie theaters?"],
65
- ["What are the top 10 most represented directors in the database?"],
66
- ["What are the different movie categories and how many movies are in each category?"]
67
- ]
68
- else:
69
- try:
70
- generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'doc_db', doc_db_name, process_message[2], process_message[3]))
71
- example_questions = [["Describe the dataset"]]
72
- for example in generated_examples:
73
- example_questions.append([example])
74
- except Exception as e:
75
- print("DOC DB QUESTION GENERATION ERROR")
76
- print(e)
77
- example_questions = [
78
- ["Describe the dataset"],
79
- ["List the collections in the database"],
80
- ["What could this data be used for?"],
81
- ]
82
- session_hash = gr.Textbox(visible=False, value=request.session_hash)
83
- db_connection_string = gr.Textbox(visible=False, value=connection_login_value)
84
- db_name = gr.Textbox(visible=False, value=doc_db_name)
85
- titles = gr.Textbox(value=process_message[2], interactive=False, label="DB Collections")
86
- data_source = gr.Textbox(visible=False, value='doc_db')
87
- schema = gr.Textbox(visible=False, value=process_message[3])
88
- bot = gr.Chatbot(type='messages', label="MongoDB Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
89
- chat = gr.ChatInterface(
90
- fn=chatbot_func,
91
- type='messages',
92
- chatbot=bot,
93
- title="Chat with your Database",
94
- examples=example_questions,
95
- concurrency_limit=None,
96
- additional_inputs=[session_hash, data_source, titles, schema, db_connection_string, db_name]
97
- )
98
-
99
- def process_doc_db(connection_string, nosql_db_name, session_hash):
100
- if connection_string:
101
- process_message = connect_doc_db(connection_string, nosql_db_name, session_hash)
102
- return process_message
103
-
104
- if __name__ == "__main__":
105
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
templates/graphql.py DELETED
@@ -1,110 +0,0 @@
1
- import ast
2
- import gradio as gr
3
- from functions import example_question_generator, chatbot_func
4
- from data_sources import connect_graphql
5
- from utils import message_dict
6
-
7
- import os
8
- from dotenv import load_dotenv
9
-
10
- load_dotenv()
11
-
12
- graphql_sample_endpoint = os.getenv("GRAPHQL_SAMPLE_ENDPOINT")
13
- graphql_sample_api_token = os.getenv("GRAPHQL_SAMPLE_API_TOKEN")
14
- graphql_sample_header_name = os.getenv("GRAPHQL_SAMPLE_HEADER_NAME")
15
-
16
- with gr.Blocks() as demo:
17
- with gr.Accordion("ℹ️ About the GraphQL Connector", open=False):
18
- gr.HTML("""
19
- <div class="max-w-4xl mx-auto text-center">
20
- <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto p-4">
21
- <p>Connect to any GraphQL API endpoint and query it using natural language.</p>
22
- <p style="font-weight:bold;">
23
- API querying is the most experimental feature and performance may vary.
24
- No credentials are retained — they are passed as session variables and disappear when you leave or refresh.
25
- Mutations are not exposed and the agent is instructed not to alter data, though restricting
26
- your API token's permissions is still strongly recommended.
27
- </p>
28
- <p>Contact me if you'd like this built for your organization with proper infrastructure and security controls.</p>
29
- </div>
30
- </div>
31
- """)
32
-
33
- gr.HTML("""
34
- <div style="max-width:560px;margin:8px auto 4px;padding:8px 14px;background:#f0f9ff;
35
- border:1px solid #bae6fd;border-radius:8px;text-align:center;">
36
- <p style="margin:0;font-size:13px;color:#0369a1;">
37
- <i class="fas fa-flask" style="margin-right:6px;"></i>
38
- <strong>Demo credentials pre-filled.</strong>
39
- &nbsp;Replace with your own endpoint and token to analyze your own API.
40
- </p>
41
- </div>
42
- """)
43
-
44
- graphql_url = gr.Textbox(label="GraphQL Endpoint URL", value=graphql_sample_endpoint)
45
- with gr.Row():
46
- api_token_header_name = gr.Textbox(label="API Token Header Name", value=graphql_sample_header_name)
47
- api_token = gr.Textbox(label="API Token", value=graphql_sample_api_token, type="password")
48
-
49
- submit = gr.Button(value="Connect", variant="primary")
50
-
51
- @gr.render(inputs=[graphql_url, api_token, api_token_header_name], triggers=[submit.click])
52
- def api_chat(request: gr.Request, graphql_url=graphql_url.value, api_token=api_token.value, api_token_header_name=api_token_header_name.value):
53
- if request.session_hash not in message_dict:
54
- message_dict[request.session_hash] = {}
55
- message_dict[request.session_hash]['graphql'] = None
56
- if graphql_url:
57
- print("GraphQL API")
58
- process_message = process_graphql(graphql_url, api_token, api_token_header_name, request.session_hash)
59
- gr.HTML(value=process_message[1], padding=False)
60
- if process_message[0] == "success":
61
- if "qdl-app-testing" in graphql_url:
62
- example_questions = [
63
- ["Describe the dataset"],
64
- ["What is the total revenue for this shopify store?"],
65
- ["What is the average duration from the fulfillment of an order to its delivery?"],
66
- ["What is the total value of orders processed in the current month?"],
67
- ["Which product has the highest number of variants in the inventory?"],
68
- ["How many gift cards have been issued this year, and what is their total value?"],
69
- ["How many active apps are currently installed on the store?"],
70
- ["What is the total count of abandoned checkouts over the last month?"]
71
- ]
72
- else:
73
- try:
74
- generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'graphql', graphql_url, process_message[2], ''))
75
- example_questions = [["Describe the dataset"]]
76
- for example in generated_examples:
77
- example_questions.append([example])
78
- except Exception as e:
79
- print("GRAPHQL QUESTION GENERATION ERROR")
80
- print(e)
81
- example_questions = [
82
- ["Describe the dataset"],
83
- ["List the types in this API"],
84
- ["What could this data be used for?"],
85
- ]
86
- session_hash = gr.Textbox(visible=False, value=request.session_hash)
87
- graphql_api_string = gr.Textbox(visible=False, value=graphql_url)
88
- graphql_api_token = gr.Textbox(visible=False, value=api_token)
89
- graphql_token_header = gr.Textbox(visible=False, value=api_token_header_name)
90
- titles = gr.Textbox(value=process_message[2], interactive=False, label="GraphQL Types")
91
- data_source = gr.Textbox(visible=False, value='graphql')
92
- schema = gr.Textbox(visible=False, value='')
93
- bot = gr.Chatbot(type='messages', label="GraphQL Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
94
- chat = gr.ChatInterface(
95
- fn=chatbot_func,
96
- type='messages',
97
- chatbot=bot,
98
- title="Chat with your GraphQL API",
99
- examples=example_questions,
100
- concurrency_limit=None,
101
- additional_inputs=[session_hash, data_source, titles, schema, graphql_api_string, graphql_api_token, graphql_token_header]
102
- )
103
-
104
- def process_graphql(graphql_url, api_token, api_token_header_name, session_hash):
105
- if graphql_url:
106
- process_message = connect_graphql(graphql_url, api_token, api_token_header_name, session_hash)
107
- return process_message
108
-
109
- if __name__ == "__main__":
110
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
templates/sql_db.py DELETED
@@ -1,102 +0,0 @@
1
- import ast
2
- import gradio as gr
3
- from functions import example_question_generator, chatbot_func
4
- from data_sources import connect_sql_db
5
- from utils import message_dict
6
-
7
- with gr.Blocks() as demo:
8
- with gr.Accordion("ℹ️ About the SQL Connector", open=False):
9
- gr.HTML("""
10
- <div class="max-w-4xl mx-auto text-center">
11
- <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto p-4">
12
- <p>Connect to a PostgreSQL database and query it using natural language.</p>
13
- <p style="font-weight:bold;">
14
- No credentials are retained — they are passed as session variables and disappear when you leave or refresh.
15
- Queries run through Pandas <code>read_sql_query</code>, which cannot delete, drop, or insert rows.
16
- Use caution connecting production databases to third-party tools.
17
- </p>
18
- <p>Contact me if you'd like this built for your organization with proper infrastructure and security controls.</p>
19
- </div>
20
- </div>
21
- """)
22
-
23
- gr.HTML("""
24
- <div style="max-width:560px;margin:8px auto 4px;padding:8px 14px;background:#f0f9ff;
25
- border:1px solid #bae6fd;border-radius:8px;text-align:center;">
26
- <p style="margin:0;font-size:13px;color:#0369a1;">
27
- <i class="fas fa-flask" style="margin-right:6px;"></i>
28
- <strong>Demo credentials pre-filled.</strong>
29
- &nbsp;Replace with your own database details to analyze your own data.
30
- </p>
31
- </div>
32
- """)
33
-
34
- sql_url = gr.Textbox(label="URL", value="virtual-data-analyst-pg.cyetm2yjzppu.us-west-1.rds.amazonaws.com")
35
- with gr.Row():
36
- sql_port = gr.Textbox(label="Port", value="5432")
37
- sql_user = gr.Textbox(label="Username", value="postgres")
38
- sql_pass = gr.Textbox(label="Password", value="Vda-1988", type="password")
39
- sql_db_name = gr.Textbox(label="Database Name", value="dvdrental")
40
-
41
- submit = gr.Button(value="Connect", variant="primary")
42
-
43
- @gr.render(inputs=[sql_url, sql_port, sql_user, sql_pass, sql_db_name], triggers=[submit.click])
44
- def sql_chat(request: gr.Request, url=sql_url.value, sql_port=sql_port.value, sql_user=sql_user.value, sql_pass=sql_pass.value, sql_db_name=sql_db_name.value):
45
- if request.session_hash not in message_dict:
46
- message_dict[request.session_hash] = {}
47
- message_dict[request.session_hash]['sql'] = None
48
- if url:
49
- print("SQL APP")
50
- process_message = process_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, request.session_hash)
51
- gr.HTML(value=process_message[1], padding=False)
52
- if process_message[0] == "success":
53
- if "virtual-data-analyst-pg.cyetm2yjzppu.us-west-1.rds.amazonaws.com" in url:
54
- example_questions = [
55
- ["Describe the dataset"],
56
- ["What is the total revenue generated by each store?"],
57
- ["Can you generate and display a bar chart of film category to number of films in that category?"],
58
- ["Can you generate a pie chart showing the top 10 most rented films by revenue?"],
59
- ["Can you generate a line chart of rental revenue over time?"],
60
- ["What is the relationship between film length and rental frequency?"]
61
- ]
62
- else:
63
- try:
64
- generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'sql', sql_db_name, process_message[2], ""))
65
- example_questions = [["Describe the dataset"]]
66
- for example in generated_examples:
67
- example_questions.append([example])
68
- except Exception as e:
69
- print("SQL QUESTION GENERATION ERROR")
70
- print(e)
71
- example_questions = [
72
- ["Describe the dataset"],
73
- ["List the tables in the database"],
74
- ["What could this data be used for?"],
75
- ]
76
- session_hash = gr.Textbox(visible=False, value=request.session_hash)
77
- db_url = gr.Textbox(visible=False, value=url)
78
- db_port = gr.Textbox(visible=False, value=sql_port)
79
- db_user = gr.Textbox(visible=False, value=sql_user)
80
- db_pass = gr.Textbox(visible=False, value=sql_pass)
81
- db_name = gr.Textbox(visible=False, value=sql_db_name)
82
- titles = gr.Textbox(value=process_message[2], interactive=False, label="SQL Tables")
83
- data_source = gr.Textbox(visible=False, value='sql')
84
- schema = gr.Textbox(visible=False, value='')
85
- bot = gr.Chatbot(type='messages', label="SQL DB Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
86
- chat = gr.ChatInterface(
87
- fn=chatbot_func,
88
- type='messages',
89
- chatbot=bot,
90
- title="Chat with your Database",
91
- examples=example_questions,
92
- concurrency_limit=None,
93
- additional_inputs=[session_hash, data_source, titles, schema, db_url, db_port, db_user, db_pass, db_name]
94
- )
95
-
96
- def process_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash):
97
- if url:
98
- process_message = connect_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash)
99
- return process_message
100
-
101
- if __name__ == "__main__":
102
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tools.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ from utils import TEMP_DIR
3
+
4
+ def tools_call(session_hash):
5
+ dir_path = TEMP_DIR / str(session_hash)
6
+ connection = sqlite3.connect(f'{dir_path}/data_source.db')
7
+ print("Querying Database in Tools.py");
8
+ cur=connection.execute('select * from data_source')
9
+ columns = [i[0] for i in cur.description]
10
+ print("COLUMNS 2")
11
+ print(columns)
12
+ cur.close()
13
+ connection.close()
14
+
15
+ column_string = (columns[:625] + '..') if len(columns) > 625 else columns
16
+
17
+ return [
18
+ {
19
+ "type": "function",
20
+ "function": {
21
+ "name": "sql_query_func",
22
+ "description": f"""This is a tool useful to query a SQLite table called 'data_source' with the following Columns: {column_string}.
23
+ There may also be more columns in the table if the number of columns is too large to process.
24
+ This function also saves the results of the query to csv file called query.csv.""",
25
+ "parameters": {
26
+ "type": "object",
27
+ "properties": {
28
+ "queries": {
29
+ "type": "array",
30
+ "description": "The query to use in the search. Infer this from the user's message. It should be a question or a statement",
31
+ "items": {
32
+ "type": "string",
33
+ }
34
+ }
35
+ },
36
+ "required": ["queries"],
37
+ },
38
+ },
39
+ },
40
+ {
41
+ "type": "function",
42
+ "function": {
43
+ "name": "scatter_chart_generation_func",
44
+ "description": f"""This is a scatter plot generation tool useful to generate scatter plots from queried data from our SQL table called 'data_source'.
45
+ The data values will come from the columns of our query.csv (the 'x' and 'y' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
46
+ Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
47
+ from the scatter_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
48
+ to it for context if desired.""",
49
+ "parameters": {
50
+ "type": "object",
51
+ "properties": {
52
+ "data": {
53
+ "type": "array",
54
+ "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
55
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
56
+ Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
57
+ Infer this from the user's message.""",
58
+ "items": {
59
+ "type": "string",
60
+ }
61
+ },
62
+ "x_column": {
63
+ "type": "array",
64
+ "description": f"""An array of strings that correspond to the the columns in our query.csv file that contain the x values of the graph. There can be more than one column
65
+ that can each be plotted against the y_column, if needed.""",
66
+ "items": {
67
+ "type": "string",
68
+ }
69
+ },
70
+ "y_column": {
71
+ "type": "string",
72
+ "description": f"""The column in our query.csv file that contain the y values of the graph.""",
73
+ "items": {
74
+ "type": "string",
75
+ }
76
+ },
77
+ "category": {
78
+ "type": "string",
79
+ "description": f"""An optional column in our query.csv file that contain a parameter that will define the category for the data.""",
80
+ "items": {
81
+ "type": "string",
82
+ }
83
+ },
84
+ "trendline": {
85
+ "type": "string",
86
+ "description": f"""An optional field to specify the type of plotly trendline we wish to use in the scatter plot.
87
+ This trendline value can be one of ['ols','lowess','rolling','ewm','expanding'].
88
+ Do not send any values outside of this array as the function will fail.
89
+ Infer this from the user's message.""",
90
+ "items": {
91
+ "type": "string",
92
+ }
93
+ },
94
+ "trendline_options": {
95
+ "type": "array",
96
+ "description": """An array containing a dictionary that contains the 'trendline_options' portion of the plotly chart generation.
97
+ The 'lowess', 'rolling', and 'ewm' options require trendline_options to be included.
98
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
99
+ "items": {
100
+ "type": "string",
101
+ }
102
+ },
103
+ "marginal_x": {
104
+ "type": "string",
105
+ "description": f"""The type of marginal distribution plot we'd like to specify for the plotly scatter plot for the x axis.
106
+ This marginal_x value can be one of ['histogram','rug','box','violin'].
107
+ Do not send any values outside of this array as the function will fail.
108
+ Infer this from the user's message.""",
109
+ "items": {
110
+ "type": "string",
111
+ }
112
+ },
113
+ "marginal_y": {
114
+ "type": "string",
115
+ "description": f"""The type of marginal distribution plot we'd like to specify for the plotly scatter plot for the y axis.
116
+ This marginal_y value can be one of ['histogram','rug','box','violin'].
117
+ Do not send any values outside of this array as the function will fail.
118
+ Infer this from the user's message.""",
119
+ "items": {
120
+ "type": "string",
121
+ }
122
+ },
123
+ "layout": {
124
+ "type": "array",
125
+ "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
126
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
127
+ "items": {
128
+ "type": "string",
129
+ }
130
+ },
131
+ "size": {
132
+ "type": "string",
133
+ "description": f"""An optional column in our query.csv file that contain a parameter that will define the size of each plot point.
134
+ This is useful for a bubble chart where another value in our query can be represented by the size of the plotted point.
135
+ Values must be greater than or equal to 0 and so in our query, all values less than 0 should be set equal to zero.""",
136
+ "items": {
137
+ "type": "string",
138
+ }
139
+ }
140
+ },
141
+ "required": ["x_column","y_column"],
142
+ },
143
+ },
144
+ },
145
+ {
146
+ "type": "function",
147
+ "function": {
148
+ "name": "line_chart_generation_func",
149
+ "description": f"""This is a line chart generation tool useful to generate line charts from queried data from our SQL table called 'data_source'.
150
+ The data values will come from the columns of our query.csv (the 'x' and 'y' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
151
+ Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
152
+ from the line_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
153
+ to it for context if desired.""",
154
+ "parameters": {
155
+ "type": "object",
156
+ "properties": {
157
+ "data": {
158
+ "type": "array",
159
+ "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
160
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
161
+ Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
162
+ Infer this from the user's message.""",
163
+ "items": {
164
+ "type": "string",
165
+ }
166
+ },
167
+ "x_column": {
168
+ "type": "string",
169
+ "description": f"""The column in our query.csv file that contain the x values of the graph.""",
170
+ "items": {
171
+ "type": "string",
172
+ }
173
+ },
174
+ "y_column": {
175
+ "type": "string",
176
+ "description": f"""The column in our query.csv file that contain the y values of the graph.""",
177
+ "items": {
178
+ "type": "string",
179
+ }
180
+ },
181
+ "category": {
182
+ "type": "string",
183
+ "description": f"""An optional column in our query.csv file that contain a parameter that will define the category for the data.""",
184
+ "items": {
185
+ "type": "string",
186
+ }
187
+ },
188
+ "layout": {
189
+ "type": "array",
190
+ "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
191
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
192
+ "items": {
193
+ "type": "string",
194
+ }
195
+ }
196
+ },
197
+ "required": ["x_column","y_column","layout"],
198
+ },
199
+ },
200
+ },
201
+ {
202
+ "type": "function",
203
+ "function": {
204
+ "name": "bar_chart_generation_func",
205
+ "description": f"""This is a bar chart generation tool useful to generate line charts from queried data from our SQL table called 'data_source'.
206
+ The data values will come from the columns of our query.csv (the 'x' and 'y' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
207
+ Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
208
+ from the bar_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
209
+ to it for context if desired.""",
210
+ "parameters": {
211
+ "type": "object",
212
+ "properties": {
213
+ "data": {
214
+ "type": "array",
215
+ "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
216
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
217
+ Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
218
+ Infer this from the user's message.""",
219
+ "items": {
220
+ "type": "string",
221
+ }
222
+ },
223
+ "x_column": {
224
+ "type": "string",
225
+ "description": f"""The column in our query.csv file that contains the x values of the graph.""",
226
+ "items": {
227
+ "type": "string",
228
+ }
229
+ },
230
+ "y_column": {
231
+ "type": "string",
232
+ "description": f"""The column in our query.csv file that contains the y values of the graph.""",
233
+ "items": {
234
+ "type": "string",
235
+ }
236
+ },
237
+ "category": {
238
+ "type": "string",
239
+ "description": f"""An optional column in our query.csv file that contains a parameter that will define the category for the data.""",
240
+ "items": {
241
+ "type": "string",
242
+ }
243
+ },
244
+ "facet_row": {
245
+ "type": "string",
246
+ "description": f"""An optional column in our query.csv file that contains a parameter that will define a faceted subplot, where different rows
247
+ correspond to different values of the query specified in this parameter.""",
248
+ "items": {
249
+ "type": "string",
250
+ }
251
+ },
252
+ "facet_col": {
253
+ "type": "string",
254
+ "description": f"""An optional column in our query.csv file that contain a parameter that will define the faceted column, corresponding to
255
+ different values of our query specified in this parameter.""",
256
+ "items": {
257
+ "type": "string",
258
+ }
259
+ },
260
+ "layout": {
261
+ "type": "array",
262
+ "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
263
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
264
+ "items": {
265
+ "type": "string",
266
+ }
267
+ }
268
+ },
269
+ "required": ["x_column","y_column","layout"],
270
+ },
271
+ },
272
+ },
273
+ {
274
+ "type": "function",
275
+ "function": {
276
+ "name": "pie_chart_generation_func",
277
+ "description": f"""This is a pie chart generation tool useful to generate pie charts from queried data from our SQL table called 'data_source'.
278
+ The data values will come from the columns of our query.csv (the 'values' and 'names' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
279
+ Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
280
+ from the pie_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
281
+ to it for context if desired.""",
282
+ "parameters": {
283
+ "type": "object",
284
+ "properties": {
285
+ "data": {
286
+ "type": "array",
287
+ "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
288
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
289
+ Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
290
+ Infer this from the user's message.""",
291
+ "items": {
292
+ "type": "string",
293
+ }
294
+ },
295
+ "values": {
296
+ "type": "string",
297
+ "description": f"""The column in our query.csv file that contain the values of the pie chart.""",
298
+ "items": {
299
+ "type": "string",
300
+ }
301
+ },
302
+ "names": {
303
+ "type": "string",
304
+ "description": f"""The column in our query.csv file that contain the label or section of each piece of the pie graph and allow us to know what each piece of the pie chart represents.""",
305
+ "items": {
306
+ "type": "string",
307
+ }
308
+ },
309
+ "layout": {
310
+ "type": "array",
311
+ "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
312
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
313
+ "items": {
314
+ "type": "string",
315
+ }
316
+ }
317
+ },
318
+ "required": ["values","names","layout"],
319
+ },
320
+ },
321
+ },
322
+ {
323
+ "type": "function",
324
+ "function": {
325
+ "name": "histogram_generation_func",
326
+ "description": f"""This is a histogram generation tool useful to generate histograms from queried data from our SQL table called 'data_source'.
327
+ The data values will come from the columns of our query.csv (the 'values' and 'names' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
328
+ Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
329
+ from the histogram_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
330
+ to it for context if desired.""",
331
+ "parameters": {
332
+ "type": "object",
333
+ "properties": {
334
+ "data": {
335
+ "type": "array",
336
+ "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
337
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
338
+ Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
339
+ Infer this from the user's message.""",
340
+ "items": {
341
+ "type": "string",
342
+ }
343
+ },
344
+ "x_column": {
345
+ "type": "string",
346
+ "description": f"""The column in our query.csv file that contains the x values of the histogram.
347
+ This would correspond to the counts that would be distributed in the histogram.""",
348
+ "items": {
349
+ "type": "string",
350
+ }
351
+ },
352
+ "y_column": {
353
+ "type": "string",
354
+ "description": f"""An optional column in our query.csv file that contains the y values of the histogram.""",
355
+ "items": {
356
+ "type": "string",
357
+ }
358
+ },
359
+ "histnorm": {
360
+ "type": "string",
361
+ "description": f"""An optional argument to specify the type of normalization if the default isn't used.
362
+ This histnorm value can be one of ['percent','probability','density','probability density'].
363
+ Do not send any values outside of this array as the function will fail.""",
364
+ "items": {
365
+ "type": "string",
366
+ }
367
+ },
368
+ "category": {
369
+ "type": "string",
370
+ "description": f"""An optional column in our query.csv file that contains a parameter that will define the category for the data.""",
371
+ "items": {
372
+ "type": "string",
373
+ }
374
+ },
375
+ "histfunc": {
376
+ "type": "string",
377
+ "description": f"""An optional value that represents the function of data to compute the function which is used on the optional y column.
378
+ This histfunc value can be one of ['avg','sum','count'].
379
+ Do not send any values outside of this array as the function will fail.""",
380
+ "items": {
381
+ "type": "string",
382
+ }
383
+ },
384
+ "layout": {
385
+ "type": "array",
386
+ "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
387
+ The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
388
+ "items": {
389
+ "type": "string",
390
+ }
391
+ }
392
+ },
393
+ "required": ["x_column"],
394
+ },
395
+ },
396
+ },
397
+ {
398
+ "type": "function",
399
+ "function": {
400
+ "name": "table_generation_func",
401
+ "description": f"""This an table generation tool useful to format data as a table from queried data from our SQL table called 'data_source'.
402
+ Takes no parameters as it uses data queried in our query.csv file to build the table.
403
+ Call this function after running our SQLite query and generating query.csv.
404
+ Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
405
+ from the table_generation_func function in any way and always display the iframe fully to the user in the chat window.""",
406
+ "parameters": {},
407
+ },
408
+ },
409
+ {
410
+ "type": "function",
411
+ "function": {
412
+ "name": "regression_func",
413
+ "description": f"""This a tool to calculate regressions on our SQLite table called 'data_source'.
414
+ We can run queries with our 'sql_query_func' function and they will be available to use in this function via the query.csv file that is generated.
415
+ Returns a dictionary of values that includes a regression_summary and a regression chart (which is an iframe displaying the
416
+ linear regression in chart form and should be shown to the user).""",
417
+ "parameters": {
418
+ "type": "object",
419
+ "properties": {
420
+ "independent_variables": {
421
+ "type": "array",
422
+ "description": f"""An array of strings that states the independent variables in our data set which should be column names in our query.csv file that is generated
423
+ in the 'sql_query_func' function. This will allow us to identify the data to use for our independent variables.
424
+ Infer this from the user's message.""",
425
+ "items": {
426
+ "type": "string",
427
+ }
428
+ },
429
+ "dependent_variable": {
430
+ "type": "string",
431
+ "description": f"""A string that states the dependent variables in our data set which should be a column name in our query.csv file that is generated
432
+ in the 'sql_query_func' function. This will allow us to identify the data to use for our dependent variables.
433
+ Infer this from the user's message.""",
434
+ "items": {
435
+ "type": "string",
436
+ }
437
+ },
438
+ "category": {
439
+ "type": "string",
440
+ "description": f"""An optional column in our query.csv file that contain a parameter that will define the category for the data.
441
+ Do not send value if no category is needed or specified. This category must be present in our query.csv file to be valid.""",
442
+ "items": {
443
+ "type": "string",
444
+ }
445
+ }
446
+ },
447
+ "required": ["independent_variables","dependent_variable"],
448
+ },
449
+ },
450
+ }
451
+ ]
tools/__init__.py DELETED
File without changes
tools/chart_tools.py DELETED
@@ -1,308 +0,0 @@
1
- # Shared parameter snippets reused across chart tool schemas.
2
- # Update here to change the description everywhere at once.
3
-
4
- _LAYOUT_PARAM = {
5
- "type": "array",
6
- "description": (
7
- "Optional. An array containing a single JSON-formatted Plotly layout dictionary. "
8
- "Use to set chart title, axis labels, colours, fonts, and other layout properties. "
9
- "Example: [{\"title\": \"Monthly Sales\", \"xaxis\": {\"title\": \"Month\"}}]"
10
- ),
11
- "items": {"type": "string"},
12
- }
13
-
14
- _TRACE_STYLE_PARAM = {
15
- "type": "array",
16
- "description": (
17
- "Optional. An array containing a single JSON-formatted Plotly trace styling dictionary. "
18
- "Use to control visual properties such as line colour, opacity, and marker style. "
19
- "Do NOT include 'x', 'y', or 'type' keys — those are set automatically from query.csv."
20
- ),
21
- "items": {"type": "string"},
22
- }
23
-
24
- chart_tool_schemas = [
25
- {
26
- "name": "scatter_chart_generation_func",
27
- "description": (
28
- "Generates a Plotly scatter plot from query.csv data. "
29
- "Use when the user wants to visualise the relationship between two numeric columns, "
30
- "create a bubble chart (via the size parameter), or overlay a trendline. "
31
- "Returns an HTML iframe — display it verbatim in the chat."
32
- ),
33
- "parameters": {
34
- "type": "object",
35
- "properties": {
36
- "x_column": {
37
- "type": "array",
38
- "description": (
39
- "One or more column names from query.csv to plot on the x-axis. "
40
- "Multiple columns produce multiple series, each plotted against y_column."
41
- ),
42
- "items": {"type": "string"},
43
- },
44
- "y_column": {
45
- "type": "string",
46
- "description": "Column name from query.csv to plot on the y-axis.",
47
- },
48
- "category": {
49
- "type": "string",
50
- "description": "Optional column name used to colour-code points by a categorical grouping.",
51
- },
52
- "trendline": {
53
- "type": "string",
54
- "description": (
55
- "Optional trendline type. One of: 'ols' (linear regression), "
56
- "'lowess' (local smoothing), 'rolling', 'ewm', 'expanding'. "
57
- "Requires trendline_options when using 'lowess', 'rolling', or 'ewm'."
58
- ),
59
- },
60
- "trendline_options": {
61
- "type": "array",
62
- "description": (
63
- "Required when trendline is 'lowess', 'rolling', or 'ewm'. "
64
- "An array containing a single JSON-formatted dict of trendline options "
65
- "(e.g. [{\"window\": 7}] for a 7-point rolling average)."
66
- ),
67
- "items": {"type": "string"},
68
- },
69
- "marginal_x": {
70
- "type": "string",
71
- "description": "Optional marginal distribution plot along the x-axis. One of: 'histogram', 'rug', 'box', 'violin'.",
72
- },
73
- "marginal_y": {
74
- "type": "string",
75
- "description": "Optional marginal distribution plot along the y-axis. One of: 'histogram', 'rug', 'box', 'violin'.",
76
- },
77
- "size": {
78
- "type": "string",
79
- "description": "Optional column name whose values control the size of each point (bubble chart). Negative values are clamped to zero.",
80
- },
81
- "data": _TRACE_STYLE_PARAM,
82
- "layout": _LAYOUT_PARAM,
83
- },
84
- "required": ["x_column", "y_column"],
85
- },
86
- },
87
- {
88
- "name": "line_chart_generation_func",
89
- "description": (
90
- "Generates a Plotly line chart from query.csv data. "
91
- "Use for trends over time or any ordered sequence. "
92
- "Returns an HTML iframe — display it verbatim in the chat."
93
- ),
94
- "parameters": {
95
- "type": "object",
96
- "properties": {
97
- "x_column": {
98
- "type": "string",
99
- "description": "Column name from query.csv for the x-axis (typically a date or ordered index).",
100
- },
101
- "y_column": {
102
- "type": "string",
103
- "description": "Column name from query.csv for the y-axis (numeric values).",
104
- },
105
- "category": {
106
- "type": "string",
107
- "description": "Optional column name used to split the data into multiple colour-coded lines.",
108
- },
109
- "data": _TRACE_STYLE_PARAM,
110
- "layout": _LAYOUT_PARAM,
111
- },
112
- "required": ["x_column", "y_column"],
113
- },
114
- },
115
- {
116
- "name": "bar_chart_generation_func",
117
- "description": (
118
- "Generates a Plotly bar chart from query.csv data. "
119
- "Use for comparing values across categories. Supports grouped/stacked bars via category, "
120
- "and faceted subplots via facet_row or facet_col. "
121
- "Returns an HTML iframe — display it verbatim in the chat."
122
- ),
123
- "parameters": {
124
- "type": "object",
125
- "properties": {
126
- "x_column": {
127
- "type": "string",
128
- "description": "Column name from query.csv for the x-axis (category labels).",
129
- },
130
- "y_column": {
131
- "type": "string",
132
- "description": "Column name from query.csv for the y-axis (numeric values).",
133
- },
134
- "category": {
135
- "type": "string",
136
- "description": "Optional column name used to colour-code bars into grouped or stacked series.",
137
- },
138
- "facet_row": {
139
- "type": "string",
140
- "description": "Optional column name. Creates one subplot row per unique value — useful for comparing distributions across a second dimension.",
141
- },
142
- "facet_col": {
143
- "type": "string",
144
- "description": "Optional column name. Creates one subplot column per unique value.",
145
- },
146
- "data": _TRACE_STYLE_PARAM,
147
- "layout": _LAYOUT_PARAM,
148
- },
149
- "required": ["x_column", "y_column"],
150
- },
151
- },
152
- {
153
- "name": "pie_chart_generation_func",
154
- "description": (
155
- "Generates a Plotly pie chart from query.csv data. "
156
- "Use when the user wants to show part-to-whole proportions. "
157
- "Returns an HTML iframe — display it verbatim in the chat."
158
- ),
159
- "parameters": {
160
- "type": "object",
161
- "properties": {
162
- "values": {
163
- "type": "string",
164
- "description": "Column name from query.csv containing the numeric value for each slice.",
165
- },
166
- "names": {
167
- "type": "string",
168
- "description": "Column name from query.csv containing the label for each slice.",
169
- },
170
- "data": _TRACE_STYLE_PARAM,
171
- "layout": _LAYOUT_PARAM,
172
- },
173
- "required": ["values", "names"],
174
- },
175
- },
176
- {
177
- "name": "histogram_generation_func",
178
- "description": (
179
- "Generates a Plotly histogram from query.csv data. "
180
- "Use to show the frequency distribution of a numeric column. "
181
- "Supports normalisation (percent, probability, density) and aggregation functions per bin. "
182
- "Returns an HTML iframe — display it verbatim in the chat."
183
- ),
184
- "parameters": {
185
- "type": "object",
186
- "properties": {
187
- "x_column": {
188
- "type": "string",
189
- "description": "Column name from query.csv whose values are binned on the x-axis.",
190
- },
191
- "y_column": {
192
- "type": "string",
193
- "description": "Optional column name aggregated per bin via histfunc (e.g. sum of sales per price bucket).",
194
- },
195
- "histnorm": {
196
- "type": "string",
197
- "description": "Optional normalisation. One of: 'percent', 'probability', 'density', 'probability density'.",
198
- },
199
- "category": {
200
- "type": "string",
201
- "description": "Optional column name used to overlay multiple colour-coded histograms.",
202
- },
203
- "histfunc": {
204
- "type": "string",
205
- "description": "Optional aggregation function applied to y_column per bin. One of: 'avg', 'sum', 'count'.",
206
- },
207
- "data": _TRACE_STYLE_PARAM,
208
- "layout": _LAYOUT_PARAM,
209
- },
210
- "required": ["x_column"],
211
- },
212
- },
213
- {
214
- "name": "box_chart_generation_func",
215
- "description": (
216
- "Generates a Plotly box plot from query.csv data. "
217
- "Use to visualise the distribution of a numeric column and identify outliers. "
218
- "Especially useful for comparing distributions across categories. "
219
- "Returns an HTML iframe — display it verbatim in the chat."
220
- ),
221
- "parameters": {
222
- "type": "object",
223
- "properties": {
224
- "y_column": {
225
- "type": "string",
226
- "description": "Column name from query.csv containing the numeric values to distribute on the y-axis.",
227
- },
228
- "x_column": {
229
- "type": "string",
230
- "description": "Optional column name. Groups data into one box per unique value on the x-axis.",
231
- },
232
- "category": {
233
- "type": "string",
234
- "description": "Optional column name used to colour-code boxes by a secondary grouping.",
235
- },
236
- "layout": _LAYOUT_PARAM,
237
- },
238
- "required": ["y_column"],
239
- },
240
- },
241
- {
242
- "name": "correlation_heatmap_func",
243
- "description": (
244
- "Computes pairwise Pearson correlations between numeric columns in query.csv and renders "
245
- "the result as a colour-coded heatmap (blue = positive, red = negative). "
246
- "Use when the user asks which variables are related, correlated, or associated with each other. "
247
- "Returns an HTML iframe — display it verbatim in the chat."
248
- ),
249
- "parameters": {
250
- "type": "object",
251
- "properties": {
252
- "columns": {
253
- "type": "array",
254
- "description": "Optional list of numeric column names to include in the matrix. If omitted, all numeric columns from query.csv are used. Avoid ID or index columns.",
255
- "items": {"type": "string"},
256
- },
257
- },
258
- "required": [],
259
- },
260
- },
261
- {
262
- "name": "rolling_stats_func",
263
- "description": (
264
- "Generates a rolling statistics / moving average chart from query.csv data. "
265
- "Overlays rolling aggregations (mean, std, min, max) on top of the original series. "
266
- "Use when the user asks for a moving average, rolling average, rolling statistics, or wants to smooth a time series. "
267
- "Returns an HTML iframe — display it verbatim in the chat."
268
- ),
269
- "parameters": {
270
- "type": "object",
271
- "properties": {
272
- "x_column": {
273
- "type": "string",
274
- "description": "Column name from query.csv for the x-axis — typically a date or sequential index.",
275
- },
276
- "y_column": {
277
- "type": "string",
278
- "description": "Column name from query.csv containing the numeric values to compute rolling stats on.",
279
- },
280
- "window": {
281
- "type": "integer",
282
- "description": "Rolling window size in number of rows. Default 7. Infer from the user's request.",
283
- },
284
- "stats": {
285
- "type": "array",
286
- "description": "Statistics to overlay. Valid values: 'mean', 'std', 'min', 'max'. Defaults to ['mean'] if omitted.",
287
- "items": {"type": "string"},
288
- },
289
- "category": {
290
- "type": "string",
291
- "description": "Optional column name to group the data, producing separate rolling stat lines per group.",
292
- },
293
- "layout": _LAYOUT_PARAM,
294
- },
295
- "required": ["x_column", "y_column"],
296
- },
297
- },
298
- {
299
- "name": "table_generation_func",
300
- "description": (
301
- "Formats query.csv results as a styled HTML table. "
302
- "Use when the user wants to view raw query results in a readable format, "
303
- "or when result data is too large to describe in text. Displays up to 200 rows. "
304
- "Returns an HTML table — display it verbatim in the chat."
305
- ),
306
- "parameters": {"type": "object", "properties": {}},
307
- },
308
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tools/stats_tools.py DELETED
@@ -1,130 +0,0 @@
1
- stats_tool_schemas = [
2
- {
3
- "name": "descriptive_stats_func",
4
- "description": (
5
- "Computes summary statistics for numeric columns in query.csv: "
6
- "count, mean, std, min, 25th/50th/75th percentile, and max. "
7
- "Use when the user asks for summary statistics, descriptive statistics, or a statistical overview. "
8
- "Returns a formatted HTML table."
9
- ),
10
- "parameters": {
11
- "type": "object",
12
- "properties": {
13
- "columns": {
14
- "type": "array",
15
- "description": "Optional list of column names to include. If omitted, all numeric columns from query.csv are used. Avoid ID or index columns.",
16
- "items": {"type": "string"},
17
- },
18
- },
19
- "required": [],
20
- },
21
- },
22
- {
23
- "name": "kmeans_clustering_func",
24
- "description": (
25
- "Runs K-Means clustering on numeric feature columns from query.csv. "
26
- "Groups rows into k clusters, displays a scatter plot coloured by cluster assignment, "
27
- "and returns a centroid summary table showing the mean of each feature per cluster. "
28
- "Use when the user asks to cluster the data, find natural segments or groups, or apply K-Means. "
29
- "Returns an HTML iframe and summary table."
30
- ),
31
- "parameters": {
32
- "type": "object",
33
- "properties": {
34
- "feature_columns": {
35
- "type": "array",
36
- "description": "List of numeric column names from query.csv to use as clustering features.",
37
- "items": {"type": "string"},
38
- },
39
- "x_column": {
40
- "type": "string",
41
- "description": "Column name from query.csv for the x-axis of the scatter plot. Usually one of the feature columns.",
42
- },
43
- "y_column": {
44
- "type": "string",
45
- "description": "Column name from query.csv for the y-axis of the scatter plot. Usually one of the feature columns.",
46
- },
47
- "n_clusters": {
48
- "type": "integer",
49
- "description": "Number of clusters (k). Default 3. Infer from the user's request.",
50
- },
51
- "layout": {
52
- "type": "array",
53
- "description": "Optional. An array containing a single JSON-formatted Plotly layout dictionary.",
54
- "items": {"type": "string"},
55
- },
56
- },
57
- "required": ["feature_columns", "x_column", "y_column"],
58
- },
59
- },
60
- {
61
- "name": "hypothesis_test_func",
62
- "description": (
63
- "Performs a statistical hypothesis test on query.csv data and returns a formatted results table "
64
- "with test statistic, p-value, and significance at α=0.05. "
65
- "Supported tests:\n"
66
- "- 't_test_independent': compare means of a numeric column across two groups "
67
- "(requires group_column; use group_values if the column has more than 2 unique values).\n"
68
- "- 't_test_one_sample': test whether a column's mean equals a hypothesized value (requires pop_mean).\n"
69
- "- 'chi_square': test independence between two categorical columns (requires column and column2)."
70
- ),
71
- "parameters": {
72
- "type": "object",
73
- "properties": {
74
- "test_type": {
75
- "type": "string",
76
- "description": "Test to run. One of: 't_test_independent', 't_test_one_sample', 'chi_square'.",
77
- },
78
- "column": {
79
- "type": "string",
80
- "description": "Primary column for the test. Numeric for t-tests; first categorical column for chi-square.",
81
- },
82
- "column2": {
83
- "type": "string",
84
- "description": "Second categorical column. Required for 'chi_square'.",
85
- },
86
- "group_column": {
87
- "type": "string",
88
- "description": "Grouping column. Required for 't_test_independent'. Must have exactly 2 unique values, or specify group_values.",
89
- },
90
- "group_values": {
91
- "type": "array",
92
- "description": "Exactly 2 group labels to compare. Use when group_column has more than 2 unique values.",
93
- "items": {"type": "string"},
94
- },
95
- "pop_mean": {
96
- "type": "number",
97
- "description": "Hypothesized population mean (μ₀). Required for 't_test_one_sample'.",
98
- },
99
- },
100
- "required": ["test_type", "column"],
101
- },
102
- },
103
- {
104
- "name": "regression_func",
105
- "description": (
106
- "Runs an OLS linear regression on query.csv data. "
107
- "Use when the user wants to model the relationship between variables, assess predictors, or run a regression. "
108
- "Returns a regression summary (coefficients, R², p-values) and a scatter plot with the fitted line as an HTML iframe."
109
- ),
110
- "parameters": {
111
- "type": "object",
112
- "properties": {
113
- "independent_variables": {
114
- "type": "array",
115
- "description": "Column names from query.csv to use as independent (predictor) variables.",
116
- "items": {"type": "string"},
117
- },
118
- "dependent_variable": {
119
- "type": "string",
120
- "description": "Column name from query.csv to use as the dependent (outcome) variable.",
121
- },
122
- "category": {
123
- "type": "string",
124
- "description": "Optional column name used to colour-code points and fit separate regression lines per group.",
125
- },
126
- },
127
- "required": ["independent_variables", "dependent_variable"],
128
- },
129
- },
130
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tools/tools.py DELETED
@@ -1,130 +0,0 @@
1
- from .stats_tools import stats_tool_schemas
2
- from .chart_tools import chart_tool_schemas
3
-
4
- def tools_call(session_hash, data_source, titles):
5
- from haystack.tools import Tool
6
-
7
- _noop = lambda **kwargs: None
8
-
9
- def make_tool(schema):
10
- return Tool(
11
- name=schema["name"],
12
- description=schema["description"],
13
- parameters=schema["parameters"],
14
- function=_noop,
15
- )
16
-
17
- titles_string = (titles[:625] + '..') if len(titles) > 625 else titles
18
-
19
- query_tool_schemas = {
20
- 'file_upload': {
21
- "name": "query_func",
22
- "description": f"""This is a tool useful to query a SQLite table called 'data_source' with the following Columns: {titles_string}.
23
- There may also be more columns in the table if the number of columns is too large to process.
24
- This function also saves the results of the query to csv file called query.csv.""",
25
- "parameters": {
26
- "type": "object",
27
- "properties": {
28
- "queries": {
29
- "type": "string",
30
- "description": "The query to use in the search. Infer this from the user's message. It should be a question or a statement."
31
- }
32
- },
33
- "required": ["queries"]
34
- },
35
- },
36
- 'sql': {
37
- "name": "query_func",
38
- "description": f"""This is a tool useful to query a PostgreSQL database with the following tables, {titles_string}.
39
- There may also be more tables in the database if the number of tables is too large to process.
40
- This function also saves the results of the query to csv file called query.csv.""",
41
- "parameters": {
42
- "type": "object",
43
- "properties": {
44
- "queries": {
45
- "type": "string",
46
- "description": "The PostgreSQL query to use in the search. Infer this from the user's message. It should be a question or a statement."
47
- }
48
- },
49
- "required": ["queries"]
50
- },
51
- },
52
- 'doc_db': {
53
- "name": "query_func",
54
- "description": f"""This is a tool useful to build an aggregation pipeline to query a MongoDB NoSQL document database with the following collections, {titles_string}.
55
- There may also be more collections in the database if the number of collections is too large to process.
56
- This function also saves the results of the query to a csv file called query.csv.""",
57
- "parameters": {
58
- "type": "object",
59
- "properties": {
60
- "queries": {
61
- "type": "string",
62
- "description": "The MongoDB aggregation pipeline to use in the search. Infer this from the user's message. It should be a question or a statement."
63
- },
64
- "db_collection": {
65
- "type": "string",
66
- "description": "The MongoDB collection to use in the search. Infer this from the user's message. It should be a question or a statement."
67
- }
68
- },
69
- "required": ["queries", "db_collection"]
70
- },
71
- },
72
- 'graphql': [
73
- {
74
- "name": "query_func",
75
- "description": f"""This is a tool useful to build a GraphQL query for a GraphQL API endpoint with the following types, {titles_string}.
76
- There may also be more types in the GraphQL endpoint if the number of types is too large to process.
77
- This function also saves the results of the query to a csv file called query.csv.""",
78
- "parameters": {
79
- "type": "object",
80
- "properties": {
81
- "queries": {
82
- "type": "string",
83
- "description": "The GraphQL query to use in the search. Infer this from the user's message. It should be a question or a statement."
84
- }
85
- },
86
- "required": ["queries"]
87
- },
88
- },
89
- {
90
- "name": "graphql_schema_query",
91
- "description": f"""This is a tool useful to query a GraphQL type and receive back information about its schema. This is useful because
92
- the GraphQL introspection query is too large to be ingested all at once and this allows us to query the schema one type at a time to
93
- view it in manageable bites. You may realize after viewing the schema, that the type you selected was not appropriate for the question
94
- you are attempting answer. You may then query additional types to find the appropriate types to use for your GraphQL API query.""",
95
- "parameters": {
96
- "type": "object",
97
- "properties": {
98
- "graphql_type": {
99
- "type": "string",
100
- "description": "The GraphQL type that we want to view the schema of in order to make the proper query with our graphql_query_func. Infer this from the user's message. It should be a question or a statement."
101
- }
102
- },
103
- "required": ["graphql_type"]
104
- },
105
- },
106
- {
107
- "name": "graphql_csv_query",
108
- "description": f"""This is a tool useful to SQL query our query.csv file that is generated from our GraphQL query. This is useful in a situation
109
- where the results of the GraphQL query need additional querying to answer the user question. The query.csv file is converted to a Pandas dataframe
110
- and we query that dataframe with SQL on a table called 'query' before converting it back to a csv file.""",
111
- "parameters": {
112
- "type": "object",
113
- "properties": {
114
- "csv_query": {
115
- "type": "string",
116
- "description": "The pandas dataframe SQL query to use in the search. The table that we query is named 'query'. Infer this from the user's message. It should be a question or a statement."
117
- }
118
- },
119
- "required": ["csv_query"]
120
- },
121
- },
122
- ]
123
- }
124
-
125
- source_schemas = query_tool_schemas[data_source]
126
- source_tools = [make_tool(s) for s in (source_schemas if isinstance(source_schemas, list) else [source_schemas])]
127
- chart_tools = [make_tool(s) for s in chart_tool_schemas]
128
- stats_tools = [make_tool(s) for s in stats_tool_schemas]
129
-
130
- return source_tools + chart_tools + stats_tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils.py CHANGED
@@ -4,6 +4,4 @@ current_dir = Path(__file__).parent
4
 
5
  TEMP_DIR = current_dir / 'temp'
6
 
7
- message_dict = {}
8
- api_key_store = {}
9
- model_store = {}
 
4
 
5
  TEMP_DIR = current_dir / 'temp'
6
 
7
+ message_dict = {}