Spaces:

nolanzandi
/

virtual-data-analyst

Running

App Files Community

refactor chat functions

#39

by nolanzandi - opened May 11, 2025

base: refs/heads/main

←

from: refs/pr/39

Discussion Files changed

+1329

-102054

Files changed (24) hide show

.gitattributes +1 -0
.gitignore +0 -4
README.md +2 -2
app.py +90 -192
assets/styles.css +3 -24
data_sources/connect_graphql.py +2 -5
data_sources/upload_file.py +1 -66
functions/__init__.py +6 -14
functions/chart_functions.py +55 -196
functions/chat_functions.py +64 -87
functions/query_functions.py +87 -47
functions/stat_functions.py +6 -235
requirements.txt +1 -5
samples/online_retail_data.csv +0 -0
temp/.gitignore +0 -2
templates/data_file.py +136 -286
templates/doc_db.py +99 -105
templates/graphql.py +110 -110
templates/sql_db.py +98 -102
tools/__init__.py +0 -0
tools/chart_tools.py +371 -308
tools/stats_tools.py +44 -130
tools/tools.py +149 -130
utils.py +1 -3

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+samples/online_retail_data.csv filter=lfs diff=lfs merge=lfs -text

.gitignore DELETED Viewed

@@ -1,4 +0,0 @@
-__pycache__/
-.gradio/
-.env
-temp/

README.md CHANGED Viewed

@@ -4,10 +4,10 @@ emoji: 📈
 colorFrom: pink
 colorTo: blue
 sdk: gradio
-sdk_version: 5.29.0
 app_file: app.py
 pinned: true
-short_description: Queries, visualizations, stat analysis on your data
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorFrom: pink
 colorTo: blue
 sdk: gradio
+sdk_version: 5.23.3
 app_file: app.py
 pinned: true
+short_description: Queries, visualizations, analysis on your files/DBs/APIs
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,192 +1,90 @@
-from utils import TEMP_DIR, message_dict, api_key_store, model_store
-import gradio as gr
-import templates.data_file as data_file, templates.sql_db as sql_db, templates.doc_db as doc_db, templates.graphql as graphql
-import os
-from dotenv import load_dotenv
-load_dotenv()
-def delete_db(req: gr.Request):
-    import shutil
-    dir_path = TEMP_DIR / str(req.session_hash)
-    if os.path.exists(dir_path):
-        shutil.rmtree(dir_path)
-        message_dict[req.session_hash] = {}
-    api_key_store.pop(req.session_hash, None)
-    model_store.pop(req.session_hash, None)
-def set_api_key(api_key, model, request: gr.Request):
-    api_key = api_key.strip()
-    if not api_key:
-        return (
-            gr.update(visible=True),
-            gr.update(visible=True, value="<p style='color:#b91c1c;text-align:center;margin:6px 0;font-size:14px;'>Please enter your API key.</p>"),
-            gr.update(visible=False),
-        )
-    api_key_store[request.session_hash] = api_key
-    model_store[request.session_hash] = model
-    provider = "Anthropic" if api_key.startswith("sk-ant-") else "OpenAI"
-    provider_icon = "fa-a" if provider == "Anthropic" else "fa-o"
-    badge_html = f"""
-    <div style="display:flex;flex-direction:column;align-items:center;gap:6px;padding:10px 0 4px;">
-        <div style="display:inline-flex;align-items:center;gap:10px;background:#f0fdf4;border:1px solid #86efac;
-                    padding:8px 20px;border-radius:9999px;font-size:13px;font-weight:500;color:#15803d;
-                    box-shadow:0 1px 3px rgba(0,0,0,0.06);">
-            <i class="fas fa-circle-check" style="font-size:14px;"></i>
-            <span>{provider}</span>
-            <span style="color:#86efac;">·</span>
-            <span style="font-weight:600;">{model}</span>
-        </div>
-        <p style="margin:0;font-size:11px;color:#9ca3af;letter-spacing:0.02em;">
-            Session active — use the button below to change
-        </p>
-    </div>
-    """
-    return gr.update(visible=False), gr.update(visible=True, value=badge_html), gr.update(visible=True)
-def show_api_form():
-    return gr.update(visible=True), gr.update(visible=False, value=""), gr.update(visible=False)
-css = ".file_marker .large{min-height:50px !important;} .padding{padding:0;} .description_component{overflow:visible !important;}"
-head = """<meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Virtual Data Analyst</title>
-    <!-- Tailwind CSS -->
-    <script src="https://cdn.tailwindcss.com"></script>
-    <!-- Google Fonts -->
-    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
-    <!-- Font Awesome -->
-    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
-    <!-- Custom Styles -->
-    <link rel="stylesheet" href="/gradio_api/file=assets/styles.css">
-    """
-theme = gr.themes.Base(primary_hue="sky", secondary_hue="slate", font=[gr.themes.GoogleFont("Inter"), "Inter", "sans-serif"]).set(
-    button_primary_background_fill="#3B82F6",
-    button_secondary_background_fill="#6B7280",
-)
-from pathlib import Path
-gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
-_env_api_key = os.getenv("OPENAI_API_KEY", "")
-OPENAI_MODELS = [
-    "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
-    "gpt-4o", "gpt-4o-mini",
-    "o3-mini", "o4-mini",
-    "gpt-5.4-mini", "gpt-5.4", "gpt-5.5",
-]
-ANTHROPIC_MODELS = [
-    "claude-sonnet-4-6",
-    "claude-opus-4-8",
-    "claude-haiku-4-5-20251001",
-]
-def update_models(api_key):
-    if api_key.strip().startswith("sk-ant-"):
-        return gr.update(choices=ANTHROPIC_MODELS, value=ANTHROPIC_MODELS[0])
-    return gr.update(choices=OPENAI_MODELS, value=OPENAI_MODELS[0])
-with gr.Blocks(theme=theme, css=css, head=head, delete_cache=(3600, 3600)) as demo:
-    with gr.Column(visible=True) as api_key_section:
-        gr.HTML("""
-            <div style="max-width:640px;margin:28px auto 12px;padding:22px 28px;
-                        background:linear-gradient(135deg,#eff6ff 0%,#e0f2fe 100%);
-                        border:1px solid #bfdbfe;border-radius:14px;
-                        box-shadow:0 2px 8px rgba(59,130,246,0.08);">
-                <div style="display:flex;align-items:flex-start;gap:16px;">
-                    <div style="width:42px;height:42px;flex-shrink:0;background:#3B82F6;
-                                border-radius:10px;display:flex;align-items:center;
-                                justify-content:center;box-shadow:0 2px 6px rgba(59,130,246,0.35);">
-                        <i class="fas fa-key" style="color:white;font-size:16px;"></i>
-                    </div>
-                    <div>
-                        <h3 style="color:#1e40af;margin:0 0 6px;font-size:16px;font-weight:700;letter-spacing:-0.01em;">
-                            Get Started
-                        </h3>
-                        <p style="color:#3730a3;font-size:13.5px;margin:0;line-height:1.6;">
-                            Enter your <strong>OpenAI</strong>
-                            (<code style="background:rgba(255,255,255,0.7);padding:1px 6px;border-radius:4px;font-size:12px;">sk-...</code>)
-                            or <strong>Anthropic</strong>
-                            (<code style="background:rgba(255,255,255,0.7);padding:1px 6px;border-radius:4px;font-size:12px;">sk-ant-...</code>)
-                            API key. The model list updates automatically. Your key is held in memory only
-                            and cleared when you leave — never saved or shared.
-                        </p>
-                    </div>
-                </div>
-            </div>
-        """)
-        with gr.Row(equal_height=True):
-            api_key_input = gr.Textbox(
-                label="API Key",
-                placeholder="sk-proj-...  or  sk-ant-api03-...",
-                type="password",
-                value=_env_api_key,
-                scale=4,
-            )
-            model_dropdown = gr.Dropdown(
-                label="Model",
-                choices=OPENAI_MODELS,
-                value=OPENAI_MODELS[0],
-                scale=2,
-            )
-            api_key_btn = gr.Button("Set API Key", variant="primary", scale=1, min_width=120)
-    api_key_status = gr.HTML("", visible=False)
-    change_key_btn = gr.Button("🔑  Change Key / Model", variant="secondary", visible=False, size="sm")
-    api_key_input.change(fn=update_models, inputs=api_key_input, outputs=model_dropdown)
-    api_key_btn.click(
-        fn=set_api_key,
-        inputs=[api_key_input, model_dropdown],
-        outputs=[api_key_section, api_key_status, change_key_btn],
-    )
-    change_key_btn.click(fn=show_api_form, outputs=[api_key_section, api_key_status, change_key_btn])
-    header = gr.HTML("""
-        <header class="max-w-4xl mx-auto mb-12 text-center">
-            <h1 class="text-4xl font-bold text-gray-900 mb-4">Virtual Data Analyst</h1>
-            <p class="text-lg text-gray-600 mb-6">
-                A powerful tool for data analysis, visualizations, and insights
-            </p>
-        </header>
-        <main class="max-w-4xl mx-auto">
-            <div class="mt-12 grid md:grid-cols-3 gap-6" style="margin-bottom:3px !important;">
-                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
-                    <i class="feature-icon fas fa-chart-line text-primary text-2xl mb-4"></i>
-                    <h3 class="font-semibold text-gray-800 mb-2">Advanced Analytics</h3>
-                    <p class="text-gray-600 text-sm">Run SQL queries, perform regressions, and analyze results with ease</p>
-                </div>
-                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
-                    <i class="feature-icon fas fa-chart-pie text-primary text-2xl mb-4"></i>
-                    <h3 class="font-semibold text-gray-800 mb-2">Rich Visualizations</h3>
-                    <p class="text-gray-600 text-sm">Create scatter plots, line charts, pie charts, and more</p>
-                </div>
-                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
-                    <i class="feature-icon fas fa-magic text-primary text-2xl mb-4"></i>
-                    <h3 class="font-semibold text-gray-800 mb-2">Automated Insights</h3>
-                    <p class="text-gray-600 text-sm">Get instant insights and recommendations for your data</p>
-                </div>
-            </div>
-        </main>""")
-    with gr.Tab("📄  Data File"):
-        data_file.demo.render()
-    with gr.Tab("🗄  SQL Database"):
-        sql_db.demo.render()
-    with gr.Tab("🍃  MongoDB"):
-        doc_db.demo.render()
-    with gr.Tab("⚡  GraphQL API"):
-        graphql.demo.render()
-    footer = gr.HTML("""
-        <footer class="max-w-4xl mx-auto mt-12 text-center text-gray-500 text-sm">
-            <p>This application is under active development. For bugs or feedback, please open a discussion in the community tab.</p>
-        </footer>""")
-    demo.unload(delete_db)
-demo.launch(debug=True, allowed_paths=["temp/", "assets/"])

+from utils import TEMP_DIR, message_dict
+import gradio as gr
+import templates.data_file as data_file, templates.sql_db as sql_db, templates.doc_db as doc_db, templates.graphql as graphql
+import os
+from getpass import getpass
+from dotenv import load_dotenv
+load_dotenv()
+def delete_db(req: gr.Request):
+    import shutil
+    dir_path = TEMP_DIR / str(req.session_hash)
+    if os.path.exists(dir_path):
+        shutil.rmtree(dir_path)
+        message_dict[req.session_hash] = {}
+if "OPENAI_API_KEY" not in os.environ:
+    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
+css= ".file_marker .large{min-height:50px !important;} .padding{padding:0;} .description_component{overflow:visible !important;}"
+head = """<meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Virtual Data Analyst</title>
+    <!-- Tailwind CSS -->
+    <script src="https://cdn.tailwindcss.com"></script>
+    <!-- Google Fonts -->
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
+    <!-- Font Awesome -->
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
+    <!-- Custom Styles -->
+    <link rel="stylesheet" href="/gradio_api/file=assets/styles.css">
+    """
+theme = gr.themes.Base(primary_hue="sky", secondary_hue="slate",font=[gr.themes.GoogleFont("Inter"), "Inter", "sans-serif"]).set(
+    button_primary_background_fill="#3B82F6",
+    button_secondary_background_fill="#6B7280",
+)
+from pathlib import Path
+gr.set_static_paths(paths=[Path.cwd().absolute()/"assets"])
+with gr.Blocks(theme=theme, css=css, head=head, delete_cache=(3600,3600)) as demo:
+    header = gr.HTML("""
+                        <!-- Header -->
+                        <header class="max-w-4xl mx-auto mb-12 text-center">
+                            <h1 class="text-4xl font-bold text-gray-900 mb-4">Virtual Data Analyst</h1>
+                            <p class="text-lg text-gray-600 mb-6">
+                                A powerful tool for data analysis, visualizations, and insights
+                            </p>
+                        </header>
+                        <!-- Main Content -->
+                        <main class="max-w-4xl mx-auto">
+                            <!-- Features Preview -->
+                            <div class="mt-12 grid md:grid-cols-3 gap-6" style="margin-bottom:3px !important;">
+                                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
+                                    <i class="feature-icon fas fa-chart-line text-primary text-2xl mb-4"></i>
+                                    <h3 class="font-semibold text-gray-800 mb-2">Advanced Analytics</h3>
+                                    <p class="text-gray-600 text-sm">Run SQL queries, perform regressions, and analyze results with ease</p>
+                                </div>
+                                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
+                                    <i class="feature-icon fas fa-chart-pie text-primary text-2xl mb-4"></i>
+                                    <h3 class="font-semibold text-gray-800 mb-2">Rich Visualizations</h3>
+                                    <p class="text-gray-600 text-sm">Create scatter plots, line charts, pie charts, and more</p>
+                                </div>
+                                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
+                                    <i class="feature-icon fas fa-magic text-primary text-2xl mb-4"></i>
+                                    <h3 class="font-semibold text-gray-800 mb-2">Automated Insights</h3>
+                                    <p class="text-gray-600 text-sm">Get instant insights and recommendations for your data</p>
+                                </div>
+                            </div>
+                        </main>""")
+    with gr.Tab("Data File"):
+        data_file.demo.render()
+    with gr.Tab("SQL Database"):
+        sql_db.demo.render()
+    with gr.Tab("Document (MongoDB) Database"):
+        doc_db.demo.render()
+    with gr.Tab("GraphQL API"):
+        graphql.demo.render()
+    footer = gr.HTML("""<!-- Footer -->
+        <footer class="max-w-4xl mx-auto mt-12 text-center text-gray-500 text-sm">
+            <p>This application is under active development. For bugs or feedback, please open a discussion in the community tab.</p>
+        </footer>""")
+    demo.unload(delete_db)
+# Launch the Gradio app in debug mode; allowed_paths lets files under temp/ and assets/ be served.
+demo.launch(debug=True, allowed_paths=["temp/","assets/"])

assets/styles.css CHANGED Viewed

@@ -89,7 +89,6 @@
     transition: all 0.3s ease;
     position: relative;
     overflow: hidden;
-    background: linear-gradient(135deg, #3B82F6, #0ea5e9) !important;
 }
 .sample-btn::after {
@@ -99,7 +98,7 @@
     left: 0;
     width: 100%;
     height: 100%;
-    background: linear-gradient(rgba(255,255,255,0.12), rgba(255,255,255,0));
     transform: translateY(-100%);
     transition: transform 0.3s ease;
 }
@@ -110,17 +109,7 @@
 .sample-btn:hover {
     transform: translateY(-2px);
-    box-shadow: 0 8px 20px rgba(59,130,246,0.3);
-}
-/* Status badge fade-in */
-@keyframes fadeSlideIn {
-    from { opacity: 0; transform: translateY(-6px); }
-    to   { opacity: 1; transform: translateY(0); }
-}
-.api-status-badge {
-    animation: fadeSlideIn 0.35s ease forwards;
 }
 /* Drop Zone Enhancements */
@@ -185,14 +174,4 @@
         grid-template-columns: 1fr 2fr;
         align-items: baseline;
     }
-  }
-dialog {
-  margin: 10% auto;
-  width: 80%;
-  max-width: 350px;
-  background-color: #fff;
-  padding: 34px;
-  border: 0;
-  border-radius: 5px;
-}

     transition: all 0.3s ease;
     position: relative;
     overflow: hidden;
 }
 .sample-btn::after {
     left: 0;
     width: 100%;
     height: 100%;
+    background: linear-gradient(rgba(255,255,255,0.1), rgba(255,255,255,0));
     transform: translateY(-100%);
     transition: transform 0.3s ease;
 }
 .sample-btn:hover {
     transform: translateY(-2px);
+    box-shadow: 0 8px 15px rgba(0,0,0,0.1);
 }
 /* Drop Zone Enhancements */
         grid-template-columns: 1fr 2fr;
         align-items: baseline;
     }
+  }

data_sources/connect_graphql.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import requests
-import certifi
 import os
 import json
 from utils import TEMP_DIR
@@ -103,8 +102,7 @@ def connect_graphql(graphql_url, api_token, graphql_token_header, session_hash):
         headers = {"Content-Type": "application/json"}
         if graphql_token_header and api_token:
             headers[graphql_token_header] = api_token
-        response = requests.post(graphql_url, headers=headers, json={"query": query},
-                    verify=certifi.where())
         response.raise_for_status()
         introspection_result = response.json()
@@ -121,8 +119,7 @@ def connect_graphql(graphql_url, api_token, graphql_token_header, session_hash):
                     }
                 }
             """
-        types_response = requests.post(graphql_url, headers=headers, json={"query": type_names_query},
-                                       verify=certifi.where())
         types_response_results =types_response.json()

 import requests
 import os
 import json
 from utils import TEMP_DIR
         headers = {"Content-Type": "application/json"}
         if graphql_token_header and api_token:
             headers[graphql_token_header] = api_token
+        response = requests.post(graphql_url, headers=headers, json={"query": query})
         response.raise_for_status()
         introspection_result = response.json()
                     }
                 }
             """
+        types_response = requests.post(graphql_url, headers=headers, json={"query": type_names_query})
         types_response_results =types_response.json()

data_sources/upload_file.py CHANGED Viewed

@@ -95,72 +95,7 @@ def process_data_upload(data_file, session_hash):
         connection.commit()
         connection.close()
-        missing_per_col = {col: int(df[col].isnull().sum()) for col in df.columns}
-        total_missing = sum(missing_per_col.values())
-        def _simplify_dtype(d):
-            s = str(d)
-            if 'int' in s: return 'Integer'
-            if 'float' in s: return 'Float'
-            if 'datetime' in s: return 'DateTime'
-            if 'bool' in s: return 'Boolean'
-            return 'Text'
-        dtypes = {col: _simplify_dtype(df[col].dtype) for col in df.columns}
-        preview = []
-        for _, row in df.head(5).iterrows():
-            row_vals = []
-            for v in row:
-                try:
-                    row_vals.append('' if pd.isna(v) else str(v)[:60])
-                except Exception:
-                    row_vals.append(str(v)[:60])
-            preview.append(row_vals)
-        duplicate_rows = int(df.duplicated().sum())
-        unique_counts = {col: int(df[col].nunique()) for col in df.columns}
-        col_stats = {}
-        for col in df.columns:
-            dtype_str = str(df[col].dtype)
-            try:
-                if 'int' in dtype_str or 'float' in dtype_str:
-                    col_stats[col] = {
-                        'type': 'numeric',
-                        'min': float(df[col].min()),
-                        'max': float(df[col].max()),
-                        'mean': float(df[col].mean()),
-                    }
-                elif 'datetime' in dtype_str:
-                    col_stats[col] = {
-                        'type': 'datetime',
-                        'min': str(df[col].min())[:10],
-                        'max': str(df[col].max())[:10],
-                    }
-            except Exception:
-                pass
-        try:
-            file_size_bytes = os.path.getsize(data_file)
-        except Exception:
-            file_size_bytes = 0
-        stats = {
-            'num_rows': len(df),
-            'num_cols': len(df.columns),
-            'total_missing': total_missing,
-            'missing_per_col': missing_per_col,
-            'dtypes': dtypes,
-            'preview_cols': list(df.columns),
-            'preview': preview,
-            'duplicate_rows': duplicate_rows,
-            'unique_counts': unique_counts,
-            'col_stats': col_stats,
-            'file_size_bytes': file_size_bytes,
-        }
-        return ["success","<p style='color:green;text-align:center;font-size:18px;'>Data upload successful</p>", columns, stats]
     except Exception as e:
         print("UPLOAD ERROR")
         print(e)

         connection.commit()
         connection.close()
+        return ["success","<p style='color:green;text-align:center;font-size:18px;'>Data upload successful</p>", columns]
     except Exception as e:
         print("UPLOAD ERROR")
         print(e)

functions/__init__.py CHANGED Viewed

@@ -1,17 +1,9 @@
-from .query_functions import graphql_schema_query, graphql_csv_query, query_func
 from .chart_functions import table_generation_func, scatter_chart_generation_func, \
-    line_chart_generation_func, bar_chart_generation_func, pie_chart_generation_func, \
-    histogram_generation_func, box_chart_generation_func, correlation_heatmap_func, \
-    scatter_chart_fig, rolling_stats_func
 from .chat_functions import example_question_generator, chatbot_func
-from .stat_functions import regression_func, descriptive_stats_func, \
-    kmeans_clustering_func, hypothesis_test_func
-__all__ = [
-    "query_func", "graphql_schema_query", "graphql_csv_query",
-    "table_generation_func", "scatter_chart_generation_func", "line_chart_generation_func",
-    "bar_chart_generation_func", "pie_chart_generation_func", "histogram_generation_func",
-    "box_chart_generation_func", "correlation_heatmap_func", "rolling_stats_func",
-    "regression_func", "descriptive_stats_func", "kmeans_clustering_func", "hypothesis_test_func",
-    "scatter_chart_fig", "example_question_generator", "chatbot_func",
-]

+from .query_functions import SQLiteQuery, sqlite_query_func, sql_query_func, doc_db_query_func, graphql_query_func, graphql_schema_query, graphql_csv_query
 from .chart_functions import table_generation_func, scatter_chart_generation_func, \
+line_chart_generation_func, bar_chart_generation_func, pie_chart_generation_func, histogram_generation_func, scatter_chart_fig
 from .chat_functions import example_question_generator, chatbot_func
+from .stat_functions import regression_func
+__all__ = ["SQLiteQuery","sqlite_query_func","sql_query_func","doc_db_query_func","graphql_query_func","graphql_schema_query","graphql_csv_query","table_generation_func","scatter_chart_generation_func",
+           "line_chart_generation_func","bar_chart_generation_func","regression_func", "pie_chart_generation_func", "histogram_generation_func",
+           "scatter_chart_fig","example_question_generator","chatbot_func"]

functions/chart_functions.py CHANGED Viewed

@@ -9,20 +9,7 @@ from dotenv import load_dotenv
 load_dotenv()
-root_url = os.getenv("ROOT_URL", "")
-def _write_chart(fig, chart_path, chart_url):
-    """Write a Plotly figure to disk and return a responsive iframe HTML string."""
-    pio.write_html(fig, chart_path, full_html=False, config={"responsive": True})
-    return (
-        'Please display this iframe: '
-        '<div style="width:100%;overflow-x:auto;">'
-        '<iframe style="width:100%;min-width:400px;" height="500" '
-        f'src="{chart_url}" frameborder="0" allowfullscreen>'
-        '</iframe></div>'
-    )
 def llm_chart_data_scrub(data, layout):
    #Processing data to account for variation from LLM
@@ -138,8 +125,13 @@ def scatter_chart_generation_func(x_column: List[str], y_column: str, session_ha
             for data_item in fig["data"]:
                data_item[key] = value
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("SCATTER PLOT ERROR")
@@ -182,10 +174,15 @@ def line_chart_generation_func(x_column: str, y_column: str, session_hash, sessi
             for data_item in fig["data"]:
                data_item[key] = value
-      print(fig)
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("LINE CHART ERROR")
@@ -232,10 +229,15 @@ def bar_chart_generation_func(x_column: str, y_column: str, session_hash, sessio
             for data_item in fig["data"]:
                data_item[key] = value
-      print(fig)
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("BAR CHART ERROR")
@@ -274,10 +276,15 @@ def pie_chart_generation_func(values: str, names: str, session_hash, session_fol
             for data_item in fig["data"]:
                data_item[key] = value
-      print(fig)
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("PIE CHART ERROR")
@@ -328,10 +335,15 @@ def histogram_generation_func(x_column: str, session_hash, session_folder, y_col
             for data_item in fig["data"]:
                data_item[key] = value
-      print(fig)
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("HISTOGRAM ERROR")
@@ -342,185 +354,32 @@ def histogram_generation_func(x_column: str, session_hash, session_folder, y_col
             """
       return {"reply": reply}
-def box_chart_generation_func(y_column: str, session_hash, session_folder,
-                              x_column: str="", category: str="",
-                              layout: List[dict]=[{}], **kwargs):
-    try:
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        chart_path = f'{dir_path}/chart.html'
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        function_args = {"data_frame": df, "y": y_column}
-        if x_column:
-            function_args["x"] = x_column
-        if category:
-            function_args["color"] = category
-        initial_graph = px.box(**function_args)
-        fig = initial_graph.to_dict()
-        _, layout_dict = llm_chart_data_scrub({}, layout)
-        if layout_dict:
-            fig["layout"] = layout_dict
-        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        return {"reply": _write_chart(fig, chart_path, chart_url)}
-    except Exception as e:
-        print("BOX CHART ERROR")
-        print(e)
-        return {"reply": f"There was an error generating the box plot. Error: {e}. You should probably try again."}
-def correlation_heatmap_func(session_hash, session_folder, columns: List[str]=[], **kwargs):
-    try:
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        chart_path = f'{dir_path}/chart.html'
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        numeric_df = df[columns].select_dtypes(include='number') if columns else df.select_dtypes(include='number')
-        if numeric_df.shape[1] < 2:
-            return {"reply": "At least two numeric columns are needed for a correlation matrix. Please refine your query to include more numeric columns."}
-        corr = numeric_df.corr().round(3)
-        fig = px.imshow(
-            corr,
-            text_auto='.2f',
-            color_continuous_scale='RdBu_r',
-            zmin=-1,
-            zmax=1,
-            title='Correlation Matrix',
-            aspect='auto',
-        )
-        fig.update_layout(font=dict(family='Inter, system-ui, sans-serif'))
-        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        return {"reply": _write_chart(fig, chart_path, chart_url)}
-    except Exception as e:
-        print("CORRELATION HEATMAP ERROR")
-        print(e)
-        return {"reply": f"There was an error generating the correlation heatmap. Error: {e}. You should probably try again."}
-def rolling_stats_func(x_column: str, y_column: str, session_hash, session_folder,
-                       window: int = 7, stats: List[str] = ["mean"],
-                       layout: List[dict] = [{}], category: str = "", **kwargs):
-    try:
-        import plotly.graph_objects as go
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        chart_path = f'{dir_path}/chart.html'
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        try:
-            df[x_column] = pd.to_datetime(df[x_column])
-        except Exception:
-            pass
-        df = df.sort_values(x_column)
-        valid_stats = {"mean", "std", "min", "max"}
-        selected_stats = [s for s in stats if s in valid_stats] or ["mean"]
-        fig = go.Figure()
-        groups = df[category].unique().tolist() if category and category in df.columns else [None]
-        for group in groups:
-            group_df = df[df[category] == group] if group is not None else df
-            prefix = f"{group} — " if group is not None else ""
-            fig.add_trace(go.Scatter(
-                x=group_df[x_column].values, y=group_df[y_column].values,
-                mode="lines", name=f"{prefix}{y_column} (raw)",
-                opacity=0.35, line=dict(width=1)
-            ))
-            rolling_obj = group_df[y_column].rolling(window)
-            for stat in selected_stats:
-                rolled = getattr(rolling_obj, stat)()
-                fig.add_trace(go.Scatter(
-                    x=group_df[x_column].values, y=rolled.values,
-                    mode="lines", name=f"{prefix}Rolling {stat.capitalize()} (w={window})",
-                    line=dict(width=2.5)
-                ))
-        fig.update_layout(
-            title=f"Rolling Statistics (window={window}) — {y_column}",
-            xaxis_title=x_column,
-            yaxis_title=y_column,
-            font=dict(family="Inter, system-ui, sans-serif"),
-            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
-        )
-        _, layout_dict = llm_chart_data_scrub({}, layout)
-        if layout_dict:
-            fig.update_layout(**layout_dict)
-        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        return {"reply": _write_chart(fig, chart_path, chart_url)}
-    except Exception as e:
-        print("ROLLING STATS ERROR")
-        print(e)
-        return {"reply": f"There was an error generating the rolling statistics chart. Error: {e}. You should probably try again."}
 def table_generation_func(session_hash, session_folder, **kwargs):
     print("TABLE GENERATION")
-    try:
-        from html import escape
         dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
         csv_query_path = f'{dir_path}/query.csv'
         df = pd.read_csv(csv_query_path)
-        total_rows = len(df)
-        max_rows = 200
-        if total_rows > max_rows:
-            df = df.head(max_rows)
-            note = (f'<p class="vda-table-note">Showing first {max_rows} of {total_rows} rows'
-                    ' — refine your query to see more specific results.</p>')
-        else:
-            note = ''
-        header_cells = ''.join(f'<th>{escape(str(col))}</th>' for col in df.columns)
-        row_html = [
-            '<tr>' + ''.join(f'<td>{escape(str(val))}</td>' for val in row) + '</tr>'
-            for _, row in df.iterrows()
-        ]
-        style = (
-            '<style>'
-            '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
-            '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
-            '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
-            '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
-            '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
-            '.vda-table tbody tr:last-child td{border-bottom:none;}'
-            '.vda-table-note{font-size:12px;color:#6b7280;margin:4px 0 0;text-align:right;}'
-            '</style>'
-        )
-        table = (
-            '<div class="vda-table-wrap"><table class="vda-table">'
-            f'<thead><tr>{header_cells}</tr></thead>'
-            '<tbody>' + '\n'.join(row_html) + '</tbody>'
-            '</table></div>'
-        )
-        return {"reply": style + table + note}
     except Exception as e:
-        print("TABLE ERROR")
-        print(e)
-        return {"reply": f"There was an error generating the table. Error: {e}. You should probably try again."}

 load_dotenv()
+root_url = os.getenv("ROOT_URL")
 def llm_chart_data_scrub(data, layout):
    #Processing data to account for variation from LLM
             for data_item in fig["data"]:
                data_item[key] = value
+      pio.write_html(fig, chart_path, full_html=False)
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
+      iframe = 'Please display this iframe: <div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("SCATTER PLOT ERROR")
             for data_item in fig["data"]:
                data_item[key] = value
+      print(fig)
+      pio.write_html(fig, chart_path, full_html=False)
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
+      iframe = 'Please display this iframe: <div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("LINE CHART ERROR")
             for data_item in fig["data"]:
                data_item[key] = value
+      print(fig)
+      pio.write_html(fig, chart_path, full_html=False)
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
+      iframe = 'Please display this iframe: <div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("BAR CHART ERROR")
             for data_item in fig["data"]:
                data_item[key] = value
+      print(fig)
+      pio.write_html(fig, chart_path, full_html=False)
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
+      iframe = 'Please display this iframe: <div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("PIE CHART ERROR")
             for data_item in fig["data"]:
                data_item[key] = value
+      print(fig)
+      pio.write_html(fig, chart_path, full_html=False)
       chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
+      iframe = 'Please display this iframe: <div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("HISTOGRAM ERROR")
             """
       return {"reply": reply}
 def table_generation_func(session_hash, session_folder, **kwargs):
     print("TABLE GENERATION")
+    try:
         dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
         csv_query_path = f'{dir_path}/query.csv'
+        table_path = f'{dir_path}/table.html'
         df = pd.read_csv(csv_query_path)
+        html_table = df.to_html()
+        print(html_table[:1000])
+        with open(table_path, "w") as file:
+         file.write(html_table)
+        table_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/table.html'
+        iframe = 'Please display this iframe: <div style=overflow:auto;><iframe\n scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + table_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+        print(iframe)
+        return {"reply": iframe}
     except Exception as e:
+      print("TABLE ERROR")
+      print(e)
+      reply = f"""There was an error generating the Pandas DataFrame table results.
+              The error is {e},
+              You should probably try again.
+              """
+      return {"reply": reply}

functions/chat_functions.py CHANGED Viewed

@@ -1,19 +1,9 @@
-from utils import message_dict, api_key_store, model_store
 from haystack.dataclasses import ChatMessage
 from haystack.components.generators.chat import OpenAIChatGenerator
-from haystack.utils import Secret
-def _get_generator(session_hash):
-    api_key = api_key_store.get(session_hash)
-    if not api_key:
-        raise ValueError("No API key found for this session. Please enter your API key at the top of the page.")
-    model = model_store.get(session_hash, "gpt-4o")
-    if api_key.startswith("sk-ant-"):
-        from haystack_integrations.components.generators.chat import AnthropicChatGenerator
-        return AnthropicChatGenerator(model=model, api_key=Secret.from_token(api_key))
-    return OpenAIChatGenerator(model=model, api_key=Secret.from_token(api_key))
 response = None
 def example_question_message(data_source, name, titles, schema):
@@ -23,15 +13,15 @@ def example_question_message(data_source, name, titles, schema):
                          f"""We have a SQLite database with the following {titles}.
                         We also have an AI agent with access to the same database that will be performing data analysis.
                         Please return an array of seven strings, each one being a question for our data analysis agent
-                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
                         data insights. Return nothing more than the array of questions because I need that specific data structure
                         to process your response. No other response type or data structure will work."""],
-        'sql' : [f"You are a helpful and knowledgeable agent who has access to a PostgreSQL database called {name}.",
                  f"""We have a PostgreSQL database with the following tables: {titles}.
                         We also have an AI agent with access to the same database that will be performing data analysis.
                         Please return an array of seven strings, each one being a question for our data analysis agent
-                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
                         data insights. Return nothing more than the array of questions because I need that specific data structure
                         to process your response. No other response type or data structure will work."""],
@@ -40,7 +30,7 @@ def example_question_message(data_source, name, titles, schema):
                         The schema of these collections is: {schema}.
                         We also have an AI agent with access to the same database that will be performing data analysis.
                         Please return an array of seven strings, each one being a question for our data analysis agent
-                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
                         data insights. Return nothing more than the array of questions because I need that specific data structure
                         to process your response. No other response type or data structure will work."""],
@@ -48,7 +38,7 @@ def example_question_message(data_source, name, titles, schema):
                      f"""We have a GraphQL API endpoint with the following types: {titles}.
                         We also have an AI agent with access to the same GraphQL API endpoint that will be performing data analysis.
                         Please return an array of seven strings, each one being a question for our data analysis agent
-                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
                         data insights. Return nothing more than the array of questions because I need that specific data structure
                         to process your response. No other response type or data structure will work."""]
@@ -67,84 +57,72 @@ def example_question_generator(session_hash, data_source, name, titles, schema):
     example_messages.append(ChatMessage.from_user(text=example_message_list[1]))
-    example_response = _get_generator(session_hash).run(messages=example_messages)
-    response_text = example_response["replies"][0].text
-    start = response_text.index("[") + 1
-    end = response_text.index("]")
-    response_content = response_text[start:end]
-    response_list = '[' + response_content + ']'
-    print(response_list)
-    return response_list
 def system_message(data_source, titles, schema=""):
-    print("TITLES")
-    print(titles)
-    tools_desc = (
-        " You have access to tools for querying the data source, generating charts and visualisations,"
-        " and performing statistical analyses — use them proactively whenever they would help answer the user's question."
-        " Always display any charts, tables, and visualisations inline in your responses by outputting the returned HTML verbatim."
-    )
     system_message_dict = {
-        'file_upload': (
-            f"You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source' that contains the following columns: {titles}."
-            + tools_desc
-        ),
-        'sql': (
-            f"You are a helpful and knowledgeable agent who has access to a PostgreSQL database which has a series of tables called {titles}."
-            + tools_desc
-        ),
-        'doc_db': (
-            f"You are a helpful and knowledgeable agent who has access to a NoSQL MongoDB Document database which has a series of collections called {titles}. "
-            f"The schema of these collections is: {schema}."
-            + tools_desc
-        ),
-        'graphql': (
-            f"You are a helpful and knowledgeable agent who has access to a GraphQL API which has the following types: {titles}. "
-            "We have also saved a schema.json file that contains the entire introspection query that we can use to find out more about each type before making a query."
-            + tools_desc
-        ),
     }
     return system_message_dict[data_source]
 def chatbot_func(message, history, session_hash, data_source, titles, schema, *args):
-    try:
-        chat_generator = _get_generator(session_hash)
-    except ValueError as e:
-        return str(e)
-    from functions import (
-        table_generation_func, regression_func, descriptive_stats_func,
-        scatter_chart_generation_func, line_chart_generation_func, bar_chart_generation_func,
-        pie_chart_generation_func, histogram_generation_func,
-        box_chart_generation_func, correlation_heatmap_func, rolling_stats_func,
-        query_func, graphql_schema_query, graphql_csv_query,
-        kmeans_clustering_func, hypothesis_test_func,
-    )
     import tools.tools as tools
-    available_functions = {
-        "query_func": query_func,
-        "graphql_schema_query": graphql_schema_query,
-        "graphql_csv_query": graphql_csv_query,
-        "table_generation_func": table_generation_func,
-        "scatter_chart_generation_func": scatter_chart_generation_func,
-        "line_chart_generation_func": line_chart_generation_func,
-        "bar_chart_generation_func": bar_chart_generation_func,
-        "pie_chart_generation_func": pie_chart_generation_func,
-        "histogram_generation_func": histogram_generation_func,
-        "box_chart_generation_func": box_chart_generation_func,
-        "correlation_heatmap_func": correlation_heatmap_func,
-        "rolling_stats_func": rolling_stats_func,
-        "regression_func": regression_func,
-        "descriptive_stats_func": descriptive_stats_func,
-        "kmeans_clustering_func": kmeans_clustering_func,
-        "hypothesis_test_func": hypothesis_test_func,
-    }
     if message_dict[session_hash][data_source] != None:
         message_dict[session_hash][data_source].append(ChatMessage.from_user(message))
@@ -155,11 +133,10 @@ def chatbot_func(message, history, session_hash, data_source, titles, schema, *a
         messages.append(ChatMessage.from_user(message))
         message_dict[session_hash][data_source] = messages
-    active_tools = tools.tools_call(session_hash, data_source, titles)
-    response = chat_generator.run(messages=message_dict[session_hash][data_source], tools=active_tools)
     while True:
-        # if the response is a tool call
         if response and response["replies"][0].meta["finish_reason"] == "tool_calls" or response["replies"][0].tool_calls:
             function_calls = response["replies"][0].tool_calls
             for function_call in function_calls:
@@ -174,7 +151,7 @@ def chatbot_func(message, history, session_hash, data_source, titles, schema, *a
                 print(function_name)
                 ## Append function response to the messages list using `ChatMessage.from_tool`
                 message_dict[session_hash][data_source].append(ChatMessage.from_tool(tool_result=function_response['reply'], origin=function_call))
-                response = chat_generator.run(messages=message_dict[session_hash][data_source], tools=active_tools)
         # Regular Conversation
         else:

+from utils import message_dict
 from haystack.dataclasses import ChatMessage
 from haystack.components.generators.chat import OpenAIChatGenerator
+chat_generator = OpenAIChatGenerator(model="gpt-4o")
 response = None
 def example_question_message(data_source, name, titles, schema):
                          f"""We have a SQLite database with the following {titles}.
                         We also have an AI agent with access to the same database that will be performing data analysis.
                         Please return an array of seven strings, each one being a question for our data analysis agent
+                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
                         data insights. Return nothing more than the array of questions because I need that specific data structure
                         to process your response. No other response type or data structure will work."""],
+        'sql' : [f"You are a helpful and knowledgeable agent who has access to a PostgreSQL database called {name}.",
                  f"""We have a PostgreSQL database with the following tables: {titles}.
                         We also have an AI agent with access to the same database that will be performing data analysis.
                         Please return an array of seven strings, each one being a question for our data analysis agent
+                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
                         data insights. Return nothing more than the array of questions because I need that specific data structure
                         to process your response. No other response type or data structure will work."""],
                         The schema of these collections is: {schema}.
                         We also have an AI agent with access to the same database that will be performing data analysis.
                         Please return an array of seven strings, each one being a question for our data analysis agent
+                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
                         data insights. Return nothing more than the array of questions because I need that specific data structure
                         to process your response. No other response type or data structure will work."""],
                      f"""We have a GraphQL API endpoint with the following types: {titles}.
                         We also have an AI agent with access to the same GraphQL API endpoint that will be performing data analysis.
                         Please return an array of seven strings, each one being a question for our data analysis agent
+                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
                         data insights. Return nothing more than the array of questions because I need that specific data structure
                         to process your response. No other response type or data structure will work."""]
     example_messages.append(ChatMessage.from_user(text=example_message_list[1]))
+    example_response = chat_generator.run(messages=example_messages)
+    return example_response["replies"][0].text
 def system_message(data_source, titles, schema=""):
     system_message_dict = {
+        'file_upload' : f"""You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source' that contains the following columns: {titles}.
+                    You also have access to a function, called table_generation_func, that can take a query.csv file generated from our sql query and returns an iframe that we should display in our chat window.
+                    You also have access to a scatter plot function, called scatter_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a scatter plot and returns an iframe that we should display in our chat window.
+                    You also have access to a line chart function, called line_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a line chart and returns an iframe that we should display in our chat window.
+                    You also have access to a bar graph function, called bar_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a bar graph and returns an iframe that we should display in our chat window.
+                    You also have access to a pie chart function, called pie_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a pie chart and returns an iframe that we should display in our chat window.
+                    You also have access to a histogram function, called histogram_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a histogram and returns an iframe that we should display in our chat window.
+                    You also have access to a linear regression function, called regression_func, that can take a query.csv file generated from our sql query and a list of column names for our independent and dependent variables and return a regression data string and a regression chart which is returned as an iframe.
+                    Could you please always display the generated charts, tables, and visualizations as part of your output?""",
+        'sql' : f"""You are a helpful and knowledgeable agent who has access to a PostgreSQL database which has a series of tables called {titles}.
+                    You also have access to a function, called table_generation_func, that can take a query.csv file generated from our sql query and returns an iframe that we should display in our chat window.
+                    You also have access to a scatter plot function, called scatter_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a scatter plot and returns an iframe that we should display in our chat window.
+                    You also have access to a line chart function, called line_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a line chart and returns an iframe that we should display in our chat window.
+                    You also have access to a bar graph function, called bar_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a bar graph and returns an iframe that we should display in our chat window.
+                    You also have access to a pie chart function, called pie_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a pie chart and returns an iframe that we should display in our chat window.
+                    You also have access to a histogram function, called histogram_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a histogram and returns an iframe that we should display in our chat window.
+                    You also have access to a linear regression function, called regression_func, that can take a query.csv file generated from our sql query and a list of column names for our independent and dependent variables and return a regression data string and a regression chart which is returned as an iframe.
+                    Could you please always display the generated charts, tables, and visualizations as part of your output?""",
+        'doc_db' : f"""You are a helpful and knowledgeable agent who has access to a NoSQL MongoDB Document database which has a series of collections called {titles}.
+                    The schema of these collections is: {schema}.
+                    You also have access to a function, called table_generation_func, that can take a query.csv file generated from our MongoDB query and returns an iframe that we should display in our chat window.
+                    You also have access to a scatter plot function, called scatter_chart_generation_func, that can take a query.csv file generated from our MongoDB query and uses plotly dictionaries to generate a scatter plot and returns an iframe that we should display in our chat window.
+                    You also have access to a line chart function, called line_chart_generation_func, that can take a query.csv file generated from our MongoDB query and uses plotly dictionaries to generate a line chart and returns an iframe that we should display in our chat window.
+                    You also have access to a bar graph function, called bar_chart_generation_func, that can take a query.csv file generated from our MongoDB query and uses plotly dictionaries to generate a bar graph and returns an iframe that we should display in our chat window.
+                    You also have access to a pie chart function, called pie_chart_generation_func, that can take a query.csv file generated from our MongoDB query and uses plotly dictionaries to generate a pie chart and returns an iframe that we should display in our chat window.
+                    You also have access to a histogram function, called histogram_generation_func, that can take a query.csv file generated from our MongoDB query and uses plotly dictionaries to generate a histogram and returns an iframe that we should display in our chat window.
+                    You also have access to a linear regression function, called regression_func, that can take a query.csv file generated from our MongoDB query and a list of column names for our independent and dependent variables and return a regression data string and a regression chart which is returned as an iframe.
+                    Could you please always display the generated charts, tables, and visualizations as part of your output?""",
+        'graphql' : f"""You are a helpful and knowledgeable agent who has access to a GraphQL API which has the following types: {titles}.
+                    We have also saved a schema.json file that contains the entire introspection query that we can use to find out more about each type before making a query.
+                    You also have access to a function, called table_generation_func, that can take a query.csv file generated from our GraphQL API query and returns an iframe that we should display in our chat window.
+                    You also have access to a scatter plot function, called scatter_chart_generation_func, that can take a query.csv file generated from our GraphQL API query and uses plotly dictionaries to generate a scatter plot and returns an iframe that we should display in our chat window.
+                    You also have access to a line chart function, called line_chart_generation_func, that can take a query.csv file generated from our GraphQL API query and uses plotly dictionaries to generate a line chart and returns an iframe that we should display in our chat window.
+                    You also have access to a bar graph function, called bar_chart_generation_func, that can take a query.csv file generated from our GraphQL API query and uses plotly dictionaries to generate a bar graph and returns an iframe that we should display in our chat window.
+                    You also have access to a pie chart function, called pie_chart_generation_func, that can take a query.csv file generated from our GraphQL API query and uses plotly dictionaries to generate a pie chart and returns an iframe that we should display in our chat window.
+                    You also have access to a histogram function, called histogram_generation_func, that can take a query.csv file generated from our GraphQL API query and uses plotly dictionaries to generate a histogram and returns an iframe that we should display in our chat window.
+                    You also have access to a linear regression function, called regression_func, that can take a query.csv file generated from our GraphQL API query and a list of column names for our independent and dependent variables and return a regression data string and a regression chart which is returned as an iframe.
+                    Could you please always display the generated charts, tables, and visualizations as part of your output?"""
     }
     return system_message_dict[data_source]
 def chatbot_func(message, history, session_hash, data_source, titles, schema, *args):
+    from functions import sqlite_query_func, table_generation_func, regression_func, scatter_chart_generation_func, \
+        sql_query_func, doc_db_query_func, graphql_query_func, graphql_schema_query, graphql_csv_query, \
+        line_chart_generation_func,bar_chart_generation_func,pie_chart_generation_func,histogram_generation_func
     import tools.tools as tools
+    available_functions = {"sqlite_query_func": sqlite_query_func,"sql_query_func": sql_query_func,"doc_db_query_func": doc_db_query_func,
+                           "graphql_query_func": graphql_query_func,"graphql_schema_query": graphql_schema_query,"graphql_csv_query": graphql_csv_query,
+                           "table_generation_func":table_generation_func,
+                           "line_chart_generation_func":line_chart_generation_func,"bar_chart_generation_func":bar_chart_generation_func,
+                           "scatter_chart_generation_func":scatter_chart_generation_func, "pie_chart_generation_func":pie_chart_generation_func,
+                           "histogram_generation_func":histogram_generation_func,
+                           "regression_func":regression_func }
     if message_dict[session_hash][data_source] != None:
         message_dict[session_hash][data_source].append(ChatMessage.from_user(message))
         messages.append(ChatMessage.from_user(message))
         message_dict[session_hash][data_source] = messages
+    response = chat_generator.run(messages=message_dict[session_hash][data_source], generation_kwargs={"tools": tools.tools_call(session_hash, data_source, titles)})
     while True:
+        # if OpenAI response is a tool call
         if response and response["replies"][0].meta["finish_reason"] == "tool_calls" or response["replies"][0].tool_calls:
             function_calls = response["replies"][0].tool_calls
             for function_call in function_calls:
                 print(function_name)
                 ## Append function response to the messages list using `ChatMessage.from_tool`
                 message_dict[session_hash][data_source].append(ChatMessage.from_tool(tool_result=function_response['reply'], origin=function_call))
+                response = chat_generator.run(messages=message_dict[session_hash][data_source], generation_kwargs={"tools": tools.tools_call(session_hash, data_source, titles)})
         # Regular Conversation
         else:

functions/query_functions.py CHANGED Viewed

@@ -23,16 +23,36 @@ class SQLiteQuery:
       self.connection = sqlite3.connect(sql_database, check_same_thread=False)
     @component.output_types(results=List[str], queries=List[str])
-    def run(self, queries: AnyStr, session_hash):
         print("ATTEMPTING TO RUN SQLITE QUERY")
         dir_path = TEMP_DIR / str(session_hash)
         results = []
-        result = pd.read_sql(queries, self.connection)
-        result.to_csv(f'{dir_path}/file_upload/query.csv', index=False)
-        column_names = list(result.columns)
-        results.append(f"{result}")
         self.connection.close()
-        return {"results": results, "queries": queries, "csv_columns": column_names}
 @component
 class PostgreSQLQuery:
@@ -47,16 +67,39 @@ class PostgreSQLQuery:
         )
     @component.output_types(results=List[str], queries=List[str])
-    def run(self, queries: AnyStr, session_hash):
         print("ATTEMPTING TO RUN POSTGRESQL QUERY")
         dir_path = TEMP_DIR / str(session_hash)
         results = []
-        result = pd.read_sql_query(queries, self.connection)
-        result.to_csv(f'{dir_path}/sql/query.csv', index=False)
-        column_names = list(result.columns)
-        results.append(f"{result}")
         self.connection.close()
-        return {"results": results, "queries": queries, "csv_columns": column_names}
 @component
 class DocDBQuery:
@@ -100,11 +143,31 @@ class DocDBQuery:
         docs = collection.aggregate_pandas_all(query_list)
         print("DATA FRAME COMPLETE")
         docs.to_csv(f'{dir_path}/doc_db/query.csv', index=False)
-        column_names = list(docs.columns)
         print("CSV COMPLETE")
         results.append(f"{docs}")
         self.client.close()
-        return {"results": results, "queries": aggregation_pipeline, "csv_columns": column_names}
 @component
 class GraphQLQuery:
@@ -137,40 +200,25 @@ class GraphQLQuery:
           #print(response_frame)
           response_frame.to_csv(f'{dir_path}/graphql/query.csv', index=False)
-          column_names = list(response_frame.columns)
           print("CSV COMPLETE")
           results.append(f"{response_frame}")
-          return {"results": results, "queries": graphql_query, "csv_columns": column_names}
-def query_func(queries:AnyStr, session_hash, session_folder, args, **kwargs):
     try:
-      print("QUERY")
-      print(queries)
-      if session_folder == "file_upload":
-        dir_path = TEMP_DIR / str(session_hash)
-        sql_query = SQLiteQuery(f'{dir_path}/file_upload/data_source.db')
-        result = sql_query.run(queries, session_hash)
-      elif session_folder == "sql":
-        sql_query = PostgreSQLQuery(args[0], args[1], args[2], args[3], args[4])
-        result = sql_query.run(queries, session_hash)
-      elif session_folder == 'doc_db':
-        doc_db_query = DocDBQuery(args[0], args[1])
-        result = doc_db_query.run(queries, kwargs['db_collection'], session_hash)
-      elif session_folder == 'graphql':
-        graphql_object = GraphQLQuery()
-        result = graphql_object.run(queries, args[0], args[1], args[2], session_hash)
       print("RESULT")
-      print(result["csv_columns"])
       if len(result["results"][0]) > 1000:
         print("QUERY TOO LARGE")
-        return {"reply": f"""query result too large to be processed by llm, the query results are in our query.csv file.
-                The column names of this query.csv file are: {result["csv_columns"]}.
-                If you need to display the results directly, perhaps use the table_generation_func function."""}
       else:
         return {"reply": result["results"][0]}
     except Exception as e:
-      reply = f"""There was an error running the {session_folder} Query = {queries}
               The error is {e},
               You should probably try again.
               """
@@ -206,19 +254,11 @@ def graphql_csv_query(csv_query: AnyStr, session_hash, **kwargs):
       query = pd.read_csv(f'{dir_path}/graphql/query.csv')
       query.Name = 'query'
       print("GRAPHQL CSV QUERY")
-      print(csv_query)
       queried_df = sqldf(csv_query, locals())
       print(queried_df)
-      column_names = list(queried_df.columns)
       queried_df.to_csv(f'{dir_path}/graphql/query.csv', index=False)
-      if len(queried_df) > 1000:
-        print("CSV QUERY TOO LARGE")
-        return {"reply": f"""The new query results are in our query.csv file.
-                The column names of this query.csv file are: {column_names}.
-                If you need to display the results directly, perhaps use the table_generation_func function."""}
-      else:
-        return {"reply": str(queried_df)}
     except Exception as e:
       reply = f"""There was an error querying our query.csv file with the query:{csv_query}
@@ -226,4 +266,4 @@ def graphql_csv_query(csv_query: AnyStr, session_hash, **kwargs):
               You should probably try again.
               """
       print(reply)
-      return {"reply": reply}

       self.connection = sqlite3.connect(sql_database, check_same_thread=False)
     @component.output_types(results=List[str], queries=List[str])
+    def run(self, queries: List[str], session_hash):
         print("ATTEMPTING TO RUN SQLITE QUERY")
         dir_path = TEMP_DIR / str(session_hash)
         results = []
+        for query in queries:
+          result = pd.read_sql(query, self.connection)
+          result.to_csv(f'{dir_path}/file_upload/query.csv', index=False)
+          results.append(f"{result}")
         self.connection.close()
+        return {"results": results, "queries": queries}
+def sqlite_query_func(queries: List[str], session_hash, **kwargs):
+    dir_path = TEMP_DIR / str(session_hash)
+    sql_query = SQLiteQuery(f'{dir_path}/file_upload/data_source.db')
+    try:
+      result = sql_query.run(queries, session_hash)
+      if len(result["results"][0]) > 1000:
+        print("QUERY TOO LARGE")
+        return {"reply": "query result too large to be processed by llm, the query results are in our query.csv file. If you need to display the results directly, perhaps use the table_generation_func function."}
+      else:
+        return {"reply": result["results"][0]}
+    except Exception as e:
+      reply = f"""There was an error running the SQL Query = {queries}
+              The error is {e},
+              You should probably try again.
+              """
+      return {"reply": reply}
 @component
 class PostgreSQLQuery:
         )
     @component.output_types(results=List[str], queries=List[str])
+    def run(self, queries: List[str], session_hash):
         print("ATTEMPTING TO RUN POSTGRESQL QUERY")
         dir_path = TEMP_DIR / str(session_hash)
         results = []
+        for query in queries:
+          print(query)
+          result = pd.read_sql_query(query, self.connection)
+          result.to_csv(f'{dir_path}/sql/query.csv', index=False)
+          results.append(f"{result}")
         self.connection.close()
+        return {"results": results, "queries": queries}
+def sql_query_func(queries: List[str], session_hash, args, **kwargs):
+    sql_query = PostgreSQLQuery(args[0], args[1], args[2], args[3], args[4])
+    try:
+      result = sql_query.run(queries, session_hash)
+      print("RESULT")
+      print(result)
+      if len(result["results"][0]) > 1000:
+        print("QUERY TOO LARGE")
+        return {"reply": "query result too large to be processed by llm, the query results are in our query.csv file. If you need to display the results directly, perhaps use the table_generation_func function."}
+      else:
+        return {"reply": result["results"][0]}
+    except Exception as e:
+      reply = f"""There was an error running the SQL Query = {queries}
+              The error is {e},
+              You should probably try again.
+              """
+      print(reply)
+      return {"reply": reply}
 @component
 class DocDBQuery:
         docs = collection.aggregate_pandas_all(query_list)
         print("DATA FRAME COMPLETE")
         docs.to_csv(f'{dir_path}/doc_db/query.csv', index=False)
         print("CSV COMPLETE")
         results.append(f"{docs}")
         self.client.close()
+        return {"results": results, "queries": aggregation_pipeline}
+def doc_db_query_func(aggregation_pipeline: List[str], db_collection: AnyStr, session_hash, args, **kwargs):
+    doc_db_query = DocDBQuery(args[0], args[1])
+    try:
+      result = doc_db_query.run(aggregation_pipeline, db_collection, session_hash)
+      print("RESULT")
+      if len(result["results"][0]) > 1000:
+        print("QUERY TOO LARGE")
+        return {"reply": "query result too large to be processed by llm, the query results are in our query.csv file. If you need to display the results directly, perhaps use the table_generation_func function."}
+      else:
+        return {"reply": result["results"][0]}
+    except Exception as e:
+      reply = f"""There was an error running the NoSQL (Mongo) Query = {aggregation_pipeline}
+              The error is {e},
+              You should probably try again.
+              """
+      print(reply)
+      return {"reply": reply}
 @component
 class GraphQLQuery:
           #print(response_frame)
           response_frame.to_csv(f'{dir_path}/graphql/query.csv', index=False)
           print("CSV COMPLETE")
           results.append(f"{response_frame}")
+          return {"results": results, "queries": graphql_query}
+def graphql_query_func(graphql_query: AnyStr, session_hash, args, **kwargs):
+    graphql_object = GraphQLQuery()
     try:
+      result = graphql_object.run(graphql_query, args[0], args[1], args[2], session_hash)
       print("RESULT")
       if len(result["results"][0]) > 1000:
         print("QUERY TOO LARGE")
+        return {"reply": "query result too large to be processed by llm, the query results are in our query.csv file. If you need to display the results directly, perhaps use the table_generation_func function."}
       else:
         return {"reply": result["results"][0]}
     except Exception as e:
+      reply = f"""There was an error running the GraphQL Query = {graphql_query}
               The error is {e},
               You should probably try again.
               """
       query = pd.read_csv(f'{dir_path}/graphql/query.csv')
       query.Name = 'query'
       print("GRAPHQL CSV QUERY")
       queried_df = sqldf(csv_query, locals())
       print(queried_df)
       queried_df.to_csv(f'{dir_path}/graphql/query.csv', index=False)
+      return {"reply": "The new query results are in our query.csv file. If you need to display the results directly, perhaps use the table_generation_func function."}
     except Exception as e:
       reply = f"""There was an error querying our query.csv file with the query:{csv_query}
               You should probably try again.
               """
       print(reply)
+      return {"reply": reply}

functions/stat_functions.py CHANGED Viewed

@@ -5,244 +5,12 @@ from utils import TEMP_DIR
 import plotly.express as px
 import plotly.io as pio
 import os
-from functions.chart_functions import scatter_chart_fig, llm_chart_data_scrub, _write_chart
 from dotenv import load_dotenv
 load_dotenv()
-root_url = os.getenv("ROOT_URL", "")
-def descriptive_stats_func(session_hash, session_folder, columns: List[str]=[], **kwargs):
-    print("DESCRIPTIVE STATISTICS")
-    try:
-        from html import escape
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        if columns:
-            df = df[[c for c in columns if c in df.columns]]
-        desc = df.describe().round(4)
-        header_cells = '<th style="background:#1e40af;">Statistic</th>' + ''.join(
-            f'<th>{escape(str(col))}</th>' for col in desc.columns
-        )
-        row_html = [
-            '<tr>'
-            + f'<td style="font-weight:600;color:#1e40af;background:#eff6ff;white-space:nowrap;">{escape(str(idx))}</td>'
-            + ''.join(f'<td>{escape(str(val))}</td>' for val in row)
-            + '</tr>'
-            for idx, row in desc.iterrows()
-        ]
-        style = (
-            '<style>'
-            '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
-            '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
-            '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
-            '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
-            '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
-            '.vda-table tbody tr:last-child td{border-bottom:none;}'
-            '</style>'
-        )
-        table = (
-            '<div class="vda-table-wrap"><table class="vda-table">'
-            f'<thead><tr>{header_cells}</tr></thead>'
-            '<tbody>' + '\n'.join(row_html) + '</tbody>'
-            '</table></div>'
-        )
-        return {"reply": style + table}
-    except Exception as e:
-        print("DESCRIPTIVE STATS ERROR")
-        print(e)
-        return {"reply": f"There was an error generating descriptive statistics. Error: {e}. You should probably try again."}
-def kmeans_clustering_func(feature_columns: List[str], x_column: str, y_column: str,
-                           session_hash, session_folder, n_clusters: int = 3,
-                           layout: List[dict] = [{}], **kwargs):
-    print("KMEANS CLUSTERING")
-    try:
-        from sklearn.cluster import KMeans
-        from sklearn.preprocessing import StandardScaler
-        from html import escape
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        chart_path = f'{dir_path}/chart.html'
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        feature_df = df[feature_columns].select_dtypes(include='number').dropna()
-        if feature_df.shape[1] < 1:
-            return {"reply": "No numeric feature columns found for clustering. Please refine your query to include numeric columns."}
-        X_scaled = StandardScaler().fit_transform(feature_df)
-        labels = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit_predict(X_scaled)
-        df_clustered = df.loc[feature_df.index].copy()
-        df_clustered['Cluster'] = [f'Cluster {l}' for l in labels]
-        fig = px.scatter(
-            df_clustered, x=x_column, y=y_column, color='Cluster',
-            title=f'K-Means Clustering (k={n_clusters})',
-        )
-        fig.update_layout(font=dict(family='Inter, system-ui, sans-serif'))
-        _, layout_dict = llm_chart_data_scrub({}, layout)
-        if layout_dict:
-            fig.update_layout(**layout_dict)
-        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        iframe = _write_chart(fig, chart_path, chart_url)
-        cluster_summary = df_clustered.groupby('Cluster')[feature_columns].mean().round(3)
-        header_cells = '<th style="background:#1e40af;">Cluster</th>' + ''.join(
-            f'<th>{escape(str(col))}</th>' for col in cluster_summary.columns
-        )
-        row_html = [
-            '<tr>'
-            + f'<td style="font-weight:600;color:#1e40af;background:#eff6ff;white-space:nowrap;">{escape(str(idx))}</td>'
-            + ''.join(f'<td>{escape(str(val))}</td>' for val in row)
-            + '</tr>'
-            for idx, row in cluster_summary.iterrows()
-        ]
-        style = (
-            '<style>'
-            '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
-            '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
-            '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
-            '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
-            '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
-            '.vda-table tbody tr:last-child td{border-bottom:none;}'
-            '</style>'
-        )
-        summary_table = (
-            '<div class="vda-table-wrap"><table class="vda-table">'
-            f'<thead><tr>{header_cells}</tr></thead>'
-            '<tbody>' + '\n'.join(row_html) + '</tbody>'
-            '</table></div>'
-        )
-        return {"reply": f'{iframe}\n\n**Cluster Centroids (feature means per cluster):**\n{style}{summary_table}'}
-    except Exception as e:
-        print("KMEANS CLUSTERING ERROR")
-        print(e)
-        return {"reply": f"There was an error running K-Means clustering. Error: {e}. You should probably try again."}
-def hypothesis_test_func(test_type: str, column: str, session_hash, session_folder,
-                         column2: str = "", group_column: str = "",
-                         group_values: List[str] = [], pop_mean: float = 0.0, **kwargs):
-    print("HYPOTHESIS TEST")
-    try:
-        from scipy import stats
-        from html import escape
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        if test_type == "t_test_independent":
-            if not group_column or group_column not in df.columns:
-                return {"reply": "Please specify a valid group_column for the independent t-test."}
-            unique_groups = df[group_column].dropna().unique().tolist()
-            if group_values and len(group_values) == 2:
-                g1_label, g2_label = group_values[0], group_values[1]
-            elif len(unique_groups) == 2:
-                g1_label, g2_label = unique_groups[0], unique_groups[1]
-            else:
-                return {"reply": f"For an independent t-test, exactly 2 groups are needed. Found: {unique_groups}. Specify group_values with 2 entries."}
-            g1 = df[df[group_column] == g1_label][column].dropna()
-            g2 = df[df[group_column] == g2_label][column].dropna()
-            t_stat, p_value = stats.ttest_ind(g1, g2)
-            result_rows = [
-                ("Test", "Independent Samples T-Test"),
-                ("Column", column),
-                ("Group Column", group_column),
-                (f"Group 1", str(g1_label)),
-                (f"Group 2", str(g2_label)),
-                (f"Group 1 Mean (n={len(g1)})", f"{g1.mean():.4f}"),
-                (f"Group 2 Mean (n={len(g2)})", f"{g2.mean():.4f}"),
-                ("T-Statistic", f"{t_stat:.4f}"),
-                ("P-Value", f"{p_value:.6f}"),
-                ("Significant at α=0.05", "Yes ✓" if p_value < 0.05 else "No ✗"),
-            ]
-            title = f"T-Test: {column} by {group_column}"
-        elif test_type == "t_test_one_sample":
-            sample = df[column].dropna()
-            t_stat, p_value = stats.ttest_1samp(sample, pop_mean)
-            result_rows = [
-                ("Test", "One-Sample T-Test"),
-                ("Column", column),
-                ("Hypothesized Mean (μ₀)", f"{pop_mean:.4f}"),
-                (f"Sample Mean (n={len(sample)})", f"{sample.mean():.4f}"),
-                ("Sample Std Dev", f"{sample.std():.4f}"),
-                ("T-Statistic", f"{t_stat:.4f}"),
-                ("P-Value", f"{p_value:.6f}"),
-                ("Significant at α=0.05", "Yes ✓" if p_value < 0.05 else "No ✗"),
-            ]
-            title = f"One-Sample T-Test: {column} vs μ={pop_mean}"
-        elif test_type == "chi_square":
-            if not column2 or column2 not in df.columns:
-                return {"reply": "Please specify a valid column2 for the chi-square test."}
-            contingency = pd.crosstab(df[column], df[column2])
-            chi2, p_value, dof, _ = stats.chi2_contingency(contingency)
-            result_rows = [
-                ("Test", "Chi-Square Test of Independence"),
-                ("Column 1", column),
-                ("Column 2", column2),
-                ("Chi-Square Statistic", f"{chi2:.4f}"),
-                ("Degrees of Freedom", str(dof)),
-                ("P-Value", f"{p_value:.6f}"),
-                ("Significant at α=0.05", "Yes ✓" if p_value < 0.05 else "No ✗"),
-            ]
-            title = f"Chi-Square: {column} × {column2}"
-        else:
-            return {"reply": f"Unknown test_type '{test_type}'. Use one of: t_test_independent, t_test_one_sample, chi_square."}
-        style = (
-            '<style>'
-            '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
-            '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
-            '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
-            '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
-            '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
-            '.vda-table tbody tr:last-child td{border-bottom:none;}'
-            '</style>'
-        )
-        header_cells = f'<th style="background:#1e40af;" colspan="2">{escape(title)}</th>'
-        row_html = [
-            '<tr>'
-            + f'<td style="font-weight:600;color:#1e40af;background:#eff6ff;white-space:nowrap;">{escape(label)}</td>'
-            + f'<td>{escape(value)}</td>'
-            + '</tr>'
-            for label, value in result_rows
-        ]
-        table = (
-            '<div class="vda-table-wrap"><table class="vda-table">'
-            f'<thead><tr>{header_cells}</tr></thead>'
-            '<tbody>' + '\n'.join(row_html) + '</tbody>'
-            '</table></div>'
-        )
-        return {"reply": style + table}
-    except Exception as e:
-        print("HYPOTHESIS TEST ERROR")
-        print(e)
-        return {"reply": f"There was an error running the hypothesis test. Error: {e}. You should probably try again."}
 def regression_func(independent_variables: List[str], dependent_variable: str, session_hash, session_folder, category: str='', **kwargs):
     print("LINEAR REGRESSION CALCULATION")
@@ -262,8 +30,11 @@ def regression_func(independent_variables: List[str], dependent_variable: str, s
            fig = scatter_chart_fig(df=df,x_column=independent_variables,y_column=dependent_variable,
                                     trendline="ols")
         chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        iframe = _write_chart(fig, chart_path, chart_url)
         results_frame = px.get_trendline_results(fig)

 import plotly.express as px
 import plotly.io as pio
 import os
+from functions import scatter_chart_fig
 from dotenv import load_dotenv
 load_dotenv()
+root_url = os.getenv("ROOT_URL")
 def regression_func(independent_variables: List[str], dependent_variable: str, session_hash, session_folder, category: str='', **kwargs):
     print("LINEAR REGRESSION CALCULATION")
            fig = scatter_chart_fig(df=df,x_column=independent_variables,y_column=dependent_variable,
                                     trendline="ols")
+        pio.write_html(fig, chart_path, full_html=False)
         chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
+        iframe = 'Please display this iframe: <div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
         results_frame = px.get_trendline_results(fig)

requirements.txt CHANGED Viewed

@@ -1,5 +1,4 @@
-haystack-ai>=2.7.0
-anthropic-haystack
 python-dotenv
 gradio
 pandas
@@ -13,6 +12,3 @@ pymongoarrow
 pymongo_schema
 pandasql
 pluck-graphql
-certifi==2025.1.31
-scipy
-scikit-learn

+haystack-ai
 python-dotenv
 gradio
 pandas
 pymongo_schema
 pandasql
 pluck-graphql

samples/online_retail_data.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

temp/.gitignore DELETED Viewed

	@@ -1,2 +0,0 @@
1	- *
2	- !.gitignore

templates/data_file.py CHANGED Viewed

@@ -1,286 +1,136 @@
-import gradio as gr
-from functions import example_question_generator, chatbot_func
-from data_sources import process_data_upload
-from utils import message_dict
-import ast
-import html as _html
-def build_summary_modal(stats):
-    num_rows = stats['num_rows']
-    num_cols = stats['num_cols']
-    total_missing = stats['total_missing']
-    duplicate_rows = stats.get('duplicate_rows', 0)
-    file_size_bytes = stats.get('file_size_bytes', 0)
-    def _fmt_num(v):
-        try:
-            if v != v: return '—'  # NaN
-            abs_v = abs(v)
-            if abs_v >= 1e9: return f"{v/1e9:.1f}B"
-            if abs_v >= 1e6: return f"{v/1e6:.1f}M"
-            if abs_v >= 1e3: return f"{v:,.0f}" if v == int(v) else f"{v:,.1f}"
-            return f"{v:,.0f}" if v == int(v) else f"{v:.2f}"
-        except Exception:
-            return str(v)
-    def _fmt_size(b):
-        if not b: return ''
-        if b < 1024: return f"{b} B"
-        if b < 1024 ** 2: return f"{b / 1024:.1f} KB"
-        if b < 1024 ** 3: return f"{b / 1024 ** 2:.1f} MB"
-        return f"{b / 1024 ** 3:.2f} GB"
-    file_size_label = _fmt_size(file_size_bytes)
-    dup_color = "#ef4444" if duplicate_rows > 0 else "#a16207"
-    dup_bg = "#fef2f2" if duplicate_rows > 0 else "#fefce8"
-    dup_border = "#fecaca" if duplicate_rows > 0 else "#fde68a"
-    dtype_rows_html = ""
-    for i, (col, dtype) in enumerate(stats['dtypes'].items()):
-        bg = "#ffffff" if i % 2 == 0 else "#f9fafb"
-        missing = stats['missing_per_col'].get(col, 0)
-        pct_missing = (missing / num_rows * 100) if num_rows > 0 else 0
-        missing_color = "#ef4444" if missing > 0 else "#9ca3af"
-        missing_weight = "600" if missing > 0 else "400"
-        missing_cell = f'{missing:,} <span style="color:#9ca3af;font-size:0.7rem;">({pct_missing:.1f}%)</span>'
-        unique = stats.get('unique_counts', {}).get(col, '—')
-        is_id = isinstance(unique, int) and num_rows > 0 and (unique / num_rows) >= 0.95 and unique > 10
-        id_badge = ' <span style="background:#fef3c7;color:#92400e;padding:1px 5px;border-radius:3px;font-size:0.65rem;vertical-align:middle;">ID?</span>' if is_id else ''
-        unique_cell = f'{unique:,}{id_badge}' if isinstance(unique, int) else str(unique)
-        cs = stats.get('col_stats', {}).get(col, {})
-        if cs.get('type') == 'numeric':
-            stats_cell = (
-                f'<span style="font-size:0.74rem;color:#6b7280;line-height:1.6;">'
-                f'{_fmt_num(cs["min"])} – {_fmt_num(cs["max"])}'
-                f'<br><span style="color:#9ca3af;">avg {_fmt_num(cs["mean"])}</span></span>'
-            )
-        elif cs.get('type') == 'datetime':
-            stats_cell = (
-                f'<span style="font-size:0.74rem;color:#6b7280;line-height:1.6;">'
-                f'{cs["min"]}<br>→ {cs["max"]}</span>'
-            )
-        else:
-            stats_cell = '<span style="color:#d1d5db;">—</span>'
-        dtype_rows_html += (
-            f'<tr style="background:{bg}">'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;color:#111827;white-space:nowrap;">{_html.escape(col)}</td>'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;white-space:nowrap;"><span style="background:#dbeafe;color:#1e40af;padding:2px 8px;border-radius:4px;font-size:0.74rem;">{dtype}</span></td>'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;text-align:right;color:{missing_color};font-weight:{missing_weight};white-space:nowrap;">{missing_cell}</td>'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;text-align:right;white-space:nowrap;color:#374151;">{unique_cell}</td>'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;">{stats_cell}</td>'
-            f'</tr>'
-        )
-    preview_headers_html = "".join(
-        f'<th style="padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;text-align:left;">{_html.escape(col)}</th>'
-        for col in stats['preview_cols']
-    )
-    preview_rows_html = ""
-    for i, row in enumerate(stats['preview']):
-        bg = "#ffffff" if i % 2 == 0 else "#f9fafb"
-        cells = "".join(
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;color:#374151;white-space:nowrap;">{_html.escape(str(cell))}</td>'
-            for cell in row
-        )
-        preview_rows_html += f'<tr style="background:{bg}">{cells}</tr>'
-    size_tag = f'<span style="background:rgba(255,255,255,0.2);color:#fff;padding:2px 10px;border-radius:12px;font-size:0.75rem;font-weight:400;">{file_size_label}</span>' if file_size_label else ''
-    return f"""
-<div class="vda-modal-overlay" style="position:fixed;inset:0;background:rgba(0,0,0,0.55);z-index:9999;display:flex;align-items:center;justify-content:center;font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;">
-  <div style="background:#fff;border-radius:14px;width:90%;max-width:800px;max-height:88vh;display:flex;flex-direction:column;box-shadow:0 25px 50px -12px rgba(0,0,0,0.35);overflow:hidden;">
-    <div style="background:linear-gradient(135deg,#3B82F6,#0ea5e9);padding:16px 20px;display:flex;justify-content:space-between;align-items:center;flex-shrink:0;gap:12px;">
-      <div style="display:flex;align-items:center;gap:10px;">
-        <span style="color:#fff;font-weight:600;font-size:1rem;">Dataset Summary</span>
-        {size_tag}
-      </div>
-      <button onclick="document.querySelectorAll('.vda-modal-overlay').forEach(function(e){{e.remove()}})" style="background:rgba(255,255,255,0.2);border:none;color:#fff;width:30px;height:30px;border-radius:50%;cursor:pointer;font-size:1rem;line-height:1;flex-shrink:0;">&#x2715;</button>
-    </div>
-    <div style="padding:20px;overflow-y:auto;flex:1;">
-      <div style="display:grid;grid-template-columns:1fr 1fr 1fr 1fr;gap:10px;margin-bottom:20px;">
-        <div style="background:#eff6ff;border:1px solid #bfdbfe;border-radius:8px;padding:12px;text-align:center;">
-          <div style="font-size:1.4rem;font-weight:700;color:#1d4ed8;">{num_rows:,}</div>
-          <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Rows</div>
-        </div>
-        <div style="background:#f0fdf4;border:1px solid #bbf7d0;border-radius:8px;padding:12px;text-align:center;">
-          <div style="font-size:1.4rem;font-weight:700;color:#15803d;">{num_cols}</div>
-          <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Columns</div>
-        </div>
-        <div style="background:#fefce8;border:1px solid #fde68a;border-radius:8px;padding:12px;text-align:center;">
-          <div style="font-size:1.4rem;font-weight:700;color:#a16207;">{total_missing:,}</div>
-          <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Missing Values</div>
-        </div>
-        <div style="background:{dup_bg};border:1px solid {dup_border};border-radius:8px;padding:12px;text-align:center;">
-          <div style="font-size:1.4rem;font-weight:700;color:{dup_color};">{duplicate_rows:,}</div>
-          <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Duplicate Rows</div>
-        </div>
-      </div>
-      <div style="margin-bottom:20px;">
-        <div style="font-size:0.78rem;font-weight:600;color:#374151;text-transform:uppercase;letter-spacing:0.06em;margin-bottom:10px;">Column Info</div>
-        <div style="border:1px solid #e5e7eb;border-radius:8px;overflow:hidden;">
-          <div style="max-height:210px;overflow:auto;">
-            <table style="border-collapse:collapse;font-size:0.83rem;min-width:100%;">
-              <thead style="background:#f9fafb;position:sticky;top:0;z-index:1;">
-                <tr>
-                  <th style="text-align:left;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Column</th>
-                  <th style="text-align:left;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Type</th>
-                  <th style="text-align:right;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Missing</th>
-                  <th style="text-align:right;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Unique</th>
-                  <th style="text-align:left;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Stats / Range</th>
-                </tr>
-              </thead>
-              <tbody>{dtype_rows_html}</tbody>
-            </table>
-          </div>
-        </div>
-      </div>
-      <div>
-        <div style="font-size:0.78rem;font-weight:600;color:#374151;text-transform:uppercase;letter-spacing:0.06em;margin-bottom:10px;">Data Preview (first 5 rows)</div>
-        <div style="border:1px solid #e5e7eb;border-radius:8px;overflow:hidden;">
-          <div style="overflow:auto;max-height:200px;">
-            <table style="border-collapse:collapse;font-size:0.8rem;">
-              <thead style="background:#f9fafb;position:sticky;top:0;z-index:1;">
-                <tr>{preview_headers_html}</tr>
-              </thead>
-              <tbody>{preview_rows_html}</tbody>
-            </table>
-          </div>
-        </div>
-      </div>
-    </div>
-  </div>
-</div>
-"""
-def run_example(input):
-    return input
-def example_display(input):
-    if input == None:
-        display = True
-    else:
-        display = False
-    return [gr.update(visible=display), gr.update(visible=display), gr.update(visible=display), gr.update(visible=display)]
-with gr.Blocks() as demo:
-    description = gr.HTML("""
-                        <div class="max-w-4xl mx-auto mb-12 text-center">
-                            <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto">
-                                <h2 class="font-semibold text-blue-800 ">
-                                    <i class="fas fa-info-circle mr-2"></i>Supported Files
-                                </h2>
-                                <div class="flex flex-wrap justify-center gap-3 pb-4 text-blue-700">
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-csv mr-1"></i>CSV
-                                        <span class="tooltip-text">Comma-separated values</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-alt mr-1"></i>TSV
-                                        <span class="tooltip-text">Tab-separated values</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-alt mr-1"></i>TXT
-                                        <span class="tooltip-text">Text files</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-excel mr-1"></i>XLS/XLSX
-                                        <span class="tooltip-text">Excel spreadsheets</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-code mr-1"></i>XML
-                                        <span class="tooltip-text">XML documents</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-code mr-1"></i>JSON
-                                        <span class="tooltip-text">JSON data files</span>
-                                    </span>
-                                </div>
-                            </div>
-                        </div>
-                          """, elem_classes="description_component")
-    example_file_1 = gr.File(visible=False, value="samples/bank_marketing_campaign.csv")
-    example_file_2 = gr.File(visible=False, value="samples/online_retail_data.csv")
-    example_file_3 = gr.File(visible=False, value="samples/tb_illness_data.csv")
-    with gr.Row():
-        example_btn_1 = gr.Button(value="Try Me: bank_marketing_campaign.csv", elem_classes="sample-btn bg-gradient-to-r from-blue-500 to-sky-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
-        example_btn_2 = gr.Button(value="Try Me: online_retail_data.csv", elem_classes="sample-btn bg-gradient-to-r from-blue-500 to-sky-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
-        example_btn_3 = gr.Button(value="Try Me: tb_illness_data.csv", elem_classes="sample-btn bg-gradient-to-r from-blue-500 to-sky-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
-    file_output = gr.File(label="Data File (CSV, TSV, TXT, XLS, XLSX, XML, JSON)", show_label=True, elem_classes="file_marker drop-zone border-2 border-dashed border-gray-300 rounded-lg hover:border-primary cursor-pointer bg-gray-50 hover:bg-blue-50 transition-colors duration-300", file_types=['.csv', '.xlsx', '.txt', '.json', '.ndjson', '.xml', '.xls', '.tsv'])
-    example_btn_1.click(fn=run_example, inputs=example_file_1, outputs=file_output)
-    example_btn_2.click(fn=run_example, inputs=example_file_2, outputs=file_output)
-    example_btn_3.click(fn=run_example, inputs=example_file_3, outputs=file_output)
-    file_output.change(fn=example_display, inputs=file_output, outputs=[example_btn_1, example_btn_2, example_btn_3, description])
-    @gr.render(inputs=file_output)
-    def data_options(filename, request: gr.Request):
-        print(filename)
-        if request.session_hash not in message_dict:
-            message_dict[request.session_hash] = {}
-        message_dict[request.session_hash]['file_upload'] = None
-        if filename:
-            process_message = process_upload(filename, request.session_hash)
-            gr.HTML(value=process_message[1], padding=False)
-            if process_message[0] == "success":
-                gr.HTML(value=build_summary_modal(process_message[3]), padding=False)
-                if "bank_marketing_campaign" in filename:
-                    example_questions = [
-                                            ["Describe the dataset"],
-                                            ["What levels of education have the highest and lowest average balance?"],
-                                            ["What job is most and least common for a yes response from the individuals, not counting 'unknown'?"],
-                                            ["Can you generate a bar chart of education vs. average balance?"],
-                                            ["Can you generate a table of levels of education versus average balance, percent married, percent with a loan, and percent in default?"],
-                                            ["Can we predict the relationship between the number of contacts performed before this campaign and the average balance?"],
-                                            ["Can you plot the number of contacts performed before this campaign versus the duration and use balance as the size in a bubble chart?"]
-                                        ]
-                elif "online_retail_data" in filename:
-                    example_questions = [
-                                            ["Describe the dataset"],
-                                            ["What month had the highest revenue?"],
-                                            ["Is revenue higher in the morning or afternoon?"],
-                                            ["Can you generate a line graph of revenue per month?"],
-                                            ["Can you generate a table of revenue per month?"],
-                                            ["Can we predict how time of day affects transaction value in this data set?"],
-                                            ["Can you plot revenue per month with size being the number of units sold that month in a bubble chart?"]
-                                        ]
-                else:
-                    try:
-                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'file_upload', '', process_message[2], ''))
-                        example_questions = [["Describe the dataset"]]
-                        for example in generated_examples:
-                            example_questions.append([example])
-                    except Exception as e:
-                        print("DATA FILE QUESTION GENERATION ERROR")
-                        print(e)
-                        example_questions = [
-                                            ["Describe the dataset"],
-                                            ["List the columns in the dataset"],
-                                            ["What could this data be used for?"],
-                                        ]
-                session_hash = gr.Textbox(visible=False, value=request.session_hash)
-                data_source = gr.Textbox(visible=False, value='file_upload')
-                schema = gr.Textbox(visible=False, value='')
-                titles = gr.Textbox(value=process_message[2], interactive=False, visible=False)
-                bot = gr.Chatbot(type='messages', label="CSV Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
-                chat = gr.ChatInterface(
-                                    fn=chatbot_func,
-                                    type='messages',
-                                    chatbot=bot,
-                                    title="Chat with your data file",
-                                    concurrency_limit=None,
-                                    examples=example_questions,
-                                    additional_inputs=[session_hash, data_source, titles, schema]
-                                    )
-    def process_upload(upload_value, session_hash):
-        if upload_value:
-            process_message = process_data_upload(upload_value, session_hash)
-        return process_message
-if __name__ == "__main__":
-    demo.launch()

+import gradio as gr
+from functions import example_question_generator, chatbot_func
+from data_sources import process_data_upload
+from utils import message_dict
+import ast
+def run_example(input):
+    """Return the given value unchanged.
+
+    Used as the click handler of the "Try Me" buttons, which pass a hidden
+    sample-file component as input and the visible upload component as output.
+    """
+    return input
+def example_display(input):
+    if input == None:
+        display = True
+    else:
+        display = False
+    return [gr.update(visible=display),gr.update(visible=display),gr.update(visible=display),gr.update(visible=display)]
+# Layout of the file-upload page: a "Supported Files" banner, three sample
+# dataset buttons and the upload component that drives the rest of the page.
+with gr.Blocks() as demo:
+    # Static banner listing the accepted formats; it is one of the outputs of
+    # file_output.change below, so its visibility follows the upload state.
+    description = gr.HTML("""
+                        <!-- Header -->
+                        <div class="max-w-4xl mx-auto mb-12 text-center">
+                            <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto">
+                                <h2 class="font-semibold text-blue-800 ">
+                                    <i class="fas fa-info-circle mr-2"></i>Supported Files
+                                </h2>
+                                <div class="flex flex-wrap justify-center gap-3 pb-4 text-blue-700">
+                                    <span class="tooltip">
+                                        <i class="fas fa-file-csv mr-1"></i>CSV
+                                        <span class="tooltip-text">Comma-separated values</span>
+                                    </span>
+                                    <span class="tooltip">
+                                        <i class="fas fa-file-alt mr-1"></i>TSV
+                                        <span class="tooltip-text">Tab-separated values</span>
+                                    </span>
+                                    <span class="tooltip">
+                                        <i class="fas fa-file-alt mr-1"></i>TXT
+                                        <span class="tooltip-text">Text files</span>
+                                    </span>
+                                    <span class="tooltip">
+                                        <i class="fas fa-file-excel mr-1"></i>XLS/XLSX
+                                        <span class="tooltip-text">Excel spreadsheets</span>
+                                    </span>
+                                    <span class="tooltip">
+                                        <i class="fas fa-file-code mr-1"></i>XML
+                                        <span class="tooltip-text">XML documents</span>
+                                    </span>
+                                    <span class="tooltip">
+                                        <i class="fas fa-file-code mr-1"></i>JSON
+                                        <span class="tooltip-text">JSON data files</span>
+                                    </span>
+                                </div>
+                            </div>
+                        </div>
+                          """, elem_classes="description_component")
+    # Hidden file components pre-loaded with the bundled sample datasets.
+    example_file_1 = gr.File(visible=False, value="samples/bank_marketing_campaign.csv")
+    example_file_2 = gr.File(visible=False, value="samples/online_retail_data.csv")
+    example_file_3 = gr.File(visible=False, value="samples/tb_illness_data.csv")
+    with gr.Row():
+        example_btn_1 = gr.Button(value="Try Me: bank_marketing_campaign.csv", elem_classes="sample-btn bg-gradient-to-r from-purple-500 to-indigo-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
+        example_btn_2 = gr.Button(value="Try Me: online_retail_data.csv", elem_classes="sample-btn bg-gradient-to-r from-purple-500 to-indigo-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
+        example_btn_3 = gr.Button(value="Try Me: tb_illness_data.csv", elem_classes="sample-btn bg-gradient-to-r from-purple-500 to-indigo-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
+    # Visible upload component; the sample buttons write into it as well.
+    file_output = gr.File(label="Data File (CSV, TSV, TXT, XLS, XLSX, XML, JSON)", show_label=True, elem_classes="file_marker drop-zone border-2 border-dashed border-gray-300 rounded-lg hover:border-primary cursor-pointer bg-gray-50 hover:bg-blue-50 transition-colors duration-300", file_types=['.csv','.xlsx','.txt','.json','.ndjson','.xml','.xls','.tsv'])
+    # Each button copies the value of its hidden sample file into file_output.
+    example_btn_1.click(fn=run_example, inputs=example_file_1, outputs=file_output)
+    example_btn_2.click(fn=run_example, inputs=example_file_2, outputs=file_output)
+    example_btn_3.click(fn=run_example, inputs=example_file_3, outputs=file_output)
+    # Toggle the three buttons and the banner whenever the upload value changes.
+    file_output.change(fn=example_display, inputs=file_output, outputs=[example_btn_1, example_btn_2, example_btn_3, description])
+    @gr.render(inputs=file_output)
+    def data_options(filename, request: gr.Request):
+        """Render the upload status and, on success, the chat UI for a file.
+
+        Registered with ``gr.render`` on ``file_output``. Resets this
+        session's ``'file_upload'`` slot in ``message_dict``, processes the
+        file through ``process_upload`` and, when the reported status is
+        ``"success"``, builds example questions and a ``gr.ChatInterface``
+        backed by ``chatbot_func``.
+
+        Args:
+            filename: Value of ``file_output``; falsy when no file is loaded.
+            request: Gradio request; only ``session_hash`` is read from it.
+        """
+        print(filename)
+        # Make sure the session has an entry, then clear its file-upload slot.
+        if request.session_hash not in message_dict:
+            message_dict[request.session_hash] = {}
+        message_dict[request.session_hash]['file_upload'] = None
+        if filename:
+            process_message = process_upload(filename, request.session_hash)
+            # Element 1 of the result is rendered as HTML on the page.
+            gr.HTML(value=process_message[1], padding=False)
+            # Element 0 carries the status string.
+            if process_message[0] == "success":
+                # Bundled samples get hand-written example questions.
+                if "bank_marketing_campaign" in filename:
+                    example_questions = [
+                                            ["Describe the dataset"],
+                                            ["What levels of education have the highest and lowest average balance?"],
+                                            ["What job is most and least common for a yes response from the individuals, not counting 'unknown'?"],
+                                            ["Can you generate a bar chart of education vs. average balance?"],
+                                            ["Can you generate a table of levels of education versus average balance, percent married, percent with a loan, and percent in default?"],
+                                            ["Can we predict the relationship between the number of contacts performed before this campaign and the average balance?"],
+                                            ["Can you plot the number of contacts performed before this campaign versus the duration and use balance as the size in a bubble chart?"]
+                                        ]
+                elif "online_retail_data" in filename:
+                    example_questions = [
+                                            ["Describe the dataset"],
+                                            ["What month had the highest revenue?"],
+                                            ["Is revenue higher in the morning or afternoon?"],
+                                            ["Can you generate a line graph of revenue per month?"],
+                                            ["Can you generate a table of revenue per month?"],
+                                            ["Can we predict how time of day affects transaction value in this data set?"],
+                                            ["Can you plot revenue per month with size being the number of units sold that month in a bubble chart?"]
+                                        ]
+                else:
+                    # Any other file: ask example_question_generator for questions
+                    # and parse its string result as a Python literal.
+                    try:
+                        # NOTE(review): process_message[1] is the same element rendered
+                        # as HTML above and is also used for `titles` below - confirm
+                        # against the return layout of process_data_upload.
+                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'file_upload', '', process_message[1], ''))
+                        example_questions = [
+                                                ["Describe the dataset"]
+                                            ]
+                        for example in generated_examples:
+                            example_questions.append([example])
+                    except Exception as e:
+                        # Generation or parsing failed: fall back to generic questions.
+                        print("DATA FILE QUESTION GENERATION ERROR")
+                        print(e)
+                        example_questions = [
+                                            ["Describe the dataset"],
+                                            ["List the columns in the dataset"],
+                                            ["What could this data be used for?"],
+                                        ]
+                # Hidden textboxes passed to chatbot_func as additional inputs.
+                session_hash = gr.Textbox(visible=False, value=request.session_hash)
+                data_source = gr.Textbox(visible=False, value='file_upload')
+                schema = gr.Textbox(visible=False, value='')
+                titles = gr.Textbox(value=process_message[1], interactive=False, visible=False)
+                # render=False: the chatbot is handed to ChatInterface below instead of
+                # being placed here. sanitize_html=False leaves HTML in messages unsanitized.
+                bot = gr.Chatbot(type='messages', label="CSV Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
+                chat = gr.ChatInterface(
+                                    fn=chatbot_func,
+                                    type='messages',
+                                    chatbot=bot,
+                                    title="Chat with your data file",
+                                    concurrency_limit=None,
+                                    examples=example_questions,
+                                    additional_inputs=[session_hash, data_source, titles, schema]
+                                    )
+    def process_upload(upload_value, session_hash):
+        if upload_value:
+            process_message = process_data_upload(upload_value, session_hash)
+        return process_message
+if __name__ == "__main__":
+    demo.launch()

templates/doc_db.py CHANGED Viewed

@@ -1,105 +1,99 @@
-import ast
-import gradio as gr
-from functions import example_question_generator, chatbot_func
-from data_sources import connect_doc_db
-from utils import message_dict
-with gr.Blocks() as demo:
-    with gr.Accordion("ℹ️  About the MongoDB Connector", open=False):
-        gr.HTML("""
-            <div class="max-w-4xl mx-auto text-center">
-                <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto p-4">
-                    <p>Connect to a MongoDB database and query it using natural language.</p>
-                    <p style="font-weight:bold;">
-                        No credentials are retained — they are passed as session variables and disappear when you leave or refresh.
-                        Queries use PyMongoArrow's <code>aggregate_pandas_all</code>, which cannot delete, drop, or insert documents.
-                        Use caution connecting production databases to third-party tools.
-                    </p>
-                    <p>Contact me if you'd like this built for your organization with proper infrastructure and security controls.</p>
-                </div>
-            </div>
-        """)
-    gr.HTML("""
-        <div style="max-width:560px;margin:8px auto 4px;padding:8px 14px;background:#f0f9ff;
-                    border:1px solid #bae6fd;border-radius:8px;text-align:center;">
-            <p style="margin:0;font-size:13px;color:#0369a1;">
-                <i class="fas fa-flask" style="margin-right:6px;"></i>
-                <strong>Demo credentials pre-filled.</strong>
-                &nbsp;Replace with your own database details to analyze your own data.
-            </p>
-        </div>
-    """)
-    connection_string = gr.Textbox(label="Connection String", value="dataanalyst0.l1klmww.mongodb.net/")
-    with gr.Row():
-        connection_user = gr.Textbox(label="Connection User", value="virtual-data-analyst")
-        connection_password = gr.Textbox(label="Connection Password", value="zcpbmoGJ3mC8o", type="password")
-        doc_db_name = gr.Textbox(label="Database Name", value="sample_mflix")
-    gr.HTML("""
-        <p style="text-align:center;font-size:13px;color:#6b7280;margin:4px 0 8px;">
-            <i class="fas fa-circle-info" style="margin-right:4px;"></i>
-            Schema analysis runs on connect — this may take 1–2 minutes for large databases.
-        </p>
-    """)
-    submit = gr.Button(value="Connect", variant="primary")
-    @gr.render(inputs=[connection_string, connection_user, connection_password, doc_db_name], triggers=[submit.click])
-    def db_chat(request: gr.Request, connection_string=connection_string.value, connection_user=connection_user.value, connection_password=connection_password.value, doc_db_name=doc_db_name.value):
-        if request.session_hash not in message_dict:
-            message_dict[request.session_hash] = {}
-        message_dict[request.session_hash]['doc_db'] = None
-        connection_login_value = "mongodb+srv://" + connection_user + ":" + connection_password + "@" + connection_string
-        if connection_login_value:
-            print("MONGO APP")
-            process_message = process_doc_db(connection_login_value, doc_db_name, request.session_hash)
-            gr.HTML(value=process_message[1], padding=False)
-            if process_message[0] == "success":
-                if "dataanalyst0.l1klmww.mongodb.net" in connection_login_value:
-                    example_questions = [
-                                            ["Describe the dataset"],
-                                            ["What are the top 5 most common movie genres?"],
-                                            ["How do user comment counts on a movie correlate with the movie award wins?"],
-                                            ["Can you generate a pie chart showing the top 10 states with the most movie theaters?"],
-                                            ["What are the top 10 most represented directors in the database?"],
-                                            ["What are the different movie categories and how many movies are in each category?"]
-                                        ]
-                else:
-                    try:
-                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'doc_db', doc_db_name, process_message[2], process_message[3]))
-                        example_questions = [["Describe the dataset"]]
-                        for example in generated_examples:
-                            example_questions.append([example])
-                    except Exception as e:
-                        print("DOC DB QUESTION GENERATION ERROR")
-                        print(e)
-                        example_questions = [
-                                            ["Describe the dataset"],
-                                            ["List the collections in the database"],
-                                            ["What could this data be used for?"],
-                                        ]
-                session_hash = gr.Textbox(visible=False, value=request.session_hash)
-                db_connection_string = gr.Textbox(visible=False, value=connection_login_value)
-                db_name = gr.Textbox(visible=False, value=doc_db_name)
-                titles = gr.Textbox(value=process_message[2], interactive=False, label="DB Collections")
-                data_source = gr.Textbox(visible=False, value='doc_db')
-                schema = gr.Textbox(visible=False, value=process_message[3])
-                bot = gr.Chatbot(type='messages', label="MongoDB Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
-                chat = gr.ChatInterface(
-                                    fn=chatbot_func,
-                                    type='messages',
-                                    chatbot=bot,
-                                    title="Chat with your Database",
-                                    examples=example_questions,
-                                    concurrency_limit=None,
-                                    additional_inputs=[session_hash, data_source, titles, schema, db_connection_string, db_name]
-                                    )
-    def process_doc_db(connection_string, nosql_db_name, session_hash):
-        if connection_string:
-            process_message = connect_doc_db(connection_string, nosql_db_name, session_hash)
-        return process_message
-if __name__ == "__main__":
-    demo.launch()

+import ast
+import gradio as gr
+from functions import example_question_generator, chatbot_func
+from data_sources import connect_doc_db
+from utils import message_dict
+def hide_info():
+    """Return a Gradio update that sets the output component's ``visible`` flag to False.
+
+    Wired to ``submit.click`` with ``outputs=description`` so the description
+    panel is hidden once the user submits the connection form.
+    """
+    return gr.update(visible=False)
+with gr.Blocks() as demo:
+    description = gr.HTML("""
+                    <!-- Header -->
+                    <div class="max-w-4xl mx-auto mb-12 text-center">
+                        <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto">
+                         <p>This tool allows users to communicate with and query real time data from a Document DB (MongoDB for now, others can be added if requested) using natural
+                          language and the above features.</p>
+                         <p style="font-weight:bold;">Notice: the way this system is designed, no login information is retained and credentials are passed as session variables until the user leaves or
+                          refreshes the page in which they disappear. They are never saved to any files. I also make use of the PyMongoArrow aggregate_pandas_all function to apply pipelines,
+                          which can't delete, drop, or add database lines to avoid unhappy accidents or glitches.
+                          That being said, it's probably best to use caution when connecting to a production database to a strange AI tool with an unfamiliar author.
+                          This should be for demonstration purposes.</p>
+                          <p>Contact me if this is something you would like built in your organization, on your infrastructure, and with the requisite privacy and control a production
+                          database analytics tool requires.</p>
+                        </div>
+                    </div>
+                        """, elem_classes="description_component")
+    status_message = gr.HTML(value='<p style="color:green;text-align:center;font-size:18px;">Please be patient while connecting as we need to generate '
+    'and read a schema before connection can be successful. This process can take a few minutes.</p>', padding=False)
+    # Connection form; the default values point at the demo database.
+    connection_string = gr.Textbox(label="Connection String", value="dataanalyst0.l1klmww.mongodb.net/")
+    with gr.Row():
+        connection_user = gr.Textbox(label="Connection User", value="virtual-data-analyst")
+        # NOTE(review): this default password is committed to source control in plain text.
+        # Presumably a deliberately public, read-only demo credential; confirm, and consider
+        # loading it from an environment variable so it can be rotated without a code change.
+        connection_password = gr.Textbox(label="Connection Password", value="zcpbmoGJ3mC8o", type="password")
+        doc_db_name = gr.Textbox(label="Database Name", value="sample_mflix")
+    submit = gr.Button(value="Submit")
+    # Hide the description panel when the form is submitted.
+    submit.click(fn=hide_info, outputs=description)
+    @gr.render(inputs=[connection_string,connection_user,connection_password,doc_db_name], triggers=[submit.click])
+    def db_chat(request: gr.Request, connection_string=connection_string.value, connection_user=connection_user.value, connection_password=connection_password.value, doc_db_name=doc_db_name.value):
+        """Connect to the document database and render the chat UI for this session.
+
+        Re-run on every ``submit.click``. Resets the session's ``doc_db`` entry in
+        ``message_dict``, connects through ``process_doc_db``, renders the returned
+        status HTML and, when the status is ``"success"``, builds a
+        ``gr.ChatInterface`` seeded with example questions.
+
+        Args:
+            request: Gradio request; only ``session_hash`` is read from it.
+            connection_string: Host part of the MongoDB URI.
+            connection_user: Database user name.
+            connection_password: Database password.
+            doc_db_name: Name of the database to query.
+        """
+        if request.session_hash not in message_dict:
+            message_dict[request.session_hash] = {}
+        message_dict[request.session_hash]['doc_db'] = None
+        # NOTE(review): user and password are concatenated into the URI as-is; credentials
+        # containing reserved characters (e.g. "@", ":", "/") may need percent-escaping
+        # before they reach the driver. Confirm against how connect_doc_db builds its client.
+        connection_login_value = "mongodb+srv://" + connection_user + ":" + connection_password + "@" + connection_string
+        # This string always starts with the literal scheme prefix, so the check below is
+        # always true; kept so the control flow matches the other connector templates.
+        if connection_login_value:
+            print("MONGO APP")
+            process_message = process_doc_db(connection_login_value, doc_db_name, request.session_hash)
+            gr.HTML(value=process_message[1], padding=False)
+            if process_message[0] == "success":
+                if "dataanalyst0.l1klmww.mongodb.net" in connection_login_value:
+                    # Curated questions for the pre-filled demo database.
+                    example_questions = [
+                                            ["Describe the dataset"],
+                                            ["What are the top 5 most common movie genres?"],
+                                            ["How do user comment counts on a movie correlate with the movie award wins?"],
+                                            ["Can you generate a pie chart showing the top 10 states with the most movie theaters?"],
+                                            ["What are the top 10 most represented directors in the database?"],
+                                            ["What are the different movie categories and how many movies are in each category?"]
+                                        ]
+                else:
+                    try:
+                        # Fixed: the data source passed here was 'graphql' (copied from the
+                        # GraphQL template); this template's data source is 'doc_db'.
+                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'doc_db', doc_db_name, process_message[2], process_message[3]))
+                        example_questions = [
+                                                ["Describe the dataset"]
+                                            ]
+                        for example in generated_examples:
+                            example_questions.append([example])
+                    except Exception as e:
+                        # Best effort: fall back to generic questions if generation or parsing fails.
+                        print("DOC DB QUESTION GENERATION ERROR")
+                        print(e)
+                        example_questions = [
+                                            ["Describe the dataset"],
+                                            ["List the collections in the database"],
+                                            ["What could this data be used for?"],
+                                        ]
+                # Hidden textboxes carry per-session values into chatbot_func as additional inputs.
+                session_hash = gr.Textbox(visible=False, value=request.session_hash)
+                db_connection_string = gr.Textbox(visible=False, value=connection_login_value)
+                db_name = gr.Textbox(visible=False, value=doc_db_name)
+                titles = gr.Textbox(value=process_message[2], interactive=False, label="DB Collections")
+                data_source = gr.Textbox(visible=False, value='doc_db')
+                schema = gr.Textbox(visible=False, value=process_message[3])
+                bot = gr.Chatbot(type='messages', label="DocDB Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
+                chat = gr.ChatInterface(
+                                    fn=chatbot_func,
+                                    type='messages',
+                                    chatbot=bot,
+                                    title="Chat with your Database",
+                                    examples=example_questions,
+                                    concurrency_limit=None,
+                                    additional_inputs=[session_hash, data_source, titles, schema, db_connection_string, db_name]
+                                    )
+    def process_doc_db(connection_string, nosql_db_name, session_hash):
+        """Connect to the document database for the given session.
+
+        Delegates to ``connect_doc_db`` and returns its result unchanged.
+
+        Args:
+            connection_string: Full connection URI, including credentials.
+            nosql_db_name: Name of the database to connect to.
+            session_hash: Identifier of the current Gradio session.
+
+        Raises:
+            ValueError: If ``connection_string`` is empty. The previous version fell
+                through to ``return process_message`` with the name unbound and
+                failed with an opaque ``UnboundLocalError`` instead.
+        """
+        # Guard clause: fail with an explicit message rather than an unbound local.
+        if not connection_string:
+            raise ValueError("process_doc_db requires a non-empty connection string")
+        return connect_doc_db(connection_string, nosql_db_name, session_hash)
+if __name__ == "__main__":
+    demo.launch()

templates/graphql.py CHANGED Viewed

@@ -1,110 +1,110 @@
-import ast
-import gradio as gr
-from functions import example_question_generator, chatbot_func
-from data_sources import connect_graphql
-from utils import message_dict
-import os
-from dotenv import load_dotenv
-load_dotenv()
-graphql_sample_endpoint = os.getenv("GRAPHQL_SAMPLE_ENDPOINT")
-graphql_sample_api_token = os.getenv("GRAPHQL_SAMPLE_API_TOKEN")
-graphql_sample_header_name = os.getenv("GRAPHQL_SAMPLE_HEADER_NAME")
-with gr.Blocks() as demo:
-    with gr.Accordion("ℹ️  About the GraphQL Connector", open=False):
-        gr.HTML("""
-            <div class="max-w-4xl mx-auto text-center">
-                <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto p-4">
-                    <p>Connect to any GraphQL API endpoint and query it using natural language.</p>
-                    <p style="font-weight:bold;">
-                        API querying is the most experimental feature and performance may vary.
-                        No credentials are retained — they are passed as session variables and disappear when you leave or refresh.
-                        Mutations are not exposed and the agent is instructed not to alter data, though restricting
-                        your API token's permissions is still strongly recommended.
-                    </p>
-                    <p>Contact me if you'd like this built for your organization with proper infrastructure and security controls.</p>
-                </div>
-            </div>
-        """)
-    gr.HTML("""
-        <div style="max-width:560px;margin:8px auto 4px;padding:8px 14px;background:#f0f9ff;
-                    border:1px solid #bae6fd;border-radius:8px;text-align:center;">
-            <p style="margin:0;font-size:13px;color:#0369a1;">
-                <i class="fas fa-flask" style="margin-right:6px;"></i>
-                <strong>Demo credentials pre-filled.</strong>
-                &nbsp;Replace with your own endpoint and token to analyze your own API.
-            </p>
-        </div>
-    """)
-    graphql_url = gr.Textbox(label="GraphQL Endpoint URL", value=graphql_sample_endpoint)
-    with gr.Row():
-        api_token_header_name = gr.Textbox(label="API Token Header Name", value=graphql_sample_header_name)
-        api_token = gr.Textbox(label="API Token", value=graphql_sample_api_token, type="password")
-    submit = gr.Button(value="Connect", variant="primary")
-    @gr.render(inputs=[graphql_url, api_token, api_token_header_name], triggers=[submit.click])
-    def api_chat(request: gr.Request, graphql_url=graphql_url.value, api_token=api_token.value, api_token_header_name=api_token_header_name.value):
-        if request.session_hash not in message_dict:
-            message_dict[request.session_hash] = {}
-        message_dict[request.session_hash]['graphql'] = None
-        if graphql_url:
-            print("GraphQL API")
-            process_message = process_graphql(graphql_url, api_token, api_token_header_name, request.session_hash)
-            gr.HTML(value=process_message[1], padding=False)
-            if process_message[0] == "success":
-                if "qdl-app-testing" in graphql_url:
-                    example_questions = [
-                                            ["Describe the dataset"],
-                                            ["What is the total revenue for this shopify store?"],
-                                            ["What is the average duration from the fulfillment of an order to its delivery?"],
-                                            ["What is the total value of orders processed in the current month?"],
-                                            ["Which product has the highest number of variants in the inventory?"],
-                                            ["How many gift cards have been issued this year, and what is their total value?"],
-                                            ["How many active apps are currently installed on the store?"],
-                                            ["What is the total count of abandoned checkouts over the last month?"]
-                                        ]
-                else:
-                    try:
-                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'graphql', graphql_url, process_message[2], ''))
-                        example_questions = [["Describe the dataset"]]
-                        for example in generated_examples:
-                            example_questions.append([example])
-                    except Exception as e:
-                        print("GRAPHQL QUESTION GENERATION ERROR")
-                        print(e)
-                        example_questions = [
-                                            ["Describe the dataset"],
-                                            ["List the types in this API"],
-                                            ["What could this data be used for?"],
-                                        ]
-                session_hash = gr.Textbox(visible=False, value=request.session_hash)
-                graphql_api_string = gr.Textbox(visible=False, value=graphql_url)
-                graphql_api_token = gr.Textbox(visible=False, value=api_token)
-                graphql_token_header = gr.Textbox(visible=False, value=api_token_header_name)
-                titles = gr.Textbox(value=process_message[2], interactive=False, label="GraphQL Types")
-                data_source = gr.Textbox(visible=False, value='graphql')
-                schema = gr.Textbox(visible=False, value='')
-                bot = gr.Chatbot(type='messages', label="GraphQL Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
-                chat = gr.ChatInterface(
-                                    fn=chatbot_func,
-                                    type='messages',
-                                    chatbot=bot,
-                                    title="Chat with your GraphQL API",
-                                    examples=example_questions,
-                                    concurrency_limit=None,
-                                    additional_inputs=[session_hash, data_source, titles, schema, graphql_api_string, graphql_api_token, graphql_token_header]
-                                    )
-    def process_graphql(graphql_url, api_token, api_token_header_name, session_hash):
-        if graphql_url:
-            process_message = connect_graphql(graphql_url, api_token, api_token_header_name, session_hash)
-        return process_message
-if __name__ == "__main__":
-    demo.launch()

+import ast
+import gradio as gr
+from functions import example_question_generator, chatbot_func
+from data_sources import connect_graphql
+from utils import message_dict
+import os
+from dotenv import load_dotenv
+load_dotenv()
+graphql_sample_endpoint = os.getenv("GRAPHQL_SAMPLE_ENDPOINT")
+graphql_sample_api_token = os.getenv("GRAPHQL_SAMPLE_API_TOKEN")
+graphql_sample_header_name = os.getenv("GRAPHQL_SAMPLE_HEADER_NAME")
+def hide_info():
+    """Return a Gradio update that sets the output component's ``visible`` flag to False.
+
+    Wired to ``submit.click`` with ``outputs=description`` so the description
+    panel is hidden once the user submits the connection form.
+    """
+    return gr.update(visible=False)
+with gr.Blocks() as demo:
+    description = gr.HTML("""
+                    <!-- Header -->
+                    <div class="max-w-4xl mx-auto mb-12 text-center">
+                        <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto">
+                         <p>This tool allows users to communicate with and query real time data from a GraphQL API endpoint using natural
+                          language and the above features.</p>
+                         <p style="font-weight:bold;">Notice: API querying is the most difficult and experimental feature so far.
+                          This tool may have variable performance and quality, although it should get better over time as I evaluate use.
+                          No login information is retained and credentials are passed as session variables until the user leaves or
+                          refreshes the page in which they disappear. They are never saved to any files.</p>
+                          <p style="font-weight:bold;"> I don't include a function that allows the system to run mutations and I instruct the agent to not alter any data, but it could in theory be possible,
+                          although my testing wasn't able to get the system to alter or write to the api. I would be careful to make sure permissions are restricted for the
+                          api token being used.
+                          And of course, it's probably best to use caution when connecting to a strange AI tool with an unfamiliar author.
+                          This should be for demonstration purposes.</p>
+                          <p>Contact me if this is something you would like built in your organization, on your infrastructure, and with the requisite privacy and control a production
+                          database analytics tool requires.</p>
+                        </div>
+                    </div>
+                        """, elem_classes="description_component")
+    # Connection form; defaults come from the GRAPHQL_SAMPLE_* environment variables read
+    # at module import, so each is None when the corresponding variable is not set.
+    graphql_url = gr.Textbox(label="GraphQL Endpoint URL", value=graphql_sample_endpoint)
+    with gr.Row():
+        api_token_header_name = gr.Textbox(label="API Token Header Name", value=graphql_sample_header_name)
+        api_token = gr.Textbox(label="API Token", value=graphql_sample_api_token, type="password")
+    submit = gr.Button(value="Submit")
+    # Hide the description panel when the form is submitted.
+    submit.click(fn=hide_info, outputs=description)
+    @gr.render(inputs=[graphql_url,api_token,api_token_header_name], triggers=[submit.click])
+    def api_chat(request: gr.Request, graphql_url=graphql_url.value, api_token=api_token.value, api_token_header_name=api_token_header_name.value):
+        """Connect to the GraphQL endpoint and render the chat UI for this session.
+
+        Re-run on every ``submit.click``. Resets the session's ``graphql`` entry in
+        ``message_dict``, connects through ``process_graphql``, renders the returned
+        status HTML and, when the status is ``"success"``, builds a
+        ``gr.ChatInterface`` seeded with example questions. Nothing is rendered
+        when ``graphql_url`` is empty.
+        """
+        if request.session_hash not in message_dict:
+            message_dict[request.session_hash] = {}
+        message_dict[request.session_hash]['graphql'] = None
+        if graphql_url:
+            print("GraphQL API")
+            process_message = process_graphql(graphql_url, api_token, api_token_header_name, request.session_hash)
+            gr.HTML(value=process_message[1], padding=False)
+            if process_message[0] == "success":
+                # Curated questions are used when the URL contains the "qdl-app-testing" marker.
+                if "qdl-app-testing" in graphql_url:
+                    example_questions = [
+                                            ["Describe the dataset"],
+                                            ["What is the total revenue for this shopify store?"],
+                                            ["What is the average duration from the fulfillment of an order to its delivery?"],
+                                            ["What is the total value of orders processed in the current month?"],
+                                            ["Which product has the highest number of variants in the inventory?"],
+                                            ["How many gift cards have been issued this year, and what is their total value?"],
+                                            ["How many active apps are currently installed on the store?"],
+                                            ["What is the total count of abandoned checkouts over the last month?"]
+                                        ]
+                else:
+                    try:
+                        # The generator's return value is parsed as a Python literal and iterated.
+                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'graphql', graphql_url, process_message[2], ''))
+                        example_questions = [
+                                                ["Describe the dataset"]
+                                            ]
+                        for example in generated_examples:
+                            example_questions.append([example])
+                    except Exception as e:
+                        # Best effort: fall back to generic questions if generation or parsing fails.
+                        print("GRAPHQL QUESTION GENERATION ERROR")
+                        print(e)
+                        # NOTE(review): "columns in the dataset" reads oddly for a GraphQL API whose
+                        # titles box is labelled "GraphQL Types"; consider rewording this fallback.
+                        example_questions = [
+                                            ["Describe the dataset"],
+                                            ["List the columns in the dataset"],
+                                            ["What could this data be used for?"],
+                                        ]
+                # Hidden textboxes carry per-session values into chatbot_func as additional inputs.
+                session_hash = gr.Textbox(visible=False, value=request.session_hash)
+                graphql_api_string = gr.Textbox(visible=False, value=graphql_url)
+                graphql_api_token = gr.Textbox(visible=False, value=api_token)
+                graphql_token_header = gr.Textbox(visible=False, value=api_token_header_name)
+                titles = gr.Textbox(value=process_message[2], interactive=False, label="GraphQL Types")
+                data_source = gr.Textbox(visible=False, value='graphql')
+                schema = gr.Textbox(visible=False, value='')
+                bot = gr.Chatbot(type='messages', label="GraphQL Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
+                chat = gr.ChatInterface(
+                                    fn=chatbot_func,
+                                    type='messages',
+                                    chatbot=bot,
+                                    title="Chat with your Graphql API",
+                                    examples=example_questions,
+                                    concurrency_limit=None,
+                                    additional_inputs=[session_hash, data_source, titles, schema, graphql_api_string, graphql_api_token, graphql_token_header]
+                                    )
+    def process_graphql(graphql_url, api_token, api_token_header_name, session_hash):
+        """Connect to a GraphQL endpoint for the given session.
+
+        Delegates to ``connect_graphql`` and returns its result unchanged.
+
+        Args:
+            graphql_url: URL of the GraphQL endpoint.
+            api_token: API token sent with requests.
+            api_token_header_name: Name of the HTTP header that carries the token.
+            session_hash: Identifier of the current Gradio session.
+
+        Raises:
+            ValueError: If ``graphql_url`` is empty. The previous version fell
+                through to ``return process_message`` with the name unbound and
+                failed with an opaque ``UnboundLocalError`` instead.
+        """
+        # Guard clause: fail with an explicit message rather than an unbound local.
+        if not graphql_url:
+            raise ValueError("process_graphql requires a non-empty endpoint URL")
+        return connect_graphql(graphql_url, api_token, api_token_header_name, session_hash)
+if __name__ == "__main__":
+    demo.launch()

templates/sql_db.py CHANGED Viewed

@@ -1,102 +1,98 @@
-import ast
-import gradio as gr
-from functions import example_question_generator, chatbot_func
-from data_sources import connect_sql_db
-from utils import message_dict
-with gr.Blocks() as demo:
-    with gr.Accordion("ℹ️  About the SQL Connector", open=False):
-        gr.HTML("""
-            <div class="max-w-4xl mx-auto text-center">
-                <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto p-4">
-                    <p>Connect to a PostgreSQL database and query it using natural language.</p>
-                    <p style="font-weight:bold;">
-                        No credentials are retained — they are passed as session variables and disappear when you leave or refresh.
-                        Queries run through Pandas <code>read_sql_query</code>, which cannot delete, drop, or insert rows.
-                        Use caution connecting production databases to third-party tools.
-                    </p>
-                    <p>Contact me if you'd like this built for your organization with proper infrastructure and security controls.</p>
-                </div>
-            </div>
-        """)
-    gr.HTML("""
-        <div style="max-width:560px;margin:8px auto 4px;padding:8px 14px;background:#f0f9ff;
-                    border:1px solid #bae6fd;border-radius:8px;text-align:center;">
-            <p style="margin:0;font-size:13px;color:#0369a1;">
-                <i class="fas fa-flask" style="margin-right:6px;"></i>
-                <strong>Demo credentials pre-filled.</strong>
-                &nbsp;Replace with your own database details to analyze your own data.
-            </p>
-        </div>
-    """)
-    sql_url = gr.Textbox(label="URL", value="virtual-data-analyst-pg.cyetm2yjzppu.us-west-1.rds.amazonaws.com")
-    with gr.Row():
-        sql_port = gr.Textbox(label="Port", value="5432")
-        sql_user = gr.Textbox(label="Username", value="postgres")
-        sql_pass = gr.Textbox(label="Password", value="Vda-1988", type="password")
-        sql_db_name = gr.Textbox(label="Database Name", value="dvdrental")
-    submit = gr.Button(value="Connect", variant="primary")
-    @gr.render(inputs=[sql_url, sql_port, sql_user, sql_pass, sql_db_name], triggers=[submit.click])
-    def sql_chat(request: gr.Request, url=sql_url.value, sql_port=sql_port.value, sql_user=sql_user.value, sql_pass=sql_pass.value, sql_db_name=sql_db_name.value):
-        if request.session_hash not in message_dict:
-            message_dict[request.session_hash] = {}
-        message_dict[request.session_hash]['sql'] = None
-        if url:
-            print("SQL APP")
-            process_message = process_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, request.session_hash)
-            gr.HTML(value=process_message[1], padding=False)
-            if process_message[0] == "success":
-                if "virtual-data-analyst-pg.cyetm2yjzppu.us-west-1.rds.amazonaws.com" in url:
-                    example_questions = [
-                                            ["Describe the dataset"],
-                                            ["What is the total revenue generated by each store?"],
-                                            ["Can you generate and display a bar chart of film category to number of films in that category?"],
-                                            ["Can you generate a pie chart showing the top 10 most rented films by revenue?"],
-                                            ["Can you generate a line chart of rental revenue over time?"],
-                                            ["What is the relationship between film length and rental frequency?"]
-                                        ]
-                else:
-                    try:
-                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'sql', sql_db_name, process_message[2], ""))
-                        example_questions = [["Describe the dataset"]]
-                        for example in generated_examples:
-                            example_questions.append([example])
-                    except Exception as e:
-                        print("SQL QUESTION GENERATION ERROR")
-                        print(e)
-                        example_questions = [
-                                            ["Describe the dataset"],
-                                            ["List the tables in the database"],
-                                            ["What could this data be used for?"],
-                                        ]
-                session_hash = gr.Textbox(visible=False, value=request.session_hash)
-                db_url = gr.Textbox(visible=False, value=url)
-                db_port = gr.Textbox(visible=False, value=sql_port)
-                db_user = gr.Textbox(visible=False, value=sql_user)
-                db_pass = gr.Textbox(visible=False, value=sql_pass)
-                db_name = gr.Textbox(visible=False, value=sql_db_name)
-                titles = gr.Textbox(value=process_message[2], interactive=False, label="SQL Tables")
-                data_source = gr.Textbox(visible=False, value='sql')
-                schema = gr.Textbox(visible=False, value='')
-                bot = gr.Chatbot(type='messages', label="SQL DB Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
-                chat = gr.ChatInterface(
-                                    fn=chatbot_func,
-                                    type='messages',
-                                    chatbot=bot,
-                                    title="Chat with your Database",
-                                    examples=example_questions,
-                                    concurrency_limit=None,
-                                    additional_inputs=[session_hash, data_source, titles, schema, db_url, db_port, db_user, db_pass, db_name]
-                                    )
-    def process_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash):
-        if url:
-            process_message = connect_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash)
-        return process_message
-if __name__ == "__main__":
-    demo.launch()

+import ast
+import gradio as gr
+from functions import example_question_generator, chatbot_func
+from data_sources import connect_sql_db
+from utils import message_dict
+def hide_info():
+    """Build the update that hides whichever component this handler outputs to.
+
+    Wired to `submit.click` with the description panel as its output, so the
+    panel disappears once the user submits the connection form.
+    """
+    hidden = gr.update(visible=False)
+    return hidden
+# SQL connector page: an informational panel, the connection form, and the submit wiring.
+with gr.Blocks() as demo:
+    # Informational panel shown on load; `hide_info` hides it when Submit is clicked.
+    description = gr.HTML("""
+                    <!-- Header -->
+                    <div class="max-w-4xl mx-auto mb-12 text-center">
+                        <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto">
+                         <p>This tool allows users to communicate with and query real time data from a SQL DB (postgres for now, others can be added if requested) using natural
+                          language and the above features.</p>
+                         <p style="font-weight:bold;">Notice: the way this system is designed, no login information is retained and credentials are passed as session variables until the user leaves or
+                          refreshes the page in which they disappear. They are never saved to any files. I also make use of the Pandas read_sql_query function to apply SQL
+                          queries, which can't delete, drop, or add database lines to avoid unhappy accidents or glitches.
+                          That being said, it's probably best to use caution when connecting to a production database to a strange AI tool with an unfamiliar author.
+                          This should be for demonstration purposes.</p>
+                          <p>Contact me if this is something you would like built in your organization, on your infrastructure, and with the requisite privacy and control a production
+                          database analytics tool requires.</p>
+                        </div>
+                    </div>
+                        """, elem_classes="description_component")
+    # Connection form. The pre-filled host is the same one the render callback
+    # special-cases to show its curated example questions.
+    sql_url = gr.Textbox(label="URL", value="virtual-data-analyst-pg.cyetm2yjzppu.us-west-1.rds.amazonaws.com")
+    with gr.Row():
+        sql_port = gr.Textbox(label="Port", value="5432")
+        sql_user = gr.Textbox(label="Username", value="postgres")
+        # NOTE(review): a password literal is committed as the default value;
+        # presumably these are public demo credentials. Confirm that is intended.
+        sql_pass = gr.Textbox(label="Password", value="Vda-1988", type="password")
+        sql_db_name = gr.Textbox(label="Database Name", value="dvdrental")
+    submit = gr.Button(value="Submit")
+    # Hide the description panel as soon as the form is submitted.
+    submit.click(fn=hide_info, outputs=description)
+    # Re-run on every Submit click, receiving the current values of the five form fields.
+    @gr.render(inputs=[sql_url,sql_port,sql_user,sql_pass,sql_db_name], triggers=[submit.click])
+    def sql_chat(request: gr.Request, url=sql_url.value, sql_port=sql_port.value, sql_user=sql_user.value, sql_pass=sql_pass.value, sql_db_name=sql_db_name.value):
+        """Connect to the database and render the chat UI for this session.
+
+        Resets the session's 'sql' entry in `message_dict`, then (when a URL is
+        given) calls `process_sql_db` and renders its status HTML. On success it
+        builds example questions and a `gr.ChatInterface` backed by
+        `chatbot_func`, passing the session hash and connection details through
+        hidden textboxes.
+
+        `process_message` is treated as an indexable: [0] is compared to
+        "success", [1] is rendered as HTML, and [2] is shown under the
+        "SQL Tables" label. Its exact shape is defined by `connect_sql_db`.
+        """
+        # Make sure the session has a slot, then clear any previous SQL state.
+        if request.session_hash not in message_dict:
+            message_dict[request.session_hash] = {}
+        message_dict[request.session_hash]['sql'] = None
+        if url:
+            print("SQL APP")
+            process_message = process_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, request.session_hash)
+            # Always show the connector's status markup, success or not.
+            gr.HTML(value=process_message[1], padding=False)
+            if process_message[0] == "success":
+                # Curated examples for the pre-filled demo database host.
+                if "virtual-data-analyst-pg.cyetm2yjzppu.us-west-1.rds.amazonaws.com" in url:
+                    example_questions = [
+                                            ["Describe the dataset"],
+                                            ["What is the total revenue generated by each store?"],
+                                            ["Can you generate and display a bar chart of film category to number of films in that category?"],
+                                            ["Can you generate a pie chart showing the top 10 most rented films by revenue vs all other films?"],
+                                            ["Can you generate a line chart of rental revenue over time?"],
+                                            ["What is the relationship between film length and rental frequency?"]
+                                        ]
+                else:
+                    try:
+                        # literal_eval accepts only Python literals; presumably the generator
+                        # returns the string form of a list of questions (verify against it).
+                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'sql', sql_db_name, process_message[2], ""))
+                        example_questions = [
+                                                ["Describe the dataset"]
+                                            ]
+                        for example in generated_examples:
+                            example_questions.append([example])
+                    except Exception as e:
+                        # Best-effort: any generation or parsing failure falls back to generic questions.
+                        print("SQL QUESTION GENERATION ERROR")
+                        print(e)
+                        example_questions = [
+                                            ["Describe the dataset"],
+                                            ["List the columns in the dataset"],
+                                            ["What could this data be used for?"],
+                                        ]
+                # Hidden textboxes carry per-session values into `chatbot_func`
+                # through `additional_inputs` below.
+                session_hash = gr.Textbox(visible=False, value=request.session_hash)
+                db_url = gr.Textbox(visible=False, value=url)
+                db_port = gr.Textbox(visible=False, value=sql_port)
+                db_user = gr.Textbox(visible=False, value=sql_user)
+                db_pass = gr.Textbox(visible=False, value=sql_pass)
+                db_name = gr.Textbox(visible=False, value=sql_db_name)
+                titles = gr.Textbox(value=process_message[2], interactive=False, label="SQL Tables")
+                data_source = gr.Textbox(visible=False, value='sql')
+                schema = gr.Textbox(visible=False, value='')
+                # Created with render=False and handed to the ChatInterface as its chatbot.
+                bot = gr.Chatbot(type='messages', label="SQL DB Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
+                chat = gr.ChatInterface(
+                                    fn=chatbot_func,
+                                    type='messages',
+                                    chatbot=bot,
+                                    title="Chat with your Database",
+                                    examples=example_questions,
+                                    concurrency_limit=None,
+                                    additional_inputs=[session_hash, data_source, titles, schema, db_url, db_port, db_user, db_pass, db_name]
+                                    )
+    def process_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash):
+        """Connect to the SQL database and return the connector's result.
+
+        Args:
+            url: Database host; must be non-empty.
+            sql_user: Username forwarded unchanged to `connect_sql_db`.
+            sql_port: Port forwarded unchanged to `connect_sql_db`.
+            sql_pass: Password forwarded unchanged to `connect_sql_db`.
+            sql_db_name: Database name forwarded unchanged to `connect_sql_db`.
+            session_hash: Session identifier forwarded unchanged to `connect_sql_db`.
+
+        Returns:
+            Whatever `connect_sql_db` returns. `sql_chat` reads index 0 (status,
+            compared to "success"), index 1 (HTML) and index 2 (table titles).
+
+        Raises:
+            ValueError: If `url` is empty.
+        """
+        if not url:
+            # The old code fell through to `return process_message` with the name
+            # never assigned, which surfaced as an opaque UnboundLocalError.
+            raise ValueError("A database URL is required to connect.")
+        return connect_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash)
+# Launch the Gradio app only when this template is executed directly as a script.
+if __name__ == "__main__":
+    demo.launch()

tools/__init__.py DELETED Viewed

File without changes

tools/chart_tools.py CHANGED Viewed

@@ -1,308 +1,371 @@
-# Shared parameter snippets reused across chart tool schemas.
-# Update here to change the description everywhere at once.
-_LAYOUT_PARAM = {
-    "type": "array",
-    "description": (
-        "Optional. An array containing a single JSON-formatted Plotly layout dictionary. "
-        "Use to set chart title, axis labels, colours, fonts, and other layout properties. "
-        "Example: [{\"title\": \"Monthly Sales\", \"xaxis\": {\"title\": \"Month\"}}]"
-    ),
-    "items": {"type": "string"},
-}
-_TRACE_STYLE_PARAM = {
-    "type": "array",
-    "description": (
-        "Optional. An array containing a single JSON-formatted Plotly trace styling dictionary. "
-        "Use to control visual properties such as line colour, opacity, and marker style. "
-        "Do NOT include 'x', 'y', or 'type' keys — those are set automatically from query.csv."
-    ),
-    "items": {"type": "string"},
-}
-chart_tool_schemas = [
-    {
-        "name": "scatter_chart_generation_func",
-        "description": (
-            "Generates a Plotly scatter plot from query.csv data. "
-            "Use when the user wants to visualise the relationship between two numeric columns, "
-            "create a bubble chart (via the size parameter), or overlay a trendline. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "array",
-                    "description": (
-                        "One or more column names from query.csv to plot on the x-axis. "
-                        "Multiple columns produce multiple series, each plotted against y_column."
-                    ),
-                    "items": {"type": "string"},
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv to plot on the y-axis.",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to colour-code points by a categorical grouping.",
-                },
-                "trendline": {
-                    "type": "string",
-                    "description": (
-                        "Optional trendline type. One of: 'ols' (linear regression), "
-                        "'lowess' (local smoothing), 'rolling', 'ewm', 'expanding'. "
-                        "Requires trendline_options when using 'lowess', 'rolling', or 'ewm'."
-                    ),
-                },
-                "trendline_options": {
-                    "type": "array",
-                    "description": (
-                        "Required when trendline is 'lowess', 'rolling', or 'ewm'. "
-                        "An array containing a single JSON-formatted dict of trendline options "
-                        "(e.g. [{\"window\": 7}] for a 7-point rolling average)."
-                    ),
-                    "items": {"type": "string"},
-                },
-                "marginal_x": {
-                    "type": "string",
-                    "description": "Optional marginal distribution plot along the x-axis. One of: 'histogram', 'rug', 'box', 'violin'.",
-                },
-                "marginal_y": {
-                    "type": "string",
-                    "description": "Optional marginal distribution plot along the y-axis. One of: 'histogram', 'rug', 'box', 'violin'.",
-                },
-                "size": {
-                    "type": "string",
-                    "description": "Optional column name whose values control the size of each point (bubble chart). Negative values are clamped to zero.",
-                },
-                "data": _TRACE_STYLE_PARAM,
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["x_column", "y_column"],
-        },
-    },
-    {
-        "name": "line_chart_generation_func",
-        "description": (
-            "Generates a Plotly line chart from query.csv data. "
-            "Use for trends over time or any ordered sequence. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the x-axis (typically a date or ordered index).",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the y-axis (numeric values).",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to split the data into multiple colour-coded lines.",
-                },
-                "data": _TRACE_STYLE_PARAM,
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["x_column", "y_column"],
-        },
-    },
-    {
-        "name": "bar_chart_generation_func",
-        "description": (
-            "Generates a Plotly bar chart from query.csv data. "
-            "Use for comparing values across categories. Supports grouped/stacked bars via category, "
-            "and faceted subplots via facet_row or facet_col. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the x-axis (category labels).",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the y-axis (numeric values).",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to colour-code bars into grouped or stacked series.",
-                },
-                "facet_row": {
-                    "type": "string",
-                    "description": "Optional column name. Creates one subplot row per unique value — useful for comparing distributions across a second dimension.",
-                },
-                "facet_col": {
-                    "type": "string",
-                    "description": "Optional column name. Creates one subplot column per unique value.",
-                },
-                "data": _TRACE_STYLE_PARAM,
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["x_column", "y_column"],
-        },
-    },
-    {
-        "name": "pie_chart_generation_func",
-        "description": (
-            "Generates a Plotly pie chart from query.csv data. "
-            "Use when the user wants to show part-to-whole proportions. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "values": {
-                    "type": "string",
-                    "description": "Column name from query.csv containing the numeric value for each slice.",
-                },
-                "names": {
-                    "type": "string",
-                    "description": "Column name from query.csv containing the label for each slice.",
-                },
-                "data": _TRACE_STYLE_PARAM,
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["values", "names"],
-        },
-    },
-    {
-        "name": "histogram_generation_func",
-        "description": (
-            "Generates a Plotly histogram from query.csv data. "
-            "Use to show the frequency distribution of a numeric column. "
-            "Supports normalisation (percent, probability, density) and aggregation functions per bin. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv whose values are binned on the x-axis.",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Optional column name aggregated per bin via histfunc (e.g. sum of sales per price bucket).",
-                },
-                "histnorm": {
-                    "type": "string",
-                    "description": "Optional normalisation. One of: 'percent', 'probability', 'density', 'probability density'.",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to overlay multiple colour-coded histograms.",
-                },
-                "histfunc": {
-                    "type": "string",
-                    "description": "Optional aggregation function applied to y_column per bin. One of: 'avg', 'sum', 'count'.",
-                },
-                "data": _TRACE_STYLE_PARAM,
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["x_column"],
-        },
-    },
-    {
-        "name": "box_chart_generation_func",
-        "description": (
-            "Generates a Plotly box plot from query.csv data. "
-            "Use to visualise the distribution of a numeric column and identify outliers. "
-            "Especially useful for comparing distributions across categories. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv containing the numeric values to distribute on the y-axis.",
-                },
-                "x_column": {
-                    "type": "string",
-                    "description": "Optional column name. Groups data into one box per unique value on the x-axis.",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to colour-code boxes by a secondary grouping.",
-                },
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["y_column"],
-        },
-    },
-    {
-        "name": "correlation_heatmap_func",
-        "description": (
-            "Computes pairwise Pearson correlations between numeric columns in query.csv and renders "
-            "the result as a colour-coded heatmap (blue = positive, red = negative). "
-            "Use when the user asks which variables are related, correlated, or associated with each other. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "columns": {
-                    "type": "array",
-                    "description": "Optional list of numeric column names to include in the matrix. If omitted, all numeric columns from query.csv are used. Avoid ID or index columns.",
-                    "items": {"type": "string"},
-                },
-            },
-            "required": [],
-        },
-    },
-    {
-        "name": "rolling_stats_func",
-        "description": (
-            "Generates a rolling statistics / moving average chart from query.csv data. "
-            "Overlays rolling aggregations (mean, std, min, max) on top of the original series. "
-            "Use when the user asks for a moving average, rolling average, rolling statistics, or wants to smooth a time series. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the x-axis — typically a date or sequential index.",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv containing the numeric values to compute rolling stats on.",
-                },
-                "window": {
-                    "type": "integer",
-                    "description": "Rolling window size in number of rows. Default 7. Infer from the user's request.",
-                },
-                "stats": {
-                    "type": "array",
-                    "description": "Statistics to overlay. Valid values: 'mean', 'std', 'min', 'max'. Defaults to ['mean'] if omitted.",
-                    "items": {"type": "string"},
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name to group the data, producing separate rolling stat lines per group.",
-                },
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["x_column", "y_column"],
-        },
-    },
-    {
-        "name": "table_generation_func",
-        "description": (
-            "Formats query.csv results as a styled HTML table. "
-            "Use when the user wants to view raw query results in a readable format, "
-            "or when result data is too large to describe in text. Displays up to 200 rows. "
-            "Returns an HTML table — display it verbatim in the chat."
-        ),
-        "parameters": {"type": "object", "properties": {}},
-    },
-]

+chart_tools = [
+    {
+        "type": "function",
+            "function": {
+                "name": "scatter_chart_generation_func",
+                "description": f"""This is a scatter plot generation tool useful to generate scatter plots from queried data from our data source that we are querying.
+                The data values will come from the columns of our query.csv (the 'x' and 'y' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the scatter_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "x_column": {
+                            "type": "array",
+                            "description": f"""An array of strings that correspond to the the columns in our query.csv file that contain the x values of the graph. There can be more than one column
+                            that can each be plotted against the y_column, if needed.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "y_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the y values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "category": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contain a parameter that will define the category for the data.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "trendline": {
+                            "type": "string",
+                            "description": f"""An optional field to specify the type of plotly trendline we wish to use in the scatter plot.
+                                This trendline value can be one of ['ols','lowess','rolling','ewm','expanding'].
+                                Do not send any values outside of this array as the function will fail.
+                                Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "trendline_options": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'trendline_options' portion of the plotly chart generation.
+                            The 'lowess', 'rolling', and 'ewm' options require trendline_options to be included.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "marginal_x": {
+                            "type": "string",
+                            "description": f"""The type of marginal distribution plot we'd like to specify for the plotly scatter plot for the x axis.
+                                This marginal_x value can be one of ['histogram','rug','box','violin'].
+                                Do not send any values outside of this array as the function will fail.
+                                Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "marginal_y": {
+                            "type": "string",
+                            "description": f"""The type of marginal distribution plot we'd like to specify for the plotly scatter plot for the y axis.
+                                This marginal_y value can be one of ['histogram','rug','box','violin'].
+                                Do not send any values outside of this array as the function will fail.
+                                Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "size": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contain a parameter that will define the size of each plot point.
+                            This is useful for a bubble chart where another value in our query can be represented by the size of the plotted point.
+                            Values must be greater than or equal to 0 and so in our query, all values less than 0 should be set equal to zero.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["x_column","y_column"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "line_chart_generation_func",
+                "description": f"""This is a line chart generation tool useful to generate line charts from queried data from our data source that we are querying.
+                The data values will come from the columns of our query.csv (the 'x' and 'y' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the line_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "x_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the x values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "y_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the y values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "category": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contain a parameter that will define the category for the data.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["x_column","y_column","layout"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "bar_chart_generation_func",
+                "description": f"""This is a bar chart generation tool useful to generate line charts from queried data from our data source that we are querying.
+                The data values will come from the columns of our query.csv (the 'x' and 'y' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the bar_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "x_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contains the x values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "y_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contains the y values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "category": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contains a parameter that will define the category for the data.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "facet_row": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contains a parameter that will define a faceted subplot, where different rows
+                            correspond to different values of the query specified in this parameter.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "facet_col": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contain a parameter that will define the faceted column, corresponding to
+                            different values of our query specified in this parameter.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["x_column","y_column","layout"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "pie_chart_generation_func",
+                "description": f"""This is a pie chart generation tool useful to generate pie charts from queried data from our data source that we are querying.
+                The data values will come from the columns of our query.csv (the 'values' and 'names' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the pie_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "values": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the values of the pie chart.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "names": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the label or section of each piece of the pie graph and allow us to know what each piece of the pie chart represents.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["values","names","layout"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "histogram_generation_func",
+                "description": f"""This is a histogram generation tool useful to generate histograms from queried data from our data source that we are querying.
+                The data values will come from the columns of our query.csv (the 'values' and 'names' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the histogram_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "x_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contains the x values of the histogram.
+                            This would correspond to the counts that would be distributed in the histogram.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "y_column": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contains the y values of the histogram.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "histnorm": {
+                            "type": "string",
+                            "description": f"""An optional argument to specify the type of normalization if the default isn't used.
+                            This histnorm value can be one of ['percent','probability','density','probability density'].
+                            Do not send any values outside of this array as the function will fail.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "category": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contains a parameter that will define the category for the data.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "histfunc": {
+                            "type": "string",
+                            "description": f"""An optional value that represents the function of data to compute the function which is used on the optional y column.
+                            This histfunc value can be one of ['avg','sum','count'].
+                                Do not send any values outside of this array as the function will fail.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["x_column"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "table_generation_func",
+                "description": f"""This an table generation tool useful to format data as a table from queried data from our data source that we are querying.
+                Takes no parameters as it uses data queried in our query.csv file to build the table.
+                Call this function after running our SQLite query and generating query.csv.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the table_generation_func function in any way and always display the iframe fully to the user in the chat window.""",
+                "parameters": {},
+            },
+        }
+]

tools/stats_tools.py CHANGED Viewed

@@ -1,130 +1,44 @@
-stats_tool_schemas = [
-    {
-        "name": "descriptive_stats_func",
-        "description": (
-            "Computes summary statistics for numeric columns in query.csv: "
-            "count, mean, std, min, 25th/50th/75th percentile, and max. "
-            "Use when the user asks for summary statistics, descriptive statistics, or a statistical overview. "
-            "Returns a formatted HTML table."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "columns": {
-                    "type": "array",
-                    "description": "Optional list of column names to include. If omitted, all numeric columns from query.csv are used. Avoid ID or index columns.",
-                    "items": {"type": "string"},
-                },
-            },
-            "required": [],
-        },
-    },
-    {
-        "name": "kmeans_clustering_func",
-        "description": (
-            "Runs K-Means clustering on numeric feature columns from query.csv. "
-            "Groups rows into k clusters, displays a scatter plot coloured by cluster assignment, "
-            "and returns a centroid summary table showing the mean of each feature per cluster. "
-            "Use when the user asks to cluster the data, find natural segments or groups, or apply K-Means. "
-            "Returns an HTML iframe and summary table."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "feature_columns": {
-                    "type": "array",
-                    "description": "List of numeric column names from query.csv to use as clustering features.",
-                    "items": {"type": "string"},
-                },
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the x-axis of the scatter plot. Usually one of the feature columns.",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the y-axis of the scatter plot. Usually one of the feature columns.",
-                },
-                "n_clusters": {
-                    "type": "integer",
-                    "description": "Number of clusters (k). Default 3. Infer from the user's request.",
-                },
-                "layout": {
-                    "type": "array",
-                    "description": "Optional. An array containing a single JSON-formatted Plotly layout dictionary.",
-                    "items": {"type": "string"},
-                },
-            },
-            "required": ["feature_columns", "x_column", "y_column"],
-        },
-    },
-    {
-        "name": "hypothesis_test_func",
-        "description": (
-            "Performs a statistical hypothesis test on query.csv data and returns a formatted results table "
-            "with test statistic, p-value, and significance at α=0.05. "
-            "Supported tests:\n"
-            "- 't_test_independent': compare means of a numeric column across two groups "
-            "(requires group_column; use group_values if the column has more than 2 unique values).\n"
-            "- 't_test_one_sample': test whether a column's mean equals a hypothesized value (requires pop_mean).\n"
-            "- 'chi_square': test independence between two categorical columns (requires column and column2)."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "test_type": {
-                    "type": "string",
-                    "description": "Test to run. One of: 't_test_independent', 't_test_one_sample', 'chi_square'.",
-                },
-                "column": {
-                    "type": "string",
-                    "description": "Primary column for the test. Numeric for t-tests; first categorical column for chi-square.",
-                },
-                "column2": {
-                    "type": "string",
-                    "description": "Second categorical column. Required for 'chi_square'.",
-                },
-                "group_column": {
-                    "type": "string",
-                    "description": "Grouping column. Required for 't_test_independent'. Must have exactly 2 unique values, or specify group_values.",
-                },
-                "group_values": {
-                    "type": "array",
-                    "description": "Exactly 2 group labels to compare. Use when group_column has more than 2 unique values.",
-                    "items": {"type": "string"},
-                },
-                "pop_mean": {
-                    "type": "number",
-                    "description": "Hypothesized population mean (μ₀). Required for 't_test_one_sample'.",
-                },
-            },
-            "required": ["test_type", "column"],
-        },
-    },
-    {
-        "name": "regression_func",
-        "description": (
-            "Runs an OLS linear regression on query.csv data. "
-            "Use when the user wants to model the relationship between variables, assess predictors, or run a regression. "
-            "Returns a regression summary (coefficients, R², p-values) and a scatter plot with the fitted line as an HTML iframe."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "independent_variables": {
-                    "type": "array",
-                    "description": "Column names from query.csv to use as independent (predictor) variables.",
-                    "items": {"type": "string"},
-                },
-                "dependent_variable": {
-                    "type": "string",
-                    "description": "Column name from query.csv to use as the dependent (outcome) variable.",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to colour-code points and fit separate regression lines per group.",
-                },
-            },
-            "required": ["independent_variables", "dependent_variable"],
-        },
-    },
-]

+# Tool definitions for statistical analysis; currently only `regression_func`.
+# Each entry has the {"type": "function", "function": {...}} shape, where
+# "parameters" is a JSON-Schema-style object describing the call arguments.
+# NOTE(review): presumably handed to the chat model as callable tools - confirm at the call site.
+stats_tools = [
+        {
+        "type": "function",
+            "function": {
+                "name": "regression_func",
+                "description": """This a tool to calculate regressions on our data source that we are querying.
+                We can run queries with our 'sql_query_func' function and they will be available to use in this function via the query.csv file that is generated.
+                Returns a dictionary of values that includes a regression_summary and a regression chart (which is an iframe displaying the
+                linear regression in chart form and should be shown to the user).""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "independent_variables": {
+                            "type": "array",
+                            "description": """An array of strings that states the independent variables in our data set which should be column names in our query.csv file that is generated
+                            in the 'sql_query_func' function. This will allow us to identify the data to use for our independent variables.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        # Scalar string: "items" is an array-only keyword, so it is omitted here.
+                        "dependent_variable": {
+                            "type": "string",
+                            "description": """A string that states the dependent variables in our data set which should be a column name in our query.csv file that is generated
+                            in the 'sql_query_func' function. This will allow us to identify the data to use for our dependent variables.
+                            Infer this from the user's message.""",
+                        },
+                        # Optional (not listed in "required"); also a scalar string, so no "items".
+                        "category": {
+                            "type": "string",
+                            "description": """An optional column in our query.csv file that contain a parameter that will define the category for the data.
+                            Do not send value if no category is needed or specified. This category must be present in our query.csv file to be valid.""",
+                        }
+                    },
+                    "required": ["independent_variables","dependent_variable"],
+                },
+            },
+        }
+]

tools/tools.py CHANGED Viewed

@@ -1,130 +1,149 @@
-from .stats_tools import stats_tool_schemas
-from .chart_tools import chart_tool_schemas
-def tools_call(session_hash, data_source, titles):
-    from haystack.tools import Tool
-    _noop = lambda **kwargs: None
-    def make_tool(schema):
-        return Tool(
-            name=schema["name"],
-            description=schema["description"],
-            parameters=schema["parameters"],
-            function=_noop,
-        )
-    titles_string = (titles[:625] + '..') if len(titles) > 625 else titles
-    query_tool_schemas = {
-        'file_upload': {
-            "name": "query_func",
-            "description": f"""This is a tool useful to query a SQLite table called 'data_source' with the following Columns: {titles_string}.
-            There may also be more columns in the table if the number of columns is too large to process.
-            This function also saves the results of the query to csv file called query.csv.""",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "queries": {
-                        "type": "string",
-                        "description": "The query to use in the search. Infer this from the user's message. It should be a question or a statement."
-                    }
-                },
-                "required": ["queries"]
-            },
-        },
-        'sql': {
-            "name": "query_func",
-            "description": f"""This is a tool useful to query a PostgreSQL database with the following tables, {titles_string}.
-            There may also be more tables in the database if the number of tables is too large to process.
-            This function also saves the results of the query to csv file called query.csv.""",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "queries": {
-                        "type": "string",
-                        "description": "The PostgreSQL query to use in the search. Infer this from the user's message. It should be a question or a statement."
-                    }
-                },
-                "required": ["queries"]
-            },
-        },
-        'doc_db': {
-            "name": "query_func",
-            "description": f"""This is a tool useful to build an aggregation pipeline to query a MongoDB NoSQL document database with the following collections, {titles_string}.
-            There may also be more collections in the database if the number of collections is too large to process.
-            This function also saves the results of the query to a csv file called query.csv.""",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "queries": {
-                        "type": "string",
-                        "description": "The MongoDB aggregation pipeline to use in the search. Infer this from the user's message. It should be a question or a statement."
-                    },
-                    "db_collection": {
-                        "type": "string",
-                        "description": "The MongoDB collection to use in the search. Infer this from the user's message. It should be a question or a statement."
-                    }
-                },
-                "required": ["queries", "db_collection"]
-            },
-        },
-        'graphql': [
-            {
-                "name": "query_func",
-                "description": f"""This is a tool useful to build a GraphQL query for a GraphQL API endpoint with the following types, {titles_string}.
-                There may also be more types in the GraphQL endpoint if the number of types is too large to process.
-                This function also saves the results of the query to a csv file called query.csv.""",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "queries": {
-                            "type": "string",
-                            "description": "The GraphQL query to use in the search. Infer this from the user's message. It should be a question or a statement."
-                        }
-                    },
-                    "required": ["queries"]
-                },
-            },
-            {
-                "name": "graphql_schema_query",
-                "description": f"""This is a tool useful to query a GraphQL type and receive back information about its schema. This is useful because
-                the GraphQL introspection query is too large to be ingested all at once and this allows us to query the schema one type at a time to
-                view it in manageable bites. You may realize after viewing the schema, that the type you selected was not appropriate for the question
-                you are attempting answer. You may then query additional types to find the appropriate types to use for your GraphQL API query.""",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "graphql_type": {
-                            "type": "string",
-                            "description": "The GraphQL type that we want to view the schema of in order to make the proper query with our graphql_query_func. Infer this from the user's message. It should be a question or a statement."
-                        }
-                    },
-                    "required": ["graphql_type"]
-                },
-            },
-            {
-                "name": "graphql_csv_query",
-                "description": f"""This is a tool useful to SQL query our query.csv file that is generated from our GraphQL query. This is useful in a situation
-                where the results of the GraphQL query need additional querying to answer the user question. The query.csv file is converted to a Pandas dataframe
-                and we query that dataframe with SQL on a table called 'query' before converting it back to a csv file.""",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "csv_query": {
-                            "type": "string",
-                            "description": "The pandas dataframe SQL query to use in the search. The table that we query is named 'query'. Infer this from the user's message. It should be a question or a statement."
-                        }
-                    },
-                    "required": ["csv_query"]
-                },
-            },
-        ]
-    }
-    source_schemas = query_tool_schemas[data_source]
-    source_tools = [make_tool(s) for s in (source_schemas if isinstance(source_schemas, list) else [source_schemas])]
-    chart_tools = [make_tool(s) for s in chart_tool_schemas]
-    stats_tools = [make_tool(s) for s in stats_tool_schemas]
-    return source_tools + chart_tools + stats_tools

+from .stats_tools import stats_tools
+from .chart_tools import chart_tools
+def tools_call(session_hash, data_source, titles):
+    """Build the list of tool (function-calling) schemas for a data source.
+
+    Each returned entry is a dict of the form
+    ``{"type": "function", "function": {"name", "description", "parameters"}}``.
+    The query tools specific to ``data_source`` come first, followed by the
+    entries of ``chart_tools`` and then ``stats_tools``.
+
+    Args:
+        session_hash: Not used by this function; kept so existing callers
+            that pass it keep working.
+        data_source: One of ``'file_upload'``, ``'sql'``, ``'doc_db'`` or
+            ``'graphql'``; selects which query tool schemas are included.
+        titles: Text listing the columns / tables / collections / types of
+            the data source. It is interpolated into the tool descriptions
+            and truncated to 625 characters (plus ``'..'``) when longer.
+
+    Returns:
+        A new list of tool schema dicts.
+
+    Raises:
+        KeyError: If ``data_source`` is not one of the supported keys.
+    """
+    # Keep the tool descriptions bounded in size when there are many titles.
+    titles_string = (titles[:625] + '..') if len(titles) > 625 else titles
+    tools_calls = {
+            'file_upload' : [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "sqlite_query_func",
+                        "description": f"""This is a tool useful to query a SQLite table called 'data_source' with the following Columns: {titles_string}.
+                        There may also be more columns in the table if the number of columns is too large to process.
+                        This function also saves the results of the query to csv file called query.csv.""",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {
+                                "queries": {
+                                    "type": "array",
+                                    "description": "The query to use in the search. Infer this from the user's message. It should be a question or a statement",
+                                    "items": {
+                                        "type": "string",
+                                    }
+                                }
+                            },
+                            "required": ["queries"],
+                        },
+                    },
+                },
+            ],
+            'sql' : [
+            {
+                "type": "function",
+                "function": {
+                    "name": "sql_query_func",
+                    "description": f"""This is a tool useful to query a PostgreSQL database with the following tables, {titles_string}.
+                    There may also be more tables in the database if the number of tables is too large to process.
+                    This function also saves the results of the query to csv file called query.csv.""",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "queries": {
+                                "type": "array",
+                                "description": "The PostgreSQL query to use in the search. Infer this from the user's message. It should be a question or a statement",
+                                "items": {
+                                    "type": "string",
+                                }
+                            }
+                        },
+                        "required": ["queries"],
+                    },
+                },
+            },
+        ],
+        'doc_db' : [
+            {
+                "type": "function",
+                "function": {
+                    "name": "doc_db_query_func",
+                    "description": f"""This is a tool useful to build an aggregation pipeline to query a MongoDB NoSQL document database with the following collections, {titles_string}.
+                    There may also be more collections in the database if the number of collections is too large to process.
+                    This function also saves the results of the query to a csv file called query.csv.""",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_pipeline": {
+                                "type": "string",
+                                "description": "The MongoDB aggregation pipeline to use in the search. Infer this from the user's message. It should be a question or a statement."
+                            },
+                            "db_collection": {
+                                "type": "string",
+                                "description": "The MongoDB collection to use in the search. Infer this from the user's message. It should be a question or a statement.",
+                            }
+                        },
+                        "required": ["aggregation_pipeline","db_collection"],
+                    },
+                },
+            },
+        ],
+        'graphql' : [
+            {
+                "type": "function",
+                "function": {
+                    "name": "graphql_query_func",
+                    "description": f"""This is a tool useful to build a GraphQL query for a GraphQL API endpoint with the following types, {titles_string}.
+                    There may also be more types in the GraphQL endpoint if the number of types is too large to process.
+                    This function also saves the results of the query to a csv file called query.csv.""",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "graphql_query": {
+                                "type": "string",
+                                "description": "The GraphQL query to use in the search. Infer this from the user's message. It should be a question or a statement."
+                            }
+                        },
+                        "required": ["graphql_query"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "graphql_schema_query",
+                    "description": """This is a tool useful to query a GraphQL type and receive back information about its schema. This is useful because
+                    the GraphQL introspection query is too large to be ingested all at once and this allows us to query the schema one type at a time to
+                    view it in manageable bites. You may realize after viewing the schema, that the type you selected was not appropriate for the question
+                    you are attempting answer. You may then query additional types to find the appropriate types to use for your GraphQL API query.""",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "graphql_type": {
+                                "type": "string",
+                                "description": "The GraphQL type that we want to view the schema of in order to make the proper query with our graphql_query_func. Infer this from the user's message. It should be a question or a statement."
+                            }
+                        },
+                        "required": ["graphql_type"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "graphql_csv_query",
+                    "description": """This is a tool useful to SQL query our query.csv file that is generated from our GraphQL query. This is useful in a situation
+                    where the results of the GraphQL query need additional querying to answer the user question.  The query.csv file is converted to a Pandas dataframe
+                    and we query that dataframe with SQL on a table called 'query' before converting it back to a csv file.""",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "csv_query": {
+                                "type": "string",
+                                "description": "The pandas dataframe SQL query to use in the search. The table that we query is named 'query'. Infer this from the user's message. It should be a question or a statement"
+                            }
+                        },
+                        "required": ["csv_query"],
+                    },
+                },
+            },
+        ]
+    }
+    if data_source not in tools_calls:
+        # Still a KeyError (as before), but now it names the valid options.
+        raise KeyError(
+            f"Unsupported data_source {data_source!r}; "
+            f"expected one of {sorted(tools_calls)}"
+        )
+    # Build a fresh list instead of extending the dict entry in place.
+    return [*tools_calls[data_source], *chart_tools, *stats_tools]

utils.py CHANGED Viewed

@@ -4,6 +4,4 @@ current_dir = Path(__file__).parent
 TEMP_DIR = current_dir / 'temp'
-message_dict = {}
-api_key_store = {}
-model_store = {}


4
5	TEMP_DIR = current_dir / 'temp'
6
7	+ message_dict = {}