Spaces:

nolanzandi
/

virtual-data-analyst

Running

App Files Community

## 1. UI/UX Enhancements

#21

by bvproperty - opened Apr 12, 2025

base: refs/heads/main

←

from: refs/pr/21

Discussion Files changed

+1643

-107865

Files changed (32) hide show

.gitattributes +1 -0
.gitignore +0 -4
README.md +2 -2
app.py +121 -192
data_sources/__init__.py +1 -4
data_sources/connect_doc_db.py +0 -36
data_sources/connect_graphql.py +0 -148
data_sources/connect_sql_db.py +0 -42
data_sources/upload_file.py +7 -81
functions/__init__.py +7 -15
functions/chart_functions.py +79 -216
functions/chat_functions.py +60 -151
functions/query_functions.py +0 -229
functions/sqlite_functions.py +47 -0
functions/stat_functions.py +9 -238
index.html +245 -0
requirements.txt +1 -11
samples/online_retail_data.csv +0 -0
samples/tb_illness_data.csv +0 -0
script.js +440 -0
assets/styles.css → styles.css +168 -198
temp/.gitignore +0 -2
templates/data_file.py +0 -286
templates/doc_db.py +0 -105
templates/graphql.py +0 -110
templates/sql_db.py +0 -102
tools.py +451 -0
tools/__init__.py +0 -0
tools/chart_tools.py +0 -308
tools/stats_tools.py +0 -130
tools/tools.py +0 -130
utils.py +1 -3

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+samples/online_retail_data.csv filter=lfs diff=lfs merge=lfs -text

.gitignore DELETED Viewed

@@ -1,4 +0,0 @@
-__pycache__/
-.gradio/
-.env
-temp/

README.md CHANGED Viewed

@@ -4,10 +4,10 @@ emoji: 📈
 colorFrom: pink
 colorTo: blue
 sdk: gradio
-sdk_version: 5.29.0
 app_file: app.py
 pinned: true
-short_description: Queries, visualizations, stat analysis on your data
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorFrom: pink
 colorTo: blue
 sdk: gradio
+sdk_version: 5.23.3
 app_file: app.py
 pinned: true
+short_description: Queries, visualizations, stat analysis on data files
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,192 +1,121 @@
-from utils import TEMP_DIR, message_dict, api_key_store, model_store
-import gradio as gr
-import templates.data_file as data_file, templates.sql_db as sql_db, templates.doc_db as doc_db, templates.graphql as graphql
-import os
-from dotenv import load_dotenv
-load_dotenv()
-def delete_db(req: gr.Request):
-    import shutil
-    dir_path = TEMP_DIR / str(req.session_hash)
-    if os.path.exists(dir_path):
-        shutil.rmtree(dir_path)
-        message_dict[req.session_hash] = {}
-    api_key_store.pop(req.session_hash, None)
-    model_store.pop(req.session_hash, None)
-def set_api_key(api_key, model, request: gr.Request):
-    api_key = api_key.strip()
-    if not api_key:
-        return (
-            gr.update(visible=True),
-            gr.update(visible=True, value="<p style='color:#b91c1c;text-align:center;margin:6px 0;font-size:14px;'>Please enter your API key.</p>"),
-            gr.update(visible=False),
-        )
-    api_key_store[request.session_hash] = api_key
-    model_store[request.session_hash] = model
-    provider = "Anthropic" if api_key.startswith("sk-ant-") else "OpenAI"
-    provider_icon = "fa-a" if provider == "Anthropic" else "fa-o"
-    badge_html = f"""
-    <div style="display:flex;flex-direction:column;align-items:center;gap:6px;padding:10px 0 4px;">
-        <div style="display:inline-flex;align-items:center;gap:10px;background:#f0fdf4;border:1px solid #86efac;
-                    padding:8px 20px;border-radius:9999px;font-size:13px;font-weight:500;color:#15803d;
-                    box-shadow:0 1px 3px rgba(0,0,0,0.06);">
-            <i class="fas fa-circle-check" style="font-size:14px;"></i>
-            <span>{provider}</span>
-            <span style="color:#86efac;">·</span>
-            <span style="font-weight:600;">{model}</span>
-        </div>
-        <p style="margin:0;font-size:11px;color:#9ca3af;letter-spacing:0.02em;">
-            Session active — use the button below to change
-        </p>
-    </div>
-    """
-    return gr.update(visible=False), gr.update(visible=True, value=badge_html), gr.update(visible=True)
-def show_api_form():
-    return gr.update(visible=True), gr.update(visible=False, value=""), gr.update(visible=False)
-css = ".file_marker .large{min-height:50px !important;} .padding{padding:0;} .description_component{overflow:visible !important;}"
-head = """<meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Virtual Data Analyst</title>
-    <!-- Tailwind CSS -->
-    <script src="https://cdn.tailwindcss.com"></script>
-    <!-- Google Fonts -->
-    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
-    <!-- Font Awesome -->
-    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
-    <!-- Custom Styles -->
-    <link rel="stylesheet" href="/gradio_api/file=assets/styles.css">
-    """
-theme = gr.themes.Base(primary_hue="sky", secondary_hue="slate", font=[gr.themes.GoogleFont("Inter"), "Inter", "sans-serif"]).set(
-    button_primary_background_fill="#3B82F6",
-    button_secondary_background_fill="#6B7280",
-)
-from pathlib import Path
-gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
-_env_api_key = os.getenv("OPENAI_API_KEY", "")
-OPENAI_MODELS = [
-    "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
-    "gpt-4o", "gpt-4o-mini",
-    "o3-mini", "o4-mini",
-    "gpt-5.4-mini", "gpt-5.4", "gpt-5.5",
-]
-ANTHROPIC_MODELS = [
-    "claude-sonnet-4-6",
-    "claude-opus-4-8",
-    "claude-haiku-4-5-20251001",
-]
-def update_models(api_key):
-    if api_key.strip().startswith("sk-ant-"):
-        return gr.update(choices=ANTHROPIC_MODELS, value=ANTHROPIC_MODELS[0])
-    return gr.update(choices=OPENAI_MODELS, value=OPENAI_MODELS[0])
-with gr.Blocks(theme=theme, css=css, head=head, delete_cache=(3600, 3600)) as demo:
-    with gr.Column(visible=True) as api_key_section:
-        gr.HTML("""
-            <div style="max-width:640px;margin:28px auto 12px;padding:22px 28px;
-                        background:linear-gradient(135deg,#eff6ff 0%,#e0f2fe 100%);
-                        border:1px solid #bfdbfe;border-radius:14px;
-                        box-shadow:0 2px 8px rgba(59,130,246,0.08);">
-                <div style="display:flex;align-items:flex-start;gap:16px;">
-                    <div style="width:42px;height:42px;flex-shrink:0;background:#3B82F6;
-                                border-radius:10px;display:flex;align-items:center;
-                                justify-content:center;box-shadow:0 2px 6px rgba(59,130,246,0.35);">
-                        <i class="fas fa-key" style="color:white;font-size:16px;"></i>
-                    </div>
-                    <div>
-                        <h3 style="color:#1e40af;margin:0 0 6px;font-size:16px;font-weight:700;letter-spacing:-0.01em;">
-                            Get Started
-                        </h3>
-                        <p style="color:#3730a3;font-size:13.5px;margin:0;line-height:1.6;">
-                            Enter your <strong>OpenAI</strong>
-                            (<code style="background:rgba(255,255,255,0.7);padding:1px 6px;border-radius:4px;font-size:12px;">sk-...</code>)
-                            or <strong>Anthropic</strong>
-                            (<code style="background:rgba(255,255,255,0.7);padding:1px 6px;border-radius:4px;font-size:12px;">sk-ant-...</code>)
-                            API key. The model list updates automatically. Your key is held in memory only
-                            and cleared when you leave — never saved or shared.
-                        </p>
-                    </div>
-                </div>
-            </div>
-        """)
-        with gr.Row(equal_height=True):
-            api_key_input = gr.Textbox(
-                label="API Key",
-                placeholder="sk-proj-...  or  sk-ant-api03-...",
-                type="password",
-                value=_env_api_key,
-                scale=4,
-            )
-            model_dropdown = gr.Dropdown(
-                label="Model",
-                choices=OPENAI_MODELS,
-                value=OPENAI_MODELS[0],
-                scale=2,
-            )
-            api_key_btn = gr.Button("Set API Key", variant="primary", scale=1, min_width=120)
-    api_key_status = gr.HTML("", visible=False)
-    change_key_btn = gr.Button("🔑  Change Key / Model", variant="secondary", visible=False, size="sm")
-    api_key_input.change(fn=update_models, inputs=api_key_input, outputs=model_dropdown)
-    api_key_btn.click(
-        fn=set_api_key,
-        inputs=[api_key_input, model_dropdown],
-        outputs=[api_key_section, api_key_status, change_key_btn],
-    )
-    change_key_btn.click(fn=show_api_form, outputs=[api_key_section, api_key_status, change_key_btn])
-    header = gr.HTML("""
-        <header class="max-w-4xl mx-auto mb-12 text-center">
-            <h1 class="text-4xl font-bold text-gray-900 mb-4">Virtual Data Analyst</h1>
-            <p class="text-lg text-gray-600 mb-6">
-                A powerful tool for data analysis, visualizations, and insights
-            </p>
-        </header>
-        <main class="max-w-4xl mx-auto">
-            <div class="mt-12 grid md:grid-cols-3 gap-6" style="margin-bottom:3px !important;">
-                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
-                    <i class="feature-icon fas fa-chart-line text-primary text-2xl mb-4"></i>
-                    <h3 class="font-semibold text-gray-800 mb-2">Advanced Analytics</h3>
-                    <p class="text-gray-600 text-sm">Run SQL queries, perform regressions, and analyze results with ease</p>
-                </div>
-                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
-                    <i class="feature-icon fas fa-chart-pie text-primary text-2xl mb-4"></i>
-                    <h3 class="font-semibold text-gray-800 mb-2">Rich Visualizations</h3>
-                    <p class="text-gray-600 text-sm">Create scatter plots, line charts, pie charts, and more</p>
-                </div>
-                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
-                    <i class="feature-icon fas fa-magic text-primary text-2xl mb-4"></i>
-                    <h3 class="font-semibold text-gray-800 mb-2">Automated Insights</h3>
-                    <p class="text-gray-600 text-sm">Get instant insights and recommendations for your data</p>
-                </div>
-            </div>
-        </main>""")
-    with gr.Tab("📄  Data File"):
-        data_file.demo.render()
-    with gr.Tab("🗄  SQL Database"):
-        sql_db.demo.render()
-    with gr.Tab("🍃  MongoDB"):
-        doc_db.demo.render()
-    with gr.Tab("⚡  GraphQL API"):
-        graphql.demo.render()
-    footer = gr.HTML("""
-        <footer class="max-w-4xl mx-auto mt-12 text-center text-gray-500 text-sm">
-            <p>This application is under active development. For bugs or feedback, please open a discussion in the community tab.</p>
-        </footer>""")
-    demo.unload(delete_db)
-demo.launch(debug=True, allowed_paths=["temp/", "assets/"])

+from data_sources import process_data_upload
+from functions import example_question_generator, chatbot_with_fc
+from utils import TEMP_DIR, message_dict
+import gradio as gr
+import ast
+import os
+from getpass import getpass
+from dotenv import load_dotenv
+load_dotenv()
+if "OPENAI_API_KEY" not in os.environ:
+    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
+def delete_db(req: gr.Request):
+    import shutil
+    dir_path = TEMP_DIR / str(req.session_hash)
+    if os.path.exists(dir_path):
+        shutil.rmtree(dir_path)
+        message_dict[req.session_hash] = None
+def run_example(input):
+    return input
+def example_display(input):
+    if input == None:
+        display = True
+    else:
+        display = False
+    return [gr.update(visible=display),gr.update(visible=display)]
+css= ".file_marker .large{min-height:50px !important;} .example_btn{max-width:300px;} .padding{padding:0;}"
+with gr.Blocks(css=css, delete_cache=(3600,3600)) as demo:
+    title = gr.HTML("<h1 style='text-align:center;'>Virtual Data Analyst</h1>")
+    description = gr.HTML("""<p style='text-align:center;'>A helpful tool for data analysis, visualizations, regressions, and more.
+                          Upload a data file and chat with our virtual data analyst to get insights on your data set.
+                          Try a sample file to get started!</p>
+                          <ul style="margin:auto;max-width: 500px;">
+                          <li style="margin:0;line-height:1;">Currently accepts CSV, TSV, TXT, XLS, XLSX, XML, and JSON files.</li>
+                          <li style="margin:0;line-height:1;">Can run SQL queries, linear regressions, and analyze the results.</li>
+                          <li style="margin:0;line-height:1;">Can generate scatter plots, line charts, pie charts, bar graphs, histograms, time series, and more.
+                          New visualizations types added regularly.</li>
+                          </ul>
+                          <p style='text-align:center;'>This application is under active development. If you experience bugs with use,
+                          open a discussion in the community tab and I will respond.</p>""")
+    example_file_1 = gr.File(visible=False, value="samples/bank_marketing_campaign.csv")
+    example_file_2 = gr.File(visible=False, value="samples/online_retail_data.csv")
+    with gr.Row():
+        example_btn_1 = gr.Button(value="Try Me: bank_marketing_campaign.csv", elem_classes="example_btn", size="md", variant="primary")
+        example_btn_2 = gr.Button(value="Try Me: online_retail_data.csv", elem_classes="example_btn", size="md", variant="primary")
+    file_output = gr.File(label="Data File (CSV, TSV, TXT, XLS, XLSX, XML, JSON)", show_label=True, elem_classes="file_marker", file_types=['.csv','.xlsx','.txt','.json','.ndjson','.xml','.xls','.tsv'])
+    example_btn_1.click(fn=run_example, inputs=example_file_1, outputs=file_output)
+    example_btn_2.click(fn=run_example, inputs=example_file_2, outputs=file_output)
+    file_output.change(fn=example_display, inputs=file_output, outputs=[example_btn_1, example_btn_2])
+    @gr.render(inputs=file_output)
+    def data_options(filename, request: gr.Request):
+        print(filename)
+        message_dict[request.session_hash] = None
+        if filename:
+            process_message = process_upload(filename, request.session_hash)
+            gr.HTML(value=process_message[1], padding=False)
+            if process_message[0] == "success":
+                if "bank_marketing_campaign" in filename:
+                    example_questions = [
+                                            ["Describe the dataset"],
+                                            ["What levels of education have the highest and lowest average balance?"],
+                                            ["What job is most and least common for a yes response from the individuals, not counting 'unknown'?"],
+                                            ["Can you generate a bar chart of education vs. average balance?"],
+                                            ["Can you generate a table of levels of education versus average balance, percent married, percent with a loan, and percent in default?"],
+                                            ["Can we predict the relationship between the number of contacts performed before this campaign and the average balance?"],
+                                            ["Can you plot the number of contacts performed before this campaign versus the duration and use balance as the size in a bubble chart?"]
+                                        ]
+                elif "online_retail_data" in filename:
+                    example_questions = [
+                                            ["Describe the dataset"],
+                                            ["What month had the highest revenue?"],
+                                            ["Is revenue higher in the morning or afternoon?"],
+                                            ["Can you generate a line graph of revenue per month?"],
+                                            ["Can you generate a table of revenue per month?"],
+                                            ["Can we predict how time of day affects transaction value in this data set?"],
+                                            ["Can you plot revenue per month with size being the number of units sold that month in a bubble chart?"]
+                                        ]
+                else:
+                    try:
+                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash))
+                        example_questions = [
+                                                ["Describe the dataset"]
+                                            ]
+                        for example in generated_examples:
+                            example_questions.append([example])
+                    except:
+                        example_questions = [
+                                            ["Describe the dataset"],
+                                            ["List the columns in the dataset"],
+                                            ["What could this data be used for?"],
+                                        ]
+                parameters = gr.Textbox(visible=False, value=request.session_hash)
+                bot = gr.Chatbot(type='messages', label="CSV Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
+                chat = gr.ChatInterface(
+                                    fn=chatbot_with_fc,
+                                    type='messages',
+                                    chatbot=bot,
+                                    title="Chat with your data file",
+                                    concurrency_limit=None,
+                                    examples=example_questions,
+                                    additional_inputs=parameters
+                                    )
+    def process_upload(upload_value, session_hash):
+        if upload_value:
+            process_message = process_data_upload(upload_value, session_hash)
+        return process_message
+    demo.unload(delete_db)
+## Launch the chat app with UI
+demo.launch(debug=True, allowed_paths=["temp/"])

data_sources/__init__.py CHANGED Viewed

@@ -1,6 +1,3 @@
 from .upload_file import process_data_upload
-from .connect_sql_db import connect_sql_db
-from .connect_doc_db import connect_doc_db
-from .connect_graphql import connect_graphql
-__all__ = ["process_data_upload","connect_sql_db","connect_doc_db","connect_graphql"]


 from .upload_file import process_data_upload
+__all__ = ["process_data_upload"]

data_sources/connect_doc_db.py DELETED Viewed

@@ -1,36 +0,0 @@
-from pymongo import MongoClient
-import os
-from utils import TEMP_DIR
-from pymongo_schema.extract import extract_pymongo_client_schema
-def connect_doc_db(connection_string, nosql_db_name, session_hash):
-    try:
-        # Create a MongoClient object
-        client = MongoClient(connection_string)
-        print("Connected to NoSQL Mongo DB")
-        # Access a database
-        db = client[nosql_db_name]
-        collection_names = db.list_collection_names()
-        print(collection_names)
-        schema = extract_pymongo_client_schema(client)
-        # Close the connection
-        if 'client' in locals() and client:
-            client.close()
-            print("MongoDB Connection closed.")
-        session_path = 'doc_db'
-        dir_path = TEMP_DIR / str(session_hash) / str(session_path)
-        os.makedirs(dir_path, exist_ok=True)
-        return ["success","<p style='color:green;text-align:center;font-size:18px;'>Document database connected successful</p>", collection_names, schema]
-    except Exception as e:
-        print("DocDB CONNECTION ERROR")
-        print(e)
-        return ["error",f"<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: {e}</p>"]

data_sources/connect_graphql.py DELETED Viewed

@@ -1,148 +0,0 @@
-import requests
-import certifi
-import os
-import json
-from utils import TEMP_DIR
-def connect_graphql(graphql_url, api_token, graphql_token_header, session_hash):
-    try:
-        # Create the GraphQL Introspection Query
-        query = """
-                query IntrospectionQuery {
-                    __schema {
-                    queryType { name }
-                    mutationType { name }
-                    subscriptionType { name }
-                    types {
-                        ...FullType
-                    }
-                    directives {
-                        name
-                        description
-                        locations
-                        args {
-                        ...InputValue
-                        }
-                    }
-                    }
-                }
-                fragment FullType on __Type {
-                    kind
-                    name
-                    description
-                    fields(includeDeprecated: true) {
-                    name
-                    description
-                    args {
-                        ...InputValue
-                    }
-                    type {
-                        ...TypeRef
-                    }
-                    isDeprecated
-                    deprecationReason
-                    }
-                    inputFields {
-                    ...InputValue
-                    }
-                    interfaces {
-                    ...TypeRef
-                    }
-                    enumValues(includeDeprecated: true) {
-                    name
-                    description
-                    isDeprecated
-                    deprecationReason
-                    }
-                    possibleTypes {
-                    ...TypeRef
-                    }
-                }
-                fragment InputValue on __InputValue {
-                    name
-                    description
-                    type { ...TypeRef }
-                    defaultValue
-                }
-                fragment TypeRef on __Type {
-                    kind
-                    name
-                    ofType {
-                    kind
-                    name
-                    ofType {
-                        kind
-                        name
-                        ofType {
-                        kind
-                        name
-                        ofType {
-                            kind
-                            name
-                            ofType {
-                            kind
-                            name
-                            ofType {
-                                kind
-                                name
-                                ofType {
-                                kind
-                                name
-                                }
-                            }
-                            }
-                        }
-                        }
-                    }
-                    }
-                }
-                """
-        print("Connecting to GraphQL Endpoint")
-        # Access a database
-        headers = {"Content-Type": "application/json"}
-        if graphql_token_header and api_token:
-            headers[graphql_token_header] = api_token
-        response = requests.post(graphql_url, headers=headers, json={"query": query},
-                    verify=certifi.where())
-        response.raise_for_status()
-        introspection_result = response.json()
-        client_schema = introspection_result["data"]["__schema"]
-        #Generate the list of types
-        type_names_query = """
-                query IntrospectionQuery {
-                    __schema {
-                        types {
-                            name
-                        }
-                    }
-                }
-            """
-        types_response = requests.post(graphql_url, headers=headers, json={"query": type_names_query},
-                                       verify=certifi.where())
-        types_response_results =types_response.json()
-        types_names = types_response_results["data"]
-        type_names = []
-        for name in types_names["__schema"]["types"]:
-            type_names.append(name["name"])
-        session_path = 'graphql'
-        dir_path = TEMP_DIR / str(session_hash) / str(session_path)
-        os.makedirs(dir_path, exist_ok=True)
-        with open(f'{dir_path}/schema.json', 'w') as fp:
-            json.dump(client_schema, fp, indent=2)
-        return ["success","<p style='color:green;text-align:center;font-size:18px;'>GraphQL API connected successful</p>", type_names]
-    except Exception as e:
-        print("GraphQL CONNECTION ERROR")
-        print(e)
-        return ["error",f"<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: {e}</p>"]

data_sources/connect_sql_db.py DELETED Viewed

@@ -1,42 +0,0 @@
-import psycopg2
-import os
-from utils import TEMP_DIR
-def connect_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash):
-    try:
-        conn = psycopg2.connect(
-            database=sql_db_name,
-            user=sql_user,
-            password=sql_pass,
-            host=url,  # e.g., "localhost" or an IP address
-            port=sql_port  # default is 5432
-        )
-        print("Connected to PostgreSQL")
-        # Create a cursor object to execute SQL queries
-        cur = conn.cursor()
-        # Example: Execute a query
-        cur.execute("""SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'""")
-        table_tuples = cur.fetchall()
-        table_names = []
-        for table in table_tuples:
-            table_names.append(table[0])
-        print(table_names)
-        # Close the cursor and connection
-        cur.close()
-        conn.close()
-        print("Connection closed.")
-        session_path = 'sql'
-        dir_path = TEMP_DIR / str(session_hash) / str(session_path)
-        os.makedirs(dir_path, exist_ok=True)
-        return ["success","<p style='color:green;text-align:center;font-size:18px;'>SQL database connected successful</p>", table_names]
-    except Exception as e:
-        print("SQL DB CONNECTION ERROR")
-        print(e)
-        return ["error",f"<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: {e}</p>"]

data_sources/upload_file.py CHANGED Viewed

@@ -65,102 +65,28 @@ def process_data_upload(data_file, session_hash):
         for column in df.columns:
             if type(column) is str:
-                if "date" in column.lower() or "time" in column.lower():
                     try:
-                        df[column] = pd.to_datetime(df[column])
-                    except:
-                        pass
-                if 'year' in column.lower():
-                    try:
-                        df[column] = pd.to_datetime(df[column], format='%Y')
                     except:
                         pass
             if df[column].dtype == 'object' and isinstance(df[column].iloc[0], list):
                 df[column] = df[column].explode()
-        session_path = 'file_upload'
-        dir_path = TEMP_DIR / str(session_hash) / str(session_path)
         os.makedirs(dir_path, exist_ok=True)
         connection = sqlite3.connect(f'{dir_path}/data_source.db')
-        print("Opened database successfully")
         df.to_sql('data_source', connection, if_exists='replace', index = False)
-        cur=connection.execute('select * from data_source')
-        columns = [i[0] for i in cur.description]
-        print(columns)
         connection.commit()
         connection.close()
-        missing_per_col = {col: int(df[col].isnull().sum()) for col in df.columns}
-        total_missing = sum(missing_per_col.values())
-        def _simplify_dtype(d):
-            s = str(d)
-            if 'int' in s: return 'Integer'
-            if 'float' in s: return 'Float'
-            if 'datetime' in s: return 'DateTime'
-            if 'bool' in s: return 'Boolean'
-            return 'Text'
-        dtypes = {col: _simplify_dtype(df[col].dtype) for col in df.columns}
-        preview = []
-        for _, row in df.head(5).iterrows():
-            row_vals = []
-            for v in row:
-                try:
-                    row_vals.append('' if pd.isna(v) else str(v)[:60])
-                except Exception:
-                    row_vals.append(str(v)[:60])
-            preview.append(row_vals)
-        duplicate_rows = int(df.duplicated().sum())
-        unique_counts = {col: int(df[col].nunique()) for col in df.columns}
-        col_stats = {}
-        for col in df.columns:
-            dtype_str = str(df[col].dtype)
-            try:
-                if 'int' in dtype_str or 'float' in dtype_str:
-                    col_stats[col] = {
-                        'type': 'numeric',
-                        'min': float(df[col].min()),
-                        'max': float(df[col].max()),
-                        'mean': float(df[col].mean()),
-                    }
-                elif 'datetime' in dtype_str:
-                    col_stats[col] = {
-                        'type': 'datetime',
-                        'min': str(df[col].min())[:10],
-                        'max': str(df[col].max())[:10],
-                    }
-            except Exception:
-                pass
-        try:
-            file_size_bytes = os.path.getsize(data_file)
-        except Exception:
-            file_size_bytes = 0
-        stats = {
-            'num_rows': len(df),
-            'num_cols': len(df.columns),
-            'total_missing': total_missing,
-            'missing_per_col': missing_per_col,
-            'dtypes': dtypes,
-            'preview_cols': list(df.columns),
-            'preview': preview,
-            'duplicate_rows': duplicate_rows,
-            'unique_counts': unique_counts,
-            'col_stats': col_stats,
-            'file_size_bytes': file_size_bytes,
-        }
-        return ["success","<p style='color:green;text-align:center;font-size:18px;'>Data upload successful</p>", columns, stats]
     except Exception as e:
         print("UPLOAD ERROR")
         print(e)

         for column in df.columns:
             if type(column) is str:
+                pattern = 'year|month|date|day|time'
+                if re.search(pattern, column.lower()):
                     try:
+                        df[column] = pd.to_datetime(df[column], infer_datetime_format=True)
                     except:
                         pass
             if df[column].dtype == 'object' and isinstance(df[column].iloc[0], list):
                 df[column] = df[column].explode()
+        dir_path = TEMP_DIR / str(session_hash)
         os.makedirs(dir_path, exist_ok=True)
         connection = sqlite3.connect(f'{dir_path}/data_source.db')
+        print("Opened database successfully");
+        print(df.columns)
         df.to_sql('data_source', connection, if_exists='replace', index = False)
         connection.commit()
         connection.close()
+        return ["success","<p style='color:green;text-align:center;font-size:18px;'>Data upload successful</p>"]
     except Exception as e:
         print("UPLOAD ERROR")
         print(e)

functions/__init__.py CHANGED Viewed

@@ -1,17 +1,9 @@
-from .query_functions import graphql_schema_query, graphql_csv_query, query_func
 from .chart_functions import table_generation_func, scatter_chart_generation_func, \
-    line_chart_generation_func, bar_chart_generation_func, pie_chart_generation_func, \
-    histogram_generation_func, box_chart_generation_func, correlation_heatmap_func, \
-    scatter_chart_fig, rolling_stats_func
-from .chat_functions import example_question_generator, chatbot_func
-from .stat_functions import regression_func, descriptive_stats_func, \
-    kmeans_clustering_func, hypothesis_test_func
-__all__ = [
-    "query_func", "graphql_schema_query", "graphql_csv_query",
-    "table_generation_func", "scatter_chart_generation_func", "line_chart_generation_func",
-    "bar_chart_generation_func", "pie_chart_generation_func", "histogram_generation_func",
-    "box_chart_generation_func", "correlation_heatmap_func", "rolling_stats_func",
-    "regression_func", "descriptive_stats_func", "kmeans_clustering_func", "hypothesis_test_func",
-    "scatter_chart_fig", "example_question_generator", "chatbot_func",
-]

+from .sqlite_functions import SQLiteQuery, sqlite_query_func
 from .chart_functions import table_generation_func, scatter_chart_generation_func, \
+line_chart_generation_func, bar_chart_generation_func, pie_chart_generation_func, histogram_generation_func, scatter_chart_fig
+from .chat_functions import example_question_generator, chatbot_with_fc
+from .stat_functions import regression_func
+__all__ = ["SQLiteQuery","sqlite_query_func","table_generation_func","scatter_chart_generation_func",
+           "line_chart_generation_func","bar_chart_generation_func","regression_func", "pie_chart_generation_func", "histogram_generation_func",
+           "scatter_chart_fig","example_question_generator","chatbot_with_fc"]

functions/chart_functions.py CHANGED Viewed

@@ -1,28 +1,17 @@
 from typing import List
 import plotly.io as pio
 import plotly.express as px
 import pandas as pd
 from utils import TEMP_DIR
 import os
 import ast
 from dotenv import load_dotenv
 load_dotenv()
-root_url = os.getenv("ROOT_URL", "")
-def _write_chart(fig, chart_path, chart_url):
-    """Write a Plotly figure to disk and return text embedding it in an iframe.
-
-    The figure is written to ``chart_path`` as an HTML fragment
-    (``full_html=False``) with Plotly's ``responsive`` config enabled.
-
-    Args:
-        fig: Figure passed straight to ``pio.write_html``.
-        chart_path: Destination path of the generated HTML.
-        chart_url: URL placed in the iframe's ``src`` attribute; it is
-            interpolated as-is, without escaping.
-
-    Returns:
-        A string that starts with the instruction
-        'Please display this iframe: ' followed by a ``<div>``-wrapped
-        ``<iframe>`` pointing at ``chart_url``.
-    """
-    pio.write_html(fig, chart_path, full_html=False, config={"responsive": True})
-    return (
-        'Please display this iframe: '
-        '<div style="width:100%;overflow-x:auto;">'
-        '<iframe style="width:100%;min-width:400px;" height="500" '
-        f'src="{chart_url}" frameborder="0" allowfullscreen>'
-        '</iframe></div>'
-    )
 def llm_chart_data_scrub(data, layout):
    #Processing data to account for variation from LLM
@@ -103,11 +92,11 @@ def scatter_chart_fig(df, x_column: List[str], y_column: str, category: str="",
    return fig
-def scatter_chart_generation_func(x_column: List[str], y_column: str, session_hash, session_folder, data: List[dict]=[{}], layout: List[dict]=[{}],
                                   category: str="", trendline: str="", trendline_options: List[dict]=[{}], marginal_x: str="", marginal_y: str="",
-                                  size: str="", **kwargs):
    try:
-      dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
@@ -138,8 +127,13 @@ def scatter_chart_generation_func(x_column: List[str], y_column: str, session_ha
             for data_item in fig["data"]:
                data_item[key] = value
-      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("SCATTER PLOT ERROR")
@@ -150,10 +144,10 @@ def scatter_chart_generation_func(x_column: List[str], y_column: str, session_ha
             """
       return {"reply": reply}
-def line_chart_generation_func(x_column: str, y_column: str, session_hash, session_folder, data: List[dict]=[{}], layout: List[dict]=[{}],
-                                  category: str="", **kwargs):
    try:
-      dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
@@ -182,10 +176,15 @@ def line_chart_generation_func(x_column: str, y_column: str, session_hash, sessi
             for data_item in fig["data"]:
                data_item[key] = value
-      print(fig)
-      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("LINE CHART ERROR")
@@ -196,10 +195,10 @@ def line_chart_generation_func(x_column: str, y_column: str, session_hash, sessi
             """
       return {"reply": reply}
-def bar_chart_generation_func(x_column: str, y_column: str, session_hash, session_folder, data: List[dict]=[{}], layout: List[dict]=[{}],
-                                  category: str="", facet_row: str="", facet_col: str="", **kwargs):
    try:
-      dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
@@ -232,10 +231,15 @@ def bar_chart_generation_func(x_column: str, y_column: str, session_hash, sessio
             for data_item in fig["data"]:
                data_item[key] = value
-      print(fig)
-      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("BAR CHART ERROR")
@@ -246,9 +250,9 @@ def bar_chart_generation_func(x_column: str, y_column: str, session_hash, sessio
             """
       return {"reply": reply}
-def pie_chart_generation_func(values: str, names: str, session_hash, session_folder, data: List[dict]=[{}], layout: List[dict]=[{}], **kwargs):
    try:
-      dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
@@ -274,10 +278,15 @@ def pie_chart_generation_func(values: str, names: str, session_hash, session_fol
             for data_item in fig["data"]:
                data_item[key] = value
-      print(fig)
-      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("PIE CHART ERROR")
@@ -288,15 +297,16 @@ def pie_chart_generation_func(values: str, names: str, session_hash, session_fol
             """
       return {"reply": reply}
-def histogram_generation_func(x_column: str, session_hash, session_folder, y_column: str="", data: List[dict]=[{}], layout: List[dict]=[{}], histnorm: str="", category: str="",
-                              histfunc: str="", **kwargs):
    try:
-      dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
       df = pd.read_csv(csv_query_path)
       print(x_column)
       function_args = {"data_frame":df, "x":x_column}
@@ -328,10 +338,15 @@ def histogram_generation_func(x_column: str, session_hash, session_folder, y_col
             for data_item in fig["data"]:
                data_item[key] = value
-      print(fig)
-      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-      return {"reply": _write_chart(fig, chart_path, chart_url)}
    except Exception as e:
       print("HISTOGRAM ERROR")
@@ -342,185 +357,33 @@ def histogram_generation_func(x_column: str, session_hash, session_folder, y_col
             """
       return {"reply": reply}
-def box_chart_generation_func(y_column: str, session_hash, session_folder,
-                              x_column: str="", category: str="",
-                              layout: List[dict]=[{}], **kwargs):
-    """Build a box plot from the session's ``query.csv`` and return an iframe reply.
-
-    Args:
-        y_column: Column plotted on the y axis.
-        session_hash: Session identifier; first path component under ``TEMP_DIR``.
-        session_folder: Sub-folder of the session directory holding ``query.csv``.
-        x_column: Optional column for the x axis; ignored when empty.
-        category: Optional column used for colour grouping; ignored when empty.
-        layout: Layout overrides, passed through ``llm_chart_data_scrub``.
-        **kwargs: Accepted and ignored.
-
-    Returns:
-        ``{"reply": ...}`` holding either the iframe text from ``_write_chart``
-        or an error message; exceptions are caught, printed and never raised.
-    """
-    try:
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        chart_path = f'{dir_path}/chart.html'
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        function_args = {"data_frame": df, "y": y_column}
-        # Optional arguments are only forwarded to px.box when non-empty.
-        if x_column:
-            function_args["x"] = x_column
-        if category:
-            function_args["color"] = category
-        initial_graph = px.box(**function_args)
-        fig = initial_graph.to_dict()
-        _, layout_dict = llm_chart_data_scrub({}, layout)
-        # A non-empty scrubbed layout replaces the generated layout wholesale.
-        if layout_dict:
-            fig["layout"] = layout_dict
-        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        return {"reply": _write_chart(fig, chart_path, chart_url)}
-    except Exception as e:
-        print("BOX CHART ERROR")
-        print(e)
-        return {"reply": f"There was an error generating the box plot. Error: {e}. You should probably try again."}
-def correlation_heatmap_func(session_hash, session_folder, columns: List[str]=[], **kwargs):
-    """Render a correlation-matrix heatmap of the numeric columns in ``query.csv``.
-
-    Args:
-        session_hash: Session identifier; first path component under ``TEMP_DIR``.
-        session_folder: Sub-folder of the session directory holding ``query.csv``.
-        columns: Optional subset of columns to consider; when empty, every
-            column of the CSV is considered.
-        **kwargs: Accepted and ignored.
-
-    Returns:
-        ``{"reply": ...}`` holding the iframe text from ``_write_chart``, a
-        message asking for more numeric columns, or an error message.
-        Exceptions are caught, printed and never raised.
-    """
-    try:
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        chart_path = f'{dir_path}/chart.html'
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        # Non-numeric columns are dropped, including any listed in `columns`.
-        numeric_df = df[columns].select_dtypes(include='number') if columns else df.select_dtypes(include='number')
-        if numeric_df.shape[1] < 2:
-            return {"reply": "At least two numeric columns are needed for a correlation matrix. Please refine your query to include more numeric columns."}
-        corr = numeric_df.corr().round(3)
-        fig = px.imshow(
-            corr,
-            text_auto='.2f',
-            color_continuous_scale='RdBu_r',
-            zmin=-1,
-            zmax=1,
-            title='Correlation Matrix',
-            aspect='auto',
-        )
-        fig.update_layout(font=dict(family='Inter, system-ui, sans-serif'))
-        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        return {"reply": _write_chart(fig, chart_path, chart_url)}
-    except Exception as e:
-        print("CORRELATION HEATMAP ERROR")
-        print(e)
-        return {"reply": f"There was an error generating the correlation heatmap. Error: {e}. You should probably try again."}
-def rolling_stats_func(x_column: str, y_column: str, session_hash, session_folder,
-                       window: int = 7, stats: List[str] = ["mean"],
-                       layout: List[dict] = [{}], category: str = "", **kwargs):
-    """Plot raw values plus rolling statistics of ``y_column`` ordered by ``x_column``.
-
-    Args:
-        x_column: Column used for ordering and for the x axis.
-        y_column: Column whose raw and rolling values are plotted.
-        session_hash: Session identifier; first path component under ``TEMP_DIR``.
-        session_folder: Sub-folder of the session directory holding ``query.csv``.
-        window: Window size passed to ``Series.rolling``.
-        stats: Requested statistics; only "mean", "std", "min" and "max" are
-            kept, and ["mean"] is used when none of the requested ones is valid.
-        layout: Layout overrides, passed through ``llm_chart_data_scrub``.
-        category: Optional grouping column; used only when present in the CSV.
-        **kwargs: Accepted and ignored.
-
-    Returns:
-        ``{"reply": ...}`` holding either the iframe text from ``_write_chart``
-        or an error message; exceptions are caught, printed and never raised.
-    """
-    try:
-        import plotly.graph_objects as go
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        chart_path = f'{dir_path}/chart.html'
         csv_query_path = f'{dir_path}/query.csv'
         df = pd.read_csv(csv_query_path)
-        # Best effort: if x_column cannot be parsed as datetimes it is left unchanged.
-        try:
-            df[x_column] = pd.to_datetime(df[x_column])
-        except Exception:
-            pass
-        df = df.sort_values(x_column)
-        valid_stats = {"mean", "std", "min", "max"}
-        selected_stats = [s for s in stats if s in valid_stats] or ["mean"]
-        fig = go.Figure()
-        # A single [None] group stands for "no grouping": the whole frame is plotted.
-        groups = df[category].unique().tolist() if category and category in df.columns else [None]
-        for group in groups:
-            group_df = df[df[category] == group] if group is not None else df
-            prefix = f"{group} — " if group is not None else ""
-            # Faint raw series first, then one bolder trace per rolling statistic.
-            fig.add_trace(go.Scatter(
-                x=group_df[x_column].values, y=group_df[y_column].values,
-                mode="lines", name=f"{prefix}{y_column} (raw)",
-                opacity=0.35, line=dict(width=1)
-            ))
-            rolling_obj = group_df[y_column].rolling(window)
-            for stat in selected_stats:
-                # `stat` is restricted to valid_stats above, so getattr only reaches those methods.
-                rolled = getattr(rolling_obj, stat)()
-                fig.add_trace(go.Scatter(
-                    x=group_df[x_column].values, y=rolled.values,
-                    mode="lines", name=f"{prefix}Rolling {stat.capitalize()} (w={window})",
-                    line=dict(width=2.5)
-                ))
-        fig.update_layout(
-            title=f"Rolling Statistics (window={window}) — {y_column}",
-            xaxis_title=x_column,
-            yaxis_title=y_column,
-            font=dict(family="Inter, system-ui, sans-serif"),
-            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
-        )
-        _, layout_dict = llm_chart_data_scrub({}, layout)
-        # Scrubbed overrides are merged on top of the defaults set above.
-        if layout_dict:
-            fig.update_layout(**layout_dict)
-        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        return {"reply": _write_chart(fig, chart_path, chart_url)}
-    except Exception as e:
-        print("ROLLING STATS ERROR")
-        print(e)
-        return {"reply": f"There was an error generating the rolling statistics chart. Error: {e}. You should probably try again."}
-def table_generation_func(session_hash, session_folder, **kwargs):
-    """Return the session's ``query.csv`` as an inline, styled HTML table.
-
-    At most the first 200 rows are rendered; when the CSV has more, a note
-    with the total row count is appended. Header and cell values are
-    HTML-escaped before being placed in the markup.
-
-    Args:
-        session_hash: Session identifier; first path component under ``TEMP_DIR``.
-        session_folder: Sub-folder of the session directory holding ``query.csv``.
-        **kwargs: Accepted and ignored.
-
-    Returns:
-        ``{"reply": ...}`` holding the ``<style>`` block, table and optional
-        note concatenated, or an error message. Exceptions are caught,
-        printed and never raised.
-    """
-    print("TABLE GENERATION")
-    try:
-        from html import escape
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        total_rows = len(df)
-        max_rows = 200
-        # Truncate large results and tell the reader how many rows were left out.
-        if total_rows > max_rows:
-            df = df.head(max_rows)
-            note = (f'<p class="vda-table-note">Showing first {max_rows} of {total_rows} rows'
-                    ' — refine your query to see more specific results.</p>')
-        else:
-            note = ''
-        header_cells = ''.join(f'<th>{escape(str(col))}</th>' for col in df.columns)
-        row_html = [
-            '<tr>' + ''.join(f'<td>{escape(str(val))}</td>' for val in row) + '</tr>'
-            for _, row in df.iterrows()
-        ]
-        style = (
-            '<style>'
-            '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
-            '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
-            '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
-            '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
-            '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
-            '.vda-table tbody tr:last-child td{border-bottom:none;}'
-            '.vda-table-note{font-size:12px;color:#6b7280;margin:4px 0 0;text-align:right;}'
-            '</style>'
-        )
-        table = (
-            '<div class="vda-table-wrap"><table class="vda-table">'
-            f'<thead><tr>{header_cells}</tr></thead>'
-            '<tbody>' + '\n'.join(row_html) + '</tbody>'
-            '</table></div>'
-        )
-        return {"reply": style + table + note}
     except Exception as e:
-        print("TABLE ERROR")
-        print(e)
-        return {"reply": f"There was an error generating the table. Error: {e}. You should probably try again."}

 from typing import List
+from typing import Dict
 import plotly.io as pio
 import plotly.express as px
 import pandas as pd
 from utils import TEMP_DIR
 import os
 import ast
+import json
 from dotenv import load_dotenv
 load_dotenv()
+root_url = os.getenv("ROOT_URL")
 def llm_chart_data_scrub(data, layout):
    #Processing data to account for variation from LLM
    return fig
+def scatter_chart_generation_func(x_column: List[str], y_column: str, session_hash, data: List[dict]=[{}], layout: List[dict]=[{}],
                                   category: str="", trendline: str="", trendline_options: List[dict]=[{}], marginal_x: str="", marginal_y: str="",
+                                  size: str=""):
    try:
+      dir_path = TEMP_DIR / str(session_hash)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
             for data_item in fig["data"]:
                data_item[key] = value
+      pio.write_html(fig, chart_path, full_html=False)
+      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
+      iframe = '<div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("SCATTER PLOT ERROR")
             """
       return {"reply": reply}
+def line_chart_generation_func(x_column: str, y_column: str, session_hash, data: List[dict]=[{}], layout: List[dict]=[{}],
+                                  category: str=""):
    try:
+      dir_path = TEMP_DIR / str(session_hash)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
             for data_item in fig["data"]:
                data_item[key] = value
+      print(fig)
+      pio.write_html(fig, chart_path, full_html=False)
+      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
+      iframe = '<div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("LINE CHART ERROR")
             """
       return {"reply": reply}
+def bar_chart_generation_func(x_column: str, y_column: str, session_hash, data: List[dict]=[{}], layout: List[dict]=[{}],
+                                  category: str="", facet_row: str="", facet_col: str=""):
    try:
+      dir_path = TEMP_DIR / str(session_hash)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
             for data_item in fig["data"]:
                data_item[key] = value
+      print(fig)
+      pio.write_html(fig, chart_path, full_html=False)
+      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
+      iframe = '<div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("BAR CHART ERROR")
             """
       return {"reply": reply}
+def pie_chart_generation_func(values: str, names: str, session_hash, data: List[dict]=[{}], layout: List[dict]=[{}]):
    try:
+      dir_path = TEMP_DIR / str(session_hash)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
             for data_item in fig["data"]:
                data_item[key] = value
+      print(fig)
+      pio.write_html(fig, chart_path, full_html=False)
+      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
+      iframe = '<div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("PIE CHART ERROR")
             """
       return {"reply": reply}
+def histogram_generation_func(x_column: str, session_hash, y_column: str="", data: List[dict]=[{}], layout: List[dict]=[{}], histnorm: str="", category: str="",
+                              histfunc: str=""):
    try:
+      dir_path = TEMP_DIR / str(session_hash)
       chart_path = f'{dir_path}/chart.html'
       csv_query_path = f'{dir_path}/query.csv'
       df = pd.read_csv(csv_query_path)
+      print(df)
       print(x_column)
       function_args = {"data_frame":df, "x":x_column}
             for data_item in fig["data"]:
                data_item[key] = value
+      print(fig)
+      pio.write_html(fig, chart_path, full_html=False)
+      chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
+      iframe = '<div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+      return {"reply": iframe}
    except Exception as e:
       print("HISTOGRAM ERROR")
             """
       return {"reply": reply}
+def table_generation_func(session_hash):
+    """Render the session's latest query result as an HTML table shown in an iframe.
+
+    Reads ``query.csv`` from the session's directory under ``TEMP_DIR``,
+    writes ``DataFrame.to_html()`` output to ``table.html`` next to it, and
+    returns iframe markup pointing at that file.
+
+    Args:
+        session_hash: Session identifier; names the directory under
+            ``TEMP_DIR`` and is interpolated into the iframe URL.
+
+    Returns:
+        ``{"reply": ...}`` holding either the iframe markup or an error
+        message; exceptions are caught, printed and never raised.
+    """
+    print("TABLE GENERATION")
+    try:
+        dir_path = TEMP_DIR / str(session_hash)
         csv_query_path = f'{dir_path}/query.csv'
+        table_path = f'{dir_path}/table.html'
         df = pd.read_csv(csv_query_path)
+        html_table = df.to_html()
+        # Explicit UTF-8 so non-ASCII cell values do not depend on the
+        # platform's default encoding.
+        with open(table_path, "w", encoding="utf-8") as file:
+            file.write(html_table)
+        # ROOT_URL may be unset (os.getenv then returns None); fall back to a
+        # relative URL instead of emitting "None/gradio_api/...".
+        base_url = root_url or ""
+        table_url = f'{base_url}/gradio_api/file/temp/{session_hash}/table.html'
+        iframe = '<div style=overflow:auto;><iframe\n scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + table_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
+        return {"reply": iframe}
     except Exception as e:
+        print("TABLE ERROR")
+        print(e)
+        reply = f"""There was an error generating the Pandas DataFrame table results.
+              The error is {e},
+              You should probably try again.
+              """
+        return {"reply": reply}

functions/chat_functions.py CHANGED Viewed

@@ -1,184 +1,93 @@
-from utils import message_dict, api_key_store, model_store
 from haystack.dataclasses import ChatMessage
 from haystack.components.generators.chat import OpenAIChatGenerator
-from haystack.utils import Secret
-def _get_generator(session_hash):
-    """Build a chat generator from the API key and model stored for this session.
-
-    Args:
-        session_hash: Key into ``api_key_store`` and ``model_store``.
-
-    Returns:
-        An ``AnthropicChatGenerator`` when the stored key starts with
-        "sk-ant-", otherwise an ``OpenAIChatGenerator``.
-
-    Raises:
-        ValueError: If no API key is stored for the session.
-    """
-    api_key = api_key_store.get(session_hash)
-    if not api_key:
-        raise ValueError("No API key found for this session. Please enter your API key at the top of the page.")
-    # NOTE(review): the "gpt-4o" fallback also applies to Anthropic keys when
-    # no model is stored for the session — confirm that is intended.
-    model = model_store.get(session_hash, "gpt-4o")
-    if api_key.startswith("sk-ant-"):
-        # Imported lazily, only when an Anthropic key is in use.
-        from haystack_integrations.components.generators.chat import AnthropicChatGenerator
-        return AnthropicChatGenerator(model=model, api_key=Secret.from_token(api_key))
-    return OpenAIChatGenerator(model=model, api_key=Secret.from_token(api_key))
 response = None
-def example_question_message(data_source, name, titles, schema):
-    """Return the [system prompt, user prompt] pair for generating example questions.
-
-    Args:
-        data_source: One of 'file_upload', 'sql', 'doc_db' or 'graphql'.
-        name: Database or endpoint name interpolated into the system prompt
-            (not used by the 'file_upload' prompts).
-        titles: Column, table, collection or type names interpolated into the
-            user prompt.
-        schema: Collection schema; only interpolated into the 'doc_db' prompt.
-
-    Returns:
-        A two-element list: the system message text and the user message text.
-
-    Raises:
-        KeyError: If ``data_source`` is not one of the four supported keys.
-    """
-    # All four prompt pairs are formatted on every call; only one is returned.
-    example_message_dict = {
-        'file_upload' : ["You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source'.",
-                         f"""We have a SQLite database with the following {titles}.
-                        We also have an AI agent with access to the same database that will be performing data analysis.
-                        Please return an array of seven strings, each one being a question for our data analysis agent
-                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
-                        data insights. Return nothing more than the array of questions because I need that specific data structure
-                        to process your response. No other response type or data structure will work."""],
-        'sql' : [f"You are a helpful and knowledgeable agent who has access to a PostgreSQL database called {name}.",
-                 f"""We have a PostgreSQL database with the following tables: {titles}.
-                        We also have an AI agent with access to the same database that will be performing data analysis.
-                        Please return an array of seven strings, each one being a question for our data analysis agent
-                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
-                        data insights. Return nothing more than the array of questions because I need that specific data structure
-                        to process your response. No other response type or data structure will work."""],
-        'doc_db' : [f"You are a helpful and knowledgeable agent who has access to an MongoDB NoSQL document database called {name}.",
-                    f"""We have a MongoDB NoSQL document database with the following collections: {titles}.
-                        The schema of these collections is: {schema}.
-                        We also have an AI agent with access to the same database that will be performing data analysis.
-                        Please return an array of seven strings, each one being a question for our data analysis agent
-                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
-                        data insights. Return nothing more than the array of questions because I need that specific data structure
-                        to process your response. No other response type or data structure will work."""],
-        'graphql' : [f"You are a helpful and knowledgeable agent who has access to an GraphQL API endpoint called {name}.",
-                     f"""We have a GraphQL API endpoint with the following types: {titles}.
-                        We also have an AI agent with access to the same GraphQL API endpoint that will be performing data analysis.
-                        Please return an array of seven strings, each one being a question for our data analysis agent
-                        that we can suggest that you believe will be insightful or helpful to a data analyst looking for
-                        data insights. Return nothing more than the array of questions because I need that specific data structure
-                        to process your response. No other response type or data structure will work."""]
-    }
-    return example_message_dict[data_source]
-def example_question_generator(session_hash, data_source, name, titles, schema):
-    """Ask the session's chat model for suggested questions about the data source.
-
-    Args:
-        session_hash: Session identifier used to obtain the chat generator.
-        data_source, name, titles, schema: Forwarded to
-            ``example_question_message`` to build the prompts.
-
-    Returns:
-        A string: the text between the first "[" and the first "]" of the
-        model's reply, re-wrapped in square brackets. It is not parsed into
-        a list here.
-
-    Raises:
-        ValueError: From ``str.index`` when the reply contains no "[" or "]";
-            errors from ``_get_generator`` also propagate.
-    """
     example_response = None
-    example_message_list = example_question_message(data_source, name, titles, schema)
     example_messages = [
         ChatMessage.from_system(
-            example_message_list[0]
         )
     ]
-    example_messages.append(ChatMessage.from_user(text=example_message_list[1]))
-    example_response = _get_generator(session_hash).run(messages=example_messages)
-    response_text = example_response["replies"][0].text
-    # Keep only the bracketed array, discarding any text the model put around it.
-    start = response_text.index("[") + 1
-    end = response_text.index("]")
-    response_content = response_text[start:end]
-    response_list = '[' + response_content + ']'
-    print(response_list)
-    return response_list
-def system_message(data_source, titles, schema=""):
-    """Return the system prompt describing the active data source and the tools.
-
-    Args:
-        data_source: One of 'file_upload', 'sql', 'doc_db' or 'graphql'.
-        titles: Column, table, collection or type names interpolated into the
-            prompt.
-        schema: Collection schema; only interpolated into the 'doc_db' prompt.
-
-    Returns:
-        The prompt text for ``data_source`` with the shared tool description
-        appended.
-
-    Raises:
-        KeyError: If ``data_source`` is not one of the four supported keys.
-    """
-    print("TITLES")
-    print(titles)
-    # Shared suffix appended to every data-source-specific prompt below.
-    tools_desc = (
-        " You have access to tools for querying the data source, generating charts and visualisations,"
-        " and performing statistical analyses — use them proactively whenever they would help answer the user's question."
-        " Always display any charts, tables, and visualisations inline in your responses by outputting the returned HTML verbatim."
-    )
-    system_message_dict = {
-        'file_upload': (
-            f"You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source' that contains the following columns: {titles}."
-            + tools_desc
-        ),
-        'sql': (
-            f"You are a helpful and knowledgeable agent who has access to a PostgreSQL database which has a series of tables called {titles}."
-            + tools_desc
-        ),
-        'doc_db': (
-            f"You are a helpful and knowledgeable agent who has access to a NoSQL MongoDB Document database which has a series of collections called {titles}. "
-            f"The schema of these collections is: {schema}."
-            + tools_desc
-        ),
-        'graphql': (
-            f"You are a helpful and knowledgeable agent who has access to a GraphQL API which has the following types: {titles}. "
-            "We have also saved a schema.json file that contains the entire introspection query that we can use to find out more about each type before making a query."
-            + tools_desc
-        ),
-    }
-    return system_message_dict[data_source]
-def chatbot_func(message, history, session_hash, data_source, titles, schema, *args):
-    """Answer a user message, running model-requested tool calls until a text reply.
-
-    The conversation for each (session, data source) pair is kept in
-    ``message_dict[session_hash][data_source]`` and extended in place.
-
-    Args:
-        message: The user's message text.
-        history: Not referenced in this function body.
-        session_hash: Session identifier for the generator and message history.
-        data_source: Active data source key; also passed to tools as
-            ``session_folder``.
-        titles: Names describing the data source, used in the system prompt
-            and when building the tool list.
-        schema: Schema text used in the system prompt.
-        *args: Forwarded to every tool function as the ``args`` keyword.
-
-    Returns:
-        The text of the model's final reply, or the error text when no API
-        key is available for the session.
-    """
-    try:
-        chat_generator = _get_generator(session_hash)
-    except ValueError as e:
-        return str(e)
-    from functions import (
-        table_generation_func, regression_func, descriptive_stats_func,
-        scatter_chart_generation_func, line_chart_generation_func, bar_chart_generation_func,
-        pie_chart_generation_func, histogram_generation_func,
-        box_chart_generation_func, correlation_heatmap_func, rolling_stats_func,
-        query_func, graphql_schema_query, graphql_csv_query,
-        kmeans_clustering_func, hypothesis_test_func,
-    )
-    import tools.tools as tools
-    # Maps the tool name reported by the model to the function implementing it.
-    available_functions = {
-        "query_func": query_func,
-        "graphql_schema_query": graphql_schema_query,
-        "graphql_csv_query": graphql_csv_query,
-        "table_generation_func": table_generation_func,
-        "scatter_chart_generation_func": scatter_chart_generation_func,
-        "line_chart_generation_func": line_chart_generation_func,
-        "bar_chart_generation_func": bar_chart_generation_func,
-        "pie_chart_generation_func": pie_chart_generation_func,
-        "histogram_generation_func": histogram_generation_func,
-        "box_chart_generation_func": box_chart_generation_func,
-        "correlation_heatmap_func": correlation_heatmap_func,
-        "rolling_stats_func": rolling_stats_func,
-        "regression_func": regression_func,
-        "descriptive_stats_func": descriptive_stats_func,
-        "kmeans_clustering_func": kmeans_clustering_func,
-        "hypothesis_test_func": hypothesis_test_func,
-    }
-    # Continue the existing conversation, or start one with the system prompt.
-    if message_dict[session_hash][data_source] != None:
-        message_dict[session_hash][data_source].append(ChatMessage.from_user(message))
     else:
         messages = [
-            ChatMessage.from_system(system_message(data_source, titles, schema))
         ]
         messages.append(ChatMessage.from_user(message))
-        message_dict[session_hash][data_source] = messages
-    active_tools = tools.tools_call(session_hash, data_source, titles)
-    response = chat_generator.run(messages=message_dict[session_hash][data_source], tools=active_tools)
     while True:
-        # if the response is a tool call
-        # NOTE(review): this parses as `(response and finish_reason == "tool_calls") or tool_calls`,
-        # so the right-hand operand is still evaluated when `response` is falsy.
         if response and response["replies"][0].meta["finish_reason"] == "tool_calls" or response["replies"][0].tool_calls:
             function_calls = response["replies"][0].tool_calls
             for function_call in function_calls:
-                message_dict[session_hash][data_source].append(ChatMessage.from_assistant(tool_calls=[function_call]))
                 ## Parse function calling information
                 function_name = function_call.tool_name
                 function_args = function_call.arguments
                 ## Find the corresponding function and call it with the given arguments
                 function_to_call = available_functions[function_name]
-                function_response = function_to_call(**function_args, session_hash=session_hash, session_folder=data_source, args=args)
                 print(function_name)
                 ## Append function response to the messages list using `ChatMessage.from_tool`
-                message_dict[session_hash][data_source].append(ChatMessage.from_tool(tool_result=function_response['reply'], origin=function_call))
-                # The model is re-run after each individual tool result, inside the loop.
-                response = chat_generator.run(messages=message_dict[session_hash][data_source], tools=active_tools)
         # Regular Conversation
         else:
-            message_dict[session_hash][data_source].append(response["replies"][0])
             break
-    return response["replies"][0].text

+from utils import TEMP_DIR, message_dict
 from haystack.dataclasses import ChatMessage
 from haystack.components.generators.chat import OpenAIChatGenerator
+chat_generator = OpenAIChatGenerator(model="gpt-4o")
 response = None
+def example_question_generator(session_hash):
+    """Ask the chat model for suggested analysis questions about the session's data.
+
+    Reads the column names of the ``data_source`` table from the session's
+    ``data_source.db`` SQLite file, embeds them in a prompt, and returns the
+    text of the model's first reply. The prompt requests an array of seven
+    question strings; the reply is returned as-is, without being parsed here.
+
+    Args:
+        session_hash: Identifier whose string form names the session directory
+            under ``TEMP_DIR``.
+
+    Returns:
+        The ``text`` of the first reply produced by ``chat_generator``.
+    """
+    import sqlite3
+    from contextlib import closing
+
     example_messages = [
         ChatMessage.from_system(
+            "You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source'."
         )
     ]
+    dir_path = TEMP_DIR / str(session_hash)
+    # closing() releases the connection and cursor even when execute() raises
+    # (for example when the table has not been created yet).
+    with closing(sqlite3.connect(f'{dir_path}/data_source.db')) as connection:
+        print("Querying questions")
+        with closing(connection.execute('select * from data_source')) as cur:
+            # cursor.description holds one tuple per result column; item 0 is its name.
+            columns = [i[0] for i in cur.description]
+    print("QUESTION COLUMNS")
+    print(columns)
+    example_messages.append(ChatMessage.from_user(text=f"""We have a SQLite database with the following {columns}.
+                                                  We also have an AI agent with access to the same database that will be performing data analysis.
+                                                  Please return an array of seven strings, each one being a question for our data analysis agent
+                                                  that we can suggest that you believe will be insightful or helpful to a data analysis looking for
+                                                  data insights. Return nothing more than the array of questions because I need that specific data structure
+                                                  to process your response. No other response type or data structure will work."""))
+    example_response = chat_generator.run(messages=example_messages)
+    return example_response["replies"][0].text
+def chatbot_with_fc(message, history, session_hash):
+    """Answer a chat message, letting the model call the data-analysis tools.
+
+    The session's conversation lives in ``message_dict[session_hash]``: it is
+    created with a system prompt on the first message and appended to afterwards.
+    While the model keeps requesting tool calls, each requested function is run
+    and its ``'reply'`` value is fed back to the model as a tool message.
+
+    Args:
+        message: The user's message text.
+        history: Not used in this function; conversation state comes from
+            ``message_dict``.
+        session_hash: Key of the session's message list; also passed to every tool.
+
+    Returns:
+        The ``text`` of the model's final (non-tool-call) reply.
+    """
+    from functions import sqlite_query_func, table_generation_func, regression_func, scatter_chart_generation_func, \
+        line_chart_generation_func,bar_chart_generation_func,pie_chart_generation_func,histogram_generation_func
+    import tools
+    available_functions = {"sql_query_func": sqlite_query_func,"table_generation_func":table_generation_func,
+                           "line_chart_generation_func":line_chart_generation_func,"bar_chart_generation_func":bar_chart_generation_func,
+                           "scatter_chart_generation_func":scatter_chart_generation_func, "pie_chart_generation_func":pie_chart_generation_func,
+                           "histogram_generation_func":histogram_generation_func,
+                           "regression_func":regression_func }
+    if message_dict[session_hash] is not None:
+        message_dict[session_hash].append(ChatMessage.from_user(message))
     else:
         messages = [
+            ChatMessage.from_system(
+                """You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source'.
+                You also have access to a function, called table_generation_func, that can take a query.csv file generated from our sql query and returns an iframe that we can display in our chat window.
+                You also have access to a scatter plot function, called scatter_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a scatter plot and returns an iframe that we can display in our chat window.
+                You also have access to a line chart function, called line_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a line chart and returns an iframe that we can display in our chat window.
+                You also have access to a bar graph function, called bar_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a bar graph and returns an iframe that we can display in our chat window.
+                You also have access to a pie chart function, called pie_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a pie chart and returns an iframe that we can display in our chat window.
+                You also have access to a histogram function, called histogram_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a histogram and returns an iframe that we can display in our chat window.
+                You also have access to a linear regression function, called regression_func, that can take a query.csv file generated from our sql query and a list of column names for our independent and dependent variables and return a regression data string and a regression chart which is returned as an iframe."""
+            )
         ]
         messages.append(ChatMessage.from_user(message))
+        message_dict[session_hash] = messages
+    response = chat_generator.run(messages=message_dict[session_hash], generation_kwargs={"tools": tools.tools_call(session_hash)})
     while True:
+        reply = response["replies"][0]
+        function_calls = reply.tool_calls
+        # Regular conversation: no tool calls were requested, so this is the final answer.
+        # Testing the call list itself (rather than finish_reason) means an empty
+        # list can never send the loop around again with nothing to execute.
+        if not function_calls:
+            message_dict[session_hash].append(reply)
+            break
+        for function_call in function_calls:
+            message_dict[session_hash].append(ChatMessage.from_assistant(tool_calls=[function_call]))
+            ## Parse function calling information
+            function_name = function_call.tool_name
+            function_args = function_call.arguments
+            print(function_name)
+            ## Find the corresponding function and call it with the given arguments
+            function_to_call = available_functions.get(function_name)
+            if function_to_call is None:
+                # The model asked for a tool we do not have; tell it instead of crashing the chat.
+                tool_result = f"There is no function called {function_name}. Available functions are: {list(available_functions)}."
+            else:
+                function_response = function_to_call(**function_args, session_hash=session_hash)
+                tool_result = function_response['reply']
+            ## Append function response to the messages list using `ChatMessage.from_tool`
+            message_dict[session_hash].append(ChatMessage.from_tool(tool_result=tool_result, origin=function_call))
+        # Query the model once per round, after every requested call has a result;
+        # re-querying inside the loop produced responses that were thrown away.
+        response = chat_generator.run(messages=message_dict[session_hash], generation_kwargs={"tools": tools.tools_call(session_hash)})
+    return response["replies"][0].text

functions/query_functions.py DELETED Viewed

@@ -1,229 +0,0 @@
-from typing import List
-from typing import AnyStr
-from haystack import component
-import pandas as pd
-from pandasql import sqldf
-pd.set_option('display.max_rows', None)
-pd.set_option('display.max_columns', None)
-pd.set_option('display.width', None)
-pd.set_option('display.max_colwidth', None)
-import sqlite3
-import psycopg2
-from pymongo import MongoClient
-import pymongoarrow.monkey
-import json
-import pluck
-from utils import TEMP_DIR
-import ast
-@component
-class SQLiteQuery:
-    def __init__(self, sql_database: str):
-      self.connection = sqlite3.connect(sql_database, check_same_thread=False)
-    @component.output_types(results=List[str], queries=List[str])
-    def run(self, queries: AnyStr, session_hash):
-        print("ATTEMPTING TO RUN SQLITE QUERY")
-        dir_path = TEMP_DIR / str(session_hash)
-        results = []
-        result = pd.read_sql(queries, self.connection)
-        result.to_csv(f'{dir_path}/file_upload/query.csv', index=False)
-        column_names = list(result.columns)
-        results.append(f"{result}")
-        self.connection.close()
-        return {"results": results, "queries": queries, "csv_columns": column_names}
-@component
-class PostgreSQLQuery:
-    def __init__(self, url: str, sql_port: int, sql_user: str, sql_pass: str, sql_db_name: str):
-      self.connection = psycopg2.connect(
-            database=sql_db_name,
-            user=sql_user,
-            password=sql_pass,
-            host=url,  # e.g., "localhost" or an IP address
-            port=sql_port  # default is 5432
-        )
-    @component.output_types(results=List[str], queries=List[str])
-    def run(self, queries: AnyStr, session_hash):
-        print("ATTEMPTING TO RUN POSTGRESQL QUERY")
-        dir_path = TEMP_DIR / str(session_hash)
-        results = []
-        result = pd.read_sql_query(queries, self.connection)
-        result.to_csv(f'{dir_path}/sql/query.csv', index=False)
-        column_names = list(result.columns)
-        results.append(f"{result}")
-        self.connection.close()
-        return {"results": results, "queries": queries, "csv_columns": column_names}
-@component
-class DocDBQuery:
-    def __init__(self, connection_string: str, doc_db_name: str):
-      client = MongoClient(connection_string)
-      self.client = client
-      self.connection = client[doc_db_name]
-    @component.output_types(results=List[str], queries=List[str])
-    def run(self, aggregation_pipeline: List[str], db_collection,  session_hash):
-        pymongoarrow.monkey.patch_all()
-        print("ATTEMPTING TO RUN MONGODB QUERY")
-        dir_path = TEMP_DIR / str(session_hash)
-        results = []
-        print(aggregation_pipeline)
-        aggregation_pipeline = aggregation_pipeline.replace(" ", "")
-        false_replace = [':false', ': false']
-        false_value = ':False'
-        true_replace = [':true', ': true']
-        true_value = ':True'
-        for replace in false_replace:
-            aggregation_pipeline = aggregation_pipeline.replace(replace, false_value)
-        for replace in true_replace:
-            aggregation_pipeline = aggregation_pipeline.replace(replace, true_value)
-        query_list = ast.literal_eval(aggregation_pipeline)
-        print("QUERY List")
-        print(query_list)
-        print(db_collection)
-        db = self.connection
-        collection = db[db_collection]
-        print(collection)
-        docs = collection.aggregate_pandas_all(query_list)
-        print("DATA FRAME COMPLETE")
-        docs.to_csv(f'{dir_path}/doc_db/query.csv', index=False)
-        column_names = list(docs.columns)
-        print("CSV COMPLETE")
-        results.append(f"{docs}")
-        self.client.close()
-        return {"results": results, "queries": aggregation_pipeline, "csv_columns": column_names}
-@component
-class GraphQLQuery:
-    def __init__(self):
-      self.connection = pluck
-    @component.output_types(results=List[str], queries=List[str])
-    def run(self, graphql_query, graphql_api_string, graphql_api_token, graphql_token_header, session_hash):
-        print("ATTEMPTING TO RUN GRAPHQL QUERY")
-        dir_path = TEMP_DIR / str(session_hash)
-        results = []
-        headers = {"Content-Type": "application/json"}
-        if graphql_token_header and graphql_api_token:
-          headers[graphql_token_header] = graphql_api_token
-        print(graphql_query)
-        response = self.connection.execute(url=graphql_api_string, headers=headers, query=graphql_query, column_names="short")
-        if response.errors:
-           raise ValueError(response.errors)
-        elif response.data:
-          print("DATA FRAME COMPLETE")
-          print(response)
-          response_frame = response.frames['default']
-          print("RESPONSE FRAME")
-          #print(response_frame)
-          response_frame.to_csv(f'{dir_path}/graphql/query.csv', index=False)
-          column_names = list(response_frame.columns)
-          print("CSV COMPLETE")
-          results.append(f"{response_frame}")
-          return {"results": results, "queries": graphql_query, "csv_columns": column_names}
-def query_func(queries:AnyStr, session_hash, session_folder, args, **kwargs):
-    try:
-      print("QUERY")
-      print(queries)
-      if session_folder == "file_upload":
-        dir_path = TEMP_DIR / str(session_hash)
-        sql_query = SQLiteQuery(f'{dir_path}/file_upload/data_source.db')
-        result = sql_query.run(queries, session_hash)
-      elif session_folder == "sql":
-        sql_query = PostgreSQLQuery(args[0], args[1], args[2], args[3], args[4])
-        result = sql_query.run(queries, session_hash)
-      elif session_folder == 'doc_db':
-        doc_db_query = DocDBQuery(args[0], args[1])
-        result = doc_db_query.run(queries, kwargs['db_collection'], session_hash)
-      elif session_folder == 'graphql':
-        graphql_object = GraphQLQuery()
-        result = graphql_object.run(queries, args[0], args[1], args[2], session_hash)
-      print("RESULT")
-      print(result["csv_columns"])
-      if len(result["results"][0]) > 1000:
-        print("QUERY TOO LARGE")
-        return {"reply": f"""query result too large to be processed by llm, the query results are in our query.csv file.
-                The column names of this query.csv file are: {result["csv_columns"]}.
-                If you need to display the results directly, perhaps use the table_generation_func function."""}
-      else:
-        return {"reply": result["results"][0]}
-    except Exception as e:
-      reply = f"""There was an error running the {session_folder} Query = {queries}
-              The error is {e},
-              You should probably try again.
-              """
-      print(reply)
-      return {"reply": reply}
-def graphql_schema_query(graphql_type: AnyStr, session_hash, **kwargs):
-    dir_path = TEMP_DIR / str(session_hash)
-    try:
-      with open(f'{dir_path}/graphql/schema.json', 'r') as file:
-        data = json.load(file)
-      types_list = data["types"]
-      result = list(filter(lambda item: item["name"] == graphql_type, types_list))
-      print("SCHEMA RESULT")
-      print(graphql_type)
-      print(str(result))
-      return {"reply": str(result)}
-    except Exception as e:
-      reply = f"""There was an error querying our schema.json file with the type:{graphql_type}
-              The error is {e},
-              You should probably try again.
-              """
-      print(reply)
-      return {"reply": reply}
-def graphql_csv_query(csv_query: AnyStr, session_hash, **kwargs):
-    dir_path = TEMP_DIR / str(session_hash)
-    try:
-      query = pd.read_csv(f'{dir_path}/graphql/query.csv')
-      query.Name = 'query'
-      print("GRAPHQL CSV QUERY")
-      print(csv_query)
-      queried_df = sqldf(csv_query, locals())
-      print(queried_df)
-      column_names = list(queried_df.columns)
-      queried_df.to_csv(f'{dir_path}/graphql/query.csv', index=False)
-      if len(queried_df) > 1000:
-        print("CSV QUERY TOO LARGE")
-        return {"reply": f"""The new query results are in our query.csv file.
-                The column names of this query.csv file are: {column_names}.
-                If you need to display the results directly, perhaps use the table_generation_func function."""}
-      else:
-        return {"reply": str(queried_df)}
-    except Exception as e:
-      reply = f"""There was an error querying our query.csv file with the query:{csv_query}
-              The error is {e},
-              You should probably try again.
-              """
-      print(reply)
-      return {"reply": reply}

functions/sqlite_functions.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from typing import List
+from haystack import component
+import pandas as pd
+pd.set_option('display.max_rows', None)
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', None)
+pd.set_option('display.max_colwidth', None)
+import sqlite3
+from utils import TEMP_DIR
+@component
+class SQLiteQuery:
+    """Haystack component that runs SQL queries against one SQLite database file.
+
+    The connection is opened in ``__init__`` and closed at the end of ``run``,
+    so an instance serves a single ``run`` call.
+    """
+    def __init__(self, sql_database: str):
+        # check_same_thread=False lets the connection be used from a thread
+        # other than the one that created it.
+        self.connection = sqlite3.connect(sql_database, check_same_thread=False)
+    @component.output_types(results=List[str], queries=List[str])
+    def run(self, queries: List[str], session_hash):
+        """Run each query, save its result to ``query.csv`` and return the rendered frames.
+
+        Args:
+            queries: SQL statements to run in order. A single string is treated
+                as a one-element list.
+            session_hash: Identifier whose string form names the session
+                directory under ``TEMP_DIR`` where ``query.csv`` is written.
+
+        Returns:
+            A dict with ``"results"`` (one string rendering of the DataFrame per
+            query) and ``"queries"`` (the list of queries that were run).
+            ``query.csv`` is overwritten by every query, so it holds the result
+            of the last one.
+        """
+        print("ATTEMPTING TO RUN QUERY")
+        dir_path = TEMP_DIR / str(session_hash)
+        # A bare string would otherwise be iterated one character at a time.
+        if isinstance(queries, str):
+            queries = [queries]
+        results = []
+        try:
+            for query in queries:
+                result = pd.read_sql(query, self.connection)
+                result.to_csv(f'{dir_path}/query.csv', index=False)
+                results.append(f"{result}")
+        finally:
+            # Close on failure too, so a bad query does not leak the connection.
+            self.connection.close()
+        return {"results": results, "queries": queries}
+def sqlite_query_func(queries: List[str], session_hash):
+    """Run SQL against the session's ``data_source.db`` and build a reply for the LLM.
+
+    Args:
+        queries: SQL statements to run; only the first result is put in the reply.
+        session_hash: Identifier whose string form names the session directory
+            under ``TEMP_DIR``.
+
+    Returns:
+        A dict with a single ``"reply"`` key holding the rendered result of the
+        first query, a notice that the result is too large (rendered text longer
+        than 1000 characters), or an error description. Failures are reported in
+        the reply rather than raised.
+    """
+    dir_path = TEMP_DIR / str(session_hash)
+    try:
+        # Opening the database happens inside the try so that a connection
+        # failure is reported to the model like any other query error.
+        sql_query = SQLiteQuery(f'{dir_path}/data_source.db')
+        result = sql_query.run(queries, session_hash)
+        if not result["results"]:
+            return {"reply": "No SQL query was provided, so there are no results. You should probably try again with a query."}
+        if len(result["results"][0]) > 1000:
+            print("QUERY TOO LARGE")
+            return {"reply": "query result too large to be processed by llm, the query results are in our query.csv file. If you need to display the results directly, perhaps use the table_generation_func function."}
+        else:
+            return {"reply": result["results"][0]}
+    except Exception as e:
+        reply = f"""There was an error running the SQL Query = {queries}
+              The error is {e},
+              You should probably try again.
+              """
+        return {"reply": reply}

functions/stat_functions.py CHANGED Viewed

@@ -5,251 +5,19 @@ from utils import TEMP_DIR
 import plotly.express as px
 import plotly.io as pio
 import os
-from functions.chart_functions import scatter_chart_fig, llm_chart_data_scrub, _write_chart
 from dotenv import load_dotenv
 load_dotenv()
-root_url = os.getenv("ROOT_URL", "")
-def descriptive_stats_func(session_hash, session_folder, columns: List[str]=[], **kwargs):
-    print("DESCRIPTIVE STATISTICS")
-    try:
-        from html import escape
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        if columns:
-            df = df[[c for c in columns if c in df.columns]]
-        desc = df.describe().round(4)
-        header_cells = '<th style="background:#1e40af;">Statistic</th>' + ''.join(
-            f'<th>{escape(str(col))}</th>' for col in desc.columns
-        )
-        row_html = [
-            '<tr>'
-            + f'<td style="font-weight:600;color:#1e40af;background:#eff6ff;white-space:nowrap;">{escape(str(idx))}</td>'
-            + ''.join(f'<td>{escape(str(val))}</td>' for val in row)
-            + '</tr>'
-            for idx, row in desc.iterrows()
-        ]
-        style = (
-            '<style>'
-            '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
-            '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
-            '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
-            '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
-            '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
-            '.vda-table tbody tr:last-child td{border-bottom:none;}'
-            '</style>'
-        )
-        table = (
-            '<div class="vda-table-wrap"><table class="vda-table">'
-            f'<thead><tr>{header_cells}</tr></thead>'
-            '<tbody>' + '\n'.join(row_html) + '</tbody>'
-            '</table></div>'
-        )
-        return {"reply": style + table}
-    except Exception as e:
-        print("DESCRIPTIVE STATS ERROR")
-        print(e)
-        return {"reply": f"There was an error generating descriptive statistics. Error: {e}. You should probably try again."}
-def kmeans_clustering_func(feature_columns: List[str], x_column: str, y_column: str,
-                           session_hash, session_folder, n_clusters: int = 3,
-                           layout: List[dict] = [{}], **kwargs):
-    print("KMEANS CLUSTERING")
-    try:
-        from sklearn.cluster import KMeans
-        from sklearn.preprocessing import StandardScaler
-        from html import escape
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        chart_path = f'{dir_path}/chart.html'
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        feature_df = df[feature_columns].select_dtypes(include='number').dropna()
-        if feature_df.shape[1] < 1:
-            return {"reply": "No numeric feature columns found for clustering. Please refine your query to include numeric columns."}
-        X_scaled = StandardScaler().fit_transform(feature_df)
-        labels = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit_predict(X_scaled)
-        df_clustered = df.loc[feature_df.index].copy()
-        df_clustered['Cluster'] = [f'Cluster {l}' for l in labels]
-        fig = px.scatter(
-            df_clustered, x=x_column, y=y_column, color='Cluster',
-            title=f'K-Means Clustering (k={n_clusters})',
-        )
-        fig.update_layout(font=dict(family='Inter, system-ui, sans-serif'))
-        _, layout_dict = llm_chart_data_scrub({}, layout)
-        if layout_dict:
-            fig.update_layout(**layout_dict)
-        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        iframe = _write_chart(fig, chart_path, chart_url)
-        cluster_summary = df_clustered.groupby('Cluster')[feature_columns].mean().round(3)
-        header_cells = '<th style="background:#1e40af;">Cluster</th>' + ''.join(
-            f'<th>{escape(str(col))}</th>' for col in cluster_summary.columns
-        )
-        row_html = [
-            '<tr>'
-            + f'<td style="font-weight:600;color:#1e40af;background:#eff6ff;white-space:nowrap;">{escape(str(idx))}</td>'
-            + ''.join(f'<td>{escape(str(val))}</td>' for val in row)
-            + '</tr>'
-            for idx, row in cluster_summary.iterrows()
-        ]
-        style = (
-            '<style>'
-            '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
-            '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
-            '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
-            '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
-            '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
-            '.vda-table tbody tr:last-child td{border-bottom:none;}'
-            '</style>'
-        )
-        summary_table = (
-            '<div class="vda-table-wrap"><table class="vda-table">'
-            f'<thead><tr>{header_cells}</tr></thead>'
-            '<tbody>' + '\n'.join(row_html) + '</tbody>'
-            '</table></div>'
-        )
-        return {"reply": f'{iframe}\n\n**Cluster Centroids (feature means per cluster):**\n{style}{summary_table}'}
-    except Exception as e:
-        print("KMEANS CLUSTERING ERROR")
-        print(e)
-        return {"reply": f"There was an error running K-Means clustering. Error: {e}. You should probably try again."}
-def hypothesis_test_func(test_type: str, column: str, session_hash, session_folder,
-                         column2: str = "", group_column: str = "",
-                         group_values: List[str] = [], pop_mean: float = 0.0, **kwargs):
-    print("HYPOTHESIS TEST")
-    try:
-        from scipy import stats
-        from html import escape
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
-        csv_query_path = f'{dir_path}/query.csv'
-        df = pd.read_csv(csv_query_path)
-        if test_type == "t_test_independent":
-            if not group_column or group_column not in df.columns:
-                return {"reply": "Please specify a valid group_column for the independent t-test."}
-            unique_groups = df[group_column].dropna().unique().tolist()
-            if group_values and len(group_values) == 2:
-                g1_label, g2_label = group_values[0], group_values[1]
-            elif len(unique_groups) == 2:
-                g1_label, g2_label = unique_groups[0], unique_groups[1]
-            else:
-                return {"reply": f"For an independent t-test, exactly 2 groups are needed. Found: {unique_groups}. Specify group_values with 2 entries."}
-            g1 = df[df[group_column] == g1_label][column].dropna()
-            g2 = df[df[group_column] == g2_label][column].dropna()
-            t_stat, p_value = stats.ttest_ind(g1, g2)
-            result_rows = [
-                ("Test", "Independent Samples T-Test"),
-                ("Column", column),
-                ("Group Column", group_column),
-                (f"Group 1", str(g1_label)),
-                (f"Group 2", str(g2_label)),
-                (f"Group 1 Mean (n={len(g1)})", f"{g1.mean():.4f}"),
-                (f"Group 2 Mean (n={len(g2)})", f"{g2.mean():.4f}"),
-                ("T-Statistic", f"{t_stat:.4f}"),
-                ("P-Value", f"{p_value:.6f}"),
-                ("Significant at α=0.05", "Yes ✓" if p_value < 0.05 else "No ✗"),
-            ]
-            title = f"T-Test: {column} by {group_column}"
-        elif test_type == "t_test_one_sample":
-            sample = df[column].dropna()
-            t_stat, p_value = stats.ttest_1samp(sample, pop_mean)
-            result_rows = [
-                ("Test", "One-Sample T-Test"),
-                ("Column", column),
-                ("Hypothesized Mean (μ₀)", f"{pop_mean:.4f}"),
-                (f"Sample Mean (n={len(sample)})", f"{sample.mean():.4f}"),
-                ("Sample Std Dev", f"{sample.std():.4f}"),
-                ("T-Statistic", f"{t_stat:.4f}"),
-                ("P-Value", f"{p_value:.6f}"),
-                ("Significant at α=0.05", "Yes ✓" if p_value < 0.05 else "No ✗"),
-            ]
-            title = f"One-Sample T-Test: {column} vs μ={pop_mean}"
-        elif test_type == "chi_square":
-            if not column2 or column2 not in df.columns:
-                return {"reply": "Please specify a valid column2 for the chi-square test."}
-            contingency = pd.crosstab(df[column], df[column2])
-            chi2, p_value, dof, _ = stats.chi2_contingency(contingency)
-            result_rows = [
-                ("Test", "Chi-Square Test of Independence"),
-                ("Column 1", column),
-                ("Column 2", column2),
-                ("Chi-Square Statistic", f"{chi2:.4f}"),
-                ("Degrees of Freedom", str(dof)),
-                ("P-Value", f"{p_value:.6f}"),
-                ("Significant at α=0.05", "Yes ✓" if p_value < 0.05 else "No ✗"),
-            ]
-            title = f"Chi-Square: {column} × {column2}"
-        else:
-            return {"reply": f"Unknown test_type '{test_type}'. Use one of: t_test_independent, t_test_one_sample, chi_square."}
-        style = (
-            '<style>'
-            '.vda-table-wrap{overflow-x:auto;margin:8px 0;border-radius:8px;border:1px solid #e5e7eb;}'
-            '.vda-table{width:100%;border-collapse:collapse;font-size:13px;font-family:Inter,system-ui,sans-serif;}'
-            '.vda-table thead th{background:#3B82F6;color:#fff;padding:9px 14px;text-align:left;white-space:nowrap;font-weight:600;}'
-            '.vda-table tbody td{padding:7px 14px;border-bottom:1px solid #f1f5f9;white-space:nowrap;}'
-            '.vda-table tbody tr:nth-child(even){background:#f8fafc;}'
-            '.vda-table tbody tr:last-child td{border-bottom:none;}'
-            '</style>'
-        )
-        header_cells = f'<th style="background:#1e40af;" colspan="2">{escape(title)}</th>'
-        row_html = [
-            '<tr>'
-            + f'<td style="font-weight:600;color:#1e40af;background:#eff6ff;white-space:nowrap;">{escape(label)}</td>'
-            + f'<td>{escape(value)}</td>'
-            + '</tr>'
-            for label, value in result_rows
-        ]
-        table = (
-            '<div class="vda-table-wrap"><table class="vda-table">'
-            f'<thead><tr>{header_cells}</tr></thead>'
-            '<tbody>' + '\n'.join(row_html) + '</tbody>'
-            '</table></div>'
-        )
-        return {"reply": style + table}
-    except Exception as e:
-        print("HYPOTHESIS TEST ERROR")
-        print(e)
-        return {"reply": f"There was an error running the hypothesis test. Error: {e}. You should probably try again."}
-def regression_func(independent_variables: List[str], dependent_variable: str, session_hash, session_folder, category: str='', **kwargs):
     print("LINEAR REGRESSION CALCULATION")
     print(independent_variables)
     print(dependent_variable)
     try:
-        dir_path = TEMP_DIR / str(session_hash) / str(session_folder)
         chart_path = f'{dir_path}/chart.html'
         csv_query_path = f'{dir_path}/query.csv'
@@ -262,8 +30,11 @@ def regression_func(independent_variables: List[str], dependent_variable: str, s
            fig = scatter_chart_fig(df=df,x_column=independent_variables,y_column=dependent_variable,
                                     trendline="ols")
-        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/{session_folder}/chart.html'
-        iframe = _write_chart(fig, chart_path, chart_url)
         results_frame = px.get_trendline_results(fig)

 import plotly.express as px
 import plotly.io as pio
 import os
+from functions import scatter_chart_fig
 from dotenv import load_dotenv
 load_dotenv()
+root_url = os.getenv("ROOT_URL")
+def regression_func(independent_variables: List[str], dependent_variable: str, session_hash, category: str=''):
     print("LINEAR REGRESSION CALCULATION")
     print(independent_variables)
     print(dependent_variable)
     try:
+        dir_path = TEMP_DIR / str(session_hash)
         chart_path = f'{dir_path}/chart.html'
         csv_query_path = f'{dir_path}/query.csv'
            fig = scatter_chart_fig(df=df,x_column=independent_variables,y_column=dependent_variable,
                                     trendline="ols")
+        pio.write_html(fig, chart_path, full_html=False)
+        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
+        iframe = '<div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
         results_frame = px.get_trendline_results(fig)

index.html ADDED Viewed

	@@ -0,0 +1,245 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Virtual Data Analyst</title>
+    <!-- Tailwind CSS -->
+    <script src="https://cdn.tailwindcss.com"></script>
+    <!-- Google Fonts -->
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
+    <!-- Font Awesome -->
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
+    <!-- Custom Styles -->
+    <link rel="stylesheet" href="styles.css">
+    <script>
+        tailwind.config = {
+            theme: {
+                extend: {
+                    fontFamily: {
+                        'sans': ['Inter', 'sans-serif'],
+                    },
+                    colors: {
+                        primary: '#3B82F6',
+                        secondary: '#6B7280',
+                    },
+                }
+            }
+        }
+    </script>
+</head>
+<body class="bg-gray-50 font-sans">
+    <div class="min-h-screen p-6">
+        <!-- Header -->
+        <header class="max-w-4xl mx-auto mb-12 text-center">
+            <h1 class="text-4xl font-bold text-gray-900 mb-4">Virtual Data Analyst</h1>
+            <p class="text-lg text-gray-600 mb-6">
+                A powerful tool for data analysis, visualizations, and insights
+            </p>
+            <div class="bg-blue-50 border border-blue-200 rounded-lg p-4 max-w-2xl mx-auto">
+                <h2 class="font-semibold text-blue-800 mb-2">
+                    <i class="fas fa-info-circle mr-2"></i>Supported Files
+                </h2>
+                <div class="flex flex-wrap justify-center gap-3 text-blue-700">
+                    <span class="tooltip">
+                        <i class="fas fa-file-csv mr-1"></i>CSV
+                        <span class="tooltip-text">Comma-separated values</span>
+                    </span>
+                    <span class="tooltip">
+                        <i class="fas fa-file-alt mr-1"></i>TSV
+                        <span class="tooltip-text">Tab-separated values</span>
+                    </span>
+                    <span class="tooltip">
+                        <i class="fas fa-file-alt mr-1"></i>TXT
+                        <span class="tooltip-text">Text files</span>
+                    </span>
+                    <span class="tooltip">
+                        <i class="fas fa-file-excel mr-1"></i>XLS/XLSX
+                        <span class="tooltip-text">Excel spreadsheets</span>
+                    </span>
+                    <span class="tooltip">
+                        <i class="fas fa-file-code mr-1"></i>XML
+                        <span class="tooltip-text">XML documents</span>
+                    </span>
+                    <span class="tooltip">
+                        <i class="fas fa-file-code mr-1"></i>JSON
+                        <span class="tooltip-text">JSON data files</span>
+                    </span>
+                </div>
+            </div>
+        </header>
+        <!-- Main Content -->
+        <main class="max-w-4xl mx-auto">
+            <!-- File Upload Section -->
+            <div class="bg-white rounded-xl shadow-lg p-8 mb-8">
+                <div class="drop-zone border-2 border-dashed border-gray-300 rounded-lg p-12 text-center hover:border-primary cursor-pointer bg-gray-50 hover:bg-blue-50 transition-colors duration-300">
+                    <input type="file" id="fileInput" class="hidden" accept=".csv,.tsv,.txt,.xls,.xlsx,.xml,.json">
+                    <!-- Upload Icon & Success Checkmark -->
+                    <div class="relative inline-block">
+                        <i class="fas fa-cloud-upload-alt text-5xl text-gray-400 mb-4 upload-icon"></i>
+                        <i class="fas fa-check-circle text-5xl success-checkmark absolute top-0 left-0"></i>
+                    </div>
+                    <!-- Loading Spinner -->
+                    <div class="loading-spinner mx-auto mb-4"></div>
+                    <h3 class="text-xl font-semibold text-gray-700 mb-2">Drop your data file here</h3>
+                    <p class="text-gray-500 mb-4">or</p>
+                    <button onclick="document.getElementById('fileInput').click()" class="bg-primary text-white px-6 py-3 rounded-lg hover:bg-blue-600 transition-colors duration-300">
+                        <i class="fas fa-folder-open mr-2"></i>Browse Files
+                    </button>
+                    <!-- Progress Bar -->
+                    <div class="progress-bar mt-4">
+                        <div class="progress-bar-fill"></div>
+                    </div>
+                    <!-- File Info -->
+                    <div id="fileInfo" class="hidden mt-4 p-4 bg-gray-100 rounded-lg">
+                        <div class="flex items-center justify-center">
+                            <i class="file-type-icon fas"></i>
+                            <span class="file-name font-medium"></span>
+                        </div>
+                        <div class="text-sm text-gray-500 mt-2">
+                            <span class="file-size"></span>
+                        </div>
+                    </div>
+                    <p class="text-sm text-gray-500 mt-4">Maximum file size: 100MB</p>
+                </div>
+            </div>
+            <!-- Sample Data Section -->
+            <div class="bg-white rounded-xl shadow-lg p-8">
+                <h2 class="text-2xl font-semibold text-gray-800 mb-6">
+                    <i class="fas fa-flask mr-2"></i>Try Sample Datasets
+                </h2>
+                <div class="grid md:grid-cols-2 gap-4">
+                    <!-- Marketing Campaign Sample -->
+                    <button class="sample-btn bg-gradient-to-r from-purple-500 to-indigo-600 text-white p-6 rounded-lg text-left hover:shadow-lg">
+                        <div class="flex items-center mb-3">
+                            <i class="fas fa-bullhorn text-2xl mr-3"></i>
+                            <div>
+                                <h3 class="text-lg font-semibold">Marketing Campaign Data</h3>
+                                <p class="text-sm opacity-90">10,000 records</p>
+                            </div>
+                        </div>
+                        <p class="text-sm opacity-90">Analyze customer responses to marketing campaigns and identify key success factors</p>
+                        <div class="mt-4 text-xs opacity-75">
+                            <i class="fas fa-table mr-1"></i> CSV format
+                        </div>
+                    </button>
+                    <!-- Retail Data Sample -->
+                    <button class="sample-btn bg-gradient-to-r from-green-500 to-teal-600 text-white p-6 rounded-lg text-left hover:shadow-lg">
+                        <div class="flex items-center mb-3">
+                            <i class="fas fa-shopping-cart text-2xl mr-3"></i>
+                            <div>
+                                <h3 class="text-lg font-semibold">Online Retail Data</h3>
+                                <p class="text-sm opacity-90">50,000 records</p>
+                            </div>
+                        </div>
+                        <p class="text-sm opacity-90">Explore sales patterns, customer behavior, and product performance</p>
+                        <div class="mt-4 text-xs opacity-75">
+                            <i class="fas fa-file-excel mr-1"></i> XLSX format
+                        </div>
+                    </button>
+                </div>
+            </div>
+            <!-- Features Preview -->
+            <div class="mt-12 grid md:grid-cols-3 gap-6">
+                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
+                    <i class="feature-icon fas fa-chart-line text-primary text-2xl mb-4"></i>
+                    <h3 class="font-semibold text-gray-800 mb-2">Advanced Analytics</h3>
+                    <p class="text-gray-600 text-sm">Run SQL queries, perform regressions, and analyze results with ease</p>
+                </div>
+                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
+                    <i class="feature-icon fas fa-chart-pie text-primary text-2xl mb-4"></i>
+                    <h3 class="font-semibold text-gray-800 mb-2">Rich Visualizations</h3>
+                    <p class="text-gray-600 text-sm">Create scatter plots, line charts, pie charts, and more</p>
+                </div>
+                <div class="feature-card bg-white p-6 rounded-lg shadow-md">
+                    <i class="feature-icon fas fa-magic text-primary text-2xl mb-4"></i>
+                    <h3 class="font-semibold text-gray-800 mb-2">Automated Insights</h3>
+                    <p class="text-gray-600 text-sm">Get instant insights and recommendations for your data</p>
+                </div>
+            </div>
+        </main>
+        <!-- Footer -->
+        <footer class="max-w-4xl mx-auto mt-12 text-center text-gray-500 text-sm">
+            <p>This application is under active development. For bugs or feedback, please open a discussion in the community tab.</p>
+        </footer>
+    </div>
+    <!-- Results Section -->
+    <div id="results" class="max-w-4xl mx-auto mt-12 hidden">
+        <div class="bg-white rounded-xl shadow-lg p-8">
+            <div class="flex items-center justify-between mb-6">
+                <h2 class="text-2xl font-semibold text-gray-800">
+                    <i class="fas fa-chart-bar mr-2"></i>Analysis Results
+                </h2>
+                <button onclick="closeResults()" class="text-gray-500 hover:text-gray-700">
+                    <i class="fas fa-times"></i>
+                </button>
+            </div>
+            <!-- Loading State -->
+            <div id="resultsLoading" class="text-center py-12">
+                <div class="loading-spinner mx-auto mb-4"></div>
+                <p class="text-gray-600">Analyzing your data...</p>
+            </div>
+            <!-- Error State -->
+            <div id="resultsError" class="hidden">
+                <div class="bg-red-50 border border-red-200 rounded-lg p-4 text-red-700">
+                    <i class="fas fa-exclamation-circle mr-2"></i>
+                    <span id="errorMessage">An error occurred</span>
+                </div>
+            </div>
+            <!-- Results Content -->
+            <div id="resultsContent" class="hidden">
+                <!-- Basic Statistics -->
+                <div class="mb-8">
+                    <h3 class="text-lg font-semibold text-gray-700 mb-4">Basic Statistics</h3>
+                    <div id="basicStats" class="grid grid-cols-2 md:grid-cols-4 gap-4">
+                        <!-- Stats will be inserted here -->
+                    </div>
+                </div>
+                <!-- Data Preview -->
+                <div class="mb-8">
+                    <h3 class="text-lg font-semibold text-gray-700 mb-4">Data Preview</h3>
+                    <div class="overflow-x-auto">
+                        <table id="dataPreview" class="min-w-full divide-y divide-gray-200">
+                            <!-- Table content will be inserted here -->
+                        </table>
+                    </div>
+                </div>
+                <!-- Visualizations -->
+                <div class="mb-8">
+                    <h3 class="text-lg font-semibold text-gray-700 mb-4">Visualizations</h3>
+                    <div id="visualizations" class="grid grid-cols-1 md:grid-cols-2 gap-6">
+                        <!-- Visualization charts will be inserted here -->
+                    </div>
+                </div>
+                <!-- Insights -->
+                <div>
+                    <h3 class="text-lg font-semibold text-gray-700 mb-4">Key Insights</h3>
+                    <ul id="insights" class="space-y-3">
+                        <!-- Insights will be inserted here -->
+                    </ul>
+                </div>
+            </div>
+        </div>
+    </div>
+    <script src="script.js"></script>
+</body>
+</html>

requirements.txt CHANGED Viewed

@@ -1,5 +1,4 @@
-haystack-ai>=2.7.0
-anthropic-haystack
 python-dotenv
 gradio
 pandas
@@ -7,12 +6,3 @@ plotly
 openpyxl
 statsmodels
 xlrd
-psycopg2-binary
-pymongo
-pymongoarrow
-pymongo_schema
-pandasql
-pluck-graphql
-certifi==2025.1.31
-scipy
-scikit-learn

+haystack-ai
 python-dotenv
 gradio
 pandas
 openpyxl
 statsmodels
 xlrd

samples/online_retail_data.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

samples/tb_illness_data.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

script.js ADDED Viewed

	@@ -0,0 +1,440 @@

+// API Configuration
+// Base URL of the hosted Space; handleFileUpload() POSTs uploads directly to it.
+const API_URL = 'https://nolanzandi-virtual-data-analyst.hf.space';
+// NOTE(review): PREDICT_ENDPOINT and SAMPLE_ENDPOINT are derived from API_URL but are
+// not referenced by the code shown in this script — confirm whether uploads and
+// sample requests are meant to target them instead of the bare API_URL.
+const PREDICT_ENDPOINT = `${API_URL}/api/predict`;
+const SAMPLE_ENDPOINT = `${API_URL}/api/sample`;
+// File Upload and API Integration
+async function handleFileUpload(file) {
+    try {
+        // Show loading state
+        document.querySelector('.upload-icon').style.display = 'none';
+        document.querySelector('.loading-spinner').style.display = 'block';
+        document.querySelector('.progress-bar').style.display = 'block';
+        // Create FormData
+        const formData = new FormData();
+        formData.append('file', file);
+        // Update file info
+        updateFileInfo(file);
+        // Simulate progress while actually uploading
+        const progressInterval = simulateProgress();
+        // Make API request
+        const response = await fetch(API_URL, {
+            method: 'POST',
+            body: formData
+        });
+        if (!response.ok) {
+            throw new Error('API request failed');
+        }
+        const data = await response.json();
+        // Clear progress simulation
+        clearInterval(progressInterval);
+        // Show success state
+        showSuccessState();
+        // Handle API response
+        handleApiResponse(data);
+    } catch (error) {
+        console.error('Error:', error);
+        showErrorState(error.message);
+    }
+}
+function updateFileInfo(file) {
+    const fileInfo = document.getElementById('fileInfo');
+    const fileName = fileInfo.querySelector('.file-name');
+    const fileSize = fileInfo.querySelector('.file-size');
+    const fileIcon = fileInfo.querySelector('.file-type-icon');
+    const fileType = file.name.split('.').pop().toLowerCase();
+    const iconClass = getFileTypeIcon(fileType);
+    fileIcon.className = `file-type-icon fas ${iconClass}`;
+    fileName.textContent = file.name;
+    fileSize.textContent = formatFileSize(file.size);
+    fileInfo.classList.remove('hidden');
+}
+function simulateProgress() {
+    const progressBar = document.querySelector('.progress-bar-fill');
+    let progress = 0;
+    return setInterval(() => {
+        if (progress < 90) { // Only go up to 90% until we get actual completion
+            progress += 5;
+            progressBar.style.width = `${progress}%`;
+        }
+    }, 100);
+}
+function showSuccessState() {
+    document.querySelector('.loading-spinner').style.display = 'none';
+    document.querySelector('.success-checkmark').style.display = 'block';
+    document.querySelector('.progress-bar-fill').style.width = '100%';
+    setTimeout(() => {
+        resetUploadState();
+    }, 2000);
+}
+function showErrorState(message) {
+    // Reset upload UI
+    resetUploadState();
+    // Show error message
+    const errorDiv = document.createElement('div');
+    errorDiv.className = 'text-red-500 mt-4';
+    errorDiv.innerHTML = `<i class="fas fa-exclamation-circle mr-2"></i>${message}`;
+    document.querySelector('.drop-zone').appendChild(errorDiv);
+    setTimeout(() => {
+        errorDiv.remove();
+    }, 5000);
+}
+function resetUploadState() {
+    document.querySelector('.success-checkmark').style.display = 'none';
+    document.querySelector('.upload-icon').style.display = 'block';
+    document.querySelector('.progress-bar').style.display = 'none';
+    document.querySelector('.progress-bar-fill').style.width = '0%';
+    document.getElementById('fileInfo').classList.add('hidden');
+}
+function handleSampleDataClick(datasetName) {
+    // Show loading state in results section
+    const resultsSection = document.getElementById('results');
+    const resultsLoading = document.getElementById('resultsLoading');
+    const resultsContent = document.getElementById('resultsContent');
+    const resultsError = document.getElementById('resultsError');
+    resultsSection.classList.remove('hidden');
+    resultsLoading.classList.remove('hidden');
+    resultsContent.classList.add('hidden');
+    resultsError.classList.add('hidden');
+    // Simulate API delay
+    setTimeout(() => {
+        try {
+            // Mock data based on dataset type
+            const mockData = datasetName === 'marketing_campaign' ? {
+                statistics: {
+                    rows: 10000,
+                    columns: 15,
+                    missing_values: 120,
+                    data_types: ['numeric', 'categorical', 'datetime']
+                },
+                preview: {
+                    columns: ['Campaign ID', 'Customer ID', 'Response', 'Channel'],
+                    data: [
+                        ['CAM001', 'C001', 'Converted', 'Email'],
+                        ['CAM001', 'C002', 'No Response', 'SMS'],
+                        ['CAM002', 'C003', 'Converted', 'Social Media']
+                    ]
+                },
+                visualizations: [
+                    {
+                        title: 'Response Rate by Channel',
+                        description: 'Conversion rates across different marketing channels',
+                        image_url: 'https://via.placeholder.com/400x300'
+                    },
+                    {
+                        title: 'Campaign Performance',
+                        description: 'Success metrics for each campaign',
+                        image_url: 'https://via.placeholder.com/400x300'
+                    }
+                ],
+                insights: [
+                    {
+                        title: 'Best Performing Channel',
+                        description: 'Email campaigns show highest conversion rate at 28%'
+                    },
+                    {
+                        title: 'Optimal Send Time',
+                        description: 'Campaigns sent between 2 PM - 4 PM have better engagement'
+                    }
+                ]
+            } : {
+                statistics: {
+                    rows: 50000,
+                    columns: 12,
+                    missing_values: 85,
+                    data_types: ['numeric', 'categorical', 'datetime']
+                },
+                preview: {
+                    columns: ['Order ID', 'Product', 'Quantity', 'Price'],
+                    data: [
+                        ['ORD001', 'Laptop', '1', '$999.99'],
+                        ['ORD002', 'Mouse', '2', '$29.99'],
+                        ['ORD003', 'Monitor', '1', '$299.99']
+                    ]
+                },
+                visualizations: [
+                    {
+                        title: 'Sales by Category',
+                        description: 'Distribution of sales across product categories',
+                        image_url: 'https://via.placeholder.com/400x300'
+                    },
+                    {
+                        title: 'Monthly Revenue',
+                        description: 'Revenue trends over the past 12 months',
+                        image_url: 'https://via.placeholder.com/400x300'
+                    }
+                ],
+                insights: [
+                    {
+                        title: 'Top Products',
+                        description: 'Electronics category generates 45% of total revenue'
+                    },
+                    {
+                        title: 'Customer Behavior',
+                        description: 'Average order value increased by 15% in Q4'
+                    }
+                ]
+            };
+            handleApiResponse(mockData);
+        } catch (error) {
+            console.error('Error:', error);
+            showErrorState('Failed to process sample dataset');
+        }
+    }, 1000); // 1 second delay to show loading state
+}
+function handleApiResponse(data) {
+    const resultsSection = document.getElementById('results');
+    const resultsLoading = document.getElementById('resultsLoading');
+    const resultsContent = document.getElementById('resultsContent');
+    const resultsError = document.getElementById('resultsError');
+    // Show results section
+    resultsSection.classList.remove('hidden');
+    resultsLoading.classList.add('hidden');
+    resultsError.classList.add('hidden');
+    resultsContent.classList.remove('hidden');
+    // Update Basic Statistics
+    updateBasicStats(data.statistics);
+    // Update Data Preview
+    updateDataPreview(data.preview);
+    // Update Visualizations
+    updateVisualizations(data.visualizations);
+    // Update Insights
+    updateInsights(data.insights);
+}
+function updateBasicStats(statistics) {
+    const statsContainer = document.getElementById('basicStats');
+    statsContainer.innerHTML = '';
+    const stats = [
+        { label: 'Rows', value: statistics.rows, icon: 'fa-list' },
+        { label: 'Columns', value: statistics.columns, icon: 'fa-columns' },
+        { label: 'Missing Values', value: statistics.missing_values, icon: 'fa-exclamation-triangle' },
+        { label: 'Data Types', value: statistics.data_types.length, icon: 'fa-code' }
+    ];
+    stats.forEach(stat => {
+        const statDiv = document.createElement('div');
+        statDiv.className = 'bg-gray-50 rounded-lg p-4';
+        statDiv.innerHTML = `
+            <div class="flex items-center">
+                <i class="fas ${stat.icon} text-primary text-xl mr-3"></i>
+                <div>
+                    <div class="text-sm text-gray-500">${stat.label}</div>
+                    <div class="text-lg font-semibold">${stat.value}</div>
+                </div>
+            </div>
+        `;
+        statsContainer.appendChild(statDiv);
+    });
+}
+function updateDataPreview(preview) {
+    const table = document.getElementById('dataPreview');
+    table.innerHTML = '';
+    // Add header
+    const thead = document.createElement('thead');
+    thead.className = 'bg-gray-50';
+    const headerRow = document.createElement('tr');
+    preview.columns.forEach(column => {
+        const th = document.createElement('th');
+        th.className = 'px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider';
+        th.textContent = column;
+        headerRow.appendChild(th);
+    });
+    thead.appendChild(headerRow);
+    table.appendChild(thead);
+    // Add body
+    const tbody = document.createElement('tbody');
+    tbody.className = 'bg-white divide-y divide-gray-200';
+    preview.data.forEach(row => {
+        const tr = document.createElement('tr');
+        row.forEach(cell => {
+            const td = document.createElement('td');
+            td.className = 'px-6 py-4 whitespace-nowrap text-sm text-gray-500';
+            td.textContent = cell;
+            tr.appendChild(td);
+        });
+        tbody.appendChild(tr);
+    });
+    table.appendChild(tbody);
+}
+function updateVisualizations(visualizations) {
+    const container = document.getElementById('visualizations');
+    container.innerHTML = '';
+    visualizations.forEach(viz => {
+        const vizDiv = document.createElement('div');
+        vizDiv.className = 'bg-white rounded-lg p-4 shadow';
+        vizDiv.innerHTML = `
+            <h4 class="text-lg font-medium text-gray-800 mb-4">${viz.title}</h4>
+            <div class="aspect-w-16 aspect-h-9">
+                <img src="${viz.image_url}" alt="${viz.title}" class="rounded-lg">
+            </div>
+            <p class="mt-2 text-sm text-gray-600">${viz.description}</p>
+        `;
+        container.appendChild(vizDiv);
+    });
+}
+function updateInsights(insights) {
+    const insightsList = document.getElementById('insights');
+    insightsList.innerHTML = '';
+    insights.forEach(insight => {
+        const li = document.createElement('li');
+        li.className = 'bg-blue-50 rounded-lg p-4';
+        li.innerHTML = `
+            <div class="flex items-start">
+                <i class="fas fa-lightbulb text-yellow-500 mt-1 mr-3"></i>
+                <div>
+                    <div class="font-medium text-blue-900">${insight.title}</div>
+                    <p class="mt-1 text-sm text-blue-700">${insight.description}</p>
+                </div>
+            </div>
+        `;
+        insightsList.appendChild(li);
+    });
+}
+function closeResults() {
+    document.getElementById('results').classList.add('hidden');
+}
+function showErrorState(message) {
+    const resultsSection = document.getElementById('results');
+    const resultsLoading = document.getElementById('resultsLoading');
+    const resultsContent = document.getElementById('resultsContent');
+    const resultsError = document.getElementById('resultsError');
+    const errorMessage = document.getElementById('errorMessage');
+    resultsSection.classList.remove('hidden');
+    resultsLoading.classList.add('hidden');
+    resultsContent.classList.add('hidden');
+    resultsError.classList.remove('hidden');
+    errorMessage.textContent = message;
+}
+// Event Listeners
+document.addEventListener('DOMContentLoaded', () => {
+    // File Upload Handling
+    const dropZone = document.querySelector('.drop-zone');
+    const fileInput = document.getElementById('fileInput');
+    // Prevent default drag behaviors
+    ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
+        dropZone.addEventListener(eventName, preventDefaults, false);
+        document.body.addEventListener(eventName, preventDefaults, false);
+    });
+    // Highlight drop zone when dragging over it
+    ['dragenter', 'dragover'].forEach(eventName => {
+        dropZone.addEventListener(eventName, highlight, false);
+    });
+    ['dragleave', 'drop'].forEach(eventName => {
+        dropZone.addEventListener(eventName, unhighlight, false);
+    });
+    // Handle dropped files
+    dropZone.addEventListener('drop', (e) => {
+        const dt = e.dataTransfer;
+        const files = dt.files;
+        if (files.length > 0) {
+            handleFileUpload(files[0]);
+        }
+    });
+    fileInput.addEventListener('change', (e) => {
+        if (e.target.files.length > 0) {
+            handleFileUpload(e.target.files[0]);
+        }
+    });
+    // Sample Data Button Handlers
+    const marketingBtn = document.querySelector('.sample-btn:nth-child(1)');
+    const retailBtn = document.querySelector('.sample-btn:nth-child(2)');
+    if (marketingBtn) {
+        marketingBtn.addEventListener('click', () => {
+            console.log('Marketing campaign button clicked');
+            handleSampleDataClick('marketing_campaign');
+        });
+    }
+    if (retailBtn) {
+        retailBtn.addEventListener('click', () => {
+            console.log('Online retail button clicked');
+            handleSampleDataClick('online_retail');
+        });
+    }
+});
+// Utility Functions
+function preventDefaults(e) {
+    e.preventDefault();
+    e.stopPropagation();
+}
+function highlight(e) {
+    document.querySelector('.drop-zone').classList.add('border-primary', 'bg-blue-50');
+}
+function unhighlight(e) {
+    document.querySelector('.drop-zone').classList.remove('border-primary', 'bg-blue-50');
+}
+function getFileTypeIcon(fileType) {
+    const icons = {
+        'csv': 'fa-file-csv',
+        'tsv': 'fa-file-alt',
+        'txt': 'fa-file-alt',
+        'xls': 'fa-file-excel',
+        'xlsx': 'fa-file-excel',
+        'xml': 'fa-file-code',
+        'json': 'fa-file-code'
+    };
+    return icons[fileType] || 'fa-file';
+}
+function formatFileSize(bytes) {
+    if (bytes === 0) return '0 Bytes';
+    const k = 1024;
+    const sizes = ['Bytes', 'KB', 'MB', 'GB'];
+    const i = Math.floor(Math.log(bytes) / Math.log(k));
+    return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
+}

assets/styles.css → styles.css RENAMED Viewed

@@ -1,198 +1,168 @@
-/* Loading Animation */
-.loading-spinner {
-    display: none;
-    width: 50px;
-    height: 50px;
-    border: 5px solid #f3f3f3;
-    border-top: 5px solid #3B82F6;
-    border-radius: 50%;
-    animation: spin 1s linear infinite;
-    margin: 0 auto;
-}
-@keyframes spin {
-    0% { transform: rotate(0deg); }
-    100% { transform: rotate(360deg); }
-}
-/* File Upload Progress */
-.progress-bar {
-    width: 100%;
-    height: 6px;
-    background-color: #e5e7eb;
-    border-radius: 3px;
-    overflow: hidden;
-    display: none;
-    margin: 1rem auto;
-    max-width: 300px;
-}
-.progress-bar-fill {
-    height: 100%;
-    background-color: #3B82F6;
-    width: 0%;
-    transition: width 0.3s ease;
-}
-/* Tooltip */
-.tooltip {
-    position: relative;
-    display: inline-block;
-}
-.tooltip .tooltip-text {
-    visibility: hidden;
-    background-color: #1f2937;
-    color: white;
-    text-align: center;
-    padding: 8px 12px;
-    border-radius: 6px;
-    position: absolute;
-    z-index: 1;
-    bottom: 125%;
-    left: 50%;
-    transform: translateX(-50%);
-    opacity: 0;
-    transition: opacity 0.3s;
-    font-size: 0.875rem;
-    white-space: nowrap;
-    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
-}
-.tooltip:hover .tooltip-text {
-    visibility: visible;
-    opacity: 1;
-}
-/* File Type Icons */
-.file-type-icon {
-    font-size: 1.5rem;
-    margin-right: 0.5rem;
-    color: #3B82F6;
-}
-/* Success Animation */
-@keyframes checkmark {
-    0% { transform: scale(0); opacity: 0; }
-    50% { transform: scale(1.2); opacity: 0.8; }
-    100% { transform: scale(1); opacity: 1; }
-}
-.success-checkmark {
-    display: none;
-    color: #10B981;
-    animation: checkmark 0.5s ease-in-out forwards;
-}
-/* Sample Data Cards */
-.sample-btn {
-    transition: all 0.3s ease;
-    position: relative;
-    overflow: hidden;
-    background: linear-gradient(135deg, #3B82F6, #0ea5e9) !important;
-}
-.sample-btn::after {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: 0;
-    width: 100%;
-    height: 100%;
-    background: linear-gradient(rgba(255,255,255,0.12), rgba(255,255,255,0));
-    transform: translateY(-100%);
-    transition: transform 0.3s ease;
-}
-.sample-btn:hover::after {
-    transform: translateY(0);
-}
-.sample-btn:hover {
-    transform: translateY(-2px);
-    box-shadow: 0 8px 20px rgba(59,130,246,0.3);
-}
-/* Status badge fade-in */
-@keyframes fadeSlideIn {
-    from { opacity: 0; transform: translateY(-6px); }
-    to   { opacity: 1; transform: translateY(0); }
-}
-.api-status-badge {
-    animation: fadeSlideIn 0.35s ease forwards;
-}
-/* Drop Zone Enhancements */
-.drop-zone {
-    transition: all 0.3s ease;
-    position: relative;
-    overflow: hidden;
-}
-/* Dashed overlay revealed on hover. `content` must be set: without it the
-   ::before pseudo-element is not generated and the overlay never appears. */
-.drop-zone::before {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: 0;
-    right: 0;
-    bottom: 0;
-    border-radius: 8px;
-    border: 2px dashed #3B82F6;
-    opacity: 0;
-    transition: opacity 0.3s ease;
-}
-.drop-zone:hover::before {
-    opacity: 1;
-}
-/* File Info Card */
-#fileInfo {
-    background: linear-gradient(to right, #f8fafc, #f1f5f9);
-    border: 1px solid #e2e8f0;
-    transition: all 0.3s ease;
-}
-#fileInfo:hover {
-    transform: translateY(-2px);
-    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
-}
-/* Features Section */
-.feature-card {
-    transition: all 0.3s ease;
-}
-.feature-card:hover {
-    transform: translateY(-2px);
-    box-shadow: 0 8px 15px rgba(0,0,0,0.1);
-}
-.feature-icon {
-    transition: all 0.3s ease;
-}
-.feature-card:hover .feature-icon {
-    transform: scale(1.1);
-    color: #2563eb;
-}
-@media only screen and (max-width: 600px) {
-    .feature-card p {grid-column: 1/3;}
-    .feature-card i, .feature-card h3 {text-align: center;}
-    .feature-card {
-        display: grid;
-        grid-template-columns: 1fr 2fr;
-        align-items: baseline;
-    }
-  }
-dialog {
-  margin: 10% auto;
-  width: 80%;
-  max-width: 350px;
-  background-color: #fff;
-  padding: 34px;
-  border: 0;
-  border-radius: 5px;
-}

+/* Loading Animation */
+.loading-spinner {
+    display: none;
+    width: 50px;
+    height: 50px;
+    border: 5px solid #f3f3f3;
+    border-top: 5px solid #3B82F6;
+    border-radius: 50%;
+    animation: spin 1s linear infinite;
+    margin: 0 auto;
+}
+@keyframes spin {
+    0% { transform: rotate(0deg); }
+    100% { transform: rotate(360deg); }
+}
+/* File Upload Progress */
+.progress-bar {
+    width: 100%;
+    height: 6px;
+    background-color: #e5e7eb;
+    border-radius: 3px;
+    overflow: hidden;
+    display: none;
+    margin: 1rem auto;
+    max-width: 300px;
+}
+.progress-bar-fill {
+    height: 100%;
+    background-color: #3B82F6;
+    width: 0%;
+    transition: width 0.3s ease;
+}
+/* Tooltip */
+.tooltip {
+    position: relative;
+    display: inline-block;
+}
+.tooltip .tooltip-text {
+    visibility: hidden;
+    background-color: #1f2937;
+    color: white;
+    text-align: center;
+    padding: 8px 12px;
+    border-radius: 6px;
+    position: absolute;
+    z-index: 1;
+    bottom: 125%;
+    left: 50%;
+    transform: translateX(-50%);
+    opacity: 0;
+    transition: opacity 0.3s;
+    font-size: 0.875rem;
+    white-space: nowrap;
+    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+}
+.tooltip:hover .tooltip-text {
+    visibility: visible;
+    opacity: 1;
+}
+/* File Type Icons */
+.file-type-icon {
+    font-size: 1.5rem;
+    margin-right: 0.5rem;
+    color: #3B82F6;
+}
+/* Success Animation */
+@keyframes checkmark {
+    0% { transform: scale(0); opacity: 0; }
+    50% { transform: scale(1.2); opacity: 0.8; }
+    100% { transform: scale(1); opacity: 1; }
+}
+.success-checkmark {
+    display: none;
+    color: #10B981;
+    animation: checkmark 0.5s ease-in-out forwards;
+}
+/* Sample Data Cards */
+.sample-btn {
+    transition: all 0.3s ease;
+    position: relative;
+    overflow: hidden;
+}
+.sample-btn::after {
+    content: '';
+    position: absolute;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    background: linear-gradient(rgba(255,255,255,0.1), rgba(255,255,255,0));
+    transform: translateY(-100%);
+    transition: transform 0.3s ease;
+}
+.sample-btn:hover::after {
+    transform: translateY(0);
+}
+.sample-btn:hover {
+    transform: translateY(-2px);
+    box-shadow: 0 8px 15px rgba(0,0,0,0.1);
+}
+/* Drop Zone Enhancements */
+.drop-zone {
+    transition: all 0.3s ease;
+    position: relative;
+    overflow: hidden;
+}
+.drop-zone::before {
+    content: '';
+    position: absolute;
+    top: 0;
+    left: 0;
+    right: 0;
+    bottom: 0;
+    border-radius: 8px;
+    border: 2px dashed #3B82F6;
+    opacity: 0;
+    transition: opacity 0.3s ease;
+}
+.drop-zone:hover::before {
+    opacity: 1;
+}
+/* File Info Card */
+#fileInfo {
+    background: linear-gradient(to right, #f8fafc, #f1f5f9);
+    border: 1px solid #e2e8f0;
+    transition: all 0.3s ease;
+}
+#fileInfo:hover {
+    transform: translateY(-2px);
+    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+}
+/* Features Section */
+.feature-card {
+    transition: all 0.3s ease;
+}
+.feature-card:hover {
+    transform: translateY(-2px);
+    box-shadow: 0 8px 15px rgba(0,0,0,0.1);
+}
+.feature-icon {
+    transition: all 0.3s ease;
+}
+.feature-card:hover .feature-icon {
+    transform: scale(1.1);
+    color: #2563eb;
+}

temp/.gitignore DELETED Viewed

	@@ -1,2 +0,0 @@
1	- *
2	- !.gitignore

templates/data_file.py DELETED Viewed

@@ -1,286 +0,0 @@
-import gradio as gr
-from functions import example_question_generator, chatbot_func
-from data_sources import process_data_upload
-from utils import message_dict
-import ast
-import html as _html
-def build_summary_modal(stats):
-    num_rows = stats['num_rows']
-    num_cols = stats['num_cols']
-    total_missing = stats['total_missing']
-    duplicate_rows = stats.get('duplicate_rows', 0)
-    file_size_bytes = stats.get('file_size_bytes', 0)
-    def _fmt_num(v):
-        try:
-            if v != v: return '—'  # NaN
-            abs_v = abs(v)
-            if abs_v >= 1e9: return f"{v/1e9:.1f}B"
-            if abs_v >= 1e6: return f"{v/1e6:.1f}M"
-            if abs_v >= 1e3: return f"{v:,.0f}" if v == int(v) else f"{v:,.1f}"
-            return f"{v:,.0f}" if v == int(v) else f"{v:.2f}"
-        except Exception:
-            return str(v)
-    def _fmt_size(b):
-        if not b: return ''
-        if b < 1024: return f"{b} B"
-        if b < 1024 ** 2: return f"{b / 1024:.1f} KB"
-        if b < 1024 ** 3: return f"{b / 1024 ** 2:.1f} MB"
-        return f"{b / 1024 ** 3:.2f} GB"
-    file_size_label = _fmt_size(file_size_bytes)
-    dup_color = "#ef4444" if duplicate_rows > 0 else "#a16207"
-    dup_bg = "#fef2f2" if duplicate_rows > 0 else "#fefce8"
-    dup_border = "#fecaca" if duplicate_rows > 0 else "#fde68a"
-    dtype_rows_html = ""
-    for i, (col, dtype) in enumerate(stats['dtypes'].items()):
-        bg = "#ffffff" if i % 2 == 0 else "#f9fafb"
-        missing = stats['missing_per_col'].get(col, 0)
-        pct_missing = (missing / num_rows * 100) if num_rows > 0 else 0
-        missing_color = "#ef4444" if missing > 0 else "#9ca3af"
-        missing_weight = "600" if missing > 0 else "400"
-        missing_cell = f'{missing:,} <span style="color:#9ca3af;font-size:0.7rem;">({pct_missing:.1f}%)</span>'
-        unique = stats.get('unique_counts', {}).get(col, '—')
-        is_id = isinstance(unique, int) and num_rows > 0 and (unique / num_rows) >= 0.95 and unique > 10
-        id_badge = ' <span style="background:#fef3c7;color:#92400e;padding:1px 5px;border-radius:3px;font-size:0.65rem;vertical-align:middle;">ID?</span>' if is_id else ''
-        unique_cell = f'{unique:,}{id_badge}' if isinstance(unique, int) else str(unique)
-        cs = stats.get('col_stats', {}).get(col, {})
-        if cs.get('type') == 'numeric':
-            stats_cell = (
-                f'<span style="font-size:0.74rem;color:#6b7280;line-height:1.6;">'
-                f'{_fmt_num(cs["min"])} – {_fmt_num(cs["max"])}'
-                f'<br><span style="color:#9ca3af;">avg {_fmt_num(cs["mean"])}</span></span>'
-            )
-        elif cs.get('type') == 'datetime':
-            stats_cell = (
-                f'<span style="font-size:0.74rem;color:#6b7280;line-height:1.6;">'
-                f'{cs["min"]}<br>→ {cs["max"]}</span>'
-            )
-        else:
-            stats_cell = '<span style="color:#d1d5db;">—</span>'
-        dtype_rows_html += (
-            f'<tr style="background:{bg}">'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;color:#111827;white-space:nowrap;">{_html.escape(col)}</td>'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;white-space:nowrap;"><span style="background:#dbeafe;color:#1e40af;padding:2px 8px;border-radius:4px;font-size:0.74rem;">{dtype}</span></td>'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;text-align:right;color:{missing_color};font-weight:{missing_weight};white-space:nowrap;">{missing_cell}</td>'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;text-align:right;white-space:nowrap;color:#374151;">{unique_cell}</td>'
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;">{stats_cell}</td>'
-            f'</tr>'
-        )
-    preview_headers_html = "".join(
-        f'<th style="padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;text-align:left;">{_html.escape(col)}</th>'
-        for col in stats['preview_cols']
-    )
-    preview_rows_html = ""
-    for i, row in enumerate(stats['preview']):
-        bg = "#ffffff" if i % 2 == 0 else "#f9fafb"
-        cells = "".join(
-            f'<td style="padding:7px 12px;border-bottom:1px solid #f3f4f6;color:#374151;white-space:nowrap;">{_html.escape(str(cell))}</td>'
-            for cell in row
-        )
-        preview_rows_html += f'<tr style="background:{bg}">{cells}</tr>'
-    size_tag = f'<span style="background:rgba(255,255,255,0.2);color:#fff;padding:2px 10px;border-radius:12px;font-size:0.75rem;font-weight:400;">{file_size_label}</span>' if file_size_label else ''
-    return f"""
-<div class="vda-modal-overlay" style="position:fixed;inset:0;background:rgba(0,0,0,0.55);z-index:9999;display:flex;align-items:center;justify-content:center;font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;">
-  <div style="background:#fff;border-radius:14px;width:90%;max-width:800px;max-height:88vh;display:flex;flex-direction:column;box-shadow:0 25px 50px -12px rgba(0,0,0,0.35);overflow:hidden;">
-    <div style="background:linear-gradient(135deg,#3B82F6,#0ea5e9);padding:16px 20px;display:flex;justify-content:space-between;align-items:center;flex-shrink:0;gap:12px;">
-      <div style="display:flex;align-items:center;gap:10px;">
-        <span style="color:#fff;font-weight:600;font-size:1rem;">Dataset Summary</span>
-        {size_tag}
-      </div>
-      <button onclick="document.querySelectorAll('.vda-modal-overlay').forEach(function(e){{e.remove()}})" style="background:rgba(255,255,255,0.2);border:none;color:#fff;width:30px;height:30px;border-radius:50%;cursor:pointer;font-size:1rem;line-height:1;flex-shrink:0;">&#x2715;</button>
-    </div>
-    <div style="padding:20px;overflow-y:auto;flex:1;">
-      <div style="display:grid;grid-template-columns:1fr 1fr 1fr 1fr;gap:10px;margin-bottom:20px;">
-        <div style="background:#eff6ff;border:1px solid #bfdbfe;border-radius:8px;padding:12px;text-align:center;">
-          <div style="font-size:1.4rem;font-weight:700;color:#1d4ed8;">{num_rows:,}</div>
-          <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Rows</div>
-        </div>
-        <div style="background:#f0fdf4;border:1px solid #bbf7d0;border-radius:8px;padding:12px;text-align:center;">
-          <div style="font-size:1.4rem;font-weight:700;color:#15803d;">{num_cols}</div>
-          <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Columns</div>
-        </div>
-        <div style="background:#fefce8;border:1px solid #fde68a;border-radius:8px;padding:12px;text-align:center;">
-          <div style="font-size:1.4rem;font-weight:700;color:#a16207;">{total_missing:,}</div>
-          <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Missing Values</div>
-        </div>
-        <div style="background:{dup_bg};border:1px solid {dup_border};border-radius:8px;padding:12px;text-align:center;">
-          <div style="font-size:1.4rem;font-weight:700;color:{dup_color};">{duplicate_rows:,}</div>
-          <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.06em;margin-top:4px;">Duplicate Rows</div>
-        </div>
-      </div>
-      <div style="margin-bottom:20px;">
-        <div style="font-size:0.78rem;font-weight:600;color:#374151;text-transform:uppercase;letter-spacing:0.06em;margin-bottom:10px;">Column Info</div>
-        <div style="border:1px solid #e5e7eb;border-radius:8px;overflow:hidden;">
-          <div style="max-height:210px;overflow:auto;">
-            <table style="border-collapse:collapse;font-size:0.83rem;min-width:100%;">
-              <thead style="background:#f9fafb;position:sticky;top:0;z-index:1;">
-                <tr>
-                  <th style="text-align:left;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Column</th>
-                  <th style="text-align:left;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Type</th>
-                  <th style="text-align:right;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Missing</th>
-                  <th style="text-align:right;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Unique</th>
-                  <th style="text-align:left;padding:8px 12px;color:#6b7280;font-weight:500;border-bottom:1px solid #e5e7eb;white-space:nowrap;">Stats / Range</th>
-                </tr>
-              </thead>
-              <tbody>{dtype_rows_html}</tbody>
-            </table>
-          </div>
-        </div>
-      </div>
-      <div>
-        <div style="font-size:0.78rem;font-weight:600;color:#374151;text-transform:uppercase;letter-spacing:0.06em;margin-bottom:10px;">Data Preview (first 5 rows)</div>
-        <div style="border:1px solid #e5e7eb;border-radius:8px;overflow:hidden;">
-          <div style="overflow:auto;max-height:200px;">
-            <table style="border-collapse:collapse;font-size:0.8rem;">
-              <thead style="background:#f9fafb;position:sticky;top:0;z-index:1;">
-                <tr>{preview_headers_html}</tr>
-              </thead>
-              <tbody>{preview_rows_html}</tbody>
-            </table>
-          </div>
-        </div>
-      </div>
-    </div>
-  </div>
-</div>
-"""
-def run_example(input):
-    """Return ``input`` unchanged.
-
-    Wired to the sample buttons' click events to copy a hidden example
-    file component's value into the visible upload component.
-    """
-    return input
-def example_display(input):
-    if input == None:
-        display = True
-    else:
-        display = False
-    return [gr.update(visible=display), gr.update(visible=display), gr.update(visible=display), gr.update(visible=display)]
-with gr.Blocks() as demo:
-    description = gr.HTML("""
-                        <div class="max-w-4xl mx-auto mb-12 text-center">
-                            <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto">
-                                <h2 class="font-semibold text-blue-800 ">
-                                    <i class="fas fa-info-circle mr-2"></i>Supported Files
-                                </h2>
-                                <div class="flex flex-wrap justify-center gap-3 pb-4 text-blue-700">
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-csv mr-1"></i>CSV
-                                        <span class="tooltip-text">Comma-separated values</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-alt mr-1"></i>TSV
-                                        <span class="tooltip-text">Tab-separated values</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-alt mr-1"></i>TXT
-                                        <span class="tooltip-text">Text files</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-excel mr-1"></i>XLS/XLSX
-                                        <span class="tooltip-text">Excel spreadsheets</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-code mr-1"></i>XML
-                                        <span class="tooltip-text">XML documents</span>
-                                    </span>
-                                    <span class="tooltip">
-                                        <i class="fas fa-file-code mr-1"></i>JSON
-                                        <span class="tooltip-text">JSON data files</span>
-                                    </span>
-                                </div>
-                            </div>
-                        </div>
-                          """, elem_classes="description_component")
-    example_file_1 = gr.File(visible=False, value="samples/bank_marketing_campaign.csv")
-    example_file_2 = gr.File(visible=False, value="samples/online_retail_data.csv")
-    example_file_3 = gr.File(visible=False, value="samples/tb_illness_data.csv")
-    with gr.Row():
-        example_btn_1 = gr.Button(value="Try Me: bank_marketing_campaign.csv", elem_classes="sample-btn bg-gradient-to-r from-blue-500 to-sky-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
-        example_btn_2 = gr.Button(value="Try Me: online_retail_data.csv", elem_classes="sample-btn bg-gradient-to-r from-blue-500 to-sky-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
-        example_btn_3 = gr.Button(value="Try Me: tb_illness_data.csv", elem_classes="sample-btn bg-gradient-to-r from-blue-500 to-sky-600 text-white p-6 rounded-lg text-left hover:shadow-lg", size="md", variant="primary")
-    file_output = gr.File(label="Data File (CSV, TSV, TXT, XLS, XLSX, XML, JSON)", show_label=True, elem_classes="file_marker drop-zone border-2 border-dashed border-gray-300 rounded-lg hover:border-primary cursor-pointer bg-gray-50 hover:bg-blue-50 transition-colors duration-300", file_types=['.csv', '.xlsx', '.txt', '.json', '.ndjson', '.xml', '.xls', '.tsv'])
-    example_btn_1.click(fn=run_example, inputs=example_file_1, outputs=file_output)
-    example_btn_2.click(fn=run_example, inputs=example_file_2, outputs=file_output)
-    example_btn_3.click(fn=run_example, inputs=example_file_3, outputs=file_output)
-    file_output.change(fn=example_display, inputs=file_output, outputs=[example_btn_1, example_btn_2, example_btn_3, description])
-    @gr.render(inputs=file_output)
-    def data_options(filename, request: gr.Request):
-        """Render the upload result and chat interface for the selected file.
-
-        Resets this session's 'file_upload' entry in ``message_dict``, then,
-        if a file is present, processes it and renders the status HTML. On
-        success it also renders the summary modal and a chat interface
-        seeded with example questions.
-
-        Args:
-            filename: Value of ``file_output``; falsy when no file is set.
-            request: Gradio request; its ``session_hash`` keys ``message_dict``.
-        """
-        print(filename)
-        if request.session_hash not in message_dict:
-            message_dict[request.session_hash] = {}
-        # Clear any previous 'file_upload' state for this session.
-        message_dict[request.session_hash]['file_upload'] = None
-        if filename:
-            # process_message is indexed below as: [0] status string,
-            # [1] HTML to display, [2] titles value, [3] stats for the modal.
-            process_message = process_upload(filename, request.session_hash)
-            gr.HTML(value=process_message[1], padding=False)
-            if process_message[0] == "success":
-                gr.HTML(value=build_summary_modal(process_message[3]), padding=False)
-                # The bundled sample files get curated example questions.
-                if "bank_marketing_campaign" in filename:
-                    example_questions = [
-                                            ["Describe the dataset"],
-                                            ["What levels of education have the highest and lowest average balance?"],
-                                            ["What job is most and least common for a yes response from the individuals, not counting 'unknown'?"],
-                                            ["Can you generate a bar chart of education vs. average balance?"],
-                                            ["Can you generate a table of levels of education versus average balance, percent married, percent with a loan, and percent in default?"],
-                                            ["Can we predict the relationship between the number of contacts performed before this campaign and the average balance?"],
-                                            ["Can you plot the number of contacts performed before this campaign versus the duration and use balance as the size in a bubble chart?"]
-                                        ]
-                elif "online_retail_data" in filename:
-                    example_questions = [
-                                            ["Describe the dataset"],
-                                            ["What month had the highest revenue?"],
-                                            ["Is revenue higher in the morning or afternoon?"],
-                                            ["Can you generate a line graph of revenue per month?"],
-                                            ["Can you generate a table of revenue per month?"],
-                                            ["Can we predict how time of day affects transaction value in this data set?"],
-                                            ["Can you plot revenue per month with size being the number of units sold that month in a bubble chart?"]
-                                        ]
-                else:
-                    # Any other file: generate questions and parse the result
-                    # as a Python literal; on any failure use generic ones.
-                    try:
-                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'file_upload', '', process_message[2], ''))
-                        example_questions = [["Describe the dataset"]]
-                        for example in generated_examples:
-                            example_questions.append([example])
-                    except Exception as e:
-                        print("DATA FILE QUESTION GENERATION ERROR")
-                        print(e)
-                        example_questions = [
-                                            ["Describe the dataset"],
-                                            ["List the columns in the dataset"],
-                                            ["What could this data be used for?"],
-                                        ]
-                # Hidden textboxes carry per-session context into chatbot_func
-                # through ChatInterface's additional_inputs.
-                session_hash = gr.Textbox(visible=False, value=request.session_hash)
-                data_source = gr.Textbox(visible=False, value='file_upload')
-                schema = gr.Textbox(visible=False, value='')
-                titles = gr.Textbox(value=process_message[2], interactive=False, visible=False)
-                bot = gr.Chatbot(type='messages', label="CSV Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
-                chat = gr.ChatInterface(
-                                    fn=chatbot_func,
-                                    type='messages',
-                                    chatbot=bot,
-                                    title="Chat with your data file",
-                                    concurrency_limit=None,
-                                    examples=example_questions,
-                                    additional_inputs=[session_hash, data_source, titles, schema]
-                                    )
-    def process_upload(upload_value, session_hash):
-        if upload_value:
-            process_message = process_data_upload(upload_value, session_hash)
-        return process_message
-if __name__ == "__main__":
-    demo.launch()

templates/doc_db.py DELETED Viewed

@@ -1,105 +0,0 @@
-import ast
-import gradio as gr
-from functions import example_question_generator, chatbot_func
-from data_sources import connect_doc_db
-from utils import message_dict
-with gr.Blocks() as demo:
-    with gr.Accordion("ℹ️  About the MongoDB Connector", open=False):
-        gr.HTML("""
-            <div class="max-w-4xl mx-auto text-center">
-                <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto p-4">
-                    <p>Connect to a MongoDB database and query it using natural language.</p>
-                    <p style="font-weight:bold;">
-                        No credentials are retained — they are passed as session variables and disappear when you leave or refresh.
-                        Queries use PyMongoArrow's <code>aggregate_pandas_all</code>, which cannot delete, drop, or insert documents.
-                        Use caution connecting production databases to third-party tools.
-                    </p>
-                    <p>Contact me if you'd like this built for your organization with proper infrastructure and security controls.</p>
-                </div>
-            </div>
-        """)
-    gr.HTML("""
-        <div style="max-width:560px;margin:8px auto 4px;padding:8px 14px;background:#f0f9ff;
-                    border:1px solid #bae6fd;border-radius:8px;text-align:center;">
-            <p style="margin:0;font-size:13px;color:#0369a1;">
-                <i class="fas fa-flask" style="margin-right:6px;"></i>
-                <strong>Demo credentials pre-filled.</strong>
-                &nbsp;Replace with your own database details to analyze your own data.
-            </p>
-        </div>
-    """)
-    # Connection form, pre-filled with default values.
-    # NOTE(review): the password default below is a credential literal
-    # committed to source. Presumably it belongs to a restricted demo
-    # account; confirm its permissions, and consider loading it from the
-    # environment and rotating it.
-    connection_string = gr.Textbox(label="Connection String", value="dataanalyst0.l1klmww.mongodb.net/")
-    with gr.Row():
-        connection_user = gr.Textbox(label="Connection User", value="virtual-data-analyst")
-        connection_password = gr.Textbox(label="Connection Password", value="zcpbmoGJ3mC8o", type="password")
-        doc_db_name = gr.Textbox(label="Database Name", value="sample_mflix")
-    gr.HTML("""
-        <p style="text-align:center;font-size:13px;color:#6b7280;margin:4px 0 8px;">
-            <i class="fas fa-circle-info" style="margin-right:4px;"></i>
-            Schema analysis runs on connect — this may take 1–2 minutes for large databases.
-        </p>
-    """)
-    submit = gr.Button(value="Connect", variant="primary")
-    @gr.render(inputs=[connection_string, connection_user, connection_password, doc_db_name], triggers=[submit.click])
-    def db_chat(request: gr.Request, connection_string=connection_string.value, connection_user=connection_user.value, connection_password=connection_password.value, doc_db_name=doc_db_name.value):
-        if request.session_hash not in message_dict:
-            message_dict[request.session_hash] = {}
-        message_dict[request.session_hash]['doc_db'] = None
-        connection_login_value = "mongodb+srv://" + connection_user + ":" + connection_password + "@" + connection_string
-        if connection_login_value:
-            print("MONGO APP")
-            process_message = process_doc_db(connection_login_value, doc_db_name, request.session_hash)
-            gr.HTML(value=process_message[1], padding=False)
-            if process_message[0] == "success":
-                if "dataanalyst0.l1klmww.mongodb.net" in connection_login_value:
-                    example_questions = [
-                                            ["Describe the dataset"],
-                                            ["What are the top 5 most common movie genres?"],
-                                            ["How do user comment counts on a movie correlate with the movie award wins?"],
-                                            ["Can you generate a pie chart showing the top 10 states with the most movie theaters?"],
-                                            ["What are the top 10 most represented directors in the database?"],
-                                            ["What are the different movie categories and how many movies are in each category?"]
-                                        ]
-                else:
-                    try:
-                        generated_examples = ast.literal_eval(example_question_generator(request.session_hash, 'doc_db', doc_db_name, process_message[2], process_message[3]))
-                        example_questions = [["Describe the dataset"]]
-                        for example in generated_examples:
-                            example_questions.append([example])
-                    except Exception as e:
-                        print("DOC DB QUESTION GENERATION ERROR")
-                        print(e)
-                        example_questions = [
-                                            ["Describe the dataset"],
-                                            ["List the collections in the database"],
-                                            ["What could this data be used for?"],
-                                        ]
-                session_hash = gr.Textbox(visible=False, value=request.session_hash)
-                db_connection_string = gr.Textbox(visible=False, value=connection_login_value)
-                db_name = gr.Textbox(visible=False, value=doc_db_name)
-                titles = gr.Textbox(value=process_message[2], interactive=False, label="DB Collections")
-                data_source = gr.Textbox(visible=False, value='doc_db')
-                schema = gr.Textbox(visible=False, value=process_message[3])
-                bot = gr.Chatbot(type='messages', label="MongoDB Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
-                chat = gr.ChatInterface(
-                                    fn=chatbot_func,
-                                    type='messages',
-                                    chatbot=bot,
-                                    title="Chat with your Database",
-                                    examples=example_questions,
-                                    concurrency_limit=None,
-                                    additional_inputs=[session_hash, data_source, titles, schema, db_connection_string, db_name]
-                                    )
-    def process_doc_db(connection_string, nosql_db_name, session_hash):
-        if connection_string:
-            process_message = connect_doc_db(connection_string, nosql_db_name, session_hash)
-        return process_message
-if __name__ == "__main__":
-    demo.launch()

templates/graphql.py DELETED Viewed

@@ -1,110 +0,0 @@
-import ast
-import gradio as gr
-from functions import example_question_generator, chatbot_func
-from data_sources import connect_graphql
-from utils import message_dict
-import os
-from dotenv import load_dotenv
-load_dotenv()
-graphql_sample_endpoint = os.getenv("GRAPHQL_SAMPLE_ENDPOINT")
-graphql_sample_api_token = os.getenv("GRAPHQL_SAMPLE_API_TOKEN")
-graphql_sample_header_name = os.getenv("GRAPHQL_SAMPLE_HEADER_NAME")
-with gr.Blocks() as demo:
-    with gr.Accordion("ℹ️  About the GraphQL Connector", open=False):
-        gr.HTML("""
-            <div class="max-w-4xl mx-auto text-center">
-                <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto p-4">
-                    <p>Connect to any GraphQL API endpoint and query it using natural language.</p>
-                    <p style="font-weight:bold;">
-                        API querying is the most experimental feature and performance may vary.
-                        No credentials are retained — they are passed as session variables and disappear when you leave or refresh.
-                        Mutations are not exposed and the agent is instructed not to alter data, though restricting
-                        your API token's permissions is still strongly recommended.
-                    </p>
-                    <p>Contact me if you'd like this built for your organization with proper infrastructure and security controls.</p>
-                </div>
-            </div>
-        """)
-    gr.HTML("""
-        <div style="max-width:560px;margin:8px auto 4px;padding:8px 14px;background:#f0f9ff;
-                    border:1px solid #bae6fd;border-radius:8px;text-align:center;">
-            <p style="margin:0;font-size:13px;color:#0369a1;">
-                <i class="fas fa-flask" style="margin-right:6px;"></i>
-                <strong>Demo credentials pre-filled.</strong>
-                &nbsp;Replace with your own endpoint and token to analyze your own API.
-            </p>
-        </div>
-    """)
-    graphql_url = gr.Textbox(label="GraphQL Endpoint URL", value=graphql_sample_endpoint)
-    with gr.Row():
-        api_token_header_name = gr.Textbox(label="API Token Header Name", value=graphql_sample_header_name)
-        api_token = gr.Textbox(label="API Token", value=graphql_sample_api_token, type="password")
-    submit = gr.Button(value="Connect", variant="primary")
-    @gr.render(inputs=[graphql_url, api_token, api_token_header_name], triggers=[submit.click])
-    def api_chat(request: gr.Request, graphql_url=graphql_url.value, api_token=api_token.value, api_token_header_name=api_token_header_name.value):
-        """Render the GraphQL connection status and, on success, the chat interface.
-
-        Triggered by a click on the Connect button. The session's 'graphql' entry
-        in message_dict is reset on every run, before the endpoint is contacted.
-        """
-        # Make sure the session has a slot in message_dict, then clear its GraphQL entry.
-        message_dict.setdefault(request.session_hash, {})['graphql'] = None
-        if not graphql_url:
-            return
-        print("GraphQL API")
-        result = process_graphql(graphql_url, api_token, api_token_header_name, request.session_hash)
-        # result[1] is rendered as status HTML whether or not the connection succeeded.
-        gr.HTML(value=result[1], padding=False)
-        if result[0] != "success":
-            return
-        if "qdl-app-testing" in graphql_url:
-            # Fixed questions for endpoints whose URL contains "qdl-app-testing".
-            example_questions = [
-                ["Describe the dataset"],
-                ["What is the total revenue for this shopify store?"],
-                ["What is the average duration from the fulfillment of an order to its delivery?"],
-                ["What is the total value of orders processed in the current month?"],
-                ["Which product has the highest number of variants in the inventory?"],
-                ["How many gift cards have been issued this year, and what is their total value?"],
-                ["How many active apps are currently installed on the store?"],
-                ["What is the total count of abandoned checkouts over the last month?"]
-            ]
-        else:
-            try:
-                generated = ast.literal_eval(example_question_generator(request.session_hash, 'graphql', graphql_url, result[2], ''))
-                example_questions = [["Describe the dataset"]] + [[question] for question in generated]
-            except Exception as e:
-                # Any failure while generating or parsing questions falls back to a generic set.
-                print("GRAPHQL QUESTION GENERATION ERROR")
-                print(e)
-                example_questions = [
-                    ["Describe the dataset"],
-                    ["List the types in this API"],
-                    ["What could this data be used for?"],
-                ]
-        # Textboxes below are passed to chatbot_func as additional inputs; all but the types box are hidden.
-        hash_box = gr.Textbox(visible=False, value=request.session_hash)
-        endpoint_box = gr.Textbox(visible=False, value=graphql_url)
-        token_box = gr.Textbox(visible=False, value=api_token)
-        header_box = gr.Textbox(visible=False, value=api_token_header_name)
-        types_box = gr.Textbox(value=result[2], interactive=False, label="GraphQL Types")
-        source_box = gr.Textbox(visible=False, value='graphql')
-        schema_box = gr.Textbox(visible=False, value='')
-        # Created with render=False and handed to the ChatInterface below.
-        chat_window = gr.Chatbot(type='messages', label="GraphQL Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
-        gr.ChatInterface(
-            fn=chatbot_func,
-            type='messages',
-            chatbot=chat_window,
-            title="Chat with your GraphQL API",
-            examples=example_questions,
-            concurrency_limit=None,
-            additional_inputs=[hash_box, source_box, types_box, schema_box, endpoint_box, token_box, header_box],
-        )
-    def process_graphql(graphql_url, api_token, api_token_header_name, session_hash):
-        """Connect to the GraphQL endpoint for this session and return the connector's result.
-
-        Args:
-            graphql_url: Endpoint URL handed to connect_graphql.
-            api_token: API token handed to connect_graphql.
-            api_token_header_name: Name of the header that carries the token.
-            session_hash: Identifier of the requesting session.
-
-        Returns:
-            Whatever connect_graphql returns, unchanged.
-
-        Raises:
-            ValueError: If graphql_url is empty. The previous version fell through
-                to `return process_message` with the name never bound, which
-                surfaced as an UnboundLocalError.
-        """
-        if not graphql_url:
-            raise ValueError("A GraphQL endpoint URL is required to connect.")
-        return connect_graphql(graphql_url, api_token, api_token_header_name, session_hash)
-if __name__ == "__main__":
-    demo.launch()

templates/sql_db.py DELETED Viewed

@@ -1,102 +0,0 @@
-import ast
-import gradio as gr
-from functions import example_question_generator, chatbot_func
-from data_sources import connect_sql_db
-from utils import message_dict
-with gr.Blocks() as demo:
-    with gr.Accordion("ℹ️  About the SQL Connector", open=False):
-        gr.HTML("""
-            <div class="max-w-4xl mx-auto text-center">
-                <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto p-4">
-                    <p>Connect to a PostgreSQL database and query it using natural language.</p>
-                    <p style="font-weight:bold;">
-                        No credentials are retained — they are passed as session variables and disappear when you leave or refresh.
-                        Queries run through Pandas <code>read_sql_query</code>, which cannot delete, drop, or insert rows.
-                        Use caution connecting production databases to third-party tools.
-                    </p>
-                    <p>Contact me if you'd like this built for your organization with proper infrastructure and security controls.</p>
-                </div>
-            </div>
-        """)
-    gr.HTML("""
-        <div style="max-width:560px;margin:8px auto 4px;padding:8px 14px;background:#f0f9ff;
-                    border:1px solid #bae6fd;border-radius:8px;text-align:center;">
-            <p style="margin:0;font-size:13px;color:#0369a1;">
-                <i class="fas fa-flask" style="margin-right:6px;"></i>
-                <strong>Demo credentials pre-filled.</strong>
-                &nbsp;Replace with your own database details to analyze your own data.
-            </p>
-        </div>
-    """)
-    sql_url = gr.Textbox(label="URL", value="virtual-data-analyst-pg.cyetm2yjzppu.us-west-1.rds.amazonaws.com")
-    with gr.Row():
-        sql_port = gr.Textbox(label="Port", value="5432")
-        sql_user = gr.Textbox(label="Username", value="postgres")
-        sql_pass = gr.Textbox(label="Password", value="Vda-1988", type="password")
-        sql_db_name = gr.Textbox(label="Database Name", value="dvdrental")
-    submit = gr.Button(value="Connect", variant="primary")
-    @gr.render(inputs=[sql_url, sql_port, sql_user, sql_pass, sql_db_name], triggers=[submit.click])
-    def sql_chat(request: gr.Request, url=sql_url.value, sql_port=sql_port.value, sql_user=sql_user.value, sql_pass=sql_pass.value, sql_db_name=sql_db_name.value):
-        """Render the SQL connection status and, on success, the chat interface.
-
-        Triggered by a click on the Connect button. The session's 'sql' entry in
-        message_dict is reset on every run, before the database is contacted.
-        """
-        # Make sure the session has a slot in message_dict, then clear its SQL entry.
-        message_dict.setdefault(request.session_hash, {})['sql'] = None
-        if not url:
-            return
-        print("SQL APP")
-        result = process_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, request.session_hash)
-        # result[1] is rendered as status HTML whether or not the connection succeeded.
-        gr.HTML(value=result[1], padding=False)
-        if result[0] != "success":
-            return
-        if "virtual-data-analyst-pg.cyetm2yjzppu.us-west-1.rds.amazonaws.com" in url:
-            # Fixed questions for the pre-filled demo host.
-            example_questions = [
-                ["Describe the dataset"],
-                ["What is the total revenue generated by each store?"],
-                ["Can you generate and display a bar chart of film category to number of films in that category?"],
-                ["Can you generate a pie chart showing the top 10 most rented films by revenue?"],
-                ["Can you generate a line chart of rental revenue over time?"],
-                ["What is the relationship between film length and rental frequency?"]
-            ]
-        else:
-            try:
-                generated = ast.literal_eval(example_question_generator(request.session_hash, 'sql', sql_db_name, result[2], ""))
-                example_questions = [["Describe the dataset"]] + [[question] for question in generated]
-            except Exception as e:
-                # Any failure while generating or parsing questions falls back to a generic set.
-                print("SQL QUESTION GENERATION ERROR")
-                print(e)
-                example_questions = [
-                    ["Describe the dataset"],
-                    ["List the tables in the database"],
-                    ["What could this data be used for?"],
-                ]
-        # Textboxes below are passed to chatbot_func as additional inputs; all but the tables box are hidden.
-        hash_box = gr.Textbox(visible=False, value=request.session_hash)
-        url_box = gr.Textbox(visible=False, value=url)
-        port_box = gr.Textbox(visible=False, value=sql_port)
-        user_box = gr.Textbox(visible=False, value=sql_user)
-        pass_box = gr.Textbox(visible=False, value=sql_pass)
-        name_box = gr.Textbox(visible=False, value=sql_db_name)
-        tables_box = gr.Textbox(value=result[2], interactive=False, label="SQL Tables")
-        source_box = gr.Textbox(visible=False, value='sql')
-        schema_box = gr.Textbox(visible=False, value='')
-        # Created with render=False and handed to the ChatInterface below.
-        chat_window = gr.Chatbot(type='messages', label="SQL DB Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
-        gr.ChatInterface(
-            fn=chatbot_func,
-            type='messages',
-            chatbot=chat_window,
-            title="Chat with your Database",
-            examples=example_questions,
-            concurrency_limit=None,
-            additional_inputs=[hash_box, source_box, tables_box, schema_box, url_box, port_box, user_box, pass_box, name_box],
-        )
-    def process_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash):
-        """Connect to the SQL database for this session and return the connector's result.
-
-        Args:
-            url: Database host handed to connect_sql_db.
-            sql_user: Username handed to connect_sql_db.
-            sql_port: Port handed to connect_sql_db.
-            sql_pass: Password handed to connect_sql_db.
-            sql_db_name: Database name handed to connect_sql_db.
-            session_hash: Identifier of the requesting session.
-
-        Returns:
-            Whatever connect_sql_db returns, unchanged.
-
-        Raises:
-            ValueError: If url is empty. The previous version fell through to
-                `return process_message` with the name never bound, which
-                surfaced as an UnboundLocalError.
-        """
-        if not url:
-            raise ValueError("A database URL is required to connect.")
-        return connect_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash)
-if __name__ == "__main__":
-    demo.launch()

tools.py ADDED Viewed

	@@ -0,0 +1,451 @@

+import sqlite3
+from utils import TEMP_DIR
+def tools_call(session_hash):
+    dir_path = TEMP_DIR / str(session_hash)
+    connection = sqlite3.connect(f'{dir_path}/data_source.db')
+    print("Querying Database in Tools.py");
+    cur=connection.execute('select * from data_source')
+    columns = [i[0] for i in cur.description]
+    print("COLUMNS 2")
+    print(columns)
+    cur.close()
+    connection.close()
+    column_string = (columns[:625] + '..') if len(columns) > 625 else columns
+    return [
+        {
+            "type": "function",
+            "function": {
+                "name": "sql_query_func",
+                "description": f"""This is a tool useful to query a SQLite table called 'data_source' with the following Columns: {column_string}.
+                There may also be more columns in the table if the number of columns is too large to process.
+                This function also saves the results of the query to csv file called query.csv.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "queries": {
+                            "type": "array",
+                            "description": "The query to use in the search. Infer this from the user's message. It should be a question or a statement",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["queries"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "scatter_chart_generation_func",
+                "description": f"""This is a scatter plot generation tool useful to generate scatter plots from queried data from our SQL table called 'data_source'.
+                The data values will come from the columns of our query.csv (the 'x' and 'y' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the scatter_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "x_column": {
+                            "type": "array",
+                            "description": f"""An array of strings that correspond to the the columns in our query.csv file that contain the x values of the graph. There can be more than one column
+                            that can each be plotted against the y_column, if needed.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "y_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the y values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "category": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contain a parameter that will define the category for the data.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "trendline": {
+                            "type": "string",
+                            "description": f"""An optional field to specify the type of plotly trendline we wish to use in the scatter plot.
+                             This trendline value can be one of ['ols','lowess','rolling','ewm','expanding'].
+                             Do not send any values outside of this array as the function will fail.
+                             Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "trendline_options": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'trendline_options' portion of the plotly chart generation.
+                            The 'lowess', 'rolling', and 'ewm' options require trendline_options to be included.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "marginal_x": {
+                            "type": "string",
+                            "description": f"""The type of marginal distribution plot we'd like to specify for the plotly scatter plot for the x axis.
+                             This marginal_x value can be one of ['histogram','rug','box','violin'].
+                             Do not send any values outside of this array as the function will fail.
+                             Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "marginal_y": {
+                            "type": "string",
+                            "description": f"""The type of marginal distribution plot we'd like to specify for the plotly scatter plot for the y axis.
+                             This marginal_y value can be one of ['histogram','rug','box','violin'].
+                             Do not send any values outside of this array as the function will fail.
+                             Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "size": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contain a parameter that will define the size of each plot point.
+                            This is useful for a bubble chart where another value in our query can be represented by the size of the plotted point.
+                            Values must be greater than or equal to 0 and so in our query, all values less than 0 should be set equal to zero.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["x_column","y_column"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "line_chart_generation_func",
+                "description": f"""This is a line chart generation tool useful to generate line charts from queried data from our SQL table called 'data_source'.
+                The data values will come from the columns of our query.csv (the 'x' and 'y' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the line_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "x_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the x values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "y_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the y values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "category": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contain a parameter that will define the category for the data.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["x_column","y_column","layout"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "bar_chart_generation_func",
+                "description": f"""This is a bar chart generation tool useful to generate line charts from queried data from our SQL table called 'data_source'.
+                The data values will come from the columns of our query.csv (the 'x' and 'y' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the bar_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "x_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contains the x values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "y_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contains the y values of the graph.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "category": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contains a parameter that will define the category for the data.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "facet_row": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contains a parameter that will define a faceted subplot, where different rows
+                            correspond to different values of the query specified in this parameter.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "facet_col": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contain a parameter that will define the faceted column, corresponding to
+                            different values of our query specified in this parameter.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["x_column","y_column","layout"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "pie_chart_generation_func",
+                "description": f"""This is a pie chart generation tool useful to generate pie charts from queried data from our SQL table called 'data_source'.
+                The data values will come from the columns of our query.csv (the 'values' and 'names' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the pie_chart_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "values": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the values of the pie chart.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "names": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contain the label or section of each piece of the pie graph and allow us to know what each piece of the pie chart represents.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["values","names","layout"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "histogram_generation_func",
+                "description": f"""This is a histogram generation tool useful to generate histograms from queried data from our SQL table called 'data_source'.
+                The data values will come from the columns of our query.csv (the 'values' and 'names' values of each graph) file but the layout section of the plotly dictionary objects will be generated by you.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the histogram_generation_func function in any way and always display the iframe fully to the user in the chat window. You can add your own text supplementary
+                to it for context if desired.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "data": {
+                            "type": "array",
+                            "description": """The array containing a dictionary that contains the 'data' portion of the plotly chart generation and will include the options requested by the user.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.
+                            Do not include the 'x' or 'y' portions of the object as this will come from the query.csv file generated by our SQLite query.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "x_column": {
+                            "type": "string",
+                            "description": f"""The column in our query.csv file that contains the x values of the histogram.
+                            This would correspond to the counts that would be distributed in the histogram.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "y_column": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contains the y values of the histogram.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "histnorm": {
+                            "type": "string",
+                            "description": f"""An optional argument to specify the type of normalization if the default isn't used.
+                            This histnorm value can be one of ['percent','probability','density','probability density'].
+                            Do not send any values outside of this array as the function will fail.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "category": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contains a parameter that will define the category for the data.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "histfunc": {
+                            "type": "string",
+                            "description": f"""An optional value that represents the function of data to compute the function which is used on the optional y column.
+                            This histfunc value can be one of ['avg','sum','count'].
+                             Do not send any values outside of this array as the function will fail.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """An array containing a dictionary that contains the 'layout' portion of the plotly chart generation.
+                            The array must contain a json formatted dictionary with outer brackets included, any other format will not work.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["x_column"],
+                },
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "table_generation_func",
+                "description": f"""This an table generation tool useful to format data as a table from queried data from our SQL table called 'data_source'.
+                Takes no parameters as it uses data queried in our query.csv file to build the table.
+                Call this function after running our SQLite query and generating query.csv.
+                Returns an iframe string which will be displayed inline in our chat window. Do not edit the iframe string returned
+                from the table_generation_func function in any way and always display the iframe fully to the user in the chat window.""",
+                "parameters": {},
+            },
+        },
+        {
+        "type": "function",
+            "function": {
+                "name": "regression_func",
+                "description": f"""This a tool to calculate regressions on our SQLite table called 'data_source'.
+                We can run queries with our 'sql_query_func' function and they will be available to use in this function via the query.csv file that is generated.
+                Returns a dictionary of values that includes a regression_summary and a regression chart (which is an iframe displaying the
+                linear regression in chart form and should be shown to the user).""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "independent_variables": {
+                            "type": "array",
+                            "description": f"""An array of strings that states the independent variables in our data set which should be column names in our query.csv file that is generated
+                            in the 'sql_query_func' function. This will allow us to identify the data to use for our independent variables.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "dependent_variable": {
+                            "type": "string",
+                            "description": f"""A string that states the dependent variables in our data set which should be a column name in our query.csv file that is generated
+                            in the 'sql_query_func' function. This will allow us to identify the data to use for our dependent variables.
+                            Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "category": {
+                            "type": "string",
+                            "description": f"""An optional column in our query.csv file that contain a parameter that will define the category for the data.
+                            Do not send value if no category is needed or specified. This category must be present in our query.csv file to be valid.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        }
+                    },
+                    "required": ["independent_variables","dependent_variable"],
+                },
+            },
+        }
+    ]

tools/__init__.py DELETED Viewed

File without changes

tools/chart_tools.py DELETED Viewed

@@ -1,308 +0,0 @@
-# Shared parameter snippets reused across chart tool schemas.
-# Update here to change the description everywhere at once.
-_LAYOUT_PARAM = {  # JSON-schema fragment referenced as the "layout" property by the chart tool schemas in this module
-    "type": "array",
-    "description": (  # adjacent string literals below are concatenated into a single description string
-        "Optional. An array containing a single JSON-formatted Plotly layout dictionary. "
-        "Use to set chart title, axis labels, colours, fonts, and other layout properties. "
-        "Example: [{\"title\": \"Monthly Sales\", \"xaxis\": {\"title\": \"Month\"}}]"
-    ),
-    "items": {"type": "string"},  # NOTE(review): items are declared as strings while the example shows a dict; presumably the dict is passed JSON-encoded, confirm against the handler
-}
-_TRACE_STYLE_PARAM = {  # JSON-schema fragment referenced as the "data" property by the scatter, line, bar, pie and histogram schemas
-    "type": "array",
-    "description": (  # adjacent string literals below are concatenated into a single description string
-        "Optional. An array containing a single JSON-formatted Plotly trace styling dictionary. "
-        "Use to control visual properties such as line colour, opacity, and marker style. "
-        "Do NOT include 'x', 'y', or 'type' keys — those are set automatically from query.csv."
-    ),
-    "items": {"type": "string"},  # NOTE(review): items are declared as strings; presumably the styling dict is passed JSON-encoded, confirm against the handler
-}
-chart_tool_schemas = [  # one dict per chart tool: "name", "description", and a JSON-schema "parameters" object
-    {
-        "name": "scatter_chart_generation_func",
-        "description": (
-            "Generates a Plotly scatter plot from query.csv data. "
-            "Use when the user wants to visualise the relationship between two numeric columns, "
-            "create a bubble chart (via the size parameter), or overlay a trendline. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "array",  # scatter is the only chart here whose x_column is a list; the others take a single string
-                    "description": (
-                        "One or more column names from query.csv to plot on the x-axis. "
-                        "Multiple columns produce multiple series, each plotted against y_column."
-                    ),
-                    "items": {"type": "string"},
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv to plot on the y-axis.",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to colour-code points by a categorical grouping.",
-                },
-                "trendline": {
-                    "type": "string",  # allowed values are listed only in the description; the schema declares no enum
-                    "description": (
-                        "Optional trendline type. One of: 'ols' (linear regression), "
-                        "'lowess' (local smoothing), 'rolling', 'ewm', 'expanding'. "
-                        "Requires trendline_options when using 'lowess', 'rolling', or 'ewm'."
-                    ),
-                },
-                "trendline_options": {
-                    "type": "array",
-                    "description": (
-                        "Required when trendline is 'lowess', 'rolling', or 'ewm'. "
-                        "An array containing a single JSON-formatted dict of trendline options "
-                        "(e.g. [{\"window\": 7}] for a 7-point rolling average)."
-                    ),
-                    "items": {"type": "string"},
-                },
-                "marginal_x": {
-                    "type": "string",
-                    "description": "Optional marginal distribution plot along the x-axis. One of: 'histogram', 'rug', 'box', 'violin'.",
-                },
-                "marginal_y": {
-                    "type": "string",
-                    "description": "Optional marginal distribution plot along the y-axis. One of: 'histogram', 'rug', 'box', 'violin'.",
-                },
-                "size": {
-                    "type": "string",
-                    "description": "Optional column name whose values control the size of each point (bubble chart). Negative values are clamped to zero.",
-                },
-                "data": _TRACE_STYLE_PARAM,  # the same dict object is referenced by several schemas (shared, not copied)
-                "layout": _LAYOUT_PARAM,  # the same dict object is referenced by several schemas (shared, not copied)
-            },
-            "required": ["x_column", "y_column"],
-        },
-    },
-    {
-        "name": "line_chart_generation_func",
-        "description": (
-            "Generates a Plotly line chart from query.csv data. "
-            "Use for trends over time or any ordered sequence. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the x-axis (typically a date or ordered index).",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the y-axis (numeric values).",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to split the data into multiple colour-coded lines.",
-                },
-                "data": _TRACE_STYLE_PARAM,
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["x_column", "y_column"],
-        },
-    },
-    {
-        "name": "bar_chart_generation_func",
-        "description": (
-            "Generates a Plotly bar chart from query.csv data. "
-            "Use for comparing values across categories. Supports grouped/stacked bars via category, "
-            "and faceted subplots via facet_row or facet_col. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the x-axis (category labels).",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the y-axis (numeric values).",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to colour-code bars into grouped or stacked series.",
-                },
-                "facet_row": {
-                    "type": "string",
-                    "description": "Optional column name. Creates one subplot row per unique value — useful for comparing distributions across a second dimension.",
-                },
-                "facet_col": {
-                    "type": "string",
-                    "description": "Optional column name. Creates one subplot column per unique value.",
-                },
-                "data": _TRACE_STYLE_PARAM,
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["x_column", "y_column"],
-        },
-    },
-    {
-        "name": "pie_chart_generation_func",
-        "description": (
-            "Generates a Plotly pie chart from query.csv data. "
-            "Use when the user wants to show part-to-whole proportions. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "values": {
-                    "type": "string",
-                    "description": "Column name from query.csv containing the numeric value for each slice.",
-                },
-                "names": {
-                    "type": "string",
-                    "description": "Column name from query.csv containing the label for each slice.",
-                },
-                "data": _TRACE_STYLE_PARAM,  # shared description mentions 'x'/'y' keys even though pie charts use values/names
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["values", "names"],
-        },
-    },
-    {
-        "name": "histogram_generation_func",
-        "description": (
-            "Generates a Plotly histogram from query.csv data. "
-            "Use to show the frequency distribution of a numeric column. "
-            "Supports normalisation (percent, probability, density) and aggregation functions per bin. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv whose values are binned on the x-axis.",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Optional column name aggregated per bin via histfunc (e.g. sum of sales per price bucket).",
-                },
-                "histnorm": {
-                    "type": "string",  # allowed values are listed only in the description; the schema declares no enum
-                    "description": "Optional normalisation. One of: 'percent', 'probability', 'density', 'probability density'.",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to overlay multiple colour-coded histograms.",
-                },
-                "histfunc": {
-                    "type": "string",  # allowed values are listed only in the description; the schema declares no enum
-                    "description": "Optional aggregation function applied to y_column per bin. One of: 'avg', 'sum', 'count'.",
-                },
-                "data": _TRACE_STYLE_PARAM,
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["x_column"],  # only the binned column is mandatory
-        },
-    },
-    {
-        "name": "box_chart_generation_func",
-        "description": (
-            "Generates a Plotly box plot from query.csv data. "
-            "Use to visualise the distribution of a numeric column and identify outliers. "
-            "Especially useful for comparing distributions across categories. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv containing the numeric values to distribute on the y-axis.",
-                },
-                "x_column": {
-                    "type": "string",
-                    "description": "Optional column name. Groups data into one box per unique value on the x-axis.",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to colour-code boxes by a secondary grouping.",
-                },
-                "layout": _LAYOUT_PARAM,  # unlike the charts above, no "data" trace-style property is offered here
-            },
-            "required": ["y_column"],
-        },
-    },
-    {
-        "name": "correlation_heatmap_func",
-        "description": (
-            "Computes pairwise Pearson correlations between numeric columns in query.csv and renders "
-            "the result as a colour-coded heatmap (blue = positive, red = negative). "
-            "Use when the user asks which variables are related, correlated, or associated with each other. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "columns": {
-                    "type": "array",
-                    "description": "Optional list of numeric column names to include in the matrix. If omitted, all numeric columns from query.csv are used. Avoid ID or index columns.",
-                    "items": {"type": "string"},
-                },
-            },
-            "required": [],  # the single property is optional
-        },
-    },
-    {
-        "name": "rolling_stats_func",
-        "description": (
-            "Generates a rolling statistics / moving average chart from query.csv data. "
-            "Overlays rolling aggregations (mean, std, min, max) on top of the original series. "
-            "Use when the user asks for a moving average, rolling average, rolling statistics, or wants to smooth a time series. "
-            "Returns an HTML iframe — display it verbatim in the chat."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the x-axis — typically a date or sequential index.",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv containing the numeric values to compute rolling stats on.",
-                },
-                "window": {
-                    "type": "integer",  # the default of 7 is stated only in the description, not as a schema default
-                    "description": "Rolling window size in number of rows. Default 7. Infer from the user's request.",
-                },
-                "stats": {
-                    "type": "array",
-                    "description": "Statistics to overlay. Valid values: 'mean', 'std', 'min', 'max'. Defaults to ['mean'] if omitted.",
-                    "items": {"type": "string"},
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name to group the data, producing separate rolling stat lines per group.",
-                },
-                "layout": _LAYOUT_PARAM,
-            },
-            "required": ["x_column", "y_column"],
-        },
-    },
-    {
-        "name": "table_generation_func",
-        "description": (
-            "Formats query.csv results as a styled HTML table. "
-            "Use when the user wants to view raw query results in a readable format, "
-            "or when result data is too large to describe in text. Displays up to 200 rows. "
-            "Returns an HTML table — display it verbatim in the chat."
-        ),
-        "parameters": {"type": "object", "properties": {}},  # empty object schema: the tool takes no arguments
-    },
-]

tools/stats_tools.py DELETED Viewed

@@ -1,130 +0,0 @@
-stats_tool_schemas = [  # one dict per statistics tool: "name", "description", and a JSON-schema "parameters" object
-    {
-        "name": "descriptive_stats_func",
-        "description": (
-            "Computes summary statistics for numeric columns in query.csv: "
-            "count, mean, std, min, 25th/50th/75th percentile, and max. "
-            "Use when the user asks for summary statistics, descriptive statistics, or a statistical overview. "
-            "Returns a formatted HTML table."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "columns": {
-                    "type": "array",
-                    "description": "Optional list of column names to include. If omitted, all numeric columns from query.csv are used. Avoid ID or index columns.",
-                    "items": {"type": "string"},
-                },
-            },
-            "required": [],  # the single property is optional
-        },
-    },
-    {
-        "name": "kmeans_clustering_func",
-        "description": (
-            "Runs K-Means clustering on numeric feature columns from query.csv. "
-            "Groups rows into k clusters, displays a scatter plot coloured by cluster assignment, "
-            "and returns a centroid summary table showing the mean of each feature per cluster. "
-            "Use when the user asks to cluster the data, find natural segments or groups, or apply K-Means. "
-            "Returns an HTML iframe and summary table."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "feature_columns": {
-                    "type": "array",
-                    "description": "List of numeric column names from query.csv to use as clustering features.",
-                    "items": {"type": "string"},
-                },
-                "x_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the x-axis of the scatter plot. Usually one of the feature columns.",
-                },
-                "y_column": {
-                    "type": "string",
-                    "description": "Column name from query.csv for the y-axis of the scatter plot. Usually one of the feature columns.",
-                },
-                "n_clusters": {
-                    "type": "integer",  # the default of 3 is stated only in the description, not as a schema default
-                    "description": "Number of clusters (k). Default 3. Infer from the user's request.",
-                },
-                "layout": {  # layout schema is spelled out inline in this list
-                    "type": "array",
-                    "description": "Optional. An array containing a single JSON-formatted Plotly layout dictionary.",
-                    "items": {"type": "string"},
-                },
-            },
-            "required": ["feature_columns", "x_column", "y_column"],
-        },
-    },
-    {
-        "name": "hypothesis_test_func",
-        "description": (
-            "Performs a statistical hypothesis test on query.csv data and returns a formatted results table "
-            "with test statistic, p-value, and significance at α=0.05. "
-            "Supported tests:\n"
-            "- 't_test_independent': compare means of a numeric column across two groups "
-            "(requires group_column; use group_values if the column has more than 2 unique values).\n"
-            "- 't_test_one_sample': test whether a column's mean equals a hypothesized value (requires pop_mean).\n"
-            "- 'chi_square': test independence between two categorical columns (requires column and column2)."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "test_type": {
-                    "type": "string",  # allowed values are listed only in the description; the schema declares no enum
-                    "description": "Test to run. One of: 't_test_independent', 't_test_one_sample', 'chi_square'.",
-                },
-                "column": {
-                    "type": "string",
-                    "description": "Primary column for the test. Numeric for t-tests; first categorical column for chi-square.",
-                },
-                "column2": {
-                    "type": "string",
-                    "description": "Second categorical column. Required for 'chi_square'.",
-                },
-                "group_column": {
-                    "type": "string",
-                    "description": "Grouping column. Required for 't_test_independent'. Must have exactly 2 unique values, or specify group_values.",
-                },
-                "group_values": {
-                    "type": "array",
-                    "description": "Exactly 2 group labels to compare. Use when group_column has more than 2 unique values.",
-                    "items": {"type": "string"},
-                },
-                "pop_mean": {
-                    "type": "number",
-                    "description": "Hypothesized population mean (μ₀). Required for 't_test_one_sample'.",
-                },
-            },
-            "required": ["test_type", "column"],  # per-test requirements (column2, group_column, pop_mean) are stated only in descriptions, not enforced here
-        },
-    },
-    {
-        "name": "regression_func",
-        "description": (
-            "Runs an OLS linear regression on query.csv data. "
-            "Use when the user wants to model the relationship between variables, assess predictors, or run a regression. "
-            "Returns a regression summary (coefficients, R², p-values) and a scatter plot with the fitted line as an HTML iframe."
-        ),
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "independent_variables": {
-                    "type": "array",
-                    "description": "Column names from query.csv to use as independent (predictor) variables.",
-                    "items": {"type": "string"},
-                },
-                "dependent_variable": {
-                    "type": "string",
-                    "description": "Column name from query.csv to use as the dependent (outcome) variable.",
-                },
-                "category": {
-                    "type": "string",
-                    "description": "Optional column name used to colour-code points and fit separate regression lines per group.",
-                },
-            },
-            "required": ["independent_variables", "dependent_variable"],
-        },
-    },
-]

tools/tools.py DELETED Viewed

@@ -1,130 +0,0 @@
-from .stats_tools import stats_tool_schemas
-from .chart_tools import chart_tool_schemas
-def tools_call(session_hash, data_source, titles):
-    from haystack.tools import Tool
-    _noop = lambda **kwargs: None
-    def make_tool(schema):
-        return Tool(
-            name=schema["name"],
-            description=schema["description"],
-            parameters=schema["parameters"],
-            function=_noop,
-        )
-    titles_string = (titles[:625] + '..') if len(titles) > 625 else titles
-    query_tool_schemas = {
-        'file_upload': {
-            "name": "query_func",
-            "description": f"""This is a tool useful to query a SQLite table called 'data_source' with the following Columns: {titles_string}.
-            There may also be more columns in the table if the number of columns is too large to process.
-            This function also saves the results of the query to csv file called query.csv.""",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "queries": {
-                        "type": "string",
-                        "description": "The query to use in the search. Infer this from the user's message. It should be a question or a statement."
-                    }
-                },
-                "required": ["queries"]
-            },
-        },
-        'sql': {
-            "name": "query_func",
-            "description": f"""This is a tool useful to query a PostgreSQL database with the following tables, {titles_string}.
-            There may also be more tables in the database if the number of tables is too large to process.
-            This function also saves the results of the query to csv file called query.csv.""",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "queries": {
-                        "type": "string",
-                        "description": "The PostgreSQL query to use in the search. Infer this from the user's message. It should be a question or a statement."
-                    }
-                },
-                "required": ["queries"]
-            },
-        },
-        'doc_db': {
-            "name": "query_func",
-            "description": f"""This is a tool useful to build an aggregation pipeline to query a MongoDB NoSQL document database with the following collections, {titles_string}.
-            There may also be more collections in the database if the number of collections is too large to process.
-            This function also saves the results of the query to a csv file called query.csv.""",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "queries": {
-                        "type": "string",
-                        "description": "The MongoDB aggregation pipeline to use in the search. Infer this from the user's message. It should be a question or a statement."
-                    },
-                    "db_collection": {
-                        "type": "string",
-                        "description": "The MongoDB collection to use in the search. Infer this from the user's message. It should be a question or a statement."
-                    }
-                },
-                "required": ["queries", "db_collection"]
-            },
-        },
-        'graphql': [
-            {
-                "name": "query_func",
-                "description": f"""This is a tool useful to build a GraphQL query for a GraphQL API endpoint with the following types, {titles_string}.
-                There may also be more types in the GraphQL endpoint if the number of types is too large to process.
-                This function also saves the results of the query to a csv file called query.csv.""",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "queries": {
-                            "type": "string",
-                            "description": "The GraphQL query to use in the search. Infer this from the user's message. It should be a question or a statement."
-                        }
-                    },
-                    "required": ["queries"]
-                },
-            },
-            {
-                "name": "graphql_schema_query",
-                "description": f"""This is a tool useful to query a GraphQL type and receive back information about its schema. This is useful because
-                the GraphQL introspection query is too large to be ingested all at once and this allows us to query the schema one type at a time to
-                view it in manageable bites. You may realize after viewing the schema, that the type you selected was not appropriate for the question
-                you are attempting answer. You may then query additional types to find the appropriate types to use for your GraphQL API query.""",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "graphql_type": {
-                            "type": "string",
-                            "description": "The GraphQL type that we want to view the schema of in order to make the proper query with our graphql_query_func. Infer this from the user's message. It should be a question or a statement."
-                        }
-                    },
-                    "required": ["graphql_type"]
-                },
-            },
-            {
-                "name": "graphql_csv_query",
-                "description": f"""This is a tool useful to SQL query our query.csv file that is generated from our GraphQL query. This is useful in a situation
-                where the results of the GraphQL query need additional querying to answer the user question. The query.csv file is converted to a Pandas dataframe
-                and we query that dataframe with SQL on a table called 'query' before converting it back to a csv file.""",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "csv_query": {
-                            "type": "string",
-                            "description": "The pandas dataframe SQL query to use in the search. The table that we query is named 'query'. Infer this from the user's message. It should be a question or a statement."
-                        }
-                    },
-                    "required": ["csv_query"]
-                },
-            },
-        ]
-    }
-    source_schemas = query_tool_schemas[data_source]
-    source_tools = [make_tool(s) for s in (source_schemas if isinstance(source_schemas, list) else [source_schemas])]
-    chart_tools = [make_tool(s) for s in chart_tool_schemas]
-    stats_tools = [make_tool(s) for s in stats_tool_schemas]
-    return source_tools + chart_tools + stats_tools

utils.py CHANGED Viewed

@@ -4,6 +4,4 @@ current_dir = Path(__file__).parent
 TEMP_DIR = current_dir / 'temp'
-message_dict = {}
-api_key_store = {}
-model_store = {}


4
5	TEMP_DIR = current_dir / 'temp'
6
7	+ message_dict = {}