Tamannathakur commited on
Commit
75bedb4
·
verified ·
1 Parent(s): de5ce45

Upload 7 files

Browse files
Files changed (7) hide show
  1. Sparknova.py +412 -0
  2. ai_agent.py +346 -0
  3. app.py +580 -0
  4. app_backup.py +682 -0
  5. data_engine.py +539 -0
  6. prompts.py +168 -0
  7. requirements.txt +14 -0
Sparknova.py ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import plotly.graph_objects as go
7
+ import plotly.express as px
8
+ import traceback
9
+ from io import BytesIO
10
+ from langchain_groq import ChatGroq
11
+ from prompts import ENHANCED_SYSTEM_PROMPT, SAMPLE_QUESTIONS, get_chart_prompt, validate_plot_spec, INSIGHTS_SYSTEM_PROMPT, get_insights_prompt
12
+
13
# Configuration.
# SECURITY: the Groq API key must come from the environment, never from a
# literal committed to source control (the previous hard-coded key is leaked
# and should be revoked).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Shared LLM client used by call_groq(); temperature 0.0 keeps the generated
# analysis plans deterministic.
llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model="llama-3.3-70b-versatile",
    temperature=0.0,
)

print("GROQ API initialized successfully")
22
+
23
def call_groq(messages):
    """Forward *messages* to the shared Groq client and return the reply text.

    Raises:
        RuntimeError: wrapping any error raised by the underlying API call.
    """
    try:
        reply = llm.invoke(messages)
        if hasattr(reply, "content"):
            return reply.content
        return str(reply)
    except Exception as exc:
        raise RuntimeError(f"GROQ API error: {exc}")
29
+
30
def parse_plan(raw_text):
    """Extract the JSON analysis plan embedded in an LLM reply.

    Markdown code fences are stripped, the outermost ``{...}`` span is parsed,
    and missing keys are filled with safe defaults.  Any failure yields an
    "error" plan whose narrative carries the exception text.
    """
    cleaned = raw_text.strip().replace("```json", "").replace("```", "").strip()
    defaults = {
        "type": "analysis",
        "operations": [],
        "plot": None,
        "narrative": "",
        "insights_needed": False,
    }
    try:
        body = cleaned[cleaned.index("{"):cleaned.rindex("}") + 1]
        plan = json.loads(body)
    except Exception as exc:
        return {
            "type": "error",
            "operations": [],
            "plot": None,
            "narrative": f"Error parsing JSON: {str(exc)}",
            "insights_needed": False,
        }
    for key, value in defaults.items():
        plan.setdefault(key, value)
    return plan
50
+
51
def clean_numeric(df):
    """Return a copy of *df* with text columns coerced to numbers where possible.

    For each object/string column, two passes are tried:
      * values like "12%" become fractions (12% -> 0.12) when more than half
        of the column parses as a percentage;
      * otherwise thousands separators and currency symbols (",", "₹", "$")
        are stripped and the column is converted when more than half parses.
    Columns that remain mostly non-numeric are left untouched.
    """
    df = df.copy()
    if len(df) == 0:
        # Guard: the >50% ratio checks below divide by len(df), which would
        # raise ZeroDivisionError on an empty frame.
        return df
    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]) or df[col].dtype == object:
            s = df[col].astype(str).str.strip()
            if s.str.contains("%", na=False).any():
                numeric_vals = pd.to_numeric(s.str.replace("%", "", regex=False), errors="coerce")
                if numeric_vals.notna().sum() / len(df) > 0.5:
                    df[col] = numeric_vals / 100.0
                    continue
            cleaned = (
                s.str.replace(",", "", regex=False)
                .str.replace("₹", "", regex=False)
                .str.replace("$", "", regex=False)
            )
            numeric_vals = pd.to_numeric(cleaned, errors="coerce")
            if numeric_vals.notna().sum() / len(df) > 0.5:
                df[col] = numeric_vals
    return df
66
+
67
def generate_insights(df, dfw, plan, plot_created):
    """Ask the LLM for a narrative insights summary of an executed plan.

    Args:
        df: the original DataFrame (used for describe statistics).
        dfw: the working/result DataFrame produced by execute_plan.
        plan: the parsed analysis-plan dict.
        plot_created: whether a chart was rendered for this plan.

    Returns the LLM's insight text, or a short fallback message if the API
    call fails.
    """
    # Collect statistical context the LLM should ground its insights in.
    context_parts = []
    for op in plan.get("operations", []):
        if op.get("op") == "describe":
            cols = op.get("columns", [])
            for col in cols:
                if col in df.columns:
                    desc = df[col].describe()
                    context_parts.append(f"\n{col} Statistics:\n{desc.to_string()}")
        elif op.get("op") == "groupby":
            context_parts.append(f"\nGrouped Results:\n{dfw.head(10).to_string()}")

    # Mention the chart so the model can reference what the user sees.
    plot_spec = plan.get("plot")
    if plot_created and plot_spec:
        context_parts.append(f"\nChart Type: {plot_spec.get('type')}")
        context_parts.append(f"Visualization: {plot_spec.get('title')}")

    if len(dfw) > 0:
        context_parts.append(f"\nResult Preview:\n{dfw.head(10).to_string()}")

    insights_prompt = get_insights_prompt(context_parts, plan.get('narrative', ''))

    try:
        insights_response = call_groq([
            {"role": "system", "content": INSIGHTS_SYSTEM_PROMPT},
            {"role": "user", "content": insights_prompt}
        ])
        return insights_response.strip()
    except Exception as e:
        # Best-effort: insight generation must never break the session.
        return f"Analysis completed successfully\n{len(dfw)} records in result\nError generating detailed insights: {str(e)}"
97
+
98
def execute_plan(df, plan):
    """Run the plan's operations against *df* and optionally render a chart.

    Supported operations: describe, groupby, filter (pandas query expr),
    calculate (pandas eval expr).  Pie charts are rendered with Plotly
    (returned as HTML); bar/line/hist/scatter with matplotlib (returned as
    PNG bytes).

    Args:
        df: source DataFrame (never mutated; a copy is transformed).
        plan: parsed plan dict with "operations" and optional "plot" spec.

    Returns:
        (result_df, png_bytes_or_None, plotly_html_or_None, describe_stats)

    Raises:
        Re-raises any execution failure after logging the traceback.
    """

    def _current_fig_as_png():
        # Serialize the active matplotlib figure to PNG bytes and release it.
        # Extracted helper: this block was previously duplicated verbatim in
        # the bar, line, hist and scatter branches.
        buf = BytesIO()
        plt.savefig(buf, format="png", dpi=150, bbox_inches="tight")
        buf.seek(0)
        data = buf.read()
        plt.close()
        return data

    dfw = df.copy()
    plot_bytes = None
    plot_html = None
    describe_stats = {}

    try:
        for op in plan.get("operations", []):
            optype = op.get("op", "").lower()
            if optype == "describe":
                for col in op.get("columns", []):
                    if col in dfw.columns:
                        stats = dfw[col].describe()
                        describe_stats[col] = stats
                        print(f"Described {col}")
                        print(f"\n{stats}\n")

            elif optype == "groupby":
                cols = op.get("columns", [])
                agg_col = op.get("agg_col")
                agg_func = op.get("agg_func", "count")

                if not cols:
                    raise ValueError("No columns specified for groupby")

                if agg_func == "count" or not agg_col:
                    dfw = dfw.groupby(cols).size().reset_index(name="count")
                    print(f"Grouped by {cols} with count")
                else:
                    if agg_col not in dfw.columns:
                        raise ValueError(f"Column '{agg_col}' not found for aggregation")
                    result_col = f"{agg_func}_{agg_col}"
                    dfw = dfw.groupby(cols)[agg_col].agg(agg_func).reset_index(name=result_col)
                    print(f"Grouped by {cols}, calculated {agg_func} of {agg_col}")

            elif optype == "filter":
                expr = op.get("expr", "")
                if expr:
                    dfw = dfw.query(expr)
                    print(f"Filter applied: {expr}")

            elif optype == "calculate":
                expr = op.get("expr", "")
                new_col = op.get("new_col", "Calculated")
                dfw[new_col] = dfw.eval(expr)
                print(f"Calculated {new_col} = {expr}")

        plot_spec = plan.get("plot")
        if plot_spec:  # the old "and plot_spec is not None" was redundant
            ptype = plot_spec.get("type", "bar")
            x = plot_spec.get("x")
            y = plot_spec.get("y")
            title = plot_spec.get("title", "Chart")

            # When describe stats were computed, chart the original frame;
            # otherwise chart the transformed working frame.
            plot_df = df if describe_stats else dfw

            # Default axes: first categorical column for x, first numeric for y.
            if not x and len(plot_df.columns) > 0:
                categorical_cols = plot_df.select_dtypes(include=['object', 'category']).columns
                x = categorical_cols[0] if len(categorical_cols) > 0 else plot_df.columns[0]
            if not y:
                numeric_cols = plot_df.select_dtypes(include=[np.number]).columns
                y = numeric_cols[0] if len(numeric_cols) > 0 else None

            if not y:
                print("No suitable Y column found for plotting.")
            elif ptype == "pie":
                if x and x in plot_df.columns:
                    value_counts = plot_df[x].value_counts()
                    fig = go.Figure(data=[go.Pie(
                        labels=value_counts.index,
                        values=value_counts.values,
                        hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>',
                        textposition='auto',
                        hole=0.3
                    )])
                else:
                    df_pie = plot_df[y].value_counts()
                    fig = go.Figure(data=[go.Pie(
                        labels=df_pie.index,
                        values=df_pie.values,
                        hole=0.3
                    )])

                fig.update_layout(
                    title=title,
                    title_font_size=16,
                    showlegend=True,
                    width=950,
                    height=550
                )
                plot_html = fig.to_html(include_plotlyjs='cdn')
                print("Enhanced pie chart generated")

            elif ptype == "bar":
                fig, ax = plt.subplots(figsize=(12, 7))
                if x and x in plot_df.columns and y and y in plot_df.columns:
                    plot_df.plot.bar(x=x, y=y, ax=ax, legend=False, color='steelblue', edgecolor='black', alpha=0.8)
                    ax.set_xlabel(x, fontsize=12, fontweight='bold')
                    # Rotate labels progressively as category count grows.
                    n_categories = len(plot_df[x].unique())
                    if n_categories > 10:
                        plt.xticks(rotation=90, ha='right', fontsize=9)
                    elif n_categories > 5:
                        plt.xticks(rotation=45, ha='right', fontsize=10)
                    else:
                        plt.xticks(rotation=0, fontsize=10)
                else:
                    plot_df[y].plot.bar(ax=ax, color='steelblue', edgecolor='black', alpha=0.8)
                ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
                ax.set_ylabel(y, fontsize=12, fontweight='bold')
                ax.grid(axis='y', alpha=0.3, linestyle='--')
                plt.tight_layout()
                plot_bytes = _current_fig_as_png()
                print("Enhanced bar chart generated")

            elif ptype == "line":
                fig, ax = plt.subplots(figsize=(12, 7))
                if x and x in plot_df.columns and y and y in plot_df.columns:
                    plot_df.plot.line(x=x, y=y, ax=ax, marker="o", linewidth=3,
                                      markersize=8, color='darkblue', alpha=0.8)
                    ax.set_xlabel(x, fontsize=12, fontweight='bold')
                    if len(plot_df) > 15:
                        plt.xticks(rotation=45, ha='right', fontsize=9)
                    else:
                        plt.xticks(rotation=0, fontsize=10)
                else:
                    plot_df[y].plot.line(ax=ax, marker="o", linewidth=3,
                                         markersize=8, color='darkblue', alpha=0.8)
                ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
                ax.set_ylabel(y, fontsize=12, fontweight='bold')
                ax.grid(True, alpha=0.3, linestyle='--')
                plt.tight_layout()
                plot_bytes = _current_fig_as_png()
                print("Enhanced line chart generated")

            elif ptype == "hist":
                fig, ax = plt.subplots(figsize=(11, 7))
                plot_df[y].dropna().plot.hist(ax=ax, bins=25, edgecolor='black',
                                              alpha=0.7, color='teal')
                ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
                ax.set_xlabel(y, fontsize=12, fontweight='bold')
                ax.set_ylabel("Frequency", fontsize=12, fontweight='bold')
                ax.grid(axis='y', alpha=0.3, linestyle='--')
                plt.tight_layout()
                plot_bytes = _current_fig_as_png()
                print("Enhanced histogram generated")

            elif ptype == "scatter":
                fig, ax = plt.subplots(figsize=(11, 7))
                if x and x in plot_df.columns and y and y in plot_df.columns:
                    plot_df.plot.scatter(x=x, y=y, ax=ax, alpha=0.6, s=60, color='purple')
                    ax.set_xlabel(x, fontsize=12, fontweight='bold')
                    ax.set_ylabel(y, fontsize=12, fontweight='bold')
                ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
                ax.grid(True, alpha=0.3, linestyle='--')
                plt.tight_layout()
                plot_bytes = _current_fig_as_png()
                print("Enhanced scatter plot generated")

        return dfw, plot_bytes, plot_html, describe_stats

    except Exception as e:
        print(f"EXECUTION ERROR: {e}")
        traceback.print_exc()
        raise
295
+
296
def make_context(df):
    """Build a compact text summary of *df* for inclusion in an LLM prompt."""
    preview = df.head(3).to_string(max_cols=10, max_colwidth=20)
    lines = [
        f"Dataset: {len(df)} rows, {len(df.columns)} columns",
        f"Columns: {', '.join(df.columns)}",
        f"Data types: {df.dtypes.value_counts().to_dict()}",
        "Sample data:",
        preview,
    ]
    return "\n".join(lines)
303
+
304
def load_file(file_path):
    """Load a CSV or Excel file into a DataFrame.

    The extension check is case-insensitive, so files like "DATA.CSV" or
    "report.XLSX" load correctly (the previous check rejected them).

    Raises:
        ValueError: for any unsupported extension.
    """
    lowered = file_path.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(file_path)
    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(file_path)
    raise ValueError("Unsupported file format. Please use CSV or Excel files.")
311
+
312
def start_agent():
    """Interactive CLI loop: load a dataset, then answer questions about it.

    In-loop commands: 'exit'/'quit' ends the session, 'reload' discards the
    current dataset and prompts for a new file path.
    """
    print("=" * 80)
    print("SparkNova v5.0 – Advanced Data Analysis & Visualization")
    print("=" * 80)

    df = None  # current dataset; None means "prompt for a file next"

    while True:
        if df is None:
            file_path = input("\nEnter file path (CSV or Excel): ").strip()
            if not file_path:
                continue

            try:
                df = load_file(file_path)
                df = clean_numeric(df)
                print(f"Loaded {file_path} ({len(df)} rows × {len(df.columns)} cols)")
                print("\nFirst 5 rows:")
                print(df.head())
                print(f"\nColumn types:\n{df.dtypes}")

                print("\nSample Questions You Can Ask:")
                for i, question in enumerate(SAMPLE_QUESTIONS[:8], 1):
                    print(f"{i}. {question}")

                # NOTE(review): data_ctx is built but never referenced below —
                # confirm whether it was meant to feed the prompt.
                data_ctx = make_context(df)
            except Exception as e:
                print(f"Error loading file: {e}")
                continue

        q = input("\nYour question (or 'exit'/'reload'): ").strip()
        if not q:
            continue
        if q.lower() in ("exit", "quit"):
            print("Thank you for using SparkNova!")
            break
        if q.lower() == "reload":
            df = None
            continue

        # Ask the LLM for a structured analysis plan for this question.
        enhanced_prompt = get_chart_prompt(q, df.columns.tolist(), df.head(3).to_string())

        try:
            raw = call_groq([
                {"role": "system", "content": ENHANCED_SYSTEM_PROMPT},
                {"role": "user", "content": enhanced_prompt}
            ])
        except Exception as e:
            print(f"LLM call failed: {e}")
            continue

        plan = parse_plan(raw)

        # Pure explanations and parse errors need no execution.
        if plan.get("type") == "explain":
            print("\nExplanation:")
            print(plan.get("narrative", ""))
            continue

        if plan.get("type") == "error":
            print("\nError:")
            print(plan.get("narrative", ""))
            continue

        print("\nAnalysis Plan:")
        print(json.dumps(plan, indent=2))

        # Sanitize the chart spec against the columns that actually exist.
        if plan.get("plot"):
            plan["plot"] = validate_plot_spec(plan["plot"], df.columns.tolist())

        try:
            print("\nExecuting operations...")
            res, plot_img, plot_html, desc_stats = execute_plan(df, plan)

            # Print the result table unless describe stats were produced and
            # the frame is unchanged (describe already printed its output).
            if not desc_stats or len(res) != len(df):
                print("\nResult:")
                print(res.head(20))

            # Persist whichever chart variant was produced.
            if plot_html:
                print("\nGenerated Interactive Chart (HTML saved as chart.html)")
                with open("chart.html", "w") as f:
                    f.write(plot_html)
            elif plot_img:
                print("\nGenerated Chart (saved as chart.png)")
                with open("chart.png", "wb") as f:
                    f.write(plot_img)

            narrative = plan.get("narrative", "")
            if narrative:
                print(f"\nSummary: {narrative}")

            # Deeper LLM insights only when requested and a chart exists.
            if plan.get("insights_needed") and (plot_html or plot_img):
                print("\nDetailed Insights:")
                insights = generate_insights(df, res, plan, True)
                print(insights)

        except Exception as e:
            print(f"Execution failed: {e}")
            continue

if __name__ == "__main__":
    start_agent()
ai_agent.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ import numpy as np
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ from langchain_groq import ChatGroq
8
+ from prompts import ENHANCED_SYSTEM_PROMPT, get_chart_prompt, validate_plot_spec, INSIGHTS_SYSTEM_PROMPT, get_insights_prompt
9
+
10
# SECURITY: read the key from the environment — the previous hard-coded
# literal is a leaked secret and should be revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
11
+
12
def initialize_llm():
    """Create the ChatGroq client, or return None if initialization fails.

    Callers (analyze_question) treat a None return as "API unavailable".
    """
    try:
        os.environ["GROQ_API_KEY"] = GROQ_API_KEY
        return ChatGroq(model="llama-3.3-70b-versatile", api_key=GROQ_API_KEY, temperature=0.0)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; Exception keeps the intended "best effort" behavior.
        return None
19
+
20
def parse_plan(raw_text):
    """Parse the LLM reply into a plan dict, defaulting any missing fields.

    Strips markdown fences, parses the outermost ``{...}`` span, and returns
    an "error"-typed plan (narrative = exception message) on any failure.
    """
    stripped = raw_text.strip().replace("```json", "").replace("```", "").strip()
    try:
        first = stripped.index("{")
        last = stripped.rindex("}") + 1
        plan = json.loads(stripped[first:last])
    except Exception as exc:
        return {
            "type": "error",
            "operations": [],
            "plot": None,
            "narrative": f"Error parsing response: {str(exc)}",
            "insights_needed": False,
        }
    plan.setdefault("type", "analysis")
    plan.setdefault("operations", [])
    plan.setdefault("plot", None)
    plan.setdefault("narrative", "")
    plan.setdefault("insights_needed", False)
    return plan
40
+
41
def execute_plan(df, plan):
    """Apply the plan's operations to a copy of *df*.

    Supported operations: describe, groupby, filter (query expr with a
    column/value fallback), calculate (eval expr with a std/mean fallback),
    and count (per-value or value_counts).

    Returns:
        (result_df, describe_stats) where describe_stats maps stat keys
        ("<col>", "count_<col>_<value>", "values_<col>") to their results.

    Raises:
        Exception: wrapping any unexpected failure with an
            "Execution error: ..." message.
    """

    def _filter_by_value(frame, column, value):
        # Shared fallback filter: case-insensitive substring match for text
        # columns, exact equality otherwise.  Previously duplicated verbatim
        # in two branches of the "filter" handler.
        if frame[column].dtype == 'object':
            return frame[frame[column].str.contains(str(value), case=False, na=False)]
        return frame[frame[column] == value]

    dfw = df.copy()
    describe_stats = {}

    try:
        for op in plan.get("operations", []):
            optype = op.get("op", "").lower()

            if optype == "describe":
                for col in op.get("columns", []):
                    if col in dfw.columns:
                        describe_stats[col] = dfw[col].describe()

            elif optype == "groupby":
                cols = op.get("columns", [])
                agg_col = op.get("agg_col")
                agg_func = op.get("agg_func", "count")

                if cols and all(c in dfw.columns for c in cols):
                    if agg_func == "count" or not agg_col:
                        dfw = dfw.groupby(cols).size().reset_index(name="count")
                    elif agg_col in dfw.columns:
                        result_col = f"{agg_func}_{agg_col}"
                        dfw = dfw.groupby(cols)[agg_col].agg(agg_func).reset_index(name=result_col)

            elif optype == "filter":
                expr = op.get("expr", "")
                column = op.get("column")
                value = op.get("value")

                if expr:
                    try:
                        dfw = dfw.query(expr)
                    except Exception:
                        # Query expression failed; fall back to simple
                        # column/value matching when both were supplied.
                        if column and column in dfw.columns and value:
                            dfw = _filter_by_value(dfw, column, value)
                elif column and column in dfw.columns and value:
                    dfw = _filter_by_value(dfw, column, value)

            elif optype == "calculate":
                expr = op.get("expr", "")
                new_col = op.get("new_col", "Calculated")
                if expr:
                    try:
                        dfw[new_col] = dfw.eval(expr)
                    except Exception:
                        # eval failed: best-effort std/mean of the first
                        # numeric column whose name appears in the expression.
                        if "std" in expr:
                            for col in dfw.select_dtypes(include=[np.number]).columns:
                                if col in expr:
                                    dfw[new_col] = dfw[col].std()
                                    break
                        elif "mean" in expr:
                            for col in dfw.select_dtypes(include=[np.number]).columns:
                                if col in expr:
                                    dfw[new_col] = dfw[col].mean()
                                    break

            elif optype == "count":
                column = op.get("column")
                value = op.get("value")
                if column and column in dfw.columns:
                    if value:
                        # Handle both string and numeric columns dynamically.
                        if dfw[column].dtype == 'object':
                            count_result = dfw[column].str.contains(str(value), case=False, na=False).sum()
                        else:
                            count_result = (dfw[column] == value).sum()
                        describe_stats[f"count_{column}_{value}"] = count_result
                    else:
                        # No target value: report every unique value's count.
                        describe_stats[f"values_{column}"] = dfw[column].value_counts()

        return dfw, describe_stats
    except Exception as e:
        raise Exception(f"Execution error: {str(e)}")
129
+
130
def create_chart(df, selected_columns=None, chart_type="bar", title=None):
    """Best-effort Plotly chart builder used as a fallback visualization.

    Priority order:
      1. two+ selected columns -> x/y chart of *chart_type* (scatter/line/bar);
      2. one selected column  -> histogram (numeric) or top-10 bar (text);
      3. otherwise an automatic choice based on the frame's column dtypes.

    Returns a Plotly figure, or None when no chart can be built or any
    error occurs (chart creation is intentionally non-fatal).
    """
    try:
        if selected_columns and len(selected_columns) >= 2:
            x_col, y_col = selected_columns[0], selected_columns[1]
            if x_col in df.columns and y_col in df.columns:
                if chart_type == "scatter":
                    fig = px.scatter(df.head(100), x=x_col, y=y_col, title=title or f"{y_col} vs {x_col}")
                elif chart_type == "line":
                    fig = px.line(df.head(50), x=x_col, y=y_col, title=title or f"{y_col} over {x_col}", markers=True)
                else:
                    fig = px.bar(df.head(50), x=x_col, y=y_col, title=title or f"{y_col} by {x_col}")
                fig.update_layout(width=900, height=500)
                return fig
        if selected_columns and len(selected_columns) == 1:
            col = selected_columns[0]
            if col in df.columns:
                if pd.api.types.is_numeric_dtype(df[col]):
                    fig = px.histogram(df, x=col, title=f"Distribution of {col}")
                else:
                    value_counts = df[col].value_counts().head(10)
                    fig = px.bar(x=value_counts.index, y=value_counts.values, title=f"Top Values in {col}")
                fig.update_layout(width=900, height=500)
                return fig
        # No usable selection: infer a sensible default chart from dtypes.
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 1:
            fig = px.scatter(df.head(100), x=numeric_cols[0], y=numeric_cols[1], title=f"{numeric_cols[1]} vs {numeric_cols[0]}")
        elif len(categorical_cols) > 0 and len(numeric_cols) > 0:
            fig = px.bar(df.head(50), x=categorical_cols[0], y=numeric_cols[0], title=f"{numeric_cols[0]} by {categorical_cols[0]}")
        elif len(categorical_cols) > 0:
            value_counts = df[categorical_cols[0]].value_counts().head(10)
            fig = px.pie(values=value_counts.values, names=value_counts.index, title=f"Distribution of {categorical_cols[0]}")
        elif len(numeric_cols) > 0:
            fig = px.histogram(df, x=numeric_cols[0], title=f"Distribution of {numeric_cols[0]}")
        else:
            return None
        fig.update_layout(width=900, height=500)
        return fig
    except:
        # Deliberate catch-all: a chart is optional, so any failure means
        # "no chart" rather than an error surfaced to the user.
        return None
170
+
171
def create_plot(df, dfw, plan, describe_stats, selected_columns=None):
    """Render the chart requested by the plan's "plot" spec as a Plotly figure.

    Missing x/y fields are defaulted to the first categorical / numeric
    column.  When describe stats were computed, the original *df* is charted;
    otherwise the transformed working frame *dfw* is used.

    Returns None when there is no spec, the referenced columns are absent,
    or rendering fails (plotting is best-effort).
    """
    plot_spec = plan.get("plot")
    if not plot_spec:
        return None
    ptype = plot_spec.get("type", "bar")
    title = plot_spec.get("title", "Chart")
    plot_df = df if describe_stats else dfw
    x = plot_spec.get("x")
    y = plot_spec.get("y")

    # Default axes: first categorical column for x, first numeric for y.
    if not x and len(plot_df.columns) > 0:
        categorical_cols = plot_df.select_dtypes(include=['object', 'category']).columns
        x = categorical_cols[0] if len(categorical_cols) > 0 else plot_df.columns[0]
    if not y:
        numeric_cols = plot_df.select_dtypes(include=[np.number]).columns
        y = numeric_cols[0] if len(numeric_cols) > 0 else None

    try:
        if ptype == "pie" and x and x in plot_df.columns:
            value_counts = plot_df[x].value_counts()
            fig = go.Figure(data=[go.Pie(labels=value_counts.index, values=value_counts.values, hole=0.3)])
            fig.update_layout(title=title, width=900, height=500)
            return fig
        elif ptype == "bar" and x and x in plot_df.columns and y and y in plot_df.columns:
            fig = px.bar(plot_df, x=x, y=y, title=title)
            fig.update_layout(width=900, height=500)
            return fig
        elif ptype == "line" and x and x in plot_df.columns and y and y in plot_df.columns:
            fig = px.line(plot_df, x=x, y=y, title=title, markers=True)
            fig.update_layout(width=900, height=500)
            return fig
        elif ptype == "hist" and y and y in plot_df.columns:
            fig = px.histogram(plot_df, x=y, title=title, nbins=30)
            fig.update_layout(width=900, height=500)
            return fig
        elif ptype == "scatter" and x and x in plot_df.columns and y and y in plot_df.columns:
            fig = px.scatter(plot_df, x=x, y=y, title=title)
            fig.update_layout(width=900, height=500)
            return fig
    except:
        # Deliberate catch-all: fall through to "no figure" on any failure.
        pass
    return None
213
+
214
def generate_insights(df, dfw, plan, llm):
    """Ask *llm* for narrative insights about an executed plan.

    Context is built from describe statistics and grouped results, then sent
    alongside the insights system prompt.  Any failure is reported as an
    error string rather than raised.
    """
    try:
        context_parts = []
        for op in plan.get("operations", []):
            op_name = op.get("op")
            if op_name == "describe":
                for col in op.get("columns", []):
                    if col in df.columns:
                        summary = df[col].describe()
                        context_parts.append(f"\n{col} Statistics:\n{summary.to_string()}")
            elif op_name == "groupby":
                context_parts.append(f"\nGrouped Results:\n{dfw.head(10).to_string()}")

        prompt = get_insights_prompt(context_parts, plan.get('narrative', ''))
        reply = llm.invoke([
            {"role": "system", "content": INSIGHTS_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ])

        if hasattr(reply, 'content'):
            return reply.content
        return str(reply)
    except Exception as e:
        return f"Error generating insights: {str(e)}"
237
+
238
def analyze_question(question, selected_columns, uploaded_df, llm):
    """End-to-end Q&A pipeline: plan with the LLM, execute, chart, summarize.

    Args:
        question: the user's natural-language question.
        selected_columns: optional column subset to restrict the analysis to.
        uploaded_df: the loaded DataFrame, or None if nothing was uploaded.
        llm: client from initialize_llm(), or None if unavailable.

    Returns:
        (response_text, plotly_figure_or_None, result_table_or_None).
    """
    # Guard clauses: each precondition failure returns a user-facing message.
    if llm is None:
        return "API not initialized. Please restart.", None, None

    if uploaded_df is None:
        return "Please upload a dataset first.", None, None

    if not question.strip():
        return "Please enter a question.", None, None

    try:
        df_to_analyze = uploaded_df[selected_columns] if selected_columns else uploaded_df

        sample_data = df_to_analyze.head(3).to_string(max_cols=10, max_colwidth=20)

        # NOTE(review): column_context is built but no longer used — the
        # data_ctx block that consumed it was commented out.
        if selected_columns:
            column_context = f"Selected columns for analysis: {', '.join(selected_columns)}\n"
        else:
            column_context = ""

        enhanced_prompt = get_chart_prompt(question, df_to_analyze.columns.tolist(), sample_data)

        messages = [
            {"role": "system", "content": ENHANCED_SYSTEM_PROMPT},
            {"role": "user", "content": enhanced_prompt}
        ]

        response = llm.invoke(messages)
        raw_text = response.content if hasattr(response, 'content') else str(response)

        plan = parse_plan(raw_text)

        # Pure explanations and parse errors short-circuit before execution.
        if plan.get("type") == "explain":
            return plan.get("narrative", "No explanation provided"), None, None

        if plan.get("type") == "error":
            return plan.get("narrative", "Error occurred"), None, None

        # Sanitize the chart spec against columns that actually exist.
        if plan.get("plot"):
            plan["plot"] = validate_plot_spec(plan["plot"], df_to_analyze.columns.tolist())

        dfw, describe_stats = execute_plan(df_to_analyze, plan)

        # Only use narrative for explain type, avoid hallucinations for data operations
        has_data_operations = any(col.startswith(("count_", "values_")) for col in describe_stats.keys()) if describe_stats else False
        has_filtered_data = len(dfw) != len(df_to_analyze)

        if has_data_operations:
            response_text = "Analysis completed."
        elif has_filtered_data:
            response_text = f"Filter applied. Found {len(dfw)} matching rows out of {len(df_to_analyze)} total rows."
        else:
            response_text = plan.get("narrative", "Analysis complete")

        # Format each recorded statistic by its key convention.
        if describe_stats:
            response_text += "\n\nResults:\n"
            for col, stats in describe_stats.items():
                if col.startswith("count_"):
                    # Key format is "count_<column>_<value>"; split it back
                    # out for a readable display line.
                    parts = col.replace('count_', '').split('_', 1)
                    if len(parts) == 2:
                        column_name, value_name = parts
                        response_text += f"\nCount of '{value_name}' in {column_name}: {int(stats)}\n"
                    else:
                        response_text += f"\n{col}: {int(stats) if isinstance(stats, (int, float, np.integer)) else stats}\n"
                elif col.startswith("values_"):
                    # value_counts result: show every value with its count.
                    column_name = col.replace('values_', '')
                    if hasattr(stats, 'to_string'):
                        response_text += f"\nAll values in {column_name}:\n{stats.to_string()}\n"
                    else:
                        response_text += f"\nAll values in {column_name}: {stats}\n"
                else:
                    if hasattr(stats, 'to_string'):
                        response_text += f"\n{col}:\n{stats.to_string()}\n"
                    else:
                        response_text += f"\n{col}: {stats}\n"

        # Chart: plan-driven first, generic fallback second.
        fig = None
        if plan.get("plot"):
            fig = create_plot(df_to_analyze, dfw, plan, describe_stats, selected_columns)

        if fig is None:
            fig = create_chart(df_to_analyze, selected_columns)

        if fig:
            response_text += "\n\nChart generated successfully!"
            if selected_columns and len(selected_columns) >= 1:
                response_text += f"\nUsing selected columns: {', '.join(selected_columns)}"

        if plan.get("insights_needed") and fig:
            insights = generate_insights(df_to_analyze, dfw, plan, llm)
            response_text += f"\n\nKey Insights:\n{insights}"

        # Return a preview table only when the frame changed or when no
        # describe stats already summarize the result.
        result_table = None
        if len(dfw) != len(df_to_analyze):
            result_table = dfw.head(50)
        elif not describe_stats and len(dfw) > 0:
            result_table = dfw.head(50)

        return response_text, fig, result_table

    except Exception as e:
        return f"Error during analysis: {str(e)}", None, None
app.py ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ import base64
5
+
6
+ from data_engine import (
7
+ clean_numeric, run_analysis, create_visualization, handle_missing_data,
8
+ undo_last_change, undo_all_changes, download_dataset,
9
+ display_data_format, display_text_format
10
+ )
11
# Prefer the real AI agent; fall back to inert stubs so the UI still launches
# when its dependencies (e.g. langchain_groq) are missing.
try:
    from ai_agent import initialize_llm, analyze_question
except (ImportError, RuntimeError) as e:
    print(f"Warning: Full AI agent not available: {e}")
    def initialize_llm():
        # Stub: signals "no LLM available" to callers.
        return None
    def analyze_question(question, columns, df, llm):
        # Stub: mirrors the real signature but only reports unavailability.
        return "AI agent not available. Please check dependencies.", None, None
19
+ from prompts import SAMPLE_QUESTIONS
20
+
21
# Mutable module-level state shared across the Gradio UI callbacks.
llm = None
uploaded_df = None
original_df = None
dataset_name = None
change_history = []

# Logo asset resolved relative to the working directory the app starts in.
logo_path = os.path.join(os.getcwd(), "public/main-logo.png")
28
+
29
def embed_image_base64(path):
    """Return the PNG at *path* as a data-URI string for inline HTML use."""
    with open(path, "rb") as img:
        encoded = base64.b64encode(img.read()).decode()
    return "data:image/png;base64," + encoded
32
+
33
+ logo_b64 = embed_image_base64(logo_path)
34
+ with open("public/style.css") as f:
35
+ css = f.read()
36
+
37
+ # Base + custom CSS
38
+ custom_css = css + """
39
+ .chat-question-input textarea {
40
+ min-height: 40px !important;
41
+ max-height: 40px !important;
42
+ height: 40px !important;
43
+ resize: none !important;
44
+ }
45
+
46
+ /* Hide the 'or' text in file upload */
47
+ .gr-file span.or,
48
+ span[class*="or"],
49
+ .upload-text span {
50
+ display: none !important;
51
+ }
52
+
53
+ .gr-file,
54
+ .gr-file .wrap,
55
+ .gr-file .wrap > div {
56
+ position: relative !important;
57
+ }
58
+
59
+ .gr-file svg,
60
+ .gr-file .wrap svg,
61
+ svg.feather-upload {
62
+ position: absolute !important;
63
+ top: 50% !important;
64
+ left: 50% !important;
65
+ transform: translate(-50%, -50%) !important;
66
+ width: 60px !important;
67
+ height: 60px !important;
68
+ opacity: 0.9 !important;
69
+ margin: 0 !important;
70
+ z-index: 10 !important;
71
+ }
72
+
73
+ .gr-file .wrap span {
74
+ opacity: 0 !important;
75
+ }
76
+
77
+ #analysis-type-box {
78
+ padding: 12px !important;
79
+ min-height: auto !important;
80
+ }
81
+
82
+ #analysis-type-box h3 {
83
+ font-size: 16px !important;
84
+ margin: 0 0 8px 0 !important;
85
+ }
86
+
87
+ #visualization-box {
88
+ padding: 12px !important;
89
+ min-height: auto !important;
90
+ }
91
+
92
+ #visualization-box h3 {
93
+ font-size: 16px !important;
94
+ margin: 0 0 8px 0 !important;
95
+ }
96
+
97
+ /* Column Selector */
98
+ .gr-checkbox-group,
99
+ .gr-checkboxgroup {
100
+ background: transparent !important;
101
+ }
102
+
103
+ .gr-checkbox-group label,
104
+ .gr-checkboxgroup label,
105
+ .gr-checkbox-group span,
106
+ .gr-checkboxgroup span {
107
+ color: #000000 !important;
108
+ }
109
+
110
+ /* Display Format label - white text */
111
+ .gradio-container .contain span[class*="svelte"] {
112
+ color: rgb(255, 255, 255) !important;
113
+ }
114
+
115
+ .gradio-container.gradio-container-4-20-0 .contain span[class*="svelte"] {
116
+ color: rgb(255, 255, 255) !important;
117
+ }
118
+
119
+ /* Force disable text wrapping in all dataframes */
120
+ .gradio-container table td,
121
+ .gradio-container table th,
122
+ .dataframe td,
123
+ .dataframe th,
124
+ table.dataframe td,
125
+ table.dataframe th,
126
+ .gr-dataframe td,
127
+ .gr-dataframe th {
128
+ white-space: nowrap !important;
129
+ overflow: hidden !important;
130
+ text-overflow: ellipsis !important;
131
+ }
132
+
133
+ /* Target Gradio's internal dataframe cells */
134
+ div[class*="table"] td,
135
+ div[class*="table"] th,
136
+ div[class*="dataframe"] td,
137
+ div[class*="dataframe"] th {
138
+ white-space: nowrap !important;
139
+ overflow: hidden !important;
140
+ text-overflow: ellipsis !important;
141
+ }
142
+
143
+ /* Disable wrapping in table data elements */
144
+ .gradio-container [data-testid="table"] td,
145
+ .gradio-container [data-testid="table"] th {
146
+ white-space: nowrap !important;
147
+ overflow: hidden !important;
148
+ text-overflow: ellipsis !important;
149
+ }
150
+
151
+ /* ====================================================================== */
152
+ /* FIX WHITE BORDER AROUND ALL DROPDOWNS — ONLY REQUIRED CHANGES APPLIED */
153
+ /* ====================================================================== */
154
+
155
+ /* Prevent clipping of borders */
156
+ .wrap,
157
+ .wrap-inner,
158
+ .secondary-wrap,
159
+ .container,
160
+ .input-container {
161
+ overflow: visible !important;
162
+ }
163
+
164
+ /* The actual dropdown box — apply the white border here */
165
+ .input-container {
166
+ border: 2px solid #ffffff !important;
167
+ border-radius: 12px !important;
168
+ padding: 12px !important;
169
+ box-sizing: border-box !important;
170
+ background: transparent !important;
171
+ min-height: 58px !important;
172
+ }
173
+
174
+ /* Ensure child elements render fully */
175
+ .input-container > * {
176
+ overflow: visible !important;
177
+ }
178
+
179
+ /* Remove duplicate border on secondary-wrap */
180
+ .secondary-wrap,
181
+ .secondary-wrap.svelte-vomtxz {
182
+ border: none !important;
183
+ padding: 0 !important;
184
+ background: none !important;
185
+ }
186
+
187
+ /* Clean container without affecting layout */
188
+ .svelte-vomtxz.container {
189
+ border: none !important;
190
+ padding: 0 !important;
191
+ margin: 0 !important;
192
+ background: none !important;
193
+ box-shadow: none !important;
194
+ border-radius: 0 !important;
195
+ }
196
+
197
+ /* Remove problematic display: contents */
198
+ .wrap.svelte-vomtxz {
199
+ display: block !important;
200
+ padding: 0 !important;
201
+ border: none !important;
202
+ background: none !important;
203
+ box-shadow: none !important;
204
+ }
205
+
206
+
207
+ /* Simple white border around the dropdown wrapper */
208
+ .wrap-inner.svelte-vomtxz {
209
+ border: 2px solid white !important;
210
+ border-radius: 8px !important;
211
+ box-sizing: border-box !important;
212
+ }
213
+
214
+ /* Force component-37 to be a single-line input box */
215
+ #component-37 {
216
+ display: block !important;
217
+ visibility: visible !important;
218
+ }
219
+
220
+ #component-37.hide-container {
221
+ display: block !important;
222
+ visibility: visible !important;
223
+ }
224
+
225
+ #component-37 .block,
226
+ #component-37 .svelte-90oupt,
227
+ #component-37 .padded {
228
+ display: block !important;
229
+ visibility: visible !important;
230
+ }
231
+
232
+ #component-37 textarea,
233
+ #component-37 input,
234
+ #component-37 .gr-textbox textarea,
235
+ #component-37 .gr-textbox input {
236
+ display: block !important;
237
+ visibility: visible !important;
238
+ min-height: 45px !important;
239
+ max-height: 45px !important;
240
+ height: 45px !important;
241
+ resize: none !important;
242
+ overflow: hidden !important;
243
+ line-height: 45px !important;
244
+ padding: 0 15px !important;
245
+ background: white !important;
246
+ border: 1px solid #ccc !important;
247
+ border-radius: 8px !important;
248
+ color: #333 !important;
249
+ }
250
+
251
+ #component-37 label {
252
+ display: none !important;
253
+ }
254
+
255
+ /* Force component-88 to be a single-line input box */
256
+ #component-88 {
257
+ display: block !important;
258
+ visibility: visible !important;
259
+ }
260
+
261
+ #component-88.hide-container {
262
+ display: block !important;
263
+ visibility: visible !important;
264
+ }
265
+
266
+ #component-88 .block,
267
+ #component-88 .svelte-90oupt,
268
+ #component-88 .padded {
269
+ display: block !important;
270
+ visibility: visible !important;
271
+ }
272
+
273
+ #component-88 textarea,
274
+ #component-88 input,
275
+ #component-88 .gr-textbox textarea,
276
+ #component-88 .gr-textbox input {
277
+ display: block !important;
278
+ visibility: visible !important;
279
+ min-height: 45px !important;
280
+ max-height: 45px !important;
281
+ height: 45px !important;
282
+ resize: none !important;
283
+ overflow: hidden !important;
284
+ line-height: 45px !important;
285
+ padding: 0 15px !important;
286
+ background: white !important;
287
+ border: 1px solid #ccc !important;
288
+ border-radius: 8px !important;
289
+ color: #333 !important;
290
+ }
291
+
292
+ #component-88 label {
293
+ display: none !important;
294
+ }
295
+
296
+ /* Make Sample Questions heading text black */
297
+ .sample-header,
298
+ .sample-header h4,
299
+ .sample-header p {
300
+ color: #000000 !important;
301
+ }
302
+
303
+ /* Force Enter Your Question h4 text to black */
304
+ .chat-popup-box h4,
305
+ .chat-popup-box .gr-markdown h4 {
306
+ color: #000000 !important;
307
+ }
308
+
309
+ /* Make entire chat popup box scrollable */
310
+ .chat-popup-box {
311
+ overflow-y: auto !important;
312
+ }
313
+
314
+ .chat-popup-box > * {
315
+ flex-shrink: 0 !important;
316
+ }
317
+
318
+ /* Make all markdown text in chat popup black */
319
+ .chat-popup-box .gr-markdown,
320
+ .chat-popup-box .gr-markdown h1,
321
+ .chat-popup-box .gr-markdown h2,
322
+ .chat-popup-box .gr-markdown h3,
323
+ .chat-popup-box .gr-markdown h4,
324
+ .chat-popup-box .gr-markdown h5,
325
+ .chat-popup-box .gr-markdown h6,
326
+ .chat-popup-box .gr-markdown p {
327
+ color: #000000 !important;
328
+ }
329
+
330
+ /* Force component-96 button text to black */
331
+ #component-96-button {
332
+ color: #000000 !important;
333
+ }
334
+
335
+ /* Force component-93 button text to black */
336
+ #component-93-button {
337
+ color: #000000 !important;
338
+ }
339
+
340
+ /* Force component-98 button text to black */
341
+ #component-98-button {
342
+ color: #000000 !important;
343
+ }
344
+
345
+ /* Fix z-index: chat popup should be above how-to-use */
346
+ .chat-popup-box {
347
+ z-index: 1001 !important;
348
+ }
349
+
350
+ .how-to-use-sidebar {
351
+ z-index: 1000 !important;
352
+ }
353
+
354
+ /* Force all tab buttons text to black */
355
+ .chat-popup-box button,
356
+ .chat-popup-box .gr-button,
357
+ button[id*="component-"] {
358
+ color: #000000 !important;
359
+ }
360
+
361
+ /* Force typed text in textbox to black */
362
+ #component-88 textarea,
363
+ #component-88 input {
364
+ color: #000000 !important;
365
+ }
366
+
367
+ /* Force Analysis Output label to white */
368
+ #component-19 span.svelte-1gfkn6j {
369
+ color: #ffffff !important;
370
+ }
371
+
372
+ """
373
+
374
+ def upload_dataset(file):
375
+ global uploaded_df, original_df, dataset_name
376
+ if file is None:
377
+ return "No file uploaded", gr.update(visible=False), gr.update(choices=[]), gr.update(visible=False)
378
+ try:
379
+ dataset_name = os.path.basename(file.name)
380
+ if file.name.endswith('.csv'):
381
+ uploaded_df = pd.read_csv(file.name)
382
+ elif file.name.endswith(('.xlsx', '.xls')):
383
+ uploaded_df = pd.read_excel(file.name)
384
+ else:
385
+ return "Unsupported file format. Please upload CSV or Excel files.", gr.update(visible=False), gr.update(choices=[]), gr.update(visible=False)
386
+ uploaded_df = clean_numeric(uploaded_df)
387
+ original_df = uploaded_df.copy()
388
+ info_text = f" Dataset Loaded: {dataset_name} ({uploaded_df.shape[0]} rows × {uploaded_df.shape[1]} columns)"
389
+ return info_text, gr.update(visible=False), gr.update(choices=list(uploaded_df.columns), value=[]), gr.update(visible=True)
390
+ except Exception as e:
391
+ return f"Error loading file: {str(e)}", gr.update(visible=False), gr.update(choices=[]), gr.update(visible=False)
392
+
393
+ def clear_dataset():
394
+ global uploaded_df, original_df, dataset_name, change_history
395
+ uploaded_df = None
396
+ original_df = None
397
+ dataset_name = None
398
+ change_history = []
399
+ return "Dataset cleared. Please upload a new file.", gr.update(visible=False), gr.update(choices=[], value=[]), gr.update(visible=False)
400
+
401
+ def update_preview(format_type, selected_columns):
402
+ if format_type == "None":
403
+ return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
404
+ elif format_type == "DataFrame":
405
+ return gr.update(value=display_data_format(format_type, selected_columns, uploaded_df), visible=True), gr.update(visible=False), gr.update(visible=True)
406
+ else:
407
+ return gr.update(visible=False), gr.update(value=display_text_format(format_type, selected_columns, uploaded_df), visible=True), gr.update(visible=True)
408
+
409
+ def handle_analysis_change(analysis_type, selected_columns):
410
+ result_text, data_table = run_analysis(analysis_type, selected_columns, uploaded_df)
411
+ if result_text and result_text.strip() and analysis_type != "None":
412
+ if data_table is not None:
413
+ return gr.update(value=result_text, visible=True), gr.update(visible=True), gr.update(value=data_table, visible=True)
414
+ else:
415
+ return gr.update(value=result_text, visible=True), gr.update(visible=True), gr.update(visible=False)
416
+ else:
417
+ return gr.update(value="", visible=False), gr.update(visible=False), gr.update(visible=False)
418
+
419
+ def handle_viz_change(viz_type, selected_columns):
420
+ result = create_visualization(viz_type, selected_columns, uploaded_df)
421
+ if result and len(result) == 3:
422
+ fig, explanation, chart_obj = result
423
+ if explanation and fig is not None:
424
+ return fig, gr.update(visible=True), explanation, gr.update(visible=True)
425
+ else:
426
+ return None, gr.update(visible=False), explanation or "Error in visualization", gr.update(visible=False)
427
+ else:
428
+ return None, gr.update(visible=False), "Error in visualization", gr.update(visible=False)
429
+
430
+ def handle_question_analysis(question):
431
+ global uploaded_df, llm
432
+ if uploaded_df is None:
433
+ return "Please upload a dataset first.", None, None
434
+ if llm is None:
435
+ return "AI agent not initialized. Please restart the application.", None, None
436
+ return analyze_question(question, [], uploaded_df, llm)
437
+
438
+ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
439
+
440
+ gr.HTML(f"""
441
+ <div class="how-to-use-sidebar">
442
+ <div class="how-to-use-content">
443
+ <h3>How to Use</h3>
444
+ <ul>
445
+ <li>• Use Display Format to preview your data.</li>
446
+ <li>• Select an Analysis Type.</li>
447
+ <li>• Choose a Visualization Type.</li>
448
+ <li>• Ask any question about your data.</li>
449
+ </ul>
450
+ </div>
451
+ </div>
452
+
453
+ <div class="header-box">
454
+ <div style="flex:1; display:flex; justify-content:flex-start; width: 100px;">
455
+ <img src="{logo_b64}" width="120" style="margin-bottom:70px; margin-top: 30px; opacity:0.95;">
456
+ </div>
457
+ <div style="flex:1; display:flex; justify-content:center;">
458
+ <h1 class="header-title">SparkNova</h1>
459
+ </div>
460
+ <div style="flex:1;"></div>
461
+ </div>
462
+ <div style="text-align: center; display: flex; justify-content: center;">
463
+ <p style="margin-top:20px; font-size:1.30em; opacity:1; color: white; max-width: 1450px; line-height: 1.45;">
464
+
465
+ SparkNova is an ata analysis platform that allows users to upload datasets, explore insights, visualize patterns, and ask questions about their data.
466
+ It simplifies data analytics by automating cleaning, visualization, and intelligent interpretation for quick decision-making.
467
+ </p>
468
+ </div>
469
+ """)
470
+
471
+ with gr.Row():
472
+ with gr.Column(scale=1):
473
+ gr.Markdown("### Upload Dataset")
474
+ file_input = gr.File(label="DROP YOUR FILE HERE", file_types=[".csv", ".xlsx", ".xls"])
475
+ dataset_info = gr.Markdown()
476
+
477
+ with gr.Row():
478
+ clear_btn = gr.Button("Clear Dataset", variant="secondary", size="sm")
479
+
480
+ format_selector = gr.Dropdown(
481
+ choices=["None", "DataFrame", "JSON", "Dictionary"],
482
+ value="None",
483
+ label="Display Format"
484
+ )
485
+
486
+ column_selector = gr.CheckboxGroup(
487
+ label="Select Columns",
488
+ choices=[],
489
+ visible=False
490
+ )
491
+
492
+ gr.Markdown("### Choose an Analysis Type")
493
+ analysis_selector = gr.Dropdown(
494
+ choices=["None", "Summary", "Describe", "Top 5 Rows", "Bottom 5 Rows", "Missing Values", "Group & Aggregate", "Calculate Expressions", "Highest Correlation"],
495
+ value="None",
496
+ label="Analysis Type"
497
+ )
498
+
499
+ gr.Markdown("### Visualization Types")
500
+ viz_selector = gr.Dropdown(
501
+ choices=["None", "Bar Chart", "Line Chart", "Scatter Plot", "Pie Chart", "Histogram", "Box Plot", "Heat Map"],
502
+ value="None",
503
+ label="Chart Type"
504
+ )
505
+
506
+ with gr.Column(scale=2):
507
+ preview_heading = gr.Markdown("### Dataset Preview", visible=False)
508
+ dataset_preview = gr.Dataframe(wrap=True, visible=False)
509
+ text_preview = gr.Textbox(label="Text Preview", lines=15, visible=False)
510
+
511
+ analysis_heading = gr.Markdown("### Analysis Results", visible=False)
512
+ analysis_output = gr.Textbox(label="Analysis Output", lines=10, visible=False, interactive=False)
513
+ analysis_data_table = gr.Dataframe(label="Data Table", wrap=True, visible=False)
514
+ chart_output_new = gr.Plot(label="Chart", visible=False)
515
+ chart_explanation = gr.Textbox(label="Chart Analysis", lines=5, visible=False, interactive=False)
516
+
517
+ # Floating Chat Button
518
+ chat_toggle_btn = gr.Button("💬", elem_classes=["floating-chat-btn"], size="sm")
519
+
520
+ with gr.Column(visible=False, elem_classes=["chat-popup-box"]) as chat_popup:
521
+ gr.HTML("<div class='chat-header'><h3>Ask SparkNova</h3></div>")
522
+
523
+ gr.Markdown("#### Sample Questions", elem_classes=["sample-header"])
524
+ for i in range(min(5, len(SAMPLE_QUESTIONS))):
525
+ gr.Markdown(f"• {SAMPLE_QUESTIONS[i]}", elem_classes=["sample-q-text"])
526
+
527
+ gr.Markdown("#### Enter Your Question")
528
+
529
+ user_question = gr.Textbox(
530
+ label="Your Question",
531
+ placeholder="Type your question here...",
532
+ lines=2,
533
+ max_lines=3,
534
+ elem_classes=["chat-question-input"],
535
+ interactive=True,
536
+ show_label=True
537
+ )
538
+
539
+ submit_btn = gr.Button("Analyze", variant="primary", size="lg", elem_classes=["chat-submitbtn"])
540
+
541
+ gr.HTML("<div class='response-box'><h4>Analysis Results</h4></div>")
542
+
543
+ with gr.Tabs():
544
+ with gr.Tab("Response"):
545
+ output_text = gr.Textbox(label="", interactive=False, lines=10, show_copy_button=True)
546
+ with gr.Tab("Visualization"):
547
+ chart_output = gr.Plot(label="")
548
+ with gr.Tab("Data"):
549
+ result_table = gr.Dataframe(label="", wrap=True)
550
+
551
+ chat_visible = [False]
552
+
553
+ def toggle_chat():
554
+ chat_visible[0] = not chat_visible[0]
555
+ return gr.update(visible=chat_visible[0])
556
+
557
+ chat_toggle_btn.click(toggle_chat, inputs=None, outputs=chat_popup)
558
+
559
+ file_input.change(upload_dataset, inputs=file_input, outputs=[dataset_info, dataset_preview, column_selector, column_selector])
560
+ clear_btn.click(clear_dataset, outputs=[dataset_info, dataset_preview, column_selector, column_selector])
561
+
562
+ format_selector.change(update_preview, inputs=[format_selector, column_selector], outputs=[dataset_preview, text_preview, preview_heading])
563
+ column_selector.change(update_preview, inputs=[format_selector, column_selector], outputs=[dataset_preview, text_preview, preview_heading])
564
+
565
+ analysis_selector.change(handle_analysis_change, inputs=[analysis_selector, column_selector], outputs=[analysis_output, analysis_heading, analysis_data_table])
566
+ column_selector.change(handle_analysis_change, inputs=[analysis_selector, column_selector], outputs=[analysis_output, analysis_heading, analysis_data_table])
567
+
568
+ viz_selector.change(handle_viz_change, inputs=[viz_selector, column_selector], outputs=[chart_output_new, chart_output_new, chart_explanation, chart_explanation])
569
+ column_selector.change(handle_viz_change, inputs=[viz_selector, column_selector], outputs=[chart_output_new, chart_output_new, chart_explanation, chart_explanation])
570
+
571
+ submit_btn.click(handle_question_analysis, inputs=[user_question], outputs=[output_text, chart_output, result_table])
572
+
573
+ gr.HTML("<div style='text-align: center; margin-top: 20px; color: #666;'>Powered by GROQ LLM & Gradio</div>")
574
+
575
+ if __name__ == "__main__":
576
+ llm = initialize_llm()
577
+ if not llm:
578
+ print("Warning: Failed to initialize GROQ API")
579
+
580
+ demo.launch(show_error=True, share=False)
app_backup.py ADDED
@@ -0,0 +1,682 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ import base64
5
+ import json
6
+
7
+ from data_engine import (
8
+ clean_numeric, run_analysis, create_visualization, handle_missing_data,
9
+ undo_last_change, undo_all_changes, download_dataset,
10
+ display_data_format, display_text_format
11
+ )
12
+ try:
13
+ from ai_agent import initialize_llm, analyze_question
14
+ except (ImportError, RuntimeError) as e:
15
+ print(f"Warning: Full AI agent not available: {e}")
16
+ def initialize_llm():
17
+ return None
18
+ def analyze_question(question, columns, df, llm):
19
+ return "AI agent not available. Please check dependencies.", None, None
20
+ from prompts import SAMPLE_QUESTIONS
21
+
22
+ llm = None
23
+ uploaded_df = None
24
+ original_df = None
25
+ dataset_name = None
26
+ change_history = []
27
+
28
+ logo_path = os.path.join(os.getcwd(), "public/main-logo.png")
29
+
30
+ def embed_image_base64(path):
31
+ with open(path, "rb") as f:
32
+ return "data:image/png;base64," + base64.b64encode(f.read()).decode()
33
+
34
+ logo_b64 = embed_image_base64(logo_path)
35
+ with open("public/style.css") as f:
36
+ css = f.read()
37
+
38
+ # Enhanced custom CSS for dropdown
39
+ custom_css = css + """
40
+ .dropdown-wrapper {
41
+ width: 100%;
42
+ margin: 10px 0;
43
+ }
44
+
45
+ .dropdown-button {
46
+ width: 100%;
47
+ padding: 12px 16px;
48
+ background: rgba(138, 43, 226, 0.15);
49
+ border: 1px solid rgba(138, 43, 226, 0.3);
50
+ border-radius: 8px;
51
+ cursor: pointer;
52
+ display: flex;
53
+ justify-content: space-between;
54
+ align-items: center;
55
+ font-size: 14px;
56
+ color: #e0e0e0;
57
+ transition: all 0.3s;
58
+ }
59
+
60
+ .dropdown-button:hover {
61
+ background: rgba(138, 43, 226, 0.25);
62
+ border-color: rgba(138, 43, 226, 0.5);
63
+ }
64
+
65
+ .dropdown-menu {
66
+ display: none;
67
+ position: absolute;
68
+ width: calc(100% - 32px);
69
+ max-height: 300px;
70
+ overflow-y: auto;
71
+ background: #2a2a3e;
72
+ border: 1px solid rgba(138, 43, 226, 0.3);
73
+ border-radius: 8px;
74
+ margin-top: 5px;
75
+ z-index: 1000;
76
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
77
+ }
78
+
79
+ .dropdown-menu.show {
80
+ display: block;
81
+ }
82
+
83
+ .dropdown-item {
84
+ padding: 12px 16px;
85
+ cursor: pointer;
86
+ color: #e0e0e0;
87
+ transition: all 0.2s;
88
+ display: flex;
89
+ align-items: center;
90
+ gap: 10px;
91
+ }
92
+
93
+ .dropdown-item:hover {
94
+ background: rgba(138, 43, 226, 0.2);
95
+ }
96
+
97
+ .dropdown-item.selected {
98
+ background: rgba(138, 43, 226, 0.3);
99
+ color: #fff;
100
+ font-weight: 500;
101
+ }
102
+
103
+ .dropdown-item::before {
104
+ content: '☐';
105
+ font-size: 18px;
106
+ }
107
+
108
+ .dropdown-item.selected::before {
109
+ content: '☑';
110
+ color: #8a2be2;
111
+ }
112
+
113
+ .selected-tags {
114
+ display: flex;
115
+ flex-wrap: wrap;
116
+ gap: 8px;
117
+ margin-top: 10px;
118
+ min-height: 20px;
119
+ }
120
+
121
+ .tag {
122
+ background: rgba(138, 43, 226, 0.3);
123
+ color: #fff;
124
+ padding: 6px 12px;
125
+ border-radius: 20px;
126
+ font-size: 13px;
127
+ display: flex;
128
+ align-items: center;
129
+ gap: 8px;
130
+ }
131
+
132
+ .tag .remove {
133
+ cursor: pointer;
134
+ font-weight: bold;
135
+ color: #ff6b6b;
136
+ font-size: 16px;
137
+ line-height: 1;
138
+ }
139
+
140
+ .tag .remove:hover {
141
+ color: #ff4444;
142
+ }
143
+
144
+ .dropdown-arrow {
145
+ transition: transform 0.3s;
146
+ }
147
+
148
+ .dropdown-arrow.open {
149
+ transform: rotate(180deg);
150
+ }
151
+ """
152
+
153
+ def upload_dataset(file):
154
+ global uploaded_df, original_df, dataset_name, change_history
155
+ if file is None:
156
+ return "No file uploaded", None, [], ""
157
+
158
+ try:
159
+ if file.name.endswith('.csv'):
160
+ uploaded_df = pd.read_csv(file.name)
161
+ elif file.name.endswith(('.xlsx', '.xls')):
162
+ uploaded_df = pd.read_excel(file.name)
163
+ else:
164
+ return "Unsupported file format", None, [], ""
165
+
166
+ original_df = uploaded_df.copy()
167
+ dataset_name = os.path.basename(file.name)
168
+ change_history = []
169
+
170
+ info = f"**Dataset:** {dataset_name}\n**Shape:** {uploaded_df.shape}\n**Columns:** {', '.join(uploaded_df.columns)}"
171
+ columns = list(uploaded_df.columns)
172
+
173
+ # Create the dropdown HTML with updated columns
174
+ dropdown_html = create_dropdown_html(columns, [])
175
+
176
+ return info, uploaded_df.head(), columns, dropdown_html
177
+ except Exception as e:
178
+ return f"Error uploading file: {str(e)}", None, [], ""
179
+
180
+ def create_dropdown_html(available_columns, selected_columns):
181
+ """Create the dropdown HTML structure"""
182
+ if not available_columns:
183
+ return """
184
+ <div class="dropdown-wrapper">
185
+ <div class="dropdown-button" style="opacity: 0.5; cursor: not-allowed;">
186
+ <span>No columns available</span>
187
+ <span class="dropdown-arrow">▼</span>
188
+ </div>
189
+ </div>
190
+ """
191
+
192
+ columns_json = json.dumps(available_columns)
193
+ selected_json = json.dumps(selected_columns)
194
+
195
+ dropdown_items = ''.join([
196
+ f'<div class="dropdown-item{" selected" if col in selected_columns else ""}" data-column="{col}">{col}</div>'
197
+ for col in available_columns
198
+ ])
199
+
200
+ selected_tags = ''.join([
201
+ f'<span class="tag" data-column="{col}">{col}<span class="remove">×</span></span>'
202
+ for col in selected_columns
203
+ ])
204
+
205
+ return f"""
206
+ <div class="dropdown-wrapper">
207
+ <div class="dropdown-button" onclick="toggleDropdown()">
208
+ <span id="dropdown-text">{len(selected_columns)} column(s) selected" if selected_columns else "Choose columns to work with</span>
209
+ <span class="dropdown-arrow" id="dropdown-arrow">▼</span>
210
+ </div>
211
+ <div class="dropdown-menu" id="dropdown-menu">
212
+ {dropdown_items}
213
+ </div>
214
+ <div class="selected-tags" id="selected-tags">
215
+ {selected_tags}
216
+ </div>
217
+ </div>
218
+
219
+ <input type="hidden" id="available-columns" value='{columns_json}'>
220
+ <input type="hidden" id="selected-columns-data" value='{selected_json}'>
221
+
222
+ <script>
223
+ (function() {{
224
+ let selectedColumns = {selected_json};
225
+ let availableColumns = {columns_json};
226
+
227
+ function updateHiddenInput() {{
228
+ const hiddenInput = document.getElementById('selected-columns-data');
229
+ if (hiddenInput) {{
230
+ hiddenInput.value = JSON.stringify(selectedColumns);
231
+ // Trigger change event for Gradio
232
+ hiddenInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
233
+ }}
234
+ }}
235
+
236
+ window.toggleDropdown = function() {{
237
+ const menu = document.getElementById('dropdown-menu');
238
+ const arrow = document.getElementById('dropdown-arrow');
239
+ if (menu && arrow) {{
240
+ menu.classList.toggle('show');
241
+ arrow.classList.toggle('open');
242
+ }}
243
+ }}
244
+
245
+ // Handle dropdown item clicks
246
+ document.addEventListener('click', function(e) {{
247
+ if (e.target.classList.contains('dropdown-item')) {{
248
+ const column = e.target.getAttribute('data-column');
249
+
250
+ if (selectedColumns.includes(column)) {{
251
+ selectedColumns = selectedColumns.filter(c => c !== column);
252
+ e.target.classList.remove('selected');
253
+ }} else {{
254
+ selectedColumns.push(column);
255
+ e.target.classList.add('selected');
256
+ }}
257
+
258
+ updateSelectedDisplay();
259
+ updateHiddenInput();
260
+ }}
261
+
262
+ // Handle tag remove
263
+ if (e.target.classList.contains('remove')) {{
264
+ const tag = e.target.parentElement;
265
+ const column = tag.getAttribute('data-column');
266
+ selectedColumns = selectedColumns.filter(c => c !== column);
267
+
268
+ // Update dropdown item
269
+ const dropdownItem = document.querySelector(`.dropdown-item[data-column="${{column}}"]`);
270
+ if (dropdownItem) {{
271
+ dropdownItem.classList.remove('selected');
272
+ }}
273
+
274
+ updateSelectedDisplay();
275
+ updateHiddenInput();
276
+ }}
277
+
278
+ // Close dropdown when clicking outside
279
+ if (!e.target.closest('.dropdown-wrapper')) {{
280
+ const menu = document.getElementById('dropdown-menu');
281
+ const arrow = document.getElementById('dropdown-arrow');
282
+ if (menu && arrow) {{
283
+ menu.classList.remove('show');
284
+ arrow.classList.remove('open');
285
+ }}
286
+ }}
287
+ }});
288
+
289
+ function updateSelectedDisplay() {{
290
+ const tagsContainer = document.getElementById('selected-tags');
291
+ const dropdownText = document.getElementById('dropdown-text');
292
+
293
+ if (tagsContainer && dropdownText) {{
294
+ if (selectedColumns.length === 0) {{
295
+ dropdownText.textContent = 'Choose columns to work with';
296
+ tagsContainer.innerHTML = '';
297
+ }} else {{
298
+ dropdownText.textContent = selectedColumns.length + ' column(s) selected';
299
+ tagsContainer.innerHTML = selectedColumns.map(col =>
300
+ `<span class="tag" data-column="${{col}}">${{col}}<span class="remove">×</span></span>`
301
+ ).join('');
302
+ }}
303
+ }}
304
+ }}
305
+ }})();
306
+ </script>
307
+ """
308
+
309
+ def get_selected_columns_from_html():
310
+ """This would normally extract from the hidden input, but we'll use State instead"""
311
+ return []
312
+
313
+ def clear_dataset():
314
+ global uploaded_df, original_df, dataset_name, change_history
315
+ uploaded_df = None
316
+ original_df = None
317
+ dataset_name = None
318
+ change_history = []
319
+ return "", None, [], create_dropdown_html([], [])
320
+
321
+ def update_preview(format_type, columns):
322
+ global uploaded_df
323
+ if uploaded_df is None or format_type == "None":
324
+ return None, "", gr.update(visible=False), gr.update(visible=False)
325
+
326
+ try:
327
+ # Use selected columns or all columns if none selected
328
+ selected_columns = columns if columns else list(uploaded_df.columns)
329
+ selected_df = uploaded_df[selected_columns]
330
+
331
+ if format_type == "DataFrame":
332
+ return selected_df, "", gr.update(visible=True), gr.update(visible=False)
333
+ elif format_type == "JSON":
334
+ json_str = selected_df.to_json(indent=2)
335
+ return None, json_str, gr.update(visible=False), gr.update(visible=True)
336
+ elif format_type == "Dictionary":
337
+ dict_str = str(selected_df.to_dict())
338
+ return None, dict_str, gr.update(visible=False), gr.update(visible=True)
339
+ except Exception as e:
340
+ return None, f"Error: {str(e)}", gr.update(visible=False), gr.update(visible=True)
341
+
342
+ def handle_analysis_change(analysis_type, columns):
343
+ global uploaded_df
344
+ if uploaded_df is None or analysis_type == "None":
345
+ return "", "", None
346
+
347
+ try:
348
+ result = run_analysis(uploaded_df, analysis_type, columns)
349
+ if isinstance(result, tuple):
350
+ text_result, data_result = result
351
+ return text_result, "", data_result
352
+ else:
353
+ return str(result), "", None
354
+ except Exception as e:
355
+ return f"Error in analysis: {str(e)}", "", None
356
+
357
+ def handle_viz_change(viz_type, columns):
358
+ global uploaded_df
359
+ if uploaded_df is None or viz_type == "None":
360
+ return None, None, "", ""
361
+
362
+ try:
363
+ chart, explanation = create_visualization(uploaded_df, viz_type, columns)
364
+ return chart, None, explanation, ""
365
+ except Exception as e:
366
+ return None, None, f"Error creating visualization: {str(e)}", ""
367
+
368
+ def show_constant_input(handler_type):
369
+ return gr.update(visible=(handler_type == "Constant Fill"))
370
+
371
def handle_data_and_refresh(handler, columns, constant, analysis_type):
    """Apply a missing-data handling method and refresh the dependent UI state.

    Always returns a 5-tuple matching the Gradio event outputs:
    (handling result text, analysis text, column list, dataset info text,
    column-selector HTML).
    """
    global uploaded_df, change_history
    if uploaded_df is None or handler == "None":
        # Bug fix: this branch previously returned only 4 values while the
        # apply_btn.click event wires 5 output components, which raised a
        # Gradio error; pad with an empty dropdown-HTML string.
        return "", "", [], "", ""

    try:
        result = handle_missing_data(uploaded_df, handler, columns, constant)
        change_history.append(uploaded_df.copy())  # snapshot for undo

        # Optionally re-run the currently selected analysis on the new data.
        analysis_result = ""
        if analysis_type != "None":
            analysis_result = str(run_analysis(uploaded_df, analysis_type, columns))

        new_columns = list(uploaded_df.columns)
        info = f"Applied {handler} to dataset\nShape: {uploaded_df.shape}"
        dropdown_html = create_dropdown_html(new_columns, new_columns)

        return result, analysis_result, new_columns, info, dropdown_html
    except Exception as e:
        return f"Error: {str(e)}", "", [], "", ""
391
+
392
def handle_undo_and_refresh(analysis_type, undo_all):
    """Revert dataset changes (one step or all) and refresh dependent UI state.

    Returns a 5-tuple matching the Gradio event outputs:
    (result text, analysis text, column list, dataset info text, dropdown HTML).
    """
    global uploaded_df, change_history
    if uploaded_df is None:
        return "", "", [], "", ""

    try:
        # Pick the appropriate restore routine.
        result = undo_all_changes() if undo_all else undo_last_change()

        analysis_result = ""
        if analysis_type != "None":
            analysis_result = str(run_analysis(uploaded_df, analysis_type, []))

        refreshed_columns = list(uploaded_df.columns)
        info_text = f"Dataset restored\nShape: {uploaded_df.shape}"
        dropdown_markup = create_dropdown_html(refreshed_columns, refreshed_columns)

        return result, analysis_result, refreshed_columns, info_text, dropdown_markup
    except Exception as exc:
        return f"Error: {str(exc)}", "", [], "", ""
414
+
415
def handle_question_analysis(question, columns):
    """Answer a free-form question about the dataset via the LLM."""
    global llm, uploaded_df
    # Lazily create the model client on first use.
    llm = llm if llm is not None else initialize_llm()
    return analyze_question(question, columns, uploaded_df, llm)
420
+
421
def sync_selected_columns(selected_json_str):
    """Parse the JSON payload pushed from the front-end column selector.

    Returns the decoded list of column names, or [] when the payload is
    empty, malformed, or does not decode to a list.
    """
    if not selected_json_str:
        return []
    try:
        decoded = json.loads(selected_json_str)
    except (json.JSONDecodeError, TypeError):
        # Was a bare `except:` that silently swallowed *every* error
        # (including NameError/KeyboardInterrupt); narrow it to the
        # errors json.loads can actually raise for bad input.
        return []
    # Downstream code indexes the DataFrame with this value, so guard
    # against valid-but-wrong JSON such as an object or a bare string.
    return decoded if isinstance(decoded, list) else []
429
+
430
# ---------------------------------------------------------------------------
# Gradio application: layout declaration followed by event wiring.
# Relies on module-level state (uploaded_df, change_history, dataset_name,
# llm) and on helpers defined elsewhere in this file (upload_dataset,
# clear_dataset, update_preview, run_analysis, create_visualization,
# create_dropdown_html, download_dataset, analyze_question, ...).
# NOTE(review): nesting of a few layout containers was reconstructed from a
# whitespace-mangled source — confirm against the original file.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    # Per-session UI state: chat popup visibility flag and the user's
    # current column selection (kept in sync with the JS column selector).
    popup_visible = gr.State(False)
    selected_columns_state = gr.State([])

    # Static header: fixed "How to Use" sidebar, logo banner and tagline.
    gr.HTML(f"""
    <!-- How to Use Section - Fixed Left Side -->
    <div class="how-to-use-sidebar">
        <div class="how-to-use-content">
            <h3>How to Use</h3>
            <ul>
                <li>• Use Display Format to preview or view summary of your data.</li>
                <li>• Select an Analysis Type to explore key insights and patterns.</li>
                <li>• Choose a Visualization Type to generate charts and graphical views.</li>
                <li>• Ask any question about your data in the text box and click Analyze to get AI-driven results.</li>
            </ul>
        </div>
    </div>

    <div class="header-box">
        <div style="flex:1; display:flex; justify-content:flex-start; width: 100px;">
            <img src="{logo_b64}" width="120" style="margin-bottom:70px; margin-top: 30px; opacity:0.95;">
        </div>
        <div style="flex:1; display:flex; justify-content:center;">
            <h1 class="header-title">SparkNova</h1>
        </div>
        <div style="flex:1;"></div>
    </div>
    <div style="text-align: center; display: flex; justify-content: center;">
        <p style="margin-top:20px; font-size:1.30em; opacity:0.92; color: white; max-width: 1450px; line-height: 1.45;">
            SparkNova is a data analysis platform that allows users to upload datasets, explore insights, visualize patterns, and ask questions about their data. It simplifies data analytics by automating cleaning, visualization, and intelligent interpretation for quick decision-making.
        </p>
    </div>
    """)

    # Row 1: upload card | custom column selector | display-format picker.
    with gr.Row(elem_classes="first-row"):
        # Left Column - Upload Dataset
        with gr.Column(scale=1):
            with gr.Group(elem_id="upload-wrapper", elem_classes="upload-section"):
                gr.Markdown("### Upload Dataset", elem_classes="upload-title")
                file_input = gr.File(
                    label="Choose File",
                    file_types=[".csv", ".xlsx", ".xls"],
                    elem_classes="upload-card",
                )
                dataset_info = gr.Markdown(value="", elem_classes="upload-info")
                clear_btn = gr.Button("Clear Dataset", variant="secondary", size="sm")

        # Middle Column - Custom Dropdown Column Selector
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 📋 Column Selector")
                column_dropdown_html = gr.HTML(create_dropdown_html([], []))
                # Hidden textbox to capture selected columns from JavaScript
                selected_columns_json = gr.Textbox(visible=False, elem_id="selected-columns-data")

        # Right Column - Display Format
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 🔍 Display Format")
                format_selector = gr.Dropdown(
                    choices=["None", "DataFrame", "JSON", "Dictionary"],
                    value="None",
                    label="Format Type"
                )

    # Hidden elements (revealed by event handlers when relevant).
    constant_input = gr.Textbox(
        label="Constant Value",
        placeholder="Enter value",
        visible=False
    )
    data_handling_output = gr.Textbox(label="Results", lines=2, visible=False, interactive=False)
    download_file = gr.File(label="Download", visible=False)

    # Row 2: analysis / visualization / data-handling controls plus the
    # shared output panel.
    with gr.Row(elem_classes="second-row"):
        with gr.Group(elem_classes="overflow-class"):
            gr.Markdown("### 📊 Analysis Type")
            analysis_selector = gr.Dropdown(
                choices=["None", "Summary", "Describe", "Top 5 Rows", "Bottom 5 Rows",
                         "Missing Values", "Group & Aggregate", "Calculate Expressions", "Highest Correlation"],
                value="None",
                label="Select Analysis"
            )

        with gr.Group():
            gr.Markdown("### 📈 Visualization")
            viz_selector = gr.Dropdown(
                choices=["None", "Bar Chart", "Line Chart", "Scatter Plot", "Pie Chart",
                         "Histogram", "Box Plot", "Heat Map"],
                value="None",
                label="Chart Type"
            )

        with gr.Group():
            gr.Markdown("### 🔧 Data Handling")
            data_handler = gr.Dropdown(
                choices=["None", "Forward Fill", "Backward Fill", "Constant Fill",
                         "Mean Fill", "Median Fill", "Mode Fill", "Drop Columns"],
                value="None",
                label="Method"
            )
            with gr.Row():
                apply_btn = gr.Button("Apply", variant="primary", size="sm")
                undo_last_btn = gr.Button("Undo", variant="secondary", size="sm")
            with gr.Row():
                undo_all_btn = gr.Button("Undo All", variant="secondary", size="sm")
                download_btn = gr.Button("Download", variant="secondary", size="sm")

        # Output panel: data preview, analysis text/table, chart + summary.
        with gr.Column(scale=2):
            preview_heading = gr.Markdown("", visible=False)
            dataset_preview = gr.Dataframe(visible=False)
            text_preview = gr.Textbox(label="Text Preview", lines=8, visible=False)

            analysis_heading = gr.Markdown("### Analysis Results", visible=False)
            analysis_output = gr.Textbox(label="Analysis Output", lines=6, visible=False, interactive=False)
            analysis_data_table = gr.Dataframe(label="Data Table", visible=False)
            chart_output_new = gr.Plot(label="Chart", visible=False)
            chart_explanation = gr.Textbox(label="Chart Analysis", lines=3, visible=False, interactive=False)

    # Floating chat popup; hidden until the floating button toggles it.
    with gr.Column(visible=False, elem_classes="chat-popup-box") as chat_popup:
        gr.HTML('<div class="chat-header"><h3>💬 Ask a Question</h3></div>')

        with gr.Column(elem_classes="chat-content-area"):
            gr.HTML('''
            <div class="sample-questions-box">
                <h4>Sample Questions</h4>
                <ul style="list-style: none; padding: 0; margin: 10px 0;">
                    <li style="margin: 8px 0; color: #555;">• What is the average of all numeric columns?</li>
                    <li style="margin: 8px 0; color: #555;">• Show me the correlation between columns</li>
                    <li style="margin: 8px 0; color: #555;">• What are the missing values in my dataset?</li>
                </ul>
            </div>
            ''')

            gr.HTML('<div style="margin: 20px 0 10px 0; font-weight: 600; color: #333; font-size: 15px;">✍️ Type Your Question Here</div>')

            question_input = gr.Textbox(
                placeholder="Ask anything about your dataset...",
                lines=2,
                elem_classes="question-input-field",
                show_label=False
            )

            with gr.Row():
                ask_btn = gr.Button("Ask Question", variant="primary", size="sm")
                close_chat_btn = gr.Button("Close", variant="secondary", size="sm")

            answer_output = gr.Textbox(
                label="Answer",
                lines=4,
                interactive=False,
                elem_id="answer-output"
            )

            answer_chart = gr.Plot(label="Generated Chart", visible=False)
            answer_data = gr.Dataframe(label="Generated Data", visible=False)

    # Floating Chat Button
    floating_chat_button = gr.Button("💬", elem_classes="floating-chat-btn")

    # ----------------------------- Event wiring -----------------------------
    file_input.upload(
        fn=upload_dataset,
        inputs=[file_input],
        outputs=[dataset_info, dataset_preview, selected_columns_state, column_dropdown_html]
    )

    clear_btn.click(
        fn=clear_dataset,
        outputs=[dataset_info, dataset_preview, selected_columns_state, column_dropdown_html]
    )

    # Sync selected columns from hidden input (pushed by front-end JS).
    selected_columns_json.change(
        fn=sync_selected_columns,
        inputs=[selected_columns_json],
        outputs=[selected_columns_state]
    )

    format_selector.change(
        fn=update_preview,
        inputs=[format_selector, selected_columns_state],
        outputs=[dataset_preview, text_preview, preview_heading, analysis_heading]
    )

    analysis_selector.change(
        fn=handle_analysis_change,
        inputs=[analysis_selector, selected_columns_state],
        outputs=[analysis_output, chart_explanation, analysis_data_table]
    )

    viz_selector.change(
        fn=handle_viz_change,
        inputs=[viz_selector, selected_columns_state],
        outputs=[chart_output_new, analysis_data_table, chart_explanation, analysis_output]
    )

    # The constant-value textbox only makes sense for "Constant Fill".
    data_handler.change(
        fn=show_constant_input,
        inputs=[data_handler],
        outputs=[constant_input]
    )

    apply_btn.click(
        fn=handle_data_and_refresh,
        inputs=[data_handler, selected_columns_state, constant_input, analysis_selector],
        outputs=[data_handling_output, analysis_output, selected_columns_state, dataset_info, column_dropdown_html]
    )

    # Undo wiring: the lambdas fix the undo_all flag (False = one step).
    undo_last_btn.click(
        fn=lambda analysis_type: handle_undo_and_refresh(analysis_type, False),
        inputs=[analysis_selector],
        outputs=[data_handling_output, analysis_output, selected_columns_state, dataset_info, column_dropdown_html]
    )

    undo_all_btn.click(
        fn=lambda analysis_type: handle_undo_and_refresh(analysis_type, True),
        inputs=[analysis_selector],
        outputs=[data_handling_output, analysis_output, selected_columns_state, dataset_info, column_dropdown_html]
    )

    # Toggle chat popup visibility
    def toggle_chat_popup(current_visible):
        # Flip the stored flag and show/hide the popup accordingly.
        new_visible = not current_visible
        return gr.update(visible=new_visible), new_visible

    floating_chat_button.click(
        toggle_chat_popup,
        inputs=[popup_visible],
        outputs=[chat_popup, popup_visible]
    )

    close_chat_btn.click(
        fn=lambda: (gr.update(visible=False), False),
        outputs=[chat_popup, popup_visible]
    )

    ask_btn.click(
        fn=handle_question_analysis,
        inputs=[question_input, selected_columns_state],
        outputs=[answer_output, answer_chart, answer_data]
    )

    def handle_download():
        # Export the current dataset to a file and reveal the download widget
        # only when a file path was actually produced.
        if uploaded_df is not None:
            filepath = download_dataset(uploaded_df, dataset_name)
            return gr.update(value=filepath, visible=bool(filepath))
        return gr.update(visible=False)

    download_btn.click(handle_download, outputs=[download_file])

if __name__ == "__main__":
    demo.launch(share=True, debug=True)
data_engine.py ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ import os
6
+ import tempfile
7
+
8
def clean_numeric(df):
    """Return a copy of *df* with text columns coerced to numbers where possible.

    Two heuristics are applied per object/string column:

    - Percent values ("12%") become fractions (0.12) when more than half of
      the rows parse as percentages.
    - Grouped/currency values ("1,234", "₹5", "$6") have separators and
      currency symbols stripped and are converted when more than half of the
      rows parse as numbers.

    Columns that do not clear the 50% threshold are left untouched.
    """
    df = df.copy()
    row_count = len(df)
    if row_count == 0:
        # Bug fix: the >50% ratio checks below divided by len(df) and raised
        # ZeroDivisionError on an empty frame; nothing to clean anyway.
        return df
    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]) or df[col].dtype == object:
            s = df[col].astype(str).str.strip()
            if s.str.contains("%", na=False).any():
                numeric_vals = pd.to_numeric(s.str.replace("%", "", regex=False), errors="coerce")
                if numeric_vals.notna().sum() / row_count > 0.5:
                    # Interpret "12%" as the fraction 0.12.
                    df[col] = numeric_vals / 100.0
                    continue
            # Strip thousands separators and common currency symbols.
            cleaned = s.str.replace(",", "", regex=False).str.replace("₹", "", regex=False).str.replace("$", "", regex=False)
            numeric_vals = pd.to_numeric(cleaned, errors="coerce")
            if numeric_vals.notna().sum() / row_count > 0.5:
                df[col] = numeric_vals
    return df
23
+
24
def run_analysis(analysis_type, selected_columns, uploaded_df):
    """Run one named analysis over *uploaded_df*.

    Parameters
    ----------
    analysis_type : str
        One of the UI dropdown options ("Summary", "Describe", "Top 5 Rows",
        "Bottom 5 Rows", "Missing Values", "Highest Correlation",
        "Group & Aggregate", "Calculate Expressions") or "None".
    selected_columns : list[str]
        Columns chosen in the UI; required for column-scoped analyses.
    uploaded_df : pandas.DataFrame | None
        The currently loaded dataset.

    Returns
    -------
    tuple[str, pandas.DataFrame | None]
        Human-readable result text plus an optional table to display.
    """
    if uploaded_df is None:
        return "Please upload a dataset first.", None
    if analysis_type == "None" or analysis_type is None:
        return "", None

    # Leftover debugging aid for a dataset-specific issue with a 'title' column.
    if 'title' in uploaded_df.columns:
        title_nulls = uploaded_df['title'].isnull().sum()
        print(f"DEBUG: Title column has {title_nulls} null values at analysis time")

    # These analyses always operate on the whole dataset; the rest are scoped
    # to the user's column selection.
    whole_dataset_analyses = ["Summary", "Top 5 Rows", "Bottom 5 Rows", "Missing Values"]
    if analysis_type in whole_dataset_analyses:
        df_to_analyze = uploaded_df
    else:
        if not selected_columns:
            return f"Please select columns for {analysis_type} analysis.", None
        df_to_analyze = uploaded_df[selected_columns]

    try:
        if analysis_type == "Summary":
            # High-level shape overview: counts of rows and of numeric vs
            # text columns, plus the column names in each bucket.
            numeric_cols = uploaded_df.select_dtypes(include=[np.number]).columns
            categorical_cols = uploaded_df.select_dtypes(include=['object', 'category']).columns
            result = f"Dataset Summary:\nRows: {len(uploaded_df):,}\nColumns: {len(uploaded_df.columns)}\nNumeric Columns: {len(numeric_cols)}\nText Columns: {len(categorical_cols)}\n\n"
            if len(numeric_cols) > 0:
                result += "Numeric Columns: " + ", ".join(numeric_cols.tolist()) + "\n"
            if len(categorical_cols) > 0:
                result += "Text Columns: " + ", ".join(categorical_cols.tolist())
            return result, None

        elif analysis_type == "Describe":
            # Per-column profile: describe() stats for numeric columns,
            # uniques/missing/top values for categorical ones.
            result = "Column Description:\n" + "=" * 30 + "\n\n"
            for col in selected_columns:
                if col in df_to_analyze.columns:
                    result += f"Column: {col}\n"
                    if pd.api.types.is_numeric_dtype(df_to_analyze[col]):
                        stats = df_to_analyze[col].describe()
                        result += f" Type: Numeric\n Count: {stats['count']:.0f}\n Mean: {stats['mean']:.3f}\n Std: {stats['std']:.3f}\n Min: {stats['min']:.3f}\n 25%: {stats['25%']:.3f}\n 50%: {stats['50%']:.3f}\n 75%: {stats['75%']:.3f}\n Max: {stats['max']:.3f}\n\n"
                    else:
                        unique_count = df_to_analyze[col].nunique()
                        null_count = df_to_analyze[col].isnull().sum()
                        # mode() can be empty when the column is all-NaN.
                        most_common = df_to_analyze[col].mode().iloc[0] if len(df_to_analyze[col].mode()) > 0 else "N/A"
                        result += f" Type: Categorical/Text\n Unique Values: {unique_count}\n Missing Values: {null_count}\n Most Common: {most_common}\n"
                        top_values = df_to_analyze[col].value_counts().head(5)
                        result += " Top Values:\n"
                        for val, count in top_values.items():
                            result += f" {val}: {count} times\n"
                        result += "\n"
            return result, None

        elif analysis_type == "Top 5 Rows":
            return "Top 5 Rows - See data table below", df_to_analyze.head(5)

        elif analysis_type == "Bottom 5 Rows":
            return "Bottom 5 Rows - See data table below", df_to_analyze.tail(5)

        elif analysis_type == "Missing Values":
            # Counts both real NaNs and "pseudo-missing" text markers
            # (e.g. "UNKNOWN", "N/A", "-") per column.
            result = "Missing Values Analysis:\n" + "=" * 30 + "\n\n"
            patterns = ['UNKNOWN', 'unknown', 'ERROR', 'error', 'NULL', 'null', 'NA', 'na', 'N/A',
                        'Not Given', 'not given', 'NOT GIVEN', '', ' ', '-', '?', 'NaN', 'nan',
                        'None', 'none', 'NONE', '#N/A', 'n/a', 'N.A.', 'n.a.']

            for col in uploaded_df.columns:
                nan_count = uploaded_df[col].isnull().sum()
                pseudo_missing_count = 0

                non_null_data = uploaded_df[col].dropna()
                if len(non_null_data) > 0:
                    # Compare case-insensitively after stripping whitespace;
                    # the empty string is handled separately.
                    col_str = non_null_data.astype(str).str.strip()
                    empty_count = (col_str == '').sum()
                    pattern_count = 0
                    for pattern in patterns:
                        if pattern != '':
                            pattern_count += (col_str.str.lower() == pattern.lower()).sum()
                    pseudo_missing_count = empty_count + pattern_count

                total_missing = nan_count + pseudo_missing_count
                missing_percent = (total_missing / len(uploaded_df)) * 100

                # Leftover dataset-specific debugging aid.
                if col == 'title':
                    print(f"DEBUG: Title analysis - NaN: {nan_count}, Pseudo: {pseudo_missing_count}, Total: {total_missing}")

                if total_missing > 0:
                    details = []
                    if nan_count > 0:
                        details.append(f"{nan_count} NaN")
                    if pseudo_missing_count > 0:
                        details.append(f"{pseudo_missing_count} text-missing")
                    detail_str = f" ({', '.join(details)})"
                else:
                    detail_str = ""

                result += f"{col}: {total_missing} missing ({missing_percent:.2f}%){detail_str}\n"

            return result, None

        elif analysis_type == "Highest Correlation":
            # Pairwise Pearson correlations, ranked by absolute value.
            numeric_cols = df_to_analyze.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) < 2:
                return "Need at least 2 numeric columns for correlation analysis.", None
            corr_matrix = df_to_analyze[numeric_cols].corr()
            result = "Highest Correlations:\n" + "=" * 25 + "\n\n"
            correlations = []
            # Upper triangle only, so each pair is reported once.
            for i in range(len(corr_matrix.columns)):
                for j in range(i+1, len(corr_matrix.columns)):
                    col1, col2 = corr_matrix.columns[i], corr_matrix.columns[j]
                    corr_val = corr_matrix.iloc[i, j]
                    correlations.append((abs(corr_val), col1, col2, corr_val))
            correlations.sort(reverse=True)
            for _, col1, col2, corr_val in correlations[:10]:
                result += f"{col1} ↔ {col2}: {corr_val:.3f}\n"
            return result, None

        elif analysis_type == "Group & Aggregate":
            # Uses the first selected categorical column as the group key and
            # the first selected numeric column as the aggregation target.
            if not selected_columns:
                result = "Please select columns for grouping and aggregation."
            else:
                categorical_cols = [col for col in selected_columns if not pd.api.types.is_numeric_dtype(df_to_analyze[col])]
                numeric_cols = [col for col in selected_columns if pd.api.types.is_numeric_dtype(df_to_analyze[col])]

                if categorical_cols and numeric_cols:
                    group_col = categorical_cols[0]
                    agg_col = numeric_cols[0]
                    grouped = df_to_analyze.groupby(group_col)[agg_col].agg(['count', 'mean', 'sum']).round(2)
                    result = f"Group & Aggregate Analysis:\n" + "=" * 35 + "\n\n"
                    result += f"Grouped by: {group_col}\nAggregated: {agg_col}\n\n"
                    result += grouped.to_string()
                elif categorical_cols:
                    # No numeric column selected: fall back to value counts.
                    group_col = categorical_cols[0]
                    grouped = df_to_analyze[group_col].value_counts()
                    result = f"Group Count Analysis:\n" + "=" * 25 + "\n\n"
                    result += grouped.to_string()
                else:
                    result = "Please select at least one categorical column for grouping."
            return result, None

        elif analysis_type == "Calculate Expressions":
            # Demonstrates derived columns using the first two numeric columns.
            numeric_cols = df_to_analyze.select_dtypes(include=[np.number]).columns

            if len(numeric_cols) >= 2:
                col1, col2 = numeric_cols[0], numeric_cols[1]
                df_calc = df_to_analyze.copy()
                df_calc['Sum'] = df_calc[col1] + df_calc[col2]
                df_calc['Difference'] = df_calc[col1] - df_calc[col2]

                result = f"Calculated Expressions:\n" + "=" * 30 + "\n\n"
                result += f"Using columns: {col1} and {col2}\n\n"
                result += f"New calculated columns:\nSum = {col1} + {col2}\nDifference = {col1} - {col2}\n\n"
                result += "Sample results:\n"
                result += df_calc[['Sum', 'Difference']].head().to_string()
            else:
                result = "Need at least 2 numeric columns for calculations."
            return result, None

        else:
            return f"Analysis type '{analysis_type}' is under development.", None

    except Exception as e:
        return f"Error in analysis: {str(e)}", None
182
+
183
def create_chart_explanation(viz_type, df_to_plot, selected_columns, fig_data=None):
    """Build a short plain-text summary for a generated chart.

    Parameters
    ----------
    viz_type : str
        Chart type label ("Bar Chart", "Line Chart", ...).
    df_to_plot : pandas.DataFrame
        The data the chart was built from.
    selected_columns : list[str]
        Columns used for the chart; the first two drive the summary.
    fig_data : pandas.DataFrame, optional
        Pre-aggregated data (with a 'Count' column) for Line Chart summaries.

    Returns
    -------
    str
        A specific summary when one can be computed, otherwise a generic
        one-size-fits-all description.
    """
    try:
        if viz_type == "Bar Chart" and len(selected_columns) >= 2:
            x_col, y_col = selected_columns[0], selected_columns[1]
            if pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                # Numeric Y: report the row with the largest value.
                max_val_idx = df_to_plot[y_col].idxmax()
                max_category = df_to_plot.loc[max_val_idx, x_col]
                max_value = df_to_plot[y_col].max()
                y_mean = df_to_plot[y_col].mean()
            else:
                # Categorical Y: report counts per X group instead.
                grouped = df_to_plot.groupby(x_col)[y_col].count()
                max_category = grouped.idxmax()
                max_value = grouped.max()
                y_mean = grouped.mean()
            return f"BAR CHART: {y_col} by {x_col}\nHighest: {max_category} ({max_value:.2f})\nAverage: {y_mean:.2f}\nCategories: {df_to_plot[x_col].nunique()}"
        elif viz_type == "Line Chart" and fig_data is not None:
            max_combo = fig_data.loc[fig_data['Count'].idxmax()]
            min_combo = fig_data.loc[fig_data['Count'].idxmin()]
            return f"LINE CHART: Distribution\nHighest: {max_combo[selected_columns[1]]} in {max_combo[selected_columns[0]]} ({max_combo['Count']})\nLowest: {min_combo[selected_columns[1]]} in {min_combo[selected_columns[0]]} ({min_combo['Count']})\nTotal: {len(df_to_plot)}"
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt are
        # no longer swallowed. Any data error (missing column, empty frame,
        # all-NaN idxmax) falls through to the generic description below.
        pass
    return f"{viz_type} visualization\nShows data patterns and relationships"
205
+
206
def create_visualization(viz_type, selected_columns, uploaded_df):
    """Build a Plotly figure of the requested type from the selected columns.

    Parameters
    ----------
    viz_type : str
        One of the UI chart options ("Bar Chart", "Pie Chart", "Scatter Plot",
        "Line Chart", "Histogram", "Heat Map", "Box Plot") or "None".
    selected_columns : list[str]
        Columns chosen in the UI; column roles (x, y, color) are assigned by
        position in this list.
    uploaded_df : pandas.DataFrame | None
        The currently loaded dataset.

    Returns
    -------
    tuple
        (figure, explanation text, figure) on success, or
        (None, error/help message, None) when the chart cannot be built.
        The figure is returned twice because two UI outputs consume it.
    """
    if uploaded_df is None or viz_type == "None":
        return None, "", None
    if not selected_columns:
        return None, "Please select columns for visualization.", None
    df_to_plot = uploaded_df[selected_columns]

    try:
        if viz_type == "Bar Chart":
            if len(selected_columns) >= 2:
                x_col, y_col = selected_columns[0], selected_columns[1]
                # Optional third column drives the bar colors.
                color_col = selected_columns[2] if len(selected_columns) > 2 else None

                # Handle different data type combinations
                if pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                    # Numeric Y-axis: use as-is (capped at 100 rows for readability)
                    plot_data = df_to_plot.head(100)
                    fig = px.bar(plot_data, x=x_col, y=y_col, color=color_col, title=f"{y_col} by {x_col}")
                else:
                    # Non-numeric Y-axis: count occurrences
                    if pd.api.types.is_numeric_dtype(df_to_plot[x_col]):
                        # If X is numeric, group and count Y values
                        grouped = df_to_plot.groupby(x_col)[y_col].count().reset_index()
                        grouped.columns = [x_col, f'Count of {y_col}']
                        fig = px.bar(grouped, x=x_col, y=f'Count of {y_col}', title=f"Count of {y_col} by {x_col}")
                    else:
                        # Both categorical: cross-tabulation
                        crosstab = pd.crosstab(df_to_plot[x_col], df_to_plot[y_col])
                        crosstab_reset = crosstab.reset_index().melt(id_vars=[x_col], var_name=y_col, value_name='Count')
                        fig = px.bar(crosstab_reset, x=x_col, y='Count', color=y_col, title=f"{y_col} distribution by {x_col}")

                explanation = create_chart_explanation(viz_type, df_to_plot, selected_columns)
            else:
                # Single column: histogram for numeric data, top-value bars otherwise.
                col = selected_columns[0]
                if pd.api.types.is_numeric_dtype(df_to_plot[col]):
                    fig = px.histogram(df_to_plot, x=col, title=f"Distribution of {col}")
                else:
                    value_counts = df_to_plot[col].value_counts().head(15)
                    fig = px.bar(x=value_counts.index, y=value_counts.values, title=f"Top Values in {col}")
                explanation = f"Chart showing distribution of {col}"
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Pie Chart":
            col = selected_columns[0]
            if len(selected_columns) >= 2 and pd.api.types.is_numeric_dtype(df_to_plot[selected_columns[1]]):
                # Second numeric column supplies the slice sizes (summed per category).
                grouped_data = df_to_plot.groupby(col)[selected_columns[1]].sum().reset_index()
                fig = px.pie(grouped_data, values=selected_columns[1], names=col, title=f"Total {selected_columns[1]} by {col}")
                legend_title = f"{col} Categories"
            else:
                # Otherwise: slice sizes are category frequencies (top 10 only).
                value_counts = df_to_plot[col].value_counts().head(10)
                fig = px.pie(values=value_counts.values, names=value_counts.index, title=f"Distribution of {col}")
                legend_title = f"{col} Values"

            # Custom legend placement to the right of the pie.
            fig.update_layout(
                width=800,
                height=500,
                showlegend=True,
                legend=dict(
                    title=dict(text=legend_title, font=dict(size=14, color="black")),
                    orientation="v",
                    yanchor="middle",
                    y=0.5,
                    xanchor="left",
                    x=1.05,
                    font=dict(size=12)
                )
            )
            explanation = f"PIE CHART: {col} Distribution\nShows proportion of each category\nUse to understand category distribution patterns"
            return fig, explanation, fig

        elif viz_type == "Scatter Plot":
            if len(selected_columns) >= 2:
                x_col, y_col = selected_columns[0], selected_columns[1]
                color_col = selected_columns[2] if len(selected_columns) > 2 else None

                # Check if both columns are suitable for scatter plot
                if not (pd.api.types.is_numeric_dtype(df_to_plot[x_col]) and pd.api.types.is_numeric_dtype(df_to_plot[y_col])):
                    return None, f"Scatter plot requires numeric data. {x_col} and {y_col} must be numeric.", None

                fig = px.scatter(df_to_plot, x=x_col, y=y_col, color=color_col, title=f"{y_col} vs {x_col}")
                explanation = f"Scatter plot showing relationship between {x_col} and {y_col}"
            else:
                return None, "Scatter plot requires at least 2 columns.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Line Chart":
            if len(selected_columns) >= 2:
                x_col, y_col = selected_columns[0], selected_columns[1]

                if pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                    # Numeric Y: sort by X and plot trend
                    sorted_data = df_to_plot.sort_values(x_col)
                    fig = px.line(sorted_data, x=x_col, y=y_col, title=f"Trend of {y_col} over {x_col}", markers=True)
                    explanation = f"Line chart showing trend of {y_col} over {x_col}"
                else:
                    # Non-numeric Y: create cross-tabulation (one line per Y category)
                    crosstab = pd.crosstab(df_to_plot[x_col], df_to_plot[y_col])
                    melted = pd.melt(crosstab.reset_index(), id_vars=[x_col], var_name=y_col, value_name='Count')
                    fig = px.line(melted, x=x_col, y='Count', color=y_col, title=f"Distribution of {y_col} across {x_col}", markers=True)
                    explanation = create_chart_explanation(viz_type, df_to_plot, selected_columns, melted)
            else:
                return None, "Line chart requires at least 2 columns.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Histogram":
            col = selected_columns[0]
            if pd.api.types.is_numeric_dtype(df_to_plot[col]):
                fig = px.histogram(df_to_plot, x=col, title=f"Distribution of {col}", nbins=30)
                explanation = f"Histogram showing distribution of {col}"
            else:
                return None, f"Histogram requires numeric data. Try Bar Chart instead.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Heat Map":
            if len(selected_columns) >= 2:
                numeric_cols = [col for col in selected_columns if pd.api.types.is_numeric_dtype(df_to_plot[col])]
                if len(numeric_cols) >= 2:
                    # Two or more numeric columns: correlation heatmap.
                    corr_matrix = df_to_plot[numeric_cols].corr()
                    fig = px.imshow(corr_matrix, text_auto=True, aspect="auto", title="Correlation Heatmap", color_continuous_scale='RdBu')
                    explanation = f"Heatmap showing correlations between numeric columns"
                else:
                    # Otherwise: categorical cross-tabulation heatmap.
                    x_col, y_col = selected_columns[0], selected_columns[1]
                    crosstab = pd.crosstab(df_to_plot[x_col], df_to_plot[y_col])
                    fig = px.imshow(crosstab.values, x=crosstab.columns, y=crosstab.index, text_auto=True, aspect="auto", title=f"Cross-tabulation: {y_col} vs {x_col}")
                    explanation = f"Heatmap showing cross-tabulation between {x_col} and {y_col}"
            else:
                return None, "Heat map requires at least 2 columns.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Box Plot":
            if len(selected_columns) >= 1:
                y_col = selected_columns[0]
                if not pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                    return None, f"Box plot requires numeric Y-axis. {y_col} is not numeric.", None

                # Optional second column groups the boxes along the X axis.
                x_col = selected_columns[1] if len(selected_columns) > 1 else None
                fig = px.box(df_to_plot, x=x_col, y=y_col, title=f"Box Plot of {y_col}" + (f" by {x_col}" if x_col else ""))
                explanation = f"Box plot showing distribution of {y_col}" + (f" grouped by {x_col}" if x_col else "")
            else:
                return None, "Box plot requires at least 1 column.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        else:
            return None, f"Visualization type '{viz_type}' is under development.", None

    except Exception as e:
        return None, f"Error creating visualization: {str(e)}", None
359
+
360
+ def handle_missing_data(method, selected_columns, constant_value, uploaded_df, change_history):
361
+ print(f"DEBUG: Starting {method} on columns {selected_columns}")
362
+
363
+ if uploaded_df is None:
364
+ return "Please upload a dataset first.", uploaded_df, change_history
365
+ if method == "None":
366
+ return "", uploaded_df, change_history
367
+ if not selected_columns:
368
+ return "Please select columns to apply data handling.", uploaded_df, change_history
369
+
370
+ try:
371
+ change_history.append(uploaded_df.copy())
372
+ df_copy = uploaded_df.copy()
373
+
374
+ if method == "Clean All Missing":
375
+ return "Clean All Missing is not available", uploaded_df, change_history
376
+
377
+ processed_columns = []
378
+ dropped_columns = []
379
+
380
+ for col in selected_columns:
381
+ if col not in df_copy.columns:
382
+ continue
383
+
384
+ if method == "Forward Fill":
385
+ if col == 'title':
386
+ print(f"DEBUG: Skipping title column due to data inconsistencies")
387
+ continue
388
+
389
+ if df_copy[col].dtype == 'object':
390
+ patterns = ['UNKNOWN', 'unknown', 'ERROR', 'error', 'NULL', 'null', 'NA', 'na', 'N/A',
391
+ 'Not Given', 'not given', 'NOT GIVEN', '', ' ', '-', '?', 'NaN', 'nan',
392
+ 'None', 'none', 'NONE', '#N/A', 'n/a', 'N.A.', 'n.a.']
393
+ for pattern in patterns:
394
+ df_copy[col] = df_copy[col].replace(pattern, np.nan)
395
+ df_copy[col] = df_copy[col].replace('', np.nan)
396
+
397
+ df_copy[col] = df_copy[col].ffill()
398
+ processed_columns.append(col)
399
+ elif method == "Backward Fill":
400
+ if df_copy[col].dtype == 'object':
401
+ patterns = ['UNKNOWN', 'unknown', 'ERROR', 'error', 'NULL', 'null', 'NA', 'na', 'N/A',
402
+ 'Not Given', 'not given', 'NOT GIVEN', '', ' ', '-', '?', 'NaN', 'nan',
403
+ 'None', 'none', 'NONE', '#N/A', 'n/a', 'N.A.', 'n.a.']
404
+ for pattern in patterns:
405
+ df_copy[col] = df_copy[col].replace(pattern, np.nan)
406
+ df_copy[col] = df_copy[col].replace('', np.nan)
407
+
408
+ df_copy[col] = df_copy[col].bfill()
409
+ processed_columns.append(col)
410
+ elif method == "Constant Fill":
411
+ if df_copy[col].dtype == 'object':
412
+ patterns = ['UNKNOWN', 'unknown', 'ERROR', 'error', 'NULL', 'null', 'NA', 'na', 'N/A',
413
+ 'Not Given', 'not given', 'NOT GIVEN', '', ' ', '-', '?', 'NaN', 'nan',
414
+ 'None', 'none', 'NONE', '#N/A', 'n/a', 'N.A.', 'n.a.']
415
+ for pattern in patterns:
416
+ df_copy[col] = df_copy[col].replace(pattern, np.nan)
417
+ df_copy[col] = df_copy[col].replace('', np.nan)
418
+
419
+ fill_val = constant_value.strip() if constant_value else "Unknown"
420
+ df_copy[col] = df_copy[col].fillna(fill_val)
421
+ processed_columns.append(col)
422
+ elif method == "Mean Fill":
423
+ if pd.api.types.is_numeric_dtype(df_copy[col]):
424
+ if not df_copy[col].isna().all():
425
+ mean_val = df_copy[col].mean()
426
+ df_copy[col] = df_copy[col].fillna(mean_val)
427
+ processed_columns.append(col)
428
+ else:
429
+ numeric_col = pd.to_numeric(df_copy[col], errors='coerce')
430
+ if not numeric_col.isna().all():
431
+ mean_val = numeric_col.mean()
432
+ df_copy[col] = numeric_col.fillna(mean_val)
433
+ processed_columns.append(col)
434
+ elif method == "Median Fill":
435
+ if pd.api.types.is_numeric_dtype(df_copy[col]):
436
+ if not df_copy[col].isna().all():
437
+ median_val = df_copy[col].median()
438
+ df_copy[col] = df_copy[col].fillna(median_val)
439
+ processed_columns.append(col)
440
+ else:
441
+ numeric_col = pd.to_numeric(df_copy[col], errors='coerce')
442
+ if not numeric_col.isna().all():
443
+ median_val = numeric_col.median()
444
+ df_copy[col] = numeric_col.fillna(median_val)
445
+ processed_columns.append(col)
446
+ elif method == "Mode Fill":
447
+ patterns = ['UNKNOWN', 'unknown', 'ERROR', 'error', 'NULL', 'null', 'NA', 'na', 'N/A',
448
+ 'Not Given', 'not given', 'NOT GIVEN', '', ' ', '-', '?', 'NaN', 'nan',
449
+ 'None', 'none', 'NONE', '#N/A', 'n/a', 'N.A.', 'n.a.']
450
+
451
+ valid_values = df_copy[col][~df_copy[col].isin(patterns) & df_copy[col].notna()]
452
+
453
+ if len(valid_values) > 0:
454
+ mode_value = valid_values.mode()
455
+ if len(mode_value) > 0:
456
+ most_common = mode_value.iloc[0]
457
+ print(f"DEBUG: Mode Fill - Most common value for {col}: {most_common}")
458
+
459
+ for pattern in patterns:
460
+ df_copy[col] = df_copy[col].replace(pattern, most_common)
461
+
462
+ df_copy[col] = df_copy[col].fillna(most_common)
463
+
464
+ processed_columns.append(col)
465
+ elif method == "Drop Columns":
466
+ df_copy = df_copy.drop(columns=[col])
467
+ dropped_columns.append(col)
468
+
469
+ uploaded_df = df_copy
470
+ remaining_cols = [col for col in selected_columns if col not in dropped_columns]
471
+
472
+ if 'title' in uploaded_df.columns:
473
+ title_check = uploaded_df['title'].astype(str).str.contains('UNKNOWN', case=False, na=False).sum()
474
+ print(f"DEBUG: After update, title has {title_check} UNKNOWN values")
475
+
476
+ if processed_columns:
477
+ result = f"Applied {method} to: {', '.join(processed_columns)}"
478
+ for col in processed_columns:
479
+ if col in uploaded_df.columns:
480
+ after_missing = uploaded_df[col].isnull().sum()
481
+ result += f"\n- {col}: {after_missing} missing values remaining"
482
+ elif dropped_columns:
483
+ result = f"Dropped columns: {', '.join(dropped_columns)}"
484
+ else:
485
+ result = "No columns processed - check column selection or data types"
486
+
487
+ return result, uploaded_df, change_history
488
+
489
+ except Exception as e:
490
+ return f"Error: {str(e)}", uploaded_df, change_history
491
+
492
def undo_last_change(uploaded_df, change_history):
    """Revert the dataset to the snapshot taken before the most recent change.

    Args:
        uploaded_df: The current DataFrame (returned unchanged if there is
            no history to pop).
        change_history: List of DataFrame snapshots, most recent last.
            Mutated in place: the last snapshot is popped.

    Returns:
        Tuple of (status message, restored DataFrame, remaining history).
    """
    if not change_history:
        return "No changes to undo.", uploaded_df, change_history
    restored = change_history.pop()
    rows, cols = restored.shape
    message = f"Undid last change. Dataset now has {rows} rows × {cols} columns"
    return message, restored, change_history
497
+
498
def undo_all_changes(original_df, change_history):
    """Discard every modification and restore a fresh copy of the original data.

    Args:
        original_df: The pristine DataFrame captured at upload time, or None.
        change_history: Existing undo history (returned untouched when there
            is nothing to restore; otherwise replaced by an empty list).

    Returns:
        Tuple of (status message, restored DataFrame or None, new history).
    """
    if original_df is None:
        return "No original dataset to restore.", None, change_history
    # Copy so later in-place edits cannot corrupt the pristine original.
    restored = original_df.copy()
    rows, cols = restored.shape
    message = f"Dataset restored to original state ({rows} rows × {cols} columns)"
    return message, restored, []
504
+
505
def download_dataset(uploaded_df, dataset_name):
    """Write the current dataset to a CSV file in the temp directory.

    Args:
        uploaded_df: The pandas DataFrame to export, or None.
        dataset_name: Original upload filename; its spreadsheet extension
            (.csv/.xlsx/.xls) is stripped to derive the download name.

    Returns:
        Absolute path of the written CSV file, or None when there is no data.
    """
    if uploaded_df is None:
        return None

    if dataset_name:
        # Strip a known spreadsheet extension only when it is the actual
        # suffix. The previous chained str.replace() removed the substring
        # anywhere in the name, mangling names like "report.csv.backup".
        base_name, ext = os.path.splitext(dataset_name)
        if ext.lower() not in ('.csv', '.xlsx', '.xls'):
            base_name = dataset_name
        filename = f"{base_name}_modified.csv"
    else:
        filename = "modified_dataset.csv"

    filepath = os.path.join(tempfile.gettempdir(), filename)
    uploaded_df.to_csv(filepath, index=False)
    return filepath
519
+
520
def display_data_format(format_type, selected_columns, uploaded_df):
    """Return up to 100 rows for the DataFrame preview widget.

    Args:
        format_type: Display mode; only "DataFrame" yields a preview.
        selected_columns: Optional list of columns to restrict the view to;
            an empty/None selection shows every column.
        uploaded_df: Current pandas DataFrame, or None.

    Returns:
        The first 100 rows of the (possibly column-restricted) DataFrame,
        or None when nothing should be rendered.
    """
    if uploaded_df is None or format_type == "None":
        return None
    subset = uploaded_df[selected_columns] if selected_columns else uploaded_df
    if format_type != "DataFrame":
        return None
    # Cap the preview at 100 rows to keep the UI responsive.
    return subset.head(100)
528
+
529
def display_text_format(format_type, selected_columns, uploaded_df):
    """Render a text preview of the dataset as JSON or a Python dict string.

    Args:
        format_type: "JSON", "Dictionary", or "None".
        selected_columns: Optional list of columns to restrict the preview to.
        uploaded_df: Current pandas DataFrame, or None.

    Returns:
        The formatted preview text, or "" when nothing should be shown.
    """
    if uploaded_df is None or format_type == "None":
        return ""
    if selected_columns and len(selected_columns) > 0:
        df_to_show = uploaded_df[selected_columns]
    else:
        df_to_show = uploaded_df
    # Only preview the first 20 rows to keep the text output readable.
    if format_type == "JSON":
        return df_to_show.head(20).to_json(orient='records', indent=2)
    if format_type == "Dictionary":
        return str(df_to_show.head(20).to_dict(orient='records'))
    # Bug fix: previously fell through and implicitly returned None for any
    # other format type; always return a string for the text widget.
    return ""
prompts.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# System prompt for the primary analysis LLM call. It constrains the model to
# emit a strict-JSON "plan" (operations + optional plot spec + narrative) that
# the application then executes against the uploaded DataFrame. The numbered
# RESPONSE FORMATS below are the only shapes downstream parsing expects.
ENHANCED_SYSTEM_PROMPT = """You are a data analysis assistant. Respond ONLY in valid JSON format.
RULES:
1. DATASET ANALYSIS QUESTIONS (always process these): patterns, trends, insights, statistics, correlations, distributions, summaries, comparisons, relationships, data quality, outliers, analysis, exploration, findings, recommendations
2. NON-DATASET QUESTIONS (reject these): general knowledge, current events, personal questions, definitions unrelated to the data
3. Parse user queries to identify: column names, values, conditions, operations
4. For complex queries with multiple conditions, use multiple operations in sequence
5. Always use exact column names from the available columns list
OPERATIONS:
- filter: Use "expr" for conditions like "column_name > 100" or "column" + "value" for exact matches
- count: Count specific values in columns
- describe: Statistical summary
- groupby: Group and aggregate data
- calculate: Mathematical operations
FOR MULTI-CONDITION QUERIES:
- Step 1: Filter data based on conditions
- Step 2: Perform count/analysis on filtered data
CHART CREATION RULES:
- For visualization requests: Always include "plot" object
- For informational queries: Set "plot": null
RESPONSE FORMATS:
1. INFORMATIONAL (no visualization):
{
"type": "explain",
"operations": [],
"plot": null,
"narrative": "detailed answer",
"insights_needed": false
}
2. STATISTICAL DESCRIPTION:
{
"type": "describe",
"operations": [{"op": "describe", "columns": ["col1", "col2"]}],
"plot": null,
"narrative": "statistical summary",
"insights_needed": false
}
3. VISUALIZATION REQUEST:
{
"type": "analysis",
"operations": [
{"op": "groupby", "columns": ["category"], "agg_col": "value", "agg_func": "sum"}
],
"plot": {
"type": "bar|line|pie|hist|scatter",
"x": "category",
"y": "sum_value",
"title": "Chart Title"
},
"narrative": "brief explanation",
"insights_needed": true
}
4. FILTERING:
{
"type": "analysis",
"operations": [{"op": "filter", "column": "column_name", "value": "specific_value"}],
"plot": null,
"narrative": "filtered data explanation",
"insights_needed": false
}
5. CALCULATIONS:
{
"type": "analysis",
"operations": [{"op": "calculate", "expr": "Col1 * Col2", "new_col": "Product"}],
"plot": null,
"narrative": "calculation explanation",
"insights_needed": false
}
6. COUNT VALUES:
{
"type": "analysis",
"operations": [{"op": "count", "column": "column_name", "value": "specific_value"}],
"plot": null,
"narrative": "count result explanation",
"insights_needed": false
}
7. SHOW ALL VALUES:
{
"type": "analysis",
"operations": [{"op": "count", "column": "column_name"}],
"plot": null,
"narrative": "showing all unique values",
"insights_needed": false
}
8. MULTI-CONDITION QUERIES:
{
"type": "analysis",
"operations": [
{"op": "filter", "expr": "column_name > value"},
{"op": "count", "column": "another_column", "value": "target_value"}
],
"plot": null,
"narrative": "",
"insights_needed": false
}
CHART TYPES:
- "bar": For categorical comparisons
- "line": For trends over time/sequence
- "pie": For proportions/percentages
- "hist": For distributions
- "scatter": For correlations
Always ensure column names exist in the dataset before referencing them.
"""
103
+
104
# System prompt for the secondary "insights" LLM call: it turns the computed
# analysis context (statistics, filtered results, etc.) into narrative
# observations rather than another JSON plan.
INSIGHTS_SYSTEM_PROMPT = "You are a data insights expert. Analyze the provided data context and generate meaningful insights about patterns, trends, relationships, and key findings. Focus on actionable insights that help understand the data better. Provide clear, specific observations based on the actual data values and statistics shown."
105
+
106
# Example questions offered to the user as starting points. Several entries
# are commented out — presumably disabled to keep the list short; confirm
# before re-enabling.
SAMPLE_QUESTIONS = [
    "What are the key patterns in this dataset?",
    "Show me insights about this data",
    # "What trends can you identify?",
    # "Analyze the relationships between columns",
    # "What are the main findings from this data?",
    "Describe the data distribution and patterns",
    # "What recommendations can you make?",
    # "Find correlations in the dataset",
    # "Summarize the key statistics"
]
117
+
118
def get_chart_prompt(question, columns, data_sample):
    """Build the per-question analysis prompt sent to the LLM.

    Args:
        question: The raw user query.
        columns: Column names available in the loaded dataset.
        data_sample: String preview of the data (e.g. rendered head rows).

    Returns:
        The fully formatted prompt string.
    """
    # Join once; the column list appears twice in the template.
    column_list = ', '.join(columns)
    return f"""
User Query: {question}
Dataset Information:
Available Columns: {column_list}
Sample Data:
{data_sample}
CRITICAL INSTRUCTIONS:
1. If the question contains ANY of these keywords, it's a DATASET ANALYSIS question - ALWAYS process it:
- patterns, trends, insights, analysis, statistics, correlations, relationships
- distribution, summary, compare, explore, findings, recommendations
- data, dataset, columns, values, records, rows
- show, find, count, filter, group, calculate, describe
2. ONLY reject questions about: presidents, weather, news, definitions, general knowledge
3. For dataset analysis questions:
- Use describe operations for exploratory questions
- Set "insights_needed": true for pattern/trend questions
- Create appropriate operations based on available columns
4. ALWAYS use exact column names from: {column_list}
5. For vague questions like "analyze this data", use describe on all key columns
"""
139
+
140
def validate_plot_spec(plot_spec, available_columns):
    """Repair a chart spec whose axis columns are missing from the dataset.

    When the requested x/y column does not exist, substitute the first
    available column whose name contains a categorical-looking keyword
    (for x) or a measure-looking keyword (for y). The spec dict is patched
    in place; an unmatched axis is left unchanged.

    Args:
        plot_spec: Dict with optional "x"/"y" keys, or a falsy value.
        available_columns: Column names actually present in the dataset.

    Returns:
        The (possibly patched) plot_spec, unchanged when falsy.
    """
    if not plot_spec:
        return plot_spec

    axis_keywords = {
        'x': ('name', 'category', 'type', 'group'),
        'y': ('value', 'amount', 'count', 'price', 'sales'),
    }
    for axis, keywords in axis_keywords.items():
        requested = plot_spec.get(axis)
        if not requested or requested in available_columns:
            continue
        fallback = next(
            (col for col in available_columns
             if any(kw in col.lower() for kw in keywords)),
            None,
        )
        if fallback is not None:
            plot_spec[axis] = fallback

    return plot_spec
160
+
161
def get_insights_prompt(context_parts, narrative):
    """Assemble the prompt that asks the LLM for bullet-point insights.

    Args:
        context_parts: List of analysis-context strings, joined by newlines.
        narrative: The narrative produced by the primary analysis call.

    Returns:
        The formatted insights prompt.
    """
    joined_context = "\n".join(context_parts)
    return f"""Based on this analysis, provide 4-6 detailed bullet points explaining key insights, patterns, and findings.
Analysis Context:
{joined_context}
Original Question Context:
{narrative}
Provide insights as bullet points."""
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas>=1.5.0
2
+ numpy>=1.21.0
3
+ matplotlib>=3.5.0
4
+ plotly>=5.0.0
5
+ langchain_groq>=0.1.0
6
+ gradio==4.20.0
7
+ openpyxl>=3.0.0
8
+ xlrd>=2.0.0
9
+ uuid-utils>=0.7.0
10
+ tokenizers>=0.13.0
11
+ safetensors>=0.3.0
12
+ setuptools-rust>=1.5.0
13
+ wheel>=0.37.0
14
+ setuptools>=65.0.0