import os
import tempfile

import numpy as np
import pandas as pd

# Plotly is only required for the visualization path.  Import it defensively
# so the cleaning / analysis helpers stay usable when plotly is not installed.
try:
    import plotly.express as px
    import plotly.graph_objects as go  # noqa: F401  (kept for API parity)
except ImportError:
    px = None
    go = None

# Text values treated as "missing" in addition to real NaN.  Previously this
# list was duplicated five times throughout the file.
MISSING_PATTERNS = [
    'UNKNOWN', 'unknown', 'ERROR', 'error', 'NULL', 'null', 'NA', 'na',
    'N/A', 'Not Given', 'not given', 'NOT GIVEN', '', ' ', '-', '?',
    'NaN', 'nan', 'None', 'none', 'NONE', '#N/A', 'n/a', 'N.A.', 'n.a.',
]


def _blank_missing(series):
    """Replace textual missing-value markers (including '') with real NaN."""
    return series.replace(MISSING_PATTERNS, np.nan)


def clean_numeric(df):
    """Coerce object columns that are mostly numeric into numeric dtype.

    Percent strings ("12%") become fractions (0.12); thousands separators
    and currency symbols (",", "₹", "$") are stripped.  A column is only
    converted when more than half of its values parse as numbers.

    Returns a new DataFrame; the input is never modified.
    """
    df = df.copy()
    n_rows = len(df)
    if n_rows == 0:
        # Fix: the original divided by len(df) and crashed on empty input.
        return df
    for col in df.columns:
        if not (pd.api.types.is_string_dtype(df[col]) or df[col].dtype == object):
            continue
        s = df[col].astype(str).str.strip()
        if s.str.contains("%", na=False).any():
            numeric_vals = pd.to_numeric(s.str.replace("%", "", regex=False),
                                         errors="coerce")
            if numeric_vals.notna().sum() / n_rows > 0.5:
                df[col] = numeric_vals / 100.0
                continue
        cleaned = (s.str.replace(",", "", regex=False)
                    .str.replace("₹", "", regex=False)
                    .str.replace("$", "", regex=False))
        numeric_vals = pd.to_numeric(cleaned, errors="coerce")
        if numeric_vals.notna().sum() / n_rows > 0.5:
            df[col] = numeric_vals
    return df


def _summary_text(df):
    """Whole-dataset summary: row/column counts and column-type breakdown."""
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    result = (f"Dataset Summary:\nRows: {len(df):,}\nColumns: {len(df.columns)}\n"
              f"Numeric Columns: {len(numeric_cols)}\n"
              f"Text Columns: {len(categorical_cols)}\n\n")
    if len(numeric_cols) > 0:
        result += "Numeric Columns: " + ", ".join(numeric_cols.tolist()) + "\n"
    if len(categorical_cols) > 0:
        result += "Text Columns: " + ", ".join(categorical_cols.tolist())
    return result


def _describe_text(df, columns):
    """Per-column description: describe() stats or categorical value counts."""
    result = "Column Description:\n" + "=" * 30 + "\n\n"
    for col in columns:
        if col not in df.columns:
            continue
        result += f"Column: {col}\n"
        if pd.api.types.is_numeric_dtype(df[col]):
            stats = df[col].describe()
            result += (f"  Type: Numeric\n  Count: {stats['count']:.0f}\n"
                       f"  Mean: {stats['mean']:.3f}\n  Std: {stats['std']:.3f}\n"
                       f"  Min: {stats['min']:.3f}\n  25%: {stats['25%']:.3f}\n"
                       f"  50%: {stats['50%']:.3f}\n  75%: {stats['75%']:.3f}\n"
                       f"  Max: {stats['max']:.3f}\n\n")
        else:
            unique_count = df[col].nunique()
            null_count = df[col].isnull().sum()
            modes = df[col].mode()
            most_common = modes.iloc[0] if len(modes) > 0 else "N/A"
            result += (f"  Type: Categorical/Text\n  Unique Values: {unique_count}\n"
                       f"  Missing Values: {null_count}\n  Most Common: {most_common}\n")
            top_values = df[col].value_counts().head(5)
            result += "  Top Values:\n"
            for val, count in top_values.items():
                result += f"    {val}: {count} times\n"
            result += "\n"
    return result


def _missing_values_text(df):
    """Report NaN plus pseudo-missing text markers for every column.

    Fix: the original compared each pattern case-insensitively against the
    data, so case variants of the same word ('UNKNOWN' and 'unknown') each
    matched the same cells and the counts were inflated.  Matching against a
    de-duplicated lower-cased set counts each cell exactly once.
    """
    result = "Missing Values Analysis:\n" + "=" * 30 + "\n\n"
    if len(df) == 0:
        return result + "Dataset is empty.\n"
    lowered = {p.lower() for p in MISSING_PATTERNS if p != ''}
    for col in df.columns:
        nan_count = df[col].isnull().sum()
        pseudo_missing = 0
        non_null = df[col].dropna()
        if len(non_null) > 0:
            col_str = non_null.astype(str).str.strip()
            empty_count = (col_str == '').sum()
            pattern_count = col_str.str.lower().isin(lowered).sum()
            pseudo_missing = empty_count + pattern_count
        total_missing = nan_count + pseudo_missing
        missing_percent = (total_missing / len(df)) * 100
        if total_missing > 0:
            details = []
            if nan_count > 0:
                details.append(f"{nan_count} NaN")
            if pseudo_missing > 0:
                details.append(f"{pseudo_missing} text-missing")
            detail_str = f" ({', '.join(details)})"
        else:
            detail_str = ""
        result += f"{col}: {total_missing} missing ({missing_percent:.2f}%){detail_str}\n"
    return result


def _correlation_text(df):
    """Top 10 absolute pairwise correlations between numeric columns."""
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) < 2:
        return "Need at least 2 numeric columns for correlation analysis."
    corr_matrix = df[numeric_cols].corr()
    result = "Highest Correlations:\n" + "=" * 25 + "\n\n"
    cols = corr_matrix.columns
    correlations = []
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            corr_val = corr_matrix.iloc[i, j]
            correlations.append((abs(corr_val), cols[i], cols[j], corr_val))
    correlations.sort(reverse=True)
    for _, col1, col2, corr_val in correlations[:10]:
        result += f"{col1} ↔ {col2}: {corr_val:.3f}\n"
    return result


def _group_aggregate_text(df, selected_columns):
    """Group by the first categorical column; aggregate the first numeric one."""
    categorical = [c for c in selected_columns
                   if not pd.api.types.is_numeric_dtype(df[c])]
    numeric = [c for c in selected_columns
               if pd.api.types.is_numeric_dtype(df[c])]
    if categorical and numeric:
        group_col, agg_col = categorical[0], numeric[0]
        grouped = df.groupby(group_col)[agg_col].agg(['count', 'mean', 'sum']).round(2)
        result = "Group & Aggregate Analysis:\n" + "=" * 35 + "\n\n"
        result += f"Grouped by: {group_col}\nAggregated: {agg_col}\n\n"
        result += grouped.to_string()
    elif categorical:
        result = "Group Count Analysis:\n" + "=" * 25 + "\n\n"
        result += df[categorical[0]].value_counts().to_string()
    else:
        result = "Please select at least one categorical column for grouping."
    return result


def _calc_expressions_text(df):
    """Demonstrate Sum/Difference of the first two numeric columns."""
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) < 2:
        return "Need at least 2 numeric columns for calculations."
    col1, col2 = numeric_cols[0], numeric_cols[1]
    df_calc = df.copy()
    df_calc['Sum'] = df_calc[col1] + df_calc[col2]
    df_calc['Difference'] = df_calc[col1] - df_calc[col2]
    result = "Calculated Expressions:\n" + "=" * 30 + "\n\n"
    result += f"Using columns: {col1} and {col2}\n\n"
    result += (f"New calculated columns:\nSum = {col1} + {col2}\n"
               f"Difference = {col1} - {col2}\n\n")
    result += "Sample results:\n"
    result += df_calc[['Sum', 'Difference']].head().to_string()
    return result


def run_analysis(analysis_type, selected_columns, uploaded_df):
    """Run the selected analysis.

    Returns a (text_result, dataframe_or_None) pair; the dataframe is only
    populated for the row-preview analyses.  Whole-dataset analyses ignore
    the column selection; everything else requires at least one column.
    """
    if uploaded_df is None:
        return "Please upload a dataset first.", None
    if analysis_type == "None" or analysis_type is None:
        return "", None

    whole_dataset_analyses = ["Summary", "Top 5 Rows", "Bottom 5 Rows",
                              "Missing Values"]
    if analysis_type in whole_dataset_analyses:
        df_to_analyze = uploaded_df
    else:
        if not selected_columns:
            return f"Please select columns for {analysis_type} analysis.", None
        df_to_analyze = uploaded_df[selected_columns]

    try:
        if analysis_type == "Summary":
            return _summary_text(uploaded_df), None
        if analysis_type == "Describe":
            return _describe_text(df_to_analyze, selected_columns), None
        if analysis_type == "Top 5 Rows":
            return "Top 5 Rows - See data table below", df_to_analyze.head(5)
        if analysis_type == "Bottom 5 Rows":
            return "Bottom 5 Rows - See data table below", df_to_analyze.tail(5)
        if analysis_type == "Missing Values":
            return _missing_values_text(uploaded_df), None
        if analysis_type == "Highest Correlation":
            return _correlation_text(df_to_analyze), None
        if analysis_type == "Group & Aggregate":
            return _group_aggregate_text(df_to_analyze, selected_columns), None
        if analysis_type == "Calculate Expressions":
            return _calc_expressions_text(df_to_analyze), None
        return f"Analysis type '{analysis_type}' is under development.", None
    except Exception as e:
        return f"Error in analysis: {str(e)}", None


def create_chart_explanation(viz_type, df_to_plot, selected_columns, fig_data=None):
    """Build a short natural-language explanation for a chart.

    Best-effort: any failure falls through to a generic message.  Fix: the
    original used a bare ``except:``, which also swallowed system exits and
    keyboard interrupts; narrowed to ``except Exception``.
    """
    try:
        if viz_type == "Bar Chart" and len(selected_columns) >= 2:
            x_col, y_col = selected_columns[0], selected_columns[1]
            if pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                max_val_idx = df_to_plot[y_col].idxmax()
                max_category = df_to_plot.loc[max_val_idx, x_col]
                max_value = df_to_plot[y_col].max()
                y_mean = df_to_plot[y_col].mean()
            else:
                grouped = df_to_plot.groupby(x_col)[y_col].count()
                max_category = grouped.idxmax()
                max_value = grouped.max()
                y_mean = grouped.mean()
            return (f"BAR CHART: {y_col} by {x_col}\n"
                    f"Highest: {max_category} ({max_value:.2f})\n"
                    f"Average: {y_mean:.2f}\n"
                    f"Categories: {df_to_plot[x_col].nunique()}")
        elif viz_type == "Line Chart" and fig_data is not None:
            max_combo = fig_data.loc[fig_data['Count'].idxmax()]
            min_combo = fig_data.loc[fig_data['Count'].idxmin()]
            return (f"LINE CHART: Distribution\n"
                    f"Highest: {max_combo[selected_columns[1]]} in "
                    f"{max_combo[selected_columns[0]]} ({max_combo['Count']})\n"
                    f"Lowest: {min_combo[selected_columns[1]]} in "
                    f"{min_combo[selected_columns[0]]} ({min_combo['Count']})\n"
                    f"Total: {len(df_to_plot)}")
    except Exception:
        pass
    return f"{viz_type} visualization\nShows data patterns and relationships"


def create_visualization(viz_type, selected_columns, uploaded_df):
    """Build a plotly figure for the requested chart type.

    Returns (figure, explanation_text, figure); the figure is returned twice
    to feed two separate UI outputs, matching the original interface.  On any
    validation failure the figure slots are None and the text explains why.
    """
    if uploaded_df is None or viz_type == "None":
        return None, "", None
    if not selected_columns:
        return None, "Please select columns for visualization.", None
    if px is None:
        # plotly failed to import at module load; fail with a clear message
        # instead of an AttributeError.
        return None, "Plotly is not installed; visualization is unavailable.", None

    df_to_plot = uploaded_df[selected_columns]
    try:
        if viz_type == "Bar Chart":
            if len(selected_columns) >= 2:
                x_col, y_col = selected_columns[0], selected_columns[1]
                color_col = selected_columns[2] if len(selected_columns) > 2 else None
                if pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                    # Numeric Y: plot values directly (first 100 rows to keep
                    # the chart readable).
                    plot_data = df_to_plot.head(100)
                    fig = px.bar(plot_data, x=x_col, y=y_col, color=color_col,
                                 title=f"{y_col} by {x_col}")
                elif pd.api.types.is_numeric_dtype(df_to_plot[x_col]):
                    # Numeric X, categorical Y: count Y occurrences per X.
                    grouped = df_to_plot.groupby(x_col)[y_col].count().reset_index()
                    grouped.columns = [x_col, f'Count of {y_col}']
                    fig = px.bar(grouped, x=x_col, y=f'Count of {y_col}',
                                 title=f"Count of {y_col} by {x_col}")
                else:
                    # Both categorical: grouped counts from a cross-tabulation.
                    crosstab = pd.crosstab(df_to_plot[x_col], df_to_plot[y_col])
                    crosstab_reset = crosstab.reset_index().melt(
                        id_vars=[x_col], var_name=y_col, value_name='Count')
                    fig = px.bar(crosstab_reset, x=x_col, y='Count', color=y_col,
                                 title=f"{y_col} distribution by {x_col}")
                explanation = create_chart_explanation(viz_type, df_to_plot,
                                                       selected_columns)
            else:
                # Single column: histogram for numeric, top-value bars otherwise.
                col = selected_columns[0]
                if pd.api.types.is_numeric_dtype(df_to_plot[col]):
                    fig = px.histogram(df_to_plot, x=col,
                                       title=f"Distribution of {col}")
                else:
                    value_counts = df_to_plot[col].value_counts().head(15)
                    fig = px.bar(x=value_counts.index, y=value_counts.values,
                                 title=f"Top Values in {col}")
                explanation = f"Chart showing distribution of {col}"
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Pie Chart":
            col = selected_columns[0]
            if (len(selected_columns) >= 2
                    and pd.api.types.is_numeric_dtype(df_to_plot[selected_columns[1]])):
                grouped_data = df_to_plot.groupby(col)[selected_columns[1]].sum().reset_index()
                fig = px.pie(grouped_data, values=selected_columns[1], names=col,
                             title=f"Total {selected_columns[1]} by {col}")
                legend_title = f"{col} Categories"
            else:
                value_counts = df_to_plot[col].value_counts().head(10)
                fig = px.pie(values=value_counts.values, names=value_counts.index,
                             title=f"Distribution of {col}")
                legend_title = f"{col} Values"
            fig.update_layout(
                width=800, height=500, showlegend=True,
                legend=dict(
                    title=dict(text=legend_title,
                               font=dict(size=14, color="black")),
                    orientation="v", yanchor="middle", y=0.5,
                    xanchor="left", x=1.05, font=dict(size=12),
                ),
            )
            explanation = (f"PIE CHART: {col} Distribution\n"
                           f"Shows proportion of each category\n"
                           f"Use to understand category distribution patterns")
            return fig, explanation, fig

        elif viz_type == "Scatter Plot":
            if len(selected_columns) < 2:
                return None, "Scatter plot requires at least 2 columns.", None
            x_col, y_col = selected_columns[0], selected_columns[1]
            color_col = selected_columns[2] if len(selected_columns) > 2 else None
            if not (pd.api.types.is_numeric_dtype(df_to_plot[x_col])
                    and pd.api.types.is_numeric_dtype(df_to_plot[y_col])):
                return None, (f"Scatter plot requires numeric data. "
                              f"{x_col} and {y_col} must be numeric."), None
            fig = px.scatter(df_to_plot, x=x_col, y=y_col, color=color_col,
                             title=f"{y_col} vs {x_col}")
            explanation = (f"Scatter plot showing relationship between "
                           f"{x_col} and {y_col}")
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Line Chart":
            if len(selected_columns) < 2:
                return None, "Line chart requires at least 2 columns.", None
            x_col, y_col = selected_columns[0], selected_columns[1]
            if pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                # Numeric Y: sort by X and show the trend.
                sorted_data = df_to_plot.sort_values(x_col)
                fig = px.line(sorted_data, x=x_col, y=y_col,
                              title=f"Trend of {y_col} over {x_col}", markers=True)
                explanation = f"Line chart showing trend of {y_col} over {x_col}"
            else:
                # Categorical Y: one line per category, counts per X value.
                crosstab = pd.crosstab(df_to_plot[x_col], df_to_plot[y_col])
                melted = pd.melt(crosstab.reset_index(), id_vars=[x_col],
                                 var_name=y_col, value_name='Count')
                fig = px.line(melted, x=x_col, y='Count', color=y_col,
                              title=f"Distribution of {y_col} across {x_col}",
                              markers=True)
                explanation = create_chart_explanation(viz_type, df_to_plot,
                                                       selected_columns, melted)
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Histogram":
            col = selected_columns[0]
            if not pd.api.types.is_numeric_dtype(df_to_plot[col]):
                return None, "Histogram requires numeric data. Try Bar Chart instead.", None
            fig = px.histogram(df_to_plot, x=col, title=f"Distribution of {col}",
                               nbins=30)
            explanation = f"Histogram showing distribution of {col}"
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Heat Map":
            if len(selected_columns) < 2:
                return None, "Heat map requires at least 2 columns.", None
            numeric_cols = [c for c in selected_columns
                            if pd.api.types.is_numeric_dtype(df_to_plot[c])]
            if len(numeric_cols) >= 2:
                corr_matrix = df_to_plot[numeric_cols].corr()
                fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                                title="Correlation Heatmap",
                                color_continuous_scale='RdBu')
                explanation = "Heatmap showing correlations between numeric columns"
            else:
                x_col, y_col = selected_columns[0], selected_columns[1]
                crosstab = pd.crosstab(df_to_plot[x_col], df_to_plot[y_col])
                fig = px.imshow(crosstab.values, x=crosstab.columns,
                                y=crosstab.index, text_auto=True, aspect="auto",
                                title=f"Cross-tabulation: {y_col} vs {x_col}")
                explanation = (f"Heatmap showing cross-tabulation between "
                               f"{x_col} and {y_col}")
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        elif viz_type == "Box Plot":
            if len(selected_columns) < 1:
                return None, "Box plot requires at least 1 column.", None
            y_col = selected_columns[0]
            if not pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                return None, (f"Box plot requires numeric Y-axis. "
                              f"{y_col} is not numeric."), None
            x_col = selected_columns[1] if len(selected_columns) > 1 else None
            fig = px.box(df_to_plot, x=x_col, y=y_col,
                         title=f"Box Plot of {y_col}"
                               + (f" by {x_col}" if x_col else ""))
            explanation = (f"Box plot showing distribution of {y_col}"
                           + (f" grouped by {x_col}" if x_col else ""))
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig

        else:
            return None, f"Visualization type '{viz_type}' is under development.", None
    except Exception as e:
        return None, f"Error creating visualization: {str(e)}", None


def handle_missing_data(method, selected_columns, constant_value, uploaded_df,
                        change_history):
    """Apply a missing-data strategy to the selected columns.

    Returns (status_text, updated_dataframe, change_history).  A snapshot of
    the current dataframe is pushed onto ``change_history`` before any
    mutation so the change can be undone.  Fix: the snapshot used to be taken
    before the unsupported "Clean All Missing" early return, polluting the
    undo history with a no-op entry.
    """
    if uploaded_df is None:
        return "Please upload a dataset first.", uploaded_df, change_history
    if method == "None":
        return "", uploaded_df, change_history
    if not selected_columns:
        return "Please select columns to apply data handling.", uploaded_df, change_history
    if method == "Clean All Missing":
        return "Clean All Missing is not available", uploaded_df, change_history

    try:
        change_history.append(uploaded_df.copy())
        df_copy = uploaded_df.copy()
        processed_columns = []
        dropped_columns = []

        for col in selected_columns:
            if col not in df_copy.columns:
                continue

            if method == "Forward Fill":
                if col == 'title':
                    # HACK kept from the original code: 'title' is skipped for
                    # forward fill due to reported data inconsistencies —
                    # confirm whether this workaround is still needed.
                    continue
                if df_copy[col].dtype == 'object':
                    df_copy[col] = _blank_missing(df_copy[col])
                df_copy[col] = df_copy[col].ffill()
                processed_columns.append(col)

            elif method == "Backward Fill":
                if df_copy[col].dtype == 'object':
                    df_copy[col] = _blank_missing(df_copy[col])
                df_copy[col] = df_copy[col].bfill()
                processed_columns.append(col)

            elif method == "Constant Fill":
                if df_copy[col].dtype == 'object':
                    df_copy[col] = _blank_missing(df_copy[col])
                # Empty / whitespace-only constant falls back to "Unknown".
                fill_val = constant_value.strip() if constant_value else "Unknown"
                df_copy[col] = df_copy[col].fillna(fill_val)
                processed_columns.append(col)

            elif method == "Mean Fill":
                if pd.api.types.is_numeric_dtype(df_copy[col]):
                    if not df_copy[col].isna().all():
                        df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
                        processed_columns.append(col)
                else:
                    # Attempt numeric coercion for object columns holding numbers.
                    numeric_col = pd.to_numeric(df_copy[col], errors='coerce')
                    if not numeric_col.isna().all():
                        df_copy[col] = numeric_col.fillna(numeric_col.mean())
                        processed_columns.append(col)

            elif method == "Median Fill":
                if pd.api.types.is_numeric_dtype(df_copy[col]):
                    if not df_copy[col].isna().all():
                        df_copy[col] = df_copy[col].fillna(df_copy[col].median())
                        processed_columns.append(col)
                else:
                    numeric_col = pd.to_numeric(df_copy[col], errors='coerce')
                    if not numeric_col.isna().all():
                        df_copy[col] = numeric_col.fillna(numeric_col.median())
                        processed_columns.append(col)

            elif method == "Mode Fill":
                # Mode computed over values that are neither NaN nor markers.
                valid_values = df_copy[col][~df_copy[col].isin(MISSING_PATTERNS)
                                            & df_copy[col].notna()]
                if len(valid_values) > 0:
                    mode_value = valid_values.mode()
                    if len(mode_value) > 0:
                        most_common = mode_value.iloc[0]
                        df_copy[col] = df_copy[col].replace(MISSING_PATTERNS,
                                                            most_common)
                        df_copy[col] = df_copy[col].fillna(most_common)
                        processed_columns.append(col)

            elif method == "Drop Columns":
                df_copy = df_copy.drop(columns=[col])
                dropped_columns.append(col)

        uploaded_df = df_copy
        if processed_columns:
            result = f"Applied {method} to: {', '.join(processed_columns)}"
            for col in processed_columns:
                if col in uploaded_df.columns:
                    after_missing = uploaded_df[col].isnull().sum()
                    result += f"\n- {col}: {after_missing} missing values remaining"
        elif dropped_columns:
            result = f"Dropped columns: {', '.join(dropped_columns)}"
        else:
            result = "No columns processed - check column selection or data types"
        return result, uploaded_df, change_history
    except Exception as e:
        return f"Error: {str(e)}", uploaded_df, change_history


def undo_last_change(uploaded_df, change_history):
    """Pop the most recent snapshot off the history and restore it."""
    if not change_history:
        return "No changes to undo.", uploaded_df, change_history
    uploaded_df = change_history.pop()
    return (f"Undid last change. Dataset now has {uploaded_df.shape[0]} rows × "
            f"{uploaded_df.shape[1]} columns"), uploaded_df, change_history


def undo_all_changes(original_df, change_history):
    """Restore the originally uploaded dataset and clear the undo history."""
    if original_df is None:
        return "No original dataset to restore.", None, change_history
    uploaded_df = original_df.copy()
    change_history = []
    return (f"Dataset restored to original state ({uploaded_df.shape[0]} rows × "
            f"{uploaded_df.shape[1]} columns)"), uploaded_df, change_history


def download_dataset(uploaded_df, dataset_name):
    """Write the current dataset to a CSV in the temp dir; return its path."""
    if uploaded_df is None:
        return None
    if dataset_name:
        base_name = (dataset_name.replace('.csv', '')
                                 .replace('.xlsx', '')
                                 .replace('.xls', ''))
        filename = f"{base_name}_modified.csv"
    else:
        filename = "modified_dataset.csv"
    filepath = os.path.join(tempfile.gettempdir(), filename)
    uploaded_df.to_csv(filepath, index=False)
    return filepath


def display_data_format(format_type, selected_columns, uploaded_df):
    """Return up to 100 rows for the table view when DataFrame format chosen."""
    if uploaded_df is None or format_type == "None":
        return None
    df_to_show = (uploaded_df[selected_columns]
                  if selected_columns else uploaded_df)
    return df_to_show.head(100) if format_type == "DataFrame" else None


def display_text_format(format_type, selected_columns, uploaded_df):
    """Render up to 20 rows as JSON or a Python dict string.

    Fix: unrecognized format types used to fall off the end and return None
    while every other path returns a string; now returns "" consistently.
    """
    if uploaded_df is None or format_type == "None":
        return ""
    df_to_show = (uploaded_df[selected_columns]
                  if selected_columns else uploaded_df)
    if format_type == "JSON":
        return df_to_show.head(20).to_json(orient='records', indent=2)
    if format_type == "Dictionary":
        return str(df_to_show.head(20).to_dict(orient='records'))
    return ""