# Example script to run the demo without AI model dependencies for local testing
# Save this as demo.py
import io
import json
import os
import re

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Set plot styling
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Loose pattern for date-like strings such as 2021-03-04, 4/3/21 or 04.03.2021.
# Compiled once here instead of once per inspected column.
_DATE_PATTERN = re.compile(r'\d{1,4}[-/\.]\d{1,2}[-/\.]\d{1,4}')


def _rewind(file):
    """Move a file-like object back to its start before re-reading it.

    Path-like inputs have no ``seek``; pandas re-opens them on every read, so
    nothing needs to be done for them.
    """
    if hasattr(file, 'seek'):
        file.seek(0)


def _convert_object_columns_to_numeric(df):
    """Convert object columns to a numeric dtype when every value parses.

    Columns that cannot be parsed are left untouched, which is what the
    deprecated ``pd.to_numeric(..., errors='ignore')`` used to do.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                # Genuinely non-numeric column: keep the original values.
                pass
    return df


def _read_csv_with_fallback(file):
    """Read a .csv upload, trying comma, then semicolon, then sniffing."""
    try:
        # First try with comma
        df = pd.read_csv(file)
        # A single column whose header contains ';' means the file is really
        # semicolon-separated and was read as one wide column.
        if len(df.columns) == 1 and ';' in str(df.columns[0]):
            print("Detected potential semicolon-separated file")
            _rewind(file)
            df = pd.read_csv(file, sep=';')
            print(f"Read file with semicolon separator: {df.shape}")
        else:
            print(f"Read file with comma separator: {df.shape}")
        return _convert_object_columns_to_numeric(df)
    except Exception as e:
        print(f"Error with standard separators: {e}")
        _rewind(file)
        try:
            df = pd.read_csv(file, sep=';')
            print(f"Read file with semicolon separator after error: {df.shape}")
            return df
        except Exception:
            # Final attempt: let the python engine sniff the separator.
            _rewind(file)
            return pd.read_csv(file, sep=None, engine='python')


def _read_text_table(file):
    """Read a .txt upload, trying tabs first and separator sniffing second."""
    try:
        df = pd.read_csv(file, delimiter='\t')
        if len(df.columns) <= 1:
            # Tab did not split anything; fall back to separator detection.
            _rewind(file)
            df = pd.read_csv(file, sep=None, engine='python')
        return df
    except Exception:
        _rewind(file)
        return pd.read_csv(file, sep=None, engine='python')


def read_file(file):
    """Read an uploaded file into a pandas DataFrame with separator detection.

    Args:
        file: ``None``, a file-like object exposing ``name`` (and usually
            ``seek``), or a path (``str`` / ``os.PathLike``). The file type is
            chosen from the name's extension, case-insensitively.

    Returns:
        ``None`` when ``file`` is ``None``; a ``DataFrame`` on success;
        otherwise a ``str`` describing the problem (unsupported extension or
        a read error). Errors are reported through the return value, never
        raised.
    """
    if file is None:
        return None

    file_name = file.name if hasattr(file, 'name') else ''
    if not file_name and isinstance(file, (str, os.PathLike)):
        # Plain paths carry no ``name`` attribute; use the path itself.
        file_name = os.fspath(file)
    print(f"Reading file: {file_name}")

    # Match extensions case-insensitively so "DATA.CSV" is accepted too.
    lowered_name = str(file_name).lower()

    try:
        # Handle different file types
        if lowered_name.endswith('.csv'):
            return _read_csv_with_fallback(file)
        elif lowered_name.endswith(('.xls', '.xlsx')):
            return pd.read_excel(file)
        elif lowered_name.endswith('.json'):
            return pd.read_json(file)
        elif lowered_name.endswith('.txt'):
            return _read_text_table(file)
        else:
            return "Unsupported file format. Please upload .csv, .xlsx, .xls, .json, or .txt files."
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return f"Error reading file: {str(e)}"


def analyze_data(df):
    """Generate basic statistics and information about the dataset.

    Args:
        df: The DataFrame to summarise. Any other value (``None`` or an error
            message produced by ``read_file``) is returned unchanged.

    Returns:
        A dict with the keys 'Shape', 'Columns', 'Data Types',
        'Missing Values' and 'Data Quality Issues', plus 'Numeric Columns',
        'Statistics' (an HTML table) and 'Outliers' when numeric columns
        exist, and 'Categorical Columns' / 'Category Counts' when
        object/category columns exist.
    """
    if not isinstance(df, pd.DataFrame):
        return df  # Return error message if df is not a DataFrame

    # Basic info
    info = {
        'Shape': df.shape,
        'Columns': df.columns.tolist(),
        'Data Types': df.dtypes.astype(str).to_dict(),
    }

    # Check for missing values
    missing_values = df.isnull().sum()
    if missing_values.sum() > 0:
        info['Missing Values'] = missing_values[missing_values > 0].to_dict()
    else:
        info['Missing Values'] = "No missing values found"

    # Data quality issues
    info['Data Quality Issues'] = identify_data_quality_issues(df)

    # Basic statistics for numerical columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        info['Numeric Columns'] = numeric_cols
        info['Statistics'] = df[numeric_cols].describe().to_html()

        # Check for outliers
        outliers = detect_outliers(df, numeric_cols)
        if outliers:
            info['Outliers'] = outliers

    # Identify categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if categorical_cols:
        info['Categorical Columns'] = categorical_cols

        # Top 10 values for the first 5 categorical columns (kept short for brevity)
        cat_counts = {}
        for col in categorical_cols[:5]:
            try:
                cat_counts[col] = df[col].value_counts().head(10).to_dict()
            except TypeError:
                # Unhashable cells (e.g. nested JSON objects) cannot be counted.
                print(f"Skipping category counts for {col}: values are not hashable")
        info['Category Counts'] = cat_counts

    return info


def identify_data_quality_issues(df):
    """Identify common data quality issues.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A dict that contains only the issues that were found, under the keys
        'Duplicate Rows' (int), 'High Cardinality Columns' (column -> unique
        count, for object/category columns with more than 50 unique values),
        'Potential Date Columns' (list of object columns whose sampled values
        look like dates) and 'Columns with >50% Missing' (column -> formatted
        percentage).
    """
    issues = {}

    # Check for duplicate rows
    try:
        duplicate_count = int(df.duplicated().sum())
    except TypeError:
        # Rows holding unhashable cells (lists/dicts) cannot be compared.
        duplicate_count = 0
    if duplicate_count > 0:
        issues['Duplicate Rows'] = duplicate_count

    # Check for high cardinality in categorical columns
    high_cardinality = {}
    for col in df.select_dtypes(include=['object', 'category']).columns:
        try:
            unique_count = int(df[col].nunique())
        except TypeError:
            # Unhashable values: cardinality is undefined, skip the column.
            continue
        if unique_count > 50:  # Arbitrary threshold
            high_cardinality[col] = unique_count
    if high_cardinality:
        issues['High Cardinality Columns'] = high_cardinality

    # Check for potential date columns not properly formatted
    potential_date_cols = []
    for col in df.select_dtypes(include=['object']).columns:
        # Sample the first 10 non-null values
        sample = df[col].dropna().head(10).tolist()
        if all(isinstance(x, str) for x in sample) and any(
            _DATE_PATTERN.search(x) for x in sample
        ):
            potential_date_cols.append(col)
    if potential_date_cols:
        issues['Potential Date Columns'] = potential_date_cols

    # Check for columns with mostly missing values
    high_missing = {}
    for col in df.columns:
        missing_pct = df[col].isnull().mean() * 100
        if missing_pct > 50:  # More than 50% missing
            high_missing[col] = f"{missing_pct:.2f}%"
    if high_missing:
        issues['Columns with >50% Missing'] = high_missing

    return issues


def detect_outliers(df, numeric_cols, *, iqr_multiplier=1.5,
                    min_outlier_pct=1.0, id_unique_ratio=0.9):
    """Detect outliers in numeric columns using the IQR method.

    Args:
        df: The DataFrame holding the data.
        numeric_cols: Names of the numeric columns to check.
        iqr_multiplier: How many IQRs beyond Q1/Q3 a value must lie to count
            as an outlier.
        min_outlier_pct: A column is reported only when more than this
            percentage of all rows are outliers.
        id_unique_ratio: Columns whose number of unique values exceeds this
            fraction of the row count are skipped (potentially ID columns).

    Returns:
        A dict mapping each reported column to a dict with 'count' (int),
        'percentage' (formatted string), 'lower_bound' and 'upper_bound'
        (floats).
    """
    outliers = {}
    row_count = df.shape[0]

    for col in numeric_cols:
        series = df[col]

        # Skip columns with too many unique values (potentially ID columns)
        if series.nunique() > row_count * id_unique_ratio:
            continue

        # Calculate IQR and the outlier bounds derived from it
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr

        # Count outliers; returning early on zero also avoids dividing by a
        # zero row count for empty frames.
        outlier_count = int(((series < lower_bound) | (series > upper_bound)).sum())
        if outlier_count == 0:
            continue

        outlier_pct = (outlier_count / row_count) * 100
        if outlier_pct > min_outlier_pct:
            outliers[col] = {
                'count': outlier_count,
                'percentage': f"{outlier_pct:.2f}%",
                'lower_bound': float(lower_bound),
                'upper_bound': float(upper_bound),
            }

    return outliers


def _is_date_like(series):
    """Return True if a column is datetime-typed or fully parseable as dates."""
    if pd.api.types.is_datetime64_any_dtype(series):
        return True
    # An empty column is no evidence of dates, so it is not treated as one.
    if series.dtype != 'object' or series.empty:
        return False
    try:
        return bool(pd.to_datetime(series, errors='coerce').notna().all())
    except (ValueError, TypeError, OverflowError):
        return False


def _add_figure(visualizations, key, success_message, error_context,
                builder, *args, **kwargs):
    """Build one figure and store it under ``key``; log instead of raising."""
    try:
        visualizations[key] = builder(*args, **kwargs)
        if success_message is not None:
            print(success_message)
    except Exception as e:
        print(f"Error creating {error_context}: {e}")


def _build_bar_chart(df, col):
    """Build a bar chart of the 10 most frequent values of ``col``."""
    value_counts = df[col].value_counts().nlargest(10)  # Top 10 categories
    # Convert indices to strings to ensure they can be plotted
    value_counts.index = value_counts.index.astype(str)
    fig = px.bar(x=value_counts.index, y=value_counts.values,
                 title=f"Top 10 categories in {col}")
    fig.update_xaxes(title=col)
    fig.update_yaxes(title="Count")
    return fig


def _build_correlation_heatmap(df, numeric_cols):
    """Build a heatmap of the pairwise correlations of the numeric columns."""
    corr_matrix = df[numeric_cols].corr()
    return px.imshow(corr_matrix, text_auto=True, aspect="auto",
                     title="Correlation Heatmap")


def _build_time_series(df, date_col, num_col):
    """Build a line plot of ``num_col`` ordered by ``date_col``."""
    frame = df
    if not pd.api.types.is_datetime64_any_dtype(frame[date_col]):
        # Convert on a copy so the caller's DataFrame is never modified.
        frame = df.copy()
        frame[date_col] = pd.to_datetime(frame[date_col], errors='coerce')
    df_sorted = frame.sort_values(by=date_col)
    return px.line(df_sorted, x=date_col, y=num_col, title=f"{num_col} over Time")


def _build_pca_scatter(df, numeric_cols, categorical_cols):
    """Build a 2-component PCA scatter plot of the numeric columns."""
    numeric_data = df[numeric_cols]
    # Fill NaN values with the column mean; PCA cannot handle missing data.
    numeric_data = numeric_data.fillna(numeric_data.mean())

    # Standardize so every column contributes on the same scale
    scaled_data = StandardScaler().fit_transform(numeric_data)

    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(scaled_data)
    pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])

    # If a categorical column exists, use the first one for color
    if categorical_cols:
        cat_col = categorical_cols[0]
        pca_df[cat_col] = df[cat_col].values
        fig = px.scatter(pca_df, x='PC1', y='PC2', color=cat_col,
                         title="PCA Visualization")
    else:
        fig = px.scatter(pca_df, x='PC1', y='PC2', title="PCA Visualization")

    variance_ratio = pca.explained_variance_ratio_
    fig.update_layout(
        annotations=[
            dict(
                text=f"PC1 explained variance: {variance_ratio[0]:.2f}",
                showarrow=False,
                x=0.5,
                y=1.05,
                xref="paper",
                yref="paper",
            ),
            dict(
                text=f"PC2 explained variance: {variance_ratio[1]:.2f}",
                showarrow=False,
                x=0.5,
                y=1.02,
                xref="paper",
                yref="paper",
            ),
        ]
    )
    return fig


def _build_fallback_figure(df):
    """Build a minimal figure for when no other plot could be created."""
    fig = go.Figure()
    if len(df) > 0:
        sample_size = min(20, len(df))
        if len(df.columns) > 0:
            y_values = df.iloc[:sample_size, 0]
        else:
            y_values = list(range(sample_size))
        fig.add_trace(go.Scatter(
            x=list(range(sample_size)),
            y=y_values,
            mode='markers',
            name='Fallback Plot',
        ))
    else:
        fig.add_annotation(text="No data to visualize", showarrow=False)
    fig.update_layout(title="Fallback Visualization")
    return fig


def generate_visualizations(df):
    """Generate appropriate visualizations based on the data types.

    The input DataFrame is never modified. A failure while building one plot
    is logged and does not prevent the other plots from being built.

    Args:
        df: The DataFrame to plot. Any other value (``None`` or an error
            message) is returned unchanged.

    Returns:
        A dict mapping plot names to Plotly figures. Possible keys are
        'test_plot', 'dist_<column>', 'bar_<column>', 'correlation',
        'scatter_matrix', 'time_series', 'pca' and, only when nothing else
        could be built, 'fallback'.
    """
    if not isinstance(df, pd.DataFrame):
        print(f"Not a DataFrame: {type(df)}")
        return df  # Return error message if df is not a DataFrame

    print(f"Starting visualization generation for DataFrame with shape: {df.shape}")

    visualizations = {}

    # Identify column types
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    date_cols = [col for col in df.columns if _is_date_like(df[col])]

    print(f"Found {len(numeric_cols)} numeric columns: {numeric_cols}")
    print(f"Found {len(categorical_cols)} categorical columns: {categorical_cols}")
    print(f"Found {len(date_cols)} date columns: {date_cols}")

    # Simple test plot to verify Plotly is working
    if len(df) > 0 and len(df.columns) > 0:
        col = df.columns[0]
        _add_figure(
            visualizations, 'test_plot',
            f"Generated test plot for column: {col}", "test plot",
            px.histogram, x=df[col].head(100), title=f"Test Plot for {col}",
        )

    # 1. Distribution plots for numeric columns (first 5)
    for col in numeric_cols[:5]:
        _add_figure(
            visualizations, f'dist_{col}',
            f"Generated distribution plot for {col}", f"histogram for {col}",
            px.histogram, df, x=col, marginal="box",
            title=f"Distribution of {col}",
        )

    # 2. Bar charts for categorical columns (first 5)
    for col in categorical_cols[:5]:
        _add_figure(
            visualizations, f'bar_{col}',
            f"Generated bar chart for {col}", f"bar chart for {col}",
            _build_bar_chart, df, col,
        )

    # 3. Correlation heatmap for numeric columns
    if len(numeric_cols) > 1:
        _add_figure(
            visualizations, 'correlation',
            "Generated correlation heatmap", "correlation heatmap",
            _build_correlation_heatmap, df, numeric_cols,
        )

    # 4. Scatter plot matrix (first 3 numeric columns to keep it manageable)
    if len(numeric_cols) >= 2:
        _add_figure(
            visualizations, 'scatter_matrix',
            "Generated scatter plot matrix", "scatter matrix",
            px.scatter_matrix, df, dimensions=numeric_cols[:3],
            title="Scatter Plot Matrix",
        )

    # 5. Time series plot of the first numeric column over the first date column
    if date_cols and numeric_cols:
        _add_figure(
            visualizations, 'time_series',
            "Generated time series plot", "time series plot",
            _build_time_series, df, date_cols[0], numeric_cols[0],
        )

    # 6. PCA visualization if enough numeric columns
    if len(numeric_cols) >= 3:
        _add_figure(
            visualizations, 'pca',
            "Generated PCA visualization", "PCA visualization",
            _build_pca_scatter, df, numeric_cols, categorical_cols,
        )

    print(f"Generated {len(visualizations)} visualizations")

    # If no visualizations were created, add a fallback
    if not visualizations:
        print("No visualizations generated, creating fallback")
        _add_figure(
            visualizations, 'fallback',
            None, "fallback visualization",
            _build_fallback_figure, df,
        )

    return visualizations


def display_analysis(analysis):
    """Format the analysis results for display."""
    if analysis is None:
        return "No analysis available."
    if isinstance(analysis, str):  # Error message
        return analysis
    # Format analysis as HTML
    # NOTE(review): the remainder of this function is corrupted in this copy
    # of the file. The original continued here with an unterminated
    # `html = "` assignment, and the contents of the following HTML string
    # literals are missing, so the lines below this point are left untouched
    # and must be restored from an intact copy.
Shape: {analysis['Shape'][0]} rows, {analysis['Shape'][1]} columns
" html += f"Columns: {', '.join(analysis['Columns'])}
" # Missing values html += "{analysis['Missing Values']}
" else: html += "{issue_details}
" # Outliers if 'Outliers' in analysis and analysis['Outliers']: html += "No visualizations could be generated for this dataset.
" # Combine analysis and visualizations result_html = f"""