HaryaniAnjali committed
Commit 06fe39c · verified · 1 parent: 3f73c0a

Create app.py

Files changed (1)
  1. app.py +788 -0
app.py ADDED
@@ -0,0 +1,788 @@
+ import gradio as gr
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import plotly.express as px
+ import os
+ import json
+ import re
+ import tempfile
+ from sklearn.decomposition import PCA
+ from sklearn.preprocessing import StandardScaler
+ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
+ import torch  # backend required by the transformers pipeline
+ import openai
+
+ # Set plot styling
+ sns.set(style="whitegrid")
+ plt.rcParams["figure.figsize"] = (10, 6)
+
+ # Initialize AI models
+ def initialize_ai_models():
+     """Initialize the AI models for data analysis."""
+     # The OpenAI API key is read from the environment at call time.
+     # Note: users need to set OPENAI_API_KEY in their Hugging Face Space secrets.
+
+     # Initialize the Hugging Face model used for data recommendations.
+     # flan-t5 is a sequence-to-sequence model, so it needs
+     # AutoModelForSeq2SeqLM and the text2text-generation pipeline.
+     try:
+         tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
+         model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
+         data_assistant = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+     except Exception:
+         # Fall back to a smaller causal model if the main one fails to load
+         data_assistant = pipeline("text-generation", model="distilgpt2")
+
+     return data_assistant
+
+ # Global variable for the AI model
+ data_assistant = None
+
+ def read_file(file):
+     """Read different file formats into a pandas DataFrame."""
+     if file is None:
+         return None
+
+     # Gradio file inputs expose the uploaded temp file's path via .name
+     file_name = file.name if hasattr(file, 'name') else str(file)
+
+     try:
+         # Handle different file types
+         if file_name.endswith('.csv'):
+             return pd.read_csv(file_name)
+         elif file_name.endswith(('.xls', '.xlsx')):
+             return pd.read_excel(file_name)
+         elif file_name.endswith('.json'):
+             return pd.read_json(file_name)
+         elif file_name.endswith('.txt'):
+             return pd.read_csv(file_name, delimiter='\t')
+         else:
+             return "Unsupported file format. Please upload .csv, .xlsx, .xls, .json, or .txt files."
+     except Exception as e:
+         return f"Error reading file: {str(e)}"
+
+ def analyze_data(df):
+     """Generate basic statistics and information about the dataset."""
+     if not isinstance(df, pd.DataFrame):
+         return df  # Return the error message if df is not a DataFrame
+
+     # Basic info
+     info = {}
+     info['Shape'] = df.shape
+     info['Columns'] = df.columns.tolist()
+     info['Data Types'] = df.dtypes.astype(str).to_dict()
+
+     # Check for missing values
+     missing_values = df.isnull().sum()
+     if missing_values.sum() > 0:
+         info['Missing Values'] = missing_values[missing_values > 0].to_dict()
+     else:
+         info['Missing Values'] = "No missing values found"
+
+     # Data quality issues
+     info['Data Quality Issues'] = identify_data_quality_issues(df)
+
+     # Basic statistics for numerical columns
+     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+     if numeric_cols:
+         info['Numeric Columns'] = numeric_cols
+         info['Statistics'] = df[numeric_cols].describe().to_html()
+
+         # Check for outliers
+         outliers = detect_outliers(df, numeric_cols)
+         if outliers:
+             info['Outliers'] = outliers
+
+     # Identify categorical columns
+     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+     if categorical_cols:
+         info['Categorical Columns'] = categorical_cols
+         # Unique value counts for the first 5 categorical columns (top 10 values each)
+         cat_counts = {}
+         for col in categorical_cols[:5]:
+             cat_counts[col] = df[col].value_counts().head(10).to_dict()
+         info['Category Counts'] = cat_counts
+
+     return info
+
+ def identify_data_quality_issues(df):
+     """Identify common data quality issues."""
+     issues = {}
+
+     # Check for duplicate rows
+     duplicate_count = df.duplicated().sum()
+     if duplicate_count > 0:
+         issues['Duplicate Rows'] = duplicate_count
+
+     # Check for high cardinality in categorical columns
+     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+     high_cardinality = {}
+     for col in categorical_cols:
+         unique_count = df[col].nunique()
+         if unique_count > 50:  # Arbitrary threshold
+             high_cardinality[col] = unique_count
+
+     if high_cardinality:
+         issues['High Cardinality Columns'] = high_cardinality
+
+     # Check for potential date columns not properly formatted
+     potential_date_cols = []
+     for col in df.select_dtypes(include=['object']).columns:
+         # Sample the first 10 non-null values
+         sample = df[col].dropna().head(10).tolist()
+         if all(isinstance(x, str) for x in sample):
+             # Simple date pattern check
+             date_pattern = re.compile(r'\d{1,4}[-/\.]\d{1,2}[-/\.]\d{1,4}')
+             if any(date_pattern.search(str(x)) for x in sample):
+                 potential_date_cols.append(col)
+
+     if potential_date_cols:
+         issues['Potential Date Columns'] = potential_date_cols
+
+     # Check for columns with mostly missing values
+     high_missing = {}
+     for col in df.columns:
+         missing_pct = df[col].isnull().mean() * 100
+         if missing_pct > 50:  # More than 50% missing
+             high_missing[col] = f"{missing_pct:.2f}%"
+
+     if high_missing:
+         issues['Columns with >50% Missing'] = high_missing
+
+     return issues
+
+ def detect_outliers(df, numeric_cols):
+     """Detect outliers in numeric columns using the IQR method."""
+     outliers = {}
+
+     for col in numeric_cols:
+         # Skip columns with too many unique values (potentially ID columns)
+         if df[col].nunique() > df.shape[0] * 0.9:
+             continue
+
+         # Calculate IQR
+         Q1 = df[col].quantile(0.25)
+         Q3 = df[col].quantile(0.75)
+         IQR = Q3 - Q1
+
+         # Define outlier bounds
+         lower_bound = Q1 - 1.5 * IQR
+         upper_bound = Q3 + 1.5 * IQR
+
+         # Count outliers
+         outlier_count = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
+
+         if outlier_count > 0:
+             outlier_pct = (outlier_count / df.shape[0]) * 100
+             if outlier_pct > 1:  # Only report if more than 1% are outliers
+                 outliers[col] = {
+                     'count': outlier_count,
+                     'percentage': f"{outlier_pct:.2f}%",
+                     'lower_bound': lower_bound,
+                     'upper_bound': upper_bound
+                 }
+
+     return outliers
+
+ def generate_visualizations(df):
+     """Generate appropriate visualizations based on the data types."""
+     if not isinstance(df, pd.DataFrame):
+         return df  # Return the error message if df is not a DataFrame
+
+     visualizations = {}
+
+     # Identify column types
+     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+     date_cols = [col for col in df.columns if df[col].dtype == 'datetime64[ns]' or
+                  (df[col].dtype == 'object' and pd.to_datetime(df[col], errors='coerce').notna().all())]
+
+     # 1. Distribution plots for numeric columns (first 5)
+     for col in numeric_cols[:5]:
+         fig = px.histogram(df, x=col, marginal="box", title=f"Distribution of {col}")
+         visualizations[f'dist_{col}'] = fig
+
+     # 2. Bar charts for categorical columns (first 5)
+     for col in categorical_cols[:5]:
+         value_counts = df[col].value_counts().nlargest(10)  # Top 10 categories
+         fig = px.bar(x=value_counts.index, y=value_counts.values,
+                      title=f"Top 10 categories in {col}")
+         fig.update_xaxes(title=col)
+         fig.update_yaxes(title="Count")
+         visualizations[f'bar_{col}'] = fig
+
+     # 3. Correlation heatmap for numeric columns
+     if len(numeric_cols) > 1:
+         corr_matrix = df[numeric_cols].corr()
+         fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
+                         title="Correlation Heatmap")
+         visualizations['correlation'] = fig
+
+     # 4. Scatter plot matrix (first 4 numeric columns)
+     if len(numeric_cols) >= 2:
+         plot_cols = numeric_cols[:4]
+         fig = px.scatter_matrix(df, dimensions=plot_cols, title="Scatter Plot Matrix")
+         visualizations['scatter_matrix'] = fig
+
+     # 5. Time series plot if a date column exists
+     if date_cols and numeric_cols:
+         date_col = date_cols[0]  # Use the first date column
+         # Convert to datetime if not already
+         if df[date_col].dtype != 'datetime64[ns]':
+             df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
+
+         # Sort by date
+         df_sorted = df.sort_values(by=date_col)
+
+         # Create a time series for the first numeric column
+         num_col = numeric_cols[0]
+         fig = px.line(df_sorted, x=date_col, y=num_col,
+                       title=f"{num_col} over Time")
+         visualizations['time_series'] = fig
+
+     # 6. PCA visualization if there are enough numeric columns
+     if len(numeric_cols) >= 3:
+         # Fill NaN values with the column means before PCA
+         numeric_data = df[numeric_cols].fillna(df[numeric_cols].mean())
+
+         # Standardize the data
+         scaler = StandardScaler()
+         scaled_data = scaler.fit_transform(numeric_data)
+
+         # Apply PCA with 2 components
+         pca = PCA(n_components=2)
+         pca_result = pca.fit_transform(scaled_data)
+
+         # Create a DataFrame with the PCA results
+         pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
+
+         # If a categorical column exists, use it for color
+         if categorical_cols:
+             cat_col = categorical_cols[0]
+             pca_df[cat_col] = df[cat_col].values
+             fig = px.scatter(pca_df, x='PC1', y='PC2', color=cat_col,
+                              title="PCA Visualization")
+         else:
+             fig = px.scatter(pca_df, x='PC1', y='PC2',
+                              title="PCA Visualization")
+
+         # Annotate the explained variance of each component
+         variance_ratio = pca.explained_variance_ratio_
+         fig.update_layout(
+             annotations=[
+                 dict(
+                     text=f"PC1 explained variance: {variance_ratio[0]:.2f}",
+                     showarrow=False, x=0.5, y=1.05, xref="paper", yref="paper"
+                 ),
+                 dict(
+                     text=f"PC2 explained variance: {variance_ratio[1]:.2f}",
+                     showarrow=False, x=0.5, y=1.02, xref="paper", yref="paper"
+                 )
+             ]
+         )
+
+         visualizations['pca'] = fig
+
+     return visualizations
+
+ def get_ai_cleaning_recommendations(df):
+     """Get AI-powered recommendations for data cleaning using OpenAI."""
+     try:
+         # Prepare the dataset summary
+         summary = {
+             "shape": df.shape,
+             "columns": df.columns.tolist(),
+             "dtypes": df.dtypes.astype(str).to_dict(),
+             "missing_values": df.isnull().sum().to_dict(),
+             "duplicates": int(df.duplicated().sum()),
+             "sample_data": df.head(5).to_dict()
+         }
+
+         # Create the prompt for OpenAI (default=str keeps timestamps/NaN serializable)
+         prompt = f"""
+ I have a dataset with the following properties:
+ - Shape: {summary['shape']}
+ - Columns: {', '.join(summary['columns'])}
+ - Missing values: {summary['missing_values']}
+ - Duplicate rows: {summary['duplicates']}
+
+ Here's a sample of the data:
+ {json.dumps(summary['sample_data'], indent=2, default=str)}
+
+ Based on this information, provide specific data cleaning recommendations in a bulleted list.
+ Include suggestions for handling missing values, outliers, data types, and duplicate rows.
+ Format your response as markdown and ONLY include the cleaning recommendations.
+ """
+
+         # Check whether an OpenAI API key is available
+         api_key = os.environ.get("OPENAI_API_KEY")
+         if api_key:
+             openai.api_key = api_key
+             # Legacy (pre-1.0) OpenAI API; requires openai<1.0
+             response = openai.ChatCompletion.create(
+                 model="gpt-3.5-turbo",
+                 messages=[
+                     {"role": "system", "content": "You are a data science assistant focused on data cleaning recommendations."},
+                     {"role": "user", "content": prompt}
+                 ],
+                 max_tokens=700
+             )
+             return response.choices[0].message.content
+         else:
+             # Fall back to the Hugging Face model if no OpenAI key is available
+             global data_assistant
+             if data_assistant is None:
+                 data_assistant = initialize_ai_models()
+
+             # Shorten the prompt for the smaller model
+             short_prompt = (
+                 f"Data cleaning recommendations for a dataset with {df.shape[0]} rows, "
+                 f"{df.shape[1]} columns, and columns: {', '.join(df.columns[:5])}..."
+             )
+
+             # Generate recommendations
+             recommendations = data_assistant(
+                 short_prompt,
+                 max_length=500,
+                 num_return_sequences=1
+             )[0]['generated_text']
+
+             # Small local models often return unusable text, so fall back
+             # to a generic checklist if the output looks too short
+             if not recommendations or len(recommendations.strip()) < 50:
+                 recommendations = """
+ ## Data Cleaning Recommendations
+
+ * Handle missing values in columns with appropriate imputation techniques
+ * Check for and remove duplicate records
+ * Standardize text fields and correct spelling errors
+ * Convert columns to appropriate data types
+ * Check for and handle outliers in numerical columns
+
+ Note: These are generic recommendations as AI model access is limited.
+ """
+             return recommendations
+     except Exception as e:
+         return f"""
+ ## Data Cleaning Recommendations
+
+ * Handle missing values by either removing rows or imputing with mean/median/mode
+ * Remove duplicate rows if present
+ * Convert date-like string columns to proper datetime format
+ * Standardize text data by removing extra spaces and converting to lowercase
+ * Check for and handle outliers in numerical columns
+
+ Note: Could not access AI models for customized recommendations. Error: {str(e)}
+ """
+
+ def get_hf_model_insights(df):
+     """Get dataset insights using the Hugging Face model."""
+     try:
+         global data_assistant
+         if data_assistant is None:
+             data_assistant = initialize_ai_models()
+
+         # Prepare a brief summary of the dataset
+         numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+         categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+
+         dataset_summary = f"""
+ Dataset with {df.shape[0]} rows and {df.shape[1]} columns.
+ Numeric columns: {', '.join(numeric_cols[:5])}
+ Categorical columns: {', '.join(categorical_cols[:5])}
+ """
+
+         # Generate analysis insights
+         prompt = f"Based on this dataset summary, suggest data analysis approaches: {dataset_summary}"
+
+         response = data_assistant(
+             prompt,
+             max_length=300,
+             num_return_sequences=1
+         )[0]['generated_text']
+
+         # Strip an echoed prompt (causal models repeat it) and whitespace
+         analysis_insights = response.replace(prompt, "").strip()
+
+         if not analysis_insights or len(analysis_insights) < 50:
+             # Fallback if the model doesn't produce useful results
+             analysis_insights = """
+ ## Data Analysis Suggestions
+
+ 1. For numeric columns, calculate correlation matrices to identify relationships
+ 2. For categorical columns, analyze frequency distributions
+ 3. Consider creating pivot tables to understand how categories relate
+ 4. Look for time-based patterns if datetime columns are present
+ 5. Consider dimensionality reduction techniques like PCA for visualization
+ """
+
+         return analysis_insights
+
+     except Exception as e:
+         return f"""
+ ## Data Analysis Suggestions
+
+ 1. Examine the distribution of each numeric column
+ 2. Analyze correlations between numeric features
+ 3. Look for patterns in categorical data
+ 4. Consider creating visualizations like histograms and scatter plots
+ 5. Explore relationships between different variables
+
+ Note: Could not access AI models for customized recommendations. Error: {str(e)}
+ """
+
+ def process_file(file):
+     """Main function to process an uploaded file and generate the analysis."""
+     # Read the file
+     df = read_file(file)
+
+     if df is None:
+         return "Please upload a file to begin analysis.", None, None, None
+     if isinstance(df, str):  # Error message from read_file
+         return df, None, None, None
+
+     # Convert date-like object columns to datetime
+     for col in df.columns:
+         if df[col].dtype == 'object':
+             try:
+                 if pd.to_datetime(df[col], errors='coerce').notna().all():
+                     df[col] = pd.to_datetime(df[col])
+             except Exception:
+                 pass
+
+     # Analyze data
+     analysis = analyze_data(df)
+
+     # Generate visualizations
+     visualizations = generate_visualizations(df)
+
+     # Get AI cleaning recommendations
+     cleaning_recommendations = get_ai_cleaning_recommendations(df)
+
+     # Get insights from the Hugging Face model
+     analysis_insights = get_hf_model_insights(df)
+
+     return analysis, visualizations, cleaning_recommendations, analysis_insights
+
+ def display_analysis(analysis):
+     """Format the analysis results for display."""
+     if analysis is None:
+         return "No analysis available."
+
+     if isinstance(analysis, str):  # Error message
+         return analysis
+
+     # Format the analysis as HTML
+     html = "<h2>Data Analysis</h2>"
+
+     # Basic info
+     html += f"<p><strong>Shape:</strong> {analysis['Shape'][0]} rows, {analysis['Shape'][1]} columns</p>"
+     html += f"<p><strong>Columns:</strong> {', '.join(analysis['Columns'])}</p>"
+
+     # Missing values
+     html += "<h3>Missing Values</h3>"
+     if isinstance(analysis['Missing Values'], str):
+         html += f"<p>{analysis['Missing Values']}</p>"
+     else:
+         html += "<ul>"
+         for col, count in analysis['Missing Values'].items():
+             html += f"<li>{col}: {count}</li>"
+         html += "</ul>"
+
+     # Data quality issues
+     if 'Data Quality Issues' in analysis and analysis['Data Quality Issues']:
+         html += "<h3>Data Quality Issues</h3>"
+         for issue_type, issue_details in analysis['Data Quality Issues'].items():
+             html += f"<h4>{issue_type}</h4>"
+             if isinstance(issue_details, dict):
+                 html += "<ul>"
+                 for key, value in issue_details.items():
+                     html += f"<li>{key}: {value}</li>"
+                 html += "</ul>"
+             else:
+                 html += f"<p>{issue_details}</p>"
+
+     # Outliers
+     if 'Outliers' in analysis and analysis['Outliers']:
+         html += "<h3>Outliers Detected</h3>"
+         html += "<ul>"
+         for col, details in analysis['Outliers'].items():
+             html += f"<li><strong>{col}:</strong> {details['count']} outliers ({details['percentage']})<br>"
+             html += f"Values outside range: [{details['lower_bound']:.2f}, {details['upper_bound']:.2f}]</li>"
+         html += "</ul>"
+
+     # Statistics for numeric columns
+     if 'Statistics' in analysis:
+         html += "<h3>Numeric Statistics</h3>"
+         html += analysis['Statistics']
+
+     # Categorical columns info
+     if 'Category Counts' in analysis:
+         html += "<h3>Categorical Data (Top Values)</h3>"
+         for col, counts in analysis['Category Counts'].items():
+             html += f"<h4>{col}</h4><ul>"
+             for val, count in counts.items():
+                 html += f"<li>{val}: {count}</li>"
+             html += "</ul>"
+
+     return html
+
+ def apply_data_cleaning(df, cleaning_options):
+     """Apply the selected data cleaning operations to the DataFrame."""
+     cleaned_df = df.copy()
+     cleaning_log = []
+
+     # Handle missing values
+     if cleaning_options.get("handle_missing"):
+         method = cleaning_options.get("missing_method", "drop")
+         for col in cleaned_df.columns:
+             missing_count_before = cleaned_df[col].isnull().sum()
+             if missing_count_before > 0:
+                 # Covers all numeric dtypes, not just float64/int64
+                 is_numeric = pd.api.types.is_numeric_dtype(cleaned_df[col])
+                 if method == "drop":
+                     # Drop rows with missing values in this column
+                     cleaned_df = cleaned_df.dropna(subset=[col])
+                     cleaning_log.append(f"Dropped {missing_count_before} rows with missing values in column '{col}'")
+                 elif method == "mean" and is_numeric:
+                     # Fill with the mean for numeric columns
+                     mean_val = cleaned_df[col].mean()
+                     cleaned_df[col] = cleaned_df[col].fillna(mean_val)
+                     cleaning_log.append(f"Filled {missing_count_before} missing values in column '{col}' with mean ({mean_val:.2f})")
+                 elif method == "median" and is_numeric:
+                     # Fill with the median for numeric columns
+                     median_val = cleaned_df[col].median()
+                     cleaned_df[col] = cleaned_df[col].fillna(median_val)
+                     cleaning_log.append(f"Filled {missing_count_before} missing values in column '{col}' with median ({median_val:.2f})")
+                 elif method == "mode":
+                     # Fill with the mode for any column type (guard against all-NaN columns)
+                     mode_vals = cleaned_df[col].mode()
+                     if not mode_vals.empty:
+                         mode_val = mode_vals[0]
+                         cleaned_df[col] = cleaned_df[col].fillna(mode_val)
+                         cleaning_log.append(f"Filled {missing_count_before} missing values in column '{col}' with mode ({mode_val})")
+                 elif method == "zero" and is_numeric:
+                     # Fill with zeros for numeric columns
+                     cleaned_df[col] = cleaned_df[col].fillna(0)
+                     cleaning_log.append(f"Filled {missing_count_before} missing values in column '{col}' with 0")
+
+     # Remove duplicates
+     if cleaning_options.get("remove_duplicates"):
+         dupe_count_before = cleaned_df.duplicated().sum()
+         if dupe_count_before > 0:
+             cleaned_df = cleaned_df.drop_duplicates()
+             cleaning_log.append(f"Removed {dupe_count_before} duplicate rows")
+
+     # Handle outliers in numeric columns
+     if cleaning_options.get("handle_outliers"):
+         method = cleaning_options.get("outlier_method", "remove")
+         numeric_cols = cleaned_df.select_dtypes(include=[np.number]).columns
+
+         for col in numeric_cols:
+             # Calculate IQR
+             Q1 = cleaned_df[col].quantile(0.25)
+             Q3 = cleaned_df[col].quantile(0.75)
+             IQR = Q3 - Q1
+
+             # Define outlier bounds
+             lower_bound = Q1 - 1.5 * IQR
+             upper_bound = Q3 + 1.5 * IQR
+
+             # Identify outliers
+             outliers = (cleaned_df[col] < lower_bound) | (cleaned_df[col] > upper_bound)
+             outlier_count = outliers.sum()
+
+             if outlier_count > 0:
+                 if method == "remove":
+                     # Remove rows with outliers
+                     cleaned_df = cleaned_df[~outliers]
+                     cleaning_log.append(f"Removed {outlier_count} rows with outliers in column '{col}'")
+                 elif method == "cap":
+                     # Cap outliers at the bounds
+                     cleaned_df.loc[cleaned_df[col] < lower_bound, col] = lower_bound
+                     cleaned_df.loc[cleaned_df[col] > upper_bound, col] = upper_bound
+                     cleaning_log.append(f"Capped {outlier_count} outliers in column '{col}' to range [{lower_bound:.2f}, {upper_bound:.2f}]")
+
+     # Convert date columns
+     if cleaning_options.get("convert_dates"):
+         for col in cleaning_options.get("date_columns", []):
+             if col in cleaned_df.columns:
+                 try:
+                     cleaned_df[col] = pd.to_datetime(cleaned_df[col])
+                     cleaning_log.append(f"Converted column '{col}' to datetime format")
+                 except Exception:
+                     cleaning_log.append(f"Failed to convert column '{col}' to datetime format")
+
+     # Normalize numeric columns
+     if cleaning_options.get("normalize_columns"):
+         for col in cleaning_options.get("normalize_columns_list", []):
+             if col in cleaned_df.columns and pd.api.types.is_numeric_dtype(cleaned_df[col]):
+                 # Min-max normalization
+                 min_val = cleaned_df[col].min()
+                 max_val = cleaned_df[col].max()
+                 if max_val > min_val:  # Avoid division by zero
+                     cleaned_df[col] = (cleaned_df[col] - min_val) / (max_val - min_val)
+                     cleaning_log.append(f"Normalized column '{col}' to range [0, 1]")
+
+     return cleaned_df, cleaning_log
+
+ def app_ui(file):
+     """Main function for the Gradio interface."""
+     if file is None:
+         return "Please upload a file to begin analysis.", None, None, None
+
+     # Process the file
+     analysis, visualizations, cleaning_recommendations, analysis_insights = process_file(file)
+
+     if isinstance(analysis, str):  # If error message
+         return analysis, None, None, None
+
+     # Format the analysis for display
+     analysis_html = display_analysis(analysis)
+
+     # Prepare the visualizations for display
+     viz_html = ""
+     viz_json = None
+     if visualizations and not isinstance(visualizations, str):
+         for viz_name, fig in visualizations.items():
+             # Convert each plotly figure to an embeddable HTML snippet
+             viz_html += f'<div style="margin-bottom: 30px;">{fig.to_html(full_html=False, include_plotlyjs="cdn")}</div>'
+         # Plotly figures are not JSON-serializable as-is, so serialize them
+         # for the gr.JSON output
+         viz_json = {name: json.loads(fig.to_json()) for name, fig in visualizations.items()}
+
+     # Combine the analysis and visualizations
+     result_html = f"""
+     <div style="display: flex; flex-direction: column;">
+         <div>{analysis_html}</div>
+         <h2>Data Visualizations</h2>
+         <div>{viz_html}</div>
+     </div>
+     """
+
+     return result_html, viz_json, cleaning_recommendations, analysis_insights
+
+ def apply_cleaning_ui(file, handle_missing, missing_method, remove_duplicates,
+                       handle_outliers, outlier_method, convert_dates, date_columns,
+                       normalize_numeric):
+     """UI function for the data cleaning workflow."""
+     if file is None:
+         return "Please upload a file before attempting to clean data.", None
+
+     # Read the file
+     df = read_file(file)
+
+     if isinstance(df, str):  # If error message
+         return df, None
+
+     # Configure cleaning options
+     cleaning_options = {
+         "handle_missing": handle_missing,
+         "missing_method": missing_method,
+         "remove_duplicates": remove_duplicates,
+         "handle_outliers": handle_outliers,
+         "outlier_method": outlier_method,
+         "convert_dates": convert_dates,
+         "date_columns": [c.strip() for c in date_columns.split(",")] if date_columns else [],
+         "normalize_columns": normalize_numeric,
+         "normalize_columns_list": df.select_dtypes(include=[np.number]).columns.tolist() if normalize_numeric else []
+     }
+
+     # Apply cleaning
+     cleaned_df, cleaning_log = apply_data_cleaning(df, cleaning_options)
+
+     # Generate a summary of the cleaning
+     result_summary = f"""
+     <h2>Data Cleaning Results</h2>
+     <p>Original data: {df.shape[0]} rows, {df.shape[1]} columns</p>
+     <p>Cleaned data: {cleaned_df.shape[0]} rows, {cleaned_df.shape[1]} columns</p>
+
+     <h3>Cleaning Operations Applied:</h3>
+     <ul>
+     """
+
+     for log_item in cleaning_log:
+         result_summary += f"<li>{log_item}</li>"
+
+     result_summary += "</ul>"
+
+     # Save the cleaned data to a temp file for download
+     # (gr.File expects a file path rather than an in-memory buffer)
+     output_path = os.path.join(tempfile.gettempdir(), "cleaned_data.csv")
+     cleaned_df.to_csv(output_path, index=False)
+
+     return result_summary, output_path
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Data Visualization & Cleaning AI") as demo:
+     gr.Markdown("# Data Visualization & Cleaning AI")
+     gr.Markdown("Upload your data file (CSV, Excel, JSON, or TXT) and get automatic analysis, visualizations, and AI-powered insights.")
+
+     with gr.Row():
+         file_input = gr.File(label="Upload Data File")
+
+     with gr.Tabs():
+         with gr.TabItem("Data Analysis"):
+             with gr.Row():
+                 analyze_button = gr.Button("Analyze Data")
+
+             with gr.Tabs():
+                 with gr.TabItem("Analysis & Visualizations"):
+                     output = gr.HTML(label="Results")
+                 with gr.TabItem("AI Cleaning Recommendations"):
+                     cleaning_recommendations_output = gr.Markdown(label="AI Recommendations")
+                 with gr.TabItem("AI Analysis Insights"):
+                     analysis_insights_output = gr.Markdown(label="Analysis Insights")
+                 with gr.TabItem("Raw Visualization Objects"):
+                     viz_output = gr.JSON(label="Visualization Objects")
+
+         with gr.TabItem("Data Cleaning"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     gr.Markdown("### Cleaning Options")
+                     handle_missing = gr.Checkbox(label="Handle Missing Values", value=True)
+                     missing_method = gr.Radio(
+                         label="Missing Values Method",
+                         choices=["drop", "mean", "median", "mode", "zero"],
+                         value="mean"
+                     )
+                     remove_duplicates = gr.Checkbox(label="Remove Duplicate Rows", value=True)
+                     handle_outliers = gr.Checkbox(label="Handle Outliers", value=False)
+                     outlier_method = gr.Radio(
+                         label="Outlier Method",
+                         choices=["remove", "cap"],
+                         value="cap"
+                     )
+                     convert_dates = gr.Checkbox(label="Convert Date Columns", value=False)
+                     date_columns = gr.Textbox(
+                         label="Date Columns (comma-separated)",
+                         placeholder="e.g., date,created_at,timestamp"
+                     )
+                     normalize_numeric = gr.Checkbox(label="Normalize Numeric Columns", value=False)
+
+                 with gr.Column(scale=2):
+                     clean_button = gr.Button("Clean Data")
+                     cleaning_output = gr.HTML(label="Cleaning Results")
+                     cleaned_file_output = gr.File(label="Download Cleaned Data")
+
+     # Connect the buttons to their functions (inside the Blocks context)
+     analyze_button.click(
+         fn=app_ui,
+         inputs=[file_input],
+         outputs=[output, viz_output, cleaning_recommendations_output, analysis_insights_output]
+     )
+
+     clean_button.click(
+         fn=apply_cleaning_ui,
+         inputs=[
+             file_input, handle_missing, missing_method, remove_duplicates,
+             handle_outliers, outlier_method, convert_dates, date_columns,
+             normalize_numeric
+         ],
+         outputs=[cleaning_output, cleaned_file_output]
+     )
+
+ # Initialize the AI models at startup
+ try:
+     data_assistant = initialize_ai_models()
+ except Exception as e:
+     print(f"Error initializing AI models: {e}")
+     data_assistant = None
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
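For reviewers, a minimal sketch of exercising the cleaning helper above outside the Gradio UI. The toy DataFrame and the `app` import path are illustrative assumptions, not part of this commit; note that importing the module eagerly loads the Hugging Face model and builds (but does not launch) the Blocks UI.

import numpy as np
import pandas as pd

from app import apply_data_cleaning  # hypothetical import path, assuming this file is saved as app.py

# Toy frame with one missing value and duplicate rows
df = pd.DataFrame({
    "age": [25, np.nan, 40, 40, 40],
    "city": ["Pune", "Delhi", "Delhi", "Delhi", "Delhi"],
})

options = {"handle_missing": True, "missing_method": "median", "remove_duplicates": True}
cleaned, log = apply_data_cleaning(df, options)
print(cleaned)         # 2 unique rows; the age gap is filled with the median (40.0)
print("\n".join(log))  # human-readable record of each operation applied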