Spaces:

Learnerbegginer
/

Auto-ML-Preprocessing

Running

App Files Files Community

Learnerbegginer committed on Mar 9

Commit

c3c3195

1 Parent(s): c3b2831

Deploy working Streamlit version - exact local version that works perfectly

Browse files

Files changed (3) hide show

README.md +3 -3
app.py +355 -207
requirements.txt +0 -0

README.md CHANGED Viewed

@@ -3,8 +3,8 @@ title: PromptPrepML
 emoji: 🤖
 colorFrom: blue
 colorTo: indigo
-sdk: gradio
-sdk_version: 4.28.0
 python_version: 3.11
 app_file: app.py
 pinned: false
@@ -37,6 +37,6 @@ Upload your dataset and describe your preprocessing needs in natural language. O
 ## 🛠️ Tech Stack
 - **Backend**: Python, FastAPI, scikit-learn
-- **Frontend**: Gradio
 - **EDA**: ydata-profiling
 - **ML**: pandas, numpy

 emoji: 🤖
 colorFrom: blue
 colorTo: indigo
+sdk: streamlit
+sdk_version: 1.28.0
 python_version: 3.11
 app_file: app.py
 pinned: false
 ## 🛠️ Tech Stack
 - **Backend**: Python, FastAPI, scikit-learn
+- **Frontend**: Streamlit
 - **EDA**: ydata-profiling
 - **ML**: pandas, numpy

app.py CHANGED Viewed

@@ -1,228 +1,376 @@
-import gradio as gr
 import pandas as pd
-import numpy as np
-from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
-from sklearn.feature_selection import VarianceThreshold
-from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import Pipeline
-from sklearn.model_selection import train_test_split
 import io
-import warnings
-warnings.filterwarnings('ignore')
-class StandalonePreprocessor:
-    def __init__(self):
-        self.pipeline = None
-        self.feature_names = []
-        self.analysis = {}
-    def analyze_columns(self, df):
-        """Analyze dataset columns"""
-        analysis = {
-            'identifiers': [],
-            'dates': [],
-            'text_features': [],
-            'categorical_low_cardinality': [],
-            'categorical_high_cardinality': [],
-            'numeric': []
-        }
-        for col in df.columns:
-            col_lower = col.lower()
-            # Identifier detection
-            is_identifier = (
-                any(keyword in col_lower for keyword in ['id', 'index', 'uuid', 'key']) and
-                (df[col].nunique() / len(df) > 0.8)
-            )
-            if is_identifier:
-                analysis['identifiers'].append(col)
-                continue
-            # Date detection
-            if df[col].dtype == 'object':
-                try:
-                    pd.to_datetime(df[col].dropna().head(10))
-                    analysis['dates'].append(col)
-                    continue
-                except:
-                    pass
-            # Text feature detection
-            text_keywords = ['name', 'email', 'phone', 'website', 'address', 'description']
-            if any(keyword in col_lower for keyword in text_keywords):
-                analysis['text_features'].append(col)
-                continue
-            # Categorical vs Numeric
-            if df[col].dtype == 'object':
-                unique_ratio = df[col].nunique() / len(df)
-                if unique_ratio > 0.5:
-                    analysis['categorical_high_cardinality'].append(col)
-                else:
-                    analysis['categorical_low_cardinality'].append(col)
-            else:
-                analysis['numeric'].append(col)
-        return analysis
-    def extract_date_features(self, df, date_cols):
-        """Extract features from date columns"""
-        df_processed = df.copy()
-        for col in date_cols:
-            try:
-                dates = pd.to_datetime(df_processed[col])
-                df_processed[f'{col}_year'] = dates.dt.year
-                df_processed[f'{col}_month'] = dates.dt.month
-                df_processed[f'{col}_day'] = dates.dt.day
-                df_processed[f'{col}_weekday'] = dates.dt.weekday
-                df_processed.drop(col, axis=1, inplace=True)
-            except:
-                pass
-        return df_processed
-    def process(self, df):
-        """Main processing function"""
-        # Step 1: Analyze columns
-        self.analysis = self.analyze_columns(df)
-        # Step 2: Remove unwanted columns
-        columns_to_drop = (
-            self.analysis['identifiers'] +
-            self.analysis['text_features'] +
-            self.analysis['categorical_high_cardinality']
-        )
-        df_clean = df.drop(columns=columns_to_drop, errors='ignore')
-        # Step 3: Extract date features
-        if self.analysis['dates']:
-            df_clean = self.extract_date_features(df_clean, self.analysis['dates'])
-        # Step 4: Create preprocessing pipeline
-        numeric_features = df_clean.select_dtypes(include=[np.number]).columns.tolist()
-        categorical_features = df_clean.select_dtypes(include=['object']).columns.tolist()
-        preprocessor = ColumnTransformer(
-            transformers=[
-                ('numeric', Pipeline([
-                    ('imputer', SimpleImputer(strategy='median')),
-                    ('scaler', StandardScaler())
-                ]), numeric_features),
-                ('categorical', Pipeline([
-                    ('imputer', SimpleImputer(strategy='most_frequent')),
-                    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
-                ]), categorical_features)
-            ]
         )
-        # Step 5: Create full pipeline
-        self.pipeline = Pipeline([
-            ('preprocessor', preprocessor),
-            ('feature_selector', VarianceThreshold(threshold=0.01))
-        ])
-        # Step 6: Fit and transform
-        processed_data = self.pipeline.fit_transform(df_clean)
-        # Step 7: Get feature names
-        try:
-            feature_names = []
-            if numeric_features:
-                feature_names.extend([f'numeric__{f}' for f in numeric_features])
-            if categorical_features:
-                encoder = self.pipeline.named_steps['preprocessor'].named_transformers_['categorical'].named_steps['encoder']
-                cat_names = encoder.get_feature_names_out(categorical_features)
-                feature_names.extend([f'categorical__{name}' for name in cat_names])
-            self.feature_names = feature_names[:processed_data.shape[1]]
-        except:
-            self.feature_names = [f'feature_{i}' for i in range(processed_data.shape[1])]
-        # Step 8: Create processed DataFrame
-        processed_df = pd.DataFrame(processed_data, columns=self.feature_names)
-        return processed_df
-    def split_data(self, df):
-        """Split dataset into train and test"""
-        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
-        return train_df, test_df
-# Global preprocessor
-preprocessor = StandalonePreprocessor()
-def process_dataset(file, prompt):
-    if file is None:
-        return "Please upload a dataset", None, None, None, None, ""
-    try:
-        # Read uploaded file
-        file_content = file.read()
-        df = pd.read_csv(io.BytesIO(file_content))
-        # Process dataset
-        processed_df = preprocessor.process(df)
-        # Split dataset
-        train_df, test_df = preprocessor.split_data(processed_df)
-        # Create summary
-        analysis = preprocessor.analysis
-        summary = f"""
-        **✅ Processing Complete!**
-        **📊 Dataset Information**
-        - Original Shape: {df.shape}
-        - Processed Shape: {processed_df.shape}
-        - Training Set: {train_df.shape}
-        - Test Set: {test_df.shape}
-        **🔍 Column Analysis**
-        - Identifiers Removed: {len(analysis['identifiers'])} columns
-        - Text Features Removed: {len(analysis['text_features'])} columns
-        - Date Columns Processed: {len(analysis['dates'])} columns
-        - Low Cardinality Encoded: {len(analysis['categorical_low_cardinality'])} columns
-        - High Cardinality Dropped: {len(analysis['categorical_high_cardinality'])} columns
-        - Numeric Features: {len(analysis['numeric'])} columns
-        """
-        # Convert DataFrames to CSV for download
-        processed_csv = processed_df.to_csv(index=False).encode('utf-8')
-        train_csv = train_df.to_csv(index=False).encode('utf-8')
-        test_csv = test_df.to_csv(index=False).encode('utf-8')
-        return summary, processed_csv, train_csv, test_csv, processed_df.head(10), "✅ Processing completed successfully!"
-    except Exception as e:
-        return f"❌ Error: {str(e)}", None, None, None, None, f"❌ Processing failed: {str(e)}"
-# Create simple Gradio interface
-iface = gr.Interface(
-    fn=process_dataset,
-    inputs=[
-        gr.File(label="Upload CSV Dataset", file_types=[".csv"]),
-        gr.Textbox(label="Processing Instructions",
-                  value="Prepare this dataset for machine learning. Handle missing values, remove identifier columns, extract date features, encode categorical variables, and scale numeric features.",
-                  lines=3)
-    ],
-    outputs=[
-        gr.Markdown(label="Results Summary"),
-        gr.File(label="Processed Dataset"),
-        gr.File(label="Training Set"),
-        gr.File(label="Test Set"),
-        gr.Dataframe(label="Dataset Preview"),
-        gr.Textbox(label="Status")
-    ],
-    title="🤖 PromptPrepML",
-    description="AI-Powered Machine Learning Data Preprocessing Assistant",
-    allow_flagging="never"
-)
-# Launch the app
 if __name__ == "__main__":
-    iface.launch()

+import streamlit as st
+import requests
 import pandas as pd
 import io
+import os
+from PIL import Image
+import time
+# Browser-tab title/icon and layout options for the Streamlit page
+_PAGE_OPTIONS = {
+    "page_title": "PromptPrepML - Auto ML Data Preprocessing",
+    "page_icon": "🤖",
+    "layout": "wide",
+    "initial_sidebar_state": "expanded",
+}
+st.set_page_config(**_PAGE_OPTIONS)
+# Stylesheet defining the header and message-box classes referenced by the UI markup
+_CUSTOM_CSS = """
+<style>
+    .main-header {
+        font-size: 2.5rem;
+        font-weight: bold;
+        color: #1f2937;
+        text-align: center;
+        margin-bottom: 2rem;
+    }
+    .step-header {
+        font-size: 1.5rem;
+        font-weight: 600;
+        color: #374151;
+        margin: 1rem 0;
+    }
+    .success-box {
+        background-color: #f0fdf4;
+        border: 1px solid #bbf7d0;
+        border-radius: 0.5rem;
+        padding: 1rem;
+        margin: 1rem 0;
+    }
+    .info-box {
+        background-color: #eff6ff;
+        border: 1px solid #bfdbfe;
+        border-radius: 0.5rem;
+        padding: 1rem;
+        margin: 1rem 0;
+    }
+    .warning-box {
+        background-color: #fffbeb;
+        border: 1px solid #fed7aa;
+        border-radius: 0.5rem;
+        padding: 1rem;
+        margin: 1rem 0;
+    }
+</style>
+"""
+st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
+# Base URL of the backend API. Defaults to the local development server; set the
+# API_BASE environment variable to target a backend hosted elsewhere. A trailing
+# slash is stripped so f"{API_BASE}/path" never produces a double slash.
+API_BASE = os.environ.get("API_BASE", "http://localhost:8000").rstrip("/")
+def check_backend_health():
+    """Return True when the backend's ``/health`` endpoint answers with HTTP 200.
+    Any ``requests`` failure (connection refused, timeout, malformed response) is
+    reported as ``False`` so the caller can show a hint instead of crashing.
+    """
+    try:
+        response = requests.get(f"{API_BASE}/health", timeout=5)
+    except requests.RequestException:
+        # Narrow catch: a bare ``except`` would also swallow KeyboardInterrupt/SystemExit
+        return False
+    return response.status_code == 200
+def upload_dataset(uploaded_file):
+    """Upload a dataset file to the backend and return the decoded JSON reply.
+    Args:
+        uploaded_file: file-like object sent as the multipart ``file`` field.
+    Returns:
+        The parsed JSON body on HTTP 200, otherwise ``None``. Every failure path
+        reports its reason through ``st.error`` before returning ``None``.
+    """
+    try:
+        files = {'file': uploaded_file}
+        response = requests.post(f"{API_BASE}/api/upload-dataset", files=files, timeout=30)
+        if response.status_code != 200:
+            # Surface the backend's reason instead of failing silently
+            st.error(f"Upload error ({response.status_code}): {response.text}")
+            return None
+        return response.json()
+    except (requests.RequestException, ValueError) as e:
+        # ValueError covers a 200 reply whose body is not valid JSON
+        st.error(f"Upload error: {str(e)}")
+        return None
+def process_pipeline(uploaded_file, prompt):
+    """Send the dataset and the natural-language prompt to the processing endpoint.
+    Args:
+        uploaded_file: file-like object sent as the multipart ``file`` field.
+        prompt: preprocessing instructions sent as the ``prompt`` form field.
+    Returns:
+        The parsed JSON body on HTTP 200, otherwise ``None`` after reporting the
+        failure through ``st.error``.
+    """
+    try:
+        files = {'file': uploaded_file}
+        data = {'prompt': prompt}
+        # Generous timeout: the request blocks until the backend finishes processing
+        response = requests.post(f"{API_BASE}/process-pipeline", files=files, data=data, timeout=120)
+        if response.status_code != 200:
+            st.error(f"Processing error ({response.status_code}): {response.text}")
+            return None
+        return response.json()
+    except (requests.RequestException, ValueError) as e:
+        # ValueError covers a 200 reply whose body is not valid JSON
+        st.error(f"Processing error: {str(e)}")
+        return None
+def download_file(filename):
+    """Return the backend download URL for ``filename``.
+    The name is percent-encoded so spaces, ``/`` and other reserved characters
+    cannot break or redirect the URL path; plain names such as ``train.csv``
+    come out unchanged.
+    """
+    from urllib.parse import quote  # local import keeps this block self-contained
+    return f"{API_BASE}/api/download/{quote(str(filename), safe='')}"
+def _apply_example_prompt(example):
+    """Button callback: copy an example prompt into the prompt text area's state."""
+    st.session_state.prompt_text = example
+def main():
+    """Render the PromptPrepML wizard (upload -> prompt -> results).
+    The current step and intermediate data live in ``st.session_state`` under the
+    keys ``step``, ``dataset_info``, ``processing_results`` and ``uploaded_file``.
+    Rendering stops early with setup instructions when the backend health check fails.
+    """
+    import html  # local import: escapes backend text before it is placed in raw HTML
+    # Main header
+    st.markdown('<h1 class="main-header">🤖 PromptPrepML</h1>', unsafe_allow_html=True)
+    st.markdown('<p style="text-align: center; color: #6b7280; font-size: 1.1rem;">Convert natural language prompts into ML-ready datasets</p>', unsafe_allow_html=True)
+    # Check backend health
+    if not check_backend_health():
+        st.error("❌ Backend is not running! Please start the backend first:")
+        st.code("""
+cd promptprepml/backend
+venv\\Scripts\\activate
+python app/main.py
+""")
+        return
+    st.success("✅ Backend is connected and ready!")
+    # Sidebar for navigation
+    st.sidebar.title("📋 Processing Steps")
+    # Initialize session state
+    if 'step' not in st.session_state:
+        st.session_state.step = 'upload'
+    if 'dataset_info' not in st.session_state:
+        st.session_state.dataset_info = None
+    if 'processing_results' not in st.session_state:
+        st.session_state.processing_results = None
+    if 'uploaded_file' not in st.session_state:
+        st.session_state.uploaded_file = None
+    # Step indicators
+    steps = ['📤 Upload Dataset', '💬 Enter Prompt', '⚡ Processing', '📊 Results']
+    step_mapping = {
+        'upload': 0,
+        'prompt': 1,
+        'processing': 2,
+        'results': 3
+    }
+    current_step_idx = step_mapping.get(st.session_state.step, 0)
+    for i, step in enumerate(steps):
+        if i <= current_step_idx:
+            st.sidebar.markdown(f"✅ {step}")
+        else:
+            st.sidebar.markdown(f"⏳ {step}")
+    # Main content based on current step
+    if st.session_state.step == 'upload':
+        st.markdown('<h2 class="step-header">📤 Upload Your Dataset</h2>', unsafe_allow_html=True)
+        # File upload
+        uploaded_file = st.file_uploader(
+            "Choose a CSV file",
+            type=['csv'],
+            help="Upload your dataset in CSV format. Maximum file size: 200MB"
+        )
+        if uploaded_file is not None:
+            # Display file info
+            st.markdown('<div class="info-box">', unsafe_allow_html=True)
+            st.write(f"**Filename:** {uploaded_file.name}")
+            st.write(f"**Size:** {uploaded_file.size / 1024 / 1024:.2f} MB")
+            st.markdown('</div>', unsafe_allow_html=True)
+            # Preview data
+            try:
+                df = pd.read_csv(uploaded_file)
+                st.write("**Data Preview:**")
+                st.dataframe(df.head(), use_container_width=True)
+                st.write(f"**Shape:** {df.shape[0]} rows × {df.shape[1]} columns")
+                # Upload button
+                if st.button("🚀 Upload Dataset", type="primary"):
+                    with st.spinner("Uploading dataset..."):
+                        # Reset file pointer (read_csv above consumed the stream)
+                        uploaded_file.seek(0)
+                        result = upload_dataset(uploaded_file)
+                        if result:
+                            st.session_state.dataset_info = result
+                            st.session_state.uploaded_file = uploaded_file  # Store the file
+                            st.session_state.step = 'prompt'
+                            st.rerun()
+                        else:
+                            st.error("Upload failed. Please try again.")
+            except Exception as e:
+                st.error(f"Error reading CSV file: {str(e)}")
+    elif st.session_state.step == 'prompt':
+        st.markdown('<h2 class="step-header">💬 Describe Your Preprocessing Needs</h2>', unsafe_allow_html=True)
+        # Show dataset info
+        if st.session_state.dataset_info:
+            info = st.session_state.dataset_info['dataset_info']
+            st.markdown('<div class="info-box">', unsafe_allow_html=True)
+            col1, col2, col3, col4 = st.columns(4)
+            with col1:
+                st.metric("Rows", info['shape'][0])
+            with col2:
+                st.metric("Columns", info['shape'][1])
+            with col3:
+                st.metric("Missing Values", sum(info['missing_values'].values()))
+            with col4:
+                st.metric("Duplicates", info['duplicates'])
+            st.markdown('</div>', unsafe_allow_html=True)
+            # Show file info
+            if st.session_state.uploaded_file:
+                st.info(f"📁 File loaded: {st.session_state.uploaded_file.name} ({st.session_state.uploaded_file.size / 1024 / 1024:.2f} MB)")
+        # Prompt input
+        st.write("**Enter your preprocessing instructions in natural language:**")
+        # Example prompts
+        example_prompts = [
+            "Prepare this dataset for fraud classification, handle missing values, encode categorical variables, remove outliers, and scale numeric features.",
+            "Clean this dataset for customer churn prediction, fill missing values with median, one-hot encode categories, and apply standard scaling.",
+            "Preprocess data for regression analysis, handle null values, remove duplicates, and normalize numerical features.",
+            "Get this dataset ready for machine learning, handle missing data, encode categorical variables, and scale features.",
+            "Analyze this customer dataset and prepare it for machine learning. Remove duplicate rows and unnecessary identifier columns. Handle missing values appropriately. Encode categorical variables such as country, city, and company. Extract useful features from the subscription date. Scale any numerical features if present. Remove low-variance features and prepare the dataset for clustering or classification."
+        ]
+        # Prompt text area; keyed so the example buttons can fill it through session state
+        prompt = st.text_area(
+            "Your prompt:",
+            height=120,
+            placeholder="e.g., Handle missing values, encode categorical variables, remove outliers, and scale numeric features",
+            help="Describe how you want to preprocess your dataset in plain English",
+            key="prompt_text"
+        )
+        # Example prompts section
+        with st.expander("💡 Example Prompts"):
+            for i, example in enumerate(example_prompts, 1):
+                # A callback is used because assigning a local variable and rerunning
+                # discards the choice; the callback runs before the rerun, when the
+                # widget-bound key may still be overwritten
+                st.button(
+                    f"Use Example {i}",
+                    key=f"example_{i}",
+                    on_click=_apply_example_prompt,
+                    args=(example,)
+                )
+                st.write(f"{i}. {example}")
+        # Supported operations info
+        with st.expander("🔧 Supported Operations"):
+            st.write("""
+            **Missing Values:**
+            - Mean/median/mode imputation
+            - Constant value filling
+            - Row deletion
+            **Categorical Encoding:**
+            - One-hot encoding
+            - Label encoding
+            **Feature Scaling:**
+            - Standard scaling (Z-score)
+            - Min-max scaling
+            - Robust scaling
+            **Outlier Detection:**
+            - Isolation Forest
+            - IQR method
+            - Z-score method
+            **Feature Engineering:**
+            - Variance threshold selection
+            - Correlation filtering
+            - Interaction features
+            """)
+        # Process button
+        if prompt and st.button("🚀 Process Dataset", type="primary"):
+            if st.session_state.uploaded_file:
+                with st.spinner("Processing dataset... This may take a few minutes."):
+                    # Reset file pointer
+                    st.session_state.uploaded_file.seek(0)
+                    result = process_pipeline(st.session_state.uploaded_file, prompt)
+                    if result:
+                        st.session_state.processing_results = result
+                        st.session_state.step = 'results'
+                        st.rerun()
+            else:
+                st.warning("No file found. Please upload your dataset again.")
+    elif st.session_state.step == 'results':
+        st.markdown('<h2 class="step-header">🎉 Processing Complete!</h2>', unsafe_allow_html=True)
+        if st.session_state.processing_results:
+            results = st.session_state.processing_results
+            # Success message
+            st.markdown('<div class="success-box">', unsafe_allow_html=True)
+            st.success("✅ Your dataset has been successfully preprocessed and is ready for machine learning!")
+            st.markdown('</div>', unsafe_allow_html=True)
+            # Dataset information
+            st.write("### 📊 Dataset Information")
+            info = results['dataset_info']['basic_info']
+            col1, col2, col3, col4 = st.columns(4)
+            with col1:
+                st.metric("Original Shape", f"{info['shape'][0]} × {info['shape'][1]}")
+            with col2:
+                st.metric("Numeric Columns", len(info['numeric_columns']))
+            with col3:
+                st.metric("Categorical Columns", len(info['categorical_columns']))
+            with col4:
+                missing_total = sum(results['dataset_info']['missing_values']['counts'].values())
+                st.metric("Missing Values", missing_total)
+            # Applied preprocessing steps
+            st.write("### 🔧 Applied Preprocessing Steps")
+            for i, step in enumerate(results['preprocessing_steps'], 1):
+                # Backend text is interpolated into raw HTML below, so escape it first
+                description = html.escape(str(step['description']))
+                method = html.escape(str(step.get('method', 'N/A')))
+                st.markdown(f"""
+                <div style="padding: 1rem; margin: 0.5rem 0; background-color: #f8fafc; border-left: 4px solid #3b82f6; border-radius: 0.25rem;">
+                    <strong>Step {i}:</strong> {description}<br>
+                    <small>Method: {method}</small>
+                </div>
+                """, unsafe_allow_html=True)
+            # Download files
+            st.write("### 📁 Download Files")
+            files_to_download = [
+                ("processed_dataset.csv", "📊 Processed Dataset", "Fully preprocessed dataset ready for ML"),
+                ("train.csv", "🚂 Training Set", "80% of data for model training"),
+                ("test.csv", "🧪 Test Set", "20% of data for model testing"),
+                ("pipeline.pkl", "⚙️ Pipeline", "Scikit-learn pipeline for reuse"),
+                ("eda_report.html", "📈 EDA Report", "Exploratory Data Analysis report")
+            ]
+            col1, col2 = st.columns(2)
+            for i, (filename, title, description) in enumerate(files_to_download):
+                with col1 if i % 2 == 0 else col2:
+                    st.markdown(f"""
+                    <div style="padding: 1rem; margin: 0.5rem 0; border: 1px solid #e5e7eb; border-radius: 0.5rem;">
+                        <h4>{title}</h4>
+                        <p><small>{description}</small></p>
+                        <a href="{download_file(filename)}" download="{filename}" style="text-decoration: none;">
+                            <button style="background-color: #3b82f6; color: white; padding: 0.5rem 1rem; border: none; border-radius: 0.25rem; cursor: pointer;">
+                                📥 Download {filename}
+                            </button>
+                        </a>
+                    </div>
+                    """, unsafe_allow_html=True)
+            # Quick actions
+            st.write("### ⚡ Quick Actions")
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                if st.button("📈 View EDA Report", type="secondary"):
+                    st.info(f"EDA Report will be available at: {download_file('eda_report.html')}")
+            with col2:
+                if st.button("⚙️ Download Pipeline", type="secondary"):
+                    st.info(f"Pipeline file: {download_file('pipeline.pkl')}")
+            with col3:
+                if st.button("🔄 Process Another Dataset", type="primary"):
+                    # Reset session state; iterate over a snapshot because deleting
+                    # entries while iterating the live key view is unsafe
+                    for key in list(st.session_state.keys()):
+                        del st.session_state[key]
+                    st.session_state.step = 'upload'
+                    st.rerun()
+        else:
+            st.error("No processing results available. Please start over.")
+    # Footer
+    st.markdown("---")
+    st.markdown("""
+    <div style="text-align: center; color: #6b7280; margin-top: 2rem;">
+        <p><strong>PromptPrepML</strong> - Automated ML Data Preprocessing</p>
+        <p><small>Convert natural language prompts into ML-ready datasets</small></p>
+    </div>
+    """, unsafe_allow_html=True)
 if __name__ == "__main__":
+    main()

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ