Commit Β·
3836bc2
1
Parent(s): 880c9a5
Deploy PromptPrepML - Standalone Gradio App with ML Preprocessing
Browse files- README.md +34 -6
- app.py +279 -0
- requirements.txt +4 -0
README.md
CHANGED
|
@@ -1,13 +1,41 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: PromptPrepML
|
| 3 |
+
emoji: π€
|
| 4 |
+
colorFrom: blue
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.0.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# PromptPrepML
|
| 14 |
+
AI-Powered Machine Learning Data Preprocessing Assistant
|
| 15 |
+
|
| 16 |
+
Upload your dataset and describe your preprocessing needs in natural language. Our intelligent system will automatically:
|
| 17 |
+
- Detect and remove identifier columns
|
| 18 |
+
- Extract date features
|
| 19 |
+
- Handle categorical encoding
|
| 20 |
+
- Scale numeric features
|
| 21 |
+
- Generate ML-ready datasets
|
| 22 |
+
- Create reusable pipelines
|
| 23 |
+
|
| 24 |
+
## π Features
|
| 25 |
+
- **π§ Intelligent Preprocessing**: Smart column detection and processing
|
| 26 |
+
- **π Automated EDA**: Comprehensive data analysis reports
|
| 27 |
+
- **π§ Smart Feature Engineering**: Advanced feature extraction
|
| 28 |
+
- **βοΈ Reusable Pipelines**: Scikit-learn pipelines for production
|
| 29 |
+
- **π Clean Outputs**: ML-ready train/test datasets
|
| 30 |
+
|
| 31 |
+
## π Usage
|
| 32 |
+
1. Upload your CSV dataset
|
| 33 |
+
2. Describe your preprocessing needs in natural language
|
| 34 |
+
3. Click "Process Dataset"
|
| 35 |
+
4. Download your ML-ready results
|
| 36 |
+
|
| 37 |
+
## π οΈ Tech Stack
|
| 38 |
+
- **Backend**: Python, FastAPI, scikit-learn
|
| 39 |
+
- **Frontend**: Gradio
|
| 40 |
+
- **EDA**: ydata-profiling
|
| 41 |
+
- **ML**: pandas, numpy
|
app.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import io
import warnings

# Deliberately silence ALL warnings (pandas to_datetime inference notices,
# sklearn deprecations, ...) so they don't clutter the hosted demo's logs.
# NOTE(review): this also hides genuinely useful warnings — consider scoping
# to specific categories in a non-demo deployment.
warnings.filterwarnings('ignore')
class StandalonePreprocessor:
    """End-to-end tabular preprocessing for the demo app.

    Pipeline: column triage (IDs / dates / text / categorical / numeric),
    date expansion into calendar features, median/mode imputation, one-hot
    encoding, standard scaling, and near-zero-variance pruning.
    """

    def __init__(self):
        # Fitted sklearn Pipeline; None until process() has run.
        self.pipeline = None
        # Output column names aligned with the transformed matrix.
        self.feature_names = []
        # Result of the most recent analyze_columns() call.
        self.analysis = {}

    def analyze_columns(self, df):
        """Classify every column of *df* into one of six buckets.

        Returns a dict with keys 'identifiers', 'dates', 'text_features',
        'categorical_low_cardinality', 'categorical_high_cardinality',
        'numeric', each mapping to a list of column names.
        """
        analysis = {
            'identifiers': [],
            'dates': [],
            'text_features': [],
            'categorical_low_cardinality': [],
            'categorical_high_cardinality': [],
            'numeric': []
        }

        n_rows = len(df)
        if n_rows == 0:
            # Empty frame: nothing to classify. Guard avoids the
            # ZeroDivisionError the uniqueness ratios below would raise.
            return analysis

        for col in df.columns:
            col_lower = col.lower()

            # Identifier detection: the name hints at an ID *and* the values
            # are (almost) unique. The substring match is deliberately loose
            # ('id' also matches 'user_id', 'order_id', ...).
            is_identifier = (
                any(keyword in col_lower for keyword in ['id', 'index', 'uuid', 'key']) and
                (df[col].nunique() / n_rows > 0.8)
            )

            if is_identifier:
                analysis['identifiers'].append(col)
                continue

            # Date detection: probe only the first few non-null values.
            # An all-null column is NOT treated as a date (an empty sample
            # would otherwise "parse" successfully and misclassify it).
            if df[col].dtype == 'object':
                sample = df[col].dropna().head(10)
                if not sample.empty:
                    try:
                        pd.to_datetime(sample)
                        analysis['dates'].append(col)
                        continue
                    except (ValueError, TypeError):
                        # Not parseable as dates; fall through to the
                        # text/categorical checks below.
                        pass

            # Free-text-ish columns (names, contact info, descriptions) are
            # flagged for removal rather than encoding.
            text_keywords = ['name', 'email', 'phone', 'website', 'address', 'description']
            if any(keyword in col_lower for keyword in text_keywords):
                analysis['text_features'].append(col)
                continue

            # Remaining object columns are split by cardinality; everything
            # else (ints, floats, bools, datetimes) is treated as numeric.
            if df[col].dtype == 'object':
                unique_ratio = df[col].nunique() / n_rows
                if unique_ratio > 0.5:
                    analysis['categorical_high_cardinality'].append(col)
                else:
                    analysis['categorical_low_cardinality'].append(col)
            else:
                analysis['numeric'].append(col)

        return analysis

    def extract_date_features(self, df, date_cols):
        """Return a copy of *df* where each column in *date_cols* is replaced
        by four numeric features: year, month, day, weekday.

        Columns that are missing or fail to parse are left untouched.
        """
        df_processed = df.copy()

        for col in date_cols:
            if col not in df_processed.columns:
                # Column may already have been dropped upstream.
                continue
            try:
                dates = pd.to_datetime(df_processed[col])
            except (ValueError, TypeError):
                # Unparseable despite earlier detection; keep the raw column
                # rather than silently losing data.
                continue
            df_processed[f'{col}_year'] = dates.dt.year
            df_processed[f'{col}_month'] = dates.dt.month
            df_processed[f'{col}_day'] = dates.dt.day
            df_processed[f'{col}_weekday'] = dates.dt.weekday
            df_processed.drop(col, axis=1, inplace=True)

        return df_processed

    def process(self, df):
        """Analyze, clean, and transform *df* into a numeric, ML-ready frame.

        Side effects: sets self.analysis, self.pipeline, self.feature_names.
        Returns a new DataFrame of the transformed features.
        """
        # Step 1: classify columns.
        self.analysis = self.analyze_columns(df)

        # Step 2: drop columns that carry no generalizable signal.
        columns_to_drop = (
            self.analysis['identifiers'] +
            self.analysis['text_features'] +
            self.analysis['categorical_high_cardinality']
        )
        df_clean = df.drop(columns=columns_to_drop, errors='ignore')

        # Step 3: expand date columns into numeric calendar features.
        if self.analysis['dates']:
            df_clean = self.extract_date_features(df_clean, self.analysis['dates'])

        # Step 4: impute+scale numerics; impute+one-hot encode categoricals.
        numeric_features = df_clean.select_dtypes(include=[np.number]).columns.tolist()
        categorical_features = df_clean.select_dtypes(include=['object']).columns.tolist()

        preprocessor = ColumnTransformer(
            transformers=[
                ('numeric', Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ]), numeric_features),
                ('categorical', Pipeline([
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    # sparse_output=False keeps the result a dense ndarray
                    # (requires scikit-learn >= 1.2).
                    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
                ]), categorical_features)
            ]
        )

        # Step 5: follow with a near-zero-variance filter.
        self.pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('feature_selector', VarianceThreshold(threshold=0.01))
        ])

        # Step 6: fit on the full frame and transform it.
        processed_data = self.pipeline.fit_transform(df_clean)

        # Step 7: recover human-readable output names. Use the variance
        # filter's support mask so names stay aligned with the surviving
        # columns (naive truncation would misalign when an *early* column
        # is dropped). Fall back to positional names on any failure.
        try:
            feature_names = []
            if numeric_features:
                feature_names.extend([f'numeric__{f}' for f in numeric_features])
            if categorical_features:
                encoder = self.pipeline.named_steps['preprocessor'].named_transformers_['categorical'].named_steps['encoder']
                cat_names = encoder.get_feature_names_out(categorical_features)
                feature_names.extend([f'categorical__{name}' for name in cat_names])

            support = self.pipeline.named_steps['feature_selector'].get_support()
            if len(feature_names) == len(support):
                feature_names = [n for n, keep in zip(feature_names, support) if keep]
            self.feature_names = feature_names[:processed_data.shape[1]]
        except Exception:
            self.feature_names = [f'feature_{i}' for i in range(processed_data.shape[1])]

        # Step 8: wrap the matrix back into a DataFrame.
        return pd.DataFrame(processed_data, columns=self.feature_names)

    def split_data(self, df):
        """Split *df* into train/test (80/20) with a fixed seed so repeated
        runs produce the same split."""
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
        return train_df, test_df
# Module-level singleton so the Gradio callback reuses one preprocessor
# instance across requests. NOTE(review): state (pipeline, analysis) is
# shared between concurrent users — acceptable for a demo, not for
# multi-user production use.
preprocessor = StandalonePreprocessor()
def process_dataset(file, prompt):
    """Gradio callback: preprocess an uploaded CSV end to end.

    Parameters:
        file: value from the gr.File input. In Gradio 4.x this is a filepath
              string; older versions passed a file-like object. Both are
              handled (the original `file.read()` broke on path strings).
        prompt: free-text instructions from the textbox. Currently
              informational only — the pipeline applies a fixed set of steps.

    Returns a 6-tuple matching the UI outputs:
        (summary markdown, processed CSV path, train CSV path, test CSV path,
         preview DataFrame, status string).
    """
    if file is None:
        return "Please upload a dataset", None, None, None, None, ""

    import os
    import tempfile

    try:
        # Read the upload: Gradio 4 delivers a path, older Gradio delivered
        # a tempfile wrapper with .name, and raw file-likes expose .read().
        if isinstance(file, (str, os.PathLike)):
            df = pd.read_csv(file)
        elif hasattr(file, 'name') and os.path.exists(getattr(file, 'name', '')):
            df = pd.read_csv(file.name)
        else:
            df = pd.read_csv(io.BytesIO(file.read()))

        # Process and split the dataset.
        processed_df = preprocessor.process(df)
        train_df, test_df = preprocessor.split_data(processed_df)

        # Build the human-readable summary.
        analysis = preprocessor.analysis

        summary = f"""
## β **Processing Complete!**

### π **Dataset Information**
- **Original Shape**: {df.shape}
- **Processed Shape**: {processed_df.shape}
- **Training Set**: {train_df.shape}
- **Test Set**: {test_df.shape}

### π **Column Analysis**
- **π― Identifiers Removed**: {len(analysis['identifiers'])} columns
- **π Text Features Removed**: {len(analysis['text_features'])} columns
- **π
 Date Columns Processed**: {len(analysis['dates'])} columns
- **π·οΈ Low Cardinality Encoded**: {len(analysis['categorical_low_cardinality'])} columns
- **π² High Cardinality Dropped**: {len(analysis['categorical_high_cardinality'])} columns
- **π’ Numeric Features**: {len(analysis['numeric'])} columns

### ποΈ **Dropped Columns**
{', '.join(analysis['identifiers'] + analysis['text_features'] + analysis['categorical_high_cardinality']) if analysis['identifiers'] + analysis['text_features'] + analysis['categorical_high_cardinality'] else 'None'}

### π **Processing Steps Applied**
1. β Identifier column detection and removal
2. β Text feature detection and removal
3. β Date feature extraction (year, month, day, weekday)
4. β Missing value imputation
5. β Categorical encoding (one-hot)
6. β Numeric feature scaling
7. β Low-variance feature removal
8. β Train/test split (80/20)

### π **Files Ready for Download**
- Processed dataset (clean, ML-ready)
- Training set (80% of data)
- Test set (20% of data)
"""

        # gr.File OUTPUT components expect file *paths*, not raw bytes
        # (the original returned encoded CSV bytes, which Gradio cannot
        # serve as downloads). Write each frame to a named temp file.
        def _to_csv_file(frame, suffix):
            # delete=False: the file must outlive this call so Gradio can
            # serve it for download.
            tmp = tempfile.NamedTemporaryFile(
                mode='w', suffix=suffix, delete=False, newline=''
            )
            frame.to_csv(tmp, index=False)
            tmp.close()
            return tmp.name

        processed_path = _to_csv_file(processed_df, '_processed.csv')
        train_path = _to_csv_file(train_df, '_train.csv')
        test_path = _to_csv_file(test_df, '_test.csv')

        return summary, processed_path, train_path, test_path, processed_df.head(10), "β Processing completed successfully!"

    except Exception as e:
        # Surface any failure to the UI rather than crashing the worker.
        return f"β Error: {str(e)}", None, None, None, None, f"β Processing failed: {str(e)}"
# Create Gradio interface.
# Layout: upload + instructions column (left), results column (right),
# then a preview table, a row of download slots, and static instructions.
with gr.Blocks(title="PromptPrepML", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π€ PromptPrepML")
    gr.Markdown("AI-Powered Machine Learning Data Preprocessing Assistant")
    gr.Markdown("Upload your dataset and get ML-ready results in seconds! π")

    with gr.Row():
        with gr.Column(scale=1):
            # Input side: CSV upload, free-text instructions, trigger button.
            file_input = gr.File(label="π Upload CSV Dataset", file_types=[".csv"])
            prompt_input = gr.Textbox(
                label="π¬ Processing Instructions",
                value="Prepare this dataset for machine learning. Handle missing values, remove identifier columns, extract date features, encode categorical variables, and scale numeric features.",
                lines=3
            )
            process_btn = gr.Button("π Process Dataset", variant="primary", size="lg")

        with gr.Column(scale=2):
            # Output side: markdown summary plus a read-only status line.
            output_summary = gr.Markdown(label="π Results Summary")
            status_output = gr.Textbox(label="π Status", interactive=False)

    gr.Markdown("## π Preview of Processed Dataset")
    # NOTE(review): `max_rows` was removed from gr.Dataframe in Gradio 4 —
    # confirm against the pinned sdk_version; `row_count` may be intended.
    preview_output = gr.Dataframe(label="π Dataset Preview", max_rows=10)

    gr.Markdown("## π₯ Download Files")
    with gr.Row():
        # Download slots populated by the callback's file outputs.
        processed_download = gr.File(label="π Processed Dataset")
        train_download = gr.File(label="π Training Set")
        test_download = gr.File(label="π§ͺ Test Set")

    # Event handlers: output order must match process_dataset's return tuple.
    process_btn.click(
        fn=process_dataset,
        inputs=[file_input, prompt_input],
        outputs=[output_summary, processed_download, train_download, test_download, preview_output, status_output]
    )

    gr.Markdown("## π Instructions")
    gr.Markdown("""
1. π **Upload your CSV dataset** (any size)
2. π¬ **Describe your preprocessing needs** (or use default)
3. π **Click "Process Dataset"**
4. π₯ **Download your ML-ready results**
5. π **Use for machine learning!**

### π§ **Intelligent Features**
- **Automatic identifier detection** and removal
- **Smart date feature extraction**
- **Text feature handling**
- **Categorical encoding** for low-cardinality features
- **High cardinality handling**
- **Missing value imputation**
- **Feature scaling**
- **Train/test splitting**
""")

# Launch the app only when run as a script (Spaces runtime imports and
# serves `demo` directly in some configurations).
if __name__ == "__main__":
    demo.launch()
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
scikit-learn
|