Commit Β·
3836bc2
1
Parent(s): 880c9a5
Deploy PromptPrepML - Standalone Gradio App with ML Preprocessing
Browse files- README.md +34 -6
- app.py +279 -0
- requirements.txt +4 -0
README.md
CHANGED
|
@@ -1,13 +1,41 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: PromptPrepML
|
| 3 |
+
emoji: π€
|
| 4 |
+
colorFrom: blue
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.0.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# PromptPrepML
|
| 14 |
+
AI-Powered Machine Learning Data Preprocessing Assistant
|
| 15 |
+
|
| 16 |
+
Upload your dataset and describe your preprocessing needs in natural language. Our intelligent system will automatically:
|
| 17 |
+
- Detect and remove identifier columns
|
| 18 |
+
- Extract date features
|
| 19 |
+
- Handle categorical encoding
|
| 20 |
+
- Scale numeric features
|
| 21 |
+
- Generate ML-ready datasets
|
| 22 |
+
- Create reusable pipelines
|
| 23 |
+
|
| 24 |
+
## π Features
|
| 25 |
+
- **π§ Intelligent Preprocessing**: Smart column detection and processing
|
| 26 |
+
- **π Automated EDA**: Comprehensive data analysis reports
|
| 27 |
+
- **π§ Smart Feature Engineering**: Advanced feature extraction
|
| 28 |
+
- **βοΈ Reusable Pipelines**: Scikit-learn pipelines for production
|
| 29 |
+
- **π Clean Outputs**: ML-ready train/test datasets
|
| 30 |
+
|
| 31 |
+
## π Usage
|
| 32 |
+
1. Upload your CSV dataset
|
| 33 |
+
2. Describe your preprocessing needs in natural language
|
| 34 |
+
3. Click "Process Dataset"
|
| 35 |
+
4. Download your ML-ready results
|
| 36 |
+
|
| 37 |
+
## π οΈ Tech Stack
|
| 38 |
+
- **Backend**: Python, FastAPI, scikit-learn
|
| 39 |
+
- **Frontend**: Gradio
|
| 40 |
+
- **EDA**: ydata-profiling
|
| 41 |
+
- **ML**: pandas, numpy
|
app.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import io
import warnings

# Deliberately silence ALL warnings (pandas to_datetime inference notices,
# sklearn deprecations, ...) so they don't clutter the hosted demo's logs.
# NOTE(review): this also hides genuinely useful warnings — consider scoping
# to specific categories in a non-demo deployment.
warnings.filterwarnings('ignore')
class StandalonePreprocessor:
    """End-to-end tabular preprocessing for the demo app.

    Pipeline: column triage (IDs / dates / text / categorical / numeric),
    date expansion into calendar features, median/mode imputation, one-hot
    encoding, standard scaling, and near-zero-variance pruning.
    """

    def __init__(self):
        # Fitted sklearn Pipeline; None until process() has run.
        self.pipeline = None
        # Output column names aligned with the transformed matrix.
        self.feature_names = []
        # Result of the most recent analyze_columns() call.
        self.analysis = {}

    def analyze_columns(self, df):
        """Classify every column of *df* into one of six buckets.

        Returns a dict with keys 'identifiers', 'dates', 'text_features',
        'categorical_low_cardinality', 'categorical_high_cardinality',
        'numeric', each mapping to a list of column names.
        """
        analysis = {
            'identifiers': [],
            'dates': [],
            'text_features': [],
            'categorical_low_cardinality': [],
            'categorical_high_cardinality': [],
            'numeric': []
        }

        n_rows = len(df)
        if n_rows == 0:
            # Empty frame: nothing to classify. Guard avoids the
            # ZeroDivisionError the uniqueness ratios below would raise.
            return analysis

        for col in df.columns:
            col_lower = col.lower()

            # Identifier detection: the name hints at an ID *and* the values
            # are (almost) unique. The substring match is deliberately loose
            # ('id' also matches 'user_id', 'order_id', ...).
            is_identifier = (
                any(keyword in col_lower for keyword in ['id', 'index', 'uuid', 'key']) and
                (df[col].nunique() / n_rows > 0.8)
            )

            if is_identifier:
                analysis['identifiers'].append(col)
                continue

            # Date detection: probe only the first few non-null values.
            # An all-null column is NOT treated as a date (an empty sample
            # would otherwise "parse" successfully and misclassify it).
            if df[col].dtype == 'object':
                sample = df[col].dropna().head(10)
                if not sample.empty:
                    try:
                        pd.to_datetime(sample)
                        analysis['dates'].append(col)
                        continue
                    except (ValueError, TypeError):
                        # Not parseable as dates; fall through to the
                        # text/categorical checks below.
                        pass

            # Free-text-ish columns (names, contact info, descriptions) are
            # flagged for removal rather than encoding.
            text_keywords = ['name', 'email', 'phone', 'website', 'address', 'description']
            if any(keyword in col_lower for keyword in text_keywords):
                analysis['text_features'].append(col)
                continue

            # Remaining object columns are split by cardinality; everything
            # else (ints, floats, bools, datetimes) is treated as numeric.
            if df[col].dtype == 'object':
                unique_ratio = df[col].nunique() / n_rows
                if unique_ratio > 0.5:
                    analysis['categorical_high_cardinality'].append(col)
                else:
                    analysis['categorical_low_cardinality'].append(col)
            else:
                analysis['numeric'].append(col)

        return analysis

    def extract_date_features(self, df, date_cols):
        """Return a copy of *df* where each column in *date_cols* is replaced
        by four numeric features: year, month, day, weekday.

        Columns that are missing or fail to parse are left untouched.
        """
        df_processed = df.copy()

        for col in date_cols:
            if col not in df_processed.columns:
                # Column may already have been dropped upstream.
                continue
            try:
                dates = pd.to_datetime(df_processed[col])
            except (ValueError, TypeError):
                # Unparseable despite earlier detection; keep the raw column
                # rather than silently losing data.
                continue
            df_processed[f'{col}_year'] = dates.dt.year
            df_processed[f'{col}_month'] = dates.dt.month
            df_processed[f'{col}_day'] = dates.dt.day
            df_processed[f'{col}_weekday'] = dates.dt.weekday
            df_processed.drop(col, axis=1, inplace=True)

        return df_processed

    def process(self, df):
        """Analyze, clean, and transform *df* into a numeric, ML-ready frame.

        Side effects: sets self.analysis, self.pipeline, self.feature_names.
        Returns a new DataFrame of the transformed features.
        """
        # Step 1: classify columns.
        self.analysis = self.analyze_columns(df)

        # Step 2: drop columns that carry no generalizable signal.
        columns_to_drop = (
            self.analysis['identifiers'] +
            self.analysis['text_features'] +
            self.analysis['categorical_high_cardinality']
        )
        df_clean = df.drop(columns=columns_to_drop, errors='ignore')

        # Step 3: expand date columns into numeric calendar features.
        if self.analysis['dates']:
            df_clean = self.extract_date_features(df_clean, self.analysis['dates'])

        # Step 4: impute+scale numerics; impute+one-hot encode categoricals.
        numeric_features = df_clean.select_dtypes(include=[np.number]).columns.tolist()
        categorical_features = df_clean.select_dtypes(include=['object']).columns.tolist()

        preprocessor = ColumnTransformer(
            transformers=[
                ('numeric', Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ]), numeric_features),
                ('categorical', Pipeline([
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    # sparse_output=False keeps the result a dense ndarray
                    # (requires scikit-learn >= 1.2).
                    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
                ]), categorical_features)
            ]
        )

        # Step 5: follow with a near-zero-variance filter.
        self.pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('feature_selector', VarianceThreshold(threshold=0.01))
        ])

        # Step 6: fit on the full frame and transform it.
        processed_data = self.pipeline.fit_transform(df_clean)

        # Step 7: recover human-readable output names. Use the variance
        # filter's support mask so names stay aligned with the surviving
        # columns (naive truncation would misalign when an *early* column
        # is dropped). Fall back to positional names on any failure.
        try:
            feature_names = []
            if numeric_features:
                feature_names.extend([f'numeric__{f}' for f in numeric_features])
            if categorical_features:
                encoder = self.pipeline.named_steps['preprocessor'].named_transformers_['categorical'].named_steps['encoder']
                cat_names = encoder.get_feature_names_out(categorical_features)
                feature_names.extend([f'categorical__{name}' for name in cat_names])

            support = self.pipeline.named_steps['feature_selector'].get_support()
            if len(feature_names) == len(support):
                feature_names = [n for n, keep in zip(feature_names, support) if keep]
            self.feature_names = feature_names[:processed_data.shape[1]]
        except Exception:
            self.feature_names = [f'feature_{i}' for i in range(processed_data.shape[1])]

        # Step 8: wrap the matrix back into a DataFrame.
        return pd.DataFrame(processed_data, columns=self.feature_names)

    def split_data(self, df):
        """Split *df* into train/test (80/20) with a fixed seed so repeated
        runs produce the same split."""
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
        return train_df, test_df
# Module-level singleton so the Gradio callback reuses one preprocessor
# instance across requests. NOTE(review): state (pipeline, analysis) is
# shared between concurrent users — acceptable for a demo, not for
# multi-user production use.
preprocessor = StandalonePreprocessor()
def process_dataset(file, prompt):
    """Gradio callback: preprocess an uploaded CSV end to end.

    Parameters:
        file: value from the gr.File input. In Gradio 4.x this is a filepath
              string; older versions passed a file-like object. Both are
              handled (the original `file.read()` broke on path strings).
        prompt: free-text instructions from the textbox. Currently
              informational only — the pipeline applies a fixed set of steps.

    Returns a 6-tuple matching the UI outputs:
        (summary markdown, processed CSV path, train CSV path, test CSV path,
         preview DataFrame, status string).
    """
    if file is None:
        return "Please upload a dataset", None, None, None, None, ""

    import os
    import tempfile

    try:
        # Read the upload: Gradio 4 delivers a path, older Gradio delivered
        # a tempfile wrapper with .name, and raw file-likes expose .read().
        if isinstance(file, (str, os.PathLike)):
            df = pd.read_csv(file)
        elif hasattr(file, 'name') and os.path.exists(getattr(file, 'name', '')):
            df = pd.read_csv(file.name)
        else:
            df = pd.read_csv(io.BytesIO(file.read()))

        # Process and split the dataset.
        processed_df = preprocessor.process(df)
        train_df, test_df = preprocessor.split_data(processed_df)

        # Build the human-readable summary.
        analysis = preprocessor.analysis

        summary = f"""
## β **Processing Complete!**

### π **Dataset Information**
- **Original Shape**: {df.shape}
- **Processed Shape**: {processed_df.shape}
- **Training Set**: {train_df.shape}
- **Test Set**: {test_df.shape}

### π **Column Analysis**
- **π― Identifiers Removed**: {len(analysis['identifiers'])} columns
- **π Text Features Removed**: {len(analysis['text_features'])} columns
- **π
 Date Columns Processed**: {len(analysis['dates'])} columns
- **π·οΈ Low Cardinality Encoded**: {len(analysis['categorical_low_cardinality'])} columns
- **π² High Cardinality Dropped**: {len(analysis['categorical_high_cardinality'])} columns
- **π’ Numeric Features**: {len(analysis['numeric'])} columns

### ποΈ **Dropped Columns**
{', '.join(analysis['identifiers'] + analysis['text_features'] + analysis['categorical_high_cardinality']) if analysis['identifiers'] + analysis['text_features'] + analysis['categorical_high_cardinality'] else 'None'}

### π **Processing Steps Applied**
1. β Identifier column detection and removal
2. β Text feature detection and removal
3. β Date feature extraction (year, month, day, weekday)
4. β Missing value imputation
5. β Categorical encoding (one-hot)
6. β Numeric feature scaling
7. β Low-variance feature removal
8. β Train/test split (80/20)

### π **Files Ready for Download**
- Processed dataset (clean, ML-ready)
- Training set (80% of data)
- Test set (20% of data)
"""

        # gr.File OUTPUT components expect file *paths*, not raw bytes
        # (the original returned encoded CSV bytes, which Gradio cannot
        # serve as downloads). Write each frame to a named temp file.
        def _to_csv_file(frame, suffix):
            # delete=False: the file must outlive this call so Gradio can
            # serve it for download.
            tmp = tempfile.NamedTemporaryFile(
                mode='w', suffix=suffix, delete=False, newline=''
            )
            frame.to_csv(tmp, index=False)
            tmp.close()
            return tmp.name

        processed_path = _to_csv_file(processed_df, '_processed.csv')
        train_path = _to_csv_file(train_df, '_train.csv')
        test_path = _to_csv_file(test_df, '_test.csv')

        return summary, processed_path, train_path, test_path, processed_df.head(10), "β Processing completed successfully!"

    except Exception as e:
        # Surface any failure to the UI rather than crashing the worker.
        return f"β Error: {str(e)}", None, None, None, None, f"β Processing failed: {str(e)}"
# Create Gradio interface.
# Layout: upload + instructions column (left), results column (right),
# then a preview table, a row of download slots, and static instructions.
with gr.Blocks(title="PromptPrepML", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π€ PromptPrepML")
    gr.Markdown("AI-Powered Machine Learning Data Preprocessing Assistant")
    gr.Markdown("Upload your dataset and get ML-ready results in seconds! π")

    with gr.Row():
        with gr.Column(scale=1):
            # Input side: CSV upload, free-text instructions, trigger button.
            file_input = gr.File(label="π Upload CSV Dataset", file_types=[".csv"])
            prompt_input = gr.Textbox(
                label="π¬ Processing Instructions",
                value="Prepare this dataset for machine learning. Handle missing values, remove identifier columns, extract date features, encode categorical variables, and scale numeric features.",
                lines=3
            )
            process_btn = gr.Button("π Process Dataset", variant="primary", size="lg")

        with gr.Column(scale=2):
            # Output side: markdown summary plus a read-only status line.
            output_summary = gr.Markdown(label="π Results Summary")
            status_output = gr.Textbox(label="π Status", interactive=False)

    gr.Markdown("## π Preview of Processed Dataset")
    # NOTE(review): `max_rows` was removed from gr.Dataframe in Gradio 4 —
    # confirm against the pinned sdk_version; `row_count` may be intended.
    preview_output = gr.Dataframe(label="π Dataset Preview", max_rows=10)

    gr.Markdown("## π₯ Download Files")
    with gr.Row():
        # Download slots populated by the callback's file outputs.
        processed_download = gr.File(label="π Processed Dataset")
        train_download = gr.File(label="π Training Set")
        test_download = gr.File(label="π§ͺ Test Set")

    # Event handlers: output order must match process_dataset's return tuple.
    process_btn.click(
        fn=process_dataset,
        inputs=[file_input, prompt_input],
        outputs=[output_summary, processed_download, train_download, test_download, preview_output, status_output]
    )

    gr.Markdown("## π Instructions")
    gr.Markdown("""
1. π **Upload your CSV dataset** (any size)
2. π¬ **Describe your preprocessing needs** (or use default)
3. π **Click "Process Dataset"**
4. π₯ **Download your ML-ready results**
5. π **Use for machine learning!**

### π§ **Intelligent Features**
- **Automatic identifier detection** and removal
- **Smart date feature extraction**
- **Text feature handling**
- **Categorical encoding** for low-cardinality features
- **High cardinality handling**
- **Missing value imputation**
- **Feature scaling**
- **Train/test splitting**
""")

# Launch the app only when run as a script (Spaces runtime imports and
# serves `demo` directly in some configurations).
if __name__ == "__main__":
    demo.launch()
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
scikit-learn
|