Commit ·
c3c3195
1
Parent(s): c3b2831
Deploy working Streamlit version - exact local version that works perfectly
Browse files- README.md +3 -3
- app.py +355 -207
- requirements.txt +0 -0
README.md
CHANGED
|
@@ -3,8 +3,8 @@ title: PromptPrepML
|
|
| 3 |
emoji: 🤖
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: indigo
|
| 6 |
-
sdk:
|
| 7 |
-
sdk_version:
|
| 8 |
python_version: 3.11
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
|
@@ -37,6 +37,6 @@ Upload your dataset and describe your preprocessing needs in natural language. O
|
|
| 37 |
|
| 38 |
## 🛠️ Tech Stack
|
| 39 |
- **Backend**: Python, FastAPI, scikit-learn
|
| 40 |
-
- **Frontend**:
|
| 41 |
- **EDA**: ydata-profiling
|
| 42 |
- **ML**: pandas, numpy
|
|
|
|
| 3 |
emoji: 🤖
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: indigo
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.28.0
|
| 8 |
python_version: 3.11
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
|
|
|
| 37 |
|
| 38 |
## 🛠️ Tech Stack
|
| 39 |
- **Backend**: Python, FastAPI, scikit-learn
|
| 40 |
+
- **Frontend**: Streamlit
|
| 41 |
- **EDA**: ydata-profiling
|
| 42 |
- **ML**: pandas, numpy
|
app.py
CHANGED
|
@@ -1,228 +1,376 @@
|
|
| 1 |
-
import
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
-
import numpy as np
|
| 4 |
-
from sklearn.impute import SimpleImputer
|
| 5 |
-
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
| 6 |
-
from sklearn.feature_selection import VarianceThreshold
|
| 7 |
-
from sklearn.compose import ColumnTransformer
|
| 8 |
-
from sklearn.pipeline import Pipeline
|
| 9 |
-
from sklearn.model_selection import train_test_split
|
| 10 |
import io
|
| 11 |
-
import
|
| 12 |
-
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
}
|
| 30 |
-
|
| 31 |
-
for col in df.columns:
|
| 32 |
-
col_lower = col.lower()
|
| 33 |
-
|
| 34 |
-
# Identifier detection
|
| 35 |
-
is_identifier = (
|
| 36 |
-
any(keyword in col_lower for keyword in ['id', 'index', 'uuid', 'key']) and
|
| 37 |
-
(df[col].nunique() / len(df) > 0.8)
|
| 38 |
-
)
|
| 39 |
-
|
| 40 |
-
if is_identifier:
|
| 41 |
-
analysis['identifiers'].append(col)
|
| 42 |
-
continue
|
| 43 |
-
|
| 44 |
-
# Date detection
|
| 45 |
-
if df[col].dtype == 'object':
|
| 46 |
-
try:
|
| 47 |
-
pd.to_datetime(df[col].dropna().head(10))
|
| 48 |
-
analysis['dates'].append(col)
|
| 49 |
-
continue
|
| 50 |
-
except:
|
| 51 |
-
pass
|
| 52 |
-
|
| 53 |
-
# Text feature detection
|
| 54 |
-
text_keywords = ['name', 'email', 'phone', 'website', 'address', 'description']
|
| 55 |
-
if any(keyword in col_lower for keyword in text_keywords):
|
| 56 |
-
analysis['text_features'].append(col)
|
| 57 |
-
continue
|
| 58 |
-
|
| 59 |
-
# Categorical vs Numeric
|
| 60 |
-
if df[col].dtype == 'object':
|
| 61 |
-
unique_ratio = df[col].nunique() / len(df)
|
| 62 |
-
if unique_ratio > 0.5:
|
| 63 |
-
analysis['categorical_high_cardinality'].append(col)
|
| 64 |
-
else:
|
| 65 |
-
analysis['categorical_low_cardinality'].append(col)
|
| 66 |
-
else:
|
| 67 |
-
analysis['numeric'].append(col)
|
| 68 |
-
|
| 69 |
-
return analysis
|
| 70 |
|
| 71 |
-
|
| 72 |
-
"""Extract features from date columns"""
|
| 73 |
-
df_processed = df.copy()
|
| 74 |
-
|
| 75 |
-
for col in date_cols:
|
| 76 |
-
try:
|
| 77 |
-
dates = pd.to_datetime(df_processed[col])
|
| 78 |
-
df_processed[f'{col}_year'] = dates.dt.year
|
| 79 |
-
df_processed[f'{col}_month'] = dates.dt.month
|
| 80 |
-
df_processed[f'{col}_day'] = dates.dt.day
|
| 81 |
-
df_processed[f'{col}_weekday'] = dates.dt.weekday
|
| 82 |
-
df_processed.drop(col, axis=1, inplace=True)
|
| 83 |
-
except:
|
| 84 |
-
pass
|
| 85 |
-
|
| 86 |
-
return df_processed
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
]), numeric_features),
|
| 115 |
-
('categorical', Pipeline([
|
| 116 |
-
('imputer', SimpleImputer(strategy='most_frequent')),
|
| 117 |
-
('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
|
| 118 |
-
]), categorical_features)
|
| 119 |
-
]
|
| 120 |
)
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
('
|
| 125 |
-
(
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
return train_df, test_df
|
| 154 |
-
|
| 155 |
-
# Global preprocessor
|
| 156 |
-
preprocessor = StandalonePreprocessor()
|
| 157 |
-
|
| 158 |
-
def process_dataset(file, prompt):
|
| 159 |
-
if file is None:
|
| 160 |
-
return "Please upload a dataset", None, None, None, None, ""
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
file_content = file.read()
|
| 165 |
-
df = pd.read_csv(io.BytesIO(file_content))
|
| 166 |
|
| 167 |
-
#
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
-
#
|
| 171 |
-
|
| 172 |
|
| 173 |
-
#
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
-
#
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
#
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
],
|
| 213 |
-
outputs=[
|
| 214 |
-
gr.Markdown(label="Results Summary"),
|
| 215 |
-
gr.File(label="Processed Dataset"),
|
| 216 |
-
gr.File(label="Training Set"),
|
| 217 |
-
gr.File(label="Test Set"),
|
| 218 |
-
gr.Dataframe(label="Dataset Preview"),
|
| 219 |
-
gr.Textbox(label="Status")
|
| 220 |
-
],
|
| 221 |
-
title="π€ PromptPrepML",
|
| 222 |
-
description="AI-Powered Machine Learning Data Preprocessing Assistant",
|
| 223 |
-
allow_flagging="never"
|
| 224 |
-
)
|
| 225 |
|
| 226 |
-
# Launch the app
|
| 227 |
if __name__ == "__main__":
|
| 228 |
-
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import requests
|
| 3 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import io
|
| 5 |
+
import os
|
| 6 |
+
from PIL import Image
|
| 7 |
+
import time
|
| 8 |
|
| 9 |
+
# Page-level Streamlit settings: browser-tab title and icon, wide layout,
# and a sidebar that starts expanded.
st.set_page_config(
    initial_sidebar_state="expanded",
    layout="wide",
    page_icon="π€",
    page_title="PromptPrepML - Auto ML Data Preprocessing",
)

# Stylesheet for the CSS classes referenced by the HTML snippets rendered
# later in this file (main-header, step-header, success/info/warning boxes).
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
color: #1f2937;
text-align: center;
margin-bottom: 2rem;
}
.step-header {
font-size: 1.5rem;
font-weight: 600;
color: #374151;
margin: 1rem 0;
}
.success-box {
background-color: #f0fdf4;
border: 1px solid #bbf7d0;
border-radius: 0.5rem;
padding: 1rem;
margin: 1rem 0;
}
.info-box {
background-color: #eff6ff;
border: 1px solid #bfdbfe;
border-radius: 0.5rem;
padding: 1rem;
margin: 1rem 0;
}
.warning-box {
background-color: #fffbeb;
border: 1px solid #fed7aa;
border-radius: 0.5rem;
padding: 1rem;
margin: 1rem 0;
}
</style>
"""

# Inject the stylesheet; raw HTML must be explicitly allowed.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
|
| 56 |
+
|
| 57 |
+
# Base URL of the backend API (no trailing slash).
# Defaults to the local development server; set the PROMPTPREPML_API_BASE
# environment variable to point the UI at a backend hosted elsewhere.
API_BASE = (os.environ.get("PROMPTPREPML_API_BASE") or "http://localhost:8000").rstrip("/")
|
| 59 |
+
|
| 60 |
+
def check_backend_health():
    """Return True when the backend's ``/health`` endpoint answers HTTP 200.

    Returns:
        bool: ``True`` on a 200 response; ``False`` on any other status or
        when the request itself fails (connection refused, timeout, ...).
    """
    try:
        response = requests.get(f"{API_BASE}/health", timeout=5)
    except requests.RequestException:
        # Only request/transport failures mean "backend unavailable".
        # A bare ``except:`` here would also hide KeyboardInterrupt,
        # SystemExit and genuine programming errors.
        return False
    return response.status_code == 200
|
| 67 |
+
|
| 68 |
+
def upload_dataset(uploaded_file):
    """Send the uploaded file to the backend's upload endpoint.

    Args:
        uploaded_file: File-like object; posted as the multipart field
            ``file``.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``. Every failure
        path shows an ``st.error`` message before returning ``None``.
    """
    try:
        response = requests.post(
            f"{API_BASE}/api/upload-dataset",
            files={'file': uploaded_file},
            timeout=30,
        )
        if response.status_code != 200:
            # Surface the server's explanation instead of failing silently,
            # mirroring what process_pipeline does for its endpoint.
            st.error(f"Upload error: {response.text}")
            return None
        return response.json()
    except Exception as e:
        # UI boundary: report the problem to the user rather than crash.
        st.error(f"Upload error: {str(e)}")
        return None
|
| 80 |
+
|
| 81 |
+
def process_pipeline(uploaded_file, prompt):
    """Submit the dataset and the user's prompt to the processing endpoint.

    Args:
        uploaded_file: File-like object; posted as the multipart field
            ``file``.
        prompt: Preprocessing instructions; posted as the form field
            ``prompt``.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` after an
        ``st.error`` message has been shown.
    """
    try:
        # NOTE(review): this path has no "/api" prefix, unlike the upload and
        # download endpoints used elsewhere in this file; confirm against the
        # backend routes.
        reply = requests.post(
            f"{API_BASE}/process-pipeline",
            files={'file': uploaded_file},
            data={'prompt': prompt},
            timeout=120,
        )
        if reply.status_code != 200:
            st.error(f"Processing error: {reply.text}")
            return None
        return reply.json()
    except Exception as exc:
        st.error(f"Processing error: {exc}")
        return None
|
| 95 |
+
|
| 96 |
+
def download_file(filename):
    """Build the backend download URL for *filename*.

    Nothing is fetched here; the function only formats the link.

    Args:
        filename: Name of the artifact, e.g. ``"train.csv"``.

    Returns:
        str: ``{API_BASE}/api/download/<filename>`` with the file name
        percent-encoded so spaces or reserved characters cannot break the URL.
    """
    from urllib.parse import quote

    # quote() leaves plain names such as "train.csv" unchanged.
    return f"{API_BASE}/api/download/{quote(filename)}"
|
| 99 |
+
|
| 100 |
+
def main():
    """Render the PromptPrepML page for the current wizard step.

    Draws the header, stops early with start-up instructions when the
    backend health check fails, then initialises session state, draws the
    sidebar progress list, renders the UI for ``st.session_state.step``
    ('upload', 'prompt' or 'results') and finally the footer.

    NOTE(review): several UI strings contain mis-decoded emoji (for example
    'π€'). They are reproduced as found. The exception is three literals
    that the mis-decoding had split across two lines; those are rejoined
    with the check-mark emoji that the surviving bytes appear to encode.
    """
    # Main header
    st.markdown('<h1 class="main-header">π€ PromptPrepML</h1>', unsafe_allow_html=True)
    st.markdown('<p style="text-align: center; color: #6b7280; font-size: 1.1rem;">Convert natural language prompts into ML-ready datasets</p>', unsafe_allow_html=True)

    # Nothing below works without the backend, so stop with instructions.
    if not check_backend_health():
        st.error("β Backend is not running! Please start the backend first:")
        st.code("""
cd promptprepml/backend
venv\\Scripts\\activate
python app/main.py
""")
        return

    st.success("✅ Backend is connected and ready!")

    # Sidebar for navigation
    st.sidebar.title("π Processing Steps")
    _init_session_state()
    _render_sidebar_progress()

    # Main content based on current step
    current = st.session_state.step
    if current == 'upload':
        _render_upload_step()
    elif current == 'prompt':
        _render_prompt_step()
    elif current == 'results':
        _render_results_step()

    # Footer
    st.markdown("---")
    st.markdown("""
<div style="text-align: center; color: #6b7280; margin-top: 2rem;">
<p><strong>PromptPrepML</strong> - Automated ML Data Preprocessing</p>
<p><small>Convert natural language prompts into ML-ready datasets</small></p>
</div>
""", unsafe_allow_html=True)


def _init_session_state():
    """Create the session-state entries the wizard relies on, if missing."""
    defaults = {
        'step': 'upload',
        'dataset_info': None,
        'processing_results': None,
        'uploaded_file': None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _render_sidebar_progress():
    """List the wizard steps in the sidebar, marking those reached so far."""
    steps = ['π€ Upload Dataset', 'π¬ Enter Prompt', 'β‘ Processing', 'π Results']
    step_mapping = {
        'upload': 0,
        'prompt': 1,
        'processing': 2,
        'results': 3
    }
    current_step_idx = step_mapping.get(st.session_state.step, 0)

    for i, step in enumerate(steps):
        if i <= current_step_idx:
            st.sidebar.markdown(f"✅ {step}")
        else:
            st.sidebar.markdown(f"β³ {step}")


def _render_upload_step():
    """Step 1: pick a CSV, preview it, and send it to the backend."""
    st.markdown('<h2 class="step-header">π€ Upload Your Dataset</h2>', unsafe_allow_html=True)

    # File upload
    uploaded_file = st.file_uploader(
        "Choose a CSV file",
        type=['csv'],
        help="Upload your dataset in CSV format. Maximum file size: 200MB"
    )
    if uploaded_file is None:
        return

    # Display file info
    st.markdown('<div class="info-box">', unsafe_allow_html=True)
    st.write(f"**Filename:** {uploaded_file.name}")
    st.write(f"**Size:** {uploaded_file.size / 1024 / 1024:.2f} MB")
    st.markdown('</div>', unsafe_allow_html=True)

    # Preview data; a file that cannot be parsed is reported and not offered
    # for upload.
    try:
        df = pd.read_csv(uploaded_file)
        st.write("**Data Preview:**")
        st.dataframe(df.head(), use_container_width=True)
        st.write(f"**Shape:** {df.shape[0]} rows Γ {df.shape[1]} columns")
    except Exception as e:
        st.error(f"Error reading CSV file: {str(e)}")
        return

    # Upload button
    if st.button("π Upload Dataset", type="primary"):
        with st.spinner("Uploading dataset..."):
            # read_csv consumed the stream; rewind before re-sending it.
            uploaded_file.seek(0)
            result = upload_dataset(uploaded_file)

        if result:
            st.session_state.dataset_info = result
            st.session_state.uploaded_file = uploaded_file  # Store the file
            st.session_state.step = 'prompt'
            st.rerun()
        else:
            st.error("Upload failed. Please try again.")


def _use_example_prompt(example):
    """Button callback: put *example* into the prompt text area's state."""
    st.session_state.prompt_input = example


def _render_prompt_step():
    """Step 2: show dataset stats, collect the prompt, start processing."""
    st.markdown('<h2 class="step-header">π¬ Describe Your Preprocessing Needs</h2>', unsafe_allow_html=True)

    # Show dataset info
    if st.session_state.dataset_info:
        info = st.session_state.dataset_info['dataset_info']
        st.markdown('<div class="info-box">', unsafe_allow_html=True)
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Rows", info['shape'][0])
        with col2:
            st.metric("Columns", info['shape'][1])
        with col3:
            st.metric("Missing Values", sum(info['missing_values'].values()))
        with col4:
            st.metric("Duplicates", info['duplicates'])
        st.markdown('</div>', unsafe_allow_html=True)

    # Show file info
    if st.session_state.uploaded_file:
        st.info(f"π File loaded: {st.session_state.uploaded_file.name} ({st.session_state.uploaded_file.size / 1024 / 1024:.2f} MB)")

    # Prompt input
    st.write("**Enter your preprocessing instructions in natural language:**")

    # Example prompts
    example_prompts = [
        "Prepare this dataset for fraud classification, handle missing values, encode categorical variables, remove outliers, and scale numeric features.",
        "Clean this dataset for customer churn prediction, fill missing values with median, one-hot encode categories, and apply standard scaling.",
        "Preprocess data for regression analysis, handle null values, remove duplicates, and normalize numerical features.",
        "Get this dataset ready for machine learning, handle missing data, encode categorical variables, and scale features.",
        "Analyze this customer dataset and prepare it for machine learning. Remove duplicate rows and unnecessary identifier columns. Handle missing values appropriately. Encode categorical variables such as country, city, and company. Extract useful features from the subscription date. Scale any numerical features if present. Remove low-variance features and prepare the dataset for clustering or classification."
    ]

    # Prompt text area. It is keyed so the example buttons can fill it
    # through session state.
    prompt = st.text_area(
        "Your prompt:",
        height=120,
        placeholder="e.g., Handle missing values, encode categorical variables, remove outliers, and scale numeric features",
        help="Describe how you want to preprocess your dataset in plain English",
        key="prompt_input"
    )

    # Example prompts section
    with st.expander("π‘ Example Prompts"):
        for i, example in enumerate(example_prompts, 1):
            # Assigning to the local ``prompt`` and rerunning loses the value,
            # so the example is written to the widget's state in a callback,
            # which runs before the script is re-executed.
            st.button(
                f"Use Example {i}",
                key=f"example_{i}",
                on_click=_use_example_prompt,
                args=(example,),
            )
            st.write(f"{i}. {example}")

    # Supported operations info
    with st.expander("π§ Supported Operations"):
        st.write("""
**Missing Values:**
- Mean/median/mode imputation
- Constant value filling
- Row deletion

**Categorical Encoding:**
- One-hot encoding
- Label encoding

**Feature Scaling:**
- Standard scaling (Z-score)
- Min-max scaling
- Robust scaling

**Outlier Detection:**
- Isolation Forest
- IQR method
- Z-score method

**Feature Engineering:**
- Variance threshold selection
- Correlation filtering
- Interaction features
""")

    # Process button (only offered once a prompt has been entered)
    if prompt and st.button("π Process Dataset", type="primary"):
        if st.session_state.uploaded_file:
            with st.spinner("Processing dataset... This may take a few minutes."):
                # The stream was already read during upload; rewind it.
                st.session_state.uploaded_file.seek(0)
                result = process_pipeline(st.session_state.uploaded_file, prompt)

            if result:
                st.session_state.processing_results = result
                st.session_state.step = 'results'
                st.rerun()
        else:
            st.warning("No file found. Please upload your dataset again.")


def _render_results_step():
    """Step 3: summarise the run, list applied steps, offer downloads."""
    import html

    st.markdown('<h2 class="step-header">π Processing Complete!</h2>', unsafe_allow_html=True)

    if not st.session_state.processing_results:
        st.error("No processing results available. Please start over.")
        return

    results = st.session_state.processing_results

    # Success message
    st.markdown('<div class="success-box">', unsafe_allow_html=True)
    st.success("✅ Your dataset has been successfully preprocessed and is ready for machine learning!")
    st.markdown('</div>', unsafe_allow_html=True)

    # Dataset information
    st.write("### π Dataset Information")
    info = results['dataset_info']['basic_info']
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Original Shape", f"{info['shape'][0]} Γ {info['shape'][1]}")
    with col2:
        st.metric("Numeric Columns", len(info['numeric_columns']))
    with col3:
        st.metric("Categorical Columns", len(info['categorical_columns']))
    with col4:
        missing_total = sum(results['dataset_info']['missing_values']['counts'].values())
        st.metric("Missing Values", missing_total)

    # Applied preprocessing steps
    st.write("### π§ Applied Preprocessing Steps")
    for i, step in enumerate(results['preprocessing_steps'], 1):
        # The text comes from the backend response and is placed into raw
        # HTML, so escape it to keep stray markup from being rendered.
        description = html.escape(str(step['description']))
        method = html.escape(str(step.get('method', 'N/A')))
        st.markdown(f"""
<div style="padding: 1rem; margin: 0.5rem 0; background-color: #f8fafc; border-left: 4px solid #3b82f6; border-radius: 0.25rem;">
<strong>Step {i}:</strong> {description}<br>
<small>Method: {method}</small>
</div>
""", unsafe_allow_html=True)

    # Download files
    st.write("### π Download Files")

    files_to_download = [
        ("processed_dataset.csv", "π Processed Dataset", "Fully preprocessed dataset ready for ML"),
        ("train.csv", "π Training Set", "80% of data for model training"),
        ("test.csv", "π§ͺ Test Set", "20% of data for model testing"),
        ("pipeline.pkl", "βοΈ Pipeline", "Scikit-learn pipeline for reuse"),
        ("eda_report.html", "π EDA Report", "Exploratory Data Analysis report")
    ]

    # Alternate the download cards between the two columns.
    col1, col2 = st.columns(2)
    for i, (filename, title, description) in enumerate(files_to_download):
        with col1 if i % 2 == 0 else col2:
            st.markdown(f"""
<div style="padding: 1rem; margin: 0.5rem 0; border: 1px solid #e5e7eb; border-radius: 0.5rem;">
<h4>{title}</h4>
<p><small>{description}</small></p>
<a href="{download_file(filename)}" download="{filename}" style="text-decoration: none;">
<button style="background-color: #3b82f6; color: white; padding: 0.5rem 1rem; border: none; border-radius: 0.25rem; cursor: pointer;">
π₯ Download {filename}
</button>
</a>
</div>
""", unsafe_allow_html=True)

    # Quick actions
    st.write("### β‘ Quick Actions")
    col1, col2, col3 = st.columns(3)

    with col1:
        if st.button("π View EDA Report", type="secondary"):
            st.info(f"EDA Report will be available at: {download_file('eda_report.html')}")

    with col2:
        if st.button("βοΈ Download Pipeline", type="secondary"):
            st.info(f"Pipeline file: {download_file('pipeline.pkl')}")

    with col3:
        if st.button("π Process Another Dataset", type="primary"):
            # Reset session state. Iterate over a snapshot of the keys so
            # deleting entries cannot disturb the iteration.
            for key in list(st.session_state.keys()):
                del st.session_state[key]
            st.session_state.step = 'upload'
            st.rerun()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
|
|
|
| 375 |
if __name__ == "__main__":
|
| 376 |
+
main()
|
requirements.txt
CHANGED
|
Binary files a/requirements.txt and b/requirements.txt differ
|
|
|