TanU21 committed on
Commit
d170e1e
·
verified ·
1 Parent(s): c275f5e

Upload 6 files

Browse files
Files changed (6) hide show
  1. app.py +41 -0
  2. app/main.py +40 -0
  3. app/services/preprocessing.py +68 -0
  4. main.py +40 -0
  5. preprocessing.py +78 -0
  6. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
import streamlit as st
import requests
from fastapi import FastAPI
from main import app as fastapi_app

import uvicorn
import threading

FASTAPI_URL = "http://127.0.0.1:8000/preprocess_data/"


# Start FastAPI server in a background thread.
def run_fastapi():
    """Serve the FastAPI backend on 127.0.0.1:8000 (blocking; run in a thread)."""
    uvicorn.run(fastapi_app, host="127.0.0.1", port=8000)


# BUG FIX: Streamlit re-executes this script on every widget interaction, so an
# unguarded Thread(...).start() spawned a new server thread per rerun (each one
# failing to rebind the port). Start it exactly once per session.
if "fastapi_started" not in st.session_state:
    threading.Thread(target=run_fastapi, daemon=True).start()
    st.session_state["fastapi_started"] = True

st.title("📊 Data Preprocessing App")

uploaded_file = st.file_uploader("Upload CSV File", type=["csv"])

if uploaded_file is not None:
    st.write("✅ File uploaded successfully!")

if st.button("🚀 Process Data"):
    if uploaded_file is None:
        st.warning("⚠️ Please upload a CSV file first!")
    else:
        with st.spinner("Processing... ⏳"):
            files = {"upload_file": (uploaded_file.name, uploaded_file, "text/csv")}
            # timeout so a hung backend doesn't freeze the UI forever
            response = requests.post(FASTAPI_URL, files=files, timeout=120)

        if response.status_code == 200:
            st.success("✅ Data processed successfully!")
            # Serve the bytes straight from memory — the original wrote them to
            # cleaned_dataset.csv on disk only to immediately re-read them.
            st.download_button("📥 Download Processed CSV", response.content,
                               "cleaned_dataset.csv", "text/csv")
        else:
            # BUG FIX: response.json()['detail'] raised on non-JSON error bodies
            try:
                detail = response.json()["detail"]
            except (ValueError, KeyError):
                detail = response.text
            st.error(f"❌ Error: {detail}")
app/main.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.responses import FileResponse
from app.services.preprocessing import data_quality, standardize_data_types, handle_missing_data, handle_outliers, generate_final_report, save_cleaned_data
import pandas as pd
import io
import os

app = FastAPI(title="Data Preprocessing")
os.makedirs("output", exist_ok=True)


@app.get("/")
async def root():
    """Landing endpoint so a bare GET confirms the service is up."""
    return {"message": "Welcome to the Data Preprocessing API!"}


@app.post("/preprocess_data/")
async def upload_csv(upload_file: UploadFile = File(...)):
    """Run the cleaning pipeline on an uploaded CSV and stream back the result.

    Pipeline: dedupe -> dtype standardization -> missing-value imputation ->
    IQR outlier clamping; also writes a text report under output/.
    """
    try:
        if not upload_file.filename.endswith('.csv'):
            raise HTTPException(status_code=400, detail="File must be in CSV format!")
        content = await upload_file.read()
        # encoding_errors="replace": tolerate mis-encoded bytes instead of failing
        df = pd.read_csv(io.BytesIO(content), encoding_errors="replace")
        if df.empty:
            raise HTTPException(status_code=400, detail="File is empty, upload the correct file")

        data_quality(df)  # dedupes in place; return value intentionally unused
        df = standardize_data_types(df)
        df = handle_missing_data(df)
        df = handle_outliers(df)

        REPORT_PATH = "output/preprocessing_report.txt"
        generate_final_report(df, REPORT_PATH)

        CLEANED_DATA_PATH = "output/cleaned_dataset.csv"
        save_cleaned_data(df, CLEANED_DATA_PATH)

        return FileResponse(CLEANED_DATA_PATH, media_type="text/csv", filename="cleaned_dataset.csv")

    except HTTPException:
        # BUG FIX: the blanket handler below was re-wrapping our own 400s as
        # "Error processing file: 400: ..."; deliberate HTTP errors pass through.
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Error processing file: {str(e)}")
app/services/preprocessing.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from sklearn.impute import SimpleImputer
3
+ import pandas as pd
4
+ import numpy as np
5
+ import json
6
+
7
def data_quality(df: pd.DataFrame):
    """Remove duplicate rows in place (first occurrence wins) and return the frame."""
    df.drop_duplicates(keep="first", inplace=True)
    return df
10
+
11
def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
    """Best-effort dtype normalization for every column; mutates and returns *df*.

    Per object column: numeric-looking strings become numbers, date-like
    strings become datetimes, JSON list literals become Python lists,
    "TRUE"/"FALSE" become booleans, anything left is coerced to str.
    """
    for col in df.columns:
        # boolean-like columns need no work
        if df[col].isin([True, False]).all():
            continue
        if df[col].dtype != 'object':
            # BUG FIX: the original ran pd.to_datetime over *every* column,
            # silently turning numeric columns into epoch timestamps.
            continue
        # numeric-looking strings (at most one decimal point) -> numbers;
        # 'coerce' replaces the deprecated errors='ignore'
        if df[col].str.replace('.', '', 1).str.isnumeric().all():
            df[col] = pd.to_numeric(df[col], errors='coerce')
            continue
        # date-like strings -> datetime; keep the conversion only when at least
        # one value actually parses (BUG FIX: the original assigned the all-NaT
        # result first, so unparseable text columns were blanked to "NaT" strings)
        try:
            parsed = pd.to_datetime(df[col], errors='coerce')
            if parsed.notna().any():
                df[col] = parsed
                continue
        except Exception:
            pass
        # JSON list literals -> Python lists
        try:
            if df[col].apply(lambda x: isinstance(x, str) and x.startswith("[") and x.endswith("]")).all():
                df[col] = df[col].apply(json.loads)
                continue
        except Exception:
            pass
        # textual booleans -> real booleans
        if df[col].dropna().isin(["TRUE", "FALSE"]).all():
            df[col] = df[col].map({"TRUE": True, "FALSE": False})
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
    # NOTE(review): this blanket fillna("") also blanks NaNs in numeric columns,
    # hiding them from handle_missing_data downstream — confirm that is intended.
    df.fillna("", inplace=True)
    return df
34
+
35
def handle_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values: column median for numeric columns, column mode
    for object (categorical) columns.  Mutates and returns the same frame.

    Uses pandas fillna rather than sklearn's SimpleImputer: the imputer
    returns a float ndarray (silently up-casting int columns) and *drops*
    all-NaN columns, which crashed the write-back with a shape mismatch;
    fillna preserves dtypes and tolerates all-NaN columns.
    """
    for col in df.select_dtypes(include=['number']).columns:
        median = df[col].median()
        if pd.notna(median):  # all-NaN column: nothing sensible to fill with
            df[col] = df[col].fillna(median)
    for col in df.select_dtypes(include=['object']).columns:
        modes = df[col].mode(dropna=True)
        if not modes.empty:  # mode() is empty when the column is all-NaN
            df[col] = df[col].fillna(modes.iloc[0])
    return df
43
+
44
def handle_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Winsorize every numeric column to Tukey's fences.

    Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clamped to the nearest
    fence; NaNs pass through untouched.  Mutates and returns the same frame.
    """
    # 'number' already covers int64/float64; the extra selectors were redundant
    for col in df.select_dtypes(include=['number']).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # vectorized clip matches the original per-element apply/lambda, in C speed
        df[col] = df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    return df
55
+
56
def generate_final_report(df: pd.DataFrame, file_path: str):
    """Write a short plain-text summary of *df* (per-column missing counts
    and total duplicate rows) to *file_path*."""
    lines = ["FINAL DATA PREPROCESSING REPORT\n", "=" * 50 + "\n\n"]
    for col, count in df.isnull().sum().items():
        lines.append(f"{col}: {count} missing values\n")
    lines.append(f"Total Duplicate Rows: {df.duplicated().sum()}\n")
    lines.append("Preprocessing Completed Successfully!\n")
    with open(file_path, "w") as fh:
        fh.writelines(lines)
65
+
66
def save_cleaned_data(df: pd.DataFrame, file_path: str):
    """Serialize *df* to CSV at *file_path* (index omitted) and return the path."""
    df.to_csv(path_or_buf=file_path, index=False)
    return file_path
main.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.responses import FileResponse
from preprocessing import data_quality, standardize_data_types, handle_missing_data, handle_outliers, generate_final_report, save_cleaned_data
import pandas as pd
import io
import os

app = FastAPI(title="Data Preprocessing")
# BUG FIX: the report/CSV below are written into output/, but nothing created
# the directory, so every request died with FileNotFoundError (the app/main.py
# copy of this file already does this).
os.makedirs("output", exist_ok=True)


@app.get("/")
async def root():
    """Landing endpoint so a bare GET confirms the service is up."""
    return {"message": "Welcome to the Data Preprocessing API!"}


@app.post("/preprocess_data/")
async def upload_csv(upload_file: UploadFile = File(...)):
    """Run the cleaning pipeline on an uploaded CSV and stream back the result.

    Pipeline: dedupe -> dtype standardization -> missing-value imputation ->
    IQR outlier clamping; also writes a text report under output/.
    """
    try:
        if not upload_file.filename.endswith('.csv'):
            raise HTTPException(status_code=400, detail="File must be in CSV format!")

        content = await upload_file.read()
        # encoding_errors="replace": tolerate mis-encoded bytes instead of failing
        df = pd.read_csv(io.BytesIO(content), encoding_errors="replace")

        if df.empty:
            raise HTTPException(status_code=400, detail="File is empty, upload the correct file")

        data_quality(df)  # dedupes in place; return value intentionally unused
        df = standardize_data_types(df)
        df = handle_missing_data(df)
        df = handle_outliers(df)

        REPORT_PATH = "output/preprocessing_report.txt"
        generate_final_report(df, REPORT_PATH)

        CLEANED_DATA_PATH = "output/cleaned_dataset.csv"
        save_cleaned_data(df, CLEANED_DATA_PATH)

        return FileResponse(CLEANED_DATA_PATH, media_type="text/csv", filename="cleaned_dataset.csv")

    except HTTPException:
        # BUG FIX: the blanket handler below was re-wrapping our own 400s as
        # "Error processing file: 400: ..."; deliberate HTTP errors pass through.
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Error processing file: {str(e)}")
preprocessing.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from sklearn.impute import SimpleImputer
3
+ import pandas as pd
4
+ import numpy as np
5
+ import json
6
+
7
def data_quality(df: pd.DataFrame):
    """Print missing/duplicate diagnostics, drop duplicate rows in place,
    and return the same frame."""

    def _dupes() -> int:
        # duplicate-row count as a plain int (matches the original int(...) casts)
        return int(df.duplicated().sum())

    print("Missing values before handling:")
    print(df.isnull().sum())
    print("Duplicate rows before handling:")
    print(_dupes())
    df.drop_duplicates(inplace=True)
    print("Duplicate rows after handling:")
    print(_dupes())
    return df
16
+
17
def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
    """Best-effort dtype normalization for every column; mutates and returns *df*.

    Per object column: numeric-looking strings become numbers, date-like
    strings become datetimes, JSON list literals become Python lists,
    "TRUE"/"FALSE" become booleans, anything left is coerced to str.
    """
    for col in df.columns:
        # boolean-like columns need no work
        if df[col].isin([True, False]).all():
            continue
        if df[col].dtype != 'object':
            # BUG FIX: the original ran pd.to_datetime over *every* column,
            # silently turning numeric columns into epoch timestamps.
            continue
        # numeric-looking strings (at most one decimal point) -> numbers;
        # 'coerce' replaces the deprecated errors='ignore'
        if df[col].str.replace('.', '', 1).str.isnumeric().all():
            df[col] = pd.to_numeric(df[col], errors='coerce')
            continue
        # date-like strings -> datetime; keep the conversion only when at least
        # one value actually parses (BUG FIX: the original assigned the all-NaT
        # result first, so unparseable text columns were blanked to "NaT" strings)
        try:
            parsed = pd.to_datetime(df[col], errors='coerce')
            if parsed.notna().any():
                df[col] = parsed
                continue
        except Exception:
            pass
        # JSON list literals -> Python lists
        try:
            if df[col].apply(lambda x: isinstance(x, str) and x.startswith("[") and x.endswith("]")).all():
                df[col] = df[col].apply(json.loads)
                continue
        except Exception:
            pass
        # textual booleans -> real booleans
        if df[col].dropna().isin(["TRUE", "FALSE"]).all():
            df[col] = df[col].map({"TRUE": True, "FALSE": False})
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
    # NOTE(review): this blanket fillna("") also blanks NaNs in numeric columns,
    # hiding them from handle_missing_data downstream — confirm that is intended.
    df.fillna("", inplace=True)
    return df
40
+
41
def handle_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values: column median for numeric columns, column mode
    for object (categorical) columns.  Mutates and returns the same frame.

    Uses pandas fillna rather than sklearn's SimpleImputer: the imputer
    returns a float ndarray (silently up-casting int columns) and *drops*
    all-NaN columns, which crashed the write-back with a shape mismatch;
    fillna preserves dtypes and tolerates all-NaN columns.
    """
    for col in df.select_dtypes(include=['number']).columns:
        median = df[col].median()
        if pd.notna(median):  # all-NaN column: nothing sensible to fill with
            df[col] = df[col].fillna(median)
    for col in df.select_dtypes(include=['object']).columns:
        modes = df[col].mode(dropna=True)
        if not modes.empty:  # mode() is empty when the column is all-NaN
            df[col] = df[col].fillna(modes.iloc[0])
    return df
51
+
52
def handle_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Winsorize every numeric column to Tukey's fences.

    Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clamped to the nearest
    fence; NaNs pass through untouched.  Mutates and returns the same frame.
    """
    # 'number' already covers int64/float64; the extra selectors were redundant
    for col in df.select_dtypes(include=['number']).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # vectorized clip matches the original per-element apply/lambda, in C speed
        df[col] = df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    return df
63
+
64
def generate_final_report(df: pd.DataFrame, file_path: str):
    """Write a plain-text post-cleaning summary (missing counts per column,
    duplicate-row total) to *file_path*."""
    report = ["FINAL DATA PREPROCESSING REPORT\n", "=" * 50 + "\n\n"]
    report.append("Missing Values (After Preprocessing):\n")
    for col, count in df.isnull().sum().items():
        report.append(f"{col}: {count} missing values\n")
    report.append("\nDuplicate Rows (After Preprocessing):\n")
    report.append(f"Total Duplicate Rows: {df.duplicated().sum()}\n\n")
    report.append("Preprocessing Completed Successfully!\n")
    with open(file_path, "w") as fh:
        fh.write("".join(report))
75
+
76
def save_cleaned_data(df: pd.DataFrame, file_path: str):
    """Write the cleaned frame to *file_path* as index-free CSV and return that path."""
    destination = file_path
    df.to_csv(destination, index=False)
    return destination
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+
2
streamlit
fastapi
pandas
scikit-learn
uvicorn
requests
python-multipart