TanU21 committed on
Commit
8e152f0
·
verified ·
1 Parent(s): 249a99a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app/main.py +40 -0
  2. app/services/preprocessing.py +68 -0
app/main.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from fastapi import FastAPI, HTTPException, File, UploadFile
3
+ from fastapi.responses import FileResponse
4
+ from app.services.preprocessing import data_quality, standardize_data_types, handle_missing_data, handle_outliers, generate_final_report, save_cleaned_data
5
+ import pandas as pd
6
+ import io
7
+ import os
8
+
9
# ASGI application object; served by uvicorn (or another ASGI server).
app = FastAPI(title="Data Preprocessing")
# Ensure the directory that holds generated reports/cleaned CSVs exists at import time.
os.makedirs("output", exist_ok=True)
11
+
12
+ @app.get("/")
13
+ async def root():
14
+ return {"message": "Welcome to the Data Preprocessing API!"}
15
+
16
@app.post("/preprocess_data/")
async def upload_csv(upload_file: UploadFile = File(...)):
    """Accept a CSV upload, run the preprocessing pipeline, and return the cleaned CSV.

    Pipeline: de-duplicate -> standardize dtypes -> impute missing values ->
    clip outliers; a text report and the cleaned dataset are written to output/.

    Raises:
        HTTPException(400): non-CSV upload, empty file, or any processing failure.
    """
    # Extension check is case-insensitive so "DATA.CSV" is accepted too.
    if not upload_file.filename.lower().endswith(".csv"):
        raise HTTPException(status_code=400, detail="File must be in CSV format!")
    try:
        content = await upload_file.read()
        df = pd.read_csv(io.BytesIO(content), encoding_errors="replace")
        if df.empty:
            raise HTTPException(status_code=400, detail="File is empty, upload the correct file")

        data_quality(df)  # in-place de-duplication
        df = standardize_data_types(df)
        df = handle_missing_data(df)
        df = handle_outliers(df)

        REPORT_PATH = "output/preprocessing_report.txt"
        generate_final_report(df, REPORT_PATH)

        CLEANED_DATA_PATH = "output/cleaned_dataset.csv"
        save_cleaned_data(df, CLEANED_DATA_PATH)

        return FileResponse(CLEANED_DATA_PATH, media_type="text/csv", filename="cleaned_dataset.csv")

    except HTTPException:
        # Bug fix: the blanket handler below used to catch our own HTTPExceptions
        # and re-wrap them as "Error processing file: 400: ...". Re-raise untouched.
        raise
    except Exception as e:
        # Anything else (parse errors, pipeline failures) becomes a 400 with context.
        raise HTTPException(status_code=400, detail=f"Error processing file: {str(e)}")
app/services/preprocessing.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from sklearn.impute import SimpleImputer
3
+ import pandas as pd
4
+ import numpy as np
5
+ import json
6
+
7
def data_quality(df: pd.DataFrame) -> pd.DataFrame:
    """Remove duplicate rows from *df* IN PLACE.

    NOTE: callers (e.g. the upload endpoint) rely on the in-place mutation and
    ignore the return value; the same (mutated) frame is returned so the
    function can also be used in assignment style.
    """
    df.drop_duplicates(inplace=True)
    return df
10
+
11
def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
    """Infer and apply a sensible dtype for each column of *df*.

    Object columns are converted, in priority order, to: numeric, datetime,
    JSON lists ("[...]" strings), booleans ("TRUE"/"FALSE"), else plain str.
    Boolean and non-object columns are left untouched.

    Bug fixes vs. the original:
    - pd.to_datetime(errors='coerce') was applied to EVERY column after the
      numeric pass, silently turning numeric columns into epoch timestamps;
      datetime inference now only runs on still-unconverted object columns.
    - str.replace('.', '', 1) treated '.' as a regex (matches any char); the
      numeric check now simply coerces and verifies nothing was lost.
    - removed deprecated pd.to_numeric(errors='ignore').
    - removed trailing df.fillna(""), which replaced NaN with empty strings
      and prevented handle_missing_data() from imputing anything.
    """
    for col in df.columns:
        series = df[col]
        # Leave genuinely boolean columns alone.
        if series.isin([True, False]).all():
            continue
        if series.dtype != 'object':
            continue
        # 1) Numeric strings -> numbers (accept only if no value is lost).
        as_num = pd.to_numeric(series, errors='coerce')
        if as_num.notna().all():
            df[col] = as_num
            continue
        # 2) Datetime strings -> datetimes (object columns only).
        as_dt = pd.to_datetime(series, errors='coerce')
        if as_dt.notna().all():
            df[col] = as_dt
            continue
        # 3) JSON-list strings -> Python lists.
        try:
            if series.apply(lambda x: isinstance(x, str) and x.startswith("[") and x.endswith("]")).all():
                df[col] = series.apply(json.loads)
                continue
        except Exception:
            pass  # malformed JSON: fall through to the remaining conversions
        # 4) "TRUE"/"FALSE" strings -> booleans.
        if series.dropna().isin(["TRUE", "FALSE"]).all():
            df[col] = series.map({"TRUE": True, "FALSE": False})
            continue
        # 5) Everything else: normalize to str.
        df[col] = series.astype(str)
    return df
34
+
35
def handle_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values: column median for numeric columns, column mode
    for object columns. Mutates and returns *df*.

    Uses pandas fillna instead of sklearn's SimpleImputer: fit_transform
    silently DROPS all-NaN columns, so reassigning its output back to the
    selected columns raised a shape-mismatch error on such data; fillna
    always preserves the frame's shape (an all-NaN column is simply left as-is).
    """
    for col in df.select_dtypes(include='number').columns:
        df[col] = df[col].fillna(df[col].median())
    for col in df.select_dtypes(include='object').columns:
        mode = df[col].mode(dropna=True)
        if not mode.empty:
            # mode() returns ties sorted ascending; iloc[0] matches
            # SimpleImputer's "most_frequent, smallest on tie" behavior.
            df[col] = df[col].fillna(mode.iloc[0])
    return df
43
+
44
def handle_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Clip each numeric column to its Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Mutates and returns *df*. Values outside the fences are winsorized to the
    nearest fence rather than dropped, so the row count is preserved.
    """
    # 'number' already covers int64/float64; the original's extra dtype names were redundant.
    for col in df.select_dtypes(include='number').columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # Series.clip is vectorized; the original per-element apply(lambda ...)
        # made an O(n) Python-level call per value for the same result.
        df[col] = df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    return df
55
+
56
def generate_final_report(df: pd.DataFrame, file_path: str):
    """Write a plain-text summary (per-column missing counts, duplicate count)
    of *df* to *file_path*, overwriting any existing file."""
    lines = [
        "FINAL DATA PREPROCESSING REPORT\n",
        "=" * 50 + "\n\n",
    ]
    for col, count in df.isnull().sum().items():
        lines.append(f"{col}: {count} missing values\n")
    lines.append(f"Total Duplicate Rows: {df.duplicated().sum()}\n")
    lines.append("Preprocessing Completed Successfully!\n")
    with open(file_path, "w") as report:
        report.writelines(lines)
65
+
66
def save_cleaned_data(df: pd.DataFrame, file_path: str):
    """Persist *df* as CSV at *file_path* (no index column) and return the path."""
    df.to_csv(path_or_buf=file_path, index=False)
    return file_path