TanU21 committed on
Commit
249a99a
·
verified ·
1 Parent(s): 81a0452

Delete app/services/preprocessing.py

Browse files
Files changed (1) hide show
  1. app/services/preprocessing.py +0 -72
app/services/preprocessing.py DELETED
@@ -1,72 +0,0 @@
1
-
2
- from sklearn.impute import SimpleImputer
3
- import pandas as pd
4
- import numpy as np
5
- import json
6
-
7
def data_quality(df: pd.DataFrame):
    """Drop exact duplicate rows from ``df`` and return it.

    The frame is modified in place (the first occurrence of each duplicated
    row is kept and the original index labels are preserved), and the very
    same object is returned so the call can be chained.
    """
    df.drop_duplicates(keep="first", inplace=True)
    return df
10
-
11
def standardize_data_types(
    df: pd.DataFrame, *, date_threshold: float = 1.0
) -> pd.DataFrame:
    """Convert text columns that really hold numbers or dates to proper dtypes.

    Each text column (object or string dtype) whose non-missing values are all
    ``str`` is examined, in this order:

    1. If every non-missing value parses as a number, the column is converted
       with ``pd.to_numeric``. Numbers are tried first so that values such as
       ``"1.5"`` or ``"-3"`` are never interpreted as dates.
    2. Otherwise, if at least ``date_threshold`` (a fraction in ``[0, 1]``) of
       the non-missing values parse as dates, and at least one does, the
       column is converted with ``pd.to_datetime``; unparseable values become
       ``NaT``.
    3. Otherwise the column is left untouched, so free text is preserved
       instead of being wiped out to ``NaT``.

    Columns holding non-string objects (for example booleans stored as
    ``object``) are skipped. ``df`` is modified in place and returned.

    Args:
        df: Frame to normalise.
        date_threshold: Minimum fraction of non-missing values that must parse
            as dates for a column to be converted. The default of ``1.0``
            converts only columns that are entirely date-like.

    Returns:
        The same ``df`` object, with converted columns.
    """
    for col in df.columns:
        column = df[col]
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            continue

        present = column.dropna()
        # Only pure-string columns are candidates; mixed/boolean object
        # columns have no meaningful numeric or date interpretation here.
        if present.empty or not present.map(lambda v: isinstance(v, str)).all():
            continue

        # Probe on the non-missing values so that NaN/None cannot mask a
        # genuinely non-numeric entry.
        try:
            pd.to_numeric(present, errors="raise")
        except (ValueError, TypeError):
            pass
        else:
            df[col] = pd.to_numeric(column, errors="coerce")
            continue

        try:
            parsed = pd.to_datetime(column, errors="coerce")
        except (ValueError, TypeError, OverflowError) as e:
            print(f"Skipping column {col}: {e}")
            continue

        # Missing inputs map to NaT, so notna() counts only real values that
        # were successfully parsed.
        n_parsed = int(parsed.notna().sum())
        if n_parsed and n_parsed / len(present) >= date_threshold:
            df[col] = parsed

    return df
26
-
27
-
28
def handle_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values: median for numeric columns, mode for categorical.

    Numeric columns get their median; ``object``/``category`` columns get
    their most frequent value (the smallest one on ties, because
    ``Series.mode`` returns its results sorted). Columns that are entirely
    missing have no median/mode and are left as they are instead of raising.
    Columns without missing values are not touched, so their dtype is kept.

    NA counts are printed before and after imputation. ``df`` is modified in
    place and returned.
    """
    print("Before Imputation (NA Counts):")
    print(df.isnull().sum())

    for col in df.select_dtypes(include=['number']).columns:
        series = df[col]
        if not series.isna().any():
            continue
        median = series.median()
        if pd.isna(median):
            # Column is entirely missing: nothing sensible to impute.
            continue
        # A nullable integer column cannot store a fractional median, so
        # widen it to nullable float before filling.
        if pd.api.types.is_integer_dtype(series.dtype) and float(median) != int(median):
            series = series.astype("Float64")
        df[col] = series.fillna(median)

    for col in df.select_dtypes(include=['object', 'category']).columns:
        series = df[col]
        if not series.isna().any():
            continue
        modes = series.mode(dropna=True)
        if modes.empty:
            # Column is entirely missing: no most-frequent value exists.
            continue
        df[col] = series.fillna(modes.iloc[0])

    print("After Imputation (NA Counts):")
    print(df.isnull().sum())

    return df
46
-
47
-
48
def handle_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Cap outliers in every numeric column using the 1.5 * IQR rule.

    For each numeric column the quartiles Q1 and Q3 are computed; values below
    ``Q1 - 1.5 * IQR`` are raised to that bound and values above
    ``Q3 + 1.5 * IQR`` are lowered to that bound. Missing values stay missing.
    Non-numeric columns are untouched. ``df`` is modified in place and
    returned.
    """
    # 'number' already covers every integer and float dtype.
    for col in df.select_dtypes(include=['number']).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # Vectorized capping; unlike an element-wise comparison this also
        # copes with missing values without evaluating their truthiness.
        df[col] = df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    return df
59
-
60
def generate_final_report(df: pd.DataFrame, file_path: str) -> None:
    """Write a plain-text summary of remaining data-quality issues.

    The report lists, for every column, how many missing values it still
    contains, followed by the total number of duplicated rows.

    Args:
        df: Frame to summarise.
        file_path: Destination path; an existing file is overwritten.
    """
    # Build the whole report before touching the file so that a failure while
    # computing statistics does not leave a truncated report behind.
    lines = [
        "FINAL DATA PREPROCESSING REPORT\n",
        "=" * 50 + "\n\n",
    ]
    lines.extend(
        f"{col}: {count} missing values\n"
        for col, count in df.isnull().sum().items()
    )
    lines.append(f"Total Duplicate Rows: {df.duplicated().sum()}\n")
    lines.append("Preprocessing Completed Successfully!\n")

    # Explicit UTF-8 keeps the output identical across platforms and lets
    # non-ASCII column names be written on any locale.
    with open(file_path, "w", encoding="utf-8") as file:
        file.writelines(lines)
69
-
70
def save_cleaned_data(df: pd.DataFrame, file_path: str):
    """Write ``df`` to ``file_path`` as CSV, omitting the index column.

    Returns:
        The ``file_path`` that was written, unchanged.
    """
    df.to_csv(path_or_buf=file_path, index=False)
    return file_path