datasciencesage committed on
Commit
f50d086
·
verified ·
1 Parent(s): 6fe5de2

Create preprocess_test.py

Browse files
Files changed (1) hide show
  1. preprocess_test.py +136 -0
preprocess_test.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from sklearn.preprocessing import LabelEncoder,StandardScaler
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from huggingface_hub import hf_hub_download
10
+
11
+
12
+
13
class Preprocess_Test:
    """Prepare an employee DataFrame and score a pre-trained model on it.

    Typical use is ``Preprocess_Test(df).preprocess()`` followed by
    ``.test()``.  ``preprocess`` fills ``X_test`` / ``Y_test`` and ``test``
    downloads the model from the Hugging Face Hub and evaluates it on them.
    """

    # Raw column name -> normalised column name used by the rest of the class.
    _RAW_TO_NORMALISED = {
        "EmpID": "empid",
        "PayZone": "hourly_pay",
        "JobFunctionDescription": "job",
        "LocationCode": "pincode",
        "Current Employee Rating": "rating",
    }
    _NORMALISED_COLUMNS = tuple(_RAW_TO_NORMALISED.values())

    def __init__(self, df):
        """Store the DataFrame to process and pick the torch device.

        Args:
            df: pandas DataFrame to clean.  It is modified in place by the
                cleaning methods.
        """
        self.df = df
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print("INSIDE CLEANING GOT THE DATASET")

    def delete_redundant(self, percent):
        """Drop, in place, every column whose share of nulls exceeds ``percent``.

        Args:
            percent: threshold as a percentage (e.g. ``30`` for 30%).  A column
                is dropped only when its null share is strictly greater.
        """
        n_rows = len(self.df)
        null_counts = self.df.isnull().sum()
        # Compare on the integer scale (count * 100 vs rows * percent) rather
        # than int(rows * (percent / 100)): the float product can land just
        # below the true value (100 * 0.29 == 28.999...) and truncate to a
        # threshold that is one too low.
        sparse_columns = null_counts[null_counts * 100 > n_rows * percent].index.tolist()
        self.df.drop(columns=sparse_columns, inplace=True)

    def delete_unncecessary(self):
        """Reduce the frame to the five known columns and normalise their names.

        If all normalised names (``empid``, ``hourly_pay``, ``job``,
        ``pincode``, ``rating``) are already present the frame is left
        untouched.  Otherwise every column that is not one of the raw source
        columns is dropped and the raw columns are renamed.
        """
        present = set(self.df.columns)
        if all(name in present for name in self._NORMALISED_COLUMNS):
            # NOTE(review): extra columns are kept in this case, matching the
            # original behaviour - confirm that is what the model expects.
            return

        # Drop all unwanted columns in one call instead of one in-place drop
        # per column while looping over the frame's own columns.
        unwanted = [col for col in self.df.columns if col not in self._RAW_TO_NORMALISED]
        self.df.drop(columns=unwanted, inplace=True)
        self.df.rename(columns=self._RAW_TO_NORMALISED, inplace=True)

    def preprocess(self, percent=30):
        """Clean the frame and build the ``X_test`` / ``Y_test`` arrays.

        Steps: drop sparse columns, keep/rename the known columns,
        label-encode every non-numeric column, then standardise the features
        and label-encode the ``empid`` target.

        Args:
            percent: null-percentage threshold forwarded to
                ``delete_redundant``.

        Sets:
            self.label_mappings: ``{column: {original value: encoded int}}``
                for each label-encoded column.
            self.X_test: standardised feature matrix (all columns but ``empid``).
            self.Y_test: label-encoded ``empid`` values.
        """
        self.delete_redundant(percent=percent)
        self.delete_unncecessary()

        # Keep the mappings on the instance so encoded values can be decoded
        # later; previously they were computed and thrown away.
        self.label_mappings = {}
        for col in self.df.select_dtypes(exclude=np.number).columns:
            encoder = LabelEncoder()
            self.df[col] = encoder.fit_transform(self.df[col])
            # LabelEncoder maps each entry of classes_ to its index.
            self.label_mappings[col] = {
                value: code for code, value in enumerate(encoder.classes_)
            }

        features = np.array(self.df.drop("empid", axis=1))
        targets = np.array(self.df["empid"])

        # NOTE(review): the scaler and target encoder are fitted on the test
        # data itself; if the model was trained with its own fitted
        # scaler/encoder those should be reused here - confirm.
        self.X_test = StandardScaler().fit_transform(features)
        self.Y_test = LabelEncoder().fit_transform(targets)

    @staticmethod
    def _extract_model(loaded_data):
        """Return the ``nn.Module`` contained in a loaded checkpoint object.

        Accepts either a pickled module or a dict holding one under the
        ``"model"`` key.

        Raises:
            TypeError: if no module can be found.  A bare ``state_dict``
                cannot be used because this class defines no architecture to
                load it into.
        """
        if isinstance(loaded_data, nn.Module):
            return loaded_data
        if isinstance(loaded_data, dict) and isinstance(loaded_data.get("model"), nn.Module):
            return loaded_data["model"]
        raise TypeError(
            "Checkpoint does not contain a full model: expected an nn.Module or "
            f"a dict with an nn.Module under 'model', got {type(loaded_data).__name__}"
        )

    def test(self):
        """Download the model and evaluate it on ``X_test`` / ``Y_test``.

        ``preprocess`` must have been called first so that ``self.X_test`` and
        ``self.Y_test`` exist.

        Returns:
            Accuracy as a float in ``[0, 1]`` (``0.0`` for an empty test set).

        Raises:
            TypeError: if the downloaded checkpoint does not contain a model.
        """
        print(f"Using device: {self.device}")

        # Download the model from Hugging Face
        repo_id = "Haliyka/coldstartmodel"
        model_file = "model_full.pth"
        local_path = hf_hub_download(repo_id=repo_id, filename=model_file)

        # SECURITY: weights_only=False unpickles arbitrary objects from the
        # downloaded file, which can execute code.  It is required to load a
        # fully pickled model; only use with a repository you trust.
        loaded_data = torch.load(local_path, map_location=self.device, weights_only=False)

        model_loaded = self._extract_model(loaded_data)
        model_loaded.to(self.device)
        model_loaded.eval()  # Set to evaluation mode
        print(f"Model loaded from Hugging Face: {repo_id}")

        X_test_t = torch.tensor(self.X_test, dtype=torch.float32)
        Y_test_t = torch.tensor(self.Y_test, dtype=torch.long)

        # Evaluation
        BATCH_SIZE = 256
        correct = 0
        total = 0

        with torch.no_grad():
            for start in range(0, len(X_test_t), BATCH_SIZE):
                batch_x = X_test_t[start:start + BATCH_SIZE].to(self.device)
                batch_y = Y_test_t[start:start + BATCH_SIZE].to(self.device)

                outputs = model_loaded(batch_x)
                predicted = torch.argmax(outputs, dim=1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()

                # Show a small sample from the first batch only.
                if start == 0:
                    print(f"Test batch - Predicted: {predicted.cpu().numpy()[:10]}")
                    print(f"Test batch - Actual: {batch_y.cpu().numpy()[:10]}")

        # The counts were previously accumulated but never surfaced.
        accuracy = correct / total if total else 0.0
        print(f"Test accuracy: {accuracy:.4f} ({correct}/{total})")
        return accuracy
132
+
133
+
134
+
135
+
136
+