deepnet committed on
Commit
95a751f
·
verified ·
1 Parent(s): 7e1259e

Create modeleval.py

Browse files
Files changed (1) hide show
  1. modeleval.py +173 -0
modeleval.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import torch
4
+ from torch.utils.data import Dataset, DataLoader
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
6
+ from tqdm import tqdm
7
+ import pandas as pd
8
+ import torch.nn.functional as F
9
+
10
class CSVDataset(Dataset):
    """Pack rows of a CSV ``Text`` column into fixed-length token sequences.

    Each sample concatenates ``rows_per_sample`` consecutive rows. Every row is
    capped to ``CAP_SAMPLE_LEN`` characters, tokenized on its own without
    automatic special tokens, and followed by one EOS token. The packed
    sequence is then truncated or padded to exactly ``seq_length`` tokens, and
    its last token is always EOS.
    """

    def __init__(self, filepath, tokenizer, seq_length, rows_per_sample, cap_sample_len=17500):
        """Load the CSV and make sure the tokenizer has EOS and PAD tokens.

        Args:
            filepath: Path (or file-like object) handed to ``pandas.read_csv``;
                the file must contain a ``Text`` column.
            tokenizer: Callable tokenizer exposing ``eos_token``, ``pad_token``,
                ``eos_token_id``, ``pad_token_id`` and ``add_special_tokens``.
            seq_length: Exact number of token ids in every returned sample.
            rows_per_sample: Number of CSV rows packed into one sample.
            cap_sample_len: Maximum number of characters kept per row before
                tokenization (the original script used 17500, with a note
                suggesting 15000 for Phi3).

        Raises:
            ValueError: If ``rows_per_sample`` is smaller than 1.
        """
        if rows_per_sample < 1:
            raise ValueError("rows_per_sample must be at least 1")

        self.data = pd.read_csv(filepath)
        # Empty cells are read as NaN (a float); normalise every row to str so
        # that len() and slicing in __getitem__ cannot fail.
        self.text_data = self.data['Text'].fillna('').astype(str).tolist()
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.rows_per_sample = rows_per_sample  # Number of rows to pack per sample

        # Maximum number of characters per row.
        self.CAP_SAMPLE_LEN = cap_sample_len

        # NOTE: adding tokens grows the tokenizer vocabulary; the caller has to
        # make sure the model's embedding matrix covers the new ids.
        if self.tokenizer.eos_token is None:
            self.tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})

        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self):
        """Return the number of samples (ceil of rows / rows_per_sample)."""
        return (len(self.text_data) + self.rows_per_sample - 1) // self.rows_per_sample

    def _cap_text(self, text):
        """Cut ``text`` to at most CAP_SAMPLE_LEN chars, preferably at a space."""
        if len(text) <= self.CAP_SAMPLE_LEN:
            return text
        cut = text.rfind(' ', 0, self.CAP_SAMPLE_LEN)
        # No usable space boundary (none found, or only at index 0, which
        # would discard the whole row): fall back to a hard cut.
        if cut <= 0:
            cut = self.CAP_SAMPLE_LEN
        return text[:cut]

    def __getitem__(self, idx):
        """Return sample ``idx`` as a 1-D ``torch.long`` tensor of ``seq_length`` ids.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"sample index {idx} out of range")

        start_idx = idx * self.rows_per_sample
        rows = self.text_data[start_idx:start_idx + self.rows_per_sample]
        capped_rows = [self._cap_text(text) for text in rows]

        # Tokenize all rows at once; each row is tokenized independently.
        # add_special_tokens=False avoids automatic BOS/EOS insertion.
        batch_encodings = self.tokenizer(
            capped_rows,
            add_special_tokens=False,
            truncation=True,
            max_length=self.seq_length - 2,  # Reserve space for EOS tokens
            return_tensors=None,
        )

        # Concatenate the rows, terminating each one with EOS.
        packed = []
        for tokens in batch_encodings["input_ids"]:
            packed.extend(tokens)
            packed.append(self.eos_token_id)

        # Force the sequence to exactly seq_length tokens.
        if len(packed) > self.seq_length:
            packed = packed[:self.seq_length]
        elif len(packed) < self.seq_length:
            packed.extend([self.pad_token_id] * (self.seq_length - len(packed)))
        # The final position always holds EOS, even after truncation/padding.
        packed[-1] = self.eos_token_id

        return torch.tensor(packed, dtype=torch.long)
88
+
89
+
90
def evaluate_model(model, dataloader, device):
    """
    Evaluate the model batch by batch and print the loss for each batch.

    Args:
        model: Model called as ``model(input_ids, labels=input_ids)``; the
            returned object must expose a scalar ``loss`` tensor.
        dataloader: Iterable yielding batches of token-id tensors.
        device: Device each batch is moved to before the forward pass.

    Returns:
        float: Unweighted mean of the per-batch losses, or ``nan`` when the
        dataloader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch_idx, input_ids in enumerate(
            tqdm(dataloader, desc="Evaluating Model"), start=1
        ):
            input_ids = input_ids.to(device)

            # The labels are the unmodified inputs: nothing is masked here, so
            # every position in the batch (padding ids included) is scored.
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss.item()
            total_loss += loss
            num_batches = batch_idx

            # Print loss for the current batch
            print(f"Batch {batch_idx} Loss: {loss:.4f}")

    # Count the batches actually seen instead of relying on len(dataloader):
    # this avoids a ZeroDivisionError on empty input and also works for
    # loaders that do not define a length.
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches
111
+
112
+
113
def evaluate_single_model(model_path, tokenizer_path, csv_path, seq_length, batch_size, device, rows_per_sample=50):
    """
    Evaluate a single model on the dataset and print losses for each batch.

    Args:
        model_path: Path or hub id passed to ``AutoModelForCausalLM.from_pretrained``.
        tokenizer_path: Path or hub id passed to ``AutoTokenizer.from_pretrained``.
        csv_path: CSV file with a ``Text`` column.
        seq_length: Number of tokens per packed sample.
        batch_size: Number of samples per evaluation batch.
        device: Device the input batches are moved to.
        rows_per_sample: Number of CSV rows packed into each sample.

    Returns:
        The average loss reported by ``evaluate_model``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    dataset = CSVDataset(csv_path, tokenizer, seq_length, rows_per_sample=rows_per_sample)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=str(device).startswith("cuda"),
        num_workers=4,
    )

    # Load the model in half precision (no quantization config is applied)
    # and let device_map="auto" decide where the weights are placed.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    # CSVDataset adds EOS/PAD tokens when the tokenizer lacks them. Their ids
    # lie past the end of the original vocabulary, so grow the embedding
    # matrix when needed; otherwise the lookup would index out of range.
    embeddings = model.get_input_embeddings()
    if embeddings is not None and len(tokenizer) > embeddings.weight.shape[0]:
        model.resize_token_embeddings(len(tokenizer))

    # Report the dtype the weights were actually loaded in.
    print(model.dtype)

    print("Evaluating Model...")
    avg_loss = evaluate_model(model, dataloader, device)
    print(f"Average Loss: {avg_loss:.4f}")

    return avg_loss
153
+
154
+
155
if __name__ == "__main__":
    # Prefer the GPU when one is visible; otherwise fall back to the CPU.
    default_device = "cuda" if torch.cuda.is_available() else "cpu"

    cli = argparse.ArgumentParser()

    # Mandatory locations of the model, the tokenizer and the evaluation data.
    for flag, description in (
        ("--model_path", "Path to the model."),
        ("--tokenizer_path", "Path to the tokenizer."),
        ("--csv_path", "Path to the CSV file with 'Text' column."),
    ):
        cli.add_argument(flag, type=str, required=True, help=description)

    # Optional tuning knobs with their defaults.
    cli.add_argument("--seq_length", type=int, default=4096, help="Maximum sequence length.")
    cli.add_argument("--batch_size", type=int, default=2, help="Batch size for evaluation.")
    cli.add_argument("--device", type=str, default=default_device, help="Device to use.")

    options = cli.parse_args()

    evaluate_single_model(
        model_path=options.model_path,
        tokenizer_path=options.tokenizer_path,
        csv_path=options.csv_path,
        seq_length=options.seq_length,
        batch_size=options.batch_size,
        device=options.device,
    )