AlainDeLong committed on
Commit
db78b44
·
verified ·
1 Parent(s): 09c22f3

Update src/predict.py

Browse files
Files changed (1) hide show
  1. src/predict.py +395 -473
src/predict.py CHANGED
@@ -1,473 +1,395 @@
1
- # src/predict.py
2
-
3
- import os # To help build file paths correctly
4
- import torch # PyTorch library, for tensors and model operations
5
- from transformers import (
6
- AutoModelForSequenceClassification,
7
- AutoTokenizer,
8
- ) # Hugging Face stuff for models
9
-
10
-
11
- # --- Configuration ---
12
- # This is where our fine-tuned model and tokenizer files are stored
13
- # Assuming 'fine_tuned_model' directory is inside 'src/' and next to this predict.py file
14
- _SCRIPT_DIR = os.path.dirname(
15
- os.path.abspath(__file__)
16
- ) # Gets the directory where this script is
17
- MODEL_PATH = os.path.join(
18
- _SCRIPT_DIR, "fine_tuned_model"
19
- ) # User confirmed this variable name and directory
20
-
21
- print(f"DEBUG (predict.py): Model path set to: {MODEL_PATH}") # For checking the path
22
-
23
- # --- Device Setup ---
24
- # Check if a GPU is available, otherwise use CPU
25
- # Using GPU makes predictions much faster!
26
- if torch.cuda.is_available():
27
- device = torch.device("cuda")
28
- # Trying to get the name of the GPU, just for information
29
- try:
30
- gpu_name = torch.cuda.get_device_name(0)
31
- print(f"INFO (predict.py): GPU is available ({gpu_name}), using CUDA.")
32
- except Exception as e:
33
- print(
34
- f"INFO (predict.py): GPU is available, using CUDA. (Could not get GPU name: {e})"
35
- )
36
- else:
37
- device = torch.device("cpu")
38
- print(
39
- "INFO (predict.py): GPU not available, using CPU. Predictions might be slower."
40
- )
41
-
42
- # --- Load Model and Tokenizer ---
43
- # We load these once when the script (or module) is first loaded.
44
- # This is much better than loading them every time we want to predict.
45
- model = None
46
- tokenizer = None
47
- id2label_mapping = {0: "negative", 1: "neutral", 2: "positive"} # Default mapping
48
-
49
- try:
50
- print(f"INFO (predict.py): Loading model from {MODEL_PATH}...")
51
- # Load the pre-trained model for sequence classification
52
- # This should be the PyTorch RoBERTa model we fine-tuned
53
- model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
54
- model.to(device) # Move the model to the GPU (or CPU if no GPU)
55
- model.eval() # Set the model to evaluation mode (important for layers like Dropout)
56
- print("INFO (predict.py): Model loaded successfully and set to evaluation mode.")
57
-
58
- print(f"INFO (predict.py): Loading tokenizer from {MODEL_PATH}...")
59
- # Load the tokenizer that matches the model
60
- tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
61
- print("INFO (predict.py): Tokenizer loaded successfully.")
62
-
63
- # Get the label mapping from the model's configuration
64
- # This was saved during fine-tuning
65
- if hasattr(model.config, "id2label") and model.config.id2label:
66
- id2label_mapping = model.config.id2label
67
- # Convert string keys from config.json to int if necessary
68
- id2label_mapping = {int(k): v for k, v in id2label_mapping.items()}
69
- print(
70
- f"INFO (predict.py): Loaded id2label mapping from model config: {id2label_mapping}"
71
- )
72
- else:
73
- print(
74
- "WARN (predict.py): id2label not found in model config, using default mapping."
75
- )
76
-
77
- except FileNotFoundError:
78
- print(f"--- CRITICAL ERROR (predict.py) ---")
79
- print(f"Model or Tokenizer files NOT FOUND at the specified path: {MODEL_PATH}")
80
- print(
81
- f"Please ensure the '{os.path.basename(MODEL_PATH)}' directory exists at '{_SCRIPT_DIR}' and contains all necessary model files (pytorch_model.bin/model.safetensors, config.json, tokenizer files, etc.)."
82
- )
83
- # Keep model and tokenizer as None, so predict_sentiments can handle it
84
- except Exception as e:
85
- print(f"--- ERROR (predict.py) ---")
86
- print(f"An unexpected error occurred loading model or tokenizer: {e}")
87
- # Keep model and tokenizer as None
88
-
89
-
90
- # --- Preprocessing Function ---
91
- # Same function we used for training data to make sure inputs are consistent
92
- def preprocess_tweet(text):
93
- """Replaces @user mentions and http links with placeholders."""
94
- preprocessed_text = []
95
- if text is None:
96
- return "" # Handle None input
97
- # Split text into parts by space
98
- for t in text.split(" "):
99
- if len(t) > 0: # Avoid processing empty parts from multiple spaces
100
- t = "@user" if t.startswith("@") else t # Replace mentions
101
- t = "http" if t.startswith("http") else t # Replace links
102
- preprocessed_text.append(t)
103
- return " ".join(preprocessed_text) # Put the parts back together
104
-
105
-
106
- # --- Prediction Function (UPDATED to return probabilities) ---
107
- def predict_sentiments(comment_list: list):
108
- """
109
- Predicts sentiments for a list of comment strings.
110
- Returns a list of dictionaries, each containing the predicted label
111
- and the probabilities (scores) for each class.
112
- e.g., [{'label': 'positive', 'scores': {'negative': 0.1, 'neutral': 0.2, 'positive': 0.7}}, ...]
113
- """
114
- # Check if model and tokenizer are ready
115
- if model is None or tokenizer is None:
116
- print(
117
- "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot predict."
118
- )
119
- # Return an error structure
120
- return [{"label": "Error: Model not loaded", "scores": {}}] * len(comment_list)
121
-
122
- if not comment_list: # Handle empty input list
123
- return []
124
-
125
- """
126
- # Preprocess comments first
127
- processed_comments = [preprocess_tweet(comment) for comment in comment_list]
128
-
129
- # Tokenize the batch
130
- print(
131
- f"DEBUG (predict.py): Tokenizing {len(processed_comments)} comments for prediction..."
132
- )
133
- inputs = tokenizer(
134
- processed_comments,
135
- padding=True,
136
- truncation=True,
137
- return_tensors="pt", # PyTorch tensors
138
- max_length=(
139
- tokenizer.model_max_length
140
- if hasattr(tokenizer, "model_max_length") and tokenizer.model_max_length
141
- else 512
142
- ),
143
- )
144
-
145
- # Move inputs to the correct device
146
- inputs = {k: v.to(device) for k, v in inputs.items()}
147
-
148
- results_list = [] # To store the dictionaries
149
- try:
150
- # Perform inference without calculating gradients
151
- with torch.no_grad():
152
- outputs = model(**inputs)
153
- logits = outputs.logits
154
-
155
- # Apply Softmax to convert logits to probabilities
156
- # dim=-1 applies softmax across the last dimension (the classes)
157
- probabilities = torch.softmax(logits, dim=-1)
158
-
159
- # Get the predicted class IDs (index of the highest probability)
160
- predicted_class_ids = torch.argmax(probabilities, dim=-1)
161
-
162
- # Move results to CPU and convert to Python lists/numpy for easier handling
163
- probs_list = (
164
- probabilities.cpu().numpy().tolist()
165
- ) # List of lists of probabilities
166
- ids_list = predicted_class_ids.cpu().numpy().tolist() # List of predicted IDs
167
-
168
- print(
169
- f"DEBUG (predict.py): Probabilities and IDs calculated. Batch size: {len(ids_list)}"
170
- )
171
-
172
- # Format the output: list of dictionaries
173
- for i in range(len(ids_list)):
174
- pred_id = ids_list[i]
175
- # Map predicted ID to label string using the mapping from model config
176
- pred_label = id2label_mapping.get(pred_id, "Unknown")
177
-
178
- # Create the dictionary of scores {label_name: probability}
179
- pred_scores = {
180
- label_name: probs_list[i][label_id]
181
- for label_id, label_name in id2label_mapping.items()
182
- # Ensure index is within bounds, just in case
183
- if 0 <= label_id < probabilities.shape[-1]
184
- }
185
-
186
- # Append the result for this comment
187
- results_list.append({"label": pred_label, "scores": pred_scores})
188
-
189
- except Exception as e:
190
- print(f"--- ERROR (predict.py - predict_sentiments) ---")
191
- print(f"Error during sentiment prediction inference or formatting: {e}")
192
- import traceback
193
-
194
- traceback.print_exc() # Print full traceback for debugging
195
- # Return error structure for each comment
196
- results_list = [
197
- {"label": "Error: Prediction failed", "scores": {}} for _ in comment_list
198
- ]
199
-
200
- return results_list # Return the list of dictionaries
201
- """
202
-
203
- inference_batch_size = 64 # You can adjust this number based on performance/memory
204
- print(
205
- f"INFO (predict.py): Predicting sentiments for {len(comment_list)} comments in batches of {inference_batch_size}..."
206
- )
207
-
208
- all_results_list = [] # We'll collect results for all batches here
209
-
210
- # --- Loop through the comment list in batches ---
211
- try:
212
- total_comments = len(comment_list)
213
- # This loop goes from 0 to total_comments, jumping by inference_batch_size each time
214
- for i in range(0, total_comments, inference_batch_size):
215
- # Get the current slice of comments for this batch
216
- batch_comments = comment_list[i : i + inference_batch_size]
217
-
218
- # Just printing progress for long lists
219
- current_batch_num = i // inference_batch_size + 1
220
- total_batches = (
221
- total_comments + inference_batch_size - 1
222
- ) // inference_batch_size
223
- print(
224
- f"DEBUG (predict.py): Processing batch {current_batch_num}/{total_batches}..."
225
- )
226
-
227
- # --- Process ONLY the current batch ---
228
- # 1. Preprocess this specific batch
229
- processed_batch = [preprocess_tweet(comment) for comment in batch_comments]
230
-
231
- # 2. Tokenize this batch
232
- # Tokenizer handles padding within this smaller batch
233
- inputs = tokenizer(
234
- processed_batch,
235
- padding=True,
236
- truncation=True,
237
- return_tensors="pt",
238
- max_length=(
239
- tokenizer.model_max_length
240
- if hasattr(tokenizer, "model_max_length")
241
- and tokenizer.model_max_length
242
- else 512
243
- ),
244
- )
245
-
246
- # 3. Move this batch's inputs to the device (GPU/CPU)
247
- inputs = {k: v.to(device) for k, v in inputs.items()}
248
-
249
- # 4. Make prediction for this batch - no need for gradients
250
- with torch.no_grad():
251
- outputs = model(**inputs)
252
- logits = outputs.logits # Raw scores from the model for this batch
253
-
254
- # 5. Calculate probabilities and get predicted class IDs for this batch
255
- probabilities_batch = torch.softmax(logits, dim=-1)
256
- predicted_class_ids_batch = torch.argmax(probabilities_batch, dim=-1)
257
-
258
- # 6. Move results back to CPU, convert to lists for easier looping
259
- probs_list_batch = probabilities_batch.cpu().numpy().tolist()
260
- ids_list_batch = predicted_class_ids_batch.cpu().numpy().tolist()
261
-
262
- # 7. Format results for each comment in THIS batch
263
- batch_results = []
264
- for j in range(len(ids_list_batch)):
265
- pred_id = ids_list_batch[j]
266
- pred_label = id2label_mapping.get(
267
- pred_id, "Unknown"
268
- ) # Map ID to label name
269
- # Create the scores dictionary for this comment
270
- pred_scores = {
271
- label_name: probs_list_batch[j][label_id]
272
- for label_id, label_name in id2label_mapping.items()
273
- if 0
274
- <= label_id
275
- < probabilities_batch.shape[-1] # Safety check for index
276
- }
277
- # Add the result for this comment
278
- batch_results.append({"label": pred_label, "scores": pred_scores})
279
-
280
- # Add the results from this completed batch to our main list
281
- all_results_list.extend(batch_results)
282
- # --- Finished processing current batch ---
283
-
284
- print(
285
- f"INFO (predict.py): Finished processing all {len(all_results_list)} comments."
286
- )
287
-
288
- except Exception as e:
289
- # Catch errors that might happen during the loop
290
- print(f"--- ERROR (predict.py - predict_sentiments loop) ---")
291
- print(
292
- f"An error occurred during batch prediction (around comment index {i}): {e}"
293
- )
294
- import traceback
295
-
296
- traceback.print_exc() # Print full error details to console
297
- # Try to return results for processed batches + error messages for the rest
298
- num_processed = len(all_results_list)
299
- num_remaining = len(comment_list) - num_processed
300
- # Add error indicators for comments that couldn't be processed
301
- all_results_list.extend(
302
- [{"label": "Error: Batch failed", "scores": {}}] * num_remaining
303
- )
304
-
305
- # Return the list containing results for all comments
306
- return all_results_list
307
-
308
-
309
- # --- Main block for testing this script directly (UPDATED to show scores) ---
310
- if __name__ == "__main__":
311
- print("\n--- Testing predict.py Script Directly ---")
312
- if model and tokenizer:
313
- sample_comments_for_testing = [
314
- "This is an amazing movie, I loved it!",
315
- "I'm not sure how I feel about this, it was okay.",
316
- "Worst experience ever, would not recommend.",
317
- "The food was alright, but the service was slow.",
318
- "What a fantastic day! #blessed",
319
- "I hate waiting in long lines.",
320
- "@user Check out http this is cool.",
321
- "Just a normal sentence, nothing special here.",
322
- "",
323
- "This new update is absolutely terrible and full of bugs.",
324
- ]
325
-
326
- print("\nInput Comments for Direct Test:")
327
- for i, c in enumerate(sample_comments_for_testing):
328
- print(f"{i+1}. '{c}'")
329
-
330
- # Get predictions (now a list of dictionaries)
331
- prediction_results = predict_sentiments(sample_comments_for_testing)
332
-
333
- print("\nPredicted Sentiments and Scores (Direct Test):")
334
- # Loop through the results list
335
- for i, (comment, result) in enumerate(
336
- zip(sample_comments_for_testing, prediction_results)
337
- ):
338
- print(f"{i+1}. Comment: '{comment}'")
339
- # Format scores nicely for printing
340
- scores_dict = result.get("scores", {})
341
- formatted_scores = ", ".join(
342
- [f"{name}: {score:.3f}" for name, score in scores_dict.items()]
343
- )
344
- print(f" -> Predicted Label: {result.get('label', 'N/A')}")
345
- # Also print the raw scores dictionary
346
- print(f" -> Scores: {{{formatted_scores}}}")
347
- print("--- Direct Test Finished ---")
348
- else:
349
- print("ERROR (predict.py - main test): Model and/or tokenizer not loaded.")
350
- print(
351
- f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
352
- )
353
-
354
-
355
- # # --- Prediction Function ---
356
- # def predict_sentiments(comment_list: list):
357
- # """
358
- # Predicts sentiments for a list of comment strings.
359
- # Returns a list of sentiment labels (e.g., "positive", "neutral", "negative").
360
- # """
361
- # # Check if model and tokenizer were loaded properly
362
- # if model is None or tokenizer is None:
363
- # print(
364
- # "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot make predictions."
365
- # )
366
- # # Return an error message for each comment if model isn't ready
367
- # return ["Error: Model not loaded"] * len(comment_list)
368
-
369
- # if not comment_list: # If the input list is empty
370
- # return []
371
-
372
- # # First, preprocess all comments like we did for training data
373
- # processed_comments = [preprocess_tweet(comment) for comment in comment_list]
374
-
375
- # # Tokenize the processed comments
376
- # # This turns text into numbers (input IDs, attention mask) for the model
377
- # # padding=True: make all sequences in the batch the same length
378
- # # truncation=True: cut off sequences longer than the model can handle
379
- # # return_tensors="pt": return PyTorch tensors
380
- # # max_length: ensure we don't exceed model's limit (e.g., 512 for RoBERTa)
381
- # print(f"DEBUG (predict.py): Tokenizing {len(processed_comments)} comments...")
382
- # inputs = tokenizer(
383
- # processed_comments,
384
- # padding=True,
385
- # truncation=True,
386
- # return_tensors="pt",
387
- # max_length=(
388
- # tokenizer.model_max_length
389
- # if hasattr(tokenizer, "model_max_length") and tokenizer.model_max_length
390
- # else 512
391
- # ),
392
- # )
393
-
394
- # # Move the tokenized inputs to the same device as the model (GPU or CPU)
395
- # inputs = {k: v.to(device) for k, v in inputs.items()}
396
-
397
- # sentiment_labels_as_strings = []
398
- # try:
399
- # # Make predictions
400
- # # torch.no_grad() is important for inference:
401
- # # it tells PyTorch not to calculate gradients, saving memory and speeding things up.
402
- # with torch.no_grad():
403
- # outputs = model(**inputs) # Get model outputs
404
- # logits = outputs.logits # These are the raw scores from the final layer
405
-
406
- # # Get the predicted class ID by finding the index with the highest score (logit)
407
- # # logits shape is (batch_size, num_labels)
408
- # predicted_class_ids = torch.argmax(
409
- # logits, dim=-1
410
- # ) # dim=-1 means find max along the last dimension
411
-
412
- # # Convert the predicted class IDs (numbers) to actual sentiment labels (strings)
413
- # # using the id2label_mapping we got from the model's config
414
- # # .item() gets the Python number from a 0-dim PyTorch tensor
415
- # sentiment_labels_as_strings = [
416
- # id2label_mapping.get(class_id.item(), "Unknown")
417
- # for class_id in predicted_class_ids
418
- # ]
419
- # print(
420
- # f"DEBUG (predict.py): Predictions made. Example: {sentiment_labels_as_strings[:3] if sentiment_labels_as_strings else 'N/A'}"
421
- # )
422
-
423
- # except Exception as e:
424
- # print(f"--- ERROR (predict.py - predict_sentiments) ---")
425
- # print(f"Error during sentiment prediction inference: {e}")
426
- # # Return an error message for each comment if prediction fails
427
- # sentiment_labels_as_strings = ["Error: Prediction failed"] * len(comment_list)
428
-
429
- # return sentiment_labels_as_strings
430
-
431
-
432
- # # --- Main block for testing this script directly ---
433
- # # This part only runs if you execute 'python src/predict.py' from the terminal
434
- # # It won't run when app.py imports this file.
435
- # if __name__ == "__main__":
436
- # print("\n--- Testing predict.py Script Directly ---")
437
- # # Check if model was loaded, otherwise can't test
438
- # if model and tokenizer:
439
- # sample_comments_for_testing = [
440
- # "This is an amazing movie, I loved it!", # Expected: positive
441
- # "I'm not sure how I feel about this, it was okay.", # Expected: neutral
442
- # "Worst experience ever, would not recommend.", # Expected: negative
443
- # "The food was alright, but the service was slow.", # Expected: neutral or negative
444
- # "What a fantastic day! #blessed", # Expected: positive
445
- # "I hate waiting in long lines.", # Expected: negative
446
- # "@user Check out http this is cool.", # Test preprocessing, Expected: positive or neutral
447
- # "Just a normal sentence, nothing special here.", # Expected: neutral
448
- # "", # Empty string test
449
- # "This new update is absolutely terrible and full of bugs.", # Expected: negative
450
- # ]
451
-
452
- # print("\nInput Comments for Direct Test:")
453
- # for i, c in enumerate(sample_comments_for_testing):
454
- # print(f"{i + 1}. '{c}'")
455
-
456
- # # Get predictions using our main function
457
- # predicted_sentiments = predict_sentiments(sample_comments_for_testing)
458
-
459
- # print("\nPredicted Sentiments (Direct Test):")
460
- # for i, (comment, sentiment) in enumerate(
461
- # zip(sample_comments_for_testing, predicted_sentiments)
462
- # ):
463
- # print(
464
- # f"{i + 1}. Comment: '{comment}'\n -> Predicted Sentiment: {sentiment}"
465
- # )
466
- # print("--- Direct Test Finished ---")
467
- # else:
468
- # print(
469
- # "ERROR (predict.py - main test): Model and/or tokenizer not loaded. Cannot run direct test."
470
- # )
471
- # print(
472
- # f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
473
- # )
 
1
+ # src/predict.py
2
+
3
+ import os # To help build file paths correctly
4
+ import torch # PyTorch library, for tensors and model operations
5
+ from transformers import (
6
+ AutoModelForSequenceClassification,
7
+ AutoTokenizer,
8
+ ) # Hugging Face stuff for models
9
+
10
+
11
+ # --- Configuration ---
12
+ # This is where our fine-tuned model and tokenizer files are stored
13
+ # Assuming 'fine_tuned_model' directory is inside 'src/' and next to this predict.py file
14
# Absolute directory that contains this script; the model artifacts are
# expected to sit in a sibling folder, so the path never depends on the
# current working directory.
_SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]

# Folder holding the fine-tuned model and tokenizer files.
MODEL_PATH = os.path.join(_SCRIPT_DIR, "fine_tuned_model")

# Emitted once at import time so a wrong path is visible immediately.
print("DEBUG (predict.py): Model path set to: " + MODEL_PATH)
22
+
23
+ # --- Device Setup ---
24
+ # Check if a GPU is available, otherwise use CPU
25
+ # Using GPU makes predictions much faster!
26
# Pick the compute device once at import time: CUDA when torch reports it
# as available, CPU otherwise.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print(
        "INFO (predict.py): GPU not available, using CPU. Predictions might be slower."
    )
else:
    device = torch.device("cuda")
    # The GPU name is purely informational; failing to read it must not
    # prevent the module from using CUDA.
    try:
        gpu_name = torch.cuda.get_device_name(0)
        print(f"INFO (predict.py): GPU is available ({gpu_name}), using CUDA.")
    except Exception as e:
        print(
            f"INFO (predict.py): GPU is available, using CUDA. (Could not get GPU name: {e})"
        )
41
+
42
+ # --- Load Model and Tokenizer ---
43
+ # We load these once when the script (or module) is first loaded.
44
+ # This is much better than loading them every time we want to predict.
45
# Module-level singletons, populated once at import time so that callers do
# not pay the loading cost on every prediction. Both stay None when loading
# fails; predict_sentiments() checks for that.
model = None
tokenizer = None
# Fallback label mapping, used only when the model config carries none.
id2label_mapping = {0: "negative", 1: "neutral", 2: "positive"}

try:
    # from_pretrained() generally reports a missing local directory as an
    # OSError (or a hub validation error) rather than FileNotFoundError, so
    # the directory is checked explicitly to make the "not found" branch
    # below reachable.
    if not os.path.isdir(MODEL_PATH):
        raise FileNotFoundError(MODEL_PATH)

    print(f"INFO (predict.py): Loading model from {MODEL_PATH}...")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    model.to(device)  # Move the weights to the selected device (GPU or CPU)
    model.eval()  # Inference mode: disables Dropout and similar layers
    print("INFO (predict.py): Model loaded successfully and set to evaluation mode.")

    print(f"INFO (predict.py): Loading tokenizer from {MODEL_PATH}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    print("INFO (predict.py): Tokenizer loaded successfully.")

    # Prefer the label mapping stored in the model config over the default.
    if getattr(model.config, "id2label", None):
        id2label_mapping = model.config.id2label
        # Keys may be strings when read from config.json; normalise to int
        # so they can be matched against argmax indices.
        id2label_mapping = {int(k): v for k, v in id2label_mapping.items()}
        print(
            f"INFO (predict.py): Loaded id2label mapping from model config: {id2label_mapping}"
        )
    else:
        print(
            "WARN (predict.py): id2label not found in model config, using default mapping."
        )

except FileNotFoundError:
    # Nothing usable was loaded; make sure no half-initialised state remains.
    model = None
    tokenizer = None
    print("--- CRITICAL ERROR (predict.py) ---")
    print(f"Model or Tokenizer files NOT FOUND at the specified path: {MODEL_PATH}")
    print(
        f"Please ensure the '{os.path.basename(MODEL_PATH)}' directory exists at '{_SCRIPT_DIR}' and contains all necessary model files (pytorch_model.bin/model.safetensors, config.json, tokenizer files, etc.)."
    )
except Exception as e:
    # If only one of the two objects was loaded, drop it as well so the
    # module never keeps a model without its tokenizer (or vice versa).
    if model is None or tokenizer is None:
        model = None
        tokenizer = None
    print("--- ERROR (predict.py) ---")
    print(f"An unexpected error occurred loading model or tokenizer: {e}")
88
+
89
+
90
+ # --- Preprocessing Function ---
91
+ # Same function we used for training data to make sure inputs are consistent
92
def preprocess_tweet(text):
    """Mask user mentions and links in a tweet-like string.

    The text is split on single space characters only. Every non-empty
    piece that starts with "@" becomes "@user" and every piece that starts
    with "http" becomes "http". Empty pieces (from repeated spaces) are
    dropped, and the remaining pieces are joined with single spaces.

    Returns "" when *text* is None.
    """
    if text is None:
        return ""

    def _mask(piece):
        # Mentions are checked first; the "@user" placeholder never starts
        # with "http", so the two rules cannot interfere with each other.
        if piece.startswith("@"):
            piece = "@user"
        return "http" if piece.startswith("http") else piece

    return " ".join(_mask(piece) for piece in text.split(" ") if piece)
104
+
105
+
106
+ # --- Prediction Function (UPDATED to return probabilities) ---
107
def _resolve_max_length(tok, fallback=512, ceiling=100000):
    """Return a usable truncation length for tokenizer *tok*.

    Tokenizers without a configured limit can report a very large sentinel
    as ``model_max_length``; passing that on would effectively disable
    truncation. Missing, non-integer, non-positive or implausibly large
    values therefore fall back to *fallback*.
    """
    limit = getattr(tok, "model_max_length", None)
    if not isinstance(limit, int) or not 0 < limit <= ceiling:
        return fallback
    return limit


def _error_results(label, count):
    """Build *count* independent error entries carrying *label*.

    A comprehension is used instead of ``[entry] * count`` so that every
    entry is its own dict and mutating one result cannot affect the others.
    """
    return [{"label": label, "scores": {}} for _ in range(count)]


def predict_sentiments(comment_list: list):
    """
    Predicts sentiments for a list of comment strings.

    Comments are preprocessed with preprocess_tweet(), tokenized and run
    through the module-level model in batches of 16.

    Returns a list with one dictionary per input comment, each containing
    the predicted label and the probabilities (scores) for each class, e.g.
    [{'label': 'positive', 'scores': {'negative': 0.1, 'neutral': 0.2, 'positive': 0.7}}, ...]

    Error handling (no exception is raised for these cases):
    - model or tokenizer not loaded: every entry is
      {'label': 'Error: Model not loaded', 'scores': {}}.
    - a batch fails: results of the batches completed so far are kept and
      every remaining comment gets {'label': 'Error: Batch failed', 'scores': {}}.
    """
    # Check if model and tokenizer are ready
    if model is None or tokenizer is None:
        print(
            "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot predict."
        )
        return _error_results("Error: Model not loaded", len(comment_list))

    if not comment_list:  # Handle empty input list
        return []

    inference_batch_size = 16  # You can adjust this number based on performance/memory
    total_comments = len(comment_list)
    # Ceiling division; computed once because it does not change per batch.
    total_batches = (
        total_comments + inference_batch_size - 1
    ) // inference_batch_size
    max_length = _resolve_max_length(tokenizer)

    print(
        f"INFO (predict.py): Predicting sentiments for {len(comment_list)} comments in batches of {inference_batch_size}..."
    )

    all_results_list = []  # Results of all completed batches, in input order
    batch_start = 0  # Defined up front so the error handler can always report it

    try:
        for current_batch_num, batch_start in enumerate(
            range(0, total_comments, inference_batch_size), start=1
        ):
            batch_comments = comment_list[
                batch_start : batch_start + inference_batch_size
            ]
            print(
                f"DEBUG (predict.py): Processing batch {current_batch_num}/{total_batches}..."
            )

            # 1. Preprocess this specific batch
            processed_batch = [preprocess_tweet(comment) for comment in batch_comments]

            # 2. Tokenize; padding is applied within this batch only
            inputs = tokenizer(
                processed_batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=max_length,
            )

            # 3. Move this batch's inputs to the same device as the model
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # 4. Forward pass without gradient tracking
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits  # Raw scores from the model for this batch

            # 5. Probabilities over the class dimension and the winning class
            probabilities_batch = torch.softmax(logits, dim=-1)
            predicted_class_ids_batch = torch.argmax(probabilities_batch, dim=-1)

            # 6. Bring results back to the CPU as plain Python lists
            probs_list_batch = probabilities_batch.cpu().tolist()
            ids_list_batch = predicted_class_ids_batch.cpu().tolist()
            num_classes = probabilities_batch.shape[-1]

            # 7. Format results for each comment in THIS batch. They are
            # collected separately and appended only once the whole batch
            # succeeded, so a failure never leaves a partial batch behind.
            batch_results = []
            for pred_id, comment_probs in zip(ids_list_batch, probs_list_batch):
                pred_label = id2label_mapping.get(pred_id, "Unknown")
                pred_scores = {
                    label_name: comment_probs[label_id]
                    for label_id, label_name in id2label_mapping.items()
                    # Skip mapping entries that have no matching model output
                    if 0 <= label_id < num_classes
                }
                batch_results.append({"label": pred_label, "scores": pred_scores})

            all_results_list.extend(batch_results)

        print(
            f"INFO (predict.py): Finished processing all {len(all_results_list)} comments."
        )

    except Exception as e:
        # Best effort: report the failure, keep what was already predicted
        # and mark everything else as failed instead of raising.
        print("--- ERROR (predict.py - predict_sentiments loop) ---")
        print(
            f"An error occurred during batch prediction (around comment index {batch_start}): {e}"
        )
        import traceback

        traceback.print_exc()  # Print full error details to console
        num_remaining = len(comment_list) - len(all_results_list)
        all_results_list.extend(_error_results("Error: Batch failed", num_remaining))

    # One entry per input comment
    return all_results_list
229
+
230
+
231
+ # --- Main block for testing this script directly (UPDATED to show scores) ---
232
# Manual smoke test: runs only when this file is executed directly, not
# when it is imported by another module.
if __name__ == "__main__":
    print("\n--- Testing predict.py Script Directly ---")
    if not (model and tokenizer):
        print("ERROR (predict.py - main test): Model and/or tokenizer not loaded.")
        print(
            f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
        )
    else:
        # Mix of clearly positive, negative and neutral comments, plus an
        # empty string and one comment containing the mention/link
        # placeholders.
        sample_comments_for_testing = [
            "This is an amazing movie, I loved it!",
            "I'm not sure how I feel about this, it was okay.",
            "Worst experience ever, would not recommend.",
            "The food was alright, but the service was slow.",
            "What a fantastic day! #blessed",
            "I hate waiting in long lines.",
            "@user Check out http this is cool.",
            "Just a normal sentence, nothing special here.",
            "",
            "This new update is absolutely terrible and full of bugs.",
        ]

        print("\nInput Comments for Direct Test:")
        for position, text in enumerate(sample_comments_for_testing, start=1):
            print(f"{position}. '{text}'")

        # One result dictionary (label + scores) per input comment
        prediction_results = predict_sentiments(sample_comments_for_testing)

        print("\nPredicted Sentiments and Scores (Direct Test):")
        paired = zip(sample_comments_for_testing, prediction_results)
        for position, (comment, result) in enumerate(paired, start=1):
            print(f"{position}. Comment: '{comment}'")
            # Render the scores as "name: 0.123" pairs with three decimals
            formatted_scores = ", ".join(
                f"{name}: {score:.3f}"
                for name, score in result.get("scores", {}).items()
            )
            print(f" -> Predicted Label: {result.get('label', 'N/A')}")
            print(f" -> Scores: {{{formatted_scores}}}")
        print("--- Direct Test Finished ---")
275
+
276
+
277
+ # # --- Prediction Function ---
278
+ # def predict_sentiments(comment_list: list):
279
+ # """
280
+ # Predicts sentiments for a list of comment strings.
281
+ # Returns a list of sentiment labels (e.g., "positive", "neutral", "negative").
282
+ # """
283
+ # # Check if model and tokenizer were loaded properly
284
+ # if model is None or tokenizer is None:
285
+ # print(
286
+ # "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot make predictions."
287
+ # )
288
+ # # Return an error message for each comment if model isn't ready
289
+ # return ["Error: Model not loaded"] * len(comment_list)
290
+
291
+ # if not comment_list: # If the input list is empty
292
+ # return []
293
+
294
+ # # First, preprocess all comments like we did for training data
295
+ # processed_comments = [preprocess_tweet(comment) for comment in comment_list]
296
+
297
+ # # Tokenize the processed comments
298
+ # # This turns text into numbers (input IDs, attention mask) for the model
299
+ # # padding=True: make all sequences in the batch the same length
300
+ # # truncation=True: cut off sequences longer than the model can handle
301
+ # # return_tensors="pt": return PyTorch tensors
302
+ # # max_length: ensure we don't exceed model's limit (e.g., 512 for RoBERTa)
303
+ # print(f"DEBUG (predict.py): Tokenizing {len(processed_comments)} comments...")
304
+ # inputs = tokenizer(
305
+ # processed_comments,
306
+ # padding=True,
307
+ # truncation=True,
308
+ # return_tensors="pt",
309
+ # max_length=(
310
+ # tokenizer.model_max_length
311
+ # if hasattr(tokenizer, "model_max_length") and tokenizer.model_max_length
312
+ # else 512
313
+ # ),
314
+ # )
315
+
316
+ # # Move the tokenized inputs to the same device as the model (GPU or CPU)
317
+ # inputs = {k: v.to(device) for k, v in inputs.items()}
318
+
319
+ # sentiment_labels_as_strings = []
320
+ # try:
321
+ # # Make predictions
322
+ # # torch.no_grad() is important for inference:
323
+ # # it tells PyTorch not to calculate gradients, saving memory and speeding things up.
324
+ # with torch.no_grad():
325
+ # outputs = model(**inputs) # Get model outputs
326
+ # logits = outputs.logits # These are the raw scores from the final layer
327
+
328
+ # # Get the predicted class ID by finding the index with the highest score (logit)
329
+ # # logits shape is (batch_size, num_labels)
330
+ # predicted_class_ids = torch.argmax(
331
+ # logits, dim=-1
332
+ # ) # dim=-1 means find max along the last dimension
333
+
334
+ # # Convert the predicted class IDs (numbers) to actual sentiment labels (strings)
335
+ # # using the id2label_mapping we got from the model's config
336
+ # # .item() gets the Python number from a 0-dim PyTorch tensor
337
+ # sentiment_labels_as_strings = [
338
+ # id2label_mapping.get(class_id.item(), "Unknown")
339
+ # for class_id in predicted_class_ids
340
+ # ]
341
+ # print(
342
+ # f"DEBUG (predict.py): Predictions made. Example: {sentiment_labels_as_strings[:3] if sentiment_labels_as_strings else 'N/A'}"
343
+ # )
344
+
345
+ # except Exception as e:
346
+ # print(f"--- ERROR (predict.py - predict_sentiments) ---")
347
+ # print(f"Error during sentiment prediction inference: {e}")
348
+ # # Return an error message for each comment if prediction fails
349
+ # sentiment_labels_as_strings = ["Error: Prediction failed"] * len(comment_list)
350
+
351
+ # return sentiment_labels_as_strings
352
+
353
+
354
+ # # --- Main block for testing this script directly ---
355
+ # # This part only runs if you execute 'python src/predict.py' from the terminal
356
+ # # It won't run when app.py imports this file.
357
+ # if __name__ == "__main__":
358
+ # print("\n--- Testing predict.py Script Directly ---")
359
+ # # Check if model was loaded, otherwise can't test
360
+ # if model and tokenizer:
361
+ # sample_comments_for_testing = [
362
+ # "This is an amazing movie, I loved it!", # Expected: positive
363
+ # "I'm not sure how I feel about this, it was okay.", # Expected: neutral
364
+ # "Worst experience ever, would not recommend.", # Expected: negative
365
+ # "The food was alright, but the service was slow.", # Expected: neutral or negative
366
+ # "What a fantastic day! #blessed", # Expected: positive
367
+ # "I hate waiting in long lines.", # Expected: negative
368
+ # "@user Check out http this is cool.", # Test preprocessing, Expected: positive or neutral
369
+ # "Just a normal sentence, nothing special here.", # Expected: neutral
370
+ # "", # Empty string test
371
+ # "This new update is absolutely terrible and full of bugs.", # Expected: negative
372
+ # ]
373
+
374
+ # print("\nInput Comments for Direct Test:")
375
+ # for i, c in enumerate(sample_comments_for_testing):
376
+ # print(f"{i + 1}. '{c}'")
377
+
378
+ # # Get predictions using our main function
379
+ # predicted_sentiments = predict_sentiments(sample_comments_for_testing)
380
+
381
+ # print("\nPredicted Sentiments (Direct Test):")
382
+ # for i, (comment, sentiment) in enumerate(
383
+ # zip(sample_comments_for_testing, predicted_sentiments)
384
+ # ):
385
+ # print(
386
+ # f"{i + 1}. Comment: '{comment}'\n -> Predicted Sentiment: {sentiment}"
387
+ # )
388
+ # print("--- Direct Test Finished ---")
389
+ # else:
390
+ # print(
391
+ # "ERROR (predict.py - main test): Model and/or tokenizer not loaded. Cannot run direct test."
392
+ # )
393
+ # print(
394
+ # f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
395
+ # )