Spaces:

AlainDeLong
/

Social-Sentiment-Analysis

Sleeping

App Files Files Community

AlainDeLong commited on May 11, 2025

Commit

db78b44

verified ·

1 Parent(s): 09c22f3

Update src/predict.py

Browse files

Files changed (1) hide show

src/predict.py +395 -473

src/predict.py CHANGED Viewed

@@ -1,473 +1,395 @@
-# src/predict.py
-import os  # To help build file paths correctly
-import torch  # PyTorch library, for tensors and model operations
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-)  # Hugging Face stuff for models
-# --- Configuration ---
-# This is where our fine-tuned model and tokenizer files are stored
-# Assuming 'fine_tuned_model' directory is inside 'src/' and next to this predict.py file
-_SCRIPT_DIR = os.path.dirname(
-    os.path.abspath(__file__)
-)  # Gets the directory where this script is
-MODEL_PATH = os.path.join(
-    _SCRIPT_DIR, "fine_tuned_model"
-)  # User confirmed this variable name and directory
-print(f"DEBUG (predict.py): Model path set to: {MODEL_PATH}")  # For checking the path
-# --- Device Setup ---
-# Check if a GPU is available, otherwise use CPU
-# Using GPU makes predictions much faster!
-if torch.cuda.is_available():
-    device = torch.device("cuda")
-    # Trying to get the name of the GPU, just for information
-    try:
-        gpu_name = torch.cuda.get_device_name(0)
-        print(f"INFO (predict.py): GPU is available ({gpu_name}), using CUDA.")
-    except Exception as e:
-        print(
-            f"INFO (predict.py): GPU is available, using CUDA. (Could not get GPU name: {e})"
-        )
-else:
-    device = torch.device("cpu")
-    print(
-        "INFO (predict.py): GPU not available, using CPU. Predictions might be slower."
-    )
-# --- Load Model and Tokenizer ---
-# We load these once when the script (or module) is first loaded.
-# This is much better than loading them every time we want to predict.
-model = None
-tokenizer = None
-id2label_mapping = {0: "negative", 1: "neutral", 2: "positive"}  # Default mapping
-try:
-    print(f"INFO (predict.py): Loading model from {MODEL_PATH}...")
-    # Load the pre-trained model for sequence classification
-    # This should be the PyTorch RoBERTa model we fine-tuned
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
-    model.to(device)  # Move the model to the GPU (or CPU if no GPU)
-    model.eval()  # Set the model to evaluation mode (important for layers like Dropout)
-    print("INFO (predict.py): Model loaded successfully and set to evaluation mode.")
-    print(f"INFO (predict.py): Loading tokenizer from {MODEL_PATH}...")
-    # Load the tokenizer that matches the model
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
-    print("INFO (predict.py): Tokenizer loaded successfully.")
-    # Get the label mapping from the model's configuration
-    # This was saved during fine-tuning
-    if hasattr(model.config, "id2label") and model.config.id2label:
-        id2label_mapping = model.config.id2label
-        # Convert string keys from config.json to int if necessary
-        id2label_mapping = {int(k): v for k, v in id2label_mapping.items()}
-        print(
-            f"INFO (predict.py): Loaded id2label mapping from model config: {id2label_mapping}"
-        )
-    else:
-        print(
-            "WARN (predict.py): id2label not found in model config, using default mapping."
-        )
-except FileNotFoundError:
-    print(f"--- CRITICAL ERROR (predict.py) ---")
-    print(f"Model or Tokenizer files NOT FOUND at the specified path: {MODEL_PATH}")
-    print(
-        f"Please ensure the '{os.path.basename(MODEL_PATH)}' directory exists at '{_SCRIPT_DIR}' and contains all necessary model files (pytorch_model.bin/model.safetensors, config.json, tokenizer files, etc.)."
-    )
-    # Keep model and tokenizer as None, so predict_sentiments can handle it
-except Exception as e:
-    print(f"--- ERROR (predict.py) ---")
-    print(f"An unexpected error occurred loading model or tokenizer: {e}")
-    # Keep model and tokenizer as None
-# --- Preprocessing Function ---
-# Same function we used for training data to make sure inputs are consistent
-def preprocess_tweet(text):
-    """Replaces @user mentions and http links with placeholders."""
-    preprocessed_text = []
-    if text is None:
-        return ""  # Handle None input
-    # Split text into parts by space
-    for t in text.split(" "):
-        if len(t) > 0:  # Avoid processing empty parts from multiple spaces
-            t = "@user" if t.startswith("@") else t  # Replace mentions
-            t = "http" if t.startswith("http") else t  # Replace links
-        preprocessed_text.append(t)
-    return " ".join(preprocessed_text)  # Put the parts back together
-# --- Prediction Function (UPDATED to return probabilities) ---
-def predict_sentiments(comment_list: list):
-    """
-    Predicts sentiments for a list of comment strings.
-    Returns a list of dictionaries, each containing the predicted label
-    and the probabilities (scores) for each class.
-    e.g., [{'label': 'positive', 'scores': {'negative': 0.1, 'neutral': 0.2, 'positive': 0.7}}, ...]
-    """
-    # Check if model and tokenizer are ready
-    if model is None or tokenizer is None:
-        print(
-            "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot predict."
-        )
-        # Return an error structure
-        return [{"label": "Error: Model not loaded", "scores": {}}] * len(comment_list)
-    if not comment_list:  # Handle empty input list
-        return []
-    """
-    # Preprocess comments first
-    processed_comments = [preprocess_tweet(comment) for comment in comment_list]
-    # Tokenize the batch
-    print(
-        f"DEBUG (predict.py): Tokenizing {len(processed_comments)} comments for prediction..."
-    )
-    inputs = tokenizer(
-        processed_comments,
-        padding=True,
-        truncation=True,
-        return_tensors="pt",  # PyTorch tensors
-        max_length=(
-            tokenizer.model_max_length
-            if hasattr(tokenizer, "model_max_length") and tokenizer.model_max_length
-            else 512
-        ),
-    )
-    # Move inputs to the correct device
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    results_list = []  # To store the dictionaries
-    try:
-        # Perform inference without calculating gradients
-        with torch.no_grad():
-            outputs = model(**inputs)
-            logits = outputs.logits
-        # Apply Softmax to convert logits to probabilities
-        # dim=-1 applies softmax across the last dimension (the classes)
-        probabilities = torch.softmax(logits, dim=-1)
-        # Get the predicted class IDs (index of the highest probability)
-        predicted_class_ids = torch.argmax(probabilities, dim=-1)
-        # Move results to CPU and convert to Python lists/numpy for easier handling
-        probs_list = (
-            probabilities.cpu().numpy().tolist()
-        )  # List of lists of probabilities
-        ids_list = predicted_class_ids.cpu().numpy().tolist()  # List of predicted IDs
-        print(
-            f"DEBUG (predict.py): Probabilities and IDs calculated. Batch size: {len(ids_list)}"
-        )
-        # Format the output: list of dictionaries
-        for i in range(len(ids_list)):
-            pred_id = ids_list[i]
-            # Map predicted ID to label string using the mapping from model config
-            pred_label = id2label_mapping.get(pred_id, "Unknown")
-            # Create the dictionary of scores {label_name: probability}
-            pred_scores = {
-                label_name: probs_list[i][label_id]
-                for label_id, label_name in id2label_mapping.items()
-                # Ensure index is within bounds, just in case
-                if 0 <= label_id < probabilities.shape[-1]
-            }
-            # Append the result for this comment
-            results_list.append({"label": pred_label, "scores": pred_scores})
-    except Exception as e:
-        print(f"--- ERROR (predict.py - predict_sentiments) ---")
-        print(f"Error during sentiment prediction inference or formatting: {e}")
-        import traceback
-        traceback.print_exc()  # Print full traceback for debugging
-        # Return error structure for each comment
-        results_list = [
-            {"label": "Error: Prediction failed", "scores": {}} for _ in comment_list
-        ]
-    return results_list  # Return the list of dictionaries
-    """
-    inference_batch_size = 64  # You can adjust this number based on performance/memory
-    print(
-        f"INFO (predict.py): Predicting sentiments for {len(comment_list)} comments in batches of {inference_batch_size}..."
-    )
-    all_results_list = []  # We'll collect results for all batches here
-    # --- Loop through the comment list in batches ---
-    try:
-        total_comments = len(comment_list)
-        # This loop goes from 0 to total_comments, jumping by inference_batch_size each time
-        for i in range(0, total_comments, inference_batch_size):
-            # Get the current slice of comments for this batch
-            batch_comments = comment_list[i : i + inference_batch_size]
-            # Just printing progress for long lists
-            current_batch_num = i // inference_batch_size + 1
-            total_batches = (
-                total_comments + inference_batch_size - 1
-            ) // inference_batch_size
-            print(
-                f"DEBUG (predict.py): Processing batch {current_batch_num}/{total_batches}..."
-            )
-            # --- Process ONLY the current batch ---
-            # 1. Preprocess this specific batch
-            processed_batch = [preprocess_tweet(comment) for comment in batch_comments]
-            # 2. Tokenize this batch
-            # Tokenizer handles padding within this smaller batch
-            inputs = tokenizer(
-                processed_batch,
-                padding=True,
-                truncation=True,
-                return_tensors="pt",
-                max_length=(
-                    tokenizer.model_max_length
-                    if hasattr(tokenizer, "model_max_length")
-                    and tokenizer.model_max_length
-                    else 512
-                ),
-            )
-            # 3. Move this batch's inputs to the device (GPU/CPU)
-            inputs = {k: v.to(device) for k, v in inputs.items()}
-            # 4. Make prediction for this batch - no need for gradients
-            with torch.no_grad():
-                outputs = model(**inputs)
-                logits = outputs.logits  # Raw scores from the model for this batch
-            # 5. Calculate probabilities and get predicted class IDs for this batch
-            probabilities_batch = torch.softmax(logits, dim=-1)
-            predicted_class_ids_batch = torch.argmax(probabilities_batch, dim=-1)
-            # 6. Move results back to CPU, convert to lists for easier looping
-            probs_list_batch = probabilities_batch.cpu().numpy().tolist()
-            ids_list_batch = predicted_class_ids_batch.cpu().numpy().tolist()
-            # 7. Format results for each comment in THIS batch
-            batch_results = []
-            for j in range(len(ids_list_batch)):
-                pred_id = ids_list_batch[j]
-                pred_label = id2label_mapping.get(
-                    pred_id, "Unknown"
-                )  # Map ID to label name
-                # Create the scores dictionary for this comment
-                pred_scores = {
-                    label_name: probs_list_batch[j][label_id]
-                    for label_id, label_name in id2label_mapping.items()
-                    if 0
-                    <= label_id
-                    < probabilities_batch.shape[-1]  # Safety check for index
-                }
-                # Add the result for this comment
-                batch_results.append({"label": pred_label, "scores": pred_scores})
-            # Add the results from this completed batch to our main list
-            all_results_list.extend(batch_results)
-            # --- Finished processing current batch ---
-        print(
-            f"INFO (predict.py): Finished processing all {len(all_results_list)} comments."
-        )
-    except Exception as e:
-        # Catch errors that might happen during the loop
-        print(f"--- ERROR (predict.py - predict_sentiments loop) ---")
-        print(
-            f"An error occurred during batch prediction (around comment index {i}): {e}"
-        )
-        import traceback
-        traceback.print_exc()  # Print full error details to console
-        # Try to return results for processed batches + error messages for the rest
-        num_processed = len(all_results_list)
-        num_remaining = len(comment_list) - num_processed
-        # Add error indicators for comments that couldn't be processed
-        all_results_list.extend(
-            [{"label": "Error: Batch failed", "scores": {}}] * num_remaining
-        )
-    # Return the list containing results for all comments
-    return all_results_list
-# --- Main block for testing this script directly (UPDATED to show scores) ---
-if __name__ == "__main__":
-    print("\n--- Testing predict.py Script Directly ---")
-    if model and tokenizer:
-        sample_comments_for_testing = [
-            "This is an amazing movie, I loved it!",
-            "I'm not sure how I feel about this, it was okay.",
-            "Worst experience ever, would not recommend.",
-            "The food was alright, but the service was slow.",
-            "What a fantastic day! #blessed",
-            "I hate waiting in long lines.",
-            "@user Check out http this is cool.",
-            "Just a normal sentence, nothing special here.",
-            "",
-            "This new update is absolutely terrible and full of bugs.",
-        ]
-        print("\nInput Comments for Direct Test:")
-        for i, c in enumerate(sample_comments_for_testing):
-            print(f"{i+1}. '{c}'")
-        # Get predictions (now a list of dictionaries)
-        prediction_results = predict_sentiments(sample_comments_for_testing)
-        print("\nPredicted Sentiments and Scores (Direct Test):")
-        # Loop through the results list
-        for i, (comment, result) in enumerate(
-            zip(sample_comments_for_testing, prediction_results)
-        ):
-            print(f"{i+1}. Comment: '{comment}'")
-            # Format scores nicely for printing
-            scores_dict = result.get("scores", {})
-            formatted_scores = ", ".join(
-                [f"{name}: {score:.3f}" for name, score in scores_dict.items()]
-            )
-            print(f"   -> Predicted Label: {result.get('label', 'N/A')}")
-            # Also print the raw scores dictionary
-            print(f"   -> Scores: {{{formatted_scores}}}")
-        print("--- Direct Test Finished ---")
-    else:
-        print("ERROR (predict.py - main test): Model and/or tokenizer not loaded.")
-        print(
-            f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
-        )
-# # --- Prediction Function ---
-# def predict_sentiments(comment_list: list):
-#     """
-#     Predicts sentiments for a list of comment strings.
-#     Returns a list of sentiment labels (e.g., "positive", "neutral", "negative").
-#     """
-#     # Check if model and tokenizer were loaded properly
-#     if model is None or tokenizer is None:
-#         print(
-#             "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot make predictions."
-#         )
-#         # Return an error message for each comment if model isn't ready
-#         return ["Error: Model not loaded"] * len(comment_list)
-#     if not comment_list:  # If the input list is empty
-#         return []
-#     # First, preprocess all comments like we did for training data
-#     processed_comments = [preprocess_tweet(comment) for comment in comment_list]
-#     # Tokenize the processed comments
-#     # This turns text into numbers (input IDs, attention mask) for the model
-#     # padding=True: make all sequences in the batch the same length
-#     # truncation=True: cut off sequences longer than the model can handle
-#     # return_tensors="pt": return PyTorch tensors
-#     # max_length: ensure we don't exceed model's limit (e.g., 512 for RoBERTa)
-#     print(f"DEBUG (predict.py): Tokenizing {len(processed_comments)} comments...")
-#     inputs = tokenizer(
-#         processed_comments,
-#         padding=True,
-#         truncation=True,
-#         return_tensors="pt",
-#         max_length=(
-#             tokenizer.model_max_length
-#             if hasattr(tokenizer, "model_max_length") and tokenizer.model_max_length
-#             else 512
-#         ),
-#     )
-#     # Move the tokenized inputs to the same device as the model (GPU or CPU)
-#     inputs = {k: v.to(device) for k, v in inputs.items()}
-#     sentiment_labels_as_strings = []
-#     try:
-#         # Make predictions
-#         # torch.no_grad() is important for inference:
-#         # it tells PyTorch not to calculate gradients, saving memory and speeding things up.
-#         with torch.no_grad():
-#             outputs = model(**inputs)  # Get model outputs
-#             logits = outputs.logits  # These are the raw scores from the final layer
-#         # Get the predicted class ID by finding the index with the highest score (logit)
-#         # logits shape is (batch_size, num_labels)
-#         predicted_class_ids = torch.argmax(
-#             logits, dim=-1
-#         )  # dim=-1 means find max along the last dimension
-#         # Convert the predicted class IDs (numbers) to actual sentiment labels (strings)
-#         # using the id2label_mapping we got from the model's config
-#         # .item() gets the Python number from a 0-dim PyTorch tensor
-#         sentiment_labels_as_strings = [
-#             id2label_mapping.get(class_id.item(), "Unknown")
-#             for class_id in predicted_class_ids
-#         ]
-#         print(
-#             f"DEBUG (predict.py): Predictions made. Example: {sentiment_labels_as_strings[:3] if sentiment_labels_as_strings else 'N/A'}"
-#         )
-#     except Exception as e:
-#         print(f"--- ERROR (predict.py - predict_sentiments) ---")
-#         print(f"Error during sentiment prediction inference: {e}")
-#         # Return an error message for each comment if prediction fails
-#         sentiment_labels_as_strings = ["Error: Prediction failed"] * len(comment_list)
-#     return sentiment_labels_as_strings
-# # --- Main block for testing this script directly ---
-# # This part only runs if you execute 'python src/predict.py' from the terminal
-# # It won't run when app.py imports this file.
-# if __name__ == "__main__":
-#     print("\n--- Testing predict.py Script Directly ---")
-#     # Check if model was loaded, otherwise can't test
-#     if model and tokenizer:
-#         sample_comments_for_testing = [
-#             "This is an amazing movie, I loved it!",  # Expected: positive
-#             "I'm not sure how I feel about this, it was okay.",  # Expected: neutral
-#             "Worst experience ever, would not recommend.",  # Expected: negative
-#             "The food was alright, but the service was slow.",  # Expected: neutral or negative
-#             "What a fantastic day! #blessed",  # Expected: positive
-#             "I hate waiting in long lines.",  # Expected: negative
-#             "@user Check out http this is cool.",  # Test preprocessing, Expected: positive or neutral
-#             "Just a normal sentence, nothing special here.",  # Expected: neutral
-#             "",  # Empty string test
-#             "This new update is absolutely terrible and full of bugs.",  # Expected: negative
-#         ]
-#         print("\nInput Comments for Direct Test:")
-#         for i, c in enumerate(sample_comments_for_testing):
-#             print(f"{i + 1}. '{c}'")
-#         # Get predictions using our main function
-#         predicted_sentiments = predict_sentiments(sample_comments_for_testing)
-#         print("\nPredicted Sentiments (Direct Test):")
-#         for i, (comment, sentiment) in enumerate(
-#             zip(sample_comments_for_testing, predicted_sentiments)
-#         ):
-#             print(
-#                 f"{i + 1}. Comment: '{comment}'\n   -> Predicted Sentiment: {sentiment}"
-#             )
-#         print("--- Direct Test Finished ---")
-#     else:
-#         print(
-#             "ERROR (predict.py - main test): Model and/or tokenizer not loaded. Cannot run direct test."
-#         )
-#         print(
-#             f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
-#         )

+# src/predict.py
+import os  # To help build file paths correctly
+import torch  # PyTorch library, for tensors and model operations
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+)  # Hugging Face stuff for models
+# --- Configuration ---
+# This is where our fine-tuned model and tokenizer files are stored
+# Assuming 'fine_tuned_model' directory is inside 'src/' and next to this predict.py file
+_SCRIPT_DIR = os.path.dirname(
+    os.path.abspath(__file__)
+)  # Gets the directory where this script is
+MODEL_PATH = os.path.join(
+    _SCRIPT_DIR, "fine_tuned_model"
+)  # User confirmed this variable name and directory
+print(f"DEBUG (predict.py): Model path set to: {MODEL_PATH}")  # For checking the path
+# --- Device Setup ---
+# Check if a GPU is available, otherwise use CPU
+# Using GPU makes predictions much faster!
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+    # Trying to get the name of the GPU, just for information
+    try:
+        gpu_name = torch.cuda.get_device_name(0)
+        print(f"INFO (predict.py): GPU is available ({gpu_name}), using CUDA.")
+    except Exception as e:
+        print(
+            f"INFO (predict.py): GPU is available, using CUDA. (Could not get GPU name: {e})"
+        )
+else:
+    device = torch.device("cpu")
+    print(
+        "INFO (predict.py): GPU not available, using CPU. Predictions might be slower."
+    )
+# --- Load Model and Tokenizer ---
+# We load these once when the script (or module) is first loaded.
+# This is much better than loading them every time we want to predict.
+model = None
+tokenizer = None
+id2label_mapping = {0: "negative", 1: "neutral", 2: "positive"}  # Default mapping
+try:
+    print(f"INFO (predict.py): Loading model from {MODEL_PATH}...")
+    # Load the pre-trained model for sequence classification
+    # This should be the PyTorch RoBERTa model we fine-tuned
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
+    model.to(device)  # Move the model to the GPU (or CPU if no GPU)
+    model.eval()  # Set the model to evaluation mode (important for layers like Dropout)
+    print("INFO (predict.py): Model loaded successfully and set to evaluation mode.")
+    print(f"INFO (predict.py): Loading tokenizer from {MODEL_PATH}...")
+    # Load the tokenizer that matches the model
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+    print("INFO (predict.py): Tokenizer loaded successfully.")
+    # Get the label mapping from the model's configuration
+    # This was saved during fine-tuning
+    if hasattr(model.config, "id2label") and model.config.id2label:
+        id2label_mapping = model.config.id2label
+        # Convert string keys from config.json to int if necessary
+        id2label_mapping = {int(k): v for k, v in id2label_mapping.items()}
+        print(
+            f"INFO (predict.py): Loaded id2label mapping from model config: {id2label_mapping}"
+        )
+    else:
+        print(
+            "WARN (predict.py): id2label not found in model config, using default mapping."
+        )
+except FileNotFoundError:
+    print(f"--- CRITICAL ERROR (predict.py) ---")
+    print(f"Model or Tokenizer files NOT FOUND at the specified path: {MODEL_PATH}")
+    print(
+        f"Please ensure the '{os.path.basename(MODEL_PATH)}' directory exists at '{_SCRIPT_DIR}' and contains all necessary model files (pytorch_model.bin/model.safetensors, config.json, tokenizer files, etc.)."
+    )
+    # Keep model and tokenizer as None, so predict_sentiments can handle it
+except Exception as e:
+    print(f"--- ERROR (predict.py) ---")
+    print(f"An unexpected error occurred loading model or tokenizer: {e}")
+    # Keep model and tokenizer as None
+# --- Preprocessing Function ---
+# Same function we used for training data to make sure inputs are consistent
+def preprocess_tweet(text):
+    """Replaces @user mentions and http links with placeholders."""
+    preprocessed_text = []
+    if text is None:
+        return ""  # Handle None input
+    # Split text into parts by space
+    for t in text.split(" "):
+        if len(t) > 0:  # Avoid processing empty parts from multiple spaces
+            t = "@user" if t.startswith("@") else t  # Replace mentions
+            t = "http" if t.startswith("http") else t  # Replace links
+        preprocessed_text.append(t)
+    return " ".join(preprocessed_text)  # Put the parts back together
+# --- Prediction Function (UPDATED to return probabilities) ---
+def predict_sentiments(comment_list: list):
+    """
+    Predicts sentiments for a list of comment strings.
+    Returns a list of dictionaries, each containing the predicted label
+    and the probabilities (scores) for each class.
+    e.g., [{'label': 'positive', 'scores': {'negative': 0.1, 'neutral': 0.2, 'positive': 0.7}}, ...]
+    """
+    # Check if model and tokenizer are ready
+    if model is None or tokenizer is None:
+        print(
+            "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot predict."
+        )
+        # Return an error structure
+        return [{"label": "Error: Model not loaded", "scores": {}}] * len(comment_list)
+    if not comment_list:  # Handle empty input list
+        return []
+    inference_batch_size = 16  # You can adjust this number based on performance/memory
+    print(
+        f"INFO (predict.py): Predicting sentiments for {len(comment_list)} comments in batches of {inference_batch_size}..."
+    )
+    all_results_list = []  # We'll collect results for all batches here
+    # --- Loop through the comment list in batches ---
+    try:
+        total_comments = len(comment_list)
+        # This loop goes from 0 to total_comments, jumping by inference_batch_size each time
+        for i in range(0, total_comments, inference_batch_size):
+            # Get the current slice of comments for this batch
+            batch_comments = comment_list[i : i + inference_batch_size]
+            # Just printing progress for long lists
+            current_batch_num = i // inference_batch_size + 1
+            total_batches = (
+                total_comments + inference_batch_size - 1
+            ) // inference_batch_size
+            print(
+                f"DEBUG (predict.py): Processing batch {current_batch_num}/{total_batches}..."
+            )
+            # --- Process ONLY the current batch ---
+            # 1. Preprocess this specific batch
+            processed_batch = [preprocess_tweet(comment) for comment in batch_comments]
+            # 2. Tokenize this batch
+            # Tokenizer handles padding within this smaller batch
+            inputs = tokenizer(
+                processed_batch,
+                padding=True,
+                truncation=True,
+                return_tensors="pt",
+                max_length=(
+                    tokenizer.model_max_length
+                    if hasattr(tokenizer, "model_max_length")
+                    and tokenizer.model_max_length
+                    else 512
+                ),
+            )
+            # 3. Move this batch's inputs to the device (GPU/CPU)
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+            # 4. Make prediction for this batch - no need for gradients
+            with torch.no_grad():
+                outputs = model(**inputs)
+                logits = outputs.logits  # Raw scores from the model for this batch
+            # 5. Calculate probabilities and get predicted class IDs for this batch
+            probabilities_batch = torch.softmax(logits, dim=-1)
+            predicted_class_ids_batch = torch.argmax(probabilities_batch, dim=-1)
+            # 6. Move results back to CPU, convert to lists for easier looping
+            probs_list_batch = probabilities_batch.cpu().numpy().tolist()
+            ids_list_batch = predicted_class_ids_batch.cpu().numpy().tolist()
+            # 7. Format results for each comment in THIS batch
+            batch_results = []
+            for j in range(len(ids_list_batch)):
+                pred_id = ids_list_batch[j]
+                pred_label = id2label_mapping.get(
+                    pred_id, "Unknown"
+                )  # Map ID to label name
+                # Create the scores dictionary for this comment
+                pred_scores = {
+                    label_name: probs_list_batch[j][label_id]
+                    for label_id, label_name in id2label_mapping.items()
+                    if 0
+                    <= label_id
+                    < probabilities_batch.shape[-1]  # Safety check for index
+                }
+                # Add the result for this comment
+                batch_results.append({"label": pred_label, "scores": pred_scores})
+            # Add the results from this completed batch to our main list
+            all_results_list.extend(batch_results)
+            # --- Finished processing current batch ---
+        print(
+            f"INFO (predict.py): Finished processing all {len(all_results_list)} comments."
+        )
+    except Exception as e:
+        # Catch errors that might happen during the loop
+        print(f"--- ERROR (predict.py - predict_sentiments loop) ---")
+        print(
+            f"An error occurred during batch prediction (around comment index {i}): {e}"
+        )
+        import traceback
+        traceback.print_exc()  # Print full error details to console
+        # Try to return results for processed batches + error messages for the rest
+        num_processed = len(all_results_list)
+        num_remaining = len(comment_list) - num_processed
+        # Add error indicators for comments that couldn't be processed
+        all_results_list.extend(
+            [{"label": "Error: Batch failed", "scores": {}}] * num_remaining
+        )
+    # Return the list containing results for all comments
+    return all_results_list
+# --- Main block for testing this script directly (UPDATED to show scores) ---
+if __name__ == "__main__":
+    print("\n--- Testing predict.py Script Directly ---")
+    if model and tokenizer:
+        sample_comments_for_testing = [
+            "This is an amazing movie, I loved it!",
+            "I'm not sure how I feel about this, it was okay.",
+            "Worst experience ever, would not recommend.",
+            "The food was alright, but the service was slow.",
+            "What a fantastic day! #blessed",
+            "I hate waiting in long lines.",
+            "@user Check out http this is cool.",
+            "Just a normal sentence, nothing special here.",
+            "",
+            "This new update is absolutely terrible and full of bugs.",
+        ]
+        print("\nInput Comments for Direct Test:")
+        for i, c in enumerate(sample_comments_for_testing):
+            print(f"{i+1}. '{c}'")
+        # Get predictions (now a list of dictionaries)
+        prediction_results = predict_sentiments(sample_comments_for_testing)
+        print("\nPredicted Sentiments and Scores (Direct Test):")
+        # Loop through the results list
+        for i, (comment, result) in enumerate(
+            zip(sample_comments_for_testing, prediction_results)
+        ):
+            print(f"{i+1}. Comment: '{comment}'")
+            # Format scores nicely for printing
+            scores_dict = result.get("scores", {})
+            formatted_scores = ", ".join(
+                [f"{name}: {score:.3f}" for name, score in scores_dict.items()]
+            )
+            print(f"   -> Predicted Label: {result.get('label', 'N/A')}")
+            # Also print the raw scores dictionary
+            print(f"   -> Scores: {{{formatted_scores}}}")
+        print("--- Direct Test Finished ---")
+    else:
+        print("ERROR (predict.py - main test): Model and/or tokenizer not loaded.")
+        print(
+            f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
+        )
+# # --- Prediction Function ---
+# def predict_sentiments(comment_list: list):
+#     """
+#     Predicts sentiments for a list of comment strings.
+#     Returns a list of sentiment labels (e.g., "positive", "neutral", "negative").
+#     """
+#     # Check if model and tokenizer were loaded properly
+#     if model is None or tokenizer is None:
+#         print(
+#             "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot make predictions."
+#         )
+#         # Return an error message for each comment if model isn't ready
+#         return ["Error: Model not loaded"] * len(comment_list)
+#     if not comment_list:  # If the input list is empty
+#         return []
+#     # First, preprocess all comments like we did for training data
+#     processed_comments = [preprocess_tweet(comment) for comment in comment_list]
+#     # Tokenize the processed comments
+#     # This turns text into numbers (input IDs, attention mask) for the model
+#     # padding=True: make all sequences in the batch the same length
+#     # truncation=True: cut off sequences longer than the model can handle
+#     # return_tensors="pt": return PyTorch tensors
+#     # max_length: ensure we don't exceed model's limit (e.g., 512 for RoBERTa)
+#     print(f"DEBUG (predict.py): Tokenizing {len(processed_comments)} comments...")
+#     inputs = tokenizer(
+#         processed_comments,
+#         padding=True,
+#         truncation=True,
+#         return_tensors="pt",
+#         max_length=(
+#             tokenizer.model_max_length
+#             if hasattr(tokenizer, "model_max_length") and tokenizer.model_max_length
+#             else 512
+#         ),
+#     )
+#     # Move the tokenized inputs to the same device as the model (GPU or CPU)
+#     inputs = {k: v.to(device) for k, v in inputs.items()}
+#     sentiment_labels_as_strings = []
+#     try:
+#         # Make predictions
+#         # torch.no_grad() is important for inference:
+#         # it tells PyTorch not to calculate gradients, saving memory and speeding things up.
+#         with torch.no_grad():
+#             outputs = model(**inputs)  # Get model outputs
+#             logits = outputs.logits  # These are the raw scores from the final layer
+#         # Get the predicted class ID by finding the index with the highest score (logit)
+#         # logits shape is (batch_size, num_labels)
+#         predicted_class_ids = torch.argmax(
+#             logits, dim=-1
+#         )  # dim=-1 means find max along the last dimension
+#         # Convert the predicted class IDs (numbers) to actual sentiment labels (strings)
+#         # using the id2label_mapping we got from the model's config
+#         # .item() gets the Python number from a 0-dim PyTorch tensor
+#         sentiment_labels_as_strings = [
+#             id2label_mapping.get(class_id.item(), "Unknown")
+#             for class_id in predicted_class_ids
+#         ]
+#         print(
+#             f"DEBUG (predict.py): Predictions made. Example: {sentiment_labels_as_strings[:3] if sentiment_labels_as_strings else 'N/A'}"
+#         )
+#     except Exception as e:
+#         print(f"--- ERROR (predict.py - predict_sentiments) ---")
+#         print(f"Error during sentiment prediction inference: {e}")
+#         # Return an error message for each comment if prediction fails
+#         sentiment_labels_as_strings = ["Error: Prediction failed"] * len(comment_list)
+#     return sentiment_labels_as_strings
+# # --- Main block for testing this script directly ---
+# # This part only runs if you execute 'python src/predict.py' from the terminal
+# # It won't run when app.py imports this file.
+# if __name__ == "__main__":
+#     print("\n--- Testing predict.py Script Directly ---")
+#     # Check if model was loaded, otherwise can't test
+#     if model and tokenizer:
+#         sample_comments_for_testing = [
+#             "This is an amazing movie, I loved it!",  # Expected: positive
+#             "I'm not sure how I feel about this, it was okay.",  # Expected: neutral
+#             "Worst experience ever, would not recommend.",  # Expected: negative
+#             "The food was alright, but the service was slow.",  # Expected: neutral or negative
+#             "What a fantastic day! #blessed",  # Expected: positive
+#             "I hate waiting in long lines.",  # Expected: negative
+#             "@user Check out http this is cool.",  # Test preprocessing, Expected: positive or neutral
+#             "Just a normal sentence, nothing special here.",  # Expected: neutral
+#             "",  # Empty string test
+#             "This new update is absolutely terrible and full of bugs.",  # Expected: negative
+#         ]
+#         print("\nInput Comments for Direct Test:")
+#         for i, c in enumerate(sample_comments_for_testing):
+#             print(f"{i + 1}. '{c}'")
+#         # Get predictions using our main function
+#         predicted_sentiments = predict_sentiments(sample_comments_for_testing)
+#         print("\nPredicted Sentiments (Direct Test):")
+#         for i, (comment, sentiment) in enumerate(
+#             zip(sample_comments_for_testing, predicted_sentiments)
+#         ):
+#             print(
+#                 f"{i + 1}. Comment: '{comment}'\n   -> Predicted Sentiment: {sentiment}"
+#             )
+#         print("--- Direct Test Finished ---")
+#     else:
+#         print(
+#             "ERROR (predict.py - main test): Model and/or tokenizer not loaded. Cannot run direct test."
+#         )
+#         print(
+#             f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
+#         )