Spaces:

msmaje
/

bert-complain-classifier

Sleeping

App Files Files Community

msmaje committed on Aug 23, 2025

Commit

2b7e143

verified ·

1 Parent(s): 3c14fdc

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -61

app.py CHANGED Viewed

@@ -35,16 +35,16 @@ TRAINING_LOGS = []
 CURRENT_MODEL = None
 CURRENT_TOKENIZER = None
 def load_and_prepare_local_dataset(file_path, text_column, label_column, test_size=0.2):
     """Load and prepare local CSV dataset for training"""
     try:
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"Dataset file not found: {file_path}")
-        # Load the CSV file
         df = pd.read_csv(file_path)
-        # Verify required columns exist
         if text_column not in df.columns:
             available_cols = list(df.columns)
             raise ValueError(f"Text column '{text_column}' not found. Available columns: {available_cols}")
@@ -53,24 +53,19 @@ def load_and_prepare_local_dataset(file_path, text_column, label_column, test_si
             available_cols = list(df.columns)
             raise ValueError(f"Label column '{label_column}' not found. Available columns: {available_cols}")
-        # Clean the data
         df = df.dropna(subset=[text_column, label_column])
         df[text_column] = df[text_column].astype(str)
-        # Handle different label formats
         if df[label_column].dtype == 'object':
-            # If labels are text, convert to indices
             unique_labels = df[label_column].unique()
             if len(unique_labels) > len(CATEGORIES):
                 raise ValueError(f"Too many unique labels ({len(unique_labels)}). Expected max {len(CATEGORIES)}")
-            # Try to map text labels to our categories
             label_mapping = {}
             for label in unique_labels:
                 if label in category_to_idx:
                     label_mapping[label] = category_to_idx[label]
                 else:
-                    # Auto-assign if not found
                     available_indices = set(range(len(CATEGORIES))) - set(label_mapping.values())
                     if available_indices:
                         label_mapping[label] = min(available_indices)
@@ -79,14 +74,11 @@ def load_and_prepare_local_dataset(file_path, text_column, label_column, test_si
             df['label_idx'] = df[label_column].map(label_mapping)
         else:
-            # If labels are already numeric
             df['label_idx'] = df[label_column].astype(int)
-        # Verify label indices are valid
         if df['label_idx'].min() < 0 or df['label_idx'].max() >= len(CATEGORIES):
             raise ValueError(f"Label indices must be between 0 and {len(CATEGORIES)-1}")
-        # Create train/validation split
         train_df, val_df = train_test_split(
             df,
             test_size=test_size,
@@ -94,7 +86,6 @@ def load_and_prepare_local_dataset(file_path, text_column, label_column, test_si
             stratify=df['label_idx']
         )
-        # Convert to Hugging Face datasets
         train_dataset = Dataset.from_pandas(train_df[[text_column, 'label_idx']])
         val_dataset = Dataset.from_pandas(val_df[[text_column, 'label_idx']])
@@ -114,7 +105,6 @@ def preview_dataset(uploaded_file, text_column, label_column):
         if uploaded_file is None:
             return "Please upload a dataset file first."
-        # Get the file path from the uploaded file
         file_path = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
         df = pd.read_csv(file_path)
@@ -162,11 +152,9 @@ def validate_hub_model_id(username, model_name):
     if not username or not model_name:
         return None, "Please provide both username and model name"
-    # Clean up the model name
     model_name = model_name.strip().lower().replace(" ", "-")
     model_name = ''.join(c for c in model_name if c.isalnum() or c in ['-', '_'])
-    # Construct the full model ID
     hub_model_id = f"{username}/{model_name}"
     return hub_model_id, None
@@ -176,7 +164,6 @@ def load_model(model_path):
     global CURRENT_MODEL, CURRENT_TOKENIZER
     try:
-        # Try loading from local path first
         if os.path.exists(model_path):
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained(model_path)
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
@@ -185,7 +172,6 @@ def load_model(model_path):
             )
             return f"✅ Model loaded from local path: {model_path}"
-        # If local path doesn't exist, try loading from Hub
         try:
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained(model_path)
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
@@ -194,7 +180,6 @@ def load_model(model_path):
             )
             return f"✅ Model loaded from Hugging Face Hub: {model_path}"
         except Exception as hub_error:
-            # If both local and hub loading fail, fall back to base model
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained("bert-base-uncased")
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
                 "bert-base-uncased",
@@ -242,7 +227,6 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append(login_result)
         yield "\n".join(TRAINING_LOGS)
-    # Validate hub model ID if pushing to hub
     if push_to_hub:
         hub_model_id, error = validate_hub_model_id(username, model_name)
         if error:
@@ -252,7 +236,6 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
     else:
         hub_model_id = None
-    # Validate uploaded file
     if uploaded_file is None:
         TRAINING_LOGS.append("❌ Please upload a dataset file")
         yield "\n".join(TRAINING_LOGS)
@@ -261,7 +244,6 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
     dataset_file = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
     try:
-        # Load and prepare dataset
         TRAINING_LOGS.append(f"📊 Loading dataset from uploaded file...")
         yield "\n".join(TRAINING_LOGS)
@@ -274,7 +256,6 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append(f"- Validation samples: {len(dataset_dict['validation'])}")
         yield "\n".join(TRAINING_LOGS)
-        # Load model and tokenizer
         TRAINING_LOGS.append("🤖 Loading BERT model and tokenizer...")
         yield "\n".join(TRAINING_LOGS)
@@ -287,14 +268,12 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append("✅ Model and tokenizer loaded")
         yield "\n".join(TRAINING_LOGS)
-        # Tokenize datasets
         TRAINING_LOGS.append("🔤 Tokenizing datasets...")
         yield "\n".join(TRAINING_LOGS)
         def tokenize_batch(examples):
             return tokenize_function(examples, tokenizer, final_text_col, 512)
-        # Get columns to remove (keep only label column and tokenized features)
         columns_to_remove = [col for col in dataset_dict['train'].column_names if col != final_label_col]
         tokenized_datasets = dataset_dict.map(
@@ -303,17 +282,14 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
             remove_columns=columns_to_remove
         )
-        # Rename label column to 'labels' (required by Trainer)
         tokenized_datasets = tokenized_datasets.rename_column(final_label_col, 'labels')
         TRAINING_LOGS.append("✅ Tokenization completed")
         yield "\n".join(TRAINING_LOGS)
-        # Set up training
         output_dir = Path(MODEL_PATH)
         output_dir.mkdir(parents=True, exist_ok=True)
-        # Calculate steps
         total_steps = len(tokenized_datasets['train']) // batch_size * num_epochs
         eval_steps = max(10, min(100, total_steps // 4))
         save_steps = max(20, min(500, total_steps // 2))
@@ -326,7 +302,6 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append(f"- Warmup steps: {warmup_steps}")
         yield "\n".join(TRAINING_LOGS)
-        # Training arguments
         training_args = TrainingArguments(
             output_dir=str(output_dir),
             num_train_epochs=num_epochs,
@@ -353,10 +328,8 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
             remove_unused_columns=False,
         )
-        # Data collator
         data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
-        # Create trainer
         trainer = Trainer(
             model=model,
             args=training_args,
@@ -371,7 +344,6 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append("🚀 Starting training...")
         yield "\n".join(TRAINING_LOGS)
-        # Custom training loop with progress updates
         class ProgressCallback:
             def __init__(self, logs_list):
                 self.logs = logs_list
@@ -395,7 +367,6 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         progress_callback = ProgressCallback(TRAINING_LOGS)
         trainer.add_callback(progress_callback)
-        # Train the model
         try:
             trainer.train()
             TRAINING_LOGS.append("✅ Training completed successfully!")
@@ -405,21 +376,18 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
             yield "\n".join(TRAINING_LOGS)
             return
-        # Save the model
         TRAINING_LOGS.append("💾 Saving model...")
         yield "\n".join(TRAINING_LOGS)
         trainer.save_model()
         tokenizer.save_pretrained(output_dir)
-        # Update global model and tokenizer
         CURRENT_MODEL = model
         CURRENT_TOKENIZER = tokenizer
         TRAINING_LOGS.append("✅ Model saved successfully!")
         yield "\n".join(TRAINING_LOGS)
-        # Final evaluation
         TRAINING_LOGS.append("📊 Running final evaluation...")
         yield "\n".join(TRAINING_LOGS)
@@ -432,7 +400,6 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
                 else:
                     TRAINING_LOGS.append(f"  {key}: {value}")
-            # Save results
             with open(output_dir / "eval_results.json", "w") as f:
                 json.dump(eval_results, f, indent=2)
@@ -441,7 +408,6 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         yield "\n".join(TRAINING_LOGS)
-        # Push to hub if requested
         if push_to_hub and hub_model_id:
             TRAINING_LOGS.append(f"🤗 Pushing to Hugging Face Hub: {hub_model_id}")
             yield "\n".join(TRAINING_LOGS)
@@ -465,7 +431,6 @@ def predict_text(text, model_path):
     """Make a prediction on a single text input"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
-    # Load the model if it's not loaded or a different one is requested
     if CURRENT_MODEL is None or model_path != MODEL_PATH:
         load_result = load_model(model_path)
         if load_result.startswith("❌"):
@@ -475,24 +440,19 @@ def predict_text(text, model_path):
         if not text.strip():
             return "Please enter some text to classify."
-        # Check if text was truncated
         original_tokens = CURRENT_TOKENIZER(text, truncation=False)
         was_truncated = len(original_tokens['input_ids']) > 512
-        # Tokenize input
         inputs = CURRENT_TOKENIZER(text, return_tensors="pt", truncation=True, max_length=512)
-        # Make prediction
         with torch.no_grad():
             outputs = CURRENT_MODEL(**inputs)
             predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
             predicted_class_id = predictions.argmax().item()
             confidence = predictions.max().item()
-        # Get predicted category
         predicted_category = idx_to_category[predicted_class_id]
-        # Format result
         truncation_warning = "\n\n⚠️ Note: This complaint was truncated to fit BERT's 512 token limit." if was_truncated else ""
         result = []
@@ -516,21 +476,19 @@ def predict_csv(csv_file, model_path):
     """Make predictions on a CSV file with complaints"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
-    # Load the model if needed
     if CURRENT_MODEL is None or model_path != MODEL_PATH:
         load_result = load_model(model_path)
         if load_result.startswith("❌"):
-            return load_result
     try:
-        # Read the CSV file
         if hasattr(csv_file, 'name'):
             df = pd.read_csv(csv_file.name)
         else:
             df = pd.read_csv(csv_file)
         if 'complaint' not in df.columns:
-            return "❌ CSV file must have a 'complaint' column"
         results = []
         predictions_list = []
@@ -539,13 +497,11 @@ def predict_csv(csv_file, model_path):
         for i, row in enumerate(df.iterrows()):
             complaint = str(row[1]['complaint'])
-            # Check for truncation
             original_tokens = CURRENT_TOKENIZER(complaint, truncation=False)
             was_truncated = len(original_tokens['input_ids']) > 512
             if was_truncated:
                 truncated_count += 1
-            # Predict
             inputs = CURRENT_TOKENIZER(complaint, return_tensors="pt", truncation=True, max_length=512)
             with torch.no_grad():
                 outputs = CURRENT_MODEL(**inputs)
@@ -573,16 +529,15 @@ def predict_csv(csv_file, model_path):
         if truncated_count > 0:
             results.append(f"\n⚠️ {truncated_count} complaints were truncated to fit BERT's 512 token limit.")
-        # Save full results to a CSV file
         results_df = pd.DataFrame(predictions_list)
         results_file = "prediction_results.csv"
         results_df.to_csv(results_file, index=False)
         results.append(f"\n💾 Full results saved to {results_file}")
-        return "\n".join(results)
     except Exception as e:
-        return f"❌ CSV processing failed: {str(e)}"
 def push_to_hub_after_training(model_path, username, model_name, token):
     """Push a trained model to Hugging Face Hub"""
@@ -594,7 +549,6 @@ def push_to_hub_after_training(model_path, username, model_name, token):
         if error:
             return f"❌ {error}"
-        # Login and load model
         login(token)
         if not os.path.exists(model_path):
             return "❌ No trained model found. Please train a model first."
@@ -605,7 +559,6 @@ def push_to_hub_after_training(model_path, username, model_name, token):
         except Exception as e:
             return f"❌ Failed to load model: {str(e)}"
-        # Push to Hub
         try:
             model.push_to_hub(hub_model_id)
             tokenizer.push_to_hub(hub_model_id)
@@ -650,6 +603,8 @@ def display_available_datasets():
     else:
         return "No CSV files found in the current directory."
 # Initialize tokenizer on startup
 if CURRENT_TOKENIZER is None:
     try:
@@ -661,7 +616,7 @@ if CURRENT_TOKENIZER is None:
 print("🚀 Launching BERT Complaint Classifier...")
 print("📍 Available at: http://localhost:7860")
-# The entire Gradio UI definition must be within a single block
 with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app:
     gr.Markdown("# BERT Complaint Classifier 🗣️🤖")
     gr.Markdown("Fine-tune a BERT model or use an existing one to classify customer complaints.")
@@ -724,7 +679,6 @@ with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app
                 predict_btn = gr.Button("Classify Complaint", variant="primary")
                 single_prediction_output = gr.Markdown("Prediction will appear here...")
-                # Link token count to text input
                 text_input.change(count_tokens, inputs=text_input, outputs=token_count_output)
         with gr.Tab("Predict from CSV"):
@@ -740,7 +694,6 @@ with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app
                 csv_prediction_output = gr.Markdown("Predictions will appear here...")
                 download_link = gr.File(label="Download Full Predictions", interactive=False)
-        # Link prediction buttons to functions
         predict_btn.click(
             predict_text,
             inputs=[text_input, model_path_input],
@@ -797,16 +750,14 @@ with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app
                 hub_token_input_push = gr.Textbox(label="Hugging Face Token", type="password")
             push_btn = gr.Button("🚀 Push Model to Hub", variant="primary")
-            push_output = gr.verse("Results will appear here...")
-            # Link the push button
             push_btn.click(
                 push_to_hub_after_training,
                 inputs=[gr.Textbox(value=MODEL_PATH, visible=False), hub_username_input_push, hub_model_name_input_push, hub_token_input_push],
                 outputs=push_output
             )
-    # All button clicks and UI logic now correctly indented within the app block
     preview_btn.click(
         preview_dataset,
         inputs=[uploaded_file, text_column_input, label_column_input],
@@ -835,10 +786,8 @@ with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app
         outputs=available_datasets
     )
-    # Show datasets on load
     app.load(display_available_datasets, outputs=available_datasets)
-# Launch the app
 if __name__ == "__main__":
     app.launch(
         server_name="0.0.0.0",

 CURRENT_MODEL = None
 CURRENT_TOKENIZER = None
+# --- Application Logic Functions (No change needed here, they are correctly indented) ---
 def load_and_prepare_local_dataset(file_path, text_column, label_column, test_size=0.2):
     """Load and prepare local CSV dataset for training"""
     try:
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"Dataset file not found: {file_path}")
         df = pd.read_csv(file_path)
         if text_column not in df.columns:
             available_cols = list(df.columns)
             raise ValueError(f"Text column '{text_column}' not found. Available columns: {available_cols}")
             available_cols = list(df.columns)
             raise ValueError(f"Label column '{label_column}' not found. Available columns: {available_cols}")
         df = df.dropna(subset=[text_column, label_column])
         df[text_column] = df[text_column].astype(str)
         if df[label_column].dtype == 'object':
             unique_labels = df[label_column].unique()
             if len(unique_labels) > len(CATEGORIES):
                 raise ValueError(f"Too many unique labels ({len(unique_labels)}). Expected max {len(CATEGORIES)}")
             label_mapping = {}
             for label in unique_labels:
                 if label in category_to_idx:
                     label_mapping[label] = category_to_idx[label]
                 else:
                     available_indices = set(range(len(CATEGORIES))) - set(label_mapping.values())
                     if available_indices:
                         label_mapping[label] = min(available_indices)
             df['label_idx'] = df[label_column].map(label_mapping)
         else:
             df['label_idx'] = df[label_column].astype(int)
         if df['label_idx'].min() < 0 or df['label_idx'].max() >= len(CATEGORIES):
             raise ValueError(f"Label indices must be between 0 and {len(CATEGORIES)-1}")
         train_df, val_df = train_test_split(
             df,
             test_size=test_size,
             stratify=df['label_idx']
         )
         train_dataset = Dataset.from_pandas(train_df[[text_column, 'label_idx']])
         val_dataset = Dataset.from_pandas(val_df[[text_column, 'label_idx']])
         if uploaded_file is None:
             return "Please upload a dataset file first."
         file_path = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
         df = pd.read_csv(file_path)
     if not username or not model_name:
         return None, "Please provide both username and model name"
     model_name = model_name.strip().lower().replace(" ", "-")
     model_name = ''.join(c for c in model_name if c.isalnum() or c in ['-', '_'])
     hub_model_id = f"{username}/{model_name}"
     return hub_model_id, None
     global CURRENT_MODEL, CURRENT_TOKENIZER
     try:
         if os.path.exists(model_path):
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained(model_path)
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
             )
             return f"✅ Model loaded from local path: {model_path}"
         try:
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained(model_path)
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
             )
             return f"✅ Model loaded from Hugging Face Hub: {model_path}"
         except Exception as hub_error:
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained("bert-base-uncased")
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
                 "bert-base-uncased",
         TRAINING_LOGS.append(login_result)
         yield "\n".join(TRAINING_LOGS)
     if push_to_hub:
         hub_model_id, error = validate_hub_model_id(username, model_name)
         if error:
     else:
         hub_model_id = None
     if uploaded_file is None:
         TRAINING_LOGS.append("❌ Please upload a dataset file")
         yield "\n".join(TRAINING_LOGS)
     dataset_file = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
     try:
         TRAINING_LOGS.append(f"📊 Loading dataset from uploaded file...")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append(f"- Validation samples: {len(dataset_dict['validation'])}")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append("🤖 Loading BERT model and tokenizer...")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append("✅ Model and tokenizer loaded")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append("🔤 Tokenizing datasets...")
         yield "\n".join(TRAINING_LOGS)
         def tokenize_batch(examples):
             return tokenize_function(examples, tokenizer, final_text_col, 512)
         columns_to_remove = [col for col in dataset_dict['train'].column_names if col != final_label_col]
         tokenized_datasets = dataset_dict.map(
             remove_columns=columns_to_remove
         )
         tokenized_datasets = tokenized_datasets.rename_column(final_label_col, 'labels')
         TRAINING_LOGS.append("✅ Tokenization completed")
         yield "\n".join(TRAINING_LOGS)
         output_dir = Path(MODEL_PATH)
         output_dir.mkdir(parents=True, exist_ok=True)
         total_steps = len(tokenized_datasets['train']) // batch_size * num_epochs
         eval_steps = max(10, min(100, total_steps // 4))
         save_steps = max(20, min(500, total_steps // 2))
         TRAINING_LOGS.append(f"- Warmup steps: {warmup_steps}")
         yield "\n".join(TRAINING_LOGS)
         training_args = TrainingArguments(
             output_dir=str(output_dir),
             num_train_epochs=num_epochs,
             remove_unused_columns=False,
         )
         data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
         trainer = Trainer(
             model=model,
             args=training_args,
         TRAINING_LOGS.append("🚀 Starting training...")
         yield "\n".join(TRAINING_LOGS)
         class ProgressCallback:
             def __init__(self, logs_list):
                 self.logs = logs_list
         progress_callback = ProgressCallback(TRAINING_LOGS)
         trainer.add_callback(progress_callback)
         try:
             trainer.train()
             TRAINING_LOGS.append("✅ Training completed successfully!")
             yield "\n".join(TRAINING_LOGS)
             return
         TRAINING_LOGS.append("💾 Saving model...")
         yield "\n".join(TRAINING_LOGS)
         trainer.save_model()
         tokenizer.save_pretrained(output_dir)
         CURRENT_MODEL = model
         CURRENT_TOKENIZER = tokenizer
         TRAINING_LOGS.append("✅ Model saved successfully!")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append("📊 Running final evaluation...")
         yield "\n".join(TRAINING_LOGS)
                 else:
                     TRAINING_LOGS.append(f"  {key}: {value}")
             with open(output_dir / "eval_results.json", "w") as f:
                 json.dump(eval_results, f, indent=2)
         yield "\n".join(TRAINING_LOGS)
         if push_to_hub and hub_model_id:
             TRAINING_LOGS.append(f"🤗 Pushing to Hugging Face Hub: {hub_model_id}")
             yield "\n".join(TRAINING_LOGS)
     """Make a prediction on a single text input"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
     if CURRENT_MODEL is None or model_path != MODEL_PATH:
         load_result = load_model(model_path)
         if load_result.startswith("❌"):
         if not text.strip():
             return "Please enter some text to classify."
         original_tokens = CURRENT_TOKENIZER(text, truncation=False)
         was_truncated = len(original_tokens['input_ids']) > 512
         inputs = CURRENT_TOKENIZER(text, return_tensors="pt", truncation=True, max_length=512)
         with torch.no_grad():
             outputs = CURRENT_MODEL(**inputs)
             predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
             predicted_class_id = predictions.argmax().item()
             confidence = predictions.max().item()
         predicted_category = idx_to_category[predicted_class_id]
         truncation_warning = "\n\n⚠️ Note: This complaint was truncated to fit BERT's 512 token limit." if was_truncated else ""
         result = []
     """Make predictions on a CSV file with complaints"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
     if CURRENT_MODEL is None or model_path != MODEL_PATH:
         load_result = load_model(model_path)
         if load_result.startswith("❌"):
+            return load_result, None
     try:
         if hasattr(csv_file, 'name'):
             df = pd.read_csv(csv_file.name)
         else:
             df = pd.read_csv(csv_file)
         if 'complaint' not in df.columns:
+            return "❌ CSV file must have a 'complaint' column", None
         results = []
         predictions_list = []
         for i, row in enumerate(df.iterrows()):
             complaint = str(row[1]['complaint'])
             original_tokens = CURRENT_TOKENIZER(complaint, truncation=False)
             was_truncated = len(original_tokens['input_ids']) > 512
             if was_truncated:
                 truncated_count += 1
             inputs = CURRENT_TOKENIZER(complaint, return_tensors="pt", truncation=True, max_length=512)
             with torch.no_grad():
                 outputs = CURRENT_MODEL(**inputs)
         if truncated_count > 0:
             results.append(f"\n⚠️ {truncated_count} complaints were truncated to fit BERT's 512 token limit.")
         results_df = pd.DataFrame(predictions_list)
         results_file = "prediction_results.csv"
         results_df.to_csv(results_file, index=False)
         results.append(f"\n💾 Full results saved to {results_file}")
+        return "\n".join(results), results_file
     except Exception as e:
+        return f"❌ CSV processing failed: {str(e)}", None
 def push_to_hub_after_training(model_path, username, model_name, token):
     """Push a trained model to Hugging Face Hub"""
         if error:
             return f"❌ {error}"
         login(token)
         if not os.path.exists(model_path):
             return "❌ No trained model found. Please train a model first."
         except Exception as e:
             return f"❌ Failed to load model: {str(e)}"
         try:
             model.push_to_hub(hub_model_id)
             tokenizer.push_to_hub(hub_model_id)
     else:
         return "No CSV files found in the current directory."
+# --- Gradio UI Definition (Correctly structured) ---
 # Initialize tokenizer on startup
 if CURRENT_TOKENIZER is None:
     try:
 print("🚀 Launching BERT Complaint Classifier...")
 print("📍 Available at: http://localhost:7860")
+# The entire Gradio UI definition must be within this single block
 with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app:
     gr.Markdown("# BERT Complaint Classifier 🗣️🤖")
     gr.Markdown("Fine-tune a BERT model or use an existing one to classify customer complaints.")
                 predict_btn = gr.Button("Classify Complaint", variant="primary")
                 single_prediction_output = gr.Markdown("Prediction will appear here...")
                 text_input.change(count_tokens, inputs=text_input, outputs=token_count_output)
         with gr.Tab("Predict from CSV"):
                 csv_prediction_output = gr.Markdown("Predictions will appear here...")
                 download_link = gr.File(label="Download Full Predictions", interactive=False)
         predict_btn.click(
             predict_text,
             inputs=[text_input, model_path_input],
                 hub_token_input_push = gr.Textbox(label="Hugging Face Token", type="password")
             push_btn = gr.Button("🚀 Push Model to Hub", variant="primary")
+            push_output = gr.Textbox(label="Results", lines=3, interactive=False)
             push_btn.click(
                 push_to_hub_after_training,
                 inputs=[gr.Textbox(value=MODEL_PATH, visible=False), hub_username_input_push, hub_model_name_input_push, hub_token_input_push],
                 outputs=push_output
             )
     preview_btn.click(
         preview_dataset,
         inputs=[uploaded_file, text_column_input, label_column_input],
         outputs=available_datasets
     )
     app.load(display_available_datasets, outputs=available_datasets)
 if __name__ == "__main__":
     app.launch(
         server_name="0.0.0.0",