Spaces:

msmaje
/

bert-complain-classifier

Sleeping

App Files Files Community

msmaje committed on Aug 23, 2025

Commit

f3b6548

verified ·

1 Parent(s): 2b7e143

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -93

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ from sklearn.model_selection import train_test_split
 from huggingface_hub import login, HfApi
 from transformers import (
-    AutoTokenizer,
     BertForSequenceClassification,
     TrainingArguments,
     Trainer,
@@ -35,16 +35,16 @@ TRAINING_LOGS = []
 CURRENT_MODEL = None
 CURRENT_TOKENIZER = None
-# --- Application Logic Functions (No change needed here, they are correctly indented) ---
 def load_and_prepare_local_dataset(file_path, text_column, label_column, test_size=0.2):
     """Load and prepare local CSV dataset for training"""
     try:
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"Dataset file not found: {file_path}")
         df = pd.read_csv(file_path)
         if text_column not in df.columns:
             available_cols = list(df.columns)
             raise ValueError(f"Text column '{text_column}' not found. Available columns: {available_cols}")
@@ -53,19 +53,24 @@ def load_and_prepare_local_dataset(file_path, text_column, label_column, test_si
             available_cols = list(df.columns)
             raise ValueError(f"Label column '{label_column}' not found. Available columns: {available_cols}")
         df = df.dropna(subset=[text_column, label_column])
         df[text_column] = df[text_column].astype(str)
         if df[label_column].dtype == 'object':
             unique_labels = df[label_column].unique()
             if len(unique_labels) > len(CATEGORIES):
                 raise ValueError(f"Too many unique labels ({len(unique_labels)}). Expected max {len(CATEGORIES)}")
             label_mapping = {}
             for label in unique_labels:
                 if label in category_to_idx:
                     label_mapping[label] = category_to_idx[label]
                 else:
                     available_indices = set(range(len(CATEGORIES))) - set(label_mapping.values())
                     if available_indices:
                         label_mapping[label] = min(available_indices)
@@ -74,18 +79,22 @@ def load_and_prepare_local_dataset(file_path, text_column, label_column, test_si
             df['label_idx'] = df[label_column].map(label_mapping)
         else:
             df['label_idx'] = df[label_column].astype(int)
         if df['label_idx'].min() < 0 or df['label_idx'].max() >= len(CATEGORIES):
             raise ValueError(f"Label indices must be between 0 and {len(CATEGORIES)-1}")
         train_df, val_df = train_test_split(
-            df,
-            test_size=test_size,
-            random_state=42,
             stratify=df['label_idx']
         )
         train_dataset = Dataset.from_pandas(train_df[[text_column, 'label_idx']])
         val_dataset = Dataset.from_pandas(val_df[[text_column, 'label_idx']])
@@ -105,6 +114,7 @@ def preview_dataset(uploaded_file, text_column, label_column):
         if uploaded_file is None:
             return "Please upload a dataset file first."
         file_path = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
         df = pd.read_csv(file_path)
@@ -152,9 +162,11 @@ def validate_hub_model_id(username, model_name):
     if not username or not model_name:
         return None, "Please provide both username and model name"
     model_name = model_name.strip().lower().replace(" ", "-")
     model_name = ''.join(c for c in model_name if c.isalnum() or c in ['-', '_'])
     hub_model_id = f"{username}/{model_name}"
     return hub_model_id, None
@@ -164,6 +176,7 @@ def load_model(model_path):
     global CURRENT_MODEL, CURRENT_TOKENIZER
     try:
         if os.path.exists(model_path):
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained(model_path)
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
@@ -172,6 +185,7 @@ def load_model(model_path):
             )
             return f"✅ Model loaded from local path: {model_path}"
         try:
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained(model_path)
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
@@ -180,6 +194,7 @@ def load_model(model_path):
             )
             return f"✅ Model loaded from Hugging Face Hub: {model_path}"
         except Exception as hub_error:
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained("bert-base-uncased")
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
                 "bert-base-uncased",
@@ -215,7 +230,7 @@ def compute_metrics(eval_pred):
         'recall_macro': report['macro avg']['recall']
     }
-def train_model_inline(uploaded_file, text_column, label_column, num_epochs, batch_size,
                        learning_rate, hf_token, push_to_hub, username, model_name):
     """Train the model using inline training (no subprocess)"""
     global TRAINING_LOGS, MODEL_PATH, CURRENT_MODEL, CURRENT_TOKENIZER
@@ -227,6 +242,7 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append(login_result)
         yield "\n".join(TRAINING_LOGS)
     if push_to_hub:
         hub_model_id, error = validate_hub_model_id(username, model_name)
         if error:
@@ -236,6 +252,7 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
     else:
         hub_model_id = None
     if uploaded_file is None:
         TRAINING_LOGS.append("❌ Please upload a dataset file")
         yield "\n".join(TRAINING_LOGS)
@@ -244,6 +261,7 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
     dataset_file = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
     try:
         TRAINING_LOGS.append(f"📊 Loading dataset from uploaded file...")
         yield "\n".join(TRAINING_LOGS)
@@ -256,6 +274,7 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append(f"- Validation samples: {len(dataset_dict['validation'])}")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append("🤖 Loading BERT model and tokenizer...")
         yield "\n".join(TRAINING_LOGS)
@@ -268,12 +287,14 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append("✅ Model and tokenizer loaded")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append("🔤 Tokenizing datasets...")
         yield "\n".join(TRAINING_LOGS)
         def tokenize_batch(examples):
             return tokenize_function(examples, tokenizer, final_text_col, 512)
         columns_to_remove = [col for col in dataset_dict['train'].column_names if col != final_label_col]
         tokenized_datasets = dataset_dict.map(
@@ -282,14 +303,17 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
             remove_columns=columns_to_remove
         )
         tokenized_datasets = tokenized_datasets.rename_column(final_label_col, 'labels')
         TRAINING_LOGS.append("✅ Tokenization completed")
         yield "\n".join(TRAINING_LOGS)
         output_dir = Path(MODEL_PATH)
         output_dir.mkdir(parents=True, exist_ok=True)
         total_steps = len(tokenized_datasets['train']) // batch_size * num_epochs
         eval_steps = max(10, min(100, total_steps // 4))
         save_steps = max(20, min(500, total_steps // 2))
@@ -302,6 +326,7 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append(f"- Warmup steps: {warmup_steps}")
         yield "\n".join(TRAINING_LOGS)
         training_args = TrainingArguments(
             output_dir=str(output_dir),
             num_train_epochs=num_epochs,
@@ -328,8 +353,10 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
             remove_unused_columns=False,
         )
         data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
         trainer = Trainer(
             model=model,
             args=training_args,
@@ -344,6 +371,7 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         TRAINING_LOGS.append("🚀 Starting training...")
         yield "\n".join(TRAINING_LOGS)
         class ProgressCallback:
             def __init__(self, logs_list):
                 self.logs = logs_list
@@ -367,6 +395,7 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         progress_callback = ProgressCallback(TRAINING_LOGS)
         trainer.add_callback(progress_callback)
         try:
             trainer.train()
             TRAINING_LOGS.append("✅ Training completed successfully!")
@@ -376,18 +405,21 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
             yield "\n".join(TRAINING_LOGS)
             return
         TRAINING_LOGS.append("💾 Saving model...")
         yield "\n".join(TRAINING_LOGS)
         trainer.save_model()
         tokenizer.save_pretrained(output_dir)
         CURRENT_MODEL = model
         CURRENT_TOKENIZER = tokenizer
         TRAINING_LOGS.append("✅ Model saved successfully!")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append("📊 Running final evaluation...")
         yield "\n".join(TRAINING_LOGS)
@@ -400,6 +432,7 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
                 else:
                     TRAINING_LOGS.append(f"  {key}: {value}")
             with open(output_dir / "eval_results.json", "w") as f:
                 json.dump(eval_results, f, indent=2)
@@ -408,6 +441,7 @@ def train_model_inline(uploaded_file, text_column, label_column, num_epochs, bat
         yield "\n".join(TRAINING_LOGS)
         if push_to_hub and hub_model_id:
             TRAINING_LOGS.append(f"🤗 Pushing to Hugging Face Hub: {hub_model_id}")
             yield "\n".join(TRAINING_LOGS)
@@ -431,6 +465,7 @@ def predict_text(text, model_path):
     """Make a prediction on a single text input"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
     if CURRENT_MODEL is None or model_path != MODEL_PATH:
         load_result = load_model(model_path)
         if load_result.startswith("❌"):
@@ -440,19 +475,24 @@ def predict_text(text, model_path):
         if not text.strip():
             return "Please enter some text to classify."
         original_tokens = CURRENT_TOKENIZER(text, truncation=False)
         was_truncated = len(original_tokens['input_ids']) > 512
         inputs = CURRENT_TOKENIZER(text, return_tensors="pt", truncation=True, max_length=512)
         with torch.no_grad():
             outputs = CURRENT_MODEL(**inputs)
             predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
             predicted_class_id = predictions.argmax().item()
             confidence = predictions.max().item()
         predicted_category = idx_to_category[predicted_class_id]
         truncation_warning = "\n\n⚠️ Note: This complaint was truncated to fit BERT's 512 token limit." if was_truncated else ""
         result = []
@@ -476,12 +516,14 @@ def predict_csv(csv_file, model_path):
     """Make predictions on a CSV file with complaints"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
     if CURRENT_MODEL is None or model_path != MODEL_PATH:
         load_result = load_model(model_path)
         if load_result.startswith("❌"):
             return load_result, None
     try:
         if hasattr(csv_file, 'name'):
             df = pd.read_csv(csv_file.name)
         else:
@@ -497,11 +539,13 @@ def predict_csv(csv_file, model_path):
         for i, row in enumerate(df.iterrows()):
             complaint = str(row[1]['complaint'])
             original_tokens = CURRENT_TOKENIZER(complaint, truncation=False)
             was_truncated = len(original_tokens['input_ids']) > 512
             if was_truncated:
                 truncated_count += 1
             inputs = CURRENT_TOKENIZER(complaint, return_tensors="pt", truncation=True, max_length=512)
             with torch.no_grad():
                 outputs = CURRENT_MODEL(**inputs)
@@ -529,6 +573,7 @@ def predict_csv(csv_file, model_path):
         if truncated_count > 0:
             results.append(f"\n⚠️ {truncated_count} complaints were truncated to fit BERT's 512 token limit.")
         results_df = pd.DataFrame(predictions_list)
         results_file = "prediction_results.csv"
         results_df.to_csv(results_file, index=False)
@@ -549,6 +594,7 @@ def push_to_hub_after_training(model_path, username, model_name, token):
         if error:
             return f"❌ {error}"
         login(token)
         if not os.path.exists(model_path):
             return "❌ No trained model found. Please train a model first."
@@ -559,6 +605,7 @@ def push_to_hub_after_training(model_path, username, model_name, token):
         except Exception as e:
             return f"❌ Failed to load model: {str(e)}"
         try:
             model.push_to_hub(hub_model_id)
             tokenizer.push_to_hub(hub_model_id)
@@ -603,8 +650,6 @@ def display_available_datasets():
     else:
         return "No CSV files found in the current directory."
-# --- Gradio UI Definition (Correctly structured) ---
 # Initialize tokenizer on startup
 if CURRENT_TOKENIZER is None:
     try:
@@ -616,7 +661,7 @@ if CURRENT_TOKENIZER is None:
 print("🚀 Launching BERT Complaint Classifier...")
 print("📍 Available at: http://localhost:7860")
-# The entire Gradio UI definition must be within this single block
 with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app:
     gr.Markdown("# BERT Complaint Classifier 🗣️🤖")
     gr.Markdown("Fine-tune a BERT model or use an existing one to classify customer complaints.")
@@ -666,98 +711,86 @@ with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app
             with gr.Column(variant="panel"):
                 gr.Markdown("### Classify a Single Complaint")
-                model_path_input = gr.Textbox(
-                    label="Model Path or Hub ID",
-                    value="bert-base-uncased",
-                    placeholder="e.g., local-model or your_username/your_model"
-                )
-                with gr.Row():
-                    text_input = gr.Textbox(label="Complaint Text", lines=3)
-                    token_count_output = gr.Markdown("Token count: 0/512")
-                predict_btn = gr.Button("Classify Complaint", variant="primary")
-                single_prediction_output = gr.Markdown("Prediction will appear here...")
-                text_input.change(count_tokens, inputs=text_input, outputs=token_count_output)
-        with gr.Tab("Predict from CSV"):
             with gr.Column(variant="panel"):
-                gr.Markdown("### Classify Complaints from a CSV File")
-                csv_file_input = gr.File(label="Upload CSV File (with 'complaint' column)")
-                csv_model_path = gr.Textbox(
-                    label="Model Path or Hub ID",
-                    value="local-model",
-                    placeholder="e.g., local-model or your_username/your_model"
-                )
-                csv_predict_btn = gr.Button("Run Predictions on CSV", variant="primary")
-                csv_prediction_output = gr.Markdown("Predictions will appear here...")
-                download_link = gr.File(label="Download Full Predictions", interactive=False)
-        predict_btn.click(
-            predict_text,
-            inputs=[text_input, model_path_input],
-            outputs=single_prediction_output
-        )
-        csv_predict_btn.click(
-            predict_csv,
-            inputs=[csv_file_input, csv_model_path],
-            outputs=[csv_prediction_output, download_link]
-        )
-    with gr.Tab("Tools"):
-        gr.Markdown("## 🔧 Tools")
-        gr.Markdown("Utilities for managing datasets and models.")
-        with gr.Accordion("Dataset Information"):
-            available_datasets = gr.Markdown("No CSV files found in the current directory.")
-            refresh_datasets_btn = gr.Button("🔄 Refresh Available Datasets")
-            gr.Markdown("### Dataset Format Requirements")
-            gr.Markdown("""
-            **For training, your CSV file should have:**
-            - A text column containing the complaint text (default name: 'complaint')
-            - A label column containing categories (default name: 'category')
-            **Supported label formats:**
-            - Text labels: 'Online-Safety', 'BroadBand', 'TV-Radio'
-            - Numeric labels: 0, 1, 2 (corresponding to the categories above)
-            **Example CSV structure:**
-            ```
-            complaint,category
-            "My internet is slow",BroadBand
-            "Blocked website access",Online-Safety
-            "Poor TV signal",TV-Radio
-            ```
-            """)
-            gr.Markdown("### Model Categories")
-            categories_info = f"""
-            **The model classifies complaints into these categories:**
-            | Index | Category | Description |
-            |-------|----------|-------------|
-            | 0 | Online-Safety | Internet safety, content filtering, cybersecurity issues |
-            | 1 | BroadBand | Internet connectivity, speed, network problems |
-            | 2 | TV-Radio | Television and radio broadcasting, signal quality issues |
-            """
-            gr.Markdown(categories_info)
-        with gr.Accordion("Push Local Model to Hub"):
-            gr.Markdown("Use this to manually push a locally trained model (`./local-model`) to the Hub.")
-            with gr.Row():
-                hub_username_input_push = gr.Textbox(label="Hugging Face Username")
-                hub_model_name_input_push = gr.Textbox(label="Model Name")
-                hub_token_input_push = gr.Textbox(label="Hugging Face Token", type="password")
-            push_btn = gr.Button("🚀 Push Model to Hub", variant="primary")
-            push_output = gr.Textbox(label="Results", lines=3, interactive=False)
-            push_btn.click(
-                push_to_hub_after_training,
-                inputs=[gr.Textbox(value=MODEL_PATH, visible=False), hub_username_input_push, hub_model_name_input_push, hub_token_input_push],
-                outputs=push_output
-            )
     preview_btn.click(
         preview_dataset,
         inputs=[uploaded_file, text_column_input, label_column_input],
@@ -781,13 +814,51 @@ with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app
         outputs=training_log_output,
     )
-    refresh_datasets_btn.click(
         display_available_datasets,
-        outputs=available_datasets
     )
-    app.load(display_available_datasets, outputs=available_datasets)
 if __name__ == "__main__":
     app.launch(
         server_name="0.0.0.0",

 from huggingface_hub import login, HfApi
 from transformers import (
+    AutoTokenizer,
     BertForSequenceClassification,
     TrainingArguments,
     Trainer,
 CURRENT_MODEL = None
 CURRENT_TOKENIZER = None
 def load_and_prepare_local_dataset(file_path, text_column, label_column, test_size=0.2):
     """Load and prepare local CSV dataset for training"""
     try:
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"Dataset file not found: {file_path}")
+        # Load the CSV file
         df = pd.read_csv(file_path)
+        # Verify required columns exist
         if text_column not in df.columns:
             available_cols = list(df.columns)
             raise ValueError(f"Text column '{text_column}' not found. Available columns: {available_cols}")
             available_cols = list(df.columns)
             raise ValueError(f"Label column '{label_column}' not found. Available columns: {available_cols}")
+        # Clean the data
         df = df.dropna(subset=[text_column, label_column])
         df[text_column] = df[text_column].astype(str)
+        # Handle different label formats
         if df[label_column].dtype == 'object':
+            # If labels are text, convert to indices
             unique_labels = df[label_column].unique()
             if len(unique_labels) > len(CATEGORIES):
                 raise ValueError(f"Too many unique labels ({len(unique_labels)}). Expected max {len(CATEGORIES)}")
+            # Try to map text labels to our categories
             label_mapping = {}
             for label in unique_labels:
                 if label in category_to_idx:
                     label_mapping[label] = category_to_idx[label]
                 else:
+                    # Auto-assign if not found
                     available_indices = set(range(len(CATEGORIES))) - set(label_mapping.values())
                     if available_indices:
                         label_mapping[label] = min(available_indices)
             df['label_idx'] = df[label_column].map(label_mapping)
         else:
+            # If labels are already numeric
             df['label_idx'] = df[label_column].astype(int)
+        # Verify label indices are valid
         if df['label_idx'].min() < 0 or df['label_idx'].max() >= len(CATEGORIES):
             raise ValueError(f"Label indices must be between 0 and {len(CATEGORIES)-1}")
+        # Create train/validation split
         train_df, val_df = train_test_split(
+            df,
+            test_size=test_size,
+            random_state=42,
             stratify=df['label_idx']
         )
+        # Convert to Hugging Face datasets
         train_dataset = Dataset.from_pandas(train_df[[text_column, 'label_idx']])
         val_dataset = Dataset.from_pandas(val_df[[text_column, 'label_idx']])
         if uploaded_file is None:
             return "Please upload a dataset file first."
+        # Get the file path from the uploaded file
         file_path = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
         df = pd.read_csv(file_path)
     if not username or not model_name:
         return None, "Please provide both username and model name"
+    # Clean up the model name
     model_name = model_name.strip().lower().replace(" ", "-")
     model_name = ''.join(c for c in model_name if c.isalnum() or c in ['-', '_'])
+    # Construct the full model ID
     hub_model_id = f"{username}/{model_name}"
     return hub_model_id, None
     global CURRENT_MODEL, CURRENT_TOKENIZER
     try:
+        # Try loading from local path first
         if os.path.exists(model_path):
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained(model_path)
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
             )
             return f"✅ Model loaded from local path: {model_path}"
+        # If local path doesn't exist, try loading from Hub
         try:
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained(model_path)
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
             )
             return f"✅ Model loaded from Hugging Face Hub: {model_path}"
         except Exception as hub_error:
+            # If both local and hub loading fail, fall back to base model
             CURRENT_TOKENIZER = AutoTokenizer.from_pretrained("bert-base-uncased")
             CURRENT_MODEL = BertForSequenceClassification.from_pretrained(
                 "bert-base-uncased",
         'recall_macro': report['macro avg']['recall']
     }
+def train_model_inline(uploaded_file, text_column, label_column, num_epochs, batch_size,
                        learning_rate, hf_token, push_to_hub, username, model_name):
     """Train the model using inline training (no subprocess)"""
     global TRAINING_LOGS, MODEL_PATH, CURRENT_MODEL, CURRENT_TOKENIZER
         TRAINING_LOGS.append(login_result)
         yield "\n".join(TRAINING_LOGS)
+    # Validate hub model ID if pushing to hub
     if push_to_hub:
         hub_model_id, error = validate_hub_model_id(username, model_name)
         if error:
     else:
         hub_model_id = None
+    # Validate uploaded file
     if uploaded_file is None:
         TRAINING_LOGS.append("❌ Please upload a dataset file")
         yield "\n".join(TRAINING_LOGS)
     dataset_file = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
     try:
+        # Load and prepare dataset
         TRAINING_LOGS.append(f"📊 Loading dataset from uploaded file...")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append(f"- Validation samples: {len(dataset_dict['validation'])}")
         yield "\n".join(TRAINING_LOGS)
+        # Load model and tokenizer
         TRAINING_LOGS.append("🤖 Loading BERT model and tokenizer...")
         yield "\n".join(TRAINING_LOGS)
         TRAINING_LOGS.append("✅ Model and tokenizer loaded")
         yield "\n".join(TRAINING_LOGS)
+        # Tokenize datasets
         TRAINING_LOGS.append("🔤 Tokenizing datasets...")
         yield "\n".join(TRAINING_LOGS)
         def tokenize_batch(examples):
             return tokenize_function(examples, tokenizer, final_text_col, 512)
+        # Get columns to remove (keep only label column and tokenized features)
         columns_to_remove = [col for col in dataset_dict['train'].column_names if col != final_label_col]
         tokenized_datasets = dataset_dict.map(
             remove_columns=columns_to_remove
         )
+        # Rename label column to 'labels' (required by Trainer)
         tokenized_datasets = tokenized_datasets.rename_column(final_label_col, 'labels')
         TRAINING_LOGS.append("✅ Tokenization completed")
         yield "\n".join(TRAINING_LOGS)
+        # Set up training
         output_dir = Path(MODEL_PATH)
         output_dir.mkdir(parents=True, exist_ok=True)
+        # Calculate steps
         total_steps = len(tokenized_datasets['train']) // batch_size * num_epochs
         eval_steps = max(10, min(100, total_steps // 4))
         save_steps = max(20, min(500, total_steps // 2))
         TRAINING_LOGS.append(f"- Warmup steps: {warmup_steps}")
         yield "\n".join(TRAINING_LOGS)
+        # Training arguments
         training_args = TrainingArguments(
             output_dir=str(output_dir),
             num_train_epochs=num_epochs,
             remove_unused_columns=False,
         )
+        # Data collator
         data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+        # Create trainer
         trainer = Trainer(
             model=model,
             args=training_args,
         TRAINING_LOGS.append("🚀 Starting training...")
         yield "\n".join(TRAINING_LOGS)
+        # Custom training loop with progress updates
         class ProgressCallback:
             def __init__(self, logs_list):
                 self.logs = logs_list
         progress_callback = ProgressCallback(TRAINING_LOGS)
         trainer.add_callback(progress_callback)
+        # Train the model
         try:
             trainer.train()
             TRAINING_LOGS.append("✅ Training completed successfully!")
             yield "\n".join(TRAINING_LOGS)
             return
+        # Save the model
         TRAINING_LOGS.append("💾 Saving model...")
         yield "\n".join(TRAINING_LOGS)
         trainer.save_model()
         tokenizer.save_pretrained(output_dir)
+        # Update global model and tokenizer
         CURRENT_MODEL = model
         CURRENT_TOKENIZER = tokenizer
         TRAINING_LOGS.append("✅ Model saved successfully!")
         yield "\n".join(TRAINING_LOGS)
+        # Final evaluation
         TRAINING_LOGS.append("📊 Running final evaluation...")
         yield "\n".join(TRAINING_LOGS)
                 else:
                     TRAINING_LOGS.append(f"  {key}: {value}")
+            # Save results
             with open(output_dir / "eval_results.json", "w") as f:
                 json.dump(eval_results, f, indent=2)
         yield "\n".join(TRAINING_LOGS)
+        # Push to hub if requested
         if push_to_hub and hub_model_id:
             TRAINING_LOGS.append(f"🤗 Pushing to Hugging Face Hub: {hub_model_id}")
             yield "\n".join(TRAINING_LOGS)
     """Make a prediction on a single text input"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
+    # Load the model if it's not loaded or a different one is requested
     if CURRENT_MODEL is None or model_path != MODEL_PATH:
         load_result = load_model(model_path)
         if load_result.startswith("❌"):
         if not text.strip():
             return "Please enter some text to classify."
+        # Check if text was truncated
         original_tokens = CURRENT_TOKENIZER(text, truncation=False)
         was_truncated = len(original_tokens['input_ids']) > 512
+        # Tokenize input
         inputs = CURRENT_TOKENIZER(text, return_tensors="pt", truncation=True, max_length=512)
+        # Make prediction
         with torch.no_grad():
             outputs = CURRENT_MODEL(**inputs)
             predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
             predicted_class_id = predictions.argmax().item()
             confidence = predictions.max().item()
+        # Get predicted category
         predicted_category = idx_to_category[predicted_class_id]
+        # Format result
         truncation_warning = "\n\n⚠️ Note: This complaint was truncated to fit BERT's 512 token limit." if was_truncated else ""
         result = []
     """Make predictions on a CSV file with complaints"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
+    # Load the model if needed
     if CURRENT_MODEL is None or model_path != MODEL_PATH:
         load_result = load_model(model_path)
         if load_result.startswith("❌"):
             return load_result, None
     try:
+        # Read the CSV file
         if hasattr(csv_file, 'name'):
             df = pd.read_csv(csv_file.name)
         else:
         for i, row in enumerate(df.iterrows()):
             complaint = str(row[1]['complaint'])
+            # Check for truncation
             original_tokens = CURRENT_TOKENIZER(complaint, truncation=False)
             was_truncated = len(original_tokens['input_ids']) > 512
             if was_truncated:
                 truncated_count += 1
+            # Predict
             inputs = CURRENT_TOKENIZER(complaint, return_tensors="pt", truncation=True, max_length=512)
             with torch.no_grad():
                 outputs = CURRENT_MODEL(**inputs)
         if truncated_count > 0:
             results.append(f"\n⚠️ {truncated_count} complaints were truncated to fit BERT's 512 token limit.")
+        # Save full results to a CSV file
         results_df = pd.DataFrame(predictions_list)
         results_file = "prediction_results.csv"
         results_df.to_csv(results_file, index=False)
         if error:
             return f"❌ {error}"
+        # Login and load model
         login(token)
         if not os.path.exists(model_path):
             return "❌ No trained model found. Please train a model first."
         except Exception as e:
             return f"❌ Failed to load model: {str(e)}"
+        # Push to Hub
         try:
             model.push_to_hub(hub_model_id)
             tokenizer.push_to_hub(hub_model_id)
     else:
         return "No CSV files found in the current directory."
 # Initialize tokenizer on startup
 if CURRENT_TOKENIZER is None:
     try:
 print("🚀 Launching BERT Complaint Classifier...")
 print("📍 Available at: http://localhost:7860")
+# The entire Gradio UI definition must be within a single block
 with gr.Blocks(title="BERT Complaint Classifier", theme=gr.themes.Soft()) as app:
     gr.Markdown("# BERT Complaint Classifier 🗣️🤖")
     gr.Markdown("Fine-tune a BERT model or use an existing one to classify customer complaints.")
             with gr.Column(variant="panel"):  # panel with the model loader and the single-complaint inputs
                 gr.Markdown("### Classify a Single Complaint")
+                model_path_input = gr.Textbox(label="Model Path", value=MODEL_PATH, placeholder="Enter model path or HuggingFace model ID")  # passed to both load_model and predict_text
+                load_model_btn = gr.Button("Load Model")
+                model_status = gr.Textbox(label="Model Status", interactive=False)  # display-only; filled from load_model's return value
+                gr.Markdown("---")
+                text_input = gr.Textbox(
+                    label="Enter complaint text",
+                    lines=3,
+                    placeholder="Type your complaint here..."
+                )
+                token_counter = gr.Textbox(label="Token Count", interactive=False, value="Enter text to see token count")  # display-only; rewritten by count_tokens on every text_input change
+                predict_btn = gr.Button("🔮 Predict Category", variant="primary")
+            prediction_output = gr.Markdown("Prediction results will appear here")  # outside the column; receives predict_text's return value
+        with gr.Tab("Predict CSV File"):
             with gr.Column(variant="panel"):
+                gr.Markdown("### Classify Multiple Complaints from CSV")
+                gr.Markdown("Upload a CSV file with a 'complaint' column to classify multiple complaints at once.")
+                csv_model_path = gr.Textbox(label="Model Path", value=MODEL_PATH, placeholder="Enter model path or HuggingFace model ID")
+                csv_load_btn = gr.Button("Load Model")
+                csv_model_status = gr.Textbox(label="Model Status", interactive=False)
+                gr.Markdown("---")
+                csv_file_input = gr.File(label="Upload CSV File", type="filepath", file_types=["csv"])
+                csv_predict_btn = gr.Button("🔮 Predict All", variant="primary")
+            csv_prediction_output = gr.Markdown("CSV prediction results will appear here")
+            csv_download = gr.File(label="Download Results", interactive=False)
+    with gr.Tab("Push to Hub"):  # tab whose four text inputs are handed to push_to_hub_after_training
+        gr.Markdown("## 🤗 Push Trained Model to Hugging Face Hub")
+        gr.Markdown("Upload your locally trained model to the Hugging Face Hub for sharing.")
+        with gr.Column(variant="panel"):
+            hub_model_path = gr.Textbox(label="Local Model Path", value=MODEL_PATH)  # defaults to the module-level MODEL_PATH
+            hub_username = gr.Textbox(label="Hugging Face Username")
+            hub_model_name = gr.Textbox(label="Model Name", value="bert-complaint-classifier")
+            hub_token = gr.Textbox(label="Hugging Face Token", type="password")  # password-type textbox so the token is not shown in clear text
+            push_hub_btn = gr.Button("🚀 Push to Hub", variant="primary")
+        push_hub_output = gr.Markdown("Push results will appear here")  # outside the column; receives the handler's return value
+    with gr.Tab("Dataset Info"):
+        gr.Markdown("## 📊 Dataset Information")
+        gr.Markdown("View information about available datasets and model categories.")
+        with gr.Column(variant="panel"):
+            gr.Markdown("### 🎯 Model Categories")
+            categories_info = gr.Markdown(f"**Available Categories:**\n\n" + "\n".join([f"- **{cat}** (index: {idx})" for idx, cat in idx_to_category.items()]))
+            gr.Markdown("---")
+            gr.Markdown("### 📁 Available Datasets")
+            datasets_btn = gr.Button("🔍 Scan for CSV Files")
+            datasets_info = gr.Markdown("Click 'Scan for CSV Files' to see available datasets")
+            gr.Markdown("---")
+            gr.Markdown("### 💡 Tips")
+            gr.Markdown("""
+            **Dataset Format:**
+            - CSV file with at least two columns
+            - One column for text (complaints)
+            - One column for labels/categories
+            - Labels can be text (will be auto-mapped) or numeric indices (0, 1, 2)
+            **Training Tips:**
+            - Start with 3 epochs and adjust based on results
+            - Use batch size 8-16 for most datasets
+            - Learning rate 2e-5 works well for BERT fine-tuning
+            - Enable early stopping to prevent overfitting
+            """)
+    # Event wiring: each listener calls its handler with the current values of `inputs` and writes the return value(s) into `outputs`
     preview_btn.click(  # dataset preview is written into training_log_output
         preview_dataset,
         inputs=[uploaded_file, text_column_input, label_column_input],
         outputs=training_log_output,
     )
+    load_model_btn.click(  # single-complaint panel: load_model receives the model path/ID textbox value
+        load_model,
+        inputs=model_path_input,
+        outputs=model_status
+    )
+    predict_btn.click(  # predict_text receives the complaint text and the model path, in that order
+        predict_text,
+        inputs=[text_input, model_path_input],
+        outputs=prediction_output
+    )
+    text_input.change(  # runs count_tokens on every change of the complaint textbox
+        count_tokens,
+        inputs=text_input,
+        outputs=token_counter
+    )
+    csv_load_btn.click(  # CSV tab reuses load_model with its own path and status widgets
+        load_model,
+        inputs=csv_model_path,
+        outputs=csv_model_status
+    )
+    csv_predict_btn.click(  # predict_csv must return two values: one for the Markdown summary, one for the download file slot
+        predict_csv,
+        inputs=[csv_file_input, csv_model_path],
+        outputs=[csv_prediction_output, csv_download]
+    )
+    push_hub_btn.click(  # arguments are passed positionally: model path, username, model name, token
+        push_to_hub_after_training,
+        inputs=[hub_model_path, hub_username, hub_model_name, hub_token],
+        outputs=push_hub_output
+    )
+    datasets_btn.click(  # no inputs: display_available_datasets is called without arguments
         display_available_datasets,
+        outputs=datasets_info
     )
+    # Also fill the dataset list once when the page loads, using the same handler as the scan button
+    app.load(display_available_datasets, outputs=datasets_info)
+# Launch the Gradio app
 if __name__ == "__main__":
     app.launch(
         server_name="0.0.0.0",