Spaces:

msmaje
/

bert-complain-classifier

Sleeping

App Files Files Community

msmaje committed on Aug 23, 2025

Commit

d58a542

verified ·

1 Parent(s): dfd51d1

Update app.py

Browse files

Files changed (1) hide show

app.py +403 -384

app.py CHANGED Viewed

@@ -2,13 +2,28 @@ import gradio as gr
 import torch
 import pandas as pd
 import os
-import tempfile
-import time
-import subprocess
 import json
 from huggingface_hub import login, HfApi
-from transformers import AutoTokenizer, BertForSequenceClassification
-from datasets import load_dataset, Dataset, DatasetDict
 # Global variables
 MODEL_PATH = "local-model"
@@ -20,38 +35,6 @@ TRAINING_LOGS = []
 CURRENT_MODEL = None
 CURRENT_TOKENIZER = None
-# Local data files
-LOCAL_DATA_FILES = [
-    "merged-test-data.csv",
-    "test-category.csv",
-    "test-complaint.csv"
-]
-def get_available_datasets():
-    """Get list of available local datasets"""
-    available_files = []
-    for file in LOCAL_DATA_FILES:
-        if os.path.exists(file):
-            try:
-                df = pd.read_csv(file)
-                available_files.append(f"{file} ({len(df)} rows)")
-            except Exception as e:
-                available_files.append(f"{file} (Error: {str(e)})")
-        else:
-            available_files.append(f"{file} (Not found)")
-    # Also check for any other CSV files in the directory
-    for file in os.listdir("."):
-        if file.endswith(".csv") and file not in LOCAL_DATA_FILES:
-            if os.path.exists(file):
-                try:
-                    df = pd.read_csv(file)
-                    available_files.append(f"{file} ({len(df)} rows)")
-                except:
-                    available_files.append(f"{file} (Error reading)")
-    return available_files
 def load_and_prepare_local_dataset(file_path, text_column, label_column, test_size=0.2):
     """Load and prepare local CSV dataset for training"""
     try:
@@ -104,8 +87,6 @@ def load_and_prepare_local_dataset(file_path, text_column, label_column, test_si
             raise ValueError(f"Label indices must be between 0 and {len(CATEGORIES)-1}")
         # Create train/validation split
-        from sklearn.model_selection import train_test_split
         train_df, val_df = train_test_split(
             df,
             test_size=test_size,
@@ -224,6 +205,262 @@ def load_model(model_path):
     except Exception as e:
         return f"❌ Failed to load model: {str(e)}"
 def predict_text(text, model_path):
     """Make a prediction on a single text input"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
@@ -235,25 +472,45 @@ def predict_text(text, model_path):
             return load_result
     try:
         # Tokenize input
         inputs = CURRENT_TOKENIZER(text, return_tensors="pt", truncation=True, max_length=512)
         # Make prediction
         with torch.no_grad():
             outputs = CURRENT_MODEL(**inputs)
-            predicted_idx = outputs.logits.argmax().item()
-        # Get category from index
-        predicted_category = idx_to_category[predicted_idx]
-        # Check if text was truncated
-        original_tokens = CURRENT_TOKENIZER(text, truncation=False)
-        was_truncated = len(original_tokens['input_ids']) > 512
         truncation_warning = "\n\n⚠️ Note: This complaint was truncated to fit BERT's 512 token limit." if was_truncated else ""
-        return f"Complaint: {text}\n\nPredicted Category: {predicted_category}{truncation_warning}"
     except Exception as e:
-        return f"❌ Prediction failed: {str(e)}"
 def predict_csv(csv_file, model_path):
     """Make predictions on a CSV file with complaints"""
@@ -276,6 +533,7 @@ def predict_csv(csv_file, model_path):
             return "❌ CSV file must have a 'complaint' column"
         results = []
         truncated_count = 0
         for i, row in enumerate(df.iterrows()):
@@ -291,13 +549,22 @@ def predict_csv(csv_file, model_path):
             inputs = CURRENT_TOKENIZER(complaint, return_tensors="pt", truncation=True, max_length=512)
             with torch.no_grad():
                 outputs = CURRENT_MODEL(**inputs)
-                predicted_idx = outputs.logits.argmax().item()
             predicted_category = idx_to_category[predicted_idx]
             truncation_mark = " ⚠️" if was_truncated else ""
             preview = complaint if len(complaint) <= 50 else complaint[:47] + "..."
-            results.append(f"Complaint {i+1}{truncation_mark}: {preview}\nPredicted Category: {predicted_category}\n")
             if i >= 19:
                 results.append(f"... and {len(df) - 20} more (showing first 20 out of {len(df)} complaints)")
@@ -306,141 +573,17 @@ def predict_csv(csv_file, model_path):
         if truncated_count > 0:
             results.append(f"\n⚠️ {truncated_count} complaints were truncated to fit BERT's 512 token limit.")
         return "\n".join(results)
     except Exception as e:
         return f"❌ CSV processing failed: {str(e)}"
-def train_model(uploaded_file, text_column, label_column, num_epochs, batch_size,
-                learning_rate, hf_token, push_to_hub, username, model_name):
-    """Start the model training process with local data"""
-    global TRAINING_LOGS, MODEL_PATH
-    TRAINING_LOGS = []  # Reset logs at the start of training
-    if hf_token:
-        login_result = login_to_hf(hf_token)
-        TRAINING_LOGS.append(login_result)
-        yield "\n".join(TRAINING_LOGS)
-    # Validate hub model ID if pushing to hub
-    if push_to_hub:
-        hub_model_id, error = validate_hub_model_id(username, model_name)
-        if error:
-            TRAINING_LOGS.append(f"❌ {error}")
-            yield "\n".join(TRAINING_LOGS)
-            return
-    else:
-        hub_model_id = None
-    # Validate uploaded file
-    if uploaded_file is None:
-        TRAINING_LOGS.append("❌ Please upload a dataset file")
-        yield "\n".join(TRAINING_LOGS)
-        return
-    # Get the file path from the uploaded file
-    dataset_file = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
-    try:
-        # Load and prepare the dataset
-        TRAINING_LOGS.append(f"📊 Loading dataset from uploaded file...")
-        yield "\n".join(TRAINING_LOGS)
-        dataset_dict, final_text_col, final_label_col = load_and_prepare_local_dataset(
-            dataset_file, text_column, label_column
-        )
-        TRAINING_LOGS.append(f"✅ Dataset loaded successfully!")
-        TRAINING_LOGS.append(f"- Train samples: {len(dataset_dict['train'])}")
-        TRAINING_LOGS.append(f"- Validation samples: {len(dataset_dict['validation'])}")
-        yield "\n".join(TRAINING_LOGS)
-        # Save dataset temporarily for the training script
-        temp_dataset_path = "temp_dataset"
-        os.makedirs(temp_dataset_path, exist_ok=True)
-        dataset_dict.save_to_disk(temp_dataset_path)
-        TRAINING_LOGS.append("💾 Dataset prepared for training...")
-        yield "\n".join(TRAINING_LOGS)
-    except Exception as e:
-        TRAINING_LOGS.append(f"❌ Error preparing dataset: {str(e)}")
-        yield "\n".join(TRAINING_LOGS)
-        return
-    # Create training command for local dataset
-    cmd = [
-        "python", "bert_finetune.py",
-        "--dataset_path", temp_dataset_path,  # Use local path instead of HF dataset name
-        "--model_id", "bert-base-uncased",
-        "--output_dir", MODEL_PATH,
-        "--feature_column", final_text_col,
-        "--label_column", final_label_col,
-        "--num_labels", "3",
-        "--num_train_epochs", str(num_epochs),
-        "--batch_size", str(batch_size),
-        "--learning_rate", str(learning_rate),
-        "--max_length", "512"
-    ]
-    if push_to_hub and hub_model_id:
-        cmd.extend(["--push_to_hub", "--hub_model_id", hub_model_id])
-        if hf_token:
-            cmd.extend(["--hf_token", hf_token])
-    TRAINING_LOGS.append(f"🚀 Starting training with command: {' '.join(cmd)}")
-    yield "\n".join(TRAINING_LOGS)
-    try:
-        process = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            universal_newlines=True,
-            bufsize=1
-        )
-        TRAINING_LOGS.append("🔄 Training started...")
-        yield "\n".join(TRAINING_LOGS)
-        while True:
-            line = process.stdout.readline()
-            if not line and process.poll() is not None:
-                break
-            if line:
-                TRAINING_LOGS.append(line.strip())
-                yield "\n".join(TRAINING_LOGS)
-        process.wait()
-        if process.returncode == 0:
-            TRAINING_LOGS.append("✅ Training completed successfully!")
-            if push_to_hub and hub_model_id:
-                TRAINING_LOGS.append(f"🤗 Model pushed to Hugging Face Hub: {hub_model_id}")
-            # Load the trained model
-            TRAINING_LOGS.append("📥 Loading trained model...")
-            load_result = load_model(MODEL_PATH)
-            TRAINING_LOGS.append(load_result)
-            # Clean up temporary files
-            import shutil
-            try:
-                shutil.rmtree(temp_dataset_path)
-                TRAINING_LOGS.append("🧹 Cleaned up temporary files")
-            except:
-                pass
-            # Final success message
-            TRAINING_LOGS.append("\n✨ All done! Your model is ready to use.")
-        else:
-            TRAINING_LOGS.append(f"❌ Training failed with return code {process.returncode}")
-    except Exception as e:
-        TRAINING_LOGS.append(f"❌ Error during training: {str(e)}")
-    yield "\n".join(TRAINING_LOGS)
 def push_to_hub_after_training(model_path, username, model_name, token):
     """Push a trained model to Hugging Face Hub"""
     try:
@@ -473,220 +616,96 @@ def push_to_hub_after_training(model_path, username, model_name, token):
     except Exception as e:
         return f"❌ Error: {str(e)}"
-# Create the Gradio Interface
-with gr.Blocks(title="BERT Complaint Classifier") as app:
-    gr.Markdown("# BERT Complaint Category Classifier")
-    gr.Markdown("A simple tool to train and use a BERT model for classifying customer complaints")
-    with gr.Tabs():
-        # Training Tab
-        with gr.TabItem("Train Model"):
-            gr.Markdown("### Train a New Model with Local Data")
-            gr.Markdown("Upload your CSV file and configure training parameters")
-            # Dataset upload
-            with gr.Row():
-                dataset_file = gr.File(
-                    label="Upload Dataset (CSV)",
-                    file_types=[".csv"],
-                    type="filepath"
-                )
-            # Column configuration
-            with gr.Row():
-                text_column = gr.Textbox(
-                    label="Text Column Name",
-                    value="complaint",
-                    placeholder="e.g., complaint, text, description"
-                )
-                label_column = gr.Textbox(
-                    label="Label Column Name",
-                    value="category",
-                    placeholder="e.g., category, label, class"
-                )
-            # Dataset preview
-            preview_btn = gr.Button("📊 Preview Dataset", variant="secondary")
-            dataset_preview = gr.Markdown("Upload a dataset file and click 'Preview Dataset' to see its structure.")
-            # Training parameters
-            with gr.Row():
-                num_epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
-                batch_size = gr.Slider(minimum=4, maximum=32, value=8, step=4, label="Batch Size")
-                learning_rate = gr.Slider(minimum=1e-5, maximum=5e-5, value=2e-5, step=1e-5, label="Learning Rate")
-            with gr.Accordion("Hugging Face Hub Settings", open=False):
-                hf_token = gr.Textbox(
-                    label="Hugging Face Token (required for pushing to Hub)",
-                    type="password"
-                )
-                gr.Markdown("""### Choose when to push to Hub:
-                1. During Training: Model will be pushed automatically when training completes
-                2. After Training: You can push the trained model manually later""")
-                # During Training Push
-                with gr.Group():
-                    push_to_hub = gr.Checkbox(
-                        label="Push Model to Hub during training",
-                        value=False
-                    )
-                    with gr.Column(visible=False) as hub_settings:
-                        username = gr.Textbox(
-                            label="Hugging Face Username",
-                            placeholder="e.g., huggingface-username"
-                        )
-                        model_name = gr.Textbox(
-                            label="Model Name",
-                            placeholder="e.g., bert-complaint-classifier"
-                        )
-                # Post-Training Push
-                with gr.Group():
-                    post_train_push = gr.Checkbox(
-                        label="Push trained model to Hub after training",
-                        value=False
-                    )
-                    with gr.Column(visible=False) as post_train_settings:
-                        post_train_username = gr.Textbox(
-                            label="Hugging Face Username",
-                            placeholder="e.g., huggingface-username"
-                        )
-                        post_train_model_name = gr.Textbox(
-                            label="Model Name",
-                            placeholder="e.g., bert-complaint-classifier"
-                        )
-                        post_train_token = gr.Textbox(
-                            label="Hugging Face Token (if different from above)",
-                            type="password"
-                        )
-                        post_train_push_btn = gr.Button(
-                            "Push Model to Hub",
-                            variant="secondary"
-                        )
-                        post_train_status = gr.Textbox(label="Upload Status")
-                # Show/hide settings based on checkboxes
-                push_to_hub.change(
-                    lambda x: gr.update(visible=x),
-                    inputs=push_to_hub,
-                    outputs=hub_settings
-                )
-                post_train_push.change(
-                    lambda x: gr.update(visible=x),
-                    inputs=post_train_push,
-                    outputs=post_train_settings
-                )
-            gr.Markdown("### BERT Model Note")
-            gr.Markdown("⚠️ BERT has a maximum sequence length of 512 tokens. Complaints longer than this will be truncated.")
-            train_btn = gr.Button("Start Training", variant="primary")
-            training_output = gr.Textbox(label="Training Progress", lines=10)
-            # Connect the preview button
-            preview_btn.click(
-                preview_dataset,
-                inputs=[dataset_file, text_column, label_column],
-                outputs=dataset_preview
-            )
-            # Connect the training button
-            train_btn.click(
-                train_model,
-                inputs=[
-                    dataset_file,
-                    text_column,
-                    label_column,
-                    num_epochs,
-                    batch_size,
-                    learning_rate,
-                    hf_token,
-                    push_to_hub,
-                    username,
-                    model_name
-                ],
-                outputs=training_output,
-                show_progress="full"
-            )
-            # Connect the post-training push button
-            post_train_push_btn.click(
-                push_to_hub_after_training,
-                inputs=[
-                    gr.Textbox(value=MODEL_PATH, visible=False),
-                    post_train_username,
-                    post_train_model_name,
-                    post_train_token
-                ],
-                outputs=post_train_status
-            )
-        # Classification Tab
-        with gr.TabItem("Classify Complaints"):
-            gr.Markdown("### Classify Customer Complaints")
-            model_path = gr.Textbox(
-                label="Model Path or Hugging Face ID",
-                value="local-model",
-                placeholder="e.g., local-model or your-username/bert-complaint-classifier"
-            )
-            with gr.Tabs():
-                # Single Complaint Classification
-                with gr.TabItem("Single Complaint"):
-                    text_input = gr.Textbox(
-                        label="Complaint Text",
-                        lines=5,
-                        placeholder="Enter a customer complaint here..."
-                    )
-                    classify_btn = gr.Button("Classify", variant="primary")
-                    token_info = gr.Markdown("Note: BERT has a 512 token limit. Longer complaints will be truncated.")
-                    text_output = gr.Textbox(label="Classification Result", lines=5)
-                    # Token counter
-                    def count_tokens(text):
-                        if not text or CURRENT_TOKENIZER is None:
-                            return "Enter text to see token count"
-                        tokens = CURRENT_TOKENIZER(text, truncation=False)
-                        count = len(tokens['input_ids'])
-                        if count > 512:
-                            return f"⚠️ **Token count: {count}/512** - Text will be truncated for BERT"
-                        else:
-                            return f"Token count: {count}/512"
-                    text_input.change(
-                        fn=count_tokens,
-                        inputs=text_input,
-                        outputs=token_info
-                    )
-                    classify_btn.click(
-                        predict_text,
-                        inputs=[text_input, model_path],
-                        outputs=text_output
-                    )
-                # Batch Processing
-                with gr.TabItem("Batch Processing"):
-                    gr.Markdown("Upload a CSV file with a 'complaint' column")
-                    csv_input = gr.File(label="Upload CSV", file_types=[".csv"])
-                    batch_classify_btn = gr.Button("Classify All", variant="primary")
-                    csv_output = gr.Textbox(label="Classification Results", lines=15)
-                    batch_classify_btn.click(
-                        predict_csv,
-                        inputs=[csv_input, model_path],
-                        outputs=csv_output
-                    )
 # Launch the app
 if __name__ == "__main__":
     # Initialize tokenizer on startup
     if CURRENT_TOKENIZER is None:
-        CURRENT_TOKENIZER = AutoTokenizer.from_pretrained("bert-base-uncased")
-    app.launch()

 import torch
 import pandas as pd
 import os
 import json
+import logging
+import numpy as np
+from datetime import datetime
+from pathlib import Path
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.model_selection import train_test_split
 from huggingface_hub import login, HfApi
+from transformers import (
+    AutoTokenizer,
+    BertForSequenceClassification,
+    TrainingArguments,
+    Trainer,
+    DataCollatorWithPadding,
+    EarlyStoppingCallback
+)
+from datasets import Dataset, DatasetDict
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Global variables
 MODEL_PATH = "local-model"
 CURRENT_MODEL = None
 CURRENT_TOKENIZER = None
 def load_and_prepare_local_dataset(file_path, text_column, label_column, test_size=0.2):
     """Load and prepare local CSV dataset for training"""
     try:
             raise ValueError(f"Label indices must be between 0 and {len(CATEGORIES)-1}")
         # Create train/validation split
         train_df, val_df = train_test_split(
             df,
             test_size=test_size,
     except Exception as e:
         return f"❌ Failed to load model: {str(e)}"
+def tokenize_function(examples, tokenizer, feature_column, max_length=512):
+    """Run the tokenizer over the text column of a batch.
+    Args:
+        examples: Mapping from column name to values; only
+            ``examples[feature_column]`` is read.
+        tokenizer: Callable that accepts the text plus the ``truncation``,
+            ``padding`` and ``max_length`` keyword arguments.
+        feature_column: Name of the column that holds the text.
+        max_length: Truncation limit handed to the tokenizer.
+    Returns:
+        Whatever the tokenizer returns for the given text.
+    """
+    texts = examples[feature_column]
+    # Padding is switched off here; sequences are only truncated.
+    tokenizer_options = {
+        'truncation': True,
+        'padding': False,
+        'max_length': max_length,
+    }
+    return tokenizer(texts, **tokenizer_options)
+def compute_metrics(eval_pred):
+    """Turn raw evaluation output into a flat dictionary of scores.
+    Args:
+        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
+            argmax of ``predictions`` along axis 1.
+    Returns:
+        dict with ``accuracy``, ``f1_macro``, ``f1_weighted``,
+        ``precision_macro`` and ``recall_macro``.
+    """
+    scores, labels = eval_pred
+    predicted_ids = np.argmax(scores, axis=1)
+    # zero_division=0 is passed so classes without predictions score 0.
+    report = classification_report(labels, predicted_ids, output_dict=True, zero_division=0)
+    macro_avg = report['macro avg']
+    weighted_avg = report['weighted avg']
+    return {
+        'accuracy': accuracy_score(labels, predicted_ids),
+        'f1_macro': macro_avg['f1-score'],
+        'f1_weighted': weighted_avg['f1-score'],
+        'precision_macro': macro_avg['precision'],
+        'recall_macro': macro_avg['recall'],
+    }
+def _plan_step_schedule(num_samples, batch_size, num_epochs):
+    """Work out the step counts that drive logging, evaluation and checkpoints.
+    Args:
+        num_samples: Number of training examples.
+        batch_size: Per-device training batch size (positive int).
+        num_epochs: Number of passes over the training set.
+    Returns:
+        dict with ``total_steps``, ``eval_steps``, ``save_steps``,
+        ``logging_steps`` and ``warmup_steps``.
+    """
+    import math
+    # A trailing partial batch still counts as a step, hence ceil rather than floor.
+    steps_per_epoch = math.ceil(num_samples / batch_size)
+    total_steps = max(1, steps_per_epoch * num_epochs)
+    eval_steps = max(10, min(100, total_steps // 4))
+    # With load_best_model_at_end=True the save interval has to be a whole
+    # multiple of the eval interval, so snap the preferred value to one.
+    wanted_save_steps = max(20, min(500, total_steps // 2))
+    save_steps = eval_steps * max(1, round(wanted_save_steps / eval_steps))
+    return {
+        'total_steps': total_steps,
+        'eval_steps': eval_steps,
+        'save_steps': save_steps,
+        'logging_steps': max(5, min(50, total_steps // 10)),
+        'warmup_steps': min(500, total_steps // 10),
+    }
+def train_model_inline(uploaded_file, text_column, label_column, num_epochs, batch_size,
+                      learning_rate, hf_token, push_to_hub, username, model_name):
+    """Fine-tune BERT in-process on an uploaded CSV, streaming progress text.
+    This is a generator: after every stage it yields the whole log so far as a
+    single newline-joined string. No exception escapes on the paths handled
+    here; failures are appended to the log and yielded instead.
+    Args:
+        uploaded_file: Path string or object with a ``.name`` path; ``None``
+            aborts with a message.
+        text_column: Name of the text column in the CSV.
+        label_column: Name of the label column in the CSV.
+        num_epochs: Number of training epochs (coerced to ``int``).
+        batch_size: Train/eval batch size per device (coerced to ``int``).
+        learning_rate: Optimizer learning rate (coerced to ``float``).
+        hf_token: Optional Hugging Face token; when set, a login is attempted.
+        push_to_hub: Whether to push the trained model to the Hub.
+        username: Hub user name, used only when ``push_to_hub`` is set.
+        model_name: Hub model name, used only when ``push_to_hub`` is set.
+    Yields:
+        str: The cumulative training log.
+    """
+    global TRAINING_LOGS, MODEL_PATH, CURRENT_MODEL, CURRENT_TOKENIZER
+    # Local import: the callback base class is not in the file-level import list.
+    from transformers import TrainerCallback
+    TRAINING_LOGS = []
+    if hf_token:
+        login_result = login_to_hf(hf_token)
+        TRAINING_LOGS.append(login_result)
+        yield "\n".join(TRAINING_LOGS)
+    # Validate hub model ID if pushing to hub
+    hub_model_id = None
+    if push_to_hub:
+        hub_model_id, error = validate_hub_model_id(username, model_name)
+        if error:
+            TRAINING_LOGS.append(f"❌ {error}")
+            yield "\n".join(TRAINING_LOGS)
+            return
+    # Validate uploaded file
+    if uploaded_file is None:
+        TRAINING_LOGS.append("❌ Please upload a dataset file")
+        yield "\n".join(TRAINING_LOGS)
+        return
+    dataset_file = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
+    try:
+        # UI widgets may hand numbers over as floats (TODO confirm); the step
+        # arithmetic and TrainingArguments below expect real ints.
+        num_epochs = int(num_epochs)
+        batch_size = int(batch_size)
+        learning_rate = float(learning_rate)
+        # Load and prepare dataset
+        TRAINING_LOGS.append(f"📊 Loading dataset from uploaded file...")
+        yield "\n".join(TRAINING_LOGS)
+        dataset_dict, final_text_col, final_label_col = load_and_prepare_local_dataset(
+            dataset_file, text_column, label_column
+        )
+        TRAINING_LOGS.append(f"✅ Dataset loaded successfully!")
+        TRAINING_LOGS.append(f"- Train samples: {len(dataset_dict['train'])}")
+        TRAINING_LOGS.append(f"- Validation samples: {len(dataset_dict['validation'])}")
+        yield "\n".join(TRAINING_LOGS)
+        # Load model and tokenizer
+        TRAINING_LOGS.append("🤖 Loading BERT model and tokenizer...")
+        yield "\n".join(TRAINING_LOGS)
+        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        model = BertForSequenceClassification.from_pretrained(
+            "bert-base-uncased",
+            num_labels=len(CATEGORIES)
+        )
+        TRAINING_LOGS.append("✅ Model and tokenizer loaded")
+        yield "\n".join(TRAINING_LOGS)
+        # Tokenize datasets
+        TRAINING_LOGS.append("🔤 Tokenizing datasets...")
+        yield "\n".join(TRAINING_LOGS)
+        def tokenize_batch(examples):
+            return tokenize_function(examples, tokenizer, final_text_col, 512)
+        # Drop every original column except the label; tokenized features are added by map().
+        columns_to_remove = [col for col in dataset_dict['train'].column_names if col != final_label_col]
+        tokenized_datasets = dataset_dict.map(
+            tokenize_batch,
+            batched=True,
+            remove_columns=columns_to_remove
+        )
+        # Rename label column to 'labels' (required by Trainer)
+        tokenized_datasets = tokenized_datasets.rename_column(final_label_col, 'labels')
+        TRAINING_LOGS.append("✅ Tokenization completed")
+        yield "\n".join(TRAINING_LOGS)
+        # Set up training
+        output_dir = Path(MODEL_PATH)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # Calculate steps
+        schedule = _plan_step_schedule(len(tokenized_datasets['train']), batch_size, num_epochs)
+        total_steps = schedule['total_steps']
+        eval_steps = schedule['eval_steps']
+        save_steps = schedule['save_steps']
+        logging_steps = schedule['logging_steps']
+        warmup_steps = schedule['warmup_steps']
+        TRAINING_LOGS.append(f"📈 Training configuration:")
+        TRAINING_LOGS.append(f"- Total steps: {total_steps}")
+        TRAINING_LOGS.append(f"- Eval steps: {eval_steps}")
+        TRAINING_LOGS.append(f"- Warmup steps: {warmup_steps}")
+        yield "\n".join(TRAINING_LOGS)
+        # Training arguments
+        training_args = TrainingArguments(
+            output_dir=str(output_dir),
+            num_train_epochs=num_epochs,
+            per_device_train_batch_size=batch_size,
+            per_device_eval_batch_size=batch_size,
+            warmup_steps=warmup_steps,
+            weight_decay=0.01,
+            learning_rate=learning_rate,
+            logging_dir=str(output_dir / "logs"),
+            logging_steps=logging_steps,
+            eval_strategy="steps",
+            eval_steps=eval_steps,
+            # Must match eval_strategy for load_best_model_at_end to be accepted.
+            save_strategy="steps",
+            save_steps=save_steps,
+            save_total_limit=2,
+            load_best_model_at_end=True,
+            metric_for_best_model="eval_accuracy",
+            greater_is_better=True,
+            push_to_hub=bool(push_to_hub),
+            hub_model_id=hub_model_id,
+            # The string "none" disables reporting integrations; None does not.
+            report_to="none",
+            dataloader_num_workers=0,
+            fp16=torch.cuda.is_available(),
+            seed=42,
+            remove_unused_columns=False,
+        )
+        # Data collator
+        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+        # Create trainer
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_datasets['train'],
+            eval_dataset=tokenized_datasets['validation'],
+            tokenizer=tokenizer,
+            data_collator=data_collator,
+            compute_metrics=compute_metrics,
+            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
+        )
+        TRAINING_LOGS.append("🚀 Starting training...")
+        yield "\n".join(TRAINING_LOGS)
+        # The trainer invokes every callback event by name, so the callback must
+        # derive from TrainerCallback to inherit no-op handlers for the events
+        # it does not override.
+        class ProgressCallback(TrainerCallback):
+            """Copy trainer progress into the shared log list."""
+            def __init__(self, logs_list):
+                self.logs = logs_list
+            def on_step_end(self, args, state, control, **kwargs):
+                if state.global_step % logging_steps == 0:
+                    self.logs.append(f"Step {state.global_step}/{total_steps}")
+            def on_epoch_end(self, args, state, control, **kwargs):
+                # state.epoch is a float; round so a value just under the
+                # boundary is not reported as the previous epoch.
+                epoch = round(state.epoch) if state.epoch is not None else 0
+                self.logs.append(f"✅ Epoch {epoch} completed")
+            def on_evaluate(self, args, state, control, metrics=None, **kwargs):
+                # Evaluation results arrive under the 'metrics' keyword.
+                if metrics:
+                    acc = metrics.get('eval_accuracy', 0)
+                    loss = metrics.get('eval_loss', 0)
+                    self.logs.append(f"📊 Eval - Accuracy: {acc:.4f}, Loss: {loss:.4f}")
+        progress_callback = ProgressCallback(TRAINING_LOGS)
+        trainer.add_callback(progress_callback)
+        # Train the model. NOTE: nothing is yielded while train() runs, so the
+        # callback's lines only become visible at the next yield.
+        try:
+            trainer.train()
+            TRAINING_LOGS.append("✅ Training completed successfully!")
+            yield "\n".join(TRAINING_LOGS)
+        except Exception as e:
+            TRAINING_LOGS.append(f"❌ Training failed: {str(e)}")
+            yield "\n".join(TRAINING_LOGS)
+            return
+        # Save the model
+        TRAINING_LOGS.append("💾 Saving model...")
+        yield "\n".join(TRAINING_LOGS)
+        trainer.save_model()
+        tokenizer.save_pretrained(output_dir)
+        TRAINING_LOGS.append("✅ Model saved successfully!")
+        yield "\n".join(TRAINING_LOGS)
+        # Final evaluation
+        TRAINING_LOGS.append("📊 Running final evaluation...")
+        yield "\n".join(TRAINING_LOGS)
+        try:
+            eval_results = trainer.evaluate()
+            TRAINING_LOGS.append("📊 Final Results:")
+            for key, value in eval_results.items():
+                if isinstance(value, float):
+                    TRAINING_LOGS.append(f"  {key}: {value:.4f}")
+                else:
+                    TRAINING_LOGS.append(f"  {key}: {value}")
+            # Save results
+            with open(output_dir / "eval_results.json", "w", encoding="utf-8") as f:
+                json.dump(eval_results, f, indent=2)
+        except Exception as e:
+            TRAINING_LOGS.append(f"⚠️ Evaluation error: {str(e)}")
+        yield "\n".join(TRAINING_LOGS)
+        # Publish the model for inference only once the trainer is done with the
+        # device: predict_text tokenizes without moving tensors to a device, so
+        # the shared model has to live on the CPU, in eval mode (dropout off).
+        model.to("cpu")
+        model.eval()
+        CURRENT_MODEL = model
+        CURRENT_TOKENIZER = tokenizer
+        # Push to hub if requested
+        if push_to_hub and hub_model_id:
+            TRAINING_LOGS.append(f"🤗 Pushing to Hugging Face Hub: {hub_model_id}")
+            yield "\n".join(TRAINING_LOGS)
+            try:
+                trainer.push_to_hub()
+                TRAINING_LOGS.append(f"✅ Successfully pushed to {hub_model_id}")
+            except Exception as e:
+                TRAINING_LOGS.append(f"❌ Push to Hub failed: {str(e)}")
+            yield "\n".join(TRAINING_LOGS)
+        TRAINING_LOGS.append("\n✨ Training completed! Your model is ready to use.")
+        yield "\n".join(TRAINING_LOGS)
+    except Exception as e:
+        TRAINING_LOGS.append(f"❌ Error during training: {str(e)}")
+        yield "\n".join(TRAINING_LOGS)
 def predict_text(text, model_path):
     """Make a prediction on a single text input"""
     global CURRENT_MODEL, CURRENT_TOKENIZER
             return load_result
     try:
+        if not text.strip():
+            return "Please enter some text to classify."
+        # Check if text was truncated
+        original_tokens = CURRENT_TOKENIZER(text, truncation=False)
+        was_truncated = len(original_tokens['input_ids']) > 512
         # Tokenize input
         inputs = CURRENT_TOKENIZER(text, return_tensors="pt", truncation=True, max_length=512)
         # Make prediction
         with torch.no_grad():
             outputs = CURRENT_MODEL(**inputs)
+            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+            predicted_class_id = predictions.argmax().item()
+            confidence = predictions.max().item()
+        # Get predicted category
+        predicted_category = idx_to_category[predicted_class_id]
+        # Format result
         truncation_warning = "\n\n⚠️ Note: This complaint was truncated to fit BERT's 512 token limit." if was_truncated else ""
+        result = []
+        result.append(f"**Complaint:** {text}")
+        result.append(f"\n**Predicted Category:** {predicted_category}")
+        result.append(f"**Confidence:** {confidence:.4f}")
+        result.append("\n**All Class Probabilities:**")
+        for i, category in enumerate(CATEGORIES):
+            prob = predictions[0][i].item()
+            result.append(f"- {category}: {prob:.4f}")
+        result.append(truncation_warning)
+        return "\n".join(result)
     except Exception as e:
+        return f"❌ Prediction error: {str(e)}"
 def predict_csv(csv_file, model_path):
     """Make predictions on a CSV file with complaints"""
             return "❌ CSV file must have a 'complaint' column"
         results = []
+        predictions_list = []
         truncated_count = 0
         for i, row in enumerate(df.iterrows()):
             inputs = CURRENT_TOKENIZER(complaint, return_tensors="pt", truncation=True, max_length=512)
             with torch.no_grad():
                 outputs = CURRENT_MODEL(**inputs)
+                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+                predicted_idx = predictions.argmax().item()
+                confidence = predictions.max().item()
             predicted_category = idx_to_category[predicted_idx]
+            predictions_list.append({
+                'complaint': complaint,
+                'predicted_category': predicted_category,
+                'confidence': confidence,
+                'truncated': was_truncated
+            })
             truncation_mark = " ⚠️" if was_truncated else ""
             preview = complaint if len(complaint) <= 50 else complaint[:47] + "..."
+            results.append(f"Complaint {i+1}{truncation_mark}: {preview}")
+            results.append(f"Predicted: {predicted_category} (confidence: {confidence:.3f})\n")
             if i >= 19:
                 results.append(f"... and {len(df) - 20} more (showing first 20 out of {len(df)} complaints)")
         if truncated_count > 0:
             results.append(f"\n⚠️ {truncated_count} complaints were truncated to fit BERT's 512 token limit.")
+        # Save full results to a CSV file
+        results_df = pd.DataFrame(predictions_list)
+        results_file = "prediction_results.csv"
+        results_df.to_csv(results_file, index=False)
+        results.append(f"\n💾 Full results saved to {results_file}")
         return "\n".join(results)
     except Exception as e:
         return f"❌ CSV processing failed: {str(e)}"
 def push_to_hub_after_training(model_path, username, model_name, token):
     """Push a trained model to Hugging Face Hub"""
     try:
     except Exception as e:
         return f"❌ Error: {str(e)}"
def count_tokens(text, max_length=512):
    """Report how many tokens ``text`` produces under the current tokenizer.

    Args:
        text: Raw complaint text entered by the user; may be ``None`` or blank.
        max_length: Token budget the count is compared against. Defaults to
            512, the ``max_length`` used when inputs are tokenized for
            prediction.

    Returns:
        A display string: a prompt when there is nothing to count (blank
        text, or no tokenizer loaded yet), otherwise the token count, with a
        warning when it exceeds ``max_length``.
    """
    # The text checks come first so that whitespace-only input is treated the
    # same as empty input instead of being sent to the tokenizer.
    if not text or not text.strip() or CURRENT_TOKENIZER is None:
        return "Enter text to see token count"

    # truncation=False so the real length is measured, not the clipped one.
    encoded = CURRENT_TOKENIZER(text, truncation=False)
    count = len(encoded['input_ids'])
    if count > max_length:
        return f"⚠️ **Token count: {count}/{max_length}** - Text will be truncated for BERT"
    return f"Token count: {count}/{max_length}"
def get_available_datasets():
    """List the CSV files in the current working directory with row counts.

    Returns:
        A list of display strings, one per entry whose name ends in ``.csv``,
        sorted by file name: ``"<name> (<n> rows)"`` when pandas can parse
        the file, or ``"<name> (Error reading)"`` when it cannot. When no
        such entries exist, the list holds a single placeholder message
        rather than being empty.
    """
    available_files = []
    # os.listdir() returns entries in arbitrary order; sort for a stable listing.
    for file_name in sorted(os.listdir(".")):
        if not file_name.endswith(".csv"):
            continue
        try:
            row_count = len(pd.read_csv(file_name))
        except Exception:
            # An unreadable file is reported, not fatal. Catching Exception
            # (instead of a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            available_files.append(f"{file_name} (Error reading)")
        else:
            available_files.append(f"{file_name} ({row_count} rows)")

    if not available_files:
        available_files = ["No CSV files found in current directory"]
    return available_files
+def display_available_datasets():
+                datasets = get_available_datasets()
+                if datasets:
+                    return "**Available CSV files:**\n\n" + "\n".join([f"- {file}" for file in datasets])
+                else:
+                    return "No CSV files found in the current directory."
+            # Initialize the display
+            refresh_datasets_btn.click(
+                display_available_datasets,
+                outputs=available_datasets
+            )
+            # Show datasets on load
+            app.load(display_available_datasets, outputs=available_datasets)
+            gr.Markdown("### Dataset Format Requirements")
+            gr.Markdown("""
+            **For training, your CSV file should have:**
+            - A text column containing the complaint text (default name: 'complaint')
+            - A label column containing categories (default name: 'category')
+            **Supported label formats:**
+            - Text labels: 'Online-Safety', 'BroadBand', 'TV-Radio'
+            - Numeric labels: 0, 1, 2 (corresponding to the categories above)
+            **Example CSV structure:**
+            ```
+            complaint,category
+            "My internet is slow",BroadBand
+            "Blocked website access",Online-Safety
+            "Poor TV signal",TV-Radio
+            ```
+            """)
+            gr.Markdown("### Model Categories")
+            categories_info = f"""
+            **The model classifies complaints into these categories:**
+            | Index | Category | Description |
+            |-------|----------|-------------|
+            | 0 | Online-Safety | Internet safety, content filtering, cybersecurity issues |
+            | 1 | BroadBand | Internet connectivity, speed, network problems |
+            | 2 | TV-Radio | Television and radio broadcasting, signal quality issues |
+            """
+            gr.Markdown(categories_info)
 # Launch the app
 if __name__ == "__main__":
     # Initialize tokenizer on startup
     if CURRENT_TOKENIZER is None:
+        try:
+            CURRENT_TOKENIZER = AutoTokenizer.from_pretrained("bert-base-uncased")
+            print("✅ Tokenizer initialized successfully")
+        except Exception as e:
+            print(f"⚠️ Warning: Could not initialize tokenizer: {e}")
+    print("🚀 Launching BERT Complaint Classifier...")
+    print("📍 Available at: http://localhost:7860")
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )