# Binary_classifier / binary_classifier.py
# Provenance: uploaded by SaitejaJate ("Upload 5 files", commit 88b8fd6, verified)
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import (
AutoTokenizer, AutoModelForSequenceClassification,
TrainingArguments, Trainer, DataCollatorWithPadding
)
import torch
from datasets import Dataset
import logging
import os
# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)
class CBTBinaryClassifier:
    """Binary classifier to distinguish normal conversation from CBT-triggering statements.

    Wraps a HuggingFace sequence-classification model (DistilBERT by default)
    with helpers for data preparation, training, evaluation, and inference.
    Label convention: 0 = normal conversation, 1 = CBT trigger.
    """

    def __init__(self, model_name="distilbert-base-uncased"):
        """Initialize the tokenizer and bookkeeping state.

        Args:
            model_name: HuggingFace model id. The default is a lightweight
                model suitable for CPU/laptop training.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = None
        self.trainer = None
        self.inference_pipeline = None
        # Fields reserved for an optional HF Inference API backend; they are
        # initialized here but not used by the methods defined in this class.
        self.use_hf_api = False
        self.api_url = None
        self.api_token = None
        self.headers = None
        self.model_id = None
        # Some tokenizers (e.g. GPT-2 family) ship without a pad token;
        # reuse EOS so padded batching works.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def prepare_data(self, normal_csv_path, cbt_csv_path, text_column="text"):
        """Load and prepare training data from two CSV files.

        Args:
            normal_csv_path: CSV of normal conversations (labeled 0).
            cbt_csv_path: CSV of CBT-trigger statements (labeled 1).
            text_column: column in both CSVs holding the raw text.

        Returns:
            A shuffled DataFrame with exactly two columns: 'text' and 'label'.
        """
        logger.info(f"Loading normal conversations from {normal_csv_path}")
        normal_df = pd.read_csv(normal_csv_path)
        normal_df['label'] = 0  # Normal conversation = 0
        normal_df['text'] = normal_df[text_column]
        logger.info(f"Loading CBT conversations from {cbt_csv_path}")
        cbt_df = pd.read_csv(cbt_csv_path)
        cbt_df['label'] = 1  # CBT trigger = 1
        cbt_df['text'] = cbt_df[text_column]
        # Combine datasets
        combined_df = pd.concat([
            normal_df[['text', 'label']],
            cbt_df[['text', 'label']]
        ], ignore_index=True)
        # Robustness: drop rows whose text is missing or blank — the
        # tokenizer raises on non-string / empty inputs downstream.
        before = len(combined_df)
        combined_df = combined_df.dropna(subset=['text'])
        combined_df = combined_df[combined_df['text'].astype(str).str.strip() != '']
        dropped = before - len(combined_df)
        if dropped:
            logger.warning(f"Dropped {dropped} rows with missing/empty text")
        # Shuffle with a fixed seed for reproducibility.
        combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
        logger.info(f"Total examples: {len(combined_df)}")
        logger.info(f"Normal conversations: {len(normal_df)}")
        logger.info(f"CBT triggers: {len(cbt_df)}")
        return combined_df

    def tokenize_data(self, df, max_length=128):
        """Tokenize the 'text' column of *df* into a HuggingFace Dataset.

        Args:
            df: DataFrame with 'text' and 'label' columns.
            max_length: truncation/padding length in tokens.

        Returns:
            A tokenized Dataset with the raw 'text' column removed.
        """
        def tokenize_function(examples):
            # NOTE(review): padding='max_length' pads every row up front even
            # though training also uses DataCollatorWithPadding; kept as-is
            # for compatibility, but dynamic padding alone would suffice.
            return self.tokenizer(
                examples['text'],
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors=None
            )
        # Convert to HuggingFace Dataset
        dataset = Dataset.from_pandas(df)
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['text'])
        return tokenized_dataset

    def split_data(self, dataset, test_size=0.2, val_size=0.1):
        """Split data into train/validation/test sets.

        Args:
            dataset: tokenized HuggingFace Dataset.
            test_size: fraction of the full data held out for testing.
            val_size: fraction of the FULL data used for validation
                (rescaled internally against the train+val remainder).

        Returns:
            Tuple (train, val, test) of Datasets.
        """
        # Fix: index the returned DatasetDict by key instead of unpacking
        # .values() — dict value order is an implementation detail and a
        # silent train/test swap would be catastrophic.
        first_split = dataset.train_test_split(
            test_size=test_size,
            seed=42
        )
        train_val, test = first_split['train'], first_split['test']
        # val_size is expressed relative to the full dataset, so rescale it
        # against the remaining (1 - test_size) portion.
        val_ratio = val_size / (1 - test_size)
        second_split = train_val.train_test_split(
            test_size=val_ratio,
            seed=42
        )
        train, val = second_split['train'], second_split['test']
        logger.info(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")
        return train, val, test

    def train_model(self, train_dataset, val_dataset, output_dir="./cbt_classifier"):
        """Fine-tune the binary classifier with laptop-friendly settings.

        Args:
            train_dataset: tokenized training Dataset.
            val_dataset: tokenized validation Dataset (used for periodic eval
                and best-checkpoint selection).
            output_dir: directory for checkpoints, logs, and the final model.
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        # Initialize a fresh classification head (2 labels).
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2
        )
        # Dynamic padding per batch keeps memory use down.
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        # Laptop-friendly training arguments: small batches with gradient
        # accumulation to simulate a larger effective batch size.
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=2,  # Reduced epochs
            per_device_train_batch_size=8,  # Smaller batch size
            per_device_eval_batch_size=8,
            gradient_accumulation_steps=2,  # Simulate larger batch size
            warmup_steps=100,  # Reduced warmup
            weight_decay=0.01,
            logging_dir=f'{output_dir}/logs',
            logging_steps=50,
            eval_strategy="steps",
            eval_steps=200,
            save_strategy="steps",
            save_steps=200,
            load_best_model_at_end=True,
            # Trainer prefixes compute_metrics keys with 'eval_'.
            metric_for_best_model="eval_accuracy",
            fp16=torch.cuda.is_available(),  # Mixed precision if GPU available
            dataloader_num_workers=0,  # Reduce CPU usage
            remove_unused_columns=True,
        )

        def compute_metrics(eval_pred):
            """Accuracy over argmax of the logits."""
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return {
                'accuracy': accuracy_score(labels, predictions),
            }

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            data_collator=data_collator,
        )
        logger.info("Starting training...")
        self.trainer.train()
        # Persist both the model and the tokenizer so load_model() can
        # restore a complete pipeline from output_dir.
        self.trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)
        logger.info(f"Model saved to {output_dir}")

    def evaluate_model(self, test_dataset):
        """Evaluate the trained model on a held-out test Dataset.

        Prints accuracy, a classification report, and the confusion matrix.

        Returns:
            Tuple (y_true, y_pred) of label arrays.

        Raises:
            ValueError: if train_model() has not been called yet.
        """
        if self.trainer is None:
            raise ValueError("Model not trained yet!")
        predictions = self.trainer.predict(test_dataset)
        y_pred = np.argmax(predictions.predictions, axis=1)
        y_true = predictions.label_ids
        print("\n=== Evaluation Results ===")
        print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
        print("\nClassification Report:")
        print(classification_report(y_true, y_pred,
                                    target_names=['Normal', 'CBT Trigger']))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_true, y_pred))
        return y_true, y_pred

    def load_model(self, model_path="./cbt_classifier"):
        """Load a saved model from disk into an inference pipeline."""
        from transformers import pipeline
        # Fix: top_k=None returns scores for every label and replaces the
        # deprecated return_all_scores=True flag.
        self.inference_pipeline = pipeline(
            "text-classification",
            model=model_path,
            tokenizer=model_path,
            top_k=None
        )
        logger.info(f"Model loaded from {model_path}")

    def predict(self, text, threshold=0.7):
        """Predict whether *text* is CBT-triggering.

        Args:
            text: input string.
            threshold: minimum LABEL_1 confidence to flag as a trigger.

        Returns:
            Dict with 'is_cbt_trigger', 'confidence', and 'threshold'.

        Raises:
            ValueError: if load_model() has not been called yet.
        """
        if self.inference_pipeline is None:
            raise ValueError("Model not loaded! Call load_model() first.")
        result = self.inference_pipeline(text)
        # Pipeline output nesting differs between transformers versions
        # (top_k=None vs legacy return_all_scores=True), so accept both
        # [[{...}, ...]] and [{...}, ...] shapes.
        scores = result[0] if isinstance(result[0], list) else result
        # Extract confidence for the CBT-trigger class (LABEL_1).
        cbt_confidence = next(
            item['score'] for item in scores
            if item['label'] == 'LABEL_1'
        )
        return {
            'is_cbt_trigger': cbt_confidence > threshold,
            'confidence': cbt_confidence,
            'threshold': threshold
        }

    def batch_predict(self, texts, threshold=0.7):
        """Predict for multiple texts; returns one result dict per input.

        Raises:
            ValueError: if load_model() has not been called yet.
        """
        if self.inference_pipeline is None:
            raise ValueError("Model not loaded! Call load_model() first.")
        return [self.predict(text, threshold) for text in texts]