SaitejaJate committed on
Commit
88b8fd6
·
verified ·
1 Parent(s): ba48e40

Upload 5 files

Browse files
binary_classifier.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import requests
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
6
+ from transformers import (
7
+ AutoTokenizer, AutoModelForSequenceClassification,
8
+ TrainingArguments, Trainer, DataCollatorWithPadding
9
+ )
10
+ import torch
11
+ from datasets import Dataset
12
+ import logging
13
+ import os
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
class CBTBinaryClassifier:
    """Binary classifier separating normal conversation (label 0) from
    CBT-triggering statements (label 1).

    Covers the full workflow: data preparation from labelled CSVs,
    tokenization, train/val/test splitting, fine-tuning with a HuggingFace
    ``Trainer``, evaluation, and inference via a text-classification pipeline.
    """

    def __init__(self, model_name="distilbert-base-uncased"):
        """Create the classifier around a small, laptop-friendly base model.

        Args:
            model_name: HuggingFace model id or local path of the base model.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = None               # set by train_model()
        self.trainer = None             # set by train_model()
        self.inference_pipeline = None  # set by load_model()
        # Placeholders for an optional hosted-inference configuration;
        # nothing in this class assigns them beyond these defaults.
        self.use_hf_api = False
        self.api_url = None
        self.api_token = None
        self.headers = None
        self.model_id = None

        # Some checkpoints ship without a pad token; fall back to EOS so
        # padded batches can still be built.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    @staticmethod
    def _load_labeled_csv(csv_path, text_column, label):
        """Read one CSV, validate the text column, and attach a binary label.

        Returns:
            DataFrame with exactly the columns ['text', 'label'].

        Raises:
            KeyError: if ``text_column`` is not present in the CSV.
        """
        df = pd.read_csv(csv_path)
        if text_column not in df.columns:
            raise KeyError(
                f"Column '{text_column}' not found in {csv_path}; "
                f"available columns: {list(df.columns)}"
            )
        # Drop missing texts up front: NaN values would crash the tokenizer.
        df = df.dropna(subset=[text_column])
        return pd.DataFrame({
            'text': df[text_column].astype(str),
            'label': label,
        })

    def prepare_data(self, normal_csv_path, cbt_csv_path, text_column="text"):
        """Load and prepare training data from CSV files.

        Args:
            normal_csv_path: CSV of normal conversations (labelled 0).
            cbt_csv_path: CSV of CBT-triggering statements (labelled 1).
            text_column: name of the text column in both CSVs.

        Returns:
            A shuffled DataFrame with 'text' and 'label' columns.

        Raises:
            KeyError: if ``text_column`` is missing from either CSV.
        """
        logger.info("Loading normal conversations from %s", normal_csv_path)
        normal_df = self._load_labeled_csv(normal_csv_path, text_column, label=0)

        logger.info("Loading CBT conversations from %s", cbt_csv_path)
        cbt_df = self._load_labeled_csv(cbt_csv_path, text_column, label=1)

        # Combine and shuffle (fixed seed keeps runs reproducible).
        combined_df = (
            pd.concat([normal_df, cbt_df], ignore_index=True)
            .sample(frac=1, random_state=42)
            .reset_index(drop=True)
        )

        logger.info("Total examples: %d", len(combined_df))
        logger.info("Normal conversations: %d", len(normal_df))
        logger.info("CBT triggers: %d", len(cbt_df))

        return combined_df

    def tokenize_data(self, df, max_length=128):
        """Tokenize the text data.

        Args:
            df: DataFrame with 'text' (and typically 'label') columns.
            max_length: truncation/padding length in tokens.

        Returns:
            A tokenized ``datasets.Dataset`` without the raw 'text' column.
        """

        def tokenize_function(examples):
            # NOTE(review): padding to max_length here makes the dynamic
            # DataCollatorWithPadding used at train time effectively a no-op;
            # kept as-is for output stability.
            return self.tokenizer(
                examples['text'],
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors=None
            )

        # Convert to a HuggingFace Dataset and tokenize in batches.
        dataset = Dataset.from_pandas(df)
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['text'])

        return tokenized_dataset

    def split_data(self, dataset, test_size=0.2, val_size=0.1):
        """Split data into train/validation/test sets.

        ``test_size`` and ``val_size`` are fractions of the FULL dataset.

        Returns:
            (train, val, test) datasets.
        """
        # First split: carve the test set off the full dataset. Index the
        # returned DatasetDict by key instead of relying on .values() ordering.
        first = dataset.train_test_split(test_size=test_size, seed=42)
        train_val, test = first['train'], first['test']

        # Second split: rescale val_size relative to the remaining pool so the
        # validation set is val_size of the ORIGINAL dataset.
        val_ratio = val_size / (1 - test_size)
        second = train_val.train_test_split(test_size=val_ratio, seed=42)
        train, val = second['train'], second['test']

        logger.info("Train: %d, Val: %d, Test: %d", len(train), len(val), len(test))
        return train, val, test

    def train_model(self, train_dataset, val_dataset, output_dir="./cbt_classifier"):
        """Train the binary classifier with laptop-friendly settings.

        Saves the best checkpoint and the tokenizer to ``output_dir``.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Fresh 2-label classification head on top of the base model.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2
        )

        # Pads each batch to its longest member at collation time.
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        # Laptop-friendly training arguments: small batches with gradient
        # accumulation, few epochs, mixed precision only when a GPU exists.
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=2,                 # reduced epochs
            per_device_train_batch_size=8,      # smaller batch size
            per_device_eval_batch_size=8,
            gradient_accumulation_steps=2,      # simulate larger batch size
            warmup_steps=100,                   # reduced warmup
            weight_decay=0.01,
            logging_dir=f'{output_dir}/logs',
            logging_steps=50,
            eval_strategy="steps",
            eval_steps=200,
            save_strategy="steps",
            save_steps=200,
            load_best_model_at_end=True,
            metric_for_best_model="eval_accuracy",
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=0,           # reduce CPU usage
            remove_unused_columns=True,
        )

        def compute_metrics(eval_pred):
            # eval_pred is (logits, labels); argmax over the 2 classes.
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return {
                'accuracy': accuracy_score(labels, predictions),
            }

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            data_collator=data_collator,
        )

        logger.info("Starting training...")
        self.trainer.train()

        # Persist both model weights and tokenizer so load_model() can
        # reconstruct the pipeline from output_dir alone.
        self.trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        logger.info("Model saved to %s", output_dir)

    def evaluate_model(self, test_dataset):
        """Evaluate the trained model and print accuracy/report/confusion.

        Returns:
            (y_true, y_pred) label arrays.

        Raises:
            ValueError: if train_model() has not been run.
        """
        if self.trainer is None:
            raise ValueError("Model not trained yet!")

        predictions = self.trainer.predict(test_dataset)
        y_pred = np.argmax(predictions.predictions, axis=1)
        y_true = predictions.label_ids

        print("\n=== Evaluation Results ===")
        print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
        print("\nClassification Report:")
        print(classification_report(y_true, y_pred,
                                    target_names=['Normal', 'CBT Trigger']))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_true, y_pred))

        return y_true, y_pred

    def load_model(self, model_path="./cbt_classifier"):
        """Load a pre-trained model for inference.

        Builds a text-classification pipeline that returns scores for BOTH
        labels on every call (predict() needs the LABEL_1 score).
        """
        from transformers import pipeline

        # NOTE(review): return_all_scores is deprecated in newer transformers
        # in favour of top_k=None, which changes the output nesting — kept
        # as-is to match predict()'s result[0] indexing.
        self.inference_pipeline = pipeline(
            "text-classification",
            model=model_path,
            tokenizer=model_path,
            return_all_scores=True
        )

        logger.info("Model loaded from %s", model_path)

    def predict(self, text, threshold=0.7):
        """Predict if text is CBT-triggering.

        Args:
            text: input string to classify.
            threshold: minimum LABEL_1 confidence to flag a CBT trigger.

        Returns:
            dict with 'is_cbt_trigger' (bool), 'confidence' (float) and
            'threshold' (float).

        Raises:
            ValueError: if load_model() has not been called, or the pipeline
                output contains no 'LABEL_1' entry.
        """
        if self.inference_pipeline is None:
            raise ValueError("Model not loaded! Call load_model() first.")

        result = self.inference_pipeline(text)

        # With return_all_scores=True the pipeline yields one list of
        # {'label', 'score'} dicts per input; take the LABEL_1 (CBT) score.
        cbt_confidence = next(
            (entry['score'] for entry in result[0]
             if entry['label'] == 'LABEL_1'),
            None,
        )
        if cbt_confidence is None:
            # Explicit error instead of an opaque StopIteration from next().
            raise ValueError(
                f"No 'LABEL_1' score in pipeline output: {result[0]!r}"
            )

        return {
            'is_cbt_trigger': cbt_confidence > threshold,
            'confidence': cbt_confidence,
            'threshold': threshold
        }

    def batch_predict(self, texts, threshold=0.7):
        """Predict for multiple texts; one predict() dict per input text."""
        if self.inference_pipeline is None:
            raise ValueError("Model not loaded! Call load_model() first.")

        return [self.predict(text, threshold) for text in texts]
classifier_api.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel, Field
3
+ from typing import List, Dict, Optional
4
+ import logging
5
+ from pathlib import Path
6
+ import sys
7
+ import os
8
+ from huggingface_hub import snapshot_download
9
+
10
# Make sibling modules (binary_classifier.py) importable when this file is
# run directly rather than as an installed package.
sys.path.append(str(Path(__file__).parent))

from binary_classifier import CBTBinaryClassifier

# Configure root logging once for the whole service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="CBT Binary Classifier API",
    description="API for detecting CBT-triggering conversations",
    version="1.0.0"
)
25
+
26
# Request/Response models
class TextRequest(BaseModel):
    """Request body for single-text classification (POST /classify)."""
    text: str = Field(..., description="Text to classify")
    threshold: float = Field(0.7, description="Confidence threshold for CBT trigger detection")
30
+
31
class BatchTextRequest(BaseModel):
    """Request body for batch classification (POST /classify/batch)."""
    texts: List[str] = Field(..., description="List of texts to classify")
    threshold: float = Field(0.7, description="Confidence threshold for CBT trigger detection")
34
+
35
class PredictionResponse(BaseModel):
    """Single prediction result; ``text`` holds a truncated input preview."""
    is_cbt_trigger: bool
    confidence: float
    threshold: float
    text: Optional[str] = None
40
+
41
class BatchPredictionResponse(BaseModel):
    """Batch result: one PredictionResponse per input text, in order."""
    predictions: List[PredictionResponse]
43
+
44
# Module-level classifier instance, populated once at startup.
classifier = None

@app.on_event("startup")
async def startup_event():
    """Load the model on startup.

    Tries a local checkout when USE_LOCAL_MODEL=true, otherwise downloads
    the model from the Hugging Face Hub (repo id from HF_MODEL_ID).
    Re-raises on failure so the server does not come up without a model.
    """
    global classifier
    try:
        classifier = CBTBinaryClassifier()

        # Hub repo to fall back to; overridable via the HF_MODEL_ID env var.
        hf_model_id = os.getenv("HF_MODEL_ID", "SaitejaJate/Binary_classifier")
        local_model_path = Path(__file__).parent / "cbt_classifier"

        # USE_LOCAL_MODEL=true forces the local checkout; anything else
        # (or unset) downloads from the Hub.
        use_local = os.getenv("USE_LOCAL_MODEL", "false").lower() == "true"

        if use_local and local_model_path.exists():
            # Use local model
            classifier.load_model(str(local_model_path))
            logger.info(f"Model loaded successfully from local path: {local_model_path}")
        else:
            # Download from Hugging Face Hub
            logger.info(f"Downloading model from Hugging Face Hub: {hf_model_id}")
            cache_dir = Path(__file__).parent / "model_cache"

            # snapshot_download returns the directory holding the model files.
            model_path = snapshot_download(
                repo_id=hf_model_id,
                cache_dir=str(cache_dir),
                local_dir=str(cache_dir / "downloaded_model")
            )

            classifier.load_model(model_path)
            logger.info(f"Model loaded successfully from Hugging Face Hub")

    except Exception as e:
        # Fail fast: propagate so startup aborts instead of serving 500s.
        logger.error(f"Failed to load model: {e}")
        raise
83
+
84
@app.get("/")
async def root():
    """Health check endpoint"""
    # Report liveness plus whether the startup hook managed to load a model.
    health_payload = {
        "status": "active",
        "service": "CBT Binary Classifier API",
        "model_loaded": classifier is not None,
    }
    return health_payload
92
+
93
@app.post("/classify", response_model=PredictionResponse)
async def classify_text(request: TextRequest):
    """Classify a single text.

    Returns the prediction plus a preview (first 100 chars) of the input.

    Raises:
        HTTPException 503: model not loaded yet.
        HTTPException 500: unexpected classifier failure.
    """
    # Bug fix: this guard used to sit inside the try block, so the 503 it
    # raised was caught by `except Exception` and converted into a 500.
    if classifier is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        result = classifier.predict(request.text, request.threshold)

        return PredictionResponse(
            is_cbt_trigger=result['is_cbt_trigger'],
            confidence=result['confidence'],
            threshold=result['threshold'],
            text=request.text[:100] + "..." if len(request.text) > 100 else request.text
        )
    except HTTPException:
        raise  # propagate deliberate HTTP errors untouched
    except Exception as e:
        logger.error(f"Classification error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
111
+
112
@app.post("/classify/batch", response_model=BatchPredictionResponse)
async def classify_batch(request: BatchTextRequest):
    """Classify multiple texts.

    Returns one PredictionResponse per input, in order, each with a
    truncated (100-char) preview of its text.

    Raises:
        HTTPException 503: model not loaded yet.
        HTTPException 500: unexpected classifier failure.
    """
    # Bug fix: this guard used to sit inside the try block, so the 503 it
    # raised was caught by `except Exception` and converted into a 500.
    if classifier is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        results = classifier.batch_predict(request.texts, request.threshold)

        predictions = []
        # zip keeps each result paired with its source text without indexing.
        for text, result in zip(request.texts, results):
            text_preview = text[:100] + "..." if len(text) > 100 else text
            predictions.append(PredictionResponse(
                is_cbt_trigger=result['is_cbt_trigger'],
                confidence=result['confidence'],
                threshold=result['threshold'],
                text=text_preview
            ))

        return BatchPredictionResponse(predictions=predictions)
    except HTTPException:
        raise  # propagate deliberate HTTP errors untouched
    except Exception as e:
        logger.error(f"Batch classification error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
135
+
136
@app.get("/model/info")
async def model_info():
    """Get information about the loaded model"""
    # 503 until the startup hook has populated the global classifier.
    if classifier is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    default_model_dir = Path(__file__).parent / "cbt_classifier"
    info = {
        "model_name": classifier.model_name,
        "model_path": str(default_model_dir),
        "status": "loaded",
    }
    return info
147
+
148
# Allow running the API directly: `python classifier_api.py` serves on :8001.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8001)
test_classifier.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test script for the trained CBT binary classifier.
3
+ """
4
+
5
+ import argparse
6
+ from binary_classifier import CBTBinaryClassifier
7
+
8
def main():
    """CLI entry point: run canned examples through a trained classifier,
    then drop into an interactive classification loop.

    Exits cleanly on 'quit'/'exit'/'q', EOF (Ctrl-D) or Ctrl-C.
    """
    parser = argparse.ArgumentParser(description='Test CBT Binary Classifier')
    parser.add_argument('--model_path', default='./cbt_classifier',
                        help='Path to the trained model')
    parser.add_argument('--threshold', type=float, default=0.7,
                        help='Confidence threshold for CBT trigger detection')

    args = parser.parse_args()

    # Load the trained model
    classifier = CBTBinaryClassifier()
    classifier.load_model(args.model_path)

    # Canned examples covering both classes.
    test_texts = [
        # Normal conversation examples
        "How was your weekend?",
        "Nice weather today!",
        "Did you see that movie last night?",
        "I had a great lunch at that new restaurant",
        "What are your plans for tonight?",

        # CBT trigger examples
        "I'm such a failure at everything",
        "I always mess things up",
        "Everyone probably thinks I'm stupid",
        "I'm not good enough for this job",
        "I'll never be successful",
        "It's all my fault that this happened"
    ]

    print(f"Testing classifier with threshold: {args.threshold}")
    print("=" * 60)

    for text in test_texts:
        result = classifier.predict(text, threshold=args.threshold)

        status = "🚨 CBT TRIGGER" if result['is_cbt_trigger'] else "✅ NORMAL"
        confidence = result['confidence']

        print(f"{status} (confidence: {confidence:.3f})")
        print(f"Text: '{text}'")
        print("-" * 60)

    # Interactive testing
    print("\nInteractive testing (type 'quit' to exit):")
    while True:
        try:
            user_input = input("\nEnter text to classify: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Bug fix: Ctrl-D / Ctrl-C previously crashed with a traceback.
            break

        if user_input.lower() in ['quit', 'exit', 'q']:
            break

        if not user_input:
            continue

        result = classifier.predict(user_input, threshold=args.threshold)

        status = "🚨 CBT TRIGGER" if result['is_cbt_trigger'] else "✅ NORMAL"
        confidence = result['confidence']

        print(f"{status} (confidence: {confidence:.3f})")

if __name__ == "__main__":
    main()
test_model.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
"""Minimal smoke test: load the saved model and classify one sentence."""
from binary_classifier import CBTBinaryClassifier

# Bug fix: these statements used to run at import time, so merely importing
# this module loaded the model and ran a prediction as a side effect.
if __name__ == "__main__":
    classifier = CBTBinaryClassifier()
    classifier.load_model('./cbt_classifier')
    result = classifier.predict('I am happy cause I finished all of my tasks')
    print(f"Prediction: {result['is_cbt_trigger']}, Confidence: {result['confidence']:.3f}")
train_classifier.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training script for CBT binary classifier.
3
+ Run this script to train the model on your CSV data.
4
+ """
5
+
6
+ import argparse
7
+ import logging
8
+ from binary_classifier import CBTBinaryClassifier
9
+
10
# Configure timestamped logging for the whole training run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
15
+
16
def main():
    """Train and evaluate the CBT binary classifier from two labelled CSVs.

    Pipeline: parse CLI args -> prepare data -> tokenize -> split ->
    train -> evaluate -> print usage instructions.
    """
    parser = argparse.ArgumentParser(description='Train CBT Binary Classifier')
    parser.add_argument('--normal_csv', required=True,
                        help='Path to CSV file with normal conversations')
    parser.add_argument('--cbt_csv', required=True,
                        help='Path to CSV file with CBT conversations')
    parser.add_argument('--text_column', default='text',
                        help='Name of the text column in CSV files')
    parser.add_argument('--output_dir', default='./cbt_classifier',
                        help='Directory to save the trained model')
    parser.add_argument('--model_name', default='distilbert-base-uncased',
                        help='Pre-trained model to use (distilbert-base-uncased recommended for laptops)')
    args = parser.parse_args()

    classifier = CBTBinaryClassifier(model_name=args.model_name)

    print("Preparing data...")
    df = classifier.prepare_data(
        normal_csv_path=args.normal_csv,
        cbt_csv_path=args.cbt_csv,
        text_column=args.text_column
    )

    print("Tokenizing data...")
    dataset = classifier.tokenize_data(df)

    print("Splitting data...")
    train_dataset, val_dataset, test_dataset = classifier.split_data(dataset)

    print("Training model...")
    print("Note: Training optimized for laptop performance (smaller batches, fewer epochs)")
    classifier.train_model(train_dataset, val_dataset, output_dir=args.output_dir)

    print("Evaluating model...")
    classifier.evaluate_model(test_dataset)

    # Show the reader how to load the artifact they just produced.
    print(f"\nTraining complete! Model saved to {args.output_dir}")
    print("\nTo use the model for inference:")
    print("from binary_classifier import CBTBinaryClassifier")
    print("classifier = CBTBinaryClassifier()")
    print(f"classifier.load_model('{args.output_dir}')")
    print("result = classifier.predict('Your text here')")

if __name__ == "__main__":
    main()