SaitejaJate committed on
Commit
88b8fd6
·
verified ·
1 Parent(s): ba48e40

Upload 5 files

Browse files
binary_classifier.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import requests
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
6
+ from transformers import (
7
+ AutoTokenizer, AutoModelForSequenceClassification,
8
+ TrainingArguments, Trainer, DataCollatorWithPadding
9
+ )
10
+ import torch
11
+ from datasets import Dataset
12
+ import logging
13
+ import os
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
class CBTBinaryClassifier:
    """Binary classifier separating normal conversation (label 0) from
    CBT-triggering statements (label 1).

    Covers the full workflow: data preparation from labelled CSVs,
    tokenization, train/val/test splitting, fine-tuning with a HuggingFace
    ``Trainer``, evaluation, and inference via a text-classification pipeline.
    """

    def __init__(self, model_name="distilbert-base-uncased"):
        """Create the classifier around a small, laptop-friendly base model.

        Args:
            model_name: HuggingFace model id or local path of the base model.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = None               # set by train_model()
        self.trainer = None             # set by train_model()
        self.inference_pipeline = None  # set by load_model()
        # Placeholders for an optional hosted-inference configuration;
        # nothing in this class assigns them beyond these defaults.
        self.use_hf_api = False
        self.api_url = None
        self.api_token = None
        self.headers = None
        self.model_id = None

        # Some checkpoints ship without a pad token; fall back to EOS so
        # padded batches can still be built.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    @staticmethod
    def _load_labeled_csv(csv_path, text_column, label):
        """Read one CSV, validate the text column, and attach a binary label.

        Returns:
            DataFrame with exactly the columns ['text', 'label'].

        Raises:
            KeyError: if ``text_column`` is not present in the CSV.
        """
        df = pd.read_csv(csv_path)
        if text_column not in df.columns:
            raise KeyError(
                f"Column '{text_column}' not found in {csv_path}; "
                f"available columns: {list(df.columns)}"
            )
        # Drop missing texts up front: NaN values would crash the tokenizer.
        df = df.dropna(subset=[text_column])
        return pd.DataFrame({
            'text': df[text_column].astype(str),
            'label': label,
        })

    def prepare_data(self, normal_csv_path, cbt_csv_path, text_column="text"):
        """Load and prepare training data from CSV files.

        Args:
            normal_csv_path: CSV of normal conversations (labelled 0).
            cbt_csv_path: CSV of CBT-triggering statements (labelled 1).
            text_column: name of the text column in both CSVs.

        Returns:
            A shuffled DataFrame with 'text' and 'label' columns.

        Raises:
            KeyError: if ``text_column`` is missing from either CSV.
        """
        logger.info("Loading normal conversations from %s", normal_csv_path)
        normal_df = self._load_labeled_csv(normal_csv_path, text_column, label=0)

        logger.info("Loading CBT conversations from %s", cbt_csv_path)
        cbt_df = self._load_labeled_csv(cbt_csv_path, text_column, label=1)

        # Combine and shuffle (fixed seed keeps runs reproducible).
        combined_df = (
            pd.concat([normal_df, cbt_df], ignore_index=True)
            .sample(frac=1, random_state=42)
            .reset_index(drop=True)
        )

        logger.info("Total examples: %d", len(combined_df))
        logger.info("Normal conversations: %d", len(normal_df))
        logger.info("CBT triggers: %d", len(cbt_df))

        return combined_df

    def tokenize_data(self, df, max_length=128):
        """Tokenize the text data.

        Args:
            df: DataFrame with 'text' (and typically 'label') columns.
            max_length: truncation/padding length in tokens.

        Returns:
            A tokenized ``datasets.Dataset`` without the raw 'text' column.
        """

        def tokenize_function(examples):
            # NOTE(review): padding to max_length here makes the dynamic
            # DataCollatorWithPadding used at train time effectively a no-op;
            # kept as-is for output stability.
            return self.tokenizer(
                examples['text'],
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors=None
            )

        # Convert to a HuggingFace Dataset and tokenize in batches.
        dataset = Dataset.from_pandas(df)
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['text'])

        return tokenized_dataset

    def split_data(self, dataset, test_size=0.2, val_size=0.1):
        """Split data into train/validation/test sets.

        ``test_size`` and ``val_size`` are fractions of the FULL dataset.

        Returns:
            (train, val, test) datasets.
        """
        # First split: carve the test set off the full dataset. Index the
        # returned DatasetDict by key instead of relying on .values() ordering.
        first = dataset.train_test_split(test_size=test_size, seed=42)
        train_val, test = first['train'], first['test']

        # Second split: rescale val_size relative to the remaining pool so the
        # validation set is val_size of the ORIGINAL dataset.
        val_ratio = val_size / (1 - test_size)
        second = train_val.train_test_split(test_size=val_ratio, seed=42)
        train, val = second['train'], second['test']

        logger.info("Train: %d, Val: %d, Test: %d", len(train), len(val), len(test))
        return train, val, test

    def train_model(self, train_dataset, val_dataset, output_dir="./cbt_classifier"):
        """Train the binary classifier with laptop-friendly settings.

        Saves the best checkpoint and the tokenizer to ``output_dir``.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Fresh 2-label classification head on top of the base model.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2
        )

        # Pads each batch to its longest member at collation time.
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        # Laptop-friendly training arguments: small batches with gradient
        # accumulation, few epochs, mixed precision only when a GPU exists.
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=2,                 # reduced epochs
            per_device_train_batch_size=8,      # smaller batch size
            per_device_eval_batch_size=8,
            gradient_accumulation_steps=2,      # simulate larger batch size
            warmup_steps=100,                   # reduced warmup
            weight_decay=0.01,
            logging_dir=f'{output_dir}/logs',
            logging_steps=50,
            eval_strategy="steps",
            eval_steps=200,
            save_strategy="steps",
            save_steps=200,
            load_best_model_at_end=True,
            metric_for_best_model="eval_accuracy",
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=0,           # reduce CPU usage
            remove_unused_columns=True,
        )

        def compute_metrics(eval_pred):
            # eval_pred is (logits, labels); argmax over the 2 classes.
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return {
                'accuracy': accuracy_score(labels, predictions),
            }

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            data_collator=data_collator,
        )

        logger.info("Starting training...")
        self.trainer.train()

        # Persist both model weights and tokenizer so load_model() can
        # reconstruct the pipeline from output_dir alone.
        self.trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        logger.info("Model saved to %s", output_dir)

    def evaluate_model(self, test_dataset):
        """Evaluate the trained model and print accuracy/report/confusion.

        Returns:
            (y_true, y_pred) label arrays.

        Raises:
            ValueError: if train_model() has not been run.
        """
        if self.trainer is None:
            raise ValueError("Model not trained yet!")

        predictions = self.trainer.predict(test_dataset)
        y_pred = np.argmax(predictions.predictions, axis=1)
        y_true = predictions.label_ids

        print("\n=== Evaluation Results ===")
        print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
        print("\nClassification Report:")
        print(classification_report(y_true, y_pred,
                                    target_names=['Normal', 'CBT Trigger']))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_true, y_pred))

        return y_true, y_pred

    def load_model(self, model_path="./cbt_classifier"):
        """Load a pre-trained model for inference.

        Builds a text-classification pipeline that returns scores for BOTH
        labels on every call (predict() needs the LABEL_1 score).
        """
        from transformers import pipeline

        # NOTE(review): return_all_scores is deprecated in newer transformers
        # in favour of top_k=None, which changes the output nesting — kept
        # as-is to match predict()'s result[0] indexing.
        self.inference_pipeline = pipeline(
            "text-classification",
            model=model_path,
            tokenizer=model_path,
            return_all_scores=True
        )

        logger.info("Model loaded from %s", model_path)

    def predict(self, text, threshold=0.7):
        """Predict if text is CBT-triggering.

        Args:
            text: input string to classify.
            threshold: minimum LABEL_1 confidence to flag a CBT trigger.

        Returns:
            dict with 'is_cbt_trigger' (bool), 'confidence' (float) and
            'threshold' (float).

        Raises:
            ValueError: if load_model() has not been called, or the pipeline
                output contains no 'LABEL_1' entry.
        """
        if self.inference_pipeline is None:
            raise ValueError("Model not loaded! Call load_model() first.")

        result = self.inference_pipeline(text)

        # With return_all_scores=True the pipeline yields one list of
        # {'label', 'score'} dicts per input; take the LABEL_1 (CBT) score.
        cbt_confidence = next(
            (entry['score'] for entry in result[0]
             if entry['label'] == 'LABEL_1'),
            None,
        )
        if cbt_confidence is None:
            # Explicit error instead of an opaque StopIteration from next().
            raise ValueError(
                f"No 'LABEL_1' score in pipeline output: {result[0]!r}"
            )

        return {
            'is_cbt_trigger': cbt_confidence > threshold,
            'confidence': cbt_confidence,
            'threshold': threshold
        }

    def batch_predict(self, texts, threshold=0.7):
        """Predict for multiple texts; one predict() dict per input text."""
        if self.inference_pipeline is None:
            raise ValueError("Model not loaded! Call load_model() first.")

        return [self.predict(text, threshold) for text in texts]
classifier_api.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel, Field
3
+ from typing import List, Dict, Optional
4
+ import logging
5
+ from pathlib import Path
6
+ import sys
7
+ import os
8
+ from huggingface_hub import snapshot_download
9
+
10
# Make sibling modules (binary_classifier.py) importable when this file is
# run directly rather than as an installed package.
sys.path.append(str(Path(__file__).parent))

from binary_classifier import CBTBinaryClassifier

# Configure root logging once for the whole service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="CBT Binary Classifier API",
    description="API for detecting CBT-triggering conversations",
    version="1.0.0"
)
25
+
26
# Request/Response models
class TextRequest(BaseModel):
    """Request body for single-text classification (POST /classify)."""
    text: str = Field(..., description="Text to classify")
    threshold: float = Field(0.7, description="Confidence threshold for CBT trigger detection")
30
+
31
class BatchTextRequest(BaseModel):
    """Request body for batch classification (POST /classify/batch)."""
    texts: List[str] = Field(..., description="List of texts to classify")
    threshold: float = Field(0.7, description="Confidence threshold for CBT trigger detection")
34
+
35
class PredictionResponse(BaseModel):
    """Single prediction result; ``text`` holds a truncated input preview."""
    is_cbt_trigger: bool
    confidence: float
    threshold: float
    text: Optional[str] = None
40
+
41
class BatchPredictionResponse(BaseModel):
    """Batch result: one PredictionResponse per input text, in order."""
    predictions: List[PredictionResponse]
43
+
44
# Module-level classifier instance, populated once at startup.
classifier = None

@app.on_event("startup")
async def startup_event():
    """Load the model on startup.

    Tries a local checkout when USE_LOCAL_MODEL=true, otherwise downloads
    the model from the Hugging Face Hub (repo id from HF_MODEL_ID).
    Re-raises on failure so the server does not come up without a model.
    """
    global classifier
    try:
        classifier = CBTBinaryClassifier()

        # Hub repo to fall back to; overridable via the HF_MODEL_ID env var.
        hf_model_id = os.getenv("HF_MODEL_ID", "SaitejaJate/Binary_classifier")
        local_model_path = Path(__file__).parent / "cbt_classifier"

        # USE_LOCAL_MODEL=true forces the local checkout; anything else
        # (or unset) downloads from the Hub.
        use_local = os.getenv("USE_LOCAL_MODEL", "false").lower() == "true"

        if use_local and local_model_path.exists():
            # Use local model
            classifier.load_model(str(local_model_path))
            logger.info(f"Model loaded successfully from local path: {local_model_path}")
        else:
            # Download from Hugging Face Hub
            logger.info(f"Downloading model from Hugging Face Hub: {hf_model_id}")
            cache_dir = Path(__file__).parent / "model_cache"

            # snapshot_download returns the directory holding the model files.
            model_path = snapshot_download(
                repo_id=hf_model_id,
                cache_dir=str(cache_dir),
                local_dir=str(cache_dir / "downloaded_model")
            )

            classifier.load_model(model_path)
            logger.info(f"Model loaded successfully from Hugging Face Hub")

    except Exception as e:
        # Fail fast: propagate so startup aborts instead of serving 500s.
        logger.error(f"Failed to load model: {e}")
        raise
83
+
84
@app.get("/")
async def root():
    """Health check endpoint"""
    # Report liveness plus whether the startup hook managed to load a model.
    health_payload = {
        "status": "active",
        "service": "CBT Binary Classifier API",
        "model_loaded": classifier is not None,
    }
    return health_payload
92
+
93
@app.post("/classify", response_model=PredictionResponse)
async def classify_text(request: TextRequest):
    """Classify a single text.

    Returns the prediction plus a preview (first 100 chars) of the input.

    Raises:
        HTTPException 503: model not loaded yet.
        HTTPException 500: unexpected classifier failure.
    """
    # Bug fix: this guard used to sit inside the try block, so the 503 it
    # raised was caught by `except Exception` and converted into a 500.
    if classifier is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        result = classifier.predict(request.text, request.threshold)

        return PredictionResponse(
            is_cbt_trigger=result['is_cbt_trigger'],
            confidence=result['confidence'],
            threshold=result['threshold'],
            text=request.text[:100] + "..." if len(request.text) > 100 else request.text
        )
    except HTTPException:
        raise  # propagate deliberate HTTP errors untouched
    except Exception as e:
        logger.error(f"Classification error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
111
+
112
@app.post("/classify/batch", response_model=BatchPredictionResponse)
async def classify_batch(request: BatchTextRequest):
    """Classify multiple texts.

    Returns one PredictionResponse per input, in order, each with a
    truncated (100-char) preview of its text.

    Raises:
        HTTPException 503: model not loaded yet.
        HTTPException 500: unexpected classifier failure.
    """
    # Bug fix: this guard used to sit inside the try block, so the 503 it
    # raised was caught by `except Exception` and converted into a 500.
    if classifier is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        results = classifier.batch_predict(request.texts, request.threshold)

        predictions = []
        # zip keeps each result paired with its source text without indexing.
        for text, result in zip(request.texts, results):
            text_preview = text[:100] + "..." if len(text) > 100 else text
            predictions.append(PredictionResponse(
                is_cbt_trigger=result['is_cbt_trigger'],
                confidence=result['confidence'],
                threshold=result['threshold'],
                text=text_preview
            ))

        return BatchPredictionResponse(predictions=predictions)
    except HTTPException:
        raise  # propagate deliberate HTTP errors untouched
    except Exception as e:
        logger.error(f"Batch classification error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
135
+
136
@app.get("/model/info")
async def model_info():
    """Get information about the loaded model"""
    # 503 until the startup hook has populated the global classifier.
    if classifier is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    default_model_dir = Path(__file__).parent / "cbt_classifier"
    info = {
        "model_name": classifier.model_name,
        "model_path": str(default_model_dir),
        "status": "loaded",
    }
    return info
147
+
148
# Allow running the API directly: `python classifier_api.py` serves on :8001.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8001)
test_classifier.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test script for the trained CBT binary classifier.
3
+ """
4
+
5
+ import argparse
6
+ from binary_classifier import CBTBinaryClassifier
7
+
8
def main():
    """CLI entry point: run canned examples through a trained classifier,
    then drop into an interactive classification loop.

    Exits cleanly on 'quit'/'exit'/'q', EOF (Ctrl-D) or Ctrl-C.
    """
    parser = argparse.ArgumentParser(description='Test CBT Binary Classifier')
    parser.add_argument('--model_path', default='./cbt_classifier',
                        help='Path to the trained model')
    parser.add_argument('--threshold', type=float, default=0.7,
                        help='Confidence threshold for CBT trigger detection')

    args = parser.parse_args()

    # Load the trained model
    classifier = CBTBinaryClassifier()
    classifier.load_model(args.model_path)

    # Canned examples covering both classes.
    test_texts = [
        # Normal conversation examples
        "How was your weekend?",
        "Nice weather today!",
        "Did you see that movie last night?",
        "I had a great lunch at that new restaurant",
        "What are your plans for tonight?",

        # CBT trigger examples
        "I'm such a failure at everything",
        "I always mess things up",
        "Everyone probably thinks I'm stupid",
        "I'm not good enough for this job",
        "I'll never be successful",
        "It's all my fault that this happened"
    ]

    print(f"Testing classifier with threshold: {args.threshold}")
    print("=" * 60)

    for text in test_texts:
        result = classifier.predict(text, threshold=args.threshold)

        status = "🚨 CBT TRIGGER" if result['is_cbt_trigger'] else "✅ NORMAL"
        confidence = result['confidence']

        print(f"{status} (confidence: {confidence:.3f})")
        print(f"Text: '{text}'")
        print("-" * 60)

    # Interactive testing
    print("\nInteractive testing (type 'quit' to exit):")
    while True:
        try:
            user_input = input("\nEnter text to classify: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Bug fix: Ctrl-D / Ctrl-C previously crashed with a traceback.
            break

        if user_input.lower() in ['quit', 'exit', 'q']:
            break

        if not user_input:
            continue

        result = classifier.predict(user_input, threshold=args.threshold)

        status = "🚨 CBT TRIGGER" if result['is_cbt_trigger'] else "✅ NORMAL"
        confidence = result['confidence']

        print(f"{status} (confidence: {confidence:.3f})")

if __name__ == "__main__":
    main()
test_model.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
"""Minimal smoke test: load the saved model and classify one sentence."""
from binary_classifier import CBTBinaryClassifier

# Bug fix: these statements used to run at import time, so merely importing
# this module loaded the model and ran a prediction as a side effect.
if __name__ == "__main__":
    classifier = CBTBinaryClassifier()
    classifier.load_model('./cbt_classifier')
    result = classifier.predict('I am happy cause I finished all of my tasks')
    print(f"Prediction: {result['is_cbt_trigger']}, Confidence: {result['confidence']:.3f}")
train_classifier.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training script for CBT binary classifier.
3
+ Run this script to train the model on your CSV data.
4
+ """
5
+
6
+ import argparse
7
+ import logging
8
+ from binary_classifier import CBTBinaryClassifier
9
+
10
# Configure timestamped logging for the whole training run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
15
+
16
def main():
    """Train and evaluate the CBT binary classifier from two labelled CSVs.

    Pipeline: parse CLI args -> prepare data -> tokenize -> split ->
    train -> evaluate -> print usage instructions.
    """
    parser = argparse.ArgumentParser(description='Train CBT Binary Classifier')
    parser.add_argument('--normal_csv', required=True,
                        help='Path to CSV file with normal conversations')
    parser.add_argument('--cbt_csv', required=True,
                        help='Path to CSV file with CBT conversations')
    parser.add_argument('--text_column', default='text',
                        help='Name of the text column in CSV files')
    parser.add_argument('--output_dir', default='./cbt_classifier',
                        help='Directory to save the trained model')
    parser.add_argument('--model_name', default='distilbert-base-uncased',
                        help='Pre-trained model to use (distilbert-base-uncased recommended for laptops)')
    args = parser.parse_args()

    classifier = CBTBinaryClassifier(model_name=args.model_name)

    print("Preparing data...")
    df = classifier.prepare_data(
        normal_csv_path=args.normal_csv,
        cbt_csv_path=args.cbt_csv,
        text_column=args.text_column
    )

    print("Tokenizing data...")
    dataset = classifier.tokenize_data(df)

    print("Splitting data...")
    train_dataset, val_dataset, test_dataset = classifier.split_data(dataset)

    print("Training model...")
    print("Note: Training optimized for laptop performance (smaller batches, fewer epochs)")
    classifier.train_model(train_dataset, val_dataset, output_dir=args.output_dir)

    print("Evaluating model...")
    classifier.evaluate_model(test_dataset)

    # Show the reader how to load the artifact they just produced.
    print(f"\nTraining complete! Model saved to {args.output_dir}")
    print("\nTo use the model for inference:")
    print("from binary_classifier import CBTBinaryClassifier")
    print("classifier = CBTBinaryClassifier()")
    print(f"classifier.load_model('{args.output_dir}')")
    print("result = classifier.predict('Your text here')")

if __name__ == "__main__":
    main()