# File size: 17,784 Bytes

# 9cb1789

import json
import torch
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score
import torch.amp as amp
import re
import warnings
from transformers import logging
import os

# Suppress all Python warnings and restrict the transformers logger to errors
# (presumably to keep the interactive console output readable).
warnings.filterwarnings("ignore")
logging.set_verbosity_error()


class IntentDataset(Dataset):
    """Torch dataset that tokenizes utterances for intent classification.

    Each item is a dict holding ``input_ids`` and ``attention_mask`` (the
    tokenizer output flattened to 1-D) and, unless ``is_inference`` is set,
    a scalar ``labels`` tensor looked up through ``intent_to_label``.

    Args:
        questions: Indexable collection of utterances (converted with ``str``).
        intents: Indexable collection of intent names aligned with
            ``questions``. Not read when ``is_inference`` is True, so it may
            be ``None`` in that mode.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length the tokenizer pads/truncates every utterance to.
        intent_to_label: Mapping from intent name to integer class id.
        is_inference: When True, items carry no ``labels`` entry.
    """

    def __init__(self, questions, intents, tokenizer, max_len, intent_to_label, is_inference=False):
        self.questions = questions
        self.intents = intents
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.intent_to_label = intent_to_label
        self.is_inference = is_inference

    def __len__(self):
        """Return the number of utterances."""
        return len(self.questions)

    def __getitem__(self, item):
        """Tokenize utterance ``item`` and return its tensors.

        Raises:
            KeyError: In training mode, if the intent has no entry in
                ``intent_to_label``.
        """
        question = str(self.questions[item])

        encoding = self.tokenizer.encode_plus(
            question,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        # return_tensors='pt' yields a leading batch axis; flatten drops it so
        # the DataLoader can stack samples itself.
        sample = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        # Only touch the labels in training mode: inference datasets may be
        # built without any intents at all.
        if not self.is_inference:
            intent = self.intents[item]
            sample['labels'] = torch.tensor(self.intent_to_label[intent], dtype=torch.long)

        return sample


def _extract_pairs(records):
    """Return ``(utterance, intent)`` tuples parsed from raw JSON records.

    Dict records are read through the first truthy value among the keys
    ``utterance``/``text``/``question`` and ``intent``/``label``/``class``;
    two-element lists are taken as-is. Anything else is skipped.
    """
    pairs = []
    for item in records:
        if isinstance(item, dict):
            utterance = item.get('utterance') or item.get('text') or item.get('question')
            intent = item.get('intent') or item.get('label') or item.get('class')
            if utterance and intent:
                pairs.append((utterance, intent))
        elif isinstance(item, list) and len(item) == 2:
            pairs.append(tuple(item))
    return pairs


def load_data(test_file, val_file, seed=None):
    """Load the labelled examples and split them into train/test/validation.

    The records in ``test_file`` are shuffled and split 70/30 into a training
    and a test portion; ``val_file`` is used unchanged as the validation set.

    Args:
        test_file: Path to the JSON file whose records are split into the
            training and test portions.
        val_file: Path to the JSON file holding the validation records.
        seed: Optional seed for a private RNG, making the split reproducible.
            When ``None`` the module-level ``random`` state is used.

    Returns:
        ``(train, test, val, intent_labels)`` where the first three are lists
        of ``(utterance, intent)`` tuples and ``intent_labels`` is the sorted
        list of distinct intents across all three sets.
    """
    import random

    print(f"Reading test data from {test_file}")
    with open(test_file, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    print(f"Reading validation data from {val_file}")
    with open(val_file, 'r', encoding='utf-8') as f:
        val_data = json.load(f)

    all_data = _extract_pairs(test_data)

    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(all_data)
    split_point = int(len(all_data) * 0.7)
    train_processed = all_data[:split_point]
    test_processed = all_data[split_point:]

    val_processed = _extract_pairs(val_data)

    # Sort so that the label -> index assignment is stable across runs; plain
    # set iteration order changes with string hash randomisation.
    intent_labels = sorted(
        {intent for _, intent in train_processed + test_processed + val_processed},
        key=str,
    )
    print(
        f"Loaded {len(train_processed)} training examples, {len(test_processed)} test examples, and {len(val_processed)} validation examples")
    return train_processed, test_processed, val_processed, intent_labels


class IntentClassifier:
    """Multi-segment intent classifier built on a fine-tuned BERT model.

    Input text is split into segments, each segment is classified, and the
    per-segment predictions are re-ranked with hand-tuned priority weights.

    Args:
        model_path: Directory (or model id) passed to ``from_pretrained`` for
            both the model and the tokenizer. ``intent_mapping.json`` is
            looked up in this directory first.
        device: Torch device the model and the inputs are moved to.
    """

    def __init__(self, model_path, device):
        self.device = device
        self.model_path = model_path
        self.model = BertForSequenceClassification.from_pretrained(model_path).to(device)
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model.eval()

        # Label mapping is loaded lazily on first use and then cached.
        self._intent_mapping = None

        # Updated priority weights with more nuanced categorization
        self.intent_priorities = {
            # High Priority (Information Seeking)
            'RecommendationRequest': 1.0,
            'Request': 1.0,
            'ComparisonRequest': 0.95,
            'ClarificationRequest': 0.95,

            # Important Context
            'Fact': 0.9,
            'ActionReport': 0.85,
            'Preference': 0.85,

            # Supporting Information
            'Opinion': 0.7,
            'SystemRecommendation': 0.7,
            'Answer': 0.7,

            # Secondary Information
            'Sentiment': 0.5,
            'Feedback': 0.5,
            'ReferenceToPriorConversation': 0.5,

            # Low Priority
            'Greetings': 0.3,
            'Farewell': 0.3,
            'AgreementWithSystem': 0.3,
            'DisagreementWithSystem': 0.3,

            # Special Cases
            'IrrelevantUtterance': 0.1,
            'Unclear': 0.1
        }

        # Intent relationships for context
        self.intent_relationships = {
            'RecommendationRequest': ['Fact', 'Preference', 'ActionReport'],
            'Request': ['Fact', 'Preference', 'ActionReport'],
            'ComparisonRequest': ['Fact', 'Preference'],
            'ClarificationRequest': ['Fact', 'ReferenceToPriorConversation']
        }

    def segment_text(self, text):
        """Split ``text`` into clause-like segments.

        `` and `` and a comma followed by a capital letter are treated as
        sentence breaks, the text is split on ``.``, ``!`` and ``?``, and each
        piece is split again around a few subordinating markers.

        Returns:
            A non-empty list of stripped segments; if nothing survives the
            splitting, the stripped input is returned as the only segment.
        """
        # Split on stronger boundaries
        text = re.sub(r' and ', '. ', text)
        text = re.sub(r', (?=[A-Z])', '. ', text)

        # Split on sentence boundaries and drop empty pieces
        segments = [s.strip() for s in re.split('[.!?]', text) if s.strip()]

        # Handle subordinate clauses. re.split returns the segment unchanged
        # when no marker matches, so no separate pre-check is needed.
        # NOTE(review): the capturing group keeps the marker itself (e.g.
        # "because") as a standalone segment - confirm this is intended.
        refined_segments = []
        for segment in segments:
            parts = re.split(r'\b(because|since|as|would like|want to)\b', segment, flags=re.IGNORECASE)
            refined_segments.extend(p.strip() for p in parts if p.strip())

        if not refined_segments:
            refined_segments = [text.strip()]

        return refined_segments

    def classify_segment(self, text):
        """Classify one segment.

        Returns:
            ``(indices, confidences)``: two Python lists with the class ids
            and softmax probabilities of the top predictions, best first.
            Up to three entries are returned (fewer if the model has fewer
            than three labels).
        """
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)
            probabilities = torch.softmax(outputs.logits, dim=1)[0]  # Take first element
            # topk raises if k exceeds the number of classes.
            top_k = min(3, probabilities.numel())
            confidence_values, pred_indices = torch.topk(probabilities, k=top_k)

            # Convert to Python lists/floats
            return (pred_indices.cpu().tolist(),
                    confidence_values.cpu().tolist())

    def _load_intent_mapping(self):
        """Read ``intent_mapping.json`` and return it, or ``None`` on failure.

        The model directory is tried first (that is where training writes the
        file), then the current working directory.
        """
        candidates = []
        model_path = getattr(self, 'model_path', None)
        if model_path:
            candidates.append(os.path.join(str(model_path), 'intent_mapping.json'))
        candidates.append('intent_mapping.json')

        last_error = None
        for path in candidates:
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:  # missing file or invalid JSON
                last_error = e
                continue
            if isinstance(data, dict):
                return data
            last_error = TypeError(f"{path} does not contain a JSON object")

        print(f"Error loading intent mapping: {last_error}")
        return None

    def get_intent_label(self, index):
        """Return the intent name for class id ``index`` or ``"Unknown"``.

        The mapping file is read once and cached; a failed load is not cached
        so a later call can retry.
        """
        mapping = getattr(self, '_intent_mapping', None)
        if mapping is None:
            mapping = self._load_intent_mapping()
            if mapping is None:
                return "Unknown"
            self._intent_mapping = mapping
        return mapping.get(f"LABEL_{index}", "Unknown")

    def classify_text(self, text):
        """Enhanced text classification with context awareness.

        Returns:
            ``None`` when no segment could be classified; otherwise a list
            with the primary result followed by up to two secondary results.
            Each result is a dict with the keys ``segment``, ``intent``,
            ``confidence``, ``weighted_confidence`` and ``base_priority``.
        """
        segments = self.segment_text(text)
        all_results = []
        key_phrases = ('need', 'want', 'help', 'recommend', 'advice', 'suggest')

        # First pass: Classify all segments
        for position, segment in enumerate(segments):
            try:
                pred_indices, confidence_values = self.classify_segment(segment)

                # Adjust priority based on segment position. Compare positions
                # rather than text so a later repeat of the first segment is
                # not boosted as well.
                position_boost = 1.0 if position == 0 else 0.9

                # Boost priority if contains key phrases
                lowered = segment.lower()
                content_boost = 1.2 if any(phrase in lowered for phrase in key_phrases) else 1.0

                segment_results = []
                for pred_idx, conf in zip(pred_indices, confidence_values):
                    intent = self.get_intent_label(pred_idx)
                    base_priority = self.intent_priorities.get(intent, 0.5)

                    weighted_confidence = float(conf) * base_priority * position_boost * content_boost

                    segment_results.append({
                        'segment': segment,
                        'intent': intent,
                        'confidence': float(conf),
                        'weighted_confidence': weighted_confidence,
                        'base_priority': base_priority
                    })

                all_results.extend(segment_results)
            except Exception as e:
                # Best effort: one failing segment must not abort the rest.
                print(f"Error processing segment '{segment}': {e}")
                continue

        if not all_results:
            return None

        # Sort by weighted confidence
        all_results.sort(key=lambda x: x['weighted_confidence'], reverse=True)

        # Filter and enhance results
        primary_intent = all_results[0]
        secondary_intents = []

        # Look for supporting intents
        if primary_intent['intent'] in self.intent_relationships:
            related_intents = self.intent_relationships[primary_intent['intent']]
            for result in all_results[1:]:
                if (result['intent'] in related_intents and
                        result['confidence'] > 0.4 and
                        len(secondary_intents) < 2):
                    secondary_intents.append(result)

        # Add high confidence intents if we still need more
        if len(secondary_intents) < 2:
            for result in all_results[1:]:
                if (result['weighted_confidence'] > 0.4 and
                        result not in secondary_intents and
                        len(secondary_intents) < 2):
                    secondary_intents.append(result)

        return [primary_intent] + secondary_intents


def train_model(train_data, val_data, intent_labels, device, *, num_epochs=25,
                batch_size=16, learning_rate=2e-5, max_len=128,
                output_dir='./fine_tuned_bert'):
    """Fine-tune ``bert-base-uncased`` for intent classification.

    After every epoch the model is evaluated on ``val_data``; whenever the
    weighted F1 improves, the model, the tokenizer and ``intent_mapping.json``
    are written to ``output_dir``.

    Args:
        train_data: Non-empty sequence of ``(utterance, intent)`` pairs.
        val_data: Non-empty sequence of ``(utterance, intent)`` pairs.
        intent_labels: Ordered list of intent names; the position of a name
            is its class id.
        device: Torch device (or device string) to train on.
        num_epochs: Number of passes over ``train_data``.
        batch_size: Batch size for both loaders.
        learning_rate: AdamW learning rate.
        max_len: Token length every utterance is padded/truncated to.
        output_dir: Directory that receives the best checkpoint.

    Returns:
        ``(model, tokenizer)`` where ``model`` is the best checkpoint
        reloaded from ``output_dir`` (or the in-memory model if nothing was
        saved, e.g. ``num_epochs == 0``).

    Raises:
        ValueError: If ``train_data`` or ``val_data`` is empty.
    """
    if not train_data or not val_data:
        raise ValueError("train_data and val_data must both be non-empty")

    train_questions, train_intents = zip(*train_data)
    val_questions, val_intents = zip(*val_data)
    intent_to_label = {intent: i for i, intent in enumerate(intent_labels)}

    config = BertConfig.from_pretrained('bert-base-uncased',
                                        num_labels=len(intent_labels))
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    model = model.to(device)

    train_dataset = IntentDataset(train_questions, train_intents, tokenizer, max_len=max_len,
                                  intent_to_label=intent_to_label)
    val_dataset = IntentDataset(val_questions, val_intents, tokenizer, max_len=max_len,
                                intent_to_label=intent_to_label)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=len(train_loader) * num_epochs)

    # Derive the AMP device from the device actually used for training, not
    # from global CUDA availability, so a CPU run on a CUDA machine works.
    device_type = torch.device(device).type
    # Loss scaling is only meaningful for CUDA float16 autocast; when disabled
    # the scaler calls are plain pass-throughs.
    scaler = amp.GradScaler(enabled=(device_type == 'cuda'))

    # Start below any reachable F1 so the first epoch always checkpoints.
    best_val_f1 = -1.0
    checkpoint_saved = False
    intent_mapping = {f"LABEL_{i}": intent for i, intent in enumerate(intent_labels)}

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with amp.autocast(device_type=device_type):
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_loss += loss.item()

        # Validation
        model.eval()
        val_preds = []
        val_true = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                _, preds = torch.max(outputs.logits, dim=1)

                val_preds.extend(preds.cpu().tolist())
                val_true.extend(labels.cpu().tolist())

        val_f1 = f1_score(val_true, val_preds, average='weighted')
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}, Val F1: {val_f1:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            model.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            print(f"New best model saved with validation F1: {best_val_f1:.4f}")
            # Save intent mapping to model directory
            with open(os.path.join(output_dir, 'intent_mapping.json'), 'w', encoding='utf-8') as f:
                json.dump(intent_mapping, f, indent=2)
            checkpoint_saved = True

    # Hand back the best checkpoint rather than the last-epoch weights, so a
    # caller that saves the returned model does not clobber the best one.
    if checkpoint_saved:
        model = BertForSequenceClassification.from_pretrained(output_dir).to(device)
        model.eval()

    return model, tokenizer


def interactive_classification(model_path='./fine_tuned_bert'):
    """Run a console loop that classifies questions typed by the user.

    Loads an ``IntentClassifier`` from ``model_path`` and then prompts until
    the user enters ``quit``, ``exit`` or ``q``, or closes/interrupts the
    input stream. If the model cannot be loaded, a message is printed and
    the function returns without prompting.

    Args:
        model_path: Directory of the fine-tuned model to load.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Keep the try limited to loading, so that failures in the prompt loop
    # are not misreported as model-loading errors.
    try:
        classifier = IntentClassifier(model_path, device)
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        print("Please ensure the model has been trained and saved correctly.")
        return

    print("\nModel loaded successfully!")
    print("\nEnter your questions (type 'quit' to exit):")

    while True:
        try:
            question = input("\nEnter your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the loop like an explicit 'quit'.
            print("\nExiting...")
            break

        if question.lower() in ['quit', 'exit', 'q']:
            print("Exiting...")
            break

        if not question:
            print("Please enter a valid question.")
            continue

        try:
            results = classifier.classify_text(question)

            if results:
                print("\nResults:")
                for i, result in enumerate(results, 1):
                    print(f"\nIntent {i}:")
                    print(f"Detected Intent: {result['intent']}")
                    print(f"Confidence: {result['confidence']:.2%}")
                    print(f"Segment: {result['segment']}")

                    conf = result['confidence']
                    if conf >= 0.9:
                        print("Confidence Level: Very High")
                    elif conf >= 0.7:
                        print("Confidence Level: High")
                    elif conf >= 0.5:
                        print("Confidence Level: Moderate")
                    else:
                        print("Confidence Level: Low")
            else:
                print("Could not determine intent with sufficient confidence.")

        except Exception as e:
            # Best effort: report the problem and keep the session alive.
            print(f"Error processing question: {str(e)}")
            print("Please try another question.")


def main():
    """Start interactive classification, training a model first if needed.

    If ``model_dir`` already holds a complete saved model (config, tokenizer
    files, intent mapping and at least one weights file) the interactive loop
    starts immediately. Otherwise the data files are loaded, a model is
    trained and saved into ``model_dir``, and the loop starts afterwards.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # NOTE(review): interactive_classification() and train_model() default to
    # './fine_tuned_bert', which is the same directory as model_dir only when
    # the working directory is /app/code - confirm the deployment layout.
    model_dir = '/app/code/fine_tuned_bert'
    required_files = {
        'config.json',
        'intent_mapping.json',
        'tokenizer_config.json',
        'vocab.txt'
    }
    model_files = {'pytorch_model.bin', 'model.safetensors'}  # Common model formats

    # Check for existing valid model
    if os.path.isdir(model_dir):
        existing_files = set(os.listdir(model_dir))

        # At least one weights file plus every required auxiliary file
        has_model_file = not model_files.isdisjoint(existing_files)
        has_required = required_files.issubset(existing_files)

        if has_model_file and has_required:
            # The classifier is loaded inside interactive_classification();
            # loading it here as well would only load the weights twice.
            print("Found valid existing model. Loading...")
            print("\nStarting interactive classification...")
            interactive_classification()
            return

    # If we get here, train new model
    print("Model not found or incomplete. Starting training...")
    os.makedirs(model_dir, exist_ok=True)

    # Load data and create mapping
    train_data, test_data, val_data, intent_labels = load_data(
        'training-22-intent.json',
        'validation-22-intent.json'
    )
    intent_mapping = {f"LABEL_{i}": intent for i, intent in enumerate(intent_labels)}
    with open(os.path.join(model_dir, 'intent_mapping.json'), 'w', encoding='utf-8') as f:
        json.dump(intent_mapping, f, indent=2)

    # Train and save
    model, tokenizer = train_model(train_data, val_data, intent_labels, device)
    model.save_pretrained(model_dir, safe_serialization=False)  # Force PyTorch format
    tokenizer.save_pretrained(model_dir)

    print("Training complete. Starting classification...")
    interactive_classification()


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()