File size: 23,523 Bytes

5de5c2b

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset
import re


class IntentDataset(Dataset):
    """

    Dataset for handling student input and session context for 5-class intent categorization.

    """
    def __init__(self, data, tokenizer, max_length=128):
        # data: list of dicts with 'student_input', 'session_context', 'label'
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_map = {
            'On-Topic Question': 0,
            'Off-Topic Question': 1,
            'Emotional-State': 2,
            'Pace-Related': 3,
            'Repeat/clarification': 4
        }
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        student_input = str(item.get('student_input', ''))
        session_context = str(item.get('session_context', ''))
        
        # Tokenize pair — longest_first truncation preserves student input priority
        encoded = self.tokenizer(
            student_input,
            session_context,
            padding='max_length',
            truncation='longest_first',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        label_val = item.get('label', 0)
        if isinstance(label_val, str):
            label_val = self.label_map.get(label_val, 0)
            
        output = {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(label_val, dtype=torch.long)
        }
        if 'token_type_ids' in encoded:
            output['token_type_ids'] = encoded['token_type_ids'].squeeze(0)
            
        return output


class CompoundSentenceSplitter:
    """

    Algorithm to split compound sentences containing 2 separate questions.

    Handles various patterns and conjunctions commonly used to combine questions.

    English only.

    """
    
    def __init__(self):
        # English question words
        self.question_words = [
            'what', 'when', 'where', 'which', 'who', 'whom', 'whose', 'why', 'how',
            'is', 'are', 'was', 'were', 'do', 'does', 'did', 'can', 'could', 
            'will', 'would', 'should', 'may', 'might', 'must'
        ]
        
        # English conjunctions
        self.conjunctions = [
            'and', 'or', 'also', 'plus', 'additionally', 'moreover'
        ]
        
        # English transition phrases
        self.transition_phrases = [
            'and also', 'and what about', 'and how about', 'or what about', 
            'or how about', 'also what', 'also how', 'also when', 'also where',
            'also who', 'also why', 'plus what', 'plus how'
        ]
    
    def split_compound_question(self, text):
        """

        Split a compound sentence into 2 separate questions if applicable.

        Works with English text.

        

        Args:

            text (str): Input text that may contain compound questions

            

        Returns:

            list: List of separated questions. Returns [text] if no split is needed.

        """
        text = text.strip()
        
        # Check if text is likely a question
        if not self._is_question(text):
            return [text]
        
        # Try different splitting strategies
        questions = []
        
        # Strategy 1: Split by transition phrases
        questions = self._split_by_transition_phrases(text)
        if len(questions) > 1:
            return self._clean_questions(questions)
        
        # Strategy 2: Split by conjunction followed by question word
        questions = self._split_by_conjunction_pattern(text)
        if len(questions) > 1:
            return self._clean_questions(questions)
        
        # Strategy 3: Split by semicolon or comma-conjunction pattern
        questions = self._split_by_punctuation_pattern(text)
        if len(questions) > 1:
            return self._clean_questions(questions)
        
        # Strategy 4: Split by multiple question marks
        questions = self._split_by_question_marks(text)
        if len(questions) > 1:
            return self._clean_questions(questions)
        
        # No split found, return original
        return [text]
    
    def _is_question(self, text):
        """Check if text is likely a question (English)"""
        text_stripped = text.strip()
        
        # Has question mark
        if '?' in text:
            return True
        
        # Check for question words at the start
        words = text_stripped.split()
        if words:
            first_word = words[0].lower()
            # Check English question words
            if first_word in self.question_words:
                return True
        
        return False
    
    def _split_by_transition_phrases(self, text):
        """Split by transition phrases (English)"""
        for phrase in self.transition_phrases:
            # English phrase with word boundaries
            pattern = r'\s+' + re.escape(phrase) + r'\s+'
            
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                parts = re.split(pattern, text, maxsplit=1, flags=re.IGNORECASE)
                if len(parts) == 2 and parts[0] and parts[1]:
                    return parts
        
        return [text]
    
    def _split_by_conjunction_pattern(self, text):
        """Split by conjunction followed by question word (English)"""
        # Pattern: conjunction + question word
        for conj in self.conjunctions:
            for qword in self.question_words:
                # English pattern with word boundaries
                pattern = r'\s+' + re.escape(conj) + r'\s+' + re.escape(qword) + r'\b'
                
                match = re.search(pattern, text, re.IGNORECASE)
                
                if match:
                    # Find the actual position in original text
                    split_pos = match.start()
                    part1 = text[:split_pos].strip()
                    part2 = text[split_pos:].strip()
                    
                    # Remove leading conjunction from part2
                    for c in self.conjunctions:
                        is_arabic_c = any(ch in 'أبتثجحخدذرزسشصضطظعغفقكلمنهويىةؤإآ' for ch in c)
                        part2 = re.sub(r'^\s*' + re.escape(c) + r'\s+', '', part2, flags=re.IGNORECASE if not is_arabic_c else 0)
                    
                    # Ensure both parts are questions
                    if part1 and part2 and self._is_question(part1):
                        return [part1, part2]
        
        return [text]
    
    def _split_by_punctuation_pattern(self, text):
        """Split by semicolon or specific comma patterns"""
        # Split by semicolon (works for both languages)
        if ';' in text or '؛' in text:  # Added Arabic semicolon
            parts = re.split(r'[;؛]', text, maxsplit=1)
            if len(parts) == 2:
                parts = [p.strip() for p in parts]
                if all(self._is_question(p) for p in parts):
                    return parts
        
        # Split by comma followed by question word
        pattern = r',\s+(?=' + '|'.join([re.escape(qw) for qw in self.question_words]) + r')'
        parts = re.split(pattern, text, maxsplit=1, flags=re.IGNORECASE)
        
        if len(parts) == 2:
            parts = [p.strip() for p in parts]
            # Only split if second part is clearly a question
            if self._is_question(parts[1]):
                return parts
        
        return [text]
    
    def _split_by_question_marks(self, text):
        """Split by question marks if multiple exist (both ? and ؟)"""
        # Count both English and Arabic question marks
        q_marks = text.count('?') + text.count('؟')
        
        if q_marks >= 2:
            # Split at first question mark
            match = re.search(r'[?؟]', text)
            if match:
                split_pos = match.end()
                part1 = text[:split_pos].strip()
                part2 = text[split_pos:].strip()
                
                if part2:  # Ensure second part is not empty
                    return [part1, part2]
        
        return [text]
    
    def _clean_questions(self, questions):
        """Clean and validate split questions"""
        cleaned = []
        
        for q in questions:
            q = q.strip()
            
            # Skip empty questions
            if not q:
                continue
            
            # Ensure question ends with '?' or '؟' if it's clearly a question
            if self._is_question(q):
                # Check if already has question mark
                if not (q.endswith('?') or q.endswith('؟')):
                    # Add appropriate question mark based on language
                    if any(c in 'أبتثجحخدذرزسشصضطظعغفقكلمنهويىةؤإآ' for c in q):
                        q += '؟'  # Arabic question mark
                    else:
                        q += '?'  # English question mark
            
            cleaned.append(q)
        
        return cleaned if len(cleaned) > 1 else [' '.join(questions)]


class TinyBertCNN(nn.Module):
    """

    TinyBERT-CNN model for intent classification.

    Combines TinyBERT embeddings with CNN layers + BatchNorm + hidden FC layer.

    """
    
    def __init__(

        self,

        num_classes,

        bert_model_name='huawei-noah/TinyBERT_General_4L_312D',

        num_filters=256,

        filter_sizes=[2, 3, 4],

        dropout=0.5,

        hidden_dim=128,

        freeze_bert=False

    ):
        """

        Args:

            num_classes (int): Number of intent classes

            bert_model_name (str): Pre-trained TinyBERT model name

            num_filters (int): Number of filters for each filter size

            filter_sizes (list): List of filter sizes for CNN

            dropout (float): Dropout rate

            hidden_dim (int): Hidden FC layer dimension

            freeze_bert (bool): Whether to freeze BERT parameters

        """
        super(TinyBertCNN, self).__init__()
        
        # Load TinyBERT model
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.bert_hidden_size = self.bert.config.hidden_size
        
        # Freeze BERT parameters if specified
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
        # CNN layers with BatchNorm
        self.convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=self.bert_hidden_size,
                out_channels=num_filters,
                kernel_size=fs
            )
            for fs in filter_sizes
        ])
        self.batchnorms = nn.ModuleList([
            nn.BatchNorm1d(num_filters)
            for _ in filter_sizes
        ])
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
        # Hidden FC layer
        cnn_out_dim = len(filter_sizes) * num_filters
        self.fc_hidden = nn.Linear(cnn_out_dim, hidden_dim)
        self.bn_hidden = nn.BatchNorm1d(hidden_dim)
        
        # Output layer
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """

        Forward pass

        

        Args:

            input_ids: Token IDs (batch_size, seq_len)

            attention_mask: Attention mask (batch_size, seq_len)

            token_type_ids: Token type IDs (batch_size, seq_len), optional

            

        Returns:

            logits: Classification logits (batch_size, num_classes)

        """
        # Get TinyBERT embeddings
        # outputs: (batch_size, seq_len, hidden_size)
        bert_kwargs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }
        if token_type_ids is not None:
            bert_kwargs['token_type_ids'] = token_type_ids
            
        bert_output = self.bert(**bert_kwargs)
        
        # Use last hidden state
        # sequence_output: (batch_size, seq_len, hidden_size)
        sequence_output = bert_output.last_hidden_state
        
        # Transpose for CNN: (batch_size, hidden_size, seq_len)
        sequence_output = sequence_output.transpose(1, 2)
        
        # Pad if sequence is shorter than the largest kernel
        max_kernel = max(conv.kernel_size[0] for conv in self.convs)
        if sequence_output.size(2) < max_kernel:
            pad_size = max_kernel - sequence_output.size(2)
            sequence_output = torch.nn.functional.pad(sequence_output, (0, pad_size))
        
        # Apply convolution + batchnorm + max pooling for each filter size
        conv_outputs = []
        for conv, bn in zip(self.convs, self.batchnorms):
            # conv_out: (batch_size, num_filters, seq_len - filter_size + 1)
            conv_out = torch.relu(bn(conv(sequence_output)))
            # pooled: (batch_size, num_filters)
            pooled = torch.max_pool1d(conv_out, conv_out.size(2)).squeeze(2)
            conv_outputs.append(pooled)
        
        # Concatenate all features
        # concatenated: (batch_size, len(filter_sizes) * num_filters)
        concatenated = torch.cat(conv_outputs, dim=1)
        concatenated = self.dropout(concatenated)
        
        # Hidden FC layer
        hidden = torch.relu(self.bn_hidden(self.fc_hidden(concatenated)))
        hidden = self.dropout(hidden)
        
        # Final classification
        logits = self.fc(hidden)
        
        return logits


class IntentClassifier:
    """

    Wrapper class for training and inference

    """
    
    def __init__(

        self,

        num_classes,

        bert_model_name='huawei-noah/TinyBERT_General_4L_312D',

        num_filters=256,

        filter_sizes=[2, 3, 4],

        dropout=0.5,

        freeze_bert=False,

        device=None

    ):
        self.device = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Initialize model
        self.model = TinyBertCNN(
            num_classes=num_classes,
            bert_model_name=bert_model_name,
            num_filters=num_filters,
            filter_sizes=filter_sizes,
            dropout=dropout,
            freeze_bert=freeze_bert
        ).to(self.device)
        
        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
        
        # Initialize compound sentence splitter
        self.sentence_splitter = CompoundSentenceSplitter()
        
        self.num_classes = num_classes
        
    def preprocess_text(self, text):
        """

        Preprocess text by splitting compound questions if detected

        

        Args:

            text (str): Input text (English or Arabic)

            

        Returns:

            list: List of individual questions

        """
        return self.sentence_splitter.split_compound_question(text)
    
    def predict(self, student_inputs, session_contexts=None, max_length=128, split_compound=False):
        """

        Predict intents for input texts

        

        Args:

            student_inputs (list): List of student input texts (English or Arabic)

            session_contexts (list): List of session context texts

            max_length (int): Maximum sequence length

            split_compound (bool): Whether to split compound questions before prediction

            

        Returns:

            If split_compound=False:

                predictions: Predicted class indices

                probabilities: Prediction probabilities

            If split_compound=True:

                predictions: List of predictions (may contain multiple per text if split)

                probabilities: List of probabilities

                split_info: Dictionary with information about splits

        """
        # Handle compound questions if requested
        if split_compound:
            return self._predict_with_splitting(student_inputs, session_contexts, max_length)
        
        self.model.eval()
        
        # Determine if we are passing single string or pair
        if session_contexts is not None:
            text_args = (student_inputs, session_contexts)
        else:
            text_args = (student_inputs,)
        
        # Tokenize
        encoded = self.tokenizer(
            *text_args,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        
        input_ids = encoded['input_ids'].to(self.device)
        attention_mask = encoded['attention_mask'].to(self.device)
        token_type_ids = encoded.get('token_type_ids')
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(self.device)
        
        with torch.no_grad():
            logits = self.model(input_ids, attention_mask, token_type_ids=token_type_ids)
            probabilities = torch.softmax(logits, dim=1)
            predictions = torch.argmax(probabilities, dim=1)
        
        return predictions.cpu().numpy(), probabilities.cpu().numpy()
    
    def _predict_with_splitting(self, student_inputs, session_contexts=None, max_length=128):
        """

        Predict intents after splitting compound questions (English and Arabic)

        

        Args:

            student_inputs (list): List of input texts

            session_contexts (list): List of session context texts

            max_length (int): Maximum sequence length

            

        Returns:

            predictions: List of predictions (one per original text, may contain multiple if split)

            probabilities: List of probabilities

            split_info: Dictionary with information about splits

        """
        all_predictions = []
        all_probabilities = []
        split_info = {
            'original_texts': student_inputs,
            'split_texts': [],
            'was_split': [],
            'split_indices': []  # Maps split question index to original text index
        }
        
        # Collect all questions after splitting
        all_questions = []
        all_contexts = []
        for i, text in enumerate(student_inputs):
            questions = self.preprocess_text(text)
            split_info['split_texts'].append(questions)
            split_info['was_split'].append(len(questions) > 1)
            
            # Track which original text each split question belongs to
            for _ in questions:
                split_info['split_indices'].append(i)
                if session_contexts is not None:
                    all_contexts.append(session_contexts[i])
            
            all_questions.extend(questions)
        
        # Predict for all questions at once
        if all_questions:
            contexts_to_pass = all_contexts if session_contexts is not None else None
            predictions, probabilities = self.predict(all_questions, contexts_to_pass, max_length, split_compound=False)
            
            # Reorganize results by original text
            idx = 0
            for i, text in enumerate(student_inputs):
                num_questions = len(split_info['split_texts'][i])
                text_predictions = predictions[idx:idx + num_questions]
                text_probabilities = probabilities[idx:idx + num_questions]
                
                all_predictions.append(text_predictions)
                all_probabilities.append(text_probabilities)
                
                idx += num_questions
        
        return all_predictions, all_probabilities, split_info
    
    def train_step(self, batch, optimizer, criterion):
        """

        Single training step

        

        Args:

            batch: Dictionary with 'input_ids', 'attention_mask', 'labels'

            optimizer: Optimizer

            criterion: Loss function

            

        Returns:

            loss: Training loss

        """
        self.model.train()
        
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        labels = batch['labels'].to(self.device)
        token_type_ids = batch.get('token_type_ids')
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(self.device)
        
        # Forward pass
        logits = self.model(input_ids, attention_mask, token_type_ids=token_type_ids)
        loss = criterion(logits, labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        return loss.item()
    
    def evaluate(self, dataloader, criterion):
        """

        Evaluate model on validation/test set

        

        Args:

            dataloader: DataLoader for evaluation

            criterion: Loss function

            

        Returns:

            avg_loss: Average loss

            accuracy: Classification accuracy

        """
        self.model.eval()
        
        total_loss = 0
        total_correct = 0
        total_samples = 0
        
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)
                token_type_ids = batch.get('token_type_ids')
                if token_type_ids is not None:
                    token_type_ids = token_type_ids.to(self.device)
                
                # Forward pass
                logits = self.model(input_ids, attention_mask, token_type_ids=token_type_ids)
                loss = criterion(logits, labels)
                
                # Calculate metrics
                predictions = torch.argmax(logits, dim=1)
                total_loss += loss.item() * labels.size(0)
                total_correct += (predictions == labels).sum().item()
                total_samples += labels.size(0)
        
        avg_loss = total_loss / total_samples
        accuracy = total_correct / total_samples
        
        return avg_loss, accuracy
    
    def save_model(self, path):
        """Save model checkpoint"""
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'num_classes': self.num_classes
        }, path)
        print(f"Model saved to {path}")
    
    def load_model(self, path):
        """Load model checkpoint"""
        checkpoint = torch.load(path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        print(f"Model loaded from {path}")