Update handler.py
handler.py CHANGED (+57 -4)
@@ -2,6 +2,7 @@ import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 from typing import Dict, List, Any, Union
 import logging
+import torch.nn.functional as F
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -57,14 +58,17 @@ class EndpointHandler:
 
         Expected input format:
         {
-            "inputs": "text string" or ["text1", "text2", ...]
+            "inputs": "text string" or ["text1", "text2", ...],
+            "compute_scores": false,  # Optional: if true, compute scores on the server side
+            "metric": "nll"  # Optional: "nll" or "perplexity" (only used if compute_scores=true)
         }
 
         Returns:
         {
             "logits": [[[vocab_logits]], [[vocab_logits]]],
             "input_ids": [[token_ids]],
-            "shape": [batch_size, sequence_length, vocab_size]
+            "shape": [batch_size, sequence_length, vocab_size],
+            "scores": [score1, score2, ...]  # Only if compute_scores=true
         }
 
         Args:
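For concreteness, a client call under the new contract might look like the sketch below. The endpoint URL and token are placeholders, not part of this change; only the inputs, compute_scores, and metric fields come from the docstring above.

    import requests

    # Hypothetical endpoint URL and token -- substitute your deployment's values
    API_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
    headers = {"Authorization": "Bearer <token>"}

    payload = {
        "inputs": ["The quick brown fox", "Hello world"],
        "compute_scores": True,   # ask the server to score each sequence
        "metric": "perplexity",   # or "nll" (the default)
    }

    response = requests.post(API_URL, json=payload, headers=headers).json()
    print(response["scores"])     # one score per input sequence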
@@ -90,7 +94,12 @@ class EndpointHandler:
         else:
             raise ValueError(f"Expected string or list of strings, got: {type(inputs)}")
 
-
+        # Check if we should compute scores on the server side
+        compute_scores = data.get("compute_scores", False)
+        metric = data.get("metric", "nll")
+
+        logger.info(f"Processing batch of {len(text_inputs)} text inputs"
+                    + (f" (compute_scores={compute_scores}, metric={metric})" if compute_scores else ""))
 
         # Tokenize all inputs
         encoded = self.tokenizer(
@@ -101,12 +110,13 @@ class EndpointHandler:
         )
 
         input_ids = encoded["input_ids"].to(self.device)
+        attention_mask = encoded["attention_mask"].to(self.device)
 
         logger.info(f"Tokenized to shape: {input_ids.shape}")
 
         # Forward pass - no gradients needed
         with torch.no_grad():
-            outputs = self.model(input_ids)
+            outputs = self.model(input_ids, attention_mask=attention_mask)
             logits = outputs.logits
 
         # Convert to CPU and then to list for JSON serialization
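Note that GPT-2 ships without a padding token, so batched tokenization with padding presupposes one has been assigned (typically in __init__, which this diff does not show). A minimal sketch of the assumed setup:

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no dedicated pad token

    encoded = tokenizer(
        ["The quick brown fox", "Hi"],
        return_tensors="pt",
        padding=True,  # pad the shorter sequence to the batch max length
    )
    # encoded["attention_mask"] is 1 for real tokens and 0 for padding --
    # exactly what the forward pass above consumes.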
@@ -123,6 +133,49 @@ class EndpointHandler:
             "original_inputs": text_inputs
         }
 
+        # Optionally compute scores on the server side (GPU)
+        if compute_scores:
+            scores = []
+
+            # Process each sequence in the batch
+            for i in range(len(text_inputs)):
+                try:
+                    # Extract this sequence's data
+                    seq_input_ids = input_ids[i:i+1]  # Keep batch dimension
+                    seq_logits = logits[i:i+1]
+                    seq_attention_mask = attention_mask[i:i+1]
+
+                    # Prepare targets and logits for loss computation
+                    targets = seq_input_ids[:, 1:].clone()
+                    logits_for_loss = seq_logits[:, :-1]
+
+                    # Only consider non-padding tokens (use attention mask)
+                    mask = seq_attention_mask[:, 1:] == 1  # Skip first token, match targets shape
+
+                    if mask.sum() == 0:  # No valid tokens
+                        scores.append(float('inf'))
+                        continue
+
+                    # Flatten and mask
+                    masked_logits = logits_for_loss[mask]  # [num_valid_tokens, vocab_size]
+                    masked_targets = targets[mask]  # [num_valid_tokens]
+
+                    if metric == "perplexity":
+                        loss = F.cross_entropy(masked_logits, masked_targets, reduction='mean')
+                        perplexity = torch.exp(loss).item()
+                        scores.append(perplexity)
+                    else:  # nll
+                        nll = F.cross_entropy(masked_logits, masked_targets, reduction='mean')
+                        scores.append(nll.item())
+
+                except Exception as e:
+                    logger.warning(f"Failed to compute {metric} for sequence {i}: {e}")
+                    scores.append(float('inf'))
+
+            response["scores"] = scores
+            response["metric"] = metric
+            logger.info(f"Computed {metric} scores on server side for {len(scores)} sequences")
+
         logger.info(f"Successfully processed batch, output shape: {logits.shape}")
         return response
 
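The loop above averages the token-level cross-entropy over each sequence's real tokens; perplexity is simply exp of that average NLL. A standalone sketch of the same shift-by-one computation on toy tensors (illustrative shapes, not handler code):

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    vocab = 50257                              # GPT-2 vocabulary size
    logits = torch.randn(1, 5, vocab)          # [batch, seq_len, vocab]
    input_ids = torch.randint(0, vocab, (1, 5))

    # Logits at position t predict the token at position t+1
    targets = input_ids[:, 1:].reshape(-1)     # [4]
    preds = logits[:, :-1].reshape(-1, vocab)  # [4, vocab]

    nll = F.cross_entropy(preds, targets)      # mean NLL in nats
    ppl = torch.exp(nll)                       # perplexity == exp(mean NLL)
    print(nll.item(), ppl.item())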