Update handler.py

handler.py (CHANGED, +108 −27)
@@ -22,7 +22,6 @@ class EndpointHandler:
         if not cuda_available:
             logger.warning("GPU not detected. Running on CPU. Inference will be slower.")
 
-        # In 'pipeline', device is an integer (-1 for CPU, 0+ for GPU)
         self.device_id = 0 if cuda_available else -1
 
         # Determine model path
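Note on the deleted comment: it described real transformers behavior, since pipeline() accepts an integer device (-1 for CPU, 0 and up for CUDA ordinals). A minimal sketch of the convention, assuming self.device_id is later passed as the pipeline's device argument:

    import torch

    # -1 selects CPU; a non-negative integer selects that CUDA device.
    cuda_available = torch.cuda.is_available()
    device_id = 0 if cuda_available else -1
    # e.g. pipeline("question-answering", model=..., tokenizer=..., device=device_id)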
@@ -30,7 +29,6 @@ class EndpointHandler:
         logger.info(f"Loading model from {model_path}...")
 
         try:
-            # Load tokenizer and model explicitly to ensure correct loading
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            model, loading_info = AutoModelForQuestionAnswering.from_pretrained(
                model_path,
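The tuple unpacking above suggests the truncated from_pretrained call passes output_loading_info=True, which makes it return a (model, loading_info) pair. A hedged sketch of inspecting that dict (the checkpoint name is a placeholder, not this repo's model):

    from transformers import AutoModelForQuestionAnswering

    model, loading_info = AutoModelForQuestionAnswering.from_pretrained(
        "deepset/roberta-base-squad2",  # placeholder checkpoint
        output_loading_info=True,
    )
    # Empty lists mean the checkpoint matched the architecture cleanly.
    for key in ("missing_keys", "unexpected_keys", "mismatched_keys", "error_msgs"):
        print(key, loading_info[key])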
@@ -43,8 +41,6 @@ class EndpointHandler:
            logger.warning("Loaded model class: %s", model.__class__.__name__)
            logger.warning("Loaded model name_or_path: %s", getattr(model.config, "_name_or_path", None))
 
-            # Initialize the pipeline
-            # top_k=20 matches your previous 'n_best_size=20' logic
            self.pipe = pipeline(
                "question-answering",
                model=model,
@@ -53,6 +49,12 @@ class EndpointHandler:
                top_k=20,
                handle_impossible_answer=False
            )
+
+            # Store tokenizer for context window management
+            self.tokenizer = tokenizer
+            # Set max context length (adjust based on your model's max_position_embeddings)
+            self.max_context_tokens = 384  # Conservative limit for BERT-based models
+
            logger.info("Model loaded successfully.")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
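For reference when reading the shape-normalization code further down: with top_k > 1 a question-answering pipeline returns a list of {score, start, end, answer} dicts for a single input, and a list of such lists for a batch. A small sketch, assuming a placeholder checkpoint:

    from transformers import pipeline

    qa = pipeline("question-answering", model="deepset/roberta-base-squad2", top_k=3)

    single = qa({"question": "What signals joy?", "context": "I'm so happy today!"})
    # single -> [{'score': ..., 'start': ..., 'end': ..., 'answer': ...}, ...]

    batch = qa([
        {"question": "What signals joy?", "context": "I'm so happy today!"},
        {"question": "What signals anger?", "context": "This makes me furious."},
    ])
    # batch -> one list of answer dicts per input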
@@ -63,11 +65,100 @@ class EndpointHandler:
            "Extract the exact short phrase (<= 8 words) from the target "
            "utterance that most strongly signals the emotion {emotion}. "
            "Return only a substring of the target utterance."
-        )
+        )
+
+    def _build_context(self, target_utterance: str, conversation_history: List[Dict[str, str]],
+                       max_history: int = 5) -> str:
+        """
+        Build conversational context by prepending previous utterances.
+
+        Args:
+            target_utterance: The main utterance to analyze
+            conversation_history: List of previous utterances, each with 'speaker' and 'text'
+                Format: [{"speaker": "A", "text": "..."}, ...]
+            max_history: Maximum number of previous turns to include
+
+        Returns:
+            Formatted context string
+        """
+        if not conversation_history:
+            return target_utterance
+
+        # Take the most recent turns (up to max_history)
+        recent_history = conversation_history[-max_history:] if len(conversation_history) > max_history else conversation_history
+
+        # Build context string
+        context_parts = []
+        for turn in recent_history:
+            speaker = turn.get("speaker", "")
+            text = turn.get("text", "").strip()
+            if text:
+                if speaker:
+                    context_parts.append(f"{speaker}: {text}")
+                else:
+                    context_parts.append(text)
+
+        # Add separator before target utterance
+        context_parts.append(f"[TARGET] {target_utterance}")
+
+        full_context = " ".join(context_parts)
+
+        # Token-based truncation to fit within model limits
+        return self._truncate_context(full_context, target_utterance)
+
+    def _truncate_context(self, full_context: str, target_utterance: str) -> str:
+        """
+        Truncate context to fit within token limits while preserving target utterance.
+        """
+        # Tokenize to check length
+        tokens = self.tokenizer.encode(full_context, add_special_tokens=True)
+
+        if len(tokens) <= self.max_context_tokens:
+            return full_context
+
+        # If too long, ensure target utterance is fully preserved
+        # and truncate from the beginning of the context
+        target_marker = "[TARGET]"
+        if target_marker in full_context:
+            parts = full_context.split(target_marker)
+            if len(parts) == 2:
+                prefix, target_part = parts
+                target_with_marker = f"{target_marker} {target_part}"
+
+                # Calculate tokens for target
+                target_tokens = self.tokenizer.encode(target_with_marker, add_special_tokens=False)
+                available_for_prefix = self.max_context_tokens - len(target_tokens) - 10  # Buffer for special tokens
+
+                if available_for_prefix > 0:
+                    # Truncate prefix from the left (keep most recent context)
+                    prefix_tokens = self.tokenizer.encode(prefix, add_special_tokens=False)
+                    if len(prefix_tokens) > available_for_prefix:
+                        prefix_tokens = prefix_tokens[-available_for_prefix:]
+                        prefix = self.tokenizer.decode(prefix_tokens, skip_special_tokens=True)
+
+                    return f"{prefix} {target_with_marker}"
+
+        # Fallback: just return target utterance
+        logger.warning("Context truncation fallback - returning target only")
+        return target_utterance
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Process inference request.
+
+        Expected input format (NEW):
+        {
+            "inputs": [
+                {
+                    "utterance": "I'm so happy today!",
+                    "emotion": "joy",
+                    "conversation_history": [  # OPTIONAL
+                        {"speaker": "A", "text": "How are you doing?"},
+                        {"speaker": "B", "text": "Pretty good, thanks!"}
+                    ]
+                }
+            ]
+        }
        """
        # Extract inputs
        inputs = data.pop("inputs", data)
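A quick usage sketch for the two new helpers, assuming handler is an initialized EndpointHandler (values illustrative):

    history = [
        {"speaker": "A", "text": "How are you doing?"},
        {"speaker": "B", "text": "Pretty good, thanks!"},
    ]
    context = handler._build_context("I'm so happy today!", history)
    # -> "A: How are you doing? B: Pretty good, thanks! [TARGET] I'm so happy today!"
    # With more than max_history turns, only the 5 most recent are kept; if the
    # result exceeds max_context_tokens (384), _truncate_context drops the oldest
    # prefix tokens while keeping the [TARGET] utterance intact.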
@@ -86,18 +177,21 @@ class EndpointHandler:
        for i, item in enumerate(inputs):
            utterance = item.get("utterance", "").strip()
            emotion = item.get("emotion", "")
+            conversation_history = item.get("conversation_history", [])
 
            if not utterance:
                logger.warning(f"Empty utterance at index {i}")
                continue
 
+            # Build context with conversation history
+            context = self._build_context(utterance, conversation_history)
+
            # Format as QA task
            question = self.question_template.format(emotion=emotion)
 
-            # The pipeline expects a list of dicts with 'question' and 'context'
            pipeline_inputs.append({
                'question': question,
-                'context': utterance
+                'context': context  # Now includes conversation history
            })
            valid_indices.append(i)
 
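Putting the loop together, a request that exercises the new conversation_history path looks like this (again assuming handler is an initialized EndpointHandler):

    payload = {
        "inputs": [
            {
                "utterance": "I'm so happy today!",
                "emotion": "joy",
                "conversation_history": [
                    {"speaker": "A", "text": "How are you doing?"},
                    {"speaker": "B", "text": "Pretty good, thanks!"},
                ],
            }
        ]
    }
    results = handler(payload)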
@@ -105,7 +199,6 @@ class EndpointHandler:
        results = []
 
        if not pipeline_inputs:
-            # All inputs were invalid
            for item in inputs:
                results.append({
                    "utterance": item.get("utterance", ""),
@@ -116,21 +209,13 @@ class EndpointHandler:
            return results
 
        try:
-            # Run inference (batch_size helps with multiple inputs)
            predictions = self.pipe(pipeline_inputs, batch_size=8)
 
-
-
-            if isinstance(predictions, dict):  # Single input result
-                predictions = [predictions]  # Wrap in list
+            if isinstance(predictions, dict):
+                predictions = [predictions]
            elif isinstance(predictions, list) and len(predictions) > 0 and isinstance(predictions[0], dict):
-
-
-            # If we have multiple inputs and top_k > 1, it returns a list of lists.
-            if len(pipeline_inputs) == 1:
-                predictions = [predictions]
-            # If multiple inputs and list of dicts, that implies top_k=1.
-            # But we set top_k=20. So it should be list of lists.
+                if len(pipeline_inputs) == 1:
+                    predictions = [predictions]
 
            logger.debug(f"Raw predictions: {predictions}")
 
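The rewritten branch enforces a single invariant: predictions ends up as a list with one entry per pipeline input. A standalone restatement of that logic (hypothetical helper, not part of the diff):

    from typing import Any, Dict, List, Union

    Answer = Dict[str, Any]

    def normalize(predictions: Union[Answer, List[Any]], n_inputs: int) -> List[Any]:
        """Return one prediction entry per input, whatever shape the pipeline gave."""
        if isinstance(predictions, dict):  # single input, top_k == 1
            return [predictions]
        if predictions and isinstance(predictions[0], dict) and n_inputs == 1:
            return [predictions]  # single input with top_k > 1: wrap its answer list
        return predictions  # already one entry per input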
@@ -148,18 +233,14 @@ class EndpointHandler:
                    "triggers": []
                })
            else:
-                # Get prediction for this item
-                # Because top_k=20, 'current_preds' is a list of dicts: [{'answer': '...', 'score': ...}, ...]
                current_preds = predictions[pred_idx]
-
 
-                # Ensure it is a list
                if isinstance(current_preds, dict):
                    current_preds = [current_preds]
 
                logger.info(
                    "RECCON raw spans (answer, score): %s",
-                    [(p.get("answer"), p.get("score", 0.0)
+                    [(p.get("answer"), p.get("score", 0.0)) for p in current_preds[:5]]
                )
 
                def is_good_span(ans: str) -> bool:
@@ -168,16 +249,16 @@ class EndpointHandler:
                    a = ans.strip()
                    if len(a) < 3:
                        return False
-                    # reject pure punctuation
                    if all(ch in ".,!?;:-—'\"()[]{}" for ch in a):
                        return False
-                    # require at least one letter
                    if not any(ch.isalpha() for ch in a):
                        return False
                    return True
 
                raw_answers = [p.get("answer", "") for p in current_preds]
                raw_answers = [a for a in raw_answers if is_good_span(a)]
+
+                # Clean spans against ORIGINAL utterance (not full context)
                triggers = self._clean_spans(raw_answers, utterance)
 
                results.append({
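Sanity check for the span filter, applying the same three rules as is_good_span above:

    candidates = ["so happy", "!!", "---", "420", "ok"]
    kept = [a for a in candidates
            if len(a.strip()) >= 3
            and not all(ch in ".,!?;:-—'\"()[]{}" for ch in a.strip())
            and any(ch.isalpha() for ch in a.strip())]
    # kept == ["so happy"]: "!!" and "ok" are too short, "---" is pure
    # punctuation, and "420" contains no letters.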
|