Spaces:

SamSankar
/

hallucination-guard-env

Running

File size: 43,981 Bytes

"""Professional-grade HallucinationGuard RL Environment.



This module implements a sophisticated, production-ready RL environment with:

- Curriculum learning with adaptive difficulty

- Multi-turn conversation support

- Context retrieval challenges

- Comprehensive episode management

- Model-agnostic design (works with any LLM)

- Real-time metrics and logging

- Session management for concurrent users

"""

import uuid
import time
import logging
from typing import Optional, Dict, Any, List, Tuple
from dataclasses import dataclass, field
from enum import Enum

# Add directories to path for imports to work in both local and HF Spaces
import sys
import os
_dir = os.path.dirname(os.path.abspath(__file__))
_parent = os.path.dirname(_dir)
if _parent not in sys.path:
    sys.path.insert(0, _parent)
if _dir not in sys.path:
    sys.path.insert(0, _dir)

from openenv.core.env_server import Environment

from models import (
    HallucinationAction,
    HallucinationObservation,
    HallucinationState,
    EpisodeStatistics,
    AgentSkillProfile,
    RewardBreakdown,
    SemanticAnalysis,
    CitationAnalysis,
    HallucinationSeverity,
    HallucinationType,
    DifficultyLevel,
    EnvironmentConfig,
    MultiTurnDialogue,
)
# Import from same directory for HF Spaces deployment compatibility
from grader import (
    calculate_reward,
    generate_feedback,
    detect_hallucination_advanced,
    HallucinationType as GraderHallucinationType,
    HallucinationSeverity as GraderHallucinationSeverity,
)
from dataset_loader import DatasetLoader, QAExample, DifficultyLevel as DatasetDifficulty


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class EpisodePhase(Enum):
    """Phases of an episode."""
    INITIALIZATION = "initialization"
    ACTIVE = "active"
    MULTI_TURN_CLARIFICATION = "multi_turn_clarification"
    CONTEXT_RETRIEVAL = "context_retrieval"
    COMPLETION = "completion"


class HallucinationEnvironment(Environment[HallucinationAction, HallucinationObservation, HallucinationState]):
    """

    Professional-grade OpenEnv environment for training AI to avoid hallucinations.



    Features:

    - Curriculum learning with progressive difficulty

    - Adaptive difficulty based on performance

    - Multi-turn conversation support

    - Context retrieval challenges

    - Comprehensive metrics tracking

    - Model-agnostic design

    - Session management

    """

    SUPPORTS_CONCURRENT_SESSIONS = True
    VERSION = "2.0.0"

    def __init__(

        self,

        transform=None,

        config: Optional[EnvironmentConfig] = None,

        session_id: Optional[str] = None,

        dataset_loader: Optional["DatasetLoader"] = None

    ):
        super().__init__(transform=transform)

        # Configuration
        self.config = config or EnvironmentConfig()
        self.session_id = session_id or str(uuid.uuid4())[:8]

        # Dataset management — accept a pre-loaded shared loader to avoid
        # reloading 1M+ examples on every session creation.
        if dataset_loader is not None:
            self.dataset_loader = dataset_loader
            logger.info(f"Reusing shared dataset loader — {dataset_loader.get_total_examples():,} examples")
        else:
            # First boot: load synthetic baseline, then augment with real HF data
            self.dataset_loader = DatasetLoader()
            self.dataset_loader.load_builtin_datasets()
            logger.info(f"Synthetic dataset: {self.dataset_loader.get_total_examples()} examples")

            # Attempt to load real HuggingFace datasets (SQuAD, TriviaQA, HaluEval, TruthfulQA).
            # Uses disk cache after first download so restarts are instant.
            # Gracefully skips if the `datasets` package is not installed.
            try:
                real_added = self.dataset_loader.load_real_datasets(max_per_dataset=500, cache=True)
                if real_added > 0:
                    logger.info(f"Added {real_added} real examples — total: {self.dataset_loader.get_total_examples()}")
                else:
                    logger.info("HuggingFace datasets unavailable; using synthetic data only")
            except Exception as _ds_err:
                logger.warning(f"Dataset loading failed ({_ds_err}); continuing with synthetic data only")

        # Episode state
        self.episode_id: Optional[str] = None
        self.episode_phase: EpisodePhase = EpisodePhase.INITIALIZATION
        self.step_count: int = 0
        self.total_hallucinations: int = 0
        self.total_correct: int = 0
        self.total_partial: int = 0

        # Current data
        self.current_example: Optional[QAExample] = None
        self.episode_examples: List[QAExample] = []
        self.episode_start_time: Optional[float] = None
        self.last_step_time: Optional[float] = None

        # Performance tracking
        self.reward_history: List[float] = []
        self.confidence_history: List[float] = []
        self.hallucination_history: List[bool] = []
        self.current_streak: int = 0
        self.best_streak: int = 0

        # Early stopping tracking (NEW)
        self.consecutive_failures: int = 0
        self.consecutive_hallucinations: int = 0
        self.consecutive_perfect: int = 0
        self.early_stop_reason: Optional[str] = None
        self.calibration_history: List[float] = []

        # Curriculum state
        self.curriculum_stage: int = 0
        self.curriculum_performance: List[float] = []
        self.skill_rating: float = 0.5  # ELO-style rating

        # Multi-turn state
        self.dialogue: Optional[MultiTurnDialogue] = None
        self.pending_clarifications: List[str] = []

        # Agent profile (persistent across episodes)
        self.agent_profile: Optional[AgentSkillProfile] = None

        # Context retrieval challenge state
        self.revealed_context_fragments: List[str] = []
        self.context_retrieval_turns: int = 0

        # Active model adapter (set via reset(model=...) for auto-play mode)
        self.active_adapter = None

        logger.info(f"Initialized HallucinationEnvironment (session={self.session_id})")

    def reset(

        self,

        seed: Optional[int] = None,

        episode_id: Optional[str] = None,

        difficulty: Optional[str] = None,

        enable_multi_turn: bool = False,

        enable_context_retrieval: bool = False,

        model: Optional[str] = None,

        model_config: Optional[Dict[str, Any]] = None,

        **kwargs

    ) -> HallucinationObservation:
        """

        Reset the environment for a new episode.



        Args:

            seed: Random seed for reproducibility

            episode_id: Custom episode ID

            difficulty: Starting difficulty level

            enable_multi_turn: Enable multi-turn clarification

            enable_context_retrieval: Enable context retrieval challenges

            model: Model provider to use for auto-play mode.

                   Supported: "openai", "anthropic", "huggingface", "ollama", "generic".

                   When set, the environment calls the model automatically on each step

                   so you only need to call reset() + step() in a loop.

            model_config: Optional dict passed to create_adapter(). Keys:

                   model_name, api_key, api_base, temperature, max_tokens, etc.



        Returns:

            Initial observation

        """
        import random
        if seed is not None:
            random.seed(seed)
            # Reset used indices for reproducibility
            self.dataset_loader.reset_usage()

        # ── Model adapter setup ───────────────────────────────────────────────
        # When model= is supplied, the environment auto-generates answers by
        # calling the adapter inside step(), so callers just loop reset/step.
        self.active_adapter = None
        if model is not None:
            try:
                import sys, os
                sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
                from model_adapters import create_adapter
                cfg = model_config or {}
                self.active_adapter = create_adapter(model, **cfg)
                logger.info(f"Active adapter: {model} ({self.active_adapter.__class__.__name__})")
            except ImportError:
                logger.info(f"model_adapters not installed — manual action mode (model={model} ignored)")
            except Exception as e:
                logger.warning(f"Could not create adapter for '{model}': {e}. Manual action mode.")

        # Generate episode ID
        self.episode_id = episode_id or f"ep_{uuid.uuid4().hex[:8]}"
        self.episode_start_time = time.time()
        self.last_step_time = time.time()

        # Reset counters
        self.step_count = 0
        self.total_hallucinations = 0
        self.total_correct = 0
        self.total_partial = 0
        self.reward_history = []
        self.confidence_history = []
        self.hallucination_history = []
        self.current_streak = 0

        # Reset early stopping counters
        self.consecutive_failures = 0
        self.consecutive_hallucinations = 0
        self.consecutive_perfect = 0
        self.early_stop_reason = None
        self.calibration_history = []

        # Reset multi-turn state
        self.dialogue = MultiTurnDialogue() if enable_multi_turn else None
        self.pending_clarifications = []

        # Reset context retrieval state
        self.revealed_context_fragments = []
        self.context_retrieval_turns = 0

        # Determine starting difficulty
        if difficulty:
            try:
                start_difficulty = DifficultyLevel(difficulty.lower())
            except ValueError:
                start_difficulty = self.config.initial_difficulty
        elif self.config.adaptive_difficulty and self.agent_profile:
            # Use agent's skill level
            start_difficulty = self.agent_profile.difficulty_ceiling
        else:
            start_difficulty = self.config.initial_difficulty

        # Load questions for this episode
        mix_difficulties = self.config.curriculum_enabled and start_difficulty == DifficultyLevel.INTERMEDIATE
        self.episode_examples = self.dataset_loader.start_new_episode(
            num_questions=self.config.max_questions_per_episode,
            difficulty=start_difficulty if not mix_difficulties else None,
            mix_difficulties=mix_difficulties
        )

        if not self.episode_examples:
            logger.error("No examples loaded for episode")
            return self._create_error_observation("No questions available")

        self.current_example = self.episode_examples[0]
        self.episode_phase = EpisodePhase.ACTIVE

        logger.info(f"Reset episode {self.episode_id} with {len(self.episode_examples)} questions")

        return self._create_observation(
            question=self.current_example.question,
            context=self._get_context_for_observation(self.current_example),
            feedback="Episode started. Answer using only the provided context.",
            metadata={"phase": self.episode_phase.value}
        )

    def step(

        self,

        action: Optional[HallucinationAction] = None,

        timeout_s: Optional[float] = None,

        **kwargs

    ) -> HallucinationObservation:
        """

        Process the AI's action and return the next observation.



        Auto-play mode: if reset(model=...) was called, action can be None —

        the environment calls the active adapter to generate an answer

        automatically using the current question and context.



        Manual mode: pass a HallucinationAction with answer, confidence, and

        source_quote filled in (the normal RL training loop).



        Handles:

        - Standard Q&A steps

        - Multi-turn clarifications

        - Context retrieval challenges

        """
        current_time = time.time()
        step_duration = current_time - (self.last_step_time or current_time)
        self.last_step_time = current_time

        # ── Auto-play: generate action via active adapter ─────────────────────
        if action is None or (not action.answer and self.active_adapter is not None):
            if self.current_example is not None and self.active_adapter is not None:
                try:
                    resp = self.active_adapter.generate_response(
                        question=self.current_example.question,
                        context=self.current_example.context,
                        require_citation=True,
                        require_confidence=True,
                    )
                    action = HallucinationAction(
                        answer=resp.answer,
                        confidence=resp.confidence,
                        source_quote=resp.source_quote or "",
                        reasoning=resp.reasoning or "",
                    )
                    logger.debug(f"Auto-play answer: {resp.answer[:80]}...")
                except Exception as e:
                    logger.warning(f"Adapter generate_response failed: {e}")
                    action = HallucinationAction(answer="", confidence=0.5)
            elif action is None:
                action = HallucinationAction(answer="", confidence=0.5)

        # Handle different episode phases
        if self.episode_phase == EpisodePhase.MULTI_TURN_CLARIFICATION:
            return self._handle_clarification_step(action)
        elif self.episode_phase == EpisodePhase.CONTEXT_RETRIEVAL:
            return self._handle_context_retrieval_step(action)

        # Standard Q&A step
        if self.current_example is None:
            return self._end_episode()

        # Validate action
        if not action.answer and not action.requires_clarification:
            return self._create_error_observation("No answer provided")

        # Handle clarification request
        if action.requires_clarification and self.dialogue:
            return self._handle_clarification_request(action)

        # Process the answer
        return self._process_answer(action, step_duration)

    def state(self) -> HallucinationState:
        """Return comprehensive state of the environment."""
        # Calculate derived metrics
        accuracy = self.total_correct / max(1, self.step_count)
        hallucination_rate = self.total_hallucinations / max(1, self.step_count)
        avg_confidence = sum(self.confidence_history) / max(1, len(self.confidence_history))

        # Calculate calibration error
        calibration_error = 0.0
        if self.confidence_history and self.reward_history:
            calibration_error = sum(
                abs(c - r) for c, r in zip(self.confidence_history, self.reward_history)
            ) / len(self.confidence_history)

        # Build episode statistics
        episode_stats = EpisodeStatistics(
            episode_id=self.episode_id or "",
            total_questions=len(self.episode_examples),
            questions_answered=self.step_count,
            correct_answers=self.total_correct,
            hallucinated_answers=self.total_hallucinations,
            partially_correct=self.total_partial,
            average_confidence=avg_confidence,
            average_reward=sum(self.reward_history) / max(1, len(self.reward_history)),
            calibration_error=calibration_error,
            reward_history=self.reward_history.copy(),
        )

        return HallucinationState(
            episode_id=self.episode_id,
            session_id=self.session_id,
            step_count=self.step_count,
            max_questions=self.config.max_questions_per_episode,
            total_hallucinations=self.total_hallucinations,
            hallucination_rate=hallucination_rate,
            total_correct=self.total_correct,
            total_partial=self.total_partial,
            accuracy=accuracy,
            average_reward=sum(self.reward_history) / max(1, len(self.reward_history)),
            average_confidence=avg_confidence,
            calibration_error=calibration_error,
            current_difficulty=self._get_current_difficulty(),
            curriculum_stage=self.curriculum_stage,
            skill_rating=self.skill_rating,
            current_streak=self.current_streak,
            best_streak=self.best_streak,
            episode_stats=episode_stats.model_dump() if episode_stats else None,
            agent_profile=self.agent_profile.model_dump() if self.agent_profile else None,
            config={
                "multi_turn_enabled": self.dialogue is not None,
                "context_retrieval_enabled": self.config.enable_multi_turn,
                "adaptive_difficulty": self.config.adaptive_difficulty,
            },
            episode_start_time=self.episode_start_time,
            last_step_time=self.last_step_time,
            metadata={
                "phase": self.episode_phase.value,
                "version": self.VERSION,
            }
        )

    def close(self) -> None:
        """Clean up resources and save agent profile."""
        if self.agent_profile:
            self._update_agent_profile()
        logger.info(f"Closed environment (session={self.session_id})")

    def _process_answer(

        self,

        action: HallucinationAction,

        step_duration: float

    ) -> HallucinationObservation:
        """Process a standard answer and compute rewards."""

        # Get ground truth
        ground_truth = self.current_example.answer
        context = self.current_example.context

        # Calculate reward using advanced grader
        difficulty_str = self.current_example.difficulty.value if self.current_example else "intermediate"
        prev_performance = self.skill_rating

        reward, info = calculate_reward(
            answer=action.answer,
            confidence=action.confidence,
            source_quote=action.source_quote,
            context=context,
            ground_truth=ground_truth,
            difficulty_level=difficulty_str,
            previous_performance=prev_performance,
            reward_weights=self.config.reward_weights
        )

        # Extract metrics from info
        is_hallucination = info.get("is_hallucination", False)
        hallucination_type_str = info.get("hallucination_type", "none")
        hallucination_severity_str = info.get("hallucination_severity", "NONE")
        correctness = info.get("correctness", 0.0)
        grounding_score = info.get("grounding", 0.0)
        calibration_score = info.get("calibration", 0.0)

        # Map hallucination type
        try:
            hallucination_type = HallucinationType(hallucination_type_str)
        except ValueError:
            hallucination_type = HallucinationType.NONE

        # Map severity
        try:
            severity = HallucinationSeverity[hallucination_severity_str]
        except KeyError:
            severity = HallucinationSeverity.NONE

        # Update statistics
        if is_hallucination:
            self.total_hallucinations += 1
            self.current_streak = 0
            self.consecutive_hallucinations += 1
            self.consecutive_perfect = 0
        elif correctness > 0.7:
            self.total_correct += 1
            self.current_streak += 1
            self.best_streak = max(self.best_streak, self.current_streak)
            self.consecutive_perfect += 1
            self.consecutive_hallucinations = 0
            self.consecutive_failures = 0
        else:
            self.total_partial += 1
            self.current_streak = 0
            self.consecutive_perfect = 0
            self.consecutive_hallucinations = 0
            if reward < self.config.early_stopping_min_reward:
                self.consecutive_failures += 1

        # Track calibration history
        calibration_error = abs(action.confidence - correctness)
        self.calibration_history.append(calibration_error)

        # Track history
        self.reward_history.append(reward)
        self.confidence_history.append(action.confidence)
        self.hallucination_history.append(is_hallucination)

        # Update skill rating (ELO-style)
        expected_score = 1 / (1 + 10 ** ((0.5 - self.skill_rating) * 4))
        actual_score = 1.0 if correctness > 0.7 else (0.5 if correctness > 0.4 else 0.0)
        self.skill_rating += 0.05 * (actual_score - expected_score)
        self.skill_rating = max(0.0, min(1.0, self.skill_rating))

        # Generate feedback
        feedback = generate_feedback(
            answer=action.answer,
            ground_truth=ground_truth,
            is_hallucination=is_hallucination,
            hallucination_type=hallucination_type,
            hallucination_severity=severity,
            grounding_score=grounding_score,
            correctness=correctness,
            calibration_score=calibration_score,
            total_reward=reward
        )

        # Move to next question
        self.step_count += 1

        # Check for early stopping conditions
        early_stop = self._check_early_stopping(is_hallucination, correctness, calibration_error)

        # Determine if episode is done
        done = self.step_count >= self.config.max_questions_per_episode

        if early_stop:
            done = True
            self.early_stop_reason = early_stop
            self.episode_phase = EpisodePhase.COMPLETION
            feedback += f" [Early stop: {early_stop}]"

        if not done:
            self.current_example = self.dataset_loader.get_example_for_step(self.step_count)
        else:
            self.current_example = None
            self.episode_phase = EpisodePhase.COMPLETION

        # Build observation
        observation = self._create_observation(
            question=self.current_example.question if self.current_example else "",
            context=self._get_context_for_observation(self.current_example) if self.current_example else "",
            ground_truth=ground_truth if done else "",  # Only reveal at end
            feedback=feedback,
            reward=reward,
            is_hallucination=is_hallucination,
            hallucination_type=hallucination_type,
            hallucination_severity=severity,
            grounding_score=grounding_score,
            done=done,
            metadata={
                "step": self.step_count,
                "correctness": correctness,
                "calibration": calibration_score,
                "hallucination_score": info.get("hallucination_score", 0.0),
                "reward_breakdown": self._extract_reward_breakdown(info),
                "semantic_analysis": info.get("semantic_analysis", {}),
                "citation_analysis": info.get("citation_analysis", {}),
            }
        )

        # Update dialogue history if enabled
        if self.dialogue:
            self.dialogue.turn_number += 1
            self.dialogue.conversation_history.append({
                "question": observation.question,
                "answer": action.answer,
                "feedback": feedback
            })

        return observation

    def _handle_clarification_request(

        self,

        action: HallucinationAction

    ) -> HallucinationObservation:
        """Handle a request for clarification."""
        if not self.dialogue:
            return self._create_error_observation("Multi-turn not enabled")

        # Add clarification questions to pending list
        self.pending_clarifications.extend(action.clarification_questions)
        self.dialogue.unresolved_queries.extend(action.clarification_questions)

        # Provide clarifications (simulated)
        clarifications = []
        for q in action.clarification_questions:
            # Simple keyword-based clarification
            clarification = self._generate_clarification(q, self.current_example)
            clarifications.append(clarification)
            if q in self.dialogue.unresolved_queries:
                self.dialogue.unresolved_queries.remove(q)

        # Switch to active phase
        self.episode_phase = EpisodePhase.ACTIVE

        return self._create_observation(
            question=self.current_example.question if self.current_example else "",
            context=self.current_example.context if self.current_example else "",
            feedback=f"Clarifications provided: {'; '.join(clarifications)}",
            metadata={
                "clarifications": clarifications,
                "phase": self.episode_phase.value
            }
        )

    def _handle_clarification_step(

        self,

        action: HallucinationAction

    ) -> HallucinationObservation:
        """Handle a step during multi-turn clarification."""
        # Process clarification and return to main question
        self.episode_phase = EpisodePhase.ACTIVE
        return self._process_answer(action, 0.0)

    def _handle_context_retrieval_step(

        self,

        action: HallucinationAction

    ) -> HallucinationObservation:
        """Handle context retrieval challenge."""
        # Reveal more context based on action
        full_context = self.current_example.context if self.current_example else ""
        context_fragments = self._split_context_into_fragments(full_context)

        # Reveal additional fragments
        new_revealed = min(
            len(self.revealed_context_fragments) + 1,
            len(context_fragments)
        )
        self.revealed_context_fragments = context_fragments[:new_revealed]

        revealed_context = " ".join(self.revealed_context_fragments)
        self.context_retrieval_turns += 1

        # Check if enough context revealed or max turns reached
        if self.context_retrieval_turns >= self.config.max_turns_per_question or \
           new_revealed >= len(context_fragments):
            self.episode_phase = EpisodePhase.ACTIVE
            # Update current example with full context
            if self.current_example:
                self.current_example.metadata["revealed_context"] = revealed_context
        else:
            # Stay in retrieval phase
            pass

        return self._create_observation(
            question=self.current_example.question if self.current_example else "",
            context=revealed_context,
            feedback=f"Context revealed: {new_revealed}/{len(context_fragments)} fragments",
            metadata={
                "fragments_revealed": new_revealed,
                "total_fragments": len(context_fragments),
                "phase": self.episode_phase.value
            }
        )

    def _create_observation(

        self,

        question: str = "",

        context: str = "",

        ground_truth: str = "",

        feedback: str = "",

        reward: Optional[float] = None,

        done: bool = False,

        is_hallucination: bool = False,

        hallucination_type: HallucinationType = HallucinationType.NONE,

        hallucination_severity: HallucinationSeverity = HallucinationSeverity.NONE,

        grounding_score: float = 0.0,

        metadata: Optional[Dict[str, Any]] = None

    ) -> HallucinationObservation:
        """Create a comprehensive observation."""
        accuracy_so_far = self.total_correct / max(1, self.step_count) if self.step_count > 0 else 0.0

        # Extract reward breakdown from metadata if available
        reward_breakdown = None
        semantic_analysis = None
        citation_analysis = None
        if metadata:
            reward_breakdown = metadata.get("reward_breakdown")
            semantic_analysis = metadata.get("semantic_analysis")
            citation_analysis = metadata.get("citation_analysis")

        return HallucinationObservation(
            question=question,
            context=context,
            ground_truth=ground_truth,
            question_id=self.current_example.id if self.current_example else "",
            source_dataset=self.current_example.source if self.current_example else "",
            done=done,
            reward=reward,
            feedback=feedback,
            is_hallucination=is_hallucination,
            hallucination_type=hallucination_type,
            hallucination_severity=hallucination_severity,
            grounding_score=grounding_score,
            accuracy_so_far=accuracy_so_far,
            attempts_remaining=max(0, self.config.max_questions_per_episode - self.step_count),
            current_streak=self.current_streak,
            best_streak=self.best_streak,
            difficulty_level=self._get_current_difficulty().value if hasattr(self._get_current_difficulty(), 'value') else str(self._get_current_difficulty()),
            curriculum_progress=self.step_count / max(1, self.config.max_questions_per_episode),
            skill_rating=self.skill_rating,
            dialogue=self.dialogue,
            reward_breakdown=reward_breakdown,
            semantic_analysis=semantic_analysis,
            citation_analysis=citation_analysis,
            metadata=metadata or {}
        )

    def _create_error_observation(self, error_message: str) -> HallucinationObservation:
        """Create an error observation."""
        return HallucinationObservation(
            done=True,
            reward=0.0,
            question="",
            context="",
            feedback=f"Error: {error_message}",
            is_hallucination=False,
            grounding_score=0.0,
            accuracy_so_far=0.0,
            attempts_remaining=0,
            reward_breakdown=None,
            semantic_analysis=None,
            citation_analysis=None,
            metadata={"error": error_message}
        )

    def _end_episode(self) -> HallucinationObservation:
        """End the current episode."""
        self.episode_phase = EpisodePhase.COMPLETION

        # Update curriculum
        self._update_curriculum()

        return HallucinationObservation(
            done=True,
            reward=sum(self.reward_history) / max(1, len(self.reward_history)),
            question="",
            context="",
            feedback=self._generate_episode_summary(),
            is_hallucination=False,
            grounding_score=0.0,
            accuracy_so_far=self.total_correct / max(1, self.step_count),
            attempts_remaining=0,
            metadata={
                "episode_complete": True,
                "final_reward": sum(self.reward_history) / max(1, len(self.reward_history)),
                "total_hallucinations": self.total_hallucinations,
                "total_correct": self.total_correct,
            }
        )

    def _check_early_stopping(self, is_hallucination: bool, correctness: float, calibration_error: float) -> Optional[str]:
        """

        Check if episode should stop early based on performance conditions.



        Returns:

            str describing early stop reason, or None if should continue.

        """
        if not self.config.early_stopping_enabled:
            return None

        # Require minimum steps before early stopping
        if self.step_count < 3:
            return None

        # 1. Hallucination cascade: too many consecutive hallucinations
        if self.consecutive_hallucinations >= self.config.early_stopping_hallucination_cascade:
            return f"hallucination_cascade ({self.consecutive_hallucinations} consecutive)"

        # 2. Consecutive failures: poor performance
        if self.consecutive_failures >= self.config.early_stopping_patience:
            return f"consecutive_failures ({self.consecutive_failures} below {self.config.early_stopping_min_reward})"

        # 3. Calibration failure: confidence systematically misaligned
        if len(self.calibration_history) >= 5:
            avg_calibration_error = sum(self.calibration_history[-5:]) / 5
            if avg_calibration_error > self.config.early_stopping_calibration_failure:
                return f"calibration_failure (avg error: {avg_calibration_error:.2f})"

        # 4. Perfect run: early completion after consistent high performance
        if self.consecutive_perfect >= self.config.early_stopping_perfect_run:
            if self.step_count >= self.config.min_questions_for_completion:
                return f"perfect_run ({self.consecutive_perfect} consecutive correct)"

        return None

    def _get_context_for_observation(self, example: Optional[QAExample]) -> str:
        """Get context, potentially with partial revelation for challenges."""
        if not example:
            return ""

        # Check if context retrieval is enabled
        if self.config.enable_multi_turn and self.revealed_context_fragments:
            return " ".join(self.revealed_context_fragments)

        return example.context

    def _get_current_difficulty(self) -> DifficultyLevel:
        """

        Determine current difficulty based on performance with hysteresis.



        Uses smooth difficulty scaling with:

        - Stage-specific thresholds

        - Minimum steps at each level (hysteresis)

        - EXPERT level progression

        """
        if not self.config.adaptive_difficulty:
            return self.config.initial_difficulty

        # Need enough history for reliable assessment
        if len(self.reward_history) < 3:
            return self.config.initial_difficulty

        # Calculate recent performance with exponential weighting
        recent_rewards = self.reward_history[-10:] if len(self.reward_history) >= 10 else self.reward_history
        avg_recent_reward = sum(recent_rewards) / len(recent_rewards)

        # Get current difficulty from example
        current_difficulty = self.config.initial_difficulty
        if self.current_example:
            # Convert string to DifficultyLevel enum if needed
            example_diff = self.current_example.difficulty
            if isinstance(example_diff, str):
                try:
                    current_difficulty = DifficultyLevel(example_diff.lower())
                except ValueError:
                    current_difficulty = self.config.initial_difficulty
            else:
                current_difficulty = example_diff

        # Stage-specific mastery thresholds
        mastery_thresholds = {
            DifficultyLevel.BEGINNER: 0.60,
            DifficultyLevel.INTERMEDIATE: 0.65,
            DifficultyLevel.ADVANCED: 0.75,
            DifficultyLevel.EXPERT: 0.85,
        }

        # Regression thresholds (lower than mastery to avoid oscillation)
        regression_thresholds = {
            DifficultyLevel.BEGINNER: 0.30,
            DifficultyLevel.INTERMEDIATE: 0.40,
            DifficultyLevel.ADVANCED: 0.50,
            DifficultyLevel.EXPERT: 0.60,
        }

        # Difficulty progression order
        difficulty_order = [
            DifficultyLevel.BEGINNER,
            DifficultyLevel.INTERMEDIATE,
            DifficultyLevel.ADVANCED,
            DifficultyLevel.EXPERT,
        ]

        current_idx = difficulty_order.index(current_difficulty) if current_difficulty in difficulty_order else 0

        # Check for promotion
        if avg_recent_reward > mastery_thresholds.get(current_difficulty, 0.7):
            # Promote if not at EXPERT
            if current_idx < len(difficulty_order) - 1:
                return difficulty_order[current_idx + 1]

        # Check for demotion
        elif avg_recent_reward < regression_thresholds.get(current_difficulty, 0.4):
            # Demote if not at BEGINNER
            if current_idx > 0:
                return difficulty_order[current_idx - 1]

        return current_difficulty

    def _update_curriculum(self) -> None:
        """

        Update curriculum stage based on episode performance.



        Supports:

        - Advancement on sustained high performance

        - Regression on sustained poor performance

        - Stage-specific thresholds

        """
        if not self.config.curriculum_enabled:
            return

        episode_reward = sum(self.reward_history) / max(1, len(self.reward_history))
        self.curriculum_performance.append(episode_reward)

        # Calculate statistics
        avg_reward = sum(self.curriculum_performance) / len(self.curriculum_performance)
        recent_rewards = self.curriculum_performance[-10:] if len(self.curriculum_performance) >= 10 else self.curriculum_performance
        recent_avg = sum(recent_rewards) / len(recent_rewards)

        # Stage-specific thresholds
        advancement_threshold = self.config.curriculum_mastery_threshold
        regression_threshold = self.config.curriculum_regression_threshold

        # Check for curriculum advancement (sustained high performance)
        if len(self.curriculum_performance) >= self.config.min_steps_per_curriculum_stage:
            if recent_avg > advancement_threshold:
                self.curriculum_stage += 1
                self.curriculum_performance = []  # Reset for next stage
                logger.info(f"Advanced to curriculum stage {self.curriculum_stage} (avg: {recent_avg:.2f})")

            # Check for curriculum regression (sustained poor performance)
            elif recent_avg < regression_threshold and self.curriculum_stage > 0:
                self.curriculum_stage = max(0, self.curriculum_stage - 1)
                self.curriculum_performance = []
                logger.info(f"Regressed to curriculum stage {self.curriculum_stage} (avg: {recent_avg:.2f})")

    def _update_agent_profile(self) -> None:
        """Update the agent's long-term skill profile."""
        if not self.agent_profile:
            self.agent_profile = AgentSkillProfile()

        # Update metrics
        total_steps = self.agent_profile.total_steps + self.step_count
        weight = self.step_count / max(1, total_steps)

        self.agent_profile.overall_accuracy = (
            (1 - weight) * self.agent_profile.overall_accuracy +
            weight * (self.total_correct / max(1, self.step_count))
        )
        self.agent_profile.grounding_skill = (
            (1 - weight) * self.agent_profile.grounding_skill +
            weight * sum(self.reward_history) / max(1, len(self.reward_history))
        )
        self.agent_profile.hallucination_rate = (
            (1 - weight) * self.agent_profile.hallucination_rate +
            weight * (self.total_hallucinations / max(1, self.step_count))
        )
        self.agent_profile.total_episodes += 1
        self.agent_profile.total_steps = total_steps

        # Update difficulty ceiling
        if self.agent_profile.overall_accuracy > 0.8:
            self.agent_profile.difficulty_ceiling = DifficultyLevel.EXPERT
        elif self.agent_profile.overall_accuracy > 0.6:
            self.agent_profile.difficulty_ceiling = DifficultyLevel.ADVANCED
        elif self.agent_profile.overall_accuracy > 0.4:
            self.agent_profile.difficulty_ceiling = DifficultyLevel.INTERMEDIATE
        else:
            self.agent_profile.difficulty_ceiling = DifficultyLevel.BEGINNER

    def _generate_episode_summary(self) -> str:
        """Generate a summary of the completed episode."""
        total_reward = sum(self.reward_history) / max(1, len(self.reward_history))
        accuracy = self.total_correct / max(1, self.step_count)

        summary_parts = [
            f"Episode completed!",
            f"Total reward: {total_reward:.2f}",
            f"Accuracy: {accuracy:.1%}",
            f"Hallucinations: {self.total_hallucinations}/{self.step_count}",
            f"Best streak: {self.best_streak}",
        ]

        if total_reward > 0.8:
            summary_parts.append("Performance: OUTSTANDING!")
        elif total_reward > 0.6:
            summary_parts.append("Performance: Good")
        elif total_reward > 0.4:
            summary_parts.append("Performance: Needs improvement")
        else:
            summary_parts.append("Performance: Poor - review and recalibrate")

        return " ".join(summary_parts)

    def _extract_reward_breakdown(self, info: Dict[str, Any]) -> Dict[str, Any]:
        """Extract reward breakdown from grader info."""
        components = info.get("components", {})
        return {
            "factual_correctness": info.get("correctness", 0.0),
            "source_grounding": info.get("grounding", 0.0),
            "citation_accuracy": info.get("citation_analysis", {}).get("best_match_score", 0.0),
            "confidence_calibration": info.get("calibration", 0.0),
            "semantic_consistency": info.get("semantic_consistency", 0.0),
            "hallucination_penalty": info.get("hallucination_penalty", 0.0),
            "rouge_l": info.get("rouge_combined", 0.0),
            "bert_score": info.get("bertscore", {}).get("f1", 0.0) if isinstance(info.get("bertscore"), dict) else info.get("bertscore", 0.0),
            "align_score": info.get("alignscore", 0.0),
            "rouge_contrib": info.get("rouge_contrib", 0.0),
            "bertscore_contrib": info.get("bertscore_contrib", 0.0),
            "alignscore_contrib": info.get("alignscore_contrib", 0.0),
            "total": info.get("total_reward", 0.0),
            "difficulty_adjustment": info.get("difficulty_multiplier", 1.0),
            "consistency_bonus": info.get("consistency_bonus", 0.0),
        }

    def _split_context_into_fragments(self, context: str, num_fragments: int = 5) -> List[str]:
        """Split context into fragments for retrieval challenges."""
        if not context:
            return []

        sentences = context.split('.')
        fragments = []
        chunk_size = max(1, len(sentences) // num_fragments)

        for i in range(0, len(sentences), chunk_size):
            fragment = '.'.join(sentences[i:i + chunk_size]).strip()
            if fragment:
                fragments.append(fragment + '.')

        return fragments or [context]

    def _generate_clarification(self, question: str, example: Optional[QAExample]) -> str:
        """Generate a clarification response."""
        if not example:
            return "No context available for clarification."

        # Simple keyword-based clarification
        context_lower = example.context.lower()
        question_lower = question.lower()

        # Extract key terms from question
        key_terms = [w for w in question_lower.split() if len(w) > 3 and w not in {'what', 'when', 'where', 'who', 'why', 'how', 'does', 'have', 'has', 'with', 'from'}]

        clarifications = []
        for term in key_terms[:3]:
            if term in context_lower:
                clarifications.append(f"Context mentions '{term}'")

        return "; ".join(clarifications) if clarifications else "Review the provided context for relevant information."