"""
Stability Tester Module

Tests if understanding holds under reformulation and stress testing.
"""

from typing import List, Dict, Optional
import os

import requests
import numpy as np
from sentence_transformers import SentenceTransformer


class StabilityTester:
    """Scores how stable an explanation of a concept is under stress testing.

    Uses lightweight text heuristics per claim (length, vagueness, unclear
    references) plus a sentence-embedding model intended for measuring
    semantic drift between reformulations.
    """

    def __init__(self):
        # Sentence-embedding model used to measure semantic drift.
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        # Hugging Face credentials/endpoint for LLM-generated reformulations.
        # NOTE(review): neither is used anywhere in this module yet.
        self.hf_api_key = os.getenv('HUGGINGFACE_API_KEY')
        self.llm_endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
        self._ready = True

    def is_ready(self) -> bool:
        """Return True once the tester has finished initializing."""
        return self._ready

    async def test_stability(
        self,
        concept: str,
        original_explanation: str,
        claims: List[Dict]
    ) -> Dict:
        """
        Test explanation stability through reformulation

        Strategy:
        1. Generate re-prompts asking user to explain differently
        2. Simulate alternative explanations (or use original for drift)
        3. Measure semantic drift from original
        4. Identify claims that become unclear/contradictory

        Returns:
        {
            'stability_score': float (0-100),
            'drift_scores': Dict[str, float],
            'unstable_claims': List[Dict],
            'stress_test_results': List[Dict]
        }
        """
        # Generate stress test prompts
        stress_prompts = self._generate_stress_prompts(concept)

        # FIX: the previous version computed an embedding of
        # `original_explanation` here and never used it — a costly dead
        # model call. Re-introduce the encode() when alternative
        # explanations are actually compared against the original.

        # Test claim stability
        unstable_claims = []
        claim_drift_scores = {}

        for claim in claims:
            # Each claim is scored independently; `all_claims` lets the
            # heuristic detect claims that cannot stand alone.
            stability = await self._test_claim_stability(
                claim=claim,
                concept=concept,
                all_claims=claims
            )

            claim_drift_scores[claim['id']] = stability['drift_score']

            if stability['is_unstable']:
                unstable_claims.append({
                    'claim': claim['text'],
                    'reason': stability['reason'],
                    'drift_score': stability['drift_score']
                })

        # Overall score: 100 minus mean drift scaled to 0-100, floored at 0.
        # No claims at all yields a perfect 100.
        # FIX: wrap np.mean in float() so the score is a plain Python float
        # (np.float64 previously leaked into the result dict).
        avg_drift = float(np.mean(list(claim_drift_scores.values()))) if claim_drift_scores else 0.0
        stability_score = max(0.0, 100.0 - (avg_drift * 100.0))

        return {
            'stability_score': stability_score,
            'drift_scores': claim_drift_scores,
            'unstable_claims': unstable_claims[:3],  # Top 3
            'stress_test_results': [
                {
                    'prompt': prompt,
                    # Heuristic: a prompt "passes" only when no claim was
                    # flagged unstable (same flag for every prompt).
                    'passes': len(unstable_claims) == 0
                }
                for prompt in stress_prompts[:2]
            ]
        }

    def _generate_stress_prompts(self, concept: str) -> List[str]:
        """Generate stress test prompts"""
        return [
            f"Explain {concept} in a different way",
            f"What would happen if {concept} didn't exist?",
            f"Explain {concept} to a 10-year-old",
            f"What are the limits or boundary conditions of {concept}?"
        ]

    async def _test_claim_stability(
        self,
        claim: Dict,
        concept: str,
        all_claims: List[Dict]
    ) -> Dict:
        """Test if a single claim is stable.

        Returns a dict with keys 'is_unstable' (bool), 'reason' (str) and
        'drift_score' (float, higher = less stable). Checks are applied in
        order; the first failing heuristic wins.
        """
        # Heuristic: claims that are very short or vague are unstable
        claim_text = claim['text']
        word_count = len(claim_text.split())

        # Very short claims (<5 words) are often unstable
        if word_count < 5:
            return {
                'is_unstable': True,
                'reason': 'Claim is too brief to demonstrate understanding',
                'drift_score': 0.6
            }

        # Check for vague language
        vague_terms = ['thing', 'stuff', 'kind of', 'sort of', 'basically', 'just', 'simply']
        vague_count = sum(1 for term in vague_terms if term in claim_text.lower())

        if vague_count >= 2:
            return {
                'is_unstable': True,
                'reason': 'Contains vague language suggesting surface understanding',
                'drift_score': 0.5
            }

        # Check if claim is standalone or depends on others
        # Claims that reference "this" or "that" without clear antecedent are unstable
        unclear_refs = ['this', 'that', 'it', 'these', 'those']
        has_unclear_ref = any(claim_text.lower().startswith(ref + ' ') for ref in unclear_refs)

        if has_unclear_ref and len(all_claims) > 1:
            return {
                'is_unstable': True,
                'reason': 'Claim has unclear references and may not stand alone',
                'drift_score': 0.4
            }

        # Claim appears stable
        return {
            'is_unstable': False,
            'reason': 'Claim appears well-formed',
            'drift_score': 0.1
        }