"""
Stability Tester Module

Tests if understanding holds under reformulation and stress testing.
"""

from typing import List, Dict, Optional
import os

import requests
import numpy as np
from sentence_transformers import SentenceTransformer


class StabilityTester:
    """Scores how stable an explanation of a concept is under stress testing.

    Uses lightweight text heuristics per claim (length, vagueness, unclear
    references) plus a sentence-embedding model intended for measuring
    semantic drift between reformulations.
    """

    def __init__(self):
        # Sentence-embedding model used to measure semantic drift.
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        # Hugging Face credentials/endpoint for LLM-generated reformulations.
        # NOTE(review): neither is used anywhere in this module yet.
        self.hf_api_key = os.getenv('HUGGINGFACE_API_KEY')
        self.llm_endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
        self._ready = True

    def is_ready(self) -> bool:
        """Return True once the tester has finished initializing."""
        return self._ready

    async def test_stability(
        self,
        concept: str,
        original_explanation: str,
        claims: List[Dict]
    ) -> Dict:
        """
        Test explanation stability through reformulation

        Strategy:
        1. Generate re-prompts asking user to explain differently
        2. Simulate alternative explanations (or use original for drift)
        3. Measure semantic drift from original
        4. Identify claims that become unclear/contradictory

        Returns:
        {
            'stability_score': float (0-100),
            'drift_scores': Dict[str, float],
            'unstable_claims': List[Dict],
            'stress_test_results': List[Dict]
        }
        """
        # Generate stress test prompts
        stress_prompts = self._generate_stress_prompts(concept)

        # FIX: the previous version computed an embedding of
        # `original_explanation` here and never used it — a costly dead
        # model call. Re-introduce the encode() when alternative
        # explanations are actually compared against the original.

        # Test claim stability
        unstable_claims = []
        claim_drift_scores = {}

        for claim in claims:
            # Each claim is scored independently; `all_claims` lets the
            # heuristic detect claims that cannot stand alone.
            stability = await self._test_claim_stability(
                claim=claim,
                concept=concept,
                all_claims=claims
            )

            claim_drift_scores[claim['id']] = stability['drift_score']

            if stability['is_unstable']:
                unstable_claims.append({
                    'claim': claim['text'],
                    'reason': stability['reason'],
                    'drift_score': stability['drift_score']
                })

        # Overall score: 100 minus mean drift scaled to 0-100, floored at 0.
        # No claims at all yields a perfect 100.
        # FIX: wrap np.mean in float() so the score is a plain Python float
        # (np.float64 previously leaked into the result dict).
        avg_drift = float(np.mean(list(claim_drift_scores.values()))) if claim_drift_scores else 0.0
        stability_score = max(0.0, 100.0 - (avg_drift * 100.0))

        return {
            'stability_score': stability_score,
            'drift_scores': claim_drift_scores,
            'unstable_claims': unstable_claims[:3],  # Top 3
            'stress_test_results': [
                {
                    'prompt': prompt,
                    # Heuristic: a prompt "passes" only when no claim was
                    # flagged unstable (same flag for every prompt).
                    'passes': len(unstable_claims) == 0
                }
                for prompt in stress_prompts[:2]
            ]
        }

    def _generate_stress_prompts(self, concept: str) -> List[str]:
        """Generate stress test prompts"""
        return [
            f"Explain {concept} in a different way",
            f"What would happen if {concept} didn't exist?",
            f"Explain {concept} to a 10-year-old",
            f"What are the limits or boundary conditions of {concept}?"
        ]

    async def _test_claim_stability(
        self,
        claim: Dict,
        concept: str,
        all_claims: List[Dict]
    ) -> Dict:
        """Test if a single claim is stable.

        Returns a dict with keys 'is_unstable' (bool), 'reason' (str) and
        'drift_score' (float, higher = less stable). Checks are applied in
        order; the first failing heuristic wins.
        """
        # Heuristic: claims that are very short or vague are unstable
        claim_text = claim['text']
        word_count = len(claim_text.split())

        # Very short claims (<5 words) are often unstable
        if word_count < 5:
            return {
                'is_unstable': True,
                'reason': 'Claim is too brief to demonstrate understanding',
                'drift_score': 0.6
            }

        # Check for vague language
        vague_terms = ['thing', 'stuff', 'kind of', 'sort of', 'basically', 'just', 'simply']
        vague_count = sum(1 for term in vague_terms if term in claim_text.lower())

        if vague_count >= 2:
            return {
                'is_unstable': True,
                'reason': 'Contains vague language suggesting surface understanding',
                'drift_score': 0.5
            }

        # Check if claim is standalone or depends on others
        # Claims that reference "this" or "that" without clear antecedent are unstable
        unclear_refs = ['this', 'that', 'it', 'these', 'those']
        has_unclear_ref = any(claim_text.lower().startswith(ref + ' ') for ref in unclear_refs)

        if has_unclear_ref and len(all_claims) > 1:
            return {
                'is_unstable': True,
                'reason': 'Claim has unclear references and may not stand alone',
                'drift_score': 0.4
            }

        # Claim appears stable
        return {
            'is_unstable': False,
            'reason': 'Claim appears well-formed',
            'drift_score': 0.1
        }