File size: 5,279 Bytes
8176754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
Stability Tester Module
Tests if understanding holds under reformulation and stress testing
"""

import os
import re
from typing import List, Dict, Optional

import numpy as np
import requests
from sentence_transformers import SentenceTransformer

class StabilityTester:
    """Tests whether an explanation's understanding holds under reformulation.

    Each claim is scored with lexical heuristics (brevity, vague wording,
    unresolved opening pronouns); per-claim drift scores are averaged and
    inverted into an overall 0-100 stability score.
    """

    # Wording that, when it accumulates, suggests surface-level understanding.
    _VAGUE_TERMS = ('thing', 'stuff', 'kind of', 'sort of', 'basically', 'just', 'simply')
    # Pronouns that leave a claim dependent on unstated context when they open it.
    _UNCLEAR_REFS = ('this', 'that', 'it', 'these', 'those')

    def __init__(self):
        # Local sentence-embedding model used to vectorize the original explanation.
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.hf_api_key = os.getenv('HUGGINGFACE_API_KEY')
        # NOTE(review): endpoint/key are configured but never called in this
        # module — presumably reserved for LLM-generated reformulations; confirm.
        self.llm_endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
        self._ready = True

    def is_ready(self) -> bool:
        """Return True once the tester has finished initializing."""
        return self._ready

    async def test_stability(
        self,
        concept: str,
        original_explanation: str,
        claims: List[Dict]
    ) -> Dict:
        """
        Test explanation stability through reformulation

        Strategy:
        1. Generate re-prompts asking user to explain differently
        2. Simulate alternative explanations (or use original for drift)
        3. Measure semantic drift from original
        4. Identify claims that become unclear/contradictory

        Args:
            concept: Name of the concept being explained.
            original_explanation: The user's full explanation text.
            claims: Claim dicts; each must carry 'id' and 'text' keys.

        Returns:
            {
                'stability_score': float (0-100),
                'drift_scores': Dict[str, float],
                'unstable_claims': List[Dict],
                'stress_test_results': List[Dict]
            }
        """
        # Generate stress test prompts
        stress_prompts = self._generate_stress_prompts(concept)

        # For demo, analyze stability of original explanation.
        # In production, would actually re-prompt user or use LLM to generate
        # alternatives and compare embeddings against this one.
        original_embedding = self.embedding_model.encode(original_explanation)

        # Test claim stability
        unstable_claims = []
        claim_drift_scores = {}

        for claim in claims:
            stability = await self._test_claim_stability(
                claim=claim,
                concept=concept,
                all_claims=claims
            )

            claim_drift_scores[claim['id']] = stability['drift_score']

            if stability['is_unstable']:
                unstable_claims.append({
                    'claim': claim['text'],
                    'reason': stability['reason'],
                    'drift_score': stability['drift_score']
                })

        # Coerce to a plain float so the score stays JSON-serializable
        # (np.mean returns np.float64, which json.dumps rejects).
        avg_drift = float(np.mean(list(claim_drift_scores.values()))) if claim_drift_scores else 0.0
        stability_score = max(0.0, 100.0 - avg_drift * 100.0)

        # Report the three *worst* claims (highest drift), not merely the
        # first three encountered in input order.
        worst_claims = sorted(unstable_claims, key=lambda c: c['drift_score'], reverse=True)

        return {
            'stability_score': stability_score,
            'drift_scores': claim_drift_scores,
            'unstable_claims': worst_claims[:3],  # Top 3 by drift
            'stress_test_results': [
                {
                    'prompt': prompt,
                    'passes': not unstable_claims
                }
                for prompt in stress_prompts[:2]
            ]
        }

    def _generate_stress_prompts(self, concept: str) -> List[str]:
        """Generate stress test prompts that probe the concept from new angles."""
        return [
            f"Explain {concept} in a different way",
            f"What would happen if {concept} didn't exist?",
            f"Explain {concept} to a 10-year-old",
            f"What are the limits or boundary conditions of {concept}?"
        ]

    async def _test_claim_stability(
        self,
        claim: Dict,
        concept: str,
        all_claims: List[Dict]
    ) -> Dict:
        """Score a single claim's stability via lexical heuristics.

        Args:
            claim: Claim dict with a 'text' key.
            concept: The concept under test (currently unused; kept for
                future LLM-backed checks and interface stability).
            all_claims: Full claim list, used to decide whether an unclear
                pronoun reference could point at a sibling claim.

        Returns:
            Dict with 'is_unstable' (bool), 'reason' (str), 'drift_score' (float).
        """
        claim_text = claim['text']
        word_count = len(claim_text.split())

        # Very short claims (<5 words) rarely demonstrate real understanding.
        if word_count < 5:
            return {
                'is_unstable': True,
                'reason': 'Claim is too brief to demonstrate understanding',
                'drift_score': 0.6
            }

        # Match vague terms on word boundaries so e.g. 'just' does not fire
        # inside 'justice'/'adjust', or 'thing' inside 'nothing'.
        lowered = claim_text.lower()
        vague_count = sum(
            1 for term in self._VAGUE_TERMS
            if re.search(r'\b' + re.escape(term) + r'\b', lowered)
        )

        if vague_count >= 2:
            return {
                'is_unstable': True,
                'reason': 'Contains vague language suggesting surface understanding',
                'drift_score': 0.5
            }

        # Claims opening with a bare pronoun ("This ...", "It ...") likely
        # lean on an unstated antecedent and may not stand alone.
        has_unclear_ref = any(lowered.startswith(ref + ' ') for ref in self._UNCLEAR_REFS)

        if has_unclear_ref and len(all_claims) > 1:
            return {
                'is_unstable': True,
                'reason': 'Claim has unclear references and may not stand alone',
                'drift_score': 0.4
            }

        # Claim appears stable
        return {
            'is_unstable': False,
            'reason': 'Claim appears well-formed',
            'drift_score': 0.1
        }