# conceptvector/analysis/stability_tester.py
# Author: Tawhid Bin Omar
# Initial deployment of RealityCheck AI backend
# Commit: 8176754
"""
Stability Tester Module
Tests if understanding holds under reformulation and stress testing
"""
import os
import re
from typing import List, Dict, Optional

import numpy as np
import requests
from sentence_transformers import SentenceTransformer
class StabilityTester:
    """Tests whether an explanation of a concept holds up under
    reformulation and stress testing.

    Heuristic checks flag claims that are too brief, lean on vague
    language, or open with unclear references; per-claim drift scores
    are aggregated into an overall 0-100 stability score.
    """

    def __init__(self):
        # Embedding model is loaded for future semantic-drift measurement;
        # the current demo path uses only the heuristics below.
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.hf_api_key = os.getenv('HUGGINGFACE_API_KEY')
        self.llm_endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
        self._ready = True

    def is_ready(self) -> bool:
        """Return True once the tester has finished initializing."""
        return self._ready

    async def test_stability(
        self,
        concept: str,
        original_explanation: str,
        claims: List[Dict]
    ) -> Dict:
        """
        Test explanation stability through reformulation.

        Strategy:
        1. Generate re-prompts asking user to explain differently
        2. Simulate alternative explanations (or use original for drift)
        3. Measure semantic drift from original
        4. Identify claims that become unclear/contradictory

        Args:
            concept: the concept being explained.
            original_explanation: the user's explanation text (reserved
                for real drift measurement; unused by the demo heuristics).
            claims: list of dicts, each with at least 'id' and 'text' keys.

        Returns:
            {
                'stability_score': float (0-100),
                'drift_scores': Dict[str, float],
                'unstable_claims': List[Dict],
                'stress_test_results': List[Dict]
            }
        """
        # Generate stress test prompts
        stress_prompts = self._generate_stress_prompts(concept)

        # NOTE: In production this would re-prompt the user (or an LLM)
        # with the stress prompts and embed the alternatives to measure
        # real semantic drift against `original_explanation`. The demo
        # relies on per-claim heuristics only, so we deliberately skip
        # embedding here (previously the explanation was embedded and
        # the result discarded -- a wasted model inference).

        # Test each claim's stability and collect the unstable ones
        unstable_claims = []
        claim_drift_scores = {}
        for claim in claims:
            stability = await self._test_claim_stability(
                claim=claim,
                concept=concept,
                all_claims=claims
            )
            claim_drift_scores[claim['id']] = stability['drift_score']
            if stability['is_unstable']:
                unstable_claims.append({
                    'claim': claim['text'],
                    'reason': stability['reason'],
                    'drift_score': stability['drift_score']
                })

        # Overall stability: 100 minus mean drift expressed as a
        # percentage, clamped at 0. Cast to plain float because
        # np.mean returns np.float64, which is not JSON-serializable
        # by the stdlib encoder.
        if claim_drift_scores:
            avg_drift = float(np.mean(list(claim_drift_scores.values())))
        else:
            avg_drift = 0.0
        stability_score = max(0.0, 100.0 - (avg_drift * 100.0))

        return {
            'stability_score': stability_score,
            'drift_scores': claim_drift_scores,
            'unstable_claims': unstable_claims[:3],  # Top 3
            'stress_test_results': [
                {
                    'prompt': prompt,
                    'passes': len(unstable_claims) == 0
                }
                for prompt in stress_prompts[:2]
            ]
        }

    def _generate_stress_prompts(self, concept: str) -> List[str]:
        """Generate reformulation prompts used to stress-test understanding."""
        return [
            f"Explain {concept} in a different way",
            f"What would happen if {concept} didn't exist?",
            f"Explain {concept} to a 10-year-old",
            f"What are the limits or boundary conditions of {concept}?"
        ]

    async def _test_claim_stability(
        self,
        claim: Dict,
        concept: str,
        all_claims: List[Dict]
    ) -> Dict:
        """Heuristically test whether a single claim is stable.

        Args:
            claim: dict with at least a 'text' key.
            concept: the concept under test (reserved for future
                LLM-backed checks; unused by the current heuristics).
            all_claims: full claim list, used to judge whether a claim
                with unclear references could stand alone.

        Returns:
            dict with 'is_unstable' (bool), 'reason' (str),
            and 'drift_score' (float in [0, 1]).
        """
        claim_text = claim['text']
        word_count = len(claim_text.split())

        # Very short claims (<5 words) are often unstable
        if word_count < 5:
            return {
                'is_unstable': True,
                'reason': 'Claim is too brief to demonstrate understanding',
                'drift_score': 0.6
            }

        # Check for vague language. Match whole words/phrases so that
        # e.g. 'just' does not fire inside 'adjust' or 'justify', and
        # 'thing' does not fire inside 'nothing' or 'everything'.
        vague_terms = ['thing', 'stuff', 'kind of', 'sort of', 'basically', 'just', 'simply']
        lowered = claim_text.lower()
        vague_count = sum(
            1 for term in vague_terms
            if re.search(r'\b' + re.escape(term) + r'\b', lowered)
        )
        if vague_count >= 2:
            return {
                'is_unstable': True,
                'reason': 'Contains vague language suggesting surface understanding',
                'drift_score': 0.5
            }

        # Claims that open with a bare pronoun ('this', 'it', ...) likely
        # depend on another claim for their antecedent and may not stand
        # alone; only flag when there are other claims to depend on.
        unclear_refs = ['this', 'that', 'it', 'these', 'those']
        has_unclear_ref = any(lowered.startswith(ref + ' ') for ref in unclear_refs)
        if has_unclear_ref and len(all_claims) > 1:
            return {
                'is_unstable': True,
                'reason': 'Claim has unclear references and may not stand alone',
                'drift_score': 0.4
            }

        # Claim appears stable
        return {
            'is_unstable': False,
            'reason': 'Claim appears well-formed',
            'drift_score': 0.1
        }