Spaces:
Runtime error
Runtime error
| """ | |
| COMPREHENSIVE TEST CASE STUDY | |
| ============================== | |
| Scenario: AI Safety Discussion between researchers | |
| Purpose: Demonstrate full realization crystallization pipeline | |
| Test Objectives: | |
| 1. Extract realizations from complex multi-turn conversation | |
| 2. Calculate Q-scores with full transparency | |
| 3. Assign layers automatically | |
| 4. Build parent-child graph (بنات افكار) | |
| 5. Test retrieval system | |
| 6. Assess system performance | |
| """ | |
import json
import os
import statistics
from datetime import datetime

from layers.layer_2_core.realization_engine import RealizationEngine, RealizationFeatures
class TestCaseStudy:
    """Scripted end-to-end exercise of the realization engine.

    Eight hand-scored realizations from a fictional AI-safety discussion are
    added to a ``RealizationEngine``. The run then reports the layer
    distribution, parent/child generativity, retrieval hits and a list of
    pass/fail quality criteria, and finally exports everything as JSON.

    Attributes:
        engine: The ``RealizationEngine`` instance being exercised.
        test_results: JSON-serialisable dict that accumulates the run's
            realization summaries, metrics and assessment.

    NOTE(review): several banner strings below contain glyphs that look like
    mis-decoded UTF-8 (emoji / Arabic). They are kept exactly as found so the
    program output is unchanged -- confirm the file's intended encoding.
    """

    # This is a runnable demo class, not a pytest test class; opt out of
    # pytest collection (it would otherwise warn about the __init__).
    __test__ = False

    def __init__(self):
        """Create a fresh engine and an empty result record."""
        self.engine = RealizationEngine()
        self.test_results = {
            'test_name': 'AI Safety Discussion Analysis',
            'timestamp': datetime.now().isoformat(),
            'realizations': [],
            'metrics': {},
            'assessment': {}
        }

    def run_test(self) -> None:
        """Execute every phase of the case study in order."""
        print("="*80)
        print("TEST CASE STUDY: AI SAFETY DISCUSSION")
        print("="*80)

        # (banner, step) pairs; order matters because later phases read
        # what earlier ones stored in the engine and in ``test_results``.
        phases = (
            ("\n๐ PHASE 1: CONVERSATION SIMULATION\n", self.simulate_conversation),
            ("\n๐ PHASE 2: REALIZATION EXTRACTION & SCORING\n", self.extract_and_score_realizations),
            ("\n๐ PHASE 3: LAYER DISTRIBUTION ANALYSIS\n", self.analyze_layer_distribution),
            ("\n๐ PHASE 4: GENERATIVITY ANALYSIS (ุจูุงุช ุงููุงุฑ)\n", self.analyze_generativity),
            ("\n๐ PHASE 5: RETRIEVAL SYSTEM TEST\n", self.test_retrieval),
            ("\n๐ PHASE 6: QUALITY ASSESSMENT\n", self.assess_quality),
            ("\n๐ PHASE 7: EXPORT RESULTS\n", self.export_results),
        )
        for banner, step in phases:
            print(banner)
            step()

        print("\n" + "="*80)
        print("TEST COMPLETE")
        print("="*80)

    def simulate_conversation(self) -> None:
        """Announce the simulated conversation (prints only, no state change)."""
        print("Simulating 8-turn conversation between AI safety researchers...")
        print("Topic: Alignment, interpretability, and emergent behaviors\n")

    def _add_and_record(self, **realization_kwargs):
        """Add one realization to the engine and log its summary.

        Args:
            **realization_kwargs: Passed unchanged to
                ``engine.add_realization``.

        Returns:
            The realization object returned by the engine.
        """
        realization = self.engine.add_realization(**realization_kwargs)
        self.test_results['realizations'].append({
            'id': realization.id,
            'content': realization.content,
            'q_score': realization.q_score,
            'layer': realization.layer
        })
        return realization

    def extract_and_score_realizations(self) -> None:
        """Add the eight scripted realizations, linking each to its parents."""
        # ===================================================================
        # TURN 1: Foundational observation about AI systems
        # ===================================================================
        print("Turn 1: Discussing AI scaling...")
        r1 = self._add_and_record(
            content="Larger language models exhibit emergent capabilities not present in smaller models",
            features=RealizationFeatures(
                grounding=0.95,      # Well-documented (GPT-3, GPT-4 papers)
                certainty=0.92,      # Strong empirical evidence
                structure=0.90,      # Clear statement
                applicability=0.88,  # Applies to model development
                coherence=1.0,       # No contradictions
                generativity=0.85    # Generates safety questions
            ),
            turn_number=1,
            context="Researcher A observes scaling trends",
            evidence=["GPT-3 paper", "Emergent Abilities paper"]
        )

        # ===================================================================
        # TURN 2: The alignment problem emerges
        # ===================================================================
        print("Turn 2: Identifying the core problem...")
        r2 = self._add_and_record(
            content="AI systems optimize for specified objectives, not intended outcomes - this is the alignment problem",
            features=RealizationFeatures(
                grounding=0.92,      # Well-established in AI safety literature
                certainty=0.95,      # Core problem, high certainty
                structure=0.93,      # Clear problem statement
                applicability=0.94,  # Critical for AI development
                coherence=0.95,      # Consistent with Turn 1
                generativity=0.90    # Generates research directions
            ),
            turn_number=2,
            parents=[r1.id],
            context="Researcher B identifies misalignment risk",
            evidence=["Superintelligence by Bostrom", "AI Alignment Forum"]
        )

        # ===================================================================
        # TURN 3: Interpretability as solution direction
        # ===================================================================
        print("Turn 3: Proposing interpretability approach...")
        r3 = self._add_and_record(
            content="Mechanistic interpretability - understanding model internals - is necessary for alignment",
            features=RealizationFeatures(
                grounding=0.85,      # Emerging field, less established
                certainty=0.80,      # Strong belief but not proven
                structure=0.88,      # Clear proposal
                applicability=0.90,  # Highly actionable
                coherence=0.92,      # Follows from alignment problem
                generativity=0.88    # Generates research methods
            ),
            turn_number=3,
            parents=[r2.id],
            context="Researcher C proposes interpretability research",
            evidence=["Anthropic's interpretability work", "Circuits papers"]
        )

        # ===================================================================
        # TURN 4: Measurement challenge
        # ===================================================================
        print("Turn 4: Identifying measurement problem...")
        r4 = self._add_and_record(
            content="We cannot fully verify AI system behavior - the testing problem is computationally intractable",
            features=RealizationFeatures(
                grounding=0.98,      # Computational complexity theory
                certainty=0.90,      # Strong theoretical backing
                structure=0.92,      # Clear limitation
                applicability=0.85,  # Constrains verification approaches
                coherence=0.88,      # Complicates Turn 3
                generativity=0.82    # Generates verification methods
            ),
            turn_number=4,
            parents=[r3.id],
            context="Researcher D identifies fundamental limit",
            evidence=["Computational complexity", "Verification literature"]
        )

        # ===================================================================
        # TURN 5: Sandbox approach
        # ===================================================================
        print("Turn 5: Proposing containment strategy...")
        r5 = self._add_and_record(
            content="AI systems should be developed in sandboxed environments with capability constraints",
            features=RealizationFeatures(
                grounding=0.80,      # Practical approach, less theoretical
                certainty=0.75,      # Uncertain effectiveness
                structure=0.85,      # Clear strategy
                applicability=0.92,  # Very actionable
                coherence=0.85,      # Partial solution to Turn 4
                generativity=0.78    # Generates safety protocols
            ),
            turn_number=5,
            parents=[r4.id],
            context="Researcher E proposes containment",
            evidence=["Capability control literature"]
        )

        # ===================================================================
        # TURN 6: Multi-agent coordination insight
        # ===================================================================
        print("Turn 6: Discovering coordination dynamics...")
        r6 = self._add_and_record(
            content="Multiple AI systems will exhibit emergent coordination behaviors not predictable from individual analysis",
            features=RealizationFeatures(
                grounding=0.82,      # Game theory + emergence literature
                certainty=0.85,      # Strong theoretical basis
                structure=0.88,      # Clear prediction
                applicability=0.80,  # Applies to multi-agent systems
                coherence=0.90,      # Extends Turn 1 (emergence)
                generativity=0.92    # Opens multi-agent research
            ),
            turn_number=6,
            parents=[r1.id, r2.id],
            context="Researcher F identifies multi-agent risk",
            evidence=["Multi-agent RL", "Game theory"]
        )

        # ===================================================================
        # TURN 7: Synthesis - layered safety
        # ===================================================================
        print("Turn 7: Synthesizing into framework...")
        r7 = self._add_and_record(
            content="AI safety requires layered defenses: interpretability + verification + containment + coordination protocols",
            features=RealizationFeatures(
                grounding=0.88,      # Synthesizes prior work
                certainty=0.87,      # Confident in framework
                structure=0.92,      # Clear framework
                applicability=0.95,  # Highly actionable
                coherence=0.95,      # Synthesizes Turns 3-6
                generativity=0.88    # Generates integrated approach
            ),
            turn_number=7,
            parents=[r3.id, r4.id, r5.id, r6.id],
            context="Researcher A synthesizes discussion",
            evidence=["Defense in depth", "Security engineering"]
        )

        # ===================================================================
        # TURN 8: Meta-realization
        # ===================================================================
        print("Turn 8: Meta-observation about the discussion...")
        self._add_and_record(
            content="This conversation itself demonstrates how realizations build on each other to form coherent frameworks",
            features=RealizationFeatures(
                grounding=0.90,      # Observable in this conversation
                certainty=0.88,      # We can see it happening
                structure=0.94,      # Very clear observation
                applicability=0.85,  # Applies to knowledge work
                coherence=0.98,      # Meta-coherent
                generativity=0.90    # Self-referential insight
            ),
            turn_number=8,
            parents=[r7.id],
            context="Researcher B observes the process",
            evidence=["This very conversation"]
        )

        print(f"\nโ Extracted {len(self.test_results['realizations'])} realizations")

    def analyze_layer_distribution(self) -> None:
        """Print engine statistics and per-layer Q-score spread; store the counts."""
        self.engine.print_stats()

        metrics = self.test_results['metrics']
        # dict(...) takes a plain-dict copy of the engine's layer counts.
        metrics['layer_distribution'] = dict(
            self.engine.stats['layer_distribution']
        )
        metrics['avg_q_score'] = self.engine.stats['avg_q_score']

        print("\nQuality Analysis by Layer:")
        for layer in [0, 1, 2, 3, 'N']:
            scores = [r.q_score for r in self.engine.layers[layer].values()]
            if not scores:
                continue
            avg_q = sum(scores) / len(scores)
            print(f" Layer {layer}: avg={avg_q:.4f}, min={min(scores):.4f}, max={max(scores):.4f}")

    def analyze_generativity(self) -> None:
        """Report the (up to five) realizations with the most children."""
        print("Most Generative Realizations (ุจูุงุช ุงููุงุฑ):\n")

        # Only realizations that actually spawned something, most prolific first.
        parents_ranked = sorted(
            ((r, len(r.children)) for r in self.engine.index.values() if r.children),
            key=lambda pair: pair[1],
            reverse=True,
        )

        generativity_data = []
        for rank, (parent, child_count) in enumerate(parents_ranked[:5], 1):
            print(f"{rank}. {parent.content[:60]}...")
            print(f" Q={parent.q_score:.4f}, Layer {parent.layer}")
            print(f" Generated {child_count} children:")

            children_info = []
            for child_id in parent.children:
                child = self.engine.index[child_id]
                print(f" โ {child.content[:50]}... (Q={child.q_score:.3f})")
                children_info.append({
                    'content': child.content,
                    'q_score': child.q_score
                })

            generativity_data.append({
                'parent': parent.content,
                'q_score': parent.q_score,
                'child_count': child_count,
                'children': children_info
            })
            print()

        self.test_results['metrics']['generativity'] = generativity_data

    def test_retrieval(self) -> None:
        """Run five fixed queries through ``engine.retrieve`` and record the hits.

        A query counts as "found" when the engine returns any result at all;
        the relevance of the best match is printed but not checked.
        """
        # (topic label, query text) -- only the query text is sent to the engine.
        queries = [
            ("alignment", "alignment problem"),
            ("interpretability", "understanding models"),
            ("verification", "testing problem"),
            ("safety framework", "layered defenses"),
            ("emergence", "emergent capabilities")
        ]

        retrieval_results = []
        for _topic, query in queries:
            print(f"Query: '{query}'")
            results = self.engine.retrieve(query)
            if results:
                best = results[0]
                print(f" โ Found: [{best.layer}] Q={best.q_score:.4f}")
                print(f" {best.content[:60]}...")
                retrieval_results.append({
                    'query': query,
                    'found': True,
                    'best_match': {
                        'content': best.content,
                        'q_score': best.q_score,
                        'layer': best.layer
                    }
                })
            else:
                print(" โ No results")
                retrieval_results.append({
                    'query': query,
                    'found': False
                })
            print()

        self.test_results['metrics']['retrieval'] = retrieval_results

        found_count = sum(1 for r in retrieval_results if r['found'])
        accuracy = found_count / len(queries) * 100
        print(f"Retrieval Accuracy: {found_count}/{len(queries)} = {accuracy:.1f}%")

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of *values*, or 0.0 when it is empty."""
        return sum(values) / len(values) if values else 0.0

    def _retrieval_hit_rate(self):
        """Return the fraction of recorded retrieval queries that found a result.

        Returns 0.0 when no retrieval results have been recorded, so the
        assessment can run even if the retrieval phase was skipped.
        """
        recorded = self.test_results['metrics'].get('retrieval', [])
        if not recorded:
            return 0.0
        return len([r for r in recorded if r['found']]) / len(recorded)

    def assess_quality(self) -> None:
        """Print and store summary statistics plus the pass/fail criteria.

        Writes ``test_results['assessment']``. An empty engine or a missing
        retrieval phase produces zeroed statistics and failing criteria
        instead of raising.
        """
        print("="*60)
        print("QUALITY ASSESSMENT")
        print("="*60)

        realizations = list(self.engine.index.values())
        stats = self.engine.stats
        layer_counts = stats['layer_distribution']

        # 1. Q-Score Distribution
        q_scores = sorted((r.q_score for r in realizations), reverse=True)
        q_max = max(q_scores, default=0.0)
        q_min = min(q_scores, default=0.0)
        q_mean = self._mean(q_scores)
        # statistics.median averages the two middle values for an even count;
        # indexing at len // 2 would report a single off-centre element.
        q_median = statistics.median(q_scores) if q_scores else 0.0
        print("\n1. Q-Score Distribution:")
        print(f" Highest: {q_max:.4f}")
        print(f" Lowest: {q_min:.4f}")
        print(f" Mean: {q_mean:.4f}")
        print(f" Median: {q_median:.4f}")

        # 2. Layer Quality
        print("\n2. Layer Quality:")
        layer_labels = (
            (0, "Layer 0 (Universal)"),
            (1, "Layer 1 (Domain)"),
            (2, "Layer 2 (Pattern)"),
            (3, "Layer 3 (Situation)"),
            ('N', "Layer N (Ephemeral)"),
        )
        for key, label in layer_labels:
            print(f" {label}: {layer_counts[key]} realizations")

        # 3. Coherence Analysis
        print("\n3. Coherence Analysis:")
        avg_coherence = self._mean([r.features.coherence for r in realizations])
        print(f" Average Coherence: {avg_coherence:.4f}")
        print(f" โ {avg_coherence*100:.1f}% consistency with prior layers")

        # 4. Generativity Analysis
        print("\n4. Generativity Analysis:")
        total_children = sum(len(r.children) for r in realizations)
        total_parents = sum(len(r.parents) for r in realizations)
        avg_children = total_children / len(realizations) if realizations else 0.0
        print(f" Total children spawned: {total_children}")
        print(f" Average children per realization: {avg_children:.2f}")
        print(f" Total parent links: {total_parents}")

        # 5. Feature Analysis
        print("\n5. Feature Averages:")
        features = ['grounding', 'certainty', 'structure', 'applicability', 'coherence', 'generativity']
        for feature in features:
            avg = self._mean([getattr(r.features, feature) for r in realizations])
            print(f" {feature.capitalize():15s}: {avg:.4f}")

        # 6. System Performance
        # Computed once and reused for both the printout and the stored record.
        layers_used = sum(1 for count in layer_counts.values() if count > 0)
        graph_depth = self.calculate_max_depth()
        print("\n6. System Performance:")
        print(f" Total realizations: {len(realizations)}")
        print(f" Layers used: {layers_used}/5")
        print(f" Graph depth: {graph_depth} levels")
        print(f" Avg Q-score: {stats['avg_q_score']:.4f}")

        # Store assessment
        self.test_results['assessment'] = {
            'q_score_stats': {
                'max': q_max,
                'min': q_min,
                'mean': q_mean,
                'median': q_median
            },
            'coherence': {
                'avg_coherence': avg_coherence,
                'consistency_pct': avg_coherence * 100
            },
            'generativity': {
                'total_children': total_children,
                'avg_children_per_realization': avg_children,
                'total_parent_links': total_parents
            },
            'system_performance': {
                'total_realizations': len(realizations),
                'layers_used': layers_used,
                'graph_depth': graph_depth,
                'avg_q_score': stats['avg_q_score']
            }
        }

        # 7. Pass/Fail Criteria
        print("\n7. Test Criteria:")
        tests = [
            # An empty run must not pass vacuously.
            ("All Q-scores >= 0.70", bool(q_scores) and all(q >= 0.70 for q in q_scores)),
            ("Average Q-score >= 0.85", stats['avg_q_score'] >= 0.85),
            # NOTE(review): only Layer 1 is counted here although the label
            # says "Layer 1+"; confirm whether Layer 0 should also satisfy it.
            ("At least 1 Layer 1+ realization", layer_counts[1] >= 1),
            ("Retrieval accuracy >= 80%", self._retrieval_hit_rate() >= 0.80),
            ("Average coherence >= 0.85", avg_coherence >= 0.85)
        ]

        for test_name, passed in tests:
            status = "โ PASS" if passed else "โ FAIL"
            print(f" {status}: {test_name}")
        all_passed = all(passed for _name, passed in tests)

        print("\n" + "="*60)
        if all_passed:
            print("โ ALL TESTS PASSED")
        else:
            print("โ ๏ธ SOME TESTS FAILED")
        print("="*60)

        self.test_results['assessment']['all_tests_passed'] = all_passed
        self.test_results['assessment']['individual_tests'] = [
            {'name': name, 'passed': passed} for name, passed in tests
        ]

    def calculate_max_depth(self) -> int:
        """Return the number of nodes on the longest root-to-leaf chain.

        Roots are realizations without parents. A node that reappears on the
        path currently being explored contributes 0, so a cyclic graph cannot
        recurse forever. Returns 0 when there are no roots.
        """
        index = self.engine.index
        # Ids on the path from the current root to the node being visited.
        # One shared set with add/discard replaces a fresh copy per child.
        on_path = set()

        def depth_from(node_id):
            if node_id in on_path:
                return 0
            node = index.get(node_id)
            if not node or not node.children:
                return 1
            on_path.add(node_id)
            try:
                return 1 + max(depth_from(child_id) for child_id in node.children)
            finally:
                on_path.discard(node_id)

        roots = [r.id for r in index.values() if not r.parents]
        if not roots:
            return 0
        return max(depth_from(root) for root in roots)

    def export_results(self, output_dir='layers/layer_3_optimization') -> None:
        """Write the test results and the engine state as JSON files.

        Args:
            output_dir: Directory that receives ``test_case_results.json``
                and ``test_case_engine_state.json``. It is created when
                missing. Use ``'.'`` for the current directory.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Paths are joined with '/' so the default printed path stays
        # identical to the previously hard-coded one.
        output_path = f'{output_dir}/test_case_results.json'
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(self.test_results, f, indent=2)
        print(f"โ Results exported to {output_path}")
        print(f" Total size: {len(json.dumps(self.test_results))} bytes")

        # Also export the full engine state
        engine_state = self.engine.export_state()
        engine_path = f'{output_dir}/test_case_engine_state.json'
        with open(engine_path, 'w', encoding='utf-8') as f:
            json.dump(engine_state, f, indent=2)
        print(f"โ Engine state exported to {engine_path}")
if __name__ == "__main__":
    # Script entry point: build the case study and run all of its phases.
    TestCaseStudy().run_test()