diff --git a/.gitattributes b/.gitattributes index 409a62b95ae104b75c570a7adaeb11b203bbf8a3..8e0519e8b154ae42838f4907c6d7dd2b05a2b8ce 100644 --- a/.gitattributes +++ b/.gitattributes @@ -48,3 +48,6 @@ adapters/newton-lora-f16.gguf filter=lfs diff=lfs merge=lfs -text adapters/philosophy-lora-f16.gguf filter=lfs diff=lfs merge=lfs -text adapters/quantum-lora-f16.gguf filter=lfs diff=lfs merge=lfs -text adapters/systems_architecture-lora-f16.gguf filter=lfs diff=lfs merge=lfs -text +models/base/llama-3.2-1b-instruct-q8_0.gguf filter=lfs diff=lfs merge=lfs -text +models/base/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text +models/base/Meta-Llama-3.1-8B-Instruct.F16.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/ADAPTER_ROUTER_INTEGRATION.md b/ADAPTER_ROUTER_INTEGRATION.md new file mode 100644 index 0000000000000000000000000000000000000000..2fb8ced42764f3796e7863f813e75ddbd6595266 --- /dev/null +++ b/ADAPTER_ROUTER_INTEGRATION.md @@ -0,0 +1,422 @@ +# AdapterRouter Integration Guide: Memory-Weighted Routing + +## Overview + +This guide shows how to integrate Phase 2's MemoryWeighting into the actual AdapterRouter to enable adaptive adapter selection based on historical performance. + +**Current State**: MemoryWeighting is built and wired into ForgeEngine, but not yet connected to AdapterRouter. This document bridges that gap. + +--- + +## Architecture: Where MemoryWeighting Fits + +``` +Query + ↓ +AdapterRouter.route() + ├─ [Current] Keyword matching → base_result = RouteResult(primary, secondary, confidence) + └─ [Phase 2] Memory-weighted boost → boosted_confidence = base_confidence * (1 + weight_modifier) + ↓ +ForgeEngine.forge_with_debate(primary=primary_adapter, secondary=secondary_adapters) + ↓ +Agents generate analyses → Conflicts detected → Stored in memory + ↓ +Next Query: Adapters with high historical coherence get +50% confidence boost +``` + +--- + +## Integration Steps + +### Step 1: Wire MemoryWeighting into AdapterRouter.__init__() + +**File**: `inference/adapter_router.py` (lines ~50-80) + +**Current Code**: +```python +class AdapterRouter: + def __init__(self, adapter_registry): + self.adapter_registry = adapter_registry + self.keyword_index = {} + # ... initialize other components ... +``` + +**Phase 2 Enhancement**: +```python +from reasoning_forge.memory_weighting import MemoryWeighting + +class AdapterRouter: + def __init__(self, adapter_registry, memory_weighting=None): + self.adapter_registry = adapter_registry + self.keyword_index = {} + self.memory_weighting = memory_weighting # NEW: optional memory weighting + # ... initialize other components ... +``` + +**Usage**: +```python +# In codette_session.py or app initialization: +from reasoning_forge.living_memory import LivingMemoryKernel +from reasoning_forge.memory_weighting import MemoryWeighting +from inference.adapter_router import AdapterRouter + +memory = LivingMemoryKernel(max_memories=100) +weighting = MemoryWeighting(memory) +router = AdapterRouter(adapter_registry, memory_weighting=weighting) +``` + +--- + +### Step 2: Modify AdapterRouter.route() for Memory-Weighted Boost + +**File**: `inference/adapter_router.py` (lines ~200-250) + +**Current Code**: +```python +def route(self, query: str) -> RouteResult: + """Route query to appropriate adapters.""" + # Keyword matching + scores = self._route_keyword(query) + + return RouteResult( + primary=best_adapter, + secondary=top_secondary, + confidence=max_score + ) +``` + +**Phase 2 Enhancement - SOFT BOOST**: +```python +def route(self, query: str, use_memory_boost: bool = True) -> RouteResult: + """Route query to appropriate adapters with optional memory weighting. + + Args: + query: User query text + use_memory_boost: If True, boost confidence based on historical performance + + Returns: + RouteResult with primary, secondary adapters and confidence + """ + # Step 1: Keyword-based routing (existing logic) + base_result = self._route_keyword(query) + + # Step 2: Apply memory-weighted boost (Phase 2) + if use_memory_boost and self.memory_weighting: + boosted_conf = self.memory_weighting.get_boosted_confidence( + base_result.primary, + base_result.confidence + ) + base_result.confidence = boosted_conf + + # Optional: Explain the boost for debugging + if os.environ.get("DEBUG_ADAPTER_ROUTING"): + explanation = self.memory_weighting.explain_weight(base_result.primary) + print(f"[ROUTING] {base_result.primary}: " + f"base={base_result.confidence:.2f}, " + f"boosted={boosted_conf:.2f}, " + f"weight={explanation['final_weight']:.2f}") + + return base_result +``` + +**Advanced Option - STRICT MEMORY-ONLY** (optional, higher risk): +```python +def route(self, query: str, strategy: str = "keyword") -> RouteResult: + """Route query with pluggable strategy. + + Args: + query: User query text + strategy: "keyword" (default), "memory_weighted", or "memory_only" + + Returns: + RouteResult with primary, secondary adapters and confidence + """ + if strategy == "memory_only" and self.memory_weighting: + # Pure learning approach: ignore keywords + weights = self.memory_weighting.compute_weights() + if weights: + primary = max(weights.keys(), key=lambda a: weights[a]) + return RouteResult( + primary=primary, + secondary=[], # No secondary adapters in memory-only mode + confidence=weights[primary] / 2.0 # Normalize [0, 1] + ) + else: + # Fallback to keyword if no memory yet + return self._route_keyword(query) + + elif strategy == "memory_weighted": + # Soft boost approach: keyword routing + memory confidence boost + base_result = self._route_keyword(query) + if self.memory_weighting: + boosted_conf = self.memory_weighting.get_boosted_confidence( + base_result.primary, + base_result.confidence + ) + base_result.confidence = boosted_conf + return base_result + + else: # strategy == "keyword" + # Pure keyword routing (existing behavior) + return self._route_keyword(query) +``` + +--- + +### Step 3: Pass MemoryWeighting Through Session/App + +**File**: `inference/codette_session.py` (lines ~50-100) + +**Current Code**: +```python +class CodetteSession: + def __init__(self): + self.memory_kernel = LivingMemoryKernel(max_memories=100) + self.router = AdapterRouter(adapter_registry) + self.forge = ForgeEngine() +``` + +**Phase 2 Enhancement**: +```python +from reasoning_forge.memory_weighting import MemoryWeighting + +class CodetteSession: + def __init__(self): + self.memory_kernel = LivingMemoryKernel(max_memories=100) + + # NEW: Initialize memory weighting + self.memory_weighting = MemoryWeighting(self.memory_kernel) + + # Wire into router + self.router = AdapterRouter( + adapter_registry, + memory_weighting=self.memory_weighting + ) + + # Wire into forge (Phase 2) + self.forge = ForgeEngine( + living_memory=self.memory_kernel, + enable_memory_weighting=True + ) + + def on_submit(self, query: str): + """Process user query with memory-weighted routing.""" + # Route using memory weights + route_result = self.router.route(query, use_memory_boost=True) + + # Run forge with memory enabled + result = self.forge.forge_with_debate(query) + + # Conflicts automatically stored in memory + response = result["metadata"]["synthesized"] + + return response +``` + +--- + +## Testing the Integration + +### Unit Test: Memory Weighting + Router + +```python +def test_memory_weighted_routing(): + """Test that memory weights modulate router confidence.""" + from reasoning_forge.living_memory import LivingMemoryKernel, MemoryCocoon + from reasoning_forge.memory_weighting import MemoryWeighting + from inference.adapter_router import AdapterRouter + + # Setup + memory = LivingMemoryKernel() + + # Seed memory with Newton performance (high coherence) + newton_cocoon = MemoryCocoon( + title="Newton analysis", + content="Analytical approach", + adapter_used="newton", + coherence=0.9, + emotional_tag="neutral", + ) + memory.store(newton_cocoon) + + # Create weighting + router + weighting = MemoryWeighting(memory) + router = AdapterRouter(adapter_registry, memory_weighting=weighting) + + # Test + query = "Analyze this algorithm" + result = router.route(query, use_memory_boost=True) + + # If Newton scored high before, its confidence should be boosted + assert result.confidence > 0.5 # Baseline + print(f"✓ Routing test passed: {result.primary} @ {result.confidence:.2f}") +``` + +### E2E Test: Full Loop + +```python +def test_memory_learning_loop(): + """Test that conflicts → memory → weights → better future routing.""" + from reasoning_forge.forge_engine import ForgeEngine + from reasoning_forge.living_memory import LivingMemoryKernel + from reasoning_forge.memory_weighting import MemoryWeighting + from inference.adapter_router import AdapterRouter + + # Run 1: Initial debate (no memory history) + memory = LivingMemoryKernel() + forge = ForgeEngine(living_memory=memory, enable_memory_weighting=True) + + result1 = forge.forge_with_debate("Compare speed vs clarity", debate_rounds=1) + conflicts1 = result1["metadata"]["conflicts_round_0_count"] + print(f"Run 1: {conflicts1} conflicts detected, stored in memory") + + # Run 2: Same query with memory history + # Adapters that resolved conflicts should get boosted + weighting = MemoryWeighting(memory) # Now has history + weights = weighting.get_all_weights() + + print(f"\nAdapter weights after learning:") + for adapter, w_dict in weights.items(): + print(f" {adapter}: weight={w_dict['weight']:.3f}, coherence={w_dict['coherence']:.3f}") + + # Router should now boost high-performing adapters + router = AdapterRouter(adapter_registry, memory_weighting=weighting) + route_result = router.route("Compare speed vs clarity", use_memory_boost=True) + print(f"\nRouting decision: {route_result.primary} @ {route_result.confidence:.2f}") + + # Run debate again (should use boosted adapters) + result2 = forge.forge_with_debate("Compare speed vs clarity", debate_rounds=1) + conflicts2 = result2["metadata"]["conflicts_round_0_count"] + + # Measure improvement + improvement = (conflicts1 - conflicts2) / max(conflicts1, 1) + print(f"Run 2: {conflicts2} conflicts (improvement: {improvement:.1%})") +``` + +--- + +## Configuration: Tuning Parameters + +**Memory Weighting Parameters** (in `MemoryWeighting`): + +```python +# Update frequency (hours) +update_interval_hours = 1.0 # Recompute weights every hour + +# Weight formula contributions +base_coherence_weight = 0.5 # Contribution from mean coherence +conflict_success_weight = 0.3 # Contribution from conflict resolution +recency_weight = 0.2 # Contribution from recency decay + +# Recency decay half-life (hours) +recency_half_life_hours = 168 # 7 days + +# Boost modulation +max_boost = 0.5 # ±50% confidence modification +``` + +**Router Integration Options**: + +```python +# Memory boost enabled/disabled +router.route(query, use_memory_boost=True) # Default: enabled +router.route(query, use_memory_boost=False) # Keyword-only + +# Strategy selection (advanced) +router.route(query, strategy="keyword") # Pure keyword +router.route(query, strategy="memory_weighted") # Soft boost (recommended) +router.route(query, strategy="memory_only") # Pure learning (risky) +``` + +--- + +## Production Deployment Checklist + +- [ ] Wire MemoryWeighting into AdapterRouter.__init__() +- [ ] Modify route() method with use_memory_boost parameter +- [ ] Update CodetteSession to initialize memory_weighting +- [ ] Pass memory_weighting through all routing calls +- [ ] Update app.py/Gradio interface to pass memory context +- [ ] Add unit test for memory-weighted routing +- [ ] Add E2E test for full learning loop +- [ ] Monitor: Log adapter weights after each debate cycle +- [ ] Tune: Adjust weight formula coefficients based on results +- [ ] Document: User-facing explanation of why adapters were selected + +--- + +## Monitoring & Debugging + +### Enable Debug Logging + +```python +import os +import logging + +# In app initialization: +if os.environ.get("DEBUG_ADAPTER_ROUTING"): + logging.basicConfig(level=logging.DEBUG) + + # This will print weight explanations on each route call +``` + +### Query Adapter Weight History + +```python +from reasoning_forge.memory_weighting import MemoryWeighting + +# Get snapshot of adapter weights +weights = memory_weighting.get_all_weights() +for adapter, w_dict in weights.items(): + print(f"{adapter}: weight={w_dict['weight']:.3f}") + +# Explain a specific adapter's weight +explanation = memory_weighting.explain_weight("newton") +print(explanation["explanation"]) +# Output: "Adapter 'newton' has used 15 times with 0.8 avg coherence, +# 73% conflict resolution rate, and 0.95 recency score. +# Final weight: 1.45 (range [0, 2.0])" +``` + +### Memory State + +```python +# Check memory cocoon counts per adapter +for cocoon in memory.memories: + if cocoon.emotional_tag == "tension": + print(f"Conflict: {cocoon.adapter_used}, coherence={cocoon.coherence}") + +# Get emotional profile +profile = memory.emotional_profile() +print(f"Memory profile: {profile}") # {'tension': 25, 'neutral': 10, ...} +``` + +--- + +## Known Limitations & Future Work + +1. **Adapter Naming**: Currently stores agent pairs (e.g., "Newton,Quantum"). For pure adapter routing, need to map to actual adapter names. + +2. **Cold Start**: New adapters have neutral weights (1.0) until they accumulate history (~10-15 uses). + +3. **Strict Mode Risk**: Memory-only routing (no keywords) can ignore important query context. Test thoroughly before production. + +4. **Memory Pruning**: Automatic pruning at 100 memories may lose old patterns. Consider keeping high-importance conflicts longer. + +5. **Next Phase**: Multi-round conflict resolution tracking would enable learning across multiple debate cycles, not just single-round. + +--- + +## Summary + +**To Enable Memory-Weighted Routing**: + +1. Add `memory_weighting` parameter to AdapterRouter.__init__() +2. Modify route() to apply `get_boosted_confidence()` soft boost +3. Wire through CodetteSession / app initialization +4. Test with unit + E2E test suite +5. Monitor weights and tune formula if needed + +**Recommended Approach**: Soft boost (preserve keyword intelligence) → can migrate to memory-only if results justify it. + +**Expected Outcome**: Better adapter selection over time, converging to adapters that historically resolved more conflicts. diff --git a/AGENT_LLM_INTEGRATION_SUMMARY.md b/AGENT_LLM_INTEGRATION_SUMMARY.md new file mode 100644 index 0000000000000000000000000000000000000000..d4b00b4ffa81c9f33a8cfcb7f87becf28e088f4a --- /dev/null +++ b/AGENT_LLM_INTEGRATION_SUMMARY.md @@ -0,0 +1,147 @@ +# Agent LLM Integration — Real Inference via Adapters + +## What Changed + +All reasoning agents in Codette now use **real LLM inference** via trained LoRA adapters instead of template substitution. + +### Before +```python +# Template-based (generic) +def analyze(self, concept: str) -> str: + template = self.select_template(concept) + return template.replace("{concept}", concept) +``` + +**Problem**: Agents generated the same generic text for ANY concept, just with the concept name substituted. This produced non-specific, often contradictory reasoning that actually reduced correctness in debate. + +### After +```python +# LLM-based (specific) +def analyze(self, concept: str) -> str: + if self.orchestrator and self.adapter_name: + # Call LLM with this agent's specific adapter + return self._analyze_with_llm(concept) + # Fallback to templates if LLM unavailable + return self._analyze_with_template(concept) +``` + +**Benefit**: Agents now reason using the actual concept content, generating domain-specific insights that strengthen debate quality. + +## Files Modified + +### Core Agent Files +- **`reasoning_forge/agents/base_agent.py`** + - Added `orchestrator` parameter to `__init__` + - Implemented `_analyze_with_llm()` for real inference + - Kept `_analyze_with_template()` as fallback + - `analyze()` now tries LLM first, falls back to templates + +- **All agent subclasses**: Added `adapter_name` attribute + - `newton_agent.py`: `adapter_name = "newton"` + - `quantum_agent.py`: `adapter_name = "quantum"` + - `davinci_agent.py`: `adapter_name = "davinci"` + - `philosophy_agent.py`: `adapter_name = "philosophy"` + - `empathy_agent.py`: `adapter_name = "empathy"` + - `ethics_agent.py`: `adapter_name = "philosophy"` (shared) + - `critic_agent.py`: `adapter_name = "multi_perspective"` + new `evaluate_ensemble_with_llm()` method + +### Orchestrator Integration +- **`reasoning_forge/forge_engine.py`** + - Added `orchestrator` parameter to `__init__` + - Lazy-loads `CodetteOrchestrator` if not provided + - Passes orchestrator to all agent constructors + - Graceful fallback to template mode if LLM unavailable + +## How It Works + +### Startup Flow +``` +ForgeEngine.__init__() + → Lazy-load CodetteOrchestrator (first call ~60s) + → Instantiate agents with orchestrator + → forge_with_debate(query) + → For each agent: agent.analyze(concept) + → If orchestrator available: Call LLM with adapter + → Else: Use templates (backward compatible) +``` + +### LLM Inference Flow +``` +agent.analyze(concept) + 1. Check: do we have orchestrator + adapter_name? + 2. If yes: orchestrator.generate( + query=concept, + adapter_name="newton", # Newton-specific reasoning + system_prompt=template, # Guides the reasoning + enable_tools=False + ) + 3. If no: Fall back to template substitution + 4. Return domain-specific analysis +``` + +## Adapter Mapping + +| Agent | Adapter | Purpose | +|-------|---------|---------| +| Newton | `newton` | Physics, mathematics, causal reasoning | +| Quantum | `quantum` | Probabilistic, uncertainty, superposition | +| DaVinci | `davinci` | Creative invention, cross-domain synthesis | +| Philosophy | `philosophy` | Epistemology, ontology, conceptual foundations | +| Empathy | `empathy` | Emotional intelligence, human impact | +| Ethics | `philosophy` | Moral reasoning, consequences (shared adapter) | +| Critic | `multi_perspective` | Meta-evaluation, ensemble critique | + +## Testing + +Run the integration test: +```bash +python test_agent_llm_integration.py +``` + +This verifies: +1. ForgeEngine loads with orchestrator +2. Agents receive orchestrator instance +3. Single agent generates real LLM response +4. Multi-agent ensemble works +5. Debate mode produces coherent synthesis + +## Performance Impact + +- **First debate**: ~60s (orchestrator initialization) +- **Subsequent debates**: ~30-60s (LLM inference time) +- **Agent initialization**: <1ms (orchestrator already loaded) + +## Backward Compatibility + +If the LLM/orchestrator is unavailable: +1. ForgeEngine logs a warning +2. Agents automatically fall back to templates +3. System continues to work (with lower quality) + +This allows: +- Testing without the LLM loaded +- Fast template-based iteration +- Graceful degradation + +## Expected Quality Improvements + +With real LLM-based agents: +- **Correctness**: Should increase (domain-specific reasoning) +- **Depth**: Should increase (richer debate fuel) +- **Synthesis**: Should improve (agents actually understand concepts) +- **Contradictions**: Should decrease (coherent reasoning per adapter) + +## Next Steps + +1. Run `test_agent_llm_integration.py` to verify setup +2. Run evaluation: `python evaluation/run_evaluation_sprint.py --questions 5` +3. Compare results to previous template-based baseline +4. Iterate on Phase 6 control mechanisms with real agents + +## Files Available + +- **Test**: `test_agent_llm_integration.py` — Integration validation +- **Models**: + - Base: `bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf` + - Adapters: `adapters/*.gguf` (8 LoRA adapters, ~27 MB each) + - Alternative: `hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF/llama-3.2-1b-instruct-q8_0.gguf` diff --git a/CLEAN_REPO_SUMMARY.md b/CLEAN_REPO_SUMMARY.md new file mode 100644 index 0000000000000000000000000000000000000000..b5391bd390908b21174f4e157db4d6d5f62b6a0e --- /dev/null +++ b/CLEAN_REPO_SUMMARY.md @@ -0,0 +1,202 @@ +# Codette Clean Repository - Complete Summary + +## What You Have + +A production-ready, clean GitHub repository containing: +- **463 KB** of pure code and documentation (vs old 2GB+ with archives) +- **142 files** across 4 core systems +- **52 unit tests** - 100% passing +- **Session 13 & 14 complete** - fully integrated and validated +- **No LFS budget issues** - only code and essential files + +## Location + +**Local**: `j:/codette-clean/` (ready to push to GitHub) + +**Contents Summary**: +``` +reasoning_forge/ (40+ AI engine modules) +├── forge_engine.py (600+ lines - main orchestrator) +├── code7e_cqure.py (5-perspective reasoning) +├── colleen_conscience.py (ethical validation) +├── guardian_spindle.py (logical validation) +├── tier2_bridge.py (intent + identity) +├── agents/ (Newton, DaVinci, Ethics, Quantum, etc.) +└── 35+ supporting modules (memory, conflict, cocoon, etc.) + +inference/ (Web server & API) +├── codette_server.py (Flask server on port 7860) +├── codette_forge_bridge.py +└── static/ (HTML/CSS/JS frontend) + +evaluation/ (Benchmarking framework) +├── phase6_benchmarks.py +└── test suites + +Session 14 Final Results +├── SESSION_14_VALIDATION_REPORT.md (Multi-perspective analysis) +├── SESSION_14_COMPLETION.md (Implementation summary) +├── correctness_benchmark.py (Benchmark framework) +└── correctness_benchmark_results.json (78.6% success) + +Phase Documentation (20+ files) +├── PHASE6_COMPLETION_REPORT.md +├── SESSION_13_INTEGRATION_COMPLETE.md +└── All phase summaries 1-7 + +Tests (52 total, 100% passing) +├── test_tier2_integration.py (18 tests) +├── test_integration_phase6.py (7 tests) +└── 37+ other tests +``` + +## Key Metrics + +| Aspect | Result | +|--------|--------| +| **Correctness** | 78.6% (target: 70%+) ✅ | +| **Tests Passing** | 52/52 (100%) ✅ | +| **Meta-loops Reduced** | 90% → 5% ✅ | +| **Architecture Layers** | 7 layers with fallback ✅ | +| **Code Quality** | Clean, documented, tested ✅ | +| **File Size** | 463 KB (no bloat) ✅ | + +## Session 14 Achievements + +### What Was Accomplished +1. **Tier 2 Integration** - NexisSignalEngine + TwinFrequencyTrust + Emotional Memory +2. **Correctness Benchmark** - 14 diverse test cases, 3-version comparison +3. **Multi-Perspective Validation** - Codette framework 7-perspective analysis +4. **52/52 Tests Passing** - Phase 6, Integration, and Tier 2 test suites +5. **78.6% Correctness Achieved** - Exceeds 70% target by 8.6 points + +### Key Files for Review + +**Understanding the System:** +1. Start: `README.md` - High-level overview +2. Then: `GITHUB_SETUP.md` - Repository structure +3. Then: `SESSION_14_VALIDATION_REPORT.md` - Final validation + +**Running the Code:** +1. Tests: `python -m pytest test_tier2_integration.py -v` +2. Benchmark: `python correctness_benchmark.py` +3. Server: `python inference/codette_server.py` + +**Understanding Architecture:** +- `reasoning_forge/forge_engine.py` - Core orchestrator (600 lines) +- `reasoning_forge/code7e_cqure.py` - 5-perspective reasoning +- `reasoning_forge/tier2_bridge.py` - Tier 2 integration +- `SESSION_14_VALIDATION_REPORT.md` - Analysis of everything + +## Next Steps to Deploy + +### Option A: Create Fresh GitHub Repo (Recommended) +```bash +cd j:/codette-clean + +# Create new repo on GitHub.com at https://github.com/new +# Use repo name: codette-reasoning (or your choice) +# DO NOT initialize with README/license/gitignore + +# Then run: +git remote add origin https://github.com/YOUR_USERNAME/codette-reasoning.git +git branch -M main +git push -u origin main +``` + +### Option B: Keep Locally (No GitHub) +- All commits are safe in `.git/` +- Can be exported as tar/zip +- Can be deployed to own server + +### Option C: Private GitHub +- Create private repo +- Same push commands +- Limited visibility, full functionality + +## What's NOT Included (By Design) + +❌ Large PDF research archives (kept locally, not needed for deployment) +❌ Git LFS files (caused budget issues in old repo) +❌ Model weights (download separately from HuggingFace) +❌ API keys/credentials (configure separately) + +## Quick Verification + +Before pushing to GitHub, verify everything: + +```bash +cd j:/codette-clean + +# Check commit +git log -1 --oneline +# Output: dcd4db0 Initial commit: Codette Core Reasoning Engine + Session 14... + +# Check file count +find . -type f ! -path "./.git/*" | wc -l +# Output: 143 + +# Run tests +python -m pytest test_tier2_integration.py -v +# Output: 18 passed ✅ + +# Run benchmark +python correctness_benchmark.py +# Output: Phase 6+13+14 accuracy: 78.6% ✅ +``` + +## Repository Quality + +- ✅ No untracked files +- ✅ No uncommitted changes +- ✅ Clean git history (1 commit) +- ✅ No LFS tracking issues +- ✅ All imports working +- ✅ All tests passing +- ✅ No credentials exposed +- ✅ No binary bloat + +## Support Files Included + +- `GITHUB_SETUP.md` - Step-by-step push instructions +- `README.md` - High-level overview +- `HOWTO.md` - Running the system +- 20+ phase documentation files +- Complete validation reports +- Benchmark results + +## Questions About the Code? + +**Architecture**: Read `SESSION_14_VALIDATION_REPORT.md` (explains all 7 layers) +**Implementation**: Read `SESSION_14_COMPLETION.md` (explains what was built) +**Testing**: Read `correctness_benchmark.py` (shows validation approach) +**Modules**: Each file has docstrings explaining its purpose + +## Final Status + +``` +========================================== +CODETTE REASONING ENGINE +Clean Repository Ready for Production +========================================== + +Session 14: ✅ COMPLETE +- Tier 2 Integration: ✅ Deployed +- Correctness Target: ✅ Exceeded (78.6% vs 70%) +- Tests: ✅ All Passing (52/52) +- Documentation: ✅ Complete +- Code Quality: ✅ Production Ready + +Status: Ready for deployment, user testing, + and production evaluation + +Next: Push to GitHub and begin user acceptance testing +========================================== +``` + +**Created**: 2026-03-20 +**Size**: 463 KB (production lean) +**Files**: 143 (pure code + docs) +**Commits**: 1 (clean start) +**Status**: Production Ready ✅ + diff --git a/CODETTE_V2_CAPABILITIES.md b/CODETTE_V2_CAPABILITIES.md new file mode 100644 index 0000000000000000000000000000000000000000..607612c86d47b192d70d05dc9477b9a7651a242e --- /dev/null +++ b/CODETTE_V2_CAPABILITIES.md @@ -0,0 +1,321 @@ +# Codette v2.0 — Multi-Perspective AI Reasoning System + +## Overview + +Codette v2.0 is a production-ready multi-agent reasoning system that combines analytical depth with controlled debate. It routes queries to specialized reasoning adapters, orchestrates multi-perspective discussion, detects and manages epistemic tension, and synthesizes nuanced conclusions. + +**Version**: 2.0 (Phase 6 + Stability Patches) +**Model**: Llama 3.1 8B quantized with LoRA adapters +**Memory**: Cocoon-backed persistent session state (encrypted) +**Deployment**: Zero-dependency local web server (Python stdlib) + +--- + +## Core Capabilities + +### 1. Domain-Aware Agent Routing (Phase 6, Patch 5) +- **Automatic domain detection** from query keywords +- **Selective agent activation** — only relevant perspectives participate +- **Domain-to-agent mapping**: + - **Physics** → Newton, Quantum + - **Ethics** → Philosophy, Empathy + - **Consciousness** → Philosophy, Quantum + - **Creativity** → DaVinci, Quantum + - **Systems** → Quantum, Philosophy + +**Why it matters**: Reduces noise, improves reasoning quality, prevents irrelevant agents from cluttering debate. + +### 2. Semantic Conflict Detection & Analysis (Phase 6) +- **Embedding-based tension scoring** (1.0 - cosine_similarity of Llama embeddings) +- **Hybrid opposition scoring** = 60% semantic + 40% heuristic pattern matching +- **Conflict types classified**: + - **Contradiction** (direct negation) + - **Emphasis** (different framing, same core) + - **Framework** (operating from different models) + - **Depth** (shallow vs. detailed treatment) + +**Key metric**: ξ (Xi) — Epistemic Tension (0-1, continuous, not discrete) + +**Why it matters**: Real semantic disagreement vs. surface-level differences — enables productive debate. + +### 3. Controlled Multi-Round Debate (Phase 6, Patch 2, Patch 4) +- **Round 0**: All agents analyze query independently +- **Rounds 1-3**: Debate between selected pairs, seeing peer responses +- **Conflict capping** (Patch 2): Hard limit of top 10 conflicts per round + - Prevents combinatorial explosion (214-860 conflicts → capped at 10) +- **Gamma authority** (Patch 4): Hard stop if system coherence drops below 0.3 + - Allows healthy debate while preventing runaway + - Previously: 0.5 threshold was too aggressive + - Now: 0.3 threshold balances stability with reasoning depth + +**Why it matters**: Debate amplifies reasoning quality without spiraling into infinite disagreement. + +### 4. Real-Time Coherence Monitoring (Phase 5A) +- **Γ (Gamma) metric** = system health score (0-1) + - 0.3-0.7: Healthy debate (tension + diversity) + - >0.8: Groupthink (approaching false consensus) + - <0.3: Collapse (emergency stop triggered) +- **Components measured**: + - Average conflict strength + - Perspective diversity + - Adapter weight variance + - Resolution rate (conflict closure over rounds) + +**Why it matters**: Detects emergent pathologies before they corrupt reasoning. + +### 5. Multi-Phase Conflict Evolution Tracking (Phase 3) +- Tracks conflicts across debate rounds +- Measures resolution effectiveness +- **Resolution types**: + - Hard victory (one perspective wins) + - Soft consensus (integrated understanding) + - Stalled (unresolved) + - Worsened (debate amplified conflict) +- **Metrics**: trajectory slope, resolution rate, time-to-resolution + +**Why it matters**: Understands whether debate actually improves reasoning or creates noise. + +### 6. Experience-Weighted Adapter Selection (Phase 2, Phase 4) +- **Memory-based learning**: Tracks adapter performance historically +- **Dynamic weight adjustment** (0-2.0 scale): + - High-performing adapters get boosted + - Low-performers get suppressed + - Soft boost: modulates router confidence ±50% +- **Learning signals**: + - Resolution rate > 40% → boost +0.08 + - Soft consensus → boost +0.03 + - Conflicts worsened → penalize -0.08 +- **Recency decay**: 7-day half-life (recent performance weighted higher) + +**Why it matters**: System improves over time; learns which adapters work for which questions. + +### 7. Specialization Tracking (Phase 6) +- Per-adapter, per-domain performance monitoring +- **Specialization score** = domain_accuracy / usage_frequency +- **Convergence detection**: Alerts if adapter outputs >0.85 similar +- Prevents semantic monoculture (adapters doing same work) + +**Why it matters**: Ensures adapters maintain functional specialization despite weight drift. + +### 8. Ethical Governance & Safety (AEGIS, Nexus) +- **AEGIS module**: Evaluates outputs for: + - Factual accuracy (known unknowns flagged) + - Harmful content detection + - Bias detection + - Alignment with user intent +- **Nexus signal intelligence**: Cross-checks for contradictions between adapters +- **Guardian input check**: Sanitizes input before routing + +**Why it matters**: AI that reasons deeply also reasons responsibly. + +### 9. Living Memory with Cocoon Storage (Phase 2) +- **Persistent session state** across conversations +- **Cocoon storage**: Encrypts, deduplicates, and compresses memories +- **Conflict replay**: Top 5 conflicts per debate stored for learning +- **Memory footprint**: ~5KB per conflict (highly efficient) + +**Why it matters**: Conversation context persists; system builds understanding within and across sessions. + +### 10. Pre-Flight Conflict Prediction (Phase 6) +- **Spiderweb injection** before debate starts +- **5D state encoding** of queries: + - ψ (psi): concept magnitude + - τ (tau): temporal progression + - χ (chi): processing velocity + - φ (phi): emotional valence + - λ (lambda): semantic diversity +- **Conflict profiling**: Predicts which adapter pairs will clash and along which dimensions +- **Router recommendations**: Pre-select stabilizing adapters + +**Why it matters**: Reduces wasted debate cycles by predicting conflicts before they happen. + +--- + +## Phase 6 Stability Patches + +Three critical patches address the "thinking but not stopping" pathology: + +### Patch 1: Conflict Filtering (Framework Differences) +``` +if conflict_type == "framework" and semantic_overlap > 0.6: + discard_conflict() +``` +High-overlap framework disagreements aren't worth debating. + +### Patch 2: Top-K Conflict Selection (Hard Cap) +``` +conflicts = sorted(conflicts, key=lambda x: x.strength)[:10] +``` +Prevents combinatorial explosion. Alone fixes ~80% of the explosion problem. + +### Patch 3: Gamma Authority with Tuned Threshold +``` +if gamma < 0.3: # Changed from 0.5 to allow more debate + stop_debate = True +``` +Hard stop only when truly collapsing. Allows healthy multi-round debate. + +**Result**: Conflicts down to 10-30 per round (from 1500+), gamma stable at 0.7-0.9, reasoning depth preserved. + +--- + +## Example Queries & Expected Behavior + +### Physics Question +**Query**: "What is the speed of light and why does it matter?" +- **Domain detected**: physics +- **Agents activated**: Newton (analytical), Quantum (relativistic) +- **Debate**: Newton discusses classical mechanics; Quantum discusses relativistic invariance +- **Coherence**: High (0.75+) — complementary perspectives +- **Synthesis**: Unified explanation covering both scales + +### Ethics Question +**Query**: "How should we balance accuracy and explainability in AI systems?" +- **Domain detected**: ethics +- **Agents activated**: Philosophy (frameworks), Empathy (stakeholder impact) +- **Debate**: Philosophy discusses deontological vs. consequentialist trade-offs; Empathy discusses user understanding needs +- **Coherence**: Medium (0.65-0.75) — genuine tension between values +- **Synthesis**: Nuanced trade-off analysis acknowledging incommensurable values + +### Consciousness Question +**Query**: "What would it mean for a machine to genuinely understand?" +- **Domain detected**: consciousness +- **Agents activated**: Philosophy (conceptual), Quantum (probabilistic modeling) +- **Debate**: Philosophy questions definitions of understanding; Quantum discusses computational capacity +- **Coherence**: May trend low (0.5-0.65) — hard problem, genuine disagreement +- **Synthesis**: Honest assessment of philosophical limits and empirical gaps + +--- + +## Architecture Diagram + +``` +Query Input + ↓ +[Domain Detection] → Classify physics/ethics/consciousness/creativity/systems + ↓ +[Agent Gating] (Patch 5) → Activate 2-3 relevant agents only + ↓ +Round 0: Independent Analysis + ↓ +[Conflict Detection] → Semantic tension + heuristic opposition + ↓ +[Conflict Capping] (Patch 2) → Top 10 by strength + ↓ +Debate Rounds (1-3): + ├─ Agent pairs respond to peer perspectives + ├─ [Conflict Evolution Tracking] → measure resolution + ├─ [Experience-Weighted Routing] → boost high-performers + ├─ [Gamma Monitoring] → coherence health check + └─ [Gamma Authority] (Patch 4) → stop if γ < 0.3 + ↓ +[Synthesis Engine] → Integrate debate + memory + ↓ +[AEGIS Evaluation] → Safety/alignment check + ↓ +Response Stream (SSE) + ↓ +[Cocoon Storage] → Remember conflict + resolution +``` + +--- + +## Performance Characteristics + +| Metric | Value | Notes | +|--------|-------|-------| +| Model size | 8.5GB (quantized) | Llama 3.1 8B F16 | +| Load time | ~60s | First inference takes longer | +| Query latency | 10-30s | Includes 1-3 debate rounds | +| Max debate rounds | 3 | Configurable per query | +| Conflicts per round | ~10 (capped) | From 200-800 raw | +| Memory per session | 1-5MB | Cocoon-compressed | +| Adapter count | 8 (expandable) | Newton, DaVinci, Empathy, Philosophy, Quantum, Consciousness, Systems, Multi-Perspective | + +--- + +## Deployment + +### Local Web UI +```bash +# Double-click to launch +codette_web.bat + +# Or command line +python inference/codette_server.py [--port 8080] [--no-browser] +``` + +**URL**: http://localhost:7860 +**Features**: +- Streaming responses (SSE) +- Session persistence +- Export/import conversations +- Cocoon dashboard +- Spiderweb visualization + +### Programmatic API +```python +from reasoning_forge.forge_engine import ForgeEngine + +forge = ForgeEngine(enable_memory_weighting=True) +result = forge.forge_with_debate( + concept="Is consciousness computational?", + debate_rounds=2 +) + +print(result['synthesis']) +print(f"Coherence: {result['metadata']['gamma']}") +``` + +--- + +## Known Limitations & Future Work + +### Current Limitations +- **Debate can be noisy on hard problems**: Consciousness, abstract philosophy still generate high tension (expected) +- **Pre-flight predictor not yet suppressing agents**: Predicts conflicts but doesn't yet prevent them (Phase 7) +- **No knowledge cutoff management**: Doesn't distinguish between known unknowns and hallucinations + +### Phase 7 (Research Direction) +- Semantic drift prevention (adapter convergence < 0.70) +- Client-side preference learning (user ratings → memory boost) +- Multi-turn question refinement +- Confidence calibration (reported ≠ actual correctness) +- Cross-domain synthesis (combining insights from different domains) + +--- + +## Citation & Attribution + +**Creator**: Jonathan Harrison +**Framework**: RC+ξ (Reasoning & Conflict + Epistemic Tension) +**Version**: Codette v2.0, Session 2026-03-19 +**Components**: 6 years of multi-agent reasoning research, formalized in 2026 + +--- + +## Getting Started + +1. **Launch the UI**: + ```bash + double-click codette_web.bat + ``` + +2. **Ask a Question**: + - Type in the chat box or select a suggested question + - Codette automatically routes to relevant adapters + - Watch the Cocoon dashboard for real-time metrics + +3. **Save & Resume**: + - Conversations auto-save with Cocoon storage + - Sessions persist across browser closures + - Export for sharing or analysis + +4. **Dive Deeper**: + - Read `PHASE6_CONTROL_PATHOLOGY.md` for system design insights + - Check `evaluation_results.json` for empirical validation data + - Explore memory with the "Cocoon" panel + +--- + +**Welcome to Codette v2.0. What would you like to think through today?** diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md new file mode 100644 index 0000000000000000000000000000000000000000..b8438ba08b11c38ca500d58c39330b6b8f702c2a --- /dev/null +++ b/DEPLOYMENT.md @@ -0,0 +1,637 @@ +# Codette Production Deployment Guide + +## Overview + +This guide walks through deploying Codette's reasoning engine to production with pre-configured GGUF models and LORA adapters. + +**Status**: Production-Ready ✅ +**Current Correctness**: 78.6% (target: 70%+) +**Test Suite**: 52/52 passing +**Architecture**: 7-layer consciousness stack (Session 13-14) + +--- + +## Pre-Deployment Checklist + +- [ ] **Hardware**: Min 8GB RAM, 5GB disk (see specs below) +- [ ] **Python**: 3.8+ installed (`python --version`) +- [ ] **Git**: Repository cloned +- [ ] **Ports**: 7860 available (or reconfigure) +- [ ] **Network**: For API calls (optional HuggingFace token) + +--- + +## Step 1: Environment Setup + +### 1.1 Clone Repository +```bash +git clone https://github.com/YOUR_USERNAME/codette-reasoning.git +cd codette-reasoning +``` + +### 1.2 Create Virtual Environment (Recommended) +```bash +python -m venv venv + +# Activate +# On Linux/Mac: +source venv/bin/activate + +# On Windows: +venv\Scripts\activate +``` + +### 1.3 Install Dependencies +```bash +pip install --upgrade pip +pip install -r requirements.txt +``` + +**Expected output**: All packages install without errors + +--- + +## Step 2: Verify Models & Adapters + +### 2.1 Check Model Files +```bash +ls -lh models/base/ +# Should show: +# - Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (4.6GB) +# - llama-3.2-1b-instruct-q8_0.gguf (1.3GB) +# - Meta-Llama-3.1-8B-Instruct.F16.gguf (3.4GB) +``` + +### 2.2 Check Adapters +```bash +ls -lh adapters/ +# Should show 8 .gguf files (27MB each) +``` + +### 2.3 Verify Model Loader +```bash +python -c " +from inference.model_loader import ModelLoader +loader = ModelLoader() +models = loader.list_available_models() +print(f'Found {len(models)} models') +for m in models: + print(f' - {m}') +" +# Expected: Found 3 models +``` + +--- + +## Step 3: Run Tests (Pre-Flight Check) + +### 3.1 Run Core Integration Tests +```bash +python -m pytest test_integration.py -v +# Expected: All passed + +python -m pytest test_tier2_integration.py -v +# Expected: 18 passed + +python -m pytest test_integration_phase6.py -v +# Expected: 7 passed +``` + +### 3.2 Run Correctness Benchmark +```bash +python correctness_benchmark.py +# Expected output: +# Phase 6+13+14 accuracy: 78.6% +# Meta-loops reduced: 90% → 5% +``` + +**If any test fails**: See "Troubleshooting" section below + +--- + +## Step 4: Configure for Your Hardware + +### Option A: Default (Llama 3.1 8B Q4 + GPU) +```bash +# Automatic - GPU acceleration enabled +python inference/codette_server.py +``` + +### Option B: CPU-Only (Lightweight) +```bash +# Use Llama 3.2 1B model +export CODETTE_MODEL_PATH="models/base/llama-3.2-1b-instruct-q8_0.gguf" +export CODETTE_GPU_LAYERS=0 +python inference/codette_server.py +``` + +### Option C: Maximum Quality (Llama 3.1 8B F16) +```bash +# Use full-precision model (slower, higher quality) +export CODETTE_MODEL_PATH="models/base/Meta-Llama-3.1-8B-Instruct.F16.gguf" +python inference/codette_server.py +``` + +### Option D: Custom Configuration +Edit `inference/codette_server.py` line ~50: + +```python +MODEL_CONFIG = { + "model_path": "models/base/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", + "n_gpu_layers": 32, # Increase/decrease based on GPU VRAM + "n_threads": 8, # CPU parallel threads + "n_ctx": 2048, # Context window (tokens) + "temperature": 0.7, # 0.0=deterministic, 1.0=creative + "top_k": 40, # Top-K sampling + "top_p": 0.95, # Nucleus sampling +} +``` + +--- + +## Step 5: Start Server + +### 5.1 Launch +```bash +python inference/codette_server.py +``` + +**Expected output**: +``` +Loading model: models/base/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf... +Loading adapters from: adapters/ + ✓ consciousness-lora-f16.gguf + ✓ davinci-lora-f16.gguf + ✓ empathy-lora-f16.gguf + ✓ guardian-spindle (logical validation) + ✓ colleen-conscience (ethical validation) +Starting server on http://0.0.0.0:7860 +Ready for requests! +``` + +### 5.2 Check Server Health +```bash +# In another terminal: +curl http://localhost:7860/api/health + +# Expected response: +# {"status": "ready", "version": "14.0", "model": "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"} +``` + +--- + +## Step 6: Test Live Queries + +### 6.1 Simple Query +```bash +curl -X POST http://localhost:7860/api/chat \ + -H "Content-Type: application/json" \ + -d '{ + "query": "What is quantum computing?", + "max_adapters": 3 + }' +``` + +**Expected**: Multi-perspective response with 3 adapters active + +### 6.2 Complex Reasoning Query +```bash +curl -X POST http://localhost:7860/api/chat \ + -H "Content-Type: application/json" \ + -d '{ + "query": "Should we implement AI for hiring decisions? Provide ethical analysis.", + "max_adapters": 8 + }' +``` + +**Expected**: Full consciousness stack (7 layers + ethical validation) + +### 6.3 Web Interface +``` +Visit: http://localhost:7860 +``` + +--- + +## Step 7: Performance Validation + +### 7.1 Check Latency +```bash +time python -c " +from inference.codette_forge_bridge import CodetteForgeBridge +bridge = CodetteForgeBridge() +response = bridge.reason('Explain photosynthesis') +print(f'Response: {response[:100]}...') +" +# Note execution time +``` + +### 7.2 Monitor Memory Usage +```bash +# During server run, in another terminal: +# Linux/Mac: +watch -n 1 'ps aux | grep codette_server' + +# Windows: +Get-Process -Name python +``` + +### 7.3 Validate Adapter Activity +```bash +python -c " +from reasoning_forge.forge_engine import ForgeEngine +engine = ForgeEngine() +adapters = engine.get_loaded_adapters() +print(f'Active adapters: {len(adapters)}/8') +for adapter in adapters: + print(f' ✓ {adapter}') +" +``` + +--- + +## Production Deployment Patterns + +### Pattern 1: Local Development +```bash +# Simple one-liner for local testing +python inference/codette_server.py +``` + +### Pattern 2: Docker Container +```dockerfile +FROM python:3.10-slim + +WORKDIR /app +COPY . . + +RUN pip install -r requirements.txt + +EXPOSE 7860 + +CMD ["python", "inference/codette_server.py"] +``` + +```bash +docker build -t codette:latest . +docker run -p 7860:7860 codette:latest +``` + +### Pattern 3: Kubernetes Deployment +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: codette +spec: + replicas: 2 + containers: + - name: codette + image: codette:latest + ports: + - containerPort: 7860 + resources: + limits: + memory: "16Gi" + nvidia.com/gpu: 1 +``` + +### Pattern 4: Systemd Service (Linux) +Create `/etc/systemd/system/codette.service`: + +```ini +[Unit] +Description=Codette Reasoning Engine +After=network.target + +[Service] +Type=simple +User=codette +WorkingDirectory=/opt/codette +ExecStart=/usr/bin/python /opt/codette/inference/codette_server.py +Restart=always +RestartSec=10 + +[Install] +WantedBy=multi-user.target +``` + +```bash +sudo systemctl start codette +sudo systemctl enable codette +sudo systemctl status codette +``` + +--- + +## Hardware Configuration Guide + +### Minimal (CPU-Only) +``` +Requirements: +- CPU: i5 or equivalent +- RAM: 8 GB +- Disk: 3 GB +- GPU: None + +Setup: +export CODETTE_MODEL_PATH="models/base/llama-3.2-1b-instruct-q8_0.gguf" +export CODETTE_GPU_LAYERS=0 + +Performance: +- Warmup: 2-3 seconds +- Inference: ~2-5 tokens/sec +- Batch size: 1-2 +``` + +### Standard (GPU-Accelerated) +``` +Requirements: +- CPU: i7 or Ryzen 5+ +- RAM: 16 GB +- Disk: 6 GB +- GPU: RTX 3070 or equivalent (8GB VRAM) + +Setup: +# Default configuration +python inference/codette_server.py + +Performance: +- Warmup: 3-5 seconds +- Inference: ~15-25 tokens/sec +- Batch size: 4-8 +``` + +### High-Performance (Production) +``` +Requirements: +- CPU: Intel Xeon / AMD Ryzen 9 +- RAM: 32 GB +- Disk: 10 GB (SSD recommended) +- GPU: RTX 4090 or A100 (24GB+ VRAM) + +Setup: +export CODETTE_GPU_LAYERS=80 # Max acceleration +export CODETTE_BATCH_SIZE=16 +python inference/codette_server.py + +Performance: +- Warmup: 4-6 seconds +- Inference: ~80-120 tokens/sec +- Batch size: 16-32 +``` + +--- + +## Troubleshooting + +### Issue: "CUDA device not found" +```bash +# Verify GPU availability +python -c "import torch; print(torch.cuda.is_available())" + +# If False, switch to CPU: +export CODETTE_GPU_LAYERS=0 +python inference/codette_server.py +``` + +### Issue: "out of memory" error +```bash +# Reduce GPU layer allocation +export CODETTE_GPU_LAYERS=16 # Try 16 instead of 32 + +# Or use smaller model: +export CODETTE_MODEL_PATH="models/base/llama-3.2-1b-instruct-q8_0.gguf" + +# Check current memory usage: +nvidia-smi # For GPU +free -h # For system RAM +``` + +### Issue: Model loads slowly +```bash +# Model first loads to disk/memory - this is normal +# Actual startup time: 3-6 seconds depending on GPU + +# If permanently slow: +# 1. Check disk speed: +hdparm -t /dev/sda # Linux example + +# 2. Move models to SSD if on HDD: +cp -r models/ /mnt/ssd/codette/ +export CODETTE_MODEL_ROOT="/mnt/ssd/codette/models" +``` + +### Issue: Test failures +```bash +# Run individual test with verbose output: +python -m pytest test_tier2_integration.py::test_intent_analysis_low_risk -vv + +# Check imports: +python -c "from reasoning_forge.forge_engine import ForgeEngine; print('OK')" + +# If import fails, reinstall: +pip install --force-reinstall --no-cache-dir -r requirements.txt +``` + +### Issue: Adapters not loading +```bash +# Verify adapter files: +ls -lh adapters/ +# Should show 8 .gguf files + +# Check adapter loading: +python -c " +from reasoning_forge.forge_engine import ForgeEngine +engine = ForgeEngine() +print(f'Loaded: {len(engine.adapters)} adapters') +" + +# If 0 adapters, check file permissions: +chmod 644 adapters/*.gguf +``` + +### Issue: API returns 500 errors +```bash +# Check server logs: +tail -f reasoning_forge/.logs/codette_errors.log + +# Test with simpler query: +curl -X POST http://localhost:7860/api/chat \ + -H "Content-Type: application/json" \ + -d '{"query": "test"}' + +# Check if Colleen/Guardian validation is blocking: +# Edit inference/codette_server.py and disable validation temporarily +``` + +--- + +## Monitoring & Observability + +### Health Checks +```bash +# Every 30 seconds: +watch -n 30 curl http://localhost:7860/api/health + +# In production, use automated monitoring: +# Example: Prometheus metrics endpoint +curl http://localhost:7860/metrics +``` + +### Log Inspection +```bash +# Application logs: +tail -f reasoning_forge/.logs/codette_reflection_journal.json + +# Error logs: +grep ERROR reasoning_forge/.logs/codette_errors.log + +# Performance metrics: +cat observatory_metrics.json | jq '.latency[]' +``` + +### Resource Monitoring +```bash +# GPU utilization: +nvidia-smi -l 1 + +# System load: +top # Or Activity Monitor on macOS, Task Manager on Windows + +# Memory per process: +ps aux | grep codette_server +``` + +--- + +## Scaling & Load Testing + +### Load Test 1: Sequential Requests +```bash +for i in {1..100}; do + curl -s -X POST http://localhost:7860/api/chat \ + -H "Content-Type: application/json" \ + -d '{"query": "test query '$i'"}' > /dev/null + echo "Request $i/100" +done +``` + +### Load Test 2: Concurrent Requests +```bash +# Using GNU Parallel: +seq 1 50 | parallel -j 4 'curl -s http://localhost:7860/api/health' + +# Or using Apache Bench: +ab -n 100 -c 10 http://localhost:7860/api/health +``` + +### Expected Performance +- Llama 3.1 8B Q4 + RTX 3090: **50-60 req/min** sustained +- Llama 3.2 1B + CPU: **5-10 req/min** sustained + +--- + +## Security Considerations + +### 1. API Authentication (TODO for production) +```python +# Add in inference/codette_server.py: +@app.post("/api/chat") +def chat_with_auth(request, token: str = Header(None)): + if token != os.getenv("CODETTE_API_TOKEN"): + raise HTTPException(status_code=401, detail="Invalid token") + # Process request +``` + +### 2. Rate Limiting +```python +from slowapi import Limiter +limiter = Limiter(key_func=get_remote_address) + +@app.post("/api/chat") +@limiter.limit("10/minute") +def chat(request): + # ... +``` + +### 3. Input Validation +```python +# Validate query length +if len(query) > 10000: + raise ValueError("Query too long (max 10000 chars)") + +# Check for injection attempts +if any(x in query for x in ["