oneblackmage committed on 14 days ago

Commit

1672805

verified ·

1 Parent(s): a778abf

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

configs/hyperparameters/.gitkeep +0 -0
configs/hyperparameters/boolq_train_pipeline_config.json +0 -0
configs/hyperparameters/dual_persona_training_data_pipeline_config.json +0 -0
configs/hyperparameters/dual_persona_training_phase1_pipeline_config.json +0 -0
configs/hyperparameters/dual_persona_training_phase2_pipeline_config.json +0 -0
configs/hyperparameters/dual_persona_training_phase3_pipeline_config.json +0 -0
configs/hyperparameters/enhanced_training_config.json +136 -0
configs/hyperparameters/moe_training_config.json +65 -0
configs/hyperparameters/stressor_train.json +0 -0
configs/hyperparameters/training_config.json +0 -0
configs/infrastructure/.gitkeep +0 -0
configs/model_configs/.gitkeep +0 -0
configs/stage_configs/.gitkeep +0 -0
configs/stage_configs/1.PsychologyTest_requirements.txt +0 -0
configs/stage_configs/18ddda4f-4118-4292-ad4c-3cfe2d29152c.json +48 -0
configs/stage_configs/4710e616-eb07-4773-9757-df922c41b33f.json +48 -0
configs/stage_configs/878d3cb5-95e8-4e11-9d6c-6fa585c0a85e.json +48 -0
configs/stage_configs/CoT_Neurodivergent_vs_Neurotypical_Interactions_metadata.json +15 -0
configs/stage_configs/CoT_Philosophical_Understanding_metadata.json +15 -0
configs/stage_configs/CoT_Reasoning_Mens_Mental_Health_metadata.json +15 -0
configs/stage_configs/CoT_Temporal_Reasoning_Dataset_metadata.json +15 -0
configs/stage_configs/HealthCareMagic-100k.json +0 -0
configs/stage_configs/Instructions.ts +0 -0
configs/stage_configs/ULTIMATE_FINAL_INTEGRATION_SUMMARY.json +0 -0
configs/stage_configs/ai_config.py +0 -0
configs/stage_configs/api_config.py +62 -0
configs/stage_configs/api_documentation.json +296 -0
configs/stage_configs/approach_config.json +455 -0
configs/stage_configs/audit_report.json +655 -0
configs/stage_configs/auto_resume_requirements.txt +52 -0
configs/stage_configs/bias_validated_validation_summary.json +14 -0
configs/stage_configs/boolq_validation_pipeline_config.json +0 -0
configs/stage_configs/celery_config.py +111 -0
configs/stage_configs/check_config.sh +0 -0
configs/stage_configs/checkpoint_config.json +44 -0
configs/stage_configs/checkpoint_requirements.txt +45 -0
configs/stage_configs/claude_assessment.json +0 -0
configs/stage_configs/cli_config.py +232 -0
configs/stage_configs/complexity_config.json +56 -0
configs/stage_configs/comprehensive_integration_summary.json +32 -0
configs/stage_configs/condition_config.json +460 -0
configs/stage_configs/config.py +53 -0
configs/stage_configs/config_example.py +0 -0
configs/stage_configs/config_lock.json +39 -0
configs/stage_configs/config_lock.py +206 -0
configs/stage_configs/config_profiles.py +339 -0
configs/stage_configs/config_tracker.py +700 -0
configs/stage_configs/config_validator.py +705 -0
configs/stage_configs/configs_config.py +67 -0
configs/stage_configs/corrected_audit_report.json +694 -0

configs/hyperparameters/.gitkeep ADDED Viewed

File without changes

configs/hyperparameters/boolq_train_pipeline_config.json ADDED Viewed

File without changes

configs/hyperparameters/dual_persona_training_data_pipeline_config.json ADDED Viewed

File without changes

configs/hyperparameters/dual_persona_training_phase1_pipeline_config.json ADDED Viewed

File without changes

configs/hyperparameters/dual_persona_training_phase2_pipeline_config.json ADDED Viewed

File without changes

configs/hyperparameters/dual_persona_training_phase3_pipeline_config.json ADDED Viewed

File without changes

configs/hyperparameters/enhanced_training_config.json ADDED Viewed

	@@ -0,0 +1,136 @@

+{
+  "base_model": "LatitudeGames/Wayfarer-2-12B",
+  "training_type": "kan28_enhanced_therapeutic_ai",
+  "dataset_config": {
+    "ultimate_final_dataset": "ULTIMATE_FINAL_DATASET.jsonl",
+    "component_enhanced_dataset": "unified_6_component_dataset.jsonl",
+    "total_conversations": 608497,
+    "component_enhanced_conversations": 39,
+    "train_split": 0.9,
+    "val_split": 0.1
+  },
+  "kan28_components": {
+    "integrated_components": [
+      "journaling_system",
+      "voice_blending",
+      "edge_case_handling",
+      "dual_persona_dynamics",
+      "bias_detection",
+      "psychology_knowledge_base"
+    ],
+    "expert_voices": ["Tim Ferriss", "Gabor Maté", "Brené Brown"],
+    "psychology_concepts": 4867,
+    "bias_categories": 5,
+    "therapeutic_modalities": 6
+  },
+  "training_parameters": {
+    "num_train_epochs": 3,
+    "per_device_train_batch_size": 4,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 8,
+    "learning_rate": 3e-4,
+    "warmup_steps": 1000,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0
+  },
+  "lora_config": {
+    "lora_r": 16,
+    "lora_alpha": 32,
+    "lora_dropout": 0.1,
+    "lora_target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
+    "lora_bias": "none",
+    "task_type": "CAUSAL_LM"
+  },
+  "context_config": {
+    "max_position_embeddings": 8192,
+    "training_max_length": 2048,
+    "conversation_format": "chatml"
+  },
+  "h100_optimizations": {
+    "bf16": true,
+    "gradient_checkpointing": true,
+    "optim": "adamw_torch_fused",
+    "dataloader_num_workers": 4,
+    "dataloader_pin_memory": true,
+    "group_by_length": true,
+    "torch_compile": false,
+    "flash_attention": true
+  },
+  "training_constraints": {
+    "max_training_hours": 12,
+    "checkpoint_interval_minutes": 30,
+    "early_stopping_patience": 3,
+    "max_memory_gb": 80
+  },
+  "logging": {
+    "logging_steps": 10,
+    "eval_steps": 500,
+    "save_steps": 500,
+    "save_total_limit": 5,
+    "wandb_project": "pixelated-empathy-kan28",
+    "wandb_run_name": "therapeutic_ai_6_components"
+  },
+  "component_specific_config": {
+    "journaling_system": {
+      "weight": 1.0,
+      "focus": "long_term_progress_tracking"
+    },
+    "voice_blending": {
+      "weight": 1.2,
+      "experts": ["Tim", "Gabor", "Brené"],
+      "blending_strategy": "weighted_combination"
+    },
+    "edge_case_handling": {
+      "weight": 1.5,
+      "crisis_scenarios": ["suicidal_ideation", "trauma_flashback", "severe_dissociation"],
+      "safety_priority": "maximum"
+    },
+    "dual_persona_dynamics": {
+      "weight": 1.1,
+      "relationship_types": ["anxious_perfectionist", "trauma_survivor", "relationship_struggles"],
+      "alliance_tracking": true
+    },
+    "bias_detection": {
+      "weight": 1.3,
+      "validation_categories": ["cultural", "therapeutic", "accessibility", "demographic", "safety"],
+      "safety_threshold": 0.8
+    },
+    "psychology_knowledge_base": {
+      "weight": 1.0,
+      "concept_count": 4867,
+      "integration_method": "contextual_enhancement"
+    }
+  },
+  "validation_config": {
+    "therapeutic_quality_scoring": true,
+    "bias_detection_validation": true,
+    "component_integration_checks": true,
+    "safety_validation": true,
+    "expert_voice_consistency": true
+  },
+  "output_config": {
+    "model_name": "pixelated_empathy_kan28",
+    "save_format": "safetensors",
+    "include_tokenizer": true,
+    "include_config": true,
+    "create_model_card": true
+  },
+  "lightning_ai_config": {
+    "studio_type": "H100",
+    "instance_type": "studio-xl-h100",
+    "auto_shutdown": true,
+    "max_idle_minutes": 30
+  }
+}

configs/hyperparameters/moe_training_config.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "base_model": "LatitudeGames/Wayfarer-2-12B",
+  "num_train_epochs": 3,
+  "per_device_train_batch_size": 4,
+  "gradient_accumulation_steps": 8,
+  "learning_rate": 3e-4,
+  "warmup_steps": 1000,
+  "weight_decay": 0.01,
+  "max_grad_norm": 1.0,
+  "datasets": ["data/acquired_datasets/mental_health_counseling.json"],
+  "moe_config": {
+    "num_experts": 4,
+    "expert_domains": [
+      "psychology",
+      "mental_health",
+      "bias_detection",
+      "general_therapeutic"
+    ],
+    "expert_capacity": 2,
+    "load_balancing_weight": 0.01,
+    "router_z_loss_weight": 0.001
+  },
+  "lora_config": {
+    "lora_r": 16,
+    "lora_alpha": 32,
+    "lora_dropout": 0.1,
+    "lora_target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"]
+  },
+  "context_config": {
+    "max_position_embeddings": 8192,
+    "training_max_length": 2048
+  },
+  "qlora_config": {
+    "load_in_4bit": true,
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "bnb_4bit_compute_dtype": "bfloat16"
+  },
+  "h100_optimizations": {
+    "bf16": true,
+    "gradient_checkpointing": true,
+    "optim": "adamw_torch_fused",
+    "dataloader_num_workers": 4,
+    "dataloader_pin_memory": true,
+    "group_by_length": true
+  },
+  "training_constraints": {
+    "max_training_hours": 12,
+    "checkpoint_interval_minutes": 30,
+    "early_stopping_patience": 3
+  },
+  "logging": {
+    "logging_steps": 10,
+    "eval_steps": 500,
+    "save_steps": 500,
+    "save_total_limit": 5
+  }
+}

configs/hyperparameters/stressor_train.json ADDED Viewed

File without changes

configs/hyperparameters/training_config.json ADDED Viewed

File without changes

configs/infrastructure/.gitkeep ADDED Viewed

File without changes

configs/model_configs/.gitkeep ADDED Viewed

File without changes

configs/stage_configs/.gitkeep ADDED Viewed

File without changes

configs/stage_configs/1.PsychologyTest_requirements.txt ADDED Viewed

File without changes

configs/stage_configs/18ddda4f-4118-4292-ad4c-3cfe2d29152c.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "session": {
+    "session_id": "18ddda4f-4118-4292-ad4c-3cfe2d29152c",
+    "start_date": "2025-11-14T10:02:25.887989",
+    "target_sources": [
+      "pubmed",
+      "zenodo",
+      "dryad"
+    ],
+    "search_keywords": {
+      "therapy": [
+        "cbt",
+        "dbt",
+        "act"
+      ],
+      "mental_health": [
+        "depression",
+        "anxiety"
+      ]
+    },
+    "weekly_targets": {
+      "sources_identified": 10,
+      "datasets_evaluated": 5,
+      "datasets_acquired": 2
+    },
+    "current_phase": "discovery",
+    "progress_metrics": {}
+  },
+  "state": {
+    "sources": [],
+    "evaluations": [],
+    "access_requests": [],
+    "acquired_datasets": [],
+    "integration_plans": [],
+    "integration_feasibility": {}
+  },
+  "progress": {
+    "sources_identified": 0,
+    "datasets_evaluated": 0,
+    "access_established": 0,
+    "datasets_acquired": 0,
+    "integration_plans_created": 0,
+    "last_updated": null
+  },
+  "progress_history": [],
+  "activity_logs": [],
+  "error_log": []
+}

configs/stage_configs/4710e616-eb07-4773-9757-df922c41b33f.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "session": {
+    "session_id": "4710e616-eb07-4773-9757-df922c41b33f",
+    "start_date": "2025-11-11T01:37:26.290061",
+    "target_sources": [
+      "pubmed",
+      "zenodo",
+      "dryad"
+    ],
+    "search_keywords": {
+      "therapy": [
+        "cbt",
+        "dbt",
+        "act"
+      ],
+      "mental_health": [
+        "depression",
+        "anxiety"
+      ]
+    },
+    "weekly_targets": {
+      "sources_identified": 10,
+      "datasets_evaluated": 5,
+      "datasets_acquired": 2
+    },
+    "current_phase": "discovery",
+    "progress_metrics": {}
+  },
+  "state": {
+    "sources": [],
+    "evaluations": [],
+    "access_requests": [],
+    "acquired_datasets": [],
+    "integration_plans": [],
+    "integration_feasibility": {}
+  },
+  "progress": {
+    "sources_identified": 0,
+    "datasets_evaluated": 0,
+    "access_established": 0,
+    "datasets_acquired": 0,
+    "integration_plans_created": 0,
+    "last_updated": null
+  },
+  "progress_history": [],
+  "activity_logs": [],
+  "error_log": []
+}

configs/stage_configs/878d3cb5-95e8-4e11-9d6c-6fa585c0a85e.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "session": {
+    "session_id": "878d3cb5-95e8-4e11-9d6c-6fa585c0a85e",
+    "start_date": "2025-11-11T01:40:20.313691",
+    "target_sources": [
+      "pubmed",
+      "zenodo",
+      "dryad"
+    ],
+    "search_keywords": {
+      "therapy": [
+        "cbt",
+        "dbt",
+        "act"
+      ],
+      "mental_health": [
+        "depression",
+        "anxiety"
+      ]
+    },
+    "weekly_targets": {
+      "sources_identified": 10,
+      "datasets_evaluated": 5,
+      "datasets_acquired": 2
+    },
+    "current_phase": "discovery",
+    "progress_metrics": {}
+  },
+  "state": {
+    "sources": [],
+    "evaluations": [],
+    "access_requests": [],
+    "acquired_datasets": [],
+    "integration_plans": [],
+    "integration_feasibility": {}
+  },
+  "progress": {
+    "sources_identified": 0,
+    "datasets_evaluated": 0,
+    "access_established": 0,
+    "datasets_acquired": 0,
+    "integration_plans_created": 0,
+    "last_updated": null
+  },
+  "progress_history": [],
+  "activity_logs": [],
+  "error_log": []
+}

configs/stage_configs/CoT_Neurodivergent_vs_Neurotypical_Interactions_metadata.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "dataset_name": "CoT_Neurodivergent_vs_Neurotypical_Interactions",
+  "description": "Neurodiversity-aware therapeutic approaches",
+  "reasoning_type": "neurodiversity_reasoning",
+  "therapeutic_focus": "inclusive_therapy",
+  "total_examples": 200,
+  "reasoning_patterns": [
+    "Consider neurodivergent perspective",
+    "Assess sensory processing differences",
+    "Evaluate communication preferences",
+    "Account for executive function variations",
+    "Recognize masking behaviors"
+  ],
+  "created_at": "2025-09-26T18:06:58.401899"
+}

configs/stage_configs/CoT_Philosophical_Understanding_metadata.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "dataset_name": "CoT_Philosophical_Understanding",
+  "description": "33MB, 60K existential/philosophical therapy",
+  "reasoning_type": "philosophical_reasoning",
+  "therapeutic_focus": "existential_therapy",
+  "total_examples": 500,
+  "reasoning_patterns": [
+    "Examine existential concerns",
+    "Explore meaning and purpose",
+    "Consider life's fundamental questions",
+    "Assess values and beliefs",
+    "Evaluate spiritual dimensions"
+  ],
+  "created_at": "2025-09-26T18:06:58.440412"
+}

configs/stage_configs/CoT_Reasoning_Mens_Mental_Health_metadata.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "dataset_name": "CoT_Reasoning_Mens_Mental_Health",
+  "description": "Gender-specific therapeutic reasoning",
+  "reasoning_type": "gender_specific_reasoning",
+  "therapeutic_focus": "mens_therapy",
+  "total_examples": 200,
+  "reasoning_patterns": [
+    "Consider societal gender expectations",
+    "Assess masculine identity pressures",
+    "Evaluate emotional expression barriers",
+    "Account for help-seeking stigma",
+    "Recognize vulnerability challenges"
+  ],
+  "created_at": "2025-09-26T18:06:58.421250"
+}

configs/stage_configs/CoT_Temporal_Reasoning_Dataset_metadata.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "dataset_name": "CoT_Temporal_Reasoning_Dataset",
+  "description": "15MB, 30K time-based therapeutic planning",
+  "reasoning_type": "temporal_reasoning",
+  "therapeutic_focus": "treatment_planning",
+  "total_examples": 200,
+  "reasoning_patterns": [
+    "Assess timeline of symptoms",
+    "Plan treatment progression",
+    "Consider developmental stages",
+    "Evaluate progress markers",
+    "Project future outcomes"
+  ],
+  "created_at": "2025-09-26T18:06:58.520641"
+}

configs/stage_configs/HealthCareMagic-100k.json ADDED Viewed

File without changes

configs/stage_configs/Instructions.ts ADDED Viewed

File without changes

configs/stage_configs/ULTIMATE_FINAL_INTEGRATION_SUMMARY.json ADDED Viewed

File without changes

configs/stage_configs/ai_config.py ADDED Viewed

File without changes

configs/stage_configs/api_config.py ADDED Viewed

	@@ -0,0 +1,62 @@

+"""
+Configuration management for the API server.
+This module provides configuration loading from environment variables
+with sensible defaults.
+"""
+import os
+from functools import lru_cache
+from typing import List
+from pydantic_settings import BaseSettings, SettingsConfigDict
+class Settings(BaseSettings):
+    """API server settings."""
+    # Server configuration: bind address, port, deployment environment, version
+    host: str = "0.0.0.0"  # default bind address is the all-interfaces wildcard
+    port: int = 8000
+    environment: str = "development"  # expected values: development, staging, production
+    api_version: str = "1.0.0"
+    debug: bool = False
+    # CORS configuration: defaults list only local development origins
+    cors_origins: List[str] = [
+        "http://localhost:4321",  # Astro dev server
+        "http://localhost:3000",  # Alternative dev port
+        "http://localhost:5173",  # Vite dev server
+    ]
+    # Authentication configuration
+    auth_enabled: bool = True
+    jwt_secret: str = os.getenv("JWT_SECRET", "change-me-in-production")  # NOTE(review): falls back to a placeholder when JWT_SECRET is unset; confirm real deployments always set it
+    jwt_algorithm: str = "HS256"
+    jwt_expiration_minutes: int = 60 * 24  # 24 hours, expressed in minutes
+    # Rate limiting: per-minute and per-hour request ceilings
+    rate_limit_enabled: bool = True
+    rate_limit_per_minute: int = 60
+    rate_limit_per_hour: int = 1000
+    # Logging: level name and a %-style format string
+    log_level: str = "INFO"
+    log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    # Session storage (must match across all components); the default is read from SESSION_STORAGE_PATH when this class body executes
+    session_storage_path: str = os.getenv(
+        "SESSION_STORAGE_PATH", "ai/journal_dataset_research/sessions"
+    )
+    model_config = SettingsConfigDict(  # loader options for the settings class
+        env_file=".env",  # values may also be supplied from a local .env file
+        env_file_encoding="utf-8",
+        case_sensitive=False,  # environment variable names are matched without regard to case
+    )
+@lru_cache()  # zero-argument function, so the first Settings() built is reused on every later call
+def get_settings() -> Settings:
+    """Get cached settings instance."""
+    return Settings()

configs/stage_configs/api_documentation.json ADDED Viewed

	@@ -0,0 +1,296 @@

+{
+  "api_version": "1.0.0",
+  "base_url": "https://api.pixelatedempathy.com",
+  "endpoints": {
+    "validate_conversation": {
+      "endpoint": "/api/v1/validate/conversation",
+      "method": "POST",
+      "description": "Validate a therapeutic conversation using multi-tier quality assessment",
+      "parameters": {
+        "conversation": {
+          "type": "object",
+          "required": true,
+          "description": "Conversation object with id, content, turns, and metadata"
+        },
+        "validation_level": {
+          "type": "string",
+          "required": false,
+          "default": "comprehensive",
+          "options": [
+            "basic",
+            "standard",
+            "comprehensive",
+            "clinical"
+          ]
+        },
+        "include_recommendations": {
+          "type": "boolean",
+          "required": false,
+          "default": true
+        }
+      },
+      "request_example": {
+        "conversation": {
+          "id": "conv_001",
+          "content": "I understand you're feeling anxious. Let's explore some coping strategies.",
+          "turns": [
+            {
+              "speaker": "user",
+              "text": "I'm feeling anxious lately."
+            },
+            {
+              "speaker": "therapist",
+              "text": "I understand. Let's explore coping strategies."
+            }
+          ],
+          "metadata": {
+            "source": "professional",
+            "condition": "anxiety",
+            "approach": "CBT"
+          }
+        },
+        "validation_level": "comprehensive",
+        "include_recommendations": true
+      },
+      "response_example": {
+        "validation_id": "val_12345",
+        "overall_quality_score": 0.85,
+        "tier_assessment": "professional",
+        "validation_results": {
+          "multi_tier_validation": {
+            "passed": true,
+            "score": 0.87
+          },
+          "dsm5_accuracy": {
+            "passed": true,
+            "score": 0.83
+          },
+          "safety_ethics": {
+            "passed": true,
+            "score": 0.91
+          },
+          "effectiveness_prediction": {
+            "score": 0.78,
+            "confidence": "high"
+          },
+          "coherence_validation": {
+            "score": 0.82,
+            "level": "moderately_coherent"
+          }
+        },
+        "issues": [],
+        "recommendations": [
+          "Consider adding more specific therapeutic techniques",
+          "Enhance empathetic responses"
+        ],
+        "processing_time_ms": 245
+      },
+      "error_codes": [
+        {
+          "code": "400",
+          "description": "Invalid conversation format"
+        },
+        {
+          "code": "422",
+          "description": "Validation failed - conversation quality too low"
+        },
+        {
+          "code": "429",
+          "description": "Rate limit exceeded"
+        },
+        {
+          "code": "500",
+          "description": "Internal validation error"
+        }
+      ],
+      "rate_limits": "100 requests per minute",
+      "authentication": "API key required"
+    },
+    "export_dataset": {
+      "endpoint": "/api/v1/export/dataset",
+      "method": "POST",
+      "description": "Export dataset in specified format with tiered access control",
+      "parameters": {
+        "export_config": {
+          "type": "object",
+          "required": true,
+          "description": "Export configuration including formats, tiers, and options"
+        },
+        "filters": {
+          "type": "object",
+          "required": false,
+          "description": "Optional filters for conversation selection"
+        }
+      },
+      "request_example": {
+        "export_config": {
+          "formats": [
+            "json",
+            "csv"
+          ],
+          "access_tiers": [
+            "priority",
+            "professional"
+          ],
+          "quality_threshold": 0.8,
+          "include_metadata": true,
+          "compress_output": true
+        },
+        "filters": {
+          "conditions": [
+            "anxiety",
+            "depression"
+          ],
+          "date_range": {
+            "start": "2025-01-01",
+            "end": "2025-08-10"
+          }
+        }
+      },
+      "response_example": {
+        "export_id": "exp_67890",
+        "status": "completed",
+        "export_metadata": [
+          {
+            "format": "json",
+            "tier": "priority",
+            "conversations": 1542,
+            "file_path": "/exports/v1/priority/conversations.json.zip",
+            "checksum": "sha256:abc123..."
+          }
+        ],
+        "total_conversations": 4626,
+        "export_time_seconds": 45.2
+      },
+      "error_codes": [
+        {
+          "code": "400",
+          "description": "Invalid export configuration"
+        },
+        {
+          "code": "403",
+          "description": "Insufficient access permissions for requested tier"
+        },
+        {
+          "code": "413",
+          "description": "Export size exceeds limits"
+        },
+        {
+          "code": "500",
+          "description": "Export processing error"
+        }
+      ],
+      "rate_limits": "10 exports per hour",
+      "authentication": "API key with export permissions required"
+    },
+    "get_analytics": {
+      "endpoint": "/api/v1/analytics/dashboard",
+      "method": "GET",
+      "description": "Get comprehensive analytics dashboard data",
+      "parameters": {
+        "time_range": {
+          "type": "string",
+          "required": false,
+          "default": "24h",
+          "options": [
+            "1h",
+            "24h",
+            "7d",
+            "30d"
+          ]
+        },
+        "include_trends": {
+          "type": "boolean",
+          "required": false,
+          "default": true
+        }
+      },
+      "request_example": {},
+      "response_example": {
+        "dashboard_data": {
+          "total_conversations": 15420,
+          "quality_distribution": {
+            "excellent": 3084,
+            "good": 6168,
+            "acceptable": 4626,
+            "poor": 1542
+          },
+          "safety_metrics": {
+            "overall_safety_score": 0.91,
+            "compliance_rate": 0.94
+          },
+          "performance_trends": {
+            "quality_scores": [
+              0.78,
+              0.79,
+              0.81,
+              0.82
+            ]
+          }
+        },
+        "summary_report": {
+          "performance_status": "\ud83d\udfe2 EXCELLENT",
+          "key_insights": [
+            "High quality conversations",
+            "Excellent safety compliance"
+          ]
+        }
+      },
+      "error_codes": [
+        {
+          "code": "400",
+          "description": "Invalid time range parameter"
+        },
+        {
+          "code": "500",
+          "description": "Analytics processing error"
+        }
+      ],
+      "rate_limits": "60 requests per minute",
+      "authentication": null
+    },
+    "get_system_status": {
+      "endpoint": "/api/v1/system/status",
+      "method": "GET",
+      "description": "Get real-time system status and health metrics",
+      "parameters": {},
+      "request_example": {},
+      "response_example": {
+        "system_status": "healthy",
+        "components": {
+          "validation_pipeline": {
+            "status": "operational",
+            "response_time_ms": 150
+          },
+          "export_system": {
+            "status": "operational",
+            "queue_size": 2
+          },
+          "analytics_engine": {
+            "status": "operational",
+            "last_update": "2025-08-10T07:30:00Z"
+          },
+          "maintenance_system": {
+            "status": "operational",
+            "next_maintenance": "2025-08-10T12:00:00Z"
+          }
+        },
+        "performance_metrics": {
+          "total_conversations_processed": 15420,
+          "average_processing_time_ms": 245,
+          "success_rate": 0.998,
+          "uptime_hours": 168.5
+        },
+        "alerts": []
+      },
+      "error_codes": [
+        {
+          "code": "503",
+          "description": "System temporarily unavailable"
+        }
+      ],
+      "rate_limits": "120 requests per minute",
+      "authentication": null
+    }
+  }
+}

configs/stage_configs/approach_config.json ADDED Viewed

	@@ -0,0 +1,455 @@

+{
+  "cbt": {
+    "name": "Cognitive Behavioral Therapy",
+    "evidence_level": "strong",
+    "target_weight": 0.25,
+    "min_samples": 500,
+    "max_samples": 8000,
+    "keywords": [
+      "cbt",
+      "cognitive behavioral",
+      "cognitive therapy",
+      "behavioral therapy",
+      "thought patterns",
+      "cognitive restructuring",
+      "behavioral activation"
+    ],
+    "techniques": [
+      "cognitive restructuring",
+      "behavioral activation",
+      "exposure therapy",
+      "thought records",
+      "activity scheduling",
+      "behavioral experiments"
+    ],
+    "conditions_suited": [
+      "depression",
+      "anxiety",
+      "ptsd",
+      "ocd",
+      "panic_disorder"
+    ],
+    "effectiveness_score": 0.95
+  },
+  "dbt": {
+    "name": "Dialectical Behavior Therapy",
+    "evidence_level": "strong",
+    "target_weight": 0.12,
+    "min_samples": 300,
+    "max_samples": 4000,
+    "keywords": [
+      "dbt",
+      "dialectical",
+      "mindfulness",
+      "distress tolerance",
+      "emotion regulation",
+      "interpersonal effectiveness"
+    ],
+    "techniques": [
+      "mindfulness",
+      "distress tolerance",
+      "emotion regulation",
+      "interpersonal effectiveness",
+      "wise mind",
+      "radical acceptance"
+    ],
+    "conditions_suited": [
+      "bpd",
+      "self_harm",
+      "suicidal_ideation",
+      "emotion_dysregulation"
+    ],
+    "effectiveness_score": 0.9
+  },
+  "psychodynamic": {
+    "name": "Psychodynamic Therapy",
+    "evidence_level": "moderate",
+    "target_weight": 0.15,
+    "min_samples": 400,
+    "max_samples": 5000,
+    "keywords": [
+      "psychodynamic",
+      "psychoanalytic",
+      "unconscious",
+      "transference",
+      "defense mechanisms",
+      "insight",
+      "interpretation"
+    ],
+    "techniques": [
+      "free association",
+      "dream analysis",
+      "transference analysis",
+      "interpretation",
+      "working through",
+      "insight development"
+    ],
+    "conditions_suited": [
+      "depression",
+      "anxiety",
+      "personality_disorders",
+      "trauma"
+    ],
+    "effectiveness_score": 0.75
+  },
+  "humanistic": {
+    "name": "Humanistic/Person-Centered Therapy",
+    "evidence_level": "moderate",
+    "target_weight": 0.1,
+    "min_samples": 250,
+    "max_samples": 3500,
+    "keywords": [
+      "person-centered",
+      "humanistic",
+      "unconditional positive regard",
+      "empathy",
+      "genuineness",
+      "self-actualization",
+      "client-centered"
+    ],
+    "techniques": [
+      "active listening",
+      "reflection",
+      "unconditional positive regard",
+      "empathic understanding",
+      "genuineness",
+      "congruence"
+    ],
+    "conditions_suited": [
+      "self_esteem",
+      "identity_issues",
+      "personal_growth"
+    ],
+    "effectiveness_score": 0.7
+  },
+  "acceptance_commitment": {
+    "name": "Acceptance and Commitment Therapy",
+    "evidence_level": "strong",
+    "target_weight": 0.08,
+    "min_samples": 200,
+    "max_samples": 3000,
+    "keywords": [
+      "act",
+      "acceptance commitment",
+      "psychological flexibility",
+      "mindfulness",
+      "values",
+      "committed action",
+      "defusion"
+    ],
+    "techniques": [
+      "mindfulness",
+      "acceptance",
+      "cognitive defusion",
+      "values clarification",
+      "committed action",
+      "psychological flexibility"
+    ],
+    "conditions_suited": [
+      "anxiety",
+      "depression",
+      "chronic_pain",
+      "substance_abuse"
+    ],
+    "effectiveness_score": 0.85
+  },
+  "emdr": {
+    "name": "Eye Movement Desensitization and Reprocessing",
+    "evidence_level": "strong",
+    "target_weight": 0.06,
+    "min_samples": 150,
+    "max_samples": 2500,
+    "keywords": [
+      "emdr",
+      "eye movement",
+      "bilateral stimulation",
+      "trauma processing",
+      "desensitization",
+      "reprocessing"
+    ],
+    "techniques": [
+      "bilateral stimulation",
+      "resource installation",
+      "trauma processing",
+      "desensitization",
+      "reprocessing",
+      "safe place visualization"
+    ],
+    "conditions_suited": [
+      "ptsd",
+      "trauma",
+      "phobias",
+      "anxiety"
+    ],
+    "effectiveness_score": 0.9
+  },
+  "family_systems": {
+    "name": "Family Systems Therapy",
+    "evidence_level": "moderate",
+    "target_weight": 0.07,
+    "min_samples": 180,
+    "max_samples": 2800,
+    "keywords": [
+      "family therapy",
+      "systems therapy",
+      "family systems",
+      "structural",
+      "strategic",
+      "multigenerational",
+      "boundaries"
+    ],
+    "techniques": [
+      "genogram",
+      "structural interventions",
+      "strategic interventions",
+      "boundary setting",
+      "family sculpting",
+      "circular questioning"
+    ],
+    "conditions_suited": [
+      "family_conflict",
+      "relationship_issues",
+      "adolescent_issues"
+    ],
+    "effectiveness_score": 0.75
+  },
+  "gestalt": {
+    "name": "Gestalt Therapy",
+    "evidence_level": "emerging",
+    "target_weight": 0.04,
+    "min_samples": 100,
+    "max_samples": 1500,
+    "keywords": [
+      "gestalt",
+      "here and now",
+      "awareness",
+      "contact",
+      "experiment",
+      "phenomenology",
+      "field theory"
+    ],
+    "techniques": [
+      "empty chair",
+      "two-chair technique",
+      "body awareness",
+      "here and now focus",
+      "experiments",
+      "contact and awareness"
+    ],
+    "conditions_suited": [
+      "anxiety",
+      "depression",
+      "relationship_issues"
+    ],
+    "effectiveness_score": 0.65
+  },
+  "solution_focused": {
+    "name": "Solution-Focused Brief Therapy",
+    "evidence_level": "moderate",
+    "target_weight": 0.05,
+    "min_samples": 120,
+    "max_samples": 2000,
+    "keywords": [
+      "solution focused",
+      "brief therapy",
+      "miracle question",
+      "scaling",
+      "exceptions",
+      "goals",
+      "strengths"
+    ],
+    "techniques": [
+      "miracle question",
+      "scaling questions",
+      "exception finding",
+      "goal setting",
+      "compliments",
+      "between-session tasks"
+    ],
+    "conditions_suited": [
+      "depression",
+      "anxiety",
+      "relationship_issues",
+      "substance_abuse"
+    ],
+    "effectiveness_score": 0.7
+  },
+  "narrative": {
+    "name": "Narrative Therapy",
+    "evidence_level": "emerging",
+    "target_weight": 0.03,
+    "min_samples": 80,
+    "max_samples": 1200,
+    "keywords": [
+      "narrative",
+      "story",
+      "externalization",
+      "unique outcomes",
+      "re-authoring",
+      "deconstruction",
+      "preferred story"
+    ],
+    "techniques": [
+      "externalization",
+      "unique outcomes",
+      "re-authoring",
+      "definitional ceremony",
+      "outsider witness",
+      "therapeutic documents"
+    ],
+    "conditions_suited": [
+      "identity_issues",
+      "trauma",
+      "oppression",
+      "self_esteem"
+    ],
+    "effectiveness_score": 0.6
+  },
+  "mindfulness_based": {
+    "name": "Mindfulness-Based Interventions",
+    "evidence_level": "strong",
+    "target_weight": 0.06,
+    "min_samples": 150,
+    "max_samples": 2500,
+    "keywords": [
+      "mindfulness",
+      "mbsr",
+      "mbct",
+      "meditation",
+      "present moment",
+      "non-judgmental awareness",
+      "body scan"
+    ],
+    "techniques": [
+      "mindfulness meditation",
+      "body scan",
+      "breathing exercises",
+      "mindful movement",
+      "loving-kindness",
+      "present moment awareness"
+    ],
+    "conditions_suited": [
+      "anxiety",
+      "depression",
+      "chronic_pain",
+      "stress"
+    ],
+    "effectiveness_score": 0.8
+  },
+  "interpersonal": {
+    "name": "Interpersonal Therapy",
+    "evidence_level": "strong",
+    "target_weight": 0.07,
+    "min_samples": 180,
+    "max_samples": 2800,
+    "keywords": [
+      "interpersonal therapy",
+      "ipt",
+      "grief",
+      "role disputes",
+      "role transitions",
+      "interpersonal deficits"
+    ],
+    "techniques": [
+      "grief work",
+      "role dispute resolution",
+      "role transition work",
+      "interpersonal skills training",
+      "communication analysis"
+    ],
+    "conditions_suited": [
+      "depression",
+      "anxiety",
+      "eating_disorders",
+      "ptsd"
+    ],
+    "effectiveness_score": 0.85
+  },
+  "motivational_interviewing": {
+    "name": "Motivational Interviewing",
+    "evidence_level": "strong",
+    "target_weight": 0.05,
+    "min_samples": 120,
+    "max_samples": 2000,
+    "keywords": [
+      "motivational interviewing",
+      "mi",
+      "ambivalence",
+      "change talk",
+      "rolling with resistance",
+      "self-efficacy"
+    ],
+    "techniques": [
+      "open-ended questions",
+      "affirmations",
+      "reflective listening",
+      "summarizing",
+      "eliciting change talk",
+      "developing discrepancy"
+    ],
+    "conditions_suited": [
+      "substance_abuse",
+      "health_behavior_change",
+      "motivation"
+    ],
+    "effectiveness_score": 0.8
+  },
+  "exposure_therapy": {
+    "name": "Exposure and Response Prevention",
+    "evidence_level": "strong",
+    "target_weight": 0.04,
+    "min_samples": 100,
+    "max_samples": 1500,
+    "keywords": [
+      "exposure",
+      "response prevention",
+      "systematic desensitization",
+      "flooding",
+      "habituation",
+      "fear hierarchy"
+    ],
+    "techniques": [
+      "systematic desensitization",
+      "in vivo exposure",
+      "imaginal exposure",
+      "response prevention",
+      "fear hierarchy",
+      "habituation"
+    ],
+    "conditions_suited": [
+      "ocd",
+      "phobias",
+      "anxiety",
+      "ptsd"
+    ],
+    "effectiveness_score": 0.9
+  },
+  "integrative": {
+    "name": "Integrative/Eclectic Therapy",
+    "evidence_level": "moderate",
+    "target_weight": 0.08,
+    "min_samples": 200,
+    "max_samples": 3000,
+    "keywords": [
+      "integrative",
+      "eclectic",
+      "multimodal",
+      "combination",
+      "tailored approach",
+      "best practices"
+    ],
+    "techniques": [
+      "technique integration",
+      "approach combination",
+      "tailored interventions",
+      "flexible methodology",
+      "evidence-based selection"
+    ],
+    "conditions_suited": [
+      "complex_presentations",
+      "comorbid_conditions",
+      "treatment_resistant"
+    ],
+    "effectiveness_score": 0.75
+  }
+}

configs/stage_configs/audit_report.json ADDED Viewed

	@@ -0,0 +1,655 @@

+{
+  "audit_date": "2025-08-24T13:12:24.522685",
+  "total_tasks": 36,
+  "complete": 18,
+  "partial": 0,
+  "missing": 18,
+  "completion_rate": 0.5,
+  "overall_status": "PARTIAL",
+  "phase_breakdown": {
+    "Phase 1": {
+      "complete": 3,
+      "total": 6,
+      "completion_rate": 0.5
+    },
+    "Phase 2": {
+      "complete": 3,
+      "total": 6,
+      "completion_rate": 0.5
+    },
+    "Phase 3": {
+      "complete": 3,
+      "total": 6,
+      "completion_rate": 0.5
+    },
+    "Phase 4": {
+      "complete": 0,
+      "total": 6,
+      "completion_rate": 0.0
+    },
+    "Phase 5": {
+      "complete": 4,
+      "total": 6,
+      "completion_rate": 0.6666666666666666
+    },
+    "Phase 6": {
+      "complete": 5,
+      "total": 6,
+      "completion_rate": 0.8333333333333334
+    }
+  },
+  "detailed_results": {
+    "6.1": {
+      "task_id": "6.1",
+      "filename": "distributed_architecture.py",
+      "description": "Distributed processing architecture",
+      "exists": true,
+      "size_bytes": 20724,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 6,
+        "functions": 26,
+        "lines": 569,
+        "docstring": true,
+        "size_kb": 20.2275390625
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.2": {
+      "task_id": "6.2",
+      "filename": "data_fusion_engine.py",
+      "description": "Intelligent data fusion algorithms",
+      "exists": true,
+      "size_bytes": 27331,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 5,
+        "functions": 20,
+        "lines": 694,
+        "docstring": true,
+        "size_kb": 26.6845703125
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.3": {
+      "task_id": "6.3",
+      "filename": "quality_assessment_framework.py",
+      "description": "Hierarchical quality assessment framework",
+      "exists": true,
+      "size_bytes": 28315,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 5,
+        "functions": 25,
+        "lines": 708,
+        "docstring": true,
+        "size_kb": 27.6455078125
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.4": {
+      "task_id": "6.4",
+      "filename": "deduplication.py",
+      "description": "Automated conversation deduplication",
+      "exists": true,
+      "size_bytes": 17642,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 3,
+        "functions": 17,
+        "lines": 460,
+        "docstring": true,
+        "size_kb": 17.228515625
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.5": {
+      "task_id": "6.5",
+      "filename": "cross_dataset_linker.py",
+      "description": "Cross-dataset conversation linking",
+      "exists": false,
+      "size_bytes": 0,
+      "imports_ok": false,
+      "content_analysis": {},
+      "status": "MISSING",
+      "issues": [
+        "File does not exist"
+      ]
+    },
+    "6.6": {
+      "task_id": "6.6",
+      "filename": "metadata_schema.py",
+      "description": "Unified metadata schema",
+      "exists": false,
+      "size_bytes": 0,
+      "imports_ok": false,
+      "content_analysis": {},
+      "status": "MISSING",
+      "issues": [
+        "File does not exist"
+      ]
+    },
+    "6.7": {
+      "task_id": "6.7",
+      "filename": "therapeutic_intelligence.py",
+      "description": "Comprehensive therapeutic approach classification",
+      "exists": true,
+      "size_bytes": 26025,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 4,
+        "functions": 18,
+        "lines": 582,
+        "docstring": true,
+        "size_kb": 25.4091796875
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.8": {
+      "task_id": "6.8",
+      "filename": "condition_pattern_recognition.py",
+      "description": "Mental health condition pattern recognition",
+      "exists": false,
+      "size_bytes": 0,
+      "imports_ok": false,
+      "content_analysis": {},
+      "status": "MISSING",
+      "issues": [
+        "File does not exist"
+      ]
+    },
+    "6.9": {
+      "task_id": "6.9",
+      "filename": "outcome_prediction.py",
+      "description": "Therapeutic outcome prediction models",
+      "exists": false,
+      "size_bytes": 0,
+      "imports_ok": false,
+      "content_analysis": {},
+      "status": "MISSING",
+      "issues": [
+        "File does not exist"
+      ]
+    },
+    "6.10": {
+      "task_id": "6.10",
+      "filename": "crisis_intervention_detector.py",
+      "description": "Crisis intervention detection and escalation",
+      "exists": true,
+      "size_bytes": 40122,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 7,
+        "functions": 24,
+        "lines": 849,
+        "docstring": true,
+        "size_kb": 39.1484375
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.11": {
+      "task_id": "6.11",
+      "filename": "personality_adapter.py",
+      "description": "Personality-aware conversation adaptation",
+      "exists": true,
+      "size_bytes": 30898,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 7,
+        "functions": 26,
+        "lines": 704,
+        "docstring": true,
+        "size_kb": 30.1650390625
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.12": {
+      "task_id": "6.12",
+      "filename": "cultural_competency_generator.py",
+      "description": "Cultural competency and diversity-aware response generation",
+      "exists": true,
+      "size_bytes": 34793,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 6,
+        "functions": 35,
+        "lines": 789,
+        "docstring": true,
+        "size_kb": 33.9677734375
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.13": {
+      "task_id": "6.13",
+      "filename": "audio_emotion_integration.py",
+      "description": "Audio emotion recognition integration",
+      "exists": true,
+      "size_bytes": 23773,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 5,
+        "functions": 18,
+        "lines": 575,
+        "docstring": true,
+        "size_kb": 23.2099609375
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.14": {
+      "task_id": "6.14",
+      "filename": "multimodal_disorder_analysis.py",
+      "description": "Multi-modal mental disorder analysis pipeline",
+      "exists": false,
+      "size_bytes": 0,
+      "imports_ok": false,
+      "content_analysis": {},
+      "status": "MISSING",
+      "issues": [
+        "File does not exist"
+      ]
+    },
+    "6.15": {
+      "task_id": "6.15",
+      "filename": "emotion_cause_extraction.py",
+      "description": "Emotion cause extraction and intervention mapping",
+      "exists": false,
+      "size_bytes": 0,
+      "imports_ok": false,
+      "content_analysis": {},
+      "status": "MISSING",
+      "issues": [
+        "File does not exist"
+      ]
+    },
+    "6.16": {
+      "task_id": "6.16",
+      "filename": "tfidf_clusterer.py",
+      "description": "TF-IDF feature-based conversation clustering",
+      "exists": true,
+      "size_bytes": 28344,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 6,
+        "functions": 20,
+        "lines": 668,
+        "docstring": true,
+        "size_kb": 27.6640625
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.17": {
+      "task_id": "6.17",
+      "filename": "temporal_reasoner.py",
+      "description": "Temporal reasoning integration",
+      "exists": true,
+      "size_bytes": 31062,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 7,
+        "functions": 25,
+        "lines": 744,
+        "docstring": true,
+        "size_kb": 30.3173828125
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.18": {
+      "task_id": "6.18",
+      "filename": "evidence_validator.py",
+      "description": "Scientific evidence-based practice validation",
+      "exists": true,
+      "size_bytes": 33065,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 8,
+        "functions": 22,
+        "lines": 755,
+        "docstring": true,
+        "size_kb": 32.271484375
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.19": {
+      "task_id": "6.19",
+      "filename": "priority_weighted_sampler.py",
+      "description": "Priority-weighted sampling algorithms",
+      "exists": true,
+      "size_bytes": 26014,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 3,
+        "functions": 17,
+        "lines": 646,
+        "docstring": true,
+        "size_kb": 25.404296875
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.20": {
+      "task_id": "6.20",
+      "filename": "condition_balancer.py",
+      "description": "Condition-specific balancing system",
+      "exists": true,
+      "size_bytes": 27040,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 3,
+        "functions": 12,
+        "lines": 612,
+        "docstring": true,
+        "size_kb": 26.40625
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.21": {
+      "task_id": "6.21",
+      "filename": "approach_diversity_optimizer.py",
+      "description": "Therapeutic approach diversity optimization",
+      "exists": true,
+      "size_bytes": 34619,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 3,
+        "functions": 15,
+        "lines": 718,
+        "docstring": true,
+        "size_kb": 33.8076171875
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.22": {
+      "task_id": "6.22",
+      "filename": "demographic_balancer.py",
+      "description": "Demographic and cultural diversity balancing",
+      "exists": true,
+      "size_bytes": 21222,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 3,
+        "functions": 12,
+        "lines": 486,
+        "docstring": true,
+        "size_kb": 20.724609375
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.23": {
+      "task_id": "6.23",
+      "filename": "complexity_stratifier.py",
+      "description": "Conversation complexity stratification",
+      "exists": true,
+      "size_bytes": 26863,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 3,
+        "functions": 14,
+        "lines": 623,
+        "docstring": true,
+        "size_kb": 26.2333984375
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.24": {
+      "task_id": "6.24",
+      "filename": "crisis_routine_balancer.py",
+      "description": "Crisis-to-routine conversation ratio optimization",
+      "exists": true,
+      "size_bytes": 24423,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 3,
+        "functions": 13,
+        "lines": 574,
+        "docstring": true,
+        "size_kb": 23.8505859375
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.25": {
+      "task_id": "6.25",
+      "filename": "multi_tier_validator.py",
+      "description": "Multi-tier quality validation system",
+      "exists": true,
+      "size_bytes": 29688,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 5,
+        "functions": 25,
+        "lines": 730,
+        "docstring": true,
+        "size_kb": 28.9892578125
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.26": {
+      "task_id": "6.26",
+      "filename": "dsm5_accuracy_validator.py",
+      "description": "DSM-5 therapeutic accuracy validation",
+      "exists": true,
+      "size_bytes": 27020,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 7,
+        "functions": 20,
+        "lines": 669,
+        "docstring": true,
+        "size_kb": 26.38671875
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.27": {
+      "task_id": "6.27",
+      "filename": "safety_ethics_validator.py",
+      "description": "Conversation safety and ethics validation",
+      "exists": true,
+      "size_bytes": 33303,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 7,
+        "functions": 20,
+        "lines": 804,
+        "docstring": true,
+        "size_kb": 32.5224609375
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.28": {
+      "task_id": "6.28",
+      "filename": "effectiveness_predictor.py",
+      "description": "Therapeutic effectiveness prediction",
+      "exists": true,
+      "size_bytes": 28432,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 6,
+        "functions": 17,
+        "lines": 633,
+        "docstring": true,
+        "size_kb": 27.765625
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.29": {
+      "task_id": "6.29",
+      "filename": "coherence_validator.py",
+      "description": "Conversation coherence validation using CoT reasoning",
+      "exists": true,
+      "size_bytes": 39311,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 5,
+        "functions": 24,
+        "lines": 1016,
+        "docstring": true,
+        "size_kb": 38.3896484375
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.30": {
+      "task_id": "6.30",
+      "filename": "realtime_quality_monitor.py",
+      "description": "Real-time conversation quality monitoring",
+      "exists": true,
+      "size_bytes": 17831,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 5,
+        "functions": 20,
+        "lines": 467,
+        "docstring": true,
+        "size_kb": 17.41015625
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.31": {
+      "task_id": "6.31",
+      "filename": "production_exporter.py",
+      "description": "Production-ready dataset export with tiered access",
+      "exists": true,
+      "size_bytes": 27472,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 5,
+        "functions": 24,
+        "lines": 710,
+        "docstring": true,
+        "size_kb": 26.828125
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.32": {
+      "task_id": "6.32",
+      "filename": "adaptive_learner.py",
+      "description": "Adaptive learning pipeline",
+      "exists": true,
+      "size_bytes": 27077,
+      "imports_ok": false,
+      "content_analysis": {
+        "classes": 8,
+        "functions": 34,
+        "lines": 684,
+        "docstring": true,
+        "size_kb": 26.4423828125
+      },
+      "status": "MISSING",
+      "issues": [
+        "Import errors"
+      ]
+    },
+    "6.33": {
+      "task_id": "6.33",
+      "filename": "analytics_dashboard.py",
+      "description": "Comprehensive analytics dashboard",
+      "exists": true,
+      "size_bytes": 18609,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 2,
+        "functions": 17,
+        "lines": 455,
+        "docstring": true,
+        "size_kb": 18.1240234375
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.34": {
+      "task_id": "6.34",
+      "filename": "automated_maintenance.py",
+      "description": "Automated dataset update and maintenance procedures",
+      "exists": true,
+      "size_bytes": 20792,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 5,
+        "functions": 22,
+        "lines": 571,
+        "docstring": true,
+        "size_kb": 20.296875
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.35": {
+      "task_id": "6.35",
+      "filename": "feedback_loops.py",
+      "description": "Conversation effectiveness feedback loops",
+      "exists": true,
+      "size_bytes": 19230,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 4,
+        "functions": 12,
+        "lines": 461,
+        "docstring": true,
+        "size_kb": 18.7763671875
+      },
+      "status": "COMPLETE",
+      "issues": []
+    },
+    "6.36": {
+      "task_id": "6.36",
+      "filename": "comprehensive_api.py",
+      "description": "Comprehensive documentation and API",
+      "exists": true,
+      "size_bytes": 30454,
+      "imports_ok": true,
+      "content_analysis": {
+        "classes": 3,
+        "functions": 8,
+        "lines": 873,
+        "docstring": true,
+        "size_kb": 29.732421875
+      },
+      "status": "COMPLETE",
+      "issues": []
+    }
+  }
+}

configs/stage_configs/auto_resume_requirements.txt ADDED Viewed

	@@ -0,0 +1,52 @@

+# Automatic Resume System Requirements
+# Install with: pip install -r auto_resume_requirements.txt
+# Core system monitoring (from checkpoint system)
+psutil>=5.8.0
+# Built-in Python modules (listed for reference)
+# asyncio - Built-in Python 3.7+
+# signal - Built-in
+# threading - Built-in
+# time - Built-in
+# uuid - Built-in
+# json - Built-in
+# logging - Built-in
+# os - Built-in
+# sys - Built-in
+# datetime - Built-in
+# pathlib - Built-in
+# tempfile - Built-in
+# shutil - Built-in
+# collections - Built-in
+# dataclasses - Built-in Python 3.7+
+# enum - Built-in
+# typing - Built-in Python 3.5+
+# Dependencies from checkpoint system
+# (Also install the checkpoint system's requirements: pip install -r checkpoint_requirements.txt)
+# Optional: Enhanced monitoring and alerting
+# prometheus_client>=0.14.0  # For Prometheus metrics
+# redis>=4.0.0              # For distributed coordination
+# pika>=1.3.0               # For RabbitMQ message queuing
+# celery>=5.2.0             # For distributed task management
+# Development and testing
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+coverage>=6.0.0
+pytest-timeout>=2.1.0      # For timeout testing
+# Code quality
+black>=22.0.0
+flake8>=5.0.0
+mypy>=0.991
+# Documentation
+sphinx>=4.0.0
+sphinx-rtd-theme>=1.0.0
+# Performance profiling (optional)
+# memory_profiler>=0.60.0   # For memory usage profiling
+# py-spy>=0.3.0             # For CPU profiling

configs/stage_configs/bias_validated_validation_summary.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "total_datasets": 2,
+  "safe_datasets": 0,
+  "caution_datasets": 2,
+  "safety_percentage": 0.0,
+  "bias_categories_checked": [
+    "cultural_bias",
+    "therapeutic_bias",
+    "accessibility_bias",
+    "demographic_bias",
+    "safety_concerns"
+  ],
+  "validation_complete": true
+}

configs/stage_configs/boolq_validation_pipeline_config.json ADDED Viewed

File without changes

configs/stage_configs/celery_config.py ADDED Viewed

	@@ -0,0 +1,111 @@

+#!/usr/bin/env python3
+"""
+Celery Configuration for Pixelated Empathy AI Distributed Processing
+"""
+import os
+from celery import Celery
+from kombu import Exchange, Queue
+# Celery application configuration
+def create_celery_app():
+    """Create and configure the Celery application.
+
+    The broker URL and result backend are read from the ``CELERY_BROKER_URL``
+    and ``CELERY_RESULT_BACKEND`` environment variables; both fall back to
+    ``redis://localhost:6379/0`` when unset.
+
+    Returns:
+        The ``Celery`` instance named ``"pixelated_empathy"`` with routing,
+        queue, worker, rate-limit and beat-schedule settings applied.
+    """
+    # Get configuration from environment
+    broker_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
+    result_backend = os.getenv("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
+    # Create Celery app
+    app = Celery("pixelated_empathy")
+    # Configure Celery
+    app.conf.update(
+        # Broker settings
+        broker_url=broker_url,
+        result_backend=result_backend,
+        # Task serialization
+        # NOTE(review): pickle is used for both tasks and results, and pickle
+        # content is accepted. Unpickling messages from a broker that others
+        # can write to can execute arbitrary code - confirm the broker is
+        # trusted, or consider switching to "json".
+        task_serializer="pickle",
+        accept_content=["pickle", "json"],
+        result_serializer="pickle",
+        # Timezone settings
+        timezone="UTC",
+        enable_utc=True,
+        # Task routing: maps task names to the queue they are sent to
+        task_routes={
+            "quality_validator.validate_task": {"queue": "quality_validation"},
+            "data_processor.process_task": {"queue": "data_processing"},
+            "model_trainer.train_task": {"queue": "model_training"},
+            "backup.backup_task": {"queue": "backup"},
+        },
+        # Queue configuration
+        # NOTE(review): "high_priority" is declared below but no entry in
+        # task_routes targets it - presumably callers select it explicitly.
+        task_default_queue="default",
+        task_queues=(
+            Queue("default", Exchange("default"), routing_key="default"),
+            Queue(
+                "quality_validation",
+                Exchange("quality"),
+                routing_key="quality.validation",
+            ),
+            Queue("data_processing", Exchange("data"), routing_key="data.processing"),
+            Queue("model_training", Exchange("training"), routing_key="training.model"),
+            Queue("backup", Exchange("backup"), routing_key="backup.task"),
+            Queue("high_priority", Exchange("priority"), routing_key="priority.high"),
+        ),
+        # Worker settings
+        worker_prefetch_multiplier=1,
+        task_acks_late=True,
+        worker_max_tasks_per_child=1000,
+        # Task execution settings (limits are in seconds)
+        task_soft_time_limit=300,  # 5 minutes
+        task_time_limit=600,  # 10 minutes
+        task_reject_on_worker_lost=True,
+        # Result settings
+        result_expires=3600,  # 1 hour
+        # Monitoring
+        worker_send_task_events=True,
+        task_send_sent_event=True,
+        # Rate limits, keyed by task name; per the Celery docs the "*" entry
+        # applies to every task
+        task_annotations={
+            "*": {"rate_limit": "100/m"},
+            "quality_validator.validate_task": {"rate_limit": "50/m"},
+            "model_trainer.train_task": {"rate_limit": "5/m"},
+        },
+        # Beat schedule (for periodic tasks); "schedule" values are in seconds
+        beat_schedule={
+            "cleanup-old-results": {
+                "task": "maintenance.cleanup_old_results",
+                "schedule": 3600.0,  # Every hour
+            },
+            "health-check": {
+                "task": "monitoring.health_check",
+                "schedule": 300.0,  # Every 5 minutes
+            },
+            "backup-data": {
+                "task": "backup.backup_task",
+                "schedule": 86400.0,  # Daily
+                "kwargs": {"backup_type": "incremental"},
+            },
+        },
+    )
+    return app
+# Create the Celery app instance at import time so other modules can import
+# `celery_app` from this module.
+celery_app = create_celery_app()
+# Task discovery
+# NOTE(review): per the Celery docs, autodiscover_tasks treats each entry as a
+# package and by default looks for a "tasks" submodule inside it
+# (related_name="tasks"). Confirm these paths are packages with a tasks
+# module, otherwise the tasks may not be registered.
+celery_app.autodiscover_tasks(
+    [
+        "distributed_processing.quality_validator",
+        "distributed_processing.data_processor",
+        "distributed_processing.model_trainer",
+        "distributed_processing.backup_manager",
+        "distributed_processing.monitoring",
+    ]
+)
+# Allow running this module directly as a script to start Celery.
+if __name__ == "__main__":
+    celery_app.start()

configs/stage_configs/check_config.sh ADDED Viewed

File without changes

configs/stage_configs/checkpoint_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "checkpoint_config": {
+    "save_steps": 506,
+    "save_total_limit": 5,
+    "output_dir": "./checkpoints",
+    "resume_from_checkpoint": true,
+    "auto_find_batch_size": false
+  },
+  "backup_strategy": {
+    "local_backup": true,
+    "cloud_backup": false,
+    "backup_frequency": "every_checkpoint",
+    "backup_location": "./backups"
+  },
+  "recovery_points": [
+    {
+      "step": 1013,
+      "description": "10% complete"
+    },
+    {
+      "step": 2532,
+      "description": "25% complete"
+    },
+    {
+      "step": 5065,
+      "description": "50% complete"
+    },
+    {
+      "step": 7597,
+      "description": "75% complete"
+    },
+    {
+      "step": 9117,
+      "description": "90% complete"
+    }
+  ],
+  "monitoring": {
+    "track_loss": true,
+    "track_learning_rate": true,
+    "track_memory_usage": true,
+    "alert_on_divergence": true,
+    "loss_spike_threshold": 2.0
+  }
+}

configs/stage_configs/checkpoint_requirements.txt ADDED Viewed

	@@ -0,0 +1,45 @@

+# Checkpoint System Requirements
+# Install with: pip install -r checkpoint_requirements.txt
+# Core system monitoring
+psutil>=5.8.0
+# Built-in Python modules (listed for reference)
+# asyncio - Built-in Python 3.7+
+# sqlite3 - Built-in
+# pickle - Built-in
+# gzip - Built-in
+# json - Built-in
+# hashlib - Built-in
+# threading - Built-in
+# pathlib - Built-in
+# shutil - Built-in
+# tempfile - Built-in
+# uuid - Built-in
+# time - Built-in
+# datetime - Built-in
+# logging - Built-in
+# os - Built-in
+# dataclasses - Built-in Python 3.7+
+# enum - Built-in
+# typing - Built-in Python 3.5+
+# Optional: Enhanced features
+# redis>=4.0.0          # For distributed checkpoint coordination
+# cryptography>=3.0.0   # For checkpoint encryption
+# lz4>=3.0.0           # For faster compression alternative
+# msgpack>=1.0.0       # For more efficient serialization
+# Development and testing
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+coverage>=6.0.0
+# Code quality
+black>=22.0.0
+flake8>=5.0.0
+mypy>=0.991
+# Documentation
+sphinx>=4.0.0          # For generating documentation
+sphinx-rtd-theme>=1.0.0

configs/stage_configs/claude_assessment.json ADDED Viewed

File without changes

configs/stage_configs/cli_config.py ADDED Viewed

	@@ -0,0 +1,232 @@

+"""
+Configuration management for CLI.
+"""
+import json
+import os
+from copy import deepcopy
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+try:
+    import yaml
+    YAML_AVAILABLE = True
+except ImportError:
+    YAML_AVAILABLE = False
+    yaml = None  # type: ignore
+class ConfigManager:
+    """Manages configuration for the research system."""
+    DEFAULT_CONFIG_PATH = Path.home() / ".journal_research" / "config.yaml"
+    DEFAULT_CONFIG = {
+        "orchestrator": {
+            "max_retries": 3,
+            "retry_delay_seconds": 1.0,
+            "progress_history_limit": 100,
+            "parallel_evaluation": False,
+            "parallel_integration_planning": False,
+            "max_workers": 4,
+            "session_storage_path": None,
+            "visualization_max_points": 100,
+            "fallback_on_failure": True,
+        },
+        "discovery": {
+            "pubmed": {
+                "api_key": None,
+                "base_url": "https://eutils.ncbi.nlm.nih.gov/entrez/eutils",
+                "search_limit": 100,
+            },
+            "doaj": {
+                "base_url": "https://doaj.org/api/v2",
+            },
+            "repositories": {
+                "dryad": {"base_url": "https://datadryad.org/api/v2"},
+                "zenodo": {"base_url": "https://zenodo.org/api"},
+                "clinical_trials": {"base_url": "https://clinicaltrials.gov/api/v2"},
+            },
+        },
+        "evaluation": {
+            "therapeutic_relevance_weight": 0.35,
+            "data_structure_quality_weight": 0.25,
+            "training_integration_weight": 0.20,
+            "ethical_accessibility_weight": 0.20,
+            "high_priority_threshold": 7.5,
+            "medium_priority_threshold": 5.0,
+        },
+        "acquisition": {
+            "storage_base_path": "data/acquired_datasets",
+            "encryption_enabled": False,
+            "download_timeout": 3600,
+            "max_retries": 3,
+            "chunk_size": 8192,
+            "resume_downloads": True,
+        },
+        "integration": {
+            "target_format": "chatml",
+            "default_complexity": "medium",
+        },
+        "logging": {
+            "level": "INFO",
+            "file": None,
+            "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        },
+    }
+    def __init__(self, config_path: Optional[Union[Path, str]] = None):
+        """Initialize config manager with optional config path."""
+        # Convert string to Path if needed
+        if config_path is not None and isinstance(config_path, str):
+            config_path = Path(config_path)
+        self.config_path = config_path or self.DEFAULT_CONFIG_PATH
+        # Only create parent directory if it's writable
+        try:
+            self.config_path.parent.mkdir(parents=True, exist_ok=True)
+        except (PermissionError, OSError):
+            # If we can't create the directory, that's okay - we'll handle it in load/save
+            pass
+    def load(self) -> Dict[str, Any]:
+        """Load configuration from file or return defaults."""
+        if self.config_path.exists():
+            try:
+                with open(self.config_path, "r") as f:
+                    if YAML_AVAILABLE and self.config_path.suffix in (".yaml", ".yml"):
+                        assert yaml is not None  # Type guard for type checker
+                        config = yaml.safe_load(f) or {}
+                    else:
+                        # Fall back to JSON
+                        config = json.load(f) or {}
+                # Merge with defaults to ensure all keys exist
+                merged = self._merge_config(self.DEFAULT_CONFIG, config)
+                return self._apply_legacy_aliases(merged)
+            except Exception as e:
+                print(f"Warning: Could not load config from {self.config_path}: {e}")
+                return self._apply_legacy_aliases(deepcopy(self.DEFAULT_CONFIG))
+        return self._apply_legacy_aliases(deepcopy(self.DEFAULT_CONFIG))
+    def save(self, config: Dict[str, Any]) -> None:
+        """Save configuration to file."""
+        self.config_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.config_path, "w") as f:
+            if YAML_AVAILABLE and self.config_path.suffix in (".yaml", ".yml"):
+                assert yaml is not None  # Type guard for type checker
+                yaml.dump(config, f, default_flow_style=False, indent=2)
+            else:
+                # Fall back to JSON
+                json.dump(config, f, indent=2)
+    def get(self, key_path: str, default: Any = None) -> Any:
+        """Get a configuration value by dot-separated path."""
+        config = self.load()
+        keys = key_path.split(".")
+        value = config
+        for key in keys:
+            if isinstance(value, dict) and key in value:
+                value = value[key]
+            else:
+                return default
+        return value
+    def set(self, key_path: str, value: Any) -> None:
+        """Set a configuration value by dot-separated path."""
+        config = self.load()
+        keys = key_path.split(".")
+        target = config
+        for key in keys[:-1]:
+            if key not in target:
+                target[key] = {}
+            target = target[key]
+        target[keys[-1]] = value
+        self.save(config)
+    def _merge_config(self, default: Dict[str, Any], user: Dict[str, Any]) -> Dict[str, Any]:
+        """Recursively merge user config into default config."""
+        result = default.copy()
+        for key, value in user.items():
+            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
+                result[key] = self._merge_config(result[key], value)
+            else:
+                result[key] = value
+        return result
+    def _apply_legacy_aliases(self, config: Dict[str, Any]) -> Dict[str, Any]:
+        """Ensure legacy top-level aliases exist for backward compatibility."""
+        # Maintain top-level storage_base_path alias
+        acquisition_config = config.get("acquisition", {})
+        storage_base_path = acquisition_config.get("storage_base_path")
+        if storage_base_path and "storage_base_path" not in config:
+            config["storage_base_path"] = storage_base_path
+        # Maintain top-level logging directory alias
+        logging_config = config.get("logging", {})
+        log_file = logging_config.get("file")
+        if log_file and "log_file" not in config:
+            config["log_file"] = log_file
+        return config
+    def load_env_overrides(self) -> Dict[str, Any]:
+        """Load configuration overrides from environment variables."""
+        overrides = {}
+        env_prefix = "JOURNAL_RESEARCH_"
+        # Map environment variables to config paths
+        env_mappings = {
+            "PUBMED_API_KEY": "discovery.pubmed.api_key",
+            "STORAGE_PATH": "acquisition.storage_base_path",
+            "LOG_LEVEL": "logging.level",
+            "MAX_RETRIES": "orchestrator.max_retries",
+            "MAX_WORKERS": "orchestrator.max_workers",
+        }
+        for env_var, config_path in env_mappings.items():
+            env_key = env_prefix + env_var
+            if env_key in os.environ:
+                overrides[config_path] = os.environ[env_key]
+        return overrides
+    def apply_env_overrides(self, config: Dict[str, Any]) -> Dict[str, Any]:
+        """Apply environment variable overrides to config."""
+        overrides = self.load_env_overrides()
+        for key_path, value in overrides.items():
+            keys = key_path.split(".")
+            target = config
+            for key in keys[:-1]:
+                if key not in target:
+                    target[key] = {}
+                target = target[key]
+            target[keys[-1]] = value
+        return config
+# Global config manager instance for the default config path.
+# NOTE: it is constructed at import time, so ConfigManager.__init__ runs on
+# import and attempts to create the parent directory of the default path.
+_config_manager = ConfigManager()
+def load_config(config_path: Optional[Union[Path, str]] = None) -> Dict[str, Any]:
+    """Load configuration with environment overrides."""
+    # Convert string to Path if needed
+    if config_path is not None and isinstance(config_path, str):
+        config_path = Path(config_path)
+    manager = ConfigManager(config_path) if config_path else _config_manager
+    config = manager.load()
+    config = manager.apply_env_overrides(config)
+    return config
+def save_config(config: Dict[str, Any], config_path: Optional[Union[Path, str]] = None) -> None:
+    """Save configuration to file."""
+    # Convert string to Path if needed
+    if config_path is not None and isinstance(config_path, str):
+        config_path = Path(config_path)
+    manager = ConfigManager(config_path) if config_path else _config_manager
+    manager.save(config)
+def get_config_value(key_path: str, default: Any = None) -> Any:
+    """Get a configuration value by dot-separated path."""
+    return _config_manager.get(key_path, default)

configs/stage_configs/complexity_config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "beginner": {
+    "level": "Beginner",
+    "weight": 0.4,
+    "min_samples": 1000,
+    "max_samples": 15000,
+    "complexity_range": [
+      0.0,
+      0.4
+    ],
+    "characteristics": [
+      "Simple emotional expressions",
+      "Basic therapeutic techniques",
+      "Clear, straightforward issues",
+      "Single-topic focus",
+      "Minimal comorbidity",
+      "Standard interventions"
+    ]
+  },
+  "intermediate": {
+    "level": "Intermediate",
+    "weight": 0.45,
+    "min_samples": 1200,
+    "max_samples": 18000,
+    "complexity_range": [
+      0.4,
+      0.7
+    ],
+    "characteristics": [
+      "Moderate emotional intensity",
+      "Multiple therapeutic techniques",
+      "Interconnected issues",
+      "Some comorbidity",
+      "Nuanced interventions",
+      "Relationship dynamics"
+    ]
+  },
+  "advanced": {
+    "level": "Advanced",
+    "weight": 0.15,
+    "min_samples": 400,
+    "max_samples": 6000,
+    "complexity_range": [
+      0.7,
+      1.0
+    ],
+    "characteristics": [
+      "High emotional intensity",
+      "Complex therapeutic approaches",
+      "Multiple interconnected issues",
+      "Significant comorbidity",
+      "Crisis intervention elements",
+      "Advanced clinical skills required"
+    ]
+  }
+}

configs/stage_configs/comprehensive_integration_summary.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "integration_complete": true,
+  "all_components_integrated": true,
+  "components_integrated": [
+    "long_term_journaling_system",
+    "tri_expert_voice_blending",
+    "edge_case_scenarios",
+    "dual_persona_dynamics",
+    "bias_detection_validation",
+    "psychology_knowledge_base"
+  ],
+  "datasets": {
+    "journaling_enhanced": 39,
+    "voice_enhanced": 0,
+    "edge_cases": 5,
+    "dual_persona": 75,
+    "bias_validated": 10,
+    "psychology_kb_enhanced": 5,
+    "master_integrated": 0,
+    "total_datasets": 134
+  },
+  "expert_voices": [
+    "Tim Ferriss",
+    "Gabor Mat\u00e9",
+    "Bren\u00e9 Brown"
+  ],
+  "psychology_concepts": 4867,
+  "bias_categories_checked": 5,
+  "therapeutic_modalities": 6,
+  "kan_28_status": "FULLY_SOLVED",
+  "integration_timestamp": "2024-10-28"
+}

configs/stage_configs/condition_config.json ADDED Viewed

	@@ -0,0 +1,460 @@

+{
+  "depression": {
+    "name": "Major Depressive Disorder",
+    "prevalence": 0.084,
+    "min_samples": 500,
+    "max_samples": 8000,
+    "aliases": [
+      "depression",
+      "depressed",
+      "major depression",
+      "mdd",
+      "sad",
+      "sadness"
+    ],
+    "comorbid_conditions": [
+      "anxiety",
+      "ptsd",
+      "substance_abuse"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "anxiety": {
+    "name": "Generalized Anxiety Disorder",
+    "prevalence": 0.031,
+    "min_samples": 400,
+    "max_samples": 6000,
+    "aliases": [
+      "anxiety",
+      "anxious",
+      "gad",
+      "worry",
+      "worried",
+      "panic"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "ptsd",
+      "ocd"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "ptsd": {
+    "name": "Post-Traumatic Stress Disorder",
+    "prevalence": 0.037,
+    "min_samples": 300,
+    "max_samples": 4000,
+    "aliases": [
+      "ptsd",
+      "trauma",
+      "traumatic",
+      "flashback",
+      "nightmares"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "substance_abuse"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "bipolar": {
+    "name": "Bipolar Disorder",
+    "prevalence": 0.028,
+    "min_samples": 250,
+    "max_samples": 3000,
+    "aliases": [
+      "bipolar",
+      "manic",
+      "mania",
+      "mood swings",
+      "hypomania"
+    ],
+    "comorbid_conditions": [
+      "anxiety",
+      "substance_abuse",
+      "adhd"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "adhd": {
+    "name": "Attention-Deficit/Hyperactivity Disorder",
+    "prevalence": 0.041,
+    "min_samples": 300,
+    "max_samples": 4000,
+    "aliases": [
+      "adhd",
+      "add",
+      "attention deficit",
+      "hyperactive",
+      "inattentive"
+    ],
+    "comorbid_conditions": [
+      "anxiety",
+      "depression",
+      "bipolar"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "ocd": {
+    "name": "Obsessive-Compulsive Disorder",
+    "prevalence": 0.012,
+    "min_samples": 150,
+    "max_samples": 2000,
+    "aliases": [
+      "ocd",
+      "obsessive",
+      "compulsive",
+      "intrusive thoughts",
+      "rituals"
+    ],
+    "comorbid_conditions": [
+      "anxiety",
+      "depression",
+      "tics"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "autism": {
+    "name": "Autism Spectrum Disorder",
+    "prevalence": 0.016,
+    "min_samples": 200,
+    "max_samples": 2500,
+    "aliases": [
+      "autism",
+      "asd",
+      "asperger",
+      "autistic",
+      "spectrum"
+    ],
+    "comorbid_conditions": [
+      "anxiety",
+      "depression",
+      "adhd"
+    ],
+    "severity_levels": [
+      "level 1",
+      "level 2",
+      "level 3"
+    ]
+  },
+  "bpd": {
+    "name": "Borderline Personality Disorder",
+    "prevalence": 0.014,
+    "min_samples": 150,
+    "max_samples": 2000,
+    "aliases": [
+      "bpd",
+      "borderline",
+      "personality disorder",
+      "emotional dysregulation"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "ptsd",
+      "substance_abuse"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "schizophrenia": {
+    "name": "Schizophrenia",
+    "prevalence": 0.011,
+    "min_samples": 100,
+    "max_samples": 1500,
+    "aliases": [
+      "schizophrenia",
+      "psychosis",
+      "hallucinations",
+      "delusions"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "substance_abuse"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "eating_disorders": {
+    "name": "Eating Disorders",
+    "prevalence": 0.009,
+    "min_samples": 100,
+    "max_samples": 1500,
+    "aliases": [
+      "anorexia",
+      "bulimia",
+      "binge eating",
+      "eating disorder",
+      "body image"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "ocd"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "substance_abuse": {
+    "name": "Substance Use Disorders",
+    "prevalence": 0.104,
+    "min_samples": 400,
+    "max_samples": 6000,
+    "aliases": [
+      "addiction",
+      "substance abuse",
+      "alcoholism",
+      "drug abuse",
+      "dependency"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "ptsd",
+      "bipolar"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "social_anxiety": {
+    "name": "Social Anxiety Disorder",
+    "prevalence": 0.073,
+    "min_samples": 300,
+    "max_samples": 4000,
+    "aliases": [
+      "social anxiety",
+      "social phobia",
+      "shy",
+      "shyness",
+      "social fear"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "avoidant_personality"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "panic_disorder": {
+    "name": "Panic Disorder",
+    "prevalence": 0.028,
+    "min_samples": 200,
+    "max_samples": 3000,
+    "aliases": [
+      "panic disorder",
+      "panic attacks",
+      "agoraphobia",
+      "panic"
+    ],
+    "comorbid_conditions": [
+      "anxiety",
+      "depression",
+      "substance_abuse"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "insomnia": {
+    "name": "Insomnia and Sleep Disorders",
+    "prevalence": 0.06,
+    "min_samples": 250,
+    "max_samples": 3500,
+    "aliases": [
+      "insomnia",
+      "sleep disorder",
+      "sleepless",
+      "sleep problems"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "bipolar"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "chronic_pain": {
+    "name": "Chronic Pain and Mental Health",
+    "prevalence": 0.05,
+    "min_samples": 200,
+    "max_samples": 3000,
+    "aliases": [
+      "chronic pain",
+      "fibromyalgia",
+      "pain",
+      "chronic illness"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "ptsd"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "grief": {
+    "name": "Grief and Bereavement",
+    "prevalence": 0.035,
+    "min_samples": 150,
+    "max_samples": 2500,
+    "aliases": [
+      "grief",
+      "bereavement",
+      "loss",
+      "mourning",
+      "death"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "ptsd"
+    ],
+    "severity_levels": [
+      "normal",
+      "complicated",
+      "prolonged"
+    ]
+  },
+  "relationship_issues": {
+    "name": "Relationship and Interpersonal Issues",
+    "prevalence": 0.08,
+    "min_samples": 300,
+    "max_samples": 4500,
+    "aliases": [
+      "relationship",
+      "marriage",
+      "divorce",
+      "breakup",
+      "interpersonal"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "attachment_issues"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "work_stress": {
+    "name": "Work-Related Stress and Burnout",
+    "prevalence": 0.07,
+    "min_samples": 250,
+    "max_samples": 3500,
+    "aliases": [
+      "work stress",
+      "burnout",
+      "job stress",
+      "workplace",
+      "career"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "insomnia"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "parenting_stress": {
+    "name": "Parenting Stress and Family Issues",
+    "prevalence": 0.045,
+    "min_samples": 200,
+    "max_samples": 3000,
+    "aliases": [
+      "parenting",
+      "family stress",
+      "child behavior",
+      "parental stress"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "relationship_issues"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  },
+  "loneliness": {
+    "name": "Loneliness and Social Isolation",
+    "prevalence": 0.055,
+    "min_samples": 200,
+    "max_samples": 3000,
+    "aliases": [
+      "loneliness",
+      "lonely",
+      "isolated",
+      "social isolation",
+      "alone"
+    ],
+    "comorbid_conditions": [
+      "depression",
+      "anxiety",
+      "social_anxiety"
+    ],
+    "severity_levels": [
+      "mild",
+      "moderate",
+      "severe"
+    ]
+  }
+}

configs/stage_configs/config.py ADDED Viewed

	@@ -0,0 +1,53 @@

+"""Configuration for NVIDIA NeMo Data Designer service."""
+import os
+from dataclasses import dataclass
+from typing import Optional
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except ImportError:
+    pass  # dotenv is optional
+@dataclass
+class DataDesignerConfig:
+    """Configuration for NeMo Data Designer client."""
+    base_url: str = "http://localhost:8000"  # For local Docker Compose, use http://localhost:8000
+    api_key: Optional[str] = None
+    timeout: int = 300  # 5 minutes default timeout
+    max_retries: int = 3
+    batch_size: int = 1000
+    @classmethod
+    def from_env(cls) -> "DataDesignerConfig":
+        """Create configuration from environment variables."""
+        return cls(
+            base_url=os.getenv(
+                "NEMO_DATA_DESIGNER_BASE_URL",
+                "http://localhost:8000",
+            ),
+            api_key=os.getenv("NVIDIA_API_KEY"),
+            timeout=int(os.getenv("NEMO_DATA_DESIGNER_TIMEOUT", "300")),
+            max_retries=int(os.getenv("NEMO_DATA_DESIGNER_MAX_RETRIES", "3")),
+            batch_size=int(os.getenv("NEMO_DATA_DESIGNER_BATCH_SIZE", "1000")),
+        )
+    def validate(self) -> None:
+        """Validate configuration."""
+        if not self.api_key:
+            raise ValueError(
+                "NVIDIA_API_KEY environment variable is required. "
+                "Get your API key from https://build.nvidia.com/nemo/data-designer"
+            )
+        if not self.base_url:
+            raise ValueError("base_url cannot be empty")
+        if self.timeout <= 0:
+            raise ValueError("timeout must be positive")
+        if self.max_retries < 0:
+            raise ValueError("max_retries must be non-negative")
+        if self.batch_size <= 0:
+            raise ValueError("batch_size must be positive")

configs/stage_configs/config_example.py ADDED Viewed

File without changes

configs/stage_configs/config_lock.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "created_at": "2025-11-30T18:08:28.603381Z",
+  "git_info": {
+    "commit_sha": "46b7965d54d4ccfc0d018ace10b0724a9246bef3",
+    "commit_message": "chore: update pnpm version across configurations and workflows\n\n- Upgraded pnpm version from 10.23.0 to 10.24.0 in various configuration files, including .gitlab-ci.yml, Dockerfile, and multiple GitHub workflows.\n- Ensured consistency in pnpm version across package.json, documentation, and scripts to maintain compatibility and improve functionality.\n- Removed obsolete files related to Azure Pipelines diagnostics and remaining fixes plan as they are no longer needed.",
+    "branch": "master",
+    "is_dirty": true,
+    "remote_url": "git@github.com:pixelatedempathy/pixelated.git"
+  },
+  "random_seed": 42,
+  "config_snapshot": {
+    "target_samples": 50,
+    "pipeline_config": {
+      "edge_cases": {
+        "enabled": true,
+        "target_percentage": 0.25
+      },
+      "pixel_voice": {
+        "enabled": true,
+        "target_percentage": 0.2
+      },
+      "psychology_knowledge": {
+        "enabled": true,
+        "target_percentage": 0.15
+      },
+      "dual_persona": {
+        "enabled": true,
+        "target_percentage": 0.1
+      },
+      "standard_therapeutic": {
+        "enabled": true,
+        "target_percentage": 0.3
+      }
+    }
+  },
+  "python_version": "3.11.13 (main, Jun 12 2025, 12:41:02) [Clang 20.1.4 ]",
+  "platform": "Linux-6.14.0-27-generic-x86_64-with-glibc2.41",
+  "config_hash": "eb8ea2f72df5a68b"
+}

configs/stage_configs/config_lock.py ADDED Viewed

	@@ -0,0 +1,206 @@

+#!/usr/bin/env python3
+"""
+Configuration Locking System
+Freezes configuration, seeds, and git commit info for reproducibility
+"""
+import json
+import random
+import subprocess
+from dataclasses import dataclass, asdict, field
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, Dict, Any
+import hashlib
+@dataclass
+class GitInfo:
+    """Git repository information"""
+    commit_sha: str
+    commit_message: str
+    branch: str
+    is_dirty: bool
+    remote_url: Optional[str] = None
+    @classmethod
+    def capture(cls, repo_path: Optional[Path] = None) -> "GitInfo":
+        """Capture current git state"""
+        repo_path = repo_path or Path.cwd()
+        try:
+            # Get commit SHA
+            result = subprocess.run(
+                ["git", "rev-parse", "HEAD"],
+                cwd=repo_path,
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            commit_sha = result.stdout.strip()
+            # Get commit message
+            result = subprocess.run(
+                ["git", "log", "-1", "--pretty=%B"],
+                cwd=repo_path,
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            commit_message = result.stdout.strip()
+            # Get branch
+            result = subprocess.run(
+                ["git", "rev-parse", "--abbrev-ref", "HEAD"],
+                cwd=repo_path,
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            branch = result.stdout.strip()
+            # Check if working directory is dirty
+            result = subprocess.run(
+                ["git", "status", "--porcelain"],
+                cwd=repo_path,
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            is_dirty = len(result.stdout.strip()) > 0
+            # Get remote URL
+            remote_url = None
+            try:
+                result = subprocess.run(
+                    ["git", "config", "--get", "remote.origin.url"],
+                    cwd=repo_path,
+                    capture_output=True,
+                    text=True,
+                    check=True
+                )
+                remote_url = result.stdout.strip()
+            except subprocess.CalledProcessError:
+                pass
+            return cls(
+                commit_sha=commit_sha,
+                commit_message=commit_message,
+                branch=branch,
+                is_dirty=is_dirty,
+                remote_url=remote_url
+            )
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            # Git not available or not a git repo
+            return cls(
+                commit_sha="unknown",
+                commit_message="unknown",
+                branch="unknown",
+                is_dirty=False,
+                remote_url=None
+            )
+@dataclass
+class LockedConfig:
+    """Locked configuration with reproducibility info"""
+    # Timestamp
+    created_at: str
+    # Git information
+    git_info: GitInfo
+    # Random seed
+    random_seed: int
+    # Configuration snapshot
+    config_snapshot: Dict[str, Any]
+    # Environment info
+    python_version: str
+    platform: str
+    # Config hash for verification
+    config_hash: str = field(default="")
+    def __post_init__(self):
+        """Calculate config hash after initialization"""
+        if not self.config_hash:
+            # Create hash from config snapshot
+            config_str = json.dumps(self.config_snapshot, sort_keys=True)
+            self.config_hash = hashlib.sha256(config_str.encode()).hexdigest()[:16]
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary"""
+        return {
+            'created_at': self.created_at,
+            'git_info': asdict(self.git_info),
+            'random_seed': self.random_seed,
+            'config_snapshot': self.config_snapshot,
+            'python_version': self.python_version,
+            'platform': self.platform,
+            'config_hash': self.config_hash
+        }
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "LockedConfig":
+        """Create from dictionary"""
+        git_info = GitInfo(**data['git_info'])
+        return cls(
+            created_at=data['created_at'],
+            git_info=git_info,
+            random_seed=data['random_seed'],
+            config_snapshot=data['config_snapshot'],
+            python_version=data['python_version'],
+            platform=data['platform'],
+            config_hash=data.get('config_hash', '')
+        )
+    def save(self, path: Path) -> None:
+        """Save locked config to file"""
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(path, 'w') as f:
+            json.dump(self.to_dict(), f, indent=2)
+    @classmethod
+    def load(cls, path: Path) -> "LockedConfig":
+        """Load locked config from file"""
+        with open(path, 'r') as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+def lock_config(config: Dict[str, Any], seed: Optional[int] = None,
+                repo_path: Optional[Path] = None) -> LockedConfig:
+    """Lock a configuration with reproducibility info"""
+    import sys
+    import platform
+    # Generate seed if not provided
+    if seed is None:
+        seed = random.randint(0, 2**31 - 1)
+    # Set random seed
+    random.seed(seed)
+    # Capture git info
+    git_info = GitInfo.capture(repo_path)
+    # Create locked config
+    locked = LockedConfig(
+        created_at=datetime.utcnow().isoformat() + "Z",
+        git_info=git_info,
+        random_seed=seed,
+        config_snapshot=config,
+        python_version=sys.version,
+        platform=platform.platform()
+    )
+    return locked
+def apply_locked_config(locked_config: LockedConfig) -> None:
+    """Apply a locked configuration (set random seed)"""
+    random.seed(locked_config.random_seed)
+    # Note: Config snapshot should be applied by the caller

configs/stage_configs/config_profiles.py ADDED Viewed

	@@ -0,0 +1,339 @@

+#!/usr/bin/env python3
+"""
+Training Configuration Profiles
+Maps stage configs and dataset profiles into concrete training data selections.
+Ensures default/prod profiles do not silently include edge/red-team profiles.
+"""
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Union
+from ..configs.stages import (
+    STAGE1_ID,
+    STAGE2_ID,
+    STAGE3_ID,
+    STAGE4_ID,
+)
+from ..utils.logger import get_logger
+logger = get_logger("dataset_pipeline.training.config_profiles")
+class TrainingProfile(Enum):
+    """Named training profiles that map to stages and dataset types.
+    Each member's value is the string key under which the profile is
+    registered in ``PROFILE_CONFIGS``.
+    """
+    FOUNDATION = "foundation"  # Stage 1: Foundation & Rapport
+    REASONING = "reasoning"  # Stage 2: Therapeutic Expertise & Reasoning
+    EDGE_CRISIS = "edge_crisis"  # Stage 3: Edge Stress Test & Scenario Bank
+    VOICE_PERSONA = "voice_persona"  # Stage 4: Voice, Persona & Delivery
+    PRODUCTION = "production"  # General-purpose production training (no edge)
+    RESEARCH = "research"  # Research/red-team profile (includes edge)
+@dataclass
+class ProfileConfig:
+    """Configuration for a training profile.
+    Describes which stages a profile draws data from and whether
+    edge/red-team datasets may be included.
+    """
+    profile_name: str  # Profile identifier (the TrainingProfile value in the predefined configs)
+    stage_ids: List[str]  # Which stages to include
+    allow_edge_profiles: bool  # Whether edge/red-team datasets are allowed
+    description: str  # Human-readable summary of the profile
+    metadata: Dict[str, Any] = field(default_factory=dict)  # Free-form extras; a fresh dict per instance
+# Predefined profile configurations, keyed by TrainingProfile value.
+# Only EDGE_CRISIS and RESEARCH include Stage 3 and set allow_edge_profiles=True;
+# every other profile leaves Stage 3 out and disallows edge/red-team datasets.
+PROFILE_CONFIGS: Dict[str, ProfileConfig] = {
+    TrainingProfile.FOUNDATION.value: ProfileConfig(
+        profile_name=TrainingProfile.FOUNDATION.value,
+        stage_ids=[STAGE1_ID],
+        allow_edge_profiles=False,
+        description="Foundation & Rapport training (Stage 1 only, no edge cases)",
+    ),
+    TrainingProfile.REASONING.value: ProfileConfig(
+        profile_name=TrainingProfile.REASONING.value,
+        stage_ids=[STAGE2_ID],
+        allow_edge_profiles=False,
+        description="Therapeutic Expertise & Reasoning training (Stage 2 only, no edge cases)",
+    ),
+    TrainingProfile.EDGE_CRISIS.value: ProfileConfig(
+        profile_name=TrainingProfile.EDGE_CRISIS.value,
+        stage_ids=[STAGE3_ID],
+        allow_edge_profiles=True,
+        description="Edge Stress Test & Scenario Bank (Stage 3, edge cases allowed)",
+    ),
+    TrainingProfile.VOICE_PERSONA.value: ProfileConfig(
+        profile_name=TrainingProfile.VOICE_PERSONA.value,
+        stage_ids=[STAGE4_ID],
+        allow_edge_profiles=False,
+        description="Voice, Persona & Delivery training (Stage 4 only, no edge cases)",
+    ),
+    TrainingProfile.PRODUCTION.value: ProfileConfig(
+        profile_name=TrainingProfile.PRODUCTION.value,
+        stage_ids=[STAGE1_ID, STAGE2_ID, STAGE4_ID],  # Explicitly exclude Stage 3
+        allow_edge_profiles=False,
+        description="General-purpose production training (Stages 1, 2, 4 - no edge cases)",
+    ),
+    TrainingProfile.RESEARCH.value: ProfileConfig(
+        profile_name=TrainingProfile.RESEARCH.value,
+        stage_ids=[STAGE1_ID, STAGE2_ID, STAGE3_ID, STAGE4_ID],  # All stages
+        allow_edge_profiles=True,
+        description="Research/red-team profile (all stages, edge cases allowed)",
+    ),
+}
class TrainingDataSelector:
    """
    Profile-aware data selector that ensures edge profiles are only used
    in appropriate training configurations.

    A manifest is either a mapping that stores its examples under an
    "examples" key (or, alternatively, a "data" key) or a bare list of
    examples. Each example is a dict; its optional "metadata" dict is
    consulted for "stage" and for the edge markers checked by
    ``_is_edge_example``.
    """

    def __init__(self, manifest_path: Optional[Union[str, Path]] = None):
        """
        Initialize the training data selector.

        Args:
            manifest_path: Optional path to dataset manifest
        """
        self.manifest_path = Path(manifest_path) if manifest_path else None

    def select_data(
        self,
        profile_name: str,
        manifest: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Iterable[Dict[str, Any]]:
        """
        Select training data based on profile configuration.

        This is a generator: nothing (including the unknown-profile check)
        runs until the result is first iterated.

        Args:
            profile_name: Name of the training profile
            manifest: Optional dataset manifest (if None, loads from manifest_path)

        Yields:
            Training examples matching the profile

        Raises:
            ValueError: If ``profile_name`` is not a key of PROFILE_CONFIGS.
        """
        if profile_name not in PROFILE_CONFIGS:
            raise ValueError(
                f"Unknown profile: {profile_name}. "
                f"Available profiles: {', '.join(PROFILE_CONFIGS.keys())}"
            )
        profile_config = PROFILE_CONFIGS[profile_name]
        logger.info(
            f"Selecting data for profile '{profile_name}': "
            f"stages={profile_config.stage_ids}, "
            f"allow_edge={profile_config.allow_edge_profiles}"
        )
        if manifest is None:
            manifest = self._load_manifest()

        for example in self._iterate_examples(manifest):
            # Stage filter first: examples from other stages are dropped silently.
            example_stage = self._example_metadata(example).get("stage")
            if example_stage not in profile_config.stage_ids:
                continue
            # Edge filter: only applies to profiles that forbid edge data.
            if not profile_config.allow_edge_profiles and self._is_edge_example(example):
                logger.warning(
                    f"Skipping edge example in non-edge profile '{profile_name}': "
                    f"{example.get('id', 'unknown')}"
                )
                continue
            yield example

    @staticmethod
    def _example_metadata(example: Dict[str, Any]) -> Dict[str, Any]:
        """Return the example's metadata dict, or {} when absent or not a dict."""
        metadata = example.get("metadata")
        # A JSON null (or any non-dict) would otherwise crash the .get() calls.
        return metadata if isinstance(metadata, dict) else {}

    def _is_edge_example(self, example: Dict[str, Any]) -> bool:
        """Check if an example is an edge/red-team example"""
        metadata = self._example_metadata(example)
        # The mere presence of either key marks the example, whatever its value.
        if "edge_profile" in metadata or "edge_category" in metadata:
            return True
        # Stage 3 is the edge stress test stage.
        if metadata.get("stage") == STAGE3_ID:
            return True
        # Crisis intensity flags
        return metadata.get("crisis_intensity") in ["very_high", "extreme"]

    def _load_manifest(self) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        """Load dataset manifest from ``manifest_path``.

        Returns an empty manifest (``{"examples": []}``) and logs a warning
        when no path was configured or the file does not exist.
        """
        if not self.manifest_path or not self.manifest_path.exists():
            logger.warning(
                f"Manifest not found at {self.manifest_path}, returning empty manifest"
            )
            return {"examples": []}
        import json

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(self.manifest_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _iterate_examples(
        self, manifest: Union[Dict[str, Any], List[Dict[str, Any]]]
    ) -> Iterable[Dict[str, Any]]:
        """Iterate over examples in manifest.

        Accepted shapes, in order of precedence: a bare list of examples,
        a mapping with a non-empty "examples" entry, a mapping with a "data"
        entry. Anything else yields nothing.
        """
        # The list check must come first: lists have no .get(), so probing
        # for keys beforehand would raise AttributeError.
        if isinstance(manifest, list):
            examples = manifest
        else:
            examples = manifest.get("examples") or manifest.get("data") or []
        yield from examples

    def assert_no_edge_in_profile(
        self,
        profile_name: str,
        manifest: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> None:
        """
        Assert that a profile does not contain edge examples.

        Profiles that allow edge data are skipped. For the others, every
        manifest example whose stage belongs to the profile is checked.

        Args:
            profile_name: Name of the profile to check
            manifest: Optional dataset manifest

        Raises:
            ValueError: If the profile is unknown, or if edge examples are found.
        """
        if profile_name not in PROFILE_CONFIGS:
            raise ValueError(f"Unknown profile: {profile_name}")
        profile_config = PROFILE_CONFIGS[profile_name]
        if profile_config.allow_edge_profiles:
            logger.info(
                f"Profile '{profile_name}' allows edge profiles, skipping assertion"
            )
            return
        if manifest is None:
            manifest = self._load_manifest()

        edge_examples = [
            example.get("id", "unknown")
            for example in self._iterate_examples(manifest)
            if self._example_metadata(example).get("stage") in profile_config.stage_ids
            and self._is_edge_example(example)
        ]
        if edge_examples:
            # Only the first five offending ids are listed to keep the message short.
            raise ValueError(
                f"Profile '{profile_name}' contains {len(edge_examples)} edge examples: "
                f"{edge_examples[:5]}{'...' if len(edge_examples) > 5 else ''}. "
                f"This profile does not allow edge/red-team data."
            )
        logger.info(f"Profile '{profile_name}' validated: no edge examples found")

    def get_profile_stats(
        self,
        profile_name: str,
        manifest: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, Any]:
        """
        Get statistics for a profile.

        Counts are computed over the examples ``select_data`` yields, so for
        profiles that forbid edge data ``edge_examples`` is always 0.

        Args:
            profile_name: Name of the profile
            manifest: Optional dataset manifest

        Returns:
            Statistics dictionary

        Raises:
            ValueError: If the profile is unknown.
        """
        if profile_name not in PROFILE_CONFIGS:
            raise ValueError(f"Unknown profile: {profile_name}")
        profile_config = PROFILE_CONFIGS[profile_name]
        if manifest is None:
            manifest = self._load_manifest()

        stats: Dict[str, Any] = {
            "profile_name": profile_name,
            # Copy so callers mutating the stats cannot alter the shared config.
            "stages": list(profile_config.stage_ids),
            "allow_edge_profiles": profile_config.allow_edge_profiles,
            "total_examples": 0,
            "by_stage": {},
            "edge_examples": 0,
            "non_edge_examples": 0,
        }
        for example in self.select_data(profile_name, manifest):
            stats["total_examples"] += 1
            example_stage = self._example_metadata(example).get("stage", "unknown")
            stats["by_stage"][example_stage] = (
                stats["by_stage"].get(example_stage, 0) + 1
            )
            if self._is_edge_example(example):
                stats["edge_examples"] += 1
            else:
                stats["non_edge_examples"] += 1
        return stats
def get_profile_config(profile_name: str) -> ProfileConfig:
    """Look up the predefined configuration registered under ``profile_name``.

    Raises:
        ValueError: If no such profile exists; the message lists the
            available profile names.
    """
    if profile_name in PROFILE_CONFIGS:
        return PROFILE_CONFIGS[profile_name]

    available = ', '.join(PROFILE_CONFIGS.keys())
    raise ValueError(
        f"Unknown profile: {profile_name}. "
        f"Available: {available}"
    )
def list_profiles() -> List[str]:
    """Return the names of every predefined training profile, in declaration order."""
    profile_names = [name for name in PROFILE_CONFIGS]
    return profile_names
def validate_profile_config(profile_name: str) -> tuple[bool, Optional[str]]:
    """
    Check a predefined profile for internal consistency.

    Checks run in this order and the first failure is reported: the profile
    exists, every stage id is one of the four known stages, and the
    production profile neither allows edge data nor includes Stage 3.

    Returns:
        Tuple of (is_valid, error_message); error_message is None when valid.
    """
    if profile_name not in PROFILE_CONFIGS:
        return False, f"Unknown profile: {profile_name}"
    config = PROFILE_CONFIGS[profile_name]

    # Every referenced stage must be one of the known stages.
    known_stages = {STAGE1_ID, STAGE2_ID, STAGE3_ID, STAGE4_ID}
    invalid_stages = [sid for sid in config.stage_ids if sid not in known_stages]
    if invalid_stages:
        return False, f"Invalid stage ID in profile: {invalid_stages[0]}"

    # Production-only guarantees: no edge data by flag or by stage.
    is_production = profile_name == TrainingProfile.PRODUCTION.value
    if is_production and config.allow_edge_profiles:
        return False, "Production profile must not allow edge profiles"
    if is_production and STAGE3_ID in config.stage_ids:
        return (
            False,
            "Production profile must not include Stage 3 (edge stress test)",
        )

    return True, None

configs/stage_configs/config_tracker.py ADDED Viewed

	@@ -0,0 +1,700 @@

+#!/usr/bin/env python3
+"""
+Configuration Change Tracking and Rollback System for Pixelated Empathy AI
+Tracks configuration changes and provides rollback capabilities
+"""
+import os
+import sys
+import json
+import yaml
+import hashlib
+import shutil
+import logging
+from typing import Dict, List, Any, Optional, Tuple
+from pathlib import Path
+from dataclasses import dataclass, asdict
+from datetime import datetime, timezone
+import subprocess
+import tempfile
+from contextlib import contextmanager
# Logging setup: INFO level on the root logger with a timestamped
# "time - logger - level - message" line format, plus this module's logger.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
@dataclass
class ConfigChange:
    """One recorded change to a single configuration file."""

    timestamp: str              # when the change was recorded
    change_id: str              # identifier of this change record
    file_path: str              # path of the affected file
    change_type: str            # 'create', 'update', 'delete'
    old_hash: Optional[str]     # hash of the content before the change, if any
    new_hash: Optional[str]     # hash of the content after the change, if any
    old_content: Optional[str]  # content before the change, if any
    new_content: Optional[str]  # content after the change, if any
    user: str                   # who made the change
    description: str            # free-form description
    environment: str            # environment the change applies to

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dictionary keyed by field name."""
        record: Dict[str, Any] = asdict(self)
        return record

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ConfigChange':
        """Build a record from a dictionary whose keys are the field names."""
        instance = cls(**data)
        return instance
@dataclass
class ConfigSnapshot:
    """A point-in-time record of the tracked configuration files."""

    snapshot_id: str          # identifier of this snapshot
    timestamp: str            # when the snapshot was taken
    description: str          # free-form description
    environment: str          # environment the snapshot belongs to
    files: Dict[str, str]     # file_path -> content_hash
    metadata: Dict[str, Any]  # additional information about the snapshot

    def to_dict(self) -> Dict[str, Any]:
        """Return the snapshot as a plain dictionary keyed by field name."""
        record: Dict[str, Any] = asdict(self)
        return record

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ConfigSnapshot':
        """Build a snapshot from a dictionary whose keys are the field names."""
        instance = cls(**data)
        return instance
class ConfigTracker:
    """Main configuration tracking system.

    Keeps two JSON stores inside ``tracking_dir``: ``changes.json`` (one
    record per tracked file change) and ``snapshots.json`` (one record per
    snapshot). File copies live under ``backups/`` (per change) and
    ``snapshots/<snapshot_id>/`` (per snapshot). Both kinds of record can be
    rolled back to.
    """

    # Stored in a snapshot's metadata when its backup directory mirrors the
    # directory layout of config_dir. Snapshots without this marker use the
    # older flat layout, where every file is stored under its basename.
    _RELATIVE_LAYOUT = 'relative'

    def __init__(self, config_dir: Optional[str] = None, tracking_dir: Optional[str] = None):
        """Create the tracker and its on-disk structure.

        Args:
            config_dir: Directory holding the configuration files; defaults
                to the directory containing this module.
            tracking_dir: Directory for tracking data; defaults to
                ``<config_dir>/.config_tracking``.
        """
        self.config_dir = Path(config_dir) if config_dir else Path(__file__).parent
        self.tracking_dir = Path(tracking_dir) if tracking_dir else self.config_dir / '.config_tracking'
        # parents=True so a tracking dir nested in not-yet-existing folders works.
        self.tracking_dir.mkdir(parents=True, exist_ok=True)
        for subdir in ('changes', 'snapshots', 'backups'):
            (self.tracking_dir / subdir).mkdir(exist_ok=True)
        self.changes_file = self.tracking_dir / 'changes.json'
        self.snapshots_file = self.tracking_dir / 'snapshots.json'
        # Initialize tracking files if they don't exist
        if not self.changes_file.exists():
            self._save_changes([])
        if not self.snapshots_file.exists():
            self._save_snapshots([])

    def track_change(self, file_path: str, change_type: str, description: str = "",
                     user: Optional[str] = None, environment: Optional[str] = None) -> str:
        """Track a configuration change.

        Content is read from the file as it is on disk at call time. For an
        'update' this means ``old_content`` and ``new_content`` are the same
        text: the tracker is never handed the previous version.

        Args:
            file_path: File that changed (resolved to an absolute path).
            change_type: 'create', 'update', 'delete' or 'rollback'.
            description: Free-form description of the change.
            user: Author; defaults to $USER, then $USERNAME, then 'unknown'.
            environment: Defaults to $ENVIRONMENT, then 'unknown'.

        Returns:
            The generated change ID.

        Raises:
            OSError: If an 'update' targets a file that cannot be read.
        """
        file_path = str(Path(file_path).resolve())
        change_id = self._generate_change_id()
        if user is None:
            user = os.getenv('USER') or os.getenv('USERNAME') or 'unknown'
        if environment is None:
            environment = os.getenv('ENVIRONMENT', 'unknown')

        old_content = old_hash = new_content = new_hash = None
        file_exists = Path(file_path).exists()

        if change_type == 'delete' and not file_exists:
            # The file is already gone; fall back to the last content this
            # tracker recorded for it so the deletion stays reversible.
            old_content = self._last_recorded_content(file_path)
            if old_content is not None:
                old_hash = self._hash_content(old_content)
        elif change_type in ('update', 'delete'):
            old_content, old_hash = self._get_file_content_and_hash(file_path)

        if change_type in ('create', 'update', 'rollback') and file_exists:
            if change_type == 'update':
                # Already read just above; the file is the only source.
                new_content, new_hash = old_content, old_hash
            else:
                new_content, new_hash = self._get_file_content_and_hash(file_path)

        change = ConfigChange(
            timestamp=datetime.now(timezone.utc).isoformat(),
            change_id=change_id,
            file_path=file_path,
            change_type=change_type,
            old_hash=old_hash,
            new_hash=new_hash,
            old_content=old_content,
            new_content=new_content,
            user=user,
            description=description,
            environment=environment
        )
        self._add_change(change)
        # "is not None" rather than truthiness: an empty file is still content.
        if change_type in ('update', 'delete') and old_content is not None:
            self._create_backup(file_path, change_id, old_content)
        logger.info(f"Tracked configuration change: {change_id} - {description}")
        return change_id

    def create_snapshot(self, description: str = "", environment: Optional[str] = None) -> str:
        """Create a configuration snapshot.

        Hashes every file found by ``_get_all_config_files`` and copies the
        files into ``snapshots/<snapshot_id>/``. Files that cannot be read as
        UTF-8 text are left out of the hash table with a warning.

        Returns:
            The generated snapshot ID.
        """
        if environment is None:
            environment = os.getenv('ENVIRONMENT', 'unknown')
        snapshot_id = self._generate_snapshot_id()
        config_files = self._get_all_config_files()
        files_dict = {}
        for file_path in config_files:
            try:
                _, file_hash = self._get_file_content_and_hash(file_path)
                files_dict[str(file_path)] = file_hash
            except (OSError, UnicodeError) as e:
                logger.warning(f"Could not include file in snapshot: {file_path} - {e}")
        snapshot = ConfigSnapshot(
            snapshot_id=snapshot_id,
            timestamp=datetime.now(timezone.utc).isoformat(),
            description=description,
            environment=environment,
            files=files_dict,
            metadata={
                'total_files': len(files_dict),
                'config_dir': str(self.config_dir),
                'backup_layout': self._RELATIVE_LAYOUT,
            }
        )
        self._add_snapshot(snapshot)
        self._create_snapshot_backup(snapshot_id, config_files)
        logger.info(f"Created configuration snapshot: {snapshot_id} - {description}")
        return snapshot_id

    def rollback_to_change(self, change_id: str) -> bool:
        """Undo a tracked change.

        A safety snapshot of the current state is taken first. A 'delete' is
        undone by rewriting the recorded old content. A 'create' or 'update'
        is undone by rewriting the old content, or by removing the file when
        no old content was recorded. The rollback is itself tracked.

        Returns:
            True on success; False if the change is unknown, cannot be
            undone, or any step raised.
        """
        target_change = None
        for change in self._load_changes():
            if change['change_id'] == change_id:
                target_change = ConfigChange.from_dict(change)
                break
        if not target_change:
            logger.error(f"Change not found: {change_id}")
            return False
        try:
            # Safety net: snapshot the current state before touching anything.
            self.create_snapshot(f"Pre-rollback backup for {change_id}")
            # Compare against None: an empty string is valid recorded content
            # and must be written back, not mistaken for "nothing recorded".
            has_old_content = target_change.old_content is not None
            if target_change.change_type == 'delete':
                if not has_old_content:
                    logger.error("Cannot restore deleted file - no backup content")
                    return False
                self._write_text(target_change.file_path, target_change.old_content)
                logger.info(f"Restored deleted file: {target_change.file_path}")
            elif target_change.change_type in ('create', 'update'):
                if has_old_content:
                    self._write_text(target_change.file_path, target_change.old_content)
                    logger.info(f"Rolled back file: {target_change.file_path}")
                elif Path(target_change.file_path).exists():
                    # Nothing existed before: undoing a create removes the file.
                    os.remove(target_change.file_path)
                    logger.info(f"Removed created file: {target_change.file_path}")
            self.track_change(
                target_change.file_path,
                'rollback',
                f"Rollback to change {change_id}",
                environment=target_change.environment
            )
            logger.info(f"Successfully rolled back to change: {change_id}")
            return True
        except Exception as e:
            # Boundary of the boolean-returning API: report and signal failure.
            logger.error(f"Rollback failed: {e}")
            return False

    def rollback_to_snapshot(self, snapshot_id: str) -> bool:
        """Restore every file recorded in a snapshot.

        A safety snapshot of the current state is taken first. Files created
        after the target snapshot are left in place. Each restored file is
        tracked as a 'rollback' change.

        Returns:
            True on success; False if the snapshot or its backup directory
            is missing, or any step raised.
        """
        target_snapshot = None
        for snapshot in self._load_snapshots():
            if snapshot['snapshot_id'] == snapshot_id:
                target_snapshot = ConfigSnapshot.from_dict(snapshot)
                break
        if not target_snapshot:
            logger.error(f"Snapshot not found: {snapshot_id}")
            return False
        try:
            self.create_snapshot(f"Pre-rollback backup for snapshot {snapshot_id}")
            snapshot_backup_dir = self.tracking_dir / 'snapshots' / snapshot_id
            if not snapshot_backup_dir.exists():
                logger.error(f"Snapshot backup directory not found: {snapshot_backup_dir}")
                return False
            metadata = target_snapshot.metadata or {}
            uses_relative_layout = metadata.get('backup_layout') == self._RELATIVE_LAYOUT
            restored_files = []
            for file_path in target_snapshot.files:
                if uses_relative_layout:
                    backup_file = snapshot_backup_dir / self._snapshot_relative_path(file_path)
                else:
                    # Older snapshots stored every file flat under its basename.
                    backup_file = snapshot_backup_dir / Path(file_path).name
                if backup_file.exists():
                    # The destination folder may have been removed since.
                    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(backup_file, file_path)
                    restored_files.append(file_path)
                    logger.info(f"Restored file: {file_path}")
                else:
                    logger.warning(f"Backup file not found: {backup_file}")
            for file_path in restored_files:
                self.track_change(
                    file_path,
                    'rollback',
                    f"Rollback to snapshot {snapshot_id}",
                    environment=target_snapshot.environment
                )
            logger.info(f"Successfully rolled back to snapshot: {snapshot_id}")
            return True
        except Exception as e:
            # Boundary of the boolean-returning API: report and signal failure.
            logger.error(f"Snapshot rollback failed: {e}")
            return False

    def get_change_history(self, file_path: Optional[str] = None,
                           limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Get change history, newest first.

        Args:
            file_path: If given, only changes to this (resolved) path.
            limit: If truthy, maximum number of records returned.
        """
        changes = self._load_changes()
        if file_path:
            file_path = str(Path(file_path).resolve())
            changes = [c for c in changes if c['file_path'] == file_path]
        changes.sort(key=lambda x: x['timestamp'], reverse=True)
        if limit:
            changes = changes[:limit]
        return changes

    def get_snapshots(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Get snapshot history, newest first, optionally capped at ``limit``."""
        snapshots = self._load_snapshots()
        snapshots.sort(key=lambda x: x['timestamp'], reverse=True)
        if limit:
            snapshots = snapshots[:limit]
        return snapshots

    def compare_configurations(self, snapshot_id1: str, snapshot_id2: str) -> Dict[str, Any]:
        """Compare two configuration snapshots by their recorded file hashes.

        Returns:
            Dict with both snapshots, per-category file lists under
            'differences' ('added', 'removed', 'modified', 'unchanged',
            relative to snapshot 1) and their counts under 'summary'.

        Raises:
            ValueError: If either snapshot ID is unknown.
        """
        snapshot1 = None
        snapshot2 = None
        for snapshot in self._load_snapshots():
            # Two independent checks so comparing a snapshot with itself works.
            if snapshot['snapshot_id'] == snapshot_id1:
                snapshot1 = ConfigSnapshot.from_dict(snapshot)
            if snapshot['snapshot_id'] == snapshot_id2:
                snapshot2 = ConfigSnapshot.from_dict(snapshot)
        if not snapshot1 or not snapshot2:
            raise ValueError("One or both snapshots not found")
        all_files = set(snapshot1.files.keys()) | set(snapshot2.files.keys())
        differences = {
            'added': [],
            'removed': [],
            'modified': [],
            'unchanged': []
        }
        for file_path in all_files:
            hash1 = snapshot1.files.get(file_path)
            hash2 = snapshot2.files.get(file_path)
            if hash1 and not hash2:
                differences['removed'].append(file_path)
            elif not hash1 and hash2:
                differences['added'].append(file_path)
            elif hash1 != hash2:
                differences['modified'].append(file_path)
            else:
                differences['unchanged'].append(file_path)
        return {
            'snapshot1': snapshot1.to_dict(),
            'snapshot2': snapshot2.to_dict(),
            'differences': differences,
            'summary': {
                'total_files': len(all_files),
                'added': len(differences['added']),
                'removed': len(differences['removed']),
                'modified': len(differences['modified']),
                'unchanged': len(differences['unchanged'])
            }
        }

    def cleanup_old_backups(self, days: int = 30) -> int:
        """Clean up old backups and snapshots.

        Removes change backups and snapshot backup directories whose
        modification time is older than ``days`` days. The records in
        ``changes.json`` / ``snapshots.json`` are kept, so rolling back to a
        snapshot whose directory was cleaned up fails.

        Returns:
            Number of files and directories removed.
        """
        cutoff_time = datetime.now(timezone.utc).timestamp() - (days * 24 * 60 * 60)
        cleaned_count = 0
        backup_dir = self.tracking_dir / 'backups'
        if backup_dir.exists():
            for backup_file in backup_dir.iterdir():
                # unlink() cannot remove directories; only plain files qualify.
                if backup_file.is_file() and backup_file.stat().st_mtime < cutoff_time:
                    backup_file.unlink()
                    cleaned_count += 1
        snapshot_dir = self.tracking_dir / 'snapshots'
        if snapshot_dir.exists():
            for snapshot_backup in snapshot_dir.iterdir():
                if snapshot_backup.is_dir() and snapshot_backup.stat().st_mtime < cutoff_time:
                    shutil.rmtree(snapshot_backup)
                    cleaned_count += 1
        logger.info(f"Cleaned up {cleaned_count} old backup files")
        return cleaned_count

    def export_tracking_data(self, output_file: str) -> bool:
        """Export all change and snapshot records to a JSON file.

        Returns:
            True on success, False if anything raised.
        """
        try:
            export_data = {
                'export_timestamp': datetime.now(timezone.utc).isoformat(),
                'config_dir': str(self.config_dir),
                'changes': self._load_changes(),
                'snapshots': self._load_snapshots()
            }
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(export_data, f, indent=2)
            logger.info(f"Exported tracking data to: {output_file}")
            return True
        except Exception as e:
            logger.error(f"Export failed: {e}")
            return False

    def import_tracking_data(self, input_file: str) -> bool:
        """Replace the tracking records with those from an export file.

        The current records are first exported to a timestamped
        ``backup_*.json`` inside the tracking directory. Nothing is
        overwritten if that backup cannot be written.

        Returns:
            True on success, False if the file is invalid or anything raised.
        """
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                import_data = json.load(f)
            if (
                not isinstance(import_data, dict)
                or not isinstance(import_data.get('changes'), list)
                or not isinstance(import_data.get('snapshots'), list)
            ):
                raise ValueError("Invalid import data format")
            backup_file = self.tracking_dir / f"backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            if not self.export_tracking_data(str(backup_file)):
                # Never destroy the current records without a safety copy.
                raise RuntimeError(f"Could not back up current tracking data to {backup_file}")
            self._save_changes(import_data['changes'])
            self._save_snapshots(import_data['snapshots'])
            logger.info(f"Imported tracking data from: {input_file}")
            return True
        except Exception as e:
            logger.error(f"Import failed: {e}")
            return False

    @contextmanager
    def track_changes(self, description: str = "Batch configuration changes"):
        """Context manager for tracking multiple changes.

        Takes a snapshot on entry and another on successful exit. If the
        body (or the final snapshot) raises an ``Exception``, the initial
        snapshot is rolled back to and the exception is re-raised.
        """
        initial_snapshot = self.create_snapshot(f"Pre-change snapshot: {description}")
        try:
            yield
            final_snapshot = self.create_snapshot(f"Post-change snapshot: {description}")
            logger.info(f"Tracked batch changes: {description}")
            logger.info(f"Initial snapshot: {initial_snapshot}")
            logger.info(f"Final snapshot: {final_snapshot}")
        except Exception as e:
            logger.error(f"Error during tracked changes: {e}")
            logger.info(f"Rolling back to initial snapshot: {initial_snapshot}")
            self.rollback_to_snapshot(initial_snapshot)
            raise

    @staticmethod
    def _generate_id(prefix: str) -> str:
        """Return ``<prefix>_<YYYYmmdd_HHMMSS>_<8 random hex chars>``."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # 4 random bytes render as 8 hex characters.
        return f"{prefix}_{timestamp}_{os.urandom(4).hex()}"

    def _generate_change_id(self) -> str:
        """Generate unique change ID"""
        return self._generate_id('change')

    def _generate_snapshot_id(self) -> str:
        """Generate unique snapshot ID"""
        return self._generate_id('snapshot')

    @staticmethod
    def _hash_content(content: str) -> str:
        """Return the SHA-256 hex digest of the UTF-8 encoding of ``content``."""
        return hashlib.sha256(content.encode('utf-8')).hexdigest()

    def _get_file_content_and_hash(self, file_path: str) -> Tuple[str, str]:
        """Read a UTF-8 text file and return ``(content, sha256_hex)``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        return content, self._hash_content(content)

    def _last_recorded_content(self, file_path: str) -> Optional[str]:
        """Return ``new_content`` of the latest change recorded for ``file_path``.

        Returns None when the file has no history or its latest record
        carries no new content.
        """
        for record in reversed(self._load_changes()):
            if record.get('file_path') == file_path:
                return record.get('new_content')
        return None

    @staticmethod
    def _write_text(file_path: str, content: str) -> None:
        """Write ``content`` to ``file_path`` as UTF-8, creating parent folders."""
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    def _get_all_config_files(self) -> List[Path]:
        """Get all configuration files.

        Searches ``config_dir`` and its immediate, non-hidden subdirectories
        (the tracking directory excluded) for the common configuration file
        patterns. Each regular file is returned once; directories whose name
        happens to match a pattern are ignored.
        """
        patterns = (
            '*.yaml', '*.yml', '*.json', '*.toml', '*.ini', '*.conf',
            '.env*', '*.config'
        )
        tracking_dir = self.tracking_dir.resolve()
        search_dirs = [self.config_dir]
        for subdir in sorted(self.config_dir.iterdir()):
            if (
                subdir.is_dir()
                and not subdir.name.startswith('.')
                and subdir.resolve() != tracking_dir
            ):
                search_dirs.append(subdir)

        config_files: List[Path] = []
        seen = set()
        for directory in search_dirs:
            for pattern in patterns:
                for candidate in directory.glob(pattern):
                    # Patterns overlap (".env.json" matches two of them) and
                    # can match directories (".envs/"), which cannot be
                    # hashed or copied as files.
                    if candidate in seen or not candidate.is_file():
                        continue
                    seen.add(candidate)
                    config_files.append(candidate)
        return config_files

    def _snapshot_relative_path(self, file_path) -> Path:
        """Return where ``file_path`` is stored inside a snapshot backup dir.

        This is the path relative to ``config_dir``, so same-named files in
        different subdirectories do not collide. Paths outside ``config_dir``
        fall back to their basename.
        """
        path = Path(file_path)
        try:
            return path.relative_to(self.config_dir)
        except ValueError:
            return Path(path.name)

    def _create_backup(self, file_path: str, change_id: str, content: str):
        """Store ``content`` under ``backups/<change_id>_<basename>``."""
        backup_file = self.tracking_dir / 'backups' / f"{change_id}_{Path(file_path).name}"
        with open(backup_file, 'w', encoding='utf-8') as f:
            f.write(content)

    def _create_snapshot_backup(self, snapshot_id: str, config_files: List[Path]):
        """Copy every existing file into ``snapshots/<snapshot_id>/``.

        The layout under the snapshot directory mirrors ``config_dir``.
        """
        snapshot_backup_dir = self.tracking_dir / 'snapshots' / snapshot_id
        snapshot_backup_dir.mkdir(exist_ok=True)
        for file_path in config_files:
            if file_path.exists():
                backup_file = snapshot_backup_dir / self._snapshot_relative_path(file_path)
                backup_file.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(file_path, backup_file)

    @staticmethod
    def _read_json_list(path: Path, label: str) -> List[Dict[str, Any]]:
        """Load a JSON list store; a missing or corrupt store reads as []."""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return []
        except json.JSONDecodeError as e:
            # Best effort, but do not hide it: the next save replaces the file.
            logger.warning(f"Could not parse {label} file {path}: {e}")
            return []

    @staticmethod
    def _write_json(path: Path, payload: Any) -> None:
        """Write ``payload`` as JSON via a temporary file and a rename.

        A failure while serializing or writing leaves the existing store intact.
        """
        tmp_path = path.with_name(path.name + '.tmp')
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)
        os.replace(tmp_path, path)

    def _load_changes(self) -> List[Dict[str, Any]]:
        """Load changes from file"""
        return self._read_json_list(self.changes_file, 'changes')

    def _save_changes(self, changes: List[Dict[str, Any]]):
        """Save changes to file"""
        self._write_json(self.changes_file, changes)

    def _add_change(self, change: 'ConfigChange'):
        """Add a change to the tracking file"""
        changes = self._load_changes()
        changes.append(change.to_dict())
        self._save_changes(changes)

    def _load_snapshots(self) -> List[Dict[str, Any]]:
        """Load snapshots from file"""
        return self._read_json_list(self.snapshots_file, 'snapshots')

    def _save_snapshots(self, snapshots: List[Dict[str, Any]]):
        """Save snapshots to file"""
        self._write_json(self.snapshots_file, snapshots)

    def _add_snapshot(self, snapshot: 'ConfigSnapshot'):
        """Add a snapshot to the tracking file"""
        snapshots = self._load_snapshots()
        snapshots.append(snapshot.to_dict())
        self._save_snapshots(snapshots)
+def main():
+    """Main CLI interface"""
+    import argparse
+    parser = argparse.ArgumentParser(description="Configuration Change Tracking System")
+    parser.add_argument('--config-dir', help="Configuration directory")
+    parser.add_argument('--tracking-dir', help="Tracking data directory")
+    subparsers = parser.add_subparsers(dest='command', help='Available commands')
+    # Track command
+    track_parser = subparsers.add_parser('track', help='Track a configuration change')
+    track_parser.add_argument('file_path', help='Path to configuration file')
+    track_parser.add_argument('change_type', choices=['create', 'update', 'delete'])
+    track_parser.add_argument('--description', default='', help='Change description')
+    track_parser.add_argument('--user', help='User making the change')
+    track_parser.add_argument('--environment', help='Environment')
+    # Snapshot command
+    snapshot_parser = subparsers.add_parser('snapshot', help='Create a configuration snapshot')
+    snapshot_parser.add_argument('--description', default='', help='Snapshot description')
+    snapshot_parser.add_argument('--environment', help='Environment')
+    # Rollback command
+    rollback_parser = subparsers.add_parser('rollback', help='Rollback configuration')
+    rollback_group = rollback_parser.add_mutually_exclusive_group(required=True)
+    rollback_group.add_argument('--change-id', help='Change ID to rollback to')
+    rollback_group.add_argument('--snapshot-id', help='Snapshot ID to rollback to')
+    # History command
+    history_parser = subparsers.add_parser('history', help='Show change history')
+    history_parser.add_argument('--file-path', help='Filter by file path')
+    history_parser.add_argument('--limit', type=int, help='Limit number of results')
+    # Snapshots command
+    snapshots_parser = subparsers.add_parser('snapshots', help='List snapshots')
+    snapshots_parser.add_argument('--limit', type=int, help='Limit number of results')
+    # Compare command
+    compare_parser = subparsers.add_parser('compare', help='Compare snapshots')
+    compare_parser.add_argument('snapshot1', help='First snapshot ID')
+    compare_parser.add_argument('snapshot2', help='Second snapshot ID')
+    # Cleanup command
+    cleanup_parser = subparsers.add_parser('cleanup', help='Clean up old backups')
+    cleanup_parser.add_argument('--days', type=int, default=30, help='Days to keep')
+    # Export command
+    export_parser = subparsers.add_parser('export', help='Export tracking data')
+    export_parser.add_argument('output_file', help='Output file path')
+    # Import command
+    import_parser = subparsers.add_parser('import', help='Import tracking data')
+    import_parser.add_argument('input_file', help='Input file path')
+    args = parser.parse_args()
+    if not args.command:
+        parser.print_help()
+        return
+    # Create tracker
+    tracker = ConfigTracker(args.config_dir, args.tracking_dir)
+    # Execute command
+    if args.command == 'track':
+        change_id = tracker.track_change(
+            args.file_path,
+            args.change_type,
+            args.description,
+            args.user,
+            args.environment
+        )
+        print(f"Change tracked: {change_id}")
+    elif args.command == 'snapshot':
+        snapshot_id = tracker.create_snapshot(args.description, args.environment)
+        print(f"Snapshot created: {snapshot_id}")
+    elif args.command == 'rollback':
+        if args.change_id:
+            success = tracker.rollback_to_change(args.change_id)
+        else:
+            success = tracker.rollback_to_snapshot(args.snapshot_id)
+        if success:
+            print("Rollback completed successfully")
+        else:
+            print("Rollback failed")
+            sys.exit(1)
+    elif args.command == 'history':
+        changes = tracker.get_change_history(args.file_path, args.limit)
+        print(json.dumps(changes, indent=2))
+    elif args.command == 'snapshots':
+        snapshots = tracker.get_snapshots(args.limit)
+        print(json.dumps(snapshots, indent=2))
+    elif args.command == 'compare':
+        comparison = tracker.compare_configurations(args.snapshot1, args.snapshot2)
+        print(json.dumps(comparison, indent=2))
+    elif args.command == 'cleanup':
+        count = tracker.cleanup_old_backups(args.days)
+        print(f"Cleaned up {count} old backup files")
+    elif args.command == 'export':
+        success = tracker.export_tracking_data(args.output_file)
+        if success:
+            print(f"Tracking data exported to: {args.output_file}")
+        else:
+            print("Export failed")
+            sys.exit(1)
+    elif args.command == 'import':
+        success = tracker.import_tracking_data(args.input_file)
+        if success:
+            print(f"Tracking data imported from: {args.input_file}")
+        else:
+            print("Import failed")
+            sys.exit(1)
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()

configs/stage_configs/config_validator.py ADDED Viewed

	@@ -0,0 +1,705 @@

+#!/usr/bin/env python3
+"""
+Configuration Validation System for Pixelated Empathy AI
+Validates all configuration files and environment variables
+"""
+import os
+import sys
+import json
+import yaml
+import logging
+from typing import Dict, List, Any, Optional, Union
+from pathlib import Path
+from dataclasses import dataclass, field
+from enum import Enum
+import re
+from urllib.parse import urlparse
+# Configure logging
+# NOTE: this runs at import time, i.e. importing the module calls logging.basicConfig()
+# with INFO level and a "timestamp - logger name - level - message" format.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+# Module-level logger used by the validator below.
+logger = logging.getLogger(__name__)
+class ValidationLevel(Enum):
+    """Validation severity levels"""
+    ERROR = "error"
+    WARNING = "warning"
+    INFO = "info"
+@dataclass
+class ValidationResult:
+    """Result of a configuration validation"""
+    level: ValidationLevel
+    message: str
+    field: Optional[str] = None
+    value: Optional[Any] = None
+    suggestion: Optional[str] = None
+@dataclass
+class ValidationReport:
+    """Complete validation report"""
+    results: List[ValidationResult] = field(default_factory=list)
+    def add_error(self, message: str, field: str = None, value: Any = None, suggestion: str = None):
+        """Add an error to the report"""
+        self.results.append(ValidationResult(
+            level=ValidationLevel.ERROR,
+            message=message,
+            field=field,
+            value=value,
+            suggestion=suggestion
+        ))
+    def add_warning(self, message: str, field: str = None, value: Any = None, suggestion: str = None):
+        """Add a warning to the report"""
+        self.results.append(ValidationResult(
+            level=ValidationLevel.WARNING,
+            message=message,
+            field=field,
+            value=value,
+            suggestion=suggestion
+        ))
+    def add_info(self, message: str, field: str = None, value: Any = None):
+        """Add an info message to the report"""
+        self.results.append(ValidationResult(
+            level=ValidationLevel.INFO,
+            message=message,
+            field=field,
+            value=value
+        ))
+    @property
+    def has_errors(self) -> bool:
+        """Check if report contains errors"""
+        return any(r.level == ValidationLevel.ERROR for r in self.results)
+    @property
+    def has_warnings(self) -> bool:
+        """Check if report contains warnings"""
+        return any(r.level == ValidationLevel.WARNING for r in self.results)
+    def get_summary(self) -> Dict[str, int]:
+        """Get summary of validation results"""
+        summary = {level.value: 0 for level in ValidationLevel}
+        for result in self.results:
+            summary[result.level.value] += 1
+        return summary
+class ConfigValidator:
+    """Main configuration validator"""
+    def __init__(self, config_dir: str = None):
+        self.config_dir = Path(config_dir) if config_dir else Path(__file__).parent
+        self.report = ValidationReport()
+    def validate_all(self) -> ValidationReport:
+        """Validate all configuration aspects"""
+        logger.info("Starting comprehensive configuration validation...")
+        # Reset report
+        self.report = ValidationReport()
+        # Validate different aspects
+        self._validate_environment_variables()
+        self._validate_database_config()
+        self._validate_redis_config()
+        self._validate_security_config()
+        self._validate_monitoring_config()
+        self._validate_file_permissions()
+        self._validate_network_config()
+        self._validate_resource_limits()
+        self._validate_backup_config()
+        # Log summary
+        summary = self.report.get_summary()
+        logger.info(f"Validation complete: {summary}")
+        return self.report
+    def _validate_environment_variables(self):
+        """Validate required environment variables"""
+        logger.info("Validating environment variables...")
+        required_vars = {
+            'DATABASE_URL': self._validate_database_url,
+            'REDIS_URL': self._validate_redis_url,
+            'JWT_SECRET': self._validate_jwt_secret,
+            'ENCRYPTION_KEY': self._validate_encryption_key,
+            'LOG_LEVEL': self._validate_log_level,
+            'ENVIRONMENT': self._validate_environment,
+        }
+        optional_vars = {
+            'MAX_WORKERS': self._validate_max_workers,
+            'BATCH_SIZE': self._validate_batch_size,
+            'DEBUG': self._validate_debug_flag,
+            'SENTRY_DSN': self._validate_sentry_dsn,
+        }
+        # Check required variables
+        for var_name, validator in required_vars.items():
+            value = os.getenv(var_name)
+            if not value:
+                self.report.add_error(
+                    f"Required environment variable '{var_name}' is not set",
+                    field=var_name,
+                    suggestion=f"Set {var_name} environment variable"
+                )
+            else:
+                validator(value, var_name)
+        # Check optional variables
+        for var_name, validator in optional_vars.items():
+            value = os.getenv(var_name)
+            if value:
+                validator(value, var_name)
+    def _validate_database_url(self, value: str, field: str):
+        """Validate database URL format"""
+        try:
+            parsed = urlparse(value)
+            if not parsed.scheme:
+                self.report.add_error(
+                    f"Database URL missing scheme",
+                    field=field,
+                    suggestion="Use format: postgresql://user:pass@host:port/db"
+                )
+            elif parsed.scheme not in ['postgresql', 'postgres']:
+                self.report.add_warning(
+                    f"Unexpected database scheme: {parsed.scheme}",
+                    field=field,
+                    suggestion="Consider using PostgreSQL for production"
+                )
+            if not parsed.hostname:
+                self.report.add_error(
+                    f"Database URL missing hostname",
+                    field=field
+                )
+            if not parsed.path or parsed.path == '/':
+                self.report.add_error(
+                    f"Database URL missing database name",
+                    field=field
+                )
+        except Exception as e:
+            self.report.add_error(
+                f"Invalid database URL format: {e}",
+                field=field
+            )
+    def _validate_redis_url(self, value: str, field: str):
+        """Validate Redis URL format"""
+        try:
+            parsed = urlparse(value)
+            if not parsed.scheme:
+                self.report.add_error(
+                    f"Redis URL missing scheme",
+                    field=field,
+                    suggestion="Use format: redis://[:password@]host:port[/db]"
+                )
+            elif parsed.scheme not in ['redis', 'rediss']:
+                self.report.add_error(
+                    f"Invalid Redis scheme: {parsed.scheme}",
+                    field=field,
+                    suggestion="Use 'redis://' or 'rediss://' for SSL"
+                )
+            if not parsed.hostname:
+                self.report.add_error(
+                    f"Redis URL missing hostname",
+                    field=field
+                )
+        except Exception as e:
+            self.report.add_error(
+                f"Invalid Redis URL format: {e}",
+                field=field
+            )
+    def _validate_jwt_secret(self, value: str, field: str):
+        """Validate JWT secret strength"""
+        if len(value) < 32:
+            self.report.add_error(
+                f"JWT secret too short (minimum 32 characters)",
+                field=field,
+                value=f"Length: {len(value)}",
+                suggestion="Generate a longer, more secure secret"
+            )
+        elif len(value) < 64:
+            self.report.add_warning(
+                f"JWT secret could be longer for better security",
+                field=field,
+                value=f"Length: {len(value)}",
+                suggestion="Consider using 64+ character secret"
+            )
+        # Check for common weak patterns
+        if value.lower() in ['secret', 'password', 'changeme', 'default']:
+            self.report.add_error(
+                f"JWT secret uses common weak value",
+                field=field,
+                suggestion="Generate a cryptographically secure random secret"
+            )
+    def _validate_encryption_key(self, value: str, field: str):
+        """Validate encryption key"""
+        if len(value) < 32:
+            self.report.add_error(
+                f"Encryption key too short (minimum 32 characters)",
+                field=field,
+                value=f"Length: {len(value)}"
+            )
+        # Check if it's base64 encoded (common for encryption keys)
+        try:
+            import base64
+            base64.b64decode(value)
+            if len(base64.b64decode(value)) < 32:
+                self.report.add_warning(
+                    f"Decoded encryption key may be too short",
+                    field=field
+                )
+        except Exception:
+            # Not base64, check raw length
+            pass
+    def _validate_log_level(self, value: str, field: str):
+        """Validate log level"""
+        valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+        if value.upper() not in valid_levels:
+            self.report.add_error(
+                f"Invalid log level: {value}",
+                field=field,
+                suggestion=f"Use one of: {', '.join(valid_levels)}"
+            )
+        elif value.upper() == 'DEBUG':
+            env = os.getenv('ENVIRONMENT', '').lower()
+            if env in ['production', 'prod']:
+                self.report.add_warning(
+                    f"DEBUG log level in production environment",
+                    field=field,
+                    suggestion="Use INFO or WARNING for production"
+                )
+    def _validate_environment(self, value: str, field: str):
+        """Validate environment setting"""
+        valid_envs = ['development', 'dev', 'staging', 'production', 'prod', 'test']
+        if value.lower() not in valid_envs:
+            self.report.add_warning(
+                f"Unexpected environment value: {value}",
+                field=field,
+                suggestion=f"Consider using: {', '.join(valid_envs)}"
+            )
+    def _validate_max_workers(self, value: str, field: str):
+        """Validate max workers setting"""
+        try:
+            workers = int(value)
+            if workers < 1:
+                self.report.add_error(
+                    f"Max workers must be positive",
+                    field=field,
+                    value=workers
+                )
+            elif workers > 32:
+                self.report.add_warning(
+                    f"Very high worker count may cause resource issues",
+                    field=field,
+                    value=workers,
+                    suggestion="Consider CPU core count when setting workers"
+                )
+        except ValueError:
+            self.report.add_error(
+                f"Max workers must be an integer",
+                field=field,
+                value=value
+            )
+    def _validate_batch_size(self, value: str, field: str):
+        """Validate batch size setting"""
+        try:
+            batch_size = int(value)
+            if batch_size < 1:
+                self.report.add_error(
+                    f"Batch size must be positive",
+                    field=field,
+                    value=batch_size
+                )
+            elif batch_size > 1000:
+                self.report.add_warning(
+                    f"Large batch size may cause memory issues",
+                    field=field,
+                    value=batch_size,
+                    suggestion="Consider memory constraints when setting batch size"
+                )
+        except ValueError:
+            self.report.add_error(
+                f"Batch size must be an integer",
+                field=field,
+                value=value
+            )
+    def _validate_debug_flag(self, value: str, field: str):
+        """Validate debug flag"""
+        if value.lower() not in ['true', 'false', '1', '0', 'yes', 'no']:
+            self.report.add_warning(
+                f"Debug flag should be boolean-like",
+                field=field,
+                value=value,
+                suggestion="Use 'true', 'false', '1', or '0'"
+            )
+        if value.lower() in ['true', '1', 'yes']:
+            env = os.getenv('ENVIRONMENT', '').lower()
+            if env in ['production', 'prod']:
+                self.report.add_warning(
+                    f"Debug enabled in production environment",
+                    field=field,
+                    suggestion="Disable debug in production"
+                )
+    def _validate_sentry_dsn(self, value: str, field: str):
+        """Validate Sentry DSN format"""
+        try:
+            parsed = urlparse(value)
+            if not parsed.scheme or not parsed.hostname:
+                self.report.add_error(
+                    f"Invalid Sentry DSN format",
+                    field=field,
+                    suggestion="Check Sentry project settings for correct DSN"
+                )
+        except Exception as e:
+            self.report.add_error(
+                f"Invalid Sentry DSN: {e}",
+                field=field
+            )
+    def _validate_database_config(self):
+        """Validate database configuration files"""
+        logger.info("Validating database configuration...")
+        # Check for database config files
+        db_config_files = [
+            'database.yaml',
+            'database.json',
+            'db_config.yaml'
+        ]
+        for config_file in db_config_files:
+            config_path = self.config_dir / config_file
+            if config_path.exists():
+                self._validate_config_file(config_path)
+    def _validate_redis_config(self):
+        """Validate Redis configuration"""
+        logger.info("Validating Redis configuration...")
+        redis_config = self.config_dir / 'redis.yaml'
+        if redis_config.exists():
+            self._validate_config_file(redis_config)
+    def _validate_security_config(self):
+        """Validate security configuration"""
+        logger.info("Validating security configuration...")
+        security_config = self.config_dir / 'security.yaml'
+        if security_config.exists():
+            try:
+                with open(security_config, 'r') as f:
+                    config = yaml.safe_load(f)
+                # Check security settings
+                if 'encryption' in config:
+                    if not config['encryption'].get('enabled', False):
+                        self.report.add_warning(
+                            "Encryption is disabled",
+                            field="encryption.enabled",
+                            suggestion="Enable encryption for production"
+                        )
+                if 'authentication' in config:
+                    auth_config = config['authentication']
+                    if auth_config.get('require_2fa', False) is False:
+                        env = os.getenv('ENVIRONMENT', '').lower()
+                        if env in ['production', 'prod']:
+                            self.report.add_warning(
+                                "2FA not required in production",
+                                field="authentication.require_2fa",
+                                suggestion="Enable 2FA for production security"
+                            )
+            except Exception as e:
+                self.report.add_error(
+                    f"Error reading security config: {e}",
+                    field="security.yaml"
+                )
+    def _validate_monitoring_config(self):
+        """Validate monitoring configuration"""
+        logger.info("Validating monitoring configuration...")
+        monitoring_config = self.config_dir / 'monitoring.yaml'
+        if monitoring_config.exists():
+            self._validate_config_file(monitoring_config)
+    def _validate_file_permissions(self):
+        """Validate file permissions for security"""
+        logger.info("Validating file permissions...")
+        sensitive_files = [
+            '.env',
+            'secrets.yaml',
+            'private.key',
+            'ssl.key'
+        ]
+        for filename in sensitive_files:
+            filepath = self.config_dir / filename
+            if filepath.exists():
+                stat_info = filepath.stat()
+                mode = oct(stat_info.st_mode)[-3:]
+                # Check if file is readable by others
+                if int(mode[2]) > 0:
+                    self.report.add_warning(
+                        f"Sensitive file '{filename}' is readable by others",
+                        field=f"permissions.{filename}",
+                        value=f"Mode: {mode}",
+                        suggestion="Set permissions to 600 or 640"
+                    )
+    def _validate_network_config(self):
+        """Validate network configuration"""
+        logger.info("Validating network configuration...")
+        # Check common network settings
+        bind_host = os.getenv('BIND_HOST', '0.0.0.0')
+        if bind_host == '0.0.0.0':
+            env = os.getenv('ENVIRONMENT', '').lower()
+            if env in ['production', 'prod']:
+                self.report.add_warning(
+                    "Binding to all interfaces (0.0.0.0) in production",
+                    field="BIND_HOST",
+                    suggestion="Consider binding to specific interface for security"
+                )
+        # Check port configuration
+        port = os.getenv('PORT', '8000')
+        try:
+            port_num = int(port)
+            if port_num < 1024 and os.getuid() != 0:
+                self.report.add_warning(
+                    f"Port {port_num} requires root privileges",
+                    field="PORT",
+                    suggestion="Use port >= 1024 or run as root"
+                )
+        except (ValueError, AttributeError):
+            pass
+    def _validate_resource_limits(self):
+        """Validate resource limit configurations"""
+        logger.info("Validating resource limits...")
+        # Check memory limits
+        max_memory = os.getenv('MAX_MEMORY')
+        if max_memory:
+            try:
+                # Parse memory value (e.g., "2G", "512M")
+                if max_memory.endswith('G'):
+                    memory_gb = float(max_memory[:-1])
+                    if memory_gb < 1:
+                        self.report.add_warning(
+                            f"Low memory limit: {max_memory}",
+                            field="MAX_MEMORY",
+                            suggestion="Consider increasing memory for better performance"
+                        )
+                elif max_memory.endswith('M'):
+                    memory_mb = float(max_memory[:-1])
+                    if memory_mb < 512:
+                        self.report.add_warning(
+                            f"Very low memory limit: {max_memory}",
+                            field="MAX_MEMORY",
+                            suggestion="Increase memory limit for stable operation"
+                        )
+            except ValueError:
+                self.report.add_error(
+                    f"Invalid memory limit format: {max_memory}",
+                    field="MAX_MEMORY",
+                    suggestion="Use format like '2G' or '512M'"
+                )
+    def _validate_backup_config(self):
+        """Validate backup configuration"""
+        logger.info("Validating backup configuration...")
+        backup_config = self.config_dir / 'backup.yaml'
+        if backup_config.exists():
+            try:
+                with open(backup_config, 'r') as f:
+                    config = yaml.safe_load(f)
+                if not config.get('enabled', False):
+                    env = os.getenv('ENVIRONMENT', '').lower()
+                    if env in ['production', 'prod']:
+                        self.report.add_error(
+                            "Backups disabled in production",
+                            field="backup.enabled",
+                            suggestion="Enable backups for production data protection"
+                        )
+                # Check backup schedule
+                schedule = config.get('schedule')
+                if schedule:
+                    # Basic cron validation
+                    if not re.match(r'^[\d\*\-,/]+\s+[\d\*\-,/]+\s+[\d\*\-,/]+\s+[\d\*\-,/]+\s+[\d\*\-,/]+$', schedule):
+                        self.report.add_warning(
+                            f"Invalid cron schedule format: {schedule}",
+                            field="backup.schedule",
+                            suggestion="Use valid cron format (e.g., '0 2 * * *')"
+                        )
+            except Exception as e:
+                self.report.add_error(
+                    f"Error reading backup config: {e}",
+                    field="backup.yaml"
+                )
+    def _validate_config_file(self, filepath: Path):
+        """Validate a configuration file"""
+        try:
+            with open(filepath, 'r') as f:
+                if filepath.suffix in ['.yaml', '.yml']:
+                    yaml.safe_load(f)
+                elif filepath.suffix == '.json':
+                    json.load(f)
+            self.report.add_info(
+                f"Configuration file '{filepath.name}' is valid",
+                field=str(filepath)
+            )
+        except yaml.YAMLError as e:
+            self.report.add_error(
+                f"Invalid YAML in '{filepath.name}': {e}",
+                field=str(filepath)
+            )
+        except json.JSONDecodeError as e:
+            self.report.add_error(
+                f"Invalid JSON in '{filepath.name}': {e}",
+                field=str(filepath)
+            )
+        except Exception as e:
+            self.report.add_error(
+                f"Error reading '{filepath.name}': {e}",
+                field=str(filepath)
+            )
+    def print_report(self, report: ValidationReport = None):
+        """Print validation report in a readable format"""
+        if report is None:
+            report = self.report
+        print("\n" + "="*80)
+        print("CONFIGURATION VALIDATION REPORT")
+        print("="*80)
+        summary = report.get_summary()
+        print(f"\nSUMMARY:")
+        print(f"  Errors:   {summary['error']}")
+        print(f"  Warnings: {summary['warning']}")
+        print(f"  Info:     {summary['info']}")
+        if report.results:
+            print(f"\nDETAILS:")
+            for result in report.results:
+                icon = {"error": "❌", "warning": "⚠️", "info": "ℹ️"}[result.level.value]
+                print(f"\n{icon} {result.level.value.upper()}: {result.message}")
+                if result.field:
+                    print(f"   Field: {result.field}")
+                if result.value is not None:
+                    print(f"   Value: {result.value}")
+                if result.suggestion:
+                    print(f"   Suggestion: {result.suggestion}")
+        print("\n" + "="*80)
+        if report.has_errors:
+            print("❌ VALIDATION FAILED - Please fix errors before proceeding")
+            return False
+        elif report.has_warnings:
+            print("⚠️  VALIDATION PASSED WITH WARNINGS - Review warnings for production")
+            return True
+        else:
+            print("✅ VALIDATION PASSED - Configuration is valid")
+            return True
+def main():
+    """Main entry point for configuration validation"""
+    import argparse
+    parser = argparse.ArgumentParser(description="Validate Pixelated Empathy AI configuration")
+    parser.add_argument(
+        '--config-dir',
+        default=None,
+        help="Configuration directory path"
+    )
+    parser.add_argument(
+        '--json',
+        action='store_true',
+        help="Output report in JSON format"
+    )
+    parser.add_argument(
+        '--fail-on-warnings',
+        action='store_true',
+        help="Exit with error code if warnings are found"
+    )
+    args = parser.parse_args()
+    # Create validator and run validation
+    validator = ConfigValidator(args.config_dir)
+    report = validator.validate_all()
+    if args.json:
+        # Output JSON report
+        json_report = {
+            'summary': report.get_summary(),
+            'results': [
+                {
+                    'level': r.level.value,
+                    'message': r.message,
+                    'field': r.field,
+                    'value': r.value,
+                    'suggestion': r.suggestion
+                }
+                for r in report.results
+            ]
+        }
+        print(json.dumps(json_report, indent=2))
+    else:
+        # Print human-readable report
+        success = validator.print_report(report)
+        # Exit with appropriate code
+        if not success:
+            sys.exit(1)
+        elif args.fail_on_warnings and report.has_warnings:
+            sys.exit(2)
+        else:
+            sys.exit(0)
+# Run the validator CLI only when this file is executed as a script.
+if __name__ == '__main__':
+    main()

configs/stage_configs/configs_config.py ADDED Viewed

	@@ -0,0 +1,67 @@

+"""
+Centralized configuration for the Pixelated Empathy AI dataset pipeline.
+Provides an enterprise-grade, unified configuration management system.
+"""
+from dataclasses import dataclass, field
+from typing import Any
+@dataclass
+class DataLoaderConfig:
+    """Configuration for the data loader and acquisition."""
+    huggingface_datasets: dict[str, str] = field(default_factory=lambda: {
+        "mental_health_counseling": "Amod/mental_health_counseling_conversations",
+        "psych8k": "EmoCareAI/Psych8k",
+        # Mental Health Investigation Resources (Phase 1)
+        "mental_health_snli": "iqrakiran/customized-mental-health-snli2",
+        "mental_health_preprocessed": "typosonlr/MentalHealthPreProcessed",
+        "depression_detection": "ShreyaR/DepressionDetection",
+    })
+    download_path: str = "ai/datasets/external"
+    cache_dir: str = "ai/datasets/cache"
+    huggingface_cache_dir: str = "ai/datasets/huggingface_cache"
+    max_retries: int = 3
+@dataclass
+class StandardizationConfig:
+    """Configuration for the DataStandardizer."""
+    max_workers: int = 8
+    batch_size: int = 200
+    enable_monitoring: bool = True
+    output_dir: str = "ai/datasets/standardized"
+@dataclass
+class LoggingConfig:
+    """Configuration for the logging system."""
+    log_level: str = "INFO"
+    log_file: str = "logs/dataset_pipeline.log"
+    max_bytes: int = 10 * 1024 * 1024  # 10 MB
+    backup_count: int = 5
+@dataclass
+class Config:
+    """Root configuration class for the entire pipeline."""
+    data_loader: DataLoaderConfig = field(default_factory=DataLoaderConfig)
+    standardization: StandardizationConfig = field(default_factory=StandardizationConfig)
+    logging: LoggingConfig = field(default_factory=LoggingConfig)
+    def to_dict(self) -> dict[str, Any]:
+        """Serializes the config to a dictionary."""
+        return {
+            "data_loader": self.data_loader.__dict__,
+            "standardization": self.standardization.__dict__,
+            "logging": self.logging.__dict__,
+        }
+# Singleton instance to be used across the application
+config = Config()
+def get_config() -> Config:
+    """Returns the singleton config instance."""
+    return config
+# Example usage:
+# from config import get_config
+# config = get_config()
+# print(config.standardization.batch_size)

configs/stage_configs/corrected_audit_report.json ADDED Viewed

	@@ -0,0 +1,694 @@

+{
+  "audit_date": "2025-08-24T13:28:00.686206",
+  "total_tasks": 36,
+  "complete": 34,
+  "partial": 0,
+  "found": 0,
+  "missing": 2,
+  "working_count": 34,
+  "completion_rate": 0.9444444444444444,
+  "overall_status": "NEARLY_COMPLETE",
+  "ecosystem_files": 4,
+  "dataset_pipeline_files": 30,
+  "detailed_results": {
+    "6.1": {
+      "task_id": "6.1",
+      "expected_filename": "distributed_architecture.py",
+      "description": "Distributed processing architecture",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/distributed_architecture.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/distributed_architecture.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/distributed_architecture.py",
+      "file_stats": {
+        "size_kb": 20.2275390625,
+        "lines": 569,
+        "classes": 6,
+        "functions": 26,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.2": {
+      "task_id": "6.2",
+      "expected_filename": "data_fusion_engine.py",
+      "description": "Intelligent data fusion algorithms",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/data_fusion_engine.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/data_fusion_engine.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/data_fusion_engine.py",
+      "file_stats": {
+        "size_kb": 26.6845703125,
+        "lines": 694,
+        "classes": 5,
+        "functions": 20,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.3": {
+      "task_id": "6.3",
+      "expected_filename": "quality_assessment_framework.py",
+      "description": "Hierarchical quality assessment framework",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/quality_assessment_framework.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/quality_assessment_framework.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/quality_assessment_framework.py",
+      "file_stats": {
+        "size_kb": 27.6455078125,
+        "lines": 708,
+        "classes": 5,
+        "functions": 25,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.4": {
+      "task_id": "6.4",
+      "expected_filename": "deduplication.py",
+      "description": "Automated conversation deduplication",
+      "found_files": [
+        "/home/vivi/pixelated/ai/pipelines/dataset_pipeline/test_deduplication.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_deduplication.py",
+        "/home/vivi/pixelated/ai/datasets/dataset_pipeline/test_deduplication.py",
+        "/home/vivi/pixelated/ai/datasets/dataset_pipeline/deduplication_system.py",
+        "/home/vivi/pixelated/ai/pipelines/dataset_pipeline/deduplication.py",
+        "/home/vivi/pixelated/ai/datasets/dataset_pipeline/deduplication.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/deduplication.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/pipelines/dataset_pipeline/test_deduplication.py",
+      "file_stats": {
+        "size_kb": 15.1220703125,
+        "lines": 419,
+        "classes": 5,
+        "functions": 30,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.5": {
+      "task_id": "6.5",
+      "expected_filename": "cross_dataset_linker.py",
+      "description": "Cross-dataset conversation linking",
+      "found_files": [],
+      "status": "MISSING",
+      "primary_file": null,
+      "file_stats": {},
+      "issues": []
+    },
+    "6.6": {
+      "task_id": "6.6",
+      "expected_filename": "metadata_schema.py",
+      "description": "Unified metadata schema",
+      "found_files": [],
+      "status": "MISSING",
+      "primary_file": null,
+      "file_stats": {},
+      "issues": []
+    },
+    "6.7": {
+      "task_id": "6.7",
+      "expected_filename": "therapeutic_intelligence.py",
+      "description": "Comprehensive therapeutic approach classification",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/therapeutic_intelligence.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/test_therapeutic_intelligence.py",
+        "/home/vivi/pixelated/ai/datasets/dataset_pipeline/therapeutic_intelligence_orchestrator.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/therapeutic_intelligence.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/therapeutic_intelligence.py",
+      "file_stats": {
+        "size_kb": 25.4091796875,
+        "lines": 582,
+        "classes": 4,
+        "functions": 18,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.8": {
+      "task_id": "6.8",
+      "expected_filename": "condition_pattern_recognition.py",
+      "description": "Mental health condition pattern recognition",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/condition_pattern_recognition.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/condition_pattern_recognition.py",
+      "file_stats": {
+        "size_kb": 30.849609375,
+        "lines": 730,
+        "classes": 4,
+        "functions": 17,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.9": {
+      "task_id": "6.9",
+      "expected_filename": "outcome_prediction.py",
+      "description": "Therapeutic outcome prediction models",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/outcome_prediction.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/outcome_prediction.py",
+      "file_stats": {
+        "size_kb": 23.421875,
+        "lines": 580,
+        "classes": 5,
+        "functions": 18,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.10": {
+      "task_id": "6.10",
+      "expected_filename": "crisis_intervention_detector.py",
+      "description": "Crisis intervention detection and escalation",
+      "found_files": [
+        "/home/vivi/pixelated/ai/tests/test_crisis_intervention_detector_enhanced.py",
+        "/home/vivi/pixelated/ai/pixel/test_crisis_intervention_detector.py",
+        "/home/vivi/pixelated/ai/tests/test_crisis_intervention_detector.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/crisis_intervention_detector.py",
+        "/home/vivi/pixelated/ai/tests/test_crisis_intervention_detector_working.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/crisis_intervention_detector.py",
+      "file_stats": {
+        "size_kb": 39.1484375,
+        "lines": 849,
+        "classes": 7,
+        "functions": 24,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.11": {
+      "task_id": "6.11",
+      "expected_filename": "personality_adapter.py",
+      "description": "Personality-aware conversation adaptation",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/personality_adapter.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_personality_adapter.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/personality_adapter.py",
+      "file_stats": {
+        "size_kb": 30.1650390625,
+        "lines": 704,
+        "classes": 7,
+        "functions": 26,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.12": {
+      "task_id": "6.12",
+      "expected_filename": "cultural_competency_generator.py",
+      "description": "Cultural competency and diversity-aware response generation",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/cultural_competency_generator.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_cultural_competency_generator.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/cultural_competency_generator.py",
+      "file_stats": {
+        "size_kb": 33.9677734375,
+        "lines": 789,
+        "classes": 6,
+        "functions": 35,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.13": {
+      "task_id": "6.13",
+      "expected_filename": "audio_emotion_integration.py",
+      "description": "Audio emotion recognition integration",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/audio_emotion_integration.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/audio_emotion_integration.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/audio_emotion_integration.py",
+      "file_stats": {
+        "size_kb": 23.2099609375,
+        "lines": 575,
+        "classes": 5,
+        "functions": 18,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.14": {
+      "task_id": "6.14",
+      "expected_filename": "multimodal_disorder_analysis.py",
+      "description": "Multi-modal mental disorder analysis pipeline",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/multimodal_disorder_analysis.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/multimodal_disorder_analysis.py",
+      "file_stats": {
+        "size_kb": 28.7197265625,
+        "lines": 691,
+        "classes": 8,
+        "functions": 21,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.15": {
+      "task_id": "6.15",
+      "expected_filename": "emotion_cause_extraction.py",
+      "description": "Emotion cause extraction and intervention mapping",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/emotion_cause_extraction.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/ecosystem/emotion_cause_extraction.py",
+      "file_stats": {
+        "size_kb": 28.5,
+        "lines": 686,
+        "classes": 7,
+        "functions": 18,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.16": {
+      "task_id": "6.16",
+      "expected_filename": "tfidf_clusterer.py",
+      "description": "TF-IDF feature-based conversation clustering",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/tfidf_clusterer.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_tfidf_clusterer.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/tfidf_clusterer.py",
+      "file_stats": {
+        "size_kb": 27.6640625,
+        "lines": 668,
+        "classes": 6,
+        "functions": 20,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.17": {
+      "task_id": "6.17",
+      "expected_filename": "temporal_reasoner.py",
+      "description": "Temporal reasoning integration",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/temporal_reasoner.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/temporal_reasoner.py",
+      "file_stats": {
+        "size_kb": 30.3173828125,
+        "lines": 744,
+        "classes": 7,
+        "functions": 25,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.18": {
+      "task_id": "6.18",
+      "expected_filename": "evidence_validator.py",
+      "description": "Scientific evidence-based practice validation",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/evidence_validator.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/evidence_validator.py",
+      "file_stats": {
+        "size_kb": 32.271484375,
+        "lines": 755,
+        "classes": 8,
+        "functions": 22,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.19": {
+      "task_id": "6.19",
+      "expected_filename": "priority_weighted_sampler.py",
+      "description": "Priority-weighted sampling algorithms",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/priority_weighted_sampler.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_priority_weighted_sampler.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/priority_weighted_sampler.py",
+      "file_stats": {
+        "size_kb": 25.404296875,
+        "lines": 646,
+        "classes": 3,
+        "functions": 17,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.20": {
+      "task_id": "6.20",
+      "expected_filename": "condition_balancer.py",
+      "description": "Condition-specific balancing system",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/condition_balancer.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/condition_balancer.py",
+      "file_stats": {
+        "size_kb": 26.40625,
+        "lines": 612,
+        "classes": 3,
+        "functions": 12,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.21": {
+      "task_id": "6.21",
+      "expected_filename": "approach_diversity_optimizer.py",
+      "description": "Therapeutic approach diversity optimization",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/approach_diversity_optimizer.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/approach_diversity_optimizer.py",
+      "file_stats": {
+        "size_kb": 33.8076171875,
+        "lines": 718,
+        "classes": 3,
+        "functions": 15,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.22": {
+      "task_id": "6.22",
+      "expected_filename": "demographic_balancer.py",
+      "description": "Demographic and cultural diversity balancing",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/demographic_balancer.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/demographic_balancer.py",
+      "file_stats": {
+        "size_kb": 20.724609375,
+        "lines": 486,
+        "classes": 3,
+        "functions": 12,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.23": {
+      "task_id": "6.23",
+      "expected_filename": "complexity_stratifier.py",
+      "description": "Conversation complexity stratification",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/complexity_stratifier.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/complexity_stratifier.py",
+      "file_stats": {
+        "size_kb": 26.2333984375,
+        "lines": 623,
+        "classes": 3,
+        "functions": 14,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.24": {
+      "task_id": "6.24",
+      "expected_filename": "crisis_routine_balancer.py",
+      "description": "Crisis-to-routine conversation ratio optimization",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/crisis_routine_balancer.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/crisis_routine_balancer.py",
+      "file_stats": {
+        "size_kb": 23.8505859375,
+        "lines": 574,
+        "classes": 3,
+        "functions": 13,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.25": {
+      "task_id": "6.25",
+      "expected_filename": "multi_tier_validator.py",
+      "description": "Multi-tier quality validation system",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/multi_tier_validator.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_multi_tier_validator.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/multi_tier_validator.py",
+      "file_stats": {
+        "size_kb": 28.9892578125,
+        "lines": 730,
+        "classes": 5,
+        "functions": 25,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.26": {
+      "task_id": "6.26",
+      "expected_filename": "dsm5_accuracy_validator.py",
+      "description": "DSM-5 therapeutic accuracy validation",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_dsm5_accuracy_validator.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/dsm5_accuracy_validator.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/test_dsm5_accuracy_validator.py",
+      "file_stats": {
+        "size_kb": 16.8955078125,
+        "lines": 393,
+        "classes": 1,
+        "functions": 22,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.27": {
+      "task_id": "6.27",
+      "expected_filename": "safety_ethics_validator.py",
+      "description": "Conversation safety and ethics validation",
+      "found_files": [
+        "/home/vivi/pixelated/ai/pixel/validation/test_safety_ethics_validator.py",
+        "/home/vivi/pixelated/ai/tests/test_safety_ethics_validator_working.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_safety_ethics_validator.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/safety_ethics_validator.py",
+        "/home/vivi/pixelated/ai/tests/test_safety_ethics_validator.py",
+        "/home/vivi/pixelated/ai/pixel/validation/safety_ethics_validator.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/test_safety_ethics_validator.py",
+      "file_stats": {
+        "size_kb": 21.326171875,
+        "lines": 542,
+        "classes": 1,
+        "functions": 21,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.28": {
+      "task_id": "6.28",
+      "expected_filename": "effectiveness_predictor.py",
+      "description": "Therapeutic effectiveness prediction",
+      "found_files": [
+        "/home/vivi/pixelated/ai/monitoring/conversation_effectiveness_predictor.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_effectiveness_predictor.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/effectiveness_predictor.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/test_effectiveness_predictor.py",
+      "file_stats": {
+        "size_kb": 17.89453125,
+        "lines": 447,
+        "classes": 1,
+        "functions": 20,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.29": {
+      "task_id": "6.29",
+      "expected_filename": "coherence_validator.py",
+      "description": "Conversation coherence validation using CoT reasoning",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/coherence_validator.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/test_coherence_validator.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/coherence_validator.py",
+      "file_stats": {
+        "size_kb": 38.3896484375,
+        "lines": 1016,
+        "classes": 5,
+        "functions": 24,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.30": {
+      "task_id": "6.30",
+      "expected_filename": "realtime_quality_monitor.py",
+      "description": "Real-time conversation quality monitoring",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/realtime_quality_monitor.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/realtime_quality_monitor.py",
+      "file_stats": {
+        "size_kb": 17.41015625,
+        "lines": 467,
+        "classes": 5,
+        "functions": 20,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.31": {
+      "task_id": "6.31",
+      "expected_filename": "production_exporter.py",
+      "description": "Production-ready dataset export with tiered access",
+      "found_files": [
+        "/home/vivi/pixelated/ai/tests/test_production_exporter.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/production_exporter.py",
+        "/home/vivi/pixelated/ai/pixel/test_production_exporter.py",
+        "/home/vivi/pixelated/ai/tests/test_production_exporter_working.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/production_exporter.py",
+      "file_stats": {
+        "size_kb": 26.828125,
+        "lines": 710,
+        "classes": 5,
+        "functions": 24,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.32": {
+      "task_id": "6.32",
+      "expected_filename": "adaptive_learner.py",
+      "description": "Adaptive learning pipeline",
+      "found_files": [
+        "/home/vivi/pixelated/ai/tests/test_adaptive_learner_working.py",
+        "/home/vivi/pixelated/ai/tests/test_adaptive_learner.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/adaptive_learner.py",
+        "/home/vivi/pixelated/ai/pixel/test_adaptive_learner.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/adaptive_learner.py",
+      "file_stats": {
+        "size_kb": 26.4423828125,
+        "lines": 684,
+        "classes": 8,
+        "functions": 34,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.33": {
+      "task_id": "6.33",
+      "expected_filename": "analytics_dashboard.py",
+      "description": "Comprehensive analytics dashboard",
+      "found_files": [
+        "/home/vivi/pixelated/ai/monitoring/test_quality_analytics_dashboard_v2.py",
+        "/home/vivi/pixelated/ai/monitoring/test_quality_analytics_dashboard.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/analytics_dashboard.py",
+        "/home/vivi/pixelated/ai/monitoring/launch_quality_analytics_dashboard.py",
+        "/home/vivi/pixelated/ai/pixel/test_analytics_dashboard.py",
+        "/home/vivi/pixelated/ai/monitoring/quality_analytics_dashboard.py",
+        "/home/vivi/pixelated/ai/monitoring/launch_quality_analytics_dashboard_v2.py",
+        "/home/vivi/pixelated/ai/tests/test_analytics_dashboard_working.py",
+        "/home/vivi/pixelated/ai/tests/test_analytics_dashboard.py",
+        "/home/vivi/pixelated/ai/monitoring/quality_analytics_dashboard_v2.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/analytics_dashboard.py",
+      "file_stats": {
+        "size_kb": 18.1240234375,
+        "lines": 455,
+        "classes": 2,
+        "functions": 17,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.34": {
+      "task_id": "6.34",
+      "expected_filename": "automated_maintenance.py",
+      "description": "Automated dataset update and maintenance procedures",
+      "found_files": [
+        "/home/vivi/pixelated/ai/pixel/test_automated_maintenance.py",
+        "/home/vivi/pixelated/ai/dataset_pipeline/automated_maintenance.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/automated_maintenance.py",
+      "file_stats": {
+        "size_kb": 20.296875,
+        "lines": 571,
+        "classes": 5,
+        "functions": 22,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.35": {
+      "task_id": "6.35",
+      "expected_filename": "feedback_loops.py",
+      "description": "Conversation effectiveness feedback loops",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/feedback_loops.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/feedback_loops.py",
+      "file_stats": {
+        "size_kb": 18.7763671875,
+        "lines": 461,
+        "classes": 4,
+        "functions": 12,
+        "has_docstring": true
+      },
+      "issues": []
+    },
+    "6.36": {
+      "task_id": "6.36",
+      "expected_filename": "comprehensive_api.py",
+      "description": "Comprehensive documentation and API",
+      "found_files": [
+        "/home/vivi/pixelated/ai/dataset_pipeline/comprehensive_api.py"
+      ],
+      "status": "COMPLETE",
+      "primary_file": "/home/vivi/pixelated/ai/dataset_pipeline/comprehensive_api.py",
+      "file_stats": {
+        "size_kb": 29.732421875,
+        "lines": 873,
+        "classes": 3,
+        "functions": 8,
+        "has_docstring": true
+      },
+      "issues": []
+    }
+  }
+}