ddi / src /validation /quick_audit.py
github-actions[bot]
Deploy from GitHub Actions (fb28c05c54cf19184fc3f14f1bf3297ba5749ea2)
d29b763
"""Simplified production system audit without complex imports."""
import json
import logging
import sys
from pathlib import Path
# Configure root logging once for the whole audit run: INFO and above, with
# timestamp, level and logger name in front of each message.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('medcare_ddi.quick_audit')

# Project root: two levels above the directory that contains this file.
BASE_DIR = Path(__file__).resolve().parents[2]

# Locations of the artifacts the audit inspects, all relative to BASE_DIR.
MODEL_DIR = BASE_DIR.joinpath('models')
DATA_PATH = BASE_DIR.joinpath('data', 'processed', 'ddinter_combined.parquet')
FEATURE_PIPELINE_MULTISOURCE_PATH = MODEL_DIR.joinpath('feature_pipeline_multisource.pkl')
def audit_files():
    """Check that all critical files exist.

    Logs one status line per file. For model/data artifacts that are
    present, the line also carries the on-disk size in MB.

    Returns:
        bool: True if every listed file exists, False otherwise.
    """
    logger.info('=' * 60)
    logger.info('FILE EXISTENCE CHECK')
    logger.info('=' * 60)
    critical_files = {
        'Feature Pipeline (11MB)': FEATURE_PIPELINE_MULTISOURCE_PATH,
        'Model Checkpoint (318KB)': MODEL_DIR / 'ddi_mlp_best.pt',
        'Data File (13MB)': DATA_PATH,
        'Metadata': MODEL_DIR / 'multisource_metadata.json',
        'Training Config': MODEL_DIR / 'training_config.json',
        'FastAPI Backend': BASE_DIR / 'src' / 'inference' / 'app_production.py',
        'Production Training': BASE_DIR / 'src' / 'training' / 'train_production_simple.py',
        'Smoke Tests': BASE_DIR / 'src' / 'validation' / 'smoke_test.py',
    }
    # Suffixes whose size is worth reporting. '.parquet' is included so the
    # data file (DATA_PATH) gets a size line like the other binary artifacts.
    sized_suffixes = ('.pkl', '.pt', '.csv', '.parquet')
    all_good = True
    for name, path in critical_files.items():
        exists = path.exists()
        status = '✓' if exists else '✗'
        size_mb = None
        if exists and path.suffix in sized_suffixes:
            try:
                size_mb = path.stat().st_size / (1024 * 1024)
            except OSError:
                # The file vanished or became unreadable between exists() and
                # stat(); fall back to the plain status line.
                size_mb = None
        if size_mb is None:
            logger.info(f'{status} {name}')
        else:
            logger.info(f'{status} {name}: {size_mb:.1f}MB')
        all_good = all_good and exists
    return all_good
def audit_metadata(expected_dim=560):
    """Check metadata schema.

    Loads ``multisource_metadata.json`` from ``MODEL_DIR``, compares the
    recorded feature dimension with ``expected_dim`` and logs the per-group
    sizes when they are available.

    Args:
        expected_dim: Feature-vector dimension the metadata must report.
            Defaults to 560.

    Returns:
        bool: True if the metadata loads and its dimension equals
        ``expected_dim``; False on a mismatch or on any load/parse error.
    """
    logger.info('')
    logger.info('=' * 60)
    logger.info('METADATA & SCHEMA CHECK')
    logger.info('=' * 60)
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(MODEL_DIR / 'multisource_metadata.json', encoding='utf-8') as f:
            metadata = json.load(f)
        # Check for both possible field names
        total_dim = metadata.get('total_dim') or metadata.get('vector_dim', 0)
        logger.info('✓ Multisource metadata loaded')
        logger.info(f' - Total dimension: {total_dim}')
        if total_dim != expected_dim:
            logger.error(f'✗ SCHEMA MISMATCH: Expected {expected_dim}, got {total_dim}')
            return False
        # Check feature groups (informational only: an unexpected value shape
        # here must not fail a schema whose dimension already matched).
        feature_groups = metadata.get('feature_groups') or metadata.get('group_keep_counts', {})
        if isinstance(feature_groups, dict):
            for group, dim_or_count in feature_groups.items():
                # Values are either a plain count or a dict carrying 'dim'.
                if isinstance(dim_or_count, dict):
                    dim = dim_or_count.get('dim', 0)
                else:
                    dim = dim_or_count
                logger.info(f' - {group}: {dim}')
        logger.info(f'✓ {expected_dim}-dimensional schema confirmed')
        return True
    except Exception as e:
        # Audit boundary: report any failure as a failed check, never crash.
        logger.error(f'✗ Metadata check failed: {e}')
        return False
def audit_model_config():
    """Check training config.

    Loads ``training_config.json`` from ``MODEL_DIR`` and logs the loss type,
    sampler, hidden dimension and learning rate it records. A configuration
    other than focal loss with a weighted sampler only produces a warning.

    Returns:
        bool: True if the config file loads; False on any load/parse error.
    """
    logger.info('')
    logger.info('=' * 60)
    logger.info('MODEL TRAINING CONFIG')
    logger.info('=' * 60)
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(MODEL_DIR / 'training_config.json', encoding='utf-8') as f:
            config = json.load(f)
        logger.info('✓ Training config loaded')
        logger.info(f' - Loss type: {config.get("loss_type")}')
        logger.info(f' - Sampler: {config.get("sampler")}')
        logger.info(f' - Hidden dim: {config.get("hidden_dim")}')
        logger.info(f' - Learning rate: {config.get("lr")}')
        if config.get('loss_type') == 'focal' and config.get('sampler') == 'weighted':
            logger.info('✓ Healthcare optimization features enabled')
        else:
            # Advisory only: the check still passes.
            logger.warning('⚠ Some optimization features may not be enabled')
        return True
    except Exception as e:
        # Audit boundary: report any failure as a failed check, never crash.
        logger.error(f'✗ Config check failed: {e}')
        return False
def audit_summary_metrics():
    """Check metrics from previous training.

    Loads ``ddi_mlp_best.summary.json`` from ``MODEL_DIR`` and logs the best
    validation accuracy, the dataset size and the number of entries in the
    training history.

    Returns:
        bool: True if the summary loads and its values can be formatted;
        False on any load, parse or formatting error.
    """
    logger.info('')
    logger.info('=' * 60)
    logger.info('PREVIOUS MODEL METRICS')
    logger.info('=' * 60)
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(MODEL_DIR / 'ddi_mlp_best.summary.json', encoding='utf-8') as f:
            summary = json.load(f)
        logger.info('✓ Model summary loaded')
        logger.info(f' - Accuracy: {summary.get("best_validation_accuracy", 0):.2%}')
        logger.info(f' - Dataset size: {summary.get("dataset_size", 0):,}')
        logger.info(f' - Training epochs: {len(summary.get("training_history", []))}')
        return True
    except Exception as e:
        # Audit boundary: report any failure as a failed check, never crash.
        logger.error(f'✗ Metrics check failed: {e}')
        return False
def audit_code_structure():
    """Check that production code files exist and can be read.

    Logs the line count of each file that is present and an error for each
    file that is missing or unreadable.

    Returns:
        bool: True if every listed file exists and was read; False otherwise.
    """
    logger.info('')
    logger.info('=' * 60)
    logger.info('PRODUCTION CODE STRUCTURE')
    logger.info('=' * 60)
    code_files = {
        'FastAPI Backend': BASE_DIR / 'src' / 'inference' / 'app_production.py',
        'Training Pipeline': BASE_DIR / 'src' / 'training' / 'train_production_simple.py',
        'Smoke Tests': BASE_DIR / 'src' / 'validation' / 'smoke_test.py',
        'Predictor': BASE_DIR / 'src' / 'inference' / 'predictor.py',
    }
    all_good = True
    for name, path in code_files.items():
        if not path.exists():
            logger.error(f'✗ {name} missing')
            all_good = False
            continue
        try:
            # Decode as UTF-8 rather than the locale encoding so sources with
            # non-ASCII characters do not fail on non-UTF-8 locales;
            # errors='replace' keeps a stray byte from aborting a line count.
            with open(path, encoding='utf-8', errors='replace') as f:
                lines = sum(1 for _ in f)
            logger.info(f'✓ {name}: {lines} lines')
        except Exception as e:
            # Audit boundary: report the failure for this file and keep going.
            logger.error(f'✗ {name}: {e}')
            all_good = False
    return all_good
def main():
    """Run quick audit.

    Runs every audit check in turn, logs a pass/fail summary and writes a
    JSON report to ``MODEL_DIR / 'reports' / 'quick_audit.json'`` (creating
    the directory if needed). The report records a timestamp, the result of
    each check and an overall ``READY`` / ``NEEDS_ATTENTION`` status.
    """
    # Local import: only the report timestamp needs it.
    from datetime import datetime

    # Width between the banner's side borders; the title row is padded to the
    # same width so all three rows line up.
    inner_width = 58
    logger.info('')
    logger.info('╔' + '═' * inner_width + '╗')
    logger.info('║' + ' MEDCARE-DDI QUICK PRODUCTION AUDIT'.ljust(inner_width) + '║')
    logger.info('╚' + '═' * inner_width + '╝')
    results = {
        'Files': audit_files(),
        'Metadata': audit_metadata(),
        'Config': audit_model_config(),
        'Metrics': audit_summary_metrics(),
        'Code': audit_code_structure(),
    }
    logger.info('')
    logger.info('=' * 60)
    logger.info('AUDIT SUMMARY')
    logger.info('=' * 60)
    all_passed = all(results.values())
    overall = '✓ READY' if all_passed else '⚠ NEEDS_ATTENTION'
    logger.info(f'{overall} - Production system status')
    for check, passed in results.items():
        mark = '✓' if passed else '✗'
        logger.info(f'{mark} {check}')
    logger.info('')
    # Save report
    report = {
        'timestamp': datetime.now().isoformat(),
        'checks': results,
        'status': 'READY' if all_passed else 'NEEDS_ATTENTION',
    }
    report_path = MODEL_DIR / 'reports' / 'quick_audit.json'
    report_path.parent.mkdir(parents=True, exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    logger.info(f'✓ Report saved to {report_path}')
    logger.info('')


# Run the audit only when executed as a script, not when imported.
if __name__ == '__main__':
    main()