File size: 7,160 Bytes
d29b763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""Simplified production system audit without complex imports."""
import json
import logging
import sys
from pathlib import Path

# Root logging setup: INFO and above, each record stamped with time, level
# and the emitting logger's name.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('medcare_ddi.quick_audit')

# Every path below is anchored at the directory two levels above the folder
# that contains this file.
BASE_DIR = Path(__file__).resolve().parents[2]
MODEL_DIR = BASE_DIR.joinpath('models')
DATA_PATH = BASE_DIR.joinpath('data', 'processed', 'ddinter_combined.parquet')
FEATURE_PIPELINE_MULTISOURCE_PATH = MODEL_DIR.joinpath('feature_pipeline_multisource.pkl')


def audit_files():
    """Check that all critical files exist.

    Logs one line per critical artifact, prefixed with a pass/fail marker.
    For artifacts whose suffix is in the size whitelist and which exist on
    disk, the on-disk size in MB is appended to the line.

    Returns:
        bool: True only if every listed file exists.
    """
    logger.info('='*60)
    logger.info('FILE EXISTENCE CHECK')
    logger.info('='*60)

    critical_files = {
        'Feature Pipeline (11MB)': FEATURE_PIPELINE_MULTISOURCE_PATH,
        'Model Checkpoint (318KB)': MODEL_DIR / 'ddi_mlp_best.pt',
        'Data File (13MB)': DATA_PATH,
        'Metadata': MODEL_DIR / 'multisource_metadata.json',
        'Training Config': MODEL_DIR / 'training_config.json',
        'FastAPI Backend': BASE_DIR / 'src' / 'inference' / 'app_production.py',
        'Production Training': BASE_DIR / 'src' / 'training' / 'train_production_simple.py',
        'Smoke Tests': BASE_DIR / 'src' / 'validation' / 'smoke_test.py',
    }

    # Suffixes whose size is reported. '.parquet' is included so the data
    # file (DATA_PATH) gets a size like the other large artifacts.
    sized_suffixes = {'.pkl', '.pt', '.csv', '.parquet'}

    all_good = True
    for name, path in critical_files.items():
        exists = path.exists()
        status = 'βœ“' if exists else 'βœ—'

        size_mb = None
        if exists and path.suffix in sized_suffixes:
            try:
                size_mb = path.stat().st_size / (1024 * 1024)
            except OSError:
                # The file disappeared or became unreadable after the
                # exists() check; report it without a size.
                size_mb = None

        if size_mb is None:
            logger.info(f'{status} {name}')
        else:
            logger.info(f'{status} {name}: {size_mb:.1f}MB')

        all_good = all_good and exists

    return all_good


def audit_metadata(expected_dim: int = 560) -> bool:
    """Check the multisource metadata file against the expected schema.

    Loads ``multisource_metadata.json`` from ``MODEL_DIR``, reads the total
    feature dimension (``total_dim``, falling back to ``vector_dim``) and
    compares it with ``expected_dim``. When the dimension matches, the
    per-group sizes found under ``feature_groups`` (or ``group_keep_counts``)
    are logged as well.

    Args:
        expected_dim: Total feature dimension the metadata must declare.
            Defaults to 560.

    Returns:
        bool: True if the metadata loads and its dimension equals
        ``expected_dim``; False on a mismatch or on any error while reading
        or interpreting the file.
    """
    logger.info('')
    logger.info('='*60)
    logger.info('METADATA & SCHEMA CHECK')
    logger.info('='*60)

    try:
        # JSON is read as UTF-8 explicitly so the result does not depend on
        # the platform's locale encoding.
        with open(MODEL_DIR / 'multisource_metadata.json', encoding='utf-8') as f:
            metadata = json.load(f)

        # Check for both possible field names
        total_dim = metadata.get('total_dim') or metadata.get('vector_dim', 0)
        logger.info('βœ“ Multisource metadata loaded')
        logger.info(f'  - Total dimension: {total_dim}')

        if total_dim != expected_dim:
            logger.error(f'βœ— SCHEMA MISMATCH: Expected {expected_dim}, got {total_dim}')
            return False

        # Check feature groups
        feature_groups = metadata.get('feature_groups') or metadata.get('group_keep_counts', {})
        if feature_groups:
            for group, dim_or_count in feature_groups.items():
                # Handle both dict and int values
                dim = dim_or_count if isinstance(dim_or_count, int) else dim_or_count.get('dim', 0)
                logger.info(f'  - {group}: {dim}')

        logger.info(f'βœ“ {expected_dim}-dimensional schema confirmed')
        return True

    except Exception as e:
        # Audit boundary: any failure (missing file, bad JSON, unexpected
        # structure) is reported as a failed check rather than a crash.
        logger.error(f'βœ— Metadata check failed: {e}')
        return False


def audit_model_config():
    """Report the key settings stored in the training config.

    Reads ``training_config.json`` from ``MODEL_DIR`` and logs the loss
    type, sampler, hidden dimension and learning rate. A focal loss combined
    with a weighted sampler is logged as an informational line; any other
    combination is logged as a warning but still counts as a pass.

    Returns:
        bool: True when the config was read and reported; False if reading
        or reporting raised.
    """
    for banner_line in ('', '='*60, 'MODEL TRAINING CONFIG', '='*60):
        logger.info(banner_line)

    try:
        config_file = MODEL_DIR / 'training_config.json'
        config = json.loads(config_file.read_text())

        logger.info('βœ“ Training config loaded')
        reported_fields = (
            ('Loss type', 'loss_type'),
            ('Sampler', 'sampler'),
            ('Hidden dim', 'hidden_dim'),
            ('Learning rate', 'lr'),
        )
        for label, key in reported_fields:
            logger.info(f'  - {label}: {config.get(key)}')

        uses_focal = config.get('loss_type') == 'focal'
        if uses_focal and config.get('sampler') == 'weighted':
            logger.info('βœ“ Healthcare optimization features enabled')
        else:
            logger.warning('⚠ Some optimization features may not be enabled')

    except Exception as err:
        logger.error(f'βœ— Config check failed: {err}')
        return False

    # Both outcomes of the optimisation check count as a pass.
    return True


def audit_summary_metrics():
    """Report headline metrics recorded by the previous training run.

    Reads ``ddi_mlp_best.summary.json`` from ``MODEL_DIR`` and logs the best
    validation accuracy, the dataset size and the number of entries in the
    training history. Missing keys fall back to 0 (or an empty history).

    Returns:
        bool: True when the summary was read and reported; False if reading
        or formatting raised.
    """
    rule = '='*60
    logger.info('')
    logger.info(rule)
    logger.info('PREVIOUS MODEL METRICS')
    logger.info(rule)

    try:
        summary_file = MODEL_DIR / 'ddi_mlp_best.summary.json'
        with summary_file.open() as handle:
            summary = json.load(handle)

        logger.info('βœ“ Model summary loaded')

        accuracy = summary.get("best_validation_accuracy", 0)
        logger.info(f'  - Accuracy: {accuracy:.2%}')

        dataset_size = summary.get("dataset_size", 0)
        logger.info(f'  - Dataset size: {dataset_size:,}')

        history = summary.get("training_history", [])
        logger.info(f'  - Training epochs: {len(history)}')

    except Exception as err:
        logger.error(f'βœ— Metrics check failed: {err}')
        return False

    return True


def audit_code_structure():
    """Check that production code files exist and have content.

    For each listed source file, logs an error if the file is missing,
    unreadable or empty; otherwise logs its line count.

    Returns:
        bool: True only if every listed file exists, can be read and
        contains at least one line.
    """
    logger.info('')
    logger.info('='*60)
    logger.info('PRODUCTION CODE STRUCTURE')
    logger.info('='*60)

    code_files = {
        'FastAPI Backend': BASE_DIR / 'src' / 'inference' / 'app_production.py',
        'Training Pipeline': BASE_DIR / 'src' / 'training' / 'train_production_simple.py',
        'Smoke Tests': BASE_DIR / 'src' / 'validation' / 'smoke_test.py',
        'Predictor': BASE_DIR / 'src' / 'inference' / 'predictor.py',
    }

    all_good = True
    for name, path in code_files.items():
        if not path.exists():
            logger.error(f'βœ— {name} missing')
            all_good = False
            continue

        try:
            # Only the line count matters here, so decode as UTF-8 and
            # replace undecodable bytes: the result must not depend on the
            # platform's locale encoding. Lines are counted lazily instead
            # of loading the whole file into a list.
            with open(path, encoding='utf-8', errors='replace') as f:
                lines = sum(1 for _ in f)
        except OSError as e:
            logger.error(f'βœ— {name}: {e}')
            all_good = False
            continue

        if lines == 0:
            # An empty file exists but has no content, which this check is
            # documented to reject.
            logger.error(f'βœ— {name}: file is empty')
            all_good = False
        else:
            logger.info(f'βœ“ {name}: {lines} lines')

    return all_good


def main():
    """Run quick audit.

    Executes every audit check in turn, logs a per-check and overall
    summary, and writes a JSON report (timestamp, per-check results, overall
    status) to ``reports/quick_audit.json`` under ``MODEL_DIR``, creating the
    directory if needed.

    Returns:
        None. The outcome is communicated through the log and the report.

    Raises:
        OSError: If the report directory or file cannot be written.
    """
    # Imported locally so this function carries its own dependency instead
    # of reaching for the module through __import__.
    from datetime import datetime

    logger.info('')
    logger.info('β•”' + '═'*58 + 'β•—')
    logger.info('β•‘ MEDCARE-DDI QUICK PRODUCTION AUDIT' + ' '*24 + 'β•‘')
    logger.info('β•š' + '═'*58 + '╝')

    # Dict literal order determines the order the checks run and are listed.
    results = {
        'Files': audit_files(),
        'Metadata': audit_metadata(),
        'Config': audit_model_config(),
        'Metrics': audit_summary_metrics(),
        'Code': audit_code_structure(),
    }

    logger.info('')
    logger.info('='*60)
    logger.info('AUDIT SUMMARY')
    logger.info('='*60)

    all_passed = all(results.values())
    overall = 'βœ“ READY' if all_passed else '⚠ NEEDS_ATTENTION'
    logger.info(f'{overall} - Production system status')

    for check, passed in results.items():
        mark = 'βœ“' if passed else 'βœ—'
        logger.info(f'{mark} {check}')

    logger.info('')

    # Save report
    report = {
        'timestamp': datetime.now().isoformat(),
        'checks': results,
        'status': 'READY' if all_passed else 'NEEDS_ATTENTION',
    }

    report_path = MODEL_DIR / 'reports' / 'quick_audit.json'
    report_path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding keeps the report independent of the locale.
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    logger.info(f'βœ“ Report saved to {report_path}')
    logger.info('')

# Run the audit only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()