Spaces:

AI-Med-Lab
/

SUD-PROMISE

Sleeping

File size: 15,366 Bytes

d107fd9

"""
Data Generator Module
Extracted from sud_promise_uab_theme.py
Handles disease ID mapping and synthetic data generation with real ML/DL scores
"""

import random
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import List, Optional

# Import dependencies
from const_ui import (
    UAB_GREEN, UAB_DARK_GREEN, UAB_LIGHT_GREEN, UAB_ACCENT_TEAL,
    STAGE_MAPPING
)
from const_data import (
    SUD_CATEGORY_DESCRIPTIONS, DISEASE_SEARCH_CONFIG,
    DRUG_TEMPLATES, PROJECT_TEMPLATES
)
from func_drug import find_drug_in_database

# ========================================
# DATA MODELS
# ========================================

@dataclass
class Project:
    """Research project providing evidence.

    Plain data record. DrugCandidate stores a list of these in its
    ``attached_projects`` field.
    """
    # Identifier string; generate_synthetic_data in this module builds it as
    # "proj_<candidate number>_<project index>".
    id: str
    # Display name; the generator copies it from a PROJECT_TEMPLATES entry.
    name: str
    # Project type label; the generator copies it from the template's "type".
    project_type: str
    # When the project was attached to the candidate.
    added_date: datetime
    # Sample size; the generator draws it from the template's "size_range".
    sample_size: int
    # Contribution of this project to the candidate's score. The generator
    # distributes (evidence_score - baseline_score) across a candidate's
    # projects, so individual values may be negative.
    impact_score: float
    # Status label; the generator uses "Completed" or "Active".
    status: str
    # Free-text summary; the generator copies it from the template.
    summary: str

@dataclass
class DrugCandidate:
    """Drug being evaluated for repositioning.

    Plain data record. Field order is part of the constructor interface and
    must not be changed. Comments below describe how generate_synthetic_data
    in this module populates each field.
    """
    # Identifier string; the generator builds it as "cand_<number>".
    id: str
    drug_name: str
    # What the drug is currently used for (second item of a DRUG_TEMPLATES row).
    current_indication: str
    # Name of the SUD category this candidate is being evaluated for.
    target_sud_subtype: str
    mechanism: str
    # Stage code such as "S0"; the generator parses the character after the
    # leading letter as the stage number.
    stage: str
    # Baseline plus total project impact; the generator clamps it to [0.20, 0.95].
    evidence_score: float
    # Starting score: the model 'Ensemble' value when score_type is "Real",
    # otherwise a random value drawn by the generator.
    baseline_score: float
    # SMILES string for the drug (database value, or a placeholder chain when
    # the drug is not found in the database).
    smiles: str
    attached_projects: List[Project]
    last_updated: datetime
    # Activity counters. The generator fills these with random integers whose
    # ranges grow with the stage number.
    cohort_count: int
    has_market_analysis: bool
    has_validation_plan: bool
    team_members: int
    data_produced: int
    publications: int
    tools_used: int
    data_governance: int
    training_participated: int
    # Entry date of the current stage. Optional because the default is None;
    # the previous annotation (plain ``datetime``) contradicted that default.
    stage_entry_date: Optional[datetime] = None
    # Chronological (stage_name, entry_datetime) tuples.
    stage_history: List[tuple] = field(default_factory=list)
    # "Synthetic" or "Real", depending on whether model predictions were used.
    score_type: str = "Synthetic"
    # Result dict returned by the prediction helper for "Real" scores; empty
    # otherwise.
    model_scores: dict = field(default_factory=dict)
    protein_targets: List[str] = field(default_factory=list)
    # Disease ID of the target category, or "" when none was mapped.
    disease_id: str = ""

@dataclass
class SUDCategory:
    """SUD disease category.

    Plain data record; generate_synthetic_data in this module creates one
    instance per supported category and returns them alongside the candidates.
    """
    # Category name; also used as the lookup key into DRUG_TEMPLATES,
    # SUD_CATEGORY_DESCRIPTIONS and the disease-ID mapping.
    name: str
    # The generator passes "" here and puts the actual colour in hex_color.
    color: str
    # Colour value for the category (a UAB_* constant or a "#RRGGBB" literal).
    hex_color: str
    # The generator passes "" here.
    icon: str
    # Number of candidates generated for this category; starts at 0 and is
    # updated by the generator after the category has been processed.
    candidate_count: int
    description: str
    # Disease ID mapped to this category, or "" when no mapping was found.
    disease_id: str = ""

# ========================================
# DISEASE ID MAPPING
# ========================================

def setup_disease_mapping(diseases_df):
    """Map each configured SUD category to a disease ID from the database.

    For every category in ``DISEASE_SEARCH_CONFIG`` the category's
    ``search_terms`` are tried in order. For each term a case-insensitive
    exact match on ``DiseaseName`` is attempted first, then a
    case-insensitive *literal* substring match. The first row of the first
    hit supplies the ``DiseaseID`` and the remaining terms are skipped.
    Progress and a final summary are printed to stdout.

    Args:
        diseases_df: DataFrame with ``DiseaseName`` and ``DiseaseID``
            columns, or ``None`` when no disease table is available.

    Returns:
        dict mapping category name to disease ID. Categories without any
        match are omitted; the dict is empty when ``diseases_df`` is None.
    """
    mapping = {}

    if diseases_df is None:
        return mapping

    print("\n🔍 Searching for best disease name matches in database...")

    # Lower-case the name column once instead of once per search term.
    names_lower = diseases_df['DiseaseName'].str.lower()

    for category, config in DISEASE_SEARCH_CONFIG.items():
        print(f"\n Searching for: {category}")

        for search_term in config["search_terms"]:
            needle = search_term.lower()

            # Try exact match first
            exact_matches = diseases_df[names_lower == needle]
            if not exact_matches.empty:
                disease_id = exact_matches.iloc[0]['DiseaseID']
                disease_name = exact_matches.iloc[0]['DiseaseName']
                mapping[category] = disease_id
                print(f"    EXACT MATCH: '{disease_name}'")
                print(f"      → {disease_id}")
                break

            # Try partial match if exact fails. regex=False makes this a plain
            # substring test: search terms are names, not patterns, so
            # characters such as "(" or "." must not be interpreted as regex
            # syntax (they previously raised re.error or over-matched).
            partial_mask = names_lower.str.contains(needle, na=False, regex=False)
            partial_matches = diseases_df[partial_mask]
            if not partial_matches.empty:
                disease_id = partial_matches.iloc[0]['DiseaseID']
                disease_name = partial_matches.iloc[0]['DiseaseName']
                mapping[category] = disease_id
                print(f"    Partial match for '{search_term}': '{disease_name}'")
                print(f"      → {disease_id}")
                break
        else:
            # Loop finished without a break: no term matched (or no terms).
            print(f"    No matches found for {category}")

    print(f"\n{'='*80}")
    print(f"📋 FINAL DISEASE MAPPING:")
    print(f"{'='*80}")
    for category, disease_id in mapping.items():
        disease_row = diseases_df[diseases_df['DiseaseID'] == disease_id]
        if not disease_row.empty:
            disease_name = disease_row.iloc[0]['DiseaseName']
            print(f"{category}:")
            print(f"   → {disease_id} ({disease_name})")

    # Report against the number of configured categories rather than a
    # hard-coded 6, so the summary stays correct if the config changes.
    print(f"\n Successfully mapped {len(mapping)}/{len(DISEASE_SEARCH_CONFIG)} SUD categories\n")

    return mapping

# ========================================
# SYNTHETIC DATA GENERATION WITH REAL ML/DL SCORES
# ========================================

def _build_stage_history(stage_num, start_date):
    """Return ``[(stage_name, entry_datetime), ...]`` for stages S0..S<stage_num>.

    The first stage is entered at ``start_date``; every stage is given a
    random 60-120 day duration and the next stage is entered when it ends.
    """
    # NOTE(review): durations are drawn independently of the candidate's time
    # in the pipeline, so for stage numbers of 4 or more the cumulative
    # durations can exceed it and later entry dates can land in the future.
    # Left unchanged here; confirm the intended behaviour.
    history = []
    entry_date = start_date
    for stage_index in range(stage_num + 1):
        days_in_stage = random.randint(60, 120)
        history.append((f"S{stage_index}", entry_date))
        entry_date = entry_date + timedelta(days=days_in_stage)
    return history


def _build_projects(candidate_id, num_projects, total_impact, days_in_pipeline, now):
    """Create ``num_projects`` projects whose impact scores sum to ``total_impact``.

    The sum is exact up to floating-point rounding. The returned list is
    sorted by ``added_date`` (oldest first).
    """
    projects = []
    remaining_impact = total_impact
    for i in range(num_projects):
        template = random.choice(PROJECT_TEMPLATES)
        # The bounds stay ordered for the values used by the caller
        # (3-5 projects, at least 360 days in the pipeline).
        days_ago = random.randint(30 + i * 50, days_in_pipeline - (num_projects - i) * 30)

        if i == num_projects - 1:
            # Last project gets exactly the remaining impact
            impact = remaining_impact
        else:
            # Random portion (at most 60%) of what is left, so that earlier
            # projects cannot exhaust the budget; small negatives are allowed.
            impact = random.uniform(-0.05, remaining_impact * 0.6)
            remaining_impact -= impact

        projects.append(Project(
            id=f"proj_{candidate_id}_{i}",
            name=template["name"],
            project_type=template["type"],
            added_date=now - timedelta(days=days_ago),
            sample_size=random.randint(*template["size_range"]),
            impact_score=impact,
            status="Completed" if random.random() > 0.2 else "Active",
            summary=template["summary"]
        ))

    projects.sort(key=lambda p: p.added_date)
    return projects


def generate_synthetic_data(DISEASE_ID_MAPPING, MODELS_AVAILABLE, ml_components, drugs_df):
    """Generate synthetic SUD repositioning data, using real ML/DL scores where possible.

    One SUDCategory is created for each of the six supported categories, and
    one DrugCandidate for every row of ``DRUG_TEMPLATES`` under a category.
    When a category has a disease ID and models are available, the baseline
    score comes from the model 'Ensemble' prediction ("Real"); otherwise a
    random baseline is used ("Synthetic"). Each candidate gets 3-5 projects
    whose impact scores add up to ``evidence_score - baseline_score``.
    Progress and summary statistics are printed to stdout.

    Args:
        DISEASE_ID_MAPPING: dict mapping category name to disease ID;
            missing categories are treated as unmapped.
        MODELS_AVAILABLE: truthy when model predictions should be attempted.
        ml_components: passed through unchanged to ``predict_with_ml_models``.
        drugs_df: passed through unchanged to ``find_drug_in_database``.

    Returns:
        Tuple ``(sud_categories, candidates)``: the list of SUDCategory
        objects (with ``candidate_count`` filled in) and the list of
        DrugCandidate objects.

    Raises:
        KeyError: if a category has no entry in ``SUD_CATEGORY_DESCRIPTIONS``
            or a template's stage is not a key of ``STAGE_MAPPING``.
    """
    # Import here to avoid circular dependency
    from func_models import predict_with_ml_models

    print("\n" + "="*70)
    print("🔬 GENERATING CANDIDATE DATA WITH ML/DL PREDICTIONS")
    print("="*70)

    # (category name, hex colour) in display order.
    category_colors = [
        ("Opioid-Related Disorders", UAB_GREEN),
        ("Alcohol Use Disorder", UAB_ACCENT_TEAL),
        ("Stimulant Use Disorder", UAB_LIGHT_GREEN),
        ("Cannabis Use Disorder", UAB_DARK_GREEN),
        ("Sedative/Hypnotic Disorder", "#4A8B7A"),
        ("Nicotine Use Disorder", "#2C7A64"),
    ]
    sud_categories = [
        SUDCategory(
            name=name,
            color="",
            hex_color=hex_color,
            icon="",
            candidate_count=0,
            description=SUD_CATEGORY_DESCRIPTIONS[name],
            disease_id=DISEASE_ID_MAPPING.get(name, ""),
        )
        for name, hex_color in category_colors
    ]

    # Single reference time so every generated date is relative to the same
    # instant.
    now = datetime.now()

    candidates = []
    total_real_scores = 0
    total_synthetic_scores = 0

    for category in sud_categories:
        if category.name not in DRUG_TEMPLATES:
            continue

        category_real = 0
        category_synthetic = 0

        print(f"\n🔬 Processing {category.name}...")
        if category.disease_id:
            print(f"   Disease ID: {category.disease_id}")
        else:
            print(f"     No disease ID mapped - using synthetic scores only")

        for drug_name, current_use, mechanism, stage in DRUG_TEMPLATES[category.name]:
            candidate_id = len(candidates) + 1

            drug_smiles, drug_targets = find_drug_in_database(drug_name, drugs_df)
            if not drug_smiles:
                # Placeholder SMILES (a carbon chain ending in N) if not found.
                # NOTE(review): this placeholder is still sent to the models
                # below and can be labelled "Real" - confirm that is intended.
                drug_smiles = f"C{'C' * random.randint(5, 15)}N"
                drug_targets = []
                print(f"     {drug_name}: Not in database, using synthetic SMILES")
            else:
                print(f"    {drug_name}: Found in database")

            # Stage codes look like "S<digit>"; only the first digit is read.
            stage_num = int(stage[1])

            days_in_pipeline = random.randint(360, 900)
            stage_history = _build_stage_history(
                stage_num, now - timedelta(days=days_in_pipeline)
            )
            # The history always holds at least the S0 entry.
            stage_entry_date = stage_history[-1][1]

            # The stage's score range is not used here; the lookup is kept so
            # an unknown stage code still fails fast with KeyError.
            _ = STAGE_MAPPING[stage]

            baseline = random.uniform(0.40, 0.55)

            # Defaults describe a synthetic score; overridden below when the
            # models return a real prediction.
            score_type = "Synthetic"
            model_scores = {}
            baseline_score = baseline
            impact_low, impact_high = 0.20, 0.40
            prediction_attempted = False
            message = None

            if category.disease_id and drug_smiles and MODELS_AVAILABLE:
                prediction_attempted = True
                print(f"      Predicting {drug_name} → {category.disease_id}...", end=" ")
                ml_results, message, ml_score_type = predict_with_ml_models(
                    drug_smiles, drug_targets, category.disease_id, ml_components
                )
                if ml_results is not None and ml_score_type == "Real":
                    score_type = "Real"
                    model_scores = ml_results
                    # The ensemble prediction serves as the baseline.
                    baseline_score = ml_results.get('Ensemble', baseline)
                    impact_low, impact_high = 0.15, 0.35

            num_projects = random.randint(3, 5)
            evidence_score = baseline_score + random.uniform(impact_low, impact_high)
            evidence_score = max(0.20, min(0.95, evidence_score))
            # Recompute after clamping so project impacts add up to the gap
            # that is actually reported.
            total_impact_needed = evidence_score - baseline_score

            if score_type == "Real":
                print(f" Real ML/DL: {baseline_score:.3f} → {evidence_score:.3f}")
                category_real += 1
                total_real_scores += 1
            else:
                if prediction_attempted:
                    print(f"  Synthetic score: {evidence_score:.3f} ({message})")
                else:
                    print(f"      {drug_name}: Synthetic score: {evidence_score:.3f}")
                category_synthetic += 1
                total_synthetic_scores += 1

            projects = _build_projects(
                candidate_id, num_projects, total_impact_needed, days_in_pipeline, now
            )

            candidates.append(DrugCandidate(
                id=f"cand_{candidate_id}",
                drug_name=drug_name,
                current_indication=current_use,
                target_sud_subtype=category.name,
                mechanism=mechanism,
                stage=stage,
                evidence_score=evidence_score,
                baseline_score=baseline_score,
                smiles=drug_smiles,
                attached_projects=projects,
                last_updated=now - timedelta(days=random.randint(1, 30)),
                cohort_count=random.randint(1 + stage_num, 4 + stage_num * 2),
                has_market_analysis=random.random() > 0.3,
                has_validation_plan=random.random() > 0.4,
                team_members=random.randint(2 + stage_num, 5 + stage_num * 2),
                data_produced=random.randint(1 + stage_num, 3 + stage_num * 2),
                publications=random.randint(0 + stage_num // 2, 2 + stage_num),
                tools_used=random.randint(1 + stage_num // 2, 3 + stage_num),
                data_governance=random.randint(1, 2 + stage_num // 2),
                training_participated=random.randint(stage_num, 2 + stage_num),
                stage_entry_date=stage_entry_date,
                stage_history=stage_history,
                score_type=score_type,
                model_scores=model_scores,
                protein_targets=drug_targets,
                disease_id=category.disease_id
            ))

        # Update category count
        category.candidate_count = category_real + category_synthetic
        print(f"   Summary: {category_real} real, {category_synthetic} synthetic")

    # Guard the percentage: with no matching templates there are no
    # candidates, and the division used to raise ZeroDivisionError.
    total_candidates = len(candidates)
    if total_candidates:
        real_percentage = total_real_scores / total_candidates * 100
    else:
        real_percentage = 0.0

    print(f"\n{'='*70}")
    print(f" FINAL STATISTICS")
    print(f"{'='*70}")
    print(f"Total candidates: {total_candidates}")
    print(f"Real ML/DL scores: {total_real_scores}")
    print(f"Synthetic scores: {total_synthetic_scores}")
    print(f"Real score percentage: {real_percentage:.1f}%")
    print(f"{'='*70}\n")

    return sud_categories, candidates