"""
N2N Precision Engine — Production API v3.0
Inventor: Manav Vanga | Patent Pending 2026
Brain: DNABERT-2 v2 (Pearson r=0.941, trained on 30,387 biological variants)
Calibrated thresholds: HIGH=0.88, MED=0.76
Includes: Full drug database + ClinicalTrials.gov live integration
"""

import os, re, hashlib, threading
from datetime import datetime, timezone
import numpy as np
import requests
from flask import Flask, request, jsonify
from flask_cors import CORS

# Flask application object; every @app.route handler below registers on it.
app = Flask(__name__)
# Enable CORS through flask_cors. No options are passed, so no origin
# restrictions are configured here — the library defaults apply.
CORS(app)

# ── Inventor constants ────────────────────────────────────────────
# Per-base "slip" score used by the scoring helpers. 'T' and 'U' share one
# value; 'N' (the padding / unknown base) is 0.50, which is also the fallback
# the helpers use for any base not listed here.
SLIP_SCORES = {'C':0.82,'A':0.61,'T':0.34,'U':0.34,'G':0.19,'N':0.50}
# Per-position weights, paired by index with the bases of a window that the
# helpers pad/truncate to 30 characters. Index 18 (the base this file reads as
# "+4") carries the largest weight, 1.80; indices 15-17 (read as the stop
# codon) are 1.00 each.
# NOTE(review): the list has 29 entries but windows are 30 bases long, so the
# zip() in the scoring helpers never weights the final base — confirm whether
# a weight is missing.
POSITION_WEIGHTS = [
    0.20,0.22,0.24,0.26,0.28,0.32,0.36,0.42,0.50,0.58,
    0.65,0.72,0.80,0.88,0.95,1.00,1.00,1.00,1.80,
    1.40,1.20,1.00,0.85,0.72,0.60,0.50,0.42,0.36,0.28
]

# ── Calibrated thresholds (from validation on 10 known variants) ──
# get_tier() compares a score against these with >=: at or above HIGH → 'HIGH',
# otherwise at or above MED → 'MEDIUM', otherwise 'LOW'.
HIGH_THRESHOLD = 0.88
MED_THRESHOLD  = 0.76

# Maps the +4 base to a (label, description) pair that /api/score returns as
# plus4_road / plus4_road_desc. Bases without an entry (e.g. 'N') are reported
# as ('Unknown', 'Unknown') by the route.
PLUS4_ROAD = {
    'C':('Slippery','High readthrough — ribosome slides through stop codon'),
    'A':('Smooth',  'Moderate readthrough — some ribosomal slippage'),
    'T':('Rough',   'Low readthrough — ribosome mostly terminates'),
    'U':('Rough',   'Low readthrough — ribosome mostly terminates'),
    'G':('Sticky',  'Very low readthrough — ribosome terminates strongly'),
}

# ── Complete drug database ────────────────────────────────────────
# Static lookup keyed by the tier string returned by get_tier()
# ('HIGH' / 'MEDIUM' / 'LOW'). /api/score indexes it with DRUG_DATABASE[tier]
# and copies these keys into the response:
#   'therapy', 'mechanism'          – summary strings
#   'approved', 'phase3', 'phase2'  – lists of dicts (name, status, diseases,
#                                     note, and sometimes mechanism / dose)
#   'preclinical', 'combination'    – lists of plain strings
# NOTE(review): every status below is a hard-coded string; nothing in this
# file refreshes it from a live source, so the entries need periodic manual
# verification.
DRUG_DATABASE = {
    # Tier returned when the score is at or above HIGH_THRESHOLD.
    'HIGH': {
        'therapy': 'Readthrough Therapy — Strong Candidate',
        'mechanism': 'Promote ribosomal readthrough of premature stop codon',
        'approved': [
            {
                'name': 'Ataluren (PTC124)',
                'status': 'EMA Approved (EU) — FDA Breakthrough Therapy',
                'diseases': ['Duchenne MD', 'Cystic Fibrosis'],
                'dose': '10/10/20 mg/kg three times daily',
                'note': 'First-in-class readthrough drug'
            },
        ],
        'phase3': [
            {
                'name': 'ELX-02 (Eloxx)',
                'status': 'Phase 3 Clinical Trial',
                'diseases': ['Cystic Fibrosis', 'Dravet Syndrome'],
                'mechanism': 'Eukaryotic ribosome-targeting aminoglycoside',
                'note': 'More selective than gentamicin, less nephrotoxic'
            },
        ],
        'phase2': [
            {
                'name': 'SRI-37240 + SRI-41315',
                'status': 'Phase 2',
                'diseases': ['Cystic Fibrosis'],
                'mechanism': 'Novel readthrough compound class',
                'note': 'University of Alabama Birmingham'
            },
            {
                'name': 'Gentamicin (G418)',
                'status': 'Phase 2 / Off-label',
                'diseases': ['Multiple — aminoglycoside readthrough'],
                'mechanism': 'Aminoglycoside-induced misreading of stop codon',
                'note': 'Nephrotoxicity limits long-term use'
            },
        ],
        'preclinical': [
            'Negamycin derivatives',
            'NV848 (Nonsense Therapeutics)',
            'Escin — natural readthrough compound',
            'Tylosin — macrolide with readthrough activity',
        ],
        'combination': [
            'Ataluren + NMD inhibitor (amlexanox)',
            'ELX-02 + CFTR corrector (lumacaftor)',
            'Readthrough + proteasome inhibitor',
        ]
    },
    # Tier returned when the score is at or above MED_THRESHOLD but below HIGH.
    'MEDIUM': {
        'therapy': 'Combination Approach — Moderate Candidate',
        'mechanism': 'Combine readthrough with NMD suppression',
        'approved': [
            {
                'name': 'Gentamicin',
                'status': 'Off-label / Investigational',
                'diseases': ['Multiple'],
                'note': 'Short-term use, monitor kidneys'
            }
        ],
        'phase3': [
            {
                'name': 'ELX-02',
                'status': 'Phase 3 — may benefit moderate responders',
                'diseases': ['CF', 'Dravet'],
                'note': 'Trial enrollment open'
            }
        ],
        'phase2': [
            {
                'name': 'Amlexanox + Readthrough',
                'status': 'Phase 2 combination',
                'diseases': ['Multiple NMD diseases'],
                'mechanism': 'NMD inhibition prolongs readthrough mRNA',
                'note': 'Increases mRNA half-life for readthrough product'
            }
        ],
        'preclinical': [
            'SMG1 kinase inhibitors',
            'NMDI-14',
            'UPF1 inhibitors',
        ],
        'combination': [
            'Readthrough + NMD inhibitor',
            'Low-dose gentamicin + antioxidant',
        ]
    },
    # Tier returned for every score below MED_THRESHOLD.
    'LOW': {
        'therapy': 'Alternative Strategy — Poor Readthrough Candidate',
        'mechanism': 'Bypass or compensate for the nonsense mutation',
        'approved': [
            {
                'name': 'Eteplirsen (Exondys 51)',
                'status': 'FDA Approved',
                'diseases': ['Duchenne MD — exon 51 skipping'],
                'note': 'Exon skipping — bypasses mutation entirely'
            },
            {
                'name': 'Nusinersen (Spinraza)',
                'status': 'FDA Approved',
                'diseases': ['Spinal Muscular Atrophy'],
                'note': 'Antisense oligonucleotide — splicing modulation'
            },
            {
                'name': 'Onasemnogene (Zolgensma)',
                'status': 'FDA Approved',
                'diseases': ['SMA type 1'],
                'note': 'Gene replacement therapy'
            },
        ],
        # NOTE(review): this entry sits under 'phase3' although its status
        # string says 'FDA Approved' — confirm which list it belongs in.
        'phase3': [
            {
                'name': 'Casimersen (Amondys 45)',
                'status': 'FDA Approved — exon 45 skipping',
                'diseases': ['Duchenne MD'],
                'note': 'Exon skipping strategy'
            }
        ],
        'phase2': [
            {
                'name': 'Gene therapy vectors',
                'status': 'Multiple Phase 1/2 trials',
                'diseases': ['Disease-specific'],
                'note': 'AAV-delivered corrected gene copy'
            }
        ],
        'preclinical': [
            'Base editing (adenine base editor)',
            'Prime editing',
            'CRISPR-Cas9 correction',
            'Codon suppressor tRNA therapy',
        ],
        'combination': [
            'Exon skipping + supportive care',
            'Gene therapy + enzyme replacement',
        ]
    }
}

# ── ClinicalTrials.gov integration ────────────────────────────────
# Lower-case drug names and search phrases related to readthrough therapy.
# NOTE(review): no code in the visible part of this module reads this list
# (the fetch helpers below build their queries from literals) — confirm
# whether it is still needed or was meant to drive those queries.
READTHROUGH_DRUGS = [
    'ataluren','ptc124','elx-02','gentamicin','eloxx',
    'readthrough','nonsense mutation','premature stop codon'
]

def fetch_clinical_trials(gene=None, condition=None, max_trials=5):
    """
    Fetch live clinical trials from ClinicalTrials.gov API v2.
    Free, no API key needed.

    Args:
        gene: Optional gene symbol; added to the free-text query when truthy.
        condition: Optional condition / disease name; added to the free-text
            query when truthy.
        max_trials: Page size requested from the API.

    Returns:
        A list of dicts with the keys nct_id, title, phase, status,
        conditions, interventions, locations and url (list-valued fields are
        capped at three items). The lookup is best-effort: a non-200 response
        or any error yields an empty list instead of raising.
    """
    try:
        # Free-text query: optional gene / condition followed by the fixed
        # readthrough phrase.
        terms = [term for term in (gene, condition) if term]
        terms.append('nonsense mutation readthrough')
        query = ' '.join(terms)

        url = "https://clinicaltrials.gov/api/v2/studies"
        params = {
            'query.term':   query,
            'filter.overallStatus': 'RECRUITING,ACTIVE_NOT_RECRUITING,ENROLLING_BY_INVITATION',
            'pageSize':     max_trials,
            'format':       'json',
            'fields':       'NCTId,BriefTitle,Phase,OverallStatus,Condition,InterventionName,LocationCity,LocationCountry,StartDate,PrimaryCompletionDate'
        }

        resp = requests.get(url, params=params, timeout=10)
        if resp.status_code != 200:
            return []

        trials = []
        for study in resp.json().get('studies', []):
            proto  = study.get('protocolSection', {})
            ident  = proto.get('identificationModule', {})
            status = proto.get('statusModule', {})
            design = proto.get('designModule', {})
            conds  = proto.get('conditionsModule', {})
            interv = proto.get('armsInterventionsModule', {})
            locs   = proto.get('contactsLocationsModule', {})

            # API v2 reports phases as a list under designModule; the
            # statusModule lookup is kept only as a fallback so the field
            # still degrades to 'N/A' when nothing is available.
            phases = design.get('phases') or []
            phase = '/'.join(phases) if phases else status.get('phase', 'N/A')

            interventions = [
                item.get('name', '') for item in interv.get('interventions', [])
            ]

            locations = []
            for loc in locs.get('locations', [])[:3]:
                # Join only the parts that are present so a missing city or
                # country does not leave a dangling ", ".
                parts = [p for p in (loc.get('city', ''), loc.get('country', '')) if p]
                if parts:
                    locations.append(', '.join(parts))

            nct_id = ident.get('nctId', '')
            trials.append({
                'nct_id':        nct_id,
                'title':         ident.get('briefTitle', ''),
                'phase':         phase,
                'status':        status.get('overallStatus', ''),
                'conditions':    conds.get('conditions', [])[:3],
                'interventions': interventions[:3],
                'locations':     locations[:3],
                'url':           'https://clinicaltrials.gov/study/' + nct_id,
            })

        return trials

    except Exception as e:
        # Best-effort lookup: report the failure but never break the caller.
        print("ClinicalTrials.gov lookup failed: " + str(e))
        return []

def fetch_drug_trials(drug_name, max_trials=3):
    """Fetch trials for a specific drug.

    Queries ClinicalTrials.gov API v2 for ``drug_name + ' nonsense mutation'``
    restricted to recruiting / active studies.

    Args:
        drug_name: Drug name used as the leading search term.
        max_trials: Page size requested from the API.

    Returns:
        A list of dicts with the keys nct_id, title (truncated to 80
        characters), phase, status and url. The lookup is best-effort: a
        non-200 response or any error yields an empty list.
    """
    try:
        url = "https://clinicaltrials.gov/api/v2/studies"
        params = {
            'query.term':   drug_name + ' nonsense mutation',
            'filter.overallStatus': 'RECRUITING,ACTIVE_NOT_RECRUITING',
            'pageSize':     max_trials,
            'format':       'json',
            'fields':       'NCTId,BriefTitle,Phase,OverallStatus,LocationCountry'
        }
        resp = requests.get(url, params=params, timeout=8)
        if resp.status_code != 200:
            return []

        results = []
        for study in resp.json().get('studies', []):
            proto  = study.get('protocolSection', {})
            ident  = proto.get('identificationModule', {})
            status = proto.get('statusModule', {})

            # API v2 reports phases as a list under designModule; fall back
            # to the statusModule lookup (default '') when it is absent.
            phases = proto.get('designModule', {}).get('phases') or []
            phase = '/'.join(phases) if phases else status.get('phase', '')

            nct_id = ident.get('nctId', '')
            results.append({
                'nct_id': nct_id,
                'title':  ident.get('briefTitle', '')[:80],
                'phase':  phase,
                'status': status.get('overallStatus', ''),
                'url':    'https://clinicaltrials.gov/study/' + nct_id,
            })
        return results
    except Exception as e:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate; report the failure and stay best-effort.
        print("ClinicalTrials.gov drug lookup failed: " + str(e))
        return []

# ── Helper functions ──────────────────────────────────────────────
def compute_rp_score_rfc(window):
    """Return the rule-based score for *window* on a 0–100 scale.

    The window is upper-cased, T is rewritten to U, and the result is padded
    with 'N' / truncated to 30 bases. Each base's SLIP_SCORES value (0.5 for
    unlisted bases) is multiplied by the POSITION_WEIGHTS entry at the same
    index; the weighted sum is divided by the total weight, scaled to a
    percentage, clamped to [0, 100] and rounded to two decimals.
    """
    bases = (window.upper().replace('T', 'U') + 'N' * 30)[:30]

    # Accumulate left to right; zip() stops at the shorter of the two inputs.
    weighted_total = 0
    for base, weight in zip(bases, POSITION_WEIGHTS):
        weighted_total += SLIP_SCORES.get(base, 0.5) * weight

    percent = weighted_total / sum(POSITION_WEIGHTS) * 100
    capped = min(100.0, percent)
    floored = max(0.0, capped)
    return round(floored, 2)

def get_tier(score):
    """Map *score* to 'HIGH', 'MEDIUM' or 'LOW'.

    The score is compared with >= against HIGH_THRESHOLD first and then
    MED_THRESHOLD; anything below both is 'LOW'.
    """
    cutoffs = (('HIGH', HIGH_THRESHOLD), ('MEDIUM', MED_THRESHOLD))
    for label, cutoff in cutoffs:
        if score >= cutoff:
            return label
    return 'LOW'

def encode_window(window):
    """Encode *window* as the float32 feature vector fed to the RFC-ML model.

    The window is upper-cased, T is rewritten to U, and the result is padded
    with 'N' / truncated to 30 bases. The returned 1-D array has 44 values,
    in this order:
      * 30 per-base slip scores,
      * 4 one-hot flags for the base at index 18 (C, A, G, U),
      * 3 one-hot flags for the triplet at indices 15-17 (UGA, UAA, UAG),
      * weighted mean slip score, mean slip of indices 18-23, mean slip of
        indices 10-14, GC fraction, a constant 0.5, Shannon entropy of
        indices 18+ and Shannon entropy of indices 0-14.
    """
    import math
    from collections import Counter

    def shannon_entropy(seq):
        # Entropy in bits of the symbol distribution; 0.0 for empty input.
        if not seq:
            return 0.0
        counts = Counter(seq)
        n = len(seq)
        return -sum((k / n) * math.log2(k / n) for k in counts.values() if k > 0)

    bases = (window.upper().replace('T', 'U') + 'N' * 30)[:30]
    slip_values = [SLIP_SCORES.get(base, 0.50) for base in bases]

    weighted_sum = sum(v * wt for v, wt in zip(slip_values, POSITION_WEIGHTS))
    weighted_mean = weighted_sum / sum(POSITION_WEIGHTS)

    plus4_base = bases[18]
    plus4_flags = [int(plus4_base == candidate) for candidate in ('C', 'A', 'G', 'U')]

    stop_codon = bases[15:18]
    stop_flags = [int(stop_codon == codon) for codon in ('UGA', 'UAA', 'UAG')]

    # The slip list is index-aligned with the bases, so slicing it yields the
    # same values as looking the sliced bases up again.
    downstream_mean = sum(slip_values[18:24]) / 6
    upstream_mean = sum(slip_values[10:15]) / 5

    gc_fraction = (bases.count('G') + bases.count('C')) / 30.0

    summary = [
        weighted_mean,
        downstream_mean,
        upstream_mean,
        gc_fraction,
        0.5,
        shannon_entropy(bases[18:]),
        shannon_entropy(bases[:15]),
    ]
    return np.array(slip_values + plus4_flags + stop_flags + summary,
                    dtype=np.float32)

# ── Load brains ───────────────────────────────────────────────────
# Module-level model state. BRAIN_TYPE names the best scorer loaded so far and
# is reported by the info endpoints; the three handles stay None until the
# matching model has been loaded.
dnabert_tok   = None
dnabert_model = None
rfc_model     = None
BRAIN_TYPE    = "RFC-Rule"

# Optional RFC-ML scorer. Any failure (joblib not installed, file missing,
# unreadable file) leaves rfc_model as None and the rule-based default active.
# NOTE: joblib.load unpickles the file — only ship trusted model files.
try:
    import joblib
    rfc_model = joblib.load("models/rfc_head_weights.pkl")
except Exception as load_err:
    print(f"RFC-ML not found: {load_err}")
else:
    BRAIN_TYPE = "RFC-ML"
    print("RFC-ML brain loaded")

def load_dnabert():
    """Load the fine-tuned DNABERT-2 scorer and publish it via module globals.

    On success this sets ``dnabert_model``, ``dnabert_tok`` and switches
    ``BRAIN_TYPE`` to "DNABERT-2". The heavy dependencies are imported lazily
    so the module still imports when they are not installed. Nothing is
    raised: a missing weights file or any error is printed and the globals
    are left untouched.
    """
    global dnabert_model, dnabert_tok, BRAIN_TYPE
    try:
        import torch
        import torch.nn as nn
        from transformers import AutoTokenizer, BertModel, BertConfig
        from huggingface_hub import snapshot_download

        print("Loading DNABERT-2 brain...")

        # Check for the fine-tuned weights first: without them the model is
        # never published, so downloading and building the backbone would be
        # wasted work.
        weights_path = "models/n2n_dnabert2_v2.pt"
        if not os.path.exists(weights_path):
            print("v2 weights not found")
            return

        mp  = snapshot_download("zhihan1996/DNABERT-2-117M")
        tok = AutoTokenizer.from_pretrained(mp, trust_remote_code=True)
        cfg = BertConfig.from_pretrained(mp)
        db  = BertModel.from_pretrained(mp, config=cfg, ignore_mismatched_sizes=True)

        # The attribute names below (net / encoder / head) become the
        # state_dict keys, so they must keep matching the saved checkpoint.
        class RPScoreHead(nn.Module):
            """Regression head: hidden vector → sigmoid output scaled by 100."""
            def __init__(self, h=768):
                super().__init__()
                self.net = nn.Sequential(
                    nn.Linear(h,512),   nn.LayerNorm(512), nn.GELU(), nn.Dropout(0.15),
                    nn.Linear(512,256), nn.LayerNorm(256), nn.GELU(), nn.Dropout(0.10),
                    nn.Linear(256,128), nn.GELU(), nn.Dropout(0.05),
                    nn.Linear(128,32),  nn.GELU(),
                    nn.Linear(32,1),    nn.Sigmoid()
                )
            def forward(self, x): return self.net(x).squeeze(-1) * 100.0

        class N2NModel(nn.Module):
            """Encoder plus head; scores the hidden state of the first token."""
            def __init__(self, db):
                super().__init__()
                self.encoder = db
                self.head     = RPScoreHead()
            def forward(self, ids, mask):
                out = self.encoder(input_ids=ids, attention_mask=mask)
                return self.head(out.last_hidden_state[:,0,:])

        m = N2NModel(db)
        # NOTE: torch.load unpickles the checkpoint — only load trusted files.
        ck = torch.load(weights_path, map_location='cpu')
        m.load_state_dict(ck['model_state_dict'])
        m.eval()
        dnabert_model = m
        dnabert_tok   = tok
        BRAIN_TYPE    = "DNABERT-2"
        print("DNABERT-2 v2 loaded. Pearson r=0.941")
    except Exception as e:
        print("DNABERT-2 failed: " + str(e))

# Run load_dnabert on a daemon thread at import time so the module does not
# block on it; predict() only takes the DNABERT-2 path once dnabert_model and
# dnabert_tok are both set, and uses the other scorers until then.
threading.Thread(target=load_dnabert, daemon=True).start()

def predict(window):
    """Score *window* with the best available scorer.

    Scorers are tried in order — DNABERT-2, then RFC-ML, then the rule-based
    formula — and a failure in one falls through to the next, so a result is
    always produced.

    Args:
        window: Nucleotide window as a string.

    Returns:
        A ``(score, brain)`` tuple: the score rounded to three decimals and
        the name of the scorer that produced it ("DNABERT-2", "RFC-ML" or
        "RFC-Rule").
    """
    if dnabert_model is not None and dnabert_tok is not None:
        try:
            import torch
            enc = dnabert_tok(window, return_tensors='pt',
                              max_length=36, padding='max_length', truncation=True)
            with torch.no_grad():
                s = dnabert_model(enc['input_ids'], enc['attention_mask']).item()
            # NOTE(review): this path returns the head's output as-is (the head
            # multiplies its sigmoid by 100.0), whereas both fallbacks below
            # divide a 0-100 value by 100 — confirm which scale get_tier()'s
            # thresholds expect.
            return round(s, 3), "DNABERT-2"
        except Exception as e:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit propagate; report the failure instead of hiding it.
            print("DNABERT-2 inference failed, falling back: " + str(e))
    if rfc_model is not None:
        try:
            s = float(rfc_model.predict(encode_window(window).reshape(1,-1))[0])
            # Clamp to 0-100 before rescaling to 0-1.
            return round(max(0,min(100,s))/100, 3), "RFC-ML"
        except Exception as e:
            print("RFC-ML inference failed, falling back: " + str(e))
    return round(compute_rp_score_rfc(window)/100, 3), "RFC-Rule"

# ── Routes ────────────────────────────────────────────────────────
@app.route('/', methods=['GET'])
def home():
    """Return service metadata: name, version, active brain, thresholds and endpoint list."""
    calibration = {
        'high_threshold': HIGH_THRESHOLD,
        'med_threshold': MED_THRESHOLD,
    }
    endpoints = ['/api/health', '/api/score', '/api/demo', '/api/trials']
    payload = {
        'name':        'N2N Precision Engine',
        'version':     '3.0',
        'brain':       BRAIN_TYPE,
        'inventor':    'Manav Vanga',
        'patent':      'Pending 2026',
        'description': 'Predicts readthrough therapy response for all nonsense mutation diseases',
        'calibration': calibration,
        'endpoints':   endpoints,
    }
    return jsonify(payload)

@app.route('/api/health', methods=['GET'])
def health():
    """Return a fixed 'healthy' status plus the active brain and thresholds."""
    thresholds = {'high': HIGH_THRESHOLD, 'med': MED_THRESHOLD}
    body = {
        'status':     'healthy',
        'brain':      BRAIN_TYPE,
        'version':    '3.0',
        'calibrated': True,
        'thresholds': thresholds,
    }
    return jsonify(body)

@app.route('/api/score', methods=['GET','POST'])
def score():
    """Score a nucleotide window and return the tier, drug options and trials.

    Inputs (JSON body for POST, query string for GET):
        window: Sequence string, at least 20 characters (required).
        gene:   Gene symbol, defaults to 'UNKNOWN'.
        trials: Whether to fetch live trials, defaults to true.

    Returns:
        JSON with the score, tier, +4 base details, the DRUG_DATABASE entry
        for the tier, any clinical trials, and audit metadata. Responds with
        HTTP 400 and an 'error' field when the window is missing, not a
        string, or shorter than 20 characters.
    """
    if request.method == 'POST':
        # silent=True turns a missing or malformed JSON body into None rather
        # than an HTTP error, so the window check below gives one clear 400.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        window = data.get('window', '')
        gene   = data.get('gene', 'UNKNOWN')
        raw_trials = data.get('trials', True)
        if isinstance(raw_trials, str):
            # A JSON string such as "false" would otherwise count as truthy.
            fetch_trials = raw_trials.strip().lower() not in ('', 'false', '0', 'no')
        else:
            fetch_trials = bool(raw_trials)
    else:
        window = request.args.get('window', '')
        gene   = request.args.get('gene', 'UNKNOWN')
        fetch_trials = request.args.get('trials', 'true').lower() == 'true'

    if not isinstance(window, str) or len(window) < 20:
        return jsonify({'error': 'window required (min 20bp DNA sequence)'}), 400

    window = window.upper().replace('U', 'T')
    rp_score, brain_used = predict(window)
    tier = get_tier(rp_score)

    # Padding with 'N' guarantees index 18 exists for windows of 20-29 bases.
    p4 = (window + 'N' * 30)[18]
    road, road_desc = PLUS4_ROAD.get(p4, ('Unknown', 'Unknown'))
    drugs = DRUG_DATABASE[tier]

    # Take one timestamp and use it for both the hash and the response, so the
    # audit hash can be recomputed from the returned fields.
    timestamp = datetime.now(timezone.utc).isoformat()
    audit = hashlib.sha256(
        (window + str(rp_score) + timestamp).encode()).hexdigest()[:16]

    # Fetch live clinical trials; the 'UNKNOWN' placeholder is not searched.
    found_trials = []
    if fetch_trials:
        found_trials = fetch_clinical_trials(gene=gene if gene != 'UNKNOWN' else None)

    return jsonify({
        'gene':             gene,
        'window':           window[:30],
        'rp_score':         rp_score,
        'tier':             tier,
        'plus4_base':       p4,
        'plus4_road':       road,
        'plus4_road_desc':  road_desc,
        'therapy':          drugs['therapy'],
        'mechanism':        drugs['mechanism'],
        'approved_drugs':   drugs['approved'],
        'phase3_drugs':     drugs['phase3'],
        'phase2_drugs':     drugs['phase2'],
        'preclinical':      drugs['preclinical'],
        'combination':      drugs['combination'],
        'clinical_trials':  found_trials,
        'brain':            brain_used,
        'confidence':       'HIGH' if brain_used == 'DNABERT-2' else 'MEDIUM',
        'audit_hash':       audit,
        'timestamp':        timestamp,
        'inventor':         'Manav Vanga',
        'patent':           'Pending 2026',
    })

@app.route('/api/trials', methods=['GET'])
def trials():
    """Live clinical trials from ClinicalTrials.gov.

    Reads the optional query parameters gene, condition and drug. A non-empty
    drug selects the drug-specific lookup; otherwise the general lookup is
    called with gene and condition. The response echoes the query next to the
    matching trials and their count.
    """
    args = request.args
    gene, condition, drug = (args.get(key, '') for key in ('gene', 'condition', 'drug'))

    found = (
        fetch_drug_trials(drug)
        if drug
        else fetch_clinical_trials(gene=gene, condition=condition)
    )

    body = {
        'query':  {'gene': gene, 'condition': condition, 'drug': drug},
        'count':  len(found),
        'trials': found,
        'source': 'ClinicalTrials.gov API v2',
        'note':   'Live data — refreshed on every request',
    }
    return jsonify(body)

@app.route('/api/demo', methods=['GET'])
def demo():
    """Score a fixed set of example variants and return score, tier and scorer for each."""
    # NOTE(review): in the TP53 window 'TGA' sits at indices 14-16, while the
    # other windows (and encode_window's stop-codon slice) place the stop
    # codon at indices 15-17 — confirm the sequence is not one base short
    # upstream.
    cases = (
        {'gene': 'CFTR', 'variant': 'Y122X',
         'window': 'AAGAAATCGATCAGTTAACAGCTTGCAGCN', 'expected': '18.5% paper'},
        {'gene': 'CFTR', 'variant': 'G542X',
         'window': 'AAGAAATCGATCAGTTGAGAGCTTGCAGCN', 'expected': '0.3% paper'},
        {'gene': 'CFTR', 'variant': 'W1282X',
         'window': 'AAGAAATCGATCAGTTGACAGCTTGCAGCN', 'expected': '8.2% paper'},
        {'gene': 'DMD', 'variant': 'Q1922X',
         'window': 'GCAGCAGCAGCAGCATGACGCAGCAGCAGC', 'expected': 'predicted HIGH'},
        {'gene': 'TP53', 'variant': 'R213X',
         'window': 'CGCGGCGGCGGCGGTGACGCAGCAGCAGCN', 'expected': 'predicted HIGH'},
    )

    demo_results = []
    for case in cases:
        rp, used_brain = predict(case['window'])
        demo_results.append({
            'gene':     case['gene'],
            'variant':  case['variant'],
            'rp_score': rp,
            'tier':     get_tier(rp),
            'expected': case['expected'],
            'brain':    used_brain,
        })

    summary = {
        'demo_results': demo_results,
        'brain':        BRAIN_TYPE,
        'calibration':  {'high': HIGH_THRESHOLD, 'med': MED_THRESHOLD},
    }
    return jsonify(summary)

if __name__ == '__main__':
    # Direct execution: listen on all interfaces, taking the port from the
    # PORT environment variable and falling back to 7860.
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))