"""
Optimization indices based on GenScript patent WO2020024917A1.
Implements Harmony Index, Codon Context Index, and Outlier Index.
Includes mRNA secondary structure prediction using seqfold.
"""

import math
import re
from collections import Counter
from typing import Dict, List, Tuple, Optional
from codon_tables import CODON_TO_AA, AA_TO_CODONS, get_codon_table

# Try to import seqfold for mRNA structure prediction
try:
    from seqfold import dg, fold, dot_bracket
    SEQFOLD_AVAILABLE = True
except ImportError:
    SEQFOLD_AVAILABLE = False


def cosine_distance(v1: List[float], v2: List[float]) -> float:
    """Calculate cosine distance between two vectors."""
    dot_product = sum(a * b for a, b in zip(v1, v2))
    norm1 = math.sqrt(sum(a * a for a in v1))
    norm2 = math.sqrt(sum(b * b for b in v2))

    if norm1 == 0 or norm2 == 0:
        return 1.0

    similarity = dot_product / (norm1 * norm2)
    return 1.0 - similarity


def euclidean_distance(v1: List[float], v2: List[float]) -> float:
    """Calculate normalized Euclidean distance between two vectors."""
    if len(v1) != len(v2):
        raise ValueError("Vectors must have same length")

    squared_diff = sum((a - b) ** 2 for a, b in zip(v1, v2))
    max_dist = math.sqrt(len(v1))  # Max possible distance when all elements differ by 1

    return math.sqrt(squared_diff) / max_dist if max_dist > 0 else 0


def sequence_to_codons(dna_sequence: str) -> List[str]:
    """Convert DNA sequence to list of codons."""
    dna_sequence = dna_sequence.upper().replace(" ", "").replace("\n", "")
    codons = []
    for i in range(0, len(dna_sequence) - 2, 3):
        codons.append(dna_sequence[i:i+3])
    return codons


def codons_to_protein(codons: List[str]) -> str:
    """Convert codon list to protein sequence."""
    protein = ""
    for codon in codons:
        aa = CODON_TO_AA.get(codon, 'X')
        if aa != '*':
            protein += aa
    return protein


def protein_to_codons_random(protein: str, codon_table: dict) -> List[str]:
    """Convert protein to codons using weighted random selection based on usage table."""
    import random
    codons = []
    for aa in protein.upper():
        if aa not in AA_TO_CODONS:
            continue
        synonymous = AA_TO_CODONS[aa]
        # Weight by codon usage
        weights = [codon_table.get(c, 0.1) for c in synonymous]
        total = sum(weights)
        weights = [w / total for w in weights]
        codon = random.choices(synonymous, weights=weights, k=1)[0]
        codons.append(codon)
    return codons


class HarmonyIndex:
    """
    Harmony Index (H) measures how well the codon usage in the candidate sequence
    matches the codon usage distribution of highly-expressed genes in the host.

    H = 1 - D(F_hs, F_ts)

    Where:
    - F_hs = codon frequency vector from highly-expressed genes (reference)
    - F_ts = codon frequency vector from target sequence
    - D = distance function (cosine or euclidean)
    """

    def __init__(self, organism: str):
        self.codon_table = get_codon_table(organism)
        self.reference_freqs = self._build_reference_vector()

    def _build_reference_vector(self) -> List[float]:
        """Build reference frequency vector from codon table (59 codons, excluding stops and single-codon AAs)."""
        # All codons except stop codons
        codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*']
        return [self.codon_table.get(c, 0) for c in sorted(codons)]

    def _build_sequence_vector(self, codons: List[str]) -> List[float]:
        """Build frequency vector from a codon sequence."""
        # Count codons
        codon_counts = Counter(codons)
        total = len(codons)

        if total == 0:
            return [0.0] * 61  # 64 - 3 stop codons

        # Build vector matching reference order (all codons except stops)
        all_codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*']

        # Calculate frequencies per amino acid group
        freqs = []
        for codon in sorted(all_codons):
            aa = CODON_TO_AA[codon]
            synonymous = AA_TO_CODONS[aa]
            aa_total = sum(codon_counts.get(c, 0) for c in synonymous)
            if aa_total > 0:
                freq = codon_counts.get(codon, 0) / aa_total
            else:
                freq = 0.0
            freqs.append(freq)

        return freqs

    def calculate(self, codons: List[str]) -> float:
        """
        Calculate Harmony Index for a codon sequence.
        Returns value between 0 and 1, where 1 is perfect harmony.
        """
        target_freqs = self._build_sequence_vector(codons)
        distance = cosine_distance(self.reference_freqs, target_freqs)
        return 1.0 - distance


class CodonContextIndex:
    """
    Codon Context Index (CC) evaluates how well codon pairs in the candidate
    sequence match the codon pair preferences of highly-expressed genes.

    CC = 1 - D(F_hcc, F_tcc)

    Where:
    - F_hcc = codon-pair frequency vector from reference
    - F_tcc = codon-pair frequency vector from target sequence
    """

    def __init__(self, organism: str):
        self.organism = organism
        self.codon_table = get_codon_table(organism)
        # For simplicity, we use a basic model based on individual codon preferences
        # A full implementation would use experimental codon pair frequency data

    def _calculate_pair_score(self, codon1: str, codon2: str) -> float:
        """Calculate preference score for a codon pair based on individual codon usage."""
        freq1 = self.codon_table.get(codon1, 0.1)
        freq2 = self.codon_table.get(codon2, 0.1)
        # Geometric mean of frequencies as a simple pair score
        return math.sqrt(freq1 * freq2)

    def calculate(self, codons: List[str]) -> float:
        """
        Calculate Codon Context Index.
        Returns value between 0 and 1, where 1 is optimal context.
        """
        if len(codons) < 2:
            return 1.0

        total_score = 0.0
        max_score = 0.0

        for i in range(len(codons) - 1):
            pair_score = self._calculate_pair_score(codons[i], codons[i + 1])
            total_score += pair_score
            max_score += 1.0  # Maximum possible score per pair

        if max_score == 0:
            return 1.0

        return total_score / max_score


# =============================================================================
# mRNA Secondary Structure Analysis
# =============================================================================

class mRNAStructureAnalyzer:
    """
    Analyzes mRNA secondary structure using seqfold.

    Key metrics:
    - 5' end folding energy (critical for translation initiation)
    - Full sequence minimum free energy (MFE)
    - Hairpin detection in sliding windows
    """

    def __init__(self, temperature: float = 37.0):
        """
        Initialize analyzer.

        Args:
            temperature: Temperature in Celsius for folding calculations
        """
        self.temperature = temperature
        self.available = SEQFOLD_AVAILABLE

    def dna_to_rna(self, dna_seq: str) -> str:
        """Convert DNA sequence to RNA (T -> U)."""
        return dna_seq.upper().replace('T', 'U')

    def calculate_mfe(self, sequence: str, is_rna: bool = False, max_length: int = 200) -> float:
        """
        Calculate minimum free energy (MFE) for a sequence.

        Args:
            sequence: DNA or RNA sequence
            is_rna: If True, sequence is already RNA
            max_length: Maximum sequence length to analyze (longer sequences are truncated)

        Returns:
            MFE in kcal/mol (more negative = more stable structure)
        """
        if not self.available:
            return 0.0

        if not is_rna:
            sequence = self.dna_to_rna(sequence)

        if len(sequence) < 4:
            return 0.0

        # Truncate long sequences for performance (RNA folding is O(n³))
        if len(sequence) > max_length:
            sequence = sequence[:max_length]

        try:
            return dg(sequence, temp=self.temperature)
        except Exception:
            return 0.0

    def calculate_5prime_mfe(self, sequence: str, window_size: int = 50) -> float:
        """
        Calculate MFE for the 5' region (critical for translation initiation).

        Stable structures at the 5' end can reduce translation by up to 250-fold.

        Args:
            sequence: DNA sequence
            window_size: Number of nucleotides from 5' end to analyze

        Returns:
            MFE of 5' region in kcal/mol
        """
        if len(sequence) < window_size:
            return self.calculate_mfe(sequence)

        return self.calculate_mfe(sequence[:window_size])

    def find_hairpins(self, sequence: str, window_size: int = 60,
                      step: int = 30, threshold: float = -15.0) -> List[dict]:
        """
        Find potential hairpin structures using sliding window analysis.

        Args:
            sequence: DNA sequence
            window_size: Size of sliding window
            step: Step size for window
            threshold: MFE threshold for flagging (kcal/mol)

        Returns:
            List of hairpin regions with position and energy
        """
        if not self.available:
            return []

        hairpins = []
        rna_seq = self.dna_to_rna(sequence)

        for i in range(0, len(rna_seq) - window_size + 1, step):
            window = rna_seq[i:i + window_size]
            try:
                energy = dg(window, temp=self.temperature)
                if energy < threshold:
                    hairpins.append({
                        'start': i,
                        'end': i + window_size,
                        'energy': energy,
                        'sequence': sequence[i:i + window_size]
                    })
            except Exception:
                continue

        return hairpins

    def get_structure_string(self, sequence: str) -> str:
        """
        Get dot-bracket notation of predicted structure.

        Args:
            sequence: DNA sequence

        Returns:
            Dot-bracket string (e.g., "(((...)))")
        """
        if not self.available:
            return ""

        rna_seq = self.dna_to_rna(sequence)

        try:
            structs = fold(rna_seq)
            return dot_bracket(structs)
        except Exception:
            return ""

    def analyze(self, sequence: str, fast_mode: bool = True) -> dict:
        """
        Perform mRNA structure analysis.

        Args:
            sequence: DNA sequence
            fast_mode: If True, only analyze critical regions (faster)

        Returns:
            Dictionary with structure metrics
        """
        if not self.available:
            return {
                'available': False,
                'full_mfe': 0.0,
                'five_prime_mfe': 0.0,
                'hairpin_count': 0,
                'hairpins': [],
                'structure_penalty': 0.0,
            }

        # Always calculate 5' MFE (most important for translation)
        five_prime_mfe = self.calculate_5prime_mfe(sequence)

        # For full MFE, only analyze first 200 nt in fast mode
        if fast_mode:
            full_mfe = self.calculate_mfe(sequence[:min(200, len(sequence))])
            # Skip hairpin detection in fast mode for long sequences
            if len(sequence) > 300:
                hairpins = []
            else:
                hairpins = self.find_hairpins(sequence, window_size=40, step=40, threshold=-12.0)
        else:
            full_mfe = self.calculate_mfe(sequence, max_length=500)
            hairpins = self.find_hairpins(sequence)

        # Calculate structure penalty for optimization
        # Penalize stable 5' structures (< -30 kcal/mol is problematic)
        five_prime_penalty = 0.0
        if five_prime_mfe < -30:
            five_prime_penalty = (-30 - five_prime_mfe) / 10  # Scaled penalty

        # Penalize multiple hairpins
        hairpin_penalty = len(hairpins) * 0.1

        # Total structure penalty
        structure_penalty = five_prime_penalty + hairpin_penalty

        return {
            'available': True,
            'full_mfe': full_mfe,
            'five_prime_mfe': five_prime_mfe,
            'hairpin_count': len(hairpins),
            'hairpins': hairpins,
            'structure_penalty': structure_penalty,
        }


class OutlierIndex:
    """
    Outlier Index (OI) quantifies negative sequence features that should be minimized.

    OI = sum(w_i * f_i(x)) for various adverse features

    Features include:
    - GC content deviation from optimal range (organism-specific)
    - Repetitive sequences
    - Rare codon clusters
    - mRNA secondary structure (optional - expensive to compute)
    - Adverse motifs (restriction sites, splice sites, etc.)
    """

    # Weights for different features - GC penalties significantly increased
    WEIGHTS = {
        'gc_content': 8.0,       # Increased from 2.0 - global GC penalty
        'gc_local': 4.0,         # Increased from 1.5 - local window GC penalty
        'gc_extreme': 10.0,      # NEW - exponential penalty for extreme GC
        'rare_codons': 2.0,
        'repeats': 1.0,
        'homopolymers': 1.5,
        'motifs': 2.0,
        'mrna_structure': 2.5,
    }

    # Organism-specific optimal GC content ranges
    # Based on typical GC content of highly-expressed genes
    GC_TARGETS = {
        "Escherichia coli K12": (0.45, 0.55),      # E. coli prefers moderate GC
        "Homo sapiens": (0.45, 0.60),              # Human - moderate to slightly high
        "CHO (Chinese Hamster Ovary)": (0.45, 0.60),
        "Saccharomyces cerevisiae": (0.35, 0.45),  # Yeast prefers lower GC
        "Mus musculus": (0.45, 0.60),              # Mouse similar to human
        "Insect (Spodoptera frugiperda)": (0.40, 0.55),
        "Pichia pastoris": (0.38, 0.48),
        "Bacillus subtilis": (0.40, 0.50),
        "Drosophila melanogaster": (0.45, 0.58),
        "Caenorhabditis elegans": (0.35, 0.45),
    }
    DEFAULT_GC_TARGET = (0.40, 0.55)  # Default if organism not found

    # Adverse motifs to avoid
    ADVERSE_MOTIFS = [
        # Shine-Dalgarno sequences (prokaryotic RBS)
        'AGGAGG', 'GGAGG', 'AGGAG', 'GAGG',
        # Common restriction sites
        'GAATTC',  # EcoRI
        'GGATCC',  # BamHI
        'AAGCTT',  # HindIII
        'CTGCAG',  # PstI
        'GCGGCCGC',  # NotI
        'CATATG',  # NdeI
        # Splice site consensus
        'GGTAAG', 'GGTGAG',
        # Poly-A signals
        'AATAAA', 'ATTAAA',
        # AU-rich elements
        'ATTTA',
    ]

    def __init__(self, organism: str, excluded_sites: List[str] = None,
                 include_mrna_structure: bool = False):
        """
        Initialize OutlierIndex.

        Args:
            organism: Target host organism
            excluded_sites: Restriction sites to avoid
            include_mrna_structure: If True, include mRNA folding in calculations
                                   (expensive - set False for optimization iterations)
        """
        self.organism = organism
        self.codon_table = get_codon_table(organism)
        self.rare_threshold = 0.10  # Codons with frequency < 10% considered rare
        self.excluded_sites = excluded_sites or []
        self.include_mrna_structure = include_mrna_structure
        self.mrna_analyzer = mRNAStructureAnalyzer() if include_mrna_structure else None

        # Get organism-specific GC target range
        self.gc_target = self.GC_TARGETS.get(organism, self.DEFAULT_GC_TARGET)
        self.gc_min, self.gc_max = self.gc_target

    def _gc_content_penalty(self, sequence: str) -> float:
        """Calculate penalty for GC content outside organism-specific optimal range."""
        if len(sequence) == 0:
            return 0.0

        gc_count = sequence.count('G') + sequence.count('C')
        gc_percent = gc_count / len(sequence)

        # Use organism-specific optimal range
        if self.gc_min <= gc_percent <= self.gc_max:
            return 0.0
        elif gc_percent < self.gc_min:
            return (self.gc_min - gc_percent) * 3  # Increased multiplier
        else:
            return (gc_percent - self.gc_max) * 3  # Increased multiplier

    def _gc_extreme_penalty(self, sequence: str) -> float:
        """
        Calculate exponential penalty for extreme GC content.
        Heavily penalizes GC outside 30-70% range.
        """
        if len(sequence) == 0:
            return 0.0

        gc_count = sequence.count('G') + sequence.count('C')
        gc_percent = gc_count / len(sequence)

        # No penalty within acceptable range (organism target ± 15%)
        soft_min = max(0.30, self.gc_min - 0.10)
        soft_max = min(0.70, self.gc_max + 0.10)

        if soft_min <= gc_percent <= soft_max:
            return 0.0

        # Exponential penalty outside soft range
        if gc_percent < soft_min:
            deviation = soft_min - gc_percent
        else:
            deviation = gc_percent - soft_max

        # Exponential: penalty grows rapidly with deviation
        return (deviation * 10) ** 2

    def _gc_local_penalty(self, sequence: str, window_size: int = 60) -> float:
        """Calculate penalty for local GC content outside acceptable range."""
        if len(sequence) < window_size:
            return 0.0

        # Local range is organism target ± 10%
        local_min = max(0.25, self.gc_min - 0.10)
        local_max = min(0.75, self.gc_max + 0.10)

        penalties = []
        for i in range(0, len(sequence) - window_size + 1, 10):
            window = sequence[i:i + window_size]
            gc_count = window.count('G') + window.count('C')
            gc_percent = gc_count / len(window)

            if gc_percent < local_min:
                penalties.append((local_min - gc_percent) * 2)
            elif gc_percent > local_max:
                penalties.append((gc_percent - local_max) * 2)

        return sum(penalties) / max(len(penalties), 1) if penalties else 0.0

    def _rare_codon_penalty(self, codons: List[str]) -> float:
        """Calculate penalty for rare codon usage."""
        if len(codons) == 0:
            return 0.0

        rare_count = 0
        for codon in codons:
            freq = self.codon_table.get(codon, 0)
            if freq < self.rare_threshold:
                rare_count += 1

        return rare_count / len(codons)

    def _repeat_penalty(self, sequence: str) -> float:
        """Calculate penalty for direct repeats (>= 8bp)."""
        penalty = 0.0
        # Check for repeats
        for length in range(8, min(20, len(sequence) // 2)):
            for i in range(len(sequence) - 2 * length + 1):
                pattern = sequence[i:i + length]
                if pattern in sequence[i + length:]:
                    penalty += 0.1

        return min(penalty, 1.0)

    def _homopolymer_penalty(self, sequence: str) -> float:
        """Calculate penalty for homopolymer runs (>= 5 same nucleotides)."""
        penalty = 0.0
        for base in 'ATGC':
            runs = re.findall(f'{base}{{5,}}', sequence)
            for run in runs:
                penalty += (len(run) - 4) * 0.1

        return min(penalty, 1.0)

    def _motif_penalty(self, sequence: str) -> float:
        """Calculate penalty for adverse motifs."""
        penalty = 0.0
        all_motifs = self.ADVERSE_MOTIFS + self.excluded_sites

        for motif in all_motifs:
            count = sequence.count(motif)
            penalty += count * 0.2

        return min(penalty, 1.0)

    def _mrna_structure_penalty(self, sequence: str) -> float:
        """Calculate penalty for problematic mRNA secondary structures."""
        if not self.include_mrna_structure or self.mrna_analyzer is None:
            return 0.0

        if not self.mrna_analyzer.available:
            return 0.0

        analysis = self.mrna_analyzer.analyze(sequence)
        return analysis['structure_penalty']

    def calculate(self, codons: List[str]) -> float:
        """
        Calculate Outlier Index for a codon sequence.
        Returns value >= 0, where 0 is optimal (no adverse features).
        Lower is better.
        """
        sequence = ''.join(codons)

        penalties = {
            'gc_content': self._gc_content_penalty(sequence),
            'gc_extreme': self._gc_extreme_penalty(sequence),
            'gc_local': self._gc_local_penalty(sequence),
            'rare_codons': self._rare_codon_penalty(codons),
            'repeats': self._repeat_penalty(sequence),
            'homopolymers': self._homopolymer_penalty(sequence),
            'motifs': self._motif_penalty(sequence),
            'mrna_structure': self._mrna_structure_penalty(sequence),
        }

        total = sum(self.WEIGHTS[k] * v for k, v in penalties.items())
        return total

    def calculate_detailed(self, codons: List[str]) -> dict:
        """
        Calculate Outlier Index with detailed breakdown.
        Returns dictionary with all penalty components and mRNA structure analysis.
        """
        sequence = ''.join(codons)

        penalties = {
            'gc_content': self._gc_content_penalty(sequence),
            'gc_extreme': self._gc_extreme_penalty(sequence),
            'gc_local': self._gc_local_penalty(sequence),
            'rare_codons': self._rare_codon_penalty(codons),
            'repeats': self._repeat_penalty(sequence),
            'homopolymers': self._homopolymer_penalty(sequence),
            'motifs': self._motif_penalty(sequence),
            'mrna_structure': self._mrna_structure_penalty(sequence),
        }

        total = sum(self.WEIGHTS[k] * v for k, v in penalties.items())

        # Get detailed mRNA analysis
        mrna_analysis = self.mrna_analyzer.analyze(sequence)

        return {
            'total': total,
            'penalties': penalties,
            'weights': self.WEIGHTS,
            'mrna_structure': mrna_analysis,
        }


def calculate_cai(codons: List[str], codon_table: dict) -> float:
    """
    Calculate Codon Adaptation Index (CAI).
    CAI is the geometric mean of relative adaptiveness values.
    """
    if len(codons) == 0:
        return 0.0

    # Calculate relative adaptiveness for each codon
    w_values = []
    for codon in codons:
        aa = CODON_TO_AA.get(codon)
        if aa is None or aa == '*':
            continue

        # Get max frequency among synonymous codons
        synonymous = AA_TO_CODONS[aa]
        max_freq = max(codon_table.get(c, 0.01) for c in synonymous)

        # Relative adaptiveness
        freq = codon_table.get(codon, 0.01)
        w = freq / max_freq if max_freq > 0 else 0.01
        w_values.append(max(w, 0.01))  # Avoid log(0)

    if len(w_values) == 0:
        return 0.0

    # Geometric mean
    log_sum = sum(math.log(w) for w in w_values)
    cai = math.exp(log_sum / len(w_values))

    return cai


def calculate_gc_content(sequence: str) -> float:
    """Calculate GC content as percentage."""
    if len(sequence) == 0:
        return 0.0
    gc_count = sequence.count('G') + sequence.count('C')
    return (gc_count / len(sequence)) * 100