"""
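VCF file parsing utilities.

Parses VCF records into MutationRecord objects, maps genes to pathways, and
organizes the result hierarchically (sample -> pathway -> chromosome -> gene).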
"""

import vcf
import json
import logging
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Optional, Union, Any
from dataclasses import dataclass, asdict
import pandas as pd

from config import DataConfig, ConfigManager


# Configure logging.
# NOTE(review): basicConfig() runs at import time, so merely importing this
# module configures the root logger at INFO level (it does nothing if the
# root logger already has handlers) — confirm this is wanted for library use.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the code below.
logger = logging.getLogger(__name__)


@dataclass
class MutationRecord:
    """A single mutation record carrying location, annotation and sample data.

    The last three fields are optional metrics and default to ``None`` when
    they are not supplied.
    """

    # Variant location and alleles.
    chromosome: str
    position: int
    reference: str
    alternate: str

    # Annotation attached to the variant.
    impact: str
    gene_id: str
    pathway: str

    # Sample the record belongs to.
    sample_id: str

    # Optional metrics.
    quality: Optional[float] = None
    depth: Optional[int] = None
    allele_frequency: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the record's fields as a ``{field name: value}`` dictionary."""
        field_values = asdict(self)
        return field_values


class PathwayMapper:
    """Manages pathway mapping data and provides gene-to-pathway lookups.

    Each gene maps to exactly one pathway.  Mappings loaded from several
    files are merged: a later file overrides the pathway of a gene that is
    already known, and the forward and reverse indexes are kept in sync.

    Attributes:
        gene_to_pathway: Gene identifier -> pathway name.
        pathway_to_genes: Pathway name -> genes currently mapped to it, in
            the order they were registered.
    """

    def __init__(self, pathway_file: Optional[Union[str, Path]] = None):
        """Create an empty mapper and optionally load ``pathway_file``.

        Args:
            pathway_file: Optional path to a JSON or TSV mapping file; see
                ``load_pathway_mappings`` for the accepted formats.
        """
        self.gene_to_pathway: Dict[str, str] = {}
        self.pathway_to_genes: Dict[str, List[str]] = defaultdict(list)

        if pathway_file:
            self.load_pathway_mappings(pathway_file)

    def load_pathway_mappings(self, pathway_file: Union[str, Path]) -> None:
        """
        Load pathway mappings from file and merge them into this mapper.

        Expected formats:
        - JSON: {"GENE1": "pathway1", "GENE2": "pathway2", ...}
        - TSV: gene\tpathway

        A missing file is logged as a warning and an unsupported extension
        is logged as an error; in both cases the mapper is left unchanged
        and no exception is raised.

        Args:
            pathway_file: Path to a ``.json``, ``.tsv`` or ``.txt`` file.

        Raises:
            Exception: Any error raised while reading or validating the
                file is logged and re-raised.
        """
        pathway_file = Path(pathway_file)

        if not pathway_file.exists():
            logger.warning(f"Pathway file not found: {pathway_file}")
            return

        suffix = pathway_file.suffix.lower()
        try:
            if suffix == '.json':
                self._load_json_pathways(pathway_file)
            elif suffix in ('.tsv', '.txt'):
                self._load_tsv_pathways(pathway_file)
            else:
                logger.error(f"Unsupported pathway file format: {pathway_file.suffix}")

        except Exception as e:
            logger.error(f"Error loading pathway mappings: {e}")
            raise

    def _register_mapping(self, gene: str, pathway: str) -> None:
        """Record ``gene -> pathway`` and keep the reverse index consistent."""
        if gene in self.gene_to_pathway:
            previous = self.gene_to_pathway[gene]
            if previous != pathway:
                # The gene moved: drop it from its former pathway so the
                # reverse index never lists a gene under two pathways.
                former_genes = self.pathway_to_genes.get(previous, [])
                if gene in former_genes:
                    former_genes.remove(gene)
                if not former_genes:
                    self.pathway_to_genes.pop(previous, None)

        self.gene_to_pathway[gene] = pathway

        genes = self.pathway_to_genes[pathway]
        if gene not in genes:  # avoid duplicates when a mapping is reloaded
            genes.append(gene)

    def _load_json_pathways(self, file_path: Path) -> None:
        """Load pathway mappings from a JSON object of ``gene: pathway`` pairs.

        Raises:
            ValueError: If the top-level JSON value is not an object.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)

        if not isinstance(loaded, dict):
            raise ValueError(
                "JSON pathway file must contain an object mapping genes to pathways"
            )

        # Merge instead of rebinding gene_to_pathway, so mappings loaded
        # earlier are not silently discarded.
        for gene, pathway in loaded.items():
            self._register_mapping(gene, pathway)

    def _load_tsv_pathways(self, file_path: Path) -> None:
        """Load pathway mappings from a TSV file with ``gene``/``pathway`` columns.

        Rows whose gene or pathway cell is empty are skipped.

        Raises:
            ValueError: If either required column is missing.
        """
        # Read every cell as a literal string: no NaN conversion (which would
        # yield the text 'nan') and no float-ification of numeric gene IDs.
        df = pd.read_csv(file_path, sep='\t', dtype=str, keep_default_na=False)

        required_columns = ['gene', 'pathway']
        if not all(col in df.columns for col in required_columns):
            raise ValueError(f"TSV file must contain columns: {required_columns}")

        for raw_gene, raw_pathway in zip(df['gene'], df['pathway']):
            gene = str(raw_gene).strip()
            pathway = str(raw_pathway).strip()
            if not gene or not pathway:
                continue
            self._register_mapping(gene, pathway)

    def get_pathway(self, gene_id: str) -> str:
        """Get pathway for a gene, returns 'Unknown_Pathway' if not found."""
        return self.gene_to_pathway.get(gene_id, "Unknown_Pathway")

    def get_genes_in_pathway(self, pathway: str) -> List[str]:
        """Get all genes in a specific pathway (empty list if unknown).

        A copy is returned so callers cannot mutate the internal index.
        """
        # .get() (not indexing) so an unknown pathway is not inserted into
        # the defaultdict as a side effect of the lookup.
        return list(self.pathway_to_genes.get(pathway, []))


class VCFParser:
    """
    Comprehensive VCF parser with hierarchical data organization.

    Parses VCF files and organizes mutations in a hierarchical structure:
    Sample -> Pathway -> Chromosome -> Gene -> Mutations

    The counters in ``parsing_stats`` accumulate over every file parsed by
    the same instance.
    """

    def __init__(self,
                 config: Optional["DataConfig"] = None,
                 pathway_mapper: Optional["PathwayMapper"] = None):
        """Create a parser.

        Args:
            config: Data configuration; the parser reads its
                ``supported_chromosomes`` and ``supported_impacts``.
                A default ``DataConfig()`` is created when omitted.
            pathway_mapper: Gene-to-pathway lookup; an empty
                ``PathwayMapper()`` is created when omitted.
        """
        self.config = config or DataConfig()
        self.pathway_mapper = pathway_mapper or PathwayMapper()

        # Statistics tracking
        self.parsing_stats = {
            'total_records': 0,
            'processed_records': 0,
            'skipped_records': 0,
            'samples_processed': 0,
            'unique_genes': set(),
            'unique_pathways': set()
        }
        # Distinct sample IDs seen so far, so 'samples_processed' accumulates
        # across files like every other counter instead of being overwritten.
        self._seen_samples = set()

    def parse_vcf_file(self, vcf_file: Union[str, Path]) -> Dict[str, Any]:
        """
        Parse VCF file and return hierarchical mutation data.

        Args:
            vcf_file: Path to the VCF file.

        Returns:
            Plain nested dicts (no defaultdicts) with structure: {
                'sample_id': {
                    'pathway_id': {
                        'chromosome': {
                            'gene_id': [MutationRecord, ...]
                        }
                    }
                }
            }

        Raises:
            FileNotFoundError: If ``vcf_file`` does not exist.
            Exception: Any error raised while reading the file is logged
                and re-raised.
        """
        vcf_file = Path(vcf_file)

        if not vcf_file.exists():
            raise FileNotFoundError(f"VCF file not found: {vcf_file}")

        logger.info(f"Parsing VCF file: {vcf_file}")

        # Plain dicts rather than defaultdicts with lambda factories: those
        # cannot be pickled (breaking export_parsed_data(format='pickle'))
        # and silently create keys whenever a caller looks one up.
        hierarchical_data: Dict[str, Any] = {}
        stats = self.parsing_stats

        try:
            vcf_reader = vcf.Reader(filename=str(vcf_file))

            for record in vcf_reader:
                stats['total_records'] += 1

                # One MutationRecord is attempted per sample column.
                for sample in record.samples:
                    mutation_record = self._process_vcf_record(record, sample)

                    if mutation_record is None:
                        stats['skipped_records'] += 1
                        continue

                    gene_id = mutation_record.gene_id
                    pathway = mutation_record.pathway

                    # Add to hierarchical structure
                    (hierarchical_data
                        .setdefault(mutation_record.sample_id, {})
                        .setdefault(pathway, {})
                        .setdefault(mutation_record.chromosome, {})
                        .setdefault(gene_id, [])
                        .append(mutation_record))

                    # Update statistics
                    stats['processed_records'] += 1
                    stats['unique_genes'].add(gene_id)
                    stats['unique_pathways'].add(pathway)

            self._seen_samples.update(hierarchical_data)
            stats['samples_processed'] = len(self._seen_samples)

        except Exception as e:
            logger.error(f"Error parsing VCF file: {e}")
            raise

        logger.info(f"Parsing completed. Processed {self.parsing_stats['processed_records']} "
                   f"mutations from {self.parsing_stats['samples_processed']} samples")

        return hierarchical_data

    def _process_vcf_record(self, record, sample) -> Optional["MutationRecord"]:
        """Build a MutationRecord for one (record, sample) pair.

        Returns ``None`` when the chromosome is not in
        ``config.supported_chromosomes`` or when any error occurs (the error
        is logged as a warning; this is deliberately best-effort).

        NOTE(review): a record is produced for every sample column without
        looking at the genotype, so samples that do not carry the variant
        are recorded too — confirm whether that is intended.
        """
        try:
            # Extract basic variant information
            chrom = str(record.CHROM)
            pos = record.POS
            ref = record.REF
            # Only the first ALT allele is kept.  The entry itself may be
            # None (PyVCF is expected to yield [None] for a '.' ALT column),
            # which must not be stringified to 'None'.
            first_alt = record.ALT[0] if record.ALT else None
            alt = str(first_alt) if first_alt is not None else '.'

            # Validate chromosome
            if chrom not in self.config.supported_chromosomes:
                return None

            # Extract impact information
            impact = self._extract_impact(record)
            if impact not in self.config.supported_impacts:
                impact = "MODERATE"  # Default impact

            # Extract gene information directly from VCF record
            gene_id = self._extract_gene_id(record) or "Unknown_Gene"

            # Get pathway information
            pathway = self.pathway_mapper.get_pathway(gene_id)

            # Extract additional sample information
            quality = getattr(record, 'QUAL', None)
            depth = self._extract_depth(sample)
            allele_freq = self._extract_allele_frequency(sample)

            return MutationRecord(
                chromosome=chrom,
                position=pos,
                reference=ref,
                alternate=alt,
                impact=impact,
                gene_id=gene_id,
                pathway=pathway,
                sample_id=sample.sample,
                quality=quality,
                depth=depth,
                allele_frequency=allele_freq
            )

        except Exception as e:
            logger.warning(f"Error processing record at {record.CHROM}:{record.POS}: {e}")
            return None

    @staticmethod
    def _gene_from_annotation(field: str, text: str) -> Optional[str]:
        """Pick the gene symbol out of one INFO value, by field layout.

        Layouts handled:
        - GENEINFO: ``SYMBOL:ID|SYMBOL:ID`` -> first symbol.
        - ANN (SnpEff): ``Allele|Annotation|Impact|Gene_Name|...`` -> 4th.
        - CSQ (VEP): assumes ``Allele|Consequence|IMPACT|SYMBOL|...``.
          NOTE(review): the real CSQ column order is declared in the VCF
          header; confirm the files in use follow this order.
        - EFF (legacy SnpEff): ``Effect(Impact|Class|Codon|AA|Len|Gene|...)``
          -> 6th item inside the parentheses.
        - anything else (GENE, SYMBOL): the value itself, or its first
          usable piece if it is pipe-separated.
        """
        if field == 'GENEINFO':
            candidates = [text.split('|', 1)[0].split(':', 1)[0]]
        elif field in ('ANN', 'CSQ', 'EFF'):
            if field == 'EFF' and '(' in text:
                text = text.split('(', 1)[1].rstrip(')')
            gene_index = 5 if field == 'EFF' else 3
            # Slicing yields [] when the annotation is too short.
            candidates = text.split('|')[gene_index:gene_index + 1]
        else:
            candidates = text.split('|')

        for candidate in candidates:
            candidate = candidate.strip()
            if candidate and candidate != '.':
                return candidate
        return None

    def _extract_gene_id(self, record) -> Optional[str]:
        """Extract gene ID directly from VCF record INFO fields.

        Fields are tried in order (GENE, SYMBOL, ANN, EFF, CSQ, GENEINFO);
        for list-valued fields only the first entry is used.  Returns
        ``None`` when no field yields a usable gene.
        """
        gene_fields = ('GENE', 'SYMBOL', 'ANN', 'EFF', 'CSQ', 'GENEINFO')

        for field in gene_fields:
            if field not in record.INFO:
                continue

            gene_value = record.INFO[field]
            if isinstance(gene_value, (list, tuple)):
                gene_value = gene_value[0] if gene_value else None
            if gene_value is None:
                continue

            gene = self._gene_from_annotation(field, str(gene_value))
            if gene:
                return gene

        return None

    def _extract_impact(self, record) -> str:
        """Extract variant impact from VCF record.

        Looks for any of ``config.supported_impacts`` inside the upper-cased
        IMPACT, ANN, EFF or CSQ value (first list entry only), in that field
        order, and returns the first match; "MODERATE" when nothing matches.
        """
        impact_fields = ('IMPACT', 'ANN', 'EFF', 'CSQ')

        for field in impact_fields:
            if field not in record.INFO:
                continue

            impact_value = record.INFO[field]
            if isinstance(impact_value, list):
                if not impact_value:
                    continue
                impact_value = impact_value[0]

            # Upper-case once instead of once per supported impact.
            annotation = str(impact_value).upper()
            for supported_impact in self.config.supported_impacts:
                if supported_impact in annotation:
                    return supported_impact

        return "MODERATE"  # Default impact

    def _extract_depth(self, sample) -> Optional[int]:
        """Extract read depth (``DP``) from sample, or ``None`` if absent."""
        depth = getattr(sample.data, 'DP', None)
        return int(depth) if depth is not None else None

    def _extract_allele_frequency(self, sample) -> Optional[float]:
        """Extract allele frequency (``AF``) from sample, or ``None``.

        For a list-valued AF only the first entry is used; an empty list or
        a missing (``None``) entry yields ``None`` instead of raising.
        """
        af = getattr(sample.data, 'AF', None)
        if isinstance(af, (list, tuple)):
            af = af[0] if af else None
        return float(af) if af is not None else None

    def get_parsing_statistics(self) -> Dict[str, Any]:
        """Get detailed parsing statistics.

        Returns a copy of ``parsing_stats`` in which the gene/pathway sets
        are replaced by their sizes, plus ``success_rate``: the percentage
        of attempted per-sample calls that produced a mutation record.
        """
        stats = self.parsing_stats.copy()
        stats['unique_genes'] = len(stats['unique_genes'])
        stats['unique_pathways'] = len(stats['unique_pathways'])
        # processed/skipped are counted per (record, sample) pair, so the
        # denominator must be too; dividing by 'total_records' exceeds 100%
        # for multi-sample files.
        attempted = stats['processed_records'] + stats['skipped_records']
        stats['success_rate'] = stats['processed_records'] / max(attempted, 1) * 100
        return stats

    def export_parsed_data(self,
                          hierarchical_data: Dict,
                          output_file: Union[str, Path],
                          format: str = 'json') -> None:
        """
        Export parsed hierarchical data to file.

        Missing parent directories of ``output_file`` are created.

        Args:
            hierarchical_data: Parsed VCF data
            output_file: Output file path
            format: Export format ('json', 'pickle'), case-insensitive

        Raises:
            ValueError: If ``format`` is not supported.
        """
        output_file = Path(output_file)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        export_format = format.lower()

        if export_format == 'json':
            # Convert MutationRecord objects to dictionaries for JSON serialization
            json_data = self._convert_to_json_serializable(hierarchical_data)
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(json_data, f, indent=2)

        elif export_format == 'pickle':
            import pickle
            with open(output_file, 'wb') as f:
                pickle.dump(hierarchical_data, f)

        else:
            raise ValueError(f"Unsupported export format: {format}")

        logger.info(f"Exported parsed data to: {output_file}")

    def _convert_to_json_serializable(self, data: Dict) -> Dict:
        """Convert hierarchical data with MutationRecord objects to JSON-serializable format.

        The nesting is preserved; every mutation is replaced by the result
        of its ``to_dict()``.
        """
        return {
            sample_id: {
                pathway_id: {
                    chrom_id: {
                        gene_id: [mutation.to_dict() for mutation in mutations]
                        for gene_id, mutations in genes.items()
                    }
                    for chrom_id, genes in chromosomes.items()
                }
                for pathway_id, chromosomes in pathways.items()
            }
            for sample_id, pathways in data.items()
        }


def create_parser_from_config(config_manager: ConfigManager) -> VCFParser:
    """Build a VCFParser from the manager's ``data_config``.

    A PathwayMapper is loaded from ``data_config.pathway_mapping_path`` when
    that path is set; otherwise ``None`` is passed to the parser.
    """
    data_config = config_manager.data_config

    # Only build a mapper when a mapping path has been configured.
    mapper = (
        PathwayMapper(data_config.pathway_mapping_path)
        if data_config.pathway_mapping_path
        else None
    )

    return VCFParser(config=data_config, pathway_mapper=mapper)


# Example usage
if __name__ == "__main__":
    # Demo run: configure paths, build a parser, parse, report and export.
    manager = ConfigManager()

    # Point the data configuration at the example input files.
    manager.data_config.vcf_file_path = "example.vcf"
    manager.data_config.pathway_mapping_path = "pathway_mappings.json"

    vcf_parser = create_parser_from_config(manager)

    try:
        parsed = vcf_parser.parse_vcf_file(manager.data_config.vcf_file_path)

        # Report what was parsed.
        summary = vcf_parser.get_parsing_statistics()
        print(f"Parsing Statistics: {summary}")

        # Write the hierarchy out as JSON.
        vcf_parser.export_parsed_data(parsed, "parsed_vcf_data.json", format='json')

    except Exception as e:
        # Any failure in the demo is logged rather than raised.
        logger.error(f"Error in VCF parsing: {e}")