abd-ur
/

GvEM

@@ -1,5 +1,5 @@
 """
-VCF data parsing with support for gene annotations, pathway mappings, and hierarchical data organization.
 """
 import vcf
@@ -21,6 +21,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class MutationRecord:
     chromosome: str
     position: int
@@ -35,96 +36,12 @@ class MutationRecord:
     allele_frequency: Optional[float] = None
     def to_dict(self) -> Dict[str, Any]:
         return asdict(self)
-class GeneAnnotationManager:
-    """Optional"""
-    def __init__(self, annotation_file: Optional[Union[str, Path]] = None):
-        self.annotations: Dict[Tuple[str, int], Dict[str, str]] = {}
-        self.gene_to_coords: Dict[str, List[Tuple[str, int]]] = defaultdict(list)
-        if annotation_file:
-            self.load_annotations(annotation_file)
-    def load_annotations(self, annotation_file: Union[str, Path]) -> None:
-        """
-        Expected formats:
-        - JSON: {(chrom, pos): {"gene": "GENE1", "feature": "exon", ...}}
-        - TSV: chrom\tpos\tgene\tfeature\t...
-        """
-        annotation_file = Path(annotation_file)
-        if not annotation_file.exists():
-            logger.warning(f"Annotation file not found: {annotation_file}")
-            return
-        try:
-            if annotation_file.suffix.lower() == '.json':
-                self._load_json_annotations(annotation_file)
-            elif annotation_file.suffix.lower() in ['.tsv', '.txt']:
-                self._load_tsv_annotations(annotation_file)
-            else:
-                logger.error(f"Unsupported annotation file format: {annotation_file.suffix}")
-        except Exception as e:
-            logger.error(f"Error loading annotations: {e}")
-            raise
-    def _load_json_annotations(self, file_path: Path) -> None:
-        """Load annotations from JSON file."""
-        with open(file_path, 'r') as f:
-            data = json.load(f)
-        for key, annotation in data.items():
-            if isinstance(key, str):
-                # Parse string key like "chr1:12345"
-                chrom, pos = key.split(':')
-                pos = int(pos)
-            else:
-                chrom, pos = key
-            self.annotations[(chrom, pos)] = annotation
-            gene_id = annotation.get('gene')
-            if gene_id:
-                self.gene_to_coords[gene_id].append((chrom, pos))
-    def _load_tsv_annotations(self, file_path: Path) -> None:
-        """Load annotations from TSV file."""
-        df = pd.read_csv(file_path, sep='\t')
-        required_columns = ['chrom', 'pos', 'gene']
-        if not all(col in df.columns for col in required_columns):
-            raise ValueError(f"TSV file must contain columns: {required_columns}")
-        for _, row in df.iterrows():
-            chrom = str(row['chrom'])
-            pos = int(row['pos'])
-            annotation = row.to_dict()
-            self.annotations[(chrom, pos)] = annotation
-            gene_id = annotation.get('gene')
-            if gene_id:
-                self.gene_to_coords[gene_id].append((chrom, pos))
-    def get_gene_annotation(self, chrom: str, pos: int) -> Optional[Dict[str, str]]:
-        """Get gene annotation for a specific genomic position."""
-        return self.annotations.get((chrom, pos))
-    def get_genes_in_region(self, chrom: str, start: int, end: int) -> List[str]:
-        """Get all genes in a genomic region."""
-        genes = []
-        for (annot_chrom, annot_pos), annotation in self.annotations.items():
-            if annot_chrom == chrom and start <= annot_pos <= end:
-                gene_id = annotation.get('gene')
-                if gene_id and gene_id not in genes:
-                    genes.append(gene_id)
-        return genes
 class PathwayMapper:
-    """Optional"""
     def __init__(self, pathway_file: Optional[Union[str, Path]] = None):
         self.gene_to_pathway: Dict[str, str] = {}
@@ -135,6 +52,8 @@ class PathwayMapper:
     def load_pathway_mappings(self, pathway_file: Union[str, Path]) -> None:
         """
         Expected formats:
         - JSON: {"GENE1": "pathway1", "GENE2": "pathway2", ...}
         - TSV: gene\tpathway
@@ -192,17 +111,17 @@ class PathwayMapper:
 class VCFParser:
     """
     Parses VCF files and organizes mutations in a hierarchical structure:
     Sample -> Pathway -> Chromosome -> Gene -> Mutations
     """
     def __init__(self,
                  config: Optional[DataConfig] = None,
-                 gene_annotator: Optional[GeneAnnotationManager] = None,
                  pathway_mapper: Optional[PathwayMapper] = None):
         self.config = config or DataConfig()
-        self.gene_annotator = gene_annotator or GeneAnnotationManager()
         self.pathway_mapper = pathway_mapper or PathwayMapper()
         # Statistics tracking
@@ -217,7 +136,9 @@ class VCFParser:
     def parse_vcf_file(self, vcf_file: Union[str, Path]) -> Dict[str, Any]:
         """
-          Returns:
             Dict with structure: {
                 'sample_id': {
                     'pathway_id': {
@@ -303,12 +224,10 @@ class VCFParser:
             if impact not in self.config.supported_impacts:
                 impact = "MODERATE"  # Default impact
-            # Get gene annotation
-            gene_annotation = self.gene_annotator.get_gene_annotation(chrom, pos)
-            if not gene_annotation:
-                return None  # Skip variants without gene annotation
-            gene_id = gene_annotation.get('gene', 'Unknown_Gene')
             # Get pathway information
             pathway = self.pathway_mapper.get_pathway(gene_id)
@@ -336,6 +255,33 @@ class VCFParser:
             logger.warning(f"Error processing record at {record.CHROM}:{record.POS}: {e}")
             return None
     def _extract_impact(self, record) -> str:
         """Extract variant impact from VCF record."""
         # Try different fields where impact might be stored
@@ -435,35 +381,38 @@ def create_parser_from_config(config_manager: ConfigManager) -> VCFParser:
     """Create VCF parser from configuration manager."""
     config = config_manager.data_config
-    gene_annotator = None
-    if config.gene_annotation_path:
-        gene_annotator = GeneAnnotationManager(config.gene_annotation_path)
     pathway_mapper = None
     if config.pathway_mapping_path:
         pathway_mapper = PathwayMapper(config.pathway_mapping_path)
     return VCFParser(
         config=config,
-        gene_annotator=gene_annotator,
         pathway_mapper=pathway_mapper
     )
 if __name__ == "__main__":
     config_manager = ConfigManager()
     config_manager.data_config.vcf_file_path = "example.vcf"
-    config_manager.data_config.gene_annotation_path = "gene_annotations.json"
     config_manager.data_config.pathway_mapping_path = "pathway_mappings.json"
     parser = create_parser_from_config(config_manager)
     try:
         hierarchical_data = parser.parse_vcf_file(config_manager.data_config.vcf_file_path)
         stats = parser.get_parsing_statistics()
         print(f"Parsing Statistics: {stats}")
         parser.export_parsed_data(
             hierarchical_data,
             "parsed_vcf_data.json",

 """
+This module provides comprehensive VCF file parsing
 """
 import vcf
 @dataclass
 class MutationRecord:
+    """Represents a single mutation record with all relevant information."""
     chromosome: str
     position: int
     allele_frequency: Optional[float] = None
     def to_dict(self) -> Dict[str, Any]:
+        """Return this record's fields as a dictionary built by ``asdict``."""
         return asdict(self)
 class PathwayMapper:
+    """Manages pathway mapping data and provides gene-to-pathway lookups."""
     def __init__(self, pathway_file: Optional[Union[str, Path]] = None):
         self.gene_to_pathway: Dict[str, str] = {}
     def load_pathway_mappings(self, pathway_file: Union[str, Path]) -> None:
         """
+        Load pathway mappings from file.
         Expected formats:
         - JSON: {"GENE1": "pathway1", "GENE2": "pathway2", ...}
         - TSV: gene\tpathway
 class VCFParser:
     """
+    Comprehensive VCF parser with hierarchical data organization.
     Parses VCF files and organizes mutations in a hierarchical structure:
     Sample -> Pathway -> Chromosome -> Gene -> Mutations
     """
     def __init__(self,
                  config: Optional[DataConfig] = None,
                  pathway_mapper: Optional[PathwayMapper] = None):
         self.config = config or DataConfig()
         self.pathway_mapper = pathway_mapper or PathwayMapper()
         # Statistics tracking
     def parse_vcf_file(self, vcf_file: Union[str, Path]) -> Dict[str, Any]:
         """
+        Parse VCF file and return hierarchical mutation data.
+        Returns:
             Dict with structure: {
                 'sample_id': {
                     'pathway_id': {
             if impact not in self.config.supported_impacts:
                 impact = "MODERATE"  # Default impact
+            # Extract gene information directly from VCF record
+            gene_id = self._extract_gene_id(record)
+            if not gene_id:
+                gene_id = "Unknown_Gene"
             # Get pathway information
             pathway = self.pathway_mapper.get_pathway(gene_id)
             logger.warning(f"Error processing record at {record.CHROM}:{record.POS}: {e}")
             return None
+    def _extract_gene_id(self, record) -> Optional[str]:
+        """Extract a gene symbol from the INFO fields of a VCF record.
+
+        INFO keys are tried in this order: GENE, SYMBOL, ANN, EFF, CSQ,
+        GENEINFO. The first key that yields a usable value wins.
+
+        Field handling:
+        - GENE / SYMBOL: the value itself; if several values are joined
+          with '|', the first non-missing one.
+        - ANN: 4th '|'-separated sub-field (Gene_Name in the SnpEff ANN layout).
+        - CSQ: 4th '|'-separated sub-field (SYMBOL in VEP's default layout).
+        - EFF: 6th '|'-separated sub-field inside the parentheses
+          (Gene_Name in the legacy SnpEff EFF layout).
+        - GENEINFO: the symbol before ':' in the first 'SYMBOL:id' entry.
+
+        Args:
+            record: Object exposing a dict-like ``INFO`` attribute.
+
+        Returns:
+            The gene symbol, or None when no field holds a usable value
+            (missing, empty, '.', or None).
+        """
+        # 0-based position of the gene symbol inside one annotation entry.
+        # NOTE(review): the CSQ order is declared in the VCF header; index 3
+        # assumes VEP's default layout - confirm for customised VEP runs.
+        gene_index = {'ANN': 3, 'CSQ': 3, 'EFF': 5}
+        missing = ('', '.')
+        gene_fields = ['GENE', 'SYMBOL', 'ANN', 'EFF', 'CSQ', 'GENEINFO']
+
+        for field in gene_fields:
+            if field not in record.INFO:
+                continue
+
+            gene_value = record.INFO[field]
+            if isinstance(gene_value, (list, tuple)):
+                if not gene_value:
+                    # An empty list has no first annotation to inspect.
+                    continue
+                gene_value = gene_value[0]
+            if gene_value is None:
+                # Avoid turning a missing value into the gene name "None".
+                continue
+
+            gene_str = str(gene_value).strip()
+
+            if field == 'GENEINFO':
+                # "SYMBOL:id|SYMBOL2:id2" -> "SYMBOL"
+                candidate = gene_str.split('|', 1)[0].split(':', 1)[0]
+            elif field in gene_index:
+                if field == 'EFF':
+                    # "Effect(a|b|...)" -> "a|b|..."
+                    gene_str = gene_str.partition('(')[2].rstrip(')')
+                parts = gene_str.split('|')
+                idx = gene_index[field]
+                candidate = parts[idx] if len(parts) > idx else ''
+            else:
+                # Plain gene value, possibly several joined with '|'.
+                candidate = next(
+                    (p for p in gene_str.split('|') if p.strip() not in missing),
+                    '',
+                )
+
+            candidate = candidate.strip()
+            if candidate not in missing:
+                return candidate
+
+        return None
     def _extract_impact(self, record) -> str:
         """Extract variant impact from VCF record."""
         # Try different fields where impact might be stored
     """Create VCF parser from configuration manager."""
     config = config_manager.data_config
+    # Initialize pathway mapper
     pathway_mapper = None
     if config.pathway_mapping_path:
         pathway_mapper = PathwayMapper(config.pathway_mapping_path)
     return VCFParser(
         config=config,
         pathway_mapper=pathway_mapper
     )
+# Example usage
 if __name__ == "__main__":
+    # Example usage with configuration
     config_manager = ConfigManager()
+    # Set up file paths
     config_manager.data_config.vcf_file_path = "example.vcf"
     config_manager.data_config.pathway_mapping_path = "pathway_mappings.json"
+    # Create parser
     parser = create_parser_from_config(config_manager)
+    # Parse VCF file
     try:
         hierarchical_data = parser.parse_vcf_file(config_manager.data_config.vcf_file_path)
+        # Print statistics
         stats = parser.get_parsing_statistics()
         print(f"Parsing Statistics: {stats}")
+        # Export results
         parser.export_parsed_data(
             hierarchical_data,
             "parsed_vcf_data.json",