Spaces:
No application file
No application file
Update predictor.py
Browse files- predictor.py +277 -63
predictor.py
CHANGED
|
@@ -1,24 +1,97 @@
|
|
| 1 |
-
# -*- coding: utf-8 -*-
|
| 2 |
-
"""predictor.ipynb
|
| 3 |
-
|
| 4 |
-
Automatically generated by Colab.
|
| 5 |
-
|
| 6 |
-
Original file is located at
|
| 7 |
-
https://colab.research.google.com/drive/1JURb-0j-R4LWK3oxeGrNxpJm3V6nnX02
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
import torch
|
| 11 |
import torch.nn as nn
|
| 12 |
import torch.nn.functional as F
|
| 13 |
import numpy as np
|
| 14 |
-
from typing import List, Tuple, Dict, Optional
|
| 15 |
import logging
|
| 16 |
import re
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Configure logging
|
| 19 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 20 |
|
| 21 |
-
# =============================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
class BoundaryAwareGenePredictor(nn.Module):
|
| 24 |
"""Multi-task model predicting genes, starts, and ends separately."""
|
|
@@ -80,8 +153,6 @@ class BoundaryAwareGenePredictor(nn.Module):
|
|
| 80 |
'end': self.end_classifier(attended)
|
| 81 |
}
|
| 82 |
|
| 83 |
-
# ============================= DATA PREPROCESSING =============================
|
| 84 |
-
|
| 85 |
class DNAProcessor:
|
| 86 |
"""DNA sequence processor with boundary-aware features."""
|
| 87 |
|
|
@@ -106,16 +177,16 @@ class DNAProcessor:
|
|
| 106 |
one_hot.scatter_(1, encoded.unsqueeze(1), 1)
|
| 107 |
features = [one_hot]
|
| 108 |
|
| 109 |
-
# Start codon indicators
|
| 110 |
start_indicators = torch.zeros(seq_len, 3)
|
| 111 |
for i in range(seq_len - 2):
|
| 112 |
codon = sequence[i:i+3]
|
| 113 |
if codon == 'ATG':
|
| 114 |
start_indicators[i:i+3, 0] = 1.0
|
| 115 |
elif codon == 'GTG':
|
| 116 |
-
start_indicators[i:i+3, 1] = 0.9
|
| 117 |
elif codon == 'TTG':
|
| 118 |
-
start_indicators[i:i+3, 2] = 0.8
|
| 119 |
features.append(start_indicators)
|
| 120 |
|
| 121 |
# Stop codon indicators
|
|
@@ -150,8 +221,6 @@ class DNAProcessor:
|
|
| 150 |
|
| 151 |
return torch.cat(features, dim=1) # 5 + 3 + 3 + 1 + 2 = 14
|
| 152 |
|
| 153 |
-
# ============================= POST-PROCESSING =============================
|
| 154 |
-
|
| 155 |
class EnhancedPostProcessor:
|
| 156 |
"""Enhanced post-processor with stricter boundary detection."""
|
| 157 |
|
|
@@ -164,8 +233,6 @@ class EnhancedPostProcessor:
|
|
| 164 |
def process_predictions(self, gene_probs: np.ndarray, start_probs: np.ndarray,
|
| 165 |
end_probs: np.ndarray, sequence: str = None) -> np.ndarray:
|
| 166 |
"""Process predictions with enhanced boundary detection."""
|
| 167 |
-
|
| 168 |
-
# More conservative thresholds
|
| 169 |
gene_pred = (gene_probs[:, 1] > 0.6).astype(int)
|
| 170 |
start_pred = (start_probs[:, 1] > 0.4).astype(int)
|
| 171 |
end_pred = (end_probs[:, 1] > 0.5).astype(int)
|
|
@@ -178,7 +245,6 @@ class EnhancedPostProcessor:
|
|
| 178 |
processed = self._refine_with_boundaries(gene_pred, start_pred, end_pred)
|
| 179 |
|
| 180 |
processed = self._apply_constraints(processed, sequence)
|
| 181 |
-
|
| 182 |
return processed
|
| 183 |
|
| 184 |
def _refine_with_codons_and_boundaries(self, gene_pred: np.ndarray,
|
|
@@ -205,7 +271,7 @@ class EnhancedPostProcessor:
|
|
| 205 |
|
| 206 |
for g_start, g_end in zip(gene_starts, gene_ends):
|
| 207 |
best_start = g_start
|
| 208 |
-
start_window = 100
|
| 209 |
nearby_starts = [pos for pos in start_codon_positions
|
| 210 |
if abs(pos - g_start) <= start_window]
|
| 211 |
|
|
@@ -216,7 +282,7 @@ class EnhancedPostProcessor:
|
|
| 216 |
codon = sequence[pos:pos+3]
|
| 217 |
codon_weight = 1.0 if codon == 'ATG' else (0.9 if codon == 'GTG' else 0.8)
|
| 218 |
boundary_score = start_pred[pos]
|
| 219 |
-
distance_penalty = abs(pos - g_start) / start_window * 0.2
|
| 220 |
score = codon_weight * 0.5 + boundary_score * 0.4 - distance_penalty
|
| 221 |
start_scores.append((score, pos))
|
| 222 |
|
|
@@ -306,10 +372,10 @@ class EnhancedPostProcessor:
|
|
| 306 |
|
| 307 |
return processed
|
| 308 |
|
| 309 |
-
# =============================
|
| 310 |
|
| 311 |
-
class
|
| 312 |
-
"""
|
| 313 |
|
| 314 |
def __init__(self, model_path: str = 'model/best_boundary_aware_model.pth',
|
| 315 |
device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
|
|
@@ -324,13 +390,50 @@ class GenePredictor:
|
|
| 324 |
self.model.eval()
|
| 325 |
self.processor = DNAProcessor()
|
| 326 |
self.post_processor = EnhancedPostProcessor()
|
| 327 |
-
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
sequence = sequence.upper()
|
| 330 |
if not re.match('^[ACTGN]+$', sequence):
|
| 331 |
-
logging.warning("Sequence contains invalid characters. Using 'N' for unknowns.")
|
| 332 |
sequence = ''.join(c if c in 'ACTGN' else 'N' for c in sequence)
|
| 333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
features = self.processor.create_enhanced_features(sequence).unsqueeze(0).to(self.device)
|
| 335 |
|
| 336 |
with torch.no_grad():
|
|
@@ -343,10 +446,95 @@ class GenePredictor:
|
|
| 343 |
gene_probs, start_probs, end_probs, sequence
|
| 344 |
)
|
| 345 |
confidence = np.mean(gene_probs[:, 1][predictions == 1]) if np.any(predictions == 1) else 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
|
| 349 |
def extract_gene_regions(self, predictions: np.ndarray, sequence: str) -> List[Dict]:
|
|
|
|
| 350 |
regions = []
|
| 351 |
changes = np.diff(np.concatenate(([0], predictions, [0])))
|
| 352 |
starts = np.where(changes == 1)[0]
|
|
@@ -370,9 +558,9 @@ class GenePredictor:
|
|
| 370 |
break
|
| 371 |
|
| 372 |
regions.append({
|
| 373 |
-
'start': int(start),
|
| 374 |
'end': int(end),
|
| 375 |
-
'sequence': gene_seq,
|
| 376 |
'length': int(end - start),
|
| 377 |
'start_codon': actual_start_codon,
|
| 378 |
'stop_codon': actual_stop_codon,
|
|
@@ -381,34 +569,60 @@ class GenePredictor:
|
|
| 381 |
|
| 382 |
return regions
|
| 383 |
|
| 384 |
-
def
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.nn as nn
|
| 3 |
import torch.nn.functional as F
|
| 4 |
import numpy as np
|
| 5 |
+
from typing import List, Tuple, Dict, Optional, Union
|
| 6 |
import logging
|
| 7 |
import re
|
| 8 |
+
import os
|
| 9 |
+
from pathlib import Path
|
| 10 |
|
| 11 |
# Configure logging
|
| 12 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 13 |
|
| 14 |
+
# ============================= FILE READERS =============================
|
| 15 |
+
|
| 16 |
+
class FileReader:
    """Handles reading DNA sequences from various file formats."""

    @staticmethod
    def read_fasta(file_path: str) -> Dict[str, str]:
        """
        Read a FASTA file and return a dict of sequence_id -> sequence.

        Header lines start with '>'; the text after '>' becomes the id.
        Sequence lines under a header are concatenated with internal
        spaces/tabs removed. Lines before the first header are ignored.

        Raises:
            OSError / UnicodeDecodeError: propagated (after logging) if the
                file cannot be opened or decoded.
        """
        sequences: Dict[str, str] = {}
        current_id = None
        current_seq: List[str] = []

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if line.startswith('>'):
                        # Save previous sequence if exists
                        if current_id is not None:
                            sequences[current_id] = ''.join(current_seq)
                        # Start new sequence
                        current_id = line[1:]  # Remove '>' character
                        current_seq = []
                    elif line and current_id is not None:
                        # Add sequence line (remove any whitespace)
                        current_seq.append(line.replace(' ', '').replace('\t', ''))

            # Don't forget the last sequence
            if current_id is not None:
                sequences[current_id] = ''.join(current_seq)

        except Exception as e:
            logging.error(f"Error reading FASTA file {file_path}: {e}")
            raise

        return sequences

    @staticmethod
    def read_txt(file_path: str) -> str:
        """
        Read a plain text file containing a DNA sequence.

        All characters other than A/C/T/G/N (case-insensitive) are
        discarded; the result is uppercased.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()
                # Remove any whitespace, newlines, and non-DNA characters
                sequence = ''.join(c.upper() for c in content if c.upper() in 'ACTGN')
                return sequence
        except Exception as e:
            logging.error(f"Error reading TXT file {file_path}: {e}")
            raise

    @staticmethod
    def detect_file_type(file_path: str) -> str:
        """
        Detect file type ('fasta' or 'txt') by extension, falling back to
        sniffing the first line for a '>' FASTA header.
        """
        file_path = Path(file_path)
        extension = file_path.suffix.lower()

        if extension in ['.fasta', '.fa', '.fas', '.fna']:
            return 'fasta'
        elif extension in ['.txt', '.seq']:
            return 'txt'
        else:
            # Try to detect by content
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    first_line = file.readline().strip()
                    if first_line.startswith('>'):
                        return 'fasta'
                    else:
                        return 'txt'
            # FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; only file-read failures should
            # trigger the txt fallback.
            except (OSError, UnicodeDecodeError):
                logging.warning(f"Could not detect file type for {file_path}, assuming txt")
                return 'txt'
|
| 92 |
+
|
| 93 |
+
# ============================= ORIGINAL MODEL COMPONENTS =============================
|
| 94 |
+
# (Including all the original classes: BoundaryAwareGenePredictor, DNAProcessor, EnhancedPostProcessor)
|
| 95 |
|
| 96 |
class BoundaryAwareGenePredictor(nn.Module):
|
| 97 |
"""Multi-task model predicting genes, starts, and ends separately."""
|
|
|
|
| 153 |
'end': self.end_classifier(attended)
|
| 154 |
}
|
| 155 |
|
|
|
|
|
|
|
| 156 |
class DNAProcessor:
|
| 157 |
"""DNA sequence processor with boundary-aware features."""
|
| 158 |
|
|
|
|
| 177 |
one_hot.scatter_(1, encoded.unsqueeze(1), 1)
|
| 178 |
features = [one_hot]
|
| 179 |
|
| 180 |
+
# Start codon indicators
|
| 181 |
start_indicators = torch.zeros(seq_len, 3)
|
| 182 |
for i in range(seq_len - 2):
|
| 183 |
codon = sequence[i:i+3]
|
| 184 |
if codon == 'ATG':
|
| 185 |
start_indicators[i:i+3, 0] = 1.0
|
| 186 |
elif codon == 'GTG':
|
| 187 |
+
start_indicators[i:i+3, 1] = 0.9
|
| 188 |
elif codon == 'TTG':
|
| 189 |
+
start_indicators[i:i+3, 2] = 0.8
|
| 190 |
features.append(start_indicators)
|
| 191 |
|
| 192 |
# Stop codon indicators
|
|
|
|
| 221 |
|
| 222 |
return torch.cat(features, dim=1) # 5 + 3 + 3 + 1 + 2 = 14
|
| 223 |
|
|
|
|
|
|
|
| 224 |
class EnhancedPostProcessor:
|
| 225 |
"""Enhanced post-processor with stricter boundary detection."""
|
| 226 |
|
|
|
|
| 233 |
def process_predictions(self, gene_probs: np.ndarray, start_probs: np.ndarray,
|
| 234 |
end_probs: np.ndarray, sequence: str = None) -> np.ndarray:
|
| 235 |
"""Process predictions with enhanced boundary detection."""
|
|
|
|
|
|
|
| 236 |
gene_pred = (gene_probs[:, 1] > 0.6).astype(int)
|
| 237 |
start_pred = (start_probs[:, 1] > 0.4).astype(int)
|
| 238 |
end_pred = (end_probs[:, 1] > 0.5).astype(int)
|
|
|
|
| 245 |
processed = self._refine_with_boundaries(gene_pred, start_pred, end_pred)
|
| 246 |
|
| 247 |
processed = self._apply_constraints(processed, sequence)
|
|
|
|
| 248 |
return processed
|
| 249 |
|
| 250 |
def _refine_with_codons_and_boundaries(self, gene_pred: np.ndarray,
|
|
|
|
| 271 |
|
| 272 |
for g_start, g_end in zip(gene_starts, gene_ends):
|
| 273 |
best_start = g_start
|
| 274 |
+
start_window = 100
|
| 275 |
nearby_starts = [pos for pos in start_codon_positions
|
| 276 |
if abs(pos - g_start) <= start_window]
|
| 277 |
|
|
|
|
| 282 |
codon = sequence[pos:pos+3]
|
| 283 |
codon_weight = 1.0 if codon == 'ATG' else (0.9 if codon == 'GTG' else 0.8)
|
| 284 |
boundary_score = start_pred[pos]
|
| 285 |
+
distance_penalty = abs(pos - g_start) / start_window * 0.2
|
| 286 |
score = codon_weight * 0.5 + boundary_score * 0.4 - distance_penalty
|
| 287 |
start_scores.append((score, pos))
|
| 288 |
|
|
|
|
| 372 |
|
| 373 |
return processed
|
| 374 |
|
| 375 |
+
# ============================= ENHANCED GENE PREDICTOR =============================
|
| 376 |
|
| 377 |
+
class EnhancedGenePredictor:
|
| 378 |
+
"""Enhanced Gene Predictor with file input support."""
|
| 379 |
|
| 380 |
def __init__(self, model_path: str = 'model/best_boundary_aware_model.pth',
|
| 381 |
device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
|
|
|
|
| 390 |
self.model.eval()
|
| 391 |
self.processor = DNAProcessor()
|
| 392 |
self.post_processor = EnhancedPostProcessor()
|
| 393 |
+
self.file_reader = FileReader()
|
| 394 |
+
|
| 395 |
+
def predict_from_file(self, file_path: str) -> Dict[str, Dict]:
    """
    Predict genes for every sequence contained in *file_path*.

    Supports FASTA (one entry per header) and plain-text files (whole file
    treated as a single sequence named after the file). Returns a mapping
    of sequence_id -> prediction result dict.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    file_type = self.file_reader.detect_file_type(file_path)
    logging.info(f"Detected file type: {file_type}")

    if file_type == 'fasta':
        results = {}
        for record_id, record_seq in self.file_reader.read_fasta(file_path).items():
            logging.info(f"Processing sequence: {record_id} (length: {len(record_seq)})")
            results[record_id] = self.predict_sequence(record_seq, record_id)
        return results

    # Plain-text input: the whole file is one sequence, named after the file.
    dna = self.file_reader.read_txt(file_path)
    record_id = Path(file_path).stem  # Use filename as sequence ID
    logging.info(f"Processing sequence from {file_path} (length: {len(dna)})")
    return {record_id: self.predict_sequence(dna, record_id)}
|
| 422 |
+
|
| 423 |
+
def predict_sequence(self, sequence: str, seq_id: str = "sequence") -> Dict:
|
| 424 |
+
"""
|
| 425 |
+
Predict genes from a single DNA sequence string
|
| 426 |
+
"""
|
| 427 |
sequence = sequence.upper()
|
| 428 |
if not re.match('^[ACTGN]+$', sequence):
|
| 429 |
+
logging.warning(f"Sequence {seq_id} contains invalid characters. Using 'N' for unknowns.")
|
| 430 |
sequence = ''.join(c if c in 'ACTGN' else 'N' for c in sequence)
|
| 431 |
|
| 432 |
+
# Handle very long sequences by chunking if needed
|
| 433 |
+
max_chunk_size = 50000 # Adjust based on your GPU memory
|
| 434 |
+
if len(sequence) > max_chunk_size:
|
| 435 |
+
return self._predict_long_sequence(sequence, seq_id, max_chunk_size)
|
| 436 |
+
|
| 437 |
features = self.processor.create_enhanced_features(sequence).unsqueeze(0).to(self.device)
|
| 438 |
|
| 439 |
with torch.no_grad():
|
|
|
|
| 446 |
gene_probs, start_probs, end_probs, sequence
|
| 447 |
)
|
| 448 |
confidence = np.mean(gene_probs[:, 1][predictions == 1]) if np.any(predictions == 1) else 0.0
|
| 449 |
+
|
| 450 |
+
gene_regions = self.extract_gene_regions(predictions, sequence)
|
| 451 |
+
|
| 452 |
+
return {
|
| 453 |
+
'sequence_id': seq_id,
|
| 454 |
+
'sequence_length': len(sequence),
|
| 455 |
+
'predictions': predictions.tolist(),
|
| 456 |
+
'probabilities': {
|
| 457 |
+
'gene': gene_probs.tolist(),
|
| 458 |
+
'start': start_probs.tolist(),
|
| 459 |
+
'end': end_probs.tolist()
|
| 460 |
+
},
|
| 461 |
+
'confidence': float(confidence),
|
| 462 |
+
'gene_regions': gene_regions,
|
| 463 |
+
'total_genes_found': len(gene_regions)
|
| 464 |
+
}
|
| 465 |
|
| 466 |
+
def _predict_long_sequence(self, sequence: str, seq_id: str, chunk_size: int) -> Dict:
    """
    Run prediction on a sequence too long for a single forward pass.

    The sequence is scanned in windows of ``chunk_size`` characters whose
    starts are ``chunk_size - 1000`` apart (a 1000-position overlap); for
    every window after the first, the overlapped prefix is dropped before
    stitching, so each position contributes exactly once.
    """
    overlap = 1000  # Overlap between chunks to avoid missing genes at boundaries
    stitched_preds = []
    stitched_gene = []
    stitched_start = []
    stitched_end = []

    for offset in range(0, len(sequence), chunk_size - overlap):
        window = sequence[offset:min(offset + chunk_size, len(sequence))]

        logging.info(f"Processing chunk {offset//chunk_size + 1} of sequence {seq_id}")

        feats = self.processor.create_enhanced_features(window).unsqueeze(0).to(self.device)

        with torch.no_grad():
            raw = self.model(feats)
            win_gene = F.softmax(raw['gene'], dim=-1).cpu().numpy()[0]
            win_start = F.softmax(raw['start'], dim=-1).cpu().numpy()[0]
            win_end = F.softmax(raw['end'], dim=-1).cpu().numpy()[0]

        win_preds = self.post_processor.process_predictions(
            win_gene, win_start, win_end, window
        )

        # First window is kept whole; later windows drop the overlapped prefix.
        drop = 0 if offset == 0 else min(overlap, len(win_preds))
        stitched_preds.extend(win_preds[drop:])
        stitched_gene.extend(win_gene[drop:])
        stitched_start.extend(win_start[drop:])
        stitched_end.extend(win_end[drop:])

    predictions = np.array(stitched_preds)
    gene_probs = np.array(stitched_gene)
    start_probs = np.array(stitched_start)
    end_probs = np.array(stitched_end)

    # Mean gene-class probability over predicted-gene positions; 0.0 when none.
    confidence = np.mean(gene_probs[:, 1][predictions == 1]) if np.any(predictions == 1) else 0.0
    gene_regions = self.extract_gene_regions(predictions, sequence)

    return {
        'sequence_id': seq_id,
        'sequence_length': len(sequence),
        'predictions': predictions.tolist(),
        'probabilities': {
            'gene': gene_probs.tolist(),
            'start': start_probs.tolist(),
            'end': end_probs.tolist()
        },
        'confidence': float(confidence),
        'gene_regions': gene_regions,
        'total_genes_found': len(gene_regions)
    }
|
| 529 |
+
|
| 530 |
+
def predict_from_text(self, sequence: str) -> Dict:
    """Backward-compatible alias: predict genes directly from a string."""
    return self.predict_sequence(sequence)
|
| 535 |
|
| 536 |
def extract_gene_regions(self, predictions: np.ndarray, sequence: str) -> List[Dict]:
|
| 537 |
+
"""Extract gene regions from predictions"""
|
| 538 |
regions = []
|
| 539 |
changes = np.diff(np.concatenate(([0], predictions, [0])))
|
| 540 |
starts = np.where(changes == 1)[0]
|
|
|
|
| 558 |
break
|
| 559 |
|
| 560 |
regions.append({
|
| 561 |
+
'start': int(start),
|
| 562 |
'end': int(end),
|
| 563 |
+
'sequence': gene_seq,
|
| 564 |
'length': int(end - start),
|
| 565 |
'start_codon': actual_start_codon,
|
| 566 |
'stop_codon': actual_stop_codon,
|
|
|
|
| 569 |
|
| 570 |
return regions
|
| 571 |
|
| 572 |
+
def save_results(self, results: Dict[str, Dict], output_path: str, format: str = 'json'):
    """
    Save prediction results to file.

    Args:
        results: Mapping of sequence_id -> prediction dict, as produced by
            predict_sequence / predict_from_file.
        output_path: Destination file path.
        format: 'json' (full result dicts) or 'csv' (one row per gene region).

    Raises:
        ValueError: if *format* is neither 'json' nor 'csv'. (FIX: previously
            an unknown format silently wrote nothing while still logging
            "Results saved".)
    """
    import json

    if format.lower() == 'json':
        with open(output_path, 'w') as f:
            json.dump(results, f, indent=2)
    elif format.lower() == 'csv':
        import csv
        with open(output_path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['sequence_id', 'gene_start', 'gene_end', 'gene_length',
                             'start_codon', 'stop_codon', 'in_frame', 'confidence'])

            for seq_id, result in results.items():
                for gene in result['gene_regions']:
                    writer.writerow([
                        seq_id, gene['start'], gene['end'], gene['length'],
                        gene['start_codon'], gene['stop_codon'], gene['in_frame'],
                        result['confidence']
                    ])
    else:
        raise ValueError(f"Unsupported output format: {format}")

    logging.info(f"Results saved to {output_path}")
|
| 597 |
+
|
| 598 |
+
# ============================= USAGE EXAMPLE =============================
|
| 599 |
+
|
| 600 |
+
def main():
    """Example usage of the enhanced gene predictor."""
    predictor = EnhancedGenePredictor(model_path='model/best_boundary_aware_model.pth')

    # FASTA file input -> JSON output
    try:
        fasta_results = predictor.predict_from_file('example.fasta')
        predictor.save_results(fasta_results, 'fasta_predictions.json')
        print("FASTA predictions saved to fasta_predictions.json")
    except FileNotFoundError:
        print("example.fasta not found, skipping FASTA example")

    # Plain-text file input -> CSV output
    try:
        txt_results = predictor.predict_from_file('example.txt')
        predictor.save_results(txt_results, 'txt_predictions.csv', format='csv')
        print("TXT predictions saved to txt_predictions.csv")
    except FileNotFoundError:
        print("example.txt not found, skipping TXT example")

    # Direct string input (original functionality)
    example_sequence = "ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA"
    text_results = predictor.predict_from_text(example_sequence)
    print(f"Found {text_results['total_genes_found']} genes in example sequence")

if __name__ == "__main__":
    main()
|