Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Files Community

re-type committed on Jun 7, 2025

Commit

31e359a

1 Parent(s): 1cebb06

Add model file and app files

Browse files

Files changed (3) hide show

app.py +257 -0
best_boundary_aware_model.pth +3 -0
predictor.py +414 -0

app.py ADDED Viewed

	@@ -0,0 +1,257 @@

+import gradio as gr
+import torch
+import numpy as np
+import json
+from typing import Optional, List, Dict, Tuple
+import logging
+from predictor import GenePredictor
+# Configure logging: INFO level, every record prefixed with timestamp and level name
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# Initialize the predictor once at import time so every request reuses the same instance.
+# If construction fails for any reason, the error is logged and `predictor` is set to None;
+# predict_gene_regions checks for None and returns a "Model not loaded" message.
+try:
+    predictor = GenePredictor(model_path='best_boundary_aware_model.pth')
+    logger.info("Gene predictor initialized successfully")
+except Exception as e:
+    logger.error(f"Failed to initialize predictor: {e}")
+    predictor = None
+def predict_gene_regions(sequence: str,
+                        ground_truth_labels: Optional[str] = None,
+                        ground_truth_start: Optional[int] = None,
+                        ground_truth_end: Optional[int] = None) -> Tuple[str, str, str]:
+    """
+    Main prediction function for Gradio interface
+    Returns:
+        - regions_display: Formatted string showing predicted regions
+        - metrics_display: Formatted string showing accuracy metrics (if ground truth provided)
+        - detailed_json: JSON string with full prediction details
+    """
+    if predictor is None:
+        error_msg = "❌ Model not loaded. Please check the model file."
+        return error_msg, "", ""
+    # Input validation
+    sequence = sequence.strip().upper()
+    if not sequence:
+        error_msg = "❌ Sequence cannot be empty"
+        return error_msg, "", ""
+    if not all(c in 'ACTGN' for c in sequence):
+        error_msg = "❌ Sequence contains invalid characters. Only A, C, T, G, N allowed"
+        return error_msg, "", ""
+    # Process ground truth if provided
+    labels = None
+    try:
+        if ground_truth_labels and ground_truth_labels.strip():
+            labels = [int(x) for x in ground_truth_labels.split(',')]
+            if len(labels) != len(sequence):
+                error_msg = f"❌ Labels length ({len(labels)}) must match sequence length ({len(sequence)})"
+                return error_msg, "", ""
+            if not all(x in (0, 1) for x in labels):
+                error_msg = "❌ Labels must be 0 or 1"
+                return error_msg, "", ""
+        elif ground_truth_start is not None and ground_truth_end is not None:
+            start = int(ground_truth_start)
+            end = int(ground_truth_end)
+            if start < 0 or end > len(sequence) or start >= end:
+                error_msg = f"❌ Invalid coordinates: start={start}, end={end}"
+                return error_msg, "", ""
+            labels = predictor.labels_from_coordinates(len(sequence), start, end)
+    except ValueError as e:
+        error_msg = f"❌ Invalid ground truth format: {str(e)}"
+        return error_msg, "", ""
+    # Make prediction
+    try:
+        predictions, probs_dict, confidence = predictor.predict(sequence)
+        regions = predictor.extract_gene_regions(predictions, sequence)
+        # Format regions display
+        regions_display = format_regions_display(regions, confidence)
+        # Compute metrics if ground truth provided
+        metrics_display = ""
+        metrics = None
+        if labels is not None:
+            metrics = predictor.compute_accuracy(predictions, labels)
+            metrics_display = format_metrics_display(metrics)
+        # Create detailed JSON response
+        detailed_response = {
+            "regions": regions,
+            "confidence": float(confidence),
+            "metrics": metrics,
+            "sequence_length": len(sequence),
+            "num_predicted_genes": len(regions),
+            "prediction_summary": {
+                "total_gene_positions": int(np.sum(predictions)),
+                "gene_coverage": float(np.sum(predictions) / len(predictions))
+            }
+        }
+        detailed_json = json.dumps(detailed_response, indent=2)
+        return regions_display, metrics_display, detailed_json
+    except Exception as e:
+        logger.error(f"Prediction failed: {e}")
+        error_msg = f"❌ Prediction failed: {str(e)}"
+        return error_msg, "", ""
+def format_regions_display(regions: List[Dict], confidence: float) -> str:
+    """Format the regions for display in the Gradio interface"""
+    if not regions:
+        return f"🔍 **No gene regions detected** (Confidence: {confidence:.3f})\n\nThe model did not identify any gene regions in the provided sequence."
+    display = f"🧬 **Found {len(regions)} gene region(s)** (Overall Confidence: {confidence:.3f})\n\n"
+    for i, region in enumerate(regions, 1):
+        display += f"**Gene {i}:**\n"
+        display += f"  • Position: {region['start']} - {region['end']}\n"
+        display += f"  • Length: {region['length']} bp\n"
+        display += f"  • Start Codon: {region['start_codon'] or 'None detected'}\n"
+        display += f"  • Stop Codon: {region['stop_codon'] or 'None detected'}\n"
+        display += f"  • In Frame: {'✅ Yes' if region['in_frame'] else '❌ No'}\n"
+        display += f"  • Sequence Preview: {region['sequence'][:60]}{'...' if len(region['sequence']) > 60 else ''}\n\n"
+    return display
+def format_metrics_display(metrics: Dict) -> str:
+    """Format the accuracy metrics for display"""
+    display = "📊 **Accuracy Metrics** (vs Ground Truth)\n\n"
+    display += f"  • **Accuracy:** {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)\n"
+    display += f"  • **Precision:** {metrics['precision']:.3f}\n"
+    display += f"  • **Recall:** {metrics['recall']:.3f}\n"
+    display += f"  • **F1 Score:** {metrics['f1']:.3f}\n\n"
+    display += f"**Confusion Matrix:**\n"
+    display += f"  • True Positives: {metrics['true_positives']}\n"
+    display += f"  • False Positives: {metrics['false_positives']}\n"
+    display += f"  • False Negatives: {metrics['false_negatives']}\n"
+    return display
+def load_example_sequence() -> str:
+    """Return a hard-coded example DNA sequence used to pre-fill the sequence textbox."""
+    # NOTE(review): the literal appears to wrap onto a second line in this view — confirm
+    # whether that is a real newline inside the string or only a display artifact.
+    example = """ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGGGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGATGGCAGGTTTCACCGCCGGTAATGAAAAAGGCGAACTGGTGGTGCTTGGACGCAACGGTTCCGACTACTCTGCTGCGGTGCTGGCTGCCTGTTTACGCGCCGATTGTTGCGAGATTTGGACGGACGTTGACGGGGTCTATACCTGCGACCCGCGTCAGGTGCCCGATGCGAGGTTGTTGAAGTCGATGTCCTACCAGGAAGCGATGGAGCTTTCCTACTTCGGCGCTAAAGTTCTTCACCCCCGCACCATTACCCCCATCGCCCAGTTCCAGATCCCTTGCCTGATTAAAAATACCGGAAATCCTCAAGCACCAGGTACGCTCATTGGTGCCAGCCGTGATGAAGACGAATTACCGGTCAAGGGCATTTCCAATCTGAATAACATGGCAATGTTCAGCGTTTCCGGCCCGGGGATGAAAGGGATGGTCGGCATGGCGGCGCGCGTCTTTGCAGCGATGTCACGCGCCCGTATTTCCGTGGTGCTGATTACGCAATCATCTTCCGAATACAGCATCAGTTTCTGCGTTCCACAAAGCGACTGTGTGCGAGCTGAACGGGCAATGCAGGAAGAGTTCTACCTGGAACTGAAAGAAGGCTTACTGGAGCCGCTGGCAGTGACGGAACGGCTGGCCATTATCTCGGTGGTAGGTGATGGTATGCGCACCTTGCGTGGGATCTCGGCACCAGCGAAAGACGGTGGGCCGTGGATAAAGCGCGGCGTCTCGGCGTTTTCGGACCCCGCGGTCTCTTAACCCGAGTCCGAAAATTGTGATCGGGGCCGGGTTTAACGATGGAGCGATCGGGTCAATTGGGGCTGCACCGTTTGACCTGAAGACGCCGGCGGGAAACCGCGTTTCGTTTGCCAGGCGTGAGAGTATTCTTTCCGGCTCCGGTATAGCTGAAACATGAAATGCTTTCCCCTGCGCTTGGCCGATACGCTGGTTTAAGACTTCGGATCGCCGGGAAAGTCGCCCCCCACATTCTGCCAACGATTTGGTTAAAATAGTGACATTGGTGGAAACGGGGAAATGGGTTGACGGTTTTGAAGGGCGTGTCACACCATCGGTTGTTGGCGTTGACAAACGCGATCCGTATAATGAAACTGAATTTGTACACTTTCGCGTCGGGGATGTGGTCAGCAGTTAGGCTCCAATTGATGCCACGTTGACATGATCAATACCTGCGTGCCGGTCACAATCACCTTACCACCCAGTCCGATCAACGCCTGCGCGGGTGCGCAGATACGCGTGGTGTGTCTCGCGAACCGGGATCGTCG
CACGGGCATGGAACACTATGGTGAGCAAGGGCGAGGAGTGATTACGCCTGATCTGCTGTTGAGAAGAAGCGCGTCTACCCCTCGGGACAAGGCAAAGAATTTGCTGCAGAAATACGCTGGAGATTGAAGGTTCTGGGAAACGTTTTGTTGACAGTTTACCTCCTGGACGATCCCGCGCCCGCAGGCTGGCGTCGCGATGAAACGAATTTCGGTTCACGGCCGGTGTAAGACGATCGATGGGCAGGGAATTGATGCCGATGCGGATGCCGCACCCGGGAAAGAACACGCTGCTGTGTACTGTCGGGTCGAAGAAAAGCTTGAAAGCGGGCGAAATTTTTCGCGCACCGTCGATGATCCGCACCCGCGAATTCGACCAGTGAAAGCGACTCGCGATGCGGCCGCGCTACAGGTTGTTAACCTGAATGAGGGCTAG"""
+    return example
+# Create the Gradio interface
+def create_interface():
+    """Build and return the Gradio Blocks UI for the gene prediction tool.
+    The left column holds the sequence input, the two action buttons and the optional
+    ground-truth fields; the right column holds the Markdown results, the Markdown
+    metrics and a collapsed JSON viewer. Nothing is launched here; the caller is
+    expected to call .launch() on the returned object.
+    """
+    with gr.Blocks(title="F Gene Prediction Tool", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("""
+        # 🧬 F Gene Prediction Tool
+        This tool predicts gene regions in DNA sequences using a boundary-aware deep learning model.
+        The model identifies start and end positions of genes, along with confidence scores and detailed analysis.
+        """)
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Input section
+                gr.Markdown("## 📝 Input")
+                sequence_input = gr.Textbox(
+                    label="DNA Sequence",
+                    placeholder="Enter your DNA sequence (A, C, T, G, N only)...",
+                    lines=5,
+                    max_lines=10
+                )
+                with gr.Row():
+                    example_btn = gr.Button("📋 Load Example Sequence", variant="secondary")
+                    predict_btn = gr.Button("🔬 Predict Genes", variant="primary")
+                # Ground truth section (optional)
+                gr.Markdown("## 🎯 Ground Truth (Optional)")
+                gr.Markdown("*Provide ground truth data to calculate accuracy metrics*")
+                with gr.Row():
+                    # None defaults: predict_gene_regions only uses the coordinates
+                    # when both start and end are not None.
+                    gt_start = gr.Number(
+                        label="Ground Truth Start Position",
+                        precision=0,
+                        value=None
+                    )
+                    gt_end = gr.Number(
+                        label="Ground Truth End Position",
+                        precision=0,
+                        value=None
+                    )
+                # Per-base labels; predict_gene_regions prefers these over start/end
+                # when the textbox is not blank.
+                gt_labels = gr.Textbox(
+                    label="Ground Truth Labels (comma-separated 0s and 1s)",
+                    placeholder="0,0,1,1,1,0,0... (optional, alternative to start/end)",
+                    lines=2
+                )
+            with gr.Column(scale=3):
+                # Output section
+                gr.Markdown("## 🔬 Prediction Results")
+                regions_output = gr.Markdown(
+                    label="Predicted Gene Regions",
+                    value="*Results will appear here after prediction...*"
+                )
+                with gr.Row():
+                    with gr.Column():
+                        metrics_output = gr.Markdown(
+                            label="Accuracy Metrics",
+                            value="*Metrics will appear here if ground truth is provided...*"
+                        )
+                # Detailed JSON output (collapsible)
+                with gr.Accordion("📄 Detailed JSON Output", open=False):
+                    json_output = gr.Code(
+                        label="Full Prediction Details",
+                        language="json",
+                        value="{}",
+                        lines=20
+                    )
+        # Event handlers
+        # The example button writes the value returned by load_example_sequence
+        # into the sequence textbox.
+        example_btn.click(
+            fn=load_example_sequence,
+            outputs=sequence_input
+        )
+        # The inputs list order must match predict_gene_regions' parameter order
+        # (sequence, ground_truth_labels, ground_truth_start, ground_truth_end); the
+        # outputs list order must match its returned 3-tuple.
+        predict_btn.click(
+            fn=predict_gene_regions,
+            inputs=[sequence_input, gt_labels, gt_start, gt_end],
+            outputs=[regions_output, metrics_output, json_output]
+        )
+        # Also trigger prediction on Enter in the sequence box
+        sequence_input.submit(
+            fn=predict_gene_regions,
+            inputs=[sequence_input, gt_labels, gt_start, gt_end],
+            outputs=[regions_output, metrics_output, json_output]
+        )
+        # Footer
+        gr.Markdown("""
+        ---
+        **Model Info:** Boundary-aware gene prediction using multi-task deep learning
+        **Supported:** DNA sequences with A, C, T, G, N nucleotides
+        **Output:** Gene regions with start/end positions, codons, and confidence scores
+        """)
+    return interface
+# Launch the app
+if __name__ == "__main__":
+    interface = create_interface()
+    interface.launch(
+        server_name="0.0.0.0",  # Required for Hugging Face Spaces
+        server_port=7860,       # Standard port for HF Spaces
+        share=True
+    )

best_boundary_aware_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13c92e4883bba94b680ba84904e2c36a3c01105196c2a935c979b583fe0dc30c
+size 6410291

predictor.py ADDED Viewed

	@@ -0,0 +1,414 @@

+# -*- coding: utf-8 -*-
+"""predictor.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1JURb-0j-R4LWK3oxeGrNxpJm3V6nnX02
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from typing import List, Tuple, Dict, Optional
+import logging
+import re
+# Configure root logging at import time (INFO level, timestamped records).
+# logging.basicConfig does nothing when the root logger already has handlers, so
+# whichever module calls it first decides the configuration.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# ============================= MODEL COMPONENTS =============================
+class BoundaryAwareGenePredictor(nn.Module):
+    """Multi-task model predicting genes, starts, and ends separately."""
+    def __init__(self, input_dim: int = 14, hidden_dim: int = 256,
+                 num_layers: int = 3, dropout: float = 0.3):
+        super().__init__()
+        self.conv_layers = nn.ModuleList([
+            nn.Conv1d(input_dim, hidden_dim//4, kernel_size=k, padding=k//2)
+            for k in [3, 7, 15, 31]
+        ])
+        self.lstm = nn.LSTM(hidden_dim, hidden_dim//2, num_layers,
+                          batch_first=True, bidirectional=True, dropout=dropout)
+        self.norm = nn.LayerNorm(hidden_dim)
+        self.dropout = nn.Dropout(dropout)
+        self.boundary_attention = nn.MultiheadAttention(hidden_dim, num_heads=8, batch_first=True)
+        self.gene_classifier = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim//2),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim//2, 2)
+        )
+        self.start_classifier = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim//2),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim//2, 2)
+        )
+        self.end_classifier = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim//2),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim//2, 2)
+        )
+    def forward(self, x: torch.Tensor, lengths: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+        batch_size, seq_len, _ = x.shape
+        x_conv = x.transpose(1, 2)
+        conv_features = [F.relu(conv(x_conv)) for conv in self.conv_layers]
+        features = torch.cat(conv_features, dim=1).transpose(1, 2)
+        if lengths is not None:
+            packed = nn.utils.rnn.pack_padded_sequence(
+                features, lengths.cpu(), batch_first=True, enforce_sorted=False
+            )
+            lstm_out, _ = self.lstm(packed)
+            lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
+        else:
+            lstm_out, _ = self.lstm(features)
+        lstm_out = self.norm(lstm_out)
+        attended, _ = self.boundary_attention(lstm_out, lstm_out, lstm_out)
+        attended = self.dropout(attended)
+        return {
+            'gene': self.gene_classifier(attended),
+            'start': self.start_classifier(attended),
+            'end': self.end_classifier(attended)
+        }
+# ============================= DATA PREPROCESSING =============================
+class DNAProcessor:
+    """DNA sequence processor with boundary-aware features."""
+    def __init__(self):
+        self.base_to_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
+        self.idx_to_base = {v: k for k, v in self.base_to_idx.items()}
+        self.start_codons = {'ATG', 'GTG', 'TTG'}
+        self.stop_codons = {'TAA', 'TAG', 'TGA'}
+    def encode_sequence(self, sequence: str) -> torch.Tensor:
+        sequence = sequence.upper()
+        encoded = [self.base_to_idx.get(base, self.base_to_idx['N']) for base in sequence]
+        return torch.tensor(encoded, dtype=torch.long)
+    def create_enhanced_features(self, sequence: str) -> torch.Tensor:
+        sequence = sequence.upper()
+        seq_len = len(sequence)
+        encoded = self.encode_sequence(sequence)
+        # One-hot encoding
+        one_hot = torch.zeros(seq_len, 5)
+        one_hot.scatter_(1, encoded.unsqueeze(1), 1)
+        features = [one_hot]
+        # Start codon indicators (increased weights for GTG and TTG)
+        start_indicators = torch.zeros(seq_len, 3)
+        for i in range(seq_len - 2):
+            codon = sequence[i:i+3]
+            if codon == 'ATG':
+                start_indicators[i:i+3, 0] = 1.0
+            elif codon == 'GTG':
+                start_indicators[i:i+3, 1] = 0.9  # Increased from 0.7
+            elif codon == 'TTG':
+                start_indicators[i:i+3, 2] = 0.8  # Increased from 0.5
+        features.append(start_indicators)
+        # Stop codon indicators
+        stop_indicators = torch.zeros(seq_len, 3)
+        for i in range(seq_len - 2):
+            codon = sequence[i:i+3]
+            if codon == 'TAA':
+                stop_indicators[i:i+3, 0] = 1.0
+            elif codon == 'TAG':
+                stop_indicators[i:i+3, 1] = 1.0
+            elif codon == 'TGA':
+                stop_indicators[i:i+3, 2] = 1.0
+        features.append(stop_indicators)
+        # GC content
+        gc_content = torch.zeros(seq_len, 1)
+        window_size = 50
+        for i in range(seq_len):
+            start = max(0, i - window_size//2)
+            end = min(seq_len, i + window_size//2)
+            window = sequence[start:end]
+            gc_count = window.count('G') + window.count('C')
+            gc_content[i, 0] = gc_count / len(window) if len(window) > 0 else 0
+        features.append(gc_content)
+        # Position encoding
+        pos_encoding = torch.zeros(seq_len, 2)
+        positions = torch.arange(seq_len, dtype=torch.float)
+        pos_encoding[:, 0] = torch.sin(positions / 10000)
+        pos_encoding[:, 1] = torch.cos(positions / 10000)
+        features.append(pos_encoding)
+        return torch.cat(features, dim=1)  # 5 + 3 + 3 + 1 + 2 = 14
+# ============================= POST-PROCESSING =============================
+class EnhancedPostProcessor:
+    """Enhanced post-processor with stricter boundary detection."""
+    def __init__(self, min_gene_length: int = 150, max_gene_length: int = 5000):
+        self.min_gene_length = min_gene_length
+        self.max_gene_length = max_gene_length
+        self.start_codons = {'ATG', 'GTG', 'TTG'}
+        self.stop_codons = {'TAA', 'TAG', 'TGA'}
+    def process_predictions(self, gene_probs: np.ndarray, start_probs: np.ndarray,
+                          end_probs: np.ndarray, sequence: str = None) -> np.ndarray:
+        """Process predictions with enhanced boundary detection."""
+        # More conservative thresholds
+        gene_pred = (gene_probs[:, 1] > 0.6).astype(int)
+        start_pred = (start_probs[:, 1] > 0.4).astype(int)
+        end_pred = (end_probs[:, 1] > 0.5).astype(int)
+        if sequence is not None:
+            processed = self._refine_with_codons_and_boundaries(
+                gene_pred, start_pred, end_pred, sequence
+            )
+        else:
+            processed = self._refine_with_boundaries(gene_pred, start_pred, end_pred)
+        processed = self._apply_constraints(processed, sequence)
+        return processed
+    def _refine_with_codons_and_boundaries(self, gene_pred: np.ndarray,
+                                         start_pred: np.ndarray, end_pred: np.ndarray,
+                                         sequence: str) -> np.ndarray:
+        refined = gene_pred.copy()
+        sequence = sequence.upper()
+        start_codon_positions = []
+        stop_codon_positions = []
+        for i in range(len(sequence) - 2):
+            codon = sequence[i:i+3]
+            if codon in self.start_codons:
+                start_codon_positions.append(i)
+            if codon in self.stop_codons:
+                stop_codon_positions.append(i + 3)
+        changes = np.diff(np.concatenate(([0], gene_pred, [0])))
+        gene_starts = np.where(changes == 1)[0]
+        gene_ends = np.where(changes == -1)[0]
+        refined = np.zeros_like(gene_pred)
+        for g_start, g_end in zip(gene_starts, gene_ends):
+            best_start = g_start
+            start_window = 100  # Increased from 50
+            nearby_starts = [pos for pos in start_codon_positions
+                           if abs(pos - g_start) <= start_window]
+            if nearby_starts:
+                start_scores = []
+                for pos in nearby_starts:
+                    if pos < len(start_pred):
+                        codon = sequence[pos:pos+3]
+                        codon_weight = 1.0 if codon == 'ATG' else (0.9 if codon == 'GTG' else 0.8)
+                        boundary_score = start_pred[pos]
+                        distance_penalty = abs(pos - g_start) / start_window * 0.2  # Add distance penalty
+                        score = codon_weight * 0.5 + boundary_score * 0.4 - distance_penalty
+                        start_scores.append((score, pos))
+                if start_scores:
+                    best_start = max(start_scores, key=lambda x: x[0])[1]
+            best_end = g_end
+            end_window = 100
+            nearby_ends = [pos for pos in stop_codon_positions
+                          if g_start < pos <= g_end + end_window]
+            if nearby_ends:
+                end_scores = []
+                for pos in nearby_ends:
+                    gene_length = pos - best_start
+                    if self.min_gene_length <= gene_length <= self.max_gene_length:
+                        if pos < len(end_pred):
+                            frame_bonus = 0.2 if (pos - best_start) % 3 == 0 else 0
+                            boundary_score = end_pred[pos]
+                            length_penalty = abs(gene_length - 1000) / 10000
+                            score = boundary_score + frame_bonus - length_penalty
+                            end_scores.append((score, pos))
+                if end_scores:
+                    best_end = max(end_scores, key=lambda x: x[0])[1]
+            gene_length = best_end - best_start
+            if (gene_length >= self.min_gene_length and
+                gene_length <= self.max_gene_length and
+                best_start < best_end):
+                refined[best_start:best_end] = 1
+        return refined
+    def _refine_with_boundaries(self, gene_pred: np.ndarray, start_pred: np.ndarray,
+                               end_pred: np.ndarray) -> np.ndarray:
+        refined = gene_pred.copy()
+        changes = np.diff(np.concatenate(([0], gene_pred, [0])))
+        gene_starts = np.where(changes == 1)[0]
+        gene_ends = np.where(changes == -1)[0]
+        for g_start, g_end in zip(gene_starts, gene_ends):
+            start_window = slice(max(0, g_start-30), min(len(start_pred), g_start+30))
+            start_candidates = np.where(start_pred[start_window])[0]
+            if len(start_candidates) > 0:
+                relative_positions = start_candidates + max(0, g_start-30)
+                distances = np.abs(relative_positions - g_start)
+                best_start_idx = np.argmin(distances)
+                new_start = relative_positions[best_start_idx]
+                refined[g_start:new_start] = 0 if new_start > g_start else refined[g_start:new_start]
+                refined[new_start:g_end] = 1
+                g_start = new_start
+            end_window = slice(max(0, g_end-50), min(len(end_pred), g_end+50))
+            end_candidates = np.where(end_pred[end_window])[0]
+            if len(end_candidates) > 0:
+                relative_positions = end_candidates + max(0, g_end-50)
+                valid_ends = [pos for pos in relative_positions
+                            if self.min_gene_length <= pos - g_start <= self.max_gene_length]
+                if valid_ends:
+                    distances = np.abs(np.array(valid_ends) - g_end)
+                    new_end = valid_ends[np.argmin(distances)]
+                    refined[g_start:new_end] = 1
+                    refined[new_end:g_end] = 0 if new_end < g_end else refined[new_end:g_end]
+        return refined
+    def _apply_constraints(self, predictions: np.ndarray, sequence: str = None) -> np.ndarray:
+        processed = predictions.copy()
+        changes = np.diff(np.concatenate(([0], predictions, [0])))
+        starts = np.where(changes == 1)[0]
+        ends = np.where(changes == -1)[0]
+        for start, end in zip(starts, ends):
+            gene_length = end - start
+            if gene_length < self.min_gene_length or gene_length > self.max_gene_length:
+                processed[start:end] = 0
+                continue
+            if sequence is not None:
+                if gene_length % 3 != 0:
+                    new_length = (gene_length // 3) * 3
+                    if new_length >= self.min_gene_length:
+                        new_end = start + new_length
+                        processed[new_end:end] = 0
+                    else:
+                        processed[start:end] = 0
+        return processed
+# ============================= PREDICTION =============================
+class GenePredictor:
+    """Handles gene prediction using the trained boundary-aware model."""
+    def __init__(self, model_path: str = 'model/best_boundary_aware_model.pth',
+                 device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
+        self.device = device
+        self.model = BoundaryAwareGenePredictor(input_dim=14).to(device)
+        try:
+            self.model.load_state_dict(torch.load(model_path, map_location=device))
+            logging.info(f"Loaded model from {model_path}")
+        except Exception as e:
+            logging.error(f"Failed to load model: {e}")
+            raise
+        self.model.eval()
+        self.processor = DNAProcessor()
+        self.post_processor = EnhancedPostProcessor()
+    def predict(self, sequence: str) -> Tuple[np.ndarray, Dict[str, np.ndarray], float]:
+        sequence = sequence.upper()
+        if not re.match('^[ACTGN]+$', sequence):
+            logging.warning("Sequence contains invalid characters. Using 'N' for unknowns.")
+            sequence = ''.join(c if c in 'ACTGN' else 'N' for c in sequence)
+        features = self.processor.create_enhanced_features(sequence).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            outputs = self.model(features)
+            gene_probs = F.softmax(outputs['gene'], dim=-1).cpu().numpy()[0]
+            start_probs = F.softmax(outputs['start'], dim=-1).cpu().numpy()[0]
+            end_probs = F.softmax(outputs['end'], dim=-1).cpu().numpy()[0]
+        predictions = self.post_processor.process_predictions(
+            gene_probs, start_probs, end_probs, sequence
+        )
+        confidence = np.mean(gene_probs[:, 1][predictions == 1]) if np.any(predictions == 1) else 0.0
+        return predictions, {'gene': gene_probs, 'start': start_probs, 'end': end_probs}, confidence
+    def extract_gene_regions(self, predictions: np.ndarray, sequence: str) -> List[Dict]:
+        regions = []
+        changes = np.diff(np.concatenate(([0], predictions, [0])))
+        starts = np.where(changes == 1)[0]
+        ends = np.where(changes == -1)[0]
+        for start, end in zip(starts, ends):
+            gene_seq = sequence[start:end]
+            actual_start_codon = None
+            actual_stop_codon = None
+            if len(gene_seq) >= 3:
+                start_codon = gene_seq[:3]
+                if start_codon in ['ATG', 'GTG', 'TTG']:
+                    actual_start_codon = start_codon
+                if len(gene_seq) >= 6:
+                    for i in range(len(gene_seq) - 2, 2, -3):
+                        codon = gene_seq[i:i+3]
+                        if codon in ['TAA', 'TAG', 'TGA']:
+                            actual_stop_codon = codon
+                            break
+            regions.append({
+                'start': int(start),  # Convert to Python int for JSON serialization
+                'end': int(end),
+                'sequence': gene_seq,  # Return full sequence
+                'length': int(end - start),
+                'start_codon': actual_start_codon,
+                'stop_codon': actual_stop_codon,
+                'in_frame': (end - start) % 3 == 0
+            })
+        return regions
+    def compute_accuracy(self, predictions: np.ndarray, labels: List[int]) -> Dict:
+        min_len = min(len(predictions), len(labels))
+        predictions = predictions[:min_len]
+        labels = np.array(labels[:min_len])
+        accuracy = np.mean(predictions == labels)
+        true_pos = np.sum((predictions == 1) & (labels == 1))
+        false_neg = np.sum((predictions == 0) & (labels == 1))
+        false_pos = np.sum((predictions == 1) & (labels == 0))
+        precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0.0
+        recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0.0
+        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+        return {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1': f1,
+            'true_positives': int(true_pos),
+            'false_positives': int(false_pos),
+            'false_negatives': int(false_neg)
+        }
+    def labels_from_coordinates(self, seq_len: int, start: int, end: int) -> List[int]:
+        labels = [0] * seq_len
+        start = max(0, min(start, seq_len - 1))
+        end = max(start, min(end, seq_len))
+        for i in range(start, end):
+            labels[i] = 1
+        return labels