re-type committed on
Commit
22db390
·
verified ·
1 Parent(s): 74167c4

Update predictor.py

Browse files
Files changed (1) hide show
  1. predictor.py +375 -406
predictor.py CHANGED
@@ -1,414 +1,383 @@
1
- # -*- coding: utf-8 -*-
2
- """predictor.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1JURb-0j-R4LWK3oxeGrNxpJm3V6nnX02
8
- """
9
-
10
- import torch
11
- import torch.nn as nn
12
- import torch.nn.functional as F
13
  import numpy as np
14
- from typing import List, Tuple, Dict, Optional
15
- import logging
16
  import re
17
-
18
- # Configure logging
19
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
20
-
21
- # ============================= MODEL COMPONENTS =============================
22
-
23
class BoundaryAwareGenePredictor(nn.Module):
    """Multi-task model predicting genes, starts, and ends separately.

    A shared trunk (multi-scale 1-D convolutions -> BiLSTM -> self-attention)
    feeds three sibling classifier heads, each producing per-position
    2-class logits for one task: gene body, start boundary, end boundary.
    """

    def __init__(self, input_dim: int = 14, hidden_dim: int = 256,
                 num_layers: int = 3, dropout: float = 0.3):
        super().__init__()
        # Parallel convolutions with growing receptive fields (3/7/15/31).
        # Each emits hidden_dim//4 channels so the concatenation is exactly
        # hidden_dim wide; odd kernels with padding=k//2 preserve length.
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(input_dim, hidden_dim//4, kernel_size=k, padding=k//2)
            for k in [3, 7, 15, 31]
        ])
        # Bidirectional LSTM: hidden_dim//2 per direction keeps output width
        # at hidden_dim.
        self.lstm = nn.LSTM(hidden_dim, hidden_dim//2, num_layers,
                           batch_first=True, bidirectional=True, dropout=dropout)
        self.norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.boundary_attention = nn.MultiheadAttention(hidden_dim, num_heads=8, batch_first=True)

        # Three identical 2-class heads, one per task.
        self.gene_classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim//2, 2)
        )
        self.start_classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim//2, 2)
        )
        self.end_classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim//2, 2)
        )

    def forward(self, x: torch.Tensor, lengths: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Run the trunk and all three heads.

        Args:
            x: feature tensor unpacked here as (batch, seq_len, input_dim).
            lengths: optional per-sequence valid lengths; when given, the
                LSTM consumes a packed sequence so padded positions are
                skipped.

        Returns:
            Dict with 'gene', 'start' and 'end' logit tensors, each
            shaped (batch, seq_len, 2).
        """
        batch_size, seq_len, _ = x.shape
        # Conv1d expects (batch, channels, seq); transpose in and back out.
        x_conv = x.transpose(1, 2)
        conv_features = [F.relu(conv(x_conv)) for conv in self.conv_layers]
        features = torch.cat(conv_features, dim=1).transpose(1, 2)

        if lengths is not None:
            # Pack so the LSTM ignores padding; lengths must live on CPU.
            packed = nn.utils.rnn.pack_padded_sequence(
                features, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
            lstm_out, _ = self.lstm(packed)
            lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        else:
            lstm_out, _ = self.lstm(features)

        lstm_out = self.norm(lstm_out)
        # Self-attention (Q=K=V) over the sequence to pool boundary context.
        attended, _ = self.boundary_attention(lstm_out, lstm_out, lstm_out)
        attended = self.dropout(attended)

        return {
            'gene': self.gene_classifier(attended),
            'start': self.start_classifier(attended),
            'end': self.end_classifier(attended)
        }
82
 
83
- # ============================= DATA PREPROCESSING =============================
84
-
85
class DNAProcessor:
    """DNA sequence processor with boundary-aware features.

    Converts a raw DNA string into the 14-channel per-base feature matrix
    the boundary-aware model consumes.
    """

    def __init__(self):
        # Vocabulary: the four canonical bases plus N for anything unknown.
        self.base_to_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
        self.idx_to_base = {v: k for k, v in self.base_to_idx.items()}
        self.start_codons = {'ATG', 'GTG', 'TTG'}
        self.stop_codons = {'TAA', 'TAG', 'TGA'}

    def encode_sequence(self, sequence: str) -> torch.Tensor:
        """Map a DNA string to a 1-D long tensor of base indices.

        Unrecognised characters fall back to the index of N.
        """
        fallback = self.base_to_idx['N']
        codes = [self.base_to_idx.get(base, fallback) for base in sequence.upper()]
        return torch.tensor(codes, dtype=torch.long)

    def create_enhanced_features(self, sequence: str) -> torch.Tensor:
        """Build the per-base feature matrix of shape (len(sequence), 14).

        Channel layout: one-hot bases (5) + start-codon indicators (3)
        + stop-codon indicators (3) + windowed GC fraction (1)
        + sinusoidal position encoding (2).
        """
        sequence = sequence.upper()
        seq_len = len(sequence)
        encoded = self.encode_sequence(sequence)

        # One-hot encoding via identity-matrix row lookup.
        one_hot = torch.eye(5)[encoded]

        # Codon indicator channels. Start codons carry graded weights
        # (ATG strongest, then GTG, then TTG); stop codons are binary.
        start_channels = {'ATG': (0, 1.0), 'GTG': (1, 0.9), 'TTG': (2, 0.8)}
        stop_channels = {'TAA': 0, 'TAG': 1, 'TGA': 2}
        start_indicators = torch.zeros(seq_len, 3)
        stop_indicators = torch.zeros(seq_len, 3)
        for i in range(seq_len - 2):
            codon = sequence[i:i+3]
            if codon in start_channels:
                channel, weight = start_channels[codon]
                start_indicators[i:i+3, channel] = weight
            if codon in stop_channels:
                stop_indicators[i:i+3, stop_channels[codon]] = 1.0

        # Local GC fraction over a centred 50-base window (clipped at edges).
        window_size = 50
        half = window_size // 2
        gc_content = torch.zeros(seq_len, 1)
        for i in range(seq_len):
            lo, hi = max(0, i - half), min(seq_len, i + half)
            window = sequence[lo:hi]
            if window:
                gc_content[i, 0] = (window.count('G') + window.count('C')) / len(window)

        # Low-frequency sinusoidal position encoding.
        positions = torch.arange(seq_len, dtype=torch.float)
        pos_encoding = torch.stack((torch.sin(positions / 10000),
                                    torch.cos(positions / 10000)), dim=1)

        # 5 + 3 + 3 + 1 + 2 = 14 channels
        return torch.cat((one_hot, start_indicators, stop_indicators,
                          gc_content, pos_encoding), dim=1)
152
-
153
- # ============================= POST-PROCESSING =============================
154
-
155
class EnhancedPostProcessor:
    """Enhanced post-processor with stricter boundary detection.

    Converts per-position gene/start/end probabilities into a binary
    per-position gene mask, snapping region boundaries to nearby codons
    (when the sequence is available) and enforcing length/frame
    constraints.
    """

    def __init__(self, min_gene_length: int = 150, max_gene_length: int = 5000):
        # Accepted gene length range, in nucleotides.
        self.min_gene_length = min_gene_length
        self.max_gene_length = max_gene_length
        self.start_codons = {'ATG', 'GTG', 'TTG'}
        self.stop_codons = {'TAA', 'TAG', 'TGA'}

    def process_predictions(self, gene_probs: np.ndarray, start_probs: np.ndarray,
                            end_probs: np.ndarray, sequence: str = None) -> np.ndarray:
        """Process predictions with enhanced boundary detection.

        gene_probs/start_probs/end_probs are (seq_len, 2) probability
        arrays; column 1 is the positive class. Returns a 0/1 array of
        length seq_len.
        """
        # More conservative thresholds: gene body needs 0.6, start 0.4, end 0.5.
        gene_pred = (gene_probs[:, 1] > 0.6).astype(int)
        start_pred = (start_probs[:, 1] > 0.4).astype(int)
        end_pred = (end_probs[:, 1] > 0.5).astype(int)

        if sequence is not None:
            # Codon-aware refinement when the raw sequence is available.
            processed = self._refine_with_codons_and_boundaries(
                gene_pred, start_pred, end_pred, sequence
            )
        else:
            processed = self._refine_with_boundaries(gene_pred, start_pred, end_pred)

        processed = self._apply_constraints(processed, sequence)

        return processed

    def _refine_with_codons_and_boundaries(self, gene_pred: np.ndarray,
                                           start_pred: np.ndarray, end_pred: np.ndarray,
                                           sequence: str) -> np.ndarray:
        """Snap each predicted region to the best-scoring nearby start/stop codon."""
        # NOTE(review): this initial copy is immediately overwritten by
        # np.zeros_like below, so it is effectively dead.
        refined = gene_pred.copy()
        sequence = sequence.upper()

        # Index every start codon (by its first base) and every stop codon
        # (by the position just past its last base).
        start_codon_positions = []
        stop_codon_positions = []

        for i in range(len(sequence) - 2):
            codon = sequence[i:i+3]
            if codon in self.start_codons:
                start_codon_positions.append(i)
            if codon in self.stop_codons:
                stop_codon_positions.append(i + 3)

        # Edges of the 0->1 / 1->0 transitions give region [start, end) pairs.
        changes = np.diff(np.concatenate(([0], gene_pred, [0])))
        gene_starts = np.where(changes == 1)[0]
        gene_ends = np.where(changes == -1)[0]

        refined = np.zeros_like(gene_pred)

        for g_start, g_end in zip(gene_starts, gene_ends):
            best_start = g_start
            start_window = 100  # Increased from 50
            nearby_starts = [pos for pos in start_codon_positions
                             if abs(pos - g_start) <= start_window]

            if nearby_starts:
                start_scores = []
                for pos in nearby_starts:
                    if pos < len(start_pred):
                        codon = sequence[pos:pos+3]
                        # Weight ATG > GTG > TTG, combined with the model's
                        # start-boundary signal minus a distance penalty.
                        codon_weight = 1.0 if codon == 'ATG' else (0.9 if codon == 'GTG' else 0.8)
                        boundary_score = start_pred[pos]
                        distance_penalty = abs(pos - g_start) / start_window * 0.2  # Add distance penalty
                        score = codon_weight * 0.5 + boundary_score * 0.4 - distance_penalty
                        start_scores.append((score, pos))

                if start_scores:
                    best_start = max(start_scores, key=lambda x: x[0])[1]

            best_end = g_end
            end_window = 100
            nearby_ends = [pos for pos in stop_codon_positions
                           if g_start < pos <= g_end + end_window]

            if nearby_ends:
                end_scores = []
                for pos in nearby_ends:
                    gene_length = pos - best_start
                    if self.min_gene_length <= gene_length <= self.max_gene_length:
                        if pos < len(end_pred):
                            # Reward in-frame ends; gently penalise lengths
                            # far from 1000 nt.
                            frame_bonus = 0.2 if (pos - best_start) % 3 == 0 else 0
                            boundary_score = end_pred[pos]
                            length_penalty = abs(gene_length - 1000) / 10000
                            score = boundary_score + frame_bonus - length_penalty
                            end_scores.append((score, pos))

                if end_scores:
                    best_end = max(end_scores, key=lambda x: x[0])[1]

            # Keep the region only if the snapped boundaries form a valid span.
            gene_length = best_end - best_start
            if (gene_length >= self.min_gene_length and
                gene_length <= self.max_gene_length and
                best_start < best_end):
                refined[best_start:best_end] = 1

        return refined

    def _refine_with_boundaries(self, gene_pred: np.ndarray, start_pred: np.ndarray,
                                end_pred: np.ndarray) -> np.ndarray:
        """Sequence-free fallback: snap regions to the nearest boundary hits."""
        refined = gene_pred.copy()
        changes = np.diff(np.concatenate(([0], gene_pred, [0])))
        gene_starts = np.where(changes == 1)[0]
        gene_ends = np.where(changes == -1)[0]

        for g_start, g_end in zip(gene_starts, gene_ends):
            # Look for a start-boundary hit within +/-30 of the region start.
            start_window = slice(max(0, g_start-30), min(len(start_pred), g_start+30))
            start_candidates = np.where(start_pred[start_window])[0]
            if len(start_candidates) > 0:
                relative_positions = start_candidates + max(0, g_start-30)
                distances = np.abs(relative_positions - g_start)
                best_start_idx = np.argmin(distances)
                new_start = relative_positions[best_start_idx]
                # NOTE(review): the else-branch writes the slice back onto
                # itself (a no-op); only the new_start > g_start case clears.
                refined[g_start:new_start] = 0 if new_start > g_start else refined[g_start:new_start]
                refined[new_start:g_end] = 1
                g_start = new_start

            # Look for an end-boundary hit within +/-50 of the region end.
            end_window = slice(max(0, g_end-50), min(len(end_pred), g_end+50))
            end_candidates = np.where(end_pred[end_window])[0]
            if len(end_candidates) > 0:
                relative_positions = end_candidates + max(0, g_end-50)
                valid_ends = [pos for pos in relative_positions
                              if self.min_gene_length <= pos - g_start <= self.max_gene_length]
                if valid_ends:
                    distances = np.abs(np.array(valid_ends) - g_end)
                    new_end = valid_ends[np.argmin(distances)]
                    refined[g_start:new_end] = 1
                    # Same self-assignment no-op pattern as above.
                    refined[new_end:g_end] = 0 if new_end < g_end else refined[new_end:g_end]

        return refined

    def _apply_constraints(self, predictions: np.ndarray, sequence: str = None) -> np.ndarray:
        """Drop out-of-range regions; trim to a multiple of 3 when a sequence is given."""
        processed = predictions.copy()
        changes = np.diff(np.concatenate(([0], predictions, [0])))
        starts = np.where(changes == 1)[0]
        ends = np.where(changes == -1)[0]

        for start, end in zip(starts, ends):
            gene_length = end - start
            if gene_length < self.min_gene_length or gene_length > self.max_gene_length:
                processed[start:end] = 0
                continue
            if sequence is not None:
                if gene_length % 3 != 0:
                    # Trim the tail to restore the reading frame, or drop the
                    # region entirely if trimming makes it too short.
                    new_length = (gene_length // 3) * 3
                    if new_length >= self.min_gene_length:
                        new_end = start + new_length
                        processed[new_end:end] = 0
                    else:
                        processed[start:end] = 0

        return processed
308
-
309
- # ============================= PREDICTION =============================
310
-
311
class GenePredictor:
    """Handles gene prediction using the trained boundary-aware model.

    Loads model weights at construction time (raising on failure), then
    exposes prediction, region extraction, and evaluation helpers.
    """

    def __init__(self, model_path: str = 'model/best_boundary_aware_model.pth',
                 device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
        self.device = device
        self.model = BoundaryAwareGenePredictor(input_dim=14).to(device)
        try:
            self.model.load_state_dict(torch.load(model_path, map_location=device))
            logging.info(f"Loaded model from {model_path}")
        except Exception as e:
            # Construction fails loudly if weights cannot be loaded.
            logging.error(f"Failed to load model: {e}")
            raise
        self.model.eval()
        self.processor = DNAProcessor()
        self.post_processor = EnhancedPostProcessor()

    def predict(self, sequence: str) -> Tuple[np.ndarray, Dict[str, np.ndarray], float]:
        """Predict gene regions for one sequence.

        Returns (binary per-position predictions, dict of per-task
        softmax probability arrays, mean gene probability over predicted
        positions — 0.0 when nothing is predicted).
        """
        sequence = sequence.upper()
        # Sanitize: anything outside ACTGN becomes N.
        if not re.match('^[ACTGN]+$', sequence):
            logging.warning("Sequence contains invalid characters. Using 'N' for unknowns.")
            sequence = ''.join(c if c in 'ACTGN' else 'N' for c in sequence)

        features = self.processor.create_enhanced_features(sequence).unsqueeze(0).to(self.device)

        with torch.no_grad():
            outputs = self.model(features)
            # [0] drops the singleton batch dimension.
            gene_probs = F.softmax(outputs['gene'], dim=-1).cpu().numpy()[0]
            start_probs = F.softmax(outputs['start'], dim=-1).cpu().numpy()[0]
            end_probs = F.softmax(outputs['end'], dim=-1).cpu().numpy()[0]

        predictions = self.post_processor.process_predictions(
            gene_probs, start_probs, end_probs, sequence
        )
        # Confidence = mean positive-class probability over predicted positions.
        confidence = np.mean(gene_probs[:, 1][predictions == 1]) if np.any(predictions == 1) else 0.0

        return predictions, {'gene': gene_probs, 'start': start_probs, 'end': end_probs}, confidence

    def extract_gene_regions(self, predictions: np.ndarray, sequence: str) -> List[Dict]:
        """Turn the binary mask into a list of region dicts with codon info."""
        regions = []
        changes = np.diff(np.concatenate(([0], predictions, [0])))
        starts = np.where(changes == 1)[0]
        ends = np.where(changes == -1)[0]

        for start, end in zip(starts, ends):
            gene_seq = sequence[start:end]
            actual_start_codon = None
            actual_stop_codon = None

            if len(gene_seq) >= 3:
                start_codon = gene_seq[:3]
                if start_codon in ['ATG', 'GTG', 'TTG']:
                    actual_start_codon = start_codon

            if len(gene_seq) >= 6:
                # Scan backwards in codon steps for the last in-frame stop.
                for i in range(len(gene_seq) - 2, 2, -3):
                    codon = gene_seq[i:i+3]
                    if codon in ['TAA', 'TAG', 'TGA']:
                        actual_stop_codon = codon
                        break

            regions.append({
                'start': int(start),  # Convert to Python int for JSON serialization
                'end': int(end),
                'sequence': gene_seq,  # Return full sequence
                'length': int(end - start),
                'start_codon': actual_start_codon,
                'stop_codon': actual_stop_codon,
                'in_frame': (end - start) % 3 == 0
            })

        return regions

    def compute_accuracy(self, predictions: np.ndarray, labels: List[int]) -> Dict:
        """Position-wise accuracy/precision/recall/F1 against 0/1 labels.

        Arrays are truncated to the shorter of the two lengths before
        comparison.
        """
        min_len = min(len(predictions), len(labels))
        predictions = predictions[:min_len]
        labels = np.array(labels[:min_len])

        accuracy = np.mean(predictions == labels)
        true_pos = np.sum((predictions == 1) & (labels == 1))
        false_neg = np.sum((predictions == 0) & (labels == 1))
        false_pos = np.sum((predictions == 1) & (labels == 0))

        # Guard each ratio against a zero denominator.
        precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0.0
        recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'true_positives': int(true_pos),
            'false_positives': int(false_pos),
            'false_negatives': int(false_neg)
        }

    def labels_from_coordinates(self, seq_len: int, start: int, end: int) -> List[int]:
        """Build a 0/1 label list marking [start, end), clamped to the sequence."""
        labels = [0] * seq_len
        start = max(0, min(start, seq_len - 1))
        end = max(start, min(end, seq_len))
        for i in range(start, end):
            labels[i] = 1
        return labels
 
1
+ # Improved F Gene Prediction Functions
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
 
 
3
  import re
4
+ import logging
5
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
6
+
7
def preprocess_sequence_for_ndv_f_gene(sequence):
    """Normalise a raw nucleotide string for NDV F gene analysis.

    Uppercases and strips the input, removes every character that is not
    A/T/C/G/N, re-anchors at the first ATG when the sequence does not
    already begin with one, and trims the tail so the length is a
    multiple of three. Best effort: on an unexpected error the current
    value of the sequence is returned unchanged.
    """
    try:
        sequence = sequence.upper().strip()
        sequence = re.sub(r'[^ATCGN]', '', sequence)

        # NDV F gene is typically ~1662-1800 nt; only warn on short input.
        if len(sequence) < 1000:
            logging.warning(f"Sequence length ({len(sequence)}) shorter than typical NDV F gene (1662-1800 nt)")

        # Re-anchor at the first ATG when the sequence does not start with one.
        if not sequence.startswith('ATG'):
            logging.warning("Sequence doesn't start with ATG start codon")
            anchor = sequence.find('ATG')
            if anchor != -1:
                sequence = sequence[anchor:]
                logging.info(f"Found ATG at position {anchor}, using sequence from there")

        # Drop trailing bases so the length is a multiple of three.
        overhang = len(sequence) % 3
        if overhang:
            sequence = sequence[:-overhang]
            logging.info(f"Trimmed sequence to maintain reading frame: {len(sequence)} nt")

        # NDV F protein landmarks (fusion peptide ~117-137, heptad repeats)
        # are noted here but not checked in this step.

        return sequence

    except Exception as e:
        logging.error(f"Sequence preprocessing failed: {e}")
        return sequence
46
+
47
def enhanced_keras_prediction(sequence, keras_model, kmer_to_index, kmer_size=6):
    """Score a sequence with the k-mer based Keras model.

    Returns a metrics dict on success; otherwise a human-readable error
    string (model missing, sequence too short, or prediction failure).
    """
    try:
        # Both the model and its k-mer vocabulary are required.
        if not keras_model or not kmer_to_index:
            return "Keras model not available"

        processed_seq = preprocess_sequence_for_ndv_f_gene(sequence)

        if len(processed_seq) < kmer_size:
            return f"Sequence too short for k-mer prediction (minimum {kmer_size} nucleotides required)"

        # Slide a kmer_size window across the whole sequence.
        kmers = [processed_seq[i:i + kmer_size]
                 for i in range(len(processed_seq) - kmer_size + 1)]

        # Vocabulary lookup; out-of-vocabulary k-mers map to index 0.
        indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
        unknown_kmers = sum(1 for kmer in kmers if kmer not in kmer_to_index)

        logging.info(f"Generated {len(kmers)} k-mers, {unknown_kmers} unknown k-mers")

        # Single-sample batch for the model.
        prediction = keras_model.predict(np.array([indices]), verbose=0)[0]

        # Summary statistics over the model's output vector.
        max_prob = np.max(prediction)
        mean_prob = np.mean(prediction)
        confidence_score = max_prob
        consistency_score = 1.0 - np.std(prediction)  # Lower std = more consistent

        return {
            'raw_prediction': prediction.tolist(),
            'max_probability': float(max_prob),
            'mean_probability': float(mean_prob),
            'confidence_score': float(confidence_score),
            'consistency_score': float(consistency_score),
            'sequence_length': len(processed_seq),
            'kmers_generated': len(kmers),
            'unknown_kmers': unknown_kmers,
            'kmer_coverage': 1.0 - (unknown_kmers / len(kmers)) if kmers else 0.0
        }

    except Exception as e:
        logging.error(f"Enhanced Keras prediction failed: {e}")
        return f"Enhanced Keras prediction failed: {str(e)}"
106
+
107
def enhanced_classify_sequence(sequence, classifier_model, classifier_kmer_to_index, classifier_maxlen, labels):
    """Enhanced classification with NDV F gene specific improvements.

    Encodes the (preprocessed) sequence as 6-mer indices, pads to the
    classifier's fixed input length, predicts a label, then applies
    F-gene-specific heuristics. Always returns a dict of the shape:

        {"status": "success" | "warning" | "error",
         "message": str,
         "confidence": float | None,
         "predicted_label": str | None,
         "details": dict}
    """
    try:
        # All three classifier artefacts are required.
        if not classifier_model or not classifier_kmer_to_index or classifier_maxlen is None:
            return {
                "status": "error",
                "message": "Classification model not available",
                "confidence": None,
                "predicted_label": None,
                "details": {}
            }

        # Preprocess sequence (cleanup, ATG anchoring, frame trim).
        processed_seq = preprocess_sequence_for_ndv_f_gene(sequence)

        # NDV F gene specific length check — bail early on short input.
        if len(processed_seq) < 1000:
            return {
                "status": "warning",
                "message": f"Sequence shorter than typical NDV F gene ({len(processed_seq)} < 1000 nt)",
                "confidence": None,
                "predicted_label": None,
                "details": {"sequence_length": len(processed_seq)}
            }

        # Generate overlapping k-mers (6-mers).
        kmer_size = 6
        tokens = [processed_seq[i:i+kmer_size] for i in range(len(processed_seq)-kmer_size+1)]

        # Encode k-mers; out-of-vocabulary k-mers map to index 0 and are counted.
        encoded = []
        unknown_count = 0
        for kmer in tokens:
            if kmer in classifier_kmer_to_index:
                encoded.append(classifier_kmer_to_index[kmer])
            else:
                encoded.append(0)  # Unknown k-mer
                unknown_count += 1

        # Pad to the classifier's fixed input length.
        padded = pad_sequences([encoded], maxlen=classifier_maxlen, padding='post')

        # Predict and pick the arg-max class.
        pred = classifier_model.predict(padded, verbose=0)
        predicted_class = int(np.argmax(pred))
        confidence = float(np.max(pred))
        predicted_label = labels[predicted_class] if predicted_class < len(labels) else "Unknown"

        # Additional quality metrics.
        kmer_coverage = 1.0 - (unknown_count / len(tokens)) if tokens else 0.0
        prediction_entropy = -np.sum(pred[0] * np.log(pred[0] + 1e-10))  # Lower entropy = more confident

        details = {
            "sequence_length": len(processed_seq),
            "kmers_generated": len(tokens),
            "unknown_kmers": unknown_count,
            "kmer_coverage": kmer_coverage,
            "prediction_entropy": float(prediction_entropy),
            "all_probabilities": {labels[i]: float(pred[0][i]) for i in range(len(labels)) if i < len(pred[0])},
            "starts_with_atg": processed_seq.startswith('ATG'),
            "length_in_frame": len(processed_seq) % 3 == 0
        }

        # Enhanced decision logic for NDV F gene.
        if predicted_label == "F":
            # Multiply bonuses onto the raw confidence; the boosted score is
            # capped at 1.0 only in the reported details, not in the branching.
            f_gene_score = confidence

            # Bonus for good k-mer coverage.
            if kmer_coverage > 0.8:
                f_gene_score *= 1.1

            # Bonus for proper start codon.
            if processed_seq.startswith('ATG'):
                f_gene_score *= 1.05

            # Bonus for proper reading frame.
            if len(processed_seq) % 3 == 0:
                f_gene_score *= 1.05

            # Bonus for appropriate length (NDV F gene is ~1662-1800 nt).
            if 1500 <= len(processed_seq) <= 2000:
                f_gene_score *= 1.1

            details["enhanced_f_score"] = min(f_gene_score, 1.0)

            if f_gene_score > 0.7:
                return {
                    "status": "success",
                    "message": "NDV F gene detected with high confidence",
                    "confidence": confidence,
                    "predicted_label": predicted_label,
                    "details": details
                }
            elif f_gene_score > 0.5:
                return {
                    "status": "success",
                    "message": "NDV F gene detected with moderate confidence",
                    "confidence": confidence,
                    "predicted_label": predicted_label,
                    "details": details
                }
            else:
                return {
                    "status": "warning",
                    "message": "Possible F gene but low confidence - check sequence quality",
                    "confidence": confidence,
                    "predicted_label": predicted_label,
                    "details": details
                }

        elif predicted_label == "Random":
            # "Random" with poor coverage is reported as a quality problem
            # rather than a definitive negative.
            if kmer_coverage < 0.5:
                return {
                    "status": "error",
                    "message": f"Poor sequence quality detected (coverage: {kmer_coverage:.1%}). Check for sequencing errors.",
                    "confidence": confidence,
                    "predicted_label": predicted_label,
                    "details": details
                }
            else:
                return {
                    "status": "error",
                    "message": "Sequence does not appear to be NDV F gene. Verify input sequence.",
                    "confidence": confidence,
                    "predicted_label": predicted_label,
                    "details": details
                }

        else:
            # Some other gene label was predicted.
            return {
                "status": "error",
                "message": f"Detected as {predicted_label} gene, not F gene. Please provide NDV F gene sequence.",
                "confidence": confidence,
                "predicted_label": predicted_label,
                "details": details
            }

    except Exception as e:
        logging.error(f"Enhanced classification failed: {e}")
        return {
            "status": "error",
            "message": f"Classification failed: {str(e)}",
            "confidence": None,
            "predicted_label": None,
            "details": {"error": str(e)}
        }
256
 
257
def validate_ndv_f_gene_sequence(sequence):
    """Additional validation specific to NDV F gene characteristics.

    Checks length, start/stop codons, reading frame, GC content and
    ambiguous-base (N) content against typical NDV F gene expectations.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence (expected uppercase A/T/C/G/N).

    Returns
    -------
    tuple[list[str], list[str]]
        (issues, suggestions) — human-readable validation findings;
        both empty when nothing is flagged.
    """
    issues = []
    suggestions = []

    # Length check — NDV F gene is typically 1662-1800 nt.
    if len(sequence) < 1500:
        issues.append(f"Sequence length ({len(sequence)}) shorter than typical NDV F gene (1662-1800 nt)")
        suggestions.append("Verify complete F gene sequence was provided")
    elif len(sequence) > 2000:
        issues.append(f"Sequence length ({len(sequence)}) longer than typical NDV F gene")
        suggestions.append("Check if sequence contains additional regions beyond F gene")

    # Start codon check.
    if not sequence.startswith('ATG'):
        issues.append("Sequence doesn't start with ATG start codon")
        suggestions.append("Ensure sequence starts from the translation start site")

    # Reading frame check.
    if len(sequence) % 3 != 0:
        issues.append("Sequence length not divisible by 3 (reading frame issue)")
        suggestions.append("Check for insertions/deletions or trim to proper reading frame")

    # Stop codon check.
    if len(sequence) >= 3:
        last_codon = sequence[-3:]
        stop_codons = ['TAA', 'TAG', 'TGA']
        if last_codon not in stop_codons:
            issues.append(f"Sequence doesn't end with stop codon (ends with {last_codon})")
            suggestions.append("Verify complete F gene sequence including stop codon")

    # Composition checks need a non-empty sequence — guard the divisions
    # (the previous version raised ZeroDivisionError on "").
    if sequence:
        # Nucleotide composition check.
        gc_content = (sequence.count('G') + sequence.count('C')) / len(sequence) * 100
        if gc_content < 30 or gc_content > 70:
            issues.append(f"Unusual GC content: {gc_content:.1f}% (typical range: 35-65%)")
            suggestions.append("Verify sequence quality and correct nucleotide composition")

        # Check for too many N's (ambiguous nucleotides).
        n_content = sequence.count('N') / len(sequence) * 100
        if n_content > 5:
            issues.append(f"High ambiguous nucleotide content: {n_content:.1f}% N's")
            suggestions.append("Consider resequencing regions with ambiguous nucleotides")

    return issues, suggestions
301
+
302
+ # Updated run_pipeline function with enhanced predictions
303
def enhanced_run_pipeline(dna_input, keras_model, kmer_to_index, classifier_model,
                         classifier_kmer_to_index, classifier_maxlen, labels,
                         similarity_score=95.0, build_ml_tree=False):
    """Enhanced pipeline with improved F gene prediction.

    Sanitises the input, validates it against NDV F gene expectations,
    runs the k-mer Keras predictor and the classifier, and formats the
    findings into a fixed 13-tuple:

        (boundary_output, keras_output, classifier_status,
         classifier_message, classifier_label, confidence_str,
         ml_tree_status, analysis_status, html_summary,
         None, None, None, final_message)

    NOTE(review): the 13-slot ordering presumably mirrors a UI's output
    components — confirm against the caller before changing it.
    NOTE(review): `similarity_score` is never used in this function;
    `build_ml_tree` only toggles a status string.
    """
    try:
        # Input validation and preprocessing.
        dna_input = dna_input.upper().strip()
        if not dna_input:
            return "Empty input", "", "", "", "", "", "", "", "", None, None, None, "No input provided"

        # Clean sequence: anything outside ACTGN becomes N.
        if not re.match('^[ACTGN]+$', dna_input):
            dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
            logging.info("DNA sequence sanitized")

        # Validate NDV F gene characteristics.
        validation_issues, validation_suggestions = validate_ndv_f_gene_sequence(dna_input)

        # Step 1: Enhanced Keras Prediction. A dict means success; any other
        # return value is an error string and is shown as-is.
        keras_result = enhanced_keras_prediction(dna_input, keras_model, kmer_to_index)
        if isinstance(keras_result, dict):
            keras_output = f"Prediction confidence: {keras_result['confidence_score']:.3f}\n"
            keras_output += f"K-mer coverage: {keras_result['kmer_coverage']:.1%}\n"
            keras_output += f"Sequence length: {keras_result['sequence_length']} nt"
            if keras_result['kmer_coverage'] < 0.8:
                keras_output += "\n⚠️ Low k-mer coverage - may affect accuracy"
        else:
            keras_output = str(keras_result)

        # Step 2: Enhanced Classification.
        classifier_result = enhanced_classify_sequence(
            dna_input, classifier_model, classifier_kmer_to_index, classifier_maxlen, labels
        )

        classifier_status = classifier_result["status"]
        classifier_message = classifier_result["message"]
        classifier_label = classifier_result["predicted_label"]
        classifier_confidence = classifier_result["confidence"]

        # Add validation feedback (at most 3 issues / 3 suggestions shown).
        if validation_issues:
            classifier_message += f"\n\n⚠️ Sequence validation issues:\n" + "\n".join(f"• {issue}" for issue in validation_issues[:3])
            if validation_suggestions:
                classifier_message += f"\n\n💡 Suggestions:\n" + "\n".join(f"• {sug}" for sug in validation_suggestions[:3])

        # Enhanced confidence reporting: top-3 class probabilities, if present.
        if classifier_result.get("details"):
            details = classifier_result["details"]
            if "all_probabilities" in details:
                probs = details["all_probabilities"]
                classifier_message += f"\n\nPrediction probabilities:"
                for label, prob in sorted(probs.items(), key=lambda x: x[1], reverse=True)[:3]:
                    classifier_message += f"\n• {label}: {prob:.1%}"

        # Return enhanced results.
        boundary_output = f"Enhanced preprocessing applied. Length: {len(dna_input)} bp"
        if validation_issues:
            boundary_output += f"\n{len(validation_issues)} validation issues detected"

        # NOTE(review): a falsy confidence (0.0 or None) renders as "N/A".
        return (
            boundary_output,
            keras_output,
            classifier_status,
            classifier_message,
            classifier_label or "Unknown",
            f"{classifier_confidence:.3f}" if classifier_confidence else "N/A",
            "ML tree not requested" if not build_ml_tree else "ML tree processing...",
            "Enhanced analysis completed",
            "<p>Enhanced F gene analysis completed</p>",
            None, None, None,
            f"Enhanced pipeline completed. Processed {len(dna_input)} bp sequence."
        )

    except Exception as e:
        # Error path preserves the same 13-slot tuple shape.
        error_msg = f"Enhanced pipeline failed: {str(e)}"
        logging.error(error_msg)
        return (
            error_msg, "", "error", error_msg, "Error", "0.000",
            "", "", f"<p>Error: {error_msg}</p>",
            None, None, None, error_msg
        )