Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Files Community

re-type committed on Jun 14, 2025

Commit

2438908

verified ·

1 Parent(s): 183a83a

Delete analyzer.py

Browse files

Files changed (1) hide show

analyzer.py +0 -1577

analyzer.py DELETED Viewed

@@ -1,1577 +0,0 @@
-import pandas as pd
-import numpy as np
-import plotly.graph_objects as go
-from Bio import SeqIO, AlignIO
-from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor, DistanceMatrix
-from Bio.Phylo.BaseTree import Tree
-from Bio.Align import MultipleSeqAlignment
-from Bio.Seq import Seq
-from Bio.SeqRecord import SeqRecord
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder
-import warnings
-import os
-import sys
-import re
-import time
-from pathlib import Path
-from typing import Dict, List, Tuple, Optional
-import itertools
-import argparse
-warnings.filterwarnings('ignore')
-class PhylogeneticTreeAnalyzer:
-    """Analyzes phylogenetic relationships using ML-based sequence similarity and tree construction."""
-    def __init__(self):
-        """Give every analyzer attribute its initial value; nothing is loaded or trained here."""
-        # Loaded dataset and the query currently being analysed.
-        self.data = None
-        self.query_id = None
-        self.query_sequence = None
-        # Similarity-search settings and results.
-        self.matching_percentage = 95.0
-        self.actual_percentage = None
-        self.matched_sequences = []
-        self.similarity_scores = {}
-        # Classifier for the 'ML' column, with its label encoder and test accuracy.
-        self.ai_model = None
-        self.label_encoder = LabelEncoder()
-        self.ml_model_accuracy = None
-        # Classifier for the 'Genotype' column, with its label encoder and test accuracy.
-        self.genotype_model = None
-        self.genotype_label_encoder = LabelEncoder()
-        self.genotype_model_accuracy = None
-        # Results of the ML tree analysis step.
-        self.ml_tree = None
-        self.ml_alignment = None
-        self.ml_results = {}
-        # Tree layout state.
-        self.tree_structure = {}
-        self.horizontal_line_tracker = []
-        self.query_ml_group = None
-        self.base_horizontal_length = 1.2
-    # --- Data Loading ---
-    def load_data(self, data_file: str) -> bool:
-        """Load sequence records from a CSV file into ``self.data``.
-
-        The frame is stored only after it has been read and the ``ML`` and
-        ``Genotype`` columns used by the summary line were found, so a failed
-        load leaves ``self.data`` exactly as it was.
-
-        Args:
-            data_file: Path of the CSV file to read.
-
-        Returns:
-            True when the file was loaded, False otherwise (the reason is printed).
-        """
-        try:
-            frame = pd.read_csv(data_file)
-            # Column lookups raise KeyError here, before anything is stored.
-            ml_group_count = frame['ML'].nunique()
-            genotype_count = frame['Genotype'].nunique()
-            print(f"✓ Data loaded: {len(frame)} sequences, "
-                  f"{ml_group_count} ML groups, "
-                  f"{genotype_count} genotypes")
-            self.data = frame
-            return True
-        except Exception as e:
-            print(f"Error loading data: {e}")
-            return False
-    # --- Model Training ---
-    def train_ai_model(self) -> bool:
-        """Train RandomForest classifiers for the ``ML`` and ``Genotype`` columns.
-
-        Each ``F-gene`` value is upper-cased, stripped of everything except
-        A/T/G/C and turned into a k-mer count vector by ``_kmer_feature_vector``.
-        The genotype model is trained only when at least two genotype classes exist.
-
-        Returns:
-            True when the ML model was trained. False when no data is loaded,
-            there are fewer than 10 rows, fewer than 2 ML classes, or an error
-            occurred (the reason is printed).
-        """
-        try:
-            if self.data is None or len(self.data) < 10:
-                print("⚠️ Insufficient data for training (minimum 10 samples)")
-                return False
-            print("🤖 Training AI models...")
-            f_gene_sequences = self.data['F-gene'].fillna('').astype(str)
-            # Sequences shorter than 3 bases contain no k-mers, so the helper
-            # yields the same all-zero row the original special case produced.
-            X = np.array([
-                self._kmer_feature_vector(re.sub(r'[^ATGC]', '', seq.upper()))
-                for seq in f_gene_sequences
-            ])
-            # Train ML model
-            ml_targets = self.label_encoder.fit_transform(self.data['ML'].fillna('Unknown'))
-            if len(np.unique(ml_targets)) < 2:
-                print("⚠️ Need at least 2 ML classes for training")
-                return False
-            X_train, X_test, y_train, y_test = train_test_split(X, ml_targets, test_size=0.2, random_state=42)
-            self.ai_model = RandomForestClassifier(n_estimators=100, random_state=42)
-            self.ai_model.fit(X_train, y_train)
-            self.ml_model_accuracy = self.ai_model.score(X_test, y_test)
-            print(f"✓ ML model trained with accuracy: {self.ml_model_accuracy:.2%}")
-            # Train genotype model
-            genotype_targets = self.genotype_label_encoder.fit_transform(self.data['Genotype'].fillna('Unknown'))
-            if len(np.unique(genotype_targets)) >= 2:
-                X_train, X_test, y_train, y_test = train_test_split(X, genotype_targets, test_size=0.2, random_state=42)
-                self.genotype_model = RandomForestClassifier(n_estimators=100, random_state=42)
-                self.genotype_model.fit(X_train, y_train)
-                self.genotype_model_accuracy = self.genotype_model.score(X_test, y_test)
-                print(f"✓ Genotype model trained with accuracy: {self.genotype_model_accuracy:.2%}")
-            return True
-        except Exception as e:
-            print(f"Error training models: {e}")
-            return False
-    def predict_ml_group(self, sequence: str) -> str:
-        """Predict the ML group of ``sequence`` with the trained ML model.
-
-        Returns:
-            The decoded label, or "Unknown" when no model is trained, the
-            cleaned sequence has fewer than 3 bases, or prediction fails.
-        """
-        try:
-            if not self.ai_model:
-                return "Unknown"
-            seq_clean = re.sub(r'[^ATGC]', '', sequence.upper())
-            if len(seq_clean) < 3:
-                return "Unknown"
-            X = np.array([self._kmer_feature_vector(seq_clean)])
-            return self.label_encoder.inverse_transform(self.ai_model.predict(X))[0]
-        except Exception as e:
-            print(f"Error predicting ML group: {e}")
-            return "Unknown"
-    def predict_genotype(self, sequence: str) -> str:
-        """Predict the genotype of ``sequence`` with the trained genotype model.
-
-        Returns:
-            The decoded label, or "Unknown" when no model is trained, the
-            cleaned sequence has fewer than 3 bases, or prediction fails.
-        """
-        try:
-            if not self.genotype_model:
-                return "Unknown"
-            seq_clean = re.sub(r'[^ATGC]', '', sequence.upper())
-            if len(seq_clean) < 3:
-                return "Unknown"
-            X = np.array([self._kmer_feature_vector(seq_clean)])
-            return self.genotype_label_encoder.inverse_transform(self.genotype_model.predict(X))[0]
-        except Exception as e:
-            print(f"Error predicting genotype: {e}")
-            return "Unknown"
-    def _kmer_feature_vector(self, seq_clean: str) -> List[int]:
-        """Return the 100-value k-mer count vector used for training and prediction.
-
-        The vector holds the counts of the first 50 3-mers followed by the
-        counts of the first 50 4-mers, each enumerated in
-        ``itertools.product('ATGC', repeat=k)`` order.
-
-        Args:
-            seq_clean: Sequence already reduced to upper-case A/T/G/C.
-        """
-        from collections import Counter  # local import: not in the module's import block
-        feature_vector = []
-        for k in (3, 4):
-            # One pass over the sequence instead of list.count() per distinct k-mer.
-            counts = Counter(seq_clean[i:i + k] for i in range(len(seq_clean) - k + 1))
-            vocabulary = itertools.islice(itertools.product('ATGC', repeat=k), 50)
-            feature_vector.extend(counts[''.join(kmer)] for kmer in vocabulary)
-        return feature_vector
-    # --- Sequence Processing ---
-    def find_query_sequence(self, query_input: str) -> bool:
-        """Resolve ``query_input`` to a query id and sequence.
-
-        The input is tried, in order, as (1) an accession number present in
-        the data, (2) a sequence equal to a stored ``F-gene`` value after
-        cleaning, (3) a novel sequence of at least 10 A/T/G/C bases, which
-        gets a generated ``QUERY_#####`` id.
-
-        Args:
-            query_input: Accession number or nucleotide sequence.
-
-        Returns:
-            True when ``self.query_id`` and ``self.query_sequence`` were set,
-            False when the input was rejected or an error occurred.
-        """
-        import zlib  # local import: not in the module's import block
-        try:
-            query_input = query_input.strip()
-            if query_input in self.data['Accession Number'].values:
-                self.query_id = query_input
-                query_row = self.data[self.data['Accession Number'] == query_input].iloc[0]
-                self.query_sequence = query_row['F-gene']
-                print(f"✓ Query found by accession: {query_input}, ML: {query_row['ML']}, Genotype: {query_row['Genotype']}")
-                return True
-            query_clean = re.sub(r'[^ATGC]', '', str(query_input).upper())
-            if query_clean in self.data['F-gene'].values:
-                query_row = self.data[self.data['F-gene'] == query_clean].iloc[0]
-                self.query_id = query_row['Accession Number']
-                self.query_sequence = query_clean
-                print(f"✓ Query matched to accession: {self.query_id}, ML: {query_row['ML']}, Genotype: {query_row['Genotype']}")
-                return True
-            if len(query_clean) >= 10:
-                # The built-in hash() of a str is salted per process, so it gave the
-                # same sequence a different id on every run; crc32 is stable.
-                checksum = zlib.crc32(query_clean.encode('ascii'))
-                self.query_id = f"QUERY_{checksum % 100000:05d}"
-                self.query_sequence = query_clean
-                predicted_ml = self.predict_ml_group(query_clean)
-                predicted_genotype = self.predict_genotype(query_clean)
-                print(f"✓ Novel query accepted: {self.query_id}, Length: {len(query_clean)}, "
-                      f"Predicted ML: {predicted_ml}, Predicted Genotype: {predicted_genotype}")
-                return True
-            print("✗ Invalid query: Too short (<10) or not found")
-            return False
-        except Exception as e:
-            print(f"Error processing query: {e}")
-            return False
-    def calculate_f_gene_similarity(self, seq1: str, seq2: str) -> float:
-        """Return the percentage overlap (0-100, two decimals) of the 5-mer sets of two sequences.
-
-        Both inputs are upper-cased and stripped of everything except A/T/G/C.
-        The score is ``|shared 5-mers| / |all 5-mers| * 100``. Falsy or empty
-        inputs give 0.0; two sequences that are both too short to contain a
-        5-mer give 100.0; any error gives 0.0.
-        """
-        window = 5
-        try:
-            if not (seq1 and seq2):
-                return 0.0
-            cleaned = [re.sub(r'[^ATGC]', '', str(raw).upper()) for raw in (seq1, seq2)]
-            if not all(cleaned):
-                return 0.0
-            first, second = (
-                {text[start:start + window] for start in range(len(text) - window + 1)}
-                for text in cleaned
-            )
-            if not first and not second:
-                return 100.0
-            if not first or not second:
-                return 0.0
-            shared = len(first & second)
-            combined = len(first | second)
-            if combined > 0:
-                return round((shared / combined) * 100, 2)
-            return 0.0
-        except Exception as e:
-            print(f"Error calculating similarity: {e}")
-            return 0.0
-    def find_similar_sequences(self, target_percentage: float) -> Tuple[List[str], float]:
-        """Select the accessions whose similarity to the query is close to ``target_percentage``.
-
-        Every row except the query itself is scored with
-        ``calculate_f_gene_similarity``. Rows within 2.0 points of the target
-        are kept; when there are none, rows within 1.0 point of the closest
-        available score are kept instead. At most 50 rows (highest similarity
-        first) are returned and their scores stored in ``self.similarity_scores``.
-
-        Returns:
-            ``(matched_ids, actual_percentage)``; ``([], target_percentage)``
-            when nothing can be compared or an error occurred.
-        """
-        try:
-            print(f"🔍 Finding sequences with {target_percentage}% similarity...")
-            scored = [
-                {
-                    'id': record['Accession Number'],
-                    'similarity': self.calculate_f_gene_similarity(self.query_sequence, record['F-gene']),
-                    'ml': record.get('ML', 'Unknown'),
-                    'genotype': record.get('Genotype', 'Unknown'),
-                }
-                for _, record in self.data.iterrows()
-                if not record['Accession Number'] == self.query_id
-            ]
-            if not scored:
-                print("❌ No valid sequences for comparison")
-                return [], target_percentage
-            scored.sort(key=lambda item: item['similarity'], reverse=True)
-
-            def within(center, tolerance):
-                return [item for item in scored if abs(item['similarity'] - center) <= tolerance]
-
-            shortlist = within(target_percentage, 2.0)
-            if shortlist:
-                actual_percentage = target_percentage
-            else:
-                nearest = min(scored, key=lambda item: abs(item['similarity'] - target_percentage))
-                actual_percentage = nearest['similarity']
-                shortlist = within(actual_percentage, 1.0)
-                print(f"⚠ No sequences at {target_percentage}%. Using closest: {actual_percentage:.1f}%")
-            max_results = 50
-            if len(shortlist) > max_results:
-                shortlist = shortlist[:max_results]
-                print(f"⚠ Limited to top {max_results} matches")
-            self.similarity_scores = {item['id']: item['similarity'] for item in shortlist}
-            matched_ids = [item['id'] for item in shortlist]
-            # Summary statistics cover every scored row, not only the shortlist.
-            every_score = [item['similarity'] for item in scored]
-            max_sim = max(every_score)
-            min_sim = min(every_score)
-            avg_sim = sum(every_score) / len(every_score)
-            print(f"✓ Found {len(matched_ids)} sequences at ~{actual_percentage:.1f}% similarity, "
-                  f"Range: {min_sim:.1f}% - {max_sim:.1f}% (avg: {avg_sim:.1f}%)")
-            return matched_ids, actual_percentage
-        except Exception as e:
-            print(f"Error finding similar sequences: {e}")
-            return [], target_percentage
-    # --- Tree Construction ---
-    def build_tree_structure(self, matched_ids: List[str]) -> Dict:
-        """Group all rows by ``ML`` then ``Genotype`` and lay them out as a nested node dict.
-
-        Each row becomes a record flagged with ``is_query`` / ``is_matched``
-        and its stored similarity score. A query whose id starts with
-        ``QUERY_`` is not a data row, so it is inserted under its predicted ML
-        group and genotype. The result is also stored in ``self.tree_structure``.
-
-        Returns:
-            The tree dict (rooted at key ``'root'``), or ``{}`` on error.
-        """
-        try:
-            print("🌳 Building normalized tree structure...")
-            grouped = {}
-            for _, record in self.data.iterrows():
-                ml_group = record['ML']
-                genotype = record['Genotype']
-                accession = record['Accession Number']
-                bucket = grouped.setdefault(ml_group, {}).setdefault(genotype, [])
-                bucket.append({
-                    'id': accession, 'data': record.to_dict(), 'is_query': accession == self.query_id,
-                    'is_matched': accession in matched_ids, 'similarity': self.similarity_scores.get(accession, 0.0)
-                })
-            if self.query_id.startswith("QUERY_"):
-                predicted_ml = self.predict_ml_group(self.query_sequence)
-                predicted_genotype = self.predict_genotype(self.query_sequence)
-                query_record = {
-                    'id': self.query_id, 'data': {
-                        'F-gene': self.query_sequence, 'ML': predicted_ml, 'Genotype': predicted_genotype,
-                        'Accession Number': self.query_id
-                    }, 'is_query': True, 'is_matched': False, 'similarity': 100.0
-                }
-                grouped.setdefault(predicted_ml, {}).setdefault(predicted_genotype, []).append(query_record)
-            tree = {
-                'root': {'name': 'Root', 'type': 'root', 'children': {}, 'x': 0, 'y': 0,
-                         'has_vertical_attachment': False, 'extension_level': 0}
-            }
-            self._build_normalized_ml_nodes(tree, self._normalize_ml_groups(grouped), matched_ids)
-            self.tree_structure = tree
-            print("✓ Tree structure built")
-            return tree
-        except Exception as e:
-            print(f"Error building tree structure: {e}")
-            return {}
-    def build_tree_structure_with_ml_safe(self, matched_ids: List[str]) -> Dict:
-        """Build the tree and attach an ``'ml_analysis'`` summary entry to it.
-
-        When ``perform_ml_analysis_safe`` returns a result containing a
-        ``'tree'`` key, its statistics are copied into the summary and the tree
-        and alignment are stored on the instance; otherwise the summary records
-        the failure. If anything raises, the plain tree is rebuilt as a
-        fallback, and ``{'error': ...}`` is returned if that fails too.
-        """
-        try:
-            print("🌳 Building ML-enhanced tree structure...")
-            analysis = self.perform_ml_analysis_safe(matched_ids)
-            tree = self.build_tree_structure(matched_ids)
-            if not (analysis and 'tree' in analysis):
-                tree['ml_analysis'] = {'ml_tree_available': False, 'error': 'ML analysis failed'}
-                print("⚠ ML analysis failed, using standard tree")
-                return tree
-            summary = {key: analysis[key] for key in ('log_likelihood', 'sequence_count', 'alignment_length')}
-            summary['ml_tree_available'] = True
-            tree['ml_analysis'] = summary
-            self.ml_tree = analysis['tree']
-            self.ml_alignment = analysis.get('alignment')
-            print("✓ Tree enhanced with ML analysis")
-            return tree
-        except Exception as e:
-            print(f"Error building ML-enhanced tree: {e}")
-            try:
-                return self.build_tree_structure(matched_ids)
-            except Exception as e2:
-                print(f"Fallback failed: {e2}")
-                return {'error': 'Tree construction failed'}
-    def _normalize_ml_groups(self, ml_groups: Dict) -> Dict:
-        """Collapse full ML group names onto base names.
-
-        Base name rule: names starting with ``UNCL`` become ``'UNCL'``; names
-        containing both a ``.`` and a digit keep only the part before the
-        first ``.``; all others are unchanged. A group holding a query or
-        matched sequence is kept whole under ``'full_ml_groups'``; other
-        groups contribute up to two representatives per base name (first
-        sequence of each of their first two genotypes).
-
-        Returns:
-            Mapping of base name to its entry, or ``{}`` on error.
-        """
-        try:
-            collapsed = {}
-            for ml_name, genotypes in ml_groups.items():
-                if ml_name.startswith('UNCL'):
-                    base_ml = 'UNCL'
-                elif '.' in ml_name and any(ch.isdigit() for ch in ml_name):
-                    base_ml = ml_name.split('.')[0]
-                else:
-                    base_ml = ml_name
-                entry = collapsed.setdefault(
-                    base_ml,
-                    {'full_ml_groups': {}, 'representative_sequences': [], 'has_special_sequences': False},
-                )
-                is_special = any(
-                    seq['is_query'] or seq['is_matched']
-                    for seqs in genotypes.values()
-                    for seq in seqs
-                )
-                if is_special:
-                    entry['has_special_sequences'] = True
-                    entry['full_ml_groups'][ml_name] = genotypes
-                    continue
-                representatives = entry['representative_sequences']
-                if len(representatives) < 2:
-                    for sequences in list(genotypes.values())[:2]:
-                        if len(representatives) < 2:
-                            representatives.extend(sequences[:1])
-            return collapsed
-        except Exception as e:
-            print(f"Error normalizing ML groups: {e}")
-            return {}
-    def _build_normalized_ml_nodes(self, tree_structure: Dict, normalized_ml_groups: Dict, matched_ids: List[str]):
-        """Add one ``normalized_ml_group`` node per base ML group under the root, then fill each node.
-
-        Resets ``self.horizontal_line_tracker`` and refreshes the query's ML
-        group first. Groups with query/matched sequences get their full ML
-        sub-groups; the others get their representative sequences. Errors are
-        printed and swallowed.
-        """
-        try:
-            self.horizontal_line_tracker = []
-            self._identify_query_ml_group(normalized_ml_groups)
-            y_slots = self._calculate_dynamic_ml_positions(normalized_ml_groups)
-            root = tree_structure['root']
-            root['has_vertical_attachment'] = len(normalized_ml_groups) > 1
-            for (base_ml, ml_data), y_pos in zip(normalized_ml_groups.items(), y_slots):
-                has_vertical = ml_data['has_special_sequences'] and len(ml_data['full_ml_groups']) > 1
-                contains_query = base_ml == self.query_ml_group
-                horizontal_length = self._determine_horizontal_line_length('normalized_ml_group', has_vertical, contains_query)
-                # First-level nodes hang directly off the root at x == 0.
-                x_pos = horizontal_length
-                node = {
-                    'name': base_ml, 'type': 'normalized_ml_group', 'children': {}, 'x': x_pos, 'y': y_pos,
-                    'has_special_sequences': ml_data['has_special_sequences'], 'has_vertical_attachment': has_vertical,
-                    'horizontal_line_length': horizontal_length, 'contains_query': contains_query
-                }
-                root['children'][base_ml] = node
-                if ml_data['has_special_sequences']:
-                    self._build_full_ml_nodes(node, ml_data['full_ml_groups'], y_pos, matched_ids, x_pos)
-                else:
-                    self._add_representative_sequences(node, ml_data['representative_sequences'], y_pos, x_pos)
-        except Exception as e:
-            print(f"Error building normalized ML nodes: {e}")
-    def _build_full_ml_nodes(self, normalized_ml_node: Dict, full_ml_groups: Dict, base_y: float, matched_ids: List[str], parent_x: float):
-        """Add one ``full_ml_group`` child per entry of ``full_ml_groups`` and build its genotype nodes.
-
-        Children are spread around ``base_y`` and placed ``horizontal_line_length``
-        to the right of ``parent_x``. Errors are printed and swallowed.
-        """
-        try:
-            def is_highlighted(seq):
-                return seq['is_query'] or seq['is_matched']
-
-            y_slots = self._calculate_full_ml_positions(full_ml_groups, base_y)
-            for (full_ml_name, genotypes), y_pos in zip(full_ml_groups.items(), y_slots):
-                highlighted_genotypes = [
-                    genotype for genotype, seqs in genotypes.items()
-                    if any(is_highlighted(seq) for seq in seqs)
-                ]
-                # A vertical connector is needed once two or more genotypes are drawn.
-                has_vertical = len(highlighted_genotypes) > 1
-                contains_query = any(seq['is_query'] for seqs in genotypes.values() for seq in seqs)
-                horizontal_length = self._determine_horizontal_line_length('full_ml_group', has_vertical, contains_query)
-                x_pos = parent_x + horizontal_length
-                node = {
-                    'name': full_ml_name, 'type': 'full_ml_group', 'children': {}, 'x': x_pos, 'y': y_pos,
-                    'sequences_count': sum(len(seqs) for seqs in genotypes.values()), 'has_vertical_attachment': has_vertical,
-                    'horizontal_line_length': horizontal_length, 'contains_query': contains_query
-                }
-                normalized_ml_node['children'][full_ml_name] = node
-                self._build_genotype_nodes(node, genotypes, y_pos, matched_ids, x_pos)
-        except Exception as e:
-            print(f"Error building full ML nodes: {e}")
-    def _build_genotype_nodes(self, full_ml_node: Dict, genotypes: Dict, base_y: float, matched_ids: List[str], parent_x: float):
-        """Add a ``genotype`` child for every genotype holding a query or matched sequence.
-
-        Genotypes without such sequences are not drawn. Each node's line
-        length depends on how many query/matched sequences it holds; those
-        sequences are then attached as its children. Errors are printed and
-        swallowed.
-        """
-        try:
-            def is_highlighted(seq):
-                return seq['is_query'] or seq['is_matched']
-
-            drawn = [
-                (genotype, seqs) for genotype, seqs in genotypes.items()
-                if any(is_highlighted(seq) for seq in seqs)
-            ]
-            if not drawn:
-                return
-            y_slots = self._calculate_genotype_positions(drawn, base_y)
-            for (genotype, sequences), y_pos in zip(drawn, y_slots):
-                sequence_count = len([seq for seq in sequences if is_highlighted(seq)])
-                has_vertical = sequence_count > 1
-                contains_query = any(seq['is_query'] for seq in sequences)
-                horizontal_length = self._determine_genotype_horizontal_line_length(sequence_count, has_vertical, contains_query)
-                x_pos = parent_x + horizontal_length
-                node = {
-                    'name': genotype, 'type': 'genotype', 'children': {}, 'x': x_pos, 'y': y_pos,
-                    'sequences': sequences, 'has_vertical_attachment': has_vertical,
-                    'horizontal_line_length': horizontal_length, 'contains_query': contains_query,
-                    'sequence_count': sequence_count
-                }
-                full_ml_node['children'][genotype] = node
-                self._add_sequences_horizontal(node, sequences, y_pos, x_pos)
-        except Exception as e:
-            print(f"Error building genotype nodes: {e}")
-    def _add_representative_sequences(self, normalized_ml_node: Dict, representative_sequences: List[Dict], base_y: float, parent_x: float):
-        """Attach each representative sequence as a ``representative_sequence`` child keyed ``<id>_rep``.
-
-        A single representative sits at ``base_y``; several are spread around
-        it. All share one x position. Does nothing for an empty list; errors
-        are printed and swallowed.
-        """
-        try:
-            if not representative_sequences:
-                return
-            count = len(representative_sequences)
-            horizontal_length = self._determine_horizontal_line_length('representative', count > 1)
-            x_pos = parent_x + horizontal_length
-            if count == 1:
-                y_slots = [base_y]
-            else:
-                y_slots = self._calculate_sequence_positions(representative_sequences, base_y)
-            for seq, y_pos in zip(representative_sequences, y_slots):
-                normalized_ml_node['children'][f"{seq['id']}_rep"] = {
-                    'name': f"{seq['id']} (Rep)", 'type': 'representative_sequence', 'data': seq,
-                    'x': x_pos, 'y': y_pos, 'has_vertical_attachment': False, 'horizontal_line_length': horizontal_length
-                }
-        except Exception as e:
-            print(f"Error adding representative sequences: {e}")
-    def _add_sequences_horizontal(self, genotype_node: Dict, sequences: List[Dict], base_y: float, parent_x: float):
-        """Attach the query and matched sequences of a genotype as ``sequence`` children.
-
-        Query sequences come first, then matched ones; other sequences are
-        skipped. Each child's distance from ``parent_x`` comes from
-        ``_calculate_similarity_based_line_length``. Matched sequences are
-        labelled ``"<id> (<similarity>%)"``. Errors are printed and swallowed.
-        """
-        try:
-            query_line_length = 3.0
-            drawn = (
-                [seq for seq in sequences if seq['is_query']]
-                + [seq for seq in sequences if seq['is_matched'] and not seq['is_query']]
-            )
-            if len(drawn) == 1:
-                y_slots = [base_y]
-            else:
-                y_slots = self._calculate_sequence_positions(drawn, base_y)
-            for sequence, y_pos in zip(drawn, y_slots):
-                line_length = self._calculate_similarity_based_line_length(sequence, query_line_length)
-                if sequence['is_matched']:
-                    label = f"{sequence['id']} ({sequence['similarity']}%)"
-                else:
-                    label = sequence['id']
-                genotype_node['children'][sequence['id']] = {
-                    'name': label,
-                    'type': 'sequence', 'data': sequence, 'x': parent_x + line_length, 'y': y_pos,
-                    'has_vertical_attachment': False, 'similarity_line_length': line_length
-                }
-        except Exception as e:
-            print(f"Error adding sequences: {e}")
-    def _identify_query_ml_group(self, normalized_ml_groups: Dict):
-        """Store in ``self.query_ml_group`` the base ML group that holds the query sequence.
-
-        The attribute is cleared first, so a group found during an earlier
-        tree build cannot linger when the current groups contain no query; in
-        that case it ends up ``None``. Errors are printed and swallowed.
-
-        Args:
-            normalized_ml_groups: Mapping of base ML name to an entry with
-                ``'has_special_sequences'`` and ``'full_ml_groups'`` keys.
-        """
-        try:
-            self.query_ml_group = None
-            for base_ml, ml_data in normalized_ml_groups.items():
-                if not ml_data['has_special_sequences']:
-                    continue
-                holds_query = any(
-                    seq['is_query']
-                    for genotypes in ml_data['full_ml_groups'].values()
-                    for sequences in genotypes.values()
-                    for seq in sequences
-                )
-                if holds_query:
-                    self.query_ml_group = base_ml
-                    return
-        except Exception as e:
-            print(f"Error identifying query ML group: {e}")
-    def _calculate_dynamic_ml_positions(self, normalized_ml_groups: Dict) -> List[float]:
-        """Return one y position per ML group, 2.0 apart and centred on 0.
-
-        No groups give ``[]`` and a single group gives ``[0.0]``. On error the
-        fallback is ``0, 1, 2, ...``.
-        """
-        step = 2.0
-        try:
-            ml_count = len(normalized_ml_groups)
-            if ml_count <= 1:
-                return [0.0] * ml_count
-            total_spacing = (ml_count - 1) * step
-            start_y = -total_spacing / 2
-            return [start_y + slot * step for slot in range(ml_count)]
-        except Exception as e:
-            print(f"Error calculating ML positions: {e}")
-            return list(range(len(normalized_ml_groups)))
-    def _calculate_full_ml_positions(self, full_ml_groups: Dict, base_y: float) -> List[float]:
-        """Return y positions for full ML groups, 1.5 apart and centred on ``base_y``.
-
-        Zero or one group gives ``[base_y]``. On error every group gets ``base_y``.
-        """
-        spacing = 1.5
-        try:
-            ml_count = len(full_ml_groups)
-            if ml_count > 1:
-                start_y = base_y - (spacing * (ml_count - 1)) / 2
-                return [start_y + slot * spacing for slot in range(ml_count)]
-            return [base_y]
-        except Exception as e:
-            print(f"Error calculating full ML positions: {e}")
-            return [base_y] * len(full_ml_groups)
-    def _calculate_genotype_positions(self, special_genotypes: List, base_y: float) -> List[float]:
-        """Return y positions for genotypes, 1.0 apart and centred on ``base_y``.
-
-        Zero or one genotype gives ``[base_y]``. On error every genotype gets ``base_y``.
-        """
-        spacing = 1.0
-        try:
-            genotype_count = len(special_genotypes)
-            if genotype_count > 1:
-                start_y = base_y - (spacing * (genotype_count - 1)) / 2
-                return [start_y + slot * spacing for slot in range(genotype_count)]
-            return [base_y]
-        except Exception as e:
-            print(f"Error calculating genotype positions: {e}")
-            return [base_y] * len(special_genotypes)
-    def _calculate_sequence_positions(self, sequences: List[Dict], base_y: float) -> List[float]:
-        """Return y positions for sequences, 0.8 apart and centred on ``base_y``.
-
-        Zero or one sequence gives ``[base_y]``. On error every sequence gets ``base_y``.
-        """
-        spacing = 0.8
-        try:
-            seq_count = len(sequences)
-            if seq_count > 1:
-                start_y = base_y - (spacing * (seq_count - 1)) / 2
-                return [start_y + slot * spacing for slot in range(seq_count)]
-            return [base_y]
-        except Exception as e:
-            print(f"Error calculating sequence positions: {e}")
-            return [base_y] * len(sequences)
-    def _calculate_similarity_based_line_length(self, sequence: Dict, query_line_length: float) -> float:
-        """Return the line length for a sequence node.
-
-        The query gets ``query_line_length``. A matched sequence gets that
-        length scaled by ``similarity / 100``, but never less than 20% of it.
-        Anything else, and any error, gets half of ``query_line_length``.
-        """
-        half_length = query_line_length * 0.5
-        try:
-            if sequence['is_query']:
-                return query_line_length
-            if not sequence['is_matched']:
-                return half_length
-            scaled = (sequence['similarity'] / 100.0) * query_line_length
-            shortest_allowed = query_line_length * 0.2
-            return max(scaled, shortest_allowed)
-        except Exception as e:
-            print(f"Error calculating line length: {e}")
-            return half_length
-    def _determine_horizontal_line_length(self, node_type: str, has_vertical: bool, contains_query: bool = False) -> float:
-        """Return the horizontal line length for a node.
-
-        A ``normalized_ml_group`` node containing the query gets 2.5 times the
-        base length. A node with a vertical attachment gets 0.3 more than the
-        longest length handed out so far (at least the base length), and that
-        new length is recorded in ``self.horizontal_line_tracker``. Every other
-        node, and any error, gets the base length.
-        """
-        try:
-            base_length = self.base_horizontal_length
-            if node_type == 'normalized_ml_group' and contains_query:
-                return base_length * 2.5
-            if not has_vertical:
-                return base_length
-            longest_so_far = max([base_length, *self.horizontal_line_tracker])
-            new_length = longest_so_far + 0.3
-            self.horizontal_line_tracker.append(new_length)
-            return new_length
-        except Exception as e:
-            print(f"Error determining line length: {e}")
-            return self.base_horizontal_length
-    def _determine_genotype_horizontal_line_length(self, sequence_count: int, has_vertical: bool, contains_query: bool = False) -> float:
-        """Returns the horizontal line length for a genotype node.
-        The base length is multiplied by a factor chosen from ``sequence_count``
-        (<=1: 1.0, <=3: 1.6, <=5: 2.3, more: 6.0) and 0.5 is added when the node
-        contains the query. ``has_vertical`` is accepted but not used.
-        On error the base length is returned.
-        """
-        try:
-            base = self.base_horizontal_length
-            bonus = 0.5 if contains_query else 0.0
-            # (upper bound on sequence_count, multiplier); anything larger uses 6.0.
-            multiplier = 6.0
-            for upper_bound, factor in ((1, 1.0), (3, 1.6), (5, 2.3)):
-                if sequence_count <= upper_bound:
-                    multiplier = factor
-                    break
-            return base * multiplier + bonus
-        except Exception as err:
-            print(f"Error determining genotype line length: {err}")
-            return self.base_horizontal_length
-    # --- Visualization ---
-    def create_interactive_tree(self, matched_ids: List[str], actual_percentage: float) -> Optional[go.Figure]:
-        """Creates an interactive horizontal phylogenetic tree visualization.
-        Walks ``self.tree_structure['root']`` (each node is a dict with 'x', 'y', 'type',
-        'name' and optionally 'children'), draws the edges and node markers with Plotly,
-        tries to display the figure and returns it.
-        Args:
-            matched_ids: IDs of the matched sequences; only their count is shown in the title.
-            actual_percentage: Similarity percentage shown in the title.
-        Returns:
-            The Plotly figure, or None if building it failed (the error is printed).
-        """
-        try:
-            print("🎨 Creating interactive tree visualization...")
-            edge_x, edge_y = [], []
-            node_x, node_y = [], []
-            node_colors, node_text, node_hover, node_sizes = [], [], [], []
-            colors = {
-                'root': '#FF0000', 'normalized_ml_group': '#FFB6C1', 'full_ml_group': '#FF69B4',
-                'genotype': '#FFD700', 'representative_sequence': '#FFA500', 'query_sequence': '#4B0082',
-                'matched_sequence': '#6A5ACD', 'other_sequence': '#87CEEB'
-            }
-            def add_horizontal_edges(parent_x, parent_y, children_dict):
-                """Appends the line segments joining a parent to its children (None separates segments)."""
-                if not children_dict:
-                    return
-                children_list = list(children_dict.values())
-                if len(children_list) == 1:
-                    # Single child: one direct segment.
-                    child = children_list[0]
-                    edge_x.extend([parent_x, child['x'], None])
-                    edge_y.extend([parent_y, child['y'], None])
-                    return
-                # Several children: a stub to an intermediate x, a vertical bar spanning
-                # the children's y-range, then one horizontal segment per child.
-                min_child_x = min(child['x'] for child in children_list)
-                intermediate_x = parent_x + (min_child_x - parent_x) * 0.8
-                edge_x.extend([parent_x, intermediate_x, None])
-                edge_y.extend([parent_y, parent_y, None])
-                child_y_positions = [child['y'] for child in children_list]
-                edge_x.extend([intermediate_x, intermediate_x, None])
-                edge_y.extend([min(child_y_positions), max(child_y_positions), None])
-                for child in children_list:
-                    edge_x.extend([intermediate_x, child['x'], None])
-                    edge_y.extend([child['y'], child['y'], None])
-            def get_node_color_and_size(node):
-                """Returns the (color, marker size) pair for a node."""
-                node_type = node['type']
-                if node_type == 'sequence':
-                    if node['data']['is_query']:
-                        return colors['query_sequence'], 10
-                    if node['data']['is_matched']:
-                        return colors['matched_sequence'], 8
-                    return colors['other_sequence'], 6
-                if node_type == 'representative_sequence':
-                    return colors['representative_sequence'], 7
-                if node_type == 'normalized_ml_group':
-                    return colors['normalized_ml_group'], (9 if node.get('has_special_sequences', False) else 7)
-                if node_type == 'full_ml_group':
-                    return colors['full_ml_group'], 8
-                if node_type == 'genotype':
-                    return colors['genotype'], 7
-                return colors.get(node_type, '#000000'), 7
-            def create_node_text(node):
-                """Returns the label drawn next to a node."""
-                # Only normalized ML groups holding query/matched sequences get a marker;
-                # every other node is labelled with its plain name.
-                if node['type'] == 'normalized_ml_group' and node.get('has_special_sequences', False):
-                    return f"{node['name']} *"
-                return node['name']
-            def create_hover_text(node):
-                """Returns the HTML hover text for a node."""
-                if node['type'] == 'sequence':
-                    data = node['data']['data']
-                    hover_text = (
-                        f"<b>{node['name']}</b><br>Type: {'Query' if node['data']['is_query'] else 'Matched' if node['data']['is_matched'] else 'Other'} Sequence<br>"
-                        f"ML Group: {data.get('ML', 'N/A')}<br>Genotype: {data.get('Genotype', 'N/A')}<br>"
-                        f"Host: {data.get('Host', 'N/A')}<br>Country: {data.get('Country', 'N/A')}<br>"
-                        f"Isolate: {data.get('Isolate', 'N/A')}<br>Year: {data.get('Year', 'N/A')}"
-                    )
-                    if node['data']['is_matched']:
-                        hover_text += f"<br><b>Similarity: {node['data']['similarity']}%</b>"
-                elif node['type'] == 'representative_sequence':
-                    data = node['data']['data']
-                    hover_text = (
-                        f"<b>{node['name']}</b><br>Type: Representative Sequence<br>"
-                        f"ML Group: {data.get('ML', 'N/A')}<br>Genotype: {data.get('Genotype', 'N/A')}<br>"
-                        f"Host: {data.get('Host', 'N/A')}<br>Country: {data.get('Country', 'N/A')}"
-                    )
-                elif node['type'] == 'normalized_ml_group':
-                    hover_text = f"<b>{node['name']}</b><br>Type: Normalized ML Group"
-                    if node.get('has_special_sequences', False):
-                        hover_text += "<br>Contains query/matched sequences"
-                    else:
-                        hover_text += "<br>Representative sequences only"
-                elif node['type'] == 'full_ml_group':
-                    hover_text = f"<b>{node['name']}</b><br>Type: Full ML Group"
-                    if 'sequences_count' in node:
-                        hover_text += f"<br>Total Sequences: {node['sequences_count']}"
-                elif node['type'] == 'genotype':
-                    hover_text = f"<b>{node['name']}</b><br>Type: Genotype"
-                    if 'sequences' in node:
-                        special_count = sum(1 for seq in node['sequences'] if seq['is_query'] or seq['is_matched'])
-                        hover_text += f"<br>Special Sequences: {special_count}/{len(node['sequences'])}"
-                else:
-                    hover_text = f"<b>{node['name']}</b><br>Type: {node['type'].replace('_', ' ').title()}"
-                return hover_text
-            def add_node_and_edges(node):
-                """Records a node, the edges to its children, then recurses into the children."""
-                x, y = node['x'], node['y']
-                node_x.append(x)
-                node_y.append(y)
-                color, size = get_node_color_and_size(node)
-                node_colors.append(color)
-                node_sizes.append(size)
-                node_text.append(create_node_text(node))
-                node_hover.append(create_hover_text(node))
-                children = node.get('children')
-                if children:
-                    add_horizontal_edges(x, y, children)
-                    for child in children.values():
-                        add_node_and_edges(child)
-            # The recursive walk already draws the root's own child edges, so no
-            # separate pass over the root is needed (a second pass would duplicate them).
-            add_node_and_edges(self.tree_structure['root'])
-            fig = go.Figure()
-            fig.add_trace(go.Scatter(
-                x=edge_x, y=edge_y, mode='lines', line=dict(width=1, color='gray'),
-                hoverinfo='none', showlegend=False
-            ))
-            fig.add_trace(go.Scatter(
-                x=node_x, y=node_y, mode='markers+text',
-                marker=dict(size=node_sizes, color=node_colors, line=dict(width=1, color='black'), opacity=0.85),
-                text=node_text, textposition="middle right", textfont=dict(size=9, color="black"),
-                hoverinfo='text', hovertext=node_hover, showlegend=False
-            ))
-            # Fall back to a unit range when there are no nodes; the guard has to cover
-            # both min() and max(), not just the second element of the tuple.
-            if node_x:
-                min_x, max_x = min(node_x), max(node_x)
-            else:
-                min_x, max_x = 0, 1
-            if node_y:
-                min_y, max_y = min(node_y), max(node_y)
-            else:
-                min_y, max_y = 0, 1
-            x_range = max_x - min_x
-            y_range = max_y - min_y
-            x_padding = x_range * 0.2 if x_range > 0 else 1
-            y_padding = y_range * 0.2 if y_range > 0 else 1
-            # Figure size grows with the coordinate span but stays within fixed bounds.
-            width = min(1400, max(800, int(x_range * 80 + 400)))
-            height = min(900, max(500, int(y_range * 40 + 300)))
-            fig.update_layout(
-                title=dict(
-                    text=f"Horizontal Phylogenetic Tree<br>Query: {self.query_id} | Similarity: {actual_percentage}% | Matched: {len(matched_ids)}",
-                    x=0.5, font=dict(size=12)
-                ),
-                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[min_x - x_padding, max_x + x_padding], automargin=True),
-                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[min_y - y_padding, max_y + y_padding], automargin=True),
-                plot_bgcolor="white", paper_bgcolor="white", hovermode="closest",
-                width=width, height=height, margin=dict(l=20, r=100, t=40, b=10),
-                showlegend=True, legend=dict(x=1.02, y=1, xanchor='left', yanchor='top',
-                                             bgcolor='rgba(255,255,255,0.8)', bordercolor='gray', borderwidth=1, font=dict(size=10))
-            )
-            # Legend entries are empty traces that exist only to show a marker and a name.
-            legend_elements = [
-                dict(name="Root", marker=dict(color=colors['root'], size=8)),
-                dict(name="Normalized ML Groups", marker=dict(color=colors['normalized_ml_group'], size=8)),
-                dict(name="Full ML Groups", marker=dict(color=colors['full_ml_group'], size=8)),
-                dict(name="Genotypes", marker=dict(color=colors['genotype'], size=8)),
-                dict(name="Query Sequence", marker=dict(color=colors['query_sequence'], size=10)),
-                dict(name="Similar Sequences", marker=dict(color=colors['matched_sequence'], size=9)),
-                dict(name="Representative Sequences", marker=dict(color=colors['representative_sequence'], size=8)),
-            ]
-            for element in legend_elements:
-                fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers', marker=element['marker'], name=element['name'], showlegend=True))
-            config = {
-                'displayModeBar': True, 'displaylogo': False, 'modeBarButtonsToRemove': ['select2d', 'lasso2d'],
-                'toImageButtonOptions': {'format': 'png', 'filename': 'phylogenetic_tree', 'height': height, 'width': width, 'scale': 2}
-            }
-            try:
-                # config must be passed by keyword: the first positional argument of show() is the renderer.
-                fig.show(config=config)
-            except Exception as e:
-                # Displaying is best-effort (e.g. no renderer available); the figure is still returned.
-                print(f"Warning: Could not display figure: {e}")
-            return fig
-        except Exception as e:
-            print(f"Error creating tree visualization: {e}")
-            return None
-    # --- ML Analysis ---
-    def perform_ml_analysis_safe(self, matched_ids: List[str]) -> Dict:
-        """Runs the alignment, distance, tree and likelihood steps for the query and its matches.
-        The query ID is placed first, followed by the matched IDs (minus the query itself);
-        at most 20 IDs are analysed and at least 3 are required.
-        Returns:
-            A dict with 'tree', 'alignment', 'distance_matrix', 'log_likelihood',
-            'sequence_count' and 'alignment_length', or an empty dict if any step
-            yields nothing or raises (the traceback is printed).
-        """
-        try:
-            print("\n🧬 PERFORMING MAXIMUM LIKELIHOOD ANALYSIS")
-            print("="*50)
-            # Query first, then every match that is not the query itself.
-            others = [seq_id for seq_id in matched_ids if seq_id != self.query_id]
-            all_sequences = [self.query_id, *others]
-            # Cap the number of sequences to keep memory use bounded.
-            total = len(all_sequences)
-            if total > 20:
-                print(f"Warning: Limiting analysis to 20 sequences (had {total})")
-                all_sequences = all_sequences[:20]
-            if len(all_sequences) < 3:
-                print("❌ Need at least 3 sequences for ML analysis")
-                return {}
-            # 1) alignment
-            alignment = self.create_sequence_alignment(all_sequences)
-            if not alignment:
-                return {}
-            # 2) pairwise distances
-            distance_matrix = self.calculate_ml_distances(alignment)
-            if distance_matrix.size == 0:
-                return {}
-            # 3) tree
-            ml_tree = self.construct_ml_tree(alignment)
-            if not ml_tree:
-                return {}
-            # 4) likelihood score
-            log_likelihood = self.calculate_ml_likelihood_safe(ml_tree, alignment)
-            # 5) bundle everything for the caller
-            n_analyzed = len(all_sequences)
-            ml_results = dict(
-                tree=ml_tree,
-                alignment=alignment,
-                distance_matrix=distance_matrix,
-                log_likelihood=log_likelihood,
-                sequence_count=n_analyzed,
-                alignment_length=len(alignment[0]) if alignment else 0,
-            )
-            print("✅ ML analysis completed successfully")
-            print(f"   Sequences analyzed: {n_analyzed}")
-            print(f"   Alignment length: {ml_results['alignment_length']}")
-            print(f"   Log-likelihood: {log_likelihood:.2f}")
-            return ml_results
-        except Exception as e:
-            print(f"❌ ML analysis failed: {e}")
-            import traceback
-            traceback.print_exc()
-            return {}
-    def create_sequence_alignment(self, sequence_ids: List[str]) -> Optional[MultipleSeqAlignment]:
-        """Builds a multiple sequence alignment from the 'F-gene' column of ``self.data``.
-        Each ID is looked up by 'Accession Number'; the first matching row is used,
-        characters other than A/T/G/C/N/- are stripped after upper-casing, and sequences
-        of 10 characters or fewer are dropped. The survivors are padded by
-        ``_simple_alignment``.
-        Returns:
-            The alignment, or None when fewer than 2 usable sequences remain or an error occurs.
-        """
-        try:
-            print("🧬 Creating multiple sequence alignment...")
-            records = []
-            for seq_id in sequence_ids:
-                try:
-                    hits = self.data[self.data['Accession Number'] == seq_id]
-                    if hits.empty:
-                        continue
-                    raw = str(hits.iloc[0]['F-gene'])
-                    # Keep only nucleotide, N and gap characters.
-                    cleaned = re.sub(r'[^ATGCN-]', '', raw.upper())
-                    if len(cleaned) <= 10:  # too short to be useful
-                        continue
-                    records.append(SeqRecord(Seq(cleaned), id=seq_id, description=""))
-                except Exception as e:
-                    print(f"Warning: Skipping sequence {seq_id}: {e}")
-                    continue
-            if len(records) < 2:
-                print("❌ Need at least 2 valid sequences for alignment")
-                return None
-            # Padding-only alignment; a real aligner (MUSCLE, CLUSTAL) would give better results.
-            aligned_sequences = self._simple_alignment(records)
-            print(f"✓ Alignment created with {len(aligned_sequences)} sequences")
-            return MultipleSeqAlignment(aligned_sequences)
-        except Exception as e:
-            print(f"Error creating alignment: {e}")
-            return None
-    def _simple_alignment(self, sequences: List[SeqRecord]) -> List[SeqRecord]:
-        """Brings all sequences to one common length by truncating and right-padding with '-'.
-        The common length is the longest input, capped at 10000. No residues are shifted;
-        this only equalises lengths. On error the input list is returned unchanged.
-        """
-        try:
-            target_length = max(len(record.seq) for record in sequences)
-            # Cap the length to keep memory use bounded.
-            if target_length > 10000:
-                target_length = 10000
-                print(f"Warning: Sequences truncated to {target_length} bp")
-            padded_records = []
-            for record in sequences:
-                # Cut to the target length, then fill any shortfall with trailing gaps.
-                text = str(record.seq)[:target_length].ljust(target_length, '-')
-                padded_records.append(SeqRecord(Seq(text), id=record.id, description=record.description))
-            return padded_records
-        except Exception as e:
-            print(f"Error in simple alignment: {e}")
-            return sequences
-    def calculate_ml_distances(self, alignment: MultipleSeqAlignment) -> np.ndarray:
-        """Returns the symmetric matrix of pairwise distances between aligned sequences.
-        Each pair is scored by ``_calculate_ml_distance_pair``; a pair that raises is
-        given the fallback distance 1.0. The diagonal is zero.
-        Returns:
-            An (n, n) array, or an empty array for an empty alignment or on error.
-        """
-        try:
-            print("📊 Calculating ML distances...")
-            # Numeric encoding of the alignment, one row per sequence.
-            seq_matrix = self._alignment_to_matrix(alignment)
-            n_sequences = len(alignment)
-            if n_sequences == 0:
-                return np.array([])
-            distance_matrix = np.zeros((n_sequences, n_sequences))
-            # Visit every unordered pair once and mirror the value across the diagonal.
-            for i, j in itertools.combinations(range(n_sequences), 2):
-                try:
-                    pair_distance = self._calculate_ml_distance_pair(seq_matrix[i], seq_matrix[j])
-                    distance_matrix[i][j] = pair_distance
-                    distance_matrix[j][i] = pair_distance
-                except Exception as e:
-                    print(f"Warning: Error calculating distance between sequences {i} and {j}: {e}")
-                    # Fall back to the maximum distance.
-                    distance_matrix[i][j] = 1.0
-                    distance_matrix[j][i] = 1.0
-            print("✓ ML distances calculated")
-            return distance_matrix
-        except Exception as e:
-            print(f"Error calculating ML distances: {e}")
-            return np.array([])
-    def _alignment_to_matrix(self, alignment: MultipleSeqAlignment) -> np.ndarray:
-        """Encodes an alignment as an integer array, one row per record.
-        Codes: A=0, T=1, G=2, C=3, N=4, '-'=5; any other character is treated as N (4).
-        Sequences are upper-cased first. Returns an empty array on error.
-        """
-        try:
-            codes = {'A': 0, 'T': 1, 'G': 2, 'C': 3, 'N': 4, '-': 5}
-            rows = [
-                [codes.get(symbol, 4) for symbol in str(record.seq).upper()]
-                for record in alignment
-            ]
-            return np.array(rows)
-        except Exception as e:
-            print(f"Error converting alignment to matrix: {e}")
-            return np.array([])
-    def _calculate_ml_distance_pair(self, seq1: np.ndarray, seq2: np.ndarray) -> float:
-        """Returns the Jukes-Cantor corrected distance between two encoded sequences, clamped to [0, 1].
-        Only positions where both codes are below 4 (i.e. neither N nor gap) are compared.
-        1.0 is returned when either input is empty, no position is comparable, the
-        mismatch proportion is 0.75 or more, or any error occurs.
-        """
-        try:
-            if len(seq1) == 0 or len(seq2) == 0:
-                return 1.0
-            # Positions where both sequences hold a real nucleotide.
-            comparable = (seq1 < 4) & (seq2 < 4)
-            n_comparable = np.sum(comparable)
-            if n_comparable == 0:
-                return 1.0  # nothing to compare
-            mismatches = np.sum(seq1[comparable] != seq2[comparable])
-            p = mismatches / n_comparable
-            # The correction is undefined at or beyond p = 0.75.
-            if p >= 0.75:
-                return 1.0
-            try:
-                # d = -3/4 * ln(1 - 4p/3)
-                jc_distance = -0.75 * np.log(1 - (4 * p / 3))
-            except (ValueError, RuntimeWarning):
-                return 1.0
-            return min(max(jc_distance, 0.0), 1.0)
-        except Exception:
-            return 1.0
-    def construct_ml_tree(self, alignment: MultipleSeqAlignment) -> Optional[Tree]:
-        """Builds a tree for the alignment from its pairwise distances.
-        Distances come from ``calculate_ml_distances``, the topology from
-        ``_build_nj_tree_from_distances`` and the branch lengths are then passed
-        through ``_optimize_branch_lengths_ml_safe``.
-        Returns:
-            The tree, or None when no distances or no tree could be produced, or on error.
-        """
-        try:
-            print("🌳 Constructing ML tree...")
-            distance_matrix = self.calculate_ml_distances(alignment)
-            if distance_matrix.size == 0:
-                return None
-            sequence_names = [record.id for record in alignment]
-            tree = self._build_nj_tree_from_distances(distance_matrix, sequence_names)
-            if not tree:
-                # Report the failure instead of announcing success for a missing tree.
-                print("❌ ML tree construction failed")
-                return None
-            tree = self._optimize_branch_lengths_ml_safe(tree, alignment)
-            print("✓ ML tree constructed")
-            return tree
-        except Exception as e:
-            print(f"Error constructing ML tree: {e}")
-            return None
-    def _build_nj_tree_from_distances(self, distance_matrix: np.ndarray, sequence_names: List[str]) -> Optional[Tree]:
-        """Builds a neighbor-joining tree from a square distance matrix.
-        The matrix is converted to the lower-triangular list form (zero diagonal,
-        negative values raised to 0.0) before being handed to the NJ constructor.
-        Returns:
-            The tree if it passes ``_validate_tree_structure``; otherwise None. None is
-            also returned when the matrix size does not match the names, or on error.
-        """
-        try:
-            n_names = len(sequence_names)
-            if distance_matrix.shape[0] != n_names:
-                print("Error: Distance matrix size mismatch")
-                return None
-            lower_triangle = []
-            for i in range(n_names):
-                # Entries left of the diagonal, then the diagonal itself.
-                row = [max(0.0, float(distance_matrix[i][j])) for j in range(i)]
-                row.append(0.0)
-                lower_triangle.append(row)
-            dm = DistanceMatrix(names=sequence_names, matrix=lower_triangle)
-            tree = DistanceTreeConstructor().nj(dm)
-            if self._validate_tree_structure(tree):
-                return tree
-            return None
-        except Exception as e:
-            print(f"Error building NJ tree: {e}")
-            return None
-    def _validate_tree_structure(self, tree: Tree, max_depth: int = 100) -> bool:
-        """Checks that the tree is no deeper than ``max_depth`` and that no node is reached twice.
-        Traversal starts at ``tree.root`` when that attribute exists, otherwise at ``tree``
-        itself, and follows each node's ``clades``. Any error during the walk counts as invalid.
-        """
-        try:
-            seen_ids = set()
-            def is_sound(node, depth):
-                if depth > max_depth:
-                    return False
-                marker = id(node)
-                if marker in seen_ids:
-                    # Reaching the same object twice means a shared node or a cycle.
-                    return False
-                seen_ids.add(marker)
-                for child in getattr(node, 'clades', []):
-                    if not is_sound(child, depth + 1):
-                        return False
-                return True
-            start = tree.root if hasattr(tree, 'root') else tree
-            return is_sound(start, 0)
-        except Exception:
-            return False
-    def _optimize_branch_lengths_ml_safe(self, tree: Tree, alignment: MultipleSeqAlignment) -> Tree:
-        """Adjusts the branch length of every clade that has one, in place.
-        Each existing branch length is replaced by the value from
-        ``_calculate_optimal_branch_length``, with a floor of 0.001. The tree is
-        returned unchanged when the alignment cannot be encoded, and as-is (possibly
-        partly updated) when an error occurs.
-        Returns:
-            The same tree object that was passed in.
-        """
-        try:
-            print("🔧 Optimizing branch lengths...")
-            old_limit = sys.getrecursionlimit()
-            # Guarantee at least 1000 frames, but never lower a limit the host has
-            # already raised: reducing it could break code running deeper in the stack.
-            sys.setrecursionlimit(max(old_limit, 1000))
-            try:
-                seq_matrix = self._alignment_to_matrix(alignment)
-                if seq_matrix.size == 0:
-                    return tree
-                for clade in self._get_clades_safe(tree):
-                    if getattr(clade, 'branch_length', None) is not None:
-                        optimal_length = self._calculate_optimal_branch_length(clade, seq_matrix)
-                        clade.branch_length = max(optimal_length, 0.001)
-            finally:
-                # Always restore the caller's limit, even on early return or error.
-                sys.setrecursionlimit(old_limit)
-            print("✓ Branch lengths optimized")
-            return tree
-        except Exception as e:
-            print(f"Warning: Branch optimization failed: {e}")
-            return tree
-    def _get_clades_safe(self, tree: Tree, max_depth: int = 50) -> List:
-        """Returns the tree's nodes in pre-order, skipping repeats and anything deeper than ``max_depth``.
-        Traversal starts at ``tree.root`` when that attribute exists, otherwise at ``tree``
-        itself. If the walk raises, a warning is printed and the nodes gathered so far are returned.
-        """
-        clades = []
-        seen_ids = set()
-        def walk(node, depth=0):
-            # Stop at the depth cap and never descend into a node twice.
-            if depth > max_depth or id(node) in seen_ids:
-                return
-            seen_ids.add(id(node))
-            yield node
-            for child in getattr(node, 'clades', []):
-                yield from walk(child, depth + 1)
-        try:
-            start = tree.root if hasattr(tree, 'root') else tree
-            for found in walk(start):
-                clades.append(found)
-        except Exception as e:
-            print(f"Warning: Tree traversal error: {e}")
-        return clades
-    def _calculate_optimal_branch_length(self, clade: float, seq_matrix: np.ndarray) -> float:
-        """Returns an adjusted branch length for a clade, kept within [0.001, 1.0].
-        The current length is scaled by 0.9 when the clade has a truthy ``name`` and by
-        1.1 otherwise. 0.1 is returned when the length is missing, not a positive finite
-        number, or an error occurs. ``seq_matrix`` is accepted but not used.
-        """
-        # NOTE(review): `clade` is annotated as float but is read as an object with
-        # `branch_length` and `name` attributes - confirm and correct the annotation.
-        try:
-            raw_length = getattr(clade, 'branch_length', None)
-            if raw_length is None:
-                return 0.1
-            current_length = float(raw_length)
-            if np.isnan(current_length) or np.isinf(current_length) or current_length <= 0:
-                return 0.1
-            is_named = hasattr(clade, 'name') and clade.name
-            factor = 0.9 if is_named else 1.1
-            scaled = current_length * factor
-            return min(max(scaled, 0.001), 1.0)
-        except Exception:
-            return 0.1
-    def calculate_ml_likelihood_safe(self, tree: Tree, alignment: MultipleSeqAlignment) -> float:
-        """Returns a log-likelihood score summed over a sample of alignment columns.
-        Only the first 1000 columns are considered and roughly 100 of them are sampled at
-        a fixed stride. Columns with fewer than two real nucleotides are skipped; the rest
-        are scored by ``_calculate_site_likelihood_safe`` and their logs are summed.
-        Returns:
-            The summed log score, or ``-np.inf`` when the alignment cannot be encoded or an error occurs.
-        """
-        try:
-            print("Trying to calculate tree likelihood...")
-            seq_matrix = self._alignment_to_matrix(alignment)
-            if seq_matrix.size == 0:
-                return -np.inf
-            n_sites = min(seq_matrix.shape[1], 1000)
-            stride = max(1, n_sites // 100)
-            total_log_likelihood = 0.0
-            for site in range(0, n_sites, stride):
-                column = seq_matrix[:, site]
-                # Need at least two real nucleotides (codes below 4) to score a column.
-                if np.sum(column < 4) >= 2:
-                    site_likelihood = self._calculate_site_likelihood_safe(tree, column)
-                    if site_likelihood > 0:
-                        total_log_likelihood += np.log(site_likelihood)
-            print(f"Likelihood: {total_log_likelihood:.2f}")
-            return total_log_likelihood
-        except Exception as e:
-            print(f"Error calculating likelihood: {e}")
-            return -np.inf
-    def _calculate_site_likelihood_safe(self, tree: np.ndarray, site_pattern: np.ndarray) -> float:
-        """Returns a score for one alignment column based on its nucleotide diversity.
-        With ``k`` distinct real nucleotides (codes below 4) among ``n`` real nucleotides
-        the score is ``exp(-(k / 4) * n * 0.1)``, floored at 1e-10. A column with no real
-        nucleotides scores 1.0; any error scores 1e-10. ``tree`` is accepted but not used.
-        """
-        try:
-            observed = site_pattern[site_pattern < 4]
-            n_observed = len(observed)
-            if n_observed == 0:
-                return 1.0
-            diversity = len(np.unique(observed)) / 4.0
-            score = np.exp(-diversity * n_observed * 0.1)
-            return max(score, 1e-10)
-        except Exception:
-            return 1e-10
-    # --- Reporting ---
-    def generate_detailed_report(self, matched_ids: List[str], actual_percentage: float) -> bool:
-        """
-        Generate a detailed HTML report for virologists/scientists with query details, matched sequences,
-        model performance, phylogenetic tree insights, and ML analysis results in tabular format.
-        Outputs a styled HTML file.
-        Returns True if successful, False otherwise.
-        """
-        try:
-            print("📝 Generating detailed HTML analysis report...")
-            # --- HTML Template with Inline CSS ---
-            html_content = """
-            <!DOCTYPE html>
-            <html lang="en">
-            <head>
-                <meta charset="UTF-8">
-                <meta name="viewport" content="width=device-width, initial-scale=1.0">
-                <title>Phylogenetic Analysis Report - {query_id}</title>
-                <style>
-                    body {{
-                        font-family: Arial, sans-serif;
-                        margin: 20px;
-                        background-color: #f9f9f9;
-                        color: #333;
-                    }}
-                    h1 {{
-                        text-align: center;
-                        color: #2c3e50;
-                    }}
-                    h2 {{
-                        color: #34495e;
-                        margin-top: 20px;
-                    }}
-                    table {{
-                        width: 100%;
-                        border-collapse: collapse;
-                        margin-bottom: 20px;
-                        background-color: #fff;
-                        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
-                    }}
-                    th, td {{
-                        padding: 10px;
-                        text-align: left;
-                        border: 1px solid #ddd;
-                    }}
-                    th {{
-                        background-color: #3498db;
-                        color: #fff;
-                    }}
-                    tr:nth-child(even) {{
-                        background-color: #f2f2f2;
-                    }}
-                    tr:hover {{
-                        background-color: #e0f7fa;
-                    }}
-                    .metadata {{
-                        margin-left: 20px;
-                        font-size: 0.9em;
-                    }}
-                    .metadata p {{
-                        margin: 5px 0;
-                    }}
-                    @media (max-width: 600px) {{
-                        table {{
-                            font-size: 0.85em;
-                        }}
-                        th, td {{
-                            padding: 8px;
-                        }}
-                    }}
-                </style>
-            </head>
-            <body>
-                <h1>Phylogenetic Analysis Report</h1>
-                <p style="text-align: center;">Generated on: {timestamp}</p>
-                <p style="text-align: center;">Query ID: {query_id}</p>
-            """
-            # Add timestamp and query ID to HTML
-            timestamp = time.strftime("%Y-%m-%d %H:%M:%S %Z")
-            html_content = html_content.format(query_id=self.query_id, timestamp=timestamp)
-            # --- Query Information ---
-            query_type = (
-                "Accession Number" if self.query_id in self.data['Accession Number'].values else
-                "Dataset Sequence" if self.query_sequence in self.data['F-gene'].values else
-                "Novel Sequence"
-            )
-            query_ml = "Unknown"
-            query_genotype = "Unknown"
-            query_metadata = {}
-            if query_type == "Novel Sequence":
-                query_ml = self.predict_ml_group(self.query_sequence)
-                query_genotype = self.predict_genotype(self.query_sequence)
-                query_metadata = {"F-gene": self.query_sequence[:50] + "..." if len(self.query_sequence) > 50 else self.query_sequence}
-            else:
-                query_row = self.data[
-                    (self.data['Accession Number'] == self.query_id) |
-                    (self.data['F-gene'] == re.sub(r'[^ATGC]', '', self.query_sequence.upper()))
-                ].iloc[0]
-                query_ml = query_row['ML']
-                query_genotype = query_row['Genotype']
-                query_metadata = query_row.to_dict()
-                query_metadata['F-gene'] = query_metadata['F-gene'][:50] + "..." if len(query_metadata['F-gene']) > 50 else query_metadata['F-gene']
-            query_info_table = [
-                ["Query ID", self.query_id],
-                ["Query Type", query_type],
-                ["Sequence Length", f"{len(self.query_sequence)} nucleotides"],
-                ["ML Group", query_ml],
-                ["Genotype", query_genotype],
-                ["Target Similarity", f"{self.matching_percentage}%"],
-                ["Actual Similarity", f"{actual_percentage:.1f}%"]
-            ]
-            # Add Query Information section
-            html_content += """
-                <h2>Query Information</h2>
-                <table>
-                    <tr><th>Field</th><th>Value</th></tr>
-            """
-            for row in query_info_table:
-                html_content += f"""
-                    <tr><td>{row[0]}</td><td>{row[1]}</td></tr>
-                """
-            html_content += """
-                </table>
-                <div class="metadata">
-                    <h3>Metadata</h3>
-            """
-            for key, value in query_metadata.items():
-                html_content += f"""
-                    <p><strong>{key}:</strong> {value}</p>
-                """
-            html_content += """
-                </div>
-            """
-            # --- Matched Sequences ---
-            matched_sequences_table = []
-            headers = ["Accession Number", "Similarity (%)", "ML Group", "Genotype", "Host", "Country", "Isolate", "Year"]
-            for seq_id in matched_ids:
-                row = self.data[self.data['Accession Number'] == seq_id].iloc[0]
-                matched_sequences_table.append([
-                    seq_id,
-                    f"{self.similarity_scores.get(seq_id, 0.0):.1f}",
-                    row.get('ML', 'N/A'),
-                    row.get('Genotype', 'N/A'),
-                    row.get('Host', 'N/A'),
-                    row.get('Country', 'N/A'),
-                    row.get('Isolate', 'N/A'),
-                    row.get('Year', 'N/A')
-                ])
-            # Add Matched Sequences section
-            html_content += f"""
-                <h2>Matched Sequences</h2>
-                <p>Total Matched Sequences: {len(matched_ids)}</p>
-            """
-            if matched_sequences_table:
-                html_content += """
-                    <table>
-                        <tr>
-                """
-                for header in headers:
-                    html_content += f"<th>{header}</th>"
-                html_content += """
-                        </tr>
-                """
-                for row in matched_sequences_table:
-                    html_content += "<tr>"
-                    for cell in row:
-                        html_content += f"<td>{cell}</td>"
-                    html_content += "</tr>"
-                html_content += """
-                    </table>
-                """
-            else:
-                html_content += """
-                    <p>No matched sequences found.</p>
-                """
-            # --- Model Performance ---
-            model_performance_table = [
-                ["ML Model Accuracy", f"{self.ml_model_accuracy:.2%}" if self.ml_model_accuracy else "Not trained"],
-                ["Genotype Model Accuracy", f"{self.genotype_model_accuracy:.2%}" if self.genotype_model_accuracy else "Not trained"]
-            ]
-            # Add Model Performance section
-            html_content += """
-                <h2>Model Performance</h2>
-                <table>
-                    <tr><th>Metric</th><th>Value</th></tr>
-            """
-            for row in model_performance_table:
-                html_content += f"""
-                    <tr><td>{row[0]}</td><td>{row[1]}</td></tr>
-                """
-            html_content += """
-                </table>
-            """
-            # --- Phylogenetic Tree Insights ---
-            def count_nodes(node):
-                count = 1
-                for child in node.get('children', {}).values():
-                    count += count_nodes(child)
-                return count
-            total_nodes = count_nodes(self.tree_structure)
-            query_node_path = []
-            def find_query_path(node, path):
-                if node.get('data', {}).get('is_query', False):
-                    query_node_path.append(" -> ".join(path + [node['name']]))
-                for name, child in node.get('children', {}).items():
-                    find_query_path(child, path + [node['name']])
-            find_query_path(self.tree_structure['root'], [])
-            tree_insights_table = [
-                ["Total Nodes", total_nodes],
-                ["ML Groups Represented", len(self.tree_structure['root']['children'])],
-                ["Query Node Path", query_node_path[0] if query_node_path else "Not found"]
-            ]
-            # Add Phylogenetic Tree Insights section
-            html_content += """
-                <h2>Phylogenetic Tree Insights</h2>
-                <table>
-                    <tr><th>Field</th><th>Value</th></tr>
-            """
-            for row in tree_insights_table:
-                html_content += f"""
-                    <tr><td>{row[0]}</td><td>{row[1]}</td></tr>
-                """
-            html_content += """
-                </table>
-            """
-            # --- ML Analysis Results ---
-            ml_analysis = self.tree_structure.get('ml_analysis', {})
-            ml_analysis_table = [
-                ["ML Tree Available", ml_analysis.get('ml_tree_available', False)],
-                ["Log-Likelihood", f"{ml_analysis.get('log_likelihood', 'N/A'):.2f}" if ml_analysis.get('log_likelihood') else "N/A"],
-                ["Sequence Count", ml_analysis.get('sequence_count', 'N/A')],
-                ["Alignment Length", ml_analysis.get('alignment_length', 'N/A')]
-            ]
-            # Add ML Analysis Results section
-            html_content += """
-                <h2>Maximum Likelihood Analysis Results</h2>
-                <table>
-                    <tr><th>Field</th><th>Value</th></tr>
-            """
-            for row in ml_analysis_table:
-                html_content += f"""
-                    <tr><td>{row[0]}</td><td>{row[1]}</td></tr>
-                """
-            html_content += """
-                </table>
-            """
-            # --- Close HTML ---
-            html_content += """
-            </body>
-            </html>
-            """
-            # --- Save HTML Report ---
-            report_filename = f"detailed_report_{self.query_id.replace('/', '_')}.html"
-            print(f"Attempting to save report to: {os.path.abspath(report_filename)}")
-            with open(report_filename, 'w') as f:
-                f.write(html_content)
-            print(f"✓ Detailed HTML report saved as '{report_filename}'")
-            return True
-        except Exception as e:
-            print(f"Error generating detailed report: {str(e)}")
-            import traceback
-            traceback.print_exc()
-            return False
-def command_line_interface():
-    """Parse command-line arguments and run phylogenetic analysis."""
-    parser = argparse.ArgumentParser(
-        description="Advanced Phylogenetic Tree Analyzer with AI-enhanced similarity matching",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="Examples:\n  %(prog)s -d data.csv -q MH087032 -s 95\n  %(prog)s -d data.csv -q MH087032 -s 90 --no-ai --batch query1,query2,query3"
-    )
-    parser.add_argument('-d', '--data', required=True, help='Path to CSV data file')
-    parser.add_argument('-q', '--query', required=True, help='Query sequence ID or nucleotide sequence')
-    parser.add_argument('-s', '--similarity', type=float, default=95.0, help='Target similarity percentage (70-99, default: 95)')
-    parser.add_argument('--no-ai', action='store_true', help='Skip AI model training')
-    parser.add_argument('--batch', help='Comma-separated list of query IDs for batch processing')
-    parser.add_argument('--output-dir', default='.', help='Output directory for results')
-    parser.add_argument('--save-json', action='store_true', help='Save detailed results to JSON')
-    args = parser.parse_args()
-    # Validate arguments
-    if not 70 <= args.similarity <= 99:
-        print("❌ Similarity percentage must be between 70 and 99.")
-        sys.exit(1)
-    if not Path(args.data).exists():
-        print(f"❌ Data file not found: {args.data}")
-        sys.exit(1)
-    # Initialize analyzer
-    analyzer = PhylogeneticTreeAnalyzer()
-    if not analyzer.load_data(args.data):
-        print("❌ Failed to load data.")
-        sys.exit(1)
-    # Train AI model unless disabled
-    if not args.no_ai:
-        print("⏳ Training AI model...")
-        start_time = time.time()
-        if analyzer.train_ai_model():
-            print(f"✅ AI model training completed in {time.time() - start_time:.1f} seconds")
-        else:
-            print("⚠️ AI model training failed, continuing with basic analysis")
-    # Process queries
-    queries = args.batch.split(',') if args.batch else [args.query]
-    for query in queries:
-        query = query.strip()
-        print(f"🔍 Processing: {query}")
-        if not analyzer.find_query_sequence(query):
-            print(f"❌ Query not found: {query}")
-            continue
-        matched_ids, actual_percentage = analyzer.find_similar_sequences(args.similarity)
-        if not matched_ids:
-            print(f"❌ No similar sequences found for {query}")
-            continue
-        analyzer.build_tree_structure_with_ml_safe(matched_ids)
-        fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
-        if fig:
-            html_filename = f"phylogenetic_tree_{query.replace('/', '_')}_interactive.html"
-            fig.write_html(html_filename)
-            print(f"📄 Interactive HTML saved: {html_filename}")
-            analyzer.generate_detailed_report(matched_ids, actual_percentage)
-            print(f"📄 Detailed HTML report saved: detailed_report_{query.replace('/', '_')}.html")
-        print(f"✅ Analysis completed for {query}")
-def main():
-    """Run interactive phylogenetic analysis with user input."""
-    print("\n" + "="*70)
-    print("🧬 PHYLOGENETIC TREE ANALYZER - ADVANCED ML-BASED ANALYSIS")
-    print("Version 2.0 | AI-Enhanced Similarity Matching")
-    print("="*70)
-    analyzer = PhylogeneticTreeAnalyzer()
-    # Load data
-    data_file = "f cleaned.csv"
-    while not Path(data_file).exists() or not analyzer.load_data(data_file):
-        print(f"❌ File not found or invalid: {data_file}")
-        data_file = input("Enter valid data file path: ").strip()
-        if not data_file:
-            print("❌ Analysis cancelled.")
-            return
-    # Train AI model
-    print("⏳ Training AI model...")
-    start_time = time.time()
-    if analyzer.train_ai_model():
-        print(f"✅ AI model training completed in {time.time() - start_time:.1f} seconds")
-    else:
-        print("⚠️ AI model training failed, continuing with basic analysis")
-    # Get query sequence
-    while True:
-        query_input = input("\nEnter query sequence or ID (min 10 nucleotides): ").strip()
-        if analyzer.find_query_sequence(query_input):
-            break
-        retry = input("❌ Invalid input. Try again? (y/n): ").strip().lower()
-        if retry != 'y':
-            print("👋 Analysis cancelled.")
-            return
-    # Set similarity percentage
-    while True:
-        try:
-            similarity_input = input("Enter target similarity percentage (1-99) [85]: ").strip()
-            target_percentage = float(similarity_input) if similarity_input else 85.0
-            if 1 <= target_percentage <= 99:
-                analyzer.matching_percentage = target_percentage
-                break
-            print("❌ Please enter a percentage between 1 and 99.")
-        except ValueError:
-            print("❌ Please enter a valid number.")
-    # Find similar sequences
-    print(f"⏳ Analyzing sequences for {target_percentage}% similarity...")
-    start_time = time.time()
-    matched_ids, actual_percentage = analyzer.find_similar_sequences(target_percentage)
-    if not matched_ids:
-        print(f"❌ No similar sequences found at {target_percentage}% similarity.")
-        return
-    analyzer.matched_sequences = matched_ids
-    analyzer.actual_percentage = actual_percentage
-    print(f"✅ Similarity analysis completed in {time.time() - start_time:.1f} seconds")
-    # Build tree structure
-    print("⏳ Building phylogenetic tree structure...")
-    start_time = time.time()
-    tree_structure = analyzer.build_tree_structure_with_ml_safe(matched_ids)
-    if not tree_structure:
-        print("❌ Failed to build tree structure.")
-        return
-    print(f"✅ Tree structure built in {time.time() - start_time:.1f} seconds")
-    # Create visualization and save HTML
-    print("⏳ Creating interactive visualization...")
-    start_time = time.time()
-    fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
-    if not fig:
-        print("❌ Visualization creation failed.")
-        return
-    html_filename = "phylogenetic_tree_interactive.html"
-    fig.write_html(html_filename)
-    print(f"📄 Interactive HTML saved: {html_filename}")
-    # Generate detailed report
-    print("⏳ Generating detailed report...")
-    start_time = time.time()
-    if analyzer.generate_detailed_report(matched_ids, actual_percentage):
-        print(f"✅ Detailed report generated in {time.time() - start_time:.1f} seconds")
-    print(f"\n🎉 Analysis completed successfully!")
-    print(f"   Query ID: {analyzer.query_id}")
-    print(f"   Query sequence length: {len(analyzer.query_sequence)} nucleotides")
-    print(f"   Similar sequences found: {len(matched_ids)}")
-    print(f"   Actual similarity percentage: {actual_percentage:.1f}%")
-    print(f"   HTML visualization file: {html_filename}")
-    print(f"   HTML report file: detailed_report_{analyzer.query_id.replace('/', '_')}.html")
-if __name__ == "__main__":
-    try:
-        main()
-    except KeyboardInterrupt:
-        print("\n👋 Goodbye!")
-        sys.exit(0)
-    except Exception as e:
-        print(f"\n❌ Unexpected error: {e}")
-        sys.exit(1)