re-type committed on
Commit
2438908
·
verified ·
1 Parent(s): 183a83a

Delete analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +0 -1577
analyzer.py DELETED
@@ -1,1577 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- import plotly.graph_objects as go
4
- from Bio import SeqIO, AlignIO
5
- from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor, DistanceMatrix
6
- from Bio.Phylo.BaseTree import Tree
7
- from Bio.Align import MultipleSeqAlignment
8
- from Bio.Seq import Seq
9
- from Bio.SeqRecord import SeqRecord
10
- from sklearn.ensemble import RandomForestClassifier
11
- from sklearn.model_selection import train_test_split
12
- from sklearn.preprocessing import LabelEncoder
13
- import warnings
14
- import os
15
- import sys
16
- import re
17
- import time
18
- from pathlib import Path
19
- from typing import Dict, List, Tuple, Optional
20
- import itertools
21
- import argparse
22
-
23
- warnings.filterwarnings('ignore')
24
-
25
- class PhylogeneticTreeAnalyzer:
26
- """Analyzes phylogenetic relationships using ML-based sequence similarity and tree construction."""
27
-
28
def __init__(self):
    """Create an analyzer with no data loaded and no models trained."""
    # Loaded data set and the query currently being analysed.
    self.data = None
    self.query_id = None
    self.query_sequence = None

    # Similarity-search state.
    self.matching_percentage = 95.0
    self.actual_percentage = None
    self.matched_sequences = []
    self.similarity_scores = {}

    # Classifiers, their label encoders and their held-out accuracies.
    self.ai_model = None  # ML model for sequence classification
    self.label_encoder = LabelEncoder()  # Encoder for ML labels
    self.ml_model_accuracy = None  # Accuracy of ML model
    self.genotype_model = None  # Model for genotype prediction
    self.genotype_label_encoder = LabelEncoder()  # Encoder for genotype labels
    self.genotype_model_accuracy = None  # Accuracy of genotype model

    # Results of the ML analysis step.
    self.ml_tree = None
    self.ml_alignment = None
    self.ml_results = {}

    # Tree layout state.
    self.tree_structure = {}
    self.horizontal_line_tracker = []
    self.query_ml_group = None
    self.base_horizontal_length = 1.2
49
-
50
- # --- Data Loading ---
51
def load_data(self, data_file: str) -> bool:
    """Load sequence data from a CSV file into ``self.data``.

    The frame is stored on the instance only after it has been read and the
    ``ML`` and ``Genotype`` columns needed for the summary line have been
    found, so a load that reports failure never replaces ``self.data``.

    Args:
        data_file: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        True when the file was read and summarised, False otherwise (the
        error is printed).
    """
    try:
        frame = pd.read_csv(data_file)
        print(f"✓ Data loaded: {len(frame)} sequences, "
              f"{frame['ML'].nunique()} ML groups, "
              f"{frame['Genotype'].nunique()} genotypes")
    except Exception as e:
        print(f"Error loading data: {e}")
        return False
    # Publish the frame only once every step above has succeeded.
    self.data = frame
    return True
62
-
63
- # --- Model Training ---
64
def train_ai_model(self) -> bool:
    """Train RandomForest models for ML group and genotype prediction.

    Each ``F-gene`` sequence becomes a 100-value feature vector: counts of
    the first 50 3-mers followed by the first 50 4-mers, both taken in
    ``itertools.product('ATGC', ...)`` order. Sequences with fewer than three
    valid bases get an all-zero vector.

    Returns:
        True when the ML-group model was trained (the genotype model is
        trained as well when at least two genotype classes exist), False
        when there is too little data, fewer than two ML classes, or an
        error occurred (the error is printed).
    """
    from collections import Counter

    try:
        if self.data is None or len(self.data) < 10:
            print("⚠️ Insufficient data for training (minimum 10 samples)")
            return False

        print("🤖 Training AI models...")
        # The k-mer vocabularies do not depend on the sequence, so build
        # them once instead of once per row.
        vocab_3 = [''.join(p) for p in itertools.product('ATGC', repeat=3)][:50]
        vocab_4 = [''.join(p) for p in itertools.product('ATGC', repeat=4)][:50]

        features = []
        for seq in self.data['F-gene'].fillna('').astype(str):
            seq_clean = re.sub(r'[^ATGC]', '', seq.upper())
            if len(seq_clean) < 3:
                features.append([0] * 100)
                continue
            counts_3 = Counter(seq_clean[i:i + 3] for i in range(len(seq_clean) - 2))
            counts_4 = Counter(seq_clean[i:i + 4] for i in range(len(seq_clean) - 3))
            features.append([counts_3[kmer] for kmer in vocab_3]
                            + [counts_4[kmer] for kmer in vocab_4])

        X = np.array(features)

        # Train ML model
        ml_targets = self.label_encoder.fit_transform(self.data['ML'].fillna('Unknown'))
        if len(np.unique(ml_targets)) < 2:
            print("⚠️ Need at least 2 ML classes for training")
            return False
        X_train, X_test, y_train, y_test = train_test_split(X, ml_targets, test_size=0.2, random_state=42)
        self.ai_model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.ai_model.fit(X_train, y_train)
        self.ml_model_accuracy = self.ai_model.score(X_test, y_test)
        print(f"✓ ML model trained with accuracy: {self.ml_model_accuracy:.2%}")

        # Train genotype model (skipped when only one genotype class exists)
        genotype_targets = self.genotype_label_encoder.fit_transform(self.data['Genotype'].fillna('Unknown'))
        if len(np.unique(genotype_targets)) >= 2:
            X_train, X_test, y_train, y_test = train_test_split(X, genotype_targets, test_size=0.2, random_state=42)
            self.genotype_model = RandomForestClassifier(n_estimators=100, random_state=42)
            self.genotype_model.fit(X_train, y_train)
            self.genotype_model_accuracy = self.genotype_model.score(X_test, y_test)
            print(f"✓ Genotype model trained with accuracy: {self.genotype_model_accuracy:.2%}")

        return True
    except Exception as e:
        print(f"Error training models: {e}")
        return False
116
-
117
def predict_ml_group(self, sequence: str) -> str:
    """Return the ML group predicted for ``sequence`` by ``self.ai_model``.

    The sequence is upper-cased and stripped of everything but A/T/G/C, then
    described by the counts of the first 50 3-mers and first 50 4-mers (in
    ``itertools.product('ATGC', ...)`` order). ``"Unknown"`` is returned when
    no model is set, fewer than three valid bases remain, or an error occurs.
    """
    fallback = "Unknown"
    try:
        if not self.ai_model:
            return fallback
        cleaned = re.sub(r'[^ATGC]', '', sequence.upper())
        if len(cleaned) < 3:
            return fallback

        def window_counts(width):
            # Occurrences of every distinct window of the given width.
            windows = [cleaned[pos:pos + width] for pos in range(len(cleaned) - width + 1)]
            return {window: windows.count(window) for window in set(windows)}

        def vocabulary(width):
            # First 50 k-mers of the given width over the ATGC alphabet.
            return [''.join(letters) for letters in itertools.product('ATGC', repeat=width)][:50]

        row = []
        for width in (3, 4):
            counts = window_counts(width)
            row.extend(counts.get(kmer, 0) for kmer in vocabulary(width))

        encoded = self.ai_model.predict(np.array([row]))
        return self.label_encoder.inverse_transform(encoded)[0]
    except Exception as e:
        print(f"Error predicting ML group: {e}")
        return fallback
140
-
141
def predict_genotype(self, sequence: str) -> str:
    """Return the genotype predicted for ``sequence`` by ``self.genotype_model``.

    Uses the 100-value k-mer count vector (first 50 3-mers, then first 50
    4-mers, in ``itertools.product('ATGC', ...)`` order) built from the
    upper-cased, A/T/G/C-only sequence. ``"Unknown"`` is returned when no
    model is set, fewer than three valid bases remain, or an error occurs.
    """
    try:
        if not self.genotype_model:
            return "Unknown"
        bases = re.sub(r'[^ATGC]', '', sequence.upper())
        if len(bases) < 3:
            return "Unknown"

        vector = []
        for size in (3, 4):
            # Sliding windows of this size and how often each one occurs.
            pieces = [bases[start:start + size] for start in range(len(bases) - size + 1)]
            tally = {piece: pieces.count(piece) for piece in set(pieces)}
            alphabet = [''.join(combo) for combo in itertools.product('ATGC', repeat=size)]
            vector += [tally.get(kmer, 0) for kmer in alphabet[:50]]

        predicted = self.genotype_model.predict(np.array([vector]))
        labels = self.genotype_label_encoder.inverse_transform(predicted)
        return labels[0]
    except Exception as e:
        print(f"Error predicting genotype: {e}")
        return "Unknown"
164
-
165
- # --- Sequence Processing ---
166
def find_query_sequence(self, query_input: str) -> bool:
    """Resolve the query and store it in ``query_id`` / ``query_sequence``.

    The input is tried, in order, as:
      1. an accession number present in ``self.data``;
      2. a nucleotide sequence equal to a stored ``F-gene`` value (after
         upper-casing and dropping non-A/T/G/C characters);
      3. a novel sequence of at least 10 valid bases, which is given a
         synthetic ``QUERY_#####`` id.

    The synthetic id is derived from a CRC32 of the cleaned sequence rather
    than the built-in ``hash()``: string hashes are salted per interpreter
    process, so ``hash()`` would give the same sequence a different id on
    every run.

    Returns:
        True when the query was resolved, False when it is too short, not
        found, or an error occurred (the error is printed).
    """
    import zlib

    try:
        query_input = query_input.strip()
        if query_input in self.data['Accession Number'].values:
            self.query_id = query_input
            query_row = self.data[self.data['Accession Number'] == query_input].iloc[0]
            self.query_sequence = query_row['F-gene']
            print(f"✓ Query found by accession: {query_input}, ML: {query_row['ML']}, Genotype: {query_row['Genotype']}")
            return True

        query_clean = re.sub(r'[^ATGC]', '', str(query_input).upper())
        if query_clean in self.data['F-gene'].values:
            query_row = self.data[self.data['F-gene'] == query_clean].iloc[0]
            self.query_id = query_row['Accession Number']
            self.query_sequence = query_clean
            print(f"✓ Query matched to accession: {self.query_id}, ML: {query_row['ML']}, Genotype: {query_row['Genotype']}")
            return True

        if len(query_clean) >= 10:
            # Stable across runs, unlike hash(); keeps the QUERY_ prefix and
            # five-digit width of the original id format.
            digest = zlib.crc32(query_clean.encode('ascii')) % 100000
            self.query_id = f"QUERY_{digest:05d}"
            self.query_sequence = query_clean
            predicted_ml = self.predict_ml_group(query_clean)
            predicted_genotype = self.predict_genotype(query_clean)
            print(f"✓ Novel query accepted: {self.query_id}, Length: {len(query_clean)}, "
                  f"Predicted ML: {predicted_ml}, Predicted Genotype: {predicted_genotype}")
            return True

        print("✗ Invalid query: Too short (<10) or not found")
        return False
    except Exception as e:
        print(f"Error processing query: {e}")
        return False
196
-
197
def calculate_f_gene_similarity(self, seq1: str, seq2: str) -> float:
    """Return the percentage similarity of two sequences via shared 5-mers.

    Both inputs are upper-cased and stripped of everything but A/T/G/C. The
    score is the Jaccard index of their 5-mer sets (shared / total distinct),
    as a percentage rounded to two decimals.

    When both sequences are too short to contain any 5-mer there is nothing
    to compare by k-mers, so they score 100.0 only if the cleaned sequences
    are identical and 0.0 otherwise.

    Returns:
        A value in [0.0, 100.0]; 0.0 for empty input or on error (the error
        is printed).
    """
    try:
        if not seq1 or not seq2:
            return 0.0
        seq1 = re.sub(r'[^ATGC]', '', str(seq1).upper())
        seq2 = re.sub(r'[^ATGC]', '', str(seq2).upper())
        if not seq1 or not seq2:
            return 0.0

        k = 5
        kmers1 = {seq1[i:i + k] for i in range(len(seq1) - k + 1)}
        kmers2 = {seq2[i:i + k] for i in range(len(seq2) - k + 1)}
        if not kmers1 and not kmers2:
            # Previously any two short sequences counted as a perfect match.
            return 100.0 if seq1 == seq2 else 0.0
        if not kmers1 or not kmers2:
            return 0.0

        shared = len(kmers1 & kmers2)
        total = len(kmers1 | kmers2)
        return round((shared / total) * 100, 2)
    except Exception as e:
        print(f"Error calculating similarity: {e}")
        return 0.0
219
-
220
def find_similar_sequences(self, target_percentage: float) -> Tuple[List[str], float]:
    """Select the sequences whose similarity to the query is near a target.

    Every row of ``self.data`` other than the query itself is scored with
    ``calculate_f_gene_similarity``. Rows within 2.0 points of
    ``target_percentage`` are kept; if there are none, the closest observed
    similarity becomes the effective target and rows within 1.0 point of it
    are kept instead. At most 50 rows (highest similarity first) are
    returned and their scores are stored in ``self.similarity_scores``.

    Returns:
        ``(matched_ids, actual_percentage)``; ``([], target_percentage)``
        when nothing can be compared or an error occurs.
    """
    try:
        print(f"🔍 Finding sequences with {target_percentage}% similarity...")
        scored = [
            {
                'id': record['Accession Number'],
                'similarity': self.calculate_f_gene_similarity(self.query_sequence, record['F-gene']),
                'ml': record.get('ML', 'Unknown'),
                'genotype': record.get('Genotype', 'Unknown'),
            }
            for _, record in self.data.iterrows()
            if record['Accession Number'] != self.query_id
        ]
        if not scored:
            print("❌ No valid sequences for comparison")
            return [], target_percentage

        # Highest similarity first; ties keep their data-frame order.
        scored = sorted(scored, key=lambda entry: entry['similarity'], reverse=True)

        def within(center, tolerance):
            return [entry for entry in scored if abs(entry['similarity'] - center) <= tolerance]

        hits = within(target_percentage, 2.0)
        if hits:
            actual_percentage = target_percentage
        else:
            nearest = min(scored, key=lambda entry: abs(entry['similarity'] - target_percentage))
            actual_percentage = nearest['similarity']
            hits = within(actual_percentage, 1.0)
            print(f"⚠ No sequences at {target_percentage}%. Using closest: {actual_percentage:.1f}%")

        cap = 50
        if len(hits) > cap:
            hits = hits[:cap]
            print(f"⚠ Limited to top {cap} matches")

        self.similarity_scores = {hit['id']: hit['similarity'] for hit in hits}
        matched_ids = [hit['id'] for hit in hits]

        values = [entry['similarity'] for entry in scored]
        lowest, highest = min(values), max(values)
        mean = sum(values) / len(values)
        print(f"✓ Found {len(matched_ids)} sequences at ~{actual_percentage:.1f}% similarity, "
              f"Range: {lowest:.1f}% - {highest:.1f}% (avg: {mean:.1f}%)")
        return matched_ids, actual_percentage
    except Exception as e:
        print(f"Error finding similar sequences: {e}")
        return [], target_percentage
264
-
265
- # --- Tree Construction ---
266
def build_tree_structure(self, matched_ids: List[str]) -> Dict:
    """Build the hierarchical tree dict: root -> ML groups -> genotypes.

    Every row of ``self.data`` is grouped by its ``ML`` value and then its
    ``Genotype`` value and tagged as query / matched with its stored
    similarity. A query whose id starts with ``QUERY_`` is not in the data,
    so it is inserted under its predicted ML group and genotype. The grouped
    records are then normalised and laid out by the ``_normalize_ml_groups``
    and ``_build_normalized_ml_nodes`` helpers.

    Returns:
        The tree dict (also stored in ``self.tree_structure``), or ``{}``
        when an error occurs (the error is printed).
    """
    try:
        print("🌳 Building normalized tree structure...")
        root_node = {
            'name': 'Root', 'type': 'root', 'children': {}, 'x': 0, 'y': 0,
            'has_vertical_attachment': False, 'extension_level': 0,
        }
        tree = {'root': root_node}

        grouped = {}
        for _, record in self.data.iterrows():
            accession = record['Accession Number']
            bucket = grouped.setdefault(record['ML'], {}).setdefault(record['Genotype'], [])
            bucket.append({
                'id': accession,
                'data': record.to_dict(),
                'is_query': accession == self.query_id,
                'is_matched': accession in matched_ids,
                'similarity': self.similarity_scores.get(accession, 0.0),
            })

        if self.query_id.startswith("QUERY_"):
            # Novel query: place it where the trained models predict.
            ml_guess = self.predict_ml_group(self.query_sequence)
            genotype_guess = self.predict_genotype(self.query_sequence)
            grouped.setdefault(ml_guess, {}).setdefault(genotype_guess, []).append({
                'id': self.query_id,
                'data': {
                    'F-gene': self.query_sequence, 'ML': ml_guess, 'Genotype': genotype_guess,
                    'Accession Number': self.query_id,
                },
                'is_query': True,
                'is_matched': False,
                'similarity': 100.0,
            })

        normalized = self._normalize_ml_groups(grouped)
        self._build_normalized_ml_nodes(tree, normalized, matched_ids)
        self.tree_structure = tree
        print("✓ Tree structure built")
        return tree
    except Exception as e:
        print(f"Error building tree structure: {e}")
        return {}
308
-
309
def build_tree_structure_with_ml_safe(self, matched_ids: List[str]) -> Dict:
    """Build the tree and attach a summary of the ML analysis to it.

    ``perform_ml_analysis_safe`` is run first, then the regular tree is
    built. When the analysis produced a ``'tree'`` entry, its statistics are
    stored under ``'ml_analysis'`` and the tree/alignment are kept on the
    instance; otherwise ``'ml_analysis'`` records the failure. If anything
    raises, the plain tree is built as a fallback, and
    ``{'error': 'Tree construction failed'}`` is returned if that fails too.
    """
    try:
        print("🌳 Building ML-enhanced tree structure...")
        analysis = self.perform_ml_analysis_safe(matched_ids)
        tree = self.build_tree_structure(matched_ids)

        if not (analysis and 'tree' in analysis):
            tree['ml_analysis'] = {'ml_tree_available': False, 'error': 'ML analysis failed'}
            print("⚠ ML analysis failed, using standard tree")
            return tree

        summary = {
            'log_likelihood': analysis['log_likelihood'],
            'sequence_count': analysis['sequence_count'],
            'alignment_length': analysis['alignment_length'],
            'ml_tree_available': True,
        }
        tree['ml_analysis'] = summary
        self.ml_tree = analysis['tree']
        self.ml_alignment = analysis.get('alignment')
        print("✓ Tree enhanced with ML analysis")
        return tree
    except Exception as e:
        print(f"Error building ML-enhanced tree: {e}")
        try:
            return self.build_tree_structure(matched_ids)
        except Exception as e2:
            print(f"Fallback failed: {e2}")
            return {'error': 'Tree construction failed'}
336
-
337
- def _normalize_ml_groups(self, ml_groups: Dict) -> Dict:
338
- """Normalizes ML group names for hierarchical organization."""
339
- try:
340
- normalized_groups = {}
341
- for ml_name, genotypes in ml_groups.items():
342
- base_ml = 'UNCL' if ml_name.startswith('UNCL') else ml_name.split('.')[0] if '.' in ml_name and any(c.isdigit() for c in ml_name) else ml_name
343
- if base_ml not in normalized_groups:
344
- normalized_groups[base_ml] = {'full_ml_groups': {}, 'representative_sequences': [], 'has_special_sequences': False}
345
- has_special = any(any(seq['is_query'] or seq['is_matched'] for seq in seqs) for seqs in genotypes.values())
346
- if has_special:
347
- normalized_groups[base_ml]['has_special_sequences'] = True
348
- normalized_groups[base_ml]['full_ml_groups'][ml_name] = genotypes
349
- elif len(normalized_groups[base_ml]['representative_sequences']) < 2:
350
- for genotype, sequences in list(genotypes.items())[:2]:
351
- if len(normalized_groups[base_ml]['representative_sequences']) < 2:
352
- normalized_groups[base_ml]['representative_sequences'].extend(sequences[:1])
353
- return normalized_groups
354
- except Exception as e:
355
- print(f"Error normalizing ML groups: {e}")
356
- return {}
357
-
358
- def _build_normalized_ml_nodes(self, tree_structure: Dict, normalized_ml_groups: Dict, matched_ids: List[str]):
359
- """Builds normalized ML nodes with equal spacing."""
360
- try:
361
- self.horizontal_line_tracker = []
362
- self._identify_query_ml_group(normalized_ml_groups)
363
- ml_positions = self._calculate_dynamic_ml_positions(normalized_ml_groups)
364
- tree_structure['root']['has_vertical_attachment'] = len(normalized_ml_groups) > 1
365
- for ml_idx, (base_ml, ml_data) in enumerate(normalized_ml_groups.items()):
366
- y_pos = ml_positions[ml_idx]
367
- has_vertical = ml_data['has_special_sequences'] and len(ml_data['full_ml_groups']) > 1
368
- contains_query = base_ml == self.query_ml_group
369
- horizontal_length = self._determine_horizontal_line_length('normalized_ml_group', has_vertical, contains_query)
370
- x_pos = horizontal_length
371
- tree_structure['root']['children'][base_ml] = {
372
- 'name': base_ml, 'type': 'normalized_ml_group', 'children': {}, 'x': x_pos, 'y': y_pos,
373
- 'has_special_sequences': ml_data['has_special_sequences'], 'has_vertical_attachment': has_vertical,
374
- 'horizontal_line_length': horizontal_length, 'contains_query': contains_query
375
- }
376
- if ml_data['has_special_sequences']:
377
- self._build_full_ml_nodes(tree_structure['root']['children'][base_ml], ml_data['full_ml_groups'],
378
- y_pos, matched_ids, x_pos)
379
- else:
380
- self._add_representative_sequences(tree_structure['root']['children'][base_ml],
381
- ml_data['representative_sequences'], y_pos, x_pos)
382
- except Exception as e:
383
- print(f"Error building normalized ML nodes: {e}")
384
-
385
- def _build_full_ml_nodes(self, normalized_ml_node: Dict, full_ml_groups: Dict, base_y: float, matched_ids: List[str], parent_x: float):
386
- """Builds full ML nodes with genotypes."""
387
- try:
388
- full_ml_positions = self._calculate_full_ml_positions(full_ml_groups, base_y)
389
- for ml_idx, (full_ml_name, genotypes) in enumerate(full_ml_groups.items()):
390
- y_pos = full_ml_positions[ml_idx]
391
- special_genotypes_count = sum(1 for g, seqs in genotypes.items() if any(s['is_query'] or s['is_matched'] for s in seqs))
392
- has_vertical = special_genotypes_count > 1
393
- contains_query = any(any(seq['is_query'] for seq in seqs) for seqs in genotypes.values())
394
- horizontal_length = self._determine_horizontal_line_length('full_ml_group', has_vertical, contains_query)
395
- x_pos = parent_x + horizontal_length
396
- normalized_ml_node['children'][full_ml_name] = {
397
- 'name': full_ml_name, 'type': 'full_ml_group', 'children': {}, 'x': x_pos, 'y': y_pos,
398
- 'sequences_count': sum(len(seqs) for seqs in genotypes.values()), 'has_vertical_attachment': has_vertical,
399
- 'horizontal_line_length': horizontal_length, 'contains_query': contains_query
400
- }
401
- self._build_genotype_nodes(normalized_ml_node['children'][full_ml_name], genotypes, y_pos, matched_ids, x_pos)
402
- except Exception as e:
403
- print(f"Error building full ML nodes: {e}")
404
-
405
- def _build_genotype_nodes(self, full_ml_node: Dict, genotypes: Dict, base_y: float, matched_ids: List[str], parent_x: float):
406
- """Builds genotype nodes with sequences."""
407
- try:
408
- special_genotypes = [(g, seqs) for g, seqs in genotypes.items() if any(s['is_query'] or s['is_matched'] for s in seqs)]
409
- if not special_genotypes:
410
- return
411
- genotype_positions = self._calculate_genotype_positions(special_genotypes, base_y)
412
- genotype_sequence_counts = [(g, seqs, len([s for s in seqs if s['is_query'] or s['is_matched']])) for g, seqs in special_genotypes]
413
- for gt_idx, (genotype, sequences, sequence_count) in enumerate(genotype_sequence_counts):
414
- y_pos = genotype_positions[gt_idx]
415
- special_sequences = [s for s in sequences if s['is_query'] or s['is_matched']]
416
- has_vertical = len(special_sequences) > 1
417
- contains_query = any(s['is_query'] for s in sequences)
418
- horizontal_length = self._determine_genotype_horizontal_line_length(sequence_count, has_vertical, contains_query)
419
- x_pos = parent_x + horizontal_length
420
- full_ml_node['children'][genotype] = {
421
- 'name': genotype, 'type': 'genotype', 'children': {}, 'x': x_pos, 'y': y_pos,
422
- 'sequences': sequences, 'has_vertical_attachment': has_vertical,
423
- 'horizontal_line_length': horizontal_length, 'contains_query': contains_query,
424
- 'sequence_count': sequence_count
425
- }
426
- self._add_sequences_horizontal(full_ml_node['children'][genotype], sequences, y_pos, x_pos)
427
- except Exception as e:
428
- print(f"Error building genotype nodes: {e}")
429
-
430
- def _add_representative_sequences(self, normalized_ml_node: Dict, representative_sequences: List[Dict], base_y: float, parent_x: float):
431
- """Adds representative sequences to normalized ML nodes."""
432
- try:
433
- if not representative_sequences:
434
- return
435
- has_vertical = len(representative_sequences) > 1
436
- horizontal_length = self._determine_horizontal_line_length('representative', has_vertical)
437
- x_pos = parent_x + horizontal_length
438
- if len(representative_sequences) == 1:
439
- seq = representative_sequences[0]
440
- normalized_ml_node['children'][f"{seq['id']}_rep"] = {
441
- 'name': f"{seq['id']} (Rep)", 'type': 'representative_sequence', 'data': seq,
442
- 'x': x_pos, 'y': base_y, 'has_vertical_attachment': False, 'horizontal_line_length': horizontal_length
443
- }
444
- else:
445
- positions = self._calculate_sequence_positions(representative_sequences, base_y)
446
- for idx, seq in enumerate(representative_sequences):
447
- normalized_ml_node['children'][f"{seq['id']}_rep"] = {
448
- 'name': f"{seq['id']} (Rep)", 'type': 'representative_sequence', 'data': seq,
449
- 'x': x_pos, 'y': positions[idx], 'has_vertical_attachment': False, 'horizontal_line_length': horizontal_length
450
- }
451
- except Exception as e:
452
- print(f"Error adding representative sequences: {e}")
453
-
454
- def _add_sequences_horizontal(self, genotype_node: Dict, sequences: List[Dict], base_y: float, parent_x: float):
455
- """Adds sequences with similarity-based line lengths."""
456
- try:
457
- query_line_length = 3.0
458
- query_sequences = [s for s in sequences if s['is_query']]
459
- matched_sequences = [s for s in sequences if s['is_matched'] and not s['is_query']]
460
- all_special_sequences = query_sequences + matched_sequences
461
- if len(all_special_sequences) == 1:
462
- sequence = all_special_sequences[0]
463
- line_length = self._calculate_similarity_based_line_length(sequence, query_line_length)
464
- x_pos = parent_x + line_length
465
- genotype_node['children'][sequence['id']] = {
466
- 'name': f"{sequence['id']} ({sequence['similarity']}%)" if sequence['is_matched'] else sequence['id'],
467
- 'type': 'sequence', 'data': sequence, 'x': x_pos, 'y': base_y,
468
- 'has_vertical_attachment': False, 'similarity_line_length': line_length
469
- }
470
- else:
471
- sequence_positions = self._calculate_sequence_positions(all_special_sequences, base_y)
472
- for seq_idx, sequence in enumerate(all_special_sequences):
473
- line_length = self._calculate_similarity_based_line_length(sequence, query_line_length)
474
- x_pos = parent_x + line_length
475
- genotype_node['children'][sequence['id']] = {
476
- 'name': f"{sequence['id']} ({sequence['similarity']}%)" if sequence['is_matched'] else sequence['id'],
477
- 'type': 'sequence', 'data': sequence, 'x': x_pos, 'y': sequence_positions[seq_idx],
478
- 'has_vertical_attachment': False, 'similarity_line_length': line_length
479
- }
480
- except Exception as e:
481
- print(f"Error adding sequences: {e}")
482
-
483
- def _identify_query_ml_group(self, normalized_ml_groups: Dict):
484
- """Identifies the ML group containing the query sequence."""
485
- try:
486
- for base_ml, ml_data in normalized_ml_groups.items():
487
- if ml_data['has_special_sequences']:
488
- for genotypes in ml_data['full_ml_groups'].values():
489
- for sequences in genotypes.values():
490
- if any(seq['is_query'] for seq in sequences):
491
- self.query_ml_group = base_ml
492
- return
493
- except Exception as e:
494
- print(f"Error identifying query ML group: {e}")
495
-
496
- def _calculate_dynamic_ml_positions(self, normalized_ml_groups: Dict) -> List[float]:
497
- """Calculates equal Y positions for ML groups."""
498
- try:
499
- ml_count = len(normalized_ml_groups)
500
- if ml_count == 0:
501
- return []
502
- if ml_count == 1:
503
- return [0.0]
504
- total_spacing = (ml_count - 1) * 2.0
505
- start_y = -total_spacing / 2
506
- return [start_y + i * 2.0 for i in range(ml_count)]
507
- except Exception as e:
508
- print(f"Error calculating ML positions: {e}")
509
- return list(range(len(normalized_ml_groups)))
510
-
511
- def _calculate_full_ml_positions(self, full_ml_groups: Dict, base_y: float) -> List[float]:
512
- """Calculates equal positions for full ML groups."""
513
- try:
514
- ml_count = len(full_ml_groups)
515
- if ml_count <= 1:
516
- return [base_y]
517
- spacing = 1.5
518
- start_y = base_y - (spacing * (ml_count - 1)) / 2
519
- return [start_y + i * spacing for i in range(ml_count)]
520
- except Exception as e:
521
- print(f"Error calculating full ML positions: {e}")
522
- return [base_y] * len(full_ml_groups)
523
-
524
- def _calculate_genotype_positions(self, special_genotypes: List, base_y: float) -> List[float]:
525
- """Calculates equal positions for genotypes."""
526
- try:
527
- genotype_count = len(special_genotypes)
528
- if genotype_count <= 1:
529
- return [base_y]
530
- spacing = 1.0
531
- start_y = base_y - (spacing * (genotype_count - 1)) / 2
532
- return [start_y + i * spacing for i in range(genotype_count)]
533
- except Exception as e:
534
- print(f"Error calculating genotype positions: {e}")
535
- return [base_y] * len(special_genotypes)
536
-
537
- def _calculate_sequence_positions(self, sequences: List[Dict], base_y: float) -> List[float]:
538
- """Calculates equal positions for sequences."""
539
- try:
540
- seq_count = len(sequences)
541
- if seq_count <= 1:
542
- return [base_y]
543
- spacing = 0.8
544
- start_y = base_y - (spacing * (seq_count - 1)) / 2
545
- return [start_y + i * spacing for i in range(seq_count)]
546
- except Exception as e:
547
- print(f"Error calculating sequence positions: {e}")
548
- return [base_y] * len(sequences)
549
-
550
- def _calculate_similarity_based_line_length(self, sequence: Dict, query_line_length: float) -> float:
551
- """Calculates line length based on sequence similarity."""
552
- try:
553
- if sequence['is_query']:
554
- return query_line_length
555
- if sequence['is_matched']:
556
- similarity = sequence['similarity']
557
- proportional_length = (similarity / 100.0) * query_line_length
558
- return max(proportional_length, query_line_length * 0.2)
559
- return query_line_length * 0.5
560
- except Exception as e:
561
- print(f"Error calculating line length: {e}")
562
- return query_line_length * 0.5
563
-
564
- def _determine_horizontal_line_length(self, node_type: str, has_vertical: bool, contains_query: bool = False) -> float:
565
- """Determines horizontal line length based on node type."""
566
- try:
567
- base_length = self.base_horizontal_length
568
- if contains_query and node_type == 'normalized_ml_group':
569
- return base_length * 2.5
570
- if has_vertical:
571
- current_max = base_length
572
- for length in self.horizontal_line_tracker:
573
- if length > current_max:
574
- current_max = length
575
- new_length = current_max + 0.3
576
- self.horizontal_line_tracker.append(new_length)
577
- return new_length
578
- return base_length
579
- except Exception as e:
580
- print(f"Error determining line length: {e}")
581
- return self.base_horizontal_length
582
-
583
- def _determine_genotype_horizontal_line_length(self, sequence_count: int, has_vertical: bool, contains_query: bool = False) -> float:
584
- """Determines horizontal line length for genotype nodes."""
585
- try:
586
- base_length = self.base_horizontal_length
587
- query_bonus = 0.5 if contains_query else 0.0
588
- if sequence_count <= 1:
589
- length_multiplier = 1.0
590
- elif sequence_count <= 3:
591
- length_multiplier = 1.6
592
- elif sequence_count <= 5:
593
- length_multiplier = 2.3
594
- else:
595
- length_multiplier = 6.0
596
- return base_length * length_multiplier + query_bonus
597
- except Exception as e:
598
- print(f"Error determining genotype line length: {e}")
599
- return self.base_horizontal_length
600
-
601
- # --- Visualization ---
602
- def create_interactive_tree(self, matched_ids: List[str], actual_percentage: float) -> Optional[go.Figure]:
603
- """Creates an interactive horizontal phylogenetic tree visualization."""
604
- try:
605
- print("🎨 Creating interactive tree visualization...")
606
- edge_x, edge_y = [], []
607
- node_x, node_y = [], []
608
- node_colors, node_text, node_hover, node_sizes = [], [], [], []
609
- colors = {
610
- 'root': '#FF0000', 'normalized_ml_group': '#FFB6C1', 'full_ml_group': '#FF69B4',
611
- 'genotype': '#FFD700', 'representative_sequence': '#FFA500', 'query_sequence': '#4B0082',
612
- 'matched_sequence': '#6A5ACD', 'other_sequence': '#87CEEB'
613
- }
614
-
615
- def add_horizontal_edges(parent_x, parent_y, children_dict):
616
- if not children_dict:
617
- return
618
- children_list = list(children_dict.values())
619
- if len(children_list) == 1:
620
- child = children_list[0]
621
- edge_x.extend([parent_x, child['x'], None])
622
- edge_y.extend([parent_y, child['y'], None])
623
- else:
624
- child_x_positions = [child['x'] for child in children_list]
625
- min_child_x = min(child_x_positions)
626
- intermediate_x = parent_x + (min_child_x - parent_x) * 0.8
627
- edge_x.extend([parent_x, intermediate_x, None])
628
- edge_y.extend([parent_y, parent_y, None])
629
- child_y_positions = [child['y'] for child in children_list]
630
- min_y, max_y = min(child_y_positions), max(child_y_positions)
631
- edge_x.extend([intermediate_x, intermediate_x, None])
632
- edge_y.extend([min_y, max_y, None])
633
- for child in children_list:
634
- edge_x.extend([intermediate_x, child['x'], None])
635
- edge_y.extend([child['y'], child['y'], None])
636
-
637
- def get_node_color_and_size(node):
638
- if node['type'] == 'sequence':
639
- if node['data']['is_query']:
640
- return colors['query_sequence'], 10
641
- if node['data']['is_matched']:
642
- return colors['matched_sequence'], 8
643
- return colors['other_sequence'], 6
644
- if node['type'] == 'representative_sequence':
645
- return colors['representative_sequence'], 7
646
- if node['type'] == 'normalized_ml_group':
647
- return colors['normalized_ml_group'], 9 if node.get('has_special_sequences', False) else 7
648
- if node['type'] == 'full_ml_group':
649
- return colors['full_ml_group'], 8
650
- if node['type'] == 'genotype':
651
- return colors['genotype'], 7
652
- return colors.get(node['type'], '#000000'), 7
653
-
654
def create_node_text(node):
    """Return the label drawn next to ``node``.

    The label is the node's name. A normalized ML group whose
    ``has_special_sequences`` flag is set gets a trailing `` *``.
    """
    kind = node['type']

    if kind == 'sequence':
        flags = node['data']
        if flags['is_matched'] and not flags['is_query']:
            # Matched (non-query) names are rendered through an f-string,
            # so a non-string name is converted to text here.
            return f"{node['name']}"
        return node['name']

    if kind == 'normalized_ml_group' and node.get('has_special_sequences', False):
        return f"{node['name']} *"

    return node['name']
662
-
663
def create_hover_text(node):
    """Build the HTML hover tooltip for ``node``.

    The tooltip is a list of ``<br>``-separated lines: the bold node name, a
    ``Type:`` line, and type-specific details. Sequence and representative
    nodes read their metadata from ``node['data']['data']``; missing fields
    are shown as ``N/A``.
    """
    kind = node['type']

    if kind in ('sequence', 'representative_sequence'):
        record = node['data']['data']
        if kind == 'sequence':
            flags = node['data']
            if flags['is_query']:
                role = 'Query'
            elif flags['is_matched']:
                role = 'Matched'
            else:
                role = 'Other'
            columns = [
                ('ML Group', 'ML'), ('Genotype', 'Genotype'), ('Host', 'Host'),
                ('Country', 'Country'), ('Isolate', 'Isolate'), ('Year', 'Year'),
            ]
        else:
            role = 'Representative'
            columns = [
                ('ML Group', 'ML'), ('Genotype', 'Genotype'),
                ('Host', 'Host'), ('Country', 'Country'),
            ]

        lines = [f"<b>{node['name']}</b>", f"Type: {role} Sequence"]
        lines.extend(f"{label}: {record.get(key, 'N/A')}" for label, key in columns)
        # Matched sequences (including a matched query) also show their score.
        if kind == 'sequence' and node['data']['is_matched']:
            lines.append(f"<b>Similarity: {node['data']['similarity']}%</b>")
        return "<br>".join(lines)

    lines = [f"<b>{node['name']}</b>"]
    if kind == 'normalized_ml_group':
        lines.append("Type: Normalized ML Group")
        if node.get('has_special_sequences', False):
            lines.append("Contains query/matched sequences")
        else:
            lines.append("Representative sequences only")
    elif kind == 'full_ml_group':
        lines.append("Type: Full ML Group")
        if 'sequences_count' in node:
            lines.append(f"Total Sequences: {node['sequences_count']}")
    elif kind == 'genotype':
        lines.append("Type: Genotype")
        if 'sequences' in node:
            members = node['sequences']
            special = len([m for m in members if m['is_query'] or m['is_matched']])
            lines.append(f"Special Sequences: {special}/{len(members)}")
    else:
        lines.append(f"Type: {kind.replace('_', ' ').title()}")
    return "<br>".join(lines)
699
-
700
def add_node_and_edges(node, parent_x=None, parent_y=None):
    """Record ``node`` for plotting, then recurse into its children.

    Appends the node's position, color, size, label and hover text to the
    enclosing plot lists, draws the connectors to its children via
    ``add_horizontal_edges``, and visits each child in turn. ``parent_x`` and
    ``parent_y`` are accepted but not used by this function.
    """
    pos_x = node['x']
    pos_y = node['y']
    node_x.append(pos_x)
    node_y.append(pos_y)

    marker_color, marker_size = get_node_color_and_size(node)
    node_colors.append(marker_color)
    node_sizes.append(marker_size)
    node_text.append(create_node_text(node))
    node_hover.append(create_hover_text(node))

    if 'children' not in node or not node['children']:
        return

    offspring = node['children']
    add_horizontal_edges(pos_x, pos_y, offspring)
    for descendant in offspring.values():
        add_node_and_edges(descendant, pos_x, pos_y)
713
-
714
- root_node = self.tree_structure['root']
715
- add_node_and_edges(root_node)
716
- if root_node['children']:
717
- add_horizontal_edges(root_node['x'], root_node['y'], root_node['children'])
718
-
719
- fig = go.Figure()
720
- fig.add_trace(go.Scatter(
721
- x=edge_x, y=edge_y, mode='lines', line=dict(width=1, color='gray'),
722
- hoverinfo='none', showlegend=False
723
- ))
724
- fig.add_trace(go.Scatter(
725
- x=node_x, y=node_y, mode='markers+text',
726
- marker=dict(size=node_sizes, color=node_colors, line=dict(width=1, color='black'), opacity=0.85),
727
- text=node_text, textposition="middle right", textfont=dict(size=9, color="black"),
728
- hoverinfo='text', hovertext=node_hover, showlegend=False
729
- ))
730
-
731
- min_x, max_x = min(node_x), max(node_x) if node_x else (0, 1)
732
- min_y, max_y = min(node_y), max(node_y) if node_y else (0, 1)
733
- x_range = max_x - min_x
734
- y_range = max_y - min_y
735
- x_padding = x_range * 0.2 if x_range > 0 else 1
736
- y_padding = y_range * 0.2 if y_range > 0 else 1
737
- width = min(1400, max(800, int(x_range * 80 + 400)))
738
- height = min(900, max(500, int(y_range * 40 + 300)))
739
-
740
- fig.update_layout(
741
- title=dict(
742
- text=f"Horizontal Phylogenetic Tree<br>Query: {self.query_id} | Similarity: {actual_percentage}% | Matched: {len(matched_ids)}",
743
- x=0.5, font=dict(size=12)
744
- ),
745
- xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[min_x - x_padding, max_x + x_padding], automargin=True),
746
- yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[min_y - y_padding, max_y + y_padding], automargin=True),
747
- plot_bgcolor="white", paper_bgcolor="white", hovermode="closest",
748
- width=width, height=height, margin=dict(l=20, r=100, t=40, b=10),
749
- showlegend=True, legend=dict(x=1.02, y=1, xanchor='left', yanchor='top',
750
- bgcolor='rgba(255,255,255,0.8)', bordercolor='gray', borderwidth=1, font=dict(size=10))
751
- )
752
-
753
- legend_elements = [
754
- dict(name="Root", marker=dict(color=colors['root'], size=8)),
755
- dict(name="Normalized ML Groups", marker=dict(color=colors['normalized_ml_group'], size=8)),
756
- dict(name="Full ML Groups", marker=dict(color=colors['full_ml_group'], size=8)),
757
- dict(name="Genotypes", marker=dict(color=colors['genotype'], size=8)),
758
- dict(name="Query Sequence", marker=dict(color=colors['query_sequence'], size=10)),
759
- dict(name="Similar Sequences", marker=dict(color=colors['matched_sequence'], size=9)),
760
- dict(name="Representative Sequences", marker=dict(color=colors['representative_sequence'], size=8)),
761
- ]
762
- for element in legend_elements:
763
- fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers', marker=element['marker'], name=element['name'], showlegend=True))
764
-
765
- config = {
766
- 'displayModeBar': True, 'displaylogo': False, 'modeBarButtonsToRemove': ['select2d', 'lasso2d'],
767
- 'toImageButtonOptions': {'format': 'png', 'filename': 'phylogenetic_tree', 'height': height, 'width': width, 'scale': 2}
768
- }
769
- try:
770
- fig.show(config)
771
- except Exception as e:
772
- print(f"Warning: Could not display figure: {e}")
773
- return fig
774
- except Exception as e:
775
- print(f"Error creating tree visualization: {e}")
776
- return None
777
-
778
- # --- ML Analysis ---
779
def perform_ml_analysis_safe(self, matched_ids: List[str]) -> Dict:
    """Run the alignment / distance / tree pipeline for the query and matches.

    The query ID is placed first, followed by the matched IDs (without
    duplicating the query); at most 20 IDs are analysed and at least 3 are
    required.

    Args:
        matched_ids: IDs of the sequences matched against the query.

    Returns:
        A dict with the keys ``tree``, ``alignment``, ``distance_matrix``,
        ``log_likelihood``, ``sequence_count`` and ``alignment_length``, or
        an empty dict when any step fails or produces nothing.
        ``sequence_count`` is the number of sequences actually present in
        the alignment, which can be lower than the number of IDs requested
        when some sequences are skipped during alignment.
    """
    max_sequences = 20
    min_sequences = 3
    try:
        print("\n🧬 PERFORMING MAXIMUM LIKELIHOOD ANALYSIS")
        print("=" * 50)

        # Include query sequence in analysis
        all_sequences = [self.query_id] + [
            seq_id for seq_id in matched_ids if seq_id != self.query_id
        ]

        # Limit number of sequences to prevent memory issues
        if len(all_sequences) > max_sequences:
            print(f"Warning: Limiting analysis to 20 sequences (had {len(all_sequences)})")
            all_sequences = all_sequences[:max_sequences]

        if len(all_sequences) < min_sequences:
            print("❌ Need at least 3 sequences for ML analysis")
            return {}

        # Step 1: Create multiple sequence alignment
        alignment = self.create_sequence_alignment(all_sequences)
        if not alignment:
            return {}

        # The alignment step may skip sequences, so count what is really there
        # instead of what was requested.
        aligned_count = len(alignment)
        if aligned_count < len(all_sequences):
            print(
                f"Warning: {len(all_sequences) - aligned_count} sequence(s) "
                "were skipped during alignment"
            )

        # Step 2: Calculate ML distances
        distance_matrix = self.calculate_ml_distances(alignment)
        if distance_matrix.size == 0:
            return {}

        # Step 3: Construct ML tree
        ml_tree = self.construct_ml_tree(alignment)
        if not ml_tree:
            return {}

        # Step 4: Calculate tree likelihood (safely)
        log_likelihood = self.calculate_ml_likelihood_safe(ml_tree, alignment)

        # Step 5: Prepare results
        ml_results = {
            'tree': ml_tree,
            'alignment': alignment,
            'distance_matrix': distance_matrix,
            'log_likelihood': log_likelihood,
            'sequence_count': aligned_count,
            'alignment_length': len(alignment[0]),
        }

        print("✅ ML analysis completed successfully")
        print(f"   Sequences analyzed: {aligned_count}")
        print(f"   Alignment length: {ml_results['alignment_length']}")
        print(f"   Log-likelihood: {log_likelihood:.2f}")

        return ml_results

    except Exception as e:
        print(f"❌ ML analysis failed: {e}")
        import traceback
        traceback.print_exc()
        return {}
837
-
838
-
839
def create_sequence_alignment(self, sequence_ids: List[str]) -> Optional[MultipleSeqAlignment]:
    """Build a padded multiple sequence alignment for ``sequence_ids``.

    Each ID is looked up in ``self.data`` by ``'Accession Number'`` and its
    ``'F-gene'`` value is upper-cased and stripped of every character other
    than ``A``, ``T``, ``G``, ``C``, ``N`` and ``-``. When the ID is the
    current query and has no accession row (for example a pasted sequence),
    ``self.query_sequence`` is used instead so the query is not silently
    left out. Sequences of 10 characters or fewer after cleaning are skipped.

    Args:
        sequence_ids: IDs of the sequences to align.

    Returns:
        The alignment produced by ``self._simple_alignment`` wrapped in a
        ``MultipleSeqAlignment``, or ``None`` when fewer than two usable
        sequences remain or an error occurs.
    """
    min_length = 10
    try:
        print("🧬 Creating multiple sequence alignment...")

        sequences = []
        for seq_id in sequence_ids:
            try:
                row = self.data[self.data['Accession Number'] == seq_id]
                if not row.empty:
                    raw_sequence = str(row.iloc[0]['F-gene'])
                elif seq_id == self.query_id and self.query_sequence:
                    # The query is not stored under an accession number;
                    # fall back to the sequence held on the analyzer.
                    raw_sequence = str(self.query_sequence)
                else:
                    print(f"Warning: Skipping sequence {seq_id}: not found in dataset")
                    continue

                # Clean sequence (remove non-nucleotide characters)
                clean_seq = re.sub(r'[^ATGCN-]', '', raw_sequence.upper())
                if len(clean_seq) <= min_length:
                    print(f"Warning: Skipping sequence {seq_id}: too short after cleaning")
                    continue

                sequences.append(SeqRecord(Seq(clean_seq), id=seq_id, description=""))
            except Exception as e:
                print(f"Warning: Skipping sequence {seq_id}: {e}")
                continue

        if len(sequences) < 2:
            print("❌ Need at least 2 valid sequences for alignment")
            return None

        # Simple alignment (you might want to use MUSCLE or CLUSTAL for better results)
        aligned_sequences = self._simple_alignment(sequences)

        print(f"✓ Alignment created with {len(aligned_sequences)} sequences")
        return MultipleSeqAlignment(aligned_sequences)

    except Exception as e:
        print(f"Error creating alignment: {e}")
        return None
873
-
874
def _simple_alignment(self, sequences: List[SeqRecord]) -> List[SeqRecord]:
    """Pad (or truncate) every record to a common length.

    The common length is the longest input sequence, capped at 10000.
    Shorter sequences are right-padded with ``-``; no residues are moved,
    so this is length equalisation rather than a true alignment.

    Returns:
        New ``SeqRecord`` objects carrying the original ``id`` and
        ``description``. If anything fails, the input list is returned
        unchanged.
    """
    length_cap = 10000
    try:
        target_length = max(len(record.seq) for record in sequences)

        # Cap maximum length to prevent memory issues
        if target_length > length_cap:
            target_length = length_cap
            print(f"Warning: Sequences truncated to {target_length} bp")

        return [
            SeqRecord(
                Seq(str(record.seq)[:target_length].ljust(target_length, '-')),
                id=record.id,
                description=record.description,
            )
            for record in sequences
        ]
    except Exception as e:
        print(f"Error in simple alignment: {e}")
        return sequences
902
-
903
def calculate_ml_distances(self, alignment: MultipleSeqAlignment) -> np.ndarray:
    """Compute the symmetric pairwise distance matrix for ``alignment``.

    The alignment is encoded with ``self._alignment_to_matrix`` and every
    pair of rows is scored with ``self._calculate_ml_distance_pair``. A pair
    whose computation raises is assigned the fallback distance 1.0.

    Returns:
        An ``n x n`` array with zeros on the diagonal, or an empty array
        when the alignment has no sequences or an error occurs.
    """
    try:
        print("📊 Calculating ML distances...")

        encoded = self._alignment_to_matrix(alignment)
        count = len(alignment)
        if count == 0:
            return np.array([])

        distances = np.zeros((count, count))
        for first, second in itertools.combinations(range(count), 2):
            try:
                pair_distance = self._calculate_ml_distance_pair(encoded[first], encoded[second])
                distances[first][second] = pair_distance
                distances[second][first] = pair_distance
            except Exception as e:
                print(f"Warning: Error calculating distance between sequences {first} and {second}: {e}")
                # Use maximum distance as fallback
                distances[first][second] = 1.0
                distances[second][first] = 1.0

        print("✓ ML distances calculated")
        return distances

    except Exception as e:
        print(f"Error calculating ML distances: {e}")
        return np.array([])
937
-
938
def _alignment_to_matrix(self, alignment: MultipleSeqAlignment) -> np.ndarray:
    """Encode each record's sequence as a row of integer codes.

    Codes are ``A=0, T=1, G=2, C=3, N=4, -=5``; sequences are upper-cased
    first and any other character is encoded as 4 (same as ``N``).

    Returns:
        An array with one row per record, or an empty array on error.
    """
    try:
        codes = {symbol: index for index, symbol in enumerate('ATGCN-')}
        unknown = codes['N']
        return np.array([
            [codes.get(symbol, unknown) for symbol in str(record.seq).upper()]
            for record in alignment
        ])
    except Exception as e:
        print(f"Error converting alignment to matrix: {e}")
        return np.array([])
953
-
954
-
955
def _calculate_ml_distance_pair(self, seq1: np.ndarray, seq2: np.ndarray) -> float:
    """Return the Jukes-Cantor corrected distance between two encoded rows.

    Only positions where both rows hold a code below 4 (i.e. neither ``N``
    nor a gap) are compared. With ``p`` the fraction of differing positions,
    the distance is ``-3/4 * ln(1 - 4p/3)`` clamped to ``[0, 1]``.

    Returns:
        1.0 when either row is empty, when no position is comparable, when
        ``p >= 0.75``, or when any error occurs; otherwise the clamped
        distance.
    """
    max_distance = 1.0
    try:
        if len(seq1) == 0 or len(seq2) == 0:
            return max_distance

        # Exclude N's and gaps from the comparison.
        comparable = (seq1 < 4) & (seq2 < 4)
        comparable_count = np.sum(comparable)
        if comparable_count == 0:
            return max_distance

        mismatch_fraction = np.sum(seq1[comparable] != seq2[comparable]) / comparable_count

        # The JC correction is undefined at p >= 3/4 (log of a non-positive value).
        if mismatch_fraction >= 0.75:
            return max_distance

        corrected = -0.75 * np.log(1 - (4 * mismatch_fraction / 3))
        return min(max(corrected, 0.0), 1.0)

    except Exception:
        return max_distance
989
-
990
def construct_ml_tree(self, alignment: MultipleSeqAlignment) -> Optional[Tree]:
    """Build a tree for ``alignment`` from pairwise distances.

    Distances come from ``self.calculate_ml_distances``; the tree is built by
    ``self._build_nj_tree_from_distances`` and its branch lengths are then
    adjusted by ``self._optimize_branch_lengths_ml_safe``.

    Returns:
        The resulting tree, or ``None`` when the distance matrix is empty,
        no tree could be built, or an error occurs. The success message is
        printed only when a tree was actually produced.
    """
    try:
        print("🌳 Constructing ML tree...")
        distance_matrix = self.calculate_ml_distances(alignment)
        if distance_matrix.size == 0:
            return None

        sequence_names = [record.id for record in alignment]
        tree = self._build_nj_tree_from_distances(distance_matrix, sequence_names)
        if tree is None:
            # Do not report success when tree construction failed.
            print("❌ ML tree construction failed: no valid tree could be built")
            return None

        tree = self._optimize_branch_lengths_ml_safe(tree, alignment)
        print("✓ ML tree constructed")
        return tree
    except Exception as e:
        print(f"Error constructing ML tree: {e}")
        return None
1006
-
1007
def _build_nj_tree_from_distances(self, distance_matrix: np.ndarray, sequence_names: List[str]) -> Optional[Tree]:
    """Build a neighbor-joining tree from a square distance matrix.

    The matrix is converted to the lower-triangular list form (diagonal
    included, negative values clipped to 0.0) before being passed to
    ``DistanceMatrix`` and ``DistanceTreeConstructor.nj``.

    Returns:
        The tree when ``self._validate_tree_structure`` accepts it, otherwise
        ``None`` (also on a size mismatch or any error).
    """
    try:
        taxa_count = len(sequence_names)
        if distance_matrix.shape[0] != taxa_count:
            print("Error: Distance matrix size mismatch")
            return None

        lower_triangle = []
        for row in range(taxa_count):
            entries = [max(0.0, float(distance_matrix[row][col])) for col in range(row)]
            entries.append(0.0)  # diagonal entry
            lower_triangle.append(entries)

        labelled_matrix = DistanceMatrix(names=sequence_names, matrix=lower_triangle)
        nj_tree = DistanceTreeConstructor().nj(labelled_matrix)
        if self._validate_tree_structure(nj_tree):
            return nj_tree
        return None
    except Exception as e:
        print(f"Error building NJ tree: {e}")
        return None
1021
-
1022
def _validate_tree_structure(self, tree: Tree, max_depth: int = 100) -> bool:
    """Check that the tree is no deeper than ``max_depth`` and has no repeats.

    Traversal starts at ``tree.root`` when that attribute exists, otherwise
    at ``tree`` itself, and follows each node's ``clades`` attribute.

    Returns:
        ``False`` when a node lies deeper than ``max_depth``, when the same
        node object is reached twice, or when traversal raises; ``True``
        otherwise.
    """
    try:
        seen_ids = set()

        def is_sound(node, depth):
            if depth > max_depth:
                return False
            marker = id(node)
            if marker in seen_ids:
                return False
            seen_ids.add(marker)
            for child in getattr(node, 'clades', []):
                if not is_sound(child, depth + 1):
                    return False
            return True

        start = tree.root if hasattr(tree, 'root') else tree
        return is_sound(start, 0)
    except Exception:
        return False
1037
-
1038
def _optimize_branch_lengths_ml_safe(self, tree: Tree, alignment: MultipleSeqAlignment) -> Tree:
    """Adjust the branch lengths of ``tree`` in place and return it.

    Every clade returned by ``self._get_clades_safe`` that has a non-``None``
    ``branch_length`` is given the value computed by
    ``self._calculate_optimal_branch_length``, with a floor of 0.001.

    The recursion limit is raised to 1000 for the duration of the work only
    if it is currently lower; an already higher limit is left untouched
    (lowering it could break deeper callers). The previous limit is always
    restored.

    Returns:
        The same ``tree`` object, also when the alignment cannot be encoded
        or an error occurs.
    """
    min_recursion_limit = 1000
    try:
        print("🔧 Optimizing branch lengths...")
        old_limit = sys.getrecursionlimit()
        if old_limit < min_recursion_limit:
            sys.setrecursionlimit(min_recursion_limit)
        try:
            seq_matrix = self._alignment_to_matrix(alignment)
            if seq_matrix.size == 0:
                return tree
            for clade in self._get_clades_safe(tree):
                if getattr(clade, 'branch_length', None) is not None:
                    optimal_length = self._calculate_optimal_branch_length(clade, seq_matrix)
                    clade.branch_length = max(optimal_length, 0.001)
        finally:
            sys.setrecursionlimit(old_limit)
        print("✓ Branch lengths optimized")
        return tree
    except Exception as e:
        print(f"Warning: Branch optimization failed: {e}")
        return tree
1060
-
1061
def _get_clades_safe(self, tree: Tree, max_depth: int = 50) -> List:
    """Collect the nodes of ``tree`` in depth-first, parent-before-child order.

    Traversal starts at ``tree.root`` when present, otherwise at ``tree``,
    and follows each node's ``clades`` attribute. Nodes deeper than
    ``max_depth`` and nodes already collected are skipped.

    Returns:
        The collected nodes; if traversal raises, a warning is printed and
        the nodes gathered so far are returned.
    """
    collected = []
    seen_ids = set()

    def walk(node, level=0):
        if level > max_depth:
            return
        key = id(node)
        if key in seen_ids:
            return
        seen_ids.add(key)
        collected.append(node)
        for sub in getattr(node, 'clades', []):
            walk(sub, level + 1)

    try:
        start = tree.root if hasattr(tree, 'root') else tree
        walk(start)
    except Exception as e:
        print(f"Warning: Tree traversal error: {e}")
    return collected
1077
-
1078
def _calculate_optimal_branch_length(self, clade: object, seq_matrix: np.ndarray) -> float:
    """Return an adjusted branch length for ``clade``.

    This is a fixed heuristic, not a likelihood optimisation: the current
    length is multiplied by 0.9 when the clade has a truthy ``name`` and by
    1.1 otherwise, then clamped to ``[0.001, 1.0]``.

    Args:
        clade: Object exposing ``branch_length`` and optionally ``name``
            (previously mis-annotated as ``float``).
        seq_matrix: Encoded alignment; currently not used by the calculation.

    Returns:
        The adjusted length, or 0.1 when the current length is missing,
        non-numeric, NaN, infinite or not positive.
    """
    default_length = 0.1
    try:
        raw_length = getattr(clade, 'branch_length', None)
        if raw_length is None:
            return default_length
        current_length = float(raw_length)
        if np.isnan(current_length) or np.isinf(current_length) or current_length <= 0:
            return default_length
        scale = 0.9 if getattr(clade, 'name', None) else 1.1
        return min(max(current_length * scale, 0.001), 1.0)
    except Exception:
        return default_length
1089
-
1090
def calculate_ml_likelihood_safe(self, tree: Tree, alignment: MultipleSeqAlignment) -> float:
    """Return a summed log-score for ``tree`` over sampled alignment columns.

    Only the first 1000 columns are considered, and they are sampled with a
    stride of ``max(1, n_sites // 100)``. Columns with fewer than two codes
    below 4 (i.e. fewer than two unambiguous bases) are skipped. For the
    remaining columns the log of the positive value returned by
    ``self._calculate_site_likelihood_safe`` is accumulated.

    Returns:
        The accumulated log-score, or ``-inf`` when the alignment cannot be
        encoded or an error occurs.
    """
    try:
        print("Trying to calculate tree likelihood...")
        encoded = self._alignment_to_matrix(alignment)
        if encoded.size == 0:
            return -np.inf

        site_limit = min(encoded.shape[1], 1000)
        stride = max(1, site_limit // 100)

        log_score = 0.0
        for column in range(0, site_limit, stride):
            pattern = encoded[:, column]
            if np.sum(pattern < 4) >= 2:
                site_score = self._calculate_site_likelihood_safe(tree, pattern)
                if site_score > 0:
                    log_score += np.log(site_score)

        print(f"Likelihood: {log_score:.2f}")
        return log_score
    except Exception as e:
        print(f"Error calculating likelihood: {e}")
        return -np.inf
1112
-
1113
def _calculate_site_likelihood_safe(self, tree: Tree, site_pattern: np.ndarray) -> float:
    """Return a heuristic score for one alignment column.

    The score depends only on the column, not on ``tree``: with ``u`` the
    number of distinct codes below 4 and ``n`` the number of such codes, it
    is ``exp(-(u / 4) * n * 0.1)`` with a floor of ``1e-10``.

    Args:
        tree: Accepted for interface symmetry; not used by the calculation
            (previously mis-annotated as ``np.ndarray``).
        site_pattern: Encoded column; any array-like of integer codes.

    Returns:
        1.0 when the column holds no code below 4, ``1e-10`` on error,
        otherwise the score described above.
    """
    floor = 1e-10
    try:
        # Accept plain sequences as well as arrays.
        pattern = np.asarray(site_pattern)
        observed = pattern[pattern < 4]
        if observed.size == 0:
            return 1.0
        diversity_factor = np.unique(observed).size / 4.0
        likelihood = np.exp(-diversity_factor * observed.size * 0.1)
        return max(likelihood, floor)
    except Exception:
        return floor
1126
-
1127
- # --- Reporting ---
1128
def generate_detailed_report(self, matched_ids: List[str], actual_percentage: float) -> bool:
    """Write a styled HTML report describing the current analysis.

    The report contains the query information and metadata, a table of the
    matched sequences, the stored model accuracies, figures derived from
    ``self.tree_structure`` (node count, number of top-level groups, path to
    the query node) and the values stored under its ``'ml_analysis'`` key.
    It is saved in the current working directory as
    ``detailed_report_<query id with '/' replaced by '_'>.html``, encoded as
    UTF-8.

    All values inserted into the page are HTML-escaped, because they
    originate from the dataset and from user input.

    Args:
        matched_ids: Accession numbers of the matched sequences. IDs missing
            from ``self.data`` are listed with ``N/A`` metadata.
        actual_percentage: Similarity percentage actually achieved.

    Returns:
        ``True`` when the report was written, ``False`` on any error (the
        traceback is printed).
    """
    import html as html_lib
    import traceback

    def esc(value) -> str:
        """Return ``value`` as text that is safe to embed in HTML."""
        return html_lib.escape(str(value))

    def shorten(sequence, limit: int = 50) -> str:
        """Return at most ``limit`` characters of ``sequence`` plus an ellipsis."""
        text = str(sequence)
        return text[:limit] + "..." if len(text) > limit else text

    def render_table(headers, rows) -> str:
        """Render an HTML table with one header row and the given data rows."""
        lines = [
            "    <table>",
            "        <tr>" + "".join(f"<th>{esc(h)}</th>" for h in headers) + "</tr>",
        ]
        lines.extend(
            "        <tr>" + "".join(f"<td>{esc(cell)}</td>" for cell in row) + "</tr>"
            for row in rows
        )
        lines.append("    </table>")
        return "\n".join(lines) + "\n"

    def children_of(node):
        """Return the child nodes of ``node`` as a list (empty when absent)."""
        return list((node.get('children') or {}).values())

    def count_nodes(root) -> int:
        """Count ``root`` and all of its descendants."""
        total = 0
        pending = [root]
        while pending:
            node = pending.pop()
            total += 1
            pending.extend(children_of(node))
        return total

    def find_query_path(root):
        """Return ``'a -> b -> c'`` for the first query node found depth-first."""
        pending = [(root, [])]
        while pending:
            node, ancestors = pending.pop()
            trail = ancestors + [str(node.get('name', '?'))]
            if node.get('data', {}).get('is_query', False):
                return " -> ".join(trail)
            # Reverse so that children are visited in their stored order.
            pending.extend((child, trail) for child in reversed(children_of(node)))
        return None

    def as_percent(accuracy) -> str:
        """Format an accuracy, treating only ``None`` as 'not trained'."""
        return f"{accuracy:.2%}" if accuracy is not None else "Not trained"

    page_header = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Phylogenetic Analysis Report - {query_id}</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f9f9f9;
            color: #333;
        }}
        h1 {{
            text-align: center;
            color: #2c3e50;
        }}
        h2 {{
            color: #34495e;
            margin-top: 20px;
        }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin-bottom: 20px;
            background-color: #fff;
            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
        }}
        th, td {{
            padding: 10px;
            text-align: left;
            border: 1px solid #ddd;
        }}
        th {{
            background-color: #3498db;
            color: #fff;
        }}
        tr:nth-child(even) {{
            background-color: #f2f2f2;
        }}
        tr:hover {{
            background-color: #e0f7fa;
        }}
        .metadata {{
            margin-left: 20px;
            font-size: 0.9em;
        }}
        .metadata p {{
            margin: 5px 0;
        }}
        @media (max-width: 600px) {{
            table {{
                font-size: 0.85em;
            }}
            th, td {{
                padding: 8px;
            }}
        }}
    </style>
</head>
<body>
    <h1>Phylogenetic Analysis Report</h1>
    <p style="text-align: center;">Generated on: {timestamp}</p>
    <p style="text-align: center;">Query ID: {query_id}</p>
"""

    try:
        print("📝 Generating detailed HTML analysis report...")

        query_id = str(self.query_id)
        query_sequence = str(self.query_sequence or "")
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S %Z")
        parts = [page_header.format(query_id=esc(query_id), timestamp=esc(timestamp))]

        # --- Query Information ---
        # Resolve the query row with the same rule that decides the query
        # type, so a "found" query can never yield an empty lookup.
        accession_rows = self.data[self.data['Accession Number'] == self.query_id]
        sequence_rows = self.data[self.data['F-gene'] == query_sequence]
        if not accession_rows.empty:
            query_type, query_rows = "Accession Number", accession_rows
        elif not sequence_rows.empty:
            query_type, query_rows = "Dataset Sequence", sequence_rows
        else:
            query_type, query_rows = "Novel Sequence", None

        if query_rows is None:
            query_ml = self.predict_ml_group(self.query_sequence)
            query_genotype = self.predict_genotype(self.query_sequence)
            query_metadata = {"F-gene": shorten(query_sequence)}
        else:
            query_row = query_rows.iloc[0]
            query_ml = query_row['ML']
            query_genotype = query_row['Genotype']
            query_metadata = query_row.to_dict()
            query_metadata['F-gene'] = shorten(query_metadata['F-gene'])

        query_info_rows = [
            ["Query ID", query_id],
            ["Query Type", query_type],
            ["Sequence Length", f"{len(query_sequence)} nucleotides"],
            ["ML Group", query_ml],
            ["Genotype", query_genotype],
            ["Target Similarity", f"{self.matching_percentage}%"],
            ["Actual Similarity", f"{actual_percentage:.1f}%"],
        ]
        parts.append("    <h2>Query Information</h2>\n")
        parts.append(render_table(["Field", "Value"], query_info_rows))
        parts.append('    <div class="metadata">\n        <h3>Metadata</h3>\n')
        parts.extend(
            f"        <p><strong>{esc(key)}:</strong> {esc(value)}</p>\n"
            for key, value in query_metadata.items()
        )
        parts.append("    </div>\n")

        # --- Matched Sequences ---
        metadata_columns = ["ML", "Genotype", "Host", "Country", "Isolate", "Year"]
        headers = ["Accession Number", "Similarity (%)", "ML Group", "Genotype",
                   "Host", "Country", "Isolate", "Year"]
        matched_rows = []
        for seq_id in matched_ids:
            hits = self.data[self.data['Accession Number'] == seq_id]
            # An ID without a dataset row is still listed, with N/A metadata.
            record = hits.iloc[0] if not hits.empty else {}
            matched_rows.append(
                [seq_id, f"{self.similarity_scores.get(seq_id, 0.0):.1f}"]
                + [record.get(column, 'N/A') for column in metadata_columns]
            )

        parts.append("    <h2>Matched Sequences</h2>\n")
        parts.append(f"    <p>Total Matched Sequences: {len(matched_ids)}</p>\n")
        if matched_rows:
            parts.append(render_table(headers, matched_rows))
        else:
            parts.append("    <p>No matched sequences found.</p>\n")

        # --- Model Performance ---
        parts.append("    <h2>Model Performance</h2>\n")
        parts.append(render_table(["Metric", "Value"], [
            ["ML Model Accuracy", as_percent(self.ml_model_accuracy)],
            ["Genotype Model Accuracy", as_percent(self.genotype_model_accuracy)],
        ]))

        # --- Phylogenetic Tree Insights ---
        root = self.tree_structure.get('root')
        if root:
            # Count from the root node; the wrapper dict has no 'children'.
            total_nodes = count_nodes(root)
            group_count = len(children_of(root))
            query_path = find_query_path(root)
        else:
            total_nodes, group_count, query_path = 0, 0, None

        parts.append("    <h2>Phylogenetic Tree Insights</h2>\n")
        parts.append(render_table(["Field", "Value"], [
            ["Total Nodes", total_nodes],
            ["ML Groups Represented", group_count],
            ["Query Node Path", query_path if query_path else "Not found"],
        ]))

        # --- ML Analysis Results ---
        ml_analysis = self.tree_structure.get('ml_analysis') or {}
        log_likelihood = ml_analysis.get('log_likelihood')
        try:
            log_likelihood_text = (
                f"{float(log_likelihood):.2f}" if log_likelihood is not None else "N/A"
            )
        except (TypeError, ValueError):
            log_likelihood_text = "N/A"

        parts.append("    <h2>Maximum Likelihood Analysis Results</h2>\n")
        parts.append(render_table(["Field", "Value"], [
            ["ML Tree Available", ml_analysis.get('ml_tree_available', False)],
            ["Log-Likelihood", log_likelihood_text],
            ["Sequence Count", ml_analysis.get('sequence_count', 'N/A')],
            ["Alignment Length", ml_analysis.get('alignment_length', 'N/A')],
        ]))

        # --- Close HTML ---
        parts.append("</body>\n</html>\n")

        # --- Save HTML Report ---
        report_filename = f"detailed_report_{query_id.replace('/', '_')}.html"
        print(f"Attempting to save report to: {os.path.abspath(report_filename)}")
        # The page declares UTF-8, so write it as UTF-8 on every platform.
        with open(report_filename, 'w', encoding='utf-8') as f:
            f.write("".join(parts))
        print(f"✓ Detailed HTML report saved as '{report_filename}'")
        return True
    except Exception as e:
        print(f"Error generating detailed report: {str(e)}")
        traceback.print_exc()
        return False
1410
-
1411
def command_line_interface():
    """Parse command-line arguments and run the analysis for each query.

    For every query (``--batch`` list, or the single ``--query``) the
    function finds similar sequences, builds the tree structure, writes the
    interactive tree HTML and generates the detailed report. Output files
    are placed in ``--output-dir``; with ``--save-json`` a small JSON summary
    (query, similarity figures, matched IDs and their scores) is written
    there as well.

    Exits with status 1 when the similarity is outside 70-99, the data file
    is missing or cannot be loaded, or the output directory cannot be
    created.
    """
    import json
    import shutil

    parser = argparse.ArgumentParser(
        description="Advanced Phylogenetic Tree Analyzer with AI-enhanced similarity matching",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="Examples:\n %(prog)s -d data.csv -q MH087032 -s 95\n %(prog)s -d data.csv -q MH087032 -s 90 --no-ai --batch query1,query2,query3"
    )
    parser.add_argument('-d', '--data', required=True, help='Path to CSV data file')
    parser.add_argument('-q', '--query', required=True, help='Query sequence ID or nucleotide sequence')
    parser.add_argument('-s', '--similarity', type=float, default=95.0, help='Target similarity percentage (70-99, default: 95)')
    parser.add_argument('--no-ai', action='store_true', help='Skip AI model training')
    parser.add_argument('--batch', help='Comma-separated list of query IDs for batch processing')
    parser.add_argument('--output-dir', default='.', help='Output directory for results')
    parser.add_argument('--save-json', action='store_true', help='Save detailed results to JSON')

    args = parser.parse_args()

    # Validate arguments
    if not 70 <= args.similarity <= 99:
        print("❌ Similarity percentage must be between 70 and 99.")
        sys.exit(1)
    if not Path(args.data).exists():
        print(f"❌ Data file not found: {args.data}")
        sys.exit(1)

    output_dir = Path(args.output_dir)
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        print(f"❌ Cannot use output directory {output_dir}: {e}")
        sys.exit(1)

    # Initialize analyzer
    analyzer = PhylogeneticTreeAnalyzer()
    if not analyzer.load_data(args.data):
        print("❌ Failed to load data.")
        sys.exit(1)
    # The detailed report prints the target similarity from this attribute.
    analyzer.matching_percentage = args.similarity

    # Train AI model unless disabled
    if not args.no_ai:
        print("⏳ Training AI model...")
        start_time = time.time()
        if analyzer.train_ai_model():
            print(f"✅ AI model training completed in {time.time() - start_time:.1f} seconds")
        else:
            print("⚠️ AI model training failed, continuing with basic analysis")

    # Process queries
    raw_queries = args.batch.split(',') if args.batch else [args.query]
    for query in raw_queries:
        query = query.strip()
        if not query:
            continue  # tolerate stray commas in --batch
        print(f"🔍 Processing: {query}")
        if not analyzer.find_query_sequence(query):
            print(f"❌ Query not found: {query}")
            continue

        matched_ids, actual_percentage = analyzer.find_similar_sequences(args.similarity)
        if not matched_ids:
            print(f"❌ No similar sequences found for {query}")
            continue
        analyzer.matched_sequences = matched_ids
        analyzer.actual_percentage = actual_percentage

        if not analyzer.build_tree_structure_with_ml_safe(matched_ids):
            print(f"❌ Failed to build tree structure for {query}")
            continue

        safe_name = query.replace('/', '_')
        fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
        if fig:
            html_path = output_dir / f"phylogenetic_tree_{safe_name}_interactive.html"
            fig.write_html(str(html_path))
            print(f"📄 Interactive HTML saved: {html_path}")

        if analyzer.generate_detailed_report(matched_ids, actual_percentage):
            # The report is written to the working directory under a name
            # derived from the analyzer's query ID; relocate it if needed.
            report_name = f"detailed_report_{str(analyzer.query_id).replace('/', '_')}.html"
            written = Path(report_name)
            target = output_dir / report_name
            if written.exists() and written.resolve() != target.resolve():
                shutil.move(str(written), str(target))
            print(f"📄 Detailed HTML report saved: {target}")
        else:
            print(f"⚠️ Detailed report could not be generated for {query}")

        if args.save_json:
            json_path = output_dir / f"analysis_{safe_name}.json"
            try:
                summary = {
                    "query": query,
                    "query_id": str(analyzer.query_id),
                    "target_similarity": args.similarity,
                    "actual_similarity": float(actual_percentage),
                    "matched_ids": [str(seq_id) for seq_id in matched_ids],
                    "similarity_scores": {
                        str(seq_id): float(analyzer.similarity_scores[seq_id])
                        for seq_id in matched_ids
                        if seq_id in analyzer.similarity_scores
                    },
                }
                with open(json_path, 'w', encoding='utf-8') as fh:
                    json.dump(summary, fh, indent=2, ensure_ascii=False)
                print(f"📄 JSON summary saved: {json_path}")
            except (OSError, TypeError, ValueError) as e:
                print(f"⚠️ Could not save JSON summary: {e}")

        print(f"✅ Analysis completed for {query}")
1474
-
1475
def main():
    """Run the interactive phylogenetic analysis workflow.

    Stages, in order: load the dataset (re-prompting for a path until one
    loads), train the AI model, read a query sequence/ID and a target
    similarity percentage from the user, find similar sequences, build the
    tree structure, save the interactive HTML visualization, and generate
    the detailed report.

    Returns:
        None. Returns early, after printing a message, when the user cancels
        or when a required stage (similarity search, tree building,
        visualization) produces no result. A failed detailed report is
        reported as a warning and does not abort the run.
    """
    banner_rule = "=" * 70
    print("\n" + banner_rule)
    print("🧬 PHYLOGENETIC TREE ANALYZER - ADVANCED ML-BASED ANALYSIS")
    print("Version 2.0 | AI-Enhanced Similarity Matching")
    print(banner_rule)

    analyzer = PhylogeneticTreeAnalyzer()

    # Load data: keep asking until a path both exists and loads successfully.
    data_file = "f cleaned.csv"
    while not Path(data_file).exists() or not analyzer.load_data(data_file):
        print(f"❌ File not found or invalid: {data_file}")
        data_file = input("Enter valid data file path: ").strip()
        if not data_file:
            # An empty answer is treated as the user giving up.
            print("❌ Analysis cancelled.")
            return

    # Train AI model; a training failure is non-fatal.
    print("⏳ Training AI model...")
    start_time = time.time()
    if analyzer.train_ai_model():
        print(f"✅ AI model training completed in {time.time() - start_time:.1f} seconds")
    else:
        print("⚠️ AI model training failed, continuing with basic analysis")

    # Get query sequence
    while True:
        query_input = input("\nEnter query sequence or ID (min 10 nucleotides): ").strip()
        if analyzer.find_query_sequence(query_input):
            break
        retry = input("❌ Invalid input. Try again? (y/n): ").strip().lower()
        if retry != 'y':
            print("👋 Analysis cancelled.")
            return

    # Set similarity percentage (blank input selects the default of 85).
    while True:
        try:
            similarity_input = input("Enter target similarity percentage (1-99) [85]: ").strip()
            target_percentage = float(similarity_input) if similarity_input else 85.0
        except ValueError:
            print("❌ Please enter a valid number.")
            continue
        if 1 <= target_percentage <= 99:
            analyzer.matching_percentage = target_percentage
            break
        print("❌ Please enter a percentage between 1 and 99.")

    # Find similar sequences
    print(f"⏳ Analyzing sequences for {target_percentage}% similarity...")
    start_time = time.time()
    matched_ids, actual_percentage = analyzer.find_similar_sequences(target_percentage)
    if not matched_ids:
        print(f"❌ No similar sequences found at {target_percentage}% similarity.")
        return
    analyzer.matched_sequences = matched_ids
    analyzer.actual_percentage = actual_percentage
    print(f"✅ Similarity analysis completed in {time.time() - start_time:.1f} seconds")

    # Build tree structure
    print("⏳ Building phylogenetic tree structure...")
    start_time = time.time()
    tree_structure = analyzer.build_tree_structure_with_ml_safe(matched_ids)
    if not tree_structure:
        print("❌ Failed to build tree structure.")
        return
    print(f"✅ Tree structure built in {time.time() - start_time:.1f} seconds")

    # Create visualization and save HTML
    print("⏳ Creating interactive visualization...")
    start_time = time.time()
    fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
    if not fig:
        print("❌ Visualization creation failed.")
        return

    html_filename = "phylogenetic_tree_interactive.html"
    fig.write_html(html_filename)
    print(f"📄 Interactive HTML saved: {html_filename}")

    # Generate detailed report. Remember the outcome so the summary below
    # never advertises a report file that was not actually produced.
    print("⏳ Generating detailed report...")
    start_time = time.time()
    report_generated = bool(analyzer.generate_detailed_report(matched_ids, actual_percentage))
    if report_generated:
        print(f"✅ Detailed report generated in {time.time() - start_time:.1f} seconds")
    else:
        print("⚠️ Detailed report generation failed; the visualization was still saved")

    print("\n🎉 Analysis completed successfully!")
    print(f" Query ID: {analyzer.query_id}")
    print(f" Query sequence length: {len(analyzer.query_sequence)} nucleotides")
    print(f" Similar sequences found: {len(matched_ids)}")
    print(f" Actual similarity percentage: {actual_percentage:.1f}%")
    print(f" HTML visualization file: {html_filename}")
    if report_generated:
        print(f" HTML report file: detailed_report_{analyzer.query_id.replace('/', '_')}.html")
1569
if __name__ == "__main__":
    # Script entry point: run the interactive workflow and translate the way
    # it ends into a process exit status.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C is a deliberate user exit, so leave with a success status.
        print("\n👋 Goodbye!")
        raise SystemExit(0)
    except Exception as error:
        # Last-resort boundary: report whatever escaped main() and signal failure.
        print(f"\n❌ Unexpected error: {error}")
        raise SystemExit(1)