Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Files Community

re-type committed on Jun 9, 2025

Commit

74167c4

verified ·

1 Parent(s): e856e28

Update app.py

Browse files

Files changed (1) hide show

app.py +713 -407

app.py CHANGED Viewed

@@ -1,833 +1,1139 @@
-# app.py
-import gradio as gr
-import torch
-import pickle
-import subprocess
-import pandas as pd
-import os
-import re
-import logging
-import numpy as np
-from predictor import GenePredictor  # Kept for potential future use, but not loaded
-from tensorflow.keras.models import load_model
 import ml_simplified_tree
 import tempfile
 import shutil
-import stat
 from pathlib import Path
-from huggingface_hub import hf_hub_download
-from tensorflow.keras.preprocessing.sequence import pad_sequences
 # --- Global Variables ---
 MAFFT_PATH = "mafft/mafftdir/bin/mafft"  # Update this path as needed
-IQTREE_PATH = "iqtree/bin/iqtree2"  # Update this path as needed
-# --- Logging ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # --- Paths ---
 model_repo = "GGproject10/best_boundary_aware_model"
 csv_path = "f cleaned.csv"
-classifier_model_dir = "model"  # Directory for second model files
 # Get HF token from environment (if available)
 hf_token = os.getenv("HF_TOKEN")
-# --- Load Models ---
-boundary_model = None  # Disabled as per request
 keras_model = None
 kmer_to_index = None
-classifier_model = None
-classifier_kmer_to_index = None
-classifier_maxlen = None
-# Note: Boundary Model is disabled as per user request
-logging.info("Boundary Model is currently disabled. Input will be used directly for verification and tree analysis.")
 # Try to load Keras model from Hugging Face Hub
 try:
-    keras_path = hf_hub_download(repo_id=model_repo, filename="best_model.keras", token=hf_token)
-    kmer_path = hf_hub_download(repo_id=model_repo, filename="kmer_to_index.pkl", token=hf_token)
     if os.path.exists(keras_path) and os.path.exists(kmer_path):
         keras_model = load_model(keras_path)
         with open(kmer_path, "rb") as f:
-            kmer_to_index = pickle.load(f)
-        logging.info("Keras model and k-mer index loaded successfully from Hugging Face Hub.")
-    else:
-        logging.warning(f"Keras model or kmer files not found after download")
 except Exception as e:
     logging.error(f"Failed to load Keras model from HF Hub: {e}")
-# Try to load classifier model (second model)
-os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-try:
-    classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
-    classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
-    classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
-    missing_files = []
-    if not os.path.exists(classifier_path):
-        missing_files.append("best_model.keras")
-    if not os.path.exists(classifier_kmer_path):
-        missing_files.append("kmer_to_index.pkl")
-    if not os.path.exists(classifier_maxlen_path):
-        missing_files.append("maxlen.txt")
-    if missing_files:
-        logging.warning(f"Classifier model files not found: {', '.join(missing_files)}")
-    else:
-        classifier_model = load_model(classifier_path)
-        with open(classifier_kmer_path, "rb") as f:
-            classifier_kmer_to_index = pickle.load(f)
-        with open(classifier_maxlen_path, "r") as f:
-            classifier_maxlen = int(f.read().strip())
-        logging.info("Classifier model loaded successfully.")
-except Exception as e:
-    logging.error(f"Failed to load classifier model: {e}")
-    logging.warning("Falling back to existing Keras model for validation.")
-LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
 # --- Initialize Tree Analyzer ---
 analyzer = None
 try:
-    analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
     if os.path.exists(csv_path):
         if analyzer.load_data(csv_path):
             logging.info("Tree analyzer initialized successfully")
             try:
                 if not analyzer.train_ai_model():
                     logging.warning("AI model training failed; proceeding with basic analysis.")
-            except Exception as e:
-                logging.warning(f"AI model training failed: {e}")
-        else:
-            logging.error("Failed to load CSV data for tree analyzer")
-            analyzer = None
-    else:
-        logging.error(f"CSV file not found: {csv_path}")
-        analyzer = None
-except Exception as e:
-    logging.error(f"Failed to initialize tree analyzer: {e}")
     analyzer = None
 # --- Enhanced Tool Detection ---
-def check_and_fix_executable_permissions(filepath):
-    """Check and fix executable permissions for a file"""
-    try:
-        if os.path.exists(filepath):
-            if not os.access(filepath, os.X_OK):
-                logging.info(f"File {filepath} is not executable, attempting to fix permissions...")
-                current_permissions = os.stat(filepath).st_mode
-                os.chmod(filepath, current_permissions | stat.S_IEXEC | stat.S_IXUSR | stat.S_IXGRP)
-                logging.info(f"Fixed permissions for {filepath}")
-                return True
-            return True
-        return False
-    except Exception as e:
-        logging.error(f"Failed to fix permissions for {filepath}: {e}")
-        return False
-def enhanced_check_tool_availability():
-    """Enhanced check for MAFFT and IQ-TREE availability with permission fixing"""
     mafft_available = False
     mafft_cmd = None
     mafft_candidates = [
         MAFFT_PATH,
         'mafft',
         '/usr/bin/mafft',
         '/usr/local/bin/mafft',
-        '/opt/homebrew/bin/mafft',
-        '/usr/local/homebrew/bin/mafft',
-        'mafft.bat',
     ]
     for candidate in mafft_candidates:
-        if candidate and os.path.exists(candidate):
-            if "/" in candidate and not candidate.startswith("/usr/") and not candidate.startswith("/opt/"):
-                check_and_fix_executable_permissions(candidate)
-            if os.access(candidate, os.X_OK) or shutil.which(candidate) is not None:
-                mafft_available = True
-                mafft_cmd = candidate
-                logging.info(f"Found MAFFT at: {candidate}")
-                break
-        elif candidate and shutil.which(candidate) is not None:
             mafft_available = True
             mafft_cmd = candidate
-            logging.info(f"Found MAFFT in PATH: {candidate}")
             break
     iqtree_available = False
     iqtree_cmd = None
     iqtree_candidates = [
         IQTREE_PATH,
         'iqtree2',
-        'iqtree',
-        '/usr/bin/iqtree2',
         '/usr/local/bin/iqtree2',
         '/usr/bin/iqtree',
         '/usr/local/bin/iqtree',
-        '/opt/homebrew/bin/iqtree2',
-        'iqtree2.exe',
-        'iqtree.exe',
     ]
     for candidate in iqtree_candidates:
-        if candidate and os.path.exists(candidate):
-            if "/" in candidate and not candidate.startswith("/usr/") and not candidate.startswith("/opt/"):
-                check_and_fix_executable_permissions(candidate)
-            if os.access(candidate, os.X_OK) or shutil.which(candidate) is not None:
-                iqtree_available = True
-                iqtree_cmd = candidate
-                logging.info(f"Found IQ-TREE at: {candidate}")
-                break
-        elif candidate and shutil.which(candidate) is not None:
             iqtree_available = True
             iqtree_cmd = candidate
-            logging.info(f"Found IQ-TREE in PATH: {candidate}")
             break
     return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
-def get_installation_instructions():
-    """Get detailed installation instructions based on the current system"""
-    import platform
-    system = platform.system().lower()
-    if system == "linux":
-        try:
-            with open('/etc/os-release', 'r') as f:
-                os_info = f.read().lower()
-            if 'ubuntu' in os_info or 'debian' in os_info:
-                return """
-📦 INSTALLATION INSTRUCTIONS (Ubuntu/Debian):
-1. Update package list: sudo apt-get update
-2. Install MAFFT and IQ-TREE: sudo apt-get install mafft iqtree
-3. Verify installation: mafft --version, iqtree2 --version
-Alternative using Conda: conda install -c bioconda mafft iqtree
-"""
-            elif 'centos' in os_info or 'rhel' in os_info or 'fedora' in os_info:
-                return """
-📦 INSTALLATION INSTRUCTIONS (CentOS/RHEL/Fedora):
-1. Install EPEL repository (CentOS/RHEL): sudo yum install epel-release
-2. Install packages: sudo yum install mafft iqtree
-3. Verify installation: mafft --version, iqtree2 --version
-"""
-        except:
-            pass
-    elif system == "darwin":
-        return """
-📦 INSTALLATION INSTRUCTIONS (macOS):
-Using Homebrew: 1. Install Homebrew: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
-2. Install MAFFT and IQ-TREE: brew install mafft iqtree
-3. Verify installation: mafft --version, iqtree2 --version
-Using Conda: conda install -c bioconda mafft iqtree
-"""
-    elif system == "windows":
-        return """
-📦 INSTALLATION INSTRUCTIONS (Windows):
-Option 1 - Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree
-Option 2 - Manual: 1. Download MAFFT: https://mafft.cbrc.jp/alignment/software/
-2. Download IQ-TREE: http://www.iqtree.org/
-3. Add to PATH
-"""
-    return """
-📦 GENERAL INSTALLATION INSTRUCTIONS:
-Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree
-Manual: 1. MAFFT: https://mafft.cbrc.jp/alignment/software/
-2. IQ-TREE: http://www.iqtree.org/
 """
-def run_mafft_alignment_improved(input_fasta, output_fasta, mafft_cmd):
-    """Run MAFFT alignment with improved permission and error handling"""
     try:
-        if not os.access(mafft_cmd, os.X_OK):
-            logging.warning(f"MAFFT executable {mafft_cmd} is not executable")
-            if not check_and_fix_executable_permissions(mafft_cmd):
-                return False, f"Cannot make {mafft_cmd} executable"
-        try:
-            test_result = subprocess.run([mafft_cmd, '--version'], capture_output=True, text=True, timeout=10)
-            if test_result.returncode != 0:
-                return False, f"MAFFT version check failed: {test_result.stderr}"
-        except Exception as e:
-            return False, f"MAFFT version check failed: {str(e)}"
-        cmd = [mafft_cmd, '--auto', '--quiet', '--thread', '2', input_fasta]
         logging.info(f"Running MAFFT: {' '.join(cmd)}")
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd())
         if result.returncode == 0:
             with open(output_fasta, 'w') as f:
                 f.write(result.stdout)
             logging.info(f"MAFFT alignment completed: {output_fasta}")
             if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
                 return True, output_fasta
             else:
-                return False, "MAFFT completed but output file is empty"
-        else:
             error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
             logging.error(f"MAFFT failed: {error_msg}")
             return False, f"MAFFT error: {error_msg}"
     except subprocess.TimeoutExpired:
         logging.error("MAFFT timeout")
         return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
-    except PermissionError as e:
-        logging.error(f"Permission error running MAFFT: {e}")
-        return False, f"Permission denied: {mafft_cmd}. Please check file permissions."
     except FileNotFoundError:
         return False, f"MAFFT executable not found: {mafft_cmd}"
     except Exception as e:
-        logging.error(f"MAFFT execution failed: {e}")
-        return False, f"MAFFT execution failed: {str(e)}"
 def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
     """Run IQ-TREE with enhanced options and error handling"""
     try:
-        if not os.access(iqtree_cmd, os.X_OK):
-            logging.warning(f"IQ-TREE executable {iqtree_cmd} is not executable")
-            if not check_and_fix_executable_permissions(iqtree_cmd):
-                return False, f"Cannot make {iqtree_cmd} executable"
-        try:
-            test_result = subprocess.run([iqtree_cmd, '--version'], capture_output=True, text=True, timeout=10)
-            if test_result.returncode != 0:
-                return False, f"IQ-TREE version check failed: {test_result.stderr}"
-        except Exception as e:
-            return False, f"IQ-TREE version check failed: {str(e)}"
-        cmd = [iqtree_cmd, '-s', aligned_fasta, '-m', 'MFP', '-bb', '1000', '-alrt', '1000', '-nt', 'AUTO', '--prefix', output_prefix, '-redo', '--quiet']
         logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=1200, cwd=os.getcwd())
         if result.returncode == 0:
             tree_file = f"{output_prefix}.treefile"
             if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
-                logging.info(f"IQ-TREE analysis completed: {tree_file}")
-                return True, tree_file
-            else:
-                logging.error("IQ-TREE completed but tree file not found or empty")
-                return False, "Tree file not generated or empty"
-        else:
             error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
             logging.error(f"IQ-TREE failed: {error_msg}")
             return False, f"IQ-TREE error: {error_msg}"
     except subprocess.TimeoutExpired:
         logging.error("IQ-TREE timeout")
         return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
-    except PermissionError as e:
-        logging.error(f"Permission error running IQ-TREE: {e}")
-        return False, f"Permission denied: {iqtree_cmd}. Please check file permissions."
     except FileNotFoundError:
         return False, f"IQ-TREE executable not found: {iqtree_cmd}"
     except Exception as e:
-        logging.error(f"IQ-TREE execution failed: {e}")
-        return False, f"IQ-TREE execution failed: {str(e)}"
 def create_simple_neighbor_joining_tree(sequences_dict):
     """Write a placeholder star-shaped Newick tree when ML tools are not available.
     Despite the name, no distances are computed and no neighbor joining is
     performed: the sequence contents are never read, and every leaf hangs
     off a single root with a fixed branch length of 0.1.
     Args:
         sequences_dict: Mapping whose keys become the leaf labels. Only the
             first five keys (in iteration order) are written to the tree.
     Returns:
         A ``(tree_file, message)`` tuple. On success ``tree_file`` is the
         path "simple_tree.nwk" (relative to the current working directory)
         and ``message`` is a status string. If fewer than two sequences are
         supplied, or anything fails while building or writing the tree,
         ``tree_file`` is ``None`` and ``message`` describes the problem.
     """
     try:
         seq_names = list(sequences_dict.keys())
         if len(seq_names) < 2:
             return None, "Need at least 2 sequences for tree construction"
         # NOTE(review): labels beyond the fifth are silently dropped, and
         # labels are written verbatim, so names containing Newick
         # metacharacters such as "(", ")", ":", "," or ";" would yield an
         # unparsable tree. Confirm that callers only pass simple identifiers.
         leaves = ",".join(f"{name}:0.1" for name in seq_names[:5])
         # The two-sequence case needs no special handling: the generic
         # star-tree form already yields "(a:0.1,b:0.1);".
         tree_str = f"({leaves});"
         tree_file = "simple_tree.nwk"
         # Explicit encoding keeps the output independent of the host locale.
         with open(tree_file, 'w', encoding='utf-8') as f:
             f.write(tree_str)
         return tree_file, "Simple distance-based tree created"
     except Exception as e:
         # Best-effort fallback: report the failure instead of raising so the
         # caller can surface the message to the user.
         return None, f"Simple tree creation failed: {str(e)}"
 def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
     """Create a multi-FASTA file with query sequence and reference sequences"""
     try:
         temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
         temp_fasta.write(f">{query_id}\n{query_sequence}\n")
         ref_fasta_path = "f_gene_sequences_aligned.fasta"
         if os.path.exists(ref_fasta_path):
             with open(ref_fasta_path, 'r') as ref_file:
                 temp_fasta.write(ref_file.read())
             logging.info(f"Added reference sequences from {ref_fasta_path}")
         else:
             if analyzer and hasattr(analyzer, 'data'):
                 count = 0
                 for idx, row in analyzer.data.iterrows():
-                    if 'sequence' in row and len(str(row['sequence'])) > 50:
-                        seq_id = row.get('id', f"Ref_{count}")
                         sequence = str(row['sequence']).upper()
                         temp_fasta.write(f">{seq_id}\n{sequence}\n")
                         count += 1
-                        if count >= 20:
                             break
                 logging.info(f"Added {count} reference sequences from CSV")
         temp_fasta.close()
         return temp_fasta.name
     except Exception as e:
         logging.error(f"Failed to create multi-FASTA: {e}")
         return None
 def build_maximum_likelihood_tree(f_gene_sequence):
     """Build maximum likelihood phylogenetic tree with comprehensive fallback options"""
     try:
-        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = enhanced_check_tool_availability()
         status_msg = "🔍 Checking dependencies...\n"
-        status_msg += f"✅ MAFFT found: {mafft_cmd}\n" if mafft_available else "❌ MAFFT not found\n"
-        status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n" if iqtree_available else "❌ IQ-TREE not found\n"
-        if not mafft_available or not iqtree_available:
-            instructions = get_installation_instructions()
-            return False, f"{status_msg}\n{instructions}", None, None
         output_dir = "ml_tree_output"
         os.makedirs(output_dir, exist_ok=True)
         logging.info("Creating multi-FASTA file...")
         multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
         if not multi_fasta:
             return False, f"{status_msg}❌ Failed to create input FASTA", None, None
         logging.info("Running MAFFT alignment...")
         aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
-        mafft_success, mafft_result = run_mafft_alignment_improved(multi_fasta, aligned_fasta, mafft_cmd)
         os.unlink(multi_fasta)
         if not mafft_success:
             return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
         logging.info("Running IQ-TREE analysis...")
         tree_prefix = os.path.join(output_dir, "ml_tree")
         iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
         if not iqtree_success:
             return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
         tree_file = iqtree_result
         log_file = f"{tree_prefix}.log"
         standard_aligned = "f_gene_sequences_aligned.fasta"
         standard_tree = "f_gene_sequences.phy.treefile"
         if os.path.exists(aligned_fasta):
             shutil.copy2(aligned_fasta, standard_aligned)
         if os.path.exists(tree_file):
             shutil.copy2(tree_file, standard_tree)
-        success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n- Alignment: {os.path.basename(aligned_fasta)}\n- Tree: {os.path.basename(tree_file)}\n"
         if os.path.exists(log_file):
             try:
                 with open(log_file, 'r') as f:
                     log_content = f.read()
                     if "Best-fit model:" in log_content:
                         model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
                         if model_lines:
                             success_msg += f"- {model_lines[0].strip()}\n"
             except Exception as e:
                 logging.warning(f"Could not read log file: {e}")
         logging.info("Maximum likelihood tree construction completed")
         return True, success_msg, aligned_fasta, tree_file
     except Exception as e:
         logging.error(f"ML tree construction failed: {e}")
         return False, f"ML tree construction failed: {str(e)}", None, None
 def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
-    """Analyze sequence and create phylogenetic tree"""
     try:
         if not analyzer:
             return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
         if not sequence:
             return "Error: Please provide a sequence."
         if not (1 <= matching_percentage <= 99):
             return "Error: Matching percentage must be between 1 and 99."
         if not analyzer.find_query_sequence(sequence):
             return "Error: Invalid query sequence or sequence not found in dataset."
         analyzer.matching_percentage = matching_percentage
         matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
         if not matched_ids:
             return f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
         logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
         tree_structure = analyzer.build_tree_structure(matched_ids)
         if not tree_structure:
             return "Error: Failed to build tree structure."
         fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
         if not fig:
             return "Error: Failed to create tree visualization."
         html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')
         output_dir = "output"
         os.makedirs(output_dir, exist_ok=True)
         safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
         html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
         with open(html_filename, "w", encoding='utf-8') as f:
             f.write(html_content)
         logging.info(f"Tree HTML saved to {html_filename}")
         return html_content
     except Exception as e:
         error_msg = f"Tree analysis error: {str(e)}"
         logging.error(error_msg)
-        import traceback
         logging.error(f"Full traceback: {traceback.format_exc()}")
         return error_msg
 def predict_with_keras(sequence):
-    """Keras prediction for initial sequence processing"""
     try:
         if not keras_model or not kmer_to_index:
             return f"Keras model not available. Input sequence: {sequence[:100]}..."
         if len(sequence) < 6:
             return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."
         kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
         indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
         input_arr = np.array([indices])
         prediction = keras_model.predict(input_arr, verbose=0)[0]
         result = ''.join([str(round(p, 3)) for p in prediction])
         return result
     except Exception as e:
         logging.error(f"Keras prediction failed: {e}")
         return f"Keras prediction failed: {str(e)}"
-def classify_sequence(sequence):
-    """Classify sequence using the second model or fallback"""
-    try:
-        if not classifier_model or not classifier_kmer_to_index or classifier_maxlen is None:
-            if keras_model and kmer_to_index:  # Fallback to Keras model
-                logging.warning("Using Keras model as fallback for classification.")
-                if len(sequence) < 6:
-                    return {
-                        "status": "error",
-                        "message": "Sequence too short for k-mer prediction (minimum 6 nucleotides).",
-                        "confidence": None,
-                        "predicted_label": None
-                    }
-                kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
-                indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
-                input_arr = np.array([indices])
-                pred = keras_model.predict(input_arr, verbose=0)[0]
-                confidence = float(np.max(pred))
-                label = "F" if confidence > 0.5 else "Unknown"  # Simple threshold-based fallback
-                return {
-                    "status": "success" if label == "F" else "warning",
-                    "message": f"F gene detected (fallback)" if label == "F" else "Uncertain classification (fallback)",
-                    "confidence": confidence,
-                    "predicted_label": label
-                }
-            return {
-                "status": "error",
-                "message": "No classification model available.",
-                "confidence": None,
-                "predicted_label": None
-            }
-        if len(sequence) < 1500:
-            return {
-                "status": "error",
-                "message": "Sequence too short. Must be at least 1500 bases.",
-                "confidence": None,
-                "predicted_label": None
-            }
-        tokens = [sequence[i:i+6] for i in range(len(sequence)-5+1)]
-        encoded = [classifier_kmer_to_index.get(kmer, 0) for kmer in tokens]
-        padded = pad_sequences([encoded], maxlen=classifier_maxlen, padding='post')
-        pred = classifier_model.predict(padded, verbose=0)
-        predicted_class = int(np.argmax(pred))
-        label = LABELS[predicted_class]
-        confidence = float(np.max(pred))
-        if label == "F":
-            return {
-                "status": "success",
-                "message": "F gene detected.",
-                "confidence": confidence,
-                "predicted_label": label
-            }
-        elif label == "Random":
-            return {
-                "status": "error",
-                "message": "Unidentified sequence detected. Make sure you're entering the F gene of the NDV.",
-                "confidence": confidence,
-                "predicted_label": label
-            }
-        else:
-            return {
-                "status": "error",
-                "message": "No F-gene detected. Please enter an NDV's F gene.",
-                "confidence": confidence,
-                "predicted_label": label
-            }
-    except Exception as e:
-        logging.error(f"Classifier prediction failed: {e}")
-        return {
-            "status": "error",
-            "message": f"Prediction failed: {str(e)}",
-            "confidence": None,
-            "predicted_label": None
-        }
 def read_fasta_file(file_obj):
-    """Read FASTA file content"""
     try:
         if file_obj is None:
             return ""
         if hasattr(file_obj, 'name'):
             with open(file_obj.name, "r") as f:
                 content = f.read()
         else:
             content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
         lines = content.strip().split("\n")
         seq_lines = [line.strip() for line in lines if not line.startswith(">")]
         return ''.join(seq_lines)
-    except Exception as e:
         logging.error(f"Failed to read FASTA file: {e}")
         return ""
 def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
-    """Run pipeline from FASTA file"""
     try:
         dna_input = read_fasta_file(fasta_file_obj)
         if not dna_input:
-            return "Failed to read FASTA file", "", "", "", "", "", "", "", "", None, None, None, "No input sequence"
         return run_pipeline(dna_input, similarity_score, build_ml_tree)
     except Exception as e:
         error_msg = f"Pipeline error: {str(e)}"
         logging.error(error_msg)
-        return error_msg, "", "", "", "", "", "", "", "", None, None, None, error_msg
 def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
-    """Run the full pipeline with direct input to verification and ML tree"""
     try:
         dna_input = dna_input.upper().strip()
         if not dna_input:
-            return "Empty input", "", "", "", "", "", "", "", "", None, None, None, "No input provided"
         if not re.match('^[ACTGN]+$', dna_input):
             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
             logging.info("DNA sequence sanitized")
-        # Step 1: Direct input (Boundary Model disabled)
-        processed_sequence = dna_input
-        boundary_output = "Boundary Model disabled. Using raw input: " + str(len(dna_input)) + " bp"
-        logging.info("Using raw input directly for verification and tree analysis")
-        # Step 2: Keras Prediction (Verification)
         keras_output = ""
         if processed_sequence and len(processed_sequence) >= 6:
             keras_prediction = predict_with_keras(processed_sequence)
-            keras_output = keras_prediction if not keras_prediction.startswith(("Keras", "Sequence")) else keras_prediction
-        # Step 3: Classifier Prediction
-        classifier_result = classify_sequence(processed_sequence)
-        classifier_status = classifier_result["status"]
-        classifier_message = classifier_result["message"]
-        classifier_label = classifier_result["predicted_label"]
-        classifier_confidence = classifier_result["confidence"]
-        # Step 4: Maximum Likelihood Tree
         aligned_file = None
         phy_file = None
         ml_tree_output = ""
         if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
             try:
                 logging.info("Starting maximum likelihood tree construction...")
                 ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
                 if ml_success:
                     ml_tree_output = ml_message
                     aligned_file = ml_aligned
                     phy_file = ml_tree
                 else:
-                    ml_tree_output = ml_message
             except Exception as e:
                 ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
                 logging.error(f"ML Tree failed: {e}")
-        elif build_ml_tree:
-            ml_tree_output = "❌ F gene sequence too short for ML tree construction (minimum 50 bp)"
         else:
             ml_tree_output = "ML tree construction skipped (not requested)"
-        # Step 5: ML Simplified Tree
         html_file = None
         tree_html_content = "No tree generated"
         simplified_ml_output = ""
         if analyzer and processed_sequence and len(processed_sequence) >= 10:
             try:
                 logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
                 tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
                 if tree_result and not tree_result.startswith("Error:"):
                     tree_html_content = tree_result
                     simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
                     output_dir = "output"
                     if os.path.exists(output_dir):
                         html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
                         if html_files:
-                            html_file = os.path.join(output_dir, html_files[-1])
                             simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
                     if analyzer.find_query_sequence(processed_sequence):
                         matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
                         simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
-                        simplified_ml_output += f"\n- Similarity threshold: {perc:.1f}%"
                 else:
                     simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
                     tree_html_content = f"<p>Error: {tree_result}</p>"
             except Exception as e:
                 logging.error(f"Simplified ML tree analysis failed: {e}")
                 simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
-                tree_html_content = f"<p>Error: {str(e)}</p>"
-        else:
-            if not analyzer:
-                simplified_ml_output = "❌ Tree analyzer not available"
-            else:
-                simplified_ml_output = "❌ F gene sequence too short for tree analysis (minimum 10 bp)"
         # Return all results
         return (
-            boundary_output,
-            keras_output,
-            classifier_status,
-            classifier_message,
-            classifier_label,
-            classifier_confidence,
-            ml_tree_output,
-            simplified_ml_output,
-            tree_html_content,
-            aligned_file,
-            phy_file,
-            html_file,
-            f"Pipeline completed. Input length: {len(processed_sequence)} bp"
         )
     except Exception as e:
         error_msg = f"Pipeline execution failed: {str(e)}"
         logging.error(error_msg)
         import traceback
         logging.error(f"Full traceback: {traceback.format_exc()}")
         return (
-            error_msg, "", "", "", "", "", "", "", f"<p>Error: {error_msg}</p>",
             None, None, None, error_msg
         )
 # --- Gradio Interface ---
 def create_interface():
     """Create the Gradio interface with enhanced layout and features.

     Builds a ``gr.Blocks`` app with three tabs (analysis inputs, results and
     help), wires the run / clear / example buttons to their callbacks and
     returns the ``Blocks`` object without launching it.  The "Available
     Models" list reads the module-level ``keras_model``, ``classifier_model``
     and ``analyzer`` names while the interface is being constructed.
     """
     custom_css = """
-    .gradio-container { max-width: 1200px !important; }
-    .tab-nav button { font-size: 16px !important; }
-    .output-html { height: 600px !important; overflow: auto; }
     """
     with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
         gr.Markdown("""
         # 🧬 F Gene Analysis Pipeline
-        This tool analyzes input sequences directly (Boundary Model disabled):
-        - **Gene Validation**: Validates with machine learning.
-        - **Gene Classification**: Classifies sequence type (F gene or other).
-        - **Phylogenetic Analysis**: Builds maximum likelihood and simplified trees.
         **Instructions:**
-        1. Enter your sequence or upload a FASTA file
-        2. Adjust similarity threshold (1-99%)
-        3. Choose whether to build ML tree (requires MAFFT & IQ-TREE)
-        4. Click "Run Analysis" to start
         """)
         with gr.Tab("🔬 Analysis Pipeline"):
             with gr.Row():
                 with gr.Column(scale=2):
                     gr.Markdown("### Input Sequence")
-                    dna_input = gr.Textbox(label="DNA Sequence", placeholder="Enter your DNA sequence here (ATCG format)...", lines=5, max_lines=10)
-                    fasta_file = gr.File(label="Or Upload FASTA File", file_types=[".fasta", ".fa", ".fas", ".txt"])
                     with gr.Row():
-                        similarity_score = gr.Slider(minimum=1, maximum=99, value=95.0, step=1.0, label="Similarity Threshold (%)", info="Minimum similarity for phylogenetic analysis")
-                        build_ml_tree = gr.Checkbox(label="Build ML Tree", value=False, info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)")
                     with gr.Row():
                         run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
                         clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                 with gr.Column(scale=1):
                     gr.Markdown("### Analysis Status")
-                    status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
                     gr.Markdown("### Available Models")
                     # Status lines are computed here, while the layout is built;
                     # nothing below refreshes them after a run.
                     model_status = []
-                    model_status.append("❌ Boundary Detection Model (Disabled)")  # Reflect disabled state
                     if keras_model:
                         model_status.append("✅ Gene Validation Model")
                     else:
                         model_status.append("❌ Gene Validation Model")
-                    if classifier_model:
-                        model_status.append("✅ Gene Classification Model")
-                    else:
-                        model_status.append("❌ Gene Classification Model")
                     if analyzer:
                         model_status.append("✅ Tree Analysis Module")
                     else:
                         model_status.append("❌ Tree Analysis Module")
                     gr.Markdown("\n".join(model_status))
         with gr.Tab("📊 Results"):
             with gr.Row():
                 with gr.Column():
-                    boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False, value="Boundary Model disabled. Using raw input.")
-                    keras_output = gr.Textbox(label="🔍 Gene Validation", lines=3, interactive=False)
-                    classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
-                    classifier_message = gr.Textbox(label="📝 Classification Message", lines=2, interactive=False)
-                    classifier_label = gr.Textbox(label="🏷️ Predicted Label", lines=1, interactive=False)
-                    classifier_confidence = gr.Textbox(label="📊 Confidence Score", lines=1, interactive=False)
                 with gr.Column():
-                    ml_tree_output = gr.Textbox(label="🌳 Maximum Likelihood Tree", lines=5, interactive=False)
-                    simplified_ml_output = gr.Textbox(label="📈 Simplified Phylogenetic Analysis", lines=3, interactive=False)
             gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
-            tree_html = gr.HTML(label="Interactive Tree", value="<p>No tree generated yet. Run analysis to see results.</p>")
             gr.Markdown("### 📁 Download Results")
             with gr.Row():
-                aligned_file = gr.File(label="Aligned Sequences (FASTA)", interactive=False)
-                phy_file = gr.File(label="Phylogenetic Tree File", interactive=False)
-                html_file = gr.File(label="Interactive Tree (HTML)", interactive=False)
         with gr.Tab("ℹ️ Help & Info"):
             gr.Markdown("""
             ## About This Tool
             ### F Gene Analysis Pipeline
-            - **🎯 F Gene Extraction**: Disabled; uses raw input directly.
-            - **🔍 Gene Validation**: Validates with k-mer based machine learning.
-            - **🧬 Gene Classification**: Classifies sequences (F gene or other).
-            - **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
             ### Input Requirements
-            - DNA Sequences: ATCG format, minimum 50 bp.
-            - FASTA Files: Standard format.
-            - Similarity Threshold: 1-99%.
             ### Dependencies
-            **For ML Trees:**
             ```bash
-            # Ubuntu/Debian: sudo apt-get install mafft iqtree
-            # macOS: brew install mafft iqtree
-            # Conda: conda install -c bioconda mafft iqtree
             ```
             ### Troubleshooting
-            - *"No similar sequences"*: Lower similarity threshold.
-            - *"Sequence too short"*: Provide >50 bp.
-            - *"MAFFT/IQ-TREE not found"*: Install dependencies.
-            - *"Model not available"*: Check model files.
             """)
         # An uploaded FASTA file takes precedence over the text box.
         def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
             if file_obj is not None:
                 return run_pipeline_from_file(file_obj, sim_score, build_tree)
             else:
-                return run_pipeline(dna_seq, sim_score, build_tree)
         # Resets the four inputs and the status box only; the Results tab
         # components are not among clear_btn's outputs.
         def clear_inputs():
             return "", None, 95.0, False, "Ready to analyze"
         # 13 output components: the callback has to return one value per
         # component, in exactly this order.
         run_btn.click(
             fn=run_analysis_combined,
             inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
             outputs=[
-                boundary_output, keras_output, classifier_status, classifier_message,
-                classifier_label, classifier_confidence, ml_tree_output, simplified_ml_output,
-                tree_html, aligned_file, phy_file, html_file, status_display
             ]
         )
         clear_btn.click(
             fn=clear_inputs,
             outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
         )
         # Declared after the tab blocks, directly under gr.Blocks rather than
         # inside any tab.
         example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")
         def load_example():
             example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
             return example_seq, "Example F gene sequence loaded"
-        example_btn.click(fn=load_example, outputs=[dna_input, status_display])
     return iface
 # --- Main Execution ---
 # Script entry point: build the interface and start the Gradio server.
 if __name__ == "__main__":
     interface = create_interface()
     # Listens on 0.0.0.0:7860 with no share link and no authentication.
     # NOTE(review): debug=True and ssl_verify=False look like development
     # settings - confirm they are intended for the deployed Space.
     interface.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        debug=True,
-        show_error=True,
-        max_threads=4,
-        auth=None,
-        ssl_verify=False,
-        quiet=False
     )

 import ml_simplified_tree
 import tempfile
 import shutil
+import sys
 from pathlib import Path
 # --- Global Variables ---
 MAFFT_PATH = "mafft/mafftdir/bin/mafft"  # Update this path as needed
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # --- Paths ---
+from huggingface_hub import hf_hub_download
+# Model repository and file paths
 model_repo = "GGproject10/best_boundary_aware_model"
 csv_path = "f cleaned.csv"
 # Get HF token from environment (if available)
 hf_token = os.getenv("HF_TOKEN")
+boundary_model = None
 keras_model = None
 kmer_to_index = None
+# Try to load boundary model from Hugging Face Hub
+try:
+    boundary_path = hf_hub_download(
+        repo_id=model_repo,
+        filename="best_boundary_aware_model.pth",
+        token=hf_token
+    )
+    if os.path.exists(boundary_path):
+        boundary_model = GenePredictor(boundary_path)
+        logging.info("Boundary model loaded successfully from Hugging Face Hub.")
 # Try to load Keras model from Hugging Face Hub
 try:
+    keras_path = hf_hub_download(
+        repo_id=model_repo,
+        filename="best_model.keras",
+        token=hf_token
+    )
+    kmer_path = hf_hub_download(
+        repo_id=model_repo,
+        filename="kmer_to_index.pkl",
+        token=hf_token
+    )
     if os.path.exists(keras_path) and os.path.exists(kmer_path):
         keras_model = load_model(keras_path)
         with open(kmer_path, "rb") as f:
 except Exception as e:
     logging.error(f"Failed to load Keras model from HF Hub: {e}")
 # --- Initialize Tree Analyzer ---
 analyzer = None
 try:
     if os.path.exists(csv_path):
         if analyzer.load_data(csv_path):
             logging.info("Tree analyzer initialized successfully")
+            # Try to train AI model (optional)
             try:
                 if not analyzer.train_ai_model():
                     logging.warning("AI model training failed; proceeding with basic analysis.")
     analyzer = None
 # --- Enhanced Tool Detection ---
def check_tool_availability():
    """Locate usable MAFFT and IQ-TREE executables.

    Each tool has an ordered candidate list: the bundled path first, then
    bare command names resolved through ``PATH``, then common absolute install
    locations.  The first candidate that resolves to an executable file wins.

    Returns:
        tuple: ``(mafft_available, iqtree_available, mafft_cmd, iqtree_cmd)``.
        The ``*_available`` items are booleans; the ``*_cmd`` items are the
        winning candidate strings, or ``None`` when no candidate is usable.
    """

    def _first_executable(tool_label, candidates):
        """Return the first candidate that is an executable file, else None."""
        for candidate in candidates:
            if not candidate:
                continue
            # shutil.which() only reports executable regular files.  The
            # previous os.path.exists() test also accepted directories and
            # non-executable files, which were announced as "found" and then
            # failed when handed to subprocess.
            if shutil.which(candidate) is not None:
                logging.info(f"Found {tool_label} at: {candidate}")
                return candidate
            if os.path.exists(candidate):
                # Keep searching so a later candidate (e.g. a system install)
                # can still be used.
                logging.warning(
                    f"{tool_label} candidate exists but is not an executable file: {candidate}"
                )
        return None

    mafft_cmd = _first_executable("MAFFT", [
        MAFFT_PATH,
        'mafft',
        '/usr/bin/mafft',
        '/usr/local/bin/mafft',
        'mafft.bat',  # Windows
    ])
    iqtree_cmd = _first_executable("IQ-TREE", [
        IQTREE_PATH,
        'iqtree2',
        'iqtree',  # name installed by several distro/conda packages
        '/usr/bin/iqtree2',
        '/usr/local/bin/iqtree2',
        '/usr/bin/iqtree',
        '/usr/local/bin/iqtree',
        'iqtree2.exe',  # Windows
        'iqtree.exe',   # Windows
    ])
    return mafft_cmd is not None, iqtree_cmd is not None, mafft_cmd, iqtree_cmd
def install_dependencies_guide():
    """Return the plain-text installation guide for MAFFT and IQ-TREE.

    The text is shown to the user verbatim when the external tools cannot be
    found, so it must not carry any diff/gutter markers at the start of lines.
    """
    return """
🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
For MAFFT:
- Ubuntu/Debian: sudo apt-get install mafft
- CentOS/RHEL: sudo yum install mafft
- macOS: brew install mafft
- Windows: Download from https://mafft.cbrc.jp/alignment/software/
For IQ-TREE:
- Ubuntu/Debian: sudo apt-get install iqtree
- CentOS/RHEL: sudo yum install iqtree
- macOS: brew install iqtree
- Windows: Download from http://www.iqtree.org/
Alternative: Use conda/mamba:
- conda install -c bioconda mafft iqtree
Docker option:
- docker run -it --rm -v $(pwd):/data quay.io/biocontainers/mafft:7.490--h779adbc_0
- docker run -it --rm -v $(pwd):/data quay.io/biocontainers/iqtree:2.1.4_beta--hdcc8f71_0
"""
+def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
+    """Run MAFFT alignment with enhanced error handling"""
     try:
+        # MAFFT command with more robust options
+        cmd = [
+            mafft_cmd,
+            '--auto',  # Automatic strategy selection
+            '--quiet',  # Reduce output verbosity
+            input_fasta
+        ]
         logging.info(f"Running MAFFT: {' '.join(cmd)}")
+        # Run MAFFT with enhanced error handling
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=600,  # Increased timeout to 10 minutes
+            cwd=os.getcwd()  # Ensure working directory is set
+        )
         if result.returncode == 0:
+            # Write aligned sequences to output file
             with open(output_fasta, 'w') as f:
                 f.write(result.stdout)
             logging.info(f"MAFFT alignment completed: {output_fasta}")
+            # Verify output file
             if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
                 return True, output_fasta
             else:
             error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
             logging.error(f"MAFFT failed: {error_msg}")
             return False, f"MAFFT error: {error_msg}"
     except subprocess.TimeoutExpired:
         logging.error("MAFFT timeout")
         return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
     except FileNotFoundError:
         return False, f"MAFFT executable not found: {mafft_cmd}"
     except Exception as e:
 def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
     """Run IQ-TREE with enhanced options and error handling"""
     try:
+        # Enhanced IQ-TREE command
+        cmd = [
+            iqtree_cmd,
+            '-s', aligned_fasta,
+            '-m', 'MFP',  # ModelFinder Plus for automatic model selection
+            '-bb', '1000',  # Bootstrap replicates
+            '-alrt', '1000',  # SH-aLRT test
+            '-nt', 'AUTO',  # Auto detect threads
+            '--prefix', output_prefix,
+            '-redo',  # Overwrite existing files
+            '--quiet'  # Reduce verbosity
+        ]
         logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
+        # Run IQ-TREE with enhanced error handling
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=1200,  # 20 minute timeout for larger datasets
+            cwd=os.getcwd()
+        )
         if result.returncode == 0:
             tree_file = f"{output_prefix}.treefile"
             if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
             error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
             logging.error(f"IQ-TREE failed: {error_msg}")
             return False, f"IQ-TREE error: {error_msg}"
     except subprocess.TimeoutExpired:
         logging.error("IQ-TREE timeout")
         return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
     except FileNotFoundError:
         return False, f"IQ-TREE executable not found: {iqtree_cmd}"
     except Exception as e:
 def create_simple_neighbor_joining_tree(sequences_dict):
     """Write a placeholder star tree for the given sequences.

     Despite the name, no distances are computed: every sequence becomes a
     leaf with a fixed branch length of 0.1 under a single root.  It is a
     fallback for when the ML tools are not available.

     Args:
         sequences_dict: Mapping of sequence name to sequence.  Only the names
             are used.

     Returns:
         tuple: ``(tree_file, message)``.  ``tree_file`` is the path of the
         written Newick file (``simple_tree.nwk`` in the working directory),
         or ``None`` on failure, in which case ``message`` explains why.
     """
     import re  # function-local, like the import this block carried before

     try:
         seq_names = list(sequences_dict.keys())
         if len(seq_names) < 2:
             return None, "Need at least 2 sequences for tree construction"
         # Newick gives structural meaning to whitespace, brackets, quotes,
         # commas, colons and semicolons, so they cannot appear raw in a label.
         # All sequences are included; the earlier version silently kept only
         # the first five.
         leaves = ",".join(
             "{}:0.1".format(re.sub(r"[\s()\[\],:;']", "_", str(name)))
             for name in seq_names
         )
         tree_str = f"({leaves});"
         tree_file = "simple_tree.nwk"
         with open(tree_file, 'w') as f:
             f.write(tree_str)
         return tree_file, "Simple distance-based tree created"
     except Exception as e:
         # Callers expect a (None, message) tuple rather than an exception.
         return None, f"Simple tree creation failed: {str(e)}"
 def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
     """Create a multi-FASTA file with query sequence and reference sequences"""
     try:
+        # Create temporary FASTA file
         temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
+        # Add query sequence
         temp_fasta.write(f">{query_id}\n{query_sequence}\n")
+        # Add reference sequences from existing aligned FASTA if available
         ref_fasta_path = "f_gene_sequences_aligned.fasta"
         if os.path.exists(ref_fasta_path):
             with open(ref_fasta_path, 'r') as ref_file:
                 temp_fasta.write(ref_file.read())
             logging.info(f"Added reference sequences from {ref_fasta_path}")
         else:
+            # If no reference file, try to create from CSV data
             if analyzer and hasattr(analyzer, 'data'):
                 count = 0
                 for idx, row in analyzer.data.iterrows():
                         sequence = str(row['sequence']).upper()
                         temp_fasta.write(f">{seq_id}\n{sequence}\n")
                         count += 1
+                        if count >= 20:  # Limit to prevent too large datasets
                             break
                 logging.info(f"Added {count} reference sequences from CSV")
         temp_fasta.close()
         return temp_fasta.name
     except Exception as e:
         logging.error(f"Failed to create multi-FASTA: {e}")
         return None
 def _read_fasta_records(fasta_path):
     """Parse a FASTA file into a ``{header: sequence}`` dict.

     Records with an empty header or an empty sequence are skipped, and text
     before the first header is ignored.
     """
     records = {}
     current_name = ""
     current_seq = ""
     with open(fasta_path, 'r') as handle:
         for raw_line in handle:
             line = raw_line.strip()
             if line.startswith('>'):
                 if current_name and current_seq:
                     records[current_name] = current_seq
                 current_name = line[1:]
                 current_seq = ""
             else:
                 current_seq += line
     if current_name and current_seq:
         records[current_name] = current_seq
     return records


 def build_maximum_likelihood_tree(f_gene_sequence):
     """Build a maximum likelihood tree for the query, with fallbacks.

     Steps: detect MAFFT / IQ-TREE, write the query plus reference sequences
     to a temporary multi-FASTA, align with MAFFT, then infer the tree with
     IQ-TREE.  When only IQ-TREE is missing, a simple placeholder tree is
     written instead; when MAFFT is missing nothing is built.

     Args:
         f_gene_sequence: Query nucleotide sequence.

     Returns:
         tuple: ``(success, message, aligned_fasta, tree_file)``.  The two
         paths are ``None`` when the corresponding file was not produced.
         Exceptions are caught and reported through ``message``.
     """
     try:
         mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()

         status_msg = "🔍 Checking dependencies...\n"
         if mafft_available:
             status_msg += f"✅ MAFFT found: {mafft_cmd}\n"
         else:
             status_msg += "❌ MAFFT not found\n"
         if iqtree_available:
             status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n"
         else:
             status_msg += "❌ IQ-TREE not found\n"

         # Neither tool: show the full installation guide.
         if not mafft_available and not iqtree_available:
             guide = install_dependencies_guide()
             return False, f"{status_msg}\n{guide}", None, None
         if not mafft_available:
             return False, f"{status_msg}\n❌ MAFFT is required for sequence alignment. Please install MAFFT first.", None, None

         if not iqtree_available:
             # Fallback: placeholder tree built from the unaligned sequences.
             status_msg += "\n⚠️  IQ-TREE not available. Attempting simple tree construction...\n"
             multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
             if not multi_fasta:
                 return False, f"{status_msg}❌ Failed to create input sequences", None, None
             try:
                 sequences = _read_fasta_records(multi_fasta)
                 simple_tree, simple_msg = create_simple_neighbor_joining_tree(sequences)
             finally:
                 # Remove the temp file even when parsing or writing fails.
                 os.unlink(multi_fasta)
             if simple_tree:
                 return True, f"{status_msg}✅ {simple_msg}", None, simple_tree
             return False, f"{status_msg}❌ {simple_msg}", None, None

         # Both tools available - full ML analysis.
         output_dir = "ml_tree_output"
         os.makedirs(output_dir, exist_ok=True)

         # Step 1: multi-FASTA with query and reference sequences.
         logging.info("Creating multi-FASTA file...")
         multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
         if not multi_fasta:
             return False, f"{status_msg}❌ Failed to create input FASTA", None, None

         # Step 2: MAFFT alignment.
         logging.info("Running MAFFT alignment...")
         aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
         try:
             mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
         finally:
             # The temp input is no longer needed whether MAFFT worked or not.
             os.unlink(multi_fasta)
         if not mafft_success:
             return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None

         # Step 3: IQ-TREE analysis.
         logging.info("Running IQ-TREE analysis...")
         tree_prefix = os.path.join(output_dir, "ml_tree")
         iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
         if not iqtree_success:
             return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None

         # Step 4: output files.
         tree_file = iqtree_result
         log_file = f"{tree_prefix}.log"

         # Copy to standard names for compatibility.
         # NOTE(review): "f_gene_sequences_aligned.fasta" is also the file
         # create_multi_fasta_with_query reads its references from, so each
         # successful run appears to add the query to the next run's reference
         # set - confirm this is intended.
         standard_aligned = "f_gene_sequences_aligned.fasta"
         standard_tree = "f_gene_sequences.phy.treefile"
         if os.path.exists(aligned_fasta):
             shutil.copy2(aligned_fasta, standard_aligned)
         if os.path.exists(tree_file):
             shutil.copy2(tree_file, standard_tree)

         success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n"
         success_msg += f"- Alignment: {os.path.basename(aligned_fasta)}\n"
         success_msg += f"- Tree: {os.path.basename(tree_file)}\n"

         # Best effort: report the model IQ-TREE selected, if the log has it.
         if os.path.exists(log_file):
             try:
                 with open(log_file, 'r') as f:
                     log_content = f.read()
                 model_line = next(
                     (line for line in log_content.split('\n') if "Best-fit model:" in line),
                     None,
                 )
                 if model_line is not None:
                     success_msg += f"- {model_line.strip()}\n"
             except Exception as e:
                 logging.warning(f"Could not read log file: {e}")

         logging.info("Maximum likelihood tree construction completed")
         return True, success_msg, aligned_fasta, tree_file
     except Exception as e:
         logging.error(f"ML tree construction failed: {e}")
         return False, f"ML tree construction failed: {str(e)}", None, None
+# --- Tree Analysis Function (Based on old Gradio API) ---
 def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
     """Build the interactive similarity tree for a query sequence.

     Uses the module-level ``analyzer`` to find sequences similar to
     ``sequence``, builds the tree structure and figure, saves the rendered
     HTML under ``output/`` and returns it.

     Args:
         sequence: Query nucleotide sequence.
         matching_percentage: Similarity threshold; must be within 1-99.

     Returns:
         The tree as a full HTML document on success.  Every failure,
         including "no similar sequences" and unexpected exceptions, returns a
         message starting with ``"Error:"``, which is the prefix the pipeline
         checks to tell failures from HTML.
     """
     # Imported here because the handler below needs it and no module-level
     # import of traceback is visible.
     import traceback

     try:
         if not analyzer:
             return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
         if not sequence:
             return "Error: Please provide a sequence."
         if not (1 <= matching_percentage <= 99):
             return "Error: Matching percentage must be between 1 and 99."

         # Find query sequence
         if not analyzer.find_query_sequence(sequence):
             return "Error: Invalid query sequence or sequence not found in dataset."

         # Set matching percentage
         analyzer.matching_percentage = matching_percentage

         # Find similar sequences
         matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
         if not matched_ids:
             # Prefixed with "Error:" so the caller does not treat this message
             # as tree HTML.
             return f"Error: No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
         logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")

         # Build tree structure
         tree_structure = analyzer.build_tree_structure(matched_ids)
         if not tree_structure:
             return "Error: Failed to build tree structure."

         # Create interactive tree
         fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
         if not fig:
             return "Error: Failed to create tree visualization."

         # Generate HTML content
         html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')

         # Save to output folder under a filesystem-safe name
         output_dir = "output"
         os.makedirs(output_dir, exist_ok=True)
         safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
         html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
         with open(html_filename, "w", encoding='utf-8') as f:
             f.write(html_content)
         logging.info(f"Tree HTML saved to {html_filename}")
         return html_content
     except Exception as e:
         error_msg = f"Error: Tree analysis error: {str(e)}"
         logging.error(error_msg)
         logging.error(f"Full traceback: {traceback.format_exc()}")
         return error_msg
+# --- Keras Prediction ---
 def predict_with_keras(sequence):
     """Score a sequence with the k-mer Keras model.

     The sequence is split into overlapping 6-mers, each mapped to its index
     in the module-level ``kmer_to_index`` (0 for unknown k-mers), and the
     index list is passed to ``keras_model`` as a single-item batch.

     Args:
         sequence: Nucleotide sequence string.

     Returns:
         A string: the model scores rounded to 3 decimals and separated by
         ``", "``, or an explanatory message when the model is unavailable,
         the sequence is shorter than 6 nucleotides, or prediction fails.
     """
     try:
         if not keras_model or not kmer_to_index:
             return f"Keras model not available. Input sequence: {sequence[:100]}..."
         if len(sequence) < 6:
             return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."

         # Generate k-mers
         kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
         indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]

         # Prepare input
         input_arr = np.array([indices])
         prediction = keras_model.predict(input_arr, verbose=0)[0]

         # Separate the scores: joining them with "" produced a run of digits
         # such as "0.9870.013" that cannot be read back as individual values.
         # np.ravel flattens the output whatever its dimensionality.
         return ", ".join(str(round(float(score), 3)) for score in np.ravel(prediction))
     except Exception as e:
         logging.error(f"Keras prediction failed: {e}")
         return f"Keras prediction failed: {str(e)}"
+# --- FASTA Reader ---
 def read_fasta_file(file_obj):
     try:
         if file_obj is None:
             return ""
+        # Handle file object
         if hasattr(file_obj, 'name'):
             with open(file_obj.name, "r") as f:
                 content = f.read()
         else:
             content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
         lines = content.strip().split("\n")
         seq_lines = [line.strip() for line in lines if not line.startswith(">")]
         return ''.join(seq_lines)
         logging.error(f"Failed to read FASTA file: {e}")
         return ""
+# --- Full Pipeline ---
 def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
     """Run the analysis pipeline on the sequence held in an uploaded FASTA file.

     Reads the file with ``read_fasta_file`` and hands the sequence to
     ``run_pipeline``.  When nothing can be read, or anything raises, a
     9-item placeholder tuple carrying the failure message is returned.
     """
     try:
         sequence_text = read_fasta_file(fasta_file_obj)
         if sequence_text:
             return run_pipeline(sequence_text, similarity_score, build_ml_tree)
         return ("Failed to read FASTA file", "", "", "", "", None, None, None, "No input sequence")
     except Exception as exc:
         failure = f"Pipeline error: {str(exc)}"
         logging.error(failure)
         return (failure, "", "", "", "", None, None, None, failure)
 def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
     try:
+        # Clean input
         dna_input = dna_input.upper().strip()
         if not dna_input:
+            return "Empty input", "", "", "", "", None, None, None, "No input provided"
+        # Sanitize DNA sequence
         if not re.match('^[ACTGN]+$', dna_input):
             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
             logging.info("DNA sequence sanitized")
+        # Step 1: Boundary Prediction - Extract F gene sequence
+        processed_sequence = dna_input  # This will be the sequence used for downstream analysis
+        boundary_output = ""
+        if boundary_model:
+            try:
+                predictions, probs, confidence = boundary_model.predict(dna_input)
+                regions = boundary_model.extract_gene_regions(predictions, dna_input)
+                if regions:
+                    processed_sequence = regions[0]["sequence"]  # Use the extracted gene region
+                    boundary_output = processed_sequence  # Output the actual F gene sequence
+                    logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
+                else:
+                    boundary_output = f"No F gene regions found in input sequence"
+                    processed_sequence = dna_input
+                    logging.warning("No gene regions found, using full sequence")
+                logging.info("Boundary model prediction completed")
+            except Exception as e:
+                logging.error(f"Boundary model failed: {e}")
+                boundary_output = f"Boundary model error: {str(e)}"
+                processed_sequence = dna_input  # Fall back to original sequence
+        else:
+            boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
+            processed_sequence = dna_input
+        # Step 2: Keras Prediction (F gene validation)
         keras_output = ""
         if processed_sequence and len(processed_sequence) >= 6:
             keras_prediction = predict_with_keras(processed_sequence)
+            # Interpret keras prediction as F gene validation
+            if keras_prediction and not keras_prediction.startswith(("Keras", "Sequence too short")):
+                # You might want to add logic here to interpret the prediction scores
+                # For now, just show the prediction
+                keras_output = f"F gene validation scores: {keras_prediction[:100]}..."
+            else:
+                keras_output = keras_prediction
+        else:
+            keras_output = "Skipped: sequence too short for F gene validation"
+        # Step 3: Maximum Likelihood Tree (MAFFT + IQ-TREE)
         aligned_file = None
         phy_file = None
         ml_tree_output = ""
         if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
             try:
                 logging.info("Starting maximum likelihood tree construction...")
                 ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
                 if ml_success:
                     ml_tree_output = ml_message
                     aligned_file = ml_aligned
                     phy_file = ml_tree
                 else:
+                    ml_tree_output = ml_message  # This now includes detailed error information
             except Exception as e:
                 ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
                 logging.error(f"ML Tree failed: {e}")
         else:
             ml_tree_output = "ML tree construction skipped (not requested)"
+        # Step 4: ML Simplified Tree (using the existing approach)
         html_file = None
         tree_html_content = "No tree generated"
         simplified_ml_output = ""
         if analyzer and processed_sequence and len(processed_sequence) >= 10:
             try:
                 logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
+                # Use the existing tree analysis function with user-specified similarity
                 tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
                 if tree_result and not tree_result.startswith("Error:"):
+                    # Success - we have HTML content
                     tree_html_content = tree_result
                     simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
+                    # Check if HTML file was created
                     output_dir = "output"
                     if os.path.exists(output_dir):
                         html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
                         if html_files:
+                            html_file = os.path.join(output_dir, html_files[-1])  # Get the latest
                             simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
+                    # Count sequences analyzed
                     if analyzer.find_query_sequence(processed_sequence):
                         matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
                         simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
                 else:
                     simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
                     tree_html_content = f"<p>Error: {tree_result}</p>"
             except Exception as e:
                 logging.error(f"Simplified ML tree analysis failed: {e}")
                 simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
         # Return all results
         return (
+            boundary_output,           # F gene extraction result
+            keras_output,             # F gene validation result
+            ml_tree_output,           # ML tree construction status
+            simplified_ml_output,     # Simplified tree analysis status
+            tree_html_content,        # HTML content for tree display
+            aligned_file,             # Path to aligned FASTA file
+            phy_file,                 # Path to phylogenetic tree file
+            html_file,                # Path to HTML tree file
+            f"Pipeline completed. F gene length: {len(processed_sequence)} bp"  # Summary
         )
     except Exception as e:
         error_msg = f"Pipeline execution failed: {str(e)}"
         logging.error(error_msg)
         import traceback
         logging.error(f"Full traceback: {traceback.format_exc()}")
         return (
+            error_msg, "", "", "", f"<p>Error: {error_msg}</p>",
             None, None, None, error_msg
         )
 # --- Gradio Interface ---
 def create_interface():
     """Build the Gradio Blocks UI for the F gene analysis pipeline.

     The layout consists of three tabs (analysis inputs, results, help) and
     an example-loading button.  Clicking "Run Analysis" sends either the
     uploaded FASTA file or the pasted sequence through the pipeline and
     writes the values it returns to the eight result widgets plus the
     status box.

     Returns:
         The assembled ``gr.Blocks`` object.  It is not launched here; the
         caller decides how to serve it.
     """
     # Custom CSS for better styling
     custom_css = """
     .gradio-container {
         max-width: 1200px !important;
     }
     .tab-nav button {
         font-size: 16px !important;
     }
     .output-html {
         height: 600px !important;
         overflow: auto;
     }
     """
     with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
         # Blank lines separate the lists from the bold pseudo-headings;
         # without them Markdown folds the heading into the last bullet.
         gr.Markdown("""
         # 🧬 F Gene Analysis Pipeline

         This tool provides comprehensive analysis of F genes including:

         - **Gene Boundary Detection**: Extract F gene sequences from larger genomic sequences
         - **Gene Validation**: Validate extracted sequences using machine learning
         - **Phylogenetic Analysis**: Build maximum likelihood trees and simplified phylogenetic trees

         **Instructions:**

         1. Enter your sequence directly or upload a FASTA file
         2. Adjust similarity threshold for phylogenetic analysis (1-99%)
         3. Choose whether to build maximum likelihood trees (requires MAFFT & IQ-TREE)
         4. Click "Run Analysis" to start the pipeline
         """)

         with gr.Tab("🔬 Analysis Pipeline"):
             with gr.Row():
                 with gr.Column(scale=2):
                     # Input section
                     gr.Markdown("### Input Sequence")
                     dna_input = gr.Textbox(
                         label="DNA Sequence",
                         placeholder="Enter your DNA sequence here (ATCG format)...",
                         lines=5,
                         max_lines=10
                     )
                     fasta_file = gr.File(
                         label="Or Upload FASTA File",
                         file_types=[".fasta", ".fa", ".fas", ".txt"]
                     )
                     with gr.Row():
                         similarity_score = gr.Slider(
                             minimum=1,
                             maximum=99,
                             value=95.0,
                             step=1.0,
                             label="Similarity Threshold (%)",
                             info="Minimum similarity for phylogenetic analysis"
                         )
                         build_ml_tree = gr.Checkbox(
                             label="Build ML Tree",
                             value=False,
                             info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)"
                         )
                     # Action buttons
                     with gr.Row():
                         run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
                         clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                 with gr.Column(scale=1):
                     # Status and info
                     gr.Markdown("### Analysis Status")
                     status_display = gr.Textbox(
                         label="Status",
                         value="Ready to analyze",
                         interactive=False,
                         lines=3
                     )
                     # Model status: one entry per module-level component,
                     # ticked when the corresponding global is truthy.
                     gr.Markdown("### Available Models")
                     components = (
                         (boundary_model, "Boundary Detection Model"),
                         (keras_model, "Gene Validation Model"),
                         (analyzer, "Tree Analysis Module"),
                     )
                     model_status = [
                         f"{'✅' if loaded else '❌'} {label}"
                         for loaded, label in components
                     ]
                     # gr.Markdown ignores single newlines by default, so the
                     # entries are separated by blank lines to keep each one
                     # on its own line.
                     gr.Markdown("\n\n".join(model_status))

         with gr.Tab("📊 Results"):
             with gr.Row():
                 with gr.Column():
                     # Text outputs
                     boundary_output = gr.Textbox(
                         label="🎯 F Gene Extraction",
                         lines=5,
                         interactive=False
                     )
                     keras_output = gr.Textbox(
                         label="🔍 Gene Validation",
                         lines=3,
                         interactive=False
                     )
                 with gr.Column():
                     ml_tree_output = gr.Textbox(
                         label="🌳 Maximum Likelihood Tree",
                         lines=5,
                         interactive=False
                     )
                     simplified_ml_output = gr.Textbox(
                         label="📈 Simplified Phylogenetic Analysis",
                         lines=3,
                         interactive=False
                     )
             # Tree visualization
             gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
             tree_html = gr.HTML(
                 label="Interactive Tree",
                 value="<p>No tree generated yet. Run analysis to see results.</p>"
             )
             # File downloads
             gr.Markdown("### 📁 Download Results")
             with gr.Row():
                 aligned_file = gr.File(
                     label="Aligned Sequences (FASTA)",
                     interactive=False
                 )
                 phy_file = gr.File(
                     label="Phylogenetic Tree File",
                     interactive=False
                 )
                 html_file = gr.File(
                     label="Interactive Tree (HTML)",
                     interactive=False
                 )

         with gr.Tab("ℹ️ Help & Info"):
             gr.Markdown("""
             ## About This Tool

             ### F Gene Analysis Pipeline

             This comprehensive pipeline analyzes F genes through multiple computational approaches:

             #### 🎯 Gene Boundary Detection

             - Uses deep learning to identify and extract F gene sequences from larger genomic sequences
             - Provides confidence scores for detected boundaries
             - Automatically trims sequences to focus on the F gene region

             #### 🔍 Gene Validation

             - Employs k-mer based machine learning models to validate extracted sequences
             - Provides probability scores indicating likelihood of being a genuine F gene
             - Uses 6-mer frequency patterns for classification

             #### 🌳 Phylogenetic Analysis

             **Maximum Likelihood Trees:**

             - Requires MAFFT (sequence alignment) and IQ-TREE (phylogenetic reconstruction)
             - Performs model selection and bootstrap analysis
             - Generates publication-quality phylogenetic trees
             - Provides detailed evolutionary analysis

             **Simplified Trees:**

             - Uses built-in algorithms for quick phylogenetic analysis
             - Interactive visualization with similarity-based clustering
             - Faster alternative when external tools are not available

             ### Input Requirements

             - **DNA Sequences**: ATCG format, minimum 50 bp for meaningful analysis
             - **FASTA Files**: Standard FASTA format with single or multiple sequences
             - **Similarity Threshold**: 1-99% for controlling phylogenetic analysis sensitivity

             ### Dependencies

             **Required for ML Trees:**

             ```bash
             # Ubuntu/Debian
             sudo apt-get install mafft iqtree
             # macOS
             brew install mafft iqtree
             # Conda
             conda install -c bioconda mafft iqtree
             ```

             ### Output Files

             - **Aligned FASTA**: Multiple sequence alignment in FASTA format
             - **Tree File**: Newick format phylogenetic tree
             - **HTML Tree**: Interactive visualization for web browsers

             ### Troubleshooting

             **Common Issues:**

             - *"No similar sequences found"*: Lower the similarity threshold
             - *"Sequence too short"*: Provide sequences longer than 50 bp
             - *"MAFFT/IQ-TREE not found"*: Install required dependencies
             - *"Model not available"*: Check model files are properly downloaded

             **Performance Tips:**

             - Use sequences between 100-2000 bp for optimal performance
             - Limit to <50 sequences for faster tree construction
             - Lower similarity thresholds find more distant relatives
             - Higher thresholds focus on closely related sequences

             ### Citation

             If you use this tool in your research, please cite the appropriate methods and tools used.
             """)

         # Event handlers
         def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
             """Run the pipeline on the uploaded file if any, else on the text."""
             # Priority: file upload over text input
             if file_obj is not None:
                 return run_pipeline_from_file(file_obj, sim_score, build_tree)
             # No upload: analyse the sequence typed into the textbox.
             return run_pipeline(dna_seq, sim_score, build_tree)

         def clear_inputs():
             """Return default values for the four inputs and the status box."""
             return "", None, 95.0, False, "Ready to analyze"

         # Connect events.  The output order must match the tuple returned by
         # the pipeline functions.
         run_btn.click(
             fn=run_analysis_combined,
             inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
             outputs=[
                 boundary_output, keras_output, ml_tree_output,
                 simplified_ml_output, tree_html, aligned_file,
                 phy_file, html_file, status_display
             ]
         )
         clear_btn.click(
             fn=clear_inputs,
             outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
         )

         # Example data loading
         gr.Markdown("### 🧪 Example Data")
         example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")

         def load_example():
             """Return a sample sequence and the matching status message."""
             example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
             return example_seq, "Example F gene sequence loaded"

         example_btn.click(
             fn=load_example,
             outputs=[dna_input, status_display]
         )
     return iface
 # --- Main Execution ---
 if __name__ == "__main__":
     # Server settings are gathered in one mapping so they can be reviewed
     # at a glance before the app is started.
     launch_options = {
         "server_name": "0.0.0.0",  # Allow external connections
         "server_port": 7860,       # Default Gradio port
         "share": False,            # Set to True for public sharing
         "debug": True,             # Enable debug mode
         "show_error": True,        # Show detailed errors
         "max_threads": 4,          # Limit concurrent threads
         "auth": None,              # Add authentication if needed: ("username", "password")
         "ssl_verify": False,       # For development environments
         "quiet": False,            # Show startup messages
     }
     # Build the UI, then start serving it with the settings above.
     interface = create_interface()
     interface.launch(**launch_options)