Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Files Community

re-type committed on Jun 9, 2025

Commit

d6bbc7a

verified ·

1 Parent(s): 22db390

Update app.py

Browse files

Files changed (1) hide show

app.py +362 -711

app.py CHANGED Viewed

@@ -1,1139 +1,790 @@
 import ml_simplified_tree
 import tempfile
 import shutil
-import sys
 from pathlib import Path
 # --- Global Variables ---
 MAFFT_PATH = "mafft/mafftdir/bin/mafft"  # Update this path as needed
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # --- Paths ---
-from huggingface_hub import hf_hub_download
-# Model repository and file paths
 model_repo = "GGproject10/best_boundary_aware_model"
 csv_path = "f cleaned.csv"
 # Get HF token from environment (if available)
 hf_token = os.getenv("HF_TOKEN")
 boundary_model = None
 keras_model = None
 kmer_to_index = None
 # Try to load boundary model from Hugging Face Hub
 try:
-    boundary_path = hf_hub_download(
-        repo_id=model_repo,
-        filename="best_boundary_aware_model.pth",
-        token=hf_token
-    )
     if os.path.exists(boundary_path):
         boundary_model = GenePredictor(boundary_path)
         logging.info("Boundary model loaded successfully from Hugging Face Hub.")
 # Try to load Keras model from Hugging Face Hub
 try:
-    keras_path = hf_hub_download(
-        repo_id=model_repo,
-        filename="best_model.keras",
-        token=hf_token
-    )
-    kmer_path = hf_hub_download(
-        repo_id=model_repo,
-        filename="kmer_to_index.pkl",
-        token=hf_token
-    )
     if os.path.exists(keras_path) and os.path.exists(kmer_path):
         keras_model = load_model(keras_path)
         with open(kmer_path, "rb") as f:
 except Exception as e:
     logging.error(f"Failed to load Keras model from HF Hub: {e}")
 # --- Initialize Tree Analyzer ---
 analyzer = None
 try:
     if os.path.exists(csv_path):
         if analyzer.load_data(csv_path):
             logging.info("Tree analyzer initialized successfully")
-            # Try to train AI model (optional)
             try:
                 if not analyzer.train_ai_model():
                     logging.warning("AI model training failed; proceeding with basic analysis.")
     analyzer = None
 # --- Enhanced Tool Detection ---
-def check_tool_availability():
-    """Enhanced check for MAFFT and IQ-TREE availability with multiple fallback options"""
-    # Check MAFFT
     mafft_available = False
     mafft_cmd = None
-    # Try multiple MAFFT locations
     mafft_candidates = [
         MAFFT_PATH,
         'mafft',
         '/usr/bin/mafft',
         '/usr/local/bin/mafft',
-        'mafft.bat',  # Windows
     ]
     for candidate in mafft_candidates:
-        if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
             mafft_available = True
             mafft_cmd = candidate
-            logging.info(f"Found MAFFT at: {candidate}")
             break
-    # Check IQ-TREE
     iqtree_available = False
     iqtree_cmd = None
-    # Try multiple IQ-TREE locations and names
     iqtree_candidates = [
         IQTREE_PATH,
-        'iqtree2',
-        '/usr/local/bin/iqtree2',
         '/usr/bin/iqtree',
         '/usr/local/bin/iqtree',
-        'iqtree2.exe',  # Windows
-        'iqtree.exe',   # Windows
     ]
     for candidate in iqtree_candidates:
-        if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
             iqtree_available = True
             iqtree_cmd = candidate
-            logging.info(f"Found IQ-TREE at: {candidate}")
             break
-    return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
-def install_dependencies_guide():
-    """Provide installation guidance for missing dependencies"""
-    guide = """
-🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
-For MAFFT:
-- Ubuntu/Debian: sudo apt-get install mafft
-- CentOS/RHEL: sudo yum install mafft
-- macOS: brew install mafft
-- Windows: Download from https://mafft.cbrc.jp/alignment/software/
-For IQ-TREE:
-- Ubuntu/Debian: sudo apt-get install iqtree
-- CentOS/RHEL: sudo yum install iqtree
-- macOS: brew install iqtree
-- Windows: Download from http://www.iqtree.org/
-Alternative: Use conda/mamba:
-- conda install -c bioconda mafft iqtree
-Docker option:
-- docker run -it --rm -v $(pwd):/data quay.io/biocontainers/mafft:7.490--h779adbc_0
-- docker run -it --rm -v $(pwd):/data quay.io/biocontainers/iqtree:2.1.4_beta--hdcc8f71_0
 """
-    return guide
-def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
-    """Run MAFFT alignment with enhanced error handling"""
     try:
-        # MAFFT command with more robust options
-        cmd = [
-            mafft_cmd,
-            '--auto',  # Automatic strategy selection
-            '--quiet',  # Reduce output verbosity
-            input_fasta
-        ]
         logging.info(f"Running MAFFT: {' '.join(cmd)}")
-        # Run MAFFT with enhanced error handling
-        result = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=600,  # Increased timeout to 10 minutes
-            cwd=os.getcwd()  # Ensure working directory is set
-        )
         if result.returncode == 0:
-            # Write aligned sequences to output file
             with open(output_fasta, 'w') as f:
                 f.write(result.stdout)
             logging.info(f"MAFFT alignment completed: {output_fasta}")
-            # Verify output file
             if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
                 return True, output_fasta
             else:
             error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
             logging.error(f"MAFFT failed: {error_msg}")
             return False, f"MAFFT error: {error_msg}"
     except subprocess.TimeoutExpired:
         logging.error("MAFFT timeout")
         return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
     except FileNotFoundError:
         return False, f"MAFFT executable not found: {mafft_cmd}"
     except Exception as e:
 def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
     """Run IQ-TREE with enhanced options and error handling"""
     try:
-        # Enhanced IQ-TREE command
-        cmd = [
-            iqtree_cmd,
-            '-s', aligned_fasta,
-            '-m', 'MFP',  # ModelFinder Plus for automatic model selection
-            '-bb', '1000',  # Bootstrap replicates
-            '-alrt', '1000',  # SH-aLRT test
-            '-nt', 'AUTO',  # Auto detect threads
-            '--prefix', output_prefix,
-            '-redo',  # Overwrite existing files
-            '--quiet'  # Reduce verbosity
-        ]
         logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
-        # Run IQ-TREE with enhanced error handling
-        result = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=1200,  # 20 minute timeout for larger datasets
-            cwd=os.getcwd()
-        )
         if result.returncode == 0:
             tree_file = f"{output_prefix}.treefile"
             if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
             error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
             logging.error(f"IQ-TREE failed: {error_msg}")
             return False, f"IQ-TREE error: {error_msg}"
     except subprocess.TimeoutExpired:
         logging.error("IQ-TREE timeout")
         return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
     except FileNotFoundError:
         return False, f"IQ-TREE executable not found: {iqtree_cmd}"
     except Exception as e:
 def create_simple_neighbor_joining_tree(sequences_dict):
     """Write a placeholder star tree for the given sequences in Newick format.

     Fallback for when the ML tools are not available. Despite the name, no
     distances are computed and no neighbor joining is performed: every
     sequence becomes a direct child of the root with a fixed branch length
     of 0.1.

     Args:
         sequences_dict: Mapping of sequence name to sequence. Only the keys
             are used; they are written verbatim as leaf labels.

     Returns:
         ``(tree_file, message)`` on success, where ``tree_file`` is the path
         of the Newick file that was written, or ``(None, message)`` when
         fewer than two sequences are given or writing fails.
     """
     try:
         seq_names = list(sequences_dict)
         if len(seq_names) < 2:
             return None, "Need at least 2 sequences for tree construction"

         # Every sequence gets a leaf. The earlier version kept only the first
         # five names, silently dropping the rest while still reporting success.
         # NOTE(review): labels are not quoted or escaped; names containing
         # Newick delimiters (spaces, commas, parentheses, colons) should be
         # sanitized by the caller — confirm what the tree consumer accepts.
         leaves = ",".join(f"{name}:0.1" for name in seq_names)
         tree_str = f"({leaves});"

         tree_file = "simple_tree.nwk"
         with open(tree_file, 'w') as f:
             f.write(tree_str)
         return tree_file, "Simple distance-based tree created"
     except Exception as e:
         # Callers expect a (None, message) tuple rather than an exception.
         return None, f"Simple tree creation failed: {str(e)}"
 def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
     """Create a multi-FASTA file with query sequence and reference sequences"""
     try:
-        # Create temporary FASTA file
         temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
-        # Add query sequence
         temp_fasta.write(f">{query_id}\n{query_sequence}\n")
-        # Add reference sequences from existing aligned FASTA if available
         ref_fasta_path = "f_gene_sequences_aligned.fasta"
         if os.path.exists(ref_fasta_path):
             with open(ref_fasta_path, 'r') as ref_file:
                 temp_fasta.write(ref_file.read())
             logging.info(f"Added reference sequences from {ref_fasta_path}")
         else:
-            # If no reference file, try to create from CSV data
             if analyzer and hasattr(analyzer, 'data'):
                 count = 0
                 for idx, row in analyzer.data.iterrows():
                         sequence = str(row['sequence']).upper()
                         temp_fasta.write(f">{seq_id}\n{sequence}\n")
                         count += 1
-                        if count >= 20:  # Limit to prevent too large datasets
                             break
                 logging.info(f"Added {count} reference sequences from CSV")
         temp_fasta.close()
         return temp_fasta.name
     except Exception as e:
         logging.error(f"Failed to create multi-FASTA: {e}")
         return None
 def build_maximum_likelihood_tree(f_gene_sequence):
     """Build maximum likelihood phylogenetic tree with comprehensive fallback options"""
     try:
-        # Check tool availability with enhanced detection
-        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
-        # Prepare status message
         status_msg = "🔍 Checking dependencies...\n"
-        if not mafft_available:
-            status_msg += "❌ MAFFT not found\n"
-        else:
-            status_msg += f"✅ MAFFT found: {mafft_cmd}\n"
-        if not iqtree_available:
-            status_msg += "❌ IQ-TREE not found\n"
-        else:
-            status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n"
-        # If neither tool is available, provide installation guide
-        if not mafft_available and not iqtree_available:
-            guide = install_dependencies_guide()
-            return False, f"{status_msg}\n{guide}", None, None
-        # If only one tool is missing, provide specific guidance
-        if not mafft_available:
-            return False, f"{status_msg}\n❌ MAFFT is required for sequence alignment. Please install MAFFT first.", None, None
-        if not iqtree_available:
-            status_msg += "\n⚠️  IQ-TREE not available. Attempting simple tree construction...\n"
-            # Try to create a simple tree as fallback
-            multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
-            if multi_fasta:
-                # Read sequences
-                sequences = {}
-                current_seq = ""
-                current_name = ""
-                with open(multi_fasta, 'r') as f:
-                    for line in f:
-                        line = line.strip()
-                        if line.startswith('>'):
-                            if current_name and current_seq:
-                                sequences[current_name] = current_seq
-                            current_name = line[1:]
-                            current_seq = ""
-                        else:
-                            current_seq += line
-                    if current_name and current_seq:
-                        sequences[current_name] = current_seq
-                simple_tree, simple_msg = create_simple_neighbor_joining_tree(sequences)
-                os.unlink(multi_fasta)
-                if simple_tree:
-                    return True, f"{status_msg}✅ {simple_msg}", None, simple_tree
-                else:
-                    return False, f"{status_msg}❌ {simple_msg}", None, None
-            else:
-                return False, f"{status_msg}❌ Failed to create input sequences", None, None
-        # Both tools available - proceed with full ML analysis
-        # Create output directory
         output_dir = "ml_tree_output"
         os.makedirs(output_dir, exist_ok=True)
-        # Step 1: Create multi-FASTA file with query and reference sequences
         logging.info("Creating multi-FASTA file...")
         multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
         if not multi_fasta:
             return False, f"{status_msg}❌ Failed to create input FASTA", None, None
-        # Step 2: Run MAFFT alignment
         logging.info("Running MAFFT alignment...")
         aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
-        mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
-        # Clean up temporary file
         os.unlink(multi_fasta)
         if not mafft_success:
             return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
-        # Step 3: Run IQ-TREE analysis
         logging.info("Running IQ-TREE analysis...")
         tree_prefix = os.path.join(output_dir, "ml_tree")
         iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
         if not iqtree_success:
             return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
-        # Step 4: Prepare output files
         tree_file = iqtree_result
         log_file = f"{tree_prefix}.log"
-        # Copy to standard names for compatibility
         standard_aligned = "f_gene_sequences_aligned.fasta"
         standard_tree = "f_gene_sequences.phy.treefile"
         if os.path.exists(aligned_fasta):
             shutil.copy2(aligned_fasta, standard_aligned)
         if os.path.exists(tree_file):
             shutil.copy2(tree_file, standard_tree)
-        success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n"
-        success_msg += f"- Alignment: {os.path.basename(aligned_fasta)}\n"
-        success_msg += f"- Tree: {os.path.basename(tree_file)}\n"
         if os.path.exists(log_file):
             try:
                 with open(log_file, 'r') as f:
                     log_content = f.read()
-                    # Extract model information
                     if "Best-fit model:" in log_content:
                         model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
                         if model_lines:
                             success_msg += f"- {model_lines[0].strip()}\n"
             except Exception as e:
                 logging.warning(f"Could not read log file: {e}")
         logging.info("Maximum likelihood tree construction completed")
         return True, success_msg, aligned_fasta, tree_file
     except Exception as e:
         logging.error(f"ML tree construction failed: {e}")
         return False, f"ML tree construction failed: {str(e)}", None, None
-# --- Tree Analysis Function (Based on old Gradio API) ---
 def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
     """Build the similarity tree for *sequence* and return it as an HTML page.

     Drives the module-level ``analyzer``: locates the query sequence, collects
     the sequences that meet ``matching_percentage``, builds the tree
     structure, renders the figure, and saves a copy of the HTML under
     ``output/``.

     Args:
         sequence: Query sequence passed to ``analyzer.find_query_sequence``.
         matching_percentage: Similarity threshold; must lie between 1 and 99.

     Returns:
         The full HTML document on success. On every failure, a message that
         starts with ``"Error:"``; callers rely on that prefix to tell a
         failure apart from HTML content.
     """
     try:
         if not analyzer:
             return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
         if not sequence:
             return "Error: Please provide a sequence."
         if not (1 <= matching_percentage <= 99):
             return "Error: Matching percentage must be between 1 and 99."

         if not analyzer.find_query_sequence(sequence):
             return "Error: Invalid query sequence or sequence not found in dataset."
         analyzer.matching_percentage = matching_percentage

         matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
         if not matched_ids:
             # Prefixed with "Error:" so callers do not mistake this message
             # for tree HTML.
             return (
                 f"Error: No similar sequences found at {matching_percentage}% similarity. "
                 "Try lowering the threshold."
             )
         logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")

         tree_structure = analyzer.build_tree_structure(matched_ids)
         if not tree_structure:
             return "Error: Failed to build tree structure."

         fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
         if not fig:
             return "Error: Failed to create tree visualization."
         html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')

         # Keep a copy on disk; the file name is derived from the first 20
         # characters of the query with non-alphanumerics replaced.
         output_dir = "output"
         os.makedirs(output_dir, exist_ok=True)
         safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
         html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
         with open(html_filename, "w", encoding='utf-8') as f:
             f.write(html_content)
         logging.info(f"Tree HTML saved to {html_filename}")
         return html_content
     except Exception as e:
         error_msg = f"Error: Tree analysis error: {e}"
         # logging.exception records the traceback itself, so this handler
         # does not depend on the traceback module being imported.
         logging.exception(error_msg)
         return error_msg
-# --- Keras Prediction ---
def predict_with_keras(sequence):
    """Score *sequence* with the Keras model and return the scores as text.

    The sequence is split into overlapping 6-mers, each mapped to an integer
    through ``kmer_to_index`` (unknown k-mers map to 0), and the resulting
    index array is passed to ``keras_model.predict``.

    Args:
        sequence: Nucleotide sequence to score.

    Returns:
        The first prediction row as comma-separated values rounded to three
        decimals, or an explanatory message when the model or k-mer table is
        not loaded, the sequence is shorter than 6, or prediction raises.
    """
    try:
        if not keras_model or not kmer_to_index:
            return f"Keras model not available. Input sequence: {sequence[:100]}..."
        if len(sequence) < 6:
            return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."

        kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
        indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]

        # A batch containing the single encoded sequence.
        input_arr = np.array([indices])
        prediction = keras_model.predict(input_arr, verbose=0)[0]

        # Separate the scores: joining on "" fused them into one unreadable
        # run of digits such as "0.9870.013".
        return ", ".join(str(round(p, 3)) for p in prediction)
    except Exception as e:
        logging.error(f"Keras prediction failed: {e}")
        return f"Keras prediction failed: {str(e)}"
-# --- FASTA Reader ---
 def read_fasta_file(file_obj):
     try:
         if file_obj is None:
             return ""
-        # Handle file object
         if hasattr(file_obj, 'name'):
             with open(file_obj.name, "r") as f:
                 content = f.read()
         else:
             content = file_obj.read().decode("utf-8") if hasattr(file_obj, "read") else str(file_obj)
         lines = content.strip().split("\n")
         seq_lines = [line.strip() for line in lines if not line.startswith(">")]
         return ''.join(seq_lines)
         logging.error(f"Failed to read FASTA file: {e}")
         return ""
-# --- Full Pipeline ---
 def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
     try:
         dna_input = read_fasta_file(fasta_file_obj)
         if not dna_input:
-            return "Failed to read FASTA file", "", "", "", "", None, None, None, "No input sequence"
-        return run_pipeline(dna_input, similarity_score, build_ml_tree)
     except Exception as e:
         error_msg = f"Pipeline error: {str(e)}"
         logging.error(error_msg)
-        return error_msg, "", "", "", "", None, None, None, error_msg
-def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
     try:
-        # Clean input
         dna_input = dna_input.upper().strip()
         if not dna_input:
-            return "Empty input", "", "", "", "", None, None, None, "No input provided"
-        # Sanitize DNA sequence
         if not re.match('^[ACTGN]+$', dna_input):
             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
             logging.info("DNA sequence sanitized")
-        # Step 1: Boundary Prediction - Extract F gene sequence
-        processed_sequence = dna_input  # This will be the sequence used for downstream analysis
-        boundary_output = ""
         if boundary_model:
             try:
                 predictions, probs, confidence = boundary_model.predict(dna_input)
                 regions = boundary_model.extract_gene_regions(predictions, dna_input)
                 if regions:
-                    processed_sequence = regions[0]["sequence"]  # Use the extracted gene region
-                    boundary_output = processed_sequence  # Output the actual F gene sequence
                     logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
                 else:
                     boundary_output = f"No F gene regions found in input sequence"
-                    processed_sequence = dna_input
                     logging.warning("No gene regions found, using full sequence")
-                logging.info("Boundary model prediction completed")
             except Exception as e:
                 logging.error(f"Boundary model failed: {e}")
                 boundary_output = f"Boundary model error: {str(e)}"
-                processed_sequence = dna_input  # Fall back to original sequence
         else:
             boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
-            processed_sequence = dna_input
-        # Step 2: Keras Prediction (F gene validation)
-        keras_output = ""
-        if processed_sequence and len(processed_sequence) >= 6:
-            keras_prediction = predict_with_keras(processed_sequence)
-            # Interpret keras prediction as F gene validation
-            if keras_prediction and not keras_prediction.startswith(("Keras", "Sequence too short")):
-                # You might want to add logic here to interpret the prediction scores
-                # For now, just show the prediction
-                keras_output = f"F gene validation scores: {keras_prediction[:100]}..."
-            else:
-                keras_output = keras_prediction
         else:
-            keras_output = "Skipped: sequence too short for F gene validation"
-        # Step 3: Maximum Likelihood Tree (MAFFT + IQ-TREE)
         aligned_file = None
         phy_file = None
         ml_tree_output = ""
         if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
             try:
                 logging.info("Starting maximum likelihood tree construction...")
                 ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
                 if ml_success:
                     ml_tree_output = ml_message
                     aligned_file = ml_aligned
                     phy_file = ml_tree
                 else:
-                    ml_tree_output = ml_message  # This now includes detailed error information
             except Exception as e:
                 ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
                 logging.error(f"ML Tree failed: {e}")
         else:
             ml_tree_output = "ML tree construction skipped (not requested)"
-        # Step 4: ML Simplified Tree (using the existing approach)
         html_file = None
         tree_html_content = "No tree generated"
         simplified_ml_output = ""
         if analyzer and processed_sequence and len(processed_sequence) >= 10:
             try:
                 logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
-                # Use the existing tree analysis function with user-specified similarity
                 tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
                 if tree_result and not tree_result.startswith("Error:"):
-                    # Success - we have HTML content
                     tree_html_content = tree_result
                     simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
-                    # Check if HTML file was created
                     output_dir = "output"
                     if os.path.exists(output_dir):
                         html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
                         if html_files:
-                            html_file = os.path.join(output_dir, html_files[-1])  # Get the latest
                             simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
-                    # Count sequences analyzed
                     if analyzer.find_query_sequence(processed_sequence):
                         matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
                         simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
                 else:
                     simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
                     tree_html_content = f"<p>Error: {tree_result}</p>"
             except Exception as e:
                 logging.error(f"Simplified ML tree analysis failed: {e}")
                 simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
         # Return all results
         return (
-            boundary_output,           # F gene extraction result
-            keras_output,             # F gene validation result
-            ml_tree_output,           # ML tree construction status
-            simplified_ml_output,     # Simplified tree analysis status
-            tree_html_content,        # HTML content for tree display
-            aligned_file,             # Path to aligned FASTA file
-            phy_file,                 # Path to phylogenetic tree file
-            html_file,                # Path to HTML tree file
-            f"Pipeline completed. F gene length: {len(processed_sequence)} bp"  # Summary
         )
     except Exception as e:
         error_msg = f"Pipeline execution failed: {str(e)}"
         logging.error(error_msg)
         import traceback
         logging.error(f"Full traceback: {traceback.format_exc()}")
         return (
-            error_msg, "", "", "", f"<p>Error: {error_msg}</p>",
             None, None, None, error_msg
         )
 # --- Gradio Interface ---
 def create_interface():
     """Create the Gradio interface with enhanced layout and features"""
-    # Custom CSS for better styling
     custom_css = """
-    .gradio-container {
-        max-width: 1200px !important;
-    }
-    .tab-nav button {
-        font-size: 16px !important;
-    }
-    .output-html {
-        height: 600px !important;
-        overflow: auto;
-    }
     """
     with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
         gr.Markdown("""
         # 🧬 F Gene Analysis Pipeline
-        This tool provides comprehensive analysis of F genes including:
-        - **Gene Boundary Detection**: Extract F gene sequences from larger genomic sequences
-        - **Gene Validation**: Validate extracted sequences using machine learning
-        - **Phylogenetic Analysis**: Build maximum likelihood trees and simplified phylogenetic trees
         **Instructions:**
-        1. Enter your sequence directly or upload a FASTA file
-        2. Adjust similarity threshold for phylogenetic analysis (1-99%)
-        3. Choose whether to build maximum likelihood trees (requires MAFFT & IQ-TREE)
-        4. Click "Run Analysis" to start the pipeline
         """)
         with gr.Tab("🔬 Analysis Pipeline"):
             with gr.Row():
                 with gr.Column(scale=2):
-                    # Input section
                     gr.Markdown("### Input Sequence")
-                    dna_input = gr.Textbox(
-                        label="DNA Sequence",
-                        placeholder="Enter your DNA sequence here (ATCG format)...",
-                        lines=5,
-                        max_lines=10
-                    )
-                    fasta_file = gr.File(
-                        label="Or Upload FASTA File",
-                        file_types=[".fasta", ".fa", ".fas", ".txt"]
-                    )
                     with gr.Row():
-                        similarity_score = gr.Slider(
-                            minimum=1,
-                            maximum=99,
-                            value=95.0,
-                            step=1.0,
-                            label="Similarity Threshold (%)",
-                            info="Minimum similarity for phylogenetic analysis"
-                        )
-                        build_ml_tree = gr.Checkbox(
-                            label="Build ML Tree",
-                            value=False,
-                            info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)"
-                        )
-                    # Action buttons
                     with gr.Row():
                         run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
                         clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                 with gr.Column(scale=1):
-                    # Status and info
                     gr.Markdown("### Analysis Status")
-                    status_display = gr.Textbox(
-                        label="Status",
-                        value="Ready to analyze",
-                        interactive=False,
-                        lines=3
-                    )
-                    # Model status
                     gr.Markdown("### Available Models")
                     model_status = []
                     if boundary_model:
                         model_status.append("✅ Boundary Detection Model")
                     else:
                         model_status.append("❌ Boundary Detection Model")
                     if keras_model:
                         model_status.append("✅ Gene Validation Model")
                     else:
                         model_status.append("❌ Gene Validation Model")
                     if analyzer:
                         model_status.append("✅ Tree Analysis Module")
                     else:
                         model_status.append("❌ Tree Analysis Module")
                     gr.Markdown("\n".join(model_status))
         with gr.Tab("📊 Results"):
             with gr.Row():
                 with gr.Column():
-                    # Text outputs
-                    boundary_output = gr.Textbox(
-                        label="🎯 F Gene Extraction",
-                        lines=5,
-                        interactive=False
-                    )
-                    keras_output = gr.Textbox(
-                        label="🔍 Gene Validation",
-                        lines=3,
-                        interactive=False
-                    )
                 with gr.Column():
-                    ml_tree_output = gr.Textbox(
-                        label="🌳 Maximum Likelihood Tree",
-                        lines=5,
-                        interactive=False
-                    )
-                    simplified_ml_output = gr.Textbox(
-                        label="📈 Simplified Phylogenetic Analysis",
-                        lines=3,
-                        interactive=False
-                    )
-            # Tree visualization
             gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
-            tree_html = gr.HTML(
-                label="Interactive Tree",
-                value="<p>No tree generated yet. Run analysis to see results.</p>"
-            )
-            # File downloads
             gr.Markdown("### 📁 Download Results")
             with gr.Row():
-                aligned_file = gr.File(
-                    label="Aligned Sequences (FASTA)",
-                    interactive=False
-                )
-                phy_file = gr.File(
-                    label="Phylogenetic Tree File",
-                    interactive=False
-                )
-                html_file = gr.File(
-                    label="Interactive Tree (HTML)",
-                    interactive=False
-                )
         with gr.Tab("ℹ️ Help & Info"):
             gr.Markdown("""
             ## About This Tool
             ### F Gene Analysis Pipeline
-            This comprehensive pipeline analyzes F genes through multiple computational approaches:
-            #### 🎯 Gene Boundary Detection
-            - Uses deep learning to identify and extract F gene sequences from larger genomic sequences
-            - Provides confidence scores for detected boundaries
-            - Automatically trims sequences to focus on the F gene region
-            #### 🔍 Gene Validation
-            - Employs k-mer based machine learning models to validate extracted sequences
-            - Provides probability scores indicating likelihood of being a genuine F gene
-            - Uses 6-mer frequency patterns for classification
-            #### 🌳 Phylogenetic Analysis
-            **Maximum Likelihood Trees:**
-            - Requires MAFFT (sequence alignment) and IQ-TREE (phylogenetic reconstruction)
-            - Performs model selection and bootstrap analysis
-            - Generates publication-quality phylogenetic trees
-            - Provides detailed evolutionary analysis
-            **Simplified Trees:**
-            - Uses built-in algorithms for quick phylogenetic analysis
-            - Interactive visualization with similarity-based clustering
-            - Faster alternative when external tools are not available
             ### Input Requirements
-            - **DNA Sequences**: ATCG format, minimum 50 bp for meaningful analysis
-            - **FASTA Files**: Standard FASTA format with single or multiple sequences
-            - **Similarity Threshold**: 1-99% for controlling phylogenetic analysis sensitivity
             ### Dependencies
-            **Required for ML Trees:**
             ```bash
-            # Ubuntu/Debian
-            sudo apt-get install mafft iqtree
-            # macOS
-            brew install mafft iqtree
-            # Conda
-            conda install -c bioconda mafft iqtree
             ```
-            ### Output Files
-            - **Aligned FASTA**: Multiple sequence alignment in FASTA format
-            - **Tree File**: Newick format phylogenetic tree
-            - **HTML Tree**: Interactive visualization for web browsers
             ### Troubleshooting
-            **Common Issues:**
-            - *"No similar sequences found"*: Lower the similarity threshold
-            - *"Sequence too short"*: Provide sequences longer than 50 bp
-            - *"MAFFT/IQ-TREE not found"*: Install required dependencies
-            - *"Model not available"*: Check model files are properly downloaded
-            **Performance Tips:**
-            - Use sequences between 100-2000 bp for optimal performance
-            - Limit to <50 sequences for faster tree construction
-            - Lower similarity thresholds find more distant relatives
-            - Higher thresholds focus on closely related sequences
-            ### Citation
-            If you use this tool in your research, please cite the appropriate methods and tools used.
             """)
-        # Event handlers
-        def run_analysis_text(dna_seq, sim_score, build_tree):
-            return run_pipeline(dna_seq, sim_score, build_tree)
-        def run_analysis_file(file_obj, sim_score, build_tree):
-            return run_pipeline_from_file(file_obj, sim_score, build_tree)
         def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
-            # Priority: file upload over text input
             if file_obj is not None:
                 return run_pipeline_from_file(file_obj, sim_score, build_tree)
             else:
         def clear_inputs():
             return "", None, 95.0, False, "Ready to analyze"
-        # Connect events
         run_btn.click(
             fn=run_analysis_combined,
             inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
             outputs=[
-                boundary_output, keras_output, ml_tree_output,
-                simplified_ml_output, tree_html, aligned_file,
-                phy_file, html_file, status_display
             ]
         )
         clear_btn.click(
             fn=clear_inputs,
             outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
         )
-        # Example data loading
-        gr.Markdown("### 🧪 Example Data")
         example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")
         def load_example():
             example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
             return example_seq, "Example F gene sequence loaded"
-        example_btn.click(
-            fn=load_example,
-            outputs=[dna_input, status_display]
-        )
     return iface
 # --- Main Execution ---
 if __name__ == "__main__":
-    # Initialize and launch interface
     interface = create_interface()
-    # Launch with enhanced configuration
     interface.launch(
-        server_name="0.0.0.0",  # Allow external connections
-        server_port=7860,        # Default Gradio port
-        share=False,             # Set to True for public sharing
-        debug=True,              # Enable debug mode
-        show_error=True,         # Show detailed errors
-        max_threads=4,           # Limit concurrent threads
-        auth=None,               # Add authentication if needed: ("username", "password")
-        ssl_verify=False,        # For development environments
-        quiet=False              # Show startup messages
     )

+# app.py
+import gradio as gr
+import torch
+import pickle
+import subprocess
+import pandas as pd
+import os
+import re
+import logging
+import numpy as np
+from predictor import GenePredictor, preprocess_sequence_for_ndv_f_gene, enhanced_keras_prediction, enhanced_classify_sequence, validate_ndv_f_gene_sequence
+from tensorflow.keras.models import load_model
 import ml_simplified_tree
 import tempfile
 import shutil
+import stat
 from pathlib import Path
+from huggingface_hub import hf_hub_download
+from tensorflow.keras.preprocessing.sequence import pad_sequences
 # --- Global Variables ---
+# Both tool paths are relative, so they are resolved against the process
+# working directory at the time they are used.
 MAFFT_PATH = "mafft/mafftdir/bin/mafft"  # Update this path as needed
+IQTREE_PATH = "iqtree/bin/iqtree3"  # Updated to match uploaded iqtree3 files
+# --- Logging ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # --- Paths ---
 model_repo = "GGproject10/best_boundary_aware_model"
 csv_path = "f cleaned.csv"
+classifier_model_dir = "model"  # Directory for second model files
 # Get HF token from environment (if available)
 hf_token = os.getenv("HF_TOKEN")
+# --- Load Models ---
+# Every handle below starts as None and is only replaced when the matching
+# loader further down succeeds, so later code must tolerate None values.
 boundary_model = None
 keras_model = None
 kmer_to_index = None
+classifier_model = None
+classifier_kmer_to_index = None
+classifier_maxlen = None
+# Class names handed to enhanced_classify_sequence() as its labels argument.
+LABELS = ["Random", "F", "P", "N", "M", "HN", "L"]
 # Try to load boundary model from Hugging Face Hub
+# Any failure (download, missing file, model construction) is logged and
+# leaves boundary_model as None instead of aborting start-up.
 try:
+    boundary_path = hf_hub_download(repo_id=model_repo, filename="best_boundary_aware_model.pth", token=hf_token)
     if os.path.exists(boundary_path):
         boundary_model = GenePredictor(boundary_path)
         logging.info("Boundary model loaded successfully from Hugging Face Hub.")
+    else:
+        logging.warning(f"Boundary model file not found after download")
+except Exception as e:
+    logging.error(f"Failed to load boundary model from HF Hub: {e}")
 # Try to load Keras model from Hugging Face Hub
+# The Keras model and its k-mer vocabulary are loaded together; if either
+# file is missing, both keras_model and kmer_to_index stay None.
 try:
+    keras_path = hf_hub_download(repo_id=model_repo, filename="best_model.keras", token=hf_token)
+    kmer_path = hf_hub_download(repo_id=model_repo, filename="kmer_to_index.pkl", token=hf_token)
     if os.path.exists(keras_path) and os.path.exists(kmer_path):
         keras_model = load_model(keras_path)
         with open(kmer_path, "rb") as f:
+            # NOTE(review): pickle.load can execute code embedded in the file,
+            # so this is only safe while the Hub repository is trusted.
+            kmer_to_index = pickle.load(f)
+        logging.info("Keras model and k-mer index loaded successfully from Hugging Face Hub.")
+    else:
+        logging.warning(f"Keras model or kmer files not found after download")
 except Exception as e:
     logging.error(f"Failed to load Keras model from HF Hub: {e}")
+# Try to load classifier model (second model)
+# NOTE(review): TF_CPP_MIN_LOG_LEVEL is set here, after tensorflow.keras has
+# already been imported at the top of the file, so it probably does not
+# silence TensorFlow's import-time logging - confirm and move it if needed.
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+try:
+    # The classifier needs three local files; all of them must be present.
+    classifier_path = os.path.join(classifier_model_dir, "best_model.keras")
+    classifier_kmer_path = os.path.join(classifier_model_dir, "kmer_to_index.pkl")
+    classifier_maxlen_path = os.path.join(classifier_model_dir, "maxlen.txt")
+    missing_files = []
+    if not os.path.exists(classifier_path):
+        missing_files.append("best_model.keras")
+    if not os.path.exists(classifier_kmer_path):
+        missing_files.append("kmer_to_index.pkl")
+    if not os.path.exists(classifier_maxlen_path):
+        missing_files.append("maxlen.txt")
+    if missing_files:
+        logging.warning(f"Classifier model files not found: {', '.join(missing_files)}")
+    else:
+        classifier_model = load_model(classifier_path)
+        with open(classifier_kmer_path, "rb") as f:
+            classifier_kmer_to_index = pickle.load(f)
+        # maxlen.txt holds a single integer.
+        with open(classifier_maxlen_path, "r") as f:
+            classifier_maxlen = int(f.read().strip())
+        logging.info("Classifier model loaded successfully.")
+except Exception as e:
+    logging.error(f"Failed to load classifier model: {e}")
+    # NOTE(review): nothing in this block switches to another model; the
+    # classifier globals may simply stay None (or partly set) - confirm the
+    # fallback happens where they are consumed.
+    logging.warning("Falling back to existing Keras model for validation.")
 # --- Initialize Tree Analyzer ---
+# analyzer ends up as a loaded PhylogeneticTreeAnalyzer, or None when the CSV
+# is missing, cannot be loaded, or construction raises.
 analyzer = None
 try:
+    analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
     if os.path.exists(csv_path):
         if analyzer.load_data(csv_path):
             logging.info("Tree analyzer initialized successfully")
+            # Training is best-effort: a failure is logged and the analyzer
+            # is kept.
             try:
                 if not analyzer.train_ai_model():
                     logging.warning("AI model training failed; proceeding with basic analysis.")
+            except Exception as e:
+                logging.warning(f"AI model training failed: {e}")
+        else:
+            logging.error("Failed to load CSV data for tree analyzer")
+            analyzer = None
+    else:
+        logging.error(f"CSV file not found: {csv_path}")
+        analyzer = None
+except Exception as e:
+    logging.error(f"Failed to initialize tree analyzer: {e}")
     analyzer = None
 # --- Enhanced Tool Detection ---
+def check_and_fix_executable_permissions(filepath):
+    """Make sure ``filepath`` exists and is executable, adding execute bits if needed.
+
+    Args:
+        filepath: Path of the file to check.
+
+    Returns:
+        True when the path exists and is (or was made) executable; False when
+        it does not exist or its mode could not be changed.
+    """
+    try:
+        if not os.path.exists(filepath):
+            return False
+        if not os.access(filepath, os.X_OK):
+            logging.info(f"File {filepath} is not executable, attempting to fix permissions...")
+            current_permissions = os.stat(filepath).st_mode
+            # stat.S_IEXEC is an alias of stat.S_IXUSR, so only one is needed.
+            os.chmod(filepath, current_permissions | stat.S_IXUSR | stat.S_IXGRP)
+            logging.info(f"Fixed permissions for {filepath}")
+        return True
+    except Exception as e:
+        # The original f-string closed the placeholder with ")" instead of
+        # "}", which is a syntax error that broke the whole module.
+        logging.error(f"Failed to fix permissions for {filepath}: {e}")
+        return False
+def enhanced_check_tool_availability():
+    """Locate usable MAFFT and IQ-TREE executables.
+
+    Side effect: removes ``MAFFT_BINARIES`` from ``os.environ`` if present.
+
+    Returns:
+        ``(mafft_available, iqtree_available, mafft_cmd, iqtree_cmd)``. Each
+        ``*_cmd`` is the first usable entry of its candidate list, or None.
+    """
+    # Unset MAFFT_BINARIES to fix version check issue
+    if 'MAFFT_BINARIES' in os.environ:
+        del os.environ['MAFFT_BINARIES']
+        logging.info("Unset MAFFT_BINARIES environment variable to resolve version check issue.")
     mafft_candidates = [
         MAFFT_PATH,
         'mafft',
         '/usr/bin/mafft',
         '/usr/local/bin/mafft',
+        '/opt/homebrew/bin/mafft',
+        '/usr/local/homebrew/bin/mafft',
+        'mafft.bat',
     ]
     iqtree_candidates = [
         IQTREE_PATH,
+        'iqtree3',
+        'iqtree',
+        '/usr/bin/iqtree3',
+        '/usr/local/bin/iqtree3',
         '/usr/bin/iqtree',
         '/usr/local/bin/iqtree',
+        '/opt/homebrew/bin/iqtree3',
+        'iqtree3.exe',
     ]
+    mafft_cmd = _find_first_usable_tool(mafft_candidates, "MAFFT")
+    iqtree_cmd = _find_first_usable_tool(iqtree_candidates, "IQ-TREE")
+    return mafft_cmd is not None, iqtree_cmd is not None, mafft_cmd, iqtree_cmd
+def _find_first_usable_tool(candidates, label):
+    """Return the first candidate that is an executable file or is on PATH, else None."""
+    for candidate in candidates:
+        if not candidate:
+            continue
+        # isfile(), not exists(): a local directory such as "mafft/" or
+        # "iqtree/" passes exists() and os.access(..., X_OK) and would
+        # otherwise be picked as the command for the bare-name candidates.
+        if os.path.isfile(candidate):
+            # Only bundled (non-system) paths get their execute bit repaired.
+            if "/" in candidate and not candidate.startswith(("/usr/", "/opt/")):
+                check_and_fix_executable_permissions(candidate)
+            if os.access(candidate, os.X_OK) or shutil.which(candidate) is not None:
+                logging.info(f"Found {label} at: {candidate}")
+                return candidate
+        elif shutil.which(candidate) is not None:
+            logging.info(f"Found {label} in PATH: {candidate}")
+            return candidate
+    return None
+def get_installation_instructions():
+    """Return MAFFT / IQ-TREE installation instructions for the current OS.
+
+    On Linux the distribution is read from ``/etc/os-release``. Unrecognised
+    distributions, unreadable release files and unknown platforms all get the
+    general instructions.
+
+    Returns:
+        A multi-line, human-readable instruction string.
+    """
+    import platform
+    system = platform.system().lower()
+    if system == "linux":
+        try:
+            with open('/etc/os-release', 'r') as f:
+                os_info = f.read().lower()
+            if 'ubuntu' in os_info or 'debian' in os_info:
+                return """
+📦 INSTALLATION INSTRUCTIONS (Ubuntu/Debian):
+1. Update package list: sudo apt-get update
+2. Install MAFFT and IQ-TREE: sudo apt-get install mafft iqtree
+3. Verify installation: mafft --version, iqtree3 --version
+Alternative using Conda: conda install -c bioconda mafft iqtree
+"""
+            elif 'centos' in os_info or 'rhel' in os_info or 'fedora' in os_info:
+                return """
+📦 INSTALLATION INSTRUCTIONS (CentOS/RHEL/Fedora):
+1. Install EPEL repository (CentOS/RHEL): sudo yum install epel-release
+2. Install packages: sudo yum install mafft iqtree
+3. Verify installation: mafft --version, iqtree3 --version
+"""
+        except OSError:
+            # Release file missing or unreadable: fall through to the general
+            # instructions. A bare "except:" here also swallowed
+            # KeyboardInterrupt and SystemExit.
+            pass
+    elif system == "darwin":
+        return """
+📦 INSTALLATION INSTRUCTIONS (macOS):
+Using Homebrew: 1. Install Homebrew: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+2. Install MAFFT and IQ-TREE: brew install mafft iqtree
+3. Verify installation: mafft --version, iqtree3 --version
+Using Conda: conda install -c bioconda mafft iqtree
+"""
+    elif system == "windows":
+        return """
+📦 INSTALLATION INSTRUCTIONS (Windows):
+Option 1 - Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree
+Option 2 - Manual: 1. Download MAFFT: https://mafft.cbrc.jp/alignment/software/
+2. Download IQ-TREE: http://www.iqtree.org/
+3. Add to PATH
+"""
+    return """
+📦 GENERAL INSTALLATION INSTRUCTIONS:
+Using Conda: 1. Install Miniconda 2. Run: conda install -c bioconda mafft iqtree
+Manual: 1. MAFFT: https://mafft.cbrc.jp/alignment/software/
+2. IQ-TREE: http://www.iqtree.org/
 """
+def run_mafft_alignment_improved(input_fasta, output_fasta, mafft_cmd):
+    """Align ``input_fasta`` with MAFFT and write the alignment to ``output_fasta``.
+
+    Args:
+        input_fasta: Path of the unaligned multi-FASTA input.
+        output_fasta: Path that receives MAFFT's standard output.
+        mafft_cmd: MAFFT executable, given as a path or as a command name
+            that is looked up on PATH.
+
+    Returns:
+        ``(True, output_fasta)`` on success, otherwise ``(False, message)``.
+    """
     try:
+        # os.access() on a bare command name only inspects the working
+        # directory, so a MAFFT that was detected via PATH used to be rejected
+        # here. shutil.which() covers explicit paths and PATH lookups alike.
+        resolved_cmd = shutil.which(mafft_cmd)
+        if resolved_cmd is None:
+            logging.warning(f"MAFFT executable {mafft_cmd} is not executable")
+            if not check_and_fix_executable_permissions(mafft_cmd):
+                return False, f"Cannot make {mafft_cmd} executable"
+            resolved_cmd = mafft_cmd
+        # MAFFT_BINARIES is stripped from the child environment of every call.
+        child_env = {k: v for k, v in os.environ.items() if k != 'MAFFT_BINARIES'}
+        try:
+            test_result = subprocess.run([resolved_cmd, '--version'], capture_output=True, text=True, timeout=10, env=child_env)
+            if test_result.returncode != 0:
+                return False, f"MAFFT version check failed: {test_result.stderr}"
+        except Exception as e:
+            return False, f"MAFFT version check failed: {str(e)}"
+        cmd = [resolved_cmd, '--auto', '--quiet', '--thread', '2', input_fasta]
         logging.info(f"Running MAFFT: {' '.join(cmd)}")
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd(), env=child_env)
         if result.returncode == 0:
+            # MAFFT prints the alignment on stdout; persist it ourselves.
             with open(output_fasta, 'w') as f:
                 f.write(result.stdout)
             logging.info(f"MAFFT alignment completed: {output_fasta}")
             if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
                 return True, output_fasta
             else:
+                return False, "MAFFT completed but output file is empty"
+        else:
             error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
             logging.error(f"MAFFT failed: {error_msg}")
             return False, f"MAFFT error: {error_msg}"
     except subprocess.TimeoutExpired:
         logging.error("MAFFT timeout")
         return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
+    except PermissionError as e:
+        logging.error(f"Permission error running MAFFT: {e}")
+        return False, f"Permission denied: {mafft_cmd}. Please check file permissions."
     except FileNotFoundError:
         return False, f"MAFFT executable not found: {mafft_cmd}"
     except Exception as e:
+        logging.error(f"MAFFT execution failed: {e}")
+        return False, f"MAFFT execution failed: {str(e)}"
 def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
+    """Run IQ-TREE on an alignment and report the resulting tree file.
+
+    Args:
+        aligned_fasta: Path of the aligned FASTA input.
+        output_prefix: Prefix for IQ-TREE output files; the tree is expected
+            at ``<output_prefix>.treefile``.
+        iqtree_cmd: IQ-TREE executable, given as a path or as a command name
+            that is looked up on PATH.
+
+    Returns:
+        ``(True, tree_file)`` on success, otherwise ``(False, message)``.
+    """
     try:
+        # os.access() on a bare command name only inspects the working
+        # directory, so an IQ-TREE that was detected via PATH used to be
+        # rejected here. shutil.which() covers paths and PATH lookups alike.
+        resolved_cmd = shutil.which(iqtree_cmd)
+        if resolved_cmd is None:
+            logging.warning(f"IQ-TREE executable {iqtree_cmd} is not executable")
+            if not check_and_fix_executable_permissions(iqtree_cmd):
+                return False, f"Cannot make {iqtree_cmd} executable"
+            resolved_cmd = iqtree_cmd
+        try:
+            test_result = subprocess.run([resolved_cmd, '--version'], capture_output=True, text=True, timeout=10)
+            if test_result.returncode != 0:
+                return False, f"IQ-TREE version check failed: {test_result.stderr}"
+        except Exception as e:
+            return False, f"IQ-TREE version check failed: {str(e)}"
+        cmd = [resolved_cmd, '-s', aligned_fasta, '-m', 'MFP', '-bb', '1000', '-alrt', '1000', '-nt', 'AUTO', '--prefix', output_prefix, '-redo', '--quiet']
         logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=1200, cwd=os.getcwd())
         if result.returncode == 0:
             tree_file = f"{output_prefix}.treefile"
             if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
+                logging.info(f"IQ-TREE analysis completed: {tree_file}")
+                return True, tree_file
+            else:
+                logging.error("IQ-TREE completed but tree file not found or empty")
+                return False, "Tree file not generated or empty"
+        else:
             error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
             logging.error(f"IQ-TREE failed: {error_msg}")
             return False, f"IQ-TREE error: {error_msg}"
     except subprocess.TimeoutExpired:
         logging.error("IQ-TREE timeout")
         return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
+    except PermissionError as e:
+        logging.error(f"Permission error running IQ-TREE: {e}")
+        return False, f"Permission denied: {iqtree_cmd}. Please check file permissions."
     except FileNotFoundError:
         return False, f"IQ-TREE executable not found: {iqtree_cmd}"
     except Exception as e:
+        logging.error(f"IQ-TREE execution failed: {e}")
+        return False, f"IQ-TREE execution failed: {str(e)}"
 def create_simple_neighbor_joining_tree(sequences_dict):
+    """Write a placeholder star tree for use when ML tools are not available.
+
+    Despite the name, no distances are computed: every leaf is attached to the
+    root with a fixed branch length of 0.1, and only the first five sequence
+    names are included.
+
+    Args:
+        sequences_dict: Mapping of sequence name to sequence; only the keys
+            are used.
+
+    Returns:
+        ``("simple_tree.nwk", message)`` on success (the file is written to
+        the working directory), otherwise ``(None, message)``.
+    """
     try:
         seq_names = list(sequences_dict.keys())
+        if len(seq_names) < 2:
             return None, "Need at least 2 sequences for tree construction"
+        # The general form already yields "(a:0.1,b:0.1);" for two names, so
+        # no separate two-sequence branch is needed.
+        tree_str = "(" + ",".join([f"{name}:0.1" for name in seq_names[:5]]) + ");"
         tree_file = "simple_tree.nwk"
         with open(tree_file, 'w') as f:
             f.write(tree_str)
         return tree_file, "Simple distance-based tree created"
     except Exception as e:
         return None, f"Simple tree creation failed: {str(e)}"
 def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
+    """Write the query plus reference sequences to a temporary multi-FASTA file.
+
+    References come from ``f_gene_sequences_aligned.fasta`` in the working
+    directory when it exists; otherwise up to 20 rows of ``analyzer.data``
+    whose ``sequence`` is longer than 50 characters are used.
+
+    Args:
+        query_sequence: Sequence written as the first record.
+        query_id: FASTA identifier for the query record.
+
+    Returns:
+        Path of the temporary file (the caller must delete it), or None on
+        failure.
+    """
+    temp_path = None
     try:
+        # The context manager closes the handle on every path; delete=False
+        # keeps the file on disk for the caller.
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as temp_fasta:
+            temp_path = temp_fasta.name
+            temp_fasta.write(f">{query_id}\n{query_sequence}\n")
+            ref_fasta_path = "f_gene_sequences_aligned.fasta"
+            if os.path.exists(ref_fasta_path):
+                # build_maximum_likelihood_tree() copies each finished
+                # alignment to this path, so the file may already contain a
+                # record named query_id from an earlier run. Skip it so the
+                # identifier is not duplicated.
+                skipping = False
+                with open(ref_fasta_path, 'r') as ref_file:
+                    for line in ref_file:
+                        if line.startswith(">"):
+                            header_fields = line[1:].split()
+                            skipping = bool(header_fields) and header_fields[0] == query_id
+                        if not skipping:
+                            # Guarantee a newline so a file without a final
+                            # line break cannot glue records together.
+                            temp_fasta.write(line if line.endswith("\n") else line + "\n")
+                logging.info(f"Added reference sequences from {ref_fasta_path}")
+            elif analyzer and hasattr(analyzer, 'data'):
+                count = 0
+                for idx, row in analyzer.data.iterrows():
+                    if 'sequence' in row and len(str(row['sequence'])) > 50:
+                        seq_id = row.get('id', f"Ref_{count}")
+                        sequence = str(row['sequence']).upper()
+                        temp_fasta.write(f">{seq_id}\n{sequence}\n")
+                        count += 1
+                        if count >= 20:
+                            break
+                logging.info(f"Added {count} reference sequences from CSV")
+        return temp_path
     except Exception as e:
         logging.error(f"Failed to create multi-FASTA: {e}")
+        # Do not leave a half-written temporary file behind.
+        if temp_path is not None:
+            try:
+                os.unlink(temp_path)
+            except OSError:
+                pass
         return None
 def build_maximum_likelihood_tree(f_gene_sequence):
+    """Align the query with reference sequences and build an ML tree.
+
+    Runs MAFFT and then IQ-TREE, writing into ``ml_tree_output/``. When either
+    tool is missing, installation instructions are returned instead; no
+    alternative tree is built here.
+
+    Args:
+        f_gene_sequence: Query sequence to place in the tree.
+
+    Returns:
+        ``(success, message, aligned_fasta, tree_file)``. The two paths are
+        None on failure, except that ``aligned_fasta`` is still returned when
+        only the IQ-TREE step failed.
+    """
     try:
+        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = enhanced_check_tool_availability()
         status_msg = "🔍 Checking dependencies...\n"
+        status_msg += f"✅ MAFFT found: {mafft_cmd}\n" if mafft_available else "❌ MAFFT not found\n"
+        status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n" if iqtree_available else "❌ IQ-TREE not found\n"
+        if not mafft_available or not iqtree_available:
+            instructions = get_installation_instructions()
+            return False, f"{status_msg}\n{instructions}", None, None
         output_dir = "ml_tree_output"
         os.makedirs(output_dir, exist_ok=True)
         logging.info("Creating multi-FASTA file...")
         multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
         if not multi_fasta:
             return False, f"{status_msg}❌ Failed to create input FASTA", None, None
         logging.info("Running MAFFT alignment...")
         aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
+        mafft_success, mafft_result = run_mafft_alignment_improved(multi_fasta, aligned_fasta, mafft_cmd)
+        # The temporary input is no longer needed once MAFFT has returned.
         os.unlink(multi_fasta)
         if not mafft_success:
             return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
         logging.info("Running IQ-TREE analysis...")
         tree_prefix = os.path.join(output_dir, "ml_tree")
         iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
         if not iqtree_success:
             return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
         tree_file = iqtree_result
         log_file = f"{tree_prefix}.log"
+        # Copies under fixed names in the working directory. Note that
+        # create_multi_fasta_with_query() reads "f_gene_sequences_aligned.fasta"
+        # as its reference set, so this alignment feeds the next run.
         standard_aligned = "f_gene_sequences_aligned.fasta"
         standard_tree = "f_gene_sequences.phy.treefile"
         if os.path.exists(aligned_fasta):
             shutil.copy2(aligned_fasta, standard_aligned)
         if os.path.exists(tree_file):
             shutil.copy2(tree_file, standard_tree)
+        success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n- Alignment: {os.path.basename(aligned_fasta)}\n- Tree: {os.path.basename(tree_file)}\n"
+        # Best effort: append the first "Best-fit model:" line from the
+        # IQ-TREE log to the message.
         if os.path.exists(log_file):
             try:
                 with open(log_file, 'r') as f:
                     log_content = f.read()
                     if "Best-fit model:" in log_content:
                         model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
                         if model_lines:
                             success_msg += f"- {model_lines[0].strip()}\n"
             except Exception as e:
                 logging.warning(f"Could not read log file: {e}")
         logging.info("Maximum likelihood tree construction completed")
         return True, success_msg, aligned_fasta, tree_file
     except Exception as e:
         logging.error(f"ML tree construction failed: {e}")
         return False, f"ML tree construction failed: {str(e)}", None, None
 def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> str:
+    """Build the similarity-based tree for ``sequence`` and return it as HTML.
+
+    Uses the module-level ``analyzer``. On success the HTML is also saved under
+    ``output/``.
+
+    Args:
+        sequence: Query sequence.
+        matching_percentage: Similarity threshold; must be between 1 and 99.
+
+    Returns:
+        The full HTML document of the tree figure, or a plain-text message
+        when validation or any step fails (no exception is raised).
+    """
     try:
         if not analyzer:
             return "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
         if not sequence:
             return "Error: Please provide a sequence."
         if not (1 <= matching_percentage <= 99):
             return "Error: Matching percentage must be between 1 and 99."
         if not analyzer.find_query_sequence(sequence):
             return "Error: Invalid query sequence or sequence not found in dataset."
         analyzer.matching_percentage = matching_percentage
         matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
         if not matched_ids:
             return f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
         logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
         tree_structure = analyzer.build_tree_structure(matched_ids)
         if not tree_structure:
             return "Error: Failed to build tree structure."
         fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
         if not fig:
             return "Error: Failed to create tree visualization."
         html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')
         output_dir = "output"
         os.makedirs(output_dir, exist_ok=True)
+        # File name is built from the first 20 characters of the query, with
+        # anything non-alphanumeric replaced by "_".
         safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
         html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
         with open(html_filename, "w", encoding='utf-8') as f:
             f.write(html_content)
         logging.info(f"Tree HTML saved to {html_filename}")
         return html_content
     except Exception as e:
         error_msg = f"Tree analysis error: {str(e)}"
         logging.error(error_msg)
+        import traceback
         logging.error(f"Full traceback: {traceback.format_exc()}")
         return error_msg
 def read_fasta_file(file_obj):
+    """Return the first sequence found in an uploaded FASTA file.
+
+    Args:
+        file_obj: None, a path to an existing file, an object whose ``name``
+            attribute is a file path, a readable text or binary stream, or any
+            other object (whose ``str()`` is then parsed as FASTA text).
+
+    Returns:
+        The residues of the first record joined into one string, or "" when
+        there is no input or it cannot be read.
+    """
     try:
         if file_obj is None:
             return ""
+        if isinstance(file_obj, (str, os.PathLike)) and os.path.isfile(file_obj):
+            # NOTE(review): covers uploads delivered as a plain path string,
+            # which used to be parsed as if the path were the FASTA text -
+            # confirm against the Gradio version in use.
+            with open(file_obj, "r") as f:
+                content = f.read()
+        elif hasattr(file_obj, 'name'):
             with open(file_obj.name, "r") as f:
                 content = f.read()
+        elif hasattr(file_obj, "read"):
+            content = file_obj.read()
+            # Text streams already yield str; only bytes need decoding.
+            if isinstance(content, bytes):
+                content = content.decode("utf-8")
         else:
+            content = str(file_obj)
+        seq_lines = []
+        for raw_line in content.splitlines():
+            line = raw_line.strip()
+            if line.startswith(">"):
+                if seq_lines:
+                    # Joining several records would hand the pipeline one
+                    # chimeric sequence, so stop after the first record.
+                    logging.warning("FASTA input contains multiple records; only the first one is used")
+                    break
+                continue
+            if line:
+                seq_lines.append(line)
         return ''.join(seq_lines)
+    except Exception as e:
         logging.error(f"Failed to read FASTA file: {e}")
         return ""
 def run_pipeline_from_file(fasta_file_obj, similarity_score, build_ml_tree):
+    """Read a sequence from an uploaded FASTA file and run the full pipeline.
+
+    Args:
+        fasta_file_obj: Uploaded file, passed to read_fasta_file().
+        similarity_score: Forwarded to enhanced_run_pipeline().
+        build_ml_tree: Forwarded to enhanced_run_pipeline().
+
+    Returns:
+        Whatever enhanced_run_pipeline() returns. When the file yields no
+        sequence or an exception is raised, a 13-element tuple is returned
+        instead: a message first, empty strings and None placeholders in
+        between, and a status message last.
+    """
     try:
         dna_input = read_fasta_file(fasta_file_obj)
         if not dna_input:
+            return "Failed to read FASTA file", "", "", "", "", "", "", "", "", None, None, None, "No input sequence"
+        # The models and label list are the module-level globals set up at
+        # import time; any of the model handles may be None.
+        return enhanced_run_pipeline(
+            dna_input, keras_model, kmer_to_index, classifier_model,
+            classifier_kmer_to_index, classifier_maxlen, LABELS,
+            similarity_score, build_ml_tree
+        )
     except Exception as e:
         error_msg = f"Pipeline error: {str(e)}"
         logging.error(error_msg)
+        return error_msg, "", "", "", "", "", "", "", "", None, None, None, error_msg
+def enhanced_run_pipeline(dna_input, keras_model, kmer_to_index, classifier_model,
+                        classifier_kmer_to_index, classifier_maxlen, labels,
+                        similarity_score=95.0, build_ml_tree=False):
+    """Enhanced pipeline with improved F gene prediction"""
     try:
+        # Input validation and preprocessing
         dna_input = dna_input.upper().strip()
         if not dna_input:
+            return "Empty input", "", "", "", "", "", "", "", "", None, None, None, "No input provided"
         if not re.match('^[ACTGN]+$', dna_input):
             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
             logging.info("DNA sequence sanitized")
+        # Step 1: Boundary Prediction
+        processed_sequence = dna_input
+        boundary_output = ""
         if boundary_model:
             try:
                 predictions, probs, confidence = boundary_model.predict(dna_input)
                 regions = boundary_model.extract_gene_regions(predictions, dna_input)
                 if regions:
+                    processed_sequence = regions[0]["sequence"]
+                    boundary_output = f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})"
                     logging.info(f"F gene extracted: {len(processed_sequence)} bp (confidence: {confidence:.3f})")
                 else:
                     boundary_output = f"No F gene regions found in input sequence"
                     logging.warning("No gene regions found, using full sequence")
             except Exception as e:
                 logging.error(f"Boundary model failed: {e}")
                 boundary_output = f"Boundary model error: {str(e)}"
         else:
             boundary_output = f"Boundary model not available. Using original input: {len(dna_input)} bp"
+        # Step 2: Enhanced Keras Prediction
+        keras_result = enhanced_keras_prediction(processed_sequence, keras_model, kmer_to_index)
+        if isinstance(keras_result, dict):
+            keras_output = f"Prediction confidence: {keras_result['confidence_score']:.3f}\n"
+            keras_output += f"K-mer coverage: {keras_result['kmer_coverage']:.1%}\n"
+            keras_output += f"Sequence length: {keras_result['sequence_length']} nt"
+            if keras_result['kmer_coverage'] < 0.8:
+                keras_output += "\n⚠️ Low k-mer coverage - may affect accuracy"
         else:
+            keras_output = str(keras_result)
+        # Step 3: Enhanced Classification
+        classifier_result = enhanced_classify_sequence(
+            processed_sequence, classifier_model, classifier_kmer_to_index, classifier_maxlen, labels
+        )
+        classifier_status = classifier_result["status"]
+        classifier_message = classifier_result["message"]
+        classifier_label = classifier_result["predicted_label"] or "Unknown"
+        classifier_confidence = f"{classifier_result['confidence']:.3f}" if classifier_result['confidence'] is not None else "N/A"
+        # Step 4: Maximum Likelihood Tree
         aligned_file = None
         phy_file = None
         ml_tree_output = ""
         if build_ml_tree and processed_sequence and len(processed_sequence) >= 50:
             try:
                 logging.info("Starting maximum likelihood tree construction...")
                 ml_success, ml_message, ml_aligned, ml_tree = build_maximum_likelihood_tree(processed_sequence)
                 if ml_success:
                     ml_tree_output = ml_message
                     aligned_file = ml_aligned
                     phy_file = ml_tree
                 else:
+                    ml_tree_output = ml_message
             except Exception as e:
                 ml_tree_output = f"❌ ML Tree construction failed: {str(e)}"
                 logging.error(f"ML Tree failed: {e}")
+        elif build_ml_tree:
+            ml_tree_output = "❌ F gene sequence too short for ML tree construction (minimum 50 bp)"
         else:
             ml_tree_output = "ML tree construction skipped (not requested)"
+        # Step 5: ML Simplified Tree
         html_file = None
         tree_html_content = "No tree generated"
         simplified_ml_output = ""
         if analyzer and processed_sequence and len(processed_sequence) >= 10:
             try:
                 logging.info(f"Starting simplified ML tree analysis with F gene sequence length: {len(processed_sequence)}")
                 tree_result = analyze_sequence_for_tree(processed_sequence, matching_percentage=similarity_score)
                 if tree_result and not tree_result.startswith("Error:"):
                     tree_html_content = tree_result
                     simplified_ml_output = "✅ Simplified phylogenetic tree generated successfully!"
                     output_dir = "output"
                     if os.path.exists(output_dir):
                         html_files = [f for f in os.listdir(output_dir) if f.endswith('.html')]
                         if html_files:
+                            html_file = os.path.join(output_dir, html_files[-1])
                             simplified_ml_output += f"\n- Tree file: {html_files[-1]}"
                     if analyzer.find_query_sequence(processed_sequence):
                         matched_ids, perc = analyzer.find_similar_sequences(similarity_score)
                         simplified_ml_output += f"\n- {len(matched_ids)} sequences analyzed"
+                        simplified_ml_output += f"\n- Similarity threshold: {perc:.1f}%"
                 else:
                     simplified_ml_output = f"❌ Simplified ML tree failed: {tree_result}"
                     tree_html_content = f"<p>Error: {tree_result}</p>"
             except Exception as e:
                 logging.error(f"Simplified ML tree analysis failed: {e}")
                 simplified_ml_output = f"❌ Simplified ML tree analysis failed: {str(e)}"
+                tree_html_content = f"<p>Error: {str(e)}</p>"
+        else:
+            if not analyzer:
+                simplified_ml_output = "❌ Tree analyzer not available"
+            else:
+                simplified_ml_output = "❌ F gene sequence too short for tree analysis (minimum 10 bp)"
         # Return all results
         return (
+            boundary_output,
+            keras_output,
+            classifier_status,
+            classifier_message,
+            classifier_label,
+            classifier_confidence,
+            ml_tree_output,
+            simplified_ml_output,
+            tree_html_content,
+            aligned_file,
+            phy_file,
+            html_file,
+            f"Pipeline completed. F gene length: {len(processed_sequence)} bp"
         )
     except Exception as e:
         error_msg = f"Pipeline execution failed: {str(e)}"
         logging.error(error_msg)
         import traceback
         logging.error(f"Full traceback: {traceback.format_exc()}")
         return (
+            error_msg, "", "error", error_msg, "Error", "0.000", "", "", f"<p>Error: {error_msg}</p>",
             None, None, None, error_msg
         )
 # --- Gradio Interface ---
 def create_interface():
     """Create the Gradio interface with enhanced layout and features.

     Builds a ``gr.Blocks`` app with three tabs (analysis inputs, results,
     help), reports which module-level components (``boundary_model``,
     ``keras_model``, ``classifier_model``, ``analyzer``) are available, and
     wires the Run, Clear and example buttons to their handlers.

     Returns:
         The assembled ``gr.Blocks`` instance (not yet launched).
     """
     custom_css = """
     .gradio-container { max-width: 1200px !important; }
     .tab-nav button { font-size: 16px !important; }
     .output-html { height: 600px !important; overflow: auto; }
     """
     with gr.Blocks(css=custom_css, title="F Gene Analysis Pipeline") as iface:
         gr.Markdown("""
         # 🧬 F Gene Analysis Pipeline
         This tool provides comprehensive analysis of F genes:
         - **🎯 F Gene Extraction**: Extracts F gene sequences using deep learning.
         - **🔍 Gene Validation**: Validates with machine learning.
         - **🧬 Gene Classification**: Classifies sequence type (F gene or other).
         - **🌳 Phylogenetic Analysis**: Builds maximum likelihood and simplified trees.
         **Instructions:**
         1. Enter your sequence or upload a FASTA file
         2. Adjust similarity threshold (1-99%)
         3. Choose whether to build ML tree (requires MAFFT & IQ-TREE)
         4. Click "Run Analysis" to start
         """)
         with gr.Tab("🔬 Analysis Pipeline"):
             with gr.Row():
                 with gr.Column(scale=2):
                     gr.Markdown("### Input Sequence")
                     dna_input = gr.Textbox(label="DNA Sequence", placeholder="Enter your DNA sequence here (ATCG format)...", lines=5, max_lines=10)
                     fasta_file = gr.File(label="Or Upload FASTA File", file_types=[".fasta", ".fa", ".fas", ".txt"])
                     with gr.Row():
                         similarity_score = gr.Slider(minimum=1, maximum=99, value=95.0, step=1.0, label="Similarity Threshold (%)", info="Minimum similarity for phylogenetic analysis")
                         build_ml_tree = gr.Checkbox(label="Build ML Tree", value=False, info="Build maximum likelihood tree (requires MAFFT & IQ-TREE)")
                     with gr.Row():
                         run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
                         clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                 with gr.Column(scale=1):
                     gr.Markdown("### Analysis Status")
                     status_display = gr.Textbox(label="Status", value="Ready to analyze", interactive=False, lines=3)
                     gr.Markdown("### Available Models")
                     availability = (
                         ("Boundary Detection Model", boundary_model),
                         ("Gene Validation Model", keras_model),
                         ("Gene Classification Model", classifier_model),
                         ("Tree Analysis Module", analyzer),
                     )
                     # Render as a bullet list: single newlines inside a Markdown
                     # paragraph are soft breaks, which would run the entries together.
                     gr.Markdown("\n".join(
                         f"- {'✅' if component else '❌'} {label}"
                         for label, component in availability
                     ))
         with gr.Tab("📊 Results"):
             with gr.Row():
                 with gr.Column():
                     boundary_output = gr.Textbox(label="🎯 F Gene Extraction", lines=5, interactive=False)
                     keras_output = gr.Textbox(label="🔍 Gene Validation", lines=5, interactive=False)
                     classifier_status = gr.Textbox(label="🧬 Classification Status", lines=1, interactive=False)
                     classifier_message = gr.Textbox(label="📝 Classification Message", lines=6, interactive=False)
                     classifier_label = gr.Textbox(label="🏷️ Predicted Label", lines=1, interactive=False)
                     classifier_confidence = gr.Textbox(label="📊 Confidence Score", lines=1, interactive=False)
                 with gr.Column():
                     ml_tree_output = gr.Textbox(label="🌳 Maximum Likelihood Tree", lines=5, interactive=False)
                     simplified_ml_output = gr.Textbox(label="📈 Simplified Phylogenetic Analysis", lines=3, interactive=False)
             gr.Markdown("### 🌲 Phylogenetic Tree Visualization")
             tree_html = gr.HTML(label="Interactive Tree", value="<p>No tree generated yet. Run analysis to see results.</p>")
             gr.Markdown("### 📁 Download Results")
             with gr.Row():
                 aligned_file = gr.File(label="Aligned Sequences (FASTA)", interactive=False)
                 phy_file = gr.File(label="Phylogenetic Tree File", interactive=False)
                 html_file = gr.File(label="Interactive Tree (HTML)", interactive=False)
         with gr.Tab("ℹ️ Help & Info"):
             gr.Markdown("""
             ## About This Tool
             ### F Gene Analysis Pipeline
             - **🎯 F Gene Extraction**: Extracts F gene sequences using deep learning.
             - **🔍 Gene Validation**: Validates with k-mer based machine learning.
             - **🧬 Gene Classification**: Classifies sequences (F gene or other) with confidence scores.
             - **🌳 Phylogenetic Analysis**: Builds ML and simplified trees.
             ### Input Requirements
             - DNA Sequences: ATCG format, minimum 50 bp (preferably 1500-2000 bp for F gene).
             - FASTA Files: Standard format.
             - Similarity Threshold: 1-99%.
             ### Dependencies
             **For ML Trees:**
             ```bash
             # Ubuntu/Debian: sudo apt-get update && sudo apt-get install mafft iqtree
             # macOS: brew install mafft iqtree
             # Conda: conda install -c bioconda mafft iqtree
             ```
             ### Troubleshooting
             - *"No similar sequences"*: Lower similarity threshold.
             - *"Sequence too short"*: Provide >50 bp (ideally >1500 bp for F gene).
             - *"MAFFT/IQ-TREE not found"*: Install dependencies.
             - *"Model not available"*: Check model files.
             """)

         def run_analysis_combined(dna_seq, file_obj, sim_score, build_tree):
             """Dispatch to the file pipeline when a file is uploaded, else use the text box."""
             if file_obj is not None:
                 return run_pipeline_from_file(file_obj, sim_score, build_tree)
             else:
                 return enhanced_run_pipeline(
                     dna_seq, keras_model, kmer_to_index, classifier_model,
                     classifier_kmer_to_index, classifier_maxlen, LABELS,
                     sim_score, build_tree
                 )

         def clear_inputs():
             """Reset the input widgets and status line to their initial values."""
             return "", None, 95.0, False, "Ready to analyze"

         # The output order must match the 13-field tuple returned by the pipeline.
         run_btn.click(
             fn=run_analysis_combined,
             inputs=[dna_input, fasta_file, similarity_score, build_ml_tree],
             outputs=[
                 boundary_output, keras_output, classifier_status, classifier_message,
                 classifier_label, classifier_confidence, ml_tree_output, simplified_ml_output,
                 tree_html, aligned_file, phy_file, html_file, status_display
             ]
         )
         clear_btn.click(
             fn=clear_inputs,
             outputs=[dna_input, fasta_file, similarity_score, build_ml_tree, status_display]
         )
         example_btn = gr.Button("Load Example F Gene Sequence", variant="secondary")

         def load_example():
             """Return a sample sequence and a status message for the example button."""
             example_seq = "ATGAAACTGTCAACACTCACTGAGTACATTAGCCAAGTTCTCAAGACTGAGTGTTTACCTTTGTGAATACACTGAGTCCTTGTCAACGTTCGGCTGCAGTCACACTGATGGTCTTGTCTTCAGGAGCAACTGCAGTCTGTGCTGTGTACTATAGTGCTAAGAGTGATAATGCACTGTTCAGTACCTTTGACAGTGTGTCTCTGTCACCTGGTGCTATGCAGAGCTGCGATGAGATCTACATTGGTCTGATCGATAAGACTGAGTCCAAGGGTGTTGCTGTGTGTACTGTAGAGTGTGATAGTGTTGCCTGCACTGTGTCTATGGCTGATCTTGAGGCTCTGCTTATGTCAACACTGAGTGTGAAATGTTCATTTGCTACTTCAAGACTGATGTGAAGACTGTGTATTGTACTCAGTCATGCAGAGTGAAGTCCTTGAGCCACTTGCTTTGTACAATGTGGGTGATGAGATGTTGTGCTGCAGTGTCAAGGGGCCACAGTCTTGCCTTGATAGTGCGATTGCTGTGATGATGTGCACTTCAATGAGTGGTCGAGATGCTGCTGTGTGTAAGGATGCTGCTGTGTGTAAGAAGGATGCTGCTGTGTGTAAGA"
             return example_seq, "Example F gene sequence loaded"

         example_btn.click(fn=load_example, outputs=[dna_input, status_display])
     return iface
 # --- Main Execution ---
 if __name__ == "__main__":
     # Script entry point: build the UI, then start the Gradio server.
     # All launch settings are gathered in one mapping and passed as keywords.
     launch_options = {
         "server_name": "0.0.0.0",
         "server_port": 7860,
         "share": False,
         "debug": True,
         "show_error": True,
         "max_threads": 4,
         "auth": None,
         "ssl_verify": False,
         "quiet": False,
     }
     interface = create_interface()
     interface.launch(**launch_options)