Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Files Community

re-type committed on Jun 9, 2025

Commit

20b71b9

verified ·

1 Parent(s): b5a86a2

Update app.py

Browse files

Files changed (1) hide show

app.py +305 -1042

app.py CHANGED Viewed

@@ -8,201 +8,132 @@ import os
 import re
 import logging
 import numpy as np
-from predictor import GenePredictor
-from tensorflow.keras.models import load_model
-import ml_simplified_tree
 import tempfile
 import shutil
 import sys
 from pathlib import Path
 # --- Global Variables ---
 MAFFT_PATH = "mafft/mafftdir/bin/mafft"  # Update this path as needed
 IQTREE_PATH = "iqtree/bin/iqtree2"  # Update this path as needed
-# --- Logging ---
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-# --- Paths ---
-from huggingface_hub import hf_hub_download
-# Model repository and file paths
-model_repo = "GGproject10/best_boundary_aware_model"
-csv_path = "f cleaned.csv"
-# Get HF token from environment (if available)
-hf_token = os.getenv("HF_TOKEN")
-# --- Load Models ---
 boundary_model = None
 keras_model = None
 kmer_to_index = None
-# Try to load boundary model from Hugging Face Hub
-try:
-    boundary_path = hf_hub_download(
-        repo_id=model_repo,
-        filename="best_boundary_aware_model.pth",
-        token=hf_token
-    )
-    if os.path.exists(boundary_path):
-        boundary_model = GenePredictor(boundary_path)
-        logging.info("Boundary model loaded successfully from Hugging Face Hub.")
-    else:
-        logging.warning(f"Boundary model file not found after download")
-except Exception as e:
-    logging.error(f"Failed to load boundary model from HF Hub: {e}")
-# Try to load Keras model from Hugging Face Hub
-try:
-    keras_path = hf_hub_download(
-        repo_id=model_repo,
-        filename="best_model.keras",
-        token=hf_token
-    )
-    kmer_path = hf_hub_download(
-        repo_id=model_repo,
-        filename="kmer_to_index.pkl",
-        token=hf_token
-    )
-    if os.path.exists(keras_path) and os.path.exists(kmer_path):
-        keras_model = load_model(keras_path)
-        with open(kmer_path, "rb") as f:
-            kmer_to_index = pickle.load(f)
-        logging.info("Keras model and k-mer index loaded successfully from Hugging Face Hub.")
     else:
-        logging.warning(f"Keras model or kmer files not found after download")
-except Exception as e:
-    logging.error(f"Failed to load Keras model from HF Hub: {e}")
-# --- Load Verification Models from models directory ---
-verification_models = {}
-def load_verification_models():
-    """Load all verification models from the models directory"""
-    global verification_models
-    models_dir = "models"
-    if not os.path.exists(models_dir):
-        logging.warning(f"Models directory not found: {models_dir}")
-        return
-    # Load different types of verification models
-    model_files = {
-        "boundary_model": "best_boundary_aware_model.pth",
-        "keras_model": "best_model.keras",
-        "kmer_index": "kmer_to_index.pkl",
-        "additional_model_1": "verification_model_1.pth",  # Add your model names here
-        "additional_model_2": "verification_model_2.keras",
-        # Add more models as needed
-    }
-    for model_name, filename in model_files.items():
-        model_path = os.path.join(models_dir, filename)
         try:
-            if os.path.exists(model_path):
-                if filename.endswith('.pth'):
-                    # PyTorch model
-                    if model_name == "boundary_model":
-                        verification_models[model_name] = GenePredictor(model_path)
-                    else:
-                        verification_models[model_name] = torch.load(model_path, map_location='cpu')
-                elif filename.endswith('.keras'):
-                    # Keras model
-                    verification_models[model_name] = load_model(model_path)
-                elif filename.endswith('.pkl'):
-                    # Pickle file
-                    with open(model_path, 'rb') as f:
-                        verification_models[model_name] = pickle.load(f)
-                logging.info(f"Loaded verification model: {model_name}")
         except Exception as e:
-            logging.error(f"Failed to load {model_name} from {model_path}: {e}")
-# Load verification models at startup
-load_verification_models()
 # --- Initialize Tree Analyzer ---
-analyzer = None
-try:
-    analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
-    if os.path.exists(csv_path):
-        if analyzer.load_data(csv_path):
-            logging.info("Tree analyzer initialized successfully")
-            # Try to train AI model (optional)
-            try:
-                if not analyzer.train_ai_model():
-                    logging.warning("AI model training failed; proceeding with basic analysis.")
-            except Exception as e:
-                logging.warning(f"AI model training failed: {e}")
-        else:
-            logging.error("Failed to load CSV data for tree analyzer")
             analyzer = None
     else:
-        logging.error(f"CSV file not found: {csv_path}")
         analyzer = None
-except Exception as e:
-    logging.error(f"Failed to initialize tree analyzer: {e}")
-    analyzer = None
-# --- Enhanced Tool Detection ---
 def check_tool_availability():
-    """Enhanced check for MAFFT and IQ-TREE availability with multiple fallback options"""
-    # Check MAFFT
-    mafft_available = False
-    mafft_cmd = None
-    # Try multiple MAFFT locations
     mafft_candidates = [
-        MAFFT_PATH,
-        'mafft',
-        '/usr/bin/mafft',
-        '/usr/local/bin/mafft',
-        'mafft.bat',  # Windows
     ]
-    for candidate in mafft_candidates:
-        if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
-            mafft_available = True
-            mafft_cmd = candidate
-            logging.info(f"Found MAFFT at: {candidate}")
-            break
-    # Check IQ-TREE
-    iqtree_available = False
-    iqtree_cmd = None
-    # Try multiple IQ-TREE locations and names
     iqtree_candidates = [
-        IQTREE_PATH,
-        'iqtree2',
-        'iqtree',
-        '/usr/bin/iqtree2',
-        '/usr/local/bin/iqtree2',
-        '/usr/bin/iqtree',
-        '/usr/local/bin/iqtree',
-        'iqtree2.exe',  # Windows
-        'iqtree.exe',   # Windows
     ]
-    for candidate in iqtree_candidates:
-        if candidate and (os.path.exists(candidate) or shutil.which(candidate) is not None):
-            iqtree_available = True
-            iqtree_cmd = candidate
-            logging.info(f"Found IQ-TREE at: {candidate}")
-            break
-    return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
 def install_dependencies_guide():
-    """Provide installation guidance for missing dependencies"""
-    guide = """
 🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
 For MAFFT:
@@ -217,999 +148,331 @@ For IQ-TREE:
 - macOS: brew install iqtree
 - Windows: Download from http://www.iqtree.org/
-Alternative: Use conda/mamba:
-- conda install -c bioconda mafft iqtree
-Docker option:
-- docker run -it --rm -v $(pwd):/data quay.io/biocontainers/mafft:7.490--h779adbc_0
-- docker run -it --rm -v $(pwd):/data quay.io/biocontainers/iqtree:2.1.4_beta--hdcc8f71_0
 """
-    return guide
 def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
-    """Run MAFFT alignment with enhanced error handling"""
     try:
-        # MAFFT command with more robust options
-        cmd = [
-            mafft_cmd,
-            '--auto',  # Automatic strategy selection
-            '--quiet',  # Reduce output verbosity
-            input_fasta
-        ]
-        logging.info(f"Running MAFFT: {' '.join(cmd)}")
-        # Run MAFFT with enhanced error handling
-        result = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=600,  # Increased timeout to 10 minutes
-            cwd=os.getcwd()  # Ensure working directory is set
-        )
         if result.returncode == 0:
-            # Write aligned sequences to output file
             with open(output_fasta, 'w') as f:
                 f.write(result.stdout)
-            logging.info(f"MAFFT alignment completed: {output_fasta}")
-            # Verify output file
-            if os.path.exists(output_fasta) and os.path.getsize(output_fasta) > 0:
                 return True, output_fasta
-            else:
-                return False, "MAFFT completed but output file is empty"
-        else:
-            error_msg = result.stderr.strip() if result.stderr else "Unknown MAFFT error"
-            logging.error(f"MAFFT failed: {error_msg}")
-            return False, f"MAFFT error: {error_msg}"
-    except subprocess.TimeoutExpired:
-        logging.error("MAFFT timeout")
-        return False, "MAFFT timeout (>10 minutes). Try with fewer sequences."
-    except FileNotFoundError:
-        return False, f"MAFFT executable not found: {mafft_cmd}"
     except Exception as e:
-        logging.error(f"MAFFT execution failed: {e}")
-        return False, f"MAFFT execution failed: {str(e)}"
 def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
-    """Run IQ-TREE with enhanced options and error handling"""
     try:
-        # Enhanced IQ-TREE command
         cmd = [
-            iqtree_cmd,
-            '-s', aligned_fasta,
-            '-m', 'MFP',  # ModelFinder Plus for automatic model selection
-            '-bb', '1000',  # Bootstrap replicates
-            '-alrt', '1000',  # SH-aLRT test
-            '-nt', 'AUTO',  # Auto detect threads
-            '--prefix', output_prefix,
-            '-redo',  # Overwrite existing files
-            '--quiet'  # Reduce verbosity
         ]
-        logging.info(f"Running IQ-TREE: {' '.join(cmd)}")
-        # Run IQ-TREE with enhanced error handling
-        result = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=1200,  # 20 minute timeout for larger datasets
-            cwd=os.getcwd()
-        )
-        if result.returncode == 0:
-            tree_file = f"{output_prefix}.treefile"
-            if os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
-                logging.info(f"IQ-TREE analysis completed: {tree_file}")
-                return True, tree_file
-            else:
-                logging.error("IQ-TREE completed but tree file not found or empty")
-                return False, "Tree file not generated or empty"
-        else:
-            error_msg = result.stderr.strip() if result.stderr else "Unknown IQ-TREE error"
-            logging.error(f"IQ-TREE failed: {error_msg}")
-            return False, f"IQ-TREE error: {error_msg}"
-    except subprocess.TimeoutExpired:
-        logging.error("IQ-TREE timeout")
-        return False, "IQ-TREE timeout (>20 minutes). Try with fewer sequences or simpler model."
-    except FileNotFoundError:
-        return False, f"IQ-TREE executable not found: {iqtree_cmd}"
     except Exception as e:
-        logging.error(f"IQ-TREE execution failed: {e}")
-        return False, f"IQ-TREE execution failed: {str(e)}"
-def create_simple_neighbor_joining_tree(sequences_dict):
-    """Create a simple distance-based tree when ML tools are not available"""
     try:
-        # This is a simplified implementation
-        # In a real scenario, you'd want to use a proper NJ implementation
-        import random
         seq_names = list(sequences_dict.keys())
-        n_seqs = len(seq_names)
-        if n_seqs < 2:
-            return None, "Need at least 2 sequences for tree construction"
-        # Create a simple Newick tree structure
-        if n_seqs == 2:
-            tree_str = f"({seq_names[0]}:0.1,{seq_names[1]}:0.1);"
-        else:
-            # Simple clustering approach
-            tree_str = "(" + ",".join([f"{name}:0.1" for name in seq_names[:5]]) + ");"
-        # Save to temporary file
         tree_file = "simple_tree.nwk"
         with open(tree_file, 'w') as f:
             f.write(tree_str)
-        return tree_file, "Simple distance-based tree created"
     except Exception as e:
         return None, f"Simple tree creation failed: {str(e)}"
-def create_multi_fasta_with_query(query_sequence, query_id="Query_F_Gene"):
-    """Create a multi-FASTA file with query sequence and reference sequences"""
     try:
-        # Create temporary FASTA file
         temp_fasta = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
-        # Add query sequence
         temp_fasta.write(f">{query_id}\n{query_sequence}\n")
-        # Add reference sequences from existing aligned FASTA if available
         ref_fasta_path = "f_gene_sequences_aligned.fasta"
         if os.path.exists(ref_fasta_path):
             with open(ref_fasta_path, 'r') as ref_file:
                 temp_fasta.write(ref_file.read())
-            logging.info(f"Added reference sequences from {ref_fasta_path}")
-        else:
-            # If no reference file, try to create from CSV data
-            if analyzer and hasattr(analyzer, 'data'):
-                count = 0
-                for idx, row in analyzer.data.iterrows():
-                    if 'sequence' in row and len(str(row['sequence'])) > 50:
-                        seq_id = row.get('id', f"Ref_{count}")
-                        sequence = str(row['sequence']).upper()
-                        temp_fasta.write(f">{seq_id}\n{sequence}\n")
-                        count += 1
-                        if count >= 20:  # Limit to prevent too large datasets
-                            break
-                logging.info(f"Added {count} reference sequences from CSV")
         temp_fasta.close()
         return temp_fasta.name
     except Exception as e:
-        logging.error(f"Failed to create multi-FASTA: {e}")
         return None
-def build_maximum_likelihood_tree(f_gene_sequence):
-    """Build maximum likelihood phylogenetic tree with comprehensive fallback options"""
     try:
-        # Check tool availability with enhanced detection
-        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
-        # Prepare status message
-        status_msg = "🔍 Checking dependencies...\n"
-        if not mafft_available:
-            status_msg += "❌ MAFFT not found\n"
-        else:
-            status_msg += f"✅ MAFFT found: {mafft_cmd}\n"
-        if not iqtree_available:
-            status_msg += "❌ IQ-TREE not found\n"
-        else:
-            status_msg += f"✅ IQ-TREE found: {iqtree_cmd}\n"
-        # If neither tool is available, provide installation guide
-        if not mafft_available and not iqtree_available:
             guide = install_dependencies_guide()
-            return False, f"{status_msg}\n{guide}", None, None
-        # If only one tool is missing, provide specific guidance
-        if not mafft_available:
-            return False, f"{status_msg}\n❌ MAFFT is required for sequence alignment. Please install MAFFT first.", None, None
-        if not iqtree_available:
-            status_msg += "\n⚠️  IQ-TREE not available. Attempting simple tree construction...\n"
-            # Try to create a simple tree as fallback
-            multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
-            if multi_fasta:
-                # Read sequences
-                sequences = {}
-                current_seq = ""
-                current_name = ""
-                with open(multi_fasta, 'r') as f:
-                    for line in f:
-                        line = line.strip()
-                        if line.startswith('>'):
-                            if current_name and current_seq:
-                                sequences[current_name] = current_seq
-                            current_name = line[1:]
-                            current_seq = ""
-                        else:
-                            current_seq += line
-                    if current_name and current_seq:
-                        sequences[current_name] = current_seq
-                simple_tree, simple_msg = create_simple_neighbor_joining_tree(sequences)
-                os.unlink(multi_fasta)
-                if simple_tree:
-                    return True, f"{status_msg}✅ {simple_msg}", None, simple_tree
-                else:
-                    return False, f"{status_msg}❌ {simple_msg}", None, None
-            else:
-                return False, f"{status_msg}❌ Failed to create input sequences", None, None
-        # Both tools available - proceed with full ML analysis
-        # Create output directory
-        output_dir = "ml_tree_output"
-        os.makedirs(output_dir, exist_ok=True)
-        # Step 1: Create multi-FASTA file with query and reference sequences
-        logging.info("Creating multi-FASTA file...")
-        multi_fasta = create_multi_fasta_with_query(f_gene_sequence)
         if not multi_fasta:
-            return False, f"{status_msg}❌ Failed to create input FASTA", None, None
-        # Step 2: Run MAFFT alignment
-        logging.info("Running MAFFT alignment...")
-        aligned_fasta = os.path.join(output_dir, "aligned_sequences.fasta")
         mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
-        # Clean up temporary file
         os.unlink(multi_fasta)
         if not mafft_success:
-            return False, f"{status_msg}❌ MAFFT failed: {mafft_result}", None, None
-        # Step 3: Run IQ-TREE analysis
-        logging.info("Running IQ-TREE analysis...")
-        tree_prefix = os.path.join(output_dir, "ml_tree")
         iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
         if not iqtree_success:
-            return False, f"{status_msg}❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
-        # Step 4: Prepare output files
         tree_file = iqtree_result
-        log_file = f"{tree_prefix}.log"
-        # Copy to standard names for compatibility
-        standard_aligned = "f_gene_sequences_aligned.fasta"
-        standard_tree = "f_gene_sequences.phy.treefile"
-        if os.path.exists(aligned_fasta):
-            shutil.copy2(aligned_fasta, standard_aligned)
-        if os.path.exists(tree_file):
-            shutil.copy2(tree_file, standard_tree)
-        success_msg = f"{status_msg}✅ Maximum likelihood tree built successfully!\n"
-        success_msg += f"- Alignment: {os.path.basename(aligned_fasta)}\n"
-        success_msg += f"- Tree: {os.path.basename(tree_file)}\n"
-        if os.path.exists(log_file):
-            try:
-                with open(log_file, 'r') as f:
-                    log_content = f.read()
-                    # Extract model information
-                    if "Best-fit model:" in log_content:
-                        model_lines = [line for line in log_content.split('\n') if "Best-fit model:" in line]
-                        if model_lines:
-                            success_msg += f"- {model_lines[0].strip()}\n"
-            except Exception as e:
-                logging.warning(f"Could not read log file: {e}")
-        logging.info("Maximum likelihood tree construction completed")
         return True, success_msg, aligned_fasta, tree_file
     except Exception as e:
         logging.error(f"ML tree construction failed: {e}")
         return False, f"ML tree construction failed: {str(e)}", None, None
-# --- Tree Analysis Function (Fixed for display) ---
-def analyze_sequence_for_tree(sequence: str, matching_percentage: float) -> tuple:
-    """
-    Analyze sequence and create phylogenetic tree - FIXED to return HTML content properly
-    Returns: (html_content, html_file_path, success_message)
-    """
-    try:
-        if not analyzer:
-            return None, None, "Error: Tree analyzer not initialized. Please check if the CSV data file is available."
-        if not sequence:
-            return None, None, "Error: Please provide a sequence."
-        if not (1 <= matching_percentage <= 99):
-            return None, None, "Error: Matching percentage must be between 1 and 99."
-        # Find query sequence
-        if not analyzer.find_query_sequence(sequence):
-            return None, None, "Error: Invalid query sequence or sequence not found in dataset."
-        # Set matching percentage
-        analyzer.matching_percentage = matching_percentage
-        # Find similar sequences
-        matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
-        if not matched_ids:
-            return None, None, f"No similar sequences found at {matching_percentage}% similarity. Try lowering the threshold."
-        logging.info(f"Found {len(matched_ids)} similar sequences at {actual_percentage:.1f}% similarity")
-        # Build tree structure
-        tree_structure = analyzer.build_tree_structure(matched_ids)
-        if not tree_structure:
-            return None, None, "Error: Failed to build tree structure."
-        # Create interactive tree
-        fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
-        if not fig:
-            return None, None, "Error: Failed to create tree visualization."
-        # Generate HTML content
-        html_content = fig.to_html(full_html=True, include_plotlyjs='cdn')
-        # Save to output folder
-        output_dir = "output"
-        os.makedirs(output_dir, exist_ok=True)
-        # Create a safe filename
-        safe_seq_name = re.sub(r'[^a-zA-Z0-9]', '_', sequence[:20])
-        html_filename = os.path.join(output_dir, f"tree_{safe_seq_name}_{matching_percentage}.html")
-        with open(html_filename, "w", encoding='utf-8') as f:
-            f.write(html_content)
-        logging.info(f"Tree HTML saved to {html_filename}")
-        success_msg = f"✅ Simplified phylogenetic tree generated successfully!\n"
-        success_msg += f"- {len(matched_ids)} sequences analyzed\n"
-        success_msg += f"- Similarity threshold: {actual_percentage:.1f}%\n"
-        success_msg += f"- Tree file: {os.path.basename(html_filename)}"
-        return html_content, html_filename, success_msg
-    except Exception as e:
-        error_msg = f"Tree analysis error: {str(e)}"
-        logging.error(error_msg)
-        import traceback
-        logging.error(f"Full traceback: {traceback.format_exc()}")
-        return None, None, error_msg
-# --- Verification Functions for Hugging Face Models ---
-def run_verification_pipeline(sequence, model_names=None):
-    """
-    Run verification using models from the models directory
-    Args:
-        sequence: DNA sequence to verify
-        model_names: List of model names to use (None = use all available)
-    Returns:
-        Dictionary with verification results from each model
-    """
     results = {}
-    if not verification_models:
-        return {"error": "No verification models loaded from models directory"}
-    # Use all models if none specified
-    if model_names is None:
-        model_names = list(verification_models.keys())
-    for model_name in model_names:
-        if model_name not in verification_models:
-            results[model_name] = f"Model {model_name} not found"
-            continue
         try:
-            model = verification_models[model_name]
-            if model_name == "boundary_model" and hasattr(model, 'predict'):
-                # Boundary prediction model
-                predictions, probs, confidence = model.predict(sequence)
-                regions = model.extract_gene_regions(predictions, sequence)
-                results[model_name] = {
-                    "type": "boundary_detection",
-                    "confidence": confidence,
-                    "regions_found": len(regions) if regions else 0,
-                    "extracted_sequence": regions[0]["sequence"] if regions else None
-                }
-            elif model_name == "keras_model":
-                # Keras model for gene validation
-                if len(sequence) < 6:
-                    results[model_name] = {"error": "Sequence too short for k-mer analysis"}
-                    continue
-                # Generate k-mers
-                kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
-                kmer_index = verification_models.get("kmer_index", {})
-                indices = [kmer_index.get(kmer, 0) for kmer in kmers]
-                # Prepare input
-                input_arr = np.array([indices])
-                prediction = model.predict(input_arr, verbose=0)[0]
-                results[model_name] = {
-                    "type": "gene_validation",
-                    "prediction_scores": prediction.tolist(),
-                    "mean_score": float(np.mean(prediction)),
-                    "max_score": float(np.max(prediction))
-                }
-            else:
-                # Generic model handling
-                results[model_name] = {
-                    "type": "generic",
-                    "status": "Model loaded but no specific handler implemented",
-                    "model_type": type(model).__name__
-                }
         except Exception as e:
-            results[model_name] = {"error": str(e)}
-            logging.error(f"Verification failed for {model_name}: {e}")
     return results
-def format_verification_results(verification_results):
-    """Format verification results for display"""
-    if not verification_results:
-        return "No verification results available"
-    if "error" in verification_results:
-        return f"Verification Error: {verification_results['error']}"
-    formatted = "🔍 VERIFICATION RESULTS:\n\n"
-    for model_name, result in verification_results.items():
-        formatted += f"📊 {model_name.upper()}:\n"
-        if isinstance(result, dict):
-            if "error" in result:
-                formatted += f"   ❌ Error: {result['error']}\n"
-            elif result.get("type") == "boundary_detection":
-                formatted += f"   ✅ Confidence: {result.get('confidence', 'N/A'):.3f}\n"
-                formatted += f"   🎯 Regions Found: {result.get('regions_found', 0)}\n"
-                if result.get('extracted_sequence'):
-                    seq_len = len(result['extracted_sequence'])
-                    formatted += f"   📏 Extracted Length: {seq_len} bp\n"
-            elif result.get("type") == "gene_validation":
-                formatted += f"   📈 Mean Score: {result.get('mean_score', 0):.3f}\n"
-                formatted += f"   🔝 Max Score: {result.get('max_score', 0):.3f}\n"
             else:
-                formatted += f"   ℹ️  Status: {result.get('status', 'Processed')}\n"
-        else:
-            formatted += f"   📝 Result: {str(result)}\n"
-        formatted += "\n"
-    return formatted
-# --- Keras Prediction ---
-def predict_with_keras(sequence):
-    try:
-        if not keras_model or not kmer_to_index:
-            return f"Keras model not available. Input sequence: {sequence[:100]}..."
-        if len(sequence) < 6:
-            return "Sequence too short for k-mer prediction (minimum 6 nucleotides required)."
-        # Generate k-mers
-        kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
-        indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
-        # Prepare input
-        input_arr = np.array([indices])
-        prediction = keras_model.predict(input_arr, verbose=0)[0]
-        # Format prediction as probabilities/scores
-        mean_score = np.mean(prediction)
-        max_score = np.max(prediction)
-        min_score = np.min(prediction)
-        result = f"Keras Model Prediction Results:\n"
-        result += f"- Mean Score: {mean_score:.4f}\n"
-        result += f"- Max Score: {max_score:.4f}\n"
-        result += f"- Min Score: {min_score:.4f}\n"
-        result += f"- Total K-mers: {len(kmers)}\n"
-        result += f"- Sequence Length: {len(sequence)} bp"
-        return result
-    except Exception as e:
-        logging.error(f"Keras prediction error: {e}")
-        return f"Keras prediction failed: {str(e)}"
-# --- Boundary Prediction ---
-def predict_with_boundary(sequence):
-    try:
-        if not boundary_model:
-            return f"Boundary model not available. Input sequence: {sequence[:100]}..."
-        # Get predictions from boundary model
-        predictions, probabilities, confidence = boundary_model.predict(sequence)
-        # Extract gene regions
-        regions = boundary_model.extract_gene_regions(predictions, sequence)
-        result = f"Boundary Model Prediction Results:\n"
-        result += f"- Overall Confidence: {confidence:.4f}\n"
-        result += f"- Regions Detected: {len(regions) if regions else 0}\n"
-        if regions:
-            for i, region in enumerate(regions[:3]):  # Show first 3 regions
-                result += f"\nRegion {i+1}:\n"
-                result += f"  - Start: {region['start']}\n"
-                result += f"  - End: {region['end']}\n"
-                result += f"  - Length: {len(region['sequence'])} bp\n"
-                result += f"  - Confidence: {region.get('confidence', 'N/A'):.4f}\n"
-        return result
-    except Exception as e:
-        logging.error(f"Boundary prediction error: {e}")
-        return f"Boundary prediction failed: {str(e)}"
-# --- Combined Prediction Function ---
-def predict_gene_sequence(sequence):
-    """Combined prediction using both models"""
-    try:
-        if not sequence or len(sequence.strip()) == 0:
-            return "Please provide a DNA sequence."
-        # Clean and validate sequence
-        sequence = re.sub(r'[^ATCG]', '', sequence.upper())
-        if len(sequence) < 10:
-            return "Sequence too short. Please provide at least 10 nucleotides."
-        results = []
-        results.append(f"🧬 GENE SEQUENCE ANALYSIS\n")
-        results.append(f"Input sequence length: {len(sequence)} bp\n")
-        results.append("=" * 50)
-        # Boundary model prediction
-        if boundary_model:
-            results.append("\n🎯 BOUNDARY DETECTION:")
-            boundary_result = predict_with_boundary(sequence)
-            results.append(boundary_result)
-        else:
-            results.append("\n❌ Boundary model not available")
-        # Keras model prediction
-        if keras_model:
-            results.append("\n🔍 KERAS MODEL ANALYSIS:")
-            keras_result = predict_with_keras(sequence)
-            results.append(keras_result)
-        else:
-            results.append("\n❌ Keras model not available")
-        # Verification models
-        if verification_models:
-            results.append("\n🔬 VERIFICATION ANALYSIS:")
-            verification_result = run_verification_pipeline(sequence)
-            formatted_verification = format_verification_results(verification_result)
-            results.append(formatted_verification)
-        return "\n".join(results)
-    except Exception as e:
-        logging.error(f"Gene prediction error: {e}")
-        return f"Gene prediction failed: {str(e)}"
-# --- File Processing Functions ---
 def process_fasta_file(file):
-    """Parse an uploaded FASTA file and run gene prediction on its records.
-
-    Args:
-        file: Uploaded-file object; its ``name`` attribute is opened as the
-            path of the FASTA file. ``None`` means nothing was uploaded.
-
-    Returns:
-        A newline-joined text report. Only the first five records are
-        analysed, and a record needs at least 10 A/T/C/G characters to be
-        passed to ``predict_gene_sequence``. Any exception is logged and
-        turned into a failure message instead of being raised.
-    """
     try:
-        if file is None:
             return "Please upload a FASTA file."
-        # Read file content
-        with open(file.name, 'r') as f:
-            content = f.read()
-        # Parse FASTA
         sequences = {}
         current_seq = ""
         current_name = ""
-        lines = content.strip().split('\n')
-        for line in lines:
-            line = line.strip()
-            if line.startswith('>'):
-                # A new header closes the previous record (if it has data).
-                if current_name and current_seq:
-                    sequences[current_name] = current_seq
-                current_name = line[1:]  # Remove '>'
-                current_seq = ""
-            else:
-                current_seq += line.upper()
-        # Add last sequence
         if current_name and current_seq:
             sequences[current_name] = current_seq
         if not sequences:
-            return "No valid sequences found in FASTA file."
-        # Process each sequence
-        results = []
-        results.append(f"📁 FASTA FILE ANALYSIS")
-        results.append(f"Found {len(sequences)} sequences\n")
-        results.append("=" * 60)
         for i, (name, seq) in enumerate(sequences.items()):
-            if i >= 5:  # Limit to first 5 sequences
                 results.append(f"\n... and {len(sequences) - 5} more sequences")
                 break
-            results.append(f"\n🧬 Sequence: {name}")
-            results.append(f"Length: {len(seq)} bp")
-            # Clean sequence: keep only A/T/C/G (already upper-cased above)
             clean_seq = re.sub(r'[^ATCG]', '', seq)
             if len(clean_seq) >= 10:
-                # Run prediction on cleaned sequence
-                prediction = predict_gene_sequence(clean_seq)
-                results.append(prediction)
             else:
                 results.append("❌ Sequence too short or invalid")
             results.append("-" * 40)
         return "\n".join(results)
     except Exception as e:
-        logging.error(f"FASTA processing error: {e}")
         return f"FASTA processing failed: {str(e)}"
-# --- Tree Building Interface Functions ---
-def build_tree_interface(sequence):
-    """Build a maximum-likelihood tree for one sequence and describe the result.
-
-    Args:
-        sequence: Raw DNA text; it is upper-cased and every character other
-            than A/T/C/G is dropped before use.
-
-    Returns:
-        A text report containing the message from
-        ``build_maximum_likelihood_tree`` and, on success, the tree file's
-        name, size and content (only the first 500 characters when longer).
-        Validation failures and exceptions are returned as messages too.
-    """
-    try:
-        if not sequence or len(sequence.strip()) == 0:
-            return "Please provide a DNA sequence for tree construction."
-        # Clean sequence
-        clean_seq = re.sub(r'[^ATCG]', '', sequence.upper())
-        if len(clean_seq) < 50:
-            return "Sequence too short for phylogenetic analysis (minimum 50 bp required)."
-        # Try ML tree construction first
-        success, message, aligned_file, tree_file = build_maximum_likelihood_tree(clean_seq)
-        result = f"🌳 PHYLOGENETIC TREE CONSTRUCTION\n"
-        result += f"Input sequence length: {len(clean_seq)} bp\n"
-        result += "=" * 50 + "\n\n"
-        result += message
-        if success and tree_file:
-            # Try to read and display tree
-            try:
-                with open(tree_file, 'r') as f:
-                    tree_content = f.read().strip()
-                result += f"\n\n📄 Tree file content:\n"
-                result += f"File: {os.path.basename(tree_file)}\n"
-                result += f"Size: {len(tree_content)} characters\n"
-                # Show first part of tree if it's very long
-                if len(tree_content) > 500:
-                    result += f"Preview: {tree_content[:500]}...\n"
-                else:
-                    result += f"Content: {tree_content}\n"
-            except Exception as e:
-                # An unreadable tree file only adds a warning to the report.
-                result += f"\n⚠️  Could not read tree file: {e}"
-        return result
-    except Exception as e:
-        logging.error(f"Tree building interface error: {e}")
-        return f"Tree construction failed: {str(e)}"
-def analyze_tree_interface(sequence, similarity_threshold):
-    """Run the similarity-based tree analysis and format its outcome.
-
-    Args:
-        sequence: Raw DNA text; upper-cased and stripped of every character
-            other than A/T/C/G before use.
-        similarity_threshold: Percentage, accepted only in the range 1-99.
-
-    Returns:
-        A ``(text, html_file)`` pair. ``html_file`` is ``None`` whenever
-        validation fails, the analysis yields no HTML content, or an
-        exception is caught.
-    """
-    try:
-        if not sequence or len(sequence.strip()) == 0:
-            return "Please provide a DNA sequence.", None
-        # Clean sequence
-        clean_seq = re.sub(r'[^ATCG]', '', sequence.upper())
-        if len(clean_seq) < 20:
-            return "Sequence too short for analysis (minimum 20 bp required).", None
-        # Validate similarity threshold
-        if not (1 <= similarity_threshold <= 99):
-            return "Similarity threshold must be between 1 and 99%.", None
-        # Run tree analysis
-        html_content, html_file, success_msg = analyze_sequence_for_tree(
-            clean_seq, similarity_threshold
-        )
-        if html_content:
-            result = f"🌳 PHYLOGENETIC TREE ANALYSIS\n"
-            result += f"Input sequence length: {len(clean_seq)} bp\n"
-            result += f"Similarity threshold: {similarity_threshold}%\n"
-            result += "=" * 50 + "\n\n"
-            result += success_msg
-            return result, html_file
-        else:
-            # No HTML content: pass the analysis message through, if any.
-            return success_msg or "Tree analysis failed.", None
-    except Exception as e:
-        logging.error(f"Tree analysis interface error: {e}")
-        return f"Tree analysis failed: {str(e)}", None
-# --- Gradio Interface ---
-def create_gradio_interface():
-    """Create the Gradio interface.
-
-    Builds a ``gr.Blocks`` app with four tabs (gene prediction, FASTA file
-    processing, phylogenetic trees, model information), wires each button to
-    its handler function, and returns the Blocks object without launching it.
-    """
-    # Custom CSS for better styling
-    css = """
-    .gradio-container {
-        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
-    }
-    .output-text {
-        font-family: 'Courier New', monospace;
-        font-size: 12px;
-        line-height: 1.4;
-    }
-    .tab-nav {
-        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
-    }
-    """
-    with gr.Blocks(css=css, title="Gene Analysis Tool") as interface:
-        gr.Markdown("""
-        # 🧬 Advanced Gene Analysis Tool
-        This tool provides comprehensive gene sequence analysis including:
-        - **Gene Prediction**: Boundary detection and validation
-        - **Phylogenetic Analysis**: Tree construction and similarity analysis
-        - **File Processing**: Batch analysis of FASTA files
-        - **Model Verification**: Multi-model validation pipeline
-        """)
-        with gr.Tabs():
-            # Tab 1: Gene Prediction
-            with gr.Tab("🔬 Gene Prediction"):
-                gr.Markdown("### Predict gene sequences using trained models")
-                with gr.Row():
-                    with gr.Column(scale=2):
-                        seq_input = gr.Textbox(
-                            label="DNA Sequence",
-                            placeholder="Enter DNA sequence (A, T, C, G only)...",
-                            lines=5,
-                            max_lines=10
-                        )
-                        predict_btn = gr.Button("🚀 Analyze Sequence", variant="primary")
-                    with gr.Column(scale=3):
-                        prediction_output = gr.Textbox(
-                            label="Analysis Results",
-                            lines=20,
-                            max_lines=30,
-                            elem_classes=["output-text"]
-                        )
-                predict_btn.click(
-                    fn=predict_gene_sequence,
-                    inputs=[seq_input],
-                    outputs=[prediction_output]
-                )
-            # Tab 2: File Processing
-            with gr.Tab("📁 File Processing"):
-                gr.Markdown("### Upload and analyze FASTA files")
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        file_input = gr.File(
-                            label="Upload FASTA File",
-                            file_types=[".fasta", ".fa", ".fas", ".txt"]
-                        )
-                        process_btn = gr.Button("📊 Process File", variant="primary")
-                    with gr.Column(scale=2):
-                        file_output = gr.Textbox(
-                            label="Processing Results",
-                            lines=25,
-                            max_lines=35,
-                            elem_classes=["output-text"]
-                        )
-                process_btn.click(
-                    fn=process_fasta_file,
-                    inputs=[file_input],
-                    outputs=[file_output]
-                )
-            # Tab 3: Phylogenetic Trees
-            with gr.Tab("🌳 Phylogenetic Trees"):
-                gr.Markdown("### Build and analyze phylogenetic trees")
-                with gr.Tabs():
-                    # Subtab: ML Tree Construction
-                    with gr.Tab("Maximum Likelihood Tree"):
-                        gr.Markdown("**Build ML tree using MAFFT + IQ-TREE**")
-                        with gr.Row():
-                            with gr.Column(scale=1):
-                                ml_seq_input = gr.Textbox(
-                                    label="DNA Sequence",
-                                    placeholder="Enter sequence for ML tree construction...",
-                                    lines=4
-                                )
-                                ml_tree_btn = gr.Button("🌳 Build ML Tree", variant="primary")
-                            with gr.Column(scale=2):
-                                ml_tree_output = gr.Textbox(
-                                    label="ML Tree Results",
-                                    lines=20,
-                                    elem_classes=["output-text"]
-                                )
-                        ml_tree_btn.click(
-                            fn=build_tree_interface,
-                            inputs=[ml_seq_input],
-                            outputs=[ml_tree_output]
-                        )
-                    # Subtab: Interactive Tree Analysis
-                    with gr.Tab("Interactive Analysis"):
-                        gr.Markdown("**Analyze sequence similarity with interactive tree**")
-                        with gr.Row():
-                            with gr.Column(scale=1):
-                                tree_seq_input = gr.Textbox(
-                                    label="Query Sequence",
-                                    placeholder="Enter sequence for tree analysis...",
-                                    lines=4
-                                )
-                                # Same 1-99 range that analyze_tree_interface validates.
-                                similarity_slider = gr.Slider(
-                                    minimum=1,
-                                    maximum=99,
-                                    value=80,
-                                    step=1,
-                                    label="Similarity Threshold (%)"
-                                )
-                                tree_analyze_btn = gr.Button("🔍 Analyze Tree", variant="primary")
-                            with gr.Column(scale=2):
-                                tree_analysis_output = gr.Textbox(
-                                    label="Tree Analysis Results",
-                                    lines=15,
-                                    elem_classes=["output-text"]
-                                )
-                                tree_file_output = gr.File(
-                                    label="Interactive Tree File (HTML)"
-                                )
-                        # The handler returns (text, file), matching the two outputs.
-                        tree_analyze_btn.click(
-                            fn=analyze_tree_interface,
-                            inputs=[tree_seq_input, similarity_slider],
-                            outputs=[tree_analysis_output, tree_file_output]
-                        )
-            # Tab 4: Model Information
-            with gr.Tab("ℹ️ Model Information"):
-                gr.Markdown("""
-                ### Model Status and Information
-                **Available Models:**
-                """)
-                # Model status: computed once, while the interface is being built
-                model_status = []
-                if boundary_model:
-                    model_status.append("✅ Boundary Detection Model: Loaded")
-                else:
-                    model_status.append("❌ Boundary Detection Model: Not Available")
-                if keras_model:
-                    model_status.append("✅ Keras Validation Model: Loaded")
-                else:
-                    model_status.append("❌ Keras Validation Model: Not Available")
-                if verification_models:
-                    model_status.append(f"✅ Verification Models: {len(verification_models)} loaded")
-                    for model_name in verification_models.keys():
-                        model_status.append(f"   - {model_name}")
-                else:
-                    model_status.append("❌ Verification Models: None loaded")
-                if analyzer:
-                    model_status.append("✅ Tree Analyzer: Initialized")
-                else:
-                    model_status.append("❌ Tree Analyzer: Not Available")
-                # Check external tools
-                mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
-                if mafft_available:
-                    model_status.append(f"✅ MAFFT: Available ({mafft_cmd})")
-                else:
-                    model_status.append("❌ MAFFT: Not Available")
-                if iqtree_available:
-                    model_status.append(f"✅ IQ-TREE: Available ({iqtree_cmd})")
-                else:
-                    model_status.append("❌ IQ-TREE: Not Available")
-                gr.Markdown("\n".join(model_status))
-                gr.Markdown("""
-                ### Usage Guidelines:
-                1. **Gene Prediction**: Input DNA sequences containing only A, T, C, G characters
-                2. **File Processing**: Upload FASTA files with multiple sequences
-                3. **ML Trees**: Requires MAFFT and IQ-TREE installation
-                4. **Interactive Trees**: Uses simplified clustering for quick analysis
-                ### System Requirements:
-                - Python 3.8+
-                - TensorFlow/Keras for neural network models
-                - PyTorch for boundary detection
-                - MAFFT and IQ-TREE for phylogenetic analysis (optional)
-                """)
-        return interface
-# --- Main Application ---
 if __name__ == "__main__":
-    # Initialize logging: records go to gene_analysis.log and to stdout.
-    # NOTE(review): logging.basicConfig does nothing when the root logger
-    # already has handlers; confirm no earlier call has configured it.
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s - %(levelname)s - %(message)s',
-        handlers=[
-            logging.FileHandler('gene_analysis.log'),
-            logging.StreamHandler(sys.stdout)
-        ]
-    )
-    # Create output directories
     os.makedirs("output", exist_ok=True)
     os.makedirs("ml_tree_output", exist_ok=True)
-    # Log startup information
-    logging.info("Starting Gene Analysis Tool")
-    logging.info(f"Boundary model loaded: {boundary_model is not None}")
-    logging.info(f"Keras model loaded: {keras_model is not None}")
-    logging.info(f"Verification models loaded: {len(verification_models) if verification_models else 0}")
-    logging.info(f"Tree analyzer initialized: {analyzer is not None}")
-    # Check external tools
-    mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
-    logging.info(f"MAFFT available: {mafft_available}")
-    logging.info(f"IQ-TREE available: {iqtree_available}")
-    # Create and launch interface
     try:
         interface = create_gradio_interface()
-        # Launch with appropriate settings
         interface.launch(
-            share=False,  # Set to True if you want a public link
-            server_name="0.0.0.0",  # Allow external connections
             server_port=7860,
             show_error=True,
             debug=True
         )
     except Exception as e:
-        # Log the failure with its traceback, then exit with a non-zero status.
-        logging.error(f"Failed to launch interface: {e}")
-        import traceback
-        logging.error(f"Full traceback: {traceback.format_exc()}")
         sys.exit(1)

 import re
 import logging
 import numpy as np
 import tempfile
 import shutil
 import sys
 from pathlib import Path
+try:
+    from predictor import GenePredictor
+except ImportError:
+    GenePredictor = None
+try:
+    from tensorflow.keras.models import load_model
+except ImportError:
+    load_model = None
+try:
+    import ml_simplified_tree
+except ImportError:
+    ml_simplified_tree = None
+from huggingface_hub import hf_hub_download
 # --- Global Variables ---
+# Preferred locations of the external binaries; check_tool_availability()
+# tries these first and then falls back to other common names and paths.
 MAFFT_PATH = "mafft/mafftdir/bin/mafft"  # Update this path as needed
 IQTREE_PATH = "iqtree/bin/iqtree2"  # Update this path as needed
+CSV_PATH = "f_cleaned.csv"  # CSV handed to the tree analyzer's load_data()
+# --- Logging Setup ---
+# Every record is written both to gene_analysis.log and to stdout.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('gene_analysis.log'),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+# --- Model Paths and Variables ---
+# Hugging Face Hub repository the model files are downloaded from.
+MODEL_REPO = "GGproject10/best_boundary_aware_model"
+# Filled in by load_models() / init_tree_analyzer(); None means "not loaded".
 boundary_model = None
 keras_model = None
 kmer_to_index = None
+analyzer = None
+# --- Load Models ---
+def load_models():
+    """Download and load the boundary and Keras models into module globals.
+
+    Files are fetched from ``MODEL_REPO`` on the Hugging Face Hub, passing
+    the ``HF_TOKEN`` environment variable (if set) as the access token.
+    Any failure is logged as a warning and leaves the affected globals set
+    to ``None``; nothing is raised to the caller.
+    """
+    global boundary_model, keras_model, kmer_to_index
+    token = os.getenv("HF_TOKEN")
+
+    def fetch(filename):
+        # All artefacts live in the same repository; only the name varies.
+        return hf_hub_download(repo_id=MODEL_REPO, filename=filename, token=token)
+
+    # Boundary model: skipped when the GenePredictor import failed.
+    if not GenePredictor:
+        logging.warning("GenePredictor not available.")
+    else:
+        try:
+            boundary_model = GenePredictor(fetch("best_boundary_aware_model.pth"))
+            logging.info("Boundary model loaded successfully.")
+        except Exception as exc:
+            logging.warning(f"Failed to load boundary model: {exc}")
+            boundary_model = None
+
+    # Keras model and its k-mer index: skipped when load_model is missing.
+    if not load_model:
+        logging.warning("Keras/TensorFlow not available.")
+        return
+    try:
+        model_file = fetch("best_model.keras")
+        index_file = fetch("kmer_to_index.pkl")
+        keras_model = load_model(model_file)
+        # NOTE: unpickling can execute code stored in the file, so this is
+        # only safe while MODEL_REPO is a trusted source.
+        with open(index_file, "rb") as handle:
+            kmer_to_index = pickle.load(handle)
+        logging.info("Keras model and k-mer index loaded successfully.")
+    except Exception as exc:
+        logging.warning(f"Failed to load Keras model or k-mer index: {exc}")
+        keras_model = None
+        kmer_to_index = None
 # --- Initialize Tree Analyzer ---
+def init_tree_analyzer():
+    """Create the global tree ``analyzer`` and load ``CSV_PATH`` into it.
+
+    ``analyzer`` ends up ``None`` when the ``ml_simplified_tree`` module or
+    the CSV file is missing, when ``load_data`` reports failure, or when
+    construction raises. A failed ``train_ai_model`` call is only logged;
+    the analyzer is kept in that case.
+    """
+    global analyzer
+    if not (ml_simplified_tree and os.path.exists(CSV_PATH)):
+        logging.warning("Tree analyzer or CSV file not available.")
+        analyzer = None
+        return
+    try:
+        analyzer = ml_simplified_tree.PhylogeneticTreeAnalyzer()
+        if not analyzer.load_data(CSV_PATH):
+            logging.error("Failed to load CSV data.")
+            analyzer = None
+            return
+        logging.info("Tree analyzer initialized successfully.")
+        # Training is best-effort: problems are reported but not fatal.
+        try:
+            trained = analyzer.train_ai_model()
+        except Exception as exc:
+            logging.warning(f"AI model training failed: {exc}")
+        else:
+            if not trained:
+                logging.warning("AI model training failed.")
+    except Exception as exc:
+        logging.error(f"Failed to initialize tree analyzer: {exc}")
+        analyzer = None
+# --- Tool Detection ---
 def check_tool_availability():
+    """Locate usable MAFFT and IQ-TREE executables.
+
+    Returns:
+        ``(mafft_available, iqtree_available, mafft_cmd, iqtree_cmd)``.
+        Each ``*_cmd`` is the first candidate that resolves to an executable
+        file, or ``None`` when no candidate does; the booleans are simply
+        ``bool(*_cmd)``.
+    """
     mafft_candidates = [
+        MAFFT_PATH, 'mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', 'mafft.bat'
     ]
     iqtree_candidates = [
+        IQTREE_PATH, 'iqtree2', 'iqtree', '/usr/bin/iqtree2', '/usr/local/bin/iqtree2',
+        '/usr/bin/iqtree', '/usr/local/bin/iqtree', 'iqtree2.exe', 'iqtree.exe'
     ]
+
+    def first_executable(candidates):
+        # shutil.which() handles both bare command names (looked up on PATH)
+        # and explicit paths, and it only accepts executable files. A plain
+        # os.path.exists() test also matched directories, so a folder named
+        # like a candidate (e.g. the "mafft" directory MAFFT_PATH points
+        # into) could be reported as the binary and fail later when run.
+        return next((cmd for cmd in candidates if cmd and shutil.which(cmd)), None)
+
+    mafft_cmd = first_executable(mafft_candidates)
+    iqtree_cmd = first_executable(iqtree_candidates)
+    return bool(mafft_cmd), bool(iqtree_cmd), mafft_cmd, iqtree_cmd
+# --- Installation Guide ---
 def install_dependencies_guide():
+    """Return user-facing installation instructions for MAFFT and IQ-TREE.
+
+    The text is appended to the error message shown when either tool is
+    missing, so each tool gets its own section with matching commands.
+    """
+    return """
 🔧 INSTALLATION GUIDE FOR MISSING DEPENDENCIES:
 For MAFFT:
+- Ubuntu/Debian: sudo apt-get install mafft
+- macOS: brew install mafft
+- Windows: Download from https://mafft.cbrc.jp/alignment/software/
+For IQ-TREE:
+- Ubuntu/Debian: sudo apt-get install iqtree
 - macOS: brew install iqtree
 - Windows: Download from http://www.iqtree.org/
+Conda: conda install -c bioconda mafft iqtree
 """
+# --- MAFFT and IQ-TREE Functions ---
 def run_mafft_alignment(input_fasta, output_fasta, mafft_cmd):
+    """Align ``input_fasta`` with MAFFT and store the result in ``output_fasta``.
+
+    Returns:
+        ``(True, output_fasta)`` on success, otherwise ``(False, reason)``.
+        Exceptions (including the 600 s timeout) are logged and reported
+        through the same ``(False, reason)`` shape.
+    """
+    try:
+        completed = subprocess.run(
+            [mafft_cmd, '--auto', '--quiet', input_fasta],
+            capture_output=True,
+            text=True,
+            timeout=600,
+        )
+        if completed.returncode != 0:
+            return False, f"MAFFT error: {completed.stderr.strip() or 'Unknown error'}"
+        # The captured stdout is the alignment; write it out ourselves.
+        with open(output_fasta, 'w') as out:
+            out.write(completed.stdout)
+        if os.path.getsize(output_fasta) <= 0:
+            return False, "MAFFT output empty."
+        logging.info(f"MAFFT alignment completed: {output_fasta}")
+        return True, output_fasta
+    except Exception as exc:
+        logging.error(f"MAFFT failed: {exc}")
+        return False, f"MAFFT failed: {str(exc)}"
 def run_iqtree_analysis(aligned_fasta, output_prefix, iqtree_cmd):
+    """Infer a maximum-likelihood tree from an alignment with IQ-TREE.
+
+    Args:
+        aligned_fasta: Path of the aligned FASTA input.
+        output_prefix: Prefix for all IQ-TREE output files; the tree is
+            expected at ``<output_prefix>.treefile``.
+        iqtree_cmd: Executable name or path of IQ-TREE.
+
+    Returns:
+        ``(True, tree_file)`` when IQ-TREE exits successfully and produced a
+        non-empty tree file, otherwise ``(False, reason)``. Exceptions
+        (including the 1200 s timeout) are logged and reported the same way.
+    """
     try:
         cmd = [
+            iqtree_cmd, '-s', aligned_fasta, '-m', 'MFP', '-bb', '1000',
+            '-alrt', '1000', '-nt', 'AUTO', '--prefix', output_prefix, '--quiet',
+            # Callers reuse the same prefix for every request. Without
+            # -redo IQ-TREE will not redo an analysis that already finished
+            # under this prefix, so a later query could fail or pick up the
+            # tree file left by an earlier one.
+            '-redo'
         ]
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=1200)
+        tree_file = f"{output_prefix}.treefile"
+        if result.returncode == 0 and os.path.exists(tree_file) and os.path.getsize(tree_file) > 0:
+            logging.info(f"IQ-TREE completed: {tree_file}")
+            return True, tree_file
+        return False, f"IQ-TREE error: {result.stderr.strip() or 'Tree file not generated'}"
     except Exception as e:
+        logging.error(f"IQ-TREE failed: {e}")
+        return False, f"IQ-TREE failed: {str(e)}"
+# --- Fallback Tree Construction ---
+def create_simple_tree(sequences_dict):
+    """Write a flat Newick tree over the first five sequence names.
+
+    Every leaf gets the fixed branch length 0.1 and the tree is saved to
+    ``simple_tree.nwk`` in the current directory.
+
+    Returns:
+        ``(tree_file, message)`` on success, or ``(None, message)`` when
+        fewer than two sequences are given or an exception occurs.
+    """
+    try:
+        names = list(sequences_dict.keys())
+        if len(names) < 2:
+            return None, "Need at least 2 sequences."
+        leaves = ["{}:0.1".format(name) for name in names[:5]]
+        newick = "(" + ",".join(leaves) + ");"
+        tree_file = "simple_tree.nwk"
+        with open(tree_file, 'w') as out:
+            out.write(newick)
+        return tree_file, "Simple tree created."
+    except Exception as exc:
+        return None, f"Simple tree creation failed: {str(exc)}"
+# --- Create Multi-FASTA ---
+def create_multi_fasta(query_sequence, query_id="Query_F_Gene"):
+    """Write the query plus reference sequences to a temporary FASTA file.
+
+    References come from ``f_gene_sequences_aligned.fasta`` when that file
+    exists; otherwise up to 20 rows of ``analyzer.data`` whose ``sequence``
+    is longer than 50 characters are used.
+
+    Reference records whose ID equals ``query_id`` are skipped:
+    ``build_maximum_likelihood_tree`` copies its alignment (query included)
+    back to the reference path, so without the filter a later run would
+    contain two records with the same name.
+
+    Args:
+        query_sequence: Sequence text written as the first record.
+        query_id: Record name used for the query.
+
+    Returns:
+        Path of the temporary file (the caller deletes it), or ``None`` if
+        anything fails; a partially written file is removed in that case.
+    """
+    ref_fasta_path = "f_gene_sequences_aligned.fasta"
+    temp_path = None
     try:
+        # The context manager closes the handle even when a write fails.
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as temp_fasta:
+            temp_path = temp_fasta.name
+            temp_fasta.write(f">{query_id}\n{query_sequence}\n")
+            if os.path.exists(ref_fasta_path):
+                skipping = False
+                with open(ref_fasta_path, 'r') as ref_file:
+                    for line in ref_file:
+                        if line.startswith('>'):
+                            header = line[1:].split()
+                            skipping = bool(header) and header[0] == query_id
+                        if not skipping:
+                            temp_fasta.write(line)
+            elif analyzer and hasattr(analyzer, 'data'):
+                count = 0
+                for _, row in analyzer.data.iterrows():
+                    if 'sequence' in row and len(str(row['sequence'])) > 50:
+                        temp_fasta.write(f">{row.get('id', f'Ref_{count}')}\n{str(row['sequence']).upper()}\n")
+                        count += 1
+                        if count >= 20:
+                            break
+        return temp_path
     except Exception as e:
+        logging.error(f"Multi-FASTA creation failed: {e}")
+        if temp_path is not None:
+            # Best-effort cleanup so failed attempts do not pile up temp files.
+            try:
+                os.unlink(temp_path)
+            except OSError:
+                pass
         return None
+# --- Pipeline: Maximum Likelihood Tree ---
+def build_maximum_likelihood_tree(sequence):
+    """Run the MAFFT + IQ-TREE pipeline for one query sequence.
+
+    The query is upper-cased and reduced to A/T/C/G, combined with the
+    reference sequences, aligned with MAFFT and passed to IQ-TREE. Outputs
+    are written under ``ml_tree_output/`` and also copied to
+    ``f_gene_sequences_aligned.fasta`` and ``f_gene_sequences.phy.treefile``.
+
+    Returns:
+        ``(success, message, aligned_fasta, tree_file)``. The two paths are
+        ``None`` unless the corresponding step produced a file. Exceptions
+        are logged and reported as a failure tuple rather than raised.
+    """
     try:
+        sequence = re.sub(r'[^ATCG]', '', sequence.upper())
+        if len(sequence) < 50:
+            return False, "Sequence too short (<50 bp).", None, None
+        mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
+        # Mark each tool by whether it was actually found; a fixed check
+        # mark next to "Not found" would be contradictory.
+        status_msg = "🔍 Dependencies:\n"
+        status_msg += f"{'✅' if mafft_available else '❌'} MAFFT: {mafft_cmd or 'Not found'}\n"
+        status_msg += f"{'✅' if iqtree_available else '❌'} IQ-TREE: {iqtree_cmd or 'Not found'}\n"
+        if not mafft_available or not iqtree_available:
             guide = install_dependencies_guide()
+            return False, f"{status_msg}\n❌ Missing tools:\n{guide}", None, None
+        os.makedirs("ml_tree_output", exist_ok=True)
+        multi_fasta = create_multi_fasta(sequence)
         if not multi_fasta:
+            return False, f"{status_msg}\n❌ Failed to create input FASTA.", None, None
+        aligned_fasta = "ml_tree_output/aligned_sequences.fasta"
         mafft_success, mafft_result = run_mafft_alignment(multi_fasta, aligned_fasta, mafft_cmd)
+        # The temporary multi-FASTA is no longer needed once MAFFT has run.
         os.unlink(multi_fasta)
         if not mafft_success:
+            return False, f"{status_msg}\n❌ MAFFT failed: {mafft_result}", None, None
+        tree_prefix = "ml_tree_output/ml_tree"
         iqtree_success, iqtree_result = run_iqtree_analysis(aligned_fasta, tree_prefix, iqtree_cmd)
         if not iqtree_success:
+            return False, f"{status_msg}\n❌ IQ-TREE failed: {iqtree_result}", aligned_fasta, None
         tree_file = iqtree_result
+        # NOTE(review): this path is also what create_multi_fasta() reads as
+        # its reference set, so the next run starts from this alignment
+        # (query included). Confirm that is intended.
+        shutil.copy2(aligned_fasta, "f_gene_sequences_aligned.fasta")
+        shutil.copy2(tree_file, "f_gene_sequences.phy.treefile")
+        success_msg = f"{status_msg}\n✅ ML tree built:\n- Alignment: {os.path.basename(aligned_fasta)}\n- Tree: {os.path.basename(tree_file)}"
         return True, success_msg, aligned_fasta, tree_file
     except Exception as e:
         logging.error(f"ML tree construction failed: {e}")
         return False, f"ML tree construction failed: {str(e)}", None, None
+# --- Pipeline: Verification ---
+def run_verification_pipeline(sequence):
+    """Run every loaded model on ``sequence`` and collect the raw results.
+
+    The input is upper-cased and reduced to A/T/C/G first.
+
+    Returns:
+        A dict that holds ``{"error": ...}`` for inputs shorter than 10 bp,
+        otherwise a ``"boundary_model"`` and/or ``"keras_model"`` entry for
+        each model that is loaded. A model that raises contributes an
+        ``{"error": ...}`` entry instead of aborting the pipeline.
+    """
+    cleaned = re.sub(r'[^ATCG]', '', sequence.upper())
+    if len(cleaned) < 10:
+        return {"error": "Sequence too short (<10 bp)."}
+    report = {}
+    # Boundary model verification
+    if boundary_model:
+        try:
+            predictions, _probs, confidence = boundary_model.predict(cleaned)
+            regions = boundary_model.extract_gene_regions(predictions, cleaned)
+            found = len(regions) if regions else 0
+            first_region = regions[0]["sequence"] if regions else None
+            report["boundary_model"] = {
+                "type": "boundary_detection",
+                "confidence": float(confidence),
+                "regions_found": found,
+                "extracted_sequence": first_region,
+            }
+        except Exception as exc:
+            report["boundary_model"] = {"error": f"Boundary prediction failed: {str(exc)}"}
+    # Keras model verification
+    if keras_model and kmer_to_index:
+        try:
+            # Overlapping 6-mers; k-mers missing from the index map to 0.
+            encoded = [
+                kmer_to_index.get(cleaned[start:start + 6], 0)
+                for start in range(len(cleaned) - 5)
+            ]
+            scores = keras_model.predict(np.array([encoded]), verbose=0)[0]
+            report["keras_model"] = {
+                "type": "gene_validation",
+                "mean_score": float(np.mean(scores)),
+                "max_score": float(np.max(scores)),
+            }
+        except Exception as exc:
+            report["keras_model"] = {"error": f"Keras prediction failed: {str(exc)}"}
+    return report
+# --- Format Results ---
+def format_results(results, sequence, pipeline_type):
+    """Render a pipeline result dict as a plain-text report.
+
+    Args:
+        results: Dict from the prediction pipeline, or a dict with
+            ``"message"`` / ``"tree_file"`` keys for the tree pipeline.
+        sequence: Sequence whose length is shown in the header.
+        pipeline_type: ``"prediction"`` or ``"tree"``; any other value
+            yields the header only.
+
+    Returns:
+        The report as a single newline-joined string.
+    """
+    banner = f"🧬 {pipeline_type.upper()} ANALYSIS\nSequence length: {len(sequence)} bp\n{'=' * 50}"
+    lines = [banner]
+    if "error" in results:
+        # A top-level error replaces every other section.
+        lines.append(f"❌ Error: {results['error']}")
+    elif pipeline_type == "prediction":
+        if boundary_model and "boundary_model" in results:
+            info = results["boundary_model"]
+            if "error" in info:
+                lines.append(f"\n❌ Boundary Detection: {info['error']}")
+            else:
+                lines.extend([
+                    "\n🎯 Boundary Detection:",
+                    f"- Confidence: {info['confidence']:.3f}",
+                    f"- Regions Found: {info['regions_found']}",
+                ])
+                if info['extracted_sequence']:
+                    lines.append(f"- Extracted Length: {len(info['extracted_sequence'])} bp")
+        if keras_model and "keras_model" in results:
+            info = results["keras_model"]
+            if "error" in info:
+                lines.append(f"\n❌ Keras Validation: {info['error']}")
+            else:
+                lines.extend([
+                    "\n🔍 Keras Validation:",
+                    f"- Mean Score: {info['mean_score']:.3f}",
+                    f"- Max Score: {info['max_score']:.3f}",
+                ])
+    elif pipeline_type == "tree":
+        lines.append(results.get("message", "No tree results available."))
+        tree_path = results.get("tree_file")
+        if tree_path:
+            lines.append(f"\nTree File: {os.path.basename(tree_path)}")
+    return "\n".join(lines)
+# --- Interface Functions ---
+def analyze_sequence(sequence):
+    """Clean a DNA string, run the verification pipeline and format the report.
+
+    Input is upper-cased and reduced to A/T/C/G; anything shorter than
+    10 bp after cleaning is rejected with a short message.
+    """
+    cleaned = re.sub(r'[^ATCG]', '', sequence.upper())
+    # An empty string is covered by the length check as well.
+    if len(cleaned) < 10:
+        return "Invalid or too short sequence (<10 bp)."
+    return format_results(run_verification_pipeline(cleaned), cleaned, "prediction")
+def build_tree(sequence):
+    """Build the ML tree for ``sequence`` and return the formatted report.
+
+    Only the message and tree-file path of the pipeline result are shown;
+    the header length is that of the sequence exactly as it was passed in.
+    """
+    _, message, _, tree_file = build_maximum_likelihood_tree(sequence)
+    payload = {"message": message, "tree_file": tree_file}
+    return format_results(payload, sequence, "tree")
+# --- Gradio Interface ---
+def create_gradio_interface():
+    """Build the Gradio Blocks UI and return it without launching.
+
+    The page has one sequence textbox, one FASTA upload, two action buttons
+    and a shared results box, followed by a status list. The status list is
+    computed once, while the interface is being built.
+    """
+    css = """
+    .gradio-container { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
+    .output-text { font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.4; }
+    .input-section { margin-bottom: 20px; }
+    """
+
+    def status_line(label, available, ready_text):
+        # Pick the icon from the actual state; a fixed check mark next to
+        # "Not Available" would contradict the text.
+        icon = "✅" if available else "❌"
+        return f"{icon} {label}: {ready_text if available else 'Not Available'}"
+
+    with gr.Blocks(css=css, title="Gene Analysis Tool") as interface:
+        gr.Markdown("""
+        # 🧬 Gene Analysis Tool
+        Analyze DNA sequences, predict gene boundaries, and build phylogenetic trees.
+        """)
+        # Input Section
+        with gr.Row():
+            with gr.Column(scale=1):
+                seq_input = gr.Textbox(
+                    label="DNA Sequence",
+                    placeholder="Enter DNA sequence (A, T, C, G only)...",
+                    lines=5,
+                    max_lines=10
+                )
+                file_input = gr.File(
+                    label="Upload FASTA File",
+                    file_types=[".fasta", ".fa", ".fas", ".txt"]
+                )
+                analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
+                tree_btn = gr.Button("🌳 Build Tree", variant="primary")
+            with gr.Column(scale=2):
+                output = gr.Textbox(
+                    label="Results",
+                    lines=20,
+                    max_lines=30,
+                    elem_classes=["output-text"]
+                )
+        # Status Information
+        gr.Markdown("### Tool Status")
+        mafft_available, iqtree_available, _, _ = check_tool_availability()
+        status = [
+            status_line("Boundary Model", boundary_model, "Loaded"),
+            status_line("Keras Model", keras_model, "Loaded"),
+            status_line("Tree Analyzer", analyzer, "Initialized"),
+            status_line("MAFFT", mafft_available, "Available"),
+            status_line("IQ-TREE", iqtree_available, "Available"),
+        ]
+        gr.Markdown("\n".join(status))
+        # Event Handlers: all three actions write into the same results box.
+        analyze_btn.click(fn=analyze_sequence, inputs=seq_input, outputs=output)
+        tree_btn.click(fn=build_tree, inputs=seq_input, outputs=output)
+        file_input.change(fn=process_fasta_file, inputs=file_input, outputs=output)
+    return interface
+# --- File Processing ---
 def process_fasta_file(file):
+    """Parse an uploaded FASTA file and analyse its first five records.
+
+    Args:
+        file: Either a filesystem path (str) or an uploaded-file object
+            exposing the path as ``.name``; Gradio passes one or the other
+            depending on its version and the File component's ``type``.
+            A falsy value means nothing was uploaded.
+
+    Returns:
+        A newline-joined text report. Records need at least 10 A/T/C/G
+        characters to be passed to ``analyze_sequence``. Any exception is
+        logged and returned as a failure message instead of being raised.
+    """
     try:
+        if not file:
             return "Please upload a FASTA file."
+        # Accept both a plain path and an object carrying the path in .name.
+        fasta_path = getattr(file, "name", file)
         sequences = {}
+        current_name = ""
+        chunks = []  # sequence lines of the current record, joined once
+        with open(fasta_path, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if line.startswith('>'):
+                    if current_name and chunks:
+                        sequences[current_name] = "".join(chunks)
+                    current_name = line[1:]
+                    chunks = []
+                elif line:
+                    chunks.append(line.upper())
+        if current_name and chunks:
+            sequences[current_name] = "".join(chunks)
         if not sequences:
+            return "No valid sequences in FASTA file."
+        results = [f"📁 FASTA FILE ANALYSIS\nFound {len(sequences)} sequences\n{'=' * 50}"]
         for i, (name, seq) in enumerate(sequences.items()):
+            if i >= 5:
                 results.append(f"\n... and {len(sequences) - 5} more sequences")
                 break
+            results.append(f"\n🧬 Sequence: {name}\nLength: {len(seq)} bp")
             clean_seq = re.sub(r'[^ATCG]', '', seq)
             if len(clean_seq) >= 10:
+                results.append(analyze_sequence(clean_seq))
             else:
                 results.append("❌ Sequence too short or invalid")
             results.append("-" * 40)
         return "\n".join(results)
     except Exception as e:
+        logging.error(f"FASTA processing failed: {e}")
         return f"FASTA processing failed: {str(e)}"
+# --- Main ---
 if __name__ == "__main__":
+    # Make sure the output directories exist before anything else runs.
+    for directory in ("output", "ml_tree_output"):
+        os.makedirs(directory, exist_ok=True)
+    load_models()
+    init_tree_analyzer()
+    logging.info("Starting Gene Analysis Tool")
+    # Report which optional components were loaded successfully.
+    for label, component in (
+        ("Boundary model", boundary_model),
+        ("Keras model", keras_model),
+        ("Tree analyzer", analyzer),
+    ):
+        logging.info(f"{label}: {component is not None}")
+    try:
+        launch_options = dict(
+            share=False,
+            server_name="0.0.0.0",
+            server_port=7860,
+            show_error=True,
+            debug=True,
+        )
+        create_gradio_interface().launch(**launch_options)
+    except Exception as exc:
+        logging.error(f"Interface launch failed: {exc}")
+        sys.exit(1)