Spaces:

GGproject10
/

simplified_tree_AI

No application file

App Files Files Community

re-type committed on Jun 13, 2025

Commit

664ad2e

verified ·

1 Parent(s): a780000

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -216

app.py CHANGED Viewed

@@ -22,8 +22,6 @@ from Bio.SeqRecord import SeqRecord
 import stat
 import time
 import asyncio
-# FastAPI imports
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.responses import HTMLResponse
 from pydantic import BaseModel
@@ -43,20 +41,16 @@ app = FastAPI(title="🧬 Gene Analysis Pipeline", version="1.0.0")
 log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 log_handler = logging.StreamHandler()
 log_handler.setFormatter(log_formatter)
-# File handler with error handling
 try:
     file_handler = logging.FileHandler('/tmp/app.log')
     file_handler.setFormatter(log_formatter)
     logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
 except Exception:
     logging.basicConfig(level=logging.INFO, handlers=[log_handler])
 logger = logging.getLogger(__name__)
 # --- Global Variables ---
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-MODELS_DIR = os.path.join(BASE_DIR, "models")  # Local models directory
 MAFFT_PATH = shutil.which("mafft") or os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
 IQTREE_PATH = shutil.which("iqtree") or os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
 ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
@@ -64,10 +58,10 @@ TREE_PATH = os.path.join(BASE_DIR, "f_gene_sequences.phy.treefile")
 QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
 os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
-# --- Paths ---
-model_repo = "GGproject10/best_boundary_aware_model"
-csv_path = os.path.join(BASE_DIR, "f_cleaned.csv")
-hf_token = os.getenv("HF_TOKEN")
 # Initialize models as None
 boundary_model = None
@@ -75,84 +69,58 @@ keras_model = None
 kmer_to_index = None
 analyzer = None
-# --- Enhanced Model Loading with Local Priority ---
 def load_models_safely():
     global boundary_model, keras_model, kmer_to_index, analyzer
-    logger.info(f"🔍 Looking for models in: {MODELS_DIR}")
-    logger.info(f"📁 Models directory exists: {os.path.exists(MODELS_DIR)}")
-    if os.path.exists(MODELS_DIR):
-        logger.info(f"📂 Contents of models directory: {os.listdir(MODELS_DIR)}")
-    # Load Boundary Model - Try local first, then HF
     try:
-        # Local model paths
-        local_boundary_path = os.path.join(MODELS_DIR, "best_boundary_aware_model.pth")
-        if os.path.exists(local_boundary_path):
-            logger.info(f"✅ Loading boundary model from local path: {local_boundary_path}")
-            boundary_model = EnhancedGenePredictor(local_boundary_path)
-            logger.info("✅ Boundary model loaded successfully from local directory")
-        elif hf_token:
-            logger.info("🌐 Attempting to load boundary model from Hugging Face...")
-            boundary_path = hf_hub_download(
-                repo_id=model_repo,
-                filename="best_boundary_aware_model.pth",
-                token=hf_token,
-                cache_dir="/tmp/hf_cache"
-            )
-            if os.path.exists(boundary_path):
-                boundary_model = EnhancedGenePredictor(boundary_path)
-                logger.info("✅ Boundary model loaded successfully from HF")
-            else:
-                logger.warning("❌ Boundary model file not found after HF download")
         else:
-            logger.warning("❌ No local boundary model found and no HF_TOKEN available")
     except Exception as e:
         logger.error(f"❌ Failed to load boundary model: {e}")
         boundary_model = None
-    # Load Keras Model - Try local first, then HF
     try:
-        # Local model paths
-        local_keras_path = os.path.join(MODELS_DIR, "best_model.keras")
-        local_kmer_path = os.path.join(MODELS_DIR, "kmer_to_index.pkl")
-        if os.path.exists(local_keras_path) and os.path.exists(local_kmer_path):
-            logger.info(f"✅ Loading Keras model from local paths:")
-            logger.info(f"   - Keras model: {local_keras_path}")
-            logger.info(f"   - K-mer index: {local_kmer_path}")
-            keras_model = load_model(local_keras_path)
-            with open(local_kmer_path, "rb") as f:
                 kmer_to_index = pickle.load(f)
-            logger.info("✅ Keras model loaded successfully from local directory")
-        elif hf_token:
-            logger.info("🌐 Attempting to load Keras model from Hugging Face...")
-            keras_path = hf_hub_download(
-                repo_id=model_repo,
-                filename="best_model.keras",
-                token=hf_token,
-                cache_dir="/tmp/hf_cache"
-            )
-            kmer_path = hf_hub_download(
-                repo_id=model_repo,
-                filename="kmer_to_index.pkl",
-                token=hf_token,
-                cache_dir="/tmp/hf_cache"
-            )
-            if os.path.exists(keras_path) and os.path.exists(kmer_path):
-                keras_model = load_model(keras_path)
-                with open(kmer_path, "rb") as f:
-                    kmer_to_index = pickle.load(f)
-                logger.info("✅ Keras model loaded successfully from HF")
-            else:
-                logger.warning("❌ Keras model files not found after HF download")
         else:
-            logger.warning("❌ No local Keras model found and no HF_TOKEN available")
     except Exception as e:
         logger.error(f"❌ Failed to load Keras model: {e}")
         keras_model = None
@@ -162,38 +130,22 @@ def load_models_safely():
     try:
         logger.info("🌳 Initializing tree analyzer...")
         analyzer = PhylogeneticTreeAnalyzer()
-        # Try multiple CSV locations
-        csv_candidates = [
-            csv_path,
-            os.path.join(BASE_DIR, "f cleaned.csv"),
-            "f_cleaned.csv",
-            os.path.join(BASE_DIR, "data", "f_cleaned.csv"),
-            os.path.join(MODELS_DIR, "f_cleaned.csv")  # Also check models directory
-        ]
-        csv_loaded = False
-        for csv_candidate in csv_candidates:
-            if os.path.exists(csv_candidate):
-                try:
-                    logger.info(f"📊 Trying to load CSV from: {csv_candidate}")
-                    if analyzer.load_data(csv_candidate):
-                        logger.info(f"✅ Tree analyzer loaded CSV from: {csv_candidate}")
-                        csv_loaded = True
-                        break
-                except Exception as e:
-                    logger.warning(f"Failed to load CSV from {csv_candidate}: {e}")
-                    continue
-        if not csv_loaded:
-            logger.error("❌ Failed to load CSV data from any location")
-            logger.info("📂 Available files in base directory:")
-            try:
-                for file in os.listdir(BASE_DIR):
-                    if file.endswith('.csv'):
-                        logger.info(f"   - {file}")
-            except:
-                pass
             analyzer = None
     except Exception as e:
         logger.error(f"❌ Failed to initialize tree analyzer: {e}")
@@ -214,12 +166,9 @@ def setup_binary_permissions():
 def check_tool_availability():
     setup_binary_permissions()
-    # Check MAFFT
     mafft_available = False
     mafft_cmd = None
     mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
     for candidate in mafft_candidates:
         if shutil.which(candidate) or os.path.exists(candidate):
             try:
@@ -236,12 +185,9 @@ def check_tool_availability():
                     break
             except Exception as e:
                 logger.debug(f"MAFFT test failed for {candidate}: {e}")
-    # Check IQ-TREE
     iqtree_available = False
     iqtree_cmd = None
     iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
     for candidate in iqtree_candidates:
         if shutil.which(candidate) or os.path.exists(candidate):
             try:
@@ -258,46 +204,36 @@ def check_tool_availability():
                     break
             except Exception as e:
                 logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
     return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
-# --- Pipeline Functions (keeping your original logic) ---
 def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
     """Align a query sequence against the reference alignment and run IQ-TREE on it.

     Args:
         sequence: Nucleotide string. Rejected when it is shorter than 100
             characters after stripping surrounding whitespace; upper-cased
             before it is written to the query FASTA file.
         mafft_cmd: Executable invoked for the ``--add`` alignment step.
         iqtree_cmd: Executable invoked for the tree step.

     Returns:
         A 4-tuple ``(success, message, aligned_path, treefile_path)``.
         Both paths are ``None`` on failure, except when IQ-TREE finishes
         without producing a ``.treefile``: then the alignment path is still
         returned.

     This function does not raise: every exception, including the
     ``CalledProcessError`` / ``TimeoutExpired`` that ``subprocess.run`` can
     raise because of ``check=True`` and ``timeout=...``, is logged and turned
     into a failure tuple.
     """
     try:
         if len(sequence.strip()) < 100:
             return False, "Sequence too short (<100 bp).", None, None
         # A random 8-hex-digit suffix keeps the files of different queries
         # apart inside QUERY_OUTPUT_DIR.
         query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
         query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
         aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
         output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
         # Both reference artefacts are required before anything is written.
         if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
             return False, "Reference alignment or tree not found.", None, None
         query_record = SeqRecord(Seq(sequence.upper()), id=query_id, description="")
         SeqIO.write([query_record], query_fasta, "fasta")
         # MAFFT's stdout is streamed straight into the output file; stderr is
         # captured but not inspected here. Limited to 600 seconds.
         with open(aligned_with_query, "w") as output_file:
             subprocess.run([
                 mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
             ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
         # The file is created by open() above, so an empty file is treated as
         # a failed alignment as well.
         if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
             return False, "MAFFT alignment failed.", None, None
         # NOTE(review): "-g TREE_PATH" presumably passes the reference tree as
         # a topological constraint — confirm against the IQ-TREE command
         # reference. Limited to 1200 seconds.
         subprocess.run([
             iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
             "-m", "GTR+G", "-pre", output_prefix, "-redo"
         ], capture_output=True, text=True, timeout=1200, check=True)
         treefile = f"{output_prefix}.treefile"
         if not os.path.exists(treefile):
             return False, "IQ-TREE placement failed.", aligned_with_query, None
         success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
         return True, success_msg, aligned_with_query, treefile
     except Exception as e:
         # Nothing in this function removes the files already written to
         # QUERY_OUTPUT_DIR when a later step fails.
         logger.error(f"Phylogenetic placement failed: {e}")
         return False, f"Error: {str(e)}", None, None
@@ -312,18 +248,14 @@ def predict_with_keras(sequence):
     try:
         if not keras_model or not kmer_to_index:
             return "❌ Keras model not available."
         if len(sequence) < 6:
             return "❌ Sequence too short (<6 bp)."
         kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
         indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
         input_arr = np.array([indices])
         prediction = keras_model.predict(input_arr, verbose=0)[0]
         f_gene_prob = prediction[-1]
         percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
         return f"✅ {percentage}% F gene confidence"
     except Exception as e:
         logger.error(f"Keras prediction failed: {e}")
@@ -334,14 +266,9 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
         dna_input = dna_input.upper().strip()
         if not dna_input:
             return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
-        # Clean sequence
         if not re.match('^[ACTGN]+$', dna_input):
             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
         processed_sequence = dna_input
-        # Boundary prediction
         boundary_output = ""
         if boundary_model:
             try:
@@ -358,15 +285,10 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
                 processed_sequence = dna_input
         else:
             boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
-        # Keras prediction
         keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
-        # ML Tree (keeping your original logic)
         aligned_file = None
         phy_file = None
         ml_tree_output = ""
         if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
             try:
                 mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
@@ -383,29 +305,23 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
             ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
         else:
             ml_tree_output = "⚠️ Phylogenetic placement skipped."
-        # Tree analysis
         tree_html_content = "No tree generated."
         report_html_content = "No report generated."
         simplified_ml_output = ""
         if analyzer and processed_sequence and len(processed_sequence) >= 10:
             try:
                 tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
                 simplified_ml_output = tree_result
                 if tree_html_path and os.path.exists(tree_html_path):
                     with open(tree_html_path, 'r', encoding='utf-8') as f:
                         tree_html_content = f.read()
                 else:
                     tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
                 if report_html_path and os.path.exists(report_html_path):
                     with open(report_html_path, 'r', encoding='utf-8') as f:
                         report_html_content = f.read()
                 else:
                     report_html_content = f"<div style='color: red;'>{tree_result}</div>"
             except Exception as e:
                 simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
                 tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
@@ -414,8 +330,6 @@ def run_pipeline(dna_input, similarity_score=95.0, build_ml_tree=False):
             simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
             tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
             report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
-        # Summary
         summary_output = f"""
 📊 ANALYSIS SUMMARY:
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -426,49 +340,37 @@ Placement: {'✅ OK' if '✅' in ml_tree_output else '⚠️ Skipped' if 'skippe
 Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 """
         return (
             boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
             aligned_file, phy_file, None, None, tree_html_content, report_html_content
         )
     except Exception as e:
         logger.error(f"Pipeline error: {e}")
         error_msg = f"❌ Pipeline Error: {str(e)}"
         return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
-# Keep your other functions (analyze_sequence_for_tree, build_maximum_likelihood_tree, etc.)
 def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
     """Find sequences similar to the query and write tree/report HTML files.

     Uses the module-level ``analyzer``.

     Args:
         sequence: Query sequence; must be at least 10 characters long after
             stripping surrounding whitespace.
         matching_percentage: Similarity threshold; must lie in ``[1, 99]``.

     Returns:
         A 3-tuple ``(message, tree_html_path, report_html_path)``. Both paths
         are ``None`` whenever validation fails, no similar sequences are
         found, or an exception occurs. ``report_html_path`` is also ``None``
         when ``generate_detailed_report`` returns a falsy value.

     This function does not raise: any exception is logged and returned as an
     error message.
     """
     try:
         # Guard clauses: each one returns a user-facing message, no paths.
         if not analyzer:
             return "❌ Tree analyzer not initialized.", None, None
         if not sequence or len(sequence.strip()) < 10:
             return "❌ Invalid sequence.", None, None
         if not (1 <= matching_percentage <= 99):
             return "❌ Matching percentage must be 1-99.", None, None
         if not analyzer.find_query_sequence(sequence):
             return "❌ Sequence not accepted.", None, None
         matched_ids, actual_percentage = analyzer.find_similar_sequences(matching_percentage)
         if not matched_ids:
             return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
         analyzer.build_tree_structure_with_ml_safe(matched_ids)
         fig = analyzer.create_interactive_tree(matched_ids, actual_percentage)
         # Fall back to a timestamp-based id when the analyzer has no query_id.
         query_id = analyzer.query_id or f"query_{int(time.time())}"
         tree_html_path = os.path.join("/tmp", f'phylogenetic_tree_{query_id}.html')
         fig.write_html(tree_html_path)
         analyzer.matching_percentage = matching_percentage
         report_success = analyzer.generate_detailed_report(matched_ids, actual_percentage)
         # NOTE(review): this path is derived here, not returned by the
         # analyzer; presumably generate_detailed_report writes to exactly this
         # location — confirm against PhylogeneticTreeAnalyzer.
         report_html_path = os.path.join("/tmp", f"detailed_report_{query_id}.html") if report_success else None
         return f"✅ Found {len(matched_ids)} sequences at {actual_percentage:.2f}% similarity.", tree_html_path, report_html_path
     except Exception as e:
         logger.error(f"Tree analysis failed: {e}")
         return f"❌ Error: {str(e)}", None, None
@@ -477,17 +379,14 @@ def read_fasta_file(file_obj):
     try:
         if file_obj is None:
             return ""
         if isinstance(file_obj, str):
             with open(file_obj, "r") as f:
                 content = f.read()
         else:
             content = file_obj.read().decode("utf-8")
         lines = content.strip().split("\n")
         seq_lines = [line.strip() for line in lines if not line.startswith(">")]
         return ''.join(seq_lines)
     except Exception as e:
         logger.error(f"Failed to read FASTA file: {e}")
         return ""
@@ -548,13 +447,11 @@ async def health_check():
             },
             "paths": {
                 "base_dir": BASE_DIR,
-                "models_dir": MODELS_DIR,
-                "models_dir_exists": os.path.exists(MODELS_DIR),
-                "csv_path": csv_path,
-                "csv_exists": os.path.exists(csv_path)
             },
             "recommendations": {
-                "models": "Models loaded from local directory" if (boundary_model and keras_model) else "Check models directory",
                 "bioinformatics_tools": "Install MAFFT and IQ-TREE" if not (mafft_available and iqtree_available) else "OK"
             }
         }
@@ -594,9 +491,7 @@ async def analyze_file(
             content = await file.read()
             temp_file.write(content)
             temp_file_path = temp_file.name
         result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
         return AnalysisResponse(
             boundary_output=result[0] or "",
             keras_output=result[1] or "",
@@ -633,10 +528,7 @@ def create_gradio_interface():
             .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
             """
         ) as iface:
             gr.Markdown("# 🧬 Gene Analysis Pipeline")
-            # Status display
             with gr.Row():
                 with gr.Column():
                     status_display = gr.HTML(value=f"""
@@ -648,9 +540,7 @@ def create_gradio_interface():
                         <p>🧬 MAFFT: {'✅ Available' if check_tool_availability()[0] else '❌ Missing'}</p>
                         <p>🌲 IQ-TREE: {'✅ Available' if check_tool_availability()[1] else '❌ Missing'}</p>
                     """)
             with gr.Tabs() as tabs:
-                # Tab 1: Text Input
                 with gr.TabItem("📝 Text Input"):
                     with gr.Row():
                         with gr.Column(scale=2):
@@ -660,7 +550,6 @@ def create_gradio_interface():
                                 lines=5,
                                 info="Paste your DNA sequence here"
                             )
                         with gr.Column(scale=1):
                             similarity_score = gr.Slider(
                                 minimum=1,
@@ -670,16 +559,12 @@ def create_gradio_interface():
                                 label="🎯 Similarity Threshold (%)",
                                 info="Minimum similarity for tree analysis"
                             )
                             build_ml_tree = gr.Checkbox(
                                 label="🌲 Build ML Tree",
                                 value=False,
                                 info="Generate phylogenetic placement (slower)"
                             )
                             analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
-                # Tab 2: File Upload
                 with gr.TabItem("📁 File Upload"):
                     with gr.Row():
                         with gr.Column(scale=2):
@@ -688,7 +573,6 @@ def create_gradio_interface():
                                 file_types=[".fasta", ".fa", ".fas", ".txt"],
                                 info="Upload a FASTA file containing your sequence"
                             )
                         with gr.Column(scale=1):
                             file_similarity_score = gr.Slider(
                                 minimum=1,
@@ -697,17 +581,12 @@ def create_gradio_interface():
                                 step=1.0,
                                 label="🎯 Similarity Threshold (%)"
                             )
                             file_build_ml_tree = gr.Checkbox(
                                 label="🌲 Build ML Tree",
                                 value=False
                             )
                             analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
-            # Results Section
             gr.Markdown("## 📊 Analysis Results")
             with gr.Row():
                 with gr.Column():
                     boundary_output = gr.Textbox(
@@ -715,53 +594,41 @@ def create_gradio_interface():
                         interactive=False,
                         lines=2
                     )
                     keras_output = gr.Textbox(
                         label="🧠 F Gene Validation",
                         interactive=False,
                         lines=2
                     )
                 with gr.Column():
                     ml_tree_output = gr.Textbox(
                         label="🌲 Phylogenetic Placement",
                         interactive=False,
                         lines=2
                     )
                     tree_analysis_output = gr.Textbox(
                         label="🌳 Tree Analysis",
                         interactive=False,
                         lines=2
                     )
-            # Summary
             summary_output = gr.Textbox(
                 label="📋 Summary",
                 interactive=False,
                 lines=8
             )
-            # File Downloads
             with gr.Row():
                 aligned_file = gr.File(label="📄 Alignment File", visible=False)
                 tree_file = gr.File(label="🌲 Tree File", visible=False)
-            # Interactive Visualizations
             with gr.Tabs():
                 with gr.TabItem("🌳 Interactive Tree"):
                     tree_html = gr.HTML(
                         label="Phylogenetic Tree",
                         value="<div style='text-align: center; padding: 20px; color: #666;'>No tree generated yet.</div>"
                     )
                 with gr.TabItem("📊 Detailed Report"):
                     report_html = gr.HTML(
                         label="Analysis Report",
                         value="<div style='text-align: center; padding: 20px; color: #666;'>No report generated yet.</div>"
                     )
-            # Event handlers
             analyze_btn.click(
                 fn=run_pipeline,
                 inputs=[dna_input, similarity_score, build_ml_tree],
@@ -772,7 +639,6 @@ def create_gradio_interface():
                     tree_html, report_html
                 ]
             )
             analyze_file_btn.click(
                 fn=run_pipeline_from_file,
                 inputs=[file_input, file_similarity_score, file_build_ml_tree],
@@ -783,51 +649,38 @@ def create_gradio_interface():
                     tree_html, report_html
                 ]
             )
-            # Examples
             gr.Markdown("## 🔬 Example Sequences")
             example_sequences = [
                 ["ATGGACTTCCAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 90.0, False],
                 ["ATGAAACAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 85.0, True],
             ]
             gr.Examples(
                 examples=example_sequences,
                 inputs=[dna_input, similarity_score, build_ml_tree],
                 label="Click to load example sequences"
             )
-            # Help Section
             with gr.Accordion("❓ Help & Information", open=False):
                 gr.Markdown("""
                 ### 🧬 Gene Analysis Pipeline
                 This tool performs comprehensive analysis of F gene sequences:
                 **🎯 Boundary Detection**: Identifies F gene regions within your sequence
                 **🧠 F Gene Validation**: Validates sequence as F gene using deep learning
                 **🌲 Phylogenetic Placement**: Places sequence in reference phylogeny
                 **🌳 Tree Analysis**: Finds similar sequences and builds interactive trees
                 ### 📋 Input Requirements
                 - DNA sequences in ATCG format
                 - Minimum 10 bp for basic analysis
                 - Minimum 100 bp for phylogenetic placement
                 - FASTA files supported for upload
                 ### ⚙️ Parameters
                 - **Similarity Threshold**: Minimum % similarity for tree analysis (1-99%)
                 - **Build ML Tree**: Enable phylogenetic placement (requires MAFFT/IQ-TREE)
                 ### 📊 Output Files
                 - Alignment files (.fa format)
                 - Tree files (.treefile format)
                 - Interactive HTML visualizations
                 """)
         return iface
     except Exception as e:
         logger.error(f"Failed to create Gradio interface: {e}")
         return None
@@ -850,17 +703,13 @@ mount_gradio_app()
 # --- Main Application ---
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description="🧬 Gene Analysis Pipeline")
     parser.add_argument("--host", default="0.0.0.0", help="Host address")
     parser.add_argument("--port", type=int, default=7860, help="Port number")
     parser.add_argument("--reload", action="store_true", help="Enable auto-reload")
     parser.add_argument("--gradio-only", action="store_true", help="Run Gradio interface only")
     args = parser.parse_args()
     if args.gradio_only:
-        # Run Gradio interface only
         logger.info("🚀 Starting Gradio interface only...")
         iface = create_gradio_interface()
         if iface:
@@ -874,11 +723,9 @@ if __name__ == "__main__":
             logger.error("❌ Failed to create Gradio interface")
             sys.exit(1)
     else:
-        # Run FastAPI with Gradio mounted
         logger.info(f"🚀 Starting Gene Analysis Pipeline on {args.host}:{args.port}")
         logger.info("📊 API Documentation: http://localhost:7860/docs")
         logger.info("🧬 Gradio Interface: http://localhost:7860/gradio")
         try:
             uvicorn.run(
                 "app:app" if args.reload else app,

 import stat
 import time
 import asyncio
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.responses import HTMLResponse
 from pydantic import BaseModel
 log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 log_handler = logging.StreamHandler()
 log_handler.setFormatter(log_formatter)
 try:
     file_handler = logging.FileHandler('/tmp/app.log')
     file_handler.setFormatter(log_formatter)
     logging.basicConfig(level=logging.INFO, handlers=[log_handler, file_handler])
 except Exception:
     logging.basicConfig(level=logging.INFO, handlers=[log_handler])
 logger = logging.getLogger(__name__)
 # --- Global Variables ---
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 MAFFT_PATH = shutil.which("mafft") or os.path.join(BASE_DIR, "binaries", "mafft", "mafft")
 IQTREE_PATH = shutil.which("iqtree") or os.path.join(BASE_DIR, "binaries", "iqtree", "bin", "iqtree3")
 ALIGNMENT_PATH = os.path.join(BASE_DIR, "f_gene_sequences_aligned.fasta")
 QUERY_OUTPUT_DIR = os.path.join("/tmp", "queries")
 os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
+# --- Hugging Face Repositories ---
+MODEL_REPO = "GGproject10/best_boundary_aware_model"
+DATA_REPO = "GGproject10/simplified_tree_AI"
+HF_TOKEN = os.getenv("HF_TOKEN")
 # Initialize models as None
 boundary_model = None
 kmer_to_index = None
 analyzer = None
+# --- Model Loading ---
 def load_models_safely():
     """Load the boundary model, Keras model, k-mer index and tree analyzer into module globals.
     Every artifact is fetched with hf_hub_download (cache under /tmp/hf_cache) using HF_TOKEN.
     The three loading stages each run in their own try block, so a failure in one stage is
     logged and the following stages still run. Nothing is returned; callers inspect the
     globals afterwards. If HF_TOKEN is unset the function logs an error and returns without
     attempting any download.
     """
     global boundary_model, keras_model, kmer_to_index, analyzer
+    logger.info("🔍 Loading models and data from Hugging Face repositories")
+    if not HF_TOKEN:
+        logger.error("❌ HF_TOKEN environment variable not set")
+        return
+    # Load Boundary Model
     try:
+        logger.info(f"🌐 Downloading boundary model from {MODEL_REPO}")
+        boundary_path = hf_hub_download(
+            repo_id=MODEL_REPO,
+            filename="best_boundary_aware_model.pth",
+            token=HF_TOKEN,
+            cache_dir="/tmp/hf_cache"
+        )
+        if os.path.exists(boundary_path):
+            logger.info(f"✅ Boundary model downloaded to: {boundary_path}")
             # The predictor wrapper is constructed from the downloaded checkpoint path.
+            boundary_model = EnhancedGenePredictor(boundary_path)
+            logger.info("✅ Boundary model loaded successfully")
         else:
+            logger.warning(f"❌ Boundary model not found at: {boundary_path}")
     except Exception as e:
         logger.error(f"❌ Failed to load boundary model: {e}")
         boundary_model = None
+    # Load Keras Model
     try:
+        logger.info(f"🌐 Downloading Keras model and kmer index from {MODEL_REPO}")
+        keras_path = hf_hub_download(
+            repo_id=MODEL_REPO,
+            filename="best_model.keras",
+            token=HF_TOKEN,
+            cache_dir="/tmp/hf_cache"
+        )
+        kmer_path = hf_hub_download(
+            repo_id=MODEL_REPO,
+            filename="kmer_to_index.pkl",
+            token=HF_TOKEN,
+            cache_dir="/tmp/hf_cache"
+        )
         # Both files are required: the model is only loaded when the index is present too.
+        if os.path.exists(keras_path) and os.path.exists(kmer_path):
+            logger.info(f"✅ Keras model downloaded to: {keras_path}")
+            logger.info(f"✅ Kmer index downloaded to: {kmer_path}")
+            keras_model = load_model(keras_path)
             # NOTE(review): pickle.load can execute code embedded in the file, so this
             # relies on MODEL_REPO being a trusted source.
+            with open(kmer_path, "rb") as f:
                 kmer_to_index = pickle.load(f)
+            logger.info("✅ Keras model loaded successfully")
         else:
+            logger.warning(f"❌ Keras model files not found: keras={os.path.exists(keras_path)}, kmer={os.path.exists(kmer_path)}")
     except Exception as e:
         logger.error(f"❌ Failed to load Keras model: {e}")
         # Only keras_model is reset here; kmer_to_index keeps whatever value it had.
         keras_model = None
     try:
         logger.info("🌳 Initializing tree analyzer...")
         analyzer = PhylogeneticTreeAnalyzer()
+        logger.info(f"🌐 Downloading CSV from {DATA_REPO}")
+        csv_path = hf_hub_download(
+            repo_id=DATA_REPO,
+            filename="f_cleaned.csv",
+            token=HF_TOKEN,
+            cache_dir="/tmp/hf_cache"
+        )
+        if os.path.exists(csv_path):
+            logger.info(f"📊 CSV downloaded to: {csv_path}")
             # A falsy result from load_data is treated as failure and the analyzer is dropped.
+            if analyzer.load_data(csv_path):
+                logger.info(f"✅ Tree analyzer loaded CSV successfully")
+            else:
+                logger.error("❌ Failed to load CSV data")
+                analyzer = None
+        else:
+            logger.warning(f"❌ CSV not found at: {csv_path}")
             analyzer = None
     except Exception as e:
         # NOTE(review): in the lines visible here this handler does not reset `analyzer`,
         # unlike the two handlers above; if the download raises, `analyzer` may stay set to
         # an instance with no data loaded. Confirm against the full file.
         logger.error(f"❌ Failed to initialize tree analyzer: {e}")
 def check_tool_availability():
     setup_binary_permissions()
     mafft_available = False
     mafft_cmd = None
     mafft_candidates = ['mafft', '/usr/bin/mafft', '/usr/local/bin/mafft', MAFFT_PATH]
     for candidate in mafft_candidates:
         if shutil.which(candidate) or os.path.exists(candidate):
             try:
                     break
             except Exception as e:
                 logger.debug(f"MAFFT test failed for {candidate}: {e}")
     iqtree_available = False
     iqtree_cmd = None
     iqtree_candidates = ['iqtree', 'iqtree2', 'iqtree3', '/usr/bin/iqtree', '/usr/local/bin/iqtree', IQTREE_PATH]
     for candidate in iqtree_candidates:
         if shutil.which(candidate) or os.path.exists(candidate):
             try:
                     break
             except Exception as e:
                 logger.debug(f"IQ-TREE test failed for {candidate}: {e}")
     return mafft_available, iqtree_available, mafft_cmd, iqtree_cmd
+# --- Pipeline Functions ---
 def phylogenetic_placement(sequence: str, mafft_cmd: str, iqtree_cmd: str):
     """Place a query sequence onto the reference tree using MAFFT and IQ-TREE.
     The query is added to the reference alignment with ``mafft --add`` and the
     combined alignment is passed to IQ-TREE together with the reference tree
     (``-g``). All files are written under QUERY_OUTPUT_DIR using a random
     per-query prefix.
     Args:
         sequence: Nucleotide sequence; surrounding whitespace is ignored.
         mafft_cmd: Executable (name or path) used to run MAFFT.
         iqtree_cmd: Executable (name or path) used to run IQ-TREE.
     Returns:
         A ``(success, message, aligned_path, tree_path)`` tuple. On failure
         ``success`` is False and ``tree_path`` is None; ``aligned_path`` is
         only set on failure when the alignment exists but no tree file was
         produced. Exceptions are logged and reported through ``message``
         rather than raised.
     """
     try:
         # Normalise once so the length check and the written record use the same text.
         cleaned = sequence.strip().upper()
         if len(cleaned) < 100:
             return False, "Sequence too short (<100 bp).", None, None
         query_id = f"QUERY_{uuid.uuid4().hex[:8]}"
         query_fasta = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}.fa")
         aligned_with_query = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_aligned.fa")
         output_prefix = os.path.join(QUERY_OUTPUT_DIR, f"{query_id}_placed_tree")
         if not os.path.exists(ALIGNMENT_PATH) or not os.path.exists(TREE_PATH):
             return False, "Reference alignment or tree not found.", None, None
         query_record = SeqRecord(Seq(cleaned), id=query_id, description="")
         SeqIO.write([query_record], query_fasta, "fasta")
         # MAFFT writes the alignment to stdout, so stdout is redirected into the file.
         with open(aligned_with_query, "w") as output_file:
             subprocess.run([
                 mafft_cmd, "--add", query_fasta, "--reorder", ALIGNMENT_PATH
             ], stdout=output_file, stderr=subprocess.PIPE, text=True, timeout=600, check=True)
         if not os.path.exists(aligned_with_query) or os.path.getsize(aligned_with_query) == 0:
             return False, "MAFFT alignment failed.", None, None
         subprocess.run([
             iqtree_cmd, "-s", aligned_with_query, "-g", TREE_PATH,
             "-m", "GTR+G", "-pre", output_prefix, "-redo"
         ], capture_output=True, text=True, timeout=1200, check=True)
         treefile = f"{output_prefix}.treefile"
         if not os.path.exists(treefile):
             return False, "IQ-TREE placement failed.", aligned_with_query, None
         success_msg = f"Placement completed!\nQuery ID: {query_id}\nAlignment: {os.path.basename(aligned_with_query)}\nTree: {os.path.basename(treefile)}"
         return True, success_msg, aligned_with_query, treefile
     except subprocess.CalledProcessError as e:
         # str(e) only carries the exit status; the captured stderr explains the failure.
         stderr_tail = (e.stderr or "").strip()[-500:]
         logger.error(f"Phylogenetic placement failed: {e}; stderr: {stderr_tail}")
         detail = f"\n{stderr_tail}" if stderr_tail else ""
         return False, f"Error: {str(e)}{detail}", None, None
     except Exception as e:
         logger.error(f"Phylogenetic placement failed: {e}")
         return False, f"Error: {str(e)}", None, None
     try:
         if not keras_model or not kmer_to_index:
             return "❌ Keras model not available."
         if len(sequence) < 6:
             return "❌ Sequence too short (<6 bp)."
         kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
         indices = [kmer_to_index.get(kmer, 0) for kmer in kmers]
         input_arr = np.array([indices])
         prediction = keras_model.predict(input_arr, verbose=0)[0]
         f_gene_prob = prediction[-1]
         percentage = min(100, max(0, int(f_gene_prob * 100 + 5)))
         return f"✅ {percentage}% F gene confidence"
     except Exception as e:
         logger.error(f"Keras prediction failed: {e}")
         dna_input = dna_input.upper().strip()
         if not dna_input:
             return "❌ Empty input", "", "", "", "", None, None, None, None, "No input", "No input"
         if not re.match('^[ACTGN]+$', dna_input):
             dna_input = ''.join(c if c in 'ACTGN' else 'N' for c in dna_input)
         processed_sequence = dna_input
         boundary_output = ""
         if boundary_model:
             try:
                 processed_sequence = dna_input
         else:
             boundary_output = f"⚠️ Boundary model not available. Using full input: {len(dna_input)} bp"
         keras_output = predict_with_keras(processed_sequence) if processed_sequence and len(processed_sequence) >= 6 else "❌ Sequence too short."
         aligned_file = None
         phy_file = None
         ml_tree_output = ""
         if build_ml_tree and processed_sequence and len(processed_sequence) >= 100:
             try:
                 mafft_available, iqtree_available, mafft_cmd, iqtree_cmd = check_tool_availability()
             ml_tree_output = "❌ Sequence too short for placement (<100 bp)."
         else:
             ml_tree_output = "⚠️ Phylogenetic placement skipped."
         tree_html_content = "No tree generated."
         report_html_content = "No report generated."
         simplified_ml_output = ""
         if analyzer and processed_sequence and len(processed_sequence) >= 10:
             try:
                 tree_result, tree_html_path, report_html_path = analyze_sequence_for_tree(processed_sequence, similarity_score)
                 simplified_ml_output = tree_result
                 if tree_html_path and os.path.exists(tree_html_path):
                     with open(tree_html_path, 'r', encoding='utf-8') as f:
                         tree_html_content = f.read()
                 else:
                     tree_html_content = f"<div style='color: red;'>{tree_result}</div>"
                 if report_html_path and os.path.exists(report_html_path):
                     with open(report_html_path, 'r', encoding='utf-8') as f:
                         report_html_content = f.read()
                 else:
                     report_html_content = f"<div style='color: red;'>{tree_result}</div>"
             except Exception as e:
                 simplified_ml_output = f"❌ Tree analysis error: {str(e)}"
                 tree_html_content = f"<div style='color: red;'>{simplified_ml_output}</div>"
             simplified_ml_output = "❌ Tree analyzer not available." if not analyzer else "❌ Sequence too short (<10 bp)."
             tree_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
             report_html_content = f"<div style='color: orange;'>{simplified_ml_output}</div>"
         summary_output = f"""
 📊 ANALYSIS SUMMARY:
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 Tree Analysis: {'✅ OK' if 'Found' in simplified_ml_output else '❌ Failed'}
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 """
         return (
             boundary_output, keras_output, ml_tree_output, simplified_ml_output, summary_output,
             aligned_file, phy_file, None, None, tree_html_content, report_html_content
         )
     except Exception as e:
         logger.error(f"Pipeline error: {e}")
         error_msg = f"❌ Pipeline Error: {str(e)}"
         return error_msg, "", "", "", "", None, None, None, None, error_msg, error_msg
 def analyze_sequence_for_tree(sequence: str, matching_percentage: float):
     """Run the similarity search and build the tree/report HTML for one query.
     Returns a ``(message, tree_html_path, report_html_path)`` tuple. Both paths
     are None whenever validation or any analysis step fails; the report path is
     also None when the analyzer reports that no detailed report was generated.
     Exceptions are logged and turned into an error message instead of propagating.
     """
     try:
         # Guard clauses: reject the request before touching the analyzer state.
         if not analyzer:
             return "❌ Tree analyzer not initialized.", None, None
         has_enough_bases = bool(sequence) and len(sequence.strip()) >= 10
         if not has_enough_bases:
             return "❌ Invalid sequence.", None, None
         threshold_in_range = 1 <= matching_percentage <= 99
         if not threshold_in_range:
             return "❌ Matching percentage must be 1-99.", None, None
         accepted = analyzer.find_query_sequence(sequence)
         if not accepted:
             return "❌ Sequence not accepted.", None, None
         hits, achieved_pct = analyzer.find_similar_sequences(matching_percentage)
         if not hits:
             return f"❌ No similar sequences at {matching_percentage}% threshold.", None, None
         # The tree structure is built before the figure is requested from the analyzer.
         analyzer.build_tree_structure_with_ml_safe(hits)
         figure = analyzer.create_interactive_tree(hits, achieved_pct)
         # Fall back to a timestamp-based id when the analyzer has no query id.
         run_id = analyzer.query_id or f"query_{int(time.time())}"
         tree_out = os.path.join("/tmp", f'phylogenetic_tree_{run_id}.html')
         figure.write_html(tree_out)
         analyzer.matching_percentage = matching_percentage
         if analyzer.generate_detailed_report(hits, achieved_pct):
             report_out = os.path.join("/tmp", f"detailed_report_{run_id}.html")
         else:
             report_out = None
         return f"✅ Found {len(hits)} sequences at {achieved_pct:.2f}% similarity.", tree_out, report_out
     except Exception as e:
         logger.error(f"Tree analysis failed: {e}")
         return f"❌ Error: {str(e)}", None, None
     try:
         if file_obj is None:
             return ""
         if isinstance(file_obj, str):
             with open(file_obj, "r") as f:
                 content = f.read()
         else:
             content = file_obj.read().decode("utf-8")
         lines = content.strip().split("\n")
         seq_lines = [line.strip() for line in lines if not line.startswith(">")]
         return ''.join(seq_lines)
     except Exception as e:
         logger.error(f"Failed to read FASTA file: {e}")
         return ""
             },
             "paths": {
                 "base_dir": BASE_DIR,
+                "hf_cache": "/tmp/hf_cache",
+                "hf_cache_exists": os.path.exists("/tmp/hf_cache")
             },
             "recommendations": {
+                "models": "Models loaded from Hugging Face" if (boundary_model and keras_model) else "Check HF_TOKEN and repository",
                 "bioinformatics_tools": "Install MAFFT and IQ-TREE" if not (mafft_available and iqtree_available) else "OK"
             }
         }
             content = await file.read()
             temp_file.write(content)
             temp_file_path = temp_file.name
         result = await run_pipeline_from_file(temp_file_path, similarity_score, build_ml_tree)
         return AnalysisResponse(
             boundary_output=result[0] or "",
             keras_output=result[1] or "",
             .error { background-color: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
             """
         ) as iface:
             gr.Markdown("# 🧬 Gene Analysis Pipeline")
             with gr.Row():
                 with gr.Column():
                     status_display = gr.HTML(value=f"""
                         <p>🧬 MAFFT: {'✅ Available' if check_tool_availability()[0] else '❌ Missing'}</p>
                         <p>🌲 IQ-TREE: {'✅ Available' if check_tool_availability()[1] else '❌ Missing'}</p>
                     """)
             with gr.Tabs() as tabs:
                 with gr.TabItem("📝 Text Input"):
                     with gr.Row():
                         with gr.Column(scale=2):
                                 lines=5,
                                 info="Paste your DNA sequence here"
                             )
                         with gr.Column(scale=1):
                             similarity_score = gr.Slider(
                                 minimum=1,
                                 label="🎯 Similarity Threshold (%)",
                                 info="Minimum similarity for tree analysis"
                             )
                             build_ml_tree = gr.Checkbox(
                                 label="🌲 Build ML Tree",
                                 value=False,
                                 info="Generate phylogenetic placement (slower)"
                             )
                             analyze_btn = gr.Button("🔬 Analyze Sequence", variant="primary")
                 with gr.TabItem("📁 File Upload"):
                     with gr.Row():
                         with gr.Column(scale=2):
                                 file_types=[".fasta", ".fa", ".fas", ".txt"],
                                 info="Upload a FASTA file containing your sequence"
                             )
                         with gr.Column(scale=1):
                             file_similarity_score = gr.Slider(
                                 minimum=1,
                                 step=1.0,
                                 label="🎯 Similarity Threshold (%)"
                             )
                             file_build_ml_tree = gr.Checkbox(
                                 label="🌲 Build ML Tree",
                                 value=False
                             )
                             analyze_file_btn = gr.Button("🔬 Analyze File", variant="primary")
             gr.Markdown("## 📊 Analysis Results")
             with gr.Row():
                 with gr.Column():
                     boundary_output = gr.Textbox(
                         interactive=False,
                         lines=2
                     )
                     keras_output = gr.Textbox(
                         label="🧠 F Gene Validation",
                         interactive=False,
                         lines=2
                     )
                 with gr.Column():
                     ml_tree_output = gr.Textbox(
                         label="🌲 Phylogenetic Placement",
                         interactive=False,
                         lines=2
                     )
                     tree_analysis_output = gr.Textbox(
                         label="🌳 Tree Analysis",
                         interactive=False,
                         lines=2
                     )
             summary_output = gr.Textbox(
                 label="📋 Summary",
                 interactive=False,
                 lines=8
             )
             with gr.Row():
                 aligned_file = gr.File(label="📄 Alignment File", visible=False)
                 tree_file = gr.File(label="🌲 Tree File", visible=False)
             with gr.Tabs():
                 with gr.TabItem("🌳 Interactive Tree"):
                     tree_html = gr.HTML(
                         label="Phylogenetic Tree",
                         value="<div style='text-align: center; padding: 20px; color: #666;'>No tree generated yet.</div>"
                     )
                 with gr.TabItem("📊 Detailed Report"):
                     report_html = gr.HTML(
                         label="Analysis Report",
                         value="<div style='text-align: center; padding: 20px; color: #666;'>No report generated yet.</div>"
                     )
             analyze_btn.click(
                 fn=run_pipeline,
                 inputs=[dna_input, similarity_score, build_ml_tree],
                     tree_html, report_html
                 ]
             )
             analyze_file_btn.click(
                 fn=run_pipeline_from_file,
                 inputs=[file_input, file_similarity_score, file_build_ml_tree],
                     tree_html, report_html
                 ]
             )
             gr.Markdown("## 🔬 Example Sequences")
             example_sequences = [
                 ["ATGGACTTCCAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 90.0, False],
                 ["ATGAAACAAATTAACAACCTCAACAACCTCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAACATCAACAAC", 85.0, True],
             ]
             gr.Examples(
                 examples=example_sequences,
                 inputs=[dna_input, similarity_score, build_ml_tree],
                 label="Click to load example sequences"
             )
             with gr.Accordion("❓ Help & Information", open=False):
                 gr.Markdown("""
                 ### 🧬 Gene Analysis Pipeline
                 This tool performs comprehensive analysis of F gene sequences:
                 **🎯 Boundary Detection**: Identifies F gene regions within your sequence
                 **🧠 F Gene Validation**: Validates sequence as F gene using deep learning
                 **🌲 Phylogenetic Placement**: Places sequence in reference phylogeny
                 **🌳 Tree Analysis**: Finds similar sequences and builds interactive trees
                 ### 📋 Input Requirements
                 - DNA sequences in ATCG format
                 - Minimum 10 bp for basic analysis
                 - Minimum 100 bp for phylogenetic placement
                 - FASTA files supported for upload
                 ### ⚙️ Parameters
                 - **Similarity Threshold**: Minimum % similarity for tree analysis (1-99%)
                 - **Build ML Tree**: Enable phylogenetic placement (requires MAFFT/IQ-TREE)
                 ### 📊 Output Files
                 - Alignment files (.fa format)
                 - Tree files (.treefile format)
                 - Interactive HTML visualizations
                 """)
         return iface
     except Exception as e:
         logger.error(f"Failed to create Gradio interface: {e}")
         return None
 # --- Main Application ---
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description="🧬 Gene Analysis Pipeline")
     parser.add_argument("--host", default="0.0.0.0", help="Host address")
     parser.add_argument("--port", type=int, default=7860, help="Port number")
     parser.add_argument("--reload", action="store_true", help="Enable auto-reload")
     parser.add_argument("--gradio-only", action="store_true", help="Run Gradio interface only")
     args = parser.parse_args()
     if args.gradio_only:
         logger.info("🚀 Starting Gradio interface only...")
         iface = create_gradio_interface()
         if iface:
             logger.error("❌ Failed to create Gradio interface")
             sys.exit(1)
     else:
         logger.info(f"🚀 Starting Gene Analysis Pipeline on {args.host}:{args.port}")
         logger.info("📊 API Documentation: http://localhost:7860/docs")
         logger.info("🧬 Gradio Interface: http://localhost:7860/gradio")
         try:
             uvicorn.run(
                 "app:app" if args.reload else app,