Spaces:

saketh11
/

ColiFormer

Running

saketh11 committed on Jan 7

Commit

d66b023

1 Parent(s): 3acd7be

Fix CAI installation for Hugging Face Spaces

- Remove Dockerfile approach (wasn't working in HF Spaces)
- Add CAI auto-installation at app startup before any imports
- Install CAI with --no-use-pep517 flag to avoid wheel naming issues
- Add packages.txt for system dependencies
- Add setup_cai.py as standalone installation script
- This should resolve the 'ModuleNotFoundError: No module named CAI' issue

Files changed (4) hide show

Dockerfile +0 -74
app.py +51 -25
packages.txt +4 -0
setup_cai.py +118 -0

Dockerfile DELETED Viewed

@@ -1,74 +0,0 @@
-FROM python:3.10
-WORKDIR /app
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    git \
-    git-lfs \
-    ffmpeg \
-    libsm6 \
-    libxext6 \
-    cmake \
-    rsync \
-    libgl1 \
-    && rm -rf /var/lib/apt/lists/* \
-    && git lfs install
-# Upgrade pip and install base packages
-RUN pip install --no-cache-dir pip -U && \
-    pip install --no-cache-dir \
-        setuptools \
-        wheel \
-        datasets \
-        "huggingface-hub>=0.30" \
-        "hf-transfer>=0.1.4" \
-        "protobuf<4" \
-        "click<8.1" \
-        "pydantic~=1.0"
-# Install CAI package first with legacy build system to avoid wheel naming issues
-RUN pip install --no-use-pep517 --no-cache-dir \
-    git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git@b6e017a92c58829f6a5aec8c26a21262bc2a6610
-# Verify CAI installation
-RUN python -c "import CAI; from CAI import CAI as cai_func, relative_adaptiveness; print('✅ CAI package verified successfully')"
-# Copy requirements file and install remaining dependencies
-COPY requirements.txt /tmp/requirements.txt
-RUN pip install --no-cache-dir -r /tmp/requirements.txt
-# Install additional streamlit dependencies
-RUN pip install --no-cache-dir \
-    streamlit==1.28.1 \
-    "uvicorn>=0.14.0" \
-    spaces
-# Create user directory structure
-RUN mkdir -p .streamlit && \
-    git config --global core.excludesfile ~/.gitignore && \
-    echo ".streamlit" > ~/.gitignore
-RUN mkdir -p /home/user && \
-    ( [ -e /home/user/app ] || ln -s /app/ /home/user/app ) || true
-# Copy application files
-COPY --link ./ /app
-# Verify application can import CAI after copying files
-RUN python -c "from CodonTransformer.CodonEvaluation import *; print('✅ CodonTransformer.CodonEvaluation imports successfully')"
-# Set up environment
-ENV PYTHONPATH=/app
-ENV STREAMLIT_SERVER_HEADLESS=true
-ENV STREAMLIT_SERVER_PORT=7860
-ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
-# Expose port
-EXPOSE 7860
-# Health check
-HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
-# Run the application
-CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]

app.py CHANGED Viewed

@@ -1,3 +1,31 @@
 import streamlit as st
 import torch
 import pandas as pd
@@ -14,8 +42,6 @@ import warnings
 warnings.filterwarnings("ignore")
 # Import CodonTransformer modules
-import sys
-import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from CodonTransformer.CodonPrediction import (
@@ -220,7 +246,7 @@ def validate_sequence(sequence: str) -> Tuple[bool, str, str, str]:
     if sequence_chars.issubset(dna_chars):
         if len(sequence) < 3:
             return False, "DNA sequence must be at least 3 nucleotides long", "dna", sequence
         # Auto-fix DNA sequences not divisible by 3
         if len(sequence) % 3 != 0:
             remainder = len(sequence) % 3
@@ -229,7 +255,7 @@ def validate_sequence(sequence: str) -> Tuple[bool, str, str, str]:
         else:
             fixed_sequence = sequence
             message = "Valid DNA sequence"
         return True, message, "dna", fixed_sequence
     # If contains protein-specific amino acids, treat as protein
@@ -654,7 +680,7 @@ def main():
 def single_sequence_optimization():
     """Single sequence optimization interface - enhanced from original functionality"""
-    # Sidebar configuration
     st.sidebar.header("🔧 Configuration")
     organism_options = [
         "Escherichia coli general",
@@ -687,17 +713,17 @@ def single_sequence_optimization():
     )
     if not POST_PROCESSING_AVAILABLE:
         st.sidebar.warning("⚠️ DNAChisel not available. Install with: pip install dnachisel")
     # Dataset Information
     st.sidebar.markdown("---")
     st.sidebar.markdown("### 📊 Dataset Information")
     st.sidebar.markdown("""
     - **Dataset**: [ColiFormer-Data](https://huggingface.co/datasets/saketh11/ColiFormer-Data)
     - **Training**: 4,300 high-CAI E. coli sequences
-    - **Reference**: 50,000+ E. coli gene sequences
     - **Auto-download**: CAI weights & tAI coefficients
     """)
     # Model Information
     st.sidebar.markdown("### 🤖 Model Information")
     st.sidebar.markdown("""
@@ -853,10 +879,10 @@ def single_sequence_optimization():
                 st.error(f"❌ **Optimization Failed:** {st.session_state.results}")
             else:
                 display_optimization_results(
-                    st.session_state.results,
-                    st.session_state.get('organism', organism),
-                    st.session_state.get('sequence_clean', ''),
-                    st.session_state.get('sequence_type', 'protein'),
                     st.session_state.get('input_metrics', {})
                 )
@@ -1226,20 +1252,20 @@ def display_batch_results():
     # CAI Extremes Analysis
     st.subheader("🎯 CAI Performance Analysis")
     # Filter out rows with NaN CAI values for analysis
     valid_cai_df = results_df.dropna(subset=['cai_after'])
     if len(valid_cai_df) > 0:
         # Find lowest and highest CAI sequences
         lowest_cai_idx = valid_cai_df['cai_after'].idxmin()
         highest_cai_idx = valid_cai_df['cai_after'].idxmax()
         lowest_cai_row = results_df.loc[lowest_cai_idx]
         highest_cai_row = results_df.loc[highest_cai_idx]
         col1, col2 = st.columns(2)
         with col1:
             st.markdown("**🔻 Lowest CAI Sequence**")
             st.write(f"**Name:** {lowest_cai_row['name']}")
@@ -1247,12 +1273,12 @@ def display_batch_results():
             st.metric("GC Content", f"{lowest_cai_row['gc_content_after']:.1f}%")
             st.metric("tAI Score", f"{lowest_cai_row['tai_after']:.3f}")
             st.metric("Length", f"{lowest_cai_row['length_after']} bp")
             # Show improvement
             if pd.notna(lowest_cai_row['cai_before']):
                 cai_improvement = lowest_cai_row['cai_after'] - lowest_cai_row['cai_before']
                 st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
         with col2:
             st.markdown("**🔺 Highest CAI Sequence**")
             st.write(f"**Name:** {highest_cai_row['name']}")
@@ -1260,12 +1286,12 @@ def display_batch_results():
             st.metric("GC Content", f"{highest_cai_row['gc_content_after']:.1f}%")
             st.metric("tAI Score", f"{highest_cai_row['tai_after']:.3f}")
             st.metric("Length", f"{highest_cai_row['length_after']} bp")
             # Show improvement
             if pd.notna(highest_cai_row['cai_before']):
                 cai_improvement = highest_cai_row['cai_after'] - highest_cai_row['cai_before']
                 st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
         # CAI Distribution Chart
         st.subheader("📊 CAI Distribution")
         fig = go.Figure()
@@ -1276,7 +1302,7 @@ def display_batch_results():
             marker_color='darkblue',
             opacity=0.7
         ))
         # Add vertical lines for lowest and highest
         fig.add_vline(
             x=lowest_cai_row['cai_after'],
@@ -1286,11 +1312,11 @@ def display_batch_results():
         )
         fig.add_vline(
             x=highest_cai_row['cai_after'],
-            line_dash="dash",
             line_color="green",
             annotation_text=f"Highest: {highest_cai_row['cai_after']:.3f}"
         )
         fig.update_layout(
             title="Distribution of Optimized CAI Scores",
             xaxis_title="CAI Score",
@@ -1339,7 +1365,7 @@ def display_batch_results():
             st.plotly_chart(fig_gc, use_container_width=True)
         else:
             st.warning("⚠️ No valid GC content values found in the batch results.")
     else:
         st.warning("⚠️ No valid CAI scores found in the batch results. Check if CAI weights are properly loaded.")

+# Setup CAI package before any other imports
+import sys
+import os
+# Check and setup CAI package if needed
+def setup_cai_if_needed():
+    try:
+        import CAI
+        return True
+    except ImportError:
+        print("CAI not found, attempting to install...")
+        try:
+            import subprocess
+            # Install CAI with legacy build system
+            subprocess.check_call([
+                sys.executable, "-m", "pip", "install", "--no-use-pep517", "--no-cache-dir",
+                "git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git@b6e017a92c58829f6a5aec8c26a21262bc2a6610"
+            ])
+            import CAI
+            print("✅ CAI installed successfully")
+            return True
+        except Exception as e:
+            print(f"❌ Failed to install CAI: {e}")
+            return False
+# Setup CAI before any other imports that might need it
+setup_cai_if_needed()
 import streamlit as st
 import torch
 import pandas as pd
 warnings.filterwarnings("ignore")
 # Import CodonTransformer modules
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from CodonTransformer.CodonPrediction import (
     if sequence_chars.issubset(dna_chars):
         if len(sequence) < 3:
             return False, "DNA sequence must be at least 3 nucleotides long", "dna", sequence
         # Auto-fix DNA sequences not divisible by 3
         if len(sequence) % 3 != 0:
             remainder = len(sequence) % 3
         else:
             fixed_sequence = sequence
             message = "Valid DNA sequence"
         return True, message, "dna", fixed_sequence
     # If contains protein-specific amino acids, treat as protein
 def single_sequence_optimization():
     """Single sequence optimization interface - enhanced from original functionality"""
+    # Sidebar configuration
     st.sidebar.header("🔧 Configuration")
     organism_options = [
         "Escherichia coli general",
     )
     if not POST_PROCESSING_AVAILABLE:
         st.sidebar.warning("⚠️ DNAChisel not available. Install with: pip install dnachisel")
     # Dataset Information
     st.sidebar.markdown("---")
     st.sidebar.markdown("### 📊 Dataset Information")
     st.sidebar.markdown("""
     - **Dataset**: [ColiFormer-Data](https://huggingface.co/datasets/saketh11/ColiFormer-Data)
     - **Training**: 4,300 high-CAI E. coli sequences
+    - **Reference**: 50,000+ E. coli gene sequences
     - **Auto-download**: CAI weights & tAI coefficients
     """)
     # Model Information
     st.sidebar.markdown("### 🤖 Model Information")
     st.sidebar.markdown("""
                 st.error(f"❌ **Optimization Failed:** {st.session_state.results}")
             else:
                 display_optimization_results(
+                    st.session_state.results,
+                    st.session_state.get('organism', organism),
+                    st.session_state.get('sequence_clean', ''),
+                    st.session_state.get('sequence_type', 'protein'),
                     st.session_state.get('input_metrics', {})
                 )
     # CAI Extremes Analysis
     st.subheader("🎯 CAI Performance Analysis")
     # Filter out rows with NaN CAI values for analysis
     valid_cai_df = results_df.dropna(subset=['cai_after'])
     if len(valid_cai_df) > 0:
         # Find lowest and highest CAI sequences
         lowest_cai_idx = valid_cai_df['cai_after'].idxmin()
         highest_cai_idx = valid_cai_df['cai_after'].idxmax()
         lowest_cai_row = results_df.loc[lowest_cai_idx]
         highest_cai_row = results_df.loc[highest_cai_idx]
         col1, col2 = st.columns(2)
         with col1:
             st.markdown("**🔻 Lowest CAI Sequence**")
             st.write(f"**Name:** {lowest_cai_row['name']}")
             st.metric("GC Content", f"{lowest_cai_row['gc_content_after']:.1f}%")
             st.metric("tAI Score", f"{lowest_cai_row['tai_after']:.3f}")
             st.metric("Length", f"{lowest_cai_row['length_after']} bp")
             # Show improvement
             if pd.notna(lowest_cai_row['cai_before']):
                 cai_improvement = lowest_cai_row['cai_after'] - lowest_cai_row['cai_before']
                 st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
         with col2:
             st.markdown("**🔺 Highest CAI Sequence**")
             st.write(f"**Name:** {highest_cai_row['name']}")
             st.metric("GC Content", f"{highest_cai_row['gc_content_after']:.1f}%")
             st.metric("tAI Score", f"{highest_cai_row['tai_after']:.3f}")
             st.metric("Length", f"{highest_cai_row['length_after']} bp")
             # Show improvement
             if pd.notna(highest_cai_row['cai_before']):
                 cai_improvement = highest_cai_row['cai_after'] - highest_cai_row['cai_before']
                 st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
         # CAI Distribution Chart
         st.subheader("📊 CAI Distribution")
         fig = go.Figure()
             marker_color='darkblue',
             opacity=0.7
         ))
         # Add vertical lines for lowest and highest
         fig.add_vline(
             x=lowest_cai_row['cai_after'],
         )
         fig.add_vline(
             x=highest_cai_row['cai_after'],
+            line_dash="dash",
             line_color="green",
             annotation_text=f"Highest: {highest_cai_row['cai_after']:.3f}"
         )
         fig.update_layout(
             title="Distribution of Optimized CAI Scores",
             xaxis_title="CAI Score",
             st.plotly_chart(fig_gc, use_container_width=True)
         else:
             st.warning("⚠️ No valid GC content values found in the batch results.")
     else:
         st.warning("⚠️ No valid CAI scores found in the batch results. Check if CAI weights are properly loaded.")

packages.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+git
+git-lfs
+build-essential
+python3-dev

setup_cai.py ADDED Viewed

	@@ -0,0 +1,118 @@

+#!/usr/bin/env python3
+"""
+Setup script for CAI package installation in ColiFormer.
+This script handles the installation of the CAI package with proper build flags
+to avoid wheel naming issues that occur with standard pip install.
+"""
+import subprocess
+import sys
+import os
+import importlib.util
+def check_cai_installed():
+    """Check if CAI package is already installed and working."""
+    try:
+        spec = importlib.util.find_spec("CAI")
+        if spec is None:
+            return False
+        # Try to import the specific functions we need
+        import CAI
+        from CAI import CAI as cai_func, relative_adaptiveness
+        print("✅ CAI package is already installed and working")
+        return True
+    except ImportError as e:
+        print(f"❌ CAI package not found or not working: {e}")
+        return False
+def install_cai():
+    """Install CAI package with proper build configuration."""
+    print("🔧 Installing CAI package...")
+    cai_repo = "git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git@b6e017a92c58829f6a5aec8c26a21262bc2a6610"
+    try:
+        # First ensure we have build tools
+        print("Installing build dependencies...")
+        subprocess.run([
+            sys.executable, "-m", "pip", "install", "--upgrade",
+            "setuptools>=65.0", "wheel>=0.37.0", "pip>=21.0"
+        ], check=True, capture_output=True)
+        # Try installing with --no-use-pep517 flag first (preferred method)
+        print("Attempting CAI installation with legacy build system...")
+        try:
+            result = subprocess.run([
+                sys.executable, "-m", "pip", "install",
+                "--no-use-pep517", "--no-cache-dir", cai_repo
+            ], check=True, capture_output=True, text=True)
+            print("✅ CAI installed successfully with legacy build")
+            return True
+        except subprocess.CalledProcessError:
+            print("⚠️ Legacy build failed, trying standard installation...")
+        # Fallback to standard installation
+        result = subprocess.run([
+            sys.executable, "-m", "pip", "install", "--no-cache-dir", cai_repo
+        ], check=True, capture_output=True, text=True)
+        print("✅ CAI installed successfully with standard build")
+        return True
+    except subprocess.CalledProcessError as e:
+        print(f"❌ CAI installation failed: {e}")
+        if hasattr(e, 'stderr') and e.stderr:
+            print(f"Error output: {e.stderr}")
+        return False
+    except Exception as e:
+        print(f"❌ Unexpected error during CAI installation: {e}")
+        return False
+def verify_cai_installation():
+    """Verify that CAI package is working correctly."""
+    try:
+        import CAI
+        from CAI import CAI as cai_func, relative_adaptiveness
+        # Test basic functionality
+        test_sequences = ["ATGAAATAA", "ATGGGCTAA"]
+        weights = relative_adaptiveness(sequences=test_sequences)
+        cai_score = cai_func("ATGAAATAA", weights=weights)
+        print(f"✅ CAI verification successful (test score: {cai_score:.3f})")
+        return True
+    except Exception as e:
+        print(f"❌ CAI verification failed: {e}")
+        return False
+def main():
+    """Main setup function."""
+    print("ColiFormer CAI Setup")
+    print("=" * 50)
+    # Check if already installed
+    if check_cai_installed():
+        if verify_cai_installation():
+            print("🎉 CAI is ready to use!")
+            return True
+        else:
+            print("⚠️ CAI is installed but not working properly, reinstalling...")
+    # Install CAI
+    if install_cai():
+        # Verify installation
+        if verify_cai_installation():
+            print("🎉 CAI setup completed successfully!")
+            return True
+        else:
+            print("💥 CAI installation verification failed!")
+            return False
+    else:
+        print("💥 CAI installation failed!")
+        return False
+if __name__ == "__main__":  # only run the setup when executed as a script
+    success = main()
+    sys.exit(0 if success else 1)  # exit status 0 on success, 1 on failure