saketh11 committed on
Commit
d66b023
·
1 Parent(s): 3acd7be

Fix CAI installation for Hugging Face Spaces

Browse files

- Remove Dockerfile approach (wasn't working in HF Spaces)
- Add CAI auto-installation at app startup before any imports
- Install CAI with --no-use-pep517 flag to avoid wheel naming issues
- Add packages.txt for system dependencies
- Add setup_cai.py as standalone installation script
- This should resolve the 'ModuleNotFoundError: No module named CAI' issue

Files changed (4) hide show
  1. Dockerfile +0 -74
  2. app.py +51 -25
  3. packages.txt +4 -0
  4. setup_cai.py +118 -0
Dockerfile DELETED
@@ -1,74 +0,0 @@
1
- FROM python:3.10
2
-
3
- WORKDIR /app
4
-
5
- # Install system dependencies
6
- RUN apt-get update && apt-get install -y \
7
- git \
8
- git-lfs \
9
- ffmpeg \
10
- libsm6 \
11
- libxext6 \
12
- cmake \
13
- rsync \
14
- libgl1 \
15
- && rm -rf /var/lib/apt/lists/* \
16
- && git lfs install
17
-
18
- # Upgrade pip and install base packages
19
- RUN pip install --no-cache-dir pip -U && \
20
- pip install --no-cache-dir \
21
- setuptools \
22
- wheel \
23
- datasets \
24
- "huggingface-hub>=0.30" \
25
- "hf-transfer>=0.1.4" \
26
- "protobuf<4" \
27
- "click<8.1" \
28
- "pydantic~=1.0"
29
-
30
- # Install CAI package first with legacy build system to avoid wheel naming issues
31
- RUN pip install --no-use-pep517 --no-cache-dir \
32
- git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git@b6e017a92c58829f6a5aec8c26a21262bc2a6610
33
-
34
- # Verify CAI installation
35
- RUN python -c "import CAI; from CAI import CAI as cai_func, relative_adaptiveness; print('✅ CAI package verified successfully')"
36
-
37
- # Copy requirements file and install remaining dependencies
38
- COPY requirements.txt /tmp/requirements.txt
39
- RUN pip install --no-cache-dir -r /tmp/requirements.txt
40
-
41
- # Install additional streamlit dependencies
42
- RUN pip install --no-cache-dir \
43
- streamlit==1.28.1 \
44
- "uvicorn>=0.14.0" \
45
- spaces
46
-
47
- # Create user directory structure
48
- RUN mkdir -p .streamlit && \
49
- git config --global core.excludesfile ~/.gitignore && \
50
- echo ".streamlit" > ~/.gitignore
51
-
52
- RUN mkdir -p /home/user && \
53
- ( [ -e /home/user/app ] || ln -s /app/ /home/user/app ) || true
54
-
55
- # Copy application files
56
- COPY --link ./ /app
57
-
58
- # Verify application can import CAI after copying files
59
- RUN python -c "from CodonTransformer.CodonEvaluation import *; print('✅ CodonTransformer.CodonEvaluation imports successfully')"
60
-
61
- # Set up environment
62
- ENV PYTHONPATH=/app
63
- ENV STREAMLIT_SERVER_HEADLESS=true
64
- ENV STREAMLIT_SERVER_PORT=7860
65
- ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
66
-
67
- # Expose port
68
- EXPOSE 7860
69
-
70
- # Health check
71
- HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
72
-
73
- # Run the application
74
- CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,3 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import torch
3
  import pandas as pd
@@ -14,8 +42,6 @@ import warnings
14
  warnings.filterwarnings("ignore")
15
 
16
  # Import CodonTransformer modules
17
- import sys
18
- import os
19
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20
 
21
  from CodonTransformer.CodonPrediction import (
@@ -220,7 +246,7 @@ def validate_sequence(sequence: str) -> Tuple[bool, str, str, str]:
220
  if sequence_chars.issubset(dna_chars):
221
  if len(sequence) < 3:
222
  return False, "DNA sequence must be at least 3 nucleotides long", "dna", sequence
223
-
224
  # Auto-fix DNA sequences not divisible by 3
225
  if len(sequence) % 3 != 0:
226
  remainder = len(sequence) % 3
@@ -229,7 +255,7 @@ def validate_sequence(sequence: str) -> Tuple[bool, str, str, str]:
229
  else:
230
  fixed_sequence = sequence
231
  message = "Valid DNA sequence"
232
-
233
  return True, message, "dna", fixed_sequence
234
 
235
  # If contains protein-specific amino acids, treat as protein
@@ -654,7 +680,7 @@ def main():
654
 
655
  def single_sequence_optimization():
656
  """Single sequence optimization interface - enhanced from original functionality"""
657
- # Sidebar configuration
658
  st.sidebar.header("🔧 Configuration")
659
  organism_options = [
660
  "Escherichia coli general",
@@ -687,17 +713,17 @@ def single_sequence_optimization():
687
  )
688
  if not POST_PROCESSING_AVAILABLE:
689
  st.sidebar.warning("⚠️ DNAChisel not available. Install with: pip install dnachisel")
690
-
691
  # Dataset Information
692
  st.sidebar.markdown("---")
693
  st.sidebar.markdown("### 📊 Dataset Information")
694
  st.sidebar.markdown("""
695
  - **Dataset**: [ColiFormer-Data](https://huggingface.co/datasets/saketh11/ColiFormer-Data)
696
  - **Training**: 4,300 high-CAI E. coli sequences
697
- - **Reference**: 50,000+ E. coli gene sequences
698
  - **Auto-download**: CAI weights & tAI coefficients
699
  """)
700
-
701
  # Model Information
702
  st.sidebar.markdown("### 🤖 Model Information")
703
  st.sidebar.markdown("""
@@ -853,10 +879,10 @@ def single_sequence_optimization():
853
  st.error(f"❌ **Optimization Failed:** {st.session_state.results}")
854
  else:
855
  display_optimization_results(
856
- st.session_state.results,
857
- st.session_state.get('organism', organism),
858
- st.session_state.get('sequence_clean', ''),
859
- st.session_state.get('sequence_type', 'protein'),
860
  st.session_state.get('input_metrics', {})
861
  )
862
 
@@ -1226,20 +1252,20 @@ def display_batch_results():
1226
 
1227
  # CAI Extremes Analysis
1228
  st.subheader("🎯 CAI Performance Analysis")
1229
-
1230
  # Filter out rows with NaN CAI values for analysis
1231
  valid_cai_df = results_df.dropna(subset=['cai_after'])
1232
-
1233
  if len(valid_cai_df) > 0:
1234
  # Find lowest and highest CAI sequences
1235
  lowest_cai_idx = valid_cai_df['cai_after'].idxmin()
1236
  highest_cai_idx = valid_cai_df['cai_after'].idxmax()
1237
-
1238
  lowest_cai_row = results_df.loc[lowest_cai_idx]
1239
  highest_cai_row = results_df.loc[highest_cai_idx]
1240
-
1241
  col1, col2 = st.columns(2)
1242
-
1243
  with col1:
1244
  st.markdown("**🔻 Lowest CAI Sequence**")
1245
  st.write(f"**Name:** {lowest_cai_row['name']}")
@@ -1247,12 +1273,12 @@ def display_batch_results():
1247
  st.metric("GC Content", f"{lowest_cai_row['gc_content_after']:.1f}%")
1248
  st.metric("tAI Score", f"{lowest_cai_row['tai_after']:.3f}")
1249
  st.metric("Length", f"{lowest_cai_row['length_after']} bp")
1250
-
1251
  # Show improvement
1252
  if pd.notna(lowest_cai_row['cai_before']):
1253
  cai_improvement = lowest_cai_row['cai_after'] - lowest_cai_row['cai_before']
1254
  st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
1255
-
1256
  with col2:
1257
  st.markdown("**🔺 Highest CAI Sequence**")
1258
  st.write(f"**Name:** {highest_cai_row['name']}")
@@ -1260,12 +1286,12 @@ def display_batch_results():
1260
  st.metric("GC Content", f"{highest_cai_row['gc_content_after']:.1f}%")
1261
  st.metric("tAI Score", f"{highest_cai_row['tai_after']:.3f}")
1262
  st.metric("Length", f"{highest_cai_row['length_after']} bp")
1263
-
1264
  # Show improvement
1265
  if pd.notna(highest_cai_row['cai_before']):
1266
  cai_improvement = highest_cai_row['cai_after'] - highest_cai_row['cai_before']
1267
  st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
1268
-
1269
  # CAI Distribution Chart
1270
  st.subheader("📊 CAI Distribution")
1271
  fig = go.Figure()
@@ -1276,7 +1302,7 @@ def display_batch_results():
1276
  marker_color='darkblue',
1277
  opacity=0.7
1278
  ))
1279
-
1280
  # Add vertical lines for lowest and highest
1281
  fig.add_vline(
1282
  x=lowest_cai_row['cai_after'],
@@ -1286,11 +1312,11 @@ def display_batch_results():
1286
  )
1287
  fig.add_vline(
1288
  x=highest_cai_row['cai_after'],
1289
- line_dash="dash",
1290
  line_color="green",
1291
  annotation_text=f"Highest: {highest_cai_row['cai_after']:.3f}"
1292
  )
1293
-
1294
  fig.update_layout(
1295
  title="Distribution of Optimized CAI Scores",
1296
  xaxis_title="CAI Score",
@@ -1339,7 +1365,7 @@ def display_batch_results():
1339
  st.plotly_chart(fig_gc, use_container_width=True)
1340
  else:
1341
  st.warning("⚠️ No valid GC content values found in the batch results.")
1342
-
1343
  else:
1344
  st.warning("⚠️ No valid CAI scores found in the batch results. Check if CAI weights are properly loaded.")
1345
 
 
1
+ # Setup CAI package before any other imports
2
+ import sys
3
+ import os
4
+
5
+ # Check and setup CAI package if needed
6
+ def setup_cai_if_needed():
7
+ try:
8
+ import CAI
9
+ return True
10
+ except ImportError:
11
+ print("CAI not found, attempting to install...")
12
+ try:
13
+ import subprocess
14
+ # Install CAI with legacy build system
15
+ subprocess.check_call([
16
+ sys.executable, "-m", "pip", "install", "--no-use-pep517", "--no-cache-dir",
17
+ "git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git@b6e017a92c58829f6a5aec8c26a21262bc2a6610"
18
+ ])
19
+ import CAI
20
+ print("✅ CAI installed successfully")
21
+ return True
22
+ except Exception as e:
23
+ print(f"❌ Failed to install CAI: {e}")
24
+ return False
25
+
26
+ # Setup CAI before any other imports that might need it
27
+ setup_cai_if_needed()
28
+
29
  import streamlit as st
30
  import torch
31
  import pandas as pd
 
42
  warnings.filterwarnings("ignore")
43
 
44
  # Import CodonTransformer modules
 
 
45
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
46
 
47
  from CodonTransformer.CodonPrediction import (
 
246
  if sequence_chars.issubset(dna_chars):
247
  if len(sequence) < 3:
248
  return False, "DNA sequence must be at least 3 nucleotides long", "dna", sequence
249
+
250
  # Auto-fix DNA sequences not divisible by 3
251
  if len(sequence) % 3 != 0:
252
  remainder = len(sequence) % 3
 
255
  else:
256
  fixed_sequence = sequence
257
  message = "Valid DNA sequence"
258
+
259
  return True, message, "dna", fixed_sequence
260
 
261
  # If contains protein-specific amino acids, treat as protein
 
680
 
681
  def single_sequence_optimization():
682
  """Single sequence optimization interface - enhanced from original functionality"""
683
+ # Sidebar configuration
684
  st.sidebar.header("🔧 Configuration")
685
  organism_options = [
686
  "Escherichia coli general",
 
713
  )
714
  if not POST_PROCESSING_AVAILABLE:
715
  st.sidebar.warning("⚠️ DNAChisel not available. Install with: pip install dnachisel")
716
+
717
  # Dataset Information
718
  st.sidebar.markdown("---")
719
  st.sidebar.markdown("### 📊 Dataset Information")
720
  st.sidebar.markdown("""
721
  - **Dataset**: [ColiFormer-Data](https://huggingface.co/datasets/saketh11/ColiFormer-Data)
722
  - **Training**: 4,300 high-CAI E. coli sequences
723
+ - **Reference**: 50,000+ E. coli gene sequences
724
  - **Auto-download**: CAI weights & tAI coefficients
725
  """)
726
+
727
  # Model Information
728
  st.sidebar.markdown("### 🤖 Model Information")
729
  st.sidebar.markdown("""
 
879
  st.error(f"❌ **Optimization Failed:** {st.session_state.results}")
880
  else:
881
  display_optimization_results(
882
+ st.session_state.results,
883
+ st.session_state.get('organism', organism),
884
+ st.session_state.get('sequence_clean', ''),
885
+ st.session_state.get('sequence_type', 'protein'),
886
  st.session_state.get('input_metrics', {})
887
  )
888
 
 
1252
 
1253
  # CAI Extremes Analysis
1254
  st.subheader("🎯 CAI Performance Analysis")
1255
+
1256
  # Filter out rows with NaN CAI values for analysis
1257
  valid_cai_df = results_df.dropna(subset=['cai_after'])
1258
+
1259
  if len(valid_cai_df) > 0:
1260
  # Find lowest and highest CAI sequences
1261
  lowest_cai_idx = valid_cai_df['cai_after'].idxmin()
1262
  highest_cai_idx = valid_cai_df['cai_after'].idxmax()
1263
+
1264
  lowest_cai_row = results_df.loc[lowest_cai_idx]
1265
  highest_cai_row = results_df.loc[highest_cai_idx]
1266
+
1267
  col1, col2 = st.columns(2)
1268
+
1269
  with col1:
1270
  st.markdown("**🔻 Lowest CAI Sequence**")
1271
  st.write(f"**Name:** {lowest_cai_row['name']}")
 
1273
  st.metric("GC Content", f"{lowest_cai_row['gc_content_after']:.1f}%")
1274
  st.metric("tAI Score", f"{lowest_cai_row['tai_after']:.3f}")
1275
  st.metric("Length", f"{lowest_cai_row['length_after']} bp")
1276
+
1277
  # Show improvement
1278
  if pd.notna(lowest_cai_row['cai_before']):
1279
  cai_improvement = lowest_cai_row['cai_after'] - lowest_cai_row['cai_before']
1280
  st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
1281
+
1282
  with col2:
1283
  st.markdown("**🔺 Highest CAI Sequence**")
1284
  st.write(f"**Name:** {highest_cai_row['name']}")
 
1286
  st.metric("GC Content", f"{highest_cai_row['gc_content_after']:.1f}%")
1287
  st.metric("tAI Score", f"{highest_cai_row['tai_after']:.3f}")
1288
  st.metric("Length", f"{highest_cai_row['length_after']} bp")
1289
+
1290
  # Show improvement
1291
  if pd.notna(highest_cai_row['cai_before']):
1292
  cai_improvement = highest_cai_row['cai_after'] - highest_cai_row['cai_before']
1293
  st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
1294
+
1295
  # CAI Distribution Chart
1296
  st.subheader("📊 CAI Distribution")
1297
  fig = go.Figure()
 
1302
  marker_color='darkblue',
1303
  opacity=0.7
1304
  ))
1305
+
1306
  # Add vertical lines for lowest and highest
1307
  fig.add_vline(
1308
  x=lowest_cai_row['cai_after'],
 
1312
  )
1313
  fig.add_vline(
1314
  x=highest_cai_row['cai_after'],
1315
+ line_dash="dash",
1316
  line_color="green",
1317
  annotation_text=f"Highest: {highest_cai_row['cai_after']:.3f}"
1318
  )
1319
+
1320
  fig.update_layout(
1321
  title="Distribution of Optimized CAI Scores",
1322
  xaxis_title="CAI Score",
 
1365
  st.plotly_chart(fig_gc, use_container_width=True)
1366
  else:
1367
  st.warning("⚠️ No valid GC content values found in the batch results.")
1368
+
1369
  else:
1370
  st.warning("⚠️ No valid CAI scores found in the batch results. Check if CAI weights are properly loaded.")
1371
 
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ git
2
+ git-lfs
3
+ build-essential
4
+ python3-dev
setup_cai.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Setup script for CAI package installation in ColiFormer.
4
+ This script handles the installation of the CAI package with proper build flags
5
+ to avoid wheel naming issues that occur with standard pip install.
6
+ """
7
+
8
+ import subprocess
9
+ import sys
10
+ import os
11
+ import importlib.util
12
+
13
+ def check_cai_installed():
14
+ """Check if CAI package is already installed and working."""
15
+ try:
16
+ spec = importlib.util.find_spec("CAI")
17
+ if spec is None:
18
+ return False
19
+
20
+ # Try to import the specific functions we need
21
+ import CAI
22
+ from CAI import CAI as cai_func, relative_adaptiveness
23
+ print("✅ CAI package is already installed and working")
24
+ return True
25
+ except ImportError as e:
26
+ print(f"❌ CAI package not found or not working: {e}")
27
+ return False
28
+
29
+ def install_cai():
30
+ """Install CAI package with proper build configuration."""
31
+ print("🔧 Installing CAI package...")
32
+
33
+ cai_repo = "git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git@b6e017a92c58829f6a5aec8c26a21262bc2a6610"
34
+
35
+ try:
36
+ # First ensure we have build tools
37
+ print("Installing build dependencies...")
38
+ subprocess.run([
39
+ sys.executable, "-m", "pip", "install", "--upgrade",
40
+ "setuptools>=65.0", "wheel>=0.37.0", "pip>=21.0"
41
+ ], check=True, capture_output=True)
42
+
43
+ # Try installing with --no-use-pep517 flag first (preferred method)
44
+ print("Attempting CAI installation with legacy build system...")
45
+ try:
46
+ result = subprocess.run([
47
+ sys.executable, "-m", "pip", "install",
48
+ "--no-use-pep517", "--no-cache-dir", cai_repo
49
+ ], check=True, capture_output=True, text=True)
50
+ print("✅ CAI installed successfully with legacy build")
51
+ return True
52
+ except subprocess.CalledProcessError:
53
+ print("⚠️ Legacy build failed, trying standard installation...")
54
+
55
+ # Fallback to standard installation
56
+ result = subprocess.run([
57
+ sys.executable, "-m", "pip", "install", "--no-cache-dir", cai_repo
58
+ ], check=True, capture_output=True, text=True)
59
+
60
+ print("✅ CAI installed successfully with standard build")
61
+ return True
62
+
63
+ except subprocess.CalledProcessError as e:
64
+ print(f"❌ CAI installation failed: {e}")
65
+ if hasattr(e, 'stderr') and e.stderr:
66
+ print(f"Error output: {e.stderr}")
67
+ return False
68
+ except Exception as e:
69
+ print(f"❌ Unexpected error during CAI installation: {e}")
70
+ return False
71
+
72
+ def verify_cai_installation():
73
+ """Verify that CAI package is working correctly."""
74
+ try:
75
+ import CAI
76
+ from CAI import CAI as cai_func, relative_adaptiveness
77
+
78
+ # Test basic functionality
79
+ test_sequences = ["ATGAAATAA", "ATGGGCTAA"]
80
+ weights = relative_adaptiveness(sequences=test_sequences)
81
+ cai_score = cai_func("ATGAAATAA", weights=weights)
82
+
83
+ print(f"✅ CAI verification successful (test score: {cai_score:.3f})")
84
+ return True
85
+
86
+ except Exception as e:
87
+ print(f"❌ CAI verification failed: {e}")
88
+ return False
89
+
90
+ def main():
91
+ """Main setup function."""
92
+ print("ColiFormer CAI Setup")
93
+ print("=" * 50)
94
+
95
+ # Check if already installed
96
+ if check_cai_installed():
97
+ if verify_cai_installation():
98
+ print("🎉 CAI is ready to use!")
99
+ return True
100
+ else:
101
+ print("⚠️ CAI is installed but not working properly, reinstalling...")
102
+
103
+ # Install CAI
104
+ if install_cai():
105
+ # Verify installation
106
+ if verify_cai_installation():
107
+ print("🎉 CAI setup completed successfully!")
108
+ return True
109
+ else:
110
+ print("💥 CAI installation verification failed!")
111
+ return False
112
+ else:
113
+ print("💥 CAI installation failed!")
114
+ return False
115
+
116
+ if __name__ == "__main__":
117
+ success = main()
118
+ sys.exit(0 if success else 1)