LoocasGoose committed on
Commit
dcb463f
·
1 Parent(s): c087d3a

created dockerfile and dockerignore

Browse files
.dockerignore ADDED
@@ -0,0 +1 @@
 
 
1
+ # Nothing here yet
data/create_pfam_data.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d66a3b8c2ba7fe8750182a735b30f7411ddf3868d879b0360999bc5b7d435bb2
3
  size 60030
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3a501ca1d972d4e54cc335e10da2bbf637f5be700553a06d35754ae76c23f85
3
  size 60030
dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 1. Base image: Ubuntu 22.04
2
+ FROM ubuntu:22.04
3
+
4
+ # 2. Prevent interactive prompts during apt installs
5
+ ENV DEBIAN_FRONTEND=noninteractive
6
+
7
+ # 3. System dependencies
8
+ RUN apt-get update && apt-get install -y \
9
+ wget bzip2 ca-certificates git \
10
+ libglib2.0-0 libxext6 libsm6 libxrender1 \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # 4. Install Miniconda
14
+ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
15
+ && bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda \
16
+ && rm Miniconda3-latest-Linux-x86_64.sh
17
+
18
+ ENV PATH=/opt/conda/bin:$PATH
19
+
20
+ # 5. Create a working dir and copy only environment spec
21
+ WORKDIR /workspace
22
+ COPY environment.yml /workspace/
23
+
24
+ # Pre-accept Anaconda channel Terms of Service
25
+ RUN conda tos accept \
26
+ --override-channels \
27
+ --channel https://repo.anaconda.com/pkgs/main && \
28
+ conda tos accept \
29
+ --override-channels \
30
+ --channel https://repo.anaconda.com/pkgs/r
31
+
32
+ # 6. Create the env and clean up
33
+ RUN conda env create -f environment.yml && \
34
+ conda clean -afy
35
+
36
+ # 7. Copy the rest of your code
37
+ COPY . /workspace/
38
+
39
+ # 8. Run later shell-form instructions (RUN, shell-form CMD/ENTRYPOINT) inside the conda env
40
+ SHELL ["conda", "run", "-n", "protein-conformal", "/bin/bash", "-c"]
41
+
42
+ # 9. Expose Gradio port
43
+ EXPOSE 7860
44
+
45
+ # 10. Default command: start the Gradio app via `conda run` (exec-form CMD bypasses SHELL, so the env must be named here)
46
+ CMD ["conda", "run", "--no-capture-output", "-n", "protein-conformal", "python", "-m", "protein_conformal.gradio_app"]
environment.yml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: protein-conformal
2
+ channels:
3
+ - pytorch
4
+ - conda-forge
5
+ - huggingface
6
+ - defaults
7
+
8
+ dependencies:
9
+ # Python version
10
+ - python=3.10
11
+
12
+ # Core scientific computing
13
+ - numpy>=1.24.0
14
+ - pandas>=2.0.0
15
+ - scipy>=1.10.0
16
+ - scikit-learn>=1.0.0
17
+
18
+ # Machine Learning & Deep Learning
19
+ - pytorch>=2.0.0
20
+ - cpuonly # CPU-only PyTorch for Windows compatibility
21
+ - transformers>=4.30.0
22
+
23
+ # FAISS for similarity search
24
+ - faiss-cpu>=1.7.4 # Use faiss-gpu if you have GPU support
25
+
26
+ # Bioinformatics
27
+ - biopython>=1.81
28
+
29
+ # Web frameworks and APIs
30
+ - gradio>=3.50.0
31
+ - fastapi>=0.90.0
32
+ - uvicorn>=0.18.0
33
+ - jinja2>=3.1.0
34
+ - pydantic>=1.10.0
35
+ - python-multipart>=0.0.5
36
+
37
+ # Visualization and plotting
38
+ - matplotlib>=3.5.0
39
+ - seaborn>=0.12.0
40
+ - plotly>=5.9.0
41
+ - networkx>=2.8.0
42
+
43
+ # Development and debugging tools
44
+ - tqdm
45
+ - ipdb
46
+ - jupyter
47
+ - notebook
48
+ - jupyterlab
49
+
50
+ # Utilities
51
+ - requests>=2.27.1
52
+
53
+ # Pip dependencies (packages not available via conda)
54
+ - pip
55
+ - pip:
56
+ - py3Dmol>=1.8.0 # 3D molecular visualization for Gradio
57
+
58
+ # Installation instructions:
59
+ # conda env update -f environment.yml --prune # Update existing 'protein-conformal' environment
60
+ # conda activate protein-conformal
61
+ #
62
+ # Alternative: Create new environment
63
+ # conda env create -f environment.yml
64
+ # conda activate protein-conformal
65
+ #
66
+ # For GPU support on Linux/properly configured CUDA systems:
67
+ # 1. Replace 'cpuonly' with 'pytorch-cuda=11.8'
68
+ # 2. Change 'faiss-cpu' to 'faiss-gpu'
69
+ # 3. Add nvidia channel: conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
pfam/analyze_protein_vec_results.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e612e3e900789cbb6789c7ba6e56e71f446e5a75d9713785631373929acc4294
3
- size 529367
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8d2f7b45d20e8454f32382094c420d90d143b272f0593e6f5e5cfb2e5a0a4f4
3
+ size 1033684
pfam/genes_unknown.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3eeb0f63505548713fd7d511a1c96282bc98e37f524d807c85d9f6a6dda069e
3
- size 192542
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:651874d343ab2bc89588a928ec485ecff2ef898a1b4cb8444064d30aaace8e58
3
+ size 225341
scripts/precompute_fnr_thresholds.sh CHANGED
@@ -13,53 +13,6 @@ OUTPUT_DIR="../results"
13
  TEMP_DIR="./temp_fnr_results"
14
  CSV_OUTPUT="$OUTPUT_DIR/fnr_thresholds.csv"
15
 
16
- # Parse command line arguments
17
- while [[ $# -gt 0 ]]; do
18
- case $1 in
19
- --min-alpha)
20
- MIN_ALPHA="$2"
21
- shift 2
22
- ;;
23
- --max-alpha)
24
- MAX_ALPHA="$2"
25
- shift 2
26
- ;;
27
- --num-values)
28
- NUM_ALPHA_VALUES="$2"
29
- shift 2
30
- ;;
31
- --num-trials)
32
- NUM_TRIALS="$2"
33
- shift 2
34
- ;;
35
- --n-calib)
36
- N_CALIB="$2"
37
- shift 2
38
- ;;
39
- --output)
40
- CSV_OUTPUT="$2"
41
- shift 2
42
- ;;
43
- -h|--help)
44
- echo "Usage: $0 [OPTIONS]"
45
- echo "Options:"
46
- echo " --min-alpha FLOAT Minimum alpha value (default: $MIN_ALPHA)"
47
- echo " --max-alpha FLOAT Maximum alpha value (default: $MAX_ALPHA)"
48
- echo " --num-values INT Number of alpha values to test (default: $NUM_ALPHA_VALUES)"
49
- echo " --num-trials INT Number of trials per alpha (default: $NUM_TRIALS)"
50
- echo " --n-calib INT Calibration set size (default: $N_CALIB)"
51
- echo " --output PATH Output CSV file (default: $CSV_OUTPUT)"
52
- echo " -h, --help Show this help message"
53
- exit 0
54
- ;;
55
- *)
56
- echo "Unknown option: $1"
57
- exit 1
58
- ;;
59
- esac
60
- done
61
-
62
- # Create necessary directories
63
  mkdir -p "$OUTPUT_DIR"
64
  mkdir -p "$TEMP_DIR"
65
 
@@ -75,7 +28,7 @@ echo "Output file: $CSV_OUTPUT"
75
  echo ""
76
 
77
  # Generate alpha values using Python
78
- ALPHA_VALUES=$(python3 -c "
79
  import numpy as np
80
  alphas = np.linspace($MIN_ALPHA, $MAX_ALPHA, $NUM_ALPHA_VALUES)
81
  print(' '.join([str(a) for a in alphas]))
@@ -92,7 +45,7 @@ for alpha in $ALPHA_VALUES; do
92
 
93
  # Run FNR generation for exact matches
94
  echo " Running exact matches..."
95
- python3 ../pfam/generate_fnr.py \
96
  --alpha "$alpha" \
97
  --partial false \
98
  --num_trials "$NUM_TRIALS" \
@@ -102,7 +55,7 @@ for alpha in $ALPHA_VALUES; do
102
 
103
  # Run FNR generation for partial matches
104
  echo " Running partial matches..."
105
- python3 ../pfam/generate_fnr.py \
106
  --alpha "$alpha" \
107
  --partial true \
108
  --num_trials "$NUM_TRIALS" \
@@ -111,7 +64,7 @@ for alpha in $ALPHA_VALUES; do
111
  --add_date false
112
 
113
  # Extract results and append to CSV using Python
114
- python3 -c "
115
  import numpy as np
116
  import sys
117
 
 
13
  TEMP_DIR="./temp_fnr_results"
14
  CSV_OUTPUT="$OUTPUT_DIR/fnr_thresholds.csv"
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  mkdir -p "$OUTPUT_DIR"
17
  mkdir -p "$TEMP_DIR"
18
 
 
28
  echo ""
29
 
30
  # Generate alpha values using Python
31
+ ALPHA_VALUES=$(python -c "
32
  import numpy as np
33
  alphas = np.linspace($MIN_ALPHA, $MAX_ALPHA, $NUM_ALPHA_VALUES)
34
  print(' '.join([str(a) for a in alphas]))
 
45
 
46
  # Run FNR generation for exact matches
47
  echo " Running exact matches..."
48
+ python ../pfam/generate_fnr.py \
49
  --alpha "$alpha" \
50
  --partial false \
51
  --num_trials "$NUM_TRIALS" \
 
55
 
56
  # Run FNR generation for partial matches
57
  echo " Running partial matches..."
58
+ python ../pfam/generate_fnr.py \
59
  --alpha "$alpha" \
60
  --partial true \
61
  --num_trials "$NUM_TRIALS" \
 
64
  --add_date false
65
 
66
  # Extract results and append to CSV using Python
67
+ python -c "
68
  import numpy as np
69
  import sys
70