Spaces:
Sleeping
Sleeping
Commit ·
dcb463f
1
Parent(s): c087d3a
created dockerfile and dockerignore
Browse files- .dockerignore +1 -0
- data/create_pfam_data.ipynb +1 -1
- dockerfile +46 -0
- environment.yml +69 -0
- pfam/analyze_protein_vec_results.ipynb +2 -2
- pfam/genes_unknown.ipynb +2 -2
- scripts/precompute_fnr_thresholds.sh +4 -51
.dockerignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Nothing here yet
|
data/create_pfam_data.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 60030
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3a501ca1d972d4e54cc335e10da2bbf637f5be700553a06d35754ae76c23f85
|
| 3 |
size 60030
|
dockerfile
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 1. Base image: Ubuntu 22.04
|
| 2 |
+
FROM ubuntu:22.04
|
| 3 |
+
|
| 4 |
+
# 2. Prevent interactive prompts during apt installs
|
| 5 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 6 |
+
|
| 7 |
+
# 3. System dependencies
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
wget bzip2 ca-certificates git \
|
| 10 |
+
libglib2.0-0 libxext6 libsm6 libxrender1 \
|
| 11 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
# 4. Install Miniconda
|
| 14 |
+
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
|
| 15 |
+
&& bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda \
|
| 16 |
+
&& rm Miniconda3-latest-Linux-x86_64.sh
|
| 17 |
+
|
| 18 |
+
ENV PATH=/opt/conda/bin:$PATH
|
| 19 |
+
|
| 20 |
+
# 5. Create a working dir and copy only environment spec
|
| 21 |
+
WORKDIR /workspace
|
| 22 |
+
COPY environment.yml /workspace/
|
| 23 |
+
|
| 24 |
+
# Pre-accept Anaconda channel Terms of Service
|
| 25 |
+
RUN conda tos accept \
|
| 26 |
+
--override-channels \
|
| 27 |
+
--channel https://repo.anaconda.com/pkgs/main && \
|
| 28 |
+
conda tos accept \
|
| 29 |
+
--override-channels \
|
| 30 |
+
--channel https://repo.anaconda.com/pkgs/r
|
| 31 |
+
|
| 32 |
+
# 6. Create the env and clean up
|
| 33 |
+
RUN conda env create -f environment.yml && \
|
| 34 |
+
conda clean -afy
|
| 35 |
+
|
| 36 |
+
# 7. Copy the rest of your code
|
| 37 |
+
COPY . /workspace/
|
| 38 |
+
|
| 39 |
+
# 8. Use the conda env for shell-form instructions (NOTE: SHELL does not affect the exec-form CMD below)
|
| 40 |
+
SHELL ["conda", "run", "-n", "protein-conformal", "/bin/bash", "-c"]
|
| 41 |
+
|
| 42 |
+
# 9. Expose Gradio port
|
| 43 |
+
EXPOSE 7860
|
| 44 |
+
|
| 45 |
+
# 10. Default command: start your Gradio app
|
| 46 |
+
CMD ["python", "-m", "protein_conformal.gradio_app"]
|
environment.yml
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: protein-conformal
|
| 2 |
+
channels:
|
| 3 |
+
- pytorch
|
| 4 |
+
- conda-forge
|
| 5 |
+
- huggingface
|
| 6 |
+
- defaults
|
| 7 |
+
|
| 8 |
+
dependencies:
|
| 9 |
+
# Python version
|
| 10 |
+
- python=3.10
|
| 11 |
+
|
| 12 |
+
# Core scientific computing
|
| 13 |
+
- numpy>=1.24.0
|
| 14 |
+
- pandas>=2.0.0
|
| 15 |
+
- scipy>=1.10.0
|
| 16 |
+
- scikit-learn>=1.0.0
|
| 17 |
+
|
| 18 |
+
# Machine Learning & Deep Learning
|
| 19 |
+
- pytorch>=2.0.0
|
| 20 |
+
- cpuonly # CPU-only PyTorch for Windows compatibility
|
| 21 |
+
- transformers>=4.30.0
|
| 22 |
+
|
| 23 |
+
# FAISS for similarity search
|
| 24 |
+
- faiss-cpu>=1.7.4 # Use faiss-gpu if you have GPU support
|
| 25 |
+
|
| 26 |
+
# Bioinformatics
|
| 27 |
+
- biopython>=1.81
|
| 28 |
+
|
| 29 |
+
# Web frameworks and APIs
|
| 30 |
+
- gradio>=3.50.0
|
| 31 |
+
- fastapi>=0.90.0
|
| 32 |
+
- uvicorn>=0.18.0
|
| 33 |
+
- jinja2>=3.1.0
|
| 34 |
+
- pydantic>=1.10.0
|
| 35 |
+
- python-multipart>=0.0.5
|
| 36 |
+
|
| 37 |
+
# Visualization and plotting
|
| 38 |
+
- matplotlib>=3.5.0
|
| 39 |
+
- seaborn>=0.12.0
|
| 40 |
+
- plotly>=5.9.0
|
| 41 |
+
- networkx>=2.8.0
|
| 42 |
+
|
| 43 |
+
# Development and debugging tools
|
| 44 |
+
- tqdm
|
| 45 |
+
- ipdb
|
| 46 |
+
- jupyter
|
| 47 |
+
- notebook
|
| 48 |
+
- jupyterlab
|
| 49 |
+
|
| 50 |
+
# Utilities
|
| 51 |
+
- requests>=2.27.1
|
| 52 |
+
|
| 53 |
+
# Pip dependencies (packages not available via conda)
|
| 54 |
+
- pip
|
| 55 |
+
- pip:
|
| 56 |
+
- py3Dmol>=1.8.0 # 3D molecular visualization for Gradio
|
| 57 |
+
|
| 58 |
+
# Installation instructions:
|
| 59 |
+
# conda env update -n cpr -f environment.yml --prune # Update existing 'cpr' environment
|
| 60 |
+
# conda activate cpr
|
| 61 |
+
#
|
| 62 |
+
# Alternative: Create new environment
|
| 63 |
+
# conda env create -f environment.yml
|
| 64 |
+
# conda activate protein-conformal
|
| 65 |
+
#
|
| 66 |
+
# For GPU support on Linux/properly configured CUDA systems:
|
| 67 |
+
# 1. Replace 'cpuonly' with 'pytorch-cuda=11.8'
|
| 68 |
+
# 2. Change 'faiss-cpu' to 'faiss-gpu'
|
| 69 |
+
# 3. Add nvidia channel: conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
|
pfam/analyze_protein_vec_results.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8d2f7b45d20e8454f32382094c420d90d143b272f0593e6f5e5cfb2e5a0a4f4
|
| 3 |
+
size 1033684
|
pfam/genes_unknown.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:651874d343ab2bc89588a928ec485ecff2ef898a1b4cb8444064d30aaace8e58
|
| 3 |
+
size 225341
|
scripts/precompute_fnr_thresholds.sh
CHANGED
|
@@ -13,53 +13,6 @@ OUTPUT_DIR="../results"
|
|
| 13 |
TEMP_DIR="./temp_fnr_results"
|
| 14 |
CSV_OUTPUT="$OUTPUT_DIR/fnr_thresholds.csv"
|
| 15 |
|
| 16 |
-
# Parse command line arguments
|
| 17 |
-
while [[ $# -gt 0 ]]; do
|
| 18 |
-
case $1 in
|
| 19 |
-
--min-alpha)
|
| 20 |
-
MIN_ALPHA="$2"
|
| 21 |
-
shift 2
|
| 22 |
-
;;
|
| 23 |
-
--max-alpha)
|
| 24 |
-
MAX_ALPHA="$2"
|
| 25 |
-
shift 2
|
| 26 |
-
;;
|
| 27 |
-
--num-values)
|
| 28 |
-
NUM_ALPHA_VALUES="$2"
|
| 29 |
-
shift 2
|
| 30 |
-
;;
|
| 31 |
-
--num-trials)
|
| 32 |
-
NUM_TRIALS="$2"
|
| 33 |
-
shift 2
|
| 34 |
-
;;
|
| 35 |
-
--n-calib)
|
| 36 |
-
N_CALIB="$2"
|
| 37 |
-
shift 2
|
| 38 |
-
;;
|
| 39 |
-
--output)
|
| 40 |
-
CSV_OUTPUT="$2"
|
| 41 |
-
shift 2
|
| 42 |
-
;;
|
| 43 |
-
-h|--help)
|
| 44 |
-
echo "Usage: $0 [OPTIONS]"
|
| 45 |
-
echo "Options:"
|
| 46 |
-
echo " --min-alpha FLOAT Minimum alpha value (default: $MIN_ALPHA)"
|
| 47 |
-
echo " --max-alpha FLOAT Maximum alpha value (default: $MAX_ALPHA)"
|
| 48 |
-
echo " --num-values INT Number of alpha values to test (default: $NUM_ALPHA_VALUES)"
|
| 49 |
-
echo " --num-trials INT Number of trials per alpha (default: $NUM_TRIALS)"
|
| 50 |
-
echo " --n-calib INT Calibration set size (default: $N_CALIB)"
|
| 51 |
-
echo " --output PATH Output CSV file (default: $CSV_OUTPUT)"
|
| 52 |
-
echo " -h, --help Show this help message"
|
| 53 |
-
exit 0
|
| 54 |
-
;;
|
| 55 |
-
*)
|
| 56 |
-
echo "Unknown option: $1"
|
| 57 |
-
exit 1
|
| 58 |
-
;;
|
| 59 |
-
esac
|
| 60 |
-
done
|
| 61 |
-
|
| 62 |
-
# Create necessary directories
|
| 63 |
mkdir -p "$OUTPUT_DIR"
|
| 64 |
mkdir -p "$TEMP_DIR"
|
| 65 |
|
|
@@ -75,7 +28,7 @@ echo "Output file: $CSV_OUTPUT"
|
|
| 75 |
echo ""
|
| 76 |
|
| 77 |
# Generate alpha values using Python
|
| 78 |
-
ALPHA_VALUES=$(
|
| 79 |
import numpy as np
|
| 80 |
alphas = np.linspace($MIN_ALPHA, $MAX_ALPHA, $NUM_ALPHA_VALUES)
|
| 81 |
print(' '.join([str(a) for a in alphas]))
|
|
@@ -92,7 +45,7 @@ for alpha in $ALPHA_VALUES; do
|
|
| 92 |
|
| 93 |
# Run FNR generation for exact matches
|
| 94 |
echo " Running exact matches..."
|
| 95 |
-
|
| 96 |
--alpha "$alpha" \
|
| 97 |
--partial false \
|
| 98 |
--num_trials "$NUM_TRIALS" \
|
|
@@ -102,7 +55,7 @@ for alpha in $ALPHA_VALUES; do
|
|
| 102 |
|
| 103 |
# Run FNR generation for partial matches
|
| 104 |
echo " Running partial matches..."
|
| 105 |
-
|
| 106 |
--alpha "$alpha" \
|
| 107 |
--partial true \
|
| 108 |
--num_trials "$NUM_TRIALS" \
|
|
@@ -111,7 +64,7 @@ for alpha in $ALPHA_VALUES; do
|
|
| 111 |
--add_date false
|
| 112 |
|
| 113 |
# Extract results and append to CSV using Python
|
| 114 |
-
|
| 115 |
import numpy as np
|
| 116 |
import sys
|
| 117 |
|
|
|
|
| 13 |
TEMP_DIR="./temp_fnr_results"
|
| 14 |
CSV_OUTPUT="$OUTPUT_DIR/fnr_thresholds.csv"
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
mkdir -p "$OUTPUT_DIR"
|
| 17 |
mkdir -p "$TEMP_DIR"
|
| 18 |
|
|
|
|
| 28 |
echo ""
|
| 29 |
|
| 30 |
# Generate alpha values using Python
|
| 31 |
+
ALPHA_VALUES=$(python -c "
|
| 32 |
import numpy as np
|
| 33 |
alphas = np.linspace($MIN_ALPHA, $MAX_ALPHA, $NUM_ALPHA_VALUES)
|
| 34 |
print(' '.join([str(a) for a in alphas]))
|
|
|
|
| 45 |
|
| 46 |
# Run FNR generation for exact matches
|
| 47 |
echo " Running exact matches..."
|
| 48 |
+
python ../pfam/generate_fnr.py \
|
| 49 |
--alpha "$alpha" \
|
| 50 |
--partial false \
|
| 51 |
--num_trials "$NUM_TRIALS" \
|
|
|
|
| 55 |
|
| 56 |
# Run FNR generation for partial matches
|
| 57 |
echo " Running partial matches..."
|
| 58 |
+
python ../pfam/generate_fnr.py \
|
| 59 |
--alpha "$alpha" \
|
| 60 |
--partial true \
|
| 61 |
--num_trials "$NUM_TRIALS" \
|
|
|
|
| 64 |
--add_date false
|
| 65 |
|
| 66 |
# Extract results and append to CSV using Python
|
| 67 |
+
python -c "
|
| 68 |
import numpy as np
|
| 69 |
import sys
|
| 70 |
|