Spaces:

LoocasGoose
/

cpr

Sleeping

App Files Files Community

cpr / scripts /search_and_assign_probs.sh

LoocasGoose

new clean branch

3a8e9de 5 months ago

raw

history blame contribute delete

5.09 kB

	#!/bin/bash

	# Define the lookup directory
	LOOKUP_DIR="/home/yangk/proteins/conformal_protein_search/data"

	# Navigate to the conformal-protein-retrieval directory
	cd /home/yangk/proteins/conformal_protein_search/conformal-protein-retrieval \|\| exit

	# Specify the directory containing FASTA and embedding files
	INPUT_DIR="${1:-.}"

	# Create an output directory for the results
	OUTPUT_DIR="$INPUT_DIR/output"
	mkdir -p "$OUTPUT_DIR"

	# Precomputed probabilities file for get_probs.py
	PRECOMPUTED_PROBS="$LOOKUP_DIR/pfam_new_proteins_SVA_probs_1000_bins_200_calibration_pts.csv"

	# Loop through all *_prot.fasta files
	for fasta_file in "$INPUT_DIR"/*_prot.fasta; do
	if [[ -f "$fasta_file" ]]; then
	# Extract base name
	base_name=$(basename "$fasta_file" "_prot.fasta")

	# Corresponding embedding file
	emb_file="$INPUT_DIR/emb/${base_name}_emb.npy"

	# Ensure embedding file exists
	if [[ -f "$emb_file" ]]; then
	# Output CSV file for search.py results
	search_output_csv="$OUTPUT_DIR/partial_pfam_${base_name}_hits.csv"

	# Run the search script, with pfam partial FDR control as filtering threshold
	echo "Running search for: $fasta_file & $emb_file -> $search_output_csv"
	python scripts/search.py --fdr --fdr_lambda 0.9999642502418673 \
	--output "$search_output_csv" \
	--query_embedding "$emb_file" \
	--query_fasta "$fasta_file" \
	--lookup_embedding "$LOOKUP_DIR/lookup_embeddings.npy" \
	--lookup_fasta "$LOOKUP_DIR/lookup_embeddings_meta_data.fasta"

	# Ensure search output file exists before proceeding
	if [[ -f "$search_output_csv" ]]; then
	# Output CSV file for get_probs.py results
	probs_output_csv="$OUTPUT_DIR/partial_pfam_${base_name}_hits_with_probs.csv"

	# Run get_probs.py
	echo "Running get_probs for: $search_output_csv -> $probs_output_csv"
	python scripts/get_probs.py --precomputed --precomputed_path "$PRECOMPUTED_PROBS" \
	--input "$search_output_csv" --output "$probs_output_csv" --partial
	else
	echo "Warning: Search output file $search_output_csv not found. Skipping probability computation."
	fi
	else
	echo "Warning: No matching embedding found for $fasta_file. Skipping..."
	fi
	fi
	done

	echo "Pipeline completed. Results saved in $OUTPUT_DIR."


	#!/bin/bash

	# Define the lookup directory
	LOOKUP_DIR="/home/yangk/proteins/conformal_protein_search/data"

	# Navigate to the conformal-protein-retrieval directory
	cd /home/yangk/proteins/conformal_protein_search/conformal-protein-retrieval \|\| exit

	# Specify the directory containing FASTA and embedding files
	INPUT_DIR="${1:-.}"

	# Create an output directory for the results
	OUTPUT_DIR="$INPUT_DIR/output"
	mkdir -p "$OUTPUT_DIR"

	# Precomputed probabilities file for get_probs.py
	PRECOMPUTED_PROBS="$LOOKUP_DIR/pfam_new_proteins_SVA_probs_1000_bins_200_calibration_pts.csv"

	# Loop through all *_prot.fasta files
	for fasta_file in "$INPUT_DIR"/*_prot.fasta; do
	if [[ -f "$fasta_file" ]]; then
	# Extract base name
	base_name=$(basename "$fasta_file" "_prot.fasta")

	# Corresponding embedding file
	emb_file="$INPUT_DIR/emb/${base_name}_emb.npy"

	# Ensure embedding file exists
	if [[ -f "$emb_file" ]]; then
	# Output CSV file for search.py results
	search_output_csv="$OUTPUT_DIR/partial_pfam_${base_name}_hits.csv"

	# Run the search script, with pfam partial FDR control as filtering threshold
	echo "Running search for: $fasta_file & $emb_file -> $search_output_csv"
	python scripts/search.py --fdr --fdr_lambda 0.9999642502418673 \
	--output "$search_output_csv" \
	--query_embedding "$emb_file" \
	--query_fasta "$fasta_file" \
	--lookup_embedding "$LOOKUP_DIR/lookup_embeddings.npy" \
	--lookup_fasta "$LOOKUP_DIR/lookup_embeddings_meta_data.fasta"

	# Ensure search output file exists before proceeding
	if [[ -f "$search_output_csv" ]]; then
	# Output CSV file for get_probs.py results
	probs_output_csv="$OUTPUT_DIR/partial_pfam_${base_name}_hits_with_probs.csv"

	# Run get_probs.py
	echo "Running get_probs for: $search_output_csv -> $probs_output_csv"
	python scripts/get_probs.py --precomputed --precomputed_path "$PRECOMPUTED_PROBS" \
	--input "$search_output_csv" --output "$probs_output_csv" --partial
	else
	echo "Warning: Search output file $search_output_csv not found. Skipping probability computation."
	fi
	else
	echo "Warning: No matching embedding found for $fasta_file. Skipping..."
	fi
	fi
	done

	echo "Pipeline completed. Results saved in $OUTPUT_DIR."