import os
import shutil
import subprocess
import sys

import numpy as np
import pandas as pd
from Bio import SeqIO

from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.data.config import CLUSTER

# NOTE(review): make_fasta, run_mmseqs_clustering, analyze_clustering_result and
# cluster_summary are called in main() but are neither imported nor defined in
# the visible part of this file -- confirm where they are expected to come from.


def main():
    """Cluster the configured input sequences with MMseqs2 and save the results.

    Steps, all logged to ``clustering_log.txt``:

    1. Read the clustering parameters from ``CLUSTER`` and print them.
    2. Load the CSV at ``CLUSTER.INPUT_PATH`` and keep only rows whose
       ``aa_seq`` length is at most ``CLUSTER.MAX_SEQ_LENGTH``.
    3. Write the remaining ``seq_id -> aa_seq`` pairs to
       ``clustering/input.fasta`` via ``make_fasta``.
    4. Call ``run_mmseqs_clustering`` with ``clustering/raw_output`` as the
       output directory.
    5. Combine the results with ``analyze_clustering_result``, write them to
       ``clustering/mmseqs_full_results.csv`` and pass them to
       ``cluster_summary``.
    """
    # Read all the input args
    LOG_PATH = "clustering_log.txt"
    INPUT_PATH = CLUSTER.INPUT_PATH
    MIN_SEQ_ID = CLUSTER.MIN_SEQ_ID
    C = CLUSTER.C
    COV_MODE = CLUSTER.COV_MODE
    PATH_TO_MMSEQS = CLUSTER.PATH_TO_MMSEQS
    MAX_SEQ_LENGTH = CLUSTER.MAX_SEQ_LENGTH

    # Every path is defined exactly once so the producer and consumer of each
    # file cannot drift apart.
    cluster_dir = "clustering"
    output_dir = f"{cluster_dir}/raw_output"
    input_fasta = f"{cluster_dir}/input.fasta"
    mmseqs_tsv = f"{output_dir}/mmseqs_cluster.tsv"
    results_csv = f"{cluster_dir}/mmseqs_full_results.csv"

    with open_logfile(LOG_PATH):
        log_update("Input params from config.py:")
        CLUSTER.print_config(indent='\t')

        # Make a subfolder for clustering results, and direct MMSeqs2 outputs
        # here. exist_ok avoids the check-then-create race of
        # os.path.exists() followed by os.mkdir().
        os.makedirs(cluster_dir, exist_ok=True)

        # Make fasta of input file
        seq_table = pd.read_csv(INPUT_PATH)
        log_update(f"\nPreparing input data...\n\tInitial dataset size: {len(seq_table)} sequences")

        within_cutoff = seq_table['aa_seq'].str.len() <= MAX_SEQ_LENGTH
        seq_table = seq_table.loc[within_cutoff].reset_index(drop=True)
        log_update(f"\tApplied length cutoff of {MAX_SEQ_LENGTH}AAs. New dataset size: {len(seq_table)} sequences")

        # NOTE(review): a repeated seq_id keeps only its last aa_seq here --
        # confirm seq_id is unique in the input CSV.
        seq_by_id = dict(zip(seq_table['seq_id'], seq_table['aa_seq']))
        fasta_path = make_fasta(seq_by_id, input_fasta)
        log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")

        run_mmseqs_clustering(
            fasta_path,
            output_dir,
            min_seq_id=MIN_SEQ_ID,
            c=C,
            cov_mode=COV_MODE,
            path_to_mmseqs=PATH_TO_MMSEQS,
        )

        # Brief read to preview results
        clusters = analyze_clustering_result(input_fasta, mmseqs_tsv)

        # Save clusters
        clusters.to_csv(results_csv, index=False)
        log_update(f"Processed and combined mmseqs output. Wrote comprehensive results to {results_csv}")
        cluster_summary(clusters)


if __name__ == "__main__":
    main()