"""Quick script to pull sequences out of a local BLAST database.

Reads a list of entry IDs, fetches each sequence with ``blastdbcmd`` and
pickles the resulting ``{entry_id: sequence}`` dictionary.
"""

import os
import pickle
import subprocess

import pandas as pd


def _parse_fasta_record(fasta_text, expected_id):
    """Return the sequence of a single FASTA record after checking its ID.

    Parameters:
    - fasta_text (str): Raw FASTA text (one header line, then sequence lines).
    - expected_id (str): ID that must be the first token of the header line.

    Returns:
    - str: The sequence with all line breaks removed.

    Raises:
    - ValueError: If the text has no ``>`` header, the header ID differs from
      ``expected_id``, or no sequence lines follow the header.
    """
    lines = fasta_text.strip().splitlines()
    if not lines or not lines[0].startswith(">"):
        raise ValueError(
            f"Unexpected blastdbcmd output for entry {expected_id}: "
            "missing FASTA header"
        )

    # Split the header line only, so an ID that is not followed by a space
    # (header with no description) is still isolated from the sequence lines.
    record_id = lines[0][1:].split(" ", 1)[0]
    if record_id != expected_id:
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        raise ValueError(
            f"blastdbcmd returned ID {record_id!r} for entry {expected_id!r}"
        )

    sequence = "".join(line.strip() for line in lines[1:])
    if not sequence:
        raise ValueError(f"No sequence returned for entry {expected_id}")
    return sequence


def get_sequences_from_blastdb(
    database_path, entries, blast_dir="ncbi-blast-2.16.0+/swissprot"
):
    """
    Retrieves sequences for a list of entries from a BLAST database.

    Each entry is fetched with ``blastdbcmd -db <database_path> -entry <id>``.
    The command is run with ``blast_dir`` as its working directory; the
    working directory of the calling process is left untouched.

    Parameters:
    - database_path (str): Path to the BLAST database (without file extension),
      resolved by ``blastdbcmd`` relative to ``blast_dir``.
    - entries (list): List of entry IDs to query.
    - blast_dir (str): Directory ``blastdbcmd`` is run from.

    Returns:
    - dict: A dictionary with entry IDs as keys and sequences as values.
      An entry maps to ``None`` when ``blastdbcmd`` exits with an error.

    Raises:
    - ValueError: If ``blastdbcmd`` succeeds but its output is not a FASTA
      record whose header ID equals the requested entry.
    """
    sequences = {}

    for entry in entries:
        try:
            # Run blastdbcmd to retrieve the sequence for this entry. Passing
            # cwd= (rather than os.chdir) keeps the caller's relative paths
            # valid and lets the function be called more than once.
            completed = subprocess.run(
                ["blastdbcmd", "-db", database_path, "-entry", entry],
                capture_output=True,
                text=True,
                check=True,
                cwd=blast_dir,
            )
        except subprocess.CalledProcessError as e:
            print(f"Error retrieving entry {entry}: {e}")
            sequences[entry] = None  # Store None if there's an error for this entry
            continue

        # Make sure the returned ID is the one we asked for, then store the
        # sequence as a single line.
        sequences[entry] = _parse_fasta_record(completed.stdout, entry)

    return sequences


def main():
    """Fetch SwissProt sequences for the top-alignment genes and pickle them.

    Reads entry IDs from the first column of
    ``blast_outputs/ht_uniprot_query.txt``, queries the ``swissprot`` BLAST
    database for each one, writes the result to
    ``blast_outputs/best_htg_alignments_swissprot_seqs.pkl`` and reads the
    file back as a sanity check. Both paths are relative to the directory the
    script is launched from.
    """
    # Query SwissProt database for the sequences of all the head and tail
    # genes that produced the top alignments
    htgs = pd.read_csv("blast_outputs/ht_uniprot_query.txt", header=None)
    entries = list(htgs[0])

    database_path = "swissprot"  # Path to the BLAST database without extension
    sequences_dict = get_sequences_from_blastdb(database_path, entries)

    output_path = "blast_outputs/best_htg_alignments_swissprot_seqs.pkl"
    with open(output_path, "wb") as f:
        pickle.dump(sequences_dict, f)

    # Now look at the file you just wrote
    with open(output_path, "rb") as f:
        reloaded = pickle.load(f)
    missing = sum(seq is None for seq in reloaded.values())
    print(
        f"Wrote {len(reloaded)} entries to {output_path} "
        f"({missing} without a sequence)"
    )


if __name__ == '__main__':
    main()