"""Filter FASTA to canonical amino acids and length bounds; emit FASTA + CSV.

Reads ``input_fasta``, keeps only sequences made exclusively of the 20
canonical amino-acid letters whose length lies in ``[min_len, max_len]``,
drops duplicate sequences, and writes the survivors to ``output_fasta``
(renumbered ``AMP_1``, ``AMP_2``, ...) and to a one-column CSV
(``output_csv``).
"""
from typing import Iterable, List, Optional

# CONFIG
input_fasta = "amps.fasta"
output_fasta = "amps_clean.fasta"
output_csv = "amps_clean.csv"
min_len = 5
max_len = 100
valid_aas = set("ACDEFGHIKLMNPQRSTVWY")


def clean_seq(seq: str) -> Optional[str]:
    """Normalize *seq* and return it, or ``None`` if it must be discarded.

    The sequence is stripped of surrounding whitespace and upper-cased.  It is
    rejected when its length falls outside ``min_len``..``max_len`` (both
    inclusive) or when it contains any character not in ``valid_aas``.
    """
    seq = seq.strip().upper()
    if not (min_len <= len(seq) <= max_len):
        return None
    if not valid_aas.issuperset(seq):
        return None
    return seq


def deduplicate(sequences: Iterable[str]) -> List[str]:
    """Return *sequences* without duplicates, keeping first-occurrence order.

    A plain ``set`` would also remove duplicates, but its iteration order for
    strings changes between interpreter runs, which would make the generated
    ``AMP_<i>`` identifiers and the CSV row order non-reproducible.
    """
    return list(dict.fromkeys(sequences))


def _read_clean_sequences(path: str) -> List[str]:
    """Parse the FASTA file at *path* and return every sequence that passes
    ``clean_seq`` (duplicates included, in file order)."""
    # Imported here so that the pure helpers above stay importable on a
    # machine without Biopython installed.
    from Bio import SeqIO

    cleaned = (clean_seq(str(record.seq)) for record in SeqIO.parse(path, "fasta"))
    # Compare against None explicitly: None is the "rejected" marker.
    return [seq for seq in cleaned if seq is not None]


def _write_fasta(sequences: List[str], path: str) -> None:
    """Write *sequences* to *path* as FASTA with headers ``AMP_1``, ``AMP_2``, ..."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(
            f">AMP_{i}\n{seq}\n" for i, seq in enumerate(sequences, start=1)
        )


def _write_csv(sequences: List[str], path: str) -> None:
    """Write *sequences* to *path* as a CSV with a single ``sequence`` column."""
    # Imported here for the same reason as Biopython above.
    import pandas as pd

    pd.DataFrame({"sequence": sequences}).to_csv(path, index=False)


def main() -> None:
    """Run the read -> clean -> deduplicate -> save pipeline using the CONFIG
    values defined at the top of this module."""
    # READ AND CLEAN, then DEDUPLICATE
    clean_records = deduplicate(_read_clean_sequences(input_fasta))

    # SAVE CLEAN FASTA
    _write_fasta(clean_records, output_fasta)

    # SAVE CSV
    _write_csv(clean_records, output_csv)

    print(f"Cleaned {len(clean_records)} sequences saved to '{output_fasta}' and '{output_csv}'.")


if __name__ == "__main__":
    main()