# PeptideAI / Data / Data Editors / csvCombiner.py
# m0ksh's picture
# Sync from GitHub (preserve manual model files)
# ea61d54 verified
# Merge AMP / non-AMP FASTA files into one labeled CSV for training or app Data/.
import pandas as pd
from Bio import SeqIO
from pathlib import Path
# Input FASTA files. load_fasta tries each path as given first and then
# relative to this script's directory.
amp_fasta = "amps.fasta"  # records from this file are labeled 1
non_amp_fasta = "non_amps.fasta"  # records from this file are labeled 0
# Path the combined, balanced dataset is written to with DataFrame.to_csv.
output_csv = "ampData3.csv"
# One-letter codes accepted in a sequence; anything else invalidates it.
valid_aas = set("ACDEFGHIKLMNPQRSTVWY")


# HELPER: normalise a raw sequence and reject anything outside valid_aas
def clean_seq(seq):
    """Return the normalised form of *seq*, or None when it is unusable.

    The input is stripped of surrounding whitespace and upper-cased. The
    result is rejected (None) when it is empty or contains any character
    that is not a member of ``valid_aas``; otherwise the normalised string
    is returned.
    """
    normalised = seq.strip().upper()
    # Non-empty AND every character is an accepted residue code.
    if normalised and valid_aas.issuperset(normalised):
        return normalised
    return None
# LOAD FASTAS
def load_fasta(filepath, label):
    """Read a FASTA file and return its usable sequences tagged with *label*.

    *filepath* may be a filename or a path. It is tried as given first; if
    that does not exist it is resolved relative to the directory containing
    this script. Each record's sequence is passed through ``clean_seq`` and
    records for which it returns a falsy value are skipped.

    Returns a list of dicts shaped ``{"sequence": seq, "label": label}``.

    Raises FileNotFoundError when the file exists at neither location.
    """
    fasta_path = Path(filepath)
    if not fasta_path.exists():
        # Fall back to a location next to this script.
        fasta_path = Path(__file__).resolve().parent / filepath
        if not fasta_path.exists():
            raise FileNotFoundError(f"Fasta file not found: '{filepath}' (tried '{fasta_path}')")

    cleaned = (
        clean_seq(str(entry.seq))
        for entry in SeqIO.parse(str(fasta_path), "fasta")
    )
    return [{"sequence": seq, "label": label} for seq in cleaned if seq]
# LOAD BOTH CLASSES (label 1 = AMP, label 0 = non-AMP)
amps = load_fasta(amp_fasta, 1)
non_amps = load_fasta(non_amp_fasta, 0)
# load_fasta has already discarded invalid sequences, so these counts are
# post-validation; only de-duplication is still to come.
print(f"Loaded {len(amps)} valid AMPs and {len(non_amps)} valid non-AMPs before de-duplication.")

# REMOVE DUPLICATES
# Explicit columns keep the frames well-formed even when a file produced no
# valid records (a column-less frame would make drop_duplicates raise KeyError).
columns = ["sequence", "label"]
amp_df = pd.DataFrame(amps, columns=columns).drop_duplicates(subset=["sequence"])
non_amp_df = pd.DataFrame(non_amps, columns=columns).drop_duplicates(subset=["sequence"])

# REMOVE CROSS-CLASS CONFLICTS
# A sequence present in both files would be written with both labels, so it
# is dropped from both classes rather than trusting either side.
ambiguous = set(amp_df["sequence"]) & set(non_amp_df["sequence"])
if ambiguous:
    print(f"Dropping {len(ambiguous)} sequences that appear in both classes.")
    amp_df = amp_df[~amp_df["sequence"].isin(ambiguous)]
    non_amp_df = non_amp_df[~non_amp_df["sequence"].isin(ambiguous)]

# BALANCE CLASSES
# Downsample the larger class to the size of the smaller one.
min_count = min(len(amp_df), len(non_amp_df))
if min_count == 0:
    raise ValueError(
        f"Cannot build a balanced dataset: {len(amp_df)} AMP and "
        f"{len(non_amp_df)} non-AMP sequences remain after cleaning."
    )
amp_balanced = amp_df.sample(n=min_count, random_state=42)
non_amp_balanced = non_amp_df.sample(n=min_count, random_state=42)

# COMBINE AND SHUFFLE
# Fixed random_state keeps the row order reproducible between runs.
final_df = (
    pd.concat([amp_balanced, non_amp_balanced])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# SAVE TO CSV
final_df.to_csv(output_csv, index=False)
print(f"Saved balanced dataset with {len(final_df)} total sequences ({min_count} per class).")
print(f"Output file: {output_csv}")