Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Select the top-Nt, bottom-Nb, and Nr randomly sampled sequences based on a chosen score column. | |
| Outputs: | |
| selected_top.csv | |
| selected_bottom.csv | |
| selected_random.csv | |
| selected_for_af2.txt (list of Accessions) | |
| """ | |
| import pandas as pd | |
| import argparse | |
| import numpy as np | |
def _select_subsets(df, sortcol, n_top, n_bottom, n_random, seed=37):
    """Return the ``(top, bottom, random)`` row subsets of *df*.

    *df[sortcol]* must already be numeric. Rows whose score is NaN are
    excluded from the top/bottom ranking, because ``sort_values`` places
    NaN last and they would otherwise be reported as the "bottom" rows.
    The random subset is drawn from every row of *df*, scored or not.
    """
    # Stable sort so that tied scores keep their input order between runs.
    ranked = df.dropna(subset=[sortcol]).sort_values(
        by=sortcol, ascending=False, kind="mergesort"
    )  # Higher = better

    # head()/tail() already clamp to the number of available rows.
    top = ranked.head(n_top)
    bottom = ranked.tail(n_bottom)

    # sample() raises when n exceeds the population, so clamp explicitly.
    random_rows = df.sample(
        n=min(n_random, len(df)), replace=False, random_state=seed
    )
    return top, bottom, random_rows


def main(argv=None):
    """Select top, bottom and random rows of a score table and write them out.

    Reads the CSV given by ``--input``, coerces ``--sortcol`` to numeric and
    writes four files into ``--outdir`` (created if missing):
    ``selected_top.csv``, ``selected_bottom.csv``, ``selected_random.csv``
    and ``selected_for_af2.txt`` (the de-duplicated ``Accession`` values of
    all three selections, one per line).

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Raises:
        ValueError: If the sort column or the ``Accession`` column is not
            present in the input table.
        SystemExit: Raised by argparse on invalid arguments, including
            negative ``--Nt``/``--Nb``/``--Nr``.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import list.
    from pathlib import Path

    p = argparse.ArgumentParser(
        description="Select top/bottom/random sequences by a score column."
    )
    p.add_argument("--input", required=True, help="merged_preAF2_scores.csv")
    p.add_argument(
        "--sortcol",
        default="scfvtools_score",
        help="Column to sort by (default=scfvtools_score)",
    )
    p.add_argument("--Nt", type=int, required=True, help="Number of top rows")
    p.add_argument("--Nb", type=int, required=True, help="Number of bottom rows")
    p.add_argument("--Nr", type=int, required=True, help="Number of random rows")
    p.add_argument(
        "--seed", type=int, default=37, help="Random seed for --Nr (default=37)"
    )
    p.add_argument("--outdir", required=True)
    args = p.parse_args(argv)

    # A negative count would silently flip head()/tail() semantics.
    for flag in ("Nt", "Nb", "Nr"):
        if getattr(args, flag) < 0:
            p.error(f"--{flag} must be non-negative")

    df = pd.read_csv(args.input)

    # Validate both required columns before writing anything to disk.
    if args.sortcol not in df.columns:
        raise ValueError(
            f"sortcol '{args.sortcol}' not found in columns: {list(df.columns)}"
        )
    if "Accession" not in df.columns:
        raise ValueError(
            f"required column 'Accession' not found in columns: {list(df.columns)}"
        )

    # Non-numeric scores become NaN; they are excluded from ranking below.
    df[args.sortcol] = pd.to_numeric(df[args.sortcol], errors="coerce")
    n_unscored = int(df[args.sortcol].isna().sum())
    if n_unscored:
        print(
            f"[select_top_bottom_random] Ignoring {n_unscored} row(s) without a "
            f"numeric '{args.sortcol}' when ranking top/bottom."
        )

    top, bottom, random_rows = _select_subsets(
        df, args.sortcol, args.Nt, args.Nb, args.Nr, seed=args.seed
    )

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    top_path = outdir / "selected_top.csv"
    bottom_path = outdir / "selected_bottom.csv"
    random_path = outdir / "selected_random.csv"
    list_path = outdir / "selected_for_af2.txt"

    top.to_csv(top_path, index=False)
    bottom.to_csv(bottom_path, index=False)
    random_rows.to_csv(random_path, index=False)

    # Combined list for AF2: the three selections may overlap, so de-duplicate.
    combined = pd.concat([top, bottom, random_rows], ignore_index=True)
    combined["Accession"].drop_duplicates().to_csv(
        list_path, index=False, header=False
    )

    print("[select_top_bottom_random] Selection complete.")
    print(f"Top → {top_path}")
    print(f"Bottom → {bottom_path}")
    print(f"Random → {random_path}")
    print(f"Selected list → {list_path}")


if __name__ == "__main__":
    main()