| import pandas as pd |
| from typing import Optional |
| from huggingface_hub import hf_hub_download |
|
|
|
|
def _load_parquet_by_dms(repo_id: str, dms_id: str) -> pd.DataFrame:
    """
    Load the single-assay parquet shard stored at by_dms_id/{dms_id}.parquet.

    Args:
        repo_id: Hub dataset repository passed to ``hf_hub_download``.
        dms_id: Assay identifier; it is converted with ``str()`` to build the
            shard filename inside the repository.

    Returns:
        The shard's contents as read by ``pd.read_parquet``.

    Exceptions raised by ``hf_hub_download`` or ``pd.read_parquet`` are not
    caught here and propagate to the caller, so this function never returns
    ``None``.
    """
    # Exactly one shard exists per assay, so no list/loop is needed.
    shard_name = f"by_dms_id/{dms_id!s}.parquet"
    local_path = hf_hub_download(repo_id=repo_id, filename=shard_name, repo_type="dataset")
    return pd.read_parquet(local_path)
|
|
|
|
def load_proteingym_dms(dms_id: str, mode: str, repo_id: str = "GleghornLab/ProteinGym_DMS") -> pd.DataFrame:
    """
    Load one ProteinGym DMS assay and reduce it to the rows/columns for `mode`.

    Modes:
    - "benchmark": rows where is_indel is False; keeps the "mutant" column.
    - "indels": rows where is_indel is True; the "mutant" column is dropped.
    - "singles": non-indel rows with num_mutations == 1; keeps "mutant".
    - "multiples": non-indel rows with num_mutations > 1; keeps "mutant".

    Any other `mode` value applies no filtering or column selection: the
    assay is returned as loaded, with only its index reset.

    Args:
        dms_id: Assay identifier forwarded to `_load_parquet_by_dms`.
        mode: One of the mode names listed above.
        repo_id: Dataset repository forwarded to `_load_parquet_by_dms`.

    Returns:
        The selected rows and columns with a fresh 0..n-1 index.
    """
    shared_columns = ["DMS_id", "mutated_seq", "target_seq", "DMS_score", "DMS_score_bin"]
    frame = _load_parquet_by_dms(repo_id=repo_id, dms_id=dms_id)

    if mode in ("benchmark", "indels", "singles", "multiples"):
        # Only the "indels" mode keeps indel rows; every other mode drops them.
        keep_indels = mode == "indels"
        frame = frame[frame["is_indel"] == keep_indels]

        # Mutation-count filter is applied after the indel filter.
        if mode == "singles":
            frame = frame[frame["num_mutations"] == 1]
        elif mode == "multiples":
            frame = frame[frame["num_mutations"] > 1]

        # Substitution modes additionally expose the "mutant" column.
        selected = shared_columns if keep_indels else shared_columns + ["mutant"]
        frame = frame[selected]

    return frame.reset_index(drop=True)