import glob
import hashlib
import os

from baidubce.auth.bce_credentials import BceCredentials
from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.services.bos.bos_client import BosClient
def seq_encoder(sequence, method='md5'):
    """Return the hex digest of ``sequence`` under the named hash algorithm.

    Args:
        sequence: Text to hash; it is encoded as UTF-8 before hashing.
        method: Name of a hash algorithm known to ``hashlib`` (for example
            ``'md5'``, ``'sha1'`` or ``'sha256'``).

    Returns:
        The digest as a string of hexadecimal digits.

    Raises:
        ValueError: If ``method`` does not name a hash algorithm that
            ``hashlib`` can construct.
    """
    # Resolve the algorithm by name through hashlib.new() instead of
    # eval()-ing an f-string: ``method`` is treated purely as data, so a
    # crafted value can no longer be executed as Python code.
    payload = sequence.encode(encoding='utf-8')
    return hashlib.new(method, payload).hexdigest()
# Module-level BOS client configured for the bj.bcebos.com endpoint.
#
# NOTE(security): the key pair used to be available only as plain-text
# literals in this file. The BCE_ACCESS_KEY_ID / BCE_SECRET_ACCESS_KEY
# environment variables now take precedence; the literals remain solely as a
# backward-compatible fallback and should be rotated and removed.
config = BceClientConfiguration(
    credentials=BceCredentials(
        os.environ.get(
            'BCE_ACCESS_KEY_ID',
            '35420270cb5c46118d6729b692669e2b',
        ),
        os.environ.get(
            'BCE_SECRET_ACCESS_KEY',
            '35474e577b514954b72a128a53304cab',
        ),
    ),
    endpoint='https://bj.bcebos.com',
)
bos_client = BosClient(config)
| # response = bos_client.list_buckets() | |
| # for bucket in response.buckets: | |
| # print(bucket.name) | |
if __name__ == "__main__":
    import pandas as pd

    # Collect the benchmark CSV files that sit one directory below the
    # datasets root, ordered by path.
    all_csvs = sorted(
        glob.glob(
            "/nfs_beijing/kubeflow-user/zhangyang_2024/workspace/protein_benchmark/datasets/*/*.csv",
            recursive=True,
        )
    )

    # Accumulate the "aa_seq" and "pdb_path" columns across all usable files.
    seqs = []
    pdb_paths = []
    for csv_file in all_csvs:
        table = pd.read_csv(csv_file)
        has_required_columns = "pdb_path" in table and "aa_seq" in table
        if has_required_columns:
            seqs.extend(table["aa_seq"].tolist())
            pdb_paths.extend(table["pdb_path"].tolist())
        else:
            # A file lacking either column is reported by path and left out.
            print(csv_file)

    # Totals gathered over every accepted CSV.
    print(len(pdb_paths))
    print(len(seqs))