Spaces:
Runtime error
Runtime error
| import requests | |
| import time | |
| import json | |
| import os | |
| import argparse | |
| from tqdm import tqdm | |
def fetch_info_data(url):
    """Fetch all paginated results from an InterPro API endpoint.

    Follows the "next" link in each JSON page until exhausted,
    accumulating every entry from each page's "results" list.

    Args:
        url: Starting URL of a paginated InterPro API endpoint.

    Returns:
        list: All result entries across every page (empty if none).

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    data_list = []
    while url:
        # timeout guards against a hung connection stalling the whole batch
        response = requests.get(url, timeout=60)
        # Fail loudly on HTTP errors instead of trying to parse an
        # error page as JSON (the original assumed every response is 200).
        response.raise_for_status()
        data = response.json()
        data_list.extend(data.get("results", []))
        url = data.get("next")
        # Rate-limit between pages, but skip the pause after the last one.
        if url:
            time.sleep(10)
    return data_list
def download_single_interpro(interpro_id, out_dir):
    """Download reviewed-protein data for one InterPro entry.

    Writes three files under ``out_dir/<interpro_id>/``:
      - detail.json: raw API results for every matching protein
      - meta.json:   accession plus protein-count summary
      - uids.txt:    newline-separated protein accessions

    Args:
        interpro_id: InterPro accession (e.g. "IPR000001").
        out_dir: Root directory for per-entry subdirectories.

    Returns:
        str: Human-readable status message (success, skip, or error).
    """
    interpro_dir = os.path.join(out_dir, interpro_id)
    file = os.path.join(interpro_dir, "detail.json")
    # Check for existing output BEFORE creating directories, so a skip
    # leaves the filesystem untouched.
    if os.path.exists(file):
        return f"Skipping {interpro_id}, already exists"
    os.makedirs(interpro_dir, exist_ok=True)
    start_url = f"https://www.ebi.ac.uk/interpro/api/protein/reviewed/entry/InterPro/{interpro_id}/?extra_fields=counters&page_size=20"
    try:
        info_data = fetch_info_data(start_url)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate and a batch run can be interrupted cleanly.
        return f"Error downloading {interpro_id}"
    if not info_data:
        return f"No data found for {interpro_id}"
    with open(file, 'w') as f:
        json.dump(info_data, f)
    # Save metadata
    meta_data = {
        "metadata": {"accession": interpro_id},
        "num_proteins": len(info_data)
    }
    with open(os.path.join(interpro_dir, "meta.json"), 'w') as f:
        json.dump(meta_data, f)
    # Save UIDs
    uids = [d["metadata"]["accession"] for d in info_data]
    with open(os.path.join(interpro_dir, "uids.txt"), 'w') as f:
        f.write("\n".join(uids))
    return f"Successfully downloaded {interpro_id}"
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Download InterPro reviewed-protein data, singly or in bulk."
    )
    parser.add_argument("--interpro_id", type=str, default=None,
                        help="Single InterPro accession to download")
    parser.add_argument("--interpro_json", type=str, default=None,
                        help="JSON file listing entries (each with metadata.accession)")
    parser.add_argument("--out_dir", type=str, default="download/interpro_domain",
                        help="Root output directory")
    parser.add_argument("--error_file", type=str, default=None,
                        help="Where to write a report of failed entries")
    parser.add_argument("--chunk_num", type=int, default=None,
                        help="Total number of chunks to split the entry list into")
    parser.add_argument("--chunk_id", type=int, default=None,
                        help="Zero-based index of the chunk to process")
    args = parser.parse_args()

    if not args.interpro_id and not args.interpro_json:
        print("Error: Must provide either interpro_id or interpro_json")
        exit(1)

    os.makedirs(args.out_dir, exist_ok=True)
    error_proteins = []
    error_messages = []

    if args.interpro_id:
        result = download_single_interpro(args.interpro_id, args.out_dir)
        print(result)
        if "Error" in result or "No data" in result:
            error_proteins.append(args.interpro_id)
            error_messages.append(result)
    elif args.interpro_json:
        # NOTE: the original created the input file's directory before
        # reading it — pointless for a file being read, and it crashed
        # (makedirs("")) when the path had no directory component.
        try:
            with open(args.interpro_json, 'r') as f:
                all_data = json.load(f)
        except FileNotFoundError:
            print(f"Error: Could not find file {args.interpro_json}")
            exit(1)
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON file {args.interpro_json}")
            exit(1)
        # Optionally process only one contiguous slice of the list so
        # several invocations can split the work between them.
        if args.chunk_num is not None and args.chunk_id is not None:
            start = args.chunk_id * len(all_data) // args.chunk_num
            end = (args.chunk_id + 1) * len(all_data) // args.chunk_num
            all_data = all_data[start:end]
        for data in tqdm(all_data):
            interpro_id = data["metadata"]["accession"]
            result = download_single_interpro(interpro_id, args.out_dir)
            if "Error" in result or "No data" in result:
                error_proteins.append(interpro_id)
                error_messages.append(result)

    if error_proteins and args.error_file:
        # dirname is "" for a bare filename, and makedirs("") raises —
        # only create the directory when there actually is one.
        error_file_dir = os.path.dirname(args.error_file)
        if error_file_dir:
            os.makedirs(error_file_dir, exist_ok=True)
        with open(args.error_file, 'w') as f:
            for protein, message in zip(error_proteins, error_messages):
                f.write(f"{protein} - {message}\n")