Spaces:
Runtime error
Runtime error
| import json | |
| import time | |
| import os | |
| import sys | |
| import requests | |
def _save_checkpoint(pages, file_path):
    """Atomically write *pages* as JSON to *file_path*.

    The data is written to a sibling temporary file and then moved over the
    target with ``os.replace``, so an interruption in the middle of a write
    cannot leave a truncated checkpoint behind.
    """
    tmp_path = f"{file_path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(pages, f)
    os.replace(tmp_path, file_path)


def fetch_digital_commonwealth(start_page=None, end_page=None):
    """Download Digital Commonwealth search result pages into a JSON file.

    Pages ``start_page`` up to, but not including, ``end_page`` are fetched
    one at a time and appended to ``./out{start_page}_{end_page}.json``. The
    file is rewritten after every page, so an interrupted run can be resumed
    by invoking the function again with the same bounds.

    Args:
        start_page: First page to fetch (int or numeric string). Defaults to
            ``sys.argv[1]``.
        end_page: Page at which to stop, exclusive (int or numeric string).
            Defaults to ``sys.argv[2]``.

    Returns:
        None. Progress is reported on stdout and results go to the JSON file.

    Raises:
        SystemExit: If a bound is omitted and is not on the command line.
        ValueError: If a bound cannot be converted to an integer.
    """
    start = time.time()
    base_url = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    max_retries = 5

    if start_page is None or end_page is None:
        if len(sys.argv) < 3:
            raise SystemExit("usage: python <script> START_PAGE END_PAGE")
        if start_page is None:
            start_page = sys.argv[1]
        if end_page is None:
            end_page = sys.argv[2]
    first_page = int(start_page)
    last_page = int(end_page)

    # The file name keeps the bounds exactly as given so that checkpoints
    # written by earlier runs are found again.
    file_path = f"./out{start_page}_{end_page}.json"

    output = []
    if os.path.exists(file_path):
        with open(file_path, "r") as file:
            output = json.load(file)

    # Each checkpoint entry is one page, saved in order starting at
    # first_page, so the next page to fetch is first_page + saved count.
    # (Using len(output) + 1 is only correct when the range starts at 1.)
    page = first_page + len(output)
    if page >= last_page:
        return None

    print(f'Reading page {page} up to page {end_page}')
    retries = 0
    completed = False
    while True:
        try:
            response = requests.get(
                f"{base_url}&page={page}", timeout=request_timeout
            )
            response.raise_for_status()
            data = response.json()
            # Read the pagination info before saving: a payload without it
            # is treated like a failed fetch and never reaches the file.
            next_page = data["meta"]["pages"]["next_page"]
        except (
            requests.exceptions.RequestException,
            ValueError,  # non-JSON body on requests versions predating JSONDecodeError
            KeyError,
            TypeError,
        ) as e:
            retries += 1
            print(f"An error occurred: {e}")
            print(
                f"Failed to fetch page {page} (attempt {retries}/{max_retries}). "
                f"Total pages saved: {len(output)}"
            )
            if retries >= max_retries:
                break
            # Back off before retrying instead of hammering the server.
            time.sleep(min(2 ** retries, 30))
            continue

        retries = 0
        # Append the page and checkpoint the whole list after every page.
        output.append(data)
        _save_checkpoint(output, file_path)

        # Stop when the API reports no further page or the requested
        # (exclusive) end of the range has been reached.
        if not next_page or next_page == last_page:
            print(f"Processed and saved page {page}. Total pages saved: {len(output)}")
            completed = True
            break

        print(f"finished page {page}")
        page = next_page

    end = time.time()
    print(f"Timer: {end - start}")
    if completed:
        print(f"Finished processing all pages. Total pages saved: {len(output)}")
    else:
        print(
            f"Stopped early after {max_retries} consecutive failures on page {page}. "
            f"Total pages saved: {len(output)}"
        )
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    fetch_digital_commonwealth()