Spaces:
Runtime error
Runtime error
| import requests | |
def clean_up_tags(tags_list):
    """Strip the "prefix:" part from each tag and join the results.

    Args:
        tags_list: An iterable of tag strings, e.g. ``["license:mit", "nlp"]``.

    Returns:
        str: The cleaned tags joined with ", ". Tags without a colon are
        kept as-is; an empty input yields an empty string.
    """
    # Split only on the first colon so a value that itself contains a
    # colon ("a:b:c" -> "b:c") is kept whole instead of being truncated.
    tags_cleaned = [
        tag.split(':', 1)[1] if ':' in tag else tag
        for tag in tags_list
    ]
    return ", ".join(tags_cleaned)
def check_api_url(url):
    """Ensure "/api" is present between ".co" and "/datasets" in a URL.

    The URL is split at the first ".co" and then at the first "/datasets"
    that follows it. If the text between those two markers does not already
    contain "/api", "/api" is inserted directly after ".co".

    Args:
        url (str): A URL string.

    Returns:
        str: The URL with "/api" inserted if necessary. If the URL does not
        contain ".co" followed by "/datasets", it is returned unchanged.
    """
    # partition() splits on the FIRST occurrence only, so later ".co" or
    # "/datasets" substrings (e.g. in a dataset name) are left intact.
    first_part, co_marker, after_co = url.partition(".co")
    if not co_marker:
        return url

    between, datasets_marker, after_datasets = after_co.partition("/datasets")
    if not datasets_marker:
        return url

    middle_part = "" if "/api" in between else "/api"
    return first_part + ".co" + middle_part + between + "/datasets" + after_datasets
def get_dataset_metadata(dataset_url):
    """Fetch selected metadata fields for a dataset from its API URL.

    The URL is first normalised with ``check_api_url`` and then requested
    with ``requests.get``.

    Args:
        dataset_url (str): URL of the dataset (with or without "/api").

    Returns:
        dict: The 'id', 'description' and 'tags' entries that are present in
        the JSON response. Empty if the response status is not 200.
    """
    # Bound the wait so an unresponsive server cannot hang the caller.
    request_timeout = 10  # seconds
    keys_to_retrieve = ('id', 'description', 'tags')

    api_url = check_api_url(dataset_url)
    response = requests.get(api_url, timeout=request_timeout)
    if response.status_code != 200:
        return {}

    response_json = response.json()
    return {
        key: response_json[key]
        for key in keys_to_retrieve
        if key in response_json
    }
def get_dataset_readme(dataset_url):
    """Fetch a dataset's README text together with its id.

    The README is requested from ``dataset_url + '/raw/main/README.md'``;
    the id is read from the JSON returned by the URL produced by
    ``check_api_url(dataset_url)``.

    Args:
        dataset_url (str): URL of the dataset page.

    Returns:
        dict: ``{'id': ..., 'README': ...}`` when both requests succeed and
        the metadata JSON contains an 'id'. Otherwise an empty dict.
    """
    # Bound the wait so an unresponsive server cannot hang the caller.
    request_timeout = 10  # seconds

    metadata_url = check_api_url(dataset_url)
    readme_url = dataset_url + '/raw/main/README.md'

    readme_response = requests.get(readme_url, timeout=request_timeout)
    if readme_response.status_code != 200:
        # No README available: skip the metadata request entirely.
        return {}

    metadata_response = requests.get(metadata_url, timeout=request_timeout)
    if metadata_response.status_code != 200:
        return {}

    try:
        dataset_id = metadata_response.json()['id']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, was not a mapping, or had no 'id' entry.
        return {}

    return {'id': dataset_id, 'README': readme_response.text}