Spaces:
Runtime error
Runtime error
import ast
import json
import os

from tqdm import tqdm
# Cleans scraped page dumps: every file in INPUT_DIR is parsed into
# {'url', 'text'} records, filtered, and written as JSON to OUTPUT_DIR.
INPUT_DIR = '/capstor/scratch/cscs/dshah/AI/HospitalData'
OUTPUT_DIR = '/capstor/scratch/cscs/dshah/AI/CleanedHospitalData'

# Records whose stripped text is shorter than this many characters are dropped.
MIN_TEXT_LENGTH = 100
# A record is dropped when its text contains any of these substrings.
ERROR_MARKERS = ('Error', 'Not Found')
# Separator between the url field and the text field in the textual dump.
TEXT_FIELD_DELIMITER = ", 'text':"


def base_host(url):
    """Return the host part of *url* used to recognise same-site records.

    With a scheme ("https://www.a.org/x") the host is everything between
    "://" and the next "/", so a "www." prefix is kept.  Without a scheme
    the text after "www." is used.  A URL with neither marker used to raise
    IndexError and abort the whole run; it now falls back to the text before
    the first "/".
    """
    if "://" in url:
        remainder = url.split("://")[1]
    elif "www." in url:
        remainder = url.split("www.")[1]
    else:
        remainder = url
    return remainder.split("/")[0]


def parse_records_by_splitting(raw):
    """Parse a dump by splitting on its textual delimiters (fallback parser).

    Assumes the layout ``[{'url': '...', 'text': '...'}, {...}]``.  Returns a
    list of dicts; a record in which the text delimiter cannot be found gets
    only a 'url' key so the caller can report it as malformed.
    """
    # Strip whitespace first: a trailing newline would otherwise stop the
    # closing "}]" from being removed from the last record.
    body = raw.strip().strip('[{').strip('}]')
    records = []
    for chunk in body.split('}, {'):
        url_part, delimiter, text_part = chunk.partition(TEXT_FIELD_DELIMITER)
        record = {'url': url_part.replace("'url': '", "").replace("'", "")}
        if delimiter:
            # Whitespace must go before the quotes: the field starts with a
            # space, so stripping quotes first leaves the opening quote behind.
            record['text'] = text_part.strip().strip("'\"").strip()
        records.append(record)
    return records


def parse_records(raw):
    """Return the records stored in the dump text *raw* as a list of dicts.

    The dump is first read as a Python literal (a list of dicts), which
    handles quoting and escape sequences correctly.  If it is not a valid
    literal, or is not a sequence of dicts, the delimiter-based parser is
    used instead so that partially broken files are still processed.
    """
    try:
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return parse_records_by_splitting(raw)
    if isinstance(parsed, (list, tuple)) and all(
        isinstance(item, dict) for item in parsed
    ):
        return list(parsed)
    return parse_records_by_splitting(raw)


def clean_records(records):
    """Filter *records* and return the kept ones as {'url', 'text'} dicts.

    The host of the first record with a usable URL is taken as the base host
    of the file.  A record is dropped (with a message on stdout) when its
    url or text is missing, its URL does not contain the base host, its text
    is empty, contains one of ERROR_MARKERS, or is shorter than
    MIN_TEXT_LENGTH characters.
    """
    cleaned = []
    base_url = None
    for record in records:
        url = record.get('url')
        if not isinstance(url, str):
            print(record)
            print("Error in splitting text\n")
            continue
        if base_url is None:
            # The base host is fixed by the first record even if that record
            # is later rejected for its text.
            print(url)
            base_url = base_host(url)
        text = record.get('text')
        if not isinstance(text, str):
            print(record)
            print("Error in splitting text\n")
            continue
        text = text.strip()
        if base_url not in url:
            print("Different base URL\n")
            continue
        if not text:
            print("Text is empty\n")
            continue
        if any(marker in text for marker in ERROR_MARKERS):
            print("Error or Not Found in text\n")
            continue
        if len(text) < MIN_TEXT_LENGTH:
            print("Text is too short\n")
            continue
        cleaned.append({'url': url.strip(), 'text': text})
    return cleaned


def main():
    """Clean every file in INPUT_DIR and write '<name>.json' to OUTPUT_DIR.

    Files that are empty, contain only '[]', or yield no record after
    filtering produce no output file.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    paths = [os.path.join(INPUT_DIR, name) for name in os.listdir(INPUT_DIR)]
    for path in tqdm(paths):
        # os.listdir also returns sub-directories, which cannot be opened.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            raw = f.read()
        if raw.strip() in ('', '[]'):
            continue
        cleaned = clean_records(parse_records(raw))
        if not cleaned:
            continue
        # Build the output path from the file name only, so a file name that
        # itself contains "HospitalData" is not rewritten.
        out_path = os.path.join(OUTPUT_DIR, os.path.basename(path) + '.json')
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned, f, indent=4)


if __name__ == "__main__":
    main()