SwissAI-Team7 / clean.py
Muzansama's picture
Upload 3 files
2ab9254 verified
raw
history blame
1.78 kB
import os
from tqdm import tqdm
import json
# Directory holding the raw scraped dumps, and the directory that receives
# the cleaned JSON files (one output file per non-empty input file).
INPUT_DIR = '/capstor/scratch/cscs/dshah/AI/HospitalData'
OUTPUT_DIR = '/capstor/scratch/cscs/dshah/AI/CleanedHospitalData'

# Records whose text is shorter than this many characters are dropped.
MIN_TEXT_LENGTH = 100

# Marker separating the url field from the text field inside one record.
_TEXT_MARKER = ", 'text':"


def _site_host(url):
    """Return the host part of *url*, used as the reference site for a file.

    With a scheme the host is whatever follows ``://`` up to the first ``/``;
    without a scheme it is whatever follows ``www.``. A URL that has neither
    is used as-is (up to the first ``/``) instead of raising ``IndexError``.
    """
    if "://" in url:
        rest = url.split("://")[1]
    elif "www." in url:
        rest = url.split("www.")[1]
    else:
        rest = url
    return rest.split("/")[0]


def _strip_quotes(value):
    """Return *value* without surrounding whitespace and quote characters."""
    # Whitespace has to go first: the field starts with a space, which would
    # otherwise shield the opening quote from being removed.
    value = value.strip()
    if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
        value = value[1:-1]
    else:
        # Unbalanced quoting (e.g. a cut-off record): drop stray quotes.
        value = value.strip("'")
    return value.strip()


def clean_records(raw, min_length=MIN_TEXT_LENGTH):
    """Extract the usable ``{'url', 'text'}`` records from one raw dump.

    NOTE(review): the dump looks like the ``repr`` of a list of dicts
    (``[{'url': '...', 'text': '...'}, ...]``) -- confirm against the
    scraper. Parsing is purely textual, so escape sequences in the text are
    kept exactly as they appear in the dump.

    A record is dropped when it has no text field, when its URL does not
    contain the host of the first record's URL, when its text is empty,
    contains ``Error`` or ``Not Found``, or is shorter than *min_length*.
    The reason for each dropped record is printed.

    Args:
        raw: Full content of one dump file.
        min_length: Minimum number of characters a text must have.

    Returns:
        A list of ``{'url': ..., 'text': ...}`` dicts; empty when nothing
        in the dump is usable.
    """
    # Strip whitespace before the brackets so a trailing newline cannot
    # leave "}]" glued to the last record.
    body = raw.strip()
    if body in ('', '[]'):
        return []

    site_host = None
    cleaned = []
    for record in body.strip('[{').strip('}]').split('}, {'):
        head, marker, tail = record.partition(_TEXT_MARKER)
        url = head.replace("'url': '", "").replace("'", "")

        # The first record of a file defines the site every other record
        # has to belong to.
        if site_host is None:
            print(url)
            site_host = _site_host(url)

        if not marker:
            print(record)
            print("Error in splitting text\n")
            continue
        text = _strip_quotes(tail)

        if site_host not in url:
            print("Different base URL\n")
            continue
        if text == '':
            print("Text is empty\n")
            continue
        if 'Error' in text or 'Not Found' in text:
            print("Error or Not Found in text\n")
            continue
        if len(text) < min_length:
            print("Text is too short\n")
            continue

        cleaned.append({
            'url': url.strip(),
            'text': text,
        })
    return cleaned


def main():
    """Clean every dump in ``INPUT_DIR`` and write the results as JSON.

    Each input file with at least one usable record produces
    ``OUTPUT_DIR/<input name>.json``; files without usable records produce
    no output.
    """
    # Sub-directories cannot be opened as dumps, so only regular files count.
    names = [
        name for name in os.listdir(INPUT_DIR)
        if os.path.isfile(os.path.join(INPUT_DIR, name))
    ]
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for name in tqdm(names):
        with open(os.path.join(INPUT_DIR, name), 'r', encoding='utf-8') as f:
            raw = f.read()

        cleaned = clean_records(raw)
        if not cleaned:
            continue

        # Build the path from the output directory rather than by string
        # replacement, which would also rewrite a matching file name.
        out_path = os.path.join(OUTPUT_DIR, name + '.json')
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned, f, indent=4)


if __name__ == "__main__":
    main()