Spaces:
Runtime error
Runtime error
| import numpy as np | |
| import pandas as pd | |
| import yaml | |
def load_config(config_file: str) -> dict:
    """Load a YAML configuration file.

    Args:
        config_file: Path to the YAML file to read.

    Returns:
        The document parsed by ``yaml.safe_load``. An empty file, which
        ``safe_load`` parses to ``None``, yields ``{}`` so the declared
        ``dict`` return type holds.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's default locale encoding.
    with open(config_file, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    return {} if config is None else config
# Destination CSV handed to subset_docs when this file runs as a script.
OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
# Source CSV handed to subset_docs when this file runs as a script.
INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
def subset_docs(input_file: str, output_file: str, sample: int):
    """Write a random sample of unique descriptions from a CSV to a new CSV.

    Reads ``input_file``, keeps only the ``Description`` column, treats the
    literal value ``"Not available"`` as missing, drops missing and
    duplicate rows, draws ``sample`` rows at random and writes them to
    ``output_file`` without the index column.

    Args:
        input_file: Path of the CSV to read; must contain a
            ``Description`` column.
        output_file: Path of the CSV to write. Missing parent directories
            are created.
        sample: Number of rows to draw (without replacement).

    Raises:
        ValueError: Raised by pandas when ``sample`` exceeds the number of
            rows left after cleaning.
    """
    from pathlib import Path

    print(f"Reading data from {input_file}")
    data = pd.read_csv(input_file)

    data = (
        data[["Description"]]
        .replace("Not available", np.nan)
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
        .sample(sample)
    )

    print(f"Number of rows: {data.shape[0]}")
    print(f"Number of unique rows: {data['Description'].nunique()}")

    print(f"Saving file to {output_file}")
    # to_csv refuses to write into a directory that does not exist yet,
    # so make sure the destination folder is there first.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(output_file, index=False)
if __name__ == "__main__":
    # Script entry point: the sample size comes from the "n_docs" entry of
    # params.yaml; input and output paths are the module-level constants.
    n_docs = load_config("params.yaml")["n_docs"]
    subset_docs(INPUT_FILE, OUTPUT_FILE, sample=n_docs)