import glob
import json
import os
import re

import pandas as pd
from tqdm.auto import tqdm

from utilities import get_simple_logger, PDFExtractor, ModuleException

file_dir = os.path.dirname(os.path.realpath(__file__))
csv_file_dir = os.path.join(file_dir, "materials")
data_dir = os.path.join(file_dir, "data")
train_json_dir = os.path.join(data_dir, "train_jsons")
test_json_dir = os.path.join(data_dir, "test_jsons")

logger = get_simple_logger("create_dataset")
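
# Expected layout relative to this file (all paths are built above):
#   materials/parspec_<split>_data.csv   source CSVs with "ID", "URL" and
#                                        "Is lighting product?" columns
#   data/train_jsons, data/test_jsons    per-ID JSONs written by create_json()
#   data/<split>.csv                     final CSVs written by create_dataframe()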


def save_json(data, split="train"):
    """Save one extracted record to the split's JSON directory as <id>.json."""
    id_ = data["id"]
    if split == "train":
        to_save = os.path.join(train_json_dir, f"{id_}.json")
    else:
        to_save = os.path.join(test_json_dir, f"{id_}.json")
    logger.debug(f"Saving the json to {to_save}")
    with open(to_save, "w") as f:
        json.dump(data, f)


def clean_text(text):
    """Normalise extracted PDF text before it is stored in the dataset."""
    # collapse runs of whitespace into a single space
    regex = re.compile(r"\s{2,}")
    text = regex.sub(" ", text)
    # collapse runs of newlines into a single newline
    regex = re.compile(r"\n{2,}")
    text = regex.sub("\n", text)
    # drop lines with 3 or fewer characters
    lines = text.split("\n")
    lines = [line for line in lines if len(line) > 3]
    text = "\n".join(lines)
    # cap the text at 10,000 characters
    text = text[:10000]
    return text


def create_id(id_, split):
    """Rebuild the CSV-style ID from the zero-padded id stored in the JSONs:
    "P-<n>" for the train split and "TP<n>" for the test split."""
    id_ = int(id_)
    if split == "train":
        return f"P-{id_}"
    else:
        return f"TP{id_}"


def create_json(split="train"):
    """Creates the dataset from the csv file and saves it to the data_dir

    Parameters
    ----------
    split : str, optional
        The split to create the dataset for, by default "train"
    """
    logger.info(f"Creating the dataset for {split}")
    df_path = os.path.join(csv_file_dir, f"parspec_{split}_data.csv")
    df = pd.read_csv(df_path)
    df.dropna(inplace=True)
    json_dir = train_json_dir if split == "train" else test_json_dir
    os.makedirs(json_dir, exist_ok=True)
    # IDs that were already extracted in a previous run
    extracted_files = [os.path.splitext(name)[0] for name in os.listdir(json_dir)]
    logger.info(f"{len(extracted_files)} files are already extracted.")
    for i, row in tqdm(
        df.iterrows(),
        desc="extracting information...",
        total=len(df) - len(extracted_files),
    ):
        # strip the split prefix ("P-" for train, "TP" for test) and zero-pad
        id_ = row["ID"]
        if "-" in id_:
            # for train
            id_ = id_.split("-")[1]
        else:
            # for test
            id_ = id_[2:]
        id_ = id_.zfill(4)
        if id_ in extracted_files:
            logger.debug(f"File {id_} already extracted")
            continue
        logger.info(f"Extracting the file for ID {id_}")
        url = row["URL"]
        label = 1 if row["Is lighting product?"] in [1, "Yes"] else 0
        try:
            pdf_extractor = PDFExtractor(
                file_path=url,
                is_url=True,
                min_characters=5,
                maximum_pages=3,
            )
            final = pdf_extractor.extract_pages()
            data = {
                "status": "ok",
                "id": id_,
                "label": label,
                "page_contents": pdf_extractor.page_contents,
                "final_content": clean_text(final),
                "url": url,
            }
        except ModuleException:
            logger.error(f"URL is not valid for ID {id_}. Using null values.")
            data = {
                "status": "error",
                "id": id_,
                "label": label,
                "page_contents": None,
                "final_content": None,
                "url": url,
            }
        # save the json
        save_json(data, split)
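

# Each JSON written above has the shape
#   {"status": "ok" | "error", "id": "<zero-padded id>", "label": 0 | 1,
#    "page_contents": PDFExtractor.page_contents | None,
#    "final_content": cleaned text | None, "url": "<url>"}
# create_dataframe() below reads these records back, skips the error ones, and
# joins them with the source CSV to produce data/<split>.csv.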


def create_dataframe(split):
    """Collect the extracted JSONs for `split` into data/<split>.csv."""
    df_path = os.path.join(csv_file_dir, f"parspec_{split}_data.csv")
    df = pd.read_csv(df_path)
    json_dir = train_json_dir if split == "train" else test_json_dir
    json_files = glob.glob(f"{json_dir}/*.json")
    statuses = []
    ids = []
    labels = []
    contents = []
    urls = []
    for file in tqdm(json_files, desc="creating dataframe..."):
        with open(file, "r") as f:
            data = json.load(f)
        if data["status"] == "error":
            continue
        statuses.append(data["status"])
        ids.append(create_id(data["id"], split=split))
        labels.append(data["label"])
        contents.append(clean_text(data["final_content"]))
        urls.append(data["url"])
    final_df = pd.DataFrame(
        {
            "status": statuses,
            "id": ids,
            "label": labels,
            "content": contents,
            "url": urls,
        }
    )
    # join back to the source CSV on the rebuilt ID and keep the label from there
    final = pd.merge(final_df, df, left_on="id", right_on="ID")[
        ["id", "content", "Is lighting product?", "url"]
    ]
    final.rename(columns={"Is lighting product?": "label"}, inplace=True)
    final["label"] = final["label"].map(
        {
            "Yes": 1,
            "No": 0,
        }
    )
    final.to_csv(
        os.path.join(data_dir, f"{split}.csv"), index=False, escapechar="\\"
    )  # setting escapechar is required
    return final


if __name__ == "__main__":
    create_dataframe(split="test")
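    # Full pipeline sketch (assumes the parspec CSVs are present under
    # materials/): the extraction step has to run first so the per-ID JSONs
    # exist before the dataframe step reads them back.
    # create_json(split="train")
    # create_json(split="test")
    # create_dataframe(split="train")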