import os
import time
import shutil

from bs4 import BeautifulSoup

from SEM.find_subtitle import find_title_Label_with_html, find_title_Label
from SEM.get_text import write_text, removeUnneccessaryElements, makeCoarseSegments
from SEM.types_pp_processing import getSentences_with_classifier


def _append_lines(path, lines):
    """Append every item of *lines* to *path*, each followed by a newline.

    The file is opened once, in append mode.  When *lines* is empty the file
    is not opened at all, so no empty file is created; ``run_single_pp``
    relies on that because it later tests for the existence of
    ``data_types.txt``.
    """
    pending = [line + "\n" for line in lines]
    if not pending:
        return
    with open(path, "a", encoding="utf-8") as out:
        out.writelines(pending)


def run_single_pp(file):
    """Segment one HTML file and write the results under ``./SEM/txt/1/``.

    The output root ``./SEM/txt/`` is deleted and recreated on every call, so
    results of a previous run are discarded.

    Steps, as far as this module shows them:

    1. ``find_title_Label(file)`` yields the tag name used for headings and
       the document is parsed with BeautifulSoup's ``html.parser``.
    2. If no heading tags are found, the soup is cleaned with
       ``removeUnneccessaryElements`` and each segment returned by
       ``makeCoarseSegments`` is appended to ``data_types.txt``.  Otherwise
       ``write_text`` is called (presumably it writes ``data_types.txt``
       itself -- verify against ``SEM.get_text``).
    3. Every heading tag found is appended to ``headings.txt``.
    4. If ``data_types.txt`` exists, ``getSentences_with_classifier`` is run
       on it; each non-empty entry of the first returned dict is written to
       ``classified_sentences/<key>.txt`` and the second dict is written as
       ``key:value`` lines to ``classified_sentences/keyword_index.txt``.

    Timing information for each stage is printed to stdout.

    Args:
        file: Path of the HTML file to process; it is read as UTF-8.

    Returns:
        None.  All results are written to disk.
    """
    result_root = "./SEM/txt/"
    # Start from a clean output tree on every run.
    if os.path.exists(result_root):
        shutil.rmtree(result_root)
    os.makedirs(result_root)

    segmentation_start_time = time.time()
    # Output is always stored as if the input were named "1.html"; the
    # directory name is that name without its ".html" suffix.
    pathName = "1.html"
    out_dir = result_root + pathName[:-5]
    data_types_path = out_dir + "/data_types.txt"

    label = find_title_Label(file)

    para_start_time = time.time()
    # BeautifulSoup consumes the handle while constructing the tree, so the
    # file can be closed as soon as parsing is done.
    with open(file, encoding="utf-8") as html:
        soup = BeautifulSoup(html, features="html.parser")
    title_list = soup.find_all(label)

    os.makedirs(out_dir, exist_ok=True)

    if not title_list:
        # No headings to split on: fall back to coarse segmentation.
        removeUnneccessaryElements(soup)
        _append_lines(data_types_path, makeCoarseSegments(soup))
    else:
        write_text(title_list, pathName, soup)
    print("Paragraph level processing time: %2.2f s" % (time.time() - para_start_time))

    _append_lines(out_dir + "/headings.txt", (str(t) for t in title_list))

    # data types
    if not os.path.exists(data_types_path):
        print("No information about data types!")
    else:
        sen_start_time = time.time()
        dict_sentences, dict_index = getSentences_with_classifier(data_types_path)
        print("sentence level processing time: %2.2f s" % (time.time() - sen_start_time))

        classified_dir = out_dir + "/classified_sentences"
        os.makedirs(classified_dir)
        for key, text in dict_sentences.items():
            # Categories without any sentence get no file.
            if text == "":
                continue
            with open(classified_dir + "/" + key + ".txt", "a", encoding="utf-8") as g:
                g.write(text)
        _append_lines(
            classified_dir + "/keyword_index.txt",
            (key + ":" + str(value) for key, value in dict_index.items()),
        )

    print("time cost for segmentation: %2.2f s" % (time.time() - segmentation_start_time))