import os import time import shutil from bs4 import BeautifulSoup from find_subtitle import find_title_Label from get_text import write_text, write_text_without_label, removeUnneccessaryElements, makeCoarseSegments from types_pp_processing import caculateSim, getSentences, getSentences_no_classifier, getSentences_with_classifier # from children_pp_processing import process_specialGroup # from region_pp_processing import get_alifornia # from retention_pp_processing import retention_process # from clean_txt import cleaning_txt if __name__ == '__main__': # INPUT = "../dataset/privacy_policies_html/" INPUT = "./pp_example/" # cleaning_txt("./txt") # os.mkdir("./txt") if os.path.exists("./txt"): shutil.rmtree("./txt") os.makedirs("./txt") for file in os.listdir(INPUT): segmentation_start_time = time.clock() pathName = os.path.basename(file) if pathName == ".DS_Store": continue path = INPUT+pathName label = find_title_Label(path) print("The current file is:" + pathName) # if pathName != '20.html': # continue para_start_time = time.clock() soup = BeautifulSoup(open(path,encoding='utf-8'), features="html.parser") title_list = soup.find_all(label) # cleaning_txt() if not os.path.exists('./txt/' + pathName[:-5]): os.mkdir('./txt/' + pathName[:-5]) if len(title_list) == 0 or pathName == '20.html' or pathName == '29.html' or pathName == '25.html' or pathName == '8.html' or pathName == '27.html' or pathName == '28.html': # write_text_without_label(soup.getText(), pathName) removeUnneccessaryElements(soup) result = makeCoarseSegments(soup) for seg in result: with open('./txt/' + pathName[:-5] + '/data_types.txt', "a", encoding='utf-8') as f: f.write(seg) f.write("\n") else: write_text(title_list, pathName) print("Paragraph level processing time: %2.2f s" % (time.clock() - para_start_time)) for t in title_list: with open('./txt/' + pathName[:-5] + '/headings.txt', "a", encoding='utf-8') as g: g.write(str(t)) g.write("\n") # data types if not os.path.exists("./txt/"+pathName[:-5]+"/data_types.txt"): print("No information about data types!") else: sen_start_time = time.clock() # all_types = caculateSim("./txt/"+pathName[:-5]+"/data_types.txt") dict_sentences, dict_index = getSentences_with_classifier("./txt/" + pathName[:-5] + "/data_types.txt") print("sentence level processing time: %2.2f s" % (time.clock() - sen_start_time)) os.makedirs("./txt/"+pathName[:-5]+"/classified_sentences") for key in dict_sentences: if dict_sentences[key] == "": continue with open('./txt/' + pathName[:-5] + "/classified_sentences/" + key + ".txt", "a", encoding='utf-8') as g: g.write(dict_sentences[key]) for key in dict_index: with open('./txt/' + pathName[:-5] + "/classified_sentences/keyword_index.txt", "a", encoding='utf-8') as f: f.write(key + ":" + str(dict_index[key]) + "\n") # #children # if not os.path.exists("./txt/"+pathName[:-5]+"/children.txt"): # print("No information about children!") # else: # age , rule, childUse, specialGroup = process_specialGroup("./txt/"+pathName[:-5]+"/children.txt") # # print("children age is :") # print("D.CHILDREN.age : " + str(age)) # if childUse == 1: # print(" the skill’s privacy policy states that it does not collect any information from children") # print("D.CHILDREN.[CTypes] = [ ]") # else: # # print("D.CHILDREN.[CTypes] :" + str(all_types)) # None # #region # if not os.path.exists("./txt/"+pathName[:-5]+"/region.txt"): # print("No information about region!") # else: # specialArea,california = get_alifornia("./txt/"+pathName[:-5]+"/region.txt") # if california == 1: # print("D.REGIONS.region :California") # print("D.REGIONS.delete : Yes") # else: # print("D.REGIONS.region :No mention") # print("D.REGIONS.delete : No") # # #retention # if not os.path.exists("./txt/"+pathName[:-5]+"/data_retention.txt"): # print("No information about data retention!") # else: # retention_time, text = retention_process("./txt/"+pathName[:-5]+"/data_retention.txt") # print("D.RETENTION.period :"+ retention_time) # # cleaning_txt() # print("-------------------------------------------------------") print("time cost for segmentation: %2.2f s" % (time.clock() - segmentation_start_time))