# Cpp4App_test / SEM / run_single_sem.py
# Author: HaochenGong
# Change: time cost count
# Commit: 1c42b13
import os
import time
import shutil
from bs4 import BeautifulSoup
from SEM.find_subtitle import find_title_Label_with_html, find_title_Label
from SEM.get_text import write_text, removeUnneccessaryElements, makeCoarseSegments
from SEM.types_pp_processing import getSentences_with_classifier
def run_single_pp(file):
    """Segment a single privacy-policy HTML file under ./SEM/txt/.

    Finds heading tags, writes coarse segments / headings to disk, then
    runs the sentence-level classifier over the extracted data-type
    segments. Prints timing for each stage.

    Parameters:
        file: path to the privacy-policy HTML file to process.

    Side effects:
        Recreates the ./SEM/txt/ directory, writes data_types.txt,
        headings.txt and classified_sentences/*.txt, prints timings.
    """
    result_root = "./SEM/txt/"
    # Start from a clean result directory on every run.
    if os.path.exists(result_root):
        shutil.rmtree(result_root)
    os.makedirs(result_root)

    segmentation_start_time = time.time()
    # Fixed pseudo-filename for the output sub-directory;
    # pathName[:-5] strips the ".html" suffix (-> "1").
    pathName = "1.html"
    out_dir = result_root + pathName[:-5]

    # Heuristic: the HTML tag most likely used for section headings.
    label = find_title_Label(file)

    para_start_time = time.time()
    # Close the handle explicitly; the original leaked it by passing an
    # open() result straight into BeautifulSoup.
    with open(file, encoding='utf-8') as fp:
        soup = BeautifulSoup(fp, features="html.parser")
    title_list = soup.find_all(label)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    if len(title_list) == 0:
        # No heading tags found: fall back to coarse segmentation of the
        # cleaned document text.
        removeUnneccessaryElements(soup)
        result = makeCoarseSegments(soup)
        if result:
            # Open once instead of re-opening for every segment.
            with open(out_dir + '/data_types.txt', "a", encoding='utf-8') as f:
                for seg in result:
                    f.write(seg)
                    f.write("\n")
    else:
        write_text(title_list, pathName, soup)
    print("Paragraph level processing time: %2.2f s" % (time.time() - para_start_time))

    # Record the raw heading tags for later inspection. Guarded so that an
    # empty title_list creates no file (matches the original loop-open).
    if title_list:
        with open(out_dir + '/headings.txt', "a", encoding='utf-8') as g:
            for t in title_list:
                g.write(str(t))
                g.write("\n")

    # Sentence-level classification of the extracted data-type segments.
    data_types_path = out_dir + "/data_types.txt"
    if not os.path.exists(data_types_path):
        print("No information about data types!")
    else:
        sen_start_time = time.time()
        dict_sentences, dict_index = getSentences_with_classifier(data_types_path)
        print("sentence level processing time: %2.2f s" % (time.time() - sen_start_time))

        classified_dir = out_dir + "/classified_sentences"
        os.makedirs(classified_dir)
        # One file per non-empty sentence category.
        for key in dict_sentences:
            if dict_sentences[key] == "":
                continue
            with open(classified_dir + "/" + key + ".txt", "a",
                      encoding='utf-8') as g:
                g.write(dict_sentences[key])
        # Keyword index written once, guarded so an empty dict_index
        # creates no file (matches the original loop-open behavior).
        if dict_index:
            with open(classified_dir + "/keyword_index.txt", "a",
                      encoding='utf-8') as f:
                for key in dict_index:
                    f.write(key + ":" + str(dict_index[key]) + "\n")
    print("time cost for segmentation: %2.2f s" % (time.time() - segmentation_start_time))