Spaces:

Cpp4App
/

Cpp4App_test

Sleeping

Cpp4App_test / SEM /run_single_sem.py

HaochenGong

change time cost count

1c42b13 over 1 year ago

3.07 kB

	import os
	import time
	import shutil

	from bs4 import BeautifulSoup

	from SEM.find_subtitle import find_title_Label_with_html, find_title_Label
	from SEM.get_text import write_text, removeUnneccessaryElements, makeCoarseSegments
	from SEM.types_pp_processing import getSentences_with_classifier


	def run_single_pp(file):
	    """Segment one HTML file and write the results under ``./SEM/txt/1/``.

	    The output root ``./SEM/txt/`` is deleted and recreated on every call,
	    so anything left there by a previous run is discarded.

	    Steps:

	    1. ``find_title_Label(file)`` chooses the tag used to look up headings,
	       and the parsed document is searched for that tag.
	    2. If no heading is found, the document is cleaned with
	       ``removeUnneccessaryElements`` and split with ``makeCoarseSegments``;
	       each segment is appended as one line to ``data_types.txt``.
	       Otherwise the work is delegated to ``write_text`` (what that helper
	       writes is not visible here -- presumably ``data_types.txt`` among
	       others; verify against ``SEM.get_text``).
	    3. Every heading found is written, one per line, to ``headings.txt``.
	    4. If ``data_types.txt`` exists, ``getSentences_with_classifier`` is run
	       on it. Each non-empty entry of the first returned mapping is written
	       to ``classified_sentences/<key>.txt`` and the second mapping is
	       dumped as ``key:value`` lines to
	       ``classified_sentences/keyword_index.txt``.

	    Timing information for each phase is printed to stdout.

	    Args:
	        file: Path of the HTML file to process; it is read as UTF-8.

	    Returns:
	        None. All results are written to disk.
	    """
	    result_root = "./SEM/txt/"

	    # Start from an empty output tree on every run.
	    if os.path.exists(result_root):
	        shutil.rmtree(result_root)
	    os.makedirs(result_root)

	    segmentation_start_time = time.time()

	    # The output sub-directory name is fixed: "1.html" minus its extension.
	    # pathName itself is still handed to write_text unchanged.
	    pathName = "1.html"
	    out_dir = result_root + pathName[:-5]
	    data_types_path = out_dir + "/data_types.txt"

	    label = find_title_Label(file)

	    para_start_time = time.time()
	    # BeautifulSoup reads the whole handle while constructing the tree, so
	    # the file can be closed as soon as the constructor returns.
	    with open(file, encoding='utf-8') as html_file:
	        soup = BeautifulSoup(html_file, features="html.parser")
	    title_list = soup.find_all(label)

	    os.makedirs(out_dir, exist_ok=True)

	    if not title_list:
	        # No headings: fall back to coarse segmentation of the whole page.
	        removeUnneccessaryElements(soup)
	        segments = list(makeCoarseSegments(soup))
	        # Only create the file when there is something to write, because
	        # its mere existence triggers sentence classification below.
	        if segments:
	            with open(data_types_path, "a", encoding='utf-8') as f:
	                for seg in segments:
	                    f.write(seg)
	                    f.write("\n")
	    else:
	        write_text(title_list, pathName, soup)
	    print("Paragraph level processing time: %2.2f s" % (time.time() - para_start_time))

	    if title_list:
	        with open(out_dir + '/headings.txt', "a", encoding='utf-8') as g:
	            for t in title_list:
	                g.write(str(t))
	                g.write("\n")

	    # data types
	    if not os.path.exists(data_types_path):
	        print("No information about data types!")
	    else:
	        sen_start_time = time.time()
	        dict_sentences, dict_index = getSentences_with_classifier(data_types_path)
	        print("sentence level processing time: %2.2f s" % (time.time() - sen_start_time))

	        classified_dir = out_dir + "/classified_sentences"
	        os.makedirs(classified_dir, exist_ok=True)

	        # One file per category; categories with no text get no file.
	        for key, sentences in dict_sentences.items():
	            if sentences == "":
	                continue
	            with open(classified_dir + "/" + key + ".txt", "a",
	                      encoding='utf-8') as g:
	                g.write(sentences)

	        if dict_index:
	            with open(classified_dir + "/keyword_index.txt", "a",
	                      encoding='utf-8') as f:
	                for key in dict_index:
	                    f.write(key + ":" + str(dict_index[key]) + "\n")

	    print("time cost for segmentation: %2.2f s" % (time.time() - segmentation_start_time))