Spaces:

Cpp4App
/

Cpp4App_test

Sleeping

Cpp4App_test / SEM /P1_PP_processing.py

HaochenGong

create

f1554a2 over 1 year ago

5.15 kB

	import os
	import time
	import shutil

	from bs4 import BeautifulSoup

	from find_subtitle import find_title_Label
	from get_text import write_text, write_text_without_label, removeUnneccessaryElements, makeCoarseSegments
	from types_pp_processing import caculateSim, getSentences, getSentences_no_classifier, getSentences_with_classifier
	# from children_pp_processing import process_specialGroup
	# from region_pp_processing import get_alifornia
	# from retention_pp_processing import retention_process
	# from clean_txt import cleaning_txt

	if __name__ == '__main__':
	    # Batch driver: for every privacy-policy HTML file in INPUT, split the
	    # document into segments, record its headings, and write the sentences
	    # returned by getSentences_with_classifier under
	    # ./txt/<file name minus its last 5 characters>/.
	    #
	    # The whole ./txt tree is deleted and recreated on every run.

	    # INPUT = "../dataset/privacy_policies_html/"
	    INPUT = "./pp_example/"

	    # Files that are always segmented with makeCoarseSegments, even when
	    # heading tags were found for them.
	    # NOTE(review): presumably heading-based splitting gives poor results
	    # for these documents -- confirm with the dataset owner.
	    COARSE_ONLY_FILES = {
	        '20.html', '29.html', '25.html', '8.html', '27.html', '28.html',
	    }

	    # cleaning_txt("./txt")
	    # os.mkdir("./txt")
	    if os.path.exists("./txt"):
	        shutil.rmtree("./txt")
	    os.makedirs("./txt")

	    for file in os.listdir(INPUT):

	        # time.clock() was removed in Python 3.8; perf_counter() is the
	        # documented replacement for measuring elapsed intervals.
	        segmentation_start_time = time.perf_counter()

	        pathName = os.path.basename(file)
	        if pathName == ".DS_Store":
	            continue
	        path = INPUT + pathName
	        label = find_title_Label(path)
	        print("The current file is:" + pathName)

	        # if pathName != '20.html':
	        #     continue

	        para_start_time = time.perf_counter()
	        # Close the HTML file as soon as it has been parsed instead of
	        # leaking the handle for the rest of the run.
	        with open(path, encoding='utf-8') as html_file:
	            soup = BeautifulSoup(html_file, features="html.parser")
	        title_list = soup.find_all(label)
	        # cleaning_txt()

	        # Output directory for this document. The [:-5] slice strips a
	        # ".html" suffix; it is kept as-is because write_text() receives
	        # only pathName and presumably derives the same directory -- TODO
	        # confirm before switching to os.path.splitext.
	        doc_dir = './txt/' + pathName[:-5]
	        os.makedirs(doc_dir, exist_ok=True)

	        if not title_list or pathName in COARSE_ONLY_FILES:
	            # write_text_without_label(soup.getText(), pathName)
	            removeUnneccessaryElements(soup)
	            segments = list(makeCoarseSegments(soup))
	            # Only create data_types.txt when there is something to write:
	            # its existence is what triggers sentence-level processing
	            # further down.
	            if segments:
	                with open(doc_dir + '/data_types.txt', "a", encoding='utf-8') as f:
	                    f.writelines(seg + "\n" for seg in segments)
	        else:
	            write_text(title_list, pathName)
	        print("Paragraph level processing time: %2.2f s" % (time.perf_counter() - para_start_time))

	        # One heading tag per line; no file is created when no headings
	        # were found.
	        if title_list:
	            with open(doc_dir + '/headings.txt', "a", encoding='utf-8') as g:
	                g.writelines(str(t) + "\n" for t in title_list)

	        # data types
	        if not os.path.exists(doc_dir + "/data_types.txt"):
	            print("No information about data types!")
	        else:
	            sen_start_time = time.perf_counter()
	            # all_types = caculateSim("./txt/"+pathName[:-5]+"/data_types.txt")
	            dict_sentences, dict_index = getSentences_with_classifier(doc_dir + "/data_types.txt")
	            print("sentence level processing time: %2.2f s" % (time.perf_counter() - sen_start_time))

	            classified_dir = doc_dir + "/classified_sentences"
	            os.makedirs(classified_dir, exist_ok=True)

	            # One file per key; keys whose text is empty are skipped.
	            for key, sentences in dict_sentences.items():
	                if sentences == "":
	                    continue
	                with open(classified_dir + "/" + key + ".txt", "a", encoding='utf-8') as g:
	                    g.write(sentences)

	            # "<key>:<value>" lines; the file is only created when there
	            # is at least one entry.
	            if dict_index:
	                with open(classified_dir + "/keyword_index.txt", "a", encoding='utf-8') as f:
	                    f.writelines(
	                        key + ":" + str(value) + "\n"
	                        for key, value in dict_index.items()
	                    )

	        # Disabled analyses (children / region / retention), kept for
	        # reference together with their commented-out imports.
	        # #children
	        # if not os.path.exists("./txt/"+pathName[:-5]+"/children.txt"):
	        #     print("No information about children!")
	        # else:
	        #     age , rule, childUse, specialGroup = process_specialGroup("./txt/"+pathName[:-5]+"/children.txt")
	        #     # print("children age is :")
	        #     print("D.CHILDREN.age : " + str(age))
	        #     if childUse == 1:
	        #         print(" the skill’s privacy policy states that it does not collect any information from children")
	        #         print("D.CHILDREN.[CTypes] = [ ]")
	        #     else:
	        #         # print("D.CHILDREN.[CTypes] :" + str(all_types))
	        #         None
	        # #region
	        # if not os.path.exists("./txt/"+pathName[:-5]+"/region.txt"):
	        #     print("No information about region!")
	        # else:
	        #     specialArea,california = get_alifornia("./txt/"+pathName[:-5]+"/region.txt")
	        #     if california == 1:
	        #         print("D.REGIONS.region :California")
	        #         print("D.REGIONS.delete : Yes")
	        #     else:
	        #         print("D.REGIONS.region :No mention")
	        #         print("D.REGIONS.delete : No")
	        #
	        # #retention
	        # if not os.path.exists("./txt/"+pathName[:-5]+"/data_retention.txt"):
	        #     print("No information about data retention!")
	        # else:
	        #     retention_time, text = retention_process("./txt/"+pathName[:-5]+"/data_retention.txt")
	        #     print("D.RETENTION.period :"+ retention_time)
	        # # cleaning_txt()
	        # print("-------------------------------------------------------")

	        print("time cost for segmentation: %2.2f s" % (time.perf_counter() - segmentation_start_time))