import os
import time
import shutil
from bs4 import BeautifulSoup
from SEM.find_subtitle import find_title_Label_with_html, find_title_Label
from SEM.get_text import write_text, removeUnneccessaryElements, makeCoarseSegments
from SEM.types_pp_processing import getSentences_with_classifier
def run_single_pp(file):
    """Segment one privacy-policy HTML file and classify its sentences.

    Pipeline:
      1. Delete and recreate the working directory ``./SEM/txt/``.
      2. Detect the HTML tag used for section headings (``find_title_Label``).
      3. Split the document into paragraph-level segments — by headings when
         any exist, otherwise by coarse segmentation — and write them to disk.
      4. Run the sentence-level classifier over ``data_types.txt`` and write
         per-category sentence files plus a keyword index.

    Parameters
    ----------
    file : str
        Path to the privacy-policy HTML file to process.

    Side effects: wipes and repopulates ``./SEM/txt/`` (results land under
    the fixed subdirectory ``1``) and prints timing info to stdout.
    Returns None.
    """
    result_root = "./SEM/txt/"
    # Start every run from a clean output tree.
    if os.path.exists(result_root):
        shutil.rmtree(result_root)
    os.makedirs(result_root)

    segmentation_start_time = time.time()
    # Results are always written under the fixed name "1" (pathName[:-5]).
    pathName = "1.html"
    out_dir = result_root + pathName[:-5]

    # Tag name (e.g. "h2") this document appears to use for section headings.
    label = find_title_Label(file)

    para_start_time = time.time()
    # BUG FIX: the original handed an open() handle straight to BeautifulSoup
    # and never closed it; a context manager releases it deterministically.
    with open(file, encoding='utf-8') as fh:
        soup = BeautifulSoup(fh, features="html.parser")
    title_list = soup.find_all(label)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    if len(title_list) == 0:
        # No recognizable headings: fall back to coarse segmentation of the
        # whole document.
        removeUnneccessaryElements(soup)
        result = makeCoarseSegments(soup)
        # Open the output file once instead of once per segment. The guard
        # preserves the original behavior of creating no file when there are
        # no segments (the existence check below depends on that).
        if result:
            with open(out_dir + '/data_types.txt', "a", encoding='utf-8') as f:
                for seg in result:
                    f.write(seg)
                    f.write("\n")
    else:
        # Headings found: write heading-delimited paragraph segments.
        write_text(title_list, pathName, soup)
    print("Paragraph level processing time: %2.2f s" % (time.time() - para_start_time))

    # Record the raw heading tags. Guarded so an empty title_list creates no
    # file, matching the original per-iteration open.
    if title_list:
        with open(out_dir + '/headings.txt', "a", encoding='utf-8') as g:
            for t in title_list:
                g.write(str(t))
                g.write("\n")

    # Sentence-level classification over the data-type segments.
    if not os.path.exists(out_dir + "/data_types.txt"):
        print("No information about data types!")
    else:
        sen_start_time = time.time()
        dict_sentences, dict_index = getSentences_with_classifier(out_dir + "/data_types.txt")
        print("sentence level processing time: %2.2f s" % (time.time() - sen_start_time))
        os.makedirs(out_dir + "/classified_sentences")
        # One file per non-empty category of classified sentences.
        for key in dict_sentences:
            if dict_sentences[key] == "":
                continue
            with open(out_dir + "/classified_sentences/" + key + ".txt", "a",
                      encoding='utf-8') as g:
                g.write(dict_sentences[key])
        # keyword -> index mapping, one "key:value" line each; opened once.
        if dict_index:
            with open(out_dir + "/classified_sentences/keyword_index.txt", "a",
                      encoding='utf-8') as f:
                for key in dict_index:
                    f.write(key + ":" + str(dict_index[key]) + "\n")
    # NOTE(review): indentation was flattened in the source; this total-time
    # print reads most naturally at function level — confirm against upstream.
    print("time cost for segmentation: %2.2f s" % (time.time() - segmentation_start_time))
|