File size: 3,068 Bytes
f1554a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c42b13
f1554a2
 
 
 
b3e6fcf
 
f1554a2
 
 
 
1c42b13
f1554a2
b3e6fcf
f1554a2
 
b3e6fcf
f1554a2
 
 
 
 
 
 
 
 
 
 
 
 
b3e6fcf
1c42b13
f1554a2
 
 
 
 
 
 
 
 
 
1c42b13
f1554a2
 
1c42b13
f1554a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c42b13
f1554a2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import time
import shutil

from bs4 import BeautifulSoup

from SEM.find_subtitle import find_title_Label_with_html, find_title_Label
from SEM.get_text import write_text, removeUnneccessaryElements, makeCoarseSegments
from SEM.types_pp_processing import getSentences_with_classifier


def run_single_pp(file):
    """Segment a single privacy-policy HTML file into headings and sentences.

    Pipeline: detect the heading tag used by the document, split the HTML
    into paragraph-level segments (falling back to coarse segmentation when
    no headings are found), then classify sentences from the extracted
    data-type segments.  All results are written under ``./SEM/txt/1/``.

    Args:
        file: Path to the privacy-policy HTML file to process.

    Side effects:
        Deletes and recreates the ``./SEM/txt/`` output tree, writes
        ``data_types.txt``, ``headings.txt`` and a ``classified_sentences/``
        directory, and prints timing information to stdout.
    """
    result_root = "./SEM/txt/"

    # Start from a clean output tree on every run.
    if os.path.exists(result_root):
        shutil.rmtree(result_root)
    os.makedirs(result_root)

    segmentation_start_time = time.time()

    # Output sub-directory name is fixed: everything lands in ./SEM/txt/1/.
    pathName = "1.html"
    base = result_root + pathName[:-5]  # "./SEM/txt/1"

    # Heading tag (e.g. "h2") that this document uses for section titles.
    label = find_title_Label(file)

    para_start_time = time.time()
    # Context manager closes the HTML file deterministically (the original
    # passed an open file object to BeautifulSoup and leaked the handle).
    with open(file, encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, features="html.parser")
    title_list = soup.find_all(label)

    if not os.path.exists(base):
        os.mkdir(base)

    if len(title_list) == 0:
        # No heading tags found: fall back to coarse segmentation of the
        # cleaned document body.
        removeUnneccessaryElements(soup)
        result = makeCoarseSegments(soup)
        # Open once for all segments; only create the file when there is
        # content, so the data_types.txt existence check below still works.
        if result:
            with open(base + '/data_types.txt', "a", encoding='utf-8') as f:
                for seg in result:
                    f.write(seg)
                    f.write("\n")
    else:
        write_text(title_list, pathName, soup)
    print("Paragraph level processing time: %2.2f s" % (time.time() - para_start_time))

    # Record every detected heading; skip entirely when there are none so
    # headings.txt is not created empty (matches original behavior).
    if title_list:
        with open(base + '/headings.txt', "a", encoding='utf-8') as g:
            for t in title_list:
                g.write(str(t))
                g.write("\n")

    # data types
    if not os.path.exists(base + "/data_types.txt"):
        print("No information about data types!")
    else:
        sen_start_time = time.time()
        dict_sentences, dict_index = getSentences_with_classifier(base + "/data_types.txt")
        print("sentence level processing time: %2.2f s" % (time.time() - sen_start_time))

        os.makedirs(base + "/classified_sentences")
        # One file per non-empty classification category.
        for key in dict_sentences:
            if dict_sentences[key] == "":
                continue
            with open(base + "/classified_sentences/" + key + ".txt", "a",
                      encoding='utf-8') as g:
                g.write(dict_sentences[key])

        # Keyword index written once; only created when non-empty
        # (matches original behavior of opening inside the loop).
        if dict_index:
            with open(base + "/classified_sentences/keyword_index.txt", "a",
                      encoding='utf-8') as f:
                for key in dict_index:
                    f.write(key + ":" + str(dict_index[key]) + "\n")

    print("time cost for segmentation: %2.2f s" % (time.time() - segmentation_start_time))