File size: 5,153 Bytes
f1554a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import time
import shutil

from bs4 import BeautifulSoup

from find_subtitle import find_title_Label
from get_text import write_text, write_text_without_label, removeUnneccessaryElements, makeCoarseSegments
from types_pp_processing import caculateSim, getSentences, getSentences_no_classifier, getSentences_with_classifier
# from children_pp_processing import process_specialGroup
# from region_pp_processing import get_alifornia
# from retention_pp_processing import retention_process
# from clean_txt import cleaning_txt

if __name__ == '__main__':
    # Driver: segment each privacy-policy HTML file into paragraph-level
    # chunks, then classify sentences about data types. Output goes under
    # ./txt/<policy-name>/.
    # INPUT = "../dataset/privacy_policies_html/"
    INPUT = "./pp_example/"
    # cleaning_txt("./txt")
    # Start from a clean output tree so results of a previous run cannot
    # leak into this one.
    if os.path.exists("./txt"):
        shutil.rmtree("./txt")
    os.makedirs("./txt")

    # Policies known to segment badly via heading labels; they are forced
    # through the coarse-segmentation fallback below.
    FORCE_COARSE = {'8.html', '20.html', '25.html', '27.html', '28.html', '29.html'}

    for file in os.listdir(INPUT):

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for wall-clock interval timing.
        segmentation_start_time = time.perf_counter()

        pathName = os.path.basename(file)
        if pathName == ".DS_Store":  # skip macOS metadata files
            continue
        path = INPUT + pathName
        label = find_title_Label(path)
        print("The current file is:" + pathName)

        # if pathName != '20.html':
        #     continue

        para_start_time = time.perf_counter()
        # Close the HTML file deterministically instead of leaking the
        # handle that BeautifulSoup(open(...)) used to hold open.
        with open(path, encoding='utf-8') as html_file:
            soup = BeautifulSoup(html_file, features="html.parser")
        title_list = soup.find_all(label)
        # cleaning_txt()

        out_dir = './txt/' + pathName[:-5]  # strip the ".html" suffix
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        if len(title_list) == 0 or pathName in FORCE_COARSE:
            # Fallback: no usable heading labels, so segment the whole
            # document text coarsely instead of by heading.
            # write_text_without_label(soup.getText(), pathName)
            removeUnneccessaryElements(soup)
            result = makeCoarseSegments(soup)
            # Open once and batch the writes instead of reopening the
            # file for every segment.
            with open(out_dir + '/data_types.txt', "a", encoding='utf-8') as f:
                for seg in result:
                    f.write(seg)
                    f.write("\n")
        else:
            write_text(title_list, pathName)
        print("Paragraph level processing time: %2.2f s" % (time.perf_counter() - para_start_time))

        # Record every heading element found for this policy (no file is
        # created when there are no headings, matching prior behavior).
        if title_list:
            with open(out_dir + '/headings.txt', "a", encoding='utf-8') as g:
                for t in title_list:
                    g.write(str(t))
                    g.write("\n")

        # data types
        if not os.path.exists(out_dir + "/data_types.txt"):
            print("No information about data types!")
        else:
            sen_start_time = time.perf_counter()
            # all_types = caculateSim(out_dir + "/data_types.txt")
            dict_sentences, dict_index = getSentences_with_classifier(out_dir + "/data_types.txt")
            print("sentence level processing time: %2.2f s" % (time.perf_counter() - sen_start_time))

            os.makedirs(out_dir + "/classified_sentences")
            for key in dict_sentences:
                if dict_sentences[key] == "":
                    continue  # nothing classified under this category
                with open(out_dir + "/classified_sentences/" + key + ".txt", "a", encoding='utf-8') as g:
                    g.write(dict_sentences[key])

            # One file handle for the whole keyword index, not one per key.
            if dict_index:
                with open(out_dir + "/classified_sentences/keyword_index.txt", "a", encoding='utf-8') as f:
                    for key in dict_index:
                        f.write(key + ":" + str(dict_index[key]) + "\n")


        # #children
        # if not os.path.exists("./txt/"+pathName[:-5]+"/children.txt"):
        #     print("No information about children!")
        # else:
        #     age , rule, childUse, specialGroup = process_specialGroup("./txt/"+pathName[:-5]+"/children.txt")
        #     # print("children age is :")
        #     print("D.CHILDREN.age : " + str(age))
        #     if childUse == 1:
        #         print(" the skill’s privacy policy states that it does not collect any information from children")
        #         print("D.CHILDREN.[CTypes] = [ ]")
        #     else:
        #         # print("D.CHILDREN.[CTypes] :" + str(all_types))
        #         None
        # #region
        # if not os.path.exists("./txt/"+pathName[:-5]+"/region.txt"):
        #     print("No information about region!")
        # else:
        #     specialArea,california = get_alifornia("./txt/"+pathName[:-5]+"/region.txt")
        #     if california == 1:
        #         print("D.REGIONS.region :California")
        #         print("D.REGIONS.delete : Yes")
        #     else:
        #         print("D.REGIONS.region :No mention")
        #         print("D.REGIONS.delete : No")
        #
        # #retention
        # if not os.path.exists("./txt/"+pathName[:-5]+"/data_retention.txt"):
        #     print("No information about data retention!")
        # else:
        #     retention_time, text = retention_process("./txt/"+pathName[:-5]+"/data_retention.txt")
        #     print("D.RETENTION.period :"+ retention_time)
        #     # cleaning_txt()
        #     print("-------------------------------------------------------")

        print("time cost for segmentation: %2.2f s" % (time.perf_counter() - segmentation_start_time))