File size: 5,153 Bytes
f1554a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import time
import shutil

from bs4 import BeautifulSoup

from find_subtitle import find_title_Label
from get_text import write_text, write_text_without_label, removeUnneccessaryElements, makeCoarseSegments
from types_pp_processing import caculateSim, getSentences, getSentences_no_classifier, getSentences_with_classifier
# from children_pp_processing import process_specialGroup
# from region_pp_processing import get_alifornia
# from retention_pp_processing import retention_process
# from clean_txt import cleaning_txt

if __name__ == '__main__':
    # Driver: segment each privacy-policy HTML file into paragraph-level
    # chunks, then classify sentences about data types. Output goes under
    # ./txt/<policy-name>/.
    # INPUT = "../dataset/privacy_policies_html/"
    INPUT = "./pp_example/"
    # cleaning_txt("./txt")
    # Start from a clean output tree so results of a previous run cannot
    # leak into this one.
    if os.path.exists("./txt"):
        shutil.rmtree("./txt")
    os.makedirs("./txt")

    # Policies known to segment badly via heading labels; they are forced
    # through the coarse-segmentation fallback below.
    FORCE_COARSE = {'8.html', '20.html', '25.html', '27.html', '28.html', '29.html'}

    for file in os.listdir(INPUT):

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for wall-clock interval timing.
        segmentation_start_time = time.perf_counter()

        pathName = os.path.basename(file)
        if pathName == ".DS_Store":  # skip macOS metadata files
            continue
        path = INPUT + pathName
        label = find_title_Label(path)
        print("The current file is:" + pathName)

        # if pathName != '20.html':
        #     continue

        para_start_time = time.perf_counter()
        # Close the HTML file deterministically instead of leaking the
        # handle that BeautifulSoup(open(...)) used to hold open.
        with open(path, encoding='utf-8') as html_file:
            soup = BeautifulSoup(html_file, features="html.parser")
        title_list = soup.find_all(label)
        # cleaning_txt()

        out_dir = './txt/' + pathName[:-5]  # strip the ".html" suffix
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        if len(title_list) == 0 or pathName in FORCE_COARSE:
            # Fallback: no usable heading labels, so segment the whole
            # document text coarsely instead of by heading.
            # write_text_without_label(soup.getText(), pathName)
            removeUnneccessaryElements(soup)
            result = makeCoarseSegments(soup)
            # Open once and batch the writes instead of reopening the
            # file for every segment.
            with open(out_dir + '/data_types.txt', "a", encoding='utf-8') as f:
                for seg in result:
                    f.write(seg)
                    f.write("\n")
        else:
            write_text(title_list, pathName)
        print("Paragraph level processing time: %2.2f s" % (time.perf_counter() - para_start_time))

        # Record every heading element found for this policy (no file is
        # created when there are no headings, matching prior behavior).
        if title_list:
            with open(out_dir + '/headings.txt', "a", encoding='utf-8') as g:
                for t in title_list:
                    g.write(str(t))
                    g.write("\n")

        # data types
        if not os.path.exists(out_dir + "/data_types.txt"):
            print("No information about data types!")
        else:
            sen_start_time = time.perf_counter()
            # all_types = caculateSim(out_dir + "/data_types.txt")
            dict_sentences, dict_index = getSentences_with_classifier(out_dir + "/data_types.txt")
            print("sentence level processing time: %2.2f s" % (time.perf_counter() - sen_start_time))

            os.makedirs(out_dir + "/classified_sentences")
            for key in dict_sentences:
                if dict_sentences[key] == "":
                    continue  # nothing classified under this category
                with open(out_dir + "/classified_sentences/" + key + ".txt", "a", encoding='utf-8') as g:
                    g.write(dict_sentences[key])

            # One file handle for the whole keyword index, not one per key.
            if dict_index:
                with open(out_dir + "/classified_sentences/keyword_index.txt", "a", encoding='utf-8') as f:
                    for key in dict_index:
                        f.write(key + ":" + str(dict_index[key]) + "\n")


        # #children
        # if not os.path.exists("./txt/"+pathName[:-5]+"/children.txt"):
        #     print("No information about children!")
        # else:
        #     age , rule, childUse, specialGroup = process_specialGroup("./txt/"+pathName[:-5]+"/children.txt")
        #     # print("children age is :")
        #     print("D.CHILDREN.age : " + str(age))
        #     if childUse == 1:
        #         print(" the skill’s privacy policy states that it does not collect any information from children")
        #         print("D.CHILDREN.[CTypes] = [ ]")
        #     else:
        #         # print("D.CHILDREN.[CTypes] :" + str(all_types))
        #         None
        # #region
        # if not os.path.exists("./txt/"+pathName[:-5]+"/region.txt"):
        #     print("No information about region!")
        # else:
        #     specialArea,california = get_alifornia("./txt/"+pathName[:-5]+"/region.txt")
        #     if california == 1:
        #         print("D.REGIONS.region :California")
        #         print("D.REGIONS.delete : Yes")
        #     else:
        #         print("D.REGIONS.region :No mention")
        #         print("D.REGIONS.delete : No")
        #
        # #retention
        # if not os.path.exists("./txt/"+pathName[:-5]+"/data_retention.txt"):
        #     print("No information about data retention!")
        # else:
        #     retention_time, text = retention_process("./txt/"+pathName[:-5]+"/data_retention.txt")
        #     print("D.RETENTION.period :"+ retention_time)
        #     # cleaning_txt()
        #     print("-------------------------------------------------------")

        print("time cost for segmentation: %2.2f s" % (time.perf_counter() - segmentation_start_time))