diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.idea/Cpp4App_test.iml b/.idea/Cpp4App_test.iml new file mode 100644 index 0000000000000000000000000000000000000000..f8df9dd0e2c18074c2778c2a5a9b44f35e2f0d13 --- /dev/null +++ b/.idea/Cpp4App_test.iml @@ -0,0 +1,10 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000000000000000000000000000000000000..be1b0f0b9a64196709563287a7de2eb78ce66553 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,16 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..e7aa3a7008889808be605075901ee5c9adf41cc8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..fcc32e6057be840e35118eb9a143983f7f1f9d51 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..35eb1ddfbbc029bcab630581847471d7f238ec53 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 
0000000000000000000000000000000000000000..175e1e26a3777e61e5f7416fa5e2e042741ec837 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + 1723386888312 + + + + + + \ No newline at end of file diff --git a/CDM/.idea/.gitignore b/CDM/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..26d33521af10bcc7fd8cea344038eaaeb78d0ef5 --- /dev/null +++ b/CDM/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/CDM/.idea/UIED.iml b/CDM/.idea/UIED.iml new file mode 100644 index 0000000000000000000000000000000000000000..6801817b882b16f494e83eca32cfd855f12b7831 --- /dev/null +++ b/CDM/.idea/UIED.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/CDM/.idea/inspectionProfiles/Project_Default.xml b/CDM/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000000000000000000000000000000000000..4fa1f1b08b265dd2f6d211efb7473e38897917e2 --- /dev/null +++ b/CDM/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,29 @@ + + + + \ No newline at end of file diff --git a/CDM/.idea/misc.xml b/CDM/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..a2e120dcc86d18fdcb6ccfbe56c5d0b0dcd7c04a --- /dev/null +++ b/CDM/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/CDM/.idea/modules.xml b/CDM/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..b560a6bf77a48fbc28c056f11fded48711fc7490 --- /dev/null +++ b/CDM/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/CDM/.idea/vcs.xml b/CDM/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..94a25f7f4cb416c083d265558da75d457237d671 --- /dev/null +++ b/CDM/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/CDM/LICENSE b/CDM/LICENSE new file mode 100644 index 
0000000000000000000000000000000000000000..638aa1c5c2ff56d30533ae9e2da4c91afdb2f781 --- /dev/null +++ b/CDM/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2021] [UIED mulong.xie@anu.edu.au] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/CDM/README.md b/CDM/README.md new file mode 100644 index 0000000000000000000000000000000000000000..55ccbc5a96d37a7c24f5d253e52c7c10fbc779cb --- /dev/null +++ b/CDM/README.md @@ -0,0 +1,80 @@ +# UIED - UI element detection, detecting UI elements from UI screenshots or drawnings + +This project is still ongoing and this repo may be updated irregularly, I developed a web app for the UIED in http://uied.online + +## Related Publications: +[1. UIED: a hybrid tool for GUI element detection](https://dl.acm.org/doi/10.1145/3368089.3417940) + +[2. 
Object Detection for Graphical User Interface: Old Fashioned or Deep Learning or a Combination?](https://arxiv.org/abs/2008.05132) + +>The repo has been **upgraded with Google OCR** for GUI text detection, to use the original version in our paper (using [EAST](https://github.com/argman/EAST) as text detector), check the relase [v2.3](https://github.com/MulongXie/UIED/releases/tag/v2.3) and download the pre-trained model in [this link](https://drive.google.com/drive/folders/1MK0Om7Lx0wRXGDfNcyj21B0FL1T461v5?usp=sharing). + +## What is it? + +UI Element Detection (UIED) is an old-fashioned computer vision (CV) based element detection approach for graphic user interface. + +The input of UIED could be various UI image, such as mobile app or web page screenshot, UI design drawn by Photoshop or Sketch, and even some hand-drawn UI design. Then the approach detects and classifies text and graphic UI elements, and exports the detection result as JSON file for future application. + +UIED comprises two parts to detect UI text and graphic elements, such as button, image and input bar. +* For text, it leverages [Google OCR](https://cloud.google.com/vision/docs/ocr) to perfrom detection. + +* For graphical elements, it uses old-fashioned CV approaches to locate the elements and a CNN classifier to achieve classification. + +> UIED is highly customizable, you can replace both parts by your choice (e.g. other text detection approaches). Unlike black-box end-to-end deep learning approach, you can revise the algorithms in the non-text detection and merging (partially or entirely) easily to fit your task. + +![UIED Approach](https://github.com/MulongXie/UIED/blob/master/data/demo/approach.png) + +## How to use? + +### Dependency +* **Python 3.5** +* **Opencv 3.4.2** +* **Pandas** + + +### Installation + + + + +The new version of UIED equipped with Google OCR is easy to deploy and no pre-trained model is needed. Simply donwload the repo along with the dependencies. 
+ +> Please replace the Google OCR key at `detect_text/ocr.py line 28` with your own (apply in [Google website](https://cloud.google.com/vision)). + +### Usage +To test your own image(s): +* To test single image, change *input_path_img* in ``run_single.py`` to your input image and the results will be output to *output_root*. +* To test mutiple images, change *input_img_root* in ``run_batch.py`` to your input directory and the results will be output to *output_root*. +* To adjust the parameters lively, using ``run_testing.py`` + +> Note: The best set of parameters vary for different types of GUI image (Mobile App, Web, PC). I highly recommend to first play with the ``run_testing.py`` to pick a good set of parameters for your data. + +## Folder structure +``cnn/`` +* Used to train classifier for graphic UI elements +* Set path of the CNN classification model + +``config/`` +* Set data paths +* Set parameters for graphic elements detection + +``data/`` +* Input UI images and output detection results + +``detect_compo/`` +* Non-text GUI component detection + +``detect_text/`` +* GUI text detection using Google OCR + +``detect_merge/`` +* Merge the detection results of non-text and text GUI elements + +The major detection algorithms are in ``detect_compo/``, ``detect_text/`` and ``detect_merge/`` + +## Demo +GUI element detection result for web screenshot + +![UI Components detection result](https://github.com/MulongXie/UIED/blob/master/data/demo/demo.png) diff --git a/CDM/cnn/CNN.py b/CDM/cnn/CNN.py new file mode 100644 index 0000000000000000000000000000000000000000..fb365ff6635f9978978c24bf8963b798c6541cb4 --- /dev/null +++ b/CDM/cnn/CNN.py @@ -0,0 +1,114 @@ +import keras +from keras.applications.resnet50 import ResNet50 +from keras.models import Model,load_model +from keras.layers import Dense, Activation, Flatten, Dropout +from sklearn.metrics import confusion_matrix +import numpy as np +import cv2 + +from config.CONFIG import Config +cfg = Config() + + +class CNN: 
+ def __init__(self, classifier_type, is_load=True): + ''' + :param classifier_type: 'Text' or 'Noise' or 'Elements' + ''' + self.data = None + self.model = None + + self.classifier_type = classifier_type + + self.image_shape = (32,32,3) + self.class_number = None + self.class_map = None + self.model_path = None + self.classifier_type = classifier_type + if is_load: + self.load(classifier_type) + + def build_model(self, epoch_num, is_compile=True): + base_model = ResNet50(include_top=False, weights='imagenet', input_shape=self.image_shape) + for layer in base_model.layers: + layer.trainable = False + self.model = Flatten()(base_model.output) + self.model = Dense(128, activation='relu')(self.model) + self.model = Dropout(0.5)(self.model) + self.model = Dense(15, activation='softmax')(self.model) + + self.model = Model(inputs=base_model.input, outputs=self.model) + if is_compile: + self.model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy']) + self.model.fit(self.data.X_train, self.data.Y_train, batch_size=64, epochs=epoch_num, verbose=1, + validation_data=(self.data.X_test, self.data.Y_test)) + + def train(self, data, epoch_num=30): + self.data = data + self.build_model(epoch_num) + self.model.save(self.model_path) + print("Trained model is saved to", self.model_path) + + def load(self, classifier_type): + if classifier_type == 'Text': + self.model_path = 'E:/Mulong/Model/rico_compos/cnn-textview-2.h5' + self.class_map = ['Text', 'Non-Text'] + elif classifier_type == 'Noise': + self.model_path = 'E:/Mulong/Model/rico_compos/cnn-noise-1.h5' + self.class_map = ['Noise', 'Non-Noise'] + elif classifier_type == 'Elements': + # self.model_path = 'E:/Mulong/Model/rico_compos/resnet-ele14-19.h5' + # self.model_path = 'E:/Mulong/Model/rico_compos/resnet-ele14-28.h5' + # self.model_path = 'E:/Mulong/Model/rico_compos/resnet-ele14-45.h5' + self.model_path = cfg.CNN_PATH + self.class_map = cfg.element_class + self.image_shape = (64, 64, 3) + 
elif classifier_type == 'Image': + self.model_path = 'E:/Mulong/Model/rico_compos/cnn-image-1.h5' + self.class_map = ['Image', 'Non-Image'] + self.class_number = len(self.class_map) + self.model = load_model(self.model_path) + print('Model Loaded From', self.model_path) + + def preprocess_img(self, image): + image = cv2.resize(image, self.image_shape[:2]) + x = (image / 255).astype('float32') + x = np.array([x]) + return x + + def predict(self, imgs, compos, load=False, show=False): + """ + :type img_path: list of img path + """ + if load: + self.load(self.classifier_type) + if self.model is None: + print("*** No model loaded ***") + return + for i in range(len(imgs)): + X = self.preprocess_img(imgs[i]) + Y = self.class_map[np.argmax(self.model.predict(X))] + compos[i].category = Y + if show: + print(Y) + cv2.imshow('element', imgs[i]) + cv2.waitKey() + + def evaluate(self, data, load=True): + if load: + self.load(self.classifier_type) + X_test = data.X_test + Y_test = [np.argmax(y) for y in data.Y_test] + Y_pre = [np.argmax(y_pre) for y_pre in self.model.predict(X_test, verbose=1)] + + matrix = confusion_matrix(Y_test, Y_pre) + print(matrix) + + TP, FP, FN = 0, 0, 0 + for i in range(len(matrix)): + TP += matrix[i][i] + FP += sum(matrix[i][:]) - matrix[i][i] + FN += sum(matrix[:][i]) - matrix[i][i] + precision = TP/(TP+FP) + recall = TP / (TP+FN) + print("Precision:%.3f, Recall:%.3f" % (precision, recall)) \ No newline at end of file diff --git a/CDM/cnn/Config.py b/CDM/cnn/Config.py new file mode 100644 index 0000000000000000000000000000000000000000..7143c0b3b4ae150bae1afd3046db3e98d752f706 --- /dev/null +++ b/CDM/cnn/Config.py @@ -0,0 +1,21 @@ + +class Config: + def __init__(self): + # cnn 4 classes + # self.MODEL_PATH = 'E:/Mulong/Model/ui_compos/cnn6_icon.h5' # cnn 4 classes + # self.class_map = ['Image', 'Icon', 'Button', 'Input'] + + # resnet 14 classes + # self.DATA_PATH = "E:/Mulong/Datasets/rico/elements-14-2" + # self.MODEL_PATH = 
'E:/Mulong/Model/rico_compos/resnet-ele14.h5' + # self.class_map = ['Button', 'CheckBox', 'Chronometer', 'EditText', 'ImageButton', 'ImageView', + # 'ProgressBar', 'RadioButton', 'RatingBar', 'SeekBar', 'Spinner', 'Switch', + # 'ToggleButton', 'VideoView', 'TextView'] # ele-14 + + self.DATA_PATH = "E:\Mulong\Datasets\dataset_webpage\Components3" + + self.MODEL_PATH = 'E:/Mulong/Model/rico_compos/cnn2-textview.h5' + self.class_map = ['Text', 'Non-Text'] + + self.image_shape = (32, 32, 3) + self.class_number = len(self.class_map) diff --git a/CDM/cnn/Data.py b/CDM/cnn/Data.py new file mode 100644 index 0000000000000000000000000000000000000000..def4747977079d268176a15c9b9dab7a0118ff88 --- /dev/null +++ b/CDM/cnn/Data.py @@ -0,0 +1,69 @@ +import cv2 +import numpy as np +from os.path import join as pjoin +import glob +from tqdm import tqdm +from Config import Config + +cfg = Config() + + +class Data: + def __init__(self): + self.data_num = 0 + self.images = [] + self.labels = [] + self.X_train, self.Y_train = None, None + self.X_test, self.Y_test = None, None + + self.image_shape = cfg.image_shape + self.class_number = cfg.class_number + self.class_map = cfg.class_map + self.DATA_PATH = cfg.DATA_PATH + + def load_data(self, resize=True, shape=None, max_number=1000000): + # if customize shape + if shape is not None: + self.image_shape = shape + else: + shape = self.image_shape + + # load data + for p in glob.glob(pjoin(self.DATA_PATH, '*')): + print("*** Loading components of %s: %d ***" %(p.split('\\')[-1], int(len(glob.glob(pjoin(p, '*.png')))))) + label = self.class_map.index(p.split('\\')[-1]) # map to index of classes + for i, image_path in enumerate(tqdm(glob.glob(pjoin(p, '*.png'))[:max_number])): + image = cv2.imread(image_path) + if resize: + image = cv2.resize(image, shape[:2]) + self.images.append(image) + self.labels.append(label) + + assert len(self.images) == len(self.labels) + self.data_num = len(self.images) + print('%d Data Loaded' % self.data_num) + + 
def generate_training_data(self, train_data_ratio=0.8): + # transfer int into c dimensions one-hot array + def expand(label, class_number): + # return y : (num_class, num_samples) + y = np.eye(class_number)[label] + y = np.squeeze(y) + return y + + # reshuffle + np.random.seed(0) + self.images = np.random.permutation(self.images) + np.random.seed(0) + self.labels = np.random.permutation(self.labels) + Y = expand(self.labels, self.class_number) + + # separate dataset + cut = int(train_data_ratio * self.data_num) + self.X_train = (self.images[:cut] / 255).astype('float32') + self.X_test = (self.images[cut:] / 255).astype('float32') + self.Y_train = Y[:cut] + self.Y_test = Y[cut:] + + print('X_train:%d, Y_train:%d' % (len(self.X_train), len(self.Y_train))) + print('X_test:%d, Y_test:%d' % (len(self.X_test), len(self.Y_test))) diff --git a/CDM/config/CONFIG.py b/CDM/config/CONFIG.py new file mode 100644 index 0000000000000000000000000000000000000000..0a2af94cd1b54ad9c39576c00e4b8d853c9d5559 --- /dev/null +++ b/CDM/config/CONFIG.py @@ -0,0 +1,45 @@ +from os.path import join as pjoin +import os + + +class Config: + + def __init__(self): + # setting CNN (graphic elements) model + self.image_shape = (64, 64, 3) + # self.MODEL_PATH = 'E:\\Mulong\\Model\\UI2CODE\\cnn6_icon.h5' + # self.class_map = ['button', 'input', 'icon', 'img', 'text'] + self.CNN_PATH = 'E:/Mulong/Model/rico_compos/cnn-rico-1.h5' + self.element_class = ['Button', 'CheckBox', 'Chronometer', 'EditText', 'ImageButton', 'ImageView', + 'ProgressBar', 'RadioButton', 'RatingBar', 'SeekBar', 'Spinner', 'Switch', + 'ToggleButton', 'VideoView', 'TextView'] + self.class_number = len(self.element_class) + + # setting EAST (ocr) model + self.EAST_PATH = 'E:/Mulong/Model/East/east_icdar2015_resnet_v1_50_rbox' + + self.COLOR = {'Button': (0, 255, 0), 'CheckBox': (0, 0, 255), 'Chronometer': (255, 166, 166), + 'EditText': (255, 166, 0), + 'ImageButton': (77, 77, 255), 'ImageView': (255, 0, 166), 'ProgressBar': (166, 0, 
255), + 'RadioButton': (166, 166, 166), + 'RatingBar': (0, 166, 255), 'SeekBar': (0, 166, 10), 'Spinner': (50, 21, 255), + 'Switch': (80, 166, 66), 'ToggleButton': (0, 66, 80), 'VideoView': (88, 66, 0), + 'TextView': (169, 255, 0), 'NonText': (0,0,255), + 'Compo':(0, 0, 255), 'Text':(169, 255, 0), 'Block':(80, 166, 66)} + + def build_output_folders(self): + # setting data flow paths + self.ROOT_INPUT = "E:\\Mulong\\Datasets\\rico\\combined" + self.ROOT_OUTPUT = "E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_v3" + + self.ROOT_IMG_ORG = pjoin(self.ROOT_INPUT, "org") + self.ROOT_IP = pjoin(self.ROOT_OUTPUT, "ip") + self.ROOT_OCR = pjoin(self.ROOT_OUTPUT, "ocr") + self.ROOT_MERGE = pjoin(self.ROOT_OUTPUT, "merge") + self.ROOT_IMG_COMPONENT = pjoin(self.ROOT_OUTPUT, "components") + if not os.path.exists(self.ROOT_IP): + os.mkdir(self.ROOT_IP) + if not os.path.exists(self.ROOT_OCR): + os.mkdir(self.ROOT_OCR) + if not os.path.exists(self.ROOT_MERGE): + os.mkdir(self.ROOT_MERGE) diff --git a/CDM/config/CONFIG_UIED.py b/CDM/config/CONFIG_UIED.py new file mode 100644 index 0000000000000000000000000000000000000000..e4c85e6e71e004c3cbfbb6a1de1b5cd4b4845595 --- /dev/null +++ b/CDM/config/CONFIG_UIED.py @@ -0,0 +1,49 @@ +class Config: + + def __init__(self): + # Adjustable + # self.THRESHOLD_PRE_GRADIENT = 4 # dribbble:4 rico:4 web:1 + # self.THRESHOLD_OBJ_MIN_AREA = 55 # bottom line 55 of small circle + # self.THRESHOLD_BLOCK_GRADIENT = 5 + + # *** Frozen *** + self.THRESHOLD_REC_MIN_EVENNESS = 0.7 + self.THRESHOLD_REC_MAX_DENT_RATIO = 0.25 + self.THRESHOLD_LINE_THICKNESS = 8 + self.THRESHOLD_LINE_MIN_LENGTH = 0.95 + self.THRESHOLD_COMPO_MAX_SCALE = (0.25, 0.98) # (120/800, 422.5/450) maximum height and width ratio for a atomic compo (button) + self.THRESHOLD_TEXT_MAX_WORD_GAP = 10 + self.THRESHOLD_TEXT_MAX_HEIGHT = 0.04 # 40/800 maximum height of text + self.THRESHOLD_TOP_BOTTOM_BAR = (0.045, 0.94) # (36/800, 752/800) height ratio of top and bottom bar + 
class Config:
    """Frozen thresholds and rendering maps for the UIED detection pipeline."""

    def __init__(self):
        # *** Frozen *** shape/size thresholds (ratios are relative to an
        # 800px-height screenshot, per the original tuning notes)
        self.THRESHOLD_REC_MIN_EVENNESS = 0.7
        self.THRESHOLD_REC_MAX_DENT_RATIO = 0.25
        self.THRESHOLD_LINE_THICKNESS = 8
        self.THRESHOLD_LINE_MIN_LENGTH = 0.95
        # (120/800, 422.5/450) maximum height and width ratio for an atomic compo (button)
        self.THRESHOLD_COMPO_MAX_SCALE = (0.25, 0.98)
        self.THRESHOLD_TEXT_MAX_WORD_GAP = 10
        self.THRESHOLD_TEXT_MAX_HEIGHT = 0.04        # 40/800 maximum height of text
        self.THRESHOLD_TOP_BOTTOM_BAR = (0.045, 0.94)  # (36/800, 752/800) top/bottom bar height ratios
        self.THRESHOLD_BLOCK_MIN_HEIGHT = 0.03       # 24/800

        # Class-index (as string) -> Rico element class name
        element_names = ['Button', 'CheckBox', 'Chronometer', 'EditText', 'ImageButton',
                         'ImageView', 'ProgressBar', 'RadioButton', 'RatingBar', 'SeekBar',
                         'Spinner', 'Switch', 'ToggleButton', 'VideoView', 'TextView']
        self.CLASS_MAP = {str(idx): cls for idx, cls in enumerate(element_names)}

        # BGR drawing colour per label
        self.COLOR = {'Button': (0, 255, 0), 'CheckBox': (0, 0, 255), 'Chronometer': (255, 166, 166),
                      'EditText': (255, 166, 0),
                      'ImageButton': (77, 77, 255), 'ImageView': (255, 0, 166), 'ProgressBar': (166, 0, 255),
                      'RadioButton': (166, 166, 166),
                      'RatingBar': (0, 166, 255), 'SeekBar': (0, 166, 10), 'Spinner': (50, 21, 255),
                      'Switch': (80, 166, 66), 'ToggleButton': (0, 66, 80), 'VideoView': (88, 66, 0),
                      'TextView': (169, 255, 0),

                      'Text': (169, 255, 0), 'Non-Text': (255, 0, 166),

                      'Noise': (6, 6, 255), 'Non-Noise': (6, 255, 6),

                      'Image': (255, 6, 6), 'Non-Image': (6, 6, 255)}
# ResNet/ViT class index (as string) -> privacy data type it represents
label_dic = {'72': 'Location', '42': 'Photos', '77': 'Social media', '91': 'Voices', '6': 'Email',
             '89': 'Social media', '40': 'Location', '43': 'Phone', '82': 'Photos',
             '3': 'Contacts', '68': 'Contacts', '49': 'Profile', '56': 'Photos'}

# Privacy data type -> trigger keywords/phrases used by the non-GPT matcher
keyword_list = {'Name': ['name', 'first name', 'last name', 'full name', 'real name', 'surname', 'family name', 'given name'],
                'Birthday': ['birthday', 'date of birth', 'birth date', 'DOB', 'dob full birthday', 'birth year'],
                'Address': ['mailing address', 'physical address', 'postal address', 'billing address', 'shipping address', 'delivery address', 'residence', 'collect address', 'personal address', 'residential address'],
                'Phone': ['phone', 'phone number', 'mobile', 'mobile phone', 'mobile number', 'telephone', 'telephone number', 'call'],
                'Email': ['email', 'e-mail', 'email address', 'e-mail address'],
                'Contacts': ['contacts', 'phone-book', 'phone book', 'phonebook', 'contact list', 'phone contacts', 'address book'],
                'Location': ['location', 'locate', 'geography', 'geo', 'geo-location', 'precision location', 'nearby'],
                'Photos': ['camera', 'photo', 'scan', 'album', 'picture', 'gallery', 'photo library', 'storage', 'image', 'video', 'scanner', 'photograph'],
                'Voices': ['microphone', 'voice', 'mic', 'speech', 'talk'],
                'Financial info': ['credit card', 'pay', 'payment', 'debit card', 'mastercard', 'wallet'],
                'IP': ['IP', 'Internet Protocol', 'IP address', 'internet protocol address'],
                'Cookies': ['cookies', 'cookie'],
                'Social media': ['facebook', 'twitter', 'socialmedia', 'social media'],
                'Profile': ['profile', 'account'],
                'Gender': ['gender']}


def get_data_type(sentence, keywords, use_gpt=True):
    """Map a piece of text to one of the privacy data types in `keywords`.

    :param sentence: the text to classify
    :param keywords: dict of data type -> list of trigger phrases
    :param use_gpt: ask gpt-3.5-turbo (needs OPENAI_API_KEY) instead of the
        local keyword matcher
    :return: a key of `keywords`, or "others" when nothing matches

    FIX (keyword branch): the original split the sentence on `\\W+` and tested
    `w in words`, so multi-word phrases ('address book', 'credit card',
    'phone number') and hyphenated ones ('e-mail', 'phone-book') could never
    match. Each phrase is now matched as a whole-word substring instead.
    """
    sent_data_type = "others"

    if use_gpt:
        openai.api_key = os.environ["OPENAI_API_KEY"]

        prompt = f"Is this piece of texts \"{sentence}\" related to any following privacy information data types? Or not relevant to any of them? ONLY answer the data type or \"not relevant\". ONLY use following data type list. Data types and their Description:\n" \
                 f"Name: How a user refers to themselves," \
                 f" Birthday: A user’s birthday," \
                 f" Address: A user’s address," \
                 f" Phone: A user’s phone number," \
                 f" Email: A user’s email address," \
                 f" Contacts: A user’s contact information, or the access to the contact permission," \
                 f" Location: A user’s location information, or the access to the location permission," \
                 f" Photos: A user’s photos, videos, or the access to the camera permission," \
                 f" Voices: A user’s voices, recordings, or the access to the microphone permission," \
                 f" Financial Info: Information about a user’s financial accounts, purchases, or transactions," \
                 f" Profile: A user’s account information," \
                 f"Social Media: A user's social media information, or the access to social media accounts"

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            n=1,
            stop=None,
            temperature=0,
        )

        response_full_text = response.choices[0].message['content']
        for k in keywords.keys():
            # multi-word type names may appear in any casing inside the answer
            if k == "Financial info" or k == "Social media":
                if k.lower() in response_full_text.lower():
                    sent_data_type = k
                    break
            else:
                words = re.split(r'\W+', response_full_text.lower())
                if k.lower() in words:
                    sent_data_type = k
                    break
    else:
        lowered = sentence.lower()
        for k, phrases in keywords.items():
            # whole-word phrase search; re.escape keeps '-' etc. literal
            if any(re.search(r'\b' + re.escape(w.lower()) + r'\b', lowered) for w in phrases):
                sent_data_type = k
                break

    return sent_data_type
def get_clf_model(clf_model="ResNet18", use_gpu=False):
    """Load the pretrained icon-classification model from disk.

    :param clf_model: "ResNet18" or "ViT"
    :param use_gpu: map the weights onto 'cuda:0' instead of 'cpu'
    :return: the model, already switched to eval mode
    :raises ValueError: on an unknown `clf_model` name

    FIX: the original's else branch was a bare `None` statement, so an unknown
    name fell through to `return model` and crashed with UnboundLocalError;
    raise a clear ValueError instead.
    """
    device = 'cuda:0' if use_gpu else 'cpu'

    if clf_model == "ResNet18":
        model = models.resnet18().to(device)
        in_feature_num = model.fc.in_features
        model.fc = nn.Linear(in_feature_num, 99)
        # single-channel (grey) input instead of the stock 3-channel conv1
        model.conv1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(5, 5),
                                padding=(3, 3), stride=(2, 2), bias=False)

        PATH = "./CDM/model/model-99-resnet18.pkl"
        model.load_state_dict(torch.load(PATH, map_location=torch.device(device)))
        model.eval()
    elif clf_model == "ViT":
        # entire pickled model, not just a state dict
        model = torch.load('./CDM/model/model-99-ViT-entire.pkl', map_location=torch.device(device))
        model = model.to(device)
        model.eval()
    else:
        # replace with your own model loading here
        raise ValueError("clf_model has to be 'ResNet18' or 'ViT', got %r" % clf_model)

    return model
def compo_classification(input_img, output_root, segment_root, merge_json, output_data,
                         resize_by_height=800, clf_model="ResNet18"):
    """Classify detected GUI components, select privacy-related texts and match
    both against the app's classified privacy-policy sentences.

    :param input_img: path of the screenshot image
    :param output_root: root folder; results go to '<output_root>/classification'
    :param segment_root: per-app folders of classified policy sentences
    :param merge_json: merged detection result, {'compos': [...]}
    :param output_data: pandas DataFrame the matching rows are appended to
    :param resize_by_height: working height the screenshot is resized to
    :param clf_model: icon classifier, "ResNet18" or "ViT"
    :return: (icon-classification seconds, text-selection seconds,
              updated DataFrame, rendered board image)
    """
    # ---- split merged detections into texts and non-text compos ----
    ele_id = 0
    compos, texts, elements = [], [], []
    for compo in merge_json['compos']:
        pos = compo["position"]
        bbox = (pos['column_min'], pos['row_min'], pos['column_max'], pos['row_max'])
        if compo['class'] == 'Text':
            texts.append(Element(ele_id, bbox, 'Text', text_content=compo['text_content']))
        else:
            compos.append(Element(ele_id, bbox, compo['class']))
        ele_id += 1

    org, grey = pre.read_img(input_img, resize_by_height)
    grey = grey.astype('float32') / 255

    # ---- icon classification ----
    classification_start_time = time.process_time()

    # FIX: load the classifier ONCE; the original called get_clf_model() inside
    # the loop, reloading the model weights for every single component.
    model = get_clf_model(clf_model) if (compos and clf_model in ("ResNet18", "ViT")) else None

    for compo in compos:
        comp_grey = grey[compo.row_min:compo.row_max, compo.col_min:compo.col_max]

        if clf_model == "ResNet18":
            comp_crop = cv2.resize(comp_grey, (32, 32)).reshape(1, 1, 32, 32)
            # H/W axes swapped, as in the original pipeline — presumably the
            # model was trained on transposed clips; confirm before changing
            comp_tensor = torch.tensor(comp_crop).permute(0, 1, 3, 2)
            pred_label = model(comp_tensor)
            pred = str(np.argmax(pred_label.cpu().data.numpy(), axis=1)[0])
        elif clf_model == "ViT":
            comp_crop = cv2.resize(comp_grey, (224, 224))
            # grey clip -> (1, 3, 224, 224) batch by repeating the channel
            comp_tensor = torch.from_numpy(comp_crop).view(1, 224, 224).repeat(3, 1, 1).unsqueeze(0)
            with torch.no_grad():
                output = model(comp_tensor)
            _, predicted = torch.max(output.logits, 1)
            pred = str(predicted.cpu().numpy()[0])
        else:
            print("clf_model has to be ResNet18 or ViT")
            continue

        # keep only components whose class index maps to a privacy data type
        if pred in label_dic:
            compo.label = label_dic[pred]
            elements.append(compo)
        else:
            compo.label = pred

    time_cost_ic = time.process_time() - classification_start_time
    print("time cost for icon classification: %2.2f s" % time_cost_ic)

    # ---- privacy-related text selection ----
    text_selection_time = time.process_time()

    for this_text in texts:
        retries = 10
        for i in range(retries):
            try:
                text_label = get_data_type(this_text.text_content.lower(), keyword_list, use_gpt=False)
                break
            except openai.error.RateLimitError as e:
                if "overloaded" in str(e):
                    # Exponential backoff with jitter
                    sleep_time = 2 * (2 ** i) + random.uniform(0, 0.1)
                    time.sleep(sleep_time)
                else:
                    raise
        # NOTE(review): if every retry hits the overloaded path, text_label is
        # unbound here — same as the original; confirm acceptable.
        this_text.label = text_label

        if this_text.label != "others":
            elements.append(this_text)

    time_cost_ts = time.process_time() - text_selection_time
    print("time cost for text selection: %2.2f s" % time_cost_ts)

    # ---- render and save classification results ----
    full_size_org, full_size_grey = pre.read_img(input_img)
    ratio = full_size_org.shape[0] / org.shape[0]

    show = False
    wait_key = 0

    reassign_ids(elements)
    board = merge.show_elements(full_size_org, elements, ratio, show=show,
                                win_name='elements after merging', wait_key=wait_key, line=3)
    board_one_element = merge.show_one_element(full_size_org, elements, ratio, show=show,
                                               win_name='elements after merging', wait_key=wait_key, line=3)

    classification_root = pjoin(output_root, 'classification')

    # save all merged elements, clips and blank background
    name = input_img.replace('\\', '/').split('/')[-1][:-4]
    merge.save_elements(pjoin(classification_root, name + '.json'), elements, full_size_org.shape, ratio)
    cv2.imwrite(pjoin(classification_root, name + '.jpg'), board)

    print("len(board_one_element): ", len(board_one_element))

    for i in range(len(elements)):
        e_name = str(int(elements[i].id) + 1)
        cv2.imwrite(pjoin(classification_root + '/GUI', name + '-' + e_name + '.jpg'), board_one_element[i])

    print('[Classification Completed] Input: %s Output: %s' % (input_img, pjoin(classification_root, name + '.jpg')))

    # ---- match selected elements to privacy-policy segments ----
    index = input_img.split('/')[-1][:-4]
    app_id = str(index).split('-')[0]

    index_path = pjoin(segment_root, app_id, 'classified_sentences/keyword_index.txt')
    dict_index = {}
    if exists(index_path):
        with open(index_path, 'r') as g:
            for line in g:
                key, value = line.strip().split(':', 1)
                dict_index[key] = value

    for item in elements:
        complete_path = pjoin(segment_root, app_id, 'classified_sentences', item.label + '.txt')
        print("complete_path: ", complete_path)

        if exists(complete_path):
            with open(complete_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # Replace line breaks with spaces and strip any extra whitespace
            this_text = ' '.join(content.splitlines()).strip()

            non_empty_lines = [ln for ln in content.splitlines() if ln.strip() != ""]
            # capitalize the first letter of each sentence line
            for i in range(len(non_empty_lines)):
                if non_empty_lines[i][0].isalpha():
                    non_empty_lines[i] = non_empty_lines[i][0].upper() + non_empty_lines[i][1:]

            output_data = pd.concat([output_data, pd.DataFrame([{'screenshot': 's' + str(index), 'id': item.id + 1,
                                                                 'label': item.label, 'index': dict_index[item.label],
                                                                 'text': this_text, 'sentences': non_empty_lines}])])
        else:
            output_data = pd.concat([output_data, pd.DataFrame([{'screenshot': 's' + str(index), 'id': item.id + 1,
                                                                 'label': item.label, 'index': "None",
                                                                 'text': "No information!", 'sentences': "None"}])])

    return time_cost_ic, time_cost_ts, output_data, board
        # flags filled in by later pipeline stages
        self.uicompo_ = None        # set True when the block is treated as a top/bottom bar compo
        self.top_or_botm = None     # NOTE(review): never assigned in this file — confirm its use
        self.redundant = False      # True when the block covers most of the screen

    def block_is_uicompo(self, image_shape, max_compo_scale):
        '''
        Check the if the block is a ui component according to its relative size
        :param image_shape: (rows, columns, ...) of the full screenshot
        :param max_compo_scale: (max height ratio, max width ratio) of an atomic compo
        :return: False when the block exceeds the atomic-compo scale cap, else True
        '''
        row, column = image_shape[:2]
        # ignore atomic components: anything taller/wider than the cap is a
        # layout block rather than a single UI component
        if self.bbox.height / row > max_compo_scale[0] or self.bbox.width / column > max_compo_scale[1]:
            return False
        return True

    def block_is_top_or_bottom_bar(self, image_shape, top_bottom_height):
        '''
        Check if the block is top bar or bottom bar
        :param top_bottom_height: (top bar max height ratio, bottom bar min start ratio)
        '''
        height, width = image_shape[:2]
        (column_min, row_min, column_max, row_max) = self.bbox.put_bbox()
        # full-width strip hugging the top edge
        if column_min < 5 and row_min < 5 and \
                width - column_max < 5 and row_max < height * top_bottom_height[0]:
            self.uicompo_ = True
            return True
        # full-width strip hugging the bottom edge
        if column_min < 5 and row_min > height * top_bottom_height[1] and \
                width - column_max < 5 and height - row_max < 5:
            self.uicompo_ = True
            return True
        return False

    def block_erase_from_bin(self, binary, pad):
        '''
        Black out this block's (padded) bounding box in the binary map, in place.
        :param binary: binary map of the original image (modified in place)
        :param pad: pixels to expand the bounding box on every side
        '''
        # NOTE(review): `self.put_bbox()` — the other methods here use
        # `self.bbox.put_bbox()`; presumably Component forwards put_bbox,
        # otherwise this is a bug. Confirm against Component.
        (column_min, row_min, column_max, row_max) = self.put_bbox()
        column_min = max(column_min - pad, 0)
        column_max = min(column_max + pad, binary.shape[1])
        row_min = max(row_min - pad, 0)
        row_max = min(row_max + pad, binary.shape[0])
        cv2.rectangle(binary, (column_min, row_min), (column_max, row_max), (0), -1)
def block_hierarchy(blocks):
    """Fill each block's `children` with the indices of blocks nested in it.

    Relation codes from `compo_relation` mark the nesting direction:
    -1 puts i under j, +1 puts j under i (see Component.compo_relation).
    """
    for i, outer in enumerate(blocks):
        for j in range(i + 1, len(blocks)):
            relation = outer.compo_relation(blocks[j])
            if relation == -1:
                blocks[j].children.append(i)
            if relation == 1:
                outer.children.append(j)
    return


def block_bin_erase_all_blk(binary, blocks, pad=0, show=False):
    '''
    erase the block parts from the binary map
    :param binary: binary map of original image (modified in place)
    :param blocks: detected layout blocks
    :param pad: expand the bounding boxes of blocks
    :param show: display the map before/after erasing
    :return: binary map without block parts
    '''
    before = binary.copy()
    for blk in blocks:
        blk.block_erase_from_bin(binary, pad)
    if show:
        cv2.imshow('before', before)
        cv2.imshow('after', binary)
        cv2.waitKey()
def block_division(grey, org, grad_thresh,
                   show=False, write_path=None,
                   step_h=10, step_v=10,
                   line_thickness=C.THRESHOLD_LINE_THICKNESS,
                   min_rec_evenness=C.THRESHOLD_REC_MIN_EVENNESS,
                   max_dent_ratio=C.THRESHOLD_REC_MAX_DENT_RATIO,
                   min_block_height_ratio=C.THRESHOLD_BLOCK_MIN_HEIGHT):
    '''
    Detect rectangular layout blocks by flood-filling low-gradient regions.
    :param grey: grey-scale of original image
    :param org: original image (kept for parity with callers; unused here)
    :param grad_thresh: max grey-level difference tolerated within one fill
    :return: list of Block objects (regions are lists of (row, col) points)
    '''
    blocks = []
    # floodFill requires the mask to be 2px larger than the image
    mask = np.zeros((grey.shape[0]+2, grey.shape[1]+2), dtype=np.uint8)
    broad = np.zeros((grey.shape[0], grey.shape[1], 3), dtype=np.uint8)
    broad_all = broad.copy()

    row, column = grey.shape[0], grey.shape[1]
    # seed flood fills on a step_h x step_v grid, skipping filled pixels
    for x in range(0, row, step_h):
        for y in range(0, column, step_v):
            if mask[x, y] == 0:
                # flood fill algorithm to get background (layout block);
                # MASK_ONLY leaves `grey` untouched and marks the mask instead
                mask_copy = mask.copy()
                ff = cv2.floodFill(grey, mask, (y, x), None, grad_thresh, grad_thresh, cv2.FLOODFILL_MASK_ONLY)
                # ignore small regions (fewer than 500 filled pixels)
                if ff[0] < 500: continue
                # pixels newly filled by THIS call = mask difference
                mask_copy = mask - mask_copy
                region = np.reshape(cv2.findNonZero(mask_copy[1:-1, 1:-1]), (-1, 2))
                region = [(p[1], p[0]) for p in region]  # (col,row) -> (row,col)

                block = Block(region, grey.shape)
                if block.height < 30:
                    continue

                # near-full-screen fills are background, not layout blocks
                if block.area / (row * column) > 0.9:
                    continue
                elif block.area / (row * column) > 0.7:
                    block.redundant = True

                # get the boundary of this region
                # ignore lines
                if block.compo_is_line(line_thickness):
                    continue
                # ignore non-rectangle as blocks must be rectangular
                if not block.compo_is_rectangle(min_rec_evenness, max_dent_ratio):
                    continue
                blocks.append(block)
    if show:
        cv2.imshow('flood-fill all', broad_all)
        cv2.imshow('block', broad)
        cv2.waitKey()
    if write_path is not None:
        cv2.imwrite(write_path, broad)
    return blocks
inspection +# area = [[x, y]] # points of this area +# mark[x, y] = 255 # drawing broad +# +# while len(stack) > 0: +# point = stack.pop() +# area.append(point) +# neighbor(img, point[0], point[1], mark, stack) +# return area + + +# def line_check_perpendicular(lines_h, lines_v, max_thickness): +# """ +# lines: [line_h, line_v] +# -> line_h: horizontal {'head':(column_min, row), 'end':(column_max, row), 'thickness':int) +# -> line_v: vertical {'head':(column, row_min), 'end':(column, row_max), 'thickness':int} +# """ +# is_per_h = np.full(len(lines_h), False) +# is_per_v = np.full(len(lines_v), False) +# for i in range(len(lines_h)): +# # save the intersection point of h +# lines_h[i]['inter_point'] = set() +# h = lines_h[i] +# +# for j in range(len(lines_v)): +# # save the intersection point of v +# if 'inter_point' not in lines_v[j]: lines_v[j]['inter_point'] = set() +# v = lines_v[j] +# +# # if h is perpendicular to v in head of v +# if abs(h['head'][1]-v['head'][1]) <= max_thickness: +# if abs(h['head'][0] - v['head'][0]) <= max_thickness: +# lines_h[i]['inter_point'].add('head') +# lines_v[j]['inter_point'].add('head') +# is_per_h[i] = True +# is_per_v[j] = True +# elif abs(h['end'][0] - v['head'][0]) <= max_thickness: +# lines_h[i]['inter_point'].add('end') +# lines_v[j]['inter_point'].add('head') +# is_per_h[i] = True +# is_per_v[j] = True +# +# # if h is perpendicular to v in end of v +# elif abs(h['head'][1]-v['end'][1]) <= max_thickness: +# if abs(h['head'][0] - v['head'][0]) <= max_thickness: +# lines_h[i]['inter_point'].add('head') +# lines_v[j]['inter_point'].add('end') +# is_per_h[i] = True +# is_per_v[j] = True +# elif abs(h['end'][0] - v['head'][0]) <= max_thickness: +# lines_h[i]['inter_point'].add('end') +# lines_v[j]['inter_point'].add('end') +# is_per_h[i] = True +# is_per_v[j] = True +# per_h = [] +# per_v = [] +# for i in range(len(is_per_h)): +# if is_per_h[i]: +# lines_h[i]['inter_point'] = list(lines_h[i]['inter_point']) +# 
per_h.append(lines_h[i]) +# for i in range(len(is_per_v)): +# if is_per_v[i]: +# lines_v[i]['inter_point'] = list(lines_v[i]['inter_point']) +# per_v.append(lines_v[i]) +# return per_h, per_v + + +# def line_shrink_corners(corner, lines_h, lines_v): +# """ +# shrink the corner according to lines: +# col_min_shrink: shrink right (increase) +# col_max_shrink: shrink left (decrease) +# row_min_shrink: shrink down (increase) +# row_max_shrink: shrink up (decrease) +# :param lines_h: horizontal {'head':(column_min, row), 'end':(column_max, row), 'thickness':int) +# :param lines_v: vertical {'head':(column, row_min), 'end':(column, row_max), 'thickness':int} +# :return: shrunken corner: (top_left, bottom_right) +# """ +# (col_min, row_min), (col_max, row_max) = corner +# col_min_shrink, row_min_shrink = col_min, row_min +# col_max_shrink, row_max_shrink = col_max, row_max +# valid_frame = False +# +# for h in lines_h: +# # ignore outer border +# if len(h['inter_point']) == 2: +# valid_frame = True +# continue +# # shrink right -> col_min move to end +# if h['inter_point'][0] == 'head': +# col_min_shrink = max(h['end'][0], col_min_shrink) +# # shrink left -> col_max move to head +# elif h['inter_point'][0] == 'end': +# col_max_shrink = min(h['head'][0], col_max_shrink) +# +# for v in lines_v: +# # ignore outer border +# if len(v['inter_point']) == 2: +# valid_frame = True +# continue +# # shrink down -> row_min move to end +# if v['inter_point'][0] == 'head': +# row_min_shrink = max(v['end'][1], row_min_shrink) +# # shrink up -> row_max move to head +# elif v['inter_point'][0] == 'end': +# row_max_shrink = min(v['head'][1], row_max_shrink) +# +# # return the shrunken corner if only there is line intersecting with two other lines +# if valid_frame: +# return (col_min_shrink, row_min_shrink), (col_max_shrink, row_max_shrink) +# return corner + + +# def line_cvt_relative_position(col_min, row_min, lines_h, lines_v): +# """ +# convert the relative position of lines in the 
# check if an object is so slim
# @boundary: [border_up, border_bottom, border_left, border_right]
#   -> up, bottom: (column_index, min/max row border)
#   -> left, right: (row_index, min/max column border) detect range of each row
def clipping_by_line(boundary, boundary_rec, lines):
    '''
    Split a boundary into sub-boundaries at the given horizontal cut lines and
    append each sub-boundary to `boundary_rec`.
    :param boundary: [b_top, b_bottom, b_left, b_right] point lists (see above)
    :param boundary_rec: output list; sub-boundaries are appended in place
    :param lines: {'h': [(row_start, row_end), ...]} horizontal cut lines
    '''
    # NOTE(review): `.copy()` is shallow — the per-row point lists inside
    # boundary[2]/boundary[3] stay shared, so the clamping of b_left/b_right
    # below mutates the caller's data; confirm this is intended.
    boundary = boundary.copy()
    for orient in lines:
        # horizontal
        if orient == 'h':
            # column range of sub area
            r1, r2 = 0, 0
            for line in lines[orient]:
                # a line starting at row 0 only opens the range, no cut yet
                if line[0] == 0:
                    r1 = line[1]
                    continue
                r2 = line[0]
                # keep top/bottom border points that fall inside [r1, r2)
                b_top = []
                b_bottom = []
                for i in range(len(boundary[0])):
                    if r2 > boundary[0][i][0] >= r1:
                        b_top.append(boundary[0][i])
                for i in range(len(boundary[1])):
                    if r2 > boundary[1][i][0] >= r1:
                        b_bottom.append(boundary[1][i])

                # clamp left/right borders to the [r1, r2] band
                b_left = [x for x in boundary[2]]  # (row_index, min column border)
                for i in range(len(b_left)):
                    if b_left[i][1] < r1:
                        b_left[i][1] = r1
                b_right = [x for x in boundary[3]]  # (row_index, max column border)
                for i in range(len(b_right)):
                    if b_right[i][1] > r2:
                        b_right[i][1] = r2

                boundary_rec.append([b_top, b_bottom, b_left, b_right])
                # next band starts where this cut line ends
                r1 = line[1]
ocr_min_word_area=C.OCR_MIN_WORD_AREA, show=False): +# """ +# Remove area that full of text +# :param org: original image +# :param corners: [(top_left, bottom_right)] +# -> top_left: (column_min, row_min) +# -> bottom_right: (column_max, row_max) +# :param compo_class: classes of corners +# :param max_text_height: Too large to be text +# :param max_text_width: Too large to be text +# :param ocr_padding: Padding for clipping +# :param ocr_min_word_area: If too text area ratio is too large +# :param show: Show or not +# :return: corners without text objects +# """ +# new_corners = [] +# new_class = [] +# for i in range(len(corners)): +# corner = corners[i] +# (top_left, bottom_right) = corner +# (col_min, row_min) = top_left +# (col_max, row_max) = bottom_right +# height = row_max - row_min +# width = col_max - col_min +# # highly likely to be block or img if too large +# if height > max_text_height and width > max_text_width: +# new_corners.append(corner) +# new_class.append(compo_class[i]) +# else: +# row_min = row_min - ocr_padding if row_min - ocr_padding >= 0 else 0 +# row_max = row_max + ocr_padding if row_max + ocr_padding < org.shape[0] else org.shape[0] +# col_min = col_min - ocr_padding if col_min - ocr_padding >= 0 else 0 +# col_max = col_max + ocr_padding if col_max + ocr_padding < org.shape[1] else org.shape[1] +# # check if this area is text +# clip = org[row_min: row_max, col_min: col_max] +# if not ocr.is_text(clip, ocr_min_word_area, show=show): +# new_corners.append(corner) +# new_class.append(compo_class[i]) +# return new_corners, new_class + + +# def rm_img_in_compo(corners_img, corners_compo): +# """ +# Remove imgs in component +# """ +# corners_img_new = [] +# for img in corners_img: +# is_nested = False +# for compo in corners_compo: +# if util.corner_relation(img, compo) == -1: +# is_nested = True +# break +# if not is_nested: +# corners_img_new.append(img) +# return corners_img_new + + +# def block_or_compo(org, binary, corners, +# 
max_thickness=C.THRESHOLD_BLOCK_MAX_BORDER_THICKNESS, max_block_cross_points=C.THRESHOLD_BLOCK_MAX_CROSS_POINT, +# min_compo_w_h_ratio=C.THRESHOLD_UICOMPO_MIN_W_H_RATIO, max_compo_w_h_ratio=C.THRESHOLD_UICOMPO_MAX_W_H_RATIO, +# min_block_edge=C.THRESHOLD_BLOCK_MIN_EDGE_LENGTH): +# """ +# Check if the objects are img components or just block +# :param org: Original image +# :param binary: Binary image from pre-processing +# :param corners: [(top_left, bottom_right)] +# -> top_left: (column_min, row_min) +# -> bottom_right: (column_max, row_max) +# :param max_thickness: The max thickness of border of blocks +# :param max_block_cross_points: Ratio of point of interaction +# :return: corners of blocks and imgs +# """ +# blocks = [] +# imgs = [] +# compos = [] +# for corner in corners: +# (top_left, bottom_right) = corner +# (col_min, row_min) = top_left +# (col_max, row_max) = bottom_right +# height = row_max - row_min +# width = col_max - col_min +# +# block = False +# vacancy = [0, 0, 0, 0] +# for i in range(1, max_thickness): +# try: +# # top to bottom +# if vacancy[0] == 0 and (col_max - col_min - 2 * i) is not 0 and ( +# np.sum(binary[row_min + i, col_min + i: col_max - i]) / 255) / (col_max - col_min - 2 * i) <= max_block_cross_points: +# vacancy[0] = 1 +# # bottom to top +# if vacancy[1] == 0 and (col_max - col_min - 2 * i) is not 0 and ( +# np.sum(binary[row_max - i, col_min + i: col_max - i]) / 255) / (col_max - col_min - 2 * i) <= max_block_cross_points: +# vacancy[1] = 1 +# # left to right +# if vacancy[2] == 0 and (row_max - row_min - 2 * i) is not 0 and ( +# np.sum(binary[row_min + i: row_max - i, col_min + i]) / 255) / (row_max - row_min - 2 * i) <= max_block_cross_points: +# vacancy[2] = 1 +# # right to left +# if vacancy[3] == 0 and (row_max - row_min - 2 * i) is not 0 and ( +# np.sum(binary[row_min + i: row_max - i, col_max - i]) / 255) / (row_max - row_min - 2 * i) <= max_block_cross_points: +# vacancy[3] = 1 +# if np.sum(vacancy) == 4: +# block = 
True +# except: +# pass +# +# # too big to be UI components +# if block: +# if height > min_block_edge and width > min_block_edge: +# blocks.append(corner) +# else: +# if min_compo_w_h_ratio < width / height < max_compo_w_h_ratio: +# compos.append(corner) +# # filter out small objects +# else: +# if height > min_block_edge: +# imgs.append(corner) +# else: +# if min_compo_w_h_ratio < width / height < max_compo_w_h_ratio: +# compos.append(corner) +# return blocks, imgs, compos + + +# def compo_on_img(processing, org, binary, clf, +# compos_corner, compos_class): +# """ +# Detect potential UI components inner img; +# Only leave non-img +# """ +# pad = 2 +# for i in range(len(compos_corner)): +# if compos_class[i] != 'img': +# continue +# ((col_min, row_min), (col_max, row_max)) = compos_corner[i] +# col_min = max(col_min - pad, 0) +# col_max = min(col_max + pad, org.shape[1]) +# row_min = max(row_min - pad, 0) +# row_max = min(row_max + pad, org.shape[0]) +# area = (col_max - col_min) * (row_max - row_min) +# if area < 600: +# continue +# +# clip_org = org[row_min:row_max, col_min:col_max] +# clip_bin_inv = pre.reverse_binary(binary[row_min:row_max, col_min:col_max]) +# +# compos_boundary_new, compos_corner_new, compos_class_new = processing(clip_org, clip_bin_inv, clf) +# compos_corner_new = util.corner_cvt_relative_position(compos_corner_new, col_min, row_min) +# +# assert len(compos_corner_new) == len(compos_class_new) +# +# # only leave non-img elements +# for i in range(len(compos_corner_new)): +# ((col_min_new, row_min_new), (col_max_new, row_max_new)) = compos_corner_new[i] +# area_new = (col_max_new - col_min_new) * (row_max_new - row_min_new) +# if compos_class_new[i] != 'img' and area_new / area < 0.8: +# compos_corner.append(compos_corner_new[i]) +# compos_class.append(compos_class_new[i]) +# +# return compos_corner, compos_class + + +# def strip_img(corners_compo, compos_class, corners_img): +# """ +# Separate img from other compos +# :return: compos 
without img +# """ +# corners_compo_withuot_img = [] +# compo_class_withuot_img = [] +# for i in range(len(compos_class)): +# if compos_class[i] == 'img': +# corners_img.append(corners_compo[i]) +# else: +# corners_compo_withuot_img.append(corners_compo[i]) +# compo_class_withuot_img.append(compos_class[i]) +# return corners_compo_withuot_img, compo_class_withuot_img + + +# def merge_corner(corners, compos_class, min_selected_IoU=C.THRESHOLD_MIN_IOU, is_merge_nested_same=True): +# """ +# Calculate the Intersection over Overlap (IoU) and merge corners according to the value of IoU +# :param is_merge_nested_same: if true, merge the nested corners with same class whatever the IoU is +# :param corners: corners: [(top_left, bottom_right)] +# -> top_left: (column_min, row_min) +# -> bottom_right: (column_max, row_max) +# :return: new corners +# """ +# new_corners = [] +# new_class = [] +# for i in range(len(corners)): +# is_intersected = False +# for j in range(len(new_corners)): +# r = util.corner_relation_nms(corners[i], new_corners[j], min_selected_IoU) +# # r = util.corner_relation(corners[i], new_corners[j]) +# if is_merge_nested_same: +# if compos_class[i] == new_class[j]: +# # if corners[i] is in new_corners[j], ignore corners[i] +# if r == -1: +# is_intersected = True +# break +# # if new_corners[j] is in corners[i], replace new_corners[j] with corners[i] +# elif r == 1: +# is_intersected = True +# new_corners[j] = corners[i] +# +# # if above IoU threshold, and corners[i] is in new_corners[j], ignore corners[i] +# if r == -2: +# is_intersected = True +# break +# # if above IoU threshold, and new_corners[j] is in corners[i], replace new_corners[j] with corners[i] +# elif r == 2: +# is_intersected = True +# new_corners[j] = corners[i] +# new_class[j] = compos_class[i] +# +# # containing and too small +# elif r == -3: +# is_intersected = True +# break +# elif r == 3: +# is_intersected = True +# new_corners[j] = corners[i] +# +# # if [i] and [j] are overlapped but no 
def segment_img(org, segment_size, output_path, overlap=100):
    """
    Cut an image into vertically overlapping horizontal strips saved as
    <output_path>/<n>.png.

    Fixes the original loop (`while top < height and bottom < height`) which
    never wrote the bottom strip once `bottom` was clamped to the image height,
    and wrote nothing at all for images shorter than `segment_size`.

    :param org: image as a numpy array (rows x cols [x channels])
    :param segment_size: strip height in pixels
    :param output_path: directory the strips are written into (created if missing)
    :param overlap: number of rows shared by consecutive strips
    :raises ValueError: if overlap >= segment_size (would loop forever)
    """
    step = segment_size - overlap
    if step <= 0:
        raise ValueError('segment_size must be greater than overlap')
    os.makedirs(output_path, exist_ok=True)

    height = np.shape(org)[0]
    top = 0
    segment_no = 0
    while True:
        bottom = min(top + segment_size, height)
        cv2.imwrite(os.path.join(output_path, str(segment_no) + '.png'), org[top:bottom])
        segment_no += 1
        if bottom >= height:
            break
        top += step


def clipping(img, components, pad=0, show=False):
    """
    Clip every component's bounding box out of the original image.

    :param img: original image
    :param components: iterable of Component objects (must provide compo_clipping)
    :param pad: padding in pixels added around each bounding box
    :param show: display each clip in a window
    :return: list of clipped images
    """
    clips = []
    for component in components:
        clip = component.compo_clipping(img, pad=pad)
        clips.append(clip)
        if show:
            cv2.imshow('clipping', clip)
            cv2.waitKey()
    return clips


def dissemble_clip_img_hollow(clip_root, org, compos):
    """
    Save each component clip under <clip_root>/<category>/<id>.jpg and write a
    'bkg.png' with an alpha mask zeroed over every component region.

    NOTE(review): cv2.merge is called with a 3-channel image plus a 1-channel
    mask; verify this yields the intended BGRA output on the target OpenCV
    version.
    """
    if os.path.exists(clip_root):
        shutil.rmtree(clip_root)
    os.mkdir(clip_root)
    cls_dirs = []

    bkg = org.copy()
    hollow_out = np.ones(bkg.shape[:2], dtype=np.uint8) * 255
    for compo in compos:
        cls = compo.category
        c_root = pjoin(clip_root, cls)
        c_path = pjoin(c_root, str(compo.id) + '.jpg')
        if cls not in cls_dirs:
            os.mkdir(c_root)
            cls_dirs.append(cls)
        clip = compo.compo_clipping(org)
        cv2.imwrite(c_path, clip)

        col_min, row_min, col_max, row_max = compo.put_bbox()
        hollow_out[row_min: row_max, col_min: col_max] = 0

    bkg = cv2.merge((bkg, hollow_out))
    cv2.imwrite(os.path.join(clip_root, 'bkg.png'), bkg)


def dissemble_clip_img_fill(clip_root, org, compos, flag='most'):
    """
    Save each component clip under <clip_root>/<category>/<id>.jpg and write a
    'bkg.png' where every component region is painted over with a colour
    estimated from its surroundings.

    :param flag: 'average' -> per-channel mean of the surrounding ring,
                 'most'    -> per-channel most frequent surrounding value
    :raises ValueError: on an unknown flag (the original crashed with an
                        UnboundLocalError instead)
    """
    def average_pix_around(pad=6, offset=3):
        # per-channel mean of the pixels in a ring around the current bbox
        up = row_min - pad if row_min - pad >= 0 else 0
        left = col_min - pad if col_min - pad >= 0 else 0
        bottom = row_max + pad if row_max + pad < org.shape[0] - 1 else org.shape[0] - 1
        right = col_max + pad if col_max + pad < org.shape[1] - 1 else org.shape[1] - 1

        average = []
        for i in range(3):
            avg_up = np.average(org[up:row_min - offset, left:right, i])
            avg_bot = np.average(org[row_max + offset:bottom, left:right, i])
            avg_left = np.average(org[up:bottom, left:col_min - offset, i])
            avg_right = np.average(org[up:bottom, col_max + offset:right, i])
            average.append(int((avg_up + avg_bot + avg_left + avg_right) / 4))
        return average

    def most_pix_around(pad=6, offset=2):
        # per-channel most frequent pixel value in a ring around the bbox
        up = row_min - pad if row_min - pad >= 0 else 0
        left = col_min - pad if col_min - pad >= 0 else 0
        bottom = row_max + pad if row_max + pad < org.shape[0] - 1 else org.shape[0] - 1
        right = col_max + pad if col_max + pad < org.shape[1] - 1 else org.shape[1] - 1

        most = []
        for i in range(3):
            val = np.concatenate((org[up:row_min - offset, left:right, i].flatten(),
                                  org[row_max + offset:bottom, left:right, i].flatten(),
                                  org[up:bottom, left:col_min - offset, i].flatten(),
                                  org[up:bottom, col_max + offset:right, i].flatten()))
            most.append(int(np.argmax(np.bincount(val))))
        return most

    if os.path.exists(clip_root):
        shutil.rmtree(clip_root)
    os.mkdir(clip_root)
    cls_dirs = []

    bkg = org.copy()
    for compo in compos:
        cls = compo.category
        c_root = pjoin(clip_root, cls)
        c_path = pjoin(c_root, str(compo.id) + '.jpg')
        if cls not in cls_dirs:
            os.mkdir(c_root)
            cls_dirs.append(cls)
        clip = compo.compo_clipping(org)
        cv2.imwrite(c_path, clip)

        col_min, row_min, col_max, row_max = compo.put_bbox()
        if flag == 'average':
            color = average_pix_around()
        elif flag == 'most':
            color = most_pix_around()
        else:
            raise ValueError("flag must be 'average' or 'most', got %r" % (flag,))
        cv2.rectangle(bkg, (col_min, row_min), (col_max, row_max), color, -1)

    cv2.imwrite(os.path.join(clip_root, 'bkg.png'), bkg)
def is_text(img, min_word_area, show=False):
    """
    Decide whether a clip is predominantly text by running Tesseract over it.

    :param img: image clip (numpy array)
    :param min_word_area: minimum word-area / clip-area ratio to count as text
    :param show: visualise the detected word boxes
    :return: True / False; -1 when the OCR call itself fails.
             NOTE(review): -1 is truthy, so callers that use the result as a
             boolean treat an OCR failure like "is text" — confirm intended.
    """
    broad = img.copy()
    area_word = 0
    area_total = img.shape[0] * img.shape[1]

    try:
        # ocr text detection; image_to_data returns one TSV row per element
        data = pyt.image_to_data(img).split('\n')
    except:
        print(img.shape)
        return -1
    word = []
    for d in data[1:]:
        d = d.split()
        # Trailing TSV columns are indexed from the end; presumably
        # (..., left, top, width, height, conf, text) — TODO confirm against
        # the installed Tesseract's TSV layout.
        if d[-1] != '-1':
            # filter dashes and oversized boxes (height < 50, width < 100)
            if d[-1] != '-' and d[-1] != '—' and int(d[-3]) < 50 and int(d[-4]) < 100:
                word.append(d)
                t_l = (int(d[-6]), int(d[-5]))
                b_r = (int(d[-6]) + int(d[-4]), int(d[-5]) + int(d[-3]))
                area_word += int(d[-4]) * int(d[-3])
                cv2.rectangle(broad, t_l, b_r, (0,0,255), 1)

    if show:
        for d in word: print(d)
        print(area_word/area_total)
        cv2.imshow('a', broad)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    # no text in this clip or relatively small text area
    if len(word) == 0 or area_word/area_total < min_word_area:
        return False
    return True


def text_detection(org, img_clean):
    """
    Detect word bounding boxes on the cleaned image with Tesseract.

    :param org: original image (only used in the failure return value)
    :param img_clean: pre-processed image passed to the OCR engine
    :return: list of ((col_min, row_min), (col_max, row_max)) word corners.
             NOTE(review): on OCR failure this returns the tuple (org, None)
             instead of a list — inconsistent with the success path; check how
             callers handle it before changing.
    """
    try:
        data = pyt.image_to_data(img_clean).split('\n')
    except:
        return org, None
    corners_word = []
    for d in data[1:]:
        d = d.split()
        if d[-1] != '-1':
            # keep word-sized boxes only: 5 < height < 40, 5 < width < 100
            if d[-1] != '-' and d[-1] != '—' and 5 < int(d[-3]) < 40 and 5 < int(d[-4]) < 100:
                t_l = (int(d[-6]), int(d[-5]))
                b_r = (int(d[-6]) + int(d[-4]), int(d[-5]) + int(d[-3]))
                corners_word.append((t_l, b_r))
    return corners_word
def nesting_inspection(org, grey, compos, ffl_block):
    '''
    Inspect all big compos through block division by flood-fill.
    :param org: original (colour) image, forwarded to the nested detector
    :param grey: greyscale image that the clips are cut from
    :param compos: detected components. NOTE: an entry is replaced in place
                   when a redundant nested component is found.
    :param ffl_block: gradient threshold for flood-fill
    :return: list of newly found nested components
    '''
    nesting_compos = []
    for i, compo in enumerate(compos):
        # only components taller than 50 px are worth inspecting
        if compo.height > 50:
            replace = False
            clip_grey = compo.compo_clipping(grey)
            n_compos = det.nested_components_detection(clip_grey, org, grad_thresh=ffl_block, show=False)
            # nested boxes are relative to the clip; shift into image coordinates
            Compo.cvt_compos_relative_pos(n_compos, compo.bbox.col_min, compo.bbox.row_min)

            for n_compo in n_compos:
                if n_compo.redundant:
                    # the nested compo supersedes its parent
                    compos[i] = n_compo
                    replace = True
                    break
            if not replace:
                nesting_compos += n_compos
    return nesting_compos
def compo_detection(input_img_path, output_root, uied_params,
                    resize_by_height=800, classifier=None, show=False, wai_key=0):
    """
    Detect UI components in a screenshot and save them as a JSON of boxes.

    :param input_img_path: path of the input screenshot
    :param output_root: results are written to <output_root>/ip/<name>.{jpg,json}
    :param uied_params: dict with keys 'min-grad', 'min-ele-area',
                        'merge-contained-ele', 'ffl-block'
    :param resize_by_height: working height the image is resized to
    :param classifier: kept for interface compatibility (classification steps
                       are currently disabled)
    :param show: show intermediate visualisations
    :param wai_key: cv2.waitKey delay used by the visualisation helpers
    :return: CPU time in seconds spent on detection
    """
    import os  # local import: only needed for filename parsing below

    start = time.process_time()
    # Image name without directory or extension. Normalising '\' to '/' first
    # makes this work for both separator styles; splitext handles any
    # extension length (the old `[:-4]` slice broke on '.jpeg' and on paths
    # without an extension).
    name = os.path.splitext(os.path.basename(input_img_path.replace('\\', '/')))[0]
    ip_root = file.build_directory(pjoin(output_root, "ip"))

    # *** Step 1 *** pre-processing: read img -> get binary map
    org, grey = pre.read_img(input_img_path, resize_by_height)
    binary = pre.binarization(org, grad_min=int(uied_params['min-grad']))

    # full-resolution copy used only for drawing the final result
    full_size_org, _ = pre.read_img(input_img_path)
    ratio = full_size_org.shape[0] / org.shape[0]

    # *** Step 2 *** element detection
    det.rm_line(binary, show=show, wait_key=wai_key)
    uicompos = det.component_detection(binary, min_obj_area=int(uied_params['min-ele-area']))

    # *** Step 3 *** results refinement
    uicompos = det.compo_filter(uicompos, min_area=int(uied_params['min-ele-area']), img_shape=binary.shape)
    uicompos = det.merge_intersected_compos(uicompos)
    det.compo_block_recognition(binary, uicompos)
    if uied_params['merge-contained-ele']:
        uicompos = det.rm_contained_compos_not_in_block(uicompos)
    Compo.compos_update(uicompos, org.shape)
    Compo.compos_containment(uicompos)

    # *** Step 4 *** nesting inspection: check if big compos have nested elements
    uicompos += nesting_inspection(org, grey, uicompos, ffl_block=uied_params['ffl-block'])
    Compo.compos_update(uicompos, org.shape)
    draw.draw_bounding_box(full_size_org, ratio, uicompos, show=show, name='merged compo',
                           write_path=pjoin(ip_root, name + '.jpg'), wait_key=wai_key)

    # (An experimental ResNet-based icon/privacy classification step and the
    # classifier-driven steps 5-6 were removed here as dead, commented-out
    # code; `classifier` is retained in the signature for compatibility.)

    # *** Step 7 *** save detection result
    Compo.compos_update(uicompos, org.shape)
    file.save_corners_json(pjoin(ip_root, name + '.json'), uicompos)

    cd_time = time.process_time() - start
    print("[Compo Detection Completed in %.3f s] Input: %s Output: %s"
          % (cd_time, input_img_path, pjoin(ip_root, name + '.json')))
    return cd_time
class Bbox:
    """Axis-aligned bounding box in image coordinates (column = x, row = y)."""

    def __init__(self, col_min, row_min, col_max, row_max):
        self.col_min = col_min
        self.row_min = row_min
        self.col_max = col_max
        self.row_max = row_max
        # cached geometry; refresh via bbox_cal_area() after mutating corners
        self.width = col_max - col_min
        self.height = row_max - row_min
        self.box_area = self.width * self.height

    def put_bbox(self):
        """Return the box as a (col_min, row_min, col_max, row_max) tuple."""
        return self.col_min, self.row_min, self.col_max, self.row_max

    def bbox_cal_area(self):
        """Recompute, cache and return the box area from width and height."""
        self.box_area = self.width * self.height
        return self.box_area

    def bbox_relation(self, bbox_b):
        """
        Strict geometric relation between self (a) and bbox_b (b).
        :return: -1 : a in b
                  0 : a, b are not intersected
                  1 : b in a
                  2 : a, b are identical or intersected
        """
        a_c0, a_r0, a_c1, a_r1 = self.put_bbox()
        b_c0, b_r0, b_c1, b_r1 = bbox_b.put_bbox()
        if a_c0 > b_c0 and a_r0 > b_r0 and a_c1 < b_c1 and a_r1 < b_r1:
            return -1  # a strictly inside b
        if b_c0 > a_c0 and b_r0 > a_r0 and b_c1 < a_c1 and b_r1 < a_r1:
            return 1   # b strictly inside a
        if a_c0 > b_c1 or a_r0 > b_r1 or b_c0 > a_c1 or b_r0 > a_r1:
            return 0   # disjoint
        return 2       # overlapping or identical

    def bbox_relation_nms(self, bbox_b, bias=(0, 0)):
        """
        Overlap-based relation (IoU style) used for non-maximum suppression.
        :param bias: (col, row) tolerance expanding the intersection window
        :return: -1 : a in b
                  0 : a, b are not intersected
                  1 : b in a
                  2 : a, b are intersected
        """
        a_c0, a_r0, a_c1, a_r1 = self.put_bbox()
        b_c0, b_r0, b_c1, b_r1 = bbox_b.put_bbox()
        bias_col, bias_row = bias

        # intersection window, expanded by the bias on every side
        inter_c0 = max(a_c0 - bias_col, b_c0 - bias_col)
        inter_r0 = max(a_r0 - bias_row, b_r0 - bias_row)
        inter_c1 = min(a_c1 + bias_col, b_c1 + bias_col)
        inter_r1 = min(a_r1 + bias_row, b_r1 + bias_row)
        inter = max(0, inter_c1 - inter_c0) * max(0, inter_r1 - inter_r0)

        area_a = (a_c1 - a_c0) * (a_r1 - a_r0)
        area_b = (b_c1 - b_c0) * (b_r1 - b_r0)
        iou = inter / (area_a + area_b - inter)
        ioa = inter / self.box_area    # intersection over a's cached area
        iob = inter / bbox_b.box_area  # intersection over b's cached area

        if iou == 0 and ioa == 0 and iob == 0:
            return 0
        if ioa >= 1:    # a entirely covered by the window -> a in b
            return -1
        if iob >= 1:    # b entirely covered -> b in a
            return 1
        if iou >= 0.02 or iob > 0.2 or ioa > 0.2:
            return 2
        return 0

    def bbox_cvt_relative_position(self, col_min_base, row_min_base):
        """Translate the box by the top-left corner of its parent clip."""
        self.col_min += col_min_base
        self.col_max += col_min_base
        self.row_min += row_min_base
        self.row_max += row_min_base

    def bbox_merge(self, bbox_b):
        """Return a new Bbox spanning both self and bbox_b (union hull)."""
        a = self.put_bbox()
        b = bbox_b.put_bbox()
        return Bbox(min(a[0], b[0]), min(a[1], b[1]),
                    max(a[2], b[2]), max(a[3], b[3]))

    def bbox_padding(self, image_shape, pad):
        """Grow the box by pad on each side, clamped to image_shape (rows, cols)."""
        rows, cols = image_shape[:2]
        self.col_min = max(self.col_min - pad, 0)
        self.col_max = min(self.col_max + pad, cols)
        self.row_min = max(self.row_min - pad, 0)
        self.row_max = min(self.row_max + pad, rows)
def cvt_compos_relative_pos(compos, col_min_base, row_min_base):
    # Shift every component's bbox by the (col, row) offset of its parent clip.
    for compo in compos:
        compo.compo_relative_position(col_min_base, row_min_base)


def compos_containment(compos):
    # Record pairwise containment: when compo j lies inside compo i, j's index
    # goes into i's 'contain' list (and symmetrically).
    for i in range(len(compos) - 1):
        for j in range(i + 1, len(compos)):
            relation = compos[i].compo_relation(compos[j])
            if relation == -1:
                compos[j].contain.append(i)
            if relation == 1:
                compos[i].contain.append(j)


def compos_update(compos, org_shape):
    # Re-number components sequentially and refresh their cached geometry.
    for i, compo in enumerate(compos):
        # start from 1, id 0 is background
        compo.compo_update(i + 1, org_shape)


class Component:
    """
    A connected pixel region detected in the binary map, with its traced
    boundary, bounding box and the bookkeeping used by the detection pipeline.
    """

    def __init__(self, region, image_shape):
        # region: iterable of (row, column) pixel coordinates of the region
        self.id = None
        self.region = region
        self.boundary = self.compo_get_boundary()
        self.bbox = self.compo_get_bbox()
        self.bbox_area = self.bbox.box_area

        self.region_area = len(region)
        self.width = len(self.boundary[0])
        self.height = len(self.boundary[2])
        self.image_shape = image_shape
        self.area = self.width * self.height

        self.category = 'Compo'
        self.contain = []  # indices of compos contained in this one (see compos_containment)

        self.rect_ = None       # cached result of compo_is_rectangle
        self.line_ = None       # cached result of compo_is_line
        self.redundant = False  # set elsewhere when this compo duplicates its parent

    def compo_update(self, id, org_shape):
        # Refresh the id, the image shape and the geometry cached from the bbox.
        self.id = id
        self.image_shape = org_shape
        self.width = self.bbox.width
        self.height = self.bbox.height
        self.bbox_area = self.bbox.box_area
        self.area = self.width * self.height

    def put_bbox(self):
        # Convenience pass-through to the Bbox tuple form.
        return self.bbox.put_bbox()

    def compo_update_bbox_area(self):
        # Recompute the cached bbox area after the bbox corners changed.
        self.bbox_area = self.bbox.bbox_cal_area()

    def compo_get_boundary(self):
        '''
        get the bounding boundary of an object(region)
        boundary: [top, bottom, left, right]
        -> up, bottom: (column_index, min/max row border)
        -> left, right: (row_index, min/max column border) detect range of each row
        '''
        border_up, border_bottom, border_left, border_right = {}, {}, {}, {}
        for point in self.region:
            # point: (row_index, column_index)
            # up, bottom: (column_index, min/max row border) detect range of each column
            if point[1] not in border_up or border_up[point[1]] > point[0]:
                border_up[point[1]] = point[0]
            if point[1] not in border_bottom or border_bottom[point[1]] < point[0]:
                border_bottom[point[1]] = point[0]
            # left, right: (row_index, min/max column border) detect range of each row
            if point[0] not in border_left or border_left[point[0]] > point[1]:
                border_left[point[0]] = point[1]
            if point[0] not in border_right or border_right[point[0]] < point[1]:
                border_right[point[0]] = point[1]

        boundary = [border_up, border_bottom, border_left, border_right]
        # convert each border dict to a list of [key, value] pairs sorted by key
        for i in range(len(boundary)):
            boundary[i] = [[k, boundary[i][k]] for k in boundary[i].keys()]
            boundary[i] = sorted(boundary[i], key=lambda x: x[0])
        return boundary

    def compo_get_bbox(self):
        """
        Get the top left and bottom right points of boundary
        :return: a Bbox built from the extreme boundary coordinates
            -> top_left: (column_min, row_min)
            -> bottom_right: (column_max, row_max)
        """
        col_min, row_min = (int(min(self.boundary[0][0][0], self.boundary[1][-1][0])), int(min(self.boundary[2][0][0], self.boundary[3][-1][0])))
        col_max, row_max = (int(max(self.boundary[0][0][0], self.boundary[1][-1][0])), int(max(self.boundary[2][0][0], self.boundary[3][-1][0])))
        bbox = Bbox(col_min, row_min, col_max, row_max)
        return bbox

    def compo_is_rectangle(self, min_rec_evenness, max_dent_ratio, test=False):
        '''
        detect if an object is rectangle by evenness and dent of each border
        :param min_rec_evenness: minimum ratio of flat boundary points
        :param max_dent_ratio: maximum ratio of sunken (pit) points per border
        :param test: print diagnostics and draw the boundary while checking
        :return: Boolean; also cached in self.rect_
        '''
        dent_direction = [1, -1, 1, -1]  # direction for convex

        flat = 0
        parameter = 0
        for n, border in enumerate(self.boundary):
            parameter += len(border)
            # dent detection
            pit = 0    # length of pit
            depth = 0  # the degree of surface changing
            if n <= 1:
                adj_side = max(len(self.boundary[2]), len(self.boundary[3]))  # get maximum length of adjacent side
            else:
                adj_side = max(len(self.boundary[0]), len(self.boundary[1]))

            # -> up, bottom: (column_index, min/max row border)
            # -> left, right: (row_index, min/max column border) detect range of each row
            abnm = 0
            for i in range(int(3 + len(border) * 0.02), len(border) - 1):
                # calculate gradient
                difference = border[i][1] - border[i + 1][1]
                # the degree of surface changing
                depth += difference
                # ignore noise at the start of each direction
                if i / len(border) < 0.08 and (dent_direction[n] * difference) / adj_side > 0.5:
                    depth = 0  # reset

                # if the change of the surface is too large, count it as part of abnormal change
                if abs(depth) / adj_side > 0.3:
                    abnm += 1  # count the size of the abnm
                    # if the abnm is too big, the shape should not be a rectangle
                    if abnm / len(border) > 0.1:
                        if test:
                            print('abnms', abnm, abnm / len(border))
                            draw.draw_boundary([self], self.image_shape, show=True)
                        self.rect_ = False
                        return False
                    continue
                else:
                    # reset the abnm if the depth back to normal
                    abnm = 0

                # if sunken and the surface changing is large, then counted as pit
                if dent_direction[n] * depth < 0 and abs(depth) / adj_side > 0.15:
                    pit += 1
                    continue

                # if the surface is not changing to a pit and the gradient is zero, then count it as flat
                if abs(depth) < 1 + adj_side * 0.015:
                    flat += 1
                if test:
                    print(depth, adj_side, flat)
            # if the pit is too big, the shape should not be a rectangle
            if pit / len(border) > max_dent_ratio:
                if test:
                    print('pit', pit, pit / len(border))
                    draw.draw_boundary([self], self.image_shape, show=True)
                self.rect_ = False
                return False
        if test:
            print(flat / parameter, '\n')
            draw.draw_boundary([self], self.image_shape, show=True)
        # ignore text and irregular shape
        if self.height / self.image_shape[0] > 0.3:
            min_rec_evenness = 0.85
        if (flat / parameter) < min_rec_evenness:
            self.rect_ = False
            return False
        self.rect_ = True
        return True

    def compo_is_line(self, min_line_thickness):
        """
        Check this object is line by checking its boundary
        :param min_line_thickness: maximum thickness for a boundary slice to
            still count as "slim"
        :return: Boolean; also cached in self.line_
        """
        # horizontally
        slim = 0
        for i in range(self.width):
            if abs(self.boundary[1][i][1] - self.boundary[0][i][1]) <= min_line_thickness:
                slim += 1
        if slim / len(self.boundary[0]) > 0.93:
            self.line_ = True
            return True
        # vertically
        slim = 0
        for i in range(self.height):
            if abs(self.boundary[2][i][1] - self.boundary[3][i][1]) <= min_line_thickness:
                slim += 1
        if slim / len(self.boundary[2]) > 0.93:
            self.line_ = True
            return True
        self.line_ = False
        return False

    def compo_relation(self, compo_b, bias=(0, 0)):
        """
        :return: -1 : a in b
                  0 : a, b are not intersected
                  1 : b in a
                  2 : a, b are identical or intersected
        """
        return self.bbox.bbox_relation_nms(compo_b.bbox, bias)

    def compo_relative_position(self, col_min_base, row_min_base):
        '''
        Convert to relative position based on base coordinator
        '''
        self.bbox.bbox_cvt_relative_position(col_min_base, row_min_base)

    def compo_merge(self, compo_b):
        # Union this compo's bbox with compo_b's and refresh cached geometry.
        self.bbox = self.bbox.bbox_merge(compo_b.bbox)
        self.compo_update(self.id, self.image_shape)

    def compo_clipping(self, img, pad=0, show=False):
        # Crop this component's (optionally padded) bbox out of img, clamped
        # to the image borders.
        (column_min, row_min, column_max, row_max) = self.put_bbox()
        column_min = max(column_min - pad, 0)
        column_max = min(column_max + pad, img.shape[1])
        row_min = max(row_min - pad, 0)
        row_max = min(row_max + pad, img.shape[0])
        clip = img[row_min:row_max, column_min:column_max]
        if show:
            cv2.imshow('clipping', clip)
            cv2.waitKey()
        return clip
# diff --git a/CDM/detect_compo/lib_ip/file_utils.py (new file, +80 lines)
import os
import pandas as pd
import json
from os.path import join as pjoin
import time
import cv2


def save_corners(file_path, corners, compo_name, clear=True):
    """
    Append (or rewrite) component corners to a CSV file.
    :param file_path: CSV file to read/extend/write
    :param corners: list of (up_left, bottom_right) point pairs
    :param compo_name: class name stored in the 'component' column
    :param clear: if True, drop any rows already in the file
    """
    try:
        df = pd.read_csv(file_path, index_col=0)
    except Exception:
        # file missing/unreadable -> start with an empty frame
        df = pd.DataFrame(columns=['component', 'x_max', 'x_min', 'y_max', 'y_min', 'height', 'width'])

    if clear:
        df = df.drop(df.index)
    rows = []
    for corner in corners:
        (up_left, bottom_right) = corner
        c = {'component': compo_name}
        (c['y_min'], c['x_min']) = up_left
        (c['y_max'], c['x_max']) = bottom_right
        c['width'] = c['y_max'] - c['y_min']
        c['height'] = c['x_max'] - c['x_min']
        rows.append(c)
    # DataFrame.append() was removed in pandas 2.0 — collect rows and concat once instead
    if rows:
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
    df.to_csv(file_path)


def save_corners_json(file_path, compos):
    """
    Dump all components' bounding boxes (plus the image shape) to a JSON file.
    :param file_path: output json path
    :param compos: list of Component objects (must be non-empty; image_shape is read from compos[0])
    """
    img_shape = compos[0].image_shape
    output = {'img_shape': img_shape, 'compos': []}

    for compo in compos:
        bbox = compo.put_bbox()
        c = {'id': compo.id, 'class': compo.category}
        (c['column_min'], c['row_min'], c['column_max'], c['row_max']) = bbox
        c['width'] = compo.width
        c['height'] = compo.height
        output['compos'].append(c)

    # context manager closes the handle; the original opened the file and never closed it
    with open(file_path, 'w') as f_out:
        json.dump(output, f_out, indent=4)


def save_clipping(org, output_root, corners, compo_classes, compo_index):
    """
    Crop each component out of `org` and save it as output_root/<class>/<n>.png.
    :param org: original image (numpy array)
    :param corners: list of (up_left, bottom_right) point pairs
    :param compo_classes: class name per corner (parallel to corners)
    :param compo_index: mutable dict tracking the next file index per class
    """
    if not os.path.exists(output_root):
        os.mkdir(output_root)
    pad = 2
    for i in range(len(corners)):
        compo = compo_classes[i]
        (up_left, bottom_right) = corners[i]
        (col_min, row_min) = up_left
        (col_max, row_max) = bottom_right
        # pad the clip but stay inside the image
        col_min = max(col_min - pad, 0)
        col_max = min(col_max + pad, org.shape[1])
        row_min = max(row_min - pad, 0)
        row_max = min(row_max + pad, org.shape[0])

        # if component type already exists, index increase by 1, otherwise add this type
        compo_path = pjoin(output_root, compo)
        if compo_classes[i] not in compo_index:
            compo_index[compo_classes[i]] = 0
            if not os.path.exists(compo_path):
                os.mkdir(compo_path)
        else:
            compo_index[compo_classes[i]] += 1
        clip = org[row_min:row_max, col_min:col_max]
        cv2.imwrite(pjoin(compo_path, str(compo_index[compo_classes[i]]) + '.png'), clip)


def build_directory(directory):
    """Create `directory` if it does not exist and return its path."""
    if not os.path.exists(directory):
        os.mkdir(directory)
    return directory


# diff --git a/CDM/detect_compo/lib_ip/ip_detection.py (new file, +574 lines)
import cv2
import numpy as np

import CDM.detect_compo.lib_ip.ip_draw as draw
import CDM.detect_compo.lib_ip.ip_preprocessing as pre
from CDM.detect_compo.lib_ip.Component import Component
import CDM.detect_compo.lib_ip.Component as Compo
from CDM.config.CONFIG_UIED import Config
C = Config()


def merge_intersected_corner(compos, org, is_merge_contained_ele, max_gap=(0, 0), max_ele_height=25):
    '''
    :param is_merge_contained_ele: if true, merge compos nested in others
    :param max_gap: (horizontal_distance, vertical_distance) to be merge into one line/column
    :param max_ele_height: if higher than it, recognize the compo as text
    :return: merged list of components (recurses until a fixed point is reached)
    '''
    changed = False
    new_compos = []
    Compo.compos_update(compos, org.shape)
    for i in range(len(compos)):
        merged = False
        cur_compo = compos[i]
        for j in range(len(new_compos)):
            relation = cur_compo.compo_relation(new_compos[j], max_gap)
            # print(relation)
            # draw.draw_bounding_box(org, [cur_compo, new_compos[j]], name='b-merge', show=True)
            # merge compo[i] to compo[j] if
            # 1. compo[j] contains compo[i]
            # 2. compo[j] intersects with compo[i] with certain iou
            # 3. is_merge_contained_ele and compo[j] is contained in compo[i]
            if relation == 1 or \
                    relation == 2 or \
                    (is_merge_contained_ele and relation == -1):
                # (relation == 2 and new_compos[j].height < max_ele_height and cur_compo.height < max_ele_height) or\

                new_compos[j].compo_merge(cur_compo)
                cur_compo = new_compos[j]
                # draw.draw_bounding_box(org, [new_compos[j]], name='a-merge', show=True)
                merged = True
                changed = True
                # break
        if not merged:
            new_compos.append(compos[i])

    # recurse until no merge happened in a full pass
    if not changed:
        return compos
    else:
        return merge_intersected_corner(new_compos, org, is_merge_contained_ele, max_gap, max_ele_height)


def merge_intersected_compos(compos):
    # repeatedly merge any pair of intersecting components until stable
    changed = True
    while changed:
        changed = False
        temp_set = []
        for compo_a in compos:
            merged = False
            for compo_b in temp_set:
                # relation 2 == intersected/identical
                if compo_a.compo_relation(compo_b) == 2:
                    compo_b.compo_merge(compo_a)
                    merged = True
                    changed = True
                    break
            if not merged:
                temp_set.append(compo_a)
        compos = temp_set.copy()
    return compos


def rm_contained_compos_not_in_block(compos):
    '''
    remove all components contained by others that are not Block
    '''
    marked = np.full(len(compos), False)
    for i in range(len(compos) - 1):
        for j in range(i + 1, len(compos)):
            relation = compos[i].compo_relation(compos[j])
            # -1: i in j; 1: j in i — drop the contained one unless the container is a Block
            if relation == -1 and compos[j].category != 'Block':
                marked[i] = True
            if relation == 1 and compos[i].category != 'Block':
                marked[j] = True
    new_compos = []
    for i in range(len(marked)):
        if not marked[i]:
            new_compos.append(compos[i])
    return new_compos


def merge_text(compos, org_shape, max_word_gad=4, max_word_height=20):
    # merge horizontally-adjacent, vertically-overlapping small compos into text lines
    def is_text_line(compo_a, compo_b):
        (col_min_a, row_min_a, col_max_a, row_max_a) = compo_a.put_bbox()
        (col_min_b, row_min_b, col_max_b, row_max_b) = compo_b.put_bbox()

        col_min_s = max(col_min_a, col_min_b)
        col_max_s = min(col_max_a, col_max_b)
        row_min_s = max(row_min_a, row_min_b)
        row_max_s = min(row_max_a, row_max_b)

        # on the same line
        # if abs(row_min_a - row_min_b) < max_word_gad and abs(row_max_a - row_max_b) < max_word_gad:
        if row_min_s < row_max_s:
            # close distance
            if col_min_s < col_max_s or \
                    (0 < col_min_b - col_max_a < max_word_gad) or (0 < col_min_a - col_max_b < max_word_gad):
                return True
        return False

    changed = False
    new_compos = []
    row, col = org_shape[:2]
    for i in range(len(compos)):
        merged = False
        height = compos[i].height
        # ignore non-text
        # if height / row > max_word_height_ratio\
        #         or compos[i].category != 'Text':
        if height > max_word_height:
            new_compos.append(compos[i])
            continue
        for j in range(len(new_compos)):
            # if compos[j].category != 'Text':
            #     continue
            if is_text_line(compos[i], new_compos[j]):
                new_compos[j].compo_merge(compos[i])
                merged = True
                changed = True
                break
        if not merged:
            new_compos.append(compos[i])

    # recurse until no merge happened in a full pass
    if not changed:
        return compos
    else:
        return merge_text(new_compos, org_shape)


def rm_top_or_bottom_corners(components, org_shape, top_bottom_height=C.THRESHOLD_TOP_BOTTOM_BAR):
    # drop components fully inside the top status bar / bottom navigation bar regions
    new_compos = []
    height, width = org_shape[:2]
    for compo in components:
        (column_min, row_min, column_max, row_max) = compo.put_bbox()
        # remove big ones
        # if (row_max - row_min) / height > 0.65 and (column_max - column_min) / width > 0.8:
        #     continue
        if not (row_max < height * top_bottom_height[0] or row_min > height * top_bottom_height[1]):
            new_compos.append(compo)
    return new_compos


def rm_line_v_h(binary, show=False, max_line_thickness=C.THRESHOLD_LINE_THICKNESS):
    # detect and extract long horizontal/vertical lines from the binary map
    def check_continuous_line(line, edge):
        # return [start, end] of a run covering > 60% of the edge length, else None
        continuous_length = 0
        line_start = -1
        for j, p in enumerate(line):
            if p > 0:
                if line_start == -1:
                    line_start = j
                continuous_length += 1
            elif continuous_length > 0:
                if continuous_length / edge > 0.6:
                    return [line_start, j]
                continuous_length = 0
                line_start = -1

        # run may extend to the end of the scan line
        if continuous_length / edge > 0.6:
            return [line_start, len(line)]
        else:
            return None

    def extract_line_area(line, start_idx, flag='v'):
        # copy the detected line pixels from binary into map_line (closure over both)
        for e, l in enumerate(line):
            if flag == 'v':
                map_line[start_idx + e, l[0]:l[1]] = binary[start_idx + e, l[0]:l[1]]

    map_line = np.zeros(binary.shape[:2], dtype=np.uint8)
    cv2.imshow('binary', binary)

    # horizontal pass: scan rows for long runs
    width = binary.shape[1]
    start_row = -1
    line_area = []
    for i, row in enumerate(binary):
        line_v = check_continuous_line(row, width)
        if line_v is not None:
            # new line
            if start_row == -1:
                start_row = i
                line_area = []
            line_area.append(line_v)
        else:
            # checking line
            if start_row != -1:
                if i - start_row < max_line_thickness:
                    # binary[start_row: i] = 0
                    # map_line[start_row: i] = binary[start_row: i]
                    print(line_area, start_row, i)
                    extract_line_area(line_area, start_row)
                start_row = -1

    # vertical pass: scan columns for long runs
    height = binary.shape[0]
    start_col = -1
    for i in range(width):
        col = binary[:, i]
        line_h = check_continuous_line(col, height)
        if line_h is not None:
            # new line
            if start_col == -1:
                start_col = i
        else:
            # checking line
            if start_col != -1:
                if i - start_col < max_line_thickness:
                    # binary[:, start_col: i] = 0
                    map_line[:, start_col: i] = binary[:, start_col: i]
                start_col = -1

    # remove the detected lines from the binary map in place
    binary -= map_line

    if show:
        cv2.imshow('no-line', binary)
        cv2.imshow('lines', map_line)
        cv2.waitKey()


def rm_line(binary,
            max_line_thickness=C.THRESHOLD_LINE_THICKNESS,
            min_line_length_ratio=C.THRESHOLD_LINE_MIN_LENGTH,
            show=False, wait_key=0):
    # remove thin horizontal lines (e.g. separators) from the binary map, in place
    def is_valid_line(line):
        # a valid line covers > 95% of the width with gaps no longer than 5 px
        line_length = 0
        line_gap = 0
        for j in line:
            if j > 0:
                if line_gap > 5:
                    return False
                line_length += 1
                line_gap = 0
            elif line_length > 0:
                line_gap += 1
        if line_length / width > 0.95:
            return True
        return False

    height, width = binary.shape[:2]
    board = np.zeros(binary.shape[:2], dtype=np.uint8)

    start_row, end_row = -1, -1
    check_line = False
    check_gap = False
    for i, row in enumerate(binary):
        # line_ratio = (sum(row) / 255) / width
        # if line_ratio > 0.9:
        if is_valid_line(row):
            # new start: if it is checking a new line, mark this row as start
            if not check_line:
                start_row = i
                check_line = True
        else:
            # end the line
            if check_line:
                # thin enough to be a line, then start checking gap
                if i - start_row < max_line_thickness:
                    end_row = i
                    check_gap = True
                else:
                    start_row, end_row = -1, -1
                    check_line = False
        # check gap
        if check_gap and i - end_row > max_line_thickness:
            binary[start_row: end_row] = 0
            start_row, end_row = -1, -1
            check_line = False
            check_gap = False

    # flush a pending line that touches the bottom of the image
    if (check_line and (height - start_row) < max_line_thickness) or check_gap:
        binary[start_row: end_row] = 0

    if show:
        cv2.imshow('no-line binary', binary)
        if wait_key is not None:
            cv2.waitKey(wait_key)
        if wait_key == 0:
            cv2.destroyWindow('no-line binary')


def rm_noise_compos(compos):
    # drop components classified as 'Noise'
    compos_new = []
    for compo in compos:
        if compo.category == 'Noise':
            continue
        compos_new.append(compo)
    return compos_new


def rm_noise_in_large_img(compos, org,
                          max_compo_scale=C.THRESHOLD_COMPO_MAX_SCALE):
    # drop components that are listed as contained inside an 'Image' component
    row, column = org.shape[:2]
    remain = np.full(len(compos), True)
    new_compos = []
    for compo in compos:
        if compo.category == 'Image':
            for i in compo.contain:
                remain[i] = False
    for i in range(len(remain)):
        if remain[i]:
            new_compos.append(compos[i])
    return new_compos


def detect_compos_in_img(compos, binary, org, max_compo_scale=C.THRESHOLD_COMPO_MAX_SCALE, show=False):
    # re-run component detection inside each 'Image' compo to find nested elements;
    # appends the findings to `compos` in place
    compos_new = []
    row, column = binary.shape[:2]
    for compo in compos:
        if compo.category == 'Image':
            compo.compo_update_bbox_area()
            # org_clip = compo.compo_clipping(org)
            # bin_clip = pre.binarization(org_clip, show=show)
            bin_clip = compo.compo_clipping(binary)
            bin_clip = pre.reverse_binary(bin_clip, show=show)

            # NOTE(review): component_detection declares min_obj_area without a default,
            # but this call omits it — TypeError unless a default is added there. Verify.
            compos_rec, compos_nonrec = component_detection(bin_clip, test=False, step_h=10, step_v=10, rec_detect=True)
            for compo_rec in compos_rec:
                compo_rec.compo_relative_position(compo.bbox.col_min, compo.bbox.row_min)
                if compo_rec.bbox_area / compo.bbox_area < 0.8 and compo_rec.bbox.height > 20 and compo_rec.bbox.width > 20:
                    compos_new.append(compo_rec)
                    # draw.draw_bounding_box(org, [compo_rec], show=True)

            # compos_inner = component_detection(bin_clip, rec_detect=False)
            # for compo_inner in compos_inner:
            #     compo_inner.compo_relative_position(compo.bbox.col_min, compo.bbox.row_min)
            #     draw.draw_bounding_box(org, [compo_inner], show=True)
            #     if compo_inner.bbox_area / compo.bbox_area < 0.8:
            #         compos_new.append(compo_inner)
    compos += compos_new


def compo_filter(compos, min_area, img_shape):
    # max_height = img_shape[0] * 0.8
    # compos_new = []
    # for compo in compos:
    #     if compo.area < min_area:
    #         continue
    #     if compo.height > max_height:
    #         continue
    #     ratio_h = compo.width / compo.height
    #     ratio_w = compo.height / compo.width
    #     if ratio_h > 50 or ratio_w > 40 or \
    #             (min(compo.height, compo.width) < 8 and max(ratio_h, ratio_w) > 10):
    #         continue
    #     compos_new.append(compo)
    # return compos_new

    # mobile semantics filter
    # compos_new = []
    #
    # for compo in compos:
    #
    #     if compo.area >= 0.05 * (img_shape[0] * img_shape[1]):
    #         continue
    #
    #     smaller_dimension = min(compo.width, compo.height)
    #     larger_dimension = max(compo.width, compo.height)
    #
    #     if smaller_dimension/larger_dimension <= 0.75:
    #         continue
    #
    #     compos_new.append(compo)
    #
    # return compos_new

    # my own filter
    # keep compos between 0.05% and 10% of the image area with aspect ratio >= 0.6
    # NOTE(review): the min_area parameter is unused by this active branch — confirm intent
    compos_new = []

    for compo in compos:

        if compo.area >= 0.1 * (img_shape[0] * img_shape[1]):
            continue

        if compo.area <= 0.0005 * (img_shape[0] * img_shape[1]):
            continue

        smaller_dimension = min(compo.width, compo.height)
        larger_dimension = max(compo.width, compo.height)

        if smaller_dimension / larger_dimension <= 0.6:
            continue

        compos_new.append(compo)

    return compos_new
is_block(clip, thread=0.15): + ''' + Block is a rectangle border enclosing a group of compos (consider it as a wireframe) + Check if a compo is block by checking if the inner side of its border is blank + ''' + side = 4 # scan 4 lines inner forward each border + # top border - scan top down + blank_count = 0 + for i in range(1, 5): + if sum(clip[side + i]) / 255 > thread * clip.shape[1]: + blank_count += 1 + if blank_count > 2: return False + # left border - scan left to right + blank_count = 0 + for i in range(1, 5): + if sum(clip[:, side + i]) / 255 > thread * clip.shape[0]: + blank_count += 1 + if blank_count > 2: return False + + side = -4 + # bottom border - scan bottom up + blank_count = 0 + for i in range(-1, -5, -1): + if sum(clip[side + i]) / 255 > thread * clip.shape[1]: + blank_count += 1 + if blank_count > 2: return False + # right border - scan right to left + blank_count = 0 + for i in range(-1, -5, -1): + if sum(clip[:, side + i]) / 255 > thread * clip.shape[0]: + blank_count += 1 + if blank_count > 2: return False + return True + + +def compo_block_recognition(binary, compos, block_side_length=0.15): + height, width = binary.shape + for compo in compos: + if compo.height / height > block_side_length and compo.width / width > block_side_length: + clip = compo.compo_clipping(binary) + if is_block(clip): + compo.category = 'Block' + + +# take the binary image as input +# calculate the connected regions -> get the bounding boundaries of them -> check if those regions are rectangles +# return all boundaries and boundaries of rectangles +def component_detection(binary, min_obj_area, + line_thickness=C.THRESHOLD_LINE_THICKNESS, + min_rec_evenness=C.THRESHOLD_REC_MIN_EVENNESS, + max_dent_ratio=C.THRESHOLD_REC_MAX_DENT_RATIO, + step_h = 5, step_v = 2, + rec_detect=False, show=False, test=False): + """ + :param binary: Binary image from pre-processing + :param min_obj_area: If not pass then ignore the small object + :param min_obj_perimeter: If not pass then 
ignore the small object + :param line_thickness: If not pass then ignore the slim object + :param min_rec_evenness: If not pass then this object cannot be rectangular + :param max_dent_ratio: If not pass then this object cannot be rectangular + :return: boundary: [top, bottom, left, right] + -> up, bottom: list of (column_index, min/max row border) + -> left, right: list of (row_index, min/max column border) detect range of each row + """ + mask = np.zeros((binary.shape[0] + 2, binary.shape[1] + 2), dtype=np.uint8) + compos_all = [] + compos_rec = [] + compos_nonrec = [] + row, column = binary.shape[0], binary.shape[1] + for i in range(0, row, step_h): + for j in range(i % 2, column, step_v): + if binary[i, j] == 255 and mask[i, j] == 0: + # get connected area + # region = util.boundary_bfs_connected_area(binary, i, j, mask) + + mask_copy = mask.copy() + ff = cv2.floodFill(binary, mask, (j, i), None, 0, 0, cv2.FLOODFILL_MASK_ONLY) + if ff[0] < min_obj_area: continue + mask_copy = mask - mask_copy + region = np.reshape(cv2.findNonZero(mask_copy[1:-1, 1:-1]), (-1, 2)) + region = [(p[1], p[0]) for p in region] + + # filter out some compos + component = Component(region, binary.shape) + # calculate the boundary of the connected area + # ignore small area + if component.width <= 3 or component.height <= 3: + continue + # check if it is line by checking the length of edges + # if component.compo_is_line(line_thickness): + # continue + + if test: + print('Area:%d' % (len(region))) + draw.draw_boundary([component], binary.shape, show=True) + + compos_all.append(component) + + if rec_detect: + # rectangle check + if component.compo_is_rectangle(min_rec_evenness, max_dent_ratio): + component.rect_ = True + compos_rec.append(component) + else: + component.rect_ = False + compos_nonrec.append(component) + + if show: + print('Area:%d' % (len(region))) + draw.draw_boundary(compos_all, binary.shape, show=True) + + # draw.draw_boundary(compos_all, binary.shape, show=True) + if 
rec_detect: + return compos_rec, compos_nonrec + else: + return compos_all + + +def nested_components_detection(grey, org, grad_thresh, + show=False, write_path=None, + step_h=10, step_v=10, + line_thickness=C.THRESHOLD_LINE_THICKNESS, + min_rec_evenness=C.THRESHOLD_REC_MIN_EVENNESS, + max_dent_ratio=C.THRESHOLD_REC_MAX_DENT_RATIO): + ''' + :param grey: grey-scale of original image + :return: corners: list of [(top_left, bottom_right)] + -> top_left: (column_min, row_min) + -> bottom_right: (column_max, row_max) + ''' + compos = [] + mask = np.zeros((grey.shape[0]+2, grey.shape[1]+2), dtype=np.uint8) + broad = np.zeros((grey.shape[0], grey.shape[1], 3), dtype=np.uint8) + broad_all = broad.copy() + + row, column = grey.shape[0], grey.shape[1] + for x in range(0, row, step_h): + for y in range(0, column, step_v): + if mask[x, y] == 0: + # region = flood_fill_bfs(grey, x, y, mask) + + # flood fill algorithm to get background (layout block) + mask_copy = mask.copy() + ff = cv2.floodFill(grey, mask, (y, x), None, grad_thresh, grad_thresh, cv2.FLOODFILL_MASK_ONLY) + # ignore small regions + if ff[0] < 500: continue + mask_copy = mask - mask_copy + region = np.reshape(cv2.findNonZero(mask_copy[1:-1, 1:-1]), (-1, 2)) + region = [(p[1], p[0]) for p in region] + + compo = Component(region, grey.shape) + # draw.draw_region(region, broad_all) + # if block.height < 40 and block.width < 40: + # continue + if compo.height < 30: + continue + + # print(block.area / (row * column)) + if compo.area / (row * column) > 0.9: + continue + elif compo.area / (row * column) > 0.7: + compo.redundant = True + + # get the boundary of this region + # ignore lines + if compo.compo_is_line(line_thickness): + continue + # ignore non-rectangle as blocks must be rectangular + if not compo.compo_is_rectangle(min_rec_evenness, max_dent_ratio): + continue + # if block.height/row < min_block_height_ratio: + # continue + compos.append(compo) + # draw.draw_region(region, broad) + if show: + 
        cv2.imshow('flood-fill all', broad_all)
        cv2.imshow('block', broad)
        cv2.waitKey()
    if write_path is not None:
        cv2.imwrite(write_path, broad)
    return compos


# diff --git a/CDM/detect_compo/lib_ip/ip_draw.py (new file, +139 lines)
import cv2
import numpy as np
from random import randint as rint
from CDM.config.CONFIG_UIED import Config


C = Config()


def draw_bounding_box_class(org, components, color_map=C.COLOR, line=2, show=False, write_path=None, name='board'):
    """
    Draw bounding box of components with their classes on the original image
    :param org: original image
    :param components: bbox [(column_min, row_min, column_max, row_max)]
                    -> top_left: (column_min, row_min)
                    -> bottom_right: (column_max, row_max)
    :param color_map: colors mapping to different components
    :param line: line thickness
    :param compo_class: classes matching the corners of components
    :param show: show or not
    :return: labeled image
    """
    board = org.copy()
    for compo in components:
        bbox = compo.put_bbox()
        board = cv2.rectangle(board, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color_map[compo.category], line)
        # board = cv2.putText(board, compo.category, (bbox[0]+5, bbox[1]+20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color_map[compo.category], 2)
    if show:
        cv2.imshow(name, board)
        cv2.waitKey(0)
    if write_path is not None:
        cv2.imwrite(write_path, board)
    return board


def draw_bounding_box(org, ratio, components, color=(0, 255, 0), line=2,
                      show=False, write_path=None, name='board', is_return=False, wait_key=0):
    """
    Draw bounding box of components on the original image
    :param org: original image
    :param ratio: scale factor applied to each bbox before drawing
    :param components: bbox [(column_min, row_min, column_max, row_max)]
                    -> top_left: (column_min, row_min)
                    -> bottom_right: (column_max, row_max)
    :param color: line color
    :param line: line thickness
    :param show: show or not
    :return: labeled image (or None when nothing would be shown/written/returned)
    """
    # short-circuit: skip the copy and drawing entirely when no output is requested
    if not show and write_path is None and not is_return: return
    board = org.copy()
    # board = cv2.imread(img_path)
    # ratio = board.shape[0]/org.shape[0]

    for compo in components:
        bbox = compo.put_bbox()

        # bounding box on full size image
        # bbox = int(ratio * bbox)
        bbox = [int(x * ratio) for x in bbox]
        board = cv2.rectangle(board, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, line)
    if show:
        cv2.imshow(name, board)
        if wait_key is not None:
            cv2.waitKey(wait_key)
        if wait_key == 0:
            cv2.destroyWindow(name)
    if write_path is not None:
        # board = cv2.resize(board, (1080, 1920))
        # board = board[100:-110]
        cv2.imwrite(write_path, board)
    return board


def draw_line(org, lines, color=(0, 255, 0), show=False):
    """
    Draw detected lines on the original image
    :param org: original image
    :param lines: [line_h, line_v]
            -> line_h: horizontal {'head':(column_min, row), 'end':(column_max, row), 'thickness':int)
            -> line_v: vertical {'head':(column, row_min), 'end':(column, row_max), 'thickness':int}
    :param color: drawn color
    :param show: show or not
    :return: image with lines drawn
    """
    board = org.copy()
    line_h, line_v = lines
    for line in line_h:
        cv2.line(board, tuple(line['head']), tuple(line['end']), color, line['thickness'])
    for line in line_v:
        cv2.line(board, tuple(line['head']), tuple(line['end']), color, line['thickness'])
    if show:
        cv2.imshow('img', board)
        cv2.waitKey(0)
    return board


def draw_boundary(components, shape, show=False):
    """
    Draw boundary of objects on the black withe
    :param components: boundary: [top, bottom, left, right]
                        -> up, bottom: (column_index, min/max row border)
                        -> left, right: (row_index, min/max column border) detect range of each row
    :param shape: shape or original image
    :param show: show or not
    :return: drawn board
    """
    board = np.zeros(shape[:2], dtype=np.uint8)  # binary board
    for component in components:
        # up and bottom: (column_index, min/max row border)
        for point in component.boundary[0] + component.boundary[1]:
            board[point[1], point[0]] = 255
        # left, right: (row_index, min/max column border)
        for point in component.boundary[2] + component.boundary[3]:
            board[point[0], point[1]] = 255
    if show:
        cv2.imshow('rec', board)
        cv2.waitKey(0)
    return board


def draw_region(region, broad, show=False):
    # paint every (row, col) point of the region with one random colour
    color = (rint(0, 255), rint(0, 255), rint(0, 255))
    for point in region:
        broad[point[0], point[1]] = color

    if show:
        cv2.imshow('region', broad)
        cv2.waitKey()
    return broad


def draw_region_bin(region, broad, show=False):
    # paint every (row, col) point of the region white on a binary board
    for point in region:
        broad[point[0], point[1]] = 255

    if show:
        cv2.imshow('region', broad)
        cv2.waitKey()
    return broad


# diff --git a/CDM/detect_compo/lib_ip/ip_preprocessing.py (new file, +69 lines)
import cv2
import numpy as np
from CDM.config.CONFIG_UIED import Config
C = Config()


def read_img(path, resize_height=None, kernel_size=None):
    # read an image (optionally median-blurred and resized to a fixed height);
    # returns (bgr_image, grayscale) or (None, None) on failure

    def resize_by_height(org):
        # keep aspect ratio while scaling to resize_height
        w_h_ratio = org.shape[1] / org.shape[0]
        resize_w = resize_height * w_h_ratio
        re = cv2.resize(org, (int(resize_w), int(resize_height)))
        return re

    try:
        img = cv2.imread(path)
        if kernel_size is not None:
            img = cv2.medianBlur(img, kernel_size)
        if img is None:
            print("*** Image does not exist ***")
            return None, None
        if resize_height is not None:
            img = resize_by_height(img)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        return img, gray

    except Exception as e:
        print(e)
        print("*** Img Reading Failed ***\n")
        return None, None


def gray_to_gradient(img):
    # compute a simple horizontal+vertical gradient magnitude map (uint8)
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img_f = np.copy(img)
    img_f = img_f.astype("float")

    # one-sided difference kernels: horizontal and vertical neighbour differences
    kernel_h = np.array([[0,0,0], [0,-1.,1.], [0,0,0]])
    kernel_v = np.array([[0,0,0], [0,-1.,0], [0,1.,0]])
    dst1 = abs(cv2.filter2D(img_f, -1, kernel_h))
    dst2 = abs(cv2.filter2D(img_f, -1, kernel_v))
    gradient = (dst1 + dst2).astype('uint8')
    return gradient


def reverse_binary(bin, show=False):
    """
    Reverse the input binary image
    """
    # NOTE(review): parameter name `bin` shadows the Python builtin
    r, bin = cv2.threshold(bin, 1, 255, cv2.THRESH_BINARY_INV)
    if show:
        cv2.imshow('binary_rev', bin)
        cv2.waitKey()
    return bin


def binarization(org, grad_min, show=False, write_path=None, wait_key=0):
    # binarize by thresholding the gradient map, then close small holes
    grey = cv2.cvtColor(org, cv2.COLOR_BGR2GRAY)
    grad = gray_to_gradient(grey)                                           # get RoI with high gradient
    rec, binary = cv2.threshold(grad, grad_min, 255, cv2.THRESH_BINARY)     # enhance the RoI
    # NOTE(review): morphologyEx is given a plain (3, 3) tuple as the kernel —
    # confirm this is accepted; cv2.getStructuringElement is the usual way to build one
    morph = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, (3, 3))               # remove noises
    if write_path is not None:
        cv2.imwrite(write_path, morph)
    if show:
        cv2.imshow('binary', morph)
        if wait_key is not None:
            cv2.waitKey(wait_key)
    return morph


# diff --git a/CDM/detect_compo/model/model-99-resnet18.pkl (new file, git-lfs pointer)
# diff --git a/CDM/detect_merge/Element.py (new file, +113 lines)
import numpy as np
import cv2


class Element:
    # A detected UI element (text or component) with its bounding box and hierarchy links.
    def __init__(self, id, corner, category, text_content=None):
        self.id = id
        self.category = category
        self.col_min, self.row_min, self.col_max, self.row_max = corner
        self.width = self.col_max - self.col_min
        self.height = self.row_max - self.row_min
        self.area = self.width * self.height

        self.text_content = text_content
        self.parent_id = None
        self.children = []  # list of elements
        self.label = None

    def init_bound(self):
        # recompute width/height/area after the corners changed
        self.width = self.col_max - self.col_min
        self.height = self.row_max - self.row_min
        self.area = self.width * self.height

    def put_bbox(self):
        return self.col_min, self.row_min, self.col_max, self.row_max

    def wrap_info(self):
        # serialize this element to a plain dict (children referenced by id only)
        info = {'id': self.id, 'class': self.category, 'height': self.height, 'width': self.width,
                'position': {'column_min': self.col_min, 'row_min': self.row_min, 'column_max': self.col_max,
                             'row_max': self.row_max}, 'label': self.label}
        if self.text_content is not None:
            info['text_content'] = self.text_content
        if len(self.children) > 0:
            info['children'] = []
            for child in self.children:
                info['children'].append(child.id)
        if self.parent_id is not None:
            info['parent'] = self.parent_id
        return info

    def resize(self, resize_ratio):
        # scale the bbox by resize_ratio and refresh derived attributes
        self.col_min = int(self.col_min * resize_ratio)
        self.row_min = int(self.row_min * resize_ratio)
        self.col_max = int(self.col_max * resize_ratio)
        self.row_max = int(self.row_max * resize_ratio)
        self.init_bound()

    def element_merge(self, element_b, new_element=False, new_category=None, new_id=None):
        # merge bboxes; either return a brand-new Element or grow self in place
        col_min_a, row_min_a, col_max_a, row_max_a = self.put_bbox()
        col_min_b, row_min_b, col_max_b, row_max_b = element_b.put_bbox()
        new_corner = (min(col_min_a, col_min_b), min(row_min_a, row_min_b), max(col_max_a, col_max_b), max(row_max_a, row_max_b))
        if element_b.text_content is not None:
            self.text_content = element_b.text_content if self.text_content is None else self.text_content + '\n' + element_b.text_content
        if new_element:
            return Element(new_id, new_corner, new_category)
        else:
            self.col_min, self.row_min, self.col_max, self.row_max = new_corner
            self.init_bound()

    def calc_intersection_area(self, element_b, bias=(0, 0)):
        # intersection metrics; bias enlarges the overlap test by (horizontal, vertical) pixels
        a = self.put_bbox()
        b = element_b.put_bbox()
        col_min_s = max(a[0], b[0]) - bias[0]
        row_min_s = max(a[1], b[1]) - bias[1]
        col_max_s = min(a[2], b[2])
        row_max_s = min(a[3], b[3])
        w = np.maximum(0, col_max_s - col_min_s)
        h = np.maximum(0, row_max_s - row_min_s)
        inter = w * h

        iou = inter / (self.area + element_b.area - inter)
        ioa = inter / self.area
        iob = inter / element_b.area

        return inter, iou, ioa, iob

    def element_relation(self, element_b, bias=(0, 0)):
        """
        @bias: (horizontal bias, vertical bias)
        :return: -1 : a in b
                 0  : a, b are not intersected
                 1  : b in a
                 2  : a, b are identical or intersected
        """
        inter, iou, ioa, iob = self.calc_intersection_area(element_b, bias)

        # area of intersection is 0
        if ioa == 0:
            return 0
        # a in b
        if ioa >= 1:
            return -1
        # b in a
        if iob >= 1:
            return 1
        return 2

    def visualize_element(self, img, color=(0, 255, 0), line=1, show=False, ratio=1):
        # draw this element's (optionally rescaled) bbox and 1-based id onto img in place
        loc = self.put_bbox()

        if ratio != 1:
            loc = [int(x * ratio) for x in loc]

        # cv2.rectangle(img, loc[:2], loc[2:], color, line)
        cv2.rectangle(img, (loc[0], loc[1]), (loc[2], loc[3]), color, line)
        cv2.putText(img, str(int(self.id) + 1), (int(ratio*(self.col_min - 10)), int(ratio*(self.row_max + 10))), cv2.FONT_HERSHEY_SIMPLEX, 1,
                    color, line)
        # for child in self.children:
        #     child.visualize_element(img, color=(255, 0, 255), line=line)
        if show:
            cv2.imshow('element', img)
            cv2.waitKey(0)
            cv2.destroyWindow('element')


# diff --git a/CDM/detect_merge/merge.py (new file, +361 lines)
import json
import cv2
import numpy as np
from os.path import join as pjoin
import os
import time
import shutil

from CDM.detect_merge.Element import Element
from torchvision import models
from torch import nn
import torch

import CDM.detect_compo.lib_ip.ip_preprocessing as pre
def show_elements(org_img, eles, ratio, show=False, win_name='element', wait_key=0, shown_resize=None, line=2):
    """Draw every element onto a copy of org_img; optionally display it.

    :param ratio: coordinate scaling passed to each element's visualizer
    :param shown_resize: optional (width, height) to resize the result to
    :return: the rendered (possibly resized) image
    """
    color_map = {'Text': (0, 0, 255), 'Compo': (0, 255, 0), 'Block': (0, 255, 0), 'Text Content': (255, 0, 255)}
    img = org_img.copy()
    for ele in eles:
        ele.visualize_element(img=img, color=color_map[ele.category], line=line, ratio=ratio)
    img_resize = img if shown_resize is None else cv2.resize(img, shown_resize)
    if show:
        cv2.imshow(win_name, img_resize)
        cv2.waitKey(wait_key)
        if wait_key == 0:
            cv2.destroyWindow(win_name)
    return img_resize


def show_one_element(org_img, eles, ratio, show=False, win_name='element', wait_key=0, shown_resize=None, line=2):
    """Render each element on its own copy of org_img and return all copies.

    Note: the returned list holds the un-resized images; shown_resize only
    affects what is shown on screen (this mirrors the original ordering).
    """
    color_map = {'Text': (0, 0, 255), 'Compo': (0, 255, 0), 'Block': (0, 255, 0), 'Text Content': (255, 0, 255)}
    all_img = []
    for ele in eles:
        img = org_img.copy()
        ele.visualize_element(img=img, color=color_map[ele.category], line=line, ratio=ratio)
        all_img.append(img)
        shown = img if shown_resize is None else cv2.resize(img, shown_resize)
        if show:
            cv2.imshow(win_name, shown)
            cv2.waitKey(wait_key)
            if wait_key == 0:
                cv2.destroyWindow(win_name)
    return all_img


def save_elements(output_file, elements, img_shape, ratio=1):
    """Write all elements to output_file as JSON and return the written dict.

    When ratio != 1 each element is resized IN PLACE before serialization.
    """
    components = {'compos': [], 'img_shape': img_shape}
    for ele in elements:
        if ratio != 1:
            ele.resize(ratio)
            ele.width = ele.col_max - ele.col_min
            ele.height = ele.row_max - ele.row_min
        components['compos'].append(ele.wrap_info())
    # fix: close the file handle instead of leaking it via a bare open()
    with open(output_file, 'w') as f:
        json.dump(components, f, indent=4)
    return components


def reassign_ids(elements):
    """Renumber element ids sequentially from 0, in place."""
    for i, element in enumerate(elements):
        element.id = i


def refine_texts(texts, img_shape):
    """Keep only text regions shorter than 7.5% of the image height (taller
    boxes are considered detection noise)."""
    return [text for text in texts if text.height / img_shape[0] < 0.075]
def merge_text_line_to_paragraph(elements, max_line_gap=5):
    """Merge vertically adjacent Text elements (gap within max_line_gap pixels)
    into paragraph-level elements; non-text elements pass through unchanged."""
    texts = [ele for ele in elements if ele.category == 'Text']
    non_texts = [ele for ele in elements if ele.category != 'Text']

    changed = True
    while changed:
        changed = False
        merged_set = []
        for text_a in texts:
            for text_b in merged_set:
                inter_area, _, _, _ = text_a.calc_intersection_area(text_b, bias=(0, max_line_gap))
                if inter_area > 0:
                    text_b.element_merge(text_a)
                    changed = True
                    break
            else:
                # no overlap with anything merged so far: keep as its own line
                merged_set.append(text_a)
        texts = merged_set.copy()
    return non_texts + texts


def refine_elements(compos, texts, input_img_path, intersection_bias=(2, 2), containment_ratio=0.8):
    '''
    1. remove compos contained in text
    2. remove compos containing text area that's too large
    3. collect texts contained by a (non-Block) compo so they are not emitted twice
    '''
    # input_img_path is kept for interface compatibility (used by a disabled
    # classification step in the original pipeline).
    elements = []
    contained_texts = []

    for compo in compos:
        is_valid = True
        text_area = 0
        for text in texts:
            inter, iou, ioa, iob = compo.calc_intersection_area(text, bias=intersection_bias)
            if inter > 0:
                if ioa >= containment_ratio:
                    # the compo is (almost) fully inside a text region: drop it
                    is_valid = False
                    break
                text_area += inter
                if iob >= containment_ratio and compo.category != 'Block':
                    # the text is (almost) fully inside this compo
                    contained_texts.append(text)
        if is_valid and text_area / compo.area < containment_ratio:
            elements.append(compo)

    for text in texts:
        if text not in contained_texts:
            elements.append(text)
    return elements


def check_containment(elements):
    """Link parent/child pairs for every containment relation found, in place."""
    for idx_a in range(len(elements) - 1):
        ele_a = elements[idx_a]
        for ele_b in elements[idx_a + 1:]:
            relation = ele_a.element_relation(ele_b, bias=(2, 2))
            if relation == -1:
                # ele_a lies inside ele_b
                ele_b.children.append(ele_a)
                ele_a.parent_id = ele_b.id
            elif relation == 1:
                # ele_b lies inside ele_a
                ele_a.children.append(ele_b)
                ele_b.parent_id = ele_a.id


def remove_top_bar(elements, img_height):
    """Drop small elements hugging the top edge (status-bar icons)."""
    max_height = img_height * 0.04
    return [ele for ele in elements
            if not (ele.row_min < 10 and ele.height < max_height)]


def remove_bottom_bar(elements, img_height):
    """Drop the small navigation icons near the bottom.

    NOTE(review): thresholds are hard-coded for 800px-high GUIs (row_min > 750,
    20-30px boxes); img_height is unused — confirm intended.
    """
    return [ele for ele in elements
            if not (ele.row_min > 750 and 20 <= ele.height <= 30 and 20 <= ele.width <= 30)]
def compos_clip_and_fill(clip_root, org, compos):
    """Crop every compo out of org into clip_root/<class>/<id>.jpg, and paint
    each cropped region over with the dominant surrounding color, producing
    clip_root/bkg.png. Existing clip_root is wiped first."""
    def most_pix_around(pad=6, offset=2):
        '''
        determine the filled background color according to the most surrounding pixel
        '''
        # reads col_min/row_min/col_max/row_max from the enclosing loop iteration
        up = row_min - pad if row_min - pad >= 0 else 0
        left = col_min - pad if col_min - pad >= 0 else 0
        bottom = row_max + pad if row_max + pad < org.shape[0] - 1 else org.shape[0] - 1
        right = col_max + pad if col_max + pad < org.shape[1] - 1 else org.shape[1] - 1
        most = []
        for i in range(3):
            # gather the frame of pixels around the box, one channel at a time
            val = np.concatenate((org[up:row_min - offset, left:right, i].flatten(),
                                  org[row_max + offset:bottom, left:right, i].flatten(),
                                  org[up:bottom, left:col_min - offset, i].flatten(),
                                  org[up:bottom, col_max + offset:right, i].flatten()))
            most.append(int(np.argmax(np.bincount(val))))
        return most

    if os.path.exists(clip_root):
        shutil.rmtree(clip_root)
    os.mkdir(clip_root)

    bkg = org.copy()
    cls_dirs = []
    for compo in compos:
        cls = compo['class']
        if cls == 'Background':
            compo['path'] = pjoin(clip_root, 'bkg.png')
            continue
        c_root = pjoin(clip_root, cls)
        c_path = pjoin(c_root, str(compo['id']) + '.jpg')
        compo['path'] = c_path
        if cls not in cls_dirs:
            os.mkdir(c_root)
            cls_dirs.append(cls)

        position = compo['position']
        col_min, row_min, col_max, row_max = position['column_min'], position['row_min'], position['column_max'], position['row_max']
        cv2.imwrite(c_path, org[row_min:row_max, col_min:col_max])
        # fill up the clipped area in the background image
        cv2.rectangle(bkg, (col_min, row_min), (col_max, row_max), most_pix_around(), -1)
    cv2.imwrite(pjoin(clip_root, 'bkg.png'), bkg)


def merge(img_path, compo_path, text_path, merge_root=None, is_paragraph=False, is_remove_top_bar=False, is_remove_bottom_bar=False, show=False, wait_key=0):
    """Fuse non-text component detections with OCR text detections.

    :param img_path: original screenshot
    :param compo_path: JSON produced by the component detector
    :param text_path: JSON produced by the text detector
    :param merge_root: output directory for the merged JSON and visualization
    :return: (annotated board image, merged components dict)
    """
    # fix: close the JSON file handles (were leaked via bare open())
    with open(compo_path, 'r') as f:
        compo_json = json.load(f)
    with open(text_path, 'r') as f:
        text_json = json.load(f)

    # load text and non-text compos with globally unique ids
    ele_id = 0
    compos = []
    for compo in compo_json['compos']:
        element = Element(ele_id, (compo['column_min'], compo['row_min'], compo['column_max'], compo['row_max']), compo['class'])
        compos.append(element)
        ele_id += 1
    texts = []
    for text in text_json['texts']:
        element = Element(ele_id, (text['column_min'], text['row_min'], text['column_max'], text['row_max']), 'Text', text_content=text['content'])
        texts.append(element)
        ele_id += 1
    if compo_json['img_shape'] != text_json['img_shape']:
        # bring text boxes into the compo detector's coordinate space
        resize_ratio = compo_json['img_shape'][0] / text_json['img_shape'][0]
        for text in texts:
            text.resize(resize_ratio)

    # check the original detected elements
    img = cv2.imread(img_path)
    img_resize = cv2.resize(img, (compo_json['img_shape'][1], compo_json['img_shape'][0]))
    ratio = img.shape[0] / img_resize.shape[0]
    show_elements(img, texts + compos, ratio, show=show, win_name='all elements before merging', wait_key=wait_key, line=3)

    # refine elements
    texts = refine_texts(texts, compo_json['img_shape'])
    elements = refine_elements(compos, texts, img_path)
    if is_remove_top_bar:
        elements = remove_top_bar(elements, img_height=compo_json['img_shape'][0])
    if is_remove_bottom_bar:
        elements = remove_bottom_bar(elements, img_height=compo_json['img_shape'][0])
    if is_paragraph:
        elements = merge_text_line_to_paragraph(elements, max_line_gap=7)
    reassign_ids(elements)
    check_containment(elements)
    board = show_elements(img, elements, ratio, show=show, win_name='elements after merging', wait_key=wait_key, line=3)

    # save all merged elements, clips and blank background
    # fix: the old `split('/')[-1][:-4]` slice assumed a 3-character file
    # extension and mangled names like 'x.jpeg'; splitext is robust
    name = os.path.splitext(os.path.basename(img_path.replace('\\', '/')))[0]
    components = save_elements(pjoin(merge_root, name + '.json'), elements, img_resize.shape)
    cv2.imwrite(pjoin(merge_root, name + '.jpg'), board)
    print('[Merge Completed] Input: %s Output: %s' % (img_path, pjoin(merge_root, name + '.jpg')))
    return board, components
class Text:
    """A piece of OCR-detected text with its content and pixel location.

    location is a dict with keys 'left', 'top', 'right', 'bottom'.
    """

    def __init__(self, id, content, location):
        self.id = id
        self.content = content
        self.location = location

        self.width = self.location['right'] - self.location['left']
        self.height = self.location['bottom'] - self.location['top']
        self.area = self.width * self.height
        # average pixel width per character; fix: guard against empty OCR
        # content, which previously raised ZeroDivisionError
        self.word_width = self.width / max(len(self.content), 1)

    '''
    ********************************
    *** Relation with Other text ***
    ********************************
    '''
    def is_justified(self, ele_b, direction='h', max_bias_justify=4):
        '''
        Check if the element is justified
        :param max_bias_justify: maximum bias if two elements to be justified
        :param direction:
             - 'v': vertical up-down connection
             - 'h': horizontal left-right connection
        '''
        l_a = self.location
        l_b = ele_b.location
        # connected vertically - up and below
        if direction == 'v':
            # left and right should be justified
            if abs(l_a['left'] - l_b['left']) < max_bias_justify and abs(l_a['right'] - l_b['right']) < max_bias_justify:
                return True
            return False
        elif direction == 'h':
            # top and bottom should be justified
            if abs(l_a['top'] - l_b['top']) < max_bias_justify and abs(l_a['bottom'] - l_b['bottom']) < max_bias_justify:
                return True
            return False

    def is_on_same_line(self, text_b, direction='h', bias_gap=4, bias_justify=4):
        '''
        Check if the element is on the same row(direction='h') or column(direction='v') with ele_b
        :param direction:
             - 'v': vertical up-down connection
             - 'h': horizontal left-right connection
        '''
        l_a = self.location
        l_b = text_b.location
        # connected vertically - up and below
        if direction == 'v':
            # left and right should be justified
            if self.is_justified(text_b, direction='v', max_bias_justify=bias_justify):
                # top and bottom should be connected (small gap)
                if abs(l_a['bottom'] - l_b['top']) < bias_gap or abs(l_a['top'] - l_b['bottom']) < bias_gap:
                    return True
            return False
        elif direction == 'h':
            # top and bottom should be justified
            if self.is_justified(text_b, direction='h', max_bias_justify=bias_justify):
                # left and right should be connected (small gap)
                if abs(l_a['right'] - l_b['left']) < bias_gap or abs(l_a['left'] - l_b['right']) < bias_gap:
                    return True
            return False

    def is_intersected(self, text_b, bias):
        """Return True iff the two boxes overlap, after shrinking the overlap
        window by `bias` pixels at its top-left corner.

        Fix: always returns an explicit bool (previously returned None for the
        no-overlap case; callers relied on truthiness, so this is compatible).
        """
        l_a = self.location
        l_b = text_b.location
        left_in = max(l_a['left'], l_b['left']) + bias
        top_in = max(l_a['top'], l_b['top']) + bias
        right_in = min(l_a['right'], l_b['right'])
        bottom_in = min(l_a['bottom'], l_b['bottom'])

        w_in = max(0, right_in - left_in)
        h_in = max(0, bottom_in - top_in)
        return w_in * h_in > 0

    '''
    ***********************
    *** Revise the Text ***
    ***********************
    '''
    def merge_text(self, text_b):
        """Absorb text_b into this text: union of boxes, contents joined
        left-to-right with a space."""
        text_a = self
        top = min(text_a.location['top'], text_b.location['top'])
        left = min(text_a.location['left'], text_b.location['left'])
        right = max(text_a.location['right'], text_b.location['right'])
        bottom = max(text_a.location['bottom'], text_b.location['bottom'])
        self.location = {'left': left, 'top': top, 'right': right, 'bottom': bottom}
        self.width = self.location['right'] - self.location['left']
        self.height = self.location['bottom'] - self.location['top']
        self.area = self.width * self.height

        left_element = text_a
        right_element = text_b
        if text_a.location['left'] > text_b.location['left']:
            left_element = text_b
            right_element = text_a
        self.content = left_element.content + ' ' + right_element.content
        self.word_width = self.width / len(self.content)

    def shrink_bound(self, binary_map):
        """Tighten the box to the non-zero area of binary_map inside it.

        Each side uses a small state machine: 0 = unseen, 1 = scanning a blank
        margin, -1 = done shrinking that side.
        """
        bin_clip = binary_map[self.location['top']:self.location['bottom'], self.location['left']:self.location['right']]
        height, width = np.shape(bin_clip)

        shrink_top = 0
        shrink_bottom = 0
        for i in range(height):
            # top
            if shrink_top == 0:
                if sum(bin_clip[i]) == 0:
                    shrink_top = 1
                else:
                    shrink_top = -1
            elif shrink_top == 1:
                if sum(bin_clip[i]) != 0:
                    self.location['top'] += i
                    shrink_top = -1
            # bottom
            if shrink_bottom == 0:
                if sum(bin_clip[height - i - 1]) == 0:
                    shrink_bottom = 1
                else:
                    shrink_bottom = -1
            elif shrink_bottom == 1:
                if sum(bin_clip[height - i - 1]) != 0:
                    self.location['bottom'] -= i
                    shrink_bottom = -1

            if shrink_top == -1 and shrink_bottom == -1:
                break

        shrink_left = 0
        shrink_right = 0
        for j in range(width):
            # left
            if shrink_left == 0:
                if sum(bin_clip[:, j]) == 0:
                    shrink_left = 1
                else:
                    shrink_left = -1
            elif shrink_left == 1:
                if sum(bin_clip[:, j]) != 0:
                    self.location['left'] += j
                    shrink_left = -1
            # right
            if shrink_right == 0:
                if sum(bin_clip[:, width - j - 1]) == 0:
                    shrink_right = 1
                else:
                    shrink_right = -1
            elif shrink_right == 1:
                if sum(bin_clip[:, width - j - 1]) != 0:
                    self.location['right'] -= j
                    shrink_right = -1

            if shrink_left == -1 and shrink_right == -1:
                break
        self.width = self.location['right'] - self.location['left']
        self.height = self.location['bottom'] - self.location['top']
        self.area = self.width * self.height
        self.word_width = self.width / max(len(self.content), 1)

    '''
    *********************
    *** Visualization ***
    *********************
    '''
    def visualize_element(self, img, color=(0, 0, 255), line=1, show=False):
        loc = self.location
        cv2.rectangle(img, (loc['left'], loc['top']), (loc['right'], loc['bottom']), color, line)
        if show:
            print(self.content)
            cv2.imshow('text', img)
            cv2.waitKey()
            cv2.destroyWindow('text')
def Google_OCR_makeImageData(imgpath):
    """Build the JSON request body (bytes) for a Google Cloud Vision
    DOCUMENT_TEXT_DETECTION call on the image at imgpath."""
    with open(imgpath, 'rb') as f:
        ctxt = b64encode(f.read()).decode()
    img_req = {
        'image': {
            'content': ctxt
        },
        'features': [{
            'type': 'DOCUMENT_TEXT_DETECTION',
            # 'type': 'TEXT_DETECTION',
            'maxResults': 1
        }]
    }
    # fix: the Vision API schema requires 'requests' to be a LIST of
    # AnnotateImageRequest objects; a bare dict is rejected by the endpoint
    return json.dumps({"requests": [img_req]}).encode()


def ocr_detection_google(imgpath):
    """Send an image to the Google Cloud Vision API and return its word-level
    annotations (skipping the full-text summary entry), or None if no text.

    The API key is read from the 'google_ocr' environment variable.
    """
    url = 'https://vision.googleapis.com/v1/images:annotate'
    api_key = os.environ.get('google_ocr')
    imgdata = Google_OCR_makeImageData(imgpath)
    response = requests.post(url,
                             data=imgdata,
                             params={'key': api_key},
                             # fix: HTTP header names use hyphens; the old
                             # 'Content_Type' key was not a valid header
                             headers={'Content-Type': 'application/json'})
    print("*** Please replace the Google OCR key at detect_text/ocr.py line 28 with your own (apply in https://cloud.google.com/vision) ***")
    if response.json()['responses'] == [{}]:
        # No Text
        return None
    else:
        # element [0] is the whole-image aggregate; keep the per-word entries
        return response.json()['responses'][0]['textAnnotations'][1:]
def save_detection_json(file_path, texts, img_shape):
    """Dump detected texts to file_path as JSON (id, content, box, size)."""
    output = {'img_shape': img_shape, 'texts': []}
    for text in texts:
        c = {'id': text.id, 'content': text.content}
        loc = text.location
        c['column_min'], c['row_min'], c['column_max'], c['row_max'] = loc['left'], loc['top'], loc['right'], loc['bottom']
        c['width'] = text.width
        c['height'] = text.height
        output['texts'].append(c)
    # fix: use a context manager so the file handle is closed (it was leaked)
    with open(file_path, 'w') as f_out:
        json.dump(output, f_out, indent=4)


def visualize_texts(org_img, texts, shown_resize_height=None, show=False, write_path=None):
    """Draw all text boxes on a copy of org_img; optionally show and/or save it."""
    img = org_img.copy()
    for text in texts:
        text.visualize_element(img, line=2)

    img_resize = img
    if shown_resize_height is not None:
        img_resize = cv2.resize(img, (int(shown_resize_height * (img.shape[1] / img.shape[0])), shown_resize_height))

    if show:
        cv2.imshow('texts', img_resize)
        cv2.waitKey(0)
        cv2.destroyWindow('texts')
    if write_path is not None:
        cv2.imwrite(write_path, img)


def text_sentences_recognition(texts):
    '''
    Merge separate words detected by Google ocr into a sentence
    '''
    changed = True
    while changed:
        changed = False
        temp_set = []
        for text_a in texts:
            merged = False
            for text_b in temp_set:
                # words on the same line within ~2 word-widths form one sentence
                if text_a.is_on_same_line(text_b, 'h', bias_justify=0.2 * min(text_a.height, text_b.height), bias_gap=2 * max(text_a.word_width, text_b.word_width)):
                    text_b.merge_text(text_a)
                    merged = True
                    changed = True
                    break
            if not merged:
                temp_set.append(text_a)
        texts = temp_set.copy()

    for i, text in enumerate(texts):
        text.id = i
    return texts


def merge_intersected_texts(texts):
    '''
    Merge intersected texts (sentences or words)
    '''
    changed = True
    while changed:
        changed = False
        temp_set = []
        for text_a in texts:
            merged = False
            for text_b in temp_set:
                if text_a.is_intersected(text_b, bias=2):
                    text_b.merge_text(text_a)
                    merged = True
                    changed = True
                    break
            if not merged:
                temp_set.append(text_a)
        texts = temp_set.copy()
    return texts


def text_cvt_orc_format(ocr_result):
    """Convert Google OCR annotations into Text objects, skipping any
    annotation with an incomplete bounding polygon."""
    texts = []
    if ocr_result is not None:
        for i, result in enumerate(ocr_result):
            error = False
            x_coordinates = []
            y_coordinates = []
            text_location = result['boundingPoly']['vertices']
            content = result['description']
            for loc in text_location:
                # Vision omits 'x'/'y' keys for vertices at coordinate 0
                if 'x' not in loc or 'y' not in loc:
                    error = True
                    break
                x_coordinates.append(loc['x'])
                y_coordinates.append(loc['y'])
            if error:
                continue
            location = {'left': min(x_coordinates), 'top': min(y_coordinates),
                        'right': max(x_coordinates), 'bottom': max(y_coordinates)}
            texts.append(Text(i, content, location))
    return texts


def text_cvt_orc_format_paddle(paddle_result):
    """Convert PaddleOCR output lines into Text objects."""
    texts = []
    for i, line in enumerate(paddle_result):
        points = np.array(line[0])
        location = {'left': int(min(points[:, 0])), 'top': int(min(points[:, 1])), 'right': int(max(points[:, 0])),
                    'bottom': int(max(points[:, 1]))}
        content = line[1][0]
        texts.append(Text(i, content, location))
    return texts
def text_cvt_orc_format_tesseract(tesseract_result):
    """Convert pytesseract image_to_data output into per-line Text objects,
    using Tesseract's line-level entries (level == 4) with non-empty text."""
    texts = []
    next_id = 0
    for i in range(len(tesseract_result['level'])):
        if tesseract_result['level'][i] != 4:
            continue
        content = tesseract_result['text'][i].strip()
        if not content:
            continue
        left = int(tesseract_result['left'][i])
        top = int(tesseract_result['top'][i])
        location = {
            'left': left,
            'top': top,
            'right': left + int(tesseract_result['width'][i]),
            'bottom': top + int(tesseract_result['height'][i]),
        }
        texts.append(Text(next_id, content, location))
        next_id += 1
    return texts


def text_cvt_orc_format_tesseract_by_line(data):
    """Group word-level (level == 5) pytesseract results into per-line Text
    objects; each line's box keeps the first word's left/top and grows
    right/bottom as words are added.

    NOTE(review): grouping keys on 'line_num' alone; Tesseract restarts line
    numbering per block/paragraph, so distinct paragraphs sharing a line_num
    would be merged — confirm this is acceptable for the pipeline.
    """
    lines = []          # each entry: [word_list, [left, top, right, bottom]]
    current_num = None
    for i in range(len(data['level'])):
        if data['level'][i] != 5:
            continue
        left = data['left'][i]
        top = data['top'][i]
        right = left + data['width'][i]
        bottom = top + data['height'][i]
        if current_num != data['line_num'][i]:
            # start a new line
            current_num = data['line_num'][i]
            lines.append([[data['text'][i]], [left, top, right, bottom]])
        else:
            # extend the current line with this word
            words, box = lines[-1]
            words.append(data['text'][i])
            box[2] = max(box[2], right)
            box[3] = max(box[3], bottom)

    texts = []
    for idx, (words, box) in enumerate(lines):
        location = {'left': box[0], 'top': box[1], 'right': box[2], 'bottom': box[3]}
        texts.append(Text(idx, ' '.join(words), location))
    return texts


def text_filter_noise(texts):
    """Drop single-character detections unless they are meaningful symbols."""
    keep_single = {'a', ',', '.', '!', '?', '$', '%', ':', '&', '+'}
    valid_texts = []
    for text in texts:
        if len(text.content) <= 1 and text.content.lower() not in keep_single:
            continue
        valid_texts.append(text)
    return valid_texts
def text_detection(input_file='../data/input/30800.jpg', output_file='../data/output', show=False, method='google', paddle_model=None):
    '''
    Detect text in input_file via the chosen OCR backend and save the results
    (annotated PNG + JSON) under <output_file>/ocr/.
    :param method: 'google', 'paddle' or 'pytesseract'
    :param paddle_model: the preload paddle model for paddle ocr
    :return: OCR processing time in seconds
    '''
    start = time.process_time()
    # fix: the old `split('/')[-1][:-4]` slice assumed a 3-character file
    # extension; splitext/basename also handles Windows paths and '.jpeg'
    name = os.path.splitext(os.path.basename(input_file))[0]
    ocr_root = pjoin(output_file, 'ocr')
    # fix: make sure the output directory exists before writing into it
    os.makedirs(ocr_root, exist_ok=True)
    img = cv2.imread(input_file)
    if img is None:
        print("imread nothing!")

    if method == 'google':
        print('*** Detect Text through Google OCR ***')
        ocr_result = ocr.ocr_detection_google(input_file)
        texts = text_cvt_orc_format(ocr_result)
        texts = merge_intersected_texts(texts)
        texts = text_filter_noise(texts)
        texts = text_sentences_recognition(texts)
        ocr_time_cost = time.process_time() - start
    elif method == 'paddle':
        # the paddle model must be preloaded and passed in by the caller
        print('*** Detect Text through Paddle OCR ***')
        result = paddle_model.ocr(input_file, cls=True)
        ocr_time_cost = time.process_time() - start
        texts = text_cvt_orc_format_paddle(result)
    elif method == 'pytesseract':
        # Tesseract expects RGB ordering; OpenCV loads BGR
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        result = pytesseract.image_to_data(img_rgb, output_type=pytesseract.Output.DICT)
        print("ocr result: ", result)
        ocr_time_cost = time.process_time() - start
        texts = text_cvt_orc_format_tesseract_by_line(result)
        print("texts: ", texts)
    else:
        raise ValueError('Method has to be "google" or "paddle" or "pytesseract"')

    visualize_texts(img, texts, shown_resize_height=800, show=show, write_path=pjoin(ocr_root, name + '.png'))
    save_detection_json(pjoin(ocr_root, name + '.json'), texts, img.shape)
    print("[Text Detection Completed in %.3f s] Input: %s Output: %s" % (ocr_time_cost, input_file, pjoin(ocr_root, name + '.json')))
    return ocr_time_cost
+ +UIED comprises two parts to detect UI text and graphic elements, such as button, image and input bar. +* For text, it leverages [Google OCR](https://cloud.google.com/vision/docs/ocr) to perfrom detection. + +* For graphical elements, it uses old-fashioned CV approaches to locate the elements and a CNN classifier to achieve classification. + +> UIED is highly customizable, you can replace both parts by your choice (e.g. other text detection approaches). Unlike black-box end-to-end deep learning approach, you can revise the algorithms in the non-text detection and merging (partially or entirely) easily to fit your task. + +![UIED Approach](https://github.com/MulongXie/UIED/blob/master/data/demo/approach.png) + +## How to use? + +### Dependency +* **Python 3.5** +* **Opencv 3.4.2** +* **Pandas** + + +### Installation + + + + +The new version of UIED equipped with Google OCR is easy to deploy and no pre-trained model is needed. Simply donwload the repo along with the dependencies. + +> Please replace the Google OCR key at `detect_text/ocr.py line 28` with your own (apply in [Google website](https://cloud.google.com/vision)). + +### Usage +To test your own image(s): +* To test single image, change *input_path_img* in ``run_single.py`` to your input image and the results will be output to *output_root*. +* To test mutiple images, change *input_img_root* in ``run_batch.py`` to your input directory and the results will be output to *output_root*. +* To adjust the parameters lively, using ``run_testing.py`` + +> Note: The best set of parameters vary for different types of GUI image (Mobile App, Web, PC). I highly recommend to first play with the ``run_testing.py`` to pick a good set of parameters for your data. 
+ +## Folder structure +``cnn/`` +* Used to train classifier for graphic UI elements +* Set path of the CNN classification model + +``config/`` +* Set data paths +* Set parameters for graphic elements detection + +``data/`` +* Input UI images and output detection results + +``detect_compo/`` +* Non-text GUI component detection + +``detect_text/`` +* GUI text detection using Google OCR + +``detect_merge/`` +* Merge the detection results of non-text and text GUI elements + +The major detection algorithms are in ``detect_compo/``, ``detect_text/`` and ``detect_merge/`` + +## Demo +GUI element detection result for web screenshot + +![UI Components detection result](https://github.com/MulongXie/UIED/blob/master/data/demo/demo.png) diff --git a/CDM/logs/cfg-for-web.txt b/CDM/logs/cfg-for-web.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7a5bcfcc62664cf08fc281544f42cf616b99d60 --- /dev/null +++ b/CDM/logs/cfg-for-web.txt @@ -0,0 +1,19 @@ +Testing image: data/input/9.png + +1. detect_compo/ip_region_proposal.py +# smaller minarea 50 -> 25 +line 70: uied_params = {'param-grad':5, 'param-block':5, 'param-minarea':25} + +2. detect_compo/lib_ip/ip_detection.py +line 289-290 comment: # remove filter of aspect ratio +line 342-344 comment: # remove is_line check + +3. detect_text_east/lib_east/eval.py +# smaller max_word_gap 10 -> 5 +line 52: def merge_text(corners, max_word_gad=5) # + +4. merge.py +# smaller horizontal max gap to merge lines (6,0) -> (4,0) +line 199 max_gap=(4,0) +# smaller vertical max gap to merge paragraph (0,6) -> (0,4) +line 202 max_gap=(0,6) \ No newline at end of file diff --git a/CDM/logs/log.txt b/CDM/logs/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..e92b2089092818845831ca04eeb6f80ee76604f4 --- /dev/null +++ b/CDM/logs/log.txt @@ -0,0 +1,22 @@ +16:10 8/7/2020 +- Synchronized with Webapp. +- Add image inspection. +- Used No-line v1. +- No-line v2 requires bug fix with consideration of gap. 
+ + +11:00 23/7/2020 +- Synchronized with Webapp. + +10:53 4/8/2020 +- Synchronized with Webapp. + +7/10/2020 +- Extract parameters as configurable + +30/10/2020 +- Speed optimization (500% boost) + +11/11/2020 +- Revise rm_line +- Add adjustable track bar testing \ No newline at end of file diff --git a/CDM/logs/speed-improvement.txt b/CDM/logs/speed-improvement.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bc6732afdaf49f082f8a02ad4cf9904fe9e1a2f --- /dev/null +++ b/CDM/logs/speed-improvement.txt @@ -0,0 +1,12 @@ +Optimization: +1. ip_preprocessing.py / gray_to_gradient : 0.5s -> 0.02s + +2. ip_draw.py / draw_bounding_box : if not show and write_path is None: return : 0.005s -> 0s + +3. ip_detection.py / component_detection : if ff[0] < min_obj_area: continue : 2.5s -> 0.3s + +4. ip_detection.py / component_detection : cv2.findNonZero : 0.65s -> 0.33s + +5. block_division.py / block_division : if ff[0] < 500 : continue: 1.97s -> 1s + +6. block_division.py / block_division : Turn off draw : 1s -> 0.65s \ No newline at end of file diff --git a/CDM/model/model-99-ViT-entire.pkl b/CDM/model/model-99-ViT-entire.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fcf5d5a0d2e20ea2c90edb4cd6a1f56574d5f778 --- /dev/null +++ b/CDM/model/model-99-ViT-entire.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc13133f12a561224c075dac2633af6dbe6036e6c6603c266efc0e6536727ca6 +size 343682793 diff --git a/CDM/model/model-99-resnet18.pkl b/CDM/model/model-99-resnet18.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bfcf4b62dd60467c589a415475532be7a55d1baf --- /dev/null +++ b/CDM/model/model-99-resnet18.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b31df5d3ed9c743990fb7a27baf71626cf7766df36d1f414496c89d34a854f2 +size 44957605 diff --git a/CDM/requirements.txt b/CDM/requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..10f219ea2828cbf15ddb3769bebc2f8e63f0e830 Binary files /dev/null and b/CDM/requirements.txt differ diff --git a/CDM/result_classification/README.md b/CDM/result_classification/README.md new file mode 100644 index 0000000000000000000000000000000000000000..55ccbc5a96d37a7c24f5d253e52c7c10fbc779cb --- /dev/null +++ b/CDM/result_classification/README.md @@ -0,0 +1,80 @@ +# UIED - UI element detection, detecting UI elements from UI screenshots or drawnings + +This project is still ongoing and this repo may be updated irregularly, I developed a web app for the UIED in http://uied.online + +## Related Publications: +[1. UIED: a hybrid tool for GUI element detection](https://dl.acm.org/doi/10.1145/3368089.3417940) + +[2. Object Detection for Graphical User Interface: Old Fashioned or Deep Learning or a Combination?](https://arxiv.org/abs/2008.05132) + +>The repo has been **upgraded with Google OCR** for GUI text detection, to use the original version in our paper (using [EAST](https://github.com/argman/EAST) as text detector), check the relase [v2.3](https://github.com/MulongXie/UIED/releases/tag/v2.3) and download the pre-trained model in [this link](https://drive.google.com/drive/folders/1MK0Om7Lx0wRXGDfNcyj21B0FL1T461v5?usp=sharing). + +## What is it? + +UI Element Detection (UIED) is an old-fashioned computer vision (CV) based element detection approach for graphic user interface. + +The input of UIED could be various UI image, such as mobile app or web page screenshot, UI design drawn by Photoshop or Sketch, and even some hand-drawn UI design. Then the approach detects and classifies text and graphic UI elements, and exports the detection result as JSON file for future application. + +UIED comprises two parts to detect UI text and graphic elements, such as button, image and input bar. +* For text, it leverages [Google OCR](https://cloud.google.com/vision/docs/ocr) to perfrom detection. 
+ +* For graphical elements, it uses old-fashioned CV approaches to locate the elements and a CNN classifier to achieve classification. + +> UIED is highly customizable, you can replace both parts by your choice (e.g. other text detection approaches). Unlike black-box end-to-end deep learning approach, you can revise the algorithms in the non-text detection and merging (partially or entirely) easily to fit your task. + +![UIED Approach](https://github.com/MulongXie/UIED/blob/master/data/demo/approach.png) + +## How to use? + +### Dependency +* **Python 3.5** +* **Opencv 3.4.2** +* **Pandas** + + +### Installation + + + + +The new version of UIED equipped with Google OCR is easy to deploy and no pre-trained model is needed. Simply donwload the repo along with the dependencies. + +> Please replace the Google OCR key at `detect_text/ocr.py line 28` with your own (apply in [Google website](https://cloud.google.com/vision)). + +### Usage +To test your own image(s): +* To test single image, change *input_path_img* in ``run_single.py`` to your input image and the results will be output to *output_root*. +* To test mutiple images, change *input_img_root* in ``run_batch.py`` to your input directory and the results will be output to *output_root*. +* To adjust the parameters lively, using ``run_testing.py`` + +> Note: The best set of parameters vary for different types of GUI image (Mobile App, Web, PC). I highly recommend to first play with the ``run_testing.py`` to pick a good set of parameters for your data. 
+ +## Folder structure +``cnn/`` +* Used to train classifier for graphic UI elements +* Set path of the CNN classification model + +``config/`` +* Set data paths +* Set parameters for graphic elements detection + +``data/`` +* Input UI images and output detection results + +``detect_compo/`` +* Non-text GUI component detection + +``detect_text/`` +* GUI text detection using Google OCR + +``detect_merge/`` +* Merge the detection results of non-text and text GUI elements + +The major detection algorithms are in ``detect_compo/``, ``detect_text/`` and ``detect_merge/`` + +## Demo +GUI element detection result for web screenshot + +![UI Components detection result](https://github.com/MulongXie/UIED/blob/master/data/demo/demo.png) diff --git a/CDM/result_processing/Untitled.ipynb b/CDM/result_processing/Untitled.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..d43ac1e0c8bb7857b5e693cf4ea4ad925a457ce9 --- /dev/null +++ b/CDM/result_processing/Untitled.ipynb @@ -0,0 +1,937 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import numpy as np\n", + "import cv2\n", + "from glob import glob\n", + "from os.path import join as pjoin\n", + "from tqdm import tqdm\n", + "\n", + "\n", + "def resize_label(bboxes, d_height, gt_height, bias=0):\n", + " bboxes_new = []\n", + " scale = gt_height / d_height\n", + " for bbox in bboxes:\n", + " bbox = [int(b * scale + bias) for b in bbox]\n", + " bboxes_new.append(bbox)\n", + " return bboxes_new\n", + "\n", + "\n", + "def draw_bounding_box(org, corners, color=(0, 255, 0), line=2, show=False):\n", + " board = org.copy()\n", + " for i in range(len(corners)):\n", + " board = cv2.rectangle(board, (corners[i][0], corners[i][1]), (corners[i][2], corners[i][3]), color, line)\n", + " if show:\n", + " cv2.imshow('a', cv2.resize(board, (500, 1000)))\n", + " cv2.waitKey(0)\n", + " return board\n", + "\n", + "\n", + "def 
load_detect_result_json(reslut_file_root, shrink=0):\n", + " def is_bottom_or_top(corner):\n", + " column_min, row_min, column_max, row_max = corner\n", + " if row_max < 36 or row_min > 725:\n", + " return True\n", + " return False\n", + "\n", + " result_files = glob(pjoin(reslut_file_root, '*.json'))\n", + " compos_reform = {}\n", + " print('Loading %d detection results' % len(result_files))\n", + " for reslut_file in tqdm(result_files):\n", + " img_name = reslut_file.split('\\\\')[-1].split('.')[0]\n", + " compos = json.load(open(reslut_file, 'r'))['compos']\n", + " for compo in compos:\n", + " if is_bottom_or_top((compo['column_min'], compo['row_min'], compo['column_max'], compo['row_max'])):\n", + " continue\n", + " if img_name not in compos_reform:\n", + " compos_reform[img_name] = {'bboxes': [[compo['column_min'] + shrink, compo['row_min'] + shrink, compo['column_max'] - shrink, compo['row_max'] - shrink]],\n", + " 'categories': [compo['category']]}\n", + " else:\n", + " compos_reform[img_name]['bboxes'].append([compo['column_min'] + shrink, compo['row_min'] + shrink, compo['column_max'] - shrink, compo['row_max'] - shrink])\n", + " compos_reform[img_name]['categories'].append(compo['category'])\n", + " return compos_reform\n", + "\n", + "\n", + "def load_ground_truth_json(gt_file):\n", + " def get_img_by_id(img_id):\n", + " for image in images:\n", + " if image['id'] == img_id:\n", + " return image['file_name'].split('/')[-1][:-4], (image['height'], image['width'])\n", + "\n", + " def cvt_bbox(bbox):\n", + " '''\n", + " :param bbox: [x,y,width,height]\n", + " :return: [col_min, row_min, col_max, row_max]\n", + " '''\n", + " bbox = [int(b) for b in bbox]\n", + " return [bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]\n", + "\n", + " data = json.load(open(gt_file, 'r'))\n", + " images = data['images']\n", + " annots = data['annotations']\n", + " compos = {}\n", + " print('Loading %d ground truth' % len(annots))\n", + " for annot in tqdm(annots):\n", + 
" img_name, size = get_img_by_id(annot['image_id'])\n", + " if img_name not in compos:\n", + " compos[img_name] = {'bboxes': [cvt_bbox(annot['bbox'])], 'categories': [annot['category_id']], 'size': size}\n", + " else:\n", + " compos[img_name]['bboxes'].append(cvt_bbox(annot['bbox']))\n", + " compos[img_name]['categories'].append(annot['category_id'])\n", + " return compos\n", + "\n", + "\n", + "def eval(detection, ground_truth, img_root, show=True, no_text=False, only_text=False):\n", + " def compo_filter(compos, flag):\n", + " if not no_text and not only_text:\n", + " return compos\n", + " compos_new = {'bboxes': [], 'categories': []}\n", + " for k, category in enumerate(compos['categories']):\n", + " if only_text:\n", + " if flag == 'det' and category != 'TextView':\n", + " continue\n", + " if flag == 'gt' and int(category) != 14:\n", + " continue\n", + " elif no_text:\n", + " if flag == 'det' and category == 'TextView':\n", + " continue\n", + " if flag == 'gt' and int(category) == 14:\n", + " continue\n", + "\n", + " compos_new['bboxes'].append(compos['bboxes'][k])\n", + " compos_new['categories'].append(category)\n", + " return compos_new\n", + "\n", + " def match(org, d_bbox, gt_bboxes, matched):\n", + " '''\n", + " :param matched: mark if the ground truth component is matched\n", + " :param d_bbox: [col_min, row_min, col_max, row_max]\n", + " :param gt_bboxes: list of ground truth [[col_min, row_min, col_max, row_max]]\n", + " :return: Boolean: if IOU large enough or detected box is contained by ground truth\n", + " '''\n", + " area_d = (d_bbox[2] - d_bbox[0]) * (d_bbox[3] - d_bbox[1])\n", + " for i, gt_bbox in enumerate(gt_bboxes):\n", + " if matched[i] == 0:\n", + " continue\n", + " area_gt = (gt_bbox[2] - gt_bbox[0]) * (gt_bbox[3] - gt_bbox[1])\n", + " col_min = max(d_bbox[0], gt_bbox[0])\n", + " row_min = max(d_bbox[1], gt_bbox[1])\n", + " col_max = min(d_bbox[2], gt_bbox[2])\n", + " row_max = min(d_bbox[3], gt_bbox[3])\n", + " # if not intersected, area 
intersection should be 0\n", + " w = max(0, col_max - col_min)\n", + " h = max(0, row_max - row_min)\n", + " area_inter = w * h\n", + " if area_inter == 0:\n", + " continue\n", + " iod = area_inter / area_d\n", + " iou = area_inter / (area_d + area_gt - area_inter)\n", + " # if show:\n", + " # cv2.putText(org, (str(round(iou, 2)) + ',' + str(round(iod, 2))), (d_bbox[0], d_bbox[1]),\n", + " # cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)\n", + "\n", + " if iou > 0.9 or iod == 1:\n", + " matched[i] = 0\n", + " return True\n", + " return False\n", + "\n", + " amount = len(detection)\n", + " TP, FP, FN = 0, 0, 0\n", + " pres, recalls, f1s = [], [], []\n", + " for i, image_id in enumerate(detection):\n", + " TP_this, FP_this, FN_this = 0, 0, 0\n", + " img = cv2.imread(pjoin(img_root, image_id + '.jpg'))\n", + " d_compos = detection[image_id]\n", + " gt_compos = ground_truth[image_id]\n", + "\n", + " org_height = gt_compos['size'][0]\n", + "\n", + " d_compos = compo_filter(d_compos, 'det')\n", + " gt_compos = compo_filter(gt_compos, 'gt')\n", + "\n", + " d_compos['bboxes'] = resize_label(d_compos['bboxes'], 800, org_height)\n", + " matched = np.ones(len(gt_compos['bboxes']), dtype=int)\n", + " for d_bbox in d_compos['bboxes']:\n", + " if match(img, d_bbox, gt_compos['bboxes'], matched):\n", + " TP += 1\n", + " TP_this += 1\n", + " else:\n", + " FP += 1\n", + " FP_this += 1\n", + " FN += sum(matched)\n", + " FN_this = sum(matched)\n", + "\n", + " try:\n", + " pre_this = TP_this / (TP_this + FP_this)\n", + " recall_this = TP_this / (TP_this + FN_this)\n", + " f1_this = 2 * (pre_this * recall_this) / (pre_this + recall_this)\n", + " except:\n", + " print('empty')\n", + " continue\n", + "\n", + " pres.append(pre_this)\n", + " recalls.append(recall_this)\n", + " f1s.append(f1_this)\n", + " if show:\n", + " print(image_id + '.jpg')\n", + " print('[%d/%d] TP:%d, FP:%d, FN:%d, Precesion:%.3f, Recall:%.3f' % (\n", + " i, amount, TP_this, FP_this, FN_this, pre_this, 
recall_this))\n", + " cv2.imshow('org', cv2.resize(img, (500, 1000)))\n", + " broad = draw_bounding_box(img, d_compos['bboxes'], color=(255, 0, 0), line=3)\n", + " draw_bounding_box(broad, gt_compos['bboxes'], color=(0, 0, 255), show=True, line=2)\n", + "\n", + " if i % 200 == 0:\n", + " precision = TP / (TP + FP)\n", + " recall = TP / (TP + FN)\n", + " f1 = 2 * (precision * recall) / (precision + recall)\n", + " print(\n", + " '[%d/%d] TP:%d, FP:%d, FN:%d, Precesion:%.3f, Recall:%.3f, F1:%.3f' % (i, amount, TP, FP, FN, precision, recall, f1))\n", + "\n", + " precision = TP / (TP + FP)\n", + " recall = TP / (TP + FN)\n", + " print('[%d/%d] TP:%d, FP:%d, FN:%d, Precesion:%.3f, Recall:%.3f, F1:%.3f' % (i, amount, TP, FP, FN, precision, recall, f1))\n", + " # print(\"Average precision:%.4f; Average recall:%.3f\" % (sum(pres)/len(pres), sum(recalls)/len(recalls)))\n", + "\n", + " return pres, recalls, f1s" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import math\n", + "\n", + "def draw_plot(data, title='Score for our approach'):\n", + " for i in range(len(data)):\n", + " data[i] = [d for d in data[i] if not math.isnan(d)]\n", + "# plt.title(title)\n", + " labels = ['Precision', 'Recall', 'F1']\n", + " bplot = plt.boxplot(data, patch_artist=True, labels=labels) # 设置箱型图可填充\n", + " colors = ['pink', 'lightblue', 'lightgreen']\n", + " for patch, color in zip(bplot['boxes'], colors):\n", + " patch.set_facecolor(color) \n", + " plt.grid(axis='y')\n", + " plt.xticks(fontsize=16)\n", + " plt.yticks(fontsize=16)\n", + " plt.savefig(title + '.png')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 9%|███████▏ | 442/4708 [00:00<00:01, 4173.66it/s]" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Loading 4708 detection results\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████| 4708/4708 [00:01<00:00, 4404.67it/s]\n" + ] + } + ], + "source": [ + "detect = load_detect_result_json('E:\\\\Mulong\\\\Result\\\\rico\\\\rico_uied\\\\rico_new_uied_cls\\\\merge')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 8%|█████▉ | 6915/86646 [00:00<00:01, 68670.52it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading 86646 ground truth\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████| 86646/86646 [00:11<00:00, 7576.11it/s]\n" + ] + } + ], + "source": [ + "gt = load_ground_truth_json('E:\\\\Mulong\\\\Datasets\\\\rico\\\\instances_test.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0/4707] TP:16, FP:0, FN:0, Precesion:1.000, Recall:1.000, F1:1.000\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py:165: RuntimeWarning: invalid value encountered in double_scalars\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[200/4707] TP:2222, FP:2920, FN:1705, Precesion:0.432, Recall:0.566, F1:0.490\n", + "[400/4707] TP:4616, FP:5737, FN:3346, Precesion:0.446, Recall:0.580, F1:0.504\n", + "[600/4707] TP:6963, FP:8682, FN:4812, Precesion:0.445, Recall:0.591, F1:0.508\n", + "[800/4707] TP:9367, FP:11432, FN:6305, Precesion:0.450, Recall:0.598, F1:0.514\n", + "[1000/4707] TP:11222, FP:14346, FN:7511, Precesion:0.439, Recall:0.599, F1:0.507\n", + "[1200/4707] TP:13680, FP:17278, 
FN:8901, Precesion:0.442, Recall:0.606, F1:0.511\n", + "[1400/4707] TP:16274, FP:20664, FN:10379, Precesion:0.441, Recall:0.611, F1:0.512\n", + "[1600/4707] TP:18431, FP:23002, FN:11556, Precesion:0.445, Recall:0.615, F1:0.516\n", + "[1800/4707] TP:20718, FP:25600, FN:13049, Precesion:0.447, Recall:0.614, F1:0.517\n", + "[2000/4707] TP:23009, FP:28626, FN:14588, Precesion:0.446, Recall:0.612, F1:0.516\n", + "[2200/4707] TP:25424, FP:31555, FN:16191, Precesion:0.446, Recall:0.611, F1:0.516\n", + "[2400/4707] TP:27559, FP:34176, FN:17388, Precesion:0.446, Recall:0.613, F1:0.517\n", + "[2600/4707] TP:29820, FP:37065, FN:18617, Precesion:0.446, Recall:0.616, F1:0.517\n", + "[2800/4707] TP:32108, FP:39846, FN:20018, Precesion:0.446, Recall:0.616, F1:0.518\n", + "[3000/4707] TP:34188, FP:43112, FN:21399, Precesion:0.442, Recall:0.615, F1:0.515\n", + "[3200/4707] TP:36558, FP:46011, FN:23002, Precesion:0.443, Recall:0.614, F1:0.514\n", + "[3400/4707] TP:38783, FP:48918, FN:24365, Precesion:0.442, Recall:0.614, F1:0.514\n", + "[3600/4707] TP:40958, FP:51829, FN:25605, Precesion:0.441, Recall:0.615, F1:0.514\n", + "[3800/4707] TP:43270, FP:54963, FN:26841, Precesion:0.440, Recall:0.617, F1:0.514\n", + "[4000/4707] TP:45512, FP:57838, FN:28141, Precesion:0.440, Recall:0.618, F1:0.514\n", + "[4200/4707] TP:47544, FP:60789, FN:29420, Precesion:0.439, Recall:0.618, F1:0.513\n", + "[4400/4707] TP:49907, FP:64407, FN:30897, Precesion:0.437, Recall:0.618, F1:0.512\n", + "[4600/4707] TP:52181, FP:67592, FN:32399, Precesion:0.436, Recall:0.617, F1:0.511\n", + "[4706/4707] TP:53393, FP:69230, FN:33248, Precesion:0.435, Recall:0.616, F1:0.511\n" + ] + } + ], + "source": [ + "no_text = False\n", + "only_text = False\n", + "pres_all, recalls_all, f1_all = eval(detect, gt, 'E:\\\\Mulong\\\\Datasets\\\\rico\\\\combined', show=False, no_text=no_text, only_text=only_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "[0/4707] TP:1, FP:0, FN:0, Precesion:1.000, Recall:1.000, F1:1.000\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py:165: RuntimeWarning: invalid value encountered in double_scalars\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[200/4707] TP:973, FP:2022, FN:891, Precesion:0.325, Recall:0.522, F1:0.400\n", + "empty\n", + "[400/4707] TP:1921, FP:3905, FN:1788, Precesion:0.330, Recall:0.518, F1:0.403\n", + "[600/4707] TP:2847, FP:6079, FN:2717, Precesion:0.319, Recall:0.512, F1:0.393\n", + "empty\n", + "empty\n", + "empty\n", + "[800/4707] TP:3774, FP:7895, FN:3574, Precesion:0.323, Recall:0.514, F1:0.397\n", + "empty\n", + "[1000/4707] TP:4478, FP:9951, FN:4229, Precesion:0.310, Recall:0.514, F1:0.387\n", + "empty\n", + "empty\n", + "[1200/4707] TP:5451, FP:12055, FN:4960, Precesion:0.311, Recall:0.524, F1:0.391\n", + "empty\n", + "empty\n", + "empty\n", + "[1400/4707] TP:6493, FP:14405, FN:5804, Precesion:0.311, Recall:0.528, F1:0.391\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[1600/4707] TP:7372, FP:15980, FN:6375, Precesion:0.316, Recall:0.536, F1:0.397\n", + "empty\n", + "empty\n", + "empty\n", + "[1800/4707] TP:8273, FP:17814, FN:7156, Precesion:0.317, Recall:0.536, F1:0.399\n", + "empty\n", + "empty\n", + "[2000/4707] TP:9273, FP:19993, FN:8051, Precesion:0.317, Recall:0.535, F1:0.398\n", + "empty\n", + "[2200/4707] TP:10293, FP:22055, FN:8869, Precesion:0.318, Recall:0.537, F1:0.400\n", + "[2400/4707] TP:11207, FP:23944, FN:9524, Precesion:0.319, Recall:0.541, F1:0.401\n", + "empty\n", + "empty\n", + "[2600/4707] TP:12103, FP:25932, FN:10276, Precesion:0.318, Recall:0.541, F1:0.401\n", + "[2800/4707] TP:12994, FP:27792, FN:11122, Precesion:0.319, Recall:0.539, F1:0.400\n", + "empty\n", + "empty\n", + "[3000/4707] TP:13839, FP:30256, FN:11943, Precesion:0.314, Recall:0.537, F1:0.396\n", + 
"[3200/4707] TP:14758, FP:32276, FN:12851, Precesion:0.314, Recall:0.535, F1:0.395\n", + "empty\n", + "[3400/4707] TP:15718, FP:34337, FN:13627, Precesion:0.314, Recall:0.536, F1:0.396\n", + "[3600/4707] TP:16695, FP:36424, FN:14263, Precesion:0.314, Recall:0.539, F1:0.397\n", + "[3800/4707] TP:17641, FP:38693, FN:14932, Precesion:0.313, Recall:0.542, F1:0.397\n", + "empty\n", + "empty\n", + "[4000/4707] TP:18651, FP:40641, FN:15653, Precesion:0.315, Recall:0.544, F1:0.399\n", + "empty\n", + "[4200/4707] TP:19554, FP:42631, FN:16305, Precesion:0.314, Recall:0.545, F1:0.399\n", + "empty\n", + "empty\n", + "[4400/4707] TP:20584, FP:45335, FN:17197, Precesion:0.312, Recall:0.545, F1:0.397\n", + "[4600/4707] TP:21416, FP:47595, FN:17950, Precesion:0.310, Recall:0.544, F1:0.395\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[4706/4707] TP:21870, FP:48657, FN:18391, Precesion:0.310, Recall:0.543, F1:0.395\n" + ] + } + ], + "source": [ + "no_text = True\n", + "only_text = False\n", + "pres_non_text, recalls_non_text, f1_non_text = eval(detect, gt, 'E:\\\\Mulong\\\\Datasets\\\\rico\\\\combined', show=False, no_text=no_text, only_text=only_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0/4707] TP:15, FP:0, FN:0, Precesion:1.000, Recall:1.000, F1:1.000\n", + "empty\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py:165: RuntimeWarning: invalid value encountered in double_scalars\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[200/4707] TP:1041, FP:1106, FN:1022, Precesion:0.485, Recall:0.505, F1:0.495\n", + "empty\n", + 
"empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[400/4707] TP:2185, FP:2342, FN:2068, Precesion:0.483, Recall:0.514, F1:0.498\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[600/4707] TP:3272, FP:3447, FN:2939, Precesion:0.487, Recall:0.527, F1:0.506\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[800/4707] TP:4505, FP:4625, FN:3819, Precesion:0.493, Recall:0.541, F1:0.516\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[1000/4707] TP:5426, FP:5713, FN:4600, Precesion:0.487, Recall:0.541, F1:0.513\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[1200/4707] TP:6649, FP:6803, FN:5521, Precesion:0.494, Recall:0.546, F1:0.519\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[1400/4707] TP:7890, FP:8150, FN:6466, Precesion:0.492, Recall:0.550, F1:0.519\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[1600/4707] TP:8964, FP:9117, FN:7276, Precesion:0.496, Recall:0.552, F1:0.522\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[1800/4707] TP:10052, FP:10179, FN:8286, Precesion:0.497, Recall:0.548, F1:0.521\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[2000/4707] TP:11126, FP:11243, FN:9147, Precesion:0.497, Recall:0.549, F1:0.522\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + 
"empty\n", + "[2200/4707] TP:12213, FP:12418, FN:10240, Precesion:0.496, Recall:0.544, F1:0.519\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[2400/4707] TP:13243, FP:13341, FN:10973, Precesion:0.498, Recall:0.547, F1:0.521\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[2600/4707] TP:14377, FP:14473, FN:11681, Precesion:0.498, Recall:0.552, F1:0.524\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[2800/4707] TP:15494, FP:15674, FN:12516, Precesion:0.497, Recall:0.553, F1:0.524\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[3000/4707] TP:16471, FP:16734, FN:13334, Precesion:0.496, Recall:0.553, F1:0.523\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[3200/4707] TP:17644, FP:17891, FN:14307, Precesion:0.497, Recall:0.552, F1:0.523\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[3400/4707] TP:18711, FP:18935, FN:15092, Precesion:0.497, Recall:0.554, F1:0.524\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[3600/4707] TP:19710, FP:19958, FN:15895, Precesion:0.497, Recall:0.554, F1:0.524\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[3800/4707] TP:20845, FP:21054, FN:16693, Precesion:0.498, Recall:0.555, F1:0.525\n", + 
"empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[4000/4707] TP:21881, FP:22177, FN:17468, Precesion:0.497, Recall:0.556, F1:0.525\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[4200/4707] TP:22842, FP:23306, FN:18263, Precesion:0.495, Recall:0.556, F1:0.524\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[4400/4707] TP:23930, FP:24465, FN:19093, Precesion:0.494, Recall:0.556, F1:0.524\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[4600/4707] TP:25015, FP:25747, FN:20199, Precesion:0.493, Recall:0.553, F1:0.521\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "empty\n", + "[4706/4707] TP:25638, FP:26458, FN:20742, Precesion:0.492, Recall:0.553, F1:0.521\n" + ] + } + ], + "source": [ + "no_text = False\n", + "only_text = True\n", + "pres_text, recalls_text, f1_text = eval(detect, gt, 'E:\\\\Mulong\\\\Datasets\\\\rico\\\\combined', show=False, no_text=no_text, only_text=only_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda\\lib\\site-packages\\matplotlib\\figure.py:448: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", + " % get_backend())\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAD/lJREFUeJzt3XuMXGd9xvHvE4dwycWJa7qlScARmIKLCqFLAqSUdaEioVXcqoQmKrcqYKlqoBRaGkQVTPoHhapFvRha00ZQoISAIHUjQ0DUq4SAwevciBMiuSbUblDDxQQBhRDy6x9zDMNk1zO7nt11Xn8/0sjn8s45vxm/fvb43ZnzpqqQJLXlmOUuQJI0foa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUHHLteJV69eXWvWrFmu00vSQ9KuXbu+XlWPHtZu2cJ9zZo1zMzMLNfpJekhKclXRmnnsIwkNchwl6QGGe6S1CDDXZIaZLhLUoOGhnuSK5Lck+S2OfYnyd8l2ZPk1iRPH3+ZkqT5GOXK/T3AuYfYfx6wtntsBN51+GVJkg7H0HCvquuAbx6iyQbgX6tnB3BykseMq0BJ0vyN40tMpwL7+tb3d9u+OtgwyUZ6V/dMTEwwPT09htMfPdavX7+g523fvn3MlUgPtpD+ad9cPOMI98yybdZZt6tqC7AFYHJysqampsZw+qPHoSYzT3LI/dJim6v/2TeXxzg+LbMfOL1v/TTg7jEcV5K0QOMI963Ay7pPzTwTuLeqHjQkI0laOkOHZZJ8EJgCVifZD7wZeBhAVf0jsA14IbAH+B7w+4tVrCRpNEPDvaouGrK/gD8cW0WSpMPmN1QlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S5mXVqlUkGfkBzKt9ElatWrXMr/KhbxyTdUg6ihw4cGDRJ984+ENBC+eVuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDVopHBPcm6SO5PsSXLpLPsfm2R7kpuS3JrkheMvVZI0qqHhnmQFsBk4D1gHXJRk3UCzPweuqqozgQuBd467UEnS6Ea5cj8L2FNVe6vqPuBKYMNAmwJO6pZXAnePr0RJ0nyNMkH2qcC+vvX9wNkDbTYBn0zyauB44PljqU6StCCjhPts05APTn1+EfCeqvrrJM8C3pfkKVX1wE8dKNkIbASYmJhgenp6ASVrLr6fWipL0dfsz4cnVYM5PdCgF9abquoF3fobAarqrX1tdgPnVtW+bn0v8Myqumeu405OTtbMzMzhvwIBkIRhf5fSOCxFX7M/zy3JrqqaHNZulDH3ncDaJGckOY7eL0y3DrT5b+B53YmfDDwC+Nr8SpYkjcvQcK+q+4FLgGuBO+h9KmZ3ksuTnN81ez3wqiS3AB8EXlH+2JWkZTPKmDtVtQ3YNrDtsr7l24FzxluaJGmh/IaqJDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUEjfYlJkg6qN58Em1Yu/jl0WAx3SfOSt3x7aW4ctmlRT9E8h2UkqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhvsRaNWqVSSZ1wOYV/tVq1Yt86uUtJi8/cAR6MCBA0vy9W5J7fLKXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDRgr3JOcmuTPJniSXztHmxUluT7I7yb+Nt0xJ0nwMvbdMkhXAZuDXgf3AziRbq+r2vjZrgTcC51TVgSQ/u1gFS5KGG+XK/SxgT1Xtrar7gCuBDQNtXgVsrqoDAFV1z3jLlCTNxyh3hTwV2Ne3vh84e6DNEwGS3ACsADZ
V1ScGD5RkI7ARYGJigunp6QWUfHRYivfG918LZf888mXYrWWTXAC8oKpe2a2/FDirql7d1+Ya4IfAi4HTgOuBp1TVt+Y67uTkZM3MzBz+K2hQkiW55e9in0Ntsn8uryS7qmpyWLtRhmX2A6f3rZ8G3D1Lm3+vqh9W1ZeBO4G1oxYrSRqvUcJ9J7A2yRlJjgMuBLYOtLkaWA+QZDW9YZq94yxUkjS6oeFeVfcDlwDXAncAV1XV7iSXJzm/a3Yt8I0ktwPbgT+tqm8sVtGSpEMbOua+WBxzn5tjmjqS2T+X1zjH3CVJDzGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoNGmUNVS6zefBJsWrn455AWKMmiHv+UU05Z1OMfDQz3I1De8u2luV/2pkU9hRo1377pvdmXh8MyktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBI4V7knOT3JlkT5JLD9HuRUkqyeT4SpQkzdfQcE+yAtgMnAesAy5Ksm6WdicCrwE+P+4iJUnzM8qV+1nAnqraW1X3AVcCG2Zp9xfA24Hvj7E+SdICjBLupwL7+tb3d9t+LMmZwOlVdc0Ya5MkLdAo0+zNNlnij+fMSnIM8A7gFUMPlGwENgJMTEwwPT09UpFHo6V4b3z/tVTsa0svw+Y2TPIsYFNVvaBbfyNAVb21W18J/Bfwne4pPwd8Ezi/qmbmOu7k5GTNzMy5+6i2FHNOOq+llop9bbyS7KqqoR9aGWVYZiewNskZSY4DLgS2HtxZVfdW1eqqWlNVa4AdDAl2SdLiGhruVXU/cAlwLXAHcFVV7U5yeZLzF7tASdL8jTLmTlVtA7YNbLtsjrZTh1+WJOlw+A1VSWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1aKQJsrX0kizq8U855ZRFPb6k5WW4H4Gqat7PSbKg50lqk8MyktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUoJHCPcm5Se5MsifJpbPsf12S25PcmuTTSR43/lIlSaMaGu5JVgCbgfOAdcBFSdYNNLsJmKyqXwI+Arx93IVKkkY3ypX7WcCeqtpbVfcBVwIb+htU1faq+l63ugM4bbxlSpLmY5Qbh50K7Otb3w+cfYj2FwMfn21Hko3ARoCJiQmmp6dHq1Ij8f3Ukcq+ufRGCffZ7j076+0Hk7wEmASeO9v+qtoCbAGYnJysqamp0arUSHw/daSyby69UcJ9P3B63/ppwN2DjZI8H3gT8Nyq+sF4ypMkLcQoY+47gbVJzkhyHHAhsLW/QZIzgX8Czq+qe8ZfpiRpPoaGe1XdD1wCXAvcAVxVVbuTXJ7k/K7ZXwEnAB9OcnOSrXMcTpK0BEaaiamqtgHbBrZd1rf8/DHXJUk6DH5DVZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDVopHBPcm6SO5PsSXLpLPsfnuRD3f7PJ1kz7kIlSaMbGu5JVgCbgfOAdcBFSdYNNLsYOFBVTwDeAbxt3IVKkkY3ypX7WcCeqtpbVfcBVwIbBtpsAN7bLX8EeF6SjK9MSdJ8jBLupwL7+tb3d9tmbVNV9wP3Aj8zjgIlSfN37AhtZrsCrwW0IclGYCPAxMQE09PTI5xeB61fv/6Q++f6z9L27dsXoxzppxyqf9o
3l94o4b4fOL1v/TTg7jna7E9yLLAS+ObggapqC7AFYHJysqamphZQ8tGr6kE/L6Ujhv3zyDLKsMxOYG2SM5IcB1wIbB1osxV4ebf8IuA/y79pSVo2Q6/cq+r+JJcA1wIrgCuqaneSy4GZqtoK/AvwviR76F2xX7iYRUuSDm2UYRmqahuwbWDbZX3L3wcuGG9pkqSF8huqktQgw12SGmS4S1KDDHdJapDhLkkNynJ9HD3J14CvLMvJ27Qa+PpyFyHNwr45Xo+rqkcPa7Rs4a7xSjJTVZPLXYc0yL65PByWkaQGGe6S1CDDvR1blrsAaQ72zWXgmLskNcgrd0lqkOH+EJbks0P2b0ty8lLVI81HkjVJbuuWp5Jcs9w1tWSku0Jq8SVZUVU/ms9zqurZQ/a/8PCqkh6smx85VfXActeiuXnlvgS6K5QvJXlvkluTfCTJo5LcleSyJJ8BLkjy+CSfSLIryfVJntQ9fyLJx5Lc0j2e3W3/TvfnY5Jcl+TmJLcleU63/a4kq7vl13X7bkvy2r667kjy7iS7k3wyySO7fa9JcntX75XL8LbpCNLXV94J3Ai8NMnnktyY5MNJTujaPSPJZ7t++oUkJ3bPvb5re+PB/nuIcz2368s3J7kpyYlL8RqbU1U+FvkBrKE3p+w53foVwJ8AdwFv6Gv3aWBtt3w2vRmtAD4EvLZbXgGs7Ja/0/35euBNfftP7JbvovftwF8GvggcD5wA7AbO7Oq6H3ha1/4q4CXd8t3Aw7vlk5f7PfRxRPThB4Bndn3qOuD4bt+fAZcBxwF7gWd020+iNzrwKOAR3ba19Cb5OXjM27rlKeCabvk/+v6tnAAcu9yv/6H4cFhm6eyrqhu65fcDr+mWPwTQXfk8G/hw32TCD+/+/DXgZQDVG7q5d+DYO4ErkjwMuLqqbh7Y/yvAx6rqu925Pgo8h970iF/ua7+L3j84gFuBDyS5Grh6IS9YzflKVe1I8pvAOuCGrq8eB3wO+AXgq1W1E6Cqvg2Q5HjgH5I8DfgR8MQh57kB+JskHwA+WlX7F+XVNM5hmaUz+JnTg+vf7f48BvhWVT2t7/HkkQ5cdR3wq8D/0Jvu8GUDTWafer7nB33LP+Inv4f5DWAzvav+Xd3E5zq6HeyrAT7V10/XVdXF3fbZPlv9x8D/Ak8FJun9MJhTVf0l8ErgkcCOg8OTmh/Dfek8NsmzuuWLgM/07+yucr6c5ALo/dIqyVO73Z8G/qDbviLJSf3PTfI44J6qeje9+WyfPnDu64Df6sb5jwd+G7h+rkKTHAOcXlXbgTcAJ9P777EEsAM4J8kTALp+9UTgS8DPJ3lGt/3E7qJgJb0r+geAl9IbOpxTksdX1Rer6m3ADGC4L4DhvnTuAF6e5FZgFfCuWdr8HnBxklvojYtv6Lb/EbA+yRfpDZ384sDzpoCbk9wE/A7wt/07q+pG4D3AF4DPA/9cVTcdotYVwPu7890EvKOqvjXi61TjquprwCuAD3b9eQfwpKq6D/hd4O+7Pvwp4BHAO+n1/R30hmS+O+uBf+K13S/+bwH+D/j44ryStvkN1SWQZA29XxY9ZZlLkXSU8MpdkhrklbskNcgrd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktSg/wdEbKhcLyCIwQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "draw_plot([pres_all, recalls_all])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda\\lib\\site-packages\\matplotlib\\figure.py:448: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", + " % get_backend())\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAD+BJREFUeJzt3X+QXWV9x/H3hyD+gBBMY7cW0DAaq6lTxa6gUuum2hFsJ2mnYsnUXx00M52itdpaHDsY6R9WO63TH9EWW0arVkRHacpE0bHZAdFoNvySgMykEZsUp/gDEbWKyLd/3INer7u5dzd3d8OT92vmzp5znuee8703Tz45efaee1JVSJLacsxyFyBJGj/DXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktSgY5frwGvWrKm1a9cu1+El6UFpz549X6uqRw3rt2zhvnbtWmZmZpbr8JL0oJTky6P0c1pGkhpkuEtSgwx3SWqQ4S5JDTLcJalBQ8M9yaVJ7kxy8xztSfJ3SfYluSnJ08ZfpiRpPkY5c383cPYh2s8B1nWPLcA7D78sSdLhGBruVXU18I1DdNkE/Gv17AJOSvLocRUoSZq/cVzEdDJwoG/9YLftK4Mdk2yhd3bPxMQE09PTYzj80WPDhg0Let7OnTvHXImOZhs3buSee+5Z1GOsXLmS7du3L+oxWjeOcM8s22a963ZVXQJcAjA5OVlTU1NjOPzR41A3M09yyHZpXO65555FH2tJMB8Ozzg+LXMQOLVv/RTgjjHsV5K0QOMI9+3AS7tPzTwDuLuqfmpKRpK0dIZOyyT5ADAFrElyEHgT8BCAqvpHYAfwAmAf8F3g9xerWEnSaIaGe1VtHtJewB+OrSJJ0mHzClVJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUoJHCPcnZSW5Lsi/JhbO0PybJziTXJ7kpyQvGX6okaVRDwz3JCmAbcA6wHticZP1Atz8HLq+q04HzgHeMu1BJ0uhGOXM/A9hXVfur6l7gMmDTQJ8CTuyWVwF3jK9ESdJ8HTtCn5OBA33rB4EzB/psBT6R5FXA8cDzxlKdJGlBRgn3zLKtBtY3A++uqr9O8kzgvUmeXFX3/8SOki3AFoCJiQmmp6cXULLm4vuppbIUY83xfHhSNZjTAx16Yb21qp7frb8BoKre0tdnL3B2VR3o1vcDz6iqO+fa7+TkZM3MzBz+KxAASRj2ZymNw1KMNcfz3JL
sqarJYf1GmXPfDaxLclqS4+j9wnT7QJ//Bp7bHfhJwMOAr86vZEnSuAwN96q6D7gAuAq4ld6nYvYmuTjJxq7b64BXJrkR+ADw8vKfXUlaNqPMuVNVO4AdA9su6lu+BThrvKVJkhbKK1QlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuB+BVq9eTZJ5PYB59V+9evUyv0pJi2mkj0Jqad11111LcgWgpHZ55i5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQX79gKR5qTedCFtXLf4xdFgMd0nzkjd/a0m++6i2Luohmue0jCQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaNFK4Jzk7yW1J9iW5cI4+L0pyS5K9Sf5tvGVKkuZj6LdCJlkBbAN+HTgI7E6yvapu6euzDngDcFZV3ZXkZxerYEnScKOcuZ8B7Kuq/VV1L3AZsGmgzyuBbVV1F0BV3TneMiVJ8zHK97mfDBzoWz8InDnQ5wkASa4FVgBbq+rjgztKsgXYAjAxMcH09PQCSj46LMV74/uvhXJ8Hvky7Ev3k5wLPL+qXtGtvwQ4o6pe1dfnSuAHwIuAU4BrgCdX1Tfn2u/k5GTNzMwc/itoUJKluRnCIh9DbXJ8Lq8ke6pqcli/UaZlDgKn9q2fAtwxS59/r6ofVNWXgNuAdaMWK0kar1HCfTewLslpSY4DzgO2D/S5AtgAkGQNvWma/eMsVJI0uqHhXlX3ARcAVwG3ApdX1d4kFyfZ2HW7Cvh6kluAncCfVtXXF6toSdKhDZ1zXyzOuc/NOU0dyRyfy2ucc+6SpAcZw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBo9xDVUus3nQibF21+MeQ1CzD/QiUN39rab4ve+uiHkLSMnJaRpIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1aKRwT3J2ktuS7Ety4SH6vTBJJZkcX4mSpPkaGu5JVgDbgHOA9cDmJOtn6bcSeDXwuXEXKUman1HO3M8A9lXV/qq6F7gM2DRLv78A3gZ8b4z1SZIWYJRwPxk40Ld+sNv2I0lOB06tqivHWJskaYFGuc1eZtn2o3vAJTkGeDvw8qE7SrYAWwAmJiaYnp4eqcij0VK8N77/WqhktlgYn5UrVzo+D1OG3aszyTOBrVX1/G79DQBV9ZZufRXwX8C3u6f8HPANYGNVzcy138nJyZqZmbP5qJZkae6husjHkMCxNm5J9lTV0A+tjDItsxtYl+S0JMcB5wHbH2isqrurak1Vra2qtcAuhgS7JGlxDQ33qroPuAC4CrgVuLyq9ia5OMnGxS5QkjR/o8y5U1U7gB0D2y6ao+/U4ZclSTocXqEqSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lq0LHLXYBml2RR9//IRz5yUfcvaXkZ7kegqpr3c5Is6HmS2uS0jCQ1yHCXpAYZ7pLUIMNdkhpkuEtSg0YK9yRnJ7ktyb4kF87S/toktyS5Kcmnkjx2/KVKkkY1NNyTrAC2AecA64HNSdYPdLsemKyqXwI+DLxt3IVKkkY3ypn7GcC+qtpfVfcClwGb+jtU1c6q+m63ugs4ZbxlSpLmY5SLmE4GDvStHwTOPET/84GPzdaQZAuwBWBiYoLp6enRqtRIfD91pHJsLr1Rwn226+BnvRQyyYuBSeA5s7VX1SXAJQCTk5M1NTU1WpUaie+njlSOzaU3SrgfBE7tWz8FuGOwU5LnAW8
EnlNV3x9PeZKkhRhlzn03sC7JaUmOA84Dtvd3SHI68E/Axqq6c/xlSpLmY2i4V9V9wAXAVcCtwOVVtTfJxUk2dt3+CjgB+FCSG5Jsn2N3kqQlMNK3QlbVDmDHwLaL+pafN+a6JEmHwStUJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoNGCvckZye5Lcm+JBfO0v7QJB/s2j+XZO24C5UkjW5ouCdZAWwDzgHWA5uTrB/odj5wV1U9Hng78NZxFypJGt0oZ+5nAPuqan9V3QtcBmwa6LMJeE+3/GHguUkyvjIlSfMxSrifDBzoWz/YbZu1T1XdB9wN/Mw4CpQkzd+xI/SZ7Qy8FtCHJFuALQATExNMT0+PcHg9YMOGDYdsn+s/Szt37lyMcqSfcKjx6dhceqOE+0Hg1L71U4A75uhzMMmxwCrgG4M7qqpLgEsAJicna2pqagElH72qfurfS+mI4fg8sowyLbMbWJfktCTHAecB2wf6bAde1i2/EPjP8k9akpbN0DP3qrovyQXAVcAK4NKq2pvkYmCmqrYD/wK8N8k+emfs5y1m0ZKkQxtlWoaq2gHsGNh2Ud/y94Bzx1uaJGmhvEJVkhpkuEtSgwx3SWqQ4S5JDTLcJalBWa6Poyf5KvDlZTl4m9YAX1vuIqRZODbH67FV9ahhnZYt3DVeSWaqanK565AGOTaXh9MyktQgw12SGmS4t+OS5S5AmoNjcxk45y5JDfLMXZIaZLg/iCX5zJD2HUlOWqp6pPlIsjbJzd3yVJIrl7umloz0rZBafElWVNUP5/OcqnrWkPYXHF5V0k/r7o+cqrp/uWvR3DxzXwLdGcoXk7wnyU1JPpzkEUluT3JRkk8D5yZ5XJKPJ9mT5JokT+yeP5Hko0lu7B7P6rZ/u/v56CRXJ7khyc1Jnt1tvz3Jmm75tV3bzUle01fXrUnelWRvkk8keXjX9uokt3T1XrYMb5uOIH1j5R3AdcBLknw2yXVJPpTkhK7f05N8phunn0+ysnvuNV3f6x4Yv4c41nO6sXxDkuuTrFyK19icqvKxyA9gLb17yp7VrV8K/AlwO/D6vn6fAtZ1y2fSu6MVwAeB13TLK4BV3fK3u5+vA97Y176yW76d3tWBvwx8ATgeOAHYC5ze1XUf8NSu/+XAi7vlO4CHdssnLfd76OOIGMP3A8/oxtTVwPFd258BFwHHAfuBp3fbT6Q3O/AI4GHdtnX0bvLzwD5v7pangCu75f/o+7tyAnDscr/+B+PDaZmlc6Cqru2W3we8ulv+IEB35vMs4EN9NxN+aPfz14CXAlRv6ubugX3vBi5N8hDgiqq6YaD9V4CPVtV3umN9BHg2vdsjfqmv/x56f+EAbgLen+QK4IqFvGA158tVtSvJbwLrgWu7sXoc8FngF4CvVNVugKr6FkCS44F/SPJU4IfAE4Yc51rgb5K8H/hIVR1clFfTOKdlls7gZ04fWP9O9/MY4JtV9dS+x5NG2nHV1cCvAv9D73aHLx3oMvut53u+37f8Q378e5jfALbRO+vf0934XEe3B8ZqgE/2jdP1VXV+t322z1b/MfC/wFOASXr/GMypqv4SeAXwcGDXA9OTmh/Dfek8Jskzu+XNwKf7G7uznC8lORd6v7RK8pSu+VPAH3TbVyQ5sf+5SR4L3FlV76J3P9unDRz7auC3unn+44HfBq6Zq9AkxwCnVtVO4PXASfT+eywB7ALOSvJ4gG5cPQH4IvDzSZ7ebV/ZnRSsondGfz/wEnpTh3NK8riq+kJVvRWYAQz3BTDcl86twMuS3ASsBt45S5/fA85PciO9efFN3fY/AjYk+QK9qZNfHHjeFHBDkuuB3wH+tr+
xqq4D3g18Hvgc8M9Vdf0hal0BvK873vXA26vqmyO+TjWuqr4KvBz4QDeedwFPrKp7gd8F/r4bw58EHga8g97Y30VvSuY7s+74x17T/eL/RuD/gI8tzitpm1eoLoEka+n9sujJy1yKpKOEZ+6S1CDP3CWpQZ65S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAb9P9Yjn2/XeLGfAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "draw_plot([pres_non_text, recalls_non_text])" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import pandas as pd\n", + "\n", + "pres1 = pd.DataFrame({'score_type':'Precision', 'score': pres_non_text, 'class':'Non_text'})\n", + "pres2 = pd.DataFrame({'score_type':'Precision', 'score': pres_all, 'class':'All_element'})\n", + "\n", + "recalls1 = pd.DataFrame({'score_type':'Recall', 'score':recalls_non_text, 'class':'Non_text'})\n", + "recalls2 = pd.DataFrame({'score_type':'Recall', 'score':recalls_all, 'class':'All_element'})\n", + "\n", + "f1s1 = pd.DataFrame({'score_type':'F1', 'score':f1_non_text, 'class':'Non_text'})\n", + "f1s2 = pd.DataFrame({'score_type':'F1', 'score':f1_all, 'class':'All_element'})\n", + "\n", + "data=pd.concat([pres1, pres2, recalls1, recalls2, f1s1, f1s2])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEKCAYAAAD9xUlFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAIABJREFUeJzt3Xl8VNXdx/HPL4EkCAiytCprrKBQJAFZIorWgop1RwX3gj7leSwWl0pr61KL1dKK+hK0bg+1WnEBi4pKCz5a64JgEMPuggoSsYqgVNYk5Pf8MZPrJGaZTHJnsnzfr1denXvumXt+M7fym3PvPeeYuyMiIgKQluoARESk4VBSEBGRgJKCiIgElBRERCSgpCAiIgElBRERCSgpiIhIQElBREQCSgoiIhJokeoAaqtTp07es2fPVIchItKovPXWW1+4e+ea6jW6pNCzZ0+WLl2a6jBERBoVM9sQTz1dPhIRkYCSgoiIBJQUREQk0OjuKYhI41BcXExhYSG7d+9OdSjNSlZWFl27dqVly5YJvV9JQURCUVhYSNu2benZsydmlupwmgV3Z8uWLRQWFpKdnZ3QMUK7fGRmfzazz81sVRX7zcymm9k6M1thZgPDikVEkm/37t107NhRCSGJzIyOHTvWqXcW5j2FvwCjqtl/ItAr+jcBuCfEWEQkBZQQkq+u33loScHdXwG2VlPlNOBhj1gMtDezA8KKR0REapbKewpdgI0x24XRsk8rVjSzCUR6E3Tv3j0pwSXimXdWVFp+2qH9Q223uPgjAFq2TOwaYnP1yIo3a/2eC/oPSbi93bsX1fo9WVnDEm6vudm2e1e57XZZraqtf+ONN9KmTRuuvvrqhNssLd0DQFpaZsLHaGhS+UhqZX0cr6yiu9/v7oPcfVDnzjWO0hYRkQSlMikUAt1itrsCm1IUi4g0cQ8//DD9+/cnJyeHCy+8sNy+Bx54gMGDB5OTk8OZZ57Jzp07AZgzZw79+vUjJyeHo48+GoDVq1czZMgQcnNzyc0dxPvvr0v6ZwlTKpPCPOCi6FNIecA2d//WpSMRkbpavXo1N998My+99BLLly/nzjvvLLd/9OjR5Ofns3z5cvr06cPMmTMBmDJlCgsWLGD58uXMmzcPgHvvvZfLL7+cgoIC3nxzEV27dkn65wlTaPcUzOwx4AdAJzMrBH4DtARw93uB+cCPgHXATmB8WLGISPP20ksvcdZZZ9GpUycAOnToUG7/qlWruO666/jqq6/Yvn07J5xwAgBHHnkk48aNY8yYMYwePRqAI444gptvvpnCwkJOP/1kevU6OLkfJmShJQV3P7eG/Q5MDKt9EZEy7l7to5rjxo3j6aefJicnh7/85S+8/PLLQKRXsGTJEp5//nlyc3MpKCjgvPPOY+jQoTz//POceOLJ3H//PYwcWd3T942L5j4SkSZvxIgRzJ49my1btgCwdWv5p+W//vprDjjgAIqLi5k1a1ZQ/sEHHzB06FCmTJlCp06d2LhxIx9++CEHHXQQkyZN4pRTTmblykrH5zZamuZCRJq873//+1x77bUcc8wxpKenM2DAAGIX67rpppsYOnQoPXr04LDDDuPrr78GYPLkybz//vu4OyNGjCAnJ4epU6fyyCOP0LJlS7773e9w/fW/TtGnCodFruI0HoMGDfKGushOKsYpFBWtKbedkdE3tLaaGo1TCNfatWvp06dP0tqr7TiF+tBQxylU9t2b2VvuPqim9+rykYiIBJQUREQSUFq6CygFSqOvmwYlBRERCSgpiIhIQElBREQCSgoiIhLQOAURSYo5857hyx3b6+14+7Vuw9mnngbAl7t2fmt/Wdl+rfaptzabAyUFkXq0a9eLSTteq1Yj6rWtsH25Yzt7D+5Wc8V4j7duY411OuzTmquuuorbbrsNgGnTprF9+3ZuvPHGeolh/foNLFq0mPPOG5vQ+wsKCti0aRM/+tGP6iWe+qDLRyL
SZGVmZjJ37ly++OKLUI6/fv0GHnvsiYTfX1BQwPz58+sxorpTUhCRJqtFixZMmDCBO+6441v7NmzYwIgRI+jfvz8jRozg448/BiKT402aNIlhw4Zx0EEH8eSTT1Z5/F//+npee20RAwcO5Y477mDv3r1MnjyZwYMH079/f+677z4AnnrqKUaOHIm78+mnn9K7d28+/vhjbrjhBp544glyc3N54onEk0t9UlIQkSZt4sSJzJo1i23btpUrv+yyy7joootYsWIF559/PpMmTQr2ffrpp7z22ms899xzXHPNNVUe+5ZbbuKoo4axbNkSrrzySmbOnEm7du3Iz88nPz+fBx54gI8++ogzzjiD/fffn7vvvpuf/OQn/Pa3v6V79+5MmTKFsWPHUlBQwNixiV2Cqm+6pyAiTdq+++7LRRddxPTp02nV6pv5kN544w3mzp0LwIUXXsgvfvGLYN/pp59OWloaffv25bPPPvvWMUtLd5S9Cv63tHQHCxcuZMWKFUHvYtu2bbz//vtkZ2czY8YM+vXrR15eHueeW+3KAimlpNBI7dmzrMryzMyBSY5GpGG74oorGDhwIOPHV72WV+x6C5mZ30xwV5tJQ92dGTNmBIv0xPrkk09IS0vjs88+o7S0lLS0hnmhpmFGJSJNzn6t25C+bmO9/e3Xuk3cbXfo0IExY8YEy2wCDBs2jMcffxyAWbNmcdRRR9X6M7Vt25bt278Otk844QTuueceiouLAXjvvffYsWMHJSUljB8/nkcffZQ+ffpw++23B+8vm6a7oVBPQUSSomxMQar8/Oc/56677gq2p0+fzsUXX8ytt95K586defDBB2t9zP79+9GiRQsGDMjjxz++gCuumMz69esZOHAg7k7nzp15+umnue222xg+fDjDhw8nNzeXwYMHc9JJJ3HssccydepUcnNz+dWvftUg7itoPYV6lMz1FKq6fATo8lGMe/JfSVpblw4+ut7HKVSnoY9TSOZ6CpUNXisTxuC1b+4plJeW1rre20qE1lMQEZF6octHIiI1WLlyJRdeeGFMSSmZmZm88cbLqQopNEoKIiI1OOywwygoKAi2q7p81BQoKdRBvGv8VqxXl3V+RUTCpHsKIiISUFIQEZGALh+JSFL84x9PU1RUf+spZGS0YdSo0+vteBKhpCAiSVFUtJ3jjjug3o73wgufxlXvqaeeYvTo0axdu5ZDDz2U9evXc/LJJ7Nq1Spefvllpk2bxnPPPVertl9++RVuu206zz5b9QyqYVm/fj2LFi3ivPPOC+X4unwkIk3aY489xlFHHRVMadHYrV+/nkcffTS04yspiEiTtX37dl5//XVmzpyZUFLYsWMHF198MYMHH86AATk89dTjlJaWXQJzwNmxYzuXXPI/DB16dLk6f/7zvZx++umccsopZGdnc9ddd3H77bczYMAA8vLy2Lp1KwAffPABo0aN4vDDD2f48OG88847QNXrOlxzzTW8+uqr5ObmVrpORF0pKYhIkzX/2WcZNWoUvXv3pkOHDixbVvX0MJW5+eab+eEPf8iSJf/ixRef55e/vI4dO8qPUbjllls59thjKq2zatUqHn30Ud58802uvfZa9tlnH95++22OOOIIHn74YQAmTJjAjBkzeOutt5g2bRo//elPg2NXtq7D1KlTGT58OAUFBVx55ZV1+XoqFeo9BTMbBdwJpAP/6+5TK+zvDjwEtI/WucbdG9badCLSIBWX7uWLndXfuH5y9mz+e+JEvti5nZNHn8Fjjz3GxIkT425j4cKFzJs3j2nT/gjA7t17+Pjj8mtDv/DCSzz77Hxuv336t+oce+yxtG3blrZt29KuXTtOOeUUIDIYbsWKFWzfvp1FixZx9tlnB8fbs2dP8LqmdR3CEFpSMLN04G7gOKAQyDezee6+JqbadcBsd7/HzPoC84GeYcUkIs3H1i1beO1fr/DOmrWYGXv37iU9La3cL/GauDt/+9vf6NWrS7nyzz7bXK7
OnDmPcMghvcvVWbJkabl1GdLS0oLttLQ0SkpKKC0tpX379uVGS8dKdF2HugizpzAEWOfuHwKY2ePAaUBsUnBg3+jrdsCmEONp9HbvXpRQvaysYWGEI1IrGRlt4n5iqCbFpXtJb1H9jKTPPv0MY847l9tmTA/KzjzxJAoLC+Nu54QTTmDGjBnceefvMTPefns5AwbklKtz/PEjuOuu+5g+fVqVdaqy7777kp2dzZw5czj77LNxd1asWEFOTtXvD3sNhjCTQhcgtp9VCAytUOdGYKGZ/QxoDYys7EBmNgGYANC9e/d6D1REwlefYwpqumwEMHfOHCZddVW5sjPPPJNbbrkl7nauv/56rrjiCnJz83B3evTo/q3HUK+77pdceeUvq61TnVmzZnHppZfyu9/9juLiYs4555xqk0L//v1p0aIFOTk5jBs3rt7vK4S2noKZnQ2c4O7/Fd2+EBji7j+LqXNVNIbbzOwIYCbQz91LKz0oDWs9hXjnPqoo0bmP4u0pVNScewpaTyF1wlxPIZ6kUJlO+8S/Wlusb544qp20tMTaq6uGup5CIdAtZrsr3748dAkwG8Dd3wCygE4hxiQiItUIMynkA73MLNvMMoBzgHkV6nwMjAAwsz5EksJmRESS6MEHHyQ3N7fcX22eUmpKQrun4O4lZnYZsIDI46Z/dvfVZjYFWOru84CfAw+Y2ZVEbjqP88a2PqiIVMndMbNUh1Gj8ePHM378+FSHUS/q+k9oqOMUomMO5lcouyHm9RrgyDBjEJHUyMrKYsuWLXTs2LFRJIamwN3ZsmULWVlZCR9DE+KJSCi6du1KYWEhmzfX/xXh7UV7aq5Uic0ZmTVXqoR7Yu2ZJdZeXWRlZdG1a9eE36+kICKhaNmyJdnZ2aEc++HlSxJ630V9chN6X+JP/iXWXipp7iMREQkoKYiISEBJQUREAkoKIiIS0I1mkXqyePFi/v3v5E3Bsv/+rcnLy0tae9I8qKcgIiIB9RRE6kleXh67du2ouWI9adVKvQSpf0oKIiIQygy31R2zoc5yq8tHIiISUFIQEZGAkoKIiASUFEREJKCkICIiASUFEREJKCmIiEhASUFERAIavNbAJHMATUMdPCMiqaOegoiIBNRTqME9+a8k9Zjj+tV7cyKNkv7bSw31FEREJKCkICIiASUFEREJKCmIiEhASUFERAJKCiIiEtAjqdJkLV68mHfz85PX3t4McnKS1pxIKNRTEBGRgHoK0mTl5eXxdnpR8tobnBfKNCUiyRRqT8HMRpnZu2a2zsyuqaLOGDNbY2arzezRMOMREZHqhdZTMLN04G7gOKAQyDezee6+JqZOL+BXwJHu/qWZfSeseEREpGZh9hSGAOvc/UN3LwIeB06rUOcnwN3u/iWAu38eYjwiIlKDMJNCF2BjzHZhtCxWb6C3mb1uZovNbFRlBzKzCWa21MyWbt68OaRwRUQkzKRglZR5he0WQC/gB8C5wP+aWftvvcn9fncf5O6DOnfuXO+BiohIRJhJoRDoFrPdFdhUSZ1n3L3Y3T8C3iWSJEREJAXCTAr5QC8zyzazDOAcYF6FOk8DxwKYWScil5M+DDEmERGpRmhJwd1LgMuABcBaYLa7rzazKWZ2arTaAmCLma0B/glMdvctYcUkIiLVC3XwmrvPB+ZXKLsh5rUDV0X/REQkxTTNhYiIBJQUREQkoKQgIiIBTYgnIs3exjXvsOCTL5Pa5v77tyYvLy+pbcYj7p6CmR1lZuOjrzubWXZ4YYmISCrE1VMws98Ag4BDgAeBlsAjwJHhhSYikhzd+h7KCf2Kk9pmq1YNr5cA8fcUzgBOBXYAuPsmoG1YQYmISGrEmxSKomMKHMDMWocXkoiIpEq8SWG2md0HtDeznwD/BzwQXlgiIpIKcd1TcPdpZnYc8B8i9xVucPcXQo1MRESSrsakEF1BbYG7jwSUCEREmrAaLx+5+15gp5m1S0I8IiKSQvEOXtsNrDSzF4g+gQT
g7pNCiUpERFIi3qTwfPRPQpTsUZUNdUSliKROvDeaH4oulNM7WvSuuyd3pIdII9Cq1Ygq9+3evajWx8vKGlaXcERqLd4RzT8AHgLWE1l7uZuZ/djdXwkvtOYn2aMqG+qIShFJnXgvH90GHO/u7wKYWW/gMeDwsAITEZHki3fwWsuyhADg7u8Rmf9IRESakHh7CkvNbCbw1+j2+cBb4YQkIs3d4sWLeTc/P6lt5u9qxeDBhyS1zYYo3qRwKTARmETknsIrwJ/CCkpERFIj3qTQArjT3W+HYJRzZmhRiUizlpeXx9vpRUltc3CSp85uqOK9p/Ai0CpmuxWRSfFERKQJiTcpZLn79rKN6Ot9wglJRERSJd6ksMPMBpZtmNkgYFc4IYmISKrEe0/hcmCOmW0istDOgcDY0KISEZGUiDcpZAMDgO5ElubMI7oKm4iINB3xXj663t3/A7QHjgPuB+4JLSoREUmJeJPC3uj/ngTc6+7PABnhhCQiIqkSb1L4JLpG8xhgvpll1uK9IiLSSMT7D/sYYAEwyt2/AjoAk0OLSkREUiKupODuO919rru/H93+1N0X1vQ+MxtlZu+a2Tozu6aaemeZmUcfdRURkRQJ7RJQdCqMu4ETgb7AuWbWt5J6bYnMqbQkrFhERCQ+8T6SmoghwDp3/xDAzB4HTgPWVKh3E/BH4OoQY0mIZmoUkeYmzJvFXYCNMduF0bKAmQ0Aurn7c9UdyMwmmNlSM1u6efPm+o9URESAcHsKVklZMODNzNKAO4BxNR3I3e8nMjaCQYMGJW3QnGZqFJHmJsyeQiHQLWa7K7ApZrst0A942czWExklPU83m0VEUifMpJAP9DKzbDPLAM4B5pXtdPdt7t7J3Xu6e09gMXCquy8NMSYREalGaEnB3UuAy4iMb1gLzHb31WY2xcxODatdERFJXJj3FHD3+cD8CmU3VFH3B2HGIs3TpYOPrnLfIyverPXxLug/pC7hiDR4mqpCREQCSgoiIhJQUhARkUCo9xRERBqLVq1GVLlv9+5FCR0zK2tYouGkjHoKIiISUFIQEZGAkoKIiAR0T0EkSWKvL+/Zs6zKepmZA5MRjkil1FMQEZGAkoKIiASUFEREJKCkICIiASUFEREJKCmIiEhASUFERAJKCiIiElBSEBGRgJKCiIgElBRERCSguY9EpEGqbn3th5cvSeiYF+UMTTScZkNJoYHRQh8ikkq6fCQiIgElBRERCSgpiIhIQElBREQCSgoiIhJQUhARkYCSgoiIBJQUREQkoMFrIiI1qDgAdM+eZZXWy8wcmIxwQhVqT8HMRpnZu2a2zsyuqWT/VWa2xsxWmNmLZtYjzHhERKR6ofUUzCwduBs4DigE8s1snruvian2NjDI3Xea2aXAH4GxYcUk0lCU/aIsKvrmP4eMjL6pCkckEGZPYQiwzt0/dPci4HHgtNgK7v5Pd98Z3VwMdA0xHhERqUGY9xS6ABtjtguB6qYovAT4e2U7zGwCMAGge/fu9RWfNHMX9B8SvH7mnRVV1jvt0P7JCEekQQizp2CVlHmlFc0uAAYBt1a2393vd/dB7j6oc+fO9RiiiIjECrOnUAh0i9nuCmyqWMnMRgLXAse4+54Q4xERkRqE2VPIB3qZWbaZZQDnAPNiK5jZAOA+4FR3/zzEWEREJA6hJQV3LwEuAxYAa4HZ7r7azKaY2anRarcCbYA5ZlZgZvOqOJyIiCRBqIPX3H0+ML9C2Q0xr0eG2b6IiNSORjTXoLp1Yh9Z8WZCx4x96kVEpCFRUhCRRueinPJPt89dW1Bl3dF9csMOp0nRhHgiIhJQUhARkYCSgoiIBHRPoRGJnb63qql7oWlM3ysiqaGegohILWVmDsQsq9xfU/kxpqQgIiIBJQUREQkoKYiISEBJQUREAkoKIiIS0COpItLoje6Ty8IP1pYrO/57fVIUTeOmnoKIiASUFEREJKCkIJJCGRl9MWtFRkbfVIcitVR27pra+VN
SEBGRgJKCiIgE9PRRI1U2z0pR0Zpy5U2pGysiyaeegog0Ccd/rw+tMzJpnZGpx1HrQD2FOqi41vIz76yotN5ph/ZPRjgiInWmpCAikqCWLbNTHUK90+UjEWkyjux2EG0yMlMdRqOmpCAiIgElBRGROigp2ZTqEOqV7imISJOS890uSWtrz54VlJZuZu/eLWRmHpa0dsOknkIj11SH2oskasNXW5LSTklJEaWlmwEoLf2ckpKipLQbNiUFEWkyXtuwjvnvr+b1DetCb6u4eHG1242VkoKINAlFe/ey8vPI9f0Vn2+iaO/e0NoqKfk3UFyhtJiSks9CazNZlBREpEmYu3pZue2n1iyrombdFRe/V0X5u6G1mSyhJgUzG2Vm75rZOjO7ppL9mWb2RHT/EjPrGWY8ItI0FW77ki/37CpXtnX3Lj75z5ehtGfWsVbljUloScHM0oG7gROBvsC5ZlbxTuglwJfufjBwB/CHsOIRkabrtY8/qLT81Q2Vl9dVZubBtSpvTMLsKQwB1rn7h+5eBDwOnFahzmnAQ9HXTwIjzMxCjElEmqDhPb5Xq/K6MsvELLtCWTZmjX80dZhJoQuwMWa7MFpWaR13LwG2AY2//yUiSdVl3/3YL7NVubIOWa3osu9+obWZmdmTb/4JTYtuN35hDl6r7Be/J1AHM5sATADo3r173SMLSapmQ22Kk3IlWypnstX5qx+jvz+QmcteD7bP6Dsw1PbM0sjIyKWoaBkZGQMwaxrP7YT5KQqBbjHbXYGK48GDOmbWAmgHbK14IHe/390Hufugzp07hxSuiDRmGenpHPadAwHo/50DyUhPD73N9PT9yMo6kvT09qG3lSxh9hTygV4WufD2CXAOcF6FOvOAHwNvAGcBL7n7t3oKIiLxOKrHwXRrtx892ifvKrRZVtLaSobQkoK7l5jZZcACIB34s7uvNrMpwFJ3nwfMBP5qZuuI9BDOCSseEWkekpkQmqJQJ8Rz9/nA/AplN8S83g2cHWYMIiISv6ZxZ0REROqFkoKIiASUFEREJKCkICIiASUFEREJKCmIiEjAGttYMTPbDGxIdRwh6gR8keogJCE6d41bUz9/Pdy9xikhGl1SaOrMbKm7D0p1HFJ7OneNm85fhC4fiYhIQElBREQCSgoNz/2pDkASpnPXuOn8oXsKIiISQz0FEREJKCmIiEhASSFBZrbXzArMbJWZzTGzferhmIPMbHo1+w80syfr2o5Ur8K5fdbM6nVZLTMbZ2Z3RV/faGZX1+fxpWYx57jsr6eZdTSzf5rZ9rLz0xwpKSRul7vnuns/oAj4n9idFlGr79fdl7r7pGr2b3L3sxILV2oh9txuBSamOiCpd2XnuOxvPbAbuB5o1klaSaF+vAocHP21sdbM/gQsA7qZ2fFm9oaZLYv2KNoAmNlgM1tkZsvN7E0za2tmPzCz56L7j4n5FfN2dH9PM1sV3Z9lZg+a2cro/mOj5ePMbK6Z/cPM3jezP6boO2kq3gC6lG2Y2WQzyzezFWb225jyi6Jly83sr9GyU8xsSfT8/J+ZfTcF8Uuc3H2Hu79GJDk0W0oKdWRmLYATgZXRokOAh919ALADuA4Y6e4DgaXAVWaWATwBXO7uOcBIYFeFQ18NTHT3XGB4JfsnArj7YcC5wEP2zWKxucBY4DBgrJl1q6/P25yYWTowgsha4pjZ8UAvYAiR7/hwMzvazL4PXAv8MHo+L48e4jUgL/r/hceBXyT5I0jVWsX86Hoq1cE0JKEux9nEtTKzgujrV4msN30gsMHdF0fL84C+wOtmBpBB5JfnIcCn7p4P4O7/AYjWKfM6cLuZzQLmunthhf1HATOi73/HzDYAvaP7XnT3bdFjrgF6ABvr6XM3B2XntifwFvBCtPz46N/b0e02RJJEDvCku38B4O5bo/u7Ak+Y2QFEzv1HSYle4rEr+oNLKlBPIXGx1yR/5u5F0fIdMXUMeCGmXl93vyRaXu0AEXefCvwX0Ap
YbGaHVqhi335XYE/M670o+ddW2T8YPYj8Y152T8GA38ecz4PdfSZVn88ZwF3R3tx/A1mV1BFpUJQUwrUYONLMDgYws33MrDfwDnCgmQ2OlreNXoYKmNn33H2lu/+ByGWniknhFeD8aN3eQHfg3VA/TTMT7W1NAq42s5bAAuDimPtCXczsO8CLwBgz6xgt7xA9RDvgk+jrHyc1eJEE6RdkiNx9s5mNAx4zs8xo8XXu/p6ZjQVmmFkrIvcLRlZ4+xXRm8d7gTXA34EDYvb/CbjXzFYCJcA4d99T4RKT1JG7v21my4Fz3P2vZtYHeCP6PW8HLnD31WZ2M/AvM9tL5PLSOOBGYI6ZfULkB0J2Kj6DxM/M1gP7AhlmdjpwvLuvSW1UyaVpLkREJKDLRyIiElBSEBGRgJKCiIgElBRERCSgpCAiIgElBZEQmVl7M/tpquMQiZeSgkgtVBxkGIf2gJKCNBoavCZNnpm1BmYTmYsoHbgJ+BC4E2hNZFqQEUAxcA8wiMiAwKvc/Z/RAYgnEZmmojXwQzObDIwBMoGn3P03VTQ/FfhedC6lF4D9icyT9Ew0tllEJkfsAJwRPV428Ki7/zZa5wIiI6szgCXAT919b718OSIVKClIczAK2OTuJwGYWTsio47Hunu+me1LZFT55RCZeTY619TC6BQiAEcA/d19a4XZUg2YZ2ZHu/srlbR9DdCvbPI1MzsGuBJ4JhrHMCJTYFwQPV4/YCeQb2bPE5lLayxwpLsXR6dlPx94uD6/IJEySgrSHKwEppnZH4DngK+ofJba6maefSFm9tOqZkutLCmU4+7/MrO7o3MmjQb+5u4l0WkzXnD3LdFY5hKZCbcEOJxIkoDIBImfJ/pFiNRESUGavOhcU4cDPwJ+Dyyk8llNq5s4quLst7939/sSDOmvRH7tnwNcHBtqhXoebeshd/9Vgm2J1IpuNEuTZ2YHAjvd/RFgGpF1LiqbpTbemWermi21Ml8DbSuU/QW4AsDdV8eUH2dmHaKTJJ5OZE2NF4Gzyo4f3d+jNp9fpDbUU5Dm4DDgVjMrJXIz+VIiv8ArzlIb18yz7r6wstlSqeSyjrtvMbPXo8uo/t3dJ7v7Z2a2Fni6QvURy0twAAAAgElEQVTXiPQiDiZyo3kpgJldR+T+Rlo0/onAhjp/KyKV0CypIklmZvsQuc8xMGaFvHHAIHe/LJWxiejykUgSmdlIIosszShLCCINiXoKIvUguurai5XsGlH2RJFIY6CkICIiAV0+EhGRgJKCiIgElBRERCSgpCAiIgElBRERCfw/wQEZp/wRE5kAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.boxenplot(x='score_type', y='score', hue='class', data=data, width=0.5, linewidth=1.0, palette=\"Set3\")" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAEBCAYAAACe6Rn8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAF4NJREFUeJzt3XuQXGWdxvHvQ7gZFJ0IDIsmGWKyQFBh3UFThauT6AIumkBlBUtFECSwCBIUd6HQcIsoBQIrFJqwrCBQLiugyQpiEDIoSJSAgAly3UwSFpRLAiFcwu23f5zTVNvpTJ+e7p6emff5VHV1+vR5+/y630w/fd5zU0RgZmZp2qzdBZiZWfs4BMzMEuYQMDNLmEPAzCxhDgEzs4Q5BMzMEuYQMDNLmEPAzCxhDgEzs4Rt3u4Catluu+2iq6ur3WWYmQ0rd91119MRsX2t+YZ8CHR1dbF06dJ2l2FmNqxIWllkPg8HmZklzCFgZpYwh4CZWcIKhYCkd0u6UNIdkl6UFJK6CrbdTNLJkvokvSzpXkkzGynazMyao+iawETgIGAt8Js6l3EmcBpwEfAJYAnwE0n/VOfrmJlZkxXdO+jXEdEJIOlLwD5FGknaATgR+E5EnJtPXixpIvAd4IY66zUzsyYqtCYQEW8M8PX3BbYErqyYfiXwPkk7D/B1zcysCVq9YXh3YAPwSMX05fn95BYv38zM+tHqg8XGAM/GxhcyXlP2/EYkzQJmAXR2dtLb29uyAltp6tSpTXmdxYsXN+V1rD7N6D/3XXv4b6+4VoeAgGpXsld/jSJiPjAfoLu7O3p6eppf2SDYOPs2JqnQfDb4avWL+27o8t9eca0eDloDdEiq/NLvKHvezMzapNUhsBzYCnhPxfTStoD7W7x8MzPrR6tD4EbgFeBzFdM/DyyLiBUtXr6ZmfWj8DYBSf+c//Pv8/tPSHoKeCoibs3neQ24PCKOAIiIJyWdD5ws6XngbuBgYBowo0nvwczMBqieDcM/qXh8cX5/K9CT/3tUfit3CrAeOB7YEXgQOCgi/qeuSs3MrOkKh0BE9LtHz6bmiYjXgbn5zczMhhCfRdTMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOAUvSuPHjkdTQDWj4NcaNH9/mT8JS1+qLypgNSatXreLaBx5vdxnM3HWndpdgifOagJlZwhwCZmYJcwiYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSWsUAhIGivpGknPSVon6TpJ4wq2HSfpckmrJL0o6SFJcyVt01jpZmbWqJoXlZE0GrgF2AAcCgQwF1gs6f0R8UI/bbcBfgVsAXwTWAXsBZwOTAIObvQNmJnZwBW5stiRwARgl4h4BEDSfcDDwFHAef203Zvsy37fiFiUT1ssaQxwoqTREfHigKs3M7OGFBkOmg4sKQUAQESsAG4HZtRou2V+v65i+rP5slWwTjMza4EiIbA7sKzK9OXA5Bptf0W2xnC2pMmS3ippGnA88IP+hpLMzKz1ioTAGGBtlelrgI7+GkbEy8CH8+Us
B54HbgZ+DhxbV6VmZtZ0RbYJQLYxuFLNoRxJWwNXAzsAh5BtGP4gMAd4DfiXTbSbBcwC6OzspLe3t2CZw9NIf3/WP/d/+/izB0VU+34vm0H6C/CziDiqYvrFwKcjYvt+2n4ZuAiYGBGPlk0/EpgP7BkR9/a3/O7u7li6dGnNNzJcSaJWH1jzSeLaBx5vdxnM3HUn93+bjPS/PUl3RUR3rfmKDActJ9suUGkycH+Ntu8D1pYHQO73+f1uBZZvZmYtUiQEFgJTJE0oTZDURbb758Iabf8MdEiaWDH9Q/n9/xUr08zMWqFICFwC9AELJM2QNB1YAKwG5pVmkjRe0muS5pS1vYxsY/ANkg6VNFXS14FzgbvIdjM1M7M2qRkC+W6c04CHgCuAq4AVwLSIWF82q4BR5a8ZEX3AFOAesqOMbyA7+Gw+8I8R8UZT3oWZmQ1Iob2DImIVMLPGPH1U2WMoIu4HDhpIcWZm1lo+i6iZWcIcAmZmCXMImJklzCFgZpYwh4CZWcIcAmZmCSt6AjmzESVO3RZ+vGu7y8jqMGsjh4AlSaevGzonkDut3VVYyjwcZGbDztiusUhq6AY01H5s19g2fwrN4TUBMxt2Hlv5GBesuaCtNcweM7uty28WrwmYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSXMIWBmljCHgJlZwhwCDegaN77tB6x0jRvf5k/BzIYzHyzWgJWrVxG9d7a1BvXs1dblm9nw5jUBM7OEOQTMzBLmEDAzS5hDwMwsYQ4BM7OEOQTMzBLmEDAzS5hDwMwsYYVCQNJYSddIek7SOknXSRpXdCGSdpP0E0lPS3pJ0oOSjh942WZm1gw1jxiWNBq4BdgAHAoEMBdYLOn9EfFCjfbdefte4EvAc8Ak4K0NVW5mZg0rctqII4EJwC4R8QiApPuAh4GjgPM21VDSZsDlwM0RcWDZU4sHXLGZmTVNkeGg6cCSUgAARMQK4HZgRo22PcBk+gkKMzNrnyIhsDuwrMr05WRf8P35cH6/taQlkl6V9KSk70l6Sz2FmplZ8xUZDhoDrK0yfQ3QUaPtTvn91cBFwElAN3AGMBY4sFojSbOAWQCdnZ309vYWKDNd/nyGN/ff8DUS+q7oqaSjyjQVaFda07gyIubk/+6VNAr4jqTJEXH/RguLmA/MB+ju7o6enp6CZabJn8/w5v4bvkZC3xUZDlpLtjZQqYPqawjlnsnvb6qYvii/37PA8s3MrEWKhMBysu0ClSYDG/2Kr9IWNl6TKK1FvFFg+WZm1iJFQmAhMEXShNIESV3A3vlz/fkF2fEF+1VM3ze/X1qoSjMza4kiIXAJ0AcskDRD0nRgAbAamFeaSdJ4Sa9JKo39ExHPAN8GjpZ0lqSPSzoJmANcXr7bqZmZDb6aG4Yj4gVJ04DzgSvIhnJuBmZHxPqyWQWMYuNgOQN4HjgGOBF4AjgHOLPh6s3MrCGF9g6KiFXAzBrz9FFlj6GICLKDxXzAmJnZEOOziJqZJcwhYGaWsKIHi5mZDRlx6rbw73Nqz9hCx5+6bVuX3ywOATMbdnT6Oi5Yc0Fba5g9ZjZxWltLaAoPB5mZJcwhYGaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCHAJmZgnzwWINiFO3hcUfa38NVrex48Yxc9edas84CHWYtZNDoAE6fR3Re2d7a+jZa0QctTjYVq1c2fBrSCI7Sa7Z8OXhIDOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGGFQkDSWEnXSHpO0jpJ10mq+/SHkk6WFJJuq79UMzNrtpohIGk0cAuwK3AocAgwCVgsaZuiC5I0ATgFeHJgpZqZWbMVOZX0kcAEYJeIeARA0n3Aw8BRwHkFl/V94Cpgl4LLNTOzFisyHDQdWFIKAICIWAHcDswoshBJnwU+AJw8kCLNzKw1ioTA7sCyKtOXA5Nr
NZbUAZwP/GtErKmvPDMza6UiITAGWFtl+hqgo0D7c4CHgMuKl2VmZoOh6Nh8tWvoqVYjSf8AfAH4QNRxHT5Js4BZAJ2dnfT29hZtmiR/Pu3jzz5tI6H/i4TAWrK1gUodVF9DKDcPuBR4TNI7ypY5Kn/8UkRsqGwUEfOB+QDd3d3R09NToMx0+fNpH3/2aRsJ/V8kBJaTbReoNBm4v0bb3fLb0VWeWwucAFxQoAYzM2uBIiGwEDhX0oSI+F8ASV3A3sBJNdpOrTLtAmAUcBzwSJXnzcxskBQJgUuAY4EFkr5Btn3gTGA12XAPAJLGA48CZ0TEGQAR0Vv5YpKeBTav9pyZmQ2umnsHRcQLwDSyPXyuIDvgawUwLSLWl80qsl/4Ph+RmdkwUWjvoIhYBcysMU8fBfYYioieIss0M7PW8692M7OEOQTMzBLmE7mZ2bDz7vHvZvaY2W2vYSRwCJjZsLO6b3XDryGJOk5kMGJ5OMjMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhPlU0g0YP3Yc6tmr7TWYmQ2UQ6ABfatWNvwaPqe5mbWTh4PMzBLmEDAzS5hDwMwsYQ4BM7OEOQTMzBLmEDAzS5hDwMwsYYVCQNJYSddIek7SOknXSap5lJKkbknzJT0g6UVJqyRdJWnnxks3M7NG1QwBSaOBW4BdgUOBQ4BJwGJJ29Ro/hlgd+B7wCeAk4APAEsljW2gbjMza4IiRwwfCUwAdomIRwAk3Qc8DBwFnNdP27Mj4qnyCZJuB1bkrztnIEWbmVlzFBkOmg4sKQUAQESsAG4HZvTXsDIA8mkrgaeAd9VXqpmZNVuRENgdWFZl+nJgcr0LlLQbsAPwp3rbmplZcxUZDhoDrK0yfQ3QUc/CJG0O/IBsTeDSfuabBcwC6OzspLe3t57FDDsj/f2NZO674c39B6p1BktJrwDfjYiTK6Z/C/i3iCh8JlJJPwCOAPaPiEVF2nR3d8fSpUuLLmLY8VlEhy/33fA20vtP0l0R0V1rviJf4GvJ1gYqdVB9DWFTBX2b7Nf9oUUDwMzMWqtICCwn2y5QaTJwf5GFSDqFbPfQr0TEFcXLMzOzViqyYXghMEXShNIESV3A3vlz/ZL0FWAucEpEXDiwMs3MrBWKhMAlQB+wQNIMSdOBBcBqYF5pJknjJb0maU7ZtM8AFwA3ArdImlJ2q3vPIjMza66aw0ER8YKkacD5wBWAgJuB2RGxvmxWAaP462DZL5++X34rdyvQM+DKzcysYYX27ImIVcDMGvP0kX3hl087DDhsYKWZmVmr+SyiZmYJcwiYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSXMIWBmljCHgJlZwhwCZmYJcwiYmSWsUAhIGivpGknPSVon6TpJ4wq23VrSOZKekPSSpDskfaSxss3MrBlqhoCk0cAtwK7AocAhwCRgsaRtCizjUuBIYA7wSeAJ4JeS9hxo0WZm1hybF5jnSGACsEtEPAIg6T7gYeAo4LxNNZS0B/BZ4PCI+GE+7VZgOXAGML2h6s3MqpDUlPkiohnlDGlFhoOmA0tKAQAQESuA24EZBdq+Clxd1vY14L+AfSVtVXfFZmY1RERTbikoEgK7A8uqTF8OTC7QdkVEvFil7ZbAxALLNzOzFikSAmOAtVWmrwE6Gmhbet7MzNqkyDYBgGrrRUUG3TSQtpJmAbMAOjs76e3tLbCooWfq1KmF5qs1Lrl48eJmlGN1KtJ/7jsb7oqEwFqq/2LvoPqv/HJrgGq7knaUPb+RiJgPzAfo7u6Onp6eAmUOPamMKY5U7j9LQZHhoOVkY/uVJgP3F2i7
c76baWXbV4BHNm5iZmaDpUgILASmSJpQmiCpC9g7f65W2y2AT5e13Rw4GFgUERvqrNfMzJqoSAhcAvQBCyTNkDQdWACsBuaVZpI0XtJrkuaUpkXEPWS7h14g6UuSPka2e+jOwKnNextmZjYQNUMgIl4ApgEPAVcAVwErgGkRsb5sVgGjqrzmF4EfAnOB64GxwH4RcXfD1ZuZWUMK7R0UEauAmTXm6aPKXj8R8RLw1fxmZmZDiM8iamaWMIeAmVnCHAJmZgnTUD8gRtJTwMp219FC2wFPt7sIGxD33fA20vtvfERsX2umIR8CI52kpRHR3e46rH7uu+HN/ZfxcJCZWcIcAmZmCXMItN/8dhdgA+a+G97cf3ibgJlZ0rwmYGaWsCRDQNJhkqLs9rykeyUdm5/ldLDqOE1SXatiknol9baopBGnSl+/IulRSWdJ2rrNtfVJuqzscanWrrYVNYJU6fvy28fzec6StEjSM/n0w9pc9qAbtC+8IerTwGPAtvm/LwR2AOb016iJ/gO4sc42x7SikASU+vptwIHAyfm/j2tnUTYoSn1frnQtlOOAe4CfA18YzKKGitRD4J6IKF3YZpGkicBsqoSAsusIbhERrzRr4RHxGBv/56zVptaFfKy68r6+SdIk4AhJx0fEG+0szFquvO8rvT0i3sj/9pMMgSSHg/pxJ/A2STvkq+pXSjpc0gNkV0LbH0DSaElnS1qRDy+skHSKpL/6PCVtL+liSaslbcjvr5C0Vf78RsNBko6X9CdJL0laK2mppAPLnt9oOEjSLpJ+KunZvN0SSftVzHNavro7SdL1ktZLWilpTmXdibgbeAvZUaMASNpZ0lWSnsr7657yz75svj3yz/uZ/PN+UNLJZc/vI+kGSU9IelHSMklfkzRqcN6aFeUfAF4TqLQz8DpQuk7CVGBP4HTgSaAv32bwS7JLZJ4J/BGYAnyT7FrMXwOQ1AH8Np82F7iPbKhpBrAlsNFV1SR9DvgucAbwG7IvqfdT/RrPpTY7AbcBzwPHAs8BXwaul/TJiPhFRZOfkl3f4XzgU/l7W51PS0kX2Wf1DICkscDvyPr5BOApsivgXSvpgIhYmM/3QaCX7NKoJ5CtyU0i66eSCcDNZMOLLwPdwGnA9sBJLX1XVs2oim19ERGvt62aoSYikrsBhwEB7EIWhB3AUWQB8LN8nj7gRWDHiraH5G0/UjH9FLK1hR3yx2fkr/d3/dRxWtYFbz6+CLi7Ru29QG/Z43OB14CJZdNGAQ+Wv1ZpWcAXK17vj2SX+mx7vwxiXx+ef2bHls13KdkX/zsr2t9ENpxQevxrstAcXXD5ypd7CrAW2KzsuT7gsiq1drX7cxsJt7LPs/J2W5V5J+bPHdbuugf7luIwQLkHgFeBNcDFZFdNO7zs+SUR8eeKNvuRndDut5I2L92ARWTXU56Sz7cPcGdE/KGOeu4E9pR0oaSPSxpdoM1H8jrfHPOM7FfOj/PX2rZi/usrHi8DxtVR43BV3teXAvMi4qKy5/cDbgCeq+jXXwJ7SNo274+9gasi4sVNLUjS30iaJ2kl2Q+DV8nWBt9BtjZog+tAYK+y2xHtLWdoSX046ECy1fnngZUR8XLF809UabMDMJ7sD7uad5bd31tnPT8Ctib7T3oM8KqkG4CvRnbltmrGANWC5s9kv0I7gHVl09dUzLchX+ZIV+rr7cmucneMpN9FxI/y53cg2zC4qY2D7yT7Qt+Mfjbm59tXFgI7ka19PQC8BBxAtjaQwmc91CyLTW8YTl7qIVDrP0e1ffifIbvG8kGbaNOX3z8NvKueYiJbL50HzMu3KexDto3gauBDm2i2BtixyvQdyeqv/NJP1Zt9LekWsm0050i6NrLraD9Dth3m7E20f5xsmO0N+u/X95BtAzgkIq4sTZT0qcbfglnzpT4cNBA3AmOB9RGxtMqtdH7yRcAHJe0xkIVExNqIuBr4b+C9/cx6KzCl/ACjfC+Ug4E/RMTzA1n+
SBYRG4Cvk/36Lx13cSPZxt3lm+jXDfkQ0G3A5yW9ZRMvXxrCe3NNUdIWwOda8mbMGpT6msBAXAV8EbhZ0nfJhny2JPsFOB04IP+yOB/4LPArSXPJNsBuR7Z30NHVvpwlzScbmrqDbC+VvyXbEL2on3rOJ9sAdpOkU8mGfo7J2+7f6JsdqSJioaQ7gRMlXUR2bMjvgV/nj/vIhtLeC0yIiNK2ohPJgveOvP8fI9sbaM+IOA74E9k2o29Jep0sDE4YvHdm9ZD0UbIhwtLadLek9QARcU3bChtEDoE6RcSrkvYl29VvFtlupS8Aj5JtdH0ln+9ZSXuTbRA8iWxM+S/ALaV5qridLGAOAd5ONgRxJXBqP/U8LunDZMMY3we2IjsCcv+IqPdo5NR8g2zD79ERcb6k0q6cZ5F9MTxDtuH88lKDiLgz79czyHYB3YrsS/+H+fOvSDqAbE+vH5ENx/0nsAq4ZHDeltXhdOCjZY+/nN8g26Y24vksomZmCfM2ATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhP0//t3ZjeY2sNgAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "draw_plot([pres_all, recalls_all, f1_all], title='Scores for All Elements')" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAEBCAYAAACe6Rn8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAF2hJREFUeJzt3X2QXFWdxvHvQ3gzKDoRGBadZIhhgaDCuoOmFtedRAVcNIHKCpaKIEpgESQobkGh4S2LUiBBodCEZQWBclkBTVYQAySDgkQJCJggQthMEhYUJIEQXgKB3/5xb1NtpzN9e6Z7embO86nq6vTpe/r+uk+mn77n3u6riMDMzNK0VasLMDOz1nEImJklzCFgZpYwh4CZWcIcAmZmCXMImJklzCFgZpYwh4CZWcIcAmZmCdu61QXUstNOO0VnZ2eryzAzG1buvffev0TEzrWWG/Ih0NnZydKlS1tdhpnZsCJpVZHlPB1kZpYwh4CZWcIcAmZmCSsUApLeKekSSXdLelFSSOos2HcrSadL6pX0sqQHJE0fSNFmZtYYRbcEJgCHA+uAX9W5jnOBs4BLgY8BS4AfS/rnOh/HzMwarOjRQb+MiHYASV8EDizSSdIuwKnAtyLiwrx5saQJwLeAm+us18zMGqjQlkBEvN7Pxz8I2Ba4pqL9GuA9knbv5+OamVkDNHvH8D7ARmBFRfvy/Hpik9dvZmZ9aPaXxcYAz8bmJzJeW3b/ZiTNAGYAtLe309PT07QCm2ny5MkNeZzFixc35HGsPo0YP49da/hvr7hmh4CAameyV1+dImIeMA+gq6sruru7G1/ZINg8+zYnqdByNvhqjYvHbujy315xzZ4OWgu0Sap8028ru9/MzFqk2SGwHNgOeFdFe2lfwENNXr+ZmfWh2SFwC/AK8JmK9s8CyyJiZZPXb2ZmfSi8T0DSv+T//Pv8+mOSngaejog78mU2AVdFxBcAIuIpSXOA0yU9D9wHHAFMAaY16DmYmVk/1bNj+McVty/Lr+8AuvN/j8ov5c4ANgAnA7sCfwQOj4j/qatSMzNruMIhEBF9HtGzpWUi4jVgdn4xM7MhxL8iamaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAJWnsuHFIGtAFGPBjjB03rsWvhKWunnMMm40Ya1av5oaHn2h1GUzfa7dWl2CJ85aAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCHAJmZglzCJiZJaxQCEjqkHS9pOckrZd0o6SxBfuOlXSVpNWSXpT0iKTZknYYWOlmZjZQNb8sJmk0sAjYCBwFBDAbWCzpvRHxQh99dwBuA7YBvgGsBvYHzgb2AI4Y6BMwM7P+K/KN4WOB8cCeEbECQNKDwKPAccBFffQ9gOzN/qCIWJi3LZY0BjhV0uiIeLHf1ZuZ2YAUmQ6aCiwpBQBARKwE7gKm1ei7bX69vqL92XzdKlinmZk1QZEQ2AdYVqV9OTCxRt/byLYYzpc0UdKbJU0BTga+39dUkpmZNV+REBgDrKvSvhZo66tjRLwMfDBfz3LgeeB24GfAiXVVamZmDVf0V0SjSlvNq
RxJ2wPXAbsAR5LtGH4/MAvYBPzrFvrNAGYAtLe309PTU7DM4WmkPz/rm8e/dfzagyKqvb+XLSD9GfhpRBxX0X4Z8MmI2LmPvl8CLgUmRMRjZe3HAvOA/SLigb7W39XVFUuXLq35RIYrSdQaA2s8SUPmp6Q9/q0x0v/2JN0bEV21lisyHbScbL9ApYnAQzX6vgdYVx4Aud/m13sXWL+ZmTVJkRBYAEySNL7UIKmT7PDPBTX6/glokzShov0D+fX/FSvTzMyaoUgIXA70AvMlTZM0FZgPrAHmlhaSNE7SJkmzyvpeSbYz+GZJR0maLOlrwIXAvWSHmZqZWYvUDIH8MM4pwCPA1cC1wEpgSkRsKFtUwKjyx4yIXmAScD/Zt4xvJvvy2TzgoxHxekOehZmZ9Uuho4MiYjUwvcYyvVQ5YigiHgIO709xZmbWXP4VUTOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ8DMLGEOATOzhDkEzMwS5hAwM0uYQ2AAOseOQ9KALsCA+neOHdfiV8Fs8HV0drT8b6+js6PFr0JjFDrRvFW3as1qoueeltag7v1bun6zVnh81eNcvPbiltYwc8zMlq6/UbwlYGaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCCh0iKqkDmAN8FBBwGzAzIlYX7L83cA4wGdgBWA1cFhHf6U/RZgMVZ+4IP9qr1WVkdZi1UM0QkDQaWARsBI4CApgNLJb03oh4oUb/rrx/D/BF4DlgD+DNA6rcbAB09npuePiJVpfB9L12I85qdRWWsiJbAscC44E9I2IFgKQHgUeB44CLttRR0lbAVcDtEXFY2V2L+12xmZk1TJF9AlOBJaUAAIiIlcBdwLQafbuBifQRFGZm1jpFQmAfYFmV9uVkb/B9+WB+vb2kJZJelfSUpO9KelM9hZqZWeMVmQ4aA6yr0r4WaKvRd7f8+jrgUuA0oItsJ3EHcFi1TpJmADMA2tvb6enpKVBmuvz6DG8ev+FrJIxd0R+QiyptKtCvtKVxTUTMyv/dI2kU8C1JEyPioc1WFjEPmAfQ1dUV3d3dBctMk1+f4c3jN3yNhLErMh20jmxroFIb1bcQyj2TX99a0b4wv96vwPrNzKxJioTAcrL9ApUmApt9iq/SFzbfkihtRbxeYP1mZtYkRUJgATBJ0vhSg6RO4ID8vr78nOz7BQdXtB+UXy8tVKWZmTVFkRC4HOgF5kuaJmkqMB9YA8wtLSRpnKRNkkpz/0TEM8A3geMlnSfpI5JOA2YBV5UfdmpmZoOv5o7hiHhB0hSyn424mmwq53ayn43YULaogFFsHiznAM8DJwCnAk8CFwDnDrh6MzMbkEJHB+W/ETS9xjK9VDliKCKC7Mti/sKYmdkQ418RNTNLmEPAzCxhDgEzs4Q5BMzMEuYQMDNLmEPAzCxhDgEzs4Q5BMzMEuYQMDNLmEPAzCxhDgEzs4Q5BMzMEuYQMDNLmEPAzCxhDgEzs4Q5BMzMEuYQMDNLmEPAzCxhhU4vaWY2lMSZO8J3ZrW0hpPP3LGl628Uh4CZDTs6ez0Xr724pTXMHDOTOKulJTSEp4PMzBLmEDAzS5hDwMwsYQ4BM7OEOQTMzBLmEDAzS1ihEJDUIel6Sc9JWi/pRklj612ZpNMlhaQ76y/VzMwarWYISBoNLAL2Ao4CjgT2ABZL2qHoiiSNB84AnupfqWZm1mhFvix2LDAe2DMiVgBIehB4FDgOuKjgur4HXAvsWXC9ZmbWZEWmg6YCS0oBABARK4G7gGlFViLp08D7gNP7U6SZmTVHkRDYB1hWpX05MLFWZ0ltwBzg3yJibX3lmZlZMxUJgTHAuirta4G2Av0vAB4BrixelpmZDYaic/NRpU21Okn6R+BzwPsiotpjbKnfDGAGQHt7Oz09PUW7Dqo4c0dY/
OGW1zBUXx8rxuM3fI2EsSsSAuvItgYqtVF9C6HcXOAK4HFJbytb56j89ksRsbGyU0TMA+YBdHV1RXd3d4EyB58mryd67mltDd37E2d1t7QGG5ih+v/bahsJY1ckBJaT7ReoNBF4qEbfvfPL8VXuWwecArT292DNzBJWJAQWABdKGh8R/wsgqRM4ADitRt/JVdouBkYBJwErqtxvZmaDpEgIXA6cCMyX9HWy/QPnAmvIpnsAkDQOeAw4JyLOAYiInsoHk/QssHW1+8zMbHDVDIGIeEHSFLLDPK8m2yF8OzAzIjaULSqyT/j+PSIb8jrGjmX6Xru1ugw6xtb96ytmDVXo6KCIWA1Mr7FMLwWOGIqI7iLrNGum1atWDfgxJFHHQW9mQ5I/tZuZJcwhYGaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCHAJmZglzCJiZJazQmcXMzIaSd457JzPHzGx5DSOBQ8DMhp01vWsG/Bg+PWjG00FmZglzCJiZJcwhYGaWMIeAmVnCHAJmZglzCJiZJcwhYGaWMIeAmVnCCoWApA5J10t6TtJ6STdKGlugX5ekeZIelvSipNWSrpW0+8BLNzOzgaoZApJGA4uAvYCjgCOBPYDFknao0f1TwD7Ad4GPAacB7wOWSuoYQN1mZtYARX424lhgPLBnRKwAkPQg8ChwHHBRH33Pj4inyxsk3QWszB93Vn+KNjOzxigyHTQVWFIKAICIWAncBUzrq2NlAORtq4CngXfUV6qZmTVakRDYB1hWpX05MLHeFUraG9gF+EO9fc3MrLGKTAeNAdZVaV8LtNWzMklbA98n2xK4oo/lZgAzANrb2+np6alnNcnx69M6fu2HN49f8Z+SrvZ7q+rH+i4F/gE4JCKqBUu2soh5wDyArq6u6O7u7seq0uHXp3X82g9vHr9iIbCObGugUhvVtxCqkvRNsk/3R0XEwqL9zMyseYqEwHKy/QKVJgIPFVmJpDPIDg/9ckRcXbw8MzNrpiI7hhcAkySNLzVI6gQOyO/rk6QvA7OBMyLikv6VaWZmzVAkBC4HeoH5kqZJmgrMB9YAc0sLSRonaZOkWWVtnwIuBm4BFkmaVHap+8giMzNrrJrTQRHxgqQpwBzgarIdwrcDMyNiQ9miAkbx18FycN5+cH4pdwfQ3e/Kh4BxHWNR9/4tr8HMrL801E+03NXVFUuXLm11GU3jk10PXx674W2kj5+keyOiq9Zy/hVRM7OEOQTMzBLmEDAzS5hDwMwsYQ4BM7OEOQTMzBLmEDAzS5hDwMwsYQ4BM7OEOQTMzBLmEDAzS5hDwMwsYQ4BM7OEOQTMzBLmEDAzS5hDwMwsYQ4BM7OEOQTMzBLmEDAzS5hDwMwsYQ4BM7OEOQTMzBLmEDAzS5hDwMwsYQ4BM7OEOQTMzBJWKAQkdUi6XtJzktZLulHS2IJ9t5d0gaQnJb0k6W5JHxpY2WZm1gg1Q0DSaGARsBdwFHAksAewWNIOBdZxBXAsMAv4OPAk8AtJ+/W3aDMza4ytCyxzLDAe2DMiVgBIehB4FDgOuGhLHSXtC3waOCYifpC33QEsB84Bpg6oejMzG5Ai00FTgSWlAACIiJXAXcC0An1fBa4r67sJ+C/gIEnb1V2xmZk1TJEQ2AdYVqV9OTCxQN+VEfFilb7bAhMKrN/MrC6Sal6KLJeCItNBY4B1VdrXAm0D6Fu638ysoSKi1SUMG0VCAKDaK1okJtWfvpJmADMA2tvb6enpKbCqoWfy5MmFlqv1iWPx4sWNKMfqVGT8PHY23BUJgXVU/8TeRvVP+eXWAtUOJW0ru38zETEPmAfQ1dUV3d3dBcocevxpZHjz+FkKiuwTWE42t19pIvBQgb6754eZVvZ9BVixeRczMxssRUJgATBJ0vhSg6RO4ID8vlp9twE+WdZ3a+AIYGFEbKyzXjMza
6AiIXA50AvMlzRN0lRgPrAGmFtaSNI4SZskzSq1RcT9ZIeHXizpi5I+THZ46O7AmY17GmZm1h81QyAiXgCmAI8AVwPXAiuBKRGxoWxRAaOqPObngR8As4GbgA7g4Ii4b8DVm5nZgBQ6OigiVgPTayzTS5WjfiLiJeAr+cXMzIYQ/4qomVnCHAJmZglzCJiZJUxD/Qsxkp4GVrW6jibaCfhLq4uwfvHYDW8jffzGRcTOtRYa8iEw0klaGhFdra7D6uexG948fhlPB5mZJcwhYGaWMIdA681rdQHWbx674c3jh/cJmJklzVsCZmYJSzIEJB0tKcouz0t6QNKJ+a+cDlYdZ0mqa1NMUo+kniaVNOJUGetXJD0m6TxJ27e4tl5JV5bdLtXa2bKiRpAqY19++Ui+zHmSFkp6Jm8/usVlD7pBe8Mboj4JPA7smP/7EmAXYFZfnRroP4Bb6uxzQjMKSUBprN8CHAacnv/7pFYWZYOiNPblSudCOQm4H/gZ8LnBLGqoSD0E7o+I0oltFkqaAMykSggoO4/gNhHxSqNWHhGPs/l/zlp9ap3Ix6orH+tbJe0BfEHSyRHxeisLs6YrH/tKb42I1/O//SRDIMnpoD7cA7xF0i75pvo1ko6R9DDZmdAOAZA0WtL5klbm0wsrJZ0h6a9eT0k7S7pM0hpJG/PrqyVtl9+/2XSQpJMl/UHSS5LWSVoq6bCy+zebDpK0p6SfSHo277dE0sEVy5yVb+7uIekmSRskrZI0q7LuRNwHvInsW6MASNpd0rWSns7H6/7y175suX3z1/uZ/PX+o6TTy+4/UNLNkp6U9KKkZZK+KmnU4Dw1K8ofALwlUGl34DWgdJ6EycB+wNnAU0Bvvs/gF2SnyDwX+D0wCfgG2bmYvwogqQ34dd42G3iQbKppGrAtsNlZ1SR9Bvg2cA7wK7I3qfdS/RzPpT67AXcCzwMnAs8BXwJukvTxiPh5RZefkJ3fYQ7wify5rcnbUtJJ9lo9AyCpA/gN2TifAjxNdga8GyQdGhEL8uXeD/SQnRr1FLItuT3IxqlkPHA72fTiy0AXcBawM3BaU5+VVTOqYl9fRMRrLatmqImI5C7A0UAAe5IFYRtwHFkA/DRfphd4Edi1ou+Red8PVbSfQba1sEt++5z88f6ujzrOyobgjduXAvfVqL0H6Cm7fSGwCZhQ1jYK+GP5Y5XWBXy+4vF+T3aqz5aPyyCO9TH5a3Zi2XJXkL3xv72i/61k0wml278kC83RBdevfL1nAOuArcru6wWurFJrZ6tft5FwKXs9Ky93Vll2Qn7f0a2ue7AvKU4DlHsYeBVYC1xGdta0Y8ruXxIRf6roczDZD9r9WtLWpQuwkOx8ypPy5Q4E7omI39VRzz3AfpIukfQRSaML9PlQXucbc56Rfcr5Uf5YO1Ysf1PF7WXA2DpqHK7Kx/oKYG5EXFp2/8HAzcBzFeP6C2BfSTvm43EAcG1EvLilFUn6G0lzJa0i+2DwKtnW4NvItgZtcB0G7F92+UJryxlaUp8OOoxsc/55YFVEvFxx/5NV+uwCjCP7w67m7WXXD9RZzw+B7cn+k54AvCrpZuArkZ25rZoxQLWg+RPZp9A2YH1Z+9qK5Tbm6xzpSmO9M9lZ7k6Q9JuI+GF+/y5kOwa3tHPw7WRv6FvRx878fP/KAmA3sq2vh4GXgEPJtgZSeK2HmmWx5R3DyUs9BGr956h2DP8zZOdYPnwLfXrz678A76inmMi2S+cCc/N9CgeS7SO4DvjAFrqtBXat0r4rWf2Vb/qpemOsJS0i20dzgaQbIjuP9jNk+2HO30L/J8im2V6n73F9F9k+gCMj4ppSo6RPDPwpmDVe6tNB/XEL0AFsiIilVS6l3ydfCLxf0r79WUlErIuI64D/Bt7dx6J3AJPKv2CUH4VyBPC7iHi+P+sfySJiI/A1sk//pe9d3EK2c3f5FsZ1Yz4FdCfwWUlv2sLDl6bw3thSlLQN8JmmPBmzAUp9S
6A/rgU+D9wu6dtkUz7bkn0CnAocmr9ZzAE+DdwmaTbZDtidyI4OOr7am7OkeWRTU3eTHaXyt2Q7ohf2Uc8csh1gt0o6k2zq54S87yEDfbIjVUQskHQPcKqkS8m+G/Jb4Jf57V6yqbR3A+MjorSv6FSy4L07H//HyY4G2i8iTgL+QLbP6N8lvUYWBqcM3jOzekj6J7IpwtLWdJekDQARcX3LChtEDoE6RcSrkg4iO9RvBtlhpS8Aj5HtdH0lX+5ZSQeQ7RA8jWxO+c/AotIyVdxFFjBHAm8lm4K4Bjizj3qekPRBsmmM7wHbkX0D8pCIqPfbyKn5OtmO3+MjYo6k0qGc55G9MTxDtuP8qlKHiLgnH9dzyA4B3Y7sTf8H+f2vSDqU7EivH5JNx/0nsBq4fHCeltXhbOCfym5/Kb9Atk9txPOviJqZJcz7BMzMEuYQMDNLmEPAzCxhDgEzs4Q5BMzMEuYQMDNLmEPAzCxhDgEzs4Q5BMzMEvb/ucuWDpfEXmUAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "draw_plot([pres_non_text, recalls_non_text, f1_non_text], title='Score for Non-text Elements')" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/CDM/result_processing/eval_classes.py b/CDM/result_processing/eval_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..64f78e231017574ad0002c002e04178c288fd28f --- /dev/null +++ b/CDM/result_processing/eval_classes.py @@ -0,0 +1,215 @@ +import json +import numpy as np +import cv2 +from glob import glob +from os.path import join as pjoin +from tqdm import tqdm + +class_map = {'0':'Button', '1':'CheckBox', '2':'Chronometer', '3':'EditText', '4':'ImageButton', '5':'ImageView', + '6':'ProgressBar', '7':'RadioButton', '8':'RatingBar', '9':'SeekBar', '10':'Spinner', '11':'Switch', + '12':'ToggleButton', '13':'VideoView', '14':'TextView'} + + +def resize_label(bboxes, d_height, gt_height, bias=0): + bboxes_new = [] + scale = gt_height / d_height + for bbox in bboxes: + bbox = [int(b * scale + bias) for b in bbox] + bboxes_new.append(bbox) + return bboxes_new + + +def draw_bounding_box(org, corners, color=(0, 255, 0), line=2, show=False): + board = org.copy() + for i in range(len(corners)): + board = cv2.rectangle(board, (corners[i][0], corners[i][1]), (corners[i][2], corners[i][3]), color, line) + if show: + cv2.imshow('a', cv2.resize(board, (500, 1000))) + cv2.waitKey(0) + return board + + +def 
load_detect_result_json(reslut_file_root, shrink=4): + def is_bottom_or_top(corner): + column_min, row_min, column_max, row_max = corner + if row_max < 36 or row_min > 725: + return True + return False + + result_files = glob(pjoin(reslut_file_root, '*.json')) + compos_reform = {} + print('Loading %d detection results' % len(result_files)) + for reslut_file in tqdm(result_files): + img_name = reslut_file.split('\\')[-1].split('.')[0] + compos = json.load(open(reslut_file, 'r'))['compos'] + for compo in compos: + if compo['column_max'] - compo['column_min'] < 10 or compo['row_max'] - compo['row_min'] < 10: + continue + if is_bottom_or_top((compo['column_min'], compo['row_min'], compo['column_max'], compo['row_max'])): + continue + if img_name not in compos_reform: + compos_reform[img_name] = {'bboxes': [[compo['column_min'] + shrink, compo['row_min'] + shrink, compo['column_max'] - shrink, compo['row_max'] - shrink]], + 'categories': [compo['category']]} + else: + compos_reform[img_name]['bboxes'].append([compo['column_min'] + shrink, compo['row_min'] + shrink, compo['column_max'] - shrink, compo['row_max'] - shrink]) + compos_reform[img_name]['categories'].append(compo['category']) + return compos_reform + + +def load_ground_truth_json(gt_file): + def get_img_by_id(img_id): + for image in images: + if image['id'] == img_id: + return image['file_name'].split('/')[-1][:-4], (image['height'], image['width']) + + def cvt_bbox(bbox): + ''' + :param bbox: [x,y,width,height] + :return: [col_min, row_min, col_max, row_max] + ''' + bbox = [int(b) for b in bbox] + return [bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]] + + data = json.load(open(gt_file, 'r')) + images = data['images'] + annots = data['annotations'] + compos = {} + print('Loading %d ground truth' % len(annots)) + for annot in tqdm(annots): + img_name, size = get_img_by_id(annot['image_id']) + if img_name not in compos: + compos[img_name] = {'bboxes': [cvt_bbox(annot['bbox'])], 'categories': 
[class_map[str(annot['category_id'])]], 'size': size} + else: + compos[img_name]['bboxes'].append(cvt_bbox(annot['bbox'])) + compos[img_name]['categories'].append(class_map[str(annot['category_id'])]) + return compos + + +def eval(detection, ground_truth, img_root, show=True, no_text=False, only_text=False): + def compo_filter(compos, flag): + if not no_text and not only_text: + return compos + compos_new = {'bboxes': [], 'categories': []} + for k, category in enumerate(compos['categories']): + if only_text: + if flag == 'det' and category != 'TextView': + continue + if flag == 'gt' and category != 'TextView': + continue + elif no_text: + if flag == 'det' and category == 'TextView': + continue + if flag == 'gt' and category == 'TextView': + continue + + compos_new['bboxes'].append(compos['bboxes'][k]) + compos_new['categories'].append(category) + return compos_new + + def match(org, d_bbox, d_category, gt_compos, matched): + ''' + :param matched: mark if the ground truth component is matched + :param d_bbox: [col_min, row_min, col_max, row_max] + :param gt_bboxes: list of ground truth [[col_min, row_min, col_max, row_max]] + :return: Boolean: if IOU large enough or detected box is contained by ground truth + ''' + area_d = (d_bbox[2] - d_bbox[0]) * (d_bbox[3] - d_bbox[1]) + gt_bboxes = gt_compos['bboxes'] + gt_categories = gt_compos['categories'] + for i, gt_bbox in enumerate(gt_bboxes): + if matched[i] == 0: + continue + area_gt = (gt_bbox[2] - gt_bbox[0]) * (gt_bbox[3] - gt_bbox[1]) + col_min = max(d_bbox[0], gt_bbox[0]) + row_min = max(d_bbox[1], gt_bbox[1]) + col_max = min(d_bbox[2], gt_bbox[2]) + row_max = min(d_bbox[3], gt_bbox[3]) + # if not intersected, area intersection should be 0 + w = max(0, col_max - col_min) + h = max(0, row_max - row_min) + area_inter = w * h + if area_inter == 0: + continue + iod = area_inter / area_d + iou = area_inter / (area_d + area_gt - area_inter) + # if show: + # cv2.putText(org, (str(round(iou, 2)) + ',' + str(round(iod, 
2))), (d_bbox[0], d_bbox[1]), + # cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) + + if iou > 0.9 or iod == 1: + if d_category == gt_categories[i]: + matched[i] = 0 + return True + return False + + amount = len(detection) + TP, FP, FN = 0, 0, 0 + pres, recalls, f1s = [], [], [] + for i, image_id in enumerate(detection): + TP_this, FP_this, FN_this = 0, 0, 0 + img = cv2.imread(pjoin(img_root, image_id + '.jpg')) + d_compos = detection[image_id] + if image_id not in ground_truth: + continue + gt_compos = ground_truth[image_id] + + org_height = gt_compos['size'][0] + + d_compos = compo_filter(d_compos, 'det') + gt_compos = compo_filter(gt_compos, 'gt') + + d_compos['bboxes'] = resize_label(d_compos['bboxes'], 800, org_height) + matched = np.ones(len(gt_compos['bboxes']), dtype=int) + for j, d_bbox in enumerate(d_compos['bboxes']): + if match(img, d_bbox, d_compos['categories'][j], gt_compos, matched): + TP += 1 + TP_this += 1 + else: + FP += 1 + FP_this += 1 + FN += sum(matched) + FN_this = sum(matched) + + try: + pre_this = TP_this / (TP_this + FP_this) + recall_this = TP_this / (TP_this + FN_this) + f1_this = 2 * (pre_this * recall_this) / (pre_this + recall_this) + except: + print('empty') + continue + + pres.append(pre_this) + recalls.append(recall_this) + f1s.append(f1_this) + if show: + print(image_id + '.jpg') + print('[%d/%d] TP:%d, FP:%d, FN:%d, Precesion:%.3f, Recall:%.3f' % ( + i, amount, TP_this, FP_this, FN_this, pre_this, recall_this)) + # cv2.imshow('org', cv2.resize(img, (500, 1000))) + broad = draw_bounding_box(img, d_compos['bboxes'], color=(255, 0, 0), line=3) + draw_bounding_box(broad, gt_compos['bboxes'], color=(0, 0, 255), show=True, line=2) + + if i % 200 == 0: + precision = TP / (TP + FP) + recall = TP / (TP + FN) + f1 = 2 * (precision * recall) / (precision + recall) + print( + '[%d/%d] TP:%d, FP:%d, FN:%d, Precesion:%.3f, Recall:%.3f, F1:%.3f' % (i, amount, TP, FP, FN, precision, recall, f1)) + + precision = TP / (TP + FP) + recall = TP / (TP + 
FN) + print('[%d/%d] TP:%d, FP:%d, FN:%d, Precesion:%.3f, Recall:%.3f, F1:%.3f' % (i, amount, TP, FP, FN, precision, recall, f1)) + # print("Average precision:%.4f; Average recall:%.3f" % (sum(pres)/len(pres), sum(recalls)/len(recalls))) + + return pres, recalls, f1s + + +no_text = True +only_text = False + +# detect = load_detect_result_json('E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_cls\\ip') +# detect = load_detect_result_json('E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_cls\\merge') +detect = load_detect_result_json('E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_v3\\merge') +# detect = load_detect_result_json('E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_v3\\ocr') +gt = load_ground_truth_json('E:\\Mulong\\Datasets\\rico\\instances_test.json') +eval(detect, gt, 'E:\\Mulong\\Datasets\\rico\\combined', show=False, no_text=no_text, only_text=only_text) diff --git a/CDM/result_processing/eval_size.py b/CDM/result_processing/eval_size.py new file mode 100644 index 0000000000000000000000000000000000000000..1c90ccea854b876c6841f901afdebbedf4921fd9 --- /dev/null +++ b/CDM/result_processing/eval_size.py @@ -0,0 +1,219 @@ +import json +import numpy as np +import cv2 +from glob import glob +from os.path import join as pjoin +from tqdm import tqdm + + +def resize_label(bboxes, d_height, gt_height, bias=0): + bboxes_new = [] + scale = gt_height / d_height + for bbox in bboxes: + bbox = [int(b * scale + bias) for b in bbox] + bboxes_new.append(bbox) + return bboxes_new + + +def draw_bounding_box(org, corners, color=(0, 255, 0), line=2, show=False): + board = cv2.resize(org, (608, 1024)) + for i in range(len(corners)): + board = cv2.rectangle(board, (corners[i][0], corners[i][1]), (corners[i][2], corners[i][3]), color, line) + if show: + cv2.imshow('a', board) + cv2.waitKey(0) + return board + + +def load_detect_result_json(reslut_file_root, shrink=3): + def is_bottom_or_top(corner): + column_min, row_min, column_max, row_max = corner + if row_max < 
import json
import os
import numpy as np
import cv2
from glob import glob
from os.path import join as pjoin
from tqdm import tqdm


def resize_label(bboxes, d_height, gt_height, bias=0):
    """Scale bounding boxes from the detection image height to the ground-truth height.

    :param bboxes: list of [col_min, row_min, col_max, row_max]
    :param d_height: image height the detections were produced at
    :param gt_height: target (ground-truth) image height
    :param bias: constant offset added to each coordinate after scaling
    :return: new list of scaled integer bboxes
    """
    scale = gt_height / d_height
    return [[int(b * scale + bias) for b in bbox] for bbox in bboxes]


def draw_bounding_box(org, corners, color=(0, 255, 0), line=2, show=False):
    """Draw rectangles on *org* resized to 608x1024; optionally display it."""
    board = cv2.resize(org, (608, 1024))
    for corner in corners:
        board = cv2.rectangle(board, (corner[0], corner[1]), (corner[2], corner[3]), color, line)
    if show:
        cv2.imshow('a', board)
        cv2.waitKey(0)
    return board


def load_detect_result_json(reslut_file_root, shrink=3):
    """Load all UIED detection result jsons under *reslut_file_root*.

    Filters out components smaller than 10px on a side and components lying in
    the status-bar/navigation-bar area, then shrinks each bbox by *shrink* px
    per side.

    :return: {img_name: {'bboxes': [[col_min, row_min, col_max, row_max]], 'categories': [str]}}
    """
    def is_bottom_or_top(corner):
        # Status bar (top) / navigation bar (bottom) region of an 800px-high screen.
        column_min, row_min, column_max, row_max = corner
        return row_max < 36 or row_min > 725

    result_files = glob(pjoin(reslut_file_root, '*.json'))
    compos_reform = {}
    print('Loading %d detection results' % len(result_files))
    for reslut_file in tqdm(result_files):
        # os.path works on both Windows and POSIX; the original split('\\') did not.
        img_name = os.path.splitext(os.path.basename(reslut_file))[0]
        with open(reslut_file, 'r') as f:
            compos = json.load(f)['compos']
        for compo in compos:
            if compo['column_max'] - compo['column_min'] < 10 or compo['row_max'] - compo['row_min'] < 10:
                continue
            if is_bottom_or_top((compo['column_min'], compo['row_min'], compo['column_max'], compo['row_max'])):
                continue
            bbox = [compo['column_min'] + shrink, compo['row_min'] + shrink,
                    compo['column_max'] - shrink, compo['row_max'] - shrink]
            if img_name not in compos_reform:
                compos_reform[img_name] = {'bboxes': [bbox], 'categories': [compo['category']]}
            else:
                compos_reform[img_name]['bboxes'].append(bbox)
                compos_reform[img_name]['categories'].append(compo['category'])
    return compos_reform


def load_ground_truth_json(gt_file):
    """Load COCO-style ground truth, keyed by image name.

    Categories are kept as raw numeric ids (14 == TextView).

    :return: {img_name: {'bboxes': [...], 'categories': [category_id], 'size': (height, width)}}
    """
    def cvt_bbox(bbox):
        '''
        :param bbox: [x,y,width,height]
        :return: [col_min, row_min, col_max, row_max]
        '''
        bbox = [int(b) for b in bbox]
        return [bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]

    with open(gt_file, 'r') as f:
        data = json.load(f)
    # Index images by id once: O(1) lookup per annotation instead of a linear scan.
    img_by_id = {image['id']: (image['file_name'].split('/')[-1][:-4],
                               (image['height'], image['width']))
                 for image in data['images']}
    annots = data['annotations']
    compos = {}
    print('Loading %d ground truth' % len(annots))
    for annot in tqdm(annots):
        img_name, size = img_by_id[annot['image_id']]
        if img_name not in compos:
            compos[img_name] = {'bboxes': [cvt_bbox(annot['bbox'])], 'categories': [annot['category_id']], 'size': size}
        else:
            compos[img_name]['bboxes'].append(cvt_bbox(annot['bbox']))
            compos[img_name]['categories'].append(annot['category_id'])
    return compos


def eval(detection, ground_truth, img_root, show=True, no_text=False, only_text=False):
    """Evaluate detections against ground truth, bucketed by component width.

    TP/FP/FN are 3-element lists indexed by size bucket:
    0 = width < 64, 1 = 64 <= width < 128, 2 = width >= 128.

    :param detection: output of load_detect_result_json (categories are names)
    :param ground_truth: output of load_ground_truth_json (categories are ids)
    :param img_root: directory with the original screenshots
    :param show: visualise matches per image
    :param no_text: drop TextView components on both sides
    :param only_text: keep only TextView components on both sides
    """
    def compo_filter(compos, flag):
        # Detection categories are class names; gt categories are numeric ids
        # (TextView == 14), hence the two different comparisons.
        if not no_text and not only_text:
            return compos
        compos_new = {'bboxes': [], 'categories': []}
        for k, category in enumerate(compos['categories']):
            if only_text:
                if flag == 'det' and category != 'TextView':
                    continue
                if flag == 'gt' and int(category) != 14:
                    continue
            elif no_text:
                if flag == 'det' and category == 'TextView':
                    continue
                if flag == 'gt' and int(category) == 14:
                    continue
            compos_new['bboxes'].append(compos['bboxes'][k])
            compos_new['categories'].append(category)
        return compos_new

    def size_bucket(width):
        # BUG FIX: the original used strict '64 < w < 128' style tests, so a
        # width of exactly 64 or 128 fell through every branch and left `size`
        # stale (or -1), silently corrupting the TP/FP/FN counters.
        if width < 64:
            return 0
        if width < 128:
            return 1
        return 2

    def match(org, d_bbox, gt_bboxes, matched):
        '''
        :param matched: 1 = gt box still unmatched, 0 = already claimed
        :param d_bbox: [col_min, row_min, col_max, row_max]
        :param gt_bboxes: list of ground truth [[col_min, row_min, col_max, row_max]]
        :return: (matched?, size bucket of the matched gt box or -1)
        '''
        area_d = (d_bbox[2] - d_bbox[0]) * (d_bbox[3] - d_bbox[1])
        for k, gt_bbox in enumerate(gt_bboxes):
            if matched[k] == 0:
                continue
            area_gt = (gt_bbox[2] - gt_bbox[0]) * (gt_bbox[3] - gt_bbox[1])
            col_min = max(d_bbox[0], gt_bbox[0])
            row_min = max(d_bbox[1], gt_bbox[1])
            col_max = min(d_bbox[2], gt_bbox[2])
            row_max = min(d_bbox[3], gt_bbox[3])
            # if not intersected, area intersection should be 0
            area_inter = max(0, col_max - col_min) * max(0, row_max - row_min)
            if area_inter == 0:
                continue
            iod = area_inter / area_d
            iou = area_inter / (area_d + area_gt - area_inter)
            if iou > 0.9 or iod == 1:
                matched[k] = 0
                return True, size_bucket(gt_bbox[2] - gt_bbox[0])
        return False, -1

    def metrics():
        # Guard zero denominators (e.g. an empty bucket early in the run).
        precision = [round(TP[k] / (TP[k] + FP[k]), 3) if TP[k] + FP[k] else 0 for k in range(3)]
        recall = [round(TP[k] / (TP[k] + FN[k]), 3) if TP[k] + FN[k] else 0 for k in range(3)]
        f1 = [round(2 * precision[k] * recall[k] / (precision[k] + recall[k]), 3)
              if precision[k] + recall[k] else 0 for k in range(3)]
        return precision, recall, f1

    amount = len(detection)
    TP, FP, FN = [0, 0, 0], [0, 0, 0], [0, 0, 0]
    i = 0  # keep defined for the summary print even if *detection* is empty
    for i, image_id in enumerate(detection):
        img = cv2.imread(pjoin(img_root, image_id + '.jpg'))
        d_compos = detection[image_id]
        if image_id not in ground_truth:
            continue
        gt_compos = ground_truth[image_id]

        org_height = gt_compos['size'][0]

        d_compos = compo_filter(d_compos, 'det')
        gt_compos = compo_filter(gt_compos, 'gt')

        # Normalise both sides to a common height of 1024 before matching.
        d_compos['bboxes'] = resize_label(d_compos['bboxes'], 800, 1024)
        gt_compos['bboxes'] = resize_label(gt_compos['bboxes'], org_height, 1024)
        matched = np.ones(len(gt_compos['bboxes']), dtype=int)
        for d_bbox in d_compos['bboxes']:
            is_matched, size = match(img, d_bbox, gt_compos['bboxes'], matched)
            if is_matched:
                TP[size] += 1
            else:
                # Unmatched detections are bucketed by their own width.
                FP[size_bucket(d_bbox[2] - d_bbox[0])] += 1

        # Unmatched gt boxes are false negatives, bucketed by gt width.
        # BUG FIX: the original reused `i` as the loop variable here, clobbering
        # the image index used by the progress print below.
        gt_bboxes = gt_compos['bboxes']
        for k in range(len(matched)):
            if matched[k] == 1:
                FN[size_bucket(gt_bboxes[k][2] - gt_bboxes[k][0])] += 1

        if show:
            print(image_id + '.jpg')
            broad = draw_bounding_box(img, d_compos['bboxes'], color=(255, 0, 0), line=3)
            draw_bounding_box(broad, gt_compos['bboxes'], color=(0, 0, 255), show=True, line=2)

        if i % 200 == 0:
            precision, recall, f1 = metrics()
            print('[%d/%d] TP:%s, FP:%s, FN:%s, Precesion:%s, Recall:%s, F1:%s' % (
                i, amount, str(TP), str(FP), str(FN), str(precision), str(recall), str(f1)))

    precision, recall, f1 = metrics()
    print('[%d/%d] TP:%s, FP:%s, FN:%s, Precesion:%s, Recall:%s, F1:%s' % (
        i, amount, str(TP), str(FP), str(FN), str(precision), str(recall), str(f1)))


if __name__ == '__main__':
    # Guarded so importing this module no longer triggers a full evaluation run.
    no_text = False
    only_text = False
    detect = load_detect_result_json('E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_v3\\merge')
    gt = load_ground_truth_json('E:\\Mulong\\Datasets\\rico\\instances_test.json')
    eval(detect, gt, 'E:\\Mulong\\Datasets\\rico\\combined', show=False, no_text=no_text, only_text=only_text)
import json
import os
import numpy as np
import cv2
from glob import glob
from os.path import join as pjoin
from tqdm import tqdm


def resize_label(bboxes, d_height, gt_height, bias=0):
    """Scale bounding boxes from the detection image height to the ground-truth height.

    :param bboxes: list of [col_min, row_min, col_max, row_max]
    :param d_height: image height the detections were produced at
    :param gt_height: target (ground-truth) image height
    :param bias: constant offset added to each coordinate after scaling
    :return: new list of scaled integer bboxes
    """
    scale = gt_height / d_height
    return [[int(b * scale + bias) for b in bbox] for bbox in bboxes]


def draw_bounding_box(org, corners, color=(0, 255, 0), line=2, show=False):
    """Draw rectangles for *corners* on a copy of *org*; optionally display it."""
    board = org.copy()
    for corner in corners:
        board = cv2.rectangle(board, (corner[0], corner[1]), (corner[2], corner[3]), color, line)
    if show:
        cv2.imshow('a', cv2.resize(board, (500, 1000)))
        cv2.waitKey(0)
    return board


def load_detect_result_json(reslut_file_root, shrink=4):
    """Load all UIED detection result jsons under *reslut_file_root*.

    Filters out components smaller than 10px on a side and components lying in
    the status-bar/navigation-bar area, then shrinks each bbox by *shrink* px
    per side.

    :return: {img_name: {'bboxes': [[col_min, row_min, col_max, row_max]], 'categories': [str]}}
    """
    def is_bottom_or_top(corner):
        # Status bar (top) / navigation bar (bottom) region of an 800px-high screen.
        column_min, row_min, column_max, row_max = corner
        return row_max < 36 or row_min > 725

    result_files = glob(pjoin(reslut_file_root, '*.json'))
    compos_reform = {}
    print('Loading %d detection results' % len(result_files))
    for reslut_file in tqdm(result_files):
        # os.path works on both Windows and POSIX; the original split('\\') did not.
        img_name = os.path.splitext(os.path.basename(reslut_file))[0]
        with open(reslut_file, 'r') as f:
            compos = json.load(f)['compos']
        for compo in compos:
            if compo['column_max'] - compo['column_min'] < 10 or compo['row_max'] - compo['row_min'] < 10:
                continue
            if is_bottom_or_top((compo['column_min'], compo['row_min'], compo['column_max'], compo['row_max'])):
                continue
            bbox = [compo['column_min'] + shrink, compo['row_min'] + shrink,
                    compo['column_max'] - shrink, compo['row_max'] - shrink]
            if img_name not in compos_reform:
                compos_reform[img_name] = {'bboxes': [bbox], 'categories': [compo['category']]}
            else:
                compos_reform[img_name]['bboxes'].append(bbox)
                compos_reform[img_name]['categories'].append(compo['category'])
    return compos_reform


def load_ground_truth_json(gt_file):
    """Load COCO-style ground truth, keyed by image name.

    Categories are kept as raw numeric ids (14 == TextView).

    :return: {img_name: {'bboxes': [...], 'categories': [category_id], 'size': (height, width)}}
    """
    def cvt_bbox(bbox):
        '''
        :param bbox: [x,y,width,height]
        :return: [col_min, row_min, col_max, row_max]
        '''
        bbox = [int(b) for b in bbox]
        return [bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]

    with open(gt_file, 'r') as f:
        data = json.load(f)
    # Index images by id once: O(1) lookup per annotation instead of a linear scan.
    img_by_id = {image['id']: (image['file_name'].split('/')[-1][:-4],
                               (image['height'], image['width']))
                 for image in data['images']}
    annots = data['annotations']
    compos = {}
    print('Loading %d ground truth' % len(annots))
    for annot in tqdm(annots):
        img_name, size = img_by_id[annot['image_id']]
        if img_name not in compos:
            compos[img_name] = {'bboxes': [cvt_bbox(annot['bbox'])], 'categories': [annot['category_id']], 'size': size}
        else:
            compos[img_name]['bboxes'].append(cvt_bbox(annot['bbox']))
            compos[img_name]['categories'].append(annot['category_id'])
    return compos


def eval(detection, ground_truth, img_root, show=True, no_text=False, only_text=False):
    """Evaluate detections against ground truth (class-agnostic matching).

    A detection is a true positive when its IoU with an unmatched ground-truth
    box exceeds 0.9, or it is fully contained in the gt box.

    :param detection: output of load_detect_result_json (categories are names)
    :param ground_truth: output of load_ground_truth_json (categories are ids)
    :param img_root: directory with the original screenshots
    :param show: visualise matches per image
    :param no_text: drop TextView components on both sides
    :param only_text: keep only TextView components on both sides
    :return: (per-image precisions, recalls, f1s)
    """
    def compo_filter(compos, flag):
        # Detection categories are class names; gt categories are numeric ids
        # (TextView == 14), hence the two different comparisons.
        if not no_text and not only_text:
            return compos
        compos_new = {'bboxes': [], 'categories': []}
        for k, category in enumerate(compos['categories']):
            if only_text:
                if flag == 'det' and category != 'TextView':
                    continue
                if flag == 'gt' and int(category) != 14:
                    continue
            elif no_text:
                if flag == 'det' and category == 'TextView':
                    continue
                if flag == 'gt' and int(category) == 14:
                    continue
            compos_new['bboxes'].append(compos['bboxes'][k])
            compos_new['categories'].append(category)
        return compos_new

    def match(org, d_bbox, gt_bboxes, matched):
        '''
        :param matched: 1 = gt box still unmatched, 0 = already claimed
        :param d_bbox: [col_min, row_min, col_max, row_max]
        :param gt_bboxes: list of ground truth [[col_min, row_min, col_max, row_max]]
        :return: True if IoU is large enough or the detection is contained in a gt box
        '''
        area_d = (d_bbox[2] - d_bbox[0]) * (d_bbox[3] - d_bbox[1])
        for k, gt_bbox in enumerate(gt_bboxes):
            if matched[k] == 0:
                continue
            area_gt = (gt_bbox[2] - gt_bbox[0]) * (gt_bbox[3] - gt_bbox[1])
            col_min = max(d_bbox[0], gt_bbox[0])
            row_min = max(d_bbox[1], gt_bbox[1])
            col_max = min(d_bbox[2], gt_bbox[2])
            row_max = min(d_bbox[3], gt_bbox[3])
            # if not intersected, area intersection should be 0
            area_inter = max(0, col_max - col_min) * max(0, row_max - row_min)
            if area_inter == 0:
                continue
            iod = area_inter / area_d
            iou = area_inter / (area_d + area_gt - area_inter)
            if iou > 0.9 or iod == 1:
                matched[k] = 0
                return True
        return False

    def overall_metrics():
        # Guard zero denominators (e.g. no detections yet on the first image).
        precision = TP / (TP + FP) if TP + FP else 0
        recall = TP / (TP + FN) if TP + FN else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
        return precision, recall, f1

    amount = len(detection)
    TP, FP, FN = 0, 0, 0
    pres, recalls, f1s = [], [], []
    i = 0  # keep defined for the summary print even if *detection* is empty
    for i, image_id in enumerate(detection):
        TP_this, FP_this, FN_this = 0, 0, 0
        img = cv2.imread(pjoin(img_root, image_id + '.jpg'))
        d_compos = detection[image_id]
        if image_id not in ground_truth:
            continue
        gt_compos = ground_truth[image_id]

        org_height = gt_compos['size'][0]

        d_compos = compo_filter(d_compos, 'det')
        gt_compos = compo_filter(gt_compos, 'gt')

        # Detections were produced at height 800; rescale to the gt image height.
        d_compos['bboxes'] = resize_label(d_compos['bboxes'], 800, org_height)
        matched = np.ones(len(gt_compos['bboxes']), dtype=int)
        for d_bbox in d_compos['bboxes']:
            if match(img, d_bbox, gt_compos['bboxes'], matched):
                TP += 1
                TP_this += 1
            else:
                FP += 1
                FP_this += 1
        # gt boxes never claimed by a detection are false negatives
        FN += sum(matched)
        FN_this = sum(matched)

        try:
            pre_this = TP_this / (TP_this + FP_this)
            recall_this = TP_this / (TP_this + FN_this)
            f1_this = 2 * (pre_this * recall_this) / (pre_this + recall_this)
        except ZeroDivisionError:
            # Image contributed no detections or no matches at all.
            print('empty')
            continue

        pres.append(pre_this)
        recalls.append(recall_this)
        f1s.append(f1_this)
        if show:
            print(image_id + '.jpg')
            print('[%d/%d] TP:%d, FP:%d, FN:%d, Precesion:%.3f, Recall:%.3f' % (
                i, amount, TP_this, FP_this, FN_this, pre_this, recall_this))
            broad = draw_bounding_box(img, d_compos['bboxes'], color=(255, 0, 0), line=3)
            draw_bounding_box(broad, gt_compos['bboxes'], color=(0, 0, 255), show=True, line=2)

        if i % 200 == 0:
            precision, recall, f1 = overall_metrics()
            print('[%d/%d] TP:%d, FP:%d, FN:%d, Precesion:%.3f, Recall:%.3f, F1:%.3f' % (
                i, amount, TP, FP, FN, precision, recall, f1))

    # BUG FIX: the original recomputed precision/recall here but printed the
    # stale f1 from the last progress print; recompute all three.
    precision, recall, f1 = overall_metrics()
    print('[%d/%d] TP:%d, FP:%d, FN:%d, Precesion:%.3f, Recall:%.3f, F1:%.3f' % (
        i, amount, TP, FP, FN, precision, recall, f1))

    return pres, recalls, f1s


if __name__ == '__main__':
    # Guarded so importing this module no longer triggers a full evaluation run.
    no_text = True
    only_text = False

    # detect = load_detect_result_json('E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_cls\\ip')
    detect = load_detect_result_json('E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_cls\\merge')
    # detect = load_detect_result_json('E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_v3\\merge')
    # detect = load_detect_result_json('E:\\Mulong\\Result\\rico\\rico_uied\\rico_new_uied_v3\\ocr')
    gt = load_ground_truth_json('E:\\Mulong\\Datasets\\rico\\instances_test.json')
    eval(detect, gt, 'E:\\Mulong\\Datasets\\rico\\combined', show=False, no_text=no_text, only_text=only_text)
import cv2
import numpy as np

import lib_ip.block_division as blk
import lib_ip.ip_preprocessing as pre
import lib_ip.ip_detection as det


def nothing(x):
    """No-op trackbar callback (cv2.createTrackbar requires a callable)."""
    pass


def get_contour(org, binary):
    """Draw the contours of *binary* (area >= 200) in red on a copy of *org*.

    :param org: original BGR image
    :param binary: binarised single-channel image
    :return: copy of *org* with contours drawn
    """
    board = org.copy()
    # NOTE(review): the 3-value unpacking matches OpenCV 3.x; OpenCV 4.x
    # returns only (contours, hierarchy) — confirm the installed version.
    hie, contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    res_contour = []
    for i in range(len(contours)):
        # Ignore tiny speckle contours.
        if cv2.contourArea(contours[i]) < 200:
            continue
        cnt = cv2.approxPolyDP(contours[i], 0.001 * cv2.arcLength(contours[i], True), True)
        res_contour.append(cnt)
    cv2.drawContours(board, res_contour, -1, (0, 0, 255), 1)
    return board


if __name__ == '__main__':
    # Interactive tuning tool: tweak binarisation parameters via trackbars and
    # inspect the result live.  Guarded so importing this module does not open
    # GUI windows or enter the infinite loop.
    img_file = 'E:\\Mulong\\Datasets\\rico\\combined\\1014.jpg'
    resize_height = 800

    cv2.namedWindow('control')
    cv2.createTrackbar('resize_height', 'control', 800, 1600, nothing)
    cv2.createTrackbar('grad_min', 'control', 4, 255, nothing)
    cv2.createTrackbar('grad_min_blk', 'control', 5, 255, nothing)
    cv2.createTrackbar('c1', 'control', 1, 1000, nothing)
    cv2.createTrackbar('c2', 'control', 1, 1000, nothing)

    while 1:
        resize_height = cv2.getTrackbarPos('resize_height', 'control')
        grad_min = cv2.getTrackbarPos('grad_min', 'control')
        grad_min_blk = cv2.getTrackbarPos('grad_min_blk', 'control')
        c1 = cv2.getTrackbarPos('c1', 'control')
        c2 = cv2.getTrackbarPos('c2', 'control')

        org, grey = pre.read_img(img_file, resize_height)
        # org = cv2.medianBlur(org, 3)
        # org = cv2.GaussianBlur(org, (3,3), 0)

        binary = pre.binarization(org, grad_min)
        binary_r = pre.reverse_binary(binary)
        # blk.block_division(grey, grad_thresh=grad_min_blk, step_v=10, step_h=10, show=True)
        cv2.imshow('bijn', binary)
        cv2.imshow('r', binary_r)
        cv2.waitKey(10)

        # canny = cv2.Canny(grey, c1, c2)
        # hie, contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # b_contour = get_contour(org, binary)
        # c_contour = get_contour(org, canny)

        # b_contour = cv2.hconcat([b_contour, c_contour])
        # binary = cv2.hconcat([binary, binary_r, canny])

        # cv2.imshow('org', org)
        # cv2.imshow('b_cnt', b_contour)
        # cv2.imshow('bin', binary)
        # cv2.imshow('canny', canny)
from tqdm import tqdm
import json
import cv2
from os.path import join as pjoin

from config.CONFIG_UIED import Config
C = Config()


def draw_bounding_box_class(org, components, color=C.COLOR, line=2, show=False, write_path=None):
    """
    Draw bounding boxes of components, labelled with their class names, on a copy
    of the original image.

    :param org: original image (numpy array); it is not modified
    :param components: dict with parallel lists
        'bboxes'     -> [(column_min, row_min, column_max, row_max), ...]
        'categories' -> [category_id, ...] mapped through C.CLASS_MAP
    :param color: mapping from class name to BGR color
    :param line: rectangle line thickness
    :param show: if True, pop up a (resized) preview window and wait for a key
    :param write_path: if not None, write the labelled image to this path
    :return: the labelled board image
    """
    board = org.copy()
    bboxes = components['bboxes']
    categories = components['categories']
    for bbox, category in zip(bboxes, categories):
        cls_name = C.CLASS_MAP[str(category)]  # look up once per component
        board = cv2.rectangle(board, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color[cls_name], line)
        board = cv2.putText(board, cls_name, (bbox[0] + 5, bbox[1] + 20),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, color[cls_name], 2)
    if show:
        cv2.imshow('a', cv2.resize(board, (500, 1000)))
        cv2.waitKey(0)
    if write_path is not None:
        cv2.imwrite(write_path, board)
    return board


def load_ground_truth_json(gt_file, no_text=True):
    """
    Load COCO-style ground truth annotations into a per-image dict.

    :param gt_file: path to a COCO-format json ('images' + 'annotations')
    :param no_text: if True, skip annotations of category 14 (text)
    :return: {img_name: {'bboxes': [...], 'categories': [...], 'size': (h, w)}}
    """
    def get_img_by_id(img_id):
        # Linear scan over the images table.
        # NOTE(review): implicitly returns None when the id is absent, which
        # would crash the caller on tuple unpacking — confirm all annotation
        # image_ids exist in 'images'.
        for image in images:
            if image['id'] == img_id:
                return image['file_name'].split('/')[-1][:-4], (image['height'], image['width'])

    def cvt_bbox(bbox):
        '''
        :param bbox: [x, y, width, height]
        :return: [col_min, row_min, col_max, row_max]
        '''
        bbox = [int(b) for b in bbox]
        return [bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]

    data = json.load(open(gt_file, 'r'))
    images = data['images']
    annots = data['annotations']
    compos = {}
    print('Loading %d ground truth' % len(annots))
    for annot in tqdm(annots):
        img_name, size = get_img_by_id(annot['image_id'])
        # Register the image on first sight (even text-only images get an
        # entry so they are still drawn/evaluated).
        if img_name not in compos:
            compos[img_name] = {'bboxes': [], 'categories': [], 'size': size}
        # BUG FIX: the original re-assigned compos[img_name] to empty lists on
        # *every* text annotation, discarding components already collected for
        # that image. Text annotations are now simply skipped.
        if no_text and int(annot['category_id']) == 14:
            continue
        compos[img_name]['bboxes'].append(cvt_bbox(annot['bbox']))
        compos[img_name]['categories'].append(annot['category_id'])
    return compos


def view_gt_all(gt, img_root):
    """Show the ground-truth boxes for every image in *gt*, one at a time."""
    for img_id in gt:
        compos = gt[img_id]
        img = cv2.imread(pjoin(img_root, img_id + '.jpg'))
        print(pjoin(img_root, img_id + '.jpg'))
        draw_bounding_box_class(img, compos, show=True)


def view_gt_single(gt, img_root, img_id):
    """Show the ground-truth boxes for a single image identified by *img_id*."""
    img_id = str(img_id)
    compos = gt[img_id]
    img = cv2.imread(pjoin(img_root, img_id + '.jpg'))
    print(pjoin(img_root, img_id + '.jpg'))
    draw_bounding_box_class(img, compos, show=True)


gt = load_ground_truth_json('E:\\Mulong\\Datasets\\rico\\instances_test.json', no_text=False)
# view_gt_all(gt, 'E:\\Mulong\\Datasets\\rico\\combined')
view_gt_single(gt, 'E:\\Mulong\\Datasets\\rico\\combined', 670)
import multiprocessing
import glob
import time
import json
from tqdm import tqdm
from os.path import join as pjoin, exists
import cv2
import os
import shutil

from detect_merge.merge import reassign_ids
import detect_compo.ip_region_proposal as ip
from detect_merge.Element import Element
import detect_compo.lib_ip.ip_preprocessing as pre
import detect_classify.classification as clf
import torch
import numpy as np
from torchvision import models
from torch import nn
import pandas as pd
import csv
import re
import openai
import random
from PIL import Image


def resize_height_by_longest_edge(img_path, resize_length=800):
    """
    Return the height (in px) to resize the image to so that its longest
    edge becomes *resize_length* while preserving aspect ratio.
    """
    org = cv2.imread(img_path)
    height, width = org.shape[:2]
    if height > width:
        return resize_length
    else:
        return int(resize_length * (height / width))


def _avg(values):
    """Mean of *values*, or 0.0 when empty — avoids ZeroDivisionError when no image was processed."""
    return sum(values) / len(values) if values else 0.0


if __name__ == '__main__':

    input_img_root = "./input_examples/"
    output_root = "./result_classification"
    segment_root = '../scrutinizing_alexa/txt'

    # Start from a clean result directory on every run.
    if os.path.exists(output_root):
        shutil.rmtree(output_root)
    os.makedirs(output_root)

    image_list = os.listdir(input_img_root)
    input_imgs = [input_img_root + image_name for image_name in image_list]

    # Detection hyper-parameters.
    # NOTE(review): this key is spelled 'max-line-ingraph-gap' here but
    # 'max-line-gap' in the tuning script — confirm which key the consumer reads.
    key_params = {'min-grad': 4, 'ffl-block': 5, 'min-ele-area': 50, 'merge-contained-ele': True,
                  'max-word-inline-gap': 10, 'max-line-ingraph-gap': 4, 'remove-top-bar': False}

    # Pipeline stage switches.
    is_ip = True
    is_clf = False
    is_ocr = True
    is_merge = True
    is_classification = True

    # Load deep learning models in advance
    compo_classifier = None
    if is_ip and is_clf:
        compo_classifier = {}
        from cnn.CNN import CNN
        compo_classifier['Elements'] = CNN('Elements')
    ocr_model = None
    if is_ocr:
        import detect_text.text_detection as text

    num = 0  # number of images actually processed

    img_time_cost_all = []
    ocr_time_cost_all = []
    ic_time_cost_all = []
    ts_time_cost_all = []
    cd_time_cost_all = []

    resize_by_height = 800
    for input_img in input_imgs:

        output_data = pd.DataFrame(columns=['screenshot', 'id', 'label', 'index', 'text', 'sentences'])

        # BUG FIX: time.clock() was removed in Python 3.8 and crashed here on
        # any modern interpreter; perf_counter() is the monotonic wall clock,
        # appropriate since the timed work is I/O- and model-heavy.
        this_img_start_time = time.perf_counter()

        resized_height = resize_height_by_longest_edge(input_img, resize_by_height)
        # NOTE(review): assumes '/'-separated paths and a 3-char extension.
        index = input_img.split('/')[-1][:-4]

        # Only the two demo screenshots are processed by this batch script.
        if index != "1-1" and index != "1-2":
            continue

        if is_ocr:
            os.makedirs(pjoin(output_root, 'ocr'), exist_ok=True)
            this_ocr_time_cost = text.text_detection(input_img, output_root, show=False, method='paddle')
            ocr_time_cost_all.append(this_ocr_time_cost)

        if is_ip:
            os.makedirs(pjoin(output_root, 'ip'), exist_ok=True)
            this_cd_time_cost = ip.compo_detection(input_img, output_root, key_params,
                                                   classifier=compo_classifier,
                                                   resize_by_height=resized_height, show=False)
            cd_time_cost_all.append(this_cd_time_cost)

        if is_merge:
            import detect_merge.merge as merge

            os.makedirs(pjoin(output_root, 'merge'), exist_ok=True)
            compo_path = pjoin(output_root, 'ip', str(index) + '.json')
            ocr_path = pjoin(output_root, 'ocr', str(index) + '.json')
            board_merge, components_merge = merge.merge(input_img, compo_path, ocr_path,
                                                        pjoin(output_root, 'merge'),
                                                        is_remove_top_bar=key_params['remove-top-bar'],
                                                        show=False)

        if is_classification:
            os.makedirs(pjoin(output_root, 'classification'), exist_ok=True)
            merge_path = pjoin(output_root, 'merge', str(index) + '.json')
            # Close the file handle promptly instead of leaking it.
            with open(merge_path, 'r') as f:
                merge_json = json.load(f)
            os.makedirs(pjoin(output_root, 'classification', 'GUI'), exist_ok=True)
            this_time_cost_ic, this_time_cost_ts, output_data, output_board = clf.compo_classification(
                input_img, output_root, segment_root, merge_json, output_data,
                resize_by_height=resize_by_height)

            ic_time_cost_all.append(this_time_cost_ic)
            ts_time_cost_all.append(this_time_cost_ts)

        this_img_time_cost = time.perf_counter() - this_img_start_time
        img_time_cost_all.append(this_img_time_cost)
        print("time cost for this image: %2.2f s" % this_img_time_cost)

        num += 1

        # Append per image so a crash mid-batch does not lose earlier rows.
        if os.path.isfile(output_root + '/output.csv'):
            output_data.to_csv(output_root + '/output.csv', index=False, mode='a', header=False)
        else:
            output_data.to_csv(output_root + '/output.csv', index=False, mode='w')

    # BUG FIX: the original divided by len(...) unconditionally and raised
    # ZeroDivisionError when no image passed the index filter above.
    print("average text extraction time cost for this app: %2.2f s" % _avg(ocr_time_cost_all))
    print("average widget detection time cost for this app: %2.2f s" % _avg(cd_time_cost_all))
    print("average icon classification time cost for this app: %2.2f s" % _avg(ic_time_cost_all))
    print("average text selection processing time cost for this app: %2.2f s" % _avg(ts_time_cost_all))
    print("average screenshot processing time cost for this app: %2.2f s" % _avg(img_time_cost_all))
from os.path import join as pjoin
import cv2
import os
import shutil
import time
import json
import CDM.detect_compo.ip_region_proposal as ip
import CDM.detect_classify.classification as clf
import pandas as pd
import openai

def summarize_segment(segment):
    """
    Shorten a privacy-policy segment with the OpenAI chat API.

    Uses the legacy ``openai<1.0`` ``ChatCompletion`` interface; the API key is
    read from the ``openai_key`` environment variable on every call.

    :param segment: text to shorten (passed through str())
    :return: the model's shortened version of the segment
    """
    openai.api_key = os.environ.get('openai_key')

    prompt = f"Shorten this paragraph: \"{str(segment)}\"."

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=400,
        n=1,
        stop=None,
        temperature=0,  # deterministic output
    )

    return response.choices[0].message['content']

def resize_height_by_longest_edge(img_path, resize_length=800):
    """
    Return the height (in px) to resize the image to so that its longest
    edge becomes *resize_length* while preserving aspect ratio.
    """
    org = cv2.imread(img_path)
    height, width = org.shape[:2]
    if height > width:
        return resize_length
    else:
        return int(resize_length * (height / width))

def run_single_img(input_img, output_root, segment_root):
    """
    Run the full pipeline (OCR -> component detection -> merge -> classification
    -> GPT summarization) on one screenshot.

    :param input_img: path to the screenshot ('/'-separated)
    :param output_root: result directory; it is wiped and recreated
    :param segment_root: directory holding privacy-policy segment txt files
    :return: (annotated board image,
              DataFrame of GPT-summarized segments,
              DataFrame of original segments)
    """
    # Start from a clean result directory.
    if os.path.exists(output_root):
        shutil.rmtree(output_root)
    os.makedirs(output_root)

    key_params = {'min-grad': 4, 'ffl-block': 5, 'min-ele-area': 50, 'merge-contained-ele': True,
                  'max-word-inline-gap': 10, 'max-line-ingraph-gap': 4, 'remove-top-bar': False}

    # Pipeline stage switches.
    is_ip = True
    is_clf = False
    is_ocr = True
    is_merge = True
    is_classification = True

    if is_ocr:
        import CDM.detect_text.text_detection as text

    img_time_cost_all = []
    ocr_time_cost_all = []
    ic_time_cost_all = []
    ts_time_cost_all = []
    cd_time_cost_all = []

    resize_by_height = 800

    output_data = pd.DataFrame(columns=['screenshot', 'id', 'label', 'index', 'text', 'sentences'])

    # process_time() counts CPU time only; wall-clock waits (network OCR) are
    # not included in the printed figure.
    this_img_start_time = time.process_time()

    resized_height = resize_height_by_longest_edge(input_img, resize_by_height)
    # NOTE(review): assumes '/'-separated paths and a 3-char extension — confirm on Windows.
    index = input_img.split('/')[-1][:-4]

    if is_ocr:
        os.makedirs(pjoin(output_root, 'ocr'), exist_ok=True)
        this_ocr_time_cost = text.text_detection(input_img, output_root, show=False, method='google')  # pytesseract
        ocr_time_cost_all.append(this_ocr_time_cost)

    if is_ip:
        os.makedirs(pjoin(output_root, 'ip'), exist_ok=True)
        this_cd_time_cost = ip.compo_detection(input_img, output_root, key_params,
                                               resize_by_height=resized_height, show=False)
        cd_time_cost_all.append(this_cd_time_cost)

    if is_merge:
        import CDM.detect_merge.merge as merge

        os.makedirs(pjoin(output_root, 'merge'), exist_ok=True)
        compo_path = pjoin(output_root, 'ip', str(index) + '.json')
        ocr_path = pjoin(output_root, 'ocr', str(index) + '.json')
        board_merge, components_merge = merge.merge(input_img, compo_path, ocr_path,
                                                    pjoin(output_root, 'merge'),
                                                    is_remove_top_bar=key_params['remove-top-bar'],
                                                    show=False)

    if is_classification:
        os.makedirs(pjoin(output_root, 'classification'), exist_ok=True)
        merge_path = pjoin(output_root, 'merge', str(index) + '.json')
        # Close the file handle promptly instead of leaking it.
        with open(merge_path, 'r') as f:
            merge_json = json.load(f)
        os.makedirs(pjoin(output_root, 'classification', 'GUI'), exist_ok=True)
        this_time_cost_ic, this_time_cost_ts, output_data, output_board = clf.compo_classification(
            input_img, output_root, segment_root, merge_json, output_data,
            resize_by_height=resize_by_height, clf_model="ViT")

        ic_time_cost_all.append(this_time_cost_ic)
        ts_time_cost_all.append(this_time_cost_ts)

    this_img_time_cost = time.process_time() - this_img_start_time
    img_time_cost_all.append(this_img_time_cost)
    print("time cost for this image: %2.2f s" % this_img_time_cost)

    # Append to the running csv so repeated calls accumulate results.
    if os.path.isfile(output_root + '/output.csv'):
        output_data.to_csv(output_root + '/output.csv', index=False, mode='a', header=False)
    else:
        output_data.to_csv(output_root + '/output.csv', index=False, mode='w')

    short_output_data = output_data[['id', 'label', 'text']].copy()
    short_output_data = short_output_data.rename(columns={'text': 'segment'})

    # Summarize each segment via GPT, retrying on "overloaded" rate limits.
    original_output = []
    retries = 3
    summarized_data = []  # summarized rows
    for _, row in short_output_data.iterrows():
        seg = row['segment']
        for attempt in range(retries):
            try:
                shortened_seg = summarize_segment(seg)
                break
            except openai.error.RateLimitError as e:
                if "overloaded" in str(e):
                    # Exponential backoff with a small fixed jitter offset.
                    time.sleep(2 * (2 ** attempt) + 0.1)
            except Exception as e:
                # Deliberately best-effort: log and retry rather than abort the run.
                print(e)
        else:
            # All retries failed: fall back to the unshortened segment.
            shortened_seg = seg

        summarized_data.append({'id': row['id'], 'label': row['label'], 'segment': shortened_seg})
        # BUG FIX: seg[0] raised IndexError on an empty segment; seg[:1] is
        # empty-safe and identical otherwise (capitalizes the first character).
        original_output.append({'id': row['id'], 'label': row['label'], 'segment': seg[:1].upper() + seg[1:]})

    summarized_output_data = pd.DataFrame(summarized_data)
    original_output_data = pd.DataFrame(original_output)

    return output_board, summarized_output_data, original_output_data
as a paragraph + + Tips: + 1. Larger *min-grad* produces fine-grained binary-map while prone to over-segment element to small pieces + 2. Smaller *min-ele-area* leaves tiny elements while prone to produce noises + 3. If not *merge-contained-ele*, the elements inside others will be recognized, while prone to produce noises + 4. The *max-word-inline-gap* and *max-line-gap* should be dependent on the input image size and resolution + + mobile: {'min-grad':4, 'ffl-block':5, 'min-ele-area':50, 'max-word-inline-gap':6, 'max-line-gap':1} + web : {'min-grad':3, 'ffl-block':5, 'min-ele-area':25, 'max-word-inline-gap':4, 'max-line-gap':4} + ''' + key_params = {'min-grad':10, 'ffl-block':5, 'min-ele-area':50, 'merge-contained-ele':False, + 'max-word-inline-gap':10, 'max-line-gap':4, 'remove-top-bar':True} + + # set input image path + input_path_img = 'data/input/4.jpg' + output_root = 'data/output' + + resized_height = resize_height_by_longest_edge(input_path_img) + is_clf = False + is_ocr = False + if is_ocr: + import detect_text.text_detection as text + os.makedirs(pjoin(output_root, 'ocr'), exist_ok=True) + text.text_detection(input_path_img, output_root, show=False) + + ''' + ******** Testing with adjustable parameters ******** + ''' + testing_ip = True + testing_merge = False + + cv2.namedWindow('parameters') + if testing_ip: + cv2.createTrackbar('min-grad', 'parameters', 4, 20, nothing) + cv2.createTrackbar('min-ele-area', 'parameters', 20, 200, nothing) + while(1): + key_params['min-grad'] = cv2.getTrackbarPos('min-grad', 'parameters') + key_params['min-ele-area'] = cv2.getTrackbarPos('min-ele-area', 'parameters') + import detect_compo.ip_region_proposal as ip + os.makedirs(pjoin(output_root, 'ip'), exist_ok=True) + # switch of the classification func + classifier = None + if is_clf: + classifier = {} + from cnn.CNN import CNN + # classifier['Image'] = CNN('Image') + classifier['Elements'] = CNN('Elements') + # classifier['Noise'] = CNN('Noise') + 
ip.compo_detection(input_path_img, output_root, key_params, + classifier=classifier, resize_by_height=resized_height, show=True, wai_key=10) + + if testing_merge: + cv2.createTrackbar('max-word-inline-gap', 'parameters', 4, 20, nothing) + cv2.createTrackbar('max-line-gap', 'parameters', 20, 200, nothing) + while(1): + key_params['max-word-inline-gap'] = cv2.getTrackbarPos('max-word-inline-gap', 'parameters') + key_params['max-line-gap'] = cv2.getTrackbarPos('max-line-gap', 'parameters') + import detect_merge.merge as merge + name = input_path_img.split('/')[-1][:-4] + compo_path = pjoin(output_root, 'ip', str(name) + '.json') + ocr_path = pjoin(output_root, 'ocr', str(name) + '.json') + merge.merge(input_path_img, compo_path, ocr_path, output_root=None, is_remove_top=key_params['remove-top-bar'], show=True, wait_key=10) diff --git a/README.md b/README.md index 77ae8b585b67941fedccb827dc450428a6253ebb..82b4d0f8868705273ca0552c3e127f097254f549 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ --- -title: Cpp4App Test +title: Cpp4App emoji: 🏢 -colorFrom: blue +colorFrom: pink colorTo: red sdk: gradio sdk_version: 4.41.0 @@ -9,4 +9,4 @@ app_file: app.py pinned: false --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference \ No newline at end of file diff --git a/SEM/.idea/.gitignore b/SEM/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..26d33521af10bcc7fd8cea344038eaaeb78d0ef5 --- /dev/null +++ b/SEM/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/SEM/.idea/inspectionProfiles/profiles_settings.xml b/SEM/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99 --- /dev/null +++ b/SEM/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 
@@ + + + + \ No newline at end of file diff --git a/SEM/.idea/misc.xml b/SEM/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..192f792b58cb860ac4c55a159218ee36952ea127 --- /dev/null +++ b/SEM/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/SEM/.idea/modules.xml b/SEM/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..a4f659e51f520fe5f793146227fba66ea5b69dcb --- /dev/null +++ b/SEM/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/SEM/.idea/scrutinizing_alexa.iml b/SEM/.idea/scrutinizing_alexa.iml new file mode 100644 index 0000000000000000000000000000000000000000..ae4fa7b15585f72f71d7a4306aaf772c844658e8 --- /dev/null +++ b/SEM/.idea/scrutinizing_alexa.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/SEM/P1_PP_processing.py b/SEM/P1_PP_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..32bfe7f2938aa2753a3e098becf7c294d5559662 --- /dev/null +++ b/SEM/P1_PP_processing.py @@ -0,0 +1,120 @@ +import os +import time +import shutil + +from bs4 import BeautifulSoup + +from find_subtitle import find_title_Label +from get_text import write_text, write_text_without_label, removeUnneccessaryElements, makeCoarseSegments +from types_pp_processing import caculateSim, getSentences, getSentences_no_classifier, getSentences_with_classifier +# from children_pp_processing import process_specialGroup +# from region_pp_processing import get_alifornia +# from retention_pp_processing import retention_process +# from clean_txt import cleaning_txt + +if __name__ == '__main__': + # INPUT = "../dataset/privacy_policies_html/" + INPUT = "./pp_example/" + # cleaning_txt("./txt") + # os.mkdir("./txt") + if os.path.exists("./txt"): + shutil.rmtree("./txt") + os.makedirs("./txt") + + for file in os.listdir(INPUT): + + segmentation_start_time = time.clock() + + pathName = os.path.basename(file) + if 
pathName == ".DS_Store": + continue + path = INPUT+pathName + label = find_title_Label(path) + print("The current file is:" + pathName) + + # if pathName != '20.html': + # continue + + para_start_time = time.clock() + soup = BeautifulSoup(open(path,encoding='utf-8'), features="html.parser") + title_list = soup.find_all(label) + # cleaning_txt() + + if not os.path.exists('./txt/' + pathName[:-5]): + os.mkdir('./txt/' + pathName[:-5]) + + if len(title_list) == 0 or pathName == '20.html' or pathName == '29.html' or pathName == '25.html' or pathName == '8.html' or pathName == '27.html' or pathName == '28.html': + # write_text_without_label(soup.getText(), pathName) + removeUnneccessaryElements(soup) + result = makeCoarseSegments(soup) + for seg in result: + with open('./txt/' + pathName[:-5] + '/data_types.txt', "a", encoding='utf-8') as f: + + f.write(seg) + f.write("\n") + else: + write_text(title_list, pathName) + print("Paragraph level processing time: %2.2f s" % (time.clock() - para_start_time)) + + for t in title_list: + with open('./txt/' + pathName[:-5] + '/headings.txt', "a", encoding='utf-8') as g: + g.write(str(t)) + g.write("\n") + + # data types + if not os.path.exists("./txt/"+pathName[:-5]+"/data_types.txt"): + print("No information about data types!") + else: + sen_start_time = time.clock() + # all_types = caculateSim("./txt/"+pathName[:-5]+"/data_types.txt") + dict_sentences, dict_index = getSentences_with_classifier("./txt/" + pathName[:-5] + "/data_types.txt") + print("sentence level processing time: %2.2f s" % (time.clock() - sen_start_time)) + + os.makedirs("./txt/"+pathName[:-5]+"/classified_sentences") + for key in dict_sentences: + + if dict_sentences[key] == "": + continue + with open('./txt/' + pathName[:-5] + "/classified_sentences/" + key + ".txt", "a", encoding='utf-8') as g: + g.write(dict_sentences[key]) + + for key in dict_index: + with open('./txt/' + pathName[:-5] + "/classified_sentences/keyword_index.txt", "a", encoding='utf-8') as 
f: + f.write(key + ":" + str(dict_index[key]) + "\n") + + + # #children + # if not os.path.exists("./txt/"+pathName[:-5]+"/children.txt"): + # print("No information about children!") + # else: + # age , rule, childUse, specialGroup = process_specialGroup("./txt/"+pathName[:-5]+"/children.txt") + # # print("children age is :") + # print("D.CHILDREN.age : " + str(age)) + # if childUse == 1: + # print(" the skill’s privacy policy states that it does not collect any information from children") + # print("D.CHILDREN.[CTypes] = [ ]") + # else: + # # print("D.CHILDREN.[CTypes] :" + str(all_types)) + # None + # #region + # if not os.path.exists("./txt/"+pathName[:-5]+"/region.txt"): + # print("No information about region!") + # else: + # specialArea,california = get_alifornia("./txt/"+pathName[:-5]+"/region.txt") + # if california == 1: + # print("D.REGIONS.region :California") + # print("D.REGIONS.delete : Yes") + # else: + # print("D.REGIONS.region :No mention") + # print("D.REGIONS.delete : No") + # + # #retention + # if not os.path.exists("./txt/"+pathName[:-5]+"/data_retention.txt"): + # print("No information about data retention!") + # else: + # retention_time, text = retention_process("./txt/"+pathName[:-5]+"/data_retention.txt") + # print("D.RETENTION.period :"+ retention_time) + # # cleaning_txt() + # print("-------------------------------------------------------") + + print("time cost for segmentation: %2.2f s" % (time.clock() - segmentation_start_time)) diff --git a/SEM/children_pp_processing.py b/SEM/children_pp_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..a43ff85f71e993a091579f0c694f7fb9ed1b2c67 --- /dev/null +++ b/SEM/children_pp_processing.py @@ -0,0 +1,49 @@ +from text_preprocessing import pre_process_list + +def is_number(s): + try: + float(s) + return True + except ValueError: + pass + + try: + import unicodedata + unicodedata.numeric(s) + return True + except (TypeError, ValueError): + pass + + return False +# intend 
+def process_specialGroup(txt): + specialGroup = "" + age = "" + rule = "" + childUse = 0 + with open(txt, encoding='utf-8') as file_obj: + for line in file_obj: + specialGroup += line + result = pre_process_list(specialGroup) + + flag = 0 + for word in result: + # print("word in result: ", word) + if word == "direct" or word == "intend" or word == "address": + childUse = 1 + if is_number(word): + if word != age and age == "": + age = word + if word == "coppa": + if rule != word: + rule = "COPPA" + flag = 1 + if word == "gdpr": + if rule != word: + rule = "GDPR" + flag = 1 + if flag == 0: + rule += "The privacy policy does not specify what rules to follow" + if age =="": + age = "The privacy policy does not mention the age of the child" + return age , rule,childUse,specialGroup diff --git a/SEM/clean_txt.py b/SEM/clean_txt.py new file mode 100644 index 0000000000000000000000000000000000000000..bcfdf14e2fae48b026e3b3120f7adfb7bccd3e4a --- /dev/null +++ b/SEM/clean_txt.py @@ -0,0 +1,32 @@ +import os + +def cleaning_txt(path): + # f = open("./txt/data_types.txt","r+") + # f.truncate() + # g = open("./txt/use_data.txt","r+") + # g.truncate() + # e = open("./txt/protect_information.txt","r+") + # e.truncate() + # h = open("./txt/children.txt","r+") + # h.truncate() + # j = open("./txt/data_retention.txt","r+") + # j.truncate() + # k = open("./txt/update.txt","r+") + # k.truncate() + # d = open("./txt/region.txt","r+") + # d.truncate() + # a = open("./txt/share_information.txt", "r+") + # a.truncate() + # b = open("./txt/thrid_party.txt", "r+") + # b.truncate() + # c = open("./txt/user_right.txt", "r+") + # c.truncate() + + ls = os.listdir(path) + for i in ls: + c_path = os.path.join(path, i) + if os.path.isdir(c_path): + cleaning_txt(c_path) + else: + os.remove(c_path) + os.removedirs(path) diff --git a/SEM/find_subtitle.py b/SEM/find_subtitle.py new file mode 100644 index 0000000000000000000000000000000000000000..fe6a9dd3929e9ca90ef68ee5f545134991d96a6c --- /dev/null 
+++ b/SEM/find_subtitle.py @@ -0,0 +1,180 @@ +import csv +import os +import bs4 + + +def find_title_Label(path): + a = 0 + soup = bs4.BeautifulSoup(open(path,encoding='utf-8'), features="html.parser") + all_list = ["","","","","","",""] + list_index = ['h1','h2','h3','h4','h5','strong','b'] + h1_list = soup.find_all('h1') + if len(h1_list) <= 2: + h1_list = None + try: + for h1 in h1_list: + all_list[0] += h1.text + + except Exception: + a = 1 + h2_list = soup.find_all('h2') + if len(h2_list) <= 2: + h2_list = None + try: + for h2 in h2_list: + all_list[1] += h2.text + except Exception: + a = 1 + h3_list = soup.find_all('h3') + if len(h3_list) <= 2: + h3_list = None + try: + for h3 in h3_list: + all_list[2] += h3.text + except Exception: + a = 1 + h4_list = soup.find_all('h4') + if len(h4_list) <= 2: + h4_list = None + try: + for h4 in h4_list: + all_list[3] += h4.text + except Exception: + a = 1 + h5_list = soup.find_all('h5') + if len(h5_list) <= 2: + h5_list = None + try: + for h5 in h5_list: + all_list[4] += h5.text + except Exception: + a = 1 + strong_list = soup.find_all('strong') + if len(strong_list) <= 2: + strong_list = None + try: + for st in strong_list: + all_list[5] += st.text + except Exception: + a = 1 + b_list = soup.find_all('b') + if len(b_list) <= 2: + b_list = None + try: + for b in b_list: + all_list[6] += b.text + except Exception: + a = 1 + long = 0 + maxLongList = None + for list in all_list: + if list == None: + continue + clean_list = list.lower() + + if "information" in clean_list and "collect" in clean_list: + + return list_index[all_list.index(list)] + if "information" in clean_list and "use" in clean_list: + + return list_index[all_list.index(list)] + if "change" in clean_list and "data" in clean_list: + + return list_index[all_list.index(list)] + if len(list) > long: + long = len(list) + maxLongList = list + if maxLongList == None: + return "TitleError" + + return list_index[all_list.index(maxLongList)] + +def 
find_title_Label_with_html(file): + a = 0 + soup = bs4.BeautifulSoup(file, features="html.parser") + all_list = ["","","","","","",""] + list_index = ['h1','h2','h3','h4','h5','strong','b'] + h1_list = soup.find_all('h1') + if len(h1_list) <= 2: + h1_list = None + try: + for h1 in h1_list: + all_list[0] += h1.text + + except Exception: + a = 1 + h2_list = soup.find_all('h2') + if len(h2_list) <= 2: + h2_list = None + try: + for h2 in h2_list: + all_list[1] += h2.text + except Exception: + a = 1 + h3_list = soup.find_all('h3') + if len(h3_list) <= 2: + h3_list = None + try: + for h3 in h3_list: + all_list[2] += h3.text + except Exception: + a = 1 + h4_list = soup.find_all('h4') + if len(h4_list) <= 2: + h4_list = None + try: + for h4 in h4_list: + all_list[3] += h4.text + except Exception: + a = 1 + h5_list = soup.find_all('h5') + if len(h5_list) <= 2: + h5_list = None + try: + for h5 in h5_list: + all_list[4] += h5.text + except Exception: + a = 1 + strong_list = soup.find_all('strong') + if len(strong_list) <= 2: + strong_list = None + try: + for st in strong_list: + all_list[5] += st.text + except Exception: + a = 1 + b_list = soup.find_all('b') + if len(b_list) <= 2: + b_list = None + try: + for b in b_list: + all_list[6] += b.text + except Exception: + a = 1 + long = 0 + maxLongList = None + for list in all_list: + if list == None: + continue + clean_list = list.lower() + + if "information" in clean_list and "collect" in clean_list: + + return list_index[all_list.index(list)] + if "information" in clean_list and "use" in clean_list: + + return list_index[all_list.index(list)] + if "change" in clean_list and "data" in clean_list: + + return list_index[all_list.index(list)] + if len(list) > long: + long = len(list) + maxLongList = list + if maxLongList == None: + return "TitleError" + + return list_index[all_list.index(maxLongList)] + + + + diff --git a/SEM/get_pp.py b/SEM/get_pp.py new file mode 100644 index 
0000000000000000000000000000000000000000..ccb9f617ece107ed6b0ae8d689b3970e796cc1b4 --- /dev/null +++ b/SEM/get_pp.py @@ -0,0 +1,12 @@ +import ssl +from bs4 import BeautifulSoup +import urllib.request + +ssl._create_default_https_context = ssl._create_unverified_context + +def get_text(url): + response=urllib.request.urlopen(url) + html=response.read() + soup = BeautifulSoup(html,features="html.parser") + text = soup.get_text() + return text diff --git a/SEM/get_text.py b/SEM/get_text.py new file mode 100644 index 0000000000000000000000000000000000000000..8b04d45b18b4befe77521f0f7ba697fd47e8f300 --- /dev/null +++ b/SEM/get_text.py @@ -0,0 +1,283 @@ +import re + +import bs4 + +from SEM.paragraph_bayesian import clf,tf +from bs4 import BeautifulSoup + +mark_txt = {'0':"/data_types.txt", + '1':"/data_types.txt", + '2':"/personal_information_type.txt", + '3':"/share_information.txt", + '4':"/protect_information.txt", + '5':"/advertising.txt", + '6':"/user_right.txt", + '7':"/children.txt", + '8':"/region.txt", + '9':"/update.txt", + '10':"/way_to_collect.txt", + '11':"/provider.txt", + '12':"/data_retention.txt", + '13':"/data_types.txt", + '14':"/thrid_party.txt", + '15':"/data_types.txt"} + +def write_text(title_list, pathName): + type = 0 + security = 0 + right = 0 + specialGroup = 0 + specialArea = 0 + update = 0 + retention = 0 + useData = 0 + clean_title_list = [] + for title in title_list: + if title.text != "•": + clean_title_list.append(title) + + # print("title list:"+str(clean_title_list)) + + lastMark = "" + for title in clean_title_list: + title_Str = re.sub(r'\s+', ' ',str(title)) + title_Str = re.sub(r'<[^<]+?>', '', title_Str).replace('\n','').strip() + if title is None: + continue + try: + mark = clf.predict(tf.transform([title_Str])) + + except Exception as e: + continue + # print(mark) + if mark == "1": + type = 1 + if mark == "4": + security = 1 + if mark == "6": + right = 1 + if mark == "13": + useData = 1 + if mark == "8": + specialArea = 1 + if 
mark == "9": + update = 1 + if mark == "12": + retention = 1 + + if mark == "7": + specialGroup = 1 + if mark == "0": + if lastMark != "": + mark = lastMark + lastMark = mark + for sibling in title.next_elements: + # print("sibling", sibling) + + # if len(str(sibling).split(' ')) < 5: + # continue + try: + if clean_title_list[clean_title_list.index(title) + 1] == sibling: + + break + except Exception: + continue + # if isinstance(sibling, bs4.element.Tag): + # + # continue + if str(sibling) == '\n': + + continue + if sibling == title.string: + + continue + + if clean_title_list.index(title) == len(clean_title_list) - 1: + + with open('./SEM/txt/'+pathName[:-5]+mark_txt.get(mark[0]),"a",encoding='utf-8') as f: + + if sibling.name is None or (sibling.name != 'li' and sibling.name != 'p' and sibling.name != 'br' and isinstance(sibling, bs4.element.Tag)): + continue + if sibling.name == 'li': + + if sibling.find_previous('p'): + + # p_text = sibling.find_previous('p').text.strip() + parent = ' '.join(sibling.find_previous('p').text.split()) + text = ' '.join(sibling.get_text().split()) + currentSibing = f"{parent} {text}" + # if currentSibing[-1].isalpha() or currentSibing[-1] == ")": + # currentSibing = currentSibing + "." + # g.write(currentSibing) + # print("Found ul after a p tag with text:", parent) + else: + # currentSibing = str(sibling) + currentSibing = ' '.join(sibling.get_text().split()) + else: + # currentSibing = str(sibling) + currentSibing = ' '.join(sibling.get_text().split()) + # currentSibing = str(sibling) + if len(currentSibing) != 0: + if currentSibing[-1].isalpha() or currentSibing[-1] == ")": + currentSibing = currentSibing + "." + elif currentSibing[-1] == ";" or currentSibing[-1] == ":" or currentSibing[-1] == ",": + currentSibing = currentSibing[:-1] + currentSibing = currentSibing + "." 
+ + f.write(currentSibing) + f.write("\n") + f.close() + + else: + + with open('./SEM/txt/'+pathName[:-5]+mark_txt.get(mark[0]),"a",encoding='utf-8') as g: + + if sibling.name is None or (sibling.name != 'li' and sibling.name != 'p' and sibling.name != 'br' and isinstance(sibling, bs4.element.Tag)): + continue + if sibling.name == 'li': + + if sibling.find_previous('p'): + + # p_text = sibling.find_previous('p').text.strip() + parent = ' '.join(sibling.find_previous('p').text.split()) + text = ' '.join(sibling.get_text().split()) + currentSibing = f"{parent} {text}" + # if currentSibing[-1].isalpha() or currentSibing[-1] == ")": + # currentSibing = currentSibing + "." + # g.write(currentSibing) + # print("Found ul after a p tag with text:", parent) + else: + # currentSibing = str(sibling) + currentSibing = ' '.join(sibling.get_text().split()) + else: + # currentSibing = str(sibling) + currentSibing = ' '.join(sibling.get_text().split()) + # currentSibing = str(sibling) + if len(currentSibing) != 0: + if currentSibing[-1].isalpha() or currentSibing[-1] == ")": + currentSibing = currentSibing + "." + elif currentSibing[-1] == ";" or currentSibing[-1] == ":" or currentSibing[-1] == ",": + currentSibing = currentSibing[:-1] + currentSibing = currentSibing + "." + g.write(currentSibing) + g.write("\n") + g.close() + + return type,security,right,specialArea,specialGroup,update,retention,useData + +def write_text_without_label(text, pathName): + with open('./txt/' + pathName[:-5] + '/data_types.txt', "a", encoding='utf-8') as f: + currentSibing = str(text) + # print("currentSibing", currentSibing) + if currentSibing[-1].isalpha() or currentSibing[-1] == ")": + currentSibing = currentSibing + "." + elif currentSibing[-1] == ";": + currentSibing[-1] = "." 
+ f.write(currentSibing) + f.close() + +def removeUnneccessaryElements(soup): + for script in soup(["script", "style", "nav", "footer", "header", "img", "option", "select", "head", "button"]): + script.extract() # rip it out + for div in soup.find_all("div", {'class': 'footer'}): + div.decompose() + for div in soup.find_all("div", {'class': re.compile(r"sidebar")}): + div.decompose() + for div in soup.find_all("div", {'data-testid': re.compile(r"ax-navigation-menubar")}): + div.decompose() + for div in soup.find_all("div", {'class': re.compile(r"menu")}): + div.decompose() + for li in soup.find_all("li", {'class': re.compile(r"menu")}): + li.decompose() + for p in soup.find_all("p", {'class': re.compile(r"heading")}): + p.decompose() + for p in soup.find_all("p", {'class': re.compile(r"fw-bold")}): + p.decompose() + for ul in soup.find_all("ul", {'class': re.compile(r"menu")}): + ul.decompose() + for div in soup.find_all("div", {'class': re.compile(r"header")}): + div.decompose() + for div in soup.find_all("div", {'data-referrer': re.compile(r"page_footer")}): + div.decompose() + for div in soup.find_all("div", {'id': 'footer'}): + div.decompose() + for div in soup.find_all("div", {'id': re.compile(r"sidebar")}): + div.decompose() + for div in soup.find_all("div", {'id': re.compile(r"menu")}): + div.decompose() + for li in soup.find_all("li", {'id': re.compile(r"menu")}): + li.decompose() + for ul in soup.find_all("ul", {'id': re.compile(r"menu")}): + ul.decompose() + for div in soup.find_all("div", {'id': re.compile(r"header")}): + div.decompose() + for div in soup.find_all("div", {'id': re.compile(r"breadcrumbs")}): + div.decompose() + for div in soup.find_all("div", {'id': re.compile(r"instagram")}): + div.decompose() + for div in soup.find_all("div", {'role': re.compile(r"navigation")}): + div.decompose() + for div in soup.find_all("div", {'role': re.compile(r"banner")}): + div.decompose() + for div in soup.find_all("div", {'role': re.compile(r"button")}): + 
div.decompose() + for div in soup.find_all("ul", {'role': re.compile(r"navigation")}): + div.decompose() + +def makeCoarseSegments(soup): + segments = [] + for p in soup.find_all("p"): + if p.find_next() is not None: + if p.find_next().name != "ul": + # segments.append(' '.join(p.get_text().split())) + text = ' '.join(p.get_text().split()) + + if len(text) != 0: + if text[-1].isalpha() or text[-1] == ")": + text = text + "." + elif text[-1] == ";" or text[-1] == ":" or text[-1] == ",": + text = text[:-1] + text = text + "." + + segments.append(text) + + listSplitter = [] + + for ul in soup.find_all("ul"): + if ul.find_previous('p') is not None: + parent = ' '.join(ul.find_previous('p').text.split()) + for element in ul.findChildren('li'): + text = ' '.join(element.get_text().split()) + listElement = f"{parent} {text}" + + if len(listElement) != 0: + if listElement[-1].isalpha() or listElement[-1] == ")": + listElement = listElement + "." + elif listElement[-1] == ";" or listElement[-1] == ":" or listElement[-1] == ",": + listElement = listElement[:-1] + listElement = listElement + "." + + segments.append(listElement) + else: + for element in ul.findChildren('li'): + text = ' '.join(element.get_text().split()) + + if len(text) != 0: + if text[-1].isalpha() or text[-1] == ")": + text = text + "." + elif text[-1] == ";" or text[-1] == ":" or text[-1] == ",": + text = text[:-1] + text = text + "." 
+ + segments.append(text) + + # if not segments: + # text = soup.getText().replace('\n', '').replace('↵', '') + # result = useAlgorithm(text) + # else: + # # text = " ".join(segments) + # # print("TEXT??", text) + # print("SEGMENTS??", segments) + # result = segments + result = segments + return result diff --git a/SEM/model/para_model.pkl b/SEM/model/para_model.pkl new file mode 100644 index 0000000000000000000000000000000000000000..944d280707b50b57fed58986e0536d02615e6c21 --- /dev/null +++ b/SEM/model/para_model.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6575c12bd9b6f1c14e5a575fc05d0d866e1a62ff2a5946b41b875336f34f981 +size 102673 diff --git a/SEM/model/sen_model.pkl b/SEM/model/sen_model.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c2d93883d26c5185c21b37ddb3f421711ccc727f --- /dev/null +++ b/SEM/model/sen_model.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c23523e5c2efe9cf7fb4292b18fbf0374d6907f443270b90cbc4beebc145e71c +size 29369 diff --git a/SEM/paragraph_bayesian.py b/SEM/paragraph_bayesian.py new file mode 100644 index 0000000000000000000000000000000000000000..f1e123efe640bed47aa2c861f6ad3926c692f629 --- /dev/null +++ b/SEM/paragraph_bayesian.py @@ -0,0 +1,52 @@ +import csv +import joblib + +from sklearn.naive_bayes import MultinomialNB + +from SEM.text_preprocessing import pre_process_title +from sklearn.feature_extraction.text import TfidfVectorizer + + + +def readtrain(): + with open('SEM/training_data/title.csv', 'rt') as csvfile: + reader = csv.reader(csvfile) + column1 = [row for row in reader] + content_train = [i[0] for i in column1[1:]] + opinion_train = [i[1] for i in column1[1:]] + train = [content_train, opinion_train] + return train + +def segmentWord(cont): + c = [] + for i in cont: + clean_text = pre_process_title(i) + c.append(clean_text) + return c + +train = readtrain() +content = segmentWord(train[1]) + +textMark = train[0] + +train_content = 
content[:] +# test_content = content[450:508] +train_textMark = textMark[:] +# test_textMark = textMark[450:508] + +tf = TfidfVectorizer(max_df=0.5) + +train_features = tf.fit_transform(train_content) + +load_pretrain_model = True + +if not load_pretrain_model: + + clf = MultinomialNB(alpha=0.1) + clf.fit(train_features,train_textMark) + + joblib.dump(clf, 'SEM/model/para_model.pkl') +else: + clf = joblib.load('SEM/model/para_model.pkl') + + diff --git a/SEM/phrase_similarity.py b/SEM/phrase_similarity.py new file mode 100644 index 0000000000000000000000000000000000000000..916e5f0bf0bf78181b8873367dbb3459cc60178f --- /dev/null +++ b/SEM/phrase_similarity.py @@ -0,0 +1,58 @@ + +from nltk.corpus import wordnet as wn + +def wordnetSim3(word1, word2): + totalPoint = 0 + simList = [] + phrase1 = word1 + phrase2 = word2 + word1 = phrase1.split(' ') + word2 = phrase2.split(' ') + for w1 in word1: + for w2 in word2: + synsets1 = wn.synsets(w1) + synsets2 = wn.synsets(w2) + path_sim = 0 + for tmpword1 in synsets1: + for tmpword2 in synsets2: + try: + sim = tmpword1.path_similarity(tmpword2) + path_sim = max(path_sim, sim) + except Exception as e: + continue + simList.append(path_sim) + for sim in simList: + totalPoint += sim + min_len = min(len(word1),len(word2)) + result = totalPoint/min_len + return result +# print(wordnetSim3("location data", "geographical position information")) + +def wordnetSim_modified(word1, word2): + totalPoint = 0 + simList = [] + phrase1 = word1 + phrase2 = word2 + word1 = phrase1.split(' ') + word2 = phrase2.split(' ') + for w1 in word1: + for w2 in word2: + synsets1 = wn.synsets(w1) + synsets2 = wn.synsets(w2) + path_sim = 0 + for tmpword1 in synsets1: + for tmpword2 in synsets2: + try: + sim = tmpword1.path_similarity(tmpword2) + path_sim = max(path_sim, sim) + except Exception as e: + continue + simList.append(path_sim) + for sim in simList: + totalPoint += sim + + result = totalPoint/len(simList) + + # result = max(simList) + + return result 
diff --git a/SEM/pp_example/1.html b/SEM/pp_example/1.html new file mode 100644 index 0000000000000000000000000000000000000000..14d27062672b14c4904bc31ca2de4b895ddf7382 --- /dev/null +++ b/SEM/pp_example/1.html @@ -0,0 +1,717 @@ + + + + + + + + + + + + + + + + + + + + + + + + GMA Privacy Policy | McDonald's Australia + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ + +
+
+ +
+
+ +
+ + +

GMA Privacy Policy

+

+
+
+ +
+
+ + +
+
+

McDonald’s is committed to respecting your personal information. Our privacy policy sets outs out how we collect, use, store and disclose your personal information. When you use our Websites Apps, or provide your personal information to us, you consent to your personal information being collected, held, used and disclosed as set out in our privacy policy.

+

 

+

Information we collect and hold

+

McDonald’s collects personal information about you in a number of ways, including when you:

+
  • use our websites (including www.mcdonalds.com.au), social media pages, and internal websites or intranet (Website);
  • +
  • use our mobile and tablet Apps (Apps); and
  • +
  • interact with us and provide personal information by any other means, including either physically or electronically,
  • +

(Collection Channels). 

+

Personal information that McDonald’s collects and holds may include your name, email address, delivery address, date of birth, phone number, payment method, social media handles, photographs of you and other identifying information you choose to provide via a particular Collection Channel.

+

When you use a Website or App, we may also collect personal information about you in the following general categories:

+
  • Location information: If you permit an App to access location services in your settings, then we collect your device location App to deliver your order or to send you alerts.
  • +
  • Transaction information: We collect your transaction details when you place an order via a Website or App, including the products you have ordered, the date and time of your order, the amount charged and your loyalty entitlements.
  • +
  • Usage and preferences: We collect information about how you interact with our Websites or Apps, including the pages you visit, your preferences and the settings that you choose. We do this through cookies and other similar technology.
  • +
  • Device information: We collect information about your device, such as the hardware model, operating system, preferred language, unique device identifier and mobile network.
  • +
  • Employee information: If you are a job applicant, an employee in one of our restaurants or our corporate offices, or a former employee, and use a Website or App, we collect information about the training modules you have completed, the forms you have submitted, the approvals you have given or received, and other similar information related to your job.
  • +
  • Other information: We also collect and log information such as your IP address, access dates and times, browser type and pages visited when you interact with a Website or App.
  • +

We also collect personal information about you from third parties, including when:

+
  • you choose to create an account or register for a product or service via a Website or App using a social media platform (e.g. Facebook);
  • +
  • you have consented to a third party disclosing your personal information to us (e.g. when you enter a competition or promotion run by a third party for us); and
  • +
  • it is otherwise lawful for a third party to disclose your personal information to us.
  • +

We also collect personal or anonymous information about you from other sources and sometimes combine that information with other information collected from you or from third parties for the purposes disclosed in this privacy policy.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

How McDonald’s collects and holds personal information

+

McDonald’s will only collect or monitor any personal information about you as provided in this privacy policy.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Active information collection

+

McDonald’s may collect your personal information via our Collection Channels when you:

+
  • purchase a product or make a booking through a Website or App;
  • +
  • participate in any offers, marketing activities, loyalty or rewards program or promotional activities;
  • +
  • contact us or provide us with personal information directly via any medium including a Website or App, SMS or other message service and email, social media platforms, mail, telephone or in person;
  • +
  • interact with a Website or App for a specific purpose;
  • +
  • browse a Website or App generally;
  • +
  • sign-up to, or register an account via any Collection Channel; or
  • +
  • apply for employment with McDonald’s.
  • +
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Platform permissions

+

Mobile platforms such as iOS and Android may define certain types of information or data that our Apps cannot access without your consent. Each platform has its own permissions system for obtaining your consent. For example, the iOS platform may alert you the first time an App wants your permission to access certain types of data (e.g. location services) and will provide you option to consent to that request. Android devices may notify you of the permissions that an App seeks before you first use the App and your subsequent use of the App constitutes your consent. You can usually manage your platform level permissions via the Settings section on your device. For more information, please contact your device provider or refer to the user manual for your device.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Privacy Policy 

+

McDonald’s Privacy Policy contains information about how you can access and correct your personal information, how you can lodge a complaint regarding the handling of your personal information and how any complaint will be handled by McDonald’s. You may contact McDonald’s with any queries via email: privacy@au.mcd.com or at McDonald's Australia Limited (Attention: McDonald's Privacy Officer), PO Box 392 Pennant Hills NSW 2120 Australia.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Cookies and passive information collection

+

We may use tracking technologies to collect personal information about you when you use and access a Websites or App, including cookies, internet tags or web beacons, and navigational data collection (e.g. log files, server logs, and clickstream data). For example, we may collect information about the date, time and duration of visits and which pages of a Website or App are most commonly accessed. This browsing information is generally not linked to your identity, except where you access a Website or App via links in a message we have sent or where we are able to user accessing a Website or App.

+

We may combine your anonymous information, browsing information or other information collected through tracking technologies with your personal information collect via our Collection Channels in order to understand and remember your preferences and interests. By accessing a Website or App via links and/or by accessing a Website or App where you have identified yourself, you consent to the collection of this information.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Purposes for which McDonald’s collects, holds, uses and discloses personal information

+

We collect, hold, use and disclose your personal information for our primary purposes, including:

+
  • for the purposes stated on a particular Collection Channel;
  • +
  • to maintain and improve the functionality of a Website or App;
  • +
  • to fulfil obligations in respect of any sale and purchase contract and/or any other contract between you and McDonald’s;
  • +
  • to manage your orders or facilitate payment, for example, when you use our App the drive thru screen and kiosk will display your name and crew members will greet you by name.
  • +
  • to send you any technical, administrative or legal notices important to our Websites and Apps;
  • +
  • to provide you with information about your transactions and loyalty entitlements;
  • +
  • to provide marketing materials and information about our products and services, events, special offers, competitions and/or promotions, or to request your feedback for promotional purposes;
  • +
  • to respond to customer enquiries or complaints;
  • +
  • to manage your employment or process your application for employment with McDonald’s (including McDonald’s franchisees) and to facilitate effective employment practices;
  • +
  • to obtain opinions or comments about products and/or services and to conduct other market research and development (including to record statistical data for marketing analysis);
  • +
  • to enter you into and administer promotions;
  • +
  • to provide, maintain and improve our products and services;
  • +
  • to customise a Website or App based on your preferences;
  • +
  • to allow you to use  a Website or App;
  • +
  • to share with trusted third parties including professional service providers, our related bodies corporate, our franchisees, our suppliers and our promotional partners and other trusted third parties (and their directors, servants and agents) and agencies (McDonald’s Family); and
  • +
  • to share with your social media communities, to the extent allowed by you.
  • +
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Not providing information

+

You don’t have to provide any personal information to us. However, if you do not do so, this may affect or completely restrict your ability to use a Website or App and our ability to provide you with relevant content, products and services.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Sharing your personal information

+

McDonald's shares personal information with the global McDonald’s Family for the purposes described in this privacy policy

+

McDonald’s recognises the trust with which you provide personal information, and except as stated in this privacy policy, your information will not be used or disclosed for any other purposes without your consent. However, McDonald's reserves the right to use or disclose any information, including personal information, as needed to satisfy any law, regulation or legal request, to protect the rights or property of McDonald's, any member of the McDonald's Family, or any member of the public, to protect the integrity of a Website or App, to fulfil your requests, or to cooperate in any law enforcement investigation or an investigation on a matter of public safety.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Contact by McDonald’s and third parties

+

If you would like to opt out of receiving advertising communications from us, the McDonald’s Family and our trusted third parties, you can unsubscribe.

+

We may still send you transaction and administrative information.

+

If you no longer wish to receive any communications from McDonald’s via an App, you can delete the App from your device.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Ability of others to view information

+

McDonald’s provides areas on Websites and Apps where you can upload user-generated content, post or provide information about yourself, communicate with other users, provide reviews for content, products and/or services or interact with or vote on particular content. This information may be publicly posted on a Website or App and/or shared with others, including social media platforms and other public forums in which you choose to participate. This information may become publicly available and may be read, collected and used by others outside of a McDonald’s Website or App. McDonald’s is not responsible for the conduct of others who may read, collect and use this information.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Children

+

McDonald's is very sensitive to privacy issues. We are proud of our long-time commitment to our customers. McDonald’s does not intend to collect personal information from any person under the age of 18 years without the consent of a parent or legal guardian. We urge parents to regularly monitor and supervise their children's on-line activities.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Security of personal information

+

McDonald’s will endeavour to take all reasonable steps to protect your personal information. All information is passed through to a secure server using encryption technology and stored on secure servers that are protected in controlled facilities, which in some cases may be overseas. McDonald's employees and data processors are obliged to respect the confidentiality of any personal information held by McDonald's. However, McDonald’s cannot guarantee the security of your personal information and will not be held responsible for events arising from unauthorised access to personal information beyond McDonald's reasonable control.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Disclosure of personal information to overseas recipients

+

In some cases, McDonald’s may disclose your personal information to overseas recipients, including but not limited to recipients in the United States of America, Japan, Malaysia and Singapore. McDonald’s employees and data processors are obliged to respect the confidentiality of any personal information held by McDonald’s.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Access to personal information

+

You are in control of any personal information you provide to us. If at any time, you would like to access, review, correct and/or delete the personal information we have about you, or if you would like to change your contact preferences, you can let us know via the contact details listed below. Please allow 30 days for this request to be processed.

+

Your personal information may be stored in different locations depending upon the reason for which you originally submitted the information. If you make an inquiry in relation to your personal information, the more information you can provide us about when you originally submitted your personal information, the quicker McDonald's will be able to retrieve your personal information.

+

If requested, all reasonable steps to delete personal information will be made, except where it is required for legal reasons. Deletion of information may result in McDonald's being unable to facilitate or provide you with information about certain transactions (including the uploading of, access to, and receipt of, content on a Website or App, and purchase transactions undertaken on a Website or App), other content, services or product information, upcoming promotions, competitions or event information, and/or provide certain content, products or services.

+

We are not responsible for removing your personal information from the lists of any third party who has previously been provided your information in accordance with this privacy policy.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Links to other sites

+

Our Websites or Apps contain links to sites operated by third parties. We are not responsible for the privacy practices of, or any content on, those sites linked to our Websites and Apps. If you visit one of these linked websites, we encourage you to review their privacy and other policies.

+

We may use third party advertisements on our Websites and Apps. All third party advertising, if paid for, is paid for by the relevant third party advertisers. Third party advertisements are not recommendations or endorsements by McDonald’s or any of its affiliates. To the extent permitted by law, McDonald’s is not responsible for the content (including representations) of any third party advertisement on a Website or App. Cookies may be associated with these advertisements to enable the advertiser to track the number of anonymous users responding to the campaign.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Related McDonald's Websites or Apps

+

All Websites and Apps operated by McDonald's in Australia will adhere to this privacy policy. The policies on the Websites and Apps of some other members of the McDonald's Family may vary because of local customs, practices or laws.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Franchisee privacy policies

+

Many McDonald's restaurants are owned and operated by independent franchisees. Some franchisees also operate websites and are required to follow this privacy policy. If you are concerned that there may have been a breach of this privacy policy by a franchisee, please contact the relevant franchisee entity or McDonald’s restaurant directly.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Changes to our privacy policy

+

From time to time, it may be necessary for McDonald's to change this privacy policy without notice. We will post any changes to this privacy policy on our Websites and Apps. Rest assured, however, that any changes will not be retroactively applied and will not alter how we handle previously collected personal information.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Sale of the Company

+

If McDonald’s merges with, is acquired by another company, or sells all or a portion of its assets, your personal information may be disclosed to our advisers and any prospective purchaser’s adviser and may be among the assets transferred. However, your personal information will always remain subject to this privacy policy.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Contact Us

+

If you have any questions about our privacy policy, or any problems or complaints about how we have collected, used, stored, handled and/or disclosed your personal information, please contact us at:

+

Mail:     McDonald's Privacy Officer

+

McDonald's Australia Limited

+

PO Box 392

+

Pennant Hills NSW 2120

+

Australia

+

Email: privacy@au.mcd.com

+

Telephone: (02) 9875 6666

+

Fax: (02) 98756568

+

Please allow 14 days for this request to be processed. If you do not receive a satisfactory response from McDonald’s to your query, problem or complaint within 14 days, you may refer your query, problem or complaint to the Office of the Australian Information Commissioner via the contact details listed at https://www.oaic.gov.au/about-us/contact-us/.

+

 

+
+ + +
+ +
+
+
+
+
+
+ + +
+
+
+
+ + + + + + + + + diff --git a/SEM/region_pp_processing.py b/SEM/region_pp_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..c330973689a061c9ec515b370370048911da0d15 --- /dev/null +++ b/SEM/region_pp_processing.py @@ -0,0 +1,40 @@ +import csv +import re +import spacy +from bs4 import BeautifulSoup + +def get_alifornia(text): + specialArea = "" + california = 0 + with open(text, encoding='utf-8') as file_obj: + for line in file_obj: + specialArea += line + if "alifornia" in specialArea: + california = 1 + return specialArea,california + + +import sys +maxInt = sys.maxsize +decrement = True +while decrement: + decrement = False + try: + csv.field_size_limit(maxInt) + except OverflowError: + maxInt = int(maxInt/10) + decrement = True + + +def get_text(path): + htmlfile = open(path, 'r', encoding='utf-8') + htmlhandle = htmlfile.read() + + soup = BeautifulSoup(htmlhandle, 'html.parser') + + stri = str(soup) + return stri + + + + diff --git a/SEM/retention_pp_processing.py b/SEM/retention_pp_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..657c51b3a3f8fcc220a7b6fefd04dbdacbe9c932 --- /dev/null +++ b/SEM/retention_pp_processing.py @@ -0,0 +1,24 @@ +from types_pp_processing import cleanHtml +import spacy +nlp = spacy.load('en_core_web_sm') +def retention_process(txt): + text = "" + result = cleanHtml(txt) + for sen in result: + text += sen + time = "" + doc = nlp(text) + flag = 0 + for token in doc: + if flag == 1: + if token.text == "year" or token.text == "month" or token.text == "week" or token.text == "day" or token.text == "hour": + time += " " + token.text + break + else: + flag = 0 + if token.pos_ == "NUM": + flag = 1 + time = token.text + if time == "": + time = "The privacy policy does not specify how long the data will be retained" + return time,text diff --git a/SEM/run_single_sem.py b/SEM/run_single_sem.py new file mode 100644 index 
0000000000000000000000000000000000000000..5f69913fae65d44b7e8062c1b18b0ba466a0559b --- /dev/null +++ b/SEM/run_single_sem.py @@ -0,0 +1,88 @@ +import os +import time +import shutil + +from bs4 import BeautifulSoup + +from SEM.find_subtitle import find_title_Label_with_html, find_title_Label +from SEM.get_text import write_text, removeUnneccessaryElements, makeCoarseSegments +from SEM.types_pp_processing import getSentences_with_classifier + + +def run_single_pp(file): + # INPUT = "../dataset/privacy_policies_html/" + # INPUT = "./pp_example/" + # cleaning_txt("./txt") + # os.mkdir("./txt") + + result_root = "./SEM/txt/" + + if os.path.exists(result_root): + shutil.rmtree(result_root) + os.makedirs(result_root) + + # file = os.listdir(INPUT)[0] + + segmentation_start_time = time.process_time() + + pathName = "1.html" + + label = find_title_Label(file) + print("label: ", label) + print("The current file is:" + pathName) + + # if pathName != '20.html': + # continue + + para_start_time = time.process_time() + soup = BeautifulSoup(open(file,encoding='utf-8'), features="html.parser") + print("soup.contents: ", soup.contents) + title_list = soup.find_all(label) + # cleaning_txt() + print("title_list: ", title_list) + + if not os.path.exists(result_root + pathName[:-5]): + os.mkdir(result_root + pathName[:-5]) + + if len(title_list) == 0: + # write_text_without_label(soup.getText(), pathName) + removeUnneccessaryElements(soup) + result = makeCoarseSegments(soup) + for seg in result: + with open(result_root + pathName[:-5] + '/data_types.txt', "a", encoding='utf-8') as f: + f.write(seg) + f.write("\n") + else: + write_text(title_list, pathName) + print("Paragraph level processing time: %2.2f s" % (time.process_time() - para_start_time)) + + for t in title_list: + with open(result_root + pathName[:-5] + '/headings.txt', "a", encoding='utf-8') as g: + g.write(str(t)) + g.write("\n") + + # data types + if not os.path.exists(result_root + pathName[:-5] + "/data_types.txt"): + 
print("No information about data types!") + else: + sen_start_time = time.process_time() + # all_types = caculateSim("./txt/"+pathName[:-5]+"/data_types.txt") + dict_sentences, dict_index = getSentences_with_classifier(result_root + pathName[:-5] + "/data_types.txt") + print("sentence level processing time: %2.2f s" % (time.process_time() - sen_start_time)) + + os.makedirs(result_root + pathName[:-5] + "/classified_sentences") + for key in dict_sentences: + + if dict_sentences[key] == "": + continue + with open(result_root + pathName[:-5] + "/classified_sentences/" + key + ".txt", "a", + encoding='utf-8') as g: + g.write(dict_sentences[key]) + + for key in dict_index: + with open(result_root + pathName[:-5] + "/classified_sentences/keyword_index.txt", "a", + encoding='utf-8') as f: + f.write(key + ":" + str(dict_index[key]) + "\n") + + print("time cost for segmentation: %2.2f s" % (time.process_time() - segmentation_start_time)) + diff --git a/SEM/sentence_bayesian.py b/SEM/sentence_bayesian.py new file mode 100644 index 0000000000000000000000000000000000000000..021c5930d95e8ada687b71e02017377619ebddc0 --- /dev/null +++ b/SEM/sentence_bayesian.py @@ -0,0 +1,62 @@ +import csv +import joblib + + +from sklearn.metrics import f1_score, recall_score +from sklearn.naive_bayes import MultinomialNB + +from SEM.text_preprocessing import pre_process_title +from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer + + +def readtrain(): + with open('SEM/training_data/personal_type.csv', 'rt') as csvfile: + reader = csv.reader(csvfile) + column1 = [row for row in reader] + content_train = [i[0] for i in column1[1:]] + opinion_train = [i[1] for i in column1[1:]] + + train = [content_train, opinion_train] + return train + +def segmentWord(cont): + c = [] + for i in cont: + clean_text = pre_process_title(i) + c.append(clean_text) + return c + +train = readtrain() +content = segmentWord(train[1]) + +textMark = train[0] + +train_content = content[:499] +# 
test_content = content[400:499] +train_textMark = textMark[:499] +# test_textMark = textMark[400:499] + +tf = TfidfVectorizer(max_df=0.5) + +train_features = tf.fit_transform(train_content) + +load_pretrain_model = True + +if not load_pretrain_model: + + + clf_type = MultinomialNB(alpha=0.1) + clf_type.fit(train_features,train_textMark) + + joblib.dump(clf_type, 'SEM/model/sen_model.pkl') + + # test_features = tf.transform(test_content) + # print("clf test score: ", clf_type.score(test_features, test_textMark)) +else: + clf_type = joblib.load('SEM/model/sen_model.pkl') + # print("clf training score: ", clf_type.score(train_features, train_textMark)) + + # test_features = tf.transform(test_content) + # print("clf test score: ", clf_type.score(test_features, test_textMark)) + + diff --git a/SEM/text_preprocessing.py b/SEM/text_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..0ab20f1410d61e0e89c746bbc7e82390ebb10e88 --- /dev/null +++ b/SEM/text_preprocessing.py @@ -0,0 +1,194 @@ +import re +import nltk +from nltk.corpus import stopwords +from nltk import word_tokenize,pos_tag +from nltk.stem import WordNetLemmatizer +from nltk.corpus import wordnet +nltk.download('stopwords') +nltk.download('punkt') +nltk.download('averaged_perceptron_tagger') +nltk.download('wordnet') +nltk.download('omw-1.4') + +def tokenize(sentence): + + sentence = re.sub(r'\s+', ' ', sentence) + token_words = word_tokenize(sentence) + token_words = pos_tag(token_words) + return token_words + + +wordnet_lematizer = WordNetLemmatizer() +def stem(token_words): + + words_lematizer = [] + for word, tag in token_words: + if tag.startswith('NN'): + word_lematizer = wordnet_lematizer.lemmatize(word, pos='n') + elif tag.startswith('VB'): + word_lematizer = wordnet_lematizer.lemmatize(word, pos='v') + elif tag.startswith('JJ'): + word_lematizer = wordnet_lematizer.lemmatize(word, pos='a') + elif tag.startswith('R'): + word_lematizer = wordnet_lematizer.lemmatize(word, 
pos='r') + else: + word_lematizer = wordnet_lematizer.lemmatize(word) + words_lematizer.append(word_lematizer) + return words_lematizer + +def delete_invalid_word(token_words): + valid_word = [] + for word in token_words: + if len(wordnet.synsets(word)) > 0: + valid_word.append(word) + return valid_word + +sr = stopwords.words('english') +sr.append("limited") +sr.append("additionnaly") +sr.append("e.g") +sr.remove("other") +sr.remove("than") +sr.remove("not") +sr.remove("you") +sr.remove("and") +sr2 = stopwords.words('english') +def delete_stopwords(token_words): + + cleaned_words = [word for word in token_words if word not in sr] + return cleaned_words + +def delete_stopwords2(token_words): + + cleaned_words = [word for word in token_words if word not in sr2] + return cleaned_words + +def delete_adjwords(token_words): + + cleaned_words = [word for word in token_words if word not in sr] + return cleaned_words + + +def is_number(s): + + try: + float(s) + return True + except ValueError: + pass + + try: + import unicodedata + unicodedata.numeric(s) + return True + except (TypeError, ValueError): + pass + + return False +characters_title = [' ','.',',','|' , ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-','...','^','{','}'] +characters = [' ','|' , ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-','...','^','{','}'] +characters_proposal = [' ','|' , '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-','...','^','{','}'] +def delete_characters(token_words): + + words_list = [word for word in token_words if word not in characters] + return words_list +def delete_characters_proposal(token_words): + + words_list = [word for word in token_words if word not in characters_proposal and not is_number(word)] + return words_list + +def delete_characters_title(token_words): + + words_list = [word for word in token_words if word not in characters and not is_number(word)] + return words_list + + +def to_lower(token_words): + 
+ words_lists = [x.lower() for x in token_words] + return words_lists + +def pre_process_title(text): + + token_words = tokenize(text) + + token_words = stem(token_words) + + token_words = delete_invalid_word(token_words) + + token_words = delete_characters_title(token_words) + + token_words = to_lower(token_words) + + return ' '.join(token_words) + +def pre_process(text): + + token_words = tokenize(text) + + token_words = stem(token_words) + + token_words = delete_stopwords(token_words) + + token_words = delete_characters(token_words) + + token_words = to_lower(token_words) + + return ' '.join(token_words) + +def pre_process_type(text): + + token_words = tokenize(text) + + token_words = stem(token_words) + + token_words = delete_stopwords2(token_words) + + token_words = delete_characters(token_words) + + token_words = to_lower(token_words) + + return ' '.join(token_words) + + +def pre_process_proposal(text): + + token_words = tokenize(text) + + token_words = stem(token_words) + + token_words = delete_stopwords(token_words) + + token_words = delete_characters_proposal(token_words) + + token_words = to_lower(token_words) + + return ' '.join(token_words) + +def pre_process_list(text): + + token_words = tokenize(text) + + token_words = stem(token_words) + + token_words = delete_stopwords(token_words) + + token_words = delete_characters(token_words) + + token_words = to_lower(token_words) + + return token_words + +def pre_process_stop(text): + + token_words = tokenize(text) + + token_words = stem(token_words) + + token_words = delete_characters(token_words) + + token_words = to_lower(token_words) + + text = ' '.join(token_words) + final_text = text.split(".") + return final_text diff --git a/SEM/training_data/personal_type.csv b/SEM/training_data/personal_type.csv new file mode 100644 index 0000000000000000000000000000000000000000..0a249f96105d3e35187da5c6484676b717e252d9 --- /dev/null +++ b/SEM/training_data/personal_type.csv @@ -0,0 +1,499 @@ +label,sentence, +0,"the 
information we collect helps us deliver, personalize and continually improve your alexa internet experience and provide information about internet trends, website popularity and traffic, and related content", +0, here are the types of information we gather, +0,information you give us we may receive and store any information you enter on our website or give us in any other way, +0, click here to see examples of what we collect, +0," you can choose not to provide certain information, but then you might not be able to take advantage of many of our features", +0," we use the information you provide for purposes such as responding to your requests, providing, customizing and improving our services, communicating with you, complying with our legal obligations, and helping to protect the security of our customers, alexa internet, and others", +0,"automatic information we collect from the toolbar service when you use the toolbar service, we collect and store certain information such as the websites you visit and the advertisements you see on those websites, the searches you perform using search engines or the search function if included in your version of the toolbar service, the results of your searches, the ways in which you use various toolbar service features including whether the toolbar service is still installed on your computer, and your internet protocol ip address", +0, click here to see additional examples of what we collect, +0," although this information may contain some personally identifiable information, alexa internet does not use it to determine the identity of individual users or correlate it to your e-mail address or other personally identifiable information that you give us or our affiliates, except as may be required by subpoenas, court orders, or other legal requirements", +0," we use the information collected through the toolbar service for purposes such as to provide you with information about the websites you are visiting, for example their 
popularity, similar sites, and load times, and to compile aggregate statistics and other indicators of web usage and traffic and internet trends that we may make available as part of our toolbar service or other offerings", +0,other automatic information we collect from our website and other content we receive and store certain types of information whenever you interact with us, +0," for example, like many websites, we use ""cookies,"" and other unique identifiers, and we may obtain certain types of information when your browser accesses alexa", +0,com or when you interact with our features or content served by or on behalf of us, +0," we use this information for purposes such as responding to your requests, providing a seamless customer experience, and customizing and improving our services", +0," in addition, to help us make e-mails more useful and interesting, we often receive a confirmation if your device supports such capabilities when you open e-mails from alexa internet or click on a link in one of our e-mails", +0, click here to see examples of the information we receive, +0, you can choose not to receive marketing e-mails from us by clicking on the unsubscribe link in any marketing e-mail, +0,"automatic information we collect from other websites alexa internet also offers products, such as the alexa internet measurement pixel, that help website operators analyze traffic to their websites__or example, to determine the total number of visitors to a site", +0, we receive and store certain types of personal information when you interact with such third-party websites that use our pixel or other technologies, +0, click here for examples of such information, +0," because alexa internet processes this information on behalf of the applicable website operators, collection, processing, and use of such information is subject to the applicable website operators_ privacy policies and is not covered by our privacy notice", +0," for more information about how you may 
choose to prevent an alexa internet measurement pixel from recognizing your computer, click here", +0, information from other sources we may receive information about you from other sources, +0, click here to see examples of such information, +0," we use this information to compile aggregate statistics and other indicators of internet trends, website popularity and traffic, popularity of internet content and topics, and related content", +0, information you give us when you use our servicesif you register for an account at alexa, +1,"com, sign up for or subscribe to an alexa internet service other than the toolbar service, or download or request alexa internet content, we may collect information such as your name, address, e-mail address, country, nickname, telephone number, website, company, job title, and credit card information", +1," you may also provide us with similar information when you communicate with us by phone, e-mail, or otherwise or complete a questionnaire, a support ticket, or a contest entry form", +0,"automatic information we collect from the toolbar servicefor every web page you view while using the toolbar service, the alexa internet software transmits and stores information such as your ip address, the full uniform resource locator url clickstream to, through, and from the web page you are visiting, general information about your browser, general information about your computer's operating system, a unique identifier enabling alexa internet to recognize your toolbar service, and the date and time the above information is logged", +0," the toolbar service also automatically collects information about the online advertisements that are displayed on the websites you visit including, the text, source, and url of the online ad, and the terms you choose when you use search engines to search the web, and the results of your searches", +0, alexa internet also periodically collects usage statistics from toolbar service that describe the frequency of 
use of each toolbar service feature, +0," finally, to ensure accurate information about the number of toolbar service users, the toolbar service periodically sends a signal to alexa internet containing basic information such as the ip address of your computer", +1," automatic information we collect from our websiteexamples of the information we collect and analyze when you use our website or interact with our services include the ip address used to connect your device to the internet; device and connection information such as browser type, browser engine, version, and time zone setting; browser plug-in types and versions; operating system; purchase e", +0,", subscription history; the full url clickstream to, through, and from our website, including date and time; cookie number; web pages you viewed or searched for; and the phone number you used to call us", +0," we may also use browser data such as cookies or similar data on certain parts of our website for fraud prevention, to provide a seamless customer experience, and for other purposes", +1," during some visits to our website we may use software tools such as javascript to collect information about your engagement with our content, including page response times, download errors, length of visits to certain pages, page interaction information such as scrolling, clicks, downloads, and mouse-overs, and methods used to browse away from the page", +1," automatic information we collect from other websitesexamples of automatic information we collect from third-party websites on behalf of website operators for their traffic analysis include the account identifier of the website operator, the urls of web pages visited, any referring urls to those web pages, the ip addresses and browser information, such as browser type, browser engine, operating system, and version, of website visitors, and a unique identifier for each website visitor", +0,"information from other sourcesan example of information we might receive from 
other sources is when you authorize a third-party service, such as facebook__ login with facebook feature, to interact directly with our website to provide or receive personalized information about you", +0," in that case, we may receive such information as your name and other information used by that third-party website to identify your account with that website for example, when you use login with facebook to access your alexa internet account", +0,"additional examples of information from other sources include information we may receive from our partners to improve our web information services, which may include truncated ip addresses and unique identifiers for website visitors; browser information; the full url clickstream for websites visited along with advertisements on those websites, and the searches performed using search engines along with their results", +0," we may also receive certain public information from social media services, such as reddit and twitter, which may include public posts, likes, and similar engagement with content publicly posted on these sites""0","in general, you can browse house hacker without giving us any personal information" +0, we do analyze traffic to our website in order to understand our customer and visitors needs and to continually improve our site for them, +0," house hacker collects only anonymous, aggregate statistics", +0," for example, we do not tie a specific visit to a specific visitor", +0,see the next section for more regarding the personal information that you voluntarily send to us, +0,"this privacy policy does not apply to content, business information, ideas, concepts or inventions that you send to house hacker by email, forum post, or blog comment", +0," if you want to keep content or business information, ideas, concepts or inventions private or proprietary, do not send them in an email to us or post them to our forum", +1," emails you send to house hackerwhen you contact house hacker, you are providing us 
with at minimum your email address", +0, you may optionally provide us with additional information such as your real name or the company you represent, +0, house hacker will keep this information in confidence and not share it with third parties, +0, you will not be added to mailing lists unless requested, +1," subscribing to a house hacker accountwhen you subscribe to a house hacker product, you must provide us with your name, email address and a password", +0, this is the minimum information needed for us to provide the selected house hacker service, +0," when you create an account, you can optionally provide us with additional information such as your real name", +0, registering an account with this data is done through secure sockets layer technology ssl, +0," ssl is a proven secure communication protocol that lets your browser automatically encrypt, or scramble, data before you send it to us", +0," we follow generally accepted industry standards to protect the personal information submitted to us, both during transmission and once we receive it", +0," please note that no method of transmission over the internet, or method of electronic storage, is 100% secure, however", +0," therefore, while we strive to use commercially acceptable means to protect your personal information, we cannot guarantee its absolute security", +0,"under no circumstances will we hold sensitive payment details such as your credit card number, expiration date or security code", +0," all transactions are handled through one of our accredited payment bureaus, currently paypal", +0, all communication between house hacker and paypal occur through secure sockets layer technology ssl, +0," ssl is a proven secure communication protocol that lets your browser automatically encrypt, or scramble, data before you send it to us", +0," we follow generally accepted industry standards to protect the personal information submitted to us, both during transmission and once we receive it", +0," please note 
that no method of transmission over the internet, or method of electronic storage, is 100% secure, however", +0," therefore, while we strive to use commercially acceptable means to protect your personal information, we cannot guarantee its absolute security", +0, for more information on paypal's privacy policy please refer to http//www, +1,"questions, queries or feedback you leave, including your email address if you contact usyour email address and subscription preferences when you sign up to our email alerts including opt out to the extent permittedhow you use our emails _ for example whether you open them and which links you click onyour internet protocol ip address, online user identification profile, browsing behavior including visits, pages viewed, clicks, time spent, details of which version of web browser you used, and your views of our digital advertisements when browsing other sitesinformation on how you use the site, using cookies and page tagging techniques see use of cookies belowyour log in credentials, such as your user name, when you access a restricted content sectionorganization details, job title, and phone numbersaccount details, registration information including meal preferences and emergency contact information, user preferences, course details, support ticket information, support documentation and attachments, communication history, and survey information", +0,we collect payment information for purchases through third parties see the section titled __hat we do with your data_ for more information, +1,"your name, company name, email address, address information, and other contact info", +1," generally, this includes your name, address, social security number, date of birth, account numbers and personal financial information e", +1,", net worth, income, financial assets", +1," we may also have access to other sensitive information, such as credit scores, income tax information, insurance and medical information, and so forth", +0,we need 
certain information to analyze the financial situation of prospective clients and to provide investment advisory and management services to clients, +0,what information do we collect?we need certain information to analyze the financial situation of prospective clients and to provide investment advisory and management services to clients, +1," if you purchase kinertia__ products and services, we collect billing and credit card information", +0, this information is used to complete the purchase transaction, +0, we may gather additional personal or non-personal information in the future, +0, information about your computer hardware and software may be automatically collected by kinertia, +1," this information can include your ip address, browser type, domain names, access times and referring website addresses", +0," this information is used for the operation of the service, to maintain quality of the service, and to provide general statistics regarding use of the kinerita website", +0, information about your computer hardware and software may be automatically collected by kinertia, +0, kinertia encourages you to review the privacy statements of websites you choose to link to from kinertia so that you can understand how those websites share your information, +0, kinertia is not responsible for the privacy statements or other content on websites outside of the kinertia website, +1,"collection of your personal informationkinertia may collect personally identifiable information, such as your name", +0," information collectedwhat we collectwhat we collectwhat we collectspokenlayer and our service providers collect personal data to offer services you have requested or that we have a legitimate interest to believe are of interest to our customers, to manage the relationships we have with our customers and partners, and to perform activities based on your consent", +0," spokenlayer and our service providers may collect the following information from and about youspokenlayer 
and our service providers collect personal data to offer services you have requested or that we have a legitimate interest to believe are of interest to our customers, to manage the relationships we have with our customers and partners, and to perform activities based on your consent", +0, spokenlayer and our service providers may collect the following information from and about you3, +0,"1 personal data""personal data"" for purposes of this privacy policy is the information that identifies you as an individual or relates to an identifiable person", +0,"""personal data"" for purposes of this privacy policy is the information that identifies you as an individual or relates to an identifiable person", +1," the personal data you submit may include your name, postal address including billing and shipping addresses, credit or debit card number, telephone number, mobile number, email address, gender, username, or profile picture", +1,"2 personal data we receive from third partiesto the extent permitted by applicable law, we may receive personal data about you from third parties, such as social media services, commercially available sources, content partners, business partners, and, if applicable to you, the third party provider of your access to spokenlayer services", +1," the personal data we receive from third parties may include your user id with the third party service, location, ip address", +1,"to the extent permitted by applicable law, we may receive personal data about you from third parties, such as social media services, commercially available sources, content partners, business partners, and, if applicable to you, the third party provider of your access to spokenlayer services", +1,"""other information"" is any information that is not intended to reveal your specific identity to us, such as browser information, usage data, information collected through cookies and other technologies, demographic information, geolocation data obtained with your consent where 
required by applicable law, and aggregated information", +1," this may include your password, birthday, education, occupation, financial information such as your income, investments, portfolio transactions and value, and other information, and interests", +0,we collect other information when you submit it to us, +0,2 other information we receive from third parties3, +0,3 other information collected automaticallyour services use cookies and other tracking technologies to function effectively and deliver certain features, +0," for more information about how we use cookies and tracking technologies, please refer to our cookie policy", +0,our services use cookies and other tracking technologies to function effectively and deliver certain features, +0,"the spokenlayer services may also be linked to sites or apps operated by third parties, and may carry advertisements or offer content, special offers, functionality, games or applications developed and maintained by third parties, using iframes, tools, or plug-ins ""linked services""", +0, these third party linked services may use automated means to collect information about you and your use of these features, +0,"spokenlayer is not responsible for the privacy practices of such third parties, and once you leave the spokenlayer services or click an advertisement, or sign up for a special offer, you should check the applicable third party privacy policy", +0,4 children under the age of 13spokenlayer does not knowingly collect personal identifiable information from children under the age of thirteen 13 without verifiable parental consent, +1," examples of personal data that we may collect from you, either directly when you engage with smartkarma or its group companies, or otherwise through your use of the site or mobile app, includeprofessional and personal details such as your name, gender, contact details, contact preferences and ip address;details of transactions or interactions you carry out through our site and/or 
mobile app;details of your visits to our site including, but not limited to, traffic data, location data, weblogs and other communication data and the resources and links that you access; andrecords in connection with service provision to users including records of correspondence and records of use of the site and other related services, account details, records of agreements and other transactions, notes from interactions and information used for monitoring and regulatory compliance", +1,"professional and personal details such as your name, gender, contact details, contact preferences and ip address;professional and personal details such as your name, gender, contact details, contact preferences and ip address;details of transactions or interactions you carry out through our site and/or mobile app;details of transactions or interactions you carry out through our site and/or mobile app;details of your visits to our site including, but not limited to, traffic data, location data, weblogs and other communication data and the resources and links that you access; anddetails of your visits to our site including, but not limited to, traffic data, location data, weblogs and other communication data and the resources and links that you access; andrecords in connection with service provision to users including records of correspondence and records of use of the site and other related services, account details, records of agreements and other transactions, notes from interactions and information used for monitoring and regulatory compliance", +1,"records in connection with service provision to users including records of correspondence and records of use of the site and other related services, account details, records of agreements and other transactions, notes from interactions and information used for monitoring and regulatory compliance", +0,"how we collect personal datawe collect, process and retain personal information from you when you enter into a user agreement or 
register for an account with us, and each time you navigate to the site or mobile app and interact or transact with us", +0,"to capture user and usage data, individual preferences and browsing behaviour, we use cookies, web beacons and similar technologies to collect information about your visit to the site or mobile app, such as the pages you view, content you read, links you click and other actions you take on the site or mobile app", +0,you may set up your web browser to block or disable cookies which will in turn disable the monitoring of your visit, +0, you may also remove cookies stored from your computer or mobile device, +0," however, if you do block or disable cookies, you may not be able to use or receive certain personalised features and functions of the site or mobile app", +0,please also note that the site may contain links to other websites which are not operated or maintained by smartkarma, +0," when visiting these third party websites, you should read their privacy and data protection policies which will solely apply to your use of those third party websites", +0,"how we use personal datasmartkarma will always process your personal data for a specific purpose, and process only the personal data that is relevant for achieving that purpose", +0," depending on our relationship with you and the products and services that we provide to you, we may process your personal data for the following business purposes and associated compatible purposesaccount opening and on-boarding;understanding client needs and offering products and services;providing products and services;managing our relationships with users and connected parties;communicating with users;carrying out operational and administrative functions;helping us to improve our products and services;managing and conducting our business;monitoring and assessing risk, conducting audits and enforcing our rights;marketing our products and services to users; andmeeting our regulatory and compliance 
obligations", +0,how we use personal datahow we use personal datahow we use personal data, +0,"the personal information that you are asked to provide, and the reasons why you are asked to provide it, will be made clear to you at the point we ask you to provide your personal information", +1,"if you contact us directly, we may receive additional information about you such as your name, email address, phone number, the contents of the message and/or attachments you may send us, and any other information you may choose to provide", +1,"when you register for an account, we may ask for your contact information, including items such as name, company name, address, email address, and telephone number", +0,how we use your informationhow we use your information, +0,"how we use your informationhow we use your informationtypes of data collectedpersonal datapersonal datawhile using our service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you", +0," personally identifiable information may include, but is not limited towhile using our service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you", +1," personally identifiable information may include, but is not limited toemail addressusage dataemail addressemail addressemail addressusage datausage datausage datausage datausage datausage data is collected automatically when using the service", +0,usage data is collected automatically when using the service, +1," ip address, browser type, browser version, the pages of our service that you visit, the time and date of your visit, the time spent on those pages, unique device identifiers and other diagnostic data", +1,"when you access the service by or through a mobile device, we may collect certain information automatically, including, but not limited to, the type of mobile device you use, your mobile device unique id, the ip address of your 
mobile device, your mobile operating system, the type of mobile internet browser you use, unique device identifiers and other diagnostic data", +0,we may also collect information that your browser sends whenever you visit our service or when you access the service by or through a mobile device, +0,tracking technologies and cookiestracking technologies and cookieswe use cookies and similar tracking technologies to track the activity on our service and store certain information, +0,we use cookies and similar tracking technologies to track the activity on our service and store certain information, +0," tracking technologies used are beacons, tags, and scripts to collect and track information and to improve and analyze our service", +0,you can instruct your browser to refuse all cookies or to indicate when a cookie is being sent, +0," however, if you do not accept cookies, you may not be able to use some parts of our service", +0,"cookies can be ""persistent"" or ""session"" cookies", +0," persistent cookies remain on your personal computer or mobile device when you go offline, while session cookies are deleted as soon as you close your web browser", +0, learn more about cookies all about cookiesall about cookies, +0,we use both session and persistent cookies for the purposes set out belowwe use both session and persistent cookies for the purposes set out belownecessary / essential cookiestype session cookiesadministered by uspurpose these cookies are essential to provide you with services available through the website and to enable you to use some of its features, +0," without these cookies, the services that you have asked for cannot be provided, and we only use these cookies to provide you with those services", +0,purpose these cookies are essential to provide you with services available through the website and to enable you to use some of its features, +0, they help to authenticate users and prevent fraudulent use of user accounts, +0,cookies policy / notice 
acceptance cookiestype persistent cookiesadministered by uspurpose these cookies identify if users have accepted the use of cookies on the website, +0,"functionality cookiesfunctionality cookiesfunctionality cookiestype persistent cookiestype persistent cookiesadministered by usadministered by uspurpose these cookies allow us to remember choices you make when you use the website, such as remembering your login details or language preference", +0,"purpose these cookies allow us to remember choices you make when you use the website, such as remembering your login details or language preference", +0, the purpose of these cookies is to provide you with a more personal experience and to avoid you having to re-enter your preferences every time you use the website, +0,"for more information about the cookies we use and your choices regarding cookies, please visit our cookies policy or the cookies section of our privacy policy", +0,"when ordering or registering on our site, as appropriate, you may be asked to enter your name or other details to help you with your experience", +0,when do we collect information?when do we collect information?we collect information from you when you or enter information on our site, +0,we collect information from you when you or enter information on our site, +0, how do we use your information? how do we use your information? 
, +0,"when ordering or registering on our site, as appropriate, you may be asked to enter your name or other details to help you with your experience", +0,provide us with feedback on our products or services , +0,we collect several different types of information for various purposes to provide and improve our service to you, +0,"types of data collectedtypes of data collectedpersonal datapersonal datawhile using our service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ""personal data""", +0," personally identifiable information may include, but is not limited tocookies and usage datacookies and usage datacookies and usage datausage datausage datawe may also collect information how the service is accessed and used ""usage data""", +0, this usage data may include information such as your computer's internet protocol address e, +1," ip address, browser type, browser version, the pages of our service that you visit, the time and date of your visit, the time spent on those pages, unique device identifiers and other diagnostic data", +0,"we may also collect information how the service is accessed and used ""usage data""", +0,we use cookies and similar tracking technologies to track the activity on our service and hold certain information, +0,cookies are files with small amount of data which may include an anonymous unique identifier, +0, cookies are sent to your browser from a website and stored on your device, +0," tracking technologies also used are beacons, tags, and scripts to collect and track information and to improve and analyze our service", +0, short may collect limited data public database marketing partner outside source , +0, short short shortin shortin short may collect limited data public database marketing partner outside source , +0, may obtain information source public database joint marketing partner well third party , +0, example information receive source include social 
medium profile information marketing lead search result link include pay listing sponsored link , +0, example information receive source include social medium profile informationwe may obtain information source public database joint marketing partner well third party , +0, market lead search result link include pay listing sponsored link , +0, use information use information use information use information , +0, use information short knowingly collect data market child year age , +0, short knowingly collect data market child year age , +0, short short shortin shortin short knowingly collect data market child year age , +0, knowingly collect data market child year age, +0,we knowingly collect data market child year age , +0, knowingly solicit data market child year age , +0, learn personal information user less year age collect deactivate account take reasonable measure promptly delete data record , +0, become aware data collect child age please contact u mymail, +0, privacy right information collect source information collect source information collect sourcesinformation collect sourcesinformation collect source short may collect limited data public database marketing partner outside source , +0, short may collect limited data public database marketing partner outside source , +0, short short shortin shortin short may collect limited data public database marketing partner outside source , +0, may obtain information source public database joint marketing partner well third party , +0, example information receive source include social medium profile information marketing lead search result link include pay listing sponsored link , +0, may obtain information source public database joint marketing partner well third party , +0, example information receive source include social medium profile informationwe may obtain information source public database joint marketing partner well third party , +0, example information receive source include social medium profile 
information marketing lead search result link include pay listing sponsored link , +0, market lead search result link include pay listing sponsored link , +0, use information use information use information use information , +0, use information short knowingly collect data market child year age , +0, short knowingly collect data market child year age , +1,visit website server automatically log computer _ internet protocol ip address browser type version page visit time date visit time spend page standard data provide web browser , +0,ai generator ask website address company name location service bot creation whether agree disclaimer , +0, information mandatory use generate vobo, +0, choose paid service ask email address payment detail , +0, information mandatory use generate vobo, +1,nonpublic personal information collect may include limit name address telephone number email address social security number date birth transaction experience history tiaa company , +0, may use information connection certain aspect business , +0, example may use information complete requested transaction otherwise manage relationship tiaa company , +0, may obtain information directly e, +0, application form complete information choose disclose website tiaa, +0,org information give tiaa company consultation ii employer respect employer-sponsored plan iii third party , +1, addition may also collect information creditworthiness consumer reporting agency may include marital status employment history income asset credit score credit history open line credit household , +1, apply life insurance policy may also collect health information , +0, disclose health information company include tiaa company person unless authorize require permit law regulation, +0,we use personal information primarily provide product service request employer direct u deliver behalf share use personal information relate transaction balance payment history similar experience among tiaa company conduct business , +0, 
participant employer-sponsored retirement saving plan invest tiaa third-party annuity contract tiaa third-party mutual fund may share information collect employer agent plan administration purpose , +0, additionally unless instruct otherwise plan sponsor current former retirement plan may also use personal information market determine possible interest product service tiaa company offer , +0, may use information connection certain aspect business , +0, example may use information complete requested transaction otherwise manage relationship tiaa company , +0, may obtain information directly e, +0,use digital channel tiaa bank collect information several waysif use online banking service collect use information described tiaa fsb consumer privacy notice, +0,tiaa bank collect use ip address help diagnose problem server administer website , +0, ip address like telephone number computer internet , +0, one locate phone number telephone book computer locate ip address use domain name service server , +0, tiaa bank use ip address information transmit computer help identify computer browser move throughout digital channel gather general information internet service provider locate type service , +0, many case allow tiaa bank present convenient customized experience you, +0,our digital channel service provider may time use cooky similar technology collect information computer email client browser access digital channel keep track move digital channel third party website business relationship service provider , +0, cooky small text file text identifier place computer tiaa bank many website , +0, tiaa bank service provider use information number reason include deliver content specific interest provide access secure banking function assist website traffic report report unique visitor count provide client support respond inquiry include use live chat video session technology measure report level engagement tiaa bank advertisement across digital channel website provide 
advertisement tailor interest also know _ target advertising _ , +0, case information collect cooky combine personally identifiable information order provide target advertising , +1, may also use cooky collect personally identify information authenticate and/or remember user name upon login , +0, choose may set web browser warn time accept cookie allow choose whether want accept cookie , +0, may also choose obtain software operate conjunction browser control cookie acceptance , +0, portion tiaa bank website use cooky provide secure environment certain financial transaction , +0, feature may work properly restrict use cooky website, +1,tiaa bank also collect information provide u use digital channel feature include live chat video session provide client support respond inquiries, +1, zip code age income level , +0, tiaa bank use contact demographic data send information tiaa bank promotional material certain partner , +0, request send information tiaa bank promotional material see _ refuse communication _ subsection , +0, tiaa bank may also send online survey solicit feedback experience tiaa bank , +0, information use way continually improve product service provide, +0,demographic profile data also collect tiaa bank _ digital channel , +0, tiaa bank use data tailor visitor digital experience provide content tiaa bank believe may interest display content base preference , +1,nonpublic personal information collect may include limit name address telephone number email address social security number date birth transaction experience history tiaa company , +0,we collect information enter information site , +1,order register site appropriate may ask enter name email address detail help experience, +1, user ask information name email address , +0, user may however use alexa skill anonymously , +0, collect personal identification information user voluntarily submit information u , +0, user always refuse supply personally identification information except may prevent engage 
certain alexa skill related activity , +0, collect non-personal identification information user whenever interact alexa skill , +1, non-personal identification information may include browser name type computer technical information user mean connection alexa skill operating system internet service provider utilized similar information , +0, diabetes info alexa skill collect use user personal information following purpose run operate alexa skill , +0, need information function alexa skill , +0, personalize user experience , +0, use information aggregate understand user group use service resource provide alexa skill , +0, use email address send user information update pertain order , +0, use respond inquiry question and/or request , +0,collect personal identification information user way include limited user visit alexa skill register app connection activity service feature resource make available alexa skill , +1, user ask information name email address , +0, user may however use alexa skill anonymously , +0, collect personal identification information user voluntarily submit information u , +0, user always refuse supply personally identification information except may prevent engage certain alexa skill related activity , +0, collect non-personal identification information user whenever interact alexa skill , +0, non-personal identification information may include browser name type computer technical information user mean connection alexa skill operating system internet service provider utilized similar information , +0, diabetes info alexa skill collect use user personal information following purpose run operate alexa skill , +0, need information function alexa skill , +0, personalize user experience , +0, use information aggregate understand user group use service resource provide alexa skill , +0, use email address send user information update pertain order , +0, use respond inquiry question and/or request , +0,collectspokenlayer service provider collect 
personal data offer service request legitimate interest believe interest customer manage relationship customer partner perform activity base consent , +0, spokenlayer service provider may collect following information you3, +0, personal data '' personal data '' purpose privacy policy information identify individual relate identifiable person, +0, personal data submitwe collect personal data submit u register sign spokenlayer platform service create profile participate sweepstakes contest game take survey visit spokenlayer service , +1, personal data submit may include name postal address include billing shipping address credit debit card number telephone number mobile number email address gender username profile picture, +0, personal data receive third partiesto extent permit applicable law may receive personal data third party social medium service commercially available source content partner business partner applicable third party provider access spokenlayer service , +1, personal data receive third party may include user id third party service location ip address, +0, information '' information '' information intend reveal specific identity u browser information usage data information collect cooky technology demographic information geolocation data obtain consent require applicable law aggregate information, +0, information submitwe collect information submit u , +1, may include password birthday education occupation financial information income investment portfolio transaction value information interests, +0, information receive third partieswe may receive information third party include example demographic data social medium account number information interest information activity websites, +0, information collect automaticallyour service use cooky track technology function effectively deliver certain feature , +0, information use cooky track technology please refer cookie policy, +0,the spokenlayer service may also link site apps operate third party may 
carry advertisement offer content special offer functionality game application develop maintain third party use iframes tool plug-in `` link service '' , +0, third party link service may use automated mean collect information use features, +0,spokenlayer responsible privacy practice third party leave spokenlayer service click advertisement sign special offer check applicable third party privacy policy, +0,4 child age 13spokenlayer knowingly collect personal identifiable information child age thirteen without verifiable parental consent , +0, determine information inadvertently collect anyone age thirteen shall immediately take necessary step ensure information delete system database , +0, anyone age thirteen must seek obtain parent guardian permission use website , +0,collect personal identification information user way include limited user visit alexa skill register app connection activity service feature resource make available alexa skill , +0,site collect personal information variety way interact site include register create account site place order site use site _ product service enter promotion sweepstakes subscribe newsletter desire add mailing list product service correspond otherwise interact u provide feedback online survey , +1, information collect may include name e-mail address phone number address product preference information bill information demographic information provide language gender age applicable content preference personalization information personal interest information , +0, occasion may ask additional information enable u provide access use certain information material service , +1, kind information collect site automatically receives record information server log browser include ip address information page visit site information cooky , +0,in e-mail text electronic message website, +0,through mobile desktop application download website provide dedicated non-browser-based interaction website, +0,when interact advertising application 
third-party website service application advertise include link policy , +0, apply information collect u offline mean include website operate company third party include affiliate subsidiary third party include affiliate subsidiary include application content include advertising may link accessible website , +0, please read policy carefully understand policy practice regard information treat , +0, agree policy practice choice use website , +0, access use website agree privacy policy , +0, policy may change time time , +0, continued use website make change deem acceptance change please check policy periodically update , +1, collect several type information user website include information may personally identify name postal address e-mail address telephone number identifier may contact online offline _ personal information _ individually identify andabout internet connection equipment use access website usage detail , +0, collect information directly provide us, +0,automatically navigate site , +1, information collect automatically may include usage detail ip address information collect cooky web beacon track technology andfrom third party example business partner , +1, navigate interact website may use automatic data collection technology collect certain information equipment browse action pattern include detail visit website include traffic data location data log communication data resource access use website, +1,information computer internet connection include ip address operate system browser type , +0, also may use technology collect information online activity time across third-party website online service behavioral track , +0, information collect automatically statistical data may include personal information may maintain associate personal information collect way receive third party , +0, help u improve website deliver good personalized service include enable u estimate audience size usage patterns, +0,store information preference allow u customize website 
accord individual interests, +0,speed searches, +0,recognize return website , +0, technology use automatic data collection may include may collect personal identification information user variety way include limited user visit site subscribe newsletter fill form connection activity service feature resource make available site , +1, user may ask appropriate name email address phone number , +0, user may however visit site anonymously , +0, collect personal identification information user voluntarily submit information u , +0, user always refuse supply personally identification information except may prevent engage certain site relate activities, +0,we may collect non-personal identification information user whenever interact site , +0, non-personal identification information may include browser name type computer technical information user mean connection site operating system internet service provider utilized similar information, +0,intelligence node may collect use user personal information following purpose improve customer service information provide help u respond customer service request support need efficiently , +0, personalize user experience may use information aggregate understand user group use service resource provide site , +0, send periodic email may use email address respond inquiry question and/or request , +0, user decides opt-in mail list receive email may include company news update related product service information etc , +0, time user would like unsubscribe receive future email include detailed unsubscribe instruction bottom email user may contact u via site, +1,we collect several type information user vt property include information may personally identify name postal address e-mail address telephone number _ personal data _ and/or internet connection equipment use access vt property information interact vt property page views, +0,we collect information directly provide u fill form vt property send correspondence u complete transaction vt 
property , +0, automatically navigate website , +1, information collect automatically may include usage detail traffic data location data log communication data data regard resource access use vt property information computer internet connection include ip address operate system browser type, +0,your opt-in require automatic data collection _ purely vts internal analytics necessary use vt property , +0, information collect automatically statistical data include personal information may maintain associate personal information collect way receive third party , +1, site gather information type browser operating system visitor use ip address cookie information time stamp time page access well amount time spend per page clickstream information example page view long , +0, note information gather visitor site , +0, otsi monitor visitor arrive site , +0, otsi automatically receives record information server log browser, +0,certain visitor otsi website choose interact otsi way require otsi gather personally-identifying information , +0, amount type information otsi gather depend nature interaction , +0, otsi collect information insofar necessary appropriate fulfill purpose visitor interaction otsi disclose personally-identifying information described , +0, visitor always refuse supply personally-identifying information caveat may prevent engage certain website-related activities, +0,while use service may ask provide u certain personally identifiable information use contact identify , +0, personally identifiable information may include limit email address name information _ personal information _ , +1,we may also collect information browser send whenever visit service access service mobile device _ log data _ , +1,this log data may include information computer _ internet protocol _ ip _ address browser type browser version page service visit time date visit time spend page statistics, +1,when access service mobile device log data may include information type mobile device 
use mobile device unique id ip address mobile device mobile operating system type mobile internet browser use statistics, +0,when use site platform may collect use personal information behalf customer provide service , +0, _ personal information _ mean information identifies relate describes capable associate could reasonably link directly indirectly particular consumer household california , +0, generally openeyes technology service provider customer collect personal information directly individual , +0, rather customer determine personal information collect individual , +0, also instance collect personal information directly individual , +0, instance describe fully , +0, assessment platform customer utilize assessment platform use enter test taker information allow test taker enter information allow administration assessment , +0, instance openeyes technology act processor information behalf customer , +0, customer may allow test taker employee contractor access platform create account enter access personal information , +1, generally personal information provide organization test taker may include following name postal address email address phone number gender user name password student id number date birth parent contact information consent test taker minor limited instance biometric identifier use verify identity prevent fraud , +0, _ biometric identifier '' record one measurable biological behavioral characteristic use automated recognition individual fingerprint retina iris pattern voiceprint voice pattern bodily gesture facial characteristic type cadence, +0,learning management system customer use learning management system _ lms _ use enter employee individual information allow individual enter information allow access training material content lm , +0, instance openeyes technology act processor information behalf customer provide lm , +0, customer may allow individual designate access system create account enter access personal information , +1, generally 
personal information provide customer individual include following name postal address email address phone number gender user name password date birth , +0, additional information may also provide related employment education , +0, human resource platform customer may enter permit job applicant employee contractor enter personal information job application employment contract information platform , +1, generally personal information collect may include name postal address email address phone number gender user name password date birth ssn/tin academic information reference information include name contact information phone number home address employment history include name previous employer , +0, permissible applicable law require position individual ask consent criminal record search , +0, voice system system allow user take survey assessment provide review product service share feedback voice enable personal digital assistant smart speaker device _ smart device _ , +0, enable use system user need provide access smart device login information name username password u , +1, generally personal information collect would include smart device account information include name email address username , +0, user communication generally collect personal information directly individual contact u via site , +1, instance may collect personal information name postal address email address phone number , +0, customer account payment may also collect personal information create customer account process payment , +1, generally personal information include name business email address company name address tax id number bill information process transaction include credit card bank account detail , +0, cooky use site unless prohibit applicable law may send one cooky computer mobile device , +0, use cooky legitimate business purpose help u understand individual utilize site help u improve quality site include store user preference improve search result track user trend people use site 
, +0, configure browser accept cooky reject cooky notify cookie set , +0, browser different check _ help _ menu browser learn change cookie preference , +0, see cooky section additional information cooky collect information , +0, automatically collected information log information _ access site platform via browser application device server automatically record certain information , +1, server log may include information web request interaction site platform internet protocol address browser type browser language date time request , +1, use site platform use smartphone mobile device may collect information regard device type operate system ip address device id , +0, information generally part standard http request send part online interaction virtually web site , +0, may also collect location/gps information provide affirmative consent u , +0, see cooky section additional information , +0, job applicant apply job u may collect personal information site , +0, generally include name postal address email address date birth ssn academic information employment history information need business purpose process job application , +0, comcast respect privacy software access computer file information mobile device except necessary download installation use otherwise obtain consent , +1, comcast may collect information computer mobile device operating system software version amount available disk space peripheral internet connectivity information relate software installation whether software instal successfully first try , +0, addition may collect use information use software would comcast-branded web service detail web service privacy policy , +0, addition comcast may update software time without notice you, +1,for good experience use service may require provide u certain personally identifiable information include limited name phone number postal address , +0, information collect use contact identify you, +0,we want inform whenever visit service collect information browser 
send u call log data , +1, log data may include information computer _ internet protocol _ ip _ address browser version page service visit time date visit time spend page statistics, +1,as part provide holistic picture financial wellness envestnet may collect following type personal informationidentifiers like postal address email address account name unique personal identifier login credential similar identifier customer record like signature address telephone number passport number social security number driver _ license state identification card number insurance policy number employment bank account number credit card number debit card number financial information medical information health insurance information applicable require utilize insurance- related tool classification characteristic like age marital status sex commercial information record personal property investment asset account activity include transaction balance position history anticipated retirement expense expect value future asset lump sum distribution pension inheritance purchasing consume history tendency example opt share information order view analyze spending habit work budget performance internet similar network activity like online identifier internet protocol address web browser cookie identifier identifier automatically assign computer device access internet information interaction service website application advertisement geolocation data physical location movement device applicable example access service voice-activated digital assistant sensory data customer service request question direct envestnet website via telephone conversation professional employment-related information current past job history income andinferences draw personal information additional profile information reflect preference behavior example person _ risk tolerance relate investment strategies, +0,envestnet may receive category personal information describe representative integration partner government entity 
third-party data provider right provide u information , +0, _ personal information _ include publicly available information government record use purpose information make publicly available publicly available business contact information de-identified aggregated consumer information, +0,we collect several different type information various purpose provide improve service , +0, personal data use service may ask provide u certain personally identifiable information use contact identify `` personal data '' , +1, personally identifiable information may include limited toemail addressfirst name last namephone numberaddress state province zip/postal code citycookies usage datawe may use personal data contact newsletter marketing promotional material information may interest , +0, may opt receive communication u follow unsubscribe link instruction provide email send, +0,we may also collect information service access use `` usage data '' , +0, usage data may include information computer internet protocol address e, +1, ip address browser type browser version page service visit time date visit time spend page unique device identifier diagnostic data , +0, may collect personal identification information user variety way include limited user visit site register site place order fill form connection activity service feature resource make available site , +0, user may ask appropriate name email address , +0, user may however visit site anonymously , +0, collect personal identification information user voluntarily submit information u , +0, user always refuse supply personally identification information except may prevent engage certain site relate activities, +0,we may collect non-personal identification information user whenever interact site , +1, non-personal identification information may include browser name type computer technical information user mean connection site operating system internet service provider utilized similar information, +1,soundadvice sale marketing 
may collect use user personal information following purposeswhen visit site use mobile app automatically collect certain information device include information web browser ip address time zone cooky instal device , +0, sign use mobile app alexa skill collect following personal information email address full name birthday genderadditionally browse site navigate mobile app collect information individual web page product view websites search term refer site mobile app information interact site mobile app , +0, refer automatically-collected information `` device information , +0, `` interact alexa skill collect information response intent interaction skillwe collect device information use follow technology `` log file '' track action occur site collect data include ip address browser type internet service provider referring/exit page date/time stamp , +0, `` web beacon '' `` tag '' `` pixel '' electronic file use record information browse site, +0,we collect several different type information various purpose provide improve service you, +0,types data collectedpersonal datawhile use service may ask provide u certain personally identifiable information use contact identify `` personal data '' , +1, personally identifiable information may include limited toemail address nickname choice password choice cooky usage datawe may use email address contact system message relate app login failure forget password , +0, may also send email newsletter marketing promotional material information may interest , +0, may opt receive communication u follow unsubscribe link instruction provide email send, +0,usage datawe may also collect information browser send whenever visit service access service mobile device `` usage data '' , +0,this usage data may include information computer internet protocol address e, +1, ip address browser type browser version page service visit time date visit time spend page unique device identifier diagnostic data, +0,when access service mobile device usage 
data may include information type mobile device use mobile device unique id ip address mobile device mobile operating system type mobile internet browser use unique device identifier diagnostic data, +0,tracking cooky datawe use cooky similar tracking technology track activity service hold certain information, +0,cookies file small amount data may include anonymous unique identifier , +0, cooky send browser website store device , +0, track technology also use beacon tag script collect track information improve analyze service, +0,you instruct browser refuse cooky indicate cookie send , +0, however accept cooky may able use portion service, +0,examples cooky usesession cooky , +0, use session cooky operate service, +0,preference cooky , +0, use preference cooky remember preference various settings, +0,security cooky , +0, use security cooky security purposes, +0,advertising cooky , +0, advertising cooky use serve advertisement may relevant interests, +0,information displayedupon registration require provide email address , +0, email never show publicly circumstance , +0, also require provide unique nickname publicly visible find appropriate and/or permit , +0, responsibility choose nickname different real name , +0, claim responsibility damage result name expose publicly, +1,we collect follow information child year young name username / email address profile picture , +0, may collect certain non-personal information passively child date time visit time spend content device id type browser use e, +0, chrome firefox internet explorer type operating system use e, +0, windows mac isp child receive internet access , +0, information may combine personal information , +0, may collect personal identification information user variety way include limited user visit site register site subscribe newsletter respond survey fill form connection activity service feature resource make available site , +1, user may ask appropriate name email address mail address phone number , +0, 
user may however visit site anonymously , +0, collect personal identification information user voluntarily submit information u , +0, user always refuse supply personally identification information except may prevent engage certain site relate activities, +0,we may collect non-personal identification information user whenever interact site , +1, non-personal identification information may include browser name type computer technical information user mean connection site operating system internet service provider utilized similar information, +0,santronix may collect use user personal information following purposes- improve customer serviceinformation provide help u respond customer service request support need efficiently, +0, may also use respond inquiry question and/or request , +0, user decides opt-in mail list receive email may include company news update related product service information etc , +0, time user would like unsubscribe receive future email may contact u via site, +0,our website automatically collect personally identifiable information visitor , +1, recognize collect ip address browser type isp referring/exit page platform type date/time stamp number click domain name country/state , +0, information use analyze trend administer site track visitor _ movement analyze broad demographic information, +0,in order offer provide customized personal experience site use cooky store help track information , +0, example use cooky help remind u deliver content service base account information , +0, cooky simply piece information send browser web server store computer _ hard drive , +0, use cooky relatively standard likely find use major web site , +0, browser initially set accept cooky , +0, able use site and/or associate mobile apps set browser accept cookies, +0,in area site ask provide personally identifiable information , +0, personal information provide keep confidential use support interaction company , +0, good experience use skill may require provide u 
certain personally identifiable information include limited user name , +0, information request store database use exclusively user communication skill , +0, skill use third party service may collect information use identify , +0, link privacy policy third party service provider use skill alexa privacy policy want inform whenever use service case error skill collect data information third party product device call log data , +0, log data may include information amazon user access token configuration skill utilize service time date use service statistic , +0, order determine information provide skill need postal codecountrygeneral usage patternswe collect several different type information various purpose provide improve service you, +0,types data collectedpersonal datawhile use service may ask provide u certain personally identifiable information use contact identify `` personal data '' , +0, personally identifiable information may include limited toemail addressfirst name last namephone numbercookies usage datausage datawe may also collect information service access use `` usage data '' , +0, usage data may include information computer internet protocol address e, +1, ip address browser type browser version page service visit time date visit time spend page unique device identifier diagnostic data, +0,tracking cooky datawe use cooky similar tracking technology track activity service hold certain information, +0,cookies file small amount data may include anonymous unique identifier , +0, cooky send browser website store device , +0, track technology also use beacon tag script collect track information improve analyze service, +0,you instruct browser refuse cooky indicate cookie send , +0, however accept cooky may able use portion service, +0,examples cooky usesession cooky , +0, use session cooky operate service, +0,preference cooky , +0, use preference cooky remember preference various settings, +0,security cooky , +0, use security cooky security purposes, +1,for 
good experience use service may require provide u certain personally identifiable information include limited name phone number postal address , +0, information collect use contact identify you, +0,we want inform whenever visit service collect information browser send u call log data , +1, log data may include information computer _ internet protocol `` ip '' address browser version page service visit time date visit time spend page statistics, +1,in addition skill may collect certain information automatically part normal operation within context amazon alexa facebook messenger environment include limited hardware device unique id unique user id prior usage skill information way use skills, +0,these skill collect precise information location hardware device though detail information capture underlying platform relevant term condition agreement referenced, +0,only aggregate anonymized data periodically transmit amazon software environment service help u improve skill service , +0, share information third party way describe privacy statement, +0,we may disclose user provide automatically collect informationas require law comply subpoena similar legal process believe good faith disclosure necessary protect right protect safety safety others investigate fraud respond government request trusted service provider work behalf independent use information disclose agree adhere rule set forth privacy statement, +0,if datascenes development involve merger acquisition sale portion asset notify via email and/or prominent notice web site change ownership us information well choice may regard information, +0,we may work analytics company help u understand skill use frequency duration usage , +0, protect anonymity information use encryption technology help ensure third party _ identify personally, +0,a skill access title first sentence blog article number partner blog site , +0, content automatically update new article post , +1,name and job title, +1,"demographic information such 
as postcode,preferences and interests", \ No newline at end of file diff --git a/SEM/training_data/title.csv b/SEM/training_data/title.csv new file mode 100644 index 0000000000000000000000000000000000000000..ba93eb075b616d7f4828c88c837233994f1e5cae --- /dev/null +++ b/SEM/training_data/title.csv @@ -0,0 +1,591 @@ +label,title,,,,,,,,,,,,,,,,,, +1,what personal information does alexa internet collect?,,,,,,,,,,,,,,,,,, +2,what about cookies and other identifiers?,,,,,,,,,,,,,,,,,, +3,does alexa internet share the information it receives?,,,,,,,,,,,,,,,,,, +4,how secure is information about me?,,,,,,,,,,,,,,,,,, +5,what about advertising?,,,,,,,,,,,,,,,,,, +6,which information can i access?,,,,,,,,,,,,,,,,,, +6,what choices do i have?,,,,,,,,,,,,,,,,,, +7,are children allowed to use alexa internet?,,,,,,,,,,,,,,,,,, +8,california consumer privacy act (__cpa_),,,,,,,,,,,,,,,,,, +9,"conditions of use, notices, and revisions",,,,,,,,,,,,,,,,,, +10,examples of information collected,,,,,,,,,,,,,,,,,, +13,reasons we can share your information:,,,,,,,,,,,,,,,,,, +3,does cutting edge share?,,,,,,,,,,,,,,,,,, +6,can you limit this sharing?,,,,,,,,,,,,,,,,,, +0,privacy policy,,,,,,,,,,,,,,,,,, +0,questions:,,,,,,,,,,,,,,,,,, +11,who is providing this notice?,,,,,,,,,,,,,,,,,, +4,how does cutting edge credit union protect my personal information?,,,,,,,,,,,,,,,,,, +10,how does cutting edge fcu collect my personal information?,,,,,,,,,,,,,,,,,, +6,why can not i limit all sharing?,,,,,,,,,,,,,,,,,, +0,affiliates:,,,,,,,,,,,,,,,,,, +0,nonaffiliates:,,,,,,,,,,,,,,,,,, +0,joint marketing:,,,,,,,,,,,,,,,,,, +1,non-personal information we collect about you,,,,,,,,,,,,,,,,,, +2,a. cookies ,,,,,,,,,,,,,,,,,, +0,b. 
web beacons ,,,,,,,,,,,,,,,,,, +1,volunteering personally identifiable information,,,,,,,,,,,,,,,,,, +4,security of your financial information,,,,,,,,,,,,,,,,,, +4,security of information we collect on your behalf,,,,,,,,,,,,,,,,,, +4,security of your personal information,,,,,,,,,,,,,,,,,, +3,disclosure of your information,,,,,,,,,,,,,,,,,, +4,protection of your personal information,,,,,,,,,,,,,,,,,, +7,privacy of children,,,,,,,,,,,,,,,,,, +9,changes to this privacy policy,,,,,,,,,,,,,,,,,, +0,service-related announcements,,,,,,,,,,,,,,,,,, +0,customer service,,,,,,,,,,,,,,,,,, +11,contact information,,,,,,,,,,,,,,,,,, +0,privacy policy,,,,,,,,,,,,,,,,,, +1,gathering of personally-identifying information,,,,,,,,,,,,,,,,,, +12,data retention policy,,,,,,,,,,,,,,,,,, +1,log data,,,,,,,,,,,,,,,,,, +4,protection of certain personally-identifying information,,,,,,,,,,,,,,,,,, +9,privacy policy changes,,,,,,,,,,,,,,,,,, +0,capital investment advisors privacy notice,,,,,,,,,,,,,,,,,, +1,categories of personal information,,,,,,,,,,,,,,,,,, +2,use of cookies,,,,,,,,,,,,,,,,,, +13,why we use your data,,,,,,,,,,,,,,,,,, +12,how long do we keep your data,,,,,,,,,,,,,,,,,, +13,what do we do with your data,,,,,,,,,,,,,,,,,, +0,bulletin boards and chat rooms,,,,,,,,,,,,,,,,,, +7,children__ privacy,,,,,,,,,,,,,,,,,, +4,security of information,,,,,,,,,,,,,,,,,, +6,what are your rights?,,,,,,,,,,,,,,,,,, +6,you have a right to be informed about the personal information that we are collecting about you. ,,,,,,,,,,,,,,,,,, +6,you have a right to see what specific pieces of personal information we have collected and currently hold about you twice in each twelve-month period,,,,,,,,,,,,,,,,,, +6,your have the right to have your personal information deleted.,,,,,,,,,,,,,,,,,, +6,you have the right to know if your personal information is shared or sold.,,,,,,,,,,,,,,,,,, +6,you have the right to opt out. 
,,,,,,,,,,,,,,,,,, +0,you have the right to non-discrimination.,,,,,,,,,,,,,,,,,, +0,alternative privacy policies,,,,,,,,,,,,,,,,,, +9,revisions,,,,,,,,,,,,,,,,,, +0,privacy policy ,,,,,,,,,,,,,,,,,, +1,collection of your personal information,,,,,,,,,,,,,,,,,, +13,use of your personal information ,,,,,,,,,,,,,,,,,, +2,use of cookies ,,,,,,,,,,,,,,,,,, +4,security of your personal information,,,,,,,,,,,,,,,,,, +7,children under thirteen,,,,,,,,,,,,,,,,,, +0,opt-out & unsubscribe,,,,,,,,,,,,,,,,,, +11,contact information ,,,,,,,,,,,,,,,,,, +0,spokenlayer privacy policy,,,,,,,,,,,,,,,,,, +0,1. introduction,,,,,,,,,,,,,,,,,, +0,privacy policy,,,,,,,,,,,,,,,,,, +0,scope and date of policy,,,,,,,,,,,,,,,,,, +1,what personal data we collect,,,,,,,,,,,,,,,,,, +10,how we collect personal data,,,,,,,,,,,,,,,,,, +13,how we use personal data,,,,,,,,,,,,,,,,,, +3,disclosure and sharing of personal data,,,,,,,,,,,,,,,,,, +3,transfer of personal data,,,,,,,,,,,,,,,,,, +12,retention of personal data,,,,,,,,,,,,,,,,,, +6,data rights,,,,,,,,,,,,,,,,,, +11,how to contact us,,,,,,,,,,,,,,,,,, +0,instapass,,,,,,,,,,,,,,,,,, +0,share smartkarma corporate solutions brochure with your team,,,,,,,,,,,,,,,,,, +0,download smartkarma corporate solutions subscription plans,,,,,,,,,,,,,,,,,, +0,"Phyllis Pieri, CFE",,,,,,,,,,,,,,,,,, +0,Consent,,,,,,,,,,,,,,,,,, +1,Information we collect,,,,,,,,,,,,,,,,,, +13,How we use your information,,,,,,,,,,,,,,,,,, +1,Log Files,,,,,,,,,,,,,,,,,, +5,Advertising Partners Privacy Policies,,,,,,,,,,,,,,,,,, +14,Third Party Privacy Policies,,,,,,,,,,,,,,,,,, +6,CCPA Privacy Rights (Do Not Sell My Personal Information),,,,,,,,,,,,,,,,,, +4,GDPR Data Protection Rights,,,,,,,,,,,,,,,,,, +7,Children's Information,,,,,,,,,,,,,,,,,, +11,CONTACT US,,,,,,,,,,,,,,,,,, +0,Phyllis Pieri,,,,,,,,,,,,,,,,,, +0,25372 Juniper Drive,,,,,,,,,,,,,,,,,, +0,"Mission Viejo, CA 92691",,,,,,,,,,,,,,,,,, +0,425-922-4126,,,,,,,,,,,,,,,,,, +11,FOLLOW US,,,,,,,,,,,,,,,,,, +11,Contact 
US_,,,,,,,,,,,,,,,,,, +0,Interpretation,,,,,,,,,,,,,,,,,, +0,Definitions,,,,,,,,,,,,,,,,,, +1,Types of Data Collected,,,,,,,,,,,,,,,,,, +13,Use of Your Personal Data,,,,,,,,,,,,,,,,,, +12,Retention of Your Personal Data,,,,,,,,,,,,,,,,,, +3,Transfer of Your Personal Data,,,,,,,,,,,,,,,,,, +3,Disclosure of Your Personal Data,,,,,,,,,,,,,,,,,, +4,Security of Your Personal Data,,,,,,,,,,,,,,,,,, +0,Privacy Policy,,,,,,,,,,,,,,,,,, +0,Interpretation and Definitions,,,,,,,,,,,,,,,,,, +13,Collecting and Using Your Personal Data,,,,,,,,,,,,,,,,,, +8,Your California Privacy Rights (California's Shine the Light law),,,,,,,,,,,,,,,,,, +8,California Privacy Rights for Minor Users (California Business and Professions Code Section 22581),,,,,,,,,,,,,,,,,, +14,Links to Other Websites,,,,,,,,,,,,,,,,,, +10,Changes to this Privacy Policy,,,,,,,,,,,,,,,,,, +11,Contact Us,,,,,,,,,,,,,,,,,, +1,"What personal information do we collect from the people that visit our blog, website or app?",,,,,,,,,,,,,,,,,, +10,When do we collect information?,,,,,,,,,,,,,,,,,, +13,How do we use your information? ,,,,,,,,,,,,,,,,,, +4,How do we protect your information?,,,,,,,,,,,,,,,,,, +2,Do we use 'cookies'?,,,,,,,,,,,,,,,,,, +2,We use cookies to:,,,,,,,,,,,,,,,,,, +14,Third-party disclosure,,,,,,,,,,,,,,,,,, +14,Third-party links,,,,,,,,,,,,,,,,,, +0,Google,,,,,,,,,,,,,,,,,, +7,COPPA (Children Online Privacy Protection Act),,,,,,,,,,,,,,,,,, +0,CAN SPAM Act,,,,,,,,,,,,,,,,,, +0,"To be in accordance with CANSPAM, we agree to the following:",,,,,,,,,,,,,,,,,, +0,ALL,,,,,,,,,,,,,,,,,, +11,Contacting Us,,,,,,,,,,,,,,,,,, +1,"What personal information do we collect from the people that visit our blog, website or app?",,,,,,,,,,,,,,,,,, +10,When do we collect information?,,,,,,,,,,,,,,,,,, +13,How do we use your information? 
,,,,,,,,,,,,,,,,,, +4,How do we protect your information?,,,,,,,,,,,,,,,,,, +2,Do we use 'cookies'?,,,,,,,,,,,,,,,,,, +14,Third-party disclosure,,,,,,,,,,,,,,,,,, +14,Third-party links,,,,,,,,,,,,,,,,,, +1,Information Collection And Use,,,,,,,,,,,,,,,,,, +13,Use of Data,,,,,,,,,,,,,,,,,, +3,Transfer Of Data,,,,,,,,,,,,,,,,,, +3,Disclosure Of Data,,,,,,,,,,,,,,,,,, +4,Security Of Data,,,,,,,,,,,,,,,,,, +11,Service Providers,,,,,,,,,,,,,,,,,, +14,Links To Other Sites,,,,,,,,,,,,,,,,,, +7,Children's Privacy,,,,,,,,,,,,,,,,,, +9,Changes To This Privacy Policy,,,,,,,,,,,,,,,,,, +11,Contact Us,,,,,,,,,,,,,,,,,, +0,TABLE OF CONTENTS,,,,,,,,,,,,,,,,,, +1,1. WHAT INFORMATION DO WE COLLECT?,,,,,,,,,,,,,,,,,, +13,2. HOW DO WE USE YOUR INFORMATION?,,,,,,,,,,,,,,,,,, +3,3. WILL YOUR INFORMATION BE SHARED WITH ANYONE?,,,,,,,,,,,,,,,,,, +12,4. HOW LONG DO WE KEEP YOUR INFORMATION?,,,,,,,,,,,,,,,,,, +4,5. HOW DO WE KEEP YOUR INFORMATION SAFE?,,,,,,,,,,,,,,,,,, +0,6. DO WE COLLECT INFORMATION FROM MINORS?,,,,,,,,,,,,,,,,,, +6,7. WHAT ARE YOUR PRIVACY RIGHTS?,,,,,,,,,,,,,,,,,, +0,8. CONTROLS FOR DO-NOT-TRACK FEATURES,,,,,,,,,,,,,,,,,, +8,9. DO CALIFORNIA RESIDENTS HAVE SPECIFIC PRIVACY RIGHTS?,,,,,,,,,,,,,,,,,, +9,10. DO WE MAKE UPDATES TO THIS POLICY?,,,,,,,,,,,,,,,,,, +11,11. HOW CAN YOU CONTACT US ABOUT THIS POLICY?,,,,,,,,,,,,,,,,,, +1,1. Information we collect,,,,,,,,,,,,,,,,,, +0,2. Our legal bases for processing,,,,,,,,,,,,,,,,,, +13,3. Collecting and using information,,,,,,,,,,,,,,,,,, +14,4. Disclosure of personal information to third parties,,,,,,,,,,,,,,,,,, +3,5. International transfers of personal information,,,,,,,,,,,,,,,,,, +6,6. 
Your rights as our user,,,,,,,,,,,,,,,,,, +2,Cookies,,,,,,,,,,,,,,,,,, +0,Business transfers,,,,,,,,,,,,,,,,,, +0,About this policy,,,,,,,,,,,,,,,,,, +1,Information we may collect,,,,,,,,,,,,,,,,,, +2,Cookies,,,,,,,,,,,,,,,,,, +13,How your information is used,,,,,,,,,,,,,,,,,, +3,Disclosure of your information,,,,,,,,,,,,,,,,,, +6,Your right to opt out,,,,,,,,,,,,,,,,,, +7,Children's privacy online,,,,,,,,,,,,,,,,,, +9,Changes in our privacy notice,,,,,,,,,,,,,,,,,, +6,How to change or correct your personal information,,,,,,,,,,,,,,,,,, +0,Former customers,,,,,,,,,,,,,,,,,, +1,Digital information collection,,,,,,,,,,,,,,,,,, +4,Information security,,,,,,,,,,,,,,,,,, +3,Sharing information,,,,,,,,,,,,,,,,,, +0,Public forums,,,,,,,,,,,,,,,,,, +0,Encrypted email,,,,,,,,,,,,,,,,,, +0,Fraud and other internet risks,,,,,,,,,,,,,,,,,, +10,Information collected in contests or surveys,,,,,,,,,,,,,,,,,, +14,Links to other websites,,,,,,,,,,,,,,,,,, +6,How to refuse communications,,,,,,,,,,,,,,,,,, +7,Protecting children_ online privacy,,,,,,,,,,,,,,,,,, +9,Changes,,,,,,,,,,,,,,,,,, +1,Log Data,,,,,,,,,,,,,,,,,, +2,Cookies,,,,,,,,,,,,,,,,,, +11,Service Providers,,,,,,,,,,,,,,,,,, +4,Security,,,,,,,,,,,,,,,,,, +14,Links to Other Sites,,,,,,,,,,,,,,,,,, +7,Children__ Privacy,,,,,,,,,,,,,,,,,, +9,Changes to This Privacy Policy,,,,,,,,,,,,,,,,,, +11,Contact Us,,,,,,,,,,,,,,,,,, +15,1. the information we collect and use,,,,,,,,,,,,,,,,,, +1,1.1 location-based information,,,,,,,,,,,,,,,,,, +1,1.2 account information,,,,,,,,,,,,,,,,,, +1,1.3 usage information,,,,,,,,,,,,,,,,,, +1,1.4 device utilization information,,,,,,,,,,,,,,,,,, +1,1.5 survey information,,,,,,,,,,,,,,,,,, +10,2. 
how we use consumer information that we collect,,,,,,,,,,,,,,,,,, +0,"2.6 Contests, Sweepstakes and Other Promotions",,,,,,,,,,,,,,,,,, +14,3.1 Contractors and Third-Party Service Providers,,,,,,,,,,,,,,,,,, +11,3.2 Utility and Device Providers,,,,,,,,,,,,,,,,,, +0,3.4 Affiliated Companies,,,,,,,,,,,,,,,,,, +14,3.5 Third-Party Websites,,,,,,,,,,,,,,,,,, +6,4.2 Changing your Privacy Settings: How to Opt-In or Opt-Out,,,,,,,,,,,,,,,,,, +6,4.4 Review and Update of Account Information,,,,,,,,,,,,,,,,,, +7,5. INFORMATION FROM CHILDREN,,,,,,,,,,,,,,,,,, +12,7. SECURITY OF CONSUMER INFORMATION AND DATA RETENTION,,,,,,,,,,,,,,,,,, +0,8. GENERAL,,,,,,,,,,,,,,,,,, +9,8.1 Amendments to this Privacy Policy,,,,,,,,,,,,,,,,,, +11,8.2 Contact Us,,,,,,,,,,,,,,,,,, +8,9. California Consumer Privacy Act (CCPA) Rights,,,,,,,,,,,,,,,,,, +1,"What personal information do we collect from the people that visit our blog, website or app?",,,,,,,,,,,,,,,,,, +10,When do we collect information?,,,,,,,,,,,,,,,,,, +13,How do we use your information?,,,,,,,,,,,,,,,,,, +4,How do we protect your information?,,,,,,,,,,,,,,,,,, +3,Third-party disclosure,,,,,,,,,,,,,,,,,, +14,Third-party links,,,,,,,,,,,,,,,,,, +8,California Online Privacy Protection Act,,,,,,,,,,,,,,,,,, +14,Does our site allow third-party behavioral tracking?,,,,,,,,,,,,,,,,,, +7,COPPA (Children Online Privacy Protection Act),,,,,,,,,,,,,,,,,, +7,We do not specifically market to children under the age of 13 years old.,,,,,,,,,,,,,,,,,, +0,"Great!""0""",SpokenLayer Privacy Policy,,,,,,,,,,,,,,,,, +0,1. Introduction,,,,,,,,,,,,,,,,,, +0,2. Definitions,,,,,,,,,,,,,,,,,, +1,3. Information Collected,,,,,,,,,,,,,,,,,, +13,4. How we Use Your Personal Date that We Collect,,,,,,,,,,,,,,,,,, +0,"5. Accessing, Editing, and Removing Your Information",,,,,,,,,,,,,,,,,, +2,6. Cookies,,,,,,,,,,,,,,,,,, +14,7. Third Party Websites,,,,,,,,,,,,,,,,,, +14,8. Third Party Access to Your Information,,,,,,,,,,,,,,,,,, +0,9. 
Release of Your Information for Legal Purposes,,,,,,,,,,,,,,,,,, +0,10. Commercial and Non-Commercial Communications,,,,,,,,,,,,,,,,,, +4,11. Security Measures,,,,,,,,,,,,,,,,,, +8,12. Your California Online Privacy Rights,,,,,,,,,,,,,,,,,, +0,13. International Transfer,,,,,,,,,,,,,,,,,, +0,14. Amendments,,,,,,,,,,,,,,,,,, +8,General Data Protection Regulation (GDPR),,,,,,,,,,,,,,,,,, +1,Log Files,,,,,,,,,,,,,,,,,, +2,Cookies and Web Beacons,,,,,,,,,,,,,,,,,, +0,Privacy Policies,,,,,,,,,,,,,,,,,, +14,Third Party Privacy Policies,,,,,,,,,,,,,,,,,, +7,Children's Information,,,,,,,,,,,,,,,,,, +0,Online Privacy Policy Only,,,,,,,,,,,,,,,,,, +0,Consent,,,,,,,,,,,,,,,,,, +1,"In the course of our business, we may collect Personal Information about you from the following sources:",,,,,,,,,,,,,,,,,, +4,Our Policies Regarding the Protection of the Confidentiality and Security of Your Personal Information,,,,,,,,,,,,,,,,,, +3,Our Policies and Practices Regarding the Sharing of Your Personal Information,,,,,,,,,,,,,,,,,, +6,Right to Access Your Personal Information and Ability to Correct Errors or Request Changes or Deletion,,,,,,,,,,,,,,,,,, +0,Multiple Products or Services,,,,,,,,,,,,,,,,,, +0,PROJECT DISTINCT,,,,,,,,,,,,,,,,,, +13,Why do we collect personal information?,,,,,,,,,,,,,,,,,, +1,What kind of personal information do we collect?,,,,,,,,,,,,,,,,,, +2,What are cookies and how do we use them?,,,,,,,,,,,,,,,,,, +4,How do we protect your personal information?,,,,,,,,,,,,,,,,,, +3,How and when do we disclose the information we collect?,,,,,,,,,,,,,,,,,, +0,Subscribe to the PROJECT DISTINCT DAILY PODCAST,,,,,,,,,,,,,,,,,, +0,Introduction,,,,,,,,,,,,,,,,,, +1,This policy applies to information we collect,,,,,,,,,,,,,,,,,, +1,Information We Collect About You and How We Collect It,,,,,,,,,,,,,,,,,, +1,Information You Provide to Us,,,,,,,,,,,,,,,,,, +1,Information We Collect Through Automatic Data Collection Technologies.,,,,,,,,,,,,,,,,,, +2,Cookies (or browser 
cookies),,,,,,,,,,,,,,,,,, +2,Flash Cookies.,,,,,,,,,,,,,,,,,, +0,Web Beacons.,,,,,,,,,,,,,,,,,, +14,Third-party Use of Cookies and Other Tracking Technologies.,,,,,,,,,,,,,,,,,, +13,How We Use Your Information,,,,,,,,,,,,,,,,,, +3,Disclosure of Your Information,,,,,,,,,,,,,,,,,, +6,Withdrawing Your Consent,,,,,,,,,,,,,,,,,, +0,Supplementing Information,,,,,,,,,,,,,,,,,, +3,We may also disclose your personal information:,,,,,,,,,,,,,,,,,, +13,Choices About How We Use and Disclose Your Information,,,,,,,,,,,,,,,,,, +5,Tracking Technologies and Advertising,,,,,,,,,,,,,,,,,, +14,Disclosure of Your Information for Third-Party Advertising,,,,,,,,,,,,,,,,,, +0,Promotional Offers from the Company.,,,,,,,,,,,,,,,,,, +5,Targeted Advertising,,,,,,,,,,,,,,,,,, +6,Accessing and Correcting Your Information,,,,,,,,,,,,,,,,,, +4,Data Security,,,,,,,,,,,,,,,,,, +9,Changes to Our Privacy Policy,,,,,,,,,,,,,,,,,, +0,Contact Information,,,,,,,,,,,,,,,,,, +12,Removal of Your Information,,,,,,,,,,,,,,,,,, +0,Concerns or Questions Regarding our Compliance,,,,,,,,,,,,,,,,,, +0,Digital Millennium Copyright Act Notice:,,,,,,,,,,,,,,,,,, +0,Compliance with the General Digital Protection Regulation promulgated by the European Union in May 2018,,,,,,,,,,,,,,,,,, +0,Book A Demo,,,,,,,,,,,,,,,,,, +0,Request For Quote,,,,,,,,,,,,,,,,,, +0,Full Service Press Release Form,,,,,,,,,,,,,,,,,, +0,"Don't worry, we got you!",,,,,,,,,,,,,,,,,, +0,SCOPE OF THIS POLICY,,,,,,,,,,,,,,,,,, +0,YOUR CONSENT,,,,,,,,,,,,,,,,,, +0,OUR TERMS OF USE ,,,,,,,,,,,,,,,,,, +6,YOUR CHOICES,,,,,,,,,,,,,,,,,, +9,THIS POLICY MAY CHANGE,,,,,,,,,,,,,,,,,, +1,INFORMATION WE COLLECT,,,,,,,,,,,,,,,,,, +14,INFORMATION ABOUT THIRD-PARTY COOKIES,,,,,,,,,,,,,,,,,, +13,HOW WE USE YOUR INFORMATION,,,,,,,,,,,,,,,,,, +3,HOW WE SHARE YOUR INFORMATION,,,,,,,,,,,,,,,,,, +4,HOW WE PROTECT YOUR INFORMATION,,,,,,,,,,,,,,,,,, +7,A NOTE ABOUT CHILDREN__ PRIVACY,,,,,,,,,,,,,,,,,, +14,LINKS TO OTHER WEBSITES,,,,,,,,,,,,,,,,,, +13,ACCESS TO YOUR 
INFORMATION,,,,,,,,,,,,,,,,,, +12,RETENTION OF PERSONAL INFORMATION,,,,,,,,,,,,,,,,,, +0,GOVERNING LAW,,,,,,,,,,,,,,,,,, +8,QUESTIONS ABOUT THIS POLICY OR AMERICAN PUBLIC MEDIA GROUP__ DATA PRACTICES,,,,,,,,,,,,,,,,,, +0,Privacy Statement,,,,,,,,,,,,,,,,,, +8,Information we automatically gather when you visit American Public Media Web sites ,,,,,,,,,,,,,,,,,, +1,Information you give us,,,,,,,,,,,,,,,,,, +1,Name and contact information,,,,,,,,,,,,,,,,,, +1,Demographic and Personal Information,,,,,,,,,,,,,,,,,, +1,Credit-card information,,,,,,,,,,,,,,,,,, +0,Special note for APM contributors,,,,,,,,,,,,,,,,,, +0,Special Note for Public Insight Network Participants,,,,,,,,,,,,,,,,,, +1,Information provided to PIN,,,,,,,,,,,,,,,,,, +4,How APM stores your data,,,,,,,,,,,,,,,,,, +13,How APM may use your data,,,,,,,,,,,,,,,,,, +13,How other PIN newsrooms may use your data,,,,,,,,,,,,,,,,,, +8,Your California Privacy Rights,,,,,,,,,,,,,,,,,, +8,Contact American Public Media,,,,,,,,,,,,,,,,,, +0,Consent ,,,,,,,,,,,,,,,,,, +0,Nielsen//NetRatings,,,,,,,,,,,,,,,,,, +0,Effective Date, ,Personal identification information ,,,,,,,,,,,,,,,, +1,Non-personal identification information ,,,,,,,,,,,,,,,,,, +2,Web browser cookies ,,,,,,,,,,,,,,,,,, +10,How we use collected information ,,,,,,,,,,,,,,,,,, +4,How we protect your information,,,,,,,,,,,,,,,,,, +3,Sharing your personal information,,,,,,,,,,,,,,,,,, +14,Third party websites,,,,,,,,,,,,,,,,,, +9,Changes to this privacy policy ,,,,,,,,,,,,,,,,,, +0,Your acceptance of these terms ,,,,,,,,,,,,,,,,,, +11,Contact Us ,,,,,,,,,,,,,,,,,, +1,Personal identification information,,0,VTS,,,,,,,,,,,,,,, +0,Sales & Support,,,,,,,,,,,,,,,,,, +1,Information we collect about you and how we collect it,,,,,,,,,,,,,,,,,, +13,How we use your information,,,,,,,,,,,,,,,,,, +3,Disclosure of your information,,,,,,,,,,,,,,,,,, +8,Data processing and the GDPR,,,,,,,,,,,,,,,,,, +8,Data controlling and the GDPR,,,,,,,,,,,,,,,,,, +6,"Accessing, correcting 
your information and exercising privacy law data rights",,,,,,,,,,,,,,,,,, +14,Third-Party data controllers,,,,,,,,,,,,,,,,,, +8,Privacy Shield,,,,,,,,,,,,,,,,,, +4,Data security,,,,,,,,,,,,,,,,,, +0,Questions or complaints,,,,,,,,,,,,,,,,,, +9,Changes to our privacy policy,,,0,Privacy Policies Statement,,,,,,,,,,,,,, +1,Information Gathered,,,,,,,,,,,,,,,,,, +1,Gathering of Personally-Identifying Information,,,,,,,,,,,,,,,,,, +4,Protection of Certain Personally-Identifying Information,,,,,,,,,,,,,,,,,, +2,Use of Cookies,,,,,,,,,,,,,,,,,, +14,Links to Other Web Sites,,,,,,,,,,,,,,,,,, +9,Changes to this Privacy Statement,,,,,,,,,,,,,,,,,, +11,Contacting our Web Site,,,,0,Information Collection And Use,,,,,,,,,,,,, +1,Log Data,,,,,,,,,,,,,,,,,, +2,Cookies,,,,,,,,,,,,,,,,,, +2,DoubleClick Cookie,,,,,,,,,,,,,,,,,, +4,Security,,,,,,,,,,,,,,,,,, +14,Links To Other Sites,,,,,,,,,,,,,,,,,, +7,Children's Privacy,,,,,,,,,,,,,,,,,, +9,Changes To This Privacy Policy,,,,,,,,,,,,,,,,,, +11,Contact Us ,,,,,0,Information Collection And Use,,,,,,,,,,,, +1,information collection and use,,,,,,,,,,,,,,,,,, +1,log data,,,,,,,,,,,,,,,,,, +2,cookies,,,,,,,,,,,,,,,,,, +11,service providers,,,,,,,,,,,,,,,,,, +0,compliance with laws,,,,,,,,,,,,,,,,,, +5,business transaction,,,,,,,,,,,,,,,,,, +4,security,,,,,,,,,,,,,,,,,, +0,international transfer,,,,,,,,,,,,,,,,,, +14,links to other sites,,,,,,,,,,,,,,,,,, +7,children__ privacy,,,,,,,,,,,,,,,,,, +9,changes to this privacy policy,,,,,,,,,,,,,,,,,, +11,contact us,,,,,,0,children__ privacy,,,,,,,,,,, +14,disclosure of personal information to third parties,,,,,,,,,,,,,,,,,, +3,international transfers of personal information,,,,,,,,,,,,,,,,,, +6,your rights and controlling your personal information,,,,,,,,,,,,,,,,,, +2,use of cookies,,,,,,,,,,,,,,,,,, +0,limits of our policy,,,,,,,,,,,,,,,,,, +9,changes to this policy,,,,,,,,,,,,,,,,,, +11,contact us,,,,,,,,,,,,,,,,,, +0,domains,,,,,,,,,,,,,,,,,, +0,hosting,,,,,,,,,,,,,,,,,, 
+4,security,,,,,,,,,,,,,,,,,, +0,account,,,,,,,,,,,,,,,,,, +0,legitimate reasons for processing your personal information,,,,,,,,,,,,,,,,,, +1,collection and use of information,,,,,,,,,,,,,,,,,, +4,security of your personal information,,,,,,,,,,,,,,,,,, +12,how long we keep your personal information,,,,,,,0,akvelon privacy policy,,,,,,,,,, +1,INFORMATION WE COLLECT FROM YOU,,,,,,,,,,,,,,,,,, +2,COOKIES,,,,,,,,,,,,,,,,,, +13,USES MADE OF THE INFORMATION,,,,,,,,,,,,,,,,,, +3,DISCLOSURE OF YOUR INFORMATION,,,,,,,,,,,,,,,,,, +4,WHERE WE STORE YOUR PERSONAL DATA,,,,,,,,,,,,,,,,,, +6,YOUR RIGHTS,,,,,,,,,,,,,,,,,, +0,ACCESS TO INFORMATION,,,,,,,,,,,,,,,,,, +9,CHANGES TO OUR PRIVACY POLICY,,,,,,,,,,,,,,,,,, +11,CONTACT,,,,,,,,0,Assessment Platform,,,,,,,,, +1,Learning Management System,,,,,,,,,,,,,,,,,, +1,Human Resource Platform,,,,,,,,,,,,,,,,,, +1,Voice Over Systems,,,,,,,,,,,,,,,,,, +0,User Communications,,,,,,,,,,,,,,,,,, +1,Customer Accounts and Payments,,,,,,,,,,,,,,,,,, +2,Cookies,,,,,,,,,,,,,,,,,, +1,Automatically Collected Information; Log information,,,,,,,,,,,,,,,,,, +0,Job Applicants,,,,,,,,,,,,,,,,,, +0,Customers,,,,,,,,,,,,,,,,,, +11,Service Providers,,,,,,,,,,,,,,,,,, +2,Cookies,,,,,,,,,,,,,,,,,, +3,Corporate Transactions,,,,,,,,,,,,,,,,,, +3,Government Requests,,,,,,,,,,,,,,,,,, +4,Fraud Prevention and Security,,,,,,,,,,,,,,,,,, +0,Our Platforms,,,,,,,,,,,,,,,,,, +0,AB 1584:,,,,,,,,,,,,,,,,,, +0,Certification,,,,,,,,,,,,,,,,,, +0,Partners,,,,,,,,,,,,,,,,,, +0,Membership,,,,,,,,,0,1. Grant of a Limited License.,,,,,,,, +6,2. No Ownership Rights.,,,,,,,,,,,,,,,,,, +14,3. Third-Party Software.,,,,,,,,,,,,,,,,,, +14,4. Third-Party Content.,,,,,,,,,,,,,,,,,, +6,5. Restrictions and Requirements. ,,,,,,,,,,,,,,,,,, +0,6. Modification and Order of Precedence.,,,,,,,,,,,,,,,,,, +1,7. Data Collection and Use.,,,,,,,,,,,,,,,,,, +0,8. Open Source.,,,,,,,,,,,,,,,,,, +11,9. Support,,,,,,,,,,,,,,,,,, +0,10. Termination,,,,,,,,,,,,,,,,,, +0,11. 
Additional Terms Incorporated by Reference.,,,,,,,,,,0,Information Collection and Use,,,,,,, +1,Log Data,,,,,,,,,,,,,,,,,, +2,Cookies,,,,,,,,,,,,,,,,,, +11,Service Providers,,,,,,,,,,,,,,,,,, +4,Security,,,,,,,,,,,,,,,,,, +14,Links to Other Sites,,,,,,,,,,,,,,,,,, +9,Changes to This Privacy Policy,,,,,,,,,,,,,,,,,, +11,Contact Us,,,,,,,,,,,0,Authorization of Use,,,,,, +1,Personal Information We Collect,,,,,,,,,,,,,,,,,, +13,How We Use Personal Information,,,,,,,,,,,,,,,,,, +2,"Cookies, Pixel Tags/Web Beacons and Similar Technologies",,,,,,,,,,,,,,,,,, +3,Disclosing Personal Information to Others,,,,,,,,,,,,,,,,,, +12,Retaining Personal Information,,,,,,,,,,,,,,,,,, +4,Protecting Collected Information,,,,,,,,,,,,,,,,,, +9,Notification of Changes,,,,,,,,,,,,,,,,,, +6,Your Rights and Choices,,,,,,,,,,,,,,,,,, +8,International Customers,,,,,,,,,,,,,,,,,, +0,Opting Out of Technologies,,,,,,,,,,,,0,Definitions,,,,, +1,Information Collection and Use,,,,,,,,,,,,,,,,,, +1,Types of Data Collected ,,,,,,,,,,,,,,,,,, +13,Usage Data ,,,,,,,,,,,,,,,,,, +2,Tracking & Cookies Data ,,,,,,,,,,,,,,,,,, +13,Use of Data ,,,,,,,,,,,,,,,,,, +8,Transfer of Data ,,,,,,,,,,,,,,,,,, +3,Disclosure of Data ,,,,,,,,,,,,,,,,,, +4,Security of Data ,,,,,,,,,,,,,,,,,, +11,Service Providers ,,,,,,,,,,,,,,,,,, +14,Links to Other Sites ,,,,,,,,,,,,,,,,,, +7,Children's Privacy,,,,,,,,,,,,,,,,,, +9,Changes to This Privacy Policy ,,,,,,,,,,,,,,,,,, +11,Contact Us,,,,,,,,,,,,,0,children under the age of 13,,,, +15,information we collect and how we collect it,,,,,,,,,,,,,,,,,, +13,how we use your information.,,,,,,,,,,,,,,,,,, +3,disclosure of your information,,,,,,,,,,,,,,,,,, +6,"your choices about our collection, use and disclosure of your information",,,,,,,,,,,,,,,,,, +8,your california privacy rights,,,,,,,,,,,,,,,,,, +4,data security,,,,,,,,,,,,,,,,,, +9,changes to our privacy policy,,,,,,,,,,,,,,,,,, +11,contact information,,,,,,,,,,,,,,,,,, +1,information you provide to us,,,,,,,,,,,,,,,,,, 
+1,automatic information collection and tracking,,,,,,,,,,,,,,,,,, +0,information collection and tracking technologies,,,,,,,,,,,,,,,,,, +1,Personal identification information,,,,,,,,,,,,,,,,,, +1,Non-personal identification information,,,,,,,,,,,,,,,,,, +2,Web browser cookies,,,,,,,,,,,,,,,,,, +13,How we use collected information,,,,,,,,,,,,,,,,,, +0,To run and operate our Site,,,,,,,,,,,,,,,,,, +3,Sharing your personal information,,,,,,,,,,,,,,,,,, +0,Electronic newsletters,,,,,,,,,,,,,,,,,, +14,Third party websites,,,,,,,,,,,,,,,,,, +9,Changes to this privacy policy,,,,,,,,,,,,,,,,,, +6,Your acceptance of these terms,,,,,,,,,,,,,,,,,, +11,Contacting us,,,,,,,,,,,,,,0, Collecting Personal Information ,,, +13,Using Information,,,,,,,,,,,,,,,,,, +4,Protecting Information,,,,,,,,,,,,,,,,,, +2,Web Browser Cookies,,,,,,,,,,,,,,,,,, +3,Sharing Information,,,,,,,,,,,,,,,,,, +14,Third-Party Websites ,,,,,,,,,,,,,,,,,, +6,Your Consent,,,,,,,,,,,,,,,0,PERSONAL INFORMATION WE COLLECT,, +4,HOW DO WE PROTECT YOUR DATA?,,,,,,,,,,,,,,,,,, +13,HOW DO WE USE YOUR PERSONAL INFORMATION?,,,,,,,,,,,,,,,,,, +3,SHARING YOUR PERSONAL INFORMATION,,,,,,,,,,,,,,,,,, +0,DO NOT TRACK,,,,,,,,,,,,,,,,,, +12,DATA RETENTION,,,,,,,,,,,,,,,,,, +9,CHANGES,,,,,,,,,,,,,,,,,, +11,CONTACT US,,,,,,,,,,,,,,,,0,, +13,What we do with the information we gather.,,,,,,,,,,,,,,,,,, +4,Security.,,,,,,,,,,,,,,,,,, +2,How we use cookies,,,,,,,,,,,,,,,,,, +14,Links to other websites,,,,,,,,,,,,,,,,,, +1,Controlling your personal information,,,,,,,,,,,,,,,,0,Privacy Policy, +1,Personal identification information,,,,,,,,,,,,,,,,,, +1,Non-personal identification information,,,,,,,,,,,,,,,,,, +2,Web browser cookies,,,,,,,,,,,,,,,,,, +13,How we use collected information,,,,,,,,,,,,,,,,,, +4,How we protect your information,,,,,,,,,,,,,,,,,, +3,Sharing your personal information,,,,,,,,,,,,,,,,,, +9,Changes to this privacy policy,,,,,,,,,,,,,,,,,, +6,Your acceptance of these terms,,,,,,,,,,,,,,,,,, +11,Contacting 
us,,,,,,,,,,,,,,,,,, +13,how the personal data is used,,,,,,,,,,,,,,,,,, +7,prarent's consent,,,,,,,,,,,,,,,,,0,BAMBOO LEARNING PRIVACY POLICY +0,OUR PRIVACY PRINCIPLES,,,,,,,,,,,,,,,,,, +13,WHY WE COLLECT PERSONAL INFORMATION,,,,,,,,,,,,,,,,,, +15,WHAT INFORMATION WE COLLECT AND WHEN WE COLLECT IT,,,,,,,,,,,,,,,,,, +3,HOW WE MAY SHARE INFORMATION WE COLLECT,,,,,,,,,,,,,,,,,, +6,YOUR COMMUNICATION CHOICES,,,,,,,,,,,,,,,,,, +0,INTEGRITY OF YOUR PERSONAL INFORMATION,,,,,,,,,,,,,,,,,, +7,CHILDREN,,,,,,,,,,,,,,,,,, +4,OUR COMPANYWIDE COMMITMENT TO YOUR PRIVACY,,,,,,,,,,,,,,,,,, +1,"What personal information do we collect from the people that visit our blog, website or app?",,,,,,,,,,,,,,,,,, +0,When do we collect information?,,,,,,,,,,,,,,,,,, +13,How do we use your information? ,,,,,,,,,,,,,,,,,, +4,How do we protect your information?,,,,,,,,,,,,,,,,,, +2,Do we use 'cookies'?,,,,,,,,,,,,,,,,,, +3,Third-party disclosure,,,,,,,,,,,,,,,,,, +3,Third-party links,,,,,,,,,,,,,,,,,, +0,Google,,,,,,,,,,,,,,,,,, +8,California Online Privacy Protection Act,,,,,,,,,,,,,,,,,, +0,How does our site handle Do Not Track signals?,,,,,,,,,,,,,,,,,, +0,Does our site allow third-party behavioral tracking?,,,,,,,,,,,,,,,,,, +7,COPPA (Children Online Privacy Protection Act),,,,,,,,,,,,,,,,,, +0,Fair Information Practices,,,,,,,,,,,,,,,,,, +0,"In order to be in line with Fair Information Practices we will take the following responsive action, should a data breach occur:",,,,,,,,,,,,,,,,,, +13,We collect your email address in order to:,,,,,,,,,,,,,,,,,, +11,Contacting Us,,,,,,,,,,,,,,,,,, +1,What information do we collect?,,,,,,,,,,,,,,,,,, +13,What do we use your information for?,,,,,,,,,,,,,,,,,, +2,Do we use cookies?,,,,,,,,,,,,,,,,,, +3,Do we disclose any information to outside parties?,,,,,,,,,,,,,,,,,, +4,Data protection,,,,,,,,,,,,,,,,,, +7,Parent's consent,,,,,,,,,,,,,,,,,, +11,Contact Us,,,,,,,,,,,,,,,,,, +1,What personal data we collect and why we collect it,,,,,,,,,,,,,,,,,, +3,Who 
we share your data with,,,,,,,,,,,,,,,,,, +12,How long we retain your data,,,,,,,,,,,,,,,,,, +6,What rights you have over your data,,,,,,,,,,,,,,,,,, +3,Where we send your data,,,,,,,,,,,,,,,,,, +0,Additional information,,,,,,,,,,,,,,,,,, +11,Contact information,,,,,,,,,,,,,,,,,, +0,Privacy Policy,,,,,,,,,,,,,,,,,, +15,Information Collection and Use,,,,,,,,,,,,,,,,,, +1,Log Data,,,,,,,,,,,,,,,,,, +4,Security,,,,,,,,,,,,,,,,,, +7,Children__ Privacy,,,,,,,,,,,,,,,,,, +9,Changes to This Privacy Policy,,,,,,,,,,,,,,,,,, +11,Contact Us,,,,,,,,,,,,,,,,,, +0,Essential definitions,,,,,,,,,,,,,,,,,, +15,About the collection AND USE of your Personal Data,,,,,,,,,,,,,,,,,, +12,HOW LONG DO WE SAVE YOUR PERSONAL DATA?,,,,,,,,,,,,,,,,,, +3,DO WE SHARE YOUR PERSONAL DATA?,,,,,,,,,,,,,,,,,, +4,How do WE SECURE YOUR PERSONAL DATA?,,,,,,,,,,,,,,,,,, +3,About international transfers of your Personal Data,,,,,,,,,,,,,,,,,, +6,Your rights as a Data Subject,,,,,,,,,,,,,,,,,, +9,Policy updates,,,,,,,,,,,,,,,,,, +1,Types of personal information we collect,,,,,,,,,,,,,,,,,, +1,Information you provide to us,,,,,,,,,,,,,,,,,, +1,Information we collect automatically,,,,,,,,,,,,,,,,,, +1,Information we collect from other sources,,,,,,,,,,,,,,,,,, +13,Use of information,,,,,,,,,,,,,,,,,, +3,Transfer of information to the U.S. 
and other countries,,,,,,,,,,,,,,,,,, +3,Sharing of information,,,,,,,,,,,,,,,,,, +3,Third party advertising and analytics services,,,,,,,,,,,,,,,,,, +4,Security,,,,,,,,,,,,,,,,,, +6,Your choices,,,,,,,,,,,,,,,,,, +1,Account information and access your information,,,,,,,,,,,,,,,,,, +2,Cookies,,,,,,,,,,,,,,,,,, +1,Location information,,,,,,,,,,,,,,,,,, +4,Security,,,,,,,,,,,,,,,,,, +9,How might this privacy notice be changed?,,,,,,,,,,,,,,,,,, +11,Contact information,,,,,,,,,,,,,,,,,, +15,Information Collection and Use,,,,,,,,,,,,,,,,,, +1,Log Data,,,,,,,,,,,,,,,,,, +2,Cookies,,,,,,,,,,,,,,,,,, +11,Service Providers,,,,,,,,,,,,,,,,,, +4,Security,,,,,,,,,,,,,,,,,, +3,Links to Other Sites,,,,,,,,,,,,,,,,,, +9,Changes to This Privacy Policy,,,,,,,,,,,,,,,,,, +11,Contact Us,,,,,,,,,,,,,,,,,, +13,purposes of processing your personal information,,,,,,,,,,,,,,,,,, +13,purposes of processing your personal information,,,,,,,,,,,,,,,,,, +13,purposes of processing your personal information,,,,,,,,,,,,,,,,,, +13,purposes of processing your personal information,,,,,,,,,,,,,,,,,, \ No newline at end of file diff --git a/SEM/types_pp_processing.py b/SEM/types_pp_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..e4b3a815588bfc1ea239c37b2a5a27e8eddd7acf --- /dev/null +++ b/SEM/types_pp_processing.py @@ -0,0 +1,418 @@ +import csv +import re +import spacy +from bs4 import BeautifulSoup +# import stanza + +from nltk.corpus import stopwords, wordnet +from SEM.text_preprocessing import pre_process,pre_process_type +from SEM.sentence_bayesian import clf_type,tf +from SEM.phrase_similarity import wordnetSim3, wordnetSim_modified + +def check_ngram(string): + words = string.split() + num_words = len(words) + return num_words + + +replacement_patterns = [ +(r'won\'t', 'will not'), +(r'can\'t', 'cannot'), +(r'i\'m', 'i am'), +(r'ain\'t', 'is not'), +(r'(\w+)\'ll', '\g<1> will'), +(r'(\w+)n\'t', '\g<1> not'), +(r'(\w+)\'ve', '\g<1> have'), +(r'(\w+)\'s', '\g<1> 
is'), +(r'(\w+)\'re', '\g<1> are'), +(r'(\w+)\'d', '\g<1> would')] + +class RegexpReplacer(object): + def __init__(self, patterns=replacement_patterns): + self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns] + def replace(self, text): + s = text + for (pattern, repl) in self.patterns: + (s, count) = re.subn(pattern, repl, s) + return s +# 获取单词的词性 +def get_wordnet_pos(tag): + if tag.startswith('J'): + return wordnet.ADJ + elif tag.startswith('V'): + return wordnet.VERB + elif tag.startswith('N'): + return wordnet.NOUN + elif tag.startswith('R'): + return wordnet.ADV + else: + return None + + +def cleanHtml(txt): + + # only split with line + personal_information = [] + with open(txt, encoding='utf-8') as file_obj: + for line in file_obj: + # if len(line.split(' ')) >= 5: + personal_information.append(line) + + text = ''.join(personal_information) + soup = BeautifulSoup(text, 'html.parser') + lower = soup.get_text().lower() + + # use re + # pattern = r'(? 0.8: + simList[information_type.index(type)] = wordnetSim3(chunk,type) + except Exception: + pass + print("error") + nowMax = 0 + for max in simList: + if max > nowMax: + nowMax = max + if nowMax != 0: + word[simList.index(nowMax)] = 1 + return word + +def getSentences(txt): + + information_type = {'Name':['name', 'first name', 'last name', 'full name', 'real name', 'surname', 'family name', 'given name'], + 'Birthday':['birthday', 'date of birth', 'birth date', 'DOB', 'dob full birthday'], + 'Address':['address', 'mailing address', 'physical address', 'postal address', 'billing address', 'shipping address'], + 'Phone':['phone', 'phone number', 'mobile', 'mobile phone', 'mobile number', 'telephone', 'telephone number', 'call'], + 'Email':['email', 'e-mail', 'email address', 'e-mail address'], + 'Contacts':['contacts', 'phone-book', 'phone book'], + 'Location':['location', 'locate', 'place', 'geography', 'geo', 'geo-location', 'precision location'], + 'Camera':['camera', 'photo', 'scan', 'album', 
'picture', 'gallery', 'photo library', 'storage', 'image', 'video'], + 'Microphone':['microphone', 'voice, mic', 'speech', 'talk'], + 'Financial':['credit card', 'company', 'companies', 'organization', 'organizations', 'pay', 'payment'], + 'IP':['IP', 'Internet Protocol', 'IP address', 'internet protocol address'], + 'Cookies':['cookies', 'cookie']} + + sentence_list = cleanHtml(txt) + for sen in sentence_list: + sentence_list[sentence_list.index(sen)] = pre_process_type(sen) + + # print("all sentences:\n") + # for sen in sentence_list: + # print(sen) + # print("\n") + + classified_sen = {'Name':[], + 'Birthday':[], + 'Address':[], + 'Phone':[], + 'Email':[], + 'Contacts':[], + 'Location':[], + 'Camera':[], + 'Microphone':[], + 'Financial':[], + 'IP':[], + 'Cookies':[]} + # simList = [] + # for a in information_type: + # word.append(0) + # for b in information_type: + # simList.append(0) + for sentence in sentence_list: + if clf_type.predict(tf.transform([sentence])) == "1": + # print("yes sentence: "+sentence+"\n") + for type in information_type: + for w in information_type[type]: + if w in sentence: + if w == "geo" or w == "IP" or w == "DOB": + # check whether w is a part of an unrelated word + if sentence[sentence.index(w) - 1] == " " and sentence not in classified_sen[type]: + classified_sen[type].append(sentence) + else: + # check duplication + if sentence not in classified_sen[type]: + classified_sen[type].append(sentence) + + return classified_sen + +def getSentences_no_classifier(txt): + + information_type = {'Name':['name', 'first name', 'last name', 'full name', 'real name', 'surname', 'family name', 'given name'], + 'Birthday':['birthday', 'date of birth', 'birth date', 'DOB', 'dob full birthday'], + 'Address':['address', 'mailing address', 'physical address', 'postal address', 'billing address', 'shipping address'], + 'Phone':['phone', 'phone number', 'mobile', 'mobile phone', 'mobile number', 'telephone', 'telephone number', 'call'], + 
'Email':['email', 'e-mail', 'email address', 'e-mail address'], + 'Contacts':['contacts', 'phone-book', 'phone book'], + 'Location':['location', 'locate', 'place', 'geography', 'geo', 'geo-location', 'precision location'], + 'Camera':['camera', 'photo', 'scan', 'album', 'picture', 'gallery', 'photo library', 'storage', 'image', 'video'], + 'Microphone':['microphone', 'voice, mic', 'speech', 'talk'], + 'Financial':['credit card', 'company', 'companies', 'organization', 'organizations', 'pay', 'payment'], + 'IP':['IP', 'Internet Protocol', 'IP address', 'internet protocol address'], + 'Cookies':['cookies', 'cookie']} + + sentence_list = cleanHtml(txt) + for sen in sentence_list: + sentence_list[sentence_list.index(sen)] = pre_process_type(sen) + + # print("all sentences:\n") + # for sen in sentence_list: + # print(sen) + # print("\n") + + classified_sen = {'Name':[], + 'Birthday':[], + 'Address':[], + 'Phone':[], + 'Email':[], + 'Contacts':[], + 'Location':[], + 'Camera':[], + 'Microphone':[], + 'Financial':[], + 'IP':[], + 'Cookies':[]} + # simList = [] + # for a in information_type: + # word.append(0) + # for b in information_type: + # simList.append(0) + for sentence in sentence_list: + # print("yes sentence: "+sentence+"\n") + for type in information_type: + for w in information_type[type]: + if w in sentence: + if w == "geo" or w == "IP" or w == "DOB": + # check whether w is a part of an unrelated word + if sentence[sentence.index(w) - 1] == " " and sentence not in classified_sen[type]: + classified_sen[type].append(sentence) + else: + # check duplication + if sentence not in classified_sen[type]: + classified_sen[type].append(sentence) + + return classified_sen + +def getSentences_with_classifier(txt): + + information_type = {'Name':['name', 'first name', 'last name', 'full name', 'real name', 'surname', 'family name', 'given name'], + 'Birthday':['birthday', 'date of birth', 'birth date', 'DOB', 'dob full birthday', 'birth year'], + 'Address':['mailing 
address', 'physical address', 'postal address', 'billing address', 'shipping address', 'delivery address', 'residence', 'collect address', 'personal address', 'residential address'], + 'Phone':['phone', 'phone number', 'mobile', 'mobile phone', 'mobile number', 'telephone', 'telephone number', 'call'], + 'Email':['email', 'e-mail', 'email address', 'e-mail address'], + 'Contacts':['contacts', 'phone-book', 'phone book', 'phonebook', 'contact list', 'phone contacts', 'address book'], + 'Location':['location', 'locate', 'geography', 'geo', 'geo-location', 'precision location', 'nearby'], + 'Photos':['camera', 'photo', 'scan', 'album', 'picture', 'gallery', 'photo library', 'storage', 'image', 'video', 'scanner', 'photograph'], + 'Voices':['microphone', 'voice', 'mic', 'speech', 'talk'], + 'Financial info':['credit card', 'pay', 'payment', 'debit card', 'mastercard', 'wallet'], + 'IP':['IP', 'Internet Protocol', 'IP address', 'internet protocol address'], + 'Cookies':['cookies', 'cookie'], + 'Social media':['facebook', 'twitter', 'socialmedia', 'social media'], + 'Profile':['profile', 'account'], + 'Gender':['gender']} + + sentence_list = cleanHtml(txt) + + classified_sen = {'Name': "", + 'Birthday': "", + 'Address': "", + 'Phone': "", + 'Email': "", + 'Contacts': "", + 'Location': "", + 'Photos': "", + 'Voices': "", + 'Financial info': "", + 'IP': "", + 'Cookies': "", + 'Social media': "", + 'Profile': "", + 'Gender': "" + } + + keyword_index = {'Name':[], + 'Birthday':[], + 'Address':[], + 'Phone':[], + 'Email':[], + 'Contacts':[], + 'Location':[], + 'Photos':[], + 'Voices':[], + 'Financial info':[], + 'IP':[], + 'Cookies':[], + 'Social media': [], + 'Profile': [], + 'Gender': [] + } + + # simList = [] + # for a in information_type: + # word.append(0) + # for b in information_type: + # simList.append(0) + for sentence in sentence_list: + # print("yes sentence: "+sentence+"\n") + + sentence = sentence.lower() + + info_found = False + + for type in information_type: + 
for w in information_type[type]: + + if w.lower() in sentence: + # if (check_ngram(w) == 1 and w.lower() in sentence.split()) or (check_ngram(w) > 1 and w.lower() in sentence): + if w == "geo" or w == "IP" or w == "DOB" or w == "mic": + if sentence[sentence.index(w.lower()) - 1] != " ": + continue + if sentence not in classified_sen[type]: + + if re.match(r'[a-zA-Z0-9]', sentence[-1]): + sentence = sentence + '.' + + # start_index = len(classified_sen[type]) + sentence.index(w.lower()) + # end_index = start_index + len(w.lower()) - 1 + # keyword_index[type].append([start_index, end_index]) + # classified_sen[type] = classified_sen[type] + sentence + + pattern = re.compile(re.escape(w.lower())) + for match in pattern.finditer(sentence): + start_index = len(classified_sen[type]) + match.start() + end_index = start_index + len(w) - 1 + keyword_index[type].append([start_index, end_index]) + # if sentence[0].isalpha(): + # sentence = sentence[0].upper() + sentence[1:] + classified_sen[type] = classified_sen[type] + sentence + '\n' + # sen_dict[type].append(sentence) + + info_found = True + + if not info_found and clf_type.predict(tf.transform([sentence])) == "1": + nlp = spacy.load('en_core_web_sm') + doc = nlp(sentence) + chunk_list = [] + for chunk in doc.noun_chunks: + chunk_str = str(chunk) + if chunk_str[0] == " ": + chunk_str = chunk_str[1:] + chunk_list.append(chunk_str) + + for type in information_type: + found_this_type = False + + for w in information_type[type]: + for chunk in chunk_list: + if w == chunk or wordnetSim_modified(chunk, w) > 0.8: + + if sentence not in classified_sen[type]: + # classified_sen[type].append(sentence) + + if re.match(r'[a-zA-Z0-9]', sentence[-1]): + sentence = sentence + '.' 
+ + # start_index = len(classified_sen[type]) + sentence.index(chunk) + # end_index = start_index + len(chunk) - 1 + # keyword_index[type].append([start_index, end_index]) + # classified_sen[type] = classified_sen[type] + sentence + + pattern = re.compile(re.escape(chunk)) + for match in pattern.finditer(sentence): + start_index = len(classified_sen[type]) + match.start() + end_index = start_index + len(chunk) - 1 + keyword_index[type].append([start_index, end_index]) + # if sentence[0].isalpha(): + # sentence = sentence[0].upper() + sentence[1:] + classified_sen[type] = classified_sen[type] + sentence + '\n' + # sen_dict[type].append(sentence) + + found_this_type = True + + if found_this_type: + break + + return classified_sen, keyword_index + + + + + + + + diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..65eede91d725e21af61bbcb3bb32c5be48e00c4b --- /dev/null +++ b/app.py @@ -0,0 +1,343 @@ +import gradio as gr +import cv2 +import numpy as np +import shutil +from bs4 import BeautifulSoup +import requests +import pandas as pd +import threading +import time +import os +import sys +import logging +from logging.handlers import TimedRotatingFileHandler +import psutil + +from SEM.run_single_sem import run_single_pp +from CDM.run_single import run_single_img + +title = "Cpp4App" +description = "Automated Contextual Privacy Policies Generation for Mobile Apps" + + +# log +log_file_path = 'logs/app.log' + +# set log handler(Generate one log file per day and keep only the latest 7 files) +handler = TimedRotatingFileHandler(log_file_path, when='W0', interval=1, backupCount=7) +formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') +handler.setLevel(logging.INFO) +handler.setFormatter(formatter) + +# set logger +logger = logging.getLogger() +logger.setLevel(logging.INFO) +logger.addHandler(handler) + +logger.info('Application started') + + +def write_and_read(): + # Write + with open('myfile.txt', 'w') as f: 
+ f.write('Hello, World!') + + # Read + with open('myfile.txt', 'r') as f: + data = f.read() + + print("this is data: ", data) + + return data + +def run_demo(img_root, output_root, segment_root, file): + print(type(file)) + + # file_content = file.read().decode('utf-8') + run_single_pp(file) + + output_board, output_data, complete_result = run_single_img(img_root, output_root, segment_root) + + return output_board, output_data, complete_result + +def inference(img, html): + + write_and_read() + + if img is None or html is None: + return None, None + + output_root = "./CDM/result_classification" + segment_root = './SEM/txt' + img_root = "./CDM/input_examples/1-1-write.jpg" + pp_root = "1.txt" + + # output_root = "" + # segment_root = "" + # img_root = "demo_img.jpg" + + img_array = np.array(img) + + cv2.imwrite(img_root, img_array) + + # replace example string with real example + # if html == 'html content 1': + # with open("examples/6.txt", "r") as f: + # html = f.read() + # elif html == 'html content 2': + # with open("examples/11.txt", "r") as f: + # html = f.read() + + # print("string: ", html) + # with open(pp_root, 'w', encoding='utf-8') as file: # Open the destination file in text mode + # file.write(html) # Write the HTML content to the destination file + + try: + response = requests.get(html) + response.raise_for_status() # Will raise an exception if the status is an error + input_text = response.text + except requests.HTTPError: + input_text = "" + # print("input_text: ", input_text) + with open(pp_root, 'w', encoding='utf-8') as file: + file.write(input_text) + + soup = BeautifulSoup(open(pp_root, encoding='utf-8'), features="html.parser") + # print("pp_root soup: ", soup.contents) + + output_board, output_data, complete_result = run_demo(img_root, output_root, segment_root, pp_root) + + # print(output_data) + + return output_board, output_data, complete_result + +# inputs = [ +# gr.inputs.Image(type="pil", label="Image Upload"), +# # 
gr.inputs.File(label="HTML File Upload"), +# gr.inputs.Textbox(label="Text Input") +# # gr.inputs.Textbox(lines=True, label="HTML Text") +# ] +# output = [ +# gr.outputs.Image(type="pil", label="Result Image"), +# gr.outputs.Dataframe(type="pandas", label="Result Excel") +# ] + +# gr.Interface( +# inference, +# # inputs, +# # output, +# inputs=[image_input_row, textbox_input_row], +# outputs=[image_output_row, dataframe_output_row], +# title=title, +# description=description, +# # examples=[['examples/6-8.jpg', 'examples/6.txt'], ['examples/11-9.jpg', 'examples/11.html']], +# # examples=[['examples/6-8.jpg', example_file_content_1], ['examples/11-9.jpg', example_file_content_2]], +# examples=[['examples/6-8.jpg', 'html content 1'], ['examples/11-9.jpg', 'html content 2']], +# enable_queue=True, +# capture_session=True, +# layout='vertical' +# ).launch(debug=False) + +# def example_inference(): +# image_input_bgr = cv2.imread('examples/6-8.jpg') +# image_input = cv2.cvtColor(image_input_bgr, cv2.COLOR_BGR2RGB) +# # text_input = 'html content 1' # example string +# text_input = 'https://www.whatsapp.com/legal/privacy-policy' +# +# out_result, out_segment = inference(image_input, text_input) +# +# return image_input, text_input, out_result, out_segment + +def example_inference_1(): + image_input_bgr = cv2.imread("examples/6-8.jpg") + image_input = cv2.cvtColor(image_input_bgr, cv2.COLOR_BGR2RGB) + text_input = 'https://www.whatsapp.com/legal/privacy-policy' + out_result, out_segment, complete_result = inference(image_input, text_input) + return image_input, text_input, out_result, out_segment, complete_result + +def example_inference_2(): + image_input_bgr = cv2.imread("examples/11-9.jpg") + image_input = cv2.cvtColor(image_input_bgr, cv2.COLOR_BGR2RGB) + text_input = 'https://values.snap.com/privacy/privacy-policy' + out_result, out_segment, complete_result = inference(image_input, text_input) + return image_input, text_input, out_result, out_segment, complete_result 
+ +def example_inference_3(): + image_input_bgr = cv2.imread("examples/1-1.jpg") + image_input = cv2.cvtColor(image_input_bgr, cv2.COLOR_BGR2RGB) + text_input = 'https://mcdonalds.com.au/privacy-policy' + out_result, out_segment, complete_result = inference(image_input, text_input) + return image_input, text_input, out_result, out_segment, complete_result + +def new_example_inference_1(): + image_input_bgr = cv2.imread("examples/6-8.jpg") + image_input = cv2.cvtColor(image_input_bgr, cv2.COLOR_BGR2RGB) + text_input = 'https://www.whatsapp.com/legal/privacy-policy' + + out_result_bgr = cv2.imread("results/result_1.png") + out_result = cv2.cvtColor(out_result_bgr, cv2.COLOR_BGR2RGB) + + out_segment = pd.read_excel("results/result_1_S.xlsx") + complete_result = pd.read_excel("results/result_1_C.xlsx") + + return image_input, text_input, out_result, out_segment, complete_result + +def new_example_inference_2(): + image_input_bgr = cv2.imread("examples/11-9.jpg") + image_input = cv2.cvtColor(image_input_bgr, cv2.COLOR_BGR2RGB) + text_input = 'https://values.snap.com/privacy/privacy-policy' + + out_result_bgr = cv2.imread("results/result_2.png") + out_result = cv2.cvtColor(out_result_bgr, cv2.COLOR_BGR2RGB) + + out_segment = pd.read_excel("results/result_2_S.xlsx") + complete_result = pd.read_excel("results/result_2_C.xlsx") + + return image_input, text_input, out_result, out_segment, complete_result + +def new_example_inference_3(): + image_input_bgr = cv2.imread("examples/1-1.jpg") + image_input = cv2.cvtColor(image_input_bgr, cv2.COLOR_BGR2RGB) + text_input = 'https://mcdonalds.com.au/privacy-policy' + + out_result_bgr = cv2.imread("results/result_3.png") + out_result = cv2.cvtColor(out_result_bgr, cv2.COLOR_BGR2RGB) + + out_segment = pd.read_excel("results/result_3_S.xlsx") + complete_result = pd.read_excel("results/result_3_C.xlsx") + + return image_input, text_input, out_result, out_segment, complete_result + +# def toggle_dataframe_callback(): +# 
complete_result_dataframe.visible = not complete_result_dataframe.visible + + +def schedule_restarts(interval_hours): + """ + Auto-restart every 'interval_hours' hours + """ + while True: + time.sleep(interval_hours * 3600) # convert hour to second + python = sys.executable + os.execl(python, python, *sys.argv) + +def schedule_daily_resource_logging(interval_hours): + """ + Logging system resource usage every 'interval_hours' hours + """ + while True: + cpu_usage = psutil.cpu_percent(interval=1) + memory_info = psutil.virtual_memory() + disk_usage = psutil.disk_usage('/') + + logger.info(f"CPU usage: {cpu_usage}%") + logger.info( + f"Memory usage: {memory_info.percent}% (Total: {memory_info.total}, Used: {memory_info.used}, Free: {memory_info.free})") + logger.info( + f"Disk usage: {disk_usage.percent}% (Total: {disk_usage.total}, Used: {disk_usage.used}, Free: {disk_usage.free})") + + time.sleep(interval_hours * 3600) # waiting for 'interval_hours' hours + + +demo = gr.Blocks() +with demo: + gr.Markdown("# Cpp4App\n\n**Automated Contextual Privacy Policies Generation for Mobile Apps**" + "\n\nThere are two inputs to generate CPP for a mobile app: app's privacy policy URL link and a GUI screenshot") + + with gr.Row(): + example_image_1 = gr.Image(value="examples/6-8.jpg", label="Example 1") + example_image_2 = gr.Image(value="examples/11-9.jpg", label="Example 2") + example_image_3 = gr.Image(value="examples/1-1.jpg", label="Example 3") + with gr.Column(): + gr.Markdown("**You can try with three examples we provided:**" + "\n\n- WhatsApp" + "\n\n- Snap" + "\n\n- Mcdonald's" + "\n\n**You can also try with your own example:**" + "\n\nUpload the screenshot and privacy policy URL link, then click 'submit' button" + # "\n\n" + # "\n\nThe three provided examples are pre-run, while your own screenshot needs to run for approximately one minute." 
+ ) + + with gr.Row(): + example_button_1 = gr.Button("Run with Example 1") + example_button_2 = gr.Button("Run with Example 2") + example_button_3 = gr.Button("Run with Example 3") + with gr.Column(): + clear_button = gr.Button("Clear") + submit_button = gr.Button("Submit") + + with gr.Row(): + text_input = gr.Textbox(label="URL Input for the Privacy Policy of the App") + + with gr.Column(): + image_input = gr.Image(type="pil", label="Screenshot Upload") + result_image = gr.Image(type="pil", label="Result Screenshot") + + with gr.Row(): + result_dataframe = gr.Dataframe(type="pandas", label="Result Excel (Summarized)") + + # with gr.Row(): + # # Create a button to control the display of complete_result_dataframe + # toggle_dataframe_button = gr.Button("Show Complete Result Excel") + + with gr.Row(): + complete_result_dataframe = gr.Dataframe(type="pandas", label="Result Excel (Complete)") + + # with gr.Row(): + # example_button_1 = gr.Button("Run with Example 1") + # example_button_2 = gr.Button("Run with Example 2") + # example_button_3 = gr.Button("Run with Example 3") + # with gr.Column(): + # clear_button = gr.Button("Clear") + # submit_button = gr.Button("Submit") + # + # with gr.Row(): + # example_image_1 = gr.Image(value="examples/6-8.jpg", label="Example 1") + # example_image_2 = gr.Image(value="examples/11-9.jpg", label="Example 2") + # example_image_3 = gr.Image(value="examples/1-1.jpg", label="Example 3") + # with gr.Column(): + # gr.Markdown("**You can try with three examples we provided:**" + # "\n\n- WhatsApp" + # "\n\n- Snap" + # "\n\n- Mcdonald's" + # "\n\n**You can also try with your own example:**" + # "\n\nUpload the screenshot and privacy policy URL link, then click 'submit' button") + + submit_button.click(inference, inputs=[image_input, text_input], outputs=[result_image, result_dataframe, complete_result_dataframe]) + clear_button.click(lambda: [None, None, None, None, None, None], inputs=[], outputs=[image_input, text_input, result_image, 
result_dataframe, complete_result_dataframe]) + # example_button.click(example_inference, inputs=[], outputs=[image_input, text_input, result_image, result_dataframe]) + example_button_1.click(new_example_inference_1, + inputs=[], + outputs=[image_input, text_input, result_image, result_dataframe, complete_result_dataframe]) + example_button_2.click(new_example_inference_2, + inputs=[], + outputs=[image_input, text_input, result_image, result_dataframe, complete_result_dataframe]) + example_button_3.click(new_example_inference_3, + inputs=[], + outputs=[image_input, text_input, result_image, result_dataframe, complete_result_dataframe]) + + # # Create a unique CSS ID for the dataframe output + # dataframe_id = id(complete_result_dataframe) + # + # # Define CSS styles for hiding/showing the dataframe + # hide_style = f"#{dataframe_id} {{ display: none; }}" + # show_style = f"#{dataframe_id} {{ display: block; }}" + # + # + # def toggle_dataframe_callback(): + # if toggle_dataframe_button.label == "Show Complete Result Excel": + # toggle_dataframe_button.label = "Hide Complete Result Excel" + # gr.Html(style=show_style).show() + # else: + # toggle_dataframe_button.label = "Show Complete Result Excel" + # gr.Html(style=hide_style).show() + +threading.Thread(target=schedule_restarts, args=(72,)).start() # restart every 72 hours +threading.Thread(target=schedule_daily_resource_logging, args=(24,)).start() # recorded every 24 hours + +try: + demo.launch() + logger.info('Gradio app launched successfully') +except Exception as e: + logger.error('Error occurred while launching Gradio app', exc_info=True) diff --git a/examples/1-1.jpg b/examples/1-1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..68bda7bad3d590e5b552aea184f26fb26dfe3fb1 Binary files /dev/null and b/examples/1-1.jpg differ diff --git a/examples/1.html b/examples/1.html new file mode 100644 index 0000000000000000000000000000000000000000..14d27062672b14c4904bc31ca2de4b895ddf7382 --- 
/dev/null +++ b/examples/1.html @@ -0,0 +1,717 @@ + + + + + + + + + + + + + + + + + + + + + + + + GMA Privacy Policy | McDonald's Australia + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ + +
+
+ +
+
+ +
+ + +

GMA Privacy Policy

+

+
+
+ +
+
+ + +
+
+

McDonald’s is committed to respecting your personal information. Our privacy policy sets out how we collect, use, store and disclose your personal information. When you use our Websites or Apps, or provide your personal information to us, you consent to your personal information being collected, held, used and disclosed as set out in our privacy policy.

+

 

+

Information we collect and hold

+

McDonald’s collects personal information about you in a number of ways, including when you:

+
  • use our websites (including www.mcdonalds.com.au), social media pages, and internal websites or intranet (Website);
  • +
  • use our mobile and tablet Apps (Apps); and
  • +
  • interact with us and provide personal information by any other means, including either physically or electronically,
  • +

(Collection Channels). 

+

Personal information that McDonald’s collects and holds may include your name, email address, delivery address, date of birth, phone number, payment method, social media handles, photographs of you and other identifying information you choose to provide via a particular Collection Channel.

+

When you use a Website or App, we may also collect personal information about you in the following general categories:

+
  • Location information: If you permit an App to access location services in your settings, then we collect your device location to enable the App to deliver your order or to send you alerts.
  • +
  • Transaction information: We collect your transaction details when you place an order via a Website or App, including the products you have ordered, the date and time of your order, the amount charged and your loyalty entitlements.
  • +
  • Usage and preferences: We collect information about how you interact with our Websites or Apps, including the pages you visit, your preferences and the settings that you choose. We do this through cookies and other similar technology.
  • +
  • Device information: We collect information about your device, such as the hardware model, operating system, preferred language, unique device identifier and mobile network.
  • +
  • Employee information: If you are a job applicant, an employee in one of our restaurants or our corporate offices, or a former employee, and use a Website or App, we collect information about the training modules you have completed, the forms you have submitted, the approvals you have given or received, and other similar information related to your job.
  • +
  • Other information: We also collect and log information such as your IP address, access dates and times, browser type and pages visited when you interact with a Website or App.
  • +

We also collect personal information about you from third parties, including when:

+
  • you choose to create an account or register for a product or service via a Website or App using a social media platform (e.g. Facebook);
  • +
  • you have consented to a third party disclosing your personal information to us (e.g. when you enter a competition or promotion run by a third party for us); and
  • +
  • it is otherwise lawful for a third party to disclose your personal information to us.
  • +

We also collect personal or anonymous information about you from other sources and sometimes combine that information with other information collected from you or from third parties for the purposes disclosed in this privacy policy.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

How McDonald’s collects and holds personal information

+

McDonald’s will only collect or monitor any personal information about you as provided in this privacy policy.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Active information collection

+

McDonald’s may collect your personal information via our Collection Channels when you:

+
  • purchase a product or make a booking through a Website or App;
  • +
  • participate in any offers, marketing activities, loyalty or rewards program or promotional activities;
  • +
  • contact us or provide us with personal information directly via any medium including a Website or App, SMS or other message service and email, social media platforms, mail, telephone or in person;
  • +
  • interact with a Website or App for a specific purpose;
  • +
  • browse a Website or App generally;
  • +
  • sign-up to, or register an account via any Collection Channel; or
  • +
  • apply for employment with McDonald’s.
  • +
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Platform permissions

+

Mobile platforms such as iOS and Android may define certain types of information or data that our Apps cannot access without your consent. Each platform has its own permissions system for obtaining your consent. For example, the iOS platform may alert you the first time an App wants your permission to access certain types of data (e.g. location services) and will provide you the option to consent to that request. Android devices may notify you of the permissions that an App seeks before you first use the App and your subsequent use of the App constitutes your consent. You can usually manage your platform level permissions via the Settings section on your device. For more information, please contact your device provider or refer to the user manual for your device.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Privacy Policy 

+

McDonald’s Privacy Policy contains information about how you can access and correct your personal information, how you can lodge a complaint regarding the handling of your personal information and how any complaint will be handled by McDonald’s. You may contact McDonald’s with any queries via email: privacy@au.mcd.com or at McDonald's Australia Limited (Attention: McDonald's Privacy Officer), PO Box 392 Pennant Hills NSW 2120 Australia.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Cookies and passive information collection

+

We may use tracking technologies to collect personal information about you when you use and access a Website or App, including cookies, internet tags or web beacons, and navigational data collection (e.g. log files, server logs, and clickstream data). For example, we may collect information about the date, time and duration of visits and which pages of a Website or App are most commonly accessed. This browsing information is generally not linked to your identity, except where you access a Website or App via links in a message we have sent or where we are able to identify the user accessing a Website or App.

+

We may combine your anonymous information, browsing information or other information collected through tracking technologies with your personal information collected via our Collection Channels in order to understand and remember your preferences and interests. By accessing a Website or App via links and/or by accessing a Website or App where you have identified yourself, you consent to the collection of this information.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Purposes for which McDonald’s collects, holds, uses and discloses personal information

+

We collect, hold, use and disclose your personal information for our primary purposes, including:

+
  • for the purposes stated on a particular Collection Channel;
  • +
  • to maintain and improve the functionality of a Website or App;
  • +
  • to fulfil obligations in respect of any sale and purchase contract and/or any other contract between you and McDonald’s;
  • +
  • to manage your orders or facilitating payment, for example, when you use our App the drive thru screen and kiosk will display your name and crew members will greet you by name.
  • +
  • to send you any technical, administrative or legal notices important to our Websites and Apps;
  • +
  • to provide you with information about your transactions and loyalty entitlements;
  • +
  • to provide marketing materials and information about our products and services, events, special offers, competitions and/or promotions, or to request your feedback for promotional purposes;
  • +
  • to respond to customer enquiries or complaints;
  • +
  • to manage your employment or process your application for employment with McDonald’s (including McDonald’s franchisees) and to facilitate effective employment practices;
  • +
  • to obtain opinions or comments about products and/or services and to conduct other market research and development (including to record statistical data for marketing analysis);
  • +
  • to enter you into and administer promotions;
  • +
  • to provide, maintain and improve our products and services;
  • +
  • to customise a Website or App based on your preferences;
  • +
  • to allow you to use  a Website or App;
  • +
  • to share with trusted third parties including professional service providers, our related bodies corporate, our franchisees, our suppliers and our promotional partners and other trusted third parties (and their directors, servants and agents) and agencies (McDonald’s Family); and
  • +
  • to share with your social media communities, to the extent allowed by you.
  • +
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Not providing information

+

You don’t have to provide any personal information to us. However, if you do not do so, this may affect or completely restrict your ability to use a Website or App and our ability to provide you with relevant content, products and services.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Sharing your personal information

+

McDonald's shares personal information with the global McDonald’s Family for the purposes described in this privacy policy.

+

McDonald’s recognises the trust with which you provide personal information, and except as stated in this privacy policy, your information will not be used or disclosed for any other purposes without your consent. However, McDonald's reserves the right to use or disclose any information, including personal information, as needed to satisfy any law, regulation or legal request, to protect the rights or property of McDonald's, any member of the McDonald's Family, or any member of the public, to protect the integrity of a Website or App, to fulfil your requests, or to cooperate in any law enforcement investigation or an investigation on a matter of public safety.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Contact by McDonald’s and third parties

+

If you would like to opt out of receiving advertising communications from us, the McDonald’s Family and our trusted third parties, you can unsubscribe.

+

We may still send you transaction and administrative information.

+

If you no longer wish to receive any communications from McDonald’s via an App, you can delete the App from your device.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Ability of others to view information

+

McDonald’s provides areas on Websites and Apps where you can upload user-generated content, post or provide information about yourself, communicate with other users, provide reviews for content, products and/or services or interact with or vote on particular content. This information may be publicly posted on a Website or App and/or shared with others, including social media platforms and other public forums in which you choose to participate. This information may become publicly available and may be read, collected and used by others outside of a McDonald’s Website or App. McDonald’s is not responsible for the conduct of others who may read, collect and use this information.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Children

+

McDonald's is very sensitive to privacy issues. We are proud of our long-time commitment to our customers. McDonald’s does not intend to collect personal information from any person under the age of 18 years without the consent of a parent or legal guardian. We urge parents to regularly monitor and supervise their children's on-line activities.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Security of personal information

+

McDonald’s will endeavour to take all reasonable steps to protect your personal information. All information is passed through to a secure server using encryption technology and stored on secure servers that are protected in controlled facilities, which in some cases may be overseas. McDonald's employees and data processors are obliged to respect the confidentiality of any personal information held by McDonald's. However, McDonald’s cannot guarantee the security of your personal information and will not be held responsible for events arising from unauthorised access to personal information beyond McDonald's reasonable control.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Disclosure of personal information to overseas recipients

+

In some cases, McDonald’s may disclose your personal information to overseas recipients, including but not limited to recipients in the United States of America, Japan, Malaysia and Singapore. McDonald’s employees and data processors are obliged to respect the confidentiality of any personal information held by McDonald’s.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Access to personal information

+

You are in control of any personal information you provide to us. If at any time, you would like to access, review, correct and/or delete the personal information we have about you, or if you would like to change your contact preferences, you can let us know via the contact details listed below. Please allow 30 days for this request to be processed.

+

Your personal information may be stored in different locations depending upon the reason for which you originally submitted the information. If you make an inquiry in relation to your personal information, the more information you can provide us about when you originally submitted your personal information, the quicker McDonald's will be able to retrieve your personal information.

+

If requested, all reasonable steps to delete personal information will be made, except where it is required for legal reasons. Deletion of information may result in McDonald's being unable to facilitate or provide you with information about certain transactions (including the uploading of, access to, and receipt of, content on a Website or App, and purchase transactions undertaken on a Website or App), other content, services or product information, upcoming promotions, competitions or event information, and/or provide certain content, products or services.

+

We are not responsible for removing your personal information from the lists of any third party who has previously been provided your information in accordance with this privacy policy.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Links to other sites

+

Our Websites or Apps contain links to sites operated by third parties. We are not responsible for the privacy practices of, or any content on, those sites linked to our Websites and Apps. If you visit one of these linked websites, we encourage you to review their privacy and other policies.

+

We may use third party advertisements on our Websites and Apps. All third party advertising, if paid for, is paid for by the relevant third party advertisers. Third party advertisements are not recommendations or endorsements by McDonald’s or any of its affiliates. To the extent permitted by law, McDonald’s is not responsible for the content (including representations) of any third party advertisement on a Website or App. Cookies may be associated with these advertisements to enable the advertiser to track the number of anonymous users responding to the campaign.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Related McDonald's Websites or Apps

+

All Websites and Apps operated by McDonald's in Australia will adhere to this privacy policy. The policies on the Websites and Apps of some other members of the McDonald's Family may vary because of local customs, practices or laws.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Franchisee privacy policies

+

Many McDonald's restaurants are owned and operated by independent franchisees. Some franchisees also operate websites and are required to follow this privacy policy. If you are concerned that there may have been a breach of this privacy policy by a franchisee, please contact the relevant franchisee entity or McDonald’s restaurant directly.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Changes to our privacy policy

+

From time to time, it may be necessary for McDonald's to change this privacy policy without notice. We will post any changes to this privacy policy on our Websites and Apps. Rest assured, however, that any changes will not be retroactively applied and will not alter how we handle previously collected personal information.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Sale of the Company

+

If McDonald’s merges with, is acquired by another company, or sells all or a portion of its assets, your personal information may be disclosed to our advisers and any prospective purchaser’s adviser and may be among the assets transferred. However, your personal information will always remain subject to this privacy policy.

+
+ + +
+ +
+
+
+
+
+
+
+
+ + +
+
+

Contact Us

+

If you have any questions about our privacy policy, or any problems or complaints about how we have collected, used, stored, handled and/or disclosed your personal information, please contact us at:

+

Mail:     McDonald's Privacy Officer

+

McDonald's Australia Limited

+

PO Box 392

+

Pennant Hills NSW 2120

+

Australia

+

Email: privacy@au.mcd.com

+

Telephone: (02) 9875 6666

+

Fax: (02) 98756568

+

Please allow 14 days for this request to be processed. If you do not receive a satisfactory response from McDonald’s to your query, problem or complaint within 14 days, you may refer your query, problem or complaint to the Office of the Australian Information Commissioner via the contact details listed at https://www.oaic.gov.au/about-us/contact-us/.

+

 

+
+ + +
+ +
+
+
+
+
+
+ + +
+
+
+
+ + + + + + + + + diff --git a/examples/11-9.jpg b/examples/11-9.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5830ed8112ad4de87e42ef0ace8843880def476b Binary files /dev/null and b/examples/11-9.jpg differ diff --git a/examples/11.html b/examples/11.html new file mode 100644 index 0000000000000000000000000000000000000000..87c5234bfcae85325f6853dce784b5da320d94b3 --- /dev/null +++ b/examples/11.html @@ -0,0 +1,698 @@ + Privacy Policy - Data Policy | Snapchat Privacy
Privacy and Safety Hub

Privacy Policy

Effective: June 29, 2022
Snap Inc. is a camera company. Our products and services — including Snapchat, Bitmoji, Spectacles, advertising, commerce, and others that link to this Privacy Policy — provide fast and fun ways to express yourself, live in the moment, learn about the world, and have fun together!
When you use these services, you’ll share some information with us. So we want to be upfront about the information we collect, how we use it, whom we share it with, and the controls we give you to access, update, and delete your information.
That’s why we’ve written this Privacy Policy. And it’s why we’ve tried to write it in a way that’s easy to understand for all our users and blissfully free of the legalese that often clouds these documents. Of course, if you still have questions about anything in our Privacy Policy, just contact us.
You should read our entire Privacy Policy, but when you only have a few minutes or want to remember something later on, you can always take a look at this overview and video. We also encourage you to check out the rest of our Privacy Center. We designed it to give you easy-to-digest summaries of our privacy practices. For example, our Privacy by Product page provides a breakdown of specific privacy features for our products.

Information We Collect

There are three basic categories of information we collect:
  • Information you provide.
  • Information we get when you use our services.
  • Information we get from third parties.
Here’s a little more detail on each of these categories.
Information You Provide
When you interact with our services, we collect information that you provide to us. For example, many of our services require you to set up an account, so we may need to collect a few important details about you, such as your name, username, password, email address, phone number, and date of birth. We may also ask you to provide us with some additional information that will be publicly visible on our services, such as a profile picture or Bitmoji avatar. Some services, such as commerce products, may require you to provide us with a debit or credit card number and its associated account information.
Of course, you’ll also provide us whatever information you send through our services, such as Snaps and Chats. Keep in mind that the users who view your Snaps, Chats, and any other content can always save that content or copy it outside the app. So, the same common sense that applies to the internet at large applies to our services as well: Don’t send messages or share content that you wouldn’t want someone to save or share.
When you contact customer support or communicate with us in any other way, we’ll collect whatever information you volunteer or that we need to resolve your question.
Information We Get When You Use Our Services
When you use our services, we collect information about which of those services you’ve used and how you’ve used them. We might know, for instance, that you watched a particular Story, saw a specific ad for a certain period of time, and sent a few Snaps. Here’s a fuller explanation of the types of information we collect when you use our services:
  • Usage Information. We collect information about your activity through our services. For example, we may collect information about:
    • how you interact with our services, such as which Filters or Lenses you view or apply to Snaps, which Stories you watch on Discover, whether you’re using Spectacles, or which search queries you submit.
    • how you communicate with other Snapchatters, such as their names, the time and date of your communications, the number of messages you exchange with your friends, which friends you exchange messages with the most, and your interactions with messages (such as when you open a message or capture a screenshot).
  • Content Information. We collect content you create on our services, such as custom stickers, and information about the content you create or provide, such as if the recipient has viewed the content and the metadata that is provided with the content.
  • Device Information. We collect information from and about the devices you use. For example, we collect:
  • information about your hardware and software, such as the hardware model, operating system version, device memory, advertising identifiers, unique application identifiers, apps installed, unique device identifiers, device usage data, browser type, keyboards installed, language, battery level, and time zone;
  • information from device sensors, such as accelerometers, gyroscopes, compasses, microphones, and whether you have headphones connected; and
  • information about your wireless and mobile network connections, such as mobile phone number, service provider, IP address, and signal strength.
  • Device Phonebook. Because our services are all about communicating with friends, we may — with your permission — collect information from your device’s phonebook.
  • Camera, Photos, and Audio. Many of our services require us to collect images and other information from your device’s camera and photos. For example, you won’t be able to send Snaps or upload photos from your camera roll unless we can access your camera or photos.
  • Location Information. When you use our services we may collect information about your location. With your permission, we may also collect information about your precise location using methods that include GPS, wireless networks, cell towers, Wi-Fi access points, and other sensors, such as gyroscopes, accelerometers, and compasses.
  • Information Collected by Cookies and Other Technologies. Like most online services and mobile applications, we may use cookies and other technologies, such as web beacons, web storage, and unique advertising identifiers, to collect information about your activity, browser, and device. We may also use these technologies to collect information when you interact with services we offer through one of our partners, such as advertising and commerce features. For example, we may use information collected on other websites to show you more relevant ads. Most web browsers are set to accept cookies by default. If you prefer, you can usually remove or reject browser cookies through the settings on your browser or device. Keep in mind, though, that removing or rejecting cookies could affect the availability and functionality of our services. To learn more about how we and our partners use cookies on our services and your choices, please check out our Cookie Policy.
  • Log Information. We also collect log information when you use our website, such as:
  • details about how you’ve used our services;
  • device information, such as your web browser type and language;
  • access times;
  • pages viewed;
  • IP address;
  • identifiers associated with cookies or other technologies that may uniquely identify your device or browser; and
  • pages you visited before or after navigating to our website.
Information We Collect from Third Parties
We may collect information about you from other users, our affiliates, and third parties. Here are a few examples:
  • If you link your Snapchat account to another service (like Bitmoji or a third-party app), we may receive information from the other service, like how you use that service.
  • Advertisers, app developers, publishers, and other third parties may share information with us as well. We may use this information, among other ways, to help target or measure the performance of ads. You can learn more about our use of this kind of third-party data in our Support Center.
  • If another user uploads their contact list, we may combine information from that user’s contact list with other information we have collected about you.

How We Use Information

What do we do with the information we collect? For the detailed answer, go here. The short answer is: Provide you with an amazing set of products and services that we relentlessly improve. Here are the ways we do that:
  • develop, operate, improve, deliver, maintain, and protect our products and services.
  • send you communications, including by email or SMS where permitted. For example, we may use email or SMS to respond to support inquiries or to share information about our products, services, and promotional offers that we think may interest you.
  • monitor and analyze trends and usage.
  • personalize our services by, among other things, suggesting friends, profile information, or Bitmoji stickers, helping Snapchatters find each other in Snapchat, affiliate and third-party apps and services, or customizing the content we show you, including ads.
  • add context to your Snapchat experience, for example by tagging your Memories with searchable labels based on your location (of course, if you’ve given us permission to collect your location) and the content of your photo or video (e.g., if there’s a dog in your photo, it may be searchable in Memories by the term “dog”).
  • provide and improve our advertising services, ad targeting, and ad measurement, including through the use of your precise location information (again, if you’ve given us permission to collect that information), both on and off our services. We may also store information about your use of third-party apps and websites on your device to do this. Learn more. See the Control Over Your Information section below for more information about Snap Inc.’s advertising practices and your choices.
  • enhance the safety and security of our products and services.
  • verify your identity and prevent fraud or other unauthorized or illegal activity.
  • use information we’ve collected from cookies and other technology to enhance our services and your experience with them.
  • enforce, investigate, and report conduct violating our Terms of Service and other usage policies, respond to requests from law enforcement, and comply with legal requirements.
We may also use information from Apple’s TrueDepth camera to improve the quality of Lenses. Information from the TrueDepth camera is used in real time — we don’t store this information on our servers or share it with third parties.

How We Share Information

We may share information about you in the following ways:
  • With other Snapchatters. We may share the following information with other Snapchatters:
  • information about you, such as your username, name, and Bitmoji.
  • information about how you have interacted with our services, such as your Snapchat “score,” the names of Snapchatters you are friends with, how close you are with your friends on Snapchat, your recent location history (if you choose to share your location on Snap Map), and other information that will help Snapchatters understand your connections with others using our services. For example, because it may not be clear whether a new friend request comes from someone you actually know, we may share whether you and the requestor have Snapchat friends in common.
  • information about your device, such as the operating system and device type, to help you receive Chats, Snaps, and other content in the optimal viewing format.
  • any additional information you have directed us to share. For example, Snap will share your information when you connect your Snapchat account to a third-party app, and if you share information or content from Snapchat to the third-party app.
  • content you post or send. How widely your content is shared depends on your personal settings and the type of service you are using. For example, a Snap may be sent to just a single friend you select, but your My Story content may be seen by any Snapchatter whom you allow to see your My Story.
  • With all Snapchatters, our business partners, and the general public. We may share the following information with all Snapchatters as well as with our business partners and the general public:
  • public information like your name, username, profile pictures, Snapcode, and Public Profile.
  • Public Content like your Highlights, Custom Stickers, Lenses, Story submissions that are set to be viewable by Everyone, and any content that you submit to an inherently public service, like Spotlight, Snap Map, and other crowd-sourced services. This content may be viewed, used, and shared by the public at large both on and off our services, including through search results, on websites, in apps, and in online and offline broadcasts.
  • With third parties. We may share information with third parties in the following ways:
  • We may share information about you with service providers who perform services on our behalf, including to facilitate payments and measure and optimize the performance of ads and deliver more relevant ads, including on third-party websites and apps.
  • We may share information about you with business partners that provide services and functionality on our services. For more information about information collected by third parties on our services, visit our Support Site.
  • We may share information about you, such as device and usage information, to help us and others prevent fraud.
  • We may share information about you for legal, safety, and security reasons. We may share information about you if we reasonably believe that disclosing the information is needed to:
  • comply with any valid legal process, governmental request, or applicable law, rule, or regulation.
  • investigate, remedy, or enforce potential Terms of Service and Community Guidelines violations.
  • protect the rights, property, or safety of us, our users, or others.
  • detect and resolve any fraud or security concerns.
  • We may share information about you as part of a merger or acquisition. If Snap Inc. gets involved in a merger, asset sale, financing, liquidation or bankruptcy, or acquisition of all or some portion of our business to another company, we may share your information with that company before and after the transaction closes.
  • Non-personal information. We may also share with third parties that provide services to us or perform business purposes for us aggregated, non-personally identifiable, or de-identified information.

Third-Party Content and Integrations

Our services may contain third-party content and integrations. Examples include third-party integrations in the Camera, third-party games in Chat, and third-party Snap Kit integrations. Through these integrations, you may be providing information to the third party as well as to Snap. We are not responsible for how those third parties collect or use your information. As always, we encourage you to review the privacy policies of every third-party service that you visit or use, including those third parties you interact with through our services. You can learn more about third-party services in Snapchat here.

How Long We Keep Your Information

Snapchat lets you capture what it’s like to live in the moment. On our end, that means most messages — like Snaps and Chats — sent in Snapchat will be automatically deleted by default from our servers after we detect they’ve been opened by all recipients or have expired. Other content, like Story posts, is stored for longer. For detailed information about how long we store different types of content, check out our Support Site.
We store other information for longer periods of time. For example:
  • We store your basic account information — like your name, phone number, and email address — and list of friends until you ask us to delete them.
  • We store location information for different lengths of time based on how precise it is and which services you use. If location information is associated with a Snap — like those saved to Memories or posted to Snap Map or Spotlight — we’ll retain that location as long as we store the Snap. Pro tip: You can see the location data we retain about you by downloading your data.
If you ever decide to stop using Snapchat, you can just ask us to delete your account. We’ll also delete most of the information we’ve collected about you after you’ve been inactive for a while!
Keep in mind that, while our systems are designed to carry out our deletion practices automatically, we cannot promise that deletion will occur within a specific timeframe. There may be legal requirements to store your data and we may need to suspend those deletion practices if we receive valid legal process asking us to preserve content, if we receive reports of abuse or other Terms of Service violations, or if your account, content created by you, or content created with other users is flagged by others or our systems for abuse or other Terms of Service violations. Finally, we may also retain certain information in backup for a limited period of time or as required by law.

Control Over Your Information

We want you to be in control of your information, so we provide you with the following tools.
  • Access, Correction, and Portability. You can access and edit most of your basic account information right in our apps. You can also use Download My Data to obtain a copy of information that isn’t available in our apps in a portable format, so you can move it or store it wherever you want. Because your privacy is important to us, we will ask you to verify your identity or provide additional information before we let you access or update your personal information. We may also reject your request to access or update your personal information for a number of reasons, including, for example, if the request risks the privacy of other users or is unlawful.
  • Revoking permissions. In most cases, if you let us use your information, you can simply revoke your permission by changing the settings in the app or on your device if your device offers those options. Of course, if you do that, certain services may lose full functionality. For promotional emails and SMS, you may opt out by clicking on the unsubscribe link or similar mechanism as provided.
  • Deletion. While we hope you’ll remain a lifelong Snapchatter, if for some reason you ever want to delete your account, just go here to learn how. You can also delete some information in the app, like photos you’ve saved to Memories, Spotlight submissions, and search history.
  • Advertising Preferences. We try to show you ads that we think will be relevant to your interests. If you would like to modify the information we and our advertising partners use to select these ads, you can do so in the app and through your device preferences. Go here to learn more.
  • Tracking. If you opt out of tracking on devices running iOS 14.5 or more recent versions, we will not link identifiable information from third-party apps and websites with identifiable information from Snapchat for advertising purposes, except on your device. You can control use of this on-device data for advertising by opting out of Activity-Based Advertising in Snapchat Ad Preferences Settings. Go here to learn more.
  • Communicating with other Snapchatters. It’s important to us that you stay in control over whom you communicate with. That’s why we’ve built a number of tools in Settings that let you indicate, among other things, who you want to see your Stories, whether you’d like to receive Snaps from just your friends or all Snapchatters, and whether you’d like to block another Snapchatter from contacting you again. Go here to learn more.

International Data Transfers

We may collect your personal information from, transfer it to, and store and process it in the United States and other countries outside of where you live. Whenever we share information outside of where you live and are legally required to do so, we make sure an adequate transfer mechanism is in place. We also make sure any third parties we share information with have an adequate transfer mechanism in place, as well. You can find more information on the categories of third parties we share information with here.

State and Region Specific Information

You may have specific privacy rights in your state or region. For example, in the United States, residents of California and other states have specific privacy rights. Snapchatters in the European Economic Area (EEA), the UK, Brazil, the Republic of Korea, and other jurisdictions also have specific rights. We keep an up-to-date overview of state and region specific disclosures here.

Children

Our services are not intended for — and we don’t direct them to — anyone under 13. And that’s why we do not knowingly collect personal information from anyone under 13. In addition, we may limit how we collect, use, and store some of the information of EEA and UK users between 13 and 16. In some cases, this means we will be unable to provide certain functionality to these users. If we need to rely on consent as a legal basis for processing your information and your country requires consent from a parent, we may require your parent’s consent before we collect and use that information.

Revisions to the Privacy Policy

We may change this Privacy Policy from time to time. But when we do, we’ll let you know one way or another. Sometimes, we’ll let you know by revising the date at the top of the Privacy Policy that’s available on our website and mobile application. Other times, we may provide you with additional notice (such as adding a statement to our websites’ homepages or providing you with an in-app notification).
\ No newline at end of file diff --git a/examples/11.txt b/examples/11.txt new file mode 100644 index 0000000000000000000000000000000000000000..98cf13851929c56d2208b2157262080fb3db7c4a --- /dev/null +++ b/examples/11.txt @@ -0,0 +1,699 @@ + + Privacy Policy - Data Policy | Snapchat Privacy
Privacy and Safety Hub

Privacy Policy

Effective: June 29, 2022
Snap Inc. is a camera company. Our products and services — including Snapchat, Bitmoji, Spectacles, advertising, commerce, and others that link to this Privacy Policy — provide fast and fun ways to express yourself, live in the moment, learn about the world, and have fun together!
When you use these services, you’ll share some information with us. So we want to be upfront about the information we collect, how we use it, whom we share it with, and the controls we give you to access, update, and delete your information.
That’s why we’ve written this Privacy Policy. And it’s why we’ve tried to write it in a way that’s easy to understand for all our users and blissfully free of the legalese that often clouds these documents. Of course, if you still have questions about anything in our Privacy Policy, just contact us.
You should read our entire Privacy Policy, but when you only have a few minutes or want to remember something later on, you can always take a look at this overview and video. We also encourage you to check out the rest of our Privacy Center. We designed it to give you easy-to-digest summaries of our privacy practices. For example, our Privacy by Product page provides a breakdown of specific privacy features for our products.

Information We Collect

There are three basic categories of information we collect:
  • Information you provide.
  • Information we get when you use our services.
  • Information we get from third parties.
Here’s a little more detail on each of these categories.
Information You Provide
When you interact with our services, we collect information that you provide to us. For example, many of our services require you to set up an account, so we may need to collect a few important details about you, such as your name, username, password, email address, phone number, and date of birth. We may also ask you to provide us with some additional information that will be publicly visible on our services, such as a profile picture or Bitmoji avatar. Some services, such as commerce products, may require you to provide us with a debit or credit card number and its associated account information.
Of course, you’ll also provide us with whatever information you send through our services, such as Snaps and Chats. Keep in mind that the users who view your Snaps, Chats, and any other content can always save that content or copy it outside the app. So, the same common sense that applies to the internet at large applies to our services as well: Don’t send messages or share content that you wouldn’t want someone to save or share.
When you contact customer support or communicate with us in any other way, we’ll collect whatever information you volunteer or that we need to resolve your question.
Information We Get When You Use Our Services
When you use our services, we collect information about which of those services you’ve used and how you’ve used them. We might know, for instance, that you watched a particular Story, saw a specific ad for a certain period of time, and sent a few Snaps. Here’s a fuller explanation of the types of information we collect when you use our services:
  • Usage Information. We collect information about your activity through our services. For example, we may collect information about:
    • how you interact with our services, such as which Filters or Lenses you view or apply to Snaps, which Stories you watch on Discover, whether you’re using Spectacles, or which search queries you submit.
    • how you communicate with other Snapchatters, such as their names, the time and date of your communications, the number of messages you exchange with your friends, which friends you exchange messages with the most, and your interactions with messages (such as when you open a message or capture a screenshot).
  • Content Information. We collect content you create on our services, such as custom stickers, and information about the content you create or provide, such as if the recipient has viewed the content and the metadata that is provided with the content.
  • Device Information. We collect information from and about the devices you use. For example, we collect:
  • information about your hardware and software, such as the hardware model, operating system version, device memory, advertising identifiers, unique application identifiers, apps installed, unique device identifiers, device usage data, browser type, keyboards installed, language, battery level, and time zone;
  • information from device sensors, such as accelerometers, gyroscopes, compasses, microphones, and whether you have headphones connected; and
  • information about your wireless and mobile network connections, such as mobile phone number, service provider, IP address, and signal strength.
  • Device Phonebook. Because our services are all about communicating with friends, we may — with your permission — collect information from your device’s phonebook.
  • Camera, Photos, and Audio. Many of our services require us to collect images and other information from your device’s camera and photos. For example, you won’t be able to send Snaps or upload photos from your camera roll unless we can access your camera or photos.
  • Location Information. When you use our services we may collect information about your location. With your permission, we may also collect information about your precise location using methods that include GPS, wireless networks, cell towers, Wi-Fi access points, and other sensors, such as gyroscopes, accelerometers, and compasses.
  • Information Collected by Cookies and Other Technologies. Like most online services and mobile applications, we may use cookies and other technologies, such as web beacons, web storage, and unique advertising identifiers, to collect information about your activity, browser, and device. We may also use these technologies to collect information when you interact with services we offer through one of our partners, such as advertising and commerce features. For example, we may use information collected on other websites to show you more relevant ads. Most web browsers are set to accept cookies by default. If you prefer, you can usually remove or reject browser cookies through the settings on your browser or device. Keep in mind, though, that removing or rejecting cookies could affect the availability and functionality of our services. To learn more about how we and our partners use cookies on our services and your choices, please check out our Cookie Policy.
  • Log Information. We also collect log information when you use our website, such as:
  • details about how you’ve used our services;
  • device information, such as your web browser type and language;
  • access times;
  • pages viewed;
  • IP address;
  • identifiers associated with cookies or other technologies that may uniquely identify your device or browser; and
  • pages you visited before or after navigating to our website.
Information We Collect from Third Parties
We may collect information about you from other users, our affiliates, and third parties. Here are a few examples:
  • If you link your Snapchat account to another service (like Bitmoji or a third-party app), we may receive information from the other service, like how you use that service.
  • Advertisers, app developers, publishers, and other third parties may share information with us as well. We may use this information, among other ways, to help target or measure the performance of ads. You can learn more about our use of this kind of third-party data in our Support Center.
  • If another user uploads their contact list, we may combine information from that user’s contact list with other information we have collected about you.

How We Use Information

What do we do with the information we collect? For the detailed answer, go here. The short answer is: Provide you with an amazing set of products and services that we relentlessly improve. Here are the ways we do that:
  • develop, operate, improve, deliver, maintain, and protect our products and services.
  • send you communications, including by email or SMS where permitted. For example, we may use email or SMS to respond to support inquiries or to share information about our products, services, and promotional offers that we think may interest you.
  • monitor and analyze trends and usage.
  • personalize our services by, among other things, suggesting friends, profile information, or Bitmoji stickers, helping Snapchatters find each other in Snapchat, affiliate and third-party apps and services, or customizing the content we show you, including ads.
  • add context to your Snapchat experience, for example by tagging your Memories with searchable labels based on your location (of course, if you’ve given us permission to collect your location) and the content of your photo or video (e.g., if there’s a dog in your photo, it may be searchable in Memories by the term “dog”).
  • provide and improve our advertising services, ad targeting, and ad measurement, including through the use of your precise location information (again, if you’ve given us permission to collect that information), both on and off our services. We may also store information about your use of third-party apps and websites on your device to do this. Learn more. See the Control Over Your Information section below for more information about Snap Inc.’s advertising practices and your choices.
  • enhance the safety and security of our products and services.
  • verify your identity and prevent fraud or other unauthorized or illegal activity.
  • use information we’ve collected from cookies and other technology to enhance our services and your experience with them.
  • enforce, investigate, and report conduct violating our Terms of Service and other usage policies, respond to requests from law enforcement, and comply with legal requirements.
We may also use information from Apple’s TrueDepth camera to improve the quality of Lenses. Information from the TrueDepth camera is used in real time — we don’t store this information on our servers or share it with third parties.

How We Share Information

We may share information about you in the following ways:
  • With other Snapchatters. We may share the following information with other Snapchatters:
  • information about you, such as your username, name, and Bitmoji.
  • information about how you have interacted with our services, such as your Snapchat “score,” the names of Snapchatters you are friends with, how close you are with your friends on Snapchat, your recent location history (if you choose to share your location on Snap Map), and other information that will help Snapchatters understand your connections with others using our services. For example, because it may not be clear whether a new friend request comes from someone you actually know, we may share whether you and the requestor have Snapchat friends in common.
  • information about your device, such as the operating system and device type, to help you receive Chats, Snaps, and other content in the optimal viewing format.
  • any additional information you have directed us to share. For example, Snap will share your information when you connect your Snapchat account to a third-party app, and if you share information or content from Snapchat to the third-party app.
  • content you post or send. How widely your content is shared depends on your personal settings and the type of service you are using. For example, a Snap may be sent to just a single friend you select, but your My Story content may be seen by any Snapchatter whom you allow to see your My Story.
  • With all Snapchatters, our business partners, and the general public. We may share the following information with all Snapchatters as well as with our business partners and the general public:
  • public information like your name, username, profile pictures, Snapcode, and Public Profile.
  • Public Content like your Highlights, Custom Stickers, Lenses, Story submissions that are set to be viewable by Everyone, and any content that you submit to an inherently public service, like Spotlight, Snap Map, and other crowd-sourced services. This content may be viewed, used, and shared by the public at large both on and off our services, including through search results, on websites, in apps, and in online and offline broadcasts.
  • With third parties. We may share information with third parties in the following ways:
  • We may share information about you with service providers who perform services on our behalf, including to facilitate payments and measure and optimize the performance of ads and deliver more relevant ads, including on third-party websites and apps.
  • We may share information about you with business partners that provide services and functionality on our services. For more information about information collected by third parties on our services, visit our Support Site.
  • We may share information about you, such as device and usage information, to help us and others prevent fraud.
  • We may share information about you for legal, safety, and security reasons. We may share information about you if we reasonably believe that disclosing the information is needed to:
  • comply with any valid legal process, governmental request, or applicable law, rule, or regulation.
  • investigate, remedy, or enforce potential Terms of Service and Community Guidelines violations.
  • protect the rights, property, or safety of us, our users, or others.
  • detect and resolve any fraud or security concerns.
  • We may share information about you as part of a merger or acquisition. If Snap Inc. gets involved in a merger, asset sale, financing, liquidation or bankruptcy, or acquisition of all or some portion of our business to another company, we may share your information with that company before and after the transaction closes.
  • Non-personal information. We may also share with third parties that provide services to us or perform business purposes for us aggregated, non-personally identifiable, or de-identified information.

Third-Party Content and Integrations

Our services may contain third-party content and integrations. Examples include third-party integrations in the Camera, third-party games in Chat, and third-party Snap Kit integrations. Through these integrations, you may be providing information to the third party as well as to Snap. We are not responsible for how those third parties collect or use your information. As always, we encourage you to review the privacy policies of every third-party service that you visit or use, including those third parties you interact with through our services. You can learn more about third-party services in Snapchat here.

How Long We Keep Your Information

Snapchat lets you capture what it’s like to live in the moment. On our end, that means most messages — like Snaps and Chats — sent in Snapchat will be automatically deleted by default from our servers after we detect they’ve been opened by all recipients or have expired. Other content, like Story posts, is stored for longer. For detailed information about how long we store different types of content, check out our Support Site.
We store other information for longer periods of time. For example:
  • We store your basic account information — like your name, phone number, and email address — and list of friends until you ask us to delete them.
  • We store location information for different lengths of time based on how precise it is and which services you use. If location information is associated with a Snap — like those saved to Memories or posted to Snap Map or Spotlight — we’ll retain that location as long as we store the Snap. Pro tip: You can see the location data we retain about you by downloading your data.
If you ever decide to stop using Snapchat, you can just ask us to delete your account. We’ll also delete most of the information we’ve collected about you after you’ve been inactive for a while!
Keep in mind that, while our systems are designed to carry out our deletion practices automatically, we cannot promise that deletion will occur within a specific timeframe. There may be legal requirements to store your data and we may need to suspend those deletion practices if we receive valid legal process asking us to preserve content, if we receive reports of abuse or other Terms of Service violations, or if your account, content created by you, or content created with other users is flagged by others or our systems for abuse or other Terms of Service violations. Finally, we may also retain certain information in backup for a limited period of time or as required by law.

Control Over Your Information

We want you to be in control of your information, so we provide you with the following tools.
  • Access, Correction, and Portability. You can access and edit most of your basic account information right in our apps. You can also use Download My Data to obtain a copy of information that isn’t available in our apps in a portable format, so you can move it or store it wherever you want. Because your privacy is important to us, we will ask you to verify your identity or provide additional information before we let you access or update your personal information. We may also reject your request to access or update your personal information for a number of reasons, including, for example, if the request risks the privacy of other users or is unlawful.
  • Revoking permissions. In most cases, if you let us use your information, you can simply revoke your permission by changing the settings in the app or on your device if your device offers those options. Of course, if you do that, certain services may lose full functionality. For promotional emails and SMS, you may opt out by clicking the unsubscribe link or using a similar opt-out mechanism provided.
  • Deletion. While we hope you’ll remain a lifelong Snapchatter, if for some reason you ever want to delete your account, just go here to learn how. You can also delete some information in the app, like photos you’ve saved to Memories, Spotlight submissions, and search history.
  • Advertising Preferences. We try to show you ads that we think will be relevant to your interests. If you would like to modify the information we and our advertising partners use to select these ads, you can do so in the app and through your device preferences. Go here to learn more.
  • Tracking. If you opt out of tracking on devices running iOS 14.5 or more recent versions, we will not link identifiable information from third-party apps and websites with identifiable information from Snapchat for advertising purposes, except on your device. You can control use of this on-device data for advertising by opting out of Activity-Based Advertising in Snapchat Ad Preferences Settings. Go here to learn more.
  • Communicating with other Snapchatters. It’s important to us that you stay in control over whom you communicate with. That’s why we’ve built a number of tools in Settings that let you indicate, among other things, who you want to see your Stories, whether you’d like to receive Snaps from just your friends or all Snapchatters, and whether you’d like to block another Snapchatter from contacting you again. Go here to learn more.

International Data Transfers

We may collect your personal information from, transfer it to, and store and process it in the United States and other countries outside of where you live. Whenever we share information outside of where you live, when we are legally required to do so, we make sure an adequate transfer mechanism is in place. We also make sure any third parties we share information with have an adequate transfer mechanism in place, as well. You can find more information on the categories of third parties we share information with here.

State and Region Specific Information

You may have specific privacy rights in your state or region. For example, in the United States, residents of California and other states have specific privacy rights. Snapchatters in the European Economic Area (EEA), the UK, Brazil, the Republic of Korea, and other jurisdictions also have specific rights. We keep an up-to-date overview of state and region specific disclosures here.

Children

Our services are not intended for — and we don’t direct them to — anyone under 13. And that’s why we do not knowingly collect personal information from anyone under 13. In addition, we may limit how we collect, use, and store some of the information of EEA and UK users between 13 and 16. In some cases, this means we will be unable to provide certain functionality to these users. If we need to rely on consent as a legal basis for processing your information and your country requires consent from a parent, we may require your parent’s consent before we collect and use that information.

Revisions to the Privacy Policy

We may change this Privacy Policy from time to time. But when we do, we’ll let you know one way or another. Sometimes, we’ll let you know by revising the date at the top of the Privacy Policy that’s available on our website and mobile application. Other times, we may provide you with additional notice (such as adding a statement to our websites’ homepages or providing you with an in-app notification).
\ No newline at end of file diff --git a/examples/6-8.jpg b/examples/6-8.jpg new file mode 100644 index 0000000000000000000000000000000000000000..69820ef5b3cb8e3bd49a3703bf9237e009bf8b4d Binary files /dev/null and b/examples/6-8.jpg differ diff --git a/examples/6.html b/examples/6.html new file mode 100644 index 0000000000000000000000000000000000000000..7690c36608525ec2753a5c1402a2f28c3f7ce085 --- /dev/null +++ b/examples/6.html @@ -0,0 +1,28 @@ + + +Privacy Policy + + + + +
+ + + + + + + + + + + + + + \ No newline at end of file diff --git a/examples/6.txt b/examples/6.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4b41315b426526159cd1882c3cc76e65bd0806f --- /dev/null +++ b/examples/6.txt @@ -0,0 +1,29 @@ + + + +Privacy Policy + + + + +
+ + + + + + + + + + + + + + \ No newline at end of file diff --git a/main b/main new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/packages.txt b/packages.txt new file mode 100644 index 0000000000000000000000000000000000000000..607bdb03d2a25e12c4c5dded04057f7b29af0dc0 --- /dev/null +++ b/packages.txt @@ -0,0 +1,5 @@ +python3-opencv +libgl1 +cmake +libssl-dev +tesseract-ocr-all \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6ef8fa06372c64b6e59a08b8c82b6eb2cf64274 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +opencv-python==4.6.0.66 +torch==1.11.0 +torchvision==0.12.0 +beautifulsoup4==4.11.1 +bs4==0.0.1 +scikit-learn==1.1.2 +scipy==1.9.0 +nltk==3.7 +spacy +pytesseract==0.3.10 +https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl +transformers +openai==0.27.0 +openpyxl \ No newline at end of file diff --git a/results/result_1.png b/results/result_1.png new file mode 100644 index 0000000000000000000000000000000000000000..c13249d97366edefa4048e358c036db0e7b2af35 Binary files /dev/null and b/results/result_1.png differ diff --git a/results/result_1_C.xlsx b/results/result_1_C.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..269087e1ad5d605038e0591067659b8cddc09ca1 Binary files /dev/null and b/results/result_1_C.xlsx differ diff --git a/results/result_1_S.xlsx b/results/result_1_S.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..878a0c559a298e4191bd220dad8611433deaf334 Binary files /dev/null and b/results/result_1_S.xlsx differ diff --git a/results/result_2.png b/results/result_2.png new file mode 100644 index 0000000000000000000000000000000000000000..21dcdf3709c57619a4ff932fe2d2626fa2b79b28 Binary files /dev/null and b/results/result_2.png differ diff --git a/results/result_2_C.xlsx 
b/results/result_2_C.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..859e586717f70f6c151efa05344dd3fb1973c081 Binary files /dev/null and b/results/result_2_C.xlsx differ diff --git a/results/result_2_S.xlsx b/results/result_2_S.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..a975edd2b21bdfbc056091eb38944e9901cb5beb Binary files /dev/null and b/results/result_2_S.xlsx differ diff --git a/results/result_3.png b/results/result_3.png new file mode 100644 index 0000000000000000000000000000000000000000..811c07549c8789c2bbc093ad1268758357c764d6 Binary files /dev/null and b/results/result_3.png differ diff --git a/results/result_3_C.xlsx b/results/result_3_C.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..23c6c6443ac653ecc66d0d4f4c4cfb1709b15356 Binary files /dev/null and b/results/result_3_C.xlsx differ diff --git a/results/result_3_S.xlsx b/results/result_3_S.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..759707b96db59dc42b9a06cef5ac9f2e61ebae56 Binary files /dev/null and b/results/result_3_S.xlsx differ diff --git a/run_sem_test.py b/run_sem_test.py new file mode 100644 index 0000000000000000000000000000000000000000..bd8ce8dd4fba06cdc617615debcf695646aa9b30 --- /dev/null +++ b/run_sem_test.py @@ -0,0 +1,23 @@ +from SEM.run_single_sem import run_single_pp +from bs4 import BeautifulSoup +import shutil + +# file = open('examples/6.html', encoding='utf-8') + +# file_content = file.read().decode('utf-8') +# file_content = file + +# file_content = 'examples/6.html' +# pp_root = 'demo_pp.html' +# with open(pp_root, 'wb') as file: +# with open(file_content, 'rb') as html: +# shutil.copyfileobj(html, file) + +with open("examples/6.html", "r") as file: + example_file_content = file.read() + +run_single_pp(example_file_content) + +# soup = BeautifulSoup(file, features="html.parser") +# print("soup.contents: ", soup.contents) +