Spaces:
Sleeping
Sleeping
| import re | |
| import bs4 | |
| from SEM.paragraph_bayesian import clf,tf | |
| from bs4 import BeautifulSoup | |
# Maps a classifier label (a string produced by clf.predict) to the output
# .txt file that text with that label is appended to.  Several labels fall
# back to the same "/data_types.txt" bucket.
mark_txt = {
    '0': "/data_types.txt",
    '1': "/data_types.txt",
    '2': "/personal_information_type.txt",
    '3': "/share_information.txt",
    '4': "/protect_information.txt",
    '5': "/advertising.txt",
    '6': "/user_right.txt",
    '7': "/children.txt",
    '8': "/region.txt",
    '9': "/update.txt",
    '10': "/way_to_collect.txt",
    '11': "/provider.txt",
    '12': "/data_retention.txt",
    '13': "/data_types.txt",
    # NOTE(review): "thrid" typo is load-bearing — existing files on disk use
    # this spelling, so it must not be "fixed" without a migration.
    '14': "/thrid_party.txt",
    '15': "/data_types.txt",
}
def process_content_outside_heading(content, pathName):
    """Classify an HTML fragment that sits outside any heading and append
    its plain text to the label's output file.

    ``content`` is raw markup; ``pathName`` is the source file name whose
    last five characters (the extension) are stripped to build the output
    directory name.
    """
    # Flatten the markup to whitespace-separated plain text.
    plain = BeautifulSoup(content, 'html.parser').get_text(separator=' ', strip=True)
    if not plain:
        return
    # Reuse the same classifier that labels the headings.
    label = clf.predict(tf.transform([plain]))
    target = './SEM/txt/' + pathName[:-5] + mark_txt.get(label[0])
    with open(target, "a", encoding='utf-8') as out:
        out.write(plain)
        out.write("\n")
def _terminate_segment(text):
    # Normalize a segment's tail so every stored line reads like a sentence:
    # alphabetic or ')' endings gain a '.'; a trailing ';' ':' or ',' becomes '.'.
    if text:
        if text[-1].isalpha() or text[-1] == ")":
            text += "."
        elif text[-1] in (";", ":", ","):
            text = text[:-1] + "."
    return text


def _sibling_segment(sibling):
    # Whitespace-normalized text of one content element.  <li> items are
    # prefixed with the closest preceding <p>'s text so the list item keeps
    # its introductory sentence; everything else uses its own text only.
    if sibling.name == 'li':
        intro = sibling.find_previous('p')
        if intro:
            return ' '.join(intro.text.split()) + ' ' + ' '.join(sibling.get_text().split())
    return ' '.join(sibling.get_text().split())


def write_text(title_list, pathName, soup):
    """Split a policy page into per-label text files using the heading classifier.

    Each heading in ``title_list`` is classified with ``clf``/``tf``; the
    content elements between it and the next heading are appended to the file
    that ``mark_txt`` maps its label to (under ``./SEM/txt/<pathName stem>``).
    Content that remains once every heading has been extracted is cleaned,
    segmented coarsely and classified segment by segment.

    Returns 0/1 presence flags in the order:
    (type, security, right, specialArea, specialGroup, update, retention, useData).

    Fixes vs. the previous revision:
    * content after the LAST heading was silently dropped — the look-ahead
      ``clean_title_list[index + 1]`` raised IndexError on every sibling and
      the ``except: continue`` skipped the sibling instead of processing it,
      which also made the dedicated "last heading" branch dead code;
    * the two byte-identical writing branches were collapsed into one;
    * redundant ``close()`` calls inside ``with`` blocks removed.
    """
    type_flag = 0
    security = 0
    right = 0
    specialGroup = 0
    specialArea = 0
    update = 0
    retention = 0
    useData = 0
    # Drop bullet-only pseudo headings.
    clean_title_list = [t for t in title_list if t.text != "•"]
    lastMark = ""
    for idx, title in enumerate(clean_title_list):
        if title is None:
            continue
        # Strip tags and collapse whitespace before classifying the heading text.
        title_Str = re.sub(r'\s+', ' ', str(title))
        title_Str = re.sub(r'<[^<]+?>', '', title_Str).replace('\n', '').strip()
        try:
            mark = clf.predict(tf.transform([title_Str]))
        except Exception:
            continue
        # NOTE(review): clf.predict presumably returns a one-element array of
        # label strings; these comparisons rely on single-element truthiness —
        # confirm against paragraph_bayesian.
        if mark == "1":
            type_flag = 1
        if mark == "4":
            security = 1
        if mark == "6":
            right = 1
        if mark == "13":
            useData = 1
        if mark == "8":
            specialArea = 1
        if mark == "9":
            update = 1
        if mark == "12":
            retention = 1
        if mark == "7":
            specialGroup = 1
        # Label "0" means "unrecognized": inherit the previous heading's label.
        if mark == "0" and lastMark != "":
            mark = lastMark
        lastMark = mark
        # The last heading has no successor: process its siblings to the end.
        next_title = clean_title_list[idx + 1] if idx + 1 < len(clean_title_list) else None
        out_path = './SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0])
        for sibling in title.next_elements:
            if next_title is not None and sibling == next_title:
                break  # reached the next heading; its own iteration handles the rest
            if str(sibling) == '\n':
                continue
            if sibling == title.string:
                continue  # the heading's own text node
            # Append mode creates the label file even when the element is
            # skipped below — preserves the original side effect.
            with open(out_path, "a", encoding='utf-8') as f:
                # Keep only text-bearing elements: <li>, <p> and <br> tags.
                if sibling.name is None or (sibling.name not in ('li', 'p', 'br') and isinstance(sibling, bs4.element.Tag)):
                    continue
                segment = _terminate_segment(_sibling_segment(sibling))
                if segment:
                    f.write(segment)
                    f.write("\n")
    # Whatever survives once the headings are extracted was never covered by a
    # heading: strip boilerplate, segment it, and classify each segment alone.
    remaining_soup = soup
    for title in clean_title_list:
        title.extract()
    removeUnneccessaryElements(remaining_soup)
    for seg in makeCoarseSegments(remaining_soup):
        seg_clean = ' '.join(seg.split())
        if not seg_clean:
            continue
        try:
            mark = clf.predict(tf.transform([seg_clean]))
            with open('./SEM/txt/' + pathName[:-5] + mark_txt.get(mark[0]), "a", encoding='utf-8') as f:
                f.write(seg_clean)
                f.write("\n")
        except Exception:
            continue
    return type_flag, security, right, specialArea, specialGroup, update, retention, useData
def write_text_without_label(text, pathName):
    """Append *text* to the default (data_types) output file for this policy.

    The trailing character is normalized so stored segments end like
    sentences: an alphabetic or ')' ending gains a '.', a trailing ';'
    becomes '.'.  Empty text is written through unchanged.

    Bug fix: the original ``currentSibing[-1] = "."`` assigned into a str
    (immutable) and raised TypeError whenever the text ended in ';'.

    NOTE(review): this writes under './txt/' while the labelled writers use
    './SEM/txt/' — confirm which root is intended.
    """
    segment = str(text)
    if segment:
        if segment[-1].isalpha() or segment[-1] == ")":
            segment += "."
        elif segment[-1] == ";":
            # Replace the trailing ';' with a period.
            segment = segment[:-1] + "."
    with open('./txt/' + pathName[:-5] + '/data_types.txt', "a", encoding='utf-8') as f:
        f.write(segment)
def removeUnneccessaryElements(soup):
    """Strip boilerplate (scripts, navigation, headers, footers, menus,
    social widgets) from *soup* in place so only policy text remains."""
    # Whole-tag categories that never carry policy text.
    for junk in soup(["script", "style", "nav", "footer", "header", "img",
                      "option", "select", "head", "button"]):
        junk.extract()  # rip it out
    # (tag name, attribute, value-or-pattern) rules, removed via decompose()
    # in the same order as before.  A plain string matches exactly; a compiled
    # pattern matches as a regex, exactly as bs4's find_all treats them.
    rules = (
        ("div", "class", "footer"),
        ("div", "class", re.compile(r"sidebar")),
        ("div", "data-testid", re.compile(r"ax-navigation-menubar")),
        ("div", "class", re.compile(r"menu")),
        ("li", "class", re.compile(r"menu")),
        ("p", "class", re.compile(r"heading")),
        ("p", "class", re.compile(r"fw-bold")),
        ("ul", "class", re.compile(r"menu")),
        ("div", "class", re.compile(r"header")),
        ("div", "data-referrer", re.compile(r"page_footer")),
        ("div", "id", "footer"),
        ("div", "id", re.compile(r"sidebar")),
        ("div", "id", re.compile(r"menu")),
        ("li", "id", re.compile(r"menu")),
        ("ul", "id", re.compile(r"menu")),
        ("div", "id", re.compile(r"header")),
        ("div", "id", re.compile(r"breadcrumbs")),
        ("div", "id", re.compile(r"instagram")),
        ("div", "role", re.compile(r"navigation")),
        ("div", "role", re.compile(r"banner")),
        ("div", "role", re.compile(r"button")),
        ("ul", "role", re.compile(r"navigation")),
    )
    for tag_name, attr, value in rules:
        for element in soup.find_all(tag_name, {attr: value}):
            element.decompose()
def _normalize_segment_end(text):
    # Ensure a non-empty segment ends like a sentence: alphabetic or ')'
    # endings gain a '.'; a trailing ';' ':' or ',' is replaced by '.'.
    if text:
        if text[-1].isalpha() or text[-1] == ")":
            text += "."
        elif text[-1] in (";", ":", ","):
            text = text[:-1] + "."
    return text


def makeCoarseSegments(soup):
    """Break remaining (non-heading) markup into coarse text segments.

    Every <p> whose immediately following element is not a <ul> becomes one
    segment.  Every <li> becomes a segment; when a <p> precedes its list, the
    <p>'s text is prefixed so the item keeps its introductory sentence.

    Returns a list of whitespace-normalized, period-terminated strings.

    Changes vs. the previous revision: the triplicated end-punctuation logic
    is factored into ``_normalize_segment_end`` and the unused
    ``listSplitter`` local was removed — output is unchanged.
    """
    segments = []
    for p in soup.find_all("p"):
        # find_next() is the next parsed element, not the next sibling — a <p>
        # directly introducing a <ul> is skipped here and handled below.
        following = p.find_next()
        if following is not None and following.name != "ul":
            text = ' '.join(p.get_text().split())
            if text:
                segments.append(_normalize_segment_end(text))
    for ul in soup.find_all("ul"):
        intro = ul.find_previous('p')
        if intro is not None:
            prefix = ' '.join(intro.text.split())
            for item in ul.findChildren('li'):
                item_text = ' '.join(item.get_text().split())
                # The f-string join always yields a non-empty string, so the
                # segment is kept even when the <li> itself is empty
                # (preserves the original behavior).
                segments.append(_normalize_segment_end(f"{prefix} {item_text}"))
        else:
            for item in ul.findChildren('li'):
                item_text = ' '.join(item.get_text().split())
                if item_text:
                    segments.append(_normalize_segment_end(item_text))
    return segments