import re
import bs4
from SEM.paragraph_bayesian import clf,tf
from bs4 import BeautifulSoup

# Maps a classifier label to the file name (relative to the per-policy output
# directory) that collects the text assigned to that label.
mark_txt = {'0':"/data_types.txt", '1':"/data_types.txt", '2':"/personal_information_type.txt",
            '3':"/share_information.txt", '4':"/protect_information.txt", '5':"/advertising.txt",
            '6':"/user_right.txt", '7':"/children.txt", '8':"/region.txt", '9':"/update.txt",
            '10':"/way_to_collect.txt", '11':"/provider.txt", '12':"/data_retention.txt",
            '13':"/data_types.txt", '14':"/thrid_party.txt", '15':"/data_types.txt"}

# Tag names whose text is copied into the category files by write_text.
_CONTENT_TAGS = ('li', 'p', 'br')

# Ordered (tag name, attribute filter) pairs; every match is decomposed by
# removeUnneccessaryElements. The order is kept stable on purpose.
_DECOMPOSE_RULES = (
    ("div", {'class': 'footer'}),
    ("div", {'class': re.compile(r"sidebar")}),
    ("div", {'data-testid': re.compile(r"ax-navigation-menubar")}),
    ("div", {'class': re.compile(r"menu")}),
    ("li", {'class': re.compile(r"menu")}),
    ("p", {'class': re.compile(r"heading")}),
    ("p", {'class': re.compile(r"fw-bold")}),
    ("ul", {'class': re.compile(r"menu")}),
    ("div", {'class': re.compile(r"header")}),
    ("div", {'data-referrer': re.compile(r"page_footer")}),
    ("div", {'id': 'footer'}),
    ("div", {'id': re.compile(r"sidebar")}),
    ("div", {'id': re.compile(r"menu")}),
    ("li", {'id': re.compile(r"menu")}),
    ("ul", {'id': re.compile(r"menu")}),
    ("div", {'id': re.compile(r"header")}),
    ("div", {'id': re.compile(r"breadcrumbs")}),
    ("div", {'id': re.compile(r"instagram")}),
    ("div", {'role': re.compile(r"navigation")}),
    ("div", {'role': re.compile(r"banner")}),
    ("div", {'role': re.compile(r"button")}),
    ("ul", {'role': re.compile(r"navigation")}),
)


def _category_path(pathName, label):
    """Return the output file path for classifier *label* of policy *pathName*.

    The last five characters of *pathName* are dropped (presumably a
    ".html" suffix -- confirm with callers). Raises TypeError when *label*
    has no entry in ``mark_txt``, exactly as the inline concatenation did.
    """
    return './SEM/txt/' + pathName[:-5] + mark_txt.get(label)


def _squash(text):
    """Collapse every run of whitespace in *text* to one space and trim it."""
    return ' '.join(text.split())


def _end_with_period(text):
    """Return *text* with sentence-style terminal punctuation.

    A trailing letter or ")" gets a "." appended; a trailing ";", ":" or ","
    is replaced by ".". Anything else, including the empty string, is
    returned unchanged.
    """
    if not text:
        return text
    last = text[-1]
    if last.isalpha() or last == ")":
        return text + "."
    if last in (";", ":", ","):
        return text[:-1] + "."
    return text


def _element_sentence(element):
    """Return the normalised sentence for one parsed element, or "" to skip it.

    Only <li>, <p> and <br> tags contribute text. A <li> is prefixed with the
    text of the closest preceding <p>, when there is one.
    """
    if not isinstance(element, bs4.element.Tag) or element.name not in _CONTENT_TAGS:
        return ""
    text = _squash(element.get_text())
    if element.name == 'li':
        lead = element.find_previous('p')
        if lead is not None:
            text = f"{_squash(lead.text)} {text}"
    return _end_with_period(text)


def process_content_outside_heading(content, pathName):
    """Classify a chunk of HTML as a whole and append its text to a category file.

    Args:
        content: HTML markup to parse with ``html.parser``.
        pathName: policy file name used to build the output path.

    Nothing is written when the markup contains no text.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text_content = soup.get_text(separator=' ', strip=True)
    if not text_content:
        return
    # Uses the same vectoriser/classifier pair that write_text applies to headings.
    mark = clf.predict(tf.transform([text_content]))
    with open(_category_path(pathName, mark[0]), "a", encoding='utf-8') as f:
        f.write(text_content)
        f.write("\n")


def write_text(title_list, pathName, soup):
    """Split a policy page into category files, driven by its headings.

    Each heading is classified; the <li>/<p>/<br> elements that follow it, up
    to the next heading, are appended to the file of the heading's label.
    Afterwards the headings are detached from *soup*, boilerplate elements
    are removed, and every remaining coarse segment is classified on its own
    and appended to the file of its own label.

    Args:
        title_list: heading elements of the page, in document order.
        pathName: policy file name used to build the output paths.
        soup: parsed page; it is modified in place.

    Returns:
        An 8-tuple of 0/1 flags telling whether a heading was classified as
        label "1", "4", "6", "8", "7", "9", "12" and "13" respectively
        (type, security, right, specialArea, specialGroup, update, retention,
        useData).
    """
    seen = dict.fromkeys(("1", "4", "6", "8", "7", "9", "12", "13"), 0)

    # Headings that consist of a lone bullet character are not real headings.
    clean_title_list = [title for title in title_list if title.text != "•"]

    last_label = ""
    for position, title in enumerate(clean_title_list):
        title_str = re.sub(r'\s+', ' ', str(title))
        title_str = re.sub(r'<[^<]+?>', '', title_str).replace('\n', '').strip()
        try:
            mark = clf.predict(tf.transform([title_str]))
        except Exception:
            # Best effort: a heading that cannot be classified is skipped.
            continue
        label = mark[0]

        # Flags reflect the raw prediction, before the fallback below.
        if label in seen:
            seen[label] = 1

        # Label "0" inherits the label of the previous heading, if any.
        if label == "0" and last_label != "":
            label = last_label
        last_label = label

        # Taking the neighbour by position (not list.index) keeps repeated
        # headings apart. The last heading has no successor, so its section
        # runs to the end of the document; previously the IndexError raised
        # for it was swallowed and its whole section was skipped.
        if position + 1 < len(clean_title_list):
            next_title = clean_title_list[position + 1]
        else:
            next_title = None

        lines = []
        touched = False
        for element in title.next_elements:
            if next_title is not None and next_title == element:
                break
            if str(element) == '\n':
                continue
            if element == title.string:
                continue
            # The category file was opened (and so created) at this point for
            # every element, even when nothing ended up being written.
            touched = True
            sentence = _element_sentence(element)
            # NOTE(review): the source indentation was lost; empty strings are
            # assumed not to produce a blank line -- confirm.
            if sentence:
                lines.append(sentence + "\n")

        if touched:
            with open(_category_path(pathName, label), "a", encoding='utf-8') as out:
                out.writelines(lines)

    # Second pass over the page. Only the heading elements themselves are
    # detached here; the content below them stays in the soup.
    for title in clean_title_list:
        title.extract()

    removeUnneccessaryElements(soup)
    for seg in makeCoarseSegments(soup):
        seg_clean = _squash(seg)
        if not seg_clean:
            continue
        try:
            mark = clf.predict(tf.transform([seg_clean]))
            with open(_category_path(pathName, mark[0]), "a", encoding='utf-8') as f:
                f.write(seg_clean)
                f.write("\n")
        except Exception:
            # Best effort: skip a segment that cannot be classified or written.
            continue

    return (seen["1"], seen["4"], seen["6"], seen["8"],
            seen["7"], seen["9"], seen["12"], seen["13"])


def write_text_without_label(text, pathName):
    """Append *text* to the policy's data_types.txt without classifying it.

    A trailing letter or ")" gets a "." appended and a trailing ";" is
    replaced by ".". No newline is added. Empty text is written as-is instead
    of raising IndexError.

    NOTE(review): this writes below './txt/' while the other writers use
    './SEM/txt/' -- confirm that the difference is intended.
    """
    sentence = str(text)
    if sentence:
        last = sentence[-1]
        if last.isalpha() or last == ")":
            sentence = sentence + "."
        elif last == ";":
            # str is immutable, so build a new string rather than assigning
            # to sentence[-1] (which raises TypeError).
            sentence = sentence[:-1] + "."
    with open('./txt/' + pathName[:-5] + '/data_types.txt', "a", encoding='utf-8') as f:
        f.write(sentence)


def removeUnneccessaryElements(soup):
    """Strip scripts, navigation, headers, footers and menus from *soup* in place."""
    for element in soup(["script", "style", "nav", "footer", "header", "img",
                         "option", "select", "head", "button"]):
        element.extract()

    for tag_name, attrs in _DECOMPOSE_RULES:
        for element in soup.find_all(tag_name, attrs):
            element.decompose()


def makeCoarseSegments(soup):
    """Return the page text as a list of sentence-like segments.

    Every <p> contributes its text unless ``find_next()`` yields nothing or a
    <ul>. Every <li> below a <ul> contributes its text, prefixed with the
    text of the closest <p> preceding the <ul> when there is one. Segments
    get terminal punctuation as described in ``_end_with_period``.
    """
    segments = []

    for p in soup.find_all("p"):
        following = p.find_next()
        if following is None or following.name == "ul":
            continue
        text = _end_with_period(_squash(p.get_text()))
        # NOTE(review): the source indentation was lost; empty paragraphs are
        # assumed not to yield a segment -- confirm.
        if text:
            segments.append(text)

    for ul in soup.find_all("ul"):
        lead = ul.find_previous('p')
        prefix = _squash(lead.text) if lead is not None else None
        for item in ul.find_all('li'):
            text = _squash(item.get_text())
            if prefix is not None:
                # Never empty: the f-string always contains the separator.
                text = f"{prefix} {text}"
            text = _end_with_period(text)
            if text:
                segments.append(text)

    return segments