import json
import re

from bs4 import BeautifulSoup
from bs4.element import PageElement
def preclean(content: str):
    """Collapse whitespace, drop the "阅读更多" (read more) marker, and keep only
    deduplicated fragments of at least 3 characters that contain Chinese text."""
    res = re.sub(r'\s+', ' ', content)
    res = res.replace("阅读更多", "")
    # Split on single spaces so each whitespace-delimited fragment is filtered separately
    res = res.replace(" ", "\n").split('\n')
    res = [re.sub(r'\s+', ' ', x) for x in res if len(re.sub(r'\s+', ' ', x)) >= 3]
    res = [item for item in res if re.search('[\u4e00-\u9fa5]', item)]
    new_list = []
    for item in res:
        # if ('(' in item and ')' not in item) or (')' in item and '(' not in item):
        #     continue
        if item not in new_list:
            new_list.append(item)
    clean_res = ' '.join(new_list)
    return clean_res
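# Rough illustration of preclean (the input string below is made up, not taken
# from a real MSD page): whitespace is collapsed, "阅读更多" is removed, short or
# non-Chinese fragments are dropped, and duplicates are discarded before joining:
# preclean("慢性肝炎概述 阅读更多 ok 病因和流行病学 慢性肝炎概述")
#   -> "慢性肝炎概述 病因和流行病学"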
def text2subsec(content_elem: PageElement):
    """Return the h3 subsection headers inside a topic content block, dropping a
    trailing "参考文献" (references) header if present."""
    head = content_elem.find_all('h3', attrs='topic__header--subsection')
    # Alternatives that were tried:
    # head = content_elem.find_next_siblings('h3', attrs='topic__header--subsection')
    # head = content_elem.find_all_next('h3', attrs='topic__header--subsection')
    # head = content_elem.find_all_next('section', class_='topic__section GHead')
    if len(head) == 0:
        return head
    if '参考文献' in head[-1].text:
        head = head[:-1]
    return head
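# Sketch of the expected shape (the header text shown here is hypothetical, not
# read from a real page): for a content block containing headers such as
# <h3 class="topic__header--subsection">病因</h3>, text2subsec returns those h3
# tags in document order, minus any trailing "参考文献" header.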
def subsec2dict(content_elem: PageElement, sec: list):
    """Map each subsection header to its cleaned text; the text that appears
    before the first subsection is stored under the key "开头" (opening)."""
    info = {}
    intro_list = []
    # If there are no subsections, return the whole block as the opening text
    if len(sec) == 0:
        # whether to filter out linked (href) content first:
        # info["开头"] = preclean(rm_link(content_elem))
        info["开头"] = preclean(content_elem.get_text())
        return info
    subsection_tag = content_elem.find('section', class_='topic__section GHead')
    if subsection_tag is not None:
        for sibling in subsection_tag.previous_siblings:
            if sibling.name is not None:
                # Sibling is a tag: collect its text for the opening paragraph
                # intro_list.append(rm_link(sibling).strip())
                intro_list.append(sibling.get_text().strip())
            else:
                # Sibling is a plain string: ignored here
                # intro_list.append(sibling.get_text().strip())
                # intro_list.append(rm_link(sibling).strip())
                pass
        # previous_siblings yields nodes in reverse document order, so flip back
        intro = ""
        for elem in reversed(intro_list):
            intro += elem
        info["开头"] = preclean(intro)
    for s in sec:
        # key = rm_link(s).strip()
        key = s.get_text().strip()
        # value = rm_link(s.find_next_sibling('div', class_='topic__content')).strip()
        value = s.find_next_sibling('div', class_='topic__content').get_text().strip()
        value = preclean(value)
        info[key] = value
    return info
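# Sketch of the returned mapping (keys other than "开头" come from the page's own
# subsection headers, so the ones shown here are only an assumption):
# {
#     "开头": "本节开头的概述文字 ...",
#     "病因": "清洗后的病因段落 ...",
#     "治疗": "清洗后的治疗段落 ..."
# }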
def parser(soup, feature: str):
    # Find the tag whose data-originaltitle matches the requested section, e.g. "治疗"
    treatment_tag = soup.find_all(attrs={"data-originaltitle": feature})
    # Check whether a matching tag was found
    if treatment_tag:
        # Grab the div that immediately follows the target tag
        content_div = treatment_tag[0].find_next_sibling('div', class_='topic__content')
        if content_div is None:
            return None
        # Extract the content inside the div
        heads = text2subsec(content_div)
        content = subsec2dict(content_div, heads)
        return content
    else:
        return None
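# Minimal usage sketch (assumes `soup` was built from one of the downloaded MSD
# topic pages referenced in __main__):
# treatment = parser(soup, "治疗")   # dict of subsections, or None if absent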
def file_to_4_attr(filename: str):
    # Extract 症状和体征 (symptoms and signs), 诊断 (diagnosis), 预后 (prognosis), 治疗 (treatment)
    info = []
    # features = ["症状和体征", "治疗"]
    features = ["症状和体征", "诊断", "预后", "治疗"]
    with open(filename, encoding='utf-8') as fh:
        soup = BeautifulSoup(fh.read(), 'html.parser')
    for f in features:
        p_content = parser(soup, f)
        # clean_p = preclean(p_content)
        info.append(p_content)
    return info
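# The result is a list of four entries aligned with `features`; an entry is None
# when the page has no matching section. A hedged usage sketch, assuming the HTML
# file sits next to this script:
# attrs = file_to_4_attr('{B8BFF836-D19C-4415-9A48-249ACF9C7A96}.html')
# symptoms, diagnosis, prognosis, treatment = attrs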
if __name__ == "__main__":
    # 高血压 (hypertension): {6DE49BCA-DE4B-4666-B0A2-49FC9C8F357A}.html
    # 新生儿高钙血病 (neonatal hypercalcemia): {0CE84DB1-A652-4C31-80E6-D987B01F8BBB}.html
    # Returns 症状和体征 / 诊断 / 预后 / 治疗 together with their subsections
    ret = file_to_4_attr(filename='{B8BFF836-D19C-4415-9A48-249ACF9C7A96}.html')
    with open("慢性肝炎概述.json", "w", encoding='utf-8') as f:
        json.dump(ret, f, ensure_ascii=False, indent=4)
    # json.dump(ret, "{6DE49BCA-DE4B-4666-B0A2-49FC9C8F357A}.json", ensure_ascii=False, indent=4)
    print("hold")