Spaces:
Sleeping
Sleeping
| import xml.etree.ElementTree as ET | |
| import re | |
class PubmedXmlParse:
    """Turn PubMed XML (``PubmedArticle`` records) into plain dictionaries.

    The class keeps no state; every method works only on its arguments.
    """

    def __init__(self):
        pass

    @staticmethod
    def _text(parent, path):
        """Return the ``.text`` of the first element matching *path*.

        Returns ``""`` when *parent* is ``None``, when nothing matches, or
        when the matched element has no text, so callers always get a str.
        """
        if parent is None:
            return ""
        found = parent.find(path)
        if found is None or found.text is None:
            return ""
        return found.text

    @staticmethod
    def _inner_text(elem):
        """Return all text inside *elem*, including text of nested tags.

        ``elem.text`` alone stops at the first child element, which would
        truncate content carrying inline markup such as ``<i>`` or ``<sup>``.
        """
        if elem is None:
            return ""
        return "".join(elem.itertext())

    def remove_xml_tags(self, text):
        """Remove XML tags from *text* and return the remaining plain text."""
        clean = re.compile('<.*?>')
        return re.sub(clean, '', text)

    def parse_pubmed_xml(self, xml_data):
        """Parse a PubMed XML document into a list of article dictionaries.

        Args:
            xml_data: XML document as ``str`` or ``bytes``.

        Returns:
            One dict per ``PubmedArticle`` element, in document order, with
            the keys ``pmid``, ``pmcid``, ``title``, ``abstract``,
            ``journal``, ``authors``, ``pub_date``, ``keywords``, ``doi``,
            ``mesh_terms`` and ``references``. Missing values are ``""``
            (or ``[]`` for list fields).

        Raises:
            xml.etree.ElementTree.ParseError: if *xml_data* is not
                well-formed XML.
        """
        # NOTE(review): xml.etree is not hardened against malicious XML;
        # confirm that xml_data only ever comes from a trusted source.
        root = ET.fromstring(xml_data)
        text = self._text
        inner_text = self._inner_text

        articles = []
        # iter() also yields the root itself, so a document whose root is a
        # single PubmedArticle is handled as well as a PubmedArticleSet.
        for article in root.iter("PubmedArticle"):
            medline = article.find("MedlineCitation")
            medline_article = (
                medline.find("Article") if medline is not None else None
            )

            # itertext() flattens inline markup and yields unescaped text,
            # so entities such as &amp; come out as "&".
            article_title = inner_text(article.find(".//ArticleTitle")).strip()

            # Use the article's own id list first: a bare ".//ArticleId"
            # search can also match ids of cited papers in the ReferenceList.
            pmid = text(
                article, "PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']"
            ) or text(article, "MedlineCitation/PMID")

            abstract_text = " ".join(
                inner_text(part) for part in article.findall(".//AbstractText")
            )

            # Author information
            authors = [
                {
                    "lastname": text(author, ".//LastName"),
                    "forename": text(author, ".//ForeName"),
                    "initials": text(author, ".//Initials"),
                    "affiliation": text(
                        author, ".//AffiliationInfo/Affiliation"
                    ),
                }
                for author in article.findall(".//Author")
            ]

            pub_year = text(article, ".//Journal/JournalIssue/PubDate/Year")
            journal = {
                "issn": text(article, ".//Journal/ISSN"),
                "title": text(article, ".//Journal/Title"),
                "abbreviation": text(article, ".//Journal/ISOAbbreviation"),
                "startPage": text(article, ".//Pagination/StartPage"),
                "endPage": text(article, ".//Pagination/EndPage"),
                "volume": text(article, ".//Journal/JournalIssue/Volume"),
                "issue": text(article, ".//Journal/JournalIssue/Issue"),
                "year": pub_year,
            }

            if medline is not None:
                keywords = [
                    inner_text(k)
                    for k in medline.findall(".//KeywordList/Keyword")
                ]
                mesh_terms = [
                    self.parse_mesh(m)
                    for m in medline.findall("MeshHeadingList/MeshHeading")
                ]
            else:
                # A record without MedlineCitation has neither list.
                keywords = []
                mesh_terms = []

            references = article.findall(
                ".//PubmedData/ReferenceList/Reference"
            )

            # Append this article's record to the result list
            articles.append(
                {
                    "pmid": pmid,
                    "pmcid": text(
                        article,
                        ".//PubmedData/ArticleIdList/ArticleId[@IdType='pmc']",
                    ),
                    "title": article_title,
                    "abstract": abstract_text,
                    "journal": journal,
                    "authors": authors,
                    "pub_date": {
                        "year": pub_year,
                        "month": text(
                            article, ".//Journal/JournalIssue/PubDate/Month"
                        ),
                        "day": text(
                            article, ".//Journal/JournalIssue/PubDate/Day"
                        ),
                    },
                    "keywords": keywords,
                    "doi": self.parse_doi(medline_article, article),
                    "mesh_terms": mesh_terms,
                    "references": [
                        self.parse_reference(r) for r in references
                    ],
                }
            )
        return articles

    def parse_doi(self, article, article_elem) -> str:
        """Return the article's DOI, or ``""`` when none is present.

        Args:
            article: the ``Article`` element (may be ``None``); searched for
                ``ELocationID[@EIdType='doi']``.
            article_elem: the enclosing ``PubmedArticle`` element (may be
                ``None``); its own ``ArticleIdList`` is the fallback.

        Returns:
            The first non-empty DOI found, otherwise ``""``.
        """
        candidates = []
        if article is not None:
            candidates.append(article.find(".//ELocationID[@EIdType='doi']"))
        if article_elem is not None:
            # Direct-child paths only: ".//ArticleIdList" would also match
            # the id lists nested inside <Reference> elements and return a
            # cited paper's DOI as the article's DOI.
            for path in (
                "PubmedData/ArticleIdList/ArticleId[@IdType='doi']",
                "ArticleIdList/ArticleId[@IdType='doi']",
            ):
                candidates.append(article_elem.find(path))
        for candidate in candidates:
            if candidate is not None and candidate.text:
                return candidate.text
        return ""

    def parse_mesh(self, mesh_elem):
        """Parse one ``MeshHeading`` element.

        Returns:
            ``{"descriptor": str, "qualifiers": list[str]}``; a missing or
            empty element contributes ``""``.
        """
        return {
            "descriptor": PubmedXmlParse._text(mesh_elem, ".//DescriptorName"),
            # Each match already is the QualifierName element, so read its
            # text directly instead of searching inside it again.
            "qualifiers": [
                q.text if q.text is not None else ""
                for q in mesh_elem.findall(".//QualifierName")
            ],
        }

    def parse_reference(self, reference_elem):
        """Parse one ``Reference`` element.

        Returns:
            A dict with the keys ``citation``, ``doi``, ``pmid`` and
            ``pmcid``; each value is ``""`` when absent.
        """
        text = PubmedXmlParse._text
        return {
            "citation": PubmedXmlParse._inner_text(
                reference_elem.find("Citation")
            ),
            "doi": text(reference_elem, ".//ArticleId[@IdType='doi']"),
            "pmid": text(reference_elem, ".//ArticleId[@IdType='pubmed']"),
            # The article-level lookup in this class uses IdType 'pmc';
            # accept that here too and keep 'pmcid' as a fallback.
            "pmcid": (
                text(reference_elem, ".//ArticleId[@IdType='pmc']")
                or text(reference_elem, ".//ArticleId[@IdType='pmcid']")
            ),
        }