import json
import os
import urllib.request

import paperqa
import requests
from bs4 import BeautifulSoup
from paperqa import Docs


class PDBQuery:
    """Collect literature information for a single PDB entry.

    The class looks up the entry on the RCSB data API, extracts the PubMed
    IDs of its citations, builds one citation string per PubMed ID from the
    NCBI esummary endpoint, and can dump the related web pages to a text file.

    Attributes set by the methods:
        pdbid: the identifier passed to the constructor.
        pubids: PubMed IDs found by ``get_pubids``.
        refs: initialised to an empty list; not used by the methods here.
        citations: citation strings built by ``create_citation``.
        data: last JSON payload returned by ``get_pdb_descriptions``.
        result: entry JSON used by ``get_pubids`` / ``create_citation``.
    """

    def __init__(self, pdbid):
        """Store *pdbid* and start with empty result lists."""
        self.pdbid = pdbid
        self.pubids = []
        self.refs = []
        self.citations = []

    def get_pdb_descriptions(self, pdbid):
        """Fetch the RCSB core entry record for *pdbid*.

        Note that the *pdbid* argument is used for the request, not
        ``self.pdbid``. The decoded JSON is stored on ``self.data`` and
        returned.
        """
        pdbrest = f"https://data.rcsb.org/rest/v1/core/entry/{pdbid}"
        r = requests.get(pdbrest)
        self.data = r.json()
        return self.data

    def get_pubids(self):
        """Return the PubMed IDs listed in the entry's ``citation`` records.

        The entry record is fetched on demand when ``self.result`` has not
        been populated yet, so this method can be called before
        ``create_citation``. ``self.pubids`` is rebuilt on every call, so
        repeated calls do not accumulate duplicates. Citation records without
        a ``pdbx_database_id_pub_med`` value are skipped, and an entry with no
        ``citation`` key yields an empty list.
        """
        result = getattr(self, "result", None)
        if result is None:
            result = self.result = self.get_pdb_descriptions(self.pdbid)

        key = "pdbx_database_id_pub_med"
        self.pubids = [
            paper[key]
            for paper in result.get("citation", [])
            if paper.get(key) is not None
        ]
        return self.pubids

    def create_citation(self):
        """Build one citation string per PubMed ID of this entry.

        Refreshes ``self.result`` and ``self.pubids``, then queries the NCBI
        esummary endpoint once per PubMed ID. ``self.citations`` is rebuilt on
        every call and returned.

        Raises:
            KeyError: if an esummary record lacks one of the fields used to
                format the citation.
        """
        self.result = self.get_pdb_descriptions(self.pdbid)
        self.pubids = self.get_pubids()
        self.citations = []
        for pid in self.pubids:
            puburl = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pid}&retmode=json'
            # One request per ID; the record is keyed by the ID as a string.
            summary = requests.get(puburl).json()["result"][str(pid)]
            self.citations.append(self._format_citation(summary))
        return self.citations

    @staticmethod
    def _format_citation(summary):
        """Format one esummary record as a citation string.

        The article title is not part of the produced string.
        """
        # Join the raw names directly: going through str(list) and stripping
        # quotes/brackets corrupts names that contain those characters.
        names = ", ".join(author["name"] for author in summary["authors"])
        journal = summary["source"]
        pub_date = summary["pubdate"]
        volume = summary["volume"]
        issue = summary["issue"]
        pages = summary["pages"]
        doi = summary["elocationid"]
        # Only the first four characters of the date are used; the DOI goes
        # on its own line.
        return f"{names}.{journal} {pub_date[0:4]};{volume}({issue}):{pages}. \n{doi}"

    def write_webdata(self):
        """Download the PubMed pages and the entry record into ``web_data.txt``.

        Uses the IDs currently held in ``self.pubids`` (empty until
        ``get_pubids`` or ``create_citation`` has run) plus the RCSB REST
        entry URL for ``self.pdbid``. The file is overwritten on each call.
        """
        url_list = [f'https://pubmed.ncbi.nlm.nih.gov/{pid}' for pid in self.pubids]
        # Finally add the RCSB entry record for this PDB ID.
        url_list.append(f"https://data.rcsb.org/rest/v1/core/entry/{self.pdbid}")
        with open('web_data.txt', "w", encoding="utf-8") as out:
            for url in url_list:
                r = requests.get(url)
                bs = BeautifulSoup(r.text, 'html.parser')
                # NOTE(review): the original comment said this step removes
                # the HTML syntax, but prettify() only re-indents the markup
                # and keeps the tags; get_text() would strip them. Confirm the
                # intent before changing the output format.
                out.write(bs.prettify())