from bs4 import BeautifulSoup
import requests
import os
from io import StringIO
from lxml.etree import ParserError, XMLSyntaxError
from core.DefaultPackages import openFile, saveFile
from core.NER import cleanText
import pandas as pd


class HTML():

    def __init__(self, htmlFile, htmlLink):
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile

    def openHTMLFile(self):
        not_need_domain = ['https://broadinstitute.github.io/picard/',
                           'https://software.broadinstitute.org/gatk/best-practices/',
                           'https://www.ncbi.nlm.nih.gov/genbank/',
                           'https://www.mitomap.org/']
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/114.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": self.htmlLink,
            "Connection": "keep-alive"
        }

        session = requests.Session()
        session.headers.update(headers)
        if self.htmlLink in not_need_domain:
            return BeautifulSoup("", 'html.parser')
        try:
            if self.htmlLink and self.htmlLink != "None":
                r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
                if r.status_code != 200 or not r.text.strip():
                    print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
                    return BeautifulSoup("", 'html.parser')
                soup = BeautifulSoup(r.content, 'html.parser')
            else:
                with open(self.htmlFile, encoding='utf-8') as fp:
                    soup = BeautifulSoup(fp, 'html.parser')
        except (ParserError, XMLSyntaxError, OSError) as e:
            print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
            return BeautifulSoup("", 'html.parser')
        except Exception as e:
            print(f"❌ General exception for {self.htmlLink}: {e}")
            return BeautifulSoup("", 'html.parser')

        return soup
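
    # Usage sketch (hedged; "https://example.com/article" is a placeholder,
    # not a project URL):
    #
    #     page = HTML(htmlFile="", htmlLink="https://example.com/article")
    #     soup = page.openHTMLFile()
    #     print(soup.title.text if soup.title else "no <title> found")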

    def getText(self):
        soup = self.openHTMLFile()
        s = soup.find_all("html")
        text = ""
        if s:
            for t in range(len(s)):
                # Accumulate text from every <html> node, not just the last.
                text += s[t].get_text()
        cl = cleanText.cleanGenText()
        text = cl.removeExtraSpaceBetweenWords(text)
        return text

    def getListSection(self, scienceDirect=None):
        try:
            sections = {}
            text = ""
            textJson, textHTML = "", ""
            if scienceDirect is None:
                soup = self.openHTMLFile()
                headings = soup.find_all('h2')
                for h2Pos in range(len(headings)):
                    title = headings[h2Pos].text
                    if title not in sections:
                        sections[title] = []
                    if h2Pos + 1 < len(headings):
                        content = headings[h2Pos].find_next("p")
                        nexth2Content = headings[h2Pos + 1].find_next("p")
                        # Collect <p> text up to the next section's first
                        # paragraph; find_next can return None, so guard it.
                        while (content is not None and nexth2Content is not None
                               and content.text != nexth2Content.text):
                            sections[title].append(content.text)
                            content = content.find_next("p")
                    else:
                        content = headings[h2Pos].find_all_next("p", string=True)
                        sections[title] = [i.text for i in content]

            '''sections = {'Abstract':[], 'Introduction':[], 'Methods':[],
            'Results':[], 'Discussion':[], 'References':[],
            'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
            'Additional information':[], 'Electronic supplementary material':[],
            'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
            if scienceDirect is not None or len(sections) == 0:
                # Fall back to the Elsevier full-text API; SCIENCE_DIRECT_API
                # must be set in the environment.
                api_key = os.environ["SCIENCE_DIRECT_API"]
                doi = self.htmlLink.split("https://doi.org/")[-1]
                base_url = "https://api.elsevier.com/content/article/doi/"
                headers = {
                    "Accept": "application/json",
                    "X-ELS-APIKey": api_key
                }
                response = requests.get(base_url + doi, headers=headers, timeout=15)
                if response.status_code == 200:
                    data = response.json()
                    supp_data = data["full-text-retrieval-response"]
                    if "originalText" in supp_data:
                        if isinstance(supp_data["originalText"], str):
                            sections["originalText"] = [supp_data["originalText"]]
                        if isinstance(supp_data["originalText"], dict):
                            sections["originalText"] = [supp_data["originalText"][key]
                                                        for key in supp_data["originalText"]]
                    else:
                        if isinstance(supp_data, dict):
                            for key in supp_data:
                                sections[key] = [supp_data[key]]

            textJson = self.mergeTextInJson(sections)
            textHTML = self.getText()
            # Keep whichever extraction path recovered more text.
            if len(textHTML) > len(textJson):
                text = textHTML
            else:
                text = textJson
            return text
        except Exception as e:
            print(f"failed all: {e}")
            return ""

    def getReference(self):
        # getListSection() returns merged text rather than a section dict,
        # so rebuild the "References" lookup from the parsed HTML here.
        ref = []
        soup = self.openHTMLFile()
        for h2 in soup.find_all('h2'):
            if "reference" not in h2.text.lower():
                continue
            for p in h2.find_all_next("p"):
                ct = cleanText.cleanGenText(p.text)
                # Bind to a new name: reusing "cleanText" would shadow the
                # imported module on the next iteration.
                cleanedRef, filteredWord = ct.cleanText()
                if cleanedRef not in ref:
                    ref.append(cleanedRef)
            break
        return ref

    def getSupMaterial(self):
        sections = {}
        soup = self.openHTMLFile()
        headings = soup.find_all('h2')
        keywords = ("supplementary", "material", "additional", "support")
        for h2Pos in range(len(headings)):
            title = headings[h2Pos].text
            if any(k in title.lower() for k in keywords):
                link, output = [], []
                if title not in sections:
                    sections[title] = []
                for l in headings[h2Pos].find_all_next("a", href=True):
                    link.append(l["href"])
                if h2Pos + 1 < len(headings):
                    # Trim links that belong to the next section; find_next
                    # may return None, so guard before indexing into it.
                    nexth2 = headings[h2Pos + 1].find_next("a", href=True)
                    if nexth2 is not None and nexth2["href"] in link:
                        link = link[:link.index(nexth2["href"])]
                for i in link:
                    if "https" in i:
                        output.append(i)
                sections[title].extend(output)
        return sections
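
    # Usage sketch (hedged): the result maps each matching <h2> title to its
    # "https" links, e.g. {"Supplementary information": ["https://...", ...]}.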

    def extractTable(self):
        soup = self.openHTMLFile()
        df = []
        if len(soup) > 0:
            try:
                # StringIO avoids the pandas deprecation of passing a literal
                # HTML string straight to read_html.
                df = pd.read_html(StringIO(str(soup)))
            except ValueError:
                df = []
                print("No tables found in HTML file")
        return df
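
    # Usage sketch (hedged; placeholder URL): each list element is a pandas
    # DataFrame, one per <table> in the page.
    #
    #     for i, table in enumerate(HTML("", "https://example.com").extractTable()):
    #         print(i, table.shape)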

    def mergeTextInJson(self, jsonHTML):
        cl = cleanText.cleanGenText()
        htmlText = ""
        for sec in jsonHTML:
            if len(jsonHTML[sec]) > 0:
                for i in range(len(jsonHTML[sec])):
                    text = jsonHTML[sec][i]
                    if len(text) > 0:
                        text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
                        jsonHTML[sec][i] = text
                    if i - 1 >= 0:
                        if len(jsonHTML[sec][i - 1]) > 0:
                            if jsonHTML[sec][i - 1][-1] != ".":
                                htmlText += ". "
                    htmlText += jsonHTML[sec][i]
                    if len(jsonHTML[sec][i]) > 0:
                        if jsonHTML[sec][i][-1] != ".":
                            htmlText += "."
                htmlText += "\n\n"
        return htmlText
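
    # Behavior sketch (hedged): section entries are concatenated in order; a
    # ". " separator follows any entry lacking a trailing period, a final "."
    # is appended when missing, and each section ends with "\n\n".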

    def removeHeaders(self):
        # Placeholder: not implemented yet.
        pass

    def removeFooters(self):
        # Placeholder: not implemented yet.
        pass

    def removeReferences(self):
        # Placeholder: not implemented yet.
        pass
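

# Minimal smoke-test sketch (hedged): the URL is a placeholder; running this
# needs network access, and getListSection may also need SCIENCE_DIRECT_API
# set if no <h2> sections are found.
if __name__ == "__main__":
    page = HTML(htmlFile="", htmlLink="https://example.com/article")
    text = page.getListSection()
    print(f"extracted {len(text)} characters of section text")
    for title, links in page.getSupMaterial().items():
        print(title, links)
    print(f"found {len(page.extractTable())} table(s)")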