VyLala committed on
Commit 67cedb6 · verified · 1 Parent(s): cde399d

Update NER/html/extractHTML.py

Files changed (1)
  1. NER/html/extractHTML.py +363 -248
NER/html/extractHTML.py CHANGED
@@ -1,249 +1,364 @@
- # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
- from bs4 import BeautifulSoup
- import requests
- from DefaultPackages import openFile, saveFile
- from NER import cleanText
- import pandas as pd
- class HTML():
-     def __init__(self, htmlFile, htmlLink):
-         self.htmlLink = htmlLink
-         self.htmlFile = htmlFile
-     # def openHTMLFile(self):
-     #     headers = {
-     #         "User-Agent": (
-     #             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-     #             "AppleWebKit/537.36 (KHTML, like Gecko) "
-     #             "Chrome/114.0.0.0 Safari/537.36"
-     #         ),
-     #         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-     #         "Referer": self.htmlLink,
-     #         "Connection": "keep-alive"
-     #     }
-
-     #     session = requests.Session()
-     #     session.headers.update(headers)
-
-     #     if self.htmlLink != "None":
-     #         try:
-     #             r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
-     #             if r.status_code != 200:
-     #                 print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
-     #                 return BeautifulSoup("", 'html.parser')
-     #             soup = BeautifulSoup(r.content, 'html.parser')
-     #         except Exception as e:
-     #             print(f"❌ Exception fetching HTML: {e}")
-     #             return BeautifulSoup("", 'html.parser')
-     #     else:
-     #         with open(self.htmlFile) as fp:
-     #             soup = BeautifulSoup(fp, 'html.parser')
-     #     return soup
-     from lxml.etree import ParserError, XMLSyntaxError
-
-     def openHTMLFile(self):
-         not_need_domain = ['https://broadinstitute.github.io/picard/',
-                            'https://software.broadinstitute.org/gatk/best-practices/',
-                            'https://www.ncbi.nlm.nih.gov/genbank/',
-                            'https://www.mitomap.org/']
-         headers = {
-             "User-Agent": (
-                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                 "AppleWebKit/537.36 (KHTML, like Gecko) "
-                 "Chrome/114.0.0.0 Safari/537.36"
-             ),
-             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-             "Referer": self.htmlLink,
-             "Connection": "keep-alive"
-         }
-
-         session = requests.Session()
-         session.headers.update(headers)
-         if self.htmlLink in not_need_domain:
-             return BeautifulSoup("", 'html.parser')
-         try:
-             if self.htmlLink and self.htmlLink != "None":
-                 r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
-                 if r.status_code != 200 or not r.text.strip():
-                     print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
-                     return BeautifulSoup("", 'html.parser')
-                 soup = BeautifulSoup(r.content, 'html.parser')
-             else:
-                 with open(self.htmlFile, encoding='utf-8') as fp:
-                     soup = BeautifulSoup(fp, 'html.parser')
-         except (ParserError, XMLSyntaxError, OSError) as e:
-             print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
-             return BeautifulSoup("", 'html.parser')
-         except Exception as e:
-             print(f"❌ General exception for {self.htmlLink}: {e}")
-             return BeautifulSoup("", 'html.parser')
-
-         return soup
-
-     def getText(self):
-         try:
-             soup = self.openHTMLFile()
-             s = soup.find_all("html")
-             text = ""
-             if s:
-                 for t in range(len(s)):
-                     text = s[t].get_text()
-             cl = cleanText.cleanGenText()
-             text = cl.removeExtraSpaceBetweenWords(text)
-             return text
-         except:
-             print("failed get text from html")
-             return ""
-     def getListSection(self, scienceDirect=None):
-         try:
-             json = {}
-             text = ""
-             textJson, textHTML = "",""
-             if scienceDirect == None:
-                 # soup = self.openHTMLFile()
-                 # # get list of section
-                 # json = {}
-                 # for h2Pos in range(len(soup.find_all('h2'))):
-                 #     if soup.find_all('h2')[h2Pos].text not in json:
-                 #         json[soup.find_all('h2')[h2Pos].text] = []
-                 #     if h2Pos + 1 < len(soup.find_all('h2')):
-                 #         content = soup.find_all('h2')[h2Pos].find_next("p")
-                 #         nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
-                 #         while content.text != nexth2Content.text:
-                 #             json[soup.find_all('h2')[h2Pos].text].append(content.text)
-                 #             content = content.find_next("p")
-                 #     else:
-                 #         content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
-                 #         json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
-
-                 soup = self.openHTMLFile()
-                 h2_tags = soup.find_all('h2')
-                 json = {}
-
-                 for idx, h2 in enumerate(h2_tags):
-                     section_title = h2.get_text(strip=True)
-                     json.setdefault(section_title, [])
-
-                     # Get paragraphs until next H2
-                     next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
-                     for p in h2.find_all_next("p"):
-                         if next_h2 and p == next_h2:
-                             break
-                         json[section_title].append(p.get_text(strip=True))
-             # format
-             '''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
-             'Results':[], 'Discussion':[], 'References':[],
-             'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
-             'Additional information':[], 'Electronic supplementary material':[],
-             'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
-             if scienceDirect!= None or len(json)==0:
-                 # Replace with your actual Elsevier API key
-                 api_key = os.environ["SCIENCE_DIRECT_API"]
-                 # ScienceDirect article DOI or PI (Example DOI)
-                 doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
-                 # Base URL for the Elsevier API
-                 base_url = "https://api.elsevier.com/content/article/doi/"
-                 # Set headers with API key
-                 headers = {
-                     "Accept": "application/json",
-                     "X-ELS-APIKey": api_key
-                 }
-                 # Make the API request
-                 response = requests.get(base_url + doi, headers=headers)
-                 # Check if the request was successful
-                 if response.status_code == 200:
-                     data = response.json()
-                     supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
-                     # if "originalText" in list(supp_data.keys()):
-                     #     if type(supp_data["originalText"])==str:
-                     #         json["originalText"] = [supp_data["originalText"]]
-                     #     if type(supp_data["originalText"])==dict:
-                     #         json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
-                     # else:
-                     #     if type(supp_data)==dict:
-                     #         for key in supp_data:
-                     #             json[key] = [supp_data[key]]
-                     if type(data)==dict:
-                         json["fullText"] = data
-             textJson = self.mergeTextInJson(json)
-             textHTML = self.getText()
-             if len(textHTML) > len(textJson):
-                 text = textHTML
-             else: text = textJson
-             return text #json
-         except:
-             print("failed all")
-             return ""
-     def getReference(self):
-         # get reference to collect more next data
-         ref = []
-         json = self.getListSection()
-         for key in json["References"]:
-             ct = cleanText.cleanGenText(key)
-             cleanText, filteredWord = ct.cleanText()
-             if cleanText not in ref:
-                 ref.append(cleanText)
-         return ref
-     def getSupMaterial(self):
-         # check if there is material or not
-         json = {}
-         soup = self.openHTMLFile()
-         for h2Pos in range(len(soup.find_all('h2'))):
-             if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
-                 #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
-                 link, output = [],[]
-                 if soup.find_all('h2')[h2Pos].text not in json:
-                     json[soup.find_all('h2')[h2Pos].text] = []
-                 for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
-                     link.append(l["href"])
-                 if h2Pos + 1 < len(soup.find_all('h2')):
-                     nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
-                     if nexth2Link in link:
-                         link = link[:link.index(nexth2Link)]
-                 # only take links having "https" in that
-                 for i in link:
-                     if "https" in i: output.append(i)
-                 json[soup.find_all('h2')[h2Pos].text].extend(output)
-         return json
-     def extractTable(self):
-         soup = self.openHTMLFile()
-         df = []
-         if len(soup)>0:
-             try:
-                 df = pd.read_html(str(soup))
-             except ValueError:
-                 df = []
-                 print("No tables found in HTML file")
-         return df
-     def mergeTextInJson(self,jsonHTML):
-         try:
-             #cl = cleanText.cleanGenText()
-             htmlText = ""
-             if jsonHTML:
-                 # try:
-                 #     for sec, entries in jsonHTML.items():
-                 #         for i, entry in enumerate(entries):
-                 #             # Only process if it's actually text
-                 #             if isinstance(entry, str):
-                 #                 if entry.strip():
-                 #                     entry, filteredWord = cl.textPreprocessing(entry, keepPeriod=True)
-                 #             else:
-                 #                 # Skip or convert dicts/lists to string if needed
-                 #                 entry = str(entry)
-
-                 #             jsonHTML[sec][i] = entry
-
-                 #             # Add spacing between sentences
-                 #             if i - 1 >= 0 and jsonHTML[sec][i - 1] and jsonHTML[sec][i - 1][-1] != ".":
-                 #                 htmlText += ". "
-                 #             htmlText += entry
-
-                 #         # Add final period if needed
-                 #         if entries and isinstance(entries[-1], str) and entries[-1] and entries[-1][-1] != ".":
-                 #             htmlText += "."
-                 #         htmlText += "\n\n"
-                 # except:
-                 htmlText += str(jsonHTML)
-             return htmlText
-         except:
-             print("failed merge text in json")
-             return ""
+ # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
+ from bs4 import BeautifulSoup
+ import requests
+ import os  # needed for the SCIENCE_DIRECT_API environment lookup below
+ from DefaultPackages import openFile, saveFile
+ from NER import cleanText
+ import pandas as pd
+ from lxml.etree import ParserError, XMLSyntaxError
+ import aiohttp
+ import asyncio
+ class HTML():
+     def __init__(self, htmlFile, htmlLink, htmlContent: str = None):
+         self.htmlLink = htmlLink
+         self.htmlFile = htmlFile
+         self.htmlContent = htmlContent  # NEW: store raw HTML if provided
+     def fetch_crossref_metadata(self, doi):
+         """Fetch metadata from CrossRef API for a given DOI."""
+         try:
+             url = f"https://api.crossref.org/works/{doi}"
+             r = requests.get(url, timeout=10)
+             if r.status_code == 200:
+                 return r.json().get("message", {})
+             else:
+                 print(f"⚠️ CrossRef fetch failed ({r.status_code}) for DOI: {doi}")
+                 return {}
+         except Exception as e:
+             print(f"❌ CrossRef exception: {e}")
+             return {}
+     # def openHTMLFile(self):
+     #     headers = {
+     #         "User-Agent": (
+     #             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+     #             "AppleWebKit/537.36 (KHTML, like Gecko) "
+     #             "Chrome/114.0.0.0 Safari/537.36"
+     #         ),
+     #         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+     #         "Referer": self.htmlLink,
+     #         "Connection": "keep-alive"
+     #     }
+
+     #     session = requests.Session()
+     #     session.headers.update(headers)
+
+     #     if self.htmlLink != "None":
+     #         try:
+     #             r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
+     #             if r.status_code != 200:
+     #                 print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
+     #                 return BeautifulSoup("", 'html.parser')
+     #             soup = BeautifulSoup(r.content, 'html.parser')
+     #         except Exception as e:
+     #             print(f"❌ Exception fetching HTML: {e}")
+     #             return BeautifulSoup("", 'html.parser')
+     #     else:
+     #         with open(self.htmlFile) as fp:
+     #             soup = BeautifulSoup(fp, 'html.parser')
+     #     return soup
+
+     def openHTMLFile(self):
+         """Return a BeautifulSoup object from cached htmlContent, file, or requests."""
+         # If raw HTML already provided (from async aiohttp), use it directly
+         if self.htmlContent is not None:
+             return BeautifulSoup(self.htmlContent, "html.parser")
+
+         not_need_domain = ['https://broadinstitute.github.io/picard/',
+                            'https://software.broadinstitute.org/gatk/best-practices/',
+                            'https://www.ncbi.nlm.nih.gov/genbank/',
+                            'https://www.mitomap.org/']
+         if self.htmlLink in not_need_domain:
+             return BeautifulSoup("", 'html.parser')
+         headers = {
+             "User-Agent": (
+                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                 "AppleWebKit/537.36 (KHTML, like Gecko) "
+                 "Chrome/114.0.0.0 Safari/537.36"
+             ),
+             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+             "Accept-Language": "en-US,en;q=0.9",
+             "Referer": "https://www.google.com/",
+             #"Referer": self.htmlLink,
+             "Connection": "keep-alive"
+         }
+
+         session = requests.Session()  # still required by the GET below
+         session.headers.update(headers)
+         try:
+             if self.htmlLink and self.htmlLink != "None":
+                 r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
+                 if r.status_code != 200 or not r.text.strip():
+                     print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
+                     return BeautifulSoup("", 'html.parser')
+                 soup = BeautifulSoup(r.content, 'html.parser')
+             elif self.htmlFile:
+                 with open(self.htmlFile, encoding='utf-8') as fp:
+                     soup = BeautifulSoup(fp, 'html.parser')
+             else:
+                 return BeautifulSoup("", 'html.parser')  # nothing to fetch or open
+         except (ParserError, XMLSyntaxError, OSError) as e:
+             print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
+             return BeautifulSoup("", 'html.parser')
+         except Exception as e:
+             print(f"❌ General exception for {self.htmlLink}: {e}")
+             return BeautifulSoup("", 'html.parser')
+
+         return soup
+
+     async def async_fetch_html(self):
+         """Async fetch HTML content with aiohttp."""
+         not_need_domain = [
+             "https://broadinstitute.github.io/picard/",
+             "https://software.broadinstitute.org/gatk/best-practices/",
+             "https://www.ncbi.nlm.nih.gov/genbank/",
+             "https://www.mitomap.org/",
+         ]
+         if self.htmlLink in not_need_domain:
+             return ""  # Skip domains we don't need
+
+         headers = {
+             "User-Agent": (
+                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                 "AppleWebKit/537.36 (KHTML, like Gecko) "
+                 "Chrome/114.0.0.0 Safari/537.36"
+             ),
+             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+             "Accept-Language": "en-US,en;q=0.9",
+             "Referer": "https://www.google.com/",
+             "Connection": "keep-alive",
+         }
+
+         try:
+             async with aiohttp.ClientSession(headers=headers) as session:
+                 async with session.get(self.htmlLink, timeout=15) as resp:
+                     if resp.status != 200:
+                         print(f"❌ HTML GET failed ({resp.status}) — {self.htmlLink}")
+                         return ""
+                     return await resp.text()
+         except Exception as e:
+             print(f"❌ Async fetch failed for {self.htmlLink}: {e}")
+             return ""
+
+     @classmethod
+     async def bulk_fetch(cls, links: list[str]):
+         """Fetch multiple links concurrently, return list of HTML() objects with htmlContent filled."""
+         tasks = [cls("", link).async_fetch_html() for link in links]
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+
+         out = []
+         for link, content in zip(links, results):
+             if isinstance(content, Exception):
+                 print(f"⚠️ Exception while fetching {link}: {content}")
+                 out.append(cls("", link, htmlContent=""))
+             else:
+                 out.append(cls("", link, htmlContent=content))
+         return out
+
+
+     def getText(self):
+         try:
+             soup = self.openHTMLFile()
+             s = soup.find_all("html")
+             text = ""
+             if s:
+                 for t in range(len(s)):
+                     text = s[t].get_text()
+             cl = cleanText.cleanGenText()
+             text = cl.removeExtraSpaceBetweenWords(text)
+             return text
+         except Exception:
+             print("failed to get text from html")
+             return ""
+
+     async def async_getListSection(self, scienceDirect=None):
+         try:
+             json = {}
+             textJson, textHTML = "", ""
+
+             # Use preloaded HTML (fast path)
+             soup = self.openHTMLFile()
+             h2_tags = soup.find_all('h2')
+             for idx, h2 in enumerate(h2_tags):
+                 section_title = h2.get_text(strip=True)
+                 json.setdefault(section_title, [])
+                 next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
+                 for p in h2.find_all_next("p"):
+                     if next_h2 and p == next_h2:
+                         break
+                     json[section_title].append(p.get_text(strip=True))
+
+             # If no sections or explicitly ScienceDirect
+             if scienceDirect is not None or len(json) == 0:
+                 print("async fetching ScienceDirect metadata...")
+                 api_key = os.environ["SCIENCE_DIRECT_API"]  # read the Elsevier key from the environment; never hard-code it
+                 doi = self.htmlLink.split("https://doi.org/")[-1]
+                 base_url = f"https://api.elsevier.com/content/article/doi/{doi}"
+                 headers = {"Accept": "application/json", "X-ELS-APIKey": api_key}
+
+                 async with aiohttp.ClientSession() as session:
+                     async with session.get(base_url, headers=headers, timeout=15) as resp:
+                         if resp.status == 200:
+                             data = await resp.json()
+                             if isinstance(data, dict):
+                                 json["fullText"] = data
+
+             # Merge text
+             textJson = self.mergeTextInJson(json)
+             textHTML = self.getText()
+             return textHTML if len(textHTML) > len(textJson) else textJson
+
+         except Exception as e:
+             print("❌ async_getListSection failed:", e)
+             return ""
+
+     def getListSection(self, scienceDirect=None):
+         try:
+             json = {}
+             text = ""
+             textJson, textHTML = "", ""
+             if scienceDirect is None:
+                 # soup = self.openHTMLFile()
+                 # # get list of section
+                 # json = {}
+                 # for h2Pos in range(len(soup.find_all('h2'))):
+                 #     if soup.find_all('h2')[h2Pos].text not in json:
+                 #         json[soup.find_all('h2')[h2Pos].text] = []
+                 #     if h2Pos + 1 < len(soup.find_all('h2')):
+                 #         content = soup.find_all('h2')[h2Pos].find_next("p")
+                 #         nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
+                 #         while content.text != nexth2Content.text:
+                 #             json[soup.find_all('h2')[h2Pos].text].append(content.text)
+                 #             content = content.find_next("p")
+                 #     else:
+                 #         content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
+                 #         json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
+
+                 soup = self.openHTMLFile()
+                 h2_tags = soup.find_all('h2')
+                 json = {}
+
+                 for idx, h2 in enumerate(h2_tags):
+                     section_title = h2.get_text(strip=True)
+                     json.setdefault(section_title, [])
+
+                     # Get paragraphs until next H2
+                     next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
+                     for p in h2.find_all_next("p"):
+                         if next_h2 and p == next_h2:
+                             break
+                         json[section_title].append(p.get_text(strip=True))
+             # format
+             '''json = {'Abstract':[], 'Introduction':[], 'Methods':[],
+             'Results':[], 'Discussion':[], 'References':[],
+             'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
+             'Additional information':[], 'Electronic supplementary material':[],
+             'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
+             if scienceDirect is not None or len(json) == 0:
+                 # Read the Elsevier API key from the environment
+                 api_key = os.environ["SCIENCE_DIRECT_API"]
+                 # ScienceDirect article DOI or PII (Example DOI)
+                 doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
+                 # Base URL for the Elsevier API
+                 base_url = "https://api.elsevier.com/content/article/doi/"
+                 # Set headers with API key
+                 headers = {
+                     "Accept": "application/json",
+                     "X-ELS-APIKey": api_key
+                 }
+                 # Make the API request
+                 response = requests.get(base_url + doi, headers=headers)
+                 # Check if the request was successful
+                 if response.status_code == 200:
+                     data = response.json()
+                     supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
+                     # if "originalText" in list(supp_data.keys()):
+                     #     if type(supp_data["originalText"])==str:
+                     #         json["originalText"] = [supp_data["originalText"]]
+                     #     if type(supp_data["originalText"])==dict:
+                     #         json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
+                     # else:
+                     #     if type(supp_data)==dict:
+                     #         for key in supp_data:
+                     #             json[key] = [supp_data[key]]
+                     if isinstance(data, dict):
+                         json["fullText"] = data
+             textJson = self.mergeTextInJson(json)
+             textHTML = self.getText()
+             if len(textHTML) > len(textJson):
+                 text = textHTML
+             else:
+                 text = textJson
+             return text #json
+         except Exception:
+             print("❌ getListSection failed")
+             return ""
+     def getReference(self):
+         # get reference to collect more next data
+         ref = []
+         json = self.getListSection()
+         for key in json["References"]:
+             ct = cleanText.cleanGenText(key)
+             cleaned, filteredWord = ct.cleanText()  # renamed so the cleanText module is not shadowed
+             if cleaned not in ref:
+                 ref.append(cleaned)
+         return ref
+     def getSupMaterial(self):
+         # check if there is material or not
+         json = {}
+         soup = self.openHTMLFile()
+         h2_tags = soup.find_all('h2')
+         for h2Pos in range(len(h2_tags)):
+             title = h2_tags[h2Pos].text
+             if any(k in title.lower() for k in ("supplementary", "material", "additional", "support")):
+                 #print(h2_tags[h2Pos].find_next("a").get("href"))
+                 link, output = [], []
+                 if title not in json:
+                     json[title] = []
+                 for l in h2_tags[h2Pos].find_all_next("a", href=True):
+                     link.append(l["href"])
+                 if h2Pos + 1 < len(h2_tags):
+                     nexth2Link = h2_tags[h2Pos+1].find_next("a", href=True)["href"]
+                     if nexth2Link in link:
+                         link = link[:link.index(nexth2Link)]
+                 # only take links that contain "https"
+                 for i in link:
+                     if "https" in i:
+                         output.append(i)
+                 json[title].extend(output)
+         return json
+     def extractTable(self):
+         soup = self.openHTMLFile()
+         df = []
+         if len(soup) > 0:
+             try:
+                 df = pd.read_html(str(soup))
+             except ValueError:
+                 df = []
+                 print("No tables found in HTML file")
+         return df
+     def mergeTextInJson(self, jsonHTML):
+         try:
+             #cl = cleanText.cleanGenText()
+             htmlText = ""
+             if jsonHTML:
+                 # try:
+                 #     for sec, entries in jsonHTML.items():
+                 #         for i, entry in enumerate(entries):
+                 #             # Only process if it's actually text
+                 #             if isinstance(entry, str):
+                 #                 if entry.strip():
+                 #                     entry, filteredWord = cl.textPreprocessing(entry, keepPeriod=True)
+                 #             else:
+                 #                 # Skip or convert dicts/lists to string if needed
+                 #                 entry = str(entry)
+
+                 #             jsonHTML[sec][i] = entry
+
+                 #             # Add spacing between sentences
+                 #             if i - 1 >= 0 and jsonHTML[sec][i - 1] and jsonHTML[sec][i - 1][-1] != ".":
+                 #                 htmlText += ". "
+                 #             htmlText += entry
+
+                 #         # Add final period if needed
+                 #         if entries and isinstance(entries[-1], str) and entries[-1] and entries[-1][-1] != ".":
+                 #             htmlText += "."
+                 #         htmlText += "\n\n"
+                 # except:
+                 htmlText += str(jsonHTML)
+             return htmlText
+         except Exception:
+             print("failed to merge text in json")
+             return ""
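
A minimal usage sketch of the new async path, assuming the class imports as NER.html.extractHTML.HTML (per the file path above) and SCIENCE_DIRECT_API is set in the environment; the links are placeholders taken from the code itself:

import asyncio
from NER.html.extractHTML import HTML  # assumed import path, from the file location above

async def main():
    links = [
        "https://doi.org/10.1016/j.ajhg.2011.01.009",  # example DOI from the code comments
        "https://www.mitomap.org/",  # listed in not_need_domain, so it is skipped
    ]
    # One concurrent aiohttp GET per link; a failed fetch comes back as an
    # HTML object with empty htmlContent instead of raising.
    pages = await HTML.bulk_fetch(links)
    for page in pages:
        # openHTMLFile() short-circuits on the cached htmlContent,
        # so no second, blocking request is made here.
        text = await page.async_getListSection()
        print(page.htmlLink, "->", len(text), "chars")

asyncio.run(main())

fetch_crossref_metadata stays synchronous, so it can be called directly on any instance, e.g. HTML("", link).fetch_crossref_metadata("10.1016/j.ajhg.2011.01.009").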