VyLala committed on
Commit
da3b210
·
verified ·
1 Parent(s): 6bcf9de

Update smart_fallback.py

Browse files
Files changed (1) hide show
  1. smart_fallback.py +59 -0
smart_fallback.py CHANGED
@@ -4,6 +4,8 @@ import mtdna_classifier
4
  from NER.html import extractHTML
5
  import data_preprocess
6
  import pipeline
 
 
7
  # Setup
8
  def fetch_ncbi(accession_number):
9
  try:
@@ -242,6 +244,63 @@ def smart_google_queries(metadata: dict):
242
  # if better_filter:
243
  # filtered = better_filter
244
  # return filtered
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
247
  TRUSTED_DOMAINS = [
 
4
  from NER.html import extractHTML
5
  import data_preprocess
6
  import pipeline
7
+ import aiohttp
8
+ import asyncio
9
  # Setup
10
  def fetch_ncbi(accession_number):
11
  try:
 
244
  # if better_filter:
245
  # filtered = better_filter
246
  # return filtered
247
async def process_link(session, link, saveLinkFolder, keywords, accession):
    """Check one search-result link against the metadata keywords.

    The link's text is extracted (with the async extractor for links that
    start with ``http``, with the sync extractor otherwise) and the keywords
    are tried in the order given. The first keyword found either in the
    extracted text or in the link string itself ends the search.

    Args:
        session: Accepted for interface compatibility; not used in this
            function.
        link: URL or local path to inspect.
        saveLinkFolder: Passed through to the ``data_preprocess`` extractors.
        keywords: Keywords to look for, in priority order.
        accession: Accepted for interface compatibility; not used in this
            function (callers put the accession at the front of ``keywords``).

    Returns:
        ``[]`` when no keyword matches; otherwise a one-element list holding
        ``[link, keyword, article_text]`` for a match in the extracted text,
        or ``[link, keyword]`` for a match in the link string. ``keyword`` is
        lower-cased in both shapes.
    """
    # Use the async extractor for web links, fall back to sync for local files.
    if link.startswith("http"):
        article_text = await data_preprocess.async_extract_text(link, saveLinkFolder)
    else:
        article_text = data_preprocess.extract_text(link, saveLinkFolder)

    # Lower-case once: the article text can be large and the keyword loop
    # would otherwise re-lowercase it on every iteration.
    link_lower = link.lower()
    text_lower = article_text.lower() if article_text else ""

    for keyword in keywords:
        needle = keyword.lower()
        # A text match is preferred over a link match for the same keyword
        # because it carries the extracted text along with it.
        if article_text and needle in text_lower:
            return [[link, needle, article_text]]
        if needle in link_lower:
            return [[link, needle]]
    return []
265
+
266
async def async_filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
    """Filter search-result links concurrently by metadata keywords.

    Every non-empty link is handed to ``process_link`` and all links are
    awaited together. Matches on the accession keyword are collected
    separately from matches on the generic keywords so that accession hits
    take precedence in the result.

    Args:
        search_results: Iterable of links (URLs or local paths); falsy
            entries are skipped.
        saveLinkFolder: Passed through to ``process_link``.
        accession: Optional accession string. When given it is tried before
            the generic keywords.

    Returns:
        A dict mapping link -> extracted text (``""`` when only the link
        string matched). If any link matched the accession, only those links
        are returned; otherwise the links that matched a generic keyword.
    """
    keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
    if accession:
        keywords = [accession] + keywords
    # process_link reports keywords lower-cased, so compare in that form.
    accession_kw = accession.lower() if accession else None

    filtered, better_filter = {}, {}

    async with aiohttp.ClientSession() as session:
        tasks = [
            process_link(session, link, saveLinkFolder, keywords, accession)
            for link in search_results
            if link
        ]
        results = await asyncio.gather(*tasks)

    # Merge results: accession hits go to better_filter, everything else to
    # filtered, so that `better_filter or filtered` prefers accession hits.
    for matches in results:
        for match in matches:
            if isinstance(match, list) and len(match) > 1:
                is_accession_hit = accession_kw is not None and match[1] == accession_kw
                target = better_filter if is_accession_hit else filtered
                if len(match) == 2:
                    # Matched on the link string only: no text to keep.
                    target[match[0]] = ""
                elif len(match) == 3:
                    target[match[0]] = match[2]
            else:
                filtered[match] = ""

    return better_filter or filtered
304
 
305
  def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
306
  TRUSTED_DOMAINS = [