Spaces:
Running
Running
Update smart_fallback.py
Browse files- smart_fallback.py +59 -0
smart_fallback.py
CHANGED
|
@@ -4,6 +4,8 @@ import mtdna_classifier
|
|
| 4 |
from NER.html import extractHTML
|
| 5 |
import data_preprocess
|
| 6 |
import pipeline
|
|
|
|
|
|
|
| 7 |
# Setup
|
| 8 |
def fetch_ncbi(accession_number):
|
| 9 |
try:
|
|
@@ -242,6 +244,63 @@ def smart_google_queries(metadata: dict):
|
|
| 242 |
# if better_filter:
|
| 243 |
# filtered = better_filter
|
| 244 |
# return filtered
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
| 247 |
TRUSTED_DOMAINS = [
|
|
|
|
| 4 |
from NER.html import extractHTML
|
| 5 |
import data_preprocess
|
| 6 |
import pipeline
|
| 7 |
+
import aiohttp
|
| 8 |
+
import asyncio
|
| 9 |
# Setup
|
| 10 |
def fetch_ncbi(accession_number):
|
| 11 |
try:
|
|
|
|
| 244 |
# if better_filter:
|
| 245 |
# filtered = better_filter
|
| 246 |
# return filtered
|
| 247 |
+
async def process_link(session, link, saveLinkFolder, keywords, accession):
    """Scan a single search-result link for the first matching keyword.

    The article text is fetched with the async extractor for http(s)
    URLs and the synchronous extractor for local paths.  Returns at
    most one hit:

      * ``[[link, keyword, article_text]]`` — keyword found in the text,
      * ``[[link, keyword]]``               — keyword found only in the link,
      * ``[]``                              — no keyword matched.

    ``session`` and ``accession`` are accepted for call-site
    compatibility; they are not read directly here.
    """
    lowered_link = link.lower()

    if link.startswith("http"):
        # Remote resource: use the non-blocking extractor.
        text = await data_preprocess.async_extract_text(link, saveLinkFolder)
    else:
        # Local file: the synchronous extractor is sufficient.
        text = data_preprocess.extract_text(link, saveLinkFolder)

    # Lowercase once instead of per keyword.
    lowered_text = text.lower() if text else ""

    for kw in keywords:
        kw_lower = kw.lower()
        if kw_lower in lowered_text:
            return [[link, kw_lower, text]]
        if kw_lower in lowered_link:
            return [[link, kw_lower]]
    return []
|
| 265 |
+
|
| 266 |
+
async def async_filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
    """Concurrently scan search-result links for metadata keywords.

    Args:
        search_results: iterable of candidate URLs / local file paths.
        saveLinkFolder: folder handed through to the text extractors.
        accession: optional accession number; when given it is prepended
            to the keyword list and links matching it take priority.

    Returns:
        dict mapping link -> article text ("" when only the link string
        itself matched).  If any link matched the accession, only those
        links are returned; otherwise all keyword matches are returned.
    """
    # NOTE(review): kept for parity with the sync filter_links_by_metadata;
    # not consulted in this async variant.
    TRUSTED_DOMAINS = [
        "ncbi.nlm.nih.gov", "pubmed.ncbi.nlm.nih.gov", "pmc.ncbi.nlm.nih.gov",
        "biorxiv.org", "researchgate.net", "nature.com", "sciencedirect.com"
    ]

    keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
    if accession:
        keywords = [accession] + keywords

    filtered, better_filter = {}, {}

    # Fan out one task per non-empty link and gather the per-link hits.
    async with aiohttp.ClientSession() as session:
        tasks = [
            process_link(session, link, saveLinkFolder, keywords, accession)
            for link in search_results
            if link
        ]
        results = await asyncio.gather(*tasks)

    # Merge per-link hits.  Each hit is [link, keyword] or
    # [link, keyword, article_text].
    for hits in results:
        for hit in hits:
            if not (isinstance(hit, list) and len(hit) > 1):
                # BUG FIX: the old else-branch used the list itself as a
                # dict key (unhashable -> TypeError); skip malformed hits.
                continue
            link, kw = hit[0], hit[1]
            text = hit[2] if len(hit) == 3 else ""
            if accession and kw == accession.lower():
                # Accession hits are the strongest signal.
                better_filter[link] = text
            else:
                # BUG FIX: non-accession matches previously also wrote into
                # better_filter, defeating the accession priority; they
                # belong in the general bucket.
                filtered[link] = text

    return better_filter or filtered
|
| 304 |
|
| 305 |
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
| 306 |
TRUSTED_DOMAINS = [
|