Spaces:
Running
Running
Update smart_fallback.py
Browse files- smart_fallback.py +16 -1
smart_fallback.py
CHANGED
|
@@ -147,7 +147,7 @@ def smart_google_queries(metadata: dict):
|
|
| 147 |
|
| 148 |
return queries
|
| 149 |
|
| 150 |
-
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
| 151 |
TRUSTED_DOMAINS = [
|
| 152 |
"ncbi.nlm.nih.gov",
|
| 153 |
"pubmed.ncbi.nlm.nih.gov",
|
|
@@ -157,6 +157,9 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
|
| 157 |
"nature.com",
|
| 158 |
"sciencedirect.com"
|
| 159 |
]
|
|
|
|
|
|
|
|
|
|
| 160 |
def is_trusted_link(link):
|
| 161 |
for domain in TRUSTED_DOMAINS:
|
| 162 |
if domain in link:
|
|
@@ -170,6 +173,9 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
|
| 170 |
title_snippet = link.lower()
|
| 171 |
print("save link folder inside this filter function: ", saveLinkFolder)
|
| 172 |
success_process, output_process = pipeline.run_with_timeout(data_preprocess.extract_text,args=(link,saveLinkFolder),timeout=60)
|
|
|
|
|
|
|
|
|
|
| 173 |
if success_process:
|
| 174 |
article_text = output_process
|
| 175 |
print("yes succeed for getting article text")
|
|
@@ -179,10 +185,16 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
|
| 179 |
#article_text = data_preprocess.extract_text(link,saveLinkFolder)
|
| 180 |
print("article text")
|
| 181 |
#print(article_text)
|
|
|
|
|
|
|
|
|
|
| 182 |
try:
|
| 183 |
ext = link.split(".")[-1].lower()
|
| 184 |
if ext not in ["pdf", "docx", "xlsx"]:
|
| 185 |
html = extractHTML.HTML("", link)
|
|
|
|
|
|
|
|
|
|
| 186 |
jsonSM = html.getSupMaterial()
|
| 187 |
if jsonSM:
|
| 188 |
output += sum((jsonSM[key] for key in jsonSM), [])
|
|
@@ -210,6 +222,9 @@ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
|
|
| 210 |
# filtered.append(link)
|
| 211 |
# else:
|
| 212 |
print(link)
|
|
|
|
|
|
|
|
|
|
| 213 |
if link:
|
| 214 |
output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
|
| 215 |
print("output link: ")
|
|
|
|
| 147 |
|
| 148 |
return queries
|
| 149 |
|
| 150 |
+
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None, stop_flag=None):
|
| 151 |
TRUSTED_DOMAINS = [
|
| 152 |
"ncbi.nlm.nih.gov",
|
| 153 |
"pubmed.ncbi.nlm.nih.gov",
|
|
|
|
| 157 |
"nature.com",
|
| 158 |
"sciencedirect.com"
|
| 159 |
]
|
| 160 |
+
if stop_flag is not None and stop_flag.value:
|
| 161 |
+
print(f"🛑 Stop detected {accession}, aborting early...")
|
| 162 |
+
return []
|
| 163 |
def is_trusted_link(link):
|
| 164 |
for domain in TRUSTED_DOMAINS:
|
| 165 |
if domain in link:
|
|
|
|
| 173 |
title_snippet = link.lower()
|
| 174 |
print("save link folder inside this filter function: ", saveLinkFolder)
|
| 175 |
success_process, output_process = pipeline.run_with_timeout(data_preprocess.extract_text,args=(link,saveLinkFolder),timeout=60)
|
| 176 |
+
if stop_flag is not None and stop_flag.value:
|
| 177 |
+
print(f"🛑 Stop detected {accession}, aborting early...")
|
| 178 |
+
return []
|
| 179 |
if success_process:
|
| 180 |
article_text = output_process
|
| 181 |
print("yes succeed for getting article text")
|
|
|
|
| 185 |
#article_text = data_preprocess.extract_text(link,saveLinkFolder)
|
| 186 |
print("article text")
|
| 187 |
#print(article_text)
|
| 188 |
+
if stop_flag is not None and stop_flag.value:
|
| 189 |
+
print(f"🛑 Stop detected {accession}, aborting early...")
|
| 190 |
+
return []
|
| 191 |
try:
|
| 192 |
ext = link.split(".")[-1].lower()
|
| 193 |
if ext not in ["pdf", "docx", "xlsx"]:
|
| 194 |
html = extractHTML.HTML("", link)
|
| 195 |
+
if stop_flag is not None and stop_flag.value:
|
| 196 |
+
print(f"🛑 Stop detected {accession}, aborting early...")
|
| 197 |
+
return []
|
| 198 |
jsonSM = html.getSupMaterial()
|
| 199 |
if jsonSM:
|
| 200 |
output += sum((jsonSM[key] for key in jsonSM), [])
|
|
|
|
| 222 |
# filtered.append(link)
|
| 223 |
# else:
|
| 224 |
print(link)
|
| 225 |
+
if stop_flag is not None and stop_flag.value:
|
| 226 |
+
print(f"🛑 Stop detected {accession}, aborting early...")
|
| 227 |
+
return []
|
| 228 |
if link:
|
| 229 |
output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
|
| 230 |
print("output link: ")
|