Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,11 @@
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from transformers import file_utils
|
| 4 |
print(file_utils.default_cache_path)
|
| 5 |
|
|
@@ -21,10 +27,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
| 21 |
from collections import Counter
|
| 22 |
|
| 23 |
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
|
| 24 |
-
|
| 25 |
|
| 26 |
import torch
|
| 27 |
-
|
| 28 |
|
| 29 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 30 |
print(f"Device: {device}...")
|
|
@@ -496,12 +502,12 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
|
|
| 496 |
#https://data.bioontology.org/documentation#nav_annotator
|
| 497 |
#https://bioportal.bioontology.org/annotatorplus
|
| 498 |
|
| 499 |
-
key_bioportal = ""
|
| 500 |
-
if args.bioportalkey_filename:
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
|
| 506 |
df_annot = pd.DataFrame()
|
| 507 |
for drm_idx, row in tqdm(df.iterrows()):
|
|
@@ -886,9 +892,9 @@ def entitiesFusion(df_annotated, args):
|
|
| 886 |
# Delete all the rows where EXACT MATCHING NOT MET:
|
| 887 |
# Apply the conditions
|
| 888 |
condition_to_delete = (
|
| 889 |
-
df_annotated[
|
| 890 |
-
df_annotated[
|
| 891 |
-
(df_annotated[
|
| 892 |
)
|
| 893 |
|
| 894 |
# Now Filter out the rows where condition_to_delete is True
|
|
@@ -1076,12 +1082,12 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
| 1076 |
entityBioeUrl = None
|
| 1077 |
ALLURIScontext = []
|
| 1078 |
|
| 1079 |
-
key_bioportal = ""
|
| 1080 |
-
if args.bioportalkey_filename:
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
|
| 1086 |
# Check if args.KG_restriction exists and is not empty
|
| 1087 |
if getattr(args, 'KG_restriction', None):
|
|
@@ -2310,12 +2316,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 2310 |
else:
|
| 2311 |
cache_map_geonames = {}
|
| 2312 |
|
| 2313 |
-
key_geonames = ""
|
| 2314 |
-
if args.geonameskey_filename:
|
| 2315 |
-
|
| 2316 |
-
|
| 2317 |
-
|
| 2318 |
-
|
| 2319 |
|
| 2320 |
cache_map_virtuoso = None
|
| 2321 |
if strtobool(args.USE_CACHE):
|
|
@@ -2326,12 +2332,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 2326 |
else:
|
| 2327 |
cache_map_virtuoso = {}
|
| 2328 |
|
| 2329 |
-
key_virtuoso = ""
|
| 2330 |
-
if args.virtuosokey_filename:
|
| 2331 |
-
|
| 2332 |
-
|
| 2333 |
-
|
| 2334 |
-
|
| 2335 |
|
| 2336 |
# Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
|
| 2337 |
|
|
@@ -2352,8 +2358,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 2352 |
missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
|
| 2353 |
|
| 2354 |
# Define the condition to check if ContextToAnnotate starts and ends with quotes
|
| 2355 |
-
quoted_context = df_ToAnnotate[
|
| 2356 |
-
|
| 2357 |
|
| 2358 |
# Combine both conditions
|
| 2359 |
condition = missing_sentence_refs & quoted_context
|
|
@@ -2363,8 +2369,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 2363 |
|
| 2364 |
rows_to_add['model'] = "Forced"
|
| 2365 |
rows_to_add['entity_group'] = "MISC"
|
| 2366 |
-
rows_to_add['word'] = rows_to_add[
|
| 2367 |
-
rows_to_add['word'] = rows_to_add[
|
| 2368 |
rows_to_add['score'] = 1.0
|
| 2369 |
rows_to_add['start'] = int(1)
|
| 2370 |
rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "1,6"  # to use GPUs 1,6 only
|
| 4 |
+
|
| 5 |
+
os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
|
| 6 |
+
os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
|
| 7 |
+
os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
|
| 8 |
+
|
| 9 |
from transformers import file_utils
|
| 10 |
print(file_utils.default_cache_path)
|
| 11 |
|
|
|
|
| 27 |
from collections import Counter
|
| 28 |
|
| 29 |
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
|
| 30 |
+
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
| 31 |
|
| 32 |
import torch
|
| 33 |
+
torch.cuda.empty_cache()  # Clear cache of torch
|
| 34 |
|
| 35 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 36 |
print(f"Device: {device}...")
|
|
|
|
| 502 |
#https://data.bioontology.org/documentation#nav_annotator
|
| 503 |
#https://bioportal.bioontology.org/annotatorplus
|
| 504 |
|
| 505 |
+
#key_bioportal = ""
|
| 506 |
+
#if args.bioportalkey_filename:
|
| 507 |
+
# fkeyname = args.bioportalkey_filename
|
| 508 |
+
# with open(fkeyname) as f:
|
| 509 |
+
# key_bioportal = f.read()
|
| 510 |
+
key_bioportal = os.environ['key_bioportal']
|
| 511 |
|
| 512 |
df_annot = pd.DataFrame()
|
| 513 |
for drm_idx, row in tqdm(df.iterrows()):
|
|
|
|
| 892 |
# Delete all the rows where EXACT MATCHING NOT MET:
|
| 893 |
# Apply the conditions
|
| 894 |
condition_to_delete = (
|
| 895 |
+
df_annotated[args.source_column].str.startswith('"') &
|
| 896 |
+
df_annotated[args.source_column].str.endswith('"') &
|
| 897 |
+
(df_annotated[args.source_column].apply(strip_quotes).str.lower() != df_annotated['word'].str.lower())
|
| 898 |
)
|
| 899 |
|
| 900 |
# Now Filter out the rows where condition_to_delete is True
|
|
|
|
| 1082 |
entityBioeUrl = None
|
| 1083 |
ALLURIScontext = []
|
| 1084 |
|
| 1085 |
+
#key_bioportal = ""
|
| 1086 |
+
#if args.bioportalkey_filename:
|
| 1087 |
+
# fkeyname = args.bioportalkey_filename
|
| 1088 |
+
# with open(fkeyname) as f:
|
| 1089 |
+
# key_bioportal = f.read()
|
| 1090 |
+
key_bioportal = os.environ['key_bioportal']
|
| 1091 |
|
| 1092 |
# Check if args.KG_restriction exists and is not empty
|
| 1093 |
if getattr(args, 'KG_restriction', None):
|
|
|
|
| 2316 |
else:
|
| 2317 |
cache_map_geonames = {}
|
| 2318 |
|
| 2319 |
+
#key_geonames = ""
|
| 2320 |
+
#if args.geonameskey_filename:
|
| 2321 |
+
# fkeyname = args.geonameskey_filename
|
| 2322 |
+
# with open(fkeyname) as f:
|
| 2323 |
+
# key_geonames = f.read()
|
| 2324 |
+
key_geonames = os.environ['key_geonames']
|
| 2325 |
|
| 2326 |
cache_map_virtuoso = None
|
| 2327 |
if strtobool(args.USE_CACHE):
|
|
|
|
| 2332 |
else:
|
| 2333 |
cache_map_virtuoso = {}
|
| 2334 |
|
| 2335 |
+
#key_virtuoso = ""
|
| 2336 |
+
#if args.virtuosokey_filename:
|
| 2337 |
+
# fkeyname = args.virtuosokey_filename
|
| 2338 |
+
# with open(fkeyname) as f:
|
| 2339 |
+
# key_virtuoso = f.read()
|
| 2340 |
+
key_virtuoso = os.environ['key_virtuoso']
|
| 2341 |
|
| 2342 |
# Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
|
| 2343 |
|
|
|
|
| 2358 |
missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
|
| 2359 |
|
| 2360 |
# Define the condition to check if ContextToAnnotate starts and ends with quotes
|
| 2361 |
+
quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[
|
| 2362 |
+
args.source_column].str.endswith('"')
|
| 2363 |
|
| 2364 |
# Combine both conditions
|
| 2365 |
condition = missing_sentence_refs & quoted_context
|
|
|
|
| 2369 |
|
| 2370 |
rows_to_add['model'] = "Forced"
|
| 2371 |
rows_to_add['entity_group'] = "MISC"
|
| 2372 |
+
rows_to_add['word'] = rows_to_add[args.source_column]
|
| 2373 |
+
rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
|
| 2374 |
rows_to_add['score'] = 1.0
|
| 2375 |
rows_to_add['start'] = int(1)
|
| 2376 |
rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
|