Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,11 @@
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from transformers import file_utils
|
| 4 |
print(file_utils.default_cache_path)
|
| 5 |
|
|
@@ -21,10 +27,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
| 21 |
from collections import Counter
|
| 22 |
|
| 23 |
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
|
| 24 |
-
|
| 25 |
|
| 26 |
import torch
|
| 27 |
-
|
| 28 |
|
| 29 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 30 |
print(f"Device: {device}...")
|
|
@@ -496,12 +502,12 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
|
|
| 496 |
#https://data.bioontology.org/documentation#nav_annotator
|
| 497 |
#https://bioportal.bioontology.org/annotatorplus
|
| 498 |
|
| 499 |
-
key_bioportal = ""
|
| 500 |
-
if args.bioportalkey_filename:
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
|
| 506 |
df_annot = pd.DataFrame()
|
| 507 |
for drm_idx, row in tqdm(df.iterrows()):
|
|
@@ -886,9 +892,9 @@ def entitiesFusion(df_annotated, args):
|
|
| 886 |
# Delete all the rows where EXACT MATCHING NOT MET:
|
| 887 |
# Apply the conditions
|
| 888 |
condition_to_delete = (
|
| 889 |
-
df_annotated[
|
| 890 |
-
df_annotated[
|
| 891 |
-
(df_annotated[
|
| 892 |
)
|
| 893 |
|
| 894 |
# Now Filter out the rows where condition_to_delete is True
|
|
@@ -1076,12 +1082,12 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
| 1076 |
entityBioeUrl = None
|
| 1077 |
ALLURIScontext = []
|
| 1078 |
|
| 1079 |
-
key_bioportal = ""
|
| 1080 |
-
if args.bioportalkey_filename:
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
|
| 1086 |
# Check if args.KG_restriction exists and is not empty
|
| 1087 |
if getattr(args, 'KG_restriction', None):
|
|
@@ -2310,12 +2316,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 2310 |
else:
|
| 2311 |
cache_map_geonames = {}
|
| 2312 |
|
| 2313 |
-
key_geonames = ""
|
| 2314 |
-
if args.geonameskey_filename:
|
| 2315 |
-
|
| 2316 |
-
|
| 2317 |
-
|
| 2318 |
-
|
| 2319 |
|
| 2320 |
cache_map_virtuoso = None
|
| 2321 |
if strtobool(args.USE_CACHE):
|
|
@@ -2326,12 +2332,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 2326 |
else:
|
| 2327 |
cache_map_virtuoso = {}
|
| 2328 |
|
| 2329 |
-
key_virtuoso = ""
|
| 2330 |
-
if args.virtuosokey_filename:
|
| 2331 |
-
|
| 2332 |
-
|
| 2333 |
-
|
| 2334 |
-
|
| 2335 |
|
| 2336 |
# Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
|
| 2337 |
|
|
@@ -2352,8 +2358,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 2352 |
missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
|
| 2353 |
|
| 2354 |
# Define the condition to check if ContextToAnnotate starts and ends with quotes
|
| 2355 |
-
quoted_context = df_ToAnnotate[
|
| 2356 |
-
|
| 2357 |
|
| 2358 |
# Combine both conditions
|
| 2359 |
condition = missing_sentence_refs & quoted_context
|
|
@@ -2363,8 +2369,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 2363 |
|
| 2364 |
rows_to_add['model'] = "Forced"
|
| 2365 |
rows_to_add['entity_group'] = "MISC"
|
| 2366 |
-
rows_to_add['word'] = rows_to_add[
|
| 2367 |
-
rows_to_add['word'] = rows_to_add[
|
| 2368 |
rows_to_add['score'] = 1.0
|
| 2369 |
rows_to_add['start'] = int(1)
|
| 2370 |
rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "1,6"  # to use GPUs 1,6 only
|
| 4 |
+
|
| 5 |
+
os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
|
| 6 |
+
os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
|
| 7 |
+
os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
|
| 8 |
+
|
| 9 |
from transformers import file_utils
|
| 10 |
print(file_utils.default_cache_path)
|
| 11 |
|
|
|
|
| 27 |
from collections import Counter
|
| 28 |
|
| 29 |
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
|
| 30 |
+
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
| 31 |
|
| 32 |
import torch
|
| 33 |
+
torch.cuda.empty_cache()  # Clear cache of torch
|
| 34 |
|
| 35 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 36 |
print(f"Device: {device}...")
|
|
|
|
| 502 |
#https://data.bioontology.org/documentation#nav_annotator
|
| 503 |
#https://bioportal.bioontology.org/annotatorplus
|
| 504 |
|
| 505 |
+
#key_bioportal = ""
|
| 506 |
+
#if args.bioportalkey_filename:
|
| 507 |
+
# fkeyname = args.bioportalkey_filename
|
| 508 |
+
# with open(fkeyname) as f:
|
| 509 |
+
# key_bioportal = f.read()
|
| 510 |
+
key_bioportal = os.environ['key_bioportal']
|
| 511 |
|
| 512 |
df_annot = pd.DataFrame()
|
| 513 |
for drm_idx, row in tqdm(df.iterrows()):
|
|
|
|
| 892 |
# Delete all the rows where EXACT MATCHING NOT MET:
|
| 893 |
# Apply the conditions
|
| 894 |
condition_to_delete = (
|
| 895 |
+
df_annotated[args.source_column].str.startswith('"') &
|
| 896 |
+
df_annotated[args.source_column].str.endswith('"') &
|
| 897 |
+
(df_annotated[args.source_column].apply(strip_quotes).str.lower() != df_annotated['word'].str.lower())
|
| 898 |
)
|
| 899 |
|
| 900 |
# Now Filter out the rows where condition_to_delete is True
|
|
|
|
| 1082 |
entityBioeUrl = None
|
| 1083 |
ALLURIScontext = []
|
| 1084 |
|
| 1085 |
+
#key_bioportal = ""
|
| 1086 |
+
#if args.bioportalkey_filename:
|
| 1087 |
+
# fkeyname = args.bioportalkey_filename
|
| 1088 |
+
# with open(fkeyname) as f:
|
| 1089 |
+
# key_bioportal = f.read()
|
| 1090 |
+
key_bioportal = os.environ['key_bioportal']
|
| 1091 |
|
| 1092 |
# Check if args.KG_restriction exists and is not empty
|
| 1093 |
if getattr(args, 'KG_restriction', None):
|
|
|
|
| 2316 |
else:
|
| 2317 |
cache_map_geonames = {}
|
| 2318 |
|
| 2319 |
+
#key_geonames = ""
|
| 2320 |
+
#if args.geonameskey_filename:
|
| 2321 |
+
# fkeyname = args.geonameskey_filename
|
| 2322 |
+
# with open(fkeyname) as f:
|
| 2323 |
+
# key_geonames = f.read()
|
| 2324 |
+
key_geonames = os.environ['key_geonames']
|
| 2325 |
|
| 2326 |
cache_map_virtuoso = None
|
| 2327 |
if strtobool(args.USE_CACHE):
|
|
|
|
| 2332 |
else:
|
| 2333 |
cache_map_virtuoso = {}
|
| 2334 |
|
| 2335 |
+
#key_virtuoso = ""
|
| 2336 |
+
#if args.virtuosokey_filename:
|
| 2337 |
+
# fkeyname = args.virtuosokey_filename
|
| 2338 |
+
# with open(fkeyname) as f:
|
| 2339 |
+
# key_virtuoso = f.read()
|
| 2340 |
+
key_virtuoso = os.environ['key_virtuoso']
|
| 2341 |
|
| 2342 |
# Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
|
| 2343 |
|
|
|
|
| 2358 |
missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
|
| 2359 |
|
| 2360 |
# Define the condition to check if ContextToAnnotate starts and ends with quotes
|
| 2361 |
+
quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[
|
| 2362 |
+
args.source_column].str.endswith('"')
|
| 2363 |
|
| 2364 |
# Combine both conditions
|
| 2365 |
condition = missing_sentence_refs & quoted_context
|
|
|
|
| 2369 |
|
| 2370 |
rows_to_add['model'] = "Forced"
|
| 2371 |
rows_to_add['entity_group'] = "MISC"
|
| 2372 |
+
rows_to_add['word'] = rows_to_add[args.source_column]
|
| 2373 |
+
rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes)
|
| 2374 |
rows_to_add['score'] = 1.0
|
| 2375 |
rows_to_add['start'] = int(1)
|
| 2376 |
rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
|