Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
| 3 |
from transformers import file_utils
|
| 4 |
print(file_utils.default_cache_path)
|
| 5 |
|
|
@@ -16,11 +17,8 @@ from transformers.pipelines.pt_utils import KeyDataset
|
|
| 16 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 17 |
from collections import Counter
|
| 18 |
|
| 19 |
-
##os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
|
| 20 |
-
#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
| 21 |
-
|
| 22 |
import torch
|
| 23 |
-
|
| 24 |
|
| 25 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 26 |
print(f"Device: {device}...")
|
|
@@ -41,7 +39,7 @@ from virtuosoQueryRest import sparqlQuery
|
|
| 41 |
import gradio as gr
|
| 42 |
import re
|
| 43 |
|
| 44 |
-
from common import strtobool, split_camel_case, chunk_tokens, update_nested_dict, cleanInputText, token_counter, encoding_getter, extract_words, all_words_in_list, row_to_dict_string, rescale_exponential_to_logarithmic
|
| 45 |
|
| 46 |
|
| 47 |
|
|
@@ -73,16 +71,14 @@ modelGlinerBio=None
|
|
| 73 |
num_cores_Gliner_forDemo = 0 # 0 means use the GPU for Gliner !
|
| 74 |
tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
'SYMP', 'FoodOn', 'UBERON', 'VO', 'EuroSciVoc']
|
| 82 |
|
| 83 |
ONLY_Ontologies_OnBIOPORTAL = ["AEO", "BCGO", "BFO", "BIM", "CHEBI", "CHIRO", "CL", "DCM", "DOID", "FMA", "FOODON", "GENO", "GML", "GO", "GEOSPARQL", "HL7", "HP", "HP_O", "IAO", "ICD10", "IDO", "LOINC", "MESH", "MONDO", "NCBITAXON", "NCIT", "NIFCELL", "NIFSTD", "OBCS", "OCHV", "OHPI", "OPB", "PLOSTHES", "RADLEX", "OBOREL", "SNOMEDCT", "SO", "STATO", "STY", "SYMP", "PTRANS", "UBERON", "VO"]
|
| 84 |
|
| 85 |
-
|
| 86 |
encod = encoding_getter('microsoft/deberta-v3-large')
|
| 87 |
text_splitter = TokenTextSplitter(
|
| 88 |
# separators=separators,
|
|
@@ -215,23 +211,24 @@ def process_row_BioPortal_api(args, key_bioportal, row):
|
|
| 215 |
|
| 216 |
onto_clauses = ""
|
| 217 |
for choice in args.KG_restriction:
|
| 218 |
-
if choice
|
| 219 |
choice="SNOMEDCT"
|
| 220 |
-
elif choice
|
| 221 |
choice = "OBOREL"
|
| 222 |
-
elif choice
|
| 223 |
choice = "PTRANS"
|
| 224 |
-
elif choice
|
| 225 |
choice = "FOODON"
|
| 226 |
-
elif choice
|
| 227 |
choice = "GEOSPARQL"
|
| 228 |
-
# elif choice
|
| 229 |
# choice = "NCBITAXON,NCBITaxon_"
|
| 230 |
-
elif choice
|
| 231 |
choice = "NCBITAXON"
|
| 232 |
if choice in ONLY_Ontologies_OnBIOPORTAL:
|
| 233 |
onto_clauses=onto_clauses+choice+","
|
| 234 |
|
|
|
|
| 235 |
if onto_clauses and onto_clauses[-1] == ",":
|
| 236 |
onto_clauses=onto_clauses[:-1]
|
| 237 |
|
|
@@ -366,7 +363,7 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
|
|
| 366 |
# with open(fkeyname) as f:
|
| 367 |
# key_bioportal = f.read()
|
| 368 |
key_bioportal = os.environ['key_bioportal']
|
| 369 |
-
|
| 370 |
df_annot = pd.DataFrame()
|
| 371 |
for drm_idx, row in tqdm(df.iterrows()):
|
| 372 |
df_BioPortalAnnotation=process_row_BioPortal_api(args, key_bioportal, row)
|
|
@@ -403,9 +400,9 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
|
|
| 403 |
|
| 404 |
if "semantic_groups" not in df_max_score_biop.columns:
|
| 405 |
# Drop the '@id' column
|
| 406 |
-
df_max_score_biop["semantic_groups"] = None
|
| 407 |
-
|
| 408 |
-
|
| 409 |
columns_to_keep = ["score", "from", "to", "prefLabel", "text", "semantic_groups", "@id", "ALLURIScontextFromNCBO"]
|
| 410 |
|
| 411 |
# Subset the dataframe to keep only the specified columns
|
|
@@ -744,6 +741,21 @@ def entitiesFusion(df_annotated, args):
|
|
| 744 |
logging.error(
|
| 745 |
f'FAILED to extract json results\n\tError: {err}\nLeaving it as a single column then and not decompressing! Have a check...')
|
| 746 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
#delete all the rows with score smaller than entities_filter_threshold:
|
| 748 |
if args.entities_filter_threshold > 0:
|
| 749 |
df_annotated = df_annotated[df_annotated['score'] >= args.entities_filter_threshold]
|
|
@@ -754,8 +766,8 @@ def entitiesFusion(df_annotated, args):
|
|
| 754 |
# df_annotated = df_annotated[(df_annotated['ToLink'] == df_annotated['word']) | df_annotated['ToLink'].isna()]
|
| 755 |
|
| 756 |
# in all the rows having a value not null for the column "ToLink", compare this value to that of the column "word". If they are different, set the value in "ToLink" to None
|
| 757 |
-
#df_annotated.loc[
|
| 758 |
-
#
|
| 759 |
df_annotated.loc[
|
| 760 |
(~df_annotated['ToLink'].isnull()) & (
|
| 761 |
df_annotated['ToLink'].str.casefold() != df_annotated['word'].str.casefold()), 'ToLink'] = None
|
|
@@ -931,7 +943,7 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
| 931 |
# with open(fkeyname) as f:
|
| 932 |
# key_bioportal = f.read()
|
| 933 |
key_bioportal = os.environ['key_bioportal']
|
| 934 |
-
|
| 935 |
# Check if args.KG_restriction exists and is not empty
|
| 936 |
if getattr(args, 'KG_restriction', None):
|
| 937 |
|
|
@@ -961,24 +973,37 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
|
|
| 961 |
|
| 962 |
### this is for Bioportal url api:
|
| 963 |
onto_clauses = ""
|
| 964 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 965 |
for choice in args.KG_restriction:
|
| 966 |
-
if choice
|
| 967 |
choice="SNOMEDCT"
|
| 968 |
-
elif choice
|
| 969 |
choice = "OBOREL"
|
| 970 |
-
elif choice
|
| 971 |
choice = "PTRANS"
|
| 972 |
-
elif choice
|
| 973 |
choice = "FOODON"
|
| 974 |
-
elif choice
|
| 975 |
choice = "GEOSPARQL"
|
| 976 |
-
# elif choice
|
| 977 |
# choice = "NCBITAXON,NCBITaxon_"
|
| 978 |
-
elif choice
|
| 979 |
choice = "NCBITAXON"
|
| 980 |
if choice in ONLY_Ontologies_OnBIOPORTAL:
|
| 981 |
-
onto_clauses
|
| 982 |
|
| 983 |
if onto_clauses and onto_clauses[-1] == ",":
|
| 984 |
onto_clauses = onto_clauses[:-1]
|
|
@@ -1286,12 +1311,13 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
|
|
| 1286 |
return contextText, map_query_input_output
|
| 1287 |
|
| 1288 |
#@mem.cache
|
| 1289 |
-
def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None):
|
| 1290 |
|
| 1291 |
if strtobool(args.debug):
|
| 1292 |
print(f"\n----- Starting virtuoso_api_call for {word}")
|
| 1293 |
|
| 1294 |
word = word.lower()
|
|
|
|
| 1295 |
|
| 1296 |
endpoint = 'https://api-vast.jrc.service.ec.europa.eu/sparql'
|
| 1297 |
VirtuosoUsername = 'dba'
|
|
@@ -1340,7 +1366,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
|
|
| 1340 |
else:
|
| 1341 |
|
| 1342 |
try:
|
| 1343 |
-
entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=
|
| 1344 |
if ALLURIScontext and isinstance(ALLURIScontext, list):
|
| 1345 |
ALLURIScontext = list(set(ALLURIScontext))
|
| 1346 |
except Exception as err:
|
|
@@ -1352,7 +1378,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
|
|
| 1352 |
|
| 1353 |
return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output
|
| 1354 |
|
| 1355 |
-
|
| 1356 |
|
| 1357 |
if entityBioeUrl:
|
| 1358 |
|
|
@@ -1520,7 +1546,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
|
|
| 1520 |
endpoint,
|
| 1521 |
VirtuosoUsername,
|
| 1522 |
contextWordVirtuoso,
|
| 1523 |
-
UseBioportalForLinking=
|
| 1524 |
if ALLURIScontext and isinstance(ALLURIScontext, list):
|
| 1525 |
ALLURIScontext = list(set(ALLURIScontext))
|
| 1526 |
|
|
@@ -1538,7 +1564,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
|
|
| 1538 |
# Print the error message to stderr
|
| 1539 |
print("THIS CASE SHOULD NEVER HAPPEN NOW!!!! Check what's happening...exiting now...")
|
| 1540 |
# Exit the program with a non-zero status code (commonly used to indicate an error)
|
| 1541 |
-
|
| 1542 |
|
| 1543 |
else:
|
| 1544 |
|
|
@@ -1714,6 +1740,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
|
|
| 1714 |
|
| 1715 |
|
| 1716 |
|
|
|
|
| 1717 |
def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
|
| 1718 |
|
| 1719 |
result = None
|
|
@@ -1736,8 +1763,7 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
|
|
| 1736 |
|
| 1737 |
result, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames = geonames_api_call(row['word'], args, key_geonames, cache_map_geonames)
|
| 1738 |
|
| 1739 |
-
|
| 1740 |
-
if row['IsBio'] == 1:
|
| 1741 |
|
| 1742 |
# Check if '@id' column exists in df_Extract
|
| 1743 |
iiid = None
|
|
@@ -1756,7 +1782,37 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
|
|
| 1756 |
if strtobool(args.debug):
|
| 1757 |
print(f"\n----- isBio COMPUTING ... {row['word']} IN THE TEXT:")
|
| 1758 |
print(row[args.source_column])
|
| 1759 |
-
result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1760 |
|
| 1761 |
else:
|
| 1762 |
if (row['IsBio'] == 1) or ( (pd.isnull(row["IsBio"]) or row["IsBio"] == '' or row['IsBio'] == 0 or row["IsBio"] is None) and (row['entity_group'] == "MISC") ):
|
|
@@ -1780,7 +1836,7 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
|
|
| 1780 |
iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
|
| 1781 |
|
| 1782 |
result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
|
| 1783 |
-
row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO)
|
| 1784 |
|
| 1785 |
return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
|
| 1786 |
|
|
@@ -1889,9 +1945,9 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 1889 |
parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo, help="parallel processing for Gliner annotation") # 0 means use the GPU for Gliner !
|
| 1890 |
|
| 1891 |
parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to make entities linking or not")
|
| 1892 |
-
parser.add_argument("--geonameskey_filename", type=str, default="", help="file location where it is stored the geonames api key")
|
| 1893 |
-
parser.add_argument("--virtuosokey_filename", type=str, default="", help="file location where it is stored the virtuoso endpoint dba pwd")
|
| 1894 |
-
parser.add_argument("--bioportalkey_filename", type=str, default="", help="file location where it is stored the NCBO BioPortal api key")
|
| 1895 |
|
| 1896 |
# consose 20250205:
|
| 1897 |
# KGchoices = None
|
|
@@ -1910,7 +1966,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 1910 |
# parser.add_argument("--USE_CACHE", type=str, default="False",
|
| 1911 |
# help="whether to use cache for the NER and NEL tasks or not")
|
| 1912 |
parser.add_argument("--USE_CACHE", type=str, default="False", help="whether to use cache for the NER and NEL tasks or not")
|
| 1913 |
-
|
| 1914 |
parser.add_argument("--num_cores_eLinking", type=int, default=1, help="parallel processing for the entity linking process")
|
| 1915 |
|
| 1916 |
parser.add_argument("--computeEntityContext", type=str, default="False",
|
|
@@ -1926,7 +1982,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 1926 |
|
| 1927 |
args = parser.parse_args()
|
| 1928 |
|
| 1929 |
-
|
| 1930 |
|
| 1931 |
#print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
|
| 1932 |
#if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
|
|
@@ -1998,21 +2054,24 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 1998 |
df_annotated = history.copy()
|
| 1999 |
|
| 2000 |
|
| 2001 |
-
|
| 2002 |
-
|
| 2003 |
-
# filter now per models selection
|
| 2004 |
-
df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
|
| 2005 |
-
if df_annotated.empty:
|
| 2006 |
-
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
| 2007 |
-
return {"text": text, "entities": []}, html_output, history.to_dict()
|
| 2008 |
|
| 2009 |
-
|
| 2010 |
-
|
| 2011 |
-
|
| 2012 |
-
|
| 2013 |
-
|
| 2014 |
-
|
| 2015 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2016 |
|
| 2017 |
|
| 2018 |
cache_prefix_fp = "LLMQUERYNER"
|
|
@@ -2063,6 +2122,60 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
|
|
| 2063 |
# key_virtuoso = f.read()
|
| 2064 |
key_virtuoso = os.environ['key_virtuoso']
|
| 2065 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2066 |
df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(df_annotated_combined,
|
| 2067 |
text_splitter, args, key_geonames,
|
| 2068 |
cache_map_geonames,
|
|
@@ -2255,5 +2368,5 @@ demo = gr.Interface(
|
|
| 2255 |
|
| 2256 |
|
| 2257 |
|
| 2258 |
-
|
| 2259 |
-
demo.launch(share=True) # Share your demo with just 1 extra parameter
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
|
| 4 |
from transformers import file_utils
|
| 5 |
print(file_utils.default_cache_path)
|
| 6 |
|
|
|
|
| 17 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 18 |
from collections import Counter
|
| 19 |
|
|
|
|
|
|
|
|
|
|
| 20 |
import torch
|
| 21 |
+
torch.cuda.empty_cache() # Clear cache ot torch
|
| 22 |
|
| 23 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 24 |
print(f"Device: {device}...")
|
|
|
|
| 39 |
import gradio as gr
|
| 40 |
import re
|
| 41 |
|
| 42 |
+
from common import strtobool, split_camel_case, chunk_tokens, update_nested_dict, cleanInputText, token_counter, encoding_getter, extract_words, all_words_in_list, row_to_dict_string, strip_quotes, rescale_exponential_to_logarithmic
|
| 43 |
|
| 44 |
|
| 45 |
|
|
|
|
| 71 |
num_cores_Gliner_forDemo = 0 # 0 means use the GPU for Gliner !
|
| 72 |
tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
|
| 73 |
|
| 74 |
+
POSSIBLE_KGchoices_List = ["AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO",
|
| 75 |
+
"GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
|
| 76 |
+
"MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
|
| 77 |
+
"OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
|
| 78 |
+
"SYMP", "FoodOn", "UBERON", "VO", "EuroSciVoc"]
|
|
|
|
| 79 |
|
| 80 |
ONLY_Ontologies_OnBIOPORTAL = ["AEO", "BCGO", "BFO", "BIM", "CHEBI", "CHIRO", "CL", "DCM", "DOID", "FMA", "FOODON", "GENO", "GML", "GO", "GEOSPARQL", "HL7", "HP", "HP_O", "IAO", "ICD10", "IDO", "LOINC", "MESH", "MONDO", "NCBITAXON", "NCIT", "NIFCELL", "NIFSTD", "OBCS", "OCHV", "OHPI", "OPB", "PLOSTHES", "RADLEX", "OBOREL", "SNOMEDCT", "SO", "STATO", "STY", "SYMP", "PTRANS", "UBERON", "VO"]
|
| 81 |
|
|
|
|
| 82 |
encod = encoding_getter('microsoft/deberta-v3-large')
|
| 83 |
text_splitter = TokenTextSplitter(
|
| 84 |
# separators=separators,
|
|
|
|
| 211 |
|
| 212 |
onto_clauses = ""
|
| 213 |
for choice in args.KG_restriction:
|
| 214 |
+
if choice == "SNOMED":
|
| 215 |
choice="SNOMEDCT"
|
| 216 |
+
elif choice == "RO":
|
| 217 |
choice = "OBOREL"
|
| 218 |
+
elif choice == "TRANS":
|
| 219 |
choice = "PTRANS"
|
| 220 |
+
elif choice == "FoodOn":
|
| 221 |
choice = "FOODON"
|
| 222 |
+
elif choice == "GeoSPARQL":
|
| 223 |
choice = "GEOSPARQL"
|
| 224 |
+
# elif choice == "NCBITAXON":
|
| 225 |
# choice = "NCBITAXON,NCBITaxon_"
|
| 226 |
+
elif choice == "NCBITaxon_":
|
| 227 |
choice = "NCBITAXON"
|
| 228 |
if choice in ONLY_Ontologies_OnBIOPORTAL:
|
| 229 |
onto_clauses=onto_clauses+choice+","
|
| 230 |
|
| 231 |
+
|
| 232 |
if onto_clauses and onto_clauses[-1] == ",":
|
| 233 |
onto_clauses=onto_clauses[:-1]
|
| 234 |
|
|
|
|
| 363 |
# with open(fkeyname) as f:
|
| 364 |
# key_bioportal = f.read()
|
| 365 |
key_bioportal = os.environ['key_bioportal']
|
| 366 |
+
|
| 367 |
df_annot = pd.DataFrame()
|
| 368 |
for drm_idx, row in tqdm(df.iterrows()):
|
| 369 |
df_BioPortalAnnotation=process_row_BioPortal_api(args, key_bioportal, row)
|
|
|
|
| 400 |
|
| 401 |
if "semantic_groups" not in df_max_score_biop.columns:
|
| 402 |
# Drop the '@id' column
|
| 403 |
+
df_max_score_biop["semantic_groups"] = None
|
| 404 |
+
|
| 405 |
+
# Specify the columns you want to keep
|
| 406 |
columns_to_keep = ["score", "from", "to", "prefLabel", "text", "semantic_groups", "@id", "ALLURIScontextFromNCBO"]
|
| 407 |
|
| 408 |
# Subset the dataframe to keep only the specified columns
|
|
|
|
| 741 |
logging.error(
|
| 742 |
f'FAILED to extract json results\n\tError: {err}\nLeaving it as a single column then and not decompressing! Have a check...')
|
| 743 |
|
| 744 |
+
#
|
| 745 |
+
|
| 746 |
+
|
| 747 |
+
# Delete all the rows where EXACT MATCHING NOT MET:
|
| 748 |
+
# Apply the conditions
|
| 749 |
+
condition_to_delete = (
|
| 750 |
+
df_annotated['ContextToAnnotate'].str.startswith('"') &
|
| 751 |
+
df_annotated['ContextToAnnotate'].str.endswith('"') &
|
| 752 |
+
(df_annotated['ContextToAnnotate'].apply(strip_quotes).str.lower() != df_annotated['word'].str.lower())
|
| 753 |
+
)
|
| 754 |
+
|
| 755 |
+
# Now Filter out the rows where condition_to_delete is True
|
| 756 |
+
df_annotated = df_annotated[~condition_to_delete].copy()
|
| 757 |
+
#
|
| 758 |
+
|
| 759 |
#delete all the rows with score smaller than entities_filter_threshold:
|
| 760 |
if args.entities_filter_threshold > 0:
|
| 761 |
df_annotated = df_annotated[df_annotated['score'] >= args.entities_filter_threshold]
|
|
|
|
| 766 |
# df_annotated = df_annotated[(df_annotated['ToLink'] == df_annotated['word']) | df_annotated['ToLink'].isna()]
|
| 767 |
|
| 768 |
# in all the rows having a value not null for the column "ToLink", compare this value to that of the column "word". If they are different, set the value in "ToLink" to None
|
| 769 |
+
# df_annotated.loc[
|
| 770 |
+
# (~df_annotated['ToLink'].isnull()) & (df_annotated['ToLink'] != df_annotated['word']), 'ToLink'] = None
|
| 771 |
df_annotated.loc[
|
| 772 |
(~df_annotated['ToLink'].isnull()) & (
|
| 773 |
df_annotated['ToLink'].str.casefold() != df_annotated['word'].str.casefold()), 'ToLink'] = None
|
|
|
|
| 943 |
# with open(fkeyname) as f:
|
| 944 |
# key_bioportal = f.read()
|
| 945 |
key_bioportal = os.environ['key_bioportal']
|
| 946 |
+
|
| 947 |
# Check if args.KG_restriction exists and is not empty
|
| 948 |
if getattr(args, 'KG_restriction', None):
|
| 949 |
|
|
|
|
| 973 |
|
| 974 |
### this is for Bioportal url api:
|
| 975 |
onto_clauses = ""
|
| 976 |
+
# for choice in args.KG_restriction:
|
| 977 |
+
# if choice == "SNOMEDCT":
|
| 978 |
+
# choice = "SNOMED"
|
| 979 |
+
# elif choice == "OBOREL":
|
| 980 |
+
# choice = "RO"
|
| 981 |
+
# elif choice == "PTRANS":
|
| 982 |
+
# choice = "TRANS"
|
| 983 |
+
# elif choice == "FOODON":
|
| 984 |
+
# choice = "FoodOn"
|
| 985 |
+
# elif choice == "GEOSPARQL":
|
| 986 |
+
# choice = "GeoSPARQL"
|
| 987 |
+
# elif choice == "NCBITAXON":
|
| 988 |
+
# choice = "NCBITAXON,NCBITaxon_"
|
| 989 |
+
# onto_clauses = onto_clauses + choice + ","
|
| 990 |
for choice in args.KG_restriction:
|
| 991 |
+
if choice == "SNOMED":
|
| 992 |
choice="SNOMEDCT"
|
| 993 |
+
elif choice == "RO":
|
| 994 |
choice = "OBOREL"
|
| 995 |
+
elif choice == "TRANS":
|
| 996 |
choice = "PTRANS"
|
| 997 |
+
elif choice == "FoodOn":
|
| 998 |
choice = "FOODON"
|
| 999 |
+
elif choice == "GeoSPARQL":
|
| 1000 |
choice = "GEOSPARQL"
|
| 1001 |
+
# elif choice == "NCBITAXON":
|
| 1002 |
# choice = "NCBITAXON,NCBITaxon_"
|
| 1003 |
+
elif choice == "NCBITaxon_":
|
| 1004 |
choice = "NCBITAXON"
|
| 1005 |
if choice in ONLY_Ontologies_OnBIOPORTAL:
|
| 1006 |
+
onto_clauses=onto_clauses+choice+","
|
| 1007 |
|
| 1008 |
if onto_clauses and onto_clauses[-1] == ",":
|
| 1009 |
onto_clauses = onto_clauses[:-1]
|
|
|
|
| 1311 |
return contextText, map_query_input_output
|
| 1312 |
|
| 1313 |
#@mem.cache
|
| 1314 |
+
def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None,UseBioportalForLinking=True):
|
| 1315 |
|
| 1316 |
if strtobool(args.debug):
|
| 1317 |
print(f"\n----- Starting virtuoso_api_call for {word}")
|
| 1318 |
|
| 1319 |
word = word.lower()
|
| 1320 |
+
word = strip_quotes(word)
|
| 1321 |
|
| 1322 |
endpoint = 'https://api-vast.jrc.service.ec.europa.eu/sparql'
|
| 1323 |
VirtuosoUsername = 'dba'
|
|
|
|
| 1366 |
else:
|
| 1367 |
|
| 1368 |
try:
|
| 1369 |
+
entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=UseBioportalForLinking )
|
| 1370 |
if ALLURIScontext and isinstance(ALLURIScontext, list):
|
| 1371 |
ALLURIScontext = list(set(ALLURIScontext))
|
| 1372 |
except Exception as err:
|
|
|
|
| 1378 |
|
| 1379 |
return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output
|
| 1380 |
|
| 1381 |
+
|
| 1382 |
|
| 1383 |
if entityBioeUrl:
|
| 1384 |
|
|
|
|
| 1546 |
endpoint,
|
| 1547 |
VirtuosoUsername,
|
| 1548 |
contextWordVirtuoso,
|
| 1549 |
+
UseBioportalForLinking=UseBioportalForLinking)
|
| 1550 |
if ALLURIScontext and isinstance(ALLURIScontext, list):
|
| 1551 |
ALLURIScontext = list(set(ALLURIScontext))
|
| 1552 |
|
|
|
|
| 1564 |
# Print the error message to stderr
|
| 1565 |
print("THIS CASE SHOULD NEVER HAPPEN NOW!!!! Check what's happening...exiting now...")
|
| 1566 |
# Exit the program with a non-zero status code (commonly used to indicate an error)
|
| 1567 |
+
sys.exit(1)
|
| 1568 |
|
| 1569 |
else:
|
| 1570 |
|
|
|
|
| 1740 |
|
| 1741 |
|
| 1742 |
|
| 1743 |
+
|
| 1744 |
def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
|
| 1745 |
|
| 1746 |
result = None
|
|
|
|
| 1763 |
|
| 1764 |
result, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames = geonames_api_call(row['word'], args, key_geonames, cache_map_geonames)
|
| 1765 |
|
| 1766 |
+
elif row['IsBio'] == 1:
|
|
|
|
| 1767 |
|
| 1768 |
# Check if '@id' column exists in df_Extract
|
| 1769 |
iiid = None
|
|
|
|
| 1782 |
if strtobool(args.debug):
|
| 1783 |
print(f"\n----- isBio COMPUTING ... {row['word']} IN THE TEXT:")
|
| 1784 |
print(row[args.source_column])
|
| 1785 |
+
result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True)
|
| 1786 |
+
|
| 1787 |
+
else:
|
| 1788 |
+
if row['model'] == "Forced":
|
| 1789 |
+
# Check if '@id' column exists in df_Extract
|
| 1790 |
+
iiid = None
|
| 1791 |
+
# Check if the '@id' exists in the Series
|
| 1792 |
+
if '@id' in row:
|
| 1793 |
+
# Check if the value is not None or NaN
|
| 1794 |
+
if row['@id'] is not None and not pd.isna(row['@id']):
|
| 1795 |
+
# Assign the value to the variable iiid
|
| 1796 |
+
iiid = row['@id']
|
| 1797 |
+
iiiALLURIScontextFromNCBO = None
|
| 1798 |
+
if 'ALLURIScontextFromNCBO' in row:
|
| 1799 |
+
if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'],
|
| 1800 |
+
list): # and not pd.isna(row['ALLURIScontextFromNCBO']):
|
| 1801 |
+
iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
|
| 1802 |
+
iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
|
| 1803 |
+
|
| 1804 |
+
if strtobool(args.debug):
|
| 1805 |
+
print(f"\n----- isForced COMPUTING ... {row['word']} IN THE TEXT:")
|
| 1806 |
+
print(row[args.source_column])
|
| 1807 |
+
|
| 1808 |
+
result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
|
| 1809 |
+
row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
|
| 1810 |
+
id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True)
|
| 1811 |
+
|
| 1812 |
+
if not result: #try annotation without bioportal
|
| 1813 |
+
result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
|
| 1814 |
+
row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
|
| 1815 |
+
id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=False)
|
| 1816 |
|
| 1817 |
else:
|
| 1818 |
if (row['IsBio'] == 1) or ( (pd.isnull(row["IsBio"]) or row["IsBio"] == '' or row['IsBio'] == 0 or row["IsBio"] is None) and (row['entity_group'] == "MISC") ):
|
|
|
|
| 1836 |
iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
|
| 1837 |
|
| 1838 |
result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
|
| 1839 |
+
row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True)
|
| 1840 |
|
| 1841 |
return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
|
| 1842 |
|
|
|
|
| 1945 |
parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo, help="parallel processing for Gliner annotation") # 0 means use the GPU for Gliner !
|
| 1946 |
|
| 1947 |
parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to make entities linking or not")
|
| 1948 |
+
parser.add_argument("--geonameskey_filename", type=str, default="GEONAMES-API.key", help="file location where it is stored the geonames api key")
|
| 1949 |
+
parser.add_argument("--virtuosokey_filename", type=str, default="VIRTUOSO-dba.key", help="file location where it is stored the virtuoso endpoint dba pwd")
|
| 1950 |
+
parser.add_argument("--bioportalkey_filename", type=str, default="NCBO-BioPortal.key", help="file location where it is stored the NCBO BioPortal api key")
|
| 1951 |
|
| 1952 |
# consose 20250205:
|
| 1953 |
# KGchoices = None
|
|
|
|
| 1966 |
# parser.add_argument("--USE_CACHE", type=str, default="False",
|
| 1967 |
# help="whether to use cache for the NER and NEL tasks or not")
|
| 1968 |
parser.add_argument("--USE_CACHE", type=str, default="False", help="whether to use cache for the NER and NEL tasks or not")
|
| 1969 |
+
|
| 1970 |
parser.add_argument("--num_cores_eLinking", type=int, default=1, help="parallel processing for the entity linking process")
|
| 1971 |
|
| 1972 |
parser.add_argument("--computeEntityContext", type=str, default="False",
|
|
|
|
| 1982 |
|
| 1983 |
args = parser.parse_args()
|
| 1984 |
|
| 1985 |
+
df_ToAnnotate = pd.DataFrame()
|
| 1986 |
|
| 1987 |
#print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
|
| 1988 |
#if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
|
|
|
|
| 2054 |
df_annotated = history.copy()
|
| 2055 |
|
| 2056 |
|
| 2057 |
+
quoted_text = text.startswith('"') & text.endswith('"')
|
| 2058 |
+
if (not df_annotated.empty) or quoted_text:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2059 |
|
| 2060 |
+
if (not df_annotated.empty):
|
| 2061 |
+
# filter now per models selection
|
| 2062 |
+
df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
|
| 2063 |
+
if df_annotated.empty and quoted_text==False:
|
| 2064 |
+
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
| 2065 |
+
return {"text": text, "entities": []}, html_output, history.to_dict()
|
| 2066 |
|
| 2067 |
+
df_annotated_combined = pd.DataFrame()
|
| 2068 |
+
if (not df_annotated.empty):
|
| 2069 |
+
df_annotated_combined = entitiesFusion(df_annotated,args)
|
| 2070 |
+
if df_annotated_combined.empty and quoted_text==False:
|
| 2071 |
+
html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
|
| 2072 |
+
return {"text": text, "entities": []}, html_output, history.to_dict()
|
| 2073 |
+
else:
|
| 2074 |
+
df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999) #I cut all the cross inside with the 0.99. to avoid the linking
|
| 2075 |
|
| 2076 |
|
| 2077 |
cache_prefix_fp = "LLMQUERYNER"
|
|
|
|
| 2122 |
# key_virtuoso = f.read()
|
| 2123 |
key_virtuoso = os.environ['key_virtuoso']
|
| 2124 |
|
| 2125 |
+
# Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
|
| 2126 |
+
|
| 2127 |
+
if df_ToAnnotate.empty:
|
| 2128 |
+
df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]})
|
| 2129 |
+
|
| 2130 |
+
if "SentenceRef" not in df_ToAnnotate.columns:
|
| 2131 |
+
df_ToAnnotate["SentenceRef"] = None
|
| 2132 |
+
df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if
|
| 2133 |
+
col != 'SentenceRef']] # this moves it to the first position
|
| 2134 |
+
|
| 2135 |
+
df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1
|
| 2136 |
+
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(
|
| 2137 |
+
df_ToAnnotate[args.source_column]).transform('min').astype(int)
|
| 2138 |
+
df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int)
|
| 2139 |
+
|
| 2140 |
+
# Define the condition to find missing SentenceRefs
|
| 2141 |
+
missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef'])
|
| 2142 |
+
|
| 2143 |
+
# Define the condition to check if ContextToAnnotate starts and ends with quotes
|
| 2144 |
+
quoted_context = df_ToAnnotate['ContextToAnnotate'].str.startswith('"') & df_ToAnnotate[
|
| 2145 |
+
'ContextToAnnotate'].str.endswith('"')
|
| 2146 |
+
|
| 2147 |
+
# Combine both conditions
|
| 2148 |
+
condition = missing_sentence_refs & quoted_context
|
| 2149 |
+
|
| 2150 |
+
# Select rows from df_ToAnnotate that meet the condition
|
| 2151 |
+
rows_to_add = df_ToAnnotate[condition]
|
| 2152 |
+
|
| 2153 |
+
rows_to_add['model'] = "Forced"
|
| 2154 |
+
rows_to_add['entity_group'] = "MISC"
|
| 2155 |
+
rows_to_add['word'] = rows_to_add['ContextToAnnotate']
|
| 2156 |
+
rows_to_add['word'] = rows_to_add['ContextToAnnotate'].apply(strip_quotes)
|
| 2157 |
+
rows_to_add['score'] = 1.0
|
| 2158 |
+
rows_to_add['start'] = int(1)
|
| 2159 |
+
rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1)
|
| 2160 |
+
rows_to_add['IsGeo'] = None
|
| 2161 |
+
rows_to_add['IsBio'] = None
|
| 2162 |
+
rows_to_add['IsCrossInside'] = 0.0
|
| 2163 |
+
|
| 2164 |
+
if df_annotated_combined.empty:
|
| 2165 |
+
df_annotated_combined = pd.DataFrame(columns=df_ToAnnotate.columns)
|
| 2166 |
+
|
| 2167 |
+
# Append these rows to df_annotated_combined
|
| 2168 |
+
df_annotated_combined = pd.concat([df_annotated_combined, rows_to_add], ignore_index=True)
|
| 2169 |
+
|
| 2170 |
+
df_annotated_combined['start'] = df_annotated_combined['start'].astype(int)
|
| 2171 |
+
df_annotated_combined['end'] = df_annotated_combined['end'].astype(int)
|
| 2172 |
+
|
| 2173 |
+
df_annotated_combined = df_annotated_combined.sort_values(
|
| 2174 |
+
by=['SentenceRef', 'start', 'ToLink', 'word', 'score'],
|
| 2175 |
+
ascending=[True, True, True, True, False])
|
| 2176 |
+
|
| 2177 |
+
# Now df_annotated_combined contains the additional rows
|
| 2178 |
+
|
| 2179 |
df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(df_annotated_combined,
|
| 2180 |
text_splitter, args, key_geonames,
|
| 2181 |
cache_map_geonames,
|
|
|
|
| 2368 |
|
| 2369 |
|
| 2370 |
|
| 2371 |
+
demo.launch()
|
| 2372 |
+
#demo.launch(share=True) # Share your demo with just 1 extra parameter
|