Spaces:
Runtime error
Runtime error
| from sentence_transformers import SentenceTransformer, util | |
| import json | |
| import time | |
| import pandas as pd | |
| import numpy as np | |
| import pickle | |
| import chromadb | |
| from chromadb.config import Settings | |
| from chromadb.utils import embedding_functions | |
| from chromadb.db.clickhouse import NoDatapointsException | |
# Lookup tables that collapse a raw datatype / unit string into a coarse category.
# Keys are compared against the lower-cased query value.
# NOTE(review): the 'ENUM_REAL', 'ENUM_RATIONAL', 'ENUM_BOOLEAN' and 'ENUM_STRING'
# keys contain upper-case letters and therefore can never match a lower-cased
# lookup. They are left untouched because the stored Metalabel values may have
# been produced with this same table -- confirm against the indexing code.
_DATATYPE_CATEGORIES = {
    'boolean': 'BOOLEAN', 'string': 'STRING', 'string_translatable': 'STRING',
    'translatable_string': 'STRING', 'non_translatable_string': 'STRING',
    'date': 'DATE', 'data_time': 'DATE', 'uri': 'URI', 'int': 'INT',
    'int_measure': 'INT', 'int_currency': 'INT', 'integer': 'INT',
    'real': 'REAL', 'real_measure': 'REAL', 'real_currency': 'REAL',
    'enum_code': 'ENUM_CODE', 'enum_int': 'ENUM_CODE',
    'ENUM_REAL': 'ENUM_CODE', 'ENUM_RATIONAL': 'ENUM_CODE',
    'ENUM_BOOLEAN': 'ENUM_CODE', 'ENUM_STRING': 'ENUM_CODE',
    'enum_reference': 'ENUM_CODE', 'enum_instance': 'ENUM_CODE',
    'set(b1,b2)': 'SET', 'constrained_set(b1,b2,cmn,cmx)': 'SET',
    'set [0,?]': 'SET', 'set [1,?]': 'SET', 'set [1, ?]': 'SET',
    'nan': 'NaN', 'media_type': 'LARGE_OBJECT_TYPE',
}

# NOTE(review): 'w(m²*K)' has an upper-case 'K' and is unreachable for the same
# reason as the ENUM_* keys above; kept as-is pending confirmation.
_UNIT_CATEGORIES = {
    'nan': 'NaN', 'hertz': 'FREQUENCY', 'hz': 'FREQUENCY', 'pa': 'PRESSURE',
    'pascal': 'PRESSURE', 'n/m²': 'PRESSURE', 'bar': 'PRESSURE',
    '%': 'SCALARS_PERC', 'w': 'POWER', 'watt': 'POWER', 'kw': 'POWER',
    'kg/m³': 'CHEMISTRY', 'm²/s': 'CHEMISTRY', 'pa*s': 'CHEMISTRY',
    'v': 'ELECTRICAL', 'volt': 'ELECTRICAL', 'db': 'ACOUSTICS',
    'db(a)': 'ACOUSTICS', 'k': 'TEMPERATURE', '°c': 'TEMPERATURE',
    'n': 'MECHANICS', 'newton': 'MECHANICS', 'kg/s': 'FLOW', 'kg/h': 'FLOW',
    'm³/s': 'FLOW', 'm³/h': 'FLOW', 'l/s': 'FLOW', 'l/h': 'FLOW',
    'µm': 'LENGTH', 'mm': 'LENGTH', 'cm': 'LENGTH', 'dm': 'LENGTH',
    'm': 'LENGTH', 'meter': 'LENGTH', 'm/s': 'SPEED', 'km/h': 'SPEED',
    's^(-1)': 'FREQUENCY', '1/s': 'FREQUENCY', 's': 'TIME', 'h': 'TIME',
    'min': 'TIME', 'd': 'TIME', 'hours': 'TIME', 'a': 'ELECTRICAL',
    'm³': 'VOLUME', 'm²': 'AREA', 'rpm': 'FLOW', 'nm': 'MECHANICS',
    'm/m': 'MECHANICS', 'm³/m²s': 'MECHANICS', 'w(m²*K)': 'HEAT_TRANSFER',
    'kwh': 'ELECTRICAL', 'kg/(s*m²)': 'FLOW', 'kg': 'MASS',
    'w/(m*k)': 'HEAT_TRANSFER', 'm²*k/w': 'HEAT_TRANSFER', 'j/s': 'POWER',
}

_SEMANTIC_ID_METHOD = 'Semantic equivalent , same semantic Id'
_NLP_WITHOUT_METADATA = 'Semantically not equivalent, NLP without Metadata'
_NLP_WITH_METADATA = 'Semantically not equivalent, NLP with Metadata'
_NLP_ALGORITHM = (
    'Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), '
    'with model gart-labor/eng-distilBERT-se-eclass'
)


def _filtered_query(collection, queries, n_results, where):
    """Run a ``where``-filtered query; return None when it fails or finds nothing."""
    try:
        found = collection.query(
            query_embeddings=queries,
            n_results=n_results,
            where=where,
        )
    except Exception:
        # Best effort, as before: any failure of a filtered lookup simply means
        # "no match for this filter" and the caller falls back to the next strategy.
        return None
    documents = found['documents']
    if not documents or not documents[0]:
        # An empty hit list must not be indexed with [0][0] further down.
        return None
    return found


def _format_match(method, algorithm, distance, document, meta, include_aas=False):
    """Build one entry of the result list from a single query hit."""
    match = {
        "matching_method": method,
        "matching_algorithm": algorithm,
        "matching_distance": distance,
    }
    if include_aas:
        match["aas_id"] = meta['AASId']
        match["aas_id_short"] = meta['AASIdShort']
    match["submodel_id_short"] = meta['SubmodelName']
    match["submodel_id"] = meta['SubmodelId']
    match["matched_object"] = json.loads(document)
    return match


def query_right_aas(json_query, collection, metalabel, model):
    """Find the stored elements that best match one query element.

    The lookup runs in two stages:

    1. Query ``collection`` filtered on ``SESemanticId``. If that yields a hit,
       only this hit is returned and its distance is reported as ``0``.
    2. Otherwise run a nearest-neighbour query twice, once filtered on the
       metalabel derived from unit and datatype and once unfiltered, and return
       whichever has the smaller best distance (the unfiltered one on a tie).

    Args:
        json_query: JSON string with the keys ``Name``, ``Definition``, ``Unit``,
            ``Datatype``, ``SemanticId`` and ``ReturnMatches``.
        collection: Object exposing ``query(query_embeddings=, n_results=, where=)``
            that returns a mapping with ``documents``, ``distances`` and
            ``metadatas`` (one inner list per query embedding).
        metalabel: Mapping from a metalabel key to a ``(unit_category,
            datatype_category)`` pair.
        model: Object exposing ``encode(text)`` that returns a 1-D array.

    Returns:
        A list of dicts with ``matching_method``, ``matching_algorithm``,
        ``matching_distance``, ``submodel_id_short``, ``submodel_id`` and
        ``matched_object``; a semantic-id match additionally carries ``aas_id``
        and ``aas_id_short``. The list holds at most ``ReturnMatches`` entries
        and is empty when the collection returns no hits.

    Raises:
        KeyError: If a required key is missing from the query or a hit's metadata.
        json.JSONDecodeError: If ``json_query`` or a stored document is not JSON.
    """
    query = json.loads(json_query)
    name = query['Name']
    definition = query["Definition"]
    semantic_id = query["SemanticId"]
    return_matches = query["ReturnMatches"]

    # str() keeps null / NaN values from crashing .lower(); unknown values
    # fall back to the 'NaN' category.
    unit_categ = _UNIT_CATEGORIES.get(str(query["Unit"]).lower(), 'NaN')
    datatype_categ = _DATATYPE_CATEGORIES.get(str(query["Datatype"]).lower(), 'NaN')
    wanted = (unit_categ, datatype_categ)
    # None when no stored metalabel corresponds to this category pair.
    metadata = next((k for k, v in metalabel.items() if v == wanted), None)

    name_embedding = model.encode(name)
    definition_embedding = model.encode(definition)
    queries = [np.concatenate((definition_embedding, name_embedding), axis=0).tolist()]

    # The query is a semantic search (k-nearest-neighbour). Chroma uses hnswlib
    # for this (https://github.com/nmslib/hnswlib), which supports cosine,
    # squared L2 or inner product as distance. Chroma is configured for L2, see
    # https://github.com/chroma-core/chroma/blob/4463d13f951a4d28ade1f7e777d07302ff09069b/chromadb/db/index/hnswlib.py

    # Homogeneous case: look for the same semantic id; a hit means success.
    homogen = _filtered_query(collection, queries, 1, {"SESemanticId": semantic_id})
    if homogen is not None:
        return [
            _format_match(
                _SEMANTIC_ID_METHOD,
                'None',
                0,
                homogen['documents'][0][0],
                homogen['metadatas'][0][0],
                include_aas=True,
            )
        ]

    # No matching semantic id: continue with NLP, with and without metadata.
    with_metadata = None
    if metadata is not None:
        with_metadata = _filtered_query(
            collection, queries, return_matches, {"Metalabel": metadata}
        )
    without_metadata = collection.query(
        query_embeddings=queries,
        n_results=return_matches,
    )
    print(without_metadata)

    # Compare the best distance with and without the metadata filter.
    result = without_metadata
    method = _NLP_WITHOUT_METADATA
    if with_metadata is not None:
        unfiltered_distances = without_metadata['distances'][0]
        if (not unfiltered_distances
                or with_metadata['distances'][0][0] < unfiltered_distances[0]):
            result = with_metadata
            method = _NLP_WITH_METADATA
    result['matching_method'] = method
    result['matching_algorithm'] = _NLP_ALGORITHM

    # Prepare the final result; iterate over the hits actually returned so a
    # short result list cannot cause an IndexError.
    print(result)
    documents = result['documents'][0][:return_matches]
    return [
        _format_match(method, _NLP_ALGORITHM, distance, document, meta)
        for document, distance, meta in zip(
            documents, result['distances'][0], result['metadatas'][0]
        )
    ]
def get_right_collection(collections, aas_id):
    """Return the collections that contain an entry for ``aas_id``.

    Each collection is asked via ``collection.get(where={'AASId': aas_id})``; it
    counts as a match when the first returned metadata entry has an ``AASId`` key.

    Args:
        collections: Iterable of objects exposing ``get(where=...)`` that returns
            a mapping with a ``metadatas`` list.
        aas_id: Value to filter the ``AASId`` metadata field on.

    Returns:
        The list of matching collections, or ``['AAS not in database']`` when
        none of them matches.
    """
    right_collection = []
    for collection in collections:
        try_collection = collection.get(where={'AASId': aas_id})
        try:
            # Only the presence of the entry matters; the value itself is unused.
            try_collection['metadatas'][0]['AASId']
        except (IndexError, KeyError, TypeError):
            # Empty or missing metadata: this collection does not hold the AAS.
            print('Nix')
        else:
            right_collection.append(collection)
    if not right_collection:
        right_collection = ['AAS not in database']
    return right_collection
# Query one specific AAS
def query_specific_aas(query, metalabel, model, collections, client_chroma):
    """Match ``query`` against the collection that holds the AAS it names.

    ``query`` is serialized to JSON and its ``AASId`` entry is used to pick the
    collection via ``get_right_collection``. When no collection holds that AAS
    the marker list ``['AAS not in database']`` is returned; otherwise the first
    matching collection is fetched by name from ``client_chroma`` and the result
    of ``query_right_aas`` is returned.
    """
    serialized = json.dumps(query, indent=4)
    candidates = get_right_collection(collections, query['AASId'])
    if candidates != ['AAS not in database']:
        target = client_chroma.get_collection(candidates[0].name)
        return query_right_aas(serialized, target, metalabel, model)
    return candidates