# %%
# Imports and logging setup
import logging
import os

import numpy as np
import pandas as pd
import rdflib
import tensorflow as tf

from ampligraph.datasets import (
    GraphDataLoader,
    SQLiteAdapter,
    DataSourceIdentifier,
)
from ampligraph.datasets.graph_partitioner import NaiveGraphPartitioner, BucketGraphPartitioner
from ampligraph.evaluation import hits_at_n_score, mrr_score, train_test_split_no_unseen
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer

# Root logger at DEBUG; cap all existing loggers at INFO to keep the output readable
logging.basicConfig(level=logging.DEBUG)
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    logger.setLevel(logging.INFO)
# Load the triples from the files
X_train = np.load("train.npy")
X_valid = np.load("valid.npy")
X_test = np.load("test.npy")

# Store as CSVs. There are commas in the names of some institutions, so we use
# a tab as the delimiter.
# np.savetxt("train.csv", X_train, delimiter="\t", fmt="%s")
# np.savetxt("valid.csv", X_valid, delimiter="\t", fmt="%s")
# np.savetxt("test.csv", X_test, delimiter="\t", fmt="%s")
#
# print(f"Train size: {X_train.shape[0]}")
# print(f"Valid size: {X_valid.shape[0]}")
# print(f"Test size: {X_test.shape[0]}")
# Initialize a ComplEx neural embedding model: the embedding size is k,
# eta specifies the number of corruptions to generate per positive, and
# scoring_type determines the scoring function of the embedding model.
partitioned_model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")

# Optimizer, loss and regularizer definition
optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = get_loss("pairwise", {"margin": 0.5})
regularizer = get_regularizer("LP", {"p": 2, "lambda": 1e-5})
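# Note: other losses can be swapped in through the same get() helper; for
# instance the self-adversarial loss often pairs well with ComplEx. The
# hyperparameter values below are illustrative, not tuned for this graph:
# loss = get_loss("self_adversarial", {"margin": 3, "alpha": 0.5})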
# Compile the model
partitioned_model.compile(
    optimizer=optim, loss=loss, entity_relation_regularizer=regularizer
)

# For evaluation, we can use a filter to discard true positive statements that
# the corruption procedure would otherwise present as negatives.
# Here we define the filter set by concatenating all the positives.
filter_triples = {"test": np.concatenate((X_train, X_valid, X_test))}
# Early stopping callback: stop when validation Hits@10 stops improving
checkpoint = tf.keras.callbacks.EarlyStopping(
    monitor="val_hits10",
    min_delta=0,
    patience=5,
    verbose=1,
    mode="max",
    restore_best_weights=True,
)
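# Note: any tf.keras callback can be appended alongside early stopping; e.g. a
# checkpoint that snapshots the weights whenever validation Hits@10 improves
# (the filepath below is illustrative):
# model_ckpt = tf.keras.callbacks.ModelCheckpoint(
#     filepath="checkpoints/best", monitor="val_hits10", mode="max",
#     save_best_only=True, save_weights_only=True,
# )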
######
use_db = False
if use_db:
    AMPLIGRAPH_DATA_HOME = os.path.join(os.getcwd(), "data")  # + os.sep
    from ampligraph.datasets.data_indexer import SQLite as SQLiteIndexer, DataIndexer

    # Initialize a GraphDataLoader from the .csv file, backed by SQLite on disk
    sqlite_indexer = SQLiteIndexer(
        data=None,
        db_file="main_partition.db",
        root_directory=AMPLIGRAPH_DATA_HOME,
        name="main_partition",
    )
    indexer = DataIndexer(
        X=None,
        backend_type="sqlite",
        backend=sqlite_indexer,
    )
    dataset_loader = GraphDataLoader(
        "train.csv",
        backend=SQLiteAdapter,
        in_memory=False,
        verbose=True,
        root_directory=AMPLIGRAPH_DATA_HOME,
        db_name="mydb.db",
        use_indexer=indexer,
    )
    # adapter = SQLiteAdapter(
    #     "database_25-12-2023_07-28-41_485047_PM_2a11fc49-2337-415e-8672-2bfa48a83745.db",
    #     identifier=DataSourceIdentifier,
    #     root_directory=AMPLIGRAPH_DATA_HOME,
    # )
    print("Graph data loader initialized")
    # for elem in next(dataset_loader._get_batch_generator()):
    #     print(elem)
    #     break
######
else:
    X_train = np.load("train.npy")
    dataset_loader = GraphDataLoader(
        X_train,
        verbose=True,
        use_indexer=True,
        in_memory=True,
    )
    # Peek at a few batches to confirm the loader yields indexed triples
    print(f"next: {next(dataset_loader)}")
    print(f"next: {next(dataset_loader)}")
    print(f"next: {next(dataset_loader)}")

# x = np.loadtxt(
#     "train.csv",
#     delimiter="\t",
#     dtype=str,
# )
# print(x[0])
# Choose the partitioner - here we use the naive edge-split partitioner
partition = False
if partition:
    print("Will start partitioning now")
    graph_partitioner_train = NaiveGraphPartitioner(dataset_loader, k=6)
    print("Graph partitioner initialized")

# indexer = (
#     partitioned_model.data_handler.get_mapper()
# )  # get the mapper from the trained model
# dataset_loader_test = GraphDataLoader(
#     data_source=X_test,
#     backend=SQLiteAdapter,  # type of backend to use
#     batch_size=400,  # batch size to use while iterating over this dataset
#     dataset_type="test",  # dataset type
#     use_indexer=indexer,  # map test concepts to the indices used during training
#     verbose=True,
# )
# graph_partitioner_test = BucketGraphPartitioner(data=partitioner, k=3)
| print("Will start training now") | |
| # Fit the model on training and validation set | |
| partitioned_model.fit( | |
| #graph_partitioner_train if partition else dataset_loader, | |
| X_train, | |
| batch_size=500, | |
| epochs=45, # Number of training epochs | |
| validation_freq=20, # Epochs between successive validation | |
| validation_burn_in=100, # Epoch to start validation | |
| validation_data=X_test, # Validation data | |
| validation_filter=filter, # Filter positives from validation corruptions | |
| callbacks=[ | |
| checkpoint | |
| ], # Early stopping callback (more from tf.keras.callbacks are supported) | |
| verbose=True, # Enable stdout messages | |
| #partitioning_k=7, # Number of partitions to create | |
| ) | |
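# Note: to actually train with partitioning, fit() can consume the
# GraphDataLoader directly and split the graph on the fly via partitioning_k,
# which is the path the commented-out arguments above sketch (values are
# illustrative):
# partitioned_model.fit(
#     dataset_loader,
#     batch_size=500,
#     epochs=45,
#     partitioning_k=7,  # fit() partitions internally; no manual partitioner needed
# )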
# %%
# Store the model. Calling save_weights through super() bypasses AmpliGraph's
# override and uses the plain tf.keras implementation; save_metadata then
# stores the entity/relation mappings alongside the weights.
super(ScoringBasedEmbeddingModel, partitioned_model).save_weights("model/")
partitioned_model.save_metadata(filedir="model")
# from ampligraph.utils import save_model
# save_model(partitioned_model, model_name_path='model.pkl')
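# Note: the pickle route above has a matching loader; a minimal sketch,
# assuming the model was saved with ampligraph.utils.save_model:
# from ampligraph.utils import restore_model
# restored_model = restore_model(model_name_path="model.pkl")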
# %%
# Create a dataframe of the institutions and their names
import pandas as pd
import rdflib

g = rdflib.Graph()
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)
g.bind("unis", unis)
g.parse("universities_large_1200.ttl", format="turtle")
query_results = g.query(
    """
    SELECT DISTINCT ?institution ?name
    WHERE {
        ?institution a unis:Institution .
        ?institution unis:name ?name .
    }
    """
)
institutions = pd.DataFrame(query_results, columns=["institution", "name"])
institutions["institution"] = institutions["institution"].astype(str)
institutions["name"] = institutions["name"].astype(str)
# Store the dataframe
institutions.to_csv("institutions.csv", index=False)
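# Note: with the names in hand, the learned vectors can be pulled out for
# downstream use such as clustering institutions. A minimal sketch, assuming
# the institution URIs are the entity labels used in the training triples:
# embeddings = partitioned_model.get_embeddings(
#     institutions["institution"].values, embedding_type="e"  # "e" = entity embeddings
# )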
# %%
# Run the evaluation procedure on the test set (with filtering).
# To disable filtering: use_filter=None.
# Usually, we corrupt the subject and object sides separately and compute ranks.
ranks = partitioned_model.evaluate(X_test, use_filter=filter_triples, corrupt_side="s,o")

# Compute and print metrics
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
print("MRR: %f, Hits@10: %f" % (mrr, hits_10))