Spaces:
Build error
Build error
| from distutils.filelist import FileList | |
| import json | |
| from datasets import load_dataset | |
| from sklearn.model_selection import train_test_split | |
| import pandas as pd | |
| import os | |
| from json import encoder | |
| from codeScripts.utils import save_json, load_json, create_file_path | |
| # Variables that still need to be made configurable: | |
| # fileList: the per-subject JSON files that PreparingDataSet() reads and joins. | |
| fileList = ["__appcache__/biConNotaAnon.json", "__appcache__/metodos-de-captura-conNota-Anon.json"] | |
| # path_created_dataset: where PreparingDataSet() saves the joined dataset; SentTransf_train and SentTransf_test load their partitions from it. | |
| path_created_dataset = create_file_path("JoinedSubjects2.json", doctype=4) | |
| # ...and the parameters of SentTransf_train. | |
def getJsonInfo(fileName):
    """Load one subject file and flatten its records into training samples.

    Args:
        fileName: Path handed to ``load_json``. The loaded value is iterated
            as a sequence of records (presumably a list of dicts -- confirm
            against ``load_json``).

    Returns:
        A list with one dict per record, holding the keys
        ``responseTeacher``, ``responseStudent``, ``mark``, ``hashed_id``
        and ``keywords``.

    Raises:
        KeyError: If a dict record lacks ``hashed_id``, ``nota``,
            ``respuesta`` or ``metadata`` (with ``keywords`` and
            ``minipreguntas``/``minirespuesta`` inside it).
    """
    samples = []
    for record in load_json(fileName):
        metadata = record['metadata']
        # The teacher's reference answer is every mini-answer glued together
        # in order, with no separator between them.
        responseTeacher = "".join(
            mini_question['minirespuesta']
            for mini_question in metadata['minipreguntas']
        )
        # The question text (metadata['enunciado']) is not part of a sample,
        # so it is not read here.
        samples.append({
            'responseTeacher': responseTeacher,
            'responseStudent': record['respuesta'],
            'mark': record['nota'],
            'hashed_id': record['hashed_id'],
            'keywords': metadata['keywords'],
        })
    return samples
def PreparingDataSet():
    """Join every file in ``fileList`` into one dataset, split it and save it.

    The samples returned by ``getJsonInfo`` for each file are pooled, then
    split twice with ``train_test_split``: 30% of all samples become the test
    partition, and 10% of the remaining samples become the validation
    partition. The result is written to ``path_created_dataset`` through
    ``save_json`` as a dict with the keys ``train``, ``test`` and ``valid``.

    No ``random_state`` is passed to ``train_test_split``, so the partitions
    are presumably different on every run -- pass a seed there if
    reproducibility is needed.

    Returns:
        None. The only result is the file written by ``save_json``.
    """
    # Pool the samples of every subject file. Starting from an empty list
    # keeps the name bound even when ``fileList`` is empty, so the failure
    # then comes from train_test_split instead of an unbound-variable error.
    subjectFileList = []
    for subject in fileList:
        subjectFileList.extend(getJsonInfo(subject))

    # Split the pooled samples into train, validation and test data.
    data_train, data_test = train_test_split(subjectFileList, test_size=0.3)
    data_train, data_valid = train_test_split(data_train, test_size=0.1)

    data = {
        'train': data_train,
        'test': data_test,
        'valid': data_valid,
    }
    save_json(path_created_dataset, data)
| import json | |
| import math | |
| import pandas as pd | |
| from datasets import load_dataset,Dataset,DatasetDict | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error, r2_score, roc_curve | |
| from sentence_transformers import SentenceTransformer, InputExample, losses, util, evaluation, models | |
| from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator | |
| from torch.utils.data import DataLoader | |
| from torch import nn | |
| import datasets | |
| import sklearn | |
| import sentence_transformers | |
| import torch | |
class SentTransf_train():
    """Fine-tune sentence-transformer checkpoints on the joined dataset.

    Instantiating the class runs the whole sweep: the three partitions are
    loaded from ``path_created_dataset``, an evaluator is built from the
    validation partition, and every requested checkpoint is trained once per
    requested epoch count.
    """

    # Default sweep, kept as immutable class-level tuples so that no mutable
    # object is shared through the constructor's default arguments.
    _DEFAULT_MODELS = (
        {"checkPoint": "distiluse-base-multilingual-cased-v1", "fromScratch": False},
        {"checkPoint": "paraphrase-multilingual-MiniLM-L12-v2", "fromScratch": False},
        {"checkPoint": "paraphrase-multilingual-mpnet-base-v2", "fromScratch": False},
        {"checkPoint": "all-distilroberta-v1", "fromScratch": False},
        {"checkPoint": "bert-base-multilingual-uncased", "fromScratch": True},
        {"checkPoint": "dccuchile/bert-base-spanish-wwm-uncased", "fromScratch": True},
    )
    _DEFAULT_EPOCHS = (1, 5, 10, 30, 50, 100)

    def __init__(self, modelsToTrain=None, epochsToTest=None, saving_path=create_file_path('', doctype=4)):
        """Load the data and train every model / epoch-count combination.

        Args:
            modelsToTrain: Iterable of dicts with the keys ``checkPoint``
                (model id or path) and ``fromScratch`` (build the sentence
                model from a plain transformer instead of loading a ready
                SentenceTransformer). ``None`` selects the default sweep.
            epochsToTest: Iterable of epoch counts; each model is trained
                once per value. ``None`` selects ``1, 5, 10, 30, 50, 100``.
            saving_path: Prefix that output paths are appended to by plain
                string concatenation (presumably ends with a path separator
                -- confirm against ``create_file_path``).
        """
        if modelsToTrain is None:
            modelsToTrain = self._DEFAULT_MODELS
        if epochsToTest is None:
            epochsToTest = self._DEFAULT_EPOCHS

        self.saving_path = saving_path
        self.data_train = self.__getDatasetPartition(path_created_dataset, "train")
        self.data_test = self.__getDatasetPartition(path_created_dataset, "test")
        self.data_valid = self.__getDatasetPartition(path_created_dataset, "valid")

        # One evaluator, built from the validation partition, is shared by
        # every training run.
        evaluator = self.__CreateModelEvaluationData()

        # Train the models.
        for model in modelsToTrain:
            for epochs in epochsToTest:
                self.__TrainModel(model["checkPoint"], evaluator, epochs, model["fromScratch"])

    def __getDatasetPartition(self, fileName, split):
        """Return one partition of the saved dataset as ``InputExample`` objects.

        Args:
            fileName: Path of the JSON file to load.
            split: Name of the partition to read (``"train"``, ``"test"`` or
                ``"valid"`` in this class).

        Returns:
            A list of ``InputExample`` whose texts are
            ``[responseTeacher, responseStudent]`` and whose label is the mark.
        """
        dataset = load_dataset("json", data_files=fileName, split="train")
        # The partition is read as the first row of the column named `split`
        # (presumably because the file is one JSON object whose keys are the
        # partition names -- confirm). It is fetched once, outside the loop,
        # instead of being looked up again for every record.
        records = dataset[split][0]
        return [
            InputExample(
                texts=[record['responseTeacher'], record['responseStudent']],
                label=record['mark'],
            )
            for record in records
        ]

    def __CreateModelEvaluationData(self):
        """Build an ``EmbeddingSimilarityEvaluator`` from the validation partition.

        Returns:
            An evaluator over the teacher texts, the student texts and the
            marks of ``self.data_valid``, in that order.
        """
        sentences1 = [example.texts[0] for example in self.data_valid]
        sentences2 = [example.texts[1] for example in self.data_valid]
        scores = [example.label for example in self.data_valid]
        return evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

    def __TrainModel(self, checkpoint, evaluator, epochs, fromScratch):
        """Train one checkpoint for ``epochs`` epochs and save the result.

        Args:
            checkpoint: Model id or path to start from.
            evaluator: Evaluator passed to ``model.fit``.
            epochs: Number of training epochs.
            fromScratch: When true, the sentence model is assembled from a
                plain transformer, a pooling layer and a 256-unit Tanh dense
                layer; otherwise ``checkpoint`` is loaded directly as a
                SentenceTransformer.
        """
        # One batch is 10% of the training partition. Clamp to 1 so that a
        # partition with fewer than 10 samples does not yield a batch size
        # of 0, which DataLoader rejects.
        batch_size = max(1, int(len(self.data_train) * 0.1))

        # Create the model from the checkpoint.
        if not fromScratch:
            model = SentenceTransformer(checkpoint)
        else:
            word_embedding_model = models.Transformer(checkpoint, max_seq_length=256)
            pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
            dense_model = models.Dense(
                in_features=pooling_model.get_sentence_embedding_dimension(),
                out_features=256,
                activation_function=nn.Tanh(),
            )
            model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

        train_dataloader = DataLoader(self.data_train, shuffle=True, batch_size=batch_size)
        train_loss = losses.CosineSimilarityLoss(model)

        # Hub ids such as "dccuchile/bert-base-spanish-wwm-uncased" contain a
        # slash. It is flattened to "_" so the best-model directory is
        # "Model_dccuchile_bert-...", the name SentTransf_test looks for,
        # rather than a nested "Model_dccuchile/bert-..." directory.
        safe_checkpoint = checkpoint.replace('/', '_')
        local_model_path = self.saving_path + 'Model_' + safe_checkpoint + '/' + str(epochs) + '_Epochs'

        # Warm up over 10% of the total number of training steps.
        warmup_steps = math.ceil(len(train_dataloader) * epochs * 0.1)
        # Evaluate every 10% of an epoch's batches. This rounds down to 0 with
        # fewer than 10 batches (presumably fit() then evaluates only at the
        # end of each epoch -- confirm against the library documentation).
        evaluation_steps = int(len(train_dataloader) * 0.1)
        print(len(train_dataloader), warmup_steps, evaluation_steps)

        # Fit the model; the best-scoring state is written to local_model_path.
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=epochs,
            warmup_steps=warmup_steps,
            evaluator=evaluator,
            evaluation_steps=evaluation_steps,
            output_path=local_model_path,
            save_best_model=True,
        )

        # Also keep the final state of the model. This path still uses the
        # raw checkpoint id, exactly as before.
        os.makedirs(self.saving_path + "models", exist_ok=True)
        model.save(self.saving_path + "models/" + checkpoint + str("-Epochs-") + str(epochs))
| import json | |
| import math | |
| import pandas as pd | |
| from datasets import load_dataset,Dataset,DatasetDict | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error, r2_score, roc_curve | |
| from sentence_transformers import SentenceTransformer, InputExample, losses, util, evaluation | |
| from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator | |
| from torch.utils.data import DataLoader | |
class SentTransf_test():
    """Score the fine-tuned models on the test partition and export the results.

    For every model / epoch-count combination the class loads the model saved
    under ``<save_path>/Model_<name>/<epochs>_Epochs``, computes the cosine
    similarity of each teacher/student answer pair, and writes per-model score
    files plus one metrics file under ``<save_path>/tests``.
    """

    # Default sweep, kept as immutable class-level tuples so that no mutable
    # object is shared through the constructor's default arguments.
    _DEFAULT_MODELS = (
        'distiluse-base-multilingual-cased-v1',
        'paraphrase-multilingual-MiniLM-L12-v2',
        'paraphrase-multilingual-mpnet-base-v2',
        'all-distilroberta-v1',
        'bert-base-multilingual-uncased',
        'dccuchile_bert-base-spanish-wwm-uncased',
    )
    _DEFAULT_EPOCHS = (1, 5, 10, 30, 50, 100)

    def __init__(self, modelsToTest=None, epochsToTest=None, save_path=create_file_path('', doctype=4)[:-1]):
        """Store the sweep configuration and load the test partition.

        Args:
            modelsToTest: Model directory names without the ``Model_``
                prefix. ``None`` selects the default list.
            epochsToTest: Epoch counts to test for every model. ``None``
                selects ``1, 5, 10, 30, 50, 100``.
            save_path: Base directory, without a trailing separator; model
                and result paths are built from it by string concatenation.
        """
        self.modelsToTest = list(self._DEFAULT_MODELS) if modelsToTest is None else modelsToTest
        self.epochsToTest = list(self._DEFAULT_EPOCHS) if epochsToTest is None else epochsToTest
        self.save_path = save_path
        self.data_test = self.__getDatasetPartition(path_created_dataset, "test")

    @staticmethod
    def _decimal_comma(value):
        """Return ``str(value)`` with every '.' replaced by ','."""
        return str(value).replace('.', ',')

    def similarity(self, model_path, text1, text2):
        """Return the cosine similarity between ``text1`` and ``text2``.

        Args:
            model_path: Path or id of the SentenceTransformer to load; the
                model is loaded anew on every call.
            text1: Input passed to ``model.encode`` for the first side.
            text2: Input passed to ``model.encode`` for the second side.

        Returns:
            The result of ``util.cos_sim`` on the two embeddings.
        """
        model = SentenceTransformer(model_path)
        # Compute the embeddings of both inputs.
        embeddings1 = model.encode(text1, convert_to_tensor=True)
        embeddings2 = model.encode(text2, convert_to_tensor=True)
        # Compute the cosine similarities.
        return util.cos_sim(embeddings1, embeddings2)

    def test_model(self):
        """Test every configured model and write the result files.

        Writes ``<save_path>/tests/Model_<name>_Scores_Results.csv`` for each
        model (one ``Score_<epochs>`` column per epoch count) and
        ``<save_path>/tests/All_Metrics_Results.csv`` with one row per
        model / epoch-count combination. The metric lists are also kept on
        the instance (``self.metricMAE`` and so on).
        """
        os.makedirs(self.save_path + '/tests', exist_ok=True)

        self.model_name = []
        self.epochs = []
        self.metricMAE = []
        self.metricMSE = []
        self.metricRMSE = []
        self.metricRMSLE = []
        self.metricMAPE = []
        self.metricR2 = []

        for checkpoint in self.modelsToTest:
            checkpoint = 'Model_' + checkpoint
            df = pd.DataFrame(columns=['Sentence1', 'Sentence2', 'Hashed_id', 'Mark'])
            for epochs in self.epochsToTest:
                # __TestModel fills `df` in place and appends one entry to
                # self.epochs and to every metric list.
                self.__TestModel(checkpoint, self.data_test, epochs, df)
                self.model_name.append(checkpoint)
            # Save the score results file of this model.
            df.to_csv(self.save_path + '/tests/' + checkpoint + '_Scores_Results.csv', index=False, sep=';', encoding='utf-8')

        # Build the metrics table once, after the sweep, so it also exists
        # (empty) when no model was configured.
        metric_columns = ['Model', 'Epochs', 'MAE', 'MSE', 'RMSE', 'RMSLE', 'MAPE', 'R2']
        dfMetrics = pd.DataFrame(
            {
                'Model': self.model_name,
                'Epochs': self.epochs,
                'MAE': self.metricMAE,
                'MSE': self.metricMSE,
                'RMSE': self.metricRMSE,
                'RMSLE': self.metricRMSLE,
                'MAPE': self.metricMAPE,
                'R2': self.metricR2,
            },
            columns=metric_columns,
        )
        dfMetrics.to_csv(self.save_path + '/tests/All_Metrics_Results.csv', index=False, sep=';', encoding='utf-8')

    def __getDatasetPartition(self, fileName, split):
        """Return one partition of the saved dataset as ``InputExample`` objects.

        Args:
            fileName: Path of the JSON file to load.
            split: Name of the partition to read (``"test"`` in this class).

        Returns:
            A list of ``InputExample`` with ``guid`` set to the record's
            ``hashed_id``, texts ``[responseTeacher, responseStudent]`` and
            the mark as label.
        """
        dataset = load_dataset("json", data_files=fileName, split="train")
        # The partition is read as the first row of the column named `split`
        # (presumably because the file is one JSON object whose keys are the
        # partition names -- confirm). It is fetched once, outside the loop,
        # instead of being looked up again for every record.
        records = dataset[split][0]
        return [
            InputExample(
                guid=record['hashed_id'],
                texts=[record['responseTeacher'], record['responseStudent']],
                label=record['mark'],
            )
            for record in records
        ]

    def __TestModel(self, checkpoint, data, epochs, df):
        """Score one saved model on ``data`` and record scores and metrics.

        Args:
            checkpoint: Model directory name, including the ``Model_`` prefix.
            data: Sequence of ``InputExample`` to score.
            epochs: Epoch count selecting the ``<epochs>_Epochs`` directory.
            df: DataFrame updated in place with the ids, marks, sentences and
                a ``Score_<epochs>`` column.
        """
        # Load the model.
        local_model_path = self.save_path + '/' + checkpoint + '/' + str(epochs) + '_Epochs'
        model = SentenceTransformer(local_model_path)

        sentences1 = [example.texts[0] for example in data]
        sentences2 = [example.texts[1] for example in data]

        # Compute the embeddings of both lists.
        embeddings1 = model.encode(sentences1, convert_to_tensor=True)
        embeddings2 = model.encode(sentences2, convert_to_tensor=True)

        # cos_sim returns all-pairs similarities; only the diagonal (pair i
        # against pair i) is used.
        cosine_scores = util.cos_sim(embeddings1, embeddings2)

        hashed_ids = [example.guid for example in data]
        marksFloat = [example.label for example in data]
        scoresFloat = [round(cosine_scores[i][i].item(), 3) for i in range(len(sentences1))]
        # The CSV files store numbers as text with a decimal comma.
        marks = [self._decimal_comma(mark) for mark in marksFloat]
        scores = [self._decimal_comma(score) for score in scoresFloat]

        # Save the scores in the per-model frame.
        df['Hashed_id'] = hashed_ids
        df['Mark'] = marks
        df['Score_' + str(epochs)] = scores
        df['Sentence1'] = sentences1
        df['Sentence2'] = sentences2
        self.epochs.append(str(epochs))

        # Calculate the metrics 'MAE', 'MSE', 'RMSE', 'RMSLE', 'MAPE', 'R2'.
        # RMSE is derived from the MSE with math.sqrt rather than through the
        # `squared` keyword, which newer scikit-learn releases no longer accept.
        mse = mean_squared_error(marksFloat, scoresFloat)
        self.metricMAE.append(self._decimal_comma(mean_absolute_error(marksFloat, scoresFloat)))
        self.metricMSE.append(self._decimal_comma(mse))
        self.metricRMSE.append(self._decimal_comma(math.sqrt(mse)))
        try:
            # Root of the mean squared log error, so the value matches the
            # RMSLE column it is stored under.
            rmsle = math.sqrt(mean_squared_log_error(marksFloat, scoresFloat))
        except ValueError:
            # The log error is undefined for negative values, and cosine
            # scores can be negative; record a placeholder in that case.
            self.metricRMSLE.append('-')
        else:
            self.metricRMSLE.append(self._decimal_comma(rmsle))
        self.metricMAPE.append(self._decimal_comma(mean_absolute_percentage_error(marksFloat, scoresFloat)))
        self.metricR2.append(self._decimal_comma(r2_score(marksFloat, scoresFloat)))

        # Evaluate the model on the same data with the library evaluator,
        # which writes its own results under the tests directory.
        batch_size = 15  # fixed encoding batch size for the evaluator
        test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(data, batch_size=batch_size, name=checkpoint)
        test_evaluator(model, output_path=self.save_path + '/tests/')