# NOTE(review): the three lines that were here ("Spaces:" / "Runtime error" x2)
# were pasted console/output residue, not Python; kept as a comment so the file parses.
"""
DONE:
- Separate the vectorizer part from the Classifier
- Add an LSTM to the Classifier
- Train the Classifier
TO DO:
- Improve the model's results
"""
import logging
import os
import random
from typing import Sequence

import torch

import dataloader
from model import Decoder, Encoder, EncoderDecoderModel
from train import train_network

# Logging levels: DEBUG < INFO < WARNING < ERROR < CRITICAL
logging.basicConfig(level=logging.INFO)
logging.disable(level=10)  # silence DEBUG (level 10) and below

# Required for deterministic cuBLAS behaviour (work machine);
# equivalent to `export CUBLAS_WORKSPACE_CONFIG=:16:8` in git bash.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

# from datasets import load_dataset
### OPEN DATASET ###
# dataset = load_dataset("newsroom", data_dir=DATA_PATH, data_files="data/train.jsonl")
# NOTE(review): these run at import time (module-level side effects);
# they read the extract files relative to the current working directory.
data1 = dataloader.Data("data/train_extract.jsonl")
data2 = dataloader.Data("data/dev_extract.jsonl")
train_dataset = data1.make_dataset()
dev_dataset = data2.make_dataset()
words = data1.get_words()
vectoriser = dataloader.Vectoriser(words)
word_counts = vectoriser.word_count
def predict(model, tokens: Sequence[str]) -> Sequence[str]:
    """Predict the output sequence for a tokenized input sequence.

    Relies on the module-level ``vectoriser`` for encoding/decoding and on
    the module-level ``device`` (bound inside the ``__main__`` guard), so it
    is only callable when the module is run as a script.

    :param model: trained encoder/decoder network (callable on an index tensor)
    :param tokens: tokenized input text
    :return: decoded token sequence produced by the model
    """
    # Encode the tokens and move the index tensor to the model's device.
    words_idx = vectoriser.encode(tokens).to(device)
    # No gradient computation here: this is inference only.
    with torch.no_grad():
        # Equivalent to model.forward(words_idx) when called outside the class.
        out = model(words_idx)
    # FIX: the original called .to(device) on the output twice; the model's
    # output already lives on `device`, so both transfers were redundant no-ops.
    return vectoriser.decode(out)
if __name__ == "__main__":
    ### NEURAL NETWORK ###
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device check. You are using:", device)

    ### TRAINED NETWORK ###
    # Make results reproducible across runs.
    torch.use_deterministic_algorithms(True)
    torch.manual_seed(0)
    random.seed(0)

    # The encoder can also be trained separately.
    # +1 in the vocabulary size leaves room for an extra (e.g. padding) index.
    encoder = Encoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
    decoder = Decoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)

    # If they are trained, they can be saved.
    # BUG FIX: the original saved encoder.state_dict() twice to the same file;
    # the second save now correctly persists the decoder's weights.
    torch.save(encoder.state_dict(), "model/encoder.pt")
    torch.save(decoder.state_dict(), "model/decoder.pt")

    trained_classifier = EncoderDecoderModel(encoder, decoder, device).to(device)
    print(next(trained_classifier.parameters()).device)
    # print(train_dataset.is_cuda)

    # Vectorize every row of the train/dev DataFrames and train for 5 epochs.
    train_network(
        trained_classifier,
        [vectoriser.vectorize(row) for _, row in train_dataset.iterrows()],
        [vectoriser.vectorize(row) for _, row in dev_dataset.iterrows()],
        5,
    )

    torch.save(trained_classifier.state_dict(), "model/model.pt")

    print(f'test text : {dev_dataset.iloc[6]["summary"]}')
    print(
        f'test prediction : {predict(trained_classifier, dev_dataset.iloc[6]["text"])}'
    )