Spaces:
Sleeping
Sleeping
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | |
| BASE_DIR = Path(__file__).resolve().parent.parent | |
def load_model(model_name):
    """Load a pre-trained SentenceTransformer identified by *model_name*.

    NOTE(review): an identically-named function is defined again near the
    bottom of this module; that later definition shadows this one at import
    time (both behave the same).
    """
    return SentenceTransformer(model_name)
def get_embeddings(model, texts):
    """Encode *texts* with the given SentenceTransformer *model*.

    Returns whatever ``model.encode`` produces (a numpy array for the stock
    SentenceTransformer implementation).
    """
    # convert_to_tensor=True was considered and deliberately left off
    return model.encode(texts)
# Function to get embeddings from a pre-trained model - requires a lot of memory
def get_transformes_embeddings(text, model, tokenizer):
    """Embed *text* with a raw transformers model via mean pooling.

    Args:
        text: A string (or list of strings) to embed.
        model: A transformers ``AutoModel`` (or compatible callable).
        tokenizer: The tokenizer matching *model*.

    Returns:
        numpy array of shape (batch, hidden_dim), one vector per input.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Inference only: disabling autograd skips gradient bookkeeping and
    # substantially reduces the memory this forward pass needs.
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over the token axis yields a single vector per sentence
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()
# Using batch processing - for low memory
def batch_process_transformes_embeddings(sentences, model, tokenizer, batch_size=16, max_length=128):
    """Embed *sentences* in batches with a raw transformers model (mean pooling).

    Args:
        sentences: List of strings to embed.
        model: A transformers ``AutoModel`` (or compatible callable).
        tokenizer: The tokenizer matching *model*.
        batch_size: Number of sentences per forward pass (default 16).
        max_length: Truncation length for the tokenizer (default 128).

    Returns:
        numpy array of shape (len(sentences), hidden_dim).
    """
    if not sentences:
        # np.vstack([]) raises ValueError; return an empty array instead.
        return np.empty((0, 0))
    all_embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        # Inference only: no_grad avoids autograd memory overhead per batch.
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean pooling over the token axis -> one vector per sentence
        all_embeddings.append(outputs.last_hidden_state.mean(dim=1).detach().numpy())
    return np.vstack(all_embeddings)
def fine_tune_and_save_model(model_name, dataset, epochs=1, batch_size=16, warmup_steps=100):
    """Fine-tune a SentenceTransformer on (utterance, intent) pairs and save it.

    Args:
        model_name: Name/path of the base SentenceTransformer model.
        dataset: pandas DataFrame with 'utterance' and 'intent' columns.
        epochs: Number of training epochs (default 1, matching previous behavior).
        batch_size: Training batch size (default 16, matching previous behavior).
        warmup_steps: Learning-rate warmup steps (default 100, matching previous behavior).

    Returns:
        The fine-tuned model, also saved under
        BASE_DIR/output/fine-tuned-model/<model_name>.
    """
    # Initialize the pre-trained model
    model = SentenceTransformer(model_name)
    # Pair each utterance with its intent. MultipleNegativesRankingLoss treats
    # every pair as a positive and draws negatives from the batch, so the
    # label value is not used by the loss.
    train_examples = [
        InputExample(texts=[row['utterance'], row['intent']], label=1.0)
        for _, row in dataset.iterrows()
    ]
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model)
    # Fine-tune the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
    )
    # Save the fine-tuned model (BASE_DIR is already a Path; no re-wrapping needed)
    path = BASE_DIR / "output" / "fine-tuned-model" / model_name
    model.save(str(path))
    return model
def load_model(model_path):
    """Load a (possibly fine-tuned) SentenceTransformer from *model_path*.

    NOTE(review): this shadows the identically-named function defined earlier
    in this module; both definitions behave the same.
    """
    # SentenceTransformer accepts either a hub model name or a local save path.
    return SentenceTransformer(model_path)