Spaces:

sunil448832
/

retrieval-augment-generation

Runtime error

retrieval-augment-generation / models /embedding_models.py

Initial Commit

eccde2c over 2 years ago

1.69 kB

	import torch
	from transformers import AutoTokenizer, AutoModel
	import torch.nn.functional as F

	# Create a class for embedding sentences using Hugging Face Transformers
	class EmbeddingModel:
	def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
	# Initialize the model with the given model_name
	self.tokenizer = AutoTokenizer.from_pretrained(model_name)
	self.model = AutoModel.from_pretrained(model_name)
	# Get the embedding dimension from the model's output
	self.embedding_dim = self.encode('Hi').shape[1]

	def _mean_pooling(self, model_output, attention_mask):
	# Calculate mean pooling of token embeddings
	token_embeddings = model_output[0]
	input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
	embedding = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
	return embedding

	def encode(self, text):
	# Encode a text into sentence embeddings
	inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
	with torch.no_grad():
	outputs = self.model(**inputs)
	sentence_embeddings = self._mean_pooling(outputs, inputs['attention_mask'])
	sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1).numpy().astype('float32')
	return sentence_embeddings

	if __name__ == '__main__':
	# Sentences we want sentence embeddings for
	sentences = ['This is an example sentence', 'Each sentence is converted']
	# Print the embedding dimension of the model
	print(EmbeddingModel().embedding_dim)