# Spaces: Sleeping
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
# Load CSV file
csv_file = "your_file.csv"  # Path to your CSV file
df = pd.read_csv(csv_file)

# Assuming the column containing sentences is named 'text'.
# read_csv represents empty cells as NaN (a float), which the tokenizer does
# not accept as text, so missing values become empty strings and every entry
# is coerced to str. The list keeps one entry per DataFrame row.
sentences = df['text'].fillna('').astype(str).tolist()

# Load Romanian BERT model and tokenizer
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
def get_sentence_embedding(sentence, model, tokenizer, max_length=128):
    """Return the first-token (CLS) embedding of ``sentence`` as a NumPy array.

    Args:
        sentence: Text handed to ``tokenizer`` unchanged.
        model: Callable that accepts the tokenizer's output as keyword
            arguments and returns an object with a ``last_hidden_state``
            tensor.
        tokenizer: Callable that turns ``sentence`` into a mapping of
            PyTorch tensors (it is invoked with ``return_tensors='pt'``).
        max_length: Truncation limit forwarded to the tokenizer. Defaults
            to 128, the value that was previously hard-coded.

    Returns:
        numpy.ndarray: ``last_hidden_state[:, 0, :]``, i.e. the hidden
        state at sequence position 0 with the leading batch axis kept.
    """
    inputs = tokenizer(
        sentence,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip gradient tracking so the result converts to NumPy.
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token embedding
    return cls_embedding.numpy()
# Generate embeddings for all sentences
embeddings = [get_sentence_embedding(sentence, model, tokenizer) for sentence in sentences]

# Convert to a single 2-D numpy array with one row per sentence. Each call
# above returns an array with a leading batch axis, so joining along axis 0
# stacks the rows. An empty input has nothing to join, so build an empty
# matrix of the model's hidden width instead of failing.
if embeddings:
    embeddings = np.concatenate(embeddings, axis=0)
else:
    embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)

# Save embeddings to a file (optional)
np.save("sentence_embeddings.npy", embeddings)

# Save sentences for reference (optional): each row's embedding is stored as
# a Python list in a new 'embeddings' column next to the original data.
df['embeddings'] = embeddings.tolist()
df.to_csv("embeddings_with_text.csv", index=False)