Spaces:
Sleeping
Sleeping
import numpy as np
import streamlit as st
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer
# Load Romanian BERT model and tokenizer
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'

# Streamlit re-executes this script on every widget interaction, so the
# expensive loads below are wrapped in a resource cache. On Streamlit versions
# without `st.cache_resource`, fall back to a no-op decorator (plain loading).
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_model(name):
    """Return the ``(tokenizer, model)`` pair for the checkpoint *name*."""
    return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)


@_cache_resource
def _load_corpus():
    """Return the pre-saved ``(embeddings, sentences)`` arrays from disk.

    The returned arrays are shared between reruns and must be treated as
    read-only.
    """
    return np.load("sentence_embeddings.npy"), np.load("sentences.npy")


tokenizer, model = _load_model(model_name)

# Load pre-saved embeddings and sentences
saved_embeddings, sentences = _load_corpus()
def get_sentence_embedding(sentence: str, model, tokenizer) -> np.ndarray:
    """Return the first-token (CLS) embedding of *sentence* as a NumPy array.

    Args:
        sentence: Text to embed. It is truncated to at most 128 tokens.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor
            indexed as (batch, token, hidden).
        tokenizer: Callable that turns *sentence* into a mapping of PyTorch
            tensors (it is called with ``return_tensors='pt'``).

    Returns:
        A 2-D array holding, for each batch row, the hidden state at token
        position 0 (presumably one row for a single input string).
    """
    inputs = tokenizer(
        sentence,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    # Inference only: skip gradient tracking so the result converts to NumPy.
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token embedding
    return cls_embedding.numpy()
# Streamlit UI
st.title("Sentence Similarity with Pre-trained BERT")
st.write("Enter a sentence in Romanian to find similar sentences.")

# User input
user_input = st.text_input("Your sentence")

# Only search when the user typed something other than whitespace.
if user_input and user_input.strip():
    # Embed the user input
    user_embedding = get_sentence_embedding(user_input.strip(), model, tokenizer)

    # Compute similarity with saved embeddings; the stored array is flattened
    # to one row per sentence so it matches the 2-D query embedding.
    corpus_matrix = saved_embeddings.reshape(saved_embeddings.shape[0], -1)
    similarities = cosine_similarity(user_embedding, corpus_matrix)
    scores = similarities[0]  # single query -> single row of scores

    # Get the top 5 most similar sentences (highest score first)
    top_n = 5
    top_indices = np.argsort(scores)[::-1][:top_n]

    st.write("Top similar sentences:")
    # Display the most similar sentences with similarity scores
    for idx in top_indices:
        st.write(f"Sentence: {sentences[idx]}")
        st.write(f"Similarity score: {scores[idx]:.4f}")