from transformers import AutoModelForMaskedLM, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import torch
# Load two fine-tuned DistilBERT checkpoints plus the base model, with
# output_hidden_states=True so the embeddings needed for pooling are exposed.
model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2022"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, output_hidden_states=True)
model_base = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased", output_hidden_states=True)
model_2019_2022 = AutoModelForMaskedLM.from_pretrained(
    "vives/distilbert-base-uncased-finetuned-cvent-2019_2022", output_hidden_states=True
)
# all three checkpoints derive from distilbert-base-uncased, so one tokenizer suffices
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

text1 = st.text_area("Enter first sentence")
text2 = st.text_area("Enter second sentence")

def concat_tokens(t1, t2):
    tokens = {'input_ids': [], 'attention_mask': []}
    sentences = [t1, t2]
    for sentence in sentences:
        # encode each sentence and append to dictionary
        new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                           truncation=True, padding='max_length',
                                           return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    # reformat list of tensors into single tensor
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    return tokens
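
# Quick shape check (illustrative only, not part of the app): with two input
# sentences and padding='max_length', concat_tokens returns a batch of two
# fixed-length sequences, e.g.
#   t = concat_tokens("hello world", "goodbye world")
#   t['input_ids'].shape       # torch.Size([2, 128])
#   t['attention_mask'].shape  # torch.Size([2, 128])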

def pool_embeddings(out, tok):
    # last-layer hidden states: (batch, seq_len, hidden)
    embeddings = out["hidden_states"][-1]
    attention_mask = tok['attention_mask']
    # broadcast the attention mask across the hidden dimension
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    # zero out embeddings at padding positions
    masked_embeddings = embeddings * mask
    # sum over the sequence and divide by the count of real tokens,
    # clamped to avoid division by zero
    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / summed_mask
    return mean_pooled
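
# Illustrative sketch of what pool_embeddings computes: an attention-masked
# mean over token embeddings, so [PAD] positions do not dilute the sentence
# vector. For DistilBERT the last hidden state here has shape (2, 128, 768),
# and pooling yields one 768-dim vector per sentence (reusing t from above):
#   pooled = pool_embeddings(model(**t), t)
#   pooled.shape  # torch.Size([2, 768])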

if text1 and text2:
    with torch.no_grad():
        # embed both sentences with each model, then compare the two
        # sentence vectors with cosine similarity
        tokens = concat_tokens(text1, text2)
        outputs = model(**tokens)
        mean_pooled = pool_embeddings(outputs, tokens).detach().numpy()
        fine_tuned_out = cosine_similarity(
            [mean_pooled[0]],
            mean_pooled[1:]
        )[0][0]
        outputs_base = model_base(**tokens)
        mean_pooled_base = pool_embeddings(outputs_base, tokens).detach().numpy()
        base_out = cosine_similarity(
            [mean_pooled_base[0]],
            mean_pooled_base[1:]
        )[0][0]
        outputs_2019_2022 = model_2019_2022(**tokens)
        mean_pooled_2019_2022 = pool_embeddings(outputs_2019_2022, tokens).detach().numpy()
        fine_tuned_out2 = cosine_similarity(
            [mean_pooled_2019_2022[0]],
            mean_pooled_2019_2022[1:]
        )[0][0]
        st.write(f">>>Similarity for fine-tuned (2019-2022) {fine_tuned_out2}")
        st.write(f">>>Similarity for fine-tuned (2022) {fine_tuned_out}")
        st.write(f">>>Similarity for base {base_out}")