Spaces:
Build error
Build error
Upload folder using huggingface_hub
Browse files- app.py +2 -4
- masking_methods.py +44 -1
app.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
from transformers import AutoTokenizer
|
| 2 |
from transformers import AutoModelForSeq2SeqLM
|
| 3 |
import plotly.graph_objs as go
|
|
@@ -26,7 +28,6 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaske
|
|
| 26 |
import random
|
| 27 |
from nltk.corpus import stopwords
|
| 28 |
from termcolor import colored
|
| 29 |
-
import nltk
|
| 30 |
from nltk.translate.bleu_score import sentence_bleu
|
| 31 |
from transformers import BertTokenizer, BertModel
|
| 32 |
import gradio as gr
|
|
@@ -36,9 +37,6 @@ from lcs import find_common_subsequences
|
|
| 36 |
from highlighter import highlight_common_words, highlight_common_words_dict
|
| 37 |
from entailment import analyze_entailment
|
| 38 |
|
| 39 |
-
nltk.download('stopwords')
|
| 40 |
-
|
| 41 |
-
|
| 42 |
# Function for the Gradio interface
|
| 43 |
def model(prompt):
|
| 44 |
sentence = prompt
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
nltk.download('stopwords')
|
| 3 |
from transformers import AutoTokenizer
|
| 4 |
from transformers import AutoModelForSeq2SeqLM
|
| 5 |
import plotly.graph_objs as go
|
|
|
|
| 28 |
import random
|
| 29 |
from nltk.corpus import stopwords
|
| 30 |
from termcolor import colored
|
|
|
|
| 31 |
from nltk.translate.bleu_score import sentence_bleu
|
| 32 |
from transformers import BertTokenizer, BertModel
|
| 33 |
import gradio as gr
|
|
|
|
| 37 |
from highlighter import highlight_common_words, highlight_common_words_dict
|
| 38 |
from entailment import analyze_entailment
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
# Function for the Gradio interface
|
| 41 |
def model(prompt):
|
| 42 |
sentence = prompt
|
masking_methods.py
CHANGED
|
@@ -2,6 +2,7 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM
|
|
| 2 |
from transformers import pipeline
|
| 3 |
import random
|
| 4 |
from nltk.corpus import stopwords
|
|
|
|
| 5 |
|
| 6 |
# Masking Model
|
| 7 |
def mask_non_stopword(sentence):
|
|
@@ -14,6 +15,47 @@ def mask_non_stopword(sentence):
|
|
| 14 |
masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
|
| 15 |
return masked_sentence
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# Load tokenizer and model for masked language model
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
|
| 19 |
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
|
|
@@ -22,4 +64,5 @@ fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
|
|
| 22 |
def mask(sentence):
|
| 23 |
predictions = fill_mask(sentence)
|
| 24 |
masked_sentences = [predictions[i]['sequence'] for i in range(len(predictions))]
|
| 25 |
-
return masked_sentences
|
|
|
|
|
|
| 2 |
from transformers import pipeline
|
| 3 |
import random
|
| 4 |
from nltk.corpus import stopwords
|
| 5 |
+
import math
|
| 6 |
|
| 7 |
# Masking Model
|
| 8 |
def mask_non_stopword(sentence):
|
|
|
|
| 15 |
masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
|
| 16 |
return masked_sentence
|
| 17 |
|
| 18 |
+
def mask_non_stopword_pseudorandom(sentence):
    """Mask one non-stopword in *sentence*, chosen reproducibly.

    A fixed-seed RNG makes repeated calls on the same input pick the same
    word ("pseudorandom" by design). The first occurrence of the chosen
    word is replaced by '[MASK]'. The input is returned unchanged when
    every word is an English stopword (or the sentence is empty).
    """
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence
    # Use a local seeded RNG instead of random.seed(10): seeding the module-level
    # generator clobbers global random state for the rest of the program.
    # random.Random(10) starts from the exact same Mersenne Twister state as
    # random.seed(10), so the word chosen here is identical to before.
    rng = random.Random(10)
    word_to_mask = rng.choice(non_stop_words)
    # NOTE(review): str.replace matches substrings, so masking "is" could also
    # hit "island"; kept as-is to stay consistent with mask_non_stopword.
    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
    return masked_sentence
|
| 28 |
+
|
| 29 |
+
def high_entropy_words(sentence, non_melting_points):
    """Mask the word whose fill-mask predictions are most uncertain.

    Candidate words exclude English stopwords and any word appearing in the
    *non_melting_points* phrases (an iterable of (_, phrase) pairs). Each
    candidate is masked in turn, and the one whose top-5 pipeline predictions
    have the highest Shannon entropy is replaced with '[MASK]'. Returns the
    sentence unchanged when no candidate word exists.
    """
    stop_words = set(stopwords.words('english'))

    # Flatten every protected phrase into a lowercase word set.
    protected = set()
    for _, phrase in non_melting_points:
        protected.update(phrase.lower().split())

    candidates = [
        w for w in sentence.split()
        if w.lower() not in stop_words and w.lower() not in protected
    ]

    if not candidates:
        return sentence

    best_word = None
    best_entropy = -float('inf')

    for candidate in candidates:
        trial = sentence.replace(candidate, '[MASK]', 1)
        predictions = fill_mask(trial)

        # Shannon entropy over the model's top 5 suggestions.
        score = -sum(p['score'] * math.log(p['score']) for p in predictions[:5])

        if score > best_entropy:
            best_entropy = score
            best_word = candidate

    return sentence.replace(best_word, '[MASK]', 1)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
# Load tokenizer and model for masked language model
|
| 60 |
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
|
| 61 |
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
|
|
|
|
| 64 |
def mask(sentence):
    """Run the fill-mask pipeline on *sentence* (which must contain a
    '[MASK]' token) and return the predicted full sentences, one per
    suggestion, in the model's ranking order."""
    return [prediction['sequence'] for prediction in fill_mask(sentence)]
|
| 68 |
+
|