Commit 51fb126
Parent(s): 4b9b600
requirement fix
Files changed:
- app.py +8 -10
- extract_config.py +8 -0
- keywords.py +38 -8
- requirements.txt +3 -1
app.py
CHANGED

@@ -1,6 +1,7 @@
 import streamlit as st
 from transformers import AutoTokenizer
 from fastT5 import OnnxT5,get_onnx_runtime_sessions
+from keywords import tokenize_sentence, get_multipartiterank_topics,get_topicrank_topics,get_yake_topics
 from annotated_text import annotated_text
 import nltk
 nltk.download('stopwords')
@@ -8,15 +9,14 @@ nltk.download('wordnet')
 nltk.download('punkt')
 from nltk.corpus import stopwords,wordnet
 from nltk.tokenize import sent_tokenize
-from flashtext import KeywordProcessor
-import regex as re
 import string
 import subprocess
-from PIL import Image
 import logging
 import multiprocessing
 total_threads=multiprocessing.cpu_count()
 import onnxruntime as ort
+# from bertopic import BERTopic
+from sklearn.feature_extraction.text import CountVectorizer
 try:
     import pke
     logging.error("importing pke info")
@@ -84,7 +84,7 @@ model_t5=OnnxT5(model_or_model_path=t5_chkpt,onnx_model_sessions=model_session)
 tokenizer_t5=AutoTokenizer.from_pretrained(t5_tokenizer)
 
 def create_question_t5(model,tokenizer,context,answer,max_length=64):
-    input = "context: %s answer: %s " % (context, answer)
+    input = "context: %s answer: %s </s>" % (context, answer)
     features=tokenizer([input],return_tensors='pt')
     output=model.generate(input_ids=features['input_ids'],
                           attention_mask=features['attention_mask'],
@@ -94,7 +94,7 @@ def create_question_t5(model,tokenizer,context,answer,max_length=64):
     return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
 
 def create_answers_t5(model,tokenizer,context,question,max_length=128):
-    input = "context: %s question: %s " % (context, question)
+    input = "context: %s question: %s </s>" % (context, question)
     features=tokenizer([input],return_tensors='pt')
     output=model.generate(input_ids=features['input_ids'],
                           attention_mask=features['attention_mask'],
@@ -124,10 +124,8 @@ c1,c2,c3=st.columns(3)
 with c1:
     create_usingkeyword = st.button("Create Questions using Keywords")
     if create_usingkeyword:
-
-        from keywords import tokenize_sentence,get_noun_adj_verb
         tokenized_sent = tokenize_sentence(input_context)
-        keywords_noun_adj_verb = get_noun_adj_verb(input_context)
+        keywords_noun_adj_verb = get_multipartiterank_topics(input_context)
         t5_questions=[]
 
         with st.spinner("Creating Questionsssss"):
@@ -144,6 +142,6 @@ with c1:
     st.markdown("---")
 
 with c2:
-
-    if
+    create_usinglongformer = st.button("Create Questions using Longformer")
+    if create_usinglongformer:
         pass
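Note on the prompt change: both helpers now terminate the input with T5's end-of-sequence token, so the model sees the same "context: … answer: … </s>" format at inference as in training. A minimal sketch of the same call pattern, using plain transformers with a generic t5-small checkpoint as a stand-in for the app's question-generation model t5_chkpt (which is defined outside these hunks):

# Sketch only: exercises the prompt format from create_question_t5 without
# the fastT5/ONNX session used in app.py.
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("t5-small")           # placeholder checkpoint
model = T5ForConditionalGeneration.from_pretrained("t5-small")  # placeholder checkpoint

context = "The Eiffel Tower was completed in 1889 and is located in Paris."
answer = "1889"
prompt = "context: %s answer: %s </s>" % (context, answer)      # format from the diff

features = tokenizer([prompt], return_tensors="pt")
output = model.generate(input_ids=features["input_ids"],
                        attention_mask=features["attention_mask"],
                        max_length=64)
print(tokenizer.decode(output.squeeze(), skip_special_tokens=True))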
extract_config.py
ADDED

@@ -0,0 +1,8 @@
+from transformers import BertConfig,BertForMaskedLM
+
+config=BertConfig()
+model=BertForMaskedLM(config)
+
+print(config)
+
+print(model.config)
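This new file is a standalone sanity check: it builds a BertForMaskedLM from a default BertConfig and prints the configuration twice, once directly and once as carried by the model. Both printouts should show the same defaults (30522-token vocab, 12 layers, hidden size 768), confirming the model holds the config it was constructed with.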
keywords.py
CHANGED

@@ -4,8 +4,6 @@ nltk.download('wordnet')
 nltk.download('punkt')
 from nltk.corpus import stopwords,wordnet
 from nltk.tokenize import sent_tokenize
-from flashtext import KeywordProcessor
-import regex as re
 import string
 import subprocess
 import logging
@@ -19,24 +17,40 @@ except:
     subprocess.run(['python3' ,'-m' ,'spacy' ,'download' ,'en'])
     import pke
 
+stoplist = list(string.punctuation)
+stoplist += pke.lang.stopwords.get('en')
+stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+stoplist += stopwords.words('english')
 
 def tokenize_sentence(text):
     sentences=sent_tokenize(text)
     sentences=[s.strip().lstrip().rstrip() for s in sentences if len(s) > 20]
     return sentences
 
-def get_noun_adj_verb(text):
+def get_multipartiterank_topics(text):
     output = []
     try:
         extractor = pke.unsupervised.MultipartiteRank()
-        extractor.load_document(input=text, language='en',normalization=None)
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
         # keyphrase candidate selection #'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
-        extractor.candidate_selection(pos={'NOUN',
+        extractor.candidate_selection(pos={'NOUN','VERB','ADJ'})
+        extractor.candidate_weighting(threshold=0.7,method='average',alpha=1.1)
+        keyphrases = extractor.get_n_best(n=5)
 
-
-
+        for val in keyphrases:
+            output.append(val[0])
+    except Exception as e:
+        print("found exception",e)
+    return list(set(output))
 
-
+def get_topicrank_topics(text):
+    output = []
+    try:
+        extractor = pke.unsupervised.TopicRank()
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
+        # keyphrase candidate selection #'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
+        extractor.candidate_selection(pos={'NOUN', 'ADJ'})
+        extractor.candidate_weighting(threshold=0.7,method='average')
         keyphrases = extractor.get_n_best(n=5)
 
         for val in keyphrases:
@@ -45,3 +59,19 @@ def get_noun_adj_verb(text):
         print("found exception",e)
     return list(set(output))
 
+def get_yake_topics(text):
+    #statistics model --very poor performance
+    output = []
+    try:
+        extractor = pke.unsupervised.YAKE()
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
+        extractor.candidate_selection(n=3)
+        extractor.candidate_weighting(window=2)
+        keyphrases = extractor.get_n_best(n=5,threshold=0.9)
+
+        for val in keyphrases:
+            output.append(val[0])
+    except Exception as e:
+        print("found exception",e)
+    return list(set(output))
+
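The three extractors share one shape: load the document with the shared stoplist, select candidate phrases, weight them, and return the top five. A minimal smoke test, assuming pke and its spaCy English model are installed (keywords.py handles the download itself):

# Sketch only: runs the three new extractors on sample text. The sample
# string is illustrative; any paragraph of English works.
from keywords import (get_multipartiterank_topics,
                      get_topicrank_topics,
                      get_yake_topics)

sample = ("Keyphrase extraction identifies the most important words and "
          "phrases in a document. Graph-based methods such as TopicRank "
          "and MultipartiteRank cluster candidate phrases before ranking.")

for extract in (get_multipartiterank_topics,
                get_topicrank_topics,
                get_yake_topics):
    print(extract.__name__, "->", extract(sample))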
requirements.txt
CHANGED

@@ -25,4 +25,6 @@ tokenizers~=0.12.1
 flatbuffers~=1.12
 filelock~=3.6.0
 sacremoses~=0.0.53
-fastT5~=0.1.4
+fastT5~=0.1.4
+nltk~=3.6
+st-annotated-text~=3.0.0
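These pins back the commit message: nltk covers the nltk.download calls at the top of app.py and keywords.py, and st-annotated-text provides the annotated_text import, both previously undeclared. fastT5 itself is unchanged; the remove-and-re-add is most likely the usual diff artifact of the old file lacking a trailing newline.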
|