Commit 51fb126
Parent(s): 4b9b600
requirement fix
Files changed:
- app.py +8 -10
- extract_config.py +8 -0
- keywords.py +38 -8
- requirements.txt +3 -1
app.py
CHANGED

@@ -1,6 +1,7 @@
 import streamlit as st
 from transformers import AutoTokenizer
 from fastT5 import OnnxT5,get_onnx_runtime_sessions
+from keywords import tokenize_sentence, get_multipartiterank_topics,get_topicrank_topics,get_yake_topics
 from annotated_text import annotated_text
 import nltk
 nltk.download('stopwords')
@@ -8,15 +9,14 @@ nltk.download('wordnet')
 nltk.download('punkt')
 from nltk.corpus import stopwords,wordnet
 from nltk.tokenize import sent_tokenize
-from flashtext import KeywordProcessor
-import regex as re
 import string
 import subprocess
-from PIL import Image
 import logging
 import multiprocessing
 total_threads=multiprocessing.cpu_count()
 import onnxruntime as ort
+# from bertopic import BERTopic
+from sklearn.feature_extraction.text import CountVectorizer
 try:
     import pke
     logging.error("importing pke info")
@@ -84,7 +84,7 @@ model_t5=OnnxT5(model_or_model_path=t5_chkpt,onnx_model_sessions=model_session)
 tokenizer_t5=AutoTokenizer.from_pretrained(t5_tokenizer)
 
 def create_question_t5(model,tokenizer,context,answer,max_length=64):
-    input = "context: %s answer: %s " % (context, answer)
+    input = "context: %s answer: %s </s>" % (context, answer)
     features=tokenizer([input],return_tensors='pt')
     output=model.generate(input_ids=features['input_ids'],
                           attention_mask=features['attention_mask'],
@@ -94,7 +94,7 @@ def create_question_t5(model,tokenizer,context,answer,max_length=64):
     return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
 
 def create_answers_t5(model,tokenizer,context,question,max_length=128):
-    input = "context: %s question: %s " % (context, question)
+    input = "context: %s question: %s </s>" % (context, question)
     features=tokenizer([input],return_tensors='pt')
     output=model.generate(input_ids=features['input_ids'],
                           attention_mask=features['attention_mask'],
@@ -124,10 +124,8 @@ c1,c2,c3=st.columns(3)
 with c1:
     create_usingkeyword = st.button("Create Questions using Keywords")
     if create_usingkeyword:
-
-        from keywords import tokenize_sentence,get_noun_adj_verb
         tokenized_sent = tokenize_sentence(input_context)
-        keywords_noun_adj_verb = get_noun_adj_verb(input_context)
+        keywords_noun_adj_verb = get_multipartiterank_topics(input_context)
         t5_questions=[]
 
         with st.spinner("Creating Questionsssss"):
@@ -144,6 +142,6 @@ with c1:
     st.markdown("---")
 
 with c2:
-
-    if
+    create_usinglongformer = st.button("Create Questions using Longformer")
+    if create_usinglongformer:
         pass
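Note on the prompt change: both helpers now terminate the input with T5's end-of-sequence token, so the model sees the same "context: … answer: … </s>" format at inference as in training. A minimal sketch of the same call pattern, using plain transformers with a generic t5-small checkpoint as a stand-in for the app's question-generation model t5_chkpt (which is defined outside these hunks):

# Sketch only: exercises the prompt format from create_question_t5 without
# the fastT5/ONNX session used in app.py.
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("t5-small")           # placeholder checkpoint
model = T5ForConditionalGeneration.from_pretrained("t5-small")  # placeholder checkpoint

context = "The Eiffel Tower was completed in 1889 and is located in Paris."
answer = "1889"
prompt = "context: %s answer: %s </s>" % (context, answer)      # format from the diff

features = tokenizer([prompt], return_tensors="pt")
output = model.generate(input_ids=features["input_ids"],
                        attention_mask=features["attention_mask"],
                        max_length=64)
print(tokenizer.decode(output.squeeze(), skip_special_tokens=True))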
extract_config.py
ADDED

@@ -0,0 +1,8 @@
+from transformers import BertConfig,BertForMaskedLM
+
+config=BertConfig()
+model=BertForMaskedLM(config)
+
+print(config)
+
+print(model.config)
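This new file is a standalone sanity check: it builds a BertForMaskedLM from a default BertConfig and prints the configuration twice, once directly and once as carried by the model. Both printouts should show the same defaults (30522-token vocab, 12 layers, hidden size 768), confirming the model holds the config it was constructed with.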
keywords.py
CHANGED

@@ -4,8 +4,6 @@ nltk.download('wordnet')
 nltk.download('punkt')
 from nltk.corpus import stopwords,wordnet
 from nltk.tokenize import sent_tokenize
-from flashtext import KeywordProcessor
-import regex as re
 import string
 import subprocess
 import logging
@@ -19,24 +17,40 @@ except:
     subprocess.run(['python3' ,'-m' ,'spacy' ,'download' ,'en'])
     import pke
 
+stoplist = list(string.punctuation)
+stoplist += pke.lang.stopwords.get('en')
+stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+stoplist += stopwords.words('english')
 
 def tokenize_sentence(text):
     sentences=sent_tokenize(text)
     sentences=[s.strip().lstrip().rstrip() for s in sentences if len(s) > 20]
     return sentences
 
-def get_noun_adj_verb(text):
+def get_multipartiterank_topics(text):
     output = []
     try:
         extractor = pke.unsupervised.MultipartiteRank()
-        extractor.load_document(input=text, language='en',normalization=None)
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
         # keyphrase candidate selection #'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
-        extractor.candidate_selection(pos={'NOUN',
+        extractor.candidate_selection(pos={'NOUN','VERB','ADJ'})
+        extractor.candidate_weighting(threshold=0.7,method='average',alpha=1.1)
+        keyphrases = extractor.get_n_best(n=5)
 
-
-
+        for val in keyphrases:
+            output.append(val[0])
+    except Exception as e:
+        print("found exception",e)
+    return list(set(output))
 
-
+def get_topicrank_topics(text):
+    output = []
+    try:
+        extractor = pke.unsupervised.TopicRank()
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
+        # keyphrase candidate selection #'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
+        extractor.candidate_selection(pos={'NOUN', 'ADJ'})
+        extractor.candidate_weighting(threshold=0.7,method='average')
         keyphrases = extractor.get_n_best(n=5)
 
         for val in keyphrases:
@@ -45,3 +59,19 @@ def get_noun_adj_verb(text):
         print("found exception",e)
     return list(set(output))
 
+def get_yake_topics(text):
+    #statistics model --very poor performance
+    output = []
+    try:
+        extractor = pke.unsupervised.YAKE()
+        extractor.load_document(input=text, language='en',normalization=None,stoplist=stoplist)
+        extractor.candidate_selection(n=3)
+        extractor.candidate_weighting(window=2)
+        keyphrases = extractor.get_n_best(n=5,threshold=0.9)
+
+        for val in keyphrases:
+            output.append(val[0])
+    except Exception as e:
+        print("found exception",e)
+    return list(set(output))
+
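The three extractors share one shape: load the document with the shared stoplist, select candidate phrases, weight them, and return the top five. A minimal smoke test, assuming pke and its spaCy English model are installed (keywords.py handles the download itself):

# Sketch only: runs the three new extractors on sample text. The sample
# string is illustrative; any paragraph of English works.
from keywords import (get_multipartiterank_topics,
                      get_topicrank_topics,
                      get_yake_topics)

sample = ("Keyphrase extraction identifies the most important words and "
          "phrases in a document. Graph-based methods such as TopicRank "
          "and MultipartiteRank cluster candidate phrases before ranking.")

for extract in (get_multipartiterank_topics,
                get_topicrank_topics,
                get_yake_topics):
    print(extract.__name__, "->", extract(sample))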
requirements.txt
CHANGED

@@ -25,4 +25,6 @@ tokenizers~=0.12.1
 flatbuffers~=1.12
 filelock~=3.6.0
 sacremoses~=0.0.53
-fastT5~=0.1.4
+fastT5~=0.1.4
+nltk~=3.6
+st-annotated-text~=3.0.0
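These pins back the commit message: nltk covers the nltk.download calls at the top of app.py and keywords.py, and st-annotated-text provides the annotated_text import, both previously undeclared. fastT5 itself is unchanged; the remove-and-re-add is most likely the usual diff artifact of the old file lacking a trailing newline.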
|