| """Download important files for the pipeline. Uncomment the following lines if you are running this script for the first time""" |
| |
| |
| |
| import os |
| import urllib.request |
| import tarfile |
# --- Fetch and unpack the Sense2Vec release archive under ./models ---------
reddit_s2v = "models/s2v_reddit_2015_md.tar.gz"   # local path of the .tar.gz
extract_s2v = "models"                             # directory to extract into
extract_s2v_folder = reddit_s2v.replace(".tar.gz", "")  # expected extracted dir

# Bug fix: urlretrieve raises FileNotFoundError when the destination
# directory does not exist yet, so make sure it is there first.
os.makedirs(extract_s2v, exist_ok=True)

if not os.path.exists(reddit_s2v):
    print("Downloading Sense2Vec model")
    urllib.request.urlretrieve(
        "https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz",
        filename=reddit_s2v,
    )
else:
    print("Sense2Vec model already downloaded")

# Extract only once; the archive comes from a trusted fixed release URL
# (tarfile.extractall on untrusted archives would be a path-traversal risk).
if not os.path.isdir(extract_s2v_folder):
    with tarfile.open(reddit_s2v, "r:gz") as tar:
        tar.extractall(extract_s2v)  # was f"models/" — use the variable, no f-string needed
else:
    print("Already extracted")
|
|
| """Import required libraries""" |
|
|
import warnings
# Silence every warning process-wide (transformers/sense2vec emit many
# deprecation warnings at import/load time). NOTE(review): this hides ALL
# warnings, including useful ones — consider narrowing with category=/module=.
warnings.filterwarnings('ignore')
|
|
| from transformers import T5ForConditionalGeneration,T5Tokenizer |
|
|
| import streamlit as st |
| from sense2vec import Sense2Vec |
|
|
@st.cache(allow_output_mutation=True)
def cache_models(paths2v, pathT5cond, pathT5):
    """Load the heavy models once and let Streamlit cache them across reruns.

    paths2v: on-disk directory of the extracted Sense2Vec vectors.
    pathT5cond: HF hub id / path of the T5 question-generation model.
    pathT5: HF hub id / path of the matching T5 tokenizer.
    Returns a (sense2vec, model, tokenizer) triple.
    """
    return (
        Sense2Vec().from_disk(paths2v),
        T5ForConditionalGeneration.from_pretrained(pathT5cond),
        T5Tokenizer.from_pretrained(pathT5),
    )


s2v, question_model, question_tokenizer = cache_models(
    "models/s2v_old", 'ramsrigouthamg/t5_squad_v1', 't5-base'
)
|
|
|
|
| """Filter out same sense words using sense2vec algorithm""" |
|
|
def filter_same_sense_words(original, wordlist):
    """Keep only candidates that carry the same sense2vec tag as *original*.

    original: a sense2vec key of the form "phrase|SENSE" (e.g. "cricket|NOUN").
    wordlist: (key, score) pairs as returned by Sense2Vec.most_similar.
    Returns the matching phrases, underscores replaced by spaces and
    title-cased, in their original order.
    """
    base_sense = original.split('|')[1]
    return [
        entry[0].split('|')[0].replace("_", " ").title().strip()
        for entry in wordlist
        if entry[0].split('|')[1] == base_sense
    ]
|
|
def sense2vec_get_words(topn, input_keyword):
    """Return up to *topn* same-sense phrases similar to *input_keyword*.

    Resolves the keyword to its best sense2vec sense, pulls the topn most
    similar entries, and filters them to the same sense tag via
    filter_same_sense_words. Returns [] when the keyword is not in the
    sense2vec vocabulary (or any lookup step fails).
    """
    output = []
    try:
        sense = s2v.get_best_sense(input_keyword)
        # most_similar already yields (key, score) pairs; no need to copy
        # it element-by-element into a second list.
        most_similar = s2v.most_similar(sense, n=topn)
        output = filter_same_sense_words(sense, list(most_similar))
        print(f"Similar:{output}")
    except Exception:
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # still propagate. Unknown keywords fall back to no distractors.
        output = []
    return output
|
|
| """T5 Question generation""" |
# NOTE: question_model and question_tokenizer are already created (and
# cached by Streamlit) via cache_models(...) above. The module-level
# reloads that were here duplicated that work on every script rerun and
# clobbered the cached objects, so they have been removed.
|
|
def get_question(sentence, answer):
    """Generate a question about *sentence* whose answer is *answer*.

    Formats the T5-SQuAD prompt ("context: ... answer: ..."), runs beam
    search on the cached T5 model, and returns the decoded question with
    the leading "question:" prefix removed.
    """
    text = f"context: {sentence} answer: {answer} </s>"
    max_len = 256
    # padding="max_length" + truncation=True replace the deprecated
    # pad_to_max_length=True (same padding/truncation behavior, no warning).
    encoding = question_tokenizer.encode_plus(
        text,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    outs = question_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        early_stopping=True,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        max_length=200,
    )

    # Bug fix: decoding without skip_special_tokens left "<pad>" and "</s>"
    # markers in the returned question; only "question:" was being stripped.
    dec = [question_tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]

    Question = dec[0].replace("question:", "")
    Question = Question.strip()
    return Question
|
|