Spaces:
Sleeping
Sleeping
| import numpy as np # linear algebra | |
| import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
| import time | |
| import torch | |
| from transformers import T5ForConditionalGeneration,T5Tokenizer | |
| import random | |
| import spacy | |
| import zipfile | |
| import os | |
import subprocess
import sys

# Runtime bootstrap: install/download the resources that the imports and
# loaders further down rely on. As with the previous os.system() calls, every
# step is best-effort: a failing step is reported on stderr but does not abort
# start-up.
_BOOTSTRAP_COMMANDS = (
    # pip and the downloaders are run through the current interpreter so they
    # target the environment this script is actually running in, rather than
    # whatever `pip` / `python` happens to be first on PATH.
    [sys.executable, '-m', 'pip', 'install', 'git+https://github.com/boudinfl/pke.git'],
    [sys.executable, '-m', 'nltk.downloader', 'universal_tagset'],
    [sys.executable, '-m', 'spacy', 'download', 'en'],
    ['wget', 'https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz'],
    ['tar', '-xvf', 's2v_reddit_2015_md.tar.gz'],
    [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
)
for _command in _BOOTSTRAP_COMMANDS:
    try:
        # Argument lists (no shell) avoid any shell parsing of the URLs.
        _returncode = subprocess.run(_command, check=False).returncode
    except OSError as _error:
        # The executable itself could not be started (e.g. no wget on PATH).
        print('bootstrap step could not start: {} ({})'.format(' '.join(_command), _error), file=sys.stderr)
        continue
    if _returncode != 0:
        print('bootstrap step exited with status {}: {}'.format(_returncode, ' '.join(_command)), file=sys.stderr)
| import git | |
| import json | |
| from sense2vec import Sense2Vec | |
| import requests | |
| from collections import OrderedDict | |
| import string | |
| import pke | |
| import nltk | |
| import numpy | |
| import en_core_web_sm | |
| from nltk import FreqDist | |
| nltk.download('brown', quiet=True, force=True) | |
| nltk.download('stopwords', quiet=True, force=True) | |
| nltk.download('popular', quiet=True, force=True) | |
| from nltk.corpus import stopwords | |
| from nltk.corpus import brown | |
| from similarity.normalized_levenshtein import NormalizedLevenshtein | |
| from nltk.tokenize import sent_tokenize | |
| from flashtext import KeywordProcessor | |
| from encoding import beam_search_decoding | |
| from mcq import tokenize_sentences | |
| from mcq import get_keywords | |
| from mcq import get_sentences_for_keyword | |
| from mcq import generate_questions_mcq | |
| from mcq import generate_normal_questions | |
| import time | |
# Question-generation model: tokenizer from 't5-large', weights from the
# 'Parth/result' checkpoint.
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('Parth/result')
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# No explicit model.eval() here: from_pretrained() already returns the model
# in evaluation mode.

# Shared NLP resources handed to the helpers imported from `mcq`.
nlp = spacy.load('en_core_web_sm')
# 's2v_old' is presumably the directory unpacked from the sense2vec archive
# downloaded during start-up -- verify if the archive layout changes.
s2v = Sense2Vec().from_disk('s2v_old')
fdist = FreqDist(brown.words())
normalized_levenshtein = NormalizedLevenshtein()
def set_seed(seed):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed: Seed value applied to ``numpy.random``, to PyTorch's default
            generator and, when CUDA is available, to the generators of all
            CUDA devices.
    """
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once, with a fixed value, when the module is loaded.
set_seed(42)
def predict_mcq(payload):
    """Generate multiple-choice questions for the text in ``payload``.

    Args:
        payload: Mapping read with ``.get``. ``"input_text"`` is the passage
            to build questions from; the optional ``"max_questions"``
            (default 4) is forwarded to ``get_keywords``.

    Returns:
        An empty dict when no keyword could be mapped to a sentence or when
        question generation raised an exception. Otherwise a dict with the
        keys ``"statement"`` (the tokenized sentences joined by single
        spaces), ``"questions"`` (the ``"questions"`` entry returned by
        ``generate_questions_mcq``) and ``"time_taken"`` (elapsed seconds
        measured with ``time.time()``).
    """
    start = time.time()
    text = payload.get("input_text")
    max_questions = payload.get("max_questions", 4)

    sentences = tokenize_sentences(text)
    modified_text = " ".join(sentences)

    keywords = get_keywords(
        nlp, modified_text, max_questions, s2v, fdist,
        normalized_levenshtein, len(sentences),
    )
    keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
    # Collapse each keyword's sentence list into one context string built
    # from at most its first three sentences.
    for keyword in keyword_sentence_mapping:
        keyword_sentence_mapping[keyword] = " ".join(keyword_sentence_mapping[keyword][:3])

    final_output = {}
    if not keyword_sentence_mapping:
        return final_output

    try:
        generated_questions = generate_questions_mcq(
            keyword_sentence_mapping, device, tokenizer, model, s2v,
            normalized_levenshtein,
        )
    except Exception:
        # Best-effort contract: a generation failure still yields an empty
        # result, but it is logged instead of vanishing, and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        import logging
        logging.getLogger(__name__).exception("MCQ generation failed")
        return final_output

    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]
    final_output["time_taken"] = time.time() - start

    # Check the module-level `device` instance; comparing the `torch.device`
    # class itself to 'cuda' is always False, so the cache was never freed.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return final_output
def predict_shortq(payload):
    """Generate short-answer questions for the text in ``payload``.

    Args:
        payload: Mapping read with ``.get``. ``"input_text"`` is the passage
            to build questions from; the optional ``"max_questions"``
            (default 4) is forwarded to ``get_keywords``.

    Returns:
        An empty dict when no keyword could be mapped to a sentence.
        Otherwise a dict with the keys ``"statement"`` (the tokenized
        sentences joined by single spaces) and ``"questions"`` (the
        ``"questions"`` entry returned by ``generate_normal_questions``).
    """
    text = payload.get("input_text")
    max_questions = payload.get("max_questions", 4)

    sentences = tokenize_sentences(text)
    modified_text = " ".join(sentences)

    keywords = get_keywords(
        nlp, modified_text, max_questions, s2v, fdist,
        normalized_levenshtein, len(sentences),
    )
    keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
    # Collapse each keyword's sentence list into one context string built
    # from at most its first three sentences.
    for keyword in keyword_sentence_mapping:
        keyword_sentence_mapping[keyword] = " ".join(keyword_sentence_mapping[keyword][:3])

    final_output = {}
    if not keyword_sentence_mapping:
        print('ZERO')
        return final_output

    generated_questions = generate_normal_questions(keyword_sentence_mapping, device, tokenizer, model)
    print(generated_questions)

    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]

    # Check the module-level `device` instance; comparing the `torch.device`
    # class itself to 'cuda' is always False, so the cache was never freed.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return final_output
def paraphrase(payload):
    """Generate paraphrases of the sentence in ``payload`` with beam search.

    Args:
        payload: Mapping read with ``.get``. ``"input_text"`` is the sentence
            to paraphrase; the optional ``"max_questions"`` (default 3) is
            passed to ``model.generate`` as ``num_return_sequences``.

    Returns:
        A dict with the keys ``'Question'`` (the prompt string sent to the
        model, i.e. the input wrapped as ``"paraphrase: ... </s>"``),
        ``'Count'`` (the requested number of sequences) and
        ``'Paraphrased Questions'`` (decoded outputs, in generation order,
        without duplicates and without outputs that equal the input
        case-insensitively).
    """
    sentence = payload.get("input_text")
    num = payload.get("max_questions", 3)

    # NOTE(review): 'Question' below is filled with this prompt-wrapped
    # string rather than the raw input; kept as is for backward
    # compatibility -- confirm whether callers expect the raw sentence.
    text = "paraphrase: " + sentence + " </s>"

    encoding = tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=50,
        num_beams=50,
        num_return_sequences=num,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    # Keep each distinct paraphrase once and drop echoes of the input.
    final_outputs = []
    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if sent.lower() != sentence.lower() and sent not in final_outputs:
            final_outputs.append(sent)

    output = {}
    output['Question'] = text
    output['Count'] = num
    output['Paraphrased Questions'] = final_outputs

    # The format string now has a placeholder for the paraphrase as well;
    # with a single "{}" only the index was printed.
    for i, final_output in enumerate(final_outputs):
        print("{}: {}".format(i, final_output))

    # Check the module-level `device` instance; comparing the `torch.device`
    # class itself to 'cuda' is always False, so the cache was never freed.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return output