| |
| import patoolib |
| import wget |
| from textwrap3 import wrap |
| import torch |
| import random |
| import numpy as np |
| import nltk |
| nltk.download('punkt') |
| nltk.download('brown') |
| nltk.download('wordnet') |
| from nltk.corpus import wordnet as wn |
| from nltk.tokenize import sent_tokenize |
| nltk.download('stopwords') |
| from nltk.corpus import stopwords |
| import string |
| import pke |
| import traceback |
| from flashtext import KeywordProcessor |
| from similarity.normalized_levenshtein import NormalizedLevenshtein |
| normalized_levenshtein = NormalizedLevenshtein() |
| from collections import OrderedDict |
| from sklearn.metrics.pairwise import cosine_similarity |
| import nltk |
| nltk.download('omw-1.4') |
| import gradio as gr |
|
|
|
|
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint used by summarizer() (prompted with "summarize: ").
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

# Checkpoint used by get_question() (prompted with "context: ... answer: ...").
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1').to(device)
|
|
| |
import os

# Location of the sense2vec vector archive and the directory it is loaded from.
S2V_ARCHIVE_URL = 'https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz'
S2V_ARCHIVE = "s2v_reddit_2015_md.tar.gz"
S2V_DIR = "s2v_old"  # presumably the top-level folder inside the archive - verify

# Fetch and unpack only when the vectors are not already on disk, so a restart
# does not download and extract the archive again.
if not os.path.isdir(S2V_DIR):
    if not os.path.exists(S2V_ARCHIVE):
        wget.download(S2V_ARCHIVE_URL)
    # Unpack into the working directory: that is where the relative S2V_DIR
    # path below is resolved. (The archive used to be unpacked into "/", which
    # only matched the load path when the process happened to run from "/".)
    patoolib.extract_archive(S2V_ARCHIVE, outdir=".")
import numpy as np
from sense2vec import Sense2Vec
s2v = Sense2Vec().from_disk(S2V_DIR)
from sentence_transformers import SentenceTransformer
|
|
|
|
# Sample article used by the demo code further down in this script.
text = """Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company
Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin
rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,
Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and
transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, “To be clear, I strongly
believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”. It triggered a downward spiral for Bitcoin value but
the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising
that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency."""

# Echo the article re-wrapped to 150 columns, followed by a blank gap.
print ("\n".join(wrap(text, 150)))
print ("\n")
|
|
|
|
| |
# `summary_model`, `summary_tokenizer` and `device` are already created near
# the top of this file. This section used to load the 't5-base' checkpoint a
# second time and move it to the device again, which only cost start-up time
# and memory; the objects created earlier are reused instead.
from transformers import T5ForConditionalGeneration, T5Tokenizer  # kept: harmless re-import
|
|
|
|
|
|
def set_seed(seed: int):
    """Seed every random number generator this script relies on.

    Passes `seed` to Python's `random`, to NumPy's global generator, and to
    torch via `manual_seed` and `cuda.manual_seed_all`.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)


# Fixed seed for everything that follows in the script.
set_seed(42)
|
|
|
|
|
|
def postprocesstext (content):
    """Re-case `content` sentence by sentence.

    The text is split with `sent_tokenize`; each sentence goes through
    `str.capitalize()` (first character upper-cased, the rest lower-cased) and
    is prefixed with one space. A non-empty result therefore starts with a
    space, and empty input gives an empty string.
    """
    return "".join(" " + sentence.capitalize() for sentence in sent_tokenize(content))
|
|
|
|
def summarizer(text,model,tokenizer):
    """Summarize `text` with a seq2seq model driven by a "summarize: " prompt.

    Args:
        text: Raw input text; surrounding whitespace is stripped and newlines
            are replaced by spaces before prompting.
        model: Generation model exposing `.generate()` and `.device`
            (e.g. the T5 model loaded in this file).
        tokenizer: Tokenizer matching `model`.

    Returns:
        The first generated sequence, decoded without special tokens, passed
        through `postprocesstext` and stripped.
    """
    prompt = "summarize: " + text.strip().replace("\n", " ")

    max_len = 512
    # `padding=False` is the current spelling of the deprecated
    # `pad_to_max_length=False`; inputs longer than `max_len` are truncated.
    encoding = tokenizer(
        prompt,
        max_length=max_len,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model instead of relying on a global device

    # Inference only: make sure no autograd state is kept.
    with torch.no_grad():
        outs = model.generate(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
            early_stopping=True,
            num_beams=3,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            min_length=75,
            max_length=300,
        )

    # Only one sequence is requested, so the first output is the summary.
    summary = tokenizer.decode(outs[0], skip_special_tokens=True)
    return postprocesstext(summary).strip()
|
|
|
|
# Demo: summarize the sample article and print it next to the original,
# each wrapped to 150 columns.
summarized_text = summarizer(text,summary_model,summary_tokenizer)

for heading, body in (("\noriginal Text >>", text), ("Summarized Text >>", summarized_text)):
    print (heading)
    for wrp in wrap(body, 150):
        print (wrp)
    print ("\n")
|
|
|
|
|
|
| |
|
|
|
|
def get_nouns_multipartite(content):
    """Extract up to 15 keyphrases from `content` with pke's MultipartiteRank.

    Only noun and proper-noun candidates are considered. Extraction is
    best-effort: if anything fails, the traceback is printed and an empty list
    is returned.

    Args:
        content: English text to analyse.

    Returns:
        A list of keyphrase strings in the order returned by `get_n_best`.
    """
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content,language='en')

        # Restrict candidates to nouns and proper nouns.
        extractor.candidate_selection(pos={'PROPN','NOUN'})

        # NOTE(review): a custom stop list (punctuation, bracket tokens, NLTK
        # English stopwords) used to be assembled here but was never handed to
        # pke, so it had no effect and has been removed. If custom stop words
        # are wanted, confirm how the installed pke version accepts them.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')

        # Each result's first element is the phrase itself.
        return [best[0] for best in extractor.get_n_best(n=15)]
    except Exception:
        # Deliberately best-effort, but do not swallow KeyboardInterrupt/SystemExit.
        traceback.print_exc()
        return []
|
|
|
|
|
|
def get_keywords(originaltext,summarytext):
    """Return keyphrases of the original text that also occur in its summary.

    Keyphrases come from `get_nouns_multipartite(originaltext)`; a flashtext
    `KeywordProcessor` loaded with them is run over `summarytext`. Phrases that
    were found are returned in their extraction order, capped at ten. Both
    intermediate lists are printed.
    """
    candidates = get_nouns_multipartite(originaltext)
    print ("keywords unsummarized: ",candidates)

    matcher = KeywordProcessor()
    for phrase in candidates:
        matcher.add_keyword(phrase)

    # De-duplicate the matches found in the summary.
    keywords_found = list(set(matcher.extract_keywords(summarytext)))
    print ("keywords_found in summarized: ",keywords_found)

    return [phrase for phrase in candidates if phrase in keywords_found][:10]


# Demo: keywords shared by the sample article and its summary.
imp_keywords = get_keywords(text,summarized_text)
print (imp_keywords)
|
|
|
|
|
|
def get_question(context,answer,model,tokenizer):
    """Generate one question about `context` whose answer is `answer`.

    Args:
        context: Passage the question should be about.
        answer: Answer span the question should lead to.
        model: Generation model exposing `.generate()` and `.device`
            (e.g. the question model loaded in this file).
        tokenizer: Tokenizer matching `model`.

    Returns:
        The first generated sequence with any "question:" marker removed and
        surrounding whitespace stripped.
    """
    prompt = "context: {} answer: {}".format(context,answer)

    # `padding=False` is the current spelling of the deprecated
    # `pad_to_max_length=False`; prompts longer than 384 tokens are truncated.
    encoding = tokenizer(
        prompt,
        max_length=384,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model instead of relying on a global device

    # Inference only: make sure no autograd state is kept.
    with torch.no_grad():
        outs = model.generate(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
            early_stopping=True,
            num_beams=5,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            max_length=72,
        )

    # Only one sequence is requested, so the first output is the question.
    decoded = tokenizer.decode(outs[0], skip_special_tokens=True)
    return decoded.replace("question:","").strip()
|
|
|
|
|
|
# Demo: show the summary again, then one generated question per keyword.
for line in wrap(summarized_text, 150):
    print (line)
print ("\n")

for answer in imp_keywords:
    ques = get_question(summarized_text,answer,question_model,question_tokenizer)
    # Question, capitalized answer, then a blank gap.
    print (ques, answer.capitalize(), "\n", sep="\n")

# Sentence encoder handed to get_distractors() for ranking candidates.
sentence_transformer_model = SentenceTransformer('msmarco-distilbert-base-v3')
|
|
|
|
|
|
|
|
|
|
def filter_same_sense_words(original,wordlist):
    """Keep the entries of `wordlist` that share the sense tag of `original`.

    Keys have the form "phrase|SENSE". The sense is taken after the LAST "|",
    so a phrase that itself contains "|" is still parsed correctly.

    Args:
        original: Key of the reference word, e.g. "Bitcoin|NOUN".
        wordlist: Iterable of entries whose first element is such a key
            (e.g. `(key, score)` pairs).

    Returns:
        Phrases with the same sense as `original`, with underscores turned into
        spaces, title-cased and stripped, in input order.

    Raises:
        IndexError: If a key contains no "|".
    """
    base_sense = original.rsplit('|', 1)[1]
    print (base_sense)

    filtered_words = []
    for entry in wordlist:
        key_parts = entry[0].rsplit('|', 1)
        if key_parts[1] == base_sense:
            filtered_words.append(key_parts[0].replace("_", " ").title().strip())
    return filtered_words
|
|
def get_highest_similarity_score(wordlist,wrd):
    """Return the largest case-insensitive similarity between `wrd` and any entry.

    Similarity is `normalized_levenshtein.similarity` on the lower-cased
    strings. An empty `wordlist` makes `max` raise ValueError.
    """
    return max(
        normalized_levenshtein.similarity(candidate.lower(), wrd.lower())
        for candidate in wordlist
    )
|
|
def sense2vec_get_words(word,s2v,topn,question):
    """Propose distractor candidates for `word` from sense2vec neighbours.

    Args:
        word: The answer word or phrase.
        s2v: Loaded sense2vec model.
        topn: Number of nearest neighbours to request.
        question: Question text; candidates equal to one of its
            whitespace-separated tokens are dropped.

    Returns:
        Neighbours with the same sense as `word`, skipping any candidate whose
        similarity to `word` or to an already accepted candidate is 0.6 or
        more. Empty when the word has no usable sense or the lookup fails.
    """
    output = []
    print ("word ",word)
    try:
        # sense2vec entity labels use underscores, so the label is spelled
        # "WORK_OF_ART"; the former "WORK OF ART" could never match a key.
        sense = s2v.get_best_sense(word, senses= ["NOUN", "PERSON","PRODUCT","LOC","ORG","EVENT","NORP","WORK_OF_ART","FAC","GPE","NUM","FACILITY"])
        # get_best_sense yields None when no key exists for the word.
        if sense is not None:
            most_similar = s2v.most_similar(sense, n=topn)
            output = filter_same_sense_words(sense,most_similar)
            print ("Similar ",output)
    except Exception:
        # Best-effort lookup: report the problem instead of hiding it.
        traceback.print_exc()
        output = []

    threshold = 0.6
    final = [word]
    checklist = set(question.split())
    for candidate in output:
        # Compare against everything accepted so far, including the answer itself.
        if (get_highest_similarity_score(final,candidate) < threshold
                and candidate not in final
                and candidate not in checklist):
            final.append(candidate)

    # The first element is the answer itself, not a distractor.
    return final[1:]
|
|
def mmr(doc_embedding, word_embeddings, words, top_n, lambda_param):
    """Select entries of `words` by Maximal Marginal Relevance.

    The first pick is the word most similar to the document; each further pick
    maximises `lambda_param * sim(word, doc) - (1 - lambda_param) *
    max sim(word, already picked)`.

    Args:
        doc_embedding: Document embedding(s) as accepted by
            `cosine_similarity` - assumes a single row; TODO confirm.
        word_embeddings: One embedding row per entry of `words`.
        words: Candidate words, aligned with `word_embeddings`.
        top_n: Number of words wanted.
        lambda_param: Weight of document relevance against redundancy.

    Returns:
        The selected words in selection order. At most `len(words)` entries
        are returned, and an empty `words` gives an empty list.
    """
    if len(words) == 0:
        return []

    # Relevance of every word to the document, and word-to-word redundancy.
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Start with the single most relevant word.
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Never ask for more picks than there are candidates left.
    for _ in range(min(top_n - 1, len(candidates_idx))):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        # For each candidate: its closest already-selected word.
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        mmr_scores = lambda_param * candidate_similarities - (1 - lambda_param) * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]
|
|
def get_distractors_wordnet(word):
    """Return WordNet co-hyponyms of `word` to use as distractors.

    Takes the first noun synset of `word`, its first hypernym, and collects the
    first lemma of each hyponym of that hypernym, skipping `word` itself.

    Args:
        word: Answer word or phrase; case and spaces are normalised before the
            lookup.

    Returns:
        Distinct distractor names with underscores replaced by spaces and each
        word capitalized. Empty when WordNet has nothing suitable.
    """
    distractors = []
    try:
        # Normalise BEFORE the lookup: lower-case, with underscores between
        # words, so multi-word answers are looked up in WordNet's lemma form.
        lemma = word.lower().replace(" ", "_")

        synsets = wn.synsets(lemma, 'n')
        if not synsets:
            print ("Wordnet distractors not found")
            return distractors

        hypernyms = synsets[0].hypernyms()
        if not hypernyms:
            return distractors

        for hyponym in hypernyms[0].hyponyms():
            name = hyponym.lemmas()[0].name()
            # Compare in the same normalised form so the answer itself is
            # excluded regardless of case or spacing.
            if name.lower() == lemma:
                continue
            name = " ".join(part.capitalize() for part in name.replace("_", " ").split())
            if name not in distractors:
                distractors.append(name)
    except Exception:
        print ("Wordnet distractors not found")
    return distractors
|
|
def get_distractors (word,origsentence,sense2vecmodel,sentencemodel,top_n,lambdaval):
    """Return distractors for `word` from sense2vec, re-ranked with MMR.

    Candidates come from `sense2vec_get_words`; if there are none, that empty
    result is returned as is. Otherwise the capitalized answer plus the
    candidates are embedded with `sentencemodel`, `mmr` picks at most five of
    them against the embedding of "<origsentence> <Answer>", and everything
    that is not the answer (case-insensitively) is returned capitalized.
    """
    distractors = sense2vec_get_words(word,sense2vecmodel,top_n,origsentence)
    print ("distractors ",distractors)
    if not distractors:
        return distractors

    answer_label = word.capitalize()
    pool = [answer_label, *distractors]

    # The "document" is the question followed by the answer.
    doc_embedding = sentencemodel.encode([origsentence + " " + answer_label])
    pool_embeddings = sentencemodel.encode(pool)

    picks = mmr(doc_embedding, pool_embeddings, pool, min(len(pool), 5), lambdaval)

    # The answer is part of the pool and must not be offered as a distractor.
    return [pick.capitalize() for pick in picks if pick.lower() != word.lower()]
|
|
# Demo: distractors for a hand-written question/answer pair.
sent, keyword = "What cryptocurrency did Musk rarely tweet about?", "Bitcoin"

print (get_distractors(
    word=keyword,
    origsentence=sent,
    sense2vecmodel=s2v,
    sentencemodel=sentence_transformer_model,
    top_n=40,
    lambdaval=0.2,
))

# Gradio widgets for the web UI.
# NOTE(review): gr.inputs / gr.outputs look like Gradio's legacy namespaces;
# confirm they exist in the pinned Gradio version before upgrading it.
radiobutton = gr.inputs.Radio(["Wordnet", "Sense2Vec"])
context = gr.inputs.Textbox(lines=10, placeholder="Enter paragraph/content here...")
output = gr.outputs.HTML( label="Question and Answers")
|
|
def generate_question(context,radiobutton):
    """Build the HTML shown by the UI: questions, answers, distractors, summary.

    Args:
        context: Paragraph entered by the user.
        radiobutton: "Wordnet" to take distractors from WordNet; any other
            value uses the sense2vec pipeline.

    Returns:
        An HTML string with one block per keyword (question in blue, answer in
        green, up to four distractors in brown), followed by the summary with
        the keywords in bold.
    """
    import html  # local import: only this function needs it

    summary_text = summarizer(context,summary_model,summary_tokenizer)
    for wrp in wrap(summary_text, 100):
        print (wrp)

    # Named `keywords` rather than `np` so numpy's usual alias is not shadowed.
    keywords = get_keywords(context,summary_text)
    print ("\n\nNoun phrases",keywords)

    # All text below originates from user input or model output, so it is
    # escaped before being placed inside markup.
    parts = []
    for answer in keywords:
        ques = get_question(summary_text,answer,question_model,question_tokenizer)
        if radiobutton=="Wordnet":
            distractors = get_distractors_wordnet(answer)
        else:
            distractors = get_distractors(answer.capitalize(),ques,s2v,sentence_transformer_model,40,0.2)

        parts.append("<b style='color:blue;'>" + html.escape(ques) + "</b>")
        parts.append("<br>")
        parts.append("<b style='color:green;'>" + "Ans: " + html.escape(answer.capitalize()) + "</b>" + "<br>")
        for distractor in distractors[:4]:
            parts.append("<b style='color:brown;'>" + html.escape(distractor) + "</b>" + "<br>")
        parts.append("<br>")

    summary = "Summary: " + html.escape(summary_text)
    for answer in keywords:
        plain = html.escape(answer)
        capitalized = html.escape(answer.capitalize())
        # Occurrences in the keyword's own casing are bolded and followed by a
        # line break (kept from the previous behaviour).
        summary = summary.replace(plain, "<b>" + plain + "</b>" + "<br>")
        # The capitalized form covers sentence starts. Skip it when it equals
        # the plain form, otherwise the same text would be wrapped twice.
        if capitalized != plain:
            summary = summary.replace(capitalized, "<b>" + capitalized + "</b>")
    parts.append("<p>" + summary + "</p>")
    parts.append("<br>")
    return "".join(parts)
|
|
|
|
# Expose generate_question through a Gradio UI (paragraph box + distractor
# source selector in, HTML out) and start serving it in debug mode.
iface = gr.Interface(fn=generate_question, inputs=[context, radiobutton], outputs=output)
iface.launch(debug=True)