Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,43 +1,58 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from transformers import T5ForConditionalGeneration, T5Tokenizer
|
| 3 |
-
import torch
|
| 4 |
import spacy
|
| 5 |
import nltk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from b import b
|
|
|
|
| 7 |
nltk.download('punkt')
|
| 8 |
from nltk.tokenize import sent_tokenize
|
| 9 |
|
| 10 |
# Load spaCy model
|
| 11 |
nlp = spacy.load("en_core_web_sm")
|
|
|
|
| 12 |
|
| 13 |
# Load T5 model and tokenizer
|
| 14 |
model_name = "DevBM/t5-large-squad"
|
| 15 |
model = T5ForConditionalGeneration.from_pretrained(model_name)
|
| 16 |
tokenizer = T5Tokenizer.from_pretrained(model_name)
|
| 17 |
|
| 18 |
-
# Function to extract keywords using
|
| 19 |
def extract_keywords(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
doc = nlp(text)
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
# Function to map keywords to sentences
|
| 32 |
-
def map_keywords_to_sentences(text, keywords):
|
| 33 |
sentences = sent_tokenize(text)
|
| 34 |
keyword_sentence_mapping = {}
|
| 35 |
for keyword in keywords:
|
| 36 |
for i, sentence in enumerate(sentences):
|
| 37 |
if keyword in sentence:
|
| 38 |
# Combine current sentence with surrounding sentences for context
|
| 39 |
-
start = max(0, i-
|
| 40 |
-
end = min(len(sentences), i+
|
| 41 |
context = ' '.join(sentences[start:end])
|
| 42 |
if keyword not in keyword_sentence_mapping:
|
| 43 |
keyword_sentence_mapping[keyword] = context
|
|
@@ -45,28 +60,77 @@ def map_keywords_to_sentences(text, keywords):
|
|
| 45 |
keyword_sentence_mapping[keyword] += ' ' + context
|
| 46 |
return keyword_sentence_mapping
|
| 47 |
|
| 48 |
-
# Function to
|
| 49 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
input_text = f"<context> {context} <answer> {answer}"
|
| 51 |
input_ids = tokenizer.encode(input_text, return_tensors='pt')
|
| 52 |
-
outputs = model.generate(input_ids)
|
| 53 |
question = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 54 |
return question
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
# Streamlit interface
|
| 57 |
st.title("Question Generator from Text")
|
| 58 |
-
text = st.text_area("Enter text here:")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
if st.button("Generate Questions"):
|
| 60 |
if text:
|
| 61 |
keywords = extract_keywords(text)
|
| 62 |
-
keyword_sentence_mapping = map_keywords_to_sentences(text, keywords)
|
| 63 |
|
| 64 |
st.subheader("Generated Questions:")
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
st.write(f"**Context:** {context}")
|
| 68 |
st.write(f"**Answer:** {keyword}")
|
| 69 |
st.write(f"**Question:** {question}")
|
|
|
|
|
|
|
| 70 |
st.write("---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
else:
|
| 72 |
st.write("Please enter some text to generate questions.")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from transformers import T5ForConditionalGeneration, T5Tokenizer
|
|
|
|
| 3 |
import spacy
|
| 4 |
import nltk
|
| 5 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 6 |
+
from rake_nltk import Rake
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from fpdf import FPDF
|
| 9 |
+
import wikipediaapi
|
| 10 |
from b import b
|
| 11 |
+
|
| 12 |
nltk.download('punkt')
|
| 13 |
from nltk.tokenize import sent_tokenize
|
| 14 |
|
| 15 |
# Load spaCy model
|
| 16 |
nlp = spacy.load("en_core_web_sm")
|
| 17 |
+
# wiki_wiki = wikipediaapi.Wikipedia('en')
|
| 18 |
|
| 19 |
# Load T5 model and tokenizer
|
| 20 |
model_name = "DevBM/t5-large-squad"
|
| 21 |
model = T5ForConditionalGeneration.from_pretrained(model_name)
|
| 22 |
tokenizer = T5Tokenizer.from_pretrained(model_name)
|
| 23 |
|
| 24 |
+
# Function to extract keywords using combined techniques
|
| 25 |
def extract_keywords(text):
    """Collect candidate keywords from *text* by merging three extractors.

    Combines RAKE ranked phrases, spaCy named entities plus content-word
    tokens, and the TF-IDF vocabulary of the text into one de-duplicated
    list (order unspecified, as with the original set union).
    """
    # RAKE ranked phrases.
    rake_extractor = Rake()
    rake_extractor.extract_keywords_from_text(text)
    phrase_candidates = set(rake_extractor.get_ranked_phrases())

    # spaCy named entities and content-bearing parts of speech.
    parsed = nlp(text)
    spacy_candidates = {entity.text for entity in parsed.ents}
    content_pos = ("NOUN", "PROPN", "VERB", "ADJ")
    spacy_candidates.update(token.text for token in parsed if token.pos_ in content_pos)

    # TF-IDF vocabulary with English stop words removed.
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf.fit_transform([text])
    tfidf_candidates = set(tfidf.get_feature_names_out())

    # Union of all three candidate sets.
    return list(phrase_candidates | spacy_candidates | tfidf_candidates)
|
| 45 |
|
| 46 |
+
# Function to map keywords to sentences with customizable context window size
|
| 47 |
+
def map_keywords_to_sentences(text, keywords, context_window_size):
    """Map each keyword to a context string built from surrounding sentences.

    Parameters
    ----------
    text : str
        Source text; split into sentences with NLTK's ``sent_tokenize``.
    keywords : iterable of str
        Keywords to locate. Matching is a plain substring test against each
        sentence, so short keywords can match inside longer words.
    context_window_size : int
        Number of sentences to include before and after each matching
        sentence.

    Returns
    -------
    dict
        Maps keyword -> concatenated context. A keyword found in several
        sentences accumulates one context block per occurrence, separated
        by spaces.
    """
    sentences = sent_tokenize(text)
    keyword_sentence_mapping = {}
    for keyword in keywords:
        for i, sentence in enumerate(sentences):
            if keyword in sentence:
                # Combine the matching sentence with its neighbours for context.
                start = max(0, i - context_window_size)
                end = min(len(sentences), i + context_window_size + 1)
                context = ' '.join(sentences[start:end])
                if keyword not in keyword_sentence_mapping:
                    keyword_sentence_mapping[keyword] = context
                else:
                    # NOTE(review): the `else:` introducing this append was
                    # dropped in the rendered diff; restored so repeated
                    # matches append instead of overwriting or always running.
                    keyword_sentence_mapping[keyword] += ' ' + context
    return keyword_sentence_mapping
|
| 62 |
|
| 63 |
+
# Function to perform entity linking using Wikipedia API
|
| 64 |
+
# def entity_linking(keyword):
|
| 65 |
+
# page = wiki_wiki.page(keyword)
|
| 66 |
+
# if page.exists():
|
| 67 |
+
# return page.fullurl
|
| 68 |
+
# return None
|
| 69 |
+
|
| 70 |
+
# Function to generate questions using beam search
|
| 71 |
+
def generate_question(context, answer, num_beams=5):
    """Generate a question for *answer* given *context* using the T5 model.

    Runs beam search with ``num_beams`` beams and early stopping, then
    decodes the top sequence without special tokens.
    """
    prompt = f"<context> {context} <answer> {answer}"
    encoded = tokenizer.encode(prompt, return_tensors='pt')
    generated = model.generate(encoded, num_beams=num_beams, early_stopping=True)
    return tokenizer.decode(generated[0], skip_special_tokens=True)
|
| 77 |
|
| 78 |
+
# Function to export questions to CSV
|
| 79 |
+
def export_to_csv(data):
    """Write (context, answer, question) triples to ``questions.csv``.

    The file is created in the working directory without a row index.
    """
    columns = ["Context", "Answer", "Question"]
    pd.DataFrame(data, columns=columns).to_csv('questions.csv', index=False)
|
| 82 |
+
|
| 83 |
+
# Function to export questions to PDF
|
| 84 |
+
def export_to_pdf(data):
    """Write (context, answer, question) triples to ``questions.pdf``.

    Each triple is rendered as three wrapped paragraphs followed by a
    10-unit vertical gap.
    """
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)

    for context, answer, question in data:
        document.multi_cell(0, 10, f"Context: {context}")
        document.multi_cell(0, 10, f"Answer: {answer}")
        document.multi_cell(0, 10, f"Question: {question}")
        document.ln(10)

    document.output("questions.pdf")
|
| 96 |
+
|
| 97 |
# Streamlit interface
st.title("Question Generator from Text")
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")

# Customization options
num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
question_complexity = st.selectbox("Select question complexity", ["Simple", "Intermediate", "Complex"])

if st.button("Generate Questions"):
    if text:
        keywords = extract_keywords(text)
        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)

        st.subheader("Generated Questions:")
        data = []
        for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()):
            if i >= num_questions:
                break
            # linked_entity = entity_linking(keyword)
            question = generate_question(context, keyword, num_beams=num_beams)
            st.write(f"**Context:** {context}")
            st.write(f"**Answer:** {keyword}")
            st.write(f"**Question:** {question}")
            # if linked_entity:
            #     st.write(f"**Entity Link:** {linked_entity}")
            st.write("---")
            data.append((context, keyword, question))

        # Streamlit re-runs the whole script on every widget interaction, so
        # the generated triples must be persisted for the export buttons:
        # on the re-run triggered by an export click, this Generate branch is
        # not taken and a plain local `data` would not exist.
        st.session_state['generated_data'] = data
    else:
        st.write("Please enter some text to generate questions.")

# Export buttons live at top level and read from session state. Nesting them
# inside the Generate branch (as before) made them unreachable: a button is
# only True on the single run caused by its own click, so the outer and inner
# buttons could never both be True.
if 'generated_data' in st.session_state:
    if st.button("Export to CSV"):
        export_to_csv(st.session_state['generated_data'])
        st.success("Questions exported to questions.csv")

    if st.button("Export to PDF"):
        export_to_pdf(st.session_state['generated_data'])
        st.success("Questions exported to questions.pdf")
|