Spaces:
Runtime error
Runtime error
Adding exception handling; updating feedback data to take context and options
Browse files
Also added an operations progress bar.
With the previous update, improved the batch processing.
app.py
CHANGED
|
@@ -37,14 +37,19 @@ st.set_page_config(
|
|
| 37 |
page_title="Question Generator",
|
| 38 |
initial_sidebar_state="auto",
|
| 39 |
menu_items={
|
| 40 |
-
"About" : "
|
| 41 |
}
|
| 42 |
)
|
| 43 |
|
| 44 |
st.set_option('deprecation.showPyplotGlobalUse',False)
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
# Initialize Wikipedia API with a user agent
|
| 47 |
-
user_agent = 'QGen/1.
|
| 48 |
wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
|
| 49 |
|
| 50 |
def get_session_id():
|
|
@@ -134,12 +139,6 @@ def display_info():
|
|
| 134 |
|
| 135 |
""")
|
| 136 |
|
| 137 |
-
# Text Preprocessing Function
|
| 138 |
-
def preprocess_text(text):
|
| 139 |
-
# Remove newlines and extra spaces
|
| 140 |
-
text = re.sub(r'[\n]', ' ', text)
|
| 141 |
-
return text
|
| 142 |
-
|
| 143 |
def get_pdf_text(pdf_file):
|
| 144 |
doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
|
| 145 |
text = ""
|
|
@@ -147,7 +146,7 @@ def get_pdf_text(pdf_file):
|
|
| 147 |
page = doc.load_page(page_num)
|
| 148 |
text += page.get_text()
|
| 149 |
return text
|
| 150 |
-
def save_feedback(question, answer,rating):
|
| 151 |
feedback_file = 'question_feedback.json'
|
| 152 |
if os.path.exists(feedback_file):
|
| 153 |
with open(feedback_file, 'r') as f:
|
|
@@ -157,6 +156,8 @@ def save_feedback(question, answer,rating):
|
|
| 157 |
tpl = {
|
| 158 |
'question' : question,
|
| 159 |
'answer' : answer,
|
|
|
|
|
|
|
| 160 |
'rating' : rating,
|
| 161 |
}
|
| 162 |
# feedback_data[question] = rating
|
|
@@ -195,33 +196,36 @@ def segment_text(text, max_segment_length=700, batch_size=7):
|
|
| 195 |
|
| 196 |
# Function to extract keywords using combined techniques
|
| 197 |
def extract_keywords(text, extract_all):
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
def get_similar_words_sense2vec(word, n=3):
|
| 227 |
# Try to find the word with its most likely part-of-speech
|
|
@@ -316,59 +320,80 @@ def entity_linking(keyword):
|
|
| 316 |
return None
|
| 317 |
|
| 318 |
async def generate_question_async(context, answer, num_beams):
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
async def generate_options_async(answer, context, n=3):
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
# Compute similarity scores and sort context words
|
| 336 |
-
similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
|
| 337 |
-
sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
|
| 338 |
-
options.extend(sorted_context_words[:n])
|
| 339 |
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
options.extend(
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
|
| 357 |
|
| 358 |
# Function to generate questions using beam search
|
| 359 |
async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
async def process_batch(batch, keywords, context_window_size, num_beams):
|
| 374 |
questions = []
|
|
@@ -481,20 +506,35 @@ def main():
|
|
| 481 |
|
| 482 |
text = None
|
| 483 |
if input_type == "Text Input":
|
| 484 |
-
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
|
| 485 |
elif input_type == "Upload PDF":
|
| 486 |
file = st.file_uploader("Upload PDF Files")
|
| 487 |
if file is not None:
|
| 488 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
if text:
|
| 490 |
text = clean_text(text)
|
| 491 |
generate_questions_button = st.button("Generate Questions")
|
| 492 |
-
|
|
|
|
| 493 |
# if generate_questions_button:
|
| 494 |
if generate_questions_button and text:
|
| 495 |
start_time = time.time()
|
| 496 |
with st.spinner("Generating questions..."):
|
| 497 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
|
| 499 |
data = get_state(session_id)
|
| 500 |
print(data)
|
|
@@ -532,9 +572,9 @@ def main():
|
|
| 532 |
# q['context'] = st.text_area(f"Edit Context {i+1}:", value=q['context'], key=f"context_{i}")
|
| 533 |
if enable_feedback_mode:
|
| 534 |
q['question'] = st.text_input(f"Edit Question {i+1}:", value=q['question'], key=f"question_{i}")
|
| 535 |
-
q['rating'] = st.
|
| 536 |
if st.button(f"Submit Feedback for Question {i+1}", key=f"submit_{i}"):
|
| 537 |
-
save_feedback(q['question'], q['answer'], q['rating'])
|
| 538 |
st.success(f"Feedback submitted for Question {i+1}")
|
| 539 |
st.write("---")
|
| 540 |
|
|
@@ -590,4 +630,8 @@ def main():
|
|
| 590 |
print("********************************************************************************")
|
| 591 |
|
| 592 |
if __name__ == '__main__':
|
| 593 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
page_title="Question Generator",
|
| 38 |
initial_sidebar_state="auto",
|
| 39 |
menu_items={
|
| 40 |
+
"About" : "Hi this our project."
|
| 41 |
}
|
| 42 |
)
|
| 43 |
|
| 44 |
st.set_option('deprecation.showPyplotGlobalUse',False)
|
| 45 |
|
| 46 |
+
class QuestionGenerationError(Exception):
|
| 47 |
+
"""Custom exception for question generation errors."""
|
| 48 |
+
pass
|
| 49 |
+
|
| 50 |
+
|
| 51 |
# Initialize Wikipedia API with a user agent
|
| 52 |
+
user_agent = 'QGen/1.2'
|
| 53 |
wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
|
| 54 |
|
| 55 |
def get_session_id():
|
|
|
|
| 139 |
|
| 140 |
""")
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
def get_pdf_text(pdf_file):
|
| 143 |
doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
|
| 144 |
text = ""
|
|
|
|
| 146 |
page = doc.load_page(page_num)
|
| 147 |
text += page.get_text()
|
| 148 |
return text
|
| 149 |
+
def save_feedback(question, answer, rating, options, context):
|
| 150 |
feedback_file = 'question_feedback.json'
|
| 151 |
if os.path.exists(feedback_file):
|
| 152 |
with open(feedback_file, 'r') as f:
|
|
|
|
| 156 |
tpl = {
|
| 157 |
'question' : question,
|
| 158 |
'answer' : answer,
|
| 159 |
+
'context' : context,
|
| 160 |
+
'options' : options,
|
| 161 |
'rating' : rating,
|
| 162 |
}
|
| 163 |
# feedback_data[question] = rating
|
|
|
|
| 196 |
|
| 197 |
# Function to extract keywords using combined techniques
|
| 198 |
def extract_keywords(text, extract_all):
|
| 199 |
+
try:
|
| 200 |
+
doc = nlp(text)
|
| 201 |
+
spacy_keywords = set([ent.text for ent in doc.ents])
|
| 202 |
+
spacy_entities = spacy_keywords
|
| 203 |
+
print(f"\n\nSpacy Entities: {spacy_entities} \n\n")
|
| 204 |
+
|
| 205 |
+
# Use Only Spacy Entities
|
| 206 |
+
if extract_all is False:
|
| 207 |
+
return list(spacy_entities)
|
| 208 |
+
|
| 209 |
+
# Use RAKE
|
| 210 |
+
rake = Rake()
|
| 211 |
+
rake.extract_keywords_from_text(text)
|
| 212 |
+
rake_keywords = set(rake.get_ranked_phrases())
|
| 213 |
+
print(f"\n\nRake Keywords: {rake_keywords} \n\n")
|
| 214 |
+
# Use spaCy for NER and POS tagging
|
| 215 |
+
spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
|
| 216 |
+
print(f"\n\nSpacy Keywords: {spacy_keywords} \n\n")
|
| 217 |
+
# Use TF-IDF
|
| 218 |
+
vectorizer = TfidfVectorizer(stop_words='english')
|
| 219 |
+
X = vectorizer.fit_transform([text])
|
| 220 |
+
tfidf_keywords = set(vectorizer.get_feature_names_out())
|
| 221 |
+
print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
|
| 222 |
+
|
| 223 |
+
# Combine all keywords
|
| 224 |
+
combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
|
| 225 |
+
|
| 226 |
+
return list(combined_keywords)
|
| 227 |
+
except Exception as e:
|
| 228 |
+
raise QuestionGenerationError(f"Error in keyword extraction: {str(e)}")
|
| 229 |
|
| 230 |
def get_similar_words_sense2vec(word, n=3):
|
| 231 |
# Try to find the word with its most likely part-of-speech
|
|
|
|
| 320 |
return None
|
| 321 |
|
| 322 |
async def generate_question_async(context, answer, num_beams):
|
| 323 |
+
try:
|
| 324 |
+
input_text = f"<context> {context} <answer> {answer}"
|
| 325 |
+
print(f"\n{input_text}\n")
|
| 326 |
+
input_ids = tokenizer.encode(input_text, return_tensors='pt')
|
| 327 |
+
outputs = await asyncio.to_thread(model.generate, input_ids, num_beams=num_beams, early_stopping=True, max_length=250)
|
| 328 |
+
question = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 329 |
+
print(f"\n{question}\n")
|
| 330 |
+
return question
|
| 331 |
+
except Exception as e:
|
| 332 |
+
raise QuestionGenerationError(f"Error in question generation: {str(e)}")
|
| 333 |
|
| 334 |
async def generate_options_async(answer, context, n=3):
|
| 335 |
+
try:
|
| 336 |
+
options = [answer]
|
| 337 |
+
|
| 338 |
+
# Add contextually relevant words using a pre-trained model
|
| 339 |
+
context_embedding = await asyncio.to_thread(context_model.encode, context)
|
| 340 |
+
answer_embedding = await asyncio.to_thread(context_model.encode, answer)
|
| 341 |
+
context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
|
| 343 |
+
# Compute similarity scores and sort context words
|
| 344 |
+
similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
|
| 345 |
+
sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
|
| 346 |
+
options.extend(sorted_context_words[:n])
|
| 347 |
+
|
| 348 |
+
# Try to get similar words based on sense2vec
|
| 349 |
+
similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
|
| 350 |
+
options.extend(similar_words)
|
| 351 |
+
|
| 352 |
+
# If we don't have enough options, try synonyms
|
| 353 |
+
if len(options) < n + 1:
|
| 354 |
+
synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
|
| 355 |
+
options.extend(synonyms)
|
| 356 |
+
|
| 357 |
+
# Ensure we have the correct number of unique options
|
| 358 |
+
options = list(dict.fromkeys(options))[:n+1]
|
| 359 |
+
|
| 360 |
+
# Shuffle the options
|
| 361 |
+
random.shuffle(options)
|
| 362 |
+
|
| 363 |
+
return options
|
| 364 |
+
except Exception as e:
|
| 365 |
+
raise QuestionGenerationError(f"Error in generating options: {str(e)}")
|
| 366 |
|
| 367 |
|
| 368 |
# Function to generate questions using beam search
|
| 369 |
async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
|
| 370 |
+
try:
|
| 371 |
+
batches = segment_text(text)
|
| 372 |
+
keywords = extract_keywords(text, extract_all_keywords)
|
| 373 |
+
all_questions = []
|
| 374 |
+
|
| 375 |
+
progress_bar = st.progress(0)
|
| 376 |
+
status_text = st.empty()
|
| 377 |
+
|
| 378 |
+
for i, batch in enumerate(batches):
|
| 379 |
+
status_text.text(f"Processing batch {i+1} of {len(batches)}...")
|
| 380 |
+
batch_questions = await process_batch(batch, keywords, context_window_size, num_beams)
|
| 381 |
+
all_questions.extend(batch_questions)
|
| 382 |
+
progress_bar.progress((i + 1) / len(batches))
|
| 383 |
+
|
| 384 |
+
if len(all_questions) >= num_questions:
|
| 385 |
+
break
|
| 386 |
+
|
| 387 |
+
progress_bar.empty()
|
| 388 |
+
status_text.empty()
|
| 389 |
+
|
| 390 |
+
return all_questions[:num_questions]
|
| 391 |
+
except QuestionGenerationError as e:
|
| 392 |
+
st.error(f"An error occurred during question generation: {str(e)}")
|
| 393 |
+
return []
|
| 394 |
+
except Exception as e:
|
| 395 |
+
st.error(f"An unexpected error occurred: {str(e)}")
|
| 396 |
+
return []
|
| 397 |
|
| 398 |
async def process_batch(batch, keywords, context_window_size, num_beams):
|
| 399 |
questions = []
|
|
|
|
| 506 |
|
| 507 |
text = None
|
| 508 |
if input_type == "Text Input":
|
| 509 |
+
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.", help="Enter or paste your text here")
|
| 510 |
elif input_type == "Upload PDF":
|
| 511 |
file = st.file_uploader("Upload PDF Files")
|
| 512 |
if file is not None:
|
| 513 |
+
try:
|
| 514 |
+
text = get_pdf_text(file)
|
| 515 |
+
except Exception as e:
|
| 516 |
+
st.error(f"Error reading PDF file: {str(e)}")
|
| 517 |
+
text = None
|
| 518 |
if text:
|
| 519 |
text = clean_text(text)
|
| 520 |
generate_questions_button = st.button("Generate Questions")
|
| 521 |
+
st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
|
| 522 |
+
|
| 523 |
# if generate_questions_button:
|
| 524 |
if generate_questions_button and text:
|
| 525 |
start_time = time.time()
|
| 526 |
with st.spinner("Generating questions..."):
|
| 527 |
+
try:
|
| 528 |
+
state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
|
| 529 |
+
if not state['generated_questions']:
|
| 530 |
+
st.warning("No questions were generated. The text might be too short or lack suitable content.")
|
| 531 |
+
else:
|
| 532 |
+
st.success(f"Successfully generated {len(state['generated_questions'])} questions!")
|
| 533 |
+
except QuestionGenerationError as e:
|
| 534 |
+
st.error(f"An error occurred during question generation: {str(e)}")
|
| 535 |
+
except Exception as e:
|
| 536 |
+
st.error(f"An unexpected error occurred: {str(e)}")
|
| 537 |
+
|
| 538 |
print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
|
| 539 |
data = get_state(session_id)
|
| 540 |
print(data)
|
|
|
|
| 572 |
# q['context'] = st.text_area(f"Edit Context {i+1}:", value=q['context'], key=f"context_{i}")
|
| 573 |
if enable_feedback_mode:
|
| 574 |
q['question'] = st.text_input(f"Edit Question {i+1}:", value=q['question'], key=f"question_{i}")
|
| 575 |
+
q['rating'] = st.select_slider(f"Rate this question (1-5)", options=[1, 2, 3, 4, 5], key=f"rating_{i}")
|
| 576 |
if st.button(f"Submit Feedback for Question {i+1}", key=f"submit_{i}"):
|
| 577 |
+
save_feedback(q['question'], q['answer'], q['rating'], q['options'], q['context'])
|
| 578 |
st.success(f"Feedback submitted for Question {i+1}")
|
| 579 |
st.write("---")
|
| 580 |
|
|
|
|
| 630 |
print("********************************************************************************")
|
| 631 |
|
| 632 |
if __name__ == '__main__':
|
| 633 |
+
try:
|
| 634 |
+
main()
|
| 635 |
+
except Exception as e:
|
| 636 |
+
st.error(f"An unexpected error occurred: {str(e)}")
|
| 637 |
+
st.error("Please try refreshing the page. If the problem persists, contact support.")
|