# Question_tagger / app.py
# Hugging Face Space by afkdark (commit e59e5d7) — paragraph-to-questions generator.
import nltk
import re
import random
import os
import gradio as gr
# Set NLTK data path to a writable location in Hugging Face environment.
# The default NLTK search paths may be read-only inside the Space container,
# so downloads go to ./nltk_data and that directory is added to the search path.
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)
# Explicitly download both punkt and punkt_tab resources
def ensure_nltk_resources():
    """Download the NLTK resources this app needs, skipping ones already present.

    Failures are logged and swallowed so the app can still start; the
    tokenization code below has its own regex-based fallback.
    """
    # Map each resource to the data category it is installed under.
    # Bug fix: the original looked every resource up under 'tokenizers/',
    # which always failed for taggers/chunkers/corpora and forced a
    # fresh download on every startup.
    resources = {
        'punkt': 'tokenizers',
        'punkt_tab': 'tokenizers',  # required by newer NLTK punkt loaders
        'averaged_perceptron_tagger': 'taggers',
        'maxent_ne_chunker': 'chunkers',
        'words': 'corpora',
    }
    for resource, category in resources.items():
        try:
            # First check if already downloaded
            try:
                nltk.data.find(f'{category}/{resource}')
                print(f"Resource {resource} already downloaded")
            except LookupError:
                print(f"Downloading {resource}...")
                nltk.download(resource, download_dir=nltk_data_path)
                print(f"Downloaded {resource}")
        except Exception as e:
            # Best-effort: log and continue so one bad resource doesn't
            # prevent the app from starting.
            print(f"Warning: Could not download {resource}: {str(e)}")
# Ensure resources are downloaded before proceeding
print("Setting up NLTK resources...")
ensure_nltk_resources()
# Simple sentence tokenizer as fallback
def simple_sentence_tokenizer(text):
    """A simpler fallback sentence tokenizer.

    Splits on whitespace that follows '.', '!' or '?' and drops empty
    pieces; used when NLTK's sent_tokenize is unavailable.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    return [piece for piece in pieces if piece]
def get_named_entities(text):
    """Extract named entities from text with error handling.

    Returns a list of (entity_text, label) tuples; returns [] whenever
    NER cannot run (missing NLTK data, import failure, or any other error).
    """
    try:
        from nltk.tag import pos_tag
        from nltk.chunk import ne_chunk
        from nltk.tree import Tree

        tagged_tokens = pos_tag(nltk.word_tokenize(text))
        found = []
        for subtree in ne_chunk(tagged_tokens):
            if not isinstance(subtree, Tree):
                continue
            phrase = ' '.join(word for word, _tag in subtree.leaves())
            found.append((phrase, subtree.label()))
        return found
    except Exception as e:
        print(f"Named entity recognition failed: {str(e)}")
        return []
def generate_question_from_sentence(sentence):
    """Generate a question from a sentence with improved question formation."""
    try:
        # Drop a single trailing terminator so it doesn't leak into the question.
        sentence = re.sub(r'[.!?]$', '', sentence)

        # Be-verb statements become yes/no questions via subject-verb inversion.
        match = re.search(r'^(.*?)\s(is|was|were|are|am)\s(.*?)$', sentence, re.IGNORECASE)
        if match:
            subject, verb, remainder = match.group(1), match.group(2), match.group(3)
            return f"{verb.capitalize()} {subject} {remainder}?"

        # Same inversion trick for modal verbs.
        match = re.search(r'^(.*?)\s(can|could|will|would|should|may|might)\s(.*?)$', sentence, re.IGNORECASE)
        if match:
            subject, modal, remainder = match.group(1), match.group(2), match.group(3)
            return f"{modal.capitalize()} {subject} {remainder}?"

        # Years or month names suggest a "when" question.
        if re.search(r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence, re.IGNORECASE):
            return f"When did {sentence.lower()}?"

        # Causal connectives suggest a "why" question.
        if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b|\btherefore\b|\bhence\b', sentence, re.IGNORECASE):
            return f"Why {sentence.lower()}?"

        # Ask about the first named entity, if NER finds one.
        entities = get_named_entities(sentence)
        if entities:
            entity, entity_type = entities[0]
            if entity_type == 'PERSON':
                return f"Who is {entity}?"
            if entity_type in ['GPE', 'LOCATION']:
                return f"Where is {entity}?"
            if entity_type == 'ORGANIZATION':
                return f"What is {entity}?"
            return f"Can you tell me more about {entity}?"

        # Quantity words suggest a "how many" question.
        if re.search(r'\b(many|number of|several|few|multiple)\b', sentence, re.IGNORECASE):
            return f"How many are mentioned in the statement: '{sentence}'?"

        words = sentence.split()
        if len(words) >= 3:
            # Basic subject-verb detection on the first two tokens.
            potential_subject = words[0]
            potential_verb = words[1]  # currently unused; kept for clarity of intent
            # Common pronouns and determiners that mark a generic subject.
            pronouns = ['i', 'you', 'we', 'they', 'he', 'she', 'it', 'this', 'that', 'these', 'those', 'a', 'an', 'the']
            if potential_subject.lower() in pronouns:
                return f"What did {potential_subject.lower()} {' '.join(words[1:])}?"
            # Otherwise take the first non-stop-word as the main topic.
            stop_words = ['a', 'an', 'the', 'and', 'but', 'or', 'for', 'nor', 'on', 'at', 'to', 'from', 'by']
            content_words = [word for word in words if word.lower() not in stop_words]
            if content_words:
                main_topic = content_words[0]
                return f"What is significant about {main_topic}?"

        # More varied generic fallbacks
        question_starters = [
            "What is important about",
            "How would you describe",
            "What are the key aspects of",
            "What's notable regarding",
            "How does the text characterize",
            "What insights can be drawn from"
        ]
        return f"{random.choice(question_starters)} this: '{sentence}'?"
    except Exception as e:
        print(f"Question generation failed: {str(e)}")
        return f"What can you tell me about: '{sentence}'?"
def paragraph_to_questions(paragraph):
    """Generate questions from a paragraph.

    Splits the paragraph into sentences (NLTK when available, regex
    fallback otherwise) and produces one question per sentence of at
    least four words.
    """
    try:
        # Try the NLTK sentence tokenizer first.
        sentences = nltk.sent_tokenize(paragraph)
        print(f"NLTK tokenizer found {len(sentences)} sentences")
    except Exception as e:
        # Fall back to the regex splitter when NLTK can't tokenize.
        print(f"NLTK sentence tokenization failed: {str(e)}, using fallback")
        sentences = simple_sentence_tokenizer(paragraph)
        print(f"Fallback tokenizer found {len(sentences)} sentences")
    # Very short sentences (< 4 words) rarely yield useful questions.
    return [
        generate_question_from_sentence(sentence)
        for sentence in sentences
        if len(sentence.split()) >= 4
    ]
# Function to format the output for Gradio
def generate_questions(paragraph):
    """Gradio handler: return generated questions as a numbered string."""
    # Guard against missing or whitespace-only input.
    if not paragraph or not paragraph.strip():
        return "Please enter a paragraph to generate questions."
    print(f"Processing paragraph: {paragraph[:50]}...")
    questions = paragraph_to_questions(paragraph)
    if not questions:
        return "Could not generate any questions from this text. Try a longer or more detailed paragraph."
    numbered = [f"{index + 1}. {question}" for index, question in enumerate(questions)]
    return "\n".join(numbered)
# Create Gradio interface: a single text box in, a single text box out,
# wired to generate_questions above.
demo = gr.Interface(
    fn=generate_questions,
    inputs=gr.Textbox(lines=10, placeholder="Enter a paragraph to generate questions..."),
    outputs=gr.Textbox(label="Generated Questions"),
    title="Paragraph to Questions Generator",
    description="Enter a paragraph and the model will generate relevant questions based on the content.",
)
# For use as a module in other Hugging Face applications
def generate_questions_from_text(text):
    """Return a list of question strings for *text*.

    Thin wrapper around paragraph_to_questions for importers that want
    the raw list rather than the Gradio-formatted numbered string.
    """
    return paragraph_to_questions(text)
# Launch the app if running directly (skipped when imported as a module).
if __name__ == "__main__":
    demo.launch()