# Question_tagger / app.py
# Hugging Face Space by afkdark (commit e59e5d7) — paragraph-to-questions generator.
import nltk
import re
import random
import os
import gradio as gr
# Set NLTK data path to a writable location in Hugging Face environment.
# The default NLTK search paths may be read-only inside the Space container,
# so downloads go to ./nltk_data and that directory is added to the search path.
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)
# Explicitly download both punkt and punkt_tab resources
def ensure_nltk_resources():
    """Download the NLTK resources this app needs, skipping ones already present.

    Failures are logged and swallowed so the app can still start; the
    tokenization code below has its own regex-based fallback.
    """
    # Map each resource to the data category it is installed under.
    # Bug fix: the original looked every resource up under 'tokenizers/',
    # which always failed for taggers/chunkers/corpora and forced a
    # fresh download on every startup.
    resources = {
        'punkt': 'tokenizers',
        'punkt_tab': 'tokenizers',  # required by newer NLTK punkt loaders
        'averaged_perceptron_tagger': 'taggers',
        'maxent_ne_chunker': 'chunkers',
        'words': 'corpora',
    }
    for resource, category in resources.items():
        try:
            # First check if already downloaded
            try:
                nltk.data.find(f'{category}/{resource}')
                print(f"Resource {resource} already downloaded")
            except LookupError:
                print(f"Downloading {resource}...")
                nltk.download(resource, download_dir=nltk_data_path)
                print(f"Downloaded {resource}")
        except Exception as e:
            # Best-effort: log and continue so one bad resource doesn't
            # prevent the app from starting.
            print(f"Warning: Could not download {resource}: {str(e)}")
# Ensure resources are downloaded before proceeding
print("Setting up NLTK resources...")
ensure_nltk_resources()
# Simple sentence tokenizer as fallback
def simple_sentence_tokenizer(text):
    """A simpler fallback sentence tokenizer.

    Splits on whitespace that follows '.', '!' or '?' and drops empty
    pieces; used when NLTK's sent_tokenize is unavailable.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    return [piece for piece in pieces if piece]
def get_named_entities(text):
    """Extract named entities from text with error handling.

    Returns a list of (entity_text, label) tuples; returns [] whenever
    NER cannot run (missing NLTK data, import failure, or any other error).
    """
    try:
        from nltk.tag import pos_tag
        from nltk.chunk import ne_chunk
        from nltk.tree import Tree

        tagged_tokens = pos_tag(nltk.word_tokenize(text))
        found = []
        for subtree in ne_chunk(tagged_tokens):
            if not isinstance(subtree, Tree):
                continue
            phrase = ' '.join(word for word, _tag in subtree.leaves())
            found.append((phrase, subtree.label()))
        return found
    except Exception as e:
        print(f"Named entity recognition failed: {str(e)}")
        return []
def generate_question_from_sentence(sentence):
    """Generate a question from a sentence with improved question formation."""
    try:
        # Drop a single trailing terminator so it doesn't leak into the question.
        sentence = re.sub(r'[.!?]$', '', sentence)

        # Be-verb statements become yes/no questions via subject-verb inversion.
        match = re.search(r'^(.*?)\s(is|was|were|are|am)\s(.*?)$', sentence, re.IGNORECASE)
        if match:
            subject, verb, remainder = match.group(1), match.group(2), match.group(3)
            return f"{verb.capitalize()} {subject} {remainder}?"

        # Same inversion trick for modal verbs.
        match = re.search(r'^(.*?)\s(can|could|will|would|should|may|might)\s(.*?)$', sentence, re.IGNORECASE)
        if match:
            subject, modal, remainder = match.group(1), match.group(2), match.group(3)
            return f"{modal.capitalize()} {subject} {remainder}?"

        # Years or month names suggest a "when" question.
        if re.search(r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence, re.IGNORECASE):
            return f"When did {sentence.lower()}?"

        # Causal connectives suggest a "why" question.
        if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b|\btherefore\b|\bhence\b', sentence, re.IGNORECASE):
            return f"Why {sentence.lower()}?"

        # Ask about the first named entity, if NER finds one.
        entities = get_named_entities(sentence)
        if entities:
            entity, entity_type = entities[0]
            if entity_type == 'PERSON':
                return f"Who is {entity}?"
            if entity_type in ['GPE', 'LOCATION']:
                return f"Where is {entity}?"
            if entity_type == 'ORGANIZATION':
                return f"What is {entity}?"
            return f"Can you tell me more about {entity}?"

        # Quantity words suggest a "how many" question.
        if re.search(r'\b(many|number of|several|few|multiple)\b', sentence, re.IGNORECASE):
            return f"How many are mentioned in the statement: '{sentence}'?"

        words = sentence.split()
        if len(words) >= 3:
            # Basic subject-verb detection on the first two tokens.
            potential_subject = words[0]
            potential_verb = words[1]  # currently unused; kept for clarity of intent
            # Common pronouns and determiners that mark a generic subject.
            pronouns = ['i', 'you', 'we', 'they', 'he', 'she', 'it', 'this', 'that', 'these', 'those', 'a', 'an', 'the']
            if potential_subject.lower() in pronouns:
                return f"What did {potential_subject.lower()} {' '.join(words[1:])}?"
            # Otherwise take the first non-stop-word as the main topic.
            stop_words = ['a', 'an', 'the', 'and', 'but', 'or', 'for', 'nor', 'on', 'at', 'to', 'from', 'by']
            content_words = [word for word in words if word.lower() not in stop_words]
            if content_words:
                main_topic = content_words[0]
                return f"What is significant about {main_topic}?"

        # More varied generic fallbacks
        question_starters = [
            "What is important about",
            "How would you describe",
            "What are the key aspects of",
            "What's notable regarding",
            "How does the text characterize",
            "What insights can be drawn from"
        ]
        return f"{random.choice(question_starters)} this: '{sentence}'?"
    except Exception as e:
        print(f"Question generation failed: {str(e)}")
        return f"What can you tell me about: '{sentence}'?"
def paragraph_to_questions(paragraph):
    """Generate questions from a paragraph.

    Splits the paragraph into sentences (NLTK when available, regex
    fallback otherwise) and produces one question per sentence of at
    least four words.
    """
    try:
        # Try the NLTK sentence tokenizer first.
        sentences = nltk.sent_tokenize(paragraph)
        print(f"NLTK tokenizer found {len(sentences)} sentences")
    except Exception as e:
        # Fall back to the regex splitter when NLTK can't tokenize.
        print(f"NLTK sentence tokenization failed: {str(e)}, using fallback")
        sentences = simple_sentence_tokenizer(paragraph)
        print(f"Fallback tokenizer found {len(sentences)} sentences")
    # Very short sentences (< 4 words) rarely yield useful questions.
    return [
        generate_question_from_sentence(sentence)
        for sentence in sentences
        if len(sentence.split()) >= 4
    ]
# Function to format the output for Gradio
def generate_questions(paragraph):
    """Gradio handler: return generated questions as a numbered string."""
    # Guard against missing or whitespace-only input.
    if not paragraph or not paragraph.strip():
        return "Please enter a paragraph to generate questions."
    print(f"Processing paragraph: {paragraph[:50]}...")
    questions = paragraph_to_questions(paragraph)
    if not questions:
        return "Could not generate any questions from this text. Try a longer or more detailed paragraph."
    numbered = [f"{index + 1}. {question}" for index, question in enumerate(questions)]
    return "\n".join(numbered)
# Create Gradio interface: a single text box in, a single text box out,
# wired to generate_questions above.
demo = gr.Interface(
    fn=generate_questions,
    inputs=gr.Textbox(lines=10, placeholder="Enter a paragraph to generate questions..."),
    outputs=gr.Textbox(label="Generated Questions"),
    title="Paragraph to Questions Generator",
    description="Enter a paragraph and the model will generate relevant questions based on the content.",
)
# For use as a module in other Hugging Face applications
def generate_questions_from_text(text):
    """Return a list of question strings for *text*.

    Thin wrapper around paragraph_to_questions for importers that want
    the raw list rather than the Gradio-formatted numbered string.
    """
    return paragraph_to_questions(text)
# Launch the app if running directly (skipped when imported as a module).
if __name__ == "__main__":
    demo.launch()