# Hugging Face Space: paragraph-to-questions generator (Gradio app).
import nltk
import re
import random
import os
import gradio as gr
# Set NLTK data path to a writable location in Hugging Face environment.
# Spaces containers only guarantee write access under the working directory,
# so resources are stored in ./nltk_data and that directory is added to
# nltk's search path before any download/lookup happens.
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)  # idempotent: safe on restart
nltk.data.path.append(nltk_data_path)
# Explicitly download both punkt and punkt_tab resources
def ensure_nltk_resources():
    """Download the NLTK resources this app needs into nltk_data_path.

    Each resource is first looked up under its correct data category so
    already-installed resources are not re-downloaded. Download failures
    are reported as warnings instead of raised, so the app can still
    start and fall back to the regex-based sentence splitter.
    """
    # Map each resource to the category nltk.data.find() expects.
    # BUG FIX: the original searched everything under 'tokenizers/', so
    # find() always failed for taggers/chunkers/corpora and forced a
    # re-download of those resources on every startup.
    resources = {
        'punkt': 'tokenizers',
        'punkt_tab': 'tokenizers',  # required by newer NLTK punkt loaders
        'averaged_perceptron_tagger': 'taggers',
        'maxent_ne_chunker': 'chunkers',
        'words': 'corpora',
    }
    for resource, category in resources.items():
        try:
            # First check if already downloaded
            try:
                nltk.data.find(f'{category}/{resource}')
                print(f"Resource {resource} already downloaded")
            except LookupError:
                print(f"Downloading {resource}...")
                nltk.download(resource, download_dir=nltk_data_path)
                print(f"Downloaded {resource}")
        except Exception as e:
            # Best-effort: log and continue so one bad resource doesn't
            # prevent the app from starting.
            print(f"Warning: Could not download {resource}: {str(e)}")
# Ensure resources are downloaded before proceeding: runs at import time
# so tokenizer/tagger/chunker data is in place before the first request.
print("Setting up NLTK resources...")
ensure_nltk_resources()
# Simple sentence tokenizer as fallback
def simple_sentence_tokenizer(text):
    """Fallback sentence splitter used when NLTK's tokenizer is unavailable.

    Splits *text* on any whitespace that directly follows '.', '!' or '?'
    (lookbehind keeps the punctuation attached to its sentence) and drops
    empty fragments produced by the split.
    """
    return [fragment for fragment in re.split(r'(?<=[.!?])\s+', text) if fragment]
def get_named_entities(text):
    """Return (entity_text, entity_label) pairs recognised in *text*.

    Tokenizes, POS-tags and NE-chunks the text with NLTK, then collects
    every named-entity subtree as a joined string plus its label
    (e.g. PERSON, GPE, ORGANIZATION). Any failure — missing NLTK data,
    tagger errors — is logged and an empty list is returned so callers
    never crash.
    """
    try:
        from nltk.tag import pos_tag
        from nltk.chunk import ne_chunk
        from nltk.tree import Tree

        chunk_tree = ne_chunk(pos_tag(nltk.word_tokenize(text)))
        # Named entities appear as Tree nodes; plain tokens are tuples.
        return [
            (' '.join(word for word, _tag in subtree.leaves()), subtree.label())
            for subtree in chunk_tree
            if isinstance(subtree, Tree)
        ]
    except Exception as e:
        print(f"Named entity recognition failed: {str(e)}")
        return []
def generate_question_from_sentence(sentence):
    """Generate a question from a sentence with improved question formation.

    Applies a fixed cascade of heuristics — the FIRST matching rule wins,
    so the ordering of the checks below is significant:
      1. be-verb inversion (yes/no question)
      2. modal-verb inversion
      3. temporal markers -> "When did ...?"
      4. causal markers -> "Why ...?"
      5. named entities -> who/where/what
      6. quantity words -> "How many ...?"
      7. subject/topic guess -> "What did/What is significant ...?"
      8. random generic fallback
    Any exception falls through to a generic question, so this never raises.
    """
    try:
        # Remove punctuation at the end
        sentence = re.sub(r'[.!?]$', '', sentence)
        # Convert statements with be-verbs into yes/no questions,
        # e.g. "The sky is blue" -> "Is The sky blue?"
        # (non-greedy group(1) captures up to the FIRST be-verb)
        be_verb_pattern = re.search(r'^(.*?)\s(is|was|were|are|am)\s(.*?)$', sentence, re.IGNORECASE)
        if be_verb_pattern:
            return f"{be_verb_pattern.group(2).capitalize()} {be_verb_pattern.group(1)} {be_verb_pattern.group(3)}?"
        # Check for modal verbs and invert them the same way
        modal_pattern = re.search(r'^(.*?)\s(can|could|will|would|should|may|might)\s(.*?)$', sentence, re.IGNORECASE)
        if modal_pattern:
            return f"{modal_pattern.group(2).capitalize()} {modal_pattern.group(1)} {modal_pattern.group(3)}?"
        # Check for sentences with temporal markers (a year after in/on/during,
        # or a month name anywhere in the sentence)
        if re.search(r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence, re.IGNORECASE):
            return f"When did {sentence.lower()}?"
        # Check for causal relationships
        if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b|\btherefore\b|\bhence\b', sentence, re.IGNORECASE):
            return f"Why {sentence.lower()}?"
        # Try to get named entities (best-effort; returns [] on failure)
        entities = get_named_entities(sentence)
        # If there are named entities, ask about the first one only
        if entities:
            entity, entity_type = entities[0]
            if entity_type == 'PERSON':
                return f"Who is {entity}?"
            elif entity_type in ['GPE', 'LOCATION']:
                return f"Where is {entity}?"
            elif entity_type == 'ORGANIZATION':
                return f"What is {entity}?"
            else:
                return f"Can you tell me more about {entity}?"
        # Check for quantifiable content
        if re.search(r'\b(many|number of|several|few|multiple)\b', sentence, re.IGNORECASE):
            return f"How many are mentioned in the statement: '{sentence}'?"
        # Look for action verbs to create "what" questions
        words = sentence.split()
        if len(words) >= 3:
            # Basic subject-verb detection: naively treat the first two
            # words as subject and verb
            potential_subject = words[0]
            potential_verb = words[1]  # NOTE(review): unused — kept for byte-compat
            # Common pronouns and determiners
            pronouns = ['i', 'you', 'we', 'they', 'he', 'she', 'it', 'this', 'that', 'these', 'those', 'a', 'an', 'the']
            if potential_subject.lower() in pronouns:
                return f"What did {potential_subject.lower()} {' '.join(words[1:])}?"
            else:
                # Try to identify the main topic
                # First, remove common stop words
                stop_words = ['a', 'an', 'the', 'and', 'but', 'or', 'for', 'nor', 'on', 'at', 'to', 'from', 'by']
                content_words = [word for word in words if word.lower() not in stop_words]
                if content_words:
                    main_topic = content_words[0]
                    return f"What is significant about {main_topic}?"
        # More varied generic fallbacks — chosen at random, so output is
        # non-deterministic when no heuristic above matched
        question_starters = [
            "What is important about",
            "How would you describe",
            "What are the key aspects of",
            "What's notable regarding",
            "How does the text characterize",
            "What insights can be drawn from"
        ]
        return f"{random.choice(question_starters)} this: '{sentence}'?"
    except Exception as e:
        # Last-resort fallback: never let a single sentence break the batch
        print(f"Question generation failed: {str(e)}")
        return f"What can you tell me about: '{sentence}'?"
def paragraph_to_questions(paragraph):
    """Split *paragraph* into sentences and build one question per sentence.

    Prefers NLTK's sentence tokenizer and falls back to the regex-based
    splitter if it raises. Sentences with fewer than four words are
    skipped before question generation.
    """
    try:
        # Try the NLTK sentence tokenizer first
        sentences = nltk.sent_tokenize(paragraph)
        print(f"NLTK tokenizer found {len(sentences)} sentences")
    except Exception as e:
        print(f"NLTK sentence tokenization failed: {str(e)}, using fallback")
        sentences = simple_sentence_tokenizer(paragraph)
        print(f"Fallback tokenizer found {len(sentences)} sentences")
    return [
        generate_question_from_sentence(sentence)
        for sentence in sentences
        if len(sentence.split()) >= 4  # skip very short sentences
    ]
# Function to format the output for Gradio
def generate_questions(paragraph):
    """Gradio handler: turn a paragraph into a numbered list of questions.

    Returns a user-facing message string for empty input or when no
    questions could be generated; otherwise one "N. question" per line.
    """
    if not paragraph or not paragraph.strip():
        return "Please enter a paragraph to generate questions."
    print(f"Processing paragraph: {paragraph[:50]}...")
    questions = paragraph_to_questions(paragraph)
    if not questions:
        return "Could not generate any questions from this text. Try a longer or more detailed paragraph."
    numbered = (f"{index + 1}. {question}" for index, question in enumerate(questions))
    return "\n".join(numbered)
# Create Gradio interface: single textbox in, single textbox out,
# backed by generate_questions. Defined at module level so Spaces can
# pick it up as `demo`.
demo = gr.Interface(
    fn=generate_questions,
    inputs=gr.Textbox(lines=10, placeholder="Enter a paragraph to generate questions..."),
    outputs=gr.Textbox(label="Generated Questions"),
    title="Paragraph to Questions Generator",
    description="Enter a paragraph and the model will generate relevant questions based on the content.",
)
# For use as a module in other Hugging Face applications
def generate_questions_from_text(text):
    """Programmatic API: return the raw list of question strings for *text*.

    Unlike generate_questions(), this returns a list rather than a
    formatted display string.
    """
    return paragraph_to_questions(text)
# Launch the app if running directly (Spaces imports the module and uses
# `demo` itself). Fix: removed a stray trailing "|" left over from page
# extraction that made this line a syntax error.
if __name__ == "__main__":
    demo.launch()