Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
import re
|
| 3 |
+
import random
|
| 4 |
+
from nltk.tokenize import sent_tokenize
|
| 5 |
+
from nltk.tag import pos_tag
|
| 6 |
+
from nltk.chunk import ne_chunk
|
| 7 |
+
from nltk.tree import Tree
|
| 8 |
+
import gradio as gr
|
| 9 |
+
|
| 10 |
+
# Download necessary NLTK data.
# NLTK >= 3.8.2 renamed several resources (punkt -> punkt_tab, etc.), which is
# the likely cause of the Space's build error; request both the old and new
# spellings.  nltk.download() returns False for an unknown id instead of
# raising, so the extra names are harmless on older NLTK versions.
# quiet=True keeps the build log readable.
for _resource in (
    'punkt', 'punkt_tab',
    'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker', 'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(_resource, quiet=True)
|
| 15 |
+
|
| 16 |
+
def get_named_entities(text):
    """Return a list of ``(entity_text, entity_label)`` tuples found in *text*.

    The text is tokenized, POS-tagged, and NE-chunked; every ``Tree`` node in
    the resulting chunk structure corresponds to one named entity, whose
    leaf words are joined back into a single string.
    """
    tree = ne_chunk(pos_tag(nltk.word_tokenize(text)))
    return [
        (' '.join(token for token, _ in node.leaves()), node.label())
        for node in tree
        if isinstance(node, Tree)
    ]
|
| 27 |
+
|
| 28 |
+
def generate_question_from_sentence(sentence):
    """Generate a single question from a declarative sentence.

    Heuristics are tried most-specific first:
      1. "X is/was/were/are Y"      -> yes/no question ("Is X Y?")
      2. year after in/on/during,
         or a month name            -> "When did ...?"
      3. first named entity         -> "Who/Where/What is <entity>?"
      4. causal connective          -> "Why ...?"
      5. has a verb: first noun or
         pronoun taken as subject   -> "What did <subject> do?"
      6. otherwise                  -> a randomly chosen generic prompt.

    Args:
        sentence: One sentence; a trailing '.', '!' or '?' is stripped.

    Returns:
        A question string ending in '?'.
    """
    # Remove punctuation at the end so it doesn't land mid-question.
    sentence = re.sub(r'[.!?]$', '', sentence)

    # 1. Yes/no question from a copula.  BUGFIX: the original gated this with
    # a case-SENSITIVE search but matched case-insensitively, so e.g.
    # "Rome Was a republic" silently skipped this branch; matching directly
    # (case-insensitively) removes the inconsistency.
    match = re.search(r'^(.*?)\s(is|was|were|are)\s(.*?)$', sentence, re.IGNORECASE)
    if match:
        return f"{match.group(2).capitalize()} {match.group(1)} {match.group(3)}?"

    # 2. Temporal expressions: a 4-digit year after in/on/during, or a month.
    if re.search(r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence, re.IGNORECASE):
        return f"When did {sentence.lower()}?"

    # 3. Ask about the first named entity, keyed on its NE label.
    entities = get_named_entities(sentence)
    if entities:
        entity, entity_type = entities[0]
        if entity_type == 'PERSON':
            return f"Who is {entity}?"
        elif entity_type in ('GPE', 'LOCATION'):
            return f"Where is {entity}?"
        elif entity_type == 'ORGANIZATION':
            return f"What is {entity}?"

    # 4. Causal connectives ("because", "due to", "as a result") -> why.
    if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b', sentence, re.IGNORECASE):
        return f"Why {sentence.lower()}?"

    # 5. Subject/verb heuristic (simplistic: first noun/pronoun = subject).
    pos_tags = pos_tag(nltk.word_tokenize(sentence))
    if any(tag.startswith('VB') for _, tag in pos_tags):
        subject = next(
            (word for word, tag in pos_tags
             if tag.startswith('NN') or tag.startswith('PRP')),
            "",
        )
        if subject:
            # Pronouns that are conventionally lower-case get lower-cased.
            if subject.lower() in ('i', 'you', 'we', 'they'):
                subject = subject.lower()
            return f"What did {subject} do?"
        # Has a verb but no identifiable subject: generic "what" question.
        return f"What {sentence.lower()}?"

    # 6. Very generic fallback.
    question_starters = [
        "What is described in",
        "What is mentioned about",
        "Can you explain",
        "Could you elaborate on"
    ]
    return f"{random.choice(question_starters)} the statement: '{sentence}'?"
|
| 94 |
+
|
| 95 |
+
def paragraph_to_questions(paragraph):
    """Split *paragraph* into sentences and generate one question per sentence.

    Sentences shorter than four words are skipped as too thin to question.
    """
    return [
        generate_question_from_sentence(candidate)
        for candidate in sent_tokenize(paragraph)
        if len(candidate.split()) >= 4
    ]
|
| 109 |
+
|
| 110 |
+
# Format the generated questions as a numbered list for the Gradio textbox.
def generate_questions(paragraph):
    """Return the paragraph's questions as a numbered, newline-joined string."""
    numbered = [
        f"{num}. {question}"
        for num, question in enumerate(paragraph_to_questions(paragraph), start=1)
    ]
    return "\n".join(numbered)
|
| 114 |
+
|
| 115 |
+
# Create Gradio interface: one textbox in (the paragraph), one textbox out
# (the numbered question list produced by generate_questions).
demo = gr.Interface(
    fn=generate_questions,
    inputs=gr.Textbox(lines=10, placeholder="Enter a paragraph to generate questions..."),
    outputs=gr.Textbox(label="Generated Questions"),
    title="Paragraph to Questions Generator",
    description="Enter a paragraph and the model will generate relevant questions based on the content.",
    # Clickable example inputs shown below the interface.
    examples=[
        ["Artificial intelligence has revolutionized many industries. Companies like Google and OpenAI are investing billions in research. The field continues to grow rapidly, with new breakthroughs announced every month. Concerns about ethics and regulation remain important topics of discussion."],
        ["The Great Barrier Reef is the world's largest coral reef system. It is located off the coast of Queensland, Australia. The reef is home to thousands of species of marine life. Climate change poses a significant threat to its survival."]
    ]
)
|
| 127 |
+
|
| 128 |
+
# For use as a module in other Hugging Face applications
def generate_questions_from_text(text):
    """Programmatic entry point: return the list of questions for *text*."""
    questions = paragraph_to_questions(text)
    return questions
|
| 131 |
+
|
| 132 |
+
# Launch the app if running directly
if __name__ == "__main__":
    # Starts the Gradio server; skipped when this file is imported as a module.
    demo.launch()
|