File size: 7,937 Bytes
6279673
 
 
2a4201c
6279673
 
2a4201c
 
 
 
6279673
2a4201c
 
 
 
 
 
 
 
 
6279673
2a4201c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6279673
 
0340678
2a4201c
 
 
 
0340678
 
 
 
2a4201c
0340678
 
 
 
 
 
2a4201c
 
 
0340678
 
 
 
 
2a4201c
6279673
2a4201c
 
 
 
 
 
 
 
 
0340678
 
2a4201c
0340678
 
 
2a4201c
0340678
2a4201c
0340678
 
 
 
 
 
 
 
 
 
6279673
0340678
 
 
 
 
 
 
 
2a4201c
0340678
2a4201c
0340678
 
 
 
 
 
2a4201c
 
0340678
 
2a4201c
 
 
6279673
 
 
2a4201c
 
 
 
 
 
 
 
 
 
6279673
 
 
 
 
 
 
 
 
 
 
 
 
 
2a4201c
 
 
 
6279673
2a4201c
 
 
 
6279673
 
e59e5d7
6279673
 
 
 
 
 
 
 
 
 
 
 
 
 
e59e5d7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import nltk
import re
import random
import os
import gradio as gr

# Set NLTK data path to a writable location in Hugging Face environment
# (the default system-wide NLTK data directories are not writable there).
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)  # idempotent: safe if the directory already exists
nltk.data.path.append(nltk_data_path)  # let nltk.data.find() see packages downloaded here

# Explicitly download both punkt and punkt_tab resources
def ensure_nltk_resources():
    """Download the NLTK data packages this app needs into ``nltk_data_path``,
    skipping any that are already present.

    Bug fix: the original probed every resource under ``tokenizers/``, but
    NLTK installs taggers, chunkers and corpora under different categories
    (``taggers/``, ``chunkers/``, ``corpora/``).  The probe therefore always
    raised LookupError for those resources and re-downloaded them on every
    startup.  Each resource is now checked under its actual category.

    Failures are logged as warnings, not raised: the rest of the module has
    a regex-based fallback tokenizer, so startup is best-effort.
    """
    # Resource name -> NLTK data category it is installed under.
    resources = {
        'punkt': 'tokenizers',
        'punkt_tab': 'tokenizers',  # specific resource newer NLTK tokenizers require
        'averaged_perceptron_tagger': 'taggers',
        'maxent_ne_chunker': 'chunkers',
        'words': 'corpora',
    }

    for resource, category in resources.items():
        try:
            # First check if already downloaded (under the correct category).
            try:
                nltk.data.find(f'{category}/{resource}')
                print(f"Resource {resource} already downloaded")
            except LookupError:
                print(f"Downloading {resource}...")
                nltk.download(resource, download_dir=nltk_data_path)
                print(f"Downloaded {resource}")
        except Exception as e:
            # Deliberate best-effort: log and continue so the app still starts.
            print(f"Warning: Could not download {resource}: {str(e)}")

# Ensure resources are downloaded before proceeding.
# Runs at import time so tokenizer/tagger data is in place before any request.
print("Setting up NLTK resources...")
ensure_nltk_resources()

# Simple sentence tokenizer as fallback
def simple_sentence_tokenizer(text):
    """Regex-based fallback splitter.

    Breaks *text* after each '.', '!' or '?' that is followed by
    whitespace, dropping any empty pieces.  Used when the NLTK sentence
    tokenizer is unavailable.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    return [piece for piece in pieces if piece]

def get_named_entities(text):
    """Run NLTK named-entity recognition over *text*.

    Returns a list of ``(entity_text, entity_label)`` pairs.  If any NLTK
    resource or model is unavailable, logs the failure and returns [].
    """
    try:
        from nltk.tag import pos_tag
        from nltk.chunk import ne_chunk
        from nltk.tree import Tree

        chunk_tree = ne_chunk(pos_tag(nltk.word_tokenize(text)))

        results = []
        for node in chunk_tree:
            # Named entities come back as subtrees; plain tokens as tuples.
            if isinstance(node, Tree):
                surface = ' '.join(word for word, _tag in node.leaves())
                results.append((surface, node.label()))
        return results
    except Exception as e:
        print(f"Named entity recognition failed: {str(e)}")
        return []

def generate_question_from_sentence(sentence):
    """Turn one declarative sentence into a question.

    Applies a cascade of heuristics in priority order: be-verb inversion,
    modal-verb inversion, temporal cues ("when"), causal cues ("why"),
    named entities, quantifier words ("how many"), a crude subject/topic
    split, and finally a randomly chosen generic opener.  Any unexpected
    failure falls back to a generic question about the sentence.
    """
    try:
        # Drop one trailing '.', '!' or '?' so it doesn't end up mid-question.
        sentence = re.sub(r'[.!?]$', '', sentence)

        # 1) Yes/no question via be-verb inversion.
        m = re.search(r'^(.*?)\s(is|was|were|are|am)\s(.*?)$', sentence, re.IGNORECASE)
        if m:
            return f"{m.group(2).capitalize()} {m.group(1)} {m.group(3)}?"

        # 2) Yes/no question via modal-verb inversion.
        m = re.search(r'^(.*?)\s(can|could|will|would|should|may|might)\s(.*?)$', sentence, re.IGNORECASE)
        if m:
            return f"{m.group(2).capitalize()} {m.group(1)} {m.group(3)}?"

        # 3) A year or month name suggests a "when" question.
        if re.search(r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence, re.IGNORECASE):
            return f"When did {sentence.lower()}?"

        # 4) Causal connectives suggest a "why" question.
        if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b|\btherefore\b|\bhence\b', sentence, re.IGNORECASE):
            return f"Why {sentence.lower()}?"

        # 5) Ask about the first named entity, phrased by entity type.
        entities = get_named_entities(sentence)
        if entities:
            entity, label = entities[0]
            question_by_type = {
                'PERSON': f"Who is {entity}?",
                'GPE': f"Where is {entity}?",
                'LOCATION': f"Where is {entity}?",
                'ORGANIZATION': f"What is {entity}?",
            }
            return question_by_type.get(label, f"Can you tell me more about {entity}?")

        # 6) Quantifier words suggest a "how many" question.
        if re.search(r'\b(many|number of|several|few|multiple)\b', sentence, re.IGNORECASE):
            return f"How many are mentioned in the statement: '{sentence}'?"

        # 7) Crude subject detection on the first word.
        words = sentence.split()
        if len(words) >= 3:
            subject = words[0]
            # Common pronouns and determiners.
            pronouns = ['i', 'you', 'we', 'they', 'he', 'she', 'it', 'this', 'that', 'these', 'those', 'a', 'an', 'the']
            if subject.lower() in pronouns:
                return f"What did {subject.lower()} {' '.join(words[1:])}?"
            # Otherwise take the first non-stop-word as the main topic.
            stop_words = ['a', 'an', 'the', 'and', 'but', 'or', 'for', 'nor', 'on', 'at', 'to', 'from', 'by']
            content_words = [word for word in words if word.lower() not in stop_words]
            if content_words:
                return f"What is significant about {content_words[0]}?"

        # 8) Generic fallback with a randomly picked opener for variety.
        question_starters = [
            "What is important about",
            "How would you describe",
            "What are the key aspects of",
            "What's notable regarding",
            "How does the text characterize",
            "What insights can be drawn from"
        ]
        return f"{random.choice(question_starters)} this: '{sentence}'?"

    except Exception as e:
        print(f"Question generation failed: {str(e)}")
        return f"What can you tell me about: '{sentence}'?"

def paragraph_to_questions(paragraph):
    """Split *paragraph* into sentences and build one question per sentence.

    Prefers NLTK's sentence tokenizer, falling back to the regex splitter
    if it fails.  Sentences shorter than four words are skipped.
    Returns a list of question strings.
    """
    try:
        # Try the NLTK sentence tokenizer first.
        sentences = nltk.sent_tokenize(paragraph)
        print(f"NLTK tokenizer found {len(sentences)} sentences")
    except Exception as e:
        print(f"NLTK sentence tokenization failed: {str(e)}, using fallback")
        sentences = simple_sentence_tokenizer(paragraph)
        print(f"Fallback tokenizer found {len(sentences)} sentences")

    return [
        generate_question_from_sentence(sentence)
        for sentence in sentences
        if len(sentence.split()) >= 4  # skip very short sentences
    ]

# Function to format the output for Gradio
def generate_questions(paragraph):
    """Gradio handler: return generated questions as a numbered string.

    Returns a friendly message for empty input or when no questions
    could be produced.
    """
    if not paragraph or paragraph.strip() == "":
        return "Please enter a paragraph to generate questions."

    print(f"Processing paragraph: {paragraph[:50]}...")
    questions = paragraph_to_questions(paragraph)

    if not questions:
        return "Could not generate any questions from this text. Try a longer or more detailed paragraph."

    numbered = [f"{index + 1}. {question}" for index, question in enumerate(questions)]
    return "\n".join(numbered)

# Create Gradio interface
# Single text-in / text-out interface; generate_questions does all the work.
demo = gr.Interface(
    fn=generate_questions,  # str -> str handler defined above
    inputs=gr.Textbox(lines=10, placeholder="Enter a paragraph to generate questions..."),
    outputs=gr.Textbox(label="Generated Questions"),
    title="Paragraph to Questions Generator",
    description="Enter a paragraph and the model will generate relevant questions based on the content.",
)

# For use as a module in other Hugging Face applications
def generate_questions_from_text(text):
    """Programmatic entry point for other applications.

    Thin alias over paragraph_to_questions: returns the raw list of
    question strings (not the formatted string the Gradio UI shows).
    """
    return paragraph_to_questions(text)

# Launch the app if running directly
if __name__ == "__main__":
    demo.launch()  # start the Gradio app only when executed as a script