Spaces:

afkdark
/

Question_tagger

Build error

App Files Community

afkdark committed on Apr 1, 2025

Commit

2a4201c

verified ·

1 Parent(s): bfa3523

Update app.py

Browse files

Files changed (1) hide show

app.py +132 -82

app.py CHANGED Viewed

@@ -1,100 +1,142 @@
 import nltk
 import re
 import random
-from nltk.tokenize import sent_tokenize
-from nltk.tag import pos_tag
-from nltk.chunk import ne_chunk
-from nltk.tree import Tree
 import gradio as gr
-# Download necessary NLTK data
-nltk.download('punkt')
-nltk.download('averaged_perceptron_tagger')
-nltk.download('maxent_ne_chunker')
-nltk.download('words')
-def get_named_entities(text):
-    """Extract named entities from text."""
-    chunked = ne_chunk(pos_tag(nltk.word_tokenize(text)))
-    named_entities = []
-    for chunk in chunked:
-        if isinstance(chunk, Tree):
-            entity = ' '.join([word for word, tag in chunk.leaves()])
-            named_entities.append((entity, chunk.label()))
-    return named_entities
 def generate_question_from_sentence(sentence):
-    """Generate a question from a sentence."""
-    # Remove punctuation at the end
-    sentence = re.sub(r'[.!?]$', '', sentence)
-    # Check for common patterns that can be turned into questions
-    if re.search(r'\bis\s|\bwas\s|\bwere\s|\bare\s', sentence):
-        # Convert statements with "is", "was", "were", "are" into yes/no questions
-        match = re.search(r'^(.*?)\s(is|was|were|are)\s(.*?)$', sentence, re.IGNORECASE)
-        if match:
-            return f"{match.group(2).capitalize()} {match.group(1)} {match.group(3)}?"
-    # Check for sentences with dates or years
-    if re.search(r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence, re.IGNORECASE):
-        return f"When did {sentence.lower()}?"
-    # Get named entities
-    entities = get_named_entities(sentence)
-    # If there are named entities, ask about them
-    if entities:
-        entity, entity_type = entities[0]
-        if entity_type == 'PERSON':
-            return f"Who is {entity}?"
-        elif entity_type in ['GPE', 'LOCATION']:
-            return f"Where is {entity}?"
-        elif entity_type == 'ORGANIZATION':
-            return f"What is {entity}?"
-    # Check for sentences with "because", "due to", "as a result"
-    if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b', sentence, re.IGNORECASE):
-        return f"Why {sentence.lower()}?"
-    # Default questions based on sentence structure
-    words = nltk.word_tokenize(sentence)
-    pos_tags = pos_tag(words)
-    # Check if sentence has a verb
-    has_verb = any(tag.startswith('VB') for _, tag in pos_tags)
-    if has_verb:
-        # Extract subject (simplistic approach)
-        subject = ""
-        for word, tag in pos_tags:
-            if tag.startswith('NN') or tag.startswith('PRP'):
-                subject = word
-                break
-        if subject:
-            if subject.lower() in ['i', 'you', 'we', 'they']:
                 return f"What did {subject.lower()} do?"
             else:
-                return f"What did {subject} do?"
-        else:
-            # Fallback to "what" question
-            return f"What {sentence.lower()}?"
-    # Very generic fallback
-    question_starters = [
-        "What is described in",
-        "What is mentioned about",
-        "Can you explain",
-        "Could you elaborate on"
-    ]
-    return f"{random.choice(question_starters)} the statement: '{sentence}'?"
 def paragraph_to_questions(paragraph):
     """Generate questions from a paragraph."""
-    sentences = sent_tokenize(paragraph)
     questions = []
     for sentence in sentences:
@@ -109,7 +151,15 @@ def paragraph_to_questions(paragraph):
 # Function to format the output for Gradio
 def generate_questions(paragraph):
     questions = paragraph_to_questions(paragraph)
     return "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
 # Create Gradio interface

 import nltk
 import re
 import random
+import os
 import gradio as gr
+# Set NLTK data path to a writable location in Hugging Face environment
+nltk_data_path = os.path.join(os.getcwd(), "nltk_data")  # "nltk_data" folder under the current working directory
+os.makedirs(nltk_data_path, exist_ok=True)  # exist_ok: do not fail when the folder is already there
+nltk.data.path.append(nltk_data_path)  # add the folder to the locations NLTK searches for resources
+# Explicitly download both punkt and punkt_tab resources
def ensure_nltk_resources():
    """Make sure every NLTK resource the app relies on is available locally.

    Each resource is looked up under its own NLTK data category and is
    downloaded into ``nltk_data_path`` only when that lookup fails. A failed
    lookup or download is reported and skipped rather than raised, so one
    missing resource does not stop the remaining ones from being set up.
    """
    # (resource id, category directory used by nltk.data.find()).
    # Looking everything up under "tokenizers/" never finds the tagger,
    # chunker or word list, so they would be re-requested on every start.
    resources = (
        ('punkt', 'tokenizers'),
        ('punkt_tab', 'tokenizers'),  # the specific resource that was causing the error
        ('averaged_perceptron_tagger', 'taggers'),
        ('maxent_ne_chunker', 'chunkers'),
        ('words', 'corpora'),
    )
    for resource, category in resources:
        try:
            try:
                # First check if already downloaded
                nltk.data.find(f'{category}/{resource}')
                print(f"Resource {resource} already downloaded")
            except LookupError:
                print(f"Downloading {resource}...")
                # nltk.download() signals failure through its return value,
                # so only report success when it actually succeeded.
                if nltk.download(resource, download_dir=nltk_data_path):
                    print(f"Downloaded {resource}")
                else:
                    print(f"Warning: Could not download {resource}")
        except Exception as e:
            # Best effort: keep going with the remaining resources.
            print(f"Warning: Could not download {resource}: {str(e)}")
+# Ensure resources are downloaded before proceeding
+print("Setting up NLTK resources...")
+ensure_nltk_resources()  # module-level call: runs as soon as this file is executed or imported
+# Simple sentence tokenizer as fallback
def simple_sentence_tokenizer(text):
    """Split ``text`` into sentences without relying on NLTK.

    The text is cut at any run of whitespace that directly follows ``.``,
    ``!`` or ``?``; the punctuation stays attached to its sentence.

    Args:
        text: The paragraph to split.

    Returns:
        A list of sentences with surrounding whitespace removed. Empty and
        whitespace-only fragments are dropped, so blank input yields ``[]``.
    """
    fragments = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text))
    return [fragment for fragment in fragments if fragment]
def get_named_entities(text):
    """Extract named entities from text with error handling.

    Args:
        text: The sentence to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per subtree of
        the chunker output, in order of appearance. Returns ``[]`` when any
        step fails (failed import, tokenizing, tagging or chunking).
    """
    try:
        # Imported inside the try so a failing import is handled like any
        # other NER failure instead of propagating to the caller.
        from nltk.tag import pos_tag
        from nltk.chunk import ne_chunk
        from nltk.tree import Tree

        chunked = ne_chunk(pos_tag(nltk.word_tokenize(text)))
        # Only subtrees are entities; plain (word, tag) leaves are skipped.
        return [
            (' '.join(word for word, _ in chunk.leaves()), chunk.label())
            for chunk in chunked
            if isinstance(chunk, Tree)
        ]
    except Exception as e:
        print(f"Named entity recognition failed: {str(e)}")
        return []
 def generate_question_from_sentence(sentence):
     """Generate a question from a sentence with robust error handling.

     Rules are tried in order and the first one that applies wins:

     1. "<subject> is/was/were/are <rest>" becomes a yes/no question.
     2. A year after in/on/during, or a capitalized month name, becomes a
        "When did ..." question.
     3. The first named entity becomes a Who/Where/What question.
     4. "because", "due to" or "as a result" becomes a "Why ..." question.
     5. Otherwise the first word is used as the subject.

     Args:
         sentence: A single sentence; one trailing ``.``, ``!`` or ``?`` is
             removed before matching.

     Returns:
         The generated question. If anything raises, a generic
         "What can you tell me about" question is returned instead.
     """
     try:
         # Remove punctuation at the end
         sentence = re.sub(r'[.!?]$', '', sentence)
         # Convert statements with "is", "was", "were", "are" into yes/no
         # questions. One case-insensitive match is used on its own so that
         # "He IS tall" is handled the same way as "He is tall".
         match = re.search(r'^(.*?)\s(is|was|were|are)\s(.*?)$', sentence, re.IGNORECASE)
         if match:
             return f"{match.group(2).capitalize()} {match.group(1)} {match.group(3)}?"
         # Check for sentences with dates or years. Only the preposition part
         # ignores case: month names must be capitalized, otherwise the modal
         # "may" and the verb "march" would be mistaken for dates.
         if re.search(r'(?i:\b(in|on|during)\s\d{4}\b)|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence):
             return f"When did {sentence.lower()}?"
         # Try to get named entities, but don't fail if NER isn't working
         entities = get_named_entities(sentence)
         # If there are named entities, ask about the first one; entity types
         # not listed here fall through to the rules below.
         if entities:
             entity, entity_type = entities[0]
             if entity_type == 'PERSON':
                 return f"Who is {entity}?"
             elif entity_type in ['GPE', 'LOCATION']:
                 return f"Where is {entity}?"
             elif entity_type == 'ORGANIZATION':
                 return f"What is {entity}?"
         # Check for sentences with "because", "due to", "as a result"
         if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b', sentence, re.IGNORECASE):
             return f"Why {sentence.lower()}?"
         # Simplified approach without relying on POS tagging:
         # treat the first word as the subject.
         words = sentence.split()
         if words:
             subject = words[0]
             if subject.lower() in ['i', 'you', 'we', 'they', 'he', 'she', 'it', 'this', 'that']:
                 return f"What did {subject.lower()} do?"
             return f"What about {subject}?"
         # Very generic fallback; only reached when the sentence has no words.
         question_starters = [
             "What is described in",
             "What is mentioned about",
             "Can you explain",
             "Could you elaborate on"
         ]
         return f"{random.choice(question_starters)} the statement: '{sentence}'?"
     except Exception as e:
         print(f"Question generation failed: {str(e)}")
         return f"What can you tell me about: '{sentence}'?"
 def paragraph_to_questions(paragraph):
     """Generate questions from a paragraph."""
+    try:
+        # Try the NLTK sentence tokenizer
+        sentences = nltk.sent_tokenize(paragraph)
+        print(f"NLTK tokenizer found {len(sentences)} sentences")
+    except Exception as e:
+        print(f"NLTK sentence tokenization failed: {str(e)}, using fallback")
+        # Fallback to simple tokenizer if NLTK fails
+        sentences = simple_sentence_tokenizer(paragraph)
+        print(f"Fallback tokenizer found {len(sentences)} sentences")
     questions = []
     for sentence in sentences:
 # Function to format the output for Gradio
 def generate_questions(paragraph):
     """Turn a paragraph into a numbered list of questions for the Gradio UI.

     Args:
         paragraph: The text entered by the user; may be empty or ``None``.

     Returns:
         One question per line, numbered from 1. A short help message is
         returned instead when the input is blank or when no question could
         be generated.
     """
     # Blank input (None, "" or whitespace only) never reaches the generator.
     if not paragraph or not paragraph.strip():
         return "Please enter a paragraph to generate questions."
     print(f"Processing paragraph: {paragraph[:50]}...")
     questions = paragraph_to_questions(paragraph)
     if not questions:
         return "Could not generate any questions from this text. Try a longer or more detailed paragraph."
     return "\n".join(f"{number}. {question}" for number, question in enumerate(questions, start=1))
 # Create Gradio interface