File size: 7,937 Bytes
6279673
 
 
2a4201c
6279673
 
2a4201c
 
 
 
6279673
2a4201c
 
 
 
 
 
 
 
 
6279673
2a4201c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6279673
 
0340678
2a4201c
 
 
 
0340678
 
 
 
2a4201c
0340678
 
 
 
 
 
2a4201c
 
 
0340678
 
 
 
 
2a4201c
6279673
2a4201c
 
 
 
 
 
 
 
 
0340678
 
2a4201c
0340678
 
 
2a4201c
0340678
2a4201c
0340678
 
 
 
 
 
 
 
 
 
6279673
0340678
 
 
 
 
 
 
 
2a4201c
0340678
2a4201c
0340678
 
 
 
 
 
2a4201c
 
0340678
 
2a4201c
 
 
6279673
 
 
2a4201c
 
 
 
 
 
 
 
 
 
6279673
 
 
 
 
 
 
 
 
 
 
 
 
 
2a4201c
 
 
 
6279673
2a4201c
 
 
 
6279673
 
e59e5d7
6279673
 
 
 
 
 
 
 
 
 
 
 
 
 
e59e5d7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import nltk
import re
import random
import os
import gradio as gr

# Set NLTK data path to a writable location in Hugging Face environment
# (the default system-wide NLTK data directories are not writable there).
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)  # idempotent: safe if the directory already exists
nltk.data.path.append(nltk_data_path)  # let nltk.data.find() see packages downloaded here

# Explicitly download both punkt and punkt_tab resources
def ensure_nltk_resources():
    """Download the NLTK data packages this app needs into ``nltk_data_path``,
    skipping any that are already present.

    Bug fix: the original probed every resource under ``tokenizers/``, but
    NLTK installs taggers, chunkers and corpora under different categories
    (``taggers/``, ``chunkers/``, ``corpora/``).  The probe therefore always
    raised LookupError for those resources and re-downloaded them on every
    startup.  Each resource is now checked under its actual category.

    Failures are logged as warnings, not raised: the rest of the module has
    a regex-based fallback tokenizer, so startup is best-effort.
    """
    # Resource name -> NLTK data category it is installed under.
    resources = {
        'punkt': 'tokenizers',
        'punkt_tab': 'tokenizers',  # specific resource newer NLTK tokenizers require
        'averaged_perceptron_tagger': 'taggers',
        'maxent_ne_chunker': 'chunkers',
        'words': 'corpora',
    }

    for resource, category in resources.items():
        try:
            # First check if already downloaded (under the correct category).
            try:
                nltk.data.find(f'{category}/{resource}')
                print(f"Resource {resource} already downloaded")
            except LookupError:
                print(f"Downloading {resource}...")
                nltk.download(resource, download_dir=nltk_data_path)
                print(f"Downloaded {resource}")
        except Exception as e:
            # Deliberate best-effort: log and continue so the app still starts.
            print(f"Warning: Could not download {resource}: {str(e)}")

# Ensure resources are downloaded before proceeding.
# Runs at import time so tokenizer/tagger data is in place before any request.
print("Setting up NLTK resources...")
ensure_nltk_resources()

# Simple sentence tokenizer as fallback
def simple_sentence_tokenizer(text):
    """Regex-based fallback splitter.

    Breaks *text* after each '.', '!' or '?' that is followed by
    whitespace, dropping any empty pieces.  Used when the NLTK sentence
    tokenizer is unavailable.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    return [piece for piece in pieces if piece]

def get_named_entities(text):
    """Run NLTK named-entity recognition over *text*.

    Returns a list of ``(entity_text, entity_label)`` pairs.  If any NLTK
    resource or model is unavailable, logs the failure and returns [].
    """
    try:
        from nltk.tag import pos_tag
        from nltk.chunk import ne_chunk
        from nltk.tree import Tree

        chunk_tree = ne_chunk(pos_tag(nltk.word_tokenize(text)))

        results = []
        for node in chunk_tree:
            # Named entities come back as subtrees; plain tokens as tuples.
            if isinstance(node, Tree):
                surface = ' '.join(word for word, _tag in node.leaves())
                results.append((surface, node.label()))
        return results
    except Exception as e:
        print(f"Named entity recognition failed: {str(e)}")
        return []

def generate_question_from_sentence(sentence):
    """Turn one declarative sentence into a question.

    Applies a cascade of heuristics in priority order: be-verb inversion,
    modal-verb inversion, temporal cues ("when"), causal cues ("why"),
    named entities, quantifier words ("how many"), a crude subject/topic
    split, and finally a randomly chosen generic opener.  Any unexpected
    failure falls back to a generic question about the sentence.
    """
    try:
        # Drop one trailing '.', '!' or '?' so it doesn't end up mid-question.
        sentence = re.sub(r'[.!?]$', '', sentence)

        # 1) Yes/no question via be-verb inversion.
        m = re.search(r'^(.*?)\s(is|was|were|are|am)\s(.*?)$', sentence, re.IGNORECASE)
        if m:
            return f"{m.group(2).capitalize()} {m.group(1)} {m.group(3)}?"

        # 2) Yes/no question via modal-verb inversion.
        m = re.search(r'^(.*?)\s(can|could|will|would|should|may|might)\s(.*?)$', sentence, re.IGNORECASE)
        if m:
            return f"{m.group(2).capitalize()} {m.group(1)} {m.group(3)}?"

        # 3) A year or month name suggests a "when" question.
        if re.search(r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence, re.IGNORECASE):
            return f"When did {sentence.lower()}?"

        # 4) Causal connectives suggest a "why" question.
        if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b|\btherefore\b|\bhence\b', sentence, re.IGNORECASE):
            return f"Why {sentence.lower()}?"

        # 5) Ask about the first named entity, phrased by entity type.
        entities = get_named_entities(sentence)
        if entities:
            entity, label = entities[0]
            question_by_type = {
                'PERSON': f"Who is {entity}?",
                'GPE': f"Where is {entity}?",
                'LOCATION': f"Where is {entity}?",
                'ORGANIZATION': f"What is {entity}?",
            }
            return question_by_type.get(label, f"Can you tell me more about {entity}?")

        # 6) Quantifier words suggest a "how many" question.
        if re.search(r'\b(many|number of|several|few|multiple)\b', sentence, re.IGNORECASE):
            return f"How many are mentioned in the statement: '{sentence}'?"

        # 7) Crude subject detection on the first word.
        words = sentence.split()
        if len(words) >= 3:
            subject = words[0]
            # Common pronouns and determiners.
            pronouns = ['i', 'you', 'we', 'they', 'he', 'she', 'it', 'this', 'that', 'these', 'those', 'a', 'an', 'the']
            if subject.lower() in pronouns:
                return f"What did {subject.lower()} {' '.join(words[1:])}?"
            # Otherwise take the first non-stop-word as the main topic.
            stop_words = ['a', 'an', 'the', 'and', 'but', 'or', 'for', 'nor', 'on', 'at', 'to', 'from', 'by']
            content_words = [word for word in words if word.lower() not in stop_words]
            if content_words:
                return f"What is significant about {content_words[0]}?"

        # 8) Generic fallback with a randomly picked opener for variety.
        question_starters = [
            "What is important about",
            "How would you describe",
            "What are the key aspects of",
            "What's notable regarding",
            "How does the text characterize",
            "What insights can be drawn from"
        ]
        return f"{random.choice(question_starters)} this: '{sentence}'?"

    except Exception as e:
        print(f"Question generation failed: {str(e)}")
        return f"What can you tell me about: '{sentence}'?"

def paragraph_to_questions(paragraph):
    """Split *paragraph* into sentences and build one question per sentence.

    Prefers NLTK's sentence tokenizer, falling back to the regex splitter
    if it fails.  Sentences shorter than four words are skipped.
    Returns a list of question strings.
    """
    try:
        # Try the NLTK sentence tokenizer first.
        sentences = nltk.sent_tokenize(paragraph)
        print(f"NLTK tokenizer found {len(sentences)} sentences")
    except Exception as e:
        print(f"NLTK sentence tokenization failed: {str(e)}, using fallback")
        sentences = simple_sentence_tokenizer(paragraph)
        print(f"Fallback tokenizer found {len(sentences)} sentences")

    return [
        generate_question_from_sentence(sentence)
        for sentence in sentences
        if len(sentence.split()) >= 4  # skip very short sentences
    ]

# Function to format the output for Gradio
def generate_questions(paragraph):
    """Gradio handler: return generated questions as a numbered string.

    Returns a friendly message for empty input or when no questions
    could be produced.
    """
    if not paragraph or paragraph.strip() == "":
        return "Please enter a paragraph to generate questions."

    print(f"Processing paragraph: {paragraph[:50]}...")
    questions = paragraph_to_questions(paragraph)

    if not questions:
        return "Could not generate any questions from this text. Try a longer or more detailed paragraph."

    numbered = [f"{index + 1}. {question}" for index, question in enumerate(questions)]
    return "\n".join(numbered)

# Create Gradio interface
# Single text-in / text-out interface; generate_questions does all the work.
demo = gr.Interface(
    fn=generate_questions,  # str -> str handler defined above
    inputs=gr.Textbox(lines=10, placeholder="Enter a paragraph to generate questions..."),
    outputs=gr.Textbox(label="Generated Questions"),
    title="Paragraph to Questions Generator",
    description="Enter a paragraph and the model will generate relevant questions based on the content.",
)

# For use as a module in other Hugging Face applications
def generate_questions_from_text(text):
    """Programmatic entry point for other applications.

    Thin alias over paragraph_to_questions: returns the raw list of
    question strings (not the formatted string the Gradio UI shows).
    """
    return paragraph_to_questions(text)

# Launch the app if running directly
if __name__ == "__main__":
    demo.launch()  # start the Gradio app only when executed as a script