"""Generate comprehension questions from a paragraph of text.

Uses NLTK for sentence tokenization, POS tagging, and named-entity
recognition (with regex-based fallbacks when NLTK resources are
unavailable), and exposes the generator through a Gradio interface.
"""

import os
import random
import re

import gradio as gr
import nltk

# Set NLTK data path to a writable location in Hugging Face environment
# (default NLTK download dirs may be read-only on Spaces).
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)

# Map each resource to the category NLTK files it under, so the
# "already downloaded" probe looks in the right place.  (Probing
# 'tokenizers/<resource>' for every resource — as the original code did —
# always misses for taggers/chunkers/corpora, forcing a re-download on
# every startup.)
_RESOURCE_CATEGORIES = {
    'punkt': 'tokenizers',
    'punkt_tab': 'tokenizers',  # required by newer NLTK punkt pipelines
    'averaged_perceptron_tagger': 'taggers',
    'maxent_ne_chunker': 'chunkers',
    'words': 'corpora',
}


def ensure_nltk_resources():
    """Download any missing NLTK resources into ``nltk_data_path``.

    Failures are logged and swallowed deliberately so the app can still
    start and fall back to the regex-based heuristics below.
    """
    for resource, category in _RESOURCE_CATEGORIES.items():
        try:
            # First check if already downloaded
            try:
                nltk.data.find(f'{category}/{resource}')
                print(f"Resource {resource} already downloaded")
            except LookupError:
                print(f"Downloading {resource}...")
                nltk.download(resource, download_dir=nltk_data_path)
                print(f"Downloaded {resource}")
        except Exception as e:
            print(f"Warning: Could not download {resource}: {str(e)}")


# Ensure resources are downloaded before proceeding
print("Setting up NLTK resources...")
ensure_nltk_resources()


def simple_sentence_tokenizer(text):
    """A simpler fallback sentence tokenizer.

    Splits on whitespace that follows sentence-ending punctuation.
    """
    return [s for s in re.split(r'(?<=[.!?])\s+', text) if s]


def get_named_entities(text):
    """Extract ``(entity, label)`` pairs from *text*.

    Returns an empty list on any failure (e.g. missing NLTK models) so
    callers can treat NER as best-effort.
    """
    try:
        from nltk.tag import pos_tag
        from nltk.chunk import ne_chunk
        from nltk.tree import Tree

        tokens = nltk.word_tokenize(text)
        tagged = pos_tag(tokens)
        chunked = ne_chunk(tagged)

        named_entities = []
        for chunk in chunked:
            # Subtrees of the chunk parse are named entities; their label
            # is the entity type (PERSON, GPE, ORGANIZATION, ...).
            if isinstance(chunk, Tree):
                entity = ' '.join(word for word, tag in chunk.leaves())
                named_entities.append((entity, chunk.label()))
        return named_entities
    except Exception as e:
        print(f"Named entity recognition failed: {str(e)}")
        return []


def generate_question_from_sentence(sentence):
    """Generate a question from a sentence with improved question formation.

    Tries a cascade of heuristics (be-verb inversion, modal inversion,
    temporal/causal markers, named entities, quantifiers, subject/verb
    shape) and falls back to a randomized generic prompt.
    """
    try:
        # Remove punctuation at the end
        sentence = re.sub(r'[.!?]$', '', sentence)

        # Convert statements with be-verbs into yes/no questions
        be_verb_pattern = re.search(
            r'^(.*?)\s(is|was|were|are|am)\s(.*?)$', sentence, re.IGNORECASE)
        if be_verb_pattern:
            return f"{be_verb_pattern.group(2).capitalize()} {be_verb_pattern.group(1)} {be_verb_pattern.group(3)}?"

        # Check for modal verbs
        modal_pattern = re.search(
            r'^(.*?)\s(can|could|will|would|should|may|might)\s(.*?)$',
            sentence, re.IGNORECASE)
        if modal_pattern:
            return f"{modal_pattern.group(2).capitalize()} {modal_pattern.group(1)} {modal_pattern.group(3)}?"

        # Check for sentences with temporal markers (a year or month name)
        if re.search(
                r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b',
                sentence, re.IGNORECASE):
            return f"When did {sentence.lower()}?"

        # Check for causal relationships
        if re.search(
                r'\bbecause\b|\bdue to\b|\bas a result\b|\btherefore\b|\bhence\b',
                sentence, re.IGNORECASE):
            return f"Why {sentence.lower()}?"

        # Try to get named entities; if any, ask about the first one
        entities = get_named_entities(sentence)
        if entities:
            entity, entity_type = entities[0]
            if entity_type == 'PERSON':
                return f"Who is {entity}?"
            elif entity_type in ['GPE', 'LOCATION']:
                return f"Where is {entity}?"
            elif entity_type == 'ORGANIZATION':
                return f"What is {entity}?"
            else:
                return f"Can you tell me more about {entity}?"

        # Check for quantifiable content
        if re.search(r'\b(many|number of|several|few|multiple)\b',
                     sentence, re.IGNORECASE):
            return f"How many are mentioned in the statement: '{sentence}'?"

        # Look for action verbs to create "what" questions
        words = sentence.split()
        if len(words) >= 3:
            # Basic subject detection: a leading pronoun/determiner
            potential_subject = words[0]
            pronouns = ['i', 'you', 'we', 'they', 'he', 'she', 'it',
                        'this', 'that', 'these', 'those', 'a', 'an', 'the']
            if potential_subject.lower() in pronouns:
                return f"What did {potential_subject.lower()} {' '.join(words[1:])}?"
            else:
                # Try to identify the main topic by dropping stop words
                stop_words = ['a', 'an', 'the', 'and', 'but', 'or', 'for',
                              'nor', 'on', 'at', 'to', 'from', 'by']
                content_words = [w for w in words
                                 if w.lower() not in stop_words]
                if content_words:
                    main_topic = content_words[0]
                    return f"What is significant about {main_topic}?"

        # More varied generic fallbacks
        question_starters = [
            "What is important about",
            "How would you describe",
            "What are the key aspects of",
            "What's notable regarding",
            "How does the text characterize",
            "What insights can be drawn from"
        ]
        return f"{random.choice(question_starters)} this: '{sentence}'?"
    except Exception as e:
        print(f"Question generation failed: {str(e)}")
        return f"What can you tell me about: '{sentence}'?"


def paragraph_to_questions(paragraph):
    """Generate questions from a paragraph.

    Sentences shorter than 4 words are skipped; one question is produced
    per remaining sentence.
    """
    try:
        # Try the NLTK sentence tokenizer
        sentences = nltk.sent_tokenize(paragraph)
        print(f"NLTK tokenizer found {len(sentences)} sentences")
    except Exception as e:
        print(f"NLTK sentence tokenization failed: {str(e)}, using fallback")
        # Fallback to simple tokenizer if NLTK fails
        sentences = simple_sentence_tokenizer(paragraph)
        print(f"Fallback tokenizer found {len(sentences)} sentences")

    questions = []
    for sentence in sentences:
        # Skip very short sentences
        if len(sentence.split()) < 4:
            continue
        questions.append(generate_question_from_sentence(sentence))
    return questions


def generate_questions(paragraph):
    """Format the output for Gradio: a numbered list of questions."""
    if not paragraph or paragraph.strip() == "":
        return "Please enter a paragraph to generate questions."

    print(f"Processing paragraph: {paragraph[:50]}...")
    questions = paragraph_to_questions(paragraph)

    if not questions:
        return "Could not generate any questions from this text. Try a longer or more detailed paragraph."

    return "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])


# Create Gradio interface
demo = gr.Interface(
    fn=generate_questions,
    inputs=gr.Textbox(lines=10, placeholder="Enter a paragraph to generate questions..."),
    outputs=gr.Textbox(label="Generated Questions"),
    title="Paragraph to Questions Generator",
    description="Enter a paragraph and the model will generate relevant questions based on the content.",
)


def generate_questions_from_text(text):
    """Module-level entry point for use in other Hugging Face apps."""
    return paragraph_to_questions(text)


# Launch the app if running directly
if __name__ == "__main__":
    demo.launch()