afkdark committed on
Commit
2a4201c
·
verified ·
1 Parent(s): bfa3523

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -82
app.py CHANGED
@@ -1,100 +1,142 @@
1
  import nltk
2
  import re
3
  import random
4
- from nltk.tokenize import sent_tokenize
5
- from nltk.tag import pos_tag
6
- from nltk.chunk import ne_chunk
7
- from nltk.tree import Tree
8
  import gradio as gr
9
 
10
- # Download necessary NLTK data
11
- nltk.download('punkt')
12
- nltk.download('averaged_perceptron_tagger')
13
- nltk.download('maxent_ne_chunker')
14
- nltk.download('words')
15
 
16
- def get_named_entities(text):
17
- """Extract named entities from text."""
18
- chunked = ne_chunk(pos_tag(nltk.word_tokenize(text)))
19
- named_entities = []
20
-
21
- for chunk in chunked:
22
- if isinstance(chunk, Tree):
23
- entity = ' '.join([word for word, tag in chunk.leaves()])
24
- named_entities.append((entity, chunk.label()))
25
 
26
- return named_entities
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def generate_question_from_sentence(sentence):
29
- """Generate a question from a sentence."""
30
- # Remove punctuation at the end
31
- sentence = re.sub(r'[.!?]$', '', sentence)
32
-
33
- # Check for common patterns that can be turned into questions
34
- if re.search(r'\bis\s|\bwas\s|\bwere\s|\bare\s', sentence):
35
- # Convert statements with "is", "was", "were", "are" into yes/no questions
36
- match = re.search(r'^(.*?)\s(is|was|were|are)\s(.*?)$', sentence, re.IGNORECASE)
37
- if match:
38
- return f"{match.group(2).capitalize()} {match.group(1)} {match.group(3)}?"
39
-
40
- # Check for sentences with dates or years
41
- if re.search(r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence, re.IGNORECASE):
42
- return f"When did {sentence.lower()}?"
43
-
44
- # Get named entities
45
- entities = get_named_entities(sentence)
46
-
47
- # If there are named entities, ask about them
48
- if entities:
49
- entity, entity_type = entities[0]
50
- if entity_type == 'PERSON':
51
- return f"Who is {entity}?"
52
- elif entity_type in ['GPE', 'LOCATION']:
53
- return f"Where is {entity}?"
54
- elif entity_type == 'ORGANIZATION':
55
- return f"What is {entity}?"
56
-
57
- # Check for sentences with "because", "due to", "as a result"
58
- if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b', sentence, re.IGNORECASE):
59
- return f"Why {sentence.lower()}?"
60
-
61
- # Default questions based on sentence structure
62
- words = nltk.word_tokenize(sentence)
63
- pos_tags = pos_tag(words)
64
-
65
- # Check if sentence has a verb
66
- has_verb = any(tag.startswith('VB') for _, tag in pos_tags)
67
-
68
- if has_verb:
69
- # Extract subject (simplistic approach)
70
- subject = ""
71
- for word, tag in pos_tags:
72
- if tag.startswith('NN') or tag.startswith('PRP'):
73
- subject = word
74
- break
75
 
76
- if subject:
77
- if subject.lower() in ['i', 'you', 'we', 'they']:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  return f"What did {subject.lower()} do?"
79
  else:
80
- return f"What did {subject} do?"
81
- else:
82
- # Fallback to "what" question
83
- return f"What {sentence.lower()}?"
84
-
85
- # Very generic fallback
86
- question_starters = [
87
- "What is described in",
88
- "What is mentioned about",
89
- "Can you explain",
90
- "Could you elaborate on"
91
- ]
92
-
93
- return f"{random.choice(question_starters)} the statement: '{sentence}'?"
94
 
95
  def paragraph_to_questions(paragraph):
96
  """Generate questions from a paragraph."""
97
- sentences = sent_tokenize(paragraph)
 
 
 
 
 
 
 
 
 
98
  questions = []
99
 
100
  for sentence in sentences:
@@ -109,7 +151,15 @@ def paragraph_to_questions(paragraph):
109
 
110
  # Function to format the output for Gradio
111
  def generate_questions(paragraph):
 
 
 
 
112
  questions = paragraph_to_questions(paragraph)
 
 
 
 
113
  return "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
114
 
115
  # Create Gradio interface
 
1
  import nltk
2
  import re
3
  import random
4
+ import os
 
 
 
5
  import gradio as gr
6
 
7
# Set NLTK data path to a writable location in Hugging Face environment
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)

# Map each required resource to the subdirectory nltk.data.find() expects.
# The previous check looked up 'tokenizers/<name>' for every resource, which
# only matches punkt/punkt_tab; the tagger, chunker and corpus always raised
# LookupError and were re-downloaded on every startup.
_NLTK_RESOURCE_PATHS = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',  # resource that was causing the error
    'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
    'maxent_ne_chunker': 'chunkers/maxent_ne_chunker',
    'words': 'corpora/words',
}

# Explicitly download both punkt and punkt_tab resources
def ensure_nltk_resources():
    """Download any NLTK resources that are not already present.

    Looks each resource up under its correct data category before
    downloading, so resources already in ``nltk_data_path`` are not
    fetched again. Best-effort: a failed download is logged as a
    warning instead of aborting app startup.
    """
    for resource, location in _NLTK_RESOURCE_PATHS.items():
        try:
            # First check if already downloaded (in its proper category dir)
            try:
                nltk.data.find(location)
                print(f"Resource {resource} already downloaded")
            except LookupError:
                print(f"Downloading {resource}...")
                nltk.download(resource, download_dir=nltk_data_path)
                print(f"Downloaded {resource}")
        except Exception as e:
            print(f"Warning: Could not download {resource}: {str(e)}")

# Ensure resources are downloaded before proceeding
print("Setting up NLTK resources...")
ensure_nltk_resources()
38
+
39
# Simple sentence tokenizer as fallback
def simple_sentence_tokenizer(text):
    """A simpler fallback sentence tokenizer."""
    # Split after terminal punctuation followed by whitespace, then drop
    # any empty pieces the split may produce (e.g. for empty input).
    pieces = re.split(r'(?<=[.!?])\s+', text)
    return [piece for piece in pieces if piece]
47
+
48
def get_named_entities(text):
    """Extract named entities from text with error handling."""
    try:
        from nltk.tag import pos_tag
        from nltk.chunk import ne_chunk
        from nltk.tree import Tree

        parse = ne_chunk(pos_tag(nltk.word_tokenize(text)))

        # Named-entity subtrees come back as Tree nodes; plain tokens are
        # bare (word, tag) pairs and are skipped.
        return [
            (' '.join(word for word, _tag in subtree.leaves()), subtree.label())
            for subtree in parse
            if isinstance(subtree, Tree)
        ]
    except Exception as e:
        print(f"Named entity recognition failed: {str(e)}")
        return []
69
 
70
def generate_question_from_sentence(sentence):
    """Generate a question from a sentence with robust error handling.

    Tries a cascade of heuristics in order: yes/no inversion for
    is/was/were/are statements, "When did ...?" for dates/years,
    entity-specific questions from NER, "Why ...?" for causal markers,
    then simple subject-based and generic fallbacks.

    Returns a question string; never raises (falls back to a generic
    prompt if anything goes wrong).
    """
    try:
        # Remove punctuation at the end
        sentence = re.sub(r'[.!?]$', '', sentence)

        # Check for common patterns that can be turned into questions.
        # NOTE: IGNORECASE here matches the extraction regex below — the
        # previous case-sensitive search missed e.g. "He Was late".
        if re.search(r'\bis\s|\bwas\s|\bwere\s|\bare\s', sentence, re.IGNORECASE):
            # Convert statements with "is", "was", "were", "are" into yes/no questions
            match = re.search(r'^(.*?)\s(is|was|were|are)\s(.*?)$', sentence, re.IGNORECASE)
            if match:
                return f"{match.group(2).capitalize()} {match.group(1)} {match.group(3)}?"

        # Check for sentences with dates or years
        if re.search(r'\b(in|on|during)\s\d{4}\b|\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', sentence, re.IGNORECASE):
            return f"When did {sentence.lower()}?"

        # Try to get named entities, but don't fail if NER isn't working
        entities = get_named_entities(sentence)

        # If there are named entities, ask about them
        if entities:
            entity, entity_type = entities[0]
            if entity_type == 'PERSON':
                return f"Who is {entity}?"
            elif entity_type in ['GPE', 'LOCATION']:
                return f"Where is {entity}?"
            elif entity_type == 'ORGANIZATION':
                return f"What is {entity}?"

        # Check for sentences with "because", "due to", "as a result"
        if re.search(r'\bbecause\b|\bdue to\b|\bas a result\b', sentence, re.IGNORECASE):
            return f"Why {sentence.lower()}?"

        # Simplified approach without relying on POS tagging
        words = sentence.split()

        # Very simple subject extraction (first word)
        if words:
            subject = words[0]
            if subject.lower() in ['i', 'you', 'we', 'they', 'he', 'she', 'it', 'this', 'that']:
                return f"What did {subject.lower()} do?"
            else:
                return f"What about {subject}?"

        # Very generic fallback (only reachable for an empty/whitespace sentence)
        question_starters = [
            "What is described in",
            "What is mentioned about",
            "Can you explain",
            "Could you elaborate on"
        ]

        return f"{random.choice(question_starters)} the statement: '{sentence}'?"
    except Exception as e:
        print(f"Question generation failed: {str(e)}")
        return f"What can you tell me about: '{sentence}'?"
127
 
128
  def paragraph_to_questions(paragraph):
129
  """Generate questions from a paragraph."""
130
+ try:
131
+ # Try the NLTK sentence tokenizer
132
+ sentences = nltk.sent_tokenize(paragraph)
133
+ print(f"NLTK tokenizer found {len(sentences)} sentences")
134
+ except Exception as e:
135
+ print(f"NLTK sentence tokenization failed: {str(e)}, using fallback")
136
+ # Fallback to simple tokenizer if NLTK fails
137
+ sentences = simple_sentence_tokenizer(paragraph)
138
+ print(f"Fallback tokenizer found {len(sentences)} sentences")
139
+
140
  questions = []
141
 
142
  for sentence in sentences:
 
151
 
152
  # Function to format the output for Gradio
153
def generate_questions(paragraph):
    """Produce a numbered list of questions for the Gradio output box."""
    # Guard against None/empty/whitespace-only input before doing any work.
    if not paragraph or not paragraph.strip():
        return "Please enter a paragraph to generate questions."

    print(f"Processing paragraph: {paragraph[:50]}...")
    questions = paragraph_to_questions(paragraph)

    if not questions:
        return "Could not generate any questions from this text. Try a longer or more detailed paragraph."

    numbered = [f"{index + 1}. {question}" for index, question in enumerate(questions)]
    return "\n".join(numbered)
164
 
165
  # Create Gradio interface