SorrelC committed on
Commit
04c4953
·
verified ·
1 Parent(s): e9e829d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -50
app.py CHANGED
@@ -4,17 +4,27 @@ import warnings
4
  import random
5
  import re
6
  import time
 
 
7
  warnings.filterwarnings('ignore')
8
 
 
 
 
 
 
 
 
9
  # Reliable model names and descriptions
10
  KEYWORD_MODELS = {
11
  'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
12
- 'keybert_all-mpnet-base-v2': 'KeyBERT MPNet - BERT-based semantic similarity',
13
  'keybert_all-MiniLM-L6-v2': 'KeyBERT MiniLM - Lightweight BERT-based extraction',
14
- 'keybert_paraphrase-mpnet-base-v2': 'KeyBERT Paraphrase - Optimized for paraphrase detection',
15
  'rake_nltk': 'RAKE-NLTK - Rapid Automatic Keyword Extraction'
16
  }
17
 
 
 
 
18
  # Color palette for keywords based on scores
19
  SCORE_COLORS = {
20
  'high': '#00B894', # Green - High relevance
@@ -33,41 +43,84 @@ class KeywordExtractionManager:
33
  def __init__(self):
34
  self.keybert_models = {}
35
  self.rake_extractor = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  def load_keybert_model(self, model_name):
38
- """Load KeyBERT model"""
39
  if model_name not in self.keybert_models:
40
  try:
41
  from keybert import KeyBERT
 
 
42
  # Extract the actual model name from the identifier
43
  actual_model = model_name.replace('keybert_', '')
44
- self.keybert_models[model_name] = KeyBERT(model=actual_model)
45
- print(f" KeyBERT model {actual_model} loaded successfully")
 
 
 
 
 
 
 
 
 
 
 
46
  except Exception as e:
47
  print(f"Error loading KeyBERT model {model_name}: {str(e)}")
 
48
  return None
49
- return self.keybert_models[model_name]
50
 
51
  def load_rake_extractor(self):
52
- """Load RAKE extractor"""
53
  if self.rake_extractor is None:
54
  try:
55
  from rake_nltk import Rake
56
- import nltk
57
- # Download required NLTK data
58
- try:
59
- nltk.data.find('corpora/stopwords')
60
- except LookupError:
61
- nltk.download('stopwords', quiet=True)
62
- try:
63
- nltk.data.find('tokenizers/punkt')
64
- except LookupError:
65
- nltk.download('punkt', quiet=True)
66
 
 
67
  self.rake_extractor = Rake()
68
  print("✓ RAKE extractor loaded successfully")
69
  except Exception as e:
70
  print(f"Error loading RAKE extractor: {str(e)}")
 
71
  return None
72
  return self.rake_extractor
73
 
@@ -77,6 +130,8 @@ class KeywordExtractionManager:
77
  if progress:
78
  progress(0.3, desc="Loading model...")
79
 
 
 
80
  # Handle different model types
81
  if model_name.startswith('yake_'):
82
  return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
@@ -89,6 +144,7 @@ class KeywordExtractionManager:
89
 
90
  except Exception as e:
91
  print(f"Error with {model_name}: {str(e)}")
 
92
  return self.fallback_keyword_extraction(text, num_keywords)
93
 
94
  def extract_yake_keywords(self, text, num_keywords, ngram_range, progress):
@@ -123,10 +179,11 @@ class KeywordExtractionManager:
123
  'model': 'YAKE'
124
  })
125
 
 
126
  return results
127
 
128
- except ImportError:
129
- print("YAKE library not found. Using fallback keyword extraction...")
130
  return self.fallback_keyword_extraction(text, num_keywords)
131
 
132
  def extract_keybert_keywords(self, text, model_name, num_keywords, ngram_range, progress):
@@ -137,18 +194,28 @@ class KeywordExtractionManager:
137
 
138
  kw_model = self.load_keybert_model(model_name)
139
  if kw_model is None:
 
140
  return self.fallback_keyword_extraction(text, num_keywords)
141
 
142
  if progress:
143
  progress(0.6, desc="Processing with KeyBERT...")
144
 
145
- # Extract keywords
146
- keywords = kw_model.extract_keywords(
147
- text,
148
- keyphrase_ngram_range=ngram_range,
149
- stop_words='english',
150
- top_k=num_keywords
151
- )
 
 
 
 
 
 
 
 
 
152
 
153
  if progress:
154
  progress(0.8, desc="Formatting results...")
@@ -162,10 +229,11 @@ class KeywordExtractionManager:
162
  'model': f"KeyBERT-{model_name.replace('keybert_', '')}"
163
  })
164
 
 
165
  return results
166
 
167
- except ImportError:
168
- print("KeyBERT library not found. Using fallback keyword extraction...")
169
  return self.fallback_keyword_extraction(text, num_keywords)
170
 
171
  def extract_rake_keywords(self, text, num_keywords, progress):
@@ -176,6 +244,7 @@ class KeywordExtractionManager:
176
 
177
  rake_extractor = self.load_rake_extractor()
178
  if rake_extractor is None:
 
179
  return self.fallback_keyword_extraction(text, num_keywords)
180
 
181
  if progress:
@@ -185,23 +254,33 @@ class KeywordExtractionManager:
185
  rake_extractor.extract_keywords_from_text(text)
186
  keywords_with_scores = rake_extractor.get_ranked_phrases_with_scores()
187
 
188
- # Format results
189
- results = []
190
- for score, keyword in keywords_with_scores[:num_keywords]:
191
- results.append({
192
- 'keyword': keyword,
193
- 'score': score,
194
- 'model': 'RAKE-NLTK'
195
- })
196
 
197
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- except ImportError:
200
- print("RAKE-NLTK library not found. Using fallback keyword extraction...")
201
  return self.fallback_keyword_extraction(text, num_keywords)
202
 
203
  def fallback_keyword_extraction(self, text, num_keywords=10):
204
  """Simple fallback keyword extraction using basic statistics"""
 
205
  import re
206
  from collections import Counter
207
 
@@ -363,6 +442,7 @@ def create_legend_html():
363
  return html
364
 
365
  # Initialize the keyword extraction manager
 
366
  keyword_manager = KeywordExtractionManager()
367
 
368
  def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progress=gr.Progress()):
@@ -436,8 +516,8 @@ def create_interface():
436
  text_input = gr.Textbox(
437
  label="📝 Text to Analyse",
438
  placeholder="Enter your text here...",
439
- lines=18,
440
- max_lines=22
441
  )
442
 
443
  with gr.Column(scale=1):
@@ -492,18 +572,10 @@ def create_interface():
492
  <dt style="font-weight: bold; display: inline; color: #FF6B6B;">YAKE:</dt>
493
  <dd style="display: inline; margin-left: 5px;">Statistical approach requiring no training - works well on short texts and multilingual content</dd>
494
  </div>
495
- <div style="margin-bottom: 8px;">
496
- <dt style="font-weight: bold; display: inline; color: #9C27B0;">KeyBERT MPNet:</dt>
497
- <dd style="display: inline; margin-left: 5px;">BERT-based semantic similarity - excellent for contextual understanding</dd>
498
- </div>
499
  <div style="margin-bottom: 8px;">
500
  <dt style="font-weight: bold; display: inline; color: #795548;">KeyBERT MiniLM:</dt>
501
  <dd style="display: inline; margin-left: 5px;">Lightweight BERT model - faster processing with good results</dd>
502
  </div>
503
- <div style="margin-bottom: 8px;">
504
- <dt style="font-weight: bold; display: inline; color: #607D8B;">KeyBERT Paraphrase:</dt>
505
- <dd style="display: inline; margin-left: 5px;">Optimized for paraphrase detection - great for similar concept extraction</dd>
506
- </div>
507
  <div style="margin-bottom: 8px;">
508
  <dt style="font-weight: bold; display: inline; color: #FF5722;">RAKE-NLTK:</dt>
509
  <dd style="display: inline; margin-left: 5px;">Classic keyword extraction algorithm - fast and reliable for phrase extraction</dd>
@@ -552,7 +624,7 @@ def create_interface():
552
  ],
553
  [
554
  "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
555
- "keybert_all-mpnet-base-v2",
556
  10,
557
  1,
558
  3
 
4
  import random
5
  import re
6
  import time
7
+ import os
8
+ import sys
9
  warnings.filterwarnings('ignore')
10
 
11
+ # Pre-download NLTK data at startup
12
+ import nltk
13
+ print("Downloading NLTK data...")
14
+ nltk.download('stopwords', quiet=True)
15
+ nltk.download('punkt', quiet=True)
16
+ print("NLTK data downloaded.")
17
+
18
  # Reliable model names and descriptions
19
  KEYWORD_MODELS = {
20
  'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
 
21
  'keybert_all-MiniLM-L6-v2': 'KeyBERT MiniLM - Lightweight BERT-based extraction',
 
22
  'rake_nltk': 'RAKE-NLTK - Rapid Automatic Keyword Extraction'
23
  }
24
 
25
+ # Reduced model list for better compatibility
26
+ # Removed models that might be too large for Spaces
27
+
28
  # Color palette for keywords based on scores
29
  SCORE_COLORS = {
30
  'high': '#00B894', # Green - High relevance
 
43
  def __init__(self):
44
  self.keybert_models = {}
45
  self.rake_extractor = None
46
+ self.models_initialized = False
47
+ self.initialize_models()
48
+
49
+ def initialize_models(self):
50
+ """Pre-initialize models to check availability"""
51
+ print("Initializing models...")
52
+
53
+ # Test YAKE
54
+ try:
55
+ import yake
56
+ print("✓ YAKE available")
57
+ except ImportError as e:
58
+ print(f"✗ YAKE not available: {e}")
59
+
60
+ # Test KeyBERT
61
+ try:
62
+ from keybert import KeyBERT
63
+ from sentence_transformers import SentenceTransformer
64
+ print("✓ KeyBERT library available")
65
+
66
+ # Try to load a small model
67
+ try:
68
+ test_model = SentenceTransformer('all-MiniLM-L6-v2')
69
+ print("✓ Sentence transformers working")
70
+ except Exception as e:
71
+ print(f"✗ Sentence transformer model failed: {e}")
72
+ except ImportError as e:
73
+ print(f"✗ KeyBERT not available: {e}")
74
+
75
+ # Test RAKE
76
+ try:
77
+ from rake_nltk import Rake
78
+ print("✓ RAKE-NLTK available")
79
+ except ImportError as e:
80
+ print(f"✗ RAKE-NLTK not available: {e}")
81
+
82
+ self.models_initialized = True
83
 
84
  def load_keybert_model(self, model_name):
85
+ """Load KeyBERT model with better error handling"""
86
  if model_name not in self.keybert_models:
87
  try:
88
  from keybert import KeyBERT
89
+ from sentence_transformers import SentenceTransformer
90
+
91
  # Extract the actual model name from the identifier
92
  actual_model = model_name.replace('keybert_', '')
93
+
94
+ print(f"Loading KeyBERT with {actual_model}...")
95
+
96
+ # Try to load the sentence transformer first
97
+ try:
98
+ sentence_model = SentenceTransformer(actual_model)
99
+ self.keybert_models[model_name] = KeyBERT(model=sentence_model)
100
+ print(f"✓ KeyBERT model {actual_model} loaded successfully")
101
+ except Exception as e:
102
+ print(f"Failed to load sentence transformer {actual_model}: {e}")
103
+ # Try with just the model name
104
+ self.keybert_models[model_name] = KeyBERT(model=actual_model)
105
+
106
  except Exception as e:
107
  print(f"Error loading KeyBERT model {model_name}: {str(e)}")
108
+ print(f"Full error: {type(e).__name__}: {str(e)}")
109
  return None
110
+ return self.keybert_models.get(model_name)
111
 
112
  def load_rake_extractor(self):
113
+ """Load RAKE extractor with better error handling"""
114
  if self.rake_extractor is None:
115
  try:
116
  from rake_nltk import Rake
 
 
 
 
 
 
 
 
 
 
117
 
118
+ # Create RAKE instance
119
  self.rake_extractor = Rake()
120
  print("✓ RAKE extractor loaded successfully")
121
  except Exception as e:
122
  print(f"Error loading RAKE extractor: {str(e)}")
123
+ print(f"Full error: {type(e).__name__}: {str(e)}")
124
  return None
125
  return self.rake_extractor
126
 
 
130
  if progress:
131
  progress(0.3, desc="Loading model...")
132
 
133
+ print(f"Attempting to extract keywords with {model_name}")
134
+
135
  # Handle different model types
136
  if model_name.startswith('yake_'):
137
  return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
 
144
 
145
  except Exception as e:
146
  print(f"Error with {model_name}: {str(e)}")
147
+ print(f"Full error: {type(e).__name__}: {str(e)}")
148
  return self.fallback_keyword_extraction(text, num_keywords)
149
 
150
  def extract_yake_keywords(self, text, num_keywords, ngram_range, progress):
 
179
  'model': 'YAKE'
180
  })
181
 
182
+ print(f"YAKE extracted {len(results)} keywords")
183
  return results
184
 
185
+ except Exception as e:
186
+ print(f"YAKE extraction failed: {type(e).__name__}: {str(e)}")
187
  return self.fallback_keyword_extraction(text, num_keywords)
188
 
189
  def extract_keybert_keywords(self, text, model_name, num_keywords, ngram_range, progress):
 
194
 
195
  kw_model = self.load_keybert_model(model_name)
196
  if kw_model is None:
197
+ print(f"KeyBERT model {model_name} could not be loaded")
198
  return self.fallback_keyword_extraction(text, num_keywords)
199
 
200
  if progress:
201
  progress(0.6, desc="Processing with KeyBERT...")
202
 
203
+ # Extract keywords with error handling
204
+ try:
205
+ keywords = kw_model.extract_keywords(
206
+ text,
207
+ keyphrase_ngram_range=ngram_range,
208
+ stop_words='english',
209
+ top_n=num_keywords
210
+ )
211
+ except Exception as e:
212
+ print(f"KeyBERT extraction error: {e}")
213
+ # Try without stop words
214
+ keywords = kw_model.extract_keywords(
215
+ text,
216
+ keyphrase_ngram_range=ngram_range,
217
+ top_n=num_keywords
218
+ )
219
 
220
  if progress:
221
  progress(0.8, desc="Formatting results...")
 
229
  'model': f"KeyBERT-{model_name.replace('keybert_', '')}"
230
  })
231
 
232
+ print(f"KeyBERT extracted {len(results)} keywords")
233
  return results
234
 
235
+ except Exception as e:
236
+ print(f"KeyBERT extraction failed: {type(e).__name__}: {str(e)}")
237
  return self.fallback_keyword_extraction(text, num_keywords)
238
 
239
  def extract_rake_keywords(self, text, num_keywords, progress):
 
244
 
245
  rake_extractor = self.load_rake_extractor()
246
  if rake_extractor is None:
247
+ print("RAKE extractor could not be loaded")
248
  return self.fallback_keyword_extraction(text, num_keywords)
249
 
250
  if progress:
 
254
  rake_extractor.extract_keywords_from_text(text)
255
  keywords_with_scores = rake_extractor.get_ranked_phrases_with_scores()
256
 
257
+ # Normalize scores
258
+ if keywords_with_scores:
259
+ max_score = max(score for score, _ in keywords_with_scores)
 
 
 
 
 
260
 
261
+ # Format results
262
+ results = []
263
+ for score, keyword in keywords_with_scores[:num_keywords]:
264
+ normalized_score = score / max_score if max_score > 0 else 0
265
+ results.append({
266
+ 'keyword': keyword,
267
+ 'score': normalized_score,
268
+ 'model': 'RAKE-NLTK'
269
+ })
270
+
271
+ print(f"RAKE extracted {len(results)} keywords")
272
+ return results
273
+ else:
274
+ print("RAKE returned no keywords")
275
+ return self.fallback_keyword_extraction(text, num_keywords)
276
 
277
+ except Exception as e:
278
+ print(f"RAKE extraction failed: {type(e).__name__}: {str(e)}")
279
  return self.fallback_keyword_extraction(text, num_keywords)
280
 
281
  def fallback_keyword_extraction(self, text, num_keywords=10):
282
  """Simple fallback keyword extraction using basic statistics"""
283
+ print("Using fallback keyword extraction")
284
  import re
285
  from collections import Counter
286
 
 
442
  return html
443
 
444
  # Initialize the keyword extraction manager
445
+ print("Initializing keyword extraction manager...")
446
  keyword_manager = KeywordExtractionManager()
447
 
448
  def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progress=gr.Progress()):
 
516
  text_input = gr.Textbox(
517
  label="📝 Text to Analyse",
518
  placeholder="Enter your text here...",
519
+ lines=20,
520
+ max_lines=23
521
  )
522
 
523
  with gr.Column(scale=1):
 
572
  <dt style="font-weight: bold; display: inline; color: #FF6B6B;">YAKE:</dt>
573
  <dd style="display: inline; margin-left: 5px;">Statistical approach requiring no training - works well on short texts and multilingual content</dd>
574
  </div>
 
 
 
 
575
  <div style="margin-bottom: 8px;">
576
  <dt style="font-weight: bold; display: inline; color: #795548;">KeyBERT MiniLM:</dt>
577
  <dd style="display: inline; margin-left: 5px;">Lightweight BERT model - faster processing with good results</dd>
578
  </div>
 
 
 
 
579
  <div style="margin-bottom: 8px;">
580
  <dt style="font-weight: bold; display: inline; color: #FF5722;">RAKE-NLTK:</dt>
581
  <dd style="display: inline; margin-left: 5px;">Classic keyword extraction algorithm - fast and reliable for phrase extraction</dd>
 
624
  ],
625
  [
626
  "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
627
+ "keybert_all-MiniLM-L6-v2",
628
  10,
629
  1,
630
  3