SorrelC committed on
Commit
bc91f56
·
verified ·
1 Parent(s): 8b00025

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +309 -108
app.py CHANGED
@@ -15,16 +15,14 @@ nltk.download('stopwords', quiet=True)
15
  nltk.download('punkt', quiet=True)
16
  print("NLTK data downloaded.")
17
 
18
- # Reliable model names and descriptions
19
  KEYWORD_MODELS = {
20
  'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
21
- 'keybert_all-MiniLM-L6-v2': 'KeyBERT MiniLM - Lightweight BERT-based extraction',
22
- 'rake_nltk': 'RAKE-NLTK - Rapid Automatic Keyword Extraction'
 
23
  }
24
 
25
- # Reduced model list for better compatibility
26
- # Removed models that might be too large for Spaces
27
-
28
  # Color palette for keywords based on scores
29
  SCORE_COLORS = {
30
  'high': '#00B894', # Green - High relevance
@@ -41,7 +39,6 @@ KEYWORD_COLORS = [
41
 
42
  class KeywordExtractionManager:
43
  def __init__(self):
44
- self.keybert_models = {}
45
  self.rake_extractor = None
46
  self.models_initialized = False
47
  self.initialize_models()
@@ -57,21 +54,6 @@ class KeywordExtractionManager:
57
  except ImportError as e:
58
  print(f"βœ— YAKE not available: {e}")
59
 
60
- # Test KeyBERT
61
- try:
62
- from keybert import KeyBERT
63
- from sentence_transformers import SentenceTransformer
64
- print("βœ“ KeyBERT library available")
65
-
66
- # Try to load a small model
67
- try:
68
- test_model = SentenceTransformer('all-MiniLM-L6-v2')
69
- print("βœ“ Sentence transformers working")
70
- except Exception as e:
71
- print(f"βœ— Sentence transformer model failed: {e}")
72
- except ImportError as e:
73
- print(f"βœ— KeyBERT not available: {e}")
74
-
75
  # Test RAKE
76
  try:
77
  from rake_nltk import Rake
@@ -79,36 +61,22 @@ class KeywordExtractionManager:
79
  except ImportError as e:
80
  print(f"βœ— RAKE-NLTK not available: {e}")
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  self.models_initialized = True
83
 
84
- def load_keybert_model(self, model_name):
85
- """Load KeyBERT model with better error handling"""
86
- if model_name not in self.keybert_models:
87
- try:
88
- from keybert import KeyBERT
89
- from sentence_transformers import SentenceTransformer
90
-
91
- # Extract the actual model name from the identifier
92
- actual_model = model_name.replace('keybert_', '')
93
-
94
- print(f"Loading KeyBERT with {actual_model}...")
95
-
96
- # Try to load the sentence transformer first
97
- try:
98
- sentence_model = SentenceTransformer(actual_model)
99
- self.keybert_models[model_name] = KeyBERT(model=sentence_model)
100
- print(f"βœ“ KeyBERT model {actual_model} loaded successfully")
101
- except Exception as e:
102
- print(f"Failed to load sentence transformer {actual_model}: {e}")
103
- # Try with just the model name
104
- self.keybert_models[model_name] = KeyBERT(model=actual_model)
105
-
106
- except Exception as e:
107
- print(f"Error loading KeyBERT model {model_name}: {str(e)}")
108
- print(f"Full error: {type(e).__name__}: {str(e)}")
109
- return None
110
- return self.keybert_models.get(model_name)
111
-
112
  def load_rake_extractor(self):
113
  """Load RAKE extractor with better error handling"""
114
  if self.rake_extractor is None:
@@ -135,10 +103,12 @@ class KeywordExtractionManager:
135
  # Handle different model types
136
  if model_name.startswith('yake_'):
137
  return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
138
- elif model_name.startswith('keybert_'):
139
- return self.extract_keybert_keywords(text, model_name, num_keywords, ngram_range, progress)
140
  elif model_name.startswith('rake_'):
141
  return self.extract_rake_keywords(text, num_keywords, progress)
 
 
142
  else:
143
  raise ValueError(f"Unknown model: {model_name}")
144
 
@@ -186,54 +156,200 @@ class KeywordExtractionManager:
186
  print(f"YAKE extraction failed: {type(e).__name__}: {str(e)}")
187
  return self.fallback_keyword_extraction(text, num_keywords)
188
 
189
- def extract_keybert_keywords(self, text, model_name, num_keywords, ngram_range, progress):
190
- """Extract keywords using KeyBERT"""
191
  try:
 
 
 
 
192
  if progress:
193
- progress(0.4, desc="Loading KeyBERT model...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- kw_model = self.load_keybert_model(model_name)
196
- if kw_model is None:
197
- print(f"KeyBERT model {model_name} could not be loaded")
198
  return self.fallback_keyword_extraction(text, num_keywords)
199
 
 
 
 
200
  if progress:
201
- progress(0.6, desc="Processing with KeyBERT...")
202
 
203
- # Extract keywords with error handling
204
  try:
205
- keywords = kw_model.extract_keywords(
206
- text,
207
- keyphrase_ngram_range=ngram_range,
208
- stop_words='english',
209
- top_n=num_keywords
210
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  except Exception as e:
212
- print(f"KeyBERT extraction error: {e}")
213
- # Try without stop words
214
- keywords = kw_model.extract_keywords(
215
- text,
216
- keyphrase_ngram_range=ngram_range,
217
- top_n=num_keywords
218
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  if progress:
221
- progress(0.8, desc="Formatting results...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- # Format results
224
  results = []
225
- for keyword, score in keywords:
226
- results.append({
227
- 'keyword': keyword,
228
- 'score': score,
229
- 'model': f"KeyBERT-{model_name.replace('keybert_', '')}"
230
- })
231
-
232
- print(f"KeyBERT extracted {len(results)} keywords")
 
233
  return results
234
 
 
 
 
235
  except Exception as e:
236
- print(f"KeyBERT extraction failed: {type(e).__name__}: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  return self.fallback_keyword_extraction(text, num_keywords)
238
 
239
  def extract_rake_keywords(self, text, num_keywords, progress):
@@ -324,6 +440,19 @@ def get_score_color(score, max_score):
324
  else:
325
  return SCORE_COLORS['low']
326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
  def create_highlighted_html(text, keywords):
328
  """Create HTML with highlighted keywords in the text"""
329
  if not keywords:
@@ -341,12 +470,13 @@ def create_highlighted_html(text, keywords):
341
  keyword = kw_data['keyword']
342
  score = kw_data['score']
343
  color = get_score_color(score, max_score)
 
344
 
345
  # Create regex pattern for whole word matching (case-insensitive)
346
  pattern = r'\b' + re.escape(keyword) + r'\b'
347
 
348
  # Replace with highlighted version
349
- replacement = f'<span style="background-color: {color}; padding: 2px 4px; ' \
350
  f'border-radius: 3px; margin: 0 1px; ' \
351
  f'border: 1px solid {color}; color: white; font-weight: bold;" ' \
352
  f'title="Score: {score:.3f}">{keyword}</span>'
@@ -356,12 +486,12 @@ def create_highlighted_html(text, keywords):
356
  return f"""
357
  <div style='padding: 15px; border: 2px solid #ddd; border-radius: 8px; background-color: #fafafa; margin: 10px 0;'>
358
  <h4 style='margin: 0 0 15px 0; color: #333;'>πŸ“ Text with Highlighted Keywords</h4>
359
- <div style='line-height: 1.8; font-size: 16px; background-color: white; padding: 15px; border-radius: 5px;'>{highlighted_text}</div>
360
  </div>
361
  """
362
 
363
  def create_keyword_table_html(keywords):
364
- """Create HTML table for keywords"""
365
  if not keywords:
366
  return "<p style='text-align: center; padding: 20px;'>No keywords found.</p>"
367
 
@@ -372,7 +502,7 @@ def create_keyword_table_html(keywords):
372
  table_html = """
373
  <div style='max-height: 600px; overflow-y: auto; border: 2px solid #ddd; border-radius: 8px; padding: 20px; background-color: #fafafa;'>
374
  <h3 style="margin: 0 0 20px 0;">🎯 Extracted Keywords</h3>
375
- <table style="width: 100%; border-collapse: collapse; border: 1px solid #ddd; background-color: white;">
376
  <thead>
377
  <tr style="background-color: #4ECDC4; color: white;">
378
  <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Rank</th>
@@ -388,6 +518,7 @@ def create_keyword_table_html(keywords):
388
  for i, kw_data in enumerate(sorted_keywords):
389
  score = kw_data['score']
390
  color = get_score_color(score, max_score)
 
391
 
392
  # Create relevance bar
393
  bar_width = int((score / max_score) * 100) if max_score > 0 else 0
@@ -398,7 +529,7 @@ def create_keyword_table_html(keywords):
398
  """
399
 
400
  table_html += f"""
401
- <tr style="background-color: #fff;">
402
  <td style="padding: 10px; border: 1px solid #ddd; text-align: center; font-weight: bold;">#{i+1}</td>
403
  <td style="padding: 10px; border: 1px solid #ddd; font-weight: bold;">{kw_data['keyword']}</td>
404
  <td style="padding: 10px; border: 1px solid #ddd;">
@@ -422,22 +553,87 @@ def create_keyword_table_html(keywords):
422
  return table_html
423
 
424
  def create_legend_html():
425
- """Create a legend showing score colors"""
426
  html = """
427
  <div style='margin: 15px 0; padding: 15px; background-color: #f8f9fa; border-radius: 8px;'>
428
- <h4 style='margin: 0 0 15px 0;'>🎨 Relevance Score Legend</h4>
429
  <div style='display: flex; flex-wrap: wrap; gap: 15px;'>
430
- <span style='background-color: #00B894; padding: 4px 12px; border-radius: 15px; color: white; font-weight: bold;'>
 
 
 
 
 
 
 
 
 
431
  High Relevance (70%+)
432
- </span>
433
- <span style='background-color: #F9CA24; padding: 4px 12px; border-radius: 15px; color: white; font-weight: bold;'>
 
 
 
434
  Medium Relevance (40-70%)
435
- </span>
436
- <span style='background-color: #FF6B6B; padding: 4px 12px; border-radius: 15px; color: white; font-weight: bold;'>
 
 
 
437
  Low Relevance (<40%)
438
- </span>
439
  </div>
440
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  """
442
  return html
443
 
@@ -476,7 +672,7 @@ def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progr
476
 
477
  # Create summary
478
  avg_score = sum(k['score'] for k in keywords) / len(keywords)
479
- model_display = selected_model.replace('yake_', '').replace('keybert_', 'KeyBERT-').replace('rake_', 'RAKE-').title()
480
  summary = f"""
481
  ## πŸ“Š Analysis Summary
482
  - **Keywords extracted:** {len(keywords)}
@@ -495,13 +691,14 @@ def create_interface():
495
  gr.Markdown("""
496
  # Keyword Extraction Explorer Tool
497
 
498
- Extract the most important keywords and phrases from your text using various algorithms! This tool uses modern keyword extraction methods including YAKE, KeyBERT, and RAKE for comprehensive analysis.
499
 
500
  ### How to use:
501
  1. **πŸ“ Enter your text** in the text area below
502
  2. **🎯 Select a model** from the dropdown for keyword extraction
503
  3. **βš™οΈ Adjust parameters** (number of keywords, n-gram range)
504
  4. **πŸ” Click "Extract Keywords"** to see results with organized output
 
505
  """)
506
 
507
  # Add tip box
@@ -573,13 +770,17 @@ def create_interface():
573
  <dd style="display: inline; margin-left: 5px;">Statistical approach requiring no training - works well on short texts and multilingual content</dd>
574
  </div>
575
  <div style="margin-bottom: 8px;">
576
- <dt style="font-weight: bold; display: inline; color: #795548;">KeyBERT MiniLM:</dt>
577
- <dd style="display: inline; margin-left: 5px;">Lightweight BERT model - faster processing with good results</dd>
578
  </div>
579
  <div style="margin-bottom: 8px;">
580
  <dt style="font-weight: bold; display: inline; color: #FF5722;">RAKE-NLTK:</dt>
581
  <dd style="display: inline; margin-left: 5px;">Classic keyword extraction algorithm - fast and reliable for phrase extraction</dd>
582
  </div>
 
 
 
 
583
  </dl>
584
  </div>
585
  </details>
@@ -624,7 +825,7 @@ def create_interface():
624
  ],
625
  [
626
  "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
627
- "keybert_all-MiniLM-L6-v2",
628
  10,
629
  1,
630
  3
@@ -658,9 +859,9 @@ def create_interface():
658
  Yet Another Keyword Extractor β†—
659
  </a>
660
  </li>
661
- <li><strong>KeyBERT:</strong>
662
- <a href="https://github.com/MaartenGr/KeyBERT" target="_blank" style="color: #1976d2;">
663
- Minimal keyword extraction with BERT β†—
664
  </a>
665
  </li>
666
  <li><strong>RAKE-NLTK:</strong>
@@ -668,9 +869,9 @@ def create_interface():
668
  Rapid Automatic Keyword Extraction with NLTK β†—
669
  </a>
670
  </li>
671
- <li><strong>Sentence Transformers:</strong>
672
- <a href="https://www.sbert.net/" target="_blank" style="color: #1976d2;">
673
- BERT-based models for semantic similarity β†—
674
  </a>
675
  </li>
676
  </ul>
 
15
  nltk.download('punkt', quiet=True)
16
  print("NLTK data downloaded.")
17
 
18
+ # Reliable model names and descriptions - replaced KeyBERT with working alternatives
19
  KEYWORD_MODELS = {
20
  'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
21
+ 'tfidf_cosine': 'TF-IDF with Cosine Similarity - Document similarity approach',
22
+ 'rake_nltk': 'RAKE-NLTK - Rapid Automatic Keyword Extraction',
23
+ 'textrank': 'TextRank - Graph-based ranking algorithm'
24
  }
25
 
 
 
 
26
  # Color palette for keywords based on scores
27
  SCORE_COLORS = {
28
  'high': '#00B894', # Green - High relevance
 
39
 
40
  class KeywordExtractionManager:
41
  def __init__(self):
 
42
  self.rake_extractor = None
43
  self.models_initialized = False
44
  self.initialize_models()
 
54
  except ImportError as e:
55
  print(f"βœ— YAKE not available: {e}")
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  # Test RAKE
58
  try:
59
  from rake_nltk import Rake
 
61
  except ImportError as e:
62
  print(f"βœ— RAKE-NLTK not available: {e}")
63
 
64
+ # Test sklearn for TF-IDF
65
+ try:
66
+ from sklearn.feature_extraction.text import TfidfVectorizer
67
+ print("βœ“ Scikit-learn available for TF-IDF")
68
+ except ImportError as e:
69
+ print(f"βœ— Scikit-learn not available: {e}")
70
+
71
+ # Test networkx for TextRank
72
+ try:
73
+ import networkx
74
+ print("βœ“ NetworkX available for TextRank")
75
+ except ImportError as e:
76
+ print(f"βœ— NetworkX not available: {e}")
77
+
78
  self.models_initialized = True
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def load_rake_extractor(self):
81
  """Load RAKE extractor with better error handling"""
82
  if self.rake_extractor is None:
 
103
  # Handle different model types
104
  if model_name.startswith('yake_'):
105
  return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
106
+ elif model_name.startswith('tfidf_'):
107
+ return self.extract_tfidf_cosine_keywords(text, num_keywords, ngram_range, progress)
108
  elif model_name.startswith('rake_'):
109
  return self.extract_rake_keywords(text, num_keywords, progress)
110
+ elif model_name.startswith('textrank'):
111
+ return self.extract_textrank_keywords(text, num_keywords, ngram_range, progress)
112
  else:
113
  raise ValueError(f"Unknown model: {model_name}")
114
 
 
156
  print(f"YAKE extraction failed: {type(e).__name__}: {str(e)}")
157
  return self.fallback_keyword_extraction(text, num_keywords)
158
 
159
+ def extract_tfidf_cosine_keywords(self, text, num_keywords, ngram_range, progress):
160
+ """Extract keywords using TF-IDF with cosine similarity"""
161
  try:
162
+ from sklearn.feature_extraction.text import TfidfVectorizer
163
+ from sklearn.metrics.pairwise import cosine_similarity
164
+ import numpy as np
165
+
166
  if progress:
167
+ progress(0.5, desc="Processing with TF-IDF...")
168
+
169
+ # Create TF-IDF vectorizer
170
+ vectorizer = TfidfVectorizer(
171
+ ngram_range=ngram_range,
172
+ stop_words='english',
173
+ max_features=5000,
174
+ min_df=1,
175
+ max_df=0.95
176
+ )
177
+
178
+ # Extract candidate keywords/phrases
179
+ words = re.findall(r'\b[a-z]+\b', text.lower())
180
+ candidates = []
181
+
182
+ # Generate n-grams
183
+ for n in range(ngram_range[0], ngram_range[1] + 1):
184
+ for i in range(len(words) - n + 1):
185
+ candidate = ' '.join(words[i:i+n])
186
+ if len(candidate) > 2 and candidate not in candidates:
187
+ candidates.append(candidate)
188
 
189
+ if not candidates:
 
 
190
  return self.fallback_keyword_extraction(text, num_keywords)
191
 
192
+ # Limit candidates to prevent memory issues
193
+ candidates = candidates[:300]
194
+
195
  if progress:
196
+ progress(0.7, desc="Computing similarities...")
197
 
 
198
  try:
199
+ # Create document embedding
200
+ doc_embedding = vectorizer.fit_transform([text])
201
+
202
+ # Create embeddings for candidates
203
+ candidate_embeddings = vectorizer.transform(candidates)
204
+
205
+ # Calculate similarities
206
+ similarities = cosine_similarity(doc_embedding, candidate_embeddings)[0]
207
+
208
+ # Get top keywords
209
+ top_indices = similarities.argsort()[-num_keywords:][::-1]
210
+
211
+ results = []
212
+ for idx in top_indices:
213
+ if similarities[idx] > 0:
214
+ results.append({
215
+ 'keyword': candidates[idx],
216
+ 'score': float(similarities[idx]),
217
+ 'model': 'TF-IDF-Cosine'
218
+ })
219
+
220
+ if progress:
221
+ progress(0.8, desc="Formatting results...")
222
+
223
+ print(f"TF-IDF extracted {len(results)} keywords")
224
+ return results
225
+
226
  except Exception as e:
227
+ print(f"TF-IDF approach failed: {e}")
228
+ # Fall back to simple TF-IDF
229
+ return self.simple_tfidf_extraction(text, num_keywords, ngram_range)
230
+
231
+ except ImportError:
232
+ print("scikit-learn not available for TF-IDF")
233
+ return self.fallback_keyword_extraction(text, num_keywords)
234
+ except Exception as e:
235
+ print(f"TF-IDF extraction failed: {e}")
236
+ return self.fallback_keyword_extraction(text, num_keywords)
237
+
238
+ def extract_textrank_keywords(self, text, num_keywords, ngram_range, progress):
239
+ """Extract keywords using TextRank algorithm"""
240
+ try:
241
+ import numpy as np
242
+ from sklearn.feature_extraction.text import TfidfVectorizer
243
+ from sklearn.metrics.pairwise import cosine_similarity
244
+ import networkx as nx
245
+
246
+ if progress:
247
+ progress(0.5, desc="Processing with TextRank...")
248
+
249
+ # Split text into sentences
250
+ sentences = re.split(r'[.!?]+', text)
251
+ sentences = [s.strip() for s in sentences if s.strip()]
252
+
253
+ if len(sentences) < 2:
254
+ # If text is too short, use simple extraction
255
+ return self.simple_tfidf_extraction(text, num_keywords, ngram_range)
256
+
257
+ # Create TF-IDF matrix
258
+ vectorizer = TfidfVectorizer(
259
+ ngram_range=(1, 1), # Use unigrams for sentence similarity
260
+ stop_words='english'
261
+ )
262
+
263
+ tfidf_matrix = vectorizer.fit_transform(sentences)
264
+
265
+ # Calculate similarity matrix
266
+ similarity_matrix = cosine_similarity(tfidf_matrix)
267
 
268
  if progress:
269
+ progress(0.6, desc="Building graph...")
270
+
271
+ # Build graph
272
+ nx_graph = nx.from_numpy_array(similarity_matrix)
273
+
274
+ # Calculate PageRank scores
275
+ scores = nx.pagerank(nx_graph)
276
+
277
+ # Extract keywords from top-ranked sentences
278
+ top_sentence_indices = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)[:3]
279
+
280
+ # Extract keywords from top sentences
281
+ keyword_vectorizer = TfidfVectorizer(
282
+ ngram_range=ngram_range,
283
+ stop_words='english',
284
+ max_features=num_keywords * 2
285
+ )
286
+
287
+ top_sentences = [sentences[i] for i in top_sentence_indices]
288
+ top_text = ' '.join(top_sentences)
289
+
290
+ if progress:
291
+ progress(0.7, desc="Extracting keywords...")
292
+
293
+ tfidf_matrix = keyword_vectorizer.fit_transform([top_text])
294
+ feature_names = keyword_vectorizer.get_feature_names_out()
295
+ tfidf_scores = tfidf_matrix.toarray()[0]
296
+
297
+ # Get top keywords
298
+ top_indices = tfidf_scores.argsort()[-num_keywords:][::-1]
299
 
 
300
  results = []
301
+ for idx in top_indices:
302
+ if tfidf_scores[idx] > 0:
303
+ results.append({
304
+ 'keyword': feature_names[idx],
305
+ 'score': float(tfidf_scores[idx]),
306
+ 'model': 'TextRank'
307
+ })
308
+
309
+ print(f"TextRank extracted {len(results)} keywords")
310
  return results
311
 
312
+ except ImportError as e:
313
+ print(f"Required library not available for TextRank: {e}")
314
+ return self.fallback_keyword_extraction(text, num_keywords)
315
  except Exception as e:
316
+ print(f"TextRank extraction failed: {e}")
317
+ return self.fallback_keyword_extraction(text, num_keywords)
318
+
319
+ def simple_tfidf_extraction(self, text, num_keywords, ngram_range):
320
+ """Simple TF-IDF extraction without cosine similarity"""
321
+ try:
322
+ from sklearn.feature_extraction.text import TfidfVectorizer
323
+
324
+ vectorizer = TfidfVectorizer(
325
+ ngram_range=ngram_range,
326
+ stop_words='english',
327
+ max_features=num_keywords * 2
328
+ )
329
+
330
+ # Fit and transform
331
+ tfidf_matrix = vectorizer.fit_transform([text])
332
+
333
+ # Get feature names and scores
334
+ feature_names = vectorizer.get_feature_names_out()
335
+ scores = tfidf_matrix.toarray()[0]
336
+
337
+ # Get top keywords
338
+ top_indices = scores.argsort()[-num_keywords:][::-1]
339
+
340
+ results = []
341
+ for idx in top_indices:
342
+ if scores[idx] > 0:
343
+ results.append({
344
+ 'keyword': feature_names[idx],
345
+ 'score': float(scores[idx]),
346
+ 'model': 'TF-IDF-Simple'
347
+ })
348
+
349
+ return results
350
+
351
+ except Exception as e:
352
+ print(f"Simple TF-IDF failed: {e}")
353
  return self.fallback_keyword_extraction(text, num_keywords)
354
 
355
  def extract_rake_keywords(self, text, num_keywords, progress):
 
440
  else:
441
  return SCORE_COLORS['low']
442
 
443
+ def get_relevance_level(score, max_score):
444
+ """Get relevance level name based on score"""
445
+ if max_score == 0:
446
+ return 'medium'
447
+
448
+ relative_score = score / max_score
449
+ if relative_score >= 0.7:
450
+ return 'high'
451
+ elif relative_score >= 0.4:
452
+ return 'medium'
453
+ else:
454
+ return 'low'
455
+
456
  def create_highlighted_html(text, keywords):
457
  """Create HTML with highlighted keywords in the text"""
458
  if not keywords:
 
470
  keyword = kw_data['keyword']
471
  score = kw_data['score']
472
  color = get_score_color(score, max_score)
473
+ relevance = get_relevance_level(score, max_score)
474
 
475
  # Create regex pattern for whole word matching (case-insensitive)
476
  pattern = r'\b' + re.escape(keyword) + r'\b'
477
 
478
  # Replace with highlighted version
479
+ replacement = f'<span class="keyword-{relevance}" style="background-color: {color}; padding: 2px 4px; ' \
480
  f'border-radius: 3px; margin: 0 1px; ' \
481
  f'border: 1px solid {color}; color: white; font-weight: bold;" ' \
482
  f'title="Score: {score:.3f}">{keyword}</span>'
 
486
  return f"""
487
  <div style='padding: 15px; border: 2px solid #ddd; border-radius: 8px; background-color: #fafafa; margin: 10px 0;'>
488
  <h4 style='margin: 0 0 15px 0; color: #333;'>πŸ“ Text with Highlighted Keywords</h4>
489
+ <div id="highlighted-text" style='line-height: 1.8; font-size: 16px; background-color: white; padding: 15px; border-radius: 5px;'>{highlighted_text}</div>
490
  </div>
491
  """
492
 
493
  def create_keyword_table_html(keywords):
494
+ """Create HTML table for keywords with filtering capability"""
495
  if not keywords:
496
  return "<p style='text-align: center; padding: 20px;'>No keywords found.</p>"
497
 
 
502
  table_html = """
503
  <div style='max-height: 600px; overflow-y: auto; border: 2px solid #ddd; border-radius: 8px; padding: 20px; background-color: #fafafa;'>
504
  <h3 style="margin: 0 0 20px 0;">🎯 Extracted Keywords</h3>
505
+ <table id="keywords-table" style="width: 100%; border-collapse: collapse; border: 1px solid #ddd; background-color: white;">
506
  <thead>
507
  <tr style="background-color: #4ECDC4; color: white;">
508
  <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Rank</th>
 
518
  for i, kw_data in enumerate(sorted_keywords):
519
  score = kw_data['score']
520
  color = get_score_color(score, max_score)
521
+ relevance = get_relevance_level(score, max_score)
522
 
523
  # Create relevance bar
524
  bar_width = int((score / max_score) * 100) if max_score > 0 else 0
 
529
  """
530
 
531
  table_html += f"""
532
+ <tr class="keyword-row relevance-{relevance}" style="background-color: #fff;">
533
  <td style="padding: 10px; border: 1px solid #ddd; text-align: center; font-weight: bold;">#{i+1}</td>
534
  <td style="padding: 10px; border: 1px solid #ddd; font-weight: bold;">{kw_data['keyword']}</td>
535
  <td style="padding: 10px; border: 1px solid #ddd;">
 
553
  return table_html
554
 
555
  def create_legend_html():
556
+ """Create an interactive legend showing score colors"""
557
  html = """
558
  <div style='margin: 15px 0; padding: 15px; background-color: #f8f9fa; border-radius: 8px;'>
559
+ <h4 style='margin: 0 0 15px 0;'>🎨 Relevance Score Legend (Click to Filter)</h4>
560
  <div style='display: flex; flex-wrap: wrap; gap: 15px;'>
561
+ <button onclick="filterByRelevance('all')"
562
+ style='background-color: #6c757d; padding: 8px 16px; border-radius: 15px;
563
+ color: white; font-weight: bold; border: none; cursor: pointer;
564
+ transition: all 0.3s ease;'>
565
+ Show All
566
+ </button>
567
+ <button onclick="filterByRelevance('high')"
568
+ style='background-color: #00B894; padding: 8px 16px; border-radius: 15px;
569
+ color: white; font-weight: bold; border: none; cursor: pointer;
570
+ transition: all 0.3s ease;'>
571
  High Relevance (70%+)
572
+ </button>
573
+ <button onclick="filterByRelevance('medium')"
574
+ style='background-color: #F9CA24; padding: 8px 16px; border-radius: 15px;
575
+ color: white; font-weight: bold; border: none; cursor: pointer;
576
+ transition: all 0.3s ease;'>
577
  Medium Relevance (40-70%)
578
+ </button>
579
+ <button onclick="filterByRelevance('low')"
580
+ style='background-color: #FF6B6B; padding: 8px 16px; border-radius: 15px;
581
+ color: white; font-weight: bold; border: none; cursor: pointer;
582
+ transition: all 0.3s ease;'>
583
  Low Relevance (<40%)
584
+ </button>
585
  </div>
586
  </div>
587
+
588
+ <script>
589
+ function filterByRelevance(level) {
590
+ const table = document.getElementById('keywords-table');
591
+ const rows = table.getElementsByClassName('keyword-row');
592
+ const textContainer = document.getElementById('highlighted-text');
593
+ const keywords = textContainer.getElementsByTagName('span');
594
+
595
+ // Filter table rows
596
+ for (let row of rows) {
597
+ if (level === 'all') {
598
+ row.style.display = '';
599
+ } else {
600
+ if (row.classList.contains('relevance-' + level)) {
601
+ row.style.display = '';
602
+ } else {
603
+ row.style.display = 'none';
604
+ }
605
+ }
606
+ }
607
+
608
+ // Highlight keywords in text
609
+ for (let keyword of keywords) {
610
+ if (level === 'all') {
611
+ keyword.style.opacity = '1';
612
+ keyword.style.filter = 'none';
613
+ } else {
614
+ if (keyword.classList.contains('keyword-' + level)) {
615
+ keyword.style.opacity = '1';
616
+ keyword.style.filter = 'none';
617
+ } else {
618
+ keyword.style.opacity = '0.3';
619
+ keyword.style.filter = 'grayscale(100%)';
620
+ }
621
+ }
622
+ }
623
+
624
+ // Update button styles
625
+ const buttons = document.querySelectorAll('button');
626
+ buttons.forEach(button => {
627
+ if (button.onclick && button.onclick.toString().includes(level)) {
628
+ button.style.transform = 'scale(1.1)';
629
+ button.style.boxShadow = '0 4px 8px rgba(0,0,0,0.2)';
630
+ } else {
631
+ button.style.transform = 'scale(1)';
632
+ button.style.boxShadow = 'none';
633
+ }
634
+ });
635
+ }
636
+ </script>
637
  """
638
  return html
639
 
 
672
 
673
  # Create summary
674
  avg_score = sum(k['score'] for k in keywords) / len(keywords)
675
+ model_display = selected_model.replace('yake_', '').replace('tfidf_', 'TF-IDF ').replace('rake_', 'RAKE-').replace('textrank', 'TextRank').title()
676
  summary = f"""
677
  ## πŸ“Š Analysis Summary
678
  - **Keywords extracted:** {len(keywords)}
 
691
  gr.Markdown("""
692
  # Keyword Extraction Explorer Tool
693
 
694
+ Extract the most important keywords and phrases from your text using various algorithms! This tool uses modern keyword extraction methods including YAKE, TF-IDF, RAKE, and TextRank for comprehensive analysis.
695
 
696
  ### How to use:
697
  1. **πŸ“ Enter your text** in the text area below
698
  2. **🎯 Select a model** from the dropdown for keyword extraction
699
  3. **βš™οΈ Adjust parameters** (number of keywords, n-gram range)
700
  4. **πŸ” Click "Extract Keywords"** to see results with organized output
701
+ 5. **🎨 Click on the legend buttons** to filter keywords by relevance level
702
  """)
703
 
704
  # Add tip box
 
770
  <dd style="display: inline; margin-left: 5px;">Statistical approach requiring no training - works well on short texts and multilingual content</dd>
771
  </div>
772
  <div style="margin-bottom: 8px;">
773
+ <dt style="font-weight: bold; display: inline; color: #795548;">TF-IDF with Cosine Similarity:</dt>
774
+ <dd style="display: inline; margin-left: 5px;">Document similarity approach - extracts keywords most similar to the overall document</dd>
775
  </div>
776
  <div style="margin-bottom: 8px;">
777
  <dt style="font-weight: bold; display: inline; color: #FF5722;">RAKE-NLTK:</dt>
778
  <dd style="display: inline; margin-left: 5px;">Classic keyword extraction algorithm - fast and reliable for phrase extraction</dd>
779
  </div>
780
+ <div style="margin-bottom: 8px;">
781
+ <dt style="font-weight: bold; display: inline; color: #607D8B;">TextRank:</dt>
782
+ <dd style="display: inline; margin-left: 5px;">Graph-based ranking algorithm inspired by PageRank - good for extracting key concepts</dd>
783
+ </div>
784
  </dl>
785
  </div>
786
  </details>
 
825
  ],
826
  [
827
  "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
828
+ "tfidf_cosine",
829
  10,
830
  1,
831
  3
 
859
  Yet Another Keyword Extractor β†—
860
  </a>
861
  </li>
862
+ <li><strong>TF-IDF:</strong>
863
+ <a href="https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting" target="_blank" style="color: #1976d2;">
864
+ Term Frequency-Inverse Document Frequency β†—
865
  </a>
866
  </li>
867
  <li><strong>RAKE-NLTK:</strong>
 
869
  Rapid Automatic Keyword Extraction with NLTK β†—
870
  </a>
871
  </li>
872
+ <li><strong>TextRank:</strong>
873
+ <a href="https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf" target="_blank" style="color: #1976d2;">
874
+ TextRank: Bringing Order into Text β†—
875
  </a>
876
  </li>
877
  </ul>