SorrelC committed on
Commit
62a9b35
·
verified ·
1 Parent(s): 4d6534f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -39
app.py CHANGED
@@ -6,13 +6,13 @@ import re
6
  import time
7
  warnings.filterwarnings('ignore')
8
 
9
- # Reliable model names and descriptions (PKE removed for compatibility)
10
  KEYWORD_MODELS = {
11
  'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
12
  'keybert_all-mpnet-base-v2': 'KeyBERT MPNet - BERT-based semantic similarity',
13
  'keybert_all-MiniLM-L6-v2': 'KeyBERT MiniLM - Lightweight BERT-based extraction',
14
  'keybert_paraphrase-mpnet-base-v2': 'KeyBERT Paraphrase - Optimized for paraphrase detection',
15
- 'rakun_rakun': 'RaKUn - Rapid Automatic Keyword Extraction'
16
  }
17
 
18
  # Color palette for keywords based on scores
@@ -32,6 +32,7 @@ KEYWORD_COLORS = [
32
  class KeywordExtractionManager:
33
  def __init__(self):
34
  self.keybert_models = {}
 
35
 
36
  def load_keybert_model(self, model_name):
37
  """Load KeyBERT model"""
@@ -47,6 +48,29 @@ class KeywordExtractionManager:
47
  return None
48
  return self.keybert_models[model_name]
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
51
  """Extract keywords using the specified model"""
52
  try:
@@ -58,8 +82,8 @@ class KeywordExtractionManager:
58
  return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
59
  elif model_name.startswith('keybert_'):
60
  return self.extract_keybert_keywords(text, model_name, num_keywords, ngram_range, progress)
61
- elif model_name.startswith('rakun_'):
62
- return self.extract_rakun_keywords(text, num_keywords, progress)
63
  else:
64
  raise ValueError(f"Unknown model: {model_name}")
65
 
@@ -144,51 +168,36 @@ class KeywordExtractionManager:
144
  print("KeyBERT library not found. Using fallback keyword extraction...")
145
  return self.fallback_keyword_extraction(text, num_keywords)
146
 
147
- def extract_rakun_keywords(self, text, num_keywords, progress):
148
- """Extract keywords using RaKUn"""
149
  try:
150
- from rakun import RakunDetector
151
-
152
  if progress:
153
- progress(0.5, desc="Processing with RaKUn...")
154
-
155
- # Initialize RaKUn
156
- hyperparameters = {
157
- "distance_threshold": 3,
158
- "num_keywords": num_keywords,
159
- "pair_diff_length": 2,
160
- "stopwords": "english",
161
- "bigram_count_threshold": 2,
162
- "num_tokens": [1, 2, 3]
163
- }
164
 
165
- keyword_detector = RakunDetector(hyperparameters)
 
 
166
 
167
  if progress:
168
  progress(0.7, desc="Extracting keywords...")
169
 
170
- keywords = keyword_detector.find_keywords(text)
 
 
171
 
172
  # Format results
173
  results = []
174
- for keyword_data in keywords[:num_keywords]:
175
- if isinstance(keyword_data, tuple):
176
- keyword, score = keyword_data
177
- else:
178
- # If no score available, assign based on rank
179
- keyword = keyword_data
180
- score = 1.0 / (keywords.index(keyword_data) + 1)
181
-
182
  results.append({
183
  'keyword': keyword,
184
  'score': score,
185
- 'model': 'RaKUn'
186
  })
187
 
188
  return results
189
 
190
  except ImportError:
191
- print("RaKUn library not found. Using fallback keyword extraction...")
192
  return self.fallback_keyword_extraction(text, num_keywords)
193
 
194
  def fallback_keyword_extraction(self, text, num_keywords=10):
@@ -387,10 +396,11 @@ def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progr
387
 
388
  # Create summary
389
  avg_score = sum(k['score'] for k in keywords) / len(keywords)
 
390
  summary = f"""
391
  ## 📊 Analysis Summary
392
  - **Keywords extracted:** {len(keywords)}
393
- - **Model used:** {selected_model.replace('yake_', '').replace('keybert_', 'KeyBERT-').replace('rakun_', '').title()}
394
  - **Average relevance score:** {avg_score:.4f}
395
  - **N-gram range:** {ngram_min}-{ngram_max} words
396
  """
@@ -405,7 +415,7 @@ def create_interface():
405
  gr.Markdown("""
406
  # Keyword Extraction Explorer Tool
407
 
408
- Extract the most important keywords and phrases from your text using various algorithms! This tool uses modern keyword extraction methods including YAKE, KeyBERT, and RaKUn for comprehensive analysis.
409
 
410
  ### How to use:
411
  1. **📝 Enter your text** in the text area below
@@ -495,8 +505,8 @@ def create_interface():
495
  <dd style="display: inline; margin-left: 5px;">Optimized for paraphrase detection - great for similar concept extraction</dd>
496
  </div>
497
  <div style="margin-bottom: 8px;">
498
- <dt style="font-weight: bold; display: inline; color: #FF5722;">RaKUn:</dt>
499
- <dd style="display: inline; margin-left: 5px;">Graph-based rapid extraction - efficient for large texts</dd>
500
  </div>
501
  </dl>
502
  </div>
@@ -549,7 +559,7 @@ def create_interface():
549
  ],
550
  [
551
  "Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
552
- "keybert_all-MiniLM-L6-v2",
553
  10,
554
  1,
555
  3
@@ -581,9 +591,9 @@ def create_interface():
581
  Minimal keyword extraction with BERT ↗
582
  </a>
583
  </li>
584
- <li><strong>RaKUn:</strong>
585
- <a href="https://github.com/SkBlaz/rakun" target="_blank" style="color: #1976d2;">
586
- Rapid Automatic Keyword Extraction ↗
587
  </a>
588
  </li>
589
  <li><strong>Sentence Transformers:</strong>
 
6
  import time
7
  warnings.filterwarnings('ignore')
8
 
9
+ # Reliable model names and descriptions
10
  KEYWORD_MODELS = {
11
  'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
12
  'keybert_all-mpnet-base-v2': 'KeyBERT MPNet - BERT-based semantic similarity',
13
  'keybert_all-MiniLM-L6-v2': 'KeyBERT MiniLM - Lightweight BERT-based extraction',
14
  'keybert_paraphrase-mpnet-base-v2': 'KeyBERT Paraphrase - Optimized for paraphrase detection',
15
+ 'rake_nltk': 'RAKE-NLTK - Rapid Automatic Keyword Extraction'
16
  }
17
 
18
  # Color palette for keywords based on scores
 
32
  class KeywordExtractionManager:
33
  def __init__(self):
34
  self.keybert_models = {}
35
+ self.rake_extractor = None
36
 
37
  def load_keybert_model(self, model_name):
38
  """Load KeyBERT model"""
 
48
  return None
49
  return self.keybert_models[model_name]
50
 
51
def load_rake_extractor(self):
    """Return the cached RAKE extractor, building it on first use.

    The extractor is stored on ``self.rake_extractor`` so later calls reuse
    it. Before constructing ``Rake()`` the NLTK resources it relies on are
    looked up and downloaded when missing.

    Returns:
        The ``rake_nltk.Rake`` instance, or ``None`` when the library or its
        data could not be loaded (the error is printed, not raised).
    """
    if self.rake_extractor is not None:
        return self.rake_extractor

    try:
        from rake_nltk import Rake
        import nltk

        # (lookup path, downloadable package id) for each resource to ensure.
        # 'punkt_tab' is the sentence-tokenizer data loaded by recent NLTK
        # releases; 'punkt' is kept for older ones. Without 'punkt_tab' the
        # extractor builds fine but tokenization fails later with LookupError.
        required_resources = (
            ('corpora/stopwords', 'stopwords'),
            ('tokenizers/punkt', 'punkt'),
            ('tokenizers/punkt_tab', 'punkt_tab'),
        )
        for resource_path, package_id in required_resources:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package_id, quiet=True)

        self.rake_extractor = Rake()
        print("✓ RAKE extractor loaded successfully")
    except Exception as e:
        # Deliberate best-effort: callers treat None as "use the fallback".
        print(f"Error loading RAKE extractor: {str(e)}")
        return None
    return self.rake_extractor
73
+
74
  def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
75
  """Extract keywords using the specified model"""
76
  try:
 
82
  return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
83
  elif model_name.startswith('keybert_'):
84
  return self.extract_keybert_keywords(text, model_name, num_keywords, ngram_range, progress)
85
+ elif model_name.startswith('rake_'):
86
+ return self.extract_rake_keywords(text, num_keywords, progress)
87
  else:
88
  raise ValueError(f"Unknown model: {model_name}")
89
 
 
168
  print("KeyBERT library not found. Using fallback keyword extraction...")
169
  return self.fallback_keyword_extraction(text, num_keywords)
170
 
171
def extract_rake_keywords(self, text, num_keywords, progress):
    """Rank phrases in ``text`` with RAKE and return the top ``num_keywords``.

    Args:
        text: The document to analyse.
        num_keywords: Maximum number of results to return.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            to report status; skipped when falsy.

    Returns:
        A list of dicts with ``'keyword'``, ``'score'`` and ``'model'`` keys,
        in the order RAKE ranked them. When the extractor is unavailable the
        result of ``fallback_keyword_extraction`` is returned instead.
    """
    try:
        if progress:
            progress(0.5, desc="Processing with RAKE...")

        extractor = self.load_rake_extractor()
        if extractor is None:
            return self.fallback_keyword_extraction(text, num_keywords)

        if progress:
            progress(0.7, desc="Extracting keywords...")

        extractor.extract_keywords_from_text(text)
        ranked_phrases = extractor.get_ranked_phrases_with_scores()

        # RAKE yields (score, phrase) pairs; keep only the requested count.
        return [
            {'keyword': phrase, 'score': weight, 'model': 'RAKE-NLTK'}
            for weight, phrase in ranked_phrases[:num_keywords]
        ]

    except ImportError:
        print("RAKE-NLTK library not found. Using fallback keyword extraction...")
        return self.fallback_keyword_extraction(text, num_keywords)
202
 
203
  def fallback_keyword_extraction(self, text, num_keywords=10):
 
396
 
397
  # Create summary
398
  avg_score = sum(k['score'] for k in keywords) / len(keywords)
399
+ model_display = selected_model.replace('yake_', '').replace('keybert_', 'KeyBERT-').replace('rake_', 'RAKE-').title()
400
  summary = f"""
401
  ## 📊 Analysis Summary
402
  - **Keywords extracted:** {len(keywords)}
403
+ - **Model used:** {model_display}
404
  - **Average relevance score:** {avg_score:.4f}
405
  - **N-gram range:** {ngram_min}-{ngram_max} words
406
  """
 
415
  gr.Markdown("""
416
  # Keyword Extraction Explorer Tool
417
 
418
+ Extract the most important keywords and phrases from your text using various algorithms! This tool uses modern keyword extraction methods including YAKE, KeyBERT, and RAKE for comprehensive analysis.
419
 
420
  ### How to use:
421
  1. **📝 Enter your text** in the text area below
 
505
  <dd style="display: inline; margin-left: 5px;">Optimized for paraphrase detection - great for similar concept extraction</dd>
506
  </div>
507
  <div style="margin-bottom: 8px;">
508
+ <dt style="font-weight: bold; display: inline; color: #FF5722;">RAKE-NLTK:</dt>
509
+ <dd style="display: inline; margin-left: 5px;">Classic keyword extraction algorithm - fast and reliable for phrase extraction</dd>
510
  </div>
511
  </dl>
512
  </div>
 
559
  ],
560
  [
561
  "Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
562
+ "rake_nltk",
563
  10,
564
  1,
565
  3
 
591
  Minimal keyword extraction with BERT ↗
592
  </a>
593
  </li>
594
+ <li><strong>RAKE-NLTK:</strong>
595
+ <a href="https://github.com/csurfer/rake-nltk" target="_blank" style="color: #1976d2;">
596
+ Rapid Automatic Keyword Extraction with NLTK
597
  </a>
598
  </li>
599
  <li><strong>Sentence Transformers:</strong>