SorrelC committed on
Commit
4b1a89e
·
verified ·
1 Parent(s): b4121fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -96
app.py CHANGED
@@ -6,14 +6,13 @@ import re
6
  import time
7
  warnings.filterwarnings('ignore')
8
 
9
- # PKE model names and descriptions
10
- PKE_MODELS = {
11
- 'kw_pke_multipartiterank': 'MultipartiteRank - Graph-based ranking using topic clustering',
12
- 'kw_pke_singlerank': 'SingleRank - Graph-based ranking algorithm',
13
- 'kw_pke_tfidf': 'TF-IDF - Term Frequency-Inverse Document Frequency',
14
- 'kw_pke_topicrank': 'TopicRank - Graph-based with topic clustering',
15
- 'kw_pke_textrank': 'TextRank - Graph-based ranking algorithm',
16
- 'kw_pke_positionrank': 'PositionRank - Incorporates word positions'
17
  }
18
 
19
  # Color palette for keywords based on scores
@@ -32,78 +31,103 @@ KEYWORD_COLORS = [
32
 
33
  class KeywordExtractionManager:
34
  def __init__(self):
35
- self.pke_models = {}
36
- self.spacy_model = None
37
 
38
- def load_spacy_model(self):
39
- """Load spaCy model for preprocessing"""
40
- if self.spacy_model is None:
41
  try:
42
- import spacy
43
- try:
44
- self.spacy_model = spacy.load("en_core_web_sm")
45
- print("✓ spaCy model loaded successfully")
46
- except OSError:
47
- print("spaCy model not found. Please install with: python -m spacy download en_core_web_sm")
48
- return None
49
  except Exception as e:
50
- print(f"Error loading spaCy model: {str(e)}")
51
  return None
52
- return self.spacy_model
53
 
54
  def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
55
- """Extract keywords using the specified PKE model"""
56
  try:
57
- import pke
58
-
59
  if progress:
60
  progress(0.3, desc="Loading model...")
61
 
62
- # Initialize the extractor based on model name
63
- if 'multipartiterank' in model_name:
64
- extractor = pke.unsupervised.MultipartiteRank()
65
- elif 'singlerank' in model_name:
66
- extractor = pke.unsupervised.SingleRank()
67
- elif 'tfidf' in model_name:
68
- extractor = pke.unsupervised.TfIdf()
69
- elif 'topicrank' in model_name:
70
- extractor = pke.unsupervised.TopicRank()
71
- elif 'textrank' in model_name:
72
- extractor = pke.unsupervised.TextRank()
73
- elif 'positionrank' in model_name:
74
- extractor = pke.unsupervised.PositionRank()
75
  else:
76
  raise ValueError(f"Unknown model: {model_name}")
77
 
 
 
 
 
 
 
 
 
 
78
  if progress:
79
- progress(0.5, desc="Processing text...")
80
-
81
- # Load the text
82
- extractor.load_document(input=text, language='en')
 
 
 
 
 
83
 
84
- # Select candidates based on model
85
- if 'multipartiterank' in model_name:
86
- extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
87
- extractor.candidate_weighting(alpha=1.1, threshold=0.75, method='average')
88
- elif 'topicrank' in model_name:
89
- extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
90
- extractor.candidate_weighting(threshold=0.74, method='average')
91
- elif 'positionrank' in model_name:
92
- extractor.candidate_selection(maximum_word_number=3)
93
- extractor.candidate_weighting(window=10)
94
- elif 'tfidf' in model_name:
95
- extractor.candidate_selection(n=ngram_range[1], stoplist=['en'])
96
- extractor.candidate_weighting()
97
- else:
98
- # SingleRank and TextRank
99
- extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
100
- extractor.candidate_weighting(window=10)
101
-
102
  if progress:
103
  progress(0.7, desc="Extracting keywords...")
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- # Get keywords
106
- keywords = extractor.get_n_best(n=num_keywords)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  # Format results
109
  results = []
@@ -111,16 +135,60 @@ class KeywordExtractionManager:
111
  results.append({
112
  'keyword': keyword,
113
  'score': score,
114
- 'model': model_name.replace('kw_pke_', '').title()
115
  })
116
 
117
  return results
118
 
119
  except ImportError:
120
- print("PKE library not found. Using fallback keyword extraction...")
121
  return self.fallback_keyword_extraction(text, num_keywords)
122
- except Exception as e:
123
- print(f"Error with {model_name}: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  return self.fallback_keyword_extraction(text, num_keywords)
125
 
126
  def fallback_keyword_extraction(self, text, num_keywords=10):
@@ -322,7 +390,7 @@ def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progr
322
  summary = f"""
323
  ## 📊 Analysis Summary
324
  - **Keywords extracted:** {len(keywords)}
325
- - **Model used:** {selected_model.replace('kw_pke_', '').title()}
326
  - **Average relevance score:** {avg_score:.4f}
327
  - **N-gram range:** {ngram_min}-{ngram_max} words
328
  """
@@ -337,7 +405,7 @@ def create_interface():
337
  gr.Markdown("""
338
  # Keyword Extraction Explorer Tool
339
 
340
- Extract the most important keywords and phrases from your text using various algorithms! This tool uses PKE (Python Keyphrase Extraction) models for comprehensive keyword extraction.
341
 
342
  ### How to use:
343
  1. **📝 Enter your text** in the text area below
@@ -365,8 +433,8 @@ def create_interface():
365
  with gr.Column(scale=1):
366
  # Model selector
367
  model_dropdown = gr.Dropdown(
368
- choices=list(PKE_MODELS.keys()),
369
- value='kw_pke_multipartiterank',
370
  label="🎯 Select Keyword Extraction Model"
371
  )
372
 
@@ -394,6 +462,13 @@ def create_interface():
394
  step=1,
395
  label="Max N-gram"
396
  )
 
 
 
 
 
 
 
397
 
398
  # Add model descriptions
399
  gr.HTML("""
@@ -404,28 +479,24 @@ def create_interface():
404
  <div style="margin-top: 10px; padding: 10px;">
405
  <dl style="margin: 0; font-size: 14px;">
406
  <div style="margin-bottom: 8px;">
407
- <dt style="font-weight: bold; display: inline; color: #4ECDC4;">MultipartiteRank:</dt>
408
- <dd style="display: inline; margin-left: 5px;">Graph-based ranking using topic clustering - excellent for diverse texts</dd>
409
  </div>
410
  <div style="margin-bottom: 8px;">
411
- <dt style="font-weight: bold; display: inline; color: #45B7D1;">SingleRank:</dt>
412
- <dd style="display: inline; margin-left: 5px;">Simple graph-based algorithm - fast and effective</dd>
413
  </div>
414
  <div style="margin-bottom: 8px;">
415
- <dt style="font-weight: bold; display: inline; color: #F9CA24;">TF-IDF:</dt>
416
- <dd style="display: inline; margin-left: 5px;">Statistical approach - good for technical texts</dd>
417
  </div>
418
  <div style="margin-bottom: 8px;">
419
- <dt style="font-weight: bold; display: inline; color: #6C5CE7;">TopicRank:</dt>
420
- <dd style="display: inline; margin-left: 5px;">Groups similar candidates - reduces redundancy</dd>
421
  </div>
422
  <div style="margin-bottom: 8px;">
423
- <dt style="font-weight: bold; display: inline; color: #00B894;">TextRank:</dt>
424
- <dd style="display: inline; margin-left: 5px;">Classic PageRank-inspired algorithm</dd>
425
- </div>
426
- <div style="margin-bottom: 8px;">
427
- <dt style="font-weight: bold; display: inline; color: #E17055;">PositionRank:</dt>
428
- <dd style="display: inline; margin-left: 5px;">Incorporates word positions - good for structured documents</dd>
429
  </div>
430
  </dl>
431
  </div>
@@ -464,21 +535,21 @@ def create_interface():
464
  examples=[
465
  [
466
  "On June 6, 1944, Allied forces launched Operation Overlord, the invasion of Normandy. General Dwight D. Eisenhower commanded the operation, while Field Marshal Bernard Montgomery led ground forces. The BBC broadcast coded messages to the French Resistance, including the famous line 'The long sobs of autumn violins.'",
467
- "kw_pke_multipartiterank",
468
  10,
469
  1,
470
  3
471
  ],
472
  [
473
  "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
474
- "kw_pke_topicrank",
475
  10,
476
  1,
477
  3
478
  ],
479
  [
480
  "Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
481
- "kw_pke_textrank",
482
  10,
483
  1,
484
  3
@@ -500,14 +571,24 @@ def create_interface():
500
  <h4 style="margin-top: 0;">📚 Model Information & Documentation</h4>
501
  <p style="font-size: 14px; margin-bottom: 15px;">Learn more about the algorithms used in this tool:</p>
502
  <ul style="font-size: 14px; line-height: 1.8;">
503
- <li><strong>PKE Library:</strong>
504
- <a href="https://github.com/boudinfl/pke" target="_blank" style="color: #1976d2;">
505
- Python Keyphrase Extraction (PKE) GitHub
 
 
 
 
 
506
  </a>
507
  </li>
508
- <li><strong>Algorithm Papers:</strong>
509
- <a href="https://boudinfl.github.io/pke/" target="_blank" style="color: #1976d2;">
510
- PKE Documentation & References
 
 
 
 
 
511
  </a>
512
  </li>
513
  </ul>
@@ -531,6 +612,4 @@ def create_interface():
531
 
532
  if __name__ == "__main__":
533
  demo = create_interface()
534
- demo.launch()
535
-
536
-
 
6
  import time
7
  warnings.filterwarnings('ignore')
8
 
9
+ # Reliable model names and descriptions (PKE removed for compatibility)
10
+ KEYWORD_MODELS = {
11
+ 'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
12
+ 'keybert_all-mpnet-base-v2': 'KeyBERT MPNet - BERT-based semantic similarity',
13
+ 'keybert_all-MiniLM-L6-v2': 'KeyBERT MiniLM - Lightweight BERT-based extraction',
14
+ 'keybert_paraphrase-mpnet-base-v2': 'KeyBERT Paraphrase - Optimized for paraphrase detection',
15
+ 'rakun_rakun': 'RaKUn - Rapid Automatic Keyword Extraction'
 
16
  }
17
 
18
  # Color palette for keywords based on scores
 
31
 
32
  class KeywordExtractionManager:
33
  def __init__(self):
34
+ self.keybert_models = {}
 
35
 
36
+ def load_keybert_model(self, model_name):
37
+ """Load KeyBERT model"""
38
+ if model_name not in self.keybert_models:
39
  try:
40
+ from keybert import KeyBERT
41
+ # Extract the actual model name from the identifier
42
+ actual_model = model_name.replace('keybert_', '')
43
+ self.keybert_models[model_name] = KeyBERT(model=actual_model)
44
+ print(f"✓ KeyBERT model {actual_model} loaded successfully")
 
 
45
  except Exception as e:
46
+ print(f"Error loading KeyBERT model {model_name}: {str(e)}")
47
  return None
48
+ return self.keybert_models[model_name]
49
 
50
  def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
51
+ """Extract keywords using the specified model"""
52
  try:
 
 
53
  if progress:
54
  progress(0.3, desc="Loading model...")
55
 
56
+ # Handle different model types
57
+ if model_name.startswith('yake_'):
58
+ return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
59
+ elif model_name.startswith('keybert_'):
60
+ return self.extract_keybert_keywords(text, model_name, num_keywords, ngram_range, progress)
61
+ elif model_name.startswith('rakun_'):
62
+ return self.extract_rakun_keywords(text, num_keywords, progress)
 
 
 
 
 
 
63
  else:
64
  raise ValueError(f"Unknown model: {model_name}")
65
 
66
+ except Exception as e:
67
+ print(f"Error with {model_name}: {str(e)}")
68
+ return self.fallback_keyword_extraction(text, num_keywords)
69
+
70
+ def extract_yake_keywords(self, text, num_keywords, ngram_range, progress):
71
+ """Extract keywords using YAKE"""
72
+ try:
73
+ import yake
74
+
75
  if progress:
76
+ progress(0.5, desc="Processing with YAKE...")
77
+
78
+ # Configure YAKE
79
+ kw_extractor = yake.KeywordExtractor(
80
+ lan="en",
81
+ n=ngram_range[1],
82
+ dedupLim=0.7,
83
+ top=num_keywords
84
+ )
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  if progress:
87
  progress(0.7, desc="Extracting keywords...")
88
+
89
+ keywords = kw_extractor.extract_keywords(text)
90
+
91
+ # Format results (YAKE returns lower scores for better keywords)
92
+ results = []
93
+ for keyword, score in keywords:
94
+ # Invert score for consistency (higher = better)
95
+ inverted_score = 1.0 / (1.0 + score)
96
+ results.append({
97
+ 'keyword': keyword,
98
+ 'score': inverted_score,
99
+ 'model': 'YAKE'
100
+ })
101
 
102
+ return results
103
+
104
+ except ImportError:
105
+ print("YAKE library not found. Using fallback keyword extraction...")
106
+ return self.fallback_keyword_extraction(text, num_keywords)
107
+
108
+ def extract_keybert_keywords(self, text, model_name, num_keywords, ngram_range, progress):
109
+ """Extract keywords using KeyBERT"""
110
+ try:
111
+ if progress:
112
+ progress(0.4, desc="Loading KeyBERT model...")
113
+
114
+ kw_model = self.load_keybert_model(model_name)
115
+ if kw_model is None:
116
+ return self.fallback_keyword_extraction(text, num_keywords)
117
+
118
+ if progress:
119
+ progress(0.6, desc="Processing with KeyBERT...")
120
+
121
+ # Extract keywords
122
+ keywords = kw_model.extract_keywords(
123
+ text,
124
+ keyphrase_ngram_range=ngram_range,
125
+ stop_words='english',
126
+ top_k=num_keywords
127
+ )
128
+
129
+ if progress:
130
+ progress(0.8, desc="Formatting results...")
131
 
132
  # Format results
133
  results = []
 
135
  results.append({
136
  'keyword': keyword,
137
  'score': score,
138
+ 'model': f"KeyBERT-{model_name.replace('keybert_', '')}"
139
  })
140
 
141
  return results
142
 
143
  except ImportError:
144
+ print("KeyBERT library not found. Using fallback keyword extraction...")
145
  return self.fallback_keyword_extraction(text, num_keywords)
146
+
147
+ def extract_rakun_keywords(self, text, num_keywords, progress):
148
+ """Extract keywords using RaKUn"""
149
+ try:
150
+ from rakun import RakunDetector
151
+
152
+ if progress:
153
+ progress(0.5, desc="Processing with RaKUn...")
154
+
155
+ # Initialize RaKUn
156
+ hyperparameters = {
157
+ "distance_threshold": 3,
158
+ "num_keywords": num_keywords,
159
+ "pair_diff_length": 2,
160
+ "stopwords": "english",
161
+ "bigram_count_threshold": 2,
162
+ "num_tokens": [1, 2, 3]
163
+ }
164
+
165
+ keyword_detector = RakunDetector(hyperparameters)
166
+
167
+ if progress:
168
+ progress(0.7, desc="Extracting keywords...")
169
+
170
+ keywords = keyword_detector.find_keywords(text)
171
+
172
+ # Format results
173
+ results = []
174
+ for keyword_data in keywords[:num_keywords]:
175
+ if isinstance(keyword_data, tuple):
176
+ keyword, score = keyword_data
177
+ else:
178
+ # If no score available, assign based on rank
179
+ keyword = keyword_data
180
+ score = 1.0 / (keywords.index(keyword_data) + 1)
181
+
182
+ results.append({
183
+ 'keyword': keyword,
184
+ 'score': score,
185
+ 'model': 'RaKUn'
186
+ })
187
+
188
+ return results
189
+
190
+ except ImportError:
191
+ print("RaKUn library not found. Using fallback keyword extraction...")
192
  return self.fallback_keyword_extraction(text, num_keywords)
193
 
194
  def fallback_keyword_extraction(self, text, num_keywords=10):
 
390
  summary = f"""
391
  ## 📊 Analysis Summary
392
  - **Keywords extracted:** {len(keywords)}
393
+ - **Model used:** {selected_model.replace('yake_', '').replace('keybert_', 'KeyBERT-').replace('rakun_', '').title()}
394
  - **Average relevance score:** {avg_score:.4f}
395
  - **N-gram range:** {ngram_min}-{ngram_max} words
396
  """
 
405
  gr.Markdown("""
406
  # Keyword Extraction Explorer Tool
407
 
408
+ Extract the most important keywords and phrases from your text using various algorithms! This tool uses modern keyword extraction methods including YAKE, KeyBERT, and RaKUn for comprehensive analysis.
409
 
410
  ### How to use:
411
  1. **📝 Enter your text** in the text area below
 
433
  with gr.Column(scale=1):
434
  # Model selector
435
  model_dropdown = gr.Dropdown(
436
+ choices=list(KEYWORD_MODELS.keys()),
437
+ value='yake_yake',
438
  label="🎯 Select Keyword Extraction Model"
439
  )
440
 
 
462
  step=1,
463
  label="Max N-gram"
464
  )
465
+
466
+ # Add N-gram tip box
467
+ gr.HTML("""
468
+ <div style="background-color: #e3f2fd; border: 1px solid #90caf9; border-radius: 8px; padding: 10px; margin: 10px 0;">
469
+ <strong style="color: #1565c0;">💡 N-gram Guide:</strong> N-grams are sequences of words. Set Min=1, Max=3 to extract single words, phrases of 2 words, and phrases of 3 words. Higher values capture longer phrases but may reduce precision.
470
+ </div>
471
+ """)
472
 
473
  # Add model descriptions
474
  gr.HTML("""
 
479
  <div style="margin-top: 10px; padding: 10px;">
480
  <dl style="margin: 0; font-size: 14px;">
481
  <div style="margin-bottom: 8px;">
482
+ <dt style="font-weight: bold; display: inline; color: #FF6B6B;">YAKE:</dt>
483
+ <dd style="display: inline; margin-left: 5px;">Statistical approach requiring no training - works well on short texts and multilingual content</dd>
484
  </div>
485
  <div style="margin-bottom: 8px;">
486
+ <dt style="font-weight: bold; display: inline; color: #9C27B0;">KeyBERT MPNet:</dt>
487
+ <dd style="display: inline; margin-left: 5px;">BERT-based semantic similarity - excellent for contextual understanding</dd>
488
  </div>
489
  <div style="margin-bottom: 8px;">
490
+ <dt style="font-weight: bold; display: inline; color: #795548;">KeyBERT MiniLM:</dt>
491
+ <dd style="display: inline; margin-left: 5px;">Lightweight BERT model - faster processing with good results</dd>
492
  </div>
493
  <div style="margin-bottom: 8px;">
494
+ <dt style="font-weight: bold; display: inline; color: #607D8B;">KeyBERT Paraphrase:</dt>
495
+ <dd style="display: inline; margin-left: 5px;">Optimized for paraphrase detection - great for similar concept extraction</dd>
496
  </div>
497
  <div style="margin-bottom: 8px;">
498
+ <dt style="font-weight: bold; display: inline; color: #FF5722;">RaKUn:</dt>
499
+ <dd style="display: inline; margin-left: 5px;">Graph-based rapid extraction - efficient for large texts</dd>
 
 
 
 
500
  </div>
501
  </dl>
502
  </div>
 
535
  examples=[
536
  [
537
  "On June 6, 1944, Allied forces launched Operation Overlord, the invasion of Normandy. General Dwight D. Eisenhower commanded the operation, while Field Marshal Bernard Montgomery led ground forces. The BBC broadcast coded messages to the French Resistance, including the famous line 'The long sobs of autumn violins.'",
538
+ "yake_yake",
539
  10,
540
  1,
541
  3
542
  ],
543
  [
544
  "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
545
+ "keybert_all-mpnet-base-v2",
546
  10,
547
  1,
548
  3
549
  ],
550
  [
551
  "Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
552
+ "keybert_all-MiniLM-L6-v2",
553
  10,
554
  1,
555
  3
 
571
  <h4 style="margin-top: 0;">📚 Model Information & Documentation</h4>
572
  <p style="font-size: 14px; margin-bottom: 15px;">Learn more about the algorithms used in this tool:</p>
573
  <ul style="font-size: 14px; line-height: 1.8;">
574
+ <li><strong>YAKE:</strong>
575
+ <a href="https://github.com/LIAAD/yake" target="_blank" style="color: #1976d2;">
576
+ Yet Another Keyword Extractor
577
+ </a>
578
+ </li>
579
+ <li><strong>KeyBERT:</strong>
580
+ <a href="https://github.com/MaartenGr/KeyBERT" target="_blank" style="color: #1976d2;">
581
+ Minimal keyword extraction with BERT ↗
582
  </a>
583
  </li>
584
+ <li><strong>RaKUn:</strong>
585
+ <a href="https://github.com/SkBlaz/rakun" target="_blank" style="color: #1976d2;">
586
+ Rapid Automatic Keyword Extraction
587
+ </a>
588
+ </li>
589
+ <li><strong>Sentence Transformers:</strong>
590
+ <a href="https://www.sbert.net/" target="_blank" style="color: #1976d2;">
591
+ BERT-based models for semantic similarity ↗
592
  </a>
593
  </li>
594
  </ul>
 
612
 
613
  if __name__ == "__main__":
614
  demo = create_interface()
615
+ demo.launch()