Update app.py
Browse files
app.py
CHANGED
|
@@ -6,13 +6,13 @@ import re
|
|
| 6 |
import time
|
| 7 |
warnings.filterwarnings('ignore')
|
| 8 |
|
| 9 |
-
# Reliable model names and descriptions
|
| 10 |
KEYWORD_MODELS = {
|
| 11 |
'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
|
| 12 |
'keybert_all-mpnet-base-v2': 'KeyBERT MPNet - BERT-based semantic similarity',
|
| 13 |
'keybert_all-MiniLM-L6-v2': 'KeyBERT MiniLM - Lightweight BERT-based extraction',
|
| 14 |
'keybert_paraphrase-mpnet-base-v2': 'KeyBERT Paraphrase - Optimized for paraphrase detection',
|
| 15 |
-
'
|
| 16 |
}
|
| 17 |
|
| 18 |
# Color palette for keywords based on scores
|
|
@@ -32,6 +32,7 @@ KEYWORD_COLORS = [
|
|
| 32 |
class KeywordExtractionManager:
|
| 33 |
def __init__(self):
|
| 34 |
self.keybert_models = {}
|
|
|
|
| 35 |
|
| 36 |
def load_keybert_model(self, model_name):
|
| 37 |
"""Load KeyBERT model"""
|
|
@@ -47,6 +48,29 @@ class KeywordExtractionManager:
|
|
| 47 |
return None
|
| 48 |
return self.keybert_models[model_name]
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
|
| 51 |
"""Extract keywords using the specified model"""
|
| 52 |
try:
|
|
@@ -58,8 +82,8 @@ class KeywordExtractionManager:
|
|
| 58 |
return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
|
| 59 |
elif model_name.startswith('keybert_'):
|
| 60 |
return self.extract_keybert_keywords(text, model_name, num_keywords, ngram_range, progress)
|
| 61 |
-
elif model_name.startswith('
|
| 62 |
-
return self.
|
| 63 |
else:
|
| 64 |
raise ValueError(f"Unknown model: {model_name}")
|
| 65 |
|
|
@@ -144,51 +168,36 @@ class KeywordExtractionManager:
|
|
| 144 |
print("KeyBERT library not found. Using fallback keyword extraction...")
|
| 145 |
return self.fallback_keyword_extraction(text, num_keywords)
|
| 146 |
|
| 147 |
-
def
|
| 148 |
-
"""Extract keywords using
|
| 149 |
try:
|
| 150 |
-
from rakun import RakunDetector
|
| 151 |
-
|
| 152 |
if progress:
|
| 153 |
-
progress(0.5, desc="Processing with
|
| 154 |
-
|
| 155 |
-
# Initialize RaKUn
|
| 156 |
-
hyperparameters = {
|
| 157 |
-
"distance_threshold": 3,
|
| 158 |
-
"num_keywords": num_keywords,
|
| 159 |
-
"pair_diff_length": 2,
|
| 160 |
-
"stopwords": "english",
|
| 161 |
-
"bigram_count_threshold": 2,
|
| 162 |
-
"num_tokens": [1, 2, 3]
|
| 163 |
-
}
|
| 164 |
|
| 165 |
-
|
|
|
|
|
|
|
| 166 |
|
| 167 |
if progress:
|
| 168 |
progress(0.7, desc="Extracting keywords...")
|
| 169 |
|
| 170 |
-
|
|
|
|
|
|
|
| 171 |
|
| 172 |
# Format results
|
| 173 |
results = []
|
| 174 |
-
for
|
| 175 |
-
if isinstance(keyword_data, tuple):
|
| 176 |
-
keyword, score = keyword_data
|
| 177 |
-
else:
|
| 178 |
-
# If no score available, assign based on rank
|
| 179 |
-
keyword = keyword_data
|
| 180 |
-
score = 1.0 / (keywords.index(keyword_data) + 1)
|
| 181 |
-
|
| 182 |
results.append({
|
| 183 |
'keyword': keyword,
|
| 184 |
'score': score,
|
| 185 |
-
'model': '
|
| 186 |
})
|
| 187 |
|
| 188 |
return results
|
| 189 |
|
| 190 |
except ImportError:
|
| 191 |
-
print("
|
| 192 |
return self.fallback_keyword_extraction(text, num_keywords)
|
| 193 |
|
| 194 |
def fallback_keyword_extraction(self, text, num_keywords=10):
|
|
@@ -387,10 +396,11 @@ def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progr
|
|
| 387 |
|
| 388 |
# Create summary
|
| 389 |
avg_score = sum(k['score'] for k in keywords) / len(keywords)
|
|
|
|
| 390 |
summary = f"""
|
| 391 |
## 📊 Analysis Summary
|
| 392 |
- **Keywords extracted:** {len(keywords)}
|
| 393 |
-
- **Model used:** {
|
| 394 |
- **Average relevance score:** {avg_score:.4f}
|
| 395 |
- **N-gram range:** {ngram_min}-{ngram_max} words
|
| 396 |
"""
|
|
@@ -405,7 +415,7 @@ def create_interface():
|
|
| 405 |
gr.Markdown("""
|
| 406 |
# Keyword Extraction Explorer Tool
|
| 407 |
|
| 408 |
-
Extract the most important keywords and phrases from your text using various algorithms! This tool uses modern keyword extraction methods including YAKE, KeyBERT, and
|
| 409 |
|
| 410 |
### How to use:
|
| 411 |
1. **📝 Enter your text** in the text area below
|
|
@@ -495,8 +505,8 @@ def create_interface():
|
|
| 495 |
<dd style="display: inline; margin-left: 5px;">Optimized for paraphrase detection - great for similar concept extraction</dd>
|
| 496 |
</div>
|
| 497 |
<div style="margin-bottom: 8px;">
|
| 498 |
-
<dt style="font-weight: bold; display: inline; color: #FF5722;">
|
| 499 |
-
<dd style="display: inline; margin-left: 5px;">
|
| 500 |
</div>
|
| 501 |
</dl>
|
| 502 |
</div>
|
|
@@ -549,7 +559,7 @@ def create_interface():
|
|
| 549 |
],
|
| 550 |
[
|
| 551 |
"Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
|
| 552 |
-
"
|
| 553 |
10,
|
| 554 |
1,
|
| 555 |
3
|
|
@@ -581,9 +591,9 @@ def create_interface():
|
|
| 581 |
Minimal keyword extraction with BERT ↗
|
| 582 |
</a>
|
| 583 |
</li>
|
| 584 |
-
<li><strong>
|
| 585 |
-
<a href="https://github.com/
|
| 586 |
-
Rapid Automatic Keyword Extraction ↗
|
| 587 |
</a>
|
| 588 |
</li>
|
| 589 |
<li><strong>Sentence Transformers:</strong>
|
|
|
|
| 6 |
import time
|
| 7 |
warnings.filterwarnings('ignore')
|
| 8 |
|
| 9 |
+
# Reliable model names and descriptions
|
| 10 |
KEYWORD_MODELS = {
|
| 11 |
'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
|
| 12 |
'keybert_all-mpnet-base-v2': 'KeyBERT MPNet - BERT-based semantic similarity',
|
| 13 |
'keybert_all-MiniLM-L6-v2': 'KeyBERT MiniLM - Lightweight BERT-based extraction',
|
| 14 |
'keybert_paraphrase-mpnet-base-v2': 'KeyBERT Paraphrase - Optimized for paraphrase detection',
|
| 15 |
+
'rake_nltk': 'RAKE-NLTK - Rapid Automatic Keyword Extraction'
|
| 16 |
}
|
| 17 |
|
| 18 |
# Color palette for keywords based on scores
|
|
|
|
| 32 |
class KeywordExtractionManager:
|
| 33 |
def __init__(self):
|
| 34 |
self.keybert_models = {}
|
| 35 |
+
self.rake_extractor = None
|
| 36 |
|
| 37 |
def load_keybert_model(self, model_name):
|
| 38 |
"""Load KeyBERT model"""
|
|
|
|
| 48 |
return None
|
| 49 |
return self.keybert_models[model_name]
|
| 50 |
|
| 51 |
+
def load_rake_extractor(self):
    """Return the shared RAKE extractor, creating it on first use.

    The extractor is cached on ``self.rake_extractor`` so the import and
    the NLTK data checks only run once per manager instance. A failed
    load is not cached, so a later call will retry.

    Returns:
        The cached ``rake_nltk.Rake`` instance, or ``None`` when the
        library or its NLTK data could not be loaded (the error is
        printed, never raised).
    """
    if self.rake_extractor is not None:
        return self.rake_extractor

    try:
        from rake_nltk import Rake
        import nltk

        # (lookup path, downloadable package) pairs that must be present
        # before RAKE can tokenize text. Newer NLTK releases load the
        # sentence tokenizer from 'punkt_tab' instead of the pickled
        # 'punkt' models, so both are ensured to stay version-agnostic.
        required_data = (
            ('corpora/stopwords', 'stopwords'),
            ('tokenizers/punkt', 'punkt'),
            ('tokenizers/punkt_tab', 'punkt_tab'),
        )
        for resource_path, package in required_data:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package, quiet=True)

        self.rake_extractor = Rake()
        print("✓ RAKE extractor loaded successfully")
    except Exception as e:
        # Deliberately broad: callers treat None as "use the fallback
        # extractor", so any load failure must degrade gracefully.
        print(f"Error loading RAKE extractor: {str(e)}")
        return None
    return self.rake_extractor
|
| 73 |
+
|
| 74 |
def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
|
| 75 |
"""Extract keywords using the specified model"""
|
| 76 |
try:
|
|
|
|
| 82 |
return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
|
| 83 |
elif model_name.startswith('keybert_'):
|
| 84 |
return self.extract_keybert_keywords(text, model_name, num_keywords, ngram_range, progress)
|
| 85 |
+
elif model_name.startswith('rake_'):
|
| 86 |
+
return self.extract_rake_keywords(text, num_keywords, progress)
|
| 87 |
else:
|
| 88 |
raise ValueError(f"Unknown model: {model_name}")
|
| 89 |
|
|
|
|
| 168 |
print("KeyBERT library not found. Using fallback keyword extraction...")
|
| 169 |
return self.fallback_keyword_extraction(text, num_keywords)
|
| 170 |
|
| 171 |
+
def extract_rake_keywords(self, text, num_keywords, progress):
    """Extract the top-ranked RAKE phrases from ``text``.

    Args:
        text: The document to analyse.
        num_keywords: Maximum number of distinct phrases to return.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            to report status; skipped when falsy.

    Returns:
        A list of ``{'keyword', 'score', 'model'}`` dicts in RAKE rank
        order (highest score first), each phrase appearing at most once.
        When RAKE or its NLTK data is unavailable, the result of
        ``self.fallback_keyword_extraction(text, num_keywords)`` instead.
    """
    try:
        if progress:
            progress(0.5, desc="Processing with RAKE...")

        rake_extractor = self.load_rake_extractor()
        if rake_extractor is None:
            return self.fallback_keyword_extraction(text, num_keywords)

        if progress:
            progress(0.7, desc="Extracting keywords...")

        # Extract keywords
        rake_extractor.extract_keywords_from_text(text)
        ranked_phrases = rake_extractor.get_ranked_phrases_with_scores()

        # Format results. RAKE may rank the same phrase more than once
        # when it recurs in the text, so keep only the first
        # (highest-ranked) occurrence instead of spending several of the
        # num_keywords slots on one phrase.
        results = []
        seen_phrases = set()
        for score, keyword in ranked_phrases:
            if len(results) >= num_keywords:
                break
            if keyword in seen_phrases:
                continue
            seen_phrases.add(keyword)
            results.append({
                'keyword': keyword,
                'score': score,
                'model': 'RAKE-NLTK'
            })

        return results

    except ImportError:
        print("RAKE-NLTK library not found. Using fallback keyword extraction...")
        return self.fallback_keyword_extraction(text, num_keywords)
    except LookupError as e:
        # NLTK raises LookupError when tokenizer/stopword data is missing
        # at extraction time; degrade to the fallback like a missing library.
        print(f"RAKE-NLTK data not available ({e}). Using fallback keyword extraction...")
        return self.fallback_keyword_extraction(text, num_keywords)
|
| 202 |
|
| 203 |
def fallback_keyword_extraction(self, text, num_keywords=10):
|
|
|
|
| 396 |
|
| 397 |
# Create summary
|
| 398 |
avg_score = sum(k['score'] for k in keywords) / len(keywords)
|
| 399 |
+
model_display = selected_model.replace('yake_', '').replace('keybert_', 'KeyBERT-').replace('rake_', 'RAKE-').title()
|
| 400 |
summary = f"""
|
| 401 |
## 📊 Analysis Summary
|
| 402 |
- **Keywords extracted:** {len(keywords)}
|
| 403 |
+
- **Model used:** {model_display}
|
| 404 |
- **Average relevance score:** {avg_score:.4f}
|
| 405 |
- **N-gram range:** {ngram_min}-{ngram_max} words
|
| 406 |
"""
|
|
|
|
| 415 |
gr.Markdown("""
|
| 416 |
# Keyword Extraction Explorer Tool
|
| 417 |
|
| 418 |
+
Extract the most important keywords and phrases from your text using various algorithms! This tool uses modern keyword extraction methods including YAKE, KeyBERT, and RAKE for comprehensive analysis.
|
| 419 |
|
| 420 |
### How to use:
|
| 421 |
1. **📝 Enter your text** in the text area below
|
|
|
|
| 505 |
<dd style="display: inline; margin-left: 5px;">Optimized for paraphrase detection - great for similar concept extraction</dd>
|
| 506 |
</div>
|
| 507 |
<div style="margin-bottom: 8px;">
|
| 508 |
+
<dt style="font-weight: bold; display: inline; color: #FF5722;">RAKE-NLTK:</dt>
|
| 509 |
+
<dd style="display: inline; margin-left: 5px;">Classic keyword extraction algorithm - fast and reliable for phrase extraction</dd>
|
| 510 |
</div>
|
| 511 |
</dl>
|
| 512 |
</div>
|
|
|
|
| 559 |
],
|
| 560 |
[
|
| 561 |
"Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
|
| 562 |
+
"rake_nltk",
|
| 563 |
10,
|
| 564 |
1,
|
| 565 |
3
|
|
|
|
| 591 |
Minimal keyword extraction with BERT ↗
|
| 592 |
</a>
|
| 593 |
</li>
|
| 594 |
+
<li><strong>RAKE-NLTK:</strong>
|
| 595 |
+
<a href="https://github.com/csurfer/rake-nltk" target="_blank" style="color: #1976d2;">
|
| 596 |
+
Rapid Automatic Keyword Extraction with NLTK ↗
|
| 597 |
</a>
|
| 598 |
</li>
|
| 599 |
<li><strong>Sentence Transformers:</strong>
|