Spaces:

SorrelC
/

KeywordExtraction-Explorer-Tool

Running

App Files Files Community

SorrelC committed on Jul 16, 2025

Commit

4b1a89e

verified ·

1 Parent(s): b4121fe

Update app.py

Browse files

Files changed (1) hide show

app.py +175 -96

app.py CHANGED Viewed

@@ -6,14 +6,13 @@ import re
 import time
 warnings.filterwarnings('ignore')
-# PKE model names and descriptions
-PKE_MODELS = {
-    'kw_pke_multipartiterank': 'MultipartiteRank - Graph-based ranking using topic clustering',
-    'kw_pke_singlerank': 'SingleRank - Graph-based ranking algorithm',
-    'kw_pke_tfidf': 'TF-IDF - Term Frequency-Inverse Document Frequency',
-    'kw_pke_topicrank': 'TopicRank - Graph-based with topic clustering',
-    'kw_pke_textrank': 'TextRank - Graph-based ranking algorithm',
-    'kw_pke_positionrank': 'PositionRank - Incorporates word positions'
 }
 # Color palette for keywords based on scores
@@ -32,78 +31,103 @@ KEYWORD_COLORS = [
 class KeywordExtractionManager:
     def __init__(self):
-        self.pke_models = {}
-        self.spacy_model = None
-    def load_spacy_model(self):
-        """Load spaCy model for preprocessing"""
-        if self.spacy_model is None:
             try:
-                import spacy
-                try:
-                    self.spacy_model = spacy.load("en_core_web_sm")
-                    print("✓ spaCy model loaded successfully")
-                except OSError:
-                    print("spaCy model not found. Please install with: python -m spacy download en_core_web_sm")
-                    return None
             except Exception as e:
-                print(f"Error loading spaCy model: {str(e)}")
                 return None
-        return self.spacy_model
     def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
-        """Extract keywords using the specified PKE model"""
         try:
-            import pke
             if progress:
                 progress(0.3, desc="Loading model...")
-            # Initialize the extractor based on model name
-            if 'multipartiterank' in model_name:
-                extractor = pke.unsupervised.MultipartiteRank()
-            elif 'singlerank' in model_name:
-                extractor = pke.unsupervised.SingleRank()
-            elif 'tfidf' in model_name:
-                extractor = pke.unsupervised.TfIdf()
-            elif 'topicrank' in model_name:
-                extractor = pke.unsupervised.TopicRank()
-            elif 'textrank' in model_name:
-                extractor = pke.unsupervised.TextRank()
-            elif 'positionrank' in model_name:
-                extractor = pke.unsupervised.PositionRank()
             else:
                 raise ValueError(f"Unknown model: {model_name}")
             if progress:
-                progress(0.5, desc="Processing text...")
-            # Load the text
-            extractor.load_document(input=text, language='en')
-            # Select candidates based on model
-            if 'multipartiterank' in model_name:
-                extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
-                extractor.candidate_weighting(alpha=1.1, threshold=0.75, method='average')
-            elif 'topicrank' in model_name:
-                extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
-                extractor.candidate_weighting(threshold=0.74, method='average')
-            elif 'positionrank' in model_name:
-                extractor.candidate_selection(maximum_word_number=3)
-                extractor.candidate_weighting(window=10)
-            elif 'tfidf' in model_name:
-                extractor.candidate_selection(n=ngram_range[1], stoplist=['en'])
-                extractor.candidate_weighting()
-            else:
-                # SingleRank and TextRank
-                extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
-                extractor.candidate_weighting(window=10)
             if progress:
                 progress(0.7, desc="Extracting keywords...")
-            # Get keywords
-            keywords = extractor.get_n_best(n=num_keywords)
             # Format results
             results = []
@@ -111,16 +135,60 @@ class KeywordExtractionManager:
                 results.append({
                     'keyword': keyword,
                     'score': score,
-                    'model': model_name.replace('kw_pke_', '').title()
                 })
             return results
         except ImportError:
-            print("PKE library not found. Using fallback keyword extraction...")
             return self.fallback_keyword_extraction(text, num_keywords)
-        except Exception as e:
-            print(f"Error with {model_name}: {str(e)}")
             return self.fallback_keyword_extraction(text, num_keywords)
     def fallback_keyword_extraction(self, text, num_keywords=10):
@@ -322,7 +390,7 @@ def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progr
     summary = f"""
     ## 📊 Analysis Summary
     - **Keywords extracted:** {len(keywords)}
-    - **Model used:** {selected_model.replace('kw_pke_', '').title()}
     - **Average relevance score:** {avg_score:.4f}
     - **N-gram range:** {ngram_min}-{ngram_max} words
     """
@@ -337,7 +405,7 @@ def create_interface():
         gr.Markdown("""
         # Keyword Extraction Explorer Tool
-        Extract the most important keywords and phrases from your text using various algorithms! This tool uses PKE (Python Keyphrase Extraction) models for comprehensive keyword extraction.
         ### How to use:
         1. **📝 Enter your text** in the text area below
@@ -365,8 +433,8 @@ def create_interface():
             with gr.Column(scale=1):
                 # Model selector
                 model_dropdown = gr.Dropdown(
-                    choices=list(PKE_MODELS.keys()),
-                    value='kw_pke_multipartiterank',
                     label="🎯 Select Keyword Extraction Model"
                 )
@@ -394,6 +462,13 @@ def create_interface():
                         step=1,
                         label="Max N-gram"
                     )
         # Add model descriptions
         gr.HTML("""
@@ -404,28 +479,24 @@ def create_interface():
             <div style="margin-top: 10px; padding: 10px;">
                 <dl style="margin: 0; font-size: 14px;">
                     <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #4ECDC4;">MultipartiteRank:</dt>
-                        <dd style="display: inline; margin-left: 5px;">Graph-based ranking using topic clustering - excellent for diverse texts</dd>
                     </div>
                     <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #45B7D1;">SingleRank:</dt>
-                        <dd style="display: inline; margin-left: 5px;">Simple graph-based algorithm - fast and effective</dd>
                     </div>
                     <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #F9CA24;">TF-IDF:</dt>
-                        <dd style="display: inline; margin-left: 5px;">Statistical approach - good for technical texts</dd>
                     </div>
                     <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #6C5CE7;">TopicRank:</dt>
-                        <dd style="display: inline; margin-left: 5px;">Groups similar candidates - reduces redundancy</dd>
                     </div>
                     <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #00B894;">TextRank:</dt>
-                        <dd style="display: inline; margin-left: 5px;">Classic PageRank-inspired algorithm</dd>
-                    </div>
-                    <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #E17055;">PositionRank:</dt>
-                        <dd style="display: inline; margin-left: 5px;">Incorporates word positions - good for structured documents</dd>
                     </div>
                 </dl>
             </div>
@@ -464,21 +535,21 @@ def create_interface():
             examples=[
                 [
                     "On June 6, 1944, Allied forces launched Operation Overlord, the invasion of Normandy. General Dwight D. Eisenhower commanded the operation, while Field Marshal Bernard Montgomery led ground forces. The BBC broadcast coded messages to the French Resistance, including the famous line 'The long sobs of autumn violins.'",
-                    "kw_pke_multipartiterank",
                     10,
                     1,
                     3
                 ],
                 [
                     "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
-                    "kw_pke_topicrank",
                     10,
                     1,
                     3
                 ],
                 [
                     "Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
-                    "kw_pke_textrank",
                     10,
                     1,
                     3
@@ -500,14 +571,24 @@ def create_interface():
             <h4 style="margin-top: 0;">📚 Model Information & Documentation</h4>
             <p style="font-size: 14px; margin-bottom: 15px;">Learn more about the algorithms used in this tool:</p>
             <ul style="font-size: 14px; line-height: 1.8;">
-                <li><strong>PKE Library:</strong>
-                    <a href="https://github.com/boudinfl/pke" target="_blank" style="color: #1976d2;">
-                        Python Keyphrase Extraction (PKE) GitHub ↗
                     </a>
                 </li>
-                <li><strong>Algorithm Papers:</strong>
-                    <a href="https://boudinfl.github.io/pke/" target="_blank" style="color: #1976d2;">
-                        PKE Documentation & References ↗
                     </a>
                 </li>
             </ul>
@@ -531,6 +612,4 @@ def create_interface():
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch()

 import time
 warnings.filterwarnings('ignore')
+# Reliable model names and descriptions (PKE removed for compatibility)
+KEYWORD_MODELS = {
+    'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical)',
+    'keybert_all-mpnet-base-v2': 'KeyBERT MPNet - BERT-based semantic similarity',
+    'keybert_all-MiniLM-L6-v2': 'KeyBERT MiniLM - Lightweight BERT-based extraction',
+    'keybert_paraphrase-mpnet-base-v2': 'KeyBERT Paraphrase - Optimized for paraphrase detection',
+    'rakun_rakun': 'RaKUn - Rapid Automatic Keyword Extraction'
 }
 # Color palette for keywords based on scores
 class KeywordExtractionManager:
     def __init__(self):
+        self.keybert_models = {}
+    def load_keybert_model(self, model_name):
+        """Load KeyBERT model"""
+        if model_name not in self.keybert_models:
             try:
+                from keybert import KeyBERT
+                # Extract the actual model name from the identifier
+                actual_model = model_name.replace('keybert_', '')
+                self.keybert_models[model_name] = KeyBERT(model=actual_model)
+                print(f"✓ KeyBERT model {actual_model} loaded successfully")
             except Exception as e:
+                print(f"Error loading KeyBERT model {model_name}: {str(e)}")
                 return None
+        return self.keybert_models[model_name]
     def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
+        """Extract keywords using the specified model"""
         try:
             if progress:
                 progress(0.3, desc="Loading model...")
+            # Handle different model types
+            if model_name.startswith('yake_'):
+                return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
+            elif model_name.startswith('keybert_'):
+                return self.extract_keybert_keywords(text, model_name, num_keywords, ngram_range, progress)
+            elif model_name.startswith('rakun_'):
+                return self.extract_rakun_keywords(text, num_keywords, progress)
             else:
                 raise ValueError(f"Unknown model: {model_name}")
+        except Exception as e:
+            print(f"Error with {model_name}: {str(e)}")
+            return self.fallback_keyword_extraction(text, num_keywords)
+    def extract_yake_keywords(self, text, num_keywords, ngram_range, progress):
+        """Extract keywords using YAKE"""
+        try:
+            import yake
             if progress:
+                progress(0.5, desc="Processing with YAKE...")
+            # Configure YAKE
+            kw_extractor = yake.KeywordExtractor(
+                lan="en",
+                n=ngram_range[1],
+                dedupLim=0.7,
+                top=num_keywords
+            )
             if progress:
                 progress(0.7, desc="Extracting keywords...")
+            keywords = kw_extractor.extract_keywords(text)
+            # Format results (YAKE returns lower scores for better keywords)
+            results = []
+            for keyword, score in keywords:
+                # Invert score for consistency (higher = better)
+                inverted_score = 1.0 / (1.0 + score)
+                results.append({
+                    'keyword': keyword,
+                    'score': inverted_score,
+                    'model': 'YAKE'
+                })
+            return results
+        except ImportError:
+            print("YAKE library not found. Using fallback keyword extraction...")
+            return self.fallback_keyword_extraction(text, num_keywords)
+    def extract_keybert_keywords(self, text, model_name, num_keywords, ngram_range, progress):
+        """Extract keywords using KeyBERT"""
+        try:
+            if progress:
+                progress(0.4, desc="Loading KeyBERT model...")
+            kw_model = self.load_keybert_model(model_name)
+            if kw_model is None:
+                return self.fallback_keyword_extraction(text, num_keywords)
+            if progress:
+                progress(0.6, desc="Processing with KeyBERT...")
+            # Extract keywords
+            keywords = kw_model.extract_keywords(
+                text,
+                keyphrase_ngram_range=ngram_range,
+                stop_words='english',
+                top_k=num_keywords
+            )
+            if progress:
+                progress(0.8, desc="Formatting results...")
             # Format results
             results = []
                 results.append({
                     'keyword': keyword,
                     'score': score,
+                    'model': f"KeyBERT-{model_name.replace('keybert_', '')}"
                 })
             return results
         except ImportError:
+            print("KeyBERT library not found. Using fallback keyword extraction...")
             return self.fallback_keyword_extraction(text, num_keywords)
+    def extract_rakun_keywords(self, text, num_keywords, progress):
+        """Extract keywords using RaKUn"""
+        try:
+            from rakun import RakunDetector
+            if progress:
+                progress(0.5, desc="Processing with RaKUn...")
+            # Initialize RaKUn
+            hyperparameters = {
+                "distance_threshold": 3,
+                "num_keywords": num_keywords,
+                "pair_diff_length": 2,
+                "stopwords": "english",
+                "bigram_count_threshold": 2,
+                "num_tokens": [1, 2, 3]
+            }
+            keyword_detector = RakunDetector(hyperparameters)
+            if progress:
+                progress(0.7, desc="Extracting keywords...")
+            keywords = keyword_detector.find_keywords(text)
+            # Format results
+            results = []
+            for keyword_data in keywords[:num_keywords]:
+                if isinstance(keyword_data, tuple):
+                    keyword, score = keyword_data
+                else:
+                    # If no score available, assign based on rank
+                    keyword = keyword_data
+                    score = 1.0 / (keywords.index(keyword_data) + 1)
+                results.append({
+                    'keyword': keyword,
+                    'score': score,
+                    'model': 'RaKUn'
+                })
+            return results
+        except ImportError:
+            print("RaKUn library not found. Using fallback keyword extraction...")
             return self.fallback_keyword_extraction(text, num_keywords)
     def fallback_keyword_extraction(self, text, num_keywords=10):
     summary = f"""
     ## 📊 Analysis Summary
     - **Keywords extracted:** {len(keywords)}
+    - **Model used:** {selected_model.replace('yake_', '').replace('keybert_', 'KeyBERT-').replace('rakun_', '').title()}
     - **Average relevance score:** {avg_score:.4f}
     - **N-gram range:** {ngram_min}-{ngram_max} words
     """
         gr.Markdown("""
         # Keyword Extraction Explorer Tool
+        Extract the most important keywords and phrases from your text using various algorithms! This tool uses modern keyword extraction methods including YAKE, KeyBERT, and RaKUn for comprehensive analysis.
         ### How to use:
         1. **📝 Enter your text** in the text area below
             with gr.Column(scale=1):
                 # Model selector
                 model_dropdown = gr.Dropdown(
+                    choices=list(KEYWORD_MODELS.keys()),
+                    value='yake_yake',
                     label="🎯 Select Keyword Extraction Model"
                 )
                         step=1,
                         label="Max N-gram"
                     )
+                # Add N-gram tip box
+                gr.HTML("""
+                <div style="background-color: #e3f2fd; border: 1px solid #90caf9; border-radius: 8px; padding: 10px; margin: 10px 0;">
+                    <strong style="color: #1565c0;">💡 N-gram Guide:</strong> N-grams are sequences of words. Set Min=1, Max=3 to extract single words, phrases of 2 words, and phrases of 3 words. Higher values capture longer phrases but may reduce precision.
+                </div>
+                """)
         # Add model descriptions
         gr.HTML("""
             <div style="margin-top: 10px; padding: 10px;">
                 <dl style="margin: 0; font-size: 14px;">
                     <div style="margin-bottom: 8px;">
+                        <dt style="font-weight: bold; display: inline; color: #FF6B6B;">YAKE:</dt>
+                        <dd style="display: inline; margin-left: 5px;">Statistical approach requiring no training - works well on short texts and multilingual content</dd>
                     </div>
                     <div style="margin-bottom: 8px;">
+                        <dt style="font-weight: bold; display: inline; color: #9C27B0;">KeyBERT MPNet:</dt>
+                        <dd style="display: inline; margin-left: 5px;">BERT-based semantic similarity - excellent for contextual understanding</dd>
                     </div>
                     <div style="margin-bottom: 8px;">
+                        <dt style="font-weight: bold; display: inline; color: #795548;">KeyBERT MiniLM:</dt>
+                        <dd style="display: inline; margin-left: 5px;">Lightweight BERT model - faster processing with good results</dd>
                     </div>
                     <div style="margin-bottom: 8px;">
+                        <dt style="font-weight: bold; display: inline; color: #607D8B;">KeyBERT Paraphrase:</dt>
+                        <dd style="display: inline; margin-left: 5px;">Optimized for paraphrase detection - great for similar concept extraction</dd>
                     </div>
                     <div style="margin-bottom: 8px;">
+                        <dt style="font-weight: bold; display: inline; color: #FF5722;">RaKUn:</dt>
+                        <dd style="display: inline; margin-left: 5px;">Graph-based rapid extraction - efficient for large texts</dd>
                     </div>
                 </dl>
             </div>
             examples=[
                 [
                     "On June 6, 1944, Allied forces launched Operation Overlord, the invasion of Normandy. General Dwight D. Eisenhower commanded the operation, while Field Marshal Bernard Montgomery led ground forces. The BBC broadcast coded messages to the French Resistance, including the famous line 'The long sobs of autumn violins.'",
+                    "yake_yake",
                     10,
                     1,
                     3
                 ],
                 [
                     "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
+                    "keybert_all-mpnet-base-v2",
                     10,
                     1,
                     3
                 ],
                 [
                     "Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
+                    "keybert_all-MiniLM-L6-v2",
                     10,
                     1,
                     3
             <h4 style="margin-top: 0;">📚 Model Information & Documentation</h4>
             <p style="font-size: 14px; margin-bottom: 15px;">Learn more about the algorithms used in this tool:</p>
             <ul style="font-size: 14px; line-height: 1.8;">
+                <li><strong>YAKE:</strong>
+                    <a href="https://github.com/LIAAD/yake" target="_blank" style="color: #1976d2;">
+                        Yet Another Keyword Extractor ↗
+                    </a>
+                </li>
+                <li><strong>KeyBERT:</strong>
+                    <a href="https://github.com/MaartenGr/KeyBERT" target="_blank" style="color: #1976d2;">
+                        Minimal keyword extraction with BERT ↗
                     </a>
                 </li>
+                <li><strong>RaKUn:</strong>
+                    <a href="https://github.com/SkBlaz/rakun" target="_blank" style="color: #1976d2;">
+                        Rapid Automatic Keyword Extraction ↗
+                    </a>
+                </li>
+                <li><strong>Sentence Transformers:</strong>
+                    <a href="https://www.sbert.net/" target="_blank" style="color: #1976d2;">
+                        BERT-based models for semantic similarity ↗
                     </a>
                 </li>
             </ul>
 if __name__ == "__main__":
     demo = create_interface()
+    demo.launch()