Spaces:

SorrelC
/

KeywordExtraction-Explorer-Tool

Sleeping

App Files Files Community

SorrelC committed on Jul 16, 2025

Commit

8a60436

verified ·

1 Parent(s): ed3049f

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -245

app.py CHANGED Viewed

@@ -6,19 +6,14 @@ import re
 import time
 warnings.filterwarnings('ignore')
-# Model names and descriptions
-KEYWORD_MODELS = {
-    'pke_multipartiterank': 'MultipartiteRank - Graph-based ranking using topic clustering',
-    'pke_singlerank': 'SingleRank - Graph-based ranking algorithm',
-    'pke_tfidf': 'TF-IDF - Term Frequency-Inverse Document Frequency',
-    'pke_topicrank': 'TopicRank - Graph-based with topic clustering',
-    'pke_textrank': 'TextRank - Graph-based ranking algorithm',
-    'pke_positionrank': 'PositionRank - Incorporates word positions',
-    'yake_yake': 'YAKE - Yet Another Keyword Extractor (statistical approach)',
-    'keybert_all-mpnet-base-v2': 'KeyBERT - BERT-based with all-mpnet-base-v2 embeddings',
-    'keybert_all-MiniLM-L6-v2': 'KeyBERT - BERT-based with all-MiniLM-L6-v2 embeddings',
-    'keybert_paraphrase-mpnet-base-v2': 'KeyBERT - BERT-based with paraphrase embeddings',
-    'rakun_rakun': 'RaKUn - Graph-based unsupervised keyword extraction'
 }
 # Color palette for keywords based on scores
@@ -39,9 +34,6 @@ class KeywordExtractionManager:
     def __init__(self):
         self.pke_models = {}
         self.spacy_model = None
-        self.yake_model = None
-        self.keybert_models = {}
-        self.rakun_model = None
     def load_spacy_model(self):
         """Load spaCy model for preprocessing"""
@@ -60,48 +52,28 @@ class KeywordExtractionManager:
         return self.spacy_model
     def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
-        """Extract keywords using the specified model"""
         try:
             if progress:
                 progress(0.3, desc="Loading model...")
-            # Route to appropriate extraction method based on model type
-            if 'pke_' in model_name:
-                return self.extract_pke_keywords(text, model_name, num_keywords, ngram_range, progress)
-            elif 'yake_' in model_name:
-                return self.extract_yake_keywords(text, num_keywords, ngram_range, progress)
-            elif 'keybert_' in model_name:
-                return self.extract_keybert_keywords(text, model_name, num_keywords, ngram_range, progress)
-            elif 'rakun_' in model_name:
-                return self.extract_rakun_keywords(text, num_keywords, progress)
-            else:
-                raise ValueError(f"Unknown model: {model_name}")
-        except Exception as e:
-            print(f"Error with {model_name}: {str(e)}")
-            return self.fallback_keyword_extraction(text, num_keywords)
-    def extract_pke_keywords(self, text, model_name, num_keywords, ngram_range, progress):
-        """Extract keywords using PKE models"""
-        try:
-            import pke
             # Initialize the extractor based on model name
-            model_type = model_name.replace('pke_', '')
-            if 'multipartiterank' in model_type:
                 extractor = pke.unsupervised.MultipartiteRank()
-            elif 'singlerank' in model_type:
                 extractor = pke.unsupervised.SingleRank()
-            elif 'tfidf' in model_type:
                 extractor = pke.unsupervised.TfIdf()
-            elif 'topicrank' in model_type:
                 extractor = pke.unsupervised.TopicRank()
-            elif 'textrank' in model_type:
                 extractor = pke.unsupervised.TextRank()
-            elif 'positionrank' in model_type:
                 extractor = pke.unsupervised.PositionRank()
             else:
-                raise ValueError(f"Unknown PKE model: {model_type}")
             if progress:
                 progress(0.5, desc="Processing text...")
@@ -110,16 +82,16 @@ class KeywordExtractionManager:
             extractor.load_document(input=text, language='en')
             # Select candidates based on model
-            if 'multipartiterank' in model_type:
                 extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
                 extractor.candidate_weighting(alpha=1.1, threshold=0.75, method='average')
-            elif 'topicrank' in model_type:
                 extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
                 extractor.candidate_weighting(threshold=0.74, method='average')
-            elif 'positionrank' in model_type:
                 extractor.candidate_selection(maximum_word_number=3)
                 extractor.candidate_weighting(window=10)
-            elif 'tfidf' in model_type:
                 extractor.candidate_selection(n=ngram_range[1], stoplist=['en'])
                 extractor.candidate_weighting()
             else:
@@ -139,7 +111,7 @@ class KeywordExtractionManager:
                 results.append({
                     'keyword': keyword,
                     'score': score,
-                    'model': model_type.title()
                 })
             return results
@@ -147,142 +119,8 @@ class KeywordExtractionManager:
         except ImportError:
             print("PKE library not found. Using fallback keyword extraction...")
             return self.fallback_keyword_extraction(text, num_keywords)
-    def extract_yake_keywords(self, text, num_keywords, ngram_range, progress):
-        """Extract keywords using YAKE"""
-        try:
-            import yake
-            if progress:
-                progress(0.5, desc="Processing with YAKE...")
-            # Initialize YAKE
-            kw_extractor = yake.KeywordExtractor(
-                lan="en",
-                n=ngram_range[1],
-                dedupLim=0.9,
-                dedupFunc='seqm',
-                windowsSize=1,
-                top=num_keywords
-            )
-            # Extract keywords
-            keywords = kw_extractor.extract_keywords(text)
-            if progress:
-                progress(0.7, desc="Formatting results...")
-            # Format results (YAKE returns lower scores for better keywords, so we invert)
-            results = []
-            max_score = max([score for _, score in keywords]) if keywords else 1
-            for keyword, score in keywords:
-                # Invert and normalize score
-                normalized_score = (max_score - score) / max_score if max_score > 0 else 0
-                results.append({
-                    'keyword': keyword,
-                    'score': normalized_score,
-                    'model': 'YAKE'
-                })
-            return results
-        except ImportError:
-            print("YAKE not found. Please install with: pip install yake")
-            return self.fallback_keyword_extraction(text, num_keywords)
-    def extract_keybert_keywords(self, text, model_name, num_keywords, ngram_range, progress):
-        """Extract keywords using KeyBERT"""
-        try:
-            from keybert import KeyBERT
-            if progress:
-                progress(0.5, desc="Loading KeyBERT model...")
-            # Get the embedding model name
-            embedding_model = model_name.replace('keybert_', 'sentence-transformers/')
-            # Initialize or retrieve KeyBERT model
-            if model_name not in self.keybert_models:
-                self.keybert_models[model_name] = KeyBERT(embedding_model)
-                print(f"✓ KeyBERT model {embedding_model} loaded successfully")
-            kw_model = self.keybert_models[model_name]
-            if progress:
-                progress(0.6, desc="Extracting keywords with KeyBERT...")
-            # Extract keywords
-            keywords = kw_model.extract_keywords(
-                text,
-                keyphrase_ngram_range=ngram_range,
-                stop_words='english',
-                top_n=num_keywords,
-                use_mmr=True,
-                diversity=0.5
-            )
-            if progress:
-                progress(0.7, desc="Formatting results...")
-            # Format results
-            results = []
-            for keyword, score in keywords:
-                results.append({
-                    'keyword': keyword,
-                    'score': score,
-                    'model': f'KeyBERT-{embedding_model.split("/")[-1]}'
-                })
-            return results
-        except ImportError:
-            print("KeyBERT not found. Please install with: pip install keybert")
-            return self.fallback_keyword_extraction(text, num_keywords)
-    def extract_rakun_keywords(self, text, num_keywords, progress):
-        """Extract keywords using RaKUn"""
-        try:
-            from mrakun import RakunDetector
-            if progress:
-                progress(0.5, desc="Processing with RaKUn...")
-            # Initialize RaKUn
-            hyperparameters = {
-                "distance_threshold": 2,
-                "distance_method": "editdistance",
-                "num_keywords": num_keywords,
-                "pair_diff_length": 2,
-                "stopwords": "english",
-                "bigram_count_threshold": 2,
-                "num_tokens": [1, 2, 3],
-                "max_similar": 3,
-                "max_occurrence": 3
-            }
-            keyword_detector = RakunDetector(hyperparameters)
-            # Extract keywords
-            keywords = keyword_detector.find_keywords(text)
-            if progress:
-                progress(0.7, desc="Formatting results...")
-            # Format results
-            results = []
-            # RaKUn returns tuples of (keyword, score)
-            for keyword, score in keywords:
-                results.append({
-                    'keyword': keyword,
-                    'score': score,
-                    'model': 'RaKUn'
-                })
-            return results
-        except ImportError:
-            print("RaKUn not found. Please install with: pip install mrakun")
             return self.fallback_keyword_extraction(text, num_keywords)
     def fallback_keyword_extraction(self, text, num_keywords=10):
@@ -484,7 +322,7 @@ def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progr
     summary = f"""
     ## 📊 Analysis Summary
     - **Keywords extracted:** {len(keywords)}
-    - **Model used:** {selected_model.replace('pke_', '').replace('yake_', '').replace('keybert_', '').replace('rakun_', '').replace('_', ' ').title()}
     - **Average relevance score:** {avg_score:.4f}
     - **N-gram range:** {ngram_min}-{ngram_max} words
     """
@@ -499,12 +337,12 @@ def create_interface():
         gr.Markdown("""
         # Keyword Extraction Explorer Tool
-        Extract the most important keywords and phrases from your text using various algorithms! This tool uses multiple state-of-the-art keyword extraction models for comprehensive analysis.
         ### How to use:
         1. **📝 Enter your text** in the text area below
         2. **🎯 Select a model** from the dropdown for keyword extraction
-        3. **⚙️ Adjust parameters** (number of keywords, n-gram range)
         4. **🔍 Click "Extract Keywords"** to see results with organized output
         """)
@@ -527,8 +365,8 @@ def create_interface():
             with gr.Column(scale=1):
                 # Model selector
                 model_dropdown = gr.Dropdown(
-                    choices=list(KEYWORD_MODELS.keys()),
-                    value='pke_multipartiterank',
                     label="🎯 Select Keyword Extraction Model"
                 )
@@ -556,15 +394,6 @@ def create_interface():
                         step=1,
                         label="Max N-gram"
                     )
-                # Add n-gram tip box
-                gr.HTML("""
-                <div style="background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; padding: 10px; margin-top: 10px;">
-                    <strong style="color: #856404;">💡 What are n-grams?</strong> N-grams are sequences of words.
-                    For example: "1-gram" = single words (e.g., "science"), "2-gram" = two-word phrases (e.g., "data science"),
-                    "3-gram" = three-word phrases (e.g., "machine learning algorithm"). Adjust the sliders to control the length of extracted phrases.
-                </div>
-                """)
         # Add model descriptions
         gr.HTML("""
@@ -573,8 +402,7 @@ def create_interface():
                 ℹ️ Model Descriptions
             </summary>
             <div style="margin-top: 10px; padding: 10px;">
-                <h5 style="margin: 10px 0 5px 0; color: #333;">PKE-based Models:</h5>
-                <dl style="margin: 0 0 15px 0; font-size: 14px;">
                     <div style="margin-bottom: 8px;">
                         <dt style="font-weight: bold; display: inline; color: #4ECDC4;">MultipartiteRank:</dt>
                         <dd style="display: inline; margin-left: 5px;">Graph-based ranking using topic clustering - excellent for diverse texts</dd>
@@ -600,30 +428,6 @@ def create_interface():
                         <dd style="display: inline; margin-left: 5px;">Incorporates word positions - good for structured documents</dd>
                     </div>
                 </dl>
-                <h5 style="margin: 10px 0 5px 0; color: #333;">Other Models:</h5>
-                <dl style="margin: 0; font-size: 14px;">
-                    <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #FF9F43;">YAKE:</dt>
-                        <dd style="display: inline; margin-left: 5px;">Statistical approach using word features - language independent, no training needed</dd>
-                    </div>
-                    <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #10AC84;">KeyBERT (all-mpnet-base-v2):</dt>
-                        <dd style="display: inline; margin-left: 5px;">BERT-based extraction with high-quality sentence embeddings</dd>
-                    </div>
-                    <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #EE5A24;">KeyBERT (all-MiniLM-L6-v2):</dt>
-                        <dd style="display: inline; margin-left: 5px;">Lightweight BERT model - faster with good performance</dd>
-                    </div>
-                    <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #0FBC89;">KeyBERT (paraphrase-mpnet-base-v2):</dt>
-                        <dd style="display: inline; margin-left: 5px;">BERT model optimized for paraphrase detection</dd>
-                    </div>
-                    <div style="margin-bottom: 8px;">
-                        <dt style="font-weight: bold; display: inline; color: #5F27CD;">RaKUn:</dt>
-                        <dd style="display: inline; margin-left: 5px;">Graph-based method using word co-occurrences and edit distances</dd>
-                    </div>
-                </dl>
             </div>
         </details>
         """)
@@ -660,21 +464,21 @@ def create_interface():
             examples=[
                 [
                     "On June 6, 1944, Allied forces launched Operation Overlord, the invasion of Normandy. General Dwight D. Eisenhower commanded the operation, while Field Marshal Bernard Montgomery led ground forces. The BBC broadcast coded messages to the French Resistance, including the famous line 'The long sobs of autumn violins.'",
-                    "pke_multipartiterank",
                     10,
                     1,
                     3
                 ],
                 [
                     "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
-                    "keybert_all-MiniLM-L6-v2",
                     10,
                     1,
                     3
                 ],
                 [
                     "Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
-                    "yake_yake",
                     10,
                     1,
                     3
@@ -701,21 +505,6 @@ def create_interface():
                         Python Keyphrase Extraction (PKE) GitHub ↗
                     </a>
                 </li>
-                <li><strong>YAKE:</strong>
-                    <a href="https://github.com/LIAAD/yake" target="_blank" style="color: #1976d2;">
-                        Yet Another Keyword Extractor GitHub ↗
-                    </a>
-                </li>
-                <li><strong>KeyBERT:</strong>
-                    <a href="https://github.com/MaartenGr/KeyBERT" target="_blank" style="color: #1976d2;">
-                        KeyBERT Documentation ↗
-                    </a>
-                </li>
-                <li><strong>RaKUn:</strong>
-                    <a href="https://github.com/SkBlaz/rakun" target="_blank" style="color: #1976d2;">
-                        RaKUn GitHub Repository ↗
-                    </a>
-                </li>
                 <li><strong>Algorithm Papers:</strong>
                     <a href="https://boudinfl.github.io/pke/" target="_blank" style="color: #1976d2;">
                         PKE Documentation & References ↗

 import time
 warnings.filterwarnings('ignore')
+# PKE model names and descriptions
+PKE_MODELS = {
+    'kw_pke_multipartiterank': 'MultipartiteRank - Graph-based ranking using topic clustering',
+    'kw_pke_singlerank': 'SingleRank - Graph-based ranking algorithm',
+    'kw_pke_tfidf': 'TF-IDF - Term Frequency-Inverse Document Frequency',
+    'kw_pke_topicrank': 'TopicRank - Graph-based with topic clustering',
+    'kw_pke_textrank': 'TextRank - Graph-based ranking algorithm',
+    'kw_pke_positionrank': 'PositionRank - Incorporates word positions'
 }
 # Color palette for keywords based on scores
     def __init__(self):
         self.pke_models = {}
         self.spacy_model = None
     def load_spacy_model(self):
         """Load spaCy model for preprocessing"""
         return self.spacy_model
     def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
+        """Extract keywords using the specified PKE model"""
         try:
+            import pke
             if progress:
                 progress(0.3, desc="Loading model...")
             # Initialize the extractor based on model name
+            if 'multipartiterank' in model_name:
                 extractor = pke.unsupervised.MultipartiteRank()
+            elif 'singlerank' in model_name:
                 extractor = pke.unsupervised.SingleRank()
+            elif 'tfidf' in model_name:
                 extractor = pke.unsupervised.TfIdf()
+            elif 'topicrank' in model_name:
                 extractor = pke.unsupervised.TopicRank()
+            elif 'textrank' in model_name:
                 extractor = pke.unsupervised.TextRank()
+            elif 'positionrank' in model_name:
                 extractor = pke.unsupervised.PositionRank()
             else:
+                raise ValueError(f"Unknown model: {model_name}")
             if progress:
                 progress(0.5, desc="Processing text...")
             extractor.load_document(input=text, language='en')
             # Select candidates based on model
+            if 'multipartiterank' in model_name:
                 extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
                 extractor.candidate_weighting(alpha=1.1, threshold=0.75, method='average')
+            elif 'topicrank' in model_name:
                 extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
                 extractor.candidate_weighting(threshold=0.74, method='average')
+            elif 'positionrank' in model_name:
                 extractor.candidate_selection(maximum_word_number=3)
                 extractor.candidate_weighting(window=10)
+            elif 'tfidf' in model_name:
                 extractor.candidate_selection(n=ngram_range[1], stoplist=['en'])
                 extractor.candidate_weighting()
             else:
                 results.append({
                     'keyword': keyword,
                     'score': score,
+                    'model': model_name.replace('kw_pke_', '').title()
                 })
             return results
         except ImportError:
             print("PKE library not found. Using fallback keyword extraction...")
             return self.fallback_keyword_extraction(text, num_keywords)
+        except Exception as e:
+            print(f"Error with {model_name}: {str(e)}")
             return self.fallback_keyword_extraction(text, num_keywords)
     def fallback_keyword_extraction(self, text, num_keywords=10):
     summary = f"""
     ## 📊 Analysis Summary
     - **Keywords extracted:** {len(keywords)}
+    - **Model used:** {selected_model.replace('kw_pke_', '').title()}
     - **Average relevance score:** {avg_score:.4f}
     - **N-gram range:** {ngram_min}-{ngram_max} words
     """
         gr.Markdown("""
         # Keyword Extraction Explorer Tool
+        Extract the most important keywords and phrases from your text using various algorithms! This tool uses PKE (Python Keyphrase Extraction) models for comprehensive keyword extraction.
         ### How to use:
         1. **📝 Enter your text** in the text area below
         2. **🎯 Select a model** from the dropdown for keyword extraction
+        3. **⚙️ Adjust parameters** (number of keywords, n-gram range)
         4. **🔍 Click "Extract Keywords"** to see results with organized output
         """)
             with gr.Column(scale=1):
                 # Model selector
                 model_dropdown = gr.Dropdown(
+                    choices=list(PKE_MODELS.keys()),
+                    value='kw_pke_multipartiterank',
                     label="🎯 Select Keyword Extraction Model"
                 )
                         step=1,
                         label="Max N-gram"
                     )
         # Add model descriptions
         gr.HTML("""
                 ℹ️ Model Descriptions
             </summary>
             <div style="margin-top: 10px; padding: 10px;">
+                <dl style="margin: 0; font-size: 14px;">
                     <div style="margin-bottom: 8px;">
                         <dt style="font-weight: bold; display: inline; color: #4ECDC4;">MultipartiteRank:</dt>
                         <dd style="display: inline; margin-left: 5px;">Graph-based ranking using topic clustering - excellent for diverse texts</dd>
                         <dd style="display: inline; margin-left: 5px;">Incorporates word positions - good for structured documents</dd>
                     </div>
                 </dl>
             </div>
         </details>
         """)
             examples=[
                 [
                     "On June 6, 1944, Allied forces launched Operation Overlord, the invasion of Normandy. General Dwight D. Eisenhower commanded the operation, while Field Marshal Bernard Montgomery led ground forces. The BBC broadcast coded messages to the French Resistance, including the famous line 'The long sobs of autumn violins.'",
+                    "kw_pke_multipartiterank",
                     10,
                     1,
                     3
                 ],
                 [
                     "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
+                    "kw_pke_topicrank",
                     10,
                     1,
                     3
                 ],
                 [
                     "Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
+                    "kw_pke_textrank",
                     10,
                     1,
                     3
                         Python Keyphrase Extraction (PKE) GitHub ↗
                     </a>
                 </li>
                 <li><strong>Algorithm Papers:</strong>
                     <a href="https://boudinfl.github.io/pke/" target="_blank" style="color: #1976d2;">
                         PKE Documentation & References ↗