Update app.py
Browse files
app.py
CHANGED
|
@@ -1,159 +1,531 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import
|
| 3 |
-
import
|
|
|
|
| 4 |
import re
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
"kw_pke_positionrank"
|
| 15 |
-
]
|
| 16 |
-
|
| 17 |
-
def extract_keywords_pke(text, model_choice, num_keywords):
|
| 18 |
-
if model_choice == "kw_pke_multipartiterank":
|
| 19 |
-
extractor = pke.unsupervised.MultipartiteRank()
|
| 20 |
-
elif model_choice == "kw_pke_singlerank":
|
| 21 |
-
extractor = pke.unsupervised.SingleRank()
|
| 22 |
-
elif model_choice == "kw_pke_tfidf":
|
| 23 |
-
extractor = pke.unsupervised.TfIdf()
|
| 24 |
-
elif model_choice == "kw_pke_topicrank":
|
| 25 |
-
extractor = pke.unsupervised.TopicRank()
|
| 26 |
-
elif model_choice == "kw_pke_textrank":
|
| 27 |
-
extractor = pke.unsupervised.TextRank()
|
| 28 |
-
elif model_choice == "kw_pke_positionrank":
|
| 29 |
-
extractor = pke.unsupervised.PositionRank()
|
| 30 |
-
else:
|
| 31 |
-
return ["Error: Unknown model"]
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
def
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
| 49 |
|
| 50 |
-
def
|
|
|
|
| 51 |
if not keywords:
|
| 52 |
-
return "<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
table_html = """
|
| 55 |
-
<
|
| 56 |
-
<
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
<
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
"""
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
table_html += f"""
|
| 66 |
-
<tr>
|
| 67 |
-
<td style="padding: 10px; border: 1px solid #ddd;"
|
| 68 |
-
<td style="padding: 10px; border: 1px solid #ddd; font-weight: bold;">{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
</tr>
|
| 70 |
"""
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
if not text.strip():
|
| 76 |
-
return "โ Please enter text to analyse.", "", ""
|
| 77 |
-
|
| 78 |
-
keywords = extract_keywords_pke(text, model_choice, num_keywords)
|
| 79 |
-
highlighted_html = highlight_keywords(text, keywords)
|
| 80 |
-
keywords_table_html = create_keywords_table(keywords)
|
| 81 |
-
|
| 82 |
-
summary_html = f"""
|
| 83 |
-
<div style="background-color: #f8f9fa; padding: 20px; border-radius: 10px; border: 1px solid #ddd; box-shadow: 0 2px 5px rgba(0,0,0,0.05); margin-bottom: 20px;">
|
| 84 |
-
<h3 style="margin-top: 0; color: #6C63FF;">๐ Analysis Summary</h3>
|
| 85 |
-
<p><strong>Model Used:</strong> {model_choice}</p>
|
| 86 |
-
<p><strong>Keywords Found:</strong> {len(keywords)}</p>
|
| 87 |
</div>
|
| 88 |
"""
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
</div>
|
| 95 |
"""
|
|
|
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
<h4 style="color: #6C63FF; margin-bottom: 10px;">๐ Extracted Keywords</h4>
|
| 100 |
-
{keywords_table_html}
|
| 101 |
-
</div>
|
| 102 |
-
"""
|
| 103 |
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
|
|
|
| 106 |
def create_interface():
|
| 107 |
-
with gr.Blocks(title="Keyword
|
| 108 |
gr.Markdown("""
|
| 109 |
-
#
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
###
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
with gr.Row():
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
fn=process_text,
|
| 140 |
-
inputs=[
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
)
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
gr.HTML("""
|
| 145 |
-
<hr style="margin-top: 40px; margin-bottom: 20px;">
|
| 146 |
-
<div style="background-color: #f8f9fa; padding: 20px; border-radius:
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
return demo
|
| 159 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import warnings
|
| 4 |
+
import random
|
| 5 |
import re
|
| 6 |
+
import time
|
| 7 |
+
warnings.filterwarnings('ignore')
|
| 8 |
|
| 9 |
+
# Identifiers of the supported PKE models mapped to a short human-readable
# description of each algorithm.
PKE_MODELS = {
    "kw_pke_multipartiterank": "MultipartiteRank - Graph-based ranking using topic clustering",
    "kw_pke_singlerank": "SingleRank - Graph-based ranking algorithm",
    "kw_pke_tfidf": "TF-IDF - Term Frequency-Inverse Document Frequency",
    "kw_pke_topicrank": "TopicRank - Graph-based with topic clustering",
    "kw_pke_textrank": "TextRank - Graph-based ranking algorithm",
    "kw_pke_positionrank": "PositionRank - Incorporates word positions",
}

# Highlight colour for each relevance tier (tier is decided from the keyword's
# score relative to the best score).
SCORE_COLORS = {
    "high": "#00B894",    # green
    "medium": "#F9CA24",  # yellow
    "low": "#FF6B6B",     # red
}

# Extra palette available for per-keyword colouring.
KEYWORD_COLORS = [
    "#4ECDC4",
    "#45B7D1",
    "#6C5CE7",
    "#A0E7E5",
    "#FD79A8",
    "#8E8E93",
    "#55A3FF",
    "#E17055",
    "#DDA0DD",
    "#FF9F43",
    "#10AC84",
    "#EE5A24",
    "#0FBC89",
    "#5F27CD",
    "#FF3838",
]
|
| 32 |
|
| 33 |
+
class KeywordExtractionManager:
    """Extract keywords with a PKE model, falling back to word frequencies.

    ``extract_keywords`` imports ``pke`` lazily; if the import or any later
    step fails, the result comes from ``fallback_keyword_extraction`` instead,
    so callers always get a list of ``{'keyword', 'score', 'model'}`` dicts.
    """

    # Substring of the model identifier -> class name in ``pke.unsupervised``.
    # Order matters: the first substring found in the model name wins.
    _EXTRACTOR_CLASSES = {
        'multipartiterank': 'MultipartiteRank',
        'singlerank': 'SingleRank',
        'tfidf': 'TfIdf',
        'topicrank': 'TopicRank',
        'textrank': 'TextRank',
        'positionrank': 'PositionRank',
    }

    def __init__(self):
        self.pke_models = {}
        self.spacy_model = None

    @staticmethod
    def _as_count(value):
        """Return ``value`` as an int, leaving ``None`` untouched."""
        return value if value is None else int(value)

    def load_spacy_model(self):
        """Load and cache the ``en_core_web_sm`` spaCy model.

        Returns:
            The loaded model, or ``None`` when spaCy or the model is missing
            (a message is printed in that case).
        """
        if self.spacy_model is None:
            try:
                import spacy
                try:
                    self.spacy_model = spacy.load("en_core_web_sm")
                    print("โ spaCy model loaded successfully")
                except OSError:
                    print("spaCy model not found. Please install with: python -m spacy download en_core_web_sm")
                    return None
            except Exception as e:
                print(f"Error loading spaCy model: {str(e)}")
                return None
        return self.spacy_model

    def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
        """Extract keywords using the specified PKE model.

        Args:
            text: Document to analyse.
            model_name: Identifier containing one of the supported algorithm
                names (e.g. ``'kw_pke_textrank'``).
            num_keywords: Maximum number of keywords to return; coerced to
                ``int`` because it is used for slicing.
            ngram_range: ``(min, max)`` pair; only the upper bound is used,
                and only by the TF-IDF model.
            progress: Optional callable ``progress(fraction, desc=...)``.

        Returns:
            A list of dicts with keys ``'keyword'``, ``'score'`` and
            ``'model'``. If ``pke`` is unavailable, the model name is unknown,
            or extraction raises, the fallback extractor's result is returned.
        """
        num_keywords = self._as_count(num_keywords)
        try:
            import pke

            if progress:
                progress(0.3, desc="Loading model...")

            # Resolve the algorithm once so that the extractor class and its
            # configuration below can never disagree.
            algorithm = next(
                (key for key in self._EXTRACTOR_CLASSES if key in model_name),
                None,
            )
            if algorithm is None:
                raise ValueError(f"Unknown model: {model_name}")
            extractor = getattr(pke.unsupervised, self._EXTRACTOR_CLASSES[algorithm])()

            if progress:
                progress(0.5, desc="Processing text...")

            extractor.load_document(input=text, language='en')

            # Candidate selection / weighting differ per algorithm.
            if algorithm == 'multipartiterank':
                extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
                extractor.candidate_weighting(alpha=1.1, threshold=0.75, method='average')
            elif algorithm == 'topicrank':
                extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
                extractor.candidate_weighting(threshold=0.74, method='average')
            elif algorithm == 'positionrank':
                extractor.candidate_selection(maximum_word_number=3)
                extractor.candidate_weighting(window=10)
            elif algorithm == 'tfidf':
                # NOTE(review): stoplist=['en'] looks like a language code
                # passed as a one-word stoplist - confirm against the pke API.
                extractor.candidate_selection(n=ngram_range[1], stoplist=['en'])
                extractor.candidate_weighting()
            else:
                # SingleRank and TextRank
                extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
                extractor.candidate_weighting(window=10)

            if progress:
                progress(0.7, desc="Extracting keywords...")

            model_label = model_name.replace('kw_pke_', '').title()
            return [
                {'keyword': keyword, 'score': score, 'model': model_label}
                for keyword, score in extractor.get_n_best(n=num_keywords)
            ]

        except ImportError:
            print("PKE library not found. Using fallback keyword extraction...")
            return self.fallback_keyword_extraction(text, num_keywords)
        except Exception as e:
            # Deliberate best-effort: any PKE failure degrades to the fallback.
            print(f"Error with {model_name}: {str(e)}")
            return self.fallback_keyword_extraction(text, num_keywords)

    def fallback_keyword_extraction(self, text, num_keywords=10):
        """Rank words by relative frequency when PKE cannot be used.

        Words are lower-cased runs of letters (any alphabet), longer than
        three characters and not in a small English stop-word list. The score
        is the word's count divided by the number of retained words.

        Returns:
            Up to ``num_keywords`` dicts with keys ``'keyword'``, ``'score'``
            and ``'model'`` (always ``'Fallback-TFIDF'``); empty when no word
            survives filtering.
        """
        import re
        from collections import Counter

        num_keywords = self._as_count(num_keywords)

        # [^\W\d_] = "word character that is neither digit nor underscore",
        # i.e. a letter in any script, so accented words are kept whole.
        words = re.findall(r'\b[^\W\d_]+\b', text.lower())

        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                      'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
                      'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
                      'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that',
                      'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'}

        filtered_words = [w for w in words if w not in stop_words and len(w) > 3]
        total = len(filtered_words)

        # most_common() yields nothing when filtered_words is empty, so the
        # division below is never reached with total == 0.
        return [
            {'keyword': word, 'score': freq / total, 'model': 'Fallback-TFIDF'}
            for word, freq in Counter(filtered_words).most_common(num_keywords)
        ]
|
| 157 |
|
| 158 |
+
def get_score_color(score, max_score):
    """Pick the highlight colour for ``score`` relative to ``max_score``.

    A ratio of at least 0.7 maps to the 'high' colour, at least 0.4 to
    'medium', anything lower to 'low'. When ``max_score`` is zero the ratio
    is undefined and the 'medium' colour is returned.
    """
    if max_score == 0:
        return SCORE_COLORS['medium']

    ratio = score / max_score
    if ratio >= 0.7:
        tier = 'high'
    elif ratio >= 0.4:
        tier = 'medium'
    else:
        tier = 'low'
    return SCORE_COLORS[tier]
|
| 170 |
|
| 171 |
+
def create_highlighted_html(text, keywords):
    """Create HTML showing ``text`` with every keyword occurrence highlighted.

    All keywords are matched in a single pass over the original text, so
    markup inserted for one keyword is never re-scanned for another, and a
    keyword contained in a longer keyword does not produce nested spans.
    The matched text keeps its original casing, and all user text is
    HTML-escaped before being embedded.

    Args:
        text: The analysed document (plain text).
        keywords: List of dicts with at least ``'keyword'`` and ``'score'``.

    Returns:
        An HTML fragment. With no keywords, the escaped text is returned in a
        plain box without the heading.
    """
    import html  # local import: only needed for escaping here

    if not keywords:
        return f"<div style='padding: 15px; border: 1px solid #ddd; border-radius: 5px; background-color: #fafafa;'><p>{html.escape(text)}</p></div>"

    # Get max score for color scaling
    max_score = max(k['score'] for k in keywords)

    # Lower-cased keyword -> (color, score); the first entry wins on duplicates.
    styles = {}
    for kw_data in keywords:
        key = kw_data['keyword'].lower()
        if key and key not in styles:
            styles[key] = (get_score_color(kw_data['score'], max_score), kw_data['score'])

    if not styles:
        highlighted_text = html.escape(text)
    else:
        # Longest alternatives first so the regex prefers the longest keyword
        # starting at a given position.
        alternatives = sorted(styles, key=len, reverse=True)
        pattern = re.compile(
            r'\b(?:' + '|'.join(re.escape(k) for k in alternatives) + r')\b',
            flags=re.IGNORECASE,
        )

        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            style = styles.get(match.group(0).lower())
            if style is None:
                # Case-insensitive match that does not lower() to a known
                # keyword; leave it as ordinary text.
                continue
            color, score = style
            pieces.append(html.escape(text[cursor:match.start()]))
            pieces.append(
                f'<span style="background-color: {color}; padding: 2px 4px; '
                f'border-radius: 3px; margin: 0 1px; '
                f'border: 1px solid {color}; color: white; font-weight: bold;" '
                f'title="Score: {score:.3f}">{html.escape(match.group(0))}</span>'
            )
            cursor = match.end()
        pieces.append(html.escape(text[cursor:]))
        highlighted_text = ''.join(pieces)

    return f"""
    <div style='padding: 15px; border: 2px solid #ddd; border-radius: 8px; background-color: #fafafa; margin: 10px 0;'>
        <h4 style='margin: 0 0 15px 0; color: #333;'>๐ Text with Highlighted Keywords</h4>
        <div style='line-height: 1.8; font-size: 16px; background-color: white; padding: 15px; border-radius: 5px;'>{highlighted_text}</div>
    </div>
    """
|
| 206 |
|
| 207 |
+
def create_keyword_table_html(keywords):
    """Create an HTML table listing the keywords ordered by descending score.

    Each row shows the rank, the keyword, its score, a relevance bar whose
    width is the score as a percentage of the best score, and the model
    label. Keyword and model text are HTML-escaped.

    Args:
        keywords: List of dicts with keys ``'keyword'``, ``'score'`` and
            ``'model'``.

    Returns:
        An HTML fragment, or a "No keywords found." paragraph when
        ``keywords`` is empty.
    """
    import html  # local import: only needed for escaping here

    if not keywords:
        return "<p style='text-align: center; padding: 20px;'>No keywords found.</p>"

    # Sort by score
    sorted_keywords = sorted(keywords, key=lambda x: x['score'], reverse=True)
    max_score = sorted_keywords[0]['score']

    parts = ["""
    <div style='max-height: 600px; overflow-y: auto; border: 2px solid #ddd; border-radius: 8px; padding: 20px; background-color: #fafafa;'>
        <h3 style="margin: 0 0 20px 0;">๐ฏ Extracted Keywords</h3>
        <table style="width: 100%; border-collapse: collapse; border: 1px solid #ddd; background-color: white;">
            <thead>
                <tr style="background-color: #4ECDC4; color: white;">
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Rank</th>
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Keyword</th>
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Score</th>
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Relevance</th>
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Model</th>
                </tr>
            </thead>
            <tbody>
    """]

    for rank, kw_data in enumerate(sorted_keywords, start=1):
        score = kw_data['score']
        color = get_score_color(score, max_score)

        # Width of the relevance bar; never negative so the CSS stays valid.
        bar_width = max(0, int((score / max_score) * 100)) if max_score > 0 else 0
        relevance_bar = f"""
        <div style="width: 100%; background-color: #e0e0e0; border-radius: 10px; height: 20px;">
            <div style="width: {bar_width}%; background-color: {color}; height: 100%; border-radius: 10px;"></div>
        </div>
        """

        # Keywords come from user-supplied text, so escape before embedding.
        keyword_text = html.escape(str(kw_data['keyword']))
        model_text = html.escape(str(kw_data['model']))

        parts.append(f"""
            <tr style="background-color: #fff;">
                <td style="padding: 10px; border: 1px solid #ddd; text-align: center; font-weight: bold;">#{rank}</td>
                <td style="padding: 10px; border: 1px solid #ddd; font-weight: bold;">{keyword_text}</td>
                <td style="padding: 10px; border: 1px solid #ddd;">
                    <span style="color: {color}; font-weight: bold;">{score:.4f}</span>
                </td>
                <td style="padding: 10px; border: 1px solid #ddd;">{relevance_bar}</td>
                <td style="padding: 10px; border: 1px solid #ddd;">
                    <span style='background-color: #007bff; color: white; padding: 2px 6px; border-radius: 10px; font-size: 11px;'>
                        {model_text}
                    </span>
                </td>
            </tr>
        """)

    parts.append("""
            </tbody>
        </table>
    </div>
    """)

    return "".join(parts)
|
| 267 |
|
| 268 |
+
def create_legend_html():
    """Create the HTML legend explaining the three relevance colours.

    The colours and percentage bands are written out literally; they match
    the values in ``SCORE_COLORS`` and the 0.7 / 0.4 thresholds used by
    ``get_score_color``. The literal ``<`` in the "Low" label is emitted as
    ``&lt;`` so the fragment is valid HTML.
    """
    entries = (
        ('#00B894', 'High Relevance (70%+)'),
        ('#F9CA24', 'Medium Relevance (40-70%)'),
        ('#FF6B6B', 'Low Relevance (&lt;40%)'),
    )
    badges = "".join(
        f"""
            <span style='background-color: {color}; padding: 4px 12px; border-radius: 15px; color: white; font-weight: bold;'>
                {label}
            </span>"""
        for color, label in entries
    )
    return f"""
    <div style='margin: 15px 0; padding: 15px; background-color: #f8f9fa; border-radius: 8px;'>
        <h4 style='margin: 0 0 15px 0;'>๐จ Relevance Score Legend</h4>
        <div style='display: flex; flex-wrap: wrap; gap: 15px;'>{badges}
        </div>
    </div>
    """
|
| 287 |
|
| 288 |
+
# Initialize the keyword extraction manager
|
| 289 |
+
keyword_manager = KeywordExtractionManager()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
| 291 |
+
def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progress=gr.Progress()):
    """Run keyword extraction for the Gradio UI and build its three outputs.

    Args:
        text: Text entered by the user.
        selected_model: Model identifier chosen in the dropdown.
        num_keywords: Requested number of keywords.
        ngram_min: Lower n-gram bound.
        ngram_max: Upper n-gram bound.
        progress: Gradio progress tracker, called as
            ``progress(fraction, desc=...)``.

    Returns:
        A ``(summary_markdown, legend_and_highlight_html, table_html)`` tuple.
        When the text is empty or no keywords are found, the first element is
        a message and the other two are empty strings.
    """
    if not text or not text.strip():
        return "โ Please enter some text to analyse", "", ""

    # Normalise the numeric inputs: they are used for slicing downstream, and
    # an inverted range is simply swapped. (Presumably slider values can
    # arrive as floats - TODO confirm against the Gradio version in use.)
    num_keywords = int(num_keywords)
    ngram_min, ngram_max = sorted((int(ngram_min), int(ngram_max)))

    progress(0.1, desc="Initialising...")

    # Extract keywords
    progress(0.2, desc="Extracting keywords...")
    keywords = keyword_manager.extract_keywords(
        text,
        selected_model,
        num_keywords=num_keywords,
        ngram_range=(ngram_min, ngram_max),
        progress=progress
    )

    if not keywords:
        return "โ No keywords found. Try adjusting the parameters.", "", ""

    progress(0.8, desc="Processing results...")

    # Create outputs
    legend_html = create_legend_html()
    highlighted_html = create_highlighted_html(text, keywords)
    results_html = create_keyword_table_html(keywords)

    progress(0.9, desc="Creating summary...")

    # Report the model that actually produced the keywords: every result
    # carries its own label, which differs from the selection when the
    # extractor fell back to the frequency-based method.
    model_used = keywords[0]['model']
    avg_score = sum(k['score'] for k in keywords) / len(keywords)
    summary = f"""
## ๐ Analysis Summary
- **Keywords extracted:** {len(keywords)}
- **Model used:** {model_used}
- **Average relevance score:** {avg_score:.4f}
- **N-gram range:** {ngram_min}-{ngram_max} words
"""

    progress(1.0, desc="Complete!")

    return summary, legend_html + highlighted_html, results_html
|
| 333 |
|
| 334 |
+
# Create Gradio interface
|
| 335 |
def create_interface():
|
| 336 |
+
with gr.Blocks(title="Keyword Extraction Tool", theme=gr.themes.Soft()) as demo:
|
| 337 |
gr.Markdown("""
|
| 338 |
+
# Keyword Extraction Explorer Tool
|
| 339 |
+
|
| 340 |
+
Extract the most important keywords and phrases from your text using various algorithms! This tool uses PKE (Python Keyphrase Extraction) models for comprehensive keyword extraction.
|
| 341 |
+
|
| 342 |
+
### How to use:
|
| 343 |
+
1. **๐ Enter your text** in the text area below
|
| 344 |
+
2. **๐ฏ Select a model** from the dropdown for keyword extraction
|
| 345 |
+
3. **โ๏ธ Adjust parameters** (number of keywords, n-gram range)
|
| 346 |
+
4. **๐ Click "Extract Keywords"** to see results with organized output
|
| 347 |
+
""")
|
| 348 |
+
|
| 349 |
+
# Add tip box
|
| 350 |
+
gr.HTML("""
|
| 351 |
+
<div style="background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; padding: 12px; margin: 15px 0;">
|
| 352 |
+
<strong style="color: #856404;">๐ก Top tip:</strong> Different models excel at different types of texts - experiment to find the best one for your content!
|
| 353 |
+
</div>
|
| 354 |
+
""")
|
| 355 |
+
|
|
|
|
| 356 |
with gr.Row():
|
| 357 |
+
with gr.Column(scale=2):
|
| 358 |
+
text_input = gr.Textbox(
|
| 359 |
+
label="๐ Text to Analyse",
|
| 360 |
+
placeholder="Enter your text here...",
|
| 361 |
+
lines=6,
|
| 362 |
+
max_lines=10
|
| 363 |
+
)
|
| 364 |
+
|
| 365 |
+
with gr.Column(scale=1):
|
| 366 |
+
# Model selector
|
| 367 |
+
model_dropdown = gr.Dropdown(
|
| 368 |
+
choices=list(PKE_MODELS.keys()),
|
| 369 |
+
value='kw_pke_multipartiterank',
|
| 370 |
+
label="๐ฏ Select Keyword Extraction Model"
|
| 371 |
+
)
|
| 372 |
+
|
| 373 |
+
# Parameters
|
| 374 |
+
num_keywords = gr.Slider(
|
| 375 |
+
minimum=5,
|
| 376 |
+
maximum=30,
|
| 377 |
+
value=10,
|
| 378 |
+
step=1,
|
| 379 |
+
label="๐ Number of Keywords"
|
| 380 |
+
)
|
| 381 |
+
|
| 382 |
+
with gr.Row():
|
| 383 |
+
ngram_min = gr.Slider(
|
| 384 |
+
minimum=1,
|
| 385 |
+
maximum=3,
|
| 386 |
+
value=1,
|
| 387 |
+
step=1,
|
| 388 |
+
label="Min N-gram"
|
| 389 |
+
)
|
| 390 |
+
ngram_max = gr.Slider(
|
| 391 |
+
minimum=1,
|
| 392 |
+
maximum=4,
|
| 393 |
+
value=3,
|
| 394 |
+
step=1,
|
| 395 |
+
label="Max N-gram"
|
| 396 |
+
)
|
| 397 |
+
|
| 398 |
+
# Add model descriptions
|
| 399 |
+
gr.HTML("""
|
| 400 |
+
<details style="margin: 20px 0; padding: 10px; background-color: #f8f9fa; border-radius: 8px; border: 1px solid #ddd;">
|
| 401 |
+
<summary style="cursor: pointer; font-weight: bold; padding: 5px; color: #1976d2;">
|
| 402 |
+
โน๏ธ Model Descriptions
|
| 403 |
+
</summary>
|
| 404 |
+
<div style="margin-top: 10px; padding: 10px;">
|
| 405 |
+
<dl style="margin: 0; font-size: 14px;">
|
| 406 |
+
<div style="margin-bottom: 8px;">
|
| 407 |
+
<dt style="font-weight: bold; display: inline; color: #4ECDC4;">MultipartiteRank:</dt>
|
| 408 |
+
<dd style="display: inline; margin-left: 5px;">Graph-based ranking using topic clustering - excellent for diverse texts</dd>
|
| 409 |
+
</div>
|
| 410 |
+
<div style="margin-bottom: 8px;">
|
| 411 |
+
<dt style="font-weight: bold; display: inline; color: #45B7D1;">SingleRank:</dt>
|
| 412 |
+
<dd style="display: inline; margin-left: 5px;">Simple graph-based algorithm - fast and effective</dd>
|
| 413 |
+
</div>
|
| 414 |
+
<div style="margin-bottom: 8px;">
|
| 415 |
+
<dt style="font-weight: bold; display: inline; color: #F9CA24;">TF-IDF:</dt>
|
| 416 |
+
<dd style="display: inline; margin-left: 5px;">Statistical approach - good for technical texts</dd>
|
| 417 |
+
</div>
|
| 418 |
+
<div style="margin-bottom: 8px;">
|
| 419 |
+
<dt style="font-weight: bold; display: inline; color: #6C5CE7;">TopicRank:</dt>
|
| 420 |
+
<dd style="display: inline; margin-left: 5px;">Groups similar candidates - reduces redundancy</dd>
|
| 421 |
+
</div>
|
| 422 |
+
<div style="margin-bottom: 8px;">
|
| 423 |
+
<dt style="font-weight: bold; display: inline; color: #00B894;">TextRank:</dt>
|
| 424 |
+
<dd style="display: inline; margin-left: 5px;">Classic PageRank-inspired algorithm</dd>
|
| 425 |
+
</div>
|
| 426 |
+
<div style="margin-bottom: 8px;">
|
| 427 |
+
<dt style="font-weight: bold; display: inline; color: #E17055;">PositionRank:</dt>
|
| 428 |
+
<dd style="display: inline; margin-left: 5px;">Incorporates word positions - good for structured documents</dd>
|
| 429 |
+
</div>
|
| 430 |
+
</dl>
|
| 431 |
+
</div>
|
| 432 |
+
</details>
|
| 433 |
+
""")
|
| 434 |
+
|
| 435 |
+
extract_btn = gr.Button("๐ Extract Keywords", variant="primary", size="lg")
|
| 436 |
+
|
| 437 |
+
# Output sections
|
| 438 |
+
with gr.Row():
|
| 439 |
+
summary_output = gr.Markdown(label="Summary")
|
| 440 |
+
|
| 441 |
+
with gr.Row():
|
| 442 |
+
highlighted_output = gr.HTML(label="Highlighted Text")
|
| 443 |
+
|
| 444 |
+
# Results section
|
| 445 |
+
with gr.Row():
|
| 446 |
+
with gr.Column():
|
| 447 |
+
gr.Markdown("### ๐ Detailed Results")
|
| 448 |
+
results_output = gr.HTML(label="Keyword Results")
|
| 449 |
+
|
| 450 |
+
# Connect the button to the processing function
|
| 451 |
+
extract_btn.click(
|
| 452 |
fn=process_text,
|
| 453 |
+
inputs=[
|
| 454 |
+
text_input,
|
| 455 |
+
model_dropdown,
|
| 456 |
+
num_keywords,
|
| 457 |
+
ngram_min,
|
| 458 |
+
ngram_max
|
| 459 |
+
],
|
| 460 |
+
outputs=[summary_output, highlighted_output, results_output]
|
| 461 |
)
|
| 462 |
+
|
| 463 |
+
gr.Examples(
|
| 464 |
+
examples=[
|
| 465 |
+
[
|
| 466 |
+
"On June 6, 1944, Allied forces launched Operation Overlord, the invasion of Normandy. General Dwight D. Eisenhower commanded the operation, while Field Marshal Bernard Montgomery led ground forces. The BBC broadcast coded messages to the French Resistance, including the famous line 'The long sobs of autumn violins.'",
|
| 467 |
+
"kw_pke_multipartiterank",
|
| 468 |
+
10,
|
| 469 |
+
1,
|
| 470 |
+
3
|
| 471 |
+
],
|
| 472 |
+
[
|
| 473 |
+
"In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
|
| 474 |
+
"kw_pke_topicrank",
|
| 475 |
+
10,
|
| 476 |
+
1,
|
| 477 |
+
3
|
| 478 |
+
],
|
| 479 |
+
[
|
| 480 |
+
"Charles Darwin arrived at the Galรกpagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
|
| 481 |
+
"kw_pke_textrank",
|
| 482 |
+
10,
|
| 483 |
+
1,
|
| 484 |
+
3
|
| 485 |
+
]
|
| 486 |
+
],
|
| 487 |
+
inputs=[
|
| 488 |
+
text_input,
|
| 489 |
+
model_dropdown,
|
| 490 |
+
num_keywords,
|
| 491 |
+
ngram_min,
|
| 492 |
+
ngram_max
|
| 493 |
+
]
|
| 494 |
+
)
|
| 495 |
+
|
| 496 |
+
# Add model information links
|
| 497 |
gr.HTML("""
|
| 498 |
+
<hr style="margin-top: 40px; margin-bottom: 20px;">
|
| 499 |
+
<div style="background-color: #f8f9fa; padding: 20px; border-radius: 8px; margin-top: 20px;">
|
| 500 |
+
<h4 style="margin-top: 0;">๐ Model Information & Documentation</h4>
|
| 501 |
+
<p style="font-size: 14px; margin-bottom: 15px;">Learn more about the algorithms used in this tool:</p>
|
| 502 |
+
<ul style="font-size: 14px; line-height: 1.8;">
|
| 503 |
+
<li><strong>PKE Library:</strong>
|
| 504 |
+
<a href="https://github.com/boudinfl/pke" target="_blank" style="color: #1976d2;">
|
| 505 |
+
Python Keyphrase Extraction (PKE) GitHub โ
|
| 506 |
+
</a>
|
| 507 |
+
</li>
|
| 508 |
+
<li><strong>Algorithm Papers:</strong>
|
| 509 |
+
<a href="https://boudinfl.github.io/pke/" target="_blank" style="color: #1976d2;">
|
| 510 |
+
PKE Documentation & References โ
|
| 511 |
+
</a>
|
| 512 |
+
</li>
|
| 513 |
+
</ul>
|
| 514 |
+
</div>
|
| 515 |
+
|
| 516 |
+
<br>
|
| 517 |
+
<hr style="margin-top: 40px; margin-bottom: 20px;">
|
| 518 |
+
<div style="background-color: #f8f9fa; padding: 20px; border-radius: 8px; margin-top: 20px; text-align: center;">
|
| 519 |
+
<p style="font-size: 14px; line-height: 1.8; margin: 0;">
|
| 520 |
+
This <strong>Keyword Extraction Explorer Tool</strong> was created as part of the
|
| 521 |
+
<a href="https://digitalscholarship.web.ox.ac.uk/" target="_blank" style="color: #1976d2;">
|
| 522 |
+
Digital Scholarship at Oxford (DiSc)
|
| 523 |
+
</a>
|
| 524 |
+
funded research project:
|
| 525 |
+
<em>Extracting Keywords from Crowdsourced Collections</em>.
|
| 526 |
+
</p>
|
| 527 |
+
</div>
|
| 528 |
+
""")
|
| 529 |
|
| 530 |
return demo
|
| 531 |
|