SorrelC committed on
Commit
4ba8bd8
·
verified ·
1 Parent(s): e0cb85f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +499 -127
app.py CHANGED
@@ -1,159 +1,531 @@
1
  import gradio as gr
2
- import pke
3
- import nltk
 
4
  import re
 
 
5
 
6
- nltk.download('stopwords')
 
 
 
 
 
 
 
 
7
 
8
- AVAILABLE_MODELS = [
9
- "kw_pke_multipartiterank",
10
- "kw_pke_singlerank",
11
- "kw_pke_tfidf",
12
- "kw_pke_topicrank",
13
- "kw_pke_textrank",
14
- "kw_pke_positionrank"
15
- ]
16
-
17
- def extract_keywords_pke(text, model_choice, num_keywords):
18
- if model_choice == "kw_pke_multipartiterank":
19
- extractor = pke.unsupervised.MultipartiteRank()
20
- elif model_choice == "kw_pke_singlerank":
21
- extractor = pke.unsupervised.SingleRank()
22
- elif model_choice == "kw_pke_tfidf":
23
- extractor = pke.unsupervised.TfIdf()
24
- elif model_choice == "kw_pke_topicrank":
25
- extractor = pke.unsupervised.TopicRank()
26
- elif model_choice == "kw_pke_textrank":
27
- extractor = pke.unsupervised.TextRank()
28
- elif model_choice == "kw_pke_positionrank":
29
- extractor = pke.unsupervised.PositionRank()
30
- else:
31
- return ["Error: Unknown model"]
32
 
33
- extractor.load_document(input=text, language='en', normalization=None)
34
- extractor.candidate_selection(n=3) if model_choice == "kw_pke_tfidf" else extractor.candidate_selection()
35
- extractor.candidate_weighting()
 
 
 
36
 
37
- return [kw for kw, score in extractor.get_n_best(n=num_keywords)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- def highlight_keywords(text, keywords):
40
- highlighted = text
41
- for kw in sorted(keywords, key=lambda k: -len(k)):
42
- pattern = re.compile(re.escape(kw), re.IGNORECASE)
43
- highlighted = pattern.sub(
44
- f'<span style="background-color: #6C63FF; color: white; padding: 2px 5px; '
45
- f'border-radius: 4px; font-weight: bold;">{kw}</span>',
46
- highlighted
47
- )
48
- return highlighted
 
 
49
 
50
- def create_keywords_table(keywords):
 
51
  if not keywords:
52
- return "<p>No keywords found.</p>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
 
 
 
 
 
 
 
 
 
54
  table_html = """
55
- <table style="width: 100%; border-collapse: collapse; border: 1px solid #ddd;">
56
- <thead>
57
- <tr style="background-color: #6C63FF; color: white;">
58
- <th style="padding: 10px; text-align: left;">Rank</th>
59
- <th style="padding: 10px; text-align: left;">Keyword</th>
60
- </tr>
61
- </thead>
62
- <tbody>
 
 
 
 
 
63
  """
64
- for idx, kw in enumerate(keywords, 1):
 
 
 
 
 
 
 
 
 
 
 
 
65
  table_html += f"""
66
- <tr>
67
- <td style="padding: 10px; border: 1px solid #ddd;">{idx}</td>
68
- <td style="padding: 10px; border: 1px solid #ddd; font-weight: bold;">{kw}</td>
 
 
 
 
 
 
 
 
 
69
  </tr>
70
  """
71
- table_html += "</tbody></table>"
72
- return table_html
73
-
74
- def process_text(text, model_choice, num_keywords):
75
- if not text.strip():
76
- return "โŒ Please enter text to analyse.", "", ""
77
-
78
- keywords = extract_keywords_pke(text, model_choice, num_keywords)
79
- highlighted_html = highlight_keywords(text, keywords)
80
- keywords_table_html = create_keywords_table(keywords)
81
-
82
- summary_html = f"""
83
- <div style="background-color: #f8f9fa; padding: 20px; border-radius: 10px; border: 1px solid #ddd; box-shadow: 0 2px 5px rgba(0,0,0,0.05); margin-bottom: 20px;">
84
- <h3 style="margin-top: 0; color: #6C63FF;">๐Ÿ“Š Analysis Summary</h3>
85
- <p><strong>Model Used:</strong> {model_choice}</p>
86
- <p><strong>Keywords Found:</strong> {len(keywords)}</p>
87
  </div>
88
  """
 
 
89
 
90
- highlighted_section = f"""
91
- <div style='padding: 20px; border: 2px solid #ddd; border-radius: 10px; background-color: #fafafa; margin: 15px 0; box-shadow: 0 2px 5px rgba(0,0,0,0.05);'>
92
- <h4 style='margin: 0 0 15px 0; color: #6C63FF;'>๐Ÿ“ Text with Highlighted Keywords</h4>
93
- <div style='line-height: 1.8; font-size: 16px; background-color: white; padding: 20px; border-radius: 8px;'>{highlighted_html}</div>
 
 
 
 
 
 
 
 
 
 
 
 
94
  </div>
95
  """
 
96
 
97
- table_section = f"""
98
- <div style="margin-top: 20px;">
99
- <h4 style="color: #6C63FF; margin-bottom: 10px;">๐Ÿ“‹ Extracted Keywords</h4>
100
- {keywords_table_html}
101
- </div>
102
- """
103
 
104
- return summary_html, highlighted_section, table_section
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
 
106
  def create_interface():
107
- with gr.Blocks(title="Keyword Explorer Tool") as demo:
108
  gr.Markdown("""
109
- # ๐Ÿ”‘ Keyword Explorer Tool
110
-
111
- ---
112
-
113
- ### ๐Ÿ“‹ How to use:
114
-
115
- 1. **๐Ÿ“ Enter your text** in the text area below.
116
- 2. **๐ŸŽ›๏ธ Select a model** from the dropdown.
117
- 3. **โš™๏ธ Adjust the maximum number of keywords to be identified ** using the slider.
118
- 4. **๐Ÿ” Click "Analyse Text"** to see:
119
- - ๐Ÿ“Š A summary of results.
120
- - โœจ Highlighted keywords inside your text.
121
- - ๐Ÿ“‹ A full keyword list.
122
-
123
- ---
124
- """)
125
-
126
- text_input = gr.Textbox(label="๐Ÿ“ Text to Analyse", placeholder="Enter your text here...", lines=10)
127
-
128
  with gr.Row():
129
- model_dropdown = gr.Dropdown(choices=AVAILABLE_MODELS, value=AVAILABLE_MODELS[0], label="๐ŸŽ›๏ธ Select Model")
130
- num_keywords_slider = gr.Slider(minimum=5, maximum=50, value=10, step=1, label="โš™๏ธ Adjust Number of Keywords")
131
-
132
- analyse_btn = gr.Button("๐Ÿ” Extract Keywords", elem_classes="explorer-button")
133
-
134
- summary_output = gr.HTML()
135
- highlighted_output = gr.HTML()
136
- keywords_table_output = gr.HTML()
137
-
138
- analyse_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  fn=process_text,
140
- inputs=[text_input, model_dropdown, num_keywords_slider],
141
- outputs=[summary_output, highlighted_output, keywords_table_output]
 
 
 
 
 
 
142
  )
143
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  gr.HTML("""
145
- <hr style="margin-top: 40px; margin-bottom: 20px;">
146
- <div style="background-color: #f8f9fa; padding: 20px; border-radius: 10px; text-align: center; border: 1px solid #ddd;">
147
- <p style="font-size: 14px; line-height: 1.8; margin: 0;">
148
- This <strong>Keyword Extraction Explorer Tool</strong> was created as part of the
149
- <a href="https://digitalscholarship.web.ox.ac.uk/" target="_blank" style="color: #6C63FF;">
150
- Digital Scholarship at Oxford (DiSc)
151
- </a>
152
- funded research project:
153
- <em>Extracting Keywords from Crowdsourced Collections</em>.
154
- </p>
155
- </div>
156
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  return demo
159
 
 
1
  import gradio as gr
2
+ import pandas as pd
3
+ import warnings
4
+ import random
5
  import re
6
+ import time
7
+ warnings.filterwarnings('ignore')
8
 
9
# Model identifiers offered in the UI dropdown, mapped to a short
# human-readable description of the PKE algorithm behind each one.
PKE_MODELS = {
    'kw_pke_multipartiterank': 'MultipartiteRank - Graph-based ranking using topic clustering',
    'kw_pke_singlerank': 'SingleRank - Graph-based ranking algorithm',
    'kw_pke_tfidf': 'TF-IDF - Term Frequency-Inverse Document Frequency',
    'kw_pke_topicrank': 'TopicRank - Graph-based with topic clustering',
    'kw_pke_textrank': 'TextRank - Graph-based ranking algorithm',
    'kw_pke_positionrank': 'PositionRank - Incorporates word positions',
}

# Color palette for keywords based on scores (one color per relevance band).
SCORE_COLORS = {
    'high': '#00B894',    # Green - High relevance
    'medium': '#F9CA24',  # Yellow - Medium relevance
    'low': '#FF6B6B',     # Red - Low relevance
}

# Additional colors for variety
KEYWORD_COLORS = [
    '#4ECDC4', '#45B7D1', '#6C5CE7', '#A0E7E5', '#FD79A8',
    '#8E8E93', '#55A3FF', '#E17055', '#DDA0DD', '#FF9F43',
    '#10AC84', '#EE5A24', '#0FBC89', '#5F27CD', '#FF3838',
]
32
 
33
class KeywordExtractionManager:
    """Run PKE keyphrase extractors, with a frequency-based fallback.

    ``pke`` and ``spacy`` are imported lazily inside the methods that need
    them, so constructing the manager requires neither package.
    """

    # Substring of the model identifier -> class name in ``pke.unsupervised``.
    # The first matching entry wins, so the order is significant.
    _EXTRACTOR_CLASSES = (
        ('multipartiterank', 'MultipartiteRank'),
        ('singlerank', 'SingleRank'),
        ('tfidf', 'TfIdf'),
        ('topicrank', 'TopicRank'),
        ('textrank', 'TextRank'),
        ('positionrank', 'PositionRank'),
    )

    def __init__(self):
        # ``spacy_model`` is filled in by load_spacy_model(); nothing in this
        # class writes to ``pke_models``.
        self.pke_models = {}
        self.spacy_model = None

    def load_spacy_model(self):
        """Load spaCy model for preprocessing.

        Returns:
            The cached ``en_core_web_sm`` pipeline, or ``None`` when spaCy or
            the model cannot be loaded (a hint is printed in that case).
        """
        if self.spacy_model is not None:
            return self.spacy_model

        try:
            import spacy
        except Exception as e:
            print(f"Error loading spaCy model: {str(e)}")
            return None

        try:
            self.spacy_model = spacy.load("en_core_web_sm")
        except OSError:
            print("spaCy model not found. Please install with: python -m spacy download en_core_web_sm")
            return None
        except Exception as e:
            print(f"Error loading spaCy model: {str(e)}")
            return None

        print("✓ spaCy model loaded successfully")
        return self.spacy_model

    def extract_keywords(self, text, model_name, num_keywords=10, ngram_range=(1, 3), progress=None):
        """Extract keywords using the specified PKE model.

        Args:
            text: Document to analyse.
            model_name: Identifier containing one of the supported model
                names (e.g. ``'kw_pke_topicrank'``).
            num_keywords: Maximum number of keywords to return.
            ngram_range: ``(min, max)`` n-gram sizes. Only the upper bound is
                used, and only by the TF-IDF model.
            progress: Optional callable ``progress(fraction, desc=...)``.

        Returns:
            A list of ``{'keyword', 'score', 'model'}`` dicts. If ``pke`` is
            missing or extraction raises, the result comes from
            fallback_keyword_extraction() instead.
        """
        # UI sliders may deliver floats; pke slices and ranges need ints.
        num_keywords = int(num_keywords)
        max_ngram = int(ngram_range[1])

        try:
            import pke

            if progress:
                progress(0.3, desc="Loading model...")

            # Initialize the extractor based on model name
            for fragment, class_name in self._EXTRACTOR_CLASSES:
                if fragment in model_name:
                    extractor = getattr(pke.unsupervised, class_name)()
                    break
            else:
                raise ValueError(f"Unknown model: {model_name}")

            if progress:
                progress(0.5, desc="Processing text...")

            # Load the text
            extractor.load_document(input=text, language='en')

            # Select and weight candidates with per-model settings
            content_pos = {'NOUN', 'PROPN', 'ADJ'}
            if 'multipartiterank' in model_name:
                extractor.candidate_selection(pos=content_pos)
                extractor.candidate_weighting(alpha=1.1, threshold=0.75, method='average')
            elif 'topicrank' in model_name:
                extractor.candidate_selection(pos=content_pos)
                extractor.candidate_weighting(threshold=0.74, method='average')
            elif 'positionrank' in model_name:
                extractor.candidate_selection(maximum_word_number=3)
                extractor.candidate_weighting(window=10)
            elif 'tfidf' in model_name:
                # NOTE(review): ``stoplist=['en']`` is passed through unchanged;
                # confirm the installed pke version accepts this argument and
                # that a one-item list is what was intended.
                extractor.candidate_selection(n=max_ngram, stoplist=['en'])
                extractor.candidate_weighting()
            else:
                # SingleRank and TextRank
                extractor.candidate_selection(pos=content_pos)
                extractor.candidate_weighting(window=10)

            if progress:
                progress(0.7, desc="Extracting keywords...")

            model_label = model_name.replace('kw_pke_', '').title()
            return [
                {'keyword': keyword, 'score': score, 'model': model_label}
                for keyword, score in extractor.get_n_best(n=num_keywords)
            ]

        except ImportError:
            print("PKE library not found. Using fallback keyword extraction...")
            return self.fallback_keyword_extraction(text, num_keywords)
        except Exception as e:
            # Deliberate best-effort: any extractor failure degrades to the
            # simple frequency-based fallback instead of failing the request.
            print(f"Error with {model_name}: {str(e)}")
            return self.fallback_keyword_extraction(text, num_keywords)

    def fallback_keyword_extraction(self, text, num_keywords=10):
        """Simple fallback keyword extraction using basic statistics.

        Words are lower-cased, stop words and words of three letters or fewer
        are dropped, and the remainder is ranked by relative frequency.

        Returns:
            A list of ``{'keyword', 'score', 'model'}`` dicts where ``score``
            is the word's share of all retained words; empty when no word
            survives filtering.
        """
        import re
        from collections import Counter

        # Unicode-aware letter runs, so accented words such as "Galápagos"
        # stay whole instead of being split at the non-ASCII letter.
        words = re.findall(r'\b[^\W\d_]+\b', text.lower())

        # Remove common stop words
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                      'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
                      'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
                      'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that',
                      'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'}

        filtered_words = [w for w in words if w not in stop_words and len(w) > 3]
        if not filtered_words:
            return []

        total = len(filtered_words)
        # The label says TFIDF but the score is a plain relative frequency;
        # the string is kept as-is because it is shown in the results table.
        return [
            {'keyword': word, 'score': freq / total, 'model': 'Fallback-TFIDF'}
            for word, freq in Counter(filtered_words).most_common(int(num_keywords))
        ]
157
 
158
def get_score_color(score, max_score):
    """Get color based on score relative to max score.

    Args:
        score: Score of the keyword being colored.
        max_score: Highest score in the result set.

    Returns:
        The ``SCORE_COLORS`` entry for the band the ratio falls in:
        ``'high'`` at 0.7 and above, ``'medium'`` at 0.4 and above,
        ``'low'`` otherwise. A ``max_score`` of 0 yields ``'medium'``.
    """
    # Avoid dividing by zero when every score is 0.
    if max_score == 0:
        return SCORE_COLORS['medium']

    relative_score = score / max_score
    if relative_score >= 0.7:
        return SCORE_COLORS['high']
    if relative_score >= 0.4:
        return SCORE_COLORS['medium']
    return SCORE_COLORS['low']
170
 
171
def create_highlighted_html(text, keywords):
    """Create HTML with highlighted keywords in the text.

    Args:
        text: The raw input text.
        keywords: List of dicts with ``'keyword'`` and ``'score'`` entries.

    Returns:
        An HTML fragment. With no keywords the escaped text is wrapped in a
        plain box; otherwise each whole-word, case-insensitive keyword match
        is wrapped in a colored ``<span>`` whose tooltip shows the score.
    """
    import html  # local import: not part of the module's import block

    if not keywords:
        return (
            "<div style='padding: 15px; border: 1px solid #ddd; border-radius: 5px; "
            f"background-color: #fafafa;'><p>{html.escape(text)}</p></div>"
        )

    # Get max score for color scaling
    max_score = max(k['score'] for k in keywords)

    # One entry per distinct keyword (case-insensitive), keeping its best score.
    best_score = {}
    for kw_data in keywords:
        key = kw_data['keyword'].lower()
        if key and kw_data['score'] > best_score.get(key, float('-inf')):
            best_score[key] = kw_data['score']

    # Longest first, so a phrase wins over a shorter keyword it contains.
    ordered = sorted(best_score, key=len, reverse=True)

    if ordered:
        # A single pass over the ORIGINAL text: matches can never land inside
        # markup inserted for an earlier keyword. Each alternative gets its own
        # group so ``lastindex`` identifies which keyword matched.
        pattern = re.compile(
            r'(?<!\w)(?:' + '|'.join(f'({re.escape(k)})' for k in ordered) + r')(?!\w)',
            re.IGNORECASE,
        )
        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            score = best_score[ordered[match.lastindex - 1]]
            color = get_score_color(score, max_score)
            pieces.append(html.escape(text[cursor:match.start()]))
            # Show the text as the author wrote it (original casing), escaped.
            pieces.append(
                f'<span style="background-color: {color}; padding: 2px 4px; '
                f'border-radius: 3px; margin: 0 1px; '
                f'border: 1px solid {color}; color: white; font-weight: bold;" '
                f'title="Score: {score:.3f}">{html.escape(match.group(0))}</span>'
            )
            cursor = match.end()
        pieces.append(html.escape(text[cursor:]))
        highlighted_text = ''.join(pieces)
    else:
        highlighted_text = html.escape(text)

    return f"""
    <div style='padding: 15px; border: 2px solid #ddd; border-radius: 8px; background-color: #fafafa; margin: 10px 0;'>
        <h4 style='margin: 0 0 15px 0; color: #333;'>📝 Text with Highlighted Keywords</h4>
        <div style='line-height: 1.8; font-size: 16px; background-color: white; padding: 15px; border-radius: 5px;'>{highlighted_text}</div>
    </div>
    """
206
 
207
def create_keyword_table_html(keywords):
    """Create HTML table for keywords.

    Args:
        keywords: List of dicts with ``'keyword'``, ``'score'`` and
            ``'model'`` entries.

    Returns:
        An HTML fragment: a scrollable table with one row per keyword sorted
        by descending score, or a "No keywords found." paragraph when
        ``keywords`` is empty.
    """
    import html  # local import: not part of the module's import block

    if not keywords:
        return "<p style='text-align: center; padding: 20px;'>No keywords found.</p>"

    # Sort by score
    sorted_keywords = sorted(keywords, key=lambda x: x['score'], reverse=True)
    max_score = sorted_keywords[0]['score']

    parts = ["""
    <div style='max-height: 600px; overflow-y: auto; border: 2px solid #ddd; border-radius: 8px; padding: 20px; background-color: #fafafa;'>
        <h3 style="margin: 0 0 20px 0;">🎯 Extracted Keywords</h3>
        <table style="width: 100%; border-collapse: collapse; border: 1px solid #ddd; background-color: white;">
            <thead>
                <tr style="background-color: #4ECDC4; color: white;">
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Rank</th>
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Keyword</th>
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Score</th>
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Relevance</th>
                    <th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Model</th>
                </tr>
            </thead>
            <tbody>
    """]

    for rank, kw_data in enumerate(sorted_keywords, start=1):
        score = kw_data['score']
        color = get_score_color(score, max_score)

        # Relevance bar: score as a percentage of the top score, kept in 0-100.
        bar_width = int((score / max_score) * 100) if max_score > 0 else 0
        bar_width = max(0, min(100, bar_width))
        relevance_bar = f"""
        <div style="width: 100%; background-color: #e0e0e0; border-radius: 10px; height: 20px;">
            <div style="width: {bar_width}%; background-color: {color}; height: 100%; border-radius: 10px;"></div>
        </div>
        """

        # Keyword text comes from the user's document, so escape it.
        parts.append(f"""
                <tr style="background-color: #fff;">
                    <td style="padding: 10px; border: 1px solid #ddd; text-align: center; font-weight: bold;">#{rank}</td>
                    <td style="padding: 10px; border: 1px solid #ddd; font-weight: bold;">{html.escape(str(kw_data['keyword']))}</td>
                    <td style="padding: 10px; border: 1px solid #ddd;">
                        <span style="color: {color}; font-weight: bold;">{score:.4f}</span>
                    </td>
                    <td style="padding: 10px; border: 1px solid #ddd;">{relevance_bar}</td>
                    <td style="padding: 10px; border: 1px solid #ddd;">
                        <span style='background-color: #007bff; color: white; padding: 2px 6px; border-radius: 10px; font-size: 11px;'>
                            {html.escape(str(kw_data['model']))}
                        </span>
                    </td>
                </tr>
        """)

    parts.append("""
            </tbody>
        </table>
    </div>
    """)

    return ''.join(parts)
267
 
268
def create_legend_html():
    """Create a legend showing score colors.

    Returns:
        A static HTML fragment with one colored pill per relevance band.
        The colors and thresholds are written out literally here and mirror
        the values used by ``SCORE_COLORS`` / ``get_score_color``.
    """
    # "<" is written as "&lt;" so the low band is valid HTML text.
    legend = """
    <div style='margin: 15px 0; padding: 15px; background-color: #f8f9fa; border-radius: 8px;'>
        <h4 style='margin: 0 0 15px 0;'>🎨 Relevance Score Legend</h4>
        <div style='display: flex; flex-wrap: wrap; gap: 15px;'>
            <span style='background-color: #00B894; padding: 4px 12px; border-radius: 15px; color: white; font-weight: bold;'>
                High Relevance (70%+)
            </span>
            <span style='background-color: #F9CA24; padding: 4px 12px; border-radius: 15px; color: white; font-weight: bold;'>
                Medium Relevance (40-70%)
            </span>
            <span style='background-color: #FF6B6B; padding: 4px 12px; border-radius: 15px; color: white; font-weight: bold;'>
                Low Relevance (&lt;40%)
            </span>
        </div>
    </div>
    """
    return legend
287
 
288
# Initialize the keyword extraction manager
keyword_manager = KeywordExtractionManager()


def process_text(text, selected_model, num_keywords, ngram_min, ngram_max, progress=gr.Progress()):
    """Main processing function for Gradio interface with progress tracking.

    Args:
        text: Text entered by the user.
        selected_model: Model identifier chosen in the dropdown.
        num_keywords: Maximum number of keywords to extract.
        ngram_min: Lower n-gram bound from the UI.
        ngram_max: Upper n-gram bound from the UI.
        progress: Gradio progress tracker, called as ``progress(fraction, desc=...)``.

    Returns:
        A ``(summary_markdown, highlighted_html, results_html)`` tuple. When
        there is no input or no keywords, the first item is an error message
        and the other two are empty strings.
    """
    # ``text`` can be None when the textbox was never touched.
    if not text or not text.strip():
        return "❌ Please enter some text to analyse", "", ""

    # Sliders may deliver floats, and the two n-gram sliders can be crossed.
    num_keywords = int(num_keywords)
    ngram_min, ngram_max = sorted((int(ngram_min), int(ngram_max)))

    progress(0.1, desc="Initialising...")

    # Extract keywords
    progress(0.2, desc="Extracting keywords...")
    keywords = keyword_manager.extract_keywords(
        text,
        selected_model,
        num_keywords=num_keywords,
        ngram_range=(ngram_min, ngram_max),
        progress=progress,
    )

    if not keywords:
        return "❌ No keywords found. Try adjusting the parameters.", "", ""

    progress(0.8, desc="Processing results...")

    # Create outputs
    legend_html = create_legend_html()
    highlighted_html = create_highlighted_html(text, keywords)
    results_html = create_keyword_table_html(keywords)

    progress(0.9, desc="Creating summary...")

    # Report the extractor that actually produced the results: if extraction
    # fell back to the frequency method, the keywords carry its label rather
    # than the model the user selected.
    model_label = keywords[0]['model']
    avg_score = sum(k['score'] for k in keywords) / len(keywords)
    summary = "\n".join([
        "## 📊 Analysis Summary",
        f"- **Keywords extracted:** {len(keywords)}",
        f"- **Model used:** {model_label}",
        f"- **Average relevance score:** {avg_score:.4f}",
        f"- **N-gram range:** {ngram_min}-{ngram_max} words",
    ])

    progress(1.0, desc="Complete!")

    return summary, legend_html + highlighted_html, results_html
333
 
334
+ # Create Gradio interface
335
def create_interface():
    """Build the Gradio Blocks UI for the keyword extraction tool.

    The layout is: intro text and a tip box, the text input next to the
    model/parameter controls, a collapsible model description panel, the
    extract button, three output areas (summary, highlighted text, results
    table), worked examples, and documentation/credit footers.

    Returns:
        The constructed ``gr.Blocks`` app (not launched).
    """
    with gr.Blocks(title="Keyword Extraction Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # Keyword Extraction Explorer Tool

        Extract the most important keywords and phrases from your text using various algorithms! This tool uses PKE (Python Keyphrase Extraction) models for comprehensive keyword extraction.

        ### How to use:
        1. **📝 Enter your text** in the text area below
        2. **🎯 Select a model** from the dropdown for keyword extraction
        3. **⚙️ Adjust parameters** (number of keywords, n-gram range)
        4. **🔍 Click "Extract Keywords"** to see results with organized output
        """)

        # Add tip box
        gr.HTML("""
        <div style="background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; padding: 12px; margin: 15px 0;">
            <strong style="color: #856404;">💡 Top tip:</strong> Different models excel at different types of texts - experiment to find the best one for your content!
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="📝 Text to Analyse",
                    placeholder="Enter your text here...",
                    lines=6,
                    max_lines=10,
                )

            with gr.Column(scale=1):
                # Model selector
                model_dropdown = gr.Dropdown(
                    choices=list(PKE_MODELS.keys()),
                    value='kw_pke_multipartiterank',
                    label="🎯 Select Keyword Extraction Model",
                )

                # Parameters
                num_keywords = gr.Slider(
                    minimum=5,
                    maximum=30,
                    value=10,
                    step=1,
                    label="📊 Number of Keywords",
                )

                with gr.Row():
                    ngram_min = gr.Slider(
                        minimum=1,
                        maximum=3,
                        value=1,
                        step=1,
                        label="Min N-gram",
                    )
                    ngram_max = gr.Slider(
                        minimum=1,
                        maximum=4,
                        value=3,
                        step=1,
                        label="Max N-gram",
                    )

        # Add model descriptions
        gr.HTML("""
        <details style="margin: 20px 0; padding: 10px; background-color: #f8f9fa; border-radius: 8px; border: 1px solid #ddd;">
            <summary style="cursor: pointer; font-weight: bold; padding: 5px; color: #1976d2;">
                ℹ️ Model Descriptions
            </summary>
            <div style="margin-top: 10px; padding: 10px;">
                <dl style="margin: 0; font-size: 14px;">
                    <div style="margin-bottom: 8px;">
                        <dt style="font-weight: bold; display: inline; color: #4ECDC4;">MultipartiteRank:</dt>
                        <dd style="display: inline; margin-left: 5px;">Graph-based ranking using topic clustering - excellent for diverse texts</dd>
                    </div>
                    <div style="margin-bottom: 8px;">
                        <dt style="font-weight: bold; display: inline; color: #45B7D1;">SingleRank:</dt>
                        <dd style="display: inline; margin-left: 5px;">Simple graph-based algorithm - fast and effective</dd>
                    </div>
                    <div style="margin-bottom: 8px;">
                        <dt style="font-weight: bold; display: inline; color: #F9CA24;">TF-IDF:</dt>
                        <dd style="display: inline; margin-left: 5px;">Statistical approach - good for technical texts</dd>
                    </div>
                    <div style="margin-bottom: 8px;">
                        <dt style="font-weight: bold; display: inline; color: #6C5CE7;">TopicRank:</dt>
                        <dd style="display: inline; margin-left: 5px;">Groups similar candidates - reduces redundancy</dd>
                    </div>
                    <div style="margin-bottom: 8px;">
                        <dt style="font-weight: bold; display: inline; color: #00B894;">TextRank:</dt>
                        <dd style="display: inline; margin-left: 5px;">Classic PageRank-inspired algorithm</dd>
                    </div>
                    <div style="margin-bottom: 8px;">
                        <dt style="font-weight: bold; display: inline; color: #E17055;">PositionRank:</dt>
                        <dd style="display: inline; margin-left: 5px;">Incorporates word positions - good for structured documents</dd>
                    </div>
                </dl>
            </div>
        </details>
        """)

        extract_btn = gr.Button("🔍 Extract Keywords", variant="primary", size="lg")

        # Output sections
        with gr.Row():
            summary_output = gr.Markdown(label="Summary")

        with gr.Row():
            highlighted_output = gr.HTML(label="Highlighted Text")

        # Results section
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📋 Detailed Results")
                results_output = gr.HTML(label="Keyword Results")

        # The button and the examples feed the same five controls, in the
        # order process_text expects its arguments.
        control_inputs = [
            text_input,
            model_dropdown,
            num_keywords,
            ngram_min,
            ngram_max,
        ]

        # Connect the button to the processing function
        extract_btn.click(
            fn=process_text,
            inputs=control_inputs,
            outputs=[summary_output, highlighted_output, results_output],
        )

        gr.Examples(
            examples=[
                [
                    "On June 6, 1944, Allied forces launched Operation Overlord, the invasion of Normandy. General Dwight D. Eisenhower commanded the operation, while Field Marshal Bernard Montgomery led ground forces. The BBC broadcast coded messages to the French Resistance, including the famous line 'The long sobs of autumn violins.'",
                    "kw_pke_multipartiterank",
                    10,
                    1,
                    3,
                ],
                [
                    "In Jane Austen's 'Pride and Prejudice', Elizabeth Bennet first meets Mr. Darcy at the Meryton assembly. The novel, published in 1813, explores themes of marriage and social class in Regency England. Austen wrote to her sister Cassandra about the manuscript while staying at Chawton Cottage.",
                    "kw_pke_topicrank",
                    10,
                    1,
                    3,
                ],
                [
                    "Charles Darwin arrived at the Galápagos Islands aboard HMS Beagle in September 1835. During his five-week visit, Darwin collected specimens of finches, tortoises, and mockingbirds. His observations of these species' variations across different islands later contributed to his theory of evolution by natural selection, published in 'On the Origin of Species' in 1859.",
                    "kw_pke_textrank",
                    10,
                    1,
                    3,
                ],
            ],
            inputs=control_inputs,
        )

        # Add model information links
        gr.HTML("""
        <hr style="margin-top: 40px; margin-bottom: 20px;">
        <div style="background-color: #f8f9fa; padding: 20px; border-radius: 8px; margin-top: 20px;">
            <h4 style="margin-top: 0;">📚 Model Information &amp; Documentation</h4>
            <p style="font-size: 14px; margin-bottom: 15px;">Learn more about the algorithms used in this tool:</p>
            <ul style="font-size: 14px; line-height: 1.8;">
                <li><strong>PKE Library:</strong>
                    <a href="https://github.com/boudinfl/pke" target="_blank" style="color: #1976d2;">
                        Python Keyphrase Extraction (PKE) GitHub ↗
                    </a>
                </li>
                <li><strong>Algorithm Papers:</strong>
                    <a href="https://boudinfl.github.io/pke/" target="_blank" style="color: #1976d2;">
                        PKE Documentation &amp; References ↗
                    </a>
                </li>
            </ul>
        </div>

        <br>
        <hr style="margin-top: 40px; margin-bottom: 20px;">
        <div style="background-color: #f8f9fa; padding: 20px; border-radius: 8px; margin-top: 20px; text-align: center;">
            <p style="font-size: 14px; line-height: 1.8; margin: 0;">
                This <strong>Keyword Extraction Explorer Tool</strong> was created as part of the
                <a href="https://digitalscholarship.web.ox.ac.uk/" target="_blank" style="color: #1976d2;">
                    Digital Scholarship at Oxford (DiSc)
                </a>
                funded research project:
                <em>Extracting Keywords from Crowdsourced Collections</em>.
            </p>
        </div>
        """)

    return demo
531