Update app.py
Browse files
app.py
CHANGED
|
@@ -2,18 +2,18 @@ import gradio as gr
|
|
| 2 |
import re
|
| 3 |
import pandas as pd
|
| 4 |
|
| 5 |
-
def build_keywords_dict(primary_inputs,
|
| 6 |
-
"""Build keyword dictionary from separate primary and
|
| 7 |
keywords_dict = {}
|
| 8 |
|
| 9 |
-
for primary,
|
| 10 |
if primary and primary.strip(): # Only process if primary keyword exists
|
| 11 |
primary_clean = primary.strip()
|
| 12 |
-
if
|
| 13 |
-
|
| 14 |
else:
|
| 15 |
-
|
| 16 |
-
keywords_dict[primary_clean] =
|
| 17 |
|
| 18 |
return keywords_dict
|
| 19 |
|
|
@@ -24,8 +24,8 @@ def find_keywords(story, keywords_dict):
|
|
| 24 |
|
| 25 |
found_keywords = set()
|
| 26 |
|
| 27 |
-
# Search for each primary keyword and its
|
| 28 |
-
for primary_keyword,
|
| 29 |
keyword_group_found = False
|
| 30 |
|
| 31 |
# Check primary keyword
|
|
@@ -37,19 +37,19 @@ def find_keywords(story, keywords_dict):
|
|
| 37 |
if re.search(pattern, story, re.IGNORECASE):
|
| 38 |
keyword_group_found = True
|
| 39 |
|
| 40 |
-
# Check each
|
| 41 |
-
for
|
| 42 |
-
if
|
| 43 |
if ' US ' in story or story.startswith('US ') or story.endswith(' US'):
|
| 44 |
keyword_group_found = True
|
| 45 |
else:
|
| 46 |
-
if re.search(r'\b' + re.escape(
|
| 47 |
keyword_group_found = True
|
| 48 |
|
| 49 |
# If any keyword from this group was found, add ALL keywords from the group
|
| 50 |
if keyword_group_found:
|
| 51 |
found_keywords.add(primary_keyword) # Always include the primary
|
| 52 |
-
found_keywords.update(
|
| 53 |
|
| 54 |
return '; '.join(sorted(found_keywords))
|
| 55 |
|
|
@@ -83,15 +83,15 @@ def create_keyword_results_table(found_keywords_str, keywords_dict, input_text):
|
|
| 83 |
|
| 84 |
# Group keywords by their primary category
|
| 85 |
keyword_groups = {}
|
| 86 |
-
for primary,
|
| 87 |
found_in_group = []
|
| 88 |
# Check if primary keyword was found
|
| 89 |
if primary in found_keywords:
|
| 90 |
found_in_group.append(primary)
|
| 91 |
-
# Check if any
|
| 92 |
-
for
|
| 93 |
-
if
|
| 94 |
-
found_in_group.append(
|
| 95 |
|
| 96 |
if found_in_group:
|
| 97 |
keyword_groups[primary] = found_in_group
|
|
@@ -189,15 +189,15 @@ def create_keyword_results_table(found_keywords_str, keywords_dict, input_text):
|
|
| 189 |
|
| 190 |
return table_html
|
| 191 |
|
| 192 |
-
def process_text(input_text, primary1,
|
| 193 |
"""Main processing function with added results table"""
|
| 194 |
if not input_text.strip():
|
| 195 |
return "Please enter some text to analyse", "", "", "No keywords found"
|
| 196 |
|
| 197 |
# Build keywords dictionary from separate inputs
|
| 198 |
primary_inputs = [primary1, primary2, primary3, primary4, primary5]
|
| 199 |
-
|
| 200 |
-
keywords_dict = build_keywords_dict(primary_inputs,
|
| 201 |
|
| 202 |
if not keywords_dict:
|
| 203 |
return "Please enter at least one primary keyword", "", "", "No keyword dictionary provided"
|
|
@@ -222,7 +222,7 @@ def process_text(input_text, primary1, synonyms1, primary2, synonyms2, primary3,
|
|
| 222 |
**Matched Keywords:** {found_keywords_str}
|
| 223 |
**Keyword Dictionary Stats:**
|
| 224 |
- Primary keywords loaded: {len(keywords_dict)}
|
| 225 |
-
- Total searchable terms: {sum(len(
|
| 226 |
**Copy this result to your spreadsheet:**
|
| 227 |
{found_keywords_str}
|
| 228 |
"""
|
|
@@ -234,15 +234,15 @@ def create_interface():
|
|
| 234 |
# theme stays in gr.Blocks(), ssr_mode goes in launch()
|
| 235 |
with gr.Blocks(title="Keyword Tagging Tool", theme=gr.themes.Soft()) as demo:
|
| 236 |
gr.HTML("""
|
| 237 |
-
<h1>Controlled
|
| 238 |
|
| 239 |
-
<p>This tool demonstrates how a simple python script can be used to extract keywords from text using a controlled vocabulary of primary keywords and associated
|
| 240 |
</p>
|
| 241 |
|
| 242 |
<h2>How to use this tool:</h2>
|
| 243 |
<ol>
|
| 244 |
<li>π <strong>Enter your text</strong> in the left panel</li>
|
| 245 |
-
<li>π <strong>Define your keyword dictionary</strong> in the right panel - enter primary keywords and their
|
| 246 |
<li>π <strong>Click "Find Keywords"</strong> to see results</li>
|
| 247 |
<li>π <strong>Copy the results</strong> to paste into your spreadsheet</li>
|
| 248 |
</ol>
|
|
@@ -258,32 +258,32 @@ def create_interface():
|
|
| 258 |
)
|
| 259 |
|
| 260 |
with gr.Column(scale=1):
|
| 261 |
-
gr.Markdown("**Keyword Dictionary** - Enter primary keywords and their
|
| 262 |
|
| 263 |
# Row 1
|
| 264 |
with gr.Row():
|
| 265 |
primary1 = gr.Textbox(label="Primary Keyword 1", placeholder="e.g., Prisoner of War", scale=1)
|
| 266 |
-
|
| 267 |
|
| 268 |
# Row 2
|
| 269 |
with gr.Row():
|
| 270 |
primary2 = gr.Textbox(label="Primary Keyword 2", placeholder="e.g., United States", scale=1)
|
| 271 |
-
|
| 272 |
|
| 273 |
# Row 3
|
| 274 |
with gr.Row():
|
| 275 |
primary3 = gr.Textbox(label="Primary Keyword 3", placeholder="e.g., University", scale=1)
|
| 276 |
-
|
| 277 |
|
| 278 |
# Row 4
|
| 279 |
with gr.Row():
|
| 280 |
primary4 = gr.Textbox(label="Primary Keyword 4", placeholder="Optional", scale=1)
|
| 281 |
-
|
| 282 |
|
| 283 |
# Row 5
|
| 284 |
with gr.Row():
|
| 285 |
primary5 = gr.Textbox(label="Primary Keyword 5", placeholder="Optional", scale=1)
|
| 286 |
-
|
| 287 |
|
| 288 |
# Full width Find Keywords button
|
| 289 |
with gr.Row():
|
|
@@ -335,7 +335,7 @@ def create_interface():
|
|
| 335 |
|
| 336 |
gr.Examples(
|
| 337 |
examples=[example1, example2],
|
| 338 |
-
inputs=[text_input, primary1,
|
| 339 |
label="Click an example to try it out"
|
| 340 |
)
|
| 341 |
|
|
@@ -351,7 +351,7 @@ def create_interface():
|
|
| 351 |
# Button functions
|
| 352 |
find_btn.click(
|
| 353 |
fn=process_text,
|
| 354 |
-
inputs=[text_input, primary1,
|
| 355 |
outputs=[results_output, highlighted_output, results_table_output, copy_output]
|
| 356 |
)
|
| 357 |
|
|
@@ -362,7 +362,7 @@ def create_interface():
|
|
| 362 |
|
| 363 |
clear_dict_btn.click(
|
| 364 |
fn=clear_dictionary_only,
|
| 365 |
-
outputs=[primary1,
|
| 366 |
)
|
| 367 |
|
| 368 |
# Instructions
|
|
@@ -371,13 +371,13 @@ def create_interface():
|
|
| 371 |
|
| 372 |
**How to enter keywords:**
|
| 373 |
- **Primary Keyword:** Enter the main/preferred term for a concept
|
| 374 |
-
- **
|
| 375 |
- Leave rows blank if you don't need all 5 keyword groups
|
| 376 |
- The tool will find ANY of these terms and return ALL related terms
|
| 377 |
|
| 378 |
**Example:**
|
| 379 |
- Primary: `Prisoner of War`
|
| 380 |
-
-
|
| 381 |
|
| 382 |
**Special Handling:**
|
| 383 |
- "US" is matched exactly to avoid confusion with the word "us"
|
|
@@ -385,7 +385,7 @@ def create_interface():
|
|
| 385 |
- Results are alphabetised and deduplicated
|
| 386 |
|
| 387 |
**How it works:**
|
| 388 |
-
When ANY variant is found in your text (primary OR
|
| 389 |
""")
|
| 390 |
|
| 391 |
# Bottom horizontal line and footer
|
|
|
|
| 2 |
import re
|
| 3 |
import pandas as pd
|
| 4 |
|
| 5 |
+
def build_keywords_dict(primary_inputs, associated_inputs):
    """Build the keyword dictionary from paired primary / associated-word inputs.

    Args:
        primary_inputs: Iterable of primary keyword strings. Entries that are
            ``None``, empty, or whitespace-only are skipped.
        associated_inputs: Iterable of semicolon-separated associated-word
            strings, paired positionally with ``primary_inputs``. An entry may
            be ``None`` or blank, meaning "no associated words".

    Returns:
        A dict mapping each stripped primary keyword to a list of its
        stripped, non-empty associated words in first-seen order.
    """
    keywords_dict = {}

    for primary, associated in zip(primary_inputs, associated_inputs):
        primary_clean = primary.strip() if primary else ""
        if not primary_clean:
            # Only process rows where a primary keyword exists.
            continue

        # Reuse the existing list when the same primary appears in more than
        # one row, so a later row extends the group instead of replacing it.
        terms = keywords_dict.setdefault(primary_clean, [])

        for term in (associated or "").split(";"):
            term = term.strip()
            # Skip empty fragments (e.g. "a;;b") and exact repeats so the
            # term count and per-group listings are not inflated.
            if term and term not in terms:
                terms.append(term)

    return keywords_dict
|
| 19 |
|
|
|
|
| 24 |
|
| 25 |
found_keywords = set()
|
| 26 |
|
| 27 |
+
# Search for each primary keyword and its associated words
|
| 28 |
+
for primary_keyword, associated_words in keywords_dict.items():
|
| 29 |
keyword_group_found = False
|
| 30 |
|
| 31 |
# Check primary keyword
|
|
|
|
| 37 |
if re.search(pattern, story, re.IGNORECASE):
|
| 38 |
keyword_group_found = True
|
| 39 |
|
| 40 |
+
# Check each associated word
|
| 41 |
+
for associated in associated_words:
|
| 42 |
+
if associated.upper() == "US":
|
| 43 |
if ' US ' in story or story.startswith('US ') or story.endswith(' US'):
|
| 44 |
keyword_group_found = True
|
| 45 |
else:
|
| 46 |
+
if re.search(r'\b' + re.escape(associated) + r'\b', story, re.IGNORECASE):
|
| 47 |
keyword_group_found = True
|
| 48 |
|
| 49 |
# If any keyword from this group was found, add ALL keywords from the group
|
| 50 |
if keyword_group_found:
|
| 51 |
found_keywords.add(primary_keyword) # Always include the primary
|
| 52 |
+
found_keywords.update(associated_words) # Add all associated words
|
| 53 |
|
| 54 |
return '; '.join(sorted(found_keywords))
|
| 55 |
|
|
|
|
| 83 |
|
| 84 |
# Group keywords by their primary category
|
| 85 |
keyword_groups = {}
|
| 86 |
+
for primary, associated_words in keywords_dict.items():
|
| 87 |
found_in_group = []
|
| 88 |
# Check if primary keyword was found
|
| 89 |
if primary in found_keywords:
|
| 90 |
found_in_group.append(primary)
|
| 91 |
+
# Check if any associated words were found
|
| 92 |
+
for associated in associated_words:
|
| 93 |
+
if associated in found_keywords:
|
| 94 |
+
found_in_group.append(associated)
|
| 95 |
|
| 96 |
if found_in_group:
|
| 97 |
keyword_groups[primary] = found_in_group
|
|
|
|
| 189 |
|
| 190 |
return table_html
|
| 191 |
|
| 192 |
+
def process_text(input_text, primary1, associated1, primary2, associated2, primary3, associated3, primary4, associated4, primary5, associated5):
|
| 193 |
"""Main processing function with added results table"""
|
| 194 |
if not input_text.strip():
|
| 195 |
return "Please enter some text to analyse", "", "", "No keywords found"
|
| 196 |
|
| 197 |
# Build keywords dictionary from separate inputs
|
| 198 |
primary_inputs = [primary1, primary2, primary3, primary4, primary5]
|
| 199 |
+
associated_inputs = [associated1, associated2, associated3, associated4, associated5]
|
| 200 |
+
keywords_dict = build_keywords_dict(primary_inputs, associated_inputs)
|
| 201 |
|
| 202 |
if not keywords_dict:
|
| 203 |
return "Please enter at least one primary keyword", "", "", "No keyword dictionary provided"
|
|
|
|
| 222 |
**Matched Keywords:** {found_keywords_str}
|
| 223 |
**Keyword Dictionary Stats:**
|
| 224 |
- Primary keywords loaded: {len(keywords_dict)}
|
| 225 |
+
- Total searchable terms: {sum(len(associated) + 1 for associated in keywords_dict.values())}
|
| 226 |
**Copy this result to your spreadsheet:**
|
| 227 |
{found_keywords_str}
|
| 228 |
"""
|
|
|
|
| 234 |
# theme stays in gr.Blocks(), ssr_mode goes in launch()
|
| 235 |
with gr.Blocks(title="Keyword Tagging Tool", theme=gr.themes.Soft()) as demo:
|
| 236 |
gr.HTML("""
|
| 237 |
+
<h1>Controlled Vocabulary Keyword Tagging Tool</h1>
|
| 238 |
|
| 239 |
+
<p>This tool demonstrates how a simple python script can be used to extract keywords from text using a controlled vocabulary of primary keywords and associated words (abbreviations, alternate spellings, or related concepts).
|
| 240 |
</p>
|
| 241 |
|
| 242 |
<h2>How to use this tool:</h2>
|
| 243 |
<ol>
|
| 244 |
<li>π <strong>Enter your text</strong> in the left panel</li>
|
| 245 |
+
<li>π <strong>Define your keyword dictionary</strong> in the right panel - enter primary keywords and their associated words</li>
|
| 246 |
<li>π <strong>Click "Find Keywords"</strong> to see results</li>
|
| 247 |
<li>π <strong>Copy the results</strong> to paste into your spreadsheet</li>
|
| 248 |
</ol>
|
|
|
|
| 258 |
)
|
| 259 |
|
| 260 |
with gr.Column(scale=1):
|
| 261 |
+
gr.Markdown("**Keyword Dictionary** - Enter primary keywords and their associated words:")
|
| 262 |
|
| 263 |
# Row 1
|
| 264 |
with gr.Row():
|
| 265 |
primary1 = gr.Textbox(label="Primary Keyword 1", placeholder="e.g., Prisoner of War", scale=1)
|
| 266 |
+
associated1 = gr.Textbox(label="Associated Words 1", placeholder="e.g., POW; POWs; prisoner of war", scale=2)
|
| 267 |
|
| 268 |
# Row 2
|
| 269 |
with gr.Row():
|
| 270 |
primary2 = gr.Textbox(label="Primary Keyword 2", placeholder="e.g., United States", scale=1)
|
| 271 |
+
associated2 = gr.Textbox(label="Associated Words 2", placeholder="e.g., USA; US; America", scale=2)
|
| 272 |
|
| 273 |
# Row 3
|
| 274 |
with gr.Row():
|
| 275 |
primary3 = gr.Textbox(label="Primary Keyword 3", placeholder="e.g., University", scale=1)
|
| 276 |
+
associated3 = gr.Textbox(label="Associated Words 3", placeholder="e.g., university; institution; college", scale=2)
|
| 277 |
|
| 278 |
# Row 4
|
| 279 |
with gr.Row():
|
| 280 |
primary4 = gr.Textbox(label="Primary Keyword 4", placeholder="Optional", scale=1)
|
| 281 |
+
associated4 = gr.Textbox(label="Associated Words 4", placeholder="Optional", scale=2)
|
| 282 |
|
| 283 |
# Row 5
|
| 284 |
with gr.Row():
|
| 285 |
primary5 = gr.Textbox(label="Primary Keyword 5", placeholder="Optional", scale=1)
|
| 286 |
+
associated5 = gr.Textbox(label="Associated Words 5", placeholder="Optional", scale=2)
|
| 287 |
|
| 288 |
# Full width Find Keywords button
|
| 289 |
with gr.Row():
|
|
|
|
| 335 |
|
| 336 |
gr.Examples(
|
| 337 |
examples=[example1, example2],
|
| 338 |
+
inputs=[text_input, primary1, associated1, primary2, associated2, primary3, associated3, primary4, associated4, primary5, associated5],
|
| 339 |
label="Click an example to try it out"
|
| 340 |
)
|
| 341 |
|
|
|
|
| 351 |
# Button functions
|
| 352 |
find_btn.click(
|
| 353 |
fn=process_text,
|
| 354 |
+
inputs=[text_input, primary1, associated1, primary2, associated2, primary3, associated3, primary4, associated4, primary5, associated5],
|
| 355 |
outputs=[results_output, highlighted_output, results_table_output, copy_output]
|
| 356 |
)
|
| 357 |
|
|
|
|
| 362 |
|
| 363 |
clear_dict_btn.click(
|
| 364 |
fn=clear_dictionary_only,
|
| 365 |
+
outputs=[primary1, associated1, primary2, associated2, primary3, associated3, primary4, associated4, primary5, associated5]
|
| 366 |
)
|
| 367 |
|
| 368 |
# Instructions
|
|
|
|
| 371 |
|
| 372 |
**How to enter keywords:**
|
| 373 |
- **Primary Keyword:** Enter the main/preferred term for a concept
|
| 374 |
+
- **Associated Words:** Enter alternative terms separated by semicolons `;`
|
| 375 |
- Leave rows blank if you don't need all 5 keyword groups
|
| 376 |
- The tool will find ANY of these terms and return ALL related terms
|
| 377 |
|
| 378 |
**Example:**
|
| 379 |
- Primary: `Prisoner of War`
|
| 380 |
+
- Associated Words: `POW; POWs; prisoner of war`
|
| 381 |
|
| 382 |
**Special Handling:**
|
| 383 |
- "US" is matched exactly to avoid confusion with the word "us"
|
|
|
|
| 385 |
- Results are alphabetised and deduplicated
|
| 386 |
|
| 387 |
**How it works:**
|
| 388 |
+
When ANY variant is found in your text (primary OR associated word), the tool returns the complete standardised set of terms for that concept.
|
| 389 |
""")
|
| 390 |
|
| 391 |
# Bottom horizontal line and footer
|