Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Create the Gradio interface
|
| 2 |
def create_interface():
|
| 3 |
with gr.Blocks(title="Keyword Tagging Tool", theme=gr.themes.Soft()) as demo:
|
|
@@ -92,24 +323,6 @@ def create_interface():
|
|
| 92 |
with gr.Column(scale=1):
|
| 93 |
example2_btn = gr.Button("Load Example 2", variant="secondary", size="sm")
|
| 94 |
|
| 95 |
-
# Define example data
|
| 96 |
-
example1_data = [
|
| 97 |
-
"During World War II, many prisoners of war were held in camps across Europe. The Geneva Convention established rules for POW treatment. American soldiers and British troops were among those captured.",
|
| 98 |
-
"Prisoner of War", "POW; POWs; prisoner of war",
|
| 99 |
-
"World War II", "WWII; Second World War",
|
| 100 |
-
"United States", "USA; US; America; American",
|
| 101 |
-
"", "", "", ""
|
| 102 |
-
]
|
| 103 |
-
|
| 104 |
-
example2_data = [
|
| 105 |
-
"The University of Oxford is located in Oxford, England. Students from around the world study at this prestigious institution.",
|
| 106 |
-
"University", "university; institution; college",
|
| 107 |
-
"Oxford", "oxford",
|
| 108 |
-
"England", "england; English",
|
| 109 |
-
"Student", "student; students; pupils",
|
| 110 |
-
"", ""
|
| 111 |
-
]
|
| 112 |
-
|
| 113 |
# Clear functions
|
| 114 |
def clear_dictionary_only():
|
| 115 |
"""Clear only the keyword dictionary fields"""
|
|
@@ -121,10 +334,23 @@ def create_interface():
|
|
| 121 |
|
| 122 |
# Example loading functions
|
| 123 |
def load_example1():
|
| 124 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
def load_example2():
|
| 127 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
# Button functions
|
| 130 |
find_btn.click(
|
|
@@ -140,7 +366,7 @@ def create_interface():
|
|
| 140 |
|
| 141 |
clear_all_btn.click(
|
| 142 |
fn=clear_everything,
|
| 143 |
-
outputs=[text_input, primary1, synonyms1, primary2, synonyms2, primary3, synonyms3, primary4, synonyms4, primary5, synonyms5, results_output, highlighted_output, copy_output]
|
| 144 |
)
|
| 145 |
|
| 146 |
example1_btn.click(
|
|
@@ -186,4 +412,8 @@ def create_interface():
|
|
| 186 |
</div>
|
| 187 |
""")
|
| 188 |
|
| 189 |
-
return demo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import re
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
def build_keywords_dict(primary_inputs, synonym_inputs):
    """Build the keyword dictionary from parallel primary/synonym inputs.

    Args:
        primary_inputs: Iterable of primary keyword strings. Entries that are
            ``None``, empty, or whitespace-only are skipped together with
            their paired synonyms.
        synonym_inputs: Iterable of semicolon-separated synonym strings,
            paired positionally with ``primary_inputs`` (extra entries on
            either side are ignored, as with ``zip``).

    Returns:
        A dict mapping each stripped primary keyword to a list of its
        stripped, non-empty synonyms in first-seen order.
    """
    keywords_dict = {}

    for primary, synonyms in zip(primary_inputs, synonym_inputs):
        primary_clean = (primary or '').strip()
        if not primary_clean:
            # No primary keyword in this row: nothing to register.
            continue

        # setdefault (rather than plain assignment) so that a primary keyword
        # entered in two rows accumulates synonyms instead of the later row
        # silently discarding the earlier one.
        bucket = keywords_dict.setdefault(primary_clean, [])
        for raw_synonym in (synonyms or '').split(';'):
            synonym = raw_synonym.strip()
            if synonym and synonym not in bucket:
                bucket.append(synonym)

    return keywords_dict
|
| 19 |
+
|
| 20 |
+
def find_keywords(story, keywords_dict):
    """Find which keyword groups occur in the story text.

    A group is a primary keyword plus its synonyms. If any term of a group
    occurs in ``story`` as a whole word, every term of that group (primary
    and all synonyms) is included in the result.

    Matching is case-insensitive, except for the term "US" (in any casing),
    which only matches the upper-case token ``US`` so that the pronoun "us"
    does not trigger it.

    Args:
        story: Text to search. Non-string or empty values yield ``''``.
        keywords_dict: Mapping of primary keyword -> list of synonyms.

    Returns:
        The matched terms, sorted and joined with ``'; '``, or ``''`` when
        nothing matched.
    """
    if not story or not isinstance(story, str):
        return ''

    def term_in_story(term):
        if term.upper() == "US":
            # Case-sensitive whole-word match. A word-boundary regex (instead
            # of checking for surrounding spaces) also catches "US,", "US.",
            # "(US)" and a story that is exactly "US".
            return re.search(r'\bUS\b', story) is not None
        pattern = r'\b' + re.escape(term) + r'\b'
        return re.search(pattern, story, re.IGNORECASE) is not None

    found_keywords = set()

    for primary_keyword, synonyms in keywords_dict.items():
        # any() stops at the first hit; one hit is enough to pull in the group.
        if any(term_in_story(term) for term in (primary_keyword, *synonyms)):
            found_keywords.add(primary_keyword)
            found_keywords.update(synonyms)

    return '; '.join(sorted(found_keywords))
|
| 55 |
+
|
| 56 |
+
def highlight_keywords_in_text(text, keywords_list):
    """Render ``text`` as HTML with every keyword occurrence highlighted.

    The text is HTML-escaped, line breaks become ``<br>``, and each
    whole-word keyword match is wrapped in a colored ``<span>``. Matching is
    case-insensitive, except that the term "US" only matches upper-case
    ``US`` (consistent with the keyword search). The matched text keeps the
    casing it has in ``text``.

    Args:
        text: The raw input text (``None`` is treated as empty).
        keywords_list: Keywords to highlight. Empty entries are ignored;
            entries differing only by case share the first one's color.

    Returns:
        An HTML string.
    """
    # Local import so the function is self-contained without touching the
    # module's import block.
    from html import escape

    def to_html(fragment):
        # Escape first, then add <br>, so the inserted tags stay real markup.
        return escape(fragment).replace('\n', '<br>')

    text = text or ''
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#F9CA24', '#6C5CE7', '#A0E7E5', '#FD79A8', '#55A3FF', '#00B894', '#E17055']

    # Keep one entry per case-insensitive term; its color is decided by the
    # position of its first appearance in keywords_list.
    seen = set()
    term_colors = []
    for i, keyword in enumerate(keywords_list or []):
        if not keyword or keyword.lower() in seen:
            continue
        seen.add(keyword.lower())
        term_colors.append((keyword, colors[i % len(colors)]))

    if not term_colors:
        return to_html(text)

    # Longest terms first so the alternation prefers the most specific term
    # when two terms could start at the same position.
    term_colors.sort(key=lambda pair: len(pair[0]), reverse=True)

    alternatives = []
    for term, _ in term_colors:
        # "US" is matched case-sensitively via a scoped inline flag.
        body = r'(?-i:US)' if term.upper() == "US" else re.escape(term)
        # One capturing group per term: lastindex then identifies the term.
        alternatives.append('(' + body + ')')
    pattern = re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b', re.IGNORECASE)

    # Single pass over the ORIGINAL text: matches can never land inside
    # markup produced for an earlier keyword, so spans are never nested.
    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        color = term_colors[match.lastindex - 1][1]
        pieces.append(to_html(text[cursor:match.start()]))
        pieces.append(
            f'<span style="background-color: {color}; padding: 2px 4px; border-radius: 3px; color: white; font-weight: bold;">{to_html(match.group(0))}</span>'
        )
        cursor = match.end()
    pieces.append(to_html(text[cursor:]))

    return ''.join(pieces)
|
| 76 |
+
|
| 77 |
+
def create_keyword_results_table(found_keywords_str, keywords_dict, input_text):
    """Create an HTML table with per-group keyword results.

    One row is produced for each primary keyword that has at least one term
    listed in ``found_keywords_str``. A row shows the primary keyword, the
    group's listed terms, how many whole-word occurrences of those terms
    appear in ``input_text``, and a short context preview around the first
    occurrence.

    Matching is case-insensitive, except that the term "US" only matches
    upper-case ``US``. All user-supplied text is HTML-escaped.

    Args:
        found_keywords_str: ``'; '``-joined terms, as returned by the keyword
            search. Empty means nothing was found.
        keywords_dict: Mapping of primary keyword -> list of synonyms.
        input_text: The text that was analysed.

    Returns:
        An HTML string: the table, or a short message paragraph when there
        is nothing to show.
    """
    # Local import so the function is self-contained without touching the
    # module's import block.
    from html import escape

    if not found_keywords_str:
        return "<p style='text-align: center; padding: 20px;'>No keywords found.</p>"

    found_keywords = set(found_keywords_str.split('; '))
    input_text = input_text or ''

    # Group the found terms under their primary keyword (primary first).
    keyword_groups = {}
    for primary, synonyms in keywords_dict.items():
        found_in_group = [term for term in (primary, *synonyms) if term in found_keywords]
        if found_in_group:
            keyword_groups[primary] = found_in_group

    if not keyword_groups:
        return "<p style='text-align: center; padding: 20px;'>No keyword groups matched.</p>"

    def find_group_matches(terms):
        """Return all whole-word matches of any of ``terms`` in the text."""
        # De-duplicate case-insensitively: "Oxford" and "oxford" describe the
        # same occurrences and must not be counted twice.
        unique = {}
        for term in terms:
            if term:
                unique.setdefault(term.lower(), term)
        if not unique:
            return []
        ordered = sorted(unique.values(), key=len, reverse=True)
        alternatives = [
            r'(?-i:US)' if term.upper() == "US" else re.escape(term)
            for term in ordered
        ]
        pattern = re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b', re.IGNORECASE)
        return list(pattern.finditer(input_text))

    def plain(fragment):
        """Flatten line breaks and escape a text fragment for HTML."""
        return escape(fragment.replace('\n', ' '))

    def render_context(matches, color):
        """Build the preview: 30 characters either side of the first match."""
        first = matches[0]
        start = max(0, first.start() - 30)
        end = min(len(input_text), first.end() + 30)
        pieces = []
        cursor = start
        for match in matches:
            # Only highlight matches that lie entirely inside the window.
            if match.end() > end:
                break
            pieces.append(plain(input_text[cursor:match.start()]))
            pieces.append(
                f'<strong style="background-color: {color}; color: white; padding: 1px 2px; border-radius: 2px;">{plain(match.group(0))}</strong>'
            )
            cursor = match.end()
        pieces.append(plain(input_text[cursor:end]))
        return ''.join(pieces)

    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#F9CA24', '#6C5CE7', '#A0E7E5', '#FD79A8', '#55A3FF', '#00B894', '#E17055']

    parts = ["""
<div style='max-height: 500px; overflow-y: auto; border: 2px solid #ddd; border-radius: 8px; padding: 20px; background-color: #fafafa; margin: 10px 0;'>
<h4 style='margin: 0 0 15px 0; color: #333;'>📊 Detailed Keyword Results</h4>
<table style="width: 100%; border-collapse: collapse; border: 1px solid #ddd; background-color: white;">
<thead>
<tr style="background-color: #6366f1; color: white;">
<th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Primary Keyword</th>
<th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Found Terms</th>
<th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Count in Text</th>
<th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Context Preview</th>
</tr>
</thead>
<tbody>
"""]

    for i, (primary, found_terms) in enumerate(keyword_groups.items()):
        color = colors[i % len(colors)]

        matches = find_group_matches(found_terms)
        total_count = len(matches)
        context_html = render_context(matches, color) if matches else 'No context available'

        terms_html = ' '.join(
            f'<span style="background-color: {color}; color: white; padding: 2px 6px; border-radius: 10px; font-size: 12px; margin: 1px;">{escape(term)}</span>'
            for term in found_terms
        )
        primary_html = escape(primary)

        parts.append(f"""
<tr style="background-color: #fff;">
<td style="padding: 10px; border: 1px solid #ddd; font-weight: bold;">{primary_html}</td>
<td style="padding: 10px; border: 1px solid #ddd;">{terms_html}</td>
<td style="padding: 10px; border: 1px solid #ddd; text-align: center;">
<span style='background-color: #28a745; color: white; padding: 4px 8px; border-radius: 12px; font-weight: bold;'>
{total_count}
</span>
</td>
<td style="padding: 10px; border: 1px solid #ddd; font-style: italic; font-size: 14px;">
{context_html}...
</td>
</tr>
""")

    parts.append("""
</tbody>
</table>
</div>
""")

    return ''.join(parts)
|
| 191 |
+
|
| 192 |
+
def process_text(input_text, primary1, synonyms1, primary2, synonyms2, primary3, synonyms3, primary4, synonyms4, primary5, synonyms5):
    """Run the full keyword analysis for one text and five keyword rows.

    Args:
        input_text: The text to analyse. ``None``, empty or whitespace-only
            input produces a prompt message instead of results.
        primary1..primary5: Primary keyword of each dictionary row.
        synonyms1..synonyms5: Semicolon-separated synonyms of each row.

    Returns:
        A 4-tuple ``(summary, highlighted_html, results_table_html,
        copy_text)``: a Markdown-formatted summary or message, the text as
        HTML with keywords highlighted, the detailed results table as HTML,
        and the ``'; '``-joined matched keywords (or a short status message).
    """
    # Guard against None as well as blank text; None has no .strip().
    if not input_text or not input_text.strip():
        return "Please enter some text to analyze", "", "", "No keywords found"

    # Build keywords dictionary from separate inputs
    primary_inputs = [primary1, primary2, primary3, primary4, primary5]
    synonym_inputs = [synonyms1, synonyms2, synonyms3, synonyms4, synonyms5]
    keywords_dict = build_keywords_dict(primary_inputs, synonym_inputs)

    if not keywords_dict:
        return "Please enter at least one primary keyword", "", "", "No keyword dictionary provided"

    # Find keywords in the text
    found_keywords_str = find_keywords(input_text, keywords_dict)

    if not found_keywords_str:
        # The second slot carries HTML on the success path, so render the
        # unhighlighted text through the same function to keep line breaks.
        return (
            f"No keywords found in the text.\n\nKeyword dictionary loaded: {len(keywords_dict)} primary keywords",
            highlight_keywords_in_text(input_text, []),
            "",
            "No matches found",
        )

    # Create highlighted version
    keywords_list = found_keywords_str.split('; ')
    highlighted_html = highlight_keywords_in_text(input_text, keywords_list)

    # Create results table
    results_table_html = create_keyword_results_table(found_keywords_str, keywords_dict, input_text)

    # Create results summary
    total_terms = sum(len(synonyms) + 1 for synonyms in keywords_dict.values())
    results_summary = f"""
## Results Summary
**Keywords Found:** {len(keywords_list)}
**Matched Keywords:** {found_keywords_str}
**Keyword Dictionary Stats:**
- Primary keywords loaded: {len(keywords_dict)}
- Total searchable terms: {total_terms}
**Copy this result to your spreadsheet:**
{found_keywords_str}
"""

    return results_summary, highlighted_html, results_table_html, found_keywords_str
|
| 231 |
+
|
| 232 |
# Create the Gradio interface
|
| 233 |
def create_interface():
|
| 234 |
with gr.Blocks(title="Keyword Tagging Tool", theme=gr.themes.Soft()) as demo:
|
|
|
|
| 323 |
with gr.Column(scale=1):
|
| 324 |
example2_btn = gr.Button("Load Example 2", variant="secondary", size="sm")
|
| 325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
# Clear functions
|
| 327 |
def clear_dictionary_only():
|
| 328 |
"""Clear only the keyword dictionary fields"""
|
|
|
|
| 334 |
|
| 335 |
# Example loading functions
|
| 336 |
def load_example1():
    """Return the field values for the first (World War II) example.

    The result is a flat list of 11 strings: the sample text followed by
    five (primary keyword, synonyms) pairs, the last two pairs left blank.
    """
    sample_text = "During World War II, many prisoners of war were held in camps across Europe. The Geneva Convention established rules for POW treatment. American soldiers and British troops were among those captured."
    keyword_rows = (
        ("Prisoner of War", "POW; POWs; prisoner of war"),
        ("World War II", "WWII; Second World War"),
        ("United States", "USA; US; America; American"),
        ("", ""),
        ("", ""),
    )

    values = [sample_text]
    for primary, synonyms in keyword_rows:
        values.extend((primary, synonyms))
    return values
|
| 344 |
|
| 345 |
def load_example2():
    """Return the field values for the second (Oxford) example.

    The result is a flat list of 11 strings: the sample text followed by
    five (primary keyword, synonyms) pairs, the last pair left blank.
    """
    sample_text = "The University of Oxford is located in Oxford, England. Students from around the world study at this prestigious institution."
    keyword_rows = (
        ("University", "university; institution; college"),
        ("Oxford", "oxford"),
        ("England", "england; English"),
        ("Student", "student; students; pupils"),
        ("", ""),
    )

    values = [sample_text]
    for primary, synonyms in keyword_rows:
        values.extend((primary, synonyms))
    return values
|
| 354 |
|
| 355 |
# Button functions
|
| 356 |
find_btn.click(
|
|
|
|
| 366 |
|
| 367 |
clear_all_btn.click(
|
| 368 |
fn=clear_everything,
|
| 369 |
+
outputs=[text_input, primary1, synonyms1, primary2, synonyms2, primary3, synonyms3, primary4, synonyms4, primary5, synonyms5, results_output, highlighted_output, results_table_output, copy_output]
|
| 370 |
)
|
| 371 |
|
| 372 |
example1_btn.click(
|
|
|
|
| 412 |
</div>
|
| 413 |
""")
|
| 414 |
|
| 415 |
+
return demo
|
| 416 |
+
|
| 417 |
+
if __name__ == "__main__":
    # Script entry point: build the Blocks UI, then start serving it.
    demo = create_interface()
    demo.launch()
|