Update app.py
Browse files
app.py
CHANGED
|
@@ -1,253 +1,13 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import re
|
| 3 |
-
import pandas as pd
|
| 4 |
|
| 5 |
-
def parse_keywords_dict(keywords_text):
    """Parse a keyword-mapping text into a dictionary.

    Each non-empty line is expected to look like::

        Primary Keyword | synonym1; synonym2; synonym3

    Args:
        keywords_text: Multi-line string with one mapping per line. ``None``
            or a blank string yields an empty dictionary.

    Returns:
        dict mapping each primary keyword (stripped) to the list of its
        stripped, non-empty synonyms. Lines without a ``|`` are ignored, as
        are lines whose primary keyword is empty.
    """
    keywords_dict = {}

    if not keywords_text or not keywords_text.strip():
        return keywords_dict

    for raw_line in keywords_text.strip().split('\n'):
        line = raw_line.strip()
        if not line or '|' not in line:
            continue

        # The line is known to contain '|', so this split always yields two
        # parts; no exception handling is needed here.
        primary, secondary = line.split('|', 1)
        primary = primary.strip()

        # An empty primary keyword would later turn into the regex r'\b\b',
        # which matches at every word boundary and tags every text.
        if not primary:
            continue

        keywords_dict[primary] = [
            keyword.strip()
            for keyword in secondary.split(';')
            if keyword.strip()
        ]

    return keywords_dict
|
| 27 |
|
| 28 |
-
|
| 29 |
-
"
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
# Search for each primary keyword and its synonyms
|
| 36 |
-
for primary_keyword, synonyms in keywords_dict.items():
|
| 37 |
-
# Special handling for 'US' to avoid matching 'us'
|
| 38 |
-
if primary_keyword.upper() == "US":
|
| 39 |
-
if ' US ' in story or story.startswith('US ') or story.endswith(' US'):
|
| 40 |
-
found_keywords.update(synonyms)
|
| 41 |
-
else:
|
| 42 |
-
pattern = r'\b' + re.escape(primary_keyword) + r'\b'
|
| 43 |
-
if re.search(pattern, story, re.IGNORECASE):
|
| 44 |
-
found_keywords.update(synonyms)
|
| 45 |
-
|
| 46 |
-
# Check each secondary keyword independently
|
| 47 |
-
for synonym in synonyms:
|
| 48 |
-
if synonym.upper() == "US":
|
| 49 |
-
if ' US ' in story or story.startswith('US ') or story.endswith(' US'):
|
| 50 |
-
found_keywords.add(synonym)
|
| 51 |
-
else:
|
| 52 |
-
if re.search(r'\b' + re.escape(synonym) + r'\b', story, re.IGNORECASE):
|
| 53 |
-
found_keywords.add(synonym)
|
| 54 |
-
|
| 55 |
-
return '; '.join(sorted(found_keywords))
|
| 56 |
|
| 57 |
-
|
| 58 |
-
def highlight_keywords_in_text(text, keywords_list):
    """Create HTML with highlighted keywords.

    Args:
        text: Plain text to render.
        keywords_list: Keywords to highlight. Matching is case-insensitive
            and respects word boundaries. Empty entries are ignored but still
            occupy a position in the colour rotation.

    Returns:
        An HTML string in which the text is escaped and every keyword
        occurrence is wrapped in a coloured ``<span>``. The matched text keeps
        its original casing. When ``keywords_list`` is empty the escaped text
        is returned without any spans.
    """
    import html
    import re

    if not keywords_list:
        return html.escape(text, quote=False)

    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#F9CA24', '#6C5CE7', '#A0E7E5', '#FD79A8', '#55A3FF', '#00B894', '#E17055']

    # Colour is decided by the keyword's position in the list; the first
    # occurrence wins when the same keyword appears with different casing.
    color_for = {}
    for i, keyword in enumerate(keywords_list):
        if keyword:
            color_for.setdefault(keyword.lower(), colors[i % len(colors)])

    if not color_for:
        return html.escape(text, quote=False)

    # One combined pattern, applied in a single pass over the ORIGINAL text.
    # Substituting keyword by keyword would re-scan the markup inserted by
    # earlier substitutions (e.g. a keyword "style" or "color" would match
    # inside the span attributes). Longest alternatives go first so that
    # "Prisoner of War" wins over "War" at the same position.
    alternatives = sorted(color_for, key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(k) for k in alternatives) + r')\b',
        re.IGNORECASE,
    )

    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        # Escape the untouched text between matches so user input cannot
        # inject markup into the HTML output.
        pieces.append(html.escape(text[cursor:match.start()], quote=False))
        hit = match.group(0)
        color = color_for.get(hit.lower(), colors[0])
        pieces.append(
            f'<span style="background-color: {color}; padding: 2px 4px; border-radius: 3px; color: white; font-weight: bold;">{html.escape(hit, quote=False)}</span>'
        )
        cursor = match.end()
    pieces.append(html.escape(text[cursor:], quote=False))

    return ''.join(pieces)
|
| 73 |
-
|
| 74 |
-
def process_text(input_text, keywords_text):
    """Main processing function.

    Parses the keyword dictionary, looks up matching keywords in the text and
    prepares the three values shown in the UI.

    Args:
        input_text: Text to analyse. ``None`` is treated like an empty string.
        keywords_text: Keyword dictionary in ``Primary | syn1; syn2`` line
            format. ``None`` is treated like an empty string.

    Returns:
        A 3-tuple ``(summary, highlighted_html, keywords_string)``:
        a Markdown summary or status message, the text as HTML, and the
        ``'; '``-joined keyword string (or a short status message).
    """
    import html

    # Guard against None as well as blank strings so a cleared/unset textbox
    # yields the friendly message instead of an AttributeError.
    if not input_text or not input_text.strip():
        return "Please enter some text to analyze", "", "No keywords found"

    if not keywords_text or not keywords_text.strip():
        return "Please enter keyword mappings", "", "No keyword dictionary provided"

    # Parse keywords dictionary
    keywords_dict = parse_keywords_dict(keywords_text)

    if not keywords_dict:
        return "Invalid keyword format. Please check your keyword dictionary format.", "", "Error parsing keywords"

    # Find keywords in the text
    found_keywords_str = find_keywords(input_text, keywords_dict)

    if not found_keywords_str:
        # The second value is rendered as HTML, so the raw user text must be
        # escaped before it is handed over.
        return (
            f"No keywords found in the text.\n\nKeyword dictionary loaded: {len(keywords_dict)} primary keywords",
            html.escape(input_text, quote=False),
            "No matches found",
        )

    # Create highlighted version
    keywords_list = found_keywords_str.split('; ')
    highlighted_html = highlight_keywords_in_text(input_text, keywords_list)

    # Create results summary (Markdown); the leading and trailing empty
    # entries reproduce the newline at each end of the block.
    total_terms = sum(len(synonyms) + 1 for synonyms in keywords_dict.values())
    results_summary = "\n".join([
        "",
        "## Results Summary",
        "",
        f"**Keywords Found:** {len(keywords_list)}",
        f"**Matched Keywords:** {found_keywords_str}",
        "",
        "**Keyword Dictionary Stats:**",
        f"- Primary keywords loaded: {len(keywords_dict)}",
        f"- Total searchable terms: {total_terms}",
        "",
        "**Copy this result to your spreadsheet:**",
        f"{found_keywords_str}",
        "",
    ])

    return results_summary, highlighted_html, found_keywords_str
|
| 114 |
-
|
| 115 |
-
# Create the Gradio interface
|
| 116 |
-
def create_interface():
    """Build the Gradio Blocks UI for the keyword tagging tool.

    The layout consists of an intro, two side-by-side textboxes (text and
    keyword dictionary), a button row, three output areas, clickable examples,
    a format guide and a footer. The interface is returned without being
    launched.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    # Static page texts are collected up front so the layout code below
    # stays short.
    intro_md = (
        "\n"
        "# Keyword Tagging Tool\n"
        "\n"
        "This tool matches text against a controlled vocabulary and returns all associated keywords.\n"
        "Based on the keyword matching logic used for digital humanities research.\n"
        "\n"
        "## How to use:\n"
        "1. **Enter your text** in the left panel\n"
        "2. **Define your keyword dictionary** in the right panel using the format: `Primary Keyword | synonym1; synonym2; synonym3`\n"
        '3. **Click "Find Keywords"** to see results\n'
        "4. **Copy the results** to paste into your spreadsheet\n"
    )

    dictionary_placeholder = (
        "Enter one mapping per line:\n"
        "Primary Keyword | synonym1; synonym2; synonym3\n"
        "\n"
        "Example:\n"
        "Prisoner of War | Prisoner of War; POW; POWs\n"
        "United States | United States; USA; US; America\n"
    )

    # Each example row is [text to analyse, keyword dictionary].
    example_rows = [
        [
            "During World War II, many prisoners of war were held in camps across Europe. The Geneva Convention established rules for POW treatment. American soldiers and British troops were among those captured.",
            "Prisoner of War | Prisoner of War; POW; POWs; prisoner of war\n"
            "World War II | World War II; WWII; Second World War\n"
            "United States | United States; USA; US; America; American\n"
            "United Kingdom | United Kingdom; UK; Britain; British",
        ],
        [
            "The University of Oxford is located in Oxford, England. Students from around the world study at this prestigious institution. The university has many colleges including Christ Church and Magdalen College.",
            "University | University; university; institution; college\n"
            "Oxford | Oxford; oxford\n"
            "England | England; england; English\n"
            "Student | Student; student; students; pupils",
        ],
        [
            "Shakespeare wrote many famous plays including Hamlet, Romeo and Juliet, and Macbeth. These works are performed in theatres worldwide and studied in schools.",
            "William Shakespeare | Shakespeare; william shakespeare; playwright\n"
            "Theatre | Theatre; theater; stage; performance\n"
            "Play | Play; plays; drama; theatrical work\n"
            "School | School; schools; education; academic",
        ],
    ]

    format_guide_md = (
        "\n"
        "## Format Guide\n"
        "\n"
        "**Keyword Dictionary Format:**\n"
        "- One primary keyword per line\n"
        "- Use the pipe symbol `|` to separate primary keyword from synonyms\n"
        "- Use semicolons `;` to separate multiple synonyms\n"
        '- Case-insensitive matching (except for special cases like "US")\n'
        "\n"
        "**Special Handling:**\n"
        '- "US" is matched exactly to avoid confusion with the word "us"\n'
        "- Word boundaries are respected (prevents partial matches)\n"
        "- Results are alphabetized and deduplicated\n"
        "\n"
        "**Example Dictionary Entry:**\n"
        "```\n"
        "Prisoner of War | Prisoner of War; POW; POWs; prisoner of war\n"
        "```\n"
        "\n"
        'This will find any occurrence of "Prisoner of War", "POW", "POWs", or "prisoner of war" in your text and return all the associated terms.\n'
    )

    footer_html = (
        "\n"
        '<div style="margin-top: 40px; padding: 20px; background-color: #f8f9fa; border-radius: 8px; text-align: center;">\n'
        '<p style="margin: 0; color: #666;">\n'
        "Created for digital humanities keyword tagging workflows.\n"
        "Based on controlled vocabulary matching principles.\n"
        "</p>\n"
        "</div>\n"
    )

    def clear_all():
        # One empty string per component wired to the "Clear All" button.
        return ("",) * 5

    with gr.Blocks(title="Keyword Tagging Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown(intro_md)

        # Inputs: text on the left, keyword dictionary on the right.
        with gr.Row():
            with gr.Column(scale=1):
                text_input = gr.Textbox(
                    label="Text to Analyze",
                    placeholder="Enter the text you want to tag with keywords...",
                    lines=15,
                    max_lines=20,
                )

            with gr.Column(scale=1):
                keywords_input = gr.Textbox(
                    label="Keyword Dictionary",
                    placeholder=dictionary_placeholder,
                    lines=15,
                    max_lines=20,
                )

        # Actions
        with gr.Row():
            find_btn = gr.Button("Find Keywords", variant="primary", size="lg")
            clear_btn = gr.Button("Clear All", size="lg")

        # Outputs, one per row
        with gr.Row():
            results_output = gr.Markdown(label="Results Summary")

        with gr.Row():
            highlighted_output = gr.HTML(label="Text with Highlighted Keywords")

        with gr.Row():
            copy_output = gr.Textbox(
                label="Keywords for Spreadsheet (copy this text)",
                lines=3,
                max_lines=5,
            )

        # Examples section
        gr.Markdown("### Examples")
        gr.Examples(
            examples=example_rows,
            inputs=[text_input, keywords_input],
            label="Click an example to try it out",
        )

        # Button wiring
        find_btn.click(
            fn=process_text,
            inputs=[text_input, keywords_input],
            outputs=[results_output, highlighted_output, copy_output],
        )
        clear_btn.click(
            fn=clear_all,
            outputs=[text_input, keywords_input, results_output, highlighted_output, copy_output],
        )

        # Instructions
        gr.Markdown(format_guide_md)

        # Footer
        gr.HTML(footer_html)

    return demo
|
| 250 |
-
|
| 251 |
-
if __name__ == "__main__":
    # Script entry point: build the interface, then start the Gradio server.
    # Nothing is launched when this module is merely imported.
    demo = create_interface()
    demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
def simple_test(text):
    """Echo the given value back, prefixed with ``"You entered: "``.

    Args:
        text: Value to echo; it is interpolated with standard string
            formatting, so non-string values are accepted too.

    Returns:
        The string ``"You entered: <text>"``.
    """
    echoed = f"You entered: {text}"
    return echoed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
# Minimal smoke-test UI: a single input box whose content is echoed into an
# output box by simple_test when the button is pressed.
with gr.Blocks() as demo:
    gr.Markdown("# Test App")
    input_box = gr.Textbox(label="Test Input")
    output_box = gr.Textbox(label="Output")
    btn = gr.Button("Test")
    # The click handler is registered inside the Blocks context so that it
    # is attached to this app.
    btn.click(simple_test, inputs=input_box, outputs=output_box)

# Start the server only when the file is executed as a script, so importing
# this module (e.g. from a test) does not launch anything.
if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|