Spaces:
Sleeping
Sleeping
File size: 9,138 Bytes
a5703b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
import html
import re
from collections import Counter

import gradio as gr
# USAS category information.
# Maps the first letter of a USAS tag to a (display name, background color) pair.
USAS_CATEGORIES = {
    'A': ('General & Abstract Terms', '#fee2e2'),
    'B': ('Body & Individual', '#fce7f3'),
    'C': ('Arts & Crafts', '#f3e8ff'),
    'E': ('Emotional Actions', '#ffe4e6'),
    'F': ('Food & Farming', '#dcfce7'),
    'G': ('Government & Public', '#dbeafe'),
    'H': ('Architecture & Buildings', '#fef3c7'),
    'I': ('Money & Commerce', '#d1fae5'),
    'K': ('Entertainment & Sports', '#e9d5ff'),
    'L': ('Life & Living Things', '#ecfccb'),
    'M': ('Movement & Location', '#cffafe'),
    'N': ('Numbers & Measurement', '#e0e7ff'),
    'O': ('Substances & Objects', '#fed7aa'),
    'P': ('Education', '#ccfbf1'),
    'Q': ('Linguistic Actions', '#e0f2fe'),
    'S': ('Social Actions', '#fae8ff'),
    'T': ('Time', '#fef9c3'),
    'W': ('World & Environment', '#bbf7d0'),
    'X': ('Psychological Actions', '#ddd6fe'),
    'Y': ('Science & Technology', '#bfdbfe'),
    'Z': ('Names & Grammatical', '#e5e7eb'),
}


def get_category_color(tag):
    """Return the background color for *tag*, chosen by its first letter.

    Args:
        tag: A tag string such as ``"E2+"``; may be empty or ``None``.

    Returns:
        The hex color registered in ``USAS_CATEGORIES`` for the upper-cased
        first character of *tag*, or the neutral grey ``'#f3f4f6'`` when
        *tag* is empty/``None`` or its first letter is not a known category.
    """
    if not tag:
        return '#f3f4f6'
    first_char = tag[0].upper()
    # The fallback must use the same (name, color) layout as the dict values:
    # index 1 is the color. The previous fallback had the two fields swapped,
    # so unknown letters yielded the string 'Unknown' instead of a color.
    return USAS_CATEGORIES.get(first_char, ('Unknown', '#f3f4f6'))[1]
def get_category_name(tag):
    """Look up the display name of the category a tag belongs to.

    The category is selected by the upper-cased first character of *tag*.
    ``'Unknown'`` is returned when *tag* is empty/``None`` or when that
    character has no entry in ``USAS_CATEGORIES``.
    """
    if tag:
        entry = USAS_CATEGORIES.get(tag[0].upper())
        if entry is not None:
            # Entries are (name, color) pairs; the name comes first.
            return entry[0]
    return 'Unknown'
def _tokenize_tagged_text(text):
    """Split *text* on whitespace into a list of ``(word, tag)`` pairs.

    Each token is split on its LAST underscore so that words which themselves
    contain underscores keep them. Tokens without an underscore are treated
    as untagged and receive the tag ``'Z99'``.
    """
    tokens = []
    for part in text.split():
        if '_' in part:
            word, tag = part.rsplit('_', 1)
            tokens.append((word, tag))
        else:
            tokens.append((part, 'Z99'))
    return tokens


def _primary_category_letter(tag):
    """Return the upper-cased first letter of the first '/'-separated tag.

    Falls back to ``'Z'`` when that primary tag is empty (e.g. ``''`` or
    ``'/M1'``), which previously raised ``IndexError`` for the latter.
    """
    # NOTE: bucketing is purely by first letter, so a non-USAS tag such as
    # 'PUNC' is counted under 'P'.
    primary = tag.split('/')[0]
    return primary[0].upper() if primary else 'Z'


def _render_tagged_tokens(tokens):
    """Render one colored, tooltipped span per token.

    Returns a ``(html, tag_counts)`` pair where *tag_counts* is a ``Counter``
    keyed by primary category letter.
    """
    tag_counts = Counter()
    html_parts = ['<div style="line-height: 2.5; font-size: 16px;">']
    for word, tag in tokens:
        tag_counts[_primary_category_letter(tag)] += 1
        color = get_category_color(tag)
        category = get_category_name(tag)
        # Words and tags come straight from user input: escape them so they
        # cannot break out of the title attribute or inject markup.
        safe_word = html.escape(word)
        safe_tag = html.escape(tag)
        html_parts.append(
            f'<span style="background-color: {color}; '
            f'padding: 4px 8px; margin: 2px; border-radius: 6px; '
            f'display: inline-block; border: 2px solid {color}; '
            f'cursor: help;" '
            f'title="{safe_word}\nTag: {safe_tag}\nCategory: {category}">'
            f'<strong>{safe_word}</strong><br>'
            f'<small style="font-size: 11px; font-family: monospace;">{safe_tag}</small>'
            f'</span> '
        )
    html_parts.append('</div>')
    return ''.join(html_parts), tag_counts


def _render_stats_table(tag_counts):
    """Render the per-category count/percentage table, most frequent first."""
    stats_html = ['<div style="margin-top: 20px;"><h3>Tag Distribution</h3>',
                  '<table style="width: 100%; border-collapse: collapse;">',
                  '<tr style="background-color: #f3f4f6;">',
                  '<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">Category</th>',
                  '<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">Name</th>',
                  '<th style="padding: 8px; text-align: right; border: 1px solid #ddd;">Count</th>',
                  '<th style="padding: 8px; text-align: right; border: 1px solid #ddd;">%</th>',
                  '</tr>']
    total = sum(tag_counts.values())
    for cat, count in tag_counts.most_common():
        cat_name, color = USAS_CATEGORIES.get(cat, ('Unknown', '#f3f4f6'))
        percentage = (count / total * 100) if total > 0 else 0
        # The category letter is derived from user-supplied tags, so it is
        # escaped for display (the dict lookup above uses the raw letter).
        safe_cat = html.escape(cat)
        stats_html.append(
            f'<tr><td style="padding: 8px; border: 1px solid #ddd; background-color: {color};">'
            f'<strong>{safe_cat}</strong></td>'
            f'<td style="padding: 8px; border: 1px solid #ddd;">{cat_name}</td>'
            f'<td style="padding: 8px; border: 1px solid #ddd; text-align: right;">{count}</td>'
            f'<td style="padding: 8px; border: 1px solid #ddd; text-align: right;">{percentage:.1f}%</td></tr>'
        )
    stats_html.append('</table></div>')
    return ''.join(stats_html)


def _render_legend():
    """Render a grid listing every entry of ``USAS_CATEGORIES`` in letter order."""
    legend_html = ['<div style="margin-top: 20px;"><h3>USAS Categories Legend</h3>',
                   '<div style="display: grid; grid-template-columns: repeat(auto-fill, minmax(250px, 1fr)); gap: 10px;">']
    for cat, (name, color) in sorted(USAS_CATEGORIES.items()):
        legend_html.append(
            f'<div style="background-color: {color}; padding: 10px; '
            f'border-radius: 6px; border: 2px solid {color};">'
            f'<strong>{cat}</strong> - {name}</div>'
        )
    legend_html.append('</div></div>')
    return ''.join(legend_html)


def parse_tagged_text(text):
    """
    Parse pre-tagged text in underscore format: word_TAG
    Example: I_Z8 love_E2+ walking_M1

    Args:
        text: Whitespace-separated tokens, each ``word_TAG``. Tokens without
            an underscore are shown with the tag ``Z99``. ``None``, empty and
            whitespace-only input are all treated as "nothing entered".

    Returns:
        A 3-tuple of HTML strings ``(tagged_words, statistics, legend)``.
        When nothing was entered, the first element is a plain prompt
        message and the other two are empty strings.
    """
    # `not text` also covers None, which would otherwise fail on .strip().
    if not text or not text.strip():
        return "Please enter some tagged text to visualize.", "", ""

    # Non-blank text always yields at least one token, so no further
    # emptiness check is needed after tokenizing.
    tokens = _tokenize_tagged_text(text)
    tagged_html, tag_counts = _render_tagged_tokens(tokens)
    return tagged_html, _render_stats_table(tag_counts), _render_legend()
# Create Gradio interface.
# All components are instantiated inside nested context managers
# (Blocks > Row > Column).
# NOTE(review): on-page order presumably follows creation order — confirm
# against the Gradio docs before reordering any statement below.
with gr.Blocks(title="UCREL USAS Semantic Tag Visualizer", theme=gr.themes.Soft()) as demo:
    # Introductory text: what the app does and the expected word_TAG format.
    gr.Markdown(
        """
# 🏷️ UCREL USAS Semantic Tag Visualizer
This app visualizes pre-tagged text using the **UCREL Semantic Analysis System (USAS)** tags.
**Format:** Use underscore notation: `word_TAG`
Example: `I_Z8 love_E2+ walking_M1 in_Z5 the_Z5 park_M7`
Simply paste your tagged text below and click **Visualize**!
"""
    )
    # Input area: a multi-line textbox plus the button that triggers rendering.
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Paste your tagged text here (word_TAG format)",
                placeholder="Example: I_Z8 love_E2+ walking_M1 in_Z5 the_Z5 park_M7 ._PUNC",
                lines=10
            )
            submit_btn = gr.Button("🎨 Visualize Tags", variant="primary", size="lg")
    # Output area 1: the colored, tagged words.
    with gr.Row():
        with gr.Column():
            tagged_output = gr.HTML(label="Visualized Tags")
    # Output area 2: statistics table and category legend in two equal-scale columns.
    with gr.Row():
        with gr.Column(scale=1):
            stats_output = gr.HTML(label="Statistics")
        with gr.Column(scale=1):
            legend_output = gr.HTML(label="Legend")
    # Static reference text describing the tag scheme and its modifiers.
    gr.Markdown(
        """
### About USAS Tags
The UCREL Semantic Analysis System (USAS) categorizes words into 21 major semantic fields:
- **A**: General & Abstract Terms (e.g., A5.1+ = good, A5.1- = bad)
- **B**: Body & Individual (e.g., B1 = anatomy)
- **E**: Emotional Actions (e.g., E2+ = like/love, E3- = violent/angry)
- **F**: Food & Farming (e.g., F1 = food)
- **G**: Government & Public (e.g., G1.1c = government, G1.2 = politics)
- **I**: Money & Commerce (e.g., I1.1 = money: affluent)
- **M**: Movement & Location (e.g., M1 = moving, M7 = places)
- **N**: Numbers & Measurement (e.g., N1 = numbers, N5+ = quantities: many)
- **P**: Education (e.g., P1 = education)
- **Q**: Linguistic Actions (e.g., Q2.2 = speech acts, Q3 = language)
- **S**: Social Actions (e.g., S2mf = people, S8+ = helping)
- **T**: Time (e.g., T1.3 = time: period)
- **X**: Psychological Actions (e.g., X2.1 = thought, X2.2+ = knowledge)
- **Z**: Names & Grammatical (e.g., Z5 = grammatical words, Z8 = pronouns)
- And more categories!
**Tag modifiers:**
- **+** = positive (e.g., A5.1+ = good)
- **-** = negative (e.g., A5.1- = bad)
- **/** = multiple tags (e.g., M1/M7/S2mf = moving/place/person)
**Hover over tagged words** to see detailed information about each semantic tag.
---
Learn more: [USAS Documentation](https://ucrel.lancs.ac.uk/usas/)
"""
    )
    # Examples: ready-made tagged sentences bound to the input textbox.
    gr.Examples(
        examples=[
            ["I_Z8 love_E2+ walking_M1 in_Z5 the_Z5 park_M7 on_Z5 sunny_W4 days_T1.3 ._PUNC"],
            ["The_Z5 company_I2.1 announced_Q2.2 record_N5.1+ profits_I1.1 yesterday_T1.1.1 ._PUNC"],
            ["She_Z8 thinks_X2.1 education_P1 is_A3+ very_A13.3 important_A11.1+ ._PUNC"],
            ["As_Z5 an_Z5 immigrant_M1/M7/S2mf in_Z5 the_Z5 United_Z2c States_Z2c you_Z8mf have_A9+ the_Z5 right_S7.4+ to_Z5 receive_A9+ language_Q3 access_M1 services_S8+ ._PUNC"],
            ["The_Z5 Civil_G1.1 Rights_A5.3+ Act_A1.1.1 of_Z5 1964_N1 and_Z5 the_Z5 Voting_G1.2 Rights_A5.3+ Act_A1.1.1 of_Z5 1965_N1 protect_S8+/A15+ your_Z8 linguistic_Q3 rights_S7.4+ ._PUNC"]
        ],
        inputs=text_input
    )
    # Clicking the button runs parse_tagged_text on the textbox contents; its
    # three returned HTML strings go to the three output panes, in this order.
    submit_btn.click(
        fn=parse_tagged_text,
        inputs=text_input,
        outputs=[tagged_output, stats_output, legend_output]
    )
# Launch the app only when this file is executed as a script, not on import.
# (A stray trailing "|" after the call made this line a syntax error.)
if __name__ == "__main__":
    demo.launch()