# File size: 9,138 Bytes
# a5703b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import html
import re
from collections import Counter

import gradio as gr

# Top-level USAS semantic fields, keyed by their single-letter code.
# Each value is a (display name, background color as CSS hex) pair.
USAS_CATEGORIES = dict(
    A=('General & Abstract Terms', '#fee2e2'),
    B=('Body & Individual', '#fce7f3'),
    C=('Arts & Crafts', '#f3e8ff'),
    E=('Emotional Actions', '#ffe4e6'),
    F=('Food & Farming', '#dcfce7'),
    G=('Government & Public', '#dbeafe'),
    H=('Architecture & Buildings', '#fef3c7'),
    I=('Money & Commerce', '#d1fae5'),
    K=('Entertainment & Sports', '#e9d5ff'),
    L=('Life & Living Things', '#ecfccb'),
    M=('Movement & Location', '#cffafe'),
    N=('Numbers & Measurement', '#e0e7ff'),
    O=('Substances & Objects', '#fed7aa'),
    P=('Education', '#ccfbf1'),
    Q=('Linguistic Actions', '#e0f2fe'),
    S=('Social Actions', '#fae8ff'),
    T=('Time', '#fef9c3'),
    W=('World & Environment', '#bbf7d0'),
    X=('Psychological Actions', '#ddd6fe'),
    Y=('Science & Technology', '#bfdbfe'),
    Z=('Names & Grammatical', '#e5e7eb'),
)

def get_category_color(tag):
    """Return the background color (CSS hex string) for a USAS tag.

    The category is looked up from the first character of ``tag``,
    case-insensitively.

    Args:
        tag: A USAS tag such as ``"E2+"``; may be empty or ``None``.

    Returns:
        The color registered for the tag's category in ``USAS_CATEGORIES``,
        or the neutral grey ``'#f3f4f6'`` when ``tag`` is empty or its first
        character is not a known category letter.
    """
    if not tag:
        return '#f3f4f6'
    first_char = tag[0].upper()
    # Entries are (name, color) pairs, so the fallback must use the same
    # order; with the fields swapped, index 1 returned the string 'Unknown'
    # instead of a color for unrecognized letters.
    return USAS_CATEGORIES.get(first_char, ('Unknown', '#f3f4f6'))[1]

def get_category_name(tag):
    """Return the display name of the USAS category a tag belongs to.

    The category is chosen by the tag's first character (upper-cased).
    ``'Unknown'`` is returned for an empty tag or an unregistered letter.
    """
    if tag:
        name, _color = USAS_CATEGORIES.get(tag[0].upper(), ('Unknown', '#f3f4f6'))
        return name
    return 'Unknown'

# Key under which tokens without a recognizable USAS field are counted.
_UNKNOWN_KEY = '?'

# A USAS tag starts with one field letter immediately followed by a digit
# (e.g. "Z8", "A5.1+"). Markers such as "PUNC" do not, and must not be
# classified by their first letter (that would file "PUNC" under Education).
_USAS_TAG_START = re.compile(r'[A-Za-z](?=\d)')


def _split_token(part):
    """Split one whitespace-delimited token into ``(word, tag)``.

    The split happens on the last underscore so that words which themselves
    contain underscores survive. A token without an underscore, or with
    nothing after it, is treated as untagged (``'Z99'``).
    """
    word, sep, tag = part.rpartition('_')
    if not sep:
        return part, 'Z99'
    return word, tag or 'Z99'


def _field_letter(tag):
    """Return the USAS field letter of the primary tag, or ``''`` if none.

    Only the part before the first ``/`` is considered. ``''`` is returned
    when that part is empty, is not "letter + digit", or names a letter
    that is not a key of ``USAS_CATEGORIES``.
    """
    primary = tag.split('/', 1)[0]
    if _USAS_TAG_START.match(primary):
        letter = primary[0].upper()
        if letter in USAS_CATEGORIES:
            return letter
    return ''


def parse_tagged_text(text):
    """Render pre-tagged text (``word_TAG`` tokens) as HTML.

    Example input: ``I_Z8 love_E2+ walking_M1``

    Words and tags come straight from the user, so they are HTML-escaped
    before being placed into markup or the ``title`` attribute.

    Args:
        text: Whitespace-separated ``word_TAG`` tokens; may be empty or
            ``None``.

    Returns:
        A 3-tuple of strings ``(tagged_html, stats_html, legend_html)``.
        For empty/blank/``None`` input the first element is a plain
        message and the other two are empty strings.
    """
    if not text or not text.strip():
        return "Please enter some tagged text to visualize.", "", ""

    # Non-blank text always yields at least one token.
    tokens = [_split_token(part) for part in text.split()]

    # --- Tagged-word visualization -------------------------------------
    html_parts = ['<div style="line-height: 2.5; font-size: 16px;">']
    tag_counts = Counter()

    for word, tag in tokens:
        letter = _field_letter(tag)
        tag_counts[letter or _UNKNOWN_KEY] += 1

        # Color, name and count all derive from the same field letter so
        # they cannot disagree. An empty letter makes both helpers return
        # their neutral "unknown" values.
        color = get_category_color(letter)
        category = html.escape(get_category_name(letter))

        safe_word = html.escape(word)
        safe_tag = html.escape(tag)

        # Colored chip; the title attribute provides the hover tooltip.
        html_parts.append(
            f'<span style="background-color: {color}; '
            f'padding: 4px 8px; margin: 2px; border-radius: 6px; '
            f'display: inline-block; border: 2px solid {color}; '
            f'cursor: help;" '
            f'title="{safe_word}\nTag: {safe_tag}\nCategory: {category}">'
            f'<strong>{safe_word}</strong><br>'
            f'<small style="font-size: 11px; font-family: monospace;">{safe_tag}</small>'
            f'</span> '
        )

    html_parts.append('</div>')

    # --- Statistics table ----------------------------------------------
    stats_html = ['<div style="margin-top: 20px;"><h3>Tag Distribution</h3>',
                  '<table style="width: 100%; border-collapse: collapse;">',
                  '<tr style="background-color: #f3f4f6;">',
                  '<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">Category</th>',
                  '<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">Name</th>',
                  '<th style="padding: 8px; text-align: right; border: 1px solid #ddd;">Count</th>',
                  '<th style="padding: 8px; text-align: right; border: 1px solid #ddd;">%</th>',
                  '</tr>']

    total = len(tokens)  # every token was counted exactly once above
    for cat, count in tag_counts.most_common():
        cat_name, color = USAS_CATEGORIES.get(cat, ('Unknown', '#f3f4f6'))
        percentage = count / total * 100
        stats_html.append(
            f'<tr><td style="padding: 8px; border: 1px solid #ddd; background-color: {color};">'
            f'<strong>{html.escape(cat)}</strong></td>'
            f'<td style="padding: 8px; border: 1px solid #ddd;">{html.escape(cat_name)}</td>'
            f'<td style="padding: 8px; border: 1px solid #ddd; text-align: right;">{count}</td>'
            f'<td style="padding: 8px; border: 1px solid #ddd; text-align: right;">{percentage:.1f}%</td></tr>'
        )

    stats_html.append('</table></div>')

    # --- Legend of all categories --------------------------------------
    legend_html = ['<div style="margin-top: 20px;"><h3>USAS Categories Legend</h3>',
                   '<div style="display: grid; grid-template-columns: repeat(auto-fill, minmax(250px, 1fr)); gap: 10px;">']

    for cat, (name, color) in sorted(USAS_CATEGORIES.items()):
        legend_html.append(
            f'<div style="background-color: {color}; padding: 10px; '
            f'border-radius: 6px; border: 2px solid {color};">'
            f'<strong>{cat}</strong> - {html.escape(name)}</div>'
        )

    legend_html.append('</div></div>')

    return ''.join(html_parts), ''.join(stats_html), ''.join(legend_html)

# Build the Gradio Blocks app and bind it to the module-level name `demo`.
with gr.Blocks(title="UCREL USAS Semantic Tag Visualizer", theme=gr.themes.Soft()) as demo:
    # Intro text: what the app does and the expected word_TAG input format.
    gr.Markdown(
        """
        # 🏷️ UCREL USAS Semantic Tag Visualizer
        
        This app visualizes pre-tagged text using the **UCREL Semantic Analysis System (USAS)** tags.
        
        **Format:** Use underscore notation: `word_TAG`
        
        Example: `I_Z8 love_E2+ walking_M1 in_Z5 the_Z5 park_M7`
        
        Simply paste your tagged text below and click **Visualize**!
        """
    )
    
    # Input row: the tagged-text box and the button that triggers rendering.
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Paste your tagged text here (word_TAG format)",
                placeholder="Example: I_Z8 love_E2+ walking_M1 in_Z5 the_Z5 park_M7 ._PUNC",
                lines=10
            )
            submit_btn = gr.Button("🎨 Visualize Tags", variant="primary", size="lg")
    
    # Output row 1: the colored, per-word visualization.
    with gr.Row():
        with gr.Column():
            tagged_output = gr.HTML(label="Visualized Tags")
    
    # Output row 2: tag statistics and the category legend in two columns.
    with gr.Row():
        with gr.Column(scale=1):
            stats_output = gr.HTML(label="Statistics")
        with gr.Column(scale=1):
            legend_output = gr.HTML(label="Legend")
    
    # Static reference text describing the USAS categories and tag modifiers.
    gr.Markdown(
        """
        ### About USAS Tags
        
        The UCREL Semantic Analysis System (USAS) categorizes words into 21 major semantic fields:
        - **A**: General & Abstract Terms (e.g., A5.1+ = good, A5.1- = bad)
        - **B**: Body & Individual (e.g., B1 = anatomy)
        - **E**: Emotional Actions (e.g., E2+ = like/love, E3- = violent/angry)
        - **F**: Food & Farming (e.g., F1 = food)
        - **G**: Government & Public (e.g., G1.1c = government, G1.2 = politics)
        - **I**: Money & Commerce (e.g., I1.1 = money: affluent)
        - **M**: Movement & Location (e.g., M1 = moving, M7 = places)
        - **N**: Numbers & Measurement (e.g., N1 = numbers, N5+ = quantities: many)
        - **P**: Education (e.g., P1 = education)
        - **Q**: Linguistic Actions (e.g., Q2.2 = speech acts, Q3 = language)
        - **S**: Social Actions (e.g., S2mf = people, S8+ = helping)
        - **T**: Time (e.g., T1.3 = time: period)
        - **X**: Psychological Actions (e.g., X2.1 = thought, X2.2+ = knowledge)
        - **Z**: Names & Grammatical (e.g., Z5 = grammatical words, Z8 = pronouns)
        - And more categories!
        
        **Tag modifiers:**
        - **+** = positive (e.g., A5.1+ = good)
        - **-** = negative (e.g., A5.1- = bad)
        - **/** = multiple tags (e.g., M1/M7/S2mf = moving/place/person)
        
        **Hover over tagged words** to see detailed information about each semantic tag.
        
        ---
        Learn more: [USAS Documentation](https://ucrel.lancs.ac.uk/usas/)
        """
    )
    
    # Ready-made sample inputs bound to the text box.
    gr.Examples(
        examples=[
            ["I_Z8 love_E2+ walking_M1 in_Z5 the_Z5 park_M7 on_Z5 sunny_W4 days_T1.3 ._PUNC"],
            ["The_Z5 company_I2.1 announced_Q2.2 record_N5.1+ profits_I1.1 yesterday_T1.1.1 ._PUNC"],
            ["She_Z8 thinks_X2.1 education_P1 is_A3+ very_A13.3 important_A11.1+ ._PUNC"],
            ["As_Z5 an_Z5 immigrant_M1/M7/S2mf in_Z5 the_Z5 United_Z2c States_Z2c you_Z8mf have_A9+ the_Z5 right_S7.4+ to_Z5 receive_A9+ language_Q3 access_M1 services_S8+ ._PUNC"],
            ["The_Z5 Civil_G1.1 Rights_A5.3+ Act_A1.1.1 of_Z5 1964_N1 and_Z5 the_Z5 Voting_G1.2 Rights_A5.3+ Act_A1.1.1 of_Z5 1965_N1 protect_S8+/A15+ your_Z8 linguistic_Q3 rights_S7.4+ ._PUNC"]
        ],
        inputs=text_input
    )
    
    # Wire the button: parse_tagged_text returns three strings
    # (visualization, statistics, legend), one per output component below.
    submit_btn.click(
        fn=parse_tagged_text,
        inputs=text_input,
        outputs=[tagged_output, stats_output, legend_output]
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()