yazoniak committed on
Commit
138dcf7
·
verified ·
1 Parent(s): 597a638

Initialized repo

Browse files
Files changed (2) hide show
  1. app.py +416 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio app for Polish Twitter Emotion Classifier.
3
+
4
+ This application provides an interactive interface for predicting emotions
5
+ and sentiment in Polish text using a fine-tuned RoBERTa model.
6
+
7
+ For private models, set the HF_TOKEN environment variable:
8
+ export HF_TOKEN=your_huggingface_token
9
+ """
10
+
11
+ import gradio as gr
12
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
13
+ import torch
14
+ import numpy as np
15
+ import json
16
+ import os
17
+ import re
18
+
19
+
20
# Model configuration
# Hugging Face Hub repository id of the classifier loaded by this app.
MODEL_NAME = "yazoniak/twitter-emotion-pl-classifier"
# Maximum number of tokens passed to the tokenizer (longer inputs are truncated).
MAX_LENGTH = 8192
# Threshold used by default when a single cut-off is applied to all labels.
DEFAULT_THRESHOLD = 0.5

# Authentication token for private models; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Emotion emojis for visual display, keyed by the model's label names.
LABEL_EMOJIS = {
    # emotions
    "radość": "😊",
    "wstręt": "🤢",
    "gniew": "😠",
    "przeczuwanie": "🤔",
    # sentiment
    "pozytywny": "👍",
    "negatywny": "👎",
    "neutralny": "😐",
    # special
    "sarkazm": "😏",
}
39
+
40
+
41
def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
    """
    Optionally mask @mentions in the input text.

    Args:
        text: Input text to preprocess
        anonymize_mentions: When true, every ``@`` followed by one or more
            word characters is replaced with ``@anonymized_account``

    Returns:
        The text with mentions masked, or the unchanged text when
        anonymization is disabled
    """
    if not anonymize_mentions:
        return text
    return re.sub(r"@\w+", "@anonymized_account", text)
55
+
56
+
57
def load_model():
    """
    Load the model, tokenizer, and calibration artifacts.

    For private models, requires HF_TOKEN environment variable to be set.

    Loading the calibration artifacts is best-effort: if the file cannot be
    downloaded, parsed, or does not contain a temperature and an optimal
    threshold for every model label, a warning is printed and ``None`` is
    returned in its place.

    Returns:
        tuple: (model, tokenizer, labels, calibration_artifacts) where
        ``labels`` is the list of label names ordered by class index and
        ``calibration_artifacts`` is a dict or ``None``
    """
    print(f"Loading model: {MODEL_NAME}")

    # Pass ``token`` only when one is configured, so public models are loaded
    # with exactly the same call as when no token handling is involved.
    if HF_TOKEN:
        print(f"Using authentication token for model: {MODEL_NAME}")
        auth_kwargs = {"token": HF_TOKEN}
    else:
        print(f"Loading public model: {MODEL_NAME}")
        auth_kwargs = {}

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, **auth_kwargs
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **auth_kwargs)

    model.eval()

    # Get label mappings from model config
    labels = [model.config.id2label[i] for i in range(model.config.num_labels)]

    # Try to load calibration artifacts
    calibration_artifacts = None
    try:
        # Imported here so a missing huggingface_hub only disables calibration
        from huggingface_hub import hf_hub_download

        calib_path = hf_hub_download(
            repo_id=MODEL_NAME, filename="calibration_artifacts.json", token=HF_TOKEN
        )
        # Explicit UTF-8: the artifacts are looked up by label name and labels
        # can contain non-ASCII characters (e.g. "radość"); the platform
        # default encoding is not guaranteed to decode them correctly.
        with open(calib_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)

        # Calibrated prediction indexes both sections by every label, so
        # reject incomplete artifacts now instead of failing per request.
        missing = [
            f"{section}[{label!r}]"
            for section in ("temperatures", "optimal_thresholds")
            for label in labels
            if label not in loaded.get(section, {})
        ]
        if missing:
            raise ValueError(
                f"calibration artifacts are incomplete, missing: {', '.join(missing)}"
            )

        calibration_artifacts = loaded
        print("✓ Calibration artifacts loaded")
    except Exception as e:
        # Deliberately broad: calibration is optional and must never prevent
        # the app from starting.
        print(f"⚠ Could not load calibration artifacts: {e}")
        print("  Calibrated mode will not be available")

    return model, tokenizer, labels, calibration_artifacts
101
+
102
+
103
# Load model at startup (module import time) so all requests share one instance
print("Loading model...")
model, tokenizer, labels, calibration_artifacts = load_model()
_label_listing = ", ".join(labels)
print(f"✓ Model loaded successfully with {len(labels)} labels")
print(f"  Labels: {_label_listing}")
108
+
109
+
110
def _format_score_section(
    heading,
    group,
    label_index,
    probabilities,
    predictions,
    used_thresholds,
    show_thresholds,
):
    """
    Render one Markdown section of the detailed score listing.

    Args:
        heading: Section title, e.g. "Emotions"
        group: Label names belonging to this section, in display order
        label_index: Mapping from label name to its position in the model output
        probabilities: Per-label probabilities, indexed by output position
        predictions: Per-label booleans (probability above its threshold)
        used_thresholds: Per-label thresholds that produced ``predictions``
        show_thresholds: Whether to append the threshold to each line

    Returns:
        Markdown text for the section; labels absent from ``label_index``
        are skipped
    """
    parts = [f"**{heading}:**\n\n"]
    for label in group:
        idx = label_index.get(label)
        if idx is None:
            continue
        emoji = LABEL_EMOJIS.get(label, "🏷️")
        status = "✓" if predictions[idx] else "·"
        thresh_info = (
            f" (threshold: {used_thresholds[idx]:.2f})" if show_thresholds else ""
        )
        parts.append(
            f"{status} {emoji} {label:15s}: {probabilities[idx]:.4f}{thresh_info}\n\n"
        )
    return "".join(parts)


def predict_emotions(
    text: str,
    mode: str = "Calibrated",
    threshold: float = DEFAULT_THRESHOLD,
    anonymize: bool = True,
) -> tuple[str, str]:
    """
    Predict emotions and sentiment for Polish text.

    Args:
        text: Input Polish text
        mode: Prediction mode ("Default" or "Calibrated"). Any value other
            than "Calibrated" is scored with the single-threshold path.
        threshold: Classification threshold (0-1) - used only in Default mode
        anonymize: Whether to anonymize @mentions

    Returns:
        tuple: (formatted_predictions, all_scores_json). On invalid input the
        first element is a warning message and the second is an empty string.
    """
    # Local import keeps this block self-contained.
    import html

    # Validate inputs
    if not text or not text.strip():
        return "⚠️ Please enter some text to analyze", ""

    # Preprocess text
    processed_text = preprocess_text(text, anonymize_mentions=anonymize)
    text_changed = processed_text != text

    # Validate mode
    calibrated = mode == "Calibrated"
    if calibrated and calibration_artifacts is None:
        return (
            "⚠️ Calibrated mode not available (calibration artifacts not found). Please use Default mode.",
            "",
        )

    # Validate threshold for default mode (the chained comparison also
    # rejects NaN, which the two separate comparisons would let through)
    if mode == "Default" and not 0 <= threshold <= 1:
        return "⚠️ Threshold must be between 0 and 1", ""

    # Tokenize
    inputs = tokenizer(
        processed_text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH
    )

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
    # Drop only the batch dimension (a single text is scored per call), so a
    # model with one label still yields a 1-D array that can be indexed.
    logits = outputs.logits[0].numpy()

    # Calculate probabilities based on mode
    if calibrated:
        temperatures = calibration_artifacts["temperatures"]
        optimal_thresholds = calibration_artifacts["optimal_thresholds"]

        # Temperature scaling: divide each logit by its label's temperature
        # before the sigmoid, then compare with the label's own threshold.
        temps = np.array([temperatures[label] for label in labels], dtype=float)
        used_thresholds = [optimal_thresholds[label] for label in labels]
        probabilities = 1 / (1 + np.exp(-(logits / temps)))
        predictions = probabilities > np.array(used_thresholds, dtype=float)
    else:  # Default mode
        probabilities = 1 / (1 + np.exp(-logits))
        predictions = probabilities > threshold
        used_thresholds = [threshold] * len(labels)

    # Get assigned labels
    assigned_labels = [label for label, hit in zip(labels, predictions) if hit]

    # Format output - Start with detected labels prominently
    parts = ["# Detected Labels\n\n"]

    # Assigned labels section
    if assigned_labels:
        for idx, label in enumerate(labels):
            if not predictions[idx]:
                continue
            emoji = LABEL_EMOJIS.get(label, "🏷️")
            parts.append(f"## {emoji} **{label}** `{probabilities[idx]:.1%}`\n\n")
    else:
        parts.append("## No Labels Detected\n\n")
        parts.append("All confidence scores are below the threshold(s).\n\n")

    parts.append("---\n\n")

    # Additional details - Less prominent
    parts.append("<details>\n")
    parts.append("<summary><b>📊 All Scores (click to expand)</b></summary>\n\n")

    # The text comes straight from the user and is rendered as Markdown that
    # may contain HTML; escape &, < and > so it cannot inject or break markup
    # (for example by closing the surrounding <details> element).
    if text_changed:
        parts.append(
            f"**Preprocessed text:** _{html.escape(processed_text, quote=False)}_\n\n"
        )

    parts.append(f"**Original text:** {html.escape(text, quote=False)}\n\n")
    parts.append(f"**Mode:** {mode}")
    if mode == "Default":
        parts.append(f" (threshold: {threshold:.2f})")
    parts.append("\n\n")

    # Categorize labels and render one section per category
    label_index = {label: i for i, label in enumerate(labels)}
    sections = (
        ("Emotions", ["radość", "wstręt", "gniew", "przeczuwanie"]),
        ("Sentiment", ["pozytywny", "negatywny", "neutralny"]),
        ("Special", ["sarkazm"]),
    )
    for heading, group in sections:
        parts.append(
            _format_score_section(
                heading,
                group,
                label_index,
                probabilities,
                predictions,
                used_thresholds,
                show_thresholds=calibrated,
            )
        )

    parts.append("</details>")
    result_text = "".join(parts)

    # Create JSON output
    details = {
        "assigned_labels": assigned_labels,
        "all_scores": {
            label: float(probabilities[i]) for i, label in enumerate(labels)
        },
        "mode": mode,
        "text_length": len(text),
        "preprocessed": text_changed,
    }

    if calibrated:
        details["temperatures"] = calibration_artifacts["temperatures"]
        details["optimal_thresholds"] = calibration_artifacts["optimal_thresholds"]
    else:
        details["threshold"] = threshold

    all_scores_json = json.dumps(details, indent=2, ensure_ascii=False)

    return result_text, all_scores_json
282
+
283
+
284
# Example inputs: one row per example, one value per input component
_EXAMPLE_TWEET = (
    "@zgp_intervillage Uwielbiam czekać na peronie 3 godziny! Gratulacje dla #zgp"
)
examples = [[_EXAMPLE_TWEET]]
288
+
289
+
290
# Create Gradio interface
# The page consists of an introduction, an input column and an output column,
# the button wiring, an examples widget, and a reference section.
# NOTE(review): this block was reconstructed from a listing that had lost all
# leading indentation. The nesting of the layout contexts below and any
# relative indentation inside the triple-quoted Markdown (e.g. sub-bullets,
# BibTeX fields) could not be recovered - confirm against the original file.
with gr.Blocks(
    title="Polish Twitter Emotion Classifier", theme=gr.themes.Soft()
) as demo:
    # Introduction shown at the top of the page
    gr.Markdown("""
    # 🎭 Polish Twitter Emotion Classifier

    This model predicts emotions and sentiment in Polish text using a fine-tuned **[PKOBP/polish-roberta-8k](https://huggingface.co/PKOBP/polish-roberta-8k)** model.

    **Detected labels:**
    - **Emotions**: 😊 radość (joy), 🤢 wstręt (disgust), 😠 gniew (anger), 🤔 przeczuwanie (anticipation)
    - **Sentiment**: 👍 pozytywny (positive), 👎 negatywny (negative), 😐 neutralny (neutral)
    - **Special**: 😏 sarkazm (sarcasm)

    The model uses **multi-label classification** - text can have multiple emotions/sentiments simultaneously.
    """)

    with gr.Row():
        # Input controls
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Tweet to Analyze",
                placeholder="e.g., Wspaniały dzień! Jestem bardzo szczęśliwy :)",
                lines=4,
            )

            with gr.Row():
                # Choice values must match the mode strings compared in
                # predict_emotions ("Calibrated" / "Default")
                mode_input = gr.Radio(
                    choices=["Calibrated", "Default"],
                    value="Calibrated",
                    label="Prediction Mode",
                    info="Calibrated uses optimal thresholds per label (recommended)",
                )

                anonymize_input = gr.Checkbox(
                    value=True,
                    label="Anonymize @mentions",
                    info="Replace @username with @anonymized_account",
                )

            threshold_input = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=DEFAULT_THRESHOLD,
                step=0.05,
                label="Threshold (Default mode only)",
                info="Only used when Default mode is selected",
            )

            predict_btn = gr.Button("Analyze Emotions", variant="primary", size="lg")

        # Result widgets
        with gr.Column(scale=3):
            prediction_output = gr.Markdown(label="Predictions")

            with gr.Accordion("Detailed JSON Output", open=False):
                json_output = gr.Code(label="Full Prediction Details", language="json")

    # Connect the button
    # Input order follows predict_emotions' positional parameters
    # (text, mode, threshold, anonymize); the two outputs receive its
    # (formatted_predictions, all_scores_json) return values.
    predict_btn.click(
        fn=predict_emotions,
        inputs=[text_input, mode_input, threshold_input, anonymize_input],
        outputs=[prediction_output, json_output],
    )

    # Examples section
    # Only the text box is listed as an input, so an example supplies just the
    # text; predict_emotions' other parameters have defaults.
    gr.Markdown("### Example Input")
    gr.Examples(
        examples=examples,
        inputs=[text_input],
        outputs=[prediction_output, json_output],
        fn=predict_emotions,
        cache_examples=False,
    )

    # Static reference documentation rendered below the interactive part
    gr.Markdown("""
    ---
    ### Model Performance

    | Metric | Validation Score |
    |--------|------------------|
    | F1 Macro | 0.85 |
    | F1 Micro | 0.89 |
    | F1 Weighted | 0.89 |
    | Subset Accuracy | 0.89 |

    ### How to Use

    1. **Enter Polish text**: Paste a tweet, social media post, or any Polish text
    2. **Select mode**:
    - **Calibrated** (recommended): Uses temperature scaling and optimal thresholds per label
    - **Default**: Uses a single threshold for all labels
    3. **Adjust settings**: Toggle mention anonymization, adjust threshold (Default mode)
    4. **Click Analyze**: Get emotion and sentiment predictions with confidence scores

    ### Prediction Modes

    - **Calibrated Mode** (Recommended): Uses temperature scaling and label-specific optimal thresholds for better accuracy and calibration. This mode is recommended for most use cases.
    - **Default Mode**: Uses sigmoid activation with a single threshold across all labels. Useful for quick predictions or when you want uniform threshold control.

    ### Limitations

    - Model is trained on Polish Twitter data and works best with informal social media text
    - May not generalize well to formal Polish text (news, academic writing)
    - Optimal for tweet-length texts (not very long documents)
    - Multi-label nature means texts can have seemingly contradictory labels (e.g., sarkazm + pozytywny)

    ### Citation

    If you use this model, please cite:
    ```bibtex
    @model{yazoniak2025twitteremotionpl,
    author = {yazoniak},
    title = {Polish Twitter Emotion Classifier},
    year = {2025},
    publisher = {Hugging Face},
    url = {https://huggingface.co/yazoniak/twitter-emotion-pl-classifier}
    }
    ```

    ### 📄 License

    GPL-3.0 License
    """)
412
+
413
+
414
# Launch the app
# Guarded so the server starts only when this file is executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.30.0
3
+ torch>=2.0.0
4
+ numpy>=1.24.0
5
+ huggingface_hub>=0.16.0
6
+