Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,6 +5,101 @@ from datetime import datetime
|
|
| 5 |
from typing import Dict, List, Tuple
|
| 6 |
import hashlib
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# Dummy dataset - replace with actual HuggingFace dataset loading
|
| 9 |
DUMMY_DATASET = [
|
| 10 |
{
|
|
@@ -227,42 +322,28 @@ custom_css = """
|
|
| 227 |
"""
|
| 228 |
|
| 229 |
# Create Gradio interface
|
| 230 |
-
with gr.Blocks(theme=gr.themes.Soft(), title="Dataset Annotation Tool", css=custom_css) as app:
|
| 231 |
gr.Markdown("# Norwegian Fluency Annotation")
|
| 232 |
with gr.Accordion("▶ Click here to see the full annotation guidelines:", open=False):
|
| 233 |
-
|
| 234 |
-
## Detailed Information
|
| 235 |
-
|
| 236 |
-
This content is hidden by default and can be expanded.
|
| 237 |
-
|
| 238 |
-
- Point 1
|
| 239 |
-
- Point 2
|
| 240 |
-
- Point 3
|
| 241 |
-
|
| 242 |
-
You can put any Gradio components here, including:
|
| 243 |
-
- Markdown
|
| 244 |
-
- Code blocks
|
| 245 |
-
- Images
|
| 246 |
-
- Interactive components
|
| 247 |
-
""", padding=True)
|
| 248 |
|
| 249 |
user_state = gr.State("")
|
| 250 |
|
| 251 |
# Login Interface
|
| 252 |
-
with gr.
|
| 253 |
-
gr.
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
|
| 264 |
# Annotation Interface
|
| 265 |
-
with gr.
|
| 266 |
progress_label = gr.Markdown("")
|
| 267 |
|
| 268 |
# Row 1: Prompt
|
|
|
|
| 5 |
from typing import Dict, List, Tuple
|
| 6 |
import hashlib
|
| 7 |
|
| 8 |
+
|
| 9 |
+
guideline = """
|
| 10 |
+
## Overview
|
| 11 |
+
|
| 12 |
+
This document provides guidelines for evaluating the fluency of Norwegian responses generated by language models. Annotators will compare pairs of responses (Response A and Response B) and determine which response demonstrates better fluency, or if they are equally fluent. The evaluation focuses exclusively on language quality, naturalness, and grammaticality.
|
| 13 |
+
|
| 14 |
+
## Key principle
|
| 15 |
+
|
| 16 |
+
**Fluency evaluation is strictly limited to linguistic quality.** Do NOT consider:
|
| 17 |
+
- Factual accuracy or correctness
|
| 18 |
+
- Completeness of information
|
| 19 |
+
- Creativity or originality
|
| 20 |
+
- Formatting or structure (unless it affects readability)
|
| 21 |
+
- Length or conciseness
|
| 22 |
+
|
| 23 |
+
## Definitions
|
| 24 |
+
|
| 25 |
+
### What is fluency?
|
| 26 |
+
|
| 27 |
+
Fluency refers to the linguistic quality of text that makes it natural, smooth, and easy to read. A fluent response is:
|
| 28 |
+
|
| 29 |
+
- **Grammatically correct**: Follows standard grammar rules with proper syntax
|
| 30 |
+
- **Natural-sounding**: Reads like something a native speaker would write
|
| 31 |
+
- **Coherent**: Maintains logical flow between sentences and paragraphs
|
| 32 |
+
- **Well-formed**: Uses appropriate vocabulary, punctuation, and sentence structure
|
| 33 |
+
- **Smooth**: Flows naturally without awkward phrasing or jarring transitions
|
| 34 |
+
- **Norwegian**: The models are responding to Norwegian prompts, so their responses should always be written in either Norwegian Bokmål or Norwegian Nynorsk
|
| 35 |
+
|
| 36 |
+
### Fluency issues to look for
|
| 37 |
+
|
| 38 |
+
When evaluating fluency, pay attention to:
|
| 39 |
+
|
| 40 |
+
1. **Grammar errors**: Subject-verb disagreement, incorrect tense, wrong word forms
|
| 41 |
+
2. **Awkward phrasing**: Unnatural word order, stilted expressions, robotic language
|
| 42 |
+
3. **Punctuation problems**: Missing or incorrect punctuation that affects readability
|
| 43 |
+
4. **Word choice issues**: Inappropriate vocabulary, incorrect word usage, repetitive language
|
| 44 |
+
5. **Sentence structure problems**: Run-on sentences, fragments, unclear pronoun references
|
| 45 |
+
6. **Flow disruptions**: Abrupt transitions, disconnected ideas within sentences
|
| 46 |
+
7. **Spelling errors**: Typos and misspellings that affect readability
|
| 47 |
+
8. **Translationese**: A common problem with language models is that they base their output on English -- the majority language in their training corpus. This can result in unnatural language patterns that look like literal translations from English, such as: TODO
|
| 48 |
+
|
| 49 |
+
## Annotation procedure
|
| 50 |
+
|
| 51 |
+
### Step-by-step process
|
| 52 |
+
|
| 53 |
+
1. **Read both responses completely** without making immediate judgments
|
| 54 |
+
2. **Focus solely on language quality** - ignore content accuracy and relevance
|
| 55 |
+
3. **Identify fluency issues** in each response using the criteria above
|
| 56 |
+
4. **Compare the severity and frequency** of fluency issues between responses
|
| 57 |
+
5. **Make your decision** based on overall fluency
|
| 58 |
+
|
| 59 |
+
### Decision options
|
| 60 |
+
|
| 61 |
+
You must select one of three options:
|
| 62 |
+
|
| 63 |
+
- **A is more fluent**: Response A has better overall language quality than Response B
|
| 64 |
+
- **B is more fluent**: Response B has better overall language quality than Response A
|
| 65 |
+
- **Equal fluency**: Both responses have similar language quality (minor differences that don't clearly favor either response)
|
| 66 |
+
|
| 67 |
+
### Important guidelines
|
| 68 |
+
|
| 69 |
+
- **Minor differences matter**: Even small improvements in fluency should influence your decision
|
| 70 |
+
- **Consider the overall impression**: Multiple minor issues may outweigh a single major issue
|
| 71 |
+
- **Be consistent**: Apply the same standards across all evaluations
|
| 72 |
+
- **When in doubt about equality**: If you cannot decisively determine which is better after careful analysis, select "Equal fluency"
|
| 73 |
+
|
| 74 |
+
## Examples
|
| 75 |
+
|
| 76 |
+
### Example 1: Clear fluency difference
|
| 77 |
+
|
| 78 |
+
TODO
|
| 79 |
+
|
| 80 |
+
### Example 2: Equal fluency
|
| 81 |
+
|
| 82 |
+
TODO
|
| 83 |
+
|
| 84 |
+
### Example 3: Subtle fluency difference
|
| 85 |
+
|
| 86 |
+
TODO
|
| 87 |
+
|
| 88 |
+
### Example 4: Content vs. fluency
|
| 89 |
+
|
| 90 |
+
TODO
|
| 91 |
+
|
| 92 |
+
## Edge cases and special considerations
|
| 93 |
+
|
| 94 |
+
TODO
|
| 95 |
+
|
| 96 |
+
**Technical or specialized language**: Technical terminology and domain-specific language should be considered fluent if used correctly and consistently, even if it might seem less natural to a general audience.
|
| 97 |
+
|
| 98 |
+
**Formatting issues**: Ignore formatting differences (bold, italics, bullet points) unless they directly impact readability or sentence structure.
|
| 99 |
+
|
| 100 |
+
**Code or mathematical expressions**: If responses contain code snippets or mathematical expressions, evaluate only the fluency of the natural language portions.
|
| 101 |
+
"""
|
| 102 |
+
|
| 103 |
# Dummy dataset - replace with actual HuggingFace dataset loading
|
| 104 |
DUMMY_DATASET = [
|
| 105 |
{
|
|
|
|
| 322 |
"""
|
| 323 |
|
| 324 |
# Create Gradio interface
|
| 325 |
+
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial"]), title="Dataset Annotation Tool", css=custom_css) as app:
|
| 326 |
gr.Markdown("# Norwegian Fluency Annotation")
|
| 327 |
with gr.Accordion("▶ Click here to see the full annotation guidelines:", open=False):
|
| 328 |
+
gr.Markdown(guideline, padding=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
user_state = gr.State("")
|
| 331 |
|
| 332 |
# Login Interface
|
| 333 |
+
with gr.Column(visible=True, elem_id="login-group") as login_interface:
|
| 334 |
+
with gr.Group():
|
| 335 |
+
gr.Markdown("## Login", padding=True)
|
| 336 |
+
user_id_input = gr.Textbox(
|
| 337 |
+
label="Enter your unique annotator ID to begin",
|
| 338 |
+
placeholder="Annotator ID"
|
| 339 |
+
)
|
| 340 |
+
with gr.Row():
|
| 341 |
+
login_btn = gr.Button("Login", variant="primary", scale=0.2, min_width=100)
|
| 342 |
+
gr.HTML("")
|
| 343 |
+
login_status = gr.Markdown("", padding=True)
|
| 344 |
|
| 345 |
# Annotation Interface
|
| 346 |
+
with gr.Column(visible=False, elem_id="annotation-group") as annotation_interface:
|
| 347 |
progress_label = gr.Markdown("")
|
| 348 |
|
| 349 |
# Row 1: Prompt
|