Spaces:
Sleeping
Sleeping
Upload app.py with huggingface_hub
Browse files
app.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Swedish Text Complexity Analyzer
|
| 3 |
+
Uppsala NLP - Hugging Face Space
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import math
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
def calculate_lix(text):
    """Calculate LIX, OVIX and basic counts for a piece of text.

    Words are maximal runs of the letters a-z plus å, ä, ö, é and ü, taken
    from the lowercased text.  Sentences are the fragments between runs of
    ``.``, ``!`` and ``?`` that contain at least one such word, so
    number-only or punctuation-only fragments (e.g. the "123" in
    "Hej. 123. Då.") are not counted as sentences.

    Args:
        text: The text to analyze.  ``None`` and the empty string are
            accepted and treated as "nothing to analyze".

    Returns:
        ``None`` when the text contains no words; otherwise a dict with the
        keys "LIX Score", "Category", "OVIX (Lexical Variation)",
        "Avg Sentence Length", "Avg Word Length", "Long Words (>6 chars)",
        "Total Words", "Total Sentences" and "Unique Words".
    """
    if not text:
        return None

    word_pattern = re.compile(r'[a-zA-ZåäöÅÄÖéÉüÜ]+')

    # Tokenize words and sentences from the same lowercased string so that
    # "has words" and "has a sentence containing a word" always agree.
    lowered = text.lower()
    words = word_pattern.findall(lowered)
    sentences = [
        fragment
        for fragment in re.split(r'[.!?]+', lowered)
        if word_pattern.search(fragment)
    ]

    if not words or not sentences:
        return None

    num_words = len(words)
    num_sentences = len(sentences)
    num_long_words = sum(1 for w in words if len(w) > 6)
    num_unique = len(set(words))
    num_chars = sum(len(w) for w in words)

    # LIX = average sentence length + percentage of words longer than 6 letters.
    avg_sentence_length = num_words / num_sentences
    long_word_pct = num_long_words * 100 / num_words
    lix = avg_sentence_length + long_word_pct

    # OVIX = ln(tokens) / ln(2 - ln(types) / ln(tokens)).
    # Only computed for texts of at least 10 words; it falls back to 0 when
    # every word is distinct (the inner term is then 1 and ln(1) == 0 would
    # divide by zero).
    ovix = 0
    if num_words >= 10 and num_unique > 1:
        log_tokens = math.log(num_words)
        log_types = math.log(num_unique)
        denom = 2 - (log_types / log_tokens)
        if denom > 1:
            ovix = log_tokens / math.log(denom)

    # Category: first band whose exclusive upper bound exceeds the score.
    bands = (
        (25, "Mycket lätt / Very Easy"),
        (30, "Lätt / Easy"),
        (40, "Medel / Medium"),
        (50, "Svår / Difficult"),
        (60, "Mycket svår / Very Difficult"),
    )
    cat = next(
        (label for upper, label in bands if lix < upper),
        "Extremt svår / Extremely Difficult",
    )

    return {
        "LIX Score": round(lix, 1),
        "Category": cat,
        "OVIX (Lexical Variation)": round(ovix, 1),
        "Avg Sentence Length": round(avg_sentence_length, 1),
        "Avg Word Length": round(num_chars / num_words, 2),
        "Long Words (>6 chars)": f"{round(long_word_pct, 1)}%",
        "Total Words": num_words,
        "Total Sentences": num_sentences,
        "Unique Words": num_unique,
    }
|
| 57 |
+
|
| 58 |
+
def analyze_text(text):
    """Render a Markdown readability report for the given text.

    Args:
        text: The text typed into the UI.  ``None`` is treated the same as
            blank input instead of raising on ``.strip()``.

    Returns:
        A Markdown string: a prompt when the input is missing or blank, an
        error message when ``calculate_lix`` yields no result, or otherwise
        a headline with the LIX score and category, a table with every
        computed metric, and a static LIX scale reference table.
    """
    if text is None or not text.strip():
        return "Please enter some Swedish text to analyze."

    results = calculate_lix(text)
    if not results:
        return "Could not analyze text. Please enter valid Swedish text."

    # Collect the pieces and join once rather than growing a string with +=.
    parts = [
        "## 📊 Analysis Results\n\n",
        f"### LIX: {results['LIX Score']} ({results['Category']})\n\n",
        "| Metric | Value |\n|--------|-------|\n",
    ]
    parts.extend(f"| {k} | {v} |\n" for k, v in results.items())
    parts.extend([
        "\n### 📖 LIX Scale Reference\n",
        "| Score | Level | Example |\n|-------|-------|--------|\n",
        "| < 25 | Very Easy | Children's books |\n",
        "| 25-30 | Easy | Simple fiction |\n",
        "| 30-40 | Medium | Newspapers |\n",
        "| 40-50 | Difficult | Official documents |\n",
        "| 50-60 | Very Difficult | Academic texts |\n",
        "| > 60 | Extremely Difficult | Legal/technical |\n",
    ])

    return "".join(parts)
|
| 82 |
+
|
| 83 |
+
# Sample Swedish passages offered as ready-made inputs in the UI.  Each row is
# a one-element list because the interface takes a single text input.
examples = [
    [passage]
    for passage in (
        "Solen skiner. Kalle går ut. Han ser en hund. Hunden är glad. De leker tillsammans i parken.",
        "Regeringen presenterade igår ett nytt förslag om klimatåtgärder. Enligt statsministern ska Sverige minska sina utsläpp med femtio procent till år 2030.",
        "Den epistemologiska problematiken kring vetenskaplig objektivitet har genomgått betydande transformationer under det senaste århundradet. Poststrukturalistiska perspektiv har ifrågasatt fundamentala antaganden om kunskapsproduktionens neutralitet.",
    )
]
|
| 88 |
+
|
| 89 |
+
# Single-input / single-output Gradio interface: free text in, Markdown
# report out, with analyze_text as the callback.
demo = gr.Interface(
    title="🇸🇪 Swedish Text Complexity Analyzer",
    description="""
Analyze the readability of Swedish text using established linguistic metrics.

**Metrics:**
- **LIX** (Läsbarhetsindex): Standard Swedish readability formula by Carl-Hugo Björnsson (1968)
- **OVIX** (Ordvariationsindex): Lexical variation measure

Part of [Uppsala NLP](https://huggingface.co/UppsalaNLP) research tools.
""",
    fn=analyze_text,
    inputs=gr.Textbox(
        lines=8,
        label="Swedish Text",
        placeholder="Enter Swedish text here...",
    ),
    outputs=gr.Markdown(label="Results"),
    examples=examples,
    theme=gr.themes.Soft(),
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|