# birgermoell: "Upload app.py with huggingface_hub" (72b5426, verified)
"""
Swedish Text Complexity Analyzer
Uppsala NLP - Hugging Face Space
"""
import gradio as gr
import math
import re
def calculate_lix(text):
    """Calculate LIX and related readability metrics for a text.

    Words are runs of letters (lower-cased before counting); sentences are
    the fragments between runs of ``.``, ``!`` or ``?`` that contain at
    least one word.

    Args:
        text: The text to analyze.

    Returns:
        A dict mapping metric labels to values (LIX score, category label,
        OVIX, average sentence/word length, share of long words, and the
        word, sentence and unique-word counts), or ``None`` when the text
        contains no words.
    """
    # Any run of Unicode letters is a word. A fixed letter list would split
    # words at every letter it does not know (e.g. "Møller" -> "m", "ller").
    word_pattern = r'[^\W\d_]+'
    words = re.findall(word_pattern, text.lower())

    # Fragments made only of digits/symbols (e.g. "123" in "Hej. 123.") are
    # not sentences; counting them would deflate the average sentence length.
    sentences = [
        fragment.strip()
        for fragment in re.split(r'[.!?]+', text)
        if re.search(word_pattern, fragment)
    ]
    if not words or not sentences:
        return None

    num_words = len(words)
    num_sentences = len(sentences)
    num_long_words = sum(1 for w in words if len(w) > 6)
    num_unique = len(set(words))
    num_chars = sum(len(w) for w in words)

    avg_sentence_length = num_words / num_sentences
    long_word_percent = num_long_words * 100 / num_words

    # LIX = average sentence length + percentage of words longer than 6 letters.
    lix = avg_sentence_length + long_word_percent

    # OVIX = ln(tokens) / ln(2 - ln(types) / ln(tokens)).
    # Only computed for texts of at least 10 words with more than one type.
    ovix = 0
    if num_words >= 10 and num_unique > 1:
        log_tokens = math.log(num_words)
        log_types = math.log(num_unique)
        denom = 2 - (log_types / log_tokens)
        # When every word is unique denom == 1 and ln(1) == 0, so the
        # formula is undefined; 0 is reported in that case.
        if denom > 1:
            ovix = log_tokens / math.log(denom)

    # Map the LIX score onto the bilingual difficulty label.
    if lix < 25:
        cat = "Mycket lätt / Very Easy"
    elif lix < 30:
        cat = "Lätt / Easy"
    elif lix < 40:
        cat = "Medel / Medium"
    elif lix < 50:
        cat = "Svår / Difficult"
    elif lix < 60:
        cat = "Mycket svår / Very Difficult"
    else:
        cat = "Extremt svår / Extremely Difficult"

    return {
        "LIX Score": round(lix, 1),
        "Category": cat,
        "OVIX (Lexical Variation)": round(ovix, 1),
        "Avg Sentence Length": round(avg_sentence_length, 1),
        "Avg Word Length": round(num_chars / num_words, 2),
        "Long Words (>6 chars)": f"{round(long_word_percent, 1)}%",
        "Total Words": num_words,
        "Total Sentences": num_sentences,
        "Unique Words": num_unique,
    }
def analyze_text(text):
    """Render the readability analysis of *text* as a Markdown report.

    Args:
        text: The text entered by the user; ``None`` is treated like an
            empty string.

    Returns:
        A Markdown string with the LIX headline, a table of every metric
        returned by ``calculate_lix`` and a static LIX scale reference, or a
        plain message when the input is empty or contains no analyzable words.
    """
    # Guard against None as well as blank input so .strip() cannot raise.
    if not text or not text.strip():
        return "Please enter some Swedish text to analyze."

    results = calculate_lix(text)
    if not results:
        return "Could not analyze text. Please enter valid Swedish text."

    # One Markdown table row per metric, in the order the metrics are reported.
    metric_rows = [f"| {metric} | {value} |\n" for metric, value in results.items()]

    parts = [
        "## 📊 Analysis Results\n\n",
        f"### LIX: {results['LIX Score']} ({results['Category']})\n\n",
        "| Metric | Value |\n|--------|-------|\n",
        *metric_rows,
        "\n### 📖 LIX Scale Reference\n",
        "| Score | Level | Example |\n|-------|-------|--------|\n",
        "| < 25 | Very Easy | Children's books |\n",
        "| 25-30 | Easy | Simple fiction |\n",
        "| 30-40 | Medium | Newspapers |\n",
        "| 40-50 | Difficult | Official documents |\n",
        "| 50-60 | Very Difficult | Academic texts |\n",
        "| > 60 | Extremely Difficult | Legal/technical |\n",
    ]
    return "".join(parts)
# Sample inputs offered in the UI, ordered from simple to demanding prose.
# Each entry is a one-element list: one value for the single text input.
examples = [
    [sample]
    for sample in (
        # Short everyday sentences.
        "Solen skiner. Kalle går ut. Han ser en hund. Hunden är glad. "
        "De leker tillsammans i parken.",
        # News-style prose.
        "Regeringen presenterade igår ett nytt förslag om klimatåtgärder. "
        "Enligt statsministern ska Sverige minska sina utsläpp med femtio "
        "procent till år 2030.",
        # Academic prose with many long words.
        "Den epistemologiska problematiken kring vetenskaplig objektivitet "
        "har genomgått betydande transformationer under det senaste "
        "århundradet. Poststrukturalistiska perspektiv har ifrågasatt "
        "fundamentala antaganden om kunskapsproduktionens neutralitet.",
    )
]
# Gradio UI: a single text box in, a Markdown report out.
demo = gr.Interface(
    # Callback and its input/output components.
    fn=analyze_text,
    inputs=gr.Textbox(
        label="Swedish Text",
        placeholder="Enter Swedish text here...",
        lines=8,
    ),
    outputs=gr.Markdown(label="Results"),
    # Page header and descriptive text.
    title="🇸🇪 Swedish Text Complexity Analyzer",
    description=(
        "\n"
        "Analyze the readability of Swedish text using established linguistic metrics.\n"
        "**Metrics:**\n"
        "- **LIX** (Läsbarhetsindex): Standard Swedish readability formula by Carl-Hugo Björnsson (1968)\n"
        "- **OVIX** (Ordvariationsindex): Lexical variation measure\n"
        "Part of [Uppsala NLP](https://huggingface.co/UppsalaNLP) research tools.\n"
    ),
    # Clickable sample inputs and visual theme.
    examples=examples,
    theme=gr.themes.Soft(),
)

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()