Spaces:

UppsalaNLP
/

swedish-text-complexity

Sleeping

App Files Files Community

birgermoell committed on Jan 23

Commit

72b5426

verified ·

1 Parent(s): 5e69d4b

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +112 -0

app.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""
+Swedish Text Complexity Analyzer
+Uppsala NLP - Hugging Face Space
+"""
+import gradio as gr
+import math
+import re
# A word is any run of Unicode letters ("word characters" minus digits and
# underscore), so letters outside the Swedish alphabet (ï, à, ø, ...) no longer
# split a word in two.
_WORD_RE = re.compile(r"[^\W\d_]+")

# One or more consecutive sentence-final punctuation marks ("...", "?!").
_SENTENCE_END_RE = re.compile(r"[.!?]+")

# Exclusive upper bound of each LIX band, in ascending order.
_LIX_BANDS = (
    (25, "Mycket lätt / Very Easy"),
    (30, "Lätt / Easy"),
    (40, "Medel / Medium"),
    (50, "Svår / Difficult"),
    (60, "Mycket svår / Very Difficult"),
)
_LIX_TOP_BAND = "Extremt svår / Extremely Difficult"


def _lix_category(lix):
    """Return the bilingual difficulty label for a LIX score."""
    for upper_bound, label in _LIX_BANDS:
        if lix < upper_bound:
            return label
    return _LIX_TOP_BAND


def _ovix(num_tokens, num_types):
    """Return OVIX = ln(tokens) / ln(2 - ln(types) / ln(tokens)).

    Returns 0 as a "not computed" value when there are fewer than 10 tokens,
    at most one distinct word, or when every token is distinct (the
    denominator would be ln(1) == 0).
    """
    if num_tokens < 10 or num_types <= 1:
        return 0
    log_tokens = math.log(num_tokens)
    denom = 2 - (math.log(num_types) / log_tokens)
    return log_tokens / math.log(denom) if denom > 1 else 0


def calculate_lix(text):
    """Calculate LIX and other Swedish readability metrics.

    Args:
        text: The text to analyze.

    Returns:
        ``None`` if the text contains no words. Otherwise a dict with the
        keys "LIX Score", "Category", "OVIX (Lexical Variation)",
        "Avg Sentence Length", "Avg Word Length", "Long Words (>6 chars)",
        "Total Words", "Total Sentences" and "Unique Words", in that order.
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    # Compose decomposed characters ("a" + combining ring -> "å") so that
    # text pasted from sources emitting NFD is not split at every å/ä/ö.
    normalized = unicodedata.normalize("NFC", text)

    words = _WORD_RE.findall(normalized.lower())
    # Only segments that contain at least one word count as sentences;
    # fragments such as "123." or stray punctuation would otherwise inflate
    # the sentence count and deflate LIX.
    num_sentences = sum(
        1
        for segment in _SENTENCE_END_RE.split(normalized)
        if _WORD_RE.search(segment)
    )
    if not words or not num_sentences:
        return None

    num_words = len(words)
    num_long_words = sum(1 for w in words if len(w) > 6)
    num_unique = len(set(words))
    num_chars = sum(len(w) for w in words)

    avg_sentence_length = num_words / num_sentences
    long_word_pct = num_long_words * 100 / num_words

    # LIX = average sentence length + percentage of long words.
    lix = avg_sentence_length + long_word_pct
    ovix = _ovix(num_words, num_unique)

    return {
        "LIX Score": round(lix, 1),
        "Category": _lix_category(lix),
        "OVIX (Lexical Variation)": round(ovix, 1),
        "Avg Sentence Length": round(avg_sentence_length, 1),
        "Avg Word Length": round(num_chars / num_words, 2),
        "Long Words (>6 chars)": f"{round(long_word_pct, 1)}%",
        "Total Words": num_words,
        "Total Sentences": num_sentences,
        "Unique Words": num_unique,
    }
# Static Markdown appended to every report: the LIX band reference table.
_LIX_SCALE_REFERENCE = (
    "\n### 📖 LIX Scale Reference\n"
    "| Score | Level | Example |\n|-------|-------|--------|\n"
    "| < 25 | Very Easy | Children's books |\n"
    "| 25-30 | Easy | Simple fiction |\n"
    "| 30-40 | Medium | Newspapers |\n"
    "| 40-50 | Difficult | Official documents |\n"
    "| 50-60 | Very Difficult | Academic texts |\n"
    "| > 60 | Extremely Difficult | Legal/technical |\n"
)


def analyze_text(text):
    """Render a Markdown readability report for ``text``.

    Args:
        text: The text to analyze. ``None``, empty, or whitespace-only input
            is treated as "nothing entered".

    Returns:
        A Markdown string: a LIX headline, a table with one row per metric
        returned by ``calculate_lix``, and a LIX scale reference table. A
        plain prompt or error message is returned instead when the input is
        empty or ``calculate_lix`` yields no result.
    """
    # Guard against None as well as blank strings so a cleared input or a
    # null API payload gets the friendly prompt instead of AttributeError.
    if not text or not text.strip():
        return "Please enter some Swedish text to analyze."

    results = calculate_lix(text)
    if not results:
        return "Could not analyze text. Please enter valid Swedish text."

    parts = [
        "## 📊 Analysis Results\n\n",
        f"### LIX: {results['LIX Score']} ({results['Category']})\n\n",
        "| Metric | Value |\n|--------|-------|\n",
    ]
    parts.extend(f"| {metric} | {value} |\n" for metric, value in results.items())
    parts.append(_LIX_SCALE_REFERENCE)
    return "".join(parts)
# Sample Swedish passages passed to the interface as selectable examples.
# Each inner list holds the value for the single text input.
examples = [
    ["Solen skiner. Kalle går ut. Han ser en hund. Hunden är glad. De leker tillsammans i parken."],
    ["Regeringen presenterade igår ett nytt förslag om klimatåtgärder. Enligt statsministern ska Sverige minska sina utsläpp med femtio procent till år 2030."],
    ["Den epistemologiska problematiken kring vetenskaplig objektivitet har genomgått betydande transformationer under det senaste århundradet. Poststrukturalistiska perspektiv har ifrågasatt fundamentala antaganden om kunskapsproduktionens neutralitet."],
]

# Markdown blurb passed as the interface description.
_DESCRIPTION = """
    Analyze the readability of Swedish text using established linguistic metrics.
    **Metrics:**
    - **LIX** (Läsbarhetsindex): Standard Swedish readability formula by Carl-Hugo Björnsson (1968)
    - **OVIX** (Ordvariationsindex): Lexical variation measure
    Part of [Uppsala NLP](https://huggingface.co/UppsalaNLP) research tools.
    """

# Input component: a multi-line text box for the passage to analyze.
_text_input = gr.Textbox(
    label="Swedish Text",
    placeholder="Enter Swedish text here...",
    lines=8,
)

# Output component: renders the Markdown report returned by analyze_text.
_results_view = gr.Markdown(label="Results")

# The app object: wires analyze_text between the input and output components.
demo = gr.Interface(
    fn=analyze_text,
    inputs=_text_input,
    outputs=_results_view,
    title="🇸🇪 Swedish Text Complexity Analyzer",
    description=_DESCRIPTION,
    examples=examples,
    theme=gr.themes.Soft(),
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()