Spaces:

UppsalaNLP
/

swedish-text-complexity

Sleeping

File size: 4,027 Bytes

72b5426

"""
Swedish Text Complexity Analyzer
Uppsala NLP - Hugging Face Space
"""

import gradio as gr
import math
import re

def calculate_lix(text):
    """Calculate LIX and other Swedish readability metrics.

    LIX is the average number of words per sentence plus the percentage of
    long words (more than six letters). OVIX is computed as
    ``log(tokens) / log(2 - log(types) / log(tokens))``.

    Args:
        text: The text to analyze.

    Returns:
        A dict mapping display labels to metric values, or ``None`` when the
        text contains no words. "OVIX (Lexical Variation)" is reported as 0
        whenever it cannot be computed: fewer than ten words, a single
        distinct word, or every word distinct (the formula's denominator
        would be ``log(1) == 0``).
    """
    # A word is any run of Unicode letters, so letters outside the Swedish
    # alphabet (e.g. "è", "ø") no longer cut a word in two.
    word_pattern = re.compile(r'[^\W\d_]+')
    # A run of terminators ends a sentence unless it sits between two digits
    # ("3.5", "14.30"), which is a number rather than a sentence boundary.
    sentence_end = re.compile(r'(?<!\d)[.!?]+|[.!?]+(?!\d)')

    words = word_pattern.findall(text.lower())
    # Only fragments that contain at least one word count as sentences;
    # otherwise "1. Solen skiner." would be scored as two sentences.
    sentences = [
        fragment
        for fragment in sentence_end.split(text)
        if word_pattern.search(fragment)
    ]

    if not words or not sentences:
        return None

    num_words = len(words)
    num_sentences = len(sentences)
    num_long_words = sum(1 for w in words if len(w) > 6)
    num_unique = len(set(words))
    num_chars = sum(len(w) for w in words)

    avg_sentence_length = num_words / num_sentences
    long_word_pct = num_long_words * 100 / num_words

    # LIX
    lix = avg_sentence_length + long_word_pct

    # OVIX (0 doubles as the "not computable" marker, see docstring)
    ovix = 0
    if num_words >= 10 and num_unique > 1:
        log_tokens = math.log(num_words)
        log_types = math.log(num_unique)
        denom = 2 - (log_types / log_tokens)
        if denom > 1:
            ovix = log_tokens / math.log(denom)

    # Category: first upper bound that the score stays below.
    # NOTE(review): commonly cited LIX scales put the first cut-off at 30
    # rather than 25 -- confirm these thresholds are intentional.
    categories = (
        (25, "Mycket lätt / Very Easy"),
        (30, "Lätt / Easy"),
        (40, "Medel / Medium"),
        (50, "Svår / Difficult"),
        (60, "Mycket svår / Very Difficult"),
    )
    cat = next(
        (label for upper, label in categories if lix < upper),
        "Extremt svår / Extremely Difficult",
    )

    return {
        "LIX Score": round(lix, 1),
        "Category": cat,
        "OVIX (Lexical Variation)": round(ovix, 1),
        "Avg Sentence Length": round(avg_sentence_length, 1),
        "Avg Word Length": round(num_chars / num_words, 2),
        "Long Words (>6 chars)": f"{round(long_word_pct, 1)}%",
        "Total Words": num_words,
        "Total Sentences": num_sentences,
        "Unique Words": num_unique
    }

def analyze_text(text):
    """Build the Markdown readability report shown in the UI.

    Args:
        text: The text to analyze. ``None`` and blank strings are accepted.

    Returns:
        A Markdown string with the LIX headline, a table listing every metric
        returned by ``calculate_lix`` and a static LIX scale reference. When
        there is nothing to analyze, a short plain-text message is returned
        instead.
    """
    # A caller may pass None rather than "" (e.g. an empty input component),
    # so check for it before calling .strip().
    if text is None or not text.strip():
        return "Please enter some Swedish text to analyze."

    results = calculate_lix(text)
    if not results:
        return "Could not analyze text. Please enter valid Swedish text."

    # Collect the pieces and join once instead of growing a string with +=.
    parts = [
        "## 📊 Analysis Results\n\n",
        f"### LIX: {results['LIX Score']} ({results['Category']})\n\n",
        "| Metric | Value |\n|--------|-------|\n",
    ]
    parts.extend(f"| {k} | {v} |\n" for k, v in results.items())
    parts.extend([
        "\n### 📖 LIX Scale Reference\n",
        "| Score | Level | Example |\n|-------|-------|--------|\n",
        "| < 25 | Very Easy | Children's books |\n",
        "| 25-30 | Easy | Simple fiction |\n",
        "| 30-40 | Medium | Newspapers |\n",
        "| 40-50 | Difficult | Official documents |\n",
        "| 50-60 | Very Difficult | Academic texts |\n",
        "| > 60 | Extremely Difficult | Legal/technical |\n",
    ])

    return "".join(parts)

# Sample inputs offered in the demo UI. Each row is a one-element list
# holding the text for that example.
examples = [
    [sample]
    for sample in (
        "Solen skiner. Kalle går ut. Han ser en hund. Hunden är glad. De leker tillsammans i parken.",
        "Regeringen presenterade igår ett nytt förslag om klimatåtgärder. Enligt statsministern ska Sverige minska sina utsläpp med femtio procent till år 2030.",
        "Den epistemologiska problematiken kring vetenskaplig objektivitet har genomgått betydande transformationer under det senaste århundradet. Poststrukturalistiska perspektiv har ifrågasatt fundamentala antaganden om kunskapsproduktionens neutralitet.",
    )
]

# Input component: a multi-line text box for the text to analyze.
_text_input = gr.Textbox(
    label="Swedish Text",
    placeholder="Enter Swedish text here...",
    lines=8
)

# Output component: the Markdown report produced by analyze_text.
_report_output = gr.Markdown(label="Results")

# Markdown blurb shown under the title.
_DESCRIPTION = """
    Analyze the readability of Swedish text using established linguistic metrics.

    **Metrics:**
    - **LIX** (Läsbarhetsindex): Standard Swedish readability formula by Carl-Hugo Björnsson (1968)
    - **OVIX** (Ordvariationsindex): Lexical variation measure

    Part of [Uppsala NLP](https://huggingface.co/UppsalaNLP) research tools.
    """

# The Gradio app: one text input, one Markdown output, analyze_text in between.
demo = gr.Interface(
    fn=analyze_text,
    inputs=_text_input,
    outputs=_report_output,
    title="🇸🇪 Swedish Text Complexity Analyzer",
    description=_DESCRIPTION,
    examples=examples,
    theme=gr.themes.Soft()
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()