"""
Swedish Text Complexity Analyzer
Uppsala NLP - Hugging Face Space

Computes LIX (readability) and OVIX (lexical variation) for a text and
presents the result as a Markdown report in a Gradio interface.
"""

import math
import re
import unicodedata
from typing import Optional

import gradio as gr

# A word is a maximal run of Unicode letters (word characters minus digits and
# underscore), so loanword letters such as à, è, ø or ñ do not split a word.
_WORD_RE = re.compile(r"[^\W\d_]+")

# One or more consecutive sentence terminators act as a single boundary.
_SENTENCE_END_RE = re.compile(r"[.!?]+")

# LIX treats words with more than this many letters as "long".
_LONG_WORD_THRESHOLD = 6

# OVIX is only computed for texts with at least this many words.
_OVIX_MIN_WORDS = 10

# (exclusive upper LIX bound, label); scores at or above the last bound fall
# into _LIX_TOP_CATEGORY.
_LIX_CATEGORIES = (
    (25, "Mycket lätt / Very Easy"),
    (30, "Lätt / Easy"),
    (40, "Medel / Medium"),
    (50, "Svår / Difficult"),
    (60, "Mycket svår / Very Difficult"),
)
_LIX_TOP_CATEGORY = "Extremt svår / Extremely Difficult"

# Static reference table appended to every report.
_LIX_SCALE_REFERENCE = (
    "### 📖 LIX Scale Reference",
    "| Score | Level | Example |",
    "|-------|-------|--------|",
    "| < 25 | Very Easy | Children's books |",
    "| 25-30 | Easy | Simple fiction |",
    "| 30-40 | Medium | Newspapers |",
    "| 40-50 | Difficult | Official documents |",
    "| 50-60 | Very Difficult | Academic texts |",
    "| > 60 | Extremely Difficult | Legal/technical |",
)

# Kept flush-left so that no line can be rendered as an indented Markdown
# code block.
_DESCRIPTION = """\
Analyze the readability of Swedish text using established linguistic metrics.

**Metrics:**
- **LIX** (Läsbarhetsindex): Standard Swedish readability formula by Carl-Hugo Björnsson (1968)
- **OVIX** (Ordvariationsindex): Lexical variation measure

Part of [Uppsala NLP](https://huggingface.co/UppsalaNLP) research tools.
"""


def _lix_category(lix: float) -> str:
    """Return the bilingual difficulty label for a LIX score."""
    for upper_bound, label in _LIX_CATEGORIES:
        if lix < upper_bound:
            return label
    return _LIX_TOP_CATEGORY


def _ovix(num_words: int, num_unique: int) -> float:
    """Return OVIX = ln(tokens) / ln(2 - ln(types) / ln(tokens)).

    Returns 0 when the index cannot be computed: fewer than
    ``_OVIX_MIN_WORDS`` words, at most one unique word, or every word unique
    (the denominator would then be ln(1) = 0).
    """
    if num_words < _OVIX_MIN_WORDS or num_unique <= 1:
        return 0
    log_tokens = math.log(num_words)
    denom = 2 - math.log(num_unique) / log_tokens
    return log_tokens / math.log(denom) if denom > 1 else 0


def calculate_lix(text: str) -> Optional[dict]:
    """Calculate LIX and other Swedish readability metrics.

    LIX = (words / sentences) + (100 * long words / words), where a long word
    has more than six letters.

    Args:
        text: The text to analyze.

    Returns:
        A dict mapping display names to metric values (insertion order is the
        display order), or ``None`` if the text contains no words.
    """
    # NFC-normalize so that decomposed å/ä/ö (base letter + combining mark)
    # stay inside their word instead of splitting it.
    normalized = unicodedata.normalize("NFC", text).lower()

    words = _WORD_RE.findall(normalized)
    if not words:
        return None

    # Only fragments that contain at least one word count as sentences;
    # otherwise the "50" in "3.50" or a stray "..." would inflate the sentence
    # count and deflate LIX. Since `words` is non-empty, at least one fragment
    # qualifies, so the divisions below are safe.
    num_sentences = sum(
        1
        for fragment in _SENTENCE_END_RE.split(normalized)
        if _WORD_RE.search(fragment)
    )

    num_words = len(words)
    num_long_words = sum(1 for w in words if len(w) > _LONG_WORD_THRESHOLD)
    num_unique = len(set(words))
    num_chars = sum(len(w) for w in words)

    avg_sentence_length = num_words / num_sentences
    long_word_pct = num_long_words * 100 / num_words
    lix = avg_sentence_length + long_word_pct

    return {
        "LIX Score": round(lix, 1),
        "Category": _lix_category(lix),
        "OVIX (Lexical Variation)": round(_ovix(num_words, num_unique), 1),
        "Avg Sentence Length": round(avg_sentence_length, 1),
        "Avg Word Length": round(num_chars / num_words, 2),
        "Long Words (>6 chars)": f"{round(long_word_pct, 1)}%",
        "Total Words": num_words,
        "Total Sentences": num_sentences,
        "Unique Words": num_unique,
    }


def analyze_text(text: Optional[str]) -> str:
    """Build the Markdown report shown in the interface for ``text``.

    Args:
        text: Raw user input; ``None`` is treated like an empty string.

    Returns:
        A Markdown string: a prompt or error message when there is nothing to
        analyze, otherwise the metrics table followed by the LIX scale
        reference.
    """
    if text is None or not text.strip():
        return "Please enter some Swedish text to analyze."

    results = calculate_lix(text)
    if not results:
        return "Could not analyze text. Please enter valid Swedish text."

    lines = [
        "## 📊 Analysis Results",
        "",
        f"### LIX: {results['LIX Score']} ({results['Category']})",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        *(f"| {name} | {value} |" for name, value in results.items()),
        "",
        *_LIX_SCALE_REFERENCE,
    ]
    return "\n".join(lines) + "\n"


# One example per difficulty band: easy, medium, and academic prose.
examples = [
    ["Solen skiner. Kalle går ut. Han ser en hund. Hunden är glad. De leker tillsammans i parken."],
    ["Regeringen presenterade igår ett nytt förslag om klimatåtgärder. Enligt statsministern ska Sverige minska sina utsläpp med femtio procent till år 2030."],
    ["Den epistemologiska problematiken kring vetenskaplig objektivitet har genomgått betydande transformationer under det senaste århundradet. Poststrukturalistiska perspektiv har ifrågasatt fundamentala antaganden om kunskapsproduktionens neutralitet."],
]

demo = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(
        label="Swedish Text",
        placeholder="Enter Swedish text here...",
        lines=8,
    ),
    outputs=gr.Markdown(label="Results"),
    title="🇸🇪 Swedish Text Complexity Analyzer",
    description=_DESCRIPTION,
    examples=examples,
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    demo.launch()