Spaces:

UppsalaNLP
/

swedish-text-complexity

Sleeping

App Files Files Community

swedish-text-complexity / app.py

birgermoell

Upload app.py with huggingface_hub

72b5426 verified 11 days ago

raw

history blame contribute delete

4.03 kB

	"""
	Swedish Text Complexity Analyzer
	Uppsala NLP - Hugging Face Space
	"""

	import gradio as gr
	import math
	import re

	def calculate_lix(text):
	    """Calculate LIX and other Swedish readability metrics.

	    LIX is the average sentence length (words per sentence) plus the
	    percentage of long words (more than 6 letters). OVIX is a lexical
	    variation index and is only computed for texts of at least 10 words.

	    Args:
	        text: The text to analyse.

	    Returns:
	        A dict with the keys "LIX Score", "Category",
	        "OVIX (Lexical Variation)", "Avg Sentence Length",
	        "Avg Word Length", "Long Words (>6 chars)", "Total Words",
	        "Total Sentences" and "Unique Words", or None when the text
	        contains no words.
	    """
	    # Match runs of *any* Unicode letter (word chars minus digits and "_").
	    # A hard-coded alphabet would split names such as "Møller" in two and
	    # inflate the word count.
	    word_re = re.compile(r'[^\W\d_]+')
	    words = word_re.findall(text.lower())

	    # Only fragments that actually contain a word count as sentences, so
	    # stray numbers ("123.") or the tail of an ellipsis do not skew the
	    # average sentence length.
	    sentences = [
	        fragment
	        for fragment in re.split(r'[.!?]+', text)
	        if word_re.search(fragment)
	    ]

	    if not words or not sentences:
	        return None

	    num_words = len(words)
	    num_sentences = len(sentences)
	    num_long_words = sum(1 for w in words if len(w) > 6)
	    num_unique = len(set(words))
	    num_chars = sum(len(w) for w in words)

	    avg_sentence_length = num_words / num_sentences
	    long_word_pct = num_long_words * 100 / num_words

	    # LIX = words per sentence + percentage of long words.
	    lix = avg_sentence_length + long_word_pct

	    # OVIX = log(tokens) / log(2 - log(types) / log(tokens)).
	    # When every token is unique the inner term is exactly 1 and log(1) is
	    # 0, so the index is reported as 0 instead of dividing by zero.
	    ovix = 0
	    if num_words >= 10 and num_unique > 1:
	        log_tokens = math.log(num_words)
	        log_types = math.log(num_unique)
	        denom = 2 - (log_types / log_tokens)
	        if denom > 1:
	            ovix = log_tokens / math.log(denom)

	    # Upper (exclusive) LIX bound of each difficulty band, in rising order.
	    bands = (
	        (25, "Mycket lätt / Very Easy"),
	        (30, "Lätt / Easy"),
	        (40, "Medel / Medium"),
	        (50, "Svår / Difficult"),
	        (60, "Mycket svår / Very Difficult"),
	    )
	    cat = next(
	        (label for limit, label in bands if lix < limit),
	        "Extremt svår / Extremely Difficult",
	    )

	    return {
	        "LIX Score": round(lix, 1),
	        "Category": cat,
	        "OVIX (Lexical Variation)": round(ovix, 1),
	        "Avg Sentence Length": round(avg_sentence_length, 1),
	        "Avg Word Length": round(num_chars / num_words, 2),
	        "Long Words (>6 chars)": f"{round(long_word_pct, 1)}%",
	        "Total Words": num_words,
	        "Total Sentences": num_sentences,
	        "Unique Words": num_unique,
	    }

	def analyze_text(text):
	    """Build a Markdown readability report for the given text.

	    Args:
	        text: Text entered by the user; may be empty or None.

	    Returns:
	        A Markdown string holding the LIX headline, a table of every
	        metric returned by calculate_lix, and a static LIX scale
	        reference table. A short plain-text message is returned instead
	        when the input is empty or calculate_lix yields no result.
	    """
	    # Guard against None as well as blank input before calling .strip().
	    if not text or not text.strip():
	        return "Please enter some Swedish text to analyze."

	    results = calculate_lix(text)
	    if not results:
	        return "Could not analyze text. Please enter valid Swedish text."

	    # Table cells are delimited by plain "|". A backslash-escaped pipe is a
	    # literal character in Markdown and would stop the table rendering.
	    metric_rows = "".join(f"| {k} | {v} |\n" for k, v in results.items())

	    return (
	        "## 📊 Analysis Results\n\n"
	        f"### LIX: {results['LIX Score']} ({results['Category']})\n\n"
	        "| Metric | Value |\n|--------|-------|\n"
	        + metric_rows
	        + "\n### 📖 LIX Scale Reference\n"
	        "| Score | Level | Example |\n|-------|-------|--------|\n"
	        "| < 25 | Very Easy | Children's books |\n"
	        "| 25-30 | Easy | Simple fiction |\n"
	        "| 30-40 | Medium | Newspapers |\n"
	        "| 40-50 | Difficult | Official documents |\n"
	        "| 50-60 | Very Difficult | Academic texts |\n"
	        "| > 60 | Extremely Difficult | Legal/technical |\n"
	    )

	# Sample inputs offered in the UI, ordered from simple to advanced prose.
	# Each sample is wrapped in its own one-element row.
	examples = [
	    [sample]
	    for sample in (
	        "Solen skiner. Kalle går ut. Han ser en hund. Hunden är glad. De leker tillsammans i parken.",
	        "Regeringen presenterade igår ett nytt förslag om klimatåtgärder. Enligt statsministern ska Sverige minska sina utsläpp med femtio procent till år 2030.",
	        "Den epistemologiska problematiken kring vetenskaplig objektivitet har genomgått betydande transformationer under det senaste århundradet. Poststrukturalistiska perspektiv har ifrågasatt fundamentala antaganden om kunskapsproduktionens neutralitet.",
	    )
	]

	# Input widget: a multi-line text box for the text to analyse.
	_text_input = gr.Textbox(
	    label="Swedish Text",
	    placeholder="Enter Swedish text here...",
	    lines=8,
	)

	# Output widget: the report from analyze_text is shown as Markdown.
	_results_view = gr.Markdown(label="Results")

	# Blurb displayed under the title of the interface.
	_description = """
	Analyze the readability of Swedish text using established linguistic metrics.

	Metrics:
	- LIX (Läsbarhetsindex): Standard Swedish readability formula by Carl-Hugo Björnsson (1968)
	- OVIX (Ordvariationsindex): Lexical variation measure

	Part of [Uppsala NLP](https://huggingface.co/UppsalaNLP) research tools.
	"""

	# Wire the analyser function, widgets and sample texts into one interface.
	demo = gr.Interface(
	    fn=analyze_text,
	    inputs=_text_input,
	    outputs=_results_view,
	    title="🇸🇪 Swedish Text Complexity Analyzer",
	    description=_description,
	    examples=examples,
	    theme=gr.themes.Soft(),
	)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()