birgermoell committed on
Commit
72b5426
·
verified ·
1 Parent(s): 5e69d4b

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Swedish Text Complexity Analyzer
3
+ Uppsala NLP - Hugging Face Space
4
+ """
5
+
6
+ import gradio as gr
7
+ import math
8
+ import re
9
+
10
def calculate_lix(text):
    """Calculate LIX and other readability metrics for a piece of text.

    LIX is the average number of words per sentence plus the percentage of
    words longer than six characters. OVIX is a lexical variation measure
    based on the logarithms of the token and type counts.

    Args:
        text: The text to analyze.

    Returns:
        A dict mapping display labels to values (LIX score, category label,
        OVIX, average sentence length, average word length, share of long
        words, and the word / sentence / unique-word counts), or ``None``
        when the text contains no words.
    """
    # Function-scope import so this block does not rely on a module-level
    # import being added elsewhere.
    import unicodedata

    # Compose decomposed characters (e.g. "a" + combining ring -> "å") so a
    # letter typed or pasted in NFD form is not split into two tokens.
    normalized = unicodedata.normalize("NFC", text)

    # A word is a run of Unicode word characters excluding decimal digits and
    # underscore. A fixed list of Swedish letters would break words at any
    # other letter (e.g. "crème" -> "cr", "me") and inflate the word count.
    words = re.findall(r"[^\W\d_]+", normalized.lower())

    # Split on runs of sentence-ending punctuation and keep only fragments
    # that contain at least one word character of the kind counted above, so
    # that "2030." or a stray "..." does not count as a sentence of its own.
    fragments = re.split(r"[.!?]+", normalized)
    sentences = [frag for frag in fragments if re.search(r"[^\W\d_]", frag)]

    if not words or not sentences:
        return None

    num_words = len(words)
    num_sentences = len(sentences)
    num_long_words = sum(1 for w in words if len(w) > 6)
    num_unique = len(set(words))
    num_chars = sum(len(w) for w in words)

    avg_sentence_len = num_words / num_sentences
    long_word_pct = num_long_words * 100 / num_words

    # LIX = words per sentence + percentage of long words.
    lix = avg_sentence_len + long_word_pct

    # OVIX = ln(tokens) / ln(2 - ln(types) / ln(tokens)).
    # It is only computed for at least 10 tokens and more than one type.
    # When every token is unique the inner term equals 1 and ln(1) == 0, so
    # the quotient is undefined; 0 is reported as a "not computed" sentinel.
    ovix = 0
    if num_words >= 10 and num_unique > 1:
        log_tokens = math.log(num_words)
        log_types = math.log(num_unique)
        denom = 2 - (log_types / log_tokens)
        if denom > 1:
            ovix = log_tokens / math.log(denom)

    # Exclusive upper bounds for each difficulty band, in ascending order.
    bands = (
        (25, "Mycket lätt / Very Easy"),
        (30, "Lätt / Easy"),
        (40, "Medel / Medium"),
        (50, "Svår / Difficult"),
        (60, "Mycket svår / Very Difficult"),
    )
    cat = "Extremt svår / Extremely Difficult"
    for upper_bound, label in bands:
        if lix < upper_bound:
            cat = label
            break

    # Key order matters: callers render the dict row by row.
    return {
        "LIX Score": round(lix, 1),
        "Category": cat,
        "OVIX (Lexical Variation)": round(ovix, 1),
        "Avg Sentence Length": round(avg_sentence_len, 1),
        "Avg Word Length": round(num_chars / num_words, 2),
        "Long Words (>6 chars)": f"{round(long_word_pct, 1)}%",
        "Total Words": num_words,
        "Total Sentences": num_sentences,
        "Unique Words": num_unique,
    }
57
+
58
def analyze_text(text):
    """Build a Markdown report with readability metrics for ``text``.

    Args:
        text: The text to analyze; ``None`` is treated like blank input.

    Returns:
        A Markdown string with a LIX headline, a table of every metric
        returned by ``calculate_lix`` and a static LIX scale reference, or a
        short plain-text message when the input is blank or cannot be
        analyzed.
    """
    # Guard against None as well as blank input so a missing value yields the
    # prompt message instead of an AttributeError on ``.strip()``.
    if text is None or not text.strip():
        return "Please enter some Swedish text to analyze."

    results = calculate_lix(text)
    if not results:
        return "Could not analyze text. Please enter valid Swedish text."

    # Collect the pieces and join once rather than growing a string with +=.
    parts = [
        "## 📊 Analysis Results\n\n",
        f"### LIX: {results['LIX Score']} ({results['Category']})\n\n",
        "| Metric | Value |\n|--------|-------|\n",
    ]
    parts.extend(f"| {k} | {v} |\n" for k, v in results.items())

    parts.extend([
        "\n### 📖 LIX Scale Reference\n",
        "| Score | Level | Example |\n|-------|-------|--------|\n",
        "| < 25 | Very Easy | Children's books |\n",
        "| 25-30 | Easy | Simple fiction |\n",
        "| 30-40 | Medium | Newspapers |\n",
        "| 40-50 | Difficult | Official documents |\n",
        "| 50-60 | Very Difficult | Academic texts |\n",
        "| > 60 | Extremely Difficult | Legal/technical |\n",
    ])

    return "".join(parts)
82
+
83
# Sample inputs offered in the UI: three Swedish texts, one per row.
examples = [
    ["Solen skiner. Kalle går ut. Han ser en hund. Hunden är glad. De leker tillsammans i parken."],
    ["Regeringen presenterade igår ett nytt förslag om klimatåtgärder. Enligt statsministern ska Sverige minska sina utsläpp med femtio procent till år 2030."],
    ["Den epistemologiska problematiken kring vetenskaplig objektivitet har genomgått betydande transformationer under det senaste århundradet. Poststrukturalistiska perspektiv har ifrågasatt fundamentala antaganden om kunskapsproduktionens neutralitet."],
]

# Input and output components, built in the same order as before.
_text_input = gr.Textbox(
    label="Swedish Text",
    placeholder="Enter Swedish text here...",
    lines=8,
)
_results_view = gr.Markdown(label="Results")

# Single-function interface: text box in, Markdown report out.
demo = gr.Interface(
    fn=analyze_text,
    inputs=_text_input,
    outputs=_results_view,
    title="🇸🇪 Swedish Text Complexity Analyzer",
    description="""
    Analyze the readability of Swedish text using established linguistic metrics.

    **Metrics:**
    - **LIX** (Läsbarhetsindex): Standard Swedish readability formula by Carl-Hugo Björnsson (1968)
    - **OVIX** (Ordvariationsindex): Lexical variation measure

    Part of [Uppsala NLP](https://huggingface.co/UppsalaNLP) research tools.
    """,
    examples=examples,
    theme=gr.themes.Soft(),
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()