Spaces:
Sleeping
Sleeping
File size: 16,396 Bytes
2c29c78 5b44626 2c29c78 d11d70d 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 d11d70d 5b44626 7037bc6 5b44626 d11d70d 08c1d85 5b44626 08c1d85 7037bc6 2c29c78 5b44626 2c29c78 5b44626 2c29c78 3928738 2c29c78 d11d70d 5b44626 d11d70d 08c1d85 5b44626 08c1d85 5b44626 08c1d85 5b44626 08c1d85 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 39f1c97 5b44626 39f1c97 5b44626 2c29c78 39f1c97 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 39f1c97 5b44626 39f1c97 5b44626 39f1c97 5b44626 2c29c78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 |
#!/usr/bin/env python
# coding: utf-8
import spacy

# Module-level side effect: loads the small English spaCy pipeline once at
# import time (used for dependency parsing in sudify/send).
nlp = spacy.load("en_core_web_sm")
# Mapping from the four coarse SUD-style relation groups used by this app to
# the fine-grained spaCy/UD dependency labels that are collapsed into them
# (see sudify): "subj" = subjects, "comp" = complements (arguments and
# function words after arc inversion), "mod" = modifiers, "udep" =
# underspecified/other dependents.
relations = {
"subj": ["nsubj", "nsubjpass", "csubj", "csubjpass", "expl"],
"comp": [
"dobj",
"dative",
"attr",
"oprd",
"pobj",
"aux",
"auxpass",
"mark",
"case",
"ccomp",
"xcomp",
"acomp",
],
"mod": ["agent", "advmod", "advcl", "relcl", "npmod", "npadvmod", "prt"],
"udep": ["acl", "amod", "nmod", "poss", "nummod", "prep"],
}
def sudify(doc):
    """Rewrite a spaCy dependency parse in place so it approximates SUD
    (Surface Syntactic Universal Dependencies).

    Mutates ``token.head`` and ``token.dep_`` on every token and returns the
    same ``doc``.  Main passes:

    1. Invert function-word arcs (aux/auxpass/mark/case and subordinating
       advmod) so the function word becomes the head, as SUD prescribes.
    2. Patch individual labels (prep as mod vs comp, late ccomp as mod,
       chained dobj as appos).
    3. Collapse fine-grained labels into the coarse groups in ``relations``.
    4. Clean-up: keep one subject per head, demote pre-subject complements,
       re-root arcs crossing ';'/':' and demote post-comma conjuncts and
       noun-headed verb dependents.
    """
    for token in doc:
        # This token plus any of its function-word children whose arc must
        # be inverted.
        to_reverse = [token]
        for child in token.children:
            if (child.dep_ in ["aux", "auxpass", "mark", "case"]) or (
                child.dep_ == "advmod" and child.pos_ == "SCONJ"
            ):
                to_reverse.append(child)
        # Invert the nearest function words first.
        to_reverse.sort(key=lambda x: abs(x.i - token.i))
        if len(to_reverse) > 1:
            for i in range(1, len(to_reverse)):
                if to_reverse[i].dep_ in ["aux", "auxpass"]:
                    # The auxiliary inherits the subject/modifier/conjunct
                    # dependents of the word it is about to govern.
                    for child in to_reverse[i - 1].children:
                        if child.dep_ in relations["subj"] + relations["mod"] + [
                            "conj",
                            "cc",
                        ]:
                            child.head = to_reverse[i]
                # Swap head and dependent: the function word takes over the
                # previous head's attachment point and label, and the
                # previous head becomes its complement.
                to_reverse[i].head = (
                    to_reverse[i - 1].head
                    if to_reverse[i - 1].head != to_reverse[i - 1]
                    else to_reverse[i]
                )
                to_reverse[i].dep_ = to_reverse[i - 1].dep_
                to_reverse[i - 1].head = to_reverse[i]
                to_reverse[i - 1].dep_ = "comp"
    for token in doc:
        if token.dep_ == "dep":
            token.dep_ = "unknown"
        # A preposition left of its verb acts as a modifier ...
        if (
            token.dep_ == "prep"
            and token.head.pos_ in ["VERB", "AUX"]
            and token.i < token.head.i
            and token.head.dep_ not in relations["mod"]
        ):
            token.dep_ = "mod"
        # ... while one that is the verb's first right dependent (or second,
        # immediately after a direct object) acts as a complement.
        if (
            token.dep_ == "prep"
            and token.head.pos_ in ["VERB", "AUX"]
            and (
                (
                    len(list(token.head.rights)) >= 1
                    and token == list(token.head.rights)[0]
                )
                or (
                    len(list(token.head.rights)) >= 2
                    and list(token.head.rights)[0].dep_ == "dobj"
                    and token == list(token.head.rights)[1]
                )
            )
        ):
            token.dep_ = "comp"
        # A clausal complement preceded by another complement sibling is
        # demoted to a modifier.
        if token.dep_ == "ccomp" and any(
            sibling.dep_ in relations["comp"]
            for sibling in token.head.rights
            if sibling.i < token.i
        ):
            token.dep_ = "mod"
        # Chain multiple direct objects: each extra dobj becomes an
        # apposition of the previous one.
        dobjs = [child for child in token.children if child.dep_ == "dobj"]
        if len(dobjs) > 1:
            for i in range(1, len(dobjs)):
                dobjs[i].head = dobjs[i - 1]
                dobjs[i].dep_ = "appos"
    # Collapse fine-grained labels into the coarse groups.
    for token in doc:
        for rel in relations.keys():
            if token.dep_ in relations[rel]:
                token.dep_ = rel
    # Keep only the subject closest to its head; demote the rest.
    for token in doc:
        subjects = sorted(
            [child for child in token.children if child.dep_ == "subj"],
            key=lambda x: abs(x.i - token.i),
        )
        if len(subjects) > 1:
            for s in subjects[1:]:
                s.dep_ = "comp"
    # Complements/udeps that occur before the subject become modifiers.
    for token in doc:
        subject = [child for child in token.children if child.dep_ == "subj"]
        if subject:
            for child in [c for c in token.children if c.i < subject[0].i]:
                if child.dep_ in ["comp", "udep"] and token.dep_ != "mod":
                    child.dep_ = "mod"
    for token in doc:
        # An arc that crosses a ';' or ':' (unless that punctuation is
        # enclosed in parentheses between the two ends) is cut: the
        # dependent becomes its own root.
        if any(
            t.text in [";", ":"]
            for t in doc
            if (
                (
                    token.i < t.i < token.head.i
                    and not (
                        any(p.text == "(" for p in doc if token.i < p.i < t.i)
                        and any(p.text == ")" for p in doc if t.i < p.i < token.head.i)
                    )
                )
                or (
                    token.head.i < t.i < token.i
                    and not (
                        any(p.text == "(" for p in doc if token.head.i < p.i < t.i)
                        and any(p.text == ")" for p in doc if t.i < p.i < token.i)
                    )
                )
            )
            and token.pos_ != "PUNCT"
        ):
            token.head = token
            token.dep_ = "root"
        if token.pos_ in ["VERB", "AUX"]:
            # Rightmost non-punctuation word covered by the verb's core
            # (subj/comp/udep) children; conjuncts attached beyond a comma
            # at that edge are demoted to modifiers.
            core_children = [
                child
                for child in token.children
                if child.dep_ in ["subj", "comp", "udep"]
            ]
            core_children.append(token)
            core_children.sort(key=lambda x: x.i)
            right_edge = [t for t in core_children[-1].subtree if t.pos_ != "PUNCT"][-1]
            if right_edge.i < len(doc) - 1:
                if right_edge.text == "," or doc[right_edge.i + 1].text == ",":
                    for child in [
                        child
                        for child in token.children
                        if child.i > right_edge.i and child.dep_ == "conj"
                    ]:
                        child.dep_ = "mod"
        # A verb hanging off a noun with an underspecified label modifies it.
        if (
            token.pos_ in ["VERB", "AUX"]
            and token.head.pos_ == "NOUN"
            and token.dep_ == "udep"
        ):
            token.dep_ = "mod"
    return doc
def flyover(token):
    """Return the stretch of text a dependency arc "flies over".

    For a token whose relation is ``subj`` or ``comp``, returns a tuple
    ``(span, dep_distance)`` where ``span`` is the text strictly between the
    token and its head, and ``dep_distance`` counts the tokens in that
    interval that govern at least one dependent (the head-count notion of
    dependency length).  Any other token yields an empty span with
    distance 0.
    """
    if token.dep_ in ["subj", "comp"]:
        lo, hi = sorted((token.i, token.head.i))
        # Only intervening tokens that themselves have dependents count
        # towards the distance.
        dep_distance = sum(
            1 for t in token.doc[lo + 1 : hi] if len(list(t.children)) > 0
        )
        if token.head.i < token.i:
            return (token.doc[token.head.i + 1 : token.i], dep_distance)
        elif token.head.i > token.i:
            return (token.doc[token.i + 1 : token.head.i], dep_distance)
    else:
        return (token.doc[token.i : token.i], 0)
def get_fluff(doc):
    """Segment *doc* into ``(span, weight)`` pieces for highlighting.

    ``flyover`` yields, for every subj/comp arc, the words between the
    dependent and its head plus the number of intervening heads.  This
    function keeps only the non-dominated flyovers (not overlapped by a
    longer or heavier one) with a positive head count, then fills the gaps
    before, between and after them with zero-weight "interstice" spans so
    the returned segments cover the whole paragraph in order.

    Returns a list of ``(span, weight)`` tuples sorted by start position;
    weight 0 means "do not highlight".
    """
    # An empty paragraph (e.g. consecutive blank lines in the submitted
    # text) has nothing to annotate; without this guard the fallback below
    # would evaluate doc[-1]/doc[0] on an empty doc and raise IndexError.
    if len(doc) == 0:
        return []
    flyovers = list(map(flyover, doc))
    flyovers = [f for f in flyovers if len(f[0]) > 0]
    # Drop any flyover that overlaps a strictly "bigger" one (longer span or
    # larger head count), and any with zero head-distance.
    flyovers = [
        f1
        for f1 in flyovers
        if len(
            [
                f2
                for f2 in flyovers
                if (
                    f2[0][-1].i > f1[0][0].i >= f2[0][0].i
                    or f2[0][0].i < f1[0][-1].i <= f2[0][-1].i
                )
                and (len(f1[0]) < len(f2[0]) or f1[1] < f2[1])
            ]
        )
        == 0
        and f1[1] > 0
    ]
    flyovers = sorted(flyovers, key=lambda x: x[0][0].i)
    # Unhighlighted gaps before the first flyover and between flyovers.
    interstices = []
    for i in range(len(flyovers)):
        if i == 0:
            if flyovers[0][0][0].i > 0:
                interstices.append((doc[0 : flyovers[0][0][0].i], 0))
        else:
            if flyovers[i][0][0].i > flyovers[i - 1][0][-1].i + 1:
                interstices.append(
                    (doc[flyovers[i - 1][0][-1].i + 1 : flyovers[i][0][0].i], 0)
                )
    # Trailing gap — or the whole paragraph when nothing is highlighted.
    if len(flyovers) > 0:
        if flyovers[-1][0][-1].i < doc[-1].i:
            interstices.append((doc[flyovers[-1][0][-1].i + 1 :], 0))
    else:
        interstices.append((doc, 0))
    flyovers = [f for f in flyovers if len(f[0]) > 0]
    return sorted(flyovers + interstices, key=lambda x: x[0][0].i)
# NOTE(review): these imports sit mid-file rather than at the top; kept in
# place so the module-level statement order is unchanged.
from fasthtml_hf import setup_hf_backup
from fasthtml.common import *
import re

# FastHTML application with PicoCSS styling enabled.
app, rt = fast_app(pico=True)
@app.get
def index():
    """Main page: an input textarea on the left and a scroll-synced,
    highlighted rendering of the text on the right (filled in by `send`)."""
    page = Div(
        Form(
            hx_post=send,
            hx_target="#output",
            hx_swap="outerHTML show:none",
        )(
            Div(
                Span(
                    Button("Check"),
                    A("How this works", href="/about"),
                    style="margin-bottom: 1rem; display: flex; gap: 1rem; align-items: center",
                ),
                Textarea(
                    name="text",
                    id="input-text",
                    style="height: calc(100vh - 11rem);",
                    # Keep the output pane scroll-locked to the textarea.
                    onscroll="document.getElementById('output').scrollTop = this.scrollTop + 1; document.getElementById('output').scrollLeft = this.scrollLeft;",
                ),
            )
        ),
        Div(
            Div(
                Em(
                    "Highlighted text segments can be shortened or reordered to improve readability."
                ),
                cls="overflow-auto",
                style="height: 4rem; text-wrap: balance; padding: 0rem 1rem",
            ),
            # Placeholder that `send` replaces via htmx.
            Div(id="output", style="padding: 1rem calc(1rem - 5px)"),
        ),
        cls="grid",
    )
    return Titled("Readability feedback", page)
@app.get
def about():
    """Static "How this works" page: explains the dependency-length idea
    behind the highlighting and links the supporting literature."""
    content = Div(
        H2("How this works"),
        P(
            "One of the keys to writing clearly is to ",
            Em("keep related words close together"),
            ". Don't, if you want to be understood, insert any long asides! (See what I did there?) This tool helps you visually identify places in your writing where two related words are interrupted by an aside, which you can then either shorten or move to a different position in the sentence.",
        ),
        P(
            "But how do we identify words in a sentence that are related to each other? We can do this using a technique from natural language processing called ",
            Em("dependency parsing"),
            ". For example, we can take a sentence like ",
            Var("The manager approved the proposal although she had doubts"),
            ", and produce a diagram like the following:",
        ),
        # Pre-rendered SVG parse diagrams are read from disk and inlined.
        Div(
            NotStr(
                open("sample_parse.svg", "r").read(),
            ),
            style="margin-top: 1rem; margin-bottom: 1rem; width: 100%; overflow-x: auto",
        ),
        P(
            "This tells us, for example, that ",
            Var("manager"),
            " (or ",
            Var("the manager"),
            ") is the subject of ",
            Var("approved"),
            ' (since she is the "main character" of the event of approving); that ',
            Var("proposal"),
            " (or ",
            Var("the proposal"),
            ") is a complement of ",
            Var("approved"),
            " (since you cannot imagine an act of approving without imagining the thing that is being approved—in this case, the proposal); and that ",
            Var("although"),
            " (or ",
            Var("although she had doubts"),
            ") is a modifier of ",
            Var("approved"),
            " (since it gives us the context of the manager's approval). Naturally, subjects and complements are more closely related to the verb than modifiers are, and so we ignore modifiers when identifying related words that should be kept close together. We can see that in this sentence, the subject and the complement are right next to the verb, and so the sentence is easy to read.",
        ),
        P("Now let us see what happens when we reorder the sentence:"),
        Div(
            NotStr(
                open("sample_parse_2.svg", "r").read(),
            ),
            style="margin-top: 1rem; margin-bottom: 1rem; width: 100%; overflow-x: auto",
        ),
        P(
            "Here, we see that the modifier ",
            Var("although she had doubts"),
            " now interrupts the subject relation between ",
            Var("the manager"),
            " and ",
            Var("approved"),
            ". And indeed, you can see that this sentence is harder to read than the first one.",
        ),
        H2("Technical details"),
        P(
            "The inspiration for this tool is the idea of ",
            Em("Dependency Length Minimisation"),
            " (DLM) in psycholinguistics, which posits that human languages tend to minimise the distance between syntactically related words to reduce cognitive load during sentence processing. For more information on DLM, see ",
            A(
                "Futrell et al. (2015)",
                href="https://pmc.ncbi.nlm.nih.gov/articles/PMC4547262/",
            ),
            ". For evidence that dependency length predicts reading times in English, see e.g. ",
            A(
                "Bartek et al. (2011)",
                href="https://pubmed.ncbi.nlm.nih.gov/21707210/",
            ),
            ".",
        ),
        P(
            "I have used the ",
            A("spaCy", href="https://spacy.io/"),
            " library's ",
            Var("en_core_web_sm"),
            " model to perform dependency parsing, adjusting the results to bring them in line with the ",
            A(
                "Surface Syntactic Universal Dependencies (SUD)",
                href="https://surfacesyntacticud.org",
            ),
            " framework, which is more consistent with linguistic theories. For any dependency arc that exhibits the ",
            Var("subj"),
            " (subject) or ",
            Var("comp"),
            " (complement) relation, the words lying between the head and the dependent are highlighted with an opacity proportional to the number of heads (i.e. words with at least one dependent) in that interval. This follows the revised definition of dependency length proposed by ",
            A(
                "Yadav et al. (2022)",
                href="https://direct.mit.edu/opmi/article/doi/10.1162/opmi_a_00060/112598/A-Reappraisal-of-Dependency-Length-Minimization-as",
            ),
            ".",
        ),
        P(
            "The web app itself was built using the ",
            A("FastHTML", href="https://fasthtml.org/"),
            " framework, which I learned about in the ",
            A("Solve It With Code", href="https://solve.it.com/"),
            " course from ",
            A("Answer.AI", href="https://www.answer.ai/"),
            ". The entire development took two days (starting on December 26, 2025), with an extra day for handling various edge cases. This page was added on January 4, 2026.",
        ),
        A("Back to main page", href="/"),
        style="padding-bottom: 1rem; padding-top: 1rem; max-width: 800px; margin: auto",
    )
    return Titled("Readability feedback", content)
@app.post
def send(text: str):
    """Parse the submitted text and return the highlighted rendering.

    Splits *text* into paragraphs, runs each through the spaCy pipeline and
    the SUD adjustments (`sudify`), computes the highlightable segments
    (`get_fluff`), and returns a Div that replaces the ``#output`` pane.
    """
    # Collapse runs of horizontal whitespace; paragraphs are separated by a
    # CRLF blank line (browsers normalise textarea submissions to \r\n —
    # NOTE(review): plain "\n\n" input would not be split; confirm callers).
    paragraphs = re.sub(r"[^\S\r\n]+", " ", text).split("\r\n\r\n")
    docs = [sudify(nlp(para)) for para in paragraphs]
    annot_paras = [get_fluff(doc) for doc in docs]
    # Re-sync the output pane's scroll position once it has rendered.
    sync_script = Script(
        "setTimeout(() => { const textarea = document.getElementById('input-text'); const output = document.getElementById('output'); if (textarea && output) { output.scrollTop = textarea.scrollTop + 1; output.scrollLeft = textarea.scrollLeft; } }, 100);"
    )

    def _render_para(annot_para, margin_bottom):
        # One output paragraph: each (span, weight) pair becomes a segment
        # whose background opacity grows with the weight (weight/5, so it
        # saturates at 5 intervening heads).
        return P(
            *[
                Span(
                    Span(
                        a[0],
                        style=f"background: light-dark(rgba(237, 201, 241, {a[1]/5}), rgba(182, 69, 205, {a[1]/5}))",
                    ),
                    Span(" "),
                )
                for a in annot_para
            ],
            style=f"margin-bottom: {margin_bottom}",
        )

    return Div(
        sync_script,
        # All but the last paragraph carry bottom spacing.
        *[_render_para(p, "1.5em") for p in annot_paras[:-1]],
        _render_para(annot_paras[-1], "0em"),
        id="output",
        cls="overflow-auto",
        style="height: calc(100vh - 11rem); padding: 1rem; padding-bottom: calc(1rem - 5px);",
        # Keep the textarea scroll-locked to this pane.
        onscroll="document.getElementById('input-text').scrollTop = this.scrollTop - 1; document.getElementById('input-text').scrollLeft = this.scrollLeft;",
    )
# NOTE(review): presumably schedules periodic backups of app state to a
# Hugging Face dataset — confirm against the fasthtml_hf documentation.
setup_hf_backup(app)
# Start the FastHTML/uvicorn server.
serve()
|