Spaces:
Sleeping
Sleeping
File size: 16,396 Bytes
2c29c78 5b44626 2c29c78 d11d70d 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 d11d70d 5b44626 7037bc6 5b44626 d11d70d 08c1d85 5b44626 08c1d85 7037bc6 2c29c78 5b44626 2c29c78 5b44626 2c29c78 3928738 2c29c78 d11d70d 5b44626 d11d70d 08c1d85 5b44626 08c1d85 5b44626 08c1d85 5b44626 08c1d85 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 39f1c97 5b44626 39f1c97 5b44626 2c29c78 39f1c97 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 5b44626 2c29c78 39f1c97 5b44626 39f1c97 5b44626 39f1c97 5b44626 2c29c78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 |
#!/usr/bin/env python
# coding: utf-8
import spacy

# Module-level side effect: loads the small English spaCy pipeline once at
# import time (used for dependency parsing in sudify/send).
nlp = spacy.load("en_core_web_sm")
# Mapping from the four coarse SUD-style relation groups used by this app to
# the fine-grained spaCy/UD dependency labels that are collapsed into them
# (see sudify): "subj" = subjects, "comp" = complements (arguments and
# function words after arc inversion), "mod" = modifiers, "udep" =
# underspecified/other dependents.
relations = {
"subj": ["nsubj", "nsubjpass", "csubj", "csubjpass", "expl"],
"comp": [
"dobj",
"dative",
"attr",
"oprd",
"pobj",
"aux",
"auxpass",
"mark",
"case",
"ccomp",
"xcomp",
"acomp",
],
"mod": ["agent", "advmod", "advcl", "relcl", "npmod", "npadvmod", "prt"],
"udep": ["acl", "amod", "nmod", "poss", "nummod", "prep"],
}
def sudify(doc):
    """Rewrite a spaCy dependency parse in place so it approximates SUD
    (Surface Syntactic Universal Dependencies).

    Mutates ``token.head`` and ``token.dep_`` on every token and returns the
    same ``doc``.  Main passes:

    1. Invert function-word arcs (aux/auxpass/mark/case and subordinating
       advmod) so the function word becomes the head, as SUD prescribes.
    2. Patch individual labels (prep as mod vs comp, late ccomp as mod,
       chained dobj as appos).
    3. Collapse fine-grained labels into the coarse groups in ``relations``.
    4. Clean-up: keep one subject per head, demote pre-subject complements,
       re-root arcs crossing ';'/':' and demote post-comma conjuncts and
       noun-headed verb dependents.
    """
    for token in doc:
        # This token plus any of its function-word children whose arc must
        # be inverted.
        to_reverse = [token]
        for child in token.children:
            if (child.dep_ in ["aux", "auxpass", "mark", "case"]) or (
                child.dep_ == "advmod" and child.pos_ == "SCONJ"
            ):
                to_reverse.append(child)
        # Invert the nearest function words first.
        to_reverse.sort(key=lambda x: abs(x.i - token.i))
        if len(to_reverse) > 1:
            for i in range(1, len(to_reverse)):
                if to_reverse[i].dep_ in ["aux", "auxpass"]:
                    # The auxiliary inherits the subject/modifier/conjunct
                    # dependents of the word it is about to govern.
                    for child in to_reverse[i - 1].children:
                        if child.dep_ in relations["subj"] + relations["mod"] + [
                            "conj",
                            "cc",
                        ]:
                            child.head = to_reverse[i]
                # Swap head and dependent: the function word takes over the
                # previous head's attachment point and label, and the
                # previous head becomes its complement.
                to_reverse[i].head = (
                    to_reverse[i - 1].head
                    if to_reverse[i - 1].head != to_reverse[i - 1]
                    else to_reverse[i]
                )
                to_reverse[i].dep_ = to_reverse[i - 1].dep_
                to_reverse[i - 1].head = to_reverse[i]
                to_reverse[i - 1].dep_ = "comp"
    for token in doc:
        if token.dep_ == "dep":
            token.dep_ = "unknown"
        # A preposition left of its verb acts as a modifier ...
        if (
            token.dep_ == "prep"
            and token.head.pos_ in ["VERB", "AUX"]
            and token.i < token.head.i
            and token.head.dep_ not in relations["mod"]
        ):
            token.dep_ = "mod"
        # ... while one that is the verb's first right dependent (or second,
        # immediately after a direct object) acts as a complement.
        if (
            token.dep_ == "prep"
            and token.head.pos_ in ["VERB", "AUX"]
            and (
                (
                    len(list(token.head.rights)) >= 1
                    and token == list(token.head.rights)[0]
                )
                or (
                    len(list(token.head.rights)) >= 2
                    and list(token.head.rights)[0].dep_ == "dobj"
                    and token == list(token.head.rights)[1]
                )
            )
        ):
            token.dep_ = "comp"
        # A clausal complement preceded by another complement sibling is
        # demoted to a modifier.
        if token.dep_ == "ccomp" and any(
            sibling.dep_ in relations["comp"]
            for sibling in token.head.rights
            if sibling.i < token.i
        ):
            token.dep_ = "mod"
        # Chain multiple direct objects: each extra dobj becomes an
        # apposition of the previous one.
        dobjs = [child for child in token.children if child.dep_ == "dobj"]
        if len(dobjs) > 1:
            for i in range(1, len(dobjs)):
                dobjs[i].head = dobjs[i - 1]
                dobjs[i].dep_ = "appos"
    # Collapse fine-grained labels into the coarse groups.
    for token in doc:
        for rel in relations.keys():
            if token.dep_ in relations[rel]:
                token.dep_ = rel
    # Keep only the subject closest to its head; demote the rest.
    for token in doc:
        subjects = sorted(
            [child for child in token.children if child.dep_ == "subj"],
            key=lambda x: abs(x.i - token.i),
        )
        if len(subjects) > 1:
            for s in subjects[1:]:
                s.dep_ = "comp"
    # Complements/udeps that occur before the subject become modifiers.
    for token in doc:
        subject = [child for child in token.children if child.dep_ == "subj"]
        if subject:
            for child in [c for c in token.children if c.i < subject[0].i]:
                if child.dep_ in ["comp", "udep"] and token.dep_ != "mod":
                    child.dep_ = "mod"
    for token in doc:
        # An arc that crosses a ';' or ':' (unless that punctuation is
        # enclosed in parentheses between the two ends) is cut: the
        # dependent becomes its own root.
        if any(
            t.text in [";", ":"]
            for t in doc
            if (
                (
                    token.i < t.i < token.head.i
                    and not (
                        any(p.text == "(" for p in doc if token.i < p.i < t.i)
                        and any(p.text == ")" for p in doc if t.i < p.i < token.head.i)
                    )
                )
                or (
                    token.head.i < t.i < token.i
                    and not (
                        any(p.text == "(" for p in doc if token.head.i < p.i < t.i)
                        and any(p.text == ")" for p in doc if t.i < p.i < token.i)
                    )
                )
            )
            and token.pos_ != "PUNCT"
        ):
            token.head = token
            token.dep_ = "root"
        if token.pos_ in ["VERB", "AUX"]:
            # Rightmost non-punctuation word covered by the verb's core
            # (subj/comp/udep) children; conjuncts attached beyond a comma
            # at that edge are demoted to modifiers.
            core_children = [
                child
                for child in token.children
                if child.dep_ in ["subj", "comp", "udep"]
            ]
            core_children.append(token)
            core_children.sort(key=lambda x: x.i)
            right_edge = [t for t in core_children[-1].subtree if t.pos_ != "PUNCT"][-1]
            if right_edge.i < len(doc) - 1:
                if right_edge.text == "," or doc[right_edge.i + 1].text == ",":
                    for child in [
                        child
                        for child in token.children
                        if child.i > right_edge.i and child.dep_ == "conj"
                    ]:
                        child.dep_ = "mod"
        # A verb hanging off a noun with an underspecified label modifies it.
        if (
            token.pos_ in ["VERB", "AUX"]
            and token.head.pos_ == "NOUN"
            and token.dep_ == "udep"
        ):
            token.dep_ = "mod"
    return doc
def flyover(token):
    """Return the stretch of text a dependency arc "flies over".

    For a token whose relation is ``subj`` or ``comp``, returns a tuple
    ``(span, dep_distance)`` where ``span`` is the text strictly between the
    token and its head, and ``dep_distance`` counts the tokens in that
    interval that govern at least one dependent (the head-count notion of
    dependency length).  Any other token yields an empty span with
    distance 0.
    """
    if token.dep_ in ["subj", "comp"]:
        lo, hi = sorted((token.i, token.head.i))
        # Only intervening tokens that themselves have dependents count
        # towards the distance.
        dep_distance = sum(
            1 for t in token.doc[lo + 1 : hi] if len(list(t.children)) > 0
        )
        if token.head.i < token.i:
            return (token.doc[token.head.i + 1 : token.i], dep_distance)
        elif token.head.i > token.i:
            return (token.doc[token.i + 1 : token.head.i], dep_distance)
    else:
        return (token.doc[token.i : token.i], 0)
def get_fluff(doc):
    """Segment *doc* into ``(span, weight)`` pieces for highlighting.

    ``flyover`` yields, for every subj/comp arc, the words between the
    dependent and its head plus the number of intervening heads.  This
    function keeps only the non-dominated flyovers (not overlapped by a
    longer or heavier one) with a positive head count, then fills the gaps
    before, between and after them with zero-weight "interstice" spans so
    the returned segments cover the whole paragraph in order.

    Returns a list of ``(span, weight)`` tuples sorted by start position;
    weight 0 means "do not highlight".
    """
    # An empty paragraph (e.g. consecutive blank lines in the submitted
    # text) has nothing to annotate; without this guard the fallback below
    # would evaluate doc[-1]/doc[0] on an empty doc and raise IndexError.
    if len(doc) == 0:
        return []
    flyovers = list(map(flyover, doc))
    flyovers = [f for f in flyovers if len(f[0]) > 0]
    # Drop any flyover that overlaps a strictly "bigger" one (longer span or
    # larger head count), and any with zero head-distance.
    flyovers = [
        f1
        for f1 in flyovers
        if len(
            [
                f2
                for f2 in flyovers
                if (
                    f2[0][-1].i > f1[0][0].i >= f2[0][0].i
                    or f2[0][0].i < f1[0][-1].i <= f2[0][-1].i
                )
                and (len(f1[0]) < len(f2[0]) or f1[1] < f2[1])
            ]
        )
        == 0
        and f1[1] > 0
    ]
    flyovers = sorted(flyovers, key=lambda x: x[0][0].i)
    # Unhighlighted gaps before the first flyover and between flyovers.
    interstices = []
    for i in range(len(flyovers)):
        if i == 0:
            if flyovers[0][0][0].i > 0:
                interstices.append((doc[0 : flyovers[0][0][0].i], 0))
        else:
            if flyovers[i][0][0].i > flyovers[i - 1][0][-1].i + 1:
                interstices.append(
                    (doc[flyovers[i - 1][0][-1].i + 1 : flyovers[i][0][0].i], 0)
                )
    # Trailing gap — or the whole paragraph when nothing is highlighted.
    if len(flyovers) > 0:
        if flyovers[-1][0][-1].i < doc[-1].i:
            interstices.append((doc[flyovers[-1][0][-1].i + 1 :], 0))
    else:
        interstices.append((doc, 0))
    flyovers = [f for f in flyovers if len(f[0]) > 0]
    return sorted(flyovers + interstices, key=lambda x: x[0][0].i)
# NOTE(review): these imports sit mid-file rather than at the top; kept in
# place so the module-level statement order is unchanged.
from fasthtml_hf import setup_hf_backup
from fasthtml.common import *
import re

# FastHTML application with PicoCSS styling enabled.
app, rt = fast_app(pico=True)
@app.get
def index():
    """Main page: an input textarea on the left and a scroll-synced,
    highlighted rendering of the text on the right (filled in by `send`)."""
    page = Div(
        Form(
            hx_post=send,
            hx_target="#output",
            hx_swap="outerHTML show:none",
        )(
            Div(
                Span(
                    Button("Check"),
                    A("How this works", href="/about"),
                    style="margin-bottom: 1rem; display: flex; gap: 1rem; align-items: center",
                ),
                Textarea(
                    name="text",
                    id="input-text",
                    style="height: calc(100vh - 11rem);",
                    # Keep the output pane scroll-locked to the textarea.
                    onscroll="document.getElementById('output').scrollTop = this.scrollTop + 1; document.getElementById('output').scrollLeft = this.scrollLeft;",
                ),
            )
        ),
        Div(
            Div(
                Em(
                    "Highlighted text segments can be shortened or reordered to improve readability."
                ),
                cls="overflow-auto",
                style="height: 4rem; text-wrap: balance; padding: 0rem 1rem",
            ),
            # Placeholder that `send` replaces via htmx.
            Div(id="output", style="padding: 1rem calc(1rem - 5px)"),
        ),
        cls="grid",
    )
    return Titled("Readability feedback", page)
@app.get
def about():
    """Static "How this works" page: explains the dependency-length idea
    behind the highlighting and links the supporting literature."""
    content = Div(
        H2("How this works"),
        P(
            "One of the keys to writing clearly is to ",
            Em("keep related words close together"),
            ". Don't, if you want to be understood, insert any long asides! (See what I did there?) This tool helps you visually identify places in your writing where two related words are interrupted by an aside, which you can then either shorten or move to a different position in the sentence.",
        ),
        P(
            "But how do we identify words in a sentence that are related to each other? We can do this using a technique from natural language processing called ",
            Em("dependency parsing"),
            ". For example, we can take a sentence like ",
            Var("The manager approved the proposal although she had doubts"),
            ", and produce a diagram like the following:",
        ),
        # Pre-rendered SVG parse diagrams are read from disk and inlined.
        Div(
            NotStr(
                open("sample_parse.svg", "r").read(),
            ),
            style="margin-top: 1rem; margin-bottom: 1rem; width: 100%; overflow-x: auto",
        ),
        P(
            "This tells us, for example, that ",
            Var("manager"),
            " (or ",
            Var("the manager"),
            ") is the subject of ",
            Var("approved"),
            ' (since she is the "main character" of the event of approving); that ',
            Var("proposal"),
            " (or ",
            Var("the proposal"),
            ") is a complement of ",
            Var("approved"),
            " (since you cannot imagine an act of approving without imagining the thing that is being approved—in this case, the proposal); and that ",
            Var("although"),
            " (or ",
            Var("although she had doubts"),
            ") is a modifier of ",
            Var("approved"),
            " (since it gives us the context of the manager's approval). Naturally, subjects and complements are more closely related to the verb than modifiers are, and so we ignore modifiers when identifying related words that should be kept close together. We can see that in this sentence, the subject and the complement are right next to the verb, and so the sentence is easy to read.",
        ),
        P("Now let us see what happens when we reorder the sentence:"),
        Div(
            NotStr(
                open("sample_parse_2.svg", "r").read(),
            ),
            style="margin-top: 1rem; margin-bottom: 1rem; width: 100%; overflow-x: auto",
        ),
        P(
            "Here, we see that the modifier ",
            Var("although she had doubts"),
            " now interrupts the subject relation between ",
            Var("the manager"),
            " and ",
            Var("approved"),
            ". And indeed, you can see that this sentence is harder to read than the first one.",
        ),
        H2("Technical details"),
        P(
            "The inspiration for this tool is the idea of ",
            Em("Dependency Length Minimisation"),
            " (DLM) in psycholinguistics, which posits that human languages tend to minimise the distance between syntactically related words to reduce cognitive load during sentence processing. For more information on DLM, see ",
            A(
                "Futrell et al. (2015)",
                href="https://pmc.ncbi.nlm.nih.gov/articles/PMC4547262/",
            ),
            ". For evidence that dependency length predicts reading times in English, see e.g. ",
            A(
                "Bartek et al. (2011)",
                href="https://pubmed.ncbi.nlm.nih.gov/21707210/",
            ),
            ".",
        ),
        P(
            "I have used the ",
            A("spaCy", href="https://spacy.io/"),
            " library's ",
            Var("en_core_web_sm"),
            " model to perform dependency parsing, adjusting the results to bring them in line with the ",
            A(
                "Surface Syntactic Universal Dependencies (SUD)",
                href="https://surfacesyntacticud.org",
            ),
            " framework, which is more consistent with linguistic theories. For any dependency arc that exhibits the ",
            Var("subj"),
            " (subject) or ",
            Var("comp"),
            " (complement) relation, the words lying between the head and the dependent are highlighted with an opacity proportional to the number of heads (i.e. words with at least one dependent) in that interval. This follows the revised definition of dependency length proposed by ",
            A(
                "Yadav et al. (2022)",
                href="https://direct.mit.edu/opmi/article/doi/10.1162/opmi_a_00060/112598/A-Reappraisal-of-Dependency-Length-Minimization-as",
            ),
            ".",
        ),
        P(
            "The web app itself was built using the ",
            A("FastHTML", href="https://fasthtml.org/"),
            " framework, which I learned about in the ",
            A("Solve It With Code", href="https://solve.it.com/"),
            " course from ",
            A("Answer.AI", href="https://www.answer.ai/"),
            ". The entire development took two days (starting on December 26, 2025), with an extra day for handling various edge cases. This page was added on January 4, 2026.",
        ),
        A("Back to main page", href="/"),
        style="padding-bottom: 1rem; padding-top: 1rem; max-width: 800px; margin: auto",
    )
    return Titled("Readability feedback", content)
@app.post
def send(text: str):
    """Parse the submitted text and return the highlighted rendering.

    Splits *text* into paragraphs, runs each through the spaCy pipeline and
    the SUD adjustments (`sudify`), computes the highlightable segments
    (`get_fluff`), and returns a Div that replaces the ``#output`` pane.
    """
    # Collapse runs of horizontal whitespace; paragraphs are separated by a
    # CRLF blank line (browsers normalise textarea submissions to \r\n —
    # NOTE(review): plain "\n\n" input would not be split; confirm callers).
    paragraphs = re.sub(r"[^\S\r\n]+", " ", text).split("\r\n\r\n")
    docs = [sudify(nlp(para)) for para in paragraphs]
    annot_paras = [get_fluff(doc) for doc in docs]
    # Re-sync the output pane's scroll position once it has rendered.
    sync_script = Script(
        "setTimeout(() => { const textarea = document.getElementById('input-text'); const output = document.getElementById('output'); if (textarea && output) { output.scrollTop = textarea.scrollTop + 1; output.scrollLeft = textarea.scrollLeft; } }, 100);"
    )

    def _render_para(annot_para, margin_bottom):
        # One output paragraph: each (span, weight) pair becomes a segment
        # whose background opacity grows with the weight (weight/5, so it
        # saturates at 5 intervening heads).
        return P(
            *[
                Span(
                    Span(
                        a[0],
                        style=f"background: light-dark(rgba(237, 201, 241, {a[1]/5}), rgba(182, 69, 205, {a[1]/5}))",
                    ),
                    Span(" "),
                )
                for a in annot_para
            ],
            style=f"margin-bottom: {margin_bottom}",
        )

    return Div(
        sync_script,
        # All but the last paragraph carry bottom spacing.
        *[_render_para(p, "1.5em") for p in annot_paras[:-1]],
        _render_para(annot_paras[-1], "0em"),
        id="output",
        cls="overflow-auto",
        style="height: calc(100vh - 11rem); padding: 1rem; padding-bottom: calc(1rem - 5px);",
        # Keep the textarea scroll-locked to this pane.
        onscroll="document.getElementById('input-text').scrollTop = this.scrollTop - 1; document.getElementById('input-text').scrollLeft = this.scrollLeft;",
    )
# NOTE(review): presumably schedules periodic backups of app state to a
# Hugging Face dataset — confirm against the fasthtml_hf documentation.
setup_hf_backup(app)
# Start the FastHTML/uvicorn server.
serve()
|