skalyan91 committed on
Commit
5b44626
·
verified ·
1 Parent(s): a6daf52

deploy at 2026-01-04 16:38:37.435466

Browse files
Files changed (4) hide show
  1. Dependency length.ipynb +72 -55
  2. main.py +308 -62
  3. sample_parse.svg +406 -0
  4. sample_parse_2.svg +406 -0
Dependency length.ipynb CHANGED
@@ -68,7 +68,8 @@
68
  " if token.dep_ == \"prep\" and token.head.pos_ in [\"VERB\", \"AUX\"] and \\\n",
69
  " ((len(list(token.head.rights)) >= 1 and token == list(token.head.rights)[0]) or \n",
70
  " (len(list(token.head.rights)) >= 2 and \n",
71
- " list(token.head.rights)[0].dep_ == \"dobj\" and token == list(token.head.rights)[1])):\n",
 
72
  " token.dep_ = \"comp\"\n",
73
  " if token.dep_ == \"ccomp\" and any(sibling.dep_ in relations[\"comp\"] \n",
74
  " for sibling in token.head.rights if sibling.i < token.i):\n",
@@ -122,12 +123,13 @@
122
  "outputs": [],
123
  "source": [
124
  "def flyover(token):\n",
125
- " if token.dep_ in [\"subj\", \"comp\", \"conj\"]:\n",
126
- " dep_distance = abs(token.i - token.head.i)\n",
 
127
  " if token.head.i < token.i:\n",
128
- " return (token.doc[token.head.i+1:token.i], dep_distance - 1)\n",
129
  " elif token.head.i > token.i:\n",
130
- " return (token.doc[token.i+1:token.head.i], dep_distance - 1)\n",
131
  " else:\n",
132
  " return (token.doc[token.i:token.i], 0)"
133
  ]
@@ -144,7 +146,7 @@
144
  " flyovers = [f for f in flyovers if len(f[0]) > 0]\n",
145
  " flyovers = [f1 for f1 in flyovers if len([f2 for f2 in flyovers if \n",
146
  " (f2[0][-1].i > f1[0][0].i >= f2[0][0].i or f2[0][0].i < f1[0][-1].i <= f2[0][-1].i) and \n",
147
- " (len(f1[0]) < len(f2[0]) or f1[1] < f2[1])]) == 0 and len(f1[0]) > 2]\n",
148
  " flyovers = sorted(flyovers, key=lambda x: x[0][0].i)\n",
149
  " interstices = []\n",
150
  " for i in range(len(flyovers)):\n",
@@ -168,108 +170,121 @@
168
  },
169
  {
170
  "cell_type": "code",
171
- "execution_count": 6,
172
  "id": "ba90ff19-c665-49d8-8ad4-5caee885901d",
173
  "metadata": {},
174
  "outputs": [
175
  {
176
  "data": {
177
  "text/html": [
178
- "<span class=\"tex2jax_ignore\"><svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"7cae4e9aeafb4cf4bd557e4780eb30b6-0\" class=\"displacy\" width=\"1450\" height=\"399.5\" direction=\"ltr\" style=\"max-width: none; height: 399.5px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
179
- "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n",
180
- " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">They</tspan>\n",
181
- " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PRON</tspan>\n",
182
  "</text>\n",
183
  "\n",
184
- "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n",
185
- " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"225\">designated</tspan>\n",
186
- " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"225\">VERB</tspan>\n",
187
  "</text>\n",
188
  "\n",
189
- "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n",
190
- " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"400\">the</tspan>\n",
191
- " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"400\">DET</tspan>\n",
192
  "</text>\n",
193
  "\n",
194
- "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n",
195
- " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"575\">building,</tspan>\n",
196
- " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"575\">NOUN</tspan>\n",
197
  "</text>\n",
198
  "\n",
199
- "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n",
200
- " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">as</tspan>\n",
201
- " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">ADP</tspan>\n",
202
  "</text>\n",
203
  "\n",
204
- "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n",
205
- " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"925\">a</tspan>\n",
206
- " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"925\">DET</tspan>\n",
207
  "</text>\n",
208
  "\n",
209
- "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n",
210
- " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1100\">national</tspan>\n",
211
- " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1100\">ADJ</tspan>\n",
212
  "</text>\n",
213
  "\n",
214
- "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n",
215
- " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1275\">landmark.</tspan>\n",
216
- " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1275\">NOUN</tspan>\n",
217
  "</text>\n",
218
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  "<g class=\"displacy-arrow\">\n",
220
- " <path class=\"displacy-arc\" id=\"arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-0\" stroke-width=\"2px\" d=\"M70,264.5 C70,177.0 215.0,177.0 215.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n",
221
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
222
- " <textPath xlink:href=\"#arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">subj</textPath>\n",
223
  " </text>\n",
224
- " <path class=\"displacy-arrowhead\" d=\"M70,266.5 L62,254.5 78,254.5\" fill=\"currentColor\"/>\n",
225
  "</g>\n",
226
  "\n",
227
  "<g class=\"displacy-arrow\">\n",
228
- " <path class=\"displacy-arc\" id=\"arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-1\" stroke-width=\"2px\" d=\"M420,264.5 C420,177.0 565.0,177.0 565.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n",
229
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
230
- " <textPath xlink:href=\"#arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
231
  " </text>\n",
232
- " <path class=\"displacy-arrowhead\" d=\"M420,266.5 L412,254.5 428,254.5\" fill=\"currentColor\"/>\n",
233
  "</g>\n",
234
  "\n",
235
  "<g class=\"displacy-arrow\">\n",
236
- " <path class=\"displacy-arc\" id=\"arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-2\" stroke-width=\"2px\" d=\"M245,264.5 C245,89.5 570.0,89.5 570.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n",
237
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
238
- " <textPath xlink:href=\"#arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">comp</textPath>\n",
239
  " </text>\n",
240
- " <path class=\"displacy-arrowhead\" d=\"M570.0,266.5 L578.0,254.5 562.0,254.5\" fill=\"currentColor\"/>\n",
241
  "</g>\n",
242
  "\n",
243
  "<g class=\"displacy-arrow\">\n",
244
- " <path class=\"displacy-arc\" id=\"arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-3\" stroke-width=\"2px\" d=\"M245,264.5 C245,2.0 750.0,2.0 750.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n",
245
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
246
- " <textPath xlink:href=\"#arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">udep</textPath>\n",
247
  " </text>\n",
248
- " <path class=\"displacy-arrowhead\" d=\"M750.0,266.5 L758.0,254.5 742.0,254.5\" fill=\"currentColor\"/>\n",
249
  "</g>\n",
250
  "\n",
251
  "<g class=\"displacy-arrow\">\n",
252
- " <path class=\"displacy-arc\" id=\"arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-4\" stroke-width=\"2px\" d=\"M945,264.5 C945,89.5 1270.0,89.5 1270.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n",
253
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
254
- " <textPath xlink:href=\"#arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
255
  " </text>\n",
256
- " <path class=\"displacy-arrowhead\" d=\"M945,266.5 L937,254.5 953,254.5\" fill=\"currentColor\"/>\n",
257
  "</g>\n",
258
  "\n",
259
  "<g class=\"displacy-arrow\">\n",
260
- " <path class=\"displacy-arc\" id=\"arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-5\" stroke-width=\"2px\" d=\"M1120,264.5 C1120,177.0 1265.0,177.0 1265.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n",
261
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
262
- " <textPath xlink:href=\"#arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">udep</textPath>\n",
263
  " </text>\n",
264
- " <path class=\"displacy-arrowhead\" d=\"M1120,266.5 L1112,254.5 1128,254.5\" fill=\"currentColor\"/>\n",
265
  "</g>\n",
266
  "\n",
267
  "<g class=\"displacy-arrow\">\n",
268
- " <path class=\"displacy-arc\" id=\"arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-6\" stroke-width=\"2px\" d=\"M770,264.5 C770,2.0 1275.0,2.0 1275.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n",
269
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
270
- " <textPath xlink:href=\"#arrow-7cae4e9aeafb4cf4bd557e4780eb30b6-0-6\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">comp</textPath>\n",
271
  " </text>\n",
272
- " <path class=\"displacy-arrowhead\" d=\"M1275.0,266.5 L1283.0,254.5 1267.0,254.5\" fill=\"currentColor\"/>\n",
273
  "</g>\n",
274
  "</svg></span>"
275
  ],
@@ -282,9 +297,11 @@
282
  }
283
  ],
284
  "source": [
285
- "doc = ssudify(nlp(\"They designated the building, as a national landmark.\"))\n",
286
  "# Since this is an interactive Jupyter environment, we can use displacy.render here\n",
287
- "displacy.render(doc, style='dep')"
 
 
288
  ]
289
  },
290
  {
 
68
  " if token.dep_ == \"prep\" and token.head.pos_ in [\"VERB\", \"AUX\"] and \\\n",
69
  " ((len(list(token.head.rights)) >= 1 and token == list(token.head.rights)[0]) or \n",
70
  " (len(list(token.head.rights)) >= 2 and \n",
71
+ " (list(token.head.rights)[0].dep_ == \"dobj\" or list(token.head.rights)[0].pos_ == \"ADV\") and \n",
72
+ " token == list(token.head.rights)[1])):\n",
73
  " token.dep_ = \"comp\"\n",
74
  " if token.dep_ == \"ccomp\" and any(sibling.dep_ in relations[\"comp\"] \n",
75
  " for sibling in token.head.rights if sibling.i < token.i):\n",
 
123
  "outputs": [],
124
  "source": [
125
  "def flyover(token):\n",
126
+ " if token.dep_ in [\"subj\", \"comp\"]:\n",
127
+ " dep_distance = len([t for t in token.doc[min(token.i, token.head.i) + 1 : max(token.i, token.head.i)]\n",
128
+ " if len(list(t.children)) > 0])\n",
129
  " if token.head.i < token.i:\n",
130
+ " return (token.doc[token.head.i+1:token.i], dep_distance)\n",
131
  " elif token.head.i > token.i:\n",
132
+ " return (token.doc[token.i+1:token.head.i], dep_distance)\n",
133
  " else:\n",
134
  " return (token.doc[token.i:token.i], 0)"
135
  ]
 
146
  " flyovers = [f for f in flyovers if len(f[0]) > 0]\n",
147
  " flyovers = [f1 for f1 in flyovers if len([f2 for f2 in flyovers if \n",
148
  " (f2[0][-1].i > f1[0][0].i >= f2[0][0].i or f2[0][0].i < f1[0][-1].i <= f2[0][-1].i) and \n",
149
+ " (len(f1[0]) < len(f2[0]) or f1[1] < f2[1])]) == 0 and len(f1[0]) > 0]\n",
150
  " flyovers = sorted(flyovers, key=lambda x: x[0][0].i)\n",
151
  " interstices = []\n",
152
  " for i in range(len(flyovers)):\n",
 
170
  },
171
  {
172
  "cell_type": "code",
173
+ "execution_count": 13,
174
  "id": "ba90ff19-c665-49d8-8ad4-5caee885901d",
175
  "metadata": {},
176
  "outputs": [
177
  {
178
  "data": {
179
  "text/html": [
180
+ "<span class=\"tex2jax_ignore\"><svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"b935f75a06f14438a922ac30e5ab8f72-0\" class=\"displacy\" width=\"950\" height=\"337.0\" direction=\"ltr\" style=\"max-width: none; height: 337.0px; color: #000000; background: #ffffff; font-family: ; direction: ltr\">\n",
181
+ "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"247.0\">\n",
182
+ " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">The</tspan>\n",
183
+ " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">DET</tspan>\n",
184
  "</text>\n",
185
  "\n",
186
+ "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"247.0\">\n",
187
+ " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"150\">manager,</tspan>\n",
188
+ " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"150\">NOUN</tspan>\n",
189
  "</text>\n",
190
  "\n",
191
+ "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"247.0\">\n",
192
+ " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"250\">although</tspan>\n",
193
+ " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"250\">SCONJ</tspan>\n",
194
  "</text>\n",
195
  "\n",
196
+ "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"247.0\">\n",
197
+ " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"350\">she</tspan>\n",
198
+ " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"350\">PRON</tspan>\n",
199
  "</text>\n",
200
  "\n",
201
+ "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"247.0\">\n",
202
+ " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"450\">had</tspan>\n",
203
+ " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"450\">VERB</tspan>\n",
204
  "</text>\n",
205
  "\n",
206
+ "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"247.0\">\n",
207
+ " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"550\">doubts,</tspan>\n",
208
+ " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"550\">NOUN</tspan>\n",
209
  "</text>\n",
210
  "\n",
211
+ "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"247.0\">\n",
212
+ " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"650\">approved</tspan>\n",
213
+ " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"650\">VERB</tspan>\n",
214
  "</text>\n",
215
  "\n",
216
+ "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"247.0\">\n",
217
+ " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">the</tspan>\n",
218
+ " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">DET</tspan>\n",
219
  "</text>\n",
220
  "\n",
221
+ "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"247.0\">\n",
222
+ " <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"850\">proposal.</tspan>\n",
223
+ " <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"850\">NOUN</tspan>\n",
224
+ "</text>\n",
225
+ "\n",
226
+ "<g class=\"displacy-arrow\">\n",
227
+ " <path class=\"displacy-arc\" id=\"arrow-b935f75a06f14438a922ac30e5ab8f72-0-0\" stroke-width=\"2px\" d=\"M62,202.0 62,185.33333333333334 141.0,185.33333333333334 141.0,202.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
228
+ " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
229
+ " <textPath xlink:href=\"#arrow-b935f75a06f14438a922ac30e5ab8f72-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
230
+ " </text>\n",
231
+ " <path class=\"displacy-arrowhead\" d=\"M62,204.0 L58,196.0 66,196.0\" fill=\"currentColor\"/>\n",
232
+ "</g>\n",
233
+ "\n",
234
  "<g class=\"displacy-arrow\">\n",
235
+ " <path class=\"displacy-arc\" id=\"arrow-b935f75a06f14438a922ac30e5ab8f72-0-1\" stroke-width=\"2px\" d=\"M162,202.0 162,135.33333333333331 650.0,135.33333333333331 650.0,202.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
236
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
237
+ " <textPath xlink:href=\"#arrow-b935f75a06f14438a922ac30e5ab8f72-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">subj</textPath>\n",
238
  " </text>\n",
239
+ " <path class=\"displacy-arrowhead\" d=\"M162,204.0 L158,196.0 166,196.0\" fill=\"currentColor\"/>\n",
240
  "</g>\n",
241
  "\n",
242
  "<g class=\"displacy-arrow\">\n",
243
+ " <path class=\"displacy-arc\" id=\"arrow-b935f75a06f14438a922ac30e5ab8f72-0-2\" stroke-width=\"2px\" d=\"M262,202.0 262,152.0 647.0,152.0 647.0,202.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
244
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
245
+ " <textPath xlink:href=\"#arrow-b935f75a06f14438a922ac30e5ab8f72-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mod</textPath>\n",
246
  " </text>\n",
247
+ " <path class=\"displacy-arrowhead\" d=\"M262,204.0 L258,196.0 266,196.0\" fill=\"currentColor\"/>\n",
248
  "</g>\n",
249
  "\n",
250
  "<g class=\"displacy-arrow\">\n",
251
+ " <path class=\"displacy-arc\" id=\"arrow-b935f75a06f14438a922ac30e5ab8f72-0-3\" stroke-width=\"2px\" d=\"M362,202.0 362,185.33333333333334 441.0,185.33333333333334 441.0,202.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
252
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
253
+ " <textPath xlink:href=\"#arrow-b935f75a06f14438a922ac30e5ab8f72-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">subj</textPath>\n",
254
  " </text>\n",
255
+ " <path class=\"displacy-arrowhead\" d=\"M362,204.0 L358,196.0 366,196.0\" fill=\"currentColor\"/>\n",
256
  "</g>\n",
257
  "\n",
258
  "<g class=\"displacy-arrow\">\n",
259
+ " <path class=\"displacy-arc\" id=\"arrow-b935f75a06f14438a922ac30e5ab8f72-0-4\" stroke-width=\"2px\" d=\"M262,202.0 262,168.66666666666666 444.0,168.66666666666666 444.0,202.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
260
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
261
+ " <textPath xlink:href=\"#arrow-b935f75a06f14438a922ac30e5ab8f72-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">comp</textPath>\n",
262
  " </text>\n",
263
+ " <path class=\"displacy-arrowhead\" d=\"M444.0,204.0 L448.0,196.0 440.0,196.0\" fill=\"currentColor\"/>\n",
264
  "</g>\n",
265
  "\n",
266
  "<g class=\"displacy-arrow\">\n",
267
+ " <path class=\"displacy-arc\" id=\"arrow-b935f75a06f14438a922ac30e5ab8f72-0-5\" stroke-width=\"2px\" d=\"M462,202.0 462,185.33333333333334 541.0,185.33333333333334 541.0,202.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
268
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
269
+ " <textPath xlink:href=\"#arrow-b935f75a06f14438a922ac30e5ab8f72-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">comp</textPath>\n",
270
  " </text>\n",
271
+ " <path class=\"displacy-arrowhead\" d=\"M541.0,204.0 L545.0,196.0 537.0,196.0\" fill=\"currentColor\"/>\n",
272
  "</g>\n",
273
  "\n",
274
  "<g class=\"displacy-arrow\">\n",
275
+ " <path class=\"displacy-arc\" id=\"arrow-b935f75a06f14438a922ac30e5ab8f72-0-6\" stroke-width=\"2px\" d=\"M762,202.0 762,185.33333333333334 841.0,185.33333333333334 841.0,202.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
276
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
277
+ " <textPath xlink:href=\"#arrow-b935f75a06f14438a922ac30e5ab8f72-0-6\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
278
  " </text>\n",
279
+ " <path class=\"displacy-arrowhead\" d=\"M762,204.0 L758,196.0 766,196.0\" fill=\"currentColor\"/>\n",
280
  "</g>\n",
281
  "\n",
282
  "<g class=\"displacy-arrow\">\n",
283
+ " <path class=\"displacy-arc\" id=\"arrow-b935f75a06f14438a922ac30e5ab8f72-0-7\" stroke-width=\"2px\" d=\"M662,202.0 662,168.66666666666666 844.0,168.66666666666666 844.0,202.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
284
  " <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
285
+ " <textPath xlink:href=\"#arrow-b935f75a06f14438a922ac30e5ab8f72-0-7\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">comp</textPath>\n",
286
  " </text>\n",
287
+ " <path class=\"displacy-arrowhead\" d=\"M844.0,204.0 L848.0,196.0 840.0,196.0\" fill=\"currentColor\"/>\n",
288
  "</g>\n",
289
  "</svg></span>"
290
  ],
 
297
  }
298
  ],
299
  "source": [
300
+ "doc = ssudify(nlp(\"The manager, although she had doubts, approved the proposal.\"))\n",
301
  "# Since this is an interactive Jupyter environment, we can use displacy.render here\n",
302
+ "displacy.render(doc, style='dep', options={'compact': True, 'font': \"\", 'distance': 100})\n",
303
+ "# with open(\"sample_parse.svg\", \"w\", encoding=\"utf-8\") as f:\n",
304
+ "# f.write(svg)"
305
  ]
306
  },
307
  {
main.py CHANGED
@@ -7,52 +7,97 @@ nlp = spacy.load("en_core_web_sm")
7
 
8
  relations = {
9
  "subj": ["nsubj", "nsubjpass", "csubj", "csubjpass", "expl"],
10
- "comp": ["dobj", "dative", "attr", "oprd", "pobj", "aux", "auxpass", "mark", "case", "ccomp", "xcomp", "acomp"],
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  "mod": ["agent", "advmod", "advcl", "relcl", "npmod", "npadvmod", "prt"],
12
  "udep": ["acl", "amod", "nmod", "poss", "nummod", "prep"],
13
  }
14
 
15
 
16
- def ssudify(doc):
17
  for token in doc:
18
  to_reverse = [token]
19
  for child in token.children:
20
- if (child.dep_ in ["aux", "auxpass", "mark", "case"]) or (child.dep_ == "advmod" and child.pos_ == "SCONJ"):
 
 
21
  to_reverse.append(child)
22
- to_reverse.sort(key = lambda x: abs(x.i - token.i))
23
  if len(to_reverse) > 1:
24
  for i in range(1, len(to_reverse)):
25
  if to_reverse[i].dep_ in ["aux", "auxpass"]:
26
- for child in to_reverse[i-1].children:
27
- if child.dep_ in relations["subj"] + relations["mod"] + ["conj", "cc"]:
 
 
 
28
  child.head = to_reverse[i]
29
- to_reverse[i].head = to_reverse[i-1].head if to_reverse[i-1].head != to_reverse[i-1] else to_reverse[i]
30
- to_reverse[i].dep_ = to_reverse[i-1].dep_
31
- to_reverse[i-1].head = to_reverse[i]
32
- to_reverse[i-1].dep_ = "comp"
 
 
 
 
33
  for token in doc:
34
- if token.dep_ == "dep": token.dep_ = "unknown"
35
- if token.dep_ == "prep" and token.head.pos_ in ["VERB", "AUX"] and \
36
- token.i < token.head.i and token.head.dep_ not in relations["mod"]:
 
 
 
 
 
37
  token.dep_ = "mod"
38
- if token.dep_ == "prep" and token.head.pos_ in ["VERB", "AUX"] and \
39
- ((len(list(token.head.rights)) >= 1 and token == list(token.head.rights)[0]) or
40
- (len(list(token.head.rights)) >= 2 and
41
- list(token.head.rights)[0].dep_ == "dobj" and token == list(token.head.rights)[1])):
 
 
 
 
 
 
 
 
 
 
 
42
  token.dep_ = "comp"
43
- if token.dep_ == "ccomp" and any(sibling.dep_ in relations["comp"]
44
- for sibling in token.head.rights if sibling.i < token.i):
 
 
 
45
  token.dep_ = "mod"
46
  dobjs = [child for child in token.children if child.dep_ == "dobj"]
47
  if len(dobjs) > 1:
48
  for i in range(1, len(dobjs)):
49
- dobjs[i].head = dobjs[i-1]
50
  dobjs[i].dep_ = "appos"
51
  for token in doc:
52
  for rel in relations.keys():
53
- if token.dep_ in relations[rel]: token.dep_ = rel
 
54
  for token in doc:
55
- subjects = sorted([child for child in token.children if child.dep_ == "subj"], key = lambda x: abs(x.i - token.i))
 
 
 
56
  if len(subjects) > 1:
57
  for s in subjects[1:]:
58
  s.dep_ = "comp"
@@ -63,66 +108,117 @@ def ssudify(doc):
63
  if child.dep_ in ["comp", "udep"] and token.dep_ != "mod":
64
  child.dep_ = "mod"
65
  for token in doc:
66
- if any(t.text in [";", ":"] for t in doc
67
- if ((token.i < t.i < token.head.i and not
68
- (any(p.text == "(" for p in doc if token.i < p.i < t.i) and
69
- any(p.text == ")" for p in doc if t.i < p.i < token.head.i))) or
70
- (token.head.i < t.i < token.i and not
71
- (any(p.text == "(" for p in doc if token.head.i < p.i < t.i) and
72
- any(p.text == ")" for p in doc if t.i < p.i < token.i)))) and token.pos_ != "PUNCT"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  token.head = token
74
  token.dep_ = "root"
75
  if token.pos_ in ["VERB", "AUX"]:
76
- core_children = [child for child in token.children if child.dep_ in ["subj", "comp", "udep"]]
 
 
 
 
77
  core_children.append(token)
78
  core_children.sort(key=lambda x: x.i)
79
  right_edge = [t for t in core_children[-1].subtree if t.pos_ != "PUNCT"][-1]
80
  if right_edge.i < len(doc) - 1:
81
- if right_edge.text == "," or doc[right_edge.i+1].text == ",":
82
- for child in [child for child in token.children if child.i > right_edge.i and child.dep_ == "conj"]:
 
 
 
 
83
  child.dep_ = "mod"
84
- if token.pos_ in ["VERB", "AUX"] and token.head.pos_ == "NOUN" and token.dep_ == "udep":
 
 
 
 
85
  token.dep_ = "mod"
86
  return doc
87
 
88
 
89
  def flyover(token):
90
- if token.dep_ in ["subj", "comp", "udep", "conj"]:
91
- dep_distance = abs(token.i - token.head.i)
 
 
 
 
 
 
 
 
92
  if token.head.i < token.i:
93
- return (token.doc[token.head.i+1:token.i], dep_distance - 1)
94
  elif token.head.i > token.i:
95
- return (token.doc[token.i+1:token.head.i], dep_distance - 1)
96
  else:
97
- return (token.doc[token.i:token.i], 0)
98
 
99
 
100
  def get_fluff(doc):
101
  flyovers = list(map(flyover, doc))
102
  flyovers = [f for f in flyovers if len(f[0]) > 0]
103
- flyovers = [f1 for f1 in flyovers if len([f2 for f2 in flyovers if
104
- (f2[0][-1].i > f1[0][0].i >= f2[0][0].i or f2[0][0].i < f1[0][-1].i <= f2[0][-1].i) and
105
- (len(f1[0]) < len(f2[0]) or f1[1] < f2[1])]) == 0 and len(f1[0]) > 2]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  flyovers = sorted(flyovers, key=lambda x: x[0][0].i)
107
  interstices = []
108
  for i in range(len(flyovers)):
109
  if i == 0:
110
  if flyovers[0][0][0].i > 0:
111
- interstices.append((doc[0:flyovers[0][0][0].i], 0))
112
  else:
113
- if flyovers[i][0][0].i > flyovers[i-1][0][-1].i + 1:
114
- interstices.append((doc[flyovers[i-1][0][-1].i+1:flyovers[i][0][0].i], 0))
 
 
115
  # elif flyovers[i][1] == flyovers[i-1][1]:
116
  # flyovers[i] = (doc[flyovers[i-1][0][0].i:flyovers[i][0][-1].i+1], flyovers[i][1])
117
  # flyovers[i-1] = (doc[flyovers[i-1][0][0].i:flyovers[i-1][0][0].i], flyovers[i-1][1])
118
  if len(flyovers) > 0:
119
  if flyovers[-1][0][-1].i < doc[-1].i:
120
- interstices.append((doc[flyovers[-1][0][-1].i+1:], 0))
121
  else:
122
  interstices.append((doc, 0))
123
  flyovers = [f for f in flyovers if len(f[0]) > 0]
124
  return sorted(flyovers + interstices, key=lambda x: x[0][0].i)
125
 
 
126
  from fasthtml_hf import setup_hf_backup
127
  from fasthtml.common import *
128
  import re
@@ -134,31 +230,181 @@ app, rt = fast_app(pico=True)
134
  def index():
135
  page = Div(
136
  Form(hx_post=send, hx_target="#output", hx_swap="outerHTML")(
137
- Div(Button("Check", style="margin-bottom: 1rem"),
138
- Textarea(name="text", style="height: calc(100vh - 11rem)"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  Div(
141
- Div(Small(Em("Highlighted text segments can be shortened or reordered to improve readability. The stronger the highlight, the more the segment burdens the reader’s memory.")),
142
- cls="overflow-auto", style="height: 4rem; text-wrap: balance; padding: 0rem 1rem"),
143
- Div(id="output", style="padding: 1rem; padding-bottom: calc(1rem - 5px)")
 
144
  ),
145
- cls="grid"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  )
147
- return Titled('Readability feedback', page)
 
148
 
149
  @app.post
150
- def send(text:str):
151
  paragraphs = re.sub(r"[^\S\r\n]+", " ", text).split("\r\n\r\n")
152
- docs = [ssudify(nlp(para)) for para in paragraphs]
153
  annot_paras = [get_fluff(doc) for doc in docs]
154
- return Div(*[P(*[Span(Span(a[0], style=f"background: light-dark(rgba(237, 201, 241, {a[1]/15}), rgba(182, 69, 205, {a[1]/15}))"),
155
- Span(" ")) for a in annot_para],
156
- style="margin-bottom: 1.5em")
157
- for annot_para in annot_paras[:-1]],
158
- P(*[Span(Span(a[0], style=f"background: light-dark(rgba(237, 201, 241, {a[1]/15}), rgba(182, 69, 205, {a[1]/15}))"),
159
- Span(" ")) for a in annot_paras[-1]],
160
- style="margin-bottom: 0em"),
161
- id="output", cls="overflow-auto", style="height: calc(100vh - 11rem); padding: 1rem; padding-bottom: calc(1rem - 5px)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  setup_hf_backup(app)
164
  serve()
 
7
 
8
  relations = {
9
  "subj": ["nsubj", "nsubjpass", "csubj", "csubjpass", "expl"],
10
+ "comp": [
11
+ "dobj",
12
+ "dative",
13
+ "attr",
14
+ "oprd",
15
+ "pobj",
16
+ "aux",
17
+ "auxpass",
18
+ "mark",
19
+ "case",
20
+ "ccomp",
21
+ "xcomp",
22
+ "acomp",
23
+ ],
24
  "mod": ["agent", "advmod", "advcl", "relcl", "npmod", "npadvmod", "prt"],
25
  "udep": ["acl", "amod", "nmod", "poss", "nummod", "prep"],
26
  }
27
 
28
 
29
+ def sudify(doc):
30
  for token in doc:
31
  to_reverse = [token]
32
  for child in token.children:
33
+ if (child.dep_ in ["aux", "auxpass", "mark", "case"]) or (
34
+ child.dep_ == "advmod" and child.pos_ == "SCONJ"
35
+ ):
36
  to_reverse.append(child)
37
+ to_reverse.sort(key=lambda x: abs(x.i - token.i))
38
  if len(to_reverse) > 1:
39
  for i in range(1, len(to_reverse)):
40
  if to_reverse[i].dep_ in ["aux", "auxpass"]:
41
+ for child in to_reverse[i - 1].children:
42
+ if child.dep_ in relations["subj"] + relations["mod"] + [
43
+ "conj",
44
+ "cc",
45
+ ]:
46
  child.head = to_reverse[i]
47
+ to_reverse[i].head = (
48
+ to_reverse[i - 1].head
49
+ if to_reverse[i - 1].head != to_reverse[i - 1]
50
+ else to_reverse[i]
51
+ )
52
+ to_reverse[i].dep_ = to_reverse[i - 1].dep_
53
+ to_reverse[i - 1].head = to_reverse[i]
54
+ to_reverse[i - 1].dep_ = "comp"
55
  for token in doc:
56
+ if token.dep_ == "dep":
57
+ token.dep_ = "unknown"
58
+ if (
59
+ token.dep_ == "prep"
60
+ and token.head.pos_ in ["VERB", "AUX"]
61
+ and token.i < token.head.i
62
+ and token.head.dep_ not in relations["mod"]
63
+ ):
64
  token.dep_ = "mod"
65
+ if (
66
+ token.dep_ == "prep"
67
+ and token.head.pos_ in ["VERB", "AUX"]
68
+ and (
69
+ (
70
+ len(list(token.head.rights)) >= 1
71
+ and token == list(token.head.rights)[0]
72
+ )
73
+ or (
74
+ len(list(token.head.rights)) >= 2
75
+ and list(token.head.rights)[0].dep_ == "dobj"
76
+ and token == list(token.head.rights)[1]
77
+ )
78
+ )
79
+ ):
80
  token.dep_ = "comp"
81
+ if token.dep_ == "ccomp" and any(
82
+ sibling.dep_ in relations["comp"]
83
+ for sibling in token.head.rights
84
+ if sibling.i < token.i
85
+ ):
86
  token.dep_ = "mod"
87
  dobjs = [child for child in token.children if child.dep_ == "dobj"]
88
  if len(dobjs) > 1:
89
  for i in range(1, len(dobjs)):
90
+ dobjs[i].head = dobjs[i - 1]
91
  dobjs[i].dep_ = "appos"
92
  for token in doc:
93
  for rel in relations.keys():
94
+ if token.dep_ in relations[rel]:
95
+ token.dep_ = rel
96
  for token in doc:
97
+ subjects = sorted(
98
+ [child for child in token.children if child.dep_ == "subj"],
99
+ key=lambda x: abs(x.i - token.i),
100
+ )
101
  if len(subjects) > 1:
102
  for s in subjects[1:]:
103
  s.dep_ = "comp"
 
108
  if child.dep_ in ["comp", "udep"] and token.dep_ != "mod":
109
  child.dep_ = "mod"
110
  for token in doc:
111
+ if any(
112
+ t.text in [";", ":"]
113
+ for t in doc
114
+ if (
115
+ (
116
+ token.i < t.i < token.head.i
117
+ and not (
118
+ any(p.text == "(" for p in doc if token.i < p.i < t.i)
119
+ and any(p.text == ")" for p in doc if t.i < p.i < token.head.i)
120
+ )
121
+ )
122
+ or (
123
+ token.head.i < t.i < token.i
124
+ and not (
125
+ any(p.text == "(" for p in doc if token.head.i < p.i < t.i)
126
+ and any(p.text == ")" for p in doc if t.i < p.i < token.i)
127
+ )
128
+ )
129
+ )
130
+ and token.pos_ != "PUNCT"
131
+ ):
132
  token.head = token
133
  token.dep_ = "root"
134
  if token.pos_ in ["VERB", "AUX"]:
135
+ core_children = [
136
+ child
137
+ for child in token.children
138
+ if child.dep_ in ["subj", "comp", "udep"]
139
+ ]
140
  core_children.append(token)
141
  core_children.sort(key=lambda x: x.i)
142
  right_edge = [t for t in core_children[-1].subtree if t.pos_ != "PUNCT"][-1]
143
  if right_edge.i < len(doc) - 1:
144
+ if right_edge.text == "," or doc[right_edge.i + 1].text == ",":
145
+ for child in [
146
+ child
147
+ for child in token.children
148
+ if child.i > right_edge.i and child.dep_ == "conj"
149
+ ]:
150
  child.dep_ = "mod"
151
+ if (
152
+ token.pos_ in ["VERB", "AUX"]
153
+ and token.head.pos_ == "NOUN"
154
+ and token.dep_ == "udep"
155
+ ):
156
  token.dep_ = "mod"
157
  return doc
158
 
159
 
160
  def flyover(token):
161
+ if token.dep_ in ["subj", "comp"]:
162
+ dep_distance = len(
163
+ [
164
+ t
165
+ for t in token.doc[
166
+ min(token.i, token.head.i) + 1 : max(token.i, token.head.i)
167
+ ]
168
+ if len(list(t.children)) > 0
169
+ ]
170
+ )
171
  if token.head.i < token.i:
172
+ return (token.doc[token.head.i + 1 : token.i], dep_distance)
173
  elif token.head.i > token.i:
174
+ return (token.doc[token.i + 1 : token.head.i], dep_distance)
175
  else:
176
+ return (token.doc[token.i : token.i], 0)
177
 
178
 
179
  def get_fluff(doc):
180
  flyovers = list(map(flyover, doc))
181
  flyovers = [f for f in flyovers if len(f[0]) > 0]
182
+ flyovers = [
183
+ f1
184
+ for f1 in flyovers
185
+ if len(
186
+ [
187
+ f2
188
+ for f2 in flyovers
189
+ if (
190
+ f2[0][-1].i > f1[0][0].i >= f2[0][0].i
191
+ or f2[0][0].i < f1[0][-1].i <= f2[0][-1].i
192
+ )
193
+ and (len(f1[0]) < len(f2[0]) or f1[1] < f2[1])
194
+ ]
195
+ )
196
+ == 0
197
+ and f1[1] > 0
198
+ ]
199
  flyovers = sorted(flyovers, key=lambda x: x[0][0].i)
200
  interstices = []
201
  for i in range(len(flyovers)):
202
  if i == 0:
203
  if flyovers[0][0][0].i > 0:
204
+ interstices.append((doc[0 : flyovers[0][0][0].i], 0))
205
  else:
206
+ if flyovers[i][0][0].i > flyovers[i - 1][0][-1].i + 1:
207
+ interstices.append(
208
+ (doc[flyovers[i - 1][0][-1].i + 1 : flyovers[i][0][0].i], 0)
209
+ )
210
  # elif flyovers[i][1] == flyovers[i-1][1]:
211
  # flyovers[i] = (doc[flyovers[i-1][0][0].i:flyovers[i][0][-1].i+1], flyovers[i][1])
212
  # flyovers[i-1] = (doc[flyovers[i-1][0][0].i:flyovers[i-1][0][0].i], flyovers[i-1][1])
213
  if len(flyovers) > 0:
214
  if flyovers[-1][0][-1].i < doc[-1].i:
215
+ interstices.append((doc[flyovers[-1][0][-1].i + 1 :], 0))
216
  else:
217
  interstices.append((doc, 0))
218
  flyovers = [f for f in flyovers if len(f[0]) > 0]
219
  return sorted(flyovers + interstices, key=lambda x: x[0][0].i)
220
 
221
+
222
  from fasthtml_hf import setup_hf_backup
223
  from fasthtml.common import *
224
  import re
 
230
  def index():
231
  page = Div(
232
  Form(hx_post=send, hx_target="#output", hx_swap="outerHTML")(
233
+ Div(
234
+ Span(
235
+ Button("Check"),
236
+ A("How this works", href="/about"),
237
+ style="margin-bottom: 1rem; display: flex; gap: 1rem; align-items: center",
238
+ ),
239
+ Textarea(name="text", style="height: calc(100vh - 11rem)"),
240
+ )
241
+ ),
242
+ Div(
243
+ Div(
244
+ Em(
245
+ "Highlighted text segments can be shortened or reordered to improve readability."
246
+ ),
247
+ cls="overflow-auto",
248
+ style="height: 4rem; text-wrap: balance; padding: 0rem 1rem",
249
  ),
250
+ Div(id="output", style="padding: 1rem; padding-bottom: calc(1rem - 5px)"),
251
+ ),
252
+ cls="grid",
253
+ )
254
+ return Titled("Readability feedback", page)
255
+
256
+
257
+ @app.get
258
+ def about():
259
+ content = Div(
260
+ H2("How this works"),
261
+ P(
262
+ "One of the keys to writing clearly is to ",
263
+ Em("keep related words close together"),
264
+ ". Don't, if you want to be understood, insert any long asides! (See what I did there?) This tool helps you visually identify places in your writing where two related words are interrupted by an aside, which you can then either shorten or move to a different position in the sentence.",
265
+ ),
266
+ P(
267
+ "But how do we identify words in a sentence that are related to each other? We can do this using a technique from natural language processing called ",
268
+ Em("dependency parsing"),
269
+ ". For example, we can take a sentence like ",
270
+ Var("The manager approved the proposal although she had doubts"),
271
+ ", and produce a diagram like the following:",
272
+ ),
273
  Div(
274
+ NotStr(
275
+ open("sample_parse.svg", "r").read(),
276
+ ),
277
+ style="margin-top: 1rem; margin-bottom: 1rem; width: 100%; overflow-x: auto",
278
  ),
279
+ P(
280
+ "This tells us, for example, that ",
281
+ Var("manager"),
282
+ " (or ",
283
+ Var("the manager"),
284
+ ") is the subject of ",
285
+ Var("approved"),
286
+ ' (since she is the "main character" of the event of approving); that ',
287
+ Var("proposal"),
288
+ " (or ",
289
+ Var("the proposal"),
290
+ ") is a complement of ",
291
+ Var("approved"),
292
+ " (since you cannot imagine an act of approving without imagining the thing that is being approved—in this case, the proposal); and that ",
293
+ Var("although"),
294
+ " (or ",
295
+ Var("although she had doubts"),
296
+ ") is a modifier of ",
297
+ Var("approved"),
298
+ " (since it gives us the context of the manager's approval). Naturally, subjects and complements are more closely related to the verb than modifiers are, and so we ignore modifiers when identifying related words that should be kept close together. We can see that in this sentence, the subject and the complement are right next to the verb, and so the sentence is easy to read.",
299
+ ),
300
+ P("Now let us see what happens when we reorder the sentence:"),
301
+ Div(
302
+ NotStr(
303
+ open("sample_parse_2.svg", "r").read(),
304
+ ),
305
+ style="margin-top: 1rem; margin-bottom: 1rem; width: 100%; overflow-x: auto",
306
+ ),
307
+ P(
308
+ "Here, we see that the modifier ",
309
+ Var("although she had doubts"),
310
+ " now interrupts the subject relation between ",
311
+ Var("the manager"),
312
+ " and ",
313
+ Var("approved"),
314
+ ". And indeed, you can see that this sentence is harder to read than the first one.",
315
+ ),
316
+ H2("Technical details"),
317
+ P(
318
+ "The inspiration for this tool is the idea of ",
319
+ Em("Dependency Length Minimisation"),
320
+ " (DLM) in psycholinguistics, which posits that human languages tend to minimise the distance between syntactically related words to reduce cognitive load during sentence processing. For more information on DLM, see ",
321
+ A(
322
+ "Futrell et al. (2015)",
323
+ href="https://pmc.ncbi.nlm.nih.gov/articles/PMC4547262/",
324
+ ),
325
+ ". For evidence that dependency length predicts reading times in English, see e.g. ",
326
+ A(
327
+ "Bartek et al. (2011)",
328
+ href="https://pubmed.ncbi.nlm.nih.gov/21707210/",
329
+ ),
330
+ ".",
331
+ ),
332
+ P(
333
+ "I have used the ",
334
+ A("spaCy", href="https://spacy.io/"),
335
+ " library's ",
336
+ Var("en_core_web_sm"),
337
+ " model to perform dependency parsing, adjusting the results to bring them in line with the ",
338
+ A(
339
+ "Surface Syntactic Universal Dependencies (SUD)",
340
+ href="https://surfacesyntacticud.org",
341
+ ),
342
+ " framework, which is more consistent with linguistic theories. For any dependency arc that exhibits the ",
343
+ Var("subj"),
344
+ " (subject) or ",
345
+ Var("comp"),
346
+ " (complement) relation, the words lying between the head and the dependent are highlighted with an opacity proportional to the number of heads (i.e. words with at least one dependent) in that interval. This follows the revised definition of dependency length proposed by ",
347
+ A(
348
+ "Yadav et al. (2022)",
349
+ href="https://direct.mit.edu/opmi/article/doi/10.1162/opmi_a_00060/112598/A-Reappraisal-of-Dependency-Length-Minimization-as",
350
+ ),
351
+ ".",
352
+ ),
353
+ P(
354
+ "The web app itself was built using the ",
355
+ A("FastHTML", href="https://fasthtml.org/"),
356
+ " framework, which I learned about in the ",
357
+ A("Solve It With Code", href="https://solve.it.com/"),
358
+ " course from ",
359
+ A("Answer.AI", href="https://www.answer.ai/"),
360
+ ". The entire development took two days (starting on December 26, 2025), with an extra day for handling various edge cases. This page was added on January 4, 2026.",
361
+ ),
362
+ A("Back to main page", href="/"),
363
+ style="padding-bottom: 1rem; padding-top: 1rem; max-width: 800px; margin: auto",
364
  )
365
+ return Titled("Readability feedback", content)
366
+
367
 
368
  @app.post
369
+ def send(text: str):
370
  paragraphs = re.sub(r"[^\S\r\n]+", " ", text).split("\r\n\r\n")
371
+ docs = [sudify(nlp(para)) for para in paragraphs]
372
  annot_paras = [get_fluff(doc) for doc in docs]
373
+ return Div(
374
+ *[
375
+ P(
376
+ *[
377
+ Span(
378
+ Span(
379
+ a[0],
380
+ style=f"background: light-dark(rgba(237, 201, 241, {a[1]/5}), rgba(182, 69, 205, {a[1]/5}))",
381
+ ),
382
+ Span(" "),
383
+ )
384
+ for a in annot_para
385
+ ],
386
+ style="margin-bottom: 1.5em",
387
+ )
388
+ for annot_para in annot_paras[:-1]
389
+ ],
390
+ P(
391
+ *[
392
+ Span(
393
+ Span(
394
+ a[0],
395
+ style=f"background: light-dark(rgba(237, 201, 241, {a[1]/5}), rgba(182, 69, 205, {a[1]/5}))",
396
+ ),
397
+ Span(" "),
398
+ )
399
+ for a in annot_paras[-1]
400
+ ],
401
+ style="margin-bottom: 0em",
402
+ ),
403
+ id="output",
404
+ cls="overflow-auto",
405
+ style="height: calc(100vh - 11rem); padding: 1rem; padding-bottom: calc(1rem - 5px)",
406
+ )
407
+
408
 
409
  setup_hf_backup(app)
410
  serve()
sample_parse.svg ADDED
sample_parse_2.svg ADDED