= committed on
Commit
24aa1c5
·
1 Parent(s): 82c3ebe
Files changed (2) hide show
  1. app.py +302 -0
  2. language.py +614 -0
app.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Gradio demo for the multilingual token-classification language ID model."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from collections import Counter, defaultdict
7
+ from functools import lru_cache
8
+ from typing import Any
9
+
10
+ import pandas as pd
11
+ import gradio as gr
12
+ from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
13
+
14
+ from language import ALL_LANGS, LANG_ISO2_TO_ISO3
15
+
16
+
17
# Hugging Face checkpoint for the token-classification language-ID model.
MODEL_CHECKPOINT = "DerivedFunction/lang-ner-xlmr"
# Hard cap on input size fed to the pipeline — a *character* count, not tokens.
MAX_TEXT_CHARS = 512
19
+
20
+
21
@lru_cache(maxsize=1)
def get_pipeline():
    """Build the token-classification pipeline once and reuse it on later calls."""
    # Model loads before tokenizer, matching keyword-argument evaluation order.
    return pipeline(
        "token-classification",
        model=AutoModelForTokenClassification.from_pretrained(MODEL_CHECKPOINT),
        tokenizer=AutoTokenizer.from_pretrained(MODEL_CHECKPOINT),
        aggregation_strategy="simple",
    )
31
+
32
+
33
def normalize_label(label: str) -> str:
    """Return *label* lowercased, with any leading BIO tag ("B-"/"I-") removed."""
    for bio_prefix in ("B-", "I-"):
        if label.startswith(bio_prefix):
            return label[len(bio_prefix):].lower()
    return label.lower()
37
+
38
+
39
def predict(text: str) -> tuple[str, pd.DataFrame, dict[str, Any]]:
    """Run the language-ID pipeline over *text* and build the three UI outputs.

    Returns:
        summary: HTML card describing the dominant language signal (or a
            placeholder / "no spans" state).
        spans: DataFrame of per-span predictions (token, language, score,
            start, end), sorted by character offsets.
        raw: JSON-serializable payload with the raw pipeline entities.
    """
    text = (text or "").strip()
    if not text:
        # Blank input: placeholder card, empty table, empty raw payload.
        empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
        return (
            "<div class='empty-state'>Paste some text to see the model's language signal.</div>",
            empty,
            {},
        )

    nlp = get_pipeline()
    # Character (not token) cap to bound latency on long pastes.
    entities = nlp(text[:MAX_TEXT_CHARS])

    rows: list[dict[str, Any]] = []
    token_counts: Counter[str] = Counter()  # spans seen per language label
    token_scores: defaultdict[str, float] = defaultdict(float)  # summed confidence per label

    for entity in entities:
        # Aggregated pipelines report "entity_group"; fall back to "entity".
        label = normalize_label(entity.get("entity_group", entity.get("entity", "O")))
        if label == "o":
            # Skip the BIO "outside" tag.
            continue
        token_counts[label] += 1
        token_scores[label] += float(entity.get("score", 0.0))
        rows.append(
            {
                "token": entity.get("word", ""),
                "language": label,
                "score": round(float(entity.get("score", 0.0)), 4),
                "start": entity.get("start", None),
                "end": entity.get("end", None),
            }
        )

    spans = pd.DataFrame(rows, columns=["token", "language", "score", "start", "end"])
    # Sort by character position; entities without offsets sink to the bottom.
    spans = spans.sort_values(by=["start", "end"], na_position="last") if not spans.empty else spans

    if token_counts:
        # Dominant language = label with the most spans; average its summed confidence.
        dominant_lang, dominant_count = token_counts.most_common(1)[0]
        avg_score = token_scores[dominant_lang] / max(dominant_count, 1)
        iso3 = LANG_ISO2_TO_ISO3.get(dominant_lang, "n/a")
        chips = "".join(
            f"<span class='chip'>{lang.upper()} <strong>{count}</strong></span>"
            for lang, count in token_counts.most_common(5)
        )
        summary = f"""
        <div class="summary-card">
            <div class="summary-kicker">Prediction</div>
            <div class="summary-main">{dominant_lang.upper()}</div>
            <div class="summary-subtitle">ISO-3: {iso3} | analyzed tokens: {len(rows)}</div>
            <div class="summary-score">Avg confidence: {avg_score:.3f}</div>
            <div class="chip-row">{chips}</div>
        </div>
        """
    else:
        summary = """
        <div class="summary-card">
            <div class="summary-kicker">Prediction</div>
            <div class="summary-main">No language spans detected</div>
            <div class="summary-subtitle">Try a longer sample or a cleaner single-language paragraph.</div>
        </div>
        """

    raw = {
        "model": MODEL_CHECKPOINT,
        "languages_supported": len(ALL_LANGS),
        "top_predictions": token_counts.most_common(10),
        "entities": entities,
    }
    return summary, spans, raw
108
+
109
+
110
# Demo inputs covering single-language samples (English, Spanish, Arabic,
# French, Japanese) and one mixed English+Arabic sentence.
EXAMPLES = [
    "This model should recognize English text without much trouble.",
    "Hola, este ejemplo mezcla palabras en espanol para probar el detector.",
    "هذا مثال باللغة العربية لاختبار النموذج على فقرة قصيرة.",
    "Bonjour, ceci est un petit texte en francais pour un test rapide.",
    "今日は日本語の文章を入力して、モデルの反応を確認します。",
    "This sentence mixes English and العربية to show mixed-language behavior.",
]
118
+
119
+
120
# Stylesheet for the Blocks UI: dark gradient background, glassy hero card,
# summary card, language chips, and the empty-state placeholder.
CSS = """
:root {
  --bg-1: #06111f;
  --bg-2: #0b1f33;
  --card: rgba(10, 20, 33, 0.72);
  --card-border: rgba(255, 255, 255, 0.12);
  --text: #f4f7fb;
  --muted: #b7c3d6;
  --accent: #7dd3fc;
  --accent-2: #f59e0b;
}
body {
  background:
    radial-gradient(circle at top left, rgba(125, 211, 252, 0.22), transparent 28%),
    radial-gradient(circle at top right, rgba(245, 158, 11, 0.16), transparent 24%),
    linear-gradient(135deg, var(--bg-1), var(--bg-2));
}
.wrap {
  max-width: 1180px;
  margin: 0 auto;
}
.hero {
  padding: 28px 28px 22px;
  border: 1px solid var(--card-border);
  border-radius: 24px;
  background: linear-gradient(180deg, rgba(255,255,255,0.08), rgba(255,255,255,0.03));
  box-shadow: 0 24px 80px rgba(0, 0, 0, 0.28);
  backdrop-filter: blur(14px);
}
.eyebrow {
  text-transform: uppercase;
  letter-spacing: 0.22em;
  color: var(--accent);
  font-size: 12px;
  font-weight: 700;
  margin-bottom: 8px;
}
.title {
  font-size: clamp(32px, 5vw, 56px);
  line-height: 1.02;
  margin: 0;
  color: var(--text);
  font-weight: 800;
}
.subtitle {
  margin-top: 12px;
  color: var(--muted);
  font-size: 16px;
  max-width: 820px;
}
.summary-card {
  border: 1px solid var(--card-border);
  border-radius: 22px;
  padding: 22px;
  background: rgba(7, 13, 24, 0.7);
  color: var(--text);
  min-height: 220px;
}
.summary-kicker {
  color: var(--accent);
  text-transform: uppercase;
  letter-spacing: 0.18em;
  font-size: 11px;
  font-weight: 700;
}
.summary-main {
  font-size: 42px;
  font-weight: 900;
  margin-top: 8px;
  color: white;
}
.summary-subtitle, .summary-score {
  color: var(--muted);
  margin-top: 8px;
}
.chip-row {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  margin-top: 18px;
}
.chip {
  border: 1px solid rgba(125, 211, 252, 0.25);
  background: rgba(125, 211, 252, 0.08);
  color: var(--text);
  padding: 7px 10px;
  border-radius: 999px;
  font-size: 13px;
}
.empty-state {
  padding: 18px 20px;
  border-radius: 18px;
  border: 1px dashed rgba(255,255,255,0.16);
  color: var(--muted);
  background: rgba(255,255,255,0.03);
}
.gradio-container .gr-textbox textarea {
  font-size: 15px !important;
}
.footer-note {
  color: var(--muted);
  font-size: 13px;
  margin-top: 8px;
}
"""
225
+
226
+
227
# Top-level UI layout: hero header, input column (textbox, buttons, examples),
# output column (summary HTML, span table, raw JSON), event wiring, footer.
with gr.Blocks(title="Polyglot Tagger Studio", css=CSS) as demo:
    # Hero/header card.
    gr.HTML(
        """
        <div class="wrap hero">
            <div class="eyebrow">Multilingual Language ID</div>
            <h1 class="title">Polyglot Tagger Studio</h1>
            <div class="subtitle">
                A Gradio demo for the token-classification model behind this repo. Paste a sentence or paragraph,
                and the app will surface the dominant language signal, token-level spans, and raw predictions.
            </div>
        </div>
        """
    )

    with gr.Row():
        # Left column: input text, actions, canned examples.
        with gr.Column(scale=6):
            input_text = gr.Textbox(
                label="Text",
                lines=12,
                placeholder="Paste a sentence or a short paragraph here...",
                value=EXAMPLES[0],
            )
            gr.Markdown(
                "Try a clean sentence for a single-language prediction, or mix languages to see how the model behaves."
            )
            with gr.Row():
                analyze_btn = gr.Button("Analyze", variant="primary")
                clear_btn = gr.Button("Clear")
            gr.Examples(
                examples=[[example] for example in EXAMPLES],
                inputs=input_text,
                label="Examples",
                cache_examples=False,
            )
        # Right column: the three outputs produced by predict().
        with gr.Column(scale=6):
            summary = gr.HTML()
            spans = gr.Dataframe(
                headers=["token", "language", "score", "start", "end"],
                datatype=["str", "str", "number", "number", "number"],
                label="Token-level spans",
                interactive=False,
                wrap=True,
            )
            raw = gr.JSON(label="Raw output")

    # Button click and textbox submit both run the same predictor
    # (exposed under two distinct API names).
    analyze_btn.click(
        fn=predict,
        inputs=input_text,
        outputs=[summary, spans, raw],
        api_name="analyze",
    )
    input_text.submit(
        fn=predict,
        inputs=input_text,
        outputs=[summary, spans, raw],
        api_name="analyze_text",
    )
    # Reset all three outputs: empty HTML, empty (but schema-correct) table, empty JSON.
    clear_btn.click(
        fn=lambda: ("", pd.DataFrame(columns=["token", "language", "score", "start", "end"]), {}),
        inputs=None,
        outputs=[summary, spans, raw],
        api_name="clear",
    )

    # NOTE(review): the "60" below is hard-coded while len(ALL_LANGS) is
    # available at runtime — confirm the two stay in sync.
    gr.HTML(
        """
        <div class="footer-note">
            Supported model languages: 60. The demo uses the local repo checkpoint and the ISO-2 to ISO-3 mapping in language.py.
        </div>
        """
    )
298
+
299
+
300
if __name__ == "__main__":
    # Enable request queuing before starting the local Gradio server.
    demo.queue()
    demo.launch()
language.py ADDED
@@ -0,0 +1,614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+
7
+ from source_config import LANGUAGE_BUCKETS
8
+
9
# Per-group views derived from the bucket config in source_config.
LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
# Groups whose config flags them as Latin-script.
LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}

# Canonical language file lives next to this module; it holds the
# ISO-2 -> ISO-3 mapping (a JSON object, not a list).
LANGS_JSON = Path(__file__).with_name("all_langs.json")

# Read from file to load the key-value pairs. JSON key order is canonical.
LANG_ISO2_TO_ISO3 = {}
with open(LANGS_JSON) as f:
    # Parse the file as a json string
    LANG_ISO2_TO_ISO3 = json.load(f)

# ISO-2 codes in canonical (file) order.
ALL_LANGS = list(LANG_ISO2_TO_ISO3.keys())
# Reverse index: language code -> the bucket/group it belongs to.
LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
24
+
25
+
26
def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
    """Write the canonical language file to *path* if it is missing.

    The on-disk format is the ISO-2 -> ISO-3 mapping — the shape the
    module-level loader expects (it calls ``.keys()`` on the parsed JSON).
    ALL_LANGS is recoverable as the mapping's keys. Writing the bare
    ALL_LANGS list here (as this function previously did) would break the
    next import of this module.

    Args:
        path: Destination file; defaults to the module-level LANGS_JSON.
    """
    path = Path(path)
    if path.exists():
        # Never clobber an existing canonical file.
        return
    with path.open("w", encoding="utf-8") as f:
        json.dump(LANG_ISO2_TO_ISO3, f, ensure_ascii=False, indent=2)
33
+
34
+
35
def load_all_langs(path: str | os.PathLike[str] = LANGS_JSON) -> list[str]:
    """Load ALL_LANGS from JSON, falling back to the in-repo constant.

    Accepts either on-disk shape: a plain JSON list of ISO-2 codes, or the
    canonical ISO-2 -> ISO-3 mapping (the shape the module-level loader
    reads), in which case the language codes are the mapping's keys.
    Previously a dict file was silently discarded in favor of the fallback.

    Args:
        path: Source file; defaults to the module-level LANGS_JSON.

    Returns:
        The list of ISO-2 language codes; a copy of ALL_LANGS if the file
        is missing or holds an unexpected shape (the file is then created).
    """
    path = Path(path)
    if path.exists():
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list) and all(isinstance(lang, str) for lang in data):
            return data
        if isinstance(data, dict):
            # Canonical mapping: JSON key order is the canonical language order.
            return [str(lang) for lang in data]
    write_all_langs_json(path)
    return ALL_LANGS[:]
45
+
46
# Lowercase English stop words, apostrophes stripped ("dont", "youre"), plus a
# few web-ish terms ("app", "com", "download", "website").
# NOTE(review): no consumer is visible in this module — presumably used to
# damp English token signal; confirm at call sites.
ENGLISH_STOP_WORDS = [
    "able",
    "about",
    "above",
    "abroad",
    "according",
    "accordingly",
    "across",
    "actually",
    "after",
    "afterwards",
    "again",
    "against",
    "ago",
    "ahead",
    "aint",
    "all",
    "allow",
    "almost",
    "alone",
    "along",
    "alongside",
    "already",
    "also",
    "although",
    "always",
    "am",
    "amid",
    "amidst",
    "among",
    "amongst",
    "an",
    "and",
    "another",
    "any",
    "anybody",
    "anyhow",
    "anyone",
    "anything",
    "anyway",
    "anyways",
    "anywhere",
    "apart",
    "appear",
    "appreciate",
    "appropriate",
    "app",
    "are",
    "arent",
    "aren",
    "around",
    "as",
    "aside",
    "ask",
    "asking",
    "associated",
    "at",
    "available",
    "away",
    "awfully",
    "back",
    "backward",
    "be",
    "became",
    "because",
    "become",
    "becoming",
    "been",
    "before",
    "beforehand",
    "begin",
    "behind",
    "being",
    "believe",
    "below",
    "beside",
    "best",
    "better",
    "between",
    "beyond",
    "both",
    "brief",
    "but",
    "by",
    "came",
    "can",
    "cannot",
    "cant",
    "caption",
    "cause",
    "certain",
    "certainly",
    "changes",
    "clearly",
    "cmon",
    "com",
    "come",
    "concerning",
    "consequently",
    "consider",
    "considering",
    "contain",
    "containing",
    "corresponding",
    "could",
    "couldnt",
    "course",
    "currently",
    "definitely",
    "described",
    "despite",
    "did",
    "didnt",
    "different",
    "directly",
    "do",
    "does",
    "doesnt",
    "doing",
    "done",
    "dont",
    "down",
    "downward",
    "download",
    "during",
    "each",
    "eight",
    "eighty",
    "either",
    "else",
    "elsewhere",
    "end",
    "ending",
    "enough",
    "entirely",
    "especially",
    "etc",
    "even",
    "ever",
    "evermore",
    "every",
    "everybody",
    "everyone",
    "everything",
    "everywhere",
    "exactly",
    "example",
    "except",
    "fairly",
    "far",
    "farther",
    "few",
    "fewer",
    "fifth",
    "first",
    "five",
    "followed",
    "following",
    "follows",
    "for",
    "forever",
    "former",
    "formerly",
    "forth",
    "forward",
    "found",
    "four",
    "from",
    "free",
    "further",
    "furthermore",
    "get",
    "gets",
    "getting",
    "given",
    "gives",
    "go",
    "goes",
    "going",
    "gone",
    "got",
    "gotten",
    "greetings",
    "had",
    "hadnt",
    "half",
    "happens",
    "hardly",
    "has",
    "hasnt",
    "have",
    "havent",
    "having",
    "he",
    "hed",
    "hell",
    "hello",
    "help",
    "hence",
    "her",
    "here",
    "hereafter",
    "hereby",
    "herein",
    "hereupon",
    "herself",
    "hi",
    "him",
    "himself",
    "his",
    "hither",
    "hopefully",
    "how",
    "howbeit",
    "however",
    "hundred",
    "id",
    "ie",
    "if",
    "ignored",
    "ill",
    "im",
    "immediate",
    "in",
    "inasmuch",
    "inc",
    "indeed",
    "indicate",
    "indicated",
    "inner",
    "inside",
    "insofar",
    "instead",
    "into",
    "inward",
    "is",
    "isnt",
    "it",
    "itd",
    "itll",
    "itself",
    "ive",
    "just",
    "keep",
    "keeps",
    "kept",
    "know",
    "known",
    "last",
    "lately",
    "later",
    "latter",
    "least",
    "less",
    "lest",
    "let",
    "like",
    "liked",
    "likely",
    "likewise",
    "little",
    "look",
    "looking",
    "low",
    "lower",
    "ltd",
    "made",
    "mainly",
    "make",
    "many",
    "may",
    "maybe",
    "maynt",
    "me",
    "mean",
    "meantime",
    "meanwhile",
    "merely",
    "might",
    "mightnt",
    "mine",
    "minus",
    "miss",
    "more",
    "moreover",
    "most",
    "mostly",
    "much",
    "must",
    "mustnt",
    "my",
    "myself",
    "name",
    "namely",
    "near",
    "nearly",
    "necessary",
    "need",
    "neednt",
    "neither",
    "never",
    "neverless",
    "nevertheless",
    "new",
    "next",
    "nine",
    "ninety",
    "no",
    "nobody",
    "non",
    "none",
    "nonetheless",
    "noone",
    "no-one",
    "nor",
    "normally",
    "not",
    "nothing",
    "notwithstanding",
    "novel",
    "now",
    "nowhere",
    "obviously",
    "of",
    "off",
    "often",
    "oh",
    "ok",
    "okay",
    "old",
    "on",
    "once",
    "one",
    "only",
    "onto",
    "opposite",
    "or",
    "other",
    "otherwise",
    "ought",
    "oughtnt",
    "our",
    "ourselves",
    "out",
    "outside",
    "over",
    "overall",
    "own",
    "particular",
    "particularly",
    "past",
    "per",
    "perhaps",
    "placed",
    "please",
    "plus",
    "possible",
    "presumably",
    "probably",
    "provided",
    "provide",
    "quite",
    "rather",
    "really",
    "reasonably",
    "recent",
    "recently",
    "regarding",
    "regardless",
    "regards",
    "relatively",
    "respectively",
    "right",
    "round",
    "said",
    "same",
    "saw",
    "say",
    "saying",
    "second",
    "secondly",
    "see",
    "seeing",
    "seem",
    "seemed",
    "seeming",
    "seems",
    "seen",
    "self",
    "sensible",
    "sent",
    "serious",
    "seriously",
    "seven",
    "several",
    "shall",
    "shant",
    "she",
    "shed",
    "shell",
    "should",
    "shouldnt",
    "since",
    "six",
    "so",
    "some",
    "somebody",
    "someday",
    "somehow",
    "someone",
    "something",
    "sometime",
    "somewhat",
    "somewhere",
    "soon",
    "sorry",
    "specified",
    "specify",
    "specifying",
    "still",
    "such",
    "sure",
    "take",
    "taken",
    "taking",
    "tell",
    "tends",
    "ten",
    "than",
    "thank",
    "that",
    "thatll",
    "thatve",
    "the",
    "their",
    "them",
    "themselves",
    "then",
    "thence",
    "there",
    "thereafter",
    "thereby",
    "thered",
    "therefore",
    "therein",
    "therell",
    "therere",
    "thereupon",
    "thereve",
    "these",
    "they",
    "theyd",
    "theyll",
    "theyre",
    "theyve",
    "thing",
    "think",
    "third",
    "thirty",
    "this",
    "thorough",
    "thoroughly",
    "those",
    "though",
    "three",
    "through",
    "throughout",
    "thru",
    "thus",
    "till",
    "to",
    "together",
    "too",
    "took",
    "toward",
    "tried",
    "tries",
    "truly",
    "try",
    "trying",
    "twice",
    "two",
    "under",
    "underneath",
    "undoing",
    "unfortunately",
    "unless",
    "unlike",
    "unlikely",
    "until",
    "unto",
    "up",
    "upon",
    "upwards",
    "use",
    "used",
    "useful",
    "using",
    "usually",
    "value",
    "various",
    "versus",
    "very",
    "via",
    "viz",
    "want",
    "was",
    "wasnt",
    "way",
    "we",
    "wed",
    "welcome",
    "well",
    "went",
    "were",
    "werent",
    "weve",
    "what",
    "whatever",
    "whatll",
    "whatve",
    "when",
    "whence",
    "whenever",
    "where",
    "whereafter",
    "whereas",
    "whereby",
    "wherein",
    "whereupon",
    "wherever",
    "whether",
    "which",
    "whichever",
    "while",
    "whilst",
    "whither",
    "who",
    "whod",
    "whoever",
    "whole",
    "wholl",
    "whom",
    "whomever",
    "whose",
    "why",
    "will",
    "willing",
    "wish",
    "with",
    "within",
    "without",
    "wonder",
    "wont",
    "would",
    "wouldnt",
    "website",
    "yes",
    "yet",
    "you",
    "youd",
    "youll",
    "your",
    "youre",
    "yourself",
    "yourselves",
    "youve",
    "zero",
]