nabin2004 committed on
Commit
af875ad
·
verified ·
1 Parent(s): 269962d

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: SymSpell For Post Processing ASR Applications
3
- emoji: 👀
4
- colorFrom: yellow
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.30.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: SymSpell_for_Post_processing_ASR_applications
3
+ app_file: runed_gradio.py
 
 
4
  sdk: gradio
5
+ sdk_version: 5.29.1
 
 
6
  ---
 
 
data/simplified_dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/simplified_only_names2.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ कठमड$100
2
+ भरतपर$100
3
+ भतपर$100
4
+ ललतपर$100
5
+ पखर$100
6
+ वरटनगर$100
7
+ धरन$100
8
+ बटवल$100
9
+ धनगढ$100
10
+ नपलगज$100
11
+ जनकपर$100
12
+ बरगज$100
13
+ सर्लह$100
14
+ मरङ$100
15
+ रपन्दह$100
16
+ सन्धपल्चक$100
17
+ धदङ$100
18
+ रसव$100
19
+ सन्धल$100
20
+ सककट$100
21
+ सकटर$100
22
+ सकधर$100
23
+ पशपतनथ$100
24
+ सहदरबर$100
25
+ नरयणहट$100
26
+ त्रपरश्वर$100
27
+ बद्ध$100
28
+ कटश्वर$100
data/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.2
2
+ ago==0.1.0
3
+ aiofiles==24.1.0
4
+ annotated-types==0.7.0
5
+ antlr4-python3-runtime==4.8
6
+ anyio==4.9.0
7
+ asttokens==3.0.0
8
+ astunparse==1.6.3
9
+ attrs==25.3.0
10
+ Automat==25.4.16
11
+ beautifulsoup4==4.13.4
12
+ bitarray==3.4.1
13
+ blinker==1.9.0
14
+ boto3==1.38.20
15
+ botocore==1.38.20
16
+ bs4==0.0.2
17
+ certifi==2025.4.26
18
+ cffi==1.17.1
19
+ chardet==3.0.4
20
+ charset-normalizer==3.4.2
21
+ click==8.1.8
22
+ colorama==0.4.6
23
+ constantly==23.10.4
24
+ cryptography==45.0.2
25
+ cssselect==1.3.0
26
+ Cython==3.1.1
27
+ decorator==5.2.1
28
+ defusedxml==0.7.1
29
+ dotmap==1.3.30
30
+ editdistpy==0.1.5
31
+ elasticsearch==7.17.12
32
+ executing==2.2.0
33
+ fairseq==0.12.2
34
+ fastapi==0.115.12
35
+ faust-cchardet==2.1.19
36
+ feedfinder2==0.0.4
37
+ feedparser==6.0.11
38
+ ffmpy==0.5.0
39
+ filelock==3.18.0
40
+ Flask==3.1.1
41
+ flatbuffers==25.2.10
42
+ fsspec==2025.3.2
43
+ gast==0.6.0
44
+ gensim==3.7.3
45
+ google-pasta==0.2.0
46
+ gradio==5.29.1
47
+ gradio_client==1.10.1
48
+ groovy==0.1.2
49
+ grpcio==1.71.0
50
+ gunicorn==23.0.0
51
+ h11==0.16.0
52
+ h5py==3.13.0
53
+ hjson==3.1.0
54
+ httpcore==1.0.9
55
+ httpx==0.28.1
56
+ huggingface-hub==0.31.4
57
+ hurry.filesize==0.9
58
+ hydra-core==1.0.7
59
+ hyperlink==21.0.0
60
+ idna==2.8
61
+ importlib-resources==1.4.0
62
+ incremental==24.7.2
63
+ ipython==9.2.0
64
+ ipython_pygments_lexers==1.1.1
65
+ itemadapter==0.11.0
66
+ itemloaders==1.3.2
67
+ itsdangerous==2.2.0
68
+ jedi==0.19.2
69
+ jieba3k==0.35.1
70
+ Jinja2==3.1.6
71
+ jmespath==1.0.1
72
+ joblib==1.5.0
73
+ keras==3.10.0
74
+ langdetect==1.0.9
75
+ libclang==18.1.1
76
+ lxml==5.4.0
77
+ lxml_html_clean==0.4.2
78
+ Markdown==3.8
79
+ markdown-it-py==3.0.0
80
+ MarkupSafe==3.0.2
81
+ matplotlib-inline==0.1.7
82
+ mdurl==0.1.2
83
+ ml_dtypes==0.5.1
84
+ mpmath==1.3.0
85
+ namex==0.0.9
86
+ Nepali-nlp @ git+https://github.com/nabin2004/Nepali_nlp@67dd261ffacdfe7ec6e9c06c57d4768be2f80628
87
+ nepali-stemmer==0.0.2
88
+ networkx==3.4.2
89
+ news-please==1.6.10
90
+ newspaper3k==0.2.8
91
+ nltk==3.4.5
92
+ numpy==2.1.3
93
+ nvidia-cublas-cu12==12.6.4.1
94
+ nvidia-cuda-cupti-cu12==12.6.80
95
+ nvidia-cuda-nvrtc-cu12==12.6.77
96
+ nvidia-cuda-runtime-cu12==12.6.77
97
+ nvidia-cudnn-cu12==9.5.1.17
98
+ nvidia-cufft-cu12==11.3.0.4
99
+ nvidia-cufile-cu12==1.11.1.6
100
+ nvidia-curand-cu12==10.3.7.77
101
+ nvidia-cusolver-cu12==11.7.1.2
102
+ nvidia-cusparse-cu12==12.5.4.2
103
+ nvidia-cusparselt-cu12==0.6.3
104
+ nvidia-nccl-cu12==2.26.2
105
+ nvidia-nvjitlink-cu12==12.6.85
106
+ nvidia-nvtx-cu12==12.6.77
107
+ omegaconf==2.0.6
108
+ opencv-python==4.11.0.86
109
+ opt_einsum==3.4.0
110
+ optree==0.15.0
111
+ orjson==3.10.18
112
+ packaging==25.0
113
+ pandas==2.2.3
114
+ parsel==1.10.0
115
+ parso==0.8.4
116
+ pexpect==4.9.0
117
+ pillow==11.2.1
118
+ plac==1.4.5
119
+ portalocker==3.1.1
120
+ progressbar2==4.5.0
121
+ prompt_toolkit==3.0.51
122
+ Protego==0.4.0
123
+ protobuf==5.29.4
124
+ psycopg2-binary==2.9.10
125
+ ptyprocess==0.7.0
126
+ pure_eval==0.2.3
127
+ pyasn1==0.6.1
128
+ pyasn1_modules==0.4.2
129
+ pycparser==2.22
130
+ pydantic==2.11.4
131
+ pydantic_core==2.33.2
132
+ PyDispatcher==2.0.7
133
+ pydload==1.0.9
134
+ pydub==0.25.1
135
+ Pygments==2.19.1
136
+ PyMySQL==1.1.1
137
+ pyOpenSSL==25.1.0
138
+ pytesseract==0.3.13
139
+ python-dateutil==2.9.0.post0
140
+ python-multipart==0.0.20
141
+ python-utils==3.9.1
142
+ pytz==2025.2
143
+ PyYAML==6.0.2
144
+ queuelib==1.8.0
145
+ readability-lxml==0.8.4.1
146
+ regex==2024.11.6
147
+ requests==2.32.3
148
+ requests-file==2.1.0
149
+ rich==14.0.0
150
+ ruff==0.11.10
151
+ s3transfer==0.12.0
152
+ sacrebleu==2.5.1
153
+ safehttpx==0.1.6
154
+ safetensors==0.5.3
155
+ scikit-learn==1.6.1
156
+ scipy==1.15.3
157
+ Scrapy==2.13.0
158
+ semantic-version==2.10.0
159
+ sentencepiece==0.2.0
160
+ service-identity==24.2.0
161
+ setuptools==80.8.0
162
+ sgmllib3k==1.0.0
163
+ shellingham==1.5.4
164
+ six==1.17.0
165
+ smart-open==7.1.0
166
+ sniffio==1.3.1
167
+ snowballstemmer==3.0.1
168
+ soupsieve==2.7
169
+ spello==1.2.0
170
+ stack-data==0.6.3
171
+ starlette==0.46.2
172
+ sympy==1.14.0
173
+ symspellpy==6.9.0
174
+ tabulate==0.9.0
175
+ tensorboard==2.19.0
176
+ tensorboard-data-server==0.7.2
177
+ tensorboardX==2.6.2.2
178
+ tensorflow==2.19.0
179
+ termcolor==3.1.0
180
+ threadpoolctl==3.6.0
181
+ tinysegmenter==0.3
182
+ tldextract==5.3.0
183
+ tokenizers==0.21.1
184
+ tomlkit==0.13.2
185
+ torch==2.7.0
186
+ torchaudio==2.7.0
187
+ tqdm==4.67.1
188
+ traitlets==5.14.3
189
+ transformers==4.52.1
190
+ triton==3.3.0
191
+ Twisted==24.11.0
192
+ typer==0.15.4
193
+ typing-inspection==0.4.0
194
+ typing_extensions==4.13.2
195
+ tzdata==2025.2
196
+ urllib3==2.4.0
197
+ uvicorn==0.34.2
198
+ w3lib==2.3.1
199
+ warcio==1.7.5
200
+ wcwidth==0.2.13
201
+ websockets==15.0.1
202
+ Werkzeug==3.1.3
203
+ wget==3.2
204
+ wheel==0.45.1
205
+ wrapt==1.17.2
206
+ zope.interface==7.2
runed_gradio.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from functools import lru_cache
from itertools import product
from typing import List, Tuple, Dict, Set

import gradio as gr
from symspellpy import SymSpell, Verbosity
from nepali_stemmer.stemmer import NepStemmer
7
+
8
+ # ------------------- Utilities -------------------
9
+
10
def simplify_devanagari(text: str) -> str:
    """Reduce Devanagari text to its bare consonant skeleton.

    Strips dependent vowel signs (matras), the nasalization signs and
    nukta, and finally every character outside the Devanagari block, so
    the result can be matched against the simplified dictionary.
    """
    skeleton = re.sub(r'[\u093E-\u094C\u0962\u0963]', '', text)
    skeleton = re.sub(r'[\u0901-\u0903\u093C]', '', skeleton)
    return re.sub(r'[^\u0900-\u097F]', '', skeleton)
15
+
16
def load_vocab(filepath: str) -> Set[str]:
    """Read a vocabulary file (one term per line) into a set.

    Surrounding whitespace is trimmed and blank lines are skipped.
    """
    with open(filepath, "r", encoding="utf-8") as fh:
        trimmed = (raw.strip() for raw in fh)
        return {term for term in trimmed if term}
19
+
20
def load_simplified_map(filepath: str) -> Dict[str, str]:
    """Parse lines shaped like '"original": "simplified",' into a dict.

    Returns a simplified-form -> original-form lookup table. Lines with
    no colon, or with more than one, are silently skipped.
    """
    mapping: Dict[str, str] = {}
    with open(filepath, "r", encoding="utf-8") as fh:
        for raw in fh:
            if ":" not in raw:
                continue
            pieces = raw.strip().strip(",").replace('"', '').split(":")
            if len(pieces) != 2:
                continue
            original, simplified = (piece.strip() for piece in pieces)
            mapping[simplified] = original
    return mapping
31
+
32
def init_spellchecker(dict_path: str, max_edit_distance: int, prefix_length: int) -> SymSpell:
    """Build a SymSpell checker from a '$'-separated frequency dictionary.

    Raises:
        ValueError: if the dictionary file cannot be loaded.
    """
    checker = SymSpell(max_dictionary_edit_distance=max_edit_distance, prefix_length=prefix_length)
    loaded = checker.load_dictionary(dict_path, term_index=0, count_index=1, separator="$")
    if loaded:
        return checker
    raise ValueError("Failed to load dictionary from: " + dict_path)
37
+
38
+ # ------------------- Correction Function -------------------
39
+
40
# Static data files backing the spell checker.
_SIMPLIFIED_ONLY_PATH = "./data/simplified_only_names2.txt"
_SIMPLIFIED_DICT_PATH = "./data/simplified_dict.txt"
_VOCAB_PATH = "./data/vocab.txt"


@lru_cache(maxsize=4)
def _load_resources(max_edit_distance: int, prefix_length: int):
    """Load and memoize the heavyweight correction resources.

    Building the SymSpell index and reading both dictionaries on every
    request (as the original did) is expensive; the cache keys on the
    two parameters that actually affect the index, so repeated calls
    with the same slider settings reuse the loaded state.
    """
    sym_spell = init_spellchecker(_SIMPLIFIED_ONLY_PATH, max_edit_distance, prefix_length)
    simplified_map = load_simplified_map(_SIMPLIFIED_DICT_PATH)
    vocab = load_vocab(_VOCAB_PATH)
    nepstem = NepStemmer()
    return sym_spell, simplified_map, vocab, nepstem


def correct_sentence(
    sentence: str,
    max_edit_distance: int,
    prefix_length: int,
    top_k: int
) -> List[str]:
    """Generate corrected variants of a Nepali sentence.

    Each word already present in the vocabulary passes through
    unchanged. Otherwise the word is stemmed, its base stem is
    simplified to a consonant skeleton, and SymSpell suggestions for
    that skeleton are mapped back to full forms (re-attaching any stem
    suffixes). The cartesian product over per-word candidate lists
    yields the returned sentence variants.

    Args:
        sentence: Input sentence; split on whitespace.
        max_edit_distance: SymSpell maximum edit distance.
        prefix_length: SymSpell prefix length for the index.
        top_k: Maximum number of suggestions kept per word.

    Returns:
        A list of corrected sentence strings (at least one).
    """
    sym_spell, simplified_map, vocab, nepstem = _load_resources(max_edit_distance, prefix_length)

    sentence_options: List[List[str]] = []
    for word in sentence.split():
        if word in vocab:
            sentence_options.append([word])
            continue

        stemmed_tokens = nepstem.stem(word).split()
        if not stemmed_tokens:
            # Guard: the stemmer produced nothing usable — the original
            # indexed [0] unconditionally and would raise IndexError here.
            sentence_options.append([word])
            continue

        base_stem = stemmed_tokens[0]
        simplified = simplify_devanagari(base_stem)

        suggestions = sym_spell.lookup(
            simplified,
            verbosity=Verbosity.ALL,
            max_edit_distance=max_edit_distance,
            include_unknown=False
        )

        if suggestions:
            # Re-attach any suffix tokens the stemmer split off.
            suffix = ''.join(stemmed_tokens[1:])
            correction_list = [
                simplified_map.get(suggestion.term, base_stem) + suffix
                for suggestion in suggestions[:top_k]
            ]
        else:
            correction_list = [word]

        sentence_options.append(correction_list)

    return [' '.join(variant) for variant in product(*sentence_options)]
92
+
93
+ # ------------------- Gradio UI -------------------
94
+
95
# ------------------- Gradio UI -------------------

# Preset demo sentences: [text, max_edit_distance, prefix_length, top_k].
examples = [
    ["भतपरको जिज्ञासु वातावरणले धेरै पर्यटकलाई आकर्षित गर्छ।", 2, 3, 3],
    ["ललतपुर प्राचीन मूर्तिकला र वास्तुकलाको केन्द्र हो।", 2, 3, 3],
]

# Input widgets, in the same order as correct_sentence's parameters.
_input_widgets = [
    gr.Textbox(label="Input Nepali Sentence", lines=2, placeholder="नेपालको समृद्ध इतिहास..."),
    gr.Slider(0, 4, value=2, step=1, label="Max Edit Distance"),
    gr.Slider(1, 5, value=3, step=1, label="Prefix Length"),
    gr.Slider(1, 5, value=3, step=1, label="Top-K Suggestions per Word"),
]

iface = gr.Interface(
    fn=correct_sentence,
    inputs=_input_widgets,
    outputs=gr.Textbox(label="Corrected Sentence Variants"),
    title="Nepali Spell Correction App",
    description="Generates corrected sentence variants using SymSpell and a stemmer.",
    examples=examples,
)

if __name__ == "__main__":
    iface.launch(share=True)