nabin2004 committed on
Commit
af875ad
·
verified ·
1 Parent(s): 269962d

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: SymSpell For Post Processing ASR Applications
3
- emoji: 👀
4
- colorFrom: yellow
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.30.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: SymSpell_for_Post_processing_ASR_applications
3
+ app_file: runed_gradio.py
 
 
4
  sdk: gradio
5
+ sdk_version: 5.29.1
 
 
6
  ---
 
 
data/simplified_dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/simplified_only_names2.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ कठमड$100
2
+ भरतपर$100
3
+ भतपर$100
4
+ ललतपर$100
5
+ पखर$100
6
+ वरटनगर$100
7
+ धरन$100
8
+ बटवल$100
9
+ धनगढ$100
10
+ नपलगज$100
11
+ जनकपर$100
12
+ बरगज$100
13
+ सर्लह$100
14
+ मरङ$100
15
+ रपन्दह$100
16
+ सन्धपल्चक$100
17
+ धदङ$100
18
+ रसव$100
19
+ सन्धल$100
20
+ सककट$100
21
+ सकटर$100
22
+ सकधर$100
23
+ पशपतनथ$100
24
+ सहदरबर$100
25
+ नरयणहट$100
26
+ त्रपरश्वर$100
27
+ बद्ध$100
28
+ कटश्वर$100
data/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.2
2
+ ago==0.1.0
3
+ aiofiles==24.1.0
4
+ annotated-types==0.7.0
5
+ antlr4-python3-runtime==4.8
6
+ anyio==4.9.0
7
+ asttokens==3.0.0
8
+ astunparse==1.6.3
9
+ attrs==25.3.0
10
+ Automat==25.4.16
11
+ beautifulsoup4==4.13.4
12
+ bitarray==3.4.1
13
+ blinker==1.9.0
14
+ boto3==1.38.20
15
+ botocore==1.38.20
16
+ bs4==0.0.2
17
+ certifi==2025.4.26
18
+ cffi==1.17.1
19
+ chardet==3.0.4
20
+ charset-normalizer==3.4.2
21
+ click==8.1.8
22
+ colorama==0.4.6
23
+ constantly==23.10.4
24
+ cryptography==45.0.2
25
+ cssselect==1.3.0
26
+ Cython==3.1.1
27
+ decorator==5.2.1
28
+ defusedxml==0.7.1
29
+ dotmap==1.3.30
30
+ editdistpy==0.1.5
31
+ elasticsearch==7.17.12
32
+ executing==2.2.0
33
+ fairseq==0.12.2
34
+ fastapi==0.115.12
35
+ faust-cchardet==2.1.19
36
+ feedfinder2==0.0.4
37
+ feedparser==6.0.11
38
+ ffmpy==0.5.0
39
+ filelock==3.18.0
40
+ Flask==3.1.1
41
+ flatbuffers==25.2.10
42
+ fsspec==2025.3.2
43
+ gast==0.6.0
44
+ gensim==3.7.3
45
+ google-pasta==0.2.0
46
+ gradio==5.29.1
47
+ gradio_client==1.10.1
48
+ groovy==0.1.2
49
+ grpcio==1.71.0
50
+ gunicorn==23.0.0
51
+ h11==0.16.0
52
+ h5py==3.13.0
53
+ hjson==3.1.0
54
+ httpcore==1.0.9
55
+ httpx==0.28.1
56
+ huggingface-hub==0.31.4
57
+ hurry.filesize==0.9
58
+ hydra-core==1.0.7
59
+ hyperlink==21.0.0
60
+ idna==2.8
61
+ importlib-resources==1.4.0
62
+ incremental==24.7.2
63
+ ipython==9.2.0
64
+ ipython_pygments_lexers==1.1.1
65
+ itemadapter==0.11.0
66
+ itemloaders==1.3.2
67
+ itsdangerous==2.2.0
68
+ jedi==0.19.2
69
+ jieba3k==0.35.1
70
+ Jinja2==3.1.6
71
+ jmespath==1.0.1
72
+ joblib==1.5.0
73
+ keras==3.10.0
74
+ langdetect==1.0.9
75
+ libclang==18.1.1
76
+ lxml==5.4.0
77
+ lxml_html_clean==0.4.2
78
+ Markdown==3.8
79
+ markdown-it-py==3.0.0
80
+ MarkupSafe==3.0.2
81
+ matplotlib-inline==0.1.7
82
+ mdurl==0.1.2
83
+ ml_dtypes==0.5.1
84
+ mpmath==1.3.0
85
+ namex==0.0.9
86
+ Nepali-nlp @ git+https://github.com/nabin2004/Nepali_nlp@67dd261ffacdfe7ec6e9c06c57d4768be2f80628
87
+ nepali-stemmer==0.0.2
88
+ networkx==3.4.2
89
+ news-please==1.6.10
90
+ newspaper3k==0.2.8
91
+ nltk==3.4.5
92
+ numpy==2.1.3
93
+ nvidia-cublas-cu12==12.6.4.1
94
+ nvidia-cuda-cupti-cu12==12.6.80
95
+ nvidia-cuda-nvrtc-cu12==12.6.77
96
+ nvidia-cuda-runtime-cu12==12.6.77
97
+ nvidia-cudnn-cu12==9.5.1.17
98
+ nvidia-cufft-cu12==11.3.0.4
99
+ nvidia-cufile-cu12==1.11.1.6
100
+ nvidia-curand-cu12==10.3.7.77
101
+ nvidia-cusolver-cu12==11.7.1.2
102
+ nvidia-cusparse-cu12==12.5.4.2
103
+ nvidia-cusparselt-cu12==0.6.3
104
+ nvidia-nccl-cu12==2.26.2
105
+ nvidia-nvjitlink-cu12==12.6.85
106
+ nvidia-nvtx-cu12==12.6.77
107
+ omegaconf==2.0.6
108
+ opencv-python==4.11.0.86
109
+ opt_einsum==3.4.0
110
+ optree==0.15.0
111
+ orjson==3.10.18
112
+ packaging==25.0
113
+ pandas==2.2.3
114
+ parsel==1.10.0
115
+ parso==0.8.4
116
+ pexpect==4.9.0
117
+ pillow==11.2.1
118
+ plac==1.4.5
119
+ portalocker==3.1.1
120
+ progressbar2==4.5.0
121
+ prompt_toolkit==3.0.51
122
+ Protego==0.4.0
123
+ protobuf==5.29.4
124
+ psycopg2-binary==2.9.10
125
+ ptyprocess==0.7.0
126
+ pure_eval==0.2.3
127
+ pyasn1==0.6.1
128
+ pyasn1_modules==0.4.2
129
+ pycparser==2.22
130
+ pydantic==2.11.4
131
+ pydantic_core==2.33.2
132
+ PyDispatcher==2.0.7
133
+ pydload==1.0.9
134
+ pydub==0.25.1
135
+ Pygments==2.19.1
136
+ PyMySQL==1.1.1
137
+ pyOpenSSL==25.1.0
138
+ pytesseract==0.3.13
139
+ python-dateutil==2.9.0.post0
140
+ python-multipart==0.0.20
141
+ python-utils==3.9.1
142
+ pytz==2025.2
143
+ PyYAML==6.0.2
144
+ queuelib==1.8.0
145
+ readability-lxml==0.8.4.1
146
+ regex==2024.11.6
147
+ requests==2.32.3
148
+ requests-file==2.1.0
149
+ rich==14.0.0
150
+ ruff==0.11.10
151
+ s3transfer==0.12.0
152
+ sacrebleu==2.5.1
153
+ safehttpx==0.1.6
154
+ safetensors==0.5.3
155
+ scikit-learn==1.6.1
156
+ scipy==1.15.3
157
+ Scrapy==2.13.0
158
+ semantic-version==2.10.0
159
+ sentencepiece==0.2.0
160
+ service-identity==24.2.0
161
+ setuptools==80.8.0
162
+ sgmllib3k==1.0.0
163
+ shellingham==1.5.4
164
+ six==1.17.0
165
+ smart-open==7.1.0
166
+ sniffio==1.3.1
167
+ snowballstemmer==3.0.1
168
+ soupsieve==2.7
169
+ spello==1.2.0
170
+ stack-data==0.6.3
171
+ starlette==0.46.2
172
+ sympy==1.14.0
173
+ symspellpy==6.9.0
174
+ tabulate==0.9.0
175
+ tensorboard==2.19.0
176
+ tensorboard-data-server==0.7.2
177
+ tensorboardX==2.6.2.2
178
+ tensorflow==2.19.0
179
+ termcolor==3.1.0
180
+ threadpoolctl==3.6.0
181
+ tinysegmenter==0.3
182
+ tldextract==5.3.0
183
+ tokenizers==0.21.1
184
+ tomlkit==0.13.2
185
+ torch==2.7.0
186
+ torchaudio==2.7.0
187
+ tqdm==4.67.1
188
+ traitlets==5.14.3
189
+ transformers==4.52.1
190
+ triton==3.3.0
191
+ Twisted==24.11.0
192
+ typer==0.15.4
193
+ typing-inspection==0.4.0
194
+ typing_extensions==4.13.2
195
+ tzdata==2025.2
196
+ urllib3==2.4.0
197
+ uvicorn==0.34.2
198
+ w3lib==2.3.1
199
+ warcio==1.7.5
200
+ wcwidth==0.2.13
201
+ websockets==15.0.1
202
+ Werkzeug==3.1.3
203
+ wget==3.2
204
+ wheel==0.45.1
205
+ wrapt==1.17.2
206
+ zope.interface==7.2
runed_gradio.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from functools import lru_cache
from itertools import product
from typing import List, Tuple, Dict, Set

import gradio as gr
from symspellpy import SymSpell, Verbosity
from nepali_stemmer.stemmer import NepStemmer
7
+
8
+ # ------------------- Utilities -------------------
9
+
10
def simplify_devanagari(text: str) -> str:
    """Reduce Devanagari text to its bare consonant skeleton.

    Strips dependent vowel signs (matras), the nasalization signs and
    nukta, and finally every character outside the Devanagari block, so
    the result can be matched against the simplified dictionary.
    """
    skeleton = re.sub(r'[\u093E-\u094C\u0962\u0963]', '', text)
    skeleton = re.sub(r'[\u0901-\u0903\u093C]', '', skeleton)
    return re.sub(r'[^\u0900-\u097F]', '', skeleton)
15
+
16
def load_vocab(filepath: str) -> Set[str]:
    """Read a vocabulary file (one term per line) into a set.

    Surrounding whitespace is trimmed and blank lines are skipped.
    """
    with open(filepath, "r", encoding="utf-8") as fh:
        trimmed = (raw.strip() for raw in fh)
        return {term for term in trimmed if term}
19
+
20
def load_simplified_map(filepath: str) -> Dict[str, str]:
    """Parse lines shaped like '"original": "simplified",' into a dict.

    Returns a simplified-form -> original-form lookup table. Lines with
    no colon, or with more than one, are silently skipped.
    """
    mapping: Dict[str, str] = {}
    with open(filepath, "r", encoding="utf-8") as fh:
        for raw in fh:
            if ":" not in raw:
                continue
            pieces = raw.strip().strip(",").replace('"', '').split(":")
            if len(pieces) != 2:
                continue
            original, simplified = (piece.strip() for piece in pieces)
            mapping[simplified] = original
    return mapping
31
+
32
def init_spellchecker(dict_path: str, max_edit_distance: int, prefix_length: int) -> SymSpell:
    """Build a SymSpell checker from a '$'-separated frequency dictionary.

    Raises:
        ValueError: if the dictionary file cannot be loaded.
    """
    checker = SymSpell(max_dictionary_edit_distance=max_edit_distance, prefix_length=prefix_length)
    loaded = checker.load_dictionary(dict_path, term_index=0, count_index=1, separator="$")
    if loaded:
        return checker
    raise ValueError("Failed to load dictionary from: " + dict_path)
37
+
38
+ # ------------------- Correction Function -------------------
39
+
40
# Static data files backing the spell checker.
_SIMPLIFIED_ONLY_PATH = "./data/simplified_only_names2.txt"
_SIMPLIFIED_DICT_PATH = "./data/simplified_dict.txt"
_VOCAB_PATH = "./data/vocab.txt"


@lru_cache(maxsize=4)
def _load_resources(max_edit_distance: int, prefix_length: int):
    """Load and memoize the heavyweight correction resources.

    Building the SymSpell index and reading both dictionaries on every
    request (as the original did) is expensive; the cache keys on the
    two parameters that actually affect the index, so repeated calls
    with the same slider settings reuse the loaded state.
    """
    sym_spell = init_spellchecker(_SIMPLIFIED_ONLY_PATH, max_edit_distance, prefix_length)
    simplified_map = load_simplified_map(_SIMPLIFIED_DICT_PATH)
    vocab = load_vocab(_VOCAB_PATH)
    nepstem = NepStemmer()
    return sym_spell, simplified_map, vocab, nepstem


def correct_sentence(
    sentence: str,
    max_edit_distance: int,
    prefix_length: int,
    top_k: int
) -> List[str]:
    """Generate corrected variants of a Nepali sentence.

    Each word already present in the vocabulary passes through
    unchanged. Otherwise the word is stemmed, its base stem is
    simplified to a consonant skeleton, and SymSpell suggestions for
    that skeleton are mapped back to full forms (re-attaching any stem
    suffixes). The cartesian product over per-word candidate lists
    yields the returned sentence variants.

    Args:
        sentence: Input sentence; split on whitespace.
        max_edit_distance: SymSpell maximum edit distance.
        prefix_length: SymSpell prefix length for the index.
        top_k: Maximum number of suggestions kept per word.

    Returns:
        A list of corrected sentence strings (at least one).
    """
    sym_spell, simplified_map, vocab, nepstem = _load_resources(max_edit_distance, prefix_length)

    sentence_options: List[List[str]] = []
    for word in sentence.split():
        if word in vocab:
            sentence_options.append([word])
            continue

        stemmed_tokens = nepstem.stem(word).split()
        if not stemmed_tokens:
            # Guard: the stemmer produced nothing usable — the original
            # indexed [0] unconditionally and would raise IndexError here.
            sentence_options.append([word])
            continue

        base_stem = stemmed_tokens[0]
        simplified = simplify_devanagari(base_stem)

        suggestions = sym_spell.lookup(
            simplified,
            verbosity=Verbosity.ALL,
            max_edit_distance=max_edit_distance,
            include_unknown=False
        )

        if suggestions:
            # Re-attach any suffix tokens the stemmer split off.
            suffix = ''.join(stemmed_tokens[1:])
            correction_list = [
                simplified_map.get(suggestion.term, base_stem) + suffix
                for suggestion in suggestions[:top_k]
            ]
        else:
            correction_list = [word]

        sentence_options.append(correction_list)

    return [' '.join(variant) for variant in product(*sentence_options)]
92
+
93
+ # ------------------- Gradio UI -------------------
94
+
95
# ------------------- Gradio UI -------------------

# Preset demo sentences: [text, max_edit_distance, prefix_length, top_k].
examples = [
    ["भतपरको जिज्ञासु वातावरणले धेरै पर्यटकलाई आकर्षित गर्छ।", 2, 3, 3],
    ["ललतपुर प्राचीन मूर्तिकला र वास्तुकलाको केन्द्र हो।", 2, 3, 3],
]

# Input widgets, in the same order as correct_sentence's parameters.
_input_widgets = [
    gr.Textbox(label="Input Nepali Sentence", lines=2, placeholder="नेपालको समृद्ध इतिहास..."),
    gr.Slider(0, 4, value=2, step=1, label="Max Edit Distance"),
    gr.Slider(1, 5, value=3, step=1, label="Prefix Length"),
    gr.Slider(1, 5, value=3, step=1, label="Top-K Suggestions per Word"),
]

iface = gr.Interface(
    fn=correct_sentence,
    inputs=_input_widgets,
    outputs=gr.Textbox(label="Corrected Sentence Variants"),
    title="Nepali Spell Correction App",
    description="Generates corrected sentence variants using SymSpell and a stemmer.",
    examples=examples,
)

if __name__ == "__main__":
    iface.launch(share=True)