# nabin2004's picture
# Upload folder using huggingface_hub
# 539887b verified
import gradio as gr
import re
import json
from symspellpy import SymSpell, Verbosity
from nepali_stemmer.stemmer import NepStemmer
from itertools import product
from typing import List, Dict, Set
# ------------------- Paths -------------------
# Dictionary file loaded into SymSpell: one entry per line with "$"-separated
# fields (term first, count second). Its first field also supplies the keys
# shown in the location table.
simplified_only_path = "./data/simplified_only_names2.txt"
# JSON object file; load_simplified_map() reads it and swaps keys and values.
simplified_dict_path = "./data/simplified_dict.txt"
# Plain-text word list, one entry per line. Words found here are passed
# through correct_sentence() unchanged.
vocab_path = "./data/vocab.txt"
# ------------------- Utilities -------------------
def simplify_devanagari(text: str) -> str:
    """Reduce Devanagari text to a simplified lookup form.

    Three removal passes are applied in order:

    1. dependent vowel signs (U+093E-U+094C, U+0962, U+0963),
    2. candrabindu, anusvara, visarga (U+0901-U+0903) and nukta (U+093C),
    3. every character outside the Devanagari block (U+0900-U+097F),
       which includes spaces, ASCII and digits.

    Args:
        text: Input string.

    Returns:
        The string with all of the above characters deleted. The virama
        (U+094D) is not targeted by any pass and is kept.
    """
    strip_patterns = (
        r'[\u093E-\u094C\u0962\u0963]',
        r'[\u0901-\u0903\u093C]',
        r'[^\u0900-\u097F]',
    )
    result = text
    for pattern in strip_patterns:
        result = re.sub(pattern, '', result)
    return result
def load_vocab(filepath: str) -> Set[str]:
    """Read a UTF-8 word list into a set.

    Args:
        filepath: Path of a text file with one entry per line.

    Returns:
        The set of whitespace-stripped lines; blank lines are skipped and
        duplicates collapse.
    """
    entries: Set[str] = set()
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            token = raw_line.strip()
            if token:
                entries.add(token)
    return entries
def save_to_vocab(word: str, filepath: str = vocab_path) -> str:
    """Append a word to the vocabulary file unless it is already present.

    Args:
        word: Candidate word; surrounding whitespace is stripped first.
        filepath: Vocabulary file to update (one entry per line).

    Returns:
        A human-readable status message describing what happened.
    """
    word = word.strip()
    if not word:
        return "Invalid input. No word added."
    if word in load_vocab(filepath):
        return f"'{word}' already exists in the vocab."

    # If the last existing entry is not newline-terminated, a plain append
    # would glue the new word onto it and corrupt both entries, so check the
    # final byte and emit a separator when needed.
    with open(filepath, "rb") as existing:
        tail = existing.read()[-1:]
    needs_separator = tail not in (b"", b"\n")

    with open(filepath, "a", encoding="utf-8") as f:
        if needs_separator:
            f.write("\n")
        f.write(word + "\n")
    return f"'{word}' added to vocab."
def load_simplified_map(filepath: str) -> Dict[str, str]:
    """Load a JSON object from disk and return it with keys and values swapped.

    Args:
        filepath: Path of a UTF-8 JSON file containing a single object.

    Returns:
        A dict mapping each value of the JSON object to its key. When several
        keys share one value, the key that appears last in the file wins.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        forward = json.loads(handle.read())
    inverted: Dict[str, str] = {}
    for key, value in forward.items():
        inverted[value] = key
    return inverted
def list_locations_as_table(
    simplified_keys_file: str = simplified_only_path,
    simplified_map_file: str = simplified_dict_path
) -> List[List[str]]:
    """Build the rows of the location table.

    Args:
        simplified_keys_file: Text file whose non-blank lines carry a key as
            the first "$"-separated field.
        simplified_map_file: JSON file passed to load_simplified_map() to
            resolve each key.

    Returns:
        One ``[key, mapped_name]`` row per non-blank line, in file order;
        keys missing from the map are paired with "Unknown".
    """
    lookup = load_simplified_map(simplified_map_file)
    with open(simplified_keys_file, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # Only the text before the first "$" is the key; the rest is ignored.
        keys = [entry.split("$")[0] for entry in stripped_lines if entry]
    return [[key, lookup.get(key, "Unknown")] for key in keys]
def init_spellchecker(dict_path: str, max_edit_distance: int, prefix_length: int) -> SymSpell:
    """Create a SymSpell instance and load a "$"-separated dictionary into it.

    Args:
        dict_path: Dictionary file with the term in field 0 and the count in
            field 1 of each line.
        max_edit_distance: Passed as ``max_dictionary_edit_distance``.
        prefix_length: Passed as ``prefix_length``.

    Returns:
        The SymSpell instance with the dictionary loaded.

    Raises:
        ValueError: If ``load_dictionary`` reports failure.
    """
    checker = SymSpell(
        max_dictionary_edit_distance=max_edit_distance,
        prefix_length=prefix_length,
    )
    loaded = checker.load_dictionary(dict_path, term_index=0, count_index=1, separator="$")
    if loaded:
        return checker
    raise ValueError("Failed to load dictionary from: " + dict_path)
# ------------------- Correction Function -------------------
def correct_sentence(
    sentence: str,
    max_edit_distance: int,
    prefix_length: int,
    top_k: int
) -> List[List[str]]:
    """Generate corrected variants of a whitespace-tokenised sentence.

    Each token found in the vocabulary file is kept as-is. Every other token
    is stemmed, its first stem token is simplified with
    simplify_devanagari(), and the result is looked up in SymSpell. Up to
    ``top_k`` suggestions are mapped back through the simplified map and
    re-joined with the remaining stem tokens. The output is the Cartesian
    product of the per-token candidates, so its size is the product of the
    per-token candidate counts.

    Args:
        sentence: Input text; split on whitespace.
        max_edit_distance: Maximum edit distance for dictionary and lookup.
        prefix_length: SymSpell prefix length (raised if too small, see below).
        top_k: Maximum number of suggestions kept per token.

    Returns:
        A list of single-element rows, one row per sentence variant.

    Raises:
        ValueError: Propagated from init_spellchecker() when the dictionary
            cannot be loaded.
    """
    # Gradio documents slider values as floats; slicing needs a real int.
    max_edit_distance = int(max_edit_distance)
    prefix_length = int(prefix_length)
    top_k = int(top_k)
    # NOTE(review): symspellpy's constructor is understood to reject
    # prefix_length <= max_dictionary_edit_distance with a ValueError, and the
    # UI sliders allow such combinations. Clamp instead of failing the request.
    prefix_length = max(prefix_length, max_edit_distance + 1)

    sym_spell = init_spellchecker(simplified_only_path, max_edit_distance, prefix_length)
    simplified_map = load_simplified_map(simplified_dict_path)
    vocab = load_vocab(vocab_path)
    nepstem = NepStemmer()

    sentence_options = []
    for word in sentence.split():
        if word in vocab:
            sentence_options.append([word])
            continue

        # Presumably the stemmer returns the stem followed by space-separated
        # suffix tokens; an empty result would otherwise raise IndexError.
        stemmed_tokens = nepstem.stem(word).split()
        if not stemmed_tokens:
            sentence_options.append([word])
            continue

        base_stem, *suffix_tokens = stemmed_tokens
        suffix = ''.join(suffix_tokens)
        simplified = simplify_devanagari(base_stem)
        if not simplified:
            # Nothing left to look up (e.g. a token with no Devanagari
            # consonants); keep the token rather than query an empty key.
            sentence_options.append([word])
            continue

        suggestions = sym_spell.lookup(
            simplified,
            verbosity=Verbosity.ALL,
            max_edit_distance=max_edit_distance,
            include_unknown=False
        )
        candidates = [
            simplified_map.get(suggestion.term, base_stem) + suffix
            for suggestion in suggestions[:top_k]
        ]
        # Several suggestions can fall back to the same base stem; drop
        # repeats (keeping order) so they do not multiply identical variants.
        unique_candidates = list(dict.fromkeys(candidates))
        sentence_options.append(unique_candidates if suggestions else [word])

    return [[' '.join(variant)] for variant in product(*sentence_options)]
# ------------------- Gradio UI -------------------
# Build the Gradio interface. Components are created in display order, so the
# statement order and nesting below define the page layout.
with gr.Blocks(title="Nepali Spell Correction Tool") as demo:
    # Page header. The literal's content is kept flush-left so the string
    # value is exactly what the source contains.
    gr.Markdown(
        """
# Nepali Spell Correction Tool
Enter a Nepali sentence to generate corrected variants. You can also view and manage the location vocabulary.
"""
    )
    # Sample inputs for the dropdown; only the values are used below.
    # NOTE(review): these appear to contain deliberately misspelled place
    # names to demonstrate correction - confirm with the author.
    example_sentences = {
        "Example 1": "भतपरको जिज्ञासु वातावरणले धेरै पर्यटकलाई आकर्षित गर्छ।",
        "Example 2": "ललतपुर प्राचीन मूर्तिकला र वास्तुकलाको केन्द्र हो।",
        "Example 3": "पोखराेाै प्रकृतिक सौन्दर्यले भरिपूर्ण शहर हो।"
    }
    with gr.Row():
        # Left column: input controls and the location vocabulary viewer.
        with gr.Column(scale=3):
            gr.Markdown("## Sentence Correction")
            example_dropdown = gr.Dropdown(
                label="Choose Example Sentence",
                choices=list(example_sentences.values()),
                value=list(example_sentences.values())[0],
                interactive=True
            )
            sentence_input = gr.Textbox(
                label="Input Sentence",
                value=list(example_sentences.values())[0],
                placeholder="Enter a Nepali sentence",
                lines=2
            )

            # Identity callback: copies the selected example into the textbox.
            def set_example(example):
                return example

            example_dropdown.change(set_example, inputs=[example_dropdown], outputs=[sentence_input])
            # These three sliders are passed to correct_sentence() as
            # max_edit_distance, prefix_length and top_k respectively.
            max_dist = gr.Slider(0, 4, value=2, step=1, label="Max Edit Distance")
            prefix_len = gr.Slider(1, 5, value=3, step=1, label="Prefix Length")
            top_k = gr.Slider(1, 5, value=3, step=1, label="Top-K Suggestions")
            submit_btn = gr.Button("Correct Sentence")
            gr.Markdown("## Location Vocabulary Table")
            # Collapsed by default; the table is filled when view_btn is clicked.
            with gr.Accordion("View or Manage Location Vocabulary", open=False):
                loc_out = gr.Dataframe(
                    headers=["Simplified Form", "Original Name"],
                    datatype=["str", "str"],
                    row_count=5,
                    interactive=False,
                    label="Location Vocabulary"
                )
                view_btn = gr.Button("Show Locations")
                # Uncomment below to enable adding new locations
                # new_loc = gr.Textbox(label="Add New Place", placeholder="e.g., काठमाडौँ")
                # add_btn = gr.Button("Add Location")
                # add_msg = gr.Textbox(label="Status", interactive=False)
        # Right column: one row per corrected sentence variant.
        with gr.Column(scale=2):
            gr.Markdown("## Corrected Sentence Variants")
            corrected_out = gr.Dataframe(
                headers=["Corrected Sentence Variants"],
                datatype=["str"],
                row_count=5,
                interactive=False
            )
    # Event wiring: the correction button feeds the sentence and slider values
    # to correct_sentence(); the view button loads the location table.
    submit_btn.click(
        correct_sentence,
        inputs=[sentence_input, max_dist, prefix_len, top_k],
        outputs=corrected_out
    )
    view_btn.click(
        list_locations_as_table,
        inputs=[],
        outputs=loc_out
    )
    # add_btn.click(save_to_vocab, inputs=new_loc, outputs=add_msg)
# ------------------- Launch App -------------------
# Start the server only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()