import gradio as gr
import re
import json
from symspellpy import SymSpell, Verbosity
from nepali_stemmer.stemmer import NepStemmer
from itertools import product
from typing import List, Dict, Set

# ------------------- Paths -------------------
simplified_only_path = "./data/simplified_only_names2.txt"
simplified_dict_path = "./data/simplified_dict.txt"
vocab_path = "./data/vocab.txt"

# ------------------- Utilities -------------------

# Pre-compiled patterns, hoisted so they are built once rather than on every
# call (simplify_devanagari runs once per word of every submitted sentence).
# The two deletion passes of the original were merged into one character
# class: deleting characters from independent classes is order-insensitive.
#   \u093E-\u094C  dependent vowel signs (matras)
#   \u0962-\u0963  vocalic R/L signs
#   \u0901-\u0903  candrabindu / anusvara / visarga
#   \u093C         nukta
_MARKS_RE = re.compile(r'[\u093E-\u094C\u0962\u0963\u0901-\u0903\u093C]')
_NON_DEVANAGARI_RE = re.compile(r'[^\u0900-\u097F]')


def simplify_devanagari(text: str) -> str:
    """Reduce a Devanagari string to a bare simplified key.

    Strips dependent vowel signs, nasalization marks and the nukta, then
    removes every character outside the Devanagari Unicode block, so that
    differently-spelled variants of a name collapse to the same key.
    """
    cleaned = _MARKS_RE.sub('', text)
    return _NON_DEVANAGARI_RE.sub('', cleaned)


def load_vocab(filepath: str) -> Set[str]:
    """Return the set of non-empty, stripped lines of *filepath*."""
    with open(filepath, "r", encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}


def save_to_vocab(word: str, filepath: str = vocab_path) -> str:
    """Append *word* to the vocab file unless it is blank or already present.

    Returns a human-readable status message describing the outcome.
    """
    word = word.strip()
    if not word:
        return "Invalid input. No word added."
    vocab = load_vocab(filepath)
    if word in vocab:
        return f"'{word}' already exists in the vocab."
    with open(filepath, "a", encoding="utf-8") as f:
        f.write(word + "\n")
    return f"'{word}' added to vocab."
def load_simplified_map(filepath: str) -> Dict[str, str]:
    """Load the JSON map of original name -> simplified key and invert it.

    Returns a dict mapping simplified key -> original name.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    return {v: k for k, v in data.items()}


def list_locations_as_table(
    simplified_keys_file: str = simplified_only_path,
    simplified_map_file: str = simplified_dict_path
) -> List[List[str]]:
    """Build [simplified key, original name] rows for the vocabulary table."""
    simplified_map = load_simplified_map(simplified_map_file)
    keys = []
    with open(simplified_keys_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Dictionary lines look like "<key>$<count>"; keep only the key.
            keys.append(line.split("$")[0])
    return [[key, simplified_map.get(key, "Unknown")] for key in keys]


def init_spellchecker(dict_path: str, max_edit_distance: int, prefix_length: int) -> SymSpell:
    """Create a SymSpell instance from a '$'-separated frequency dictionary.

    Raises:
        ValueError: if the dictionary file cannot be loaded.
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=max_edit_distance,
                         prefix_length=prefix_length)
    if not sym_spell.load_dictionary(dict_path, term_index=0, count_index=1,
                                     separator="$"):
        raise ValueError("Failed to load dictionary from: " + dict_path)
    return sym_spell


# ------------------- Correction Function -------------------
def correct_sentence(
    sentence: str,
    max_edit_distance: int,
    prefix_length: int,
    top_k: int
) -> List[List[str]]:
    """Generate corrected variants of *sentence* as single-column table rows.

    Words already in the vocab pass through unchanged. Every other word is
    stemmed, its base stem is simplified and looked up with SymSpell, and up
    to *top_k* candidate corrections are collected. The cartesian product of
    the per-word candidate lists yields the sentence variants.
    """
    # Gradio sliders may deliver floats; SymSpell expects integer distances.
    max_edit_distance = int(max_edit_distance)
    prefix_length = int(prefix_length)
    top_k = int(top_k)

    sym_spell = init_spellchecker(simplified_only_path, max_edit_distance, prefix_length)
    simplified_map = load_simplified_map(simplified_dict_path)
    vocab = load_vocab(vocab_path)
    nepstem = NepStemmer()

    sentence_options = []
    for word in sentence.split():
        if word in vocab:
            sentence_options.append([word])
            continue
        stemmed_tokens = nepstem.stem(word).split()
        if not stemmed_tokens:
            # The stemmer produced nothing (e.g. bare punctuation); keep the
            # word rather than crashing on stemmed_tokens[0].
            sentence_options.append([word])
            continue
        base_stem = stemmed_tokens[0]
        simplified = simplify_devanagari(base_stem)
        if not simplified:
            # Non-Devanagari token: looking up "" would spuriously match any
            # short dictionary term within the edit distance, so skip it.
            sentence_options.append([word])
            continue
        suggestions = sym_spell.lookup(
            simplified,
            verbosity=Verbosity.ALL,
            max_edit_distance=max_edit_distance,
            include_unknown=False
        )
        if suggestions:
            correction_list = []
            for suggestion in suggestions[:top_k]:
                corrected_base = simplified_map.get(suggestion.term, base_stem)
                # Re-attach any suffix tokens the stemmer split off
                # (''.join([]) is '' when there is no suffix).
                correction_list.append(corrected_base + ''.join(stemmed_tokens[1:]))
        else:
            correction_list = [word]
        sentence_options.append(correction_list)

    corrected_variants = [' '.join(variant) for variant in product(*sentence_options)]
    return [[variant] for variant in corrected_variants]


# ------------------- Gradio UI -------------------
with gr.Blocks(title="Nepali Spell Correction Tool") as demo:
    gr.Markdown(
        """
        # Nepali Spell Correction Tool
        Enter a Nepali sentence to generate corrected variants.
        You can also view and manage the location vocabulary.
        """
    )

    example_sentences = {
        "Example 1": "भतपरको जिज्ञासु वातावरणले धेरै पर्यटकलाई आकर्षित गर्छ।",
        "Example 2": "ललतपुर प्राचीन मूर्तिकला र वास्तुकलाको केन्द्र हो।",
        "Example 3": "पोखराेाै प्रकृतिक सौन्दर्यले भरिपूर्ण शहर हो।"
    }
    # Shared default for both the dropdown and the input textbox.
    default_example = list(example_sentences.values())[0]

    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("## Sentence Correction")
            example_dropdown = gr.Dropdown(
                label="Choose Example Sentence",
                choices=list(example_sentences.values()),
                value=default_example,
                interactive=True
            )
            sentence_input = gr.Textbox(
                label="Input Sentence",
                value=default_example,
                placeholder="Enter a Nepali sentence",
                lines=2
            )

            def set_example(example):
                # Copy the chosen example verbatim into the input textbox.
                return example

            example_dropdown.change(set_example,
                                    inputs=[example_dropdown],
                                    outputs=[sentence_input])

            max_dist = gr.Slider(0, 4, value=2, step=1, label="Max Edit Distance")
            prefix_len = gr.Slider(1, 5, value=3, step=1, label="Prefix Length")
            top_k = gr.Slider(1, 5, value=3, step=1, label="Top-K Suggestions")
            submit_btn = gr.Button("Correct Sentence")

            gr.Markdown("## Location Vocabulary Table")
            with gr.Accordion("View or Manage Location Vocabulary", open=False):
                loc_out = gr.Dataframe(
                    headers=["Simplified Form", "Original Name"],
                    datatype=["str", "str"],
                    row_count=5,
                    interactive=False,
                    label="Location Vocabulary"
                )
                view_btn = gr.Button("Show Locations")
                # Uncomment below to enable adding new locations
                # new_loc = gr.Textbox(label="Add New Place", placeholder="e.g., काठमाडौँ")
                # add_btn = gr.Button("Add Location")
                # add_msg = gr.Textbox(label="Status", interactive=False)

        with gr.Column(scale=2):
            gr.Markdown("## Corrected Sentence Variants")
            corrected_out = gr.Dataframe(
                headers=["Corrected Sentence Variants"],
                datatype=["str"],
                row_count=5,
                interactive=False
            )

    submit_btn.click(
        correct_sentence,
        inputs=[sentence_input, max_dist, prefix_len, top_k],
        outputs=corrected_out
    )
    view_btn.click(
        list_locations_as_table,
        inputs=[],
        outputs=loc_out
    )
    # add_btn.click(save_to_vocab, inputs=new_loc, outputs=add_msg)

# ------------------- Launch App -------------------
if __name__ == "__main__":
    demo.launch()