|
|
import gradio as gr |
|
|
import re |
|
|
import json |
|
|
from symspellpy import SymSpell, Verbosity |
|
|
from nepali_stemmer.stemmer import NepStemmer |
|
|
from itertools import product |
|
|
from typing import List, Dict, Set |
|
|
|
|
|
|
|
|
# Data files (relative to the working directory):
#   simplified_only_names2.txt — SymSpell dictionary: one "term$count" entry
#                                per line (see init_spellchecker, separator="$")
#   simplified_dict.txt        — JSON object mapping original name -> simplified
#                                form (inverted by load_simplified_map)
#   vocab.txt                  — newline-delimited list of known-good words
simplified_only_path = "./data/simplified_only_names2.txt"
simplified_dict_path = "./data/simplified_dict.txt"
vocab_path = "./data/vocab.txt"
|
|
|
|
|
|
|
|
|
|
|
def simplify_devanagari(text: str) -> str:
    """Reduce Devanagari text to its bare letter skeleton.

    Strips dependent vowel signs (matras), the vocalic-liquid signs,
    candrabindu/anusvara/visarga and the nukta, then drops every character
    that falls outside the Devanagari Unicode block.
    """
    for pattern in (
        r'[\u093E-\u094C\u0962\u0963]',   # matras + vocalic L/LL signs
        r'[\u0901-\u0903\u093C]',         # candrabindu, anusvara, visarga, nukta
        r'[^\u0900-\u097F]',              # anything outside the Devanagari block
    ):
        text = re.sub(pattern, '', text)
    return text
|
|
|
|
|
def load_vocab(filepath: str) -> Set[str]:
    """Read a newline-delimited vocabulary file into a set of words.

    Surrounding whitespace is stripped from each line; blank lines are
    skipped.
    """
    words: Set[str] = set()
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            token = raw_line.strip()
            if token:
                words.add(token)
    return words
|
|
|
|
|
def save_to_vocab(word: str, filepath: str = vocab_path) -> str:
    """Append *word* to the vocab file unless it is blank or already known.

    Returns a human-readable status message describing what happened.
    """
    candidate = word.strip()

    # Guard: nothing to add.
    if not candidate:
        return "Invalid input. No word added."

    # Guard: already present — avoid duplicate entries.
    if candidate in load_vocab(filepath):
        return f"'{candidate}' already exists in the vocab."

    with open(filepath, "a", encoding="utf-8") as handle:
        handle.write(candidate + "\n")
    return f"'{candidate}' added to vocab."
|
|
|
|
|
def load_simplified_map(filepath: str) -> Dict[str, str]:
    """Load the simplified-name JSON file and return its inversion.

    The file stores original -> simplified; the returned dict maps
    simplified -> original so a simplified lookup recovers the full name.
    If several originals share a simplified form, the last one read wins.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        forward = json.load(handle)

    inverted: Dict[str, str] = {}
    for original, simplified in forward.items():
        inverted[simplified] = original
    return inverted
|
|
|
|
|
def list_locations_as_table(
    simplified_keys_file: str = simplified_only_path,
    simplified_map_file: str = simplified_dict_path
) -> List[List[str]]:
    """Build [simplified, original] rows for the location vocabulary table.

    Each non-blank line of *simplified_keys_file* is a "key$count" entry;
    the key is resolved to its original name through the inverted
    simplified map, falling back to "Unknown" when no mapping exists.
    """
    simplified_map = load_simplified_map(simplified_map_file)

    rows: List[List[str]] = []
    with open(simplified_keys_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry:
                continue
            # Keep only the term; the "$count" suffix is dictionary metadata.
            key, _, _ = entry.partition("$")
            rows.append([key, simplified_map.get(key, "Unknown")])

    return rows
|
|
|
|
|
def init_spellchecker(dict_path: str, max_edit_distance: int, prefix_length: int) -> SymSpell:
    """Create a SymSpell checker backed by the "$"-separated dictionary file.

    Raises:
        ValueError: if the dictionary file cannot be loaded.
    """
    checker = SymSpell(
        max_dictionary_edit_distance=max_edit_distance,
        prefix_length=prefix_length,
    )
    loaded = checker.load_dictionary(
        dict_path, term_index=0, count_index=1, separator="$"
    )
    if not loaded:
        raise ValueError("Failed to load dictionary from: " + dict_path)
    return checker
|
|
|
|
|
|
|
|
|
|
|
def correct_sentence(
    sentence: str,
    max_edit_distance: int,
    prefix_length: int,
    top_k: int
) -> List[List[str]]:
    """Generate spelling-corrected variants of a Nepali sentence.

    Words already present in the vocab pass through unchanged. Every other
    word is stemmed, its base stem reduced to a simplified Devanagari
    skeleton, and that skeleton looked up in the SymSpell dictionary. Up to
    *top_k* suggestions per word (with the original stem suffix re-attached)
    are combined via cartesian product into full sentence variants.

    Args:
        sentence: Raw input sentence; split on whitespace.
        max_edit_distance: SymSpell edit-distance budget for lookups.
        prefix_length: SymSpell prefix length for dictionary indexing.
        top_k: Maximum number of suggestions kept per word.

    Returns:
        One single-element row per variant (shape expected by the
        gr.Dataframe output). Note the variant count can grow as
        top_k ** (number of unknown words).
    """
    # NOTE: checker and maps are rebuilt on every call so that edits to the
    # vocab/dictionary files (and slider changes) take effect immediately.
    sym_spell = init_spellchecker(simplified_only_path, max_edit_distance, prefix_length)
    simplified_map = load_simplified_map(simplified_dict_path)
    vocab = load_vocab(vocab_path)
    nepstem = NepStemmer()

    sentence_options: List[List[str]] = []

    for word in sentence.split():
        # Known-good words need no correction.
        if word in vocab:
            sentence_options.append([word])
            continue

        stemmed_tokens = nepstem.stem(word).split()
        if not stemmed_tokens:
            # Defensive fix: the stemmer can yield nothing usable for some
            # tokens; previously stemmed_tokens[0] raised IndexError here.
            sentence_options.append([word])
            continue

        base_stem = stemmed_tokens[0]
        # Suffix (e.g. case markers) is re-attached to each corrected base.
        suffix = ''.join(stemmed_tokens[1:])
        simplified = simplify_devanagari(base_stem)

        suggestions = sym_spell.lookup(
            simplified,
            verbosity=Verbosity.ALL,
            max_edit_distance=max_edit_distance,
            include_unknown=False
        )

        if suggestions:
            correction_list = [
                # Map the simplified suggestion back to the original name;
                # fall back to the uncorrected stem if no mapping exists.
                simplified_map.get(suggestion.term, base_stem) + suffix
                for suggestion in suggestions[:top_k]
            ]
        else:
            # No candidates at all: keep the word unchanged.
            correction_list = [word]

        sentence_options.append(correction_list)

    # Cartesian product over per-word options -> full sentence variants.
    return [[' '.join(variant)] for variant in product(*sentence_options)]
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI. Two-column layout: the left column holds the input sentence,
# example picker, tuning sliders and the location-vocabulary accordion;
# the right column shows the generated correction variants.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Nepali Spell Correction Tool") as demo:
    gr.Markdown(
        """
        # Nepali Spell Correction Tool

        Enter a Nepali sentence to generate corrected variants. You can also view and manage the location vocabulary.
        """
    )

    # Canned demo sentences (containing misspelled place names) that the
    # dropdown below copies into the input textbox.
    example_sentences = {
        "Example 1": "भतपरको जिज्ञासु वातावरणले धेरै पर्यटकलाई आकर्षित गर्छ।",
        "Example 2": "ललतपुर प्राचीन मूर्तिकला र वास्तुकलाको केन्द्र हो।",
        "Example 3": "पोखराेाै प्रकृतिक सौन्दर्यले भरिपूर्ण शहर हो।"
    }

    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("## Sentence Correction")

            example_dropdown = gr.Dropdown(
                label="Choose Example Sentence",
                choices=list(example_sentences.values()),
                value=list(example_sentences.values())[0],
                interactive=True
            )

            sentence_input = gr.Textbox(
                label="Input Sentence",
                value=list(example_sentences.values())[0],
                placeholder="Enter a Nepali sentence",
                lines=2
            )

            def set_example(example):
                """Return the chosen example so it replaces the input text."""
                return example

            # Selecting an example overwrites the input textbox contents.
            example_dropdown.change(set_example, inputs=[example_dropdown], outputs=[sentence_input])

            # Spell-checker tuning knobs, passed straight to correct_sentence().
            max_dist = gr.Slider(0, 4, value=2, step=1, label="Max Edit Distance")
            prefix_len = gr.Slider(1, 5, value=3, step=1, label="Prefix Length")
            top_k = gr.Slider(1, 5, value=3, step=1, label="Top-K Suggestions")

            submit_btn = gr.Button("Correct Sentence")

            gr.Markdown("## Location Vocabulary Table")

            with gr.Accordion("View or Manage Location Vocabulary", open=False):
                loc_out = gr.Dataframe(
                    headers=["Simplified Form", "Original Name"],
                    datatype=["str", "str"],
                    row_count=5,
                    interactive=False,
                    label="Location Vocabulary"
                )

                view_btn = gr.Button("Show Locations")

        with gr.Column(scale=2):
            gr.Markdown("## Corrected Sentence Variants")

            corrected_out = gr.Dataframe(
                headers=["Corrected Sentence Variants"],
                datatype=["str"],
                row_count=5,
                interactive=False
            )

    # Event wiring: the correct button fills the right-hand variants table;
    # the view button (re)populates the location-vocabulary table.
    submit_btn.click(
        correct_sentence,
        inputs=[sentence_input, max_dist, prefix_len, top_k],
        outputs=corrected_out
    )

    view_btn.click(
        list_locations_as_table,
        inputs=[],
        outputs=loc_out
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server when this file is run as a script.
    demo.launch()
|
|
|