|
|
import gradio as gr |
|
|
import re |
|
|
import json |
|
|
from symspellpy import SymSpell, Verbosity |
|
|
from nepali_stemmer.stemmer import NepStemmer |
|
|
from itertools import product |
|
|
from typing import List, Dict, Set |
|
|
|
|
|
|
|
|
# Data files (relative to the working directory):
#   simplified_only_names2.txt — SymSpell dictionary: one "term$count" entry
#                                per line (see init_spellchecker, separator="$")
#   simplified_dict.txt        — JSON object mapping original name -> simplified
#                                form (inverted by load_simplified_map)
#   vocab.txt                  — newline-delimited list of known-good words
simplified_only_path = "./data/simplified_only_names2.txt"
simplified_dict_path = "./data/simplified_dict.txt"
vocab_path = "./data/vocab.txt"
|
|
|
|
|
|
|
|
|
|
|
def simplify_devanagari(text: str) -> str:
    """Reduce Devanagari text to its bare letter skeleton.

    Strips dependent vowel signs (matras), the vocalic-liquid signs,
    candrabindu/anusvara/visarga and the nukta, then drops every character
    that falls outside the Devanagari Unicode block.
    """
    for pattern in (
        r'[\u093E-\u094C\u0962\u0963]',   # matras + vocalic L/LL signs
        r'[\u0901-\u0903\u093C]',         # candrabindu, anusvara, visarga, nukta
        r'[^\u0900-\u097F]',              # anything outside the Devanagari block
    ):
        text = re.sub(pattern, '', text)
    return text
|
|
|
|
|
def load_vocab(filepath: str) -> Set[str]:
    """Read a newline-delimited vocabulary file into a set of words.

    Surrounding whitespace is stripped from each line; blank lines are
    skipped.
    """
    words: Set[str] = set()
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            token = raw_line.strip()
            if token:
                words.add(token)
    return words
|
|
|
|
|
def save_to_vocab(word: str, filepath: str = vocab_path) -> str:
    """Append *word* to the vocab file unless it is blank or already known.

    Returns a human-readable status message describing what happened.
    """
    candidate = word.strip()

    # Guard: nothing to add.
    if not candidate:
        return "Invalid input. No word added."

    # Guard: already present — avoid duplicate entries.
    if candidate in load_vocab(filepath):
        return f"'{candidate}' already exists in the vocab."

    with open(filepath, "a", encoding="utf-8") as handle:
        handle.write(candidate + "\n")
    return f"'{candidate}' added to vocab."
|
|
|
|
|
def load_simplified_map(filepath: str) -> Dict[str, str]:
    """Load the simplified-name JSON file and return its inversion.

    The file stores original -> simplified; the returned dict maps
    simplified -> original so a simplified lookup recovers the full name.
    If several originals share a simplified form, the last one read wins.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        forward = json.load(handle)

    inverted: Dict[str, str] = {}
    for original, simplified in forward.items():
        inverted[simplified] = original
    return inverted
|
|
|
|
|
def list_locations_as_table(
    simplified_keys_file: str = simplified_only_path,
    simplified_map_file: str = simplified_dict_path
) -> List[List[str]]:
    """Build [simplified, original] rows for the location vocabulary table.

    Each non-blank line of *simplified_keys_file* is a "key$count" entry;
    the key is resolved to its original name through the inverted
    simplified map, falling back to "Unknown" when no mapping exists.
    """
    simplified_map = load_simplified_map(simplified_map_file)

    rows: List[List[str]] = []
    with open(simplified_keys_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry:
                continue
            # Keep only the term; the "$count" suffix is dictionary metadata.
            key, _, _ = entry.partition("$")
            rows.append([key, simplified_map.get(key, "Unknown")])

    return rows
|
|
|
|
|
def init_spellchecker(dict_path: str, max_edit_distance: int, prefix_length: int) -> SymSpell:
    """Create a SymSpell checker backed by the "$"-separated dictionary file.

    Raises:
        ValueError: if the dictionary file cannot be loaded.
    """
    checker = SymSpell(
        max_dictionary_edit_distance=max_edit_distance,
        prefix_length=prefix_length,
    )
    loaded = checker.load_dictionary(
        dict_path, term_index=0, count_index=1, separator="$"
    )
    if not loaded:
        raise ValueError("Failed to load dictionary from: " + dict_path)
    return checker
|
|
|
|
|
|
|
|
|
|
|
def correct_sentence(
    sentence: str,
    max_edit_distance: int,
    prefix_length: int,
    top_k: int
) -> List[List[str]]:
    """Generate spelling-corrected variants of a Nepali sentence.

    Words already present in the vocab pass through unchanged. Every other
    word is stemmed, its base stem reduced to a simplified Devanagari
    skeleton, and that skeleton looked up in the SymSpell dictionary. Up to
    *top_k* suggestions per word (with the original stem suffix re-attached)
    are combined via cartesian product into full sentence variants.

    Args:
        sentence: Raw input sentence; split on whitespace.
        max_edit_distance: SymSpell edit-distance budget for lookups.
        prefix_length: SymSpell prefix length for dictionary indexing.
        top_k: Maximum number of suggestions kept per word.

    Returns:
        One single-element row per variant (shape expected by the
        gr.Dataframe output). Note the variant count can grow as
        top_k ** (number of unknown words).
    """
    # NOTE: checker and maps are rebuilt on every call so that edits to the
    # vocab/dictionary files (and slider changes) take effect immediately.
    sym_spell = init_spellchecker(simplified_only_path, max_edit_distance, prefix_length)
    simplified_map = load_simplified_map(simplified_dict_path)
    vocab = load_vocab(vocab_path)
    nepstem = NepStemmer()

    sentence_options: List[List[str]] = []

    for word in sentence.split():
        # Known-good words need no correction.
        if word in vocab:
            sentence_options.append([word])
            continue

        stemmed_tokens = nepstem.stem(word).split()
        if not stemmed_tokens:
            # Defensive fix: the stemmer can yield nothing usable for some
            # tokens; previously stemmed_tokens[0] raised IndexError here.
            sentence_options.append([word])
            continue

        base_stem = stemmed_tokens[0]
        # Suffix (e.g. case markers) is re-attached to each corrected base.
        suffix = ''.join(stemmed_tokens[1:])
        simplified = simplify_devanagari(base_stem)

        suggestions = sym_spell.lookup(
            simplified,
            verbosity=Verbosity.ALL,
            max_edit_distance=max_edit_distance,
            include_unknown=False
        )

        if suggestions:
            correction_list = [
                # Map the simplified suggestion back to the original name;
                # fall back to the uncorrected stem if no mapping exists.
                simplified_map.get(suggestion.term, base_stem) + suffix
                for suggestion in suggestions[:top_k]
            ]
        else:
            # No candidates at all: keep the word unchanged.
            correction_list = [word]

        sentence_options.append(correction_list)

    # Cartesian product over per-word options -> full sentence variants.
    return [[' '.join(variant)] for variant in product(*sentence_options)]
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI. Two-column layout: the left column holds the input sentence,
# example picker, tuning sliders and the location-vocabulary accordion;
# the right column shows the generated correction variants.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Nepali Spell Correction Tool") as demo:
    gr.Markdown(
        """
        # Nepali Spell Correction Tool

        Enter a Nepali sentence to generate corrected variants. You can also view and manage the location vocabulary.
        """
    )

    # Canned demo sentences (containing misspelled place names) that the
    # dropdown below copies into the input textbox.
    example_sentences = {
        "Example 1": "भतपरको जिज्ञासु वातावरणले धेरै पर्यटकलाई आकर्षित गर्छ।",
        "Example 2": "ललतपुर प्राचीन मूर्तिकला र वास्तुकलाको केन्द्र हो।",
        "Example 3": "पोखराेाै प्रकृतिक सौन्दर्यले भरिपूर्ण शहर हो।"
    }

    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("## Sentence Correction")

            example_dropdown = gr.Dropdown(
                label="Choose Example Sentence",
                choices=list(example_sentences.values()),
                value=list(example_sentences.values())[0],
                interactive=True
            )

            sentence_input = gr.Textbox(
                label="Input Sentence",
                value=list(example_sentences.values())[0],
                placeholder="Enter a Nepali sentence",
                lines=2
            )

            def set_example(example):
                """Return the chosen example so it replaces the input text."""
                return example

            # Selecting an example overwrites the input textbox contents.
            example_dropdown.change(set_example, inputs=[example_dropdown], outputs=[sentence_input])

            # Spell-checker tuning knobs, passed straight to correct_sentence().
            max_dist = gr.Slider(0, 4, value=2, step=1, label="Max Edit Distance")
            prefix_len = gr.Slider(1, 5, value=3, step=1, label="Prefix Length")
            top_k = gr.Slider(1, 5, value=3, step=1, label="Top-K Suggestions")

            submit_btn = gr.Button("Correct Sentence")

            gr.Markdown("## Location Vocabulary Table")

            with gr.Accordion("View or Manage Location Vocabulary", open=False):
                loc_out = gr.Dataframe(
                    headers=["Simplified Form", "Original Name"],
                    datatype=["str", "str"],
                    row_count=5,
                    interactive=False,
                    label="Location Vocabulary"
                )

                view_btn = gr.Button("Show Locations")

        with gr.Column(scale=2):
            gr.Markdown("## Corrected Sentence Variants")

            corrected_out = gr.Dataframe(
                headers=["Corrected Sentence Variants"],
                datatype=["str"],
                row_count=5,
                interactive=False
            )

    # Event wiring: the correct button fills the right-hand variants table;
    # the view button (re)populates the location-vocabulary table.
    submit_btn.click(
        correct_sentence,
        inputs=[sentence_input, max_dist, prefix_len, top_k],
        outputs=corrected_out
    )

    view_btn.click(
        list_locations_as_table,
        inputs=[],
        outputs=loc_out
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server when this file is run as a script.
    demo.launch()
|
|
|