|
|
import gradio as gr |
|
|
from collections import defaultdict |
|
|
from typing import List, Dict, Set, Tuple |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
|
|
|
class MultilingualSpellChecker:
    """Dictionary-based spell checker that keeps one vocabulary per language.

    Words are read from plain-text corpus files (one word per line) and are
    additionally indexed by their first character, so a suggestion lookup
    normally only scans the words that start with the same character as the
    query.

    Attributes:
        vocab: Maps a language name to the set of words loaded for it.
        char_to_words: Maps a language name to a mapping from a first
            character to the set of loaded words starting with it.
        current_language: Initialised to ``None``; no method of this class
            reads or writes it afterwards.
    """

    # Edit distance -> label used to group suggestions. Any distance that is
    # not listed here falls into the "Very Low Probability" group.
    _DISTANCE_LABELS = {
        1: "High Probability",
        2: "Medium Probability",
        3: "Low Probability",
    }
    _FALLBACK_LABEL = "Very Low Probability"

    def __init__(self):
        self.vocab = defaultdict(set)
        self.char_to_words = defaultdict(lambda: defaultdict(set))
        self.current_language = None

    def load_corpus(self, language: str, file_path: str):
        """Load a one-word-per-line corpus file into the vocabulary of *language*.

        Surrounding whitespace is stripped from every line and blank lines
        are skipped. Loading is best-effort: when the file cannot be opened
        or decoded, an error message is printed and the method returns
        normally, so a missing corpus does not abort the caller. Words read
        before a mid-file failure stay loaded.

        Args:
            language: Key under which the words are stored.
            file_path: Path of the UTF-8 encoded corpus file.
        """
        try:
            # "utf-8-sig" decodes plain UTF-8 as well, but drops a leading
            # byte-order mark that would otherwise be glued to the first word
            # (str.strip() does not remove U+FEFF).
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                words = self.vocab[language]
                by_first_char = self.char_to_words[language]
                for line in f:
                    word = line.strip()
                    if word:
                        words.add(word)
                        by_first_char[word[0]].add(word)
            print(f"Loaded {len(self.vocab[language])} words for {language}")
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: undecodable bytes
            # (UnicodeDecodeError) or an invalid path string.
            print(f"Error loading corpus for {language}: {e}")

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Return the Levenshtein (edit) distance between *s1* and *s2*.

        The distance is the minimum number of single-character insertions,
        deletions and substitutions needed to turn one string into the other.
        Only two rows of the dynamic-programming table are kept at a time.
        """
        # Keep the shorter string in s2 so the rows are as short as possible.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1, start=1):
            current_row = [i]
            for j, c2 in enumerate(s2):
                current_row.append(min(
                    previous_row[j + 1] + 1,       # insertion
                    current_row[j] + 1,            # deletion
                    previous_row[j] + (c1 != c2),  # substitution (free on match)
                ))
            previous_row = current_row

        return previous_row[-1]

    def get_suggestions(self, word: str, language: str, max_suggestions: int = 10) -> Dict[str, List[str]]:
        """Return spelling suggestions for *word* in *language*.

        Args:
            word: The word to look up.
            language: Key of the vocabulary to search.
            max_suggestions: Upper bound on the total number of suggested
                words across all groups.

        Returns:
            ``{"exact_match": [word]}`` when the word is in the vocabulary.
            Otherwise a dict mapping a probability label ("High Probability"
            for edit distance 1, "Medium Probability" for 2, "Low
            Probability" for 3, "Very Low Probability" for anything else) to
            the candidate words in that group, closest first; empty groups
            are omitted. An empty dict is returned for an empty word or when
            there are no candidates at all.
        """
        # .get() instead of indexing: looking up an unknown language must not
        # create an empty entry in the defaultdicts as a side effect.
        known_words = self.vocab.get(language, set())
        if word in known_words:
            return {"exact_match": [word]}

        # An empty word has no first character to index by and nothing
        # meaningful to suggest.
        if not word:
            return {}

        # Prefer words sharing the first character; fall back to the whole
        # vocabulary when none do.
        same_initial = self.char_to_words.get(language, {}).get(word[0])
        candidates = same_initial or known_words

        # Sorting (distance, word) tuples orders by distance first and breaks
        # ties alphabetically, which keeps the output deterministic.
        ranked = sorted(
            (self.levenshtein_distance(word, candidate), candidate)
            for candidate in candidates
        )

        suggestions = {
            "High Probability": [],
            "Medium Probability": [],
            "Low Probability": [],
            "Very Low Probability": [],
        }
        for distance, candidate in ranked[:max_suggestions]:
            label = self._DISTANCE_LABELS.get(distance, self._FALLBACK_LABEL)
            suggestions[label].append(candidate)

        return {label: words for label, words in suggestions.items() if words}
|
|
|
|
|
def format_suggestions(suggestions: Dict[str, List[str]]) -> str:
    """Render a suggestions dict as the text shown in the results box.

    Args:
        suggestions: Either a dict containing the key ``"exact_match"`` or a
            mapping from a category label to the list of suggested words in
            that category.

    Returns:
        A confirmation message when ``"exact_match"`` is present; otherwise
        a "Suggested corrections:" header followed by one
        ``"<category>:\\n<word>, <word>\\n\\n"`` section per non-empty
        category, or "No suggestions found." when every category is empty.
    """
    if "exact_match" in suggestions:
        return "✓ Word is correct and exists in the corpus!"

    sections = [
        f"{category}:\n{', '.join(words)}\n\n"
        for category, words in suggestions.items()
        if words
    ]
    # Without this guard the user would see a bare header with nothing
    # underneath it when there is nothing to suggest.
    if not sections:
        return "No suggestions found."

    return "Suggested corrections:\n\n" + "".join(sections)
|
|
|
|
|
def check_spelling(word: str, language: str, spell_checker: MultilingualSpellChecker) -> str:
    """Check one word and return the formatted result text for the UI.

    Args:
        word: Raw text entered by the user; surrounding whitespace is ignored.
        language: Key of the vocabulary to check against.
        spell_checker: Checker whose ``get_suggestions`` performs the lookup.

    Returns:
        A prompt asking for input when *word* is empty, ``None`` or only
        whitespace; otherwise the output of ``format_suggestions`` for the
        lookup result.
    """
    # Look up the stripped word, not the raw input: stray spaces would
    # otherwise defeat the exact-match test and the first-character index.
    cleaned = (word or "").strip()
    if not cleaned:
        return "Please enter a word to check."

    suggestions = spell_checker.get_suggestions(cleaned, language)
    return format_suggestions(suggestions)
|
|
|
|
|
def create_gradio_interface():
    """Build the Gradio interface for the spell checker and return it.

    A fresh ``MultilingualSpellChecker`` is created and the Tamil, Malayalam
    and English corpora are loaded from ``tamil.txt``, ``malayalam.txt`` and
    ``english.txt`` (paths relative to the working directory). The returned
    ``gr.Interface`` is not launched here.
    """
    spell_checker = MultilingualSpellChecker()

    # Load order matches the order of the dropdown choices below.
    corpora = (
        ("Tamil", "tamil.txt"),
        ("Malayalam", "malayalam.txt"),
        ("English", "english.txt"),
    )
    for language_name, corpus_path in corpora:
        spell_checker.load_corpus(language_name, corpus_path)

    word_box = gr.Textbox(
        label="Enter word to check",
        placeholder="Type a word here...",
        lines=1
    )
    language_picker = gr.Dropdown(
        choices=["Tamil", "Malayalam", "English"],
        label="Select Language",
        value="Tamil"
    )
    results_box = gr.Textbox(
        label="Results",
        lines=10
    )

    description_text = (
        "Enter a word in the selected language to check its spelling and get suggestions.\n"
        "The system will verify if the word exists in the corpus and provide similar words if it doesn't."
    )
    custom_css = (
        "\n"
        ".gradio-container {max-width: 800px; margin: auto;}\n"
        ".output-text {font-family: monospace;}\n"
    )

    # The lambda binds this function's checker instance so the UI callback
    # only receives the two user-supplied inputs.
    return gr.Interface(
        fn=lambda word, lang: check_spelling(word, lang, spell_checker),
        inputs=[word_box, language_picker],
        outputs=results_box,
        title="Multilingual Spell Checker",
        description=description_text,
        theme="default",
        css=custom_css
    )
|
|
|
|
|
|
|
|
def setup_colab():
    """Build the spell-checker interface and launch it with ``share=True``."""
    create_gradio_interface().launch(share=True)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it with the
    # default launch settings.
    iface = create_gradio_interface()
    iface.launch()
|
|
|
|
|
|