File size: 5,476 Bytes
a027cd2 7beeb1e 6f65911 a027cd2 6f65911 a027cd2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
import gradio as gr
from collections import defaultdict
from typing import List, Dict, Set, Tuple
import pandas as pd
import numpy as np
class MultilingualSpellChecker:
    """Word-list spell checker that keeps a separate vocabulary per language.

    Each language has a set of known words plus an index from first
    character to the words starting with that character. The index is used
    to narrow the candidate set before edit distances are computed.
    """

    def __init__(self):
        # language -> set of known words
        self.vocab = defaultdict(set)
        # language -> first character -> words starting with that character
        self.char_to_words = defaultdict(lambda: defaultdict(set))
        self.current_language = None

    def load_corpus(self, language: str, file_path: str):
        """Load a one-word-per-line corpus file into the given language.

        Lines are stripped of surrounding whitespace and blank lines are
        skipped. Loading is best-effort: if the file cannot be opened or
        decoded, the error is printed and whatever was read so far is kept.

        Args:
            language: Key under which the words are stored.
            file_path: Path to a UTF-8 text file with one word per line.
        """
        try:
            # 'utf-8-sig' reads plain UTF-8 too, but drops a leading BOM so it
            # does not end up glued to the first word of the file.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    word = line.strip()
                    if word:
                        self.vocab[language].add(word)
                        self.char_to_words[language][word[0]].add(word)
            print(f"Loaded {len(self.vocab[language])} words for {language}")
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError covers decoding
            # errors (UnicodeError) and invalid path strings.
            print(f"Error loading corpus for {language}: {e}")

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Return the Levenshtein (edit) distance between two strings.

        Uses the two-row dynamic-programming formulation, iterating over the
        longer string and keeping rows sized by the shorter one.
        """
        if len(s1) < len(s2):
            # Swap so that s2 is always the shorter string.
            s1, s2 = s2, s1
        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    def get_suggestions(self, word: str, language: str, max_suggestions: int = 10) -> Dict[str, List[str]]:
        """Return spelling suggestions for ``word`` in ``language``.

        Args:
            word: The word to look up.
            language: Language key previously used with ``load_corpus``.
            max_suggestions: Maximum number of candidates to return in total
                (across all categories).

        Returns:
            ``{"exact_match": [word]}`` when the word is in the vocabulary.
            Otherwise a dict mapping a probability label to candidate words,
            closest first, with empty categories omitted. An empty dict is
            returned for an empty word or when there are no candidates.
        """
        if not word:
            # Nothing to index on; word[0] below would raise IndexError.
            return {}

        # Use .get() so that looking up an unknown language does not insert
        # empty entries into the defaultdicts.
        vocab = self.vocab.get(language, set())
        if word in vocab:
            return {"exact_match": [word]}

        # Prefer words sharing the first character; fall back to the whole
        # vocabulary when that bucket is missing or empty.
        by_first_char = self.char_to_words.get(language, {})
        candidates = by_first_char.get(word[0]) or vocab

        ranked = sorted(
            (self.levenshtein_distance(word, candidate), candidate)
            for candidate in candidates
        )

        labels = {
            1: "High Probability",
            2: "Medium Probability",
            3: "Low Probability",
        }
        # Pre-populate so categories keep a fixed order in the output.
        suggestions = {
            "High Probability": [],       # Distance 1
            "Medium Probability": [],     # Distance 2
            "Low Probability": [],        # Distance 3
            "Very Low Probability": [],   # Distance 4+
        }
        for distance, candidate in ranked[:max_suggestions]:
            label = labels.get(distance, "Very Low Probability")
            suggestions[label].append(candidate)

        return {k: v for k, v in suggestions.items() if v}
def format_suggestions(suggestions: Dict[str, List[str]]) -> str:
    """Render a suggestions mapping as plain text for the results box.

    Args:
        suggestions: Either ``{"exact_match": [...]}`` or a mapping of
            category label to a list of candidate words.

    Returns:
        A confirmation message for an exact match; otherwise a header
        followed by one ``label:`` / comma-separated-words section per
        non-empty category. If no category has any words, a short
        "no suggestions" message is returned instead of a bare header.
    """
    if "exact_match" in suggestions:
        return "✓ Word is correct and exists in the corpus!"

    sections = [
        f"{category}:\n{', '.join(words)}\n\n"
        for category, words in suggestions.items()
        if words
    ]
    if not sections:
        # Avoid showing a "Suggested corrections:" header with nothing under it.
        return "No suggestions found."
    return "Suggested corrections:\n\n" + "".join(sections)
def check_spelling(word: str, language: str, spell_checker: MultilingualSpellChecker) -> str:
    """Check one word and return a formatted result string for the UI.

    Args:
        word: Raw text from the input box; surrounding whitespace is ignored.
        language: Language key to look the word up under.
        spell_checker: Checker instance whose ``get_suggestions`` is queried.

    Returns:
        A prompt to enter a word when the input is empty or whitespace-only,
        otherwise the output of ``format_suggestions`` for the lookup.
    """
    # Strip once and use the stripped form for the lookup as well: a stray
    # leading/trailing space would otherwise prevent an exact match and make
    # the first-character index key on the space.
    cleaned = word.strip() if word else ""
    if not cleaned:
        return "Please enter a word to check."
    suggestions = spell_checker.get_suggestions(cleaned, language)
    return format_suggestions(suggestions)
def create_gradio_interface():
    """Build the Gradio UI backed by a freshly loaded spell checker.

    Creates a ``MultilingualSpellChecker``, loads the Tamil, Malayalam and
    English word lists from the working directory, and returns a
    ``gr.Interface`` wiring a word textbox and a language dropdown to
    ``check_spelling``.
    """
    spell_checker = MultilingualSpellChecker()

    # Corpus files, in load order (adjust paths as needed).
    corpus_files = (
        ("Tamil", "tamil.txt"),
        ("Malayalam", "malayalam.txt"),
        ("English", "english.txt"),
    )
    for language_name, corpus_path in corpus_files:
        spell_checker.load_corpus(language_name, corpus_path)

    word_input = gr.Textbox(
        label="Enter word to check",
        placeholder="Type a word here...",
        lines=1
    )
    language_input = gr.Dropdown(
        choices=["Tamil", "Malayalam", "English"],
        label="Select Language",
        value="Tamil"
    )
    results_output = gr.Textbox(
        label="Results",
        lines=10
    )

    description_text = (
        "Enter a word in the selected language to check its spelling and get suggestions.\n"
        "The system will verify if the word exists in the corpus and provide similar words if it doesn't."
    )
    custom_css = (
        "\n"
        ".gradio-container {max-width: 800px; margin: auto;}\n"
        ".output-text {font-family: monospace;}\n"
    )

    # The lambda closes over the loaded checker so Gradio only has to supply
    # the two UI values.
    return gr.Interface(
        fn=lambda word, lang: check_spelling(word, lang, spell_checker),
        inputs=[word_input, language_input],
        outputs=results_output,
        title="Multilingual Spell Checker",
        description=description_text,
        theme="default",
        css=custom_css
    )
# For Colab usage
def setup_colab():
    """Create the interface and launch it with ``share=True``.

    Gradio must already be installed in the notebook, e.g. by running
    ``!pip install -q gradio`` in a cell beforehand.
    """
    create_gradio_interface().launch(share=True)
if __name__ == "__main__":
    # Script entry point for local development: build the interface and
    # launch it with Gradio's default launch settings (no share=True here,
    # unlike setup_colab).
    iface = create_gradio_interface()
    iface.launch()
|