File size: 5,476 Bytes
a027cd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7beeb1e
 
6f65911
a027cd2
 
 
 
 
 
 
 
 
 
 
6f65911
a027cd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import gradio as gr
from collections import defaultdict
from typing import List, Dict, Set, Tuple
import pandas as pd
import numpy as np

class MultilingualSpellChecker:
    """Dictionary-based spell checker that keeps one word list per language.

    Words are stored in ``vocab`` (language -> set of words) and additionally
    indexed by their first character in ``char_to_words`` (language ->
    first character -> set of words) so that suggestion lookups only need to
    compare against words sharing the query's first character.
    """

    def __init__(self):
        self.vocab = defaultdict(set)  # Language -> set of words
        self.char_to_words = defaultdict(lambda: defaultdict(set))  # Language -> char -> words
        self.current_language = None

    def load_corpus(self, language: str, file_path: str) -> None:
        """Load a word list (one word per line) for ``language``.

        Blank lines are skipped and surrounding whitespace is stripped.
        Loading is best-effort: if the file cannot be opened or decoded the
        error is printed and the checker keeps whatever was loaded so far.

        Args:
            language: Key under which the words are stored.
            file_path: Path to a UTF-8 text file with one word per line.
        """
        try:
            # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM,
            # which would otherwise become part of the first word (str.strip
            # does not treat U+FEFF as whitespace).
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    word = line.strip()
                    if word:
                        self.vocab[language].add(word)
                        self.char_to_words[language][word[0]].add(word)
            print(f"Loaded {len(self.vocab[language])} words for {language}")
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: undecodable bytes
            # (UnicodeDecodeError) or an invalid path string.
            print(f"Error loading corpus for {language}: {e}")

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

        Uses the two-row dynamic-programming formulation, so only one row of
        the distance matrix is kept in memory at a time.
        """
        # Keep the shorter string as the row dimension.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def get_suggestions(self, word: str, language: str, max_suggestions: int = 10) -> Dict[str, List[str]]:
        """Return spelling suggestions for ``word`` in ``language``.

        Args:
            word: The word to check.
            language: Which loaded corpus to search.
            max_suggestions: Upper bound on the total number of suggestions
                across all categories.

        Returns:
            ``{"exact_match": [word]}`` when the word is in the corpus.
            Otherwise a dict mapping a probability label to candidate words,
            closest first, with empty categories omitted. An empty dict is
            returned for an empty ``word`` or when no candidates exist.
        """
        # An empty query has no first character to index on.
        if not word:
            return {}

        # Use .get() so that querying an unknown language does not insert
        # empty entries into the defaultdicts as a side effect.
        vocab = self.vocab.get(language, frozenset())
        if word in vocab:
            return {"exact_match": [word]}

        by_first_char = self.char_to_words.get(language)
        candidates = by_first_char.get(word[0]) if by_first_char else None
        if not candidates:
            # No word shares the first character: fall back to the full list.
            candidates = vocab

        # Sorting (distance, word) tuples orders by distance, then
        # alphabetically, which keeps the output deterministic.
        distances = sorted(
            (self.levenshtein_distance(word, candidate), candidate)
            for candidate in candidates
        )

        labels = {
            1: "High Probability",
            2: "Medium Probability",
            3: "Low Probability",
        }
        fallback_label = "Very Low Probability"  # Distance 4+

        suggestions = {label: [] for label in (*labels.values(), fallback_label)}
        for distance, candidate in distances[:max_suggestions]:
            suggestions[labels.get(distance, fallback_label)].append(candidate)

        return {k: v for k, v in suggestions.items() if v}

def format_suggestions(suggestions: Dict[str, List[str]]) -> str:
    """Render a suggestions mapping as plain text for the Gradio output box.

    Args:
        suggestions: Either ``{"exact_match": [...]}`` or a mapping from a
            category label to a list of candidate words.

    Returns:
        A confirmation message for an exact match; otherwise a
        "Suggested corrections" listing with one section per non-empty
        category, or a short notice when there is nothing to suggest.
    """
    if "exact_match" in suggestions:
        return "✓ Word is correct and exists in the corpus!"

    sections = [
        f"{category}:\n{', '.join(words)}\n\n"
        for category, words in suggestions.items()
        if words
    ]
    if not sections:
        # Avoid emitting a header with nothing underneath it.
        return "No suggestions found in the corpus."

    return "Suggested corrections:\n\n" + "".join(sections)

def check_spelling(word: str, language: str, spell_checker: "MultilingualSpellChecker") -> str:
    """Gradio interface function: check one word and return display text.

    Args:
        word: Raw text from the input box; surrounding whitespace is ignored.
        language: Name of the corpus to check against.
        spell_checker: Checker whose ``get_suggestions`` is queried.

    Returns:
        A prompt to enter a word when the input is empty or blank, otherwise
        the formatted suggestions for the trimmed word.
    """
    # Look up the trimmed word, not the raw input: stray spaces around an
    # otherwise correct word must not make it look misspelled.
    cleaned = word.strip() if word else ""
    if not cleaned:
        return "Please enter a word to check."

    suggestions = spell_checker.get_suggestions(cleaned, language)
    return format_suggestions(suggestions)

def create_gradio_interface():
    """Build the spell-checker web UI and return the Gradio interface.

    A fresh MultilingualSpellChecker is created, the Tamil, Malayalam and
    English word lists are loaded from the current working directory, and
    the checker is wired into a two-input / one-output ``gr.Interface``.
    """
    checker = MultilingualSpellChecker()

    # Corpus files, in load order (adjust paths as needed).
    corpus_files = (
        ("Tamil", "tamil.txt"),
        ("Malayalam", "malayalam.txt"),
        ("English", "english.txt"),
    )
    for language_name, corpus_path in corpus_files:
        checker.load_corpus(language_name, corpus_path)

    # Input and output widgets.
    word_box = gr.Textbox(
        label="Enter word to check",
        placeholder="Type a word here...",
        lines=1,
    )
    language_picker = gr.Dropdown(
        choices=[language_name for language_name, _ in corpus_files],
        label="Select Language",
        value="Tamil",
    )
    results_box = gr.Textbox(label="Results", lines=10)

    page_description = """Enter a word in the selected language to check its spelling and get suggestions.
        The system will verify if the word exists in the corpus and provide similar words if it doesn't."""
    page_css = """
        .gradio-container {max-width: 800px; margin: auto;}
        .output-text {font-family: monospace;}
        """

    # The lambda closes over the checker so Gradio only sees (word, lang).
    return gr.Interface(
        fn=lambda word, lang: check_spelling(word, lang, checker),
        inputs=[word_box, language_picker],
        outputs=results_box,
        title="Multilingual Spell Checker",
        description=page_description,
        theme="default",
        css=page_css,
    )

# For Colab usage
def setup_colab():
    """Create the interface and launch it with ``share=True``.

    Gradio must already be installed; in a notebook cell that can be done
    beforehand with ``!pip install -q gradio``.
    """
    create_gradio_interface().launch(share=True)
    

if __name__ == "__main__":
    # Script entry point for local development: build the interface and
    # launch it with Gradio's default launch settings (no share=True).
    iface = create_gradio_interface()
    iface.launch()