# Hugging Face Space: SanjayKannaV/MultilingualSpellChecker
# Space status: Sleeping
# File size: 5,476 Bytes

import gradio as gr
from collections import defaultdict
from typing import List, Dict, Set, Tuple
import pandas as pd
import numpy as np

class MultilingualSpellChecker:
    """Word-list based spell checker that keeps a separate vocabulary per language.

    Words are stored in ``vocab`` (language -> set of words) and additionally
    indexed by their first character in ``char_to_words`` so that suggestion
    lookups only have to compare against words sharing the query's first
    character.
    """

    def __init__(self):
        self.vocab = defaultdict(set)  # Language -> set of words
        self.char_to_words = defaultdict(lambda: defaultdict(set))  # Language -> first char -> words
        self.current_language = None  # Not read or updated by any method of this class

    def load_corpus(self, language: str, file_path: str):
        """Load a one-word-per-line corpus file into the vocabulary of ``language``.

        Blank lines are skipped and surrounding whitespace is stripped from
        every word. Loading is best-effort: any error is reported on stdout
        and not re-raised, so words read before the failure stay loaded.

        Args:
            language: Key under which the words are stored.
            file_path: Path of the UTF-8 text file to read.
        """
        try:
            # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading
            # byte-order mark; with plain 'utf-8' the BOM would be glued onto
            # the first word and that word could never be matched.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    word = line.strip()
                    if word:
                        self.vocab[language].add(word)
                        self.char_to_words[language][word[0]].add(word)
            print(f"Loaded {len(self.vocab[language])} words for {language}")
        except Exception as e:
            print(f"Error loading corpus for {language}: {e}")

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

        Insertions, deletions and substitutions each cost 1. Only two rows of
        the dynamic-programming table are kept at any time.
        """
        # Keep the shorter string as s2 so the rows are as short as possible.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            # First cell: cost of deleting the first i + 1 characters of s1.
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def get_suggestions(self, word: str, language: str, max_suggestions: int = 10) -> Dict[str, List[str]]:
        """Return spelling suggestions for ``word`` in ``language``.

        Args:
            word: The word to check.
            language: Key of the vocabulary to search.
            max_suggestions: Upper bound on the total number of suggestions
                across all categories.

        Returns:
            ``{"exact_match": [word]}`` when the word is in the vocabulary.
            Otherwise a dict mapping a probability label to candidate words,
            listing only non-empty categories, with the closest candidates
            first (ties broken alphabetically). An empty dict is returned for
            an empty ``word`` or when there are no candidates at all.
        """
        if word in self.vocab[language]:
            return {"exact_match": [word]}

        # An empty query has no first character to index by; previously this
        # raised IndexError on word[0].
        if not word:
            return {}

        # Prefer words sharing the first character; fall back to the whole
        # vocabulary when no word starts with it.
        candidates = self.char_to_words[language].get(word[0], set())
        if not candidates:
            candidates = self.vocab[language]

        ranked = sorted(
            (self.levenshtein_distance(word, candidate), candidate)
            for candidate in candidates
        )

        label_by_distance = {
            1: "High Probability",
            2: "Medium Probability",
            3: "Low Probability",
        }
        # Pre-seeded so the categories always come out in this fixed order.
        suggestions = {
            "High Probability": [],
            "Medium Probability": [],
            "Low Probability": [],
            "Very Low Probability": [],  # any other distance
        }

        for distance, candidate in ranked[:max_suggestions]:
            label = label_by_distance.get(distance, "Very Low Probability")
            suggestions[label].append(candidate)

        return {label: words for label, words in suggestions.items() if words}

def format_suggestions(suggestions: Dict[str, List[str]]) -> str:
    """Render a suggestions dict as the text shown in the Gradio output box.

    Args:
        suggestions: Either a dict containing the key ``"exact_match"`` or a
            mapping of category label -> list of suggested words.

    Returns:
        A confirmation message for an exact match; otherwise a
        "Suggested corrections:" header followed by one section per non-empty
        category (label line, comma-separated words, blank line). When no
        category has any words, a short "No suggestions found." message is
        returned instead of a header with nothing under it.
    """
    if "exact_match" in suggestions:
        return "✓ Word is correct and exists in the corpus!"

    sections = [
        f"{category}:\n{', '.join(words)}\n\n"
        for category, words in suggestions.items()
        if words
    ]
    if not sections:
        return "No suggestions found."

    return "Suggested corrections:\n\n" + "".join(sections)

def check_spelling(word: str, language: str, spell_checker: MultilingualSpellChecker) -> str:
    """Check one word and return the formatted result text for the UI.

    Args:
        word: Raw text from the input box; surrounding whitespace is ignored.
        language: Key of the vocabulary to check against.
        spell_checker: Checker instance that supplies the suggestions.

    Returns:
        A prompt asking for input when ``word`` is missing or blank, otherwise
        the output of ``format_suggestions`` for the cleaned word.
    """
    # Look up the stripped word: validating the stripped form but querying
    # with the raw one made inputs like " hello" miss the vocabulary.
    cleaned = word.strip() if word else ""
    if not cleaned:
        return "Please enter a word to check."

    suggestions = spell_checker.get_suggestions(cleaned, language)
    return format_suggestions(suggestions)

def create_gradio_interface():
    """Build the Gradio interface backed by a freshly loaded spell checker.

    A new ``MultilingualSpellChecker`` is created and the Tamil, Malayalam and
    English word lists are loaded into it, in that order, before the UI
    components are assembled.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    checker = MultilingualSpellChecker()

    # Corpus files are given as relative paths (adjust as needed).
    for lang_name, corpus_path in (
        ("Tamil", "tamil.txt"),
        ("Malayalam", "malayalam.txt"),
        ("English", "english.txt"),
    ):
        checker.load_corpus(lang_name, corpus_path)

    # Input widgets: the word to check and the language to check it in.
    word_box = gr.Textbox(
        label="Enter word to check",
        placeholder="Type a word here...",
        lines=1,
    )
    language_picker = gr.Dropdown(
        choices=["Tamil", "Malayalam", "English"],
        label="Select Language",
        value="Tamil",
    )

    # Output widget showing the formatted result text.
    results_box = gr.Textbox(label="Results", lines=10)

    description_text = """Enter a word in the selected language to check its spelling and get suggestions.
        The system will verify if the word exists in the corpus and provide similar words if it doesn't."""
    custom_css = """
        .gradio-container {max-width: 800px; margin: auto;}
        .output-text {font-family: monospace;}
        """

    return gr.Interface(
        # The lambda closes over `checker` so Gradio only passes the two inputs.
        fn=lambda word, lang: check_spelling(word, lang, checker),
        inputs=[word_box, language_picker],
        outputs=results_box,
        title="Multilingual Spell Checker",
        description=description_text,
        theme="default",
        css=custom_css,
    )

# For Colab usage
def setup_colab():
    """Build the interface and launch it with ``share=True`` (Colab usage).

    Gradio must already be installed; in a notebook, run
    ``!pip install -q gradio`` in a cell beforehand.
    """
    create_gradio_interface().launch(share=True)
    

if __name__ == "__main__":
    # Local development entry point: build the UI and launch it with
    # Gradio's default launch settings.
    create_gradio_interface().launch()