Spaces:

SanjayKannaV
/

MultilingualSpellChecker

Sleeping

App Files Files Community

SanjayKannaV committed on Jan 12, 2025

Commit

a027cd2

verified ·

1 Parent(s): fb9c36b

Upload app (1).py

Browse files

Files changed (1) hide show

app (1).py +154 -0

app (1).py ADDED Viewed

	@@ -0,0 +1,154 @@

+import gradio as gr
+from collections import defaultdict
+from typing import List, Dict, Set, Tuple
+import pandas as pd
+import numpy as np
+class MultilingualSpellChecker:
+    def __init__(self):
+        self.vocab = defaultdict(set)  # Language -> set of words
+        self.char_to_words = defaultdict(lambda: defaultdict(set))  # Language -> char -> words
+        self.current_language = None
+    def load_corpus(self, language: str, file_path: str):
+        """Load corpus for specific language"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    word = line.strip()
+                    if word:
+                        self.vocab[language].add(word)
+                        self.char_to_words[language][word[0]].add(word)
+            print(f"Loaded {len(self.vocab[language])} words for {language}")
+        except Exception as e:
+            print(f"Error loading corpus for {language}: {e}")
+    def levenshtein_distance(self, s1: str, s2: str) -> int:
+        """Calculate Levenshtein distance between two strings"""
+        if len(s1) < len(s2):
+            return self.levenshtein_distance(s2, s1)
+        if len(s2) == 0:
+            return len(s1)
+        previous_row = range(len(s2) + 1)
+        for i, c1 in enumerate(s1):
+            current_row = [i + 1]
+            for j, c2 in enumerate(s2):
+                insertions = previous_row[j + 1] + 1
+                deletions = current_row[j] + 1
+                substitutions = previous_row[j] + (c1 != c2)
+                current_row.append(min(insertions, deletions, substitutions))
+            previous_row = current_row
+        return previous_row[-1]
+    def get_suggestions(self, word: str, language: str, max_suggestions: int = 10) -> Dict[str, List[str]]:
+        """Get word suggestions for given language"""
+        if word in self.vocab[language]:
+            return {"exact_match": [word]}
+        candidates = self.char_to_words[language].get(word[0], set())
+        if not candidates:
+            candidates = self.vocab[language]
+        distances = []
+        for candidate in candidates:
+            distance = self.levenshtein_distance(word, candidate)
+            distances.append((distance, candidate))
+        distances.sort()
+        suggestions = {
+            "High Probability": [],    # Distance 1
+            "Medium Probability": [],  # Distance 2
+            "Low Probability": [],     # Distance 3
+            "Very Low Probability": [] # Distance 4+
+        }
+        for distance, candidate in distances[:max_suggestions]:
+            if distance == 1:
+                suggestions["High Probability"].append(candidate)
+            elif distance == 2:
+                suggestions["Medium Probability"].append(candidate)
+            elif distance == 3:
+                suggestions["Low Probability"].append(candidate)
+            else:
+                suggestions["Very Low Probability"].append(candidate)
+        return {k: v for k, v in suggestions.items() if v}
+def format_suggestions(suggestions: Dict[str, List[str]]) -> str:
+    """Format suggestions for Gradio output"""
+    if "exact_match" in suggestions:
+        return "✓ Word is correct and exists in the corpus!"
+    result = "Suggested corrections:\n\n"
+    for category, words in suggestions.items():
+        if words:
+            result += f"{category}:\n"
+            result += ", ".join(words)
+            result += "\n\n"
+    return result
+def check_spelling(word: str, language: str, spell_checker: MultilingualSpellChecker) -> str:
+    """Gradio interface function"""
+    if not word.strip():
+        return "Please enter a word to check."
+    suggestions = spell_checker.get_suggestions(word, language)
+    return format_suggestions(suggestions)
+def create_gradio_interface():
+    # Initialize spell checker
+    spell_checker = MultilingualSpellChecker()
+    # Load corpora (adjust paths as needed)
+    spell_checker.load_corpus("Tamil", "/content/tamil.txt")
+    spell_checker.load_corpus("Malayalam", "/content/malayalam.txt")
+    # Create Gradio interface
+    iface = gr.Interface(
+        fn=lambda word, lang: check_spelling(word, lang, spell_checker),
+        inputs=[
+            gr.Textbox(
+                label="Enter word to check",
+                placeholder="Type a word here...",
+                lines=1
+            ),
+            gr.Dropdown(
+                choices=["Tamil", "Malayalam"],
+                label="Select Language",
+                value="Tamil"
+            )
+        ],
+        outputs=gr.Textbox(
+            label="Results",
+            lines=10
+        ),
+        title="Multilingual Spell Checker",
+        description="""Enter a word in the selected language to check its spelling and get suggestions.
+        The system will verify if the word exists in the corpus and provide similar words if it doesn't.""",
+        theme="default",
+        css="""
+        .gradio-container {max-width: 800px; margin: auto;}
+        .output-text {font-family: monospace;}
+        """
+    )
+    return iface
+# For Colab usage
+def setup_colab():
+    # Install Gradio if not already installed
+   # !pip install -q gradio
+    # Create and launch interface
+    iface = create_gradio_interface()
+    iface.launch(share=True)
+if __name__ == "__main__":
+    # For local development: build the interface and launch it with the
+    # default launch() arguments (no share=True, unlike setup_colab).
+    iface = create_gradio_interface()
+    iface.launch()