Upload app (1).py
Browse files- app (1).py +154 -0
app (1).py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from collections import defaultdict
|
| 3 |
+
from typing import List, Dict, Set, Tuple
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
class MultilingualSpellChecker:
    """Dictionary-based spell checker that keeps a separate vocabulary per language.

    Attributes are plain containers filled by ``load_corpus``:

    * ``vocab`` maps a language name to the set of known words.
    * ``char_to_words`` maps a language name to an index from first character
      to the words starting with that character; ``get_suggestions`` uses it
      to narrow the candidate set before computing edit distances.
    * ``current_language`` is initialised to ``None`` and is not read by any
      method of this class.
    """

    def __init__(self):
        self.vocab = defaultdict(set)  # Language -> set of words
        self.char_to_words = defaultdict(lambda: defaultdict(set))  # Language -> char -> words
        self.current_language = None

    def load_corpus(self, language: str, file_path: str):
        """Load a UTF-8, one-word-per-line corpus file into ``language``'s vocabulary.

        Each line is stripped of surrounding whitespace; blank lines are
        skipped. Loading is best-effort: if the file cannot be opened or
        decoded, the error is printed and the method returns normally, keeping
        whatever words were read before the failure.

        Args:
            language: Key under which the words are stored.
            file_path: Path of the corpus file.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                words = self.vocab[language]
                by_first_char = self.char_to_words[language]
                for line in f:
                    word = line.strip()
                    if word:
                        words.add(word)
                        by_first_char[word[0]].add(word)
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: undecodable bytes
            # (UnicodeDecodeError) or an invalid path such as an embedded NUL.
            print(f"Error loading corpus for {language}: {e}")
        else:
            print(f"Loaded {len(self.vocab[language])} words for {language}")

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

        Uses the row-by-row dynamic-programming formulation, keeping only the
        previous row in memory.
        """
        # Keep the shorter string as s2 so each row is as short as possible.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def get_suggestions(self, word: str, language: str, max_suggestions: int = 10) -> Dict[str, List[str]]:
        """Return spelling suggestions for ``word`` in ``language``.

        Args:
            word: The word to check.
            language: Vocabulary to check against.
            max_suggestions: Maximum number of candidates kept (closest first,
                ties broken alphabetically) before grouping.

        Returns:
            ``{"exact_match": [word]}`` when the word is in the vocabulary.
            Otherwise a dict mapping a likelihood label to candidate words,
            containing only the non-empty groups; edit distance 1, 2 and 3
            map to "High", "Medium" and "Low Probability", anything else to
            "Very Low Probability". An empty ``word`` or an unknown/empty
            vocabulary yields ``{}``.
        """
        # Guard: word[0] below would raise IndexError on an empty string.
        if not word:
            return {}

        # .get() instead of indexing so that a lookup for an unknown language
        # does not insert an empty entry into the defaultdicts.
        vocabulary = self.vocab.get(language, set())
        if word in vocabulary:
            return {"exact_match": [word]}

        # Prefer words sharing the first character; fall back to everything.
        candidates = self.char_to_words.get(language, {}).get(word[0]) or vocabulary

        ranked = sorted(
            (self.levenshtein_distance(word, candidate), candidate)
            for candidate in candidates
        )

        label_for_distance = {
            1: "High Probability",
            2: "Medium Probability",
            3: "Low Probability",
        }
        # Pre-seeded so the output keeps the High -> Very Low ordering.
        suggestions = {
            "High Probability": [],
            "Medium Probability": [],
            "Low Probability": [],
            "Very Low Probability": [],
        }
        for distance, candidate in ranked[:max_suggestions]:
            label = label_for_distance.get(distance, "Very Low Probability")
            suggestions[label].append(candidate)

        return {k: v for k, v in suggestions.items() if v}
|
| 80 |
+
|
| 81 |
+
def format_suggestions(suggestions: Dict[str, List[str]]) -> str:
    """Render a suggestions dict as plain text for the Gradio output box.

    Args:
        suggestions: Either ``{"exact_match": [...]}`` or a mapping from a
            category label to a list of candidate words.

    Returns:
        A confirmation message for an exact match; otherwise a
        "Suggested corrections:" header followed by one section per non-empty
        category (label line, comma-separated words, blank line). When no
        category has any words, "No suggestions found." is returned instead
        of a header with nothing under it.
    """
    if "exact_match" in suggestions:
        return "✓ Word is correct and exists in the corpus!"

    sections = [
        f"{category}:\n{', '.join(words)}\n\n"
        for category, words in suggestions.items()
        if words
    ]
    if not sections:
        return "No suggestions found."

    return "Suggested corrections:\n\n" + "".join(sections)
|
| 93 |
+
|
| 94 |
+
def check_spelling(word: str, language: str, spell_checker: MultilingualSpellChecker) -> str:
    """Check one word and return the formatted result (Gradio callback body).

    Args:
        word: Raw text from the input box; surrounding whitespace is ignored.
        language: Language whose vocabulary should be consulted.
        spell_checker: Checker instance providing ``get_suggestions``.

    Returns:
        A prompt asking for input when ``word`` is empty, ``None`` or only
        whitespace; otherwise the text produced by ``format_suggestions``.
    """
    # Look up the stripped word, not the raw input: stray spaces would
    # otherwise defeat the exact-match test and the first-character index.
    cleaned = word.strip() if word else ""
    if not cleaned:
        return "Please enter a word to check."

    suggestions = spell_checker.get_suggestions(cleaned, language)
    return format_suggestions(suggestions)
|
| 101 |
+
|
| 102 |
+
def create_gradio_interface(
    corpora: Tuple[Tuple[str, str], ...] = (
        ("Tamil", "/content/tamil.txt"),
        ("Malayalam", "/content/malayalam.txt"),
    ),
):
    """Build the Gradio interface for the spell checker.

    Args:
        corpora: ``(language, corpus_file_path)`` pairs to load. The language
            names also populate the dropdown, with the first one preselected.
            Defaults to the Tamil and Malayalam corpora under ``/content``.

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    # Initialize the spell checker and load every requested corpus.
    spell_checker = MultilingualSpellChecker()
    for language, file_path in corpora:
        spell_checker.load_corpus(language, file_path)

    # Single source of truth for the dropdown: the languages actually loaded.
    languages = [language for language, _ in corpora]

    iface = gr.Interface(
        fn=lambda word, lang: check_spelling(word, lang, spell_checker),
        inputs=[
            gr.Textbox(
                label="Enter word to check",
                placeholder="Type a word here...",
                lines=1,
            ),
            gr.Dropdown(
                choices=languages,
                label="Select Language",
                value=languages[0] if languages else None,
            ),
        ],
        outputs=gr.Textbox(
            label="Results",
            lines=10,
        ),
        title="Multilingual Spell Checker",
        description=(
            "Enter a word in the selected language to check its spelling and get suggestions.\n"
            "The system will verify if the word exists in the corpus and provide similar words if it doesn't."
        ),
        theme="default",
        css=(
            "\n"
            ".gradio-container {max-width: 800px; margin: auto;}\n"
            ".output-text {font-family: monospace;}\n"
        ),
    )
    return iface
|
| 139 |
+
|
| 140 |
+
# For Colab usage
|
| 141 |
+
def setup_colab():
    """Create the interface and launch it with ``share=True`` (Colab usage).

    Gradio must already be installed; in a notebook cell that is
    ``!pip install -q gradio``.
    """
    create_gradio_interface().launch(share=True)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
if __name__ == "__main__":
    # Script entry point for local development: build the UI, then serve it
    # with Gradio's default launch settings.
    iface = create_gradio_interface()
    iface.launch()
|
| 154 |
+
|