SanjayKannaV committed on
Commit
a027cd2
·
verified ·
1 Parent(s): fb9c36b

Upload app (1).py

Browse files
Files changed (1) hide show
  1. app (1).py +154 -0
app (1).py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from collections import defaultdict
3
+ from typing import List, Dict, Set, Tuple
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
class MultilingualSpellChecker:
    """Dictionary-based spell checker holding one vocabulary per language.

    Words are loaded from plain-text corpora (one word per line) and are also
    indexed by their first character so that suggestion lookups normally only
    have to compare against words sharing that first character.
    """

    def __init__(self):
        # language -> set of known words
        self.vocab = defaultdict(set)
        # language -> first character -> words starting with that character
        self.char_to_words = defaultdict(lambda: defaultdict(set))
        self.current_language = None

    def load_corpus(self, language: str, file_path: str) -> None:
        """Load a one-word-per-line corpus file into the given language.

        Blank lines are skipped and surrounding whitespace is removed from
        every word. Loading is best-effort: if the file cannot be opened or
        decoded, the error is printed and the method returns normally.

        Args:
            language: Name under which the words are stored.
            file_path: Path of the UTF-8 encoded corpus file.
        """
        try:
            # 'utf-8-sig' drops a leading byte-order mark if there is one;
            # with plain 'utf-8' the BOM would stay attached to the first word.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    word = line.strip()
                    if word:
                        self.vocab[language].add(word)
                        self.char_to_words[language][word[0]].add(word)
            print(f"Loaded {len(self.vocab[language])} words for {language}")
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: covers
            # UnicodeDecodeError and malformed paths.
            print(f"Error loading corpus for {language}: {e}")

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Return the Levenshtein (edit) distance between two strings.

        Insertions, deletions and substitutions each cost 1.
        """
        # Keep the shorter string as s2 so each row is as short as possible.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def get_suggestions(self, word: str, language: str, max_suggestions: int = 10) -> Dict[str, List[str]]:
        """Return spelling suggestions for ``word`` in ``language``.

        Args:
            word: Word to check; surrounding whitespace is ignored.
            language: Language whose vocabulary is searched.
            max_suggestions: Maximum number of candidates to report in total
                (values below zero are treated as zero).

        Returns:
            ``{"exact_match": [word]}`` when the word is in the vocabulary.
            Otherwise a dict mapping a probability label to candidate words,
            closest candidates first; labels without candidates are omitted.
            An empty dict is returned for an empty word or when no candidate
            exists.
        """
        word = word.strip() if word else ""
        if not word:
            # Nothing to look up; also avoids indexing word[0] below.
            return {}

        # Use .get() so that querying an unknown language does not create
        # empty entries in the defaultdicts.
        vocabulary = self.vocab.get(language, set())
        if word in vocabulary:
            return {"exact_match": [word]}

        # Prefer words sharing the first character; fall back to the whole
        # vocabulary when there are none.
        candidates = self.char_to_words.get(language, {}).get(word[0]) or vocabulary

        # Sorting (distance, word) tuples orders by distance, then by word.
        ranked = sorted(
            (self.levenshtein_distance(word, candidate), candidate)
            for candidate in candidates
        )

        labels = {
            1: "High Probability",
            2: "Medium Probability",
            3: "Low Probability",
        }
        suggestions: Dict[str, List[str]] = {}
        for distance, candidate in ranked[:max(max_suggestions, 0)]:
            # Any distance of 4 or more lands in the last bucket.
            label = labels.get(distance, "Very Low Probability")
            suggestions.setdefault(label, []).append(candidate)

        return suggestions
80
+
81
def format_suggestions(suggestions: Dict[str, List[str]]) -> str:
    """Render a suggestions dict as the text shown in the results box.

    Args:
        suggestions: Either a dict containing the key ``"exact_match"`` or a
            mapping of category label to a list of suggested words.

    Returns:
        A confirmation message for an exact match, a message saying nothing
        was found when no category has any words, or otherwise a header
        followed by one ``label:`` / comma-separated-words section per
        non-empty category.
    """
    if "exact_match" in suggestions:
        return "✓ Word is correct and exists in the corpus!"

    sections = [
        f"{category}:\n{', '.join(words)}\n\n"
        for category, words in suggestions.items()
        if words
    ]
    if not sections:
        # Without this the user would see a header with nothing beneath it.
        return "No suggestions found in the corpus."

    return "Suggested corrections:\n\n" + "".join(sections)
93
+
94
def check_spelling(word: str, language: str, spell_checker: "MultilingualSpellChecker") -> str:
    """Check one word and return the formatted result text for the UI.

    Args:
        word: Raw text from the input box; may be empty or ``None``.
        language: Language whose vocabulary should be used.
        spell_checker: Checker providing ``get_suggestions``.

    Returns:
        A prompt asking for input when the word is blank, otherwise the
        output of ``format_suggestions`` for the checker's suggestions.
    """
    cleaned = word.strip() if word else ""
    if not cleaned:
        return "Please enter a word to check."

    # Look up the stripped word: corpus entries are stored stripped, so
    # stray spaces around the input would otherwise prevent any match.
    suggestions = spell_checker.get_suggestions(cleaned, language)
    return format_suggestions(suggestions)
101
+
102
def create_gradio_interface():
    """Build and return the Gradio interface for the spell checker.

    A checker is created, the Tamil and Malayalam corpora are loaded into it,
    and the interface is wired to ``check_spelling``. The interface is
    returned without being launched.
    """
    checker = MultilingualSpellChecker()

    # Corpus locations (adjust paths as needed)
    for language, corpus_path in (
        ("Tamil", "/content/tamil.txt"),
        ("Malayalam", "/content/malayalam.txt"),
    ):
        checker.load_corpus(language, corpus_path)

    # Input widgets: the word to check and the language to check it in.
    word_box = gr.Textbox(
        label="Enter word to check",
        placeholder="Type a word here...",
        lines=1
    )
    language_picker = gr.Dropdown(
        choices=["Tamil", "Malayalam"],
        label="Select Language",
        value="Tamil"
    )

    # Output widget showing the formatted result text.
    results_box = gr.Textbox(
        label="Results",
        lines=10
    )

    return gr.Interface(
        fn=lambda word, lang: check_spelling(word, lang, checker),
        inputs=[word_box, language_picker],
        outputs=results_box,
        title="Multilingual Spell Checker",
        description="""Enter a word in the selected language to check its spelling and get suggestions.
        The system will verify if the word exists in the corpus and provide similar words if it doesn't.""",
        theme="default",
        css="""
        .gradio-container {max-width: 800px; margin: auto;}
        .output-text {font-family: monospace;}
        """
    )
139
+
140
+ # For Colab usage
141
def setup_colab():
    """Build the interface and launch it with ``share=True`` (Colab usage)."""
    # Gradio has to be installed beforehand; in a notebook cell that is
    # `!pip install -q gradio`.
    create_gradio_interface().launch(share=True)
148
+
149
+
150
if __name__ == "__main__":
    # Script entry point for local development: build the interface and
    # launch it with Gradio's default launch settings.
    iface = create_gradio_interface()
    iface.launch()
154
+