abhinavsarkar committed on
Commit
8212cca
·
verified ·
1 Parent(s): 614c61e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -0
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import textdistance
4
+ import re
5
+ from collections import Counter
6
+ import torch
7
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
8
+
9
# Configure the browser tab title and use the wide page layout.
# This must remain the first Streamlit command executed by the script, so keep
# it above every other st.* call.
st.set_page_config(page_title="Spell & Grammar Checker", layout="wide")
11
+
12
# Grammar-correction model (tokenizer + T5 weights), loaded through a cached loader.
@st.cache_resource
def load_grammar_model():
    """Load the fine-tuned T5 grammatical-error-correction checkpoint.

    The function is wrapped in ``st.cache_resource`` so the tokenizer and
    model objects are created by the first call and reused afterwards.

    Returns:
        A ``(tokenizer, model, torch_device)`` tuple, where ``torch_device``
        is ``'cuda'`` when a CUDA device is available and ``'cpu'`` otherwise,
        and ``model`` has already been moved to that device.
    """
    checkpoint = 'abhinavsarkar/Google-T5-base-Grammatical_Error_Correction-Finetuned-C4-200M-550k'

    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    grammar_tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    grammar_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    grammar_model = grammar_model.to(device)
    return grammar_tokenizer, grammar_model, device


tokenizer, model, torch_device = load_grammar_model()
22
+
23
# Load vocabulary for spell checking
@st.cache_resource
def load_vocabulary():
    """Build the spell-checker vocabulary from the bundled text corpora.

    Every corpus file is lower-cased and tokenised with ``\\w+``; the tokens
    are tallied into a single frequency table.

    Returns:
        A ``(V, word_freq, probs)`` tuple:

        * ``V`` -- ``set`` of every distinct token seen in the corpora.
        * ``word_freq`` -- ``Counter`` mapping token -> occurrence count, in
          first-seen order.
        * ``probs`` -- ``dict`` mapping token -> count / total token count,
          in the same key order as ``word_freq``.

    Raises:
        OSError: if one of the corpus files cannot be opened.
    """
    file_paths = ['Vocabulary/book.txt', 'Vocabulary/alice_in_wonderland.txt', 'Vocabulary/big.txt', 'Vocabulary/shakespeare.txt']

    # Count file by file instead of concatenating every token into one huge
    # list first; Counter.update keeps first-seen key order, so the resulting
    # table is the same as counting the concatenated list.
    word_freq = Counter()
    for file_path in file_paths:
        # Explicit encoding so the result does not depend on the host locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            word_freq.update(re.findall(r'\w+', f.read().lower()))

    V = set(word_freq)

    # Compute the grand total once. Re-evaluating sum(...) inside the
    # comprehension made this step quadratic in the vocabulary size.
    total = sum(word_freq.values())
    probs = {word: count / total for word, count in word_freq.items()}
    return V, word_freq, probs


V, word_freq, probs = load_vocabulary()
38
+
39
# Jaccard similarity scores between one input word and the whole vocabulary
def precompute_similarities(input_word):
    """Score every vocabulary word against ``input_word``.

    Similarity is ``1 - Jaccard distance`` computed with
    ``textdistance.Jaccard(qval=2)``.

    Args:
        input_word: The word to compare; it is lower-cased before scoring.

    Returns:
        A list of floats, one per key of the module-level ``word_freq``, in
        the same order as ``word_freq`` iterates its keys.
    """
    input_word = input_word.lower()
    # Build the metric object once; it does not depend on the vocabulary word,
    # so constructing it inside the loop was repeated loop-invariant work.
    jaccard = textdistance.Jaccard(qval=2)
    return [1 - jaccard.distance(v, input_word) for v in word_freq]
44
+
45
def my_autocorrect(input_paragraph, top_n=5):
    """Find words missing from the vocabulary and rank replacement candidates.

    The paragraph is lower-cased and tokenised with ``\\w+``. Every token that
    is not in the module-level vocabulary ``V`` is reported together with a
    table of the ``top_n`` vocabulary words, sorted by Jaccard similarity and
    then by corpus probability (both descending).

    Args:
        input_paragraph: Free text to check.
        top_n: Number of suggestions to keep per misspelled word.

    Returns:
        A ``(incorrect_words, corrected_words)`` tuple of equal-length lists.
        ``incorrect_words[i]`` is the i-th unknown token, in order of
        appearance and with repeats kept. ``corrected_words[i]`` is a
        DataFrame with columns ``Word``, ``Similarity``, ``Prob`` and a
        1-based index. Repeated occurrences of the same token share one
        DataFrame object, so callers should treat the tables as read-only.
    """
    words_in_paragraph = re.findall(r'\w+', input_paragraph.lower())
    incorrect_words = []
    corrected_words = []

    # The Word/Prob table is identical for every misspelled word, so build it
    # at most once per call (lazily, so clean text pays nothing).
    vocab_table = None
    # Scoring a word scans the entire vocabulary; reuse the ranking when the
    # same misspelling occurs more than once in the paragraph.
    suggestions_by_word = {}

    for word in words_in_paragraph:
        if word in V:
            continue

        if word not in suggestions_by_word:
            if vocab_table is None:
                vocab_table = (
                    pd.DataFrame.from_dict(probs, orient='index')
                    .reset_index()
                    .rename(columns={'index': 'Word', 0: 'Prob'})
                )
            # precompute_similarities yields one score per word_freq key, and
            # probs shares that key order, so the scores line up with the rows.
            ranked = vocab_table.assign(Similarity=precompute_similarities(word))
            ranked = ranked.sort_values(['Similarity', 'Prob'], ascending=False).head(top_n)
            ranked = ranked[['Word', 'Similarity', 'Prob']].reset_index(drop=True)
            ranked.index = ranked.index + 1  # show ranks starting at 1
            suggestions_by_word[word] = ranked

        incorrect_words.append(word)
        corrected_words.append(suggestions_by_word[word])

    return incorrect_words, corrected_words
62
+
63
# Function for grammar correction
def correct_grammar(input_text, num_return_sequences=2):
    """Generate grammar-corrected rewrites of ``input_text`` with the T5 model.

    The text is tokenised as a single-item batch, truncated/padded to 64
    tokens, and decoded with beam search using the module-level ``tokenizer``,
    ``model`` and ``torch_device``.

    Args:
        input_text: The sentence to correct. Anything beyond 64 tokens is
            dropped by the tokenizer's truncation.
        num_return_sequences: How many candidate corrections to return.

    Returns:
        A list of ``num_return_sequences`` decoded strings with special
        tokens removed.
    """
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors="pt",
    ).to(torch_device)

    # Beam search can return at most num_beams sequences, so widen the beam
    # when the caller asks for more than the default of 4.
    num_beams = max(4, num_return_sequences)

    # No temperature argument: sampling is not enabled, so decoding is pure
    # beam search and a temperature would be ignored.
    translated = model.generate(
        **batch,
        max_length=64,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    return tokenizer.batch_decode(translated, skip_special_tokens=True)
69
+
70
# Streamlit app layout

# Static Markdown copy rendered by main(). Kept as module-level constants so
# the page-building code below stays readable.
_INTRO_MD = """
Welcome to the **Spell & Grammar Checker**! This app is designed to help you improve your writing by detecting and correcting spelling and grammar errors. Simply enter a paragraph below and let the app do the rest. Each section provides unique suggestions to refine your text.
"""

_SPELL_ABOUT_MD = """
**About the Spell Checker:**
Our spell checker uses a vocabulary from multiple literary texts to detect potential misspellings. It offers suggestions ranked by similarity and probability, helping you to identify and correct errors with ease.
**How to use:**
Enter a paragraph and click **Check Spelling** to see any misspelled words along with suggestions.
"""

_GRAMMAR_ABOUT_MD = """
**About the Grammar Checker:**
Powered by a fine-tuned T5 model, our grammar checker analyzes each sentence for potential errors in structure, tense, and word choice. It offers refined suggestions to enhance readability and grammatical accuracy.
**How to use:**
Enter a paragraph and click **Check Grammar** to review each sentence with suggested improvements.
"""

_GRAMMAR_INFO_MD = """
### Grammar Checker Model
The Grammar Checker model, fine-tuned for grammatical error correction (GEC), is ideal for enhancing writing quality across various domains. Below, you'll find relevant resources related to this model's development and usage.

- 🔗 **[Finetuned Model on Hugging Face](https://huggingface.co/abhinavsarkar/Google-T5-base-Grammatical_Error_Correction-Finetuned-C4-200M-550k)**
Access the model details, fine-tuning specifics, and download options on Hugging Face.

- 📊 **[Used Dataset on Hugging Face](https://huggingface.co/datasets/abhinavsarkar/C4-200m-550k-Determiner)**
Explore the pre-processed dataset used to train this model.

- 📂 **[Original Dataset URL](https://www.kaggle.com/datasets/felixstahlberg/the-c4-200m-dataset-for-gec)**
This dataset contains 200 million sentences with diverse structures, hosted on Kaggle.

- 🛠️ **[GitHub Repository](https://github.com/AbhinavSarkarr/Spell-and-Grammer-Checker)**
Access the code repository for dataset preparation, model training, and additional development resources.
"""

_SPELL_INFO_MD = """
### Spell Checker
The Spell Checker leverages a corpus containing multiple text resources to suggest corrections for spelling errors. The algorithm uses **Jaccard Similarity** and **Relative Probability** to identify the closest matches to the input words, ensuring accuracy in suggestions.

- 📂 **[Corpus Resource](https://drive.google.com/drive/u/0/folders/1WsvpWHKUv3OI2mRce-NPg4HsVPyhfk0e)**
The vocabulary for this checker is based on a collection of literary works and publicly available texts.
"""

# Any run of whitespace (spaces *or* newlines) that follows sentence-ending
# punctuation counts as a sentence boundary.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def _split_sentences(paragraph):
    """Split ``paragraph`` into its non-blank sentences."""
    pieces = _SENTENCE_BOUNDARY.split(paragraph.strip())
    return [piece for piece in pieces if piece.strip()]


def main():
    """Render the Spell & Grammar Checker page.

    Layout: a title and intro, one shared text area, then two side-by-side
    columns (spell checker on the left, grammar checker on the right),
    followed by two informational sections.

    Results of each checker are stored in ``st.session_state``
    (``spelling_results`` / ``grammar_results``) so that the output of one
    checker stays visible when the other button is pressed and the script
    reruns.
    """
    st.title("📚 Intelligent Spell & Grammar Checker")
    st.markdown(_INTRO_MD)

    paragraph = st.text_area("✨ Enter a paragraph to check for spelling and grammar issues:", height=200)
    # Treat whitespace-only input like empty input so the user gets the
    # warning instead of an empty result.
    has_text = bool(paragraph.strip())

    # Two side-by-side sections
    col1, col2 = st.columns(2)

    # Initialize session state for storing results
    if 'spelling_results' not in st.session_state:
        st.session_state.spelling_results = None
    if 'grammar_results' not in st.session_state:
        st.session_state.grammar_results = None

    with col1:
        st.header("🔍 Spell Checker")
        st.markdown(_SPELL_ABOUT_MD)

        if st.button("Check Spelling"):
            if has_text:
                with st.spinner("Checking spelling..."):
                    incorrect_words, corrected_words = my_autocorrect(paragraph)
                    if incorrect_words:
                        st.session_state.spelling_results = (incorrect_words, corrected_words)
                    else:
                        # A plain string in the first slot marks the
                        # "nothing to report" case for the renderer below.
                        st.session_state.spelling_results = ("✅ No spelling errors detected!", [])
            else:
                st.warning("Please enter a paragraph to check for spelling.")

        if st.session_state.spelling_results:
            incorrect_words, corrected_words = st.session_state.spelling_results
            if isinstance(incorrect_words, str):
                st.success(incorrect_words)
            else:
                st.subheader("🔴 Spelling Errors & Suggestions:")
                for word, suggestions_df in zip(incorrect_words, corrected_words):
                    st.write(f"**Misspelled Word**: `{word}`")
                    with st.expander(f"Suggestions for `{word}`"):
                        st.table(suggestions_df[['Word', 'Similarity', 'Prob']])

    with col2:
        st.header("📝 Grammar Checker")
        st.markdown(_GRAMMAR_ABOUT_MD)

        if st.button("Check Grammar"):
            if has_text:
                with st.spinner("Checking grammar..."):
                    # Correct sentence by sentence: the model input is capped
                    # at 64 tokens, so long multi-sentence chunks would be cut.
                    grammar_results = []
                    for sentence in _split_sentences(paragraph):
                        corrected_sentences = correct_grammar(sentence, num_return_sequences=2)
                        grammar_results.append((sentence, corrected_sentences))
                    st.session_state.grammar_results = grammar_results
            else:
                st.warning("Please enter a paragraph to check for grammar.")

        if st.session_state.grammar_results:
            st.subheader("🔵 Grammar Corrections:")
            for sentence, corrected_sentences in st.session_state.grammar_results:
                with st.expander(f"**Original Sentence:** {sentence}", expanded=True):
                    st.write("### Suggestions:")
                    for corrected_sentence in corrected_sentences:
                        st.write(f"- {corrected_sentence}")

    # Model details section
    st.markdown("---")
    st.header("📘 Grammar Checker Information")
    st.markdown(_GRAMMAR_INFO_MD)

    # Spell Checker Information
    st.markdown("---")
    st.header("🔍 Spell Checker Information")
    st.markdown(_SPELL_INFO_MD)


# Run the app
if __name__ == "__main__":
    main()