|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import pickle |
|
|
from tqdm import tqdm |
|
|
from Levenshtein import distance as lev |
|
|
import joblib |
|
|
from googletrans import Translator |
|
|
from indictrans import Transliterator |
|
|
from pyphonetics import RefinedSoundex |
|
|
from bs4 import BeautifulSoup |
|
|
import re |
|
|
|
|
|
def closest_match(word, vocabulary):
    """Return the vocabulary entry closest to *word* by Levenshtein distance.

    Ties are broken in favor of the entry encountered first while
    iterating *vocabulary*; an empty vocabulary yields None.
    """
    return min(vocabulary, key=lambda candidate: lev(word, candidate), default=None)
|
|
|
|
|
def main():
    """Streamlit app that normalizes code-mixed English/Hinglish text.

    Pipeline: clean the raw text, exact-match each word against pickled
    English and Hinglish vocabularies, phonetically correct the leftovers,
    classify per-word language, translate English-tagged words to Hindi,
    transliterate the romanized result to Devanagari, and finally translate
    the whole sentence back to English.
    """
    st.title('Text Processing App')

    rs = RefinedSoundex()
    normalized_string_final = []
    translator = Translator()
    trn = Transliterator(source='eng', target='hin')

    # Each vocab pickle maps canonical word -> list of variant spellings.
    # NOTE(review): pickle.load on these files assumes they are trusted.
    with open(r'./english_vocab.pkl', "rb") as fp:
        english_vocab = pickle.load(fp)
    with open(r'./hinglish_vocab.pkl', "rb") as fp:
        hinglish_vocab = pickle.load(fp)

    # Guarantee two very common words normalize to themselves.
    english_vocab['and'] = ['and']
    english_vocab['is'] = ['is']

    def clean_tweet(tweet):
        """Strip handles, URLs, HTML markup, non-letters and RT/nan tokens."""
        text = re.sub(r'@ [A-Za-z0-9\']+', '', tweet)
        text = BeautifulSoup(text, 'lxml').get_text()
        text = re.sub(r'https (//)[A-Za-z0-9. ]*(/) [A-Za-z0-9]+', '', text)
        text = re.sub(r'https[A-Za-z0-9/. ]*', '', text)
        text = re.sub("[^a-zA-Z]", " ", text)
        text = re.sub(r'\bRT\b', ' ', text)
        text = re.sub(r'\bnan\b', ' ', text)
        return text

    def exact_matches(words, vocab):
        """Return {word_index: canonical_key} for every word that exactly
        matches one of the variant spellings listed in *vocab*."""
        found = {}
        for idx, word in enumerate(words):
            for key, variants in vocab.items():
                if word in variants:
                    found[idx] = key
                    break
        return found

    input_text = st.text_area("Enter the text:")
    total_translated = []

    if st.button('Process'):
        df1 = pd.DataFrame({'Text': [input_text]})
        df1['Text'] = df1['Text'].apply(clean_tweet)
        total_text = [df1['Text'].tolist()[0]]
        st.write("Input Text:", total_text)

        for sample in tqdm(total_text):
            test_text = sample.split()

            # Pass 1: exact English vocabulary matches, sentence order kept.
            res = exact_matches(test_text, english_vocab)
            changed_idx = list(res)
            normalized_string = [res.get(i, w) for i, w in enumerate(test_text)]
            print("English Normalized String:", normalized_string)

            # Pass 2: words untouched by pass 1 are tried against the
            # Hinglish vocabulary.
            changed_hing_idx = [i for i in range(len(test_text)) if i not in res]
            hinglish_text_part = [test_text[i] for i in changed_hing_idx]
            res2 = exact_matches(hinglish_text_part, hinglish_vocab)
            normalized_string2 = [res2.get(i, w)
                                  for i, w in enumerate(hinglish_text_part)]
            # English matches are appended at the end (as in the original
            # pipeline), so normalized_string2 does NOT preserve sentence
            # order; it feeds display and the suggestion pass only.
            for i in changed_idx:
                normalized_string2.append(res[i])
            # BUG FIX: this print used to show normalized_string (pass 1).
            print("Hinglish Normalized String:", normalized_string2)

            # Pass 3: phonetic + edit-distance suggestion for words neither
            # vocab matched. Best effort: any failure leaves a word as-is.
            for i in range(len(test_text)):
                if i in res:
                    continue  # already normalized by the English pass
                try:
                    # Candidate keys within Refined Soundex distance 1.
                    phoneme_candidates = []
                    for key in english_vocab:
                        try:
                            # BUG FIX: the original reused a stale `phoneme`
                            # value when rs.distance raised; now skip cleanly.
                            if rs.distance(normalized_string2[i], key) <= 1:
                                phoneme_candidates.append(key)
                        except Exception:
                            continue
                    # Narrow to candidates within Levenshtein distance 2.
                    candidates = [k for k in phoneme_candidates
                                  if lev(normalized_string2[i], k) <= 2]
                    # BUG FIX: the original extended this list with the
                    # undefined name `hing_lev_correction`; the resulting
                    # NameError was swallowed by a bare except, so no
                    # suggestion was ever applied.
                    if not candidates:
                        continue
                    best = min(candidates,
                               key=lambda k: lev(normalized_string2[i], k))
                    normalized_string2[i] = closest_match(best,
                                                          english_vocab.keys())
                except Exception:
                    # Best-effort correction only; keep the word unchanged.
                    pass

            normalized_string_final = normalized_string2
            print("Phoneme levenshtein Dictionary suggestion Normalized String:",
                  normalized_string_final)

            # Per-word language ID; model returns [{'label': 'en' | ...}].
            # NOTE(review): the loaded object is called like a pipeline —
            # confirm the joblib artifact is a callable classifier.
            classifier = joblib.load(r"./classifer.joblib")
            classify = [classifier(word)[0].get("label")
                        for word in normalized_string]

            # Snapshot the English normalization before it is mutated by
            # translation, so the display below stays truthful.
            english_normalized = list(normalized_string)

            # Translate English-tagged words to Hindi in place; a failed
            # translation marks the word "delete".
            for i in range(len(classify)):
                if classify[i] == 'en':
                    try:
                        normalized_string[i] = translator.translate(
                            normalized_string[i], src='en', dest='hi').text
                    except Exception:
                        normalized_string[i] = "delete"
            print("English -> Hindi Translated String:", normalized_string)

            # Transliterate romanized Hindi to Devanagari.
            conversion_list = [trn.transform(word) for word in normalized_string]
            print("Hinglish -> Hindi Transliterated String:", conversion_list)

            sentence = [" ".join(conversion_list)]
            translated = []
            for chunk in sentence:
                try:
                    translated.append(
                        translator.translate(chunk, src='hi', dest='en').text)
                except Exception:
                    translated.append("delete")
            print("Hindi -> English Translated String:", translated)
            total_translated.append(translated[0])

        # BUG FIX: three of these labels previously displayed
        # normalized_string after it had been mutated by translation.
        st.write("English Normalized String:", english_normalized)
        st.write("Hinglish Normalized String:", normalized_string_final)
        st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:",
                 normalized_string_final)
        st.write("English -> Hindi Translated String:", normalized_string)
        st.write("Hinglish -> Hindi Transliterated String:", conversion_list)
        st.write("Hindi -> English Translated String:", translated)
|
|
|
|
|
# Launch the Streamlit app when this file is executed as a script.
if __name__ == '__main__':


    main()