Upload 8 files
Browse files- app.py +24 -0
- data_analysis.py +15 -0
- data_cleaning.py +61 -0
- data_preparing.py +0 -0
- data_splitting.py +0 -0
- model.py +0 -0
- model_callbacks.py +13 -0
- predict.py +10 -0
app.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from flask import Flask, request, render_template, jsonify
from predict import predict_language
import joblib
import tensorflow as tf
import h5py
import os

# Load all inference artifacts once at startup.
# Fix: the original hard-coded Windows '\\' path separators, which break on
# POSIX systems; os.path.join is portable (and still correct on Windows).
model = tf.keras.models.load_model(
    os.path.join('models', 'full_language_identifcation_modelf.h5'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
CountVectorizer = joblib.load(os.path.join('models', 'cv.joblib'))
LabelEncoder = joblib.load(os.path.join('models', 'le.joblib'))


app = Flask(__name__)


@app.route('/', methods=['GET', 'POST'])
def predict():
    """Serve the input form on GET; on POST, run language prediction on
    the submitted 'text' field and render the result page."""
    if request.method == 'POST':
        text = request.form['text']
        prediction = predict_language(text, model, CountVectorizer, LabelEncoder)  # Call your prediction function
        return render_template('result.html', prediction=prediction, text=text)
    return render_template('index.html')


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive debugger and reloader --
    # do not use in production.
    app.run(debug=True)
|
data_analysis.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
import os

# Load the raw dataset.
# Fix: the original hard-coded a Windows '\\' path separator, which breaks on
# POSIX systems; os.path.join is portable (and still correct on Windows).
df = pd.read_csv(os.path.join('data', 'dataset.csv'))

# Exploratory snippets kept for reference (run interactively as needed):
# df.head()

# df.info()

# df.isnull().sum()

# df.language.value_counts()

# df.text[0]

# df.language[0]
|
data_cleaning.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from data_analysis import df
|
| 2 |
+
from nltk.tokenize import word_tokenize
|
| 3 |
+
import re
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import nltk
|
| 6 |
+
|
| 7 |
+
#Removing Duplicates
|
| 8 |
+
# df = df.drop_duplicates(subset='Text')
|
| 9 |
+
# df = df.reset_index(drop=True)
|
| 10 |
+
|
| 11 |
+
# Fetch the Punkt tokenizer models that word_tokenize needs (no-op if cached).
nltk.download('punkt')
# Initialize the list of non-alphanumeric (punctuation) tokens to remove.
# NOTE(review): this is a list, not a set; membership tests are O(n).
nonalphanumeric = ['\'', '.', ',', '\"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
                   '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
                   '/', '>', '<', '|', ' ']
|
| 16 |
+
|
| 17 |
+
def clean_text(text):
    """
    Clean and preprocess text data.

    Tokenizes *text* with NLTK's ``word_tokenize`` (requires the 'punkt'
    resource downloaded above), drops punctuation tokens listed in the
    module-level ``nonalphanumeric`` list, lowercases the surviving tokens,
    and joins them back into a single space-separated string.

    Fix: the original comments claimed spaCy tokenization and lemmatization;
    neither happens -- tokenization is NLTK and no lemmatizer is applied.
    """
    # Tokenize the text using NLTK
    tokens = word_tokenize(text)

    # Remove punctuation tokens; a set makes membership O(1) instead of
    # scanning the module-level list for every token.
    punctuation = set(nonalphanumeric)
    words = [word.lower() for word in tokens if word not in punctuation]

    # Join the cleaned words back into a single string
    cleaned_text = " ".join(words)

    return cleaned_text
|
| 31 |
+
|
| 32 |
+
def remove_english(text):
    """
    Return *text* with every run of ASCII letters deleted.

    NOTE(review): this strips any Latin-script letters, not specifically
    English words; in this project it is only applied to Chinese text,
    where that distinction does not matter.
    """
    return re.sub("[a-zA-Z]+", "", text)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
#applying clean_text function to all rows in 'Text' column
|
| 42 |
+
# df['clean_text'] = df['Text'].apply(clean_text)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# #Removing English from Chinese text
|
| 47 |
+
# df_Chinese = df[df['language']=='Chinese'] # Chinese data in dataset
|
| 48 |
+
|
| 49 |
+
# clean_text = df.loc[df.language=='Chinese']['clean_text']
|
| 50 |
+
# clean_text = clean_text.apply(remove_english) # removing English words
|
| 51 |
+
# df_Chinese.loc[:,'clean_text'] = clean_text
|
| 52 |
+
|
| 53 |
+
# # Concatenate the original DataFrame with the cleaned Chinese text DataFrame
|
| 54 |
+
# df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
|
| 55 |
+
|
| 56 |
+
# # Drop rows with 'Chinese' language from the original DataFrame
|
| 57 |
+
# df = df[~df['language'].isin(['Chinese'])].reset_index(drop=True)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# # shuffling dataframe and resetting index
|
| 61 |
+
# df = df.sample(frac=1).reset_index(drop=True)
|
data_preparing.py
ADDED
|
File without changes
|
data_splitting.py
ADDED
|
File without changes
|
model.py
ADDED
|
File without changes
|
model_callbacks.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tensorflow as tf
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def lr_scheduler(epoch, lr):
    """Keras learning-rate schedule: keep *lr* unchanged for the first
    3 epochs, then decay it by a factor of exp(-0.1) each epoch."""
    return lr if epoch < 3 else lr * tf.math.exp(-0.1)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Bug fix: the original wrote ``tf.keras.callbacksEarlyStopping`` and
# ``tf.keras.callbacksLearningRateScheduler`` (missing '.'), which raises
# AttributeError the moment this module is imported.

# Stop training when val_loss has not improved for 4 epochs, keeping the
# best weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
# Apply the lr_scheduler function defined above at the start of each epoch.
lr_scheduler_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
# Adam optimizer; 0.001 is the initial rate the schedule decays from.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
|
predict.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from data_cleaning import clean_text
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def predict_language(text, model, cv, le):
    """
    Return the predicted language label for *text*.

    The text is cleaned with ``clean_text``, vectorized by the fitted
    CountVectorizer *cv*, scored by the Keras *model*, and the index of the
    highest-scoring class is mapped back to its label via the fitted
    LabelEncoder *le*.
    """
    vector = cv.transform([clean_text(text)])
    scores = model.predict(vector)
    # presumably scores has shape (1, n_classes), so a flat argmax picks
    # the top class for the single input -- verify against the model.
    best_class = np.argmax(scores)
    return le.inverse_transform([best_class])[0]  # Get the first element of the list
|