|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import tensorflow as tf
|
|
|
from keras import Model, Input
|
|
|
from keras.api.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Reshape, TimeDistributed
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
from keras.api.utils import to_categorical
|
|
|
from PIL import Image
|
|
|
import string
|
|
|
|
|
|
class TextTypeLangModel:
    """Multi-task CNN over cropped text images.

    From a single 128x128 RGB crop, one shared convolutional backbone feeds
    three heads that jointly predict:
      * the transcribed text — one softmax per character position,
      * a text "type" label (``num_types`` classes),
      * a language label (``num_langs`` classes; 0=English, 1=Urdu).

    The annotations CSV must provide the columns 'Cropped Image Path',
    'Text', 'type' and 'language'.
    """

    def __init__(self, csv_path, max_text_len=10, num_types=7, num_langs=2):
        """Load the dataset CSV, build the vocabulary tables and the model.

        Args:
            csv_path: Path to the annotations CSV.
            max_text_len: Number of character positions decoded per image.
            num_types: Number of classes in the 'type' column.
            num_langs: Number of classes in the 'language' column.
        """
        self.df = pd.read_csv(csv_path)
        self.max_text_len = max_text_len
        self.num_types = num_types
        self.num_langs = num_langs

        # ASCII letters/digits/space plus the Urdu character set.  The raw
        # string contains repeated glyphs; dict.fromkeys drops the duplicates
        # while preserving order, so char_to_index and index_to_char are
        # consistent inverses and no dead softmax classes are created.
        raw_characters = string.ascii_letters + string.digits + " " + \
            "آبپچڈڑڤکگہہٹژزسٹطظعغفقکگلاںمںنۓہھوؤ" + \
            "ےیئؤٹپجچحخدڈذرزسشصضطظعغفقکلمنوٕں" + \
            "ۓۓہ۔،؛؟"
        self.characters = ''.join(dict.fromkeys(raw_characters))

        # Index 0 is reserved for out-of-vocabulary characters, hence the +1.
        self.num_chars = len(self.characters) + 1
        self.char_to_index = {c: i + 1 for i, c in enumerate(self.characters)}
        self.index_to_char = {i + 1: c for i, c in enumerate(self.characters)}
        self.model = self.build_model()

    def encode_text(self, text, max_len=None):
        """Encode *text* as a fixed-length list of character indices.

        The text is truncated and right-padded with spaces to *max_len*
        (defaults to ``self.max_text_len``); characters outside the
        vocabulary map to the reserved index 0.
        """
        if max_len is None:
            max_len = self.max_text_len
        text = text[:max_len].ljust(max_len)
        return [self.char_to_index.get(c, 0) for c in text]

    def preprocess_image(self, image_path):
        """Load an image as a (128, 128, 3) float array scaled to [0, 1]."""
        image = Image.open(image_path).convert("RGB")
        image = image.resize((128, 128))
        return np.array(image) / 255.0

    def prepare_data(self):
        """Build model-ready arrays from the CSV and split train/test.

        Returns the 8-tuple produced by ``train_test_split`` over
        (images, one-hot text, one-hot type, one-hot language).
        """
        X_images = np.array([self.preprocess_image(img)
                             for img in self.df['Cropped Image Path']])
        y_text = np.array([self.encode_text(txt) for txt in self.df['Text']])
        y_type = to_categorical(self.df['type'].values, num_classes=self.num_types)
        y_lang = to_categorical(self.df['language'].values, num_classes=self.num_langs)
        return train_test_split(X_images,
                                to_categorical(y_text, self.num_chars),
                                y_type, y_lang,
                                test_size=0.2, random_state=42)

    def build_model(self):
        """Build and compile the shared-backbone, three-head Keras model."""
        input_layer = Input(shape=(128, 128, 3))
        x = Conv2D(32, (3, 3), activation='relu')(input_layer)
        x = MaxPooling2D()(x)
        x = Conv2D(64, (3, 3), activation='relu')(x)
        x = MaxPooling2D()(x)
        x = Flatten()(x)

        # Project to max_text_len * 128 features, then fold into one
        # 128-dim feature vector per character position.
        x = Dense(128 * self.max_text_len, activation='relu')(x)
        x = Reshape((self.max_text_len, 128))(x)

        # Per-position character softmax.  The trailing Reshape is a no-op
        # kept only to attach the 'text_output' name to the head.
        time_distributed_text_output = TimeDistributed(
            Dense(self.num_chars, activation='softmax'))(x)
        text_output = Reshape((self.max_text_len, self.num_chars),
                              name='text_output')(time_distributed_text_output)

        # NOTE: these heads receive the 3-D per-position tensor, so they emit
        # one distribution per character slot; train() tiles the labels to
        # match and predict() averages the positions back into one vote.
        output_type = Dense(self.num_types, activation='softmax',
                            name='type_output')(x)
        output_lang = Dense(self.num_langs, activation='softmax',
                            name='lang_output')(x)

        model = Model(inputs=input_layer,
                      outputs=[text_output, output_type, output_lang])
        model.compile(optimizer='adam',
                      loss=['categorical_crossentropy'] * 3,
                      metrics={'text_output': 'accuracy',
                               'type_output': 'accuracy',
                               'lang_output': 'accuracy'})
        return model

    def _tile_labels(self, y):
        """Repeat one-hot labels (N, C) -> (N, max_text_len, C)."""
        return np.repeat(np.expand_dims(y, axis=1), self.max_text_len, axis=1)

    def train(self, epochs=10, batch_size=32):
        """Fit the model on the CSV data and save it to disk."""
        (X_train, X_test,
         y_train_text, y_test_text,
         y_train_type, y_test_type,
         y_train_lang, y_test_lang) = self.prepare_data()

        # The type/lang heads output one distribution per character position,
        # so tile each single per-image label across all positions.
        y_train_type = self._tile_labels(y_train_type)
        y_test_type = self._tile_labels(y_test_type)
        y_train_lang = self._tile_labels(y_train_lang)
        y_test_lang = self._tile_labels(y_test_lang)

        self.model.fit(X_train, [y_train_text, y_train_type, y_train_lang],
                       validation_data=(X_test,
                                        [y_test_text, y_test_type, y_test_lang]),
                       epochs=epochs, batch_size=batch_size)
        self.model.save("text_type_lang_model.h5")

    def predict(self, image_path):
        """Predict ``(text, type_index, language_index)`` for one image.

        The type/lang heads yield one distribution per character position;
        their probabilities are averaged across positions before the argmax.
        (Bug fix: the previous code took argmax over the flattened
        (positions, classes) array, returning a meaningless flat index
        instead of a class id.)
        """
        image = np.expand_dims(self.preprocess_image(image_path), axis=0)
        pred_text, pred_type, pred_lang = self.model.predict(image)

        text = ''.join(
            self.index_to_char.get(int(np.argmax(pred_text[0][i])), '')
            for i in range(self.max_text_len))
        type_idx = int(np.argmax(pred_type[0].mean(axis=0)))
        lang_idx = int(np.argmax(pred_lang[0].mean(axis=0)))
        return text.strip(), type_idx, lang_idx
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Train the multi-task model, then run one sanity prediction."""
    model = TextTypeLangModel("all_cropped_data_cleaned.csv")
    model.train()

    # Sanity check: predict on the first training image.
    first_image_path = model.df['Cropped Image Path'].iloc[0]
    predicted_text, predicted_type, predicted_language = model.predict(first_image_path)

    print("Predicted Text:", predicted_text)
    print("Predicted Type:", predicted_type)
    print("Predicted Language:", "English" if predicted_language == 0 else "Urdu")


# Guard the entry point so importing this module does not kick off training.
if __name__ == "__main__":
    main()