File size: 5,474 Bytes
5626a1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import pandas as pd
import numpy as np
import tensorflow as tf
from keras import Model, Input
from keras.api.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Reshape, TimeDistributed
from sklearn.model_selection import train_test_split
from keras.api.utils import to_categorical
from PIL import Image
import string
class TextTypeLangModel:
    """Multi-task CNN over 128x128 RGB crops.

    Predicts three things per image:
      * a fixed-length 10-character transcription (per-step softmax over the
        character vocabulary, index 0 reserved for blank/unknown),
      * a 7-class "type" label,
      * a 2-class "language" label (0 = English, 1 = Urdu per the caller).

    The CSV at ``csv_path`` must provide the columns
    'Cropped Image Path', 'Text', 'type' (int 0-6) and 'language' (int 0-1).
    """

    def __init__(self, csv_path):
        self.df = pd.read_csv(csv_path)
        # NOTE: the raw alphabet string contains repeated Urdu glyphs; without
        # de-duplication char_to_index (last index wins) and index_to_char
        # (duplicate glyphs at several indices) disagree with num_chars.
        # dict.fromkeys removes duplicates while preserving first-seen order.
        raw_alphabet = string.ascii_letters + string.digits + " " + \
            "آبپچڈڑڤکگہہٹژزسٹطظعغفقکگلاںمںنۓہھوؤ" + \
            "ےیئؤٹپجچحخدڈذرزسشصضطظعغفقکلمنوٕں" + \
            "ۓۓہ۔،؛؟"
        self.characters = "".join(dict.fromkeys(raw_alphabet))
        self.num_chars = len(self.characters) + 1  # +1: index 0 is blank/unknown
        self.char_to_index = {c: i + 1 for i, c in enumerate(self.characters)}
        self.index_to_char = {i + 1: c for i, c in enumerate(self.characters)}
        self.model = self.build_model()

    def encode_text(self, text, max_len=10):
        """Encode ``text`` as a list of ``max_len`` character indices.

        Longer strings are truncated, shorter ones right-padded with spaces
        (space is part of the alphabet). Unknown characters map to 0.
        """
        text = text[:max_len].ljust(max_len)
        return [self.char_to_index.get(c, 0) for c in text]

    def preprocess_image(self, image_path):
        """Load an image, force RGB, resize to 128x128 and scale to [0, 1]."""
        image = Image.open(image_path).convert("RGB")
        image = image.resize((128, 128))
        return np.array(image) / 255.0

    def prepare_data(self):
        """Build arrays from the dataframe and return an 80/20 split.

        Returns the 8-tuple produced by ``train_test_split``:
        (X_train, X_test, y_train_text, y_test_text,
         y_train_type, y_test_type, y_train_lang, y_test_lang).
        """
        X_images = np.array([self.preprocess_image(img) for img in self.df['Cropped Image Path']])
        y_text = np.array([self.encode_text(txt) for txt in self.df['Text']])
        y_type = to_categorical(self.df['type'].values, num_classes=7)
        y_lang = to_categorical(self.df['language'].values, num_classes=2)
        return train_test_split(X_images, to_categorical(y_text, self.num_chars),
                                y_type, y_lang, test_size=0.2, random_state=42)

    def build_model(self):
        """Build and compile the shared-trunk, three-head model.

        The type/language heads branch off the *flat* image features so each
        image gets exactly one prediction; only the text head is reshaped into
        10 time steps. (Previously the classification heads were applied to
        the 3-D per-time-step tensor, forcing the training labels to be
        repeated along a fake time axis.)
        """
        input_layer = Input(shape=(128, 128, 3))
        x = Conv2D(32, (3, 3), activation='relu')(input_layer)
        x = MaxPooling2D()(x)
        x = Conv2D(64, (3, 3), activation='relu')(x)
        x = MaxPooling2D()(x)
        features = Flatten()(x)

        # Text head: 10 time steps of 128 features, per-step char softmax.
        # TimeDistributed already emits (batch, 10, num_chars), so no extra
        # Reshape is needed on the output.
        text_features = Dense(1280, activation='relu')(features)  # 10 * 128
        seq = Reshape((10, 128))(text_features)
        text_output = TimeDistributed(Dense(self.num_chars, activation='softmax'),
                                      name='text_output')(seq)

        # Classification heads: one prediction per image, from flat features.
        output_type = Dense(7, activation='softmax', name='type_output')(features)
        output_lang = Dense(2, activation='softmax', name='lang_output')(features)

        model = Model(inputs=input_layer, outputs=[text_output, output_type, output_lang])
        model.compile(optimizer='adam',
                      loss={'text_output': 'categorical_crossentropy',
                            'type_output': 'categorical_crossentropy',
                            'lang_output': 'categorical_crossentropy'},
                      metrics={'text_output': 'accuracy',
                               'type_output': 'accuracy',
                               'lang_output': 'accuracy'})
        return model

    def train(self, epochs=10, batch_size=32):
        """Fit on the 80% split, validate on the 20% split, then save.

        Type/language labels are used as-is: the heads now emit one
        prediction per image, so no per-time-step label repetition is needed.
        """
        (X_train, X_test, y_train_text, y_test_text,
         y_train_type, y_test_type, y_train_lang, y_test_lang) = self.prepare_data()
        self.model.fit(X_train, [y_train_text, y_train_type, y_train_lang],
                       validation_data=(X_test, [y_test_text, y_test_type, y_test_lang]),
                       epochs=epochs, batch_size=batch_size)
        self.model.save("text_type_lang_model.h5")

    def predict(self, image_path):
        """Predict (text, type index, language index) for a single image."""
        image = self.preprocess_image(image_path)
        image = np.expand_dims(image, axis=0)  # add batch dimension
        pred_text, pred_type, pred_lang = self.model.predict(image)
        # Decode each of the 10 time steps; index 0 (blank) decodes to ''.
        pred_text = ''.join(self.index_to_char.get(np.argmax(pred_text[0][i]), '')
                            for i in range(10))
        return pred_text.strip(), np.argmax(pred_type), np.argmax(pred_lang)
def main():
    """Train the model, then sanity-check by predicting on the first image.

    Wrapped in a main() guard so importing this module no longer triggers a
    full training run as a side effect.
    """
    model = TextTypeLangModel("all_cropped_data_cleaned.csv")
    model.train()
    # Predict using the first image from the dataset
    first_image_path = model.df['Cropped Image Path'].iloc[0]
    predicted_text, predicted_type, predicted_language = model.predict(first_image_path)
    print("Predicted Text:", predicted_text)
    print("Predicted Type:", predicted_type)
    # Language head index 0 is English, 1 is Urdu.
    print("Predicted Language:", "English" if predicted_language == 0 else "Urdu")


if __name__ == "__main__":
    main()