import string

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Model, Input
from keras.api.layers import (Conv2D, Dense, Dropout, Flatten, MaxPooling2D,
                              Reshape, TimeDistributed)
from keras.api.utils import to_categorical
from PIL import Image
from sklearn.model_selection import train_test_split


class TextTypeLangModel:
    """Multi-output CNN over 128x128 RGB crops with three heads:

    * ``text_output`` -- a 10-step per-character softmax sequence,
    * ``type_output`` -- a 7-way type classifier,
    * ``lang_output`` -- a 2-way language classifier (0 = English, 1 = Urdu).
    """

    def __init__(self, csv_path):
        """Load the annotation CSV, build the character vocabulary and the model.

        The CSV is expected to provide the columns 'Cropped Image Path',
        'Text', 'type' (int in 0..6) and 'language' (int in 0..1).
        """
        self.df = pd.read_csv(csv_path)
        raw_chars = (string.ascii_letters + string.digits + " "
                     + "آبپچڈڑڤکگہہٹژزسٹطظعغفقکگلاںمںنۓہھوؤ"
                     + "ےیئؤٹپجچحخدڈذرزسشصضطظعغفقکلمنوٕں"
                     + "ۓۓہ۔،؛؟")
        # BUGFIX: the raw string contains many duplicate Urdu glyphs, which
        # inflated num_chars and made char_to_index/index_to_char silently
        # collapse entries.  Dedupe while preserving first-seen order so every
        # index maps to exactly one character.
        self.characters = "".join(dict.fromkeys(raw_chars))
        self.num_chars = len(self.characters) + 1  # index 0 reserved for blank/unknown
        self.char_to_index = {c: i + 1 for i, c in enumerate(self.characters)}
        self.index_to_char = {i + 1: c for i, c in enumerate(self.characters)}
        self.model = self.build_model()

    def encode_text(self, text, max_len=10):
        """Encode ``text`` as a fixed-length list of character indices.

        The text is truncated / right-padded with spaces to ``max_len``;
        characters outside the vocabulary map to 0 (the blank index).
        """
        text = text[:max_len].ljust(max_len)
        return [self.char_to_index.get(c, 0) for c in text]

    def preprocess_image(self, image_path):
        """Load an image, resize to 128x128 RGB and scale pixels to [0, 1]."""
        image = Image.open(image_path).convert("RGB")
        image = image.resize((128, 128))
        return np.array(image) / 255.0

    def prepare_data(self):
        """Build model-ready arrays from the CSV and return an 80/20 split.

        Returns the 8-tuple produced by ``train_test_split`` over
        (images, one-hot text, one-hot type, one-hot language).
        """
        X_images = np.array([self.preprocess_image(img)
                             for img in self.df['Cropped Image Path']])
        y_text = np.array([self.encode_text(txt) for txt in self.df['Text']])
        y_type = to_categorical(self.df['type'].values, num_classes=7)
        y_lang = to_categorical(self.df['language'].values, num_classes=2)
        return train_test_split(X_images,
                                to_categorical(y_text, self.num_chars),
                                y_type, y_lang,
                                test_size=0.2, random_state=42)

    def build_model(self):
        """Build and compile the multi-output CNN.

        Output shapes: text_output (batch, 10, num_chars),
        type_output (batch, 7), lang_output (batch, 2).
        """
        input_layer = Input(shape=(128, 128, 3))
        x = Conv2D(32, (3, 3), activation='relu')(input_layer)
        x = MaxPooling2D()(x)
        x = Conv2D(64, (3, 3), activation='relu')(x)
        x = MaxPooling2D()(x)
        x = Flatten()(x)
        # 1280 units = 10 time steps * 128 features, so the reshape below works.
        features = Dense(1280, activation='relu')(x)

        # Text head: view the features as a (10, 128) sequence and classify
        # each time step independently.  BUGFIX: the original added a second
        # Reshape after TimeDistributed, but its output is already
        # (batch, 10, num_chars) -- the extra layer was redundant.
        seq = Reshape((10, 128))(features)
        text_output = TimeDistributed(Dense(self.num_chars, activation='softmax'),
                                      name='text_output')(seq)

        # BUGFIX: the type/language heads previously consumed the 3-D sequence
        # tensor, yielding one prediction PER TIME STEP ((batch, 10, 7) etc.),
        # which train() papered over by repeating labels along a time axis.
        # They now read the pooled image features and emit a single
        # prediction per image.
        output_type = Dense(7, activation='softmax', name='type_output')(features)
        output_lang = Dense(2, activation='softmax', name='lang_output')(features)

        model = Model(inputs=input_layer,
                      outputs=[text_output, output_type, output_lang])
        # Losses/metrics keyed by output name so they cannot be mis-ordered.
        model.compile(optimizer='adam',
                      loss={'text_output': 'categorical_crossentropy',
                            'type_output': 'categorical_crossentropy',
                            'lang_output': 'categorical_crossentropy'},
                      metrics={'text_output': 'accuracy',
                               'type_output': 'accuracy',
                               'lang_output': 'accuracy'})
        return model

    def train(self, epochs=10, batch_size=32):
        """Train on the CSV data (80/20 split) and save the model to disk."""
        (X_train, X_test,
         y_train_text, y_test_text,
         y_train_type, y_test_type,
         y_train_lang, y_test_lang) = self.prepare_data()
        # The type/language heads now output one prediction per image, so the
        # one-hot labels are used as-is (the old time-axis np.repeat hack is gone).
        self.model.fit(X_train,
                       [y_train_text, y_train_type, y_train_lang],
                       validation_data=(X_test,
                                        [y_test_text, y_test_type, y_test_lang]),
                       epochs=epochs, batch_size=batch_size)
        self.model.save("text_type_lang_model.h5")

    def predict(self, image_path):
        """Predict for one image.

        Returns (decoded_text, type_index, language_index).
        """
        image = np.expand_dims(self.preprocess_image(image_path), axis=0)
        pred_text, pred_type, pred_lang = self.model.predict(image)
        # Greedy per-step decode; unknown/blank indices (0) decode to ''.
        decoded = ''.join(self.index_to_char.get(int(np.argmax(pred_text[0][i])), '')
                          for i in range(10))
        return decoded.strip(), int(np.argmax(pred_type)), int(np.argmax(pred_lang))


if __name__ == "__main__":
    # BUGFIX: guarded so importing this module no longer triggers training.
    model = TextTypeLangModel("all_cropped_data_cleaned.csv")
    model.train()

    # Predict using the first image from the dataset.
    first_image_path = model.df['Cropped Image Path'].iloc[0]
    predicted_text, predicted_type, predicted_language = model.predict(first_image_path)
    print("Predicted Text:", predicted_text)
    print("Predicted Type:", predicted_type)
    print("Predicted Language:", "English" if predicted_language == 0 else "Urdu")