# Doctor_Handwriting_Text_Detection / train_from_scratch.py
# Uploaded by Umer2762 via huggingface_hub (commit 5626a1a, verified).
import pandas as pd
import numpy as np
import tensorflow as tf
from keras import Model, Input
from keras.api.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Reshape, TimeDistributed
from sklearn.model_selection import train_test_split
from keras.api.utils import to_categorical
from PIL import Image
import string
class TextTypeLangModel:
    """Multi-task CNN trained from a CSV of cropped handwriting images.

    From a 128x128 RGB crop the model predicts:
      * a fixed-length 10-character transcription (per-timestep softmax),
      * a document/text type (7 classes),
      * a language (2 classes: 0 = English, 1 = Urdu per the caller's usage).
    """

    def __init__(self, csv_path):
        """Load the dataset CSV, build the character maps and the model.

        Args:
            csv_path: CSV with columns 'Cropped Image Path', 'Text',
                'type' (int in [0, 7)) and 'language' (int in [0, 2)).
                # assumed from how prepare_data() consumes the frame — TODO confirm
        """
        self.df = pd.read_csv(csv_path)
        # The raw alphabet literal contains repeated Urdu characters.
        # Duplicates break the round trip: char_to_index keeps the LAST
        # duplicate's index while index_to_char keeps an entry for every
        # duplicate, and num_chars is inflated. Deduplicate while
        # preserving first-seen order (dict preserves insertion order).
        raw_alphabet = string.ascii_letters + string.digits + " " + \
            "آبپچڈڑڤکگہہٹژزسٹطظعغفقکگلاںمںنۓہھوؤ" + \
            "ےیئؤٹپجچحخدڈذرزسشصضطظعغفقکلمنوٕں" + \
            "ۓۓہ۔،؛؟"
        self.characters = "".join(dict.fromkeys(raw_alphabet))
        self.num_chars = len(self.characters) + 1  # index 0 reserved for blank/unknown
        self.char_to_index = {c: i + 1 for i, c in enumerate(self.characters)}
        self.index_to_char = {i + 1: c for i, c in enumerate(self.characters)}
        self.model = self.build_model()

    def encode_text(self, text, max_len=10):
        """Encode *text* as a list of ``max_len`` character indices.

        Text is trimmed to ``max_len`` and right-padded with spaces;
        characters outside the alphabet map to 0 (the blank index).
        """
        text = text[:max_len].ljust(max_len)
        return [self.char_to_index.get(c, 0) for c in text]

    def preprocess_image(self, image_path):
        """Load an image, resize to 128x128 RGB and scale to [0, 1]."""
        image = Image.open(image_path).convert("RGB")
        image = image.resize((128, 128))
        return np.array(image) / 255.0

    def prepare_data(self):
        """Build model-ready arrays and return an 80/20 train/test split.

        Returns the 8-tuple produced by ``train_test_split``:
        (X_train, X_test, y_text_train, y_text_test,
         y_type_train, y_type_test, y_lang_train, y_lang_test).
        """
        X_images = np.array([self.preprocess_image(img) for img in self.df['Cropped Image Path']])
        y_text = np.array([self.encode_text(txt) for txt in self.df['Text']])
        y_type = to_categorical(self.df['type'].values, num_classes=7)
        y_lang = to_categorical(self.df['language'].values, num_classes=2)
        # y_text becomes one-hot of shape (n, 10, num_chars) to match text_output.
        return train_test_split(X_images, to_categorical(y_text, self.num_chars),
                                y_type, y_lang, test_size=0.2, random_state=42)

    def build_model(self):
        """Build and compile the three-headed CNN.

        The type/language heads branch off the shared dense trunk, NOT the
        (10, 128) reshaped sequence: the original applied Dense(7)/Dense(2)
        after the Reshape, producing per-timestep class predictions and
        forcing the labels to be repeated 10x in train().
        """
        inputs = Input(shape=(128, 128, 3))
        x = Conv2D(32, (3, 3), activation='relu')(inputs)
        x = MaxPooling2D()(x)
        x = Conv2D(64, (3, 3), activation='relu')(x)
        x = MaxPooling2D()(x)
        features = Flatten()(x)
        # 10 time steps * 128 features per step.
        trunk = Dense(10 * 128, activation='relu')(features)
        seq = Reshape((10, 128))(trunk)  # (batch, 10, 128)
        # Per-timestep character distribution -> (batch, 10, num_chars).
        text_output = TimeDistributed(Dense(self.num_chars, activation='softmax'),
                                      name='text_output')(seq)
        # One distribution per image for type and language.
        type_output = Dense(7, activation='softmax', name='type_output')(trunk)
        lang_output = Dense(2, activation='softmax', name='lang_output')(trunk)
        model = Model(inputs=inputs, outputs=[text_output, type_output, lang_output])
        model.compile(optimizer='adam',
                      loss=['categorical_crossentropy', 'categorical_crossentropy', 'categorical_crossentropy'],
                      metrics={'text_output': 'accuracy', 'type_output': 'accuracy', 'lang_output': 'accuracy'})
        return model

    def train(self, epochs=10, batch_size=32):
        """Fit the model on the CSV data and save it to disk."""
        (X_train, X_test,
         y_train_text, y_test_text,
         y_train_type, y_test_type,
         y_train_lang, y_test_lang) = self.prepare_data()
        # Type/language heads now emit a single distribution per image, so
        # the one-hot labels are used as-is (no per-timestep repetition).
        self.model.fit(X_train, [y_train_text, y_train_type, y_train_lang],
                       validation_data=(X_test, [y_test_text, y_test_type, y_test_lang]),
                       epochs=epochs, batch_size=batch_size)
        self.model.save("text_type_lang_model.h5")

    def predict(self, image_path):
        """Predict (text, type_id, language_id) for a single image.

        Returns the decoded 10-character string (stripped), the argmax
        type class and the argmax language class.
        """
        image = self.preprocess_image(image_path)
        image = np.expand_dims(image, axis=0)
        pred_text, pred_type, pred_lang = self.model.predict(image)
        # Decode each of the 10 timesteps; index 0 (blank) decodes to ''.
        decoded = ''.join(self.index_to_char.get(int(np.argmax(step)), '')
                          for step in pred_text[0])
        # argmax over the class axis only — the original flattened the whole
        # prediction tensor, yielding out-of-range "class" indices.
        return decoded.strip(), int(np.argmax(pred_type[0])), int(np.argmax(pred_lang[0]))
def main():
    """Train the multi-task model, then sanity-check it on the first image."""
    model = TextTypeLangModel("all_cropped_data_cleaned.csv")
    model.train()
    # Predict using the first image from the dataset.
    first_image_path = model.df['Cropped Image Path'].iloc[0]
    predicted_text, predicted_type, predicted_language = model.predict(first_image_path)
    print("Predicted Text:", predicted_text)
    print("Predicted Type:", predicted_type)
    print("Predicted Language:", "English" if predicted_language == 0 else "Urdu")


# Guard the entry point so importing this module does not kick off training.
if __name__ == "__main__":
    main()