import string

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Model, Input
from keras.api.layers import (Conv2D, Dense, Dropout, Flatten, MaxPooling2D,
                              Reshape, TimeDistributed)
from keras.api.utils import to_categorical
from PIL import Image
from sklearn.model_selection import train_test_split


class TextTypeLangModel:
    """Multi-output CNN over 128x128 RGB crops with three heads:

    * ``text_output`` -- a 10-step per-character softmax sequence,
    * ``type_output`` -- a 7-way type classifier,
    * ``lang_output`` -- a 2-way language classifier (0 = English, 1 = Urdu).
    """

    def __init__(self, csv_path):
        """Load the annotation CSV, build the character vocabulary and the model.

        The CSV is expected to provide the columns 'Cropped Image Path',
        'Text', 'type' (int in 0..6) and 'language' (int in 0..1).
        """
        self.df = pd.read_csv(csv_path)
        raw_chars = (string.ascii_letters + string.digits + " "
                     + "آبپچڈڑڤکگہہٹژزسٹطظعغفقکگلاںمںنۓہھوؤ"
                     + "ےیئؤٹپجچحخدڈذرزسشصضطظعغفقکلمنوٕں"
                     + "ۓۓہ۔،؛؟")
        # BUGFIX: the raw string contains many duplicate Urdu glyphs, which
        # inflated num_chars and made char_to_index/index_to_char silently
        # collapse entries.  Dedupe while preserving first-seen order so every
        # index maps to exactly one character.
        self.characters = "".join(dict.fromkeys(raw_chars))
        self.num_chars = len(self.characters) + 1  # index 0 reserved for blank/unknown
        self.char_to_index = {c: i + 1 for i, c in enumerate(self.characters)}
        self.index_to_char = {i + 1: c for i, c in enumerate(self.characters)}
        self.model = self.build_model()

    def encode_text(self, text, max_len=10):
        """Encode ``text`` as a fixed-length list of character indices.

        The text is truncated / right-padded with spaces to ``max_len``;
        characters outside the vocabulary map to 0 (the blank index).
        """
        text = text[:max_len].ljust(max_len)
        return [self.char_to_index.get(c, 0) for c in text]

    def preprocess_image(self, image_path):
        """Load an image, resize to 128x128 RGB and scale pixels to [0, 1]."""
        image = Image.open(image_path).convert("RGB")
        image = image.resize((128, 128))
        return np.array(image) / 255.0

    def prepare_data(self):
        """Build model-ready arrays from the CSV and return an 80/20 split.

        Returns the 8-tuple produced by ``train_test_split`` over
        (images, one-hot text, one-hot type, one-hot language).
        """
        X_images = np.array([self.preprocess_image(img)
                             for img in self.df['Cropped Image Path']])
        y_text = np.array([self.encode_text(txt) for txt in self.df['Text']])
        y_type = to_categorical(self.df['type'].values, num_classes=7)
        y_lang = to_categorical(self.df['language'].values, num_classes=2)
        return train_test_split(X_images,
                                to_categorical(y_text, self.num_chars),
                                y_type, y_lang,
                                test_size=0.2, random_state=42)

    def build_model(self):
        """Build and compile the multi-output CNN.

        Output shapes: text_output (batch, 10, num_chars),
        type_output (batch, 7), lang_output (batch, 2).
        """
        input_layer = Input(shape=(128, 128, 3))
        x = Conv2D(32, (3, 3), activation='relu')(input_layer)
        x = MaxPooling2D()(x)
        x = Conv2D(64, (3, 3), activation='relu')(x)
        x = MaxPooling2D()(x)
        x = Flatten()(x)
        # 1280 units = 10 time steps * 128 features, so the reshape below works.
        features = Dense(1280, activation='relu')(x)

        # Text head: view the features as a (10, 128) sequence and classify
        # each time step independently.  BUGFIX: the original added a second
        # Reshape after TimeDistributed, but its output is already
        # (batch, 10, num_chars) -- the extra layer was redundant.
        seq = Reshape((10, 128))(features)
        text_output = TimeDistributed(Dense(self.num_chars, activation='softmax'),
                                      name='text_output')(seq)

        # BUGFIX: the type/language heads previously consumed the 3-D sequence
        # tensor, yielding one prediction PER TIME STEP ((batch, 10, 7) etc.),
        # which train() papered over by repeating labels along a time axis.
        # They now read the pooled image features and emit a single
        # prediction per image.
        output_type = Dense(7, activation='softmax', name='type_output')(features)
        output_lang = Dense(2, activation='softmax', name='lang_output')(features)

        model = Model(inputs=input_layer,
                      outputs=[text_output, output_type, output_lang])
        # Losses/metrics keyed by output name so they cannot be mis-ordered.
        model.compile(optimizer='adam',
                      loss={'text_output': 'categorical_crossentropy',
                            'type_output': 'categorical_crossentropy',
                            'lang_output': 'categorical_crossentropy'},
                      metrics={'text_output': 'accuracy',
                               'type_output': 'accuracy',
                               'lang_output': 'accuracy'})
        return model

    def train(self, epochs=10, batch_size=32):
        """Train on the CSV data (80/20 split) and save the model to disk."""
        (X_train, X_test,
         y_train_text, y_test_text,
         y_train_type, y_test_type,
         y_train_lang, y_test_lang) = self.prepare_data()
        # The type/language heads now output one prediction per image, so the
        # one-hot labels are used as-is (the old time-axis np.repeat hack is gone).
        self.model.fit(X_train,
                       [y_train_text, y_train_type, y_train_lang],
                       validation_data=(X_test,
                                        [y_test_text, y_test_type, y_test_lang]),
                       epochs=epochs, batch_size=batch_size)
        self.model.save("text_type_lang_model.h5")

    def predict(self, image_path):
        """Predict for one image.

        Returns (decoded_text, type_index, language_index).
        """
        image = np.expand_dims(self.preprocess_image(image_path), axis=0)
        pred_text, pred_type, pred_lang = self.model.predict(image)
        # Greedy per-step decode; unknown/blank indices (0) decode to ''.
        decoded = ''.join(self.index_to_char.get(int(np.argmax(pred_text[0][i])), '')
                          for i in range(10))
        return decoded.strip(), int(np.argmax(pred_type)), int(np.argmax(pred_lang))


if __name__ == "__main__":
    # BUGFIX: guarded so importing this module no longer triggers training.
    model = TextTypeLangModel("all_cropped_data_cleaned.csv")
    model.train()

    # Predict using the first image from the dataset.
    first_image_path = model.df['Cropped Image Path'].iloc[0]
    predicted_text, predicted_type, predicted_language = model.predict(first_image_path)
    print("Predicted Text:", predicted_text)
    print("Predicted Type:", predicted_type)
    print("Predicted Language:", "English" if predicted_language == 0 else "Urdu")