import fitz
import cv2
import numpy as np
import os
import h5py
from tqdm import tqdm
import shutil
import argparse

PDF_SOURCE_DIR = "sample_documents/books/"
OUTPUT_DATA_DIR = "data/"
HDF5_FILE_PATH = os.path.join(OUTPUT_DATA_DIR, "real_line_dataset.h5")


def find_text_lines_from_image(image_data):
    gray = cv2.cvtColor(image_data, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    kernel = np.ones((1, 40), np.uint8)
    connected = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    contours, _ = cv2.findContours(connected, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    line_boxes, line_crops = [], []
    if not contours: return line_boxes, line_crops

    bounding_boxes = [cv2.boundingRect(c) for c in contours]
    sorted_boxes = sorted(bounding_boxes, key=lambda b: b[1])

    for x, y, w, h in sorted_boxes:
        if w > 15 and h > 8:
            pad = 2
            line_crop = binary[max(0, y - pad):y + h + pad, max(0, x - pad):x + w + pad]

            if line_crop is not None and line_crop.shape[0] > 0 and line_crop.shape[1] > 0:
                line_boxes.append({'x': x, 'y': y, 'w': w, 'h': h, 'words': []})
                line_crops.append(line_crop)

    return line_boxes, line_crops


def align_text_with_lines(page_words, line_boxes):
    for x1, y1, x2, y2, word, _, _, _ in page_words:
        word_mid_y = (y1 + y2) / 2
        for line_box in line_boxes:
            if line_box['y'] <= word_mid_y <= (line_box['y'] + line_box['h']):
                line_box['words'].append((x1, word))
                break
    line_texts = []
    for box in line_boxes:
        if box['words']:
            sorted_words = sorted(box['words'], key=lambda w: w[0])
            line_texts.append(" ".join([word for _, word in sorted_words]))
        else:
            line_texts.append("")
    return line_texts


def main():
    parser = argparse.ArgumentParser(description="Build a robust, real-world CRNN dataset from PDFs.")
    parser.add_argument("--clean", action="store_true", help="Wipe the existing dataset.")
    args = parser.parse_args()
    if args.clean and os.path.exists(HDF5_FILE_PATH):
        os.remove(HDF5_FILE_PATH)
    if not os.path.exists(OUTPUT_DATA_DIR):
        os.makedirs(OUTPUT_DATA_DIR)
    pdf_files = [f for f in sorted(os.listdir(PDF_SOURCE_DIR)) if f.lower().endswith('.pdf')]
    if not pdf_files:
        print("No PDF files found. Aborting.");
        return
    all_chars = set()
    for pdf_filename in pdf_files:
        pdf_path = os.path.join(PDF_SOURCE_DIR, pdf_filename)
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text = "".join(c for c in page.get_text() if c.isalnum() or c.isspace())
                all_chars.update(text)
    char_list = sorted(list(all_chars))
    with h5py.File(HDF5_FILE_PATH, 'w') as hf:
        hf.create_dataset('char_list', data=[s.encode('utf-8') for s in char_list])
        labels_ds = hf.create_dataset('labels', (0,), maxshape=(None,), dtype=h5py.string_dtype(encoding='utf-8'),
                                      chunks=True)
        images_ds = hf.create_dataset('image_data', (0,), maxshape=(None,), dtype=h5py.vlen_dtype(np.uint8),
                                      chunks=True)
        total_lines_saved = 0
        for pdf_filename in pdf_files:
            pdf_path = os.path.join(PDF_SOURCE_DIR, pdf_filename)
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(tqdm(doc, desc=f"Processing {pdf_filename}")):
                    page_words = page.get_text("words")
                    pix = page.get_pixmap()
                    image_data = cv2.cvtColor(np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n),
                                              cv2.COLOR_RGB2BGR)
                    line_boxes, line_crops = find_text_lines_from_image(image_data)
                    line_texts = align_text_with_lines(page_words, line_boxes)
                    image_chunk, label_chunk = [], []
                    for crop, text in zip(line_crops, line_texts):
                        filtered_text = "".join(c for c in text if c in all_chars)
                        if len(filtered_text) > 2:
                            _, img_encoded = cv2.imencode('.png', crop)
                            image_chunk.append(img_encoded.flatten())
                            label_chunk.append(filtered_text)
                    if image_chunk:
                        start_idx = labels_ds.shape[0]
                        new_size = start_idx + len(image_chunk)
                        labels_ds.resize(new_size, axis=0)
                        images_ds.resize(new_size, axis=0)
                        labels_ds[start_idx:] = label_chunk
                        for i, img_data in enumerate(image_chunk):
                            images_ds[start_idx + i] = img_data
                        total_lines_saved += len(image_chunk)
    print(f"\n--- Real Dataset Creation Complete! ---")
    print(f"Total lines saved: {total_lines_saved}")


if __name__ == "__main__":
    main()