import math
import os
import shutil

import gradio as gr
import numpy as np
import onnxruntime as ort
from huggingface_hub import hf_hub_download
from PIL import Image

# ——————————————————————————————————————————————————————————————
# 1) Auto-Download Ultra-Fast Model
MODEL_DIR = "model"
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_X4_PATH = os.path.join(MODEL_DIR, "realesr-general-x4v3.onnx")


def fetch_models():
    """Downloads the 4.8MB ultra-lightweight model from an open repository."""
    if not os.path.exists(MODEL_X4_PATH):
        print("Downloading 4.8MB High-Speed Real-ESRGAN model...")
        downloaded_path = hf_hub_download(
            repo_id="OwlMaster/AllFilesRope",
            filename="realesr-general-x4v3.onnx",
        )
        # hf_hub_download returns a cache path; copy it into our local model dir.
        shutil.copy(downloaded_path, MODEL_X4_PATH)
        print("Model downloaded successfully!")


fetch_models()

# ——————————————————————————————————————————————————————————————
# 2) Efficient CPU Session (lazy-loaded singleton)
sess_opts = ort.SessionOptions()
sess_opts.intra_op_num_threads = 2
sess_opts.inter_op_num_threads = 2

session_x4 = None


def load_model():
    """Create the ONNX Runtime CPU session on first use and cache it globally."""
    global session_x4
    if session_x4 is None:
        session_x4 = ort.InferenceSession(
            MODEL_X4_PATH,
            sess_options=sess_opts,
            providers=["CPUExecutionProvider"],
        )
    return session_x4


# ——————————————————————————————————————————————————————————————
# 3) Adaptive Tiling Engine
def process_tensor(session, tensor_np):
    """Run one HWC float32 patch through the model; return the upscaled HWC patch.

    Converts HWC -> NCHW (batch of 1) for the ONNX model, then back to HWC.
    """
    input_name = session.get_inputs()[0].name
    patch_nchw = np.transpose(tensor_np, (2, 0, 1))[np.newaxis, ...]
    out_nchw = session.run(None, {input_name: patch_nchw})[0]
    out_nchw = np.squeeze(out_nchw, axis=0)
    return np.transpose(out_nchw, (1, 2, 0))


def dynamic_upscale(input_img: Image.Image, target_scale: int, progress=gr.Progress()):
    """Upscale `input_img` by `target_scale` using the 4x model.

    Small images are processed in one shot; large images are split into
    800px tiles. The model's native output is 4x; any other requested
    scale is reached by LANCZOS-resampling the 4x result.
    """
    session = load_model()
    base_model_scale = 4  # the ONNX model natively produces a 4x output

    img_rgb = input_img.convert("RGB")
    arr = np.array(img_rgb).astype(np.float32) / 255.0
    h_orig, w_orig, _ = arr.shape

    # 16GB RAM can safely process up to ~1500x1500px in one shot without tiling.
    MAX_NO_TILE_DIM = 1500

    if max(h_orig, w_orig) <= MAX_NO_TILE_DIM:
        progress(0.4, desc="RAM check passed. Processing full image instantly...")
        out_arr = process_tensor(session, arr)
    else:
        # For huge images, we use large 800px tiles to minimize loop overhead
        tile_size = 800
        tiles_h = math.ceil(h_orig / tile_size)
        tiles_w = math.ceil(w_orig / tile_size)

        # Reflect-pad so the image divides evenly into full tiles.
        pad_h = tiles_h * tile_size - h_orig
        pad_w = tiles_w * tile_size - w_orig
        arr_padded = np.pad(arr, ((0, pad_h), (0, pad_w), (0, 0)), mode="reflect")

        out_h = tiles_h * tile_size * base_model_scale
        out_w = tiles_w * tile_size * base_model_scale
        out_arr = np.zeros((out_h, out_w, 3), dtype=np.float32)

        total_tiles = tiles_h * tiles_w
        current_tile = 0
        for i in range(tiles_h):
            for j in range(tiles_w):
                current_tile += 1
                progress(
                    current_tile / total_tiles,
                    desc=f"AI Processing tile {current_tile}/{total_tiles}...",
                )
                y0, x0 = i * tile_size, j * tile_size
                tile = arr_padded[y0:y0 + tile_size, x0:x0 + tile_size, :]
                up_tile = process_tensor(session, tile)
                oy0 = i * tile_size * base_model_scale
                ox0 = j * tile_size * base_model_scale
                out_arr[
                    oy0:oy0 + tile_size * base_model_scale,
                    ox0:ox0 + tile_size * base_model_scale,
                    :,
                ] = up_tile

        # Crop away the padded border in output space.
        out_arr = out_arr[0:h_orig * base_model_scale, 0:w_orig * base_model_scale, :]

    # Finalize Image
    progress(0.8, desc="Finalizing output...")
    out_arr = np.clip(out_arr, 0.0, 1.0)
    final_pil = Image.fromarray((out_arr * 255.0).round().astype(np.uint8))

    # Resample if the user requested anything other than the native 4x
    # (2x is a downscale of the 4x output; 6x is an upscale of it).
    if target_scale != 4:
        progress(0.9, desc=f"Refining output to {target_scale}x...")
        target_size = (w_orig * target_scale, h_orig * target_scale)
        final_pil = final_pil.resize(target_size, resample=Image.LANCZOS)

    return final_pil


# ——————————————————————————————————————————————————————————————
# 4) UI Setup
def upscale_2x(img, prog=gr.Progress()):
    # Explicit None check: relying on PIL Image truthiness is fragile.
    return dynamic_upscale(img, 2, prog) if img is not None else None


def upscale_4x(img, prog=gr.Progress()):
    return dynamic_upscale(img, 4, prog) if img is not None else None


def upscale_6x(img, prog=gr.Progress()):
    return dynamic_upscale(img, 6, prog) if img is not None else None


css = """
#x2-btn { background-color: #d1fae5 !important; color: black !important; }
#x4-btn { background-color: #bfdbfe !important; color: black !important; }
#x6-btn { background-color: #fef08a !important; color: black !important; }
"""

# BUG FIX: `css` must be passed to gr.Blocks(); `demo.launch()` has no `css`
# parameter, so the original styling was never applied (and launch could raise).
with gr.Blocks(title="Speed-Optimized CPU Upscaler", css=css) as demo:
    gr.Markdown("# ⚡ Adaptive CPU Upscaler\nUpload an image. Great For Anime/Cartoon/Text , Very Fast . This app dynamically allocates RAM to bypass tiling for smaller images, running significantly faster on the Free Tier.")
    with gr.Row():
        inp_image = gr.Image(type="pil", label="Source Image")
    with gr.Row():
        btn_x2 = gr.Button("Upscale (×2)", elem_id="x2-btn")
        btn_x4 = gr.Button("Standard AI (×4)", elem_id="x4-btn")
        btn_x6 = gr.Button("Hybrid High-Res (×6)", elem_id="x6-btn")
    out_preview = gr.Image(type="pil", label="Upscaled Result")

    btn_x2.click(fn=upscale_2x, inputs=inp_image, outputs=out_preview)
    btn_x4.click(fn=upscale_4x, inputs=inp_image, outputs=out_preview)
    btn_x6.click(fn=upscale_6x, inputs=inp_image, outputs=out_preview)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)