import json
import math
import os
import tempfile

import gradio as gr
from transformers import AutoTokenizer

# Try to import qwen_vl_utils, otherwise use the built-in official implementation copy
try:
    from qwen_vl_utils.vision_process import smart_resize as qwen_smart_resize
except ImportError:
    # Local stand-in used only when qwen_vl_utils is not installed.
    def qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=1280 * 1280):
        """
        Compute the (height, width) an image is resized to before patching.

        Both sides are snapped to multiples of ``factor`` and the area is
        steered into the ``[min_pixels, max_pixels]`` range while roughly
        keeping the aspect ratio.

        Args:
            height: Original image height in pixels (must be > 0).
            width: Original image width in pixels (must be > 0).
            factor: Grid size every side is aligned to.
            min_pixels: Lower bound for the resized area.
            max_pixels: Upper bound for the resized area.

        Returns:
            Tuple ``(h_bar, w_bar)`` with the resized height and width.
        """
        if max(height, width) / min(height, width) > 200:
            # Extreme aspect ratios: drop grid alignment so the estimator
            # still produces a size instead of failing.
            factor = 1

        # Clamp to at least one grid cell; without the clamp a side shorter
        # than factor / 2 rounds down to 0.
        h_bar = max(factor, round(height / factor) * factor)
        w_bar = max(factor, round(width / factor) * factor)

        if h_bar * w_bar > max_pixels:
            # Shrink uniformly, rounding down so the area stays under the cap.
            beta = math.sqrt((height * width) / max_pixels)
            # The same clamp is needed here: flooring a very thin side would
            # otherwise yield a zero-sized image (and zero tokens).
            h_bar = max(factor, math.floor(height / beta / factor) * factor)
            w_bar = max(factor, math.floor(width / beta / factor) * factor)
        elif h_bar * w_bar < min_pixels:
            # Grow uniformly, rounding up so the area reaches the minimum.
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = math.ceil(height * beta / factor) * factor
            w_bar = math.ceil(width * beta / factor) * factor

        return h_bar, w_bar

# --- Tokenizer Loading Logic ---
TOKENIZERS = {}  # model display name -> tokenizer loaded for it


def get_tokenizer(model_name):
    """
    Return the tokenizer for a model display name, loading it on first use.

    Successfully loaded tokenizers are kept in ``TOKENIZERS`` and reused.
    Returns ``None`` for an unknown model name or when loading fails (the
    error is printed, not raised).
    """
    try:
        return TOKENIZERS[model_name]
    except KeyError:
        pass  # not loaded yet

    # Display name -> Hugging Face repository providing the tokenizer.
    hub_repos = {
        # Qwen2-VL uses Qwen2 tokenizer
        "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
        # Llava-1.6 based on Vicuna/Llama-2
        "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf",
    }
    repo_id = hub_repos.get(model_name)
    if repo_id is None:
        return None

    try:
        loaded = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    except Exception as e:
        print(f"Error loading tokenizer for {model_name}: {e}")
        return None

    TOKENIZERS[model_name] = loaded
    return loaded

# --- Token Calculation Logic ---

def calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, max_pixels):
    """
    Estimate Qwen2-VL / Qwen2.5-VL token usage for text, images and videos.

    Visual tokens are counted after patch merging: the vision encoder cuts a
    resized frame into 14px patches and merges every 2x2 patch block into a
    single LLM token, so one token covers a 28x28 pixel area. Video frames are
    additionally grouped in pairs along the time axis.

    Args:
        text: Prompt text.
        image_groups: Iterable of dicts with ``width``, ``height`` and
            ``count``; groups whose count is <= 0 are ignored.
        videos: Iterable of dicts with ``width``, ``height`` and ``frames``.
        tokenizer: Object exposing ``encode(text)``; when falsy the text
            token count falls back to ``len(text) // 2``.
        max_pixels: Upper bound on the resized area of an image/frame.

    Returns:
        Tuple ``(total_tokens, text_token_ids, breakdown, media_details)``.
    """
    patch_size = 14
    merge_size = 2            # 2x2 spatial patch merge
    temporal_patch_size = 2   # two video frames share one temporal slot
    token_edge = patch_size * merge_size  # pixels covered by one token side

    def frame_tokens_for(height, width):
        """Return (resized_h, resized_w, tokens) for a single image/frame."""
        new_h, new_w = qwen_smart_resize(
            height, width, factor=28, min_pixels=56 * 56, max_pixels=max_pixels
        )
        return new_h, new_w, (new_h // token_edge) * (new_w // token_edge)

    # 1. Text tokens (real tokenizer when available, rough estimate otherwise)
    if tokenizer:
        text_tokens_ids = tokenizer.encode(text)
        text_tokens_count = len(text_tokens_ids)
    else:
        text_tokens_ids = []
        text_tokens_count = len(text) // 2

    # 2. Image tokens
    image_tokens_count = 0
    image_details = []
    for group in image_groups:
        width, height = group['width'], group['height']
        count = int(group['count'])
        if count <= 0:
            continue

        new_h, new_w, img_tokens = frame_tokens_for(height, width)
        group_tokens = img_tokens * count
        image_tokens_count += group_tokens

        image_details.append({
            "count": count,
            "original_size": [width, height],
            "resized_size": [new_w, new_h],
            "tokens_per_image": img_tokens,
            "total_tokens": group_tokens
        })

    # 3. Video tokens
    video_tokens_count = 0
    video_details = []
    for vid in videos:
        frames = vid['frames']
        width, height = vid['width'], vid['height']

        new_h, new_w, frame_tokens = frame_tokens_for(height, width)

        # An odd trailing frame still occupies a full temporal slot.
        temporal_slots = math.ceil(frames / temporal_patch_size)
        vid_total = temporal_slots * frame_tokens
        video_tokens_count += vid_total

        video_details.append({
            "original_size": [width, height],
            "resized_size": [new_w, new_h],
            "frames": frames,
            "tokens": vid_total
        })

    total_tokens = text_tokens_count + image_tokens_count + video_tokens_count

    breakdown = {
        "text_tokens": text_tokens_count,
        "image_tokens": image_tokens_count,
        "video_tokens": video_tokens_count
    }

    media_details = {
        "images": image_details,
        "videos": video_details
    }

    return total_tokens, text_tokens_ids, breakdown, media_details

def calculate_llava_next_tokens(text, image_groups, tokenizer, max_pixels):
    """
    Estimate Llava-1.6 (Next) token usage for a prompt and its images.

    Each image is optionally shrunk to fit ``max_pixels``, covered with
    336px tiles, and charged ``(tiles + 1) * 576`` tokens. Groups whose
    count is <= 0 are skipped.

    Returns ``(total_tokens, text_token_ids, breakdown, media_details)``;
    video entries are always empty/zero for this model.
    """
    tile_edge = 336
    tokens_per_tile = 576

    # Text: real tokenizer when given, otherwise a rough length-based guess.
    if not tokenizer:
        text_tokens_ids = []
        text_tokens_count = len(text) // 2
    else:
        text_tokens_ids = tokenizer.encode(text)
        text_tokens_count = len(text_tokens_ids)

    image_details = []
    for group in image_groups:
        orig_w, orig_h = group['width'], group['height']
        n_images = int(group['count'])
        if n_images <= 0:
            continue

        scaled_w, scaled_h = orig_w, orig_h
        # A positive max_pixels caps the image area before tiling.
        if max_pixels > 0 and orig_w * orig_h > max_pixels:
            shrink = math.sqrt(max_pixels / (orig_w * orig_h))
            scaled_w = int(orig_w * shrink)
            scaled_h = int(orig_h * shrink)

        tiles_x = math.ceil(scaled_w / tile_edge)
        tiles_y = math.ceil(scaled_h / tile_edge)
        # The extra "+ 1" tile is part of the formula used by this tool.
        per_image = (tiles_x * tiles_y + 1) * tokens_per_tile
        subtotal = per_image * n_images

        image_details.append({
            "count": n_images,
            "original_size": [orig_w, orig_h],
            "resized_size": [scaled_w, scaled_h],
            "grid_patches": f"{tiles_x}x{tiles_y}",
            "tokens_per_image": per_image,
            "total_tokens": subtotal
        })

    image_tokens_count = sum(entry["total_tokens"] for entry in image_details)

    breakdown = {
        "text_tokens": text_tokens_count,
        "image_tokens": image_tokens_count,
        "video_tokens": 0
    }
    media_details = {
        "images": image_details,
        "videos": []
    }

    return text_tokens_count + image_tokens_count, text_tokens_ids, breakdown, media_details

# --- Actual UI Logic ---

def run_calculation(text, model, img_max_pixels, vid_count, vid_frames, vid_w, vid_h, *args):
    """
    UI callback: estimate token usage for the selected model.

    Args:
        text: Prompt text (``None`` is treated as an empty prompt).
        model: Model display name chosen in the dropdown.
        img_max_pixels: Pixel cap forwarded to the per-model calculators.
        vid_count: Number of identical videos to account for.
        vid_frames: Frames per video.
        vid_w: Video width in pixels.
        vid_h: Video height in pixels.
        *args: Flattened image groups as ``count, width, height`` triplets
            (c1, w1, h1, c2, w2, h2, ...). A trailing incomplete triplet is
            ignored.

    Returns:
        Tuple ``(result, token_file_path)``: ``result`` is a JSON-serialisable
        summary dict; ``token_file_path`` points to a per-call JSON file with
        the decoded text tokens, or is ``None`` when no tokenizer/tokens are
        available.
    """
    text = text or ""
    # Values coming from number fields may be None (cleared) or non-finite.
    bad_number = (TypeError, ValueError, OverflowError)

    # Image groups: malformed, empty or zero-sized groups are skipped.
    image_groups = []
    for i in range(0, len(args) - 2, 3):
        try:
            c, w, h = (int(v) for v in args[i:i + 3])
        except bad_number:
            continue
        if c > 0 and w > 0 and h > 0:
            image_groups.append({'count': c, 'width': w, 'height': h})

    # Videos: same normalisation as images; an unusable setting means
    # "no videos" rather than a crash.
    videos = []
    try:
        n_videos = int(vid_count)
        frames = int(vid_frames)
        video_w = int(vid_w)
        video_h = int(vid_h)
    except bad_number:
        n_videos = 0
    if n_videos > 0 and frames > 0 and video_w > 0 and video_h > 0:
        videos = [
            {'width': video_w, 'height': video_h, 'frames': frames}
            for _ in range(n_videos)
        ]

    # Get Tokenizer
    tokenizer = get_tokenizer(model)

    # Determine real model ID
    model_id_map = {
        "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
        "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf"
    }
    real_model_id = model_id_map.get(model, model)

    # Defaults reported for an unrecognised model.
    tokens = 0
    text_tokens_ids = []
    breakdown = {}
    media_details = {}

    if model == "Qwen2.5-VL / Qwen2-VL":
        tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(
            text, image_groups, videos, tokenizer, img_max_pixels
        )
    elif model == "Llava-1.6 (Next)":
        tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(
            text, image_groups, tokenizer, img_max_pixels
        )

    # Generate Token Analysis File
    token_file_path = None
    if tokenizer and text_tokens_ids:
        token_data = [
            {"id": tid, "token": tokenizer.decode([tid])}
            for tid in text_tokens_ids
        ]
        # A unique file per call: a fixed name in the working directory would
        # let concurrent requests overwrite each other's download.
        with tempfile.NamedTemporaryFile(
            "w",
            encoding="utf-8",
            prefix="token_analysis_",
            suffix=".json",
            delete=False,
        ) as f:
            json.dump({"text": text, "tokens": token_data}, f, ensure_ascii=False, indent=2)
            token_file_path = f.name

    # Construct final JSON result
    result = {
        "model_id": real_model_id,
        "tokenizer_loaded": tokenizer is not None,
        "total_tokens": tokens,
        "breakdown": breakdown,
        "text_stats": {
            "char_count": len(text)
        },
        "media_details": media_details
    }

    return result, token_file_path

def create_ui():
    """
    Build the token-statistics panel and wire up its events.

    Left column: model choice, prompt text, up to four image groups
    (groups 2-4 start hidden and are revealed one at a time), a pixel cap
    and video settings. Right column: JSON result, downloadable token
    analysis and usage notes.

    NOTE(review): components are created directly, so this presumably has to
    be called inside an active ``gr.Blocks`` context - confirm with the caller.
    """
    gr.Markdown("""
    ## 📝 Token Stats (文本/多模态 Token 统计)
    Estimate token usage for text, images, and videos using various model tokenizers (e.g., Qwen2.5-VL, LLaVa).
    
    估算文本、图片和视频在不同模型（如 Qwen2.5-VL, LLaVa）下的 Token 用量。
    """)
    with gr.Row():
        with gr.Column(scale=1):
            model_select = gr.Dropdown(
                choices=["Qwen2.5-VL / Qwen2-VL", "Llava-1.6 (Next)"],
                value="Qwen2.5-VL / Qwen2-VL",
                label="选择模型"
            )
            text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")

            with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
                # Group 1 (Always visible)
                with gr.Row():
                    img_c_1 = gr.Number(value=1, label="图片数量 (Group 1)", precision=0)
                    img_w_1 = gr.Number(value=1080, label="宽 (px)")
                    img_h_1 = gr.Number(value=1920, label="高 (px)")

                # Group 2 (Hidden by default)
                with gr.Row(visible=False) as group_2:
                    img_c_2 = gr.Number(value=0, label="图片数量 (Group 2)", precision=0)
                    img_w_2 = gr.Number(value=1024, label="宽 (px)")
                    img_h_2 = gr.Number(value=1024, label="高 (px)")

                # Group 3 (Hidden by default)
                with gr.Row(visible=False) as group_3:
                    img_c_3 = gr.Number(value=0, label="图片数量 (Group 3)", precision=0)
                    img_w_3 = gr.Number(value=1024, label="宽 (px)")
                    img_h_3 = gr.Number(value=1024, label="高 (px)")

                # Group 4 (Hidden by default)
                with gr.Row(visible=False) as group_4:
                    img_c_4 = gr.Number(value=0, label="图片数量 (Group 4)", precision=0)
                    img_w_4 = gr.Number(value=1024, label="宽 (px)")
                    img_h_4 = gr.Number(value=1024, label="高 (px)")

                add_group_btn = gr.Button("➕ 增加一组图片 (Add Group)", size="sm")

                # State to track visible groups
                visible_groups = gr.State(1)

                def add_group(curr_count):
                    """Reveal the next hidden image group and set its count to 1."""
                    next_count = min(curr_count + 1, 4)
                    # Only a group that becomes visible on this click gets its
                    # count initialised; at the 4-group cap nothing is reset.
                    revealed = next_count if next_count > curr_count else None
                    hidden_group_ids = (2, 3, 4)

                    row_updates = [
                        gr.update(visible=group_idx <= next_count)
                        for group_idx in hidden_group_ids
                    ]
                    # The count lives on the Number component, not on the Row,
                    # so it needs its own output; gr.update() leaves the other
                    # counts untouched.
                    count_updates = [
                        gr.update(value=1) if group_idx == revealed else gr.update()
                        for group_idx in hidden_group_ids
                    ]
                    return (next_count, *row_updates, *count_updates)

                add_group_btn.click(
                    add_group,
                    [visible_groups],
                    [
                        visible_groups,
                        group_2, group_3, group_4,
                        img_c_2, img_c_3, img_c_4
                    ]
                )

                with gr.Row():
                    img_max_pixels = gr.Number(value=512*512, label="Max Pixels (最大像素限制)", precision=0)

            with gr.Accordion("🎥 视频设置 (Videos)", open=False):
                with gr.Row():
                    vid_count = gr.Number(value=0, label="视频数量", precision=0)
                    vid_frames = gr.Number(value=16, label="总帧数/视频", precision=0)
                    vid_w = gr.Number(value=512, label="宽 (px)")
                    vid_h = gr.Number(value=512, label="高 (px)")

            btn = gr.Button("🚀 计算 Token", variant="primary")

        with gr.Column(scale=1):
            out_json = gr.JSON(label="计算结果")
            out_file = gr.File(label="下载 Token 分析 (JSON)")
            gr.Markdown("""
            ### 说明
            * **真实 Tokenizer**: 首次运行时会自动下载 `transformers` 模型配置，可能需要几秒钟。
            * **Qwen2-VL**: 基于 `H/14 * W/14` 计算，自动对齐到 28px 网格。
            * **Llava-1.6**: 基于 `(Patches + 1) * 576` 计算，Patch 大小为 336px。
            """)

    # Input order must match run_calculation's signature: fixed arguments
    # first, then the image groups as (count, width, height) triplets.
    btn.click(
        run_calculation,
        [
            text_input, model_select, img_max_pixels, vid_count, vid_frames, vid_w, vid_h,
            img_c_1, img_w_1, img_h_1,
            img_c_2, img_w_2, img_h_2,
            img_c_3, img_w_3, img_h_3,
            img_c_4, img_w_4, img_h_4
        ],
        [out_json, out_file]
    )