Update app.py
app.py CHANGED
@@ -52,6 +52,37 @@ def _get_args():
     args = parser.parse_args()
     return args
 
+def build_chatbot_messages(task_history):
+    """
+    Convert internal task_history [(q, a), ...] into Gradio Chatbot
+    messages format: [{"role": "...", "content": ...}, ...]
+    """
+    messages = []
+    for q, a in task_history:
+        # User side
+        if isinstance(q, (tuple, list)):
+            # Image-only turn
+            img_path = q[0]
+            messages.append({
+                "role": "user",
+                "content": [{"type": "image", "image": img_path}],
+            })
+        else:
+            messages.append({
+                "role": "user",
+                "content": q,
+            })
+
+        # Assistant side
+        if a is not None:
+            messages.append({
+                "role": "assistant",
+                "content": a,
+            })
+
+    return messages
+
+
 
 def _load_model_processor(args):
     # ZeroGPU: Model loads on CPU, uses eager mode
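The sketch below is illustrative, not part of the commit; it restates the conversion `build_chatbot_messages` performs, with the tuple-wrapped entry standing for an uploaded image path, as elsewhere in this app.

```python
# Hypothetical history: one image upload, then an answered text turn.
history = [
    (("examples/spotting.jpg",), None),
    ("Detect all text.", "HELLO (12, 34, 56, 78)"),
]

# build_chatbot_messages(history) yields:
# [
#     {"role": "user", "content": [{"type": "image", "image": "examples/spotting.jpg"}]},
#     {"role": "user", "content": "Detect all text."},
#     {"role": "assistant", "content": "HELLO (12, 34, 56, 78)"},
# ]
```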
@@ -130,14 +161,15 @@ def _launch_demo(args, model, processor):
     # Track first call
     first_call = [True]
 
-    #
-    #
+    # =========================
+    # Model call (unchanged)
+    # =========================
     @spaces.GPU(duration=120)
     def call_local_model(messages):
         import time
        import sys
        start_time = time.time()
-
+
        if first_call[0]:
            print(f"[INFO] ========== First inference call ==========")
            first_call[0] = False
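`@spaces.GPU(duration=120)` is the ZeroGPU pattern: on ZeroGPU hardware the decorated function is the only place a GPU is attached, for at most `duration` seconds per call, which is why the app loads the model on CPU and re-checks devices inside `call_local_model`. A minimal sketch of the shape this imposes (the `spaces` package is provided by Hugging Face Spaces):

```python
import spaces
import torch

@spaces.GPU(duration=120)  # GPU is attached only while this function runs
def gpu_probe():
    # CUDA is available inside the decorated call; outside it the
    # process may be CPU-only, hence the per-call device checks above.
    return torch.cuda.is_available()
```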
@@ -154,13 +186,13 @@ def _launch_demo(args, model, processor):
         print(f"[DEBUG] Device name: {torch.cuda.get_device_name(0)}")
         print(f"[DEBUG] GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
         print(f"[DEBUG] GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
-
-        # Ensure model is on
+
+        # Ensure model is on correct device
         model_device = next(model.parameters()).device
         print(f"[DEBUG] Model device: {model_device}")
         print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")
 
-        if str(model_device) ==
+        if str(model_device) == "cpu":
             print(f"[ERROR] Model on CPU! Attempting to move to GPU...")
             if torch.cuda.is_available():
                 move_start = time.time()
@@ -170,12 +202,10 @@ def _launch_demo(args, model, processor):
             print(f"[DEBUG] Model moved to GPU in: {move_time:.2f}s")
         else:
             print(f"[CRITICAL] CUDA unavailable! Running on CPU will be slow!")
-
-
-        print(f"[INFO] Model already on GPU: {model_device}")
-
+
+        # Hunyuan expects a list of conversations → wrap once
         messages = [messages]
-
+
         # Build input using processor
         texts = [
             processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
@@ -186,14 +216,6 @@ def _launch_demo(args, model, processor):
         image_inputs, video_inputs = process_vision_info(messages)
         print(f"[DEBUG] Image processing done, elapsed: {time.time() - start_time:.2f}s")
 
-        # Check image input size
-        if image_inputs:
-            for idx, img in enumerate(image_inputs):
-                if hasattr(img, 'size'):
-                    print(f"[DEBUG] Image {idx} size: {img.size}")
-                elif isinstance(img, np.ndarray):
-                    print(f"[DEBUG] Image {idx} shape: {img.shape}")
-
         print(f"[DEBUG] Starting processor encoding...")
         processor_start = time.time()
         inputs = processor(
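The unchanged preprocessing path follows the usual VLM chat pattern: wrap the single conversation in a batch list, render it with the chat template, pull out the vision inputs, then encode everything together. The condensed sketch below reflows the surrounding calls; the keyword arguments to `processor(...)` sit in unchanged lines outside this hunk, so the ones shown here follow the common Qwen-VL-style call and are an assumption:

```python
messages = [messages]  # batch of one conversation

texts = [
    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    for msg in messages
]
image_inputs, video_inputs = process_vision_info(messages)

# Assumed kwargs; the real ones are in unchanged lines of app.py.
inputs = processor(
    text=texts,
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
```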
@@ -205,239 +227,204 @@ def _launch_demo(args, model, processor):
         )
         print(f"[DEBUG] Processor encoding done, elapsed: {time.time() - processor_start:.2f}s")
 
-        #
+        # Move to device
         to_device_start = time.time()
-
-
-        print(f"[DEBUG]
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        inputs = inputs.to(device)
+        print(f"[DEBUG] Inputs moved to {device}, elapsed: {time.time() - to_device_start:.2f}s")
         print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
-
-        print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")
-
+
         # Generation
         gen_start = time.time()
-        print(f"[DEBUG] ========== Starting token generation ==========")
-
-        # Optimized max_new_tokens for OCR tasks
         max_new_tokens = 2048
-        print(f"[DEBUG] max_new_tokens
+        print(f"[DEBUG] ========== Starting token generation (max_new_tokens={max_new_tokens}) ==========")
 
-        # Progress callback
-        token_count = [0]
-        last_time = [gen_start]
-
-        def progress_callback(input_ids, scores, **kwargs):
-            token_count[0] += 1
-            current_time = time.time()
-            if token_count[0] % 10 == 0 or (current_time - last_time[0]) > 2.0:
-                elapsed = current_time - gen_start
-                tokens_per_sec = token_count[0] / elapsed if elapsed > 0 else 0
-                print(f"[DEBUG] Generated {token_count[0]} tokens, speed: {tokens_per_sec:.2f} tokens/s, elapsed: {elapsed:.2f}s")
-                last_time[0] = current_time
-            return False
-
         with torch.no_grad():
-            print(f"[DEBUG] Entered torch.no_grad() context, elapsed: {time.time() - start_time:.2f}s")
-
-            # Test forward pass
             print(f"[DEBUG] Testing forward pass...")
             forward_test_start = time.time()
             try:
-
-
+                if device == "cuda":
+                    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                        _ = model(**inputs, use_cache=False)
+                else:
+                    _ = model(**inputs, use_cache=False)
                 print(f"[DEBUG] Forward pass test successful, elapsed: {time.time() - forward_test_start:.2f}s")
             except Exception as e:
                 print(f"[WARNING] Forward pass test failed: {e}")
 
             print(f"[DEBUG] Starting model.generate()... (elapsed: {time.time() - start_time:.2f}s)")
             generate_call_start = time.time()
-
-
-
-
-
-
-
-                    temperature=0
-                )
-            print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
-        except Exception as e:
-            print(f"[ERROR] Generation failed: {e}")
-            import traceback
-            traceback.print_exc()
-            raise
-
-        print(f"[DEBUG] Exited torch.no_grad() context")
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                do_sample=False,
+                temperature=0,
+            )
+            print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
 
         gen_time = time.time() - gen_start
-        print(f"[DEBUG] ========== Generation complete ==========")
         print(f"[DEBUG] Generation time: {gen_time:.2f}s")
         print(f"[DEBUG] Output shape: {generated_ids.shape}")
 
-        # Decode
-
-            input_ids = inputs.input_ids
-        else:
-            input_ids = inputs.inputs
-
+        # Decode
+        input_ids = inputs.input_ids
         generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(input_ids, generated_ids)
         ]
-
+
         actual_tokens = len(generated_ids_trimmed[0])
         print(f"[DEBUG] Actual tokens generated: {actual_tokens}")
-        print(f"[DEBUG] Time per token: {gen_time/actual_tokens if actual_tokens > 0 else 0:.3f}s")
 
         output_texts = processor.batch_decode(
             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
-
-
+
         total_time = time.time() - start_time
-        print(f"[DEBUG] ========== All done ==========")
         print(f"[DEBUG] Total time: {total_time:.2f}s")
-        print(f"[DEBUG] Output length: {len(output_texts[0])} chars")
         print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")
         output_texts[0] = clean_repeated_substrings(output_texts[0])
         return output_texts
-
 
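Two details of the new generate call are worth noting. `do_sample=False` already selects greedy decoding, so `temperature=0` is redundant (recent `transformers` releases warn that `temperature` is only used when sampling), and `generate()` returns prompt plus completion, which is why each sequence is sliced past its input length before decoding. A minimal sketch of both points, under the same assumptions as the diff:

```python
import torch

with torch.no_grad():
    # do_sample=False alone gives deterministic greedy decoding;
    # temperature has no effect unless do_sample=True.
    generated_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=False)

# Strip the prompt tokens that generate() echoes back for each sequence.
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
texts = processor.batch_decode(trimmed, skip_special_tokens=True)
```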
+    # =========================
+    # Chat logic
+    # =========================
     def create_predict_fn():
-
-        def predict(_chatbot, task_history):
+        def predict(chatbot_value, task_history):
             nonlocal model, processor
-
+
+            if not task_history:
+                return chatbot_value, task_history
+
             query = task_history[-1][0]
-
-                _chatbot.pop()
-                task_history.pop()
-                return _chatbot
-            print('User: ', query)
+            print("User:", query)
             history_cp = copy.deepcopy(task_history)
-
+
+            # Build messages for Hunyuan
             messages = []
             content = []
             for q, a in history_cp:
                 if isinstance(q, (tuple, list)):
-                    # Check if URL or local path
                     img_path = q[0]
-                    if img_path.startswith((
-                        content.append({
+                    if img_path.startswith(("http://", "https://")):
+                        content.append({"type": "image", "image": img_path})
                     else:
-                        content.append({
+                        content.append({"type": "image", "image": os.path.abspath(img_path)})
                 else:
-                    content.append({
-
-
-
-
-
-
+                    content.append({"type": "text", "text": q})
+
+                messages.append({"role": "user", "content": content})
+                content = []
+
+                if a is not None:
+                    messages.append(
+                        {
+                            "role": "assistant",
+                            "content": [{"type": "text", "text": a}],
+                        }
+                    )
+
+            if messages and messages[-1]["role"] == "assistant" and history_cp[-1][1] is None:
+                messages.pop()
+
             response_list = call_local_model(messages)
             response = response_list[0] if response_list else ""
-
-            _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
             full_response = _parse_text(response)
 
             task_history[-1] = (query, full_response)
-            print(
-
+            print("HunyuanOCR:", full_response)
+
+            chatbot_messages = build_chatbot_messages(task_history)
+            return chatbot_messages, task_history
 
         return predict
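The rewritten `predict` accumulates text and image parts in `content`, flushes one user message per history turn, and strips a dangling assistant message before calling the model. For a history holding one local image and one question, the `messages` value passed to `call_local_model` would look like this (paths are placeholders):

```python
# Illustrative shape only; produced by the loop in predict above.
messages = [
    {
        "role": "user",
        "content": [{"type": "image", "image": "/abs/path/examples/spotting.jpg"}],
    },
    {
        "role": "user",
        "content": [{"type": "text", "text": "Detect and recognize all text."}],
    },
]
```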
-
-    def create_regenerate_fn():
 
-
-
-
-
-        item = task_history[-1]
-        if item[1] is None:
-            return _chatbot
-        task_history[-1] = (item[0], None)
-        chatbot_item = _chatbot.pop(-1)
-        if chatbot_item[0] is None:
-            _chatbot[-1] = (_chatbot[-1][0], None)
-        else:
-            _chatbot.append((chatbot_item[0], None))
-        # Use outer predict function
-        _chatbot_gen = predict(_chatbot, task_history)
-        for _chatbot in _chatbot_gen:
-            yield _chatbot
+    def create_regenerate_fn():
+        def regenerate(chatbot_value, task_history):
+            # No-op regenerate for now
+            return chatbot_value, task_history
 
         return regenerate
 
     predict = create_predict_fn()
     regenerate = create_regenerate_fn()
 
-    def add_text(
+    def add_text(chatbot_value, task_history, text):
         task_text = text
-        history = history if history is not None else []
         task_history = task_history if task_history is not None else []
-        history = history + [(_parse_text(text), None)]
         task_history = task_history + [(task_text, None)]
-
+        chatbot_messages = build_chatbot_messages(task_history)
+        return chatbot_messages, task_history, ""
 
-    def add_file(
-        history = history if history is not None else []
+    def add_file(chatbot_value, task_history, file):
         task_history = task_history if task_history is not None else []
-        history = history + [((file.name,), None)]
         task_history = task_history + [((file.name,), None)]
-
-
+        chatbot_messages = build_chatbot_messages(task_history)
+        return chatbot_messages, task_history
-    def download_url_image(url):
-        """Download URL image to local temp file"""
-        try:
-            # Use URL hash as filename to avoid duplicate downloads
-            url_hash = hashlib.md5(url.encode()).hexdigest()
-            temp_dir = tempfile.gettempdir()
-            temp_path = os.path.join(temp_dir, f"hyocr_demo_{url_hash}.png")
-
-            # Return cached file if exists
-            if os.path.exists(temp_path):
-                return temp_path
-
-            # Download image
-            response = requests.get(url, timeout=10)
-            response.raise_for_status()
-            with open(temp_path, 'wb') as f:
-                f.write(response.content)
-            return temp_path
-        except Exception as e:
-            print(f"Failed to download image: {url}, error: {e}")
-            return url  # Return original URL on failure
 
     def reset_user_input():
-        return gr.update(value=
+        return gr.update(value="")
 
-    def reset_state(
-        task_history.clear()
-        _chatbot.clear()
+    def reset_state(chatbot_value, task_history):
         _gc()
-        return []
+        return [], []
 
-    # Example image paths
+    # Example image paths
     EXAMPLE_IMAGES = {
         "spotting": "examples/spotting.jpg",
         "parsing": "examples/parsing.jpg",
         "ie": "examples/ie.jpg",
         "vqa": "examples/vqa.jpg",
-        "translation": "examples/translation.jpg"
+        "translation": "examples/translation.jpg",
     }
 
+    # Example loaders: they only touch task_history; chatbot is rebuilt via helper
+    def load_example_1(chatbot_value, task_hist):
+        prompt = "Detect and recognize all text in this image. Output the text with bounding box coordinates."
+        task_hist = [((EXAMPLE_IMAGES["spotting"],), None)]
+        chatbot_messages = build_chatbot_messages(task_hist)
+        return chatbot_messages, task_hist, prompt
+
+    def load_example_2(chatbot_value, task_hist):
+        prompt = (
+            "Extract all text from this document in markdown format. Use HTML for tables "
+            "and LaTeX for equations. Parse in reading order."
+        )
+        task_hist = [((EXAMPLE_IMAGES["parsing"],), None)]
+        chatbot_messages = build_chatbot_messages(task_hist)
+        return chatbot_messages, task_hist, prompt
+
+    def load_example_3(chatbot_value, task_hist):
+        prompt = "Extract the following fields from this receipt and return as JSON: ['total', 'subtotal', 'tax', 'date', 'items']"
+        task_hist = [((EXAMPLE_IMAGES["ie"],), None)]
+        chatbot_messages = build_chatbot_messages(task_hist)
+        return chatbot_messages, task_hist, prompt
+
+    def load_example_4(chatbot_value, task_hist):
+        prompt = "Look at this chart and answer: Which quarter had the highest revenue? What was the Sales value in Q4?"
+        task_hist = [((EXAMPLE_IMAGES["vqa"],), None)]
+        chatbot_messages = build_chatbot_messages(task_hist)
+        return chatbot_messages, task_hist, prompt
+
+    def load_example_5(chatbot_value, task_hist):
+        prompt = "Translate all text in this image to English."
+        task_hist = [((EXAMPLE_IMAGES["translation"],), None)]
+        chatbot_messages = build_chatbot_messages(task_hist)
+        return chatbot_messages, task_hist, prompt
+
+    # =========================
+    # UI
+    # =========================
     with gr.Blocks() as demo:
-        # Header
         gr.Markdown("# HunyuanOCR\n*Powered by Tencent Hunyuan Team*")
-
+
         with gr.Column():
-            # Chat area
             chatbot = gr.Chatbot(
                 label="Chat",
                 height=600,
-
+                # ❌ DO NOT PASS type=... here – this env doesn't support it
             )
+            task_history = gr.State([])
 
-            # Input panel
             with gr.Group():
                 query = gr.Textbox(
                     lines=2,
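`task_history` becomes a `gr.State([])` component instead of a plain Python list, so each browser session gets its own copy of the history. A minimal sketch of the pattern with Gradio's standard API:

```python
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chat", height=600)
    state = gr.State([])  # initial value is copied per session
    box = gr.Textbox()

    def on_submit(history, text):
        history = history + [(text, None)]
        return history, history  # feed both the Chatbot and the State

    box.submit(on_submit, inputs=[state, box], outputs=[chatbot, state])
```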
@@ -454,75 +441,78 @@ def _launch_demo(args, model, processor):
                 submit_btn = gr.Button("Send", variant="primary", scale=3)
                 regen_btn = gr.Button("Regenerate")
                 empty_bin = gr.Button("Clear")
-
-        # Examples section
-        gr.Markdown("### Quick Examples - Click to load")
 
+        gr.Markdown("### Quick Examples - Click to load")
         with gr.Row():
             example_1_btn = gr.Button("Text Detection")
             example_2_btn = gr.Button("Document Parsing")
             example_3_btn = gr.Button("Info Extraction")
             example_4_btn = gr.Button("Visual Q&A")
             example_5_btn = gr.Button("Translation")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        #
-
-
-
-
-
-
-
-
-
-
-            image_path = EXAMPLE_IMAGES["translation"]
-            history = [((image_path,), None)]
-            task_hist = [((image_path,), None)]
-            return history, task_hist, prompt
-
-        # Bind events
-        example_1_btn.click(load_example_1, [chatbot, task_history], [chatbot, task_history, query])
-        example_2_btn.click(load_example_2, [chatbot, task_history], [chatbot, task_history, query])
-        example_3_btn.click(load_example_3, [chatbot, task_history], [chatbot, task_history, query])
-        example_4_btn.click(load_example_4, [chatbot, task_history], [chatbot, task_history, query])
-        example_5_btn.click(load_example_5, [chatbot, task_history], [chatbot, task_history, query])
-
-        submit_btn.click(add_text, [chatbot, task_history, query],
-                         [chatbot, task_history]).then(predict, [chatbot, task_history], [chatbot], show_progress=True)
+
+        # Example bindings
+        example_1_btn.click(
+            load_example_1,
+            inputs=[chatbot, task_history],
+            outputs=[chatbot, task_history, query],
+        )
+        example_2_btn.click(
+            load_example_2,
+            inputs=[chatbot, task_history],
+            outputs=[chatbot, task_history, query],
+        )
+        example_3_btn.click(
+            load_example_3,
+            inputs=[chatbot, task_history],
+            outputs=[chatbot, task_history, query],
+        )
+        example_4_btn.click(
+            load_example_4,
+            inputs=[chatbot, task_history],
+            outputs=[chatbot, task_history, query],
+        )
+        example_5_btn.click(
+            load_example_5,
+            inputs=[chatbot, task_history],
+            outputs=[chatbot, task_history, query],
+        )
+
+        # Main flow
+        submit_btn.click(
+            add_text,
+            inputs=[chatbot, task_history, query],
+            outputs=[chatbot, task_history, query],
+        ).then(
+            predict,
+            inputs=[chatbot, task_history],
+            outputs=[chatbot, task_history],
+            show_progress=True,
+        )
         submit_btn.click(reset_user_input, [], [query])
-        empty_bin.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
-        regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
-        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
 
-
+        empty_bin.click(
+            reset_state,
+            inputs=[chatbot, task_history],
+            outputs=[chatbot, task_history],
+            show_progress=True,
+        )
+
+        regen_btn.click(
+            regenerate,
+            inputs=[chatbot, task_history],
+            outputs=[chatbot, task_history],
+            show_progress=True,
+        )
+
+        # Upload: pass only chatbot + state; file comes as extra arg
+        addfile_btn.upload(
+            add_file,
+            inputs=[chatbot, task_history],
+            outputs=[chatbot, task_history],
+            show_progress=True,
+        )
+
+        # Descriptive section (unchanged)
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("""
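A note on the wiring: `.click()` returns an event dependency whose `.then()` runs after the first callback finishes, so `add_text` commits the user turn before `predict` fires. The second `submit_btn.click(reset_user_input, ...)` is an independent listener on the same button (and is arguably redundant now that `add_text` already returns `""` for the textbox). The skeleton of the chain:

```python
# Same chaining pattern as above, reduced to its skeleton.
submit_btn.click(
    add_text,                      # step 1: append the user turn
    inputs=[chatbot, task_history, query],
    outputs=[chatbot, task_history, query],
).then(
    predict,                      # step 2: runs only after add_text completes
    inputs=[chatbot, task_history],
    outputs=[chatbot, task_history],
)
```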
@@ -543,17 +533,13 @@ def _launch_demo(args, model, processor):
     - **Use Cases** - OCR, document digitization, receipt recognition, translation
     """)
 
-        # Footer
         gr.Markdown("---\n*2025 Tencent Hunyuan Team. For research and educational use.*")
 
     demo.queue().launch(
         share=args.share,
         inbrowser=args.inbrowser,
-        # server_port=args.server_port,
-        # server_name=args.server_name,
     )
 
-
 def main():
     args = _get_args()
     model, processor = _load_model_processor(args)