KarthiEz committed
Commit 542e67d · verified · Parent: 4dfba61

Update app.py

Files changed (1):
  1. app.py +77 -147

app.py CHANGED
@@ -83,17 +83,33 @@ def _gc():
     torch.cuda.empty_cache()
 
 
-def build_hunyuan_messages(history_messages, latest_user_text, image_path):
+def build_hunyuan_messages_from_history(history, image_path, latest_user_text):
     """
-    history_messages: list of {'role', 'content'} for past turns
-    latest_user_text: str (current user message)
-    image_path: filepath of last uploaded image (or None)
-
-    Returns: new list of messages including current user turn
+    history: list of [user_text, assistant_text] pairs from ChatInterface
+    image_path: current uploaded image file path (or None)
+    latest_user_text: current user message (str)
+    Returns: list[{"role": ..., "content": [...]}] for HunYuan
     """
-    messages = copy.deepcopy(history_messages)
+    messages = []
 
-    # Build content for the current user turn
+    # 1) Past turns (only text – image reused only for current turn)
+    for user, assistant in history:
+        # user
+        messages.append(
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": user}],
+            }
+        )
+        # assistant
+        messages.append(
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": assistant}],
+            }
+        )
+
+    # 2) Current user turn (image + text)
     content = []
     if image_path:
         content.append(
@@ -105,71 +121,26 @@ def build_hunyuan_messages(history_messages, latest_user_text, image_path):
     if latest_user_text:
         content.append({"type": "text", "text": latest_user_text})
 
-    if not content:
-        # No text, no image → don't add a turn
-        return messages
+    if content:
+        messages.append({"role": "user", "content": content})
 
-    messages.append({"role": "user", "content": content})
     return messages
 
 
-def rebuild_chat_display(history_messages):
-    """
-    Convert internal Hunyuan-like messages into classic
-    Chatbot display: list of (user_str, assistant_str) tuples.
-    """
-    chat = []
-    last_user = None
-
-    for msg in history_messages:
-        role = msg.get("role")
-        content = msg.get("content", [])
-
-        if role == "user":
-            # Collect only text pieces for display
-            if isinstance(content, list):
-                text_parts = [
-                    c.get("text", "")
-                    for c in content
-                    if isinstance(c, dict) and c.get("type") == "text"
-                ]
-                user_text = " ".join(tp for tp in text_parts if tp.strip())
-                if not user_text:
-                    user_text = "[image]"
-            else:
-                user_text = str(content)
-            last_user = user_text
-
-        elif role == "assistant":
-            if isinstance(content, list):
-                text_parts = [
-                    c.get("text", "")
-                    for c in content
-                    if isinstance(c, dict) and c.get("type") == "text"
-                ]
-                bot_text = " ".join(tp for tp in text_parts if tp.strip())
-            else:
-                bot_text = str(content)
-
-            if last_user is None:
-                last_user = ""
-            chat.append((last_user, bot_text))
-            last_user = None
-
-    return chat
-
-
 def main():
     args = _get_args()
     model, processor = _load_model_processor(args)
 
+    # -------------------------
+    # Core model call
+    # -------------------------
     @spaces.GPU(duration=120)
     def call_local_model(hy_messages):
         import time
 
         start_time = time.time()
 
-        # Hunyuan expects list[list[message]]
+        # HunYuan expects list[list[message]]
         convs = [hy_messages]
 
         texts = [
@@ -192,7 +163,7 @@ def main():
         device = "cuda" if torch.cuda.is_available() else "cpu"
         inputs = inputs.to(device)
 
-        max_new_tokens = 2048
+        max_new_tokens = 512  # keep this smaller on CPU
        with torch.no_grad():
             if device == "cuda":
                 with torch.cuda.amp.autocast(dtype=torch.bfloat16):
@@ -221,103 +192,62 @@ def main():
         return text
 
     # -------------------------
-    # Gradio UI (Blocks)
+    # Chat handler for ChatInterface
+    # -------------------------
+    def ocr_chat(message, history, image_path):
+        """
+        message: current user text (str)
+        history: list[[user, assistant], ...]
+        image_path: filepath from Image component
+        """
+        message = (message or "").strip()
+
+        if not message and not image_path:
+            return "Please upload an image and/or type a question."
+
+        hy_messages = build_hunyuan_messages_from_history(
+            history or [], image_path, message
+        )
+        answer = call_local_model(hy_messages)
+        return answer
+
+    # -------------------------
+    # UI: ChatInterface + image
     # -------------------------
     with gr.Blocks() as demo:
         gr.Markdown(
             "# HunyuanOCR\n"
-            "*Upload an image (invoice, document, receipt, notice, etc.) and ask OCR questions.*"
+            "Upload an image (invoice, document, receipt, notice, etc.) and ask OCR questions."
         )
 
         with gr.Row():
             with gr.Column(scale=2):
-                chatbot = gr.Chatbot(
-                    label="HunyuanOCR Chat",
-                    height=600,
-                    type="messages",  # ✅ explicitly say we are using messages format
-                )
-                user_input = gr.Textbox(
-                    label="Your question",
-                    placeholder="Example: Detect and recognize all text in this image.",
-                    lines=2,
-                )
-                with gr.Row():
-                    send_btn = gr.Button("Send", variant="primary")
-                    clear_btn = gr.Button("Clear Chat")
-
-            with gr.Column(scale=1):
-                image_input = gr.Image(
-                    label="Upload image",
-                    type="filepath",
-                )
-                gr.Markdown(
-                    "Tips:\n"
-                    "- Use clear, high-resolution scans.\n"
-                    "- Supported: JPG, PNG.\n"
-                    "- You can reuse the same image for multiple questions."
+                chat = gr.ChatInterface(
+                    fn=ocr_chat,
+                    chatbot=gr.Chatbot(
+                        label="HunyuanOCR Chat",
+                        height=600,
+                    ),
+                    textbox=gr.Textbox(
+                        label="Your question",
+                        placeholder="Example: Detect and recognize all text in this image.",
+                        lines=2,
+                    ),
+                    additional_inputs=[
+                        gr.Image(
+                            label="Upload image",
+                            type="filepath",
+                        )
+                    ],
+                    title=None,
+                    description=None,
                 )
 
-        # Internal states:
-        # - history_messages: list of Hunyuan-style messages
-        # - image_state: latest uploaded image path
-        history_messages = gr.State([])
-        image_state = gr.State(None)
-
-        # Handler: on image upload → just store path in state
-        def on_image_upload(img_path):
-            # img_path is already a filepath (type='filepath')
-            return img_path
-
-        image_input.upload(
-            on_image_upload,
-            inputs=image_input,
-            outputs=image_state,
-        )
-
-        # Handler: main send logic
-        def on_send(text, chat_value, history_msgs, img_path):
-            # If nothing to do, return unchanged
-            if (not text or not text.strip()) and not img_path:
-                return chat_value, history_msgs, ""
-
-            # 1) Build messages with new user turn
-            messages = build_hunyuan_messages(history_msgs, text.strip(), img_path)
-
-            # 2) Call model
-            answer = call_local_model(messages)
-
-            # 3) Append assistant turn to history
-            messages.append(
-                {
-                    "role": "assistant",
-                    "content": [{"type": "text", "text": answer}],
-                }
-            )
-
-            # 4) Return updated messages both to Chatbot and history_state
-            # Chatbot (type="messages") expects this format directly
-            return messages, messages, ""
-
-        send_btn.click(
-            on_send,
-            inputs=[user_input, chatbot, history_messages, image_state],
-            outputs=[chatbot, history_messages, user_input],
-        )
-        user_input.submit(
-            on_send,
-            inputs=[user_input, chatbot, history_messages, image_state],
-            outputs=[chatbot, history_messages, user_input],
-        )
-
-        # Clear everything
-        def on_clear():
-            _gc()
-            return [], [], None, None
-
-        clear_btn.click(
-            on_clear,
-            inputs=[],
-            outputs=[chatbot, history_messages, image_input, image_state],
+                gr.Markdown(
+                    "Tips:\n"
+                    "- Use clear, high-resolution scans.\n"
+                    "- Supported: JPG, PNG.\n"
+                    "- You can reuse the same image for multiple questions."
                 )
 
     demo.queue().launch(
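For reference, a minimal sketch of the message list the new build_hunyuan_messages_from_history helper returns for a one-turn history plus a fresh image question. The exact shape of the image entry is truncated in the first hunk (the content.append( call is cut off), so the {"type": "image", "image": path} form below is an assumption, not taken from this commit:

    # Hypothetical example; the image-entry dict shape is assumed.
    history = [["What kind of document is this?", "An invoice."]]
    msgs = build_hunyuan_messages_from_history(
        history, "/tmp/invoice.png", "Read the total amount."
    )
    # msgs ==
    # [
    #     {"role": "user", "content": [{"type": "text", "text": "What kind of document is this?"}]},
    #     {"role": "assistant", "content": [{"type": "text", "text": "An invoice."}]},
    #     {"role": "user", "content": [{"type": "image", "image": "/tmp/invoice.png"},
    #                                  {"type": "text", "text": "Read the total amount."}]},
    # ]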
 
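The rewrite drops the manual gr.State / send_btn.click plumbing in favor of gr.ChatInterface, which passes each additional_inputs component's value to fn after (message, history). A stripped-down sketch of just that wiring (the component labels are reused from the diff; the echo handler is hypothetical):

    import gradio as gr

    def echo_ocr(message, history, image_path):
        # additional_inputs values arrive after (message, history), in declaration order
        return f"text={message!r}, image={image_path!r}"

    demo = gr.ChatInterface(
        fn=echo_ocr,
        additional_inputs=[gr.Image(label="Upload image", type="filepath")],
    )
    demo.launch()

This is also why ocr_chat can simply return a string: ChatInterface appends the return value as the assistant turn and manages the history itself.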
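One note on the generation hunk: max_new_tokens drops from 2048 to 512 (the inline comment says to keep it smaller on CPU), while the torch.cuda.amp.autocast context is carried over unchanged. Recent PyTorch releases flag that spelling as deprecated in favor of torch.autocast("cuda", ...); a sketch of the same guarded generation step using the newer form (the generate kwargs follow common transformers usage and are assumptions here):

    import torch

    def generate_ids(model, inputs, max_new_tokens=512):
        # bf16 autocast on GPU, plain fp32 path on CPU, mirroring the diff's structure
        device = "cuda" if torch.cuda.is_available() else "cpu"
        with torch.no_grad():
            if device == "cuda":
                with torch.autocast("cuda", dtype=torch.bfloat16):
                    return model.generate(**inputs, max_new_tokens=max_new_tokens)
            return model.generate(**inputs, max_new_tokens=max_new_tokens)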