Spaces:

ChaseHan
/

Latex2Layout-Qwen2.5VL

Runtime error

App Files Files Community

ChaseHan committed on Jul 15, 2025

Commit

6783e7d

verified ·

1 Parent(s): 9ea5d2c

Update app.py

Browse files

Files changed (1) hide show

app.py +215 -59

app.py CHANGED Viewed

@@ -1,64 +1,220 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
 )
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import torch
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from PIL import Image, ImageDraw, ImageFont
+import json
+import re
+from spaces import GPU
+# --- 1. Configurations and Constants ---
+# Hugging Face model repository that both the model weights and the processor are loaded from
+MODEL_ID = "ChaseHan/Latex2Layout-2000-sync"
+# Every input image is resized to this fixed (width, height) before inference
+TARGET_SIZE = (924, 1204)
+# Outline thickness, in pixels, of each drawn region rectangle
+OUTLINE_WIDTH = 3
+# RGBA fill colors per layout label; the RGB part is reused as the opaque outline/tag color
+LABEL_COLORS = {
+    "title": (255, 82, 82, 90),        # Red
+    "abstract": (46, 204, 113, 90),    # Green
+    "heading": (52, 152, 219, 90),     # Blue
+    "footnote": (241, 196, 15, 90),    # Yellow
+    "figure": (155, 89, 182, 90),      # Purple
+    "figure caption": (26, 188, 156, 90),  # Teal
+    "table": (230, 126, 34, 90),       # Orange
+    "table caption": (44, 62, 80, 90), # Dark Blue/Gray
+    "math": (231, 76, 60, 90),         # Pomegranate
+    "text": (149, 165, 166, 90),       # Gray
+    "other": (127, 140, 141, 90)       # Light Gray (fallback for unknown labels)
+}
+# Default prompt for layout detection.
+# NOTE(review): the ```json fences around the example payload were restored from an
+# extraction-damaged copy of this file (they had been replaced by bare line breaks);
+# confirm the exact text against the prompt used during fine-tuning.
+PROMPT_GROUNDING = (
+    """<image>Please carefully observe the document and detect the following regions: "title", "abstract", "heading", "footnote", "figure", "figure caption", "table", "table caption", "math", "text". Output each detected region's bbox coordinates in JSON format. The format of the output is: <answer>```json[{"bbox_2d": [x1, y1, x2, y2], "label": "region name", "order": "reading order"}]```</answer>."""
 )
+# --- 2. Load Model and Processor ---
+# Load the model and processor once at import time so every request reuses them.
+print("Loading model and processor, this may take a moment...")
+try:
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.float16,
+        device_map="auto"
+    )
+    processor = AutoProcessor.from_pretrained(MODEL_ID)
+    print("Model loaded successfully!")
+except Exception as e:
+    print(f"Error loading model: {e}")
+    # Exit with a non-zero status so the failure is visible to the host;
+    # the bare `exit()` helper reports success (0) and is only present when
+    # the `site` module is loaded.
+    raise SystemExit(1) from e
+# --- 3. Core Inference and Visualization Function ---
+# Matches the fenced ```json ... ``` payload inside the model's answer.
+_JSON_FENCE_RE = re.compile(r"```json(.*?)```", re.DOTALL)
+def _reading_order_key(region):
+    """Return a sort key for a region dict; regions without a numeric "order" sort last."""
+    try:
+        return (0, float(region.get("order")))
+    except (TypeError, ValueError):
+        # Missing or non-numeric order: keep the region, but after all ordered ones.
+        return (1, 0.0)
+def _normalized_bbox(raw_bbox):
+    """Return [x0, y0, x1, y1] with x0 <= x1 and y0 <= y1, or None if the bbox is unusable."""
+    if not isinstance(raw_bbox, (list, tuple)) or len(raw_bbox) != 4:
+        return None
+    try:
+        x_a, y_a, x_b, y_b = (float(value) for value in raw_bbox)
+    except (TypeError, ValueError):
+        return None
+    # ImageDraw.rectangle rejects boxes whose second corner precedes the first.
+    return [min(x_a, x_b), min(y_a, y_b), max(x_a, x_b), max(y_a, y_b)]
+@GPU
+def analyze_and_visualize_layout(input_image: Image.Image, prompt: str = PROMPT_GROUNDING, temperature: float = 1.0, top_p: float = 0.95, progress=gr.Progress(track_tqdm=True)):
+    """
+    Resize the input image, run model inference, and draw the detected layout regions.
+
+    Args:
+        input_image: Page image to analyze; ``None`` yields a prompt to upload one.
+        prompt: Text instruction sent to the model together with the image.
+        temperature: Sampling temperature; sampling is disabled when it is not > 0.
+        top_p: Nucleus-sampling threshold, only used when sampling is enabled.
+        progress: Gradio progress tracker used to report the current stage.
+
+    Returns:
+        A ``(image, text)`` tuple. On success the image is the resized page with the
+        regions drawn on it and the text is the raw model output. If the output
+        cannot be parsed as a JSON list of regions, the image is the plain resized
+        page and the text is an error message containing the raw output.
+    """
+    if input_image is None:
+        return None, "Please upload an image first."
+    progress(0, desc="Resizing image...")
+    # The image is always scaled to TARGET_SIZE; boxes are drawn on this resized copy.
+    image = input_image.resize(TARGET_SIZE)
+    image = image.convert("RGBA")  # For transparent drawing
+    messages = [
+        {"role": "user", "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": prompt}
+        ]}
+    ]
+    progress(0.2, desc="Preparing model inputs...")
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model.device)
+    progress(0.5, desc="Generating layout data...")
+    with torch.no_grad():
+        generate_kwargs = {
+            "max_new_tokens": 4096,
+            "do_sample": temperature > 0,  # Enable sampling if temperature > 0
+            "temperature": temperature if temperature > 0 else None,
+            "top_p": top_p if temperature > 0 else None,
+        }
+        output_ids = model.generate(**inputs, **generate_kwargs)
+    # Decode only the newly generated tokens, skipping the echoed prompt.
+    output_text = processor.batch_decode(
+        output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
+    )[0]
+    progress(0.8, desc="Parsing and visualizing results...")
+    parse_failure = (
+        image.convert("RGB"),
+        f"Failed to parse JSON from model output:\n\n{output_text}",
+    )
+    # Prefer the fenced ```json block; fall back to treating the whole output as JSON.
+    fence_match = _JSON_FENCE_RE.search(output_text)
+    json_str = (fence_match.group(1) if fence_match else output_text).strip()
+    try:
+        results = json.loads(json_str)
+    except json.JSONDecodeError:
+        return parse_failure
+    if isinstance(results, dict):
+        # A single region emitted without the enclosing list.
+        results = [results]
+    if not isinstance(results, list):
+        return parse_failure
+    regions = [region for region in results if isinstance(region, dict)]
+    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
+    draw = ImageDraw.Draw(overlay)
+    try:
+        font = ImageFont.truetype("Arial.ttf", 15)
+    except IOError:
+        font = ImageFont.load_default()
+    for item in sorted(regions, key=_reading_order_key):
+        bbox = _normalized_bbox(item.get("bbox_2d"))
+        if bbox is None:
+            continue
+        label = str(item.get("label", "other"))
+        order = item.get("order", "")
+        fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
+        solid_color_rgb = fill_color_rgba[:3]
+        draw.rectangle(bbox, fill=fill_color_rgba, outline=solid_color_rgb, width=OUTLINE_WIDTH)
+        # Opaque tag in the top-left corner of the box showing "<order>: <label>".
+        tag_text = f"{order}: {label}"
+        tag_bbox = draw.textbbox((0, 0), tag_text, font=font)
+        tag_w, tag_h = tag_bbox[2] - tag_bbox[0], tag_bbox[3] - tag_bbox[1]
+        tag_bg_box = [bbox[0], bbox[1], bbox[0] + tag_w + 10, bbox[1] + tag_h + 6]
+        draw.rectangle(tag_bg_box, fill=solid_color_rgb)
+        draw.text((bbox[0] + 5, bbox[1] + 3), tag_text, font=font, fill="white")
+    visualized_image = Image.alpha_composite(image, overlay).convert("RGB")
+    return visualized_image, output_text
+def clear_outputs():
+    """Return (None, None) so both wired outputs (result image and raw-output textbox) are blanked."""
+    return None, None
+# --- 4. Gradio User Interface ---
+# Build the UI: input image and advanced settings on the left, annotated result on the right.
+with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
+    gr.Markdown("# 📄 Academic Paper Layout Detection")
+    gr.Markdown(
+        "Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model fine-tuned on our Latex2Layout annotated layout dataset to identify layout regions in academic papers. "
+        "Upload a document image to begin."
+        # Tell users that uploads are rescaled to the fixed TARGET_SIZE
+        "\n> **Please note:** All uploaded images are automatically resized to 924x1204 pixels to meet the model's input requirements."
+    )
+    gr.Markdown("<hr>")
+    with gr.Row():
+        with gr.Column(scale=4):
+            input_image = gr.Image(type="pil", label="Upload Document Image", height=700)
+            with gr.Accordion("Advanced Settings", open=False):
+                prompt_input = gr.Textbox(
+                    value=PROMPT_GROUNDING,
+                    label="Custom Prompt",
+                    lines=5,
+                    info="Edit the prompt sent to the model. Changes may affect output format."
+                )
+                temperature_input = gr.Slider(
+                    minimum=0.0,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="Temperature",
+                    info="Controls randomness (higher = more creative, 0 = deterministic)."
+                )
+                top_p_input = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.95,
+                    step=0.05,
+                    label="Top-P",
+                    info="Nucleus sampling: considers the top p% probability mass."
+                )
+        with gr.Column(scale=5):
+            output_image = gr.Image(type="pil", label="Analyzed Layout", interactive=False, height=700)
+    with gr.Row():
+        analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)
+    output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)
+    # Each example row supplies a value for every component in `inputs`.
+    # With only the image given, cached examples would call the function
+    # without its prompt/temperature/top_p arguments.
+    gr.Examples(
+        examples=[
+            [example_path, PROMPT_GROUNDING, 1.0, 0.95]
+            for example_path in ("page_2.png", "page_3.png", "page_5.png", "page_13.png")
+        ],
+        inputs=[input_image, prompt_input, temperature_input, top_p_input],
+        outputs=[output_image, output_text],
+        fn=analyze_and_visualize_layout,
+        label="Examples (Click to Run)",
+        cache_examples=True
+    )
+    gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>")
+    # --- Event Handlers ---
+    analyze_btn.click(
+        fn=analyze_and_visualize_layout,
+        inputs=[input_image, prompt_input, temperature_input, top_p_input],
+        outputs=[output_image, output_text]
+    )
+    # Drop stale results whenever the input image is replaced or removed.
+    input_image.upload(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
+    input_image.clear(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
+# --- 5. Launch the Application ---
 if __name__ == "__main__":
+    demo.launch()