Spaces:

IFMedTechdemo
/

PaddleOCR-VL-Demo

Running

App Files Files Community

IFMedTechdemo committed 25 days ago

Commit

ccaeb92

verified ·

1 Parent(s): fe44064

Create app.py

Browse files

Files changed (1) hide show

app.py +103 -0

app.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import gradio as gr
+import torch
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+import spaces
+# Model configuration
+MODEL_PATH = "PaddlePaddle/PaddleOCR-VL"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Task prompts
+PROMPTS = {
+    "OCR": "OCR:",
+    "Table Recognition": "Table Recognition:",
+    "Formula Recognition": "Formula Recognition:",
+    "Chart Recognition": "Chart Recognition:",
+}
+# Load model and processor
+print(f"Loading model on {DEVICE}...")
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_PATH,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
+).to(DEVICE).eval()
+processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
+print("Model loaded successfully!")
+@spaces.GPU
+def process_image(image, task):
+    """
+    Process an image with PaddleOCR-VL model.
+    Args:
+        image: PIL Image or path to image
+        task: Task type (OCR, Table Recognition, etc.)
+    Returns:
+        str: Recognition result
+    """
+    if image is None:
+        return "Please upload an image first."
+    # Convert to PIL Image if needed
+    if not isinstance(image, Image.Image):
+        image = Image.open(image)
+    image = image.convert("RGB")
+    # Get prompt for the task
+    prompt = PROMPTS.get(task, PROMPTS["OCR"])
+    # Prepare messages
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ]
+        }
+    ]
+    # Process with model
+    inputs = processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt"
+    ).to(DEVICE)
+    # Generate output
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=1024)
+    # Decode and return result
+    result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    return result
+# Create Gradio interface
+demo = gr.Interface(
+    fn=process_image,
+    inputs=[
+        gr.Image(type="pil", label="Upload Image"),
+        gr.Radio(
+            choices=list(PROMPTS.keys()),
+            value="OCR",
+            label="Task Type"
+        )
+    ],
+    outputs=gr.Textbox(label="Result", lines=10),
+    title="PaddleOCR-VL: Multilingual Document Parsing",
+    description="Upload an image and select a task. This model supports OCR in 109 languages, table recognition, formula recognition, and chart recognition.",
+    examples=[
+        ["example.png", "OCR"],
+    ] if False else None,  # Add examples if you upload sample images
+)
+if __name__ == "__main__":
+    demo.launch()