IFMedTechdemo committed on
Commit
ccaeb92
·
verified ·
1 Parent(s): fe44064

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from PIL import Image
4
+ from transformers import AutoModelForCausalLM, AutoProcessor
5
+ import spaces
6
+
7
# Model configuration
# Identifier passed to the `from_pretrained` loaders in this file.
MODEL_PATH = "PaddlePaddle/PaddleOCR-VL"

# Default to the CPU and switch to CUDA only when torch reports it as available.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Task prompts
# Maps the task label shown in the UI to the prompt text sent to the model;
# each prompt is simply the label followed by a colon.
PROMPTS = {
    label: f"{label}:"
    for label in (
        "OCR",
        "Table Recognition",
        "Formula Recognition",
        "Chart Recognition",
    )
}
18
+
19
# Load model and processor
# Both objects are created once at import time and reused by every request.
print(f"Loading model on {DEVICE}...")

# NOTE(review): trust_remote_code=True lets transformers run Python code
# shipped with the model repository -- confirm the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, trust_remote_code=True, torch_dtype=torch.bfloat16
)
model = model.to(DEVICE)  # move the weights to the selected device
model = model.eval()  # switch to inference mode (eval() returns the module)

processor = AutoProcessor.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
)
print("Model loaded successfully!")
29
+
30
@spaces.GPU
def process_image(image, task):
    """
    Run the PaddleOCR-VL model on one image and return the recognized text.

    Args:
        image: PIL Image, or anything ``PIL.Image.open`` accepts (e.g. a path).
            ``None`` means nothing was uploaded.
        task: Key of ``PROMPTS`` selecting the recognition task. Unknown
            values fall back to the "OCR" prompt.

    Returns:
        str: The text decoded from the newly generated tokens only (the
        prompt tokens are removed before decoding), or a hint message when
        no image was supplied.
    """
    if image is None:
        return "Please upload an image first."

    # Normalize the input to an RGB PIL image.
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    else:
        # Opened from a path/file: convert inside the context manager so the
        # underlying file handle is released right away instead of lingering
        # until garbage collection. convert() returns an independent copy.
        with Image.open(image) as opened:
            image = opened.convert("RGB")

    # Get prompt for the task (default to plain OCR for unknown task names).
    prompt = PROMPTS.get(task, PROMPTS["OCR"])

    # Single-turn chat: one user message carrying the image and the prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Tokenize through the chat template and move the tensors to the device.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(DEVICE)

    # Generate output without tracking gradients.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=1024)

    # generate() returns prompt + continuation when called with input_ids;
    # drop the prompt so the chat-template text is not echoed in the result.
    prompt_length = inputs["input_ids"].shape[-1]
    generated = outputs[:, prompt_length:]

    # Decode and return the (single) sequence of the batch.
    return processor.batch_decode(generated, skip_special_tokens=True)[0]
82
+
83
# Create Gradio interface
# Two inputs (the image and the task selector) feed process_image; its string
# result is shown in a multi-line textbox.
demo = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Radio(
            choices=list(PROMPTS),  # task labels, in PROMPTS order
            value="OCR",
            label="Task Type",
        ),
    ],
    outputs=gr.Textbox(label="Result", lines=10),
    title="PaddleOCR-VL: Multilingual Document Parsing",
    description="Upload an image and select a task. This model supports OCR in 109 languages, table recognition, formula recognition, and chart recognition.",
    # No sample images are bundled yet, so no examples are shown. After
    # uploading one (e.g. example.png), use: examples=[["example.png", "OCR"]]
    examples=None,
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()