aal-hawa committed on
Commit
5e60c7c
·
1 Parent(s): a98b0f5
Files changed (2) hide show
  1. app.py +119 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import tempfile
4
+ import os
5
+ from PIL import Image
6
+ from transformers import AutoProcessor, HunYuanVLForConditionalGeneration
7
+
8
+ # ============================================================
9
+ # HunyuanOCR – Image Text Extraction
10
+ # ============================================================
11
# Hugging Face Hub repository id passed to from_pretrained() in load_model().
MODEL_ID = "tencent/HunyuanOCR"

# Module-level cache: both stay None until load_model() has run.
processor = None
model = None
14
+
15
def clean_repeated_substrings(text, *, min_length=8000, min_repeats=10):
    """Collapse a unit that repeats back to back at the very end of *text*.

    The function looks for a trailing unit of two or more characters that
    occurs at least ``min_repeats`` times consecutively at the end of
    ``text``.  When one is found, all but the earliest copy are removed.
    Shorter candidate units are tried first.  (It is applied to decoded
    model output in ``ocr_process``; presumably it guards against a
    generation that loops until the token budget is exhausted.)

    Args:
        text: The string to inspect.
        min_length: Texts shorter than this many characters are returned
            unchanged without scanning.  Defaults to 8000.
        min_repeats: Minimum number of consecutive trailing copies of a unit
            required before it is collapsed.  Defaults to 10.

    Returns:
        ``text`` with the trailing repetition reduced to a single copy, or
        ``text`` unchanged when no qualifying repetition exists.

    Raises:
        ValueError: If ``min_repeats`` is smaller than 1.
    """
    if min_repeats < 1:
        raise ValueError("min_repeats must be at least 1")

    n = len(text)
    if n < min_length:
        return text

    # A unit that repeats min_repeats times cannot be longer than
    # n // min_repeats characters, so longer candidates are never tried.
    for length in range(2, n // min_repeats + 1):
        candidate = text[-length:]
        count = 0
        i = n - length
        # Walk backwards one unit at a time while the unit keeps matching.
        while i >= 0 and text[i:i + length] == candidate:
            count += 1
            i -= length
        if count >= min_repeats:
            # Keep the earliest copy, drop the remaining (count - 1) copies.
            return text[:n - length * (count - 1)]
    return text
29
+
30
def load_model():
    """Load the HunyuanOCR processor and model once and cache them globally.

    Returns immediately when the module-level ``model`` is already set.
    Otherwise the processor and model are fetched with ``from_pretrained``
    using ``MODEL_ID`` and the access token read from the ``HF_TOKEN``
    environment variable (``None`` when the variable is unset).  The model
    is converted to float32 and switched to eval mode.

    The globals are assigned only after everything has been set up, so a
    caller that observes ``model is not None`` never sees a model that has
    not yet had ``eval()`` applied.
    """
    global model, processor
    if model is not None:
        return

    # os is already imported at module level; getenv defaults to None.
    token = os.getenv("HF_TOKEN")
    print("Loading HunyuanOCR ...")
    loaded_processor = AutoProcessor.from_pretrained(
        MODEL_ID, use_fast=False, token=token
    )
    loaded_model = HunYuanVLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        attn_implementation="eager",
        device_map=None,
        low_cpu_mem_usage=True,
        token=token,
    ).float()  # convert all params from bfloat16 to float32 for CPU
    loaded_model.eval()

    # Publish the processor first: ``model`` is the readiness flag checked above.
    processor = loaded_processor
    model = loaded_model
    print("HunyuanOCR loaded.")
47
+
48
def ocr_process(image):
    """Run HunyuanOCR on an uploaded image and return the decoded text.

    The image is written to a temporary JPEG file, referenced from a chat
    message whose prompt (in Chinese) asks the model to detect and recognise
    the text in the picture and output the text coordinates in a formatted
    way.  The generated tokens that follow the prompt are decoded and passed
    through ``clean_repeated_substrings``.

    Args:
        image: A PIL image (the Gradio input uses ``type="pil"``), or
            ``None`` when nothing has been uploaded.

    Returns:
        The decoded model output, or the string ``"Please upload an image."``
        when ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image."

    load_model()

    # JPEG cannot store alpha or palette modes; convert those to RGB so that
    # saving does not fail for such inputs.
    if image.mode not in ("RGB", "L", "CMYK"):
        image = image.convert("RGB")

    tmp = tempfile.NamedTemporaryFile(suffix=".jpg", delete=False)
    img_path = tmp.name
    try:
        # Saving happens inside the try block so the temporary file is
        # removed even when writing the image fails.
        with tmp:
            image.save(tmp, format="JPEG")

        messages = [
            {
                "role": "system",
                "content": ""
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img_path},
                    {"type": "text", "text": "检测并识别图片中的文字,将文本坐标格式化输出。"}
                ]
            }
        ]

        text_prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Close the image's file handle before the file is deleted below.
        with Image.open(img_path) as image_input:
            inputs = processor(
                text=[text_prompt], images=[image_input],
                padding=True, return_tensors="pt"
            ).to("cpu")

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=16384, do_sample=False)

        # Strip the prompt tokens so only newly generated tokens are decoded.
        input_ids = inputs["input_ids"]
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
        ]
        output_text = clean_repeated_substrings(
            processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]
        )

        return output_text
    finally:
        # Best-effort cleanup; the file may already be gone.
        try:
            os.remove(img_path)
        except FileNotFoundError:
            pass
101
+
102
+ # ============================================================
103
+ # Gradio Interface
104
+ # ============================================================
105
# Build the single-page UI: an image input, a text output and a trigger button.
with gr.Blocks(title="HunyuanOCR") as demo:
    gr.Markdown("""
    # 📄 HunyuanOCR – Text Extraction
    Upload an image and the model will detect and extract all text with coordinates.
    """)

    image_input = gr.Image(type="pil", label="Upload Image")
    ocr_output = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
    ocr_btn = gr.Button("Extract Text", variant="primary")

    # The same handler is wired to the button click and to any change of the
    # image component; both write their result into the text box.
    ocr_btn.click(fn=ocr_process, inputs=image_input, outputs=ocr_output)
    image_input.change(fn=ocr_process, inputs=image_input, outputs=ocr_output)

# Start the server on all interfaces only when executed as a script.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/transformers.git@82a06db03535c49aa987719ed0746a76093b1ec4
2
+ torch
3
+ torchvision
4
+ gradio
5
+ accelerate
6
+ Pillow