ranbac committed on
Commit
9f6817b
·
verified ·
1 Parent(s): 819108a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -213
app.py CHANGED
@@ -1,35 +1,34 @@
1
  import os
2
- import base64
3
- import io
4
- import json
 
 
 
 
5
  import logging
6
  import re
7
- import cv2
8
- import numpy as np
9
- import requests
10
- from PIL import Image, ImageDraw, ImageFont
11
  import gradio as gr
12
  from paddleocr import PaddleOCR
 
 
 
13
 
14
- # --- PHẦN 1: CẤU HÌNH & KHỞI TẠO PADDLEOCR (LOCAL ENGINE) ---
15
- os.environ["FLAGS_use_mkldnn"] = "0"
16
- os.environ["FLAGS_enable_mkldnn"] = "0"
17
- os.environ["CPP_MIN_LOG_LEVEL"] = "3"
18
  logging.getLogger("ppocr").setLevel(logging.WARNING)
19
 
20
- print("🚀 Đang khởi tạo PaddleOCR Local...")
 
21
  try:
22
- # Cấu hình OCR Local
23
- ocr = PaddleOCR(use_textline_orientation=True,
24
- use_doc_orientation_classify=False,
25
- use_doc_unwarping=False,
26
- lang='ch') # Có thể đổi sang 'en' hoặc 'vi'
27
  except Exception as e:
28
- print(f"⚠️ Lỗi khởi tạo nâng cao: {e}. Dùng chế độ mặc định.")
29
  ocr = PaddleOCR(lang='ch')
30
- print("✅ Model đã sẵn sàng!")
31
 
32
- # Tải Font để vẽ chữ (Từ Phần 1)
 
 
33
  def check_and_download_font():
34
  font_path = "./simfang.ttf"
35
  if not os.path.exists(font_path):
@@ -44,19 +43,15 @@ def check_and_download_font():
44
 
45
  FONT_PATH = check_and_download_font()
46
 
47
- # --- HELPER FUNCTIONS (HỖ TRỢ XỬ LÝ ẢNH & TEXT) ---
48
-
49
def pil_to_base64_html(image):
    """Render a PIL image as an inline, base64-encoded HTML ``<img>`` tag.

    The image is serialized as JPEG into an in-memory buffer and embedded
    in a ``data:image/jpeg;base64,...`` URI.

    Args:
        image: A PIL image (anything exposing ``mode``, ``convert`` and
            ``save(fp, format=...)``).

    Returns:
        The ``<img>`` tag as a string.
    """
    # JPEG has no alpha channel or palette: saving e.g. an RGBA or P image
    # raises OSError, so such images are converted to RGB first. Modes the
    # JPEG writer already accepts are saved untouched.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_modes:
        image = image.convert("RGB")

    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f'<img src="data:image/jpeg;base64,{img_str}" alt="Result" style="width:100%; object-fit:contain;">'
55
-
56
  def universal_draw(image, raw_data, font_path):
57
- """Hàm vẽ box lên ảnh (Từ Phần 1)"""
58
  if image is None: return image
59
- if isinstance(image, np.ndarray): image = Image.fromarray(image)
 
 
 
 
 
60
  canvas = image.copy()
61
  draw = ImageDraw.Draw(canvas)
62
 
@@ -66,198 +61,161 @@ def universal_draw(image, raw_data, font_path):
66
  except:
67
  font = ImageFont.load_default()
68
 
69
- boxes = [line[0] for line in raw_data[0]] if raw_data and raw_data[0] else []
70
- txts = [line[1][0] for line in raw_data[0]] if raw_data and raw_data[0] else []
71
- scores = [line[1][1] for line in raw_data[0]] if raw_data and raw_data[0] else []
72
-
73
- for box, txt in zip(boxes, txts):
74
- box = [tuple(p) for p in box]
75
- draw.polygon(box, outline="red", width=3)
76
- # Vẽ nền chữ
77
- if hasattr(draw, "textbbox"):
78
- text_bbox = draw.textbbox(box[0], txt, font=font, anchor="lb")
79
- draw.rectangle(text_bbox, fill="red")
80
- draw.text(box[0], txt, fill="white", font=font, anchor="lb")
81
- else:
82
- draw.text((box[0][0], box[0][1] - font_size), txt, fill="white", font=font)
83
- return canvas
84
-
85
- # --- HÀM XỬ LÝ CHÍNH (LOGIC CẦU NỐI) ---
86
- # Hàm này nhận input từ UI Phần 2, chạy Logic Phần 1, trả về format UI Phần 2
87
-
88
def local_inference(image_path, mode="Document"):
    """Run the local OCR engine on one image and format the result for the UI.

    Args:
        image_path: Path of the image to open. A falsy value returns an
            "upload an image" prompt without running OCR.
        mode: ``"Formula"`` and ``"Table"`` select their own Markdown
            heading and note; any other value is rendered as a generic
            document.

    Returns:
        A 3-tuple ``(markdown_text, html_visualization, raw_text)``. On
        success ``raw_text`` is the JSON dump of the first OCR result; on
        failure it is the formatted traceback and the first two items
        carry the error message.
    """
    if not image_path:
        return "Please upload an image.", "", ""

    try:
        # 1. Load the image.
        img = Image.open(image_path).convert("RGB")
        img_np = np.array(img)

        # 2. Run the local PaddleOCR model. Per the original author's note,
        #    the local model only yields plain text, not table/formula
        #    structure.
        result = ocr.ocr(img_np, cls=True)

        # 3. Nothing recognized.
        if not result or result[0] is None:
            return "No text found.", "<p>No text detected</p>", "[]"

        # Visualization: draw the boxes, then embed the picture as HTML.
        annotated_img = universal_draw(img, result, FONT_PATH)
        html_vis = pil_to_base64_html(annotated_img)

        # Markdown output built from the recognized strings.
        texts = [line[1][0] for line in result[0]]

        if mode == "Formula":
            md_text = "### Recognized Formula (Raw Text):\n\n" + " ".join(texts)
            md_text += "\n\n*(Note: Local generic OCR model cannot convert to LaTeX math syntax)*"
        elif mode == "Table":
            md_text = "### Recognized Table Content:\n\n" + "\n".join(texts)
            md_text += "\n\n*(Note: Local generic OCR model does not reconstruct HTML structure)*"
        else:  # Document / generic
            md_text = "### Document Content:\n\n" + "\n".join(texts)

        # Raw data as a JSON string, for debugging.
        raw_json = json.dumps(result[0], ensure_ascii=False, indent=2)

        return md_text, html_vis, raw_json

    except Exception as e:
        import html
        import traceback
        err = traceback.format_exc()
        message = str(e)
        # The message can contain caller-supplied text (e.g. the file name),
        # so it is escaped before being placed inside HTML markup.
        return f"Error: {message}", f"<p style='color:red'>{html.escape(message)}</p>", err
132
-
133
- # Wrapper cho các Tab khác nhau
134
def run_doc_parsing(file, *args):
    """Run OCR on *file* in generic document mode.

    Extra positional arguments are accepted and ignored.
    """
    document_mode = "Document"
    return local_inference(file, mode=document_mode)
136
 
137
def run_element_recognition(file, prompt_label, *args):
    """Run OCR on *file* using the mode named by the first word of *prompt_label*.

    For example ``"Formula Recognition"`` selects mode ``"Formula"``.
    Extra positional arguments are accepted and ignored.
    """
    words = prompt_label.split()
    return local_inference(file, mode=words[0])
141
-
142
def run_spotting(file, *args):
    """Emulate text spotting: return an annotated image and the boxes as JSON.

    Returns a 2-tuple ``(html_visualization, json_text)``. A falsy *file*
    yields ``("", "{}")``; an empty OCR result yields a "no objects"
    paragraph and ``"[]"``. Extra positional arguments are ignored.
    """
    if not file:
        return "", "{}"

    picture = Image.open(file).convert("RGB")
    ocr_output = ocr.ocr(np.array(picture), cls=True)

    nothing_found = not ocr_output or ocr_output[0] is None
    if nothing_found:
        return "<p>No objects found</p>", "[]"

    html_vis = pil_to_base64_html(universal_draw(picture, ocr_output, FONT_PATH))

    # Reshape each OCR line into a spotting-style record.
    records = [
        {
            "label": "text_block",
            "text": entry[1][0],
            "confidence": entry[1][1],
            "box": entry[0],
        }
        for entry in ocr_output[0]
    ]

    return html_vis, json.dumps(records, ensure_ascii=False, indent=2)
166
-
167
-
168
- # --- PHẦN 2: GIAO DIỆN (UI TỪ FILE 2) ---
169
- custom_css = """
170
- body, .gradio-container { font-family: "Noto Sans SC", sans-serif; }
171
- .app-header { text-align: center; margin-bottom: 20px; }
172
- .prompt-grid { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 6px; }
173
- .prompt-grid button { height: 40px !important; }
174
- .notice { background: #f0f9ff; padding: 10px; border-radius: 8px; border: 1px solid #bae6fd; font-size: 14px; margin-bottom: 10px;}
175
- """
176
-
177
- with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
178
- gr.HTML("""
179
- <div class="app-header">
180
- <h1>PaddleOCR Local - Pro Interface</h1>
181
- <p>Giao diện nâng cao chạy trên Backend Local (CPU)</p>
182
- </div>
183
- <div class="notice">
184
- <strong>Lưu ý:</strong> Đây là phiên bản chạy model Local.
185
- Các tính năng như <em>Formula to Latex</em>, <em>Table to HTML</em> hay <em>Layout Analysis</em>
186
- chỉ trả về văn bản thô (Raw OCR) do giới hạn của model cài đặt cục bộ.
187
- </div>
188
- """)
189
-
190
- with gr.Tabs():
191
- # ===================== Tab 1: Document Parsing =====================
192
- with gr.Tab("Document Parsing"):
193
- with gr.Row():
194
- with gr.Column(scale=5):
195
- file_doc = gr.File(label="Upload Image", type="filepath", file_types=["image"])
196
- btn_parse = gr.Button("Parse Document", variant="primary")
197
- # Các tùy chọn checkbox (Dummy - vì local model config đơn giản)
198
- with gr.Row():
199
- gr.Checkbox(label="Chart parsing (N/A)", value=False, interactive=False)
200
- gr.Checkbox(label="Doc unwarping (N/A)", value=False, interactive=False)
201
-
202
- with gr.Column(scale=7):
203
- with gr.Tabs():
204
- with gr.Tab("Markdown Preview"):
205
- md_preview_doc = gr.Markdown()
206
- with gr.Tab("Visualization"):
207
- vis_image_doc = gr.HTML()
208
- with gr.Tab("Raw Data"):
209
- raw_doc = gr.Code(language="json")
210
-
211
- btn_parse.click(run_doc_parsing, inputs=[file_doc], outputs=[md_preview_doc, vis_image_doc, raw_doc])
212
-
213
- # ===================== Tab 2: Element-level Recognition =====================
214
- with gr.Tab("Element-level Recognition"):
215
- with gr.Row():
216
- with gr.Column(scale=5):
217
- file_vl = gr.File(label="Upload Image", type="filepath", file_types=["image"])
218
- gr.Markdown("_(Chế độ này tối ưu cho từng thành phần riêng lẻ)_")
219
-
220
- with gr.Row(elem_classes=["prompt-grid"]):
221
- btn_ocr = gr.Button("Text Recognition", variant="secondary")
222
- btn_formula = gr.Button("Formula Recognition", variant="secondary")
223
- with gr.Row(elem_classes=["prompt-grid"]):
224
- btn_table = gr.Button("Table Recognition", variant="secondary")
225
- btn_seal = gr.Button("Seal Recognition", variant="secondary")
226
-
227
- with gr.Column(scale=7):
228
- with gr.Tabs():
229
- with gr.Tab("Result"):
230
- md_preview_vl = gr.Markdown()
231
- with gr.Tab("Visualization"):
232
- vis_image_vl = gr.HTML()
233
- with gr.Tab("Raw Output"):
234
- md_raw_vl = gr.Code(language="json")
235
-
236
- # Gán sự kiện cho các nút
237
- for btn, label in [(btn_ocr, "Text"), (btn_formula, "Formula"), (btn_table, "Table"), (btn_seal, "Seal")]:
238
- btn.click(
239
- fn=run_element_recognition,
240
- inputs=[file_vl, gr.State(label)],
241
- outputs=[md_preview_vl, vis_image_vl, md_raw_vl]
242
- )
243
-
244
- # ===================== Tab 3: Spotting =====================
245
- with gr.Tab("Spotting"):
246
- with gr.Row():
247
- with gr.Column(scale=5):
248
- file_spot = gr.File(label="Upload Image", type="filepath", file_types=["image"])
249
- btn_run_spot = gr.Button("Run Spotting", variant="primary")
250
- gr.Markdown("_(Phát hiện vị trí văn bản)_")
251
-
252
- with gr.Column(scale=7):
253
- with gr.Tabs():
254
- with gr.Tab("Visualization"):
255
- vis_image_spot = gr.HTML()
256
- with gr.Tab("JSON Result"):
257
- json_spot = gr.Code(language="json")
258
-
259
- btn_run_spot.click(run_spotting, inputs=[file_spot], outputs=[vis_image_spot, json_spot])
260
 
261
  if __name__ == "__main__":
262
- demo.queue().launch(server_name="0.0.0.0", server_port=7860,
263
- ssr_mode=False)
 
1
  import os
2
+
3
+ # --- CẤU HÌNH HỆ THỐNG ---
4
+ os.environ["FLAGS_use_mkldnn"] = "0"
5
+ os.environ["FLAGS_enable_mkldnn"] = "0"
6
+ os.environ["DN_ENABLE_MKLDNN"] = "0"
7
+ os.environ["CPP_MIN_LOG_LEVEL"] = "3"
8
+
9
  import logging
10
  import re
 
 
 
 
11
  import gradio as gr
12
  from paddleocr import PaddleOCR
13
+ from PIL import Image, ImageDraw, ImageFont
14
+ import numpy as np
15
+ import requests
16
 
17
+ # Tắt log thừa
 
 
 
18
  logging.getLogger("ppocr").setLevel(logging.WARNING)
19
 
20
+ print("Đang khởi tạo PaddleOCR (Coordinate Sync Mode)...")
21
+
22
  try:
23
+ ocr = PaddleOCR(use_textline_orientation=True, use_doc_orientation_classify=False,
24
+ use_doc_unwarping=False, lang='ch')
 
 
 
25
  except Exception as e:
26
+ print(f"Lỗi khởi tạo: {e}. Chuyển về chế độ mặc định.")
27
  ocr = PaddleOCR(lang='ch')
 
28
 
29
+ print("Model đã sẵn sàng!")
30
+
31
+ # --- TẢI FONT ---
32
  def check_and_download_font():
33
  font_path = "./simfang.ttf"
34
  if not os.path.exists(font_path):
 
43
 
44
  FONT_PATH = check_and_download_font()
45
 
46
+ # --- HÀM VẼ ĐA NĂNG ---
 
 
 
 
 
 
 
 
47
  def universal_draw(image, raw_data, font_path):
 
48
  if image is None: return image
49
+
50
+ # Đảm bảo image là PIL
51
+ if isinstance(image, np.ndarray):
52
+ image = Image.fromarray(image)
53
+
54
+ # Copy để vẽ
55
  canvas = image.copy()
56
  draw = ImageDraw.Draw(canvas)
57
 
 
61
  except:
62
  font = ImageFont.load_default()
63
 
64
+ # Hàm parse box
65
def parse_box(b):
    """Normalize one detection box into a list of ``(x, y)`` vertex tuples.

    Accepted shapes:
      * a sequence of points (each a list or tuple), returned point by
        point as tuples;
      * four numbers ``[x1, y1, x2, y2]``, expanded into the four corners
        of the axis-aligned rectangle;
      * any object with ``tolist()`` (e.g. a NumPy array) holding one of
        the above.

    Returns:
        The list of vertex tuples, or ``None`` when the input matches none
        of the shapes or cannot be inspected.
    """
    try:
        if hasattr(b, 'tolist'):
            b = b.tolist()
        if len(b) == 0:
            return None

        first = b[0]
        # Points may arrive as tuples as well as lists; both are polygons.
        if isinstance(first, (list, tuple)):
            return [tuple(p) for p in b]
        if len(b) == 4 and isinstance(first, (int, float)):
            x1, y1, x2, y2 = b
            return [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
        return None
    except Exception:
        # Best effort: an unparseable box is skipped by the caller. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ items_to_draw = []
 
 
 
75
 
76
+ # Logic tìm box/text
77
+ # Ưu tiên cấu trúc PaddleX: rec_texts + dt_polys
78
+ processed = False
79
+ if isinstance(raw_data, list) and len(raw_data) > 0 and isinstance(raw_data[0], dict):
80
+ data_dict = raw_data[0]
81
+ texts = data_dict.get('rec_texts')
82
+ boxes = data_dict.get('dt_polys', data_dict.get('rec_polys', data_dict.get('dt_boxes')))
83
+
84
+ if texts and boxes and isinstance(texts, list) and isinstance(boxes, list):
85
+ for i in range(min(len(texts), len(boxes))):
86
+ txt = texts[i]
87
+ box = parse_box(boxes[i])
88
+ if box and txt: items_to_draw.append((box, txt))
89
+ processed = True
90
+
91
+ # Fallback Logic
92
+ if not processed:
93
+ def hunt(data):
94
+ if isinstance(data, dict):
95
+ box = None; text = None
96
+ for k in ['points', 'box', 'dt_boxes', 'poly']:
97
+ if k in data: box = parse_box(data[k]); break
98
+ for k in ['transcription', 'text', 'rec_text', 'label']:
99
+ if k in data: text = data[k]; break
100
+ if box and text: items_to_draw.append((box, text)); return
101
+ for v in data.values(): hunt(v)
102
+ elif isinstance(data, (list, tuple)):
103
+ if len(data) == 2 and isinstance(data[0], list) and len(data[0]) == 4:
104
+ box = parse_box(data[0])
105
+ txt_obj = data[1]
106
+ text = txt_obj[0] if isinstance(txt_obj, (list, tuple)) else txt_obj
107
+ if box and isinstance(text, str): items_to_draw.append((box, text)); return
108
+ for item in data: hunt(item)
109
+ hunt(raw_data)
110
+
111
+ # Vẽ
112
+ for box, txt in items_to_draw:
113
+ try:
114
+ # Vẽ khung đỏ
115
+ draw.polygon(box, outline="red", width=3)
116
+ # Vẽ chữ
117
+ txt_x, txt_y = box[0]
118
+ if hasattr(draw, "textbbox"):
119
+ text_bbox = draw.textbbox((txt_x, txt_y), txt, font=font, anchor="lb")
120
+ draw.rectangle(text_bbox, fill="red")
121
+ draw.text((txt_x, txt_y), txt, fill="white", font=font, anchor="lb")
122
+ else:
123
+ draw.text((txt_x, txt_y - font_size), txt, fill="white", font=font)
124
+ except: continue
125
 
126
+ return canvas
 
 
127
 
128
+ # --- HÀM XỬ LÝ TEXT ---
129
def deep_extract_text(data):
    """Collect every non-blank string reachable from *data*, depth-first.

    Lists and tuples are walked element by element, dicts by their values
    (keys are not collected), and any other object exposing ``__dict__``
    through that attribute dict. Strings are returned unmodified; strings
    that are empty or whitespace-only are dropped. Everything else is
    ignored.

    Args:
        data: Arbitrarily nested structure to search.

    Returns:
        The strings found, in traversal order.
    """
    found_texts = []
    # ids of the containers on the current recursion path. Only true cycles
    # are cut: a container referenced twice from different places is still
    # visited twice.
    active = set()

    def walk(node):
        if isinstance(node, str):
            if node.strip():
                found_texts.append(node)
            return

        if isinstance(node, (list, tuple)):
            children = node
        elif isinstance(node, dict):
            children = node.values()
        elif hasattr(node, '__dict__'):
            children = (node.__dict__,)
        else:
            return

        marker = id(node)
        if marker in active:
            # Self-reference: without this guard the walk never terminates.
            return
        active.add(marker)
        try:
            for child in children:
                walk(child)
        finally:
            active.discard(marker)

    walk(data)
    return found_texts
140
+
141
def clean_text_result(text_list):
    """Strip each string and drop the ones that look like noise.

    A stripped string is discarded when it:
      * has fewer than two characters and none in U+4E00..U+9FFF;
      * ends (case-insensitively) with .ttf, .json, .pdparams, .yml or .log;
      * equals (case-insensitively) one of a fixed set of metadata words;
      * contains no word character at all.

    Returns:
        The surviving stripped strings, in input order.
    """
    noise_words = ('min', 'max', 'general', 'header', 'footer', 'structure')
    file_suffixes = ('.ttf', '.json', '.pdparams', '.yml', '.log')

    def is_meaningful(candidate):
        if len(candidate) < 2:
            has_cjk = any(u'\u4e00' <= ch <= u'\u9fff' for ch in candidate)
            if not has_cjk:
                return False
        lowered = candidate.lower()
        if lowered.endswith(file_suffixes):
            return False
        if lowered in noise_words:
            return False
        return re.search(r'[\w\u4e00-\u9fff]', candidate) is not None

    kept = []
    for raw in text_list:
        trimmed = raw.strip()
        if is_meaningful(trimmed):
            kept.append(trimmed)
    return kept
152
+
153
+ # --- MAIN PREDICT ---
154
def predict(image):
    """Run OCR on one image and build the three UI outputs.

    Args:
        image: A PIL image, or an array accepted by ``Image.fromarray``.
            ``None`` returns a "no image" placeholder.

    Returns:
        A 3-tuple ``(annotated_image, text_output, debug_info)``. If any
        step raises, the tuple is ``(image, error_message, traceback)``
        with the input image passed back unchanged.
    """
    if image is None:
        return None, "Chưa có ảnh.", "No Data"

    try:
        # Keep an untouched PIL copy of the input for drawing.
        original_pil = image.copy() if isinstance(image, Image.Image) else Image.fromarray(image).copy()
        image_np = np.array(image)

        # 1. OCR
        raw_result = ocr.ocr(image_np)

        # 2. Choose the image to draw on. When the result carries a
        #    preprocessed image, draw on that one so the box coordinates
        #    line up with the pixels they were computed from.
        target_image_for_drawing = original_pil
        used_preprocessed = False

        first = raw_result[0] if isinstance(raw_result, list) and len(raw_result) > 0 else None
        if isinstance(first, dict) and 'doc_preprocessor_res' in first:
            proc_res = first['doc_preprocessor_res']
            if 'output_img' in proc_res:
                print("Phát hiện ảnh đã qua xử lý hình học. Đang đồng bộ tọa độ...")
                target_image_for_drawing = Image.fromarray(proc_res['output_img'])
                used_preprocessed = True

        # 3. Draw on the selected image.
        annotated_image = universal_draw(target_image_for_drawing, raw_result, FONT_PATH)

        # 4. Text output.
        all_texts = deep_extract_text(raw_result)
        final_texts = clean_text_result(all_texts)
        text_output = "\n".join(final_texts) if final_texts else "Không tìm thấy văn bản."

        # Debug info. The source label comes from an explicit flag:
        # comparing the two images with != tests pixel equality, which
        # reports "Original" when the preprocessed image has identical
        # content and costs a full-image comparison.
        debug_str = str(raw_result)[:1000]
        source_label = 'Preprocessed' if used_preprocessed else 'Original'
        debug_info = f"Used Image Source: {source_label}\nData Preview:\n{debug_str}..."

        return annotated_image, text_output, debug_info

    except Exception as e:
        import traceback
        return image, f"Lỗi: {str(e)}", traceback.format_exc()
 
 
 
 
 
195
 
196
+ # --- GIAO DIỆN ---
197
+ with gr.Blocks(title="PaddleOCR Perfect Overlay") as iface:
198
+ gr.Markdown("## PaddleOCR Chinese - High Precision Overlay")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
+ with gr.Row():
201
+ with gr.Column():
202
+ input_img = gr.Image(type="pil", label="Input Image")
203
+ submit_btn = gr.Button("RUN OCR", variant="primary")
204
+
205
+ with gr.Column():
206
+ with gr.Tabs():
207
+ with gr.TabItem("🖼️ Kết quả Khớp Tọa Độ"):
208
+ output_img = gr.Image(type="pil", label="Overlay Result")
209
+ with gr.TabItem("📝 Văn bản"):
210
+ output_txt = gr.Textbox(label="Text Content", lines=15)
211
+ with gr.TabItem("🐞 Debug"):
212
+ output_debug = gr.Textbox(label="Debug Info", lines=15)
213
+
214
+ submit_btn.click(
215
+ fn=predict,
216
+ inputs=input_img,
217
+ outputs=[output_img, output_txt, output_debug]
218
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  if __name__ == "__main__":
221
+ iface.launch(server_name="0.0.0.0", server_port=7860)