ranbac committed on
Commit
0ea77f8
·
verified ·
1 Parent(s): 7b9a396

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -171
app.py CHANGED
@@ -1,34 +1,35 @@
1
  import os
2
-
3
- # --- CẤU HÌNH HỆ THỐNG ---
4
- os.environ["FLAGS_use_mkldnn"] = "0"
5
- os.environ["FLAGS_enable_mkldnn"] = "0"
6
- os.environ["DN_ENABLE_MKLDNN"] = "0"
7
- os.environ["CPP_MIN_LOG_LEVEL"] = "3"
8
-
9
  import logging
10
  import re
11
- import gradio as gr
12
- from paddleocr import PaddleOCR
13
- from PIL import Image, ImageDraw, ImageFont
14
  import numpy as np
15
  import requests
 
 
 
16
 
17
- # Tắt log thừa
 
 
 
18
  logging.getLogger("ppocr").setLevel(logging.WARNING)
19
 
20
- print("Đang khởi tạo PaddleOCR (Coordinate Sync Mode)...")
21
-
22
  try:
23
- ocr = PaddleOCR(use_textline_orientation=True, use_doc_orientation_classify=False,
24
- use_doc_unwarping=False, lang='ch')
 
 
 
25
  except Exception as e:
26
- print(f"Lỗi khởi tạo: {e}. Chuyển về chế độ mặc định.")
27
  ocr = PaddleOCR(lang='ch')
 
28
 
29
- print("Model đã sẵn sàng!")
30
-
31
- # --- TẢI FONT ---
32
  def check_and_download_font():
33
  font_path = "./simfang.ttf"
34
  if not os.path.exists(font_path):
@@ -43,15 +44,19 @@ def check_and_download_font():
43
 
44
  FONT_PATH = check_and_download_font()
45
 
46
- # --- HÀM VẼ ĐA NĂNG ---
 
 
 
 
 
 
 
 
47
  def universal_draw(image, raw_data, font_path):
 
48
  if image is None: return image
49
-
50
- # Đảm bảo image là PIL
51
- if isinstance(image, np.ndarray):
52
- image = Image.fromarray(image)
53
-
54
- # Copy để vẽ
55
  canvas = image.copy()
56
  draw = ImageDraw.Draw(canvas)
57
 
@@ -61,161 +66,197 @@ def universal_draw(image, raw_data, font_path):
61
  except:
62
  font = ImageFont.load_default()
63
 
64
- # Hàm parse box
65
- def parse_box(b):
66
- try:
67
- if hasattr(b, 'tolist'): b = b.tolist()
68
- if len(b) > 0 and isinstance(b[0], list): return [tuple(p) for p in b]
69
- if len(b) == 4 and isinstance(b[0], (int, float)):
70
- return [(b[0], b[1]), (b[2], b[1]), (b[2], b[3]), (b[0], b[3])]
71
- return None
72
- except: return None
73
-
74
- items_to_draw = []
75
-
76
- # Logic tìm box/text
77
- # Ưu tiên cấu trúc PaddleX: rec_texts + dt_polys
78
- processed = False
79
- if isinstance(raw_data, list) and len(raw_data) > 0 and isinstance(raw_data[0], dict):
80
- data_dict = raw_data[0]
81
- texts = data_dict.get('rec_texts')
82
- boxes = data_dict.get('dt_polys', data_dict.get('rec_polys', data_dict.get('dt_boxes')))
83
-
84
- if texts and boxes and isinstance(texts, list) and isinstance(boxes, list):
85
- for i in range(min(len(texts), len(boxes))):
86
- txt = texts[i]
87
- box = parse_box(boxes[i])
88
- if box and txt: items_to_draw.append((box, txt))
89
- processed = True
90
-
91
- # Fallback Logic
92
- if not processed:
93
- def hunt(data):
94
- if isinstance(data, dict):
95
- box = None; text = None
96
- for k in ['points', 'box', 'dt_boxes', 'poly']:
97
- if k in data: box = parse_box(data[k]); break
98
- for k in ['transcription', 'text', 'rec_text', 'label']:
99
- if k in data: text = data[k]; break
100
- if box and text: items_to_draw.append((box, text)); return
101
- for v in data.values(): hunt(v)
102
- elif isinstance(data, (list, tuple)):
103
- if len(data) == 2 and isinstance(data[0], list) and len(data[0]) == 4:
104
- box = parse_box(data[0])
105
- txt_obj = data[1]
106
- text = txt_obj[0] if isinstance(txt_obj, (list, tuple)) else txt_obj
107
- if box and isinstance(text, str): items_to_draw.append((box, text)); return
108
- for item in data: hunt(item)
109
- hunt(raw_data)
110
-
111
- # Vẽ
112
- for box, txt in items_to_draw:
113
- try:
114
- # Vẽ khung đỏ
115
- draw.polygon(box, outline="red", width=3)
116
- # Vẽ chữ
117
- txt_x, txt_y = box[0]
118
- if hasattr(draw, "textbbox"):
119
- text_bbox = draw.textbbox((txt_x, txt_y), txt, font=font, anchor="lb")
120
- draw.rectangle(text_bbox, fill="red")
121
- draw.text((txt_x, txt_y), txt, fill="white", font=font, anchor="lb")
122
- else:
123
- draw.text((txt_x, txt_y - font_size), txt, fill="white", font=font)
124
- except: continue
125
 
 
 
 
 
 
 
 
 
 
 
126
  return canvas
127
 
128
- # --- HÀM XỬ LÝ TEXT ---
129
- def deep_extract_text(data):
130
- found_texts = []
131
- if isinstance(data, str):
132
- if len(data.strip()) > 0: return [data]
133
- return []
134
- if isinstance(data, (list, tuple)):
135
- for item in data: found_texts.extend(deep_extract_text(item))
136
- elif isinstance(data, dict):
137
- for val in data.values(): found_texts.extend(deep_extract_text(val))
138
- elif hasattr(data, '__dict__'): found_texts.extend(deep_extract_text(data.__dict__))
139
- return found_texts
140
-
141
- def clean_text_result(text_list):
142
- cleaned = []
143
- block_list = ['min', 'max', 'general', 'header', 'footer', 'structure']
144
- for t in text_list:
145
- t = t.strip()
146
- if len(t) < 2 and not any(u'\u4e00' <= c <= u'\u9fff' for c in t): continue
147
- if t.lower().endswith(('.ttf', '.json', '.pdparams', '.yml', '.log')): continue
148
- if t.lower() in block_list: continue
149
- if not re.search(r'[\w\u4e00-\u9fff]', t): continue
150
- cleaned.append(t)
151
- return cleaned
152
-
153
- # --- MAIN PREDICT ---
154
- def predict(image):
155
- if image is None: return None, "Chưa có ảnh.", "No Data"
156
 
157
  try:
158
- # Chuẩn bị ảnh đầu vào
159
- original_pil = image.copy() if isinstance(image, Image.Image) else Image.fromarray(image).copy()
160
- image_np = np.array(image)
161
-
162
- # 1. OCR
163
- raw_result = ocr.ocr(image_np)
164
-
165
- # 2. XỬ LÝ ẢNH ĐỂ VẼ (KEY FIX: Lấy ảnh từ Preprocessor nếu có)
166
- target_image_for_drawing = original_pil
 
 
 
 
 
 
 
 
 
 
 
167
 
168
- # Kiểm tra xem Paddle có chỉnh sửa ảnh không (dựa vào key 'doc_preprocessor_res')
169
- if isinstance(raw_result, list) and len(raw_result) > 0 and isinstance(raw_result[0], dict):
170
- if 'doc_preprocessor_res' in raw_result[0]:
171
- proc_res = raw_result[0]['doc_preprocessor_res']
172
- # Nếu ảnh đầu ra đã chỉnh sửa (output_img)
173
- if 'output_img' in proc_res:
174
- print("Phát hiện ảnh đã qua xử lý hình học. Đang đồng bộ tọa độ...")
175
- numpy_img = proc_res['output_img']
176
- target_image_for_drawing = Image.fromarray(numpy_img)
177
-
178
- # 3. Vẽ lên ảnh ĐÚNG (Target Image)
179
- annotated_image = universal_draw(target_image_for_drawing, raw_result, FONT_PATH)
180
-
181
- # 4. Xử lý Text
182
- all_texts = deep_extract_text(raw_result)
183
- final_texts = clean_text_result(all_texts)
184
- text_output = "\n".join(final_texts) if final_texts else "Không tìm thấy văn bản."
185
-
186
- # Debug Info
187
- debug_str = str(raw_result)[:1000]
188
- debug_info = f"Used Image Source: {'Preprocessed' if target_image_for_drawing != original_pil else 'Original'}\nData Preview:\n{debug_str}..."
189
-
190
- return annotated_image, text_output, debug_info
191
 
192
  except Exception as e:
193
  import traceback
194
- return image, f"Lỗi: {str(e)}", traceback.format_exc()
 
 
 
 
 
195
 
196
- # --- GIAO DIỆN ---
197
- with gr.Blocks(title="PaddleOCR Perfect Overlay") as iface:
198
- gr.Markdown("## PaddleOCR Chinese - High Precision Overlay")
 
 
 
 
 
199
 
200
- with gr.Row():
201
- with gr.Column():
202
- input_img = gr.Image(type="pil", label="Input Image")
203
- submit_btn = gr.Button("RUN OCR", variant="primary")
204
-
205
- with gr.Column():
206
- with gr.Tabs():
207
- with gr.TabItem("🖼️ Kết quả Khớp Tọa Độ"):
208
- output_img = gr.Image(type="pil", label="Overlay Result")
209
- with gr.TabItem("📝 Văn bản"):
210
- output_txt = gr.Textbox(label="Text Content", lines=15)
211
- with gr.TabItem("🐞 Debug"):
212
- output_debug = gr.Textbox(label="Debug Info", lines=15)
213
-
214
- submit_btn.click(
215
- fn=predict,
216
- inputs=input_img,
217
- outputs=[output_img, output_txt, output_debug]
218
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  if __name__ == "__main__":
221
- iface.launch(server_name="0.0.0.0", server_port=7860)
 
1
import os

# --- PART 1: CONFIGURATION & PADDLEOCR INITIALISATION (LOCAL ENGINE) ---
# These flags must be in the environment BEFORE paddle/paddleocr is imported;
# the previous revision of this file set them ahead of the imports, and
# setting them afterwards is presumably too late for the runtime to pick up
# (TODO confirm against the Paddle flags documentation).
os.environ["FLAGS_use_mkldnn"] = "0"
os.environ["FLAGS_enable_mkldnn"] = "0"
os.environ["CPP_MIN_LOG_LEVEL"] = "3"

import base64  # noqa: E402
import io  # noqa: E402
import json  # noqa: E402
import logging  # noqa: E402
import re  # noqa: E402

import cv2  # noqa: E402
import numpy as np  # noqa: E402
import requests  # noqa: E402
from PIL import Image, ImageDraw, ImageFont  # noqa: E402
import gradio as gr  # noqa: E402
from paddleocr import PaddleOCR  # noqa: E402

# Silence the verbose "ppocr" logger.
logging.getLogger("ppocr").setLevel(logging.WARNING)

print("🚀 Đang khởi tạo PaddleOCR Local...")
try:
    # Preferred configuration: text-line orientation on, document-level
    # orientation classification and unwarping off.
    ocr = PaddleOCR(use_textline_orientation=True,
                    use_doc_orientation_classify=False,
                    use_doc_unwarping=False,
                    lang='ch')  # can be switched to 'en' or 'vi'
except Exception as e:
    # Best-effort fallback: retry with only the language argument when the
    # extended keyword set is rejected by the installed PaddleOCR.
    print(f"⚠️ Lỗi khởi tạo nâng cao: {e}. Dùng chế độ mặc định.")
    ocr = PaddleOCR(lang='ch')
print("✅ Model đã sẵn sàng!")
31
 
32
+ # Tải Font để vẽ chữ (Từ Phần 1)
 
 
33
  def check_and_download_font():
34
  font_path = "./simfang.ttf"
35
  if not os.path.exists(font_path):
 
44
 
45
  FONT_PATH = check_and_download_font()
46
 
47
+ # --- HELPER FUNCTIONS (HỖ TRỢ XỬ LÝ ẢNH & TEXT) ---
48
+
49
def pil_to_base64_html(image):
    """Encode a PIL image as an inline base64 JPEG ``<img>`` tag.

    Args:
        image: A PIL ``Image`` (anything exposing ``mode``, ``convert`` and
            ``save`` in the PIL way).

    Returns:
        An HTML ``<img>`` string whose ``src`` is a ``data:image/jpeg`` URI.
    """
    # JPEG has no alpha/palette support; saving e.g. an RGBA or P image
    # raises OSError, so normalise those modes to RGB first.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f'<img src="data:image/jpeg;base64,{img_str}" alt="Result" style="width:100%; object-fit:contain;">'
55
+
56
  def universal_draw(image, raw_data, font_path):
57
+ """Hàm vẽ box lên ảnh (Từ Phần 1)"""
58
  if image is None: return image
59
+ if isinstance(image, np.ndarray): image = Image.fromarray(image)
 
 
 
 
 
60
  canvas = image.copy()
61
  draw = ImageDraw.Draw(canvas)
62
 
 
66
  except:
67
  font = ImageFont.load_default()
68
 
69
+ boxes = [line[0] for line in raw_data[0]] if raw_data and raw_data[0] else []
70
+ txts = [line[1][0] for line in raw_data[0]] if raw_data and raw_data[0] else []
71
+ scores = [line[1][1] for line in raw_data[0]] if raw_data and raw_data[0] else []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
+ for box, txt in zip(boxes, txts):
74
+ box = [tuple(p) for p in box]
75
+ draw.polygon(box, outline="red", width=3)
76
+ # Vẽ nền chữ
77
+ if hasattr(draw, "textbbox"):
78
+ text_bbox = draw.textbbox(box[0], txt, font=font, anchor="lb")
79
+ draw.rectangle(text_bbox, fill="red")
80
+ draw.text(box[0], txt, fill="white", font=font, anchor="lb")
81
+ else:
82
+ draw.text((box[0][0], box[0][1] - font_size), txt, fill="white", font=font)
83
  return canvas
84
 
85
+ # --- HÀM XỬ LÝ CHÍNH (LOGIC CẦU NỐI) ---
86
+ # Hàm này nhận input từ UI Phần 2, chạy Logic Phần 1, trả về format UI Phần 2
87
+
88
def _to_builtin(value):
    """Recursively turn numpy arrays/scalars and tuples into plain lists/numbers."""
    if hasattr(value, "tolist"):
        # numpy arrays and numpy scalars both expose tolist().
        return value.tolist()
    if isinstance(value, (list, tuple)):
        return [_to_builtin(item) for item in value]
    return value


def _normalize_ocr_lines(result):
    """Flatten an OCR result into ``[[box, (text, score)], ...]`` for the first image.

    Two result shapes are accepted:

    * legacy: ``result[0]`` is a list of ``[box, (text, score)]`` entries;
    * dict-style: ``result[0]`` is a dict carrying ``rec_texts`` together with
      ``rec_polys``/``dt_polys`` and optionally ``rec_scores`` (the shape the
      previous revision of this file parsed).

    All values are converted to JSON-serialisable builtins. An empty list is
    returned when nothing was recognised.
    """
    if not result or result[0] is None:
        return []
    page = result[0]

    if isinstance(page, dict):
        texts = page.get("rec_texts")
        if texts is None:
            texts = []
        # Explicit ``is None`` checks: these values may be numpy arrays, whose
        # truthiness is ambiguous.
        # NOTE(review): rec_polys is tried first on the assumption that it is
        # index-aligned with rec_texts — confirm against the installed version.
        polys = page.get("rec_polys")
        if polys is None:
            polys = page.get("dt_polys")
        if polys is None:
            polys = []
        scores = page.get("rec_scores")
        if scores is None:
            scores = []

        lines = []
        for idx, (poly, text) in enumerate(zip(polys, texts)):
            score = float(scores[idx]) if idx < len(scores) else None
            lines.append([_to_builtin(poly), (str(text), score)])
        return lines

    return [
        [_to_builtin(line[0]), (line[1][0], float(line[1][1]))]
        for line in page
    ]


def _run_ocr(img_np):
    """Run the module-level ``ocr`` engine, with or without the ``cls`` keyword."""
    try:
        return ocr.ocr(img_np, cls=True)
    except TypeError:
        # NOTE(review): newer PaddleOCR releases appear to reject ``cls``
        # (the previous revision of this file called ocr.ocr without it).
        return ocr.ocr(img_np)


def _load_rgb(image_path):
    """Open *image_path* and return an RGB copy, closing the file handle."""
    with Image.open(image_path) as source:
        return source.convert("RGB")


# --- MAIN PROCESSING FUNCTION (bridge between the UI and the OCR engine) ---

def local_inference(image_path, mode="Document"):
    """Run local OCR on an image file and format the result for the UI.

    Args:
        image_path: Path of the uploaded image; falsy means "nothing uploaded".
        mode: ``"Formula"``, ``"Table"`` or anything else (treated as a
            generic document). Only changes the Markdown heading/joining.

    Returns:
        A ``(markdown, html_visualisation, raw_json)`` tuple of strings. On
        failure the tuple is ``(error message, red HTML paragraph, traceback)``.
    """
    if not image_path:
        return "Please upload an image.", "", ""

    try:
        # 1. Read the image.
        img = _load_rgb(image_path)

        # 2. Run the local PaddleOCR engine and normalise its output.
        lines = _normalize_ocr_lines(_run_ocr(np.array(img)))

        # 3. Nothing recognised.
        if not lines:
            return "No text found.", "<p>No text detected</p>", "[]"

        # Visualisation: universal_draw expects the legacy nested layout.
        annotated_img = universal_draw(img, [lines], FONT_PATH)
        html_vis = pil_to_base64_html(annotated_img)

        # Markdown output built from the recognised strings.
        texts = [line[1][0] for line in lines]

        if mode == "Formula":
            md_text = "### Recognized Formula (Raw Text):\n\n" + " ".join(texts)
            md_text += "\n\n*(Note: Local generic OCR model cannot convert to LaTeX math syntax)*"
        elif mode == "Table":
            md_text = "### Recognized Table Content:\n\n" + "\n".join(texts)
            md_text += "\n\n*(Note: Local generic OCR model does not reconstruct HTML structure)*"
        else:  # Document / generic
            md_text = "### Document Content:\n\n" + "\n".join(texts)

        # Raw data as a JSON string for debugging.
        raw_json = json.dumps(lines, ensure_ascii=False, indent=2)

        return md_text, html_vis, raw_json

    except Exception as e:
        import html
        import traceback
        err = traceback.format_exc()
        # Escape the message: it is interpolated into HTML shown in the UI.
        return f"Error: {str(e)}", f"<p style='color:red'>{html.escape(str(e))}</p>", err


# Thin wrappers, one per UI tab.
def run_doc_parsing(file, *args):
    """Handle the "Document Parsing" tab: plain document OCR on *file*."""
    return local_inference(file, mode="Document")


def run_element_recognition(file, prompt_label, *args):
    """Handle the element-level buttons.

    ``prompt_label`` is a label such as ``"Formula Recognition"`` or
    ``"Table"``; its first word selects the mode. A blank or missing label
    falls back to ``"Document"`` instead of raising.
    """
    words = (prompt_label or "").split()
    mode = words[0] if words else "Document"
    return local_inference(file, mode=mode)


def run_spotting(file, *args):
    """Emulated spotting: return text bounding boxes as JSON plus a preview.

    Returns:
        A ``(html_visualisation, json_string)`` tuple. On failure the tuple is
        ``(red HTML paragraph, traceback)``, mirroring ``local_inference``.
    """
    if not file:
        return "", "{}"

    try:
        img = _load_rgb(file)
        lines = _normalize_ocr_lines(_run_ocr(np.array(img)))

        if not lines:
            return "<p>No objects found</p>", "[]"

        annotated_img = universal_draw(img, [lines], FONT_PATH)
        html_vis = pil_to_base64_html(annotated_img)

        # Reshape each OCR line into a spotting-style record.
        spotting_res = [
            {
                "label": "text_block",
                "text": text,
                "confidence": score,
                "box": box,
            }
            for box, (text, score) in lines
        ]

        return html_vis, json.dumps(spotting_res, ensure_ascii=False, indent=2)

    except Exception as e:
        import html
        import traceback
        err = traceback.format_exc()
        return f"<p style='color:red'>{html.escape(str(e))}</p>", err
166
+
167
+
168
+ # --- PHẦN 2: GIAO DIỆN (UI TỪ FILE 2) ---
169
# Extra CSS injected into the Gradio page (fonts, header, button grid, notice box).
custom_css = """
body, .gradio-container { font-family: "Noto Sans SC", sans-serif; }
.app-header { text-align: center; margin-bottom: 20px; }
.prompt-grid { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 6px; }
.prompt-grid button { height: 40px !important; }
.notice { background: #f0f9ff; padding: 10px; border-radius: 8px; border: 1px solid #bae6fd; font-size: 14px; margin-bottom: 10px;}
"""

# Three-tab Gradio app; every tab takes an uploaded image file path and
# forwards it to one of the run_* handlers.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div class="app-header">
    <h1>PaddleOCR Local - Pro Interface</h1>
    <p>Giao diện nâng cao chạy trên Backend Local (CPU)</p>
    </div>
    <div class="notice">
    <strong>Lưu ý:</strong> Đây là phiên bản chạy model Local.
    Các tính năng như <em>Formula to Latex</em>, <em>Table to HTML</em> hay <em>Layout Analysis</em>
    chỉ trả về văn bản thô (Raw OCR) do giới hạn của model cài đặt cục bộ.
    </div>
    """)

    with gr.Tabs():
        # ===================== Tab 1: Document Parsing =====================
        with gr.Tab("Document Parsing"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_doc = gr.File(label="Upload Image", type="filepath", file_types=["image"])
                    btn_parse = gr.Button("Parse Document", variant="primary")
                    # Placeholder checkboxes: non-interactive and labelled N/A,
                    # since the local model exposes no such options here.
                    with gr.Row():
                        gr.Checkbox(label="Chart parsing (N/A)", value=False, interactive=False)
                        gr.Checkbox(label="Doc unwarping (N/A)", value=False, interactive=False)

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("Markdown Preview"):
                            md_preview_doc = gr.Markdown()
                        with gr.Tab("Visualization"):
                            vis_image_doc = gr.HTML()
                        with gr.Tab("Raw Data"):
                            raw_doc = gr.Code(language="json")

            # Handler returns (markdown, html, raw_json) in this output order.
            btn_parse.click(run_doc_parsing, inputs=[file_doc], outputs=[md_preview_doc, vis_image_doc, raw_doc])

        # ===================== Tab 2: Element-level Recognition =====================
        with gr.Tab("Element-level Recognition"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_vl = gr.File(label="Upload Image", type="filepath", file_types=["image"])
                    gr.Markdown("_(Chế độ này tối ưu cho từng thành phần riêng lẻ)_")

                    with gr.Row(elem_classes=["prompt-grid"]):
                        btn_ocr = gr.Button("Text Recognition", variant="secondary")
                        btn_formula = gr.Button("Formula Recognition", variant="secondary")
                    with gr.Row(elem_classes=["prompt-grid"]):
                        btn_table = gr.Button("Table Recognition", variant="secondary")
                        btn_seal = gr.Button("Seal Recognition", variant="secondary")

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("Result"):
                            md_preview_vl = gr.Markdown()
                        with gr.Tab("Visualization"):
                            vis_image_vl = gr.HTML()
                        with gr.Tab("Raw Output"):
                            md_raw_vl = gr.Code(language="json")

            # Wire every button to the same handler; the per-button label is
            # passed as a constant gr.State created on each iteration.
            for btn, label in [(btn_ocr, "Text"), (btn_formula, "Formula"), (btn_table, "Table"), (btn_seal, "Seal")]:
                btn.click(
                    fn=run_element_recognition,
                    inputs=[file_vl, gr.State(label)],
                    outputs=[md_preview_vl, vis_image_vl, md_raw_vl]
                )

        # ===================== Tab 3: Spotting =====================
        with gr.Tab("Spotting"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_spot = gr.File(label="Upload Image", type="filepath", file_types=["image"])
                    btn_run_spot = gr.Button("Run Spotting", variant="primary")
                    gr.Markdown("_(Phát hiện vị trí văn bản)_")

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("Visualization"):
                            vis_image_spot = gr.HTML()
                        with gr.Tab("JSON Result"):
                            json_spot = gr.Code(language="json")

            # Handler returns (html, json_string) in this output order.
            btn_run_spot.click(run_spotting, inputs=[file_spot], outputs=[vis_image_spot, json_spot])

# Start the queued app on all interfaces, port 7860, when run as a script.
if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)