PDF-to-Word

Runtime error

App Files Files Community

VietKien committed on Feb 4

Commit

4687093

verified ·

1 Parent(s): 60d6bcc

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -203

app.py CHANGED Viewed

@@ -6,305 +6,196 @@ import cv2
 import numpy as np
 from PIL import Image
-# Thêm dòng này lên đầu file hoặc ngay dưới các lệnh import để tắt check mạng
 os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
 try:
     from paddleocr import PaddleOCR
-    # Sửa: Bỏ 'show_log', đổi 'use_angle_cls' thành 'use_textline_orientation'
-    # Nếu vẫn báo lỗi tham số, chỉ cần dùng: PaddleOCR(lang='vi')
     ocr_engine = PaddleOCR(use_textline_orientation=True, lang='vi')
     HAS_OCR = True
 except ImportError:
     HAS_OCR = False
-    print("⚠️ CHƯA CÀI PADDLEOCR: Vui lòng chạy 'pip install paddlepaddle paddleocr'")
 except Exception as e:
     HAS_OCR = False
-    print(f"⚠️ Lỗi khởi tạo OCR: {e}")
-# --- 1. CẤU HÌNH NGÔN NGỮ (TỪ ĐIỂN) ---
 TRANS = {
     "vi": {
         "lang_btn": "🇬🇧 English",
-        "app_title": "DOCUMENTS TOOLKIT PRO",
-        # Sidebar items
         "menu_label": "🛠️ Chọn Chức năng",
         "menu_pdf": "📝 Chuyển PDF sang Word",
-        "menu_ocr": "👁️ OCR (Trích xuất chữ từ ảnh)",
         "info_header": "ℹ️ Thông tin",
-        # PDF Tool
         "pdf_title": "### 📄 CHUYỂN ĐỔI PDF SANG WORD",
-        "pdf_input_label": "Tải file PDF (Chọn nhiều file)",
         "pdf_btn": "🚀 CHUYỂN ĐỔI NGAY",
-        # OCR Tool
         "ocr_title": "### 👁️ TRÍCH XUẤT VĂN BẢN (OCR)",
-        "ocr_input_label": "Tải hình ảnh (PNG, JPG)",
-        "ocr_btn": "🔍 QUÉT VÀ TRÍCH XUẤT",
-        "ocr_output_text": "Nội dung văn bản",
-        "ocr_output_img": "Ảnh đã khoanh vùng chữ",
-        # Messages
-        "msg_warning": "⚠️ Vui lòng tải file lên!",
-        "msg_processing": "Đang xử lý...",
         "msg_success": "✅ Hoàn tất!",
         "msg_error": "❌ Lỗi: ",
-        "ocr_not_installed": "⚠️ Lỗi: Chưa cài thư viện PaddleOCR.",
-        # Info Content
-        "info_content": """
-            **Phát triển bởi:** Chu Viết Kiên
-            **Liên hệ:** kiencv.3107@gmail.com
-            **Phiên bản:** 2.0 (OCR Update)
-        """
     },
     "en": {
         "lang_btn": "🇻🇳 Tiếng Việt",
-        "app_title": "DOCUMENTS TOOLKIT PRO",
-        # Sidebar items
         "menu_label": "🛠️ Select Tool",
         "menu_pdf": "📝 PDF to Word Converter",
         "menu_ocr": "👁️ OCR (Image to Text)",
         "info_header": "ℹ️ Information",
-        # PDF Tool
         "pdf_title": "### 📄 PDF TO WORD CONVERTER",
-        "pdf_input_label": "Upload PDF Files",
         "pdf_btn": "🚀 CONVERT NOW",
-        # OCR Tool
-        "ocr_title": "### 👁️ OPTICAL CHARACTER RECOGNITION (OCR)",
-        "ocr_input_label": "Upload Image (PNG, JPG)",
-        "ocr_btn": "🔍 SCAN & EXTRACT",
         "ocr_output_text": "Extracted Text",
-        "ocr_output_img": "Annotated Image",
-        # Messages
-        "msg_warning": "⚠️ Please upload a file!",
-        "msg_processing": "Processing...",
         "msg_success": "✅ Done!",
         "msg_error": "❌ Error: ",
-        "ocr_not_installed": "⚠️ Error: PaddleOCR library not installed.",
-        # Info Content
-        "info_content": """
-            **Developer:** Chu Viet Kien
-            **Contact:** kiencv.3107@gmail.com
-            **Version:** 2.0 (OCR Update)
-        """
     }
 }
-# --- 2. HÀM XỬ LÝ LOGIC ---
-# 2.1 Logic PDF -> Word
 def convert_pdfs_to_word(pdf_files, lang_code, progress=gr.Progress()):
     T = TRANS[lang_code]
     if not pdf_files: return None, T["msg_warning"]
     if not isinstance(pdf_files, list): pdf_files = [pdf_files]
-    converted_files = []
     try:
-        for idx, pdf_file in enumerate(pdf_files):
-            progress((idx / len(pdf_files)), desc=f"Processing {os.path.basename(pdf_file.name)}...")
-            docx_name = os.path.splitext(os.path.basename(pdf_file.name))[0] + ".docx"
-            cv = Converter(pdf_file.name)
-            cv.convert(docx_name, start=0, end=None)
             cv.close()
-            converted_files.append(docx_name)
-        if len(converted_files) == 1:
-            return converted_files[0], T["msg_success"]
         else:
-            zip_name = "Converted_Docs.zip"
             with zipfile.ZipFile(zip_name, 'w') as zf:
-                for f in converted_files: zf.write(f)
             return zip_name, T["msg_success"]
     except Exception as e:
-        return None, f"{T['msg_error']} {str(e)}"
-# 2.2 Logic OCR
 def run_ocr_func(image, lang_code):
     T = TRANS[lang_code]
     if not HAS_OCR: return None, None, T["ocr_not_installed"]
     if image is None: return None, None, T["msg_warning"]
     try:
-        # PaddleOCR nhận đường dẫn file hoặc numpy array
-        # Gradio Image input mặc định trả về numpy array (RGB)
-        # Chạy OCR
         result = ocr_engine.ocr(image, cls=True)
-        # Xử lý kết quả
-        txts = []
-        boxes = []
-        scores = []
-        # Result cấu trúc: [ [ [box], (text, score) ], ... ]
-        # Đôi khi result trả về list lồng nhau nếu có nhiều vùng
-        if result and result[0]:
-            for line in result[0]:
-                boxes.append(line[0])
-                txts.append(line[1][0])
-                scores.append(line[1][1])
-        # Tạo văn bản kết quả
         full_text = "\n".join(txts)
-        # Vẽ box lên ảnh để hiển thị (Dùng OpenCV)
-        # Convert RGB (Gradio) -> BGR (OpenCV)
         img_cv = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
         for box in boxes:
-            box = np.array(box).astype(np.int32).reshape((-1, 1, 2))
-            cv2.polylines(img_cv, [box], True, (0, 0, 255), 2)
-        # Convert ngược lại BGR -> RGB để hiển thị trên Gradio
         final_img = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
-        # Lưu file text để tải về (tuỳ chọn)
-        with open("ocr_result.txt", "w", encoding="utf-8") as f:
             f.write(full_text)
-        return full_text, final_img, "ocr_result.txt"
     except Exception as e:
-        return str(e), None, None
-# --- 3. UI HELPER FUNCTIONS ---
 def change_ui_language(lang):
-    new_lang = "en" if lang == "vi" else "vi"
-    T = TRANS[new_lang]
     return (
-        new_lang, # Update State
-        T["lang_btn"],
-        gr.update(label=T["menu_label"], choices=[T["menu_pdf"], T["menu_ocr"]], value=T["menu_pdf"]), # Radio
-        T["info_header"], # Accordion Label
-        T["info_content"], # Info Text
-        # PDF UI
-        T["pdf_title"],
-        gr.update(label=T["pdf_input_label"]),
-        T["pdf_btn"],
-        # OCR UI
-        T["ocr_title"],
-        gr.update(label=T["ocr_input_label"]),
-        T["ocr_btn"],
-        gr.update(label=T["ocr_output_text"]),
-        gr.update(label=T["ocr_output_img"])
     )
-def toggle_tool(menu_choice, lang_code):
-    # Hàm này quyết định ẩn hiện Group nào dựa trên Radio Button
-    T = TRANS[lang_code]
-    if menu_choice == T["menu_pdf"] or menu_choice == TRANS["en"]["menu_pdf"] or menu_choice == TRANS["vi"]["menu_pdf"]:
-        return gr.update(visible=True), gr.update(visible=False)
-    else:
-        return gr.update(visible=False), gr.update(visible=True)
-# --- 4. GIAO DIỆN CHÍNH ---
 custom_css = """
-.gradio-container {background-color: #f9f9f9}
-#btn_convert {background: linear-gradient(90deg, #4b6cb7, #182848); color: white;}
-#btn_ocr {background: linear-gradient(90deg, #d53369, #daae51); color: white;}
-footer {visibility: hidden}
 """
-with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Toolkit Pro") as demo:
-    lang_state = gr.State(value="vi")
-    # --- SIDEBAR (THANH BÊN) ---
     with gr.Sidebar():
-        gr.Markdown("## 🛠️ CONTROL PANEL")
-        # 1. Nút đổi ngôn ngữ
         btn_lang = gr.Button(TRANS["vi"]["lang_btn"])
-        # 2. Menu chọn công cụ
         radio_menu = gr.Radio(
             choices=[TRANS["vi"]["menu_pdf"], TRANS["vi"]["menu_ocr"]],
             value=TRANS["vi"]["menu_pdf"],
-            label=TRANS["vi"]["menu_label"],
-            interactive=True
         )
-        gr.HTML("<hr>")
-        # 3. Thông tin tác giả
         with gr.Accordion(TRANS["vi"]["info_header"], open=True) as info_acc:
             info_md = gr.Markdown(TRANS["vi"]["info_content"])
-    # --- MAIN CONTENT AREA ---
-    # === GROUP 1: PDF TO WORD ===
     with gr.Group(visible=True) as group_pdf:
         header_pdf = gr.Markdown(TRANS["vi"]["pdf_title"])
         with gr.Row():
-            with gr.Column():
-                input_pdf = gr.File(label=TRANS["vi"]["pdf_input_label"], file_types=[".pdf"], file_count="multiple", height=200)
-            with gr.Column():
-                output_docx = gr.File(label="Download Result", height=100)
-                status_pdf = gr.Textbox(label="Status", interactive=False)
-        btn_run_pdf = gr.Button(TRANS["vi"]["pdf_btn"], elem_id="btn_convert")
-    # === GROUP 2: OCR ===
     with gr.Group(visible=False) as group_ocr:
         header_ocr = gr.Markdown(TRANS["vi"]["ocr_title"])
         with gr.Row():
-            # Cột trái: Input
-            with gr.Column(scale=1):
-                input_img = gr.Image(label=TRANS["vi"]["ocr_input_label"], type="numpy", height=300)
-                btn_run_ocr = gr.Button(TRANS["vi"]["ocr_btn"], elem_id="btn_ocr")
-            # Cột phải: Output
-            with gr.Column(scale=1):
-                output_text = gr.Textbox(label=TRANS["vi"]["ocr_output_text"], lines=10, show_copy_button=True)
-                output_txt_file = gr.File(label="Download .txt")
-        # Hiển thị ảnh kết quả bên dưới
-        output_img_viz = gr.Image(label=TRANS["vi"]["ocr_output_img"], interactive=False)
-    # --- EVENT HANDLERS ---
-    # 1. Logic chạy PDF
-    btn_run_pdf.click(
-        fn=convert_pdfs_to_word,
-        inputs=[input_pdf, lang_state],
-        outputs=[output_docx, status_pdf]
-    )
-    # 2. Logic chạy OCR
-    btn_run_ocr.click(
-        fn=run_ocr_func,
-        inputs=[input_img, lang_state],
-        outputs=[output_text, output_img_viz, output_txt_file]
-    )
-    # 3. Logic chuyển đổi Tab (Ẩn hiện Group)
-    radio_menu.change(
-        fn=toggle_tool,
-        inputs=[radio_menu, lang_state],
-        outputs=[group_pdf, group_ocr]
-    )
-    # 4. Logic đổi ngôn ngữ (Cập nhật toàn bộ Label)
-    btn_lang.click(
-        fn=change_ui_language,
-        inputs=[lang_state],
-        outputs=[
-            lang_state, btn_lang, radio_menu, info_acc, info_md, # Sidebar updates
-            header_pdf, input_pdf, btn_run_pdf, # PDF updates
-            header_ocr, input_img, btn_run_ocr, output_text, output_img_viz # OCR updates
-        ]
-    )
 if __name__ == "__main__":
-    demo.launch()

 import numpy as np
 from PIL import Image
+# --- PADDLE ERROR WORKAROUND CONFIG ---
+# Set before paddleocr is imported on the lines below.
 os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
 try:
     from paddleocr import PaddleOCR
+    # Initialize the OCR engine
     ocr_engine = PaddleOCR(use_textline_orientation=True, lang='vi')
     HAS_OCR = True
 except ImportError:
+    # paddleocr is not installed: leave OCR disabled via HAS_OCR.
     HAS_OCR = False
+    print("⚠️ CHƯA CÀI PADDLEOCR")
 except Exception as e:
+    # Any other failure while importing/constructing the engine also disables OCR.
     HAS_OCR = False
+    print(f"⚠️ Lỗi OCR init: {e}")
+# --- 1. LANGUAGE DICTIONARY ---
+# UI strings keyed first by language code ("vi" / "en"), then by message key.
+# Both languages define the same set of keys.
 TRANS = {
     "vi": {
         "lang_btn": "🇬🇧 English",
         "menu_label": "🛠️ Chọn Chức năng",
         "menu_pdf": "📝 Chuyển PDF sang Word",
+        "menu_ocr": "👁️ OCR (Trích xuất chữ)",
         "info_header": "ℹ️ Thông tin",
         "pdf_title": "### 📄 CHUYỂN ĐỔI PDF SANG WORD",
+        "pdf_input_label": "Tải file PDF",
         "pdf_btn": "🚀 CHUYỂN ĐỔI NGAY",
         "ocr_title": "### 👁️ TRÍCH XUẤT VĂN BẢN (OCR)",
+        "ocr_input_label": "Tải hình ảnh",
+        "ocr_btn": "🔍 QUÉT NGAY",
+        "ocr_output_text": "Kết quả văn bản",
+        "ocr_output_img": "Ảnh đã xử lý",
+        "msg_warning": "⚠️ Vui lòng tải file!",
         "msg_success": "✅ Hoàn tất!",
         "msg_error": "❌ Lỗi: ",
+        "ocr_not_installed": "⚠️ Lỗi: Chưa cài thư viện OCR.",
+        "info_content": "**Dev:** Chu Viết Kiên | **Ver:** 2.1 (Stable)"
     },
     "en": {
         "lang_btn": "🇻🇳 Tiếng Việt",
         "menu_label": "🛠️ Select Tool",
         "menu_pdf": "📝 PDF to Word Converter",
         "menu_ocr": "👁️ OCR (Image to Text)",
         "info_header": "ℹ️ Information",
         "pdf_title": "### 📄 PDF TO WORD CONVERTER",
+        "pdf_input_label": "Upload PDF",
         "pdf_btn": "🚀 CONVERT NOW",
+        "ocr_title": "### 👁️ OPTICAL CHARACTER RECOGNITION",
+        "ocr_input_label": "Upload Image",
+        "ocr_btn": "🔍 SCAN NOW",
         "ocr_output_text": "Extracted Text",
+        "ocr_output_img": "Processed Image",
+        "msg_warning": "⚠️ Please upload file!",
         "msg_success": "✅ Done!",
         "msg_error": "❌ Error: ",
+        "ocr_not_installed": "⚠️ Error: OCR lib missing.",
+        "info_content": "**Dev:** Chu Viet Kien | **Ver:** 2.1 (Stable)"
     }
 }
+# --- 2. LOGIC FUNCTIONS ---
 def convert_pdfs_to_word(pdf_files, lang_code, progress=gr.Progress()):
+    """Convert one or more uploaded PDFs to .docx files.
+
+    Args:
+        pdf_files: A single uploaded file or a list of them; each item must
+            expose the source path as ``.name``.
+        lang_code: Key into TRANS ("vi" or "en") selecting the message language.
+        progress: Gradio progress tracker, advanced once per input file.
+
+    Returns:
+        ``(path, message)``. ``path`` is the .docx for a single input, the
+        "Result.zip" archive for several inputs, or None when nothing was
+        uploaded or a conversion failed. Outputs are written relative to the
+        current working directory.
+    """
     T = TRANS[lang_code]
     if not pdf_files: return None, T["msg_warning"]
     if not isinstance(pdf_files, list): pdf_files = [pdf_files]
+    converted = []
     try:
+        for idx, pdf in enumerate(pdf_files):
+            src_name = os.path.basename(pdf.name)
+            progress(idx / len(pdf_files), desc=f"Processing {src_name}...")
+            docx = os.path.splitext(src_name)[0] + ".docx"
+            cv = Converter(pdf.name)
+            try:
+                cv.convert(docx)
+            finally:
+                # Always release the PDF handle, even when conversion raises.
+                cv.close()
+            converted.append(docx)
+        if len(converted) == 1:
+            return converted[0], T["msg_success"]
         else:
+            zip_name = "Result.zip"
             with zipfile.ZipFile(zip_name, 'w') as zf:
+                for f in converted: zf.write(f)
             return zip_name, T["msg_success"]
     except Exception as e:
+        # Surface the failure in the status box instead of crashing the handler.
+        return None, f"{T['msg_error']} {e}"
 def run_ocr_func(image, lang_code):
+    """Run OCR on an image and outline the detected text regions.
+
+    Args:
+        image: RGB image as a numpy array, or None when nothing was uploaded.
+        lang_code: Key into TRANS ("vi" or "en") selecting the message language.
+
+    Returns:
+        ``(text, annotated_image, txt_path)``. On success ``text`` is the
+        recognized lines joined by newlines, ``annotated_image`` is the RGB
+        input with polygons drawn on it and ``txt_path`` is "ocr_text.txt".
+        On any failure ``text`` carries the status/error message and the
+        other two items are None.
+    """
     T = TRANS[lang_code]
+    # Status messages belong in the first (text) slot, like the except path
+    # below; the third slot is a file path and cannot display a message.
+    if not HAS_OCR: return T["ocr_not_installed"], None, None
+    if image is None: return T["msg_warning"], None, None
     try:
+        # NOTE(review): the engine is built with the 3.x-style
+        # use_textline_orientation option; 3.x is expected to expose predict()
+        # and reject cls=. Older releases only offer ocr(..., cls=True).
+        # Confirm against the installed PaddleOCR version.
+        if hasattr(ocr_engine, "predict"):
+            result = ocr_engine.predict(image)
+        else:
+            result = ocr_engine.ocr(image, cls=True)
+        txts, boxes = [], []
+        first = result[0] if result else None
+        if isinstance(first, dict):
+            # NOTE(review): assumes one dict-like result per image holding
+            # parallel "rec_texts" / "rec_polys" lists (PaddleOCR 3.x).
+            for page in result:
+                txts.extend(page.get("rec_texts") or [])
+                polys = page.get("rec_polys")
+                if polys is not None:
+                    boxes.extend(polys)
+        elif result:
+            # Legacy layout: [box, (text, score)] entries, optionally nested
+            # one level deeper per image.
+            data = first if isinstance(first, list) else result
+            for line in data or []:
+                boxes.append(line[0])
+                txts.append(line[1][0])
         full_text = "\n".join(txts)
+        # Draw the boxes (OpenCV works in BGR, so convert there and back)
         img_cv = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
         for box in boxes:
+            pts = np.array(box, np.int32).reshape((-1, 1, 2))
+            cv2.polylines(img_cv, [pts], True, (0, 0, 255), 2)
         final_img = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
+        with open("ocr_text.txt", "w", encoding="utf-8") as f:
             f.write(full_text)
+        return full_text, final_img, "ocr_text.txt"
     except Exception as e:
+        return f"Err: {e}", None, None
+# --- 3. UI HELPERS ---
 def change_ui_language(lang):
+    """Switch the UI to the other language and return refreshed labels.
+
+    Args:
+        lang: Current language code; "vi" switches to "en", anything else
+            switches to "vi".
+
+    Returns:
+        A 13-item tuple: the new language code, the language-button text, a
+        radio-menu update (label, choices, value reset to the PDF tool), an
+        update for the info section title, the info text, then the PDF
+        title / input label / button text and the OCR title / input label /
+        button text / text-output label / image-output label.
+    """
+    new = "en" if lang == "vi" else "vi"
+    T = TRANS[new]
     return (
+        new, T["lang_btn"],
+        gr.update(label=T["menu_label"], choices=[T["menu_pdf"], T["menu_ocr"]], value=T["menu_pdf"]),
+        # The info section title is sent as an update payload (label change)
+        # so it also works when the target is a layout container rather
+        # than a value-holding component.
+        gr.update(label=T["info_header"]), T["info_content"],
+        T["pdf_title"], gr.update(label=T["pdf_input_label"]), T["pdf_btn"],
+        T["ocr_title"], gr.update(label=T["ocr_input_label"]), T["ocr_btn"],
+        gr.update(label=T["ocr_output_text"]), gr.update(label=T["ocr_output_img"])
     )
+def toggle_tool(menu, lang):
+    """Show the PDF group or the OCR group depending on the selected menu entry.
+
+    ``menu`` is compared against the PDF entry label of both languages;
+    ``lang`` is accepted for the event signature but not consulted.
+    Returns ``(pdf_group_update, ocr_group_update)``.
+    """
+    pdf_labels = (TRANS["vi"]["menu_pdf"], TRANS["en"]["menu_pdf"])
+    show_pdf = menu in pdf_labels
+    return gr.update(visible=show_pdf), gr.update(visible=not show_pdf)
+# --- 4. USER INTERFACE ---
+# One shared gradient style for the elements with ids convert_btn and ocr_btn.
 custom_css = """
+#convert_btn, #ocr_btn {color: white; background: linear-gradient(90deg, #2b5876, #4e4376);}
 """
+# Theme moved out of the Blocks constructor to avoid a warning
+with gr.Blocks(title="Toolkit Pro") as demo:
+    # Current UI language code; the interface starts in Vietnamese.
+    lang_state = gr.State("vi")
     with gr.Sidebar():
+        gr.Markdown("## 🛠️ MENU")
         btn_lang = gr.Button(TRANS["vi"]["lang_btn"])
         radio_menu = gr.Radio(
             choices=[TRANS["vi"]["menu_pdf"], TRANS["vi"]["menu_ocr"]],
             value=TRANS["vi"]["menu_pdf"],
+            label=TRANS["vi"]["menu_label"]
         )
         with gr.Accordion(TRANS["vi"]["info_header"], open=True) as info_acc:
             info_md = gr.Markdown(TRANS["vi"]["info_content"])
+    # PDF GROUP (visible by default)
     with gr.Group(visible=True) as group_pdf:
         header_pdf = gr.Markdown(TRANS["vi"]["pdf_title"])
         with gr.Row():
+            in_pdf = gr.File(label=TRANS["vi"]["pdf_input_label"], file_types=[".pdf"], file_count="multiple")
+            out_word = gr.File(label="Result")
+        btn_pdf = gr.Button(TRANS["vi"]["pdf_btn"], elem_id="convert_btn")
+        st_pdf = gr.Textbox(label="Status", interactive=False)
+    # OCR GROUP (hidden until selected in the menu)
     with gr.Group(visible=False) as group_ocr:
         header_ocr = gr.Markdown(TRANS["vi"]["ocr_title"])
         with gr.Row():
+            with gr.Column():
+                in_img = gr.Image(label=TRANS["vi"]["ocr_input_label"], type="numpy", height=300)
+                btn_ocr = gr.Button(TRANS["vi"]["ocr_btn"], elem_id="ocr_btn")
+            with gr.Column():
+                # FIXED: removed show_copy_button=True to avoid a crash
+                out_txt = gr.Textbox(label=TRANS["vi"]["ocr_output_text"], lines=10)
+                out_file = gr.File(label="Download .txt")
+        out_img_viz = gr.Image(label=TRANS["vi"]["ocr_output_img"], interactive=False)
+    # EVENTS
+    btn_pdf.click(convert_pdfs_to_word, [in_pdf, lang_state], [out_word, st_pdf])
+    btn_ocr.click(run_ocr_func, [in_img, lang_state], [out_txt, out_img_viz, out_file])
+    radio_menu.change(toggle_tool, [radio_menu, lang_state], [group_pdf, group_ocr])
+    # The output order here must match the tuple returned by change_ui_language.
+    btn_lang.click(change_ui_language, [lang_state], [
+        lang_state, btn_lang, radio_menu, info_acc, info_md,
+        header_pdf, in_pdf, btn_pdf,
+        header_ocr, in_img, btn_ocr, out_txt, out_img_viz
+    ])
 if __name__ == "__main__":
+    # Theme and CSS moved down here to follow the new convention
+    demo.launch(theme=gr.themes.Soft(), css=custom_css)