Spaces:

123Sabrina
/

pdf_2025

Sleeping

App Files Files Community

123Sabrina committed on Jun 14, 2025

Commit

eac5970

verified ·

1 Parent(s): 38da418

Update app.py

Browse files

Files changed (1) hide show

app.py +156 -18

app.py CHANGED Viewed

@@ -10,17 +10,44 @@ import zipfile
 from io import BytesIO
 import pandas as pd
 from datetime import datetime
 class PDFOCRProcessor:
     def __init__(self):
         self.results = []
         self.processing_log = []
     def log_message(self, message):
         """Record a timestamped log line and hand it back to the caller.

         The line has the form ``[HH:MM:SS] message`` (local wall-clock time)
         and is appended to ``self.processing_log`` before being returned.
         """
         now = datetime.now()
         line = f"[{now.strftime('%H:%M:%S')}] {message}"
         self.processing_log.append(line)
         return line
     def download_pdf(self, url, filename):
@@ -37,20 +64,91 @@ class PDFOCRProcessor:
             self.log_message(f"❌ 下載失敗: {e}")
             return False
     def pdf_to_text_ocr(self, pdf_path):
         """Convert a PDF to text using OCR.

         Returns the accumulated page text, or ``None`` (after logging the
         error) if any step raises.
         """
         try:
             self.log_message(f"🔍 開始OCR處理: {os.path.basename(pdf_path)}")
-            images = convert_from_path(pdf_path, dpi=300)
             all_text = ""
             for i, img in enumerate(images):
                 self.log_message(f"   處理第 {i+1}/{len(images)} 頁...")
-                text = pytesseract.image_to_string(img, lang='chi_tra')
-                all_text += f"\n--- 第 {i+1} 頁 ---\n{text}"
             self.log_message(f"✅ OCR完成，共處理 {len(images)} 頁")
             return all_text
         except Exception as e:
             self.log_message(f"❌ OCR處理失敗: {e}")
             return None
@@ -216,8 +314,34 @@ def create_interface():
         1. 在下方文字框中輸入PDF網址（每行一個）
         2. 點擊「開始處理」按鈕
         3. 等待處理完成後，可以下載包含所有結果的ZIP檔案
         """)
         with gr.Row():
             with gr.Column(scale=2):
                 pdf_urls_input = gr.Textbox(
@@ -258,6 +382,16 @@ def create_interface():
             )
         # 綁定處理函數
         process_btn.click(
             fn=processor.process_multiple_pdfs,
             inputs=[pdf_urls_input],
@@ -282,26 +416,30 @@ def create_interface():
     return app
-# 啟動應用
 if __name__ == "__main__":
-    # 檢查必要套件
-    required_packages = """
-    在運行此應用前，請確保已安裝以下套件：
-    pip install gradio requests pdf2image pillow pytesseract pandas
-    # Ubuntu/Debian 系統
-    apt-get install -y tesseract-ocr tesseract-ocr-chi-tra poppler-utils
-    # macOS 系統
-    brew install tesseract tesseract-lang poppler
-    # Windows 系統
-    # 請下載並安裝 Tesseract OCR 和 Poppler
-    """
-    print(required_packages)
     app = create_interface()
     app.launch(
         server_name="0.0.0.0",

 from io import BytesIO
 import pandas as pd
 from datetime import datetime
+import subprocess
+import sys
+import fitz  # PyMuPDF 作為備用方案
 class PDFOCRProcessor:
     def __init__(self):
         """Set up empty result/log buffers, then probe the external tools."""
         self.results = []
         self.processing_log = []
         # Must run after processing_log exists: check_dependencies() reports
         # through log_message(), which appends to that list.
+        self.check_dependencies()
+    def check_dependencies(self):
+        """檢查系統依賴"""
+        self.log_message("🔍 檢查系統依賴...")
+        # 檢查 poppler
+        try:
+            result = subprocess.run(['pdftoppm', '-h'], capture_output=True, text=True)
+            self.log_message("✅ Poppler 已安裝")
+            self.has_poppler = True
+        except FileNotFoundError:
+            self.log_message("⚠️ 未找到 Poppler，將使用 PyMuPDF 作為備用方案")
+            self.has_poppler = False
+        # 檢查 tesseract
+        try:
+            result = subprocess.run(['tesseract', '--version'], capture_output=True, text=True)
+            self.log_message("✅ Tesseract 已安裝")
+            self.has_tesseract = True
+        except FileNotFoundError:
+            self.log_message("❌ 未找到 Tesseract OCR")
+            self.has_tesseract = False
     def log_message(self, message):
         """Append a timestamped log message and return the formatted entry."""
         timestamp = datetime.now().strftime("%H:%M:%S")
         log_entry = f"[{timestamp}] {message}"
         self.processing_log.append(log_entry)
+        print(log_entry)  # also echo the entry to the console
         return log_entry
     def download_pdf(self, url, filename):
             self.log_message(f"❌ 下載失敗: {e}")
             return False
+    def pdf_to_images_poppler(self, pdf_path):
+        """使用 poppler 將PDF轉換為圖片"""
+        try:
+            images = convert_from_path(pdf_path, dpi=300)
+            return images
+        except Exception as e:
+            self.log_message(f"❌ Poppler 轉換失敗: {e}")
+            return None
+    def pdf_to_images_pymupdf(self, pdf_path):
+        """使用 PyMuPDF 將PDF轉換為圖片"""
+        try:
+            doc = fitz.open(pdf_path)
+            images = []
+            for page_num in range(len(doc)):
+                page = doc.load_page(page_num)
+                # 提高解析度
+                mat = fitz.Matrix(2.0, 2.0)  # 2x zoom
+                pix = page.get_pixmap(matrix=mat)
+                img_data = pix.tobytes("ppm")
+                img = Image.open(BytesIO(img_data))
+                images.append(img)
+            doc.close()
+            return images
+        except Exception as e:
+            self.log_message(f"❌ PyMuPDF 轉換失敗: {e}")
+            return None
     def pdf_to_text_ocr(self, pdf_path):
         """Extract text from a PDF by rasterising its pages and running OCR.

         Pages are rendered with Poppler when ``self.has_poppler`` is true,
         falling back to PyMuPDF when Poppler is unavailable or its
         conversion returns ``None``. Each page is tried against the
         Tesseract language models ``chi_tra``, ``chi_sim`` and ``eng`` in
         that order; the first non-blank result wins, and Tesseract's default
         settings are tried only if every model raised.

         Args:
             pdf_path: Path of the PDF file to process.

         Returns:
             The OCR text of all pages, each preceded by a
             ``--- 第 N 頁 ---`` header (pages that fail are marked
             ``[處理失敗]``), or ``None`` when Tesseract is missing, the PDF
             cannot be rendered, or an unexpected error occurs.
         """
         # The candidate models are the same for every page, so build once.
         ocr_langs = ('chi_tra', 'chi_sim', 'eng')
         try:
             self.log_message(f"🔍 開始OCR處理: {os.path.basename(pdf_path)}")

             # Fail fast: without Tesseract the rendered pages would be thrown
             # away, so skip the expensive rasterisation entirely.
             if not self.has_tesseract:
                 self.log_message("❌ Tesseract 未安裝，無法進行OCR")
                 return None

             # Try the available PDF renderers in order of preference.
             images = None
             if self.has_poppler:
                 self.log_message("   使用 Poppler 轉換PDF...")
                 images = self.pdf_to_images_poppler(pdf_path)
             if images is None:
                 self.log_message("   使用 PyMuPDF 轉換PDF...")
                 images = self.pdf_to_images_pymupdf(pdf_path)
             if images is None:
                 self.log_message("❌ 無法轉換PDF為圖片")
                 return None

             page_total = len(images)
             sections = []  # joined once at the end instead of repeated +=
             for page_no, img in enumerate(images, start=1):
                 self.log_message(f"   處理第 {page_no}/{page_total} 頁...")
                 try:
                     text = None
                     for lang in ocr_langs:
                         try:
                             text = pytesseract.image_to_string(img, lang=lang)
                             if text.strip():  # keep the first non-blank result
                                 self.log_message(f"   使用 {lang} 語言模型成功")
                                 break
                         except Exception as lang_error:
                             self.log_message(f"   {lang} 語言模型失敗: {lang_error}")
                             continue
                     if text is None:
                         # Every model raised: last resort is the default setup.
                         text = pytesseract.image_to_string(img)
                     sections.append(f"\n--- 第 {page_no} 頁 ---\n{text}")
                 except Exception as page_error:
                     # One bad page must not discard the rest of the document.
                     self.log_message(f"   第 {page_no} 頁處理失敗: {page_error}")
                     sections.append(f"\n--- 第 {page_no} 頁 ---\n[處理失敗]\n")

             self.log_message(f"✅ OCR完成，共處理 {page_total} 頁")
             return "".join(sections)
         except Exception as e:
             self.log_message(f"❌ OCR處理失敗: {e}")
             return None
         1. 在下方文字框中輸入PDF網址（每行一個）
         2. 點擊「開始處理」按鈕
         3. 等待處理完成後，可以下載包含所有結果的ZIP檔案
+        ## 🔧 安裝說明：
+        如果遇到依賴問題，請安裝以下套件：
+        ```bash
+        # Python 套件
+        pip install gradio requests pdf2image pillow pytesseract pandas PyMuPDF
+        # Ubuntu/Debian 系統
+        sudo apt-get update
+        sudo apt-get install -y tesseract-ocr tesseract-ocr-chi-tra poppler-utils
+        # macOS 系統
+        brew install tesseract tesseract-lang poppler
+        # Windows 系統
+        # 請下載安裝 Tesseract OCR 和 Poppler for Windows
+        ```
         """)
+        # 添加依賴檢查按鈕
+        with gr.Row():
+            check_deps_btn = gr.Button("🔍 檢查系統依賴", variant="secondary")
+            deps_status = gr.Textbox(
+                label="依賴狀態",
+                lines=3,
+                interactive=False
+            )
         with gr.Row():
             with gr.Column(scale=2):
                 pdf_urls_input = gr.Textbox(
             )
         # 綁定處理函數
+        def check_dependencies():
+            processor_temp = PDFOCRProcessor()
+            status_lines = processor_temp.processing_log[-4:]  # 取最後4行日誌
+            return '\n'.join(status_lines)
+        check_deps_btn.click(
+            fn=check_dependencies,
+            outputs=[deps_status]
+        )
         process_btn.click(
             fn=processor.process_multiple_pdfs,
             inputs=[pdf_urls_input],
     return app
 if __name__ == "__main__":
+    # 檢查並安裝必要套件
+    required_packages = [
+        "gradio", "requests", "pdf2image", "pillow",
+        "pytesseract", "pandas", "PyMuPDF"
+    ]
+    print("📦 檢查 Python 套件...")
+    missing_packages = []
+    for package in required_packages:
+        try:
+            __import__(package.lower().replace('-', '_'))
+            print(f"✅ {package}")
+        except ImportError:
+            print(f"❌ {package}")
+            missing_packages.append(package)
+    if missing_packages:
+        print(f"\n⚠️ 缺少套件: {', '.join(missing_packages)}")
+        print("請執行以下命令安裝:")
+        print(f"pip install {' '.join(missing_packages)}")
+    print("\n🚀 啟動 Gradio 應用...")
     app = create_interface()
     app.launch(
         server_name="0.0.0.0",