PDF_LLM_API

Paused

App Files Community

tomo2chin2 committed on Mar 2, 2025

Commit

d7da753

verified ·

1 Parent(s): 62a0bc6

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -34

app.py CHANGED Viewed

@@ -5,17 +5,22 @@ from pathlib import Path
 import google.generativeai as genai
 import tempfile
 import base64
-from concurrent.futures import ThreadPoolExecutor
 import logging
 import time
 # ロギング設定
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-# Gemini APIの設定 (環境変数から取得)
 GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
 if not GOOGLE_API_KEY:
     raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
 genai.configure(api_key=GOOGLE_API_KEY)
@@ -41,7 +46,7 @@ def split_pdf(pdf_path, output_dir, pages_per_chunk=5):
         output_pdf.save(output_path)
         output_pdf.close()
-        split_pdfs.append(output_path)
     pdf_document.close()
     return split_pdfs
@@ -53,32 +58,44 @@ def encode_pdf_to_base64(pdf_path):
         return base64.b64encode(pdf_file.read()).decode('utf-8')
-def ocr_pdf_with_gemini(pdf_path):
     """GeminiモデルでPDFをOCRしてマークダウンに変換する関数"""
     # PDFをbase64エンコード
     pdf_base64 = encode_pdf_to_base64(pdf_path)
     # Geminiモデルの設定
-    model = genai.GenerativeModel('gemini-2.0-flash')  # または利用可能な他のモデル
     # プロンプトの設定
     prompt = """
-    このPDFに含まれるテキストをOCRで読み取り、整形されたマークダウン形式に変換してください。
-    以下の点に注意してください：
-    - 見出しは適切なマークダウン見出し記法（#, ##, ###など）を使用
-    - 箇条書きリストは適切に変換
-    - 表はマークダウン表形式に変換
-    - 段落構造を維持
-    - 余分な改行やスペースは整理
-    - 画像の内容は[画像: 内容の説明]と表記
     """
     # PDFをGeminiに送信
     try:
-        response = model.generate_content([
-            prompt,
-            {"mime_type": "application/pdf", "data": pdf_base64}
-        ])
         # 結果を返す
         return response.text
     except Exception as e:
@@ -86,6 +103,7 @@ def ocr_pdf_with_gemini(pdf_path):
         return f"エラーが発生しました: {e}"
 def process_pdf(pdf_file, progress=gr.Progress()):
     """PDFファイルを処理するメイン関数"""
     logging.info(f"Received file: {pdf_file.name if hasattr(pdf_file, 'name') else pdf_file}")
@@ -96,30 +114,33 @@ def process_pdf(pdf_file, progress=gr.Progress()):
         logging.info(f"Temporary PDF path: {temp_pdf_path}")
         # PDFを分割
-        split_pdf_paths = split_pdf(temp_pdf_path, temp_dir)
         logging.info(f"Split PDF paths: {split_pdf_paths}")
-        progress(0.2, desc="PDFを分割中...")  # 進捗更新
         # 並列処理でOCR変換
-        markdown_results = []
         with ThreadPoolExecutor() as executor:
-            # futureオブジェクトのリストを作成し、進捗を追跡
-            futures = [executor.submit(ocr_pdf_with_gemini, path) for path in split_pdf_paths]
-            for i, future in enumerate(futures):
                 try:
                     result = future.result()
-                    markdown_results.append(result)
-                    progress(0.2 + 0.6 * (i + 1) / len(futures), desc="OCR処理中...")  # 進捗更新
                 except Exception as e:
                     logging.error(f"Error processing split PDF: {e}")
-                    markdown_results.append(f"分割PDFの処理中にエラーが発生しました: {e}")
         logging.info(f"Markdown results length: {len(markdown_results)}")
-        progress(0.8, desc="結果を結合中...")  # 進捗更新
-        # 結果を結合
-        combined_markdown = "\n\n".join(markdown_results)
-        progress(1.0, desc="完了")  # 進捗更新
-        time.sleep(0.5)  # 完了表示のため少し待つ
         return combined_markdown
@@ -135,11 +156,11 @@ def create_interface():
         with gr.Row():
             convert_btn = gr.Button("変換開始", variant="primary",
-                                    elem_id="convert-button")  # variantとelem_idを追加
         with gr.Row():
             markdown_output = gr.Textbox(label="変換結果", lines=10,
-                                         max_lines=20)  # MarkdownからTextboxに変更、行数を指定
         with gr.Row():
             copy_btn = gr.Button("クリップボードにコピー")
@@ -154,7 +175,7 @@ def create_interface():
             }
             </script>
             """,
-            visible=False, # 非表示にしておく
         )
         # js_code が読み込まれた後に styleButton 関数を実行

 import google.generativeai as genai
 import tempfile
 import base64
+from concurrent.futures import ThreadPoolExecutor, as_completed  # as_completed を追加
 import logging
 import time
 # ロギング設定
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# 環境変数から設定を読み込む
 GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
+MODEL_NAME = os.environ.get("MODEL_NAME", "gemini-1.5-pro")  # デフォルトモデルも設定
 if not GOOGLE_API_KEY:
     raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
+if not MODEL_NAME:
+    raise ValueError("環境変数 'MODEL_NAME' が設定されていません。")
 genai.configure(api_key=GOOGLE_API_KEY)
         output_pdf.save(output_path)
         output_pdf.close()
+        split_pdfs.append((start_page, output_path))  # (開始ページ, パス) のタプルで保存
     pdf_document.close()
     return split_pdfs
         return base64.b64encode(pdf_file.read()).decode('utf-8')
+def ocr_pdf_with_gemini(pdf_path, model_name):
     """GeminiモデルでPDFをOCRしてマークダウンに変換する関数"""
     # PDFをbase64エンコード
     pdf_base64 = encode_pdf_to_base64(pdf_path)
     # Geminiモデルの設定
+    model = genai.GenerativeModel(model_name)
     # プロンプトの設定
     prompt = """
+    Extract the text content from this PDF using OCR and output it in a well-structured Markdown format.
+    Focus solely on the text extraction; do not include any conversational elements, greetings, or additional explanations.
+    Only provide the extracted text in Markdown.
+    Pay attention to the following:
+    - Use appropriate Markdown heading syntax (#, ##, ###, etc.) for headings.
+    - Convert bulleted lists correctly.
+    - Convert tables to Markdown table format.
+    - Maintain the paragraph structure.
+    - Clean up any extra line breaks or spaces.
+    - For images, use the format [Image: Description of the content].
     """
     # PDFをGeminiに送信
     try:
+        response = model.generate_content(
+            [
+                prompt,
+                {"mime_type": "application/pdf", "data": pdf_base64}
+            ],
+            # generation_configで不要な応答を抑制 (モデルによっては効かない場合あり)
+            generation_config=genai.types.GenerationConfig(
+                candidate_count=1,  # 候補を1つに絞る
+                max_output_tokens=8192, # 必要に応じてトークン数を調整
+                # stop_sequences=["."],  # 句点で生成を停止 (モデルによっては無効)
+            )
+        )
         # 結果を返す
         return response.text
     except Exception as e:
         return f"エラーが発生しました: {e}"
 def process_pdf(pdf_file, progress=gr.Progress()):
     """PDFファイルを処理するメイン関数"""
     logging.info(f"Received file: {pdf_file.name if hasattr(pdf_file, 'name') else pdf_file}")
         logging.info(f"Temporary PDF path: {temp_pdf_path}")
         # PDFを分割
+        split_pdf_paths = split_pdf(temp_pdf_path, temp_dir)  # [(start_page, path), ...]
         logging.info(f"Split PDF paths: {split_pdf_paths}")
+        progress(0.2, desc="PDFを分割中...")
         # 並列処理でOCR変換
+        markdown_results = {}  # {start_page: markdown_text, ...}  辞書に変更
         with ThreadPoolExecutor() as executor:
+            futures = {executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page for start_page, path in split_pdf_paths}
+            completed_count = 0
+            for future in as_completed(futures):
+                start_page = futures[future]
                 try:
                     result = future.result()
+                    markdown_results[start_page] = result
+                    completed_count += 1
+                    progress(0.2 + 0.6 * (completed_count) / len(futures), desc="OCR処理中...")
                 except Exception as e:
                     logging.error(f"Error processing split PDF: {e}")
+                    markdown_results[start_page] = f"分割PDFの処理中にエラーが発生しました: {e}"
         logging.info(f"Markdown results length: {len(markdown_results)}")
+        progress(0.8, desc="結果を結合中...")
+        # 結果を結合 (開始ページ番号でソート)
+        combined_markdown = "\n\n".join(markdown_results[page] for page in sorted(markdown_results.keys()))
+        progress(1.0, desc="完了")
+        time.sleep(0.5)
         return combined_markdown
         with gr.Row():
             convert_btn = gr.Button("変換開始", variant="primary",
+                                    elem_id="convert-button")
         with gr.Row():
             markdown_output = gr.Textbox(label="変換結果", lines=10,
+                                         max_lines=20)
         with gr.Row():
             copy_btn = gr.Button("クリップボードにコピー")
             }
             </script>
             """,
+            visible=False,
         )
         # js_code が読み込まれた後に styleButton 関数を実行