Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,7 @@ from pathlib import Path
|
|
| 5 |
import google.generativeai as genai
|
| 6 |
import tempfile
|
| 7 |
import base64
|
| 8 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
import logging
|
| 10 |
import time
|
| 11 |
|
|
@@ -14,7 +14,7 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
|
| 14 |
|
| 15 |
# 環境変数から設定を読み込む
|
| 16 |
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
| 17 |
-
MODEL_NAME = os.environ.get("MODEL_NAME", "gemini-1.5-pro")
|
| 18 |
|
| 19 |
if not GOOGLE_API_KEY:
|
| 20 |
raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
|
|
@@ -46,7 +46,7 @@ def split_pdf(pdf_path, output_dir, pages_per_chunk=5):
|
|
| 46 |
output_pdf.save(output_path)
|
| 47 |
output_pdf.close()
|
| 48 |
|
| 49 |
-
split_pdfs.append((start_page, output_path))
|
| 50 |
|
| 51 |
pdf_document.close()
|
| 52 |
return split_pdfs
|
|
@@ -60,43 +60,69 @@ def encode_pdf_to_base64(pdf_path):
|
|
| 60 |
|
| 61 |
def ocr_pdf_with_gemini(pdf_path, model_name):
|
| 62 |
"""GeminiモデルでPDFをOCRしてマークダウンに変換する関数"""
|
| 63 |
-
# PDFをbase64エンコード
|
| 64 |
pdf_base64 = encode_pdf_to_base64(pdf_path)
|
| 65 |
-
|
| 66 |
-
# Geminiモデルの設定
|
| 67 |
model = genai.GenerativeModel(model_name)
|
| 68 |
|
| 69 |
-
# プロンプトの設定
|
| 70 |
prompt = """
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
"""
|
| 83 |
|
| 84 |
-
# PDFをGeminiに送信
|
| 85 |
try:
|
| 86 |
response = model.generate_content(
|
| 87 |
[
|
| 88 |
prompt,
|
| 89 |
{"mime_type": "application/pdf", "data": pdf_base64}
|
| 90 |
],
|
| 91 |
-
# generation_configで不要な応答を抑制 (モデルによっては効かない場合あり)
|
| 92 |
generation_config=genai.types.GenerationConfig(
|
| 93 |
-
candidate_count=1,
|
| 94 |
-
max_output_tokens=8192,
|
| 95 |
-
# stop_sequences=["."], # 句点で生成を停止 (モデルによっては無効)
|
| 96 |
)
|
| 97 |
-
|
| 98 |
)
|
| 99 |
-
# 結果を返す
|
| 100 |
return response.text
|
| 101 |
except Exception as e:
|
| 102 |
logging.error(f"Error during Gemini API call: {e}")
|
|
@@ -108,18 +134,15 @@ def process_pdf(pdf_file, progress=gr.Progress()):
|
|
| 108 |
"""PDFファイルを処理するメイン関数"""
|
| 109 |
logging.info(f"Received file: {pdf_file.name if hasattr(pdf_file, 'name') else pdf_file}")
|
| 110 |
|
| 111 |
-
# 一時ディレクトリを作成
|
| 112 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 113 |
temp_pdf_path = pdf_file.name
|
| 114 |
logging.info(f"Temporary PDF path: {temp_pdf_path}")
|
| 115 |
|
| 116 |
-
|
| 117 |
-
split_pdf_paths = split_pdf(temp_pdf_path, temp_dir) # [(start_page, path), ...]
|
| 118 |
logging.info(f"Split PDF paths: {split_pdf_paths}")
|
| 119 |
progress(0.2, desc="PDFを分割中...")
|
| 120 |
|
| 121 |
-
|
| 122 |
-
markdown_results = {} # {start_page: markdown_text, ...} 辞書に変更
|
| 123 |
with ThreadPoolExecutor() as executor:
|
| 124 |
futures = {executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page for start_page, path in split_pdf_paths}
|
| 125 |
completed_count = 0
|
|
@@ -137,7 +160,6 @@ def process_pdf(pdf_file, progress=gr.Progress()):
|
|
| 137 |
logging.info(f"Markdown results length: {len(markdown_results)}")
|
| 138 |
progress(0.8, desc="結果を結合中...")
|
| 139 |
|
| 140 |
-
# 結果を結合 (開始ページ番号でソート)
|
| 141 |
combined_markdown = "\n\n".join(markdown_results[page] for page in sorted(markdown_results.keys()))
|
| 142 |
progress(1.0, desc="完了")
|
| 143 |
time.sleep(0.5)
|
|
@@ -145,7 +167,6 @@ def process_pdf(pdf_file, progress=gr.Progress()):
|
|
| 145 |
return combined_markdown
|
| 146 |
|
| 147 |
|
| 148 |
-
# Gradioインターフェースの作成
|
| 149 |
def create_interface():
|
| 150 |
with gr.Blocks() as demo:
|
| 151 |
gr.Markdown("# PDF OCR & マークダウン変換ツール")
|
|
@@ -166,7 +187,6 @@ def create_interface():
|
|
| 166 |
copy_btn = gr.Button("クリップボードにコピー")
|
| 167 |
download_btn = gr.Button("ダウンロード")
|
| 168 |
|
| 169 |
-
# JavaScriptコードを埋め込むためのHTMLコンポーネント
|
| 170 |
js_code = gr.HTML(
|
| 171 |
"""
|
| 172 |
<script>
|
|
@@ -178,17 +198,14 @@ def create_interface():
|
|
| 178 |
visible=False,
|
| 179 |
)
|
| 180 |
|
| 181 |
-
# js_code が読み込まれた後に styleButton 関数を実行
|
| 182 |
pdf_input.upload(None, [], [], js="styleButton")
|
| 183 |
|
| 184 |
-
|
| 185 |
convert_btn.click(
|
| 186 |
fn=process_pdf,
|
| 187 |
inputs=pdf_input,
|
| 188 |
outputs=markdown_output
|
| 189 |
)
|
| 190 |
|
| 191 |
-
# クリップボードにコピー
|
| 192 |
copy_btn.click(
|
| 193 |
None,
|
| 194 |
markdown_output,
|
|
@@ -196,7 +213,6 @@ def create_interface():
|
|
| 196 |
js=f"(x) => {{ navigator.clipboard.writeText(x); }}",
|
| 197 |
)
|
| 198 |
|
| 199 |
-
# ダウンロード
|
| 200 |
download_btn.click(
|
| 201 |
None,
|
| 202 |
markdown_output,
|
|
@@ -216,7 +232,6 @@ def create_interface():
|
|
| 216 |
return demo
|
| 217 |
|
| 218 |
|
| 219 |
-
# アプリの起動
|
| 220 |
if __name__ == "__main__":
|
| 221 |
demo = create_interface()
|
| 222 |
demo.launch()
|
|
|
|
| 5 |
import google.generativeai as genai
|
| 6 |
import tempfile
|
| 7 |
import base64
|
| 8 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
import logging
|
| 10 |
import time
|
| 11 |
|
|
|
|
| 14 |
|
| 15 |
# 環境変数から設定を読み込む
|
| 16 |
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
| 17 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "gemini-1.5-pro")
|
| 18 |
|
| 19 |
if not GOOGLE_API_KEY:
|
| 20 |
raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
|
|
|
|
| 46 |
output_pdf.save(output_path)
|
| 47 |
output_pdf.close()
|
| 48 |
|
| 49 |
+
split_pdfs.append((start_page, output_path))
|
| 50 |
|
| 51 |
pdf_document.close()
|
| 52 |
return split_pdfs
|
|
|
|
| 60 |
|
| 61 |
def ocr_pdf_with_gemini(pdf_path, model_name):
|
| 62 |
"""GeminiモデルでPDFをOCRしてマークダウンに変換する関数"""
|
|
|
|
| 63 |
pdf_base64 = encode_pdf_to_base64(pdf_path)
|
|
|
|
|
|
|
| 64 |
model = genai.GenerativeModel(model_name)
|
| 65 |
|
|
|
|
| 66 |
prompt = """
|
| 67 |
+
You are an expert document processing assistant. Your task is to extract text from the provided PDF using OCR and convert it into a highly readable and visually appealing Markdown format.
|
| 68 |
+
|
| 69 |
+
**Crucial Instructions:**
|
| 70 |
+
|
| 71 |
+
1. **Consistency:** Maintain a consistent Markdown style (headings, lists, tables, font sizes, etc.) throughout the entire output, even if the PDF is split into multiple parts. Avoid variations in formatting between different sections.
|
| 72 |
+
2. **Visual Fidelity:** Strive to reproduce the original visual appearance of the PDF as closely as possible in Markdown. Pay close attention to:
|
| 73 |
+
* **Tables:** Recreate tables with accurate borders, cell alignment, and row/column spans. Use extended Markdown table syntax if needed.
|
| 74 |
+
* **Lists:** Preserve the original list style (numbered, bulleted, nested).
|
| 75 |
+
* **Font Sizes:** Use Markdown heading levels (#, ##, ###) and relative font size adjustments (if possible in your Markdown flavor) to approximate the visual hierarchy of the original text. Larger text should generally correspond to higher-level headings.
|
| 76 |
+
* **Layout:** Try to maintain the spatial relationships between elements (e.g., paragraphs, images). If there are multiple columns, consider representing that structure in Markdown, perhaps using tables.
|
| 77 |
+
3. **OCR Correction:** The OCR process may introduce errors (typos, misspellings, incorrect characters). Apply your language understanding capabilities to correct these errors and produce grammatically correct and semantically meaningful text. Do *not* blindly output the raw OCR result if it contains obvious mistakes.
|
| 78 |
+
4. **Content Filtering (Screenshots):** If the PDF primarily contains screenshots (e.g., of software interfaces), focus *exclusively* on extracting text from the *main content area* of the screenshots. *Do not* include text from:
|
| 79 |
+
* Window title bars
|
| 80 |
+
* Operating system toolbars (e.g., Windows taskbar)
|
| 81 |
+
* Menu bars *unless* they are directly related to the primary content (e.g. describing a software's menu options)
|
| 82 |
+
* Any other UI elements that are not part of the core content being displayed.
|
| 83 |
+
5. **Accuracy and Clarity:** Prioritize providing accurate and clear information to the user. Do not simply reproduce OCR output verbatim if it is nonsensical or misleading. Use your understanding of the content to present information in a user-friendly way.
|
| 84 |
+
6. **Output:** Only provide the extracted text in Markdown.
|
| 85 |
+
|
| 86 |
+
**Example (Illustrative - Adapt to the specific PDF):**
|
| 87 |
+
|
| 88 |
+
**Input PDF (Screenshot of a webpage):**
|
| 89 |
+
|
| 90 |
+
```
|
| 91 |
+
[Screenshot of a webpage with a large heading "Welcome", a paragraph of text, a bulleted list, and a table.]
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
**Desired Markdown Output:**
|
| 95 |
+
|
| 96 |
+
```markdown
|
| 97 |
+
# Welcome
|
| 98 |
+
|
| 99 |
+
This is a paragraph of introductory text. It explains the purpose of the webpage and provides some context.
|
| 100 |
+
|
| 101 |
+
* This is the first bullet point.
|
| 102 |
+
* This is the second bullet point.
|
| 103 |
+
* This is a nested bullet point.
|
| 104 |
+
|
| 105 |
+
| Feature | Description | Price |
|
| 106 |
+
|--------------|----------------------------|---------|
|
| 107 |
+
| Feature A | Description of Feature A | $10 |
|
| 108 |
+
| Feature B | Description of Feature B | $20 |
|
| 109 |
+
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
**Do NOT include text like "File Edit View" (from a menu bar) or "[X] Minimize Maximize Close" (from a window title bar).**
|
| 113 |
"""
|
| 114 |
|
|
|
|
| 115 |
try:
|
| 116 |
response = model.generate_content(
|
| 117 |
[
|
| 118 |
prompt,
|
| 119 |
{"mime_type": "application/pdf", "data": pdf_base64}
|
| 120 |
],
|
|
|
|
| 121 |
generation_config=genai.types.GenerationConfig(
|
| 122 |
+
candidate_count=1,
|
| 123 |
+
max_output_tokens=8192,
|
|
|
|
| 124 |
)
|
|
|
|
| 125 |
)
|
|
|
|
| 126 |
return response.text
|
| 127 |
except Exception as e:
|
| 128 |
logging.error(f"Error during Gemini API call: {e}")
|
|
|
|
| 134 |
"""PDFファイルを処理するメイン関数"""
|
| 135 |
logging.info(f"Received file: {pdf_file.name if hasattr(pdf_file, 'name') else pdf_file}")
|
| 136 |
|
|
|
|
| 137 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 138 |
temp_pdf_path = pdf_file.name
|
| 139 |
logging.info(f"Temporary PDF path: {temp_pdf_path}")
|
| 140 |
|
| 141 |
+
split_pdf_paths = split_pdf(temp_pdf_path, temp_dir)
|
|
|
|
| 142 |
logging.info(f"Split PDF paths: {split_pdf_paths}")
|
| 143 |
progress(0.2, desc="PDFを分割中...")
|
| 144 |
|
| 145 |
+
markdown_results = {}
|
|
|
|
| 146 |
with ThreadPoolExecutor() as executor:
|
| 147 |
futures = {executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page for start_page, path in split_pdf_paths}
|
| 148 |
completed_count = 0
|
|
|
|
| 160 |
logging.info(f"Markdown results length: {len(markdown_results)}")
|
| 161 |
progress(0.8, desc="結果を結合中...")
|
| 162 |
|
|
|
|
| 163 |
combined_markdown = "\n\n".join(markdown_results[page] for page in sorted(markdown_results.keys()))
|
| 164 |
progress(1.0, desc="完了")
|
| 165 |
time.sleep(0.5)
|
|
|
|
| 167 |
return combined_markdown
|
| 168 |
|
| 169 |
|
|
|
|
| 170 |
def create_interface():
|
| 171 |
with gr.Blocks() as demo:
|
| 172 |
gr.Markdown("# PDF OCR & マークダウン変換ツール")
|
|
|
|
| 187 |
copy_btn = gr.Button("クリップボードにコピー")
|
| 188 |
download_btn = gr.Button("ダウンロード")
|
| 189 |
|
|
|
|
| 190 |
js_code = gr.HTML(
|
| 191 |
"""
|
| 192 |
<script>
|
|
|
|
| 198 |
visible=False,
|
| 199 |
)
|
| 200 |
|
|
|
|
| 201 |
pdf_input.upload(None, [], [], js="styleButton")
|
| 202 |
|
|
|
|
| 203 |
convert_btn.click(
|
| 204 |
fn=process_pdf,
|
| 205 |
inputs=pdf_input,
|
| 206 |
outputs=markdown_output
|
| 207 |
)
|
| 208 |
|
|
|
|
| 209 |
copy_btn.click(
|
| 210 |
None,
|
| 211 |
markdown_output,
|
|
|
|
| 213 |
js=f"(x) => {{ navigator.clipboard.writeText(x); }}",
|
| 214 |
)
|
| 215 |
|
|
|
|
| 216 |
download_btn.click(
|
| 217 |
None,
|
| 218 |
markdown_output,
|
|
|
|
| 232 |
return demo
|
| 233 |
|
| 234 |
|
|
|
|
| 235 |
if __name__ == "__main__":
|
| 236 |
demo = create_interface()
|
| 237 |
demo.launch()
|