# Source: Hugging Face Space (status: Paused) - file size: 9,895 bytes
import os
import gradio as gr
import fitz # PyMuPDF
from pathlib import Path
import google.generativeai as genai
import tempfile
import base64
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
import time
# Logging configuration: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Load settings from environment variables.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME", "gemini-1.5-pro")

# Fail fast at import time when a required setting is missing or empty.
if not GOOGLE_API_KEY:
    raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
if not MODEL_NAME:
    raise ValueError("環境変数 'MODEL_NAME' が設定されていません。")

genai.configure(api_key=GOOGLE_API_KEY)
def split_pdf(pdf_path, output_dir, pages_per_chunk=5):
    """Split a PDF into consecutive chunks of at most ``pages_per_chunk`` pages.

    Each chunk is saved in ``output_dir`` as ``split_<first>_to_<last>.pdf``,
    where ``<first>`` and ``<last>`` are 1-based page numbers.

    Args:
        pdf_path: Path of the source PDF.
        output_dir: Directory that receives the chunk files.
        pages_per_chunk: Maximum number of pages per chunk; must be >= 1.

    Returns:
        A list of ``(start_page, output_path)`` tuples in page order, where
        ``start_page`` is the 0-based index of the chunk's first page.

    Raises:
        ValueError: If ``pages_per_chunk`` is smaller than 1.
    """
    # Validate before opening anything so a bad argument cannot leak a handle
    # (and so a negative value no longer silently yields an empty result).
    if pages_per_chunk < 1:
        raise ValueError("pages_per_chunk must be at least 1")

    split_pdfs = []
    pdf_document = fitz.open(pdf_path)
    try:
        total_pages = len(pdf_document)
        for start_page in range(0, total_pages, pages_per_chunk):
            end_page = min(start_page + pages_per_chunk, total_pages) - 1
            output_path = os.path.join(
                output_dir, f"split_{start_page+1}_to_{end_page+1}.pdf"
            )
            # New, empty PDF that receives this chunk's pages.
            output_pdf = fitz.open()
            try:
                # Copy the whole page range in one call instead of page by page.
                output_pdf.insert_pdf(
                    pdf_document, from_page=start_page, to_page=end_page
                )
                output_pdf.save(output_path)
            finally:
                output_pdf.close()
            split_pdfs.append((start_page, output_path))
    finally:
        # Always release the source document, even if a chunk failed.
        pdf_document.close()
    return split_pdfs
def encode_pdf_to_base64(pdf_path):
    """Read the file at ``pdf_path`` and return its bytes as a base64 string.

    Args:
        pdf_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file contents, decoded to ``str`` as UTF-8.
    """
    with open(pdf_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def ocr_pdf_with_gemini(pdf_path, model_name):
    """OCR a PDF with a Gemini model and return the result as Markdown text.

    The PDF is base64-encoded and sent, together with a fixed instruction
    prompt, to ``generate_content`` of the model named ``model_name``.

    Args:
        pdf_path: Path of the PDF file to process.
        model_name: Name passed to ``genai.GenerativeModel``.

    Returns:
        The text of the model response. If the API call (or reading the
        response text) raises, the error is logged and an error-message
        string is returned instead of propagating the exception.
    """
    encoded_pdf = encode_pdf_to_base64(pdf_path)
    gemini_model = genai.GenerativeModel(model_name)
    prompt = """
You are an expert document processing assistant. Your task is to extract text from the provided PDF using OCR and convert it into a highly readable and visually appealing Markdown format.
**Crucial Instructions:**
1. **Consistency:** Maintain a consistent Markdown style (headings, lists, tables, font sizes, etc.) throughout the entire output, even if the PDF is split into multiple parts. Avoid variations in formatting between different sections.
2. **Visual Fidelity:** Strive to reproduce the original visual appearance of the PDF as closely as possible in Markdown. Pay close attention to:
* **Tables:** Recreate tables with accurate borders, cell alignment, and row/column spans. Use extended Markdown table syntax if needed.
* **Lists:** Preserve the original list style (numbered, bulleted, nested).
* **Font Sizes:** Use Markdown heading levels (#, ##, ###) and relative font size adjustments (if possible in your Markdown flavor) to approximate the visual hierarchy of the original text. Larger text should generally correspond to higher-level headings.
* **Layout:** Try to maintain the spatial relationships between elements (e.g., paragraphs, images). If there are multiple columns, consider representing that structure in Markdown, perhaps using tables.
3. **OCR Correction:** The OCR process may introduce errors (typos, misspellings, incorrect characters). Apply your language understanding capabilities to correct these errors and produce grammatically correct and semantically meaningful text. Do *not* blindly output the raw OCR result if it contains obvious mistakes.
4. **Content Filtering (Screenshots):** If the PDF primarily contains screenshots (e.g., of software interfaces), focus *exclusively* on extracting text from the *main content area* of the screenshots. *Do not* include text from:
* Window title bars
* Operating system toolbars (e.g., Windows taskbar)
* Menu bars *unless* they are directly related to the primary content (e.g. describing a software's menu options)
* Any other UI elements that are not part of the core content being displayed.
5. **Accuracy and Clarity:** Prioritize providing accurate and clear information to the user. Do not simply reproduce OCR output verbatim if it is nonsensical or misleading. Use your understanding of the content to present information in a user-friendly way.
6. **Output:** Only provide the extracted text in Markdown.
**Example (Illustrative - Adapt to the specific PDF):**
**Input PDF (Screenshot of a webpage):**
```
[Screenshot of a webpage with a large heading "Welcome", a paragraph of text, a bulleted list, and a table.]
```
**Desired Markdown Output:**
```markdown
# Welcome
This is a paragraph of introductory text. It explains the purpose of the webpage and provides some context.
* This is the first bullet point.
* This is the second bullet point.
* This is a nested bullet point.
| Feature | Description | Price |
|--------------|----------------------------|---------|
| Feature A | Description of Feature A | $10 |
| Feature B | Description of Feature B | $20 |
```
**Do NOT include text like "File Edit View" (from a menu bar) or "[X] Minimize Maximize Close" (from a window title bar).**
"""
    # The PDF travels as an inline part next to the text prompt.
    pdf_part = {"mime_type": "application/pdf", "data": encoded_pdf}
    try:
        generation_settings = genai.types.GenerationConfig(
            candidate_count=1,
            max_output_tokens=8192,
        )
        response = gemini_model.generate_content(
            [prompt, pdf_part],
            generation_config=generation_settings,
        )
        return response.text
    except Exception as e:
        # Best effort: report the failure as the chunk's text instead of raising.
        logging.error(f"Error during Gemini API call: {e}")
        return f"エラーが発生しました: {e}"
def process_pdf(pdf_file, progress=gr.Progress()):
    """Convert an uploaded PDF to Markdown by OCR-ing it chunk by chunk.

    The PDF is split into chunks with ``split_pdf``, every chunk is sent to
    ``ocr_pdf_with_gemini`` on a thread pool, and the per-chunk results are
    joined in page order, separated by blank lines.

    Args:
        pdf_file: The uploaded file, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object whose ``name``
            attribute holds the path. ``None`` means nothing was uploaded.
        progress: Gradio progress tracker used to report status to the UI.

    Returns:
        The combined Markdown text. A chunk that fails contributes an error
        message instead of its text. If ``pdf_file`` is ``None`` a short
        message is returned instead.
    """
    if pdf_file is None:
        # The convert button can be clicked before any file is uploaded.
        logging.warning("No PDF file was provided")
        return "PDFファイルがアップロードされていません。"

    # Accept both a plain path and a file-like wrapper exposing ``.name``;
    # which one Gradio passes appears to depend on its version/configuration.
    if isinstance(pdf_file, (str, os.PathLike)):
        source_pdf_path = os.fspath(pdf_file)
    else:
        source_pdf_path = pdf_file.name
    logging.info(f"Received file: {source_pdf_path}")

    with tempfile.TemporaryDirectory() as temp_dir:
        logging.info(f"Temporary PDF path: {source_pdf_path}")
        split_pdf_paths = split_pdf(source_pdf_path, temp_dir)
        logging.info(f"Split PDF paths: {split_pdf_paths}")
        progress(0.2, desc="PDFを分割中...")

        # Maps the 0-based start page of each chunk to its Markdown text.
        markdown_results = {}
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page
                for start_page, path in split_pdf_paths
            }
            for completed_count, future in enumerate(as_completed(futures), start=1):
                start_page = futures[future]
                try:
                    markdown_results[start_page] = future.result()
                except Exception as e:
                    logging.error(f"Error processing split PDF: {e}")
                    markdown_results[start_page] = f"分割PDFの処理中にエラーが発生しました: {e}"
                # Advance the bar for failed chunks too, so it cannot stall.
                progress(0.2 + 0.6 * completed_count / len(futures), desc="OCR処理中...")

    logging.info(f"Markdown results length: {len(markdown_results)}")
    progress(0.8, desc="結果を結合中...")
    # Chunks finish in arbitrary order; restore page order before joining.
    combined_markdown = "\n\n".join(
        markdown_results[page] for page in sorted(markdown_results)
    )
    progress(1.0, desc="完了")
    # Brief pause, presumably so the final progress state is visible in the UI.
    time.sleep(0.5)
    return combined_markdown
def create_interface():
    """Build the Gradio UI for the PDF-to-Markdown converter.

    The layout has a PDF upload field, a convert button wired to
    ``process_pdf``, a textbox showing the result, and two buttons that copy
    the result to the clipboard or download it as ``converted.md`` using
    client-side JavaScript only.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# PDF OCR & マークダウン変換ツール")
        gr.Markdown("PDFをアップロードすると、OCRでテキストを抽出しマークダウン形式に変換します。")
        with gr.Row():
            pdf_input = gr.File(label="PDFファイルをアップロード", file_types=[".pdf"])
        with gr.Row():
            convert_btn = gr.Button("変換開始", variant="primary",
                                    elem_id="convert-button")
        with gr.Row():
            markdown_output = gr.Textbox(label="変換結果", lines=10,
                                         max_lines=20)
        with gr.Row():
            copy_btn = gr.Button("クリップボードにコピー")
            download_btn = gr.Button("ダウンロード")

        # Highlight the convert button once a file has been uploaded. The
        # handler is passed inline: a <script> placed in a gr.HTML component
        # is not executed, so a function defined there would be undefined here.
        pdf_input.upload(
            None,
            [],
            [],
            js="""() => {
                const button = document.getElementById('convert-button');
                if (button) {
                    button.style.backgroundColor = 'orange';
                }
            }""",
        )
        convert_btn.click(
            fn=process_pdf,
            inputs=pdf_input,
            outputs=markdown_output
        )
        # Client-side only: copy the textbox content to the clipboard.
        copy_btn.click(
            None,
            markdown_output,
            [],
            js="(x) => { navigator.clipboard.writeText(x); }",
        )
        # Client-side only: offer the textbox content as a Markdown download
        # through a temporary object URL and a synthetic link click.
        download_btn.click(
            None,
            markdown_output,
            [],
            js="""(x) => {
                const blob = new Blob([x], {type: 'text/markdown;charset=utf-8'});
                const url = URL.createObjectURL(blob);
                const a = document.createElement('a');
                a.href = url;
                a.download = 'converted.md';
                document.body.appendChild(a);
                a.click();
                document.body.removeChild(a);
                URL.revokeObjectURL(url);
            }""",
        )
    return demo
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    demo = create_interface()
    demo.launch()