"""PDF OCR and Markdown conversion service.

Downloads a PDF from a URL, splits it into page chunks with PyMuPDF, converts
each chunk to Markdown with a Gemini model, and exposes the result through a
FastAPI endpoint (``POST /api/ocr``) and a Gradio web UI.
"""
import os
import logging
import tempfile
import time
import base64
import requests
import fitz  # PyMuPDF
import gradio as gr
import google.generativeai as genai
from concurrent.futures import ThreadPoolExecutor, as_completed
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware

# Logging configuration
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Load settings from environment variables. Both values are required, so the
# module fails fast at import time when either one is missing or empty.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME", "gemini-1.5-pro")
if not GOOGLE_API_KEY:
    raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
if not MODEL_NAME:
    raise ValueError("環境変数 'MODEL_NAME' が設定されていません。")
genai.configure(api_key=GOOGLE_API_KEY)


def split_pdf(pdf_path, output_dir, pages_per_chunk=5):
    """Split a PDF into consecutive chunks of at most ``pages_per_chunk`` pages.

    Each chunk is saved in ``output_dir`` as ``split_<first>_to_<last>.pdf``,
    where the page numbers in the file name are 1-based.

    Args:
        pdf_path: Path of the source PDF.
        output_dir: Directory that receives the chunk files (it is not
            created here).
        pages_per_chunk: Maximum number of pages per chunk; must be >= 1.

    Returns:
        A list of ``(start_page, output_path)`` tuples in page order, where
        ``start_page`` is the 0-based index of the chunk's first page.

    Raises:
        ValueError: If ``pages_per_chunk`` is smaller than 1.
    """
    # Validate before opening anything: a zero step makes range() fail and a
    # negative one would silently yield no chunks at all.
    if pages_per_chunk < 1:
        raise ValueError(f"pages_per_chunk must be >= 1, got {pages_per_chunk!r}")

    split_pdfs = []
    # Context managers guarantee both documents are closed even when saving
    # or page insertion raises.
    with fitz.open(pdf_path) as pdf_document:
        total_pages = len(pdf_document)
        for start_page in range(0, total_pages, pages_per_chunk):
            end_page = min(start_page + pages_per_chunk, total_pages) - 1
            output_path = os.path.join(
                output_dir, f"split_{start_page+1}_to_{end_page+1}.pdf"
            )
            with fitz.open() as output_pdf:
                # Copy the whole page range in a single call instead of
                # inserting one page at a time.
                output_pdf.insert_pdf(
                    pdf_document, from_page=start_page, to_page=end_page
                )
                output_pdf.save(output_path)
            split_pdfs.append((start_page, output_path))
    return split_pdfs


def encode_pdf_to_base64(pdf_path):
    """Return the contents of the file at ``pdf_path`` as base64 text."""
    with open(pdf_path, "rb") as stream:
        raw_bytes = stream.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")


def ocr_pdf_with_gemini(pdf_path, model_name):
    """OCR a PDF with a Gemini model and return the result as Markdown.

    Args:
        pdf_path: Path of the PDF to send to the model.
        model_name: Name of the Gemini model to instantiate.

    Returns:
        The text of the model response. If the API call (or reading the
        response text) raises, the error is logged and an error message
        string is returned instead of raising.
    """
    encoded_pdf = encode_pdf_to_base64(pdf_path)
    gemini = genai.GenerativeModel(model_name)

    # The instruction text is sent verbatim, including its leading indentation.
    instructions = """
    You are an expert document processing assistant. Your task is to extract text from the provided PDF using OCR and convert it into a highly readable and visually appealing Markdown format.

    **Crucial Instructions:**

    1.  **Consistency:** Maintain a consistent Markdown style (headings, lists, tables, font sizes, etc.) throughout the entire output, even if the PDF is split into multiple parts. Avoid variations in formatting between different sections.
    2.  **Visual Fidelity:** Strive to reproduce the original visual appearance of the PDF as closely as possible in Markdown. Pay close attention to:
        *   **Tables:** Recreate tables with accurate borders, cell alignment, and row/column spans.  Use extended Markdown table syntax if needed.
        *   **Lists:** Preserve the original list style (numbered, bulleted, nested).
        *   **Font Sizes:**  Use Markdown heading levels (#, ##, ###) and relative font size adjustments (if possible in your Markdown flavor) to approximate the visual hierarchy of the original text.  Larger text should generally correspond to higher-level headings.
        *   **Layout:** Try to maintain the spatial relationships between elements (e.g., paragraphs, images). If there are multiple columns, consider representing that structure in Markdown, perhaps using tables.
    3.  **OCR Correction:** The OCR process may introduce errors (typos, misspellings, incorrect characters).  Apply your language understanding capabilities to correct these errors and produce grammatically correct and semantically meaningful text.  Do *not* blindly output the raw OCR result if it contains obvious mistakes.
    4.  **Content Filtering (Screenshots):** If the PDF primarily contains screenshots (e.g., of software interfaces), focus *exclusively* on extracting text from the *main content area* of the screenshots.  *Do not* include text from:
        *   Window title bars
        *   Operating system toolbars (e.g., Windows taskbar)
        *   Menu bars *unless* they are directly related to the primary content (e.g. describing a software's menu options)
        *   Any other UI elements that are not part of the core content being displayed.
    5. **Accuracy and Clarity:** Prioritize providing accurate and clear information to the user. Do not simply reproduce OCR output verbatim if it is nonsensical or misleading. Use your understanding of the content to present information in a user-friendly way.
    6. **Output:** Only provide the extracted text in Markdown.

    **Example (Illustrative - Adapt to the specific PDF):**

    **Input PDF (Screenshot of a webpage):**

    ```
    [Screenshot of a webpage with a large heading "Welcome", a paragraph of text, a bulleted list, and a table.]
    ```

    **Desired Markdown Output:**

    ```markdown
    # Welcome

    This is a paragraph of introductory text. It explains the purpose of the webpage and provides some context.

    *   This is the first bullet point.
    *   This is the second bullet point.
        *   This is a nested bullet point.

    | Feature      | Description                | Price   |
    |--------------|----------------------------|---------|
    | Feature A    | Description of Feature A  | $10     |
    | Feature B    | Description of Feature B  | $20     |

    ```

    **Do NOT include text like "File Edit View" (from a menu bar) or "[X] Minimize Maximize Close" (from a window title bar).**
    """

    try:
        # Everything from building the request to reading ``.text`` stays in
        # the try so any failure is turned into the error string below.
        pdf_part = {"mime_type": "application/pdf", "data": encoded_pdf}
        settings = genai.types.GenerationConfig(
            candidate_count=1,
            max_output_tokens=8192,
        )
        reply = gemini.generate_content(
            [instructions, pdf_part],
            generation_config=settings,
        )
        return reply.text
    except Exception as e:
        logging.error(f"Error during Gemini API call: {e}")
        return f"エラーが発生しました: {e}"


def process_pdf(pdf_file, progress=gr.Progress()):
    """Convert a PDF to Markdown by OCR-ing it in page chunks.

    The PDF is split into chunks inside a temporary directory, every chunk is
    submitted to a thread pool for OCR, and the per-chunk Markdown is joined
    in page order.

    Args:
        pdf_file: Either an object exposing the PDF's path as ``.name``
            (for example an open file) or the path itself.
        progress: Gradio progress tracker used to report status.

    Returns:
        The Markdown of all chunks joined by blank lines. A chunk whose OCR
        task raised contributes an error message instead of its text.
    """
    # Resolve the path once, so a plain path works just like a file object
    # (previously only the log line handled objects without ``.name``).
    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
    logging.info(f"Received file: {pdf_path}")
    with tempfile.TemporaryDirectory() as temp_dir:
        logging.info(f"Temporary PDF path: {pdf_path}")
        # Announce the split before doing it, not after it has finished.
        progress(0.0, desc="PDFを分割中...")
        split_pdf_paths = split_pdf(pdf_path, temp_dir)
        logging.info(f"Split PDF paths: {split_pdf_paths}")
        progress(0.2, desc="OCR処理中...")
        markdown_results = {}
        with ThreadPoolExecutor() as executor:
            # Map each future back to the first page of its chunk so results
            # can be re-ordered after out-of-order completion.
            futures = {
                executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page
                for start_page, path in split_pdf_paths
            }
            for completed_count, future in enumerate(as_completed(futures), start=1):
                start_page = futures[future]
                try:
                    markdown_results[start_page] = future.result()
                except Exception as e:
                    logging.error(f"Error processing split PDF: {e}")
                    markdown_results[start_page] = f"分割PDFの処理中にエラーが発生しました: {e}"
                # Count failed chunks too, so the bar always reaches 80%.
                progress(0.2 + 0.6 * completed_count / len(futures), desc="OCR処理中...")
        logging.info(f"Markdown results length: {len(markdown_results)}")
        progress(0.8, desc="結果を結合中...")
        combined_markdown = "\n\n".join(
            markdown_results[page] for page in sorted(markdown_results)
        )
        progress(1.0, desc="完了")
        # NOTE(review): presumably gives the UI time to show the final
        # progress state before returning — confirm it is still needed.
        time.sleep(0.5)
        return combined_markdown


def process_pdf_from_url(url: str, timeout: float = 60.0):
    """Download the PDF at ``url`` and convert it to Markdown via OCR.

    Args:
        url: URL of the PDF to download.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so an unresponsive host cannot block the caller forever.

    Returns:
        A ``(title, markdown)`` tuple. ``title`` is the last segment of the
        URL path without its extension; ``markdown`` is the OCR result.

    Raises:
        Exception: If the server answers with a status code other than 200.
            Errors raised by ``requests`` (including timeouts) propagate
            unchanged.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlparse

    logging.info(f"Downloading PDF from URL: {url}")
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"PDFのダウンロードに失敗しました。ステータスコード: {response.status_code}")
    # delete=False: the file must outlive this ``with`` so it can be reopened
    # by path below; it is removed explicitly in the ``finally``.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(response.content)
        tmp_name = tmp.name
    try:
        with open(tmp_name, "rb") as pdf_file:
            markdown = process_pdf(pdf_file)
    finally:
        os.remove(tmp_name)
    # Use only the path component so query strings and fragments
    # (e.g. "doc.pdf?v=1.2") do not leak into the title.
    title = os.path.splitext(os.path.basename(urlparse(url).path))[0]
    return title, markdown


# Create the FastAPI application
app = FastAPI()

# CORS support (adjust as needed).
# Credentials stay disabled: the FastAPI CORS documentation states that
# allow_origins/allow_methods/allow_headers must not be ["*"] when
# allow_credentials is True, and this service uses no cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post("/api/ocr")
async def ocr_endpoint(payload: dict):
    """Run OCR on the PDF at the URL given in the POST body.

    Example request body:
    {
        "url": "https://example.com/document.pdf"
    }

    Returns:
        A JSON object with the keys ``title`` and ``markdown``.

    Raises:
        HTTPException: 400 when ``url`` is missing, empty or not a string;
            500 when downloading or converting the PDF fails.
    """
    # Function-scope import; fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    url = payload.get("url")
    if not isinstance(url, str) or not url:
        raise HTTPException(status_code=400, detail="URLパラメータが必要です。")
    try:
        # The download + OCR pipeline is blocking; run it on a worker thread
        # so the event loop keeps serving other requests meanwhile.
        title, markdown = await run_in_threadpool(process_pdf_from_url, url)
    except Exception as e:
        # logging.exception keeps the traceback alongside the message.
        logging.exception(f"Error in /api/ocr: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"title": title, "markdown": markdown}


def create_interface():
    """Build the Gradio UI for converting a PDF given by URL.

    The layout is a URL text box, a convert button, and two output boxes
    (title and Markdown). Clicking the button runs ``process_pdf_from_url``
    on the entered URL.

    Returns:
        The assembled ``gr.Blocks`` object.
    """
    with gr.Blocks() as ui:
        gr.Markdown("# PDF OCR & Markdown変換ツール")
        gr.Markdown("PDFのURLを入力すると、OCR処理を実行し、Markdown形式に変換します。")
        with gr.Row():
            pdf_url_box = gr.Textbox(
                label="PDF URL",
                placeholder="例: https://example.com/document.pdf",
            )
        with gr.Row():
            start_button = gr.Button("変換開始", variant="primary")
        with gr.Row():
            title_box = gr.Textbox(label="タイトル", interactive=False)
            markdown_box = gr.Textbox(
                label="変換結果 (Markdown)",
                lines=10,
                max_lines=20,
            )
        # Wire the button: one input (the URL), two outputs (title, Markdown).
        result_boxes = [title_box, markdown_box]
        start_button.click(
            fn=process_pdf_from_url,
            inputs=pdf_url_box,
            outputs=result_boxes,
        )
    return ui


# Mount the Gradio interface on the FastAPI app at the root path
demo = create_interface()
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    # Only needed when the module is run directly as a script.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)