tomo2chin2 committed on
Commit
96b652b
·
verified ·
1 Parent(s): 93159e0

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +135 -123
  2. requirements.txt +2 -5
app.py CHANGED
@@ -1,20 +1,15 @@
1
  import os
2
- import tempfile
3
- import time
4
- import logging
5
- import base64
6
- from typing import Optional
7
-
8
  import fitz # PyMuPDF
 
9
  import google.generativeai as genai
10
- from pydantic import BaseModel, Field, HttpUrl
11
- from fastapi import FastAPI, HTTPException, BackgroundTasks
12
- from fastapi.responses import JSONResponse
13
  from concurrent.futures import ThreadPoolExecutor, as_completed
14
- import httpx # HTTPクライアントライブラリ
15
-
16
 
17
- # ロギング設定 (FastAPI のログレベルに合わせる)
18
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
19
 
20
  # 環境変数から設定を読み込む
@@ -28,7 +23,7 @@ if not MODEL_NAME:
28
 
29
  genai.configure(api_key=GOOGLE_API_KEY)
30
 
31
- # --- PDF 処理関数 (変更なし) ---
32
  def split_pdf(pdf_path, output_dir, pages_per_chunk=5):
33
  """PDFを指定ページ数ごとに分割する関数"""
34
  pdf_document = fitz.open(pdf_path)
@@ -38,12 +33,19 @@ def split_pdf(pdf_path, output_dir, pages_per_chunk=5):
38
 
39
  for start_page in range(0, total_pages, pages_per_chunk):
40
  end_page = min(start_page + pages_per_chunk - 1, total_pages - 1)
 
 
41
  output_pdf = fitz.open()
 
 
42
  for page_num in range(start_page, end_page + 1):
43
  output_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
 
 
44
  output_path = os.path.join(output_dir, f"split_{start_page+1}_to_{end_page+1}.pdf")
45
  output_pdf.save(output_path)
46
  output_pdf.close()
 
47
  split_pdfs.append((start_page, output_path))
48
 
49
  pdf_document.close()
@@ -66,12 +68,48 @@ def ocr_pdf_with_gemini(pdf_path, model_name):
66
 
67
  **Crucial Instructions:**
68
 
69
- 1. **Consistency:** Maintain a consistent Markdown style (headings, lists, tables, etc.)
70
- 2. **Visual Fidelity:** Strive to reproduce the original visual appearance.
71
- 3. **OCR Correction:** Correct OCR errors (typos, misspellings, etc.).
72
- 4. **Content Filtering (Screenshots):** Focus *exclusively* on the *main content area*. *Do not* include text from window title bars, OS toolbars.
73
- 5. **Accuracy and Clarity:** Prioritize providing accurate and clear information.
74
- 6. **Output:** Only provide the extracted text in Markdown. No extra chat, greetings.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  """
76
 
77
  try:
@@ -92,134 +130,108 @@ def ocr_pdf_with_gemini(pdf_path, model_name):
92
 
93
 
94
 
95
- def process_pdf(pdf_path, model_name):
96
- """PDFファイルを処理するメイン関数 (同期処理に変更)"""
 
97
 
98
  with tempfile.TemporaryDirectory() as temp_dir:
99
- # PDFを分割
100
- split_pdf_paths = split_pdf(pdf_path, temp_dir)
 
 
101
  logging.info(f"Split PDF paths: {split_pdf_paths}")
 
102
 
103
- # 並列処理でOCR変換
104
  markdown_results = {}
105
  with ThreadPoolExecutor() as executor:
106
- futures = {executor.submit(ocr_pdf_with_gemini, path, model_name): start_page for start_page, path in split_pdf_paths}
 
107
  for future in as_completed(futures):
108
  start_page = futures[future]
109
  try:
110
  result = future.result()
111
  markdown_results[start_page] = result
 
 
112
  except Exception as e:
113
  logging.error(f"Error processing split PDF: {e}")
114
  markdown_results[start_page] = f"分割PDFの処理中にエラーが発生しました: {e}"
115
 
116
- # 結果を結合 (開始ページ番号でソート)
 
 
117
  combined_markdown = "\n\n".join(markdown_results[page] for page in sorted(markdown_results.keys()))
 
 
 
118
  return combined_markdown
119
 
120
 
 
 
 
 
121
 
122
- # --- FastAPI 定義 ---
123
-
124
- app = FastAPI()
125
-
126
- # リクエストモデル
127
- class PDFRequest(BaseModel):
128
- pdf_url: HttpUrl = Field(..., description="URL of the PDF file to be processed.")
129
- request_id: Optional[str] = Field(None, description="Optional request ID for tracking.") # 追跡用のID
130
-
131
- # レスポンスモデル
132
- class PDFResponse(BaseModel):
133
- request_id: Optional[str] = Field(None, description="Request ID")
134
- title: str = Field(..., description="Title extracted from the PDF (if available).")
135
- markdown_content: str = Field(..., description="The extracted text in Markdown format.")
136
- processing_time: float = Field(..., description="Time taken for processing in seconds.")
137
- status: str = Field("success", description="Status of the request (success or failure).")
138
- error_message: Optional[str] = Field(None, description="Error message if processing failed.")
139
-
140
- async def download_pdf(url: str, temp_dir: str) -> str:
141
- """PDFをURLからダウンロードする関数 (非同期)"""
142
- async with httpx.AsyncClient() as client:
143
- try:
144
- response = await client.get(url, follow_redirects=True, timeout=30) # リダイレクト対応, タイムアウト設定
145
- response.raise_for_status() # エラーチェック
146
-
147
- # 一時ファイルに保存
148
- file_extension = os.path.splitext(url)[1] or ".pdf" # 拡張子取得 or デフォルト.pdf
149
- with tempfile.NamedTemporaryFile(delete=False, dir=temp_dir, suffix=file_extension) as temp_file:
150
- temp_file.write(response.content)
151
- temp_file_path = temp_file.name
152
- logging.info(f"Downloaded PDF to: {temp_file_path}")
153
- return temp_file_path
154
-
155
- except httpx.RequestError as exc:
156
- logging.error(f"Request failed: {exc}")
157
- raise HTTPException(status_code=400, detail=f"Failed to download PDF from URL: {exc}")
158
- except httpx.HTTPStatusError as exc:
159
- logging.error(f"HTTP error: {exc}")
160
- raise HTTPException(status_code=exc.response.status_code, detail=str(exc))
161
- except Exception as e:
162
- logging.error(f"Download failed {e}")
163
- raise HTTPException(status_code=500, detail=str(e))
164
-
165
-
166
- @app.post("/process-pdf/", response_model=PDFResponse)
167
- async def process_pdf_endpoint(request: PDFRequest, background_tasks: BackgroundTasks):
168
- """PDFを処理するエンドポイント"""
169
- start_time = time.time()
170
- request_id = request.request_id or str(time.time()) # リクエストIDがない場合は生成
171
 
172
- with tempfile.TemporaryDirectory() as temp_dir:
173
- try:
174
- # PDFをダウンロード (非同期)
175
- pdf_path = await download_pdf(str(request.pdf_url), temp_dir)
176
 
177
- # PDF処理 (同期処理をバックグラウンドで実行)
178
- markdown_content = await background_tasks.add_task(process_pdf, pdf_path, MODEL_NAME)
 
179
 
180
- # PDFからタイトルを取得 (可能であれば)
181
- try:
182
- with fitz.open(pdf_path) as doc:
183
- title = doc.metadata.get("title", "Untitled")
184
- except Exception:
185
- title = "Untitled"
186
 
 
 
 
 
 
 
 
 
 
 
187
 
188
- end_time = time.time()
189
- processing_time = end_time - start_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
- return PDFResponse(
192
- request_id=request_id,
193
- title=title,
194
- markdown_content=markdown_content,
195
- processing_time=processing_time
196
- )
197
 
198
- except HTTPException as e: # HTTPレベルエラー
199
- logging.exception("HTTPException during processing:")
200
- end_time = time.time()
201
- return JSONResponse(
202
- status_code=e.status_code,
203
- content=PDFResponse(
204
- request_id = request_id,
205
- title="Error",
206
- markdown_content="",
207
- processing_time=end_time - start_time,
208
- status="failure",
209
- error_message=e.detail
210
- ).dict()
211
- )
212
- except Exception as e: # その他のエラー
213
- logging.exception("Exception during processing:")
214
- end_time = time.time()
215
- return JSONResponse(
216
- status_code=500,
217
- content=PDFResponse(
218
- request_id=request_id,
219
- title="Error",
220
- markdown_content="",
221
- processing_time=end_time - start_time,
222
- status="failure",
223
- error_message=str(e)
224
- ).dict()
225
- )
 
1
  import os
2
+ import gradio as gr
 
 
 
 
 
3
  import fitz # PyMuPDF
4
+ from pathlib import Path
5
  import google.generativeai as genai
6
+ import tempfile
7
+ import base64
 
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ import logging
10
+ import time
11
 
12
+ # ロギング設定
13
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
 
15
  # 環境変数から設定を読み込む
 
23
 
24
  genai.configure(api_key=GOOGLE_API_KEY)
25
 
26
+
27
  def split_pdf(pdf_path, output_dir, pages_per_chunk=5):
28
  """PDFを指定ページ数ごとに分割する関数"""
29
  pdf_document = fitz.open(pdf_path)
 
33
 
34
  for start_page in range(0, total_pages, pages_per_chunk):
35
  end_page = min(start_page + pages_per_chunk - 1, total_pages - 1)
36
+
37
+ # 新しいPDFドキュメントを作成
38
  output_pdf = fitz.open()
39
+
40
+ # 指定範囲のページを新しいPDFに追加
41
  for page_num in range(start_page, end_page + 1):
42
  output_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
43
+
44
+ # 分割したPDFを保存
45
  output_path = os.path.join(output_dir, f"split_{start_page+1}_to_{end_page+1}.pdf")
46
  output_pdf.save(output_path)
47
  output_pdf.close()
48
+
49
  split_pdfs.append((start_page, output_path))
50
 
51
  pdf_document.close()
 
68
 
69
  **Crucial Instructions:**
70
 
71
+ 1. **Consistency:** Maintain a consistent Markdown style (headings, lists, tables, font sizes, etc.) throughout the entire output, even if the PDF is split into multiple parts. Avoid variations in formatting between different sections.
72
+ 2. **Visual Fidelity:** Strive to reproduce the original visual appearance of the PDF as closely as possible in Markdown. Pay close attention to:
73
+ * **Tables:** Recreate tables with accurate borders, cell alignment, and row/column spans. Use extended Markdown table syntax if needed.
74
+ * **Lists:** Preserve the original list style (numbered, bulleted, nested).
75
+ * **Font Sizes:** Use Markdown heading levels (#, ##, ###) and relative font size adjustments (if possible in your Markdown flavor) to approximate the visual hierarchy of the original text. Larger text should generally correspond to higher-level headings.
76
+ * **Layout:** Try to maintain the spatial relationships between elements (e.g., paragraphs, images). If there are multiple columns, consider representing that structure in Markdown, perhaps using tables.
77
+ 3. **OCR Correction:** The OCR process may introduce errors (typos, misspellings, incorrect characters). Apply your language understanding capabilities to correct these errors and produce grammatically correct and semantically meaningful text. Do *not* blindly output the raw OCR result if it contains obvious mistakes.
78
+ 4. **Content Filtering (Screenshots):** If the PDF primarily contains screenshots (e.g., of software interfaces), focus *exclusively* on extracting text from the *main content area* of the screenshots. *Do not* include text from:
79
+ * Window title bars
80
+ * Operating system toolbars (e.g., Windows taskbar)
81
+ * Menu bars *unless* they are directly related to the primary content (e.g. describing a software's menu options)
82
+ * Any other UI elements that are not part of the core content being displayed.
83
+ 5. **Accuracy and Clarity:** Prioritize providing accurate and clear information to the user. Do not simply reproduce OCR output verbatim if it is nonsensical or misleading. Use your understanding of the content to present information in a user-friendly way.
84
+ 6. **Output:** Only provide the extracted text in Markdown.
85
+
86
+ **Example (Illustrative - Adapt to the specific PDF):**
87
+
88
+ **Input PDF (Screenshot of a webpage):**
89
+
90
+ ```
91
+ [Screenshot of a webpage with a large heading "Welcome", a paragraph of text, a bulleted list, and a table.]
92
+ ```
93
+
94
+ **Desired Markdown Output:**
95
+
96
+ ```markdown
97
+ # Welcome
98
+
99
+ This is a paragraph of introductory text. It explains the purpose of the webpage and provides some context.
100
+
101
+ * This is the first bullet point.
102
+ * This is the second bullet point.
103
+ * This is a nested bullet point.
104
+
105
+ | Feature | Description | Price |
106
+ |--------------|----------------------------|---------|
107
+ | Feature A | Description of Feature A | $10 |
108
+ | Feature B | Description of Feature B | $20 |
109
+
110
+ ```
111
+
112
+ **Do NOT include text like "File Edit View" (from a menu bar) or "[X] Minimize Maximize Close" (from a window title bar).**
113
  """
114
 
115
  try:
 
130
 
131
 
132
 
133
def process_pdf(pdf_file, progress=gr.Progress()):
    """Convert an uploaded PDF to Markdown by OCR-ing it chunk by chunk.

    The PDF is split into chunk files by ``split_pdf`` inside a temporary
    directory, every chunk is submitted to ``ocr_pdf_with_gemini`` on a
    thread pool, and the per-chunk results are joined in ascending
    start-page order.

    Args:
        pdf_file: The value delivered by the file input. Either an object
            exposing the on-disk path as ``.name`` or a plain path string
            is accepted.
        progress: Gradio progress tracker, called with a fraction between
            0 and 1 plus a description as the work advances.

    Returns:
        The Markdown of all chunks separated by blank lines. A chunk whose
        OCR call raised contributes an error message instead of its text.

    Raises:
        gr.Error: If no file was provided.
    """
    if pdf_file is None:
        # Without this guard the ``.name`` lookup below fails with an opaque
        # AttributeError when the button is clicked before a file is chosen.
        raise gr.Error("PDFファイルをアップロードしてください。")

    # Resolve the path once so that logging and processing agree on it and
    # plain path strings work as well as objects carrying ``.name``.
    temp_pdf_path = getattr(pdf_file, "name", pdf_file)
    logging.info(f"Received file: {temp_pdf_path}")

    with tempfile.TemporaryDirectory() as temp_dir:
        logging.info(f"Temporary PDF path: {temp_pdf_path}")

        split_pdf_paths = split_pdf(temp_pdf_path, temp_dir)
        logging.info(f"Split PDF paths: {split_pdf_paths}")
        progress(0.2, desc="PDFを分割中...")

        # Keyed by each chunk's start page so the output order does not
        # depend on which OCR call happens to finish first.
        markdown_results = {}
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page
                for start_page, path in split_pdf_paths
            }
            total_chunks = len(futures)
            for finished, future in enumerate(as_completed(futures), start=1):
                start_page = futures[future]
                try:
                    markdown_results[start_page] = future.result()
                except Exception as e:
                    # Best effort: one failed chunk must not discard the
                    # text already extracted from the others.
                    logging.error(f"Error processing split PDF: {e}")
                    markdown_results[start_page] = f"分割PDFの処理中にエラーが発生しました: {e}"
                # Advance for failed chunks too, and outside the try so a
                # progress error is never mistaken for an OCR failure.
                progress(0.2 + 0.6 * finished / total_chunks, desc="OCR処理中...")

        logging.info(f"Markdown results length: {len(markdown_results)}")
        progress(0.8, desc="結果を結合中...")

        combined_markdown = "\n\n".join(
            markdown_results[page] for page in sorted(markdown_results)
        )
        progress(1.0, desc="完了")
        # NOTE(review): presumably a short pause so the final progress state
        # is visible before the result replaces it; confirm it is needed.
        time.sleep(0.5)

        return combined_markdown
168
 
169
 
170
def create_interface():
    """Build the Gradio Blocks UI for the PDF-to-Markdown converter.

    Layout: a title and description, a PDF file input, a convert button, a
    textbox showing the result, and copy / download buttons. The convert
    button runs ``process_pdf``; the other handlers run only in the browser.

    Returns:
        The assembled ``gr.Blocks`` app, not yet launched.
    """
    # Each client-side handler is a self-contained arrow function passed via
    # ``js``. The upload handler previously referenced a ``styleButton``
    # function declared in a <script> inside a hidden HTML component; that
    # script is not executed by the page, so the name was undefined.
    highlight_convert_js = """() => {
        const btn = document.getElementById('convert-button');
        if (btn) { btn.style.backgroundColor = 'orange'; }
    }"""

    copy_js = "(x) => { navigator.clipboard.writeText(x); }"

    # Offers the textbox content as a file via a temporary object URL that
    # is revoked right after the synthetic click.
    download_js = """(x) => {
        const blob = new Blob([x], {type: 'text/markdown;charset=utf-8'});
        const url = URL.createObjectURL(blob);
        const a = document.createElement('a');
        a.href = url;
        a.download = 'converted.md';
        document.body.appendChild(a);
        a.click();
        document.body.removeChild(a);
        URL.revokeObjectURL(url);
    }"""

    with gr.Blocks() as demo:
        gr.Markdown("# PDF OCR & マークダウン変換ツール")
        gr.Markdown("PDFをアップロードすると、OCRでテキストを抽出しマークダウン形式に変換します。")

        with gr.Row():
            pdf_input = gr.File(label="PDFファイルをアップロード", file_types=[".pdf"])

        with gr.Row():
            # elem_id lets the upload handler find this button in the DOM.
            convert_btn = gr.Button("変換開始", variant="primary",
                                    elem_id="convert-button")

        with gr.Row():
            markdown_output = gr.Textbox(label="変換結果", lines=10,
                                         max_lines=20)

        with gr.Row():
            copy_btn = gr.Button("クリップボードにコピー")
            download_btn = gr.Button("ダウンロード")

        # Highlight the convert button once a file has been uploaded.
        pdf_input.upload(None, [], [], js=highlight_convert_js)

        convert_btn.click(
            fn=process_pdf,
            inputs=pdf_input,
            outputs=markdown_output
        )

        copy_btn.click(
            None,
            markdown_output,
            [],
            js=copy_js,
        )

        download_btn.click(
            None,
            markdown_output,
            [],
            js=download_js,
        )
    return demo
233
 
 
 
 
 
 
 
234
 
235
if __name__ == "__main__":
    # Script entry point: assemble the UI, then start the Gradio server.
    demo = create_interface()
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,6 +1,3 @@
1
- fastapi
2
- uvicorn
3
- httpx
4
  PyMuPDF
5
- google-generativeai
6
- pydantic
 
1
+ gradio==4.44.1
 
 
2
  PyMuPDF
3
+ google-generativeai