tomo2chin2 committed on
Commit
1061e53
·
verified ·
1 Parent(s): 75e7f23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -132
app.py CHANGED
@@ -1,26 +1,26 @@
1
  import os
2
- import gradio as gr
3
- import fitz # PyMuPDF
4
- from pathlib import Path
5
- import google.generativeai as genai
6
  import tempfile
 
7
  import base64
 
 
 
 
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
- import logging
10
- import time
11
 
12
  # ロギング設定
13
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
 
15
- # 環境変数から設定を読み込む
16
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
17
  MODEL_NAME = os.environ.get("MODEL_NAME", "gemini-1.5-pro")
18
-
19
  if not GOOGLE_API_KEY:
20
  raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
21
  if not MODEL_NAME:
22
  raise ValueError("環境変数 'MODEL_NAME' が設定されていません。")
23
-
24
  genai.configure(api_key=GOOGLE_API_KEY)
25
 
26
 
@@ -28,26 +28,16 @@ def split_pdf(pdf_path, output_dir, pages_per_chunk=5):
28
  """PDFを指定ページ数ごとに分割する関数"""
29
  pdf_document = fitz.open(pdf_path)
30
  total_pages = len(pdf_document)
31
-
32
  split_pdfs = []
33
-
34
  for start_page in range(0, total_pages, pages_per_chunk):
35
  end_page = min(start_page + pages_per_chunk - 1, total_pages - 1)
36
-
37
- # 新しいPDFドキュメントを作成
38
  output_pdf = fitz.open()
39
-
40
- # 指定範囲のページを新しいPDFに追加
41
  for page_num in range(start_page, end_page + 1):
42
  output_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
43
-
44
- # 分割したPDFを保存
45
  output_path = os.path.join(output_dir, f"split_{start_page+1}_to_{end_page+1}.pdf")
46
  output_pdf.save(output_path)
47
  output_pdf.close()
48
-
49
  split_pdfs.append((start_page, output_path))
50
-
51
  pdf_document.close()
52
  return split_pdfs
53
 
@@ -59,59 +49,19 @@ def encode_pdf_to_base64(pdf_path):
59
 
60
 
61
  def ocr_pdf_with_gemini(pdf_path, model_name):
62
- """GeminiモデルでPDFをOCRしてマークダウンに変換する関数"""
63
  pdf_base64 = encode_pdf_to_base64(pdf_path)
64
  model = genai.GenerativeModel(model_name)
65
-
66
  prompt = """
67
  You are an expert document processing assistant. Your task is to extract text from the provided PDF using OCR and convert it into a highly readable and visually appealing Markdown format.
68
 
69
  **Crucial Instructions:**
70
-
71
- 1. **Consistency:** Maintain a consistent Markdown style (headings, lists, tables, font sizes, etc.) throughout the entire output, even if the PDF is split into multiple parts. Avoid variations in formatting between different sections.
72
- 2. **Visual Fidelity:** Strive to reproduce the original visual appearance of the PDF as closely as possible in Markdown. Pay close attention to:
73
- * **Tables:** Recreate tables with accurate borders, cell alignment, and row/column spans. Use extended Markdown table syntax if needed.
74
- * **Lists:** Preserve the original list style (numbered, bulleted, nested).
75
- * **Font Sizes:** Use Markdown heading levels (#, ##, ###) and relative font size adjustments (if possible in your Markdown flavor) to approximate the visual hierarchy of the original text. Larger text should generally correspond to higher-level headings.
76
- * **Layout:** Try to maintain the spatial relationships between elements (e.g., paragraphs, images). If there are multiple columns, consider representing that structure in Markdown, perhaps using tables.
77
- 3. **OCR Correction:** The OCR process may introduce errors (typos, misspellings, incorrect characters). Apply your language understanding capabilities to correct these errors and produce grammatically correct and semantically meaningful text. Do *not* blindly output the raw OCR result if it contains obvious mistakes.
78
- 4. **Content Filtering (Screenshots):** If the PDF primarily contains screenshots (e.g., of software interfaces), focus *exclusively* on extracting text from the *main content area* of the screenshots. *Do not* include text from:
79
- * Window title bars
80
- * Operating system toolbars (e.g., Windows taskbar)
81
- * Menu bars *unless* they are directly related to the primary content (e.g. describing a software's menu options)
82
- * Any other UI elements that are not part of the core content being displayed.
83
- 5. **Accuracy and Clarity:** Prioritize providing accurate and clear information to the user. Do not simply reproduce OCR output verbatim if it is nonsensical or misleading. Use your understanding of the content to present information in a user-friendly way.
84
- 6. **Output:** Only provide the extracted text in Markdown.
85
-
86
- **Example (Illustrative - Adapt to the specific PDF):**
87
-
88
- **Input PDF (Screenshot of a webpage):**
89
-
90
- ```
91
- [Screenshot of a webpage with a large heading "Welcome", a paragraph of text, a bulleted list, and a table.]
92
- ```
93
-
94
- **Desired Markdown Output:**
95
-
96
- ```markdown
97
- # Welcome
98
-
99
- This is a paragraph of introductory text. It explains the purpose of the webpage and provides some context.
100
-
101
- * This is the first bullet point.
102
- * This is the second bullet point.
103
- * This is a nested bullet point.
104
-
105
- | Feature | Description | Price |
106
- |--------------|----------------------------|---------|
107
- | Feature A | Description of Feature A | $10 |
108
- | Feature B | Description of Feature B | $20 |
109
-
110
- ```
111
-
112
- **Do NOT include text like "File Edit View" (from a menu bar) or "[X] Minimize Maximize Close" (from a window title bar).**
113
  """
114
-
115
  try:
116
  response = model.generate_content(
117
  [
@@ -129,19 +79,15 @@ def ocr_pdf_with_gemini(pdf_path, model_name):
129
  return f"エラーが発生しました: {e}"
130
 
131
 
132
-
133
  def process_pdf(pdf_file, progress=gr.Progress()):
134
  """PDFファイルを処理するメイン関数"""
135
  logging.info(f"Received file: {pdf_file.name if hasattr(pdf_file, 'name') else pdf_file}")
136
-
137
  with tempfile.TemporaryDirectory() as temp_dir:
138
  temp_pdf_path = pdf_file.name
139
  logging.info(f"Temporary PDF path: {temp_pdf_path}")
140
-
141
  split_pdf_paths = split_pdf(temp_pdf_path, temp_dir)
142
  logging.info(f"Split PDF paths: {split_pdf_paths}")
143
  progress(0.2, desc="PDFを分割中...")
144
-
145
  markdown_results = {}
146
  with ThreadPoolExecutor() as executor:
147
  futures = {executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page for start_page, path in split_pdf_paths}
@@ -156,82 +102,86 @@ def process_pdf(pdf_file, progress=gr.Progress()):
156
  except Exception as e:
157
  logging.error(f"Error processing split PDF: {e}")
158
  markdown_results[start_page] = f"分割PDFの処理中にエラーが発生しました: {e}"
159
-
160
  logging.info(f"Markdown results length: {len(markdown_results)}")
161
  progress(0.8, desc="結果を結合中...")
162
-
163
  combined_markdown = "\n\n".join(markdown_results[page] for page in sorted(markdown_results.keys()))
164
  progress(1.0, desc="完了")
165
  time.sleep(0.5)
166
-
167
  return combined_markdown
168
 
169
 
170
- def create_interface():
171
- with gr.Blocks() as demo:
172
- gr.Markdown("# PDF OCR & マークダウン変換ツール")
173
- gr.Markdown("PDFをアップロードすると、OCRでテキストを抽出しマークダウン形式に変換します。")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
- with gr.Row():
176
- pdf_input = gr.File(label="PDFファイルをアップロード", file_types=[".pdf"])
177
 
 
 
 
 
 
178
  with gr.Row():
179
- convert_btn = gr.Button("変換開始", variant="primary",
180
- elem_id="convert-button")
181
-
182
  with gr.Row():
183
- markdown_output = gr.Textbox(label="変換結果", lines=10,
184
- max_lines=20)
185
-
186
  with gr.Row():
187
- copy_btn = gr.Button("クリップボードにコピー")
188
- download_btn = gr.Button("ダウンロード")
189
-
190
- js_code = gr.HTML(
191
- """
192
- <script>
193
- function styleButton() {
194
- document.getElementById('convert-button').style.backgroundColor = 'orange';
195
- }
196
- </script>
197
- """,
198
- visible=False,
199
- )
200
-
201
- pdf_input.upload(None, [], [], js="styleButton")
202
-
203
- convert_btn.click(
204
- fn=process_pdf,
205
- inputs=pdf_input,
206
- outputs=markdown_output
207
- )
208
-
209
- copy_btn.click(
210
- None,
211
- markdown_output,
212
- [],
213
- js=f"(x) => {{ navigator.clipboard.writeText(x); }}",
214
- )
215
-
216
- download_btn.click(
217
- None,
218
- markdown_output,
219
- [],
220
- js=f"""(x) =>{{
221
- const blob = new Blob([x], {{type: 'text/markdown;charset=utf-8'}});
222
- const url = URL.createObjectURL(blob);
223
- const a = document.createElement('a');
224
- a.href = url;
225
- a.download = 'converted.md';
226
- document.body.appendChild(a);
227
- a.click();
228
- document.body.removeChild(a);
229
- URL.revokeObjectURL(url);
230
- }}"""
231
- )
232
  return demo
233
 
234
 
 
 
 
 
235
  if __name__ == "__main__":
236
- demo = create_interface()
237
- demo.launch()
 
1
  import os
2
+ import logging
 
 
 
3
  import tempfile
4
+ import time
5
  import base64
6
+ import requests
7
+ import fitz # PyMuPDF
8
+ import gradio as gr
9
+ import google.generativeai as genai
10
  from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from fastapi import FastAPI, HTTPException
12
+ from fastapi.middleware.cors import CORSMiddleware
13
 
14
# Logging configuration: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Load settings from environment variables. MODEL_NAME falls back to a default
# when the variable is unset; GOOGLE_API_KEY has no fallback.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME", "gemini-1.5-pro")

# Fail at import time when either setting is missing or empty (an empty
# MODEL_NAME can still occur if the variable is set to "").
if not GOOGLE_API_KEY:
    raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
if not MODEL_NAME:
    raise ValueError("環境変数 'MODEL_NAME' が設定されていません。")

# Register the API key with the Gemini client library.
genai.configure(api_key=GOOGLE_API_KEY)
25
 
26
 
 
28
  """PDFを指定ページ数ごとに分割する関数"""
29
  pdf_document = fitz.open(pdf_path)
30
  total_pages = len(pdf_document)
 
31
  split_pdfs = []
 
32
  for start_page in range(0, total_pages, pages_per_chunk):
33
  end_page = min(start_page + pages_per_chunk - 1, total_pages - 1)
 
 
34
  output_pdf = fitz.open()
 
 
35
  for page_num in range(start_page, end_page + 1):
36
  output_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
 
 
37
  output_path = os.path.join(output_dir, f"split_{start_page+1}_to_{end_page+1}.pdf")
38
  output_pdf.save(output_path)
39
  output_pdf.close()
 
40
  split_pdfs.append((start_page, output_path))
 
41
  pdf_document.close()
42
  return split_pdfs
43
 
 
49
 
50
 
51
  def ocr_pdf_with_gemini(pdf_path, model_name):
52
+ """GeminiモデルでPDFをOCRしてMarkdownに変換する関数"""
53
  pdf_base64 = encode_pdf_to_base64(pdf_path)
54
  model = genai.GenerativeModel(model_name)
 
55
  prompt = """
56
  You are an expert document processing assistant. Your task is to extract text from the provided PDF using OCR and convert it into a highly readable and visually appealing Markdown format.
57
 
58
  **Crucial Instructions:**
59
+ 1. Maintain consistent Markdown styling.
60
+ 2. Reproduce the visual appearance (tables, lists, headings) as faithfully as possible.
61
+ 3. Correct OCR-induced errors.
62
+ 4. If the PDF mainly contains screenshots, focus on the main content area.
63
+ 5. Only output the extracted text in Markdown.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  """
 
65
  try:
66
  response = model.generate_content(
67
  [
 
79
  return f"エラーが発生しました: {e}"
80
 
81
 
 
82
  def process_pdf(pdf_file, progress=gr.Progress()):
83
  """PDFファイルを処理するメイン関数"""
84
  logging.info(f"Received file: {pdf_file.name if hasattr(pdf_file, 'name') else pdf_file}")
 
85
  with tempfile.TemporaryDirectory() as temp_dir:
86
  temp_pdf_path = pdf_file.name
87
  logging.info(f"Temporary PDF path: {temp_pdf_path}")
 
88
  split_pdf_paths = split_pdf(temp_pdf_path, temp_dir)
89
  logging.info(f"Split PDF paths: {split_pdf_paths}")
90
  progress(0.2, desc="PDFを分割中...")
 
91
  markdown_results = {}
92
  with ThreadPoolExecutor() as executor:
93
  futures = {executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page for start_page, path in split_pdf_paths}
 
102
  except Exception as e:
103
  logging.error(f"Error processing split PDF: {e}")
104
  markdown_results[start_page] = f"分割PDFの処理中にエラーが発生しました: {e}"
 
105
  logging.info(f"Markdown results length: {len(markdown_results)}")
106
  progress(0.8, desc="結果を結合中...")
 
107
  combined_markdown = "\n\n".join(markdown_results[page] for page in sorted(markdown_results.keys()))
108
  progress(1.0, desc="完了")
109
  time.sleep(0.5)
 
110
  return combined_markdown
111
 
112
 
113
def process_pdf_from_url(url: str):
    """Download the PDF at *url*, run OCR on it and return ``(title, markdown)``.

    The response body is written to a temporary ``.pdf`` file which is passed
    to ``process_pdf``; the temporary file is removed afterwards whether or
    not the conversion succeeded.

    Args:
        url: Address of the PDF to download.

    Returns:
        A ``(title, markdown)`` tuple. ``title`` is the last segment of the
        URL path without its extension (query string and fragment are
        ignored, percent-escapes are decoded); ``markdown`` is the value
        returned by ``process_pdf``.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.RequestException: If the request itself fails, including
            when the timeout below expires.
    """
    from urllib.parse import unquote, urlparse

    logging.info(f"Downloading PDF from URL: {url}")
    # NOTE(review): the URL is caller-supplied and fetched from the server
    # side; consider restricting allowed hosts if this runs on a network with
    # internal services.
    # Without a timeout a stalled server would block this worker forever.
    # (connect timeout, read timeout) in seconds.
    response = requests.get(url, timeout=(10, 60))
    if response.status_code != 200:
        raise Exception(f"PDFのダウンロードに失敗しました。ステータスコード: {response.status_code}")

    # delete=False: the file must outlive this ``with`` so that it can be
    # reopened by path below; it is removed explicitly in ``finally``.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(response.content)
        tmp.flush()
        tmp_name = tmp.name
    try:
        # process_pdf reads the ``.name`` attribute of the object it is given,
        # so hand it an open file whose name is the temporary path.
        with open(tmp_name, "rb") as pdf_file:
            markdown = process_pdf(pdf_file)
    finally:
        os.remove(tmp_name)

    # Derive the title from the URL *path* only, so that "?query" or
    # "#fragment" suffixes do not leak into it.
    url_path = unquote(urlparse(url).path)
    title = os.path.splitext(os.path.basename(url_path))[0]
    return title, markdown
130
+
131
+
132
# Create the FastAPI application.
app = FastAPI()

# CORS: the API is open to every origin. Credentials (cookies, Authorization
# headers) are deliberately NOT allowed: a wildcard origin must not be combined
# with allow_credentials=True, and the OCR endpoint does not use either.
# If credentialed requests are ever needed, list the allowed origins
# explicitly instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
143
+
144
@app.post("/api/ocr")
async def ocr_endpoint(payload: dict):
    """Run OCR on the PDF at the URL given in the POST body.

    Returns the document title and the Markdown conversion result.

    Request example:
    {
        "url": "https://example.com/document.pdf"
    }

    Raises:
        HTTPException: 400 when ``url`` is missing or empty; 500 when the
            download or the conversion raises.
    """
    from fastapi.concurrency import run_in_threadpool

    url = payload.get("url")
    if not url:
        raise HTTPException(status_code=400, detail="URLパラメータが必要です。")
    try:
        # The pipeline does blocking network I/O and OCR calls. Calling it
        # directly inside this coroutine would stall the event loop for every
        # other request, so run it in the worker threadpool instead.
        title, markdown = await run_in_threadpool(process_pdf_from_url, url)
    except Exception as e:
        logging.error(f"Error in /api/ocr: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"title": title, "markdown": markdown}
163
 
 
 
164
 
165
def create_interface():
    """Build the Gradio interface (PDF processing driven by a URL input).

    Returns:
        The ``gr.Blocks`` object holding a URL textbox, a convert button and
        two output textboxes (title and Markdown result). Clicking the button
        calls ``process_pdf_from_url`` with the URL and fills both outputs.
    """
    with gr.Blocks() as ui:
        # Page heading and short usage hint.
        gr.Markdown("# PDF OCR & Markdown変換ツール")
        gr.Markdown("PDFのURLを入力すると、OCR処理を実行し、Markdown形式に変換します。")

        with gr.Row():
            pdf_url_box = gr.Textbox(
                label="PDF URL",
                placeholder="例: https://example.com/document.pdf",
            )

        with gr.Row():
            run_button = gr.Button("変換開始", variant="primary")

        with gr.Row():
            title_box = gr.Textbox(label="タイトル", interactive=False)
            markdown_box = gr.Textbox(
                label="変換結果 (Markdown)",
                lines=10,
                max_lines=20,
            )

        # process_pdf_from_url returns (title, markdown), matching the order
        # of the output components.
        result_boxes = [title_box, markdown_box]
        run_button.click(
            fn=process_pdf_from_url,
            inputs=pdf_url_box,
            outputs=result_boxes,
        )
    return ui
179
 
180
 
181
# Mount the Gradio interface on the FastAPI app at the root path, so a single
# ASGI application serves both the UI and the /api/ocr endpoint.
demo = create_interface()
app = gr.mount_gradio_app(app, demo, path="/")


if __name__ == "__main__":
    # Imported here because uvicorn is only needed when run as a script.
    import uvicorn

    # Listen on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)