tomo2chin2 committed on
Commit
aea2a0a
·
verified ·
1 Parent(s): d7da753

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -38
app.py CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
5
  import google.generativeai as genai
6
  import tempfile
7
  import base64
8
- from concurrent.futures import ThreadPoolExecutor, as_completed # as_completed を追加
9
  import logging
10
  import time
11
 
@@ -14,7 +14,7 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
14
 
15
  # 環境変数から設定を読み込む
16
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
17
- MODEL_NAME = os.environ.get("MODEL_NAME", "gemini-1.5-pro") # デフォルトモデルも設定
18
 
19
  if not GOOGLE_API_KEY:
20
  raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
@@ -46,7 +46,7 @@ def split_pdf(pdf_path, output_dir, pages_per_chunk=5):
46
  output_pdf.save(output_path)
47
  output_pdf.close()
48
 
49
- split_pdfs.append((start_page, output_path)) # (開始ページ, パス) のタプルで保存
50
 
51
  pdf_document.close()
52
  return split_pdfs
@@ -60,43 +60,69 @@ def encode_pdf_to_base64(pdf_path):
60
 
61
  def ocr_pdf_with_gemini(pdf_path, model_name):
62
  """GeminiモデルでPDFをOCRしてマークダウンに変換する関数"""
63
- # PDFをbase64エンコード
64
  pdf_base64 = encode_pdf_to_base64(pdf_path)
65
-
66
- # Geminiモデルの設定
67
  model = genai.GenerativeModel(model_name)
68
 
69
- # プロンプトの設定
70
  prompt = """
71
- Extract the text content from this PDF using OCR and output it in a well-structured Markdown format.
72
- Focus solely on the text extraction; do not include any conversational elements, greetings, or additional explanations.
73
- Only provide the extracted text in Markdown.
74
-
75
- Pay attention to the following:
76
- - Use appropriate Markdown heading syntax (#, ##, ###, etc.) for headings.
77
- - Convert bulleted lists correctly.
78
- - Convert tables to Markdown table format.
79
- - Maintain the paragraph structure.
80
- - Clean up any extra line breaks or spaces.
81
- - For images, use the format [Image: Description of the content].
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  """
83
 
84
- # PDFをGeminiに送信
85
  try:
86
  response = model.generate_content(
87
  [
88
  prompt,
89
  {"mime_type": "application/pdf", "data": pdf_base64}
90
  ],
91
- # generation_configで不要な応答を抑制 (モデルによっては効かない場合あり)
92
  generation_config=genai.types.GenerationConfig(
93
- candidate_count=1, # 候補を1つに絞る
94
- max_output_tokens=8192, # 必要に応じてトークン数を調整
95
- # stop_sequences=["."], # 句点で生成を停止 (モデルによっては無効)
96
  )
97
-
98
  )
99
- # 結果を返す
100
  return response.text
101
  except Exception as e:
102
  logging.error(f"Error during Gemini API call: {e}")
@@ -108,18 +134,15 @@ def process_pdf(pdf_file, progress=gr.Progress()):
108
  """PDFファイルを処理するメイン関数"""
109
  logging.info(f"Received file: {pdf_file.name if hasattr(pdf_file, 'name') else pdf_file}")
110
 
111
- # 一時ディレクトリを作成
112
  with tempfile.TemporaryDirectory() as temp_dir:
113
  temp_pdf_path = pdf_file.name
114
  logging.info(f"Temporary PDF path: {temp_pdf_path}")
115
 
116
- # PDFを分割
117
- split_pdf_paths = split_pdf(temp_pdf_path, temp_dir) # [(start_page, path), ...]
118
  logging.info(f"Split PDF paths: {split_pdf_paths}")
119
  progress(0.2, desc="PDFを分割中...")
120
 
121
- # 並列処理でOCR変換
122
- markdown_results = {} # {start_page: markdown_text, ...} 辞書に変更
123
  with ThreadPoolExecutor() as executor:
124
  futures = {executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page for start_page, path in split_pdf_paths}
125
  completed_count = 0
@@ -137,7 +160,6 @@ def process_pdf(pdf_file, progress=gr.Progress()):
137
  logging.info(f"Markdown results length: {len(markdown_results)}")
138
  progress(0.8, desc="結果を結合中...")
139
 
140
- # 結果を結合 (開始ページ番号でソート)
141
  combined_markdown = "\n\n".join(markdown_results[page] for page in sorted(markdown_results.keys()))
142
  progress(1.0, desc="完了")
143
  time.sleep(0.5)
@@ -145,7 +167,6 @@ def process_pdf(pdf_file, progress=gr.Progress()):
145
  return combined_markdown
146
 
147
 
148
- # Gradioインターフェースの作成
149
  def create_interface():
150
  with gr.Blocks() as demo:
151
  gr.Markdown("# PDF OCR & マークダウン変換ツール")
@@ -166,7 +187,6 @@ def create_interface():
166
  copy_btn = gr.Button("クリップボードにコピー")
167
  download_btn = gr.Button("ダウンロード")
168
 
169
- # JavaScriptコードを埋め込むためのHTMLコンポーネント
170
  js_code = gr.HTML(
171
  """
172
  <script>
@@ -178,17 +198,14 @@ def create_interface():
178
  visible=False,
179
  )
180
 
181
- # js_code が読み込まれた後に styleButton 関数を実行
182
  pdf_input.upload(None, [], [], js="styleButton")
183
 
184
-
185
  convert_btn.click(
186
  fn=process_pdf,
187
  inputs=pdf_input,
188
  outputs=markdown_output
189
  )
190
 
191
- # クリップボードにコピー
192
  copy_btn.click(
193
  None,
194
  markdown_output,
@@ -196,7 +213,6 @@ def create_interface():
196
  js=f"(x) => {{ navigator.clipboard.writeText(x); }}",
197
  )
198
 
199
- # ダウンロード
200
  download_btn.click(
201
  None,
202
  markdown_output,
@@ -216,7 +232,6 @@ def create_interface():
216
  return demo
217
 
218
 
219
- # アプリの起動
220
  if __name__ == "__main__":
221
  demo = create_interface()
222
  demo.launch()
 
5
  import google.generativeai as genai
6
  import tempfile
7
  import base64
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
  import logging
10
  import time
11
 
 
14
 
15
  # 環境変数から設定を読み込む
16
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
17
+ MODEL_NAME = os.environ.get("MODEL_NAME", "gemini-1.5-pro")
18
 
19
  if not GOOGLE_API_KEY:
20
  raise ValueError("環境変数 'GOOGLE_API_KEY' が設定されていません。")
 
46
  output_pdf.save(output_path)
47
  output_pdf.close()
48
 
49
+ split_pdfs.append((start_page, output_path))
50
 
51
  pdf_document.close()
52
  return split_pdfs
 
60
 
61
  def ocr_pdf_with_gemini(pdf_path, model_name):
62
  """GeminiモデルでPDFをOCRしてマークダウンに変換する関数"""
 
63
  pdf_base64 = encode_pdf_to_base64(pdf_path)
 
 
64
  model = genai.GenerativeModel(model_name)
65
 
 
66
  prompt = """
67
+ You are an expert document processing assistant. Your task is to extract text from the provided PDF using OCR and convert it into a highly readable and visually appealing Markdown format.
68
+
69
+ **Crucial Instructions:**
70
+
71
+ 1. **Consistency:** Maintain a consistent Markdown style (headings, lists, tables, font sizes, etc.) throughout the entire output, even if the PDF is split into multiple parts. Avoid variations in formatting between different sections.
72
+ 2. **Visual Fidelity:** Strive to reproduce the original visual appearance of the PDF as closely as possible in Markdown. Pay close attention to:
73
+ * **Tables:** Recreate tables with accurate borders, cell alignment, and row/column spans. Use extended Markdown table syntax if needed.
74
+ * **Lists:** Preserve the original list style (numbered, bulleted, nested).
75
+ * **Font Sizes:** Use Markdown heading levels (#, ##, ###) and relative font size adjustments (if possible in your Markdown flavor) to approximate the visual hierarchy of the original text. Larger text should generally correspond to higher-level headings.
76
+ * **Layout:** Try to maintain the spatial relationships between elements (e.g., paragraphs, images). If there are multiple columns, consider representing that structure in Markdown, perhaps using tables.
77
+ 3. **OCR Correction:** The OCR process may introduce errors (typos, misspellings, incorrect characters). Apply your language understanding capabilities to correct these errors and produce grammatically correct and semantically meaningful text. Do *not* blindly output the raw OCR result if it contains obvious mistakes.
78
+ 4. **Content Filtering (Screenshots):** If the PDF primarily contains screenshots (e.g., of software interfaces), focus *exclusively* on extracting text from the *main content area* of the screenshots. *Do not* include text from:
79
+ * Window title bars
80
+ * Operating system toolbars (e.g., Windows taskbar)
81
+ * Menu bars *unless* they are directly related to the primary content (e.g. describing a software's menu options)
82
+ * Any other UI elements that are not part of the core content being displayed.
83
+ 5. **Accuracy and Clarity:** Prioritize providing accurate and clear information to the user. Do not simply reproduce OCR output verbatim if it is nonsensical or misleading. Use your understanding of the content to present information in a user-friendly way.
84
+ 6. **Output:** Only provide the extracted text in Markdown.
85
+
86
+ **Example (Illustrative - Adapt to the specific PDF):**
87
+
88
+ **Input PDF (Screenshot of a webpage):**
89
+
90
+ ```
91
+ [Screenshot of a webpage with a large heading "Welcome", a paragraph of text, a bulleted list, and a table.]
92
+ ```
93
+
94
+ **Desired Markdown Output:**
95
+
96
+ ```markdown
97
+ # Welcome
98
+
99
+ This is a paragraph of introductory text. It explains the purpose of the webpage and provides some context.
100
+
101
+ * This is the first bullet point.
102
+ * This is the second bullet point.
103
+ * This is a nested bullet point.
104
+
105
+ | Feature | Description | Price |
106
+ |--------------|----------------------------|---------|
107
+ | Feature A | Description of Feature A | $10 |
108
+ | Feature B | Description of Feature B | $20 |
109
+
110
+ ```
111
+
112
+ **Do NOT include text like "File Edit View" (from a menu bar) or "[X] Minimize Maximize Close" (from a window title bar).**
113
  """
114
 
 
115
  try:
116
  response = model.generate_content(
117
  [
118
  prompt,
119
  {"mime_type": "application/pdf", "data": pdf_base64}
120
  ],
 
121
  generation_config=genai.types.GenerationConfig(
122
+ candidate_count=1,
123
+ max_output_tokens=8192,
 
124
  )
 
125
  )
 
126
  return response.text
127
  except Exception as e:
128
  logging.error(f"Error during Gemini API call: {e}")
 
134
  """PDFファイルを処理するメイン関数"""
135
  logging.info(f"Received file: {pdf_file.name if hasattr(pdf_file, 'name') else pdf_file}")
136
 
 
137
  with tempfile.TemporaryDirectory() as temp_dir:
138
  temp_pdf_path = pdf_file.name
139
  logging.info(f"Temporary PDF path: {temp_pdf_path}")
140
 
141
+ split_pdf_paths = split_pdf(temp_pdf_path, temp_dir)
 
142
  logging.info(f"Split PDF paths: {split_pdf_paths}")
143
  progress(0.2, desc="PDFを分割中...")
144
 
145
+ markdown_results = {}
 
146
  with ThreadPoolExecutor() as executor:
147
  futures = {executor.submit(ocr_pdf_with_gemini, path, MODEL_NAME): start_page for start_page, path in split_pdf_paths}
148
  completed_count = 0
 
160
  logging.info(f"Markdown results length: {len(markdown_results)}")
161
  progress(0.8, desc="結果を結合中...")
162
 
 
163
  combined_markdown = "\n\n".join(markdown_results[page] for page in sorted(markdown_results.keys()))
164
  progress(1.0, desc="完了")
165
  time.sleep(0.5)
 
167
  return combined_markdown
168
 
169
 
 
170
  def create_interface():
171
  with gr.Blocks() as demo:
172
  gr.Markdown("# PDF OCR & マークダウン変換ツール")
 
187
  copy_btn = gr.Button("クリップボードにコピー")
188
  download_btn = gr.Button("ダウンロード")
189
 
 
190
  js_code = gr.HTML(
191
  """
192
  <script>
 
198
  visible=False,
199
  )
200
 
 
201
  pdf_input.upload(None, [], [], js="styleButton")
202
 
 
203
  convert_btn.click(
204
  fn=process_pdf,
205
  inputs=pdf_input,
206
  outputs=markdown_output
207
  )
208
 
 
209
  copy_btn.click(
210
  None,
211
  markdown_output,
 
213
  js=f"(x) => {{ navigator.clipboard.writeText(x); }}",
214
  )
215
 
 
216
  download_btn.click(
217
  None,
218
  markdown_output,
 
232
  return demo
233
 
234
 
 
235
  if __name__ == "__main__":
236
  demo = create_interface()
237
  demo.launch()