Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,6 +2,7 @@ from dotenv import load_dotenv
|
|
| 2 |
import os
|
| 3 |
import gradio as gr
|
| 4 |
from PyPDF2 import PdfReader
|
|
|
|
| 5 |
from langchain.text_splitter import CharacterTextSplitter
|
| 6 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
|
| 7 |
from langchain_community.vectorstores import FAISS
|
|
@@ -75,6 +76,43 @@ class PDFChatBot:
|
|
| 75 |
|
| 76 |
return raw_text, processed_count
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
def get_text_chunks(self, text):
|
| 79 |
"""將文字分割成區塊進行處理"""
|
| 80 |
text_splitter = CharacterTextSplitter(
|
|
@@ -193,7 +231,7 @@ class PDFChatBot:
|
|
| 193 |
except Exception as e:
|
| 194 |
return f"處理問題時發生錯誤:{str(e)}"
|
| 195 |
|
| 196 |
-
def process_pdfs(self, pdf_files, progress=gr.Progress()):
|
| 197 |
"""處理PDF文件"""
|
| 198 |
if not pdf_files:
|
| 199 |
return "請上傳至少一個PDF文件。", ""
|
|
@@ -203,7 +241,10 @@ class PDFChatBot:
|
|
| 203 |
|
| 204 |
# 提取文字
|
| 205 |
progress(0.2, desc="提取PDF文字內容...")
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
if not raw_text.strip():
|
| 209 |
return "無法從PDF文件中提取到文字。", ""
|
|
@@ -300,8 +341,8 @@ class PDFChatBot:
|
|
| 300 |
bot = PDFChatBot()
|
| 301 |
|
| 302 |
# Gradio 接口函數
|
| 303 |
-
def upload_and_process(files, progress=gr.Progress()):
    """Gradio callback that forwards uploaded PDF files to the shared bot.

    Args:
        files: The value Gradio passes from the file-upload component.
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            kept in the signature exactly as in the original handler.

    Returns:
        Whatever ``bot.process_pdfs`` returns, unchanged.
    """
    outcome = bot.process_pdfs(files, progress=progress)
    return outcome
|
| 305 |
|
| 306 |
def ask_question(question, history, temperature, max_tokens, search_k):
|
| 307 |
if not question.strip():
|
|
@@ -350,6 +391,20 @@ def load_existing_data():
|
|
| 350 |
else:
|
| 351 |
return "❌ 沒有找到已處理的資料。", ""
|
| 352 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
# 創建自定義主題
|
| 354 |
custom_theme = gr.themes.Soft(
|
| 355 |
primary_hue="blue",
|
|
@@ -407,12 +462,19 @@ with gr.Blocks(
|
|
| 407 |
# 文件上傳區域
|
| 408 |
with gr.Group():
|
| 409 |
gr.Markdown("### 📤 上傳PDF文件")
|
| 410 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
file_count="multiple",
|
| 412 |
file_types=[".pdf"],
|
| 413 |
label="選擇PDF文件",
|
| 414 |
height=150
|
| 415 |
)
|
|
|
|
| 416 |
|
| 417 |
# 處理選項
|
| 418 |
with gr.Row():
|
|
@@ -565,11 +627,17 @@ with gr.Blocks(
|
|
| 565 |
# 事件處理
|
| 566 |
process_btn.click(
|
| 567 |
fn=upload_and_process,
|
| 568 |
-
inputs=[file_upload],
|
| 569 |
outputs=[status_text, file_list],
|
| 570 |
show_progress=True
|
| 571 |
)
|
| 572 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 573 |
load_btn.click(
|
| 574 |
fn=load_existing_data,
|
| 575 |
outputs=[status_text, file_list]
|
|
|
|
| 2 |
import os
|
| 3 |
import gradio as gr
|
| 4 |
from PyPDF2 import PdfReader
|
| 5 |
+
import google.generativeai as genai
|
| 6 |
from langchain.text_splitter import CharacterTextSplitter
|
| 7 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
|
| 8 |
from langchain_community.vectorstores import FAISS
|
|
|
|
| 76 |
|
| 77 |
return raw_text, processed_count
|
| 78 |
|
| 79 |
+
def get_pdf_text_via_gemini(self, pdf_files, model_name="gemini-2.0-flash-exp"):
    """Extract plain text from PDFs by sending them to a Gemini model.

    Each PDF is uploaded through ``genai.upload_file``, the model is asked to
    emit the readable text in page order, and the uploaded copy is deleted
    again once the request has finished (successfully or not).

    Args:
        pdf_files: A single file or a list/tuple of files. Each item is either
            an object exposing a ``name`` attribute (used as the path) or the
            path itself.
        model_name: Gemini model used for extraction. Defaults to the model
            that was previously hard-coded.

    Returns:
        A ``(raw_text, processed_count)`` tuple: the concatenated text of all
        PDFs that yielded non-blank output (each followed by a newline) and
        the number of such PDFs. ``("", 0)`` is returned when there are no
        files or no API key is available.

    Side effects:
        Appends the base name of every successfully processed file to
        ``self.processed_files`` and prints a message for each failure.
    """
    # Nothing to do: return before touching the API key or the client.
    if not pdf_files:
        return "", 0

    api_key = _get_api_key()
    if not api_key:
        return "", 0

    # Accept a lone file as well as any list/tuple of files.
    if not isinstance(pdf_files, (list, tuple)):
        pdf_files = [pdf_files]

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name)
    # The prompt never changes, so build it once outside the loop.
    prompt = "請從此 PDF 中提取可讀文字,按頁面順序輸出純文字。"

    pieces = []
    processed_count = 0

    for pdf_file in pdf_files:
        uploaded = None
        try:
            pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
            uploaded = genai.upload_file(pdf_path)
            resp = model.generate_content([uploaded, prompt])
            # NOTE(review): reading ``.text`` is expected to raise when the
            # response carries no usable candidate; the handler below treats
            # that like any other per-file failure.
            text = resp.text or ""
            if text.strip():
                pieces.append(text + "\n")
                processed_count += 1
                self.processed_files.append(os.path.basename(pdf_path))
        except Exception as e:
            # Best effort per file: report the failure and move on.
            print(f"使用Gemini解析PDF時發生錯誤:{str(e)}")
        finally:
            # Remove the remote copy so uploads do not accumulate.
            if uploaded is not None:
                try:
                    genai.delete_file(uploaded.name)
                except Exception as cleanup_err:
                    print(f"清除 Gemini 上傳檔案時發生錯誤:{cleanup_err}")

    return "".join(pieces), processed_count
|
| 115 |
+
|
| 116 |
def get_text_chunks(self, text):
|
| 117 |
"""將文字分割成區塊進行處理"""
|
| 118 |
text_splitter = CharacterTextSplitter(
|
|
|
|
| 231 |
except Exception as e:
|
| 232 |
return f"處理問題時發生錯誤:{str(e)}"
|
| 233 |
|
| 234 |
+
def process_pdfs(self, pdf_files, progress=gr.Progress(), use_gemini=False):
|
| 235 |
"""處理PDF文件"""
|
| 236 |
if not pdf_files:
|
| 237 |
return "請上傳至少一個PDF文件。", ""
|
|
|
|
| 241 |
|
| 242 |
# 提取文字
|
| 243 |
progress(0.2, desc="提取PDF文字內容...")
|
| 244 |
+
if use_gemini:
|
| 245 |
+
raw_text, processed_count = self.get_pdf_text_via_gemini(pdf_files)
|
| 246 |
+
else:
|
| 247 |
+
raw_text, processed_count = self.get_pdf_text(pdf_files)
|
| 248 |
|
| 249 |
if not raw_text.strip():
|
| 250 |
return "無法從PDF文件中提取到文字。", ""
|
|
|
|
| 341 |
bot = PDFChatBot()
|
| 342 |
|
| 343 |
# Gradio 接口函數
|
| 344 |
+
def upload_and_process(files, use_gemini=False, progress=gr.Progress()):
    """Gradio callback that hands uploaded PDFs to the shared bot instance.

    Args:
        files: The value Gradio passes from the file-upload component.
        use_gemini: When true, the bot is asked to extract text via Gemini
            instead of its default extractor.
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            kept in the signature exactly as in the original handler.

    Returns:
        Whatever ``bot.process_pdfs`` returns, unchanged.
    """
    outcome = bot.process_pdfs(files, progress=progress, use_gemini=use_gemini)
    return outcome
|
| 346 |
|
| 347 |
def ask_question(question, history, temperature, max_tokens, search_k):
|
| 348 |
if not question.strip():
|
|
|
|
| 391 |
else:
|
| 392 |
return "❌ 沒有找到已處理的資料。", ""
|
| 393 |
|
| 394 |
+
def set_api_key(api_key: str) -> str:
    """Set or update the Google Gemini API key for the running process.

    The key is stored only in this process's ``GOOGLE_API_KEY`` environment
    variable; this function writes nothing to disk.

    Args:
        api_key: Key text as entered by the user. Surrounding whitespace is
            stripped, and ``None`` is treated like an empty string.

    Returns:
        A user-facing status message: an error message when no key was
        supplied, otherwise a confirmation that the key is set for this run.
    """
    key = (api_key or "").strip()
    if not key:
        return "❌ 未輸入任何金鑰。請貼上有效的 GOOGLE_API_KEY。"

    os.environ["GOOGLE_API_KEY"] = key

    # Clear the cached embeddings object so later use starts from the new key
    # (presumably it is re-created on demand elsewhere -- confirm).
    try:
        bot.embeddings = None
    except Exception as exc:
        # Still best effort, but report the failure instead of hiding it:
        # otherwise a stale embeddings object could go unnoticed.
        print(f"重置 embeddings 時發生錯誤:{exc}")

    return "✅ 已設定 API 金鑰(僅本次執行期間有效)。"
|
| 407 |
+
|
| 408 |
# 創建自定義主題
|
| 409 |
custom_theme = gr.themes.Soft(
|
| 410 |
primary_hue="blue",
|
|
|
|
| 462 |
# 文件上傳區域
|
| 463 |
with gr.Group():
|
| 464 |
gr.Markdown("### 📤 上傳PDF文件")
|
| 465 |
+
api_key_box = gr.Textbox(
|
| 466 |
+
label="Google API Key (可選:部署後可在此貼上)",
|
| 467 |
+
placeholder="以 sk- 或 AIza 開頭的金鑰(不會儲存到硬碟)",
|
| 468 |
+
type="password"
|
| 469 |
+
)
|
| 470 |
+
set_key_btn = gr.Button("🔑 設定 API 金鑰")
|
| 471 |
+
file_upload = gr.File(
|
| 472 |
file_count="multiple",
|
| 473 |
file_types=[".pdf"],
|
| 474 |
label="選擇PDF文件",
|
| 475 |
height=150
|
| 476 |
)
|
| 477 |
+
use_gemini_toggle = gr.Checkbox(label="使用 Gemini 解析 PDF(支援掃描影像)", value=False)
|
| 478 |
|
| 479 |
# 處理選項
|
| 480 |
with gr.Row():
|
|
|
|
| 627 |
# 事件處理
|
| 628 |
process_btn.click(
|
| 629 |
fn=upload_and_process,
|
| 630 |
+
inputs=[file_upload, use_gemini_toggle],
|
| 631 |
outputs=[status_text, file_list],
|
| 632 |
show_progress=True
|
| 633 |
)
|
| 634 |
|
| 635 |
+
set_key_btn.click(
|
| 636 |
+
fn=set_api_key,
|
| 637 |
+
inputs=[api_key_box],
|
| 638 |
+
outputs=[status_text]
|
| 639 |
+
)
|
| 640 |
+
|
| 641 |
load_btn.click(
|
| 642 |
fn=load_existing_data,
|
| 643 |
outputs=[status_text, file_list]
|