mikao007 committed on
Commit
73d91a6
·
verified ·
1 Parent(s): a8d1bed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +601 -591
app.py CHANGED
@@ -1,592 +1,602 @@
1
- from dotenv import load_dotenv
2
- import os
3
- import gradio as gr
4
- from PyPDF2 import PdfReader
5
- from langchain.text_splitter import CharacterTextSplitter
6
- from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
7
- from langchain_community.vectorstores import FAISS
8
- from langchain.chains.question_answering import load_qa_chain
9
- from langchain.prompts import PromptTemplate
10
- import shutil
11
- import tempfile
12
- from docx import Document
13
- from docx.shared import Inches
14
- from datetime import datetime
15
-
16
- # Load environment variables
17
- load_dotenv()
18
-
19
- # Set Gemini API key
20
- gemini_api_key = "AIzaSyA8zqhqNb-bNYU6KVb0Zj0XIKi3aZfvXE0"
21
- os.environ["GOOGLE_API_KEY"] = gemini_api_key
22
-
23
- class PDFChatBot:
24
- def __init__(self):
25
- self.vector_store = None
26
- self.embeddings = GoogleGenerativeAIEmbeddings(
27
- model="models/text-embedding-004",
28
- google_api_key=gemini_api_key
29
- )
30
- self.processed_files = []
31
- self.chat_history = [] # 儲存聊天歷史
32
-
33
- def get_pdf_text(self, pdf_files):
34
- """從多個PDF文件中提取文字"""
35
- raw_text = ""
36
- processed_count = 0
37
-
38
- if not pdf_files:
39
- return raw_text, processed_count
40
-
41
- # 處理單個文件和多個文件
42
- if not isinstance(pdf_files, list):
43
- pdf_files = [pdf_files]
44
-
45
- for pdf_file in pdf_files:
46
- try:
47
- # 如果是上傳的文件對象,使用其name屬性
48
- pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
49
-
50
- pdf_reader = PdfReader(pdf_path)
51
- file_text = ""
52
- for page in pdf_reader.pages:
53
- text = page.extract_text()
54
- if text:
55
- file_text += text + "\n"
56
-
57
- if file_text.strip():
58
- raw_text += file_text
59
- processed_count += 1
60
- self.processed_files.append(os.path.basename(pdf_path))
61
-
62
- except Exception as e:
63
- print(f"讀取PDF時發生錯誤:{str(e)}")
64
- continue
65
-
66
- return raw_text, processed_count
67
-
68
- def get_text_chunks(self, text):
69
- """將文字分割成區塊進行處理"""
70
- text_splitter = CharacterTextSplitter(
71
- separator="\n",
72
- chunk_size=10000,
73
- chunk_overlap=1000,
74
- length_function=len
75
- )
76
- chunks = text_splitter.split_text(text)
77
- return chunks
78
-
79
- def create_vector_store(self, chunks):
80
- """從文字區塊創建FAISS向量存儲"""
81
- try:
82
- self.vector_store = FAISS.from_texts(chunks, self.embeddings)
83
- self.vector_store.save_local("faiss_index")
84
- return True
85
- except Exception as e:
86
- print(f"創建向量存儲時發生錯誤:{str(e)}")
87
- return False
88
-
89
- def load_vector_store(self):
90
- """載入已存在的向量存儲"""
91
- try:
92
- if os.path.exists("faiss_index"):
93
- self.vector_store = FAISS.load_local(
94
- "faiss_index",
95
- embeddings=self.embeddings,
96
- allow_dangerous_deserialization=True
97
- )
98
- return True
99
- else:
100
- return False
101
- except Exception as e:
102
- print(f"載入向量存儲時發生錯誤:{str(e)}")
103
- return False
104
-
105
- def get_conversational_chain(self, temperature=0.3, max_tokens=4096):
106
- """創建對話鏈"""
107
- prompt_template = """
108
- 根據提供的內容盡可能詳細地回答問題。確保提供所有細節。
109
- 如果你需要更多細節來完美回答問題,那麼請詢問你認為需要了解的更多細節。
110
- 如果答案不在提供的內容中,只需說"在您提供的內容中找不到答案"。不要提供錯誤的答案。
111
-
112
- 內容:\n {context}\n
113
- 問題: \n{question}\n
114
-
115
- 回答:
116
- """
117
-
118
- # Using Flash 2.0 model
119
- model = ChatGoogleGenerativeAI(
120
- model="gemini-2.0-flash-exp",
121
- google_api_key=gemini_api_key,
122
- temperature=temperature,
123
- max_tokens=max_tokens,
124
- top_p=0.8,
125
- top_k=40
126
- )
127
-
128
- prompt = PromptTemplate(
129
- template=prompt_template,
130
- input_variables=['context', 'question']
131
- )
132
-
133
- chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
134
- return chain
135
-
136
- def answer_question(self, question, temperature=0.3, max_tokens=4096, search_k=6):
137
- """回答用戶問題"""
138
- if not self.vector_store:
139
- return "請先上傳並處理PDF文件!"
140
-
141
- if not question.strip():
142
- return "請輸入您的問題。"
143
-
144
- try:
145
- # 搜索相關文檔
146
- docs = self.vector_store.similarity_search(question, k=search_k)
147
-
148
- if not docs:
149
- return "在上傳的文檔中找不到相關信息。"
150
-
151
- # 生成回答
152
- chain = self.get_conversational_chain(temperature, max_tokens)
153
- response = chain(
154
- {
155
- "input_documents": docs,
156
- "question": question,
157
- },
158
- return_only_outputs=True
159
- )
160
-
161
- return response["output_text"]
162
-
163
- except Exception as e:
164
- return f"處理問題時發生錯誤:{str(e)}"
165
-
166
- def process_pdfs(self, pdf_files, progress=gr.Progress()):
167
- """處理PDF文件"""
168
- if not pdf_files:
169
- return "請上傳至少一個PDF文件。", ""
170
-
171
- self.processed_files = []
172
- progress(0, desc="開始處理PDF文件...")
173
-
174
- # 提取文字
175
- progress(0.2, desc="提取PDF文字內容...")
176
- raw_text, processed_count = self.get_pdf_text(pdf_files)
177
-
178
- if not raw_text.strip():
179
- return "無法從PDF文件中提取到文字。", ""
180
-
181
- progress(0.4, desc="分割文字內容...")
182
- # 分割文字
183
- text_chunks = self.get_text_chunks(raw_text)
184
-
185
- progress(0.6, desc="創建向量存儲...")
186
- # 創建向量存儲
187
- success = self.create_vector_store(text_chunks)
188
-
189
- progress(1.0, desc="處理完成!")
190
-
191
- if success:
192
- file_list = "已處理的文件:\n" + "\n".join([f"• {file}" for file in self.processed_files])
193
- return f"✅ 成功處理 {processed_count} 個PDF文件!\n總共 {len(text_chunks)} 個文字區塊\n現在您可以開始提問。", file_list
194
- else:
195
- return " PDF處理失敗,請重試。", ""
196
-
197
- def clear_data(self):
198
- """清除處理過的資料"""
199
- try:
200
- if os.path.exists("faiss_index"):
201
- shutil.rmtree("faiss_index")
202
- self.vector_store = None
203
- self.processed_files = []
204
- self.chat_history = []
205
- return "✅ 已清除所有處理過的資料!", ""
206
- except Exception as e:
207
- return f"❌ 清除資料時發生錯誤:{str(e)}", ""
208
-
209
- def create_docx_report(self, chat_history):
210
- """創建包含聊天記錄的docx報告"""
211
- try:
212
- # 創建新的文檔
213
- doc = Document()
214
-
215
- # 添加標題
216
- title = doc.add_heading('PDF聊天機器人 - 問答記錄', 0)
217
- title.alignment = 1 # 置中對齊
218
-
219
- # 添加生成時間
220
- doc.add_paragraph(f'生成時間:{datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")}')
221
-
222
- # 添加處理的文件列表
223
- if self.processed_files:
224
- doc.add_heading('已處理的PDF文件:', level=2)
225
- for i, file in enumerate(self.processed_files, 1):
226
- doc.add_paragraph(f'{i}. {file}', style='List Number')
227
-
228
- doc.add_paragraph('') # 空行
229
-
230
- # 添加問答記錄
231
- doc.add_heading('問答記錄:', level=2)
232
-
233
- if not chat_history:
234
- doc.add_paragraph('目前沒有問答記錄。')
235
- else:
236
- for i in range(0, len(chat_history), 2):
237
- if i + 1 < len(chat_history):
238
- question = chat_history[i]['content']
239
- answer = chat_history[i + 1]['content']
240
-
241
- # 問題
242
- q_paragraph = doc.add_paragraph()
243
- q_run = q_paragraph.add_run(f'問題 {(i//2)+1}:')
244
- q_run.bold = True
245
- q_run.font.size = Inches(0.14)
246
- q_paragraph.add_run(question)
247
-
248
- # 回答
249
- a_paragraph = doc.add_paragraph()
250
- a_run = a_paragraph.add_run('回答:')
251
- a_run.bold = True
252
- a_run.font.size = Inches(0.14)
253
- a_paragraph.add_run(answer)
254
-
255
- # 分隔線
256
- doc.add_paragraph('─' * 50)
257
-
258
- # 保存到臨時文件
259
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx')
260
- doc.save(temp_file.name)
261
- temp_file.close()
262
-
263
- return temp_file.name
264
-
265
- except Exception as e:
266
- print(f"創建docx文件時發生錯誤:{str(e)}")
267
- return None
268
-
269
- # 初始化聊天機器人
270
- bot = PDFChatBot()
271
-
272
- # Gradio 接口函數
273
- def upload_and_process(files, progress=gr.Progress()):
274
- return bot.process_pdfs(files, progress)
275
-
276
- def ask_question(question, history, temperature, max_tokens, search_k):
277
- if not question.strip():
278
- return history, ""
279
-
280
- response = bot.answer_question(question, temperature, max_tokens, search_k)
281
- # 使用新的消息格式
282
- user_msg = {"role": "user", "content": question}
283
- assistant_msg = {"role": "assistant", "content": response}
284
-
285
- history.append(user_msg)
286
- history.append(assistant_msg)
287
-
288
- # 同步更新聊天歷史到bot實例
289
- bot.chat_history = history.copy()
290
-
291
- return history, ""
292
-
293
- def download_chat_history():
294
- """下載聊天記錄為docx文件"""
295
- if not bot.chat_history:
296
- return None
297
-
298
- docx_path = bot.create_docx_report(bot.chat_history)
299
- return docx_path
300
-
301
- def export_to_word():
302
- """匯出問答記錄為Word文件"""
303
- if not bot.chat_history:
304
- return None
305
-
306
- docx_path = bot.create_docx_report(bot.chat_history)
307
- return docx_path
308
-
309
- def clear_chat():
310
- """清除聊天記錄"""
311
- bot.chat_history = []
312
- return [], ""
313
-
314
- def clear_all_data():
315
- return bot.clear_data()
316
-
317
- def load_existing_data():
318
- if bot.load_vector_store():
319
- return "✅ 成功載入已處理的資料!", ""
320
- else:
321
- return "❌ 沒有找到已處理的資料。", ""
322
-
323
- # 創建自定義主題
324
- custom_theme = gr.themes.Soft(
325
- primary_hue="blue",
326
- secondary_hue="gray",
327
- neutral_hue="slate",
328
- font=gr.themes.GoogleFont("Noto Sans TC"),
329
- font_mono=gr.themes.GoogleFont("JetBrains Mono")
330
- )
331
-
332
- # 創建 Gradio 介面
333
- with gr.Blocks(
334
- title="PDF智能問答系統",
335
- theme=custom_theme,
336
- css="""
337
- .gradio-container {
338
- max-width: 1200px !important;
339
- margin: auto !important;
340
- }
341
- .main-header {
342
- text-align: center;
343
- padding: 20px;
344
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
345
- color: white;
346
- border-radius: 10px;
347
- margin-bottom: 20px;
348
- }
349
- .status-box {
350
- background-color: #f8f9fa;
351
- border-left: 4px solid #007bff;
352
- padding: 15px;
353
- border-radius: 5px;
354
- }
355
- .file-info {
356
- background-color: #e8f5e8;
357
- border-left: 4px solid #28a745;
358
- padding: 10px;
359
- border-radius: 5px;
360
- }
361
- """
362
- ) as demo:
363
-
364
- # 主標題區域
365
- with gr.Row():
366
- gr.HTML("""
367
- <div class="main-header">
368
- <h1>🤖 PDF智能問答系統</h1>
369
- <p>基於 Gemini 2.0 Flash 的 RAG 技術 | 支持多語言問答</p>
370
- </div>
371
- """)
372
-
373
- # 主要功能區域
374
- with gr.Tab("📁 文件管理", id="file_tab"):
375
- with gr.Row():
376
- with gr.Column(scale=3):
377
- # 文件上傳區域
378
- with gr.Group():
379
- gr.Markdown("### 📤 上傳PDF文件")
380
- file_upload = gr.File(
381
- file_count="multiple",
382
- file_types=[".pdf"],
383
- label="選擇PDF文件",
384
- height=150
385
- )
386
-
387
- # 處理選項
388
- with gr.Row():
389
- process_btn = gr.Button(
390
- "🚀 開始處理",
391
- variant="primary",
392
- size="lg",
393
- scale=2
394
- )
395
- load_btn = gr.Button(
396
- "📂 載入已處理資料",
397
- variant="secondary",
398
- scale=1
399
- )
400
- clear_btn = gr.Button(
401
- "🗑️ 清除所有資料",
402
- variant="stop",
403
- scale=1
404
- )
405
-
406
- with gr.Column(scale=2):
407
- # 狀態顯示區域
408
- with gr.Group():
409
- gr.Markdown("### 📊 處理狀態")
410
- status_text = gr.Textbox(
411
- label="處理進度",
412
- lines=6,
413
- interactive=False,
414
- elem_classes=["status-box"]
415
- )
416
-
417
- # 文件列表
418
- gr.Markdown("### 📋 已處理文件")
419
- file_list = gr.Textbox(
420
- label="文件清單",
421
- lines=8,
422
- interactive=False,
423
- elem_classes=["file-info"]
424
- )
425
-
426
- with gr.Tab("💬 智能問答", id="chat_tab"):
427
- with gr.Row():
428
- with gr.Column(scale=4):
429
- # 聊天區域
430
- chatbot = gr.Chatbot(
431
- label="💬 對話記錄",
432
- height=600,
433
- show_copy_button=True,
434
- type="messages",
435
- avatar_images=["👤", "🤖"],
436
- bubble_full_width=False
437
- )
438
-
439
- with gr.Column(scale=1):
440
- # 側邊欄功能
441
- with gr.Group():
442
- gr.Markdown("### ⚙️ 問答設定")
443
-
444
- # 模型參數調整
445
- temperature = gr.Slider(
446
- minimum=0.1,
447
- maximum=1.0,
448
- value=0.3,
449
- step=0.1,
450
- label="創意度 (Temperature)",
451
- info="數值越高回答越有創意"
452
- )
453
-
454
- max_tokens = gr.Slider(
455
- minimum=512,
456
- maximum=8192,
457
- value=4096,
458
- step=512,
459
- label="最大回答長度",
460
- info="控制回答的詳細程度"
461
- )
462
-
463
- search_k = gr.Slider(
464
- minimum=2,
465
- maximum=10,
466
- value=6,
467
- step=1,
468
- label="檢索文檔數量",
469
- info="搜索相關文檔的數量"
470
- )
471
-
472
- # 輸入區域
473
- with gr.Row():
474
- question_input = gr.Textbox(
475
- placeholder="請輸入您的問題... (支援中文、英文等多語言)",
476
- label="💭 問題輸入",
477
- lines=3,
478
- scale=4,
479
- max_lines=5
480
- )
481
- ask_btn = gr.Button(
482
- "📤 發送問題",
483
- variant="primary",
484
- scale=1,
485
- size="lg"
486
- )
487
-
488
- # 快捷操作
489
- with gr.Row():
490
- clear_chat_btn = gr.Button(
491
- "🧹 清除對話",
492
- variant="secondary",
493
- scale=1
494
- )
495
- download_btn = gr.Button(
496
- "📥 下載問答記錄",
497
- variant="primary",
498
- scale=1
499
- )
500
- export_btn = gr.Button(
501
- "📄 匯出為Word",
502
- variant="secondary",
503
- scale=1
504
- )
505
-
506
- # 問題範例
507
- with gr.Group():
508
- gr.Markdown("### 💡 問題範例")
509
- gr.Examples(
510
- examples=[
511
- "這份文檔的主要內容是什麼?",
512
- "請總結文檔的重點和關鍵概念",
513
- "文檔中提到了哪些重要數據或統計?",
514
- "能否詳細解釋某個特定主題或概念?",
515
- "文檔的結論是什麼?",
516
- "有哪些重要的建議或建議?",
517
- "文檔中提到了哪些風險或挑戰?",
518
- "請比較文檔中提到的不同觀點"
519
- ],
520
- inputs=question_input,
521
- label="點擊範例快速填入"
522
- )
523
-
524
- # 隱藏的文件下載組件
525
- download_file = gr.File(visible=False)
526
-
527
- # 下載功能處理函數
528
- def handle_download():
529
- file_path = download_chat_history()
530
- if file_path:
531
- return gr.update(value=file_path, visible=True)
532
- else:
533
- gr.Warning("沒有聊天記錄可以下載!")
534
- return gr.update(visible=False)
535
-
536
- # 事件處理
537
- process_btn.click(
538
- fn=upload_and_process,
539
- inputs=[file_upload],
540
- outputs=[status_text, file_list],
541
- show_progress=True
542
- )
543
-
544
- load_btn.click(
545
- fn=load_existing_data,
546
- outputs=[status_text, file_list]
547
- )
548
-
549
- clear_btn.click(
550
- fn=clear_all_data,
551
- outputs=[status_text, file_list]
552
- )
553
-
554
- ask_btn.click(
555
- fn=ask_question,
556
- inputs=[question_input, chatbot, temperature, max_tokens, search_k],
557
- outputs=[chatbot, question_input]
558
- )
559
-
560
- question_input.submit(
561
- fn=ask_question,
562
- inputs=[question_input, chatbot, temperature, max_tokens, search_k],
563
- outputs=[chatbot, question_input]
564
- )
565
-
566
- clear_chat_btn.click(
567
- fn=clear_chat,
568
- outputs=[chatbot, question_input]
569
- )
570
-
571
- download_btn.click(
572
- fn=handle_download,
573
- outputs=download_file
574
- )
575
-
576
- export_btn.click(
577
- fn=export_to_word,
578
- outputs=download_file
579
- )
580
-
581
- if __name__ == "__main__":
582
- # 嘗試載入現有的向量存儲
583
- bot.load_vector_store()
584
-
585
- # 啟動應用
586
- demo.launch(
587
- share=False, # 設為 True 可獲得公共連結
588
- server_name="127.0.0.1", # 本地訪問
589
- server_port=None, # 自動選擇可用端口
590
- show_error=True,
591
- inbrowser=True # 自動打開瀏覽器
 
 
 
 
 
 
 
 
 
 
592
  )
 
1
+ from dotenv import load_dotenv
2
+ import os
3
+ import gradio as gr
4
+ from PyPDF2 import PdfReader
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain.chains.question_answering import load_qa_chain
9
+ from langchain.prompts import PromptTemplate
10
+ import shutil
11
+ import tempfile
12
+ from docx import Document
13
+ from docx.shared import Inches
14
+ from datetime import datetime
15
+
16
# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()

# The Gemini key comes only from the environment (.env or platform secrets);
# surrounding whitespace is stripped before use.
gemini_api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
if not gemini_api_key:
    print("警告:未找到 GOOGLE_API_KEY,請在環境變數或 .env 設定。")
else:
    # Write the stripped value back so libraries reading the variable see it.
    os.environ["GOOGLE_API_KEY"] = gemini_api_key
25
+
26
class PDFChatBot:
    """Question answering over uploaded PDF files using retrieval + Gemini.

    Text is extracted with PyPDF2, split into chunks, embedded with
    ``GoogleGenerativeAIEmbeddings`` and indexed in a FAISS store that is
    saved to / loaded from the ``faiss_index`` directory.  Questions are
    answered by a ``ChatGoogleGenerativeAI`` model that receives the most
    similar chunks as context.
    """

    # Directory the FAISS index is saved to and loaded from.
    _INDEX_DIR = "faiss_index"
    # Visible rule written between question/answer pairs in the Word report.
    _REPORT_SEPARATOR = '─' * 50

    def __init__(self):
        """Create the embedding client and empty per-session state."""
        # Stays None until PDFs are processed or a saved index is loaded.
        self.vector_store = None
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004",
            google_api_key=gemini_api_key
        )
        # Base names of the PDFs that yielded text in the latest processing run.
        self.processed_files = []
        # Chat transcript as a list of {"role": ..., "content": ...} dicts.
        self.chat_history = []

    def get_pdf_text(self, pdf_files):
        """Extract the text of one or more PDF files.

        Args:
            pdf_files: A single path / uploaded-file object or a list of
                them.  Objects with a ``name`` attribute are read from that
                path; anything else is passed to ``PdfReader`` as is.

        Returns:
            A ``(raw_text, processed_count)`` tuple: the concatenated text of
            every file that yielded non-blank text, and how many files did.
        """
        if not pdf_files:
            return "", 0

        # Accept a single file as well as a list of files.
        if not isinstance(pdf_files, list):
            pdf_files = [pdf_files]

        extracted = []
        for pdf_file in pdf_files:
            try:
                pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

                page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
                # Pages without extractable text are skipped.
                file_text = "".join(f"{text}\n" for text in page_texts if text)

                if file_text.strip():
                    extracted.append(file_text)
                    self.processed_files.append(os.path.basename(pdf_path))

            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                print(f"讀取PDF時發生錯誤:{str(e)}")
                continue

        return "".join(extracted), len(extracted)

    def get_text_chunks(self, text):
        """Split ``text`` on newlines into overlapping chunks.

        Returns:
            The list of chunk strings produced by ``CharacterTextSplitter``
            (chunk size 10000 characters, overlap 1000).
        """
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=10000,
            chunk_overlap=1000,
            length_function=len
        )
        return text_splitter.split_text(text)

    def create_vector_store(self, chunks):
        """Build a FAISS index from ``chunks`` and save it to disk.

        Returns:
            True on success, False if embedding or saving raised.
        """
        try:
            self.vector_store = FAISS.from_texts(chunks, self.embeddings)
            self.vector_store.save_local(self._INDEX_DIR)
            return True
        except Exception as e:
            print(f"創建向量存儲時發生錯誤:{str(e)}")
            return False

    def load_vector_store(self):
        """Load a previously saved FAISS index, if one exists.

        Returns:
            True when the index was loaded, False when the directory is
            missing or loading raised.
        """
        try:
            if not os.path.exists(self._INDEX_DIR):
                return False
            # NOTE(review): allow_dangerous_deserialization=True opts in to
            # pickle-based loading; only load index directories this app
            # wrote itself.
            self.vector_store = FAISS.load_local(
                self._INDEX_DIR,
                embeddings=self.embeddings,
                allow_dangerous_deserialization=True
            )
            return True
        except Exception as e:
            print(f"載入向量存儲時發生錯誤:{str(e)}")
            return False

    def get_conversational_chain(self, temperature=0.3, max_tokens=4096):
        """Build the "stuff" question-answering chain around the Gemini model.

        Args:
            temperature: Sampling temperature passed to the chat model.
            max_tokens: Maximum answer length passed to the chat model.

        Returns:
            The chain created by ``load_qa_chain``; it expects the inputs
            ``input_documents`` and ``question``.
        """
        prompt_template = """
        根據提供的內容盡可能詳細地回答問題。確保提供所有細節。
        如果你需要更多細節來完美回答問題,那麼請詢問你認為需要了解的更多細節。
        如果答案不在提供的內容中,只需說"在您提供的內容中找不到答案"。不要提供錯誤的答案。

        內容:\n {context}\n
        問題: \n{question}\n

        回答:
        """

        model = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash-exp",
            google_api_key=gemini_api_key,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=0.8,
            top_k=40
        )

        prompt = PromptTemplate(
            template=prompt_template,
            input_variables=['context', 'question']
        )

        return load_qa_chain(model, chain_type="stuff", prompt=prompt)

    def answer_question(self, question, temperature=0.3, max_tokens=4096, search_k=6):
        """Answer ``question`` from the indexed PDF content.

        Args:
            question: The user's question.
            temperature: Sampling temperature for the chat model.
            max_tokens: Maximum answer length for the chat model.
            search_k: Number of similar chunks to retrieve as context.

        Returns:
            The model's answer, or a user-facing message when no index is
            loaded, the question is blank, nothing relevant is found, or an
            error occurs.
        """
        if not self.vector_store:
            return "請先上傳並處理PDF文件!"

        # Also covers a question of None, which has no .strip().
        if not question or not question.strip():
            return "請輸入您的問題。"

        try:
            # Retrieve the most similar chunks; k must be an integer.
            docs = self.vector_store.similarity_search(question, k=int(search_k))

            if not docs:
                return "在上傳的文檔中找不到相關信息。"

            chain = self.get_conversational_chain(temperature, max_tokens)
            # invoke() replaces the deprecated direct call chain(...).
            response = chain.invoke(
                {
                    "input_documents": docs,
                    "question": question,
                },
                return_only_outputs=True
            )

            return response["output_text"]

        except Exception as e:
            return f"處理問題時發生錯誤:{str(e)}"

    def process_pdfs(self, pdf_files, progress=gr.Progress()):
        """Extract, chunk and index the uploaded PDFs.

        Args:
            pdf_files: Uploaded file object(s) or path(s).
            progress: Gradio progress callback, called with a fraction and a
                description at each stage.

        Returns:
            A ``(status_message, file_list_text)`` tuple for the two status
            text boxes; ``file_list_text`` is empty on failure.
        """
        if not pdf_files:
            return "請上傳至少一個PDF文件。", ""

        # Start a fresh file list for this run.
        self.processed_files = []
        progress(0, desc="開始處理PDF文件...")

        progress(0.2, desc="提取PDF文字內容...")
        raw_text, processed_count = self.get_pdf_text(pdf_files)

        if not raw_text.strip():
            return "無法從PDF文件中提取到文字。", ""

        progress(0.4, desc="分割文字內容...")
        text_chunks = self.get_text_chunks(raw_text)

        progress(0.6, desc="創建向量存儲...")
        success = self.create_vector_store(text_chunks)

        progress(1.0, desc="處理完成!")

        if not success:
            return "❌ PDF處理失敗,請重試。", ""

        file_list = "已處理的文件:\n" + "\n".join(f"• {name}" for name in self.processed_files)
        return f"✅ 成功處理 {processed_count} 個PDF文件!\n總共 {len(text_chunks)} 個文字區塊\n現在您可以開始提問。", file_list

    def clear_data(self):
        """Delete the saved index and reset all in-memory state.

        Returns:
            A ``(status_message, "")`` tuple for the two status text boxes.
        """
        try:
            if os.path.exists(self._INDEX_DIR):
                shutil.rmtree(self._INDEX_DIR)
            self.vector_store = None
            self.processed_files = []
            self.chat_history = []
            return "✅ 已清除所有處理過的資料!", ""
        except Exception as e:
            return f"❌ 清除資料時發生錯誤:{str(e)}", ""

    @staticmethod
    def _add_labelled_paragraph(doc, label, text):
        """Append a paragraph made of a bold ``label`` run followed by ``text``."""
        paragraph = doc.add_paragraph()
        label_run = paragraph.add_run(label)
        label_run.bold = True
        label_run.font.size = Inches(0.14)  # 0.14 in is roughly 10 pt
        paragraph.add_run(text)

    def create_docx_report(self, chat_history):
        """Write the chat transcript to a temporary Word document.

        Args:
            chat_history: List of message dicts alternating user question and
                assistant answer; each dict is read through its ``content``
                key.  A trailing unpaired message is ignored.

        Returns:
            Path of the generated ``.docx`` file, or None if creation failed.
            The file is not deleted automatically.
        """
        try:
            doc = Document()

            title = doc.add_heading('PDF聊天機器人 - 問答記錄', 0)
            title.alignment = 1  # centre the title

            doc.add_paragraph(f'生成時間:{datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")}')

            if self.processed_files:
                doc.add_heading('已處理的PDF文件:', level=2)
                for i, file_name in enumerate(self.processed_files, 1):
                    doc.add_paragraph(f'{i}. {file_name}', style='List Number')

            # NOTE(review): spacer assumed to be unconditional; confirm it is
            # not meant to sit inside the `if` above.
            doc.add_paragraph('')

            doc.add_heading('問答記錄:', level=2)

            if not chat_history:
                doc.add_paragraph('目前沒有問答記錄。')
            else:
                # Messages alternate question, answer; walk them as pairs.
                pairs = zip(chat_history[0::2], chat_history[1::2])
                for number, (question_msg, answer_msg) in enumerate(pairs, 1):
                    self._add_labelled_paragraph(doc, f'問題 {number}:', question_msg['content'])
                    self._add_labelled_paragraph(doc, '回答:', answer_msg['content'])
                    # A visible rule; an empty string here would leave a blank line.
                    doc.add_paragraph(self._REPORT_SEPARATOR)

            # Reserve a unique path, release the handle, then let python-docx
            # write to it.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
                report_path = temp_file.name
            doc.save(report_path)

            return report_path

        except Exception as e:
            print(f"創建docx文件時發生錯誤:{str(e)}")
            return None
271
+
272
# Create the single chatbot instance used by the Gradio callback functions.
bot = PDFChatBot()
274
+
275
+ # Gradio 接口函數
276
def upload_and_process(files, progress=gr.Progress()):
    """Gradio callback: hand the uploaded files to the bot for indexing.

    Returns:
        The ``(status_message, file_list_text)`` tuple produced by
        ``bot.process_pdfs``.
    """
    outcome = bot.process_pdfs(files, progress)
    return outcome
278
+
279
def ask_question(question, history, temperature, max_tokens, search_k):
    """Gradio callback: answer a question and append the exchange to the chat.

    Args:
        question: Text from the question box.
        history: Current chatbot messages (list of role/content dicts); it is
            extended in place.
        temperature, max_tokens, search_k: Values of the settings sliders,
            forwarded to ``bot.answer_question``.

    Returns:
        ``(history, "")``: the updated messages and an empty string that
        clears the question box.  A blank question leaves ``history`` as is.
    """
    if question.strip():
        answer = bot.answer_question(question, temperature, max_tokens, search_k)

        # Messages use the role/content dict format.
        history.extend(
            [
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ]
        )

        # Keep a snapshot on the bot so the transcript can be exported.
        bot.chat_history = list(history)

    return history, ""
295
+
296
def download_chat_history():
    """Build a Word report of the chat transcript.

    Returns:
        Path of the generated .docx file, or None when there is no chat
        history (or the report could not be created).
    """
    transcript = bot.chat_history
    return bot.create_docx_report(transcript) if transcript else None
303
+
304
def export_to_word():
    """Gradio callback: export the chat transcript as a Word file.

    The output File component is created hidden, so returning a bare path
    would load the file without ever showing it.  This callback therefore
    returns an update that also makes the component visible, the same way
    the download handler does.

    Returns:
        A ``gr.update`` carrying the report path with ``visible=True``, or
        one that clears and hides the component (after a warning toast) when
        there is no chat history or the report could not be created.
    """
    docx_path = bot.create_docx_report(bot.chat_history) if bot.chat_history else None

    if not docx_path:
        gr.Warning("沒有聊天記錄可以下載!")
        return gr.update(value=None, visible=False)

    return gr.update(value=docx_path, visible=True)
311
+
312
def clear_chat():
    """Gradio callback: drop the stored transcript and empty the chat widgets.

    Returns:
        ``([], "")``: an empty message list for the chatbot and an empty
        string for the question box.
    """
    bot.chat_history = []
    empty_messages, empty_question = [], ""
    return empty_messages, empty_question
316
+
317
def clear_all_data():
    """Gradio callback: wipe the saved index and all bot state.

    Returns:
        The ``(status_message, "")`` tuple produced by ``bot.clear_data``.
    """
    status = bot.clear_data()
    return status
319
+
320
def load_existing_data():
    """Gradio callback: try to load a previously saved index.

    Returns:
        ``(status_message, "")`` reporting whether an index was loaded.
    """
    loaded = bot.load_vector_store()
    message = "✅ 成功載入已處理的資料!" if loaded else "❌ 沒有找到已處理的資料。"
    return message, ""
325
+
326
# Custom theme: Gradio's Soft theme with blue / gray / slate hues and
# Google-hosted fonts for regular and monospace text.
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="gray",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Noto Sans TC"),
    font_mono=gr.themes.GoogleFont("JetBrains Mono")
)
334
+
335
+ # 創建 Gradio 介面
336
+ with gr.Blocks(
337
+ title="PDF智能問答系統",
338
+ theme=custom_theme,
339
+ css="""
340
+ .gradio-container {
341
+ max-width: 1200px !important;
342
+ margin: auto !important;
343
+ }
344
+ .main-header {
345
+ text-align: center;
346
+ padding: 20px;
347
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
348
+ color: white;
349
+ border-radius: 10px;
350
+ margin-bottom: 20px;
351
+ }
352
+ .status-box {
353
+ background-color: #f8f9fa;
354
+ border-left: 4px solid #007bff;
355
+ padding: 15px;
356
+ border-radius: 5px;
357
+ }
358
+ .file-info {
359
+ background-color: #e8f5e8;
360
+ border-left: 4px solid #28a745;
361
+ padding: 10px;
362
+ border-radius: 5px;
363
+ }
364
+ """
365
+ ) as demo:
366
+
367
+ # 主標題區域
368
+ with gr.Row():
369
+ gr.HTML("""
370
+ <div class="main-header">
371
+ <h1>🤖 PDF智能問答系統</h1>
372
+ <p>基於 Gemini 2.0 Flash 的 RAG 技術 | 支持多語言問答</p>
373
+ </div>
374
+ """)
375
+
376
+ # 主要功能區域
377
+ with gr.Tab("📁 文件管理", id="file_tab"):
378
+ with gr.Row():
379
+ with gr.Column(scale=3):
380
+ # 文件上傳區域
381
+ with gr.Group():
382
+ gr.Markdown("### 📤 上傳PDF文件")
383
+ file_upload = gr.File(
384
+ file_count="multiple",
385
+ file_types=[".pdf"],
386
+ label="選擇PDF文件",
387
+ height=150
388
+ )
389
+
390
+ # 處理選項
391
+ with gr.Row():
392
+ process_btn = gr.Button(
393
+ "🚀 開始處理",
394
+ variant="primary",
395
+ size="lg",
396
+ scale=2
397
+ )
398
+ load_btn = gr.Button(
399
+ "📂 載入已處理資料",
400
+ variant="secondary",
401
+ scale=1
402
+ )
403
+ clear_btn = gr.Button(
404
+ "🗑️ 清除所有資料",
405
+ variant="stop",
406
+ scale=1
407
+ )
408
+
409
+ with gr.Column(scale=2):
410
+ # 狀態顯示區域
411
+ with gr.Group():
412
+ gr.Markdown("### 📊 處理狀態")
413
+ status_text = gr.Textbox(
414
+ label="處理進度",
415
+ lines=6,
416
+ interactive=False,
417
+ elem_classes=["status-box"]
418
+ )
419
+
420
+ # 文件列表
421
+ gr.Markdown("### 📋 已處理文件")
422
+ file_list = gr.Textbox(
423
+ label="文件清單",
424
+ lines=8,
425
+ interactive=False,
426
+ elem_classes=["file-info"]
427
+ )
428
+
429
+ with gr.Tab("💬 智能問答", id="chat_tab"):
430
+ with gr.Row():
431
+ with gr.Column(scale=4):
432
+ # 聊天區域
433
+ chatbot = gr.Chatbot(
434
+ label="💬 對話記錄",
435
+ height=600,
436
+ show_copy_button=True,
437
+ type="messages",
438
+ avatar_images=["👤", "🤖"]
439
+ )
440
+
441
+ with gr.Column(scale=1):
442
+ # 側邊欄功能
443
+ with gr.Group():
444
+ gr.Markdown("### ⚙️ 問答設定")
445
+
446
+ # 模型參數調整
447
+ temperature = gr.Slider(
448
+ minimum=0.1,
449
+ maximum=1.0,
450
+ value=0.3,
451
+ step=0.1,
452
+ label="創意度 (Temperature)",
453
+ info="數值越高回答越有創意"
454
+ )
455
+
456
+ max_tokens = gr.Slider(
457
+ minimum=512,
458
+ maximum=8192,
459
+ value=4096,
460
+ step=512,
461
+ label="最大回答長度",
462
+ info="控制回答的詳細程度"
463
+ )
464
+
465
+ search_k = gr.Slider(
466
+ minimum=2,
467
+ maximum=10,
468
+ value=6,
469
+ step=1,
470
+ label="檢索文檔數量",
471
+ info="搜索相關文檔的數量"
472
+ )
473
+
474
+ # 輸入區域
475
+ with gr.Row():
476
+ question_input = gr.Textbox(
477
+ placeholder="請輸入您的問題... (支援中文、英文等多語言)",
478
+ label="💭 問題輸入",
479
+ lines=3,
480
+ scale=4,
481
+ max_lines=5
482
+ )
483
+ ask_btn = gr.Button(
484
+ "📤 發送問題",
485
+ variant="primary",
486
+ scale=1,
487
+ size="lg"
488
+ )
489
+
490
+ # 快捷操作
491
+ with gr.Row():
492
+ clear_chat_btn = gr.Button(
493
+ "🧹 清除對話",
494
+ variant="secondary",
495
+ scale=1
496
+ )
497
+ download_btn = gr.Button(
498
+ "📥 下載問答記錄",
499
+ variant="primary",
500
+ scale=1
501
+ )
502
+ export_btn = gr.Button(
503
+ "📄 匯出為Word",
504
+ variant="secondary",
505
+ scale=1
506
+ )
507
+
508
+ # 問題範例
509
+ with gr.Group():
510
+ gr.Markdown("### 💡 問題範例")
511
+ gr.Examples(
512
+ examples=[
513
+ "這份文檔的主要內容是什麼?",
514
+ "請總結文檔的重點和關鍵概念",
515
+ "文檔中提到了哪些重要數據或統計?",
516
+ "能否詳細解釋某個特定主題或概念?",
517
+ "文檔的結論是什麼?",
518
+ "有哪些重要的建議或建議?",
519
+ "文檔中提到了哪些風險或挑戰?",
520
+ "請比較文檔中提到的不同觀點"
521
+ ],
522
+ inputs=question_input,
523
+ label="點擊範例快速填入"
524
+ )
525
+
526
+ # 隱藏的文件下載組件
527
+ download_file = gr.File(visible=False)
528
+
529
+ # 下載功能處理函數
530
+ def handle_download():
531
+ file_path = download_chat_history()
532
+ if file_path:
533
+ return gr.update(value=file_path, visible=True)
534
+ else:
535
+ gr.Warning("沒有聊天記錄可以下載!")
536
+ return gr.update(visible=False)
537
+
538
+ # 事件處理
539
+ process_btn.click(
540
+ fn=upload_and_process,
541
+ inputs=[file_upload],
542
+ outputs=[status_text, file_list],
543
+ show_progress=True
544
+ )
545
+
546
+ load_btn.click(
547
+ fn=load_existing_data,
548
+ outputs=[status_text, file_list]
549
+ )
550
+
551
+ clear_btn.click(
552
+ fn=clear_all_data,
553
+ outputs=[status_text, file_list]
554
+ )
555
+
556
+ ask_btn.click(
557
+ fn=ask_question,
558
+ inputs=[question_input, chatbot, temperature, max_tokens, search_k],
559
+ outputs=[chatbot, question_input]
560
+ )
561
+
562
+ question_input.submit(
563
+ fn=ask_question,
564
+ inputs=[question_input, chatbot, temperature, max_tokens, search_k],
565
+ outputs=[chatbot, question_input]
566
+ )
567
+
568
+ clear_chat_btn.click(
569
+ fn=clear_chat,
570
+ outputs=[chatbot, question_input]
571
+ )
572
+
573
+ download_btn.click(
574
+ fn=handle_download,
575
+ outputs=download_file
576
+ )
577
+
578
+ export_btn.click(
579
+ fn=export_to_word,
580
+ outputs=download_file
581
+ )
582
+
583
if __name__ == "__main__":
    # Reuse a previously saved index when one is present.
    bot.load_vector_store()

    # Host: HOST wins, then SERVER_NAME, then all interfaces (containers / cloud).
    fallback_host = os.getenv("SERVER_NAME", "0.0.0.0")
    server_name = os.getenv("HOST", fallback_host)

    # Port: PORT wins, then SERVER_PORT; anything non-numeric falls back to 7860.
    raw_port = os.getenv("PORT", os.getenv("SERVER_PORT"))
    if raw_port and raw_port.isdigit():
        server_port = int(raw_port)
    else:
        server_port = 7860

    # Boolean switches are enabled only by the literal value "true" (any case).
    inbrowser = os.getenv("INBROWSER", "false").lower() == "true"
    share = os.getenv("GRADIO_SHARE", "false").lower() == "true"

    launch_options = {
        "share": share,
        "server_name": server_name,
        "server_port": server_port,
        "show_error": True,
        "inbrowser": inbrowser,
    }
    demo.launch(**launch_options)