Spaces:
Sleeping
Sleeping
| # 第二步:匯入必要的庫並初始化 | |
| import os | |
| import gradio as gr | |
| from PyPDF2 import PdfReader | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.chains.question_answering import load_qa_chain | |
| from langchain.prompts import PromptTemplate | |
| import shutil | |
| import tempfile | |
| from docx import Document | |
| from docx.shared import Inches | |
| from datetime import datetime | |
| import resend | |
| print("📦 所有庫匯入成功!") | |
# Step 3: configure API keys
print("🔑 設置API密鑰...")
# SECURITY: credentials must never be committed to source. They are read from
# the environment instead (on Hugging Face Spaces, define them as Space
# secrets). Any key that was previously hard-coded here should be treated as
# leaked and revoked.
# Gemini API key: GOOGLE_API_KEY is preferred, GEMINI_API_KEY is accepted as
# an alternative name.
gemini_api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
if gemini_api_key:
    # Keep GOOGLE_API_KEY populated so code that reads the variable directly
    # sees the same key.
    os.environ["GOOGLE_API_KEY"] = gemini_api_key
else:
    print("⚠️ 未設定 GOOGLE_API_KEY 環境變數,Gemini 功能將無法使用。")
# Resend API key (used for e-mailing the chat transcript).
resend.api_key = os.environ.get("RESEND_API_KEY")
if not resend.api_key:
    print("⚠️ 未設定 RESEND_API_KEY 環境變數,郵件發送功能將無法使用。")
print("✅ API密鑰設置完成!")
# Step 4: define the PDF chatbot class
class PDFChatBot:
    """Answer questions about uploaded PDFs and export the conversation.

    Text is extracted with PyPDF2, split into chunks, embedded with Google
    Generative AI embeddings and indexed in FAISS (persisted in the
    ``faiss_index`` directory). Questions are answered by a Gemini chat model
    over the most similar chunks. The transcript can be exported as a .docx
    file or e-mailed as HTML through Resend.
    """

    def __init__(self):
        """Create the embedding client; start with no index and no history."""
        # Set by create_vector_store() / load_vector_store(); None = no index.
        self.vector_store = None
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004",
            google_api_key=gemini_api_key
        )
        # Base names of the PDFs that yielded text in the latest processing run.
        self.processed_files = []
        # Alternating user/assistant message dicts with a 'content' key.
        self.chat_history = []

    @staticmethod
    def _iter_qa_pairs(chat_history):
        """Yield ``(number, question, answer)`` for each complete Q/A pair.

        Messages are paired by position (even index = question, following
        index = answer); a trailing unpaired message is ignored. ``number``
        is 1-based.
        """
        for start in range(0, len(chat_history) - 1, 2):
            yield (
                start // 2 + 1,
                chat_history[start]['content'],
                chat_history[start + 1]['content'],
            )

    def get_pdf_text(self, pdf_files):
        """Extract text from one or more PDF files.

        Args:
            pdf_files: A single item or a list of items; each item is either a
                path or an object exposing the path as ``.name``.

        Returns:
            ``(raw_text, processed_count)``: the concatenated text of every
            file that produced text, and how many files that was. Files that
            fail to parse are reported on stdout and skipped.
        """
        text_parts = []
        processed_count = 0
        if not pdf_files:
            return "", processed_count
        if not isinstance(pdf_files, list):
            pdf_files = [pdf_files]
        for pdf_file in pdf_files:
            try:
                pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
                pdf_reader = PdfReader(pdf_path)
                page_texts = (page.extract_text() for page in pdf_reader.pages)
                # Each non-empty page contributes its text plus a newline.
                file_text = "".join(f"{text}\n" for text in page_texts if text)
                if file_text.strip():
                    text_parts.append(file_text)
                    processed_count += 1
                    self.processed_files.append(os.path.basename(pdf_path))
            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                print(f"讀取PDF時發生錯誤:{e}")
                continue
        return "".join(text_parts), processed_count

    def get_text_chunks(self, text):
        """Split ``text`` on newlines into chunks of up to 10000 characters
        with a 1000-character overlap, and return the list of chunks."""
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=10000,
            chunk_overlap=1000,
            length_function=len
        )
        return text_splitter.split_text(text)

    def create_vector_store(self, chunks):
        """Build a FAISS index from ``chunks`` and save it to ``faiss_index``.

        Returns:
            True on success; False if embedding, indexing or saving raised
            (the error is printed).
        """
        try:
            self.vector_store = FAISS.from_texts(chunks, self.embeddings)
            self.vector_store.save_local("faiss_index")
            return True
        except Exception as e:
            print(f"建立向量儲存時發生錯誤:{e}")
            return False

    def load_vector_store(self):
        """Load a previously saved index from ``faiss_index``.

        Returns:
            True if the index was loaded; False if the directory does not
            exist or loading raised (the error is printed).
        """
        try:
            if not os.path.exists("faiss_index"):
                return False
            # SECURITY NOTE(review): allow_dangerous_deserialization=True
            # permits unpickling the stored index. Only safe while
            # "faiss_index" can be written solely by this application.
            self.vector_store = FAISS.load_local(
                "faiss_index",
                embeddings=self.embeddings,
                allow_dangerous_deserialization=True
            )
            return True
        except Exception as e:
            print(f"載入向量儲存時發生錯誤:{e}")
            return False

    def get_conversational_chain(self):
        """Build the "stuff" question-answering chain backed by Gemini.

        The prompt instructs the model to answer only from the supplied
        context and to say so when the answer is not in it.
        """
        prompt_template = """
根據提供的內容盡可能詳細地回答問題。確保提供所有細節。
如果你需要更多細節來完美回答問題,那麼請詢問你認為需要了解的更多細節。
如果答案不在提供的內容中,只需說"在您提供的內容中找不到答案"。不要提供錯誤的答案。
內容:\n {context}\n
問題: \n{question}\n
回答:
"""
        model = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash-exp",
            google_api_key=gemini_api_key,
            temperature=0.3,
            max_tokens=8192,
            top_p=0.8,
            top_k=40
        )
        prompt = PromptTemplate(
            template=prompt_template,
            input_variables=['context', 'question']
        )
        return load_qa_chain(model, chain_type="stuff", prompt=prompt)

    def answer_question(self, question):
        """Answer ``question`` from the indexed documents.

        Returns:
            The model's answer, or a user-facing message when no index is
            loaded, the question is empty, nothing relevant is found, or an
            error occurs. This method does not raise.
        """
        if not self.vector_store:
            return "請先上傳並處理PDF檔案!"
        # Also treat None as an empty question instead of raising.
        if not question or not question.strip():
            return "請輸入您的問題。"
        try:
            # Retrieve the 6 most similar chunks as context for the model.
            docs = self.vector_store.similarity_search(question, k=6)
            if not docs:
                return "在上傳的文件中找不到相關資訊。"
            chain = self.get_conversational_chain()
            response = chain(
                {
                    "input_documents": docs,
                    "question": question,
                },
                return_only_outputs=True
            )
            return response["output_text"]
        except Exception as e:
            return f"處理問題時發生錯誤:{e}"

    def process_pdfs(self, pdf_files, progress=gr.Progress()):
        """Extract, chunk and index the uploaded PDFs.

        Args:
            pdf_files: Uploaded files as accepted by ``get_pdf_text``.
            progress: Progress callback; the ``gr.Progress()`` default is
                kept as the parameter default on purpose.

        Returns:
            ``(status_message, processed_file_listing)``; the listing is an
            empty string on failure.
        """
        if not pdf_files:
            return "請上傳至少一個PDF檔案。", ""
        # Each run replaces the previous file list.
        self.processed_files = []
        progress(0, desc="開始處理PDF檔案...")
        progress(0.2, desc="提取PDF文字內容...")
        raw_text, processed_count = self.get_pdf_text(pdf_files)
        if not raw_text.strip():
            return "無法從PDF檔案中提取到文字。", ""
        progress(0.4, desc="分割文字內容...")
        text_chunks = self.get_text_chunks(raw_text)
        progress(0.6, desc="建立向量儲存...")
        success = self.create_vector_store(text_chunks)
        progress(1.0, desc="處理完成!")
        if not success:
            return "❌ PDF處理失敗,請重試。", ""
        file_list = "已處理的檔案:\n" + "\n".join(
            f"• {name}" for name in self.processed_files
        )
        return (
            f"✅ 成功處理 {processed_count} 個PDF檔案!\n總共 {len(text_chunks)} 個文字區塊\n現在您可以開始提問。",
            file_list,
        )

    def clear_data(self):
        """Delete the saved index and reset all in-memory state.

        Returns:
            ``(status_message, "")``; the second element is always empty.
        """
        try:
            if os.path.exists("faiss_index"):
                shutil.rmtree("faiss_index")
            self.vector_store = None
            self.processed_files = []
            self.chat_history = []
            return "✅ 已清除所有處理過的資料!", ""
        except Exception as e:
            return f"❌ 清除資料時發生錯誤:{e}", ""

    def create_docx_report(self, chat_history):
        """Write the chat transcript to a temporary .docx file.

        Args:
            chat_history: Alternating user/assistant message dicts.

        Returns:
            Path of the generated file (the caller owns it; it is not
            deleted automatically), or None if generation failed.
        """
        try:
            doc = Document()
            title = doc.add_heading('PDF聊天機器人 - 問答記錄', 0)
            title.alignment = 1  # 1 = centered (python-docx WD_ALIGN_PARAGRAPH.CENTER)
            doc.add_paragraph(f'產生時間:{datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")}')
            if self.processed_files:
                doc.add_heading('已處理的PDF檔案:', level=2)
                for number, name in enumerate(self.processed_files, 1):
                    doc.add_paragraph(f'{number}. {name}', style='List Number')
                doc.add_paragraph('')
            doc.add_heading('問答記錄:', level=2)
            if not chat_history:
                doc.add_paragraph('目前沒有問答記錄。')
            else:
                for number, question, answer in self._iter_qa_pairs(chat_history):
                    q_paragraph = doc.add_paragraph()
                    q_run = q_paragraph.add_run(f'問題 {number}:')
                    q_run.bold = True
                    # 0.14 in is about 10 pt (0.14 * 72).
                    q_run.font.size = Inches(0.14)
                    q_paragraph.add_run(question)
                    a_paragraph = doc.add_paragraph()
                    a_run = a_paragraph.add_run('回答:')
                    a_run.bold = True
                    a_run.font.size = Inches(0.14)
                    a_paragraph.add_run(answer)
                    doc.add_paragraph('─' * 50)
            # Reserve a unique path and close the handle BEFORE saving: writing
            # to a path that is still open elsewhere fails on Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
                docx_path = temp_file.name
            doc.save(docx_path)
            return docx_path
        except Exception as e:
            print(f"建立docx檔案時發生錯誤:{e}")
            return None

    def create_email_html_content(self, chat_history):
        """Render the chat transcript as an HTML e-mail body.

        Questions, answers and file names come from users, the model and
        uploaded files, so they are HTML-escaped before being embedded in
        the markup; newlines in messages become ``<br>``.

        Args:
            chat_history: Alternating user/assistant message dicts.

        Returns:
            The HTML document as a string, or a single ``<p>`` placeholder
            when there is no history.
        """
        # Function-scope import keeps this block self-contained.
        from html import escape

        if not chat_history:
            return "<p>目前沒有問答記錄。</p>"
        parts = [f"""
<html>
<head>
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; color: #333; }}
.header {{ background-color: #f4f4f4; padding: 20px; text-align: center; }}
.content {{ padding: 20px; }}
.question {{ background-color: #e8f4f8; padding: 10px; margin: 10px 0; border-left: 4px solid #2196F3; }}
.answer {{ background-color: #f0f8e8; padding: 10px; margin: 10px 0; border-left: 4px solid #4CAF50; }}
.file-list {{ background-color: #fff3cd; padding: 10px; margin: 10px 0; border: 1px solid #ffeeba; }}
hr {{ border: none; border-top: 1px solid #ddd; margin: 20px 0; }}
</style>
</head>
<body>
<div class="header">
<h1>🤖 PDF聊天機器人 - 問答記錄</h1>
<p>產生時間:{datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")}</p>
</div>
<div class="content">
"""]
        if self.processed_files:
            parts.append("""
<div class="file-list">
<h3>📁 已處理的PDF檔案:</h3>
<ul>
""")
            parts.extend(f"<li>{escape(str(name))}</li>" for name in self.processed_files)
            parts.append("</ul></div>")
        parts.append("<h3>💬 問答記錄:</h3>")
        for number, question, answer in self._iter_qa_pairs(chat_history):
            # Escape first, then turn newlines into <br> so the tags survive.
            question_html = escape(str(question)).replace('\n', '<br>')
            answer_html = escape(str(answer)).replace('\n', '<br>')
            parts.append(f"""
<div class="question">
<strong>問題 {number}:</strong><br>
{question_html}
</div>
<div class="answer">
<strong>回答:</strong><br>
{answer_html}
</div>
<hr>
""")
        parts.append("""
</div>
</body>
</html>
""")
        return "".join(parts)

    def send_chat_history_email(self, recipient_email):
        """E-mail the stored chat transcript through Resend.

        Args:
            recipient_email: Destination address; surrounding whitespace is
                ignored. Only a minimal check (contains ``@``) is performed.

        Returns:
            A user-facing status message; this method does not raise.
        """
        if not self.chat_history:
            return "❌ 沒有聊天記錄可以發送!"
        recipient_email = (recipient_email or "").strip()
        if "@" not in recipient_email:
            return "❌ 請輸入有效的信箱地址!"
        try:
            html_content = self.create_email_html_content(self.chat_history)
            r = resend.Emails.send({
                "from": "onboarding@resend.dev",
                "to": recipient_email,
                "subject": f"PDF聊天機器人問答記錄 - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
                "html": html_content
            })
            return f"✅ 郵件已成功發送到 {recipient_email}!\n郵件ID: {r.get('id', 'Unknown')}"
        except Exception as e:
            return f"❌ 發送郵件時發生錯誤:{e}"
# Step 5: create the single chatbot instance shared by all Gradio callbacks
# below (its state is module-global, not per browser session).
print("🤖 初始化PDF聊天機器人...")
bot = PDFChatBot()
# Step 6: define the Gradio callback functions
def upload_and_process(files, progress=gr.Progress()):
    """Gradio callback: process the uploaded PDFs with the shared ``bot``.

    Returns whatever ``bot.process_pdfs`` returns for ``files``.
    """
    return bot.process_pdfs(files, progress)
def ask_question(question, history):
    """Gradio callback: answer ``question`` and append the exchange to ``history``.

    A blank question leaves ``history`` untouched. Otherwise the user message
    and the assistant reply are appended to ``history`` in place and a copy
    is stored on the shared ``bot`` for later export.

    Returns:
        ``(history, "")``; the empty string clears the question textbox.
    """
    if question.strip():
        reply = bot.answer_question(question)
        history.extend((
            {"role": "user", "content": question},
            {"role": "assistant", "content": reply},
        ))
        # Snapshot so later export/e-mail does not alias the UI's list.
        bot.chat_history = history.copy()
    return history, ""
def download_chat_history():
    """Return the path of a .docx export of the stored chat, or None.

    None is returned when the shared ``bot`` has no chat history; otherwise
    the result of ``bot.create_docx_report`` is passed through.
    """
    recorded = bot.chat_history
    return bot.create_docx_report(recorded) if recorded else None
def send_email(email_address):
    """Gradio callback: e-mail the stored chat history to ``email_address``.

    Returns the status message produced by ``bot.send_chat_history_email``.
    """
    return bot.send_chat_history_email(email_address)
def clear_chat():
    """Gradio callback: forget the stored chat history.

    Returns ``([], "")``: an empty message list and an empty string for the
    two outputs this callback is bound to.
    """
    bot.chat_history = []
    return [], ""
def clear_all_data():
    """Gradio callback: delegate to ``bot.clear_data`` and return its result."""
    return bot.clear_data()
def load_existing_data():
    """Gradio callback: try to load a previously saved vector index.

    Returns:
        ``(status_message, "")`` where the message reports whether
        ``bot.load_vector_store`` succeeded.
    """
    loaded = bot.load_vector_store()
    status = "✅ 成功載入已處理的資料!" if loaded else "❌ 沒有找到已處理的資料。"
    return status, ""
# Step 7: build the Gradio interface
# NOTE(review): the nesting of the layout blocks below was reconstructed from
# a flattened copy of the file; confirm container placement (in particular of
# download_file and gr.Examples) against the deployed app.
print("🎨 建立使用者介面...")
with gr.Blocks(title="PDF聊天機器人", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown(
        """
# 🤖 PDF聊天機器人 (Flash 2.0 + 郵件發送)
上傳您的PDF檔案,然後就可以向文件提問!支援多語言問答並可將記錄發送到信箱。
**🔥 在Hugging Face中執行**
"""
    )
    # Tab 1: upload PDFs, build / load / clear the index.
    with gr.Tab("📁 檔案處理"):
        with gr.Row():
            with gr.Column(scale=2):
                file_upload = gr.File(
                    file_count="multiple",
                    file_types=[".pdf"],
                    label="上傳PDF檔案",
                    height=200
                )
                with gr.Row():
                    process_btn = gr.Button("🚀 處理PDF檔案", variant="primary", size="lg")
                    load_btn = gr.Button("📂 載入已處理資料", variant="secondary")
                    clear_btn = gr.Button("🗑️ 清除資料", variant="stop")
            with gr.Column(scale=1):
                # Read-only outputs filled by the processing callbacks.
                status_text = gr.Textbox(
                    label="處理狀態",
                    lines=8,
                    interactive=False
                )
                file_list = gr.Textbox(
                    label="已處理檔案",
                    lines=6,
                    interactive=False
                )
    # Tab 2: chat with the indexed documents.
    with gr.Tab("💬 問答聊天"):
        # type="messages": history is a list of {"role", "content"} dicts,
        # the shape ask_question appends.
        chatbot = gr.Chatbot(
            label="聊天記錄",
            height=500,
            show_copy_button=True,
            type="messages"
        )
        with gr.Row():
            question_input = gr.Textbox(
                placeholder="請輸入您的問題...",
                label="問題",
                lines=2,
                scale=4
            )
            ask_btn = gr.Button("📤 提問", variant="primary", scale=1)
        with gr.Row():
            clear_chat_btn = gr.Button("🧹 清除聊天記錄", variant="secondary", scale=1)
            download_btn = gr.Button("📥 下載問答記錄", variant="primary", scale=1)
        # Hidden until handle_download produces a file.
        download_file = gr.File(visible=False)
        gr.Examples(
            examples=[
                "這份文件的主要內容是什麼?",
                "請總結文件的重點。",
                "文件中提到了哪些重要概念?",
                "能否詳細解釋某個特定主題?"
            ],
            inputs=question_input,
            label="問題範例"
        )
    # Tab 3: e-mail the transcript.
    with gr.Tab("📧 郵件發送"):
        gr.Markdown(
            """
### 📮 發送聊天記錄到信箱
將您的問答記錄以精美的HTML格式發送到指定信箱,方便保存和分享。
"""
        )
        with gr.Row():
            with gr.Column(scale=2):
                # NOTE(review): the recipient field is pre-filled with a
                # fixed address; confirm this default is intended.
                email_input = gr.Textbox(
                    label="收件人信箱",
                    placeholder="請輸入有效的信箱地址...",
                    value="grace.chenyiwen@gmail.com"
                )
                send_email_btn = gr.Button("📧 發送聊天記錄", variant="primary", size="lg")
            with gr.Column(scale=1):
                email_status = gr.Textbox(
                    label="發送狀態",
                    lines=6,
                    interactive=False
                )
        gr.Markdown(
            """
**注意事項:**
- 請確保您已經有一些問答記錄
- 郵件將包含所有處理過的PDF檔案清單和完整的問答記錄
- 郵件格式為HTML,在大多數郵件用戶端中都能正常顯示
"""
        )
    # Event handlers
    def handle_download():
        """Generate the .docx export and reveal the download component.

        Shows a warning and keeps the component hidden when
        download_chat_history() returns nothing.
        """
        file_path = download_chat_history()
        if file_path:
            return gr.update(value=file_path, visible=True)
        else:
            gr.Warning("沒有聊天記錄可以下載!")
            return gr.update(visible=False)
    # Bind events to components
    process_btn.click(
        fn=upload_and_process,
        inputs=[file_upload],
        outputs=[status_text, file_list],
        show_progress=True
    )
    load_btn.click(
        fn=load_existing_data,
        outputs=[status_text, file_list]
    )
    clear_btn.click(
        fn=clear_all_data,
        outputs=[status_text, file_list]
    )
    # The button and pressing Enter in the textbox trigger the same callback.
    ask_btn.click(
        fn=ask_question,
        inputs=[question_input, chatbot],
        outputs=[chatbot, question_input]
    )
    question_input.submit(
        fn=ask_question,
        inputs=[question_input, chatbot],
        outputs=[chatbot, question_input]
    )
    clear_chat_btn.click(
        fn=clear_chat,
        outputs=[chatbot, question_input]
    )
    download_btn.click(
        fn=handle_download,
        outputs=download_file
    )
    send_email_btn.click(
        fn=send_email,
        inputs=[email_input],
        outputs=[email_status]
    )
# Step 8: start the application
print("🚀 啟動應用程式中...")
# Try to reuse an index saved by an earlier run; the result is ignored, so a
# missing index is not an error here.
bot.load_vector_store()
# Launch settings for the Hugging Face deployment.
launch_options = {
    "share": True,             # request a public link
    "server_name": "0.0.0.0",  # allow external access
    "server_port": None,
    "show_error": True,
    "debug": True,
}
# NOTE(review): with debug=True, launch() presumably blocks until the server
# stops, so the two messages below would print only at shutdown; confirm.
demo.launch(**launch_options)
print("✅ PDF聊天機器人已成功啟動!")
print("📍 請點選上方顯示的連結來存取應用程式")