Spaces:

mikao007
/

Gemini_longchain_RAG

Runtime error

App Files Files Community

mikao007 committed on Oct 2, 2025

Commit

444ae48

verified ·

1 Parent(s): 783c136

Upload 2 files

Browse files

Files changed (2) hide show

app.py +593 -0
requirements.txt +59 -0

app.py ADDED Viewed

	@@ -0,0 +1,593 @@

+from dotenv import load_dotenv
+import os
+import gradio as gr
+from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
+from langchain_community.vectorstores import FAISS
+from langchain.chains.question_answering import load_qa_chain
+from langchain.prompts import PromptTemplate
+import shutil
+import tempfile
+from docx import Document
+from docx.shared import Inches
+from datetime import datetime
+# Load environment variables
+load_dotenv()
+# Set Gemini API key
+gemini_api_key = "AIzaSyA8zqhqNb-bNYU6KVb0Zj0XIKi3aZfvXE0"
+os.environ["GOOGLE_API_KEY"] = gemini_api_key
+class PDFChatBot:
+    def __init__(self):
+        self.vector_store = None
+        self.embeddings = GoogleGenerativeAIEmbeddings(
+            model="models/text-embedding-004",
+            google_api_key=gemini_api_key
+        )
+        self.processed_files = []
+        self.chat_history = []  # 儲存聊天歷史
+    def get_pdf_text(self, pdf_files):
+        """從多個PDF文件中提取文字"""
+        raw_text = ""
+        processed_count = 0
+        if not pdf_files:
+            return raw_text, processed_count
+        # 處理單個文件和多個文件
+        if not isinstance(pdf_files, list):
+            pdf_files = [pdf_files]
+        for pdf_file in pdf_files:
+            try:
+                # 如果是上傳的文件對象，使用其name屬性
+                pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
+                pdf_reader = PdfReader(pdf_path)
+                file_text = ""
+                for page in pdf_reader.pages:
+                    text = page.extract_text()
+                    if text:
+                        file_text += text + "\n"
+                if file_text.strip():
+                    raw_text += file_text
+                    processed_count += 1
+                    self.processed_files.append(os.path.basename(pdf_path))
+            except Exception as e:
+                print(f"讀取PDF時發生錯誤：{str(e)}")
+                continue
+        return raw_text, processed_count
+    def get_text_chunks(self, text):
+        """將文字分割成區塊進行處理"""
+        text_splitter = CharacterTextSplitter(
+            separator="\n",
+            chunk_size=10000,
+            chunk_overlap=1000,
+            length_function=len
+        )
+        chunks = text_splitter.split_text(text)
+        return chunks
+    def create_vector_store(self, chunks):
+        """從文字區塊創建FAISS向量存儲"""
+        try:
+            self.vector_store = FAISS.from_texts(chunks, self.embeddings)
+            self.vector_store.save_local("faiss_index")
+            return True
+        except Exception as e:
+            print(f"創建向量存儲時發生錯誤：{str(e)}")
+            return False
+    def load_vector_store(self):
+        """載入已存在的向量存儲"""
+        try:
+            if os.path.exists("faiss_index"):
+                self.vector_store = FAISS.load_local(
+                    "faiss_index",
+                    embeddings=self.embeddings,
+                    allow_dangerous_deserialization=True
+                )
+                return True
+            else:
+                return False
+        except Exception as e:
+            print(f"載入向量存儲時發生錯誤：{str(e)}")
+            return False
+    def get_conversational_chain(self, temperature=0.3, max_tokens=4096):
+        """創建對話鏈"""
+        prompt_template = """
+        根據提供的內容盡可能詳細地回答問題。確保提供所有細節。
+        如果你需要更多細節來完美回答問題，那麼請詢問你認為需要了解的更多細節。
+        如果答案不在提供的內容中，只需說"在您提供的內容中找不到答案"。不要提供錯誤的答案。
+        內容:\n {context}\n
+        問題: \n{question}\n
+        回答:
+        """
+        # Using Flash 2.0 model
+        model = ChatGoogleGenerativeAI(
+            model="gemini-2.0-flash-exp",
+            google_api_key=gemini_api_key,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            top_p=0.8,
+            top_k=40
+        )
+        prompt = PromptTemplate(
+            template=prompt_template,
+            input_variables=['context', 'question']
+        )
+        chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
+        return chain
+    def answer_question(self, question, temperature=0.3, max_tokens=4096, search_k=6):
+        """回答用戶問題"""
+        if not self.vector_store:
+            return "請先上傳並處理PDF文件！"
+        if not question.strip():
+            return "請輸入您的問題。"
+        try:
+            # 搜索相關文檔
+            docs = self.vector_store.similarity_search(question, k=search_k)
+            if not docs:
+                return "在上傳的文檔中找不到相關信息。"
+            # 生成回答
+            chain = self.get_conversational_chain(temperature, max_tokens)
+            response = chain(
+                {
+                    "input_documents": docs,
+                    "question": question,
+                },
+                return_only_outputs=True
+            )
+            return response["output_text"]
+        except Exception as e:
+            return f"處理問題時發生錯誤：{str(e)}"
+    def process_pdfs(self, pdf_files, progress=gr.Progress()):
+        """處理PDF文件"""
+        if not pdf_files:
+            return "請上傳至少一個PDF文件。", ""
+        self.processed_files = []
+        progress(0, desc="開始處理PDF文件...")
+        # 提取文字
+        progress(0.2, desc="提取PDF文字內容...")
+        raw_text, processed_count = self.get_pdf_text(pdf_files)
+        if not raw_text.strip():
+            return "無法從PDF文件中提取到文字。", ""
+        progress(0.4, desc="分割文字內容...")
+        # 分割文字
+        text_chunks = self.get_text_chunks(raw_text)
+        progress(0.6, desc="創建向量存儲...")
+        # 創建向量存儲
+        success = self.create_vector_store(text_chunks)
+        progress(1.0, desc="處理完成!")
+        if success:
+            file_list = "已處理的文件:\n" + "\n".join([f"• {file}" for file in self.processed_files])
+            return f"✅ 成功處理 {processed_count} 個PDF文件！\n總共 {len(text_chunks)} 個文字區塊\n現在您可以開始提問。", file_list
+        else:
+            return "❌ PDF處理失敗，請重試。", ""
+    def clear_data(self):
+        """清除處理過的資料"""
+        try:
+            if os.path.exists("faiss_index"):
+                shutil.rmtree("faiss_index")
+            self.vector_store = None
+            self.processed_files = []
+            self.chat_history = []
+            return "✅ 已清除所有處理過的資料！", ""
+        except Exception as e:
+            return f"❌ 清除資料時發生錯誤：{str(e)}", ""
+    def create_docx_report(self, chat_history):
+        """創建包含聊天記錄的docx報告"""
+        try:
+            # 創建新的文檔
+            doc = Document()
+            # 添加標題
+            title = doc.add_heading('PDF聊天機器人 - 問答記錄', 0)
+            title.alignment = 1  # 置中對齊
+            # 添加生成時間
+            doc.add_paragraph(f'生成時間：{datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")}')
+            # 添加處理的文件列表
+            if self.processed_files:
+                doc.add_heading('已處理的PDF文件：', level=2)
+                for i, file in enumerate(self.processed_files, 1):
+                    doc.add_paragraph(f'{i}. {file}', style='List Number')
+            doc.add_paragraph('')  # 空行
+            # 添加問答記錄
+            doc.add_heading('問答記錄：', level=2)
+            if not chat_history:
+                doc.add_paragraph('目前沒有問答記錄。')
+            else:
+                for i in range(0, len(chat_history), 2):
+                    if i + 1 < len(chat_history):
+                        question = chat_history[i]['content']
+                        answer = chat_history[i + 1]['content']
+                        # 問題
+                        q_paragraph = doc.add_paragraph()
+                        q_run = q_paragraph.add_run(f'問題 {(i//2)+1}：')
+                        q_run.bold = True
+                        q_run.font.size = Inches(0.14)
+                        q_paragraph.add_run(question)
+                        # 回答
+                        a_paragraph = doc.add_paragraph()
+                        a_run = a_paragraph.add_run('回答：')
+                        a_run.bold = True
+                        a_run.font.size = Inches(0.14)
+                        a_paragraph.add_run(answer)
+                        # 分隔線
+                        doc.add_paragraph('─' * 50)
+            # 保存到臨時文件
+            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx')
+            doc.save(temp_file.name)
+            temp_file.close()
+            return temp_file.name
+        except Exception as e:
+            print(f"創建docx文件時發生錯誤：{str(e)}")
+            return None
+# Initialize the chatbot: one module-level instance used by the Gradio
+# callback functions defined below.
+bot = PDFChatBot()
+# Gradio 接口函數
+def upload_and_process(files, progress=gr.Progress()):
+    return bot.process_pdfs(files, progress)
+def ask_question(question, history, temperature, max_tokens, search_k):
+    if not question.strip():
+        return history, ""
+    response = bot.answer_question(question, temperature, max_tokens, search_k)
+    # 使用新的消息格式
+    user_msg = {"role": "user", "content": question}
+    assistant_msg = {"role": "assistant", "content": response}
+    history.append(user_msg)
+    history.append(assistant_msg)
+    # 同步更新聊天歷史到bot實例
+    bot.chat_history = history.copy()
+    return history, ""
+def download_chat_history():
+    """下載聊天記錄為docx文件"""
+    if not bot.chat_history:
+        return None
+    docx_path = bot.create_docx_report(bot.chat_history)
+    return docx_path
+def export_to_word():
+    """匯出問答記錄為Word文件"""
+    if not bot.chat_history:
+        return None
+    docx_path = bot.create_docx_report(bot.chat_history)
+    return docx_path
+def clear_chat():
+    """清除聊天記錄"""
+    bot.chat_history = []
+    return [], ""
+def clear_all_data():
+    return bot.clear_data()
+def load_existing_data():
+    if bot.load_vector_store():
+        return "✅ 成功載入已處理的資料！", ""
+    else:
+        return "❌ 沒有找到已處理的資料。", ""
+# Build the custom theme: Gradio's Soft theme with blue/gray/slate hues,
+# the "Noto Sans TC" Google font for text and "JetBrains Mono" for
+# monospace text.
+custom_theme = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="gray",
+    neutral_hue="slate",
+    font=gr.themes.GoogleFont("Noto Sans TC"),
+    font_mono=gr.themes.GoogleFont("JetBrains Mono")
+)
+# 創建 Gradio 介面
+with gr.Blocks(
+    title="PDF智能問答系統",
+    theme=custom_theme,
+    css="""
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: auto !important;
+    }
+    .main-header {
+        text-align: center;
+        padding: 20px;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        border-radius: 10px;
+        margin-bottom: 20px;
+    }
+    .status-box {
+        background-color: #f8f9fa;
+        border-left: 4px solid #007bff;
+        padding: 15px;
+        border-radius: 5px;
+    }
+    .file-info {
+        background-color: #e8f5e8;
+        border-left: 4px solid #28a745;
+        padding: 10px;
+        border-radius: 5px;
+    }
+    """
+) as demo:
+    # 主標題區域
+    with gr.Row():
+        gr.HTML("""
+        <div class="main-header">
+            <h1>🤖 PDF智能問答系統</h1>
+            <p>基於 Gemini 2.0 Flash 的 RAG 技術 | 支持多語言問答</p>
+        </div>
+        """)
+    # 主要功能區域
+    with gr.Tab("📁 文件管理", id="file_tab"):
+        with gr.Row():
+            with gr.Column(scale=3):
+                # 文件上傳區域
+                with gr.Group():
+                    gr.Markdown("### 📤 上傳PDF文件")
+                    file_upload = gr.File(
+                        file_count="multiple",
+                        file_types=[".pdf"],
+                        label="選擇PDF文件",
+                        height=150,
+                        file_count="multiple"
+                    )
+                    # 處理選項
+                    with gr.Row():
+                        process_btn = gr.Button(
+                            "🚀 開始處理",
+                            variant="primary",
+                            size="lg",
+                            scale=2
+                        )
+                        load_btn = gr.Button(
+                            "📂 載入已處理資料",
+                            variant="secondary",
+                            scale=1
+                        )
+                        clear_btn = gr.Button(
+                            "🗑️ 清除所有資料",
+                            variant="stop",
+                            scale=1
+                        )
+            with gr.Column(scale=2):
+                # 狀態顯示區域
+                with gr.Group():
+                    gr.Markdown("### 📊 處理狀態")
+                    status_text = gr.Textbox(
+                        label="處理進度",
+                        lines=6,
+                        interactive=False,
+                        elem_classes=["status-box"]
+                    )
+                    # 文件列表
+                    gr.Markdown("### 📋 已處理文件")
+                    file_list = gr.Textbox(
+                        label="文件清單",
+                        lines=8,
+                        interactive=False,
+                        elem_classes=["file-info"]
+                    )
+    with gr.Tab("💬 智能問答", id="chat_tab"):
+        with gr.Row():
+            with gr.Column(scale=4):
+                # 聊天區域
+                chatbot = gr.Chatbot(
+                    label="💬 對話記錄",
+                    height=600,
+                    show_copy_button=True,
+                    type="messages",
+                    avatar_images=["👤", "🤖"],
+                    bubble_full_width=False
+                )
+            with gr.Column(scale=1):
+                # 側邊欄功能
+                with gr.Group():
+                    gr.Markdown("### ⚙️ 問答設定")
+                    # 模型參數調整
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.3,
+                        step=0.1,
+                        label="創意度 (Temperature)",
+                        info="數值越高回答越有創意"
+                    )
+                    max_tokens = gr.Slider(
+                        minimum=512,
+                        maximum=8192,
+                        value=4096,
+                        step=512,
+                        label="最大回答長度",
+                        info="控制回答的詳細程度"
+                    )
+                    search_k = gr.Slider(
+                        minimum=2,
+                        maximum=10,
+                        value=6,
+                        step=1,
+                        label="檢索文檔數量",
+                        info="搜索相關文檔的數量"
+                    )
+        # 輸入區域
+        with gr.Row():
+            question_input = gr.Textbox(
+                placeholder="請輸入您的問題... (支援中文、英文等多語言)",
+                label="💭 問題輸入",
+                lines=3,
+                scale=4,
+                max_lines=5
+            )
+            ask_btn = gr.Button(
+                "📤 發送問題",
+                variant="primary",
+                scale=1,
+                size="lg"
+            )
+        # 快捷操作
+        with gr.Row():
+            clear_chat_btn = gr.Button(
+                "🧹 清除對話",
+                variant="secondary",
+                scale=1
+            )
+            download_btn = gr.Button(
+                "📥 下載問答記錄",
+                variant="primary",
+                scale=1
+            )
+            export_btn = gr.Button(
+                "📄 匯出為Word",
+                variant="secondary",
+                scale=1
+            )
+        # 問題範例
+        with gr.Group():
+            gr.Markdown("### 💡 問題範例")
+            gr.Examples(
+                examples=[
+                    "這份文檔的主要內容是什麼？",
+                    "請總結文檔的重點和關鍵概念",
+                    "文檔中提到了哪些重要數據或統計？",
+                    "能否詳細解釋某個特定主題或概念？",
+                    "文檔的結論是什麼？",
+                    "有哪些重要的建議或建議？",
+                    "文檔中提到了哪些風險或挑戰？",
+                    "請比較文檔中提到的不同觀點"
+                ],
+                inputs=question_input,
+                label="點擊範例快速填入"
+            )
+    # 隱藏的文件下載組件
+    download_file = gr.File(visible=False)
+    # 下載功能處理函數
+    def handle_download():
+        file_path = download_chat_history()
+        if file_path:
+            return gr.update(value=file_path, visible=True)
+        else:
+            gr.Warning("沒有聊天記錄可以下載！")
+            return gr.update(visible=False)
+    # 事件處理
+    process_btn.click(
+        fn=upload_and_process,
+        inputs=[file_upload],
+        outputs=[status_text, file_list],
+        show_progress=True
+    )
+    load_btn.click(
+        fn=load_existing_data,
+        outputs=[status_text, file_list]
+    )
+    clear_btn.click(
+        fn=clear_all_data,
+        outputs=[status_text, file_list]
+    )
+    ask_btn.click(
+        fn=ask_question,
+        inputs=[question_input, chatbot, temperature, max_tokens, search_k],
+        outputs=[chatbot, question_input]
+    )
+    question_input.submit(
+        fn=ask_question,
+        inputs=[question_input, chatbot, temperature, max_tokens, search_k],
+        outputs=[chatbot, question_input]
+    )
+    clear_chat_btn.click(
+        fn=clear_chat,
+        outputs=[chatbot, question_input]
+    )
+    download_btn.click(
+        fn=handle_download,
+        outputs=download_file
+    )
+    export_btn.click(
+        fn=export_to_word,
+        outputs=download_file
+    )
+if __name__ == "__main__":
+    # 嘗試載入現有的向量存儲
+    bot.load_vector_store()
+    # 啟動應用
+    demo.launch(
+        share=False,  # 設為 True 可獲得公共連結
+        server_name="127.0.0.1",  # 本地訪問
+        server_port=None,  # 自動選擇可用端���
+        show_error=True,
+        inbrowser=True  # 自動打開瀏覽器
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,59 @@

+# PDF 智能問答系統 - 依賴套件清單
+# 基於 Gemini 2.0 Flash 的 RAG 技術
+# ===== 核心框架 =====
+gradio>=4.0.0                    # Web 介面框架
+langchain>=0.1.0                 # LangChain 核心
+langchain-community>=0.0.20      # LangChain 社群擴展
+langchain-google-genai>=1.0.0    # Google Gemini 整合
+# ===== Google AI 服務 =====
+google-generativeai>=0.3.0      # Google Gemini API
+# ===== PDF 處理 =====
+PyPDF2>=3.0.0                   # PDF 文字提取
+# ===== 向量資料庫 =====
+faiss-cpu>=1.7.4                # FAISS 向量搜尋 (CPU 版本)
+# faiss-gpu>=1.7.4              # 如果使用 GPU，請取消註解此行並註解上行
+# ===== 文檔處理 =====
+python-docx>=0.8.11             # Word 文檔生成
+# ===== 環境和配置 =====
+python-dotenv>=1.0.0            # 環境變數管理
+# ===== 數值計算和文字處理 =====
+numpy>=1.24.0                   # 數值計算
+tiktoken>=0.5.0                 # OpenAI tokenizer
+# ===== HTTP 和網路 =====
+requests>=2.31.0                # HTTP 請求
+# ===== 工具和輔助 =====
+tqdm>=4.65.0                    # 進度條
+pydantic>=2.0.0                 # 資料驗證
+# ===== 可選增強套件 =====
+# 如果需要更強的 PDF 處理能力，可以選擇以下之一：
+# pymupdf>=1.23.0               # MuPDF Python 綁定，處理能力更強
+# pdfplumber>=0.9.0             # 另一個 PDF 處理選項
+# 如果需要更好的文字嵌入：
+# sentence-transformers>=2.2.0  # 更好的嵌入模型
+# 如果需要更好的文字分割：
+# spacy>=3.7.0                  # 自然語言處理
+# nltk>=3.8.0                  # 自然語言工具包
+# ===== 開發工具 (可選) =====
+# pytest>=7.4.0                # 測試框架
+# black>=23.0.0                 # 程式碼格式化
+# flake8>=6.0.0                 # 程式碼檢查
+# jupyter>=1.0.0                # Jupyter notebook
+# ===== 系統需求 =====
+# Python >= 3.8
+# 建議使用 Python 3.9 或更高版本
+# 記憶體建議 8GB 以上
+# 硬碟空間建議 2GB 以上用於模型和索引