Gemini_longchain_RAG

Runtime error

App Files Files Community

mikao007 committed on Oct 2, 2025

Commit

c242ec7

verified ·

1 Parent(s): 3bbf387

Upload G_L_RAG.py

Browse files

Files changed (1) hide show

G_L_RAG.py +697 -0

G_L_RAG.py ADDED Viewed

	@@ -0,0 +1,697 @@

+from dotenv import load_dotenv
+import os
+import gradio as gr
+from PyPDF2 import PdfReader
+import google.generativeai as genai
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
+from langchain_community.vectorstores import FAISS
+from langchain.chains.question_answering import load_qa_chain
+from langchain.prompts import PromptTemplate
+import shutil
+import tempfile
+from docx import Document
+from docx.shared import Inches
+from datetime import datetime
+# Load environment variables
+load_dotenv()
+# 延後讀取 API 金鑰：提供工具函式，實際需要時才讀取
def _get_api_key() -> str:
    """Return the first non-empty API key found in the environment.

    The environment variables are checked in a fixed priority order. When a
    key is found it is also written back to ``GOOGLE_API_KEY`` so that
    libraries reading only that variable see the same value. Returns an empty
    string when none of the variables holds a non-blank value.
    """
    env_names = (
        "GOOGLE_API_KEY",
        "GEMINI_API_KEY",
        "GOOGLE_GENAI_API_KEY",
        "GENAI_API_KEY",
    )
    # Lazily walk the candidates and stop at the first non-blank value.
    stripped_values = (os.getenv(name, "").strip() for name in env_names)
    found = next((value for value in stripped_values if value), "")
    if found:
        # Mirror the key into GOOGLE_API_KEY for the underlying packages.
        os.environ["GOOGLE_API_KEY"] = found
    return found
class PDFChatBot:
    """Question answering over uploaded PDFs using a FAISS index and Gemini.

    Text is extracted from PDFs (locally with PyPDF2 or remotely through
    Gemini), split into chunks, embedded, and stored in a FAISS index that is
    persisted on disk. Questions are answered by retrieving similar chunks and
    passing them to a Gemini chat model.
    """

    # Folder (relative to the working directory) where the FAISS index is saved.
    INDEX_DIR = "faiss_index"
    # Embedding model used when building and when loading the index.
    EMBEDDING_MODEL = "models/text-embedding-004"
    # Gemini model used for PDF transcription and for answering questions.
    CHAT_MODEL = "gemini-2.0-flash-exp"

    def __init__(self):
        # FAISS vector store; None until PDFs are processed or an index is loaded.
        self.vector_store = None
        # Embedding client, created lazily the first time it is needed.
        self.embeddings = None
        # Base names of the PDFs that yielded text in the latest run.
        self.processed_files = []
        # Chat history as a list of {"role": ..., "content": ...} messages.
        self.chat_history = []

    @staticmethod
    def _as_file_list(pdf_files):
        """Normalise nothing, a single upload, or several uploads into a list."""
        if not pdf_files:
            return []
        if isinstance(pdf_files, (list, tuple)):
            return list(pdf_files)
        return [pdf_files]

    @staticmethod
    def _path_of(pdf_file):
        """Return the path of an uploaded file object, or the value itself."""
        return pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

    def _ensure_embeddings(self):
        """Create the embedding client on first use.

        Returns:
            True when ``self.embeddings`` is ready, False when no API key is
            configured.
        """
        if self.embeddings is not None:
            return True
        api_key = _get_api_key()
        if not api_key:
            return False
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model=self.EMBEDDING_MODEL,
            google_api_key=api_key
        )
        return True

    def get_pdf_text(self, pdf_files):
        """Extract text from one or more PDF files with PyPDF2.

        Args:
            pdf_files: A path, an uploaded file object with a ``name``
                attribute, or a list of either. Falsy input yields no text.

        Returns:
            A ``(text, count)`` tuple: the concatenated text of every file
            that produced non-blank text, and how many files did. Files that
            fail to parse are logged and skipped.
        """
        collected = []
        processed_count = 0
        for pdf_file in self._as_file_list(pdf_files):
            try:
                pdf_path = self._path_of(pdf_file)
                reader = PdfReader(pdf_path)
                page_texts = (page.extract_text() for page in reader.pages)
                file_text = "".join(text + "\n" for text in page_texts if text)
                if file_text.strip():
                    collected.append(file_text)
                    processed_count += 1
                    self.processed_files.append(os.path.basename(pdf_path))
            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                print(f"讀取PDF時發生錯誤：{str(e)}")
                continue
        return "".join(collected), processed_count

    def get_pdf_text_via_gemini(self, pdf_files):
        """Extract PDF text by uploading each file to Gemini (Files API).

        Args:
            pdf_files: Same accepted shapes as :meth:`get_pdf_text`.

        Returns:
            A ``(text, count)`` tuple like :meth:`get_pdf_text`. Returns
            ``("", 0)`` when there are no files or no API key is configured.
        """
        files = self._as_file_list(pdf_files)
        if not files:
            return "", 0
        api_key = _get_api_key()
        if not api_key:
            return "", 0
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(self.CHAT_MODEL)
        prompt = (
            "請從此 PDF 中提取可讀文字，按頁面順序輸出純文字。"
        )
        collected = []
        processed_count = 0
        for pdf_file in files:
            uploaded = None
            try:
                pdf_path = self._path_of(pdf_file)
                uploaded = genai.upload_file(pdf_path)
                resp = model.generate_content([uploaded, prompt])
                text = resp.text or ""
                if text.strip():
                    collected.append(text + "\n")
                    processed_count += 1
                    self.processed_files.append(os.path.basename(pdf_path))
            except Exception as e:
                # Best effort: one failing file must not abort the batch.
                print(f"使用Gemini解析PDF時發生錯誤：{str(e)}")
            finally:
                # Remove the remote copy so user documents do not linger in
                # the Files API once the text has been extracted.
                if uploaded is not None:
                    try:
                        genai.delete_file(uploaded.name)
                    except Exception as cleanup_error:
                        print(f"刪除Gemini暫存檔時發生錯誤：{str(cleanup_error)}")
        return "".join(collected), processed_count

    def get_text_chunks(self, text):
        """Split text into overlapping newline-delimited chunks."""
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=10000,
            chunk_overlap=1000,
            length_function=len
        )
        return text_splitter.split_text(text)

    def create_vector_store(self, chunks):
        """Build a FAISS index from text chunks and persist it to disk.

        Returns:
            True on success; False when there are no chunks, no API key is
            configured, or building/saving the index fails (the error is
            logged).
        """
        if not chunks:
            return False
        try:
            if not self._ensure_embeddings():
                return False
            self.vector_store = FAISS.from_texts(chunks, self.embeddings)
            self.vector_store.save_local(self.INDEX_DIR)
            return True
        except Exception as e:
            print(f"創建向量存儲時發生錯誤：{str(e)}")
            return False

    def load_vector_store(self):
        """Load a previously saved FAISS index from disk.

        Returns:
            True when the index was loaded; False when it does not exist, no
            API key is configured, or loading fails (the error is logged).
        """
        try:
            if not os.path.exists(self.INDEX_DIR):
                return False
            if not self._ensure_embeddings():
                return False
            # SECURITY NOTE: allow_dangerous_deserialization unpickles the
            # index metadata. Only load an index directory this application
            # wrote itself; never point it at untrusted files.
            self.vector_store = FAISS.load_local(
                self.INDEX_DIR,
                embeddings=self.embeddings,
                allow_dangerous_deserialization=True
            )
            return True
        except Exception as e:
            print(f"載入向量存儲時發生錯誤：{str(e)}")
            return False

    def get_conversational_chain(self, temperature=0.3, max_tokens=4096):
        """Build the "stuff" question-answering chain backed by Gemini.

        Args:
            temperature: Sampling temperature passed to the chat model.
            max_tokens: Maximum number of tokens in the answer.

        Raises:
            RuntimeError: If no API key is configured.
        """
        prompt_template = """
        根據提供的內容盡可能詳細地回答問題。確保提供所有細節。
        如果你需要更多細節來完美回答問題，那麼請詢問你認為需要了解的更多細節。
        如果答案不在提供的內容中，只需說"在您提供的內容中找不到答案"。不要提供錯誤的答案。
        內容:\n {context}\n
        問題: \n{question}\n
        回答:
        """
        # The API key is read lazily so it can be supplied after start-up.
        api_key = _get_api_key()
        if not api_key:
            raise RuntimeError("尚未設定 API 金鑰，請於部署後設定 GOOGLE_API_KEY 再重試。")
        model = ChatGoogleGenerativeAI(
            model=self.CHAT_MODEL,
            google_api_key=api_key,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=0.8,
            top_k=40
        )
        prompt = PromptTemplate(
            template=prompt_template,
            input_variables=['context', 'question']
        )
        return load_qa_chain(model, chain_type="stuff", prompt=prompt)

    def answer_question(self, question, temperature=0.3, max_tokens=4096, search_k=6):
        """Answer a question from the indexed documents.

        Args:
            question: The user's question; ``None`` or blank is rejected.
            temperature: Sampling temperature for the chat model.
            max_tokens: Maximum answer length in tokens.
            search_k: Number of chunks to retrieve from the index.

        Returns:
            The answer text, or a user-facing message when the index is
            missing, the question is blank, nothing is retrieved, or an
            error occurs.
        """
        if self.vector_store is None:
            return "請先上傳並處理PDF文件！"
        if not (question or "").strip():
            return "請輸入您的問題。"
        try:
            # UI sliders may deliver floats; the index search and the model
            # expect integers.
            docs = self.vector_store.similarity_search(question, k=int(search_k))
            if not docs:
                return "在上傳的文檔中找不到相關信息。"
            chain = self.get_conversational_chain(temperature, int(max_tokens))
            # invoke() replaces the deprecated direct chain(...) call.
            response = chain.invoke(
                {
                    "input_documents": docs,
                    "question": question,
                },
                return_only_outputs=True
            )
            return response["output_text"]
        except Exception as e:
            return f"處理問題時發生錯誤：{str(e)}"

    def process_pdfs(self, pdf_files, progress=None, use_gemini=False):
        """Extract, chunk, embed and index the uploaded PDFs.

        Args:
            pdf_files: Uploaded file(s); see :meth:`get_pdf_text`.
            progress: Progress callback called as ``progress(fraction, desc=...)``.
                When omitted a ``gr.Progress()`` is created on demand.
            use_gemini: Extract text through Gemini instead of PyPDF2.

        Returns:
            A ``(status_message, file_list_text)`` tuple for the UI.
        """
        if not pdf_files:
            return "請上傳至少一個PDF文件。", ""
        # Fail early with a precise message instead of a generic failure when
        # the step that needs the API key cannot possibly succeed.
        if (use_gemini or self.embeddings is None) and not _get_api_key():
            return "❌ 尚未設定 API 金鑰，請先設定 GOOGLE_API_KEY 再重試。", ""
        if progress is None:
            progress = gr.Progress()
        self.processed_files = []
        progress(0, desc="開始處理PDF文件...")
        progress(0.2, desc="提取PDF文字內容...")
        extract = self.get_pdf_text_via_gemini if use_gemini else self.get_pdf_text
        raw_text, processed_count = extract(pdf_files)
        if not raw_text.strip():
            return "無法從PDF文件中提取到文字。", ""
        progress(0.4, desc="分割文字內容...")
        text_chunks = self.get_text_chunks(raw_text)
        progress(0.6, desc="創建向量存儲...")
        if not self.create_vector_store(text_chunks):
            return "❌ PDF處理失敗，請重試。", ""
        progress(1.0, desc="處理完成!")
        file_list = "已處理的文件:\n" + "\n".join([f"• {file}" for file in self.processed_files])
        return f"✅ 成功處理 {processed_count} 個PDF文件！\n總共 {len(text_chunks)} 個文字區塊\n現在您可以開始提問。", file_list

    def clear_data(self):
        """Delete the saved index and reset the in-memory state.

        Returns:
            A ``(status_message, "")`` tuple for the UI.
        """
        try:
            if os.path.exists(self.INDEX_DIR):
                shutil.rmtree(self.INDEX_DIR)
            self.vector_store = None
            self.processed_files = []
            self.chat_history = []
            return "✅ 已清除所有處理過的資料！", ""
        except Exception as e:
            return f"❌ 清除資料時發生錯誤：{str(e)}", ""

    def create_docx_report(self, chat_history):
        """Write the chat history to a temporary .docx file.

        Args:
            chat_history: Messages as ``{"role", "content"}`` dicts, taken in
                consecutive (question, answer) pairs; a trailing unpaired
                message is ignored.

        Returns:
            The path of the generated file, or None when generation fails
            (the error is logged).
        """
        try:
            # Imported here so the block does not depend on extra
            # module-level imports.
            from docx.enum.text import WD_ALIGN_PARAGRAPH
            from docx.shared import Pt

            doc = Document()
            title = doc.add_heading('PDF聊天機器人 - 問答記錄', 0)
            title.alignment = WD_ALIGN_PARAGRAPH.CENTER
            doc.add_paragraph(f'生成時間：{datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")}')
            if self.processed_files:
                doc.add_heading('已處理的PDF文件：', level=2)
                for file in self.processed_files:
                    # The 'List Number' style numbers the items itself, so no
                    # manual "1." prefix is added.
                    doc.add_paragraph(f'{file}', style='List Number')
            doc.add_paragraph('')  # blank spacer line
            doc.add_heading('問答記錄：', level=2)
            if not chat_history:
                doc.add_paragraph('目前沒有問答記錄。')
            else:
                label_size = Pt(10)
                pairs = zip(chat_history[0::2], chat_history[1::2])
                for number, (user_msg, bot_msg) in enumerate(pairs, 1):
                    q_paragraph = doc.add_paragraph()
                    q_run = q_paragraph.add_run(f'問題 {number}：')
                    q_run.bold = True
                    q_run.font.size = label_size
                    q_paragraph.add_run(user_msg['content'])
                    a_paragraph = doc.add_paragraph()
                    a_run = a_paragraph.add_run('回答：')
                    a_run.bold = True
                    a_run.font.size = label_size
                    a_paragraph.add_run(bot_msg['content'])
                    # Visual separator between question/answer pairs.
                    doc.add_paragraph('─' * 50)
            # Reserve a unique path and close the handle before saving:
            # writing to a still-open temporary file fails on Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
                report_path = temp_file.name
            doc.save(report_path)
            return report_path
        except Exception as e:
            print(f"創建docx文件時發生錯誤：{str(e)}")
            return None
# Shared chatbot instance used by the Gradio callbacks
bot = PDFChatBot()


# Gradio callback functions
def upload_and_process(files, use_gemini=False, progress=gr.Progress()):
    """Process the uploaded PDFs and return ``(status, file list)`` for the UI.

    The ``gr.Progress()`` default is kept in the signature and forwarded
    to the bot unchanged.
    """
    status_and_files = bot.process_pdfs(
        files,
        progress=progress,
        use_gemini=use_gemini,
    )
    return status_and_files
def ask_question(question, history, temperature, max_tokens, search_k):
    """Answer a question and append the exchange to the chat history.

    Args:
        question: Text typed by the user; ``None`` or blank input is ignored.
        history: Current chat messages (``{"role", "content"}`` dicts), or
            ``None`` when the chat is still empty.
        temperature: Sampling temperature forwarded to the bot.
        max_tokens: Maximum answer length forwarded to the bot.
        search_k: Number of chunks to retrieve, forwarded to the bot.

    Returns:
        ``(updated_history, "")`` — the empty string clears the input box.
    """
    # Work on a copy so the caller's list is never mutated, and tolerate a
    # missing history.
    history = list(history or [])
    if not (question or "").strip():
        return history, ""
    response = bot.answer_question(question, temperature, max_tokens, search_k)
    # Messages use the role/content format
    history.append({"role": "user", "content": question})
    history.append({"role": "assistant", "content": response})
    # Keep the bot's own copy in sync for the report export
    bot.chat_history = list(history)
    return history, ""
def download_chat_history():
    """Create a .docx report of the chat history.

    Returns:
        The path of the generated file, or None when there is no chat
        history or the report could not be created.
    """
    if not bot.chat_history:
        return None
    return bot.create_docx_report(bot.chat_history)


def export_to_word():
    """Export the chat history as a Word file.

    Behaves exactly like :func:`download_chat_history`; it delegates to it
    so the two entry points cannot drift apart.
    """
    return download_chat_history()
def clear_chat():
    """Forget the stored chat history and return empty chat/input values."""
    bot.chat_history = []
    empty_history, empty_input = [], ""
    return empty_history, empty_input
def clear_all_data():
    """Ask the bot to clear its data and pass on its two-part result."""
    status, file_listing = bot.clear_data()
    return status, file_listing
def load_existing_data():
    """Load a previously saved vector store and report the outcome.

    Returns a ``(status_message, "")`` tuple.
    """
    loaded = bot.load_vector_store()
    message = "✅ 成功載入已處理的資料！" if loaded else "❌ 沒有找到已處理的資料。"
    return message, ""
def set_api_key(api_key: str):
    """Set or replace the Google Gemini API key for this process.

    The key is stored only in the ``GOOGLE_API_KEY`` environment variable of
    the running process; nothing is written to disk.

    Args:
        api_key: The key pasted by the user; ``None`` or blank is rejected.

    Returns:
        A status message for the UI.
    """
    key = (api_key or "").strip()
    if not key:
        return "❌ 未輸入任何金鑰。請貼上有效的 GOOGLE_API_KEY。"
    os.environ["GOOGLE_API_KEY"] = key
    # Drop the cached embedding client so the next use is created with the
    # new key. This is a plain attribute assignment, so there is nothing to
    # catch; a failure here would be a real bug and should surface.
    # NOTE(review): a vector store that is already loaded presumably keeps
    # the embedding client it was built with — confirm whether it should be
    # reloaded after a key change.
    bot.embeddings = None
    return "✅ 已設定 API 金鑰（僅本次執行期間有效）。"
# Custom theme for the interface
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="gray",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Noto Sans TC"),
    font_mono=gr.themes.GoogleFont("JetBrains Mono")
)
# Build the Gradio interface: a file-management tab and a chat tab, followed
# by the event wiring that connects the widgets to the callback functions.
with gr.Blocks(
    title="PDF智能問答系統",
    theme=custom_theme,
    css="""
    .gradio-container {
        max-width: 1200px !important;
        margin: auto !important;
    }
    .main-header {
        text-align: center;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        margin-bottom: 20px;
    }
    .status-box {
        background-color: #f8f9fa;
        border-left: 4px solid #007bff;
        padding: 15px;
        border-radius: 5px;
    }
    .file-info {
        background-color: #e8f5e8;
        border-left: 4px solid #28a745;
        padding: 10px;
        border-radius: 5px;
    }
    """
) as demo:
    # Page header
    with gr.Row():
        gr.HTML("""
        <div class="main-header">
            <h1>🤖 PDF智能問答系統</h1>
            <p>基於 Gemini 2.0 Flash 的 RAG 技術 | 支持多語言問答</p>
        </div>
        """)
    # File-management tab
    with gr.Tab("📁 文件管理", id="file_tab"):
        with gr.Row():
            with gr.Column(scale=3):
                # Upload area. Every widget below lives inside this group;
                # the upload widget, the checkbox and the button row were
                # previously mis-indented, which made the module fail with
                # an IndentationError.
                with gr.Group():
                    gr.Markdown("### 📤 上傳PDF文件")
                    api_key_box = gr.Textbox(
                        label="Google API Key (可選：部署後可在此貼上)",
                        placeholder="以 sk- 或 AIza 開頭的金鑰（不會儲存到硬碟）",
                        type="password"
                    )
                    set_key_btn = gr.Button("🔑 設定 API 金鑰")
                    file_upload = gr.File(
                        file_count="multiple",
                        file_types=[".pdf"],
                        label="選擇PDF文件",
                        height=150
                    )
                    use_gemini_toggle = gr.Checkbox(label="使用 Gemini 解析 PDF（支援掃描影像）", value=False)
                    # Processing actions
                    with gr.Row():
                        process_btn = gr.Button(
                            "🚀 開始處理",
                            variant="primary",
                            size="lg",
                            scale=2
                        )
                        load_btn = gr.Button(
                            "📂 載入已處理資料",
                            variant="secondary",
                            scale=1
                        )
                        clear_btn = gr.Button(
                            "🗑️ 清除所有資料",
                            variant="stop",
                            scale=1
                        )
            with gr.Column(scale=2):
                # Status display
                with gr.Group():
                    gr.Markdown("### 📊 處理狀態")
                    status_text = gr.Textbox(
                        label="處理進度",
                        lines=6,
                        interactive=False,
                        elem_classes=["status-box"]
                    )
                    # Processed file list
                    gr.Markdown("### 📋 已處理文件")
                    file_list = gr.Textbox(
                        label="文件清單",
                        lines=8,
                        interactive=False,
                        elem_classes=["file-info"]
                    )
    # Chat tab
    with gr.Tab("💬 智能問答", id="chat_tab"):
        with gr.Row():
            with gr.Column(scale=4):
                # Conversation area. No avatar_images are passed: that
                # option expects image file paths or URLs, not emoji text.
                chatbot = gr.Chatbot(
                    label="💬 對話記錄",
                    height=600,
                    show_copy_button=True,
                    type="messages"
                )
            with gr.Column(scale=1):
                # Sidebar with the answer settings
                with gr.Group():
                    gr.Markdown("### ⚙️ 問答設定")
                    # Model parameters
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.3,
                        step=0.1,
                        label="創意度 (Temperature)",
                        info="數值越高回答越有創意"
                    )
                    max_tokens = gr.Slider(
                        minimum=512,
                        maximum=8192,
                        value=4096,
                        step=512,
                        label="最大回答長度",
                        info="控制回答的詳細程度"
                    )
                    search_k = gr.Slider(
                        minimum=2,
                        maximum=10,
                        value=6,
                        step=1,
                        label="檢索文檔數量",
                        info="搜索相關文檔的數量"
                    )
        # Question input
        with gr.Row():
            question_input = gr.Textbox(
                placeholder="請輸入您的問題... (支援中文、英文等多語言)",
                label="💭 問題輸入",
                lines=3,
                scale=4,
                max_lines=5
            )
            ask_btn = gr.Button(
                "📤 發送問題",
                variant="primary",
                scale=1,
                size="lg"
            )
        # Quick actions
        with gr.Row():
            clear_chat_btn = gr.Button(
                "🧹 清除對話",
                variant="secondary",
                scale=1
            )
            download_btn = gr.Button(
                "📥 下載問答記錄",
                variant="primary",
                scale=1
            )
            export_btn = gr.Button(
                "📄 匯出為Word",
                variant="secondary",
                scale=1
            )
        # Example questions
        with gr.Group():
            gr.Markdown("### 💡 問題範例")
            gr.Examples(
                examples=[
                    "這份文檔的主要內容是什麼？",
                    "請總結文檔的重點和關鍵概念",
                    "文檔中提到了哪些重要數據或統計？",
                    "能否詳細解釋某個特定主題或概念？",
                    "文檔的結論是什麼？",
                    "有哪些重要的建議或建議？",
                    "文檔中提到了哪些風險或挑戰？",
                    "請比較文檔中提到的不同觀點"
                ],
                inputs=question_input,
                label="點擊範例快速填入"
            )
    # File component that stays hidden until a report is ready
    download_file = gr.File(visible=False)

    def _reveal_report(file_path):
        """Show the download component for a report path, or warn if none."""
        if file_path:
            return gr.update(value=file_path, visible=True)
        gr.Warning("沒有聊天記錄可以下載！")
        return gr.update(visible=False)

    def handle_download():
        """Create the chat report and reveal it for download."""
        return _reveal_report(download_chat_history())

    def handle_export():
        """Create the Word export and reveal it for download."""
        # Returning only a path would update the hidden component without
        # ever making it visible, so the export goes through the same
        # reveal step as the download button.
        return _reveal_report(export_to_word())

    # Event wiring
    process_btn.click(
        fn=upload_and_process,
        inputs=[file_upload, use_gemini_toggle],
        outputs=[status_text, file_list],
        show_progress=True
    )
    set_key_btn.click(
        fn=set_api_key,
        inputs=[api_key_box],
        outputs=[status_text]
    )
    load_btn.click(
        fn=load_existing_data,
        outputs=[status_text, file_list]
    )
    clear_btn.click(
        fn=clear_all_data,
        outputs=[status_text, file_list]
    )
    ask_btn.click(
        fn=ask_question,
        inputs=[question_input, chatbot, temperature, max_tokens, search_k],
        outputs=[chatbot, question_input]
    )
    question_input.submit(
        fn=ask_question,
        inputs=[question_input, chatbot, temperature, max_tokens, search_k],
        outputs=[chatbot, question_input]
    )
    clear_chat_btn.click(
        fn=clear_chat,
        outputs=[chatbot, question_input]
    )
    download_btn.click(
        fn=handle_download,
        outputs=download_file
    )
    export_btn.click(
        fn=handle_export,
        outputs=download_file
    )
def _resolve_port(raw_value, default=7860):
    """Parse a TCP port from an environment value.

    Args:
        raw_value: The raw string (or None) read from the environment.
        default: Port returned when the value is missing or unusable.

    Returns:
        The parsed port when it is an integer in 1..65535, else ``default``.
    """
    if raw_value is None:
        return default
    try:
        # int() is the source of truth: str.isdigit() accepts characters
        # such as "²" that int() rejects, and rejects padded values that
        # int() accepts.
        port = int(str(raw_value).strip())
    except ValueError:
        return default
    return port if 0 < port < 65536 else default


if __name__ == "__main__":
    # Try to reuse a previously saved vector store
    bot.load_vector_store()
    # Deployment settings come from the environment
    server_name = os.getenv("HOST", os.getenv("SERVER_NAME", "0.0.0.0"))
    # Hosting platforms usually pass PORT; fall back to 7860 otherwise
    server_port = _resolve_port(os.getenv("PORT", os.getenv("SERVER_PORT")))
    inbrowser = os.getenv("INBROWSER", "false").lower() == "true"
    share = os.getenv("GRADIO_SHARE", "false").lower() == "true"
    # Launch the app (0.0.0.0 by default so containers/cloud hosts can reach it)
    demo.launch(
        share=share,
        server_name=server_name,
        server_port=server_port,
        show_error=True,
        inbrowser=inbrowser
    )