Gemini_longchain_RAG

Runtime error

App Files Files Community

mikao007 committed on Oct 2, 2025

Commit

3bbf387

verified ·

1 Parent(s): 7d1a235

Update app.py

Browse files

Files changed (1) hide show

app.py +2 -697

app.py CHANGED Viewed

@@ -1,697 +1,2 @@
-from dotenv import load_dotenv
-import os
-import gradio as gr
-from PyPDF2 import PdfReader
-import google.generativeai as genai
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
-from langchain_community.vectorstores import FAISS
-from langchain.chains.question_answering import load_qa_chain
-from langchain.prompts import PromptTemplate
-import shutil
-import tempfile
-from docx import Document
-from docx.shared import Inches
-from datetime import datetime
-# Load environment variables
-load_dotenv()
-# 延後讀取 API 金鑰：提供工具函式，實際需要時才讀取
-def _get_api_key() -> str:
-    """Return the first non-empty Gemini API key found in the environment.
-    The candidate variable names are checked in order; the value found is
-    also copied into ``GOOGLE_API_KEY``. Returns an empty string when none of
-    the variables holds a non-blank value.
-    """
-    candidate_keys = [
-        "GOOGLE_API_KEY",
-        "GEMINI_API_KEY",
-        "GOOGLE_GENAI_API_KEY",
-        "GENAI_API_KEY",
-    ]
-    for key_name in candidate_keys:
-        value = os.getenv(key_name, "").strip()
-        if value:
-            # Mirror the value into GOOGLE_API_KEY for compatibility with the underlying packages
-            os.environ["GOOGLE_API_KEY"] = value
-            return value
-    return ""
-class PDFChatBot:
-    """Question-answering bot over the text of uploaded PDF files.
-    Keeps a FAISS vector store built from the PDF text, the embedding model
-    used to build or load it, the names of the processed files, and the chat
-    history used for the Word export.
-    """
-    def __init__(self):
-        """Start with no vector store, no embedding model and empty lists."""
-        self.vector_store = None
-        # The embedding model is created lazily, once a vector store is built or loaded
-        self.embeddings = None
-        self.processed_files = []
-        self.chat_history = []  # stored chat history
-    def get_pdf_text(self, pdf_files):
-        """Extract text from one or more PDF files with ``PdfReader``.
-        Args:
-            pdf_files: A single file or a list of files. Items that have a
-                ``name`` attribute are read from that path; other items are
-                passed to ``PdfReader`` as they are.
-        Returns:
-            A ``(raw_text, processed_count)`` tuple: the concatenated text of
-            every file that yielded text, and the number of such files.
-        Files that raise while being read are reported with ``print`` and
-        skipped. The base name of each file that yielded text is appended to
-        ``self.processed_files``.
-        """
-        raw_text = ""
-        processed_count = 0
-        if not pdf_files:
-            return raw_text, processed_count
-        # Accept either a single file or a list of files
-        if not isinstance(pdf_files, list):
-            pdf_files = [pdf_files]
-        for pdf_file in pdf_files:
-            try:
-                # Uploaded file objects carry their path in the name attribute
-                pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
-                pdf_reader = PdfReader(pdf_path)
-                file_text = ""
-                for page in pdf_reader.pages:
-                    text = page.extract_text()
-                    if text:
-                        file_text += text + "\n"
-                if file_text.strip():
-                    raw_text += file_text
-                    processed_count += 1
-                    self.processed_files.append(os.path.basename(pdf_path))
-            except Exception as e:
-                print(f"讀取PDF時發生錯誤：{str(e)}")
-                continue
-        return raw_text, processed_count
-    def get_pdf_text_via_gemini(self, pdf_files):
-        """Extract PDF text by uploading each file to Gemini and prompting for its text.
-        Uses the ``gemini-2.0-flash-exp`` model. Returns ``("", 0)`` when no
-        API key is available; otherwise returns the same
-        ``(raw_text, processed_count)`` tuple shape as ``get_pdf_text``.
-        Files that raise are reported with ``print`` and skipped.
-        """
-        api_key = _get_api_key()
-        if not api_key:
-            return "", 0
-        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel("gemini-2.0-flash-exp")
-        raw_text = ""
-        processed_count = 0
-        if not pdf_files:
-            return raw_text, processed_count
-        if not isinstance(pdf_files, list):
-            pdf_files = [pdf_files]
-        for pdf_file in pdf_files:
-            try:
-                pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
-                uploaded = genai.upload_file(pdf_path)
-                prompt = (
-                    "請從此 PDF 中提取可讀文字，按頁面順序輸出純文字。"
-                )
-                resp = model.generate_content([uploaded, prompt])
-                text = resp.text or ""
-                if text.strip():
-                    raw_text += text + "\n"
-                    processed_count += 1
-                    self.processed_files.append(os.path.basename(pdf_path))
-            except Exception as e:
-                print(f"使用Gemini解析PDF時發生錯誤：{str(e)}")
-                continue
-        return raw_text, processed_count
-    def get_text_chunks(self, text):
-        """Split ``text`` into chunks for embedding.
-        The splitter breaks on newlines with ``chunk_size=10000`` and
-        ``chunk_overlap=1000``, measured with ``len``. Returns the list of
-        chunks produced by ``split_text``.
-        """
-        text_splitter = CharacterTextSplitter(
-            separator="\n",
-            chunk_size=10000,
-            chunk_overlap=1000,
-            length_function=len
-        )
-        chunks = text_splitter.split_text(text)
-        return chunks
-    def create_vector_store(self, chunks):
-        """Build a FAISS vector store from ``chunks`` and save it to ``faiss_index``.
-        Creates the embedding model first when it does not exist yet.
-        Returns ``True`` on success, ``False`` when no API key is available
-        or when any step raises (the error is reported with ``print``).
-        """
-        try:
-            if self.embeddings is None:
-                api_key = _get_api_key()
-                if not api_key:
-                    return False
-                self.embeddings = GoogleGenerativeAIEmbeddings(
-                    model="models/text-embedding-004",
-                    google_api_key=api_key
-                )
-            self.vector_store = FAISS.from_texts(chunks, self.embeddings)
-            self.vector_store.save_local("faiss_index")
-            return True
-        except Exception as e:
-            print(f"創建向量存儲時發生錯誤：{str(e)}")
-            return False
-    def load_vector_store(self):
-        """Load the vector store previously saved under ``faiss_index``.
-        Returns ``True`` when the store was loaded, ``False`` when the path
-        does not exist, no API key is available, or loading raises (the
-        error is reported with ``print``).
-        """
-        try:
-            if os.path.exists("faiss_index"):
-                if self.embeddings is None:
-                    api_key = _get_api_key()
-                    if not api_key:
-                        return False
-                    self.embeddings = GoogleGenerativeAIEmbeddings(
-                        model="models/text-embedding-004",
-                        google_api_key=api_key
-                    )
-                # NOTE(review): the flag below opts in to deserializing the saved
-                # index; confirm that faiss_index can only ever be written by this app.
-                self.vector_store = FAISS.load_local(
-                    "faiss_index",
-                    embeddings=self.embeddings,
-                    allow_dangerous_deserialization=True
-                )
-                return True
-            else:
-                return False
-        except Exception as e:
-            print(f"載入向量存儲時發生錯誤：{str(e)}")
-            return False
-    def get_conversational_chain(self, temperature=0.3, max_tokens=4096):
-        """Build the "stuff" question-answering chain around the Gemini chat model.
-        Args:
-            temperature: Passed to the chat model as ``temperature``.
-            max_tokens: Passed to the chat model as ``max_tokens``.
-        Returns:
-            The chain returned by ``load_qa_chain``; its prompt takes the
-            ``context`` and ``question`` variables.
-        Raises:
-            RuntimeError: When no API key is available.
-        """
-        prompt_template = """
-        根據提供的內容盡可能詳細地回答問題。確保提供所有細節。
-        如果你需要更多細節來完美回答問題，那麼請詢問你認為需要了解的更多細節。
-        如果答案不在提供的內容中，只需說"在您提供的內容中找不到答案"。不要提供錯誤的答案。
-        內容:\n {context}\n
-        問題: \n{question}\n
-        回答:
-        """
-        # Using the Flash 2.0 model (the API key is only read at this point)
-        api_key = _get_api_key()
-        if not api_key:
-            raise RuntimeError("尚未設定 API 金鑰，請於部署後設定 GOOGLE_API_KEY 再重試。")
-        model = ChatGoogleGenerativeAI(
-            model="gemini-2.0-flash-exp",
-            google_api_key=api_key,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            top_p=0.8,
-            top_k=40
-        )
-        prompt = PromptTemplate(
-            template=prompt_template,
-            input_variables=['context', 'question']
-        )
-        chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
-        return chain
-    def answer_question(self, question, temperature=0.3, max_tokens=4096, search_k=6):
-        """Answer ``question`` from the chunks most similar to it.
-        Args:
-            question: The user's question.
-            temperature: Forwarded to ``get_conversational_chain``.
-            max_tokens: Forwarded to ``get_conversational_chain``.
-            search_k: Number of chunks requested from the similarity search.
-        Returns:
-            The chain's ``output_text``, or a message string when there is no
-            vector store, the question is blank, nothing is retrieved, or an
-            exception is raised (the exception text is included).
-        """
-        if not self.vector_store:
-            return "請先上傳並處理PDF文件！"
-        if not question.strip():
-            return "請輸入您的問題。"
-        try:
-            # Retrieve the chunks most similar to the question
-            docs = self.vector_store.similarity_search(question, k=search_k)
-            if not docs:
-                return "在上傳的文檔中找不到相關信息。"
-            # Generate the answer
-            chain = self.get_conversational_chain(temperature, max_tokens)
-            response = chain(
-                {
-                    "input_documents": docs,
-                    "question": question,
-                },
-                return_only_outputs=True
-            )
-            return response["output_text"]
-        except Exception as e:
-            return f"處理問題時發生錯誤：{str(e)}"
-    def process_pdfs(self, pdf_files, progress=gr.Progress(), use_gemini=False):
-        """Extract, split and index the given PDF files.
-        Args:
-            pdf_files: The uploaded file or files.
-            progress: Progress callback, called with a fraction and a description.
-            use_gemini: When true, text is extracted with
-                ``get_pdf_text_via_gemini`` instead of ``get_pdf_text``.
-        Returns:
-            A ``(status_message, file_list_text)`` tuple; the second item is
-            an empty string unless processing succeeded.
-        """
-        if not pdf_files:
-            return "請上傳至少一個PDF文件。", ""
-        # Start a fresh file list; the extraction methods append to it
-        self.processed_files = []
-        progress(0, desc="開始處理PDF文件...")
-        # Extract the text
-        progress(0.2, desc="提取PDF文字內容...")
-        if use_gemini:
-            raw_text, processed_count = self.get_pdf_text_via_gemini(pdf_files)
-        else:
-            raw_text, processed_count = self.get_pdf_text(pdf_files)
-        if not raw_text.strip():
-            return "無法從PDF文件中提取到文字。", ""
-        progress(0.4, desc="分割文字內容...")
-        # Split the text
-        text_chunks = self.get_text_chunks(raw_text)
-        progress(0.6, desc="創建向量存儲...")
-        # Build the vector store
-        success = self.create_vector_store(text_chunks)
-        progress(1.0, desc="處理完成!")
-        if success:
-            file_list = "已處理的文件:\n" + "\n".join([f"• {file}" for file in self.processed_files])
-            return f"✅ 成功處理 {processed_count} 個PDF文件！\n總共 {len(text_chunks)} 個文字區塊\n現在您可以開始提問。", file_list
-        else:
-            return "❌ PDF處理失敗，請重試。", ""
-    def clear_data(self):
-        """Delete the saved index and reset the in-memory state.
-        Removes the ``faiss_index`` directory when it exists and clears the
-        vector store, the processed-file list and the chat history;
-        ``self.embeddings`` is left as it is. Returns a
-        ``(status_message, "")`` tuple.
-        """
-        try:
-            if os.path.exists("faiss_index"):
-                shutil.rmtree("faiss_index")
-            self.vector_store = None
-            self.processed_files = []
-            self.chat_history = []
-            return "✅ 已清除所有處理過的資料！", ""
-        except Exception as e:
-            return f"❌ 清除資料時發生錯誤：{str(e)}", ""
-    def create_docx_report(self, chat_history):
-        """Write the chat history to a temporary ``.docx`` file.
-        Args:
-            chat_history: A list of message dicts with a ``content`` key,
-                read in consecutive (question, answer) pairs; a trailing
-                unpaired message is ignored.
-        Returns:
-            The path of the saved temporary file, or ``None`` when an
-            exception is raised (the error is reported with ``print``).
-        """
-        try:
-            # Create a new document
-            doc = Document()
-            # Add the title
-            title = doc.add_heading('PDF聊天機器人 - 問答記錄', 0)
-            title.alignment = 1  # centered
-            # Add the generation time
-            doc.add_paragraph(f'生成時間：{datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")}')
-            # Add the list of processed files
-            if self.processed_files:
-                doc.add_heading('已處理的PDF文件：', level=2)
-                for i, file in enumerate(self.processed_files, 1):
-                    doc.add_paragraph(f'{i}. {file}', style='List Number')
-            doc.add_paragraph('')  # blank line
-            # Add the question/answer record
-            doc.add_heading('問答記錄：', level=2)
-            if not chat_history:
-                doc.add_paragraph('目前沒有問答記錄。')
-            else:
-                # Messages are stored flat, so step two at a time: question, then answer
-                for i in range(0, len(chat_history), 2):
-                    if i + 1 < len(chat_history):
-                        question = chat_history[i]['content']
-                        answer = chat_history[i + 1]['content']
-                        # Question
-                        q_paragraph = doc.add_paragraph()
-                        q_run = q_paragraph.add_run(f'問題 {(i//2)+1}：')
-                        q_run.bold = True
-                        # Font size is given as a length of 0.14 inch
-                        q_run.font.size = Inches(0.14)
-                        q_paragraph.add_run(question)
-                        # Answer
-                        a_paragraph = doc.add_paragraph()
-                        a_run = a_paragraph.add_run('回答：')
-                        a_run.bold = True
-                        a_run.font.size = Inches(0.14)
-                        a_paragraph.add_run(answer)
-                        # Separator line
-                        doc.add_paragraph('─' * 50)
-            # Save to a temporary file; delete=False keeps it on disk for the caller
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx')
-            doc.save(temp_file.name)
-            temp_file.close()
-            return temp_file.name
-        except Exception as e:
-            print(f"創建docx文件時發生錯誤：{str(e)}")
-            return None
-# 初始化聊天機器人
-bot = PDFChatBot()
-# Gradio 接口函數
-def upload_and_process(files, use_gemini=False, progress=gr.Progress()):
-    """Forward the uploaded files to ``bot.process_pdfs`` and return its result."""
-    return bot.process_pdfs(files, progress, use_gemini)
-def ask_question(question, history, temperature, max_tokens, search_k):
-    """Answer ``question`` and append the exchange to ``history``.
-    Returns ``(history, "")``; the empty string is the second output value.
-    A blank question returns the history unchanged.
-    """
-    if not question.strip():
-        return history, ""
-    response = bot.answer_question(question, temperature, max_tokens, search_k)
-    # Use the role/content message format
-    user_msg = {"role": "user", "content": question}
-    assistant_msg = {"role": "assistant", "content": response}
-    history.append(user_msg)
-    history.append(assistant_msg)
-    # Mirror the chat history onto the bot instance
-    bot.chat_history = history.copy()
-    return history, ""
-def download_chat_history():
-    """Return the path of a docx report of the chat history, or ``None`` when the history is empty."""
-    if not bot.chat_history:
-        return None
-    docx_path = bot.create_docx_report(bot.chat_history)
-    return docx_path
-def export_to_word():
-    """Export the question/answer record as a Word file.
-    Returns the report path from ``bot.create_docx_report``, or ``None``
-    when the chat history is empty.
-    """
-    if not bot.chat_history:
-        return None
-    docx_path = bot.create_docx_report(bot.chat_history)
-    return docx_path
-def clear_chat():
-    """Clear the chat history on the bot and return an empty list and an empty string."""
-    bot.chat_history = []
-    return [], ""
-def clear_all_data():
-    """Return the result of ``bot.clear_data()``."""
-    return bot.clear_data()
-def load_existing_data():
-    """Try to load the saved vector store and return a ``(status_message, "")`` tuple."""
-    if bot.load_vector_store():
-        return "✅ 成功載入已處理的資料！", ""
-    else:
-        return "❌ 沒有找到已處理的資料。", ""
-def set_api_key(api_key: str):
-    """Set or update the Google Gemini API key.
-    The key is only updated in memory and in the environment variables; it is not written to disk."""
-    key = (api_key or "").strip()
-    if not key:
-        return "❌ 未輸入任何金鑰。請貼上有效的 GOOGLE_API_KEY。"
-    os.environ["GOOGLE_API_KEY"] = key
-    # Reset the embeddings so that they are re-created with the new key
-    try:
-        bot.embeddings = None
-    except Exception:
-        pass
-    return "✅ 已設定 API 金鑰（僅本次執行期間有效）。"
-# 創建自定義主題
-custom_theme = gr.themes.Soft(
-    primary_hue="blue",
-    secondary_hue="gray",
-    neutral_hue="slate",
-    font=gr.themes.GoogleFont("Noto Sans TC"),
-    font_mono=gr.themes.GoogleFont("JetBrains Mono")
-)
-# 創建 Gradio 介面
-with gr.Blocks(
-    title="PDF智能問答系統",
-    theme=custom_theme,
-    css="""
-    .gradio-container {
-        max-width: 1200px !important;
-        margin: auto !important;
-    }
-    .main-header {
-        text-align: center;
-        padding: 20px;
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        color: white;
-        border-radius: 10px;
-        margin-bottom: 20px;
-    }
-    .status-box {
-        background-color: #f8f9fa;
-        border-left: 4px solid #007bff;
-        padding: 15px;
-        border-radius: 5px;
-    }
-    .file-info {
-        background-color: #e8f5e8;
-        border-left: 4px solid #28a745;
-        padding: 10px;
-        border-radius: 5px;
-    }
-    """
-) as demo:
-    # 主標題區域
-    with gr.Row():
-        gr.HTML("""
-        <div class="main-header">
-            <h1>🤖 PDF智能問答系統</h1>
-            <p>基於 Gemini 2.0 Flash 的 RAG 技術 | 支持多語言問答</p>
-        </div>
-        """)
-    # 主要功能區域
-    with gr.Tab("📁 文件管理", id="file_tab"):
-        with gr.Row():
-            with gr.Column(scale=3):
-                # 文件上傳區域
-                with gr.Group():
-                    gr.Markdown("### 📤 上傳PDF文件")
-                    api_key_box = gr.Textbox(
-                        label="Google API Key (可選：部署後可在此貼上)",
-                        placeholder="以 sk- 或 AIza 開頭的金鑰（不會儲存到硬碟）",
-                        type="password"
-                    )
-                    set_key_btn = gr.Button("🔑 設定 API 金鑰")
-                file_upload = gr.File(
-                        file_count="multiple",
-                        file_types=[".pdf"],
-                        label="選擇PDF文件",
-                        height=150
-                    )
-                use_gemini_toggle = gr.Checkbox(label="使用 Gemini 解析 PDF（支援掃描影像）", value=False)
-                    # 處理選項
-                    with gr.Row():
-                        process_btn = gr.Button(
-                            "🚀 開始處理",
-                            variant="primary",
-                            size="lg",
-                            scale=2
-                        )
-                        load_btn = gr.Button(
-                            "📂 載入已處理資料",
-                            variant="secondary",
-                            scale=1
-                        )
-                        clear_btn = gr.Button(
-                            "🗑️ 清除所有資料",
-                            variant="stop",
-                            scale=1
-                        )
-            with gr.Column(scale=2):
-                # 狀態顯示區域
-                with gr.Group():
-                    gr.Markdown("### 📊 處理狀態")
-                    status_text = gr.Textbox(
-                        label="處理進度",
-                        lines=6,
-                        interactive=False,
-                        elem_classes=["status-box"]
-                    )
-                    # 文件列表
-                    gr.Markdown("### 📋 已處理文件")
-                    file_list = gr.Textbox(
-                        label="文件清單",
-                        lines=8,
-                        interactive=False,
-                        elem_classes=["file-info"]
-                    )
-    with gr.Tab("💬 智能問答", id="chat_tab"):
-        with gr.Row():
-            with gr.Column(scale=4):
-                # 聊天區域
-                chatbot = gr.Chatbot(
-                    label="💬 對話記錄",
-                    height=600,
-                    show_copy_button=True,
-                    type="messages",
-                    avatar_images=["👤", "🤖"]
-                )
-            with gr.Column(scale=1):
-                # 側邊欄功能
-                with gr.Group():
-                    gr.Markdown("### ⚙️ 問答設定")
-                    # 模型參數調整
-                    temperature = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        value=0.3,
-                        step=0.1,
-                        label="創意度 (Temperature)",
-                        info="數值越高回答越有創意"
-                    )
-                    max_tokens = gr.Slider(
-                        minimum=512,
-                        maximum=8192,
-                        value=4096,
-                        step=512,
-                        label="最大回答長度",
-                        info="控制回答的詳細程度"
-                    )
-                    search_k = gr.Slider(
-                        minimum=2,
-                        maximum=10,
-                        value=6,
-                        step=1,
-                        label="檢索文檔數量",
-                        info="搜索相關文檔的數量"
-                    )
-        # 輸入區域
-        with gr.Row():
-            question_input = gr.Textbox(
-                placeholder="請輸入您的問題... (支援中文、英文等多語言)",
-                label="💭 問題輸入",
-                lines=3,
-                scale=4,
-                max_lines=5
-            )
-            ask_btn = gr.Button(
-                "📤 發送問題",
-                variant="primary",
-                scale=1,
-                size="lg"
-            )
-        # 快捷操作
-        with gr.Row():
-            clear_chat_btn = gr.Button(
-                "🧹 清除對話",
-                variant="secondary",
-                scale=1
-            )
-            download_btn = gr.Button(
-                "📥 下載問答記錄",
-                variant="primary",
-                scale=1
-            )
-            export_btn = gr.Button(
-                "📄 匯出為Word",
-                variant="secondary",
-                scale=1
-            )
-        # 問題範例
-        with gr.Group():
-            gr.Markdown("### 💡 問題範例")
-            gr.Examples(
-                examples=[
-                    "這份文檔的主要內容是什麼？",
-                    "請總結文檔的重點和關鍵概念",
-                    "文檔中提到了哪些重要數據或統計？",
-                    "能否詳細解釋某個特定主題或概念？",
-                    "文檔的結論是什麼？",
-                    "有哪些重要的建議或建議？",
-                    "文檔中提到了哪些風險或挑戰？",
-                    "請比較文檔中提到的不同觀點"
-                ],
-                inputs=question_input,
-                label="點擊範例快速填入"
-            )
-    # 隱藏的文件下載組件
-    download_file = gr.File(visible=False)
-    # 下載功能處理函數
-    def handle_download():
-        """Build the chat report and return an update for the download component.
-        When a report path is available the component is given that path and
-        made visible; otherwise a warning is raised in the UI and the
-        component is hidden.
-        """
-        file_path = download_chat_history()
-        if file_path:
-            return gr.update(value=file_path, visible=True)
-        else:
-            gr.Warning("沒有聊天記錄可以下載！")
-            return gr.update(visible=False)
-    # 事件處理
-    process_btn.click(
-        fn=upload_and_process,
-        inputs=[file_upload, use_gemini_toggle],
-        outputs=[status_text, file_list],
-        show_progress=True
-    )
-    set_key_btn.click(
-        fn=set_api_key,
-        inputs=[api_key_box],
-        outputs=[status_text]
-    )
-    load_btn.click(
-        fn=load_existing_data,
-        outputs=[status_text, file_list]
-    )
-    clear_btn.click(
-        fn=clear_all_data,
-        outputs=[status_text, file_list]
-    )
-    ask_btn.click(
-        fn=ask_question,
-        inputs=[question_input, chatbot, temperature, max_tokens, search_k],
-        outputs=[chatbot, question_input]
-    )
-    question_input.submit(
-        fn=ask_question,
-        inputs=[question_input, chatbot, temperature, max_tokens, search_k],
-        outputs=[chatbot, question_input]
-    )
-    clear_chat_btn.click(
-        fn=clear_chat,
-        outputs=[chatbot, question_input]
-    )
-    download_btn.click(
-        fn=handle_download,
-        outputs=download_file
-    )
-    export_btn.click(
-        fn=export_to_word,
-        outputs=download_file
-    )
-if __name__ == "__main__":
-    # Try to load an existing vector store
-    bot.load_vector_store()
-    # Read the deployment-related settings
-    server_name = os.getenv("HOST", os.getenv("SERVER_NAME", "0.0.0.0"))
-    # Hosting platforms commonly pass PORT; fall back to 7860 (the Gradio default) otherwise
-    server_port_env = os.getenv("PORT", os.getenv("SERVER_PORT"))
-    server_port = int(server_port_env) if server_port_env and server_port_env.isdigit() else 7860
-    inbrowser = os.getenv("INBROWSER", "false").lower() == "true"
-    share = os.getenv("GRADIO_SHARE", "false").lower() == "true"
-    # Launch the app (binding 0.0.0.0 to support containers / cloud hosting)
-    demo.launch(
-        share=share,
-        server_name=server_name,
-        server_port=server_port,
-        show_error=True,
-        inbrowser=inbrowser
-    )


1	+ from G_L_RAG import demo
2	+ app = demo