# Page-capture residue, not part of the program: "Spaces: Sleeping, Sleeping"
import logging
import os
import sys
from datetime import datetime
from io import BytesIO
from pathlib import Path

import google.generativeai as genai
import pandas as pd
import PyPDF2
import streamlit as st
# Page configuration (title, icon, wide layout).
st.set_page_config(page_title="PDF 處理與翻譯工具", page_icon="📄", layout="wide")
# Log to stdout only, so the app never needs write permission for a log file.
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[_stdout_handler],
)
logger = logging.getLogger(__name__)
# Excel export is offered only when openpyxl can be imported.
try:
    import openpyxl  # noqa: F401  (imported only to probe availability)
except ImportError:
    EXCEL_SUPPORT = False
else:
    EXCEL_SUPPORT = True
# Application heading and one-line description.
st.title("PDF 內容處理與翻譯工具")
st.markdown("使用 Gemini API 來處理和翻譯 PDF 文件內容")
# Optionally show sample data in the sidebar when a data.csv file sits next to
# this script. This is best-effort: a missing or unreadable file must never
# break the app or surface an error to the user.
try:
    csv_path = Path(__file__).parent / "data.csv"
    if csv_path.exists():
        df = pd.read_csv(csv_path)
        st.sidebar.subheader("範例數據")
        st.sidebar.dataframe(df)
except Exception:
    # Stay silent in the UI, but leave a trace in the log instead of
    # discarding the error entirely.
    logger.warning("範例CSV讀取失敗,已略過", exc_info=True)
# Sidebar: API key input, target-language picker, and a short "about" note.
_language_choices = ["繁體中文", "簡體中文", "英文", "日文", "韓文", "法文", "德文", "西班牙文"]

with st.sidebar:
    st.header("設置")
    api_key = st.text_input("輸入 Gemini API Key", type="password")
    target_language = st.selectbox("選擇目標語言", _language_choices)
    st.markdown("---")
    st.markdown("### 關於")
    st.markdown("此應用程序使用 Streamlit 和 Gemini API 處理 PDF 文件內容。")
# Core helper functions
def setup_gemini_api(api_key):
    """Configure the Gemini client and return a generative model.

    Args:
        api_key: Gemini API key as entered by the user.

    Returns:
        A ``genai.GenerativeModel`` for "gemini-1.5-flash", or ``None`` when
        configuration fails (the error is logged and shown in the UI).
    """
    try:
        # The key is handed to the client explicitly, so it is not also
        # written to os.environ: that would place a user-supplied secret in
        # process-wide state that outlives this call.
        # Surrounding whitespace from copy/paste is stripped.
        genai.configure(api_key=api_key.strip())
        return genai.GenerativeModel("gemini-1.5-flash")
    except Exception as e:
        logger.exception(f"Gemini API 設置失敗: {e}")
        st.error(f"Gemini API 設置失敗: {e}")
        return None
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        A ``(pages_text, total_pages)`` tuple where ``pages_text`` holds one
        string per page, in page order. On any failure the error is logged
        and shown in the UI, and ``([], 0)`` is returned.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        total_pages = len(reader.pages)
        logger.info(f"PDF共有 {total_pages} 頁")

        pages_text = []
        for page_number, page in enumerate(reader.pages, start=1):
            with st.spinner(f"提取第 {page_number} 頁..."):
                # Normalise a falsy extraction result to "" so every entry is
                # a str for the preview widget and the prompts built later.
                pages_text.append(page.extract_text() or "")
        return pages_text, total_pages
    except Exception as e:
        logger.exception(f"PDF文本提取錯誤: {e}")
        st.error(f"PDF文本提取錯誤: {e}")
        return [], 0
def translate_with_gemini(model, text, target_language):
    """Translate ``text`` into ``target_language`` using a Gemini model.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.
        text: Source text to translate.
        target_language: Name of the target language, inserted verbatim into
            the prompt.

    Returns:
        The stripped model response. Returns ``""`` without calling the model
        when ``text`` is ``None`` or blank. On failure returns a string
        starting with "翻譯失敗: " followed by the error.
    """
    # Nothing to translate: skip the API call, otherwise the model's reply to
    # an empty prompt would be recorded as if it were a translation.
    if text is None or not str(text).strip():
        return ""

    try:
        # NOTE(review): leading whitespace and blank lines inside the original
        # prompt literal were not recoverable from the captured source; this
        # reproduces the text exactly as it was visible.
        prompt = (
            f"\n請將以下文本翻譯成{target_language},保持專業和準確性:\n"
            f"{text}\n"
            "只需要返回翻譯後的文本,不要加入其他解釋或備註。\n"
        )
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        logger.exception(f"Gemini翻譯失敗: {e}")
        return f"翻譯失敗: {str(e)}"
def process_with_gemini(model, text, instruction="請解釋以下內容"):
    """Run ``text`` through a Gemini model under a free-form instruction.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.
        text: Content the instruction applies to.
        instruction: Instruction placed before the content in the prompt.

    Returns:
        The stripped model response. Returns ``""`` without calling the model
        when ``text`` is ``None`` or blank. On failure returns a string
        starting with "處理失敗: " followed by the error.
    """
    # No content to process: skip the API call, otherwise the model's reply to
    # a bare instruction would be recorded as if it described the page.
    if text is None or not str(text).strip():
        return ""

    try:
        prompt = f"{instruction}:\n\n{text}"
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        logger.exception(f"Gemini處理失敗: {e}")
        return f"處理失敗: {str(e)}"
def provide_download_options(result_df):
    """Render download buttons for the result as CSV and, if available, Excel.

    Args:
        result_df: pandas DataFrame holding the processing result.

    The CSV is emitted as UTF-8 with a byte-order mark so spreadsheet
    applications detect the encoding and show Chinese text correctly. The
    Excel button is rendered only when ``EXCEL_SUPPORT`` is true.
    """
    # One timestamp so both files of the same result share a name stem.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # With no target path, to_csv returns a str and the encoding argument has
    # no effect on it, so the BOM has to be added by encoding explicitly.
    csv_data = result_df.to_csv(index=False).encode('utf-8-sig')
    st.download_button(
        label="下載結果為CSV",
        data=csv_data,
        file_name=f"pdf_process_result_{timestamp}.csv",
        # "utf-8-sig" is a Python codec name, not a MIME charset.
        mime="text/csv; charset=utf-8",
    )

    if EXCEL_SUPPORT:
        excel_buffer = BytesIO()
        result_df.to_excel(excel_buffer, index=False, engine='openpyxl')
        st.download_button(
            label="下載結果為Excel",
            data=excel_buffer.getvalue(),
            file_name=f"pdf_process_result_{timestamp}.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            key="excel_download",  # explicit key keeps this distinct from the CSV button
        )
# Main flow: upload a PDF, pick a page, run it through Gemini, then show the
# result and offer it for download.
#
# The result is kept in st.session_state. Clicking a download button reruns
# the script, and on that rerun st.button("開始處理") is False again. Without
# the stored result, the output and the remaining download button would
# disappear after the first download.
_RESULT_STATE_KEY = "pdf_process_result"
_TRANSLATE_ONLY = "翻譯原始內容"
_EXPLAIN_THEN_TRANSLATE = "先解釋後翻譯"

uploaded_file = st.file_uploader("上傳PDF文件", type="pdf")

if uploaded_file is not None:
    st.success(f"文件 '{uploaded_file.name}' 上傳成功!")

    if not api_key:
        st.warning("請先在側邊欄輸入 Gemini API Key")
    else:
        with st.spinner("初始化 Gemini API..."):
            model = setup_gemini_api(api_key)

        if model:
            with st.spinner("正在提取PDF內容..."):
                try:
                    # Read straight from memory; no temporary file is written.
                    pdf_bytes = BytesIO(uploaded_file.getvalue())
                    pages_text, total_pages = extract_text_from_pdf(pdf_bytes)
                except Exception as e:
                    st.error(f"處理PDF文件時發生錯誤: {e}")
                    pages_text, total_pages = [], 0

            if pages_text:
                page_to_process = st.selectbox(
                    "選擇要處理的頁面",
                    range(1, total_pages + 1),
                    format_func=lambda x: f"第 {x} 頁",
                )

                st.subheader(f"第 {page_to_process} 頁內容預覽")
                selected_page_content = pages_text[page_to_process - 1]
                st.text_area("原始內容", selected_page_content, height=200)

                process_method = st.radio(
                    "選擇處理方式",
                    [_TRANSLATE_ONLY, _EXPLAIN_THEN_TRANSLATE],
                )

                # Identifies what a stored result belongs to, so a result is
                # never shown against a different file, page, method or
                # language than the one it was produced for.
                request_id = (
                    uploaded_file.name,
                    uploaded_file.size,
                    page_to_process,
                    process_method,
                    target_language,
                )

                if st.button("開始處理"):
                    explanation = None
                    with st.spinner("處理中..."):
                        if process_method == _TRANSLATE_ONLY:
                            translation = translate_with_gemini(
                                model, selected_page_content, target_language
                            )
                        else:
                            with st.spinner("解釋內容..."):
                                explanation = process_with_gemini(
                                    model,
                                    selected_page_content,
                                    "請詳細解釋以下內容的主要要點和重要信息",
                                )
                            with st.spinner("翻譯解釋..."):
                                translation = translate_with_gemini(
                                    model, explanation, target_language
                                )
                    st.session_state[_RESULT_STATE_KEY] = {
                        "request_id": request_id,
                        "timestamp": datetime.now().isoformat(),
                        "original": selected_page_content,
                        "explanation": explanation,
                        "translation": translation,
                    }

                saved = st.session_state.get(_RESULT_STATE_KEY)
                if saved is not None and saved["request_id"] == request_id:
                    columns = {
                        "時間戳記": [saved["timestamp"]],
                        "原始內容": [saved["original"]],
                    }
                    if saved["explanation"] is None:
                        st.subheader("翻譯結果")
                    else:
                        st.subheader("內容解釋")
                        st.write(saved["explanation"])
                        st.subheader(f"{target_language}翻譯")
                        columns["內容解釋"] = [saved["explanation"]]
                    st.write(saved["translation"])
                    columns[f"{target_language}翻譯"] = [saved["translation"]]

                    provide_download_options(pd.DataFrame(columns))
            else:
                st.error("無法提取PDF內容,請檢查文件格式是否正確。")