123Sabrina committed on
Commit
6f94d17
·
verified ·
1 Parent(s): 9b6d22e

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +0 -156
src/streamlit_app.py CHANGED
@@ -12,161 +12,6 @@ import base64
12
  import io
13
  from tqdm import tqdm
14
 
15
- # 設定日誌
16
- logging.basicConfig(
17
- level=logging.INFO,
18
- format='%(asctime)s - %(levelname)s - %(message)s',
19
- handlers=[
20
- logging.FileHandler("pdf_processing.log"),
21
- logging.StreamHandler(sys.stdout)
22
- ]
23
- )
24
- logger = logging.getLogger(__name__)
25
-
26
- # 設置頁面配置
27
- st.set_page_config(
28
- page_title="PDF 翻譯與處理工具",
29
- page_icon="📄",
30
- layout="wide",
31
- initial_sidebar_state="expanded",
32
- )
33
-
34
- # 應用標題
35
- st.title("PDF 翻譯與處理工具")
36
-
37
- # 初始化會話狀態
38
- if 'pdf_path' not in st.session_state:
39
- st.session_state.pdf_path = None
40
- if 'pdf_content' not in st.session_state:
41
- st.session_state.pdf_content = None
42
- if 'df' not in st.session_state:
43
- st.session_state.df = None
44
- if 'total_pages' not in st.session_state:
45
- st.session_state.total_pages = 0
46
- if 'model' not in st.session_state:
47
- st.session_state.model = None
48
- if 'current_page' not in st.session_state:
49
- st.session_state.current_page = 1
50
- if 'api_key' not in st.session_state:
51
- st.session_state.api_key = "AIzaSyCZDHwY2rIArR3KHf7RpktvRZ4m1sr4PeQ" # 默認API金鑰
52
- if 'results' not in st.session_state:
53
- st.session_state.results = {}
54
- if 'processing_complete' not in st.session_state:
55
- st.session_state.processing_complete = False
56
-
57
- def extract_text_from_pdf(pdf_file):
58
- """從PDF文件中提取所有文本內容,以頁面為單位返回文本列表"""
59
- try:
60
- pdf_reader = PyPDF2.PdfReader(pdf_file)
61
- total_pages = len(pdf_reader.pages)
62
- logger.info(f"PDF共有 {total_pages} 頁")
63
-
64
- pages_text = []
65
- progress_bar = st.progress(0)
66
- progress_text = st.empty()
67
-
68
- for page_num in range(total_pages):
69
- progress_text.text(f"正在提取PDF頁面 {page_num+1}/{total_pages}...")
70
- progress_value = (page_num + 1) / total_pages
71
- progress_bar.progress(progress_value)
72
-
73
- page_text = pdf_reader.pages[page_num].extract_text()
74
- pages_text.append(page_text)
75
-
76
- progress_text.text("PDF提取完成!")
77
- return pages_text, total_pages
78
- except Exception as e:
79
- logger.error(f"PDF文本提取錯誤: {e}")
80
- st.error(f"PDF提取錯誤: {str(e)}")
81
- return [], 0
82
-
83
- def pdf_to_dataframe(pages_text):
84
- """將PDF頁面文本轉換為DataFrame,每頁內容作為一個單獨的欄位"""
85
- try:
86
- # 創建一個字典,每一頁的內容對應一個欄位
87
- data_dict = {f'Page_{i+1}': [text] for i, text in enumerate(pages_text)}
88
-
89
- # 創建DataFrame
90
- return pd.DataFrame(data_dict)
91
- except Exception as e:
92
- logger.error(f"PDF轉DataFrame失敗: {e}")
93
- st.error(f"轉換數據失敗: {str(e)}")
94
- return None
95
-
96
- def setup_gemini_api(api_key):
97
- """設置 Gemini API"""
98
- try:
99
- os.environ["GOOGLE_API_KEY"] = api_key
100
- genai.configure(api_key=api_key)
101
- return genai.GenerativeModel("gemini-1.5-flash")
102
- except Exception as e:
103
- logger.error(f"Gemini API 設置失敗: {e}")
104
- st.error(f"API 設置失敗: {str(e)}")
105
- return None
106
-
107
- def translate_with_gemini(model, text, target_language="繁體中文"):
108
- """使用Gemini將文本翻譯成目標語言"""
109
- try:
110
- prompt = f"""
111
- 請將以下文本翻譯成{target_language},保持專業和準確性:
112
-
113
- {text}
114
-
115
- 只需要返回翻譯後的文本,不要加入其他解釋或備註。
116
- """
117
-
118
- response = model.generate_content(prompt)
119
- return response.text.strip()
120
- except Exception as e:
121
- logger.error(f"Gemini翻譯失敗: {e}")
122
- return f"翻譯失敗: {str(e)}"
123
-
124
- def process_with_gemini(model, text, instruction="請解釋以下內容"):
125
- """使用Gemini處理文本"""
126
- try:
127
- # 如果文本太長,可能需要切分處理
128
- if len(text) > 30000: # 假設 Gemini 的輸入限制為 30000 字符
129
- logger.info("文本過長,進行切分處理")
130
- chunks = split_text(text, 25000) # 切分為稍小的塊
131
-
132
- results = []
133
- progress_bar = st.progress(0)
134
- progress_text = st.empty()
135
-
136
- for i, chunk in enumerate(chunks):
137
- progress_text.text(f"正在處理文本塊 {i+1}/{len(chunks)}...")
138
- progress_value = (i + 1) / len(chunks)
139
- progress_bar.progress(progress_value)
140
-
141
- sub_prompt = f"{instruction} (部分 {i+1}/{len(chunks)}):\n\n{chunk}"
142
- response = model.generate_content(sub_prompt)
143
- results.append(response.text.strip())
144
-
145
- progress_text.text("處理完成!")
146
- return "\n\n".join(results)
147
- else:
148
- prompt = f"{instruction}:\n\n{text}"
149
- response = model.generate_content(prompt)
150
- return response.text.strip()
151
- except Exception as e:
152
- logger.error(f"Gemini處理失敗: {e}")
153
- return f"處理失敗: {str(e)}"
154
-
155
- def split_text(text, max_length):
156
- """將長文本切分為多個較小的塊"""import streamlit as st
157
- import PyPDF2
158
- import pandas as pd
159
- import os
160
- import google.generativeai as genai
161
- import csv
162
- from datetime import datetime
163
- import logging
164
- import sys
165
- import re
166
- import base64
167
- import io
168
- from tqdm import tqdm
169
-
170
  # 設定日誌 - 修改使用/tmp目錄,這在Hugging Face Space是可寫的
171
  try:
172
  logging.basicConfig(
@@ -190,7 +35,6 @@ except PermissionError:
190
 
191
  logger = logging.getLogger(__name__)
192
 
193
- # 其餘代碼保持不變...
194
  # 設置頁面配置
195
  st.set_page_config(
196
  page_title="PDF 翻譯與處理工具",
 
12
  import io
13
  from tqdm import tqdm
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # 設定日誌 - 修改使用/tmp目錄,這在Hugging Face Space是可寫的
16
  try:
17
  logging.basicConfig(
 
35
 
36
  logger = logging.getLogger(__name__)
37
 
 
38
  # 設置頁面配置
39
  st.set_page_config(
40
  page_title="PDF 翻譯與處理工具",