ss900371tw committed on
Commit
663082d
·
verified ·
1 Parent(s): dd69b4f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +96 -62
src/streamlit_app.py CHANGED
@@ -1,15 +1,13 @@
1
-
2
  import streamlit as st
3
  import os
4
  import io
5
  import json
6
- import csv # <--- 新增:用於處理 CSV
7
  import numpy as np
8
  import faiss
9
  import uuid
10
  import time
11
  import sys
12
-
13
  # === HuggingFace 模型相關套件 (替換為 InferenceClient) ===
14
  try:
15
  from huggingface_hub import InferenceClient
@@ -22,8 +20,7 @@ from langchain_core.documents import Document
22
  from langchain_community.vectorstores import FAISS
23
  from langchain_community.vectorstores.utils import DistanceStrategy
24
  from langchain_community.docstore.in_memory import InMemoryDocstore
25
-
26
- # 嘗試匯入 pypdftry
27
  try:
28
  import pypdf
29
  except ImportError:
@@ -37,14 +34,14 @@ st.markdown("已啟用:**IndexFlatIP** + **L2 正規化** + **Hugging Face Inf
37
  if 'execute_batch_analysis' not in st.session_state:
38
  st.session_state.execute_batch_analysis = False
39
  if 'batch_results' not in st.session_state:
40
- st.session_state.batch_results = None
41
  if 'rag_current_file_key' not in st.session_state:
42
  st.session_state.rag_current_file_key = None
43
- if 'batch_current_file_key' not in st.session_state: # 修改變數名稱以反映多格式
44
  st.session_state.batch_current_file_key = None
45
  if 'vector_store' not in st.session_state:
46
  st.session_state.vector_store = None
47
- if 'json_data_for_batch' not in st.session_state: # 變數名稱保留,但內容可能是轉換後的 dict
48
  st.session_state.json_data_for_batch = None
49
 
50
  # 設定模型 ID
@@ -54,24 +51,23 @@ WINDOW_SIZE = 8
54
  # --- 側邊欄設定 ---
55
  with st.sidebar:
56
  st.header("⚙️ 設定")
57
-
58
  if not os.environ.get("HF_TOKEN"):
59
- st.error("環境變數 **HF_TOKEN** 未設定。請設定後重新啟動應用程式。")
60
-
61
  st.info(f"LLM 模型:**{MODEL_ID}** (Hugging Face Inference API)")
62
  st.warning("⚠️ **注意**: 該模型使用 Inference API 呼叫,請確保您的 HF Token 具有存取權限。")
63
-
64
  st.divider()
65
  st.subheader("📂 檔案上傳")
66
-
67
  # === 1. 批量分析檔案 (修改處:支援多種格式) ===
68
  batch_uploaded_file = st.file_uploader(
69
  "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
70
- type=['json', 'csv', 'txt'], # <--- 修改:新增 csv 和 txt
71
  key="batch_uploader",
72
  help="支援 JSON (Array), CSV (含標題), TXT (每行一條 Log)"
73
  )
74
-
75
  # === 2. RAG 知識庫檔案 ===
76
  rag_uploaded_file = st.file_uploader(
77
  "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
@@ -81,6 +77,16 @@ with st.sidebar:
81
 
82
  st.divider()
83
 
 
 
 
 
 
 
 
 
 
 
84
  st.subheader("💡 批量分析指令")
85
  analysis_prompt = st.text_area(
86
  "針對每個 Log/Alert 執行的指令",
@@ -88,31 +94,33 @@ with st.sidebar:
88
  height=200
89
  )
90
  st.markdown("此指令將對檔案中的**每一個 Log 條目**執行一次獨立分析。")
91
-
92
- if batch_uploaded_file:
93
  if st.button("🚀 執行批量分析"):
94
  if not os.environ.get("HF_TOKEN"):
95
  st.error("無法執行,環境變數 **HF_TOKEN** 未設定。")
96
- else:
97
  st.session_state.execute_batch_analysis = True
98
  else:
99
  st.info("請上傳 Log 檔案以啟用批量分析按鈕。")
100
-
101
  st.divider()
102
  st.subheader("🔍 RAG 檢索設定")
103
  similarity_threshold = st.slider("📐 Cosine Similarity 門檻", 0.0, 1.0, 0.4, 0.01)
104
-
105
  st.divider()
106
  st.subheader("模型參數")
107
  system_prompt = st.text_area("System Prompt", value="You are a Senior Security Analyst, named Ernest. You provide expert, authoritative, and concise advice on Information Security. Your analysis must be based strictly on the provided context.", height=100)
108
  max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
109
- temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
110
  top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
111
-
112
  st.divider()
113
  if st.button("🗑️ 清除所有紀錄"):
114
  for key in list(st.session_state.keys()):
115
- del st.session_state[key]
 
 
116
  st.rerun()
117
 
118
  # --- 初始化 Hugging Face LLM Client ---
@@ -131,9 +139,10 @@ inference_client = None
131
  if os.environ.get("HF_TOKEN"):
132
  with st.spinner(f"正在連線到 Inference Client: {MODEL_ID}..."):
133
  inference_client = load_inference_client(MODEL_ID)
 
134
  if inference_client is None and os.environ.get("HF_TOKEN"):
135
  st.warning("Hugging Face Inference Client 無法連線。")
136
- elif not os.environ.get("HF_TOKEN"):
137
  st.error("請在環境變數中設定 HF_TOKEN。")
138
 
139
  # === Embedding 模型 (保持不變) ===
@@ -148,6 +157,7 @@ with st.spinner("正在載入 Embedding 模型..."):
148
 
149
  # === 建立向量庫 / Search 函數 (保持不變) ===
150
  def process_file_to_faiss(uploaded_file):
 
151
  text_content = ""
152
  try:
153
  if uploaded_file.type == "application/pdf":
@@ -159,31 +169,32 @@ def process_file_to_faiss(uploaded_file):
159
  else:
160
  stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
161
  text_content = stringio.read()
162
-
163
  if not text_content.strip(): return None, "File is empty"
164
-
165
  events = [line for line in text_content.splitlines() if line.strip()]
166
  docs = [Document(page_content=e) for e in events]
167
  if not docs: return None, "No documents created"
168
-
169
  embeddings = embedding_model.embed_documents([d.page_content for d in docs])
170
  embeddings_np = np.array(embeddings).astype("float32")
171
  faiss.normalize_L2(embeddings_np)
172
-
173
  dimension = embeddings_np.shape[1]
174
  index = faiss.IndexFlatIP(dimension)
175
  index.add(embeddings_np)
176
-
177
  doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
178
  docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
179
  index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
180
-
181
  vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
182
  return vector_store, f"{len(docs)} chunks created."
183
  except Exception as e:
184
  return None, f"Error: {str(e)}"
185
 
186
  def faiss_cosine_search_all(vector_store, query, threshold):
 
187
  q_emb = embedding_model.embed_query(query)
188
  q_emb = np.array([q_emb]).astype("float32")
189
  faiss.normalize_L2(q_emb)
@@ -201,6 +212,7 @@ def faiss_cosine_search_all(vector_store, query, threshold):
201
 
202
  # === Hugging Face 生成單一 Log 分析回答 (保持不變) ===
203
  def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
 
204
  if client is None: return "ERROR: Client Error", ""
205
  context_text = ""
206
  if vector_store:
@@ -208,14 +220,15 @@ def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_p
208
  if selected:
209
  retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
210
  context_text = "\n".join(retrieved_contents)
211
-
212
  rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ==={context_text if context_text else 'No relevant reference context found.'}=== END REFERENCE CONTEXT ===\nANALYSIS INSTRUCTION: {user_prompt}\nBased on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""
213
  log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===\n{log_sequence_text}\n=== END LOG SEQUENCE ==="""
214
-
215
  messages = [
216
  {"role": "system", "content": sys_prompt},
217
  {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
218
  ]
 
219
  try:
220
  response_stream = client.chat_completion(messages, max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, stream=False)
221
  if response_stream and response_stream.choices:
@@ -244,31 +257,51 @@ elif 'vector_store' in st.session_state:
244
  # 支援 JSON, CSV, TXT 並統一轉換為 list of dicts
245
  if batch_uploaded_file:
246
  batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
247
-
248
  if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
249
  try:
250
- stringio = io.StringIO(batch_uploaded_file.getvalue().decode("utf-8"))
 
 
 
251
  parsed_data = None
252
 
 
 
253
  # --- Case 1: JSON ---
254
- if batch_uploaded_file.name.lower().endswith('.json'):
255
  parsed_data = json.load(stringio)
256
  st.toast("JSON 檔案載入成功", icon="📄")
257
-
258
- # --- Case 2: CSV ---
259
- elif batch_uploaded_file.name.lower().endswith('.csv'):
260
- # 使用 DictReader 將 CSV 轉List of Dicts
 
 
261
  reader = csv.DictReader(stringio)
262
  parsed_data = list(reader)
263
- st.toast("CSV 檔案已轉換為 JSON 結構", icon="📊")
264
-
265
- # --- Case 3: TXT ---
 
 
266
  else: # 預設為 TXT
267
- # 將每一行包裝成一個 JSON 物件: {"raw_content": "line text"}
268
- lines = stringio.readlines()
269
- parsed_data = [{"raw_log_entry": line.strip()} for line in lines if line.strip()]
270
- st.toast("TXT 檔案已轉換為 JSON 結構", icon="📝")
271
-
 
 
 
 
 
 
 
 
 
 
 
272
  # 儲存處理後的數據
273
  st.session_state.json_data_for_batch = parsed_data
274
  st.session_state.batch_current_file_key = batch_file_key
@@ -285,18 +318,18 @@ elif 'json_data_for_batch' in st.session_state:
285
  del st.session_state.batch_results
286
  st.info("批量分析檔案已移除,已清除相關數據。")
287
 
288
- # === 執行批量分析邏輯 ===
289
  if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
290
  st.session_state.execute_batch_analysis = False
291
  start_time = time.time()
292
  st.session_state.batch_results = []
293
-
294
  if inference_client is None:
295
  st.error("Client 未連線,無法執行。")
296
  else:
297
  data_to_process = st.session_state.json_data_for_batch
298
  logs_list = []
299
-
300
  # 處理不同的 JSON 結構 (Dict vs List)
301
  if isinstance(data_to_process, list):
302
  logs_list = data_to_process
@@ -310,15 +343,15 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
310
  logs_list = [data_to_process]
311
  else:
312
  logs_list = [data_to_process]
313
-
314
  if logs_list:
315
  vs = st.session_state.get("vector_store", None)
316
-
317
  # --- 關鍵:在這裡做 JSON String 的轉換 ---
318
  # 無論來源是 CSV(Dict) 還是 TXT(Dict),都在這裡用 json.dumps 轉成字串
319
  # 這保證了 Prompt 收到的永遠是 JSON 格式的文字
320
  formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
321
-
322
  analysis_sequences = []
323
  for i in range(len(formatted_logs)):
324
  start_index = max(0, i - WINDOW_SIZE + 1)
@@ -333,17 +366,17 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
333
  "target_log_id": i + 1,
334
  "original_log_entry": logs_list[i]
335
  })
336
-
337
  total_sequences = len(analysis_sequences)
338
  st.header(f"⚡ 批量分析執行中 (平移視窗 $N={WINDOW_SIZE}$)...")
339
  progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
340
  results_container = st.container()
341
  full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
342
-
343
  for i, seq_data in enumerate(analysis_sequences):
344
  log_id = seq_data["target_log_id"]
345
  progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
346
-
347
  try:
348
  response, retrieved_ctx = generate_rag_response_hf_for_log(
349
  client=inference_client,
@@ -365,25 +398,26 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
365
  "context": retrieved_ctx
366
  }
367
  st.session_state.batch_results.append(item)
368
-
369
  with results_container:
370
  st.subheader(f"Log/Alert #{item['log_id']}")
371
  with st.expander("序列內容 (JSON Format)"):
372
- st.code(item["sequence_analyzed"], language='json') # 這裡顯示的會是 JSON 格式
373
-
 
374
  is_high = any(x in response.lower() for x in ['high risk'])
375
  if is_high: st.error(item['analysis_result'])
376
  else: st.info(item['analysis_result'])
377
  if item['context']:
378
  with st.expander("參考 RAG 片段"): st.code(item['context'])
379
  st.markdown("---")
380
-
381
  log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
382
  full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\nResult:\n{item['analysis_result']}\n")
383
-
384
  except Exception as e:
385
  st.error(f"Error Log {log_id}: {e}")
386
-
387
  end_time = time.time()
388
  progress_bar.empty()
389
  st.success(f"完成!耗時 {end_time - start_time:.2f} 秒。")
 
 
1
  import streamlit as st
2
  import os
3
  import io
4
  import json
5
+ import csv
6
  import numpy as np
7
  import faiss
8
  import uuid
9
  import time
10
  import sys
 
11
  # === HuggingFace 模型相關套件 (替換為 InferenceClient) ===
12
  try:
13
  from huggingface_hub import InferenceClient
 
20
  from langchain_community.vectorstores import FAISS
21
  from langchain_community.vectorstores.utils import DistanceStrategy
22
  from langchain_community.docstore.in_memory import InMemoryDocstore
23
+ # 嘗試匯入 pypdf
 
24
  try:
25
  import pypdf
26
  except ImportError:
 
34
  if 'execute_batch_analysis' not in st.session_state:
35
  st.session_state.execute_batch_analysis = False
36
  if 'batch_results' not in st.session_state:
37
+ st.session_state.batch_results = None
38
  if 'rag_current_file_key' not in st.session_state:
39
  st.session_state.rag_current_file_key = None
40
+ if 'batch_current_file_key' not in st.session_state:
41
  st.session_state.batch_current_file_key = None
42
  if 'vector_store' not in st.session_state:
43
  st.session_state.vector_store = None
44
+ if 'json_data_for_batch' not in st.session_state:
45
  st.session_state.json_data_for_batch = None
46
 
47
  # 設定模型 ID
 
51
  # --- 側邊欄設定 ---
52
  with st.sidebar:
53
  st.header("⚙️ 設定")
54
+
55
  if not os.environ.get("HF_TOKEN"):
56
+ st.error("環境變數 **HF_TOKEN** 未設定。請設定後重新啟動應用程式。")
 
57
  st.info(f"LLM 模型:**{MODEL_ID}** (Hugging Face Inference API)")
58
  st.warning("⚠️ **注意**: 該模型使用 Inference API 呼叫,請確保您的 HF Token 具有存取權限。")
59
+
60
  st.divider()
61
  st.subheader("📂 檔案上傳")
62
+
63
  # === 1. 批量分析檔案 (修改處:支援多種格式) ===
64
  batch_uploaded_file = st.file_uploader(
65
  "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
66
+ type=['json', 'csv', 'txt'],
67
  key="batch_uploader",
68
  help="支援 JSON (Array), CSV (含標題), TXT (每行一條 Log)"
69
  )
70
+
71
  # === 2. RAG 知識庫檔案 ===
72
  rag_uploaded_file = st.file_uploader(
73
  "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
 
77
 
78
  st.divider()
79
 
80
+ # === TXT 處理方式選項 (新增) ===
81
+ st.subheader("📄 TXT 檔案處理")
82
+ txt_format_option = st.radio(
83
+ "TXT 內容轉換方式",
84
+ ["每行作為 `raw_log_entry` 的值", "忽略 (請確保您的 TXT 是有效的 JSON 陣列)"],
85
+ index=0,
86
+ help="選擇 TXT 檔案的每一行應如何轉換為 JSON 物件。"
87
+ )
88
+ st.divider()
89
+
90
  st.subheader("💡 批量分析指令")
91
  analysis_prompt = st.text_area(
92
  "針對每個 Log/Alert 執行的指令",
 
94
  height=200
95
  )
96
  st.markdown("此指令將對檔案中的**每一個 Log 條目**執行一次獨立分析。")
97
+
98
+ if batch_uploaded_file:
99
  if st.button("🚀 執行批量分析"):
100
  if not os.environ.get("HF_TOKEN"):
101
  st.error("無法執行,環境變數 **HF_TOKEN** 未設定。")
102
+ else:
103
  st.session_state.execute_batch_analysis = True
104
  else:
105
  st.info("請上傳 Log 檔案以啟用批量分析按鈕。")
106
+
107
  st.divider()
108
  st.subheader("🔍 RAG 檢索設定")
109
  similarity_threshold = st.slider("📐 Cosine Similarity 門檻", 0.0, 1.0, 0.4, 0.01)
110
+
111
  st.divider()
112
  st.subheader("模型參數")
113
  system_prompt = st.text_area("System Prompt", value="You are a Senior Security Analyst, named Ernest. You provide expert, authoritative, and concise advice on Information Security. Your analysis must be based strictly on the provided context.", height=100)
114
  max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
115
+ temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
116
  top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
117
+
118
  st.divider()
119
  if st.button("🗑️ 清除所有紀錄"):
120
  for key in list(st.session_state.keys()):
121
+ # 排除 HF_TOKEN,如果它在 session_state
122
+ if key != 'HF_TOKEN':
123
+ del st.session_state[key]
124
  st.rerun()
125
 
126
  # --- 初始化 Hugging Face LLM Client ---
 
139
  if os.environ.get("HF_TOKEN"):
140
  with st.spinner(f"正在連線到 Inference Client: {MODEL_ID}..."):
141
  inference_client = load_inference_client(MODEL_ID)
142
+
143
  if inference_client is None and os.environ.get("HF_TOKEN"):
144
  st.warning("Hugging Face Inference Client 無法連線。")
145
+ elif not os.environ.get("HF_TOKEN"):
146
  st.error("請在環境變數中設定 HF_TOKEN。")
147
 
148
  # === Embedding 模型 (保持不變) ===
 
157
 
158
  # === 建立向量庫 / Search 函數 (保持不變) ===
159
  def process_file_to_faiss(uploaded_file):
160
+ # ... (此函數內容保持不變,因為它是處理 RAG 文件的,與 CSV/TXT 批量分析邏輯獨立)
161
  text_content = ""
162
  try:
163
  if uploaded_file.type == "application/pdf":
 
169
  else:
170
  stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
171
  text_content = stringio.read()
172
+
173
  if not text_content.strip(): return None, "File is empty"
174
+
175
  events = [line for line in text_content.splitlines() if line.strip()]
176
  docs = [Document(page_content=e) for e in events]
177
  if not docs: return None, "No documents created"
178
+
179
  embeddings = embedding_model.embed_documents([d.page_content for d in docs])
180
  embeddings_np = np.array(embeddings).astype("float32")
181
  faiss.normalize_L2(embeddings_np)
182
+
183
  dimension = embeddings_np.shape[1]
184
  index = faiss.IndexFlatIP(dimension)
185
  index.add(embeddings_np)
186
+
187
  doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
188
  docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
189
  index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
190
+
191
  vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
192
  return vector_store, f"{len(docs)} chunks created."
193
  except Exception as e:
194
  return None, f"Error: {str(e)}"
195
 
196
  def faiss_cosine_search_all(vector_store, query, threshold):
197
+ # ... (此函數內容保持不變)
198
  q_emb = embedding_model.embed_query(query)
199
  q_emb = np.array([q_emb]).astype("float32")
200
  faiss.normalize_L2(q_emb)
 
212
 
213
  # === Hugging Face 生成單一 Log 分析回答 (保持不變) ===
214
  def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
215
+ # ... (此函數內容保持不變)
216
  if client is None: return "ERROR: Client Error", ""
217
  context_text = ""
218
  if vector_store:
 
220
  if selected:
221
  retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
222
  context_text = "\n".join(retrieved_contents)
223
+
224
  rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ==={context_text if context_text else 'No relevant reference context found.'}=== END REFERENCE CONTEXT ===\nANALYSIS INSTRUCTION: {user_prompt}\nBased on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""
225
  log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===\n{log_sequence_text}\n=== END LOG SEQUENCE ==="""
226
+
227
  messages = [
228
  {"role": "system", "content": sys_prompt},
229
  {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
230
  ]
231
+
232
  try:
233
  response_stream = client.chat_completion(messages, max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, stream=False)
234
  if response_stream and response_stream.choices:
 
257
  # 支援 JSON, CSV, TXT 並統一轉換為 list of dicts
258
  if batch_uploaded_file:
259
  batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
260
+
261
  if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
262
  try:
263
+ # Decode the uploaded bytes as UTF-8 and wrap the text in io.StringIO so the JSON/CSV/TXT parsers can read it
264
+ # The CSV and TXT branches call stringio.seek(0) to rewind the buffer before reading
265
+ file_bytes = batch_uploaded_file.getvalue()
266
+ stringio = io.StringIO(file_bytes.decode("utf-8"))
267
  parsed_data = None
268
 
269
+ file_name_lower = batch_uploaded_file.name.lower()
270
+
271
  # --- Case 1: JSON ---
272
+ if file_name_lower.endswith('.json'):
273
  parsed_data = json.load(stringio)
274
  st.toast("JSON 檔案載入成功", icon="📄")
275
+
276
+ # --- Case 2: CSV (修正:使用 DictReader) ---
277
+ elif file_name_lower.endswith('.csv'):
278
+ # csv.DictReader automatically uses the first row as the dict keys
279
+ # 必須使用 file_bytes.decode() 確保編碼正確性
280
+ stringio.seek(0)
281
  reader = csv.DictReader(stringio)
282
  parsed_data = list(reader)
283
+ if not parsed_data:
284
+ raise ValueError("CSV 檔案載入失敗或內容為空。")
285
+ st.toast("CSV 檔案已轉換為 JSON 結構 (第一行為 Key)", icon="📊")
286
+
287
+ # --- Case 3: TXT (修正:根據 radio 選項處理) ---
288
  else: # 預設為 TXT
289
+ if txt_format_option == "每行作為 `raw_log_entry` 的值":
290
+ stringio.seek(0)
291
+ lines = stringio.readlines()
292
+ # 將每一行包裝成一個 JSON 物件: {"raw_log_entry": "line text"}
293
+ parsed_data = [{"raw_log_entry": line.strip()} for line in lines if line.strip()]
294
+ st.toast("TXT 檔案已轉換為 JSON 結構 (每行為 raw_log_entry)", icon="📝")
295
+ else:
296
+ # 如果用戶選擇忽略,則假設 TXT 內容本身就是一個有效的 JSON 陣列或物件
297
+ stringio.seek(0)
298
+ text_content = stringio.read().strip()
299
+ if text_content:
300
+ parsed_data = json.loads(text_content)
301
+ st.toast("TXT 檔案已作為 JSON 載入", icon="📝")
302
+ else:
303
+ raise ValueError("TXT 檔案內容為空。")
304
+
305
  # 儲存處理後的數據
306
  st.session_state.json_data_for_batch = parsed_data
307
  st.session_state.batch_current_file_key = batch_file_key
 
318
  del st.session_state.batch_results
319
  st.info("批量分析檔案已移除,已清除相關數據。")
320
 
321
+ # === 執行批量分析邏輯 (保持不變,因為 formatted_logs 已經將 Dict 轉為 JSON 字串) ===
322
  if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
323
  st.session_state.execute_batch_analysis = False
324
  start_time = time.time()
325
  st.session_state.batch_results = []
326
+
327
  if inference_client is None:
328
  st.error("Client 未連線,無法執行。")
329
  else:
330
  data_to_process = st.session_state.json_data_for_batch
331
  logs_list = []
332
+
333
  # 處理不同的 JSON 結構 (Dict vs List)
334
  if isinstance(data_to_process, list):
335
  logs_list = data_to_process
 
343
  logs_list = [data_to_process]
344
  else:
345
  logs_list = [data_to_process]
346
+
347
  if logs_list:
348
  vs = st.session_state.get("vector_store", None)
349
+
350
  # --- 關鍵:在這裡做 JSON String 的轉換 ---
351
  # 無論來源是 CSV(Dict) 還是 TXT(Dict),都在這裡用 json.dumps 轉成字串
352
  # 這保證了 Prompt 收到的永遠是 JSON 格式的文字
353
  formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
354
+
355
  analysis_sequences = []
356
  for i in range(len(formatted_logs)):
357
  start_index = max(0, i - WINDOW_SIZE + 1)
 
366
  "target_log_id": i + 1,
367
  "original_log_entry": logs_list[i]
368
  })
369
+
370
  total_sequences = len(analysis_sequences)
371
  st.header(f"⚡ 批量分析執行中 (平移視窗 $N={WINDOW_SIZE}$)...")
372
  progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
373
  results_container = st.container()
374
  full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
375
+
376
  for i, seq_data in enumerate(analysis_sequences):
377
  log_id = seq_data["target_log_id"]
378
  progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
379
+
380
  try:
381
  response, retrieved_ctx = generate_rag_response_hf_for_log(
382
  client=inference_client,
 
398
  "context": retrieved_ctx
399
  }
400
  st.session_state.batch_results.append(item)
401
+
402
  with results_container:
403
  st.subheader(f"Log/Alert #{item['log_id']}")
404
  with st.expander("序列內容 (JSON Format)"):
405
+ # 這裡顯示的會是 JSON 格式的 Log Sequence
406
+ st.code(item["sequence_analyzed"], language='json')
407
+
408
  is_high = any(x in response.lower() for x in ['high risk'])
409
  if is_high: st.error(item['analysis_result'])
410
  else: st.info(item['analysis_result'])
411
  if item['context']:
412
  with st.expander("參考 RAG 片段"): st.code(item['context'])
413
  st.markdown("---")
414
+
415
  log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
416
  full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\nResult:\n{item['analysis_result']}\n")
417
+
418
  except Exception as e:
419
  st.error(f"Error Log {log_id}: {e}")
420
+
421
  end_time = time.time()
422
  progress_bar.empty()
423
  st.success(f"完成!耗時 {end_time - start_time:.2f} 秒。")