ss900371tw commited on
Commit
70ed5d2
·
verified ·
1 Parent(s): 05d270b

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +238 -224
src/streamlit_app.py CHANGED
@@ -23,7 +23,7 @@ from langchain_community.vectorstores import FAISS
23
  from langchain_community.vectorstores.utils import DistanceStrategy
24
  from langchain_community.docstore.in_memory import InMemoryDocstore
25
 
26
- # 嘗試匯入 pypdftry:
27
  try:
28
  import pypdf
29
  except ImportError:
@@ -57,61 +57,56 @@ MODEL_OPTIONS = {
57
  "fdtn-ai Foundation-Sec 8B Instruct (Hugging Face)": "fdtn-ai/Foundation-Sec-8B-Instruct",
58
  "Gemma 3 27B Instruct (Hugging Face)": "google/gemma-3-27b-it"
59
  }
60
-
61
  WINDOW_SIZE = 20 # 關聯 Log 的最大數量 (包含當前 Log)
62
 
63
  # === W3C Log 專屬解析器 (新增) ===
64
  def parse_w3c_log(log_content: str) -> List[Dict[str, Any]]:
65
  """
66
  解析 W3C Extended Log File Format (如 IIS Log),包括提取 #Fields:。
67
- Args:
68
- log_content (str): Log 檔案的字串內容。
69
- Returns:
70
- List[Dict[str, Any]]: 轉換後的 JSON 物件列表。
71
  """
72
  lines = log_content.splitlines()
73
  field_names = None
74
  data_lines = []
75
-
76
  for line in lines:
77
  line = line.strip()
78
  if not line:
79
  continue
80
-
81
  if line.startswith("#Fields:"):
82
  # 找到欄位定義,例如 "#Fields: date time s-ip cs-method ..."
83
  field_names = line.split()[1:] # 跳過 "#Fields:" 本身
84
  elif not line.startswith("#"):
85
  # 這是實際的資料行
86
  data_lines.append(line)
87
-
88
  if not field_names:
89
  # 如果沒有找到 #Fields,則退回到原始 Log 條目模式
90
- # st.warning("未檢測到 W3C Log 的 #Fields: 標頭,退回原始 Log 條目模式。")
91
  return [{"raw_log_entry": line} for line in lines if line.strip() and not line.startswith("#")]
92
 
93
  json_data = []
94
-
95
  # 定義需要轉換為數字的欄位名稱 (可根據您的需求擴充,使用底線版本)
 
96
  numeric_fields = ['sc_status', 'time_taken', 'bytes', 'resp_len', 'req_size']
97
-
98
  for data_line in data_lines:
99
  # W3C Log 預設使用空格分隔。這裡使用 split()
100
  values = data_line.split(' ')
101
-
102
  # 簡易的欄位數量檢查
103
  if len(values) != len(field_names):
104
  # 如果欄位數量不匹配,將該行視為原始 Log 條目
105
  json_data.append({"raw_log_entry": data_line})
106
  continue
107
-
108
  record = {}
109
  for key, value in zip(field_names, values):
110
  # 將 W3C 欄位名稱中的 '-' 替換成 Python 友好的 '_'
111
  key = key.strip().replace('-', '_')
112
-
113
  value = value.strip() if value else ""
114
-
115
  # 處理數字轉換
116
  if key in numeric_fields:
117
  try:
@@ -123,10 +118,10 @@ def parse_w3c_log(log_content: str) -> List[Dict[str, Any]]:
123
  record[key] = value
124
  else:
125
  record[key] = value
126
-
127
  if record:
128
  json_data.append(record)
129
-
130
  return json_data
131
 
132
  # === 核心檔案轉換函式 (CSV/TXT -> JSON List) (保留並微調) ===
@@ -137,9 +132,9 @@ def convert_csv_txt_to_json_list(file_content: bytes, file_type: str) -> List[Di
137
  log_content = file_content.decode("utf-8").strip()
138
  if not log_content:
139
  return []
140
-
141
  string_io = io.StringIO(log_content)
142
-
143
  # 嘗試使用 csv.DictReader 自動將第一行視為 Key
144
  try:
145
  reader = csv.DictReader(string_io)
@@ -150,16 +145,17 @@ def convert_csv_txt_to_json_list(file_content: bytes, file_type: str) -> List[Di
150
  json_data = []
151
  if reader and reader.fieldnames:
152
  # 使用者可能使用的數值欄位名稱
153
- numeric_fields = ['sc-status', 'time-taken', 'bytes', 'resp-len', 'req-size', 'status_code', 'size', 'duration']
154
-
155
  for row in reader:
156
  record = {}
157
  for key, value in row.items():
158
  if key is None: continue
159
-
160
- key = key.strip()
 
161
  value = value.strip() if value else ""
162
-
163
  # 處理數字轉換
164
  if key in numeric_fields:
165
  try:
@@ -171,16 +167,16 @@ def convert_csv_txt_to_json_list(file_content: bytes, file_type: str) -> List[Di
171
  record[key] = value
172
  else:
173
  record[key] = value
174
-
175
  if record:
176
  json_data.append(record)
177
-
178
- # 再次檢查是否為空,如果是空,可能不是標準 CSV/JSON
179
  if not json_data:
180
  string_io.seek(0)
181
  lines = string_io.readlines()
182
  return [{"raw_log_entry": line.strip()} for line in lines if line.strip()]
183
-
184
  return json_data
185
 
186
  # === 檔案類型分發器 (已修改) ===
@@ -188,12 +184,12 @@ def convert_uploaded_file_to_json_list(uploaded_file) -> List[Dict[str, Any]]:
188
  """根據檔案類型,將上傳的檔案內容轉換為 Log JSON 列表。"""
189
  file_bytes = uploaded_file.getvalue()
190
  file_name_lower = uploaded_file.name.lower()
191
-
192
  # --- Case 1: JSON ---
193
  if file_name_lower.endswith('.json'):
194
  stringio = io.StringIO(file_bytes.decode("utf-8"))
195
  parsed_data = json.load(stringio)
196
-
197
  if isinstance(parsed_data, dict):
198
  # 處理包裹在 'alerts' 或 'logs' 鍵中的列表
199
  if 'alerts' in parsed_data and isinstance(parsed_data['alerts'], list):
@@ -206,28 +202,28 @@ def convert_uploaded_file_to_json_list(uploaded_file) -> List[Dict[str, Any]]:
206
  return parsed_data # 列表直接返回
207
  else:
208
  raise ValueError("JSON 檔案格式不支援 (非 List 或 Dict)。")
209
-
210
  # --- Case 2, 3, & 4: CSV/TXT/LOG ---
211
  elif file_name_lower.endswith(('.csv', '.txt', '.log')):
212
  file_type = 'csv' if file_name_lower.endswith('.csv') else ('log' if file_name_lower.endswith('.log') else 'txt')
213
-
214
  if file_type == 'log':
215
  # 針對 .log 檔案,嘗試使用 W3C 解析器
216
  log_content = file_bytes.decode("utf-8").strip()
217
  if not log_content: return []
218
  return parse_w3c_log(log_content)
219
-
220
  else:
221
  # CSV 和 TXT 保持使用原來的 csv.DictReader 邏輯
222
  return convert_csv_txt_to_json_list(file_bytes, file_type)
223
-
224
  else:
225
  raise ValueError("不支援的檔案類型。")
226
 
227
  # --- 側邊欄設定 (已更新 'type' 參數) ---
228
  with st.sidebar:
229
  st.header("⚙️ 設定")
230
-
231
  # --- 新增模型選單 ---
232
  selected_model_name = st.selectbox(
233
  "選擇 LLM 模型",
@@ -240,10 +236,11 @@ with st.sidebar:
240
  st.error("環境變數 **HF_TOKEN** 未設定。請設定後重新啟動應用程式。")
241
  st.info(f"LLM 模型:**{MODEL_ID}** (Hugging Face Inference API)")
242
  st.warning("⚠️ **注意**: 該模型使用 Inference API 呼叫,請確保您的 HF Token 具有存取權限。")
243
-
244
  st.divider()
 
245
  st.subheader("📂 檔案上傳")
246
-
247
  # === 1. 批量分析檔案 (支援多種格式) ===
248
  batch_uploaded_file = st.file_uploader(
249
  "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
@@ -251,15 +248,16 @@ with st.sidebar:
251
  key="batch_uploader",
252
  help="支援 JSON (Array), CSV (含標題), TXT/LOG (視為 W3C 或一般 Log)"
253
  )
254
-
255
  # === 2. RAG 知識庫檔案 ===
256
  rag_uploaded_file = st.file_uploader(
257
  "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
258
  type=['txt', 'py', 'log', 'csv', 'md', 'pdf'], # <--- 這裡增加了 'log'
259
  key="rag_uploader"
260
  )
 
261
  st.divider()
262
-
263
  st.subheader("💡 批量分析指令")
264
  analysis_prompt = st.text_area(
265
  "針對每個 Log/Alert 執行的指令",
@@ -267,7 +265,7 @@ with st.sidebar:
267
  height=200
268
  )
269
  st.markdown("此指令將對檔案中的**每一個 Log 條目**執行一次獨立分析 (使用 **IP 關聯視窗**)。")
270
-
271
  if batch_uploaded_file:
272
  if st.button("🚀 執行批量分析"):
273
  if not os.environ.get("HF_TOKEN"):
@@ -280,47 +278,50 @@ with st.sidebar:
280
  st.error("請先等待 Log 檔案解析完成。")
281
  else:
282
  st.info("請上傳 Log 檔案以啟用批量分析按鈕。")
283
-
284
  st.divider()
 
285
  st.subheader("🔍 RAG 檢索設定")
286
  similarity_threshold = st.slider("📐 Cosine Similarity 門檻", 0.0, 1.0, 0.4, 0.01)
287
-
288
  st.divider()
 
289
  st.subheader("模型參數")
290
  system_prompt = st.text_area("System Prompt", value="You are a Senior Security Analyst, named Ernest. You provide expert, authoritative, and concise advice on Information Security. Your analysis must be based strictly on the provided context.", height=100)
291
  max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
292
  temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
293
  top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
294
-
295
  st.divider()
 
296
  if st.button("🗑️ 清除所有紀錄"):
297
  # 僅清除動態狀態,保留 HF_TOKEN
298
  for key in list(st.session_state.keys()):
299
  if key not in ['HF_TOKEN']:
300
- del st.session_state[key]
 
 
301
  st.rerun()
302
 
303
- # --- 初始化 Hugging Face LLM Client (已更新,MODEL_ID 作為參數) ---
304
- # 確保 load_inference_client 接受 model_id 作為參數,以利用 Streamlit 的快取機制。
305
  @st.cache_resource
306
  def load_inference_client(model_id):
307
  if not os.environ.get("HF_TOKEN"): return None
308
  try:
309
- client = InferenceClient(model_id, token=os.environ.get("HF_TOKEN"))
310
- st.success(f"Hugging Face Inference Client **{model_id}** 載入成功。")
 
311
  return client
312
  except Exception as e:
313
- st.error(f"Hugging Face Inference Client 載入失敗: {e}")
314
  return None
315
 
316
  inference_client = None
317
  if os.environ.get("HF_TOKEN"):
318
- with st.spinner(f"正在連線到 Inference Client: {MODEL_ID}..."):
319
- # 傳遞 MODEL_ID
320
- inference_client = load_inference_client(MODEL_ID)
321
-
322
- if inference_client is None and os.environ.get("HF_TOKEN"):
323
- st.warning(f"Hugging Face Inference Client **{MODEL_ID}** 無法連線。")
324
  elif not os.environ.get("HF_TOKEN"):
325
  st.error("請在環境變數中設定 HF_TOKEN。")
326
 
@@ -343,31 +344,31 @@ def process_file_to_faiss(uploaded_file):
343
  pdf_reader = pypdf.PdfReader(uploaded_file)
344
  for page in pdf_reader.pages:
345
  text_content += page.extract_text() + "\n"
346
- else: return None, "PDF library missing"
347
  else:
348
  stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
349
  text_content = stringio.read()
350
-
351
  if not text_content.strip(): return None, "File is empty"
352
-
353
  # 這裡將文件內容按行分割為 Document,每行一個 Document
354
  events = [line for line in text_content.splitlines() if line.strip()]
355
  docs = [Document(page_content=e) for e in events]
356
  if not docs: return None, "No documents created"
357
-
358
  # 進行 Embedding 和 FAISS 初始化 (IndexFlatIP + L2 normalization)
359
  embeddings = embedding_model.embed_documents([d.page_content for d in docs])
360
  embeddings_np = np.array(embeddings).astype("float32")
361
  faiss.normalize_L2(embeddings_np)
362
-
363
  dimension = embeddings_np.shape[1]
364
  index = faiss.IndexFlatIP(dimension) # 使用內積 (Inner Product)
365
  index.add(embeddings_np)
366
-
367
  doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
368
  docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
369
  index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
370
-
371
  # 使用 Cosine 距離策略,配合 IndexFlatIP 和 L2 normalization 達到 Cosine Similarity
372
  vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
373
  return vector_store, f"{len(docs)} chunks created."
@@ -381,7 +382,7 @@ def faiss_cosine_search_all(vector_store, query, threshold):
381
  index = vector_store.index
382
  D, I = index.search(q_emb, k=index.ntotal)
383
  selected = []
384
-
385
  # Cosine Similarity = D (IndexFlatIP + L2 normalization)
386
  for score, idx in zip(D[0], I[0]):
387
  if idx == -1: continue
@@ -390,32 +391,36 @@ def faiss_cosine_search_all(vector_store, query, threshold):
390
  doc_id = vector_store.index_to_docstore_id[idx]
391
  doc = vector_store.docstore.search(doc_id)
392
  selected.append((doc, score))
393
-
394
  selected.sort(key=lambda x: x[1], reverse=True)
395
  return selected
396
 
397
  # === Hugging Face 生成單一 Log 分析回答 (保持不變) ===
398
  def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
399
- if client is None: return "ERROR: Client Error", ""
 
400
  context_text = ""
401
-
402
  # RAG 檢索邏輯
403
  if vector_store:
404
- selected = faiss_cosine_search_all(vector_store, log_sequence_text, threshold)
405
- if selected:
406
- # 只取前 5 個最相關的片段
407
- retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
408
- context_text = "\n".join(retrieved_contents)
409
-
410
- rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ==={context_text if context_text else 'No relevant reference context found.'}=== END REFERENCE CONTEXT ===ANALYSIS INSTRUCTION: {user_prompt}Based on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""
 
411
 
 
 
412
  log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: Max {WINDOW_SIZE} logs associated by IP) ==={log_sequence_text}=== END LOG SEQUENCE ==="""
413
-
414
  messages = [
415
  {"role": "system", "content": sys_prompt},
416
  {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
417
  ]
418
-
419
  try:
420
  # 使用 chat_completion 進行模型呼叫
421
  response_stream = client.chat_completion(
@@ -427,9 +432,11 @@ def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_p
427
  )
428
  if response_stream and response_stream.choices:
429
  return response_stream.choices[0].message.content.strip(), context_text
430
- else: return "Format Error: Model returned empty response or invalid format.", context_text
431
- except Exception as e:
432
- return f"Model Error: {str(e)}", context_text
 
 
433
 
434
  # =======================================================================
435
  # === 檔案處理區塊 (RAG 檔案) - 保持不變 ===
@@ -439,7 +446,7 @@ if rag_uploaded_file:
439
  # 清除舊的 vector store 以節省內存
440
  if 'vector_store' in st.session_state:
441
  del st.session_state.vector_store
442
-
443
  with st.spinner(f"正在建立 RAG 參考知識庫 ({rag_uploaded_file.name})..."):
444
  vs, msg = process_file_to_faiss(rag_uploaded_file)
445
  if vs:
@@ -449,7 +456,7 @@ if rag_uploaded_file:
449
  else:
450
  st.session_state.rag_current_file_key = None
451
  st.error(msg)
452
- elif 'vector_store' in st.session_state:
453
  del st.session_state.vector_store
454
  del st.session_state.rag_current_file_key
455
  st.info("RAG 檔案已移除,已清除相關知識庫。")
@@ -457,7 +464,7 @@ elif 'vector_store' in st.session_state:
457
  # === 檔案處理區塊 (批量分析檔案 - **已更新** ) ===
458
  if batch_uploaded_file:
459
  batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
460
-
461
  if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
462
  try:
463
  # 清除舊的數據
@@ -465,22 +472,24 @@ if batch_uploaded_file:
465
  del st.session_state.json_data_for_batch
466
  if 'batch_results' in st.session_state:
467
  del st.session_state.batch_results
 
468
  # 使用新的統一解析函式
469
  parsed_data = convert_uploaded_file_to_json_list(batch_uploaded_file)
470
-
471
  if not parsed_data:
472
  raise ValueError(f"{batch_uploaded_file.name} 檔案載入失敗或內容為空。")
473
-
474
  # 儲存處理後的數據
475
  st.session_state.json_data_for_batch = parsed_data
476
  st.session_state.batch_current_file_key = batch_file_key
477
  st.toast(f"檔案已解析並轉換為 {len(parsed_data)} 個 Log 條目。", icon="✅")
478
-
479
  except Exception as e:
480
  st.error(f"檔案解析錯誤: {e}")
481
  if 'json_data_for_batch' in st.session_state:
482
  del st.session_state.json_data_for_batch
483
  st.session_state.batch_current_file_key = None # 設置為 None 避免錯誤的 Key
 
484
  elif 'json_data_for_batch' in st.session_state:
485
  # 檔案被移除,清除相關數據
486
  del st.session_state.json_data_for_batch
@@ -490,71 +499,78 @@ elif 'json_data_for_batch' in st.session_state:
490
  del st.session_state.batch_results
491
  st.info("批量分析檔案已移除,已清除相關數據和結果。")
492
 
 
493
  # === 執行批量分析邏輯 (已修改為 IP 關聯視窗) ===
494
  if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state and st.session_state.json_data_for_batch is not None:
495
  st.session_state.execute_batch_analysis = False
496
  start_time = time.time()
497
-
498
- # 這裡必須確保 st.session_state.batch_results 是 List,而不是 None
499
- if 'batch_results' not in st.session_state or st.session_state.batch_results is None:
500
- st.session_state.batch_results = []
501
-
502
  st.session_state.batch_results = []
503
-
504
  if inference_client is None:
505
- st.error("Client 未連線,無法執行。")
506
  else:
507
  logs_list = st.session_state.json_data_for_batch
508
-
509
  if logs_list:
510
  vs = st.session_state.get("vector_store", None)
511
-
512
  # 將 Log 條目轉換為 JSON 字串,用於 LLM 輸入
513
  formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
514
-
515
  analysis_sequences = []
516
-
517
  # --- 核心修改:基於 IP 關聯的 Log Sequence 建構 ---
518
  for i in range(len(formatted_logs)):
519
  current_log_entry = logs_list[i]
520
  current_log_str = formatted_logs[i]
521
-
522
  # 嘗試從當前 Log 條目中提取 IP 地址 (優先 W3C 格式,然後是一般日誌格式)
523
- # 使用者可以根據自己的日誌格式調整這裡的 Key
524
- target_ip = current_log_entry.get('c_ip') or current_log_entry.get('c-ip') or current_log_entry.get('remote_addr') or current_log_entry.get('source_ip')
525
-
 
 
 
 
 
526
  sequence_text = []
527
  correlated_logs = []
528
-
529
- if target_ip and target_ip != "-": # 假設 '-' 是 W3C 中的空值
530
-
 
531
  # 篩選過去的 Log,最多 WINDOW_SIZE - 1 個,且 IP 必須匹配
532
  # 從 i-1 倒序檢查到 0
533
  for j in range(i - 1, -1, -1):
534
  prior_log_entry = logs_list[j]
535
- prior_ip = prior_log_entry.get('c_ip') or prior_log_entry.get('c-ip') or prior_log_entry.get('remote_addr') or prior_log_entry.get('source_ip')
536
-
 
 
 
 
537
  # 檢查 IP 是否匹配
538
  if prior_ip == target_ip:
539
  # 插入到最前面,保持時間順序
540
  correlated_logs.insert(0, formatted_logs[j])
541
-
542
- # 限制累積的 Log 數量(不包含當前 Log)
543
- if len(correlated_logs) >= WINDOW_SIZE - 1:
544
- break
545
-
546
  # 1. 加入相關聯的 Log (時間較早的)
547
- for j, log_str in enumerate(correlated_logs):
548
- # log_idx 是這些 Log 在 logs_list 中的原始索引 (不完全準確,但提供參考)
549
- sequence_text.append(f"--- Correlated Log Index (IP:{target_ip}) ---\n{log_str}")
550
-
551
  else:
552
  # 如果沒有找到 IP,只分析當前 Log (確保 sequence_text 不是空的)
553
- st.warning(f"Log #{i+1} 找不到 IP 欄位 ({target_ip}),僅分析當前 Log 條目。")
554
-
555
  # 2. 加入當前的目標 Log
556
  sequence_text.append(f"--- TARGET LOG TO ANALYZE (Index {i+1}) ---\n{current_log_str}")
557
-
558
  analysis_sequences.append({
559
  "sequence_text": "\n\n".join(sequence_text),
560
  "target_log_id": i + 1,
@@ -565,12 +581,17 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
565
  total_sequences = len(analysis_sequences)
566
  st.header(f"⚡ 批量分析執行中 (IP 關聯視窗 $N={WINDOW_SIZE}$)...")
567
  progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
568
- results_container = st.container()
569
-
 
 
570
  for i, seq_data in enumerate(analysis_sequences):
571
  log_id = seq_data["target_log_id"]
 
 
572
  progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
573
-
 
574
  try:
575
  response, retrieved_ctx = generate_rag_response_hf_for_log(
576
  client=inference_client,
@@ -584,7 +605,7 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
584
  temperature=temperature,
585
  top_p=top_p
586
  )
587
-
588
  item = {
589
  "log_id": log_id,
590
  "log_content": seq_data["original_log_entry"],
@@ -592,139 +613,93 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
592
  "analysis_result": response,
593
  "context": retrieved_ctx
594
  }
595
-
596
  st.session_state.batch_results.append(item)
597
-
598
- with results_container:
599
- # 呈現 LLM 分析結果
600
- is_high = any(x in response.lower() for x in ['high-risk detected!'])
601
- is_medium = any(x in response.lower() for x in ['medium-risk detected!'])
602
- if is_high:
603
- st.subheader(f"Log/Alert #{item['log_id']} (HIGH RISK DETECTED)")
604
- with st.expander("序列內容 (JSON Format)"):
605
- st.code(item["sequence_analyzed"], language='json')
606
- st.error(item['analysis_result'])
607
- st.markdown("---")
608
-
609
- elif is_medium:
610
- st.subheader(f"Log/Alert #{item['log_id']} (MEDIUM RISK DETECTED)")
611
- with st.expander("序列內容 (JSON Format)"):
612
- st.code(item["sequence_analyzed"], language='json')
613
- st.warning(item['analysis_result'])
614
- st.markdown("---")
615
-
616
- if item['context']:
617
- with st.expander("參考 RAG 片段"): st.code(item['context'])
618
-
619
  except Exception as e:
620
- st.error(f"Error Log {log_id}: {e}")
621
-
 
 
 
 
 
 
 
622
  end_time = time.time()
623
  progress_bar.empty()
 
624
  st.success(f"完成!耗時 {end_time - start_time:.2f} 秒。")
 
 
 
 
 
625
  else:
626
  st.error("無法提取有效 Log,請檢查檔案格式。")
627
 
628
- # === 顯示結果 (歷史紀錄) - 保持不變,但加固了 session state 檢查 ===
629
  if st.session_state.get("batch_results") and isinstance(st.session_state.batch_results, list) and st.session_state.batch_results:
630
  st.header("⚡ 歷史分析結果")
631
-
 
632
  high_risk_data = []
633
- high_risk_items = []
634
  medium_risk_data = []
635
- medium_risk_items = []
636
-
637
- for item in st.session_state.batch_results:
638
- # 檢查 analysis_result 中是否包含 'High-risk detected' (不區分大小寫)
639
- is_high_risk = 'high-risk detected!' in item['analysis_result'].lower()
640
-
641
- if is_high_risk:
642
- high_risk_items.append(item)
643
-
644
- # --- 為 CSV 報告準備數據 ---
645
- log_content_str = json.dumps(item["log_content"], ensure_ascii=False)
646
- analysis_result_clean = item['analysis_result'].replace('\n', ' | ')
647
-
648
- high_risk_data.append({
649
- "Log_ID": item['log_id'],
650
- "Risk_Level": "HIGH_RISK",
651
- "Log_Content": log_content_str,
652
- "AI_Analysis_Result": analysis_result_clean
653
- })
654
 
 
655
  for item in st.session_state.batch_results:
656
- # 檢查 analysis_result 中是否包含 'High-risk detected' (不區分大小寫)
657
- is_medium_risk = 'medium-risk detected!' in item['analysis_result'].lower()
658
-
659
- if is_medium_risk:
660
- medium_risk_items.append(item)
661
-
662
- # --- 為 CSV 報告準備數據 ---
663
  log_content_str = json.dumps(item["log_content"], ensure_ascii=False)
664
  analysis_result_clean = item['analysis_result'].replace('\n', ' | ')
665
-
666
- medium_risk_data.append({
667
  "Log_ID": item['log_id'],
668
- "Risk_Level": "MEDIUM_RISK",
669
  "Log_Content": log_content_str,
670
  "AI_Analysis_Result": analysis_result_clean
671
- })
672
-
673
- # 顯示 High-Risk 報告的下載按鈕 (改為 CSV 邏輯)
 
 
 
 
 
674
  report_container = st.container()
675
 
676
  with report_container:
677
- if high_risk_items:
678
- st.success(f"✅ 檢測到 {len(high_risk_items)} 條高風險 Log/Alert。")
679
-
680
- # --- 構建 CSV 內容 ---
681
  csv_output = io.StringIO()
682
- csv_output.write("Log_ID,Risk_Level,Log_Content,AI_Analysis_Result\n")
683
-
684
- def escape_csv(value):
685
- # 替換內容中的所有雙引號為兩個雙引號,然後用雙引號包圍
686
- return f'"{str(value).replace('"', '""')}"'
687
-
688
- for row in high_risk_data:
689
- line = ",".join([
690
- str(row["Log_ID"]),
691
- row["Risk_Level"],
692
- escape_csv(row["Log_Content"]),
693
- escape_csv(row["AI_Analysis_Result"])
694
- ]) + "\n"
695
- csv_output.write(line)
696
-
697
  csv_content = csv_output.getvalue()
698
-
699
  # 顯示 CSV 報告的下載按鈕
700
  st.download_button(
701
  "📥 下載 **高風險** 分析報告 (.csv)",
702
  csv_content,
703
  "high_risk_report.csv",
704
  "text/csv"
705
- )
706
- if medium_risk_items:
707
- st.success(f"✅ 檢測到 {len(medium_risk_items)} 條高風險 Log/Alert。")
708
-
709
- # --- 構建 CSV 內容 ---
 
710
  csv_output = io.StringIO()
711
- csv_output.write("Log_ID,Risk_Level,Log_Content,AI_Analysis_Result\n")
712
-
713
- def escape_csv(value):
714
- # 替換內容中的所有雙引號為兩個雙引號,然後用雙引號包圍
715
- return f'"{str(value).replace('"', '""')}"'
716
-
717
- for row in high_risk_data:
718
- line = ",".join([
719
- str(row["Log_ID"]),
720
- row["Risk_Level"],
721
- escape_csv(row["Log_Content"]),
722
- escape_csv(row["AI_Analysis_Result"])
723
- ]) + "\n"
724
- csv_output.write(line)
725
-
726
  csv_content = csv_output.getvalue()
727
-
728
  # 顯示 CSV 報告的下載按鈕
729
  st.download_button(
730
  "📥 下載 **中風險** 分析報告 (.csv)",
@@ -732,7 +707,46 @@ if st.session_state.get("batch_results") and isinstance(st.session_state.batch_r
732
  "medium_risk_report.csv",
733
  "text/csv"
734
  )
 
 
 
 
 
 
 
 
 
 
735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
736
  else:
737
- st.info("👍 未檢測到任何標註為 High-risk detected Log/Alert。")
738
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  from langchain_community.vectorstores.utils import DistanceStrategy
24
  from langchain_community.docstore.in_memory import InMemoryDocstore
25
 
26
+ # 嘗試匯入 pypdf
27
  try:
28
  import pypdf
29
  except ImportError:
 
57
  "fdtn-ai Foundation-Sec 8B Instruct (Hugging Face)": "fdtn-ai/Foundation-Sec-8B-Instruct",
58
  "Gemma 3 27B Instruct (Hugging Face)": "google/gemma-3-27b-it"
59
  }
 
60
  WINDOW_SIZE = 20 # 關聯 Log 的最大數量 (包含當前 Log)
61
 
62
  # === W3C Log 專屬解析器 (新增) ===
63
  def parse_w3c_log(log_content: str) -> List[Dict[str, Any]]:
64
  """
65
  解析 W3C Extended Log File Format (如 IIS Log),包括提取 #Fields:。
 
 
 
 
66
  """
67
  lines = log_content.splitlines()
68
  field_names = None
69
  data_lines = []
70
+
71
  for line in lines:
72
  line = line.strip()
73
  if not line:
74
  continue
75
+
76
  if line.startswith("#Fields:"):
77
  # 找到欄位定義,例如 "#Fields: date time s-ip cs-method ..."
78
  field_names = line.split()[1:] # 跳過 "#Fields:" 本身
79
  elif not line.startswith("#"):
80
  # 這是實際的資料行
81
  data_lines.append(line)
82
+
83
  if not field_names:
84
  # 如果沒有找到 #Fields,則退回到原始 Log 條目模式
 
85
  return [{"raw_log_entry": line} for line in lines if line.strip() and not line.startswith("#")]
86
 
87
  json_data = []
88
+
89
  # 定義需要轉換為數字的欄位名稱 (可根據您的需求擴充,使用底線版本)
90
+ # 這裡將W3C欄位名稱的'-'轉換為'_'進行匹配
91
  numeric_fields = ['sc_status', 'time_taken', 'bytes', 'resp_len', 'req_size']
92
+
93
  for data_line in data_lines:
94
  # W3C Log 預設使用空格分隔。這裡使用 split()
95
  values = data_line.split(' ')
96
+
97
  # 簡易的欄位數量檢查
98
  if len(values) != len(field_names):
99
  # 如果欄位數量不匹配,將該行視為原始 Log 條目
100
  json_data.append({"raw_log_entry": data_line})
101
  continue
102
+
103
  record = {}
104
  for key, value in zip(field_names, values):
105
  # 將 W3C 欄位名稱中的 '-' 替換成 Python 友好的 '_'
106
  key = key.strip().replace('-', '_')
107
+
108
  value = value.strip() if value else ""
109
+
110
  # 處理數字轉換
111
  if key in numeric_fields:
112
  try:
 
118
  record[key] = value
119
  else:
120
  record[key] = value
121
+
122
  if record:
123
  json_data.append(record)
124
+
125
  return json_data
126
 
127
  # === 核心檔案轉換函式 (CSV/TXT -> JSON List) (保留並微調) ===
 
132
  log_content = file_content.decode("utf-8").strip()
133
  if not log_content:
134
  return []
135
+
136
  string_io = io.StringIO(log_content)
137
+
138
  # 嘗試使用 csv.DictReader 自動將第一行視為 Key
139
  try:
140
  reader = csv.DictReader(string_io)
 
145
  json_data = []
146
  if reader and reader.fieldnames:
147
  # 使用者可能使用的數值欄位名稱
148
+ numeric_fields = ['sc_status', 'time_taken', 'bytes', 'resp_len', 'req_size', 'status_code', 'size', 'duration']
149
+
150
  for row in reader:
151
  record = {}
152
  for key, value in row.items():
153
  if key is None: continue
154
+
155
+ # 處理欄位名稱中的 '-'
156
+ key = key.strip().replace('-', '_')
157
  value = value.strip() if value else ""
158
+
159
  # 處理數字轉換
160
  if key in numeric_fields:
161
  try:
 
167
  record[key] = value
168
  else:
169
  record[key] = value
170
+
171
  if record:
172
  json_data.append(record)
173
+
174
+ # 再次檢查是否為空,如果是空,可能不是標準 CSV/JSON
175
  if not json_data:
176
  string_io.seek(0)
177
  lines = string_io.readlines()
178
  return [{"raw_log_entry": line.strip()} for line in lines if line.strip()]
179
+
180
  return json_data
181
 
182
  # === 檔案類型分發器 (已修改) ===
 
184
  """根據檔案類型,將上傳的檔案內容轉換為 Log JSON 列表。"""
185
  file_bytes = uploaded_file.getvalue()
186
  file_name_lower = uploaded_file.name.lower()
187
+
188
  # --- Case 1: JSON ---
189
  if file_name_lower.endswith('.json'):
190
  stringio = io.StringIO(file_bytes.decode("utf-8"))
191
  parsed_data = json.load(stringio)
192
+
193
  if isinstance(parsed_data, dict):
194
  # 處理包裹在 'alerts' 或 'logs' 鍵中的列表
195
  if 'alerts' in parsed_data and isinstance(parsed_data['alerts'], list):
 
202
  return parsed_data # 列表直接返回
203
  else:
204
  raise ValueError("JSON 檔案格式不支援 (非 List 或 Dict)。")
205
+
206
  # --- Case 2, 3, & 4: CSV/TXT/LOG ---
207
  elif file_name_lower.endswith(('.csv', '.txt', '.log')):
208
  file_type = 'csv' if file_name_lower.endswith('.csv') else ('log' if file_name_lower.endswith('.log') else 'txt')
209
+
210
  if file_type == 'log':
211
  # 針對 .log 檔案,嘗試使用 W3C 解析器
212
  log_content = file_bytes.decode("utf-8").strip()
213
  if not log_content: return []
214
  return parse_w3c_log(log_content)
215
+
216
  else:
217
  # CSV 和 TXT 保持使用原來的 csv.DictReader 邏輯
218
  return convert_csv_txt_to_json_list(file_bytes, file_type)
219
+
220
  else:
221
  raise ValueError("不支援的檔案類型。")
222
 
223
  # --- 側邊欄設定 (已更新 'type' 參數) ---
224
  with st.sidebar:
225
  st.header("⚙️ 設定")
226
+
227
  # --- 新增模型選單 ---
228
  selected_model_name = st.selectbox(
229
  "選擇 LLM 模型",
 
236
  st.error("環境變數 **HF_TOKEN** 未設定。請設定後重新啟動應用程式。")
237
  st.info(f"LLM 模型:**{MODEL_ID}** (Hugging Face Inference API)")
238
  st.warning("⚠️ **注意**: 該模型使用 Inference API 呼叫,請確保您的 HF Token 具有存取權限。")
239
+
240
  st.divider()
241
+
242
  st.subheader("📂 檔案上傳")
243
+
244
  # === 1. 批量分析檔案 (支援多種格式) ===
245
  batch_uploaded_file = st.file_uploader(
246
  "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
 
248
  key="batch_uploader",
249
  help="支援 JSON (Array), CSV (含標題), TXT/LOG (視為 W3C 或一般 Log)"
250
  )
251
+
252
  # === 2. RAG 知識庫檔案 ===
253
  rag_uploaded_file = st.file_uploader(
254
  "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
255
  type=['txt', 'py', 'log', 'csv', 'md', 'pdf'], # <--- 這裡增加了 'log'
256
  key="rag_uploader"
257
  )
258
+
259
  st.divider()
260
+
261
  st.subheader("💡 批量分析指令")
262
  analysis_prompt = st.text_area(
263
  "針對每個 Log/Alert 執行的指令",
 
265
  height=200
266
  )
267
  st.markdown("此指令將對檔案中的**每一個 Log 條目**執行一次獨立分析 (使用 **IP 關聯視窗**)。")
268
+
269
  if batch_uploaded_file:
270
  if st.button("🚀 執行批量分析"):
271
  if not os.environ.get("HF_TOKEN"):
 
278
  st.error("請先等待 Log 檔案解析完成。")
279
  else:
280
  st.info("請上傳 Log 檔案以啟用批量分析按鈕。")
281
+
282
  st.divider()
283
+
284
  st.subheader("🔍 RAG 檢索設定")
285
  similarity_threshold = st.slider("📐 Cosine Similarity 門檻", 0.0, 1.0, 0.4, 0.01)
286
+
287
  st.divider()
288
+
289
  st.subheader("模型參數")
290
  system_prompt = st.text_area("System Prompt", value="You are a Senior Security Analyst, named Ernest. You provide expert, authoritative, and concise advice on Information Security. Your analysis must be based strictly on the provided context.", height=100)
291
  max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
292
  temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
293
  top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
294
+
295
  st.divider()
296
+
297
  if st.button("🗑️ 清除所有紀錄"):
298
  # 僅清除動態狀態,保留 HF_TOKEN
299
  for key in list(st.session_state.keys()):
300
  if key not in ['HF_TOKEN']:
301
+ # 避免清除 cache resource
302
+ if not key.startswith('load_inference_client') and not key.startswith('load_embedding_model'):
303
+ del st.session_state[key]
304
  st.rerun()
305
 
306
+ # --- 初始化 Hugging Face LLM Client ---
 
307
  @st.cache_resource
308
  def load_inference_client(model_id):
309
  if not os.environ.get("HF_TOKEN"): return None
310
  try:
311
+ # 使用 os.environ.get("HF_TOKEN")
312
+ client = InferenceClient(model_id, token=os.environ.get("HF_TOKEN"))
313
+ st.toast(f"Hugging Face Inference Client **{model_id}** 載入成功。", icon="✅")
314
  return client
315
  except Exception as e:
316
+ # st.error(f"Hugging Face Inference Client 載入失敗: {e}") # 避免在每次 rerun 時都顯示
317
  return None
318
 
319
  inference_client = None
320
  if os.environ.get("HF_TOKEN"):
321
+ # 在主腳本開始時嘗試載入 client
322
+ inference_client = load_inference_client(MODEL_ID)
323
+ if inference_client is None:
324
+ st.warning(f"Hugging Face Inference Client **{MODEL_ID}** 無法連線或 HF_TOKEN 無效。")
 
 
325
  elif not os.environ.get("HF_TOKEN"):
326
  st.error("請在環境變數中設定 HF_TOKEN。")
327
 
 
344
  pdf_reader = pypdf.PdfReader(uploaded_file)
345
  for page in pdf_reader.pages:
346
  text_content += page.extract_text() + "\n"
347
+ else: return None, "PDF library missing (pip install pypdf)"
348
  else:
349
  stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
350
  text_content = stringio.read()
351
+
352
  if not text_content.strip(): return None, "File is empty"
353
+
354
  # 這裡將文件內容按行分割為 Document,每行一個 Document
355
  events = [line for line in text_content.splitlines() if line.strip()]
356
  docs = [Document(page_content=e) for e in events]
357
  if not docs: return None, "No documents created"
358
+
359
  # 進行 Embedding 和 FAISS 初始化 (IndexFlatIP + L2 normalization)
360
  embeddings = embedding_model.embed_documents([d.page_content for d in docs])
361
  embeddings_np = np.array(embeddings).astype("float32")
362
  faiss.normalize_L2(embeddings_np)
363
+
364
  dimension = embeddings_np.shape[1]
365
  index = faiss.IndexFlatIP(dimension) # 使用內積 (Inner Product)
366
  index.add(embeddings_np)
367
+
368
  doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
369
  docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
370
  index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
371
+
372
  # 使用 Cosine 距離策略,配合 IndexFlatIP 和 L2 normalization 達到 Cosine Similarity
373
  vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
374
  return vector_store, f"{len(docs)} chunks created."
 
382
  index = vector_store.index
383
  D, I = index.search(q_emb, k=index.ntotal)
384
  selected = []
385
+
386
  # Cosine Similarity = D (IndexFlatIP + L2 normalization)
387
  for score, idx in zip(D[0], I[0]):
388
  if idx == -1: continue
 
391
  doc_id = vector_store.index_to_docstore_id[idx]
392
  doc = vector_store.docstore.search(doc_id)
393
  selected.append((doc, score))
394
+
395
  selected.sort(key=lambda x: x[1], reverse=True)
396
  return selected
397
 
398
  # === Hugging Face 生成單一 Log 分析回答 (保持不變) ===
399
  def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
400
+ if client is None: return "ERROR: Client 未連線或 HF Token 無效", ""
401
+
402
  context_text = ""
403
+
404
  # RAG 檢索邏輯
405
  if vector_store:
406
+ try:
407
+ selected = faiss_cosine_search_all(vector_store, log_sequence_text, threshold)
408
+ if selected:
409
+ # 只取前 5 個最相關的片段
410
+ retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
411
+ context_text = "\n".join(retrieved_contents)
412
+ except Exception as e:
413
+ context_text = f"RAG Retrieval Error: {str(e)}"
414
 
415
+ rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ==={context_text if context_text else 'No relevant reference context found.'}=== END REFERENCE CONTEXT ===ANALYSIS INSTRUCTION: {user_prompt}Based on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""
416
+
417
  log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: Max {WINDOW_SIZE} logs associated by IP) ==={log_sequence_text}=== END LOG SEQUENCE ==="""
418
+
419
  messages = [
420
  {"role": "system", "content": sys_prompt},
421
  {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
422
  ]
423
+
424
  try:
425
  # 使用 chat_completion 進行模型呼叫
426
  response_stream = client.chat_completion(
 
432
  )
433
  if response_stream and response_stream.choices:
434
  return response_stream.choices[0].message.content.strip(), context_text
435
+ else:
436
+ return "Format Error: Model returned empty response or invalid format.", context_text
437
+
438
+ except Exception as e:
439
+ return f"Model Error: {str(e)}", context_text
440
 
441
  # =======================================================================
442
  # === 檔案處理區塊 (RAG 檔案) - 保持不變 ===
 
446
  # 清除舊的 vector store 以節省內存
447
  if 'vector_store' in st.session_state:
448
  del st.session_state.vector_store
449
+
450
  with st.spinner(f"正在建立 RAG 參考知識庫 ({rag_uploaded_file.name})..."):
451
  vs, msg = process_file_to_faiss(rag_uploaded_file)
452
  if vs:
 
456
  else:
457
  st.session_state.rag_current_file_key = None
458
  st.error(msg)
459
+ elif 'vector_store' in st.session_state and 'rag_current_file_key' in st.session_state:
460
  del st.session_state.vector_store
461
  del st.session_state.rag_current_file_key
462
  st.info("RAG 檔案已移除,已清除相關知識庫。")
 
464
  # === 檔案處理區塊 (批量分析檔案 - **已更新** ) ===
465
  if batch_uploaded_file:
466
  batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
467
+
468
  if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
469
  try:
470
  # 清除舊的數據
 
472
  del st.session_state.json_data_for_batch
473
  if 'batch_results' in st.session_state:
474
  del st.session_state.batch_results
475
+
476
  # 使用新的統一解析函式
477
  parsed_data = convert_uploaded_file_to_json_list(batch_uploaded_file)
478
+
479
  if not parsed_data:
480
  raise ValueError(f"{batch_uploaded_file.name} 檔案載入失敗或內容為空。")
481
+
482
  # 儲存處理後的數據
483
  st.session_state.json_data_for_batch = parsed_data
484
  st.session_state.batch_current_file_key = batch_file_key
485
  st.toast(f"檔案已解析並轉換為 {len(parsed_data)} 個 Log 條目。", icon="✅")
486
+
487
  except Exception as e:
488
  st.error(f"檔案解析錯誤: {e}")
489
  if 'json_data_for_batch' in st.session_state:
490
  del st.session_state.json_data_for_batch
491
  st.session_state.batch_current_file_key = None # 設置為 None 避免錯誤的 Key
492
+
493
  elif 'json_data_for_batch' in st.session_state:
494
  # 檔案被移除,清除相關數據
495
  del st.session_state.json_data_for_batch
 
499
  del st.session_state.batch_results
500
  st.info("批量分析檔案已移除,已清除相關數據和結果。")
501
 
502
+
503
  # === 執行批量分析邏輯 (已修改為 IP 關聯視窗) ===
504
  if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state and st.session_state.json_data_for_batch is not None:
505
  st.session_state.execute_batch_analysis = False
506
  start_time = time.time()
507
+
508
+ # 執行前確保清空結果
 
 
 
509
  st.session_state.batch_results = []
510
+
511
  if inference_client is None:
512
+ st.error("Client 未連線,無法執行。請檢查 HF_TOKEN 和模型設定。")
513
  else:
514
  logs_list = st.session_state.json_data_for_batch
515
+
516
  if logs_list:
517
  vs = st.session_state.get("vector_store", None)
518
+
519
  # 將 Log 條目轉換為 JSON 字串,用於 LLM 輸入
520
  formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
521
+
522
  analysis_sequences = []
523
+
524
  # --- 核心修改:基於 IP 關聯的 Log Sequence 建構 ---
525
  for i in range(len(formatted_logs)):
526
  current_log_entry = logs_list[i]
527
  current_log_str = formatted_logs[i]
528
+
529
  # 嘗試從當前 Log 條目中提取 IP 地址 (優先 W3C 格式,然後是一般日誌格式)
530
+ # 這裡需要根據您的日誌格式調整 key,常見的有 c_ip, remote_addr, source_ip 等
531
+ # 我們使用 W3C 和一般日誌中常見的 key
532
# Extract the client IP from the current entry, trying the W3C key ('c_ip')
# first, then common generic log keys, in priority order.
# The original expression looked up 'c_ip' twice; the duplicate is removed
# (behavior unchanged — `x or x` equals `x`).
# NOTE(review): if the second lookup was meant to be a different key such
# as 'cs_ip', add it here — TODO confirm against the actual log schema.
target_ip = (current_log_entry.get('c_ip') or
             current_log_entry.get('remote_addr') or
             current_log_entry.get('source_ip') or
             current_log_entry.get('client_ip'))
537
+
538
  sequence_text = []
539
  correlated_logs = []
540
+
541
+ # 檢查 IP 是否有效
542
+ if target_ip and target_ip != "-":
543
+
544
  # 篩選過去的 Log,最多 WINDOW_SIZE - 1 個,且 IP 必須匹配
545
  # 從 i-1 倒序檢查到 0
546
  for j in range(i - 1, -1, -1):
547
  prior_log_entry = logs_list[j]
548
# Same IP extraction as for the target log entry; the duplicated 'c_ip'
# lookup from the original is removed (no behavior change).
prior_ip = (prior_log_entry.get('c_ip') or
            prior_log_entry.get('remote_addr') or
            prior_log_entry.get('source_ip') or
            prior_log_entry.get('client_ip'))
553
+
554
  # 檢查 IP 是否匹配
555
  if prior_ip == target_ip:
556
  # 插入到最前面,保持時間順序
557
  correlated_logs.insert(0, formatted_logs[j])
558
+
559
+ # 限制累積的 Log 數量(不包含當前 Log)
560
+ if len(correlated_logs) >= WINDOW_SIZE - 1:
561
+ break
562
+
563
  # 1. 加入相關聯的 Log (時間較早的)
564
+ for log_str in correlated_logs:
565
+ sequence_text.append(f"--- Correlated Log (IP:{target_ip}) ---\n{log_str}")
566
+
 
567
  else:
568
  # 如果沒有找到 IP,只分析當前 Log (確保 sequence_text 不是空的)
569
+ pass # sequence_text 最終只會包含 TARGET LOG
570
+
571
  # 2. 加入當前的目標 Log
572
  sequence_text.append(f"--- TARGET LOG TO ANALYZE (Index {i+1}) ---\n{current_log_str}")
573
+
574
  analysis_sequences.append({
575
  "sequence_text": "\n\n".join(sequence_text),
576
  "target_log_id": i + 1,
 
581
  total_sequences = len(analysis_sequences)
582
  st.header(f"⚡ 批量分析執行中 (IP 關聯視窗 $N={WINDOW_SIZE}$)...")
583
  progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
584
+
585
+ # 使用一個佔位符來顯示即時進度或警告,而不是結果
586
+ status_placeholder = st.empty()
587
+
588
  for i, seq_data in enumerate(analysis_sequences):
589
  log_id = seq_data["target_log_id"]
590
+
591
+ # 顯示進度
592
  progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
593
+ status_placeholder.text(f"正在分析 Log #{log_id} (IP 序列長度: {seq_data['sequence_text'].count('---')})...")
594
+
595
  try:
596
  response, retrieved_ctx = generate_rag_response_hf_for_log(
597
  client=inference_client,
 
605
  temperature=temperature,
606
  top_p=top_p
607
  )
608
+
609
  item = {
610
  "log_id": log_id,
611
  "log_content": seq_data["original_log_entry"],
 
613
  "analysis_result": response,
614
  "context": retrieved_ctx
615
  }
616
+
617
  st.session_state.batch_results.append(item)
618
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619
  except Exception as e:
620
+ st.session_state.batch_results.append({
621
+ "log_id": log_id,
622
+ "log_content": seq_data["original_log_entry"],
623
+ "sequence_analyzed": seq_data["sequence_text"],
624
+ "analysis_result": f"Model Execution Error: {e}",
625
+ "context": ""
626
+ })
627
+ status_placeholder.error(f"Error Log {log_id}: {e}")
628
+
629
  end_time = time.time()
630
  progress_bar.empty()
631
+ status_placeholder.empty()
632
  st.success(f"完成!耗時 {end_time - start_time:.2f} 秒。")
633
+
634
+ # 由於結果已在 session state 中,觸發一次重新運行以顯示歷史結果
635
+ # 這是必要的,因為批量分析在一個 if 區塊內執行,需要重新執行腳本來執行後續的顯示邏輯。
636
+ st.rerun()
637
+
638
  else:
639
  st.error("無法提取有效 Log,請檢查檔案格式。")
640
 
641
+ # === 顯示結果 (歷史紀錄) - 已修改為持久顯示結果,而非僅在執行時顯示 ===
642
  if st.session_state.get("batch_results") and isinstance(st.session_state.batch_results, list) and st.session_state.batch_results:
643
  st.header("⚡ 歷史分析結果")
644
+
645
+ # 初始化數據列表
646
  high_risk_data = []
 
647
  medium_risk_data = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
 
649
# Re-walk the stored batch results and bucket each one by detected risk
# level; only HIGH/MEDIUM hits produce a CSV record.
for result in st.session_state.batch_results:
    lowered = result['analysis_result'].lower()
    high_hit = 'high-risk detected!' in lowered
    medium_hit = 'medium-risk detected!' in lowered

    if not (high_hit or medium_hit):
        continue

    record = {
        "Log_ID": result['log_id'],
        "Risk_Level": "HIGH_RISK" if high_hit else "MEDIUM_RISK",
        "Log_Content": json.dumps(result["log_content"], ensure_ascii=False),
        # Flatten newlines so each analysis stays on a single CSV row.
        "AI_Analysis_Result": result['analysis_result'].replace('\n', ' | '),
    }

    # After the guard above exactly one of the two flags is set, so this
    # conditional-target append matches the original if/elif pair.
    (high_risk_data if high_hit else medium_risk_data).append(record)
670
+
671
+ # --- 下載按鈕邏輯 ---
672
  report_container = st.container()
673
 
674
  with report_container:
675
+ if high_risk_data:
676
+ st.success(f"✅ 檢測到 {len(high_risk_data)} 條高風險 Log/Alert。")
677
+
678
+ # --- 構建 High-Risk CSV 內容 ---
679
  csv_output = io.StringIO()
680
+ writer = csv.DictWriter(csv_output, fieldnames=["Log_ID", "Risk_Level", "Log_Content", "AI_Analysis_Result"])
681
+ writer.writeheader()
682
+ writer.writerows(high_risk_data)
 
 
 
 
 
 
 
 
 
 
 
 
683
  csv_content = csv_output.getvalue()
684
+
685
  # 顯示 CSV 報告的下載按鈕
686
  st.download_button(
687
  "📥 下載 **高風險** 分析報告 (.csv)",
688
  csv_content,
689
  "high_risk_report.csv",
690
  "text/csv"
691
+ )
692
+
693
+ if medium_risk_data:
694
+ st.warning(f"⚠️ 檢測到 {len(medium_risk_data)} 條中風險 Log/Alert。")
695
+
696
+ # --- 構建 Medium-Risk CSV 內容 ---
697
  csv_output = io.StringIO()
698
+ writer = csv.DictWriter(csv_output, fieldnames=["Log_ID", "Risk_Level", "Log_Content", "AI_Analysis_Result"])
699
+ writer.writeheader()
700
+ writer.writerows(medium_risk_data)
 
 
 
 
 
 
 
 
 
 
 
 
701
  csv_content = csv_output.getvalue()
702
+
703
  # 顯示 CSV 報告的下載按鈕
704
  st.download_button(
705
  "📥 下載 **中風險** 分析報告 (.csv)",
 
707
  "medium_risk_report.csv",
708
  "text/csv"
709
  )
710
+
711
# De Morgan: "not a and not b" == "not (a or b)".
if not (high_risk_data or medium_risk_data):
    st.info("👍 未檢測到任何標註為 High-risk 或 Medium-risk 的 Log/Alert。")

# === Persistently render the detailed analysis for every processed log ===
st.markdown("---")
st.subheader("分析結果詳細列表")

# Walk the stored results in ascending Log ID order.
all_results = sorted(st.session_state.batch_results, key=lambda r: r['log_id'])

for entry in all_results:
    verdict = entry['analysis_result']
    lowered = verdict.lower()
    high = 'high-risk detected!' in lowered
    medium = 'medium-risk detected!' in lowered

    # Pick the header suffix and matching Streamlit alert widget in one step.
    if high:
        suffix, alert_fn = "(HIGH RISK DETECTED) 🔴", st.error
    elif medium:
        suffix, alert_fn = "(MEDIUM RISK DETECTED) 🟠", st.warning
    else:
        suffix, alert_fn = "(Low/No Risk Detected) ", st.info

    st.subheader(f"Log/Alert #{entry['log_id']} {suffix}")

    # Show the analyzed sequence, the model verdict, and any RAG context.
    with st.expander("序列內容 (JSON Format)"):
        st.code(entry["sequence_analyzed"], language='json')

    alert_fn(verdict)

    if entry.get('context'):
        with st.expander("參考 RAG 片段"):
            st.code(entry['context'], language='text')

    st.markdown("---")