ss900371tw committed on
Commit
c7f04bb
·
verified ·
1 Parent(s): cba1cd9

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +432 -172
src/streamlit_app.py CHANGED
@@ -1,14 +1,14 @@
1
-
2
  import streamlit as st
3
  import os
4
  import io
5
  import json
6
- import csv # <--- 新增:用於處理 CSV
7
  import numpy as np
8
  import faiss
9
  import uuid
10
  import time
11
  import sys
 
12
 
13
  # === HuggingFace 模型相關套件 (替換為 InferenceClient) ===
14
  try:
@@ -23,7 +23,7 @@ from langchain_community.vectorstores import FAISS
23
  from langchain_community.vectorstores.utils import DistanceStrategy
24
  from langchain_community.docstore.in_memory import InMemoryDocstore
25
 
26
- # 嘗試匯入 pypdftry
27
  try:
28
  import pypdf
29
  except ImportError:
@@ -31,91 +31,248 @@ except ImportError:
31
 
32
  # --- 頁面設定 ---
33
  st.set_page_config(page_title="Cybersecurity AI Assistant (Hugging Face RAG & Batch Analysis)", page_icon="🛡️", layout="wide")
34
- st.title("🛡️ Meta-Llama-3-8B-Instruct with FAISS RAG & Batch Analysis (Inference Client)")
35
- st.markdown("已啟用:**IndexFlatIP** + **L2 正規化** + **Hugging Face Inference Client (API)**。支援 JSON/CSV/TXT 執行批量分析。")
36
 
 
37
  if 'execute_batch_analysis' not in st.session_state:
38
  st.session_state.execute_batch_analysis = False
39
  if 'batch_results' not in st.session_state:
40
- st.session_state.batch_results = None
41
  if 'rag_current_file_key' not in st.session_state:
42
  st.session_state.rag_current_file_key = None
43
- if 'batch_current_file_key' not in st.session_state: # 修改變數名稱以反映多格式
44
  st.session_state.batch_current_file_key = None
45
  if 'vector_store' not in st.session_state:
46
  st.session_state.vector_store = None
47
- if 'json_data_for_batch' not in st.session_state: # 變數名稱保留,但內容可能是轉換後的 dict
48
  st.session_state.json_data_for_batch = None
49
 
50
  # 設定模型 ID
51
- MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
52
- WINDOW_SIZE = 8
53
 
54
- # --- 側邊欄設定 ---
55
- with st.sidebar:
56
- st.header("⚙️ 設定")
57
-
58
- if not os.environ.get("HF_TOKEN"):
59
- st.error("環境變數 **HF_TOKEN** 未設定。請設定後重新啟動應用程式。")
60
-
61
- st.info(f"LLM 模型:**{MODEL_ID}** (Hugging Face Inference API)")
62
- st.warning("⚠️ **注意**: 該模型使用 Inference API 呼叫,請確保您的 HF Token 具有存取權限。")
63
-
64
- st.divider()
65
- st.subheader("📂 檔案上傳")
66
-
67
- # === 1. 批量分析檔案 (修改處:支援多種格式) ===
68
- batch_uploaded_file = st.file_uploader(
69
- "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
70
- type=['json', 'csv', 'txt'], # <--- 修改:新增 csv 和 txt
71
- key="batch_uploader",
72
- help="支援 JSON (Array), CSV (含標題), TXT (每行一條 Log)"
73
- )
 
 
 
 
 
 
 
 
 
74
 
75
- # === 2. RAG 知識庫檔案 ===
76
- rag_uploaded_file = st.file_uploader(
77
- "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
78
- type=['txt', 'py', 'log', 'csv', 'md', 'pdf'],
79
- key="rag_uploader"
80
- )
81
 
82
- st.divider()
83
-
84
- st.subheader("💡 批量分析指令")
85
- analysis_prompt = st.text_area(
86
- "針對每個 Log/Alert 執行的指令",
87
- value="You are a security expert in charge of analyzing alerts related to Web Application Attacks and Brute Force & Reconnaissance. Respond with a clear, structured analysis using the following mandatory sections: \n\n- Priority: Provide the overall priority level. (Answer High risk, Medium risk, or Low risk only) \n- Explanation: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, explain the potential impact and why this specific alert requires attention. If not, **omit the explanation section**. \n- Action Plan: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, What should be the immediate steps to address this specific alert? If not, **omit the action plan section**. \n\nStrictly use the information in the provided Log.",
88
- height=200
89
- )
90
- st.markdown("此指令將對檔案中的**每一個 Log 條目**執行一次獨立分析。")
91
-
92
- if batch_uploaded_file:
93
- if st.button("🚀 執行批量分析"):
94
- if not os.environ.get("HF_TOKEN"):
95
- st.error("無法執行,環境變數 **HF_TOKEN** 未設定。")
96
- else:
97
- st.session_state.execute_batch_analysis = True
98
- else:
99
- st.info("請上傳 Log 檔案以啟用批量分析按鈕。")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- st.divider()
102
- st.subheader("🔍 RAG 檢索設定")
103
- similarity_threshold = st.slider("📐 Cosine Similarity 門檻", 0.0, 1.0, 0.4, 0.01)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- st.divider()
106
- st.subheader("模型參數")
107
- system_prompt = st.text_area("System Prompt", value="You are a Senior Security Analyst, named Ernest. You provide expert, authoritative, and concise advice on Information Security. Your analysis must be based strictly on the provided context.", height=100)
108
- max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
109
- temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
110
- top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
 
111
 
112
- st.divider()
113
- if st.button("🗑️ 清除所有紀錄"):
114
- for key in list(st.session_state.keys()):
115
- del st.session_state[key]
116
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- # --- 初始化 Hugging Face LLM Client ---
 
119
  @st.cache_resource
120
  def load_inference_client(model_id):
121
  if not os.environ.get("HF_TOKEN"): return None
@@ -131,9 +288,10 @@ inference_client = None
131
  if os.environ.get("HF_TOKEN"):
132
  with st.spinner(f"正在連線到 Inference Client: {MODEL_ID}..."):
133
  inference_client = load_inference_client(MODEL_ID)
 
134
  if inference_client is None and os.environ.get("HF_TOKEN"):
135
  st.warning("Hugging Face Inference Client 無法連線。")
136
- elif not os.environ.get("HF_TOKEN"):
137
  st.error("請在環境變數中設定 HF_TOKEN。")
138
 
139
  # === Embedding 模型 (保持不變) ===
@@ -159,25 +317,25 @@ def process_file_to_faiss(uploaded_file):
159
  else:
160
  stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
161
  text_content = stringio.read()
162
-
163
  if not text_content.strip(): return None, "File is empty"
164
-
165
  events = [line for line in text_content.splitlines() if line.strip()]
166
  docs = [Document(page_content=e) for e in events]
167
  if not docs: return None, "No documents created"
168
-
169
  embeddings = embedding_model.embed_documents([d.page_content for d in docs])
170
  embeddings_np = np.array(embeddings).astype("float32")
171
  faiss.normalize_L2(embeddings_np)
172
-
173
  dimension = embeddings_np.shape[1]
174
  index = faiss.IndexFlatIP(dimension)
175
  index.add(embeddings_np)
176
-
177
  doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
178
  docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
179
  index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
180
-
181
  vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
182
  return vector_store, f"{len(docs)} chunks created."
183
  except Exception as e:
@@ -199,32 +357,72 @@ def faiss_cosine_search_all(vector_store, query, threshold):
199
  selected.sort(key=lambda x: x[1], reverse=True)
200
  return selected
201
 
202
- # === Hugging Face 生成單一 Log 分析回答 (保持不變) ===
203
- def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
204
- if client is None: return "ERROR: Client Error", ""
205
- context_text = ""
206
- if vector_store:
207
- selected = faiss_cosine_search_all(vector_store, log_sequence_text, threshold)
208
- if selected:
209
- retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
210
- context_text = "\n".join(retrieved_contents)
211
-
212
- rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ==={context_text if context_text else 'No relevant reference context found.'}=== END REFERENCE CONTEXT ===\nANALYSIS INSTRUCTION: {user_prompt}\nBased on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""
213
- log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===\n{log_sequence_text}\n=== END LOG SEQUENCE ==="""
214
-
215
- messages = [
216
- {"role": "system", "content": sys_prompt},
217
- {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
218
- ]
219
- try:
220
- response_stream = client.chat_completion(messages, max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, stream=False)
221
- if response_stream and response_stream.choices:
222
- return response_stream.choices[0].message.content.strip(), context_text
223
- else: return "Format Error", context_text
224
- except Exception as e: return f"Model Error: {str(e)}", context_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
  # =======================================================================
227
- # === 檔案處理區塊 (RAG 檔案) ===
228
  if rag_uploaded_file:
229
  file_key = f"vs_{rag_uploaded_file.name}_{rag_uploaded_file.size}"
230
  if st.session_state.rag_current_file_key != file_key or 'vector_store' not in st.session_state:
@@ -240,44 +438,27 @@ elif 'vector_store' in st.session_state:
240
  del st.session_state.rag_current_file_key
241
  st.info("RAG 檔案已移除,已清除相關知識庫。")
242
 
243
- # === 檔案處理區塊 (批量分析檔案 - 重大修改處) ===
244
- # 支援 JSON, CSV, TXT 並統一轉換為 list of dicts
245
  if batch_uploaded_file:
246
  batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
247
-
248
  if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
249
  try:
250
- stringio = io.StringIO(batch_uploaded_file.getvalue().decode("utf-8"))
251
- parsed_data = None
252
-
253
- # --- Case 1: JSON ---
254
- if batch_uploaded_file.name.lower().endswith('.json'):
255
- parsed_data = json.load(stringio)
256
- st.toast("JSON 檔案載入成功", icon="📄")
257
-
258
- # --- Case 2: CSV ---
259
- elif batch_uploaded_file.name.lower().endswith('.csv'):
260
- # 使用 DictReader 將 CSV 轉為 List of Dicts
261
- reader = csv.DictReader(stringio)
262
- parsed_data = list(reader)
263
- st.toast("CSV 檔案已轉換為 JSON 結構", icon="📊")
264
-
265
- # --- Case 3: TXT ---
266
- else: # 預設為 TXT
267
- # 將每一行包裝成一個 JSON 物件: {"raw_content": "line text"}
268
- lines = stringio.readlines()
269
- parsed_data = [{"raw_log_entry": line.strip()} for line in lines if line.strip()]
270
- st.toast("TXT 檔案已轉換為 JSON 結構", icon="📝")
271
-
272
  # 儲存處理後的數據
273
  st.session_state.json_data_for_batch = parsed_data
274
  st.session_state.batch_current_file_key = batch_file_key
275
-
 
276
  except Exception as e:
277
  st.error(f"檔案解析錯誤: {e}")
278
  if 'json_data_for_batch' in st.session_state:
279
  del st.session_state.json_data_for_batch
280
-
281
  elif 'json_data_for_batch' in st.session_state:
282
  del st.session_state.json_data_for_batch
283
  del st.session_state.batch_current_file_key
@@ -285,65 +466,87 @@ elif 'json_data_for_batch' in st.session_state:
285
  del st.session_state.batch_results
286
  st.info("批量分析檔案已移除,已清除相關數據。")
287
 
288
- # === 執行批量分析邏輯 ===
289
  if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
290
  st.session_state.execute_batch_analysis = False
291
  start_time = time.time()
292
  st.session_state.batch_results = []
293
-
294
  if inference_client is None:
295
  st.error("Client 未連線,無法執行。")
296
  else:
297
- data_to_process = st.session_state.json_data_for_batch
298
- logs_list = []
299
-
300
- # 處理不同的 JSON 結構 (Dict vs List)
301
- if isinstance(data_to_process, list):
302
- logs_list = data_to_process
303
- elif isinstance(data_to_process, dict):
304
- # 嘗試尋找常見的 key
305
- if 'alerts' in data_to_process and isinstance(data_to_process['alerts'], list):
306
- logs_list = data_to_process['alerts']
307
- elif 'logs' in data_to_process and isinstance(data_to_process['logs'], list):
308
- logs_list = data_to_process['logs']
309
- else:
310
- logs_list = [data_to_process]
311
- else:
312
- logs_list = [data_to_process]
313
-
314
  if logs_list:
315
  vs = st.session_state.get("vector_store", None)
316
-
317
  # --- 關鍵:在這裡做 JSON String 的轉換 ---
318
- # 無論來源是 CSV(Dict) 還是 TXT(Dict),都在這裡用 json.dumps 轉成字串
319
- # 這保證了 Prompt 收到的永遠是 JSON 格式的文字
320
  formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
321
-
322
  analysis_sequences = []
 
 
323
  for i in range(len(formatted_logs)):
324
- start_index = max(0, i - WINDOW_SIZE + 1)
325
- end_index = i + 1
326
- current_window = formatted_logs[start_index:end_index]
 
 
 
 
327
  sequence_text = []
328
- for j, log_str in enumerate(current_window):
329
- is_target = " <<< TARGET LOG TO ANALYZE" if j == len(current_window) - 1 else ""
330
- sequence_text.append(f"--- Log Index {i - len(current_window) + j + 1} ({len(current_window)-j} prior logs){is_target} ---\n{log_str}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  analysis_sequences.append({
332
  "sequence_text": "\n\n".join(sequence_text),
333
  "target_log_id": i + 1,
334
  "original_log_entry": logs_list[i]
335
  })
336
-
 
337
  total_sequences = len(analysis_sequences)
338
- st.header(f"⚡ 批量分析執行中 (平移視窗 $N={WINDOW_SIZE}$)...")
339
  progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
340
  results_container = st.container()
341
  full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
342
-
343
  for i, seq_data in enumerate(analysis_sequences):
344
  log_id = seq_data["target_log_id"]
345
  progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
346
-
347
  try:
348
  response, retrieved_ctx = generate_rag_response_hf_for_log(
349
  client=inference_client,
@@ -365,36 +568,93 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
365
  "context": retrieved_ctx
366
  }
367
  st.session_state.batch_results.append(item)
368
-
369
  with results_container:
370
  st.subheader(f"Log/Alert #{item['log_id']}")
371
  with st.expander("序列內容 (JSON Format)"):
372
- st.code(item["sequence_analyzed"], language='json') # 這裡顯示的會是 JSON 格式
373
-
374
- is_high = any(x in response.lower() for x in ['high risk'])
375
  if is_high: st.error(item['analysis_result'])
376
  else: st.info(item['analysis_result'])
377
  if item['context']:
378
  with st.expander("參考 RAG 片段"): st.code(item['context'])
379
  st.markdown("---")
380
-
381
  log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
382
  full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\nResult:\n{item['analysis_result']}\n")
383
-
384
  except Exception as e:
385
  st.error(f"Error Log {log_id}: {e}")
386
-
387
  end_time = time.time()
388
  progress_bar.empty()
389
  st.success(f"完成!耗時 {end_time - start_time:.2f} 秒。")
390
  else:
391
  st.error("無法提取有效 Log,請檢查檔案格式。")
392
 
393
- # === 顯示結果 (歷史紀錄) ===
394
  if st.session_state.get("batch_results") and not st.session_state.execute_batch_analysis:
395
  st.header("⚡ 歷史分析結果")
396
- full_report_chunks = ["## Report\n\n"]
 
 
 
 
 
397
  for item in st.session_state.batch_results:
398
- log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
399
- full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\n{item['analysis_result']}\n")
400
- st.download_button("📥 下載完整報告 (.md)", "\n".join(full_report_chunks), "report.md", "text/markdown")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import os
3
  import io
4
  import json
5
+ import csv
6
  import numpy as np
7
  import faiss
8
  import uuid
9
  import time
10
  import sys
11
+ from typing import List, Dict, Any
12
 
13
  # === HuggingFace 模型相關套件 (替換為 InferenceClient) ===
14
  try:
 
23
  from langchain_community.vectorstores.utils import DistanceStrategy
24
  from langchain_community.docstore.in_memory import InMemoryDocstore
25
 
26
+ # 嘗試匯入 pypdf
27
  try:
28
  import pypdf
29
  except ImportError:
 
31
 
32
  # --- 頁面設定 ---
33
  st.set_page_config(page_title="Cybersecurity AI Assistant (Hugging Face RAG & Batch Analysis)", page_icon="🛡️", layout="wide")
34
+ st.title("🛡️ fdtn-ai/Foundation-Sec-8B-Instruct with FAISS RAG & Batch Analysis (Inference Client)")
35
+ st.markdown("已啟用:**IndexFlatIP** + **L2 正規化** + **Hugging Face Inference Client (API)**。支援 JSON/CSV/TXT/**W3C Log** 執行批量分析。**批量分析序列已改為基於 IP 篩選。**")
36
 
37
+ # --- Streamlit Session State 初始化 (保持不變) ---
38
  if 'execute_batch_analysis' not in st.session_state:
39
  st.session_state.execute_batch_analysis = False
40
  if 'batch_results' not in st.session_state:
41
+ st.session_state.batch_results = None
42
  if 'rag_current_file_key' not in st.session_state:
43
  st.session_state.rag_current_file_key = None
44
+ if 'batch_current_file_key' not in st.session_state:
45
  st.session_state.batch_current_file_key = None
46
  if 'vector_store' not in st.session_state:
47
  st.session_state.vector_store = None
48
+ if 'json_data_for_batch' not in st.session_state:
49
  st.session_state.json_data_for_batch = None
50
 
51
  # 設定模型 ID
52
+ MODEL_ID = "fdtn-ai/Foundation-Sec-1.1-8B-Instruct"
53
+ WINDOW_SIZE = 20
54
 
55
# === W3C log parser: converts W3C Extended Log File Format into JSON records ===
def parse_w3c_log(log_content: str) -> List[Dict[str, Any]]:
    """
    Parse W3C Extended Log File Format content (e.g. IIS logs) into dicts.

    Column names come from the ``#Fields:`` directive line; other
    ``#``-prefixed directive lines (``#Software``, ``#Date`` ...) are ignored.

    Args:
        log_content: Raw text content of the log file.

    Returns:
        One dict per data line. Field names have ``-`` replaced by ``_``
        (e.g. ``c-ip`` -> ``c_ip``); selected fields are coerced to numbers.
        If no ``#Fields:`` directive is found, each non-directive, non-empty
        line is returned as ``{"raw_log_entry": <line>}``.
    """
    lines = log_content.splitlines()
    field_names = None
    data_lines = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith("#Fields:"):
            # Field definition, e.g. "#Fields: date time s-ip cs-method ...".
            # split() tolerates any run of whitespace between names.
            field_names = line.split()[1:]  # drop the "#Fields:" token itself
        elif not line.startswith("#"):
            data_lines.append(line)  # actual data row

    if not field_names:
        # No #Fields: header found - fall back to raw log-entry mode.
        # FIX: previously this fallback also emitted '#' directive lines and
        # unstripped text; directives are metadata, not log entries.
        st.warning("未檢測到 W3C Log 的 #Fields: 標頭,退回原始 Log 條目模式。")
        return [
            {"raw_log_entry": line.strip()}
            for line in lines
            if line.strip() and not line.strip().startswith("#")
        ]

    json_data = []

    # Fields coerced to int/float (underscore form, after '-' -> '_' mapping).
    numeric_fields = ['sc_status', 'time_taken', 'bytes', 'resp_len', 'req_size']

    for data_line in data_lines:
        # FIX: use split() (any whitespace run) instead of split(' ') so that
        # tab-separated or multi-space rows still align with field_names,
        # which were themselves extracted with split(). W3C values never
        # contain literal spaces (empty values are encoded as '-').
        values = data_line.split()

        # Row/field count mismatch: keep the raw line instead of guessing.
        if len(values) != len(field_names):
            json_data.append({"raw_log_entry": data_line})
            continue

        record = {}
        for key, value in zip(field_names, values):
            # Make W3C names Python-friendly: '-' -> '_'.
            key = key.strip().replace('-', '_')

            value = value.strip() if value else ""

            # Numeric coercion: int first, then float, else keep the string.
            if key in numeric_fields:
                try:
                    record[key] = int(value)
                except ValueError:
                    try:
                        record[key] = float(value)
                    except ValueError:
                        record[key] = value
            else:
                record[key] = value

        if record:
            json_data.append(record)
    return json_data
122
+
123
# === Core file converter (CSV/TXT -> JSON list) ===
def convert_csv_txt_to_json_list(file_content: bytes, file_type: str) -> List[Dict[str, Any]]:
    """
    Convert CSV or TXT file content (assumed CSV with a header row) into a
    list of JSON-style dicts. Handles non-W3C CSV/TXT files only.

    Args:
        file_content: Raw file bytes (UTF-8 encoded).
        file_type: Original file-type hint ('csv' or 'txt'); retained for the
            public signature, not needed by the parsing logic itself.

    Returns:
        One dict per CSV row, or ``{"raw_log_entry": <line>}`` per line when
        the content cannot be parsed as CSV (or yields no rows).
    """
    log_content = file_content.decode("utf-8").strip()
    if not log_content:
        return []
    string_io = io.StringIO(log_content)

    # Fields coerced to int/float (raw, hyphenated CSV header names).
    numeric_fields = ['sc-status', 'time-taken', 'bytes', 'resp-len', 'req-size']

    json_data = []
    # FIX: csv.DictReader is lazy - constructing it almost never raises;
    # parse errors surface while *iterating*. The old code wrapped only the
    # constructor, so the raw-log fallback below was unreachable. Wrap the
    # iteration instead so malformed CSV really falls back.
    try:
        reader = csv.DictReader(string_io)
        for row in reader:
            record = {}
            for key, value in row.items():
                if key is None: continue

                key = key.strip()
                value = value.strip() if value else ""

                # Numeric coercion: int first, then float, else keep string.
                if key in numeric_fields:
                    try:
                        record[key] = int(value)
                    except ValueError:
                        try:
                            record[key] = float(value)
                        except ValueError:
                            record[key] = value
                else:
                    record[key] = value

            if record:
                json_data.append(record)
    except Exception as e:
        st.warning(f"使用 csv.DictReader 失敗,嘗試將檔案視為每行一個原始 Log 條目: {e}")
        return [{"raw_log_entry": line.strip()} for line in log_content.splitlines() if line.strip()]

    # Empty result (e.g. header-only file or non-tabular text): fall back to
    # one raw log entry per line.
    if not json_data:
        string_io.seek(0)
        lines = string_io.readlines()
        return [{"raw_log_entry": line.strip()} for line in lines if line.strip()]

    return json_data
177
+
178
# === File-type dispatcher ===
def convert_uploaded_file_to_json_list(uploaded_file) -> List[Dict[str, Any]]:
    """Turn an uploaded file into a list of log dicts, dispatching on extension.

    Raises:
        ValueError: For unsupported extensions or unsupported JSON top-level shapes.
    """
    file_bytes = uploaded_file.getvalue()
    file_name_lower = uploaded_file.name.lower()

    # --- Case 1: JSON - load directly, then normalise dict/list shapes ---
    if file_name_lower.endswith('.json'):
        parsed_data = json.load(io.StringIO(file_bytes.decode("utf-8")))

        if isinstance(parsed_data, list):
            return parsed_data
        if isinstance(parsed_data, dict):
            # Try common wrapper keys first; otherwise treat the whole dict
            # as a single log entry.
            for wrapper_key in ('alerts', 'logs'):
                candidate = parsed_data.get(wrapper_key)
                if isinstance(candidate, list):
                    return candidate
            return [parsed_data]
        raise ValueError("JSON 檔案格式不支援 (非 List 或 Dict)。")

    # --- Cases 2-4: CSV / TXT / LOG ---
    if file_name_lower.endswith(('.csv', '.txt', '.log')):
        if file_name_lower.endswith('.log'):
            # .log files are routed through the W3C parser.
            log_content = file_bytes.decode("utf-8").strip()
            if not log_content:
                return []
            return parse_w3c_log(log_content)

        # CSV and TXT keep the csv.DictReader pipeline.
        file_type = 'csv' if file_name_lower.endswith('.csv') else 'txt'
        return convert_csv_txt_to_json_list(file_bytes, file_type)

    raise ValueError("不支援的檔案類型。")
219
+
220
# === Helper: extract the client IP address from a log record ===
def get_ip_from_log(log_entry: Dict[str, Any]) -> str:
    """Best-effort extraction of the client (or server) IP from a log dict.

    Checks W3C-style underscore keys first ('c-ip' is renamed to 'c_ip' by
    parse_w3c_log), then the hyphenated originals, because records produced
    by the CSV path (csv.DictReader) keep their raw header names.

    Returns:
        The IP as a stripped string, or "" when no IP field is present
        (e.g. raw_log_entry records), which callers treat as "no history".
    """
    # FIX: also accept hyphenated keys - CSV-sourced records are never
    # renamed, so 'c-ip'/'s-ip' previously fell through and the IP-based
    # windowing silently degraded to single-log analysis for CSV input.
    # Client IP is preferred over server IP.
    for key in ('c_ip', 'c-ip', 's_ip', 's-ip'):
        if key in log_entry:
            return str(log_entry[key]).strip()

    # Unparsed raw log entries: no reliable way to extract an IP here.
    return ""
234
+
235
# === Hugging Face: generate an analysis answer for a single log sequence ===
def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
    """Run one RAG-augmented chat completion over a formatted log sequence.

    Returns a ``(response_text, retrieved_context)`` tuple. On any failure
    the first element carries an error-marker string ("ERROR: Client Error",
    "Format Error", or "Model Error: ...") instead of raising, so the batch
    loop can keep processing remaining logs.
    """
    if client is None: return "ERROR: Client Error", ""
    context_text = ""

    # 1. RAG retrieval
    if vector_store:
        # Query with the whole sequence (rather than only the last log line)
        # as a balance between performance and accuracy.
        selected = faiss_cosine_search_all(vector_store, log_sequence_text, threshold)
        if selected:
            # Cap the retrieved chunks at 5.
            retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
            context_text = "\n".join(retrieved_contents)

    # 2. Build the instruction (prompt text is behavior - kept verbatim).
    rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ===
{context_text if context_text else 'No relevant reference context found.'}
=== END REFERENCE CONTEXT ===
ANALYSIS INSTRUCTION: {user_prompt}
Based on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""

    log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===
{log_sequence_text}
=== END LOG SEQUENCE ==="""

    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
    ]

    # 3. Call the LLM (non-streaming)
    try:
        response_stream = client.chat_completion(messages, max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, stream=False)
        if response_stream and response_stream.choices:
            return response_stream.choices[0].message.content.strip(), context_text
        else: return "Format Error", context_text
    except Exception as e: return f"Model Error: {str(e)}", context_text
273
 
274
+
275
+ # --- 初始化 Hugging Face LLM Client (保持不變) ---
276
  @st.cache_resource
277
  def load_inference_client(model_id):
278
  if not os.environ.get("HF_TOKEN"): return None
 
288
  if os.environ.get("HF_TOKEN"):
289
  with st.spinner(f"正在連線到 Inference Client: {MODEL_ID}..."):
290
  inference_client = load_inference_client(MODEL_ID)
291
+
292
  if inference_client is None and os.environ.get("HF_TOKEN"):
293
  st.warning("Hugging Face Inference Client 無法連線。")
294
+ elif not os.environ.get("HF_TOKEN"):
295
  st.error("請在環境變數中設定 HF_TOKEN。")
296
 
297
  # === Embedding 模型 (保持不變) ===
 
317
  else:
318
  stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
319
  text_content = stringio.read()
320
+
321
  if not text_content.strip(): return None, "File is empty"
322
+
323
  events = [line for line in text_content.splitlines() if line.strip()]
324
  docs = [Document(page_content=e) for e in events]
325
  if not docs: return None, "No documents created"
326
+
327
  embeddings = embedding_model.embed_documents([d.page_content for d in docs])
328
  embeddings_np = np.array(embeddings).astype("float32")
329
  faiss.normalize_L2(embeddings_np)
330
+
331
  dimension = embeddings_np.shape[1]
332
  index = faiss.IndexFlatIP(dimension)
333
  index.add(embeddings_np)
334
+
335
  doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
336
  docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
337
  index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
338
+
339
  vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
340
  return vector_store, f"{len(docs)} chunks created."
341
  except Exception as e:
 
357
  selected.sort(key=lambda x: x[1], reverse=True)
358
  return selected
359
 
360
+
361
# --- Sidebar settings ---
with st.sidebar:
    st.header("⚙️ 設定")

    if not os.environ.get("HF_TOKEN"):
        st.error("環境變數 **HF_TOKEN** 未設定。請設定後重新啟動應用程式。")
    st.info(f"LLM 模型:**{MODEL_ID}** (Hugging Face Inference API)")
    st.warning("⚠️ **注意**: 該模型使用 Inference API 呼叫,請確保您的 HF Token 具有存取權限。")

    st.divider()
    st.subheader("📂 檔案上傳")

    # === 1. Batch-analysis file (multiple formats supported) ===
    batch_uploaded_file = st.file_uploader(
        "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
        type=['json', 'csv', 'txt', 'log'],  # 'log' accepted for W3C logs
        key="batch_uploader",
        help="支援 JSON (Array), CSV (含標題), TXT/LOG (視為 W3C 或一般 Log)"
    )

    # === 2. RAG knowledge-base file ===
    rag_uploaded_file = st.file_uploader(
        "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
        type=['txt', 'py', 'log', 'csv', 'md', 'pdf'],
        key="rag_uploader"
    )
    st.divider()

    # FIX: this subheader string was mojibake ("批量分析���令"); restored to
    # the intended "批量分析指令" (batch-analysis instruction), matching the
    # pre-change version of this file.
    st.subheader("💡 批量分析指令")
    analysis_prompt = st.text_area(
        "針對每個 Log/Alert 執行的指令",
        value="You are a security expert in charge of analyzing alerts related to Web Application Attacks and Brute Force & Reconnaissance. Respond with a clear, structured analysis using the following mandatory sections: \n\n- Priority: Provide the overall priority level. (Answer High-risk detected!, Medium-risk detected!, or Low-risk detected! only) \n- Explanation: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, explain the potential impact and why this specific alert requires attention. If not, **omit the explanation section**. \n- Action Plan: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, What should be the immediate steps to address this specific alert? If not, **omit the action plan section**. \n\nStrictly use the information in the provided Log.",
        height=200
    )
    st.markdown(f"此指令將對檔案中的**每一個 Log 條目**執行一次獨立分析,並提供**最多 {WINDOW_SIZE} 條**相同 IP 的歷史 Log 作為上下文。")

    # The batch-run button only sets a flag; the heavy work happens in the
    # main-area batch-analysis section on the next script run.
    if batch_uploaded_file:
        if st.button("🚀 執行批量分析"):
            if not os.environ.get("HF_TOKEN"):
                st.error("無法執行,環境變數 **HF_TOKEN** 未設定。")
            else:
                st.session_state.execute_batch_analysis = True
    else:
        st.info("請上傳 Log 檔案以啟用批量分析按鈕。")

    st.divider()
    st.subheader("🔍 RAG 檢索設定")
    similarity_threshold = st.slider("📐 Cosine Similarity 門檻", 0.0, 1.0, 0.4, 0.01)

    st.divider()
    st.subheader("模型參數")
    system_prompt = st.text_area("System Prompt", value="You are a Senior Security Analyst, named Ernest. You provide expert, authoritative, and concise advice on Information Security. Your analysis must be based strictly on the provided context.", height=100)
    max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
    temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
    top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)

    st.divider()
    if st.button("🗑️ 清除所有紀錄"):
        for key in list(st.session_state.keys()):
            # NOTE(review): session_state should never actually hold
            # 'HF_TOKEN' (it is an env var) - this guard looks redundant;
            # kept for safety, confirm before removing.
            if key not in ['HF_TOKEN']:
                del st.session_state[key]
        st.rerun()
423
 
424
  # =======================================================================
425
+ # === 檔案處理區塊 (RAG 檔案) - 保持不變 ===
426
  if rag_uploaded_file:
427
  file_key = f"vs_{rag_uploaded_file.name}_{rag_uploaded_file.size}"
428
  if st.session_state.rag_current_file_key != file_key or 'vector_store' not in st.session_state:
 
438
  del st.session_state.rag_current_file_key
439
  st.info("RAG 檔案已移除,已清除相關知識庫。")
440
 
441
+ # === 檔案處理區塊 (批量分析檔案 - 保持不變 ) ===
 
442
  if batch_uploaded_file:
443
  batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
444
+
445
  if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
446
  try:
447
+ # 使用新的統一解析函式
448
+ parsed_data = convert_uploaded_file_to_json_list(batch_uploaded_file)
449
+
450
+ if not parsed_data:
451
+ raise ValueError(f"{batch_uploaded_file.name} 檔案載入失敗或內容為空。")
452
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  # 儲存處理後的數據
454
  st.session_state.json_data_for_batch = parsed_data
455
  st.session_state.batch_current_file_key = batch_file_key
456
+ st.toast(f"檔案已解析並轉換為 {len(parsed_data)} 個 Log 條目。", icon="✅")
457
+
458
  except Exception as e:
459
  st.error(f"檔案解析錯誤: {e}")
460
  if 'json_data_for_batch' in st.session_state:
461
  del st.session_state.json_data_for_batch
 
462
  elif 'json_data_for_batch' in st.session_state:
463
  del st.session_state.json_data_for_batch
464
  del st.session_state.batch_current_file_key
 
466
  del st.session_state.batch_results
467
  st.info("批量分析檔案已移除,已清除相關數據。")
468
 
469
# === Batch-analysis execution (log-window selection is IP-based) ===
if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
    # One-shot trigger: clear the flag immediately so a Streamlit rerun
    # does not restart the batch.
    st.session_state.execute_batch_analysis = False
    start_time = time.time()  # wall-clock start for the completion summary
    st.session_state.batch_results = []

    if inference_client is None:
        st.error("Client 未連線,無法執行。")
    else:
        # Parsed log entries cached by the upload handler above.
        logs_list = st.session_state.json_data_for_batch
479
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
if logs_list:
    # Optional RAG knowledge base; None when no RAG file was uploaded.
    vs = st.session_state.get("vector_store", None)

    # Pre-render every log entry as a pretty-printed JSON string; these
    # exact texts are embedded into each analysis sequence below.
    formatted_logs = [
        json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list
    ]

    analysis_sequences = []
488
+ # ** vvvv 替換此處邏輯為基於 IP 的篩選 vvvv **
489
  for i in range(len(formatted_logs)):
490
+ # 1. 取得當前 Log (目標 Log) IP
491
+ target_log = logs_list[i]
492
+ target_ip = get_ip_from_log(target_log)
493
+
494
+ # 2. 確定回溯的 Log 範圍 (只看前 N 條 Log,不包含當前 Log)
495
+ start_index = max(0, i - len(logs_list) + 1) # 回溯到最開始
496
+
497
  sequence_text = []
498
+
499
+ if not target_ip:
500
+ # 如果沒有 IP,則只分析當前 Log
501
+ # 這裡將 WINDOW_SIZE 設為 1,只包含自己
502
+ sequence_text.append(f"--- Log Index {i} (No IP found){' <<< TARGET LOG TO ANALYZE'} ---\n{formatted_logs[i]}")
503
+
504
+ else:
505
+ # 3. 篩選出與目標 IP 相同的 Log 條目
506
+ current_window_indices = []
507
+ # 倒序查找,確保最近的 Log 優先被選中
508
+
509
+ # 範圍是 i-1 倒數到 0 (含)
510
+ for j in range(i - 1, -1, -1):
511
+ prior_log = logs_list[j]
512
+ prior_ip = get_ip_from_log(prior_log)
513
+
514
+ if prior_ip == target_ip:
515
+ current_window_indices.append(j)
516
+ # 如果已經累積了 N-1 條,則停止
517
+ if len(current_window_indices) >= WINDOW_SIZE - 1:
518
+ break
519
+
520
+ # 4. 將選取的 Log 索引 (倒序的) 加上當前 Log 的索引 (i)
521
+ # 確保它們按照時間順序排列 (升序)
522
+ sorted_indices = sorted(current_window_indices) + [i]
523
+
524
+ # 5. 構建序列文本
525
+ for index in sorted_indices:
526
+ is_target = " <<< TARGET LOG TO ANALYZE" if index == i else ""
527
+ # 計算 Log 相對位置
528
+ relative_pos = i - index
529
+ # 使用 sorted_indices 的長度作為序列長度,而不是 WINDOW_SIZE
530
+ sequence_text.append(f"--- Log Index {index} (IP:{target_ip}, {relative_pos} prior logs){is_target} ---\n{formatted_logs[index]}")
531
+
532
+ # 6. 構建分析序列
533
  analysis_sequences.append({
534
  "sequence_text": "\n\n".join(sequence_text),
535
  "target_log_id": i + 1,
536
  "original_log_entry": logs_list[i]
537
  })
538
+ # ** ^^^^ 替換結束 ^^^^ **
539
+
540
  total_sequences = len(analysis_sequences)
541
+ st.header(f"⚡ 批量分析執行中 (基於 IP 篩選, Max $N={WINDOW_SIZE}$)...")
542
  progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
543
  results_container = st.container()
544
  full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
545
+
546
  for i, seq_data in enumerate(analysis_sequences):
547
  log_id = seq_data["target_log_id"]
548
  progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
549
+
550
  try:
551
  response, retrieved_ctx = generate_rag_response_hf_for_log(
552
  client=inference_client,
 
568
  "context": retrieved_ctx
569
  }
570
  st.session_state.batch_results.append(item)
571
+
572
  with results_container:
573
  st.subheader(f"Log/Alert #{item['log_id']}")
574
  with st.expander("序列內容 (JSON Format)"):
575
+ st.code(item["sequence_analyzed"], language='json')
576
+
577
+ is_high = any(x in response.lower() for x in ['high-risk detected'])
578
  if is_high: st.error(item['analysis_result'])
579
  else: st.info(item['analysis_result'])
580
  if item['context']:
581
  with st.expander("參考 RAG 片段"): st.code(item['context'])
582
  st.markdown("---")
583
+
584
  log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
585
  full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\nResult:\n{item['analysis_result']}\n")
586
+
587
  except Exception as e:
588
  st.error(f"Error Log {log_id}: {e}")
589
+
590
  end_time = time.time()
591
  progress_bar.empty()
592
  st.success(f"完成!耗時 {end_time - start_time:.2f} 秒。")
593
  else:
594
  st.error("無法提取有效 Log,請檢查檔案格式。")
595
 
596
# === History view: show past batch results and collect high-risk items ===
if st.session_state.get("batch_results") and not st.session_state.execute_batch_analysis:
    st.header("⚡ 歷史分析結果")

    # Structured rows for the CSV export.
    high_risk_data = []
    # Raw result items classified as high risk.
    high_risk_items = []

    for item in st.session_state.batch_results:
        # Case-insensitive match on the model verdict. NOTE: the live view
        # matches on 'high-risk detected' (no trailing '!'); the history
        # previously required 'high-risk detected!' and could silently drop
        # items flagged during the run — use the same marker as the live view.
        is_high_risk = 'high-risk detected' in item['analysis_result'].lower()
        if not is_high_risk:
            continue

        high_risk_items.append(item)

        # --- Prepare CSV row: serialize the log to a single-line JSON ---
        # --- string and flatten the analysis text into one cell.      ---
        log_content_str = json.dumps(item["log_content"], ensure_ascii=False)
        analysis_result_clean = item['analysis_result'].replace('\n', ' | ')
        high_risk_data.append({
            "Log_ID": item['log_id'],
            "Risk_Level": "HIGH_RISK",
            "Log_Content": log_content_str,
            "AI_Analysis_Result": analysis_result_clean
        })
625
+
626
+ # 顯示 High-Risk 報告的下載按鈕 (改為 CSV 邏輯)
627
+ if high_risk_items:
628
+ st.success(f"✅ 檢測到 {len(high_risk_items)} ���高風險 Log/Alert。")
629
+
630
+ # --- 構建 CSV 內容 ---
631
+ csv_output = io.StringIO()
632
+
633
+ # 寫入 CSV 標題
634
+ csv_output.write("Log_ID,Risk_Level,Log_Content,AI_Analysis_Result\n")
635
+
636
+ # 轉義函數 (確保複雜欄位在 CSV 中不被破壞)
637
+ def escape_csv(value):
638
+ # 替換內容中的所有雙引號為兩個雙引號,然後用雙引號包圍
639
+ return f'"{str(value).replace('"', '""')}"'
640
+
641
+ for row in high_risk_data:
642
+ line = ",".join([
643
+ str(row["Log_ID"]),
644
+ row["Risk_Level"],
645
+ escape_csv(row["Log_Content"]),
646
+ escape_csv(row["AI_Analysis_Result"])
647
+ ]) + "\n"
648
+ csv_output.write(line)
649
+
650
+ csv_content = csv_output.getvalue()
651
+
652
+ # 顯示 CSV 報告的下載按鈕
653
+ st.download_button(
654
+ "📥 下載 **高風險** 分析報告 (.csv)",
655
+ csv_content,
656
+ "high_risk_report.csv",
657
+ "text/csv"
658
+ )
659
+ else:
660
+ st.info("👍 未檢測到任何標註為 High-risk detected 的 Log/Alert。")