Update src/streamlit_app.py
src/streamlit_app.py  CHANGED  (+195 -163)
@@ -48,6 +48,15 @@ WINDOW_SIZE = 8
 with st.sidebar:
     st.header("⚙️ Settings")
 
+    # === New: Hugging Face token input (important) ===
+    hf_token = st.text_input(
+        "Hugging Face Access Token (required)",
+        type="password",
+        key="hf_token_input",
+        help=f"Your Hugging Face access token is required to load gated models such as {MODEL_ID}."
+    )
+    # ============================================
+
     # === Replaced by a Hugging Face model-name display (API-key input removed) ===
     st.info(f"LLM model: **{MODEL_ID}** (Hugging Face model)")
     st.warning("⚠️ **Note**: an 8B model needs substantial RAM/VRAM and compute; it may run slowly or fail.")
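A note on the new token input: it lives only in the session, so the token must be retyped whenever the session resets. A common variant is to fall back to an environment variable first. A minimal sketch, assuming the variable name HF_TOKEN (this change does not define it):

import os
import streamlit as st

# Hypothetical fallback: prefer an environment variable, else the sidebar input.
# "HF_TOKEN" is an assumed name; huggingface_hub also reads this variable.
hf_token = os.environ.get("HF_TOKEN", "") or st.text_input(
    "Hugging Face Access Token (required)",
    type="password",
    key="hf_token_input",
)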
@@ -77,11 +86,15 @@ with st.sidebar:
     )
     st.markdown("This prompt runs one independent analysis for **every log entry** in the JSON file.")
 
-
-
-
-
+    # Check that both the token and the file are ready
+    can_execute = json_uploaded_file and hf_token
+
+    if st.button("🚀 Run batch analysis", disabled=not can_execute):
+        st.session_state.execute_batch_analysis = True
+    elif not json_uploaded_file:
         st.info("Please upload a JSON file to enable the batch-analysis button.")
+    elif not hf_token:
+        st.info("Please enter a Hugging Face access token to load the model.")
 
     st.divider()
 
@@ -96,38 +109,44 @@ with st.sidebar:
     st.subheader("Model parameters")
     system_prompt = st.text_area("System Prompt (used by the LLM)", value="You are a Senior Security Analyst. Be professional.", height=100)
     max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
-    temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
+    temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
     top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
 
     st.divider()
     if st.button("🗑️ Clear all records"):
         for key in list(st.session_state.keys()):
-            if key not in []:
+            if key not in ['hf_token_input']:  # keep the token input
                 del st.session_state[key]
         st.rerun()
 
-# --- Initialize the Hugging Face LLM client (
+# --- Initialize the Hugging Face LLM client (major change: pass the token and use a CausalLM) ---
 @st.cache_resource
-def load_huggingface_llm(model_id):
+def load_huggingface_llm(model_id, hf_token):
     if AutoModelForCausalLM is None:
         st.error("Could not load the Hugging Face dependencies. Install them with: pip install transformers torch accelerate bitsandbytes")
         return None
     try:
         # Use 4-bit quantization to cut memory use; a common way to run an 8B model
-        tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            token=hf_token,  # pass the token
+            trust_remote_code=True
+        )
+
+        # Make sure AutoModelForCausalLM is used
         model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-
-
-
-            # load_in_4bit=True  # if 4-bit quantization is needed
+            model_id,
+            token=hf_token,  # pass the token
+            torch_dtype=torch.bfloat16,  # bfloat16 or float16 is recommended to reduce memory
+            device_map="auto"  # automatically place the model across GPU/CPU
         )
+
         # Use a pipeline to simplify the calls
         llm_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
-
+            model_kwargs={"torch_dtype": torch.bfloat16}
         )
         st.success(f"Hugging Face model **{model_id}** loaded successfully.")
         return llm_pipeline
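This hunk drops the commented-out load_in_4bit flag in favor of bfloat16 weights. If memory is still too tight for the 8B model, the 4-bit path the comment alludes to is expressed through BitsAndBytesConfig in current transformers. A minimal sketch, assuming bitsandbytes is installed and a CUDA GPU is available:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

def load_llm_4bit(model_id: str, hf_token: str):
    # NF4 4-bit weights with bfloat16 compute: roughly a quarter of the
    # bfloat16 weight memory, at some quality and speed cost.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=hf_token,
        quantization_config=bnb_config,
        device_map="auto",
    )
    return pipeline("text-generation", model=model, tokenizer=tokenizer)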
@@ -137,12 +156,14 @@ def load_huggingface_llm(model_id):
 
 # Initialize the pipeline outside the main block
 llm_pipeline = None
-if AutoModelForCausalLM is not None:
+if AutoModelForCausalLM is not None and hf_token:  # only try to load when a token is present
     with st.spinner(f"Loading LLM model: {MODEL_ID} (8B)... (this can take several minutes)"):
-        llm_pipeline = load_huggingface_llm(MODEL_ID)
+        llm_pipeline = load_huggingface_llm(MODEL_ID, hf_token)  # pass the token
 
-if llm_pipeline is None:
-    st.warning("Hugging Face LLM
+if llm_pipeline is None and hf_token:
+    st.warning("The Hugging Face LLM could not be loaded. Check the dependencies, the runtime resources, and that the token is valid.")
+elif not hf_token:
+    st.info("Enter a Hugging Face access token in the left sidebar to load the model.")
 # =======================================================================
 
 
@@ -238,13 +259,13 @@ def faiss_cosine_search_all(vector_store, query, threshold):
     selected.sort(key=lambda x: x[1], reverse=True)
     return selected
 
-# === Hugging Face single-log analysis response (core batch-processing function) (
+# === Hugging Face single-log analysis response (core batch-processing function) (major change: uses ChatML) ===
 def generate_rag_response_hf_for_log(llm_pipeline, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
     """
     Run a RAG-augmented log-sequence analysis with the Hugging Face LLM.
     """
     if llm_pipeline is None:
-        return "ERROR: Hugging Face LLM Pipeline
+        return "ERROR: the Hugging Face LLM pipeline is not loaded or the token is invalid.", ""
 
     context_text = ""
     # 1. RAG retrieval logic
@@ -268,16 +289,26 @@ Based on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the *
 {log_sequence_text}
 === END LOG SEQUENCE ==="""
 
-    # Combine the system prompt, the RAG context, and the log
-
-
-        f"
-        f"
-
-
+    # 3. Combine the system prompt, the RAG context, and the log content, using the ChatML format
+    user_message_content = (
+        f"RAG & ANALYSIS INSTRUCTION:\n{rag_instruction}\n\n"
+        f"LOG DATA:\n{log_content_section}\n\n"
+        f"RESPONSE:"
+    )
+
+    messages = [
+        {"role": "system", "content": sys_prompt},
+        {"role": "user", "content": user_message_content}
+    ]
+
+    # Convert to the Llama 3 / ChatML format
+    full_prompt = llm_pipeline.tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
     )
 
-    #
+    # 4. Call the Hugging Face pipeline
     try:
         # Pipeline parameter settings
         response = llm_pipeline(
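For reference, apply_chat_template with tokenize=False and add_generation_prompt=True renders the messages into the model's own chat markup (for Llama 3, the <|start_header_id|>...<|eot_id|> framing) and appends the assistant header so generation starts at the reply. A minimal sketch of the round trip, assuming pipe is a loaded text-generation pipeline; return_full_text=False keeps only the continuation rather than the prompt plus reply:

messages = [
    {"role": "system", "content": "You are a Senior Security Analyst."},
    {"role": "user", "content": "Summarize this log entry: ..."},
]

full_prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(
    full_prompt,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    return_full_text=False,  # drop the echoed prompt, keep only the reply
)
reply = outputs[0]["generated_text"].strip()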
@@ -365,152 +396,153 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
     st.session_state.batch_results = []
 
     if llm_pipeline is None:
-        st.error("Hugging Face LLM Pipeline
-        # Since this is a Streamlit app, we do not call st.stop() directly, so the user can review the settings
+        st.error("The Hugging Face LLM pipeline is not loaded or the token is invalid; batch analysis cannot run.")
         st.session_state.execute_batch_analysis = False
-
-    data_to_process = st.session_state.json_data_for_batch
-
-    # Logic for extracting the log list (unchanged)
-    logs_list = []
-    if isinstance(data_to_process, list):
-        logs_list = data_to_process
-    elif isinstance(data_to_process, dict):
-        if all(isinstance(v, (dict, str, list)) for v in data_to_process.values()):
-            logs_list = list(data_to_process.values())
-        elif 'alerts' in data_to_process and isinstance(data_to_process['alerts'], list):
-            logs_list = data_to_process['alerts']
-        elif 'logs' in data_to_process and isinstance(data_to_process['logs'], list):
-            logs_list = data_to_process['logs']
-        else:
-            logs_list = [data_to_process]
-    else:
-        logs_list = [data_to_process]
-
-    if logs_list:
-        vs = st.session_state.get("vector_store", None)
-        if vs:
-            st.success("✅ The RAG knowledge base is enabled and will be used for the analysis.")
-        else:
-            st.warning("⚠️ The RAG knowledge base is not loaded; running plain log analysis.")
-
-        # --- New: build sliding-window sequences ---
-
-        # Convert all logs to JSON-formatted strings so they can be concatenated later
-        formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
-
-
-
-
-
-
-
-
-
-                is_target = " <<< TARGET LOG TO ANALYZE" if j == len(current_window) - 1 else ""
-                # Use i-len(current_window)+j+1 to compute the original index
-                sequence_text.append(f"--- Log Index {i - len(current_window) + j + 1} ({len(current_window)-j} prior logs){is_target} ---\n{log_str}")
-
-
-
-                "target_log_id": i + 1,  # the analysis target of this sequence is log i+1 in the original list
-                "original_log_entry": logs_list[i]
-            })
-
-
-
-
-
-        st.header(f"⚡ Batch analysis running (sliding window $N={WINDOW_SIZE}$)...")
-        progress_bar = st.progress(0, text=f"Preparing to process {total_sequences} sequences...")
-        results_container = st.container()
-        full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
-
-        priority_keyword = "Criticality/Priority:"
-
-        for i, seq_data in enumerate(analysis_sequences):
-            log_id = seq_data["target_log_id"]
-            progress_bar.progress((i + 1) / total_sequences, text=f"Processed {i + 1}/{total_sequences} sequences (target log #{log_id})...")
-
-
-
-
-                llm_pipeline=llm_pipeline,  # <--- the new LLM pipeline
-                model_id=MODEL_ID,
-                log_sequence_text=seq_data["sequence_text"],
-                user_prompt=analysis_prompt,
-                sys_prompt=system_prompt,
-                vector_store=vs,
-                threshold=similarity_threshold,
-                max_output_tokens=max_output_tokens,
-                temperature=temperature,
-                top_p=top_p
-            )
-
-            # Store the result
-            item = {
-                "log_id": log_id,
-                "log_content": seq_data["original_log_entry"],  # record the original log entry
-                "sequence_analyzed": seq_data["sequence_text"],  # record the analyzed sequence
-                "analysis_result": response,
-                "context": retrieved_ctx
-            }
-            st.session_state.batch_results.append(item)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    pass
-
-            st.markdown(f"### 🤖 Analysis result (for log #{log_id})")
-            if is_high_priority:
-                st.error(item['analysis_result'])
-            else:
-                st.info(item['analysis_result'])
-
-            if item['context']:
-                with st.expander("Referenced RAG knowledge-base snippets"):
-                    st.code(item['context'])
-            st.markdown("---")
-
-            # Report chunks
-            log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
-            full_report_chunks.append(f"---\n\n### Log/Alert #{item['log_id']} (sequence analysis)\n\n#### Analyzed sequence\n```\n{seq_data['sequence_text']}\n```\n\n#### LLM analysis result\n{item['analysis_result']}\n")
-
-
-
-
-
-
-
-
-
-
-            with results_container:
-                st.error(error_message)
-
-
-
-
-
-
-
-
+    else:  # continue only when the LLM loaded successfully
+        data_to_process = st.session_state.json_data_for_batch
+
+        # Logic for extracting the log list (unchanged)
+        logs_list = []
+        if isinstance(data_to_process, list):
+            logs_list = data_to_process
+        elif isinstance(data_to_process, dict):
+            if all(isinstance(v, (dict, str, list)) for v in data_to_process.values()):
+                logs_list = list(data_to_process.values())
+            elif 'alerts' in data_to_process and isinstance(data_to_process['alerts'], list):
+                logs_list = data_to_process['alerts']
+            elif 'logs' in data_to_process and isinstance(data_to_process['logs'], list):
+                logs_list = data_to_process['logs']
+            else:
+                logs_list = [data_to_process]
+        else:
+            logs_list = [data_to_process]
+
+        if logs_list:
+            vs = st.session_state.get("vector_store", None)
+            if vs:
+                st.success("✅ The RAG knowledge base is enabled and will be used for the analysis.")
+            else:
+                st.warning("⚠️ The RAG knowledge base is not loaded; running plain log analysis.")
+
+            # --- New: build sliding-window sequences ---
+
+            # Convert all logs to JSON-formatted strings so they can be concatenated later
+            formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
+
+            # Build the list of sequences to analyze (sliding window)
+            analysis_sequences = []
+
+            for i in range(len(formatted_logs)):
+                start_index = max(0, i - WINDOW_SIZE + 1)
+                end_index = i + 1  # the window ends at the current log
+
+                current_window = formatted_logs[start_index:end_index]
+
+                sequence_text = []
+                for j, log_str in enumerate(current_window):
+                    is_target = " <<< TARGET LOG TO ANALYZE" if j == len(current_window) - 1 else ""
+                    # Use i-len(current_window)+j+1 to compute the original index
+                    sequence_text.append(f"--- Log Index {i - len(current_window) + j + 1} ({len(current_window)-j} prior logs){is_target} ---\n{log_str}")
+
+                analysis_sequences.append({
+                    "sequence_text": "\n\n".join(sequence_text),
+                    "target_log_id": i + 1,  # the analysis target of this sequence is log i+1 in the original list
+                    "original_log_entry": logs_list[i]
+                })
+
+            total_sequences = len(analysis_sequences)
+            if total_sequences < WINDOW_SIZE:
+                st.warning(f"The total number of logs ({total_sequences}) is below the window size ({WINDOW_SIZE}); the analysis may be less accurate.")
+
+            # --- Run the sequence analysis ---
+            st.header(f"⚡ Batch analysis running (sliding window $N={WINDOW_SIZE}$)...")
+            progress_bar = st.progress(0, text=f"Preparing to process {total_sequences} sequences...")
+            results_container = st.container()
+            full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
+
+            priority_keyword = "Criticality/Priority:"
+
+            for i, seq_data in enumerate(analysis_sequences):
+                log_id = seq_data["target_log_id"]
+                progress_bar.progress((i + 1) / total_sequences, text=f"Processed {i + 1}/{total_sequences} sequences (target log #{log_id})...")
+
+                try:
+                    # *** Replaced with the Hugging Face call function ***
+                    response, retrieved_ctx = generate_rag_response_hf_for_log(
+                        llm_pipeline=llm_pipeline,  # <--- the new LLM pipeline
+                        model_id=MODEL_ID,
+                        log_sequence_text=seq_data["sequence_text"],
+                        user_prompt=analysis_prompt,
+                        sys_prompt=system_prompt,
+                        vector_store=vs,
+                        threshold=similarity_threshold,
+                        max_output_tokens=max_output_tokens,
+                        temperature=temperature,
+                        top_p=top_p
+                    )
+
+                    # Store the result
+                    item = {
+                        "log_id": log_id,
+                        "log_content": seq_data["original_log_entry"],  # record the original log entry
+                        "sequence_analyzed": seq_data["sequence_text"],  # record the analyzed sequence
+                        "analysis_result": response,
+                        "context": retrieved_ctx
+                    }
+                    st.session_state.batch_results.append(item)
+
+                    # Result display logic
+                    with results_container:
+                        st.subheader(f"Log/Alert #{item['log_id']} (sequence analysis finished)")
+                        with st.expander(f"Sequence content (contains {len(seq_data['sequence_text'].split('--- Log Index'))-1} logs)"):
+                            st.code(item["sequence_analyzed"], language='text')
+
+                        # Color control:
+                        is_high_priority = False
+                        if 'criticality/priority:' in response.lower():
+                            try:
+                                # Try to extract the priority from the LLM response
+                                priority_section = response.split('criticality/priority:')[1].split('\n')[0].strip()
+                                if 'high' in priority_section.lower() or 'medium' in priority_section.lower() or 'yes' in priority_section.lower():
+                                    is_high_priority = True
+                            except IndexError:
+                                pass
+
+                        st.markdown(f"### 🤖 Analysis result (for log #{log_id})")
+                        if is_high_priority:
+                            st.error(item['analysis_result'])
+                        else:
+                            st.info(item['analysis_result'])
+
+                        if item['context']:
+                            with st.expander("Referenced RAG knowledge-base snippets"):
+                                st.code(item['context'])
+                        st.markdown("---")
+
+                        # Report chunks
+                        log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
+                        full_report_chunks.append(f"---\n\n### Log/Alert #{item['log_id']} (sequence analysis)\n\n#### Analyzed sequence\n```\n{seq_data['sequence_text']}\n```\n\n#### LLM analysis result\n{item['analysis_result']}\n")
+
+                except Exception as e:
+                    error_message = f"ERROR: processing the sequence for log {log_id} failed: {e}"
+                    st.session_state.batch_results.append({
+                        "log_id": log_id,
+                        "log_content": seq_data["original_log_entry"],
+                        "sequence_analyzed": seq_data["sequence_text"],
+                        "analysis_result": error_message,
+                        "context": ""
+                    })
+                    with results_container:
+                        st.error(error_message)
+
+            end_time = time.time()
+            progress_bar.empty()
+            st.success(f"Batch analysis finished! Processed {total_sequences} log sequences in {end_time - start_time:.2f} seconds.")
+            st.divider()
+
+        else:
+            st.error("Could not extract a log list or valid log entries from the uploaded JSON file. Check the file structure.")
 
 # === Display results (history) (unchanged) ===
 if st.session_state.batch_results and not st.session_state.execute_batch_analysis:
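The window arithmetic in the hunk above is easiest to sanity-check in isolation. A standalone sketch with hypothetical data (a WINDOW_SIZE of 3 and five logs): each target log is preceded by at most two earlier logs, and the Log Index labels recover the original zero-based positions while target_log_id stays one-based:

WINDOW_SIZE = 3
formatted_logs = [f"log-{n}" for n in range(5)]  # stand-ins for the JSON strings

for i in range(len(formatted_logs)):
    start_index = max(0, i - WINDOW_SIZE + 1)
    current_window = formatted_logs[start_index:i + 1]
    labels = [i - len(current_window) + j + 1 for j in range(len(current_window))]
    print(f"target_log_id={i + 1}, window indices={labels}")

# target_log_id=1, window indices=[0]
# target_log_id=2, window indices=[0, 1]
# target_log_id=3, window indices=[0, 1, 2]
# target_log_id=4, window indices=[1, 2, 3]
# target_log_id=5, window indices=[2, 3, 4]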
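A last note on the priority check in the batch loop: the hunk defines priority_keyword but then matches a hard-coded lowercase literal, and the split-based parse only fires on the exact 'criticality/priority:' spelling, so a heading like 'Criticality / Priority:' is missed and falls through to st.info. A slightly more tolerant regex variant, sketched here (the heading format itself is an assumption about the LLM's output, not something the prompt enforces):

import re

def is_high_priority(response: str) -> bool:
    # Matches e.g. "Criticality/Priority: **High**" or "criticality / priority: medium".
    m = re.search(r"criticality\s*/\s*priority\s*:\s*\**\s*(\w+)", response, re.IGNORECASE)
    return bool(m) and m.group(1).lower() in {"high", "medium", "yes"}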