ss900371tw committed on
Commit
9f6a257
·
verified ·
1 Parent(s): 8ba72fa

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +529 -86
src/streamlit_app.py CHANGED
@@ -1,87 +1,530 @@
 
1
  import os
2
- import gradio as gr
3
- from huggingface_hub import InferenceClient
4
-
5
- """
6
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
7
- """
8
-
9
- # client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
10
- client = InferenceClient(
11
- "meta-llama/Meta-Llama-3-8B-Instruct",
12
- token=os.environ.get("HF_TOKEN")
13
- )
14
-
15
-
16
- def respond(
17
- message,
18
- history: list[tuple[str, str]],
19
- max_tokens,
20
- temperature,
21
- top_p,
22
- ):
23
-
24
- name = "Ernest"
25
- system_message = f"""As a virtual mentor in cybersecurity called {name}, your role is to provide expert guidance and advice on protecting information and systems from cyber threats. You are an expert in:
26
- 1) Information Security;
27
- 2) Network Security;
28
- 3) Application Security;
29
- 4) Endpoint Security;
30
- 5) Data Security;
31
- 6) Identity and Access Management;
32
- 7) Database and Infrastructure Security;
33
- 8) Cloud Security;
34
- 9) Disaster Recovery/Business Continuity Planning;
35
- 10) Cyber Threat Intelligence;
36
- 11) Legal, Regulations, Compliance, and Ethics;
37
- 12) Operational Security (OpSec).
38
- Your responses should be informed by current best practices in security protocols, risk management, and ethical hacking. Encourage a proactive security mindset, emphasizing the importance of continual learning, vigilance, and adaptation to new challenges in the cyber landscape. Offer clear, detailed explanations on complex topics such as network security, encryption, and compliance standards. Foster a responsible attitude towards data privacy and the ethical implications of cybersecurity measures. Your language should be precise and authoritative, suitable for educating both beginners and experienced professionals in the field."""
39
-
40
- messages = [{"role": "system", "content": system_message}]
41
-
42
- for val in history:
43
- if val[0]:
44
- messages.append({"role": "user", "content": val[0]})
45
- if val[1]:
46
- messages.append({"role": "assistant", "content": val[1]})
47
-
48
- messages.append({"role": "user", "content": message})
49
-
50
- response = ""
51
-
52
- # Stream the model output safely
53
- for msg in client.chat_completion(
54
- messages,
55
- max_tokens=max_tokens,
56
- temperature=temperature,
57
- top_p=top_p,
58
- stream=True,
59
- ):
60
- if hasattr(msg, "choices") and msg.choices:
61
- delta = msg.choices[0].delta
62
- if hasattr(delta, "content") and delta.content:
63
- response += delta.content
64
- yield response
65
- # Ignore any events that do not contain content
66
-
67
- """
68
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
69
- """
70
- demo = gr.ChatInterface(
71
- respond,
72
- additional_inputs=[
73
- gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max new tokens"),
74
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
75
- gr.Slider(
76
- minimum=0.1,
77
- maximum=1.0,
78
- value=0.95,
79
- step=0.05,
80
- label="Top-p (nucleus sampling)",
81
- ),
82
- ],
83
- )
84
-
85
-
86
- if __name__ == "__main__":
87
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
  import os
3
+ import io
4
+ import json
5
+ import numpy as np
6
+ import faiss
7
+ import uuid
8
+ import time
9
+ import sys
10
+
11
+ # === HuggingFace 模型相關套件 (新增) ===
12
+ try:
13
+ # 確保只在需要時載入,避免在無 GPU 環境下強制載入導致錯誤
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
15
+ import torch
16
+ # 針對本地大模型:
17
+ # from accelerate import Accelerator # 建議安裝
18
+ # import bitsandbytes # 建議安裝
19
+ except ImportError:
20
+ st.error("請檢查是否安裝了所有 Hugging Face 相關依賴:pip install transformers torch accelerate bitsandbytes")
21
+ # 如果缺少,則退出或將相關變數設為 None
22
+ AutoModelForCausalLM, AutoTokenizer, pipeline, torch = None, None, None, None
23
+
24
+ # === LangChain/RAG 相關套件 (保持不變) ===
25
+ from langchain_community.embeddings import HuggingFaceEmbeddings
26
+ from langchain_core.documents import Document
27
+ from langchain_community.vectorstores import FAISS
28
+ from langchain_community.vectorstores.utils import DistanceStrategy
29
+ from langchain_community.docstore.in_memory import InMemoryDocstore
30
+
31
+ # 嘗試匯入 pypdf
32
+ try:
33
+ import pypdf
34
+ except ImportError:
35
+ pypdf = None
36
+
37
+ # --- 頁面設定 ---
38
# Hugging Face model id of the LLM served by this app. It is defined before the
# page header so the title below is derived from it and can never advertise a
# different model than the one that is actually loaded.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Number of consecutive log entries (target included) sent to the LLM per analysis.
WINDOW_SIZE = 8

# --- Page setup (set_page_config is kept as the first Streamlit command) ---
st.set_page_config(page_title="Cybersecurity AI Assistant (Hugging Face RAG & Batch Analysis)", page_icon="🛡️", layout="wide")
st.title(f"🛡️ {MODEL_ID.split('/')[-1]} with FAISS RAG & Batch Analysis")
st.markdown("已啟用:**IndexFlatIP** + **L2 正規化** + **Hugging Face LLM**。上傳 JSON 執行批量分析,上傳其他檔案作為 RAG 參考庫。")
46
+
47
+ # --- 側邊欄設定 ---
48
# Sidebar: every user-tunable input of the app. The names bound here
# (json_uploaded_file, rag_uploaded_file, analysis_prompt, similarity_threshold,
# system_prompt, max_output_tokens, temperature, top_p) are read by the main
# script body further down.
with st.sidebar:
    st.header("⚙️ 設定")

    # Show which model is in use (no API key is needed for a local model).
    st.info(f"LLM 模型:**{MODEL_ID}** (Hugging Face Model)")
    st.warning("⚠️ **注意**: 8B 模型需要大量 RAM/VRAM 和算力。運行可能較慢或失敗。")

    st.divider()

    st.subheader("📂 檔案上傳")
    # 1. JSON file whose entries are analysed one by one.
    json_uploaded_file = st.file_uploader(
        "1️⃣ 上傳 **JSON** Log/Alert 檔案 (用於批量分析)",
        type=['json'],
        key="json_uploader"
    )
    # 2. Reference material indexed into the RAG knowledge base.
    rag_uploaded_file = st.file_uploader(
        "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
        type=['txt', 'py', 'log', 'csv', 'md', 'pdf'],
        key="rag_uploader"
    )
    st.divider()

    st.subheader("💡 批量分析指令 (針對 JSON 檔案)")
    analysis_prompt = st.text_area(
        "針對每個 Log/Alert 執行的指令",
        value="You are a security expert in charge of analyzing a single alert and prioritizing its criticality. Respond with a clear, structured analysis using the following mandatory sections: \n\n- Criticality/Priority: Is this alert critical? (Answer Yes/No only), and provide the overall priority level. (Answer High, Medium, or Low only) \n- Explanation: If this alert is critical or medium~high priority level, explain the potential impact and why this specific alert requires attention. If not, omit the explanation section. \n- Action Plan: If this alert is critical or medium~high priority level, What should be the immediate steps to address this specific alert? If not, omit the action plan section. \n\nStrictly use the information in the provided Log.",
        height=200
    )
    st.markdown("此指令將對 JSON 檔案中的**每一個 Log 條目**執行一次獨立分析。")

    # The run button only exists once a JSON file is present; clicking it sets
    # a flag that the main script body consumes.
    if json_uploaded_file:
        if st.button("🚀 執行批量分析"):
            st.session_state.execute_batch_analysis = True
    else:
        st.info("請上傳 JSON 檔案以啟用批量分析按鈕。")

    st.divider()

    st.subheader("🔍 RAG 檢索設定")
    similarity_threshold = st.slider(
        "📐 Cosine Similarity 門檻",
        0.0, 1.0, 0.4, 0.01,
        help="數值越大越相似。一般建議 0.4~0.7"
    )
    st.divider()

    st.subheader("模型參數")
    system_prompt = st.text_area("System Prompt (LLM 使用)", value="You are a Senior Security Analyst. Be professional.", height=100)
    max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
    temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
    top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)

    st.divider()
    if st.button("🗑️ 清除所有紀錄"):
        # Iterate over a snapshot of the keys because entries are deleted
        # while looping. (The former `if key not in []` guard was always true.)
        for key in list(st.session_state.keys()):
            del st.session_state[key]
        st.rerun()
108
+
109
+ # --- 初始化 Hugging Face LLM Client (重大替換) ---
110
@st.cache_resource
def load_huggingface_llm(model_id):
    """Build a ``text-generation`` pipeline for ``model_id``.

    Args:
        model_id: Hugging Face model identifier passed to ``from_pretrained``.

    Returns:
        The transformers pipeline, or ``None`` when the Hugging Face imports
        are unavailable or loading raised. Errors are reported through
        ``st.error`` instead of being propagated.
    """
    if AutoModelForCausalLM is None:
        st.error("無法載入 Hugging Face 依賴,請安裝:pip install transformers torch accelerate bitsandbytes")
        return None

    try:
        # NOTE(review): trust_remote_code=True is passed to both loaders —
        # confirm it is really required for the configured model.
        tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

        # bfloat16 only when CUDA is available; otherwise the library default dtype.
        dtype = torch.bfloat16 if torch.cuda.is_available() else None
        causal_lm = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=dtype,
            device_map="auto",  # device placement is delegated to accelerate
            trust_remote_code=True,
        )

        # No explicit `device` argument: the model was loaded with device_map="auto".
        generator = pipeline("text-generation", model=causal_lm, tokenizer=tok)
        st.success(f"Hugging Face 模型 **{model_id}** 載入成功。")
    except Exception as e:
        st.error(f"Hugging Face 模型載入失敗: {e}")
        return None
    return generator
137
+
138
+ # 在 main 區塊外初始化 pipeline
139
# Module-level handle on the generation pipeline; stays None when the
# Hugging Face imports failed or the loader reported an error.
if AutoModelForCausalLM is None:
    llm_pipeline = None
else:
    with st.spinner(f"正在載入 LLM 模型: {MODEL_ID} (8B)... (可能需要數分鐘)"):
        llm_pipeline = load_huggingface_llm(MODEL_ID)

if llm_pipeline is None:
    st.warning("Hugging Face LLM 無法載入。請檢查依賴和環境資源。")
146
+ # =======================================================================
147
+
148
+
149
+ # === Embedding 模型 (用於 RAG 參考庫) (保持不變) ===
150
@st.cache_resource
def load_embedding_model():
    """Create the embedding model used to index and query the RAG store.

    Embeddings are requested un-normalised (``normalize_embeddings=False``);
    callers in this file L2-normalise the vectors themselves with FAISS.
    """
    return HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-zh-v1.5",
        model_kwargs={'device': 'cpu', 'trust_remote_code': True},
        encode_kwargs={'normalize_embeddings': False},
    )


with st.spinner("正在載入 Embedding 模型..."):
    embedding_model = load_embedding_model()
168
+
169
+ # === 建立向量庫 / Search 函數 (保持不變) ===
170
def _split_log_events(text_content):
    """Split raw text into chunks: one per ``</Event>`` record, else one per non-blank line."""
    closing_tag = "</Event>"
    # Every piece except the last one was really terminated by the closing
    # tag; only those get it re-attached. Trailing text (for instance a root
    # closing element) is kept as-is rather than receiving a bogus tag.
    *terminated, tail = text_content.split(closing_tag)
    events = [piece + closing_tag for piece in terminated if piece.strip()]
    if tail.strip():
        events.append(tail)

    # Zero or one event found: fall back to line-based chunks.
    if len(events) <= 1:
        events = [line for line in text_content.split("\n") if line.strip()]
    return events


def process_file_to_faiss(uploaded_file):
    """Turn an uploaded reference file into a cosine-similarity FAISS store.

    Args:
        uploaded_file: Streamlit upload object; ``.type`` and ``.getvalue()``
            are read here, and PDFs are handed to ``pypdf.PdfReader``.

    Returns:
        ``(vector_store, message)``. ``vector_store`` is ``None`` whenever the
        file could not be indexed, and ``message`` then holds the reason. No
        exception escapes: any failure is reported as ``"Error: ..."``.
    """
    try:
        if uploaded_file.type == "application/pdf":
            if not pypdf:
                return None, "PDF library missing"
            pdf_reader = pypdf.PdfReader(uploaded_file)
            # `or ""` keeps pages without extractable text from breaking the join.
            text_content = "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
        else:
            text_content = uploaded_file.getvalue().decode("utf-8")

        if not text_content.strip():
            return None, "File is empty"

        docs = [Document(page_content=chunk) for chunk in _split_log_events(text_content)]
        if not docs:
            return None, "No documents created"

        embeddings = embedding_model.embed_documents([d.page_content for d in docs])
        embeddings_np = np.array(embeddings).astype("float32")
        # After in-place L2 normalisation the inner product computed by
        # IndexFlatIP equals cosine similarity.
        faiss.normalize_L2(embeddings_np)

        index = faiss.IndexFlatIP(embeddings_np.shape[1])
        index.add(embeddings_np)

        # Position i in the FAISS index maps to doc_ids[i] in the docstore.
        doc_ids = [str(uuid.uuid4()) for _ in docs]
        docstore = InMemoryDocstore(dict(zip(doc_ids, docs)))
        index_to_docstore_id = dict(enumerate(doc_ids))

        vector_store = FAISS(
            embedding_function=embedding_model,
            index=index,
            docstore=docstore,
            index_to_docstore_id=index_to_docstore_id,
            distance_strategy=DistanceStrategy.COSINE,
        )

        return vector_store, f"{len(docs)} chunks created."
    except Exception as e:
        return None, f"Error: {str(e)}"
220
+
221
def faiss_cosine_search_all(vector_store, query, threshold):
    """Return every stored document whose similarity to ``query`` meets ``threshold``.

    Args:
        vector_store: Object exposing ``index``, ``index_to_docstore_id`` and
            ``docstore`` (as built by ``process_file_to_faiss``).
        query: Text that is embedded and compared against the whole index.
        threshold: Minimum score (inclusive) for a document to be kept.

    Returns:
        List of ``(document, score)`` tuples ordered from highest to lowest
        score; empty when the index is empty or nothing reaches the threshold.
    """
    index = vector_store.index
    # Searching with k == 0 is rejected by FAISS, so an empty index
    # short-circuits before any embedding work is done.
    if index.ntotal == 0:
        return []

    q_emb = np.array([embedding_model.embed_query(query)]).astype("float32")
    # Normalised query x normalised stored vectors: inner product == cosine.
    faiss.normalize_L2(q_emb)

    scores, positions = index.search(q_emb, k=index.ntotal)

    selected = []
    for score, idx in zip(scores[0], positions[0]):
        if idx == -1:  # FAISS placeholder for "no result"
            continue
        if score >= threshold:
            doc_id = vector_store.index_to_docstore_id[int(idx)]
            selected.append((vector_store.docstore.search(doc_id), score))

    selected.sort(key=lambda pair: pair[1], reverse=True)
    return selected
240
+
241
+ # === Hugging Face 生成單一 Log 分析回答 (核心批量處理函數) (重大替換) ===
242
def generate_rag_response_hf_for_log(llm_pipeline, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
    """Analyse one log sequence with the local LLM, optionally augmented by RAG.

    Args:
        llm_pipeline: Callable text-generation pipeline, or ``None``.
        model_id: Unused here; kept so existing callers keep working.
        log_sequence_text: Pre-formatted window of logs to analyse.
        user_prompt: Per-log analysis instruction.
        sys_prompt: System instruction placed at the top of the prompt.
        vector_store: RAG store to query, or a falsy value to skip retrieval.
        threshold: Minimum similarity for a reference chunk to be used.
        max_output_tokens: Forwarded as ``max_new_tokens``.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        top_p: Nucleus-sampling value, only sent when sampling is active.

    Returns:
        ``(analysis_text, context_text)``. Failures are returned as an
        error-message string in ``analysis_text`` rather than raised.
    """
    if llm_pipeline is None:
        return "ERROR: Hugging Face LLM Pipeline 未載入。", ""

    # 1. Retrieval: keep at most the five best reference chunks.
    context_text = ""
    if vector_store:
        selected = faiss_cosine_search_all(vector_store, log_sequence_text, threshold)
        context_text = "\n".join(
            f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}"
            for doc, score in selected[:5]
        )

    # 2. Prompt assembly (plain instruction layout, no chat template).
    rag_instruction = "\n".join([
        f"=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ===",
        f"{context_text if context_text else 'No relevant reference context found.'}",
        "=== END REFERENCE CONTEXT ===",
        f"ANALYSIS INSTRUCTION: {user_prompt}",
        f"Based on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats. Focus on the **last log entry in the sequence** to determine its final criticality and priority, considering the preceding {WINDOW_SIZE} logs.",
    ])

    log_content_section = "\n".join([
        f"=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===",
        f"{log_sequence_text}",
        "=== END LOG SEQUENCE ===",
    ])

    full_prompt = (
        f"**SYSTEM INSTRUCTION**: {sys_prompt}\n\n"
        f"**RAG & ANALYSIS INSTRUCTION**:\n{rag_instruction}\n\n"
        f"**LOG DATA**:\n{log_content_section}\n\n"
        f"**RESPONSE**:"
    )

    # 3. Generation settings. Sampling with temperature 0 is rejected by
    # transformers, and the UI slider allows 0.0, so a non-positive
    # temperature switches to greedy decoding and omits the sampling knobs.
    generation_kwargs = {
        "max_new_tokens": max_output_tokens,
        "return_full_text": False,  # only the newly generated text
    }
    if temperature and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        generation_kwargs["do_sample"] = False

    try:
        response = llm_pipeline(full_prompt, **generation_kwargs)

        if response and isinstance(response, list) and 'generated_text' in response[0]:
            return response[0]['generated_text'].strip(), context_text
        return f"Hugging Face Pipeline 輸出格式錯誤: {response}", context_text

    except Exception as e:
        # Surface the model failure to the caller as text.
        return f"Hugging Face Model Error: {str(e)}", context_text
301
+
302
+
303
+ # === 檔案處理和主執行邏輯 (保持結構,替換 LLM 呼叫) ===
304
+ # 初始化 Session State
305
# Seed the session keys used by the rest of the script (existing values win).
for _state_key, _initial_value in (
    ('execute_batch_analysis', False),
    ('batch_results', None),
    ('rag_current_file_key', None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# --- 1. RAG knowledge-base file (rag_uploaded_file) ---
if not rag_uploaded_file:
    # The uploader is empty: drop a previously built knowledge base, if any.
    if 'vector_store' in st.session_state:
        del st.session_state.vector_store
        del st.session_state.rag_current_file_key
        st.info("RAG 檔案已移除,已清除相關知識庫。")
else:
    # Name plus size identifies the upload across Streamlit reruns.
    file_key = f"vs_{rag_uploaded_file.name}_{rag_uploaded_file.size}"

    is_new_file = st.session_state.rag_current_file_key != file_key
    if is_new_file or 'vector_store' not in st.session_state:
        # New (or not yet indexed) file: rebuild the knowledge base.
        with st.spinner(f"正在建立 RAG 參考知識庫 ({rag_uploaded_file.name})..."):
            vs, msg = process_file_to_faiss(rag_uploaded_file)
            if not vs:
                st.error(msg)
            else:
                st.session_state.vector_store = vs
                st.session_state.rag_current_file_key = file_key
                st.toast(f"RAG 參考知識庫已更新!{msg}", icon="✅")
332
+
333
+ # --- 2. 處理 JSON 批量分析檔案 (json_uploaded_file) ---
334
# --- 2. JSON batch-analysis file (json_uploaded_file) ---
if 'json_current_file_key' not in st.session_state:
    st.session_state.json_current_file_key = None

if json_uploaded_file:
    # Name plus size identifies the upload across Streamlit reruns.
    json_file_key = f"json_{json_uploaded_file.name}_{json_uploaded_file.size}"

    if st.session_state.json_current_file_key != json_file_key or 'json_data_for_batch' not in st.session_state:
        try:
            # json.loads on bytes detects UTF-8/16/32 itself and accepts a
            # leading BOM, which decode("utf-8") followed by json.load did not.
            json_data = json.loads(json_uploaded_file.getvalue())
        except Exception as e:
            st.error(f"JSON 檔案解析錯誤: {e}")
            # Never keep data from an earlier file when the new one is invalid.
            if 'json_data_for_batch' in st.session_state:
                del st.session_state.json_data_for_batch
        else:
            st.session_state.json_data_for_batch = json_data
            st.session_state.json_current_file_key = json_file_key
            st.toast("JSON Log 檔案已載入,請按 '執行批量分析'。", icon="📄")

# The uploader is empty: clear the loaded logs and their results.
elif 'json_data_for_batch' in st.session_state:
    del st.session_state.json_data_for_batch
    # Reset to None instead of deleting: later code in this same run reads
    # st.session_state.batch_results, which raises if the key is missing.
    st.session_state.json_current_file_key = None
    st.session_state.batch_results = None
    st.info("JSON 檔案已移除,已清除日誌數據和分析結果。")
360
+
361
+ # === 執行批量分析邏輯 (包含顏色控制) ===
362
def _extract_logs(data):
    """Return the list of log entries contained in parsed JSON ``data``."""
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        # Explicit containers are checked first; the generic "every value is
        # an entry" rule would otherwise swallow {"alerts": [...]} as one entry.
        for container_key in ('alerts', 'logs'):
            if isinstance(data.get(container_key), list):
                return data[container_key]
        if all(isinstance(v, (dict, str, list)) for v in data.values()):
            return list(data.values())
    return [data]


def _is_high_priority(analysis_text):
    """Return True when the 'Criticality/Priority:' line reports Yes, High or Medium."""
    marker = 'criticality/priority:'
    lowered = analysis_text.lower()
    if marker not in lowered:
        return False
    # Both the marker lookup and the split work on the lower-cased text, so
    # any capitalisation of the heading is recognised.
    priority_line = lowered.split(marker, 1)[1].split('\n')[0]
    return any(word in priority_line for word in ('high', 'medium', 'yes'))


# === Batch analysis run (triggered by the sidebar button) ===
if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
    st.session_state.execute_batch_analysis = False  # consume the one-shot flag
    start_time = time.time()
    st.session_state.batch_results = []

    logs_list = _extract_logs(st.session_state.json_data_for_batch)

    if llm_pipeline is None:
        # Without a model every call would only yield the same error string,
        # so the run is skipped (no st.stop(): the rest of the page still renders).
        st.error("Hugging Face LLM Pipeline 未載入,請檢查依賴和環境資源,無法執行批量分析。")
    elif not logs_list:
        st.error("無法從上傳的 JSON 檔案中提取 Log 列表或有效的 Log 條目。請檢查檔案結構。")
    else:
        vs = st.session_state.get("vector_store", None)
        if vs:
            st.success("✅ RAG 知識庫已啟用並用於分析。")
        else:
            st.warning("⚠️ RAG 知識庫未載入,將單純執行 Log 分析。")

        # --- Build one sliding-window sequence per log entry ---
        formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]

        analysis_sequences = []
        for i, original_entry in enumerate(logs_list):
            start_index = max(0, i - WINDOW_SIZE + 1)
            current_window = formatted_logs[start_index:i + 1]  # ends on the target log
            last_pos = len(current_window) - 1

            labelled_logs = []
            for j, log_str in enumerate(current_window):
                is_target = " <<< TARGET LOG TO ANALYZE" if j == last_pos else ""
                # 1-based number, matching the "Log #" ids used in the results,
                # plus the entry's distance to the target log.
                labelled_logs.append(
                    f"--- Log Index {start_index + j + 1} ({last_pos - j} before target){is_target} ---\n{log_str}"
                )

            analysis_sequences.append({
                "sequence_text": "\n\n".join(labelled_logs),
                "window_length": len(current_window),
                "target_log_id": i + 1,
                "original_log_entry": original_entry,
            })

        total_sequences = len(analysis_sequences)
        if total_sequences < WINDOW_SIZE:
            st.warning(f"Log 總數 ({total_sequences}) 少於視窗大小 ({WINDOW_SIZE}),分析的結果可能較不準確。")

        # --- Run the analysis sequence by sequence ---
        st.header(f"⚡ 批量分析執行中 (平移視窗 $N={WINDOW_SIZE}$)...")
        progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
        results_container = st.container()

        for i, seq_data in enumerate(analysis_sequences):
            log_id = seq_data["target_log_id"]
            progress_bar.progress((i + 1) / total_sequences, text=f"已處理 {i + 1}/{total_sequences} 個序列 (目標 Log #{log_id})...")

            # Only the model call is guarded, so each log yields exactly one
            # stored result whether it succeeded or failed.
            failed = False
            try:
                response, retrieved_ctx = generate_rag_response_hf_for_log(
                    llm_pipeline=llm_pipeline,
                    model_id=MODEL_ID,
                    log_sequence_text=seq_data["sequence_text"],
                    user_prompt=analysis_prompt,
                    sys_prompt=system_prompt,
                    vector_store=vs,
                    threshold=similarity_threshold,
                    max_output_tokens=max_output_tokens,
                    temperature=temperature,
                    top_p=top_p
                )
            except Exception as e:
                failed = True
                response, retrieved_ctx = f"ERROR: Log {log_id} 序列處理失敗: {e}", ""

            item = {
                "log_id": log_id,
                "log_content": seq_data["original_log_entry"],   # raw entry
                "sequence_analyzed": seq_data["sequence_text"],  # window sent to the LLM
                "analysis_result": response,
                "context": retrieved_ctx
            }
            st.session_state.batch_results.append(item)

            with results_container:
                if failed:
                    st.error(response)
                    continue

                st.subheader(f"Log/Alert #{item['log_id']} (序列分析完成)")
                with st.expander(f"序列內容 (包含 {seq_data['window_length']} 條 Log)"):
                    st.code(item["sequence_analyzed"], language='text')

                st.markdown(f"### 🤖 分析結果 (針對 Log #{log_id})")
                # Red box for critical / medium-high answers, blue otherwise.
                if _is_high_priority(response):
                    st.error(item['analysis_result'])
                else:
                    st.info(item['analysis_result'])

                if item['context']:
                    with st.expander("參考的 RAG 知識庫片段"):
                        st.code(item['context'])
                st.markdown("---")

        end_time = time.time()
        progress_bar.empty()
        st.success(f"批量分析完成!共處理 {total_sequences} 個 Log 序列,耗時 {end_time - start_time:.2f} 秒。")
        st.divider()
514
+
515
+ # === 顯示結果 (歷史紀錄) (保持不變) ===
516
# === Stored results: offer the Markdown report for download ===
# Read with .get(): the key can be absent after session state was cleaned up
# earlier in the run, and attribute access on a missing key raises.
stored_results = st.session_state.get("batch_results")
if stored_results and not st.session_state.get("execute_batch_analysis", False):
    st.header("⚡ 上次分析結果 (歷史紀錄)")

    full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
    for item in stored_results:
        log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
        full_report_chunks.append(f"---\n\n### Log/Alert #{item['log_id']}\n\n#### 原始內容\n```json\n{log_content_str_for_report}\n```\n\n#### LLM 分析結果\n{item['analysis_result']}\n")

    st.info(f"偵測到 {len(stored_results)} 條 Log 的歷史分析結果。")
    st.download_button(
        label="📥 下載上次的完整報告 (.md)",
        data="\n".join(full_report_chunks),
        file_name="security_batch_analysis_report_history.md",
        mime="text/markdown"
    )