Update src/streamlit_app.py
src/streamlit_app.py  CHANGED  (+195 -163)
@@ -48,6 +48,15 @@ WINDOW_SIZE = 8
 with st.sidebar:
     st.header("⚙️ Settings")
 
+    # === New: Hugging Face token input (important) ===
+    hf_token = st.text_input(
+        "Hugging Face Access Token (required)",
+        type="password",
+        key="hf_token_input",
+        help=f"Your Hugging Face access token is required to load gated models such as {MODEL_ID}."
+    )
+    # ============================================
+
     # === Replaced by a Hugging Face model-name display (API-key input removed) ===
     st.info(f"LLM model: **{MODEL_ID}** (Hugging Face model)")
     st.warning("⚠️ **Note**: an 8B model needs substantial RAM/VRAM and compute; it may run slowly or fail.")
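A note on the new token input: it lives only in the session, so the token must be retyped whenever the session resets. A common variant is to fall back to an environment variable first. A minimal sketch, assuming the variable name HF_TOKEN (this change does not define it):

import os
import streamlit as st

# Hypothetical fallback: prefer an environment variable, else the sidebar input.
# "HF_TOKEN" is an assumed name; huggingface_hub also reads this variable.
hf_token = os.environ.get("HF_TOKEN", "") or st.text_input(
    "Hugging Face Access Token (required)",
    type="password",
    key="hf_token_input",
)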
@@ -77,11 +86,15 @@ with st.sidebar:
     )
     st.markdown("This prompt runs one independent analysis for **every log entry** in the JSON file.")
 
-
-
-
-
+    # Check that both the token and the file are ready
+    can_execute = json_uploaded_file and hf_token
+
+    if st.button("🚀 Run batch analysis", disabled=not can_execute):
+        st.session_state.execute_batch_analysis = True
+    elif not json_uploaded_file:
         st.info("Please upload a JSON file to enable the batch-analysis button.")
+    elif not hf_token:
+        st.info("Please enter a Hugging Face access token to load the model.")
 
     st.divider()
 
@@ -96,38 +109,44 @@ with st.sidebar:
     st.subheader("Model parameters")
     system_prompt = st.text_area("System Prompt (used by the LLM)", value="You are a Senior Security Analyst. Be professional.", height=100)
     max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
-    temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
+    temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
     top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
 
     st.divider()
     if st.button("🗑️ Clear all records"):
         for key in list(st.session_state.keys()):
-            if key not in []:
+            if key not in ['hf_token_input']:  # keep the token input
                 del st.session_state[key]
         st.rerun()
 
-# --- Initialize the Hugging Face LLM client (
+# --- Initialize the Hugging Face LLM client (major change: pass the token and use a CausalLM) ---
 @st.cache_resource
-def load_huggingface_llm(model_id):
+def load_huggingface_llm(model_id, hf_token):
     if AutoModelForCausalLM is None:
         st.error("Could not load the Hugging Face dependencies. Install them with: pip install transformers torch accelerate bitsandbytes")
         return None
     try:
         # Use 4-bit quantization to cut memory use; a common way to run an 8B model
-        tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            token=hf_token,  # pass the token
+            trust_remote_code=True
+        )
+
+        # Make sure AutoModelForCausalLM is used
         model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-
-
-
-            # load_in_4bit=True  # if 4-bit quantization is needed
+            model_id,
+            token=hf_token,  # pass the token
+            torch_dtype=torch.bfloat16,  # bfloat16 or float16 is recommended to reduce memory
+            device_map="auto"  # automatically place the model across GPU/CPU
         )
+
         # Use a pipeline to simplify the calls
         llm_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
-
+            model_kwargs={"torch_dtype": torch.bfloat16}
         )
         st.success(f"Hugging Face model **{model_id}** loaded successfully.")
         return llm_pipeline
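This hunk drops the commented-out load_in_4bit flag in favor of bfloat16 weights. If memory is still too tight for the 8B model, the 4-bit path the comment alludes to is expressed through BitsAndBytesConfig in current transformers. A minimal sketch, assuming bitsandbytes is installed and a CUDA GPU is available:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

def load_llm_4bit(model_id: str, hf_token: str):
    # NF4 4-bit weights with bfloat16 compute: roughly a quarter of the
    # bfloat16 weight memory, at some quality and speed cost.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=hf_token,
        quantization_config=bnb_config,
        device_map="auto",
    )
    return pipeline("text-generation", model=model, tokenizer=tokenizer)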
@@ -137,12 +156,14 @@ def load_huggingface_llm(model_id):
 
 # Initialize the pipeline outside the main block
 llm_pipeline = None
-if AutoModelForCausalLM is not None:
+if AutoModelForCausalLM is not None and hf_token:  # only try to load when a token is present
     with st.spinner(f"Loading LLM model: {MODEL_ID} (8B)... (this can take several minutes)"):
-        llm_pipeline = load_huggingface_llm(MODEL_ID)
+        llm_pipeline = load_huggingface_llm(MODEL_ID, hf_token)  # pass the token
 
-if llm_pipeline is None:
-    st.warning("Hugging Face LLM
+if llm_pipeline is None and hf_token:
+    st.warning("The Hugging Face LLM could not be loaded. Check the dependencies, the runtime resources, and that the token is valid.")
+elif not hf_token:
+    st.info("Enter a Hugging Face access token in the left sidebar to load the model.")
 # =======================================================================
 
 
@@ -238,13 +259,13 @@ def faiss_cosine_search_all(vector_store, query, threshold):
     selected.sort(key=lambda x: x[1], reverse=True)
     return selected
 
-# === Hugging Face single-log analysis response (core batch-processing function) (
+# === Hugging Face single-log analysis response (core batch-processing function) (major change: uses ChatML) ===
 def generate_rag_response_hf_for_log(llm_pipeline, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
     """
     Run a RAG-augmented log-sequence analysis with the Hugging Face LLM.
     """
     if llm_pipeline is None:
-        return "ERROR: Hugging Face LLM Pipeline
+        return "ERROR: the Hugging Face LLM pipeline is not loaded or the token is invalid.", ""
 
     context_text = ""
     # 1. RAG retrieval logic
@@ -268,16 +289,26 @@ Based on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the *
 {log_sequence_text}
 === END LOG SEQUENCE ==="""
 
-    # Combine the system prompt, the RAG context, and the log
-
-
-        f"
-        f"
-
-
+    # 3. Combine the system prompt, the RAG context, and the log content, using the ChatML format
+    user_message_content = (
+        f"RAG & ANALYSIS INSTRUCTION:\n{rag_instruction}\n\n"
+        f"LOG DATA:\n{log_content_section}\n\n"
+        f"RESPONSE:"
+    )
+
+    messages = [
+        {"role": "system", "content": sys_prompt},
+        {"role": "user", "content": user_message_content}
+    ]
+
+    # Convert to the Llama 3 / ChatML format
+    full_prompt = llm_pipeline.tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
     )
 
-    #
+    # 4. Call the Hugging Face pipeline
     try:
         # Pipeline parameter settings
         response = llm_pipeline(
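For reference, apply_chat_template with tokenize=False and add_generation_prompt=True renders the messages into the model's own chat markup (for Llama 3, the <|start_header_id|>...<|eot_id|> framing) and appends the assistant header so generation starts at the reply. A minimal sketch of the round trip, assuming pipe is a loaded text-generation pipeline; return_full_text=False keeps only the continuation rather than the prompt plus reply:

messages = [
    {"role": "system", "content": "You are a Senior Security Analyst."},
    {"role": "user", "content": "Summarize this log entry: ..."},
]

full_prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(
    full_prompt,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    return_full_text=False,  # drop the echoed prompt, keep only the reply
)
reply = outputs[0]["generated_text"].strip()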
@@ -365,152 +396,153 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
     st.session_state.batch_results = []
 
     if llm_pipeline is None:
-        st.error("Hugging Face LLM Pipeline
-        # Since this is a Streamlit app, we do not call st.stop() directly, so the user can review the settings
+        st.error("The Hugging Face LLM pipeline is not loaded or the token is invalid; batch analysis cannot run.")
         st.session_state.execute_batch_analysis = False
-
-    data_to_process = st.session_state.json_data_for_batch
-
-    # Logic for extracting the log list (unchanged)
-    logs_list = []
-    if isinstance(data_to_process, list):
-        logs_list = data_to_process
-    elif isinstance(data_to_process, dict):
-        if all(isinstance(v, (dict, str, list)) for v in data_to_process.values()):
-            logs_list = list(data_to_process.values())
-        elif 'alerts' in data_to_process and isinstance(data_to_process['alerts'], list):
-            logs_list = data_to_process['alerts']
-        elif 'logs' in data_to_process and isinstance(data_to_process['logs'], list):
-            logs_list = data_to_process['logs']
-        else:
-            logs_list = [data_to_process]
-    else:
-        logs_list = [data_to_process]
-
-    if logs_list:
-        vs = st.session_state.get("vector_store", None)
-        if vs:
-            st.success("✅ The RAG knowledge base is enabled and will be used for the analysis.")
-        else:
-            st.warning("⚠️ The RAG knowledge base is not loaded; running plain log analysis.")
-
-        # --- New: build sliding-window sequences ---
-
-        # Convert all logs to JSON-formatted strings so they can be concatenated later
-        formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
-
-
-
-
-
-
-
-
-
-                is_target = " <<< TARGET LOG TO ANALYZE" if j == len(current_window) - 1 else ""
-                # Use i-len(current_window)+j+1 to compute the original index
-                sequence_text.append(f"--- Log Index {i - len(current_window) + j + 1} ({len(current_window)-j} prior logs){is_target} ---\n{log_str}")
-
-
-
-                "target_log_id": i + 1,  # the analysis target of this sequence is log i+1 in the original list
-                "original_log_entry": logs_list[i]
-            })
-
-
-
-
-
-        st.header(f"⚡ Batch analysis running (sliding window $N={WINDOW_SIZE}$)...")
-        progress_bar = st.progress(0, text=f"Preparing to process {total_sequences} sequences...")
-        results_container = st.container()
-        full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
-
-        priority_keyword = "Criticality/Priority:"
-
-        for i, seq_data in enumerate(analysis_sequences):
-            log_id = seq_data["target_log_id"]
-            progress_bar.progress((i + 1) / total_sequences, text=f"Processed {i + 1}/{total_sequences} sequences (target log #{log_id})...")
-
-
-
-
-                llm_pipeline=llm_pipeline,  # <--- the new LLM pipeline
-                model_id=MODEL_ID,
-                log_sequence_text=seq_data["sequence_text"],
-                user_prompt=analysis_prompt,
-                sys_prompt=system_prompt,
-                vector_store=vs,
-                threshold=similarity_threshold,
-                max_output_tokens=max_output_tokens,
-                temperature=temperature,
-                top_p=top_p
-            )
-
-            # Store the result
-            item = {
-                "log_id": log_id,
-                "log_content": seq_data["original_log_entry"],  # record the original log entry
-                "sequence_analyzed": seq_data["sequence_text"],  # record the analyzed sequence
-                "analysis_result": response,
-                "context": retrieved_ctx
-            }
-            st.session_state.batch_results.append(item)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    pass
-
-            st.markdown(f"### 🤖 Analysis result (for log #{log_id})")
-            if is_high_priority:
-                st.error(item['analysis_result'])
-            else:
-                st.info(item['analysis_result'])
-
-            if item['context']:
-                with st.expander("Referenced RAG knowledge-base snippets"):
-                    st.code(item['context'])
-            st.markdown("---")
-
-            # Report chunks
-            log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
-            full_report_chunks.append(f"---\n\n### Log/Alert #{item['log_id']} (sequence analysis)\n\n#### Analyzed sequence\n```\n{seq_data['sequence_text']}\n```\n\n#### LLM analysis result\n{item['analysis_result']}\n")
-
-
-
-
-
-
-
-
-
-
-            with results_container:
-                st.error(error_message)
-
-
-
-
-
-
-
-
+    else:  # continue only when the LLM loaded successfully
+        data_to_process = st.session_state.json_data_for_batch
+
+        # Logic for extracting the log list (unchanged)
+        logs_list = []
+        if isinstance(data_to_process, list):
+            logs_list = data_to_process
+        elif isinstance(data_to_process, dict):
+            if all(isinstance(v, (dict, str, list)) for v in data_to_process.values()):
+                logs_list = list(data_to_process.values())
+            elif 'alerts' in data_to_process and isinstance(data_to_process['alerts'], list):
+                logs_list = data_to_process['alerts']
+            elif 'logs' in data_to_process and isinstance(data_to_process['logs'], list):
+                logs_list = data_to_process['logs']
+            else:
+                logs_list = [data_to_process]
+        else:
+            logs_list = [data_to_process]
+
+        if logs_list:
+            vs = st.session_state.get("vector_store", None)
+            if vs:
+                st.success("✅ The RAG knowledge base is enabled and will be used for the analysis.")
+            else:
+                st.warning("⚠️ The RAG knowledge base is not loaded; running plain log analysis.")
+
+            # --- New: build sliding-window sequences ---
+
+            # Convert all logs to JSON-formatted strings so they can be concatenated later
+            formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
+
+            # Build the list of sequences to analyze (sliding window)
+            analysis_sequences = []
+
+            for i in range(len(formatted_logs)):
+                start_index = max(0, i - WINDOW_SIZE + 1)
+                end_index = i + 1  # the window ends at the current log
+
+                current_window = formatted_logs[start_index:end_index]
+
+                sequence_text = []
+                for j, log_str in enumerate(current_window):
+                    is_target = " <<< TARGET LOG TO ANALYZE" if j == len(current_window) - 1 else ""
+                    # Use i-len(current_window)+j+1 to compute the original index
+                    sequence_text.append(f"--- Log Index {i - len(current_window) + j + 1} ({len(current_window)-j} prior logs){is_target} ---\n{log_str}")
+
+                analysis_sequences.append({
+                    "sequence_text": "\n\n".join(sequence_text),
+                    "target_log_id": i + 1,  # the analysis target of this sequence is log i+1 in the original list
+                    "original_log_entry": logs_list[i]
+                })
+
+            total_sequences = len(analysis_sequences)
+            if total_sequences < WINDOW_SIZE:
+                st.warning(f"The total number of logs ({total_sequences}) is below the window size ({WINDOW_SIZE}); the analysis may be less accurate.")
+
+            # --- Run the sequence analysis ---
+            st.header(f"⚡ Batch analysis running (sliding window $N={WINDOW_SIZE}$)...")
+            progress_bar = st.progress(0, text=f"Preparing to process {total_sequences} sequences...")
+            results_container = st.container()
+            full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
+
+            priority_keyword = "Criticality/Priority:"
+
+            for i, seq_data in enumerate(analysis_sequences):
+                log_id = seq_data["target_log_id"]
+                progress_bar.progress((i + 1) / total_sequences, text=f"Processed {i + 1}/{total_sequences} sequences (target log #{log_id})...")
+
+                try:
+                    # *** Replaced with the Hugging Face call function ***
+                    response, retrieved_ctx = generate_rag_response_hf_for_log(
+                        llm_pipeline=llm_pipeline,  # <--- the new LLM pipeline
+                        model_id=MODEL_ID,
+                        log_sequence_text=seq_data["sequence_text"],
+                        user_prompt=analysis_prompt,
+                        sys_prompt=system_prompt,
+                        vector_store=vs,
+                        threshold=similarity_threshold,
+                        max_output_tokens=max_output_tokens,
+                        temperature=temperature,
+                        top_p=top_p
+                    )
+
+                    # Store the result
+                    item = {
+                        "log_id": log_id,
+                        "log_content": seq_data["original_log_entry"],  # record the original log entry
+                        "sequence_analyzed": seq_data["sequence_text"],  # record the analyzed sequence
+                        "analysis_result": response,
+                        "context": retrieved_ctx
+                    }
+                    st.session_state.batch_results.append(item)
+
+                    # Result display logic
+                    with results_container:
+                        st.subheader(f"Log/Alert #{item['log_id']} (sequence analysis finished)")
+                        with st.expander(f"Sequence content (contains {len(seq_data['sequence_text'].split('--- Log Index'))-1} logs)"):
+                            st.code(item["sequence_analyzed"], language='text')
+
+                        # Color control:
+                        is_high_priority = False
+                        if 'criticality/priority:' in response.lower():
+                            try:
+                                # Try to extract the priority from the LLM response
+                                priority_section = response.split('criticality/priority:')[1].split('\n')[0].strip()
+                                if 'high' in priority_section.lower() or 'medium' in priority_section.lower() or 'yes' in priority_section.lower():
+                                    is_high_priority = True
+                            except IndexError:
+                                pass
+
+                        st.markdown(f"### 🤖 Analysis result (for log #{log_id})")
+                        if is_high_priority:
+                            st.error(item['analysis_result'])
+                        else:
+                            st.info(item['analysis_result'])
+
+                        if item['context']:
+                            with st.expander("Referenced RAG knowledge-base snippets"):
+                                st.code(item['context'])
+                        st.markdown("---")
+
+                        # Report chunks
+                        log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
+                        full_report_chunks.append(f"---\n\n### Log/Alert #{item['log_id']} (sequence analysis)\n\n#### Analyzed sequence\n```\n{seq_data['sequence_text']}\n```\n\n#### LLM analysis result\n{item['analysis_result']}\n")
+
+                except Exception as e:
+                    error_message = f"ERROR: processing the sequence for log {log_id} failed: {e}"
+                    st.session_state.batch_results.append({
+                        "log_id": log_id,
+                        "log_content": seq_data["original_log_entry"],
+                        "sequence_analyzed": seq_data["sequence_text"],
+                        "analysis_result": error_message,
+                        "context": ""
+                    })
+                    with results_container:
+                        st.error(error_message)
+
+            end_time = time.time()
+            progress_bar.empty()
+            st.success(f"Batch analysis finished! Processed {total_sequences} log sequences in {end_time - start_time:.2f} seconds.")
+            st.divider()
+
+        else:
+            st.error("Could not extract a log list or valid log entries from the uploaded JSON file. Check the file structure.")
 
 # === Display results (history) (unchanged) ===
 if st.session_state.batch_results and not st.session_state.execute_batch_analysis:
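The window arithmetic in the hunk above is easiest to sanity-check in isolation. A standalone sketch with hypothetical data (a WINDOW_SIZE of 3 and five logs): each target log is preceded by at most two earlier logs, and the Log Index labels recover the original zero-based positions while target_log_id stays one-based:

WINDOW_SIZE = 3
formatted_logs = [f"log-{n}" for n in range(5)]  # stand-ins for the JSON strings

for i in range(len(formatted_logs)):
    start_index = max(0, i - WINDOW_SIZE + 1)
    current_window = formatted_logs[start_index:i + 1]
    labels = [i - len(current_window) + j + 1 for j in range(len(current_window))]
    print(f"target_log_id={i + 1}, window indices={labels}")

# target_log_id=1, window indices=[0]
# target_log_id=2, window indices=[0, 1]
# target_log_id=3, window indices=[0, 1, 2]
# target_log_id=4, window indices=[1, 2, 3]
# target_log_id=5, window indices=[2, 3, 4]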
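A last note on the priority check in the batch loop: the hunk defines priority_keyword but then matches a hard-coded lowercase literal, and the split-based parse only fires on the exact 'criticality/priority:' spelling, so a heading like 'Criticality / Priority:' is missed and falls through to st.info. A slightly more tolerant regex variant, sketched here (the heading format itself is an assumption about the LLM's output, not something the prompt enforces):

import re

def is_high_priority(response: str) -> bool:
    # Matches e.g. "Criticality/Priority: **High**" or "criticality / priority: medium".
    m = re.search(r"criticality\s*/\s*priority\s*:\s*\**\s*(\w+)", response, re.IGNORECASE)
    return bool(m) and m.group(1).lower() in {"high", "medium", "yes"}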