ss900371tw committed on
Commit
c7f04bb
·
verified ·
1 Parent(s): cba1cd9

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +432 -172
src/streamlit_app.py CHANGED
@@ -1,14 +1,14 @@
1
-
2
  import streamlit as st
3
  import os
4
  import io
5
  import json
6
- import csv # <--- 新增:用於處理 CSV
7
  import numpy as np
8
  import faiss
9
  import uuid
10
  import time
11
  import sys
 
12
 
13
  # === HuggingFace 模型相關套件 (替換為 InferenceClient) ===
14
  try:
@@ -23,7 +23,7 @@ from langchain_community.vectorstores import FAISS
23
  from langchain_community.vectorstores.utils import DistanceStrategy
24
  from langchain_community.docstore.in_memory import InMemoryDocstore
25
 
26
- # 嘗試匯入 pypdftry
27
  try:
28
  import pypdf
29
  except ImportError:
@@ -31,91 +31,248 @@ except ImportError:
31
 
32
  # --- 頁面設定 ---
33
  st.set_page_config(page_title="Cybersecurity AI Assistant (Hugging Face RAG & Batch Analysis)", page_icon="🛡️", layout="wide")
34
- st.title("🛡️ Meta-Llama-3-8B-Instruct with FAISS RAG & Batch Analysis (Inference Client)")
35
- st.markdown("已啟用:**IndexFlatIP** + **L2 正規化** + **Hugging Face Inference Client (API)**。支援 JSON/CSV/TXT 執行批量分析。")
36
 
 
37
  if 'execute_batch_analysis' not in st.session_state:
38
  st.session_state.execute_batch_analysis = False
39
  if 'batch_results' not in st.session_state:
40
- st.session_state.batch_results = None
41
  if 'rag_current_file_key' not in st.session_state:
42
  st.session_state.rag_current_file_key = None
43
- if 'batch_current_file_key' not in st.session_state: # 修改變數名稱以反映多格式
44
  st.session_state.batch_current_file_key = None
45
  if 'vector_store' not in st.session_state:
46
  st.session_state.vector_store = None
47
- if 'json_data_for_batch' not in st.session_state: # 變數名稱保留,但內容可能是轉換後的 dict
48
  st.session_state.json_data_for_batch = None
49
 
50
  # 設定模型 ID
51
- MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
52
- WINDOW_SIZE = 8
53
 
54
- # --- 側邊欄設定 ---
55
- with st.sidebar:
56
- st.header("⚙️ 設定")
57
-
58
- if not os.environ.get("HF_TOKEN"):
59
- st.error("環境變數 **HF_TOKEN** 未設定。請設定後重新啟動應用程式。")
60
-
61
- st.info(f"LLM 模型:**{MODEL_ID}** (Hugging Face Inference API)")
62
- st.warning("⚠️ **注意**: 該模型使用 Inference API 呼叫,請確保您的 HF Token 具有存取權限。")
63
-
64
- st.divider()
65
- st.subheader("📂 檔案上傳")
66
-
67
- # === 1. 批量分析檔案 (修改處:支援多種格式) ===
68
- batch_uploaded_file = st.file_uploader(
69
- "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
70
- type=['json', 'csv', 'txt'], # <--- 修改:新增 csv 和 txt
71
- key="batch_uploader",
72
- help="支援 JSON (Array), CSV (含標題), TXT (每行一條 Log)"
73
- )
 
 
 
 
 
 
 
 
 
74
 
75
- # === 2. RAG 知識庫檔案 ===
76
- rag_uploaded_file = st.file_uploader(
77
- "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
78
- type=['txt', 'py', 'log', 'csv', 'md', 'pdf'],
79
- key="rag_uploader"
80
- )
81
 
82
- st.divider()
83
-
84
- st.subheader("💡 批量分析指令")
85
- analysis_prompt = st.text_area(
86
- "針對每個 Log/Alert 執行的指令",
87
- value="You are a security expert in charge of analyzing alerts related to Web Application Attacks and Brute Force & Reconnaissance. Respond with a clear, structured analysis using the following mandatory sections: \n\n- Priority: Provide the overall priority level. (Answer High risk, Medium risk, or Low risk only) \n- Explanation: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, explain the potential impact and why this specific alert requires attention. If not, **omit the explanation section**. \n- Action Plan: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, What should be the immediate steps to address this specific alert? If not, **omit the action plan section**. \n\nStrictly use the information in the provided Log.",
88
- height=200
89
- )
90
- st.markdown("此指令將對檔案中的**每一個 Log 條目**執行一次獨立分析。")
91
-
92
- if batch_uploaded_file:
93
- if st.button("🚀 執行批量分析"):
94
- if not os.environ.get("HF_TOKEN"):
95
- st.error("無法執行,環境變數 **HF_TOKEN** 未設定。")
96
- else:
97
- st.session_state.execute_batch_analysis = True
98
- else:
99
- st.info("請上傳 Log 檔案以啟用批量分析按鈕。")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- st.divider()
102
- st.subheader("🔍 RAG 檢索設定")
103
- similarity_threshold = st.slider("📐 Cosine Similarity 門檻", 0.0, 1.0, 0.4, 0.01)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- st.divider()
106
- st.subheader("模型參數")
107
- system_prompt = st.text_area("System Prompt", value="You are a Senior Security Analyst, named Ernest. You provide expert, authoritative, and concise advice on Information Security. Your analysis must be based strictly on the provided context.", height=100)
108
- max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
109
- temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
110
- top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
 
111
 
112
- st.divider()
113
- if st.button("🗑️ 清除所有紀錄"):
114
- for key in list(st.session_state.keys()):
115
- del st.session_state[key]
116
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- # --- 初始化 Hugging Face LLM Client ---
 
119
  @st.cache_resource
120
  def load_inference_client(model_id):
121
  if not os.environ.get("HF_TOKEN"): return None
@@ -131,9 +288,10 @@ inference_client = None
131
  if os.environ.get("HF_TOKEN"):
132
  with st.spinner(f"正在連線到 Inference Client: {MODEL_ID}..."):
133
  inference_client = load_inference_client(MODEL_ID)
 
134
  if inference_client is None and os.environ.get("HF_TOKEN"):
135
  st.warning("Hugging Face Inference Client 無法連線。")
136
- elif not os.environ.get("HF_TOKEN"):
137
  st.error("請在環境變數中設定 HF_TOKEN。")
138
 
139
  # === Embedding 模型 (保持不變) ===
@@ -159,25 +317,25 @@ def process_file_to_faiss(uploaded_file):
159
  else:
160
  stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
161
  text_content = stringio.read()
162
-
163
  if not text_content.strip(): return None, "File is empty"
164
-
165
  events = [line for line in text_content.splitlines() if line.strip()]
166
  docs = [Document(page_content=e) for e in events]
167
  if not docs: return None, "No documents created"
168
-
169
  embeddings = embedding_model.embed_documents([d.page_content for d in docs])
170
  embeddings_np = np.array(embeddings).astype("float32")
171
  faiss.normalize_L2(embeddings_np)
172
-
173
  dimension = embeddings_np.shape[1]
174
  index = faiss.IndexFlatIP(dimension)
175
  index.add(embeddings_np)
176
-
177
  doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
178
  docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
179
  index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
180
-
181
  vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
182
  return vector_store, f"{len(docs)} chunks created."
183
  except Exception as e:
@@ -199,32 +357,72 @@ def faiss_cosine_search_all(vector_store, query, threshold):
199
  selected.sort(key=lambda x: x[1], reverse=True)
200
  return selected
201
 
202
- # === Hugging Face 生成單一 Log 分析回答 (保持不變) ===
203
- def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
204
- if client is None: return "ERROR: Client Error", ""
205
- context_text = ""
206
- if vector_store:
207
- selected = faiss_cosine_search_all(vector_store, log_sequence_text, threshold)
208
- if selected:
209
- retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
210
- context_text = "\n".join(retrieved_contents)
211
-
212
- rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ==={context_text if context_text else 'No relevant reference context found.'}=== END REFERENCE CONTEXT ===\nANALYSIS INSTRUCTION: {user_prompt}\nBased on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""
213
- log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===\n{log_sequence_text}\n=== END LOG SEQUENCE ==="""
214
-
215
- messages = [
216
- {"role": "system", "content": sys_prompt},
217
- {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
218
- ]
219
- try:
220
- response_stream = client.chat_completion(messages, max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, stream=False)
221
- if response_stream and response_stream.choices:
222
- return response_stream.choices[0].message.content.strip(), context_text
223
- else: return "Format Error", context_text
224
- except Exception as e: return f"Model Error: {str(e)}", context_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
  # =======================================================================
227
- # === 檔案處理區塊 (RAG 檔案) ===
228
  if rag_uploaded_file:
229
  file_key = f"vs_{rag_uploaded_file.name}_{rag_uploaded_file.size}"
230
  if st.session_state.rag_current_file_key != file_key or 'vector_store' not in st.session_state:
@@ -240,44 +438,27 @@ elif 'vector_store' in st.session_state:
240
  del st.session_state.rag_current_file_key
241
  st.info("RAG 檔案已移除,已清除相關知識庫。")
242
 
243
- # === 檔案處理區塊 (批量分析檔案 - 重大修改處) ===
244
- # 支援 JSON, CSV, TXT 並統一轉換為 list of dicts
245
  if batch_uploaded_file:
246
  batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
247
-
248
  if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
249
  try:
250
- stringio = io.StringIO(batch_uploaded_file.getvalue().decode("utf-8"))
251
- parsed_data = None
252
-
253
- # --- Case 1: JSON ---
254
- if batch_uploaded_file.name.lower().endswith('.json'):
255
- parsed_data = json.load(stringio)
256
- st.toast("JSON 檔案載入成功", icon="📄")
257
-
258
- # --- Case 2: CSV ---
259
- elif batch_uploaded_file.name.lower().endswith('.csv'):
260
- # 使用 DictReader 將 CSV 轉為 List of Dicts
261
- reader = csv.DictReader(stringio)
262
- parsed_data = list(reader)
263
- st.toast("CSV 檔案已轉換為 JSON 結構", icon="📊")
264
-
265
- # --- Case 3: TXT ---
266
- else: # 預設為 TXT
267
- # 將每一行包裝成一個 JSON 物件: {"raw_content": "line text"}
268
- lines = stringio.readlines()
269
- parsed_data = [{"raw_log_entry": line.strip()} for line in lines if line.strip()]
270
- st.toast("TXT 檔案已轉換為 JSON 結構", icon="📝")
271
-
272
  # 儲存處理後的數據
273
  st.session_state.json_data_for_batch = parsed_data
274
  st.session_state.batch_current_file_key = batch_file_key
275
-
 
276
  except Exception as e:
277
  st.error(f"檔案解析錯誤: {e}")
278
  if 'json_data_for_batch' in st.session_state:
279
  del st.session_state.json_data_for_batch
280
-
281
  elif 'json_data_for_batch' in st.session_state:
282
  del st.session_state.json_data_for_batch
283
  del st.session_state.batch_current_file_key
@@ -285,65 +466,87 @@ elif 'json_data_for_batch' in st.session_state:
285
  del st.session_state.batch_results
286
  st.info("批量分析檔案已移除,已清除相關數據。")
287
 
288
- # === 執行批量分析邏輯 ===
289
  if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
290
  st.session_state.execute_batch_analysis = False
291
  start_time = time.time()
292
  st.session_state.batch_results = []
293
-
294
  if inference_client is None:
295
  st.error("Client 未連線,無法執行。")
296
  else:
297
- data_to_process = st.session_state.json_data_for_batch
298
- logs_list = []
299
-
300
- # 處理不同的 JSON 結構 (Dict vs List)
301
- if isinstance(data_to_process, list):
302
- logs_list = data_to_process
303
- elif isinstance(data_to_process, dict):
304
- # 嘗試尋找常見的 key
305
- if 'alerts' in data_to_process and isinstance(data_to_process['alerts'], list):
306
- logs_list = data_to_process['alerts']
307
- elif 'logs' in data_to_process and isinstance(data_to_process['logs'], list):
308
- logs_list = data_to_process['logs']
309
- else:
310
- logs_list = [data_to_process]
311
- else:
312
- logs_list = [data_to_process]
313
-
314
  if logs_list:
315
  vs = st.session_state.get("vector_store", None)
316
-
317
  # --- 關鍵:在這裡做 JSON String 的轉換 ---
318
- # 無論來源是 CSV(Dict) 還是 TXT(Dict),都在這裡用 json.dumps 轉成字串
319
- # 這保證了 Prompt 收到的永遠是 JSON 格式的文字
320
  formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
321
-
322
  analysis_sequences = []
 
 
323
  for i in range(len(formatted_logs)):
324
- start_index = max(0, i - WINDOW_SIZE + 1)
325
- end_index = i + 1
326
- current_window = formatted_logs[start_index:end_index]
 
 
 
 
327
  sequence_text = []
328
- for j, log_str in enumerate(current_window):
329
- is_target = " <<< TARGET LOG TO ANALYZE" if j == len(current_window) - 1 else ""
330
- sequence_text.append(f"--- Log Index {i - len(current_window) + j + 1} ({len(current_window)-j} prior logs){is_target} ---\n{log_str}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  analysis_sequences.append({
332
  "sequence_text": "\n\n".join(sequence_text),
333
  "target_log_id": i + 1,
334
  "original_log_entry": logs_list[i]
335
  })
336
-
 
337
  total_sequences = len(analysis_sequences)
338
- st.header(f"⚡ 批量分析執行中 (平移視窗 $N={WINDOW_SIZE}$)...")
339
  progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
340
  results_container = st.container()
341
  full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
342
-
343
  for i, seq_data in enumerate(analysis_sequences):
344
  log_id = seq_data["target_log_id"]
345
  progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
346
-
347
  try:
348
  response, retrieved_ctx = generate_rag_response_hf_for_log(
349
  client=inference_client,
@@ -365,36 +568,93 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
365
  "context": retrieved_ctx
366
  }
367
  st.session_state.batch_results.append(item)
368
-
369
  with results_container:
370
  st.subheader(f"Log/Alert #{item['log_id']}")
371
  with st.expander("序列內容 (JSON Format)"):
372
- st.code(item["sequence_analyzed"], language='json') # 這裡顯示的會是 JSON 格式
373
-
374
- is_high = any(x in response.lower() for x in ['high risk'])
375
  if is_high: st.error(item['analysis_result'])
376
  else: st.info(item['analysis_result'])
377
  if item['context']:
378
  with st.expander("參考 RAG 片段"): st.code(item['context'])
379
  st.markdown("---")
380
-
381
  log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
382
  full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\nResult:\n{item['analysis_result']}\n")
383
-
384
  except Exception as e:
385
  st.error(f"Error Log {log_id}: {e}")
386
-
387
  end_time = time.time()
388
  progress_bar.empty()
389
  st.success(f"完成!耗時 {end_time - start_time:.2f} 秒。")
390
  else:
391
  st.error("無法提取有效 Log,請檢查檔案格式。")
392
 
393
- # === 顯示結果 (歷史紀錄) ===
394
  if st.session_state.get("batch_results") and not st.session_state.execute_batch_analysis:
395
  st.header("⚡ 歷史分析結果")
396
- full_report_chunks = ["## Report\n\n"]
 
 
 
 
 
397
  for item in st.session_state.batch_results:
398
- log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
399
- full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\n{item['analysis_result']}\n")
400
- st.download_button("📥 下載完整報告 (.md)", "\n".join(full_report_chunks), "report.md", "text/markdown")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import os
3
  import io
4
  import json
5
+ import csv
6
  import numpy as np
7
  import faiss
8
  import uuid
9
  import time
10
  import sys
11
+ from typing import List, Dict, Any
12
 
13
  # === HuggingFace 模型相關套件 (替換為 InferenceClient) ===
14
  try:
 
23
  from langchain_community.vectorstores.utils import DistanceStrategy
24
  from langchain_community.docstore.in_memory import InMemoryDocstore
25
 
26
+ # 嘗試匯入 pypdf
27
  try:
28
  import pypdf
29
  except ImportError:
 
31
 
32
  # --- 頁面設定 ---
33
  st.set_page_config(page_title="Cybersecurity AI Assistant (Hugging Face RAG & Batch Analysis)", page_icon="🛡️", layout="wide")
34
+ st.title("🛡️ fdtn-ai/Foundation-Sec-8B-Instruct with FAISS RAG & Batch Analysis (Inference Client)")
35
+ st.markdown("已啟用:**IndexFlatIP** + **L2 正規化** + **Hugging Face Inference Client (API)**。支援 JSON/CSV/TXT/**W3C Log** 執行批量分析。**批量分析序列已改為基於 IP 篩選。**")
36
 
37
+ # --- Streamlit Session State 初始化 (保持不變) ---
38
  if 'execute_batch_analysis' not in st.session_state:
39
  st.session_state.execute_batch_analysis = False
40
  if 'batch_results' not in st.session_state:
41
+ st.session_state.batch_results = None
42
  if 'rag_current_file_key' not in st.session_state:
43
  st.session_state.rag_current_file_key = None
44
+ if 'batch_current_file_key' not in st.session_state:
45
  st.session_state.batch_current_file_key = None
46
  if 'vector_store' not in st.session_state:
47
  st.session_state.vector_store = None
48
+ if 'json_data_for_batch' not in st.session_state:
49
  st.session_state.json_data_for_batch = None
50
 
51
  # 設定模型 ID
52
+ MODEL_ID = "fdtn-ai/Foundation-Sec-1.1-8B-Instruct"
53
+ WINDOW_SIZE = 20
54
 
55
# === W3C log parser: converts W3C Extended Log File Format into JSON records ===
def parse_w3c_log(log_content: str) -> List[Dict[str, Any]]:
    """
    Parse W3C Extended Log File Format content (e.g. IIS logs) into dicts.

    Column names come from the ``#Fields:`` directive line; other
    ``#``-prefixed directive lines (``#Software``, ``#Date`` ...) are ignored.

    Args:
        log_content: Raw text content of the log file.

    Returns:
        One dict per data line. Field names have ``-`` replaced by ``_``
        (e.g. ``c-ip`` -> ``c_ip``); selected fields are coerced to numbers.
        If no ``#Fields:`` directive is found, each non-directive, non-empty
        line is returned as ``{"raw_log_entry": <line>}``.
    """
    lines = log_content.splitlines()
    field_names = None
    data_lines = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith("#Fields:"):
            # Field definition, e.g. "#Fields: date time s-ip cs-method ...".
            # split() tolerates any run of whitespace between names.
            field_names = line.split()[1:]  # drop the "#Fields:" token itself
        elif not line.startswith("#"):
            data_lines.append(line)  # actual data row

    if not field_names:
        # No #Fields: header found - fall back to raw log-entry mode.
        # FIX: previously this fallback also emitted '#' directive lines and
        # unstripped text; directives are metadata, not log entries.
        st.warning("未檢測到 W3C Log 的 #Fields: 標頭,退回原始 Log 條目模式。")
        return [
            {"raw_log_entry": line.strip()}
            for line in lines
            if line.strip() and not line.strip().startswith("#")
        ]

    json_data = []

    # Fields coerced to int/float (underscore form, after '-' -> '_' mapping).
    numeric_fields = ['sc_status', 'time_taken', 'bytes', 'resp_len', 'req_size']

    for data_line in data_lines:
        # FIX: use split() (any whitespace run) instead of split(' ') so that
        # tab-separated or multi-space rows still align with field_names,
        # which were themselves extracted with split(). W3C values never
        # contain literal spaces (empty values are encoded as '-').
        values = data_line.split()

        # Row/field count mismatch: keep the raw line instead of guessing.
        if len(values) != len(field_names):
            json_data.append({"raw_log_entry": data_line})
            continue

        record = {}
        for key, value in zip(field_names, values):
            # Make W3C names Python-friendly: '-' -> '_'.
            key = key.strip().replace('-', '_')

            value = value.strip() if value else ""

            # Numeric coercion: int first, then float, else keep the string.
            if key in numeric_fields:
                try:
                    record[key] = int(value)
                except ValueError:
                    try:
                        record[key] = float(value)
                    except ValueError:
                        record[key] = value
            else:
                record[key] = value

        if record:
            json_data.append(record)
    return json_data
122
+
123
# === Core file converter (CSV/TXT -> JSON list) ===
def convert_csv_txt_to_json_list(file_content: bytes, file_type: str) -> List[Dict[str, Any]]:
    """
    Convert CSV or TXT file content (assumed CSV with a header row) into a
    list of JSON-style dicts. Handles non-W3C CSV/TXT files only.

    Args:
        file_content: Raw file bytes (UTF-8 encoded).
        file_type: Original file-type hint ('csv' or 'txt'); retained for the
            public signature, not needed by the parsing logic itself.

    Returns:
        One dict per CSV row, or ``{"raw_log_entry": <line>}`` per line when
        the content cannot be parsed as CSV (or yields no rows).
    """
    log_content = file_content.decode("utf-8").strip()
    if not log_content:
        return []
    string_io = io.StringIO(log_content)

    # Fields coerced to int/float (raw, hyphenated CSV header names).
    numeric_fields = ['sc-status', 'time-taken', 'bytes', 'resp-len', 'req-size']

    json_data = []
    # FIX: csv.DictReader is lazy - constructing it almost never raises;
    # parse errors surface while *iterating*. The old code wrapped only the
    # constructor, so the raw-log fallback below was unreachable. Wrap the
    # iteration instead so malformed CSV really falls back.
    try:
        reader = csv.DictReader(string_io)
        for row in reader:
            record = {}
            for key, value in row.items():
                if key is None: continue

                key = key.strip()
                value = value.strip() if value else ""

                # Numeric coercion: int first, then float, else keep string.
                if key in numeric_fields:
                    try:
                        record[key] = int(value)
                    except ValueError:
                        try:
                            record[key] = float(value)
                        except ValueError:
                            record[key] = value
                else:
                    record[key] = value

            if record:
                json_data.append(record)
    except Exception as e:
        st.warning(f"使用 csv.DictReader 失敗,嘗試將檔案視為每行一個原始 Log 條目: {e}")
        return [{"raw_log_entry": line.strip()} for line in log_content.splitlines() if line.strip()]

    # Empty result (e.g. header-only file or non-tabular text): fall back to
    # one raw log entry per line.
    if not json_data:
        string_io.seek(0)
        lines = string_io.readlines()
        return [{"raw_log_entry": line.strip()} for line in lines if line.strip()]

    return json_data
177
+
178
# === File-type dispatcher ===
def convert_uploaded_file_to_json_list(uploaded_file) -> List[Dict[str, Any]]:
    """Turn an uploaded file into a list of log dicts, dispatching on extension.

    Raises:
        ValueError: For unsupported extensions or unsupported JSON top-level shapes.
    """
    file_bytes = uploaded_file.getvalue()
    file_name_lower = uploaded_file.name.lower()

    # --- Case 1: JSON - load directly, then normalise dict/list shapes ---
    if file_name_lower.endswith('.json'):
        parsed_data = json.load(io.StringIO(file_bytes.decode("utf-8")))

        if isinstance(parsed_data, list):
            return parsed_data
        if isinstance(parsed_data, dict):
            # Try common wrapper keys first; otherwise treat the whole dict
            # as a single log entry.
            for wrapper_key in ('alerts', 'logs'):
                candidate = parsed_data.get(wrapper_key)
                if isinstance(candidate, list):
                    return candidate
            return [parsed_data]
        raise ValueError("JSON 檔案格式不支援 (非 List 或 Dict)。")

    # --- Cases 2-4: CSV / TXT / LOG ---
    if file_name_lower.endswith(('.csv', '.txt', '.log')):
        if file_name_lower.endswith('.log'):
            # .log files are routed through the W3C parser.
            log_content = file_bytes.decode("utf-8").strip()
            if not log_content:
                return []
            return parse_w3c_log(log_content)

        # CSV and TXT keep the csv.DictReader pipeline.
        file_type = 'csv' if file_name_lower.endswith('.csv') else 'txt'
        return convert_csv_txt_to_json_list(file_bytes, file_type)

    raise ValueError("不支援的檔案類型。")
219
+
220
# === Helper: extract the client IP address from a log record ===
def get_ip_from_log(log_entry: Dict[str, Any]) -> str:
    """Best-effort extraction of the client (or server) IP from a log dict.

    Checks W3C-style underscore keys first ('c-ip' is renamed to 'c_ip' by
    parse_w3c_log), then the hyphenated originals, because records produced
    by the CSV path (csv.DictReader) keep their raw header names.

    Returns:
        The IP as a stripped string, or "" when no IP field is present
        (e.g. raw_log_entry records), which callers treat as "no history".
    """
    # FIX: also accept hyphenated keys - CSV-sourced records are never
    # renamed, so 'c-ip'/'s-ip' previously fell through and the IP-based
    # windowing silently degraded to single-log analysis for CSV input.
    # Client IP is preferred over server IP.
    for key in ('c_ip', 'c-ip', 's_ip', 's-ip'):
        if key in log_entry:
            return str(log_entry[key]).strip()

    # Unparsed raw log entries: no reliable way to extract an IP here.
    return ""
234
+
235
# === Hugging Face: generate an analysis answer for a single log sequence ===
def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
    """Run one RAG-augmented chat completion over a formatted log sequence.

    Returns a ``(response_text, retrieved_context)`` tuple. On any failure
    the first element carries an error-marker string ("ERROR: Client Error",
    "Format Error", or "Model Error: ...") instead of raising, so the batch
    loop can keep processing remaining logs.
    """
    if client is None: return "ERROR: Client Error", ""
    context_text = ""

    # 1. RAG retrieval
    if vector_store:
        # Query with the whole sequence (rather than only the last log line)
        # as a balance between performance and accuracy.
        selected = faiss_cosine_search_all(vector_store, log_sequence_text, threshold)
        if selected:
            # Cap the retrieved chunks at 5.
            retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
            context_text = "\n".join(retrieved_contents)

    # 2. Build the instruction (prompt text is behavior - kept verbatim).
    rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ===
{context_text if context_text else 'No relevant reference context found.'}
=== END REFERENCE CONTEXT ===
ANALYSIS INSTRUCTION: {user_prompt}
Based on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""

    log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===
{log_sequence_text}
=== END LOG SEQUENCE ==="""

    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
    ]

    # 3. Call the LLM (non-streaming)
    try:
        response_stream = client.chat_completion(messages, max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, stream=False)
        if response_stream and response_stream.choices:
            return response_stream.choices[0].message.content.strip(), context_text
        else: return "Format Error", context_text
    except Exception as e: return f"Model Error: {str(e)}", context_text
273
 
274
+
275
+ # --- 初始化 Hugging Face LLM Client (保持不變) ---
276
  @st.cache_resource
277
  def load_inference_client(model_id):
278
  if not os.environ.get("HF_TOKEN"): return None
 
288
  if os.environ.get("HF_TOKEN"):
289
  with st.spinner(f"正在連線到 Inference Client: {MODEL_ID}..."):
290
  inference_client = load_inference_client(MODEL_ID)
291
+
292
  if inference_client is None and os.environ.get("HF_TOKEN"):
293
  st.warning("Hugging Face Inference Client 無法連線。")
294
+ elif not os.environ.get("HF_TOKEN"):
295
  st.error("請在環境變數中設定 HF_TOKEN。")
296
 
297
  # === Embedding 模型 (保持不變) ===
 
317
  else:
318
  stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
319
  text_content = stringio.read()
320
+
321
  if not text_content.strip(): return None, "File is empty"
322
+
323
  events = [line for line in text_content.splitlines() if line.strip()]
324
  docs = [Document(page_content=e) for e in events]
325
  if not docs: return None, "No documents created"
326
+
327
  embeddings = embedding_model.embed_documents([d.page_content for d in docs])
328
  embeddings_np = np.array(embeddings).astype("float32")
329
  faiss.normalize_L2(embeddings_np)
330
+
331
  dimension = embeddings_np.shape[1]
332
  index = faiss.IndexFlatIP(dimension)
333
  index.add(embeddings_np)
334
+
335
  doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
336
  docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
337
  index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
338
+
339
  vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
340
  return vector_store, f"{len(docs)} chunks created."
341
  except Exception as e:
 
357
  selected.sort(key=lambda x: x[1], reverse=True)
358
  return selected
359
 
360
+
361
# --- Sidebar settings ---
with st.sidebar:
    st.header("⚙️ 設定")

    if not os.environ.get("HF_TOKEN"):
        st.error("環境變數 **HF_TOKEN** 未設定。請設定後重新啟動應用程式。")
    st.info(f"LLM 模型:**{MODEL_ID}** (Hugging Face Inference API)")
    st.warning("⚠️ **注意**: 該模型使用 Inference API 呼叫,請確保您的 HF Token 具有存取權限。")

    st.divider()
    st.subheader("📂 檔案上傳")

    # === 1. Batch-analysis file (multiple formats supported) ===
    batch_uploaded_file = st.file_uploader(
        "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
        type=['json', 'csv', 'txt', 'log'],  # 'log' accepted for W3C logs
        key="batch_uploader",
        help="支援 JSON (Array), CSV (含標題), TXT/LOG (視為 W3C 或一般 Log)"
    )

    # === 2. RAG knowledge-base file ===
    rag_uploaded_file = st.file_uploader(
        "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
        type=['txt', 'py', 'log', 'csv', 'md', 'pdf'],
        key="rag_uploader"
    )
    st.divider()

    # FIX: this subheader string was mojibake ("批量分析���令"); restored to
    # the intended "批量分析指令" (batch-analysis instruction), matching the
    # pre-change version of this file.
    st.subheader("💡 批量分析指令")
    analysis_prompt = st.text_area(
        "針對每個 Log/Alert 執行的指令",
        value="You are a security expert in charge of analyzing alerts related to Web Application Attacks and Brute Force & Reconnaissance. Respond with a clear, structured analysis using the following mandatory sections: \n\n- Priority: Provide the overall priority level. (Answer High-risk detected!, Medium-risk detected!, or Low-risk detected! only) \n- Explanation: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, explain the potential impact and why this specific alert requires attention. If not, **omit the explanation section**. \n- Action Plan: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, What should be the immediate steps to address this specific alert? If not, **omit the action plan section**. \n\nStrictly use the information in the provided Log.",
        height=200
    )
    st.markdown(f"此指令將對檔案中的**每一個 Log 條目**執行一次獨立分析,並提供**最多 {WINDOW_SIZE} 條**相同 IP 的歷史 Log 作為上下文。")

    # The batch-run button only sets a flag; the heavy work happens in the
    # main-area batch-analysis section on the next script run.
    if batch_uploaded_file:
        if st.button("🚀 執行批量分析"):
            if not os.environ.get("HF_TOKEN"):
                st.error("無法執行,環境變數 **HF_TOKEN** 未設定。")
            else:
                st.session_state.execute_batch_analysis = True
    else:
        st.info("請上傳 Log 檔案以啟用批量分析按鈕。")

    st.divider()
    st.subheader("🔍 RAG 檢索設定")
    similarity_threshold = st.slider("📐 Cosine Similarity 門檻", 0.0, 1.0, 0.4, 0.01)

    st.divider()
    st.subheader("模型參數")
    system_prompt = st.text_area("System Prompt", value="You are a Senior Security Analyst, named Ernest. You provide expert, authoritative, and concise advice on Information Security. Your analysis must be based strictly on the provided context.", height=100)
    max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
    temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
    top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)

    st.divider()
    if st.button("🗑️ 清除所有紀錄"):
        for key in list(st.session_state.keys()):
            # NOTE(review): session_state should never actually hold
            # 'HF_TOKEN' (it is an env var) - this guard looks redundant;
            # kept for safety, confirm before removing.
            if key not in ['HF_TOKEN']:
                del st.session_state[key]
        st.rerun()
423
 
424
  # =======================================================================
425
+ # === 檔案處理區塊 (RAG 檔案) - 保持不變 ===
426
  if rag_uploaded_file:
427
  file_key = f"vs_{rag_uploaded_file.name}_{rag_uploaded_file.size}"
428
  if st.session_state.rag_current_file_key != file_key or 'vector_store' not in st.session_state:
 
438
  del st.session_state.rag_current_file_key
439
  st.info("RAG 檔案已移除,已清除相關知識庫。")
440
 
441
+ # === 檔案處理區塊 (批量分析檔案 - 保持不變 ) ===
 
442
  if batch_uploaded_file:
443
  batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
444
+
445
  if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
446
  try:
447
+ # 使用新的統一解析函式
448
+ parsed_data = convert_uploaded_file_to_json_list(batch_uploaded_file)
449
+
450
+ if not parsed_data:
451
+ raise ValueError(f"{batch_uploaded_file.name} 檔案載入失敗或內容為空。")
452
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  # 儲存處理後的數據
454
  st.session_state.json_data_for_batch = parsed_data
455
  st.session_state.batch_current_file_key = batch_file_key
456
+ st.toast(f"檔案已解析並轉換為 {len(parsed_data)} 個 Log 條目。", icon="✅")
457
+
458
  except Exception as e:
459
  st.error(f"檔案解析錯誤: {e}")
460
  if 'json_data_for_batch' in st.session_state:
461
  del st.session_state.json_data_for_batch
 
462
  elif 'json_data_for_batch' in st.session_state:
463
  del st.session_state.json_data_for_batch
464
  del st.session_state.batch_current_file_key
 
466
  del st.session_state.batch_results
467
  st.info("批量分析檔案已移除,已清除相關數據。")
468
 
469
# === Batch-analysis execution (log-window selection is IP-based) ===
if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
    # One-shot trigger: clear the flag immediately so a Streamlit rerun
    # does not restart the batch.
    st.session_state.execute_batch_analysis = False
    start_time = time.time()  # wall-clock start for the completion summary
    st.session_state.batch_results = []

    if inference_client is None:
        st.error("Client 未連線,無法執行。")
    else:
        # Parsed log entries cached by the upload handler above.
        logs_list = st.session_state.json_data_for_batch
479
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
if logs_list:
    # Optional RAG knowledge base; None when no RAG file was uploaded.
    vs = st.session_state.get("vector_store", None)

    # Pre-render every log entry as a pretty-printed JSON string; these
    # exact texts are embedded into each analysis sequence below.
    formatted_logs = [
        json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list
    ]

    analysis_sequences = []
488
+ # ** vvvv 替換此處邏輯為基於 IP 的篩選 vvvv **
489
  for i in range(len(formatted_logs)):
490
+ # 1. 取得當前 Log (目標 Log) IP
491
+ target_log = logs_list[i]
492
+ target_ip = get_ip_from_log(target_log)
493
+
494
+ # 2. 確定回溯的 Log 範圍 (只看前 N 條 Log,不包含當前 Log)
495
+ start_index = max(0, i - len(logs_list) + 1) # 回溯到最開始
496
+
497
  sequence_text = []
498
+
499
+ if not target_ip:
500
+ # 如果沒有 IP,則只分析當前 Log
501
+ # 這裡將 WINDOW_SIZE 設為 1,只包含自己
502
+ sequence_text.append(f"--- Log Index {i} (No IP found){' <<< TARGET LOG TO ANALYZE'} ---\n{formatted_logs[i]}")
503
+
504
+ else:
505
+ # 3. 篩選出與目標 IP 相同的 Log 條目
506
+ current_window_indices = []
507
+ # 倒序查找,確保最近的 Log 優先被選中
508
+
509
+ # 範圍是 i-1 倒數到 0 (含)
510
+ for j in range(i - 1, -1, -1):
511
+ prior_log = logs_list[j]
512
+ prior_ip = get_ip_from_log(prior_log)
513
+
514
+ if prior_ip == target_ip:
515
+ current_window_indices.append(j)
516
+ # 如果已經累積了 N-1 條,則停止
517
+ if len(current_window_indices) >= WINDOW_SIZE - 1:
518
+ break
519
+
520
+ # 4. 將選取的 Log 索引 (倒序的) 加上當前 Log 的索引 (i)
521
+ # 確保它們按照時間順序排列 (升序)
522
+ sorted_indices = sorted(current_window_indices) + [i]
523
+
524
+ # 5. 構建序列文本
525
+ for index in sorted_indices:
526
+ is_target = " <<< TARGET LOG TO ANALYZE" if index == i else ""
527
+ # 計算 Log 相對位置
528
+ relative_pos = i - index
529
+ # 使用 sorted_indices 的長度作為序列長度,而不是 WINDOW_SIZE
530
+ sequence_text.append(f"--- Log Index {index} (IP:{target_ip}, {relative_pos} prior logs){is_target} ---\n{formatted_logs[index]}")
531
+
532
+ # 6. 構建分析序列
533
  analysis_sequences.append({
534
  "sequence_text": "\n\n".join(sequence_text),
535
  "target_log_id": i + 1,
536
  "original_log_entry": logs_list[i]
537
  })
538
+ # ** ^^^^ 替換結束 ^^^^ **
539
+
540
  total_sequences = len(analysis_sequences)
541
+ st.header(f"⚡ 批量分析執行中 (基於 IP 篩選, Max $N={WINDOW_SIZE}$)...")
542
  progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
543
  results_container = st.container()
544
  full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
545
+
546
  for i, seq_data in enumerate(analysis_sequences):
547
  log_id = seq_data["target_log_id"]
548
  progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
549
+
550
  try:
551
  response, retrieved_ctx = generate_rag_response_hf_for_log(
552
  client=inference_client,
 
568
  "context": retrieved_ctx
569
  }
570
  st.session_state.batch_results.append(item)
571
+
572
  with results_container:
573
  st.subheader(f"Log/Alert #{item['log_id']}")
574
  with st.expander("序列內容 (JSON Format)"):
575
+ st.code(item["sequence_analyzed"], language='json')
576
+
577
+ is_high = any(x in response.lower() for x in ['high-risk detected'])
578
  if is_high: st.error(item['analysis_result'])
579
  else: st.info(item['analysis_result'])
580
  if item['context']:
581
  with st.expander("參考 RAG 片段"): st.code(item['context'])
582
  st.markdown("---")
583
+
584
  log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
585
  full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\nResult:\n{item['analysis_result']}\n")
586
+
587
  except Exception as e:
588
  st.error(f"Error Log {log_id}: {e}")
589
+
590
  end_time = time.time()
591
  progress_bar.empty()
592
  st.success(f"完成!耗時 {end_time - start_time:.2f} 秒。")
593
  else:
594
  st.error("無法提取有效 Log,請檢查檔案格式。")
595
 
596
# === History view: show past batch results and collect high-risk items ===
if st.session_state.get("batch_results") and not st.session_state.execute_batch_analysis:
    st.header("⚡ 歷史分析結果")

    # Structured rows for the CSV export.
    high_risk_data = []
    # Raw result items classified as high risk.
    high_risk_items = []

    for item in st.session_state.batch_results:
        # Case-insensitive match on the model verdict. NOTE: the live view
        # matches on 'high-risk detected' (no trailing '!'); the history
        # previously required 'high-risk detected!' and could silently drop
        # items flagged during the run — use the same marker as the live view.
        is_high_risk = 'high-risk detected' in item['analysis_result'].lower()
        if not is_high_risk:
            continue

        high_risk_items.append(item)

        # --- Prepare CSV row: serialize the log to a single-line JSON ---
        # --- string and flatten the analysis text into one cell.      ---
        log_content_str = json.dumps(item["log_content"], ensure_ascii=False)
        analysis_result_clean = item['analysis_result'].replace('\n', ' | ')
        high_risk_data.append({
            "Log_ID": item['log_id'],
            "Risk_Level": "HIGH_RISK",
            "Log_Content": log_content_str,
            "AI_Analysis_Result": analysis_result_clean
        })
625
+
626
+ # 顯示 High-Risk 報告的下載按鈕 (改為 CSV 邏輯)
627
+ if high_risk_items:
628
+ st.success(f"✅ 檢測到 {len(high_risk_items)} ���高風險 Log/Alert。")
629
+
630
+ # --- 構建 CSV 內容 ---
631
+ csv_output = io.StringIO()
632
+
633
+ # 寫入 CSV 標題
634
+ csv_output.write("Log_ID,Risk_Level,Log_Content,AI_Analysis_Result\n")
635
+
636
+ # 轉義函數 (確保複雜欄位在 CSV 中不被破壞)
637
+ def escape_csv(value):
638
+ # 替換內容中的所有雙引號為兩個雙引號,然後用雙引號包圍
639
+ return f'"{str(value).replace('"', '""')}"'
640
+
641
+ for row in high_risk_data:
642
+ line = ",".join([
643
+ str(row["Log_ID"]),
644
+ row["Risk_Level"],
645
+ escape_csv(row["Log_Content"]),
646
+ escape_csv(row["AI_Analysis_Result"])
647
+ ]) + "\n"
648
+ csv_output.write(line)
649
+
650
+ csv_content = csv_output.getvalue()
651
+
652
+ # 顯示 CSV 報告的下載按鈕
653
+ st.download_button(
654
+ "📥 下載 **高風險** 分析報告 (.csv)",
655
+ csv_content,
656
+ "high_risk_report.csv",
657
+ "text/csv"
658
+ )
659
+ else:
660
+ st.info("👍 未檢測到任何標註為 High-risk detected 的 Log/Alert。")