Spaces:

ss900371tw
/

FoundationSec8BSimilarityDB1

Running

File size: 19,963 Bytes

c5a7cf0
b5ae04d
e528918
 
23f2ebe
c5a7cf0
ba7b7f3
 
582b8f8
4e5ef71
6524ee2
c5a7cf0
418b5c8
50c0658
418b5c8
50c0658
418b5c8
c5a7cf0
50c0658
a554b5e
c8bbb2d
7877959
49cb911
582b8f8
c5a7cf0
 
e528918
 
 
 
f1e77b2
e528918
50c0658
418b5c8
c5a7cf0
f45f091
 
 
 
c5a7cf0
f45f091
 
c5a7cf0
 
d775ed8
 
c5a7cf0
d775ed8
c5a7cf0
 
8e4173a
50c0658
e528918
a554b5e
e528918
 
 
418b5c8
 
 
 
 
ef94d18
e528918
50c0658
c5a7cf0
 
 
 
 
 
 
23f2ebe
c5a7cf0
 
23f2ebe
ef94d18
23f2ebe
 
 
c5a7cf0
23f2ebe
50c0658
c5a7cf0
23f2ebe
ef94d18
2d7fa4f
23f2ebe
 
c5a7cf0
ef94d18
c5a7cf0
23f2ebe
418b5c8
 
c5a7cf0
 
23f2ebe
c5a7cf0
6524ee2
e528918
a554b5e
c5a7cf0
ef94d18
c5a7cf0
50c0658
c5a7cf0
50c0658
 
23f2ebe
ef94d18
582b8f8
23f2ebe
 
c5a7cf0
e528918
 
c5a7cf0
e528918
418b5c8
c5a7cf0
e528918
c5a7cf0
418b5c8
 
e528918
418b5c8
50c0658
ef94d18
418b5c8
 
 
 
 
c5a7cf0
 
 
50c0658
c5a7cf0
a554b5e
 
c5a7cf0
 
 
ef94d18
ba7b7f3
a554b5e
 
50c0658
ba7b7f3
a554b5e
e528918
 
 
 
 
a554b5e
c5a7cf0
e528918
 
a554b5e
c5a7cf0
 
 
7a2e2d7
1bd9676
c5a7cf0
 
7877959
 
c5a7cf0
 
7877959
c5a7cf0
7877959
c5a7cf0
7877959
 
 
c5a7cf0
 
582b8f8
e528918
a554b5e
 
ba7b7f3
 
25689f2
 
ba7b7f3
25689f2
ba7b7f3
 
7877959
25689f2
 
 
 
7877959
ba7b7f3
 
c5a7cf0
418b5c8
c5a7cf0
50c0658
a554b5e
50c0658
 
c5a7cf0
50c0658
c5a7cf0
 
418b5c8
23f2ebe
418b5c8
 
 
 
23f2ebe
c5a7cf0
418b5c8
c5a7cf0
 
 
418b5c8
c5a7cf0
 
23f2ebe
 
 
 
 
 
 
 
 
c5a7cf0
23f2ebe
 
 
 
418b5c8
c5a7cf0
 
 
 
582b8f8
c5a7cf0
23f2ebe
c5a7cf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23f2ebe
c5a7cf0
23f2ebe
 
c5a7cf0
23f2ebe
 
c5a7cf0
23f2ebe
ef94d18
c5a7cf0
925eb3e
c5a7cf0
23f2ebe
ef94d18
c5a7cf0
23f2ebe
ef94d18
418b5c8
c5a7cf0
 
 
 
418b5c8
c5a7cf0
 
 
 
 
 
 
 
 
 
 
50c0658
 
ef94d18
c5a7cf0
 
23f2ebe
c5a7cf0
 
 
 
ef94d18
c5a7cf0
 
 
 
 
 
 
 
 
 
 
 
 
 
ef94d18
c5a7cf0
 
 
 
 
50c0658
c5a7cf0
 
 
50c0658
c5a7cf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef94d18
2d7fa4f
c5a7cf0
 
 
 
 
ef94d18
c5a7cf0
 
 
 
 
 
 
 
 
 
 
23f2ebe
c5a7cf0
5aa5596
c5a7cf0
 
23f2ebe
 
c5a7cf0


import streamlit as st
import os
import io
import json
import csv  # <--- 新增：用於處理 CSV
import numpy as np
import faiss
import uuid
import time
import sys

# === HuggingFace 模型相關套件 (替換為 InferenceClient) ===
try:
    from huggingface_hub import InferenceClient
except ImportError:
    st.error("請檢查是否安裝了所有 Hugging Face 相關依賴：pip install huggingface-hub")

# === LangChain/RAG 相關套件 (保持不變) ===
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.docstore.in_memory import InMemoryDocstore

# 嘗試匯入 pypdftry
try:
    import pypdf
except ImportError:
    pypdf = None

# --- 頁面設定 ---
st.set_page_config(page_title="Cybersecurity AI Assistant (Hugging Face RAG & Batch Analysis)", page_icon="🛡️", layout="wide")
st.title("🛡️ Meta-Llama-3-8B-Instruct with FAISS RAG & Batch Analysis (Inference Client)")
st.markdown("已啟用：**IndexFlatIP** + **L2 正規化** + **Hugging Face Inference Client (API)**。支援 JSON/CSV/TXT 執行批量分析。")

if 'execute_batch_analysis' not in st.session_state:
    st.session_state.execute_batch_analysis = False
if 'batch_results' not in st.session_state:
    st.session_state.batch_results = None 
if 'rag_current_file_key' not in st.session_state:
    st.session_state.rag_current_file_key = None
if 'batch_current_file_key' not in st.session_state: # 修改變數名稱以反映多格式
    st.session_state.batch_current_file_key = None
if 'vector_store' not in st.session_state:
    st.session_state.vector_store = None
if 'json_data_for_batch' not in st.session_state: # 變數名稱保留，但內容可能是轉換後的 dict
    st.session_state.json_data_for_batch = None

# 設定模型 ID
MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct" 
WINDOW_SIZE = 8

# --- 側邊欄設定 ---
with st.sidebar:
    st.header("⚙️ 設定")
    
    if not os.environ.get("HF_TOKEN"):
         st.error("環境變數 **HF_TOKEN** 未設定。請設定後重新啟動應用程式。")
    
    st.info(f"LLM 模型：**{MODEL_ID}** (Hugging Face Inference API)")
    st.warning("⚠️ **注意**: 該模型使用 Inference API 呼叫，請確保您的 HF Token 具有存取權限。")
    
    st.divider()
    st.subheader("📂 檔案上傳")
    
    # === 1. 批量分析檔案 (修改處：支援多種格式) ===
    batch_uploaded_file = st.file_uploader(
        "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
        type=['json', 'csv', 'txt'], # <--- 修改：新增 csv 和 txt
        key="batch_uploader",
        help="支援 JSON (Array), CSV (含標題), TXT (每行一條 Log)"
    )

    # === 2. RAG 知識庫檔案 ===
    rag_uploaded_file = st.file_uploader(
        "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
        type=['txt', 'py', 'log', 'csv', 'md', 'pdf'],
        key="rag_uploader"
    )

    st.divider()
    
    st.subheader("💡 批量分析指令")
    analysis_prompt = st.text_area(
        "針對每個 Log/Alert 執行的指令",
        value="You are a security expert in charge of analyzing alerts related to Web Application Attacks and Brute Force & Reconnaissance. Respond with a clear, structured analysis using the following mandatory sections: \n\n- Priority: Provide the overall priority level. (Answer High risk, Medium risk, or Low risk only) \n- Explanation: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, explain the potential impact and why this specific alert requires attention. If not, **omit the explanation section**. \n- Action Plan: If this alert is highly related to Web Application Attacks and Brute Force & Reconnaissance, What should be the immediate steps to address this specific alert? If not, **omit the action plan section**. \n\nStrictly use the information in the provided Log.",
        height=200
    )
    st.markdown("此指令將對檔案中的**每一個 Log 條目**執行一次獨立分析。")
    
    if batch_uploaded_file: 
        if st.button("🚀 執行批量分析"):
            if not os.environ.get("HF_TOKEN"):
                st.error("無法執行，環境變數 **HF_TOKEN** 未設定。")
            else: 
                st.session_state.execute_batch_analysis = True
    else:
        st.info("請上傳 Log 檔案以啟用批量分析按鈕。")
        
    st.divider()
    st.subheader("🔍 RAG 檢索設定")
    similarity_threshold = st.slider("📐 Cosine Similarity 門檻", 0.0, 1.0, 0.4, 0.01)
    
    st.divider()
    st.subheader("模型參數")
    system_prompt = st.text_area("System Prompt", value="You are a Senior Security Analyst, named Ernest. You provide expert, authoritative, and concise advice on Information Security. Your analysis must be based strictly on the provided context.", height=100)
    max_output_tokens = st.slider("Max Output Tokens", 128, 4096, 2048, 128)
    temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1) 
    top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
    
    st.divider()
    if st.button("🗑️ 清除所有紀錄"):
        for key in list(st.session_state.keys()):
             del st.session_state[key]
        st.rerun()

# --- 初始化 Hugging Face LLM Client ---
@st.cache_resource
def load_inference_client(model_id):
    if not os.environ.get("HF_TOKEN"): return None
    try:
        client = InferenceClient(model_id, token=os.environ.get("HF_TOKEN"))
        st.success(f"Hugging Face Inference Client **{model_id}** 載入成功。")
        return client
    except Exception as e:
        st.error(f"Hugging Face Inference Client 載入失敗: {e}")
        return None

inference_client = None
if os.environ.get("HF_TOKEN"):
    with st.spinner(f"正在連線到 Inference Client: {MODEL_ID}..."):
        inference_client = load_inference_client(MODEL_ID)
if inference_client is None and os.environ.get("HF_TOKEN"):
    st.warning("Hugging Face Inference Client 無法連線。")
elif not os.environ.get("HF_TOKEN"): 
    st.error("請在環境變數中設定 HF_TOKEN。")

# === Embedding 模型 (保持不變) ===
@st.cache_resource
def load_embedding_model():
    model_kwargs = {'device': 'cpu', 'trust_remote_code': True}
    encode_kwargs = {'normalize_embeddings': False}
    return HuggingFaceEmbeddings(model_name="BAAI/bge-large-zh-v1.5", model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

with st.spinner("正在載入 Embedding 模型..."):
    embedding_model = load_embedding_model()

# === 建立向量庫 / Search 函數 (保持不變) ===
def process_file_to_faiss(uploaded_file):
    text_content = ""
    try:
        if uploaded_file.type == "application/pdf":
            if pypdf:
                pdf_reader = pypdf.PdfReader(uploaded_file)
                for page in pdf_reader.pages:
                    text_content += page.extract_text() + "\n"
            else: return None, "PDF library missing"
        else:
            stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
            text_content = stringio.read()
            
        if not text_content.strip(): return None, "File is empty"
        
        events = [line for line in text_content.splitlines() if line.strip()]
        docs = [Document(page_content=e) for e in events]
        if not docs: return None, "No documents created"
        
        embeddings = embedding_model.embed_documents([d.page_content for d in docs])
        embeddings_np = np.array(embeddings).astype("float32")
        faiss.normalize_L2(embeddings_np)
        
        dimension = embeddings_np.shape[1]
        index = faiss.IndexFlatIP(dimension)
        index.add(embeddings_np)
        
        doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
        docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
        index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
        
        vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
        return vector_store, f"{len(docs)} chunks created."
    except Exception as e:
        return None, f"Error: {str(e)}"

def faiss_cosine_search_all(vector_store, query, threshold):
    q_emb = embedding_model.embed_query(query)
    q_emb = np.array([q_emb]).astype("float32")
    faiss.normalize_L2(q_emb)
    index = vector_store.index
    D, I = index.search(q_emb, k=index.ntotal)
    selected = []
    for score, idx in zip(D[0], I[0]):
        if idx == -1: continue
        if score >= threshold:
            doc_id = vector_store.index_to_docstore_id[idx]
            doc = vector_store.docstore.search(doc_id)
            selected.append((doc, score))
    selected.sort(key=lambda x: x[1], reverse=True)
    return selected

# === Hugging Face 生成單一 Log 分析回答 (保持不變) ===
def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_prompt, sys_prompt, vector_store, threshold, max_output_tokens, temperature, top_p):
    if client is None: return "ERROR: Client Error", ""
    context_text = ""
    if vector_store:
        selected = faiss_cosine_search_all(vector_store, log_sequence_text, threshold)
        if selected:
            retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
            context_text = "\n".join(retrieved_contents)
            
    rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ==={context_text if context_text else 'No relevant reference context found.'}=== END REFERENCE CONTEXT ===\nANALYSIS INSTRUCTION: {user_prompt}\nBased on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""
    log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===\n{log_sequence_text}\n=== END LOG SEQUENCE ==="""
    
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
    ]
    try:
        response_stream = client.chat_completion(messages, max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, stream=False)
        if response_stream and response_stream.choices:
            return response_stream.choices[0].message.content.strip(), context_text
        else: return "Format Error", context_text
    except Exception as e: return f"Model Error: {str(e)}", context_text

# =======================================================================
# === 檔案處理區塊 (RAG 檔案) ===
if rag_uploaded_file:
    file_key = f"vs_{rag_uploaded_file.name}_{rag_uploaded_file.size}"
    if st.session_state.rag_current_file_key != file_key or 'vector_store' not in st.session_state:
        with st.spinner(f"正在建立 RAG 參考知識庫 ({rag_uploaded_file.name})..."):
            vs, msg = process_file_to_faiss(rag_uploaded_file)
            if vs:
                st.session_state.vector_store = vs
                st.session_state.rag_current_file_key = file_key
                st.toast(f"RAG 參考知識庫已更新！{msg}", icon="✅")
            else: st.error(msg)
elif 'vector_store' in st.session_state:
    del st.session_state.vector_store
    del st.session_state.rag_current_file_key
    st.info("RAG 檔案已移除，已清除相關知識庫。")

# === 檔案處理區塊 (批量分析檔案 - 重大修改處) ===
# 支援 JSON, CSV, TXT 並統一轉換為 list of dicts
if batch_uploaded_file:
    batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
    
    if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
        try:
            stringio = io.StringIO(batch_uploaded_file.getvalue().decode("utf-8"))
            parsed_data = None
            
            # --- Case 1: JSON ---
            if batch_uploaded_file.name.lower().endswith('.json'):
                parsed_data = json.load(stringio)
                st.toast("JSON 檔案載入成功", icon="📄")

            # --- Case 2: CSV ---
            elif batch_uploaded_file.name.lower().endswith('.csv'):
                # 使用 DictReader 將 CSV 轉為 List of Dicts
                reader = csv.DictReader(stringio)
                parsed_data = list(reader)
                st.toast("CSV 檔案已轉換為 JSON 結構", icon="📊")

            # --- Case 3: TXT ---
            else: # 預設為 TXT
                # 將每一行包裝成一個 JSON 物件: {"raw_content": "line text"}
                lines = stringio.readlines()
                parsed_data = [{"raw_log_entry": line.strip()} for line in lines if line.strip()]
                st.toast("TXT 檔案已轉換為 JSON 結構", icon="📝")
            
            # 儲存處理後的數據
            st.session_state.json_data_for_batch = parsed_data
            st.session_state.batch_current_file_key = batch_file_key
            
        except Exception as e:
            st.error(f"檔案解析錯誤: {e}")
            if 'json_data_for_batch' in st.session_state:
                del st.session_state.json_data_for_batch

elif 'json_data_for_batch' in st.session_state:
    del st.session_state.json_data_for_batch
    del st.session_state.batch_current_file_key
    if "batch_results" in st.session_state:
        del st.session_state.batch_results
    st.info("批量分析檔案已移除，已清除相關數據。")

# === 執行批量分析邏輯 ===
if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.session_state:
    st.session_state.execute_batch_analysis = False
    start_time = time.time()
    st.session_state.batch_results = []
    
    if inference_client is None:
        st.error("Client 未連線，無法執行。")
    else:
        data_to_process = st.session_state.json_data_for_batch
        logs_list = []
        
        # 處理不同的 JSON 結構 (Dict vs List)
        if isinstance(data_to_process, list):
            logs_list = data_to_process
        elif isinstance(data_to_process, dict):
            # 嘗試尋找常見的 key
            if 'alerts' in data_to_process and isinstance(data_to_process['alerts'], list):
                logs_list = data_to_process['alerts']
            elif 'logs' in data_to_process and isinstance(data_to_process['logs'], list):
                logs_list = data_to_process['logs']
            else:
                logs_list = [data_to_process]
        else:
            logs_list = [data_to_process]
            
        if logs_list:
            vs = st.session_state.get("vector_store", None)
            
            # --- 關鍵：在這裡做 JSON String 的轉換 ---
            # 無論來源是 CSV(Dict) 還是 TXT(Dict)，都在這裡用 json.dumps 轉成字串
            # 這保證了 Prompt 收到的永遠是 JSON 格式的文字
            formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
            
            analysis_sequences = []
            for i in range(len(formatted_logs)):
                start_index = max(0, i - WINDOW_SIZE + 1)
                end_index = i + 1
                current_window = formatted_logs[start_index:end_index]
                sequence_text = []
                for j, log_str in enumerate(current_window):
                    is_target = " <<< TARGET LOG TO ANALYZE" if j == len(current_window) - 1 else ""
                    sequence_text.append(f"--- Log Index {i - len(current_window) + j + 1} ({len(current_window)-j} prior logs){is_target} ---\n{log_str}")
                analysis_sequences.append({
                    "sequence_text": "\n\n".join(sequence_text),
                    "target_log_id": i + 1,
                    "original_log_entry": logs_list[i]
                })
            
            total_sequences = len(analysis_sequences)
            st.header(f"⚡ 批量分析執行中 (平移視窗 $N={WINDOW_SIZE}$)...")
            progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
            results_container = st.container()
            full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
            
            for i, seq_data in enumerate(analysis_sequences):
                log_id = seq_data["target_log_id"]
                progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
                
                try:
                    response, retrieved_ctx = generate_rag_response_hf_for_log(
                        client=inference_client,
                        model_id=MODEL_ID,
                        log_sequence_text=seq_data["sequence_text"],
                        user_prompt=analysis_prompt,
                        sys_prompt=system_prompt,
                        vector_store=vs,
                        threshold=similarity_threshold,
                        max_output_tokens=max_output_tokens,
                        temperature=temperature,
                        top_p=top_p
                    )
                    item = {
                        "log_id": log_id,
                        "log_content": seq_data["original_log_entry"],
                        "sequence_analyzed": seq_data["sequence_text"],
                        "analysis_result": response,
                        "context": retrieved_ctx
                    }
                    st.session_state.batch_results.append(item)
                    
                    with results_container:
                        st.subheader(f"Log/Alert #{item['log_id']}")
                        with st.expander("序列內容 (JSON Format)"):
                            st.code(item["sequence_analyzed"], language='json') # 這裡顯示的會是 JSON 格式
                        
                        is_high = any(x in response.lower() for x in ['high risk'])
                        if is_high: st.error(item['analysis_result'])
                        else: st.info(item['analysis_result'])
                        if item['context']:
                            with st.expander("參考 RAG 片段"): st.code(item['context'])
                        st.markdown("---")
                        
                        log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
                        full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\nResult:\n{item['analysis_result']}\n")
                
                except Exception as e:
                    st.error(f"Error Log {log_id}: {e}")
            
            end_time = time.time()
            progress_bar.empty()
            st.success(f"完成！耗時 {end_time - start_time:.2f} 秒。")
        else:
            st.error("無法提取有效 Log，請檢查檔案格式。")

# === 顯示結果 (歷史紀錄) ===
if st.session_state.get("batch_results") and not st.session_state.execute_batch_analysis:
    st.header("⚡ 歷史分析結果")
    full_report_chunks = ["## Report\n\n"]
    for item in st.session_state.batch_results:
        log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
        full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\n{item['analysis_result']}\n")
    st.download_button("📥 下載完整報告 (.md)", "\n".join(full_report_chunks), "report.md", "text/markdown")