Spaces:

ss900371tw
/

HM

Sleeping

App Files Files Community

ss900371tw committed on Dec 15, 2025

Commit

84006ad

verified ·

1 Parent(s): bcb1b3e

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +76 -74

src/streamlit_app.py CHANGED Viewed

@@ -32,7 +32,7 @@ except ImportError:
 # --- 頁面設定 ---
 st.set_page_config(page_title="Cybersecurity AI Assistant (Hugging Face RAG & Batch Analysis)", page_icon="🛡️", layout="wide")
 st.title("🛡️ Meta-Llama-3-8B-Instruct with FAISS RAG & Batch Analysis (Inference Client)")
-st.markdown("已啟用：**IndexFlatIP** + **L2 正規化** + **Hugging Face Inference Client (API)**。支援 JSON/CSV/TXT 執行批量分析。")
 # --- Streamlit Session State 初始化 (保持不變) ---
 if 'execute_batch_analysis' not in st.session_state:
@@ -49,22 +49,19 @@ if 'json_data_for_batch' not in st.session_state:
     st.session_state.json_data_for_batch = None
 # 設定模型 ID
-MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 WINDOW_SIZE = 8
-# === 核心檔案轉換函式 (CSV/TXT -> JSON List) ===
 def convert_csv_txt_to_json_list(file_content: bytes, file_type: str) -> List[Dict[str, Any]]:
     """
-    將 CSV 或 TXT 檔案內容 (假定為 CSV 格式，含標頭) 轉換為 JSON 物件列表。
     Args:
         file_content (bytes): 上傳檔案的二進位內容。
-        file_type (str): 檔案類型 ('csv' 或 'txt')。
     Returns:
         List[Dict[str, Any]]: 轉換後的 JSON 物件列表。
     """
-    # 這裡我們使用 decode("utf-8") 來處理內容
     log_content = file_content.decode("utf-8").strip()
     if not log_content:
         return []
@@ -72,49 +69,54 @@ def convert_csv_txt_to_json_list(file_content: bytes, file_type: str) -> List[Di
     # 使用 StringIO 讓 csv 模組可以處理字串內容
     string_io = io.StringIO(log_content)
-    # 使用 csv.DictReader 自動將第一行視為 Key
-    # 如果 TXT 內容是 JSON，這裡會出錯，但在 Streamlit 上傳區塊會處理
-    reader = csv.DictReader(string_io)
     json_data = []
-    # 定義需要轉換為數字的欄位名稱 (可根據您的需求擴充)
-    numeric_fields = ['sc-status', 'time-taken', 'bytes', 'resp-len', 'req-size']
-    for row in reader:
-        record = {}
-        for key, value in row.items():
-            key = key.strip() # 清理 key
-            value = value.strip() # 清理 value
-            # 處理數字轉換
-            if key in numeric_fields:
-                try:
-                    # 嘗試轉換為整數 (如果有小數點，int() 會拋出錯誤)
-                    record[key] = int(value)
-                except ValueError:
                     try:
-                        # 嘗試轉換為浮點數
-                        record[key] = float(value)
                     except ValueError:
-                        # 轉換失敗則保持為字串
-                        record[key] = value
-            else:
-                record[key] = value
-        if record: # 確保不是空紀錄
-            json_data.append(record)
-    if not json_data and file_type == 'txt':
-        # 如果 csv.DictReader 失敗，嘗試將 TXT 視為每行一個原始 Log
-        # (作為備用選項，類似您原始的 'raw_log_entry' 邏輯，但更簡化)
         string_io.seek(0)
         lines = string_io.readlines()
-        if len(lines) > 0 and len(lines) <= 2: # 判斷是否為小文件，更可能不是標準 CSV/JSON
              return [{"raw_log_entry": line.strip()} for line in lines if line.strip()]
     return json_data
 def convert_uploaded_file_to_json_list(uploaded_file) -> List[Dict[str, Any]]:
     """根據檔案類型，將上傳的檔案內容轉換為 Log JSON 列表。"""
     file_bytes = uploaded_file.getvalue()
@@ -140,10 +142,11 @@ def convert_uploaded_file_to_json_list(uploaded_file) -> List[Dict[str, Any]]:
         else:
             raise ValueError("JSON 檔案格式不支援 (非 List 或 Dict)。")
-    # --- Case 2 & 3: CSV/TXT ---
-    elif file_name_lower.endswith(('.csv', '.txt')):
-        # 假設 CSV 和 TXT 都是逗號分隔格式 (含標頭)
-        return convert_csv_txt_to_json_list(file_bytes, 'csv' if file_name_lower.endswith('.csv') else 'txt')
     else:
         raise ValueError("不支援的檔案類型。")
@@ -163,15 +166,15 @@ with st.sidebar:
     # === 1. 批量分析檔案 (支援多種格式) ===
     batch_uploaded_file = st.file_uploader(
         "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
-        type=['json', 'csv', 'txt'],
         key="batch_uploader",
-        help="支援 JSON (Array), CSV (含標題), TXT (視為 CSV 或每行一個 Log)"
     )
     # === 2. RAG 知識庫檔案 ===
     rag_uploaded_file = st.file_uploader(
         "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
-        type=['txt', 'py', 'log', 'csv', 'md', 'pdf'],
         key="rag_uploader"
     )
     st.divider()
@@ -210,7 +213,7 @@ with st.sidebar:
     if st.button("🗑️ 清除所有紀錄"):
         for key in list(st.session_state.keys()):
             # 排除 HF_TOKEN，如果它在 session_state 中
-            if key != 'HF_TOKEN':
                 del st.session_state[key]
         st.rerun()
@@ -238,7 +241,6 @@ elif not os.environ.get("HF_TOKEN"):
 # === Embedding 模型 (保持不變) ===
 @st.cache_resource
 def load_embedding_model():
     model_kwargs = {'device': 'cpu', 'trust_remote_code': True}
     encode_kwargs = {'normalize_embeddings': False}
@@ -260,25 +262,25 @@ def process_file_to_faiss(uploaded_file):
         else:
             stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
             text_content = stringio.read()
         if not text_content.strip(): return None, "File is empty"
         events = [line for line in text_content.splitlines() if line.strip()]
         docs = [Document(page_content=e) for e in events]
         if not docs: return None, "No documents created"
         embeddings = embedding_model.embed_documents([d.page_content for d in docs])
         embeddings_np = np.array(embeddings).astype("float32")
         faiss.normalize_L2(embeddings_np)
         dimension = embeddings_np.shape[1]
         index = faiss.IndexFlatIP(dimension)
         index.add(embeddings_np)
         doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
         docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
         index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
         vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
         return vector_store, f"{len(docs)} chunks created."
     except Exception as e:
@@ -309,15 +311,15 @@ def generate_rag_response_hf_for_log(client, model_id, log_sequence_text, user_p
         if selected:
             retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
             context_text = "\n".join(retrieved_contents)
     rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ==={context_text if context_text else 'No relevant reference context found.'}=== END REFERENCE CONTEXT ===\nANALYSIS INSTRUCTION: {user_prompt}\nBased on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""
     log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===\n{log_sequence_text}\n=== END LOG SEQUENCE ==="""
     messages = [
         {"role": "system", "content": sys_prompt},
         {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
     ]
     try:
         response_stream = client.chat_completion(messages, max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, stream=False)
         if response_stream and response_stream.choices:
@@ -345,7 +347,7 @@ elif 'vector_store' in st.session_state:
 # === 檔案處理區塊 (批量分析檔案 - **優化重寫** ) ===
 if batch_uploaded_file:
     batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
     if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
         try:
             # 使用新的統一解析函式
@@ -358,7 +360,7 @@ if batch_uploaded_file:
             st.session_state.json_data_for_batch = parsed_data
             st.session_state.batch_current_file_key = batch_file_key
             st.toast(f"檔案已解析並轉換為 {len(parsed_data)} 個 Log 條目。", icon="✅")
         except Exception as e:
             st.error(f"檔案解析錯誤: {e}")
             if 'json_data_for_batch' in st.session_state:
@@ -375,20 +377,20 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
     st.session_state.execute_batch_analysis = False
     start_time = time.time()
     st.session_state.batch_results = []
     if inference_client is None:
         st.error("Client 未連線，無法執行。")
     else:
         # 在新的邏輯中，st.session_state.json_data_for_batch 已經是一個 List[Dict]
         logs_list = st.session_state.json_data_for_batch
         if logs_list:
             vs = st.session_state.get("vector_store", None)
             # --- 關鍵：在這裡做 JSON String 的轉換 ---
             # 確保 Prompt 收到的永遠是 JSON 格式的文字
             formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
             analysis_sequences = []
             for i in range(len(formatted_logs)):
                 start_index = max(0, i - WINDOW_SIZE + 1)
@@ -403,17 +405,17 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
                     "target_log_id": i + 1,
                     "original_log_entry": logs_list[i]
                 })
             total_sequences = len(analysis_sequences)
             st.header(f"⚡ 批量分析執行中 (平移視窗 $N={WINDOW_SIZE}$)...")
             progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
             results_container = st.container()
             full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
             for i, seq_data in enumerate(analysis_sequences):
                 log_id = seq_data["target_log_id"]
                 progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
                 try:
                     response, retrieved_ctx = generate_rag_response_hf_for_log(
                         client=inference_client,
@@ -435,26 +437,26 @@ if st.session_state.execute_batch_analysis and 'json_data_for_batch' in st.sessi
                         "context": retrieved_ctx
                     }
                     st.session_state.batch_results.append(item)
                     with results_container:
                         st.subheader(f"Log/Alert #{item['log_id']}")
                         with st.expander("序列內容 (JSON Format)"):
                             # 這裡顯示的會是 JSON 格式的 Log Sequence
                             st.code(item["sequence_analyzed"], language='json')
                         is_high = any(x in response.lower() for x in ['high-risk detected'])
                         if is_high: st.error(item['analysis_result'])
                         else: st.info(item['analysis_result'])
                         if item['context']:
                             with st.expander("參考 RAG 片段"): st.code(item['context'])
                         st.markdown("---")
                         log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
                         full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\nResult:\n{item['analysis_result']}\n")
                 except Exception as e:
                     st.error(f"Error Log {log_id}: {e}")
             end_time = time.time()
             progress_bar.empty()
             st.success(f"完成！耗時 {end_time - start_time:.2f} 秒。")

 # --- 頁面設定 ---
 st.set_page_config(page_title="Cybersecurity AI Assistant (Hugging Face RAG & Batch Analysis)", page_icon="🛡️", layout="wide")
 st.title("🛡️ Meta-Llama-3-8B-Instruct with FAISS RAG & Batch Analysis (Inference Client)")
+st.markdown("已啟用：**IndexFlatIP** + **L2 正規化** + **Hugging Face Inference Client (API)**。支援 JSON/CSV/TXT/**LOG** 執行批量分析。") # <--- 這裡更新了說明
 # --- Streamlit Session State 初始化 (保持不變) ---
 if 'execute_batch_analysis' not in st.session_state:
     st.session_state.json_data_for_batch = None
 # 設定模型 ID
+MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 WINDOW_SIZE = 8
+# === 核心檔案轉換函式 (CSV/TXT/LOG -> JSON List) ===
 def convert_csv_txt_to_json_list(file_content: bytes, file_type: str) -> List[Dict[str, Any]]:
     """Convert an uploaded CSV/TXT/LOG file into a list of JSON-style records.

     The content is read as comma-separated text whose first line is the
     header. Values of a few known numeric columns are converted to ``int``
     (or ``float`` when ``int`` fails); every other value stays a stripped
     string. Columns beyond the header are dropped and missing trailing
     columns become ``""``.

     The content is returned as one ``{"raw_log_entry": line}`` dict per
     non-blank line instead when any of the following holds:

     * ``file_type`` is not ``'csv'`` and the first line has a single field
       (no comma), i.e. it looks like a plain log rather than a CSV header;
     * the ``csv`` module raises ``csv.Error`` while parsing (a Streamlit
       warning is shown in that case);
     * CSV parsing produced no records (for example a header-only file).

     Args:
         file_content (bytes): Raw bytes of the uploaded file, UTF-8 encoded;
             a leading byte-order mark is tolerated.
         file_type (str): File kind, ``'csv'``, ``'txt'`` or ``'log'``.

     Returns:
         List[Dict[str, Any]]: One dict per parsed row or raw line; an empty
         list when the file contains only whitespace.

     Raises:
         UnicodeDecodeError: If ``file_content`` is not valid UTF-8.
     """
     # "utf-8-sig" removes a leading BOM, which would otherwise stay glued to
     # the first header name; BOM-less UTF-8 decodes exactly as with "utf-8".
     log_content = file_content.decode("utf-8-sig").strip()
     if not log_content:
         return []

     def raw_entries() -> List[Dict[str, Any]]:
         # One record per non-blank line, split on "\n" like StringIO.readlines().
         return [
             {"raw_log_entry": line.strip()}
             for line in log_content.split("\n")
             if line.strip()
         ]

     # Columns whose values should be numeric when they parse as numbers.
     numeric_fields = ['sc-status', 'time-taken', 'bytes', 'resp-len', 'req-size']
     # StringIO lets the csv module consume the decoded text.
     reader = csv.DictReader(io.StringIO(log_content))
     json_data = []
     try:
         # DictReader is lazy: the header is only read here and the rows in the
         # loop below, so this is where csv.Error can actually surface.
         header = reader.fieldnames
         if file_type.lower() != 'csv' and header is not None and len(header) < 2:
             # A TXT/LOG file without any delimiter in its first line is a
             # plain log. Parsing it as CSV would swallow the first event as
             # the "header" and key every other line by it.
             return raw_entries()
         for row in reader:
             record = {}
             for key, value in row.items():
                 if key is None:
                     # Extra fields beyond the header have no name; skip them.
                     continue
                 key = key.strip()
                 value = value.strip() if value else ""
                 if key in numeric_fields:
                     # Prefer int, then float; keep the string if both fail.
                     for cast in (int, float):
                         try:
                             value = cast(value)
                             break
                         except ValueError:
                             continue
                 record[key] = value
             if record:
                 json_data.append(record)
     except csv.Error as e:
         st.warning(f"使用 csv.DictReader 失敗，嘗試將檔案視為每行一個原始 Log 條目: {e}")
         return raw_entries()
     # No CSV records (e.g. header-only file): fall back to raw lines.
     return json_data or raw_entries()
 def convert_uploaded_file_to_json_list(uploaded_file) -> List[Dict[str, Any]]:
     """根據檔案類型，將上傳的檔案內容轉換為 Log JSON 列表。"""
     file_bytes = uploaded_file.getvalue()
         else:
             raise ValueError("JSON 檔案格式不支援 (非 List 或 Dict)。")
+    # --- Case 2, 3, & 4: CSV/TXT/LOG --- <--- 這裡增加了 .log
+    elif file_name_lower.endswith(('.csv', '.txt', '.log')):
+        # 假設 CSV/TXT/LOG 都是逗號分隔格式 (含標頭) 或每行一個原始 Log
+        file_type = 'csv' if file_name_lower.endswith('.csv') else ('log' if file_name_lower.endswith('.log') else 'txt')
+        return convert_csv_txt_to_json_list(file_bytes, file_type)
     else:
         raise ValueError("不支援的檔案類型。")
     # === 1. 批量分析檔案 (支援多種格式) ===
     batch_uploaded_file = st.file_uploader(
         "1️⃣ 上傳 **Log/Alert 檔案** (用於批量分析)",
+        type=['json', 'csv', 'txt', 'log'], # <--- 這裡增加了 'log'
         key="batch_uploader",
+        help="支援 JSON (Array), CSV (含標題), TXT/LOG (視為 CSV 或每行一個 Log)"
     )
     # === 2. RAG 知識庫檔案 ===
     rag_uploaded_file = st.file_uploader(
         "2️⃣ 上傳 **RAG 參考知識庫** (Logs/PDF/Code 等)",
+        type=['txt', 'py', 'log', 'csv', 'md', 'pdf'], # <--- 這裡增加了 'log'
         key="rag_uploader"
     )
     st.divider()
     if st.button("🗑️ 清除所有紀錄"):
         for key in list(st.session_state.keys()):
             # 排除 HF_TOKEN，如果它在 session_state 中
+            if key != 'HF_TOKEN':
                 del st.session_state[key]
         st.rerun()
 # === Embedding 模型 (保持不變) ===
 @st.cache_resource
 def load_embedding_model():
     model_kwargs = {'device': 'cpu', 'trust_remote_code': True}
     encode_kwargs = {'normalize_embeddings': False}
         else:
             stringio = io.StringIO(uploaded_file.getvalue().decode("utf-8"))
             text_content = stringio.read()
         if not text_content.strip(): return None, "File is empty"
         events = [line for line in text_content.splitlines() if line.strip()]
         docs = [Document(page_content=e) for e in events]
         if not docs: return None, "No documents created"
         embeddings = embedding_model.embed_documents([d.page_content for d in docs])
         embeddings_np = np.array(embeddings).astype("float32")
         faiss.normalize_L2(embeddings_np)
         dimension = embeddings_np.shape[1]
         index = faiss.IndexFlatIP(dimension)
         index.add(embeddings_np)
         doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
         docstore = InMemoryDocstore({_id: doc for _id, doc in zip(doc_ids, docs)})
         index_to_docstore_id = {i: _id for i, _id in enumerate(doc_ids)}
         vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, distance_strategy=DistanceStrategy.COSINE)
         return vector_store, f"{len(docs)} chunks created."
     except Exception as e:
         if selected:
             retrieved_contents = [f"--- Reference Chunk (sim={score:.3f}) ---\n{doc.page_content}" for i, (doc, score) in enumerate(selected[:5])]
             context_text = "\n".join(retrieved_contents)
     rag_instruction = f"""=== RETRIEVED REFERENCE CONTEXT (Cosine ≥ {threshold}) ==={context_text if context_text else 'No relevant reference context found.'}=== END REFERENCE CONTEXT ===\nANALYSIS INSTRUCTION: {user_prompt}\nBased on the provided LOG SEQUENCE and REFERENCE CONTEXT, you must analyze the **entire sequence** to detect any continuous attack chains or evolving threats."""
     log_content_section = f"""=== CURRENT LOG SEQUENCE TO ANALYZE (Window Size: {WINDOW_SIZE}) ===\n{log_sequence_text}\n=== END LOG SEQUENCE ==="""
     messages = [
         {"role": "system", "content": sys_prompt},
         {"role": "user", "content": f"{rag_instruction}\n\n{log_content_section}"}
     ]
     try:
         response_stream = client.chat_completion(messages, max_tokens=max_output_tokens, temperature=temperature, top_p=top_p, stream=False)
         if response_stream and response_stream.choices:
 # === 檔案處理區塊 (批量分析檔案 - **優化重寫** ) ===
 if batch_uploaded_file:
     batch_file_key = f"batch_{batch_uploaded_file.name}_{batch_uploaded_file.size}"
     if st.session_state.batch_current_file_key != batch_file_key or 'json_data_for_batch' not in st.session_state:
         try:
             # 使用新的統一解析函式
             st.session_state.json_data_for_batch = parsed_data
             st.session_state.batch_current_file_key = batch_file_key
             st.toast(f"檔案已解析並轉換為 {len(parsed_data)} 個 Log 條目。", icon="✅")
         except Exception as e:
             st.error(f"檔案解析錯誤: {e}")
             if 'json_data_for_batch' in st.session_state:
     st.session_state.execute_batch_analysis = False
     start_time = time.time()
     st.session_state.batch_results = []
     if inference_client is None:
         st.error("Client 未連線，無法執行。")
     else:
         # 在新的邏輯中，st.session_state.json_data_for_batch 已經是一個 List[Dict]
         logs_list = st.session_state.json_data_for_batch
         if logs_list:
             vs = st.session_state.get("vector_store", None)
             # --- 關鍵：在這裡做 JSON String 的轉換 ---
             # 確保 Prompt 收到的永遠是 JSON 格式的文字
             formatted_logs = [json.dumps(log, indent=2, ensure_ascii=False) for log in logs_list]
             analysis_sequences = []
             for i in range(len(formatted_logs)):
                 start_index = max(0, i - WINDOW_SIZE + 1)
                     "target_log_id": i + 1,
                     "original_log_entry": logs_list[i]
                 })
             total_sequences = len(analysis_sequences)
             st.header(f"⚡ 批量分析執行中 (平移視窗 $N={WINDOW_SIZE}$)...")
             progress_bar = st.progress(0, text=f"準備處理 {total_sequences} 個序列...")
             results_container = st.container()
             full_report_chunks = ["## Cybersecurity Batch Analysis Report\n\n"]
             for i, seq_data in enumerate(analysis_sequences):
                 log_id = seq_data["target_log_id"]
                 progress_bar.progress((i + 1) / total_sequences, text=f"Processing {i + 1}/{total_sequences} (Log #{log_id})...")
                 try:
                     response, retrieved_ctx = generate_rag_response_hf_for_log(
                         client=inference_client,
                         "context": retrieved_ctx
                     }
                     st.session_state.batch_results.append(item)
                     with results_container:
                         st.subheader(f"Log/Alert #{item['log_id']}")
                         with st.expander("序列內容 (JSON Format)"):
                             # 這裡顯示的會是 JSON 格式的 Log Sequence
                             st.code(item["sequence_analyzed"], language='json')
                         is_high = any(x in response.lower() for x in ['high-risk detected'])
                         if is_high: st.error(item['analysis_result'])
                         else: st.info(item['analysis_result'])
                         if item['context']:
                             with st.expander("參考 RAG 片段"): st.code(item['context'])
                         st.markdown("---")
                         log_content_str_for_report = json.dumps(item["log_content"], indent=2, ensure_ascii=False).replace("`", "\\`")
                         full_report_chunks.append(f"---\n\n### Log #{item['log_id']}\n```json\n{log_content_str_for_report}\n```\nResult:\n{item['analysis_result']}\n")
                 except Exception as e:
                     st.error(f"Error Log {log_id}: {e}")
             end_time = time.time()
             progress_bar.empty()
             st.success(f"完成！耗時 {end_time - start_time:.2f} 秒。")