PiloBi
/

Qwen2.5-VL-3B-Instruct-SFT

Safetensors

Model card Files Files and versions

xet

Community

PiloBi committed on Dec 26, 2025

Commit

cc427cc

verified ·

1 Parent(s): 67c5a2e

Upload RL_infer.py with huggingface_hub

Browse files

Files changed (1) hide show

RL_infer.py +880 -0

RL_infer.py ADDED Viewed

	@@ -0,0 +1,880 @@

+# import json
+# import re
+# from PIL import Image
+# from transformers import AutoModelForVision2Seq, AutoProcessor
+# import torch
+# import os
+# from qwen_vl_utils import process_vision_info
+# # --- 1. 辅助函数 ---
+# def load_test_data(file_path):
+#     """
+#     根据文件扩展名自动加载 .json 或 .jsonl 文件。
+#     对于 .json 文件，尝试不同的常见键来查找样本列表。
+#     """
+#     _, ext = os.path.splitext(file_path)
+#     ext = ext.lower()
+#     test_samples = []
+#     if ext == '.jsonl':
+#         print(f"Loading data from JSON Lines file: {file_path}")
+#         with open(file_path, 'r', encoding='utf-8') as f:
+#             for i, line in enumerate(f):
+#                 line = line.strip()
+#                 if not line:
+#                     continue
+#                 try:
+#                     test_samples.append(json.loads(line))
+#                 except json.JSONDecodeError as e:
+#                     print(f"Warning: Skipping invalid JSON line {i+1} in {file_path}: {e}")
+#     elif ext == '.json':
+#         print(f"Loading data from JSON file: {file_path}")
+#         try:
+#             with open(file_path, 'r', encoding='utf-8') as f:
+#                 data = json.load(f)
+#             if isinstance(data, list):
+#                 print("  Detected JSON array format.")
+#                 test_samples = data
+#             elif isinstance(data, dict):
+#                 print("  Detected JSON object format. Searching for samples...")
+#                 possible_keys = ['data', 'samples', 'instances', 'items', 'conversations', 'messages']
+#                 found = False
+#                 for key in possible_keys:
+#                     if key in data and isinstance(data[key], list) and len(data[key]) > 0:
+#                         # 简单检查列表第一个元素是否像样本 (dict with 'messages')
+#                         first_item = data[key][0]
+#                         if isinstance(first_item, dict) and 'messages' in first_item:
+#                             print(f"  Found samples under key '{key}'.")
+#                             test_samples = data[key]
+#                             found = True
+#                             break
+#                 if not found:
+#                     # 启发式：查找第一个值是列表且列表元素是字典的键
+#                     for key, value in data.items():
+#                         if isinstance(value, list) and len(value) > 0 and isinstance(value[0], dict) and 'messages' in value[0]:
+#                             print(f"  Found samples under key '{key}' (heuristic).")
+#                             test_samples = value
+#                             found = True
+#                             break
+#                 if not found:
+#                     print(f"  Error: Could not find a list of samples in the JSON object. Keys found: {list(data.keys())}")
+#             else:
+#                 print(f"  Error: Unexpected JSON structure. Root element type: {type(data)}")
+#         except json.JSONDecodeError as e:
+#             print(f"Error: Failed to decode JSON from {file_path}: {e}")
+#         except Exception as e:
+#             print(f"Error: An unexpected error occurred while loading {file_path}: {e}")
+#     else:
+#         print(f"Error: Unsupported file extension '{ext}'. Please provide a .json or .jsonl file.")
+#     print(f"Loaded {len(test_samples)} samples.")
+#     # 验证加载的样本结构
+#     if test_samples and isinstance(test_samples, list):
+#         print("Performing basic structure validation on loaded samples...")
+#         sample_count_to_check = min(5, len(test_samples))
+#         for i in range(sample_count_to_check):
+#             s = test_samples[i]
+#             if not isinstance(s, dict):
+#                 print(f"  CRITICAL: Sample {i} is not a dict. Type: {type(s)}")
+#                 # 可以选择在这里中断或清理数据
+#                 # return []
+#             elif 'messages' not in s or 'images' not in s:
+#                 print(f"  WARNING: Sample {i} might be missing 'messages' or 'images' keys. Found keys: {list(s.keys())}")
+#             else:
+#                 if not isinstance(s['messages'], list):
+#                     print(f"  CRITICAL: Sample {i} 'messages' is not a list. Type: {type(s['messages'])}")
+#                 if not isinstance(s['images'], list):
+#                     print(f"  CRITICAL: Sample {i} 'images' is not a list. Type: {type(s['images'])}")
+#         print("Structure validation complete.")
+#     elif test_samples:
+#          print(f"CRITICAL: Expected test_samples to be a list after loading, got {type(test_samples)}.")
+#          test_samples = [] # Reset to empty list on critical error
+#     return test_samples
+# def extract_components(text):
+#     """从模型输出或标签中提取 <think>, <control>, <answer> 组件"""
+#     think_match = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
+#     control_match = re.search(r'<control>(.*?)</control>', text)
+#     answer_match = re.search(r'<answer>(.*?)</answer>', text)
+#     return {
+#         'think': think_match.group(1).strip() if think_match else "",
+#         'control': control_match.group(1).strip() if control_match else "",
+#         'answer': answer_match.group(1).strip() if answer_match else ""
+#     }
+# def calculate_accuracy(pred_list, true_list):
+#     """计算准确率 (用于 <answer>)"""
+#     if len(pred_list) != len(true_list):
+#          raise ValueError("Prediction and truth lists must have the same length for accuracy calculation.")
+#     if not pred_list:
+#         return 0.0
+#     correct = sum(p == t for p, t in zip(pred_list, true_list))
+#     return correct / len(pred_list)
+# # --- 2. 主评估逻辑 ---
+# def main():
+#     # --- 配置 ---
+#     # 替换为您的模型路径
+#     model_path = "/data/LLM-SFT/SFT_Output/multiclsTask/Qwen2.5-VL-3B-Instruct/SFT/checkpoint-894"
+#     # 替换为您的测试集路径 (.json 或 .jsonl)
+#     test_data_path = "/data/LLM-SFT/datasets/driver_behavior_datasets/output_test.jsonl"
+#     output_file = model_path + "/eval/detailed_model_evaluation_results.json"
+#     # --- 加载模型和处理器 ---
+#     print("Loading model and processor...")
+#     try:
+#         processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+#         model = AutoModelForVision2Seq.from_pretrained(
+#             model_path,
+#             trust_remote_code=True,
+#             torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+#         )
+#         model.eval()
+#         if torch.cuda.is_available():
+#             model = model.to('cuda')
+#             print("Model loaded on GPU.")
+#         else:
+#             print("Model loaded on CPU.")
+#     except Exception as e:
+#         print(f"Failed to load model/processor: {e}")
+#         return # Exit if model loading fails
+#     # --- 加载测试数据 ---
+#     try:
+#         test_samples = load_test_data(test_data_path)
+#         # print('test_samples',test_samples)
+#         if not test_samples:
+#             print("No samples loaded. Exiting.")
+#             return
+#     except Exception as e:
+#         print(f"Failed to load test data: {e}")
+#         return
+#     # --- 推理和收集结果 (带解析) ---
+#     results = []
+#     pred_answers = []
+#     true_answers = []
+#     pred_controls = [] # 存储 control 字符串用于后续分析
+#     true_controls = []
+#     print("Starting inference...")
+#     for i, sample in enumerate(test_samples):
+#         try:
+#             conversation = sample['messages']
+#             image_path = sample['images'][0]
+#             if not os.path.exists(image_path):
+#                  print(f"Warning: Image not found: {image_path}. Skipping sample {i}.")
+#                  # 为保持列表对齐，添加空占位符
+#                  pred_answers.append("")
+#                  true_answers.append(extract_components(conversation[-1]['content'])['answer'])
+#                  pred_controls.append("")
+#                  true_controls.append(extract_components(conversation[-1]['content'])['control'])
+#                  continue
+#             image = Image.open(image_path).convert('RGB')
+#             # 准备输入
+#             # 注意：Qwen VL 系列通常期望 messages 是一个列表，其中包含 role 和 content
+#             # processor 会处理 <image> token 和图像的对齐
+#             # print('conversation[:-1]',conversation[:-1])
+#             texts = processor.apply_chat_template(conversation[:-1], tokenize=False, add_generation_prompt=True)
+#             image_inputs, video_inputs = process_vision_info(conversation[:-1])
+#             inputs = processor(
+#                 text=texts,
+#                 images=image_inputs,
+#                 videos=video_inputs,
+#                 padding=True,
+#                 return_tensors="pt",
+#             )
+#             if torch.cuda.is_available():
+#                 inputs = {k: v.to('cuda') for k, v in inputs.items()}
+#             # 生成
+#             with torch.no_grad():
+#                 generated_ids = model.generate(**inputs,
+#                                             max_new_tokens=200,
+#                                             # num_beams=5,
+#                                             do_sample=True,
+#                                             top_p=0.75,
+#                                             top_k=50,
+#                                             temperature=0.2
+#                                             # repetition_penalty=1.2,
+#                                             # early_stopping=True
+#                                             )
+#             generated_ids_trimmed = [
+#                 out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+#             ]
+#             output_text = processor.batch_decode(
+#                 generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+#             )
+#             # 提取标签
+#             ground_truth = conversation[-1]['content']
+#             # 解析模型输出和标签
+#             pred_components = extract_components(output_text)
+#             true_components = extract_components(ground_truth)
+#             # 收集用于评估的数据
+#             pred_answers.append(pred_components['answer'])
+#             true_answers.append(true_components['answer'])
+#             pred_controls.append(pred_components['control'])
+#             true_controls.append(true_components['control'])
+#             # 打印部分样本进行观察
+#             if i < 3: # 打印前3个样本
+#                 print(f"\n--- Sample {i+1} ---")
+#                 print(f"  Image: {image_path}")
+#                 print(f"  Input Text: {input_text}")
+#                 print(f"  Full Decoded Output: {decoded_output}")
+#                 print(f"  Processed Output Text: {output_text}")
+#                 print(f"  Parsed Prediction: {pred_components}")
+#                 print(f"  Ground Truth: {ground_truth}")
+#                 print(f"  Parsed Truth: {true_components}")
+#             # 存储详细结果
+#             results.append({
+#                 "sample_id": i,
+#                 "image_path": image_path,
+#                 "input_text": input_text,
+#                 "model_output_raw": decoded_output,
+#                 "model_output_processed": output_text,
+#                 "parsed_prediction": pred_components,
+#                 "ground_truth_raw": ground_truth,
+#                 "parsed_truth": true_components
+#             })
+#         except Exception as e:
+#             print(f"Error processing sample {i}: {e}")
+#             # 错误样本也计入评估列表，但标记为空或错误
+#             pred_answers.append("ERROR")
+#             true_answers.append(extract_components(conversation[-1]['content'])['answer'] if 'conversation' in locals() else "")
+#             pred_controls.append("ERROR")
+#             true_controls.append(extract_components(conversation[-1]['content'])['control'] if 'conversation' in locals() else "")
+#             results.append({
+#                 "sample_id": i,
+#                 "image_path": image_path if 'image_path' in locals() else "N/A",
+#                 "input_text": conversation[-2]['content'] if 'conversation' in locals() else "N/A",
+#                 "model_output_raw": f"ERROR: {e}",
+#                 "model_output_processed": f"ERROR: {e}",
+#                 "parsed_prediction": {"think": "", "control": "", "answer": "ERROR"},
+#                 "ground_truth_raw": conversation[-1]['content'] if 'conversation' in locals() else "N/A",
+#                 "parsed_truth": extract_components(conversation[-1]['content']) if 'conversation' in locals() else {"think": "", "control": "", "answer": ""}
+#             })
+#     # --- 保存详细结果 ---
+#     with open(output_file, 'w', encoding='utf-8') as f:
+#         json.dump(results, f, indent=2, ensure_ascii=False)
+#     print(f"\nDetailed results saved to {output_file}")
+#     # --- 深入定量评估 ---
+#     print(f"\n--- Quantitative Evaluation ---")
+#     total_samples = len(test_samples)
+#     successful_samples = len([r for r in results if not r['model_output_raw'].startswith("ERROR")])
+#     print(f"Total samples: {total_samples}, Successfully processed: {successful_samples}")
+#     if successful_samples == 0:
+#         print("No samples were processed successfully. Skipping quantitative evaluation.")
+#         return
+#     # a. <answer> 标签准确率 (仅计算成功处理的样本)
+#     # 过滤掉错误样本
+#     filtered_pred_answers = [p for p in pred_answers if p != "ERROR"]
+#     filtered_true_answers = [t for p, t in zip(pred_answers, true_answers) if p != "ERROR"]
+#     if filtered_pred_answers:
+#         answer_accuracy = calculate_accuracy(filtered_pred_answers, filtered_true_answers)
+#         print(f"<answer> Tag Accuracy (on successful samples): {answer_accuracy:.4f} ({sum(p==t for p,t in zip(filtered_pred_answers, filtered_true_answers))}/{len(filtered_true_answers)})")
+#     else:
+#         print("No valid <answer> predictions to evaluate.")
+#         answer_accuracy = 0.0
+#     # b. <control> 指令分析
+#     filtered_pred_controls = [c for p, c in zip(pred_answers, pred_controls) if p != "ERROR"]
+#     filtered_true_controls = [t for p, t in zip(pred_answers, true_controls) if p != "ERROR"]
+#     if filtered_pred_controls:
+#         control_non_empty_pred = [c != "" for c in filtered_pred_controls]
+#         control_non_empty_true = [c != "" for c in filtered_true_controls]
+#         control_existence_acc = calculate_accuracy(control_non_empty_pred, control_non_empty_true)
+#         print(f"<control> Tag Presence Accuracy (on successful samples): {control_existence_acc:.4f}")
+#     else:
+#         print("No valid <control> predictions to evaluate.")
+#         control_existence_acc = 0.0
+#     # c. 分类别 <answer> 准确率
+#     if filtered_true_answers:
+#         unique_labels = sorted(list(set(filtered_true_answers + filtered_pred_answers)))
+#         print("\nPer-class <answer> accuracy:")
+#         class_acc = {}
+#         for label in unique_labels:
+#             tp = sum(1 for p, t in zip(filtered_pred_answers, filtered_true_answers) if p == label and t == label)
+#             total_true = sum(1 for t in filtered_true_answers if t == label)
+#             class_acc[label] = tp / total_true if total_true > 0 else 0.0
+#             print(f"  Accuracy for '{label}': {class_acc[label]:.4f} ({tp}/{total_true if total_true > 0 else 'N/A'})")
+#     # d. (可选) 文本相似度评估 (需要安装 nltk 或 rouge-score)
+#     # 示例使用 ROUGE (需要 pip install rouge)
+#     from rouge import Rouge
+#     rouge = Rouge()
+#     avg_rouge_scores = {'rouge-1': 0.0, 'rouge-2': 0.0, 'rouge-l': 0.0}
+#     valid_samples_for_rouge = 0
+#     for res in results:
+#         if not res['model_output_raw'].startswith("ERROR") and res['parsed_truth']['think'] and res['parsed_prediction']['think']:
+#             try:
+#                 scores = rouge.get_scores(res['parsed_prediction']['think'], res['parsed_truth']['think'])
+#                 for metric in avg_rouge_scores:
+#                     avg_rouge_scores[metric] += scores[0][metric]['f']
+#                 valid_samples_for_rouge += 1
+#             except Exception as e:
+#                 print(f"ROUGE calculation error for sample {res['sample_id']}: {e}")
+#     if valid_samples_for_rouge > 0:
+#         for metric in avg_rouge_scores:
+#             avg_rouge_scores[metric] /= valid_samples_for_rouge
+#         print(f"\nAverage ROUGE Scores (on <think> tags, {valid_samples_for_rouge} valid samples):")
+#         for metric, score in avg_rouge_scores.items():
+#             print(f"  {metric.upper()}: {score:.4f}")
+#     else:
+#         print("\nNo valid samples for ROUGE calculation on <think> tags.")
+#     # --- 7. 错误案例分析 ---
+#     print(f"\n--- Error Analysis ---")
+#     error_count = sum(1 for r in results if r['model_output_raw'].startswith("ERROR"))
+#     if error_count > 0:
+#         print(f"Number of samples with processing errors: {error_count}")
+#         # 可以在这里打印错误详情
+#     else:
+#         print("No processing errors detected during inference.")
+#     print("Samples where <answer> prediction was incorrect (excluding errors):")
+#     incorrect_count = 0
+#     for res in results:
+#         # 只分析成功处理且预测错误的样本
+#         if not res['model_output_raw'].startswith("ERROR") and \
+#            res['parsed_prediction']['answer'] != res['parsed_truth']['answer']:
+#             incorrect_count += 1
+#             if incorrect_count <= 5: # 只打印前5个错误案例
+#                 print(f"  Sample ID: {res['sample_id']}")
+#                 print(f"    Image: {res['image_path']}")
+#                 print(f"    Input: {res['input_text']}")
+#                 print(f"    Predicted Answer: '{res['parsed_prediction']['answer']}'")
+#                 print(f"    True Answer:      '{res['parsed_truth']['answer']}'")
+#                 print(f"    Predicted Control: '{res['parsed_prediction']['control']}'")
+#                 print(f"    True Control:      '{res['parsed_truth']['control']}'")
+#                 # print(f"    Predicted Think: '{res['parsed_prediction']['think']}'") # 可选
+#                 # print(f"    True Think:      '{res['parsed_truth']['think']}'") # 可选
+#                 print("-" * 20)
+#     if incorrect_count > 5:
+#         print(f"... and {incorrect_count - 5} more incorrect predictions.")
+#     elif incorrect_count == 0:
+#         print("  All successful predictions matched the ground truth <answer>.")
+#     # --- 8. 总结 ---
+#     print(f"\n--- Summary ---")
+#     print(f"Total samples processed: {total_samples}")
+#     print(f"Successfully processed samples: {successful_samples}")
+#     if filtered_pred_answers:
+#         print(f"<answer> Accuracy (successful samples): {answer_accuracy:.4f}")
+#     if filtered_pred_controls:
+#         print(f"<control> Presence Accuracy (successful samples): {control_existence_acc:.4f}")
+#     print("Per-class accuracies calculated above (if applicable).")
+#     print("Detailed results are available in the output file.")
+# if __name__ == "__main__":
+#     main()
+import json
+import os
+from typing import Dict, List, Any, Tuple
+import re
+from collections import defaultdict, Counter
+import ast
def load_data(file_path: str) -> List[Dict]:
    """
    Load samples from a JSON or JSON Lines file.

    The format is chosen from the file extension (case-insensitive):
    ``.json`` is decoded as one JSON document and returned as-is, ``.jsonl``
    is decoded one JSON value per non-blank line.

    For any other extension the content decides: the whole text is first
    tried as a single JSON document (a list is returned unchanged, any other
    value is wrapped in a one-element list); if that fails, the text is
    decoded line by line as JSON Lines. An empty or whitespace-only file
    yields an empty list.

    Raises:
        json.JSONDecodeError: if the content cannot be decoded in the
            selected format.
        OSError: if the file cannot be opened.
    """
    lowered = file_path.lower()
    with open(file_path, 'r', encoding='utf-8') as f:
        if lowered.endswith('.json'):
            return json.load(f)
        if lowered.endswith('.jsonl'):
            return _parse_json_lines(f)
        content = f.read()

    if not content.strip():
        return []
    try:
        # Parsing the whole text first handles pretty-printed objects and
        # leading blank lines, which a "first line starts with '['" check
        # would wrongly treat as JSON Lines.
        parsed = json.loads(content)
    except json.JSONDecodeError:
        # Split on '\n' only: str.splitlines() would also break on characters
        # such as U+2028 that may legally appear inside JSON strings.
        return _parse_json_lines(content.split('\n'))
    return parsed if isinstance(parsed, list) else [parsed]


def _parse_json_lines(lines) -> List[Any]:
    """
    Decode one JSON value per non-blank line of an iterable of strings.
    """
    stripped_lines = (line.strip() for line in lines)
    return [json.loads(line) for line in stripped_lines if line]
def parse_think_content(think_str: str) -> Dict[str, str]:
    """
    Normalise the text of a <think> section.

    Any literal ``<think>`` / ``</think>`` tags are removed and the result is
    stripped of surrounding whitespace. The cleaned text is returned under
    both ``"raw"`` and ``"behavior"``; a falsy input gives empty strings.
    """
    if think_str:
        body = re.sub(r'<think>|</think>', '', think_str).strip()
    else:
        body = ""
    # The behavior description is currently the whole cleaned text.
    return {"raw": body, "behavior": body}
def parse_control_content(control_str: str) -> Dict[str, Any]:
    """
    Split a <control> section into command, parameters and a coarse type.

    Recognised shapes, checked in this order:
      * ``Name(arg)``   -> command ``Name``, parameters ``{"parameter": arg}``;
        type is "monitoring", "alerting" or "setting" when the name contains
        "Monitor", "Alert" or "set" respectively, otherwise "other".
      * ``name|a|b|``   -> command ``name``, parameters ``{"params": [...]}``,
        type "command".
      * anything else   -> the whole text is the command, type "function".

    Text containing both parentheses that does not match ``Name(arg)`` keeps
    the whole text as the command with type "other". A falsy input returns
    empty fields with type "none".
    """
    if not control_str:
        return {"raw": "", "command": "", "parameters": {}, "type": "none"}

    text = re.sub(r'<control>|</control>', '', control_str).strip()
    result = {"raw": text, "command": text, "parameters": {}, "type": "other"}

    if "(" in text and ")" in text:
        # Shape like: MonitorPassenger(SwellingDetected)
        call = re.match(r'(\w+)\(([^)]+)\)', text)
        if call is not None:
            name, argument = call.groups()
            result["command"] = name
            result["parameters"] = {"parameter": argument}
            # First marker found in the command name decides the type.
            for marker, kind in (
                ("Monitor", "monitoring"),
                ("Alert", "alerting"),
                ("set", "setting"),
            ):
                if marker in name:
                    result["type"] = kind
                    break
    elif "|" in text:
        # Shape like: setMute|false|
        head, *tail = text.split("|")
        result["command"] = head
        result["parameters"] = {"params": tail}
        result["type"] = "command"
    else:
        result["type"] = "function"

    return result
def parse_answer_content(answer_str: str) -> Dict[str, str]:
    """
    Clean an <answer> section and assign it a coarse category.

    Literal ``<answer>`` / ``</answer>`` tags are removed and the text is
    stripped. The category is chosen by case-insensitive substring search:
    the rules below are tried in order and the first rule with any keyword
    contained in the text wins; with no hit the category is "other".

    Returns a dict with ``"raw"`` and ``"description"`` (both the cleaned
    text) and ``"category"``. A falsy input returns empty strings for all
    three keys.
    """
    if not answer_str:
        return {"raw": "", "category": "", "description": ""}

    clean_str = re.sub(r'<answer>|</answer>', '', answer_str).strip()

    # Ordered (category, keywords) rules; earlier rules take precedence.
    # Keywords are matched as substrings, so they act as stems
    # ("yawn" also matches "yawning"). The smoking rule uses "smok" because
    # "smoke" is not a substring of "smoking".
    category_rules = (
        ("physical_symptom", ("swelling", "eye", "face", "facial")),
        ("drowsiness", ("sleep", "drowsy", "tired", "yawn")),
        ("distraction", ("phone", "call", "text", "mobile")),
        ("smoking", ("smok", "cigarette")),
        ("intoxication", ("drunk", "alcohol", "intoxicated")),
        ("facial_expression", ("mouth", "corner", "slanting")),
        ("head_behavior", ("head", "cover", "hold")),
        ("limb_behavior", ("arm", "hand", "slip", "droop")),
        ("vehicle_control", ("radio", "adjust", "control")),
    )

    lowered = clean_str.lower()  # lowercase once instead of once per keyword
    category = "other"
    for name, keywords in category_rules:
        if any(keyword in lowered for keyword in keywords):
            category = name
            break

    return {
        "raw": clean_str,
        "category": category,
        "description": clean_str
    }
def extract_all_components(text: str) -> Dict[str, str]:
    """
    Extract the think, control and answer sections from tagged text.

    For each of ``<think>``, ``<control>`` and ``<answer>`` the first
    non-greedy ``<tag>...</tag>`` match is taken (content may span lines)
    and stripped of surrounding whitespace. A tag that is absent yields an
    empty string.

    A non-string ``text`` (for example ``None`` coming from a JSON ``null``)
    is treated as text without any tags, so all components are empty rather
    than raising ``TypeError`` from the regex search.
    """
    components = {"think": "", "control": "", "answer": ""}
    if not isinstance(text, str):
        return components

    # Only values are reassigned while iterating, so the key set is stable.
    for tag in components:
        found = re.search(rf'<{tag}>(.*?)</{tag}>', text, re.DOTALL)
        if found:
            components[tag] = found.group(1).strip()
    return components
def calculate_component_accuracy(predicted_components: Dict, actual_components: Dict) -> Dict[str, float]:
    """
    Score each component of a prediction against its reference.

    For the keys 'think', 'control' and 'answer' (in that order) the value
    from each dict, defaulting to an empty string when missing, is passed to
    ``calculate_similarity``; the resulting scores are returned keyed by
    component name.
    """
    return {
        name: calculate_similarity(
            predicted_components.get(name, ''),
            actual_components.get(name, ''),
        )
        for name in ('think', 'control', 'answer')
    }
def calculate_similarity(str1: str, str2: str) -> float:
    """
    Calculate a similarity score in [0.0, 1.0] between two strings.

    Both inputs are lowercased and stripped before any comparison, so
    whitespace-only input is treated the same as empty input (falsy input
    such as ``None`` is also treated as empty).

    Scoring:
      * both empty               -> 1.0
      * exactly one empty        -> 0.0
      * equal after normalising  -> 1.0
      * otherwise the Jaccard index of the two whitespace-separated word
        sets, raised to at least 0.8 when one normalised string is a
        substring of the other.
    """
    first = (str1 or "").lower().strip()
    second = (str2 or "").lower().strip()

    # Emptiness is decided after normalising so that "  " and "" agree.
    if not first and not second:
        return 1.0
    if not first or not second:
        return 0.0
    if first == second:
        return 1.0

    # Both strings are non-empty after stripping, so each word set has at
    # least one element and the union below is never empty.
    words_first = set(first.split())
    words_second = set(second.split())
    jaccard = len(words_first & words_second) / len(words_first | words_second)

    # Containment of one string in the other counts as a strong match.
    if first in second or second in first:
        return max(jaccard, 0.8)
    return jaccard
def evaluate_component_quality(parsed_component: Dict, expected_component: Dict) -> Dict[str, float]:
    """
    Rate a parsed component against the expected one.

    Returns a dict with:
      * ``'type_match'``: 1.0 when both dicts have the same ``'type'`` value
        (two missing types compare equal), else 0.0.
      * ``'content_quality'``: for a parsed type of 'monitoring' or
        'alerting', 1.0 if the parsed ``'command'`` contains "Monitor" or
        "Alert" respectively and 0.0 otherwise; 0.5 for every other type.
    """
    kind = parsed_component.get('type')
    type_match = 1.0 if kind == expected_component.get('type') else 0.0

    # Marker the command name must contain for the types that are checked.
    if kind == 'monitoring':
        marker = 'Monitor'
    elif kind == 'alerting':
        marker = 'Alert'
    else:
        marker = None

    if marker is None:
        content_quality = 0.5  # Default medium quality
    else:
        content_quality = 1.0 if marker in parsed_component.get('command', '') else 0.0

    return {'type_match': type_match, 'content_quality': content_quality}
def comprehensive_evaluation(data: List[Dict]) -> Dict[str, Any]:
    """
    Evaluate the think, control and answer components of every sample.

    Each sample's ``'response'`` and ``'labels'`` values (empty string when
    missing) are split into components, parsed, and scored for accuracy and
    quality.

    Returns a dict with:
      * ``'overall_metrics'``: per-component average accuracy, average
        quality and population standard deviation of accuracy, plus
        ``'total_samples'`` and ``'avg_overall_score'``.
      * ``'component_wise_metrics'``: per-component lists of accuracy and
        quality scores, one entry per sample.
      * ``'detailed_analysis'``: one record per sample with the extracted
        and parsed components, their scores, and an overall score (mean of
        the three accuracies).
      * ``'error_patterns'``: per-component lists of sample indices whose
        accuracy is below 0.5.
    """
    component_names = ['think', 'control', 'answer']
    parsers = {
        'think': parse_think_content,
        'control': parse_control_content,
        'answer': parse_answer_content,
    }

    total_samples = len(data)
    results = {
        'overall_metrics': {},
        'component_wise_metrics': {
            name: {'accuracy_scores': [], 'quality_scores': []}
            for name in component_names
        },
        'detailed_analysis': [],
        'error_patterns': {f'{name}_errors': [] for name in component_names},
    }

    for idx, sample in enumerate(data):
        # Split the raw response and reference into tagged components.
        response_components = extract_all_components(sample.get('response', ''))
        label_components = extract_all_components(sample.get('labels', ''))

        # Structured view of each component, prediction first then reference.
        parsed_response = {
            name: parsers[name](response_components[name]) for name in component_names
        }
        parsed_labels = {
            name: parsers[name](label_components[name]) for name in component_names
        }

        component_accuracy = calculate_component_accuracy(response_components, label_components)
        component_quality = {
            name: evaluate_component_quality(parsed_response[name], parsed_labels[name])
            for name in component_names
        }

        for name in component_names:
            bucket = results['component_wise_metrics'][name]
            bucket['accuracy_scores'].append(component_accuracy[name])
            bucket['quality_scores'].append(
                component_quality[name].get('content_quality', 0.5)
            )

        if component_accuracy:
            overall_score = sum(component_accuracy.values()) / 3
        else:
            overall_score = 0

        results['detailed_analysis'].append({
            'index': idx,
            'response_components': response_components,
            'label_components': label_components,
            'parsed_response': parsed_response,
            'parsed_labels': parsed_labels,
            'component_accuracy': component_accuracy,
            'component_quality': component_quality,
            'overall_score': overall_score,
        })

        # A component scoring under 0.5 is recorded as an error for this sample.
        for name in component_names:
            if component_accuracy[name] < 0.5:
                results['error_patterns'][f'{name}_errors'].append(idx)

    # Aggregate per-component statistics.
    overall_metrics = {}
    for name in component_names:
        acc_scores = results['component_wise_metrics'][name]['accuracy_scores']
        qual_scores = results['component_wise_metrics'][name]['quality_scores']

        mean_accuracy = sum(acc_scores) / len(acc_scores) if acc_scores else 0
        overall_metrics[f'{name}_avg_accuracy'] = mean_accuracy
        overall_metrics[f'{name}_avg_quality'] = (
            sum(qual_scores) / len(qual_scores) if qual_scores else 0
        )
        if acc_scores:
            variance = sum((x - mean_accuracy)**2 for x in acc_scores) / len(acc_scores)
            overall_metrics[f'{name}_std_accuracy'] = variance**0.5
        else:
            overall_metrics[f'{name}_std_accuracy'] = 0

    # Aggregate whole-system statistics.
    overall_metrics['total_samples'] = total_samples
    if total_samples > 0:
        score_sum = sum(record['overall_score'] for record in results['detailed_analysis'])
        overall_metrics['avg_overall_score'] = score_sum / total_samples
    else:
        overall_metrics['avg_overall_score'] = 0

    results['overall_metrics'] = overall_metrics
    return results
def generate_evaluation_report(results: Dict[str, Any]) -> str:
    """
    Render evaluation results as a human-readable, multi-line text report.

    Args:
        results: Mapping that must provide 'overall_metrics',
            'error_patterns' and 'detailed_analysis' entries; each entry of
            'detailed_analysis' is read for 'parsed_response',
            'component_accuracy' and 'overall_score'.

    Returns:
        All report lines joined with newline characters.
    """
    components = ('think', 'control', 'answer')
    banner = "=" * 100
    metrics = results['overall_metrics']

    # Header plus the system-wide summary.
    lines = [
        banner,
        "COMPREHENSIVE EVALUATION OF IN-VEHICLE MULTIMODAL AI MODEL",
        banner,
        "\n📊 OVERALL SYSTEM PERFORMANCE:",
        f"   Total Samples: {metrics['total_samples']}",
        f"   Average Overall Score: {metrics['avg_overall_score']:.4f}",
        "\n🔍 COMPONENT-WISE PERFORMANCE:",
    ]

    # Per-component aggregates; a missing metric is reported as 0.
    for name in components:
        lines.extend([
            f"   {name.upper()}:",
            f"     Average Accuracy: {metrics.get(name + '_avg_accuracy', 0):.4f}",
            f"     Average Quality: {metrics.get(name + '_avg_quality', 0):.4f}",
            f"     Std Deviation: {metrics.get(name + '_std_accuracy', 0):.4f}",
        ])

    # Number of samples recorded under each component's error list.
    errors = results['error_patterns']
    lines.append("\n❌ ERROR ANALYSIS:")
    for name in components:
        error_count = len(errors[name + '_errors'])
        lines.append(f"   {name.capitalize()} component errors: {error_count} samples")

    # The first analysed sample is shown as a worked example.
    details = results['detailed_analysis']
    if details:
        first = details[0]
        first_accuracy = first['component_accuracy']
        lines.append("\n📋 SAMPLE ANALYSIS (First Sample):")
        for name in components:
            lines.append(f"   {name.capitalize()} Accuracy: {first_accuracy[name]:.4f}")
        lines.append(f"   Overall Score: {first['overall_score']:.4f}")

    # Frequency tables over the parsed predictions, most frequent first.
    lines.append("\n🔧 COMPONENT TYPE ANALYSIS:")
    control_tally = Counter(
        entry['parsed_response']['control'].get('type', 'unknown')
        for entry in details
    )
    lines.append("   Control Command Types:")
    lines.extend(
        f"     {kind}: {occurrences} samples"
        for kind, occurrences in control_tally.most_common()
    )

    answer_tally = Counter(
        entry['parsed_response']['answer'].get('category', 'unknown')
        for entry in details
    )
    lines.append("   Answer Categories:")
    lines.extend(
        f"     {label}: {occurrences} samples"
        for label, occurrences in answer_tally.most_common()
    )

    # A component earns a recommendation when its average accuracy is below 0.7.
    lines.append("\n🎯 RECOMMENDATIONS:")
    advice = (
        ('think_avg_accuracy', "   - Improve think component (behavior analysis)"),
        ('control_avg_accuracy', "   - Improve control component (command generation)"),
        ('answer_avg_accuracy', "   - Improve answer component (final classification)"),
    )
    lines.extend(text for key, text in advice if metrics.get(key, 0) < 0.7)

    return "\n".join(lines)
def save_evaluation_results(results: Dict[str, Any], output_path: str):
    """
    Save evaluation results to a JSON file.

    The whole `results` object is written as one indented JSON document
    (UTF-8, non-ASCII characters kept as-is). Missing parent directories of
    `output_path` are created first, so a completed evaluation is not lost
    just because the target folder does not exist yet.

    Args:
        results: JSON-serializable evaluation results.
        output_path: Destination file path; an existing file is overwritten.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If `results` contains values `json` cannot serialize.
    """
    # Local import keeps this function self-contained regardless of which
    # modules the file imports at the top.
    import os

    parent_dir = os.path.dirname(output_path)
    if parent_dir:  # empty when output_path is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
def main(input_file: str, output_file: str = None):
    """
    Run the evaluation pipeline end to end for one predictions file.

    Loads the samples with `load_data`, scores them with
    `comprehensive_evaluation`, prints the report built by
    `generate_evaluation_report`, and optionally persists the results via
    `save_evaluation_results`.

    Args:
        input_file: Path handed to `load_data`.
        output_file: Where to save the detailed results; nothing is saved
            when this is None or empty.

    Returns:
        The results object produced by `comprehensive_evaluation`.
    """
    print(f"Loading data from: {input_file}")
    samples = load_data(input_file)
    print(f"Loaded {len(samples)} samples")

    print("Performing comprehensive evaluation...")
    evaluation = comprehensive_evaluation(samples)

    # Always show the report on stdout, whether or not results are saved.
    print(generate_evaluation_report(evaluation))

    # Saving is opt-in: skip it when no output path was supplied.
    if not output_file:
        return evaluation

    save_evaluation_results(evaluation, output_file)
    print(f"\nDetailed evaluation results saved to: {output_file}")
    return evaluation
if __name__ == "__main__":
    import sys

    # Usage: python <this script> [input_file] [output_file]
    #   input_file:  path to the JSON or JSONL file containing model predictions
    #   output_file: optional path to save detailed evaluation results
    # With no arguments, the checkpoint-specific defaults below are used, so
    # running the script bare behaves as it did with hard-coded paths.
    default_input_file = r"/data/LLM-SFT/SFT_Output/multiclsTask/Qwen2.5-VL-3B-Instruct/v0-20251123-182828/checkpoint-264/infer_result/20251124-175009.jsonl"
    default_output_file = r"/data/LLM-SFT/SFT_Output/multiclsTask/Qwen2.5-VL-3B-Instruct/v0-20251123-182828/checkpoint-264/eval/20251124-175009.jsonl"

    cli_args = sys.argv[1:]
    if cli_args:
        input_file = cli_args[0]
        # Do not fall back to the default output path for a custom input:
        # that would overwrite another run's results with unrelated data.
        output_file = cli_args[1] if len(cli_args) > 1 else None
    else:
        input_file = default_input_file
        output_file = default_output_file

    results = main(input_file, output_file)