YUNTA88 committed on Mar 26

Commit

c6cbaf4

verified ·

1 Parent(s): 24f08aa

Upload folder using huggingface_hub

Browse files

Files changed (44) hide show

eval_footprint/__pycache__/eval_openended_inference.cpython-311.pyc +0 -0
eval_footprint/comparison_report.json +97 -0
eval_footprint/convert_openended.py +61 -0
eval_footprint/create_openended_test.py +211 -0
eval_footprint/eval_deepseek_judge.py +382 -0
eval_footprint/eval_fullft_math_nf.py +258 -0
eval_footprint/eval_fullft_math_nf_old.py +259 -0
eval_footprint/eval_fullft_math_nf_old_final.py +258 -0
eval_footprint/eval_fullft_phyx_math_nf.py +258 -0
eval_footprint/eval_fullft_phyx_nf.py +258 -0
eval_footprint/eval_inference_lora_math_f.py +262 -0
eval_footprint/eval_judge_fullft_math_nf.py +293 -0
eval_footprint/eval_judge_fullft_phyx_nf.py +296 -0
eval_footprint/eval_judge_lora_math_f.py +377 -0
eval_footprint/eval_judge_lora_phyx_f.py +340 -0
eval_footprint/eval_lora_phyx_f.py +259 -0
eval_footprint/eval_lora_phyx_f_final.py +258 -0
eval_footprint/eval_openended_inference.py +259 -0
eval_footprint/eval_openended_judge.py +467 -0
eval_footprint/eval_sft_only.py +114 -0
eval_footprint/eval_sft_vs_base_multigpu.py +359 -0
eval_footprint/eval_single_model_template.py +242 -0
eval_footprint/inference_fullft_math_nf.jsonl +0 -0
eval_footprint/inference_fullft_math_nf_old.jsonl +0 -0
eval_footprint/inference_fullft_phyx_math_nf.jsonl +0 -0
eval_footprint/inference_fullft_phyx_nf.jsonl +0 -0
eval_footprint/inference_lora_phyx_f.jsonl +0 -0
eval_footprint/inference_results_base.jsonl +0 -0
eval_footprint/inference_results_lora_math_f.jsonl +0 -0
eval_footprint/inference_results_sft.jsonl +0 -0
eval_footprint/inference_results_sft.jsonl_gpu4.jsonl +0 -0
eval_footprint/inference_results_sft.jsonl_gpu5.jsonl +0 -0
eval_footprint/inference_results_sft.jsonl_gpu6.jsonl +0 -0
eval_footprint/inference_results_sft.jsonl_gpu7.jsonl +0 -0
eval_footprint/report_lora_math_f.json +39 -0
eval_footprint/run_inference_single.py +120 -0
eval_footprint/scored_results_base.jsonl +0 -0
eval_footprint/scored_results_lora_math_f.jsonl +0 -0
eval_footprint/scored_results_sft.jsonl +0 -0
eval_footprint/scored_results_sft_phyx.jsonl +0 -0
eval_footprint/sft_report_phyx.json +32 -0
eval_footprint/simple_eval.py +68 -0
eval_footprint/single_model_eval.py +103 -0
eval_footprint/test_1533_openended.jsonl +0 -0

eval_footprint/__pycache__/eval_openended_inference.cpython-311.pyc ADDED Viewed

Binary file (13.9 kB). View file

eval_footprint/comparison_report.json ADDED Viewed

	@@ -0,0 +1,97 @@

+{
+  "timestamp": "2026-02-26 04:38:24",
+  "scoring_method": "rule-based + Gemini 2.5 Flash judge (parallel)",
+  "base": {
+    "model": "Base",
+    "total": 1533,
+    "rule_correct": 48,
+    "gemini_rescued": 108,
+    "final_correct": 156,
+    "final_acc": 10.18,
+    "category_stats": {
+      "Mechanics": {
+        "total": 276,
+        "rule_correct": 8,
+        "gemini_correct": 33,
+        "final_correct": 41
+      },
+      "Waves/Acoustics": {
+        "total": 253,
+        "rule_correct": 7,
+        "gemini_correct": 20,
+        "final_correct": 27
+      },
+      "Electromagnetism": {
+        "total": 275,
+        "rule_correct": 7,
+        "gemini_correct": 16,
+        "final_correct": 23
+      },
+      "Modern Physics": {
+        "total": 222,
+        "rule_correct": 6,
+        "gemini_correct": 14,
+        "final_correct": 20
+      },
+      "Optics": {
+        "total": 252,
+        "rule_correct": 16,
+        "gemini_correct": 15,
+        "final_correct": 31
+      },
+      "Thermodynamics": {
+        "total": 255,
+        "rule_correct": 4,
+        "gemini_correct": 10,
+        "final_correct": 14
+      }
+    }
+  },
+  "sft": {
+    "model": "SFT",
+    "total": 1533,
+    "rule_correct": 41,
+    "gemini_rescued": 69,
+    "final_correct": 110,
+    "final_acc": 7.18,
+    "category_stats": {
+      "Mechanics": {
+        "total": 276,
+        "rule_correct": 2,
+        "gemini_correct": 27,
+        "final_correct": 29
+      },
+      "Waves/Acoustics": {
+        "total": 253,
+        "rule_correct": 8,
+        "gemini_correct": 12,
+        "final_correct": 20
+      },
+      "Electromagnetism": {
+        "total": 275,
+        "rule_correct": 7,
+        "gemini_correct": 8,
+        "final_correct": 15
+      },
+      "Modern Physics": {
+        "total": 222,
+        "rule_correct": 7,
+        "gemini_correct": 10,
+        "final_correct": 17
+      },
+      "Optics": {
+        "total": 252,
+        "rule_correct": 13,
+        "gemini_correct": 6,
+        "final_correct": 19
+      },
+      "Thermodynamics": {
+        "total": 255,
+        "rule_correct": 4,
+        "gemini_correct": 6,
+        "final_correct": 10
+      }
+    }
+  },
+  "improvement": "-3.00%"
+}

eval_footprint/convert_openended.py ADDED Viewed

	@@ -0,0 +1,61 @@

+#!/usr/bin/env python3
+"""Convert MCQ test set to open-ended format. Uses pyarrow (not pandas)."""
+import json, re, os
+PHYX_TEST = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/PhyX_test.jsonl"
+TEST_PARQUET = "/workspace/rl4phyx/RL4Phyx/SFT/eval_data/test_1533.parquet"
+OUTPUT = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
+os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
+# Get test indices from parquet via pyarrow
+import pyarrow.parquet as pq
+table = pq.read_table(TEST_PARQUET)
+ei_col = table.column("extra_info")
+test_indices = set()
+for i in range(len(table)):
+    ei = ei_col[i].as_py()
+    test_indices.add(ei["index"])
+print(f"Test indices: {len(test_indices)}")
+# Read all PhyX samples
+all_samples = {}
+with open(PHYX_TEST) as f:
+    for line in f:
+        if line.strip():
+            d = json.loads(line)
+            all_samples[d["index"]] = d
+print(f"Total PhyX: {len(all_samples)}")
+# Convert
+results = []
+for idx in sorted(test_indices):
+    s = all_samples[idx]
+    opts = dict(re.findall(r'([ABCD]):\s*"([^"]*)"', s.get("options", "")))
+    letter = s["answer"].strip().upper()
+    actual = opts.get(letter, letter)
+    results.append({
+        "index": idx,
+        "category": s.get("category", ""),
+        "subfield": s.get("subfield", ""),
+        "description": s.get("description", ""),
+        "question": s.get("question", ""),
+        "image": s.get("image", ""),
+        "ground_truth_letter": letter,
+        "ground_truth_value": actual,
+        "options_original": s.get("options", ""),
+        "reasoning_type": s.get("reasoning_type", []),
+        "image_caption": s.get("image_caption", ""),
+    })
+with open(OUTPUT, "w") as f:
+    for r in results:
+        f.write(json.dumps(r, ensure_ascii=False) + "\n")
+print(f"Saved {len(results)} samples to {OUTPUT}")
+from collections import Counter
+cats = Counter(r["category"] for r in results)
+for c, n in sorted(cats.items(), key=lambda x: -x[1]):
+    print(f"  {c}: {n}")
+ex = results[0]
+print(f"Ex: q={ex['question'][:80]}  gt={ex['ground_truth_value']}")

eval_footprint/create_openended_test.py ADDED Viewed

	@@ -0,0 +1,211 @@

+#!/usr/bin/env python3
+"""
+Convert MCQ test set to open-ended format.
+NO pandas dependency — uses only json, re, os (avoids Docker import issues).
+Input:  PhyX_test.jsonl (3000 questions with MCQ options)
+        SFT train indices file (to exclude training samples)
+Output: test_1533_openended.jsonl (open-ended format)
+"""
+import json
+import re
+import os
+# ============ CONFIG ============
+# Source JSONL holding every PhyX sample together with its MCQ options.
+PHYX_TEST = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/PhyX_test.jsonl"
+# The SFT training set indices — we need to exclude these
+# NOTE(review): SFT_TRAIN_PARQUET_DIR, TOTAL_PHYX and SFT_TRAIN_COUNT are not
+# referenced anywhere else in this file; get_train_indices() hard-codes its
+# own directory list.
+SFT_TRAIN_PARQUET_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/data"
+# Directory and file that receive the converted open-ended test set.
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+OUTPUT_FILE = os.path.join(OUTPUT_DIR, "test_1533_openended.jsonl")
+TOTAL_PHYX = 3000
+SFT_TRAIN_COUNT = 1467  # 3000 - 1533
+# ================================
+def parse_options(options_str):
+    """Parse 'A:"7.55N",B:"5.55N",C:"7.65N",D:"6.65N"'
+    Returns dict: {'A': '7.55N', 'B': '5.55N', ...}
+    """
+    result = {}
+    matches = re.findall(r'([ABCD]):\s*"([^"]*)"', options_str)
+    for letter, value in matches:
+        result[letter] = value
+    return result
+def get_train_indices():
+    """Get the indices of samples used in SFT training.
+    We know SFT used specific indices from PhyX_test.jsonl.
+    Read the SFT training jsonl to find which indices were used.
+    """
+    train_indices = set()
+    # Try reading the SFT training data to extract indices
+    sft_dirs = [
+        "/workspace/rl4phyx/RL4Phyx/SFT/data",
+        "/workspace/rl4phyx/RL4Phyx/SFT/sft_data",
+    ]
+    for sft_dir in sft_dirs:
+        if not os.path.exists(sft_dir):
+            continue
+        for fname in os.listdir(sft_dir):
+            fpath = os.path.join(sft_dir, fname)
+            if fname.endswith('.jsonl'):
+                with open(fpath, 'r', encoding='utf-8') as f:
+                    for line in f:
+                        if line.strip():
+                            try:
+                                data = json.loads(line)
+                                if 'extra_info' in data and 'index' in data['extra_info']:
+                                    train_indices.add(data['extra_info']['index'])
+                                elif 'index' in data:
+                                    train_indices.add(data['index'])
+                            except:
+                                pass
+            elif fname.endswith('.json'):
+                with open(fpath, 'r', encoding='utf-8') as f:
+                    try:
+                        data = json.load(f)
+                        if isinstance(data, list):
+                            for item in data:
+                                if 'index' in item:
+                                    train_indices.add(item['index'])
+                    except:
+                        pass
+    return train_indices
+def main():
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    # Step 1: Read all PhyX_test.jsonl samples
+    all_samples = []
+    with open(PHYX_TEST, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                all_samples.append(json.loads(line))
+    print(f"Total PhyX_test samples: {len(all_samples)}")
+    # Step 2: Get training indices to exclude
+    train_indices = get_train_indices()
+    print(f"Found {len(train_indices)} SFT training indices to exclude")
+    # If we couldn't find training indices, try alternate approach
+    # The test set was created by selecting samples with split='test'
+    if len(train_indices) == 0:
+        print("No training indices found from files, checking for 'split' field...")
+        # Check if samples have a split field
+        has_split = any('split' in s for s in all_samples)
+        if has_split:
+            test_samples = [s for s in all_samples if s.get('split') == 'test']
+            train_samples = [s for s in all_samples if s.get('split') != 'test']
+            train_indices = {s['index'] for s in train_samples}
+            print(f"Using split field: {len(test_samples)} test, {len(train_samples)} train")
+        else:
+            # Check the SFT config for variance selection
+            # The test set matches the complement of the training set
+            # Try reading parquet via pyarrow directly (might work even if pandas broken)
+            print("Trying pyarrow directly...")
+            try:
+                import pyarrow.parquet as pq
+                for root, dirs, files in os.walk("/workspace/rl4phyx/RL4Phyx/SFT"):
+                    for f_name in files:
+                        if 'train' in f_name.lower() and f_name.endswith('.parquet'):
+                            fpath = os.path.join(root, f_name)
+                            table = pq.read_table(fpath)
+                            # Try to extract indices
+                            if 'extra_info' in table.column_names:
+                                for row in table.to_pydict()['extra_info']:
+                                    if isinstance(row, dict) and 'index' in row:
+                                        train_indices.add(row['index'])
+                            print(f"  Read {fpath}: got {len(train_indices)} indices so far")
+            except Exception as e:
+                print(f"pyarrow failed: {e}")
+    # Step 3: Build test set
+    if len(train_indices) > 0:
+        test_samples = [s for s in all_samples if s['index'] not in train_indices]
+    else:
+        # Fallback: just use all 3000 samples
+        print("WARNING: Could not identify train/test split. Using all 3000 samples.")
+        test_samples = all_samples
+    print(f"Test samples to convert: {len(test_samples)}")
+    # Step 4: Convert to open-ended format
+    openended = []
+    for sample in test_samples:
+        options = parse_options(sample.get('options', ''))
+        letter_answer = sample['answer'].strip().upper()
+        actual_answer = options.get(letter_answer, letter_answer)
+        entry = {
+            'index': sample['index'],
+            'category': sample.get('category', ''),
+            'subfield': sample.get('subfield', ''),
+            'description': sample.get('description', ''),
+            'question': sample.get('question', ''),
+            'image': sample.get('image', ''),
+            'ground_truth_letter': letter_answer,
+            'ground_truth_value': actual_answer,
+            'options_original': sample.get('options', ''),
+            'reasoning_type': sample.get('reasoning_type', []),
+            'image_caption': sample.get('image_caption', ''),
+        }
+        openended.append(entry)
+    # Step 5: Stats
+    from collections import Counter
+    cats = Counter(e['category'] for e in openended)
+    print(f"\nConverted {len(openended)} samples to open-ended format")
+    print("Per-category distribution:")
+    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
+        print(f"  {cat}: {cnt}")
+    # Answer type analysis
+    numeric_count = 0
+    text_count = 0
+    for e in openended:
+        val = e['ground_truth_value']
+        # Check if answer is numeric-ish
+        clean = re.sub(r'[^\d.\-eE]', '', val)
+        try:
+            float(clean)
+            numeric_count += 1
+        except:
+            text_count += 1
+    print(f"\nAnswer types: {numeric_count} numeric-ish, {text_count} text/symbolic")
+    # Step 6: Save
+    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
+        for entry in openended:
+            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
+    print(f"\nSaved to: {OUTPUT_FILE}")
+    # Also copy the conversion script itself
+    import shutil
+    script_dest = os.path.join(OUTPUT_DIR, "create_openended_test.py")
+    shutil.copy2(__file__, script_dest)
+    print(f"Script copied to: {script_dest}")
+    # Show examples
+    print("\n=== EXAMPLE 1 ===")
+    ex = openended[0]
+    print(f"  index: {ex['index']}")
+    print(f"  category: {ex['category']}")
+    print(f"  question: {ex['question']}")
+    print(f"  ground_truth_value: {ex['ground_truth_value']}")
+    if len(openended) > 100:
+        print("\n=== EXAMPLE 100 ===")
+        ex = openended[100]
+        print(f"  index: {ex['index']}")
+        print(f"  category: {ex['category']}")
+        print(f"  question: {ex['question']}")
+        print(f"  ground_truth_value: {ex['ground_truth_value']}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_deepseek_judge.py ADDED Viewed

	@@ -0,0 +1,382 @@

+#!/usr/bin/env python3
+"""
+Score inference results using DeepSeek-V3 as LLM Judge.
+*** EXACTLY aligned with PhyX official evaluation pipeline ***
+(from killthefullmoon/PhyX -> vlmeval/dataset/utils/phyx.py)
+Pipeline:
+  1. Extract answer from \boxed{} or "final answer:" pattern
+  2. String-level matching
+  3. LLM judge with 5-shot ICE prompt, retry 5 times with increasing temperature
+Usage:
+    python3 eval_deepseek_judge.py
+"""
+import json, os, re, time, sys, ast
+from collections import defaultdict
+import urllib.request
+import urllib.error
+# ===================== CONFIG =====================
+DEEPSEEK_API_KEY = "sk-6364e2b3116241c59577191c32b09021"
+DEEPSEEK_MODEL = "deepseek-chat"  # Official DeepSeek-V3
+DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
+RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+BASE_RESULTS = os.path.join(RESULTS_DIR, "inference_results_base.jsonl")
+SFT_RESULTS = os.path.join(RESULTS_DIR, "inference_results_sft.jsonl")
+OUTPUT_DIR = RESULTS_DIR
+FAIL_MSG = 'Failed to obtain answer via API.'
+RETRY = 5
+# ==================================================
+# ============= PhyX ICE (In-Context Examples) =============
+# Exactly from PhyX source code: get_ICE()
+def get_ICE():
+    example_1 = """
+Ground truth answer: 502 \n
+Predicted answer: The mass of block (B) is:
+[
+\\boxed{ 50 \\sqrt{101} }
+] \n
+Judegement: 1
+"""
+    example_2 = """
+Ground truth answer: 46.3 kN \n
+Predicted answer: The tension ( T_B ) in the cable is approximately:
+[
+\\boxed{46300 }
+] \n
+Judegement: 1
+"""
+    example_3 = """
+Ground truth answer: 12 m/s \n
+Predicted answer: The speed of the box after 2.00 seconds is:
+[
+\\boxed{11.3, \\text{m/s}}
+] \n
+Judegement: 0
+"""
+    example_4 = """
+Ground truth answer: 36.00 kg \n
+Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
+[
+\\boxed{36.1, \\text\\{kg\\}}
+] \n
+Judegement: 1
+"""
+    example_5 = """
+Ground truth answer: 3.2 m \n
+Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
+Judegement: 1
+"""
+    return [example_1, example_2, example_3, example_4, example_5]
+# ============= PhyX Prompt Builder =============
+# Exactly from PhyX source code: build_phyx_gpt4_prompt()
+def build_phyx_gpt4_prompt(gt_answer, pred):
+    task_description = """
+Please read the following example. Given predicted answer and ground truth answer,
+compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
+If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
+If the given predicted mentions "approximately", then allow the Approximation Error, \
+such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n
+"""
+    prompt = task_description
+    examples = get_ICE()
+    for example in examples:
+        prompt += example + '\n'
+    prompt += 'Ground truth answer: {} \n'.format(gt_answer)
+    prompt += 'Predicted answer: {} \n'.format(pred)
+    prompt += 'Judegement:'
+    return prompt
+# ============= PhyX Answer Extraction =============
+# Exactly from PhyX source code
+def mapping_str(input_str):
+    d = {"\\dfrac": "\\frac", "\\pi": "3.14"}
+    output = input_str
+    for k, v in d.items():
+        try:
+            output = output.replace(k, v)
+        except:
+            pass
+    return output
+def extract_boxed_content(s):
+    """Extract content from \\boxed{...} handling nested braces. From PhyX source."""
+    start = s.find(r'\boxed{')
+    if start == -1:
+        return None
+    content_start = start + len(r'\boxed{')
+    rest = s[content_start:]
+    depth = 0
+    for i, ch in enumerate(rest):
+        if ch == '{':
+            depth += 1
+        elif ch == '}':
+            if depth == 0:
+                return rest[:i]
+            else:
+                depth -= 1
+    return None
+def PhyX_process_line(prediction_str, gt_answer):
+    """
+    PhyX rule-based answer extraction and string matching.
+    Returns: dict with 'extracted', 'match' (0 or 1)
+    """
+    ret = {}
+    ret['gt'] = str(gt_answer)
+    ret['pred'] = prediction_str.strip()
+    if ret['pred'] == FAIL_MSG:
+        ret['match'] = 0
+        ret["extracted"] = "Fail to Call API"
+        return ret
+    # Try extracting from \boxed{}
+    boxed_answer = extract_boxed_content(ret['pred'])
+    if boxed_answer is not None:
+        boxed_answer = mapping_str(boxed_answer)
+        ret["extracted"] = boxed_answer
+    else:
+        # Try "final answer:" or "correct answer:" pattern
+        pattern = r'\b(?:final\s+answer|correct\s+answer)\b[^:：]*[:：]\s*(.*?)(?=\n\n\n|\Z)'
+        flags = re.IGNORECASE | re.DOTALL
+        match = re.search(pattern, ret['pred'], flags=flags)
+        if match:
+            extracted_answer = match.group(1)
+            extracted_answer = mapping_str(extracted_answer)
+            ret["extracted"] = extracted_answer
+        else:
+            ret["extracted"] = "SAME as predict"
+    # String-level matching (PhyX logic)
+    gt_lower = ret['gt'].strip().lower()
+    extracted_lower = ret["extracted"].strip().lower()
+    pred_lower = ret["pred"].strip().lower()
+    if gt_lower == extracted_lower or gt_lower == pred_lower or ret['gt'] in ret['pred']:
+        ret['match'] = 1
+        return ret
+    ret['match'] = 0
+    return ret
+# ============= DeepSeek API =============
+def call_deepseek(prompt, temperature=0.0):
+    """Call DeepSeek-V3 API (OpenAI-compatible)."""
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
+    }
+    data = json.dumps({
+        "model": DEEPSEEK_MODEL,
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": temperature,
+        "max_tokens": 200,
+    }).encode('utf-8')
+    try:
+        req = urllib.request.Request(DEEPSEEK_API_URL, data=data, headers=headers)
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            result = json.loads(resp.read().decode())
+            return result['choices'][0]['message']['content']
+    except Exception as e:
+        return FAIL_MSG
+# ============= PhyX Evaluation Logic =============
+# Exactly from PhyX source code: PhyX_auxeval()
+def PhyX_auxeval(gt_answer, prediction):
+    """
+    Evaluate a single prediction against ground truth.
+    Follows PhyX pipeline exactly:
+      1. Extract answer (boxed/regex)
+      2. String-level match
+      3. LLM judge with 5 retries, increasing temperature
+    Returns: dict(log, res, extracted)
+    """
+    log = ''
+    # Step 1: Rule-based extraction
+    tmp = PhyX_process_line(prediction, gt_answer)
+    if tmp["extracted"] == "Fail to Call API":
+        log += "Fail to Call API"
+        return dict(log=log, res=0, extracted="Fail to Call API")
+    if tmp["extracted"] != "SAME as predict":
+        prediction_extracted = tmp["extracted"]
+    else:
+        prediction_extracted = prediction
+    # Step 2: String-level match
+    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
+        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)
+    # Step 3: LLM judge with retries (PhyX uses 5 retries with temp = i * 0.5)
+    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
+    for i in range(RETRY):
+        res = call_deepseek(prompt, temperature=i * 0.5)
+        if FAIL_MSG in res:
+            log += f'Try {i}: answer and prediction are {gt_answer} and {prediction_extracted}, failed to compare.\n'
+        else:
+            log += 'Compared at semantic level. '
+            if "1" in res:
+                log += "Semantic equal via LLM."
+                return dict(log=log, res=1, extracted=prediction_extracted)
+            elif "0" in res:
+                log += f"LLM judgement {res}"
+                return dict(log=log, res=0, extracted=prediction_extracted)
+    log += 'All 5 retries failed.\n'
+    return dict(log=log, res=0, extracted=prediction_extracted)
+# ============= Main Scoring =============
+def score_results(results_file, model_name):
+    """Score all results from a JSONL file."""
+    results = []
+    with open(results_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                results.append(json.loads(line))
+    print(f"\n{'='*60}")
+    print(f"  Scoring: {model_name} ({len(results)} samples)")
+    print(f"  Using PhyX-aligned pipeline with DeepSeek-V3 judge")
+    print(f"{'='*60}")
+    total = len(results)
+    hit = 0
+    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})
+    scored = []
+    string_match = 0
+    llm_match = 0
+    llm_called = 0
+    for i, r in enumerate(results):
+        gt = r['ground_truth_value']
+        prediction = r['model_output']
+        cat = r.get('category', 'unknown')
+        cat_stats[cat]['total'] += 1
+        eval_result = PhyX_auxeval(gt, prediction)
+        r['extracted_answer'] = eval_result['extracted']
+        r['eval_log'] = eval_result['log']
+        r['res'] = eval_result['res']
+        if eval_result['res'] == 1:
+            hit += 1
+            cat_stats[cat]['correct'] += 1
+        if "string level" in eval_result['log']:
+            string_match += 1
+        elif "semantic level" in eval_result['log'] or "LLM judgement" in eval_result['log']:
+            llm_called += 1
+            if eval_result['res'] == 1:
+                llm_match += 1
+        scored.append(r)
+        if (i + 1) % 50 == 0:
+            print(f"  [{i+1}/{total}] acc={hit/(i+1)*100:.1f}% "
+                  f"(str_match={string_match}, llm_called={llm_called}, llm_match={llm_match})",
+                  flush=True)
+    acc = hit / total * 100
+    print(f"\n  RESULTS for {model_name}:")
+    print(f"    Total:           {total}")
+    print(f"    String matches:  {string_match}")
+    print(f"    LLM calls:       {llm_called}")
+    print(f"    LLM matches:     {llm_match}")
+    print(f"    Final correct:   {hit} ({acc:.1f}%)")
+    print(f"\n    Per category:")
+    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
+        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
+        print(f"      {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")
+    return scored, {
+        'model': model_name,
+        'total': total,
+        'string_matches': string_match,
+        'llm_calls': llm_called,
+        'llm_matches': llm_match,
+        'final_correct': hit,
+        'final_acc': round(acc, 2),
+        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
+    }
+def main():
+    print("="*60)
+    print("  PhyX-ALIGNED EVAL: DeepSeek-V3 as Judge")
+    print(f"  Pipeline: extract → string match → LLM judge (5 retries)")
+    print(f"  Results dir: {RESULTS_DIR}")
+    print("="*60)
+    # Test API
+    print("\nTesting DeepSeek API...")
+    test = call_deepseek("Say 'OK' if you can read this.")
+    if test == FAIL_MSG:
+        print(f"  API FAILED: {test}")
+        sys.exit(1)
+    print(f"  API OK: {test[:50]}")
+    # Score both models
+    base_scored, base_stats = score_results(BASE_RESULTS, "Base (Qwen2.5-VL-3B)")
+    sft_scored, sft_stats = score_results(SFT_RESULTS, "SFT (Cold-Start Full FT)")
+    # Save scored results
+    for scored, name in [(base_scored, "base"), (sft_scored, "sft")]:
+        out_file = os.path.join(OUTPUT_DIR, f"scored_results_{name}_phyx.jsonl")
+        with open(out_file, 'w', encoding='utf-8') as f:
+            for r in scored:
+                f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    # Save comparison report
+    report = {
+        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
+        'base': base_stats,
+        'sft': sft_stats,
+    }
+    report_file = os.path.join(OUTPUT_DIR, "comparison_report_phyx.json")
+    with open(report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2, ensure_ascii=False)
+    # Final comparison
+    print(f"\n{'='*60}")
+    print(f"  FINAL COMPARISON (PhyX-aligned)")
+    print(f"{'='*60}")
+    print(f"  Base accuracy:  {base_stats['final_acc']}%")
+    print(f"  SFT accuracy:   {sft_stats['final_acc']}%")
+    print(f"  Improvement:    {sft_stats['final_acc'] - base_stats['final_acc']:+.1f}%")
+    print(f"\n  Report saved: {report_file}")
+    print(f"{'='*60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_fullft_math_nf.py ADDED Viewed

	@@ -0,0 +1,258 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs both Base and SFT models on the 1533 open-ended physics test set.
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl
+    sft_eval_footprint/inference_results_coldstart.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+# Keep Hugging Face libraries offline; the models below are local paths.
+os.environ["HF_HUB_OFFLINE"] = "1"
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
+SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft_coldstart/final"
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
+# Multi-GPU config: GPU ids assigned to each model.
+# NOTE(review): this comment used to read "base on GPUs 0-3, SFT on GPUs 4-7",
+# but SFT_GPUS lists all eight GPUs and overlaps BASE_GPUS — confirm which is
+# intended.
+BASE_GPUS = [0, 1, 2, 3]
+SFT_GPUS = [0, 1, 2, 3, 4, 5, 6, 7]
+# Upper bound on generated tokens per sample (passed to model.generate).
+MAX_NEW_TOKENS = 2048
+# ================================
+def load_test_data():
+    """Load test samples from JSONL."""
+    samples = []
+    with open(TEST_FILE, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    return samples
+def build_open_ended_prompt(sample):
+    """Build an open-ended prompt (no MCQ options)."""
+    desc = sample.get('description', '')
+    question = sample.get('question', '')
+    prompt = f"""Look at the image and answer the physics question.
+{desc}
+{question}
+Please reason step by step, and put your final answer within \\boxed{{}}.
+"""
+    return prompt.strip()
+def worker_inference(gpu_id, model_path, samples, output_file, model_name):
+    """Worker: load model on specific GPU and run inference on assigned samples.
+
+    Args:
+        gpu_id: CUDA device number; the model is placed on f"cuda:{gpu_id}".
+        model_path: local path of the model and processor to load.
+        samples: list of sample dicts; 'index', 'image', 'category',
+            'question' and 'ground_truth_value' are read with [] and must be
+            present.
+        output_file: JSONL path that receives one result per sample.
+        model_name: label used in log lines and stored with each result.
+
+    Returns:
+        The number of results written.
+
+    A failure while processing one sample does not stop the worker: the
+    exception text is stored as that sample's 'model_output' ("ERROR: ...").
+    Results are written only after all samples have been processed.
+    """
+    # Imported here rather than at module level.
+    # NOTE(review): presumably so each spawned process imports them itself — confirm.
+    import torch
+    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from qwen_vl_utils import process_vision_info
+    from PIL import Image
+    device = f"cuda:{gpu_id}"
+    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)
+    # local_files_only=True: load strictly from model_path.
+    processor = AutoProcessor.from_pretrained(
+        model_path,
+        min_pixels=3136,
+        max_pixels=200704,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        device_map=device,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model.eval()
+    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)
+    results = []
+    for i, sample in enumerate(samples):
+        idx = sample['index']
+        prompt_text = build_open_ended_prompt(sample)
+        image_path = os.path.join(IMAGE_DIR, sample['image'])
+        # Build messages: a single user turn with the image followed by the text prompt
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": f"file://{image_path}"},
+                {"type": "text", "text": prompt_text},
+            ],
+        }]
+        try:
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            ).to(device)
+            with torch.no_grad():
+                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
+            # Drop the prompt tokens so that only the newly generated part is decoded.
+            generated = output_ids[0][inputs.input_ids.shape[1]:]
+            response = processor.decode(generated, skip_special_tokens=True)
+        except Exception as e:
+            # Record the failure as this sample's output and keep going.
+            response = f"ERROR: {str(e)}"
+        result = {
+            "index": idx,
+            "category": sample['category'],
+            "subfield": sample.get('subfield', ''),
+            "question": sample['question'],
+            "ground_truth_value": sample['ground_truth_value'],
+            "ground_truth_letter": sample.get('ground_truth_letter', ''),
+            "model_output": response,
+            "model_name": model_name,
+            "gpu_id": gpu_id,
+        }
+        results.append(result)
+        # Progress line every 20 samples and after the last one.
+        if (i + 1) % 20 == 0 or (i + 1) == len(samples):
+            print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)
+    # Write results
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
+    return len(results)
+def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
+    """Split samples across GPUs and run in parallel."""
+    n = len(samples)
+    k = len(gpu_ids)
+    chunk_size = (n + k - 1) // k
+    processes = []
+    output_files = []
+    for i, gpu_id in enumerate(gpu_ids):
+        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
+        if not chunk:
+            continue
+        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
+        output_files.append(out_file)
+        p = mp.Process(
+            target=worker_inference,
+            args=(gpu_id, model_path, chunk, out_file, model_name)
+        )
+        processes.append(p)
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+    return output_files
+def merge_results(output_files, final_output):
+    """Merge per-GPU result files into one."""
+    all_results = []
+    for f in output_files:
+        if os.path.exists(f):
+            with open(f, 'r', encoding='utf-8') as fh:
+                for line in fh:
+                    if line.strip():
+                        all_results.append(json.loads(line))
+    # Sort by index for consistency
+    all_results.sort(key=lambda x: x['index'])
+    with open(final_output, 'w', encoding='utf-8') as f:
+        for r in all_results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    # Cleanup per-GPU files
+    for f in output_files:
+        if os.path.exists(f):
+            os.remove(f)
+    return all_results
+def main():
+    mp.set_start_method('spawn', force=True)
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    print("=" * 60)
+    print("  OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
+    print(f"  Base model: {BASE_MODEL}")
+    print(f"  SFT model:  {SFT_MODEL}")
+    print(f"  Base GPUs:  {BASE_GPUS}")
+    print(f"  SFT GPUs:   {SFT_GPUS}")
+    print("=" * 60)
+    # Load test data
+    samples = load_test_data()
+    print(f"\nLoaded {len(samples)} test samples")
+    cats = Counter(s['category'] for s in samples)
+    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
+        print(f"  {cat}: {cnt}")
+    # Run both models (each uses 4 GPUs internally for parallel inference)
+    t0 = time.time()
+    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
+    sft_output = os.path.join(OUTPUT_DIR, "inference_results_phyx")
+    # Run base model on GPUs 0-3 (4 workers in parallel)
+    pass  # SKIP BASE
+    # Run SFT model on GPUs 4-7 (4 workers in parallel)
+    print("\n>>> Starting SFT model inference...", flush=True)
+    run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)
+    # Merge results
+    base_files = [f"{base_output}_gpu{g}.jsonl" for g in BASE_GPUS]
+    sft_files = [f"{sft_output}_gpu{g}.jsonl" for g in SFT_GPUS]
+    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
+    sft_final = os.path.join(OUTPUT_DIR, "inference_results_coldstart.jsonl")
+    base_results = []
+    sft_results = merge_results(sft_files, sft_final)
+    elapsed = time.time() - t0
+    print(f"\n{'=' * 60}")
+    print(f"  INFERENCE COMPLETE in {elapsed/60:.1f} min")
+    print(f"  Base results: {len(base_results)} → {base_final}")
+    print(f"  SFT results:  {len(sft_results)} → {sft_final}")
+    print(f"{'=' * 60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_fullft_math_nf_old.py ADDED Viewed

	@@ -0,0 +1,259 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs both Base and SFT models on the 1533 open-ended physics test set.
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl
+    sft_eval_footprint/inference_results_sft.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+os.environ["HF_HUB_OFFLINE"] = "1"  # keep huggingface_hub off the network
+os.environ["TRANSFORMERS_OFFLINE"] = "1"  # same for transformers
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"  # local checkpoint dir of the base model
+SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft/final"  # local checkpoint dir of the fine-tuned model
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"  # JSONL test set, one sample per line
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"  # per-GPU shards and merged results go here
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"  # joined with each sample's 'image' field
+# Multi-GPU config: one worker process per listed GPU; base on GPUs 0-3, SFT on GPUs 4-7
+BASE_GPUS = [0, 1, 2, 3]
+SFT_GPUS = [4, 5, 6, 7]
+MAX_NEW_TOKENS = 2048  # max_new_tokens passed to model.generate for every sample
+# ================================
+def load_test_data():
+    """Load test samples from JSONL."""
+    samples = []
+    with open(TEST_FILE, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    return samples
+def build_open_ended_prompt(sample):
+    """Build an open-ended prompt (no MCQ options)."""
+    desc = sample.get('description', '')
+    question = sample.get('question', '')
+    prompt = f"""Look at the image and answer the physics question.
+{desc}
+{question}
+Please reason step by step, and put your final answer within \\boxed{{}}.
+"""
+    return prompt.strip()
+def worker_inference(gpu_id, model_path, samples, output_file, model_name):
+    """Worker: load model on specific GPU and run inference on assigned samples."""
+    import torch
+    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from qwen_vl_utils import process_vision_info
+    from PIL import Image
+    device = f"cuda:{gpu_id}"
+    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)
+    processor = AutoProcessor.from_pretrained(
+        model_path,
+        min_pixels=3136,
+        max_pixels=200704,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        device_map=device,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model.eval()
+    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)
+    results = []
+    for i, sample in enumerate(samples):
+        idx = sample['index']
+        prompt_text = build_open_ended_prompt(sample)
+        image_path = os.path.join(IMAGE_DIR, sample['image'])
+        # Build messages
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": f"file://{image_path}"},
+                {"type": "text", "text": prompt_text},
+            ],
+        }]
+        try:
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            ).to(device)
+            with torch.no_grad():
+                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
+            generated = output_ids[0][inputs.input_ids.shape[1]:]
+            response = processor.decode(generated, skip_special_tokens=True)
+        except Exception as e:
+            response = f"ERROR: {str(e)}"
+        result = {
+            "index": idx,
+            "category": sample['category'],
+            "subfield": sample.get('subfield', ''),
+            "question": sample['question'],
+            "ground_truth_value": sample['ground_truth_value'],
+            "ground_truth_letter": sample.get('ground_truth_letter', ''),
+            "model_output": response,
+            "model_name": model_name,
+            "gpu_id": gpu_id,
+        }
+        results.append(result)
+        if (i + 1) % 20 == 0 or (i + 1) == len(samples):
+            print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)
+    # Write results
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
+    return len(results)
+def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
+    """Split samples across GPUs and run in parallel."""
+    n = len(samples)
+    k = len(gpu_ids)
+    chunk_size = (n + k - 1) // k
+    processes = []
+    output_files = []
+    for i, gpu_id in enumerate(gpu_ids):
+        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
+        if not chunk:
+            continue
+        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
+        output_files.append(out_file)
+        p = mp.Process(
+            target=worker_inference,
+            args=(gpu_id, model_path, chunk, out_file, model_name)
+        )
+        processes.append(p)
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+    return output_files
+def merge_results(output_files, final_output):
+    """Merge per-GPU result files into one."""
+    all_results = []
+    for f in output_files:
+        if os.path.exists(f):
+            with open(f, 'r', encoding='utf-8') as fh:
+                for line in fh:
+                    if line.strip():
+                        all_results.append(json.loads(line))
+    # Sort by index for consistency
+    all_results.sort(key=lambda x: x['index'])
+    with open(final_output, 'w', encoding='utf-8') as f:
+        for r in all_results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    # Cleanup per-GPU files
+    for f in output_files:
+        if os.path.exists(f):
+            os.remove(f)
+    return all_results
+def main():
+    mp.set_start_method('spawn', force=True)
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    print("=" * 60)
+    print("  OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
+    print(f"  Base model: {BASE_MODEL}")
+    print(f"  SFT model:  {SFT_MODEL}")
+    print(f"  Base GPUs:  {BASE_GPUS}")
+    print(f"  SFT GPUs:   {SFT_GPUS}")
+    print("=" * 60)
+    # Load test data
+    samples = load_test_data()
+    print(f"\nLoaded {len(samples)} test samples")
+    cats = Counter(s['category'] for s in samples)
+    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
+        print(f"  {cat}: {cnt}")
+    # Run both models (each uses 4 GPUs internally for parallel inference)
+    t0 = time.time()
+    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
+    sft_output = os.path.join(OUTPUT_DIR, "inference_results_sft")
+    # Run base model on GPUs 0-3 (4 workers in parallel)
+    print("\n>>> Starting BASE model inference...", flush=True)
+    run_model_parallel(BASE_MODEL, "base", BASE_GPUS, samples, base_output)
+    # Run SFT model on GPUs 4-7 (4 workers in parallel)
+    print("\n>>> Starting SFT model inference...", flush=True)
+    run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)
+    # Merge results
+    base_files = [f"{base_output}_gpu{g}.jsonl" for g in BASE_GPUS]
+    sft_files = [f"{sft_output}_gpu{g}.jsonl" for g in SFT_GPUS]
+    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
+    sft_final = os.path.join(OUTPUT_DIR, "inference_results_sft.jsonl")
+    base_results = merge_results(base_files, base_final)
+    sft_results = merge_results(sft_files, sft_final)
+    elapsed = time.time() - t0
+    print(f"\n{'=' * 60}")
+    print(f"  INFERENCE COMPLETE in {elapsed/60:.1f} min")
+    print(f"  Base results: {len(base_results)} → {base_final}")
+    print(f"  SFT results:  {len(sft_results)} → {sft_final}")
+    print(f"{'=' * 60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_fullft_math_nf_old_final.py ADDED Viewed

	@@ -0,0 +1,258 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs the SFT model on the 1533 open-ended physics test set; base-model
+inference is skipped in main().
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl (not written by this script; the base run is skipped)
+    sft_eval_footprint/inference_results_phyx_50000.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+os.environ["HF_HUB_OFFLINE"] = "1"  # keep huggingface_hub off the network
+os.environ["TRANSFORMERS_OFFLINE"] = "1"  # same for transformers
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"  # local checkpoint dir of the base model
+SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft_phyx_50000/final"  # local checkpoint dir of the fine-tuned model
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"  # JSONL test set, one sample per line
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"  # per-GPU shards and merged results go here
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"  # joined with each sample's 'image' field
+# Multi-GPU config: SFT workers run on GPUs 4-7; main() skips the base run, so BASE_GPUS is not used for inference
+BASE_GPUS = [0, 1, 2, 3]
+SFT_GPUS = [4, 5, 6, 7]
+MAX_NEW_TOKENS = 2048  # max_new_tokens passed to model.generate for every sample
+# ================================
+def load_test_data():
+    """Load test samples from JSONL."""
+    samples = []
+    with open(TEST_FILE, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    return samples
+def build_open_ended_prompt(sample):
+    """Build an open-ended prompt (no MCQ options)."""
+    desc = sample.get('description', '')
+    question = sample.get('question', '')
+    prompt = f"""Look at the image and answer the physics question.
+{desc}
+{question}
+Please reason step by step, and put your final answer within \\boxed{{}}.
+"""
+    return prompt.strip()
+def worker_inference(gpu_id, model_path, samples, output_file, model_name):
+    """Worker: load model on specific GPU and run inference on assigned samples."""
+    import torch
+    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from qwen_vl_utils import process_vision_info
+    from PIL import Image
+    device = f"cuda:{gpu_id}"
+    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)
+    processor = AutoProcessor.from_pretrained(
+        model_path,
+        min_pixels=3136,
+        max_pixels=200704,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        device_map=device,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model.eval()
+    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)
+    results = []
+    for i, sample in enumerate(samples):
+        idx = sample['index']
+        prompt_text = build_open_ended_prompt(sample)
+        image_path = os.path.join(IMAGE_DIR, sample['image'])
+        # Build messages
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": f"file://{image_path}"},
+                {"type": "text", "text": prompt_text},
+            ],
+        }]
+        try:
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            ).to(device)
+            with torch.no_grad():
+                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
+            generated = output_ids[0][inputs.input_ids.shape[1]:]
+            response = processor.decode(generated, skip_special_tokens=True)
+        except Exception as e:
+            response = f"ERROR: {str(e)}"
+        result = {
+            "index": idx,
+            "category": sample['category'],
+            "subfield": sample.get('subfield', ''),
+            "question": sample['question'],
+            "ground_truth_value": sample['ground_truth_value'],
+            "ground_truth_letter": sample.get('ground_truth_letter', ''),
+            "model_output": response,
+            "model_name": model_name,
+            "gpu_id": gpu_id,
+        }
+        results.append(result)
+        if (i + 1) % 20 == 0 or (i + 1) == len(samples):
+            print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)
+    # Write results
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
+    return len(results)
+def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
+    """Split samples across GPUs and run in parallel."""
+    n = len(samples)
+    k = len(gpu_ids)
+    chunk_size = (n + k - 1) // k
+    processes = []
+    output_files = []
+    for i, gpu_id in enumerate(gpu_ids):
+        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
+        if not chunk:
+            continue
+        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
+        output_files.append(out_file)
+        p = mp.Process(
+            target=worker_inference,
+            args=(gpu_id, model_path, chunk, out_file, model_name)
+        )
+        processes.append(p)
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+    return output_files
+def merge_results(output_files, final_output):
+    """Merge per-GPU result files into one."""
+    all_results = []
+    for f in output_files:
+        if os.path.exists(f):
+            with open(f, 'r', encoding='utf-8') as fh:
+                for line in fh:
+                    if line.strip():
+                        all_results.append(json.loads(line))
+    # Sort by index for consistency
+    all_results.sort(key=lambda x: x['index'])
+    with open(final_output, 'w', encoding='utf-8') as f:
+        for r in all_results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    # Cleanup per-GPU files
+    for f in output_files:
+        if os.path.exists(f):
+            os.remove(f)
+    return all_results
+def main():
+    mp.set_start_method('spawn', force=True)
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    print("=" * 60)
+    print("  OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
+    print(f"  Base model: {BASE_MODEL}")
+    print(f"  SFT model:  {SFT_MODEL}")
+    print(f"  Base GPUs:  {BASE_GPUS}")
+    print(f"  SFT GPUs:   {SFT_GPUS}")
+    print("=" * 60)
+    # Load test data
+    samples = load_test_data()
+    print(f"\nLoaded {len(samples)} test samples")
+    cats = Counter(s['category'] for s in samples)
+    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
+        print(f"  {cat}: {cnt}")
+    # Run both models (each uses 4 GPUs internally for parallel inference)
+    t0 = time.time()
+    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
+    sft_output = os.path.join(OUTPUT_DIR, "inference_results_phyx_50000")
+    # Run base model on GPUs 0-3 (4 workers in parallel)
+    pass  # SKIP BASE
+    # Run SFT model on GPUs 4-7 (4 workers in parallel)
+    print("\n>>> Starting SFT model inference...", flush=True)
+    run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)
+    # Merge results
+    base_files = [f"{base_output}_gpu{g}.jsonl" for g in BASE_GPUS]
+    sft_files = [f"{sft_output}_gpu{g}.jsonl" for g in SFT_GPUS]
+    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
+    sft_final = os.path.join(OUTPUT_DIR, "inference_results_phyx_50000.jsonl")
+    base_results = []
+    sft_results = merge_results(sft_files, sft_final)
+    elapsed = time.time() - t0
+    print(f"\n{'=' * 60}")
+    print(f"  INFERENCE COMPLETE in {elapsed/60:.1f} min")
+    print(f"  Base results: {len(base_results)} → {base_final}")
+    print(f"  SFT results:  {len(sft_results)} → {sft_final}")
+    print(f"{'=' * 60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_fullft_phyx_math_nf.py ADDED Viewed

	@@ -0,0 +1,258 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs the SFT model on the 1533 open-ended physics test set; base-model
+inference is skipped in main().
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl (not written by this script; the base run is skipped)
+    sft_eval_footprint/inference_results_combined_v3.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+os.environ["HF_HUB_OFFLINE"] = "1"  # keep huggingface_hub off the network
+os.environ["TRANSFORMERS_OFFLINE"] = "1"  # same for transformers
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"  # local checkpoint dir of the base model
+SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_combined_v3/final"  # local checkpoint dir of the fine-tuned model
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"  # JSONL test set, one sample per line
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"  # per-GPU shards and merged results go here
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"  # joined with each sample's 'image' field
+# Multi-GPU config: SFT workers run on all of GPUs 0-7; main() skips the base run, so BASE_GPUS is not used for inference
+BASE_GPUS = [0, 1, 2, 3]
+SFT_GPUS = [0, 1, 2, 3, 4, 5, 6, 7]
+MAX_NEW_TOKENS = 2048  # max_new_tokens passed to model.generate for every sample
+# ================================
+def load_test_data():
+    """Load test samples from JSONL."""
+    samples = []
+    with open(TEST_FILE, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    return samples
+def build_open_ended_prompt(sample):
+    """Build an open-ended prompt (no MCQ options)."""
+    desc = sample.get('description', '')
+    question = sample.get('question', '')
+    prompt = f"""Look at the image and answer the physics question.
+{desc}
+{question}
+Please reason step by step, and put your final answer within \\boxed{{}}.
+"""
+    return prompt.strip()
+def worker_inference(gpu_id, model_path, samples, output_file, model_name):
+    """Worker: load model on specific GPU and run inference on assigned samples."""
+    import torch
+    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from qwen_vl_utils import process_vision_info
+    from PIL import Image
+    device = f"cuda:{gpu_id}"
+    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)
+    processor = AutoProcessor.from_pretrained(
+        model_path,
+        min_pixels=3136,
+        max_pixels=200704,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        device_map=device,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model.eval()
+    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)
+    results = []
+    for i, sample in enumerate(samples):
+        idx = sample['index']
+        prompt_text = build_open_ended_prompt(sample)
+        image_path = os.path.join(IMAGE_DIR, sample['image'])
+        # Build messages
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": f"file://{image_path}"},
+                {"type": "text", "text": prompt_text},
+            ],
+        }]
+        try:
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            ).to(device)
+            with torch.no_grad():
+                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
+            generated = output_ids[0][inputs.input_ids.shape[1]:]
+            response = processor.decode(generated, skip_special_tokens=True)
+        except Exception as e:
+            response = f"ERROR: {str(e)}"
+        result = {
+            "index": idx,
+            "category": sample['category'],
+            "subfield": sample.get('subfield', ''),
+            "question": sample['question'],
+            "ground_truth_value": sample['ground_truth_value'],
+            "ground_truth_letter": sample.get('ground_truth_letter', ''),
+            "model_output": response,
+            "model_name": model_name,
+            "gpu_id": gpu_id,
+        }
+        results.append(result)
+        if (i + 1) % 20 == 0 or (i + 1) == len(samples):
+            print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)
+    # Write results
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
+    return len(results)
+def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
+    """Split samples across GPUs and run in parallel."""
+    n = len(samples)
+    k = len(gpu_ids)
+    chunk_size = (n + k - 1) // k
+    processes = []
+    output_files = []
+    for i, gpu_id in enumerate(gpu_ids):
+        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
+        if not chunk:
+            continue
+        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
+        output_files.append(out_file)
+        p = mp.Process(
+            target=worker_inference,
+            args=(gpu_id, model_path, chunk, out_file, model_name)
+        )
+        processes.append(p)
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+    return output_files
+def merge_results(output_files, final_output):
+    """Merge per-GPU result files into one."""
+    all_results = []
+    for f in output_files:
+        if os.path.exists(f):
+            with open(f, 'r', encoding='utf-8') as fh:
+                for line in fh:
+                    if line.strip():
+                        all_results.append(json.loads(line))
+    # Sort by index for consistency
+    all_results.sort(key=lambda x: x['index'])
+    with open(final_output, 'w', encoding='utf-8') as f:
+        for r in all_results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    # Cleanup per-GPU files
+    for f in output_files:
+        if os.path.exists(f):
+            os.remove(f)
+    return all_results
+def main():
+    mp.set_start_method('spawn', force=True)
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    print("=" * 60)
+    print("  OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
+    print(f"  Base model: {BASE_MODEL}")
+    print(f"  SFT model:  {SFT_MODEL}")
+    print(f"  Base GPUs:  {BASE_GPUS}")
+    print(f"  SFT GPUs:   {SFT_GPUS}")
+    print("=" * 60)
+    # Load test data
+    samples = load_test_data()
+    print(f"\nLoaded {len(samples)} test samples")
+    cats = Counter(s['category'] for s in samples)
+    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
+        print(f"  {cat}: {cnt}")
+    # Run both models (each uses 4 GPUs internally for parallel inference)
+    t0 = time.time()
+    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
+    sft_output = os.path.join(OUTPUT_DIR, "inference_results_combined_v3")
+    # Run base model on GPUs 0-3 (4 workers in parallel)
+    pass  # Skip base
+    # Run SFT model on GPUs 4-7 (4 workers in parallel)
+    print("\n>>> Starting SFT model inference...", flush=True)
+    run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)
+    # Merge results
+    base_files = [f"{base_output}_gpu{g}.jsonl" for g in BASE_GPUS]
+    sft_files = [f"{sft_output}_gpu{g}.jsonl" for g in SFT_GPUS]
+    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
+    sft_final = os.path.join(OUTPUT_DIR, "inference_results_combined_v3.jsonl")
+    base_results = []
+    sft_results = merge_results(sft_files, sft_final)
+    elapsed = time.time() - t0
+    print(f"\n{'=' * 60}")
+    print(f"  INFERENCE COMPLETE in {elapsed/60:.1f} min")
+    print(f"  Base results: {len(base_results)} → {base_final}")
+    print(f"  SFT results:  {len(sft_results)} → {sft_final}")
+    print(f"{'=' * 60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_fullft_phyx_nf.py ADDED Viewed

	@@ -0,0 +1,258 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs both Base and SFT models on the 1533 open-ended physics test set.
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl
+    sft_eval_footprint/inference_results_phyx_fullft_v3.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+os.environ["HF_HUB_OFFLINE"] = "1"  # keep huggingface_hub off the network
+os.environ["TRANSFORMERS_OFFLINE"] = "1"  # same for transformers
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
+SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_phyx_fullft_v3/final"
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"  # JSONL test set, one sample per line
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"  # joined with each sample's 'image' field
+# Multi-GPU config: BASE_GPUS lists GPUs 0-3, SFT_GPUS lists all of GPUs 0-7
+BASE_GPUS = [0, 1, 2, 3]
+SFT_GPUS = [0, 1, 2, 3, 4, 5, 6, 7]
+MAX_NEW_TOKENS = 2048  # presumably the per-sample generation cap; its use is outside this view
+# ================================
+def load_test_data():
+    """Load test samples from JSONL."""
+    samples = []
+    with open(TEST_FILE, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    return samples
def build_open_ended_prompt(sample):
    """Assemble the open-ended (no multiple-choice options) prompt text.

    Args:
        sample: mapping that may contain 'description' and 'question';
            a missing key is rendered as an empty line.

    Returns:
        str: instruction line, description, question and the boxed-answer
        request, joined by newlines and stripped of surrounding whitespace.
    """
    desc = sample.get('description', '')
    question = sample.get('question', '')
    lines = [
        "Look at the image and answer the physics question.",
        f"{desc}",
        f"{question}",
        "Please reason step by step, and put your final answer within \\boxed{}.",
    ]
    return "\n".join(lines).strip()
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Generate responses for one shard of samples on a single GPU.

    Loads the processor and model from ``model_path`` onto the CUDA device
    with index ``gpu_id``, generates one response per sample, and writes all
    records to ``output_file`` as JSONL after the whole shard is processed.

    Args:
        gpu_id: CUDA device index used by this worker.
        model_path: checkpoint directory (loaded with local_files_only=True).
        samples: list of sample dicts; each must provide 'index', 'image',
            'category', 'question' and 'ground_truth_value'.
        output_file: path of the JSONL shard file to write.
        model_name: label used in log lines and stored in every record.

    Returns:
        int: number of records written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info
    from PIL import Image

    tag = f"[{model_name}][GPU {gpu_id}]"
    device = f"cuda:{gpu_id}"
    total = len(samples)

    print(f"{tag} Loading model...", flush=True)
    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

    records = []
    for done, sample in enumerate(samples, start=1):
        sample_index = sample['index']
        prompt_text = build_open_ended_prompt(sample)
        image_path = os.path.join(IMAGE_DIR, sample['image'])
        # Single user turn: the image followed by the text prompt.
        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{image_path}"},
                {"type": "text", "text": prompt_text},
            ],
        }
        messages = [user_turn]
        try:
            chat_text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[chat_text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            ).to(device)
            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
            # Drop the prompt tokens so only the newly generated part is decoded.
            prompt_len = inputs.input_ids.shape[1]
            response = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
        except Exception as e:
            # A failing sample is recorded as an error string; the shard continues.
            response = f"ERROR: {str(e)}"

        records.append({
            "index": sample_index,
            "category": sample['category'],
            "subfield": sample.get('subfield', ''),
            "question": sample['question'],
            "ground_truth_value": sample['ground_truth_value'],
            "ground_truth_letter": sample.get('ground_truth_letter', ''),
            "model_output": response,
            "model_name": model_name,
            "gpu_id": gpu_id,
        })
        if done % 20 == 0 or done == total:
            print(f"{tag} {done}/{total} done", flush=True)

    # Write the shard file in one pass.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(rec, ensure_ascii=False) + '\n' for rec in records)
    print(f"{tag} Saved {len(records)} results to {output_file}", flush=True)
    return len(records)
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Shard ``samples`` across ``gpu_ids`` and run one worker process per shard.

    Samples are split into contiguous chunks of ceil(len(samples) / len(gpu_ids))
    items; GPUs whose chunk would be empty get no worker. All workers are
    started first and then joined.

    Args:
        model_path: checkpoint path passed to every worker.
        model_name: label passed to every worker and used in warnings.
        gpu_ids: GPU indices to use; may be empty.
        samples: list of samples to distribute; may be empty.
        output_base: path prefix; each shard is written to
            ``{output_base}_gpu{gpu_id}.jsonl``.

    Returns:
        list[str]: shard file paths assigned to workers, in GPU order.
        Empty when there are no GPUs or no samples.
    """
    if not gpu_ids or not samples:
        # Nothing to schedule; also avoids dividing by zero for an empty GPU list.
        return []

    n = len(samples)
    k = len(gpu_ids)
    chunk_size = (n + k - 1) // k  # ceiling division

    jobs = []
    output_files = []
    for slot, gpu_id in enumerate(gpu_ids):
        chunk = samples[slot * chunk_size: (slot + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name),
        )
        jobs.append((gpu_id, proc))

    for _, proc in jobs:
        proc.start()
    for gpu_id, proc in jobs:
        proc.join()
        if proc.exitcode != 0:
            # A crashed worker writes no shard file; make that visible in the log.
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                f"{proc.exitcode}; its shard may be missing",
                flush=True,
            )
    return output_files
def merge_results(output_files, final_output):
    """Combine per-GPU JSONL shards into one file sorted by 'index'.

    Shard paths that do not exist are skipped. After the merged file is
    written, every shard file that exists is deleted.

    Args:
        output_files: iterable of shard file paths.
        final_output: path of the merged JSONL file to write.

    Returns:
        list: all merged records, sorted by their 'index' value.
    """
    merged = []
    for shard_path in output_files:
        if not os.path.exists(shard_path):
            continue
        with open(shard_path, 'r', encoding='utf-8') as shard:
            merged.extend(json.loads(row) for row in shard if row.strip())

    # Stable ordering regardless of which GPU produced which record.
    merged.sort(key=lambda rec: rec['index'])

    with open(final_output, 'w', encoding='utf-8') as out:
        out.writelines(json.dumps(rec, ensure_ascii=False) + '\n' for rec in merged)

    # Remove the per-GPU shard files now that they are merged.
    for shard_path in output_files:
        if os.path.exists(shard_path):
            os.remove(shard_path)
    return merged
def main():
    """Run SFT-model inference over the test set and merge the per-GPU shards.

    Base-model inference is not run by this script, so the base result count
    printed in the final summary is always 0.
    """
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("=" * 60)
    print("  OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f"  Base model: {BASE_MODEL}")
    print(f"  SFT model:  {SFT_MODEL}")
    print(f"  Base GPUs:  {BASE_GPUS}")
    print(f"  SFT GPUs:   {SFT_GPUS}")
    print("=" * 60)

    # Load test data and show the per-category sample counts.
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")
    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f"  {cat}: {cnt}")

    t0 = time.time()
    sft_output = os.path.join(OUTPUT_DIR, "inference_results_phyx_fullft_v3")
    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_phyx_fullft_v3.jsonl")

    # Run the SFT model with one worker per GPU listed in SFT_GPUS.
    print("\n>>> Starting SFT model inference...", flush=True)
    # Merge exactly the shards scheduled by this run, so a leftover shard file
    # from an earlier run cannot be picked up by accident.
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    base_results = []  # base inference is skipped in this script
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f"  INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(f"  Base results: {len(base_results)} → {base_final}")
    print(f"  SFT results:  {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_inference_lora_math_f.py ADDED Viewed

	@@ -0,0 +1,262 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs both Base and SFT models on the 1533 open-ended physics test set.
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl
+    sft_eval_footprint/inference_results_lora_math_f.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+os.environ["HF_HUB_OFFLINE"] = "1"
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
+SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/lora_math_f/merged"
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
+# Multi-GPU config: BASE_GPUS is empty, so base inference is skipped; SFT inference uses all 8 GPUs
+BASE_GPUS = []
+SFT_GPUS = [0, 1, 2, 3, 4, 5, 6, 7]
+MAX_NEW_TOKENS = 2048
+# ================================
def load_test_data():
    """Read the test set from TEST_FILE.

    Every non-blank line of the JSONL file is parsed as one JSON value.

    Returns:
        list: the parsed samples, in file order.
    """
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
def build_open_ended_prompt(sample):
    """Assemble the open-ended (no multiple-choice options) prompt text.

    Args:
        sample: mapping that may contain 'description' and 'question';
            a missing key is rendered as an empty line.

    Returns:
        str: instruction line, description, question and the boxed-answer
        request, joined by newlines and stripped of surrounding whitespace.
    """
    desc = sample.get('description', '')
    question = sample.get('question', '')
    lines = [
        "Look at the image and answer the physics question.",
        f"{desc}",
        f"{question}",
        "Please reason step by step, and put your final answer within \\boxed{}.",
    ]
    return "\n".join(lines).strip()
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Generate responses for one shard of samples on a single GPU.

    Loads the processor and model from ``model_path`` onto the CUDA device
    with index ``gpu_id``, generates one response per sample, and writes all
    records to ``output_file`` as JSONL after the whole shard is processed.

    Args:
        gpu_id: CUDA device index used by this worker.
        model_path: checkpoint directory (loaded with local_files_only=True).
        samples: list of sample dicts; each must provide 'index', 'image',
            'category', 'question' and 'ground_truth_value'.
        output_file: path of the JSONL shard file to write.
        model_name: label used in log lines and stored in every record.

    Returns:
        int: number of records written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info
    from PIL import Image

    tag = f"[{model_name}][GPU {gpu_id}]"
    device = f"cuda:{gpu_id}"
    total = len(samples)

    print(f"{tag} Loading model...", flush=True)
    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

    records = []
    for done, sample in enumerate(samples, start=1):
        sample_index = sample['index']
        prompt_text = build_open_ended_prompt(sample)
        image_path = os.path.join(IMAGE_DIR, sample['image'])
        # Single user turn: the image followed by the text prompt.
        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{image_path}"},
                {"type": "text", "text": prompt_text},
            ],
        }
        messages = [user_turn]
        try:
            chat_text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[chat_text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            ).to(device)
            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
            # Drop the prompt tokens so only the newly generated part is decoded.
            prompt_len = inputs.input_ids.shape[1]
            response = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
        except Exception as e:
            # A failing sample is recorded as an error string; the shard continues.
            response = f"ERROR: {str(e)}"

        records.append({
            "index": sample_index,
            "category": sample['category'],
            "subfield": sample.get('subfield', ''),
            "question": sample['question'],
            "ground_truth_value": sample['ground_truth_value'],
            "ground_truth_letter": sample.get('ground_truth_letter', ''),
            "model_output": response,
            "model_name": model_name,
            "gpu_id": gpu_id,
        })
        if done % 20 == 0 or done == total:
            print(f"{tag} {done}/{total} done", flush=True)

    # Write the shard file in one pass.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(rec, ensure_ascii=False) + '\n' for rec in records)
    print(f"{tag} Saved {len(records)} results to {output_file}", flush=True)
    return len(records)
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Shard ``samples`` across ``gpu_ids`` and run one worker process per shard.

    Samples are split into contiguous chunks of ceil(len(samples) / len(gpu_ids))
    items; GPUs whose chunk would be empty get no worker. All workers are
    started first and then joined.

    Args:
        model_path: checkpoint path passed to every worker.
        model_name: label passed to every worker and used in warnings.
        gpu_ids: GPU indices to use; may be empty.
        samples: list of samples to distribute; may be empty.
        output_base: path prefix; each shard is written to
            ``{output_base}_gpu{gpu_id}.jsonl``.

    Returns:
        list[str]: shard file paths assigned to workers, in GPU order.
        Empty when there are no GPUs or no samples.
    """
    if not gpu_ids or not samples:
        # Nothing to schedule; also avoids dividing by zero for an empty GPU list.
        return []

    n = len(samples)
    k = len(gpu_ids)
    chunk_size = (n + k - 1) // k  # ceiling division

    jobs = []
    output_files = []
    for slot, gpu_id in enumerate(gpu_ids):
        chunk = samples[slot * chunk_size: (slot + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name),
        )
        jobs.append((gpu_id, proc))

    for _, proc in jobs:
        proc.start()
    for gpu_id, proc in jobs:
        proc.join()
        if proc.exitcode != 0:
            # A crashed worker writes no shard file; make that visible in the log.
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                f"{proc.exitcode}; its shard may be missing",
                flush=True,
            )
    return output_files
def merge_results(output_files, final_output):
    """Combine per-GPU JSONL shards into one file sorted by 'index'.

    Shard paths that do not exist are skipped. After the merged file is
    written, every shard file that exists is deleted.

    Args:
        output_files: iterable of shard file paths.
        final_output: path of the merged JSONL file to write.

    Returns:
        list: all merged records, sorted by their 'index' value.
    """
    merged = []
    for shard_path in output_files:
        if not os.path.exists(shard_path):
            continue
        with open(shard_path, 'r', encoding='utf-8') as shard:
            merged.extend(json.loads(row) for row in shard if row.strip())

    # Stable ordering regardless of which GPU produced which record.
    merged.sort(key=lambda rec: rec['index'])

    with open(final_output, 'w', encoding='utf-8') as out:
        out.writelines(json.dumps(rec, ensure_ascii=False) + '\n' for rec in merged)

    # Remove the per-GPU shard files now that they are merged.
    for shard_path in output_files:
        if os.path.exists(shard_path):
            os.remove(shard_path)
    return merged
def main():
    """Run inference over the test set and merge the per-GPU shards.

    The base model is run only when BASE_GPUS is non-empty; the SFT model is
    always run on SFT_GPUS. Each model's shards are merged into one JSONL
    file in OUTPUT_DIR.
    """
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("=" * 60)
    print("  OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f"  Base model: {BASE_MODEL}")
    print(f"  SFT model:  {SFT_MODEL}")
    print(f"  Base GPUs:  {BASE_GPUS}")
    print(f"  SFT GPUs:   {SFT_GPUS}")
    print("=" * 60)

    # Load test data and show the per-category sample counts.
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")
    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f"  {cat}: {cnt}")

    t0 = time.time()
    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
    sft_output = os.path.join(OUTPUT_DIR, "inference_results_lora_math_f")
    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_lora_math_f.jsonl")

    # Run base model (SKIPPED if BASE_GPUS is empty).
    base_files = []
    if BASE_GPUS:
        print("\n>>> Starting BASE model inference...", flush=True)
        base_files = run_model_parallel(BASE_MODEL, "base", BASE_GPUS, samples, base_output)
    else:
        print("\n>>> SKIPPING BASE model (BASE_GPUS is empty)", flush=True)

    # Run the SFT model with one worker per GPU listed in SFT_GPUS.
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards scheduled by this run, so a leftover shard file
    # from an earlier run cannot be picked up by accident.
    base_results = merge_results(base_files, base_final) if BASE_GPUS else []
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f"  INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(f"  Base results: {len(base_results)} → {base_final}")
    print(f"  SFT results:  {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_judge_fullft_math_nf.py ADDED Viewed

	@@ -0,0 +1,293 @@

+#!/usr/bin/env python3
+"""
+Score base and fullft inference results using DeepSeek-V3 judge (20 threads).
+PhyX-aligned pipeline: extract -> string match -> LLM judge (5 retries).
+"""
+import json, os, re, time, sys
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import urllib.request
+import urllib.error
# ===================== CONFIG =====================
# The API key is read from the DEEPSEEK_API_KEY environment variable so that
# no credential is stored in the source tree. Any key that was previously
# committed to this file should be treated as compromised and rotated.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_MODEL = "deepseek-chat"
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
OUTPUT_DIR = "/data1/dhelix_shared/hku/rl4phyx/RL4Phyx/SFT/result"
COLDSTART_RESULTS = os.path.join(RESULTS_DIR, "inference_results_coldstart.jsonl")
# Sentinel string returned by the API wrapper when a request fails.
FAIL_MSG = 'Failed to obtain answer via API.'
# Maximum number of judge attempts per sample.
RETRY = 5
# ==================================================
def get_ICE():
    """Return the five in-context examples used in the judge prompt.

    Each example is a text block holding a ground-truth answer, a predicted
    answer and the expected judgement (1 = matched, 0 = unmatched).

    Returns:
        list[str]: the five example blocks, in a fixed order.
    """
    return [
        """
Ground truth answer: 502 \n
Predicted answer: The mass of block (B) is:
[
\\boxed{ 50 \\sqrt{101} }
] \n
Judegement: 1
""",
        """
Ground truth answer: 46.3 kN \n
Predicted answer: The tension ( T_B ) in the cable is approximately:
[
\\boxed{46300 }
] \n
Judegement: 1
""",
        """
Ground truth answer: 12 m/s \n
Predicted answer: The speed of the box after 2.00 seconds is:
[
\\boxed{11.3, \\text{m/s}}
] \n
Judegement: 0
""",
        """
Ground truth answer: 36.00 kg \n
Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
[
\\boxed{36.1, \\text\\{kg\\}}
] \n
Judegement: 1
""",
        """
Ground truth answer: 3.2 m \n
Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
Judegement: 1
""",
    ]
def build_phyx_gpt4_prompt(gt_answer, pred):
    """Build the few-shot judge prompt for one ground-truth/prediction pair.

    The prompt is the task description, every example from get_ICE() (each
    followed by a newline), the pair to judge, and a trailing 'Judegement:'
    cue for the model to complete.

    Args:
        gt_answer: ground-truth answer, formatted with str.format.
        pred: predicted answer text, formatted with str.format.

    Returns:
        str: the complete prompt.
    """
    task_description = """
Please read the following example. Given predicted answer and ground truth answer,
compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
If the given predicted mentions "approximately", then allow the Approximation Error, \
such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n
"""
    pieces = [task_description]
    pieces.extend(example + '\n' for example in get_ICE())
    pieces.append('Ground truth answer: {} \n'.format(gt_answer))
    pieces.append('Predicted answer: {} \n'.format(pred))
    pieces.append('Judegement:')
    return ''.join(pieces)
def mapping_str(input_str):
    """Normalise a few LaTeX tokens in an extracted answer.

    The dfrac command is rewritten to frac, and then the pi command is
    replaced by the literal text 3.14, in that order.

    Args:
        input_str: extracted answer text. Non-string values are returned
            unchanged rather than raising.

    Returns:
        The normalised string, or ``input_str`` itself if it is not a string.
    """
    if not isinstance(input_str, str):
        # Explicit guard replaces the former bare ``except: pass``.
        return input_str
    return input_str.replace("\\dfrac", "\\frac").replace("\\pi", "3.14")
def extract_boxed_content(s):
    r"""Return the contents of the first ``\boxed{...}`` in ``s``.

    Nested braces inside the box are balanced, so the result ends at the
    brace that closes the box itself.

    Args:
        s: text to search.

    Returns:
        str | None: the text between the braces, or None when there is no
        ``\boxed{`` marker or its closing brace is never found.
    """
    marker = r'\boxed{'
    marker_pos = s.find(marker)
    if marker_pos < 0:
        return None

    begin = marker_pos + len(marker)
    open_braces = 0  # braces opened inside the box and not yet closed
    for end in range(begin, len(s)):
        char = s[end]
        if char == '{':
            open_braces += 1
        elif char == '}':
            if open_braces == 0:
                return s[begin:end]
            open_braces -= 1
    return None
def PhyX_process_line(prediction_str, gt_answer):
    """Extract the final answer from a prediction and do a string-level match.

    Extraction order: the first boxed expression; otherwise the text after a
    'final answer' / 'correct answer' label; otherwise the marker
    "SAME as predict".

    Args:
        prediction_str: raw model output.
        gt_answer: ground-truth answer (converted with str()).

    Returns:
        dict with keys 'gt', 'pred' (stripped prediction), 'extracted' and
        'match' (1 when the ground truth equals the extracted or full
        prediction case-insensitively, or occurs verbatim in the prediction;
        otherwise 0).
    """
    record = {'gt': str(gt_answer), 'pred': prediction_str.strip()}
    pred = record['pred']

    # A prediction equal to the API failure sentinel cannot be scored.
    if pred == FAIL_MSG:
        record['match'] = 0
        record["extracted"] = "Fail to Call API"
        return record

    boxed = extract_boxed_content(pred)
    if boxed is not None:
        record["extracted"] = mapping_str(boxed)
    else:
        pattern = r'\b(?:final\s+answer|correct\s+answer)\b[^:：]*[:：]\s*(.*?)(?=\n\n\n|\Z)'
        found = re.search(pattern, pred, re.IGNORECASE | re.DOTALL)
        record["extracted"] = mapping_str(found.group(1)) if found else "SAME as predict"

    gt_norm = record['gt'].strip().lower()
    candidates = (record["extracted"].strip().lower(), pred.strip().lower())
    matched = gt_norm in candidates or record['gt'] in pred
    record['match'] = 1 if matched else 0
    return record
def call_deepseek(prompt, temperature=0.0):
    """Send one user message to the DeepSeek chat-completions endpoint.

    Args:
        prompt: text sent as the single user message.
        temperature: sampling temperature included in the request.

    Returns:
        str: the content of the first choice, or FAIL_MSG when the request
        or the parsing of its response fails for any reason.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
    }
    payload = json.dumps({
        "model": DEEPSEEK_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": 200,
    }).encode('utf-8')
    try:
        req = urllib.request.Request(DEEPSEEK_API_URL, data=payload, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())
        return body['choices'][0]['message']['content']
    except Exception:
        # Best-effort: network, HTTP and malformed-response errors all map to
        # the sentinel. ``Exception`` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return FAIL_MSG
def PhyX_auxeval(gt_answer, prediction):
    """Judge one prediction: string-level match first, then the LLM judge.

    The judge is called up to RETRY times with temperature i * 0.5 on
    attempt i. Its reply is parsed for a standalone 0/1 verdict rather than
    by substring, so digits inside numbers (for example the 1 in 11.3) are
    not mistaken for a verdict. A reply with no standalone verdict triggers
    another attempt.

    Args:
        gt_answer: ground-truth answer.
        prediction: raw model output.

    Returns:
        dict with 'log' (text trace of the decision), 'res' (1 = correct,
        0 = incorrect or undecided) and 'extracted' (the answer text that
        was compared).
    """
    log = ''
    tmp = PhyX_process_line(prediction, gt_answer)
    if tmp["extracted"] == "Fail to Call API":
        return dict(log="Fail to Call API", res=0, extracted="Fail to Call API")

    prediction_extracted = tmp["extracted"] if tmp["extracted"] != "SAME as predict" else prediction
    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)

    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
    # A verdict is a lone 0 or 1: not adjacent to another digit and not part
    # of a decimal number such as 0.5 or 3.1.
    verdict_pattern = re.compile(r'(?<!\d)(?<!\d\.)[01](?!\d)(?!\.\d)')
    for i in range(RETRY):
        res = call_deepseek(prompt, temperature=i * 0.5)
        if FAIL_MSG in res:
            log += f'Try {i}: failed.\n'
            continue
        log += 'Compared at semantic level. '
        # The prompt ends with "Judegement:", so a verdict at the very start
        # of the reply wins; otherwise use the last standalone verdict.
        leading = verdict_pattern.match(res.strip())
        if leading:
            verdict = leading.group(0)
        else:
            found = verdict_pattern.findall(res)
            verdict = found[-1] if found else None
        if verdict == "1":
            log += "Semantic equal via LLM."
            return dict(log=log, res=1, extracted=prediction_extracted)
        if verdict == "0":
            log += f"LLM judgement {res}"
            return dict(log=log, res=0, extracted=prediction_extracted)
    log += f'All {RETRY} retries failed.\n'
    return dict(log=log, res=0, extracted=prediction_extracted)
def _eval_single(args):
    """Score one result record in place.

    Args:
        args: tuple ``(idx, r)`` where ``r`` is a result dict providing
            'ground_truth_value' and 'model_output'.

    Returns:
        tuple: ``(idx, r)`` with 'extracted_answer', 'eval_log' and 'res'
        added to ``r``.
    """
    idx, r = args
    outcome = PhyX_auxeval(r['ground_truth_value'], r['model_output'])
    r.update(
        extracted_answer=outcome['extracted'],
        eval_log=outcome['log'],
        res=outcome['res'],
    )
    return idx, r
def score_results(results_file, model_name, output_file):
    """Score every record of a JSONL results file and write the scored copy.

    Records are evaluated by _eval_single on a pool of 20 threads. A record
    whose evaluation raises is marked incorrect with the error stored in its
    'eval_log', instead of being left without a score.

    Args:
        results_file: input JSONL file of inference results.
        model_name: label used in printed output and in the returned summary.
        output_file: path of the scored JSONL file to write.

    Returns:
        dict: summary with 'model', 'total', 'string_matches', 'llm_calls',
        'llm_matches', 'final_correct', 'final_acc' (percent, 2 decimals;
        0.0 for an empty input) and 'category_stats'.
    """
    with open(results_file, 'r', encoding='utf-8') as f:
        results = [json.loads(line) for line in f if line.strip()]

    print(f"\n{'='*60}")
    print(f"  Scoring: {model_name} ({len(results)} samples)")
    print(f"  Using PhyX-aligned pipeline with DeepSeek-V3 judge (20 threads)")
    print(f"{'='*60}")

    total = len(results)
    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = {executor.submit(_eval_single, (i, r)): i for i, r in enumerate(results)}
        for done, future in enumerate(as_completed(futures), start=1):
            try:
                future.result()
            except Exception as exc:
                # Surface worker failures instead of silently dropping them.
                failed = results[futures[future]]
                failed['extracted_answer'] = ''
                failed['eval_log'] = f'Evaluation error: {exc}'
                failed['res'] = 0
            if done % 100 == 0 or done == total:
                hit = sum(1 for r in results if r.get('res') == 1)
                print(f"  [{done}/{total}] processed, correct={hit}", flush=True)

    hit = 0
    string_match = 0
    llm_match = 0
    llm_called = 0
    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})
    for r in results:
        cat = r.get('category', 'unknown')
        cat_stats[cat]['total'] += 1
        if r.get('res') == 1:
            hit += 1
            cat_stats[cat]['correct'] += 1
        # The eval log records which stage of the pipeline decided the sample.
        log = r.get('eval_log', '')
        if "string level" in log:
            string_match += 1
        elif "semantic level" in log or "LLM judgement" in log:
            llm_called += 1
            if r.get('res') == 1:
                llm_match += 1

    # Guard against an empty results file.
    acc = hit / total * 100 if total else 0.0

    with open(output_file, 'w', encoding='utf-8') as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')

    print(f"\n  {model_name}: {hit}/{total} ({acc:.1f}%)")
    print(f"    String: {string_match}, LLM calls: {llm_called}, LLM match: {llm_match}")
    print(f"    Per category:")
    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
        print(f"      {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")

    return {
        'model': model_name, 'total': total,
        'string_matches': string_match, 'llm_calls': llm_called, 'llm_matches': llm_match,
        'final_correct': hit, 'final_acc': round(acc, 2),
        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
    }
def main():
    """Check the judge API, score the coldstart results and write a report.

    Exits with status 1 when the API probe call fails. Otherwise the scored
    JSONL file and report_coldstart.json are written to OUTPUT_DIR.
    """
    rule = "=" * 60
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(rule)
    print("  PhyX-ALIGNED EVAL: Coldstart")
    print(f"  Output: {OUTPUT_DIR}")
    print(rule)

    # Fail fast if the judge endpoint cannot be reached.
    print("\nTesting DeepSeek API...")
    probe = call_deepseek("Say 'OK' if you can read this.")
    if probe == FAIL_MSG:
        print(f"  API FAILED")
        sys.exit(1)
    print(f"  API OK: {probe[:50]}")

    scored_path = os.path.join(OUTPUT_DIR, "scored_results_coldstart.jsonl")
    cs_stats = score_results(COLDSTART_RESULTS, "Coldstart (50K math)", scored_path)

    report = {
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
        'coldstart': cs_stats,
    }
    report_file = os.path.join(OUTPUT_DIR, "report_coldstart.json")
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\n{rule}")
    print(f"  RESULTS")
    print(rule)
    print(f"  Coldstart accuracy: {cs_stats['final_acc']}%")
    print(f"  Report: {report_file}")
    print(rule)
+if __name__ == '__main__':
+    main()

eval_footprint/eval_judge_fullft_phyx_nf.py ADDED Viewed

	@@ -0,0 +1,296 @@

+#!/usr/bin/env python3
+"""
+Score base and fullft inference results using DeepSeek-V3 judge (20 threads).
+PhyX-aligned pipeline: extract -> string match -> LLM judge (5 retries).
+"""
+import json, os, re, time, sys
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import urllib.request
+import urllib.error
# ===================== CONFIG =====================
# The API key is read from the DEEPSEEK_API_KEY environment variable so that
# no credential is stored in the source tree. Any key that was previously
# committed to this file should be treated as compromised and rotated.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_MODEL = "deepseek-chat"
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
OUTPUT_DIR = "/data1/dhelix_shared/hku/rl4phyx/RL4Phyx/SFT/result"
BASE_RESULTS = os.path.join(RESULTS_DIR, "inference_results_base.jsonl")
FULLFT_RESULTS = os.path.join(RESULTS_DIR, "inference_results_sft.jsonl")
# Sentinel string returned by the API wrapper when a request fails.
FAIL_MSG = 'Failed to obtain answer via API.'
# Maximum number of judge attempts per sample.
RETRY = 5
# ==================================================
def get_ICE():
    """Return the five in-context examples used in the judge prompt.

    Each example is a text block holding a ground-truth answer, a predicted
    answer and the expected judgement (1 = matched, 0 = unmatched).

    Returns:
        list[str]: the five example blocks, in a fixed order.
    """
    return [
        """
Ground truth answer: 502 \n
Predicted answer: The mass of block (B) is:
[
\\boxed{ 50 \\sqrt{101} }
] \n
Judegement: 1
""",
        """
Ground truth answer: 46.3 kN \n
Predicted answer: The tension ( T_B ) in the cable is approximately:
[
\\boxed{46300 }
] \n
Judegement: 1
""",
        """
Ground truth answer: 12 m/s \n
Predicted answer: The speed of the box after 2.00 seconds is:
[
\\boxed{11.3, \\text{m/s}}
] \n
Judegement: 0
""",
        """
Ground truth answer: 36.00 kg \n
Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
[
\\boxed{36.1, \\text\\{kg\\}}
] \n
Judegement: 1
""",
        """
Ground truth answer: 3.2 m \n
Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
Judegement: 1
""",
    ]
def build_phyx_gpt4_prompt(gt_answer, pred):
    """Build the few-shot judge prompt for one ground-truth/prediction pair.

    The prompt is the task description, every example from get_ICE() (each
    followed by a newline), the pair to judge, and a trailing 'Judegement:'
    cue for the model to complete.

    Args:
        gt_answer: ground-truth answer, formatted with str.format.
        pred: predicted answer text, formatted with str.format.

    Returns:
        str: the complete prompt.
    """
    task_description = """
Please read the following example. Given predicted answer and ground truth answer,
compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
If the given predicted mentions "approximately", then allow the Approximation Error, \
such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n
"""
    pieces = [task_description]
    pieces.extend(example + '\n' for example in get_ICE())
    pieces.append('Ground truth answer: {} \n'.format(gt_answer))
    pieces.append('Predicted answer: {} \n'.format(pred))
    pieces.append('Judegement:')
    return ''.join(pieces)
def mapping_str(input_str):
    """Normalise a few LaTeX tokens in an extracted answer.

    The dfrac command is rewritten to frac, and then the pi command is
    replaced by the literal text 3.14, in that order.

    Args:
        input_str: extracted answer text. Non-string values are returned
            unchanged rather than raising.

    Returns:
        The normalised string, or ``input_str`` itself if it is not a string.
    """
    if not isinstance(input_str, str):
        # Explicit guard replaces the former bare ``except: pass``.
        return input_str
    return input_str.replace("\\dfrac", "\\frac").replace("\\pi", "3.14")
def extract_boxed_content(s):
    r"""Return the contents of the first ``\boxed{...}`` in ``s``.

    Nested braces inside the box are balanced, so the result ends at the
    brace that closes the box itself.

    Args:
        s: text to search.

    Returns:
        str | None: the text between the braces, or None when there is no
        ``\boxed{`` marker or its closing brace is never found.
    """
    marker = r'\boxed{'
    marker_pos = s.find(marker)
    if marker_pos < 0:
        return None

    begin = marker_pos + len(marker)
    open_braces = 0  # braces opened inside the box and not yet closed
    for end in range(begin, len(s)):
        char = s[end]
        if char == '{':
            open_braces += 1
        elif char == '}':
            if open_braces == 0:
                return s[begin:end]
            open_braces -= 1
    return None
def PhyX_process_line(prediction_str, gt_answer):
    """Extract the final answer from a prediction and do a string-level match.

    Extraction order: the first boxed expression; otherwise the text after a
    'final answer' / 'correct answer' label; otherwise the marker
    "SAME as predict".

    Args:
        prediction_str: raw model output.
        gt_answer: ground-truth answer (converted with str()).

    Returns:
        dict with keys 'gt', 'pred' (stripped prediction), 'extracted' and
        'match' (1 when the ground truth equals the extracted or full
        prediction case-insensitively, or occurs verbatim in the prediction;
        otherwise 0).
    """
    record = {'gt': str(gt_answer), 'pred': prediction_str.strip()}
    pred = record['pred']

    # A prediction equal to the API failure sentinel cannot be scored.
    if pred == FAIL_MSG:
        record['match'] = 0
        record["extracted"] = "Fail to Call API"
        return record

    boxed = extract_boxed_content(pred)
    if boxed is not None:
        record["extracted"] = mapping_str(boxed)
    else:
        pattern = r'\b(?:final\s+answer|correct\s+answer)\b[^:：]*[:：]\s*(.*?)(?=\n\n\n|\Z)'
        found = re.search(pattern, pred, re.IGNORECASE | re.DOTALL)
        record["extracted"] = mapping_str(found.group(1)) if found else "SAME as predict"

    gt_norm = record['gt'].strip().lower()
    candidates = (record["extracted"].strip().lower(), pred.strip().lower())
    matched = gt_norm in candidates or record['gt'] in pred
    record['match'] = 1 if matched else 0
    return record
def call_deepseek(prompt, temperature=0.0):
    """Send one user message to the DeepSeek chat-completions endpoint.

    Args:
        prompt: text sent as the single user message.
        temperature: sampling temperature included in the request.

    Returns:
        str: the content of the first choice, or FAIL_MSG when the request
        or the parsing of its response fails for any reason.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
    }
    payload = json.dumps({
        "model": DEEPSEEK_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": 200,
    }).encode('utf-8')
    try:
        req = urllib.request.Request(DEEPSEEK_API_URL, data=payload, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())
        return body['choices'][0]['message']['content']
    except Exception:
        # Best-effort: network, HTTP and malformed-response errors all map to
        # the sentinel. ``Exception`` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return FAIL_MSG
def PhyX_auxeval(gt_answer, prediction):
    """Judge one prediction: string-level match first, then the LLM judge.

    The judge is called up to RETRY times with temperature i * 0.5 on
    attempt i. Its reply is parsed for a standalone 0/1 verdict rather than
    by substring, so digits inside numbers (for example the 1 in 11.3) are
    not mistaken for a verdict. A reply with no standalone verdict triggers
    another attempt.

    Args:
        gt_answer: ground-truth answer.
        prediction: raw model output.

    Returns:
        dict with 'log' (text trace of the decision), 'res' (1 = correct,
        0 = incorrect or undecided) and 'extracted' (the answer text that
        was compared).
    """
    log = ''
    tmp = PhyX_process_line(prediction, gt_answer)
    if tmp["extracted"] == "Fail to Call API":
        return dict(log="Fail to Call API", res=0, extracted="Fail to Call API")

    prediction_extracted = tmp["extracted"] if tmp["extracted"] != "SAME as predict" else prediction
    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)

    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
    # A verdict is a lone 0 or 1: not adjacent to another digit and not part
    # of a decimal number such as 0.5 or 3.1.
    verdict_pattern = re.compile(r'(?<!\d)(?<!\d\.)[01](?!\d)(?!\.\d)')
    for i in range(RETRY):
        res = call_deepseek(prompt, temperature=i * 0.5)
        if FAIL_MSG in res:
            log += f'Try {i}: failed.\n'
            continue
        log += 'Compared at semantic level. '
        # The prompt ends with "Judegement:", so a verdict at the very start
        # of the reply wins; otherwise use the last standalone verdict.
        leading = verdict_pattern.match(res.strip())
        if leading:
            verdict = leading.group(0)
        else:
            found = verdict_pattern.findall(res)
            verdict = found[-1] if found else None
        if verdict == "1":
            log += "Semantic equal via LLM."
            return dict(log=log, res=1, extracted=prediction_extracted)
        if verdict == "0":
            log += f"LLM judgement {res}"
            return dict(log=log, res=0, extracted=prediction_extracted)
    log += f'All {RETRY} retries failed.\n'
    return dict(log=log, res=0, extracted=prediction_extracted)
+def _eval_single(args):
+    idx, r = args
+    gt = r['ground_truth_value']
+    prediction = r['model_output']
+    eval_result = PhyX_auxeval(gt, prediction)
+    r['extracted_answer'] = eval_result['extracted']
+    r['eval_log'] = eval_result['log']
+    r['res'] = eval_result['res']
+    return idx, r
+def score_results(results_file, model_name, output_file):
+    results = []
+    with open(results_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                results.append(json.loads(line))
+    print(f"\n{'='*60}")
+    print(f"  Scoring: {model_name} ({len(results)} samples)")
+    print(f"  Using PhyX-aligned pipeline with DeepSeek-V3 judge (20 threads)")
+    print(f"{'='*60}")
+    total = len(results)
+    with ThreadPoolExecutor(max_workers=20) as executor:
+        futures = {executor.submit(_eval_single, (i, r)): i for i, r in enumerate(results)}
+        done = 0
+        for future in as_completed(futures):
+            done += 1
+            if done % 100 == 0 or done == total:
+                hit = sum(1 for r in results if r.get('res') == 1)
+                print(f"  [{done}/{total}] processed, correct={hit}", flush=True)
+    hit = 0
+    string_match = 0
+    llm_match = 0
+    llm_called = 0
+    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})
+    for r in results:
+        cat = r.get('category', 'unknown')
+        cat_stats[cat]['total'] += 1
+        if r.get('res') == 1:
+            hit += 1
+            cat_stats[cat]['correct'] += 1
+        log = r.get('eval_log', '')
+        if "string level" in log:
+            string_match += 1
+        elif "semantic level" in log or "LLM judgement" in log:
+            llm_called += 1
+            if r.get('res') == 1:
+                llm_match += 1
+    acc = hit / total * 100
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    print(f"\n  {model_name}: {hit}/{total} ({acc:.1f}%)")
+    print(f"    String: {string_match}, LLM calls: {llm_called}, LLM match: {llm_match}")
+    print(f"    Per category:")
+    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
+        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
+        print(f"      {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")
+    return {
+        'model': model_name, 'total': total,
+        'string_matches': string_match, 'llm_calls': llm_called, 'llm_matches': llm_match,
+        'final_correct': hit, 'final_acc': round(acc, 2),
+        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
+    }
+def main():
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    print("="*60)
+    print("  PhyX-ALIGNED EVAL: Base + FullFT")
+    print(f"  Output: {OUTPUT_DIR}")
+    print("="*60)
+    print("\nTesting DeepSeek API...")
+    test = call_deepseek("Say 'OK' if you can read this.")
+    if test == FAIL_MSG:
+        print(f"  API FAILED"); sys.exit(1)
+    print(f"  API OK: {test[:50]}")
+    base_stats = score_results(BASE_RESULTS, "Base (Qwen2.5-VL-3B-Instruct)",
+                               os.path.join(OUTPUT_DIR, "scored_results_base.jsonl"))
+    fullft_stats = score_results(FULLFT_RESULTS, "SFT-fullft (Cold-Start)",
+                                 os.path.join(OUTPUT_DIR, "scored_results_fullft.jsonl"))
+    report = {
+        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
+        'base': base_stats, 'fullft': fullft_stats,
+    }
+    report_file = os.path.join(OUTPUT_DIR, "comparison_report_base_fullft.json")
+    with open(report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2, ensure_ascii=False)
+    print(f"\n{'='*60}")
+    print(f"  RESULTS")
+    print(f"{'='*60}")
+    print(f"  Base accuracy:   {base_stats['final_acc']}%")
+    print(f"  FullFT accuracy: {fullft_stats['final_acc']}%")
+    print(f"  Report: {report_file}")
+    print(f"{'='*60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_judge_lora_math_f.py ADDED Viewed

	@@ -0,0 +1,377 @@

+#!/usr/bin/env python3
+"""
+Score inference results using DeepSeek-V3 as LLM Judge.
+*** EXACTLY aligned with PhyX official evaluation pipeline ***
+(from killthefullmoon/PhyX -> vlmeval/dataset/utils/phyx.py)
+Pipeline:
+  1. Extract answer from \boxed{} or "final answer:" pattern
+  2. String-level matching
+  3. LLM judge with 5-shot ICE prompt, retry 5 times with increasing temperature
+Usage:
+    python3 eval_judge_lora_math_f.py
+"""
+import json, os, re, time, sys, ast
+from collections import defaultdict
+import urllib.request
+import urllib.error
+# ===================== CONFIG =====================
+DEEPSEEK_API_KEY = "sk-6364e2b3116241c59577191c32b09021"
+DEEPSEEK_MODEL = "deepseek-chat"  # Official DeepSeek-V3
+DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
+RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+BASE_RESULTS = os.path.join(RESULTS_DIR, "inference_results_base.jsonl")
+SFT_RESULTS = os.path.join(RESULTS_DIR, "inference_results_lora_math_f.jsonl")
+OUTPUT_DIR = RESULTS_DIR
+FAIL_MSG = 'Failed to obtain answer via API.'
+RETRY = 5
+# ==================================================
+# ============= PhyX ICE (In-Context Examples) =============
+# Exactly from PhyX source code: get_ICE()
+def get_ICE():
+    """Return the five in-context examples used in the judge prompt.
+    Each example is a string holding a ground-truth answer, a predicted answer
+    and the expected judgement (1 = matched, 0 = unmatched). The literals are
+    not raw strings, so ``\\n`` is a newline and ``\\\\`` is one backslash.
+    The spelling "Judegement" is part of the prompt text and is kept as is.
+    """
+    # Judged 1: the boxed expression versus ground truth 502.
+    example_1 = """
+Ground truth answer: 502 \n
+Predicted answer: The mass of block (B) is:
+[
+\\boxed{ 50 \\sqrt{101} }
+] \n
+Judegement: 1
+"""
+    # Judged 1: 46300 versus ground truth 46.3 kN.
+    example_2 = """
+Ground truth answer: 46.3 kN \n
+Predicted answer: The tension ( T_B ) in the cable is approximately:
+[
+\\boxed{46300 }
+] \n
+Judegement: 1
+"""
+    # Judged 0: 11.3 m/s versus ground truth 12 m/s.
+    example_3 = """
+Ground truth answer: 12 m/s \n
+Predicted answer: The speed of the box after 2.00 seconds is:
+[
+\\boxed{11.3, \\text{m/s}}
+] \n
+Judegement: 0
+"""
+    # Judged 1: "approximately" 36.1 kg versus ground truth 36.00 kg.
+    example_4 = """
+Ground truth answer: 36.00 kg \n
+Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
+[
+\\boxed{36.1, \\text\\{kg\\}}
+] \n
+Judegement: 1
+"""
+    # Judged 1: an unboxed fraction versus ground truth 3.2 m.
+    example_5 = """
+Ground truth answer: 3.2 m \n
+Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
+Judegement: 1
+"""
+    return [example_1, example_2, example_3, example_4, example_5]
+# ============= PhyX Prompt Builder =============
+# Exactly from PhyX source code: build_phyx_gpt4_prompt()
+def build_phyx_gpt4_prompt(gt_answer, pred):
+    task_description = """
+Please read the following example. Given predicted answer and ground truth answer,
+compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
+If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
+If the given predicted mentions "approximately", then allow the Approximation Error, \
+such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n
+"""
+    prompt = task_description
+    examples = get_ICE()
+    for example in examples:
+        prompt += example + '\n'
+    prompt += 'Ground truth answer: {} \n'.format(gt_answer)
+    prompt += 'Predicted answer: {} \n'.format(pred)
+    prompt += 'Judegement:'
+    return prompt
+# ============= PhyX Answer Extraction =============
+# Exactly from PhyX source code
+def mapping_str(input_str):
+    d = {"\\dfrac": "\\frac", "\\pi": "3.14"}
+    output = input_str
+    for k, v in d.items():
+        try:
+            output = output.replace(k, v)
+        except:
+            pass
+    return output
+def extract_boxed_content(s):
+    """Extract content from \\boxed{...} handling nested braces. From PhyX source."""
+    start = s.find(r'\boxed{')
+    if start == -1:
+        return None
+    content_start = start + len(r'\boxed{')
+    rest = s[content_start:]
+    depth = 0
+    for i, ch in enumerate(rest):
+        if ch == '{':
+            depth += 1
+        elif ch == '}':
+            if depth == 0:
+                return rest[:i]
+            else:
+                depth -= 1
+    return None
+def PhyX_process_line(prediction_str, gt_answer):
+    """
+    PhyX rule-based answer extraction and string matching.
+    Returns: dict with 'extracted', 'match' (0 or 1)
+    """
+    ret = {}
+    ret['gt'] = str(gt_answer)
+    ret['pred'] = prediction_str.strip()
+    if ret['pred'] == FAIL_MSG:
+        ret['match'] = 0
+        ret["extracted"] = "Fail to Call API"
+        return ret
+    # Try extracting from \boxed{}
+    boxed_answer = extract_boxed_content(ret['pred'])
+    if boxed_answer is not None:
+        boxed_answer = mapping_str(boxed_answer)
+        ret["extracted"] = boxed_answer
+    else:
+        # Try "final answer:" or "correct answer:" pattern
+        pattern = r'\b(?:final\s+answer|correct\s+answer)\b[^:：]*[:：]\s*(.*?)(?=\n\n\n|\Z)'
+        flags = re.IGNORECASE | re.DOTALL
+        match = re.search(pattern, ret['pred'], flags=flags)
+        if match:
+            extracted_answer = match.group(1)
+            extracted_answer = mapping_str(extracted_answer)
+            ret["extracted"] = extracted_answer
+        else:
+            ret["extracted"] = "SAME as predict"
+    # String-level matching (PhyX logic)
+    gt_lower = ret['gt'].strip().lower()
+    extracted_lower = ret["extracted"].strip().lower()
+    pred_lower = ret["pred"].strip().lower()
+    if gt_lower == extracted_lower or gt_lower == pred_lower or ret['gt'] in ret['pred']:
+        ret['match'] = 1
+        return ret
+    ret['match'] = 0
+    return ret
+# ============= DeepSeek API =============
+def call_deepseek(prompt, temperature=0.0):
+    """Call DeepSeek-V3 API (OpenAI-compatible)."""
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
+    }
+    data = json.dumps({
+        "model": DEEPSEEK_MODEL,
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": temperature,
+        "max_tokens": 200,
+    }).encode('utf-8')
+    try:
+        req = urllib.request.Request(DEEPSEEK_API_URL, data=data, headers=headers)
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            result = json.loads(resp.read().decode())
+            return result['choices'][0]['message']['content']
+    except Exception as e:
+        return FAIL_MSG
+# ============= PhyX Evaluation Logic =============
+# Exactly from PhyX source code: PhyX_auxeval()
+def PhyX_auxeval(gt_answer, prediction):
+    """
+    Evaluate a single prediction against ground truth.
+    Follows PhyX pipeline exactly:
+      1. Extract answer (boxed/regex)
+      2. String-level match
+      3. LLM judge with 5 retries, increasing temperature
+    Returns: dict(log, res, extracted)
+    """
+    log = ''
+    # Step 1: Rule-based extraction
+    tmp = PhyX_process_line(prediction, gt_answer)
+    if tmp["extracted"] == "Fail to Call API":
+        log += "Fail to Call API"
+        return dict(log=log, res=0, extracted="Fail to Call API")
+    if tmp["extracted"] != "SAME as predict":
+        prediction_extracted = tmp["extracted"]
+    else:
+        prediction_extracted = prediction
+    # Step 2: String-level match
+    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
+        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)
+    # Step 3: LLM judge with retries (PhyX uses 5 retries with temp = i * 0.5)
+    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
+    for i in range(RETRY):
+        res = call_deepseek(prompt, temperature=i * 0.5)
+        if FAIL_MSG in res:
+            log += f'Try {i}: answer and prediction are {gt_answer} and {prediction_extracted}, failed to compare.\n'
+        else:
+            log += 'Compared at semantic level. '
+            if "1" in res:
+                log += "Semantic equal via LLM."
+                return dict(log=log, res=1, extracted=prediction_extracted)
+            elif "0" in res:
+                log += f"LLM judgement {res}"
+                return dict(log=log, res=0, extracted=prediction_extracted)
+    log += 'All 5 retries failed.\n'
+    return dict(log=log, res=0, extracted=prediction_extracted)
+# ============= Main Scoring =============
+def score_results(results_file, model_name):
+    """Score all results from a JSONL file."""
+    results = []
+    with open(results_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                results.append(json.loads(line))
+    print(f"\n{'='*60}")
+    print(f"  Scoring: {model_name} ({len(results)} samples)")
+    print(f"  Using PhyX-aligned pipeline with DeepSeek-V3 judge")
+    print(f"{'='*60}")
+    total = len(results)
+    hit = 0
+    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})
+    scored = []
+    string_match = 0
+    llm_match = 0
+    llm_called = 0
+    for i, r in enumerate(results):
+        gt = r['ground_truth_value']
+        prediction = r['model_output']
+        cat = r.get('category', 'unknown')
+        cat_stats[cat]['total'] += 1
+        eval_result = PhyX_auxeval(gt, prediction)
+        r['extracted_answer'] = eval_result['extracted']
+        r['eval_log'] = eval_result['log']
+        r['res'] = eval_result['res']
+        if eval_result['res'] == 1:
+            hit += 1
+            cat_stats[cat]['correct'] += 1
+        if "string level" in eval_result['log']:
+            string_match += 1
+        elif "semantic level" in eval_result['log'] or "LLM judgement" in eval_result['log']:
+            llm_called += 1
+            if eval_result['res'] == 1:
+                llm_match += 1
+        scored.append(r)
+        if (i + 1) % 50 == 0:
+            print(f"  [{i+1}/{total}] acc={hit/(i+1)*100:.1f}% "
+                  f"(str_match={string_match}, llm_called={llm_called}, llm_match={llm_match})",
+                  flush=True)
+    acc = hit / total * 100
+    print(f"\n  RESULTS for {model_name}:")
+    print(f"    Total:           {total}")
+    print(f"    String matches:  {string_match}")
+    print(f"    LLM calls:       {llm_called}")
+    print(f"    LLM matches:     {llm_match}")
+    print(f"    Final correct:   {hit} ({acc:.1f}%)")
+    print(f"\n    Per category:")
+    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
+        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
+        print(f"      {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")
+    return scored, {
+        'model': model_name,
+        'total': total,
+        'string_matches': string_match,
+        'llm_calls': llm_called,
+        'llm_matches': llm_match,
+        'final_correct': hit,
+        'final_acc': round(acc, 2),
+        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
+    }
+def main():
+    print("="*60)
+    print("  PhyX-ALIGNED EVAL: DeepSeek-V3 as Judge")
+    print(f"  Scoring: lora_math_f ONLY (base already scored)")
+    print(f"  Results dir: {RESULTS_DIR}")
+    print("="*60)
+    # Test API
+    print("\nTesting DeepSeek API...")
+    test = call_deepseek("Say 'OK' if you can read this.")
+    if test == FAIL_MSG:
+        print(f"  API FAILED: {test}")
+        sys.exit(1)
+    print(f"  API OK: {test[:50]}")
+    # Score lora_math_f only
+    sft_scored, sft_stats = score_results(SFT_RESULTS, "lora_math_f (LoRA+freeze+math)")
+    # Save scored results
+    out_file = os.path.join(OUTPUT_DIR, "scored_results_lora_math_f.jsonl")
+    with open(out_file, 'w', encoding='utf-8') as f:
+        for r in sft_scored:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    # Save report
+    report = {
+        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
+        'model': sft_stats,
+    }
+    report_file = os.path.join(OUTPUT_DIR, "report_lora_math_f.json")
+    with open(report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2, ensure_ascii=False)
+    print(f"\n{'='*60}")
+    print(f"  SCORING COMPLETE: lora_math_f")
+    print(f"{'='*60}")
+    print(f"  Accuracy:  {sft_stats['final_acc']}%")
+    print(f"  Scored:    {out_file}")
+    print(f"  Report:    {report_file}")
+    print(f"{'='*60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_judge_lora_phyx_f.py ADDED Viewed

	@@ -0,0 +1,340 @@

+#!/usr/bin/env python3
+"""
+Score phyx and phyx_50000 inference results using DeepSeek-V3 judge.
+PhyX-aligned pipeline: extract → string match → LLM judge (5 retries).
+"""
+import json, os, re, time, sys, ast
+from collections import defaultdict
+import urllib.request
+import urllib.error
+# ===================== CONFIG =====================
+DEEPSEEK_API_KEY = "sk-6364e2b3116241c59577191c32b09021"
+DEEPSEEK_MODEL = "deepseek-chat"
+DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
+RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+OUTPUT_DIR = "/data1/dhelix_shared/hku/rl4phyx/RL4Phyx/SFT/result"
+PHYX_RESULTS = os.path.join(RESULTS_DIR, "inference_results_phyx.jsonl")
+PHYX50K_RESULTS = os.path.join(RESULTS_DIR, "inference_results_phyx_50000.jsonl")
+FAIL_MSG = 'Failed to obtain answer via API.'
+RETRY = 5
+# ==================================================
+# ============= PhyX ICE =============
+def get_ICE():
+    """Return the five in-context examples used in the judge prompt.
+    Each example is a string holding a ground-truth answer, a predicted answer
+    and the expected judgement (1 = matched, 0 = unmatched). The literals are
+    not raw strings, so ``\\n`` is a newline and ``\\\\`` is one backslash.
+    The spelling "Judegement" is part of the prompt text and is kept as is.
+    """
+    # Judged 1: the boxed expression versus ground truth 502.
+    example_1 = """
+Ground truth answer: 502 \n
+Predicted answer: The mass of block (B) is:
+[
+\\boxed{ 50 \\sqrt{101} }
+] \n
+Judegement: 1
+"""
+    # Judged 1: 46300 versus ground truth 46.3 kN.
+    example_2 = """
+Ground truth answer: 46.3 kN \n
+Predicted answer: The tension ( T_B ) in the cable is approximately:
+[
+\\boxed{46300 }
+] \n
+Judegement: 1
+"""
+    # Judged 0: 11.3 m/s versus ground truth 12 m/s.
+    example_3 = """
+Ground truth answer: 12 m/s \n
+Predicted answer: The speed of the box after 2.00 seconds is:
+[
+\\boxed{11.3, \\text{m/s}}
+] \n
+Judegement: 0
+"""
+    # Judged 1: "approximately" 36.1 kg versus ground truth 36.00 kg.
+    example_4 = """
+Ground truth answer: 36.00 kg \n
+Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
+[
+\\boxed{36.1, \\text\\{kg\\}}
+] \n
+Judegement: 1
+"""
+    # Judged 1: an unboxed fraction versus ground truth 3.2 m.
+    example_5 = """
+Ground truth answer: 3.2 m \n
+Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
+Judegement: 1
+"""
+    return [example_1, example_2, example_3, example_4, example_5]
+def build_phyx_gpt4_prompt(gt_answer, pred):
+    task_description = """
+Please read the following example. Given predicted answer and ground truth answer,
+compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
+If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
+If the given predicted mentions "approximately", then allow the Approximation Error, \
+such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n
+"""
+    prompt = task_description
+    for example in get_ICE():
+        prompt += example + '\n'
+    prompt += 'Ground truth answer: {} \n'.format(gt_answer)
+    prompt += 'Predicted answer: {} \n'.format(pred)
+    prompt += 'Judegement:'
+    return prompt
+# ============= PhyX Answer Extraction =============
+def mapping_str(input_str):
+    d = {"\\dfrac": "\\frac", "\\pi": "3.14"}
+    output = input_str
+    for k, v in d.items():
+        try:
+            output = output.replace(k, v)
+        except:
+            pass
+    return output
+def extract_boxed_content(s):
+    start = s.find(r'\boxed{')
+    if start == -1:
+        return None
+    content_start = start + len(r'\boxed{')
+    rest = s[content_start:]
+    depth = 0
+    for i, ch in enumerate(rest):
+        if ch == '{':
+            depth += 1
+        elif ch == '}':
+            if depth == 0:
+                return rest[:i]
+            else:
+                depth -= 1
+    return None
+def PhyX_process_line(prediction_str, gt_answer):
+    ret = {}
+    ret['gt'] = str(gt_answer)
+    ret['pred'] = prediction_str.strip()
+    if ret['pred'] == FAIL_MSG:
+        ret['match'] = 0
+        ret["extracted"] = "Fail to Call API"
+        return ret
+    boxed_answer = extract_boxed_content(ret['pred'])
+    if boxed_answer is not None:
+        boxed_answer = mapping_str(boxed_answer)
+        ret["extracted"] = boxed_answer
+    else:
+        pattern = r'\b(?:final\s+answer|correct\s+answer)\b[^:：]*[:：]\s*(.*?)(?=\n\n\n|\Z)'
+        flags = re.IGNORECASE | re.DOTALL
+        match = re.search(pattern, ret['pred'], flags=flags)
+        if match:
+            ret["extracted"] = mapping_str(match.group(1))
+        else:
+            ret["extracted"] = "SAME as predict"
+    gt_lower = ret['gt'].strip().lower()
+    extracted_lower = ret["extracted"].strip().lower()
+    pred_lower = ret["pred"].strip().lower()
+    if gt_lower == extracted_lower or gt_lower == pred_lower or ret['gt'] in ret['pred']:
+        ret['match'] = 1
+        return ret
+    ret['match'] = 0
+    return ret
+# ============= DeepSeek API =============
+def call_deepseek(prompt, temperature=0.0):
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
+    }
+    data = json.dumps({
+        "model": DEEPSEEK_MODEL,
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": temperature,
+        "max_tokens": 200,
+    }).encode('utf-8')
+    try:
+        req = urllib.request.Request(DEEPSEEK_API_URL, data=data, headers=headers)
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            result = json.loads(resp.read().decode())
+            return result['choices'][0]['message']['content']
+    except Exception as e:
+        return FAIL_MSG
+# ============= PhyX Evaluation =============
+def PhyX_auxeval(gt_answer, prediction):
+    log = ''
+    tmp = PhyX_process_line(prediction, gt_answer)
+    if tmp["extracted"] == "Fail to Call API":
+        return dict(log="Fail to Call API", res=0, extracted="Fail to Call API")
+    prediction_extracted = tmp["extracted"] if tmp["extracted"] != "SAME as predict" else prediction
+    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
+        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)
+    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
+    for i in range(RETRY):
+        res = call_deepseek(prompt, temperature=i * 0.5)
+        if FAIL_MSG in res:
+            log += f'Try {i}: failed to compare.\n'
+        else:
+            log += 'Compared at semantic level. '
+            if "1" in res:
+                log += "Semantic equal via LLM."
+                return dict(log=log, res=1, extracted=prediction_extracted)
+            elif "0" in res:
+                log += f"LLM judgement {res}"
+                return dict(log=log, res=0, extracted=prediction_extracted)
+    log += 'All 5 retries failed.\n'
+    return dict(log=log, res=0, extracted=prediction_extracted)
+# ============= Scoring =============
+def _eval_single(args):
+    """Evaluate a single sample (for thread pool)."""
+    idx, r = args
+    gt = r['ground_truth_value']
+    prediction = r['model_output']
+    eval_result = PhyX_auxeval(gt, prediction)
+    r['extracted_answer'] = eval_result['extracted']
+    r['eval_log'] = eval_result['log']
+    r['res'] = eval_result['res']
+    return idx, r
+def score_results(results_file, model_name, output_file):
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    results = []
+    with open(results_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                results.append(json.loads(line))
+    print(f"\n{'='*60}")
+    print(f"  Scoring: {model_name} ({len(results)} samples)")
+    print(f"  Using PhyX-aligned pipeline with DeepSeek-V3 judge (20 threads)")
+    print(f"{'='*60}")
+    total = len(results)
+    # Parallel evaluation with 20 threads
+    with ThreadPoolExecutor(max_workers=20) as executor:
+        futures = {executor.submit(_eval_single, (i, r)): i for i, r in enumerate(results)}
+        done = 0
+        for future in as_completed(futures):
+            done += 1
+            if done % 100 == 0 or done == total:
+                # Count current stats
+                hit = sum(1 for r in results if r.get('res') == 1)
+                print(f"  [{done}/{total}] processed, current correct={hit}", flush=True)
+    # Compute final stats
+    hit = 0
+    string_match = 0
+    llm_match = 0
+    llm_called = 0
+    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})
+    for r in results:
+        cat = r.get('category', 'unknown')
+        cat_stats[cat]['total'] += 1
+        if r.get('res') == 1:
+            hit += 1
+            cat_stats[cat]['correct'] += 1
+        log = r.get('eval_log', '')
+        if "string level" in log:
+            string_match += 1
+        elif "semantic level" in log or "LLM judgement" in log:
+            llm_called += 1
+            if r.get('res') == 1:
+                llm_match += 1
+    acc = hit / total * 100
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    print(f"\n  {model_name}: {hit}/{total} ({acc:.1f}%)")
+    print(f"    String matches: {string_match}, LLM calls: {llm_called}, LLM matches: {llm_match}")
+    print(f"    Per category:")
+    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
+        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
+        print(f"      {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")
+    return {
+        'model': model_name,
+        'total': total,
+        'string_matches': string_match,
+        'llm_calls': llm_called,
+        'llm_matches': llm_match,
+        'final_correct': hit,
+        'final_acc': round(acc, 2),
+        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
+    }
+def main():
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    print("="*60)
+    print("  PhyX-ALIGNED EVAL: DeepSeek-V3 as Judge")
+    print(f"  Pipeline: extract -> string match -> LLM judge (5 retries)")
+    print(f"  Output dir: {OUTPUT_DIR}")
+    print("="*60)
+    # Test API
+    print("\nTesting DeepSeek API...")
+    test = call_deepseek("Say 'OK' if you can read this.")
+    if test == FAIL_MSG:
+        print(f"  API FAILED: {test}")
+        sys.exit(1)
+    print(f"  API OK: {test[:50]}")
+    # Score phyx
+    phyx_stats = score_results(
+        PHYX_RESULTS, "SFT-phyx (1467 physics)",
+        os.path.join(OUTPUT_DIR, "scored_results_phyx.jsonl")
+    )
+    # Score phyx_50000
+    phyx50k_stats = score_results(
+        PHYX50K_RESULTS, "SFT-phyx_50000 (53001 combined)",
+        os.path.join(OUTPUT_DIR, "scored_results_phyx_50000.jsonl")
+    )
+    # Save report
+    report = {
+        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
+        'phyx': phyx_stats,
+        'phyx_50000': phyx50k_stats,
+    }
+    report_file = os.path.join(OUTPUT_DIR, "comparison_report.json")
+    with open(report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2, ensure_ascii=False)
+    print(f"\n{'='*60}")
+    print(f"  RESULTS")
+    print(f"{'='*60}")
+    print(f"  phyx accuracy:      {phyx_stats['final_acc']}%")
+    print(f"  phyx_50000 accuracy: {phyx50k_stats['final_acc']}%")
+    print(f"\n  Report: {report_file}")
+    print(f"{'='*60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_lora_phyx_f.py ADDED Viewed

	@@ -0,0 +1,259 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs both Base and SFT models on the 1533 open-ended physics test set.
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl
+    sft_eval_footprint/inference_results_sft.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+# Offline switches for the Hugging Face libraries. They are set at module
+# level, before worker_inference() imports transformers.
+os.environ["HF_HUB_OFFLINE"] = "1"
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+# Local model directories (loaded with local_files_only=True).
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
+SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft/final"
+# JSONL test set, output directory, and the directory the per-sample
+# 'image' file names are resolved against.
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
+# Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7
+BASE_GPUS = [0, 1, 2, 3]
+SFT_GPUS = [4, 5, 6, 7]
+# Passed to model.generate() as max_new_tokens.
+MAX_NEW_TOKENS = 2048
+# ================================
+def load_test_data():
+    """Load test samples from JSONL."""
+    samples = []
+    with open(TEST_FILE, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    return samples
+def build_open_ended_prompt(sample):
+    """Build an open-ended prompt (no MCQ options)."""
+    desc = sample.get('description', '')
+    question = sample.get('question', '')
+    prompt = f"""Look at the image and answer the physics question.
+{desc}
+{question}
+Please reason step by step, and put your final answer within \\boxed{{}}.
+"""
+    return prompt.strip()
+def worker_inference(gpu_id, model_path, samples, output_file, model_name):
+    """Worker: load the model on one GPU, run inference on the given samples, and write a JSONL file.
+    Args:
+        gpu_id: CUDA device index; the model is placed on ``cuda:{gpu_id}``.
+        model_path: Local directory of the model and processor.
+        samples: Sample dicts; 'index', 'image', 'category', 'question' and
+            'ground_truth_value' are read directly, 'subfield' and
+            'ground_truth_letter' are optional.
+        output_file: Path of the JSONL file written after all samples are done.
+        model_name: Label used in log lines and stored in every result.
+    Returns:
+        Number of results written.
+    """
+    # Heavy imports are done inside the worker, after the module-level
+    # offline environment variables have been set.
+    import torch
+    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from qwen_vl_utils import process_vision_info
+    from PIL import Image  # NOTE(review): not referenced in this function
+    device = f"cuda:{gpu_id}"
+    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)
+    processor = AutoProcessor.from_pretrained(
+        model_path,
+        min_pixels=3136,
+        max_pixels=200704,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        device_map=device,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model.eval()
+    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)
+    results = []
+    for i, sample in enumerate(samples):
+        idx = sample['index']
+        prompt_text = build_open_ended_prompt(sample)
+        image_path = os.path.join(IMAGE_DIR, sample['image'])
+        # Build messages: one user turn holding the image and the text prompt
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": f"file://{image_path}"},
+                {"type": "text", "text": prompt_text},
+            ],
+        }]
+        try:
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            ).to(device)
+            with torch.no_grad():
+                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
+            # Keep only the tokens after the prompt length.
+            generated = output_ids[0][inputs.input_ids.shape[1]:]
+            response = processor.decode(generated, skip_special_tokens=True)
+        except Exception as e:
+            # A failing sample does not stop the worker; the error text is
+            # stored in place of the model output.
+            response = f"ERROR: {str(e)}"
+        result = {
+            "index": idx,
+            "category": sample['category'],
+            "subfield": sample.get('subfield', ''),
+            "question": sample['question'],
+            "ground_truth_value": sample['ground_truth_value'],
+            "ground_truth_letter": sample.get('ground_truth_letter', ''),
+            "model_output": response,
+            "model_name": model_name,
+            "gpu_id": gpu_id,
+        }
+        results.append(result)
+        if (i + 1) % 20 == 0 or (i + 1) == len(samples):
+            print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)
+    # Write results (only after the loop; nothing is saved incrementally)
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
+    return len(results)
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` across ``gpu_ids`` and run one inference worker per GPU.

    The samples are cut into ``len(gpu_ids)`` contiguous chunks of at most
    ``ceil(len(samples) / len(gpu_ids))`` items.  Every non-empty chunk is
    handed to its own ``worker_inference`` process, which writes its results
    to ``f"{output_base}_gpu{gpu_id}.jsonl"``.  The call blocks until every
    worker has exited.

    Args:
        model_path: Passed through to ``worker_inference``.
        model_name: Passed through to ``worker_inference``; also used in logs.
        gpu_ids: GPU indices, one worker per entry.  Must not be empty.
        samples: Sequence of test samples (must support ``len`` and slicing).
        output_base: Path prefix of the per-GPU output files.

    Returns:
        The per-GPU output paths that were assigned to a worker, in
        ``gpu_ids`` order.  GPUs whose chunk was empty are not listed.  A path
        is listed even if its worker failed, so the file may not exist.

    Raises:
        ValueError: If ``gpu_ids`` is empty (previously a ZeroDivisionError).
    """
    gpu_ids = list(gpu_ids)
    if not gpu_ids:
        raise ValueError("gpu_ids must contain at least one GPU id")

    # Ceiling division so the chunks cover every sample.
    chunk_size = -(-len(samples) // len(gpu_ids))
    workers = []
    output_files = []
    for slot, gpu_id in enumerate(gpu_ids):
        chunk = samples[slot * chunk_size:(slot + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name),
        )
        workers.append((gpu_id, proc))

    for _, proc in workers:
        proc.start()
    for gpu_id, proc in workers:
        proc.join()
        # A non-zero exit code means the worker died before finishing; say so
        # instead of letting the merge step silently produce fewer results.
        if proc.exitcode != 0:
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with "
                f"code {proc.exitcode}; its results may be missing or incomplete",
                flush=True,
            )
    return output_files
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL result files into a single file sorted by ``index``.

    Args:
        output_files: Paths of the per-GPU JSONL files.  Paths that do not
            exist are skipped, and a warning listing them is printed.
        final_output: Path of the merged JSONL file (overwritten).

    Returns:
        The merged records as a list of dicts, sorted by their ``index`` key.

    Side effects:
        Writes ``final_output`` and deletes every existing per-GPU file.
    """
    all_results = []
    missing = []
    for path in output_files:
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                all_results.extend(json.loads(line) for line in fh if line.strip())
        except FileNotFoundError:
            missing.append(path)
    if missing:
        # A missing shard usually means a worker crashed; make that visible.
        print(
            f"WARNING: {len(missing)} per-GPU result file(s) not found, "
            f"merged output is incomplete: {missing}",
            flush=True,
        )

    # Sort by index for consistency
    all_results.sort(key=lambda record: record['index'])
    with open(final_output, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in all_results
        )

    # Cleanup per-GPU files
    for path in output_files:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
    return all_results
def main():
    """Run open-ended inference for the Base and SFT models and merge the shards.

    Loads the test samples, runs the Base model on ``BASE_GPUS`` and then the
    SFT model on ``SFT_GPUS`` (one worker process per GPU), and merges each
    model's per-GPU files into a single JSONL file inside ``OUTPUT_DIR``.
    """
    # 'spawn' starts every worker in a fresh interpreter.
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("=" * 60)
    print("  OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f"  Base model: {BASE_MODEL}")
    print(f"  SFT model:  {SFT_MODEL}")
    print(f"  Base GPUs:  {BASE_GPUS}")
    print(f"  SFT GPUs:   {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")
    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f"  {cat}: {cnt}")

    t0 = time.time()
    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
    sft_output = os.path.join(OUTPUT_DIR, "inference_results_sft")

    # run_model_parallel blocks until its workers exit, so the two models run
    # one after the other; each call fans out over its own GPU list.
    print("\n>>> Starting BASE model inference...", flush=True)
    base_files = run_model_parallel(BASE_MODEL, "base", BASE_GPUS, samples, base_output)
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards that were assigned to workers.
    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_sft.jsonl")
    base_results = merge_results(base_files, base_final)
    sft_results = merge_results(sft_files, sft_final)

    # A shortfall means at least one worker did not finish its chunk.
    for label, merged in (("Base", base_results), ("SFT", sft_results)):
        if len(merged) != len(samples):
            print(
                f"  WARNING: {label} produced {len(merged)} results "
                f"for {len(samples)} samples",
                flush=True,
            )

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f"  INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(f"  Base results: {len(base_results)} → {base_final}")
    print(f"  SFT results:  {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")


if __name__ == '__main__':
    main()

eval_footprint/eval_lora_phyx_f_final.py ADDED Viewed

	@@ -0,0 +1,258 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs only the SFT model on the 1533 open-ended physics test set (the Base run is skipped in this variant).
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl  (not written by this variant; the Base run is skipped)
+    sft_eval_footprint/inference_results_phyx.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+os.environ["HF_HUB_OFFLINE"] = "1"
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
+SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft_phyx/final"
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
+# Multi-GPU config: base and SFT are both assigned GPUs 0-3 (the base run is skipped in main())
+BASE_GPUS = [0, 1, 2, 3]
+SFT_GPUS = [0, 1, 2, 3]
+MAX_NEW_TOKENS = 2048
+# ================================
def load_test_data(path=None):
    """Load test samples from a JSON Lines file.

    Args:
        path: JSONL file to read.  Defaults to the module-level ``TEST_FILE``
            so existing zero-argument calls keep working.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
    """
    if path is None:
        path = TEST_FILE
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
def build_open_ended_prompt(sample):
    """Build an open-ended prompt (no MCQ options) for one sample.

    Args:
        sample: Mapping that may contain ``description`` and ``question``.
            A missing key or a ``None`` value (JSON ``null``) is treated as an
            empty string instead of being rendered as the text "None".

    Returns:
        The prompt text: instruction line, description, question and the
        request to put the final answer in ``\\boxed{}``, one per line, with
        surrounding whitespace stripped.
    """
    desc = sample.get('description')
    question = sample.get('question')
    desc = '' if desc is None else desc
    question = '' if question is None else question
    prompt = f"""Look at the image and answer the physics question.
{desc}
{question}
Please reason step by step, and put your final answer within \\boxed{{}}.
"""
    return prompt.strip()
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Worker: load the model on one GPU and run inference on ``samples``.

    One JSON object per sample is written to ``output_file``.  Records are
    written and flushed as they are produced, so a crash part-way through
    keeps the samples that were already processed.

    Args:
        gpu_id: CUDA device index; the model is placed on ``cuda:{gpu_id}``.
        model_path: Local directory of the model and processor (loaded with
            ``local_files_only=True``).
        samples: Iterable of sample dicts.  Each needs ``index``, ``image``,
            ``category``, ``question`` and ``ground_truth_value``;
            ``subfield`` and ``ground_truth_letter`` are optional.
        output_file: Path of the JSONL file to (over)write.
        model_name: Label stored in every record and used in log lines.

    Returns:
        The number of records written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    tag = f"[{model_name}][GPU {gpu_id}]"
    print(f"{tag} Loading model...", flush=True)
    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,      # = 4 * 28 * 28
        max_pixels=200704,    # = 256 * 28 * 28
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    total = len(samples)
    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

    written = 0
    with open(output_file, 'w', encoding='utf-8') as out:
        for i, sample in enumerate(samples):
            idx = sample['index']
            prompt_text = build_open_ended_prompt(sample)
            image_path = os.path.join(IMAGE_DIR, sample['image'])
            # Build messages
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt_text},
                ],
            }]
            try:
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(device)
                with torch.no_grad():
                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
                # Drop the prompt tokens; keep only the newly generated ones.
                generated = output_ids[0][inputs.input_ids.shape[1]:]
                response = processor.decode(generated, skip_special_tokens=True)
            except Exception as e:
                # Best effort: one failing sample must not abort the chunk.
                # The error text is stored in place of the model output.
                response = f"ERROR: {str(e)}"
            result = {
                "index": idx,
                "category": sample['category'],
                "subfield": sample.get('subfield', ''),
                "question": sample['question'],
                "ground_truth_value": sample['ground_truth_value'],
                "ground_truth_letter": sample.get('ground_truth_letter', ''),
                "model_output": response,
                "model_name": model_name,
                "gpu_id": gpu_id,
            }
            out.write(json.dumps(result, ensure_ascii=False) + '\n')
            out.flush()
            written += 1
            if (i + 1) % 20 == 0 or (i + 1) == total:
                print(f"{tag} {i+1}/{total} done", flush=True)

    print(f"{tag} Saved {written} results to {output_file}", flush=True)
    return written
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` across ``gpu_ids`` and run one inference worker per GPU.

    The samples are cut into ``len(gpu_ids)`` contiguous chunks of at most
    ``ceil(len(samples) / len(gpu_ids))`` items.  Every non-empty chunk is
    handed to its own ``worker_inference`` process, which writes its results
    to ``f"{output_base}_gpu{gpu_id}.jsonl"``.  The call blocks until every
    worker has exited.

    Args:
        model_path: Passed through to ``worker_inference``.
        model_name: Passed through to ``worker_inference``; also used in logs.
        gpu_ids: GPU indices, one worker per entry.  Must not be empty.
        samples: Sequence of test samples (must support ``len`` and slicing).
        output_base: Path prefix of the per-GPU output files.

    Returns:
        The per-GPU output paths that were assigned to a worker, in
        ``gpu_ids`` order.  GPUs whose chunk was empty are not listed.  A path
        is listed even if its worker failed, so the file may not exist.

    Raises:
        ValueError: If ``gpu_ids`` is empty (previously a ZeroDivisionError).
    """
    gpu_ids = list(gpu_ids)
    if not gpu_ids:
        raise ValueError("gpu_ids must contain at least one GPU id")

    # Ceiling division so the chunks cover every sample.
    chunk_size = -(-len(samples) // len(gpu_ids))
    workers = []
    output_files = []
    for slot, gpu_id in enumerate(gpu_ids):
        chunk = samples[slot * chunk_size:(slot + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name),
        )
        workers.append((gpu_id, proc))

    for _, proc in workers:
        proc.start()
    for gpu_id, proc in workers:
        proc.join()
        # A non-zero exit code means the worker died before finishing; say so
        # instead of letting the merge step silently produce fewer results.
        if proc.exitcode != 0:
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with "
                f"code {proc.exitcode}; its results may be missing or incomplete",
                flush=True,
            )
    return output_files
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL result files into a single file sorted by ``index``.

    Args:
        output_files: Paths of the per-GPU JSONL files.  Paths that do not
            exist are skipped, and a warning listing them is printed.
        final_output: Path of the merged JSONL file (overwritten).

    Returns:
        The merged records as a list of dicts, sorted by their ``index`` key.

    Side effects:
        Writes ``final_output`` and deletes every existing per-GPU file.
    """
    all_results = []
    missing = []
    for path in output_files:
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                all_results.extend(json.loads(line) for line in fh if line.strip())
        except FileNotFoundError:
            missing.append(path)
    if missing:
        # A missing shard usually means a worker crashed; make that visible.
        print(
            f"WARNING: {len(missing)} per-GPU result file(s) not found, "
            f"merged output is incomplete: {missing}",
            flush=True,
        )

    # Sort by index for consistency
    all_results.sort(key=lambda record: record['index'])
    with open(final_output, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in all_results
        )

    # Cleanup per-GPU files
    for path in output_files:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
    return all_results
def main():
    """Run open-ended inference for the SFT model only and merge its shards.

    The Base model run is deliberately skipped in this script: no Base
    inference is started and no Base result file is written.  The SFT model
    runs on ``SFT_GPUS`` (one worker process per GPU) and its per-GPU files
    are merged into ``inference_results_phyx.jsonl`` inside ``OUTPUT_DIR``.
    """
    # 'spawn' starts every worker in a fresh interpreter.
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("=" * 60)
    print("  OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f"  Base model: {BASE_MODEL}")
    print(f"  SFT model:  {SFT_MODEL}")
    print(f"  Base GPUs:  {BASE_GPUS}")
    print(f"  SFT GPUs:   {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")
    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f"  {cat}: {cnt}")

    t0 = time.time()
    sft_output = os.path.join(OUTPUT_DIR, "inference_results_phyx")

    # Base model: skipped in this script.
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards that were assigned to workers.
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_phyx.jsonl")
    sft_results = merge_results(sft_files, sft_final)

    # A shortfall means at least one worker did not finish its chunk.
    if len(sft_results) != len(samples):
        print(
            f"  WARNING: SFT produced {len(sft_results)} results "
            f"for {len(samples)} samples",
            flush=True,
        )

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f"  INFERENCE COMPLETE in {elapsed/60:.1f} min")
    # The old report line claimed zero results had been written to the Base
    # file, although nothing is written there at all.
    print("  Base results: skipped (base model is not run by this script)")
    print(f"  SFT results:  {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")


if __name__ == '__main__':
    main()

eval_footprint/eval_openended_inference.py ADDED Viewed

	@@ -0,0 +1,259 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs both Base and SFT models on the 1533 open-ended physics test set.
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl
+    sft_eval_footprint/inference_results_sft.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+os.environ["HF_HUB_OFFLINE"] = "1"
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
+SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft/final"
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
+# Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7
+BASE_GPUS = [0, 1, 2, 3]
+SFT_GPUS = [4, 5, 6, 7]
+MAX_NEW_TOKENS = 2048
+# ================================
def load_test_data(path=None):
    """Load test samples from a JSON Lines file.

    Args:
        path: JSONL file to read.  Defaults to the module-level ``TEST_FILE``
            so existing zero-argument calls keep working.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
    """
    if path is None:
        path = TEST_FILE
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
def build_open_ended_prompt(sample):
    """Build an open-ended prompt (no MCQ options) for one sample.

    Args:
        sample: Mapping that may contain ``description`` and ``question``.
            A missing key or a ``None`` value (JSON ``null``) is treated as an
            empty string instead of being rendered as the text "None".

    Returns:
        The prompt text: instruction line, description, question and the
        request to put the final answer in ``\\boxed{}``, one per line, with
        surrounding whitespace stripped.
    """
    desc = sample.get('description')
    question = sample.get('question')
    desc = '' if desc is None else desc
    question = '' if question is None else question
    prompt = f"""Look at the image and answer the physics question.
{desc}
{question}
Please reason step by step, and put your final answer within \\boxed{{}}.
"""
    return prompt.strip()
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Worker: load the model on one GPU and run inference on ``samples``.

    One JSON object per sample is written to ``output_file``.  Records are
    written and flushed as they are produced, so a crash part-way through
    keeps the samples that were already processed.

    Args:
        gpu_id: CUDA device index; the model is placed on ``cuda:{gpu_id}``.
        model_path: Local directory of the model and processor (loaded with
            ``local_files_only=True``).
        samples: Iterable of sample dicts.  Each needs ``index``, ``image``,
            ``category``, ``question`` and ``ground_truth_value``;
            ``subfield`` and ``ground_truth_letter`` are optional.
        output_file: Path of the JSONL file to (over)write.
        model_name: Label stored in every record and used in log lines.

    Returns:
        The number of records written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    tag = f"[{model_name}][GPU {gpu_id}]"
    print(f"{tag} Loading model...", flush=True)
    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,      # = 4 * 28 * 28
        max_pixels=200704,    # = 256 * 28 * 28
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    total = len(samples)
    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

    written = 0
    with open(output_file, 'w', encoding='utf-8') as out:
        for i, sample in enumerate(samples):
            idx = sample['index']
            prompt_text = build_open_ended_prompt(sample)
            image_path = os.path.join(IMAGE_DIR, sample['image'])
            # Build messages
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt_text},
                ],
            }]
            try:
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(device)
                with torch.no_grad():
                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
                # Drop the prompt tokens; keep only the newly generated ones.
                generated = output_ids[0][inputs.input_ids.shape[1]:]
                response = processor.decode(generated, skip_special_tokens=True)
            except Exception as e:
                # Best effort: one failing sample must not abort the chunk.
                # The error text is stored in place of the model output.
                response = f"ERROR: {str(e)}"
            result = {
                "index": idx,
                "category": sample['category'],
                "subfield": sample.get('subfield', ''),
                "question": sample['question'],
                "ground_truth_value": sample['ground_truth_value'],
                "ground_truth_letter": sample.get('ground_truth_letter', ''),
                "model_output": response,
                "model_name": model_name,
                "gpu_id": gpu_id,
            }
            out.write(json.dumps(result, ensure_ascii=False) + '\n')
            out.flush()
            written += 1
            if (i + 1) % 20 == 0 or (i + 1) == total:
                print(f"{tag} {i+1}/{total} done", flush=True)

    print(f"{tag} Saved {written} results to {output_file}", flush=True)
    return written
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` across ``gpu_ids`` and run one inference worker per GPU.

    The samples are cut into ``len(gpu_ids)`` contiguous chunks of at most
    ``ceil(len(samples) / len(gpu_ids))`` items.  Every non-empty chunk is
    handed to its own ``worker_inference`` process, which writes its results
    to ``f"{output_base}_gpu{gpu_id}.jsonl"``.  The call blocks until every
    worker has exited.

    Args:
        model_path: Passed through to ``worker_inference``.
        model_name: Passed through to ``worker_inference``; also used in logs.
        gpu_ids: GPU indices, one worker per entry.  Must not be empty.
        samples: Sequence of test samples (must support ``len`` and slicing).
        output_base: Path prefix of the per-GPU output files.

    Returns:
        The per-GPU output paths that were assigned to a worker, in
        ``gpu_ids`` order.  GPUs whose chunk was empty are not listed.  A path
        is listed even if its worker failed, so the file may not exist.

    Raises:
        ValueError: If ``gpu_ids`` is empty (previously a ZeroDivisionError).
    """
    gpu_ids = list(gpu_ids)
    if not gpu_ids:
        raise ValueError("gpu_ids must contain at least one GPU id")

    # Ceiling division so the chunks cover every sample.
    chunk_size = -(-len(samples) // len(gpu_ids))
    workers = []
    output_files = []
    for slot, gpu_id in enumerate(gpu_ids):
        chunk = samples[slot * chunk_size:(slot + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name),
        )
        workers.append((gpu_id, proc))

    for _, proc in workers:
        proc.start()
    for gpu_id, proc in workers:
        proc.join()
        # A non-zero exit code means the worker died before finishing; say so
        # instead of letting the merge step silently produce fewer results.
        if proc.exitcode != 0:
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with "
                f"code {proc.exitcode}; its results may be missing or incomplete",
                flush=True,
            )
    return output_files
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL result files into a single file sorted by ``index``.

    Args:
        output_files: Paths of the per-GPU JSONL files.  Paths that do not
            exist are skipped, and a warning listing them is printed.
        final_output: Path of the merged JSONL file (overwritten).

    Returns:
        The merged records as a list of dicts, sorted by their ``index`` key.

    Side effects:
        Writes ``final_output`` and deletes every existing per-GPU file.
    """
    all_results = []
    missing = []
    for path in output_files:
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                all_results.extend(json.loads(line) for line in fh if line.strip())
        except FileNotFoundError:
            missing.append(path)
    if missing:
        # A missing shard usually means a worker crashed; make that visible.
        print(
            f"WARNING: {len(missing)} per-GPU result file(s) not found, "
            f"merged output is incomplete: {missing}",
            flush=True,
        )

    # Sort by index for consistency
    all_results.sort(key=lambda record: record['index'])
    with open(final_output, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in all_results
        )

    # Cleanup per-GPU files
    for path in output_files:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
    return all_results
def main():
    """Run open-ended inference for the Base and SFT models and merge the shards.

    Loads the test samples, runs the Base model on ``BASE_GPUS`` and then the
    SFT model on ``SFT_GPUS`` (one worker process per GPU), and merges each
    model's per-GPU files into a single JSONL file inside ``OUTPUT_DIR``.
    """
    # 'spawn' starts every worker in a fresh interpreter.
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("=" * 60)
    print("  OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f"  Base model: {BASE_MODEL}")
    print(f"  SFT model:  {SFT_MODEL}")
    print(f"  Base GPUs:  {BASE_GPUS}")
    print(f"  SFT GPUs:   {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")
    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f"  {cat}: {cnt}")

    t0 = time.time()
    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
    sft_output = os.path.join(OUTPUT_DIR, "inference_results_sft")

    # run_model_parallel blocks until its workers exit, so the two models run
    # one after the other; each call fans out over its own GPU list.
    print("\n>>> Starting BASE model inference...", flush=True)
    base_files = run_model_parallel(BASE_MODEL, "base", BASE_GPUS, samples, base_output)
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards that were assigned to workers.
    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_sft.jsonl")
    base_results = merge_results(base_files, base_final)
    sft_results = merge_results(sft_files, sft_final)

    # A shortfall means at least one worker did not finish its chunk.
    for label, merged in (("Base", base_results), ("SFT", sft_results)):
        if len(merged) != len(samples):
            print(
                f"  WARNING: {label} produced {len(merged)} results "
                f"for {len(samples)} samples",
                flush=True,
            )

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f"  INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(f"  Base results: {len(base_results)} → {base_final}")
    print(f"  SFT results:  {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")


if __name__ == '__main__':
    main()

eval_footprint/eval_openended_judge.py ADDED Viewed

	@@ -0,0 +1,467 @@

+#!/usr/bin/env python3
+"""
+Phase 2: Score open-ended inference results.
+Two-stage scoring (adapted from eval_dual.py + MetaPhyX DeepSeek judge):
+  Stage 1: Rule-based (boxed extraction + normalization + numeric tolerance)
+           - If CORRECT → done, count as correct
+           - If WRONG or UNCERTAIN → go to Stage 2
+  Stage 2: Gemini 2.5 Flash LLM-as-Judge
+           - Sends model's full response + ground truth to Gemini
+           - Gemini determines [[YES]] or [[NO]] equivalence
+Usage:
+    python eval_openended_judge.py [--results_dir PATH] [--api_key KEY]
+Inputs:
+    inference_results_base.jsonl
+    inference_results_sft.jsonl
+Outputs:
+    scored_results_base.jsonl
+    scored_results_sft.jsonl
+    comparison_report.json
+"""
+import json, os, re, time, sys, argparse
+from collections import defaultdict, Counter
# ===================== CONFIG =====================
# SECURITY: the API key is read from the environment only (the usage line in
# the module docstring also mentions an --api_key option).  A key that used to
# be hard-coded here as the fallback has been removed; because it was
# committed, it must be treated as compromised and revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = "gemini-2.5-flash"
MAX_RETRIES = 3
RATE_LIMIT_DELAY = 0.5  # seconds between Gemini calls
+# ===================== RULE-BASED SCORING =====================
+# Adapted from eval_dual.py (verl/utils/reward_score/utils/utils.py approach)
+def _strip_string(string):
+    """Normalize math string: remove LaTeX formatting, units, whitespace."""
+    string = string.replace("\n", "")
+    string = string.replace("\\!", "")
+    string = string.replace("\\\\", "\\")
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+    string = string.replace("^{\\circ}", "")
+    string = string.replace("^\\circ", "")
+    string = string.replace("\\$", "")
+    if "\\text{ " in string:
+        splits = string.split("\\text{ ")
+        if len(splits) == 2:
+            string = splits[0]
+    string = string.replace("\\%", "")
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+    if len(string.split("=")) == 2:
+        if len(string.split("=")[0]) <= 2:
+            string = string.split("=")[1]
+    string = string.replace(" ", "")
+    return string
+def _normalize(expr):
+    """Normalize answer expression for comparison."""
+    if expr is None:
+        return None
+    m = re.search("^\\\\text\\{(?P<text>.+?)\\}$", expr)
+    if m is not None:
+        expr = m.group("text")
+    expr = expr.replace("\\%", "%")
+    expr = expr.replace("\\$", "$")
+    expr = expr.replace("$", "")
+    expr = expr.replace("%", "")
+    expr = expr.replace(" or ", " , ")
+    expr = expr.replace(" and ", " , ")
+    for unit in ["degree", "cm", "centimeter", "meter", "mile", "second", "minute",
+                 "hour", "day", "week", "month", "year", "foot", "feet", "inch", "yard",
+                 "newton", "joule", "watt", "ampere", "volt", "ohm", "hertz",
+                 "kilogram", "gram", "liter", "mole", "kelvin", "pascal",
+                 "m/s", "km/h", "rad/s", "N", "J", "W", "A", "V", "Hz", "Pa", "kg", "mol"]:
+        expr = re.sub(f"\\s*{re.escape(unit)}(es)?(s)?\\s*(\\^[0-9]+)?", "", expr, flags=re.IGNORECASE)
+    if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
+        expr = expr[1:-1]
+    try:
+        if "." in expr:
+            val = float(expr)
+            if abs(val - int(round(val))) <= 1e-7:
+                expr = str(int(round(val)))
+    except:
+        pass
+    expr = re.sub("- *", "-", expr)
+    expr = expr.replace(" ", "")
+    expr = expr.replace("{", "")
+    expr = expr.replace("}", "")
+    expr = expr.lower()
+    return expr
def extract_boxed_answer(text):
    """Extract the content of the last ``\\boxed{...}`` in ``text``.

    If the text has no ``\\boxed`` at all, the last ``\\fbox{...}`` is used
    instead.  (The fallback previously could never succeed, because the final
    check only accepted the ``\\boxed{`` prefix.)  Whitespace between the
    command and its opening brace is allowed.  Braces are matched by simple
    depth counting.

    Args:
        text: The text to search.

    Returns:
        The text between the outer braces, or ``None`` when no command is
        found, it is not followed by ``{``, or the braces never close.
    """
    for command in ("\\boxed", "\\fbox"):
        start = text.rfind(command)
        if start >= 0:
            break
    else:
        return None

    # Locate the opening brace that belongs to the command.
    open_idx = start + len(command)
    while open_idx < len(text) and text[open_idx].isspace():
        open_idx += 1
    if open_idx >= len(text) or text[open_idx] != "{":
        return None

    depth = 0
    for pos in range(open_idx, len(text)):
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[open_idx + 1:pos]
    return None
def extract_answer_from_text(text):
    """Extract the final answer from a model response.

    Priority 1 is the last ``\\boxed{}`` (via ``extract_boxed_answer``).
    Priority 2 is a set of "the answer is ..." / "therefore ..." phrases.

    Differences from the earlier phrase matching:
      * the captured answer ends at a sentence-ending period or the end of the
        line; a period followed by a digit is kept, so "3.5" is no longer cut
        down to "3";
      * the connectives (therefore/thus/so/hence) must be whole words, so the
        "so" inside "also" no longer matches;
      * the last matching phrase wins, consistent with using the last
        ``\\boxed{}``.

    Args:
        text: The full model response.  If it contains both ``<think>`` and
            ``</think>``, only the part after the last ``</think>`` is used.

    Returns:
        The extracted answer string, or ``None`` if nothing was found.
    """
    # Handle <think>...</think>
    if '<think>' in text and '</think>' in text:
        text = text.split('</think>')[-1]

    # Priority 1: \boxed{}
    boxed = extract_boxed_answer(text)
    if boxed:
        return boxed

    # Priority 2: Common answer patterns
    patterns = (
        r'(?:the answer is|answer is|答案是|答案为)[:\s]*(.+?)(?:\.(?!\d)|$)',
        r'\b(?:therefore|thus|so|hence)\b[,\s]+(?:the answer is\s+)?(.+?)(?:\.(?!\d)|$)',
    )
    for pattern in patterns:
        candidates = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
        for candidate in reversed(candidates):
            candidate = candidate.strip()
            # Very long captures are sentences of reasoning, not answers.
            if candidate and len(candidate) < 100:
                return candidate
    return None
def rule_based_score(prediction, ground_truth):
    """Rule-based scoring: extract the answer, normalize, and compare.

    Checks, in order: exact match of the normalized strings, numeric equality
    (absolute difference below 1e-6 or relative difference below 1%), and,
    for short non-numeric ground truths only, whole-word containment of the
    ground truth in the extracted answer.

    Fixes over the earlier version:
      * containment is tested against the extracted answer rather than the
        whole response, where a short ground truth such as "2" or "A" matched
        almost any text;
      * numeric ground truths skip containment, since "5" would otherwise
        match inside "1.5" or "-5";
      * an empty ground truth can no longer match anything;
      * two answers that both normalize to the empty string are not treated
        as an exact match.

    Args:
        prediction: The model's full response text.
        ground_truth: The reference answer.  Non-string values (for example
            JSON numbers) are converted with ``str``.

    Returns:
        ``(is_correct, reason)``: a bool and a short reason string.
    """
    if ground_truth is not None and not isinstance(ground_truth, str):
        ground_truth = str(ground_truth)

    model_answer = extract_answer_from_text(prediction)
    if model_answer is None:
        return False, "no_answer_extracted"
    gt_norm = _normalize(ground_truth)
    pred_norm = _normalize(model_answer)
    if gt_norm is None or pred_norm is None:
        return False, "normalize_failed"

    # Direct match after normalization (an empty string carries no answer).
    if gt_norm and gt_norm == pred_norm:
        return True, "exact_match"

    # Numeric comparison (1% tolerance)
    gt_is_numeric = False
    try:
        gt_float = float(gt_norm.replace(",", ""))
        gt_is_numeric = True
        pred_float = float(pred_norm.replace(",", ""))
    except ValueError:
        pass
    else:
        if abs(gt_float - pred_float) < 1e-6:
            return True, "numeric_match"
        if gt_float != 0 and abs((gt_float - pred_float) / gt_float) < 0.01:
            return True, "numeric_close"

    # Short answer containment (e.g., "III", "decreasing")
    gt_clean = ground_truth.strip()
    if gt_clean and len(gt_clean) <= 10 and not gt_is_numeric:
        if re.search(r'\b' + re.escape(gt_clean) + r'\b', model_answer, re.IGNORECASE):
            return True, "containment_match"
    return False, f"no_match(pred={pred_norm[:30]},gt={gt_norm[:30]})"
+# ===================== GEMINI LLM-AS-JUDGE =====================
+# Adapted from eval_dual.py + MetaPhyX deepscaler ORM prompt
+ORM_PROMPT = """You are an expert in verifying if two physics answers are the same.
+Your input is a physics question prompt and two answers:
+- Answer 1: the model's prediction
+- Answer 2: the ground truth answer
+Determine if they are equivalent.
+Guidelines for equivalence:
+- Different forms of the same number (0.5 = 1/2 = 50%)
+- Same physical quantity with different units or notation (7.55N = 7.55 N = 7.55 newtons)
+- Semantically equivalent descriptions ("point III" and "III", "decreasing" and "the velocity is decreasing")
+- Algebraically equivalent expressions (x+1)^2 = x^2+2x+1
+- Same choice letter or option name
+- Correct numerical value even if formatting differs
+- Minor rounding differences within 2% are acceptable
+Your output must follow this format:
+1) Brief explanation for why the answers are equivalent or not.
+2) Final answer: [[YES]] or [[NO]]
+"""
def call_gemini(prompt, api_key):
    """Call the Gemini ``generateContent`` REST endpoint using only urllib.

    The API key is sent in the ``x-goog-api-key`` request header instead of
    the URL query string, so it does not end up in logged or echoed URLs.

    Up to ``MAX_RETRIES`` attempts are made.  HTTP 429 waits 5s, 10s, ...
    before the next attempt; other server or network errors wait 2s.  Client
    errors that cannot succeed on retry (400/401/403/404) and responses that
    contain no text candidate return ``None`` immediately.

    Args:
        prompt: The full prompt text.
        api_key: Gemini API key.  An empty key returns ``None`` without a call.

    Returns:
        The stripped text of the first candidate, or ``None`` on failure.
    """
    import urllib.request, urllib.error

    if not api_key:
        print("    Error: no Gemini API key configured")
        return None

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent"
    payload = json.dumps({
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            "temperature": 0.0,
            "maxOutputTokens": 512,
        }
    }).encode('utf-8')
    req = urllib.request.Request(
        url, data=payload,
        headers={
            "Content-Type": "application/json",
            "x-goog-api-key": api_key,
        },
        method="POST",
    )
    for attempt in range(MAX_RETRIES):
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                result = json.loads(resp.read().decode('utf-8'))
        except urllib.error.HTTPError as e:
            if e.code == 429:
                wait = (attempt + 1) * 5
                print(f"    Rate limited, waiting {wait}s...")
            else:
                print(f"    HTTP error {e.code}")
                if e.code in (400, 401, 403, 404):
                    # Bad request or credentials: a retry cannot succeed.
                    return None
                wait = 2
        except Exception as e:
            print(f"    Error: {e}")
            wait = 2
        else:
            try:
                return result['candidates'][0]['content']['parts'][0]['text'].strip()
            except (KeyError, IndexError, TypeError, AttributeError):
                # The response carried no text part; the same request would
                # give the same result, so do not retry.
                print("    Error: response contained no text candidate")
                return None
        # No point sleeping after the final attempt.
        if attempt < MAX_RETRIES - 1:
            time.sleep(wait)
    return None
def gemini_judge(prediction, ground_truth, api_key, question=None, max_chars=2000):
    """Use Gemini to judge whether the model's prediction matches the ground truth.

    Args:
        prediction: The model's full response.
        ground_truth: The reference answer.
        api_key: Gemini API key, passed to ``call_gemini``.
        question: Optional question text.  When given it is included in the
            message, since the judge prompt refers to a question.
        max_chars: Maximum number of response characters sent to the judge.
            When the response is longer, its END is kept: the final answer
            comes last, and the earlier ``prediction[:2000]`` cut it off.

    Returns:
        ``(is_match, reason)``.  ``reason`` is the first 200 characters of the
        judge's reply, or ``"api_error"`` when the call failed.
    """
    if 0 < max_chars < len(prediction):
        excerpt = "[... earlier reasoning truncated ...]\n" + prediction[-max_chars:]
    else:
        excerpt = prediction
    question_block = f"Question: {question}\n" if question else ""
    user_msg = f"""
{question_block}Model's full response (contains reasoning and answer):
{excerpt}
Ground truth answer: {ground_truth}
"""
    response = call_gemini(ORM_PROMPT + "\n\n" + user_msg, api_key)
    if response is None:
        return False, "api_error"

    # The verdict is the last marker in the reply; an earlier marker may only
    # be part of the explanation.
    yes_pos = response.rfind("[[YES]]")
    no_pos = response.rfind("[[NO]]")
    if yes_pos >= 0 or no_pos >= 0:
        return yes_pos > no_pos, response[:200]

    # No marker: accept only an unambiguous "yes" (deliberately conservative).
    lower = response.lower()
    if "yes" in lower and "no" not in lower:
        return True, response[:200]
    return False, response[:200]
+# ===================== MAIN EVALUATION =====================
+def score_model(results, model_name, api_key, output_file):
+    """
+    Score all results using two-stage approach:
+      1. Rule-based first → if correct, DONE
+      2. If rule-based says wrong/uncertain → Gemini fallback
+    """
+    print(f"\n{'='*60}")
+    print(f"  Scoring: {model_name} ({len(results)} samples)")
+    print(f"{'='*60}")
+    rule_correct = 0
+    rule_wrong_gemini_correct = 0
+    rule_wrong_gemini_wrong = 0
+    gemini_errors = 0
+    total = len(results)
+    cat_stats = defaultdict(lambda: {'total': 0, 'rule_correct': 0, 'gemini_correct': 0, 'final_correct': 0})
+    for i, r in enumerate(results):
+        cat = r.get('category', 'Unknown')
+        pred = r.get('model_output', '')
+        gt = r.get('ground_truth_value', '')
+        cat_stats[cat]['total'] += 1
+        # === Stage 1: Rule-based ===
+        rule_match, rule_reason = rule_based_score(pred, gt)
+        r['rule_match'] = rule_match
+        r['rule_reason'] = rule_reason
+        if rule_match:
+            # Rule says CORRECT → done
+            rule_correct += 1
+            cat_stats[cat]['rule_correct'] += 1
+            cat_stats[cat]['final_correct'] += 1
+            r['final_correct'] = True
+            r['final_method'] = f"rule:{rule_reason}"
+            r['gemini_called'] = False
+        else:
+            # Rule says WRONG → Gemini fallback
+            r['gemini_called'] = True
+            gemini_match, gemini_reason = gemini_judge(pred, gt, api_key)
+            r['gemini_match'] = gemini_match
+            r['gemini_reason'] = gemini_reason
+            if gemini_match:
+                rule_wrong_gemini_correct += 1
+                cat_stats[cat]['gemini_correct'] += 1
+                cat_stats[cat]['final_correct'] += 1
+                r['final_correct'] = True
+                r['final_method'] = "gemini_override"
+            else:
+                rule_wrong_gemini_wrong += 1
+                r['final_correct'] = False
+                r['final_method'] = f"wrong:{rule_reason}"
+            time.sleep(RATE_LIMIT_DELAY)
+        # Progress
+        final_correct_so_far = rule_correct + rule_wrong_gemini_correct
+        if (i + 1) % 10 == 0 or (i + 1) == total:
+            acc_so_far = final_correct_so_far / (i + 1)
+            print(f"  [{i+1}/{total}] acc={acc_so_far:.1%} "
+                  f"(rule✓={rule_correct} gemini✓={rule_wrong_gemini_correct} ✗={rule_wrong_gemini_wrong})",
+                  flush=True)
+    # Save scored results
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    # Summary
+    final_correct = rule_correct + rule_wrong_gemini_correct
+    final_acc = final_correct / total if total > 0 else 0
+    print(f"\n{'─'*60}")
+    print(f"  {model_name} — RESULTS")
+    print(f"{'─'*60}")
+    print(f"  Rule-based correct : {rule_correct}/{total} ({100*rule_correct/total:.1f}%)")
+    print(f"  Gemini rescued     : {rule_wrong_gemini_correct} (rule wrong → Gemini correct)")
+    print(f"  Final accuracy     : {final_correct}/{total} ({100*final_acc:.1f}%)")
+    print(f"  Gemini calls made  : {rule_wrong_gemini_correct + rule_wrong_gemini_wrong}")
+    print(f"\n  Per-category:")
+    for cat, s in sorted(cat_stats.items()):
+        acc = s['final_correct'] / s['total'] if s['total'] > 0 else 0
+        print(f"    {cat:25s}: {s['final_correct']}/{s['total']} ({acc:.1%})"
+              f"  [rule={s['rule_correct']}, gemini+={s['gemini_correct']}]")
+    return {
+        'model': model_name,
+        'total': total,
+        'rule_correct': rule_correct,
+        'gemini_rescued': rule_wrong_gemini_correct,
+        'final_correct': final_correct,
+        'final_acc': round(100 * final_acc, 2),
+        'category_stats': {cat: dict(s) for cat, s in cat_stats.items()},
+    }
+def main():
+    """Score the base and SFT inference results and write a comparison report.
+    Reads ``inference_results_base.jsonl`` and ``inference_results_sft.jsonl``
+    from the results directory, scores each with ``score_model`` and writes
+    ``scored_results_*.jsonl`` plus ``comparison_report.json`` next to them.
+    Exits with status 1 when no results directory can be located.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--results_dir', type=str, default=None)
+    parser.add_argument('--api_key', type=str, default=None)
+    args = parser.parse_args()
+    # Command-line key wins; otherwise fall back to the module-level constant.
+    api_key = args.api_key or GEMINI_API_KEY
+    # Find results directory
+    results_dir = args.results_dir
+    if results_dir is None:
+        # Probe the script's own directory first, then the cluster default.
+        for d in [os.path.dirname(os.path.abspath(__file__)),
+                  '/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/']:
+            if os.path.exists(os.path.join(d, 'inference_results_base.jsonl')):
+                results_dir = d
+                break
+        if results_dir is None:
+            print("ERROR: Cannot find inference results. Use --results_dir")
+            sys.exit(1)
+    print("=" * 60)
+    print("  OPEN-ENDED EVAL: Rule-based + Gemini 2.5 Flash Judge")
+    print(f"  Results dir: {results_dir}")
+    print("=" * 60)
+    # Load test data for context
+    # NOTE(review): test_data is only counted below and never used afterwards;
+    # each line is also parsed twice inside the comprehension.
+    test_file = os.path.join(results_dir, 'test_1533_openended.jsonl')
+    if os.path.exists(test_file):
+        with open(test_file, 'r') as f:
+            test_data = {json.loads(l)['index']: json.loads(l) for l in f if l.strip()}
+        print(f"Test data loaded: {len(test_data)} samples")
+    # Load and score base model
+    base_file = os.path.join(results_dir, 'inference_results_base.jsonl')
+    with open(base_file, 'r') as f:
+        base_results = [json.loads(l) for l in f if l.strip()]
+    base_scored_file = os.path.join(results_dir, 'scored_results_base.jsonl')
+    base_stats = score_model(base_results, "Qwen2.5-VL-3B (Base)", api_key, base_scored_file)
+    # Load and score SFT model
+    # NOTE(review): unlike the base file, the SFT file's existence is not
+    # checked beforehand; a missing file raises here after the base run finished.
+    sft_file = os.path.join(results_dir, 'inference_results_sft.jsonl')
+    with open(sft_file, 'r') as f:
+        sft_results = [json.loads(l) for l in f if l.strip()]
+    sft_scored_file = os.path.join(results_dir, 'scored_results_sft.jsonl')
+    sft_stats = score_model(sft_results, "Qwen2.5-VL-3B (SFT)", api_key, sft_scored_file)
+    # Comparison
+    # delta is a difference of two percentages, i.e. percentage points.
+    delta = sft_stats['final_acc'] - base_stats['final_acc']
+    report = {
+        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'scoring_method': 'rule-based + Gemini 2.5 Flash judge (fallback)',
+        'base': base_stats,
+        'sft': sft_stats,
+        'improvement': f"{delta:+.2f}%",
+    }
+    report_file = os.path.join(results_dir, 'comparison_report.json')
+    with open(report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2, ensure_ascii=False)
+    print(f"\n{'='*60}")
+    print(f"  FINAL COMPARISON")
+    print(f"{'='*60}")
+    print(f"  Base accuracy:  {base_stats['final_acc']}% ({base_stats['final_correct']}/{base_stats['total']})")
+    print(f"  SFT accuracy:   {sft_stats['final_acc']}% ({sft_stats['final_correct']}/{sft_stats['total']})")
+    print(f"  Improvement:    {delta:+.2f}%")
+    print(f"\n  Per-category:")
+    # Union of categories so one seen by only one model still gets a row.
+    all_cats = sorted(set(list(base_stats['category_stats'].keys()) + list(sft_stats['category_stats'].keys())))
+    for cat in all_cats:
+        b = base_stats['category_stats'].get(cat, {'final_correct': 0, 'total': 0})
+        s = sft_stats['category_stats'].get(cat, {'final_correct': 0, 'total': 0})
+        b_acc = b['final_correct'] / b['total'] if b['total'] > 0 else 0
+        s_acc = s['final_correct'] / s['total'] if s['total'] > 0 else 0
+        print(f"    {cat:25s}  Base: {b_acc:.1%}  SFT: {s_acc:.1%}  Δ: {(s_acc-b_acc)*100:+.1f}%")
+    print(f"\n  Report: {report_file}")
+    print(f"{'='*60}")
+if __name__ == '__main__':
+    main()

eval_footprint/eval_sft_only.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import json, os, re, time, sys, ast
+from collections import defaultdict
+import urllib.request
+DEEPSEEK_API_KEY = "sk-6364e2b3116241c59577191c32b09021"
+DEEPSEEK_MODEL = "deepseek-chat"
+DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
+RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+SFT_RESULTS = os.path.join(RESULTS_DIR, "inference_results_sft.jsonl")
+FAIL_MSG = "Failed to obtain answer via API."
+RETRY = 5
+def get_ICE():
+    """Return the five in-context examples appended to the judge prompt.
+    Each example is a ground-truth / predicted-answer pair followed by the
+    expected judgement (1 = matched, 0 = unmatched). The spelling
+    "Judegement" is part of the prompt text and is kept as is.
+    """
+    example_1 = "\nGround truth answer: 502 \\n\nPredicted answer: The mass of block (B) is:\n[\n\\\\boxed{ 50 \\\\sqrt{101} }\n] \\n\nJudegement: 1\n"
+    example_2 = "\nGround truth answer: 46.3 kN \\n\nPredicted answer: The tension ( T_B ) in the cable is approximately:\n[\n\\\\boxed{46300 }\n] \\n\nJudegement: 1\n"
+    example_3 = "\nGround truth answer: 12 m/s \\n\nPredicted answer: The speed of the box after 2.00 seconds is:\n[\n\\\\boxed{11.3, \\\\text{m/s}}\n] \\n\nJudegement: 0\n"
+    example_4 = "\nGround truth answer: 36.00 kg \\n\nPredicted answer: The mass of the hanging block ( m_2 ) must be approximately:\n[\n\\\\boxed{36.1, \\\\text\\\\{kg\\\\}}\n] \\n\nJudegement: 1\n"
+    example_5 = "\nGround truth answer: 3.2 m \\n\nPredicted answer: The stuntman and villain slide approximately \\\\frac{10}{3.1415} meters**.\nJudegement: 1\n"
+    return [example_1, example_2, example_3, example_4, example_5]
+def build_phyx_gpt4_prompt(gt_answer, pred):
+    """Build the judge prompt for one ground-truth / prediction pair.
+    The prompt is the task description, then every example from ``get_ICE``,
+    then the pair to judge, ending with an open "Judegement:" line for the
+    judge model to complete.
+    """
+    task_description = "\nPlease read the following example. Given predicted answer and ground truth answer,\ncompare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.\nIf the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.\nIf the given predicted mentions \"approximately\", then allow the Approximation Error, \\\nsuch as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \\n\n\n"
+    prompt = task_description
+    for ex in get_ICE():
+        prompt += ex + "\n"
+    prompt += "Ground truth answer: {} \n".format(gt_answer)
+    prompt += "Predicted answer: {} \n".format(pred)
+    prompt += "Judegement:"
+    return prompt
+def mapping_str(s):
+    """Normalise LaTeX in an answer: ``\\dfrac`` becomes ``\\frac`` and ``\\pi`` becomes ``3.14``."""
+    return s.replace("\\dfrac", "\\frac").replace("\\pi", "3.14")
+def extract_boxed_content(s):
+    """Return the text inside the first ``\\boxed{...}`` in ``s``.
+    Nested braces are kept as part of the content. Returns ``None`` when
+    there is no ``\\boxed{`` or its closing brace is never found.
+    """
+    start = s.find(r"\boxed{")
+    if start == -1: return None
+    rest = s[start + len(r"\boxed{"):]
+    # depth counts braces opened inside the box; a "}" seen at depth 0 is the
+    # one that closes \boxed{ itself.
+    depth = 0
+    for i, ch in enumerate(rest):
+        if ch == "{": depth += 1
+        elif ch == "}":
+            if depth == 0: return rest[:i]
+            else: depth -= 1
+    return None
+def call_deepseek(prompt, temperature=0.0):
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {DEEPSEEK_API_KEY}"}
+    data = json.dumps({"model": DEEPSEEK_MODEL, "messages": [{"role": "user", "content": prompt}], "temperature": temperature, "max_tokens": 200}).encode("utf-8")
+    try:
+        req = urllib.request.Request(DEEPSEEK_API_URL, data=data, headers=headers)
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return json.loads(resp.read().decode())["choices"][0]["message"]["content"]
+    except:
+        return FAIL_MSG
+def PhyX_auxeval(gt_answer, prediction):
+    log = ""
+    pred = prediction.strip()
+    boxed = extract_boxed_content(pred)
+    if boxed is not None:
+        extracted = mapping_str(boxed)
+    else:
+        m = re.search(r"\b(?:final\s+answer|correct\s+answer)\b[^:]*[:]\s*(.*?)(?=\n\n\n|\Z)", pred, re.IGNORECASE | re.DOTALL)
+        extracted = mapping_str(m.group(1)) if m else pred
+    if str(gt_answer).strip().lower() == extracted.strip().lower():
+        return dict(log="Matched at string level", res=1, extracted=extracted)
+    prompt = build_phyx_gpt4_prompt(gt_answer, extracted)
+    for i in range(RETRY):
+        res = call_deepseek(prompt, temperature=i * 0.5)
+        if FAIL_MSG not in res:
+            if "1" in res: return dict(log="Semantic equal via LLM", res=1, extracted=extracted)
+            elif "0" in res: return dict(log=f"LLM judgement {res}", res=0, extracted=extracted)
+    return dict(log="All retries failed", res=0, extracted=extracted)
+# Main
+results = []
+with open(SFT_RESULTS, "r") as f:
+    for line in f:
+        if line.strip(): results.append(json.loads(line))
+print(f"Scoring SFT model ({len(results)} samples)")
+hit = 0
+cat_stats = defaultdict(lambda: {"total": 0, "correct": 0})
+scored = []
+for i, r in enumerate(results):
+    gt = r["ground_truth_value"]
+    ev = PhyX_auxeval(gt, r["model_output"])
+    r["extracted_answer"] = ev["extracted"]
+    r["eval_log"] = ev["log"]
+    r["res"] = ev["res"]
+    if ev["res"] == 1:
+        hit += 1
+        cat_stats[r.get("category", "unknown")]["correct"] += 1
+    cat_stats[r.get("category", "unknown")]["total"] += 1
+    scored.append(r)
+    if (i+1) % 50 == 0:
+        print(f"  [{i+1}/{len(results)}] acc={hit/(i+1)*100:.1f}%", flush=True)
+acc = hit / len(results) * 100
+print(f"\nSFT Final: {hit}/{len(results)} ({acc:.1f}%)")
+for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]["total"]):
+    print(f"  {cat}: {s['correct']}/{s['total']} ({s['correct']/s['total']*100:.1f}%)")
+with open(os.path.join(RESULTS_DIR, "scored_results_sft_phyx.jsonl"), "w") as f:
+    for r in scored: f.write(json.dumps(r, ensure_ascii=False) + "\n")
+json.dump({"model": "SFT", "total": len(results), "correct": hit, "acc": round(acc, 2),
+           "categories": {k: dict(v) for k,v in cat_stats.items()}},
+          open(os.path.join(RESULTS_DIR, "sft_report_phyx.json"), "w"), indent=2, ensure_ascii=False)

eval_footprint/eval_sft_vs_base_multigpu.py ADDED Viewed

	@@ -0,0 +1,359 @@

+#!/usr/bin/env python3
+"""
+Multi-GPU parallel evaluation: Base vs SFT on 1533 physics test set.
+Strategy:
+  - Base model: 4 workers on GPU 0,1,2,3  (each processes ~383 samples)
+  - SFT  model: 4 workers on GPU 4,5,6,7  (each processes ~383 samples)
+  - All 8 workers run SIMULTANEOUSLY → ~8x speedup vs single GPU
+"""
+import json
+import os
+import sys
+import time
+# CRITICAL: Set offline mode before importing transformers
+os.environ['HF_HUB_OFFLINE'] = '1'
+os.environ['TRANSFORMERS_OFFLINE'] = '1'
+import pandas as pd
+import numpy as np
+import torch
+import re
+from PIL import Image
+from collections import defaultdict
+from multiprocessing import Process, Queue
+# ============ CONFIG ============
+TEST_PARQUET = "/workspace/rl4phyx/RL4Phyx/SFT/eval_data/test_1533.parquet"
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
+MODELS = {
+    "base": "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct",
+    "sft":  "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b/merged",
+}
+# GPU assignment: base on GPUs 0-3, sft on GPUs 4-7
+MODEL_GPUS = {
+    "base": [0, 1, 2, 3],
+    "sft":  [4, 5, 6, 7],
+}
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/eval_results"
+MAX_NEW_TOKENS = 1024
+# ================================
+def extract_choice(solution_str):
+    """Extract A/B/C/D from model output.
+    Tried in order: a ``\\boxed{X}`` letter, then a few "answer is X" style
+    phrases, then the last standalone A-D letter anywhere in the text.
+    Returns the upper-case letter, or ``None`` when nothing matches or the
+    input is empty.
+    """
+    if not solution_str:
+        return None
+    boxed = re.search(r'\\boxed\{([ABCD])\}', solution_str, re.IGNORECASE)
+    if boxed:
+        return boxed.group(1).upper()
+    patterns = [
+        r'(?:answer|choice)[是为:\s]*([ABCD])\b',
+        r'\b([ABCD])\s*(?:is correct|is the correct)',
+        r'(?:correct answer is)\s*([ABCD])\b',
+    ]
+    for p in patterns:
+        match = re.search(p, solution_str, re.IGNORECASE)
+        if match:
+            return match.group(1).upper()
+    # Last resort: the final standalone letter.
+    # NOTE(review): the text is upper-cased first, so the English article "a"
+    # also matches here and can be returned as choice A.
+    matches = re.findall(r'\b([ABCD])\b', solution_str.upper())
+    if matches:
+        return matches[-1]
+    return None
+def compute_score(solution_str, ground_truth):
+    """Score MCQ: 1.0 if correct, 0.0 otherwise.
+    When the output contains a ``<think>...</think>`` section, only the text
+    after the last ``</think>`` is searched for the choice. An output with no
+    extractable choice scores 0.0.
+    """
+    if '<think>' in solution_str and '</think>' in solution_str:
+        solution_str = solution_str.split('</think>')[-1]
+    model_choice = extract_choice(solution_str)
+    correct_choice = ground_truth.strip().upper()
+    if model_choice is None:
+        return 0.0
+    return 1.0 if model_choice == correct_choice else 0.0
+def load_test_data(parquet_path):
+    """Load test parquet and return list of dicts.
+    Each dict has the keys ``index``, ``category``, ``image_path``,
+    ``ground_truth`` and ``prompt_text``.
+    """
+    df = pd.read_parquet(parquet_path)
+    samples = []
+    for _, row in df.iterrows():
+        ei = row['extra_info']
+        rm = row['reward_model']
+        # Only the first message of the prompt is read.
+        prompt_content = row['prompt'][0]['content']
+        text = ""
+        # If several non-empty text items exist, the last one wins.
+        for item in prompt_content:
+            if item.get('type') == 'text' and item.get('text'):
+                text = item['text']
+        samples.append({
+            'index': ei['index'],
+            'category': ei['category'],
+            'image_path': ei['image_path'],
+            'ground_truth': rm['ground_truth'],
+            'prompt_text': text,
+        })
+    return samples
+def worker_evaluate(model_name, model_path, gpu_id, chunk, image_dir, result_queue):
+    """Single GPU worker: load model, run inference on chunk, put results in queue.
+    Exactly one dict is put on ``result_queue`` per call: on success it holds
+    the per-sample results, the correct count, the chunk size and the elapsed
+    time; on any exception it holds empty results plus an ``error`` message.
+    """
+    # Restrict this process to one physical GPU; inside the process that GPU
+    # is then addressed as cuda:0.
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
+    try:
+        from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+        print(f"[{model_name}@GPU{gpu_id}] Loading model... ({len(chunk)} samples)", flush=True)
+        processor = AutoProcessor.from_pretrained(
+            model_path,
+            min_pixels=3136,
+            max_pixels=200704,
+            local_files_only=True,
+            trust_remote_code=True,
+        )
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch.bfloat16,
+            attn_implementation="sdpa",
+            device_map="cuda:0",  # maps to the CUDA_VISIBLE_DEVICES GPU
+            local_files_only=True,
+            trust_remote_code=True,
+        )
+        model.eval()
+        print(f"[{model_name}@GPU{gpu_id}] Model loaded, starting inference", flush=True)
+        results = []
+        correct = 0
+        t0 = time.time()
+        for i, s in enumerate(chunk):
+            img_path = os.path.join(image_dir, s['image_path'])
+            try:
+                image = Image.open(img_path).convert('RGB')
+            except Exception as e:
+                # An unreadable image is replaced by a blank white one, so the
+                # sample is still scored (only a warning is printed).
+                print(f"[{model_name}@GPU{gpu_id}] WARN: Cannot load {img_path}: {e}", flush=True)
+                image = Image.new('RGB', (224, 224), 'white')
+            messages = [{
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": s['prompt_text']},
+                ]
+            }]
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = processor(
+                text=[text],
+                images=[image],
+                return_tensors="pt",
+                padding=True,
+            ).to("cuda:0")
+            with torch.no_grad():
+                # Greedy decoding (do_sample=False).
+                output_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=MAX_NEW_TOKENS,
+                    do_sample=False,
+                    temperature=None,
+                    top_p=None,
+                )
+            # Drop the prompt tokens; keep only the newly generated ones.
+            gen_ids = output_ids[:, inputs['input_ids'].shape[1]:]
+            prediction = processor.batch_decode(gen_ids, skip_special_tokens=True)[0]
+            score = compute_score(prediction, s['ground_truth'])
+            is_correct = score == 1.0
+            if is_correct:
+                correct += 1
+            # NOTE(review): this runs on the full prediction, whereas
+            # compute_score strips a <think> section first, so the stored
+            # letter may differ from the one that was scored.
+            extracted = extract_choice(prediction)
+            results.append({
+                'index': s['index'],
+                'category': s['category'],
+                'ground_truth': s['ground_truth'],
+                'extracted_answer': extracted,
+                'prediction': prediction[:300],
+                'correct': is_correct,
+            })
+            # Progress after the first sample, every 50th, and the last one.
+            if (i + 1) % 50 == 0 or i == 0 or (i + 1) == len(chunk):
+                elapsed = time.time() - t0
+                eta = elapsed / (i + 1) * (len(chunk) - i - 1)
+                acc = correct / (i + 1) * 100
+                print(f"[{model_name}@GPU{gpu_id}] {i+1}/{len(chunk)} "
+                      f"acc={acc:.1f}% elapsed={elapsed:.0f}s eta={eta:.0f}s", flush=True)
+        elapsed_total = time.time() - t0
+        # An empty chunk raises ZeroDivisionError here, which the handler
+        # below reports as a worker error.
+        print(f"[{model_name}@GPU{gpu_id}] DONE: {correct}/{len(chunk)} "
+              f"({correct/len(chunk)*100:.1f}%) in {elapsed_total:.0f}s", flush=True)
+        result_queue.put({
+            'model': model_name,
+            'gpu_id': gpu_id,
+            'results': results,
+            'correct': correct,
+            'total': len(chunk),
+            'time_seconds': elapsed_total,
+        })
+    except Exception as e:
+        import traceback
+        print(f"[{model_name}@GPU{gpu_id}] ERROR: {e}", flush=True)
+        traceback.print_exc()
+        # Always report back, so the parent is not left waiting on the queue.
+        result_queue.put({
+            'model': model_name,
+            'gpu_id': gpu_id,
+            'results': [],
+            'correct': 0,
+            'total': len(chunk),
+            'time_seconds': 0,
+            'error': str(e),
+        })
+def split_chunks(data, n):
+    """Split data into n roughly equal chunks.
+    Chunks are contiguous slices in the original order; the first
+    ``len(data) % n`` chunks receive one extra element.
+    """
+    k, m = divmod(len(data), n)
+    return [data[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n)]
+def main():
+    """Run base and SFT evaluation in parallel and write the comparison.
+    One worker process is started per (model, GPU) pair from ``MODEL_GPUS``;
+    each model's workers split the full test set between them. Per-model
+    results go to ``eval_<model>.jsonl`` and the summary to
+    ``comparison.json`` in ``OUTPUT_DIR``.
+    """
+    print("=" * 60)
+    print("  MULTI-GPU EVAL: Base vs SFT on Physics Test Set")
+    print("  8 GPUs: Base→[0,1,2,3]  SFT→[4,5,6,7]")
+    print("=" * 60, flush=True)
+    # Load test data
+    samples = load_test_data(TEST_PARQUET)
+    print(f"\nLoaded {len(samples)} test samples")
+    cats = defaultdict(int)
+    for s in samples:
+        cats[s['category']] += 1
+    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
+        print(f"  {cat}: {cnt}")
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    # Launch all 8 workers simultaneously
+    result_queue = Queue()
+    processes = []
+    for model_name, model_path in MODELS.items():
+        gpus = MODEL_GPUS[model_name]
+        chunks = split_chunks(samples, len(gpus))
+        for gpu_id, chunk in zip(gpus, chunks):
+            p = Process(
+                target=worker_evaluate,
+                args=(model_name, model_path, gpu_id, chunk, IMAGE_DIR, result_queue),
+            )
+            p.start()
+            processes.append(p)
+            print(f"  Launched {model_name} worker on GPU {gpu_id} ({len(chunk)} samples)")
+    print(f"\nAll {len(processes)} workers launched, waiting for completion...", flush=True)
+    # Collect results
+    # The queue is drained before join() so that workers are not blocked on
+    # an undelivered put.
+    # NOTE(review): get() has no timeout; a worker killed without reaching its
+    # exception handler (e.g. by the OOM killer) would make this wait forever.
+    worker_results = []
+    for _ in range(len(processes)):
+        worker_results.append(result_queue.get())
+    for p in processes:
+        p.join()
+    # Aggregate per model
+    all_evals = {}
+    for model_name in MODELS:
+        model_results = [w for w in worker_results if w['model'] == model_name]
+        all_results = []
+        total_correct = 0
+        total_count = 0
+        total_time = 0
+        cat_stats = defaultdict(lambda: {'correct': 0, 'total': 0})
+        for w in model_results:
+            if 'error' in w:
+                # A failed worker's chunk is left out of both numerator and
+                # denominator, so accuracy then covers only part of the set.
+                print(f"  WARNING: {model_name}@GPU{w['gpu_id']} had error: {w['error']}")
+                continue
+            all_results.extend(w['results'])
+            total_correct += w['correct']
+            total_count += w['total']
+            # Workers run concurrently: wall-clock time is the slowest one.
+            total_time = max(total_time, w['time_seconds'])
+        for r in all_results:
+            cat = r['category']
+            cat_stats[cat]['total'] += 1
+            if r['correct']:
+                cat_stats[cat]['correct'] += 1
+        accuracy = total_correct / total_count * 100 if total_count > 0 else 0
+        all_evals[model_name] = {
+            'accuracy': accuracy,
+            'correct': total_correct,
+            'total': total_count,
+            'category_stats': dict(cat_stats),
+            'results': all_results,
+            'time_seconds': total_time,
+        }
+        # Save per-model results
+        out_path = os.path.join(OUTPUT_DIR, f"eval_{model_name}.jsonl")
+        with open(out_path, 'w', encoding='utf-8') as f:
+            for r in all_results:
+                f.write(json.dumps(r, ensure_ascii=False) + '\n')
+        print(f"\n{'─'*60}")
+        print(f"  {model_name.upper()} — RESULTS")
+        print(f"{'─'*60}")
+        print(f"  Overall accuracy: {total_correct}/{total_count} ({accuracy:.1f}%)")
+        print(f"  Wall-clock time: {total_time:.0f}s")
+        for cat in sorted(cat_stats.keys()):
+            s = cat_stats[cat]
+            pct = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
+            print(f"    {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({pct:.1f}%)")
+    # Final comparison
+    base = all_evals['base']
+    sft = all_evals['sft']
+    print(f"\n{'='*60}")
+    print(f"  FINAL COMPARISON: Base vs SFT")
+    print(f"{'='*60}")
+    print(f"\n{'Metric':30s} {'Base':>10s} {'SFT':>10s} {'Delta':>10s}")
+    print("─" * 60)
+    print(f"{'Overall Accuracy':30s} {base['accuracy']:>9.1f}% {sft['accuracy']:>9.1f}% {sft['accuracy']-base['accuracy']:>+9.1f}%")
+    all_cats = sorted(set(list(base['category_stats'].keys()) + list(sft['category_stats'].keys())))
+    print(f"\n  Per-category comparison:")
+    for cat in all_cats:
+        bs = base['category_stats'].get(cat, {'correct': 0, 'total': 0})
+        ss = sft['category_stats'].get(cat, {'correct': 0, 'total': 0})
+        b_pct = bs['correct'] / bs['total'] * 100 if bs['total'] > 0 else 0
+        s_pct = ss['correct'] / ss['total'] * 100 if ss['total'] > 0 else 0
+        delta = s_pct - b_pct
+        print(f"    {cat:25s}: Base={b_pct:5.1f}%  SFT={s_pct:5.1f}%  Δ={delta:+5.1f}%")
+    # Save comparison
+    comparison = {
+        'base_accuracy': base['accuracy'],
+        'sft_accuracy': sft['accuracy'],
+        'delta': sft['accuracy'] - base['accuracy'],
+        'base_time': base['time_seconds'],
+        'sft_time': sft['time_seconds'],
+        'base_categories': base['category_stats'],
+        'sft_categories': sft['category_stats'],
+    }
+    comp_path = os.path.join(OUTPUT_DIR, "comparison.json")
+    with open(comp_path, 'w') as f:
+        json.dump(comparison, f, indent=2, ensure_ascii=False)
+    print(f"\n  Comparison saved to: {comp_path}")
+    print("\n=== EVALUATION COMPLETE ===", flush=True)
+if __name__ == '__main__':
+    main()

eval_footprint/eval_single_model_template.py ADDED Viewed

	@@ -0,0 +1,242 @@

+#!/usr/bin/env python3
+"""
+Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+Runs both Base and SFT models on the 1533 open-ended physics test set.
+Saves raw model outputs for later judging.
+Usage (inside Docker container):
+    cd /tmp && python3 /path/to/eval_openended_inference.py
+Output:
+    sft_eval_footprint/inference_results_base.jsonl
+    sft_eval_footprint/inference_results_sft.jsonl
+"""
+import os
+import sys
+import json
+import re
+import time
+import torch
+import multiprocessing as mp
+from collections import Counter
+# ============ CONFIG ============
+os.environ["HF_HUB_OFFLINE"] = "1"
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
+SFT_MODEL = "MODEL_PATH_PLACEHOLDER"
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
+# Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7
+BASE_GPUS = [0, 1, 2, 3]
+SFT_GPUS = [4, 5, 6, 7]
+MAX_NEW_TOKENS = 2048
+# ================================
+def load_test_data():
+    """Load test samples from JSONL.
+    Reads ``TEST_FILE`` and returns one parsed dict per non-blank line.
+    """
+    samples = []
+    with open(TEST_FILE, 'r', encoding='utf-8') as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    return samples
+def build_open_ended_prompt(sample):
+    """Build an open-ended prompt (no MCQ options).
+    Uses the sample's ``description`` and ``question`` fields (each defaults
+    to an empty string) and asks for the final answer inside ``\\boxed{}``.
+    """
+    desc = sample.get('description', '')
+    question = sample.get('question', '')
+    prompt = f"""Look at the image and answer the physics question.
+{desc}
+{question}
+Please reason step by step, and put your final answer within \\boxed{{}}.
+"""
+    return prompt.strip()
+def worker_inference(gpu_id, model_path, samples, output_file, model_name):
+    """Worker: load model on specific GPU and run inference on assigned samples.
+    Writes one JSON line per sample to ``output_file`` after all samples are
+    processed, and returns the number of results written. A sample whose
+    inference raises is still written, with ``model_output`` set to
+    ``"ERROR: <message>"``.
+    """
+    # Heavy imports are done inside the worker process.
+    import torch
+    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from qwen_vl_utils import process_vision_info
+    from PIL import Image
+    device = f"cuda:{gpu_id}"
+    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)
+    processor = AutoProcessor.from_pretrained(
+        model_path,
+        min_pixels=3136,
+        max_pixels=200704,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        device_map=device,
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model.eval()
+    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)
+    results = []
+    for i, sample in enumerate(samples):
+        idx = sample['index']
+        prompt_text = build_open_ended_prompt(sample)
+        image_path = os.path.join(IMAGE_DIR, sample['image'])
+        # Build messages
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": f"file://{image_path}"},
+                {"type": "text", "text": prompt_text},
+            ],
+        }]
+        try:
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            ).to(device)
+            with torch.no_grad():
+                # NOTE(review): no decoding options are passed, so sampling
+                # behaviour presumably comes from the model's own generation
+                # config — confirm if deterministic output is required.
+                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
+            # Keep only the tokens generated after the prompt.
+            generated = output_ids[0][inputs.input_ids.shape[1]:]
+            response = processor.decode(generated, skip_special_tokens=True)
+        except Exception as e:
+            # Failures are recorded in-band so that one bad sample does not
+            # abort the whole chunk.
+            response = f"ERROR: {str(e)}"
+        result = {
+            "index": idx,
+            "category": sample['category'],
+            "subfield": sample.get('subfield', ''),
+            "question": sample['question'],
+            "ground_truth_value": sample['ground_truth_value'],
+            "ground_truth_letter": sample.get('ground_truth_letter', ''),
+            "model_output": response,
+            "model_name": model_name,
+            "gpu_id": gpu_id,
+        }
+        results.append(result)
+        if (i + 1) % 20 == 0 or (i + 1) == len(samples):
+            print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)
+    # Write results
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
+    return len(results)
+def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
+    """Split samples across GPUs and run in parallel.
+    Each GPU gets a contiguous chunk and writes to
+    ``<output_base>_gpu<id>.jsonl``. Blocks until every worker process has
+    exited and returns the list of per-GPU output paths. Worker exit codes
+    are not inspected.
+    """
+    n = len(samples)
+    k = len(gpu_ids)
+    # Ceiling division: every chunk but the last has chunk_size samples.
+    chunk_size = (n + k - 1) // k
+    processes = []
+    output_files = []
+    for i, gpu_id in enumerate(gpu_ids):
+        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
+        if not chunk:
+            # More GPUs than chunks: leave the surplus GPUs idle.
+            continue
+        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
+        output_files.append(out_file)
+        p = mp.Process(
+            target=worker_inference,
+            args=(gpu_id, model_path, chunk, out_file, model_name)
+        )
+        processes.append(p)
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+    return output_files
+def merge_results(output_files, final_output):
+    """Merge per-GPU result files into one.
+    Records from every existing file in ``output_files`` are combined, sorted
+    by their ``index`` field and written to ``final_output`` as JSONL. The
+    per-GPU files are deleted afterwards. Returns the merged list of records.
+    """
+    all_results = []
+    for f in output_files:
+        # Missing files (e.g. from a crashed worker) are skipped silently.
+        if os.path.exists(f):
+            with open(f, 'r', encoding='utf-8') as fh:
+                for line in fh:
+                    if line.strip():
+                        all_results.append(json.loads(line))
+    # Sort by index for consistency
+    all_results.sort(key=lambda x: x['index'])
+    with open(final_output, 'w', encoding='utf-8') as f:
+        for r in all_results:
+            f.write(json.dumps(r, ensure_ascii=False) + '\n')
+    # Cleanup per-GPU files
+    for f in output_files:
+        if os.path.exists(f):
+            os.remove(f)
+    return all_results
+def main():
+    import json, os
+    TEST_FILE = os.path.join(OUTPUT_DIR, "test_1533_openended.jsonl")
+    samples = []
+    with open(TEST_FILE) as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    print(f"Loaded {len(samples)} test samples")
+    print(f"Model: {SFT_MODEL}")
+    sft_output = os.path.join(OUTPUT_DIR, "OUTPUT_NAME_PLACEHOLDER")
+    SFT_GPUS = list(range(8))
+    run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)
+    sft_final = sft_output + ".jsonl" if not sft_output.endswith(".jsonl") else sft_output
+    if not os.path.exists(sft_final):
+        # merge from per-gpu files
+        all_r = []
+        for gpu in SFT_GPUS:
+            gf = sft_output + f"_gpu{gpu}.jsonl"
+            if os.path.exists(gf):
+                with open(gf) as f:
+                    for line in f:
+                        if line.strip():
+                            all_r.append(line)
+                os.remove(gf)
+        with open(sft_final, 'w') as f:
+            for line in all_r:
+                f.write(line)
+    with open(sft_final) as f:
+        count = sum(1 for _ in f)
+    print(f"Total: {count} results -> {sft_final}")
+if __name__ == "__main__":
+    main()

eval_footprint/inference_fullft_math_nf.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_fullft_math_nf_old.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_fullft_phyx_math_nf.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_fullft_phyx_nf.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_lora_phyx_f.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_results_base.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_results_lora_math_f.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_results_sft.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_results_sft.jsonl_gpu4.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_results_sft.jsonl_gpu5.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_results_sft.jsonl_gpu6.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/inference_results_sft.jsonl_gpu7.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/report_lora_math_f.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "timestamp": "2026-03-16 17:48:12",
+  "scoring_method": "PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)",
+  "model": {
+    "model": "lora_math_f (LoRA+freeze+math)",
+    "total": 1533,
+    "string_matches": 13,
+    "llm_calls": 1520,
+    "llm_matches": 297,
+    "final_correct": 310,
+    "final_acc": 20.22,
+    "category_stats": {
+      "Mechanics": {
+        "total": 276,
+        "correct": 63
+      },
+      "Waves/Acoustics": {
+        "total": 253,
+        "correct": 47
+      },
+      "Electromagnetism": {
+        "total": 275,
+        "correct": 61
+      },
+      "Modern Physics": {
+        "total": 222,
+        "correct": 55
+      },
+      "Optics": {
+        "total": 252,
+        "correct": 46
+      },
+      "Thermodynamics": {
+        "total": 255,
+        "correct": 38
+      }
+    }
+  }
+}

eval_footprint/run_inference_single.py ADDED Viewed

	@@ -0,0 +1,120 @@

+#!/usr/bin/env python3
+"""Single-model inference on 8 GPUs. Usage: python3 run_inference_single.py <MODEL_PATH> <MODEL_NAME>"""
+import os, sys, json, torch, multiprocessing as mp
+os.environ["HF_HUB_OFFLINE"] = "1"  # ask huggingface_hub to work offline (no network lookups)
+os.environ["TRANSFORMERS_OFFLINE"] = "1"  # same for transformers; model files must already be on disk
+MODEL_PATH = sys.argv[1] if len(sys.argv) > 1 else "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/lora_math_f/merged"  # 1st CLI arg, else this default checkpoint
+MODEL_NAME = sys.argv[2] if len(sys.argv) > 2 else "lora_math_f"  # 2nd CLI arg; used in log lines and output file names
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"  # JSONL test set read by load_test_data()
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"  # per-GPU part files and the merged result are written here
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"  # joined with each sample's image entry in worker_inference()
+GPUS = [0, 1, 2, 3, 4, 5, 6, 7]  # one worker process is started per listed CUDA device index
+MAX_NEW_TOKENS = 2048  # passed as max_new_tokens to model.generate()
+def load_test_data():
+    samples = []
+    with open(TEST_FILE, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    return samples
+def build_prompt(sample):
+    desc = sample.get("description", "")
+    question = sample.get("question", "")
+    return f"Look at the image and answer the physics question.\n\n{desc}\n\n{question}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}."
+def worker_inference(gpu_id, model_path, samples, output_file, model_name):
+    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from qwen_vl_utils import process_vision_info
+    device = f"cuda:{gpu_id}"
+    print(f"[{model_name}][GPU {gpu_id}] Loading model from {model_path}...", flush=True)
+    processor = AutoProcessor.from_pretrained(model_path, min_pixels=3136, max_pixels=200704)
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_path, torch_dtype=torch.bfloat16, device_map=device, attn_implementation="sdpa"
+    )
+    model.eval()
+    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples...", flush=True)
+    results = []
+    for i, sample in enumerate(samples):
+        try:
+            image_path = os.path.join(IMAGE_DIR, sample[image])
+            prompt = build_prompt(sample)
+            messages = [{"role": "user", "content": [
+                {"type": "image", "image": f"file://{image_path}"},
+                {"type": "text", "text": prompt}
+            ]}]
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(device)
+            with torch.no_grad():
+                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)
+            input_len = inputs["input_ids"].shape[1]
+            response = processor.decode(output_ids[0][input_len:], skip_special_tokens=True)
+            result = {**sample, "model_output": response, "model_name": model_name}
+            results.append(result)
+            if (i + 1) % 10 == 0:
+                print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)
+        except Exception as e:
+            print(f"[{model_name}][GPU {gpu_id}] Error on sample {i}: {e}", flush=True)
+            results.append({**sample, "model_output": f"ERROR: {e}", "model_name": model_name})
+    with open(output_file, "w", encoding="utf-8") as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + "\n")
+    print(f"[{model_name}][GPU {gpu_id}] Done. Saved {len(results)} results to {output_file}", flush=True)
+if __name__ == "__main__":
+    mp.set_start_method("spawn", force=True)
+    samples = load_test_data()
+    print(f"Model: {MODEL_PATH}")
+    print(f"Name: {MODEL_NAME}")
+    print(f"Loaded {len(samples)} test samples")
+    n = len(samples)
+    n_gpus = len(GPUS)
+    chunk_size = (n + n_gpus - 1) // n_gpus
+    processes = []
+    for idx, gpu_id in enumerate(GPUS):
+        start = idx * chunk_size
+        end = min(start + chunk_size, n)
+        chunk = samples[start:end]
+        if not chunk:
+            continue
+        out_file = os.path.join(OUTPUT_DIR, f"inference_results_{MODEL_NAME}_gpu{gpu_id}.jsonl")
+        p = mp.Process(target=worker_inference, args=(gpu_id, MODEL_PATH, chunk, out_file, MODEL_NAME))
+        p.start()
+        processes.append(p)
+    for p in processes:
+        p.join()
+    # Merge
+    merged = os.path.join(OUTPUT_DIR, f"inference_results_{MODEL_NAME}.jsonl")
+    with open(merged, "w", encoding="utf-8") as out:
+        for gpu_id in GPUS:
+            part = os.path.join(OUTPUT_DIR, f"inference_results_{MODEL_NAME}_gpu{gpu_id}.jsonl")
+            if os.path.exists(part):
+                with open(part) as inp:
+                    for line in inp:
+                        out.write(line)
+    # Count and check errors
+    with open(merged) as f:
+        results = [json.loads(l) for l in f if l.strip()]
+    errors = sum(1 for r in results if r.get("model_output","").startswith("ERROR"))
+    print(f"\n===== Inference Complete =====")
+    print(f"Total: {len(results)}, Valid: {len(results)-errors}, Errors: {errors}")
+    print(f"Output: {merged}")

eval_footprint/scored_results_base.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/scored_results_lora_math_f.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/scored_results_sft.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/scored_results_sft_phyx.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval_footprint/sft_report_phyx.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "model": "SFT",
+  "total": 1533,
+  "correct": 351,
+  "acc": 22.9,
+  "categories": {
+    "Mechanics": {
+      "total": 276,
+      "correct": 76
+    },
+    "Waves/Acoustics": {
+      "total": 253,
+      "correct": 45
+    },
+    "Electromagnetism": {
+      "total": 275,
+      "correct": 67
+    },
+    "Modern Physics": {
+      "total": 222,
+      "correct": 66
+    },
+    "Optics": {
+      "total": 252,
+      "correct": 53
+    },
+    "Thermodynamics": {
+      "total": 255,
+      "correct": 44
+    }
+  }
+}

eval_footprint/simple_eval.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import json, os, sys, time, torch
+MODEL_PATH = sys.argv[1]
+OUTPUT_NAME = sys.argv[2]
+EVAL_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+TEST_FILE = os.path.join(EVAL_DIR, "test_1533_openended.jsonl")
+print(f"Model: {MODEL_PATH}")
+print(f"Output: {OUTPUT_NAME}")
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_PATH, torch_dtype=torch.bfloat16, device_map="cuda",
+    attn_implementation="sdpa"
+)
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+model.eval()
+samples = []
+with open(TEST_FILE) as f:
+    for line in f:
+        if line.strip():
+            samples.append(json.loads(line))
+print(f"Loaded {len(samples)} samples")
+results = []
+t0 = time.time()
+for idx, sample in enumerate(samples):
+    desc = sample.get("description", "")
+    q = sample.get("question", "")
+    parts = [p for p in [desc, q] if p]
+    parts.append("Please reason step by step, and put your final answer within \\boxed{}.")
+    prompt_text = "\n\n".join(parts)
+    img = sample.get("image_path", "")
+    content = []
+    if img and os.path.exists(img):
+        content.append({"type": "image", "image": f"file://{img}"})
+    content.append({"type": "text", "text": prompt_text})
+    messages = [{"role": "user", "content": content}]
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
+                      padding=True, return_tensors="pt").to("cuda")
+    with torch.no_grad():
+        ids = model.generate(**inputs, max_new_tokens=2048, do_sample=False)
+    out_ids = ids[0][len(inputs.input_ids[0]):]
+    response = processor.decode(out_ids, skip_special_tokens=True)
+    sample["model_output"] = response
+    results.append(sample)
+    if (idx + 1) % 50 == 0:
+        elapsed = time.time() - t0
+        rate = (idx + 1) / elapsed
+        eta = (len(samples) - idx - 1) / rate / 60
+        print(f"  {idx+1}/{len(samples)} ({rate:.1f}/s, ETA {eta:.0f}min)", flush=True)
+output_file = os.path.join(EVAL_DIR, f"inference_results_{OUTPUT_NAME}.jsonl")
+with open(output_file, "w", encoding="utf-8") as f:
+    for r in results:
+        f.write(json.dumps(r, ensure_ascii=False) + "\n")
+print(f"\nDone: {len(results)} -> {output_file}")

eval_footprint/single_model_eval.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import sys, os, json, torch, time
+from multiprocessing import Process
+from PIL import Image
+EVAL_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"  # holds the test set, temp shard files and the merged output
+TEST_FILE = os.path.join(EVAL_DIR, "test_1533_openended.jsonl")  # JSONL test set read in main()
+MODEL_PATH = sys.argv[1]  # required 1st CLI argument: checkpoint passed to from_pretrained
+OUTPUT_NAME = sys.argv[2]  # required 2nd CLI argument: used in the output and temp file names
+NUM_GPUS = 8  # main() starts one worker process per GPU index 0..NUM_GPUS-1
+def build_open_ended_prompt(sample):
+    parts = []
+    desc = sample.get("description", "")
+    q = sample.get("question", "")
+    if desc: parts.append(desc)
+    if q: parts.append(q)
+    parts.append("Please reason step by step, and put your final answer within \\boxed{}.")
+    return "\n\n".join(parts)
+def worker_inference(gpu_id, model_path, samples, out_file, model_name):
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from qwen_vl_utils import process_vision_info
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_path, torch_dtype=torch.bfloat16, device_map="cuda",
+        attn_implementation="sdpa"
+    )
+    processor = AutoProcessor.from_pretrained(model_path)
+    model.eval()
+    results = []
+    for idx, sample in enumerate(samples):
+        try:
+            prompt_text = build_open_ended_prompt(sample)
+            img = sample.get("image_path", "")
+            content = []
+            if img and os.path.exists(img):
+                content.append({"type": "image", "image": f"file://{img}"})
+            content.append({"type": "text", "text": prompt_text})
+            messages = [{"role": "user", "content": content}]
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
+                              padding=True, return_tensors="pt").to("cuda")
+            with torch.no_grad():
+                ids = model.generate(**inputs, max_new_tokens=2048, do_sample=False)
+            out_ids = ids[0][len(inputs.input_ids[0]):]
+            response = processor.decode(out_ids, skip_special_tokens=True)
+            sample["model_output"] = response
+        except Exception as e:
+            sample["model_output"] = f"Error: {str(e)}"
+        results.append(sample)
+        if (idx + 1) % 50 == 0:
+            print(f"  [GPU {gpu_id}] {idx+1}/{len(samples)}", flush=True)
+    with open(out_file, "w", encoding="utf-8") as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + "\n")
+    print(f"  [GPU {gpu_id}] Done: {len(results)} -> {out_file}", flush=True)
+def main():
+    print(f"\nModel: {MODEL_PATH}")
+    print(f"Output: {OUTPUT_NAME}")
+    samples = []
+    with open(TEST_FILE) as f:
+        for line in f:
+            if line.strip(): samples.append(json.loads(line))
+    print(f"Test samples: {len(samples)}")
+    output_file = os.path.join(EVAL_DIR, f"inference_results_{OUTPUT_NAME}.jsonl")
+    chunk_size = len(samples) // NUM_GPUS
+    procs = []
+    for gpu in range(NUM_GPUS):
+        s = gpu * chunk_size
+        e = s + chunk_size if gpu < NUM_GPUS - 1 else len(samples)
+        gpu_file = os.path.join(EVAL_DIR, f"_temp_{OUTPUT_NAME}_gpu{gpu}.jsonl")
+        p = Process(target=worker_inference, args=(gpu, MODEL_PATH, samples[s:e], gpu_file, OUTPUT_NAME))
+        p.start()
+        procs.append((p, gpu_file))
+    for p, _ in procs:
+        p.join()
+    all_results = []
+    for _, gf in procs:
+        if os.path.exists(gf):
+            with open(gf) as f:
+                for line in f:
+                    if line.strip(): all_results.append(line)
+            os.remove(gf)
+    with open(output_file, "w") as f:
+        for line in all_results:
+            f.write(line)
+    print(f"Total: {len(all_results)} results -> {output_file}")
+if __name__ == "__main__":
+    main()

eval_footprint/test_1533_openended.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff