kixx committed
Commit b1e25b1 · verified · 1 Parent(s): c0bf946

Upload 34 files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/overview.png filter=lfs diff=lfs merge=lfs -text
+ data/hotpotqa_antifact.jsonl filter=lfs diff=lfs merge=lfs -text
+ data/hotpotqa_random.jsonl filter=lfs diff=lfs merge=lfs -text
+ data/hotpotqa.jsonl filter=lfs diff=lfs merge=lfs -text
+ data/musique_antifact.jsonl filter=lfs diff=lfs merge=lfs -text
+ data/musique.jsonl filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,150 @@
- ---
- license: cc-by-4.0
- ---
# LastingBench: Defend Benchmarks Against Knowledge Leakage

Welcome to the repository for the research paper "LastingBench: Defend Benchmarks Against Knowledge Leakage." This project addresses a growing concern: large language models (LLMs) can "cheat" on standard question-answering (QA) benchmarks by memorizing task-specific data. When that happens, benchmark scores no longer reflect genuine model capability but rather the effects of data leakage.

## Project Overview

![Overview](./assets/overview.png)

LastingBench introduces a novel framework designed to continuously reinforce and safeguard existing benchmarks against knowledge leakage. The project aims to:
- **Detect knowledge leakage** through context and question perturbation techniques
- **Rewrite leaked content** into counterfactual alternatives that disrupt memorization while preserving the benchmark's original evaluative intent
- **Evaluate model responses** to contextual evidence and reasoning patterns
- **Provide practical solutions** that keep benchmarks robust over time, promoting fairer and more interpretable evaluations of LLMs
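
To make the rewrite step concrete, here is an illustrative before/after pair in the LongBench-style schema the scripts use. This example is entirely hypothetical and not taken from the released data files:

```python
# Hypothetical example of a counterfactual rewrite; the field names
# (input / context / answers) follow the schema used by the scripts.
original = {
    "input": "Were Scott Derrickson and Ed Wood of the same nationality?",
    "context": "... Scott Derrickson is an American filmmaker ...",
    "answers": ["yes"],
}
rewritten = {
    "input": "Were Scott Derrickson and Ed Wood of the same nationality?",
    "context": "... Scott Derrickson is a Canadian filmmaker ...",  # counterfactual evidence
    "answers": ["no"],  # gold answer updated to match the new evidence
}
```

A model that has memorized the original benchmark keeps answering "yes" on the rewritten item, while a model that genuinely reads the context answers "no".
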
## Installation

1. Clone the repository:
```bash
git clone https://github.com/Seriousss/lastingbench
```

2. Create and activate a conda environment:
```bash
conda create -n lastingbench python=3.12
conda activate lastingbench
```

3. Install dependencies:
```bash
pip install -r requirements.txt
```

4. Set up environment variables:
```bash
export OPENAI_BASE_URL="your-api-base-url"
export OPENAI_API_KEY="your-api-key"
export CUDA_VISIBLE_DEVICES="0,1,2,3"  # Adjust based on your GPU setup
```

## Usage

LastingBench provides three main functionalities: **Detection**, **Rewrite**, and **Training Comparison**.

### 🔍 Detection

Detect knowledge leakage through various perturbation techniques.

#### 1. Context Leakage Detection
Evaluate models using exact-match scoring on benchmark datasets:
```bash
# Using vLLM for most models
python -m detect.contextleakage --hf_model "Qwen/Qwen2.5-7B-Instruct" \
    --dataset_subset "hotpotqa" --cuda_devices "0,1"

# Using Transformers for Qwen3 models
python -m detect.contextleakage --hf_model "Qwen/Qwen3-8B" \
    --is_qwen3 --max_new_tokens 30

# Using an OpenAI-compatible API
python -m detect.contextleakage_api --model "deepseek-r1" --dataset_subset "hotpotqa"
```
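
The context-leakage scripts deliberately ask each question *without* its supporting passages, so a model can only score well by recalling memorized benchmark content. Hits are counted with `qa_em_score` from `utils.metrics`. As a rough illustration, here is a minimal sketch of SQuAD-style exact match; it assumes the repository's metric applies the usual answer normalization, which may differ in detail:

```python
import re
import string

def normalize(text: str) -> str:
    """Lowercase, strip punctuation and articles, collapse whitespace."""
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

def em_score(prediction: str, gold: str) -> float:
    """Return 1.0 on an exact normalized match, else 0.0."""
    return float(normalize(prediction) == normalize(gold))

print(em_score("The Beatles.", "the beatles"))  # 1.0
```
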

#### 2. Question Perturbation Detection
Rephrase questions to opposite meanings and test model consistency:
```bash
# Using OpenAI API
python -m detect.question_rephrase_answer_api \
    --model_name "gpt-4o" --dataset_subset "2wikimqa" \
    --rephrase_type "opposite" --sample_count 100

# Using local vLLM models
python -m detect.question_rephrase_answer_vllm \
    --model_name "Qwen/Qwen2.5-7B-Instruct" --dataset_subset "hotpotqa" --rephrase_type "similar"

# Using Qwen3 with Transformers
python -m detect.question_rephrase_answer_qwen3 \
    --model_name "Qwen/Qwen3-8B" --dataset_subset "2wikimqa"
```
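
All three variants rephrase with GPT-4o using the same instruction. For `--rephrase_type "opposite"`, the prompt (as it appears in the scripts) is:

```
Please rephrase the following question to have the exact opposite meaning.
Question: {question}

Return only the rephrased question with the opposite meaning, without any explanations or other content.
```

A model that still returns the original gold answer to the inverted question is likely reciting memorized benchmark data rather than reading the question.
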

### ✏️ Rewrite

Generate counterfactual answers and rewrite leaked evidence to create robust benchmarks.

#### 1. Evidence Finding and Counterfactual Rewriting Pipeline
Run the complete evidence-finding and rewriting pipeline:
```bash
# Specify a custom output file and dataset
python main_gpu.py --output custom_output.jsonl \
    --dataset_subset "hotpotqa" --start_idx 0 --max_samples 100
```

Convert and merge JSONL files with question-answer mappings:
```bash
# Merge a single mapping file with the original dataset
python utils/convert.py original.jsonl revised.jsonl custom_output.jsonl
```
The original and revised datasets can be found under the **data** folder.
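
All dataset files follow the LongBench layout used throughout the scripts, with `input` (the question), `context` (the passages), and `answers` fields. A quick way to inspect a rewritten sample, sketched with the `jsonlines` package that the evaluation scripts also use:

```python
import jsonlines

# Peek at the first counterfactually rewritten HotpotQA sample
with jsonlines.open("data/hotpotqa_antifact.jsonl") as reader:
    sample = next(iter(reader))

print(sample["input"])           # the question
print(sample["answers"])         # gold answer(s) after rewriting
print(sample["context"][:300])   # the rewritten evidence passages
```
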

#### 2. Random Answer Rewriting
Create random alternatives to disrupt memorization:
```bash
# Specify a custom output file and dataset
python random_alternative_answer.py --output random_hotpot.jsonl \
    --dataset_subset "hotpotqa" --start_idx 0 --max_samples 50
```

### 🚀 Dataset Evaluations on Model Inference and Training

#### 1. Model Inference Evaluation
Run comprehensive evaluations on the original and revised benchmarks:
```bash
# Transformers-based evaluation
python -m eval.evaluation -i data/hotpotqa.jsonl --model "Qwen/Qwen3-8B" -k 40 -t 0.5

# API-based evaluation
python -m eval.eval_with_api --input data/hotpotqa_antifact.jsonl \
    --model "deepseek-r1" --max_tokens 30 --temperature 0.5
```

#### 2. Model Training Evaluation
Compare training dynamics between the original and rewritten datasets.

The training loss data can be found under **training_result**.

To reproduce the figure in our paper:
```bash
python utils/draw.py training_result/training_loss_qwen38.csv training_result/training_loss_antifact_qwen38.csv \
    --title "Original vs Rewritten Training Loss"
```

### 📊 Utility Functions

Additional tools for analysis and metrics:

- **Metrics Calculation**: F1 scores, EM scores, and custom evaluation metrics
- **Document Retrieval**: BM25-based retrieval for evidence analysis (see the sketch below)
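
The rewriting pipeline in `main_gpu.py` builds its evidence retrieval on Haystack's in-memory BM25 components (`InMemoryDocumentStore` plus `InMemoryBM25Retriever`). A minimal sketch of that retrieval pattern follows; the repository wraps it in `utils/util.py` (`retriveDoc`), whose exact signature may differ:

```python
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever

passages = [
    "Paris is the capital of France.",
    "Berlin is the capital of Germany.",
]

# Index the context passages of one sample
store = InMemoryDocumentStore()
store.write_documents([Document(content=p) for p in passages])

# Retrieve the passages most relevant to a question
retriever = InMemoryBM25Retriever(document_store=store)
hits = retriever.run(query="What is the capital of France?", top_k=1)["documents"]
for doc in hits:
    print(doc.score, doc.content)
```
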

All scripts support various parameters for customization. Use `--help` with any script to see the available options.
assets/overview.png ADDED

Git LFS Details

  • SHA256: af5065239c895aa1097048009adab15ce0369801cafc5f84b82cd147246b2077
  • Pointer size: 131 Bytes
  • Size of remote file: 782 kB
data/2wikimqa.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/2wikimqa_antifact.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/hotpotqa.jsonl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0005ab2a1bc2ac3a70352dccbf96cccc4e0aac6bb677f6a55180fa51b92ef6f
3
+ size 11483614
data/hotpotqa_antifact.jsonl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94d00ea5c5f14ce5c0638bc6c66658e93425d87f377bf1d3da4a004fd15fcb6f
3
+ size 11447185
data/hotpotqa_random.jsonl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d2072c7caac2fdc462c0777bbcb38b41292974383db46a2fedb4c3b86f3c825
3
+ size 11461405
data/multifieldqa_en.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/multifieldqa_en_antifact.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/musique.jsonl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ac69b91281c4ec6b21316cb7282e83fb6b4dda04fc68480bb8d8ed1e19ff7bd
3
+ size 14085077
data/musique_antifact.jsonl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e80355c61ec5d4f79e2d3610fa9c25156746079bff21268932ae9cc8d23acdfc
3
+ size 14057004
detect/contextleakage.py ADDED
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Evaluate models on a LongBench subset with Exact-Match (EM).
5
+ Supports both Qwen3 (Transformers) and other models (vLLM).
6
+
7
+ Requirements
8
+ ------------
9
+ pip install vllm datasets tqdm transformers accelerate
10
+ """
11
+
12
+ import argparse, logging, time, torch
13
+ from pathlib import Path
14
+
15
+ from datasets import load_dataset
16
+ from tqdm import tqdm
17
+ from utils.metrics import qa_em_score
18
+ import os
19
+
20
+ # ---------------------------- CLI ------------------------------------
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument("--hf_model",
23
+ default="Qwen/Qwen3-8B-Instruct",
24
+ help="Model name or local path")
25
+ parser.add_argument("--is_qwen3", action="store_true",
26
+ help="Set this flag if using Qwen3 model (uses Transformers). Otherwise uses vLLM.")
27
+ parser.add_argument("--max_new_tokens", type=int, default=20)
28
+ parser.add_argument("--max_tokens", type=int, default=20,
29
+ help="For vLLM models (ignored if --is_qwen3)")
30
+ parser.add_argument("--temperature", type=float, default=0.0)
31
+ parser.add_argument("--top_p", type=float, default=1.0)
32
+ parser.add_argument("--tensor_parallel_size", type=int, default=2,
33
+ help="GPU parallel size for vLLM (ignored if --is_qwen3)")
34
+
35
+ parser.add_argument("--dataset_repo", default="THUDM/LongBench")
36
+ parser.add_argument("--dataset_subset", default="hotpotqa")
37
+ parser.add_argument("--split", default="test")
38
+ parser.add_argument("--sleep", type=float, default=0.0)
39
+ parser.add_argument("--log", default="summary.log")
40
+ parser.add_argument("--cuda_devices", default="1,6",
41
+ help="CUDA visible devices")
42
+ args = parser.parse_args()
43
+
44
+ # Set CUDA devices
45
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_devices
46
+
47
+ # --------------------------- logging ---------------------------------
48
+ logging.basicConfig(
49
+ filename=args.log,
50
+ level=logging.INFO,
51
+ format="%(asctime)s - %(message)s",
52
+ filemode="a",
53
+ )
54
+ logging.getLogger().addHandler(logging.StreamHandler())
55
+
56
+ # ------------------------- dataset -----------------------------------
57
+ ds = load_dataset(args.dataset_repo, args.dataset_subset, split=args.split)
58
+ total = len(ds)
59
+ logging.info("Loaded %d samples from %s/%s[%s]",
60
+ total, args.dataset_repo, args.dataset_subset, args.split)
61
+
62
+ if args.is_qwen3:
63
+ # ---------------------- Qwen3 with Transformers ----------------------------
64
+ from transformers import AutoTokenizer, AutoModelForCausalLM
65
+
66
+ load_kwargs = dict(
67
+ trust_remote_code=True,
68
+ device_map="auto",
69
+ )
70
+
71
+ tokenizer = AutoTokenizer.from_pretrained(args.hf_model,
72
+ trust_remote_code=True)
73
+ model = AutoModelForCausalLM.from_pretrained(
74
+ args.hf_model,
75
+ torch_dtype=torch.float16,
76
+ **load_kwargs
77
+ )
78
+
79
+ EOS_ID = tokenizer.eos_token_id
80
+ THINK_ENDID = 151668 # </think> token id
81
+
82
+ gen_kwargs = dict(
83
+ max_new_tokens=args.max_new_tokens,
84
+ temperature=args.temperature,
85
+ top_p=args.top_p,
86
+ do_sample=args.temperature > 0,
87
+ eos_token_id=EOS_ID,
88
+ )
89
+
90
+ # -------------------------- Qwen3 loop -------------------------------------
91
+ correct_em = 0
92
+
93
+ for ex in tqdm(ds, desc="Evaluating with Transformers (Qwen3)"):
94
+ q = ex["input"]
95
+ golds = ex["answers"]
96
+
97
+ msgs = [
98
+ {"role": "system", "content": "You are a QA assistant."},
99
+ {"role": "user",
100
+ "content": f"Question: {q}\n"
101
+ "Please reply with *only* the final answer—no extra words."}
102
+ ]
103
+ prompt = tokenizer.apply_chat_template(
104
+ msgs,
105
+ tokenize=False,
106
+ add_generation_prompt=True,
107
+ enable_thinking=False # Qwen3 thinking mode
108
+ )
109
+ inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
110
+
111
+ with torch.no_grad():
112
+ outs = model.generate(**inputs, **gen_kwargs)[0]
113
+
114
+ # Extract newly generated tokens
115
+ new_ids = outs[len(inputs.input_ids[0]):].tolist()
116
+
117
+ # Find the last </think>; if it is absent, decode from index 0
118
+ try:
119
+ idx = len(new_ids) - new_ids[::-1].index(THINK_ENDID)
120
+ except ValueError:
121
+ idx = 0
122
+
123
+ content = tokenizer.decode(new_ids[idx:],
124
+ skip_special_tokens=True).strip("\n").strip()
125
+
126
+ # Only use content for EM comparison
127
+ if any(qa_em_score(content, g) for g in golds):
128
+ correct_em += 1
129
+
130
+ if args.sleep:
131
+ time.sleep(args.sleep)
132
+
133
+ else:
134
+ # ---------------------- Other models with vLLM ----------------------------
135
+ from vllm import LLM, SamplingParams
136
+
137
+ # Initialize vLLM
138
+ llm = LLM(
139
+ model=args.hf_model,
140
+ tensor_parallel_size=args.tensor_parallel_size,
141
+ )
142
+ sampler = SamplingParams(
143
+ temperature=args.temperature,
144
+ max_tokens=args.max_tokens,
145
+ top_p=args.top_p,
146
+ stop=["</assistant>", "</s>", "<|end_of_text|>"],
147
+ )
148
+
149
+ # -------------------------- vLLM loop -------------------------------------
150
+ correct_em = 0
151
+
152
+ for ex in tqdm(ds, desc="Evaluating with vLLM"):
153
+ question = ex["input"]
154
+ golds = ex["answers"] # list[str]
155
+
156
+ chat_params = SamplingParams(
157
+ temperature=args.temperature,
158
+ max_tokens=args.max_tokens,
159
+ top_p=args.top_p,
160
+ stop=["</s>", "<|end_of_text|>"], # Safety stop tokens
161
+ )
162
+
163
+ messages = [
164
+ {"role": "system",
165
+ "content": "You are a QA assistant."},
166
+ {"role": "user",
167
+ "content": f"Question: {question}\n"
168
+ "Please first reply with *only* the final answer—no extra words.\n Answer:"}
169
+ ]
170
+
171
+ result = llm.chat(messages, sampling_params=chat_params)
172
+ # vLLM returns list[RequestOutput]; take first output's first candidate
173
+ pred = result[0].outputs[0].text.strip()
174
+ print(f"A: {pred}\nG: {golds}\n")
175
+
176
+ if any(qa_em_score(pred, g) for g in golds):
177
+ correct_em += 1
178
+
179
+ if args.sleep:
180
+ time.sleep(args.sleep)
181
+
182
+ # -------------------------- result -----------------------------------
183
+ em = correct_em / total
184
+ model_type = "Qwen3 (Transformers)" if args.is_qwen3 else "vLLM"
185
+ logging.info("RESULT | model=%s | type=%s | subset=%s | EM=%.4f",
186
+ args.hf_model, model_type, args.dataset_subset, em)
187
+ print(
188
+ f"\n=== SUMMARY ===\n"
189
+ f"Model : {args.hf_model}\n"
190
+ f"Type : {model_type}\n"
191
+ f"Subset : {args.dataset_subset} ({args.split})\n"
192
+ f"EM : {em:.4f}\n"
193
+ f"(Log in {Path(args.log).resolve()})"
194
+ )
detect/contextleakage_api.py ADDED
@@ -0,0 +1,87 @@
1
+ import os, time, argparse, logging
2
+ from datasets import load_dataset
3
+ from openai import OpenAI
4
+ from tqdm import tqdm
5
+ from utils.metrics import qa_em_score
6
+
7
+ # ----------------------------------------------------------------------
8
+ # CLI
9
+ # ----------------------------------------------------------------------
10
+ parser = argparse.ArgumentParser()
11
+ parser.add_argument("--model", default="gpt-4o")
12
+ parser.add_argument("--dataset_repo", default="THUDM/LongBench")
13
+ parser.add_argument("--dataset_subset", default="hotpotqa")
14
+ parser.add_argument("--split", default="test")
15
+ parser.add_argument("--max_tokens", type=int, default=30)
16
+ parser.add_argument("--temperature", type=float, default=0.0)
17
+ parser.add_argument("--sleep", type=float, default=0.5,
18
+ help="seconds to wait between requests")
19
+ parser.add_argument("--log", default="summary.log",
20
+ help="append overall score here")
21
+ args = parser.parse_args()
22
+
23
+ # ----------------------------------------------------------------------
24
+ # Logging (append mode)
25
+ # ----------------------------------------------------------------------
26
+ logging.basicConfig(
27
+ filename=args.log,
28
+ level=logging.INFO,
29
+ format="%(asctime)s - %(message)s",
30
+ filemode="a",
31
+ )
32
+ console = logging.StreamHandler()
33
+ console.setLevel(logging.INFO)
34
+ logging.getLogger().addHandler(console)
35
+
36
+ # ----------------------------------------------------------------------
37
+ # OpenAI client
38
+ # ----------------------------------------------------------------------
39
+ client = OpenAI(
40
+ api_key=os.environ.get("OPENAI_API_KEY"),
41
+ base_url=os.environ.get("OPENAI_BASE_URL")
42
+ )
43
+
44
+ # ----------------------------------------------------------------------
45
+ # Load dataset
46
+ # ----------------------------------------------------------------------
47
+ ds = load_dataset(args.dataset_repo, args.dataset_subset, split=args.split)
48
+ total = len(ds)
49
+ logging.info("Loaded %d samples from %s/%s[%s]",
50
+ total, args.dataset_repo, args.dataset_subset, args.split)
51
+
52
+ # ----------------------------------------------------------------------
53
+ # Evaluation loop
54
+ # ----------------------------------------------------------------------
55
+ correct_em = 0
56
+
57
+ for ex in tqdm(ds, desc="Evaluating"):
58
+ question = ex["input"]
59
+ golds = ex["answers"]
60
+
61
+ resp = client.chat.completions.create(
62
+ model=args.model,
63
+ messages=[
64
+ {"role": "system", "content": "You are a QA assistant."},
65
+ {"role": "user",
66
+ "content": f"Question: {question}\n"
67
+ "Please first reply with *only* the final answer—no extra words.\n Answer:"}
68
+ ],
69
+ temperature=args.temperature,
70
+ max_tokens=args.max_tokens,
71
+ )
72
+ pred = resp.choices[0].message.content.strip()
73
+ print(f"A: {pred}\n G: {golds}")
74
+
75
+ if any(qa_em_score(pred, g) for g in golds):
76
+ correct_em += 1
77
+
78
+ time.sleep(args.sleep)
79
+
80
+ em_score = correct_em / total
81
+ logging.info("RESULT | model=%s | subset=%s | EM=%.4f",
82
+ args.model, args.dataset_subset, em_score)
83
+
84
+ print(f"\n=== SUMMARY ===\nModel : {args.model}"
85
+ f"\nDataset : {args.dataset_subset} ({args.split})"
86
+ f"\nEM : {em_score:.4f}\n"
87
+ f"(Appended to {args.log})")
detect/question_rephrase_answer_api.py ADDED
@@ -0,0 +1,153 @@
1
+ import json
2
+ import time
3
+ import os
4
+ import argparse
5
+ from datasets import load_dataset
6
+ from openai import OpenAI
7
+ from tqdm import tqdm
8
+ from utils.metrics import qa_f1_score, qa_em_score # Import evaluation functions
9
+
10
+ # Configure OpenAI API
11
+ client = OpenAI(
12
+ api_key=os.environ.get("OPENAI_API_KEY"),
13
+ base_url=os.environ.get("OPENAI_BASE_URL")
14
+ )
15
+
16
+ def get_openai_response(prompt, model="gpt-4o", retries=3, delay=2):
17
+ """Call OpenAI API to get response with retry mechanism"""
18
+ for attempt in range(retries):
19
+ try:
20
+ completion = client.chat.completions.create(
21
+ model=model,
22
+ messages=[{'role': 'user', 'content': prompt}],
23
+ max_tokens=100
24
+ )
25
+ return completion.choices[0].message.content.strip()
26
+ except Exception as e:
27
+ print(f"Attempt {attempt + 1} failed: {e}")
28
+ if attempt < retries - 1:
29
+ print(f"Retrying in {delay} seconds...")
30
+ time.sleep(delay)
31
+ else:
32
+ print("Max retries reached. Skipping this request.")
33
+ return "Failed to get response"
34
+
35
+ def rephrase_question_api(question, model_name, rephrase_type="opposite"):
36
+ """Use OpenAI API to rephrase question (English prompt)"""
37
+ if rephrase_type == "opposite":
38
+ prompt = f"""Please rephrase the following question to have the exact opposite meaning.
39
+ Question: {question}
40
+
41
+ Return only the rephrased question with the opposite meaning, without any explanations or other content."""
42
+ elif rephrase_type == "similar":
43
+ prompt = f"""Please rephrase the following question to be synonymous, maintaining the original meaning but using different wording:
44
+ Question: {question}
45
+
46
+ Return only the rephrased question, without any explanations or other content."""
47
+ else:
48
+ raise ValueError(f"Invalid rephrase_type: {rephrase_type}. Must be 'opposite' or 'similar'.")
49
+
50
+ return get_openai_response(prompt, model=model_name)
51
+
52
+ def answer_question_with_context_api(question, context, model_name, max_tokens_for_answer=30):
53
+ """Use OpenAI API to answer question based on context (English prompt)"""
54
+ prompt = f"""Please answer the question based on the following context:
55
+
56
+ Context:
57
+ {context}
58
+
59
+ Question: {question}
60
+
61
+ Only output the answer, no any other text. If the answer is not in the context, please say "I don't know".
62
+
63
+ Answer:"""
64
+ try:
65
+ completion = client.chat.completions.create(
66
+ model=model_name,
67
+ messages=[{'role': 'user', 'content': prompt}],
68
+ max_tokens=max_tokens_for_answer
69
+ )
70
+ return completion.choices[0].message.content.strip()
71
+ except Exception as e:
72
+ print(f"Answer generation failed for model {model_name}: {e}")
73
+ return "Failed to get answer"
74
+
75
+ def main(args):
76
+ # Load dataset
77
+ print(f"Loading dataset {args.dataset_name}, subset {args.dataset_subset}...")
78
+ try:
79
+ dataset = load_dataset(args.dataset_name, args.dataset_subset)["test"]
80
+ print(f"Successfully loaded dataset with {len(dataset)} samples.")
81
+ except Exception as e:
82
+ print(f"Failed to load dataset: {e}")
83
+ return
84
+
85
+ em_match_count = 0 # Counter for EM matches
86
+ successfully_processed_samples = 0 # Counter for successfully processed samples
87
+
88
+ num_samples_to_process = len(dataset) if args.sample_count == -1 else min(args.sample_count, len(dataset))
89
+
90
+ print(f"Processing {num_samples_to_process} samples. Rephrasing with GPT-4o (opposite meaning). Answering with {args.model_name} (max 30 tokens for answer)...")
91
+
92
+ for i in tqdm(range(num_samples_to_process), desc="Processing samples"):
93
+ example = dataset[i]
94
+ original_question = example['input']
95
+ context = example['context']
96
+ ground_truth_answers = example['answers']
97
+
98
+ print(f"Original question: {original_question}")
99
+
100
+ # Use API to rephrase question, fixed using gpt-4o
101
+ rephrased_question = rephrase_question_api(original_question, "gpt-4o", args.rephrase_type)
102
+ print(f"Rephrased question (opposite): {rephrased_question}")
103
+
104
+ if rephrased_question == "Failed to get response" or rephrased_question == "Failed to rephrase question": # Broader check
105
+ print(f"Skipping sample {i+1} due to rephrasing failure.")
106
+ continue
107
+
108
+ # Use rephrased question and context to get answer, using args.model_name, answer length limited to 30 tokens
109
+ rephrased_answer = answer_question_with_context_api(rephrased_question, context, args.model_name, max_tokens_for_answer=30)
110
+ # print(f"Answer to rephrased question: {rephrased_answer}")
111
+
112
+ if rephrased_answer == "Failed to get answer":
113
+ print(f"Skipping sample {i+1} due to answer generation failure.")
114
+ continue
115
+
116
+ if not ground_truth_answers:
117
+ print(f"Skipping sample {i+1} due to missing ground truth answers.")
118
+ continue
119
+
120
+ successfully_processed_samples += 1
121
+ sample_had_em_match = False
122
+ for gt_ans in ground_truth_answers:
123
+ em = qa_em_score(rephrased_answer, gt_ans)
124
+ if em > 0: # EM is 1.0 for a match
125
+ sample_had_em_match = True
126
+ break
127
+
128
+ if sample_had_em_match:
129
+ em_match_count += 1
130
+ # print(f"Sample EM with original GT: {1 if sample_had_em_match else 0}")
131
+
132
+ if successfully_processed_samples > 0:
133
+ print(f"\n--- Evaluation Summary ---")
134
+ print(f"Answering Model : {args.model_name}")
135
+ print(f"Dataset : {args.dataset_name} ({args.dataset_subset})")
136
+ print(f"Successfully Processed Samples for Evaluation: {successfully_processed_samples}")
137
+ print(f"Max Answer Tokens: 30")
138
+ print(f"Count of EM with original ground truth (after rephrase): {em_match_count}")
139
+ else:
140
+ print("\nNo samples were processed adequately to provide an evaluation summary.")
141
+
142
+ print("Processing complete!")
143
+
144
+ if __name__ == "__main__":
145
+ parser = argparse.ArgumentParser(description="Rephrase questions to opposite meaning with GPT-4o, answer with specified OpenAI model, then count EM against original GT.")
146
+ parser.add_argument("--model_name", type=str, default="gpt-4o", help="Name of the OpenAI model to use for Answering.")
147
+ parser.add_argument("--dataset_name", type=str, default="THUDM/LongBench", help="Name of the Hugging Face dataset.")
148
+ parser.add_argument("--dataset_subset", type=str, default="2wikimqa", help="Subset of the dataset.")
149
+ parser.add_argument("--sample_count", type=int, default=-1, help="Number of samples to process. -1 for all samples.")
150
+ parser.add_argument("--rephrase_type", type=str, default="opposite", choices=["opposite", "similar"], help="Type of rephrasing: 'opposite' for opposite meaning or 'similar' for similar meaning.")
151
+
152
+ args = parser.parse_args()
153
+ main(args)
detect/question_rephrase_answer_qwen3.py ADDED
@@ -0,0 +1,235 @@
1
+ import time
2
+ import os
3
+ import argparse
4
+ import torch
5
+ from datasets import load_dataset
6
+ from tqdm import tqdm
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ from openai import OpenAI # Added for GPT-4o rephrasing
9
+ from utils.metrics import qa_f1_score, qa_em_score
10
+
11
+
12
+ THINK_END_ID = 151668 # </think> token ID for Qwen3 models
13
+
14
+ # --- OpenAI Client for Rephrasing ---
15
+ openai_client = OpenAI(
16
+ api_key=os.environ.get("OPENAI_API_KEY"),
17
+ base_url=os.environ.get("OPENAI_BASE_URL")
18
+ )
19
+
20
+ def get_openai_rephrase_response(prompt, model="gpt-4o", retries=3, delay=2):
21
+ """Call OpenAI API for rephrasing."""
22
+ for attempt in range(retries):
23
+ try:
24
+ completion = openai_client.chat.completions.create(
25
+ model=model,
26
+ messages=[{'role': 'user', 'content': prompt}],
27
+ max_tokens=100
28
+ )
29
+ return completion.choices[0].message.content.strip()
30
+ except Exception as e:
31
+ print(f"OpenAI Rephrase attempt {attempt + 1} failed: {e}")
32
+ if attempt < retries - 1:
33
+ print(f"Retrying OpenAI rephrase in {delay} seconds...")
34
+ time.sleep(delay)
35
+ else:
36
+ print("Max retries for OpenAI rephrase reached.")
37
+ return "Failed to rephrase question"
38
+
39
+ def rephrase_question_with_gpt4o(question, rephrase_type="opposite"):
40
+ if rephrase_type == "opposite":
41
+ prompt = f"""Please rephrase the following question to have the exact opposite meaning.
42
+ Question: {question}
43
+
44
+ Return only the rephrased question with the opposite meaning, without any explanations or other content."""
45
+ elif rephrase_type == "similar":
46
+ prompt = f"""Please rephrase the following question to be synonymous, maintaining the original meaning but using different wording:
47
+ Question: {question}
48
+
49
+ Return only the rephrased question, without any explanations or other content."""
50
+ else:
51
+ raise ValueError(f"Invalid rephrase_type: {rephrase_type}. Must be 'opposite' or 'similar'.")
52
+
53
+ return get_openai_rephrase_response(prompt)
54
+
55
+ # --- Qwen3-Specific Hugging Face Model Functions (for Answering) ---
56
+ def get_qwen3_hf_response(prompt_text, model, tokenizer, device, max_new_tokens=40, retries=2, delay=5):
57
+ """Generate a response from a Qwen3-like HF model. max_new_tokens default to 30."""
58
+ for attempt in range(retries):
59
+ try:
60
+ messages = [{"role": "user", 'content': prompt_text}]
61
+
62
+ chat_template_args = {
63
+ "tokenize": False,
64
+ "add_generation_prompt": True
65
+ }
66
+ # Qwen models (like Qwen1.5, Qwen2) often use/support enable_thinking
67
+ # Check if tokenizer's apply_chat_template supports 'enable_thinking'
68
+ # This check is simplified; for robust production, inspect.signature might be better
69
+ # but for Qwen-specific, we assume it or it gracefully ignores.
70
+ try:
71
+ # Attempt to use enable_thinking=False for Qwen models
72
+ processed_prompt = tokenizer.apply_chat_template(
73
+ messages, **chat_template_args, enable_thinking=False
74
+ )
75
+ except TypeError:
76
+ # Fallback if enable_thinking is not a valid kwarg for the specific tokenizer version
77
+ print("Warning: Tokenizer does not support 'enable_thinking' in apply_chat_template. Proceeding without it.")
78
+ processed_prompt = tokenizer.apply_chat_template(messages, **chat_template_args)
79
+ except Exception as e:
80
+ print(f"Warning: Error applying chat template: {e}. Using raw prompt.")
81
+ processed_prompt = prompt_text # Fallback to raw prompt
82
+
83
+ inputs = tokenizer(processed_prompt, return_tensors="pt", padding=True, truncation=True).to(device)
84
+
85
+ generated_ids_full = model.generate(
86
+ inputs.input_ids,
87
+ attention_mask=inputs.attention_mask,
88
+ max_new_tokens=max_new_tokens,
89
+ pad_token_id=tokenizer.eos_token_id
90
+ )
91
+ # Get only newly generated tokens
92
+ output_only_ids_list = generated_ids_full[0][inputs.input_ids.shape[1]:].tolist()
93
+
94
+ # Strip <think>...</think> tags specifically for Qwen
95
+ try:
96
+ # Find the last occurrence of THINK_END_ID and take tokens after it
97
+ cut_index = len(output_only_ids_list) - output_only_ids_list[::-1].index(THINK_END_ID)
98
+ final_ids_to_decode = output_only_ids_list[cut_index:]
99
+ except ValueError:
100
+ # THINK_END_ID not found, use all generated new tokens
101
+ final_ids_to_decode = output_only_ids_list
102
+
103
+ response = tokenizer.decode(final_ids_to_decode, skip_special_tokens=True).strip()
104
+ return response
105
+ except Exception as e:
106
+ print(f"Qwen HF Model generation attempt {attempt + 1} failed: {e}")
107
+ if attempt < retries - 1:
108
+ print(f"Retrying in {delay} seconds...")
109
+ time.sleep(delay)
110
+ else:
111
+ print("Max retries for Qwen HF model reached. Skipping this request.")
112
+ return "Failed to get Qwen HF response"
113
+
114
+ def answer_question_with_context_qwen3_hf(question, context, model, tokenizer, device):
115
+ """Answer a question with context using a Qwen3-like HF model."""
116
+ prompt = f"""Please answer the question based on the following context:
117
+
118
+ Context:
119
+ {context}
120
+
121
+ Question: {question}
122
+
123
+ Only output the answer, no any other text. If the answer is not in the context, please say "I don't know".
124
+
125
+ Answer:"""
126
+ return get_qwen3_hf_response(prompt, model, tokenizer, device)
127
+
128
+ def main(args):
129
+ hf_device_setting = "auto"
130
+ print(f"Attempting to use device: {hf_device_setting} for Qwen HF model.")
131
+
132
+ print(f"Loading Qwen HF model for Answering: {args.model_name}...")
133
+ hf_model = None
134
+ hf_tokenizer = None
135
+ try:
136
+ hf_tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code_hf)
137
+ hf_model = AutoModelForCausalLM.from_pretrained(
138
+ args.model_name,
139
+ device_map=hf_device_setting,
140
+ trust_remote_code=args.trust_remote_code_hf,
141
+ torch_dtype="bfloat16"
142
+ )
143
+ hf_model.eval()
144
+
145
+
146
+
147
+ print(f"Successfully loaded Qwen HF model {args.model_name}.")
148
+ except Exception as e:
149
+ print(f"Failed to load Qwen HF model {args.model_name}: {e}")
150
+ return
151
+
152
+ print(f"Loading dataset {args.dataset_name}, subset {args.dataset_subset}...")
153
+ try:
154
+ dataset = load_dataset(args.dataset_name, args.dataset_subset)["test"]
155
+ print(f"Successfully loaded dataset with {len(dataset)} samples.")
156
+ except Exception as e:
157
+ print(f"Failed to load dataset: {e}")
158
+ return
159
+
160
+ em_match_count = 0 # Counter for EM matches
161
+ em_match_original_count = 0 # Counter for EM matches
162
+ successfully_processed_samples = 0 # Counter for successfully processed samples
163
+
164
+ num_samples_to_process = len(dataset) if args.sample_count == -1 else min(args.sample_count, len(dataset))
165
+
166
+ print(f"Processing {num_samples_to_process} samples. Rephrasing with GPT-4o (opposite meaning). Answering with Qwen HF model {args.model_name} (max 30 tokens)...")
167
+
168
+ for i in tqdm(range(num_samples_to_process), desc="Processing samples"):
169
+ example = dataset[i]
170
+ original_question = example['input']
171
+ context = example['context']
172
+ ground_truth_answers = example['answers']
173
+ print(original_question)
174
+
175
+ rephrased_question = rephrase_question_with_gpt4o(original_question, args.rephrase_type)
176
+ print(rephrased_question)
177
+
178
+ if rephrased_question == "Failed to rephrase question":
179
+ print(f"Skipping sample {i+1} due to rephrasing failure.")
180
+ continue
181
+
182
+ rephrased_answer = answer_question_with_context_qwen3_hf(rephrased_question, context, hf_model, hf_tokenizer, hf_model.device)
183
+ print(rephrased_answer)
184
+ original_answer = answer_question_with_context_qwen3_hf(original_question, context, hf_model, hf_tokenizer, hf_model.device)
185
+ if not ground_truth_answers:
186
+ print(f"Skipping sample {i+1} due to missing ground truth answers.")
187
+ continue
188
+ print(original_answer)
189
+ successfully_processed_samples += 1
190
+
191
+ sample_had_em_match = False
192
+ for gt_ans in ground_truth_answers:
193
+ em = qa_em_score(rephrased_answer, gt_ans)
194
+ if em > 0: # Check for exact match (assuming qa_em_score returns 1.0 for EM)
195
+ sample_had_em_match = True
196
+ break
197
+
198
+ if sample_had_em_match:
199
+ em_match_count += 1
200
+
201
+ sample_had_em_match = False
202
+ for gt_ans in ground_truth_answers:
203
+ em = qa_em_score(original_answer, gt_ans)
204
+ if em > 0: # Check for exact match (assuming qa_em_score returns 1.0 for EM)
205
+ sample_had_em_match = True
206
+ break
207
+ if sample_had_em_match:
208
+ em_match_original_count += 1
209
+
210
+ if successfully_processed_samples > 0:
211
+ print(f"\n--- Evaluation Summary ---")
212
+ print(f"Answering Qwen HF Model: {args.model_name}")
213
+ print(f"Dataset: {args.dataset_name} ({args.dataset_subset})")
214
+ print(f"Successfully Processed Samples for Evaluation: {successfully_processed_samples}")
215
+ print(f"Count of EM with original ground truth (after rephrase): {em_match_count}")
216
+ print(f"Count of EM with original ground truth (before rephrase): {em_match_original_count}")
217
+ else:
218
+ print("\nNo samples were processed adequately to provide an evaluation summary.")
219
+
220
+ print("Processing complete!")
221
+
222
+ if __name__ == "__main__":
223
+ parser = argparse.ArgumentParser(description="Rephrase with GPT-4o, Answer with local Qwen3-like HF Model, then Evaluate.")
224
+ parser.add_argument("--model_name", type=str, default="Qwen/Qwen1.5-7B-Chat", help="Name of the Qwen3-like Hugging Face model for Answering.")
225
+ parser.add_argument("--trust_remote_code_hf", action="store_true", default=True, help="Set to true if the Hugging Face model requires remote code (default: True for Qwen). Argument is present for explicitness but defaults to True.")
226
+ parser.add_argument("--dataset_name", type=str, default="THUDM/LongBench", help="Name of the Hugging Face dataset.")
227
+ parser.add_argument("--dataset_subset", type=str, default="2wikimqa", help="Subset of the dataset.")
228
+ parser.add_argument("--sample_count", type=int, default=5, help="Number of samples to process. -1 for all. Default: 5.")
229
+ parser.add_argument("--rephrase_type", type=str, default="opposite", choices=["opposite", "similar"], help="Type of rephrasing: 'opposite' for opposite meaning or 'similar' for similar meaning.")
230
+
231
+ args = parser.parse_args()
232
+ if not os.environ.get("OPENAI_API_KEY"):
233
+ print("CRITICAL ERROR: Please replace 'your_api_key_here' with your actual OpenAI API key in the script.")
234
+ else:
235
+ main(args)
detect/question_rephrase_answer_vllm.py ADDED
@@ -0,0 +1,226 @@
1
+ import time
2
+ import os
3
+ import argparse
4
+ # import torch # torch might not be directly needed if vLLM handles all device aspects
5
+ from datasets import load_dataset
6
+ from tqdm import tqdm
7
+ from openai import OpenAI # For GPT-4o rephrasing
8
+ from vllm import LLM, SamplingParams # For vLLM inference
9
+ from transformers import AutoTokenizer # Import AutoTokenizer
10
+ from utils.metrics import qa_f1_score, qa_em_score
11
+
12
+ # This will be respected by vLLM if CUDA_VISIBLE_DEVICES is set before vLLM import
13
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2" # User can set this outside the script
14
+
15
+ # --- OpenAI Client for Rephrasing ---
16
+ openai_client = OpenAI(
17
+ api_key=os.environ.get("OPENAI_API_KEY"),
18
+ base_url=os.environ.get("OPENAI_BASE_URL")
19
+ )
20
+
21
+ def get_openai_rephrase_response(prompt, model="gpt-4o", retries=3, delay=2):
22
+ """Call OpenAI API for rephrasing."""
23
+ for attempt in range(retries):
24
+ try:
25
+ completion = openai_client.chat.completions.create(
26
+ model=model,
27
+ messages=[{'role': 'user', 'content': prompt}],
28
+ max_tokens=100 # Max tokens for rephrased question
29
+ )
30
+ return completion.choices[0].message.content.strip()
31
+ except Exception as e:
32
+ print(f"OpenAI Rephrase attempt {attempt + 1} failed: {e}")
33
+ if attempt < retries - 1:
34
+ print(f"Retrying OpenAI rephrase in {delay} seconds...")
35
+ time.sleep(delay)
36
+ else:
37
+ print("Max retries for OpenAI rephrase reached.")
38
+ return "Failed to rephrase question"
39
+
40
+ def rephrase_question_with_gpt4o(question, rephrase_type="opposite"):
41
+ """Rephrase a question using GPT-4o (English prompt)."""
42
+ if rephrase_type == "opposite":
43
+ prompt = f"""Please rephrase the following question to have the exact opposite meaning.
44
+ Question: {question}
45
+
46
+ Return only the rephrased question with the opposite meaning, without any explanations or other content."""
47
+ elif rephrase_type == "similar":
48
+ prompt = f"""Please rephrase the following question to be synonymous, maintaining the original meaning but using different wording:
49
+ Question: {question}
50
+
51
+ Return only the rephrased question, without any explanations or other content."""
52
+ else:
53
+ raise ValueError(f"Invalid rephrase_type: {rephrase_type}. Must be 'opposite' or 'similar'.")
54
+
55
+ return get_openai_rephrase_response(prompt)
56
+
57
+ # --- vLLM Model Functions (for Answering) ---
58
+ def get_vllm_response(prompt_text, llm_instance, sampling_params_instance, retries=2, delay=5):
59
+ """Generate a response from a vLLM instance."""
60
+ for attempt in range(retries):
61
+ try:
62
+ # vLLM generate method expects a list of prompts
63
+ outputs = llm_instance.generate([prompt_text], sampling_params_instance)
64
+ # For a single prompt, the result is in the first element of the output list
65
+ # Each output object has a list of `outputs` (for n>1 in SamplingParams)
66
+ response = outputs[0].outputs[0].text.strip()
67
+ return response
68
+ except Exception as e:
69
+ print(f"vLLM generation attempt {attempt + 1} failed: {e}")
70
+ if attempt < retries - 1:
71
+ print(f"Retrying vLLM generation in {delay} seconds...")
72
+ time.sleep(delay)
73
+ else:
74
+ print("Max retries for vLLM generation reached.")
75
+ return "Failed to get vLLM response"
76
+
77
+ def answer_question_with_context_vllm(question, context, llm_instance, sampling_params_instance, tokenizer):
78
+ """Answer a question with context using a vLLM model and chat template (English prompt)."""
79
+ # Construct prompt using chat template, similar to evaluation.py
80
+ prompt_content = (
81
+ f"Answer the question based on the given passages. "
82
+ "Only give me your answer and do not output any other words.\\n"
83
+ "The following are given passages:\\n"
84
+ f"{context}\\n"
85
+ "Please strictly follow the context. "
86
+ f"Question: {question}\\n"
87
+ "Answer:"
88
+ )
89
+ messages = [{"role": "user", "content": prompt_content}]
90
+
91
+ # Apply chat template
92
+ # Note: Some tokenizers might not have a chat template configured, or might have different ways to apply it.
93
+ # This is a common way for many models.
94
+ try:
95
+ final_prompt_text = tokenizer.apply_chat_template(
96
+ messages,
97
+ tokenize=False,
98
+ add_generation_prompt=True
99
+ )
100
+ except Exception as e:
101
+ print(f"Failed to apply chat template: {e}. Falling back to basic prompt string.")
102
+ # Fallback to a simpler prompt if template application fails
103
+ final_prompt_text = f"Context:\\n{context}\\n\\nQuestion: {question}\\n\\nAnswer:"
104
+
105
+ return get_vllm_response(final_prompt_text, llm_instance, sampling_params_instance)
106
+
107
+ def main(args):
108
+ # Load Tokenizer for the vLLM model
109
+ print(f"Loading tokenizer for model: {args.model_name}...")
110
+ try:
111
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code)
112
+ print("Successfully loaded tokenizer.")
113
+ except Exception as e:
114
+ print(f"Failed to load tokenizer for {args.model_name}: {e}")
115
+ print("Please ensure the model name is correct and the tokenizer can be loaded.")
116
+ return
117
+
118
+ # Load vLLM Model (for Answering)
119
+ print(f"Loading vLLM model for Answering: {args.model_name}...")
120
+ print(f"(This may take a while depending on the model size and download speed if not cached).")
121
+ vllm_model = None
122
+ try:
123
+ # You can expose more vLLM LLM parameters as args if needed
124
+ # (e.g., tensor_parallel_size, dtype, gpu_memory_utilization)
125
+ vllm_model = LLM(
126
+ model=args.model_name,
127
+ trust_remote_code=args.trust_remote_code,
128
+ dtype="bfloat16", # Use dtype from command line arguments
129
+ # Add other vLLM LLM constructor arguments here if needed, e.g.:
130
+ tensor_parallel_size=args.tensor_parallel_size
131
+ )
132
+ print(f"Successfully loaded vLLM model {args.model_name} with dtype='{args.dtype}' and tensor_parallel_size={args.tensor_parallel_size}.")
133
+ except Exception as e:
134
+ print(f"Failed to load vLLM model {args.model_name}: {e}")
135
+ print("Please ensure vLLM is installed correctly and the model identifier is valid.")
136
+ return
137
+
138
+ # Define Sampling Parameters for vLLM
139
+ # max_tokens is equivalent to max_new_tokens in HF
140
+ # temperature=0.0 for greedy decoding, good for QA tasks for more deterministic output.
141
+ # Adjust temperature (e.g., 0.7) and top_p (e.g., 0.95) for more diverse outputs if needed.
142
+ sampling_params = SamplingParams(temperature=0.0, max_tokens=30) # Set temperature to 0.0 for deterministic QA
143
+
144
+ # Load dataset
145
+ print(f"Loading dataset {args.dataset_name}, subset {args.dataset_subset}...")
146
+ try:
147
+ dataset = load_dataset(args.dataset_name, args.dataset_subset)["test"]
148
+ print(f"Successfully loaded dataset with {len(dataset)} samples.")
149
+ except Exception as e:
150
+ print(f"Failed to load dataset: {e}")
151
+ return
152
+
153
+ em_match_count = 0 # Counter for EM matches
154
+ em_match_original_count = 0 # Counter for EM matches
155
+ successfully_processed_samples = 0 # Counter for successfully processed samples
156
+
157
+ num_samples_to_process = len(dataset) if args.sample_count == -1 else min(args.sample_count, len(dataset))
158
+
159
+ print(f"Processing {num_samples_to_process} samples. Rephrasing with GPT-4o (opposite meaning). Answering with vLLM model {args.model_name} (max 30 tokens)...")
160
+
161
+ for i in tqdm(range(num_samples_to_process), desc="Processing samples with vLLM"):
162
+ example = dataset[i]
163
+ original_question = example['input']
164
+ context = example['context']
165
+ ground_truth_answers = example['answers']
166
+
167
+ rephrased_question = rephrase_question_with_gpt4o(original_question, args.rephrase_type) # Use new rephrasing
168
+
169
+ if rephrased_question == "Failed to rephrase question":
170
+ print(f"Skipping sample {i+1} due to rephrasing failure.")
171
+ continue
172
+
173
+ rephrased_answer = answer_question_with_context_vllm(rephrased_question, context, vllm_model, sampling_params, tokenizer)
174
+ # print(f"Rephrased question: {rephrased_question}") # Optional: for debugging
175
+ # print(f"Answer to rephrased: {rephrased_answer}") # Optional: for debugging
176
+
177
+ original_answer = answer_question_with_context_vllm(original_question, context, vllm_model, sampling_params, tokenizer)
178
+ # print(f"Original question: {original_question}") # Optional: for debugging
179
+ # print(f"Answer to original: {original_answer}") # Optional: for debugging
180
+
181
+ if not ground_truth_answers:
182
+ print(f"Skipping sample {i+1} due to missing ground truth answers.")
183
+ continue
184
+ print(original_answer)
185
+ successfully_processed_samples += 1
186
+
187
+ # EM counts a hit when the answer matches any gold answer,
188
+ # consistent with the other detection scripts
189
+ em_match_count += max(qa_em_score(rephrased_answer, g) for g in ground_truth_answers)
190
+
191
+ em_match_original_count += max(qa_em_score(original_answer, g) for g in ground_truth_answers)
201
+
202
+ if successfully_processed_samples > 0:
203
+ print(f"Answering vLLM Model: {args.model_name}")
204
+ print(f"Dataset : {args.dataset_name} ({args.dataset_subset})")
205
+ print(f"Successfully Processed Samples for Evaluation: {successfully_processed_samples}")
206
+ print(f"Max Answer Tokens : 30") # Reflects SamplingParams
207
+ print(f"Count of EM with original ground truth (after rephrase): {em_match_count}")
208
+ print(f"Count of EM with original ground truth (before rephrase): {em_match_original_count}")
209
+ else:
210
+ print("\nNo samples were processed adequately to provide an evaluation summary.")
211
+
212
+ print("vLLM processing complete!")
213
+
214
+ if __name__ == "__main__":
215
+ parser = argparse.ArgumentParser(description="Rephrase with GPT-4o, Answer with local vLLM-hosted Model, then Evaluate.")
216
+ parser.add_argument("--model_name", type=str, default="facebook/opt-125m", help="Name/path of the Hugging Face model for Answering via vLLM (e.g., 'mistralai/Mistral-7B-Instruct-v0.1').")
217
+ parser.add_argument("--dataset_name", type=str, default="THUDM/LongBench", help="Name of the Hugging Face dataset.")
218
+ parser.add_argument("--dataset_subset", type=str, default="2wikimqa", help="Subset of the dataset.")
219
+ parser.add_argument("--sample_count", type=int, default=3, help="Number of samples to process. -1 for all. Default: 3 for quick testing.")
220
+ parser.add_argument("--trust_remote_code", action="store_true", help="Set to true if the Hugging Face model for vLLM requires remote code.")
221
+ parser.add_argument("--tensor_parallel_size", type=int, default=1, help="Tensor parallel size for vLLM.")
222
+ parser.add_argument("--dtype", type=str, default="auto", help="Data type for the model. Examples: 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'. Default is 'auto'.")
223
+ parser.add_argument("--rephrase_type", type=str, default="opposite", choices=["opposite", "similar"], help="Type of rephrasing: 'opposite' for opposite meaning or 'similar' for similar meaning.")
224
+
225
+ args = parser.parse_args()
226
+ main(args)
eval/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # from .evaluation import *
2
+
eval/eval_with_api.py ADDED
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+
6
+ """
7
+ Single-phase evaluator (DeepSeek API) — Calculate EM / F1 only.
8
+
9
+ Usage Example
10
+ --------
11
+ python -m eval.eval_with_api --input data/2wikimqa.jsonl
12
+ """
13
+
14
+ import argparse, time, jsonlines, os
15
+ from pathlib import Path
16
+ from tqdm import tqdm
17
+ from openai import OpenAI
18
+ from utils.metrics import qa_em_score, qa_f1_score
19
+ from utils.llmjudge import judge_answer_with_api
20
+
21
+ # -------------------- CLI --------------------
22
+ p = argparse.ArgumentParser("Single-phase evaluator")
23
+ p.add_argument("--input", required=True, help="Path to the *.jsonl file to evaluate")
24
+ p.add_argument("--model", default="deepseek-r1")
25
+ p.add_argument("--temperature", type=float, default=0.5)
26
+ p.add_argument("--max_tokens", type=int, default=30)
27
+ p.add_argument("--sleep", type=float, default=0.0)
28
+ args = p.parse_args()
29
+
30
+ client = OpenAI(
31
+ base_url=os.environ.get("OPENAI_BASE_URL"),
32
+ api_key=os.environ.get("OPENAI_API_KEY")
33
+ )
34
+
35
+ # -------------------- helper --------------------
36
+ def ask(context: str, question: str) -> str:
37
+ """Call DeepSeek to get answer (return final answer only)"""
38
+ messages = [
39
+ {"role": "system",
40
+ "content": ("You are a QA assistant. "
41
+ "Answer strictly based on the passages; "
42
+ "output only the final answer.")},
43
+ {"role": "user",
44
+ "content": f"Answer the question and output only the final answer without extra words. Passages:\n{context}\n\nQuestion: {question}\nAnswer:"}
45
+ ]
46
+ resp = client.chat.completions.create(
47
+ model=args.model,
48
+ messages=messages,
49
+ temperature=args.temperature,
50
+ max_tokens=args.max_tokens
51
+ )
52
+ if not resp.choices[0].message.content:
53
+ return "None"
54
+
55
+ return resp.choices[0].message.content.strip()
56
+
57
+
58
+ # -------------------- core eval --------------------
59
+ def evaluate_file(path: Path):
60
+ dataset = path.stem
61
+ data = {obj["input"]: obj for obj in jsonlines.open(path)}
62
+
63
+ total = len(data)
64
+ em_hits = 0
65
+ f1_sum = 0.0
66
+
67
+ for q, item in tqdm(data.items(), desc=f"{dataset}"):
68
+ ctx = item["context"]
69
+ golds = item["answers"] if isinstance(item["answers"], list) else [item["answers"]]
70
+
71
+ pred = ask(ctx, q).split('.', 1)[0] # Cut off extra explanations
72
+ if pred == "None":
73
+ continue
74
+ em = max(qa_em_score(pred, g) for g in golds)
75
+ f1 = max(qa_f1_score(pred, g) for g in golds)
76
+
77
+ em_hits += em
78
+ f1_sum += f1
79
+ if args.sleep:
80
+ time.sleep(args.sleep)
81
+
82
+ print(f"\n=== {dataset.upper()} SUMMARY ===")
83
+ print(f"Total samples : {total}")
84
+ print(f"Exact Match : {em_hits}/{total} ({em_hits/total:.2%})")
85
+ print(f"Average F1 : {f1_sum/total:.4f}")
86
+ print("-" * 40 + "\n")
87
+
88
+
89
+ # -------------------- run --------------------
90
+ input_path = Path(args.input)
91
+ if not input_path.exists():
92
+ raise SystemExit(f"File does not exist: {input_path}")
93
+
94
+ evaluate_file(input_path)
eval/evaluation.py ADDED
@@ -0,0 +1,115 @@
1
+
2
+ import argparse, os, jsonlines, torch
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
4
+ from utils.metrics import qa_f1_score, qa_em_score
5
+ THINK_END_ID = 151668 # "</think>" token id for Qwen3
6
+
7
+ # --------------------------------------------------
8
+ def strip_think(token_ids):
9
+ try:
10
+ cut = len(token_ids) - token_ids[::-1].index(THINK_END_ID)
11
+ return token_ids[cut:]
12
+ except ValueError:
13
+ return token_ids
14
+
15
+ def main():
16
+ # ---------- CLI ----------
17
+ parser = argparse.ArgumentParser(
18
+ description="Evaluate HotpotQA JSONL with Transformers + Qwen3-8B"
19
+ )
20
+ parser.add_argument("-i", "--input", required=True,
21
+ help="Path to input JSONL file")
22
+ parser.add_argument("--model", required=True,
23
+ help="HF model name, e.g. Qwen/Qwen3-8B")
24
+ parser.add_argument("-d", "--devices", default="0",
25
+ help="CUDA_VISIBLE_DEVICES (comma-separated)")
26
+ parser.add_argument("-t", "--temperature", type=float, default=0.5,
27
+ help="Sampling temperature")
28
+ parser.add_argument("-k", "--max_tokens", type=int, default=40,
29
+ help="max_new_tokens")
30
+ args = parser.parse_args()
31
+
32
+
33
+
34
+ tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
35
+ model = AutoModelForCausalLM.from_pretrained(
36
+ args.model,
37
+ torch_dtype="auto",
38
+ device_map="auto",
39
+ trust_remote_code=True
40
+ )
41
+ gen_cfg = GenerationConfig(
42
+ temperature=args.temperature,
43
+ max_new_tokens=args.max_tokens,
44
+ do_sample=args.temperature > 0
45
+ )
46
+
47
+
48
+ with jsonlines.open(args.input) as reader:
49
+ data = list(reader)
50
+
51
+ total_f1 = total_em = 0.0
52
+
53
+ for idx, item in enumerate(data):
54
+ question = item.get("input", "")
55
+ context = item.get("context", "")
56
+ answers = item.get("answers", [])
57
+ if not answers:
58
+ print(f"[{idx}] no gold answer, skip")
59
+ continue
60
+ gold = answers[0]
61
+ print(gold)
62
+
63
+ # ----- Prompt -----
64
+ prompt = (
65
+ "Answer the question based on the given passages. "
66
+ "Only give me your answer and do not output any other words.\n"
67
+ "Passages:\n"
68
+ f"{context}\n"
69
+ f"Question: {question}\n"
70
+ "Answer:"
71
+ )
72
+ messages = [{"role": "user", "content": prompt}]
73
+ chat_text = tokenizer.apply_chat_template(
74
+ messages,
75
+ tokenize=False,
76
+ add_generation_prompt=True,
77
+ enable_thinking=False
78
+ )
79
+ inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)
80
+
81
+
82
+ # ----- Generate -----
83
+ try:
84
+ with torch.no_grad():
85
+ outputs = model.generate(**inputs, max_new_tokens=args.max_tokens)
86
+ except ValueError as e:
87
+ if "position ids exceed" in str(e).lower() or "sequence length" in str(e).lower():
88
+ print(f"[{idx}] prompt too long – skipped")
89
+ continue
90
+ raise
91
+
92
+ new_ids = outputs[0][len(inputs.input_ids[0]):].tolist()
93
+ try:
94
+ index = len(new_ids) - new_ids[::-1].index(THINK_END_ID)
95
+ except ValueError:
96
+ index = 0
97
+ answer = tokenizer.decode(new_ids[index:], skip_special_tokens=True).strip("\n")
98
+ answer = answer.strip()
99
+
100
+ # ----- Score -----
101
+ f1 = qa_f1_score(answer, gold)
102
+ em = qa_em_score(answer, gold)
103
+ total_f1 += f1
104
+ total_em += em
105
+
106
+ print(f"[{idx}] Q: {question}")
107
+ print(f" Resp: {answer!r} | Gold: {gold!r}")
108
+ print(f" F1={f1:.2f}, EM={em:.2f}")
109
+
110
+ n = len(data)
111
+ print(f"\nOverall F1: {total_f1/n:.4f}")
112
+ print(f"Overall EM: {total_em/n:.4f}")
113
+
114
+ if __name__ == "__main__":
115
+ main()
main_gpu.py ADDED
@@ -0,0 +1,427 @@
1
+ import json
2
+ import time
3
+ import re
4
+ import os
5
+ import argparse
6
+ from datasets import load_dataset
7
+ from haystack import Pipeline, Document
8
+ from haystack.utils import Secret
9
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
10
+ from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
12
+ from nltk.tokenize import sent_tokenize
13
+ from utils.util import retriveDoc,compute_best_sentence_f1
14
+ from openai import OpenAI
15
+ import asyncio, json, torch, math
16
+ from typing import List, Tuple
17
+ # Hugging Face transformers related
18
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
19
+ from utils.metrics import qa_f1_score
20
+ from utils.llmjudge import judge_answer_with_api
21
+
22
+ client = OpenAI(
23
+ base_url=os.environ.get("OPENAI_BASE_URL"),
24
+ api_key=os.environ.get("OPENAI_API_KEY")
25
+ )
26
+ # Load models using transformers
27
+
28
+ tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen-14B-Chat", trust_remote_code=True)
29
+ model1 = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-14B-Chat", trust_remote_code=True,device_map="cuda:0",torch_dtype=torch.bfloat16)
30
+
31
+
32
+ tok_qwen = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", trust_remote_code=True)
33
+ model_qwen = AutoModelForCausalLM.from_pretrained(
34
+ "Qwen/Qwen2.5-7B-Instruct", trust_remote_code=True,
35
+ device_map="cuda:1",torch_dtype=torch.bfloat16
36
+ ).eval()
37
+
38
+ def get_transformers_answer(prompt, tokenizer, model, max_new_tokens=100, temperature=0.7, top_p=0.9, retries=3, delay=5):
39
+ """
40
+ Use transformers model.generate method for inference with retry mechanism,
41
+ strip the input prompt part through token-level slicing,
42
+ and return the newly generated text.
43
+ """
45
+ for attempt in range(retries):
46
+ try:
47
+ # Encode prompt as model input tensor
48
+ model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
49
+ # Call generate, the generated id sequence contains both prompt and subsequent generated text
50
+ generated_ids = model.generate(
51
+ **model_inputs,
52
+ max_new_tokens=max_new_tokens,
53
+ temperature=temperature,
54
+ top_p=top_p
55
+ )
56
+ # Calculate the token count corresponding to the prompt
57
+ input_length = model_inputs.input_ids.shape[1]
58
+ # Strip the prompt part from the front of the output, keeping only the newly added part
59
+ output_ids = generated_ids[0][input_length:]
60
+ # Decode generated text
61
+ answer = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
62
+ return answer
63
+ except Exception as e:
64
+ print(f"Error on attempt {attempt + 1}: {e}")
65
+ if attempt < retries - 1:
66
+ print(f"Retrying in {delay} seconds...")
67
+ time.sleep(delay)
68
+ else:
69
+ print("Max retries reached, skipping this request.")
70
+ return None
71
+
72
+ def truncate_answer(answer):
73
+ """Truncate answer, only take the part before the first period"""
74
+ return answer.split('.')[0].strip() if answer else "No answer"
75
+
76
+ def write_to_log(filename, data):
77
+ """Write data to log file"""
78
+ with open(filename, 'a', encoding='utf-8') as file:
79
+ file.write(data + '\n')
80
+
81
+ def remove_think_tags(text: str) -> str:
82
+ """Remove all <think> ... </think> blocks"""
83
+ return re.sub(r'<think>(.*?)</think>', '', text, flags=re.DOTALL).strip()
84
+
85
+ def build_prompt(context: str, question: str) -> str:
86
+ prompt = (
87
+ f"Answer the question based on the given passages. The following are the passages:\n"
88
+ f"{context}\n"
89
+ f"Answer the question based on the given passages.\n"
90
+ f"Question: {question}.\n"
91
+ f"Please first provide your answer in the format of Answer:[Your answer]. Then provide your reasoning process step-by-step.(Only include explicit clues) "
92
+ f"At the end of each reasoning step, include a new line that specifies the key information or reference content used in that step. "
93
+ f"Please ensure that the [reference content] you include is the complete original sentence or consecutive sentences from the text. Please do not change the punctuation. Do not use ellipses inside the sentence. "
94
+ f"Follow this format:\n"
95
+ f"Answer: [Your answer]\n"
96
+ f"Step-by-step Reasoning:\n"
97
+ f"1. [Reasoning step 1]\n"
98
+ f"[replaced by your reference content]\n"
99
+ f"2. [Reasoning step 2]\n"
100
+ f"[replaced by your reference content]\n"
101
+ )
102
+ return prompt
103
+
104
+ def extract_final_bullet_passage(answer_text: str):
105
+ reasoning_pattern = r"Step-by-step Reasoning:\s*(.*)"
106
+ reasoning_match = re.search(reasoning_pattern, answer_text, flags=re.DOTALL)
107
+ if not reasoning_match:
108
+ return None, None
109
+
110
+ reasoning_text = reasoning_match.group(1).strip()
111
+ bullet_pattern = r"(?m)^(\d+\.\s.*?)(?=(?:\n\d+\.\s)|\Z)"
112
+ bullets = re.findall(bullet_pattern, reasoning_text, flags=re.DOTALL)
113
+ if not bullets:
114
+ print("No bullet blocks found.")
115
+ return None, None
116
+
117
+ passage_pattern = re.compile(
118
+ r'(?i)(?:\*\*)?passage\s+(\d+)(?:\*\*)?\s*:\s*("([^"]*)"|(.+?))(?=\Z|\n\s*\n|$)',
119
+ flags=re.DOTALL
120
+ )
121
+
122
+ for bullet in reversed(bullets):
123
+ matches = passage_pattern.findall(bullet)
124
+ if matches:
125
+ last_match = matches[-1]
126
+ passage_number = last_match[0]
127
+ quoted_snippet = last_match[2]
128
+ non_quoted_snippet = last_match[3]
129
+ snippet = non_quoted_snippet.strip() if non_quoted_snippet.strip() else quoted_snippet.strip()
130
+ return passage_number, snippet
131
+
132
+ return None, None
133
+
134
+ def extract_all_bullet_passages(answer_text: str):
135
+ reasoning_pattern = r"Step-by-step Reasoning:\s*(.*)"
136
+ reasoning_match = re.search(reasoning_pattern, answer_text, flags=re.DOTALL)
137
+ if not reasoning_match:
138
+ return []
139
+
140
+ reasoning_text = reasoning_match.group(1).strip()
141
+ bullet_pattern = re.compile(r"^(\d+\.\s.*?)(?=^\d+\.\s|\Z)", re.MULTILINE | re.DOTALL)
142
+ bullets = bullet_pattern.findall(reasoning_text)
143
+ if not bullets:
144
+ return []
145
+
146
+ results = []
147
+ for bullet_index, bullet_text in enumerate(bullets, start=1):
148
+ results.append({
149
+ 'bullet_index': bullet_index,
150
+ 'snippet': bullet_text.strip()
151
+ })
152
+ print(results)
153
+ return results
154
+
155
+ def extract_evidence(answer_text: str):
156
+ reasoning_pattern = r"(?i)Evidence\s*(.*)"
157
+ reasoning_match = re.search(reasoning_pattern, answer_text, flags=re.DOTALL)
158
+ if not reasoning_match:
159
+ return []
160
+
161
+ reasoning_text = reasoning_match.group(1).strip()
162
+
163
+ # Extract all bullet segments
164
+ bullet_pattern = re.compile(r"^(\d+\.\s.*?)(?=^\d+\.\s|\Z)", re.MULTILINE | re.DOTALL)
165
+ bullets = bullet_pattern.findall(reasoning_text)
166
+ if not bullets:
167
+ return []
168
+
169
+ # Find the index of the first bullet starting with 1.
170
+ start_index = -1
171
+ for i, bullet in enumerate(bullets):
172
+ if bullet.strip().startswith("1."):
173
+ start_index = i
174
+ break
175
+
176
+ if start_index == -1:
177
+ return [] # No valid starting bullet
178
+
179
+ # Only keep the part starting from the first valid bullet
180
+ bullets = bullets[start_index:]
181
+
182
+ results = []
183
+ for bullet_index, bullet_text in enumerate(bullets, start=1):
184
+ results.append({
185
+ 'bullet_index': bullet_index,
186
+ 'snippet': bullet_text.strip()
187
+ })
188
+ return results
189
+
190
+
191
+ def get_answer_with_retry(model, prompt, retries=3, delay=5):
192
+ """Call the model to get the answer based on the prompt, with retry on failure."""
193
+ for attempt in range(retries):
194
+ try:
195
+ completion = client.chat.completions.create(
196
+ model=model,
197
+ messages=[{'role': 'user', 'content': prompt}]
198
+ )
199
+ return completion.choices[0].message.content.strip()
200
+ except Exception as e:
201
+ print(f"Error on attempt {attempt + 1}: {e}")
202
+ if attempt < retries - 1:
203
+ print(f"Retrying in {delay} seconds...")
204
+ time.sleep(delay)
205
+ else:
206
+ print("Max retries reached, skipping this request.")
207
+ return None
208
+
209
+ @torch.no_grad()
210
+ def qwen_answer_and_ppl(question: str, context: str) -> Tuple[str,float]:
211
+ prompt = f"{context}\n\nQuestion: {question}\nAnswer:"
212
+ inputs = tok_qwen(prompt, return_tensors="pt").to(model_qwen.device)
213
+ gen_ids = model_qwen.generate(**inputs, max_new_tokens=30, eos_token_id=tok_qwen.eos_token_id)
214
+ ans_ids = gen_ids[0][inputs.input_ids.shape[1]:]
215
+ answer = tok_qwen.decode(ans_ids, skip_special_tokens=True).strip()
216
+ # Calculate PPL
217
+ full_ids = torch.cat([inputs.input_ids[0], ans_ids])
218
+ logits = model_qwen(full_ids.unsqueeze(0)).logits[0,:-1]
219
+ tgt = full_ids[1:]
220
+ logp = torch.log_softmax(logits, dim=-1)
221
+ sel = logp[range(len(tgt)), tgt]
222
+ ppl = math.exp(-sel.mean().item())
223
+ return answer, ppl
224
+
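+ # Note on the score above: ppl = exp(-(1/N) * sum_t log p(x_t | x_<t)), averaged over
+ # prompt *and* answer tokens. multi_proposal_pipeline below keeps the rewrite with the
+ # highest ppl, i.e. the context the reader model finds least predictable.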
225
+
226
+
227
+ def extract_json_from_gpt_response(text: str) -> dict | None:
228
+ """
229
+ Finds the first JSON block inside ```json ... ``` or ``` … ``` and returns it as a dict.
230
+ """
231
+ # Try to find a ```json … ``` block first
232
+ m = re.search(r"```json\s*(\{.*?\})\s*```", text, flags=re.DOTALL)
233
+ if not m:
234
+ # Fallback: any ``` … ``` block that looks like JSON
235
+ m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.DOTALL)
236
+ if not m:
237
+ # Lastly, maybe the model just spit raw JSON without fences
238
+ m = re.search(r"(\{.*?\})", text, flags=re.DOTALL)
239
+ if not m:
240
+ return None
241
+
242
+ json_str = m.group(1)
243
+ try:
244
+ return json.loads(json_str)
245
+ except json.JSONDecodeError:
246
+ # clean up trailing commas, etc.
247
+ cleaned = re.sub(r",\s*([\]}])", r"\1", json_str)
248
+ try:
249
+ return json.loads(cleaned)
250
+ except json.JSONDecodeError:
251
+ return None
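+ # Illustrative example:
+ # extract_json_from_gpt_response('```json\n{"answer": "x", "revised": []}\n```')
+ # -> {"answer": "x", "revised": []}; raw JSON without code fences is also accepted.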
252
+ async def multi_proposal_pipeline(
253
+ question: str,
254
+ original_context: str, # directly pass in example['context']
255
+ unique_sents: List[str],
256
+ correct_answer: str,
257
+ rounds: int = 3
258
+ ) -> dict:
259
+ best = {"ppl": 0.0, "context": None, "answer": None}
260
+
261
+ for i in range(rounds):
262
+ # Construct GPT-4o prompt
263
+ numbered = "\n\n".join(f"{j+1}. {s}" for j, s in enumerate(unique_sents))
264
+ prompt = (
265
+ "You are a creative contrarian. Given the question below, and the original answer, first propose a concise alternative answer—that is, a plausible but intentionally misleading answer. "
266
+ "Followed are some sentences supporting the original answer, please rewrite them. When rewriting each sentence, modify only the parts necessary to support the antifact answer. Parts unrelated to the answer must keep their original meaning. Be sure that the modified evidence sentences are sufficient to answer the original question. Output must be strictly in the specified JSON format, with no additional text.\n"
267
+ '{\n'
268
+ ' "answer": "<your antifact answer here, just provide the answer phrase, no need for complete sentence>",\n'
269
+ ' "revised": [\n'
270
+ ' "<rewritten sentence 1>",\n'
271
+ ' "<rewritten sentence 2>",\n'
272
+ ' ...\n'
273
+ ' ]\n'
274
+ '}\n\n'
275
+ f"Question:\n{question}\n\n"
276
+ f"Original answer:\n{correct_answer}\n\n"
277
+ f"Sentences to rewrite:\n{numbered}"
278
+ )
279
+ print(f"[Proposal {i+1}] Prompt: {prompt}")
280
+ rsp = client.chat.completions.create(
281
+ model="gpt-4o", temperature=0.7,
282
+ messages=[{"role":"user","content":prompt}]
283
+ )
284
+ js = extract_json_from_gpt_response(rsp.choices[0].message.content)
285
+ if not js:
286
+ print("[Proposal {i+1}] Failed to parse JSON")
287
+ continue
288
+ revised = js["revised"] # List[str]
289
+ proposed = js["answer"] # Answer given by GPT-4o (optional record)
290
+ new_ctx = original_context
291
+ for old, new in zip(unique_sents, revised):
292
+ new_ctx = new_ctx.replace(old, new)
293
+
294
+ # Use Qwen to calculate answer & PPL
295
+ ans_i, ppl_i = qwen_answer_and_ppl(question, new_ctx)
296
+ print(f"[Proposal {i+1}] PPL = {ppl_i:.2f}")
297
+
298
+ if ppl_i > best["ppl"]:
299
+ best.update({"ppl": ppl_i, "context": new_ctx, "answer": proposed})
300
+
301
+ return best
302
+ def main():
303
+ # Parse command line arguments
304
+ parser = argparse.ArgumentParser(description="LastingBench main pipeline for context rewriting")
305
+ parser.add_argument("--output", "-o", type=str, default="output.jsonl",
306
+ help="Output JSONL file path (default: output.jsonl)")
307
+ parser.add_argument("--dataset_repo", type=str, default="THUDM/LongBench",
308
+ help="Dataset repository name (default: THUDM/LongBench)")
309
+ parser.add_argument("--dataset_subset", type=str, default="multifieldqa_en",
310
+ help="Dataset subset name (default: multifieldqa_en)")
311
+ parser.add_argument("--split", type=str, default="test",
312
+ help="Dataset split (default: test)")
313
+ parser.add_argument("--start_idx", type=int, default=0,
314
+ help="Starting index for processing (default: 0)")
315
+ parser.add_argument("--max_samples", type=int, default=-1,
316
+ help="Maximum number of samples to process (-1 for all, default: -1)")
317
+
318
+ args = parser.parse_args()
319
+
320
+ out_file = args.output
321
+ # Load dataset
322
+ longbench = load_dataset(args.dataset_repo, args.dataset_subset)[args.split]
323
+
324
+ print(f"Output file: {out_file}")
325
+ print(f"Dataset: {args.dataset_repo}/{args.dataset_subset}[{args.split}]")
326
+ print(f"Total samples: {len(longbench)}")
329
+ count = 0
330
+
331
+ # Determine processing range
332
+ start_idx = args.start_idx
333
+ end_idx = len(longbench) if args.max_samples == -1 else min(start_idx + args.max_samples, len(longbench))
334
+
335
+ print(f"Processing samples from index {start_idx} to {end_idx-1}")
336
+
337
+ for idx in range(start_idx, end_idx):
338
+ example = longbench[idx]
339
+ question = example['input']
340
+ print(f"Question: {question}")
341
+ context = example['context']
342
+ correct_answer = example['answers'][0]
343
+
344
+ print(f"Processing example {idx + 1}:")
345
+ print(f"Correct Answer: {correct_answer}")
346
+
347
+ # Build prompts
348
+ prompt_with_context = build_prompt(context, question)
349
+
350
+
351
+ # Get answers using transformers pipelines
352
+
353
+
354
+ answer_with_context = get_answer_with_retry('deepseek-r1', prompt_with_context)
355
+ # Extract content after "Answer:" from answer_with_context
356
+ answer_with_context_simple = (
357
+ answer_with_context
358
+ .split("Answer:", 1)[-1] # First keep the part after Answer:
359
+ .split("Step-by-step Reasoning", 1)[0] # Then cut before Step-by-step Reasoning
360
+ .strip()
361
+ )
362
+ print(f"Answer with context: {answer_with_context_simple}")
363
+ result = judge_answer_with_api(question, correct_answer, answer_with_context_simple)
364
+ print(f"Answer judge result: {result}")
365
+ if not result:
366
+ continue
367
+
368
+
369
+ answer_with_context = remove_think_tags(answer_with_context or "")
370
+
371
+ evidence = extract_all_bullet_passages(answer_with_context)
372
+
373
+ page_contents = []
374
+ if evidence:
375
+ count += 1
376
+ for ev in evidence:
377
+ snippet = ev['snippet']
378
+ result = retriveDoc(context, snippet)
379
+ # result["context"] is a set of Document objects
380
+ page_contents += [doc.page_content for doc in result]
381
+
382
+ unique_page_contents = list(dict.fromkeys(page_contents))
383
+ aggregated_content = "\n".join(unique_page_contents)
384
+ prompt_final = (
385
+ f"Please answer the question based on the context.\nContext: {aggregated_content}.\n Question: {question}.\n"
386
+ f"Please only provide your answer. "
387
+ f"Your Answer:"
388
+ )
389
+ final_answer = get_transformers_answer(prompt_final, tokenizer1, model1)
390
+ if judge_answer_with_api(question, correct_answer, final_answer):
391
+ print("correct")
392
+ else:
393
+ print("incorrect")
394
+ result_query = retriveDoc(context, question)
395
+ page_contents += [doc.page_content for doc in result_query]
396
+ unique_page_contents = list(dict.fromkeys(page_contents))
397
+ best = asyncio.run(
398
+ multi_proposal_pipeline(
399
+ question,
400
+ context,
401
+ unique_page_contents,
402
+ correct_answer
403
+ )
404
+ )
405
+ record = {
406
+ "question": question,
407
+ "answer": best["answer"],
408
+ "context": best["context"]
409
+ }
410
+
411
+ # Append one line of JSON each loop
412
+ with open(out_file, "a", encoding="utf-8") as fout:
413
+ fout.write(json.dumps(record, ensure_ascii=False) + "\n")
414
+
415
+
416
+
417
+
418
+
419
+
420
+
421
+ if __name__ == "__main__":
422
+ main()
423
+
424
+
425
+
426
+
427
+
random_alternative_answer.py ADDED
@@ -0,0 +1,418 @@
1
+ import json
2
+ import time
3
+ import re
4
+ import os
5
+ import argparse
6
+ from datasets import load_dataset
7
+ from nltk.tokenize import sent_tokenize
8
+ from utils.util import retriveDoc,compute_best_sentence_f1
9
+ from openai import OpenAI
10
+ import asyncio, json, torch, math
11
+ from typing import List, Tuple
12
+ # Hugging Face transformers related
13
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
14
+ from utils.metrics import qa_f1_score
15
+ from utils.llmjudge import judge_answer_with_api
16
+
17
+
18
+ client = OpenAI(
19
+ base_url=os.environ.get("OPENAI_BASE_URL"),
20
+ api_key=os.environ.get("OPENAI_API_KEY")
21
+ )
22
+ # Load models using transformers
23
+
24
+ tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen-14B-Chat", trust_remote_code=True)
25
+ model1 = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-14B-Chat", trust_remote_code=True,device_map="cuda:0",torch_dtype=torch.bfloat16)
26
+
27
+
28
+ tok_qwen = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", trust_remote_code=True)
29
+ model_qwen = AutoModelForCausalLM.from_pretrained(
30
+ "Qwen/Qwen2.5-7B-Instruct", trust_remote_code=True,
31
+ device_map="cuda:1",torch_dtype=torch.bfloat16
32
+ ).eval()
33
+
34
+ def get_transformers_answer(prompt, tokenizer, model, max_new_tokens=100, temperature=0.7, top_p=0.9, retries=3, delay=5):
35
+ """
36
+ Use transformers model.generate method for inference with retry mechanism,
37
+ use chat template to format input, and strip the input prompt part through token-level slicing,
38
+ return the newly generated text.
39
+ """
41
+ for attempt in range(retries):
42
+ try:
43
+ # Convert original prompt to message format
44
+ messages = [{"role": "user", "content": prompt}]
45
+
46
+ # Try to use chat template to format input
47
+ try:
48
+ formatted_prompt = tokenizer.apply_chat_template(
49
+ messages,
50
+ tokenize=False,
51
+ add_generation_prompt=True
52
+ )
53
+ except Exception as e:
54
+ print(f"Unable to apply chat template: {e}, falling back to basic text input")
55
+ formatted_prompt = prompt # Fall back to original prompt as input
56
+
57
+ # Encode formatted prompt as model input tensor
58
+ model_inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
59
+
60
+ # Call generate, the generated id sequence contains both prompt and subsequent generated text
61
+ generated_ids = model.generate(
62
+ **model_inputs,
63
+ max_new_tokens=max_new_tokens,
64
+ temperature=temperature,
65
+ top_p=top_p
66
+ )
67
+
68
+ # Calculate the token count corresponding to the prompt
69
+ input_length = model_inputs.input_ids.shape[1]
70
+
71
+ # Strip the prompt part from the front of the output, keeping only the newly added part
72
+ output_ids = generated_ids[0][input_length:]
73
+
74
+ # Decode generated text
75
+ answer = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
76
+ return answer
77
+ except Exception as e:
78
+ print(f"Error on attempt {attempt + 1}: {e}")
79
+ if attempt < retries - 1:
80
+ print(f"Retrying in {delay} seconds...")
81
+ time.sleep(delay)
82
+ else:
83
+ print("Max retries reached, skipping this request.")
84
+ return None
85
+
86
+ def truncate_answer(answer):
87
+ """Truncate answer, only take the part before the first period"""
88
+ return answer.split('.')[0].strip() if answer else "No answer"
89
+
90
+ def write_to_log(filename, data):
91
+ """Write data to log file"""
92
+ with open(filename, 'a', encoding='utf-8') as file:
93
+ file.write(data + '\n')
94
+
95
+ def remove_think_tags(text: str) -> str:
96
+ """Remove all <think> ... </think> blocks"""
97
+ return re.sub(r'<think>(.*?)</think>', '', text, flags=re.DOTALL).strip()
98
+
99
+ def build_prompt(context: str, question: str) -> str:
100
+ prompt = (
101
+ f"Answer the question based on the given passages. The following are the passages:\n"
102
+ f"{context}\n"
103
+ f"Answer the question based on the given passages.\n"
104
+ f"Question: {question}.\n"
105
+ f"Answer:\n"
106
+ f"Please first provide your answer in the format of Answer:[Your answer]. Then provide your reasoning process step-by-step.(Only include explicit clues) "
107
+ f"At the end of each reasoning step, include a new line that specifies the key information or reference content used in that step. "
108
+ f"Please ensure that the [reference content] you include is the complete original sentence or consecutive sentences from the text. Please do not change the punctuation. Do not use ellipses inside the sentence. "
109
+ f"Follow this format:\n"
110
+ f"Answer: [Your answer]\n"
111
+ f"Step-by-step Reasoning:\n"
112
+ f"1. [Reasoning step 1]\n"
113
+ f"[replaced by your reference content]\n"
114
+ f"2. [Reasoning step 2]\n"
115
+ f"[replaced by your reference content]\n"
116
+ )
117
+ return prompt
118
+
119
+ def extract_final_bullet_passage(answer_text: str):
120
+ reasoning_pattern = r"Step-by-step Reasoning:\s*(.*)"
121
+ reasoning_match = re.search(reasoning_pattern, answer_text, flags=re.DOTALL)
122
+ if not reasoning_match:
123
+ return None, None
124
+
125
+ reasoning_text = reasoning_match.group(1).strip()
126
+ bullet_pattern = r"(?m)^(\d+\.\s.*?)(?=(?:\n\d+\.\s)|\Z)"
127
+ bullets = re.findall(bullet_pattern, reasoning_text, flags=re.DOTALL)
128
+ if not bullets:
129
+ print("No bullet blocks found.")
130
+ return None, None
131
+
132
+ passage_pattern = re.compile(
133
+ r'(?i)(?:\*\*)?passage\s+(\d+)(?:\*\*)?\s*:\s*("([^"]*)"|(.+?))(?=\Z|\n\s*\n|$)',
134
+ flags=re.DOTALL
135
+ )
136
+
137
+ for bullet in reversed(bullets):
138
+ matches = passage_pattern.findall(bullet)
139
+ if matches:
140
+ last_match = matches[-1]
141
+ passage_number = last_match[0]
142
+ quoted_snippet = last_match[2]
143
+ non_quoted_snippet = last_match[3]
144
+ snippet = non_quoted_snippet.strip() if non_quoted_snippet.strip() else quoted_snippet.strip()
145
+ return passage_number, snippet
146
+
147
+ return None, None
148
+
149
+ def extract_all_bullet_passages(answer_text: str):
150
+ reasoning_pattern = r"Step-by-step Reasoning:\s*(.*)"
151
+ reasoning_match = re.search(reasoning_pattern, answer_text, flags=re.DOTALL)
152
+ if not reasoning_match:
153
+ return []
154
+
155
+ reasoning_text = reasoning_match.group(1).strip()
156
+ bullet_pattern = re.compile(r"^(\d+\.\s.*?)(?=^\d+\.\s|\Z)", re.MULTILINE | re.DOTALL)
157
+ bullets = bullet_pattern.findall(reasoning_text)
158
+ if not bullets:
159
+ return []
160
+
161
+ results = []
162
+ for bullet_index, bullet_text in enumerate(bullets, start=1):
163
+ results.append({
164
+ 'bullet_index': bullet_index,
165
+ 'snippet': bullet_text.strip()
166
+ })
167
+ print(results)
168
+ return results
169
+
170
+ def extract_evidence(answer_text: str):
171
+ reasoning_pattern = r"(?i)Evidence\s*(.*)"
172
+ reasoning_match = re.search(reasoning_pattern, answer_text, flags=re.DOTALL)
173
+ if not reasoning_match:
174
+ return []
175
+
176
+ reasoning_text = reasoning_match.group(1).strip()
177
+
178
+ # Extract all bullet segments
179
+ bullet_pattern = re.compile(r"^(\d+\.\s.*?)(?=^\d+\.\s|\Z)", re.MULTILINE | re.DOTALL)
180
+ bullets = bullet_pattern.findall(reasoning_text)
181
+ if not bullets:
182
+ return []
183
+
184
+ # Find the index of the first bullet starting with 1.
185
+ start_index = -1
186
+ for i, bullet in enumerate(bullets):
187
+ if bullet.strip().startswith("1."):
188
+ start_index = i
189
+ break
190
+
191
+ if start_index == -1:
192
+ return [] # No valid starting bullet
193
+
194
+ # Only keep the part starting from the first valid bullet
195
+ bullets = bullets[start_index:]
196
+
197
+ results = []
198
+ for bullet_index, bullet_text in enumerate(bullets, start=1):
199
+ results.append({
200
+ 'bullet_index': bullet_index,
201
+ 'snippet': bullet_text.strip()
202
+ })
203
+ return results
204
+
205
+
206
+ def get_answer_with_retry(model, prompt, retries=3, delay=5):
207
+ """Call the model to get the answer based on the prompt, with retry on failure."""
208
+ for attempt in range(retries):
209
+ try:
210
+ completion = client.chat.completions.create(
211
+ model=model,
212
+ messages=[{'role': 'user', 'content': prompt}]
213
+ )
214
+ return completion.choices[0].message.content.strip()
215
+ except Exception as e:
216
+ print(f"Error on attempt {attempt + 1}: {e}")
217
+ if attempt < retries - 1:
218
+ print(f"Retrying in {delay} seconds...")
219
+ time.sleep(delay)
220
+ else:
221
+ print("Max retries reached, skipping this request.")
222
+ return None
223
+
224
+ def extract_json_from_gpt_response(text: str) -> dict | None:
225
+ """
226
+ Finds the first JSON block inside ```json ... ``` or ``` … ``` and returns it as a dict.
227
+ """
228
+ # Try to find a ```json … ``` block first
229
+ m = re.search(r"```json\s*(\{.*?\})\s*```", text, flags=re.DOTALL)
230
+ if not m:
231
+ # Fallback: any ``` … ``` block that looks like JSON
232
+ m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.DOTALL)
233
+ if not m:
234
+ # Lastly, maybe the model just spit raw JSON without fences
235
+ m = re.search(r"(\{.*?\})", text, flags=re.DOTALL)
236
+ if not m:
237
+ return None
238
+
239
+ json_str = m.group(1)
240
+ try:
241
+ return json.loads(json_str)
242
+ except json.JSONDecodeError:
243
+ # clean up trailing commas, etc.
244
+ cleaned = re.sub(r",\s*([\]}])", r"\1", json_str)
245
+ try:
246
+ return json.loads(cleaned)
247
+ except json.JSONDecodeError:
248
+ return None
249
+
250
+ async def random_alternative_answer(
251
+ question: str,
252
+ original_context: str,
253
+ unique_sents: List[str],
254
+ correct_answer: str
255
+ ) -> dict:
256
+ """Generate random alternative answer and modified evidence"""
257
+
258
+ # Construct GPT-4o prompt
259
+ numbered = "\n\n".join(f"{j+1}. {s}" for j, s in enumerate(unique_sents))
260
+ prompt = (
261
+ "You are a creative assistant. Given the question below and the original answer, propose a plausible alternative answer that is **different** from the original but still reasonable. "
262
+ "Then rewrite the provided sentences to support your alternative answer. When rewriting each sentence, modify only the parts necessary to support the alternative answer. "
263
+ "Parts unrelated to the answer must keep their original meaning. Be sure that the modified evidence sentences are sufficient to answer the original question. "
264
+ "Output must be strictly in the specified JSON format, with no additional text.\n"
265
+ '{\n'
266
+ ' "answer": "<your alternative answer here, just provide the answer phrase, no need for complete sentence>",\n'
267
+ ' "revised": [\n'
268
+ ' "<rewritten sentence 1>",\n'
269
+ ' "<rewritten sentence 2>",\n'
270
+ ' ...\n'
271
+ ' ]\n'
272
+ '}\n\n'
273
+ f"Question:\n{question}\n\n"
274
+ f"Original answer:\n{correct_answer}\n\n"
275
+ f"Sentences to rewrite:\n{numbered}"
276
+ )
277
+
278
+ print(f"[Alternative Answer] Generating prompt: {prompt}")
279
+
280
+ rsp = client.chat.completions.create(
281
+ model="gpt-4o", temperature=0.7,
282
+ messages=[{"role":"user","content":prompt}]
283
+ )
284
+
285
+ js = extract_json_from_gpt_response(rsp.choices[0].message.content)
286
+ if not js:
287
+ print("[Alternative Answer] Failed to parse JSON")
288
+ return {"context": original_context, "answer": "Failed to generate alternative"}
289
+
290
+ revised = js["revised"] # List[str]
291
+ alternative = js["answer"] # Alternative answer
292
+
293
+ # Create new context
294
+ new_ctx = original_context
295
+ for old, new in zip(unique_sents, revised):
296
+ new_ctx = new_ctx.replace(old, new)
297
+
298
+ return {"context": new_ctx, "answer": alternative}
299
+
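+ # Unlike multi_proposal_pipeline in main_gpu.py, this variant makes a single proposal
+ # and does no perplexity-based selection; the first parseable rewrite is used as-is.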
300
+ def main():
301
+ # Parse command line arguments
302
+ parser = argparse.ArgumentParser(description="LastingBench random alternative answer generation")
303
+ parser.add_argument("--output", "-o", type=str, default="output_random.jsonl",
304
+ help="Output JSONL file path (default: output_random.jsonl)")
305
+ parser.add_argument("--dataset_repo", type=str, default="THUDM/LongBench",
306
+ help="Dataset repository name (default: THUDM/LongBench)")
307
+ parser.add_argument("--dataset_subset", type=str, default="hotpotqa",
308
+ help="Dataset subset name (default: hotpotqa)")
309
+ parser.add_argument("--split", type=str, default="test",
310
+ help="Dataset split (default: test)")
311
+ parser.add_argument("--start_idx", type=int, default=0,
312
+ help="Starting index for processing (default: 0)")
313
+ parser.add_argument("--max_samples", type=int, default=-1,
314
+ help="Maximum number of samples to process (-1 for all, default: -1)")
315
+
316
+ args = parser.parse_args()
317
+
318
+ out_file = args.output
319
+ # Load dataset
320
+ longbench = load_dataset(args.dataset_repo, args.dataset_subset)[args.split]
321
+
322
+ print(f"Output file: {out_file}")
323
+ print(f"Dataset: {args.dataset_repo}/{args.dataset_subset}[{args.split}]")
324
+ print(f"Total samples: {len(longbench)}")
325
+
326
+ count = 0
327
+
328
+ # Determine processing range
329
+ start_idx = args.start_idx
330
+ end_idx = len(longbench) if args.max_samples == -1 else min(start_idx + args.max_samples, len(longbench))
331
+
332
+ print(f"Processing samples from index {start_idx} to {end_idx-1}")
333
+
334
+ for idx in range(start_idx, end_idx):
335
+ example = longbench[idx]
336
+ question = example['input']
337
+ print(f"Question: {question}")
338
+ context = example['context']
339
+ correct_answer = example['answers'][0]
340
+
341
+ print(f"Processing example {idx + 1}:")
342
+ print(f"Correct Answer: {correct_answer}")
343
+
344
+ # Build prompts
345
+ prompt_with_context = build_prompt(context, question)
346
+
347
+ # Get answers using transformers pipelines
348
+ answer_with_context = get_answer_with_retry('deepseek-r1', prompt_with_context)
349
+
350
+ # Extract content after "Answer:" from answer_with_context
351
+ answer_with_context_simple = (
352
+ answer_with_context
353
+ .split("Answer:", 1)[-1] # First keep the part after Answer:
354
+ .split("Step-by-step Reasoning", 1)[0] # Then cut before Step-by-step Reasoning
355
+ .strip()
356
+ )
357
+
358
+ print(f"Answer with context: {answer_with_context_simple}")
359
+ result = judge_answer_with_api(question, correct_answer, answer_with_context_simple)
360
+ print(f"Answer judge result: {result}")
361
+
362
+ if not result:
363
+ continue
364
+
365
+ answer_with_context = remove_think_tags(answer_with_context or "")
366
+ evidence = extract_all_bullet_passages(answer_with_context)
367
+
368
+ page_contents = []
369
+ if evidence:
370
+ count += 1
371
+ for ev in evidence:
372
+ snippet = ev['snippet']
373
+ result = retriveDoc(context, snippet)
374
+ # result["context"] is a set of Document objects
375
+ page_contents += [doc.page_content for doc in result]
376
+
377
+ unique_page_contents = list(dict.fromkeys(page_contents))
378
+ aggregated_content = "\n".join(unique_page_contents)
379
+
380
+ prompt_final = (
381
+ f"Please answer the question based on the context.\nContext: {aggregated_content}.\n Question: {question}.\n"
382
+ f"Please only provide your answer. "
383
+ f"Your Answer:"
384
+ )
385
+
386
+ final_answer = get_transformers_answer(prompt_final, tokenizer1, model1)
387
+
388
+ if judge_answer_with_api(question, correct_answer, final_answer):
389
+ print("correct")
390
+ else:
391
+ print("incorrect")
392
+ result_query = retriveDoc(context, question)
393
+ page_contents += [doc.page_content for doc in result_query]
394
+
395
+ unique_page_contents = list(dict.fromkeys(page_contents))
396
+
397
+ # Generate random alternative answer instead of selecting the highest ppl answer
398
+ alternative = asyncio.run(
399
+ random_alternative_answer(
400
+ question,
401
+ context,
402
+ unique_page_contents,
403
+ correct_answer
404
+ )
405
+ )
406
+
407
+ record = {
408
+ "question": question,
409
+ "answer": alternative["answer"],
410
+ "context": alternative["context"]
411
+ }
412
+
413
+ # Append one line of JSON each loop
414
+ with open(out_file, "a", encoding="utf-8") as fout:
415
+ fout.write(json.dumps(record, ensure_ascii=False) + "\n")
416
+
417
+ if __name__ == "__main__":
418
+ main()
requirements.txt ADDED
@@ -0,0 +1,38 @@
1
+ # Core ML and NLP libraries
2
+ torch>=2.6.0
3
+ transformers>=4.51.0
4
+ datasets>=2.10.0
5
+ tokenizers>=0.13.0
6
+
7
+ # Haystack for document processing and retrieval
8
+ haystack-ai>=2.0.0
9
+
10
+ # OpenAI API client
11
+ openai==1.84.0
12
+
13
+ # Data processing and analysis
14
+ pandas>=1.5.0
15
+ numpy>=1.24.0
16
+
17
+ # Natural language processing
18
+ nltk>=3.8.0
19
+ jieba
20
+ fuzzywuzzy
21
+ rouge
22
+ rank_bm25
23
+ langchain_text_splitters
24
+ langchain_community
25
+ langchain_openai
26
+
27
+ # Visualization
28
+ matplotlib>=3.6.0
29
+
30
+ # Async processing
31
+ asyncio-throttle>=1.0.0
32
+
33
+ # Optional: Additional ML utilities
34
+ scikit-learn>=1.2.0
35
+ tqdm>=4.64.0
36
+
37
+
38
+
training_result/training_loss_antifact_llama.csv ADDED
@@ -0,0 +1,41 @@
1
+ Step,Loss,Epoch
2
+ 1,1.8891,1
3
+ 2,1.9053,1
4
+ 3,1.8574,1
5
+ 4,1.9114,1
6
+ 5,1.8351,1
7
+ 6,1.8981,2
8
+ 7,1.9048,2
9
+ 8,1.8649,2
10
+ 9,1.877,2
11
+ 10,1.8449,2
12
+ 11,1.8986,3
13
+ 12,1.8799,3
14
+ 13,1.8378,3
15
+ 14,1.9004,3
16
+ 15,1.8675,3
17
+ 16,1.8812,4
18
+ 17,1.8635,4
19
+ 18,1.8977,4
20
+ 19,1.8393,4
21
+ 20,1.9017,4
22
+ 21,1.8482,5
23
+ 22,1.8353,5
24
+ 23,1.8514,5
25
+ 24,1.9189,5
26
+ 25,1.8596,5
27
+ 26,1.8672,6
28
+ 27,1.8421,6
29
+ 28,1.848,6
30
+ 29,1.8762,6
31
+ 30,1.8964,6
32
+ 31,1.8663,7
33
+ 32,1.8491,7
34
+ 33,1.8637,7
35
+ 34,1.8403,7
36
+ 35,1.8842,7
37
+ 36,1.827,8
38
+ 37,1.8486,8
39
+ 38,1.8671,8
40
+ 39,1.8921,8
41
+ 40,1.7564,8
training_result/training_loss_antifact_qwen38.csv ADDED
@@ -0,0 +1,41 @@
1
+ Step,Loss,Epoch
2
+ 1,2.3492,1
3
+ 2,2.3786,1
4
+ 3,2.314,1
5
+ 4,2.3676,1
6
+ 5,2.2953,1
7
+ 6,2.3573,2
8
+ 7,2.3483,2
9
+ 8,2.3144,2
10
+ 9,2.321,2
11
+ 10,2.301,2
12
+ 11,2.326,3
13
+ 12,2.2982,3
14
+ 13,2.2573,3
15
+ 14,2.2941,3
16
+ 15,2.2627,3
17
+ 16,2.2579,4
18
+ 17,2.2519,4
19
+ 18,2.2641,4
20
+ 19,2.2128,4
21
+ 20,2.2421,4
22
+ 21,2.1929,5
23
+ 22,2.1757,5
24
+ 23,2.1914,5
25
+ 24,2.2761,5
26
+ 25,2.2079,5
27
+ 26,2.1893,6
28
+ 27,2.1754,6
29
+ 28,2.174,6
30
+ 29,2.2038,6
31
+ 30,2.2008,6
32
+ 31,2.1921,7
33
+ 32,2.1533,7
34
+ 33,2.1783,7
35
+ 34,2.1534,7
36
+ 35,2.2158,7
37
+ 36,2.1322,8
38
+ 37,2.1752,8
39
+ 38,2.18,8
40
+ 39,2.1953,8
41
+ 40,2.0122,8
training_result/training_loss_llama.csv ADDED
@@ -0,0 +1,41 @@
1
+ Step,Loss,Epoch
2
+ 1,1.858,1
3
+ 2,1.8676,1
4
+ 3,1.8243,1
5
+ 4,1.8815,1
6
+ 5,1.7409,1
7
+ 6,1.8664,2
8
+ 7,1.8736,2
9
+ 8,1.8304,2
10
+ 9,1.8358,2
11
+ 10,1.8119,2
12
+ 11,1.8637,3
13
+ 12,1.8456,3
14
+ 13,1.7986,3
15
+ 14,1.8643,3
16
+ 15,1.8602,3
17
+ 16,1.8451,4
18
+ 17,1.8329,4
19
+ 18,1.8589,4
20
+ 19,1.8064,4
21
+ 20,1.8571,4
22
+ 21,1.8139,5
23
+ 22,1.8059,5
24
+ 23,1.8097,5
25
+ 24,1.886,5
26
+ 25,1.8094,5
27
+ 26,1.8318,6
28
+ 27,1.8085,6
29
+ 28,1.8128,6
30
+ 29,1.842,6
31
+ 30,1.8477,6
32
+ 31,1.8348,7
33
+ 32,1.8133,7
34
+ 33,1.8263,7
35
+ 34,1.8028,7
36
+ 35,1.8589,7
37
+ 36,1.7959,8
38
+ 37,1.8054,8
39
+ 38,1.8296,8
40
+ 39,1.8629,8
41
+ 40,1.7241,8
training_result/training_loss_phi4.csv ADDED
@@ -0,0 +1,41 @@
1
+ Step,Loss,Epoch
2
+ 1,1.8667,1
3
+ 2,1.8693,1
4
+ 3,1.8411,1
5
+ 4,1.9009,1
6
+ 5,1.7747,1
7
+ 6,1.8776,2
8
+ 7,1.8859,2
9
+ 8,1.8414,2
10
+ 9,1.8464,2
11
+ 10,1.8524,2
12
+ 11,1.8784,3
13
+ 12,1.8608,3
14
+ 13,1.8113,3
15
+ 14,1.8754,3
16
+ 15,1.8512,3
17
+ 16,1.8578,4
18
+ 17,1.8542,4
19
+ 18,1.8738,4
20
+ 19,1.8203,4
21
+ 20,1.781,4
22
+ 21,1.8297,5
23
+ 22,1.811,5
24
+ 23,1.8162,5
25
+ 24,1.9074,5
26
+ 25,1.8363,5
27
+ 26,1.8388,6
28
+ 27,1.8351,6
29
+ 28,1.8299,6
30
+ 29,1.8478,6
31
+ 30,1.8644,6
32
+ 31,1.8573,7
33
+ 32,1.8156,7
34
+ 33,1.8426,7
35
+ 34,1.824,7
36
+ 35,1.8796,7
37
+ 36,1.8153,8
38
+ 37,1.8269,8
39
+ 38,1.8404,8
40
+ 39,1.8772,8
41
+ 40,1.7365,8
training_result/training_loss_phi4_antifact.csv ADDED
@@ -0,0 +1,41 @@
1
+ Step,Loss,Epoch
2
+ 1,1.8977,1
3
+ 2,1.9077,1
4
+ 3,1.8738,1
5
+ 4,1.9326,1
6
+ 5,1.8628,1
7
+ 6,1.9098,2
8
+ 7,1.9175,2
9
+ 8,1.876,2
10
+ 9,1.8882,2
11
+ 10,1.8819,2
12
+ 11,1.9128,3
13
+ 12,1.8947,3
14
+ 13,1.851,3
15
+ 14,1.9117,3
16
+ 15,1.8586,3
17
+ 16,1.8941,4
18
+ 17,1.8842,4
19
+ 18,1.9115,4
20
+ 19,1.8528,4
21
+ 20,1.8236,4
22
+ 21,1.8639,5
23
+ 22,1.8396,5
24
+ 23,1.8569,5
25
+ 24,1.9398,5
26
+ 25,1.8856,5
27
+ 26,1.8731,6
28
+ 27,1.8678,6
29
+ 28,1.8652,6
30
+ 29,1.8808,6
31
+ 30,1.914,6
32
+ 31,1.8884,7
33
+ 32,1.851,7
34
+ 33,1.879,7
35
+ 34,1.8604,7
36
+ 35,1.9046,7
37
+ 36,1.8455,8
38
+ 37,1.8689,8
39
+ 38,1.8771,8
40
+ 39,1.9062,8
41
+ 40,1.77,8
training_result/training_loss_qwen38.csv ADDED
@@ -0,0 +1,41 @@
1
+ Step,Loss,Epoch
2
+ 1,2.3167,1
3
+ 2,2.3344,1
4
+ 3,2.2796,1
5
+ 4,2.3293,1
6
+ 5,2.1972,1
7
+ 6,2.3196,2
8
+ 7,2.3094,2
9
+ 8,2.278,2
10
+ 9,2.2759,2
11
+ 10,2.272,2
12
+ 11,2.2851,3
13
+ 12,2.2595,3
14
+ 13,2.2142,3
15
+ 14,2.2586,3
16
+ 15,2.2535,3
17
+ 16,2.2193,4
18
+ 17,2.2181,4
19
+ 18,2.2252,4
20
+ 19,2.1788,4
21
+ 20,2.1908,4
22
+ 21,2.1574,5
23
+ 22,2.1469,5
24
+ 23,2.1484,5
25
+ 24,2.2405,5
26
+ 25,2.1602,5
27
+ 26,2.1534,6
28
+ 27,2.1435,6
29
+ 28,2.1369,6
30
+ 29,2.1685,6
31
+ 30,2.1502,6
32
+ 31,2.1597,7
33
+ 32,2.1169,7
34
+ 33,2.1408,7
35
+ 34,2.1159,7
36
+ 35,2.1897,7
37
+ 36,2.1013,8
38
+ 37,2.1304,8
39
+ 38,2.1441,8
40
+ 39,2.1645,8
41
+ 40,1.9816,8
utils/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .metrics import *
2
+
utils/convert.py ADDED
@@ -0,0 +1,52 @@
1
+ import json
2
+ import argparse
3
+
4
+ def main():
5
+ parser = argparse.ArgumentParser(description='Convert and merge JSONL files with question-answer mappings')
6
+ parser.add_argument('orig_path', help='Path to the original JSONL file')
7
+ parser.add_argument('out_path', help='Path to the output JSONL file')
8
+ parser.add_argument('mapping_paths', nargs='+', help='Path(s) to mapping JSONL file(s)')
9
+
10
+ args = parser.parse_args()
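+ # Example invocation (illustrative paths; positional args are orig, out, then one or more mappings):
+ #   python utils/convert.py data/hotpotqa.jsonl data/hotpotqa_merged.jsonl rewrites.jsonl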
11
+
12
+ # Original data file paths from command line arguments
13
+ orig_path = args.orig_path
14
+ out_path = args.out_path
15
+ mapping_paths = args.mapping_paths
16
+
17
+ # Step 1: Build question -> {context, answers} mapping
18
+ mapping = {}
19
+ for mp in mapping_paths:
20
+ with open(mp, 'r', encoding='utf-8') as f_map:
21
+ for line in f_map:
22
+ obj = json.loads(line)
23
+ q = obj.get("question")
24
+ if q is None:
25
+ continue
26
+ # Ensure we get the context
27
+ ctx = obj.get("context", "")
28
+ # Some files have "answer" field, some have "answers"
29
+ raw_ans = obj.get("answers", obj.get("answer", []))
30
+ # Normalize answer(s) to list format
31
+ if isinstance(raw_ans, list):
32
+ ans = raw_ans
33
+ else:
34
+ ans = [raw_ans]
35
+ # If the same question appears in multiple mapping files, later ones will overwrite earlier ones
36
+ mapping[q] = {"context": ctx, "answers": ans}
37
+
38
+ # Step 2: Read original file, perform replacement and write output
39
+ with open(orig_path, 'r', encoding='utf-8') as f_in, \
40
+ open(out_path, 'w', encoding='utf-8') as f_out:
41
+ for line in f_in:
42
+ item = json.loads(line)
43
+ inp = item.get("input")
44
+ if inp in mapping:
45
+ item["context"] = mapping[inp]["context"]
46
+ item["answers"] = mapping[inp]["answers"]
47
+ f_out.write(json.dumps(item, ensure_ascii=False) + "\n")
48
+
49
+ print(f"Merge completed, output file: {out_path}")
50
+
51
+ if __name__ == "__main__":
52
+ main()
utils/draw.py ADDED
@@ -0,0 +1,82 @@
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import matplotlib as mpl
4
+ import argparse
5
+
6
+
7
+ mpl.rcParams['font.family'] = 'serif'
8
+ mpl.rcParams['font.serif'] = ['Georgia']
9
+ mpl.rcParams['font.size'] = 20
10
+ mpl.rcParams['axes.titlesize']= 20
11
+ mpl.rcParams['axes.labelsize']= 18
12
+ mpl.rcParams['xtick.labelsize']=16
13
+ mpl.rcParams['ytick.labelsize']=16
14
+ mpl.rcParams['legend.fontsize'] = 16  # legend labels the two datasets
15
+
16
+ def plot_two_loss_curves(
17
+ csv_file1,
18
+ csv_file2,
19
+ title="Loss Comparison on Qwen3-8B",
20
+ dataset1_name="Dataset1",
21
+ dataset2_name="Dataset2"
22
+ ):
23
+ # Read CSV files
24
+ df1 = pd.read_csv(csv_file1)
25
+ df2 = pd.read_csv(csv_file2)
26
+
27
+ # Check columns
28
+ for df, path in ((df1, csv_file1), (df2, csv_file2)):
29
+ if 'Step' not in df.columns or 'Loss' not in df.columns:
30
+ raise ValueError(f"Missing 'Step' or 'Loss' columns in {path}")
31
+
32
+ # Create figure
33
+ plt.figure(figsize=(12, 8))
34
+
35
+ # Plot two lines with softer colors
36
+ plt.plot(df1['Step'], df1['Loss'], label=dataset1_name,
37
+ color='#1f77b4', linewidth=2.5) # steel blue
38
+ plt.plot(df2['Step'], df2['Loss'], label=dataset2_name,
39
+ color='#2ca02c', linewidth=2.5) # medium sea green
+ plt.legend()  # use the dataset name arguments (previously accepted but ignored)
40
+
41
+ # Title and labels
42
+ plt.title(title, fontweight='bold')
43
+ plt.xlabel('Steps', fontweight='bold')
44
+ plt.ylabel('Loss', fontweight='bold')
45
+
46
+ # Grid
47
+ plt.grid(True, linestyle='--', alpha=0.7)
48
+
49
+ # Layout
50
+ plt.tight_layout(pad=3.0)
51
+
52
+ # Save
53
+ plt.savefig('loss_comparison_qwen38b.svg', format='svg')
54
+ plt.savefig('loss_comparison.png', dpi=300)
55
+
56
+ # Display
57
+ plt.show()
58
+
59
+ print("Saved: loss_comparison.svg, loss_comparison.png")
60
+
61
+
62
+ def main():
63
+ parser = argparse.ArgumentParser(description='Plot comparison of two training loss curves')
64
+ parser.add_argument('csv_file1', help='Path to the first CSV file')
65
+ parser.add_argument('csv_file2', help='Path to the second CSV file')
66
+ parser.add_argument('--title', default='Training Loss Comparison', help='Title for the plot')
67
+ parser.add_argument('--dataset1-name', default='Original Dataset', help='Name for the first dataset')
68
+ parser.add_argument('--dataset2-name', default='Revised Dataset', help='Name for the second dataset')
69
+
70
+ args = parser.parse_args()
71
+
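+ # Example invocation with the CSVs shipped in training_result/:
+ #   python utils/draw.py training_result/training_loss_qwen38.csv \
+ #       training_result/training_loss_antifact_qwen38.csv --title "Loss Comparison on Qwen3-8B"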
72
+ plot_two_loss_curves(
73
+ args.csv_file1,
74
+ args.csv_file2,
75
+ title=args.title,
76
+ dataset1_name=args.dataset1_name,
77
+ dataset2_name=args.dataset2_name
78
+ )
79
+
80
+
81
+ if __name__ == "__main__":
82
+ main()
utils/llmjudge.py ADDED
@@ -0,0 +1,49 @@
1
+
2
+
3
+ import logging
4
+ import os
5
+
6
+ def judge_answer_with_api(question, target, answer):
7
+ from openai import OpenAI
8
+
9
+ client = OpenAI(
10
+ base_url=os.environ.get("OPENAI_BASE_URL"),
11
+ api_key=os.environ.get("OPENAI_API_KEY")
12
+ )
13
+
14
+ prompt = (
15
+ "You will be given a question, a target answer (maybe a list of all possible answers), "
16
+ "and a generated answer. Please judge whether the generated answer is correct. "
17
+ "If it is correct, return 'True'. If it is incorrect, return 'False'.\n"
18
+ f"Question: {question}\n"
19
+ f"Target Answer: {target}\n"
20
+ f"Generated Answer: {answer}\n"
21
+ "Please return only 'True' or 'False', without any other text."
22
+ )
23
+
24
+ try:
25
+ response = client.chat.completions.create(
26
+ model="gpt-4o-mini-2024-07-18",
27
+ messages=[{"role": "user", "content": prompt}],
28
+ temperature=0,
29
+ max_tokens=1
30
+ )
31
+ except Exception as e:
32
+ logging.error("API call failed: %s", str(e))
33
+ return 0 # Return default value
34
+
35
+ try:
36
+ result = response.choices[0].message.content.strip()
37
+ except (AttributeError, IndexError) as e:
38
+ logging.error("Error parsing API response: %s", str(e))
39
+ return 0
40
+
41
+ if result == "True":
42
+ return 1
43
+ elif result == "False":
44
+ return 0
45
+ else:
46
+ logging.warning("Abnormal response format: %s", result)
47
+ return 0
48
+
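+ # Illustrative usage: judge_answer_with_api("Who wrote Hamlet?", "William Shakespeare",
+ # "Shakespeare") should return 1; any reply other than the literal "True" maps to 0.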
49
+
utils/metrics.py ADDED
@@ -0,0 +1,152 @@
1
+ import re
2
+ import string
3
+
4
+ import jieba
5
+ from fuzzywuzzy import fuzz
6
+ import difflib
7
+
8
+ from typing import List
9
+ from collections import Counter
10
+ from rouge import Rouge
11
+
12
+ def normalize_answer(s):
13
+ """Lower text and remove punctuation, articles and extra whitespace."""
14
+
15
+ def remove_articles(text):
16
+ return re.sub(r"\b(a|an|the)\b", " ", text)
17
+
18
+ def white_space_fix(text):
19
+ return " ".join(text.split())
20
+
21
+ def remove_punc(text):
22
+ exclude = set(string.punctuation)
23
+ return "".join(ch for ch in text if ch not in exclude)
24
+
25
+ def lower(text):
26
+ return text.lower()
27
+
28
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
29
+
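+ # e.g. normalize_answer("The  Eiffel Tower!") -> "eiffel tower"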
30
+
31
+ def normalize_zh_answer(s):
32
+ """Lower text and remove punctuation, extra whitespace."""
33
+
34
+ def white_space_fix(text):
35
+ return "".join(text.split())
36
+
37
+ def remove_punc(text):
38
+ cn_punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
39
+ all_punctuation = set(string.punctuation + cn_punctuation)
40
+ return "".join(ch for ch in text if ch not in all_punctuation)
41
+
42
+ def lower(text):
43
+ return text.lower()
44
+
45
+ return white_space_fix(remove_punc(lower(s)))
46
+
47
+ def count_score(prediction, ground_truth, **kwargs):
48
+ numbers = re.findall(r"\d+", prediction)
49
+ right_num = 0
50
+ for number in numbers:
51
+ if str(number) == str(ground_truth):
52
+ right_num += 1
53
+ final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
54
+ return float(final_score)
55
+
56
+ def retrieval_score(prediction, ground_truth, **kwargs):
57
+ pattern = r'Paragraph (\d+)'
58
+ matches = re.findall(pattern, ground_truth)
59
+ ground_truth_id = matches[0]
60
+ numbers = re.findall(r"\d+", prediction)
61
+ right_num = 0
62
+ for number in numbers:
63
+ if str(number) == str(ground_truth_id):
64
+ right_num += 1
65
+ final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
66
+ return float(final_score)
67
+
68
+ def retrieval_zh_score(prediction, ground_truth, **kwargs):
69
+ pattern = r'段落(\d+)'
70
+ matches = re.findall(pattern, ground_truth)
71
+ ground_truth_id = matches[0]
72
+ numbers = re.findall(r"\d+", prediction)
73
+ right_num = 0
74
+ for number in numbers:
75
+ if str(number) == str(ground_truth_id):
76
+ right_num += 1
77
+ final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
78
+ return float(final_score)
79
+
80
+ def code_sim_score(prediction, ground_truth, **kwargs):
81
+ all_lines = prediction.lstrip('\n').split('\n')
82
+ prediction = ""
83
+ for line in all_lines:
84
+ if ('`' not in line) and ('#' not in line) and ('//' not in line):
85
+ prediction = line
86
+ break
87
+ return (fuzz.ratio(prediction, ground_truth) / 100)
88
+
89
+ def classification_score(prediction, ground_truth, **kwargs):
90
+ em_match_list = []
91
+ all_classes = kwargs["all_classes"]
92
+ for class_name in all_classes:
93
+ if class_name in prediction:
94
+ em_match_list.append(class_name)
95
+ for match_term in em_match_list:
96
+ if match_term in ground_truth and match_term != ground_truth:
97
+ em_match_list.remove(match_term)
98
+ if ground_truth in em_match_list:
99
+ score = (1.0 / len(em_match_list))
100
+ else:
101
+ score = 0.0
102
+ return score
103
+
104
+ def rouge_score(prediction, ground_truth, **kwargs):
105
+ rouge = Rouge()
106
+ try:
107
+ scores = rouge.get_scores([prediction], [ground_truth], avg=True)
108
+ except:
109
+ return 0.0
110
+ return scores["rouge-l"]["f"]
111
+
112
+ def rouge_zh_score(prediction, ground_truth, **kwargs):
113
+ prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
114
+ ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
115
+ score = rouge_score(prediction, ground_truth)
116
+ return score
117
+
118
+ def f1_score(prediction, ground_truth, **kwargs):
119
+ common = Counter(prediction) & Counter(ground_truth)
120
+ num_same = sum(common.values())
121
+ if num_same == 0:
122
+ return 0
123
+ precision = 1.0 * num_same / len(prediction)
124
+ recall = 1.0 * num_same / len(ground_truth)
125
+ f1 = (2 * precision * recall) / (precision + recall)
126
+ return f1
127
+
128
+ def qa_f1_score(prediction, ground_truth, **kwargs):
129
+ normalized_prediction = normalize_answer(prediction)
130
+ normalized_ground_truth = normalize_answer(ground_truth)
131
+
132
+ prediction_tokens = normalized_prediction.split()
133
+ ground_truth_tokens = normalized_ground_truth.split()
134
+ return f1_score(prediction_tokens, ground_truth_tokens)
135
+
136
+
137
+ def qa_f1_zh_score(prediction, ground_truth, **kwargs):
138
+ prediction_tokens = list(jieba.cut(prediction, cut_all=False))
139
+ ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
140
+ prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
141
+ ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
142
+ prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
143
+ ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
144
+ return f1_score(prediction_tokens, ground_truth_tokens)
145
+
146
+ def qa_em_score(prediction, ground_truth, **kwargs):
147
+ normalized_prediction = normalize_answer(prediction)
148
+ normalized_ground_truth = normalize_answer(ground_truth)
149
+ return 1 if (normalized_prediction in normalized_ground_truth or normalized_ground_truth in normalized_prediction) else 0
150
+
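+ # Note: this "EM" is a lenient containment check rather than strict exact match,
+ # e.g. qa_em_score("Paris, France", "Paris") == 1 because one normalized string
+ # contains the other.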
151
+
152
+
utils/util.py ADDED
@@ -0,0 +1,104 @@
1
+ import re
2
+ import nltk
3
+ nltk.download('punkt_tab')
4
+ from nltk.tokenize import sent_tokenize, word_tokenize
5
+ from rank_bm25 import BM25Okapi
6
+ from langchain_text_splitters import NLTKTextSplitter
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain_openai import OpenAIEmbeddings
9
+ from collections import Counter
10
+
11
+ def replace_case_insensitive(text: str, old: str, new: str) -> str:
12
+ pattern = re.compile(re.escape(old), re.IGNORECASE)
13
+
14
+ return pattern.sub(new, text)
15
+ def get_word_list(s1):
16
+ # Separate sentences by word, Chinese by word, English by word, numbers by space
17
+ regEx = re.compile(r'[\W]')
18
+ res = re.compile(r"([\u4e00-\u9fa5])") # [\u4e00-\u9fa5] for Chinese
19
+
20
+ p1 = regEx.split(s1.lower())
21
+ str1_list = []
22
+ for tok in p1:
23
+ if res.split(tok) is None:
24
+ str1_list.append(tok)
25
+ else:
26
+ ret = res.split(tok)
27
+ for ch in ret:
28
+ str1_list.append(ch)
29
+
30
+ list_word1 = [w for w in str1_list if len(w.strip()) > 0]
31
+
32
+ return list_word1
33
+ def get_word_len(s1):
34
+ return len(get_word_list(s1))
35
+
36
+ regex = r'([。?!;\n.!?;]\s*)'
37
+ def retriveDoc(text,query,top_k=3):
38
+ import os
39
+ sentences = sent_tokenize(text)
40
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-small", base_url=os.environ.get("OPENAI_BASE_URL"),
41
+ api_key=os.environ.get("OPENAI_API_KEY"))
42
+ # Create vector database through FAISS (built from sentence list)
43
+ vector_store = FAISS.from_texts(sentences, embeddings)
44
+
45
+ retrieved_docs = vector_store.similarity_search(query, k=top_k)
46
+ print("Retrieved sentences:", retrieved_docs)
47
+
48
+ # Return results, can adjust the return structure as needed, here returns a dictionary containing context
49
+ return retrieved_docs
50
+
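+ # Note: a fresh FAISS index is built on every call, so each retriveDoc invocation
+ # re-embeds all sentences of the passage through the OpenAI embedding API.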
51
+
52
+ def most_similar_sentence_bm25(paragraph, target_sentence):
53
+ """
54
+ Use BM25 algorithm to find the most similar sentence to target_sentence in the given paragraph,
55
+ return the most similar sentence.
56
+ """
57
+ # 1. First split the paragraph into a list of sentences
58
+ sentences = sent_tokenize(paragraph)
59
+
60
+ # 2. Tokenize each sentence
61
+ tokenized_sentences = [word_tokenize(sent) for sent in sentences]
62
+
63
+ # 3. Create a retrieval instance using BM25Okapi
64
+ bm25 = BM25Okapi(tokenized_sentences)
65
+
66
+ # 4. Tokenize the target sentence
67
+ target_tokens = word_tokenize(target_sentence)
68
+
69
+ # 5. Use BM25 to calculate similarity scores for each sentence
70
+ scores = bm25.get_scores(target_tokens)
71
+ # scores.shape == (len(sentences),)
72
+
73
+ # 6. Find the index of the sentence with the highest score
74
+ max_idx = scores.argmax()
75
+
76
+ # Return the most similar sentence and its score
77
+ return sentences[max_idx]
78
+
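+ # Illustrative usage:
+ # most_similar_sentence_bm25("Paris is in France. Berlin is in Germany.",
+ #                            "Which country is Berlin in?") should return "Berlin is in Germany."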
79
+
80
+ def f1_score_text(pred, gold):
81
+ pred_tokens = word_tokenize(pred)
82
+ gold_tokens = word_tokenize(gold)
83
+ common = Counter(pred_tokens) & Counter(gold_tokens)
84
+ num_same = sum(common.values())
85
+ if num_same == 0:
86
+ return 0.0
87
+ precision = num_same / len(pred_tokens)
88
+ recall = num_same / len(gold_tokens)
89
+ f1 = 2 * precision * recall / (precision + recall)
90
+ return f1
91
+
92
+ def compute_best_sentence_f1(pred_text, gold_text):
93
+ pred_sentences = sent_tokenize(pred_text)
94
+ gold_sentences = sent_tokenize(gold_text)
95
+ f1_scores = []
96
+ for pred in pred_sentences:
97
+ best_f1 = 0.0
98
+ for gold in gold_sentences:
99
+ f1 = f1_score_text(pred, gold)
100
+ if f1 > best_f1:
101
+ best_f1 = f1
102
+ f1_scores.append(best_f1)
103
+ avg_f1 = sum(f1_scores) / len(pred_sentences) if pred_sentences else 0.0
104
+ return avg_f1
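+
+ # Illustrative example: compute_best_sentence_f1("Paris is big.", "Paris is big. It rains.")
+ # matches each predicted sentence to its best-scoring gold sentence, here yielding 1.0.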