Upload 5 files
Browse files- .gitattributes +1 -0
- grpo.sh +43 -0
- plugin.py +166 -0
- prompt.txt +45 -0
- test_am.jsonl +0 -0
- train_am.jsonl +3 -0
.gitattributes
CHANGED
|
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
ckpt/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
ckpt/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
train_am.jsonl filter=lfs diff=lfs merge=lfs -text
|
grpo.sh
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# GRPO training launcher: Qwen2.5-VL-7B-Instruct on the maze-navigation
# dataset, using the custom reward functions registered in plugin.py.

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# SECURITY FIX: the WANDB API key was hard-coded here (a leaked credential).
# Require it from the caller's environment instead and fail fast if unset.
: "${WANDB_API_KEY:?set WANDB_API_KEY in the environment before running}"
export WANDB_API_KEY
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=29500
export SWIFT_DISTRIBUTED_BACKEND=nccl
export GLOO_SOCKET_IFNAME=lo
export MAX_PIXELS=65536            # cap vision input size for the VL model
export NPROC_PER_NODE=8
export OMP_NUM_THREADS=1
export WANDB_BASE_URL=https://api.bandw.top

# Reward names must match the orms[...] registrations in plugin.py.
# FIX: added the missing space before the line continuation after "format".
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-VL-7B-Instruct \
    --reward_funcs external_r1v_acc external_r1v_format format \
    --reward_weights 1 0.1 0.1 \
    --torch_dtype bfloat16 \
    --dataset train_am.jsonl \
    --external_plugins plugin.py \
    --max_completion_length 2048 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1e-6 \
    --gradient_accumulation_steps 16 \
    --max_steps 100000 \
    --eval_steps 100 \
    --save_steps 100 \
    --save_total_limit 2 \
    --logging_steps 5 \
    --max_length 8192 \
    --output_dir GRPO_MAZE \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 4 \
    --dataset_num_proc 4 \
    --num_generations 8 \
    --temperature 1. \
    --repetition_penalty 1.1 \
    --system 'prompt.txt' \
    --deepspeed zero3 \
    --log_completions false \
    --train_type full \
    --report_to wandb
|
plugin.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import torch
|
| 3 |
+
torch.cuda.empty_cache()
|
| 4 |
+
from typing import List
|
| 5 |
+
from copy import deepcopy
|
| 6 |
+
|
| 7 |
+
from swift.plugin import ORM, orms
|
| 8 |
+
from swift.utils import get_logger
|
| 9 |
+
|
| 10 |
+
logger = get_logger()
|
| 11 |
+
"""
|
| 12 |
+
Step 1: Define a Reward Class
|
| 13 |
+
Implement your custom reward calculation logic within the __call__ method.
|
| 14 |
+
The method accepts the model's output completions and dataset columns (passed as kwargs) as input parameters.
|
| 15 |
+
|
| 16 |
+
Step 2: Register the Reward Class in orms
|
| 17 |
+
For example:
|
| 18 |
+
python orms['external_math_acc'] = MathAccuracy
|
| 19 |
+
|
| 20 |
+
Step 3: Configure the Arguments
|
| 21 |
+
Use the following arguments when running the script:
|
| 22 |
+
bash --plugin /path/to/plugin.py --reward_funcs external_math_acc
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def count_xml(text) -> float:
    """Score the presence of exactly one <think> ... </think> tag pair.

    Args:
        text: Model response text.

    Returns:
        0.0, 0.5, or 1.0 — 0.5 for each tag that occurs exactly once.
    """
    score = 0.0
    for tag in ("<think>", "</think>"):
        if text.count(tag) == 1:
            score += 0.5
    return score
|
| 41 |
+
|
| 42 |
+
def extract_xml_answer(text: str) -> str:
    """Extract the answer portion that follows the closing </think> tag.

    Args:
        text: Response text expected to contain a </think> tag.

    Returns:
        The stripped text after the first </think>, or "" when the tag
        is absent.
    """
    try:
        return text.split("</think>")[1].strip()
    except IndexError:
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt
        # and masked real bugs; only the missing-tag case can raise here.
        return ""
|
| 57 |
+
|
| 58 |
+
def xmlcount_reward_func(completions, **kwargs) -> List[float]:
    """Reward proper <think>/</think> tag usage in each completion.

    Args:
        completions: List of completion strings.  (Completions arrive as
            plain strings here, not chat-message dicts.)

    Returns:
        One score per completion, as computed by count_xml.
    """
    return [count_xml(text) for text in completions]
|
| 71 |
+
|
| 72 |
+
def int_reward_func(completions, **kwargs) -> List[float]:
    """Reward responses whose final answer is purely direction tokens.

    A completion earns 1.0 when, after whitespace removal, the text
    extracted after </think> is a non-empty concatenation of
    <|up|>/<|down|>/<|left|>/<|right|> tokens and nothing else; 0.0 otherwise.

    Args:
        completions: List of completion strings.

    Returns:
        List of 0.0 / 1.0 scores.
    """
    allowed_tokens = {"<|up|>", "<|down|>", "<|right|>", "<|left|>"}
    token_pattern = re.compile(r'<\|(?:up|down|right|left)\|>')

    def _is_pure_token_sequence(answer: str) -> bool:
        compact = re.sub(r'\s+', '', answer)
        if not compact:
            return False
        tokens = token_pattern.findall(compact)
        # Every character of the answer must belong to a direction token.
        if ''.join(tokens) != compact:
            return False
        return all(tok in allowed_tokens for tok in tokens)

    answers = [extract_xml_answer(text) for text in completions]
    return [1.0 if _is_pure_token_sequence(ans) else 0.0 for ans in answers]
|
| 100 |
+
|
| 101 |
+
def count_turns(steps):
    """Parse a token string into moves and count direction changes.

    Args:
        steps: String of <|direction|> tokens, e.g. "<|up|><|left|>".

    Returns:
        Tuple (moves, turns): the list of direction names in order, and
        the number of positions where the direction differs from the
        previous move.
    """
    moves = re.findall(r"<\|(.*?)\|>", steps)
    turns = 0
    for prev, cur in zip(moves, moves[1:]):
        if cur != prev:
            turns += 1
    return moves, turns
|
| 105 |
+
|
| 106 |
+
def correctness_reward_func(completions, answer, **kwargs) -> List[float]:
    """Reward correctness of predicted move sequences against ground truth.

    Exact match earns ``len(moves) * 2 * (turns + 1)``.  Otherwise the
    reward is based on the longest common move prefix with the answer:
    ``k * 1 * (prefix_turns + 1)`` where k is the prefix length.

    Args:
        completions: Model completion strings.
        answer: Ground-truth answer strings (one per completion).

    Returns:
        List of reward scores.
    """
    rewards = []
    responses = completions
    extracted_responses = [extract_xml_answer(r) for r in responses]
    logger.debug('-' * 20)
    logger.debug(f"\nAnswer:\n{answer[0]}")
    logger.debug(f"\nResponse:\n{responses[0]}")
    logger.debug(f"\nExtracted:\n{extracted_responses[0]}")
    for r, a in zip(extracted_responses, answer):
        r_steps, r_turns = count_turns(r)
        a_steps, a_turns = count_turns(a)
        if r == a:
            reward = len(r_steps) * 2 * (r_turns + 1)
        else:
            # Length of the common move prefix.
            k = 0
            for r_s, a_s in zip(r_steps, a_steps):
                if r_s != a_s:
                    break
                k += 1
            prefix = r_steps[:k]
            # FIX: `prefix` holds bare move names ("up", "down"), so the old
            # count_turns("".join(prefix)) produced a string with no <|...|>
            # markers and always reported 0 turns, silently disabling the
            # prefix-turn bonus.  Count direction changes directly instead.
            turns = sum(
                1 for i in range(1, len(prefix)) if prefix[i] != prefix[i - 1]
            )
            reward = k * 1 * (turns + 1)
        rewards.append(reward)
    return rewards
|
| 143 |
+
|
| 144 |
+
class MazeReward(ORM):
    """Outcome reward: correctness of the predicted maze path."""

    def __call__(self, completions, solution, **kwargs) -> List[float]:
        # Delegate to the module-level correctness reward.
        return correctness_reward_func(completions, solution)
|
| 150 |
+
|
| 151 |
+
class MazeFormat(ORM):
    """Format reward: answer must be a pure direction-token sequence."""

    def __call__(self, completions, solution, **kwargs) -> List[float]:
        # `solution` is accepted for interface parity but unused here.
        return int_reward_func(completions)
|
| 157 |
+
|
| 158 |
+
class Format(ORM):
    """Format reward: presence of a single <think>...</think> tag pair."""

    def __call__(self, completions, **kwargs) -> List[float]:
        return xmlcount_reward_func(completions)
|
| 163 |
+
|
| 164 |
+
# Register the reward classes under the names referenced by the launcher's
# --reward_funcs arguments (external_r1v_acc, external_r1v_format, format).
orms['external_r1v_acc'] = MazeReward
orms['external_r1v_format'] = MazeFormat
orms['format'] = Format
|
prompt.txt
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are a navigation assistant to solve visual pathfinding tasks.
|
| 2 |
+
Your goal is to **infer a valid path** from a visually marked starting point (green cell labeled 'O') to a visually marked target (red cell labeled 'T') by analyzing the maze image.
|
| 3 |
+
|
| 4 |
+
*Rules*:
|
| 5 |
+
- The maze is composed of open paths and impassable black walls.
|
| 6 |
+
- Movement is only allowed through open paths, not through walls.
|
| 7 |
+
- You can move one step at a time in the four cardinal directions: <|up|>, <|down|>, <|left|>, <|right|>.
|
| 8 |
+
|
| 9 |
+
*Output Format*:
|
| 10 |
+
Think through each step inside <think> and </think> tags.
|
| 11 |
+
At each step:
|
| 12 |
+
1. Describe your current position based on visual layout and structure (e.g., "in a corridor", "facing a wall", "at a crossroad", "turning a corner").
|
| 13 |
+
2. Decide the next move, and explain your reasoning
|
| 14 |
+
3. Move and continue the path.
|
| 15 |
+
After your full reasoning, output only the final movement sequence using the allowed tokens: <|up|><|down|><|left|><|right|>
|
| 16 |
+
|
| 17 |
+
*Example Output 1*:
|
| 18 |
+
<think>
|
| 19 |
+
Step 1: I am at the green starting point 'O'. There is a wall below, but open paths to the left and right. Therefore, I move <|right|>.
|
| 20 |
+
Step 2: I am in a straight horizontal corridor. The path continues to the right. Therefore, I move <|right|>.
|
| 21 |
+
Step 3: I reach a corner where the path turns upward. The path is open upward. Therefore, I move <|up|>.
|
| 22 |
+
Step 4: I am in a vertical passage continuing upward. The path is open upward. Therefore, I move <|up|>.
|
| 23 |
+
Step 5: I am adjacent to the red target cell 'T' on my left. The left path is open. Therefore, I move <|left|> to reach the goal.
|
| 24 |
+
</think>
|
| 25 |
+
<|right|><|right|><|up|><|up|><|left|>
|
| 26 |
+
|
| 27 |
+
*Example Output 2*:
|
| 28 |
+
<think>
|
| 29 |
+
Step 1: I am at the green starting point 'O'. The path is open below. Therefore, I move <|down|>.
|
| 30 |
+
Step 2: I am in a dead end. The surrounding structure is closed on three sides, only the path above is open. Therefore, I move <|up|> to backtrack.
|
| 31 |
+
Step 3: I am back at the starting point. Now I try the left path. The structure is open to the left. Therefore, I move <|left|>.
|
| 32 |
+
Step 4: I am in a horizontal corridor. The left side remains open. Therefore, I move <|left|>.
|
| 33 |
+
Step 5: I can now see the red target directly ahead. Therefore, I move <|left|> to reach it.
|
| 34 |
+
</think>
|
| 35 |
+
<|down|><|up|><|left|><|left|><|left|>
|
| 36 |
+
|
| 37 |
+
*Example Output 3*:
|
| 38 |
+
<think>
|
| 39 |
+
Step 1: I am at the green starting point 'O'. The surrounding structure is open only to the left. Therefore, I move <|left|>.
|
| 40 |
+
Step 2: I am at the bottom of a vertical passage. The structure is open above. Therefore, I move <|up|>.
|
| 41 |
+
Step 3: I reach a horizontal corridor. The left is open. Therefore, I move <|left|>.
|
| 42 |
+
Step 4: I enter a vertical junction with an upward path. Therefore, I move <|up|>.
|
| 43 |
+
Step 5: I see an opening to the right toward the red goal. Therefore, I move <|right|>.
|
| 44 |
+
</think>
|
| 45 |
+
<|left|><|up|><|left|><|up|><|right|>
|
test_am.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
train_am.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37153a03df4af0f603266f21b227a9383fc9782a738ca1fb23234db0d625dd52
|
| 3 |
+
size 22634746
|