zeju-0727
/

dyver_code_zeju

Model card Files Files and versions

xet

Community

zeju-0727 commited on Feb 14, 2025

Commit

03f925f

verified ·

1 Parent(s): bc901fd

Upload omegaprm.py

Browse files

Files changed (1) hide show

omegaprm.py +787 -0

omegaprm.py ADDED Viewed

	@@ -0,0 +1,787 @@

+import heapq
+import math
+import random
+import re
+import json
+from typing import List, Tuple, Dict, Any, Optional
+import itertools
+from transformers import AutoTokenizer
+import asyncio  # New import added for async handling
+from openai import AsyncOpenAI   # Using AsyncOpenAI as client
+import numpy as np
+# Helper function to separate reasoning steps
+def separate_steps(steps: List[str], mode: str = 'join') -> Any:
+    delimiter = "\n\n"
+    if mode == 'join':
+        if not isinstance(steps, list):
+            raise TypeError("For 'join' mode, 'steps' must be a list of strings.")
+        return delimiter.join(steps)
+    elif mode == 'split':
+        if not isinstance(steps, str):
+            raise TypeError("For 'split' mode, 'steps' must be a string.")
+        return steps.split(delimiter)
+    else:
+        raise ValueError("Mode should be either 'join' or 'split'.")
+# def judge_ans(
+#         problem_str: str,
+#         extracted_groundtruth: str,
+#         output_list: List[str],
+#         v_list: List[float],
+#         aggration_mode: str,
+#         extract_answer_fn,
+#         judge_correct_fn,
+#         normalize=False,
+#     ):
+#         ans_list = [extract_answer_fn(txt) for txt in output_list]
+#         valid_ans_list, valid_v_list = [], []
+#         for i, ans in enumerate(ans_list):
+#             if ans != INVALID_ANS:
+#                 valid_ans_list.append(ans)
+#                 valid_v_list.append(v_list[i])
+#         if len(valid_ans_list) == 0:
+#             return 0
+#         if "orm" in aggration_mode and normalize:
+#             # score_normalization: this is only necessary for [-1, 1] values
+#             valid_v_list = np.array(valid_v_list)
+#             valid_v_list -= valid_v_list.min()
+#             valid_v_list /= valid_v_list.max() + 1e-3
+#             valid_v_list = valid_v_list.tolist()
+#         aggregated_ans = AGG_FN_MAP[aggration_mode](valid_ans_list, valid_v_list)
+#         return (
+#             1 if judge_correct_fn(problem_str, extracted_groundtruth, aggregated_ans) else 0
+#         )
+# Helper function to check correctness of a generated response
+def check_correctness(generated_response: str, expected_answer: str) -> bool:
+    # sentences = re.split(
+    #     r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', generated_response.strip()
+    # )
+    # last_sentence = sentences[-1] if sentences else ''
+    # return expected_answer.strip() in last_sentence.strip()
+    extract_answer_fn = MATH.extract_answer
+    judge_correct_fn = MATH.judge_correct
+    answer = extract_answer_fn(generated_response)
+    return (
+            1 if judge_correct_fn("", expected_answer, answer) else 0
+        )
+class ProcessRewardModel:
+    """
+    ProcessRewardModel encapsulates the reward inference process.
+    It utilizes a chat-based reward model (e.g., 'Llama3.1-8B-PRM-Mistral-Data')
+    to evaluate a sequence of reasoning steps. It iteratively sends messages to the model,
+    checking that each step receives a positive judgement (i.e., its completion starts with '+').
+    """
+    def __init__(self, client, model="Llama3.1-8B-PRM-Mistral-Data", temperature=0.0, max_tokens=1):
+        """
+        Initialize the ProcessRewardModel.
+        Parameters:
+            client: The chat client instance that provides a `chat.completions.create` method.
+            model (str): The model name to be used for generating reward completions.
+            temperature (float): Sampling temperature. Default is 0.0 for deterministic outcomes.
+            max_tokens (int): Maximum tokens to generate in the reward inference. Default is 1.
+        """
+        self.client = client
+        self.model = model
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+    def evaluate(self, problem: str, steps: list, output_type: str = 'bool') -> bool:
+        """
+        Synchronously evaluate the process reward using asynchronous API calls.
+        This method wraps the asynchronous _async_evaluate call.
+        Parameters:
+            problem (str): The problem or question statement.
+            steps (List[str]): A list of reasoning steps.
+        Returns:
+            bool: True if all steps are positively judged, False otherwise.
+        """
+        return asyncio.run(self._async_evaluate(problem, steps, output_type))
+    async def _async_evaluate(self, problem: str, steps: list, output_type: str = 'bool') -> bool:
+        messages = []
+        # Merge every 5 steps into 1 step to reduce evaluation time
+        if 'deepseek' in self.model.lower():
+            merged_steps = []
+            current_merge = []
+            for step in steps:
+                current_merge.append(step)
+                if len(current_merge) == 6:
+                    merged_steps.append("\n\n".join(current_merge))
+                    current_merge = []
+            if current_merge:  # Add any remaining steps
+                merged_steps.append("\n\n".join(current_merge))
+        steps = merged_steps
+        for sdx, step in enumerate(steps):
+            if sdx == 0:
+                messages.append({
+                    'role': 'user',
+                    'content': f"{problem}\n\n{step}"
+                })
+            else:
+                messages.append({
+                    'role': 'user',
+                    'content': step
+                })
+            completion = await self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                n=1,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+            )
+            response = completion.choices[0].message.content.strip().lower()
+            if not response.startswith('+'):
+                if output_type == 'bool':
+                    return False
+                else:
+                    return [1.0 if i < sdx else 0.0 for i in range(len(steps))]
+            messages.append({'role': 'assistant', 'content': '+'})
+        if output_type == 'bool':
+            return True
+        else:
+            return [1.0] * len(steps)
+    async def _async_evaluate_system2(self, problem: str, steps: list, output_type: str = 'bool') -> bool:
+        messages = []
+        # Merge every 5 steps into 1 step to reduce evaluation time
+        if 'deepseek' in self.model.lower():
+            merged_steps = []
+            current_merge = []
+            for step in steps:
+                current_merge.append(step)
+                if len(current_merge) == 6:
+                    merged_steps.append("\n\n".join(current_merge))
+                    current_merge = []
+            if current_merge:  # Add any remaining steps
+                merged_steps.append("\n\n".join(current_merge))
+        steps = merged_steps
+        for sdx, step in enumerate(steps):
+            if sdx == 0:
+                messages.append({
+                    'role': 'user',
+                    'content': f"Problem: {problem}\n\nStep: {step}\n\nIs this step correct? You must answer with '+' for correct or '-' for incorrect in the end of your response."
+                })
+            else:
+                messages.append({
+                    'role': 'user',
+                    'content': f"Step: {step}\n\nIs this step correct? You must answer with '+' for correct or -' for incorrect in the end of your response."
+                })
+            completion = await self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                n=1,
+                temperature=self.temperature,
+                max_tokens=8192,
+            )
+            response = completion.choices[0].message.content
+            # print("DyVer Verification:", response)
+            # New negative checking logic
+            content = response.strip().lower()
+            last_words = ' '.join(content.split()[-3:])  # Last 3 words
+            judgment = any(
+                '+' in part and '-' not in part
+                for part in (
+                    content[-5:],
+                    last_words,
+                )
+            )
+            if not judgment:
+                return [1.0 if i < sdx else 0.0 for i in range(len(steps))]
+            messages.append({'role': 'assistant', 'content': '<think>\n\n</think> +'})
+        return [1.0] * len(steps)
+class LanguageModel:
+    def __init__(self, client, model_name="/root/.cache/modelscope/hub/Qwen/Qwen2___5-Math-7B-Instruct",
+                 max_new_tokens=512, temperature=0.7, top_p=0.9):
+        """
+         Initialize the LanguageModel for async OpenAI calls.
+         Removed the LLMService dependency and using async calls via openai.
+         Parameters:
+         - client: An instance of AsyncOpenAI passed externally.
+         - model_name (str): API model name to use.
+         - max_new_tokens (int): Maximum tokens for generation.
+         - temperature (float): Sampling temperature.
+         - top_p (float): Nucleus sampling probability.
+         """
+        self.model_name = model_name
+        self.max_new_tokens = max_new_tokens
+        self.temperature = temperature
+        self.top_p = top_p
+        self.default_prompt = (
+            "Please complete the answer for the question based on the given steps without generating existing steps again, "
+            "and separate your following steps using \n\n.\n\n"
+        )
+        # Retain tokenizer for chat template operations elsewhere.
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(f"/root/{model_name}")
+        except Exception as e:
+            print(f"Error loading tokenizer: {e}")
+            self.tokenizer = None
+        # Use the external AsyncOpenAI client.
+        self.async_client = client
+    async def generate_rollout(self, state_prefix: str, num_copies: int) -> List[str]:
+        """
+        Asynchronously generate responses using OpenAI's ChatCompletion API.
+        Parameters:
+        - state_prefix (str): The current solution prefix.
+        - num_copies (int): The number of response copies to generate.
+        Returns:
+        - List[str]: A list of generated responses.
+        """
+        response = await self.async_client.completions.create(
+            model=self.model_name,
+            prompt=state_prefix,
+            max_tokens=self.max_new_tokens,
+            temperature=self.temperature,
+            top_p=self.top_p,
+            n=num_copies,
+        )
+        return [choice.text for choice in response.choices]
+    def update_prompt(self, new_prompt: str):
+        """
+        Update the default prompt if necessary.
+        Parameters:
+        - new_prompt (str): The new prompt template.
+        """
+        self.default_prompt = new_prompt
+    def evaluate_correctness(self, response: str, expected_answer: str) -> bool:
+        """
+        Check if the generated solution matches the expected answer.
+        Parameters:
+        - response (str): The complete generated response.
+        - expected_answer (str): The expected answer to compare with.
+        Returns:
+        - bool: True if the expected answer is in the final part of the solution.
+        """
+        return check_correctness(response, expected_answer)
+# Define the State class
+class State:
+    def __init__(self, solution_prefix: str, parent: Optional['State'] = None):
+        self.solution_prefix = solution_prefix  # Solution prefix as a single string
+        self.parent = parent  # Reference to the parent state
+        self.N = 0  # Visit count (number of times selected)
+        self.total_rollouts = 0  # Total number of rollouts generated from this state
+        self.correct_rollouts = 0  # Number of correct rollouts
+        self.MC: Optional[float] = None  # Monte Carlo estimation (c/k)
+        self.Q: Dict[str, float] = {}  # Q(s, r): estimated value for each rollout
+        self.R: List[str] = []  # Set of all rollouts from this state
+        self.incorrect_rollouts: List[str] = []  # List of incorrect rollouts
+        self.children: List['State'] = []  # List of child states
+    def add_rollout(self, rollout: str):
+        self.R.append(rollout)
+    def add_incorrect_rollout(self, rollout: str):
+        if rollout not in self.incorrect_rollouts:
+            self.incorrect_rollouts.append(rollout)
+    def get_full_solution(self) -> str:
+        # Return the complete solution from the root to this state
+        if self.parent:
+            return self.parent.get_full_solution() + '\n\n' + self.solution_prefix
+        else:
+            return self.solution_prefix
+    def get_new_text(self) -> str:
+        """
+        Return the new text added at this node compared to the parent.
+        """
+        if self.parent:
+            parent_text = self.parent.solution_prefix
+            new_text = self.solution_prefix[len(parent_text):].strip()
+            return new_text
+        else:
+            # Root node (the question)
+            return self.solution_prefix.strip()
+    def get_text_with_labels(self) -> Dict[str, Any]:
+        """
+        Return a nested dictionary where each node contains:
+        - 'text': The new text at this node.
+        - 'mc_value': The MC value at this node.
+        - 'children': A list of child nodes with the same structure.
+        """
+        data = {
+            'text': self.get_new_text(),
+            'mc_value': self.MC,
+            'children': [child.get_text_with_labels() for child in self.children]
+        }
+        return data
+# Define the Search Tree class
+class SearchTree:
+    def __init__(self):
+        self.root: Optional[State] = None
+        self.nodes: List[State] = []  # List of all states
+    def add_state(self, state: State):
+        self.nodes.append(state)
+# Define the Candidate Pool as a priority queue with update capability
+class CandidatePool:
+    def __init__(self):
+        self.heap: List[Tuple[float, int]] = []  # Heap of (-priority, unique_id)
+        self.entry_finder: Dict[int, Tuple[float, int]] = {}  # Maps unique_id to (-priority, unique_id)
+        self.counter = itertools.count()  # Unique sequence count
+        self.id_to_rollout: Dict[int, Tuple[State, str]] = {}  # Maps unique_id to (state, rollout)
+        self.latest_id_per_rollout: Dict[Tuple[int, str], int] = {}  # Maps (state_id, rollout) to unique_id
+    def add_or_update(self, state: State, rollout: str, priority: float):
+        """
+        Add a new rollout or update the priority of an existing rollout.
+        Parameters:
+        - state (State): The state associated with the rollout.
+        - rollout (str): The rollout string.
+        - priority (float): The new priority score.
+        """
+        state_id = id(state)  # Unique identifier for the state object
+        rollout_key = (state_id, rollout)
+        # Check if the rollout already exists in the pool
+        if rollout_key in self.latest_id_per_rollout:
+            # Previous unique_id exists; it is now outdated
+            old_unique_id = self.latest_id_per_rollout[rollout_key]
+            # Mark the old entry as invalid by removing it from entry_finder
+            if old_unique_id in self.entry_finder:
+                del self.entry_finder[old_unique_id]
+                del self.id_to_rollout[old_unique_id]
+        # Assign a new unique_id for the updated rollout
+        unique_id = next(self.counter)
+        self.latest_id_per_rollout[rollout_key] = unique_id
+        # Add the new entry to the heap and mappings
+        heapq.heappush(self.heap, (-priority, unique_id))  # Max-heap using negative priority
+        self.entry_finder[unique_id] = (-priority, unique_id)
+        self.id_to_rollout[unique_id] = (state, rollout)
+    def pop(self) -> Tuple[Optional[State], Optional[str]]:
+        """
+        Pop the rollout with the highest priority.
+        Returns:
+        - Tuple[Optional[State], Optional[str]]: The state and rollout string, or (None, None) if empty.
+        """
+        while self.heap:
+            neg_priority, unique_id = heapq.heappop(self.heap)
+            # Check if this unique_id is still valid
+            if unique_id in self.entry_finder:
+                # Valid entry
+                state, rollout = self.id_to_rollout.pop(unique_id)
+                del self.entry_finder[unique_id]
+                # Remove from latest_id_per_rollout
+                state_id = id(state)
+                rollout_key = (state_id, rollout)
+                if self.latest_id_per_rollout.get(rollout_key) == unique_id:
+                    del self.latest_id_per_rollout[rollout_key]
+                return state, rollout
+            # Else, outdated entry; skip
+        return None, None
+    def is_empty(self) -> bool:
+        return not self.entry_finder
+# Define the OmegaPRM algorithm
+class OmegaPRM:
+    def __init__(self, LM: LanguageModel, reward_model, c_puct: float, alpha: float, beta: float, L: int, k: int, N: int,
+                 rollout_budget: int, save_data_tree: bool):
+        """
+        Initialize the OmegaPRM algorithm.
+        Parameters:
+            LM (LanguageModel): The language model instance.
+            reward_model: An instance of ProcessRewardModel to evaluate solution correctness.
+            c_puct (float): Exploration constant.
+            alpha (float): Weight for MC(s).
+            beta (float): Length penalty.
+            L (int): Maximum solution length.
+            k (int): Number of rollouts for Monte Carlo estimation.
+            N (int): Maximum search count.
+            rollout_budget (int): Total rollout budget.
+            save_data_tree (bool): Whether to save and return the data tree.
+        """
+        self.LM = LM
+        self.reward_model = reward_model
+        self.expected_answer = None
+        self.c_puct = c_puct
+        self.alpha = alpha
+        self.beta = beta
+        self.L = L
+        self.k = k
+        self.N = N
+        self.rollout_budget = rollout_budget
+        self.save_data_tree = save_data_tree
+        self.T = SearchTree()
+        self.C = CandidatePool()
+        self.n = 0
+        self.total_rollouts = 0
+    def reset(self):
+        """Reset internal state variables to prepare for a fresh run."""
+        self.expected_answer = None
+        self.T = SearchTree()  # Reset search tree
+        self.C = CandidatePool()  # Reset candidate pool
+        self.n = 0
+        self.total_rollouts = 0
+        self.collected_data = []  # Clear collected data
+    async def monte_carlo_estimation(self, state: State):
+        """
+        Perform Monte Carlo estimation for state by generating k rollouts
+        and computing MC(s) = c / k, where c is the number of correct rollouts.
+        """
+        c = 0  # Correct rollouts count
+        incorrect_rollouts = []
+        correct_rollouts = []
+        batct_rollouts = await self.LM.generate_rollout(state.solution_prefix, self.k)
+        # Increment visit count of selected state
+        state.N += 1
+        for i, rollout in enumerate(batct_rollouts):
+            # Increment number of total rollouts
+            self.total_rollouts += 1
+            # Generate rollout r_i
+            state.add_rollout(rollout)
+            # Evaluate correctness of final answer in rollout using the reward model.
+            full_solution = (state.solution_prefix + '\n\n' + rollout).strip() if state.solution_prefix else rollout
+            steps = separate_steps(full_solution, mode='split')
+            # If all steps receive a positive judgment, evaluate returns -1.
+            is_correct = await self.reward_model._async_evaluate(self.problem, steps)
+            if is_correct:
+                c += 1
+                correct_rollouts.append(rollout)
+            else:
+                incorrect_rollouts.append(rollout)
+                state.add_incorrect_rollout(rollout)  # Track incorrect rollouts
+        # Update total rollouts and correct rollouts
+        state.total_rollouts += self.k
+        state.correct_rollouts += c
+        state.MC = state.correct_rollouts / state.total_rollouts if state.total_rollouts > 0 else 0
+        if state.MC == 1.0:
+            # Add all correct rollouts to the tree as new states
+            for rollout in correct_rollouts:
+                self.add_correct_rollout_to_tree(state, rollout)
+        elif state.MC == 0.0:
+            # State is incorrect; no further action
+            for rollout in incorrect_rollouts:
+                self.add_incorrect_rollout_to_tree(state, rollout)
+            return
+        else:
+            # 0 < MC(s) < 1.0
+            # Add correct rollouts to the tree
+            for rollout in correct_rollouts:
+                self.add_correct_rollout_to_tree(state, rollout)
+            # Add incorrect rollouts to candidate pool with updated priorities
+            for rollout in incorrect_rollouts:
+                priority = self.compute_selection_score(state, rollout)
+                self.C.add_or_update(state, rollout, priority)
+    async def run(self, question: str, answer: str) -> List:
+        """
+        Execute the OmegaPRM algorithm.
+        Parameters:
+        - question (str): The question to generate solutions for.
+        Returns:
+        - Collected data: List of dictionaries.
+        """
+        self.reset()
+        self.problem = question  # Store the original question for reward evaluation
+        print(f"Running OmegaPRM for question: '{question}'\n")
+        # Initialization
+        if self.LM.tokenizer is not None:
+            question_tamplated = self.LM.tokenizer.apply_chat_template(
+                [{"role": "user", "content": question}],
+                tokenize=False,
+                add_special_tokens=False,
+                add_generation_prompt=True
+            )
+        else:
+            question_tamplated = question
+        initial_state = State(solution_prefix=question_tamplated, parent=None)
+        self.expected_answer = answer
+        self.T.root = initial_state
+        self.T.add_state(initial_state)
+        self.n = 0
+        # Monte Carlo Estimation for initial_state
+        await self.monte_carlo_estimation(initial_state)
+        # Main loop
+        while self.n < self.N and self.total_rollouts < self.rollout_budget and not self.C.is_empty():
+            # Selection Phase
+            selected_state, selected_rollout = self.selection_phase()
+            if selected_state is None or selected_rollout is None:
+                break
+            await self.expansion_phase_binary_search(selected_state, selected_rollout)
+            # Maintenance Phase
+            self.maintenance_phase(selected_state)
+            # Increment search count
+            self.n += 1
+        if self.save_data_tree:
+            data = self.collect_tree_structure()
+        else:
+            data = self.collect_solution_prefixes()
+        return data
+    def compute_Q(self, state: State, rollout: str) -> float:
+        """
+        Compute Q(s, r) = alpha^{1 - MC(s)} * beta^{len(r)/L}, where len(r) is based on word count.
+        """
+        # Count words in the rollout
+        word_count = len(rollout.split())
+        length_penalty = word_count / self.L
+        Q_value = (self.alpha ** (1 - state.MC)) * (self.beta ** length_penalty)
+        return Q_value
+    def compute_U(self, state: State) -> float:
+        """
+        Compute U(s) = c_puct * sqrt(sum_{s'} N(s')) / (1 + N(s))
+        """
+        N_total = sum(s.N for s in self.T.nodes)
+        if N_total == 0:
+            N_total = 1  # Prevent division by zero
+        U_s = self.c_puct * (math.sqrt(N_total)) / (1 + state.N)
+        return U_s
+    def compute_selection_score(self, state: State, rollout: str) -> float:
+        """
+        Compute selection score: Score(s, r) = Q(s, r) + U(s)
+        """
+        Q_s_r = self.compute_Q(state, rollout)
+        U_s = self.compute_U(state)
+        score = Q_s_r + U_s
+        return score
+    def selection_phase(self) -> Tuple[Optional[State], Optional[str]]:
+        """
+        Select (state, rollout) with the highest score from candidate pool C.
+        """
+        selected_state, selected_rollout = self.C.pop()
+        return selected_state, selected_rollout
+    def add_correct_rollout_to_tree(self, parent_state: State, rollout: str):
+        """
+        Add the correct rollout to the tree as a child of parent_state.
+        """
+        new_solution_prefix = (parent_state.solution_prefix + '\n\n' + rollout).strip() if parent_state.solution_prefix else rollout
+        new_state = State(solution_prefix=new_solution_prefix, parent=parent_state)
+        new_state.MC = 1.0  # Since the rollout is correct
+        new_state.total_rollouts = 0
+        new_state.correct_rollouts = 0
+        self.T.add_state(new_state)
+        parent_state.children.append(new_state)  # Add to parent's children
+    def add_incorrect_rollout_to_tree(self, parent_state: State, rollout: str):
+        """
+        Add the incorrect rollout to the tree as a child of parent_state.
+        Parameters:
+        - parent_state (State): The state from which the rollout was selected.
+        - rollout (str): The incorrect rollout string.
+        """
+        new_solution_prefix = (parent_state.solution_prefix + '\n\n' + rollout).strip() if parent_state.solution_prefix else rollout
+        new_state = State(solution_prefix=new_solution_prefix, parent=parent_state)
+        new_state.MC = 0.0  # Since the rollout is incorrect
+        new_state.total_rollouts = 0
+        new_state.correct_rollouts = 0
+        self.T.add_state(new_state)
+        parent_state.children.append(new_state)  # Add to parent's children
+    async def binary_search_incorrect_step(self, s_ast: State, steps: List[str], left: int, right: int):
+        """
+        Recursively perform binary search to find all incorrect steps in the rollout.
+        """
+        if left > right:
+            return
+        mid = (left + right) // 2
+        new_steps = steps[left:mid + 1]
+        if new_steps:
+            prefix_solution = s_ast.solution_prefix + '\n\n' + separate_steps(new_steps, mode='join')
+        else:
+            prefix_solution = s_ast.solution_prefix
+        # Create new state s_new
+        s_new = State(solution_prefix=prefix_solution.strip(), parent=s_ast)
+        self.T.add_state(s_new)
+        s_ast.children.append(s_new)
+        # Perform Monte Carlo estimation for s_new
+        await self.monte_carlo_estimation(s_new)
+        if s_new.MC == 0:
+            # Found incorrect step; continue searching in the left half to find earlier incorrect steps
+            await self.binary_search_incorrect_step(s_ast, steps, left, mid - 1)
+        else:
+            # Steps up to mid are correct; continue searching in the right half
+            await self.binary_search_incorrect_step(s_new, steps, mid + 1, right)
+    async def expansion_phase_binary_search(self, parent_state: State, rollout: str):
+        """
+        Expansion phase that adds the rollout as a new state and performs Monte Carlo estimation
+        using Binary Search to efficiently find the correct rollout.
+        """
+        # Separate the rollout into individual steps
+        steps = separate_steps(rollout, mode='split')
+        # Perform binary search to find incorrect steps
+        await self.binary_search_incorrect_step(parent_state, steps, 0, len(steps) - 1)
+    def maintenance_phase(self, state: State):
+        """
+        Update statistics and candidate pool for all incorrect rollouts associated with the state.
+        Parameters:
+        - state (State): The state whose incorrect rollouts need to be updated.
+        """
+        # Iterate through all incorrect rollouts of the state
+        for rollout in state.incorrect_rollouts:
+            # Since we've already determined these rollouts are incorrect, no need to re-evaluate correctness
+            priority = self.compute_selection_score(state, rollout)
+            # Update the candidate pool with the new priority
+            self.C.add_or_update(state, rollout, priority)
+            # print(f"Updated Incorrect Rollout: '{rollout}' with new priority: {priority:.4f}")
+        # print("Maintenance Phase Completed.\n")
+    def collect_solution_prefixes(self) -> List[Dict[str, Any]]:
+        """
+        Collect all solution prefixes and their corresponding MC values from the search tree.
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries containing solution prefixes and their MC values.
+        """
+        collected_data = []
+        for node in self.T.nodes:
+            solution_prefix = node.solution_prefix
+            mc_value = node.MC
+            collected_data.append({
+                "solution_prefix": solution_prefix,
+                "mc_value": mc_value
+            })
+        return collected_data
+    def collect_tree_structure(self) -> Dict[str, Any]:
+        """
+        Collect the tree structure starting from the root.
+        Returns:
+            Dict[str, Any]: A nested dictionary representing the tree structure.
+        """
+        if self.T.root:
+            tree_data = self.T.root.get_text_with_labels()
+            return tree_data
+        return {}
+# Example usage
+if __name__ == "__main__":
+    # Initialize the Language Model's AsyncOpenAI client for LM.
+    from openai import AsyncOpenAI
+    lm_client = AsyncOpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="token-abc123",
+    )
+    LM = LanguageModel(
+        client=lm_client,
+        max_new_tokens=4096,
+        temperature=0.7,
+        top_p=0.9,
+        model_name="DeepSeek-R1-Distill-Qwen-14B"
+    )
+    # Define the question and expected answer
+    question = "Melinda will roll two standard six-sided dice and make a two-digit number with the two numbers she rolls. For example, if she rolls a 6 and a 3, she can either form 36 or 63. What is the probability that she will be able to make an integer between 10 and 20, inclusive? Express your answer as a common fraction."
+    expected_answer =  "\\frac{11}{36}"
+    client = AsyncOpenAI(
+        base_url="http://localhost:8001/v1",
+        api_key="token-abc123",
+    )  # This is a placeholder; ensure client supports sync chat.completions.create
+    reward_model = ProcessRewardModel(client, model="deepseek-14b-prm-filtered-balance-full", temperature=0.0, max_tokens=1)
+    # Initialize OmegaPRM with parameters and the reward model instance
+    omega_prm = OmegaPRM(
+        LM=LM,
+        reward_model=reward_model,
+        c_puct=0.125,
+        alpha=0.5,
+        beta=0.9,
+        L=500,
+        k=8,
+        N=10,
+        rollout_budget=20,
+        save_data_tree=True,
+    )
+    # Run the OmegaPRM algorithm
+    collected_data = asyncio.run(omega_prm.run(question, expected_answer))
+    # Save the collected solutions to a JSON file
+    with open("collected_solutions2.json", "w") as f:
+        json.dump(collected_data, f, indent=4)