Alexhf825 committed on
Commit fd3b506 · verified · 1 Parent(s): ae516e5

Upload folder using huggingface_hub

automr/config.py CHANGED
@@ -18,6 +18,8 @@ class AutoMRConfig:
     batch_size: int = 8
     num_samples_per_query: int = 4 # M in paper
     gradient_clip: float = 1.0
+    initial_baseline: float = 0.0 # Initial value for REINFORCE baseline
+    baseline_momentum: float = 0.9 # Momentum for baseline update
 
     # Validation settings
     val_every_n_steps: int = 100 # Alpha in the requirement - validate every N steps
@@ -25,7 +27,7 @@ class AutoMRConfig:
     early_stopping_patience: int = 5 # Stop if no improvement for N validations
 
     # Generation settings
-    max_new_tokens: int = 1024
+    max_new_tokens: int = 4096
    temperature: float = 0.01
     top_p: float = 0.9
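The two new fields parameterize an exponential-moving-average baseline used by the REINFORCE trainer below. A minimal standalone sketch of the update they control, with illustrative reward values (the real update lives in automr/trainer.py):

# Minimal sketch of the EMA baseline the two new config fields control.
# Standalone and illustrative; not the trainer itself.
from dataclasses import dataclass

@dataclass
class BaselineConfig:
    initial_baseline: float = 0.0   # starting value of the running baseline
    baseline_momentum: float = 0.9  # weight kept on the old baseline each update

def update_baseline(baseline: float, avg_reward: float, momentum: float) -> float:
    """Exponential moving average: b <- m * b + (1 - m) * r_avg."""
    return momentum * baseline + (1.0 - momentum) * avg_reward

cfg = BaselineConfig()
b = cfg.initial_baseline
for r in [1.0, -1.0, 1.0, 1.0]:  # illustrative per-batch average rewards
    b = update_baseline(b, r, cfg.baseline_momentum)
print(f"baseline after 4 batches: {b:.3f}")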
 
automr/dag.py CHANGED
@@ -57,7 +57,7 @@ class MetaReasoningDAG:
 
     def get_context_up_to(self, idx: int) -> str:
         """Get all node contents up to index idx"""
-        return "\n".join([node.content for node in self.nodes[:idx+1]])
+        return "\n".join([f"step: {node.index}: {node.content}" for node in self.nodes[:idx+1]])
 
     def total_tokens(self) -> int:
         """Total tokens generated (excluding source node)"""
automr/evaluator.py CHANGED
@@ -1,6 +1,7 @@
 from typing import List, Dict, Tuple
 from tqdm import tqdm
 import os
+import asyncio
 
 from .model import AutoMR
 from .config import AutoMRConfig
@@ -15,79 +16,71 @@ class AutoMREvaluator:
         self.config = config
         ensure_dir(config.results_dir)
 
-    def evaluate(self, test_data: List[Dict[str, str]]) -> Tuple[float, List[Dict]]:
-        """
-        Evaluate model on test data
-        Returns: (accuracy, detailed_results)
-        """
-        print(f"\nEvaluating on {len(test_data)} samples...")
+    async def evaluate_async(self, test_data: List[Dict[str, str]]) -> Tuple[float, List[Dict]]:
+        """Async evaluation: send all queries in a single batch to vLLM."""
+        print(f"\nEvaluating on {len(test_data)} samples (async, single batch)...")
 
         self.model.strategy_mlp.eval()
         self.model.strategy_embeddings.eval()
 
+        queries = [item['query'] for item in test_data]
+        ground_truths = [item['answer'] for item in test_data]
+
+        # One-shot async sampling over all queries, M=1
+        pred_answers, _ = await self.model.sample_batch(queries, M=1)
+
         correct = 0
         total = 0
-        detailed_results = []
-        batch_size = self.config.batch_size
-        pbar = tqdm(
-            range(0, len(test_data), batch_size), desc="Evaluating"
-        )
-
-        # for item in tqdm(test_data, desc="Evaluating"):
-        for i in pbar:
-            batch = test_data[i:i + batch_size]
-            queries = [item['query'] for item in batch]
-            ground_truths = [item['answer'] for item in batch]
-
-            # Run inference
-            pred_answers, dags = self.model.inference(queries, M=1)
-
-            for query, ground_truth, pred_answer, dag in zip(queries, ground_truths, pred_answers, dags):
-                # Check correctness
-                is_correct = check_answer_match(
-                    pred_answer,
-                    ground_truth,
-                    self.config.task_type
-                )
-
-                if is_correct:
-                    correct += 1
-                total += 1
-
-                pbar.set_postfix({
-                    'Acc': f'{correct} / {total}',
-                })
-
-                # Store detailed result
+        detailed_results: List[Dict] = []
+
+        for query, ground_truth, pred_answer in tqdm(
+            zip(queries, ground_truths, pred_answers),
+            total=len(queries),
+            desc="Evaluating",
+        ):
+            is_correct = check_answer_match(
+                pred_answer,
+                ground_truth,
+                self.config.task_type,
+            )
+            if is_correct:
+                correct += 1
+            total += 1
+
+        # Optional: collect detailed results (empty by default; keeps the file structure)
         # result = {
         #     'query': query,
         #     'ground_truth': ground_truth,
         #     'prediction': pred_answer,
-        #     'correct': is_correct
+        #     'correct': is_correct,
         # }
         # if self.config.save_skeletons:
-        #     result['skeleton'] = dag.to_dict()
+        #     result['skeleton'] = None
         # detailed_results.append(result)
 
         accuracy = correct / total if total > 0 else 0.0
 
         print(f"\nEvaluation Results:")
         print(f"Accuracy: {accuracy:.4f} ({correct}/{total})")
 
-        # Save results
         if self.config.save_predictions:
             results_path = os.path.join(
                 self.config.results_dir,
-                'evaluation_results.json'
+                'evaluation_results.json',
+            )
+            save_json(
+                {
+                    'accuracy': accuracy,
+                    'correct': correct,
+                    'total': total,
+                    'detailed_results': detailed_results,
+                },
+                results_path,
             )
-            save_json({
-                'accuracy': accuracy,
-                'correct': correct,
-                'total': total,
-                'detailed_results': detailed_results
-            }, results_path)
             print(f"Results saved to {results_path}")
 
         return accuracy, detailed_results
+
+    def evaluate(self, test_data: List[Dict[str, str]]) -> Tuple[float, List[Dict]]:
+        """Synchronous wrapper for async evaluation, for use in main.py."""
+        return asyncio.run(self.evaluate_async(test_data))
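The sync wrapper keeps main.py unchanged while the body goes async. One caveat worth noting: asyncio.run() raises a RuntimeError when called from a thread that already has a running event loop (e.g. inside Jupyter). A minimal sketch of the pattern, with hypothetical stand-in names:

# Sketch of the sync-over-async wrapper pattern used by evaluate();
# the names and the sleep stand-in are hypothetical.
import asyncio
from typing import List

async def evaluate_async(queries: List[str]) -> float:
    await asyncio.sleep(0)  # stand-in for the batched vLLM calls
    return 1.0

def evaluate(queries: List[str]) -> float:
    # Safe from plain scripts; raises RuntimeError inside a running loop
    return asyncio.run(evaluate_async(queries))

if __name__ == "__main__":
    print(evaluate(["q1", "q2"]))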
automr/model.py CHANGED
@@ -1,22 +1,18 @@
-from concurrent.futures import ThreadPoolExecutor
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from typing import List, Tuple
+import asyncio
 import random
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from vllm import LLM
-from vllm import SamplingParams
+from typing import List, Tuple, Dict, Any
+from openai import AsyncOpenAI
+from tqdm.asyncio import tqdm_asyncio
+
 from .config import AutoMRConfig
 from .strategies import META_STRATEGIES, STRATEGY_LIST
 from .dag import MetaReasoningDAG
-from .utils import extract_answer
-from typing import Dict
-from openai import OpenAI
 
 class StrategyMLP(nn.Module):
     """MLP for sampling meta-reasoning strategies"""
-
     def __init__(self, hidden_size: int, num_strategies: int):
         super().__init__()
         # Input: [node_repr, strategy_repr, context_repr]
@@ -26,14 +22,6 @@ class StrategyMLP(nn.Module):
         self.dropout = nn.Dropout(0.1)
 
     def forward(self, node_repr, strategy_repr, context_repr):
-        """
-        Args:
-            node_repr: [batch, hidden_size]
-            strategy_repr: [batch, hidden_size]
-            context_repr: [batch, hidden_size]
-        Returns:
-            logits: [batch, num_strategies]
-        """
         x = torch.cat([node_repr, strategy_repr, context_repr], dim=-1)
         x = F.relu(self.fc1(x))
         x = self.dropout(x)
@@ -44,361 +32,250 @@ class StrategyMLP(nn.Module):
 
 
 class AutoMR:
-    """AutoMR Framework for Meta-Reasoning Skeleton Search"""
+    """AutoMR Framework with Async vLLM Support"""
 
     def __init__(self, config: AutoMRConfig):
         self.config = config
         self.device = config.device
         self.token_budget = config.token_budget
-        self.model_name_for_api = config.model_name
 
-        # # Load LLM
-        # print(f"Loading Tokenizer and Config: {config.model_name}")
-        # self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
-
-        # print(f"Loading vLLM generator: {config.model_name}")
-        # self.llm = LLM(
-        #     config.model_name,
-        #     dtype=torch.float16,
-        #     trust_remote_code=True,
-        #     tensor_parallel_size=config.tensor_parallel_size,
-        #     gpu_memory_utilization="0.8"
-        # )
-
-        # print(f"Loading LLM Embbedder: {config.model_name}")
-        # self.llm_embedder = AutoModelForCausalLM.from_pretrained(
-        #     config.model_name,
-        #     torch_dtype=torch.float16,
-        #     trust_remote_code=True,
-        #     device_map=None
-        # ).to(self.device)
-        # self.llm_embedder.eval()
-
-        # print(f"Loading vLLM Embbedder: {config.model_name}")
-        # self.llm_embedder = LLM(
-        #     config.model_name,
-        #     dtype=torch.float16,
-        #     trust_remote_code=True,
-        #     tensor_parallel_size=config.tensor_parallel_size,
-        #     gpu_memory_utilization="0.8"
-        #     task="embed",
-        # )
-
-        print("Connecting to vLLM Generator Server (port 8000)...")
-        self.generator_client = OpenAI(
-            api_key="vllm",
-            base_url="http://localhost:8000/v1"
-        )
+        # Concurrency control: prevent overloading the client/server
+        self.semaphore = asyncio.Semaphore(128)
 
-        print("Connecting to Custom Embedder Server (port 8001)...")
-        self.embed_client = OpenAI(
-            api_key="vllm",
-            base_url="http://localhost:8001/v1"
-        )
+        print(f"Connecting to vLLM Generator (Async)...")
+        self.generator_client = AsyncOpenAI(api_key="vllm", base_url="http://localhost:8000/v1")
 
-        # Strategy components
+        print(f"Connecting to Embedder (Async)...")
+        self.embed_client = AsyncOpenAI(api_key="vllm", base_url="http://localhost:8001/v1")
+
+        # Components for the meta-reasoning strategy network
         self.num_strategies = len(STRATEGY_LIST)
         hidden_size = config.hidden_size
 
         self.strategy_embeddings = nn.Embedding(self.num_strategies, hidden_size).to(self.device)
         self.strategy_mlp = StrategyMLP(hidden_size, self.num_strategies).to(self.device)
 
-        # Strategy mappings
-        self.strategy_to_idx = {s: i for i, s in enumerate(STRATEGY_LIST)}
+        # Mapping tables between strategy indices and names
         self.idx_to_strategy = {i: s for i, s in enumerate(STRATEGY_LIST)}
+        self.strategy_to_idx = {s: i for i, s in enumerate(STRATEGY_LIST)}
 
-        # Optimizer
+        # Optimizer for strategy embeddings and MLP
        self.optimizer = torch.optim.Adam(
             list(self.strategy_embeddings.parameters()) +
             list(self.strategy_mlp.parameters()),
             lr=config.learning_rate
         )
 
-        print("AutoMR initialized successfully")
+        # Pre-allocated zero tensor to avoid repeated allocation in loops
+        self.zero_strategy_repr = torch.zeros(1, hidden_size, device=self.device)
 
-    def get_text_representation(self, texts: List[str]) -> Tuple[torch.Tensor]:
-        """
-        Get pooled hidden state representations for texts in batch using LLM embedder.
-        Args:
-            texts: List of input texts
-        Returns:
-            pooled: Tensor of shape [batch_size, hidden_size]
-        """
-        # self.tokenizer.padding_side = "left"
-        # inputs = self.tokenizer(
-        #     texts,
-        #     return_tensors="pt",
-        #     padding=True,
-        #     truncation=True,
-        # ).to(self.device)
-
-        # with torch.no_grad():
-        #     outputs = self.llm(**inputs, output_hidden_states=True)
-        #     hidden_states = outputs.hidden_states[-1] # [bsz, len, dim]
-        #     pooled = hidden_states[:, -1, :]
-
-        # batch_outputs = self.llm_embedder.encode(texts)
-        # pooled = []
-        # for outputs in batch_outputs:
-        #     last_hidden_state = outputs.outputs.data[-1,:] # [seq_len, hidden_size]
-        #     pooled.append(last_hidden_state)
-        # pooled = torch.stack(pooled, dim=0).to(self.device) # [batch_size, hidden_size]
-
-        batch_outputs = self.embed_client.embeddings.create(
-            input=texts,
-            model=self.model_name_for_api
-        )
-
-        batch_reprs = [
-            torch.tensor(data.embedding, device=self.device, dtype=torch.float16)
-            for data in batch_outputs.data
-        ]
-
-        pooled = torch.stack(batch_reprs, dim=0) # [batch_size, hidden_size]
-        return pooled
-
-    def sample_strategy(
-        self,
-        batch_node_content_repr: torch.Tensor,
-        batch_sampled_strategies: Dict[int, List[int]],
-        batch_context_repr: torch.Tensor
-    ) -> Tuple[List[int], torch.Tensor]:
-        """
-        Sample a strategy for each edge (j, i) in batch
-        Args:
-            batch_node_content_repr: Tensor of shape [batch_size, hidden_size]
-            batch_sampled_strategies: Dict of lists of sampled strategy indices
-            batch_context_repr: Tensor of shape [batch_size, hidden_size]
-        Returns:
-            batch_strategy_idx: List of sampled strategy indices
-            batch_log_prob: Tensor of log probabilities, shape [batch_size]
-        """
-        batch_strategy_repr = []
-
-        for sampled_strategies in batch_sampled_strategies.values():
-            if sampled_strategies:
-                sampled_strategies = torch.tensor(sampled_strategies).to(self.device)
-                strategy_repr = self.strategy_embeddings(sampled_strategies).mean(dim=0, keepdim=True)
-            else:
-                strategy_repr = torch.zeros(1, self.config.hidden_size).to(self.device)
-            batch_strategy_repr.append(strategy_repr)
-
-        batch_strategy_repr = torch.cat(batch_strategy_repr, dim=0) # Combine all batch representations
-        batch_logits = self.strategy_mlp(batch_node_content_repr, batch_strategy_repr, batch_context_repr)
-        batch_probs = F.softmax(batch_logits, dim=-1)
-
-        dist = torch.distributions.Categorical(batch_probs)
-        batch_strategy_idx = dist.sample()
-        batch_log_prob = dist.log_prob(batch_strategy_idx).to(self.device)
-
-        return batch_strategy_idx.cpu().tolist(), batch_log_prob
-
-    def generate_content(
-        self,
-        batch_query: List[str],
-        batch_context: List[str],
-        batch_strategies: List[List[str]],
-        batch_remaining_budget: List[int]
-    ) -> Tuple[List[str], List[int], torch.Tensor]:
-        """
-        Generate reasoning content based on selected strategies
-        Args:
-            batch_query: List of query strings
-            batch_context: List of context strings
-            batch_strategies: List of lists of strategy names
-            batch_remaining_budget: List of remaining token budgets
-        Returns:
-            batch_generated_texts: List of generated content strings
-            batch_num_tokens: List of number of tokens generated
-            batch_content_reprs: Tensor of content representations, shape [batch_size, hidden_size]
-        """
-        batch_strategy_prompts = [[] for _ in batch_query]
-        batch_full_prompt: List[str] = []
-        for i, strategies in enumerate(batch_strategies):
-            for s in strategies:
-                prompt = random.choice(META_STRATEGIES[s])
-                batch_strategy_prompts[i].append(prompt)
-
-            batch_full_prompt.append(f"{batch_context[i]}\n{' '.join(batch_strategy_prompts[i])}\n")
-
-        params_list = []
-        for i in range(len(batch_query)):
-            remaining_budget = batch_remaining_budget[i]
-            current_max_tokens = min(self.config.max_new_tokens, remaining_budget)
-            params_list.append({
-                "prompt": batch_full_prompt[i],
-                "max_tokens": current_max_tokens,
-            })
-
-        with ThreadPoolExecutor(max_workers=None) as executor:
-            batch_outputs = list(executor.map(self.make_api_call, params_list))
-
-        batch_generated_texts = [output.choices[0].text.strip() for output in batch_outputs]
-        batch_num_tokens = [output.usage.completion_tokens for output in batch_outputs]
-        batch_content_reprs = self.get_text_representation(batch_generated_texts)
-
-        return batch_generated_texts, batch_num_tokens, batch_content_reprs
-
-    def dynamic_skeleton_sampling(self, queries: List[str], M: int) -> Tuple[List[MetaReasoningDAG], torch.Tensor]:
-        """
-        Algorithm 1: Dynamic Skeleton Sampling at inference time
-        Args:
-            queries: List of input query strings
-            M: Number of trajectories per query
-        Returns:
-            batch_dags: List of generated MetaReasoningDAGs
-            total_log_probs: Tensor of total log probabilities for each trajectory
-        """
-        # === 1. Initialize M*N DAGs ===
-        N = len(queries)
-        batch_size = N * M
-        batch_dags: List[MetaReasoningDAG] = []
-        query_reprs = self.get_text_representation(queries)
-        for i in range(N):
-            for _ in range(M):
-                batch_dags.append(
-                    MetaReasoningDAG(queries[i], query_reprs[i], 0) # we don't count query tokens, set 0
-                )
-
-        total_log_probs = torch.zeros(batch_size).to(self.device)
-        # the idx of trajectories that are still active
-        active_indices = list(range(batch_size))
-        i = 0 # Current topology step (i=1 is the first new node)
-        while active_indices:
-            i += 1
-            sampled_strategies = {dag_idx: [] for dag_idx in active_indices}
-            incoming_edges = {dag_idx: [] for dag_idx in active_indices}
-
-            # Step 1: Determine incoming edges (traverse in reverse order)
-            for j in range(i-1, -1, -1):
-                node_j_content_reprs = torch.stack([batch_dags[idx].get_node_content_repr(j) for idx in active_indices], dim=0)
-                context_reprs = torch.stack([batch_dags[idx].get_context_repr_up_to(i-1) for idx in active_indices], dim=0)
-
-                strategy_idx, log_prob = self.sample_strategy(
-                    node_j_content_reprs,
-                    sampled_strategies,
-                    context_reprs
-                )
-
-                for k, dag_idx in enumerate(active_indices):
-                    sampled_strategies[dag_idx].append(strategy_idx[k])
-
-                total_log_probs[active_indices] += log_prob
-
-                for dag_idx in active_indices:
-                    strategy_idx = sampled_strategies[dag_idx][-1]
-                    strategy_name = self.idx_to_strategy[strategy_idx]
-                    if strategy_name != "zero":
-                        incoming_edges[dag_idx].append((j, strategy_name))
-
-            # Step 2: Check which DAGs are still active
-            for dag_idx in active_indices.copy():
-                if not incoming_edges[dag_idx]:
-                    active_indices.remove(dag_idx)
-
-            if not active_indices:
-                break
-
-            # Step 3: Generate base reasoning content
-            batch_strategies = []
-            batch_context = []
-            batch_query = []
-            batch_remaining_budget = []
-            for dag_idx in active_indices:
-                dag = batch_dags[dag_idx]
-                strategies = [edge[1] for edge in incoming_edges[dag_idx]]
-                batch_strategies.append(strategies)
-                context = dag.get_context_up_to(i-1)
-                batch_context.append(context)
-                batch_query.append(dag.nodes[0].content)
-                batch_remaining_budget.append(self.token_budget - dag.total_tokens())
-
-            batch_content, batch_num_tokens, batch_content_repr = self.generate_content(
-                batch_query,
-                batch_context,
-                batch_strategies,
-                batch_remaining_budget
-            )
-
-            # Step 4: Update DAGs with new nodes and edges
-            for k, dag_idx in enumerate(active_indices):
-                dag = batch_dags[dag_idx]
-                content = batch_content[k]
-                num_tokens = batch_num_tokens[k]
-                content_repr = batch_content_repr[k]
-                dag.add_node(content, num_tokens, content_repr)
-
-            for dag_idx in incoming_edges:
-                dag = batch_dags[dag_idx]
-                for from_j, strategy in incoming_edges[dag_idx]:
-                    dag.add_edge(from_j, i, strategy)
-
-            # Step 5: Check stopping criteria
-            for dag_idx in active_indices.copy():
-                dag = batch_dags[dag_idx]
-                content = dag.get_node_content(i)
-                if not content or "boxed" in content.lower() or dag.total_tokens() >= self.token_budget:
-                    active_indices.remove(dag_idx)
-
-        return batch_dags, total_log_probs
-
-    def extract_answer(self, batch_dags: List[MetaReasoningDAG]) -> List[str]:
-        """Extract final answer from the reasoning DAG"""
-        batch_answer_prompts = []
-        params_list = []
-        for dag in batch_dags:
-            full_context = dag.get_context_up_to(len(dag.nodes) - 1)
-
-            answer_prompt = f"{full_context}\n{random.choice(META_STRATEGIES['Answer'])}\n"
-            params_list.append({
-                "prompt": answer_prompt,
-                "max_tokens": self.config.max_new_tokens,
-            })
-
-        with ThreadPoolExecutor(max_workers=None) as executor:
-            batch_outputs = list(executor.map(self.make_api_call, params_list))
-
-        batch_answers = [output.choices[0].text.strip() for output in batch_outputs]
-
-        return batch_answers
-
-    def inference(self, batch_queries: List[str], M: int) -> Tuple[str, MetaReasoningDAG]:
-        """Run inference on a single query"""
-        self.strategy_mlp.eval()
-        self.strategy_embeddings.eval()
-
-        with torch.no_grad():
-            batch_dags, _ = self.dynamic_skeleton_sampling(batch_queries, M)
-            batch_answers = self.extract_answer(batch_dags)
-
-        return batch_answers, batch_dags
+        print("AutoMR initialized successfully (Async Mode)")
+
+    async def get_text_representation(self, text: str) -> torch.Tensor:
+        """Get the embedding vector for a single text string."""
+        if not text or not text.strip():
+            return torch.zeros(self.config.hidden_size, device=self.device, dtype=torch.float16)
+
+        try:
+            # For simplicity we do not add full retry logic here
+            resp = await self.embed_client.embeddings.create(
+                input=text,
+                model=self.config.model_name
+            )
+            # Extract the embedding and move it to the GPU/device
+            return torch.tensor(resp.data[0].embedding, device=self.device, dtype=torch.float16)
+        except Exception as e:
+            print(f"Embedding Error: {e}")
+            return torch.zeros(self.config.hidden_size, device=self.device, dtype=torch.float16)
+
+    async def _generate_text(self, prompt: str, max_tokens: int) -> Tuple[str, int]:
+        """Atomic text generation call, returning text and used completion tokens."""
+        if not prompt:
+            return "", 0
+        async with self.semaphore:
+            try:
+                resp = await self.generator_client.completions.create(
+                    model=self.config.model_name,
+                    prompt=prompt,
+                    max_tokens=max_tokens,
+                    temperature=self.config.temperature
+                )
+                text = resp.choices[0].text.strip()
+                # vLLM/OpenAI-style usage field; fall back to 0 if missing
+                used_tokens = resp.usage.completion_tokens
+                return text, int(used_tokens)
+            except Exception as e:
+                print(f"Generation Error: {e}")
+                return "", 0
+
+    def select_strategy(
+        self,
+        node_j_repr: torch.Tensor,
+        existing_strategy_indices: List[int],
+        context_repr: torch.Tensor
+    ) -> Tuple[int, torch.Tensor]:
+        """Decide whether to create an edge j->i according to Algorithm 1.
+
+        Args:
+            node_j_repr: Representation of candidate source node j.
+            existing_strategy_indices: Already selected k->i strategies (k > j).
+            context_repr: Global context representation.
+        """
+        # 1. Pool existing strategies (k->i)
+        if existing_strategy_indices:
+            hist_tensor = torch.tensor(existing_strategy_indices, device=self.device, dtype=torch.long)
+            strategy_repr = self.strategy_embeddings(hist_tensor).mean(dim=0, keepdim=True)
+        else:
+            strategy_repr = self.zero_strategy_repr
+
+        # 2. Forward pass through the MLP (add batch dimension [1, dim])
+        logits = self.strategy_mlp(
+            node_j_repr.unsqueeze(0),
+            strategy_repr,
+            context_repr.unsqueeze(0)
+        )
+
+        # 3. Sample a strategy index from the categorical distribution
+        probs = F.softmax(logits, dim=-1)
+        dist = torch.distributions.Categorical(probs)
+        idx = dist.sample()
+        log_prob = dist.log_prob(idx)
+
+        return idx.item(), log_prob
+
+    async def run_single_trajectory(self, query: str) -> Tuple[str, torch.Tensor]:
+        """Run a single reasoning trajectory (Algorithm 1).
+
+        Returns:
+            Tuple of (final_answer, total_log_prob).
+        """
+        # 1. Initialization
+        q_repr = await self.get_text_representation(query)
+        dag = MetaReasoningDAG(query, q_repr, 0)
+        trajectory_log_prob = torch.tensor(0.0, device=self.device)
+
+        step_idx = 0
+        # Stopping condition: token budget exhausted or step limit (30)
+        while dag.total_tokens() < self.config.token_budget and step_idx < 30:
+            step_idx += 1
+
+            context_repr = dag.get_context_repr_up_to(step_idx - 1)
+
+            # === Inner loop: iterate j in reverse order (from i-1 down to 0) ===
+            strategies_k_to_i: List[int] = []  # Input to select_strategy (k > j)
+            incoming_edges_info: List[Tuple[int, str]] = []  # For prompt construction: (src_node_idx, strategy_name)
+
+            for j in range(step_idx - 1, -1, -1):
+                node_j_repr = dag.get_node_content_repr(j)
+
+                # Strategy decision
+                strat_idx, log_prob = self.select_strategy(
+                    node_j_repr, strategies_k_to_i, context_repr
+                )
+
+                # Accumulate log-probability contribution
+                trajectory_log_prob = trajectory_log_prob + log_prob
+                strategy_name = self.idx_to_strategy[strat_idx]
+
+                # If this is a non-zero (effective) strategy
+                if strategy_name != "zero":
+                    strategies_k_to_i.append(strat_idx)
+                    incoming_edges_info.append((j, strategy_name))
+                    dag.add_edge(j, step_idx, strategy_name)
+
+            # Fallback: if not the first step and there is no incoming edge, treat as reasoning interruption
+            if not incoming_edges_info and step_idx > 1:
+                break
+
+            # === Prompt construction (Algorithm 1 + Appendix A.2) ===
+            # Reverse edges back to chronological order (Step 0, Step 1, ...) for readability
+            incoming_edges_info.reverse()
+
+            prompts_list = []
+            has_answer_strategy = False
+
+            for src_node_idx, s_name in incoming_edges_info:
+                if s_name == "Answer":
+                    has_answer_strategy = True
+
+                # Get the strategy template prompt
+                raw_strategy_prompt = random.choice(META_STRATEGIES.get(s_name, [""]))
+
+                # Apply the Appendix A.2 template
+                formatted_prompt = f"Let me attend to Step {src_node_idx}, {raw_strategy_prompt}"
+                prompts_list.append(formatted_prompt)
+
+            # Concatenate all incoming-edge prompts
+            strategies_text = " ".join(prompts_list)
+            full_context = dag.get_context_up_to(step_idx - 1)
+
+            # === Generate content ===
+            if has_answer_strategy:
+                # Final answer generation
+                full_prompt = f"{full_context}\n{strategies_text}\nAnswer:\n"
+                remain_budget = max(1, self.config.token_budget - dag.total_tokens())
+                final_answer, used_tokens = await self._generate_text(full_prompt, remain_budget)
+                # Strictly accumulate used completion tokens before termination
+                if used_tokens > 0:
+                    dag.add_node(final_answer, used_tokens, await self.get_text_representation(final_answer))
+                return final_answer, trajectory_log_prob
+            else:
+                # Intermediate reasoning generation
+                full_prompt = f"{full_context}\n{strategies_text}\n"
+
+                # Compute the token limit for the current step
+                current_remain = self.config.token_budget - dag.total_tokens()
+                step_limit = min(self.config.max_new_tokens, current_remain)
+
+                if step_limit <= 0:
+                    break  # Budget exhausted
+
+                content, used_tokens = await self._generate_text(full_prompt, step_limit)
+
+                # If the service does not return usage, fall back to at least one token when content exists
+                if used_tokens <= 0 and content:
+                    used_tokens = 1
+
+                content_repr = await self.get_text_representation(content)
+                dag.add_node(content, used_tokens, content_repr)
+
+                # Check whether a boxed answer appears earlier than expected
+                if "boxed" in content:
+                    return content, trajectory_log_prob
+
+        # After the loop ends: return the content of the last node if one exists
+        if len(dag.nodes) > 0:
+            return dag.nodes[-1].content, trajectory_log_prob
+        return "", trajectory_log_prob
+
+    async def sample_batch(self, queries: List[str], M: int) -> Tuple[List[str], torch.Tensor]:
+        """Async entry: expand B queries into B*M trajectories and run concurrently."""
+        tasks = []
+        for q in queries:
+            for _ in range(M):
+                tasks.append(self.run_single_trajectory(q))
+
+        # Show generation progress: each finished trajectory updates the bar
+        results = await tqdm_asyncio.gather(*tasks)
+
+        answers = [r[0] for r in results]
+        log_probs = torch.stack([r[1] for r in results])
+        return answers, log_probs
+
+    def sample_batch_sync(self, queries: List[str], M: int) -> Tuple[List[str], torch.Tensor]:
+        """Synchronous wrapper for sample_batch, for use in trainer."""
+        return asyncio.run(self.sample_batch(queries, M))
 
-    def save_checkpoint(self, path: str):
-        """Save model checkpoint"""
+    # Compatibility interfaces
+    def save_checkpoint(self, path):
         torch.save({
             'strategy_embeddings': self.strategy_embeddings.state_dict(),
             'strategy_mlp': self.strategy_mlp.state_dict(),
             'optimizer': self.optimizer.state_dict()
         }, path)
-        print(f"Checkpoint saved to {path}")
 
-    def load_checkpoint(self, path: str):
-        """Load model checkpoint"""
-        checkpoint = torch.load(path, map_location=self.device)
-        self.strategy_embeddings.load_state_dict(checkpoint['strategy_embeddings'])
-        self.strategy_mlp.load_state_dict(checkpoint['strategy_mlp'])
-        self.optimizer.load_state_dict(checkpoint['optimizer'])
-        print(f"Checkpoint loaded from {path}")
-
-    def make_api_call(self, params):
-        """Make API call to vLLM server"""
-        return self.generator_client.completions.create(
-            model=self.model_name_for_api,
-            prompt=params["prompt"],
-            max_tokens=params["max_tokens"],
-            temperature=self.config.temperature,
-            top_p=self.config.top_p,
-        )
+    def load_checkpoint(self, path):
+        ckpt = torch.load(path, map_location=self.device)
+        self.strategy_embeddings.load_state_dict(ckpt['strategy_embeddings'])
+        self.strategy_mlp.load_state_dict(ckpt['strategy_mlp'])
+        self.optimizer.load_state_dict(ckpt['optimizer'])
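The fan-out in sample_batch() is bounded by the Semaphore(128) created in __init__, so B*M trajectories can be scheduled at once without flooding the vLLM server. A self-contained sketch of that concurrency pattern, with a fake _generate standing in for the real AsyncOpenAI call:

# Sketch of the bounded fan-out behind sample_batch(): schedule all
# trajectories, cap in-flight requests with a semaphore, gather with a
# progress bar. _generate is a stand-in for the real AsyncOpenAI call.
import asyncio
from tqdm.asyncio import tqdm_asyncio

SEM = asyncio.Semaphore(128)  # same limit as set in __init__ above

async def _generate(prompt: str) -> str:
    async with SEM:                # at most 128 requests in flight
        await asyncio.sleep(0.01)  # stand-in for the completions call
        return f"answer to: {prompt}"

async def sample_batch(queries, M):
    tasks = [_generate(q) for q in queries for _ in range(M)]
    return await tqdm_asyncio.gather(*tasks)  # one bar tick per trajectory

if __name__ == "__main__":
    print(asyncio.run(sample_batch(["q1", "q2"], M=2)))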
 
 
 
 
automr/strategies.py CHANGED
@@ -1,8 +1,6 @@
 # Meta Reasoning Strategy Prompts (from Table 2 in paper)
 META_STRATEGIES = {
     "Next": [
-        "Next,",
-        "Then,",
         "Now, let me move on to the next step."
     ],
     "Reflect": [
automr/trainer.py CHANGED
@@ -2,15 +2,14 @@ import random
 import torch
 from typing import List, Dict, Tuple
 from tqdm import tqdm
+import os
 
 from .model import AutoMR
 from .config import AutoMRConfig
 from .utils import check_answer_match, ensure_dir, save_json
-import os
-
 
 class AutoMRTrainer:
-    """Trainer for AutoMR using REINFORCE (Algorithm 2 from paper)"""
+    """Trainer for AutoMR using REINFORCE (sync trainer, async model calls)"""
 
     def __init__(self, model: AutoMR, config: AutoMRConfig):
         self.model = model
@@ -21,6 +20,9 @@ class AutoMRTrainer:
         self.global_step = 0
         self.best_val_reward = -float('inf')
         self.patience_counter = 0
+        # Exponential-moving-average baseline for the REINFORCE advantage (variance reduction)
+        self.baseline = self.config.initial_baseline
+        self.baseline_momentum = self.config.baseline_momentum
         self.training_history = {
             'train_loss': [],
             'train_reward': [],
@@ -31,7 +33,7 @@ class AutoMRTrainer:
 
     def compute_reward_batch(self, queries: List[str], answers: List[str]) -> Tuple[float, float]:
         """
-        Compute average reward and accuracy on a batch
+        Compute average reward and accuracy on a batch (async model calls)
         Returns: (avg_reward, accuracy)
         """
         total_reward = 0.0
@@ -42,16 +44,16 @@ class AutoMRTrainer:
         self.model.strategy_embeddings.eval()
 
         with torch.no_grad():
-            dags, _ = self.model.dynamic_skeleton_sampling(queries, M=1)
-            pred_answers = self.model.extract_answer(dags)
-
-            is_correct = [check_answer_match(
-                pred_answer, answer, self.config.task_type
-            ) for pred_answer, answer in zip(pred_answers, answers)]
-
-            rewards = [1.0 if correct else -1.0 for correct in is_correct]
-            total_reward += sum(rewards)
-            correct += sum(is_correct)
+            # M=1 for validation; call the async model via the sync wrapper
+            pred_answers, _ = self.model.sample_batch_sync(queries, M=1)
+
+        for pred_answer, answer in zip(pred_answers, answers):
+            is_correct = check_answer_match(pred_answer, answer, self.config.task_type)
+            if is_correct:
+                correct += 1
+                total_reward += 1.0
+            else:
+                total_reward += -1.0
 
         avg_reward = total_reward / total if total > 0 else 0.0
         accuracy = correct / total if total > 0 else 0.0
@@ -76,37 +78,48 @@ class AutoMRTrainer:
 
     def train_step(self, batch_queries: List[str], batch_answers: List[str]) -> Tuple[float, float]:
         """
-        Single training step using REINFORCE (Equation 4 from paper)
+        Single training step using REINFORCE with an EMA baseline
         Returns: (loss, avg_reward)
         """
         self.model.strategy_mlp.train()
         self.model.strategy_embeddings.train()
 
         M = self.config.num_samples_per_query
-        loss = []
+        loss_list = []
         rewards_list = []
-        # expand answers for M samples per query
-        batch_answers = [answer for answer in batch_answers for _ in range(M)]
-        batch_dags, batch_log_probs = self.model.dynamic_skeleton_sampling(batch_queries, M)
-        # Get prediction
-        batch_pred_answers = self.model.extract_answer(batch_dags)
-        # Compute reward
-        for pred_answer, answer, log_prob in zip(batch_pred_answers, batch_answers, batch_log_probs):
+
+        # 1. Sample trajectories: pred_answers [B*M], log_probs [B*M] (sync wrapper over the async model)
+        pred_answers, log_probs = self.model.sample_batch_sync(batch_queries, M)
+
+        # 2. Expand ground-truth answers for comparison
+        expanded_answers = [answer for answer in batch_answers for _ in range(M)]
+
+        # 3. Compute rewards
+        for pred_answer, answer, log_prob in zip(pred_answers, expanded_answers, log_probs):
             reward = 1.0 if check_answer_match(
                 pred_answer, answer, self.config.task_type
             ) else -1.0
             rewards_list.append(reward)
-
-            # Accumulate gradient (REINFORCE)
-            loss.append(-reward * log_prob)
-
-        # Compute average reward for this batch
+
+        # Batch average reward
         avg_reward = sum(rewards_list) / len(rewards_list) if rewards_list else 0.0
+
+        # 4. Update the sliding baseline: exponential moving average
+        self.baseline = (
+            self.baseline_momentum * self.baseline
+            + (1.0 - self.baseline_momentum) * avg_reward
+        )
+
+        # 5. Policy gradient with advantage: -(reward - baseline) * log_prob
+        for reward, log_prob in zip(rewards_list, log_probs):
+            advantage = reward - self.baseline
+            loss_list.append(-advantage * log_prob)
 
-        # Update parameters
+        # 6. Update parameters
         self.model.optimizer.zero_grad()
-        loss = torch.stack(loss).mean()
-
+        loss = torch.stack(loss_list).mean()
         loss.backward()
         torch.nn.utils.clip_grad_norm_(
             list(self.model.strategy_embeddings.parameters()) +
@@ -114,7 +127,6 @@ class AutoMRTrainer:
             max_norm=self.config.gradient_clip
         )
         self.model.optimizer.step()
-
         return loss.item(), avg_reward
 
     def should_stop_early(self) -> bool:
@@ -145,7 +157,7 @@ class AutoMRTrainer:
             print(f" 💾 Best checkpoint saved: {checkpoint_path}")
 
     def train(self, train_data: List[Dict[str, str]], val_data: List[Dict[str, str]]):
-        """Training loop with validation (Algorithm 2 from paper with validation)"""
+        """Training loop with validation"""
         print(f"\nStarting AutoMR training for {self.config.num_epochs} epochs...")
         print(f"Training samples: {len(train_data)}")
         print(f"Validation samples: {len(val_data)}")
@@ -160,18 +172,21 @@ class AutoMRTrainer:
             epoch_reward = 0.0
             num_batches = 0
 
+            batch_indices = list(range(0, len(train_data), self.config.batch_size))
+
             pbar = tqdm(
-                range(0, len(train_data), self.config.batch_size),
+                batch_indices,
                 desc=f"Epoch {epoch+1}/{self.config.num_epochs}"
             )
 
             for i in pbar:
-                batch = train_data[i:i+self.config.batch_size]
+                batch = train_data[i : i + self.config.batch_size]
                 batch_queries = [item['query'] for item in batch]
                 batch_answers = [item['answer'] for item in batch]
 
-                # Training step
+                # Training step (sync)
                 loss, avg_reward = self.train_step(batch_queries, batch_answers)
+
                 epoch_loss += loss
                 epoch_reward += avg_reward
                 num_batches += 1
@@ -221,9 +236,9 @@ class AutoMRTrainer:
                 print(f"Best validation reward: {self.best_val_reward:.4f}")
                 return
 
-            # End of epoch
-            avg_epoch_loss = epoch_loss / num_batches
-            avg_epoch_reward = epoch_reward / num_batches
+            # End-of-epoch summary
+            avg_epoch_loss = epoch_loss / max(num_batches, 1)
+            avg_epoch_reward = epoch_reward / max(num_batches, 1)
 
             self.training_history['train_loss'].append(avg_epoch_loss)
             self.training_history['train_reward'].append(avg_epoch_reward)
@@ -236,7 +251,7 @@ class AutoMRTrainer:
             print(f"Best Val Reward: {self.best_val_reward:.4f}")
             print(f"{'='*80}\n")
 
-            # Save checkpoint at end of epoch (if not save_best_only)
+            # Save checkpoint at end of epoch
             if not self.config.save_best_only:
                 self.save_checkpoint(epoch + 1)
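Put together, train_step() implements REINFORCE with a baseline: loss = -(reward - baseline) * log_prob, where the baseline is an EMA of past batch rewards. A tensor-only sketch of that computation under the config values added in this commit (the rewards and log-probs are illustrative stand-ins, not real trajectories):

# Tensor-only sketch of the REINFORCE-with-baseline update in train_step().
import torch

baseline, momentum = 0.0, 0.9  # config: initial_baseline, baseline_momentum
rewards = torch.tensor([1.0, -1.0, 1.0, 1.0])   # +1 correct / -1 wrong
log_probs = torch.randn(4, requires_grad=True)  # stand-in trajectory log-probs

avg_reward = rewards.mean().item()
baseline = momentum * baseline + (1.0 - momentum) * avg_reward  # EMA update

advantages = rewards - baseline                 # variance-reduced signal
loss = (-advantages * log_probs).mean()
loss.backward()                                 # gradients flow only through log_probs
print(f"baseline={baseline:.3f}, loss={loss.item():.3f}")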
257