Resolve merge conflicts in supernova/train.py

- Keep improved parameter formatting for TokenChunkDataset
- Use standard torch.cuda.amp.GradScaler initialization
- Implement proper validation with wikitext-2 validation split
- Maintain consistent code style and comments

Files changed (4)

supernova/__init__.py +15 -6
supernova/data.py +121 -105
supernova/reasoning_engine.py +320 -315
supernova/train.py +27 -34

supernova/__init__.py CHANGED Viewed

@@ -1,6 +1,15 @@
-__version__ = "0.1.0"
-from .config import ModelConfig
-from .model import SupernovaModel
-from .tools import ToolOrchestrator, MathEngine, SerperAPI
-from .reasoning_engine import EnhancedReasoningEngine, ReasoningType, ReasoningStep

+__version__ = "0.1.0"
+from .config import ModelConfig
+from .model import SupernovaModel
+from .tokenizer import load_gpt2_tokenizer
+from .data import TokenChunkDataset, load_sources_from_yaml, DataSource
+from .tools import ToolOrchestrator, MathEngine, SerperAPI
+from .reasoning_engine import EnhancedReasoningEngine, ReasoningType, ReasoningStep
+__all__ = [
+    'ModelConfig', 'SupernovaModel', 'load_gpt2_tokenizer',
+    'TokenChunkDataset', 'load_sources_from_yaml', 'DataSource',
+    'ToolOrchestrator', 'MathEngine', 'SerperAPI',
+    'EnhancedReasoningEngine', 'ReasoningType', 'ReasoningStep'
+]

supernova/data.py CHANGED Viewed

@@ -1,105 +1,121 @@
-import random
-from dataclasses import dataclass
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple
-import torch
-from torch.utils.data import IterableDataset
-from datasets import load_dataset
-from transformers import PreTrainedTokenizerBase
-import yaml
-@dataclass
-class DataSource:
-    name: str
-    hf_path: str
-    hf_name: Optional[str]
-    split: str
-    text_field: str
-    weight: int = 1
-    streaming: bool = True
-def load_sources_from_yaml(path: str) -> List[DataSource]:
-    with open(path, "r", encoding="utf-8") as f:
-        cfg = yaml.safe_load(f)
-    srcs = []
-    for s in cfg.get("sources", []):
-        srcs.append(DataSource(
-            name=s.get("name"),
-            hf_path=s.get("hf_path"),
-            hf_name=s.get("hf_name"),
-            split=s.get("split", "train"),
-            text_field=s.get("text_field", "text"),
-            weight=int(s.get("weight", 1)),
-            streaming=bool(s.get("streaming", True)),
-        ))
-    assert len(srcs) > 0, "No data sources configured"
-    return srcs
-def build_streams(sources: List[DataSource]) -> List[Iterator[Dict]]:
-    iters = []
-    for s in sources:
-        ds = load_dataset(s.hf_path, s.hf_name, split=s.split, streaming=s.streaming)
-        iters.append(iter(ds))
-    return iters
-def weighted_choice(weights: List[int]) -> int:
-    total = sum(weights)
-    r = random.randint(1, total)
-    acc = 0
-    for i, w in enumerate(weights):
-        acc += w
-        if r <= acc:
-            return i
-    return len(weights) - 1
-class TokenChunkDataset(IterableDataset):
-    def __init__(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        sources: List[DataSource],
-        seq_len: int,
-        eos_token_id: Optional[int] = None,
-    ):
-        super().__init__()
-        self.tok = tokenizer
-        self.sources = sources
-        self.seq_len = seq_len
-        self.eos_id = eos_token_id if eos_token_id is not None else getattr(tokenizer, "eos_token_id", None)
-        self.weights = [max(1, s.weight) for s in sources]
-    def _iter_texts(self) -> Iterator[str]:
-        iters = build_streams(self.sources)
-        while True:
-            i = weighted_choice(self.weights)
-            try:
-                row = next(iters[i])
-            except StopIteration:
-                # restart that iterator if streaming was False
-                iters[i] = build_streams([self.sources[i]])[0]
-                row = next(iters[i])
-            text = row.get(self.sources[i].text_field, None)
-            if isinstance(text, str) and len(text) > 0:
-                yield text
-    def _iter_token_ids(self) -> Iterator[int]:
-        for text in self._iter_texts():
-            ids = self.tok.encode(text)
-            if self.eos_id is not None:
-                ids.append(self.eos_id)
-            for t in ids:
-                yield t
-    def __iter__(self):
-        buf: List[int] = []
-        for tok_id in self._iter_token_ids():
-            buf.append(tok_id)
-            while len(buf) >= self.seq_len + 1:
-                x = torch.tensor(buf[: self.seq_len], dtype=torch.long)
-                y = torch.tensor(buf[1 : self.seq_len + 1], dtype=torch.long)
-                del buf[: self.seq_len]
-                yield x, y

+import random
+from dataclasses import dataclass
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple
+import torch
+from torch.utils.data import IterableDataset
+from datasets import load_dataset
+from transformers import PreTrainedTokenizerBase
+import yaml
+@dataclass
+class DataSource:
+    name: str
+    hf_path: str
+    hf_name: Optional[str]
+    split: str
+    text_field: str
+    weight: int = 1
+    streaming: bool = True
+def load_sources_from_yaml(path: str) -> List[DataSource]:
+    with open(path, "r", encoding="utf-8") as f:
+        cfg = yaml.safe_load(f)
+    srcs = []
+    for s in cfg.get("sources", []):
+        srcs.append(DataSource(
+            name=s.get("name"),
+            hf_path=s.get("hf_path"),
+            hf_name=s.get("hf_name"),
+            split=s.get("split", "train"),
+            text_field=s.get("text_field", "text"),
+            weight=int(s.get("weight", 1)),
+            streaming=bool(s.get("streaming", True)),
+        ))
+    assert len(srcs) > 0, "No data sources configured"
+    return srcs
+def build_streams(sources: List[DataSource]) -> List[Iterator[Dict]]:
+    iters = []
+    for s in sources:
+        ds = load_dataset(s.hf_path, s.hf_name, split=s.split, streaming=s.streaming)
+        iters.append(iter(ds))
+    return iters
+def weighted_choice(weights: List[int]) -> int:
+    total = sum(weights)
+    r = random.randint(1, total)
+    acc = 0
+    for i, w in enumerate(weights):
+        acc += w
+        if r <= acc:
+            return i
+    return len(weights) - 1
+class TokenChunkDataset(IterableDataset):
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        sources: List[DataSource],
+        seq_len: int,
+        eos_token_id: Optional[int] = None,
+    ):
+        super().__init__()
+        self.tok = tokenizer
+        self.sources = sources
+        self.seq_len = seq_len
+        self.eos_id = eos_token_id if eos_token_id is not None else getattr(tokenizer, "eos_token_id", None)
+        self.weights = [max(1, s.weight) for s in sources]
+    def _iter_texts(self) -> Iterator[str]:
+        iters = build_streams(self.sources)
+        while True:
+            i = weighted_choice(self.weights)
+            try:
+                row = next(iters[i])
+            except StopIteration:
+                try:
+                    ds = load_dataset(
+                        self.sources[i].hf_path,
+                        self.sources[i].hf_name,
+                        split=self.sources[i].split,
+                        streaming=self.sources[i].streaming
+                    )
+                    iters[i] = iter(ds)
+                    row = next(iters[i])
+                except (StopIteration, Exception) as e:
+                    print(f"Warning: Could not restart iterator for source {self.sources[i].name}: {e}")
+                    continue  # Skip this iteration and try next source
+            text = row.get(self.sources[i].text_field, None)
+            if isinstance(text, str) and len(text) > 0:
+                yield text
+    def _safe_encode(self, text: str) -> list:
+        try:
+            return self.tok.encode(text)
+        except Exception as e:
+            print(f"Encoding error for text: {text[:50]}... Error: {e}")
+            return []
+    def _iter_token_ids(self) -> Iterator[int]:
+        for text in self._iter_texts():
+            ids = self._safe_encode(text)
+            if self.eos_id is not None:
+                ids.append(self.eos_id)
+            for t in ids:
+                yield t
+    def __iter__(self):
+        buf: List[int] = []
+        for tok_id in self._iter_token_ids():
+            buf.append(tok_id)
+            while len(buf) >= self.seq_len + 1:
+                x = torch.tensor(buf[:self.seq_len], dtype=torch.long)
+                y = torch.tensor(buf[1:self.seq_len + 1], dtype=torch.long)
+                del buf[:self.seq_len]
+                yield x, y
+    def __len__(self):
+        # Provide approximate length for progress tracking
+        return 1000000  # Large number for streaming datasets

supernova/reasoning_engine.py CHANGED Viewed

@@ -1,315 +1,320 @@
-"""
-Enhanced Reasoning Engine for Supernova AI
-Provides sophisticated problem-solving capabilities through structured reasoning,
-multi-tool coordination, and knowledge synthesis.
-"""
-import re
-import json
-from typing import List, Dict, Any, Optional, Tuple
-from dataclasses import dataclass
-from enum import Enum
-from .tools import ToolOrchestrator, ToolCall
-class ReasoningType(Enum):
-    ANALYTICAL = "analytical"
-    CREATIVE = "creative"
-    COMPARATIVE = "comparative"
-    CAUSAL = "causal"
-    SEQUENTIAL = "sequential"
-    EVALUATIVE = "evaluative"
-@dataclass
-class ReasoningStep:
-    step_number: int
-    description: str
-    reasoning_type: ReasoningType
-    tool_needed: Optional[str] = None
-    query: Optional[str] = None
-    result: Optional[str] = None
-    confidence: float = 0.8
-@dataclass
-class KnowledgeDomain:
-    domain: str
-    confidence: float
-    sources: List[str]
-    key_facts: List[str]
-class EnhancedReasoningEngine:
-    """Advanced reasoning engine that mimics sophisticated AI reasoning patterns."""
-    def __init__(self, tool_orchestrator: ToolOrchestrator):
-        self.tools = tool_orchestrator
-        self.conversation_context = []
-        self.domain_expertise = {
-            'science': ['physics', 'chemistry', 'biology', 'mathematics', 'astronomy'],
-            'technology': ['programming', 'ai', 'computing', 'engineering', 'electronics'],
-            'humanities': ['history', 'literature', 'philosophy', 'psychology', 'sociology'],
-            'medicine': ['anatomy', 'pharmacology', 'diagnosis', 'treatment', 'research'],
-            'business': ['finance', 'management', 'economics', 'marketing', 'strategy'],
-            'arts': ['music', 'visual arts', 'design', 'architecture', 'performance']
-        }
-    def analyze_query_complexity(self, query: str) -> Dict[str, Any]:
-        """Analyze the complexity and requirements of a user query."""
-        complexity_indicators = {
-            'simple': ['what is', 'define', 'who is', 'when did'],
-            'moderate': ['how does', 'why does', 'explain', 'compare', 'analyze'],
-            'complex': ['evaluate', 'synthesize', 'create', 'design', 'solve for multiple', 'consider all factors']
-        }
-        domains_detected = []
-        for domain, keywords in self.domain_expertise.items():
-            if any(keyword in query.lower() for keyword in keywords):
-                domains_detected.append(domain)
-        complexity_level = 'simple'
-        for level, indicators in complexity_indicators.items():
-            if any(indicator in query.lower() for indicator in indicators):
-                complexity_level = level
-        requires_multi_step = any(phrase in query.lower() for phrase in [
-            'step by step', 'first...then', 'multiple', 'several', 'both', 'compare and contrast'
-        ])
-        return {
-            'complexity': complexity_level,
-            'domains': domains_detected,
-            'multi_step_needed': requires_multi_step,
-            'estimated_steps': min(5, len(domains_detected) + (2 if requires_multi_step else 1))
-        }
-    def decompose_complex_query(self, query: str, analysis: Dict[str, Any]) -> List[ReasoningStep]:
-        """Break down complex queries into manageable reasoning steps."""
-        steps = []
-        step_num = 1
-        # Step 1: Information Gathering
-        if analysis['complexity'] in ['moderate', 'complex']:
-            # Determine if we need current information
-            if any(term in query.lower() for term in ['current', 'latest', 'recent', 'today', '2024', '2025']):
-                steps.append(ReasoningStep(
-                    step_number=step_num,
-                    description="Gather current information from web sources",
-                    reasoning_type=ReasoningType.ANALYTICAL,
-                    tool_needed="serper",
-                    query=query
-                ))
-                step_num += 1
-            # Check if mathematical computation is needed
-            if any(term in query.lower() for term in ['calculate', 'compute', 'solve', 'derivative', 'integral']):
-                steps.append(ReasoningStep(
-                    step_number=step_num,
-                    description="Perform mathematical computation",
-                    reasoning_type=ReasoningType.ANALYTICAL,
-                    tool_needed="math_engine",
-                    query=query
-                ))
-                step_num += 1
-        # Step 2: Domain-specific analysis
-        for domain in analysis['domains']:
-            steps.append(ReasoningStep(
-                step_number=step_num,
-                description=f"Analyze from {domain} perspective",
-                reasoning_type=ReasoningType.ANALYTICAL,
-                tool_needed=None,  # Will use model generation with domain context
-                query=f"From a {domain} perspective: {query}"
-            ))
-            step_num += 1
-        # Step 3: Synthesis and evaluation
-        if analysis['complexity'] == 'complex':
-            steps.append(ReasoningStep(
-                step_number=step_num,
-                description="Synthesize information and provide comprehensive analysis",
-                reasoning_type=ReasoningType.EVALUATIVE,
-                tool_needed=None,
-                query=query
-            ))
-        return steps if steps else [ReasoningStep(1, "Direct response", ReasoningType.ANALYTICAL, query=query)]
-    def execute_reasoning_chain(self, steps: List[ReasoningStep], model, tokenizer) -> List[ReasoningStep]:
-        """Execute a chain of reasoning steps, using tools and model generation as needed."""
-        results = []
-        context_info = []
-        for step in steps:
-            if step.tool_needed:
-                # Use appropriate tool
-                tool_call = ToolCall(tool=step.tool_needed, query=step.query)
-                executed_call = self.tools.execute_tool_call(tool_call)
-                if executed_call.result:
-                    step.result = executed_call.result
-                    step.confidence = 0.9
-                    context_info.append(f"{step.description}: {executed_call.result}")
-                else:
-                    step.result = f"Tool execution failed: {executed_call.error}"
-                    step.confidence = 0.3
-            else:
-                # Use model generation with enhanced context
-                enhanced_context = self._build_enhanced_context(step, context_info)
-                try:
-                    response = self._generate_with_context(model, tokenizer, enhanced_context, step.query)
-                    step.result = response
-                    step.confidence = 0.7
-                    context_info.append(f"{step.description}: {response}")
-                except Exception as e:
-                    step.result = f"Generation failed: {str(e)}"
-                    step.confidence = 0.2
-            results.append(step)
-        return results
-    def _build_enhanced_context(self, step: ReasoningStep, context_info: List[str]) -> str:
-        """Build enhanced context for model generation."""
-        context_parts = [
-            "You are Supernova, an advanced AI assistant with deep expertise across multiple domains.",
-            "Apply sophisticated reasoning and provide comprehensive, nuanced responses.",
-            ""
-        ]
-        if context_info:
-            context_parts.extend([
-                "Previous analysis steps:",
-                *[f"- {info}" for info in context_info],
-                ""
-            ])
-        reasoning_guidance = {
-            ReasoningType.ANALYTICAL: "Analyze systematically, consider multiple factors, and provide evidence-based insights.",
-            ReasoningType.CREATIVE: "Think creatively, explore innovative solutions, and consider unconventional approaches.",
-            ReasoningType.COMPARATIVE: "Compare different perspectives, weigh pros and cons, and identify key differences.",
-            ReasoningType.CAUSAL: "Identify cause-and-effect relationships, trace underlying mechanisms, and explain why things happen.",
-            ReasoningType.SEQUENTIAL: "Break down into logical steps, show progression, and maintain clear sequencing.",
-            ReasoningType.EVALUATIVE: "Make judgments based on criteria, assess quality and effectiveness, and provide recommendations."
-        }
-        context_parts.extend([
-            f"Reasoning approach: {reasoning_guidance.get(step.reasoning_type, 'Provide thorough analysis.')}",
-            f"Focus area: {step.description}",
-            ""
-        ])
-        return "\n".join(context_parts)
-    def _generate_with_context(self, model, tokenizer, context: str, query: str, max_tokens: int = 400) -> str:
-        """Generate response using the model with enhanced context."""
-        full_prompt = f"{context}\nUser Query: {query}\n\nDetailed Response:"
-        # Use the existing generate function (simplified version)
-        model.eval()
-        device = next(model.parameters()).device
-        input_ids = tokenizer.encode(full_prompt, return_tensors="pt").to(device)
-        with torch.no_grad():
-            for _ in range(max_tokens):
-                if input_ids.size(1) >= model.cfg.n_positions:
-                    input_cond = input_ids[:, -model.cfg.n_positions:]
-                else:
-                    input_cond = input_ids
-                logits, _ = model(input_cond)
-                logits = logits[:, -1, :] / 0.8  # temperature
-                # Top-k sampling
-                v, _ = torch.topk(logits, min(50, logits.size(-1)))
-                logits[logits < v[:, [-1]]] = -float("Inf")
-                probs = torch.softmax(logits, dim=-1)
-                next_id = torch.multinomial(probs, num_samples=1)
-                input_ids = torch.cat([input_ids, next_id], dim=1)
-        response = tokenizer.decode(input_ids[0].tolist())
-        # Extract the response part
-        if "Detailed Response:" in response:
-            response = response.split("Detailed Response:", 1)[1].strip()
-        return response
-    def synthesize_final_response(self, steps: List[ReasoningStep], original_query: str) -> str:
-        """Synthesize all reasoning steps into a comprehensive final response."""
-        successful_steps = [step for step in steps if step.result and step.confidence > 0.5]
-        if not successful_steps:
-            return "I apologize, but I encountered difficulties processing your request. Could you please rephrase or provide more specific details?"
-        # Build comprehensive response
-        response_parts = []
-        # Add executive summary for complex queries
-        if len(successful_steps) > 2:
-            response_parts.append("Here's my comprehensive analysis:")
-            response_parts.append("")
-        # Include results from each step
-        for step in successful_steps:
-            if step.tool_needed in ['math_engine', 'serper']:
-                # Tool results are already well-formatted
-                response_parts.append(step.result)
-            else:
-                # Model-generated responses
-                response_parts.append(step.result)
-            response_parts.append("")
-        # Add synthesis for multi-step responses
-        if len(successful_steps) > 2:
-            confidence_score = sum(step.confidence for step in successful_steps) / len(successful_steps)
-            synthesis_parts = [
-                "**Key Insights:**",
-                "• Multiple perspectives have been considered",
-                f"• Analysis confidence: {confidence_score:.1%}",
-                "• Both current information and domain expertise were utilized"
-            ]
-            response_parts.extend(synthesis_parts)
-        return "\n".join(response_parts).strip()
-    def process_complex_query(self, query: str, model, tokenizer) -> str:
-        """Main method to process complex queries with enhanced reasoning."""
-        # Analyze query complexity and requirements
-        analysis = self.analyze_query_complexity(query)
-        # For simple queries, use direct processing
-        if analysis['complexity'] == 'simple' and not analysis['multi_step_needed']:
-            tool_call = self.tools.route_query(query)
-            if tool_call:
-                executed_call = self.tools.execute_tool_call(tool_call)
-                if executed_call.result:
-                    return executed_call.result
-            # Fall back to enhanced model generation
-            context = self._build_enhanced_context(
-                ReasoningStep(1, "Direct response", ReasoningType.ANALYTICAL),
-                []
-            )
-            return self._generate_with_context(model, tokenizer, context, query)
-        # For complex queries, use multi-step reasoning
-        reasoning_steps = self.decompose_complex_query(query, analysis)
-        executed_steps = self.execute_reasoning_chain(reasoning_steps, model, tokenizer)
-        return self.synthesize_final_response(executed_steps, query)
-# Import torch and other needed modules here to avoid import issues
-import torch
-try:
-    import sympy as sp
-    import numpy as np
-except ImportError:
-    pass

+"""
+Enhanced Reasoning Engine for Supernova AI
+Provides sophisticated problem-solving capabilities through structured reasoning,
+multi-tool coordination, and knowledge synthesis.
+"""
+import torch
+import numpy as np
+try:
+    import sympy as sp
+except ImportError:
+    sp = None
+import re
+import json
+from typing import List, Dict, Any, Optional, Tuple
+from dataclasses import dataclass
+from enum import Enum
+from .tools import ToolOrchestrator, ToolCall
+class ReasoningType(Enum):
+    ANALYTICAL = "analytical"
+    CREATIVE = "creative"
+    COMPARATIVE = "comparative"
+    CAUSAL = "causal"
+    SEQUENTIAL = "sequential"
+    EVALUATIVE = "evaluative"
+@dataclass
+class ReasoningStep:
+    step_number: int
+    description: str
+    reasoning_type: ReasoningType
+    tool_needed: Optional[str] = None
+    query: Optional[str] = None
+    result: Optional[str] = None
+    confidence: float = 0.8
+@dataclass
+class KnowledgeDomain:
+    domain: str
+    confidence: float
+    sources: List[str]
+    key_facts: List[str]
+class EnhancedReasoningEngine:
+    """Advanced reasoning engine that mimics sophisticated AI reasoning patterns."""
+    def __init__(self, tool_orchestrator: ToolOrchestrator):
+        self.tools = tool_orchestrator
+        self.conversation_context = []
+        self.domain_expertise = {
+            'science': ['physics', 'chemistry', 'biology', 'mathematics', 'astronomy'],
+            'technology': ['programming', 'ai', 'computing', 'engineering', 'electronics'],
+            'humanities': ['history', 'literature', 'philosophy', 'psychology', 'sociology'],
+            'medicine': ['anatomy', 'pharmacology', 'diagnosis', 'treatment', 'research'],
+            'business': ['finance', 'management', 'economics', 'marketing', 'strategy'],
+            'arts': ['music', 'visual arts', 'design', 'architecture', 'performance']
+        }
+    def analyze_query_complexity(self, query: str) -> Dict[str, Any]:
+        """Analyze the complexity and requirements of a user query."""
+        complexity_indicators = {
+            'simple': ['what is', 'define', 'who is', 'when did'],
+            'moderate': ['how does', 'why does', 'explain', 'compare', 'analyze'],
+            'complex': ['evaluate', 'synthesize', 'create', 'design', 'solve for multiple', 'consider all factors']
+        }
+        domains_detected = []
+        for domain, keywords in self.domain_expertise.items():
+            if any(keyword in query.lower() for keyword in keywords):
+                domains_detected.append(domain)
+        complexity_level = 'simple'
+        for level, indicators in complexity_indicators.items():
+            if any(indicator in query.lower() for indicator in indicators):
+                complexity_level = level
+        requires_multi_step = any(phrase in query.lower() for phrase in [
+            'step by step', 'first...then', 'multiple', 'several', 'both', 'compare and contrast'
+        ])
+        return {
+            'complexity': complexity_level,
+            'domains': domains_detected,
+            'multi_step_needed': requires_multi_step,
+            'estimated_steps': min(5, len(domains_detected) + (2 if requires_multi_step else 1))
+        }
+    def decompose_complex_query(self, query: str, analysis: Dict[str, Any]) -> List[ReasoningStep]:
+        """Break down complex queries into manageable reasoning steps."""
+        steps = []
+        step_num = 1
+        # Step 1: Information Gathering
+        if analysis['complexity'] in ['moderate', 'complex']:
+            # Determine if we need current information
+            if any(term in query.lower() for term in ['current', 'latest', 'recent', 'today', '2024', '2025']):
+                steps.append(ReasoningStep(
+                    step_number=step_num,
+                    description="Gather current information from web sources",
+                    reasoning_type=ReasoningType.ANALYTICAL,
+                    tool_needed="serper",
+                    query=query
+                ))
+                step_num += 1
+            # Check if mathematical computation is needed
+            if any(term in query.lower() for term in ['calculate', 'compute', 'solve', 'derivative', 'integral']):
+                steps.append(ReasoningStep(
+                    step_number=step_num,
+                    description="Perform mathematical computation",
+                    reasoning_type=ReasoningType.ANALYTICAL,
+                    tool_needed="math_engine",
+                    query=query
+                ))
+                step_num += 1
+        # Step 2: Domain-specific analysis
+        for domain in analysis['domains']:
+            steps.append(ReasoningStep(
+                step_number=step_num,
+                description=f"Analyze from {domain} perspective",
+                reasoning_type=ReasoningType.ANALYTICAL,
+                tool_needed=None,  # Will use model generation with domain context
+                query=f"From a {domain} perspective: {query}"
+            ))
+            step_num += 1
+        # Step 3: Synthesis and evaluation
+        if analysis['complexity'] == 'complex':
+            steps.append(ReasoningStep(
+                step_number=step_num,
+                description="Synthesize information and provide comprehensive analysis",
+                reasoning_type=ReasoningType.EVALUATIVE,
+                tool_needed=None,
+                query=query
+            ))
+        return steps if steps else [ReasoningStep(1, "Direct response", ReasoningType.ANALYTICAL, query=query)]
+    def execute_reasoning_chain(self, steps: List[ReasoningStep], model, tokenizer) -> List[ReasoningStep]:
+        """Execute a chain of reasoning steps, using tools and model generation as needed."""
+        results = []
+        context_info = []
+        for step in steps:
+            if step.tool_needed:
+                # Use appropriate tool
+                tool_call = ToolCall(tool=step.tool_needed, query=step.query)
+                executed_call = self.tools.execute_tool_call(tool_call)
+                if executed_call.result:
+                    step.result = executed_call.result
+                    step.confidence = 0.9
+                    context_info.append(f"{step.description}: {executed_call.result}")
+                else:
+                    step.result = f"Tool execution failed: {executed_call.error}"
+                    step.confidence = 0.3
+            else:
+                # Use model generation with enhanced context
+                enhanced_context = self._build_enhanced_context(step, context_info)
+                try:
+                    response = self._generate_with_context(model, tokenizer, enhanced_context, step.query)
+                    step.result = response
+                    step.confidence = 0.7
+                    context_info.append(f"{step.description}: {response}")
+                except Exception as e:
+                    step.result = f"Generation failed: {str(e)}"
+                    step.confidence = 0.2
+            results.append(step)
+        return results
+    def _build_enhanced_context(self, step: ReasoningStep, context_info: List[str]) -> str:
+        """Build enhanced context for model generation."""
+        context_parts = [
+            "You are Supernova, an advanced AI assistant with deep expertise across multiple domains.",
+            "Apply sophisticated reasoning and provide comprehensive, nuanced responses.",
+            ""
+        ]
+        if context_info:
+            context_parts.extend([
+                "Previous analysis steps:",
+                *[f"- {info}" for info in context_info],
+                ""
+            ])
+        reasoning_guidance = {
+            ReasoningType.ANALYTICAL: "Analyze systematically, consider multiple factors, and provide evidence-based insights.",
+            ReasoningType.CREATIVE: "Think creatively, explore innovative solutions, and consider unconventional approaches.",
+            ReasoningType.COMPARATIVE: "Compare different perspectives, weigh pros and cons, and identify key differences.",
+            ReasoningType.CAUSAL: "Identify cause-and-effect relationships, trace underlying mechanisms, and explain why things happen.",
+            ReasoningType.SEQUENTIAL: "Break down into logical steps, show progression, and maintain clear sequencing.",
+            ReasoningType.EVALUATIVE: "Make judgments based on criteria, assess quality and effectiveness, and provide recommendations."
+        }
+        context_parts.extend([
+            f"Reasoning approach: {reasoning_guidance.get(step.reasoning_type, 'Provide thorough analysis.')}",
+            f"Focus area: {step.description}",
+            ""
+        ])
+        return "\n".join(context_parts)
+    def _generate_with_context(self, model, tokenizer, context: str, query: str, max_tokens: int = 400) -> str:
+        """Generate response using the model with enhanced context."""
+        full_prompt = f"{context}\nUser Query: {query}\n\nDetailed Response:"
+        # Use the existing generate function (simplified version)
+        model.eval()
+        device = next(model.parameters()).device
+        input_ids = tokenizer.encode(full_prompt, return_tensors="pt").to(device)
+        with torch.no_grad():
+            for _ in range(max_tokens):
+                if input_ids.size(1) >= model.cfg.n_positions:
+                    input_cond = input_ids[:, -model.cfg.n_positions:]
+                else:
+                    input_cond = input_ids
+                logits, _ = model(input_cond)
+                logits = logits[:, -1, :] / 0.8  # temperature
+                # Top-k sampling
+                v, _ = torch.topk(logits, min(50, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float("Inf")
+                probs = torch.softmax(logits, dim=-1)
+                next_id = torch.multinomial(probs, num_samples=1)
+                input_ids = torch.cat([input_ids, next_id], dim=1)
+        response = tokenizer.decode(input_ids[0].tolist())
+        # Extract the response part
+        if "Detailed Response:" in response:
+            response = response.split("Detailed Response:", 1)[1].strip()
+        return response
+    def synthesize_final_response(self, steps: List[ReasoningStep], original_query: str) -> str:
+        """Synthesize all reasoning steps into a comprehensive final response."""
+        successful_steps = [step for step in steps if step.result and step.confidence > 0.5]
+        if not successful_steps:
+            return "I apologize, but I encountered difficulties processing your request. Could you please rephrase or provide more specific details?"
+        # Build comprehensive response
+        response_parts = []
+        # Add executive summary for complex queries
+        if len(successful_steps) > 2:
+            response_parts.append("Here's my comprehensive analysis:")
+            response_parts.append("")
+        # Include results from each step
+        for step in successful_steps:
+            if step.tool_needed in ['math_engine', 'serper']:
+                # Tool results are already well-formatted
+                response_parts.append(step.result)
+            else:
+                # Model-generated responses
+                response_parts.append(step.result)
+            response_parts.append("")
+        # Add synthesis for multi-step responses
+        if len(successful_steps) > 2:
+            confidence_score = sum(step.confidence for step in successful_steps) / len(successful_steps)
+            synthesis_parts = [
+                "**Key Insights:**",
+                "• Multiple perspectives have been considered",
+                f"• Analysis confidence: {confidence_score:.1%}",
+                "• Both current information and domain expertise were utilized"
+            ]
+            response_parts.extend(synthesis_parts)
+        return "\n".join(response_parts).strip()
+    def process_complex_query(self, query: str, model, tokenizer) -> str:
+        """Main method to process complex queries with enhanced reasoning."""
+        # Analyze query complexity and requirements
+        analysis = self.analyze_query_complexity(query)
+        # For simple queries, use direct processing
+        if analysis['complexity'] == 'simple' and not analysis['multi_step_needed']:
+            tool_call = self.tools.route_query(query)
+            if tool_call:
+                executed_call = self.tools.execute_tool_call(tool_call)
+                if executed_call.result:
+                    return executed_call.result
+            # Fall back to enhanced model generation
+            context = self._build_enhanced_context(
+                ReasoningStep(1, "Direct response", ReasoningType.ANALYTICAL),
+                []
+            )
+            return self._generate_with_context(model, tokenizer, context, query)
+        # For complex queries, use multi-step reasoning
+        reasoning_steps = self.decompose_complex_query(query, analysis)
+        executed_steps = self.execute_reasoning_chain(reasoning_steps, model, tokenizer)
+        return self.synthesize_final_response(executed_steps, query)
+# Import torch and other needed modules here to avoid import issues
+import torch
+try:
+    import sympy as sp
+    import numpy as np
+except ImportError:
+    pass

supernova/train.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# train.py (improved)
 import argparse
 import json
 import math
@@ -15,11 +14,11 @@ from transformers import get_cosine_schedule_with_warmup
 from .config import ModelConfig
 from .model import SupernovaModel
 from .tokenizer import load_gpt2_tokenizer
-from .data import load_sources_from_yaml, TokenChunkDataset
-# -----------------------
 # Utilities
-# -----------------------
 def compute_grad_norm(model: nn.Module) -> float:
     total = 0.0
     for p in model.parameters():
@@ -61,9 +60,9 @@ class EMA:
                 p.data.copy_(self.backup[name])
         del self.backup
-# -----------------------
 # Training loop
-# -----------------------
 def train(
     config_path: str,
     data_config_path: str,
@@ -145,12 +144,12 @@ def train(
         seq_len=seq_len,
         eos_token_id=tok.eos_token_id
     )
     sampler = DistributedSampler(ds) if ddp else None
     dl = DataLoader(
         ds,
         batch_size=batch_size,
-        shuffle=(sampler is None),
         sampler=sampler,
         num_workers=num_workers,
         pin_memory=pin_memory,
@@ -158,7 +157,7 @@ def train(
         drop_last=True,
     )
-    # optimizer with simple parameter grouping example to avoid weight decay on norms/bias
     def param_groups(model):
         decay, no_decay = [], []
         for n, p in model.named_parameters():
@@ -174,25 +173,17 @@ def train(
         ]
     optimizer = torch.optim.AdamW(param_groups(model), lr=lr, betas=(0.9, 0.95), eps=1e-8)
-    # scheduler
     scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps)
     # AMP scaler
-    if device.type == "cuda":
-        scaler = torch.amp.GradScaler('cuda', enabled=True)
-    else:
-        scaler = torch.amp.GradScaler('cpu', enabled=False)
     # EMA
     ema = EMA(model if not ddp else model.module, decay=ema_decay) if use_ema else None
-    # logging + checkpoint dir
     os.makedirs(out_dir, exist_ok=True)
     writer = SummaryWriter(log_dir=os.path.join(out_dir, "runs")) if use_tensorboard and (not ddp or local_rank == 0) else None
-    # validation dataset (simple split: user should provide a separate validation YAML ideally)
-    # TODO: Implement a proper validation dataset pipeline. For now, we use a small random subset of training data.
     val_ds = None
     val_dl = None
@@ -202,7 +193,6 @@ def train(
     if resume_from and os.path.exists(resume_from):
         ckpt = torch.load(resume_from, map_location=device)
         model_state = ckpt["model_state_dict"]
-        # if ddp, load into module
         target = model.module if ddp else model
         target.load_state_dict(model_state)
         optimizer.load_state_dict(ckpt.get("optimizer_state_dict", {}))
@@ -221,12 +211,12 @@ def train(
     running_loss = 0.0
     t0 = time.time()
     no_improve_steps = 0
-    early_stop_patience = 10_000  # you can tune this
     # training loop
     while step < max_steps:
         if sampler is not None:
-            sampler.set_epoch(step)  # shuffle differently per epoch for DDP
         for batch in dl:
             x, y = batch
@@ -243,7 +233,6 @@ def train(
             running_loss += loss.item()
             if micro % grad_accum == 0:
-                # gradient clipping
                 if clip_grad_norm is not None:
                     scaler.unscale_(optimizer)
                     torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
@@ -255,7 +244,6 @@ def train(
                 if ema:
                     ema.update(model if not ddp else model.module)
                 step += 1
                 # logging
@@ -275,11 +263,20 @@ def train(
                 # periodic validation
                 if validate_every and step % validate_every == 0:
                     if val_dl is None:
-                        # quick in-memory val split: take first N batches (user should replace with real val)
-                        # NOTE: for production, create a dedicated validation dataset.
-                        val_sources = sources[: max(1, len(sources) // 20)]
-                        if not val_sources:
-                            val_sources = sources[:1]  # fallback to at least one source
                         val_ds = TokenChunkDataset(
                             tokenizer=tok,
                             sources=val_sources,
@@ -289,7 +286,6 @@ def train(
                         val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True, drop_last=False)
                     model.eval()
-                    # optionally swap in EMA weights for evaluation
                     if ema:
                         ema.store(model if not ddp else model.module)
                         ema.copy_to(model if not ddp else model.module)
@@ -310,12 +306,10 @@ def train(
                         writer.add_scalar("val/loss", mean_val, step)
                     print(f"[eval] step={step} val_loss={mean_val:.6f}")
-                    # restore weights
                     if ema:
                         ema.restore(model if not ddp else model.module)
                     model.train()
-                    # early stop / best model saving
                     if mean_val < best_val_loss:
                         best_val_loss = mean_val
                         no_improve_steps = 0
@@ -331,7 +325,7 @@ def train(
                         }
                         if not ddp or local_rank == 0:
                             atomic_save(ckpt, best_path)
-                            print(f"Saved best checkpoint to {best_path}")
                     else:
                         no_improve_steps += validate_every
                         if no_improve_steps >= early_stop_patience:
@@ -378,7 +372,6 @@ def train(
     if writer:
         writer.close()
 if __name__ == "__main__":
     ap = argparse.ArgumentParser()
     ap.add_argument("--config", required=True)

 import argparse
 import json
 import math
 from .config import ModelConfig
 from .model import SupernovaModel
 from .tokenizer import load_gpt2_tokenizer
+from .data import load_sources_from_yaml, TokenChunkDataset, DataSource
+# ------------------------------
 # Utilities
+# ------------------------------
 def compute_grad_norm(model: nn.Module) -> float:
     total = 0.0
     for p in model.parameters():
                 p.data.copy_(self.backup[name])
         del self.backup
+# ------------------------------
 # Training loop
+# ------------------------------
 def train(
     config_path: str,
     data_config_path: str,
         seq_len=seq_len,
         eos_token_id=tok.eos_token_id
     )
     sampler = DistributedSampler(ds) if ddp else None
+    # NOTE: NO shuffle for IterableDataset!
     dl = DataLoader(
         ds,
         batch_size=batch_size,
         sampler=sampler,
         num_workers=num_workers,
         pin_memory=pin_memory,
         drop_last=True,
     )
+    # optimizer
     def param_groups(model):
         decay, no_decay = [], []
         for n, p in model.named_parameters():
         ]
     optimizer = torch.optim.AdamW(param_groups(model), lr=lr, betas=(0.9, 0.95), eps=1e-8)
     scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps)
     # AMP scaler
+    scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))
     # EMA
     ema = EMA(model if not ddp else model.module, decay=ema_decay) if use_ema else None
     os.makedirs(out_dir, exist_ok=True)
     writer = SummaryWriter(log_dir=os.path.join(out_dir, "runs")) if use_tensorboard and (not ddp or local_rank == 0) else None
+    # validation
     val_ds = None
     val_dl = None
     if resume_from and os.path.exists(resume_from):
         ckpt = torch.load(resume_from, map_location=device)
         model_state = ckpt["model_state_dict"]
         target = model.module if ddp else model
         target.load_state_dict(model_state)
         optimizer.load_state_dict(ckpt.get("optimizer_state_dict", {}))
     running_loss = 0.0
     t0 = time.time()
     no_improve_steps = 0
+    early_stop_patience = 10_000 # you can tune this
     # training loop
     while step < max_steps:
         if sampler is not None:
+            sampler.set_epoch(step) # shuffle differently per epoch for DDP
         for batch in dl:
             x, y = batch
             running_loss += loss.item()
             if micro % grad_accum == 0:
                 if clip_grad_norm is not None:
                     scaler.unscale_(optimizer)
                     torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
                 if ema:
                     ema.update(model if not ddp else model.module)
                 step += 1
                 # logging
                 # periodic validation
                 if validate_every and step % validate_every == 0:
                     if val_dl is None:
+                        # Validate on the wikitext-2 validation split rather than on a slice of the training sources.
+                        # NOTE(review): every entry built below points at the same split; only `name` differs per source.
+                        val_sources = []
+                        for source in sources[:min(3, len(sources))]:
+                            val_source = DataSource(
+                                name=f"{source.name}_val",
+                                hf_path="wikitext",
+                                hf_name="wikitext-2-v1",
+                                split="validation",
+                                text_field="text",
+                                weight=1,
+                                streaming=False
+                            )
+                            val_sources.append(val_source)
                         val_ds = TokenChunkDataset(
                             tokenizer=tok,
                             sources=val_sources,
                         val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True, drop_last=False)
                     model.eval()
                     if ema:
                         ema.store(model if not ddp else model.module)
                         ema.copy_to(model if not ddp else model.module)
                         writer.add_scalar("val/loss", mean_val, step)
                     print(f"[eval] step={step} val_loss={mean_val:.6f}")
                     if ema:
                         ema.restore(model if not ddp else model.module)
                     model.train()
                     if mean_val < best_val_loss:
                         best_val_loss = mean_val
                         no_improve_steps = 0
                         }
                         if not ddp or local_rank == 0:
                             atomic_save(ckpt, best_path)
+                        print(f"Saved best checkpoint to {best_path}")
                     else:
                         no_improve_steps += validate_every
                         if no_improve_steps >= early_stop_patience:
     if writer:
         writer.close()
 if __name__ == "__main__":
     ap = argparse.ArgumentParser()
     ap.add_argument("--config", required=True)