OzTianlu commited on
Commit
ec2aaa8
·
verified ·
1 Parent(s): e03da02

Upload 4 files

Browse files
Files changed (4) hide show
  1. AsteriskForCausalLM.py +473 -0
  2. README.md +467 -3
  3. handler.py +126 -0
  4. requirements.txt +3 -0
AsteriskForCausalLM.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hybrid ASPP-Attention Architecture (Asterisk Model)
3
+ Combines Adjacency-Structured Parallel Propagation (ASPP) with standard attention mechanisms
4
+ to enhance model expressiveness while maintaining efficiency.
5
+
6
+ Architecture Design:
7
+ - Hybrid layers: Standard attention + ASPP operator in parallel
8
+ - Gate mechanism for dynamic fusion
9
+ - Knowledge distillation from SmolLM2-135M base model
10
+ """
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel
16
+ from transformers.models.llama.modeling_llama import (
17
+ LlamaAttention,
18
+ LlamaDecoderLayer,
19
+ LlamaRMSNorm,
20
+ LlamaMLP,
21
+ )
22
+ from transformers import AutoConfig, AutoModelForCausalLM
23
+ from typing import Optional, Tuple, List
24
+
25
+
26
class AsteriskConfig(LlamaConfig):
    """Configuration for the Asterisk hybrid ASPP-Attention model.

    Extends ``LlamaConfig`` with the ASPP and pi-flow hyperparameters used
    by the hybrid decoder layers. Registered under ``model_type="asterisk"``.
    """

    model_type = "asterisk"

    def __init__(
        self,
        hybrid_layer_indices: Optional[List[int]] = None,
        aspp_hidden_dim: Optional[int] = None,
        aspp_num_steps: int = 2,
        aspp_dropout: float = 0.1,
        aspp_num_neighbors: int = 1,  # Fixed at 1 for Union-Find (only parent)
        # π-flow parameters
        pi_flow: bool = False,
        pi_flow_steps: int = 1,
        pi_flow_scale: float = 0.2,
        pi_flow_use_gate: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # ASPP branch settings
        self.hybrid_layer_indices = hybrid_layer_indices
        self.aspp_hidden_dim = aspp_hidden_dim
        self.aspp_num_steps = aspp_num_steps
        self.aspp_dropout = aspp_dropout
        self.aspp_num_neighbors = aspp_num_neighbors
        # π-flow refinement settings
        self.pi_flow = pi_flow
        self.pi_flow_steps = pi_flow_steps
        self.pi_flow_scale = pi_flow_scale
        self.pi_flow_use_gate = pi_flow_use_gate
58
+
59
+
60
class ASPPOperator(nn.Module):
    """
    Asterisk Operator (ASPP) - Union-Find Graph Propagation.

    Each position keeps a single parent pointer forming a linear chain
    (``parent[i] = max(0, i - 1)``, position 0 is its own root) and
    repeatedly fuses its own features with its parent's features:

        h_i^(t+1) = phi(h_i^(t), h_parent[i])

    The "gather" is plain integer indexing, so each propagation step is
    O(n) in sequence length. ``compute_parent_indices`` is the hook point
    for dynamic union operations (semantic similarity, learned grouping).

    Args:
        hidden_size: Dimension of the input/output hidden states.
        aspp_hidden_dim: Internal ASPP dimension (None = use hidden_size).
            When it differs from ``hidden_size``, a down/up projection
            bottleneck is added around the propagation steps.
        num_steps: Maximum number of evolution steps K (default: 2).
        dropout: Dropout rate for regularization (default: 0.1).
        num_neighbors: Ignored; fixed at 1 (parent only) for Union-Find.
    """

    def __init__(self, hidden_size: int, aspp_hidden_dim: Optional[int] = None, num_steps: int = 2, dropout: float = 0.1, num_neighbors: int = 1):
        super().__init__()
        self.hidden_size = hidden_size
        self.aspp_hidden_dim = aspp_hidden_dim or hidden_size
        self.num_steps = num_steps
        self.num_neighbors = 1  # Fixed: only parent

        # Optional bottleneck projection when a smaller internal dim is requested.
        self.use_projection = (self.aspp_hidden_dim != hidden_size)
        if self.use_projection:
            self.down_proj = nn.Linear(hidden_size, self.aspp_hidden_dim)
            self.up_proj = nn.Linear(self.aspp_hidden_dim, hidden_size)
            self.proj_dropout = nn.Dropout(dropout)

        # Message aggregation: maps concat(self, parent) -> update.
        self.message_net = nn.Sequential(
            nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim * 2),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim),
            nn.Dropout(dropout),
        )

        # Learnable effective step count: k = sigmoid(k_logit) * num_steps.
        self.k_logit = nn.Parameter(torch.tensor(1.0))

        # Learnable residual scale keeps early updates small for stability.
        self.residual_scale = nn.Parameter(torch.tensor(0.1))

        # Normalization applied after every propagation step.
        self.norm = nn.LayerNorm(self.aspp_hidden_dim, eps=1e-5)

    def compute_parent_indices(self, seq_len: int, device) -> torch.Tensor:
        """
        Return the parent index of every position.

        Linear-chain initialization: ``parent[i] = max(0, i - 1)``; position
        0 points to itself (root). Can be extended with dynamic union
        operations (semantic similarity, positional heuristics, learned
        grouping).

        Args:
            seq_len: Sequence length (may be 0; returns an empty tensor).
            device: Device to allocate the index tensor on.

        Returns:
            [seq_len] LongTensor of parent indices.
        """
        parent_indices = torch.arange(seq_len, device=device) - 1
        # Guard: an empty sequence has no root position to fix up
        # (the original indexed parent_indices[0] unconditionally and
        # raised IndexError for seq_len == 0).
        if seq_len > 0:
            parent_indices[0] = 0  # Root points to itself
            parent_indices = torch.clamp(parent_indices, 0, seq_len - 1)
        return parent_indices

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Run K steps of parent-chain message passing.

        Args:
            hidden_states: [batch_size, seq_len, hidden_size]
        Returns:
            evolved_states: [batch_size, seq_len, hidden_size]
        """
        batch_size, seq_len, _ = hidden_states.shape

        # Project to the internal dimension if a bottleneck is configured.
        if self.use_projection:
            h_t = self.proj_dropout(self.down_proj(hidden_states))
        else:
            h_t = hidden_states

        # Learnable number of steps, clamped to [1, num_steps].
        k_steps = max(1, int(torch.sigmoid(self.k_logit) * self.num_steps))

        # The parent structure is static across steps, so compute it once
        # (hoisted out of the loop; previously recomputed every iteration).
        parent_indices = self.compute_parent_indices(seq_len, h_t.device)  # [L]

        for _ in range(k_steps):
            # Gather parent features via plain indexing: [B, L, D].
            parent_features = h_t[:, parent_indices, :]

            # Message passing: combine self + parent features.
            message_input = torch.cat([h_t, parent_features], dim=-1)  # [B, L, 2D]
            update = self.message_net(message_input)  # [B, L, D]

            # Scaled residual connection + norm for stability.
            h_t = self.norm(h_t + self.residual_scale * update)

        # Project back to the original dimension if needed.
        if self.use_projection:
            h_t = self.proj_dropout(self.up_proj(h_t))

        return h_t
184
+
185
+
186
class HybridASPPAttentionLayer(LlamaDecoderLayer):
    """
    Hybrid decoder layer combining the ASPP operator and standard attention.
    Inherits from LlamaDecoderLayer to stay compatible with the Llama stack.

    Pipeline:
      1. Parallel branches over the normed input:
         - ASPP operator (local, Union-Find structured reasoning)
         - Standard LlamaAttention (global context)
      2. Gated fusion of both branch outputs.
      3. π-flow refinement (optional, per-layer Euler steps).
      4. Feed-forward network.
    """

    def __init__(self, config: LlamaConfig, layer_idx: int, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 1):
        # Initialize parent LlamaDecoderLayer (self_attn, mlp, layernorms).
        super().__init__(config, layer_idx)

        # ASPP branch.
        self.aspp_operator = ASPPOperator(
            hidden_size=config.hidden_size,
            aspp_hidden_dim=aspp_hidden_dim,
            num_steps=aspp_num_steps,
            dropout=aspp_dropout,
            num_neighbors=aspp_num_neighbors,
        )

        # Gated fusion mechanism with dropout.
        self.fusion_gate = nn.Sequential(
            nn.Linear(config.hidden_size * 2, config.hidden_size),
            nn.Dropout(aspp_dropout),
            nn.Sigmoid(),
        )

        # Start balanced: sigmoid(0) = 0.5 weighs both branches equally.
        with torch.no_grad():
            self.fusion_gate[0].bias.fill_(0.0)

        # π-flow: per-layer refinement ASPP (optional).
        if getattr(config, 'pi_flow', False):
            self.pi_flow_aspp = ASPPOperator(
                hidden_size=config.hidden_size,
                aspp_hidden_dim=aspp_hidden_dim,
                num_steps=aspp_num_steps,
                dropout=aspp_dropout,
                num_neighbors=aspp_num_neighbors,
            )

            # BUGFIX: cache the step count at init time. LlamaDecoderLayer
            # does not keep a `config` attribute and callers never pass one
            # in kwargs, so the previous runtime lookup
            # (`getattr(self.config ... kwargs.get('config'), ...)`)
            # silently fell back to 1 step regardless of config.pi_flow_steps.
            self.pi_flow_steps = getattr(config, 'pi_flow_steps', 1)

            # Learnable flow scale (per-layer).
            self.pi_flow_scale = nn.Parameter(
                torch.tensor(getattr(config, 'pi_flow_scale', 0.2))
            )

            # Token-wise adaptive gating (optional).
            if getattr(config, 'pi_flow_use_gate', True):
                self.pi_flow_gate = nn.Sequential(
                    nn.Linear(config.hidden_size, config.hidden_size // 4),
                    nn.SiLU(),
                    nn.Dropout(aspp_dropout),
                    nn.Linear(config.hidden_size // 4, 1),
                    nn.Sigmoid(),
                )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values=None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Override LlamaDecoderLayer.forward to add the ASPP branch and π-flow.

        Returns a single hidden-states tensor, like LlamaDecoderLayer.
        """
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # ASPP branch.
        aspp_output = self.aspp_operator(hidden_states)

        # Attention branch - parent's self_attn returns a tuple; the cache
        # element is discarded here.
        attn_output, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )

        # Gated fusion: gate * ASPP + (1 - gate) * Attention.
        fusion_input = torch.cat([aspp_output, attn_output], dim=-1)
        gate = self.fusion_gate(fusion_input)
        fused_output = gate * aspp_output + (1 - gate) * attn_output

        # Residual connection.
        hidden_states = residual + fused_output

        # π-flow: multi-step refinement h <- h + α * v(h) (per-layer).
        if hasattr(self, 'pi_flow_aspp'):
            # getattr fallback keeps old checkpoints (without the cached
            # attribute) loadable with the previous 1-step behavior.
            for _ in range(getattr(self, 'pi_flow_steps', 1)):
                # Velocity field v(h) from the refinement ASPP.
                v = self.pi_flow_aspp(hidden_states)

                # Adaptive per-token flow strength, if gating is enabled.
                if hasattr(self, 'pi_flow_gate'):
                    alpha = self.pi_flow_scale * self.pi_flow_gate(hidden_states)  # [B, L, 1]
                else:
                    alpha = self.pi_flow_scale

                # Euler step.
                hidden_states = hidden_states + alpha * v

        # MLP block (parent's mlp).
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        # Return only the hidden_states tensor, like LlamaDecoderLayer.
        return hidden_states
316
+
317
+
318
class AsteriskLlamaModel(LlamaModel):
    """
    Llama backbone whose decoder layers are replaced with hybrid
    ASPP+Attention layers.

    By default every layer becomes hybrid (maximum expressiveness); pass
    ``hybrid_layer_indices`` to restrict the replacement to a subset.
    """

    def __init__(self, config: LlamaConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 2):
        super().__init__(config)

        # Default: make ALL layers hybrid.
        if hybrid_layer_indices is None:
            hybrid_layer_indices = list(range(config.num_hidden_layers))

        self.hybrid_layer_indices = hybrid_layer_indices

        # Swap the selected layers for hybrid ones (with per-layer π-flow
        # if enabled in the config); indices beyond the layer count are
        # silently skipped.
        num_layers = len(self.layers)
        for idx in hybrid_layer_indices:
            if idx >= num_layers:
                continue
            self.layers[idx] = HybridASPPAttentionLayer(
                config,
                layer_idx=idx,
                aspp_hidden_dim=aspp_hidden_dim,
                aspp_num_steps=aspp_num_steps,
                aspp_dropout=aspp_dropout,
                aspp_num_neighbors=aspp_num_neighbors,
            )

        # Re-run weight init so the freshly added ASPP modules are initialized.
        self.post_init()
350
+
351
+
352
class AsteriskForCausalLM(LlamaForCausalLM):
    """
    Asterisk Causal LM: Llama LM head over the hybrid ASPP-Attention backbone.

    Registered as: AsteriskForCausalLM (model_type "asterisk").
    """

    config_class = AsteriskConfig

    def __init__(self, config: AsteriskConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: Optional[int] = None, aspp_dropout: Optional[float] = None, aspp_num_neighbors: Optional[int] = None):
        # Resolve ASPP parameters: explicit arguments take precedence, the
        # config fills in anything left as None.
        # BUGFIX: previously aspp_num_steps / aspp_dropout / aspp_num_neighbors
        # from the config unconditionally overrode explicitly passed arguments
        # (inconsistent with the None-fallback used for hybrid_layer_indices
        # and aspp_hidden_dim). The defaults are now None sentinels so the
        # config only applies when the caller did not specify a value.
        if hybrid_layer_indices is None:
            hybrid_layer_indices = getattr(config, 'hybrid_layer_indices', None)
        if aspp_hidden_dim is None:
            aspp_hidden_dim = getattr(config, 'aspp_hidden_dim', None)
        if aspp_num_steps is None:
            aspp_num_steps = getattr(config, 'aspp_num_steps', 2)
        if aspp_dropout is None:
            aspp_dropout = getattr(config, 'aspp_dropout', 0.1)
        if aspp_num_neighbors is None:
            aspp_num_neighbors = getattr(config, 'aspp_num_neighbors', 2)

        super().__init__(config)

        # Replace the plain Llama backbone with the hybrid Asterisk backbone.
        self.model = AsteriskLlamaModel(config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout, aspp_num_neighbors)

        # Store hybrid layer info in config for serialization (None means
        # "all layers" and round-trips correctly through AsteriskLlamaModel).
        self.config.hybrid_layer_indices = hybrid_layer_indices

        # Initialize weights.
        self.post_init()

    @classmethod
    def from_pretrained_base(
        cls,
        base_model_path: str,
        config: Optional[AsteriskConfig] = None,  # Accept pre-configured config
        hybrid_layer_indices: Optional[List[int]] = None,
        aspp_hidden_dim: Optional[int] = None,
        aspp_num_steps: int = 2,
        aspp_dropout: float = 0.1,
        aspp_num_neighbors: int = 1,  # Fixed at 1 for Union-Find (only parent)
        # π-flow parameters
        pi_flow: bool = False,
        pi_flow_steps: int = 1,
        pi_flow_scale: float = 0.2,
        pi_flow_use_gate: bool = True,
        **kwargs
    ):
        """
        Load a base Llama model and convert it to the Asterisk architecture.

        Args:
            base_model_path: Path to the base model checkpoint.
            config: Pre-configured AsteriskConfig (if provided, the other
                ASPP/π-flow parameters below are ignored).
            hybrid_layer_indices: Which layers to make hybrid (None for all).
            aspp_hidden_dim: Internal ASPP dimension (None = model hidden_size).
            aspp_num_steps: Number of ASPP evolution steps K (default: 2).
            aspp_dropout: Dropout rate for ASPP regularization (default: 0.1).
            aspp_num_neighbors: Fixed at 1 (only parent) for Union-Find.
            pi_flow: Enable π-flow refinement (default: False).
            pi_flow_steps: Number of flow refinement steps (default: 1).
            pi_flow_scale: Initial flow scale parameter (default: 0.2).
            pi_flow_use_gate: Use token-wise adaptive gating (default: True).

        Returns:
            Tuple of (asterisk_model, base_model); the base model is returned
            so callers can use it e.g. as a distillation teacher.
        """
        # Load the base model; its weights seed the non-hybrid parts.
        base_model = LlamaForCausalLM.from_pretrained(base_model_path, **kwargs)
        base_config = base_model.config

        if config is not None:
            # Use the pre-configured config as-is.
            asterisk_config = config
        else:
            # Derive an Asterisk config from the base config plus the
            # ASPP / π-flow parameters.
            asterisk_config = AsteriskConfig(
                **base_config.to_dict(),
                hybrid_layer_indices=hybrid_layer_indices,
                aspp_hidden_dim=aspp_hidden_dim,
                aspp_num_steps=aspp_num_steps,
                aspp_dropout=aspp_dropout,
                aspp_num_neighbors=aspp_num_neighbors,
                pi_flow=pi_flow,
                pi_flow_steps=pi_flow_steps,
                pi_flow_scale=pi_flow_scale,
                pi_flow_use_gate=pi_flow_use_gate,
            )

        # Build the Asterisk model (the config carries all ASPP params).
        asterisk_model = cls(asterisk_config)

        # Transfer weights from the base model; strict=False because the
        # ASPP / fusion / π-flow modules have no counterpart in the base.
        asterisk_model.load_state_dict(base_model.state_dict(), strict=False)

        print(f"✓ Converted base model to Asterisk architecture with Graph Propagation")
        print(f"  Hybrid layers: {asterisk_model.model.hybrid_layer_indices}")
        aspp_dim_str = f"{asterisk_config.aspp_hidden_dim}" if asterisk_config.aspp_hidden_dim else f"{base_config.hidden_size} (full)"
        print(f"  ASPP config: dim={aspp_dim_str}, steps={asterisk_config.aspp_num_steps}, dropout={asterisk_config.aspp_dropout}, neighbors={asterisk_config.aspp_num_neighbors}")
        if asterisk_config.pi_flow:
            print(f"  π-flow enabled: steps={asterisk_config.pi_flow_steps}, scale={asterisk_config.pi_flow_scale}, gate={asterisk_config.pi_flow_use_gate}")

        return asterisk_model, base_model
455
+
456
+
457
# Register with the Auto* factories so that
# AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True)
# can resolve model_type "asterisk" to these classes.
AutoConfig.register("asterisk", AsteriskConfig)
AutoModelForCausalLM.register(AsteriskConfig, AsteriskForCausalLM)
460
+
461
+
462
def get_model_info(model):
    """Print model architecture information: parameter counts, fp32 size,
    and (for Asterisk models) the hybrid-layer configuration."""
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    print(f" • Total parameters: {total_params:,}")
    print(f" • Trainable parameters: {trainable_params:,}")
    print(f" • Model size: {total_params * 4 / 1024**2:.2f} MB (fp32)")

    if isinstance(model, AsteriskForCausalLM):
        hybrid_indices = model.model.hybrid_layer_indices
        print(f" • Hybrid layer indices: {hybrid_indices}")
        print(f" • Number of hybrid layers: {len(hybrid_indices)}")
README.md CHANGED
@@ -1,3 +1,467 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - sr
4
+ - en
5
+ license: apache-2.0
6
+ tags:
7
+ - text-generation
8
+ - reasoning
9
+ - serbian
10
+ - asterisk
11
+ - aspp
12
+ - hybrid-architecture
13
+ - multilingual
14
+ datasets:
15
+ - ODA-Mixture-100k
16
+ - ultrachat_200k_serbian
17
+ metrics:
18
+ - accuracy
19
+ - perplexity
20
+ base_model: Geilim-1B-Instruct
21
+ model-index:
22
+ - name: Geilim-1B-SR-Instruct
23
+ results: []
24
+ ---
25
+
26
+ # Geilim-1B-SR-Instruct
27
+
28
+ <div align="center">
29
+ <h3>🇷🇸 Serbian Reasoning Model - AI Democratization Project</h3>
30
+ <p><em>Bringing advanced reasoning capabilities to Serbian language</em></p>
31
+ </div>
32
+
33
+ ## Model Description
34
+
35
+ **Geilim-1B-SR-Instruct** is a 1.3B parameter Serbian reasoning model that combines:
36
+ - **Base**: Geilim-1B-Instruct (1B parameters, Llama-3 architecture, 16 layers)
37
+ - **Architecture**: Asterisk hybrid ASPP + Attention
38
+ - **Training**: 50% ODA-Mixture-100k (reasoning) + 50% UltraChat Serbian (conversations)
39
+ - **Goal**: Democratize AI by bringing reasoning to underrepresented languages
40
+
41
+ ### Key Features
42
+
43
+ - ✅ **Hybrid Architecture**: All 16 layers use ASPP + standard Attention
44
+ - ✅ **Graph-based Reasoning**: Union-Find structure with 6-step iterative propagation
45
+ - ✅ **π-flow Refinement**: 4-step continuous flow dynamics for enhanced reasoning
46
+ - ✅ **Bilingual**: Serbian language with preserved English reasoning capabilities
47
+ - ✅ **Efficient**: ~1.3B total parameters, trainable on 2x consumer GPUs
48
+
49
+ ## Model Details
50
+
51
+ ### Model Architecture
52
+
53
+ ```
54
+ Input → Embedding
55
+
56
+ Layers 0-15: Hybrid ASPP + Attention (ALL 16 layers)
57
+ ├─ ASPP Branch (Union-Find graph reasoning)
58
+ │ ├─ 6-step iterative propagation
59
+ │ ├─ Hidden dim: 512 (reduced from 2048)
60
+ │ └─ π-flow: 4-step refinement
61
+ └─ Attention Branch (standard self-attention)
62
+
63
+ Gated Fusion: output = gate * ASPP(x) + (1-gate) * Attention(x)
64
+
65
+ Output → LM Head
66
+ ```
67
+
68
+ ### Technical Specifications
69
+
70
+ - **Parameters**: ~1.3B (1B base + 300M ASPP/π-flow)
71
+ - **Layers**: 16 (all hybrid)
72
+ - **Hidden Size**: 2048
73
+ - **Attention Heads**: 32
74
+ - **KV Heads**: 8 (GQA)
75
+ - **Vocabulary**: 128,256 tokens
76
+ - **Context Length**: 131,072 tokens (with RoPE scaling)
77
+ - **Precision**: bfloat16
78
+
79
+ ### ASPP Configuration
80
+
81
+ - **Hidden Dim**: 512 (dimensionality reduction)
82
+ - **Iteration Steps**: 6
83
+ - **Dropout**: 0.15
84
+ - **Graph Structure**: Union-Find (parent-only connections)
85
+
86
+ ### π-flow Configuration
87
+
88
+ - **Steps**: 4
89
+ - **Scale**: 0.4
90
+ - **Gating**: Adaptive per-token
91
+ - **Purpose**: Multi-step refinement in probability space
92
+
93
+ ## Intended Use
94
+
95
+ ### Primary Use Cases
96
+
97
+ 1. **Serbian Language Tasks**:
98
+ - Conversational AI in Serbian
99
+ - Question answering in Serbian
100
+ - Text generation and completion
101
+
102
+ 2. **Reasoning Tasks**:
103
+ - Mathematical problem solving
104
+ - Code generation and debugging
105
+ - Step-by-step logical reasoning
106
+
107
+ 3. **Bilingual Applications**:
108
+ - Serbian-English translation assistance
109
+ - Cross-lingual reasoning tasks
110
+
111
+ ### Out-of-Scope Use
112
+
113
+ - Production-critical applications without further testing
114
+ - Tasks requiring real-time factual accuracy (model may hallucinate)
115
+ - Languages other than Serbian and English (limited support)
116
+
117
+ ## How to Use
118
+
119
+ ### Installation
120
+
121
+ ```bash
122
+ pip install torch transformers accelerate
123
+ ```
124
+
125
+ ### Basic Usage
126
+
127
+ ```python
128
+ import torch
129
+ from transformers import AutoTokenizer, AutoModelForCausalLM
130
+
131
+ # Load model and tokenizer
132
+ model_name = "NoesisLab/Geilim-1B-SR-Instruct"
133
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
134
+ model = AutoModelForCausalLM.from_pretrained(
135
+ model_name,
136
+ trust_remote_code=True,
137
+ torch_dtype=torch.bfloat16,
138
+ device_map="auto",
139
+ )
140
+
141
+ # Serbian conversation
142
+ messages = [
143
+ {"role": "user", "content": "Kakvu ulogu igraju nagrade i pozitivno pojačanje u dresuri Bigla i kako se mogu efikasno koristiti bez podsticanja lošeg ponašanja?"}
144
+ ]
145
+
146
+ # Apply chat template
147
+ input_text = tokenizer.apply_chat_template(
148
+ messages,
149
+ tokenize=False,
150
+ add_generation_prompt=True
151
+ )
152
+
153
+ # Tokenize
154
+ inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
155
+
156
+ # Generate
157
+ outputs = model.generate(
158
+ **inputs,
159
+ max_new_tokens=200,
160
+ temperature=0.7,
161
+ top_p=0.9,
162
+ repetition_penalty=1.1,
163
+ do_sample=True,
164
+ )
165
+
166
+ # Decode
167
+ response = tokenizer.decode(
168
+ outputs[0][inputs['input_ids'].shape[1]:],
169
+ skip_special_tokens=True
170
+ )
171
+ print(response)
172
+ ```
173
+
174
+
175
+
176
+
177
+ ### Recommended Generation Parameters
178
+
179
+ ```python
180
+ generation_config = {
181
+ "max_new_tokens": 200,
182
+ "temperature": 0.7, # Balance creativity and coherence
183
+ "top_p": 0.9, # Nucleus sampling
184
+ "repetition_penalty": 1.1, # Reduce repetition
185
+ "do_sample": True,
186
+ }
187
+ ```
188
+
189
+
190
+ ## Training Data
191
+
192
+ ### Dataset Composition
193
+
194
+ The model was trained on a balanced mix of two datasets:
195
+
196
+ #### 1. ODA-Mixture-100k (50% - Reasoning Data)
197
+
198
+ **101,306 reasoning samples** across three domains:
199
+
200
+ - **Math** (50,244 samples): AM-Thinking-v1-Distilled-math
201
+ - Mathematical problem solving with step-by-step reasoning
202
+ - Format: instruction → response (reasoning trace) → final answer
203
+
204
+ - **Code** (50,245 samples): AM-Thinking-v1-Distilled-code
205
+ - Programming problems with detailed solutions
206
+ - Code generation, debugging, and explanation tasks
207
+
208
+ - **General** (817 samples): LIMO
209
+ - General reasoning tasks
210
+ - Logic puzzles, common sense reasoning
211
+
212
+ #### 2. UltraChat Serbian (50% - Language Data)
213
+
214
+ **207,588 high-quality Serbian conversations**:
215
+
216
+ - Translated from UltraChat 200k
217
+ - Multi-turn dialogues covering diverse topics
218
+ - Topics: science, culture, daily life, reasoning, education
219
+ - Format: `messages_srb` (Serbian), `messages_eng` (English reference)
220
+
221
+ ### Data Mixing Strategy
222
+
223
+ - **Balanced 50/50 split**: Preserve reasoning while learning Serbian
224
+ - **Automatic sampling**: Match smaller dataset size
225
+ - **Total samples**: ~100k (sampled from 202k available)
226
+ - **Train/Test split**: 95% / 5%
227
+
228
+ ## Training Procedure
229
+
230
+ ### Training Hyperparameters
231
+
232
+ - **Epochs**: 2
233
+ - **Batch Size**: 2 per device
234
+ - **Gradient Accumulation**: 8 steps (effective batch size = 16)
235
+ - **Learning Rate**: 5e-5
236
+ - **Warmup Ratio**: 0.1 (10% of training)
237
+ - **Weight Decay**: 0.05
238
+ - **Max Gradient Norm**: 1.0
239
+ - **Optimizer**: AdamW
240
+ - **Precision**: bfloat16 mixed precision
241
+ - **Gradient Checkpointing**: Enabled
242
+ - **Max Sequence Length**: 2048 tokens
243
+
244
+ ### Training Infrastructure
245
+
246
+ - **Framework**: HuggingFace Transformers + TRL SFTTrainer
247
+ - **Distributed Training**: Accelerate (multi-GPU)
248
+ - **GPUs**: 1x RTX PRO 6000
249
+ - **Training Time**: ~6-8 hours
250
+ - **Memory per GPU**: ~15GB
251
+
252
+
253
+ ## Evaluation
254
+
255
+ ### Qualitative Evaluation
256
+
257
+ The model demonstrates:
258
+ - ✅ Fluent Serbian language generation
259
+ - ✅ Step-by-step reasoning in Serbian
260
+ - ✅ Mathematical problem solving
261
+ - ✅ Code understanding and generation
262
+ - ✅ Multi-turn conversation capabilities
263
+
264
+
265
+
266
+ ## Limitations and Biases
267
+
268
+ ### Known Limitations
269
+
270
+ 1. **Language Coverage**: Primarily trained on Serbian and English; limited support for other languages
271
+ 2. **Factual Accuracy**: May generate plausible but incorrect information (hallucination)
272
+ 3. **Context Length**: While supporting 131k tokens, performance may degrade on very long contexts
273
+ 4. **Domain Specificity**: Best performance on conversational and reasoning tasks; may struggle with highly specialized domains
274
+ 5. **Training Data**: Limited to ~100k samples; may not cover all Serbian language variations
275
+
276
+ ### Potential Biases
277
+
278
+ - **Translation Bias**: Serbian data is translated from English, may not reflect natural Serbian expressions
279
+ - **Domain Bias**: Reasoning data focuses on math and code; may be less effective on other domains
280
+ - **Cultural Bias**: Training data may reflect Western cultural perspectives
281
+
282
+ ### Recommendations
283
+
284
+ - Verify factual claims with authoritative sources
285
+ - Test thoroughly before deployment in production
286
+ - Monitor for biased or inappropriate outputs
287
+ - Consider fine-tuning on domain-specific data for specialized applications
288
+
289
+ ## Ethical Considerations
290
+
291
+ ### AI Democratization
292
+
293
+ This model is part of an effort to democratize AI by bringing advanced capabilities to underrepresented languages. Serbian, despite having ~12 million speakers, has limited AI resources compared to high-resource languages.
294
+
295
+ ### Responsible Use
296
+
297
+ Users should:
298
+ - Be aware of potential biases and limitations
299
+ - Not use for malicious purposes (misinformation, harassment, etc.)
300
+ - Respect privacy and data protection regulations
301
+ - Consider societal impact of deployments
302
+
303
+ ### Environmental Impact
304
+
305
+ - **Training**: ~6-8 hours on 1x RTX PRO 6000 GPU (see Training Infrastructure)
306
+ - **Carbon Footprint**: Estimated ~5-10 kg CO2eq (depends on energy source)
307
+ - **Inference**: Efficient at 1.3B parameters, suitable for edge deployment
308
+
309
+
310
+ ## Technical Details
311
+
312
+ ### Asterisk Architecture
313
+
314
+ The model uses the **Asterisk** architecture, which combines:
315
+
316
+ 1. **ASPP (Adjacency-Structured Parallel Propagation)**:
317
+ - Graph-based reasoning with Union-Find structure
318
+ - Each token maintains parent pointer: `parent[i] = i-1`
319
+ - Iterative message passing: `h_i^(t+1) = φ(h_i^(t), h_parent[i])`
320
+ - 6 propagation steps per layer
321
+
322
+ 2. **π-flow Refinement**:
323
+ - Continuous flow dynamics: `h' = h + α * v(h)`
324
+ - Learnable velocity field for multi-step refinement
325
+ - Adaptive per-token gating
326
+ - 4 refinement steps per layer
327
+
328
+ 3. **Hybrid Fusion**:
329
+ - Parallel execution of ASPP and standard Attention
330
+ - Gated combination: `output = gate * ASPP(x) + (1-gate) * Attention(x)`
331
+ - Applied to all 16 layers
332
+
333
+ ### Model Configuration
334
+
335
+ ```json
336
+ {
337
+ "model_type": "asterisk",
338
+ "hidden_size": 2048,
339
+ "num_hidden_layers": 16,
340
+ "num_attention_heads": 32,
341
+ "num_key_value_heads": 8,
342
+ "intermediate_size": 8192,
343
+ "vocab_size": 128256,
344
+ "max_position_embeddings": 131072,
345
+
346
+ "aspp_hidden_dim": 512,
347
+ "aspp_num_steps": 6,
348
+ "aspp_dropout": 0.15,
349
+ "aspp_num_neighbors": 1,
350
+
351
+ "pi_flow": true,
352
+ "pi_flow_steps": 4,
353
+ "pi_flow_scale": 0.4,
354
+ "pi_flow_use_gate": true,
355
+
356
+ "hybrid_layer_indices": null
357
+ }
358
+ ```
359
+
360
+ ## Comparison with Other Models
361
+
362
+ | Model | Base | Params | Layers | Language | Reasoning | Architecture |
363
+ |-------|------|--------|--------|----------|-----------|--------------|
364
+ | SmolLM2-135M | - | 135M | 30 | English | ❌ | Transformer |
365
+ | Asterisk | SmolLM2 | 171M | 30 | English | ✅ ASPP | Hybrid |
366
+ | **Geilim-1B-SR** | Geilim-1B | 1.3B | 16 | Serbian | ✅ ASPP | Hybrid |
367
+
368
+ ### Advantages
369
+
370
+ - ✅ **Efficient Size**: 1.3B parameters, suitable for consumer hardware
371
+ - ✅ **Full Hybrid**: All 16 layers use ASPP + Attention
372
+ - ✅ **Bilingual**: Serbian + English capabilities
373
+ - ✅ **Reasoning**: Math, code, and general reasoning
374
+ - ✅ **Fast Training**: ~6-8 hours on a single RTX PRO 6000 (see Training Infrastructure)
375
+ - ✅ **Low Memory**: ~3GB inference, ~15-20GB training per GPU
376
+
377
+ ## Hardware Requirements
378
+
379
+ ### Inference
380
+
381
+ - **Minimum**: 1x GPU with 8GB VRAM (e.g., RTX 3060)
382
+ - **Recommended**: 1x GPU with 16GB+ VRAM (e.g., RTX 4080, A100)
383
+ - **CPU Only**: Possible but slow (~10-20x slower)
384
+
385
+ ### Training
386
+
387
+ - **Minimum**: 2x GPU with 24GB VRAM (e.g., RTX 3090/4090)
388
+ - **Recommended**: 2x GPU with 40GB VRAM (e.g., A100)
389
+ - **Memory**: ~20GB per GPU with gradient checkpointing
390
+
391
+ ## Model Card Authors
392
+
393
+ - **NoesisLab**
394
+
395
+ ## Citation
396
+
397
+ If you use this model in your research or applications, please cite:
398
+
399
+ ```bibtex
400
+ @software{geilim_1b_sr_2026,
401
+ title={Geilim-1B-SR-Instruct: Serbian Reasoning Model with Asterisk Architecture},
402
+ author={NoesisLab},
403
+ year={2026},
404
+ url={https://huggingface.co/NoesisLab/Geilim-1B-SR-Instruct},
405
+ note={AI Democratization - Bringing reasoning to underrepresented languages}
406
+ }
407
+ ```
408
+
409
+ ### Related Papers
410
+
411
+ ```bibtex
412
+ @article{asterisk_2026,
413
+ title={Asterisk: Hybrid ASPP-Attention Architecture for Efficient Reasoning},
414
+ author={NoesisLab},
415
+ year={2026},
416
+ note={Graph-based reasoning with Union-Find propagation}
417
+ }
418
+ ```
419
+
420
+ ## Acknowledgments
421
+
422
+ - **Geilim-1B-Instruct**: Base model (Llama-3 architecture, 1B parameters)
423
+ - **ODA-Mixture-100k**: Reasoning dataset (Math, Code, General)
424
+ - **UltraChat**: High-quality conversation dataset
425
+ - **Serbian NLP Community**: Language support and feedback
426
+ - **HuggingFace**: Transformers library and model hosting
427
+ - **Accelerate**: Distributed training framework
428
+
429
+ ## License
430
+
431
+ This model is released under the **Apache 2.0 License**, same as the base model.
432
+
433
+ ```
434
+ Copyright 2026 Asterisk Project
435
+
436
+ Licensed under the Apache License, Version 2.0 (the "License");
437
+ you may not use this file except in compliance with the License.
438
+ You may obtain a copy of the License at
439
+
440
+ http://www.apache.org/licenses/LICENSE-2.0
441
+
442
+ Unless required by applicable law or agreed to in writing, software
443
+ distributed under the License is distributed on an "AS IS" BASIS,
444
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
445
+ See the License for the specific language governing permissions and
446
+ limitations under the License.
447
+ ```
448
+
449
+
450
+ ## Version History
451
+
452
+ - **v1.0** (2026-02): Initial release
453
+ - 1.3B parameters (1B base + 300M ASPP/π-flow)
454
+ - Trained on 100k samples (50% ODA-Mixture + 50% UltraChat Serbian)
455
+ - All 16 layers use hybrid ASPP + Attention
456
+ - Supports Serbian and English
457
+
458
+ ## Contact and Support
459
+
460
+
461
+ - **Email**: lizx93@mail2.sysu.edu.cn
462
+
463
+ ---
464
+
465
+ <div align="center">
466
+ <h3>🇷🇸 Democratizing AI, one language at a time!</h3>
467
+ <p><em>Making advanced AI technology accessible to every language</em></p>
handler.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # handler.py
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Dict, List, Union
5
+
6
+ import torch
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer
8
+
9
+
10
+ Json = Dict[str, Any]
11
+ Messages = List[Dict[str, str]] # [{"role":"user|assistant|system", "content":"..."}]
12
+
13
+
14
+ def _is_messages(x: Any) -> bool:
15
+ return (
16
+ isinstance(x, list)
17
+ and len(x) > 0
18
+ and all(isinstance(m, dict) and "role" in m and "content" in m for m in x)
19
+ )
20
+
21
+
22
class EndpointHandler:
    """
    Hugging Face Inference Endpoints custom handler.

    Expects the request body to be a dict that:
      - always contains ``inputs`` — a plain string prompt, a chat messages
        list ``[{"role": ..., "content": ...}]``, a ``{"messages": [...]}``
        dict, or a list of any of those (batch);
      - may contain ``parameters`` with generation overrides
        (``max_new_tokens``, ``temperature``, ``top_p``, ``top_k``,
        ``repetition_penalty``, ``do_sample``, ``num_beams``).

    Returns ``{"generated_text": ...}`` per input (a list of such dicts for a
    batch), containing only the newly generated tokens.
    """

    def __init__(self, model_dir: str):
        self.model_dir = model_dir

        # Pick device/dtype once at startup.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.device == "cuda":
            # bfloat16 is usually safe on A100/H100; if your instance doesn't
            # support bf16, change this to torch.float16.
            self.dtype = torch.bfloat16
        else:
            self.dtype = torch.float32

        # IMPORTANT: trust_remote_code=True because the repo contains
        # AsteriskForCausalLM.py and registers it via auto_map in the config.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir,
            trust_remote_code=True,
            use_fast=True,
        )

        # Make sure a pad token exists (the config uses pad_token_id=2, which
        # equals eos_token_id in many llama-like models).
        if self.tokenizer.pad_token_id is None and self.tokenizer.eos_token_id is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            trust_remote_code=True,
            torch_dtype=self.dtype,
            device_map="auto" if self.device == "cuda" else None,
        )

        # device_map="auto" already places the model on GPU; only move
        # explicitly on the CPU path.
        if self.device != "cuda":
            self.model.to(self.device)

        self.model.eval()

    @torch.inference_mode()
    def __call__(self, data: Json) -> Union[Json, List[Json]]:
        inputs = data.get("inputs", "")
        params = data.get("parameters", {}) or {}

        # Generation defaults (each can be overridden via `parameters`).
        max_new_tokens = int(params.get("max_new_tokens", 256))
        temperature = float(params.get("temperature", 0.7))
        top_p = float(params.get("top_p", 0.95))
        top_k = int(params.get("top_k", 0))
        repetition_penalty = float(params.get("repetition_penalty", 1.0))

        # Sampling is on by default whenever temperature > 0, unless the
        # caller says otherwise.
        do_sample = bool(params.get("do_sample", temperature > 0))
        num_beams = int(params.get("num_beams", 1))

        def _one(item: Any) -> Json:
            # Accept:
            #   1) a plain string prompt
            #   2) a messages list: [{"role": "user", "content": "..."}]
            #   3) a dict {"messages": [...]} (common chat style)
            if isinstance(item, dict) and "messages" in item:
                item = item["messages"]

            if _is_messages(item):
                # Chat-template path; tokenizer.apply_chat_template uses the
                # template shipped in the repo if one is configured.
                input_ids = self.tokenizer.apply_chat_template(
                    item,
                    return_tensors="pt",
                    add_generation_prompt=True,
                )
                # Single unpadded sequence: every position is a real token.
                attention_mask = torch.ones_like(input_ids)
            else:
                if not isinstance(item, str):
                    item = str(item)
                enc = self.tokenizer(item, return_tensors="pt")
                input_ids = enc["input_ids"]
                # FIX: forward the tokenizer's attention mask instead of
                # discarding it. Without an explicit mask, generate() infers
                # one from pad_token_id — ambiguous here because the pad
                # token is aliased to EOS (see __init__), which can corrupt
                # the output.
                attention_mask = enc.get("attention_mask")
                if attention_mask is None:
                    attention_mask = torch.ones_like(input_ids)

            input_ids = input_ids.to(self.model.device)
            attention_mask = attention_mask.to(self.model.device)
            input_len = input_ids.shape[-1]

            gen_ids = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                # Sampling knobs are only meaningful when do_sample=True;
                # passing None lets generate() fall back to its defaults.
                temperature=temperature if do_sample else None,
                top_p=top_p if do_sample else None,
                top_k=top_k if do_sample and top_k > 0 else None,
                num_beams=num_beams,
                repetition_penalty=repetition_penalty,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )

            # Return only the newly generated tokens, not the echoed prompt.
            new_tokens = gen_ids[0, input_len:]
            text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            return {"generated_text": text}

        # Batch support: a list that is NOT itself a single messages list is
        # treated as a batch of independent inputs.
        if isinstance(inputs, list) and not _is_messages(inputs):
            return [_one(x) for x in inputs]
        else:
            return _one(inputs)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers==4.57.6
2
+ torch
3
+ accelerate