# CST / QCST Dual License
# Non-commercial research use only.
# Commercial use requires explicit permission.
# Copyright (c) 2025 Mohamed Mohamed Elhelbawi
# All rights reserved.
# See LICENSE file in the project root for full license information.
"""
Quantum Transformer Utilities - Standalone Implementation
Transformer components without classical dependencies
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Dict, List
import math
import logging
logger = logging.getLogger(__name__)
class QuantumPositionalEncoding(nn.Module):
"""Positional encoding for quantum-enhanced sequences"""
def __init__(self, d_model: int, max_len: int = 5000):
super().__init__()
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() *
(-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
if d_model % 2 == 1:
pe[:, 1::2] = torch.cos(position * div_term[:-1])
else:
pe[:, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe.unsqueeze(0))
def forward(self, x):
return x + self.pe[:, :x.size(1)]
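# The encoding above is the standard sinusoidal scheme:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# Illustrative sketch (not part of the original module); sizes are arbitrary.
def _example_positional_encoding():
    """Check that QuantumPositionalEncoding preserves the input shape."""
    pos_enc = QuantumPositionalEncoding(d_model=16, max_len=128)
    x = torch.zeros(2, 10, 16)  # [batch_size, seq_len, d_model]
    out = pos_enc(x)
    assert out.shape == (2, 10, 16)
    return out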
class QuantumScaledDotProductAttention(nn.Module):
"""Quantum-aware scaled dot-product attention"""
def __init__(self, d_k: int, dropout: float = 0.0):
super().__init__()
self.d_k = d_k
self.dropout = nn.Dropout(dropout)
def forward(self, query, key, value, mask=None):
"""Compute attention with quantum awareness"""
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e9)
attention_weights = F.softmax(scores, dim=-1)
attention_weights = self.dropout(attention_weights)
output = torch.matmul(attention_weights, value)
return output, attention_weights
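# Illustrative sketch (not part of the original module): exercising the attention
# primitive directly with random tensors to show the expected shapes and the
# effect of a padding mask. The tensor sizes here are arbitrary examples.
def _example_scaled_dot_product_attention():
    """Run QuantumScaledDotProductAttention on random Q/K/V with a padding mask."""
    attn = QuantumScaledDotProductAttention(d_k=8)
    q = torch.randn(2, 4, 5, 8)   # [batch, heads, seq_len, d_k]
    k = torch.randn(2, 4, 5, 8)
    v = torch.randn(2, 4, 5, 8)
    mask = torch.ones(2, 1, 1, 5)
    mask[:, :, :, -1] = 0         # mask out the last key position
    out, weights = attn(q, k, v, mask)
    assert out.shape == (2, 4, 5, 8) and weights.shape == (2, 4, 5, 5)
    return out, weights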
class QuantumMultiHeadAttention(nn.Module):
"""Multi-head attention optimized for quantum integration"""
def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
super().__init__()
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
self.d_model = d_model
self.num_heads = num_heads
self.d_k = d_model // num_heads
self.W_q = nn.Linear(d_model, d_model)
self.W_k = nn.Linear(d_model, d_model)
self.W_v = nn.Linear(d_model, d_model)
self.W_o = nn.Linear(d_model, d_model)
self.attention = QuantumScaledDotProductAttention(self.d_k, dropout)
self.dropout = nn.Dropout(dropout)
def forward(self, query, key, value, mask=None):
batch_size = query.size(0)
# Linear projections
Q = self.W_q(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
K = self.W_k(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
V = self.W_v(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        # Apply attention (broadcast a 2D padding mask [batch, seq_len] across heads and query positions)
        if mask is not None and mask.dim() == 2:
            mask = mask.unsqueeze(1).unsqueeze(2)
        attn_output, attn_weights = self.attention(Q, K, V, mask)
# Concatenate heads
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(batch_size, -1, self.d_model)
# Final linear projection
output = self.W_o(attn_output)
output = self.dropout(output)
return output, attn_weights
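# Illustrative sketch (not part of the original module): self-attention over a
# padded batch. A [batch, seq_len] padding mask, such as the one produced by
# collate_quantum_batch below, is expanded to [batch, 1, 1, seq_len] so it
# broadcasts over heads and query positions. Sizes are arbitrary examples.
def _example_multi_head_attention():
    """Run QuantumMultiHeadAttention as self-attention with a padding mask."""
    mha = QuantumMultiHeadAttention(d_model=32, num_heads=4, dropout=0.0)
    x = torch.randn(2, 6, 32)                 # [batch_size, seq_len, d_model]
    padding = torch.ones(2, 6)
    padding[1, 4:] = 0                        # second sequence padded after position 4
    mask = padding.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
    out, weights = mha(x, x, x, mask)
    assert out.shape == (2, 6, 32) and weights.shape == (2, 4, 6, 6)
    return out, weights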
class QuantumFeedForward(nn.Module):
"""Feed-forward network for quantum transformer"""
def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
super().__init__()
self.linear1 = nn.Linear(d_model, d_ff)
self.linear2 = nn.Linear(d_ff, d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
return self.linear2(self.dropout(F.gelu(self.linear1(x))))
class QuantumTransformerLayer(nn.Module):
"""Single transformer layer with quantum awareness"""
def __init__(self, d_model: int, num_heads: int, d_ff: int, dropout: float = 0.1):
super().__init__()
self.self_attention = QuantumMultiHeadAttention(d_model, num_heads, dropout)
self.feed_forward = QuantumFeedForward(d_model, d_ff, dropout)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
def forward(self, x, mask=None):
# Self-attention with residual
attn_output, _ = self.self_attention(x, x, x, mask)
x = self.norm1(x + self.dropout1(attn_output))
# Feed-forward with residual
ff_output = self.feed_forward(x)
x = self.norm2(x + self.dropout2(ff_output))
return x
class QuantumTransformer(nn.Module):
"""
Quantum-Enhanced Transformer
Transformer architecture optimized for quantum circuit integration
"""
def __init__(self, d_model: int, num_heads: int, num_layers: int,
d_ff: int, max_seq_len: int = 5000, dropout: float = 0.1):
super().__init__()
self.d_model = d_model
self.positional_encoding = QuantumPositionalEncoding(d_model, max_seq_len)
self.layers = nn.ModuleList([
QuantumTransformerLayer(d_model, num_heads, d_ff, dropout)
for _ in range(num_layers)
])
self.norm = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask=None):
"""
Forward pass through quantum transformer
Args:
x: [batch_size, seq_len, d_model] input tensor
mask: Optional attention mask
Returns:
output: [batch_size, seq_len, d_model] transformed tensor
"""
# Apply positional encoding
x = self.positional_encoding(x)
x = self.dropout(x)
# Apply transformer layers
for layer in self.layers:
x = layer(x, mask)
# Final normalization
x = self.norm(x)
return x
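# Illustrative sketch (not part of the original module): running the encoder
# stack end to end on an already-embedded sequence. Hyperparameters are
# arbitrary example values.
def _example_quantum_transformer():
    """Pass a random embedded sequence through a small QuantumTransformer."""
    model = QuantumTransformer(d_model=32, num_heads=4, num_layers=2, d_ff=64, dropout=0.0)
    x = torch.randn(2, 10, 32)  # [batch_size, seq_len, d_model]
    out = model(x)
    assert out.shape == (2, 10, 32)
    return out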
class QuantumCSTransformer(nn.Module):
"""
Quantum-Enhanced CST Transformer
Transformer specifically designed for Contextual Spectrum Tokenization
"""
def __init__(self, config):
super().__init__()
self.config = config
# Embedding layers
self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
self.position_embedding = QuantumPositionalEncoding(config.d_model)
# Transformer backbone
self.transformer = QuantumTransformer(
d_model=config.d_model,
num_heads=config.num_heads,
num_layers=config.num_transformer_layers,
d_ff=config.transformer_ff_dim,
dropout=config.dropout_rate
)
# Output projection
self.output_projection = nn.Linear(config.d_model, config.vocab_size)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, token_ids: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
"""
Forward pass through quantum CST transformer
Args:
token_ids: [batch_size, seq_len] token indices
mask: Optional attention mask
Returns:
logits: [batch_size, seq_len, vocab_size] output logits
"""
# Embed tokens
x = self.token_embedding(token_ids)
x = self.position_embedding(x)
x = self.dropout(x)
# Apply transformer
x = self.transformer(x, mask)
# Project to vocabulary
logits = self.output_projection(x)
return logits
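# Illustrative sketch (not part of the original module): the configuration class
# used by the project is defined elsewhere; a types.SimpleNamespace stands in for
# it here, supplying only the attributes this constructor actually reads. Values
# are arbitrary examples.
def _example_cst_transformer():
    """Build a small QuantumCSTransformer and run a forward pass on toy token ids."""
    from types import SimpleNamespace
    config = SimpleNamespace(
        vocab_size=100,
        d_model=32,
        num_heads=4,
        num_transformer_layers=2,
        transformer_ff_dim=64,
        dropout_rate=0.0,
    )
    model = QuantumCSTransformer(config)
    token_ids = torch.randint(0, 100, (2, 8))  # [batch_size, seq_len]
    logits = model(token_ids)
    assert logits.shape == (2, 8, 100)
    return logits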
def collate_quantum_batch(batch: List[Dict], device: str = 'cpu') -> Dict[str, torch.Tensor]:
"""
Collate function for quantum CST batches
Handles quantum-specific data organization
Args:
batch: List of batch items
device: Device to move tensors to
Returns:
Dictionary of collated tensors
"""
token_ids_list = []
lengths = []
    for item in batch:
        if isinstance(item, dict):
            token_ids = item.get('token_ids', item.get('input_ids'))
        else:
            token_ids = item
        if token_ids is None:
            raise ValueError("Batch item has neither 'token_ids' nor 'input_ids'")
        token_ids_list.append(torch.as_tensor(token_ids, dtype=torch.long))
        lengths.append(len(token_ids))
# Pad sequences
max_length = max(lengths)
padded_token_ids = torch.zeros(len(batch), max_length, dtype=torch.long)
attention_mask = torch.zeros(len(batch), max_length, dtype=torch.float)
for i, (token_ids, length) in enumerate(zip(token_ids_list, lengths)):
padded_token_ids[i, :length] = token_ids
attention_mask[i, :length] = 1.0
return {
'token_ids': padded_token_ids.to(device),
'attention_mask': attention_mask.to(device),
'lengths': torch.tensor(lengths, dtype=torch.long).to(device)
}
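# Illustrative sketch (not part of the original module): collating two
# variable-length sequences. The token ids are arbitrary examples.
def _example_collate_quantum_batch():
    """Pad a toy batch with collate_quantum_batch and inspect the outputs."""
    batch = [
        {'token_ids': [1, 2, 3, 4]},
        {'input_ids': [5, 6]},
    ]
    collated = collate_quantum_batch(batch)
    # token_ids is zero-padded to the longest sequence; attention_mask marks real tokens
    assert collated['token_ids'].shape == (2, 4)
    assert collated['attention_mask'].tolist() == [[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0]]
    assert collated['lengths'].tolist() == [4, 2]
    return collated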