|
|
"""
|
|
|
GitPulse Model - Multimodal Transformer for GitHub Project Health Prediction
|
|
|
|
|
|
基于 Transformer+Text 的多模态时序预测模型
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
import json
|
|
|
import torch
|
|
|
import torch.nn as nn
|
|
|
from typing import Optional, Tuple
|
|
|
from transformers import DistilBertModel, DistilBertTokenizer
|
|
|
|
|
|
|
|
|
class TextEncoder(nn.Module):
|
|
|
"""文本编码器:基于 DistilBERT"""
|
|
|
|
|
|
def __init__(self, d_model=128, freeze_bert=True):
|
|
|
super().__init__()
|
|
|
self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
|
|
|
|
|
|
if freeze_bert:
|
|
|
for param in self.bert.parameters():
|
|
|
param.requires_grad = False
|
|
|
|
|
|
|
|
|
self.proj = nn.Sequential(
|
|
|
nn.Linear(768, d_model * 2),
|
|
|
nn.LayerNorm(d_model * 2),
|
|
|
nn.GELU(),
|
|
|
nn.Dropout(0.1),
|
|
|
nn.Linear(d_model * 2, d_model),
|
|
|
nn.LayerNorm(d_model)
|
|
|
)
|
|
|
|
|
|
|
|
|
self.attn_pool = nn.Linear(768, 1)
|
|
|
|
|
|
def forward(self, input_ids, attention_mask):
|
|
|
outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
|
|
|
hidden = outputs.last_hidden_state
|
|
|
|
|
|
|
|
|
attn_weights = self.attn_pool(hidden).squeeze(-1)
|
|
|
attn_weights = attn_weights.masked_fill(attention_mask == 0, -1e9)
|
|
|
attn_weights = torch.softmax(attn_weights, dim=-1)
|
|
|
|
|
|
pooled = torch.bmm(attn_weights.unsqueeze(1), hidden).squeeze(1)
|
|
|
|
|
|
return self.proj(pooled)
|
|
|
|
|
|
|
|
|
class TransformerTSEncoder(nn.Module):
|
|
|
"""时序编码器:Transformer"""
|
|
|
|
|
|
def __init__(self, n_vars=16, d_model=128, n_heads=4, n_layers=2,
|
|
|
hist_len=128, dropout=0.1):
|
|
|
super().__init__()
|
|
|
|
|
|
self.input_proj = nn.Sequential(
|
|
|
nn.Linear(n_vars, d_model),
|
|
|
nn.LayerNorm(d_model),
|
|
|
nn.Dropout(dropout)
|
|
|
)
|
|
|
|
|
|
self.pos_embedding = nn.Parameter(torch.randn(1, hist_len, d_model) * 0.02)
|
|
|
|
|
|
encoder_layer = nn.TransformerEncoderLayer(
|
|
|
d_model=d_model, nhead=n_heads,
|
|
|
dim_feedforward=d_model * 4, dropout=dropout,
|
|
|
activation='gelu', batch_first=True
|
|
|
)
|
|
|
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
|
|
|
self.norm = nn.LayerNorm(d_model)
|
|
|
|
|
|
def forward(self, x):
|
|
|
|
|
|
x = self.input_proj(x)
|
|
|
x = x + self.pos_embedding[:, :x.size(1), :]
|
|
|
x = self.encoder(x)
|
|
|
return self.norm(x)
|
|
|
|
|
|
|
|
|
class AdaptiveFusion(nn.Module):
|
|
|
"""自适应融合层"""
|
|
|
|
|
|
def __init__(self, d_model, min_weight=0.1, max_weight=0.3):
|
|
|
super().__init__()
|
|
|
self.min_weight = min_weight
|
|
|
self.max_weight = max_weight
|
|
|
|
|
|
self.gate = nn.Sequential(
|
|
|
nn.Linear(d_model * 2, d_model),
|
|
|
nn.GELU(),
|
|
|
nn.Linear(d_model, 1),
|
|
|
nn.Sigmoid()
|
|
|
)
|
|
|
|
|
|
def forward(self, ts_feat, text_feat):
|
|
|
combined = torch.cat([ts_feat, text_feat], dim=-1)
|
|
|
raw_weight = self.gate(combined)
|
|
|
weight = self.min_weight + (self.max_weight - self.min_weight) * raw_weight
|
|
|
return ts_feat * (1 - weight) + text_feat * weight
|
|
|
|
|
|
|
|
|
class GitPulseModel(nn.Module):
|
|
|
"""
|
|
|
GitPulse: Multimodal Transformer for Time Series Prediction
|
|
|
|
|
|
结合项目文本描述和历史时序数据进行预测
|
|
|
"""
|
|
|
|
|
|
def __init__(
|
|
|
self,
|
|
|
n_vars: int = 16,
|
|
|
d_model: int = 128,
|
|
|
n_heads: int = 4,
|
|
|
n_layers: int = 2,
|
|
|
hist_len: int = 128,
|
|
|
pred_len: int = 32,
|
|
|
dropout: float = 0.1,
|
|
|
freeze_bert: bool = True
|
|
|
):
|
|
|
super().__init__()
|
|
|
|
|
|
self.n_vars = n_vars
|
|
|
self.d_model = d_model
|
|
|
self.hist_len = hist_len
|
|
|
self.pred_len = pred_len
|
|
|
|
|
|
|
|
|
self.ts_encoder = TransformerTSEncoder(
|
|
|
n_vars=n_vars, d_model=d_model, n_heads=n_heads,
|
|
|
n_layers=n_layers, hist_len=hist_len, dropout=dropout
|
|
|
)
|
|
|
|
|
|
|
|
|
self.text_encoder = TextEncoder(d_model=d_model, freeze_bert=freeze_bert)
|
|
|
|
|
|
|
|
|
self.fusion = AdaptiveFusion(d_model)
|
|
|
|
|
|
|
|
|
self.pred_head = nn.Sequential(
|
|
|
nn.Linear(d_model, d_model * 2),
|
|
|
nn.GELU(),
|
|
|
nn.Dropout(dropout),
|
|
|
nn.Linear(d_model * 2, n_vars)
|
|
|
)
|
|
|
|
|
|
|
|
|
self.temporal_proj = nn.Linear(hist_len, pred_len)
|
|
|
|
|
|
def forward(
|
|
|
self,
|
|
|
x: torch.Tensor,
|
|
|
input_ids: Optional[torch.Tensor] = None,
|
|
|
attention_mask: Optional[torch.Tensor] = None,
|
|
|
return_auxiliary: bool = False
|
|
|
) -> torch.Tensor:
|
|
|
"""
|
|
|
Args:
|
|
|
x: 历史时序 [B, hist_len, n_vars]
|
|
|
input_ids: 文本 token IDs [B, L]
|
|
|
attention_mask: 注意力掩码 [B, L]
|
|
|
|
|
|
Returns:
|
|
|
预测序列 [B, pred_len, n_vars]
|
|
|
"""
|
|
|
|
|
|
ts_encoded = self.ts_encoder(x)
|
|
|
ts_global = ts_encoded.mean(dim=1)
|
|
|
|
|
|
|
|
|
if input_ids is not None and attention_mask is not None:
|
|
|
text_feat = self.text_encoder(input_ids, attention_mask)
|
|
|
fused = self.fusion(ts_global, text_feat)
|
|
|
else:
|
|
|
fused = ts_global
|
|
|
|
|
|
|
|
|
pred_feat = self.pred_head(ts_encoded)
|
|
|
pred_feat = pred_feat.transpose(1, 2)
|
|
|
output = self.temporal_proj(pred_feat)
|
|
|
output = output.transpose(1, 2)
|
|
|
|
|
|
if return_auxiliary:
|
|
|
return output, torch.tensor(0.0), torch.tensor(0.0), {}
|
|
|
return output
|
|
|
|
|
|
@classmethod
|
|
|
def from_pretrained(cls, path: str, device: str = 'cuda'):
|
|
|
"""从预训练权重加载模型"""
|
|
|
config_path = os.path.join(path, 'config.json')
|
|
|
weights_path = os.path.join(path, 'gitpulse_weights.pt')
|
|
|
|
|
|
|
|
|
if os.path.exists(config_path):
|
|
|
with open(config_path, 'r') as f:
|
|
|
config = json.load(f)
|
|
|
else:
|
|
|
config = {}
|
|
|
|
|
|
|
|
|
model = cls(
|
|
|
n_vars=config.get('n_vars', 16),
|
|
|
d_model=config.get('d_model', 128),
|
|
|
n_heads=config.get('n_heads', 4),
|
|
|
n_layers=config.get('n_layers', 2),
|
|
|
hist_len=config.get('hist_len', 128),
|
|
|
pred_len=config.get('pred_len', 32)
|
|
|
)
|
|
|
|
|
|
|
|
|
if os.path.exists(weights_path):
|
|
|
state_dict = torch.load(weights_path, map_location=device, weights_only=False)
|
|
|
model.load_state_dict(state_dict, strict=False)
|
|
|
print(f"✓ Loaded weights from {weights_path}")
|
|
|
|
|
|
return model.to(device)
|
|
|
|
|
|
def predict(
|
|
|
self,
|
|
|
time_series: torch.Tensor,
|
|
|
text: str = None,
|
|
|
tokenizer: DistilBertTokenizer = None
|
|
|
) -> torch.Tensor:
|
|
|
"""便捷预测接口"""
|
|
|
self.eval()
|
|
|
|
|
|
if text is not None and tokenizer is not None:
|
|
|
encoded = tokenizer(
|
|
|
text, padding='max_length', truncation=True,
|
|
|
max_length=128, return_tensors='pt'
|
|
|
)
|
|
|
input_ids = encoded['input_ids'].to(time_series.device)
|
|
|
attention_mask = encoded['attention_mask'].to(time_series.device)
|
|
|
else:
|
|
|
input_ids = None
|
|
|
attention_mask = None
|
|
|
|
|
|
with torch.no_grad():
|
|
|
output = self.forward(time_series, input_ids, attention_mask)
|
|
|
|
|
|
return output
|
|
|
|
|
|
|
|
|
|
|
|
def get_model_info():
|
|
|
return {
|
|
|
'name': 'GitPulse',
|
|
|
'version': '1.0',
|
|
|
'architecture': 'Transformer+Text',
|
|
|
'description': 'Multimodal time series prediction model for GitHub project health',
|
|
|
'metrics': {
|
|
|
'R2': 0.7559,
|
|
|
'MSE': 0.0755,
|
|
|
'DA': 0.8668,
|
|
|
'TA@0.2': 0.8160
|
|
|
}
|
|
|
}
|
|
|
|
|
|
|
|
|
|