HuggingFaceFW/fineweb-edu
Viewer • Updated • 3.5B • 625k • 1.09k
This is a BERT-style masked language model pretrained from scratch using streaming data from FineWeb-Edu.
import torch
from transformers import AutoTokenizer
# Load the tokenizer that was published alongside the pretrained weights.
tokenizer = AutoTokenizer.from_pretrained(
    "your_hf_username/bert-edu-pretrained-384d"
)

# Load model weights (custom architecture, so the class definition must be
# importable locally rather than coming from `transformers`).
from model import BERT, BERTConfig  # your model definition

# These hyperparameters must match the ones used when the checkpoint was
# trained; otherwise load_state_dict() reports missing/unexpected keys or
# size mismatches.
config = BERTConfig(
    vocab_size=30522,
    dim=384,
    n_layers=6,
    n_heads=6,
    seq_len=128,
)

model = BERT(config)

# SECURITY: torch.load() unpickles the file. `weights_only=True` restricts
# unpickling to tensors and plain containers so a tampered checkpoint cannot
# execute arbitrary code. A state dict needs nothing more than that.
state_dict = torch.load(
    "pytorch_model.bin",
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(state_dict)

# Switch to inference mode (module.eval() puts the model in evaluation mode).
model.eval()
# Input with [MASK]
text = "Machine learning is [MASK]."
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Forward pass. The second positional argument is an all-zero tensor with the
# same shape as the token ids (presumably segment/token-type ids for a
# single-sentence input — confirm against the BERT definition in model.py).
with torch.no_grad():
    logits = model(
        input_ids,
        torch.zeros_like(input_ids),
    )

# Locate the [MASK] position(s) in the first (only) sequence of the batch.
mask_positions = (input_ids[0] == tokenizer.mask_token_id).nonzero(
    as_tuple=True
)[0]
if mask_positions.numel() == 0:
    # Fail with a clear message instead of an IndexError from indexing an
    # empty nonzero() result.
    raise ValueError("Input text must contain a [MASK] token.")

# Predict the first masked position, as the original snippet did.
mask_index = mask_positions[0]
pred_id = logits[0, mask_index].argmax(dim=-1)
print("Prediction:", tokenizer.decode(pred_id))
import torch.nn as nn
class BertForSentiment(nn.Module):
    """Sequence classifier: a linear head on the first token of a backbone.

    The backbone is called as ``bert(input_ids, segment_ids)`` and its output
    is indexed as ``[:, 0, :]``, so it must return a 3-D tensor whose last
    dimension equals ``hidden_size``.

    NOTE(review): if the wrapped model returns vocabulary logits rather than
    hidden states, the last dimension will be the vocabulary size and the
    check in ``forward`` will raise — confirm what the backbone returns.

    Args:
        bert: Backbone module (or callable) taking ``(input_ids, segment_ids)``.
        hidden_size: Size of the backbone's last output dimension.
        num_labels: Number of output classes.
    """

    def __init__(self, bert, hidden_size: int, num_labels: int = 2):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids: torch.Tensor, attention_mask=None) -> torch.Tensor:
        """Return classification logits with ``num_labels`` as the last dim.

        Args:
            input_ids: Token ids; the backbone is called with these and an
                all-zero tensor of the same shape.
            attention_mask: Accepted for API compatibility but currently
                unused — the backbone is only called with two arguments, so
                padded positions are not masked here.

        Raises:
            ValueError: If the backbone output is not 3-D or its last
                dimension does not match the classifier's input size.
        """
        segment_ids = torch.zeros_like(input_ids)
        features = self.bert(input_ids, segment_ids)

        expected_dim = self.classifier.in_features
        if features.dim() != 3 or features.size(-1) != expected_dim:
            # Surface a readable error instead of an opaque matmul/indexing
            # failure further down.
            raise ValueError(
                "Backbone output must have shape (batch, seq, "
                f"{expected_dim}); got {tuple(features.shape)}."
            )

        # Use the representation at position 0 of every sequence.
        first_token = features[:, 0, :]
        return self.classifier(first_token)
# Step 2: rebuild the pretrained backbone and wrap it with the sentiment head.
bert = BERT(config)

# SECURITY: torch.load() unpickles the file. `weights_only=True` restricts
# unpickling to tensors and plain containers so a tampered checkpoint cannot
# execute arbitrary code.
pretrained_state = torch.load(
    "pytorch_model.bin",
    map_location="cpu",
    weights_only=True,
)
bert.load_state_dict(pretrained_state)

# hidden_size is passed as 384, the same value as `dim` in the config above.
model = BertForSentiment(bert, hidden_size=384)

# Train on sentiment data (IMDB).
from datasets import load_dataset

dataset = load_dataset("imdb")
# tokenize → train → evaluate