Spaces:

Aravindhan11
/

Distributed-Transformer-Framework

Running

App Files Files Community

Distributed-Transformer-Framework / finetune.py

Aravindhan11

Deploy Intelligent Distributed LLaMA Framework

52510e8 verified 3 days ago

raw

history blame contribute delete

7.18 kB

	import os
	import time
	import torch
	import torch.nn.functional as F
	from torch.optim import AdamW
	from transformers import AutoTokenizer, AutoConfig
	from datasets import load_dataset
	import numpy as np

	class LLMTrainer:
	    """Small next-token fine-tuning loop around a causal language model.

	    The trainer tokenizes a text corpus into fixed-length windows, runs
	    AdamW updates with optional gradient accumulation, and can stream
	    per-step metrics through a generator.

	    Attributes:
	        model: The model being trained; moved to ``device`` on construction.
	        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.
	        device: Device string/object passed to ``model.to`` and ``torch.tensor``.
	        learning_rate: Learning rate given to AdamW.
	        seq_len: Number of input tokens per training window.
	        batch_size: Number of windows per micro-batch.
	        grad_acc_steps: Number of micro-batches per optimizer step.
	        optimizer: The AdamW optimizer over ``model.parameters()``.
	        global_step: Count of optimizer steps taken so far.
	        tokens_processed: Count of input tokens fed through the model so far.
	    """

	    def __init__(
	        self,
	        model,
	        tokenizer,
	        device="cpu",
	        learning_rate=3e-4,
	        seq_len=32,
	        batch_size=1,
	        gradient_accumulation_steps=1
	    ):
	        """Store the configuration, move the model to ``device`` and build AdamW.

	        Raises:
	            ValueError: If ``seq_len``, ``batch_size`` or
	                ``gradient_accumulation_steps`` is smaller than 1.
	        """
	        # Non-positive sizes would otherwise surface much later as empty
	        # batches or an obscure ``range()`` error, so reject them up front.
	        for name, value in (
	            ("seq_len", seq_len),
	            ("batch_size", batch_size),
	            ("gradient_accumulation_steps", gradient_accumulation_steps),
	        ):
	            if value < 1:
	                raise ValueError(f"{name} must be >= 1, got {value!r}")

	        self.model = model
	        self.tokenizer = tokenizer
	        self.device = device
	        self.learning_rate = learning_rate
	        self.seq_len = seq_len
	        self.batch_size = batch_size
	        self.grad_acc_steps = gradient_accumulation_steps

	        self.model.to(device)
	        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate)
	        self.global_step = 0
	        self.tokens_processed = 0

	    def prepare_dataset(self, dataset_source: str, num_samples: int = 100):
	        """Turn a text source into overlapping token windows.

	        ``dataset_source`` is interpreted, in order, as:

	        * a path to an existing local file, read as UTF-8 text;
	        * ``"hf:<dataset>"``, streamed through ``load_dataset`` (train split),
	          using the ``"text"`` field of the first ``num_samples`` records;
	        * inline text, if it has more than 50 non-padding characters;
	        * otherwise the built-in fallback corpus.

	        Args:
	            dataset_source: File path, ``hf:``-prefixed dataset id, or raw text.
	            num_samples: Maximum number of streamed records to read for the
	                ``hf:`` case.

	        Returns:
	            A list of token-id lists, each ``seq_len + 1`` long. Consecutive
	            windows start ``seq_len`` tokens apart, so each window's last token
	            is the next window's first. Empty if the corpus is too short.
	        """
	        print(f"Preparing dataset from: {dataset_source}...")

	        if os.path.isfile(dataset_source):
	            with open(dataset_source, "r", encoding="utf-8") as f:
	                raw_text = f.read()
	        elif dataset_source.startswith("hf:"):
	            # Strip only the leading prefix, e.g. "hf:roneneldan/TinyStories";
	            # splitting on "hf:" would mangle ids that contain it again.
	            hf_path = dataset_source[len("hf:"):]
	            try:
	                ds = load_dataset(hf_path, split="train", streaming=True)
	                # range() comes first in zip() so the stream is never asked
	                # for more than num_samples records.
	                texts = [
	                    item.get("text") or ""
	                    for _, item in zip(range(num_samples), ds)
	                ]
	                raw_text = "\n\n".join(texts)
	            except Exception as e:
	                # Deliberate best-effort: any loading problem (network, bad id,
	                # unexpected schema) degrades to the built-in corpus.
	                print(f"Error loading Hugging Face dataset: {e}. Falling back to default corpus.")
	                raw_text = self._get_fallback_text()
	        elif len(dataset_source.strip()) > 50:
	            # Long enough to be treated as an inline corpus.
	            raw_text = dataset_source
	        else:
	            raw_text = self._get_fallback_text()

	        print("Tokenizing corpus...")
	        tokenized = self.tokenizer.encode(raw_text, add_special_tokens=True)

	        # Each window holds seq_len inputs plus one extra token for the shifted
	        # target. The range bound guarantees every slice is full length.
	        window = self.seq_len + 1
	        chunks = [
	            tokenized[start : start + window]
	            for start in range(0, len(tokenized) - self.seq_len, self.seq_len)
	        ]

	        print(f"Dataset prepared! Total sequence chunks: {len(chunks)}")
	        return chunks

	    def train_step(self, batch_chunks):
	        """Run one optimizer step over up to ``grad_acc_steps`` micro-batches.

	        ``batch_chunks`` is split into consecutive micro-batches of
	        ``batch_size`` windows; only the first
	        ``grad_acc_steps * batch_size`` windows are used. Each window is
	        shifted by one token to form inputs and targets.

	        Args:
	            batch_chunks: List of equal-length token-id lists, as produced by
	                ``prepare_dataset``.

	        Returns:
	            The mean cross-entropy loss over the micro-batches processed, as a
	            float. Returns 0.0 without touching the optimizer or
	            ``global_step`` if ``batch_chunks`` yields no micro-batch.
	        """
	        self.model.train()
	        self.optimizer.zero_grad()

	        micro_batches = [
	            batch_chunks[start : start + self.batch_size]
	            for start in range(
	                0, self.grad_acc_steps * self.batch_size, self.batch_size
	            )
	        ]
	        micro_batches = [mb for mb in micro_batches if mb]
	        if not micro_batches:
	            return 0.0

	        # Scale by the number of micro-batches actually present so that a
	        # partial batch still yields a correctly averaged gradient and loss.
	        num_micro = len(micro_batches)
	        mean_loss = 0.0

	        for micro_batch in micro_batches:
	            tensor_batch = torch.tensor(micro_batch, dtype=torch.long, device=self.device)
	            input_ids = tensor_batch[:, :-1].contiguous()
	            target_ids = tensor_batch[:, 1:].contiguous()

	            outputs = self.model(input_ids=input_ids)  # expected [batch, seq, vocab]
	            # Accept either a raw logits tensor or an output object exposing
	            # a ``logits`` attribute.
	            logits = getattr(outputs, "logits", outputs)

	            loss = F.cross_entropy(
	                logits.reshape(-1, logits.size(-1)),
	                target_ids.reshape(-1),
	                reduction="mean",
	            ) / num_micro
	            loss.backward()

	            # ``loss`` is already divided by num_micro, so summing gives the mean.
	            mean_loss += loss.item()
	            self.tokens_processed += input_ids.numel()

	        self.optimizer.step()
	        self.global_step += 1

	        return mean_loss

	    def _get_fallback_text(self):
	        """Return the small built-in corpus used when no usable source is given."""
	        return """
	Distributed systems allow multiple computer networks to collaborate and compute large workloads together.
	Transformer neural networks are highly scalable attention-based models that form the backbone of modern Generative AI.
	Pre-training involves training large language models on large corpora of text datasets, teaching them syntax, logic, and base knowledge.
	Fine-tuning adapts these models to specific downstream tasks, like customer support, coding assistance, or instruction following.
	This framework is an advanced, intelligent tool that makes it incredibly easy to load, adapt, and serve open-source LLMs.
	"""

	    def fit_generator(self, dataset_source: str, max_steps: int = 50, callback=None):
	        """Run the fine-tuning loop, yielding a metrics dict after every step.

	        The prepared windows are consumed in order and recycled from the start
	        once exhausted. If the corpus holds fewer windows than one full batch
	        (``batch_size * grad_acc_steps``), every step trains on all of them.

	        Args:
	            dataset_source: Passed unchanged to ``prepare_dataset``.
	            max_steps: Number of optimizer steps to run.
	            callback: Optional callable invoked with each metrics dict before
	                it is yielded.

	        Yields:
	            ``{"status": "error", "message": ...}`` once if no windows could be
	            prepared; otherwise one dict per step with the keys ``step``,
	            ``max_steps``, ``loss``, ``speed``, ``tokens``, ``elapsed`` and
	            ``memory``.
	        """
	        chunks = self.prepare_dataset(dataset_source)
	        if not chunks:
	            yield {"status": "error", "message": "Dataset preparation failed."}
	            return

	        total_chunks = len(chunks)
	        # Clamp to the corpus size: asking for more windows than exist could
	        # never be satisfied and would leave the loop spinning without yielding.
	        batch_capacity = min(self.batch_size * self.grad_acc_steps, total_chunks)

	        chunk_idx = 0
	        start_time = time.perf_counter()

	        for step in range(1, max_steps + 1):
	            # Wrap around once the remaining windows cannot fill a batch.
	            if chunk_idx + batch_capacity > total_chunks:
	                chunk_idx = 0

	            batch_chunks = chunks[chunk_idx : chunk_idx + batch_capacity]
	            chunk_idx += batch_capacity

	            tokens_before = self.tokens_processed
	            step_start = time.perf_counter()
	            loss = self.train_step(batch_chunks)
	            step_duration = time.perf_counter() - step_start

	            # Measure throughput from the tokens actually processed, and guard
	            # against a zero-length interval on coarse clocks.
	            step_tokens = self.tokens_processed - tokens_before
	            tokens_per_sec = step_tokens / step_duration if step_duration > 0 else 0.0
	            elapsed = time.perf_counter() - start_time

	            metrics = {
	                "step": step,
	                "max_steps": max_steps,
	                "loss": round(loss, 4),
	                "speed": f"{tokens_per_sec:.1f} tokens/s",
	                "tokens": self.tokens_processed,
	                "elapsed": f"{elapsed:.1f}s",
	                "memory": f"{torch.cuda.memory_reserved() / 1e9:.2f}GB" if torch.cuda.is_available() else "0.00GB"
	            }

	            if callback:
	                callback(metrics)

	            yield metrics