#!/bin/bash
set -euo pipefail
set -x  # Print every command as it executes (useful for debugging inside the Space)
# =============================================================================
# BENGALI-CODE LLM - DEV PIPELINE SCRIPT
# =============================================================================
# This script is designed to run in the resource-constrained Hugging Face Space.
echo "🚀 Initializing Dev Pipeline..."
# --- Configuration ---
VOCAB_SIZE=16000 # Smaller vocab for faster dev run
PROJECT_DIR="$(pwd)"
# --- Create Directory Structure ---
mkdir -p {data/{raw,processed},tokenizer,models,checkpoints,results,logs,scripts,configs}
# --- 1. Data Collection (Sample Data) ---
echo "📚 Step 1: Creating a small sample dataset..."
cat > data/raw/sample_data.txt <<'EOF'
আমার সোনার বাংলা, আমি তোমায় ভালোবাসি।
The quick brown fox jumps over the lazy dog.
def factorial(n):
    # This function calculates the factorial of a number
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)
import math
print(math.pi)
EOF
echo "✅ Sample dataset created."
# --- 2. Preprocessing & Tokenizer Training ---
echo "🧹 Step 2: Preprocessing data..."
cat data/raw/*.txt > data/processed/combined.txt
# First 3 lines become the training split; the remainder is held out for validation.
head -n 3 data/processed/combined.txt > data/processed/train.txt
tail -n +4 data/processed/combined.txt > data/processed/validation.txt
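# Report the split sizes so the run log shows how much data each side gets (tiny by design).
wc -l data/processed/train.txt data/processed/validation.txt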
echo "✅ Data preprocessed."
echo "🔤 Step 3: Training tokenizer..."
python3 << EOF
import sentencepiece as spm
import os

os.makedirs('tokenizer', exist_ok=True)
spm.SentencePieceTrainer.train(
    input='data/processed/train.txt',
    model_prefix='tokenizer/bengali_code_dev',
    vocab_size=${VOCAB_SIZE},
    model_type='bpe',
    # The sample corpus is far too small to support the full vocab size, so treat it as a soft limit.
    hard_vocab_limit=False,
    pad_id=0, unk_id=1, bos_id=2, eos_id=3
)
EOF
echo "✅ Tokenizer trained."
# --- 3. Model Training (Tiny Dev Model) ---
echo "🧠 Step 4: Configuring and Training Tiny Model..."
cat > scripts/train_dev.py << 'EOF'
import torch
import sentencepiece as spm
from transformers import AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset

class Tokenizer:
    """Minimal SentencePiece wrapper exposing the pieces the Trainer and data collator need."""
    def __init__(self, path): self.sp = spm.SentencePieceProcessor(model_file=path)
    def __call__(self, t, **k): return {'input_ids': self.sp.encode(t, out_type=int)}
    def decode(self, ids, **k): return self.sp.decode(ids)
    def pad(self, examples, return_tensors=None, **k):
        # DataCollatorForLanguageModeling calls tokenizer.pad() to batch variable-length examples.
        seqs = [e['input_ids'] for e in examples]
        max_len = max(len(s) for s in seqs)
        input_ids = [s + [self.pad_token_id] * (max_len - len(s)) for s in seqs]
        attention_mask = [[1] * len(s) + [0] * (max_len - len(s)) for s in seqs]
        return {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)}
    @property
    def vocab_size(self): return self.sp.vocab_size()
    @property
    def pad_token_id(self): return self.sp.pad_id()

tokenizer = Tokenizer(path="tokenizer/bengali_code_dev.model")
dataset = load_dataset("text", data_files={"train": "data/processed/train.txt", "validation": "data/processed/validation.txt"})
tokenized_ds = dataset.map(lambda e: tokenizer(e["text"]), remove_columns=["text"])

# Tiny GPT-2-style config; special token ids match the SentencePiece settings used above.
config = AutoConfig.from_pretrained(
    "gpt2", vocab_size=tokenizer.vocab_size, n_layer=2, n_head=2, n_embd=128,
    bos_token_id=2, eos_token_id=3,
)
model = AutoModelForCausalLM.from_config(config)
print(f"✅ Tiny model created with ~{sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters.")

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=1, report_to="none"),
    train_dataset=tokenized_ds["train"], eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer, data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
print("🚀 Starting training...")
trainer.train()
print("✅ Training complete.")
EOF
python3 scripts/train_dev.py
echo "🎉 PIPELINE COMPLETED SUCCESSFULLY!"