#!/usr/bin/env bash
# NOTE(review): removed stray "Spaces: Sleeping" residue from the Hugging Face
# Spaces web UI that was pasted above the script; it is not shell code.
set -euo pipefail
set -x  # Trace every command in the Space logs for easier debugging.

# =============================================================================
# BENGALI-CODE LLM - DEV PIPELINE SCRIPT
# =============================================================================
# Designed to run inside the resource-constrained Hugging Face Space.

echo "🚀 Initializing Dev Pipeline..."

# --- Configuration ---
VOCAB_SIZE=16000  # Smaller vocab for faster dev run
PROJECT_DIR="$(pwd)"

# --- Create Directory Structure ---
mkdir -p {data/{raw,processed},tokenizer,models,checkpoints,results,logs,scripts,configs}
# --- 1. Data Collection (Sample Data) ---
echo "📚 Step 1: Creating a small sample dataset..."

# Quoted 'EOF' delimiter: the heredoc body is written to disk verbatim,
# with no shell expansion.
cat > data/raw/sample_data.txt <<'EOF'
আমার সোনার বাংলা, আমি তোমায় ভালোবাসি।
The quick brown fox jumps over the lazy dog.
def factorial(n):
# This function calculates the factorial of a number
if n == 0:
return 1
else:
return n * factorial(n-1)
import math
print(math.pi)
EOF

echo "✅ Sample dataset created."
# --- 2. Preprocessing & Tokenizer Training ---
echo "🧹 Step 2: Preprocessing data..."

# Concatenate all raw text, then split: first 3 lines -> train,
# everything from line 4 onward -> validation.
cat data/raw/*.txt > data/processed/combined.txt
head -n 3 data/processed/combined.txt > data/processed/train.txt
tail -n +4 data/processed/combined.txt > data/processed/validation.txt

echo "✅ Data preprocessed."
echo "🔤 Step 3: Training tokenizer..."

# Train a BPE SentencePiece tokenizer on the dev train split.
# Unquoted heredoc delimiter so the shell expands ${VOCAB_SIZE} below.
python3 << EOF
import sentencepiece as spm
import os

os.makedirs('tokenizer', exist_ok=True)
spm.SentencePieceTrainer.train(
    input='data/processed/train.txt',
    model_prefix='tokenizer/bengali_code_dev',
    vocab_size=${VOCAB_SIZE},
    model_type='bpe',
    # BUGFIX: the dev corpus is only 3 lines, so SentencePiece cannot reach
    # the requested vocab size and aborts with "Vocabulary size too high".
    # hard_vocab_limit=False lets it shrink the vocab to what the data supports.
    hard_vocab_limit=False,
    # Mixed Bengali/English text: cover every character (default is 0.9995).
    character_coverage=1.0,
    pad_id=0, unk_id=1, bos_id=2, eos_id=3
)
EOF

echo "✅ Tokenizer trained."
# --- 3. Model Training (Tiny Dev Model) ---
echo "🧠 Step 4: Configuring and Training Tiny Model..."

# Quoted 'EOF': the Python source below is written to disk verbatim.
cat > scripts/train_dev.py << 'EOF'
"""Train a tiny GPT-2-style causal LM on the dev corpus (CPU-friendly)."""
import torch
import sentencepiece as spm
from transformers import AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset


class Tokenizer:
    """Minimal SentencePiece wrapper exposing only what this script needs."""

    def __init__(self, path):
        self.sp = spm.SentencePieceProcessor(model_file=path)

    def __call__(self, t, **k):
        return {'input_ids': self.sp.encode(t, out_type=int)}

    def decode(self, ids, **k):
        return self.sp.decode(ids)

    @property
    def vocab_size(self):
        return self.sp.vocab_size()

    @property
    def pad_token_id(self):
        return self.sp.pad_id()


tokenizer = Tokenizer(path="tokenizer/bengali_code_dev.model")

dataset = load_dataset("text", data_files={"train": "data/processed/train.txt", "validation": "data/processed/validation.txt"})
tokenized_ds = dataset.map(lambda e: tokenizer(e["text"]), remove_columns=["text"])
# Drop rows that tokenized to nothing (e.g. blank lines) so batches are never empty.
tokenized_ds = tokenized_ds.filter(lambda e: len(e["input_ids"]) > 0)

config = AutoConfig.from_pretrained("gpt2", vocab_size=tokenizer.vocab_size, n_layer=2, n_head=2, n_embd=128)
model = AutoModelForCausalLM.from_config(config)
print(f"✅ Tiny model created with ~{sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters.")


def collate(features):
    """Right-pad a batch of tokenized rows for causal LM training.

    BUGFIX: DataCollatorForLanguageModeling cannot be used here — it requires
    a PreTrainedTokenizerBase with a .pad() method, which the SentencePiece
    shim above does not provide, so training crashed on the first batch.
    Inputs are padded with pad_token_id; labels use -100 on padding so the
    loss ignores those positions.
    """
    max_len = max(len(f["input_ids"]) for f in features)
    pad = tokenizer.pad_token_id
    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for f in features:
        ids = list(f["input_ids"])
        n_pad = max_len - len(ids)
        batch["input_ids"].append(ids + [pad] * n_pad)
        batch["attention_mask"].append([1] * len(ids) + [0] * n_pad)
        batch["labels"].append(ids + [-100] * n_pad)
    return {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}


trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=1, report_to="none"),
    train_dataset=tokenized_ds["train"], eval_dataset=tokenized_ds["validation"],
    # NOTE: no tokenizer= kwarg — Trainer expects a PreTrainedTokenizerBase
    # there (e.g. for checkpoint saving); our shim is not one.
    data_collator=collate,
)
print("🚀 Starting training...")
trainer.train()
print("✅ Training complete.")
EOF
# --- 4. Run the training script ---
python3 scripts/train_dev.py

echo "🎉 PIPELINE COMPLETED SUCCESSFULLY!"