likhonsheikhdev committed on
Commit
3190ed7
·
verified ·
1 Parent(s): f538642

Create pipeline.sh

Browse files
Files changed (1) hide show
  1. pipeline.sh +93 -0
pipeline.sh ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# =============================================================================
# BENGALI-CODE LLM - DEV PIPELINE SCRIPT
# =============================================================================
# This script is designed to run in the resource-constrained Hugging Face Space.
#
# It is an end-to-end smoke test that, relative to the current directory:
#   1. writes a tiny mixed Bengali / English / Python sample corpus,
#   2. splits it into train and validation files,
#   3. trains a SentencePiece BPE tokenizer on the train split,
#   4. generates scripts/train_dev.py and trains a 2-layer GPT-2-style model
#      for one epoch.
#
# Requires: python3 with sentencepiece, torch, transformers and datasets.
set -euo pipefail

echo "🚀 Initializing Dev Pipeline..."

# --- Configuration ---
# Upper bound on the tokenizer vocabulary (smaller vocab for faster dev run).
# The sample corpus is far too small to yield this many pieces, so the
# tokenizer step treats it as a soft limit.
readonly VOCAB_SIZE=16000

# --- Create Directory Structure ---
mkdir -p -- \
  data/raw data/processed \
  tokenizer models checkpoints results logs scripts configs

# --- 1. Data Collection (Sample Data) ---
echo "📚 Step 1: Creating a small sample dataset..."
# Quoted delimiter: the sample text is written literally, with no expansion.
cat > data/raw/sample_data.txt <<'EOF'
আমার সোনার বাংলা, আমি তোমায় ভালোবাসি।
The quick brown fox jumps over the lazy dog.
def factorial(n):
    # This function calculates the factorial of a number
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)
import math
print(math.pi)
EOF
echo "✅ Sample dataset created."

# --- 2. Preprocessing & Tokenizer Training ---
echo "🧹 Step 2: Preprocessing data..."
cat -- data/raw/*.txt > data/processed/combined.txt
# First three lines become the train split, everything after is validation.
head -n 3 data/processed/combined.txt > data/processed/train.txt
tail -n +4 data/processed/combined.txt > data/processed/validation.txt
echo "✅ Data preprocessed."

echo "🔤 Step 3: Training tokenizer..."
# The vocab size is handed over through the environment and the heredoc is
# quoted, so the shell never rewrites the Python source.
PIPELINE_VOCAB_SIZE="${VOCAB_SIZE}" python3 - <<'PY'
import os

import sentencepiece as spm

os.makedirs('tokenizer', exist_ok=True)
spm.SentencePieceTrainer.train(
    input='data/processed/train.txt',
    model_prefix='tokenizer/bengali_code_dev',
    vocab_size=int(os.environ['PIPELINE_VOCAB_SIZE']),
    model_type='bpe',
    pad_id=0, unk_id=1, bos_id=2, eos_id=3,
    # The three-line corpus cannot produce vocab_size pieces; without a soft
    # limit SentencePiece aborts with "Vocabulary size too high".
    hard_vocab_limit=False,
)
PY
echo "✅ Tokenizer trained."

# --- 3. Model Training (Tiny Dev Model) ---
echo "🧠 Step 4: Configuring and Training Tiny Model..."
cat > scripts/train_dev.py <<'EOF'
import torch
import sentencepiece as spm
from transformers import AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset


class Tokenizer:
    """Thin SentencePiece wrapper exposing only what this script needs."""

    def __init__(self, path):
        self.sp = spm.SentencePieceProcessor(model_file=path)

    def __call__(self, t, **k):
        return {'input_ids': self.sp.encode(t, out_type=int)}

    def decode(self, ids, **k):
        return self.sp.decode(ids)

    @property
    def vocab_size(self):
        return self.sp.vocab_size()

    @property
    def pad_token_id(self):
        return self.sp.pad_id()


tokenizer = Tokenizer(path="tokenizer/bengali_code_dev.model")


def collate(examples):
    """Right-pad a batch of token-id lists and build causal-LM labels.

    The wrapper above has no ``pad`` method, so the stock
    DataCollatorForLanguageModeling cannot be used with it.
    Padded positions get label -100 so the loss ignores them.
    """
    lengths = [len(e["input_ids"]) for e in examples]
    width = max(lengths)
    input_ids = torch.full((len(examples), width), tokenizer.pad_token_id, dtype=torch.long)
    attention_mask = torch.zeros((len(examples), width), dtype=torch.long)
    for row, example in enumerate(examples):
        n = lengths[row]
        input_ids[row, :n] = torch.tensor(example["input_ids"], dtype=torch.long)
        attention_mask[row, :n] = 1
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


dataset = load_dataset("text", data_files={"train": "data/processed/train.txt", "validation": "data/processed/validation.txt"})
tokenized_ds = dataset.map(lambda e: tokenizer(e["text"]), remove_columns=["text"])

config = AutoConfig.from_pretrained("gpt2", vocab_size=tokenizer.vocab_size, n_layer=2, n_head=2, n_embd=128)
model = AutoModelForCausalLM.from_config(config)
print(f"✅ Tiny model created with ~{sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters.")

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=1, report_to="none"),
    train_dataset=tokenized_ds["train"], eval_dataset=tokenized_ds["validation"],
    # The SentencePiece wrapper is not a Hugging Face tokenizer, so it is not
    # passed to Trainer; batching is handled entirely by collate().
    data_collator=collate,
)
print("🚀 Starting training...")
trainer.train()
print("✅ Training complete.")
EOF

python3 scripts/train_dev.py

echo "🎉 PIPELINE COMPLETED SUCCESSFULLY!"