Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -47,13 +47,11 @@ def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.P
|
|
| 47 |
|
| 48 |
# ==== Format data ====
|
| 49 |
def format_example(item):
|
| 50 |
-
# Use "text" or "content" column if available
|
| 51 |
text = (
|
| 52 |
item.get("text")
|
| 53 |
or item.get("content")
|
| 54 |
or " ".join(str(v) for v in item.values())
|
| 55 |
)
|
| 56 |
-
|
| 57 |
prompt = f"""<|system|>
|
| 58 |
You are a wise teacher interpreting Bhagavad Gita with deep insights.
|
| 59 |
<|user|>
|
|
@@ -65,7 +63,7 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
|
|
| 65 |
dataset = dataset.map(format_example)
|
| 66 |
output_log.append(f" ✅ Formatted {len(dataset)} examples")
|
| 67 |
|
| 68 |
-
# ==== Model ====
|
| 69 |
progress(0.3, desc="Loading model & tokenizer...")
|
| 70 |
model_name = "Qwen/Qwen2.5-0.5B"
|
| 71 |
output_log.append(f"\n🤖 Loading model: {model_name}")
|
|
@@ -82,7 +80,6 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
|
|
| 82 |
)
|
| 83 |
if device == "cuda":
|
| 84 |
model = model.to(device)
|
| 85 |
-
|
| 86 |
output_log.append(" ✅ Model loaded successfully")
|
| 87 |
|
| 88 |
# ==== LoRA ====
|
|
@@ -102,18 +99,21 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
|
|
| 102 |
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 103 |
output_log.append(f" Trainable params: {trainable_params:,}")
|
| 104 |
|
| 105 |
-
# ==== Tokenization ====
|
| 106 |
progress(0.5, desc="Tokenizing dataset...")
|
| 107 |
def tokenize_fn(examples):
|
| 108 |
-
|
| 109 |
examples["text"],
|
| 110 |
padding="max_length",
|
| 111 |
truncation=True,
|
| 112 |
max_length=256,
|
| 113 |
)
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
dataset = dataset.map(tokenize_fn, batched=True)
|
| 116 |
-
output_log.append(" ✅ Tokenization done")
|
| 117 |
|
| 118 |
# ==== Training arguments ====
|
| 119 |
progress(0.6, desc="Setting up training...")
|
|
@@ -144,6 +144,7 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
|
|
| 144 |
output_log.append("\n🚀 Starting training...\n" + "=" * 50)
|
| 145 |
train_result = trainer.train()
|
| 146 |
|
|
|
|
| 147 |
progress(0.85, desc="Saving model...")
|
| 148 |
output_log.append("\n💾 Saving model locally...")
|
| 149 |
trainer.save_model(output_dir)
|
|
@@ -157,10 +158,8 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
|
|
| 157 |
api = HfApi()
|
| 158 |
token = HfFolder.get_token()
|
| 159 |
|
| 160 |
-
# Create repo if not exists
|
| 161 |
api.create_repo(repo_id=hf_repo, exist_ok=True)
|
| 162 |
|
| 163 |
-
# Clone & push
|
| 164 |
with tempfile.TemporaryDirectory() as tmpdir:
|
| 165 |
repo = Repository(local_dir=tmpdir, clone_from=hf_repo, use_auth_token=token)
|
| 166 |
shutil.copytree(output_dir, tmpdir, dirs_exist_ok=True)
|
|
|
|
| 47 |
|
| 48 |
# ==== Format data ====
|
| 49 |
def format_example(item):
|
|
|
|
| 50 |
text = (
|
| 51 |
item.get("text")
|
| 52 |
or item.get("content")
|
| 53 |
or " ".join(str(v) for v in item.values())
|
| 54 |
)
|
|
|
|
| 55 |
prompt = f"""<|system|>
|
| 56 |
You are a wise teacher interpreting Bhagavad Gita with deep insights.
|
| 57 |
<|user|>
|
|
|
|
| 63 |
dataset = dataset.map(format_example)
|
| 64 |
output_log.append(f" ✅ Formatted {len(dataset)} examples")
|
| 65 |
|
| 66 |
+
# ==== Model & Tokenizer ====
|
| 67 |
progress(0.3, desc="Loading model & tokenizer...")
|
| 68 |
model_name = "Qwen/Qwen2.5-0.5B"
|
| 69 |
output_log.append(f"\n🤖 Loading model: {model_name}")
|
|
|
|
| 80 |
)
|
| 81 |
if device == "cuda":
|
| 82 |
model = model.to(device)
|
|
|
|
| 83 |
output_log.append(" ✅ Model loaded successfully")
|
| 84 |
|
| 85 |
# ==== LoRA ====
|
|
|
|
| 99 |
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 100 |
output_log.append(f" Trainable params: {trainable_params:,}")
|
| 101 |
|
| 102 |
+
# ==== Tokenization + Labels ====
|
| 103 |
progress(0.5, desc="Tokenizing dataset...")
|
| 104 |
def tokenize_fn(examples):
    """Tokenize a batch of formatted examples and attach causal-LM labels.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; its
            ``"text"`` entry is passed to the tokenizer as-is.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels mirror
        ``input_ids`` but padded positions are replaced by ``-100`` so they
        do not contribute to the training loss.
    """
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

    input_ids = tokenized["input_ids"]
    if "attention_mask" in tokenized:
        # Every sequence is padded to max_length, so a plain copy of
        # input_ids would train the model to predict padding. Mask by
        # attention_mask rather than by pad-token id, because the pad token
        # may be the same id as a real token (e.g. EOS).
        tokenized["labels"] = [
            [tok if keep else -100 for tok, keep in zip(ids, mask)]
            for ids, mask in zip(input_ids, tokenized["attention_mask"])
        ]
    else:
        # No mask available: fall back to an unmasked per-sequence copy.
        tokenized["labels"] = [list(ids) for ids in input_ids]
    return tokenized
|
| 114 |
|
| 115 |
dataset = dataset.map(tokenize_fn, batched=True)
|
| 116 |
+
output_log.append(" ✅ Tokenization + labels done")
|
| 117 |
|
| 118 |
# ==== Training arguments ====
|
| 119 |
progress(0.6, desc="Setting up training...")
|
|
|
|
| 144 |
output_log.append("\n🚀 Starting training...\n" + "=" * 50)
|
| 145 |
train_result = trainer.train()
|
| 146 |
|
| 147 |
+
# ==== Save model ====
|
| 148 |
progress(0.85, desc="Saving model...")
|
| 149 |
output_log.append("\n💾 Saving model locally...")
|
| 150 |
trainer.save_model(output_dir)
|
|
|
|
| 158 |
api = HfApi()
|
| 159 |
token = HfFolder.get_token()
|
| 160 |
|
|
|
|
| 161 |
api.create_repo(repo_id=hf_repo, exist_ok=True)
|
| 162 |
|
|
|
|
| 163 |
with tempfile.TemporaryDirectory() as tmpdir:
|
| 164 |
repo = Repository(local_dir=tmpdir, clone_from=hf_repo, use_auth_token=token)
|
| 165 |
shutil.copytree(output_dir, tmpdir, dirs_exist_ok=True)
|