Qwen-Training

Running

App Files Files Community

rahul7star committed on Oct 12

Commit

ae9ce4a

verified ·

1 Parent(s): 138479c

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -9

app.py CHANGED Viewed

@@ -47,13 +47,11 @@ def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.P
         # ==== Format data ====
         def format_example(item):
-            # Use "text" or "content" column if available
             text = (
                 item.get("text")
                 or item.get("content")
                 or " ".join(str(v) for v in item.values())
             )
             prompt = f"""<|system|>
 You are a wise teacher interpreting Bhagavad Gita with deep insights.
 <|user|>
@@ -65,7 +63,7 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
         dataset = dataset.map(format_example)
         output_log.append(f"   ✅ Formatted {len(dataset)} examples")
-        # ==== Model ====
         progress(0.3, desc="Loading model & tokenizer...")
         model_name = "Qwen/Qwen2.5-0.5B"
         output_log.append(f"\n🤖 Loading model: {model_name}")
@@ -82,7 +80,6 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
         )
         if device == "cuda":
             model = model.to(device)
         output_log.append("   ✅ Model loaded successfully")
         # ==== LoRA ====
@@ -102,18 +99,21 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
         trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
         output_log.append(f"   Trainable params: {trainable_params:,}")
-        # ==== Tokenization ====
         progress(0.5, desc="Tokenizing dataset...")
         def tokenize_fn(examples):
-            return tokenizer(
                 examples["text"],
                 padding="max_length",
                 truncation=True,
                 max_length=256,
             )
         dataset = dataset.map(tokenize_fn, batched=True)
-        output_log.append("   ✅ Tokenization done")
         # ==== Training arguments ====
         progress(0.6, desc="Setting up training...")
@@ -144,6 +144,7 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
         output_log.append("\n🚀 Starting training...\n" + "=" * 50)
         train_result = trainer.train()
         progress(0.85, desc="Saving model...")
         output_log.append("\n💾 Saving model locally...")
         trainer.save_model(output_dir)
@@ -157,10 +158,8 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
         api = HfApi()
         token = HfFolder.get_token()
-        # Create repo if not exists
         api.create_repo(repo_id=hf_repo, exist_ok=True)
-        # Clone & push
         with tempfile.TemporaryDirectory() as tmpdir:
             repo = Repository(local_dir=tmpdir, clone_from=hf_repo, use_auth_token=token)
             shutil.copytree(output_dir, tmpdir, dirs_exist_ok=True)

         # ==== Format data ====
         def format_example(item):
             text = (
                 item.get("text")
                 or item.get("content")
                 or " ".join(str(v) for v in item.values())
             )
             prompt = f"""<|system|>
 You are a wise teacher interpreting Bhagavad Gita with deep insights.
 <|user|>
         dataset = dataset.map(format_example)
         output_log.append(f"   ✅ Formatted {len(dataset)} examples")
+        # ==== Model & Tokenizer ====
         progress(0.3, desc="Loading model & tokenizer...")
         model_name = "Qwen/Qwen2.5-0.5B"
         output_log.append(f"\n🤖 Loading model: {model_name}")
         )
         if device == "cuda":
             model = model.to(device)
         output_log.append("   ✅ Model loaded successfully")
         # ==== LoRA ====
         trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
         output_log.append(f"   Trainable params: {trainable_params:,}")
+        # ==== Tokenization + Labels ====
         progress(0.5, desc="Tokenizing dataset...")
         def tokenize_fn(examples):
+            tokenized = tokenizer(
                 examples["text"],
                 padding="max_length",
                 truncation=True,
                 max_length=256,
             )
+            # Add labels for causal LM
+            tokenized["labels"] = tokenized["input_ids"].copy()
+            return tokenized
         dataset = dataset.map(tokenize_fn, batched=True)
+        output_log.append("   ✅ Tokenization + labels done")
         # ==== Training arguments ====
         progress(0.6, desc="Setting up training...")
         output_log.append("\n🚀 Starting training...\n" + "=" * 50)
         train_result = trainer.train()
+        # ==== Save model ====
         progress(0.85, desc="Saving model...")
         output_log.append("\n💾 Saving model locally...")
         trainer.save_model(output_dir)
         api = HfApi()
         token = HfFolder.get_token()
         api.create_repo(repo_id=hf_repo, exist_ok=True)
         with tempfile.TemporaryDirectory() as tmpdir:
             repo = Repository(local_dir=tmpdir, clone_from=hf_repo, use_auth_token=token)
             shutil.copytree(output_dir, tmpdir, dirs_exist_ok=True)