Update app.py

app.py CHANGED

@@ -225,69 +225,64 @@ def log_message(output_log, msg):
 def train_model(
     base_model: str,
     dataset_name: str,
+    hf_repo: str,
     num_epochs: int = 1,
-    batch_size: int =
-    learning_rate: float =
-    hf_repo: str = None,
+    batch_size: int = 2,
+    learning_rate: float = 5e-4,
 ):
     output_log = []
 
     try:
-        log_message(output_log, "🚀
+        log_message(output_log, "🚀 Starting FAST test training...")
 
-        # ===== Device
+        # ===== Device =====
         device = "cuda" if torch.cuda.is_available() else "cpu"
         dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
         log_message(output_log, f"🎮 Device: {device}, dtype: {dtype}")
-
         if device == "cuda":
-            gpu_name = torch.cuda.get_device_name(0)
-            gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
-            log_message(output_log, f"✅ GPU: {gpu_name} ({gpu_mem:.1f} GB)")
+            log_message(output_log, f"✅ GPU: {torch.cuda.get_device_name(0)}")
 
-        # ===== Load dataset
+        # ===== Load dataset =====
         log_message(output_log, f"\n📂 Loading dataset: {dataset_name}")
         dataset = load_dataset(dataset_name)
         dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
-        train_dataset = dataset["train"]
-        test_dataset = dataset["test"]
-
+        train_dataset, test_dataset = dataset["train"], dataset["test"]
+
+        # ===== ⚡ FAST mode: use small subset =====
+        train_dataset = train_dataset.select(range(min(100, len(train_dataset))))
+        test_dataset = test_dataset.select(range(min(20, len(test_dataset))))
+        log_message(output_log, f"⚡ Using {len(train_dataset)} train / {len(test_dataset)} test samples")
 
-        # ===== Format
+        # ===== Format samples =====
         def format_example(example):
             short_prompt = example.get("short", "").strip()
             long_response = example.get("long", "").strip()
-
-            prompt = (
-
-                f"<|user|>\nShort: {short_prompt}\n"
-                f"<|assistant|>\n{long_response}"
-            )
-            return {"text": prompt}
+            return {
+                "text": (
+                    f"<|system|>\nYou are an AI that expands short prompts into detailed, descriptive ones.\n"
+                    f"<|user|>\nShort: {short_prompt}\n"
+                    f"<|assistant|>\n{long_response}"
+                )
+            }
 
-        train_dataset = train_dataset.map(format_example, num_proc=1)
-        test_dataset = test_dataset.map(format_example, num_proc=1)
+        train_dataset = train_dataset.map(format_example)
+        test_dataset = test_dataset.map(format_example)
 
-        # =====
-        log_message(output_log, f"\n🤖 Loading base model: {base_model}")
+        # ===== Tokenizer & Model =====
         tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
         model = AutoModelForCausalLM.from_pretrained(
             base_model,
-            torch_dtype=dtype,
             trust_remote_code=True,
-
+            torch_dtype=dtype,
             device_map="auto" if device == "cuda" else None,
+            low_cpu_mem_usage=True,
         )
-
-        # Enable memory optimizations
         model.gradient_checkpointing_enable()
-        log_message(output_log, "✅ Model loaded with gradient checkpointing")
 
-        # =====
-        log_message(output_log, "\n⚙️ Applying LoRA fine-tuning config...")
+        # ===== LoRA setup =====
         lora_config = LoraConfig(
             task_type=TaskType.CAUSAL_LM,
             r=4,

@@ -297,46 +292,39 @@ def train_model(
             bias="none",
         )
         model = get_peft_model(model, lora_config)
-        log_message(output_log,
+        log_message(output_log, "✅ LoRA applied successfully")
 
-        # ===== Tokenization
+        # ===== Tokenization =====
         def tokenize_fn(examples):
             tokenized = tokenizer(
                 examples["text"],
                 padding="max_length",
                 truncation=True,
-                max_length=
+                max_length=256,
             )
             tokenized["labels"] = tokenized["input_ids"].copy()
             return tokenized
 
         train_dataset = train_dataset.map(tokenize_fn, batched=True)
         test_dataset = test_dataset.map(tokenize_fn, batched=True)
-        log_message(output_log, "✅ Tokenization done")
 
         # ===== Training setup =====
-        output_dir = "./
+        output_dir = "./prompt_expander_fast"
         os.makedirs(output_dir, exist_ok=True)
 
-        # Automatically reduce batch size for low GPU VRAM
-        if device == "cuda" and gpu_mem < 10:
-            batch_size = 1
-            log_message(output_log, f"⚠️ GPU memory low → Using batch_size={batch_size}")
-
         training_args = TrainingArguments(
             output_dir=output_dir,
             num_train_epochs=num_epochs,
             per_device_train_batch_size=batch_size,
-            gradient_accumulation_steps=
-            warmup_steps=
-            logging_steps=
-            save_strategy="
-            optim="adamw_torch",
-            learning_rate=learning_rate,
+            gradient_accumulation_steps=2,
+            warmup_steps=5,
+            logging_steps=5,
+            save_strategy="no",  # don't save checkpoints
             fp16=(dtype == torch.float16),
             bf16=(dtype == torch.bfloat16),
-
+            learning_rate=learning_rate,
             report_to="none",
+            optim="adamw_torch",
         )
 
         trainer = Trainer(

@@ -348,25 +336,26 @@ def train_model(
         )
 
         # ===== Train =====
-        log_message(output_log, "\n🔥
+        log_message(output_log, "\n🔥 Quick training started...")
         trainer.train()
 
-        # ===== Save =====
-        log_message(output_log, "\n💾 Saving fine-tuned model...")
-
+        # ===== Save + Upload =====
+        log_message(output_log, "\n💾 Saving fast fine-tuned model...")
+        model.save_pretrained(output_dir)
         tokenizer.save_pretrained(output_dir)
 
-
-
-
-
+        log_message(output_log, f"☁️ Uploading model to {hf_repo} ...")
+        upload_folder(
+            repo_id=hf_repo,
+            folder_path=output_dir,
+            repo_type="model",
+            commit_message="Quick test fine-tune upload",
+        )
 
-        log_message(output_log, "\n✅
+        log_message(output_log, "\n✅ FAST training completed successfully!")
 
-    except torch.cuda.OutOfMemoryError:
-        log_message(output_log, "\n❌ CUDA OOM → try lowering batch size or sequence length.")
     except Exception as e:
-        log_message(output_log, f"
+        log_message(output_log, f"❌ Error: {e}")
 
     return "\n".join(output_log)

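For context: the hunks reference names whose imports sit outside this diff. Judging from the calls visible above (a reconstruction for orientation, not part of the change itself), the top of app.py needs roughly:

    import os

    import torch
    from datasets import load_dataset
    from huggingface_hub import upload_folder
    from peft import LoraConfig, TaskType, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
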
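Note the breaking signature change: hf_repo moves from an optional keyword (`hf_repo: str = None`) to a required positional parameter, so any caller that previously omitted it will now raise a TypeError. A minimal calling sketch, where the model id, dataset name, and repo id are placeholders rather than values from this Space:

    log = train_model(
        base_model="gpt2",                                # placeholder model id
        dataset_name="your-username/short-long-prompts",  # placeholder dataset
        hf_repo="your-username/prompt-expander-test",     # placeholder target repo
        num_epochs=1,
    )
    print(log)
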
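The new FAST mode trims both splits with datasets.Dataset.select, which keeps only the rows at the given indices; wrapping the cap in min() avoids indexing past the end of a dataset smaller than the cap. The pattern in isolation:

    from datasets import Dataset

    ds = Dataset.from_dict({"short": ["a cat"], "long": ["a fluffy tabby cat on a windowsill"]})

    # Keep at most 100 rows, without overrunning a smaller dataset.
    subset = ds.select(range(min(100, len(ds))))
    print(len(subset))  # 1 here; 100 for a large dataset
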
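The gap between the first two hunks elides the middle of the LoraConfig, so only task_type, r=4, and bias="none" are visible here. For orientation only, a config of this shape usually also sets lora_alpha, lora_dropout, and target_modules; the values below are hypothetical, not what app.py contains:

    from peft import LoraConfig, TaskType

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=4,                                  # visible in the diff
        lora_alpha=16,                        # hypothetical
        lora_dropout=0.05,                    # hypothetical
        target_modules=["q_proj", "v_proj"],  # hypothetical; depends on the base model
        bias="none",                          # visible in the diff
    )
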
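One caveat on the tokenization step: labels are a straight copy of input_ids while padding="max_length" pads every row, so the loss is also computed over pad tokens. The Trainer ignores label positions set to -100, so a common refinement (a sketch, not what this commit does) is to mask them:

    def tokenize_fn(examples):
        tokenized = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=256,
        )
        # Copy input_ids, then mask padding so it doesn't contribute to the loss.
        labels = [ids.copy() for ids in tokenized["input_ids"]]
        for row, mask in zip(labels, tokenized["attention_mask"]):
            for i, m in enumerate(mask):
                if m == 0:
                    row[i] = -100
        tokenized["labels"] = labels
        return tokenized
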
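The Trainer construction itself falls in the gap between the last two hunks, so its arguments are not shown. Given the objects prepared above, the usual wiring would look roughly like this (a sketch of the conventional call, not the verbatim code):

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
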
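upload_folder assumes the target repo already exists and that a token with write access is available (for a Space, typically an HF_TOKEN secret; locally, huggingface-cli login). A slightly more defensive sketch creates the repo first:

    from huggingface_hub import create_repo, upload_folder

    create_repo(hf_repo, repo_type="model", exist_ok=True)  # no-op if it already exists
    upload_folder(
        repo_id=hf_repo,
        folder_path=output_dir,
        repo_type="model",
        commit_message="Quick test fine-tune upload",
    )
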
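Finally, the hand-rolled <|system|>/<|user|>/<|assistant|> markers only help if the base model was trained on that exact format. For a base model that ships a chat template, tokenizer.apply_chat_template produces the matching format instead; an alternative sketch (assuming the tokenizer from the function above, and not what this commit does):

    def format_example(example):
        messages = [
            {"role": "system", "content": "You are an AI that expands short prompts into detailed, descriptive ones."},
            {"role": "user", "content": f"Short: {example.get('short', '').strip()}"},
            {"role": "assistant", "content": example.get("long", "").strip()},
        ]
        return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}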