rahul7star committed on
Commit
ae9ce4a
·
verified ·
1 Parent(s): 138479c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -9
app.py CHANGED
@@ -47,13 +47,11 @@ def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.P
47
 
48
  # ==== Format data ====
49
  def format_example(item):
50
- # Use "text" or "content" column if available
51
  text = (
52
  item.get("text")
53
  or item.get("content")
54
  or " ".join(str(v) for v in item.values())
55
  )
56
-
57
  prompt = f"""<|system|>
58
  You are a wise teacher interpreting Bhagavad Gita with deep insights.
59
  <|user|>
@@ -65,7 +63,7 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
65
  dataset = dataset.map(format_example)
66
  output_log.append(f" ✅ Formatted {len(dataset)} examples")
67
 
68
- # ==== Model ====
69
  progress(0.3, desc="Loading model & tokenizer...")
70
  model_name = "Qwen/Qwen2.5-0.5B"
71
  output_log.append(f"\n🤖 Loading model: {model_name}")
@@ -82,7 +80,6 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
82
  )
83
  if device == "cuda":
84
  model = model.to(device)
85
-
86
  output_log.append(" ✅ Model loaded successfully")
87
 
88
  # ==== LoRA ====
@@ -102,18 +99,21 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
102
  trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
103
  output_log.append(f" Trainable params: {trainable_params:,}")
104
 
105
- # ==== Tokenization ====
106
  progress(0.5, desc="Tokenizing dataset...")
107
  def tokenize_fn(examples):
108
- return tokenizer(
109
  examples["text"],
110
  padding="max_length",
111
  truncation=True,
112
  max_length=256,
113
  )
 
 
 
114
 
115
  dataset = dataset.map(tokenize_fn, batched=True)
116
- output_log.append(" ✅ Tokenization done")
117
 
118
  # ==== Training arguments ====
119
  progress(0.6, desc="Setting up training...")
@@ -144,6 +144,7 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
144
  output_log.append("\n🚀 Starting training...\n" + "=" * 50)
145
  train_result = trainer.train()
146
 
 
147
  progress(0.85, desc="Saving model...")
148
  output_log.append("\n💾 Saving model locally...")
149
  trainer.save_model(output_dir)
@@ -157,10 +158,8 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
157
  api = HfApi()
158
  token = HfFolder.get_token()
159
 
160
- # Create repo if not exists
161
  api.create_repo(repo_id=hf_repo, exist_ok=True)
162
 
163
- # Clone & push
164
  with tempfile.TemporaryDirectory() as tmpdir:
165
  repo = Repository(local_dir=tmpdir, clone_from=hf_repo, use_auth_token=token)
166
  shutil.copytree(output_dir, tmpdir, dirs_exist_ok=True)
 
47
 
48
  # ==== Format data ====
49
  def format_example(item):
 
50
  text = (
51
  item.get("text")
52
  or item.get("content")
53
  or " ".join(str(v) for v in item.values())
54
  )
 
55
  prompt = f"""<|system|>
56
  You are a wise teacher interpreting Bhagavad Gita with deep insights.
57
  <|user|>
 
63
  dataset = dataset.map(format_example)
64
  output_log.append(f" ✅ Formatted {len(dataset)} examples")
65
 
66
+ # ==== Model & Tokenizer ====
67
  progress(0.3, desc="Loading model & tokenizer...")
68
  model_name = "Qwen/Qwen2.5-0.5B"
69
  output_log.append(f"\n🤖 Loading model: {model_name}")
 
80
  )
81
  if device == "cuda":
82
  model = model.to(device)
 
83
  output_log.append(" ✅ Model loaded successfully")
84
 
85
  # ==== LoRA ====
 
99
  trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
100
  output_log.append(f" Trainable params: {trainable_params:,}")
101
 
102
+ # ==== Tokenization + Labels ====
103
  progress(0.5, desc="Tokenizing dataset...")
104
  def tokenize_fn(examples):
105
+ tokenized = tokenizer(
106
  examples["text"],
107
  padding="max_length",
108
  truncation=True,
109
  max_length=256,
110
  )
111
+ # Add labels for causal LM
112
+ tokenized["labels"] = tokenized["input_ids"].copy()
113
+ return tokenized
114
 
115
  dataset = dataset.map(tokenize_fn, batched=True)
116
+ output_log.append(" ✅ Tokenization + labels done")
117
 
118
  # ==== Training arguments ====
119
  progress(0.6, desc="Setting up training...")
 
144
  output_log.append("\n🚀 Starting training...\n" + "=" * 50)
145
  train_result = trainer.train()
146
 
147
+ # ==== Save model ====
148
  progress(0.85, desc="Saving model...")
149
  output_log.append("\n💾 Saving model locally...")
150
  trainer.save_model(output_dir)
 
158
  api = HfApi()
159
  token = HfFolder.get_token()
160
 
 
161
  api.create_repo(repo_id=hf_repo, exist_ok=True)
162
 
 
163
  with tempfile.TemporaryDirectory() as tmpdir:
164
  repo = Repository(local_dir=tmpdir, clone_from=hf_repo, use_auth_token=token)
165
  shutil.copytree(output_dir, tmpdir, dirs_exist_ok=True)