boltuix committed on
Commit
29d83cf
·
verified ·
1 Parent(s): 46e8c82

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +76 -72
README.md CHANGED
@@ -312,78 +312,82 @@ To adapt BERT-Lite for custom IoT tasks (e.g., specific smart home commands):
312
  1. **Prepare Dataset**: Collect labeled data (e.g., commands with intents or masked sentences).
313
  2. **Fine-Tune with Hugging Face**:
314
  ```python
315
- #!pip uninstall -y transformers torch datasets
316
- #!pip install transformers==4.44.2 torch==2.4.1 datasets==3.0.1
317
-
318
- import torch
319
- from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
320
- from datasets import Dataset
321
- import pandas as pd
322
-
323
- # 1. Prepare the sample IoT dataset
324
- data = {
325
- "text": [
326
- "Turn on the fan",
327
- "Switch off the light",
328
- "Invalid command",
329
- "Activate the air conditioner",
330
- "Turn off the heater",
331
- "Gibberish input"
332
- ],
333
- "label": [1, 1, 0, 1, 1, 0] # 1 for valid IoT commands, 0 for invalid
334
- }
335
- df = pd.DataFrame(data)
336
- dataset = Dataset.from_pandas(df)
337
-
338
- # 2. Load tokenizer and model
339
- model_name = "boltuix/bert-lite"
340
- tokenizer = BertTokenizer.from_pretrained(model_name)
341
- model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
342
-
343
- # 3. Tokenize the dataset
344
- def tokenize_function(examples):
345
- return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64) # Short max_length for IoT commands
346
-
347
- tokenized_dataset = dataset.map(tokenize_function, batched=True)
348
-
349
- # 4. Set format for PyTorch
350
- tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
351
-
352
- # 5. Define training arguments
353
- training_args = TrainingArguments(
354
- output_dir="./bert_lite_results",
355
- num_train_epochs=5, # Increased epochs for small dataset
356
- per_device_train_batch_size=2,
357
- logging_dir="./bert_lite_logs",
358
- logging_steps=10,
359
- save_steps=100,
360
- evaluation_strategy="no",
361
- learning_rate=5e-5, # Adjusted for BERT-Lite
362
- )
363
-
364
- # 6. Initialize Trainer
365
- trainer = Trainer(
366
- model=model,
367
- args=training_args,
368
- train_dataset=tokenized_dataset,
369
- )
370
-
371
- # 7. Fine-tune the model
372
- trainer.train()
373
-
374
- # 8. Save the fine-tuned model
375
- model.save_pretrained("./fine_tuned_bert_lite")
376
- tokenizer.save_pretrained("./fine_tuned_bert_lite")
377
-
378
- # 9. Example inference
379
- text = "Turn on the light"
380
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
381
- model.eval()
382
- with torch.no_grad():
383
- outputs = model(**inputs)
384
- logits = outputs.logits
385
- predicted_class = torch.argmax(logits, dim=1).item()
386
- print(f"Predicted class for '{text}': {'Valid IoT Command' if predicted_class == 1 else 'Invalid Command'}")
 
 
 
 
387
  ```
388
  3. **Deploy**: Export the fine-tuned model to ONNX or TensorFlow Lite for edge devices.
389
 
 
312
  1. **Prepare Dataset**: Collect labeled data (e.g., commands with intents or masked sentences).
313
  2. **Fine-Tune with Hugging Face**:
314
  ```python
315
+ # !pip install transformers datasets torch --upgrade
316
+
317
+ import torch
318
+ from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
319
+ from datasets import Dataset
320
+ import pandas as pd
321
+
322
+ # 1. Prepare the sample IoT dataset
323
+ data = {
324
+ "text": [
325
+ "Turn on the fan",
326
+ "Switch off the light",
327
+ "Invalid command",
328
+ "Activate the air conditioner",
329
+ "Turn off the heater",
330
+ "Gibberish input"
331
+ ],
332
+ "label": [1, 1, 0, 1, 1, 0] # 1 = Valid command, 0 = Invalid
333
+ }
334
+ df = pd.DataFrame(data)
335
+ dataset = Dataset.from_pandas(df)
336
+
337
+ # 2. Load tokenizer and model
338
+ model_name = "boltuix/bert-lite" # Replace with any small/quantized BERT
339
+ tokenizer = BertTokenizer.from_pretrained(model_name)
340
+ model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
341
+
342
+ # 3. Tokenize the dataset
343
+ def tokenize_function(examples):
344
+ return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)
345
+
346
+ tokenized_dataset = dataset.map(tokenize_function, batched=True)
347
+
348
+ # 4. Manually convert columns to tensors (NumPy 2.0 safe)
349
+ tokenized_dataset = tokenized_dataset.map(lambda x: {
350
+ "input_ids": torch.tensor(x["input_ids"]),
351
+ "attention_mask": torch.tensor(x["attention_mask"]),
352
+ "label": torch.tensor(x["label"])
353
+ })
354
+
355
+ # 5. Define training arguments
356
+ training_args = TrainingArguments(
357
+ output_dir="./bert_lite_results",
358
+ num_train_epochs=5,
359
+ per_device_train_batch_size=2,
360
+ logging_dir="./bert_lite_logs",
361
+ logging_steps=10,
362
+ save_steps=100,
363
+ eval_strategy="no",
364
+ learning_rate=5e-5,
365
+ )
366
+
367
+ # 6. Initialize Trainer
368
+ trainer = Trainer(
369
+ model=model,
370
+ args=training_args,
371
+ train_dataset=tokenized_dataset,
372
+ )
373
+
374
+ # 7. Fine-tune the model
375
+ trainer.train()
376
+
377
+ # 8. Save the fine-tuned model
378
+ model.save_pretrained("./fine_tuned_bert_lite")
379
+ tokenizer.save_pretrained("./fine_tuned_bert_lite")
380
+
381
+ # 9. Inference example
382
+ text = "Turn on the light"
383
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
384
+ model.eval()
385
+ with torch.no_grad():
386
+ outputs = model(**inputs)
387
+ logits = outputs.logits
388
+ predicted_class = torch.argmax(logits, dim=1).item()
389
+
390
+ print(f"Predicted class for '{text}': {'✅ Valid IoT Command' if predicted_class == 1 else '❌ Invalid Command'}")
391
  ```
392
  3. **Deploy**: Export the fine-tuned model to ONNX or TensorFlow Lite for edge devices.
393