license: ncsa
datasets:
- nvidia/OpenCodeReasoning
language:
- vi
metrics:
- accuracy
base_model:
- sentence-transformers/all-mpnet-base-v2
new_version: nari-labs/Dia-1.6B
pipeline_tag: text-classification
library_name: adapter-transformers
tags:
- code ---!pip install sentence-transformers transformers datasets torch -q !pip install --upgrade transformers !rm -rf /content/drive/AI_CAPSTONE_2 from google.colab import drive drive.mount('/content/drive') import pandas as pd data = pd.read_csv('/content/drive/MyDrive/AI_CAPSTONE_2/data1.csv', sep=',', encoding='utf-8-sig')
Hiển thị kiểm tra
print(data.head()) print(data.columns) print(data.columns) from datasets import Dataset
Đảm bảo 2 cột đúng tên
dataset = Dataset.from_pandas(data[['input', 'output']]) from transformers import T5Tokenizer, T5ForConditionalGeneration
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def preprocess_function(example):
    """Tokenize one ``input``/``output`` row into seq2seq training features.

    The source text is prefixed with ``"command: "`` and padded/truncated to
    64 tokens; the target text is padded/truncated to 32 tokens. A missing
    (``None``) input or output is treated as an empty string.

    Args:
        example: A mapping with ``'input'`` and ``'output'`` entries.

    Returns:
        The tokenizer output for the source text with an added ``"labels"``
        list holding the target token ids, where padding positions are set
        to -100.
    """
    # Fall back to empty strings when either field is None.
    source_text = "command: " + (example['input'] if example['input'] is not None else "")
    target_text = example['output'] if example['output'] is not None else ""

    model_input = tokenizer(source_text, truncation=True, padding='max_length', max_length=64)
    target = tokenizer(target_text, truncation=True, padding='max_length', max_length=32)

    # Targets are padded to a fixed length. Replace pad ids with -100 so the
    # cross-entropy loss ignores padding instead of rewarding the model for
    # predicting pad tokens.
    pad_id = tokenizer.pad_token_id
    model_input["labels"] = [
        token_id if token_id != pad_id else -100
        for token_id in target["input_ids"]
    ]
    return model_input
# Tokenize every row, then hold out 10% of the rows for evaluation.
tokenized_dataset = dataset.map(preprocess_function).train_test_split(test_size=0.1)
train_dataset, eval_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

from transformers import T5ForConditionalGeneration, T5Tokenizer

# Initialise the tokenizer and model.
# NOTE(review): this rebinds `tokenizer` and `model` to the "t5-base"
# checkpoint after the dataset above has already been tokenized.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
# (Assumes train_dataset and eval_dataset have also been defined.)
from transformers import TrainingArguments, Trainer

# Training configuration. Checkpoints are written to the Drive folder once
# per epoch (at most two are kept) and the loss is logged every 50 steps.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/AI_CAPSTONE_2/t5-navigation-model",
    # Without an evaluation strategy `eval_steps` has no effect and
    # `eval_dataset` is never evaluated. Enable step-based evaluation so the
    # held-out split is scored every 500 steps.
    # (transformers < 4.41 spells this argument `evaluation_strategy`.)
    eval_strategy="steps",
    eval_steps=500,
    learning_rate=2e-4,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="/content/logs",
    logging_steps=50,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
trainer.train()

# Log pasted from the original run (recorded before step-based evaluation
# was enabled, so only the training loss appears):
# [1830/1830 9:13:54, Epoch 10/10]
# Step  Training Loss
#   50  1.421000 |  100  0.123900 |  150  0.082400 |  200  0.060600
#  250  0.042900 |  300  0.033500 |  350  0.032200 |  400  0.022700
#  450  0.019900 |  500  0.018800 |  550  0.018900 |  600  0.014100
#  650  0.010800 |  700  0.012500 |  750  0.011200 |  800  0.009500
#  850  0.009600 |  900  0.010300 |  950  0.006100 | 1000  0.007000
# 1050  0.005400 | 1100  0.007000 | 1150  0.004800 | 1200  0.005000
# 1250  0.005100 | 1300  0.004000 | 1350  0.004900 | 1400  0.005100
# 1450  0.004400 | 1500  0.002800 | 1550  0.003800 | 1600  0.003500
# 1650  0.002500 | 1700  0.002900 | 1750  0.002800 | 1800  0.002200

# Persist the final weights and tokenizer files to the Drive folder.
model.save_pretrained("/content/drive/MyDrive/AI_CAPSTONE_2/t5-navigation-model")
tokenizer.save_pretrained("/content/drive/MyDrive/AI_CAPSTONE_2/t5-navigation-model")

from transformers import T5Tokenizer, T5ForConditionalGeneration
# Reload the fine-tuned model.
model_path = "/content/drive/MyDrive/AI_CAPSTONE_2/t5-navigation-model"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)


def convert_to_function(sentence):
    """Map a Vietnamese voice command to a function-call string by keyword.

    The sentence is lower-cased and checked against a fixed, ordered list of
    keyword phrases; the first phrase found decides the result. For the
    "phát nhạc" and "tìm" commands, whatever remains of the sentence after
    removing the keyword (stripped of surrounding whitespace) is used as the
    argument. This function is purely rule-based and does not call the model.

    Args:
        sentence: The command text.

    Returns:
        A string such as ``"play_music(edm)"`` or ``"exit_app()"``, or
        ``"unknown_command()"`` when no keyword matches or ``sentence`` is
        not a string.
    """
    import unicodedata

    if not isinstance(sentence, str):
        return "unknown_command()"

    def _nfc(text):
        # Vietnamese text may arrive with decomposed diacritics (NFD).
        # Compose both sides so visually identical strings compare equal.
        return unicodedata.normalize("NFC", text)

    sentence = _nfc(sentence.lower())

    # (keyword, function name, takes the rest of the sentence as argument).
    # Order matters: earlier rules win when several keywords are present.
    rules = (
        ("phát nhạc", "play_music", True),
        ("bật nhận diện", "start_recognition", False),
        ("mô tả xung quanh", "describe_surroundings", False),
        ("ai đang đứng trước mặt", "identify_person", False),
        ("tìm", "find_object", True),
        ("tăng âm lượng", "increase_volume", False),
        ("thoát ứng dụng", "exit_app", False),
    )

    for keyword, function_name, takes_argument in rules:
        keyword = _nfc(keyword)
        if keyword not in sentence:
            continue
        if takes_argument:
            # Use the text left over after dropping the keyword, e.g. the
            # music genre following "phát nhạc".
            argument = sentence.replace(keyword, "").strip()
            return f"{function_name}({argument})"
        return f"{function_name}()"

    return "unknown_command()"
# Example test sentences, one per supported command.
test_sentences = [
    "Phát nhạc edm",
    "Mô tả xung quanh giúp tôi với",
    "Bật nhận diện lên",
    "Ai đang đứng trước mặt tôi?",
    "Tìm cái điện thoại cho tôi",
    "Tăng âm lượng lên",
    "Tôi muốn thoát ứng dụng",
]

# Print each sentence with its converted command, followed by a separator.
for sentence in test_sentences:
    command = convert_to_function(sentence)
    print(f"Input: {sentence}", f"Output: {command}", "-" * 30, sep="\n")
- Downloads last month
- -