# debug_divas45model / model.py
# Uploaded by anitha2520 — "Update model.py" (commit 126842e, verified)
# (Hugging Face viewer header: raw / history / blame — 3.62 kB)
import torch
from datasets import load_dataset
from unsloth import (
    FastLanguageModel,
    UnslothTrainer,
    UnslothTrainingArguments,
    unsloth_train,
)
# --- Data -------------------------------------------------------------
# Training corpus: a JSON file of input/output sentence pairs.
file_path = "/content/debug_divas_dataset.json"  # Ensure the correct file path
dataset = load_dataset("json", data_files=file_path)

# --- Base model -------------------------------------------------------
# An instruction-tuned Mistral 7B is a good starting point for
# prompt-driven colloquial translation.
model_name = "unsloth/mistral-7b-instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=128,   # short sequences are enough for this dataset
    dtype=torch.float32,  # full precision to sidestep FP16 instability
    load_in_4bit=False,   # keep weights unquantized for precision
)
# Define preprocessing function for colloquial speech
def preprocess_function(examples):
    """
    Tokenize a batch of (input, output) pairs for instruction fine-tuning.

    Builds the same instruction prompt used at inference time, tokenizes it
    as model inputs, tokenizes the target text as labels, and masks label
    padding so it does not contribute to the loss.

    Args:
        examples: batch dict with "input" (English text) and "output"
            (Tamil text) lists, as produced by ``datasets.map(batched=True)``.

    Returns:
        dict with ``input_ids``, ``attention_mask``, and ``labels``.
    """
    inputs = tokenizer(
        [f"Convert the given English text into Tamil casual speech: {text}" for text in examples["input"]],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    labels = tokenizer(
        examples["output"], padding="max_length", truncation=True, max_length=128
    )
    # BUG FIX: replace pad-token ids in the labels with -100 (the
    # CrossEntropyLoss ignore_index) — the original passed raw padded ids
    # through, so the model was also trained to predict padding tokens.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return inputs
# Tokenize the whole corpus, dropping the raw text columns afterwards.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Hold out 20% for evaluation; the fixed seed keeps the split reproducible.
split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
test_dataset = split_datasets["test"]
# Initialize UnslothTrainer
trainer = UnslothTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    # BUG FIX: `args` must be a TrainingArguments-like object — Trainer
    # accesses it by attribute, so the plain dict the original passed would
    # raise AttributeError before training started.
    args=UnslothTrainingArguments(
        output_dir="./results",  # required by TrainingArguments; checkpoints land here
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,  # Increased for better colloquial adaptation
        learning_rate=2e-5,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        fp16=False,  # Avoiding mixed precision (model is loaded in float32)
    ),
)
# Run fine-tuning through Unsloth's optimized training loop.
unsloth_train(trainer)

# Persist the fine-tuned weights and the tokenizer side by side so the
# checkpoint directory is self-contained.
save_dir = "./fine_tuned_model"
trainer.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
# Reload the freshly saved checkpoint for inference, mirroring the
# precision settings used during training.
fine_tuned_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./fine_tuned_model",
    max_seq_length=128,
    dtype=torch.float32,
    load_in_4bit=False,
)

# Prefer the GPU when one is available; fall back to CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
fine_tuned_model.to(device)
def translate_to_colloquial_tamil(english_text):
    """
    Generate a colloquial Tamil rendering of *english_text* using the
    fine-tuned model.

    Args:
        english_text: English sentence to translate.

    Returns:
        The decoded model output (special tokens stripped). Note that
        sampling is enabled, so repeated calls may differ.
    """
    # BUG FIX: use the exact instruction template the model was trained on
    # (see preprocess_function). The original used a different wording
    # ("Convert this English sentence into Tamil colloquial speech"),
    # creating a train/inference prompt mismatch that hurts output quality.
    prompt = f"Convert the given English text into Tamil casual speech: {english_text}"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Generate colloquial Tamil translation
    translated_tokens = fine_tuned_model.generate(
        **inputs,
        max_new_tokens=50,   # Limit response length
        do_sample=True,      # Enable sampling for natural output
        top_p=0.95,          # Nucleus sampling for more natural phrasing
        temperature=0.7,     # Adjust creativity
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
# Demo: translate a handful of everyday and technical sentences.
sample_sentences = [
    "The pharmacy is near the bus stop.",
    "Take this medicine after food.",
    "Train tickets for tomorrow are available.",
    "Tell me about OOPs in Python?",
    "Can we edit a tuple?",
    "When will the new software be implemented?",
]

for sentence in sample_sentences:
    tamil = translate_to_colloquial_tamil(sentence)
    print(f"English: {sentence}")
    print(f"Colloquial Tamil: {tamil}\n")