anitha2520 committed on
Commit
64382c6
·
verified ·
1 Parent(s): 6cd7cbb

Rename model to model.py

Browse files
Files changed (2) hide show
  1. model +0 -0
  2. model.py +85 -0
model DELETED
File without changes
model.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from datasets import load_dataset
3
+ from unsloth import FastLanguageModel, UnslothTrainer, unsloth_train
4
+
5
# ------------------------------------------------------------------
# Setup: training data + Unsloth base model for English -> colloquial
# Tamil translation fine-tuning.
# ------------------------------------------------------------------

# JSON dataset of instruction/input/output records (Colab-style path).
file_path = "/content/debug_divas_dataset.json"  # Corrected file path
dataset = load_dataset("json", data_files=file_path)

# Instruction-tuned Mistral base, served through Unsloth's fast loader.
model_name = "unsloth/mistral-7b-instruct"  # instruct model suits translation prompts
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=128,   # matches the max_length used at tokenization time
    dtype=torch.float32,  # full precision to sidestep FP16 instability
    load_in_4bit=False,   # 4-bit quantization disabled
)
17
+
18
# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of translation pairs for fine-tuning.

    Wraps each English source sentence in the translation instruction,
    tokenizes prompt and Tamil target to a fixed length of 128, and
    attaches the target token ids as ``labels``.

    Args:
        examples: Batched dict (from ``datasets.map(batched=True)``) with
            ``"input"`` (English) and ``"output"`` (Tamil) string lists.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``.
    """
    prompts = [
        f"Translate the following English sentence to colloquial Tamil: {text}"
        for text in examples["input"]
    ]
    inputs = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    labels = tokenizer(
        examples["output"], padding="max_length", truncation=True, max_length=128
    )
    # Replace pad-token ids in the labels with -100 so padding positions are
    # ignored by the cross-entropy loss; the original code trained on pads.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    return inputs
32
+
33
# Tokenize the full dataset, dropping the raw text columns afterwards.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Hold out 20% for evaluation; fixed seed keeps the split reproducible.
split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
test_dataset = split_datasets["test"]
39
+
40
# Initialize UnslothTrainer.
# BUG FIX: HF's Trainer (which UnslothTrainer subclasses) requires a
# TrainingArguments instance for `args` -- passing a plain dict fails at
# construction time. Build proper UnslothTrainingArguments instead.
from unsloth import UnslothTrainingArguments

training_args = UnslothTrainingArguments(
    output_dir="./results",          # checkpoints/logs destination (required field)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    save_strategy="epoch",
    eval_strategy="epoch",           # renamed from `evaluation_strategy` in recent transformers
    fp16=False,                      # disable mixed precision training
)

trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=training_args,
)
56
+
57
# Fine-tune using Unsloth's training loop wrapper.
unsloth_train(trainer)

# Persist the fine-tuned weights and tokenizer to one directory.
save_dir = "./fine_tuned_model"
trainer.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
63
+
64
# Reload the saved checkpoint with the same settings used for the base model.
reload_kwargs = dict(
    model_name="./fine_tuned_model",
    max_seq_length=128,
    dtype=torch.float32,
    load_in_4bit=False,
)
fine_tuned_model, tokenizer = FastLanguageModel.from_pretrained(**reload_kwargs)
71
+
72
# --- Translation inference ------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
fine_tuned_model.to(device)
# Switch the Unsloth model into its optimized inference mode
# (NOTE(review): recommended by Unsloth before calling generate()).
FastLanguageModel.for_inference(fine_tuned_model)

input_text = "The pharmacy is near the bus stop."
instruction = "Translate the following English sentence to colloquial Tamil"

# Build the same prompt shape used during training.
inputs = tokenizer(f"{instruction}: {input_text}", return_tensors="pt").to(device)

# BUG FIX: without max_new_tokens, generate() falls back to a ~20-token
# default length cap and silently truncates the translation.
translated_tokens = fine_tuned_model.generate(**inputs, max_new_tokens=128)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print("Translated Tamil Text:", translated_text)