anitha2520
/

debug_divas45model

Tamil

Model card Files and versions

xet

Community

anitha2520 committed on Feb 21, 2025

Commit

126842e

verified ·

1 Parent(s): e4028dd

Update model.py

Browse files

Files changed (1) hide show

model.py +39 -20

model.py CHANGED Viewed

@@ -3,23 +3,25 @@ from datasets import load_dataset
 from unsloth import FastLanguageModel, UnslothTrainer, unsloth_train
 # Load dataset
-file_path = "/content/debug_divas_dataset.json"  # Corrected file path
 dataset = load_dataset("json", data_files=file_path)
 # Load Unsloth's FastLanguageModel and tokenizer
-model_name = "unsloth/mistral-7b-instruct"  # Ensure it's an instruct model for translation
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name,
-    max_seq_length=128,  # Adjust based on your dataset
-    dtype=torch.float32,  # Use float32 to avoid FP16 issues
-    load_in_4bit=False,   # Disable 4-bit quantization if not needed
 )
-# Preprocessing function
 def preprocess_function(examples):
-    # Combine instruction and input for the model
     inputs = tokenizer(
-        [f"Translate the following English sentence to colloquial Tamil: {text}" for text in examples["input"]],
         padding="max_length",
         truncation=True,
         max_length=128,
@@ -30,10 +32,10 @@ def preprocess_function(examples):
     inputs["labels"] = labels["input_ids"]
     return inputs
-# Tokenize dataset
 tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
-# Split dataset
 split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
 train_dataset, test_dataset = split_datasets["train"], split_datasets["test"]
@@ -46,11 +48,11 @@ trainer = UnslothTrainer(
     args={
         "per_device_train_batch_size": 8,
         "per_device_eval_batch_size": 8,
-        "num_train_epochs": 3,
         "learning_rate": 2e-5,
         "save_strategy": "epoch",
         "evaluation_strategy": "epoch",
-        "fp16": False,  # Disable mixed precision training
     }
 )
@@ -69,17 +71,34 @@ fine_tuned_model, tokenizer = FastLanguageModel.from_pretrained(
     load_in_4bit=False,
 )
-# Translation inference
 device = "cuda" if torch.cuda.is_available() else "cpu"
 fine_tuned_model.to(device)
-input_text = "The pharmacy is near the bus stop."
-instruction = "Translate the following English sentence to colloquial Tamil"
-inputs = tokenizer(f"{instruction}: {input_text}", return_tensors="pt").to(device)
-# Generate translation
-translated_tokens = fine_tuned_model.generate(**inputs)
-translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
-print("Translated Tamil Text:", translated_text)

 from unsloth import FastLanguageModel, UnslothTrainer, unsloth_train
 # Load dataset
+file_path = "/content/debug_divas_dataset.json"  # Ensure the correct file path
 dataset = load_dataset("json", data_files=file_path)
 # Load Unsloth's FastLanguageModel and tokenizer
+model_name = "unsloth/mistral-7b-instruct"  # Using an instruct model for colloquial translation
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name,
+    max_seq_length=128,  # Adjust based on dataset
+    dtype=torch.float32,  # Avoid FP16 issues
+    load_in_4bit=False,   # Disable 4-bit quantization for precision
 )
+# Define preprocessing function for colloquial speech
 def preprocess_function(examples):
+    """
+    Prepares dataset in an informal/colloquial tone for training.
+    """
     inputs = tokenizer(
+        [f"Convert the given English text into Tamil casual speech: {text}" for text in examples["input"]],
         padding="max_length",
         truncation=True,
         max_length=128,
     inputs["labels"] = labels["input_ids"]
     return inputs
+# Apply preprocessing
 tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
+# Split dataset into training & testing sets
 split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
 train_dataset, test_dataset = split_datasets["train"], split_datasets["test"]
     args={
         "per_device_train_batch_size": 8,
         "per_device_eval_batch_size": 8,
+        "num_train_epochs": 5,  # Increased for better colloquial adaptation
         "learning_rate": 2e-5,
         "save_strategy": "epoch",
         "evaluation_strategy": "epoch",
+        "fp16": False,  # Avoiding mixed precision
     }
 )
     load_in_4bit=False,
 )
+# Inference with optimized settings
 device = "cuda" if torch.cuda.is_available() else "cpu"
 fine_tuned_model.to(device)
+def translate_to_colloquial_tamil(english_text):
+    """
+    Generate a colloquial Tamil rendering of an English sentence.
+
+    Builds an "<instruction>: <text>" prompt with the same instruction wording
+    that preprocess_function uses for training, samples up to 50 new tokens
+    from fine_tuned_model, and returns only the generated continuation.
+
+    Args:
+        english_text: The English sentence to convert.
+
+    Returns:
+        The decoded continuation with special tokens removed.
+    """
+    # Keep this wording identical to the training prompt in preprocess_function;
+    # a differently worded instruction is a prompt the model was not tuned on.
+    instruction = "Convert the given English text into Tamil casual speech"
+    inputs = tokenizer(f"{instruction}: {english_text}", return_tensors="pt").to(device)
+    # Generate colloquial Tamil translation
+    translated_tokens = fine_tuned_model.generate(
+        **inputs,
+        max_new_tokens=50,  # Limit response length
+        do_sample=True,  # Enable sampling for natural output
+        top_p=0.95,  # Nucleus sampling for more natural phrasing
+        temperature=0.7,  # Adjust creativity
+    )
+    # NOTE(review): assumes fine_tuned_model is a decoder-only (Mistral-style)
+    # model, whose generate() output is prompt + continuation. Drop the prompt
+    # tokens so the English instruction is not echoed back in the result.
+    prompt_length = inputs["input_ids"].shape[1]
+    return tokenizer.decode(translated_tokens[0][prompt_length:], skip_special_tokens=True)
+# Example translations
+examples = [
+    "The pharmacy is near the bus stop.",
+    "Take this medicine after food.",
+    "Train tickets for tomorrow are available.",
+    "Tell me about OOPs in Python?",
+    "Can we edit a tuple?",
+    "When will the new software be implemented?",
+]
+for sentence in examples:
+    print(f"English: {sentence}")
+    print(f"Colloquial Tamil: {translate_to_colloquial_tamil(sentence)}\n")