anitha2520
/

debug_divas45model

Tamil

Model card · Files and versions

xet

Community

anitha2520 committed on Feb 21, 2025

Commit

37b0b2a

verified ·

1 Parent(s): 126842e

Update model.py

Browse files

Files changed (1) hide show

model.py +27 -41

model.py CHANGED Viewed

@@ -3,25 +3,22 @@ from datasets import load_dataset
 from unsloth import FastLanguageModel, UnslothTrainer, unsloth_train
 # Load dataset
-file_path = "/content/debug_divas_dataset.json"  # Ensure the correct file path
 dataset = load_dataset("json", data_files=file_path)
 # Load Unsloth's FastLanguageModel and tokenizer
-model_name = "unsloth/mistral-7b-instruct"  # Using an instruct model for colloquial translation
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name,
-    max_seq_length=128,  # Adjust based on dataset
-    dtype=torch.float32,  # Avoid FP16 issues
-    load_in_4bit=False,   # Disable 4-bit quantization for precision
 )
-# Define preprocessing function for colloquial speech
 def preprocess_function(examples):
-    """
-    Prepares dataset in an informal/colloquial tone for training.
-    """
     inputs = tokenizer(
-        [f"Convert the given English text into Tamil casual speech: {text}" for text in examples["input"]],
         padding="max_length",
         truncation=True,
         max_length=128,
@@ -32,10 +29,10 @@ def preprocess_function(examples):
     inputs["labels"] = labels["input_ids"]
     return inputs
-# Apply preprocessing
 tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
-# Split dataset into training & testing sets
 split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
 train_dataset, test_dataset = split_datasets["train"], split_datasets["test"]
@@ -48,11 +45,11 @@ trainer = UnslothTrainer(
     args={
         "per_device_train_batch_size": 8,
         "per_device_eval_batch_size": 8,
-        "num_train_epochs": 5,  # Increased for better colloquial adaptation
         "learning_rate": 2e-5,
         "save_strategy": "epoch",
         "evaluation_strategy": "epoch",
-        "fp16": False,  # Avoiding mixed precision
     }
 )
@@ -71,34 +68,23 @@ fine_tuned_model, tokenizer = FastLanguageModel.from_pretrained(
     load_in_4bit=False,
 )
-# Inference with optimized settings
 device = "cuda" if torch.cuda.is_available() else "cpu"
 fine_tuned_model.to(device)
-def translate_to_colloquial_tamil(english_text):
-    instruction = "Convert this English sentence into Tamil colloquial speech"
-    inputs = tokenizer(f"{instruction}: {english_text}", return_tensors="pt").to(device)
-    # Generate colloquial Tamil translation
-    translated_tokens = fine_tuned_model.generate(
-        **inputs,
-        max_new_tokens=50,  # Limit response length
-        do_sample=True,  # Enable sampling for natural output
-        top_p=0.95,  # Nucleus sampling for more natural phrasing
-        temperature=0.7,  # Adjust creativity
-    )
-    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
-# Example translations
-examples = [
-    "The pharmacy is near the bus stop.",
-    "Take this medicine after food.",
-    "Train tickets for tomorrow are available.",
-    "Tell me about OOPs in Python?",
-    "Can we edit a tuple?",
-    "When will the new software be implemented?",
-]
-for sentence in examples:
-    print(f"English: {sentence}")
-    print(f"Colloquial Tamil: {translate_to_colloquial_tamil(sentence)}\n")

 from unsloth import FastLanguageModel, UnslothTrainer, unsloth_train
 # Load dataset
+file_path = "/content/debug_divas_dataset.json"  # Ensure the file path is correct
 dataset = load_dataset("json", data_files=file_path)
 # Load Unsloth's FastLanguageModel and tokenizer
+model_name = "unsloth/mistral-7b-instruct"  # Ensure it's an instruct model for translation
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name,
+    max_seq_length=128,
+    dtype=torch.float32,  # Use float32 to avoid FP16 issues
+    load_in_4bit=False,   # Disable 4-bit quantization if not needed
 )
+# Preprocessing function
 def preprocess_function(examples):
     inputs = tokenizer(
+        [f"Translate the following English sentence to colloquial Tamil: {text}" for text in examples["input"]],
         padding="max_length",
         truncation=True,
         max_length=128,
     inputs["labels"] = labels["input_ids"]
     return inputs
+# Tokenize dataset
 tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
+# Split dataset
 split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
 train_dataset, test_dataset = split_datasets["train"], split_datasets["test"]
     args={
         "per_device_train_batch_size": 8,
         "per_device_eval_batch_size": 8,
+        "num_train_epochs": 3,
         "learning_rate": 2e-5,
         "save_strategy": "epoch",
         "evaluation_strategy": "epoch",
+        "fp16": False,  # Disable mixed precision training
     }
 )
     load_in_4bit=False,
 )
+# Move model to device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 fine_tuned_model.to(device)
+# User input loop for real-time translation
+print("Colloquial Tamil Translator (Type 'exit' to quit)")
+while True:
+    input_text = input("Enter an English sentence: ")
+    if input_text.lower() == "exit":
+        break
+    instruction = "Translate the following English sentence to colloquial Tamil"
+    inputs = tokenizer(f"{instruction}: {input_text}", return_tensors="pt").to(device)
+    # Generate translation
+    translated_tokens = fine_tuned_model.generate(**inputs)
+    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
+    print("Colloquial Tamil Translation:", translated_text)