# -*- coding: utf-8 -*-
"""SAWIT_HAckathon.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1JdwIlEdowZbW21IVuKYirC_YVXUyzgVA
"""

# In Colab, install the dependencies first:
# !pip install transformers datasets unsloth

import pandas as pd

# Load the CSV file
df = pd.read_csv("/content/marathi_dataset.csv")

# Display the first 5 rows
df.head()

import json

# Convert the DataFrame to a list of English/Colloquial record pairs
dataset = [
    {"English": row["English Sentence"], "Colloquial": row["Colloquial Marathi Translation"]}
    for _, row in df.iterrows()
]

# Save as a JSON file
with open("marathi_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print("Dataset saved successfully!")

from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset back from JSON
dataset = load_dataset("json", data_files="marathi_dataset.json")

# Load the tokenizer before mapping the tokenization function over the dataset
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function: English sentences are the inputs, colloquial Marathi
# translations are the targets
def tokenize_function(examples):
    return tokenizer(
        examples["English"],
        text_target=examples["Colloquial"],
        padding="max_length",
        truncation=True,
    )

# Apply tokenization
try:
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    print("Tokenization successful!")
except Exception as e:
    print("Error in tokenization:", str(e))

# Print the tokenized dataset structure
print(tokenized_datasets)

if "train" not in tokenized_datasets:
    print("Error: 'train' dataset not found! Check dataset loading.")
else:
    print("Dataset is ready for training!")

from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM

# Load the base model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Define training arguments (evaluation disabled, since there is no eval split)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # no validation split, so skip evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)

# Initialize the Trainer with the training split only (no eval_dataset)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
)

import os

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

trainer.train()  # Start fine-tuning

# List the checkpoints written to the output directory during training
print(os.listdir("./results"))

trainer.save_model("./fine_tuned_model")  # Save the fine-tuned model to a folder

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned model
model_path = "./fine_tuned_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Load the tokenizer (fine-tuning did not change the base t5-small tokenizer)
tokenizer = AutoTokenizer.from_pretrained("t5-small")
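# Optional helper (not in the original notebook): a minimal sketch that bundles
# the tokenize -> generate -> decode steps shown below into one function, with
# explicit decoding parameters. The translate() name and the max_new_tokens /
# num_beams values are illustrative assumptions, not tuned settings. Since the
# training data above used raw English sentences without a task prefix, the
# usage example passes a raw sentence.
def translate(text, max_new_tokens=64, num_beams=4):
    enc = tokenizer(text, return_tensors="pt")
    out = model.generate(**enc, max_new_tokens=max_new_tokens, num_beams=num_beams)
    return tokenizer.decode(out[0], skip_special_tokens=True)

print(translate("Good morning!"))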
# Example input text (the "Translate English to Marathi:" prefix follows T5's
# task-prefix convention; note the training data above used raw English
# sentences without a prefix)
input_text = "Translate English to Marathi: Hello, how are you?"

# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt")

# Generate the output token ids
outputs = model.generate(**inputs)

# Decode the output text
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Output:", output_text)

from huggingface_hub import notebook_login, whoami

notebook_login()  # Log in to Hugging Face

print(whoami())  # Confirm the logged-in account

from huggingface_hub import create_repo

# Repository id format: "username/repo_name"
create_repo("tawadesg20/translation-Model", repo_type="model", private=False)

from google.colab import files

uploaded = files.upload()  # Prompts for a local file upload into the Colab session
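# Note: create_repo() above only creates an empty model repo on the Hub; it
# does not upload the fine-tuned weights. A minimal sketch of pushing the model
# and tokenizer to that repo, assuming the notebook_login() above succeeded and
# "tawadesg20/translation-Model" is the intended destination:
model.push_to_hub("tawadesg20/translation-Model")
tokenizer.push_to_hub("tawadesg20/translation-Model")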