"""SAWIT_HAckathon.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1JdwIlEdowZbW21IVuKYirC_YVXUyzgVA
"""

# Install dependencies. As a bare "pip install ..." this line is a Python
# syntax error; in Colab, shell commands need the "!" prefix.
!pip install transformers datasets unsloth

import pandas as pd

# Load the parallel English -> colloquial Marathi sentence pairs.
df = pd.read_csv("/content/marathi_dataset.csv")

df.head()
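
# Optional sanity check (a sketch, assuming the two column names used below):
# confirm the expected columns exist and drop rows with missing values.
expected = ["English Sentence", "Colloquial Marathi Translation"]
assert all(col in df.columns for col in expected), df.columns
df = df.dropna(subset=expected)
print(f"{len(df)} usable sentence pairs")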

import json

# Reshape the DataFrame into {"English": ..., "Colloquial": ...} records for
# the datasets JSON loader.
dataset = [
    {"English": row["English Sentence"], "Colloquial": row["Colloquial Marathi Translation"]}
    for _, row in df.iterrows()
]

with open("marathi_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print("Dataset saved successfully!")

from datasets import load_dataset

# Reload the JSON file as a Hugging Face Dataset; it lands in the "train" split.
dataset = load_dataset("json", data_files="marathi_dataset.json")

from transformers import AutoTokenizer

# Note: t5-small's SentencePiece vocabulary has little Devanagari coverage,
# so Marathi targets will largely tokenize to <unk>; a multilingual checkpoint
# such as google/mt5-small is usually a better fit for this language pair.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the same task prefix at training time as the inference prompt below,
# so the model sees consistent inputs.
prefix = "Translate English to Marathi: "

def tokenize_function(examples):
    inputs = [prefix + s for s in examples["English"]]
    return tokenizer(inputs, text_target=examples["Colloquial"], padding="max_length", truncation=True)

try:
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    print("Tokenization successful!")
except Exception as e:
    print("Error in tokenization:", str(e))

print(tokenized_datasets)

if "train" not in tokenized_datasets:
    print("Error: 'train' dataset not found! Check dataset loading.")
else:
    print("Dataset is ready for training!")

from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# No validation split is passed to the Trainer, so evaluation is disabled.
# (In newer transformers releases this argument is named eval_strategy.)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
)

import os

# Disable Weights & Biases logging before training starts.
os.environ["WANDB_DISABLED"] = "true"

# ./results is only created once training starts, so guard the listing
# instead of letting os.listdir raise FileNotFoundError.
if os.path.exists("./results"):
    print(os.listdir("./results"))

trainer.train()
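
# Optional: inspect the loss values recorded during training.
for entry in trainer.state.log_history:
    if "loss" in entry:
        print(entry["step"], entry["loss"])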

trainer.save_model("./fine_tuned_model")
# Save the tokenizer too, so the directory is self-contained.
tokenizer.save_pretrained("./fine_tuned_model")

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_path = "./fine_tuned_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Load the tokenizer that was saved alongside the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

input_text = "Translate English to Marathi: Hello, how are you?"

inputs = tokenizer(input_text, return_tensors="pt")

outputs = model.generate(**inputs)

output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Output:", output_text)

from huggingface_hub import notebook_login, whoami

# Authenticate with the Hugging Face Hub, then confirm the active account.
notebook_login()

print(whoami())

from huggingface_hub import create_repo

# exist_ok=True makes this cell safe to re-run.
create_repo("tawadesg20/translation-Model", repo_type="model", private=False, exist_ok=True)
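
# A minimal sketch for actually publishing the fine-tuned weights to the repo
# created above (the repo id is reused from the create_repo call):
model.push_to_hub("tawadesg20/translation-Model")
tokenizer.push_to_hub("tawadesg20/translation-Model")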

from google.colab import files

# Upload files from the local machine into the Colab runtime.
uploaded = files.upload()