# -*- coding: utf-8 -*-
"""SAWIT_HAckathon.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1JdwIlEdowZbW21IVuKYirC_YVXUyzgVA
"""

# In Colab, install the dependencies first:
# !pip install transformers datasets unsloth

import pandas as pd

# Load the CSV file
df = pd.read_csv("/content/marathi_dataset.csv")

# Display the first 5 rows
df.head()

import json

# Convert the DataFrame to a list of English/Colloquial record pairs
dataset = [
    {"English": row["English Sentence"], "Colloquial": row["Colloquial Marathi Translation"]}
    for _, row in df.iterrows()
]

# Save as a JSON file
with open("marathi_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print("Dataset saved successfully!")

from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset back from JSON
dataset = load_dataset("json", data_files="marathi_dataset.json")

# Load the tokenizer before mapping the tokenization function over the dataset
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function: English sentences are the inputs, colloquial Marathi
# translations are the targets
def tokenize_function(examples):
    return tokenizer(
        examples["English"],
        text_target=examples["Colloquial"],
        padding="max_length",
        truncation=True,
    )

# Apply tokenization
try:
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    print("Tokenization successful!")
except Exception as e:
    print("Error in tokenization:", str(e))

# Print the tokenized dataset structure
print(tokenized_datasets)

if "train" not in tokenized_datasets:
    print("Error: 'train' dataset not found! Check dataset loading.")
else:
    print("Dataset is ready for training!")

from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM

# Load the base model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Define training arguments (evaluation disabled, since there is no eval split)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # no validation split, so skip evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)

# Initialize the Trainer with the training split only (no eval_dataset)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
)

import os

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

trainer.train()  # Start fine-tuning

# List the checkpoints written to the output directory during training
print(os.listdir("./results"))

trainer.save_model("./fine_tuned_model")  # Save the fine-tuned model to a folder

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned model
model_path = "./fine_tuned_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Load the tokenizer (fine-tuning did not change the base t5-small tokenizer)
tokenizer = AutoTokenizer.from_pretrained("t5-small")
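# Optional helper (not in the original notebook): a minimal sketch that bundles
# the tokenize -> generate -> decode steps shown below into one function, with
# explicit decoding parameters. The translate() name and the max_new_tokens /
# num_beams values are illustrative assumptions, not tuned settings. Since the
# training data above used raw English sentences without a task prefix, the
# usage example passes a raw sentence.
def translate(text, max_new_tokens=64, num_beams=4):
    enc = tokenizer(text, return_tensors="pt")
    out = model.generate(**enc, max_new_tokens=max_new_tokens, num_beams=num_beams)
    return tokenizer.decode(out[0], skip_special_tokens=True)

print(translate("Good morning!"))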
# Example input text (the "Translate English to Marathi:" prefix follows T5's
# task-prefix convention; note the training data above used raw English
# sentences without a prefix)
input_text = "Translate English to Marathi: Hello, how are you?"

# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt")

# Generate the output token ids
outputs = model.generate(**inputs)

# Decode the output text
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Output:", output_text)

from huggingface_hub import notebook_login, whoami

notebook_login()  # Log in to Hugging Face

print(whoami())  # Confirm the logged-in account

from huggingface_hub import create_repo

# Repository id format: "username/repo_name"
create_repo("tawadesg20/translation-Model", repo_type="model", private=False)

from google.colab import files

uploaded = files.upload()  # Prompts for a local file upload into the Colab session
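# Note: create_repo() above only creates an empty model repo on the Hub; it
# does not upload the fine-tuned weights. A minimal sketch of pushing the model
# and tokenizer to that repo, assuming the notebook_login() above succeeded and
# "tawadesg20/translation-Model" is the intended destination:
model.push_to_hub("tawadesg20/translation-Model")
tokenizer.push_to_hub("tawadesg20/translation-Model")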