"""SAWIT_HAckathon.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1JdwIlEdowZbW21IVuKYirC_YVXUyzgVA
"""

# Install dependencies. As a bare "pip install ..." this line is a Python
# syntax error; in Colab, shell commands need the "!" prefix.
!pip install transformers datasets unsloth

import pandas as pd

# Load the parallel English -> colloquial Marathi sentence pairs.
df = pd.read_csv("/content/marathi_dataset.csv")

df.head()
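
# Optional sanity check (a sketch, assuming the two column names used below):
# confirm the expected columns exist and drop rows with missing values.
expected = ["English Sentence", "Colloquial Marathi Translation"]
assert all(col in df.columns for col in expected), df.columns
df = df.dropna(subset=expected)
print(f"{len(df)} usable sentence pairs")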

import json

# Reshape the DataFrame into {"English": ..., "Colloquial": ...} records for
# the datasets JSON loader.
dataset = [
    {"English": row["English Sentence"], "Colloquial": row["Colloquial Marathi Translation"]}
    for _, row in df.iterrows()
]

with open("marathi_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print("Dataset saved successfully!")

from datasets import load_dataset

# Reload the JSON file as a Hugging Face Dataset; it lands in the "train" split.
dataset = load_dataset("json", data_files="marathi_dataset.json")

from transformers import AutoTokenizer

# Note: t5-small's SentencePiece vocabulary has little Devanagari coverage,
# so Marathi targets will largely tokenize to <unk>; a multilingual checkpoint
# such as google/mt5-small is usually a better fit for this language pair.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the same task prefix at training time as the inference prompt below,
# so the model sees consistent inputs.
prefix = "Translate English to Marathi: "

def tokenize_function(examples):
    inputs = [prefix + s for s in examples["English"]]
    return tokenizer(inputs, text_target=examples["Colloquial"], padding="max_length", truncation=True)

try:
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    print("Tokenization successful!")
except Exception as e:
    print("Error in tokenization:", str(e))

print(tokenized_datasets)

if "train" not in tokenized_datasets:
    print("Error: 'train' dataset not found! Check dataset loading.")
else:
    print("Dataset is ready for training!")

from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# No validation split is passed to the Trainer, so evaluation is disabled.
# (In newer transformers releases this argument is named eval_strategy.)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
)

import os

# Disable Weights & Biases logging before training starts.
os.environ["WANDB_DISABLED"] = "true"

# ./results is only created once training starts, so guard the listing
# instead of letting os.listdir raise FileNotFoundError.
if os.path.exists("./results"):
    print(os.listdir("./results"))

trainer.train()
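
# Optional: inspect the loss values recorded during training.
for entry in trainer.state.log_history:
    if "loss" in entry:
        print(entry["step"], entry["loss"])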

trainer.save_model("./fine_tuned_model")
# Save the tokenizer too, so the directory is self-contained.
tokenizer.save_pretrained("./fine_tuned_model")

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_path = "./fine_tuned_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Load the tokenizer that was saved alongside the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

input_text = "Translate English to Marathi: Hello, how are you?"

inputs = tokenizer(input_text, return_tensors="pt")

outputs = model.generate(**inputs)

output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Output:", output_text)

from huggingface_hub import notebook_login, whoami

# Authenticate with the Hugging Face Hub, then confirm the active account.
notebook_login()

print(whoami())

from huggingface_hub import create_repo

# exist_ok=True makes this cell safe to re-run.
create_repo("tawadesg20/translation-Model", repo_type="model", private=False, exist_ok=True)
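
# A minimal sketch for actually publishing the fine-tuned weights to the repo
# created above (the repo id is reused from the create_repo call):
model.push_to_hub("tawadesg20/translation-Model")
tokenizer.push_to_hub("tawadesg20/translation-Model")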

from google.colab import files

# Upload files from the local machine into the Colab runtime.
uploaded = files.upload()