# -*- coding: utf-8 -*-
"""SAWIT_HAckathon.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1JdwIlEdowZbW21IVuKYirC_YVXUyzgVA
"""
# Install dependencies (run as a shell command in Colab):
# !pip install transformers datasets unsloth
import pandas as pd
# Load CSV file
df = pd.read_csv("/content/marathi_dataset.csv")
# Display first 5 rows
print(df.head())
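
# Optional sanity check (a minimal sketch, assuming the column names used in
# the conversion below): confirm both columns exist and drop empty rows.
expected_cols = ["English Sentence", "Colloquial Marathi Translation"]
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")
df = df.dropna(subset=expected_cols)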
import json
# Convert to JSON format
dataset = [
    {"English": row["English Sentence"], "Colloquial": row["Colloquial Marathi Translation"]}
    for _, row in df.iterrows()
]
# Save as JSON file
with open("marathi_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)
print("Dataset saved successfully!")
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset from JSON
dataset = load_dataset("json", data_files="marathi_dataset.json")

# Load the tokenizer before mapping the tokenization function over the dataset
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["English"], text_target=examples["Colloquial"], padding="max_length", truncation=True)

# Apply tokenization
try:
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    print("Tokenization successful!")
except Exception as e:
    print("Error in tokenization:", str(e))
# Print tokenized dataset structure
print(tokenized_datasets)
if "train" not in tokenized_datasets:
print("Error: 'train' dataset not found! Check dataset loading.")
else:
print("Dataset is ready for training!")
from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Define training arguments (evaluation disabled, since there is no eval split)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # no eval dataset is provided
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)
# Initialize Trainer without an eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],  # use only the training split
)
import os

# Disable Weights & Biases logging before training starts
os.environ["WANDB_DISABLED"] = "true"

# Inspect the output directory, if it exists yet
if os.path.isdir("./results"):
    print(os.listdir("./results"))
trainer.train() # This will start training the model
trainer.save_model("./fine_tuned_model") # Saves the model in a folder
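
# Saving the tokenizer alongside the model keeps the checkpoint folder
# self-contained; fine-tuning does not change the T5 tokenizer itself.
tokenizer.save_pretrained("./fine_tuned_model")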
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Load fine-tuned model
model_path = "./fine_tuned_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
# Load tokenizer (saved alongside the fine-tuned model)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Example input text (no task prefix, matching how the training inputs were tokenized)
input_text = "Hello, how are you?"
# Tokenize input
inputs = tokenizer(input_text, return_tensors="pt")
# Generate output
outputs = model.generate(**inputs)
# Decode the output text
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Output:", output_text)
from huggingface_hub import notebook_login
notebook_login() # Log in to Hugging Face
from huggingface_hub import whoami
print(whoami())
from huggingface_hub import create_repo
# Correct format: "username/repo_name"
create_repo("tawadesg20/translation-Model", repo_type="model", private=False)
from google.colab import files
uploaded = files.upload() # This will prompt you to upload a file
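
# To publish the fine-tuned checkpoint to the repo created above, one option is
# huggingface_hub's upload_folder (a sketch; the repo id is reused from the
# create_repo call above):
from huggingface_hub import upload_folder

upload_folder(
    folder_path="./fine_tuned_model",
    repo_id="tawadesg20/translation-Model",
    repo_type="model",
)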