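"""Data preparation utilities for the English-to-Hindi translation model.

Builds BPE tokenizers for both languages, preprocesses the
BhabhaAI/openhermes-2.5-hindi dataset into a JSON file of parallel sentence
pairs, and wraps the train/validation/test splits in PyTorch DataLoaders.
"""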
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import (
WhitespaceSplit,
Punctuation,
Sequence as PreSequence,
)
from tokenizers.normalizers import NFD, Lowercase, Sequence, StripAccents
from pathlib import Path
import yaml
from datasets import load_dataset
from dataset import English2HindiDataset, English2HindiDatasetTest
from torch.utils.data import DataLoader
import json
import random
from sklearn.model_selection import train_test_split
def load_config(config_path):
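    """Load the YAML configuration file and return its contents as a dict."""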
with open(config_path, "r") as file:
return yaml.safe_load(file)
def get_all_sentences(ds, lang):
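    """Yield the raw text for the given language key from every dataset item."""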
for item in ds:
yield item[lang]
def get_or_build_tokenizer(tokenizer_path, ds, lang):
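    """Load a BPE tokenizer from tokenizer_path, or train one on ds and save it.

    English ("en_text") is normalized with NFD + StripAccents + Lowercase; the
    other language keeps its accents and is only NFD-normalized and lowercased.
    """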
tokenizer_path = Path(tokenizer_path)
    if not tokenizer_path.exists():
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
if lang == "en_text":
tokenizer.normalizer = Sequence(
[
NFD(),
StripAccents(),
Lowercase(),
]
)
else:
tokenizer.normalizer = Sequence(
[
NFD(),
Lowercase(),
]
)
tokenizer.pre_tokenizer = PreSequence([WhitespaceSplit(), Punctuation()])
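        # Train BPE with the special tokens the downstream model expects ([UNK], [PAD], [SOS], [EOS]).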
trainer = BpeTrainer(
special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
min_frequency=3,
vocab_size=60000,
)
tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
tokenizer.save(str(tokenizer_path))
else:
tokenizer = Tokenizer.from_file(str(tokenizer_path))
return tokenizer
def preprocess_to_json(dataset_hf, output_json_path, word_limit=77, char_limit=300):
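    """Extract parallel English/Hindi pairs from the HF dataset and dump them to JSON.

    Only rows flagged as translated are kept. The first "human" turn (falling
    back to the first turn) is taken from each conversation, and pairs longer
    than word_limit words or char_limit characters are skipped.
    """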
filtered_data = []
for row in dataset_hf:
if not row.get("translated", False):
continue
en_value = None
en_conversations = row.get("en_conversations", [])
for conv in en_conversations:
if "human" in conv["from"]:
en_value = conv["value"]
break
if en_value is None and len(en_conversations) > 0:
en_value = en_conversations[0]["value"]
hi_value = None
hi_conversations = row.get("conversations", [])
for conv in hi_conversations:
if "human" in conv["from"]:
hi_value = conv["value"]
break
if hi_value is None and len(hi_conversations) > 0:
hi_value = hi_conversations[0]["value"]
if en_value and hi_value:
if (
len(en_value.split()) > word_limit
or len(en_value) > char_limit
or len(hi_value.split()) > word_limit
or len(hi_value) > char_limit
):
continue
filtered_data.append({"en_text": en_value, "hi_text": hi_value})
with open(output_json_path, "w", encoding="utf-8") as f:
json.dump(filtered_data, f, ensure_ascii=False, indent=2)
def create_resources():
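    """Build tokenizers, dataset splits, and DataLoaders from config.yaml.

    Downloads and preprocesses the HF dataset if the JSON file is missing,
    derives seq_len from the longest tokenized sentence (plus a margin) when
    it is 0, and returns the train/valid/test DataLoaders and both tokenizers.
    """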
config_path = "config.yaml"
config = load_config(config_path)
dataset_json_path = config.get("dataset_path", "data/english2hindi_data.json")
if not Path(dataset_json_path).exists():
print(f"Dataset file {dataset_json_path} not found. Creating it...")
dataset_hf = load_dataset("BhabhaAI/openhermes-2.5-hindi", split="train")
preprocess_to_json(dataset_hf, dataset_json_path)
else:
print(f"Dataset file {dataset_json_path} already exists. Skipping preprocessing.")
with open(dataset_json_path, "r", encoding="utf-8") as f:
raw_data = json.load(f)
tokenizer_src = get_or_build_tokenizer(
config["src_tokenizer_file"], raw_data, config["src_lang"]
)
tokenizer_tgt = get_or_build_tokenizer(
config["tgt_tokenizer_file"], raw_data, config["tgt_lang"]
)
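    # Sanity check: report how many examples the test-time dataset wrapper loads from the JSON file.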
test_pt_dataset = English2HindiDatasetTest(config["dataset_path"])
print(len(test_pt_dataset))
max_len_src = 0
max_len_tgt = 0
seq_len = config["seq_len"]
if seq_len == 0:
print("seq_len is 0, starting process...")
for item in raw_data:
src_ids = tokenizer_src.encode(item[config['src_lang']]).ids
tgt_ids = tokenizer_tgt.encode(item[config['tgt_lang']]).ids
max_len_src = max(max_len_src, len(src_ids))
max_len_tgt = max(max_len_tgt, len(tgt_ids))
        print(f"Max source length: {max_len_src}, max target length: {max_len_tgt}")
final_max_len = max(max_len_src, max_len_tgt) + 30
config['seq_len'] = final_max_len
with open("config.yaml", 'w') as f:
yaml.safe_dump(config, f, default_flow_style=False)
print(f'Updated seq_len to {final_max_len}')
else:
print("seq_len is not 0, skipping process.")
random.seed(42)
train_data, temp_data = train_test_split(raw_data, test_size=0.2, random_state=42)
test_data, valid_data = train_test_split(temp_data, test_size=0.5, random_state=42)
print("######################################################")
train_dataset = English2HindiDataset(
train_data,
tokenizer_src,
tokenizer_tgt,
config["src_lang"],
config["tgt_lang"],
config["seq_len"],
)
valid_dataset = English2HindiDataset(
valid_data,
tokenizer_src,
tokenizer_tgt,
config["src_lang"],
config["tgt_lang"],
config["seq_len"],
)
test_dataset = English2HindiDataset(
test_data,
tokenizer_src,
tokenizer_tgt,
config["src_lang"],
config["tgt_lang"],
config["seq_len"],
)
    # Shuffle only the training split; keep validation and test order fixed for reproducible evaluation.
    train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=config["batch_size"], shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)
    return train_dataloader, valid_dataloader, test_dataloader, tokenizer_src, tokenizer_tgt
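# Minimal usage sketch (assumes config.yaml and the paths it references are
# present locally; the dataset download can take a while on the first run):
if __name__ == "__main__":
    train_dl, valid_dl, test_dl, tok_src, tok_tgt = create_resources()
    print(f"Train/valid/test batches: {len(train_dl)}/{len(valid_dl)}/{len(test_dl)}")
    print(f"Source vocab: {tok_src.get_vocab_size()}, target vocab: {tok_tgt.get_vocab_size()}")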