import json
import random
from pathlib import Path

import yaml
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFD, Lowercase, Sequence, StripAccents
from tokenizers.pre_tokenizers import (
    Punctuation,
    Sequence as PreSequence,
    WhitespaceSplit,
)
from tokenizers.trainers import BpeTrainer
from torch.utils.data import DataLoader

from dataset import English2HindiDataset, English2HindiDatasetTest
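
# Pipeline: load config.yaml -> (optionally) preprocess the HF dataset to JSON
# -> build or load one BPE tokenizer per language -> wrap the train/valid/test
# splits in DataLoaders.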

def load_config(config_path):
    """Read a YAML config file and return its contents as a dict."""
    with open(config_path, "r") as file:
        return yaml.safe_load(file)


def get_all_sentences(ds, lang):
    """Yield raw sentence strings for one language, streaming for tokenizer training."""
    for item in ds:
        yield item[lang]


def get_or_build_tokenizer(tokenizer_path, ds, lang):
    """Load a saved tokenizer from disk, or train a new BPE tokenizer and save it."""
    tokenizer_path = Path(tokenizer_path)
    if not tokenizer_path.exists():
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        if lang == "en_text":
            # English: decompose, strip accents, lowercase.
            tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
        else:
            # Hindi: no StripAccents, since after NFD decomposition it would
            # also delete Devanagari combining marks (matras).
            tokenizer.normalizer = Sequence([NFD(), Lowercase()])
        tokenizer.pre_tokenizer = PreSequence([WhitespaceSplit(), Punctuation()])
        trainer = BpeTrainer(
            special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
            min_frequency=3,
            vocab_size=60000,
        )
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer
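
# Sketch: the ids of the special tokens registered above can be recovered later
# via Tokenizer.token_to_id, e.g.
#   pad_id = tokenizer.token_to_id("[PAD]")
#   sos_id = tokenizer.token_to_id("[SOS]")
#   eos_id = tokenizer.token_to_id("[EOS]")
# (presumably the dataset classes imported above do this internally).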


def preprocess_to_json(dataset_hf, output_json_path, word_limit=77, char_limit=300):
    """Filter the HF dataset to short English/Hindi sentence pairs and dump them to JSON."""
    filtered_data = []
    for row in dataset_hf:
        # Keep only rows that actually carry a Hindi translation.
        if not row.get("translated", False):
            continue
        # English side: prefer the first "human" turn, else fall back to the first turn.
        en_value = None
        en_conversations = row.get("en_conversations", [])
        for conv in en_conversations:
            if "human" in conv["from"]:
                en_value = conv["value"]
                break
        if en_value is None and len(en_conversations) > 0:
            en_value = en_conversations[0]["value"]
        # Hindi side: same selection logic on the translated conversations.
        hi_value = None
        hi_conversations = row.get("conversations", [])
        for conv in hi_conversations:
            if "human" in conv["from"]:
                hi_value = conv["value"]
                break
        if hi_value is None and len(hi_conversations) > 0:
            hi_value = hi_conversations[0]["value"]
        if en_value and hi_value:
            # Drop pairs where either side exceeds the word or character limit.
            if (
                len(en_value.split()) > word_limit
                or len(en_value) > char_limit
                or len(hi_value.split()) > word_limit
                or len(hi_value) > char_limit
            ):
                continue
            filtered_data.append({"en_text": en_value, "hi_text": hi_value})
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)
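
# Each record in the resulting JSON file has the shape:
#   {"en_text": "<English sentence>", "hi_text": "<Hindi sentence>"}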


def create_resources():
    config_path = "config.yaml"
    config = load_config(config_path)
    dataset_json_path = config.get("dataset_path", "data/english2hindi_data.json")
    if not Path(dataset_json_path).exists():
        print(f"Dataset file {dataset_json_path} not found. Creating it...")
        dataset_hf = load_dataset("BhabhaAI/openhermes-2.5-hindi", split="train")
        preprocess_to_json(dataset_hf, dataset_json_path)
    else:
        print(f"Dataset file {dataset_json_path} already exists. Skipping preprocessing.")
    with open(dataset_json_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    tokenizer_src = get_or_build_tokenizer(
        config["src_tokenizer_file"], raw_data, config["src_lang"]
    )
    tokenizer_tgt = get_or_build_tokenizer(
        config["tgt_tokenizer_file"], raw_data, config["tgt_lang"]
    )
    # Reuse dataset_json_path rather than config["dataset_path"], which may be absent.
    test_pt_dataset = English2HindiDatasetTest(dataset_json_path)
    print(f"Test dataset size: {len(test_pt_dataset)}")
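
    # seq_len == 0 in config.yaml acts as a sentinel meaning "derive the
    # sequence length from the longest tokenized sentence in the data".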
    max_len_src = 0
    max_len_tgt = 0
    seq_len = config["seq_len"]
    if seq_len == 0:
        print("seq_len is 0; deriving it from the data...")
        for item in raw_data:
            src_ids = tokenizer_src.encode(item[config["src_lang"]]).ids
            tgt_ids = tokenizer_tgt.encode(item[config["tgt_lang"]]).ids
            max_len_src = max(max_len_src, len(src_ids))
            max_len_tgt = max(max_len_tgt, len(tgt_ids))
        print(f"Max source length: {max_len_src}, max target length: {max_len_tgt}")
        # Add headroom on top of the longest sentence for special tokens.
        final_max_len = max(max_len_src, max_len_tgt) + 30
        config["seq_len"] = final_max_len
        # Persist the derived value so later runs skip this pass.
        with open(config_path, "w") as f:
            yaml.safe_dump(config, f, default_flow_style=False)
        print(f"Updated seq_len to {final_max_len}")
    else:
        print("seq_len is non-zero; skipping derivation.")
    # Seed the stdlib RNG too, though train_test_split relies on its own random_state.
    random.seed(42)
    train_data, temp_data = train_test_split(raw_data, test_size=0.2, random_state=42)
    test_data, valid_data = train_test_split(temp_data, test_size=0.5, random_state=42)
    print(f"Split sizes -> train: {len(train_data)}, valid: {len(valid_data)}, test: {len(test_data)}")
    train_dataset = English2HindiDataset(
        train_data,
        tokenizer_src,
        tokenizer_tgt,
        config["src_lang"],
        config["tgt_lang"],
        config["seq_len"],
    )
    valid_dataset = English2HindiDataset(
        valid_data,
        tokenizer_src,
        tokenizer_tgt,
        config["src_lang"],
        config["tgt_lang"],
        config["seq_len"],
    )
    test_dataset = English2HindiDataset(
        test_data,
        tokenizer_src,
        tokenizer_tgt,
        config["src_lang"],
        config["tgt_lang"],
        config["seq_len"],
    )
    # Shuffle only the training set; keep validation and test order deterministic.
    train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=config["batch_size"], shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)
    return train_dataloader, valid_dataloader, test_dataloader, tokenizer_src, tokenizer_tgt
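

# Minimal usage sketch (assumes config.yaml and the local dataset module exist):
if __name__ == "__main__":
    train_dl, valid_dl, test_dl, tok_src, tok_tgt = create_resources()
    print(f"Batches -> train: {len(train_dl)}, valid: {len(valid_dl)}, test: {len(test_dl)}")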