import os
from pathlib import Path

from datasets import load_dataset, Dataset

from utils import process_recipes, create_tokenizer

# Source dataset on the Hugging Face Hub and the split to pull
dataset = 'tengomucho/all-recipes-split'
dataset_split = 'train'

tokenizer = create_tokenizer()
def get_dataset(dataset, dataset_split):
    """
    Grab a dataset from the Hugging Face Hub based on `dataset` and `dataset_split`.
    """
    data = load_dataset(dataset, split=dataset_split)
    return data
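# Note: load_dataset caches downloads locally in the Hugging Face datasets cache,
# so calling get_dataset more than once does not re-download the data.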
def save_data(dataset, dataset_split):
    """
    Save the dataset locally as a plain-text file, one recipe block per entry.
    """
    data = get_dataset(dataset, dataset_split)
    data_path = Path('../data/')
    file_path = data_path / "recipes.txt"

    if file_path.exists():
        print("Dataset file already exists. Moving on to tokenization...")
        return

    if data_path.is_dir():
        print("Directory exists, skipping...")
    else:
        print("Creating directory...")
        data_path.mkdir(parents=True, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as f:
        for i in range(len(data)):
            title = data[i].get("title", "").strip()
            ingredients = data[i].get("ingredients", "").strip()
            directions = data[i].get("directions", "").strip()

            recipe_block = "<start>\n"  # Special token marking the beginning of each recipe
            if title:
                recipe_block += f"Title: {title}\n"
            recipe_block += f"Ingredients:\n{ingredients}\n"
            recipe_block += f"Directions:\n{directions}\n"
            recipe_block += "<end>\n\n"  # Special token marking the end of each recipe

            f.write(recipe_block)

    print("File saved.")
def create_dataset(processed_recipes):
    """
    Extract input_ids and attention_mask from processed recipes and create a Dataset.

    Args:
        processed_recipes (list): List of tokenized recipe blocks

    Returns:
        Dataset: HuggingFace Dataset with input_ids and attention_mask
    """
    input_ids = []
    attention_mask = []

    for recipe in processed_recipes:
        input_ids.append(recipe['input_ids'])
        attention_mask.append(recipe['attention_mask'])

    dataset = Dataset.from_dict({
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })
    return dataset
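# A single row of the resulting Dataset looks roughly like
# {"input_ids": [101, 2345, ...], "attention_mask": [1, 1, ...]}; the actual
# token ids depend on the tokenizer returned by create_tokenizer().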
def split_and_save_dataset(dataset, train_split=0.9, save_dir="../data/tokenized_recipes"):
    """
    Split the tokenized dataset into train/validation sets and save both to disk.

    Args:
        dataset (Dataset): HuggingFace Dataset to split
        train_split (float): Proportion for the training set (default 0.9 = 90%)
        save_dir (str): Directory to save the datasets

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    # Split the dataset
    split_dataset = dataset.train_test_split(test_size=1 - train_split, seed=42)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']

    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Save datasets to disk
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")
    train_dataset.save_to_disk(train_path)
    val_dataset.save_to_disk(val_path)

    print("Datasets saved successfully!")
    print(f"Train dataset: {train_path} ({len(train_dataset)} samples)")
    print(f"Validation dataset: {val_path} ({len(val_dataset)} samples)")

    return train_dataset, val_dataset
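# train_test_split shuffles with the fixed seed (42) before splitting, so the
# train/validation partition is reproducible across runs.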
def load_saved_datasets(save_dir="../data/tokenized_recipes"):
    """
    Load previously saved tokenized train/validation datasets from disk.

    Args:
        save_dir (str): Directory where the datasets were saved

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")

    if not os.path.exists(train_path) or not os.path.exists(val_path):
        raise FileNotFoundError(f"Datasets not found in {save_dir}. Please run processing first.")

    train_dataset = Dataset.load_from_disk(train_path)
    val_dataset = Dataset.load_from_disk(val_path)

    print("Datasets loaded successfully!")
    print(f"Train dataset: {len(train_dataset)} samples")
    print(f"Validation dataset: {len(val_dataset)} samples")

    return train_dataset, val_dataset
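# Example (commented out so it does not run on import): one way the saved
# datasets might be consumed downstream, assuming PyTorch is available and the
# tokenized sequences share a uniform length; variable-length sequences would
# also need a padding collator.
#
# from torch.utils.data import DataLoader
# train_ds, val_ds = load_saved_datasets()
# train_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])
# train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)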
if __name__ == "__main__":
    # Download the dataset and save it as a text file
    # (save_data calls get_dataset internally, so no separate download step is needed)
    save_data(dataset, dataset_split)

    # Create the tokenized dataset
    with open("../data/recipes.txt", "r", encoding="utf-8") as f:
        recipes = f.read()

    processed_recipes = process_recipes(recipes, tokenizer)
    dataset = create_dataset(processed_recipes)
    split_and_save_dataset(dataset)
    print("Successfully created tokenized dataset.")