from functools import lru_cache

from transformers import AutoTokenizer
@lru_cache(maxsize=None)
def _get_tokenizer(model_name):
    """Load and cache a pretrained tokenizer (one load per model name)."""
    return AutoTokenizer.from_pretrained(model_name)


def preprocess_data(data, model_name="facebook/opt-350m"):
    """Tokenize a batch of input sentences with a pretrained tokenizer.

    Args:
        data (list of str): Input sentences to tokenize.
        model_name (str): Hugging Face model id whose tokenizer to use.
            Defaults to "facebook/opt-350m".

    Returns:
        BatchEncoding: Dict-like container of tokenizer outputs
        (e.g. input_ids, attention_mask) as padded, truncated
        PyTorch tensors (return_tensors="pt").
    """
    # Previously the tokenizer was re-loaded via from_pretrained on every
    # call; the cached helper loads it once per model name instead.
    tokenizer = _get_tokenizer(model_name)
    return tokenizer(data, padding=True, truncation=True, return_tensors="pt")
if __name__ == "__main__":
    # Smoke test: tokenize two example sentences and print the encoding.
    demo_sentences = [
        "Fine-tuning an open-source LLM.",
        "This is a sample sentence.",
    ]
    print(preprocess_data(demo_sentences))