|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Gemma3 language model generate""" |
|
|
|
|
|
import torch |
|
|
from transformers import AutoTokenizer |
|
|
|
|
|
from nemo import lightning as nl |
|
|
from nemo.collections.llm.gpt.model.gemma3 import Gemma3Model |
|
|
|
|
|
HF_MODEL_NAME = "google/gemma-3-1b-it" |
|
|
|
|
|
|
|
|
def _build_trainer():
    """Create a single-GPU Megatron trainer configured for inference only.

    Optimizer setup and optimizer-state storage are disabled since no
    training happens; precision is bf16 to match ``pipeline_dtype``.
    """
    strategy = nl.MegatronStrategy(
        tensor_model_parallel_size=1,
        pipeline_model_parallel_size=1,
        pipeline_dtype=torch.bfloat16,
        virtual_pipeline_model_parallel_size=None,
        context_parallel_size=1,
        expert_model_parallel_size=1,
        sequence_parallel=False,
        setup_optimizers=False,
        store_optimizer_states=False,
    )
    return nl.Trainer(
        accelerator="gpu",
        devices=1,
        num_nodes=1,
        strategy=strategy,
        plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
        enable_checkpointing=False,
    )


def greedy_generate(model, input_ids, num_new_tokens):
    """Greedily decode ``num_new_tokens`` tokens after ``input_ids``.

    Args:
        model: callable taking ``input_ids``, ``position_ids`` and
            ``attention_mask`` keywords and returning per-position logits
            of shape ``(batch, seq, vocab)``.
        input_ids: ``(batch, seq)`` integer tensor of prompt token ids.
        num_new_tokens: number of tokens to append.

    Returns:
        ``(batch, seq + num_new_tokens)`` tensor of prompt + generated ids.
    """
    generated_ids = input_ids
    with torch.no_grad():
        for _ in range(num_new_tokens):
            # NOTE: the full prefix is re-run every step (no KV cache) —
            # O(n^2) overall, acceptable for a short demo.
            seq_len = generated_ids.shape[1]
            position_ids = torch.arange(
                seq_len, dtype=torch.int64, device=generated_ids.device
            )
            logits = model(
                input_ids=generated_ids,
                position_ids=position_ids,
                attention_mask=None,
            )
            # Greedy: pick the argmax of the last position's logits.
            next_token_ids = torch.argmax(logits[:, -1], dim=-1, keepdim=True)
            generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
    return generated_ids


def main(num_new_tokens: int = 10):
    """Entrypoint: import the HF checkpoint into NeMo and print a greedy reply.

    Args:
        num_new_tokens: how many tokens to generate after the chat prompt.
    """
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)

    trainer = _build_trainer()
    fabric = trainer.to_fabric()
    model = fabric.import_model(f"hf://{HF_MODEL_NAME}", Gemma3Model)
    model = model.module.cuda()
    model.eval()

    # One conversation (batch of one) with a single user turn.
    messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Who are you?"},
                ],
            },
        ],
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # `inputs` already lives on the model's device; clone so the
    # tokenizer output is never mutated by the decode loop.
    generated_ids = greedy_generate(
        model, inputs["input_ids"].clone(), num_new_tokens
    )

    outputs = tokenizer.batch_decode(generated_ids)
    print(outputs)
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|