import gradio as gr
from huggingface_hub import InferenceClient

# Initialize the inference client with the Hugging Face model ID
client = InferenceClient("13Aluminium/gemma-3.1")
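
# NOTE (assumption about the deployment): public model repos work anonymously,
# but a private or gated repo needs a token, e.g.
#   client = InferenceClient("13Aluminium/gemma-3.1", token=os.environ["HF_TOKEN"])
# (requires `import os`; HF_TOKEN is the conventional env var name).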


def format_chat_history(history, system_message):
    """Flatten prior turns into Gemma's chat template.

    Gemma defines no dedicated system role, so the system message is
    prepended as plain text before the first turn (a common workaround).
    """
    formatted_prompt = f"{system_message}\n\n" if system_message else ""
    for user_msg, assistant_msg in history:
        if user_msg:
            formatted_prompt += f"<start_of_turn>user\n{user_msg}<end_of_turn>\n"
        if assistant_msg:
            formatted_prompt += f"<start_of_turn>model\n{assistant_msg}<end_of_turn>\n"
    return formatted_prompt
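
# Illustrative example (hypothetical values): format_chat_history(
#     [("Hi", "Hello!")], "Be concise.") returns
#
#     Be concise.
#
#     <start_of_turn>user
#     Hi<end_of_turn>
#     <start_of_turn>model
#     Hello!<end_of_turn>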


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Flatten earlier turns, then open a user turn for the current message
    # and cue the model's reply
    prompt = format_chat_history(history, system_message)
    prompt += f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
response = ""
# Use text generation instead of chat completion
for token in client.text_generation(
prompt,
max_new_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
):
response += token
yield response
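
# Non-streaming variant (a sketch using the same parameters), for cases where
# token-by-token updates are not needed:
#   reply = client.text_generation(prompt, max_new_tokens=max_tokens,
#                                  temperature=temperature, top_p=top_p)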


demo = gr.ChatInterface(
    respond,
additional_inputs=[
gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-p (nucleus sampling)",
),
],
)
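
# gr.ChatInterface passes each value in additional_inputs to respond() after
# (message, history), in the order listed above.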

if __name__ == "__main__":
    demo.launch()