Spaces:

Csplk
/

moondream2-batch-processing

Build error

App Files Files Community

moondream2-batch-processing / app.py

Csplk

Update app.py

7ab08cb verified 5 months ago

raw

history blame

4.68 kB

	import spaces
	import torch
	import re
	import gradio as gr
	from threading import Thread
	from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
	from PIL import Image
	from PIL import ImageDraw
	from torchvision.transforms.v2 import Resize

	from transformers import AutoModelForCausalLM

	# Load the moondream3-preview checkpoint in bfloat16 on the CUDA device and
	# compile it. `moondream` is the module-level model used by the rest of the app.
	moondream = AutoModelForCausalLM.from_pretrained(
	    "moondream/moondream3-preview",
	    trust_remote_code=True,
	    dtype=torch.bfloat16,
	    device_map={"": "cuda"},
	)
	moondream.compile()

	# NOTE(review): the sample calls below run at import time and read
	# "complex_scene.jpg" from the working directory — confirm the file ships
	# with the app, otherwise this section should be removed or guarded.

	# Encode image once
	image = Image.open("complex_scene.jpg")
	encoded = moondream.encode_image(image)

	# Reuse the encoding for multiple queries
	questions = [
	    "How many people are in this image?",
	    "What time of day was this taken?",
	    "What's the weather like?",
	]

	for q in questions:
	    result = moondream.query(image=encoded, question=q, reasoning=False)
	    print(f"Q: {q}")
	    print(f"A: {result['answer']}\n")

	# Also works with other skills
	caption = moondream.caption(encoded, length="normal")
	objects = moondream.detect(encoded, "poop")
	pointe = moondream.point(encoded, "grass")
	# Print the values computed just above (the original referenced the
	# undefined names e, g and h, which raised NameError).
	print(f"caption: {caption}, objects:{objects}, point:{pointe}")

	# Segment an object
	result = moondream.segment(image, "cat")
	svg_path = result["path"]
	bbox = result["bbox"]

	print(f"SVG Path: {svg_path[:100]}...")
	print(f"Bounding box: {bbox}")

	# `moondream` is the only model object defined at module level; the
	# original called an undefined `model` in the two calls below.
	# With spatial hint (point) to guide segmentation
	result = moondream.segment(image, "cat", spatial_refs=[[0.5, 0.3]])

	# With spatial hint (bounding box)
	result = moondream.segment(image, "cat", spatial_refs=[[0.2, 0.1, 0.8, 0.9]])

	# Disabled legacy moondream2 loading code. Everything between the triple
	# quotes is a bare string expression, so none of it is executed.
	"""
	#model_id = "vikhyatk/moondream2"
	#revision = "2025-01-09"

	#def load_moondream():
	# Load Moondream model and tokenizer.
	# model = AutoModelForCausalLM.from_pretrained(
	# "vikhyatk/moondream2", trust_remote_code=True, device_map={"": "cuda"}
	# )
	# tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
	# return model, tokenizer

	#tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
	#moondream = AutoModelForCausalLM.from_pretrained(
	# model_id, trust_remote_code=True, revision=revision,
	# torch_dtype=torch.bfloat16, device_map={"": "cuda"},
	#)

	#moondream.eval()

	model = AutoModelForCausalLM.from_pretrained(
	"vikhyatk/moondream2",
	trust_remote_code=True,
	dtype=torch.bfloat16,
	device_map="cuda", # "cuda" on Nvidia GPUs
	)
	"""

	@spaces.GPU(duration=150)
	def answer_questions(image_tuples, prompt_text):
	    """Ask every prompt about every uploaded image and collect the answers.

	    Args:
	        image_tuples: Gallery value; each item is indexed with ``[0]`` to
	            obtain a PIL image. Items whose image is ``None`` are skipped.
	            ``None`` or an empty list yields no answers.
	        prompt_text: One or more prompts separated by question marks.
	            Blank segments (such as the one after a trailing ``?``) are
	            ignored.

	    Returns:
	        A ``(markdown, table)`` pair. ``markdown`` lists, per prompt, the
	        answer for each image labelled ``image<N>`` by gallery position.
	        ``table`` is ``{'headers': prompts, 'data': rows}`` with one row
	        per image and one column per prompt.
	    """
	    # Split on "?" and drop empty segments so no blank prompt reaches the model.
	    prompts = [segment.strip() for segment in (prompt_text or "").split("?")]
	    prompts = [p for p in prompts if p]

	    # Keep each image's 1-based gallery position so labels stay aligned with
	    # the answers even when some gallery entries carry no image.
	    numbered_images = [
	        (position, item[0].convert("RGB"))
	        for position, item in enumerate(image_tuples or [], start=1)
	        if item[0] is not None
	    ]

	    # Encode each image once and reuse the encoding for every prompt, using
	    # the same encode_image/query calls this module uses for the loaded model
	    # (no module-level tokenizer exists for the old batch_answer call).
	    encoded_images = [moondream.encode_image(img) for _, img in numbered_images]

	    # answers[i][j] is the answer to prompt i for the j-th usable image.
	    answers = [
	        [
	            moondream.query(image=encoded, question=prompt, reasoning=False)["answer"]
	            for encoded in encoded_images
	        ]
	        for prompt in prompts
	    ]

	    parts = []
	    for prompt, prompt_answers in zip(prompts, answers):
	        parts.append(f"### Q: {prompt}\n")
	        for (position, _), answer_text in zip(numbered_images, prompt_answers):
	            image_name = f"image{position}"
	            parts.append(f"{image_name} A: \n {answer_text} \n")
	    Q_and_A = "".join(parts)

	    # Headers are the prompts, so each data row must hold one answer per
	    # prompt: transpose the per-prompt lists into per-image rows.
	    rows = [list(row) for row in zip(*answers)]
	    result = {'headers': prompts, 'data': rows}
	    return Q_and_A, result

	# Disabled moondream2 model/tokenizer loading, kept as a bare string literal
	# so it is not executed; in particular it does not define `tokenizer`.
	"""
	Load Moondream model and tokenizer.
	moondream = AutoModelForCausalLM.from_pretrained(
	"vikhyatk/moondream2",
	revision="2025-01-09",
	trust_remote_code=True,
	device_map={"": "cuda"},
	)
	tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
	"""


	# Gradio UI: an image gallery and a prompt box feed answer_questions, whose
	# two return values fill a Markdown panel and a Dataframe.
	with gr.Blocks() as demo:
	    gr.Markdown("# moondream2 unofficial batch processing demo")
	    # The handler splits prompts on "?", so the instructions say question
	    # marks (the original text said commas).
	    gr.Markdown("1. Select images\n2. Enter one or more prompts separated by question marks. Ex: Describe this image? What is in this image?\n\n")
	    gr.Markdown("Currently each image will be sent as a batch with the prompts thus asking each prompt on each image")
	    gr.Markdown("A tiny vision language model. [moondream2](https://huggingface.co/vikhyatk/moondream2)")
	    with gr.Row():
	        img = gr.Gallery(label="Upload Images", type="pil", preview=True, columns=4)
	    with gr.Row():
	        # Every prompt is asked about every image, so the placeholder no
	        # longer claims "one prompt for each image".
	        prompt = gr.Textbox(label="Input Prompts", placeholder="Enter prompts (each prompt is asked about every image provided) separated by question marks. Ex: Describe this image? What is in this image?", lines=8)
	    with gr.Row():
	        submit = gr.Button("Submit")
	    with gr.Row():
	        output = gr.Markdown(label="Questions and Answers", line_breaks=True)
	    with gr.Row():
	        output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
	    # Event listeners must be registered inside the Blocks context.
	    submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])

	demo.queue().launch()