Spaces:
Build error
Build error
| import spaces | |
| import torch | |
| import re | |
| import gradio as gr | |
| from threading import Thread | |
| from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM | |
| from PIL import ImageDraw | |
| from torchvision.transforms.v2 import Resize | |
| from transformers import AutoModelForCausalLM | |
# Load the Moondream checkpoint once, at import time, so that every request
# served by this process reuses the same model object.
_MOONDREAM_REPO = "moondream/moondream3-preview"

moondream = AutoModelForCausalLM.from_pretrained(
    _MOONDREAM_REPO,
    device_map={"": "cuda"},   # map the whole model onto the CUDA device
    dtype=torch.bfloat16,
    trust_remote_code=True,    # the model code is shipped in the Hub repo
)

# Invoke the model's own compile() hook before serving; what it does is
# defined by the remote model code, not by this file.
moondream.compile()
# Example: encode an image once and reuse the encoding across several skills.
# NOTE(review): this runs at import time and expects "complex_scene.jpg" to be
# readable from the working directory -- confirm the file ships with the app.
from PIL import Image  # the file-level imports only bring in ImageDraw

# Encode image once
image = Image.open("complex_scene.jpg")
encoded = moondream.encode_image(image)

# Reuse the encoding for multiple queries
questions = [
    "How many people are in this image?",
    "What time of day was this taken?",
    "What's the weather like?",
]
for q in questions:
    result = moondream.query(image=encoded, question=q, reasoning=False)
    print(f"Q: {q}")
    print(f"A: {result['answer']}\n")

# Also works with other skills
caption = moondream.caption(encoded, length="normal")
objects = moondream.detect(encoded, "poop")
pointe = moondream.point(encoded, "grass")
# Print the results that were actually computed above (the original f-string
# referenced the undefined names e, g and h).
print(f"caption: {caption}, objects:{objects}, point:{pointe}")
# Example: segment an object. All calls go through `moondream`, the only
# model handle that is actually bound in this file (`model` is assigned only
# inside a quoted-out string further down, so using it raised NameError).
result = moondream.segment(image, "cat")
svg_path = result["path"]
bbox = result["bbox"]
print(f"SVG Path: {svg_path[:100]}...")
print(f"Bounding box: {bbox}")

# With spatial hint (point) to guide segmentation
result = moondream.segment(image, "cat", spatial_refs=[[0.5, 0.3]])

# With spatial hint (bounding box)
result = moondream.segment(image, "cat", spatial_refs=[[0.2, 0.1, 0.8, 0.9]])
| """ | |
| #model_id = "vikhyatk/moondream2" | |
| #revision = "2025-01-09" | |
| #def load_moondream(): | |
| # Load Moondream model and tokenizer. | |
| # model = AutoModelForCausalLM.from_pretrained( | |
| # "vikhyatk/moondream2", trust_remote_code=True, device_map={"": "cuda"} | |
| # ) | |
| # tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2") | |
| # return model, tokenizer | |
| #tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision) | |
| #moondream = AutoModelForCausalLM.from_pretrained( | |
| # model_id, trust_remote_code=True, revision=revision, | |
| # torch_dtype=torch.bfloat16, device_map={"": "cuda"}, | |
| #) | |
| #moondream.eval() | |
| model = AutoModelForCausalLM.from_pretrained( | |
| "vikhyatk/moondream2", | |
| trust_remote_code=True, | |
| dtype=torch.bfloat16, | |
| device_map="cuda", # "cuda" on Nvidia GPUs | |
| ) | |
| """ | |
def answer_questions(image_tuples, prompt_text):
    """Ask every prompt about every uploaded image.

    Args:
        image_tuples: Gallery value. Each item is indexed with ``[0]`` to get
            the image; items whose image is ``None`` are skipped. ``None`` or
            an empty list means "no images".
        prompt_text: Prompts separated by ``?``. Surrounding whitespace is
            stripped and empty fragments (e.g. the one after a trailing
            ``?``) are dropped.

    Returns:
        A ``(markdown, table)`` tuple. ``markdown`` lists, per prompt, the
        answer for each image. ``table`` is ``{"headers": prompts, "data":
        rows}`` with one row per image and one column per prompt, so the
        column count always matches the headers.
    """
    # NOTE(review): `spaces` is imported at the top of the file but no
    # @spaces.GPU decorator is applied here -- confirm whether one is needed
    # for the hardware this app is deployed on.
    fragments = (piece.strip() for piece in (prompt_text or "").split("?"))
    prompts = [piece for piece in fragments if piece]
    if not prompts:
        # Nothing to ask; avoid touching the model at all.
        return "", {"headers": [], "data": []}

    images = [item[0] for item in (image_tuples or []) if item[0] is not None]

    # One row per image, one answer per prompt. Each image is encoded once and
    # the encoding is reused for all prompts, using the same
    # encode_image / query(...)["answer"] calls as the rest of this file.
    rows = []
    for picture in images:
        encoded = moondream.encode_image(picture.convert("RGB"))
        rows.append([
            moondream.query(
                image=encoded, question=prompt, reasoning=False
            )["answer"]
            for prompt in prompts
        ])

    parts = []
    for col, prompt in enumerate(prompts):
        parts.append(f"### Q: {prompt}\n")
        for number, row in enumerate(rows, start=1):
            parts.append(f"**image{number} A:** \n {row[col]} \n")

    return "".join(parts), {"headers": prompts, "data": rows}
| """ | |
| Load Moondream model and tokenizer. | |
| moondream = AutoModelForCausalLM.from_pretrained( | |
| "vikhyatk/moondream2", | |
| revision="2025-01-09", | |
| trust_remote_code=True, | |
| device_map={"": "cuda"}, | |
| ) | |
| tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2") | |
| """ | |
# Gradio UI: a gallery of input images plus a prompt box; the answers are
# shown both as markdown and as a dataframe.
# NOTE(review): the headings below still say "moondream2" -- confirm they match
# the checkpoint this app actually loads.
with gr.Blocks() as demo:
    gr.Markdown("# moondream2 unofficial batch processing demo")
    # The handler splits the prompt text on "?", so the instructions must say
    # question marks (they previously said commas).
    gr.Markdown("1. Select images\n2. Enter one or more prompts separated by question marks. Ex: Describe this image? What is in this image?\n\n")
    gr.Markdown("**Currently each image will be sent as a batch with the prompts thus asking each prompt on each image**")
    gr.Markdown("A tiny vision language model. [moondream2](https://huggingface.co/vikhyatk/moondream2)")
    with gr.Row():
        img = gr.Gallery(label="Upload Images", type="pil", preview=True, columns=4)
    with gr.Row():
        prompt = gr.Textbox(
            label="Input Prompts",
            placeholder="Enter prompts separated by question marks; every prompt is asked about every image. Ex: Describe this image? What is in this image?",
            lines=8,
        )
    with gr.Row():
        submit = gr.Button("Submit")
    with gr.Row():
        output = gr.Markdown(label="Questions and Answers", line_breaks=True)
    with gr.Row():
        output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
    # answer_questions returns (markdown, table) in this order.
    submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])

demo.queue().launch()