import os
import json
import gradio as gr
import torch
from PIL import Image, ImageDraw
from transformers import GroundingDinoProcessor
from hf_model import CountEX
from utils import post_process_grounded_object_detection, post_process_grounded_object_detection_with_queries
import google.generativeai as genai
from datetime import datetime
import csv
from pathlib import Path
import uuid
import io
import numpy as np
# Try to import HEIC support
try:
from pillow_heif import register_heif_opener
register_heif_opener()
HEIC_SUPPORTED = True
except ImportError:
HEIC_SUPPORTED = False
print("Warning: pillow-heif not installed. HEIC images will not be supported.")
# Try to import HuggingFace Hub
try:
from huggingface_hub import HfApi
HF_HUB_AVAILABLE = True
except ImportError:
HF_HUB_AVAILABLE = False
print("Warning: huggingface_hub not installed.")
# Global variables for model and processor
model = None
processor = None
device = None
hf_api = None
# Data collection directory (local fallback)
DATA_LOG_DIR = Path("uploaded_data")
DATA_LOG_DIR.mkdir(exist_ok=True)
IMAGES_DIR = DATA_LOG_DIR / "images"
IMAGES_DIR.mkdir(exist_ok=True)
DATA_LOG_FILE = DATA_LOG_DIR / "prompts_log.csv"
# HuggingFace Dataset repo for data collection
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "BBVisual/CountEx_UserData")
# Initialize CSV log file with headers if it doesn't exist
if not DATA_LOG_FILE.exists():
with open(DATA_LOG_FILE, "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["timestamp", "image_filename", "instruction", "pos_caption", "neg_caption", "count"])
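# Illustrative CSV row appended per request (values are hypothetical):
# 20240101_120000,20240101_120000_ab12cd34.jpg,"Count apples, not green apples",apples.,green apples.,7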
# Image processing constants
MAX_IMAGE_SIZE = 1333 # Max dimension (width or height)
ALLOWED_EXTENSIONS = {".jpg", ".jpeg", ".png"}
gemini_api_key = os.environ.get("GEMINI_API_KEY")
# Configure Gemini
genai.configure(api_key=gemini_api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")
PARSING_PROMPT = """Parse sentences of the form "Count A, not B" into two lists—A (include) and B (exclude)—splitting on "and", "or", and commas, and reattaching shared head nouns (e.g., "red and black beans" → "red beans", "black beans").
Rules:
- Remove from B items that are equivalent to items in A (synonyms/variants/abbreviations/regional terms)
- Keep B items that are more specific than A (for fine-grained exclusion)
- If B is more general than A but shares the head noun, remove B (contradictory)
Case 1 — Different head nouns → Keep B
Example 1: Count green apples and red beans, not yellow screws and white rice → A: ["green apples", "red beans"], B: ["yellow screws", "white rice"]
Example 2: Count black beans, not poker chips or nails → A: ["black beans"], B: ["poker chips", "nails"]
Case 2 — Equivalent items → Remove from B
Example 1: Count fries and TV, not chips and television → A: ["fries", "TV"], B: []
Example 2: Count garbanzo beans and couch, not chickpeas and sofa → A: ["garbanzo beans", "couch"], B: []
Case 3 — B more specific than A → Keep B (for fine-grained exclusion)
Example 1: Count apples and beans, not green apples and black beans → A: ["apples", "beans"], B: ["green apples", "black beans"]
Example 2: Count beans, not white beans or yellow beans → A: ["beans"], B: ["white beans", "yellow beans"]
Example 3: Count people, not women → A: ["people"], B: ["women"]
Case 4 — B more general than A → Remove B (contradictory)
Example 1: Count green apples, not apples → A: ["green apples"], B: []
Example 2: Count red beans and green apples, not beans and apples → A: ["red beans", "green apples"], B: []
User instruction: {instruction}
Respond ONLY with a JSON object in this exact format, no other text:
{{"A": ["item1", "item2"], "B": ["item3"]}}
"""
def init_hf_api():
"""Initialize HuggingFace API for dataset upload."""
global hf_api
if not HF_HUB_AVAILABLE:
print("HuggingFace Hub not available")
return None
try:
hf_token = os.environ.get("HF_WRITTE_TOKEN")
if not hf_token:
print("HF_WRITTE_TOKEN not set, data collection disabled")
return None
hf_api = HfApi(token=hf_token)
print(f"HuggingFace API initialized. Dataset repo: {HF_DATASET_REPO}")
return hf_api
except Exception as e:
print(f"Error initializing HuggingFace API: {e}")
return None
def upload_to_hf_dataset(image_bytes, image_filename, data_dict):
"""Upload image and metadata to HuggingFace Dataset."""
global hf_api
if not hf_api:
return False
try:
hf_token = os.environ.get("HF_WRITTE_TOKEN")
# Upload image
hf_api.upload_file(
path_or_fileobj=io.BytesIO(image_bytes),
path_in_repo=f"images/{image_filename}",
repo_id=HF_DATASET_REPO,
repo_type="dataset",
token=hf_token
)
# Upload metadata as individual JSON file (avoids race conditions)
json_filename = image_filename.replace('.jpg', '.json')
json_content = json.dumps(data_dict, indent=2)
hf_api.upload_file(
path_or_fileobj=io.BytesIO(json_content.encode('utf-8')),
path_in_repo=f"metadata/{json_filename}",
repo_id=HF_DATASET_REPO,
repo_type="dataset",
token=hf_token
)
return True
except Exception as e:
print(f"Error uploading to HuggingFace Dataset: {e}")
return False
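# Resulting dataset layout in HF_DATASET_REPO (filenames are hypothetical):
#   images/20240101_120000_ab12cd34.jpg
#   metadata/20240101_120000_ab12cd34.json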
def save_uploaded_data(image, instruction, pos_caption, neg_caption, count, points=None):
"""
Save uploaded image and prompt data for collection.
Tries HuggingFace Dataset first, falls back to local storage.
"""
global hf_api
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
unique_id = str(uuid.uuid4())[:8]
image_filename = f"{timestamp}_{unique_id}.jpg"
# Prepare image bytes
img_buffer = io.BytesIO()
image.save(img_buffer, format='JPEG', quality=95)
img_bytes = img_buffer.getvalue()
# Data as dict (for JSON)
data_dict = {
"timestamp": timestamp,
"image_filename": image_filename,
"instruction": instruction,
"pos_caption": pos_caption,
"neg_caption": neg_caption,
"count": count,
"points": points if points else [] # normalized coordinates (0-1)
}
# Try HuggingFace Dataset first
if hf_api:
try:
if upload_to_hf_dataset(img_bytes, image_filename, data_dict):
print(f"Saved to HuggingFace Dataset: {image_filename}")
return
except Exception as e:
print(f"HuggingFace upload failed, falling back to local: {e}")
# Fallback to local storage
try:
image_path = IMAGES_DIR / image_filename
image.save(image_path, "JPEG", quality=95)
# Also save as JSON locally
json_path = DATA_LOG_DIR / "metadata"
json_path.mkdir(exist_ok=True)
with open(json_path / image_filename.replace('.jpg', '.json'), 'w') as f:
json.dump(data_dict, f, indent=2)
# Also append to CSV for backward compatibility
with open(DATA_LOG_FILE, "a", newline="") as f:
writer = csv.writer(f)
writer.writerow([timestamp, image_filename, instruction, pos_caption, neg_caption, count])
print(f"Saved locally: {image_filename}")
except Exception as e:
print(f"Error saving data: {e}")
def validate_image(image):
"""
Validate uploaded image format.
Returns (is_valid, error_message)
"""
if image is None:
return False, "Error: Please upload an image."
# Get file extension
if isinstance(image, str):
ext = os.path.splitext(image)[1].lower()
if ext and ext not in ALLOWED_EXTENSIONS:
return False, f"Error: Unsupported format '{ext}'. Only JPG and PNG are supported."
return True, None
def preprocess_image(image):
"""
Preprocess uploaded image: convert format and resize if needed.
"""
# Handle file path input
if isinstance(image, str):
image = Image.open(image)
# Convert to RGB (handles RGBA, P mode, etc.)
if image.mode != "RGB":
if image.mode in ("RGBA", "LA", "P"):
background = Image.new("RGB", image.size, (255, 255, 255))
if image.mode == "P":
image = image.convert("RGBA")
            background.paste(image, mask=image.split()[-1] if image.mode in ("RGBA", "LA") else None)
image = background
else:
image = image.convert("RGB")
# Resize if image is too large
width, height = image.size
if max(width, height) > MAX_IMAGE_SIZE:
scale = MAX_IMAGE_SIZE / max(width, height)
new_width = int(width * scale)
new_height = int(height * scale)
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
print(f"Resized image from {width}x{height} to {new_width}x{new_height}")
return image
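# Resize example: a 4000x3000 upload is scaled by 1333/4000, giving
# int(4000 * 1333/4000) x int(3000 * 1333/4000) = 1333x999.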
def parse_counting_instruction(instruction: str) -> tuple[str, str]:
"""
Parse natural language counting instruction using Gemini 2.0 Flash.
"""
try:
prompt = PARSING_PROMPT.format(instruction=instruction)
response = gemini_model.generate_content(prompt)
response_text = response.text.strip()
# Clean up response - remove markdown code blocks if present
if response_text.startswith("```"):
response_text = response_text.split("```")[1]
if response_text.startswith("json"):
response_text = response_text[4:]
response_text = response_text.strip()
result = json.loads(response_text)
# Convert lists to caption strings
pos_items = result.get("A", [])
neg_items = result.get("B", [])
# Join items with " and " and add period
pos_caption = " and ".join(pos_items) + "." if pos_items else ""
neg_caption = " and ".join(neg_items) + "." if neg_items else "None."
return pos_caption, neg_caption
except Exception as e:
print(f"Error parsing instruction: {e}")
return instruction.strip() + ".", "None."
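# Intended parses per PARSING_PROMPT's rules (actual output depends on the
# live Gemini model, so these are expectations, not guarantees):
#   parse_counting_instruction("Count apples, not green apples")
#   -> ("apples.", "green apples.")
#   parse_counting_instruction("Count green apples, not apples")
#   -> ("green apples.", "None.")   # B more general than A is dropped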
def load_model():
"""Load model and processor once at startup"""
global model, processor, device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "yifehuang97/CountEx_KC_aug_v3_12140136_v2"
model = CountEX.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
model = model.to(torch.bfloat16)
model = model.to(device)
model.eval()
processor_id = "fushh7/llmdet_swin_tiny_hf"
processor = GroundingDinoProcessor.from_pretrained(processor_id)
return model, processor, device
def discriminative_point_suppression(
points,
neg_points,
pos_queries,
neg_queries,
image_size,
pixel_threshold=5,
similarity_threshold=0.3,
):
"""Discriminative Point Suppression (DPS)"""
if not neg_points or not points:
return points, list(range(len(points))), {}
width, height = image_size
N, M = len(points), len(neg_points)
points_arr = np.array(points) * np.array([width, height])
neg_points_arr = np.array(neg_points) * np.array([width, height])
spatial_dist = np.linalg.norm(
points_arr[:, None, :] - neg_points_arr[None, :, :], axis=-1
)
nearest_neg_idx = spatial_dist.argmin(axis=1)
nearest_neg_dist = spatial_dist.min(axis=1)
spatially_close = nearest_neg_dist < pixel_threshold
pos_q = pos_queries / (np.linalg.norm(pos_queries, axis=-1, keepdims=True) + 1e-8)
neg_q = neg_queries / (np.linalg.norm(neg_queries, axis=-1, keepdims=True) + 1e-8)
matched_neg_q = neg_q[nearest_neg_idx]
query_sim = (pos_q * matched_neg_q).sum(axis=-1)
semantically_similar = query_sim > similarity_threshold
should_suppress = spatially_close & semantically_similar
keep_mask = ~should_suppress
filtered_points = np.array(points)[keep_mask].tolist()
filtered_indices = np.where(keep_mask)[0].tolist()
suppression_info = {
"nearest_neg_idx": nearest_neg_idx.tolist(),
"nearest_neg_dist": nearest_neg_dist.tolist(),
"query_similarity": query_sim.tolist(),
"spatially_close": spatially_close.tolist(),
"semantically_similar": semantically_similar.tolist(),
"suppressed_indices": np.where(should_suppress)[0].tolist(),
}
return filtered_points, filtered_indices, suppression_info
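# Toy DPS example (hypothetical values): point 0 coincides with a negative
# point (0 px apart) and matches its query (cosine similarity 1.0), so it is
# suppressed; point 1 is ~56.6 px away and dissimilar, so it is kept:
#   pts = [[0.5, 0.5], [0.1, 0.1]]
#   neg = [[0.5, 0.5]]
#   pq = np.array([[1.0, 0.0], [0.0, 1.0]])
#   nq = np.array([[1.0, 0.0]])
#   discriminative_point_suppression(pts, neg, pq, nq, image_size=(100, 100))
#   -> ([[0.1, 0.1]], [1], {...})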
def count_objects(image, instruction, box_threshold, point_radius, point_color):
"""Main inference function for counting objects"""
global model, processor, device
# Validate image format
is_valid, error_msg = validate_image(image)
if not is_valid:
raise gr.Error(error_msg)
if model is None:
load_model()
# Preprocess image
image = preprocess_image(image)
# Parse instruction using Gemini
pos_caption, neg_caption = parse_counting_instruction(instruction)
parsed_info = f"Positive: {pos_caption}\nNegative: {neg_caption}"
# Process positive caption
pos_inputs = processor(
images=image,
text=pos_caption,
return_tensors="pt",
padding=True
)
pos_inputs = pos_inputs.to(device)
pos_inputs['pixel_values'] = pos_inputs['pixel_values'].to(torch.bfloat16)
    # Process negative caption; normalize empty/placeholder captions to "None."
    use_neg = bool(neg_caption and neg_caption.strip() and neg_caption not in ('.', 'None.'))
    if not use_neg:
        neg_caption = "None."
neg_inputs = processor(
images=image,
text=neg_caption,
return_tensors="pt",
padding=True
)
neg_inputs = {k: v.to(device) for k, v in neg_inputs.items()}
neg_inputs['pixel_values'] = neg_inputs['pixel_values'].to(torch.bfloat16)
pos_inputs['neg_token_type_ids'] = neg_inputs['token_type_ids']
pos_inputs['neg_attention_mask'] = neg_inputs['attention_mask']
pos_inputs['neg_pixel_mask'] = neg_inputs['pixel_mask']
pos_inputs['neg_pixel_values'] = neg_inputs['pixel_values']
pos_inputs['neg_input_ids'] = neg_inputs['input_ids']
    pos_inputs['use_neg'] = True  # the negative branch always runs; "None." encodes "no exclusions"
# Run inference
    with torch.no_grad():
        outputs = model(**pos_inputs)
    # Box centers (cx, cy) double as point predictions for counting
    outputs["pred_points"] = outputs["pred_boxes"][:, :, :2]
    outputs["pred_logits"] = outputs["logits"]
threshold = box_threshold if box_threshold > 0 else model.box_threshold
pos_queries = outputs["pos_queries"].squeeze(0).float()
neg_queries = outputs["neg_queries"].squeeze(0).float()
    # Keep only the last decoder layer's queries and restore a batch dimension
    pos_queries = pos_queries[-1].squeeze(0).unsqueeze(0)
    neg_queries = neg_queries[-1].squeeze(0).unsqueeze(0)
results = post_process_grounded_object_detection_with_queries(outputs, pos_queries, box_threshold=threshold)[0]
boxes = results["boxes"]
boxes = [box.tolist() for box in boxes]
    points = [[box[0], box[1]] for box in boxes]  # normalized (cx, cy) box centers
neg_points = []
neg_results = None
if "neg_pred_boxes" in outputs and "neg_logits" in outputs:
neg_outputs = outputs.copy()
neg_outputs["pred_boxes"] = outputs["neg_pred_boxes"]
neg_outputs["logits"] = outputs["neg_logits"]
neg_outputs["pred_points"] = outputs["neg_pred_boxes"][:, :, :2]
neg_outputs["pred_logits"] = outputs["neg_logits"]
neg_results = post_process_grounded_object_detection_with_queries(neg_outputs, neg_queries, box_threshold=0.5)[0]
neg_boxes = neg_results["boxes"]
neg_boxes = [box.tolist() for box in neg_boxes]
neg_points = [[box[0], box[1]] for box in neg_boxes]
pos_queries_np = results["queries"].cpu().numpy()
    neg_queries_np = neg_results["queries"].cpu().numpy() if neg_results is not None else np.array([])
img_size = image.size
if len(neg_points) > 0 and len(neg_queries_np) > 0:
filtered_points, kept_indices, suppression_info = discriminative_point_suppression(
points,
neg_points,
pos_queries_np,
neg_queries_np,
image_size=img_size,
pixel_threshold=5,
similarity_threshold=0.3,
)
filtered_boxes = [boxes[i] for i in kept_indices]
else:
filtered_points = points
filtered_boxes = boxes
points = filtered_points
boxes = filtered_boxes
# Visualize results
img_w, img_h = image.size
img_draw = image.copy()
draw = ImageDraw.Draw(img_draw)
for point in points:
x = point[0] * img_w
y = point[1] * img_h
draw.ellipse(
[x - point_radius, y - point_radius, x + point_radius, y + point_radius],
fill=point_color
)
count = len(points)
# Save uploaded data for collection
save_uploaded_data(image, instruction, pos_caption, neg_caption, count, points)
return img_draw, f"Count: {count}", parsed_info
def count_objects_manual(image, pos_caption, neg_caption, box_threshold, point_radius, point_color):
"""Manual mode: directly use provided positive and negative captions."""
global model, processor, device
# Validate image format
is_valid, error_msg = validate_image(image)
if not is_valid:
raise gr.Error(error_msg)
if model is None:
load_model()
# Preprocess image
image = preprocess_image(image)
if pos_caption and not pos_caption.endswith('.'):
pos_caption = pos_caption + '.'
if neg_caption and not neg_caption.endswith('.'):
neg_caption = neg_caption + '.'
if not neg_caption or neg_caption.strip() == '':
neg_caption = "None."
parsed_info = f"Positive: {pos_caption}\nNegative: {neg_caption}"
pos_inputs = processor(
images=image,
text=pos_caption,
return_tensors="pt",
padding=True
)
pos_inputs = pos_inputs.to(device)
pos_inputs['pixel_values'] = pos_inputs['pixel_values'].to(torch.bfloat16)
    # Normalize empty/placeholder negative captions to "None."
    use_neg = bool(neg_caption and neg_caption.strip() and neg_caption not in ('.', 'None.'))
    if not use_neg:
        neg_caption = "None."
neg_inputs = processor(
images=image,
text=neg_caption,
return_tensors="pt",
padding=True
)
neg_inputs = {k: v.to(device) for k, v in neg_inputs.items()}
neg_inputs['pixel_values'] = neg_inputs['pixel_values'].to(torch.bfloat16)
pos_inputs['neg_token_type_ids'] = neg_inputs['token_type_ids']
pos_inputs['neg_attention_mask'] = neg_inputs['attention_mask']
pos_inputs['neg_pixel_mask'] = neg_inputs['pixel_mask']
pos_inputs['neg_pixel_values'] = neg_inputs['pixel_values']
pos_inputs['neg_input_ids'] = neg_inputs['input_ids']
    pos_inputs['use_neg'] = True  # the negative branch always runs; "None." encodes "no exclusions"
    with torch.no_grad():
        outputs = model(**pos_inputs)
    # Box centers (cx, cy) double as point predictions for counting
    outputs["pred_points"] = outputs["pred_boxes"][:, :, :2]
    outputs["pred_logits"] = outputs["logits"]
threshold = box_threshold if box_threshold > 0 else model.box_threshold
pos_queries = outputs["pos_queries"].squeeze(0).float()
neg_queries = outputs["neg_queries"].squeeze(0).float()
    # Keep only the last decoder layer's queries and restore a batch dimension
    pos_queries = pos_queries[-1].squeeze(0).unsqueeze(0)
    neg_queries = neg_queries[-1].squeeze(0).unsqueeze(0)
results = post_process_grounded_object_detection_with_queries(outputs, pos_queries, box_threshold=threshold)[0]
boxes = results["boxes"]
boxes = [box.tolist() for box in boxes]
    points = [[box[0], box[1]] for box in boxes]  # normalized (cx, cy) box centers
neg_points = []
neg_results = None
if "neg_pred_boxes" in outputs and "neg_logits" in outputs:
neg_outputs = outputs.copy()
neg_outputs["pred_boxes"] = outputs["neg_pred_boxes"]
neg_outputs["logits"] = outputs["neg_logits"]
neg_outputs["pred_points"] = outputs["neg_pred_boxes"][:, :, :2]
neg_outputs["pred_logits"] = outputs["neg_logits"]
neg_results = post_process_grounded_object_detection_with_queries(neg_outputs, neg_queries, box_threshold=0.5)[0]
neg_boxes = neg_results["boxes"]
neg_boxes = [box.tolist() for box in neg_boxes]
neg_points = [[box[0], box[1]] for box in neg_boxes]
pos_queries_np = results["queries"].cpu().numpy()
    neg_queries_np = neg_results["queries"].cpu().numpy() if neg_results is not None else np.array([])
img_size = image.size
if len(neg_points) > 0 and len(neg_queries_np) > 0:
filtered_points, kept_indices, suppression_info = discriminative_point_suppression(
points,
neg_points,
pos_queries_np,
neg_queries_np,
image_size=img_size,
pixel_threshold=5,
similarity_threshold=0.3,
)
filtered_boxes = [boxes[i] for i in kept_indices]
else:
filtered_points = points
filtered_boxes = boxes
points = filtered_points
boxes = filtered_boxes
img_w, img_h = image.size
img_draw = image.copy()
draw = ImageDraw.Draw(img_draw)
for point in points:
x = point[0] * img_w
y = point[1] * img_h
draw.ellipse(
[x - point_radius, y - point_radius, x + point_radius, y + point_radius],
fill=point_color
)
count = len(points)
instruction = f"[MANUAL] pos: {pos_caption} | neg: {neg_caption}"
save_uploaded_data(image, instruction, pos_caption, neg_caption, count, points)
return img_draw, f"Count: {count}", parsed_info
def create_demo():
with gr.Blocks(title="CountEx: Discriminative Visual Counting") as demo:
gr.Markdown("""
# CountEx: Fine-Grained Counting via Exemplars and Exclusion
Count specific objects in images using text prompts with exclusion capability.
""")
current_mode = gr.State(value="natural_language")
with gr.Row():
with gr.Column(scale=1):
input_image = gr.Image(type="filepath", label="Input Image (JPG, PNG only)")
with gr.Tabs() as input_tabs:
with gr.TabItem("Natural Language", id=0) as tab_nl:
instruction = gr.Textbox(
label="Counting Instruction",
placeholder="e.g., Count apples, not green apples",
value="Count apples, not green apples",
lines=2
)
gr.Markdown("""
**Examples:**
- "Count apples, not green apples"
- "Count red and black beans, exclude white beans"
- "Count people, not women"
""")
with gr.TabItem("Manual Input", id=1) as tab_manual:
pos_caption = gr.Textbox(
label="Positive Prompt (objects to count)",
placeholder="e.g., apple",
value="apple."
)
neg_caption = gr.Textbox(
label="Negative Prompt (objects to exclude)",
placeholder="e.g., green apple",
value="None."
)
submit_btn = gr.Button("Count Objects", variant="primary", size="lg")
with gr.Accordion("Advanced Settings", open=False):
box_threshold = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.42,
step=0.01,
label="Threshold"
)
point_radius = gr.Slider(
minimum=1,
maximum=20,
value=5,
step=1,
label="Point Radius"
)
point_color = gr.Dropdown(
choices=["blue", "red", "green", "yellow", "cyan", "magenta", "white", "orange"],
value="blue",
label="Point Color"
)
with gr.Column(scale=1):
output_image = gr.Image(type="pil", label="Result")
count_output = gr.Textbox(label="Count Result")
parsed_output = gr.Textbox(label="Parsed Captions", lines=2)
gr.Markdown("### Examples (Natural Language)")
gr.Examples(
examples=[
["examples/apples.png", "Count apples, not green apples"],
["examples/apples.png", "Count apples, exclude red apples"],
["examples/apple.jpg", "Count green apples"],
["examples/apple.jpg", "Count apples, exclude green apples"],
["examples/apple.jpg", "Count apples, exclude red apples"],
["examples/blue_straw_peach.png", "Count blueberries"],
["examples/blue_straw_peach.png", "Count leaf"],
["examples/blue_straw_peach.png", "Count blueberries and cherry"],
["examples/blue_straw_peach.png", "Count blueberries and cherry and strawberry"],
["examples/black_beans.jpg", "Count black beans and soy beans"],
["examples/black_beans.jpg", "Count beans"],
["examples/black_beans.jpg", "Count pig"],
["examples/candy.jpg", "Count brown coffee candy, exclude black coffee candy"],
["examples/candy.jpg", "Count candy"],
["examples/candy.jpg", "Count brown coffee candy and black coffee candy"],
["examples/candy.jpg", "Count sausage"],
["examples/strawberry.jpg", "Count blueberries and strawberry"],
["examples/strawberry.jpg", "Count book"],
["examples/strawberry2.jpg", "Count blueberries, exclude strawberry"],
["examples/women.jpg", "Count people, not women"],
["examples/women.jpg", "Count people, not man"],
["examples/boat-1.jpg", "Count boats, exclude blue boats"],
["examples/boat-1.jpg", "Count boats, exclude red boats"],
],
inputs=[input_image, instruction],
outputs=[output_image, count_output, parsed_output],
fn=lambda img, instr: count_objects(img, instr, 0.42, 5, "blue"),
cache_examples=False,
)
def set_mode_nl():
return "natural_language"
def set_mode_manual():
return "manual"
tab_nl.select(fn=set_mode_nl, outputs=[current_mode])
tab_manual.select(fn=set_mode_manual, outputs=[current_mode])
def handle_submit(mode, image, instr, pos_cap, neg_cap, threshold, radius, color):
if mode == "natural_language":
return count_objects(image, instr, threshold, radius, color)
else:
return count_objects_manual(image, pos_cap, neg_cap, threshold, radius, color)
submit_btn.click(
fn=handle_submit,
inputs=[current_mode, input_image, instruction, pos_caption, neg_caption,
box_threshold, point_radius, point_color],
outputs=[output_image, count_output, parsed_output]
)
return demo
if __name__ == "__main__":
# Initialize HuggingFace API
print("Initializing HuggingFace API...")
init_hf_api()
# Load model at startup
print("Loading model...")
load_model()
print("Model loaded!")
# Create and launch demo
demo = create_demo()
demo.launch()