Spaces:

dwellbot
/

object-memory

Configuration error

russ4stall

fresh history

24f3fb6 6 months ago

20.4 kB

	import gradio as gr
	import torch
	from PIL import Image
	import numpy as np
	import uuid
	import cv2
	import sys
	import os
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	from core.processing import get_dino_boxes_from_prompt, embed_image_dino_large, embed_text, expand_coords_shape
	from core.models import get_sam_predictor
	from core.image_processing import crop_to_mask_size, apply_mask, resize_image
	from core.storage import upload_image_to_s3, add_vector_to_qdrant, add_object_to_neo4j
	from core.storage import query_vector_db_by_mask, get_object_details, query_vector_db_by_text_embedding
	from core.storage import get_all_locations_for_house, set_object_primary_location_hierarchy

	#HOUSE_ID='c8c5fdea-7138-44ea-9f02-7fdcd47ff8cf' #office
	HOUSE_ID='fc2e081a-2b17-4b2e-a1bb-woodward' #woodward


	# ------------------------------
	# Helper functions
	# ------------------------------

	def extract_image_and_stroke_mask(editor_output):
	"""
	Extracts the image and stroke mask from the editor output.

	Parameters:
	editor_output: either a dict with 'background' and 'layers' or an HxWx3/4 array

	Returns:
	A tuple (image, stroke_mask) where:
	- image is the RGB image (HxWx3 array)
	- stroke_mask is a binary mask (HxW array)
	"""
	if isinstance(editor_output, dict):
	bg = editor_output.get('background')
	if bg is None:
	return None, None
	image = bg[..., :3]
	stroke_mask = np.zeros(image.shape[:2], dtype=np.uint8)
	for layer in editor_output.get('layers', []):
	stroke_mask \|= (layer[..., 3] > 0).astype(np.uint8)
	else:
	arr = editor_output
	if arr.shape[2] == 4:
	image = arr[..., :3]
	stroke_mask = (arr[..., 3] > 0).astype(np.uint8)
	else:
	image = arr
	stroke_mask = np.zeros(arr.shape[:2], dtype=np.uint8)
	return image, stroke_mask

	def apply_sam(editor_output, background_mode="remove", crop_result=True) -> np.ndarray:
	"""
	Uses SAM to generate a segmentation mask based on the sketch (stroke_mask),
	then either removes or extremely blurs the background. Optionally crops to
	the foreground bbox.

	Parameters:
	editor_output: either a dict with 'background' and 'layers' or an HxWx3/4 array
	background_mode: "remove" or "extreme_blur"
	crop_result: whether to crop output to fg bbox

	Returns:
	HxWx3 uint8 array
	"""
	# --- 1) pull RGB + sketch mask ---
	image, stroke_mask = extract_image_and_stroke_mask(editor_output)

	# if no sketch, just return original
	if stroke_mask.sum() == 0:
	return image

	# preprocess & set image
	image = resize_image(image)
	get_sam_predictor().set_image(image)

	# downscale stroke mask to predictor size
	h, w = image.shape[:2]
	stroke_small = cv2.resize(stroke_mask, (w, h), interpolation=cv2.INTER_NEAREST)
	point_coords, point_labels = stroke_to_coords(stroke_small)

	# now actually predict using the strokes
	with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
	masks, scores, logits = get_sam_predictor().predict(
	point_coords=point_coords,
	point_labels=point_labels,
	box=None,
	multimask_output=False
	)

	# pick the highest-score mask and binarize
	best_idx = int(np.argmax(scores))
	mask = masks[best_idx] > 0.5

	# composite
	output = apply_mask(image, mask, background_mode)

	# optional crop
	if crop_result:
	output = crop_to_mask_size(output, mask)

	return output

	def apply_grounded_sam(editor_output, prompt: str, crop_result=True) -> np.ndarray:
	# 1) pull RGB out
	image, stroke_mask = extract_image_and_stroke_mask(editor_output)

	sam_boxes = get_dino_boxes_from_prompt(image, prompt)

	point_coords = None
	point_labels = None

	if stroke_mask.sum() > 0:
	point_coords, point_labels = stroke_to_coords(stroke_mask)
	point_coords, point_labels = expand_coords_shape(point_coords, point_labels, sam_boxes.shape[0])

	# 5) feed those boxes into SAM2
	get_sam_predictor().set_image(image)
	masks, scores_sam, _ = get_sam_predictor().predict(
	point_coords=point_coords,
	point_labels=point_labels,
	box=sam_boxes,
	multimask_output=False
	)

	# 6) pick the best SAM proposal, composite & crop
	best = int(np.argmax(scores_sam))
	# 1) pick the best mask and remove any leading batch‐dim
	mask = masks[best] > 0.5 # masks[best] should give you shape (H, W)

	output = apply_mask(image, mask, background_mode)

	if crop_result:
	output = crop_to_mask_size(output, mask)

	return output

	def add_item(image, description, object_id, background_mode, click_points):
	"""
	Processes the image for memorization:
	- Resizes it.
	- Optionally applies SAM processing (background removal or extreme blur) based on background_mode.
	- Generates a caption if needed.
	- Computes the CLIP embedding and stores it in Qdrant.
	"""

	#apply clip embeddings
	image_features = embed_image_dino_large(image)

	#generate id's
	if not object_id or object_id.strip() == "":
	object_id = str(uuid.uuid4())
	view_id = str(uuid.uuid4())

	#upload original full-res to S3
	key = f"object_collection/{object_id}/{view_id}.png"
	image_url = upload_image_to_s3(image, key)

	store_image_in_qdrant(view_id, vector=image_features, object_id=object_id, house_id=HOUSE_ID, image_url=image_url)

	if not (description is None or description.strip() == ""):
	desc_features = embed_text(description)
	store_text_in_qdrant(vector=desc_features, object_id=object_id, house_id=HOUSE_ID, description=description)

	store_in_neo4j(object_id, HOUSE_ID, description, object_id)

	return f"Item added under object ID: {object_id}\nDescription: {description}"

	def query_item(query_image, background_mode, click_points, k=5):
	"""
	Processes the query image:
	- Resizes it.
	- Optionally applies SAM processing based on background_mode and click points.
	- Computes the CLIP embedding and queries Qdrant.
	- Returns matching objects.
	"""
	search_results = query_vector_db_by_mask(query_image, k)

	object_scores = {}
	object_views = {}
	for result in search_results:
	obj_id = result.payload.get("object_id")
	score = result.score
	if obj_id in object_scores:
	object_scores[obj_id] = max(object_scores[obj_id], score)
	object_views[obj_id].append(result.payload.get("description"))
	else:
	object_scores[obj_id] = score
	object_views[obj_id] = [result.payload.get("description")]
	all_scores = np.array(list(object_scores.values()))
	exp_scores = np.exp(all_scores)
	probabilities = exp_scores / np.sum(exp_scores) if np.sum(exp_scores) > 0 else np.zeros_like(exp_scores)
	results = []
	for i, (obj_id, score) in enumerate(object_scores.items()):
	results.append({
	"object_id": obj_id,
	"aggregated_similarity": float(score),
	"probability": float(probabilities[i]),
	"descriptions": object_views[obj_id]
	})
	return results

	def query_by_text(description, k=5):
	"""
	Embeds the provided text and queries the vector DB.
	Returns top k matches in the usual object result format.
	"""
	if not description.strip():
	return {"error": "Description cannot be empty."}

	query_features = embed_text(description)

	# Note: assuming you have or can implement a `query_vector_db_by_text` similar to `query_vector_db_by_mask`
	search_results = query_vector_db_by_text_embedding(query_features, k)

	object_scores = {}
	object_views = {}
	for result in search_results:
	obj_id = result.payload.get("object_id")
	score = result.score
	if obj_id in object_scores:
	object_scores[obj_id] = max(object_scores[obj_id], score)
	object_views[obj_id].append(result.payload.get("description"))
	else:
	object_scores[obj_id] = score
	object_views[obj_id] = [result.payload.get("description")]
	all_scores = np.array(list(object_scores.values()))
	exp_scores = np.exp(all_scores)
	probabilities = exp_scores / np.sum(exp_scores) if np.sum(exp_scores) > 0 else np.zeros_like(exp_scores)
	results = []
	for i, (obj_id, score) in enumerate(object_scores.items()):
	results.append({
	"object_id": obj_id,
	"aggregated_similarity": float(score),
	"probability": float(probabilities[i]),
	"descriptions": object_views[obj_id]
	})
	return results


	def store_image_in_qdrant(view_id, vector : np.ndarray, object_id, house_id, image_url : str):
	if object_id is None:
	object_id = str(uuid.uuid4())

	payload = {"object_id": object_id, "image_url": image_url, "house_id": house_id, "type": "image", "embedding_model": "dino_large"}
	view_id = add_vector_to_qdrant(view_id=view_id,
	vectors={"dinov2_embedding": vector},
	payload=payload)

	return view_id

	def store_text_in_qdrant(vector : np.ndarray, house_id: str, object_id: str = None, description: str = None):
	if object_id is None:
	object_id = str(uuid.uuid4())

	# Add to Qdrant as "text_embedding"
	view_id = add_vector_to_qdrant(
	vectors={"clip_text_embedding": vector},
	payload={"object_id": object_id, "house_id": house_id, "description": description, "type": "text", "embedding_model": "clip"}
	)

	return view_id

	def store_in_neo4j(object_id, house_id, description, qdrant_object_id):
	add_object_to_neo4j(object_id, house_id, description, qdrant_object_id)

	def stroke_to_coords(stroke_mask, max_points=10):
	"""
	Converts a stroke mask into sampled point coordinates and labels.

	Parameters:
	stroke_mask: Binary mask (HxW array) representing the stroke.
	max_points: Maximum number of points to sample.

	Returns:
	A tuple (point_coords, point_labels) where:
	- point_coords is an Nx2 array of sampled [x, y] coordinates.
	- point_labels is an N array of labels (1 for foreground).
	"""
	ys, xs = np.nonzero(stroke_mask)
	coords = np.stack([xs, ys], axis=1)

	# Sample up to max_points
	N = min(max_points, len(coords))
	if N == 0:
	raise ValueError("No stroke pixels found")
	idxs = np.linspace(0, len(coords) - 1, num=N, dtype=int)
	point_coords = coords[idxs]
	point_labels = np.ones(N, dtype=int)

	return point_coords, point_labels


	def get_locations_overview():
	"""
	Fetches all existing locations and their details.
	"""
	locations = get_all_locations_for_house(HOUSE_ID, include_images=True)
	# Example response structure expected from `get_all_locations`:
	# [{"name": "Kitchen", "image": <np.ndarray>, "parents": ["Home"]}, ...]

	overview = []
	for loc in locations:
	overview.append({
	"name": loc["name"],
	"parents": loc.get("parents", []),
	"image": loc.get("image") # Expected to be np.ndarray or PIL.Image
	})
	return overview

	# Remove location function
	def remove_location(name):
	#from core.storage import remove_location
	#remove_location(house_id=HOUSE_ID, name=name)
	return f"Location '{name}' removed."

	def add_update_location(name, parent_str, image):
	parents = [p.strip() for p in parent_str.split(",")] if parent_str else []
	# Example function you'd define in core.storage
	#from core.storage import add_or_update_location
	#add_or_update_location(house_id=HOUSE_ID, name=name, parents=parents, image=image)
	return f"Location '{name}' added or updated with parents {parents}."
	# ------------------------------
	# Gradio Interface
	# ------------------------------

	with gr.Blocks() as demo:
	with gr.Tab("Add Item"):
	image_input = gr.ImageEditor(label="Upload & Sketch", type="numpy")
	seg_prompt_input = gr.Textbox(label="Segmentation Prompt", placeholder="e.g. ‘red apple’")
	description_input = gr.Textbox(label="Description", lines=3)
	object_id_input = gr.Textbox(label="Object ID (optional)")
	background_mode = gr.Radio(choices=["remove","extreme_blur"], value="remove")
	preview_button = gr.Button("Preview")
	preview_output = gr.Image(label="Preview Processed Image", type="numpy")
	submit_button = gr.Button("Submit")
	output_text = gr.Textbox(label="Result")

	preview_button.click(
	fn=lambda img,mode,prompt: (
	apply_grounded_sam(img, prompt)
	if prompt else
	apply_sam(img, mode)
	),
	inputs=[image_input, background_mode, seg_prompt_input],
	outputs=[preview_output]
	)
	submit_button.click(fn=add_item,
	inputs=[preview_output, description_input, object_id_input, background_mode, image_input],
	outputs=[output_text])

	with gr.Tab("Query By Text"):
	text_query_input = gr.Textbox(label="Describe Object", lines=3, placeholder="e.g., 'red ceramic mug'")
	k_text_slider = gr.Slider(1, 10, 5, label="Results k")
	text_query_button = gr.Button("Search by Text")
	text_query_output = gr.JSON(label="Query Results")

	text_query_button.click(query_by_text,
	inputs=[text_query_input, k_text_slider],
	outputs=[text_query_output])

	with gr.Tab("Query By Image"):
	query_input = gr.ImageEditor(label="Query & Sketch", type="numpy")
	query_prompt = gr.Textbox(label="Segmentation Prompt", placeholder="optional text-based mask")
	query_mode = gr.Radio(choices=["remove","extreme_blur"], value="remove")
	query_preview_button = gr.Button("Refresh Preview")
	query_preview= gr.Image(label="Query Preview", type="numpy")
	k_slider = gr.Slider(1,10,1, label="Results k")
	query_button = gr.Button("Search")
	query_output = gr.JSON(label="Query Results")

	# Manual preview refresh
	query_preview_button.click(fn=lambda img,mode,prompt: (
	apply_grounded_sam(img, prompt)
	if prompt else
	apply_sam(img, mode)
	),
	inputs=[query_input, query_mode, query_prompt],
	outputs=[query_preview])

	query_button.click(fn=query_item,
	inputs=[query_preview, query_mode, query_input, k_slider],
	outputs=[query_output])

	with gr.Tab("View Object"):
	view_object_id_input = gr.Textbox(label="Object ID", placeholder="Enter Object ID")
	view_button = gr.Button("View Object")

	add_image_button = gr.Button("Add Image to This Object")
	add_description_button = gr.Button("Add Text Description")
	add_location_button = gr.Button("Add Location")

	view_description_output = gr.Textbox(label="Description")
	view_images_output = gr.Gallery(label="Images", columns=3, height="auto")
	view_texts_output = gr.JSON(label="Text Descriptions")
	view_locations_output = gr.JSON(label="Location Chain")
	view_location_images_output = gr.Gallery(label="Location Images", columns=3, height="auto")

	view_owners_output = gr.JSON(label="Owners")

	desc_object_id_input = 0 #placeholder

	def view_object(object_id):
	data = get_object_details(HOUSE_ID, object_id)
	images_display = [Image.fromarray(img_dict["image"]) for img_dict in data["images"]]
	location_images_display = [Image.fromarray(img) for img in data.get("location_images", [])]
	return (
	data["description"] or "No description found.",
	images_display,
	data["texts"],
	data["locations"],
	location_images_display,
	data["owners"]
	)

	view_button.click(
	view_object,
	inputs=[view_object_id_input],
	outputs=[
	view_description_output,
	view_images_output,
	view_texts_output,
	view_locations_output,
	view_location_images_output,
	view_owners_output
	]
	)

	# Reference your existing Add Item tab's object_id_input
	#add_image_button.click(
	# lambda object_id: gr.update(value=object_id),
	# inputs=[view_object_id_input],
	# outputs=[object_id_input]
	#)

	# Navigation from View Object
	#add_description_button.click(
	# lambda object_id: gr.update(value=object_id),
	# inputs=[view_object_id_input],
	# outputs=[desc_object_id_input]
	#)


	with gr.Tab("Add Description"):
	desc_object_id_input = gr.Textbox(label="Object ID")
	desc_text_input = gr.Textbox(label="Description", lines=3)
	submit_desc_button = gr.Button("Submit Description")
	desc_output = gr.Textbox(label="Result")

	def submit_description(object_id, description):
	desc_features = embed_text(description)
	store_text_in_qdrant(vector=desc_features, object_id=object_id, house_id=HOUSE_ID, description=description)
	return f"Added description to object {object_id}"

	submit_desc_button.click(submit_description,
	inputs=[desc_object_id_input, desc_text_input],
	outputs=[desc_output])



	with gr.Tab("Manage Locations"):
	with gr.Row():
	refresh_locations_button = gr.Button("Refresh Locations List")
	locations_json_output = gr.JSON(label="Locations Overview (Names and Parents)")
	locations_gallery_output = gr.Gallery(label="Location Images", columns=3, height="auto")

	# Controls to Add/Remove locations
	location_name_input = gr.Textbox(label="Location Name")
	location_parent_input = gr.Textbox(label="Parent Location(s)", placeholder="Comma-separated, e.g. 'Home, Kitchen'")
	location_image_input = gr.Image(label="Upload Location Image", type="numpy")

	add_location_button = gr.Button("Add / Update Location")
	remove_location_button = gr.Button("Remove Location")

	location_manage_output = gr.Textbox(label="Result")

	# Backend processor to return both JSON summary and Gallery
	def refresh_locations_ui():
	raw_locations = get_all_locations_for_house(HOUSE_ID, include_images=True)

	# Prepare JSON summary
	summary = [
	{"name": loc["name"], "parents": loc.get("parents", [])}
	for loc in raw_locations
	]

	# Prepare images for gallery
	images = []
	for loc in raw_locations:
	img_base64 = loc.get("image_base64")
	if img_base64:
	from PIL import Image
	import io, base64
	img_data = base64.b64decode(img_base64)
	img_pil = Image.open(io.BytesIO(img_data))
	images.append(img_pil)

	return summary, images

	refresh_locations_button.click(
	refresh_locations_ui,
	inputs=[],
	outputs=[locations_json_output, locations_gallery_output]
	)



	# Add/Update and Remove functions stay unchanged
	add_location_button.click(
	add_update_location,
	inputs=[location_name_input, location_parent_input, location_image_input],
	outputs=[location_manage_output]
	)

	remove_location_button.click(
	remove_location,
	inputs=[location_name_input],
	outputs=[location_manage_output]
	)


	import os
	os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
	demo.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True, root_path="/", show_api=False)