russ4stall
fresh history
24f3fb6
raw
history blame
20.4 kB
import gradio as gr
import torch
from PIL import Image
import numpy as np
import uuid
import cv2
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from core.processing import get_dino_boxes_from_prompt, embed_image_dino_large, embed_text, expand_coords_shape
from core.models import get_sam_predictor
from core.image_processing import crop_to_mask_size, apply_mask, resize_image
from core.storage import upload_image_to_s3, add_vector_to_qdrant, add_object_to_neo4j
from core.storage import query_vector_db_by_mask, get_object_details, query_vector_db_by_text_embedding
from core.storage import get_all_locations_for_house, set_object_primary_location_hierarchy
# Identifier of the house this app operates on; the functions below pass it as
# the house id to the storage helpers. The commented-out line is an alternate
# id kept for switching houses by hand.
#HOUSE_ID='c8c5fdea-7138-44ea-9f02-7fdcd47ff8cf' #office
HOUSE_ID='fc2e081a-2b17-4b2e-a1bb-woodward' #woodward
# ------------------------------
# Helper functions
# ------------------------------
def extract_image_and_stroke_mask(editor_output):
    """
    Split the image-editor output into an image and a binary stroke mask.

    Parameters:
        editor_output: either a dict with 'background' and 'layers' entries,
            an HxWx3/HxWx4 array, or None (treated as "no image").
    Returns:
        A tuple (image, stroke_mask) where:
        - image is the first three channels of the background/array
        - stroke_mask is an HxW uint8 array holding 1 wherever a layer's
          (or the array's own) 4th channel is non-zero, else 0
        (None, None) is returned when there is no image to work with.
    """
    # Nothing was uploaded / drawn yet.
    if editor_output is None:
        return None, None

    if isinstance(editor_output, dict):
        bg = editor_output.get('background')
        if bg is None:
            return None, None
        image = bg[..., :3]
        stroke_mask = np.zeros(image.shape[:2], dtype=np.uint8)
        # 'layers' may be absent or explicitly None; both mean "no strokes".
        for layer in editor_output.get('layers') or []:
            if layer is None:
                continue
            # A pixel counts as sketched when the layer's alpha is non-zero.
            stroke_mask |= (layer[..., 3] > 0).astype(np.uint8)
        return image, stroke_mask

    arr = editor_output
    if arr.shape[2] == 4:
        # Alpha channel of a plain RGBA array doubles as the stroke mask.
        image = arr[..., :3]
        stroke_mask = (arr[..., 3] > 0).astype(np.uint8)
    else:
        image = arr
        stroke_mask = np.zeros(arr.shape[:2], dtype=np.uint8)
    return image, stroke_mask
def apply_sam(editor_output, background_mode="remove", crop_result=True) -> np.ndarray:
    """
    Segment the sketched object with SAM and composite it over a processed
    background.

    The sketch strokes are sampled into foreground points, SAM predicts a
    single mask from them, and `apply_mask` composites the image according to
    `background_mode`. Optionally the result is cropped to the mask.

    Parameters:
        editor_output: either a dict with 'background' and 'layers' or an HxWx3/4 array
        background_mode: "remove" or "extreme_blur" (forwarded to `apply_mask`)
        crop_result: whether to crop the output with `crop_to_mask_size`
    Returns:
        The processed image; the untouched input image when nothing was
        sketched; None when there is no image at all.
    """
    # No editor content yet (e.g. Preview clicked before uploading).
    if editor_output is None:
        return None

    # --- 1) pull RGB + sketch mask ---
    image, stroke_mask = extract_image_and_stroke_mask(editor_output)
    if image is None:
        # The editor dict had no background image.
        return None

    # if no sketch, just return original
    if stroke_mask.sum() == 0:
        return image

    # preprocess & set image
    image = resize_image(image)
    predictor = get_sam_predictor()
    predictor.set_image(image)

    # bring the stroke mask to the (possibly resized) image size so the
    # sampled point coordinates line up with what the predictor sees
    h, w = image.shape[:2]
    stroke_small = cv2.resize(stroke_mask, (w, h), interpolation=cv2.INTER_NEAREST)
    point_coords, point_labels = stroke_to_coords(stroke_small)

    # now actually predict using the strokes
    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
        masks, scores, _ = predictor.predict(
            point_coords=point_coords,
            point_labels=point_labels,
            box=None,
            multimask_output=False,
        )

    # pick the highest-score mask and binarize
    best_idx = int(np.argmax(scores))
    mask = masks[best_idx] > 0.5

    # composite
    output = apply_mask(image, mask, background_mode)

    # optional crop
    if crop_result:
        output = crop_to_mask_size(output, mask)
    return output
def apply_grounded_sam(editor_output, prompt: str, crop_result=True, background_mode="remove") -> np.ndarray:
    """
    Segment the object described by a text prompt and composite it over a
    processed background.

    Boxes are obtained from `get_dino_boxes_from_prompt`, optionally combined
    with points sampled from the user's sketch, and fed to the SAM predictor.
    The best-scoring mask is applied with `apply_mask`.

    Parameters:
        editor_output: either a dict with 'background' and 'layers' or an HxWx3/4 array
        prompt: text describing the object to segment
        crop_result: whether to crop the output with `crop_to_mask_size`
        background_mode: "remove" or "extreme_blur" (forwarded to `apply_mask`).
            Previously this name was used without being defined, so every call
            raised NameError; it now defaults to "remove".
    Returns:
        The processed image; the untouched input image when the prompt yields
        no boxes; None when there is no image at all.
    """
    # No editor content yet (e.g. Preview clicked before uploading).
    if editor_output is None:
        return None

    # pull RGB + sketch mask out of the editor payload
    image, stroke_mask = extract_image_and_stroke_mask(editor_output)
    if image is None:
        return None

    sam_boxes = get_dino_boxes_from_prompt(image, prompt)
    # Nothing matched the prompt: there is no box to segment, and argmax over
    # an empty score array below would raise. Hand back the original image.
    # NOTE(review): assumes the box result exposes `.shape` even when empty.
    if sam_boxes is None or sam_boxes.shape[0] == 0:
        return image

    point_coords = None
    point_labels = None
    if stroke_mask.sum() > 0:
        # Sketch strokes become extra foreground hints, replicated per box.
        point_coords, point_labels = stroke_to_coords(stroke_mask)
        point_coords, point_labels = expand_coords_shape(point_coords, point_labels, sam_boxes.shape[0])

    # feed the boxes (and optional points) into SAM
    predictor = get_sam_predictor()
    predictor.set_image(image)
    masks, scores_sam, _ = predictor.predict(
        point_coords=point_coords,
        point_labels=point_labels,
        box=sam_boxes,
        multimask_output=False,
    )

    # pick the best SAM proposal and binarize it
    best = int(np.argmax(scores_sam))
    mask = masks[best] > 0.5
    # With several boxes the selected entry can still carry a leading
    # singleton dimension; strip it so the mask is HxW.
    # NOTE(review): exact output layout depends on the predictor; confirm.
    if mask.ndim == 3:
        mask = mask[0]

    # composite & optional crop
    output = apply_mask(image, mask, background_mode)
    if crop_result:
        output = crop_to_mask_size(output, mask)
    return output
def add_item(image, description, object_id, background_mode, click_points):
    """
    Store an (already processed) image, and optionally a description, for an object.

    Steps:
    - Embeds the image with `embed_image_dino_large`.
    - Generates a new object id when none is given, and a fresh view id.
    - Uploads the image to S3 and stores the image vector in Qdrant.
    - If a non-blank description is given, embeds it with `embed_text` and
      stores the text vector in Qdrant.
    - Registers the object in Neo4j.

    Parameters:
        image: the image to store (the UI passes the preview output)
        description: optional free-text description
        object_id: existing object id, or blank/None to create a new one
        background_mode: unused here; kept so the UI handler signature is stable
        click_points: unused here; kept so the UI handler signature is stable
    Returns:
        A human-readable status message.
    """
    # Submit pressed before a preview was produced: nothing to embed or upload.
    if image is None:
        return "No processed image available. Click Preview before submitting."

    # image embedding
    image_features = embed_image_dino_large(image)

    # generate ids
    if not object_id or not object_id.strip():
        object_id = str(uuid.uuid4())
    view_id = str(uuid.uuid4())

    # upload the image to S3
    key = f"object_collection/{object_id}/{view_id}.png"
    image_url = upload_image_to_s3(image, key)

    store_image_in_qdrant(view_id, vector=image_features, object_id=object_id, house_id=HOUSE_ID, image_url=image_url)

    # optional text embedding
    if description is not None and description.strip() != "":
        desc_features = embed_text(description)
        store_text_in_qdrant(vector=desc_features, object_id=object_id, house_id=HOUSE_ID, description=description)

    # the same id is used for the graph node and its Qdrant reference
    store_in_neo4j(object_id, HOUSE_ID, description, object_id)

    return f"Item added under object ID: {object_id}\nDescription: {description}"
def _aggregate_search_results(search_results):
    """
    Group vector-search hits by object and turn scores into probabilities.

    For each object id the maximum hit score is kept and every hit's
    'description' payload value is collected. Probabilities are a softmax over
    the per-object scores. Objects appear in first-hit order.

    Parameters:
        search_results: iterable of hits exposing `.payload` (dict-like) and `.score`
    Returns:
        A list of dicts with keys "object_id", "aggregated_similarity",
        "probability" and "descriptions"; empty when there are no hits.
    """
    object_scores = {}
    object_views = {}
    for result in search_results or []:
        obj_id = result.payload.get("object_id")
        score = result.score
        if obj_id in object_scores:
            object_scores[obj_id] = max(object_scores[obj_id], score)
        else:
            object_scores[obj_id] = score
        object_views.setdefault(obj_id, []).append(result.payload.get("description"))

    if not object_scores:
        return []

    all_scores = np.array(list(object_scores.values()), dtype=float)
    # Subtracting the max leaves the softmax unchanged mathematically but
    # keeps exp() from overflowing on large scores.
    exp_scores = np.exp(all_scores - np.max(all_scores))
    probabilities = exp_scores / np.sum(exp_scores)

    return [
        {
            "object_id": obj_id,
            "aggregated_similarity": float(score),
            "probability": float(probabilities[i]),
            "descriptions": object_views[obj_id],
        }
        for i, (obj_id, score) in enumerate(object_scores.items())
    ]


def query_item(query_image, background_mode, click_points, k=5):
    """
    Query the vector DB with an (already processed) image.

    The image is passed to `query_vector_db_by_mask` and the hits are grouped
    per object.

    Parameters:
        query_image: the image to search with (the UI passes the preview output)
        background_mode: unused here; kept so the UI handler signature is stable
        click_points: unused here; kept so the UI handler signature is stable
        k: number of hits requested from the vector DB
    Returns:
        A list of per-object result dicts (see `_aggregate_search_results`).
    """
    search_results = query_vector_db_by_mask(query_image, k)
    return _aggregate_search_results(search_results)


def query_by_text(description, k=5):
    """
    Embed the provided text and query the vector DB.

    Parameters:
        description: free-text description of the object
        k: number of hits requested from the vector DB
    Returns:
        A list of per-object result dicts in the same format as `query_item`,
        or {"error": ...} when the description is missing or blank.
    """
    if not description or not description.strip():
        return {"error": "Description cannot be empty."}
    query_features = embed_text(description)
    search_results = query_vector_db_by_text_embedding(query_features, k)
    return _aggregate_search_results(search_results)
def store_image_in_qdrant(view_id, vector : np.ndarray, object_id, house_id, image_url : str):
    """
    Store an image embedding in Qdrant under the "dinov2_embedding" vector name.

    Parameters:
        view_id: id to store the vector under
        vector: the image embedding
        object_id: owning object id; a new UUID is generated when None
        house_id: id of the house the object belongs to
        image_url: location of the uploaded image
    Returns:
        Whatever `add_vector_to_qdrant` returns for the stored view.
    """
    resolved_object_id = str(uuid.uuid4()) if object_id is None else object_id
    image_payload = {
        "object_id": resolved_object_id,
        "image_url": image_url,
        "house_id": house_id,
        "type": "image",
        "embedding_model": "dino_large",
    }
    return add_vector_to_qdrant(
        view_id=view_id,
        vectors={"dinov2_embedding": vector},
        payload=image_payload,
    )
def store_text_in_qdrant(vector : np.ndarray, house_id: str, object_id: str = None, description: str = None):
    """
    Store a text embedding in Qdrant under the "clip_text_embedding" vector name.

    Parameters:
        vector: the text embedding
        house_id: id of the house the object belongs to
        object_id: owning object id; a new UUID is generated when None
        description: the text the embedding was computed from
    Returns:
        Whatever `add_vector_to_qdrant` returns (no explicit view id is passed).
    """
    resolved_object_id = object_id if object_id is not None else str(uuid.uuid4())
    text_payload = {
        "object_id": resolved_object_id,
        "house_id": house_id,
        "description": description,
        "type": "text",
        "embedding_model": "clip",
    }
    return add_vector_to_qdrant(
        vectors={"clip_text_embedding": vector},
        payload=text_payload,
    )
def store_in_neo4j(object_id, house_id, description, qdrant_object_id):
    """
    Register an object in Neo4j by delegating to `add_object_to_neo4j`.

    All four arguments are forwarded positionally, unchanged; the delegate's
    return value is discarded.
    """
    add_object_to_neo4j(
        object_id,
        house_id,
        description,
        qdrant_object_id,
    )
def stroke_to_coords(stroke_mask, max_points=10):
    """
    Sample evenly spaced foreground points from a stroke mask.

    Parameters:
        stroke_mask: HxW array; every non-zero entry is a stroke pixel.
        max_points: upper bound on the number of points returned.
    Returns:
        A tuple (point_coords, point_labels) where:
        - point_coords is an Nx2 array of [x, y] pixel coordinates
        - point_labels is a length-N array of ones (foreground)
    Raises:
        ValueError: if no point can be sampled (empty mask or max_points == 0).
    """
    rows, cols = np.nonzero(stroke_mask)
    # x comes first, so columns precede rows.
    xy = np.column_stack((cols, rows))
    total = xy.shape[0]

    sample_count = min(max_points, total)
    if sample_count == 0:
        raise ValueError("No stroke pixels found")

    # Evenly spaced indices over the stroke pixels, first and last included.
    picks = np.linspace(0, total - 1, num=sample_count, dtype=int)
    return xy[picks], np.ones(sample_count, dtype=int)
def get_locations_overview():
    """
    Return a name/parents/image summary for every location of the house.

    Locations are fetched with `get_all_locations_for_house` (images included).
    Each entry keeps the location's "name", its "parents" (empty list when
    absent) and its "image" (None when absent).
    """
    return [
        {
            "name": location["name"],
            "parents": location.get("parents", []),
            "image": location.get("image"),
        }
        for location in get_all_locations_for_house(HOUSE_ID, include_images=True)
    ]
# Remove location function
def remove_location(name):
    """
    Report removal of a location.

    Currently a stub: the storage call is not wired in, so only the
    confirmation message is produced.
    """
    # TODO: call core.storage.remove_location(house_id=HOUSE_ID, name=name)
    # once it is available.
    confirmation = f"Location '{name}' removed."
    return confirmation
def add_update_location(name, parent_str, image):
    """
    Report adding or updating a location.

    Currently a stub: the storage call is not wired in, so only the parent
    list is parsed and the confirmation message is produced.

    Parameters:
        name: location name
        parent_str: comma-separated parent location names; may be empty/None
        image: location image (unused until the storage call is wired in)
    Returns:
        A confirmation message including the parsed parent list.
    """
    # Split on commas, trim whitespace, and drop blank entries so stray or
    # trailing commas ("Home, ,Kitchen,") do not yield empty parent names.
    parents = (
        [part.strip() for part in parent_str.split(",") if part.strip()]
        if parent_str
        else []
    )
    # TODO: call core.storage.add_or_update_location(
    #     house_id=HOUSE_ID, name=name, parents=parents, image=image)
    # once it is available.
    return f"Location '{name}' added or updated with parents {parents}."
# ------------------------------
# Gradio Interface
# ------------------------------
def _preview_segmentation(img, mode, prompt):
    """
    Produce the preview image for both the Add Item and Query By Image tabs.

    A non-empty prompt selects the text-prompted path; otherwise the
    sketch-based path is used with the chosen background mode.
    NOTE: `mode` is not forwarded on the prompt path; `apply_grounded_sam`
    is called with just the image and the prompt.
    """
    if prompt:
        return apply_grounded_sam(img, prompt)
    return apply_sam(img, mode)


# Set before the app is constructed so the setting is already in place when
# Gradio builds and launches it.
# NOTE(review): confirm at which point Gradio reads this variable.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

with gr.Blocks() as demo:
    with gr.Tab("Add Item"):
        image_input = gr.ImageEditor(label="Upload & Sketch", type="numpy")
        seg_prompt_input = gr.Textbox(label="Segmentation Prompt", placeholder="e.g. ‘red apple’")
        description_input = gr.Textbox(label="Description", lines=3)
        object_id_input = gr.Textbox(label="Object ID (optional)")
        background_mode = gr.Radio(choices=["remove","extreme_blur"], value="remove")
        preview_button = gr.Button("Preview")
        preview_output = gr.Image(label="Preview Processed Image", type="numpy")
        submit_button = gr.Button("Submit")
        output_text = gr.Textbox(label="Result")

        preview_button.click(
            fn=_preview_segmentation,
            inputs=[image_input, background_mode, seg_prompt_input],
            outputs=[preview_output],
        )
        # The processed preview (not the raw editor content) is what gets stored.
        submit_button.click(
            fn=add_item,
            inputs=[preview_output, description_input, object_id_input, background_mode, image_input],
            outputs=[output_text],
        )

    with gr.Tab("Query By Text"):
        text_query_input = gr.Textbox(label="Describe Object", lines=3, placeholder="e.g., 'red ceramic mug'")
        k_text_slider = gr.Slider(1, 10, 5, label="Results k")
        text_query_button = gr.Button("Search by Text")
        text_query_output = gr.JSON(label="Query Results")

        text_query_button.click(
            query_by_text,
            inputs=[text_query_input, k_text_slider],
            outputs=[text_query_output],
        )

    with gr.Tab("Query By Image"):
        query_input = gr.ImageEditor(label="Query & Sketch", type="numpy")
        query_prompt = gr.Textbox(label="Segmentation Prompt", placeholder="optional text-based mask")
        query_mode = gr.Radio(choices=["remove","extreme_blur"], value="remove")
        query_preview_button = gr.Button("Refresh Preview")
        query_preview = gr.Image(label="Query Preview", type="numpy")
        k_slider = gr.Slider(1,10,1, label="Results k")
        query_button = gr.Button("Search")
        query_output = gr.JSON(label="Query Results")

        # Manual preview refresh
        query_preview_button.click(
            fn=_preview_segmentation,
            inputs=[query_input, query_mode, query_prompt],
            outputs=[query_preview],
        )
        # The search runs on the processed preview, not the raw editor content.
        query_button.click(
            fn=query_item,
            inputs=[query_preview, query_mode, query_input, k_slider],
            outputs=[query_output],
        )

    with gr.Tab("View Object"):
        view_object_id_input = gr.Textbox(label="Object ID", placeholder="Enter Object ID")
        view_button = gr.Button("View Object")
        # TODO: the three buttons below are rendered but not wired to any
        # handler yet. The intended behavior is to copy the viewed object id
        # into the object-id fields of the "Add Item" / "Add Description" tabs.
        add_image_button = gr.Button("Add Image to This Object")
        add_description_button = gr.Button("Add Text Description")
        # Named distinctly from the "Manage Locations" button, which previously
        # reused (and silently shadowed) the same variable name.
        view_add_location_button = gr.Button("Add Location")

        view_description_output = gr.Textbox(label="Description")
        view_images_output = gr.Gallery(label="Images", columns=3, height="auto")
        view_texts_output = gr.JSON(label="Text Descriptions")
        view_locations_output = gr.JSON(label="Location Chain")
        view_location_images_output = gr.Gallery(label="Location Images", columns=3, height="auto")
        view_owners_output = gr.JSON(label="Owners")

        def view_object(object_id):
            """Fetch an object's details and map them onto the six output widgets."""
            data = get_object_details(HOUSE_ID, object_id)
            # NOTE(review): assumes a falsy result means "object not found";
            # confirm against get_object_details.
            if not data:
                return ("No description found.", [], [], [], [], [])

            images_display = [Image.fromarray(img_dict["image"]) for img_dict in data["images"]]
            location_images_display = [Image.fromarray(img) for img in data.get("location_images", [])]
            return (
                data["description"] or "No description found.",
                images_display,
                data["texts"],
                data["locations"],
                location_images_display,
                data["owners"],
            )

        view_button.click(
            view_object,
            inputs=[view_object_id_input],
            outputs=[
                view_description_output,
                view_images_output,
                view_texts_output,
                view_locations_output,
                view_location_images_output,
                view_owners_output,
            ],
        )

    with gr.Tab("Add Description"):
        desc_object_id_input = gr.Textbox(label="Object ID")
        desc_text_input = gr.Textbox(label="Description", lines=3)
        submit_desc_button = gr.Button("Submit Description")
        desc_output = gr.Textbox(label="Result")

        def submit_description(object_id, description):
            """Embed a description and attach it to an existing object."""
            # A blank id would be stored as-is (only None triggers id
            # generation downstream), orphaning the description.
            if not object_id or not object_id.strip():
                return "Object ID cannot be empty."
            if not description or not description.strip():
                return "Description cannot be empty."
            desc_features = embed_text(description)
            store_text_in_qdrant(vector=desc_features, object_id=object_id, house_id=HOUSE_ID, description=description)
            return f"Added description to object {object_id}"

        submit_desc_button.click(
            submit_description,
            inputs=[desc_object_id_input, desc_text_input],
            outputs=[desc_output],
        )

    with gr.Tab("Manage Locations"):
        # NOTE(review): the original indentation was lost; it is assumed that
        # only the refresh button lives inside this Row — confirm the layout.
        with gr.Row():
            refresh_locations_button = gr.Button("Refresh Locations List")

        locations_json_output = gr.JSON(label="Locations Overview (Names and Parents)")
        locations_gallery_output = gr.Gallery(label="Location Images", columns=3, height="auto")

        # Controls to Add/Remove locations
        location_name_input = gr.Textbox(label="Location Name")
        location_parent_input = gr.Textbox(label="Parent Location(s)", placeholder="Comma-separated, e.g. 'Home, Kitchen'")
        location_image_input = gr.Image(label="Upload Location Image", type="numpy")

        add_location_button = gr.Button("Add / Update Location")
        remove_location_button = gr.Button("Remove Location")
        location_manage_output = gr.Textbox(label="Result")

        def refresh_locations_ui():
            """Return (JSON summary, gallery images) for all locations of the house."""
            # Imported once per call instead of once per location.
            import base64
            import io

            raw_locations = get_all_locations_for_house(HOUSE_ID, include_images=True)

            # Prepare JSON summary
            summary = [
                {"name": loc["name"], "parents": loc.get("parents", [])}
                for loc in raw_locations
            ]

            # Prepare images for gallery; locations without an image are skipped
            images = []
            for loc in raw_locations:
                img_base64 = loc.get("image_base64")
                if img_base64:
                    img_data = base64.b64decode(img_base64)
                    images.append(Image.open(io.BytesIO(img_data)))

            return summary, images

        refresh_locations_button.click(
            refresh_locations_ui,
            inputs=[],
            outputs=[locations_json_output, locations_gallery_output],
        )

        add_location_button.click(
            add_update_location,
            inputs=[location_name_input, location_parent_input, location_image_input],
            outputs=[location_manage_output],
        )
        remove_location_button.click(
            remove_location,
            inputs=[location_name_input],
            outputs=[location_manage_output],
        )

demo.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True, root_path="/", show_api=False)