russ4stall committed · 9197931
Parent(s): 346f25c
app

Files changed:
- app.py +528 -0
- requirements-no-version.txt +1 -1
app.py
ADDED
@@ -0,0 +1,528 @@
import gradio as gr
import torch
from PIL import Image
import numpy as np
import uuid
import cv2
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from core.processing import get_dino_boxes_from_prompt, embed_image_dino_large, embed_text, expand_coords_shape
from core.models import get_sam_predictor
from core.image_processing import crop_to_mask_size, apply_mask, resize_image
from core.storage import upload_image_to_s3, add_vector_to_qdrant, add_object_to_neo4j
from core.storage import query_vector_db_by_mask, get_object_details, query_vector_db_by_text_embedding
from core.storage import get_all_locations_for_house, set_object_primary_location_hierarchy

# HOUSE_ID = 'c8c5fdea-7138-44ea-9f02-7fdcd47ff8cf'  # office
HOUSE_ID = 'fc2e081a-2b17-4b2e-a1bb-woodward'  # woodward


# ------------------------------
# Helper functions
# ------------------------------

def extract_image_and_stroke_mask(editor_output):
    """
    Extracts the image and stroke mask from the editor output.

    Parameters:
        editor_output: either a dict with 'background' and 'layers' or an HxWx3/4 array

    Returns:
        A tuple (image, stroke_mask) where:
        - image is the RGB image (HxWx3 array)
        - stroke_mask is a binary mask (HxW array)
    """
    if isinstance(editor_output, dict):
        bg = editor_output.get('background')
        if bg is None:
            return None, None
        image = bg[..., :3]
        stroke_mask = np.zeros(image.shape[:2], dtype=np.uint8)
        for layer in editor_output.get('layers', []):
            stroke_mask |= (layer[..., 3] > 0).astype(np.uint8)
    else:
        arr = editor_output
        if arr.shape[2] == 4:
            image = arr[..., :3]
            stroke_mask = (arr[..., 3] > 0).astype(np.uint8)
        else:
            image = arr
            stroke_mask = np.zeros(arr.shape[:2], dtype=np.uint8)
    return image, stroke_mask

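# Note (a sketch of the assumed payload, not part of the storage API): with
# gr.ImageEditor(type="numpy") the editor output is typically a dict like
#   {"background": HxWx3 or HxWx4 uint8 array,
#    "layers":     [HxWx4 RGBA uint8 arrays, one per sketch layer],
#    "composite":  HxWx3 or HxWx4 uint8 array}
# so every pixel whose layer alpha is > 0 ends up in the stroke mask.
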
def apply_sam(editor_output, background_mode="remove", crop_result=True) -> np.ndarray:
    """
    Uses SAM to generate a segmentation mask based on the sketch (stroke_mask),
    then either removes or extremely blurs the background. Optionally crops to
    the foreground bbox.

    Parameters:
        editor_output: either a dict with 'background' and 'layers' or an HxWx3/4 array
        background_mode: "remove" or "extreme_blur"
        crop_result: whether to crop the output to the foreground bbox

    Returns:
        HxWx3 uint8 array
    """
    # --- 1) pull RGB + sketch mask ---
    image, stroke_mask = extract_image_and_stroke_mask(editor_output)

    # if there is no sketch, just return the original
    if stroke_mask.sum() == 0:
        return image

    # preprocess & set image
    image = resize_image(image)
    get_sam_predictor().set_image(image)

    # resize the stroke mask to match the resized image
    h, w = image.shape[:2]
    stroke_small = cv2.resize(stroke_mask, (w, h), interpolation=cv2.INTER_NEAREST)
    point_coords, point_labels = stroke_to_coords(stroke_small)

    # now actually predict using the sampled stroke points (assumes a CUDA device)
    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
        masks, scores, logits = get_sam_predictor().predict(
            point_coords=point_coords,
            point_labels=point_labels,
            box=None,
            multimask_output=False
        )

    # pick the highest-score mask and binarize
    best_idx = int(np.argmax(scores))
    mask = masks[best_idx] > 0.5

    # composite
    output = apply_mask(image, mask, background_mode)

    # optional crop
    if crop_result:
        output = crop_to_mask_size(output, mask)

    return output

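# Usage sketch (illustrative values only):
#   cutout = apply_sam(editor_output, background_mode="extreme_blur", crop_result=False)
# returns the full-size frame with the sketched object kept sharp and the
# background heavily blurred instead of removed.
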
def apply_grounded_sam(editor_output, prompt: str, background_mode="remove", crop_result=True) -> np.ndarray:
    # 1) pull RGB (and any sketch strokes) out of the editor payload
    image, stroke_mask = extract_image_and_stroke_mask(editor_output)

    # 2) let GroundingDINO propose boxes for the text prompt
    sam_boxes = get_dino_boxes_from_prompt(image, prompt)

    point_coords = None
    point_labels = None

    # 3) if the user also sketched, turn the strokes into point prompts,
    #    replicated to match the number of proposed boxes
    if stroke_mask.sum() > 0:
        point_coords, point_labels = stroke_to_coords(stroke_mask)
        point_coords, point_labels = expand_coords_shape(point_coords, point_labels, sam_boxes.shape[0])

    # 4) feed the boxes (and optional points) into SAM2
    get_sam_predictor().set_image(image)
    masks, scores_sam, _ = get_sam_predictor().predict(
        point_coords=point_coords,
        point_labels=point_labels,
        box=sam_boxes,
        multimask_output=False
    )

    # 5) pick the best SAM proposal (dropping any leading batch dim), composite & crop
    best = int(np.argmax(scores_sam))
    mask = masks[best] > 0.5  # masks[best] has shape (H, W)

    output = apply_mask(image, mask, background_mode)

    if crop_result:
        output = crop_to_mask_size(output, mask)

    return output

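# Usage sketch (illustrative):
#   cutout = apply_grounded_sam(editor_output, "red ceramic mug")
# GroundingDINO proposes boxes for the phrase, SAM2 refines them into a mask,
# and any sketched strokes are reused as extra point prompts per box.
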
def add_item(image, description, object_id, background_mode, click_points):
    """
    Processes the (already previewed/segmented) image for memorization:
    - Computes the DINOv2 image embedding and stores it in Qdrant.
    - Uploads the full-resolution image to S3.
    - Embeds the description, if given, and stores it as a text vector.
    - Registers the object in Neo4j.
    """

    # embed the image with DINOv2
    image_features = embed_image_dino_large(image)

    # generate IDs
    if not object_id or object_id.strip() == "":
        object_id = str(uuid.uuid4())
    view_id = str(uuid.uuid4())

    # upload the original full-res image to S3
    key = f"object_collection/{object_id}/{view_id}.png"
    image_url = upload_image_to_s3(image, key)

    store_image_in_qdrant(view_id, vector=image_features, object_id=object_id, house_id=HOUSE_ID, image_url=image_url)

    if not (description is None or description.strip() == ""):
        desc_features = embed_text(description)
        store_text_in_qdrant(vector=desc_features, object_id=object_id, house_id=HOUSE_ID, description=description)

    store_in_neo4j(object_id, HOUSE_ID, description, object_id)

    return f"Item added under object ID: {object_id}\nDescription: {description}"

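# Resulting storage layout per added view (as wired above):
#   S3:     object_collection/<object_id>/<view_id>.png   (full-res image)
#   Qdrant: one "dinov2_embedding" point per view, plus an optional
#           "clip_text_embedding" point when a description is given
#   Neo4j:  one object node keyed by object_id
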
def query_item(query_image, background_mode, click_points, k=5):
    """
    Queries the vector DB with the (already previewed/segmented) image and
    aggregates the per-view hits into per-object results:
    - Each object keeps its best view similarity.
    - A softmax over the aggregated similarities gives a normalized probability.
    """
    search_results = query_vector_db_by_mask(query_image, k)

    object_scores = {}
    object_views = {}
    for result in search_results:
        obj_id = result.payload.get("object_id")
        score = result.score
        if obj_id in object_scores:
            object_scores[obj_id] = max(object_scores[obj_id], score)
            object_views[obj_id].append(result.payload.get("description"))
        else:
            object_scores[obj_id] = score
            object_views[obj_id] = [result.payload.get("description")]
    all_scores = np.array(list(object_scores.values()))
    exp_scores = np.exp(all_scores)
    probabilities = exp_scores / np.sum(exp_scores) if np.sum(exp_scores) > 0 else np.zeros_like(exp_scores)
    results = []
    for i, (obj_id, score) in enumerate(object_scores.items()):
        results.append({
            "object_id": obj_id,
            "aggregated_similarity": float(score),
            "probability": float(probabilities[i]),
            "descriptions": object_views[obj_id]
        })
    return results

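# Worked example of the softmax step above: with best-view similarities
#   scores = [0.9, 0.7]  ->  exp = [2.4596, 2.0138]
#   probabilities = exp / exp.sum() = [0.5498, 0.4502]
# so close similarity scores yield correspondingly close probabilities.
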
def query_by_text(description, k=5):
    """
    Embeds the provided text and queries the vector DB.
    Returns the top k matches in the usual object result format.
    """
    if not description.strip():
        return {"error": "Description cannot be empty."}

    query_features = embed_text(description)

    search_results = query_vector_db_by_text_embedding(query_features, k)

    object_scores = {}
    object_views = {}
    for result in search_results:
        obj_id = result.payload.get("object_id")
        score = result.score
        if obj_id in object_scores:
            object_scores[obj_id] = max(object_scores[obj_id], score)
            object_views[obj_id].append(result.payload.get("description"))
        else:
            object_scores[obj_id] = score
            object_views[obj_id] = [result.payload.get("description")]
    all_scores = np.array(list(object_scores.values()))
    exp_scores = np.exp(all_scores)
    probabilities = exp_scores / np.sum(exp_scores) if np.sum(exp_scores) > 0 else np.zeros_like(exp_scores)
    results = []
    for i, (obj_id, score) in enumerate(object_scores.items()):
        results.append({
            "object_id": obj_id,
            "aggregated_similarity": float(score),
            "probability": float(probabilities[i]),
            "descriptions": object_views[obj_id]
        })
    return results


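# Note: query_item and query_by_text share this aggregation verbatim; a
# possible refactor (sketch only, names hypothetical) would hoist it into
# one helper:
#
#   def aggregate_search_results(search_results):
#       """Collapse per-view hits into per-object scores + softmax probs."""
#       ...  # same object_scores / object_views / softmax logic as above
#
# with both query paths calling it on their respective search_results.
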
def store_image_in_qdrant(view_id, vector: np.ndarray, object_id, house_id, image_url: str):
    if object_id is None:
        object_id = str(uuid.uuid4())

    payload = {"object_id": object_id, "image_url": image_url, "house_id": house_id, "type": "image", "embedding_model": "dino_large"}
    view_id = add_vector_to_qdrant(view_id=view_id,
                                   vectors={"dinov2_embedding": vector},
                                   payload=payload)

    return view_id

def store_text_in_qdrant(vector: np.ndarray, house_id: str, object_id: str = None, description: str = None):
    if object_id is None:
        object_id = str(uuid.uuid4())

    # Add to Qdrant as "text_embedding"
    view_id = add_vector_to_qdrant(
        vectors={"clip_text_embedding": vector},
        payload={"object_id": object_id, "house_id": house_id, "description": description, "type": "text", "embedding_model": "clip"}
    )

    return view_id

def store_in_neo4j(object_id, house_id, description, qdrant_object_id):
    add_object_to_neo4j(object_id, house_id, description, qdrant_object_id)

def stroke_to_coords(stroke_mask, max_points=10):
    """
    Converts a stroke mask into sampled point coordinates and labels.

    Parameters:
        stroke_mask: Binary mask (HxW array) representing the stroke.
        max_points: Maximum number of points to sample.

    Returns:
        A tuple (point_coords, point_labels) where:
        - point_coords is an Nx2 array of sampled [x, y] coordinates.
        - point_labels is an N array of labels (1 for foreground).
    """
    ys, xs = np.nonzero(stroke_mask)
    coords = np.stack([xs, ys], axis=1)

    # Sample up to max_points
    N = min(max_points, len(coords))
    if N == 0:
        raise ValueError("No stroke pixels found")
    idxs = np.linspace(0, len(coords) - 1, num=N, dtype=int)
    point_coords = coords[idxs]
    point_labels = np.ones(N, dtype=int)

    return point_coords, point_labels


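# Worked example for the sampling above: with 100 stroke pixels and
# max_points=10, np.linspace(0, 99, num=10, dtype=int) yields
# [0, 11, 22, 33, 44, 55, 66, 77, 88, 99], i.e. points spread evenly
# along the stroke rather than just its first ten pixels.
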
def get_locations_overview():
    """
    Fetches all existing locations and their details.
    """
    locations = get_all_locations_for_house(HOUSE_ID, include_images=True)
    # Example response structure expected from `get_all_locations_for_house`:
    # [{"name": "Kitchen", "image": <np.ndarray>, "parents": ["Home"]}, ...]

    overview = []
    for loc in locations:
        overview.append({
            "name": loc["name"],
            "parents": loc.get("parents", []),
            "image": loc.get("image")  # Expected to be np.ndarray or PIL.Image
        })
    return overview

# Remove location (stub: the storage call is not wired up yet)
def remove_location(name):
    # from core.storage import remove_location
    # remove_location(house_id=HOUSE_ID, name=name)
    return f"Location '{name}' removed."

# Add or update a location (stub: the storage call is not wired up yet)
def add_update_location(name, parent_str, image):
    parents = [p.strip() for p in parent_str.split(",")] if parent_str else []
    # Example function you'd define in core.storage:
    # from core.storage import add_or_update_location
    # add_or_update_location(house_id=HOUSE_ID, name=name, parents=parents, image=image)
    return f"Location '{name}' added or updated with parents {parents}."


# ------------------------------
# Gradio Interface
# ------------------------------

with gr.Blocks() as demo:
    with gr.Tab("Add Item"):
        image_input = gr.ImageEditor(label="Upload & Sketch", type="numpy")
        seg_prompt_input = gr.Textbox(label="Segmentation Prompt", placeholder="e.g. 'red apple'")
        description_input = gr.Textbox(label="Description", lines=3)
        object_id_input = gr.Textbox(label="Object ID (optional)")
        background_mode = gr.Radio(choices=["remove", "extreme_blur"], value="remove")
        preview_button = gr.Button("Preview")
        preview_output = gr.Image(label="Preview Processed Image", type="numpy")
        submit_button = gr.Button("Submit")
        output_text = gr.Textbox(label="Result")

        # Preview: use text-grounded segmentation when a prompt is given,
        # otherwise fall back to sketch-driven SAM
        preview_button.click(
            fn=lambda img, mode, prompt: (
                apply_grounded_sam(img, prompt, background_mode=mode)
                if prompt else
                apply_sam(img, mode)
            ),
            inputs=[image_input, background_mode, seg_prompt_input],
            outputs=[preview_output]
        )
        submit_button.click(fn=add_item,
                            inputs=[preview_output, description_input, object_id_input, background_mode, image_input],
                            outputs=[output_text])

    with gr.Tab("Query By Text"):
        text_query_input = gr.Textbox(label="Describe Object", lines=3, placeholder="e.g., 'red ceramic mug'")
        k_text_slider = gr.Slider(1, 10, 5, label="Results k")
        text_query_button = gr.Button("Search by Text")
        text_query_output = gr.JSON(label="Query Results")

        text_query_button.click(query_by_text,
                                inputs=[text_query_input, k_text_slider],
                                outputs=[text_query_output])

    with gr.Tab("Query By Image"):
        query_input = gr.ImageEditor(label="Query & Sketch", type="numpy")
        query_prompt = gr.Textbox(label="Segmentation Prompt", placeholder="optional text-based mask")
        query_mode = gr.Radio(choices=["remove", "extreme_blur"], value="remove")
        query_preview_button = gr.Button("Refresh Preview")
        query_preview = gr.Image(label="Query Preview", type="numpy")
        k_slider = gr.Slider(1, 10, 1, label="Results k")
        query_button = gr.Button("Search")
        query_output = gr.JSON(label="Query Results")

        # Manual preview refresh
        query_preview_button.click(fn=lambda img, mode, prompt: (
                                       apply_grounded_sam(img, prompt, background_mode=mode)
                                       if prompt else
                                       apply_sam(img, mode)
                                   ),
                                   inputs=[query_input, query_mode, query_prompt],
                                   outputs=[query_preview])

        query_button.click(fn=query_item,
                           inputs=[query_preview, query_mode, query_input, k_slider],
                           outputs=[query_output])

    with gr.Tab("View Object"):
        view_object_id_input = gr.Textbox(label="Object ID", placeholder="Enter Object ID")
        view_button = gr.Button("View Object")

        # Navigation buttons (handlers not wired up yet; see the commented-out
        # click bindings below)
        add_image_button = gr.Button("Add Image to This Object")
        add_description_button = gr.Button("Add Text Description")
        add_location_button = gr.Button("Add Location")

        view_description_output = gr.Textbox(label="Description")
        view_images_output = gr.Gallery(label="Images", columns=3, height="auto")
        view_texts_output = gr.JSON(label="Text Descriptions")
        view_locations_output = gr.JSON(label="Location Chain")
        view_location_images_output = gr.Gallery(label="Location Images", columns=3, height="auto")

        view_owners_output = gr.JSON(label="Owners")

        desc_object_id_input = 0  # placeholder; rebound to a Textbox in the Add Description tab

        def view_object(object_id):
            data = get_object_details(HOUSE_ID, object_id)
            images_display = [Image.fromarray(img_dict["image"]) for img_dict in data["images"]]
            location_images_display = [Image.fromarray(img) for img in data.get("location_images", [])]
            return (
                data["description"] or "No description found.",
                images_display,
                data["texts"],
                data["locations"],
                location_images_display,
                data["owners"]
            )

        view_button.click(
            view_object,
            inputs=[view_object_id_input],
            outputs=[
                view_description_output,
                view_images_output,
                view_texts_output,
                view_locations_output,
                view_location_images_output,
                view_owners_output
            ]
        )

        # Reference the Add Item tab's object_id_input
        # add_image_button.click(
        #     lambda object_id: gr.update(value=object_id),
        #     inputs=[view_object_id_input],
        #     outputs=[object_id_input]
        # )

        # Navigation from View Object
        # add_description_button.click(
        #     lambda object_id: gr.update(value=object_id),
        #     inputs=[view_object_id_input],
        #     outputs=[desc_object_id_input]
        # )

    with gr.Tab("Add Description"):
        desc_object_id_input = gr.Textbox(label="Object ID")
        desc_text_input = gr.Textbox(label="Description", lines=3)
        submit_desc_button = gr.Button("Submit Description")
        desc_output = gr.Textbox(label="Result")

        def submit_description(object_id, description):
            desc_features = embed_text(description)
            store_text_in_qdrant(vector=desc_features, object_id=object_id, house_id=HOUSE_ID, description=description)
            return f"Added description to object {object_id}"

        submit_desc_button.click(submit_description,
                                 inputs=[desc_object_id_input, desc_text_input],
                                 outputs=[desc_output])

    with gr.Tab("Manage Locations"):
        with gr.Row():
            refresh_locations_button = gr.Button("Refresh Locations List")
        locations_json_output = gr.JSON(label="Locations Overview (Names and Parents)")
        locations_gallery_output = gr.Gallery(label="Location Images", columns=3, height="auto")

        # Controls to add/remove locations
        location_name_input = gr.Textbox(label="Location Name")
        location_parent_input = gr.Textbox(label="Parent Location(s)", placeholder="Comma-separated, e.g. 'Home, Kitchen'")
        location_image_input = gr.Image(label="Upload Location Image", type="numpy")

        add_location_button = gr.Button("Add / Update Location")
        remove_location_button = gr.Button("Remove Location")

        location_manage_output = gr.Textbox(label="Result")

        # Backend processor returning both the JSON summary and the gallery
        def refresh_locations_ui():
            import io, base64

            raw_locations = get_all_locations_for_house(HOUSE_ID, include_images=True)

            # Prepare JSON summary
            summary = [
                {"name": loc["name"], "parents": loc.get("parents", [])}
                for loc in raw_locations
            ]

            # Decode base64-encoded location images for the gallery
            images = []
            for loc in raw_locations:
                img_base64 = loc.get("image_base64")
                if img_base64:
                    img_data = base64.b64decode(img_base64)
                    img_pil = Image.open(io.BytesIO(img_data))
                    images.append(img_pil)

            return summary, images

        refresh_locations_button.click(
            refresh_locations_ui,
            inputs=[],
            outputs=[locations_json_output, locations_gallery_output]
        )

        # Add/update and remove handlers
        add_location_button.click(
            add_update_location,
            inputs=[location_name_input, location_parent_input, location_image_input],
            outputs=[location_manage_output]
        )

        remove_location_button.click(
            remove_location,
            inputs=[location_name_input],
            outputs=[location_manage_output]
        )


# Analytics off; bind to all interfaces on the Spaces default port (7860)
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True, root_path="/", show_api=False)

requirements-no-version.txt
CHANGED
@@ -1,6 +1,6 @@
  backports.tarfile
  boto3
- clip
+ clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  dotenv
  gradio==5.29.1
  groundingdino-py
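A note on the pinned dependency: the bare name "clip" on PyPI resolves to an unrelated package, so pinning to the OpenAI repository at a fixed commit keeps the install reproducible. A quick sanity check (a sketch; run after installing the requirements) that the intended package resolved:

    import clip  # should be OpenAI CLIP, not the unrelated PyPI "clip" package
    print(clip.available_models())  # e.g. ['RN50', ..., 'ViT-B/32', 'ViT-L/14']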