Spaces:

HorizonRobotics
/

EmbodiedGen-Texture-Gen

Running on Zero

File size: 3,756 Bytes

1d3e2bd

# Project EmbodiedGen
#
# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.


from __future__ import annotations

import logging

from embodied_gen.utils.gpt_clients import GPTclient

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default prompt used by `resolve_instance_with_llm`. It is filled in with
# `str.format`, so it exposes exactly two placeholders:
#   {instance_list} - the scene instance names, one "- name" bullet per line
#   {user_spec}     - the user's free-form description (stripped)
# The prompt asks the model to answer on a single line with either the exact
# instance name or the literal word NONE; the parsing code in
# `resolve_instance_with_llm` relies on that reply format.
# NOTE: because of `str.format`, any literal brace added to this text must be
# doubled ("{{" / "}}").
DEFAULT_RESOLVE_PROMPT = """You are matching a user's description to exactly one object in a 3D scene.

Scene instance list (each is an identifier, may contain IDs like "kitchen_cabinet_9197760", "banana_001"):
{instance_list}

User description (what they want to refer to, e.g. "黄色水果", "the yellow fruit", "柜子", "oven"):
"{user_spec}"

Rules:
1. Pick the ONE instance from the list that best matches the user's description (semantic match: e.g. "黄色水果" -> banana, "柜子" -> cabinet).
2. If no instance matches, reply with exactly: NONE
3. Otherwise reply with the EXACT instance name from the list, nothing else (no quotes, no explanation).

Your reply (one line, exact instance name or NONE):"""


def resolve_instance_with_llm(
    gpt_client: GPTclient,
    instance_names: list[str],
    user_spec: str,
    prompt_template: str | None = None,
) -> str | None:
    """Map a user description to a single scene instance name via an LLM.

    E.g. the user says "yellow fruit" and the scene has "banana_001" ->
    returns "banana_001". Returns None when nothing matches or the LLM replies
    NONE; the caller should then tell the user the object does not exist and
    ask for re-entry.

    Only the first line of the LLM reply is considered. Before matching, an
    echoed list bullet ("- " / "* ") and surrounding quotes or backticks are
    removed. The cleaned reply is then matched against ``instance_names`` in
    this order: exact match, case-insensitive match, and finally a
    case-insensitive substring match that is accepted only when it is
    unambiguous (exactly one instance contains the reply).

    Args:
        gpt_client: Client object exposing ``query(text_prompt=...)`` that
            returns the reply text (or a falsy value when there is no reply).
        instance_names: Scene instance names to choose from.
        user_spec: User input, e.g. "yellow fruit", "柜子", "the table".
        prompt_template: Optional custom prompt with the ``{instance_list}``
            and ``{user_spec}`` placeholders; defaults to
            ``DEFAULT_RESOLVE_PROMPT``.

    Returns:
        The matched instance name (exactly one of ``instance_names``), or None
        if the input is empty/blank, the query fails, or no instance matches.

    Raises:
        KeyError, IndexError, ValueError: Propagated from ``str.format`` when
            a custom ``prompt_template`` contains unknown placeholders or
            unbalanced braces.
    """
    # Normalize first so a whitespace-only description is rejected here
    # instead of being sent to the LLM as an empty query.
    description = (user_spec or "").strip()
    if not description or not instance_names:
        return None

    template = prompt_template or DEFAULT_RESOLVE_PROMPT
    prompt = template.format(
        instance_list="\n".join(f"- {name}" for name in instance_names),
        user_spec=description,
    )

    try:
        response = gpt_client.query(text_prompt=prompt)
    except Exception as e:
        # Deliberately best-effort: any client failure is reported as
        # "no match" so the caller can simply re-prompt the user.
        logger.warning("LLM `resolve_instance_with_llm` query failed: %s", e)
        return None

    if not response:
        return None

    # Only the first line counts; anything after it is treated as explanation.
    first_line = response.strip().partition("\n")[0].strip()

    # The prompt lists instances as "- name", and models often echo that
    # bullet or wrap the name in quotes/backticks; peel those off.
    candidate = first_line
    if candidate.startswith(("- ", "* ")):
        candidate = candidate[2:]
    candidate = candidate.strip().strip("\"'`").strip()

    # Detect NONE *after* cleaning so replies such as '"NONE"' or 'NONE.' are
    # not mistaken for a name and fed into the substring fallback below.
    if not candidate or candidate.rstrip(".").upper() == "NONE":
        return None

    if candidate in instance_names:
        return candidate

    wanted = candidate.lower()
    by_lower = {name.lower(): name for name in instance_names}
    if wanted in by_lower:
        return by_lower[wanted]

    # Last resort: accept a partial name only when it identifies exactly one
    # instance, so an ambiguous reply never picks an arbitrary object.
    partial = [name for name in instance_names if wanted in name.lower()]
    if len(partial) == 1:
        return partial[0]

    logger.debug(
        "resolve_instance_with_llm: LLM reply %r did not match any of %s",
        first_line,
        instance_names[:5],
    )
    return None