# Source: Hugging Face Space by fffiloni — commit 400e335 ("cosmetic param order change", verified)
import os
import re
import json
import gradio as gr
from huggingface_hub import InferenceClient
from daggr import GradioNode, FnNode, Graph
# Startup banner: confirms in the logs which pipeline variant this process is running.
print("BOOT: image → kosmos → zephyr (daggr, clean, commented)")
# =============================================================================
# DEFAULT SYSTEM PROMPT
# =============================================================================
# Default system prompt sent to the LLM. It describes the task (generate a
# chatbot persona from an image description supplied separately) and enforces
# a strict output format — "Title: / System prompt: / Example input:" — which
# parse_zephyr_output later relies on when splitting the reply into fields.
# =============================================================================
AGENT_MAKER_SYS = """
You are an AI whose job is to help users design a chatbot.
The chatbot’s personality must be inspired by the character and atmosphere
described by the user (based on an image description provided separately).
You must:
- Respond succinctly in a friendly tone
- Generate exactly three things:
1) A catchy chatbot title
2) A system prompt defining the chatbot’s personality and behavior
3) A very short example of what a *future user* might say to this chatbot
Important rules:
- The system prompt must NOT mention any image or visual input
- The example input must represent a realistic first message
that an end-user would send when chatting with this bot
- STOP immediately after the example input
- Follow EXACTLY this format (including labels and line breaks):
"Sure, I'd be happy to help you build a bot! I'm generating a title, a system prompt,
and an example user message. How do they sound?
Title: ...
System prompt: ...
Example input: ..."
""".strip()
# =============================================================================
# NODE 0 — IMAGE INPUT (PASSTHROUGH)
# =============================================================================
# Daggr requires explicit nodes, so this identity function exists solely to
# surface the image upload as its own visible node and make the data flow
# explicit in the graph (handy for debugging). No processing happens here.
# =============================================================================
def passthrough_image(image):
    """Identity node: log the uploaded image reference and forward it unchanged."""
    forwarded = image
    print("IMAGE INPUT:", forwarded)
    return forwarded
# Passthrough node: exposes the raw upload as an explicit node in the graph.
image_input_node = FnNode(
    fn=passthrough_image,
    inputs={
        # type="filepath": the image is handed downstream as a file path,
        # which is what the remote Kosmos-2 Space call expects.
        "image": gr.Image(
            label="Upload image",
            type="filepath",
        )
    },
    outputs={
        "image": gr.Image(label="Input image"),
    },
)
# =============================================================================
# KOSMOS POSTPROCESS — CLEAN CAPTION
# =============================================================================
# Kosmos-2 returns, positionally:
#   0) image (FileData dict)
#   1) highlighted_text (list of token dicts)
#   2) entities (string)
# Daggr passes all outputs positionally to postprocess; everything is
# collapsed into a single clean caption string, stripping the echoed
# "Describe this image in detail:" instruction prefix when present.
# =============================================================================
def kosmos_postprocess(image_out, highlighted_text, *rest):
    """Collapse Kosmos-2's highlighted-token list into one clean caption string.

    Parameters:
        image_out: first positional Space output (annotated image); unused.
        highlighted_text: list of token dicts; any other type yields "".
        *rest: remaining positional outputs (e.g. entities string); ignored.

    Returns:
        The reconstructed caption string, with the instruction prefix removed.
    """
    if not isinstance(highlighted_text, list):
        return ""
    # Fix: item.get("token", "") still returns None when the key exists with
    # a None value, which would crash "".join — coerce each token explicitly.
    caption = "".join(
        str(item.get("token") or "")
        for item in highlighted_text
        if isinstance(item, dict)
    ).strip()
    # Remove Kosmos "Detailed" prefix if present (case-insensitive).
    caption = re.sub(
        r"^\s*Describe\s+this\s+image\s+in\s+details?\s*:\s*",
        "",
        caption,
        flags=re.IGNORECASE,
    ).strip()
    print("CAPTION:", repr(caption))
    return caption
# =============================================================================
# KOSMOS NODE
# =============================================================================
# Calls the Kosmos-2 Gradio Space via its API. The node's raw outputs are
# reduced by kosmos_postprocess to ONE value: a clean caption string.
# =============================================================================
# Description mode selector; its value is sent as the Space's "text_input".
desc_type = gr.Radio(
    ["Brief", "Detailed"],
    label="Description Type",
    value="Detailed",
)
# Remote Space call: image path + description mode in, single caption out.
kosmos_caption = GradioNode(
    space_or_url="fffiloni/Kosmos-2-API",
    api_name="/generate_predictions",
    inputs={
        "image_input": image_input_node.image,
        "text_input": desc_type,
    },
    # Collapses the Space's positional outputs into one caption string.
    postprocess=kosmos_postprocess,
    outputs={
        "caption": gr.Textbox(label="Image caption", lines=3),
    },
)
# =============================================================================
# BUILD CONVERSATION PAYLOAD (STRING)
# =============================================================================
# Zephyr (via InferenceClient) expects a list[dict] of chat messages, but the
# daggr UI does not yet handle gr.JSON reliably. The conversation is therefore
# shipped as a JSON *string* and parsed back inside the LLM node.
# =============================================================================
def build_messages_json(caption: str, system_prompt: str) -> str:
    """Serialize the (system, user) chat turns as a single JSON string."""
    system_text = (system_prompt or "").strip()
    user_text = (caption or "").strip()
    payload = json.dumps(
        [
            {"role": "system", "content": system_text},
            {"role": "user", "content": user_text},
        ],
        ensure_ascii=False,
    )
    print("MESSAGES_JSON head:", repr(payload[:200]))
    return payload
# Editable system prompt shown in the UI; defaults to AGENT_MAKER_SYS.
system_prompt = gr.Textbox(
    label="System prompt",
    value=AGENT_MAKER_SYS,
    lines=14,
)
# Builds the JSON-string conversation payload from caption + system prompt.
messages_node = FnNode(
    fn=build_messages_json,
    inputs={
        "caption": kosmos_caption.caption,
        "system_prompt": system_prompt,
    },
    outputs={
        "messages_json": gr.Textbox(
            label="Conversation payload (string)",
            lines=6,
        )
    },
)
# =============================================================================
# ZEPHYR CHAT NODE (FnNode, NOT InferenceNode)
# =============================================================================
# Zephyr-7B on Hugging Face only supports the "conversational" task, while
# daggr.InferenceNode currently targets text-generation style APIs. We must
# therefore call huggingface_hub.InferenceClient ourselves, wrapped in a
# FnNode (a custom Python call).
# =============================================================================
def zephyr_chat(messages_json: str) -> str:
    """Send the serialized conversation to Zephyr-7B and return its reply text.

    Raises RuntimeError when the HF_TOKEN Space secret is not configured.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError("HF_TOKEN missing (Space secret).")
    conversation = json.loads(messages_json)
    completion = InferenceClient(api_key=token).chat.completions.create(
        model="HuggingFaceH4/zephyr-7b-beta:featherless-ai",
        messages=conversation,
    )
    reply = completion.choices[0].message.content or ""
    print("LLM OUT head:", repr(reply[:200]))
    return reply
# LLM node: feeds the JSON payload string into zephyr_chat, exposes raw text.
zephyr_node = FnNode(
    fn=zephyr_chat,
    inputs={
        "messages_json": messages_node.messages_json,
    },
    outputs={
        "raw": gr.Textbox(
            label="Zephyr raw output",
            lines=14,
        )
    },
)
# =============================================================================
# PARSE ZEPHYR OUTPUT INTO FIELDS
# =============================================================================
# Extracts Title / System prompt / Example input from the LLM output.
# Parsing is deliberately simple and robust: when the expected format is not
# respected at all, the raw text is surfaced in the system field as a fallback
# for debugging.
# =============================================================================
def parse_zephyr_output(text: str):
    """Split the LLM reply into (title, system_prompt, example_input) strings."""
    cleaned = (text or "").strip()

    def extract(label):
        # Capture everything after "<label>:" up to the next known label or EOF.
        pattern = rf"(?is)\b{label}\s*:\s*(.*?)(?=\n\s*(Title|System prompt|Example input)\s*:|\Z)"
        found = re.search(pattern, cleaned)
        if found is None:
            return ""
        return found.group(1).strip()

    title = extract("Title")
    system = extract("System prompt")
    example = extract("Example input")
    # If the model inlined "Example input:" inside the system-prompt field, cut it off.
    if system:
        system = re.split(r"(?is)\bExample input\s*:\s*", system, maxsplit=1)[0].strip()
    # Nothing matched at all — keep the raw text visible for debugging.
    if not (title or system or example):
        system = cleaned
    return title, system, example
# Parser node: splits Zephyr's raw reply into title / system prompt / example.
parsed = FnNode(
    fn=parse_zephyr_output,
    inputs={"text": zephyr_node.raw},
    outputs={
        "title": gr.Textbox(label="Parsed title", lines=2),
        "system": gr.Textbox(label="Parsed system prompt", lines=10),
        "example": gr.Textbox(label="Parsed example input", lines=3),
    },
)
# =============================================================================
# GRAPH
# =============================================================================
# Explicit, linear, debuggable DAG. Every intermediate value is visible in the
# UI; the graph is deliberately verbose for demonstration, debugging and
# teaching purposes around daggr.
# =============================================================================
graph = Graph(
    name="Image → System prompt idea (daggr)",
    nodes=[
        image_input_node,
        kosmos_caption,
        messages_node,
        zephyr_node,
        parsed,
    ],
    # NOTE(review): persist_key=False — presumably disables persisting node
    # state between sessions; confirm against the daggr Graph documentation.
    persist_key=False,
)
# Script entry point: serve the daggr graph UI when run directly.
if __name__ == "__main__":
    graph.launch()