# NOTE: the following four lines are Hugging Face Hub page furniture captured
# by a scrape (avatar caption, author, commit message, commit hash). They are
# kept as comments so the module parses; they are not part of the application.
# ysharma's picture
# ysharma HF Staff
# Update app.py
# f8f0519 verified
"""
Agentic Coding : 3D Camera View Generator
- Qwen Image Edit + Lightning LoRA + Multi-Angle LoRA
- gr.HTML custom component (Gradio 6)
- ZeroGPU (HuggingFace Spaces)
"""
import gradio as gr
import numpy as np
import random
import torch
import base64
import spaces
from io import BytesIO
from PIL import Image
from diffusers import QwenImageEditPlusPipeline
# Largest user-facing seed value (fits a signed 32-bit int).
MAX_SEED = np.iinfo(np.int32).max
# bfloat16 weights to keep GPU memory usage down; CPU fallback for local runs.
dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"
# ── Model Loading on ZEROGPU
# Loaded once at import time so every @spaces.GPU call reuses the same pipeline.
pipe = QwenImageEditPlusPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit-2511",
    torch_dtype=dtype,
).to(device)
# Lightning LoRA: distilled weights enabling the 4-step default in the UI.
pipe.load_lora_weights(
    "lightx2v/Qwen-Image-Edit-2511-Lightning",
    weight_name="Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors",
    adapter_name="lightning",
)
# Multi-angle LoRA: provides the "<sks> ..." camera-direction conditioning
# used by build_camera_prompt below.
pipe.load_lora_weights(
    "fal/Qwen-Image-Edit-2511-Multiple-Angles-LoRA",
    weight_name="qwen-image-edit-2511-multiple-angles-lora.safetensors",
    adapter_name="angles",
)
# Both adapters active at full strength for every generation.
pipe.set_adapters(["lightning", "angles"], adapter_weights=[1.0, 1.0])
# ── Camera parameter tables ────────────────────────────────────────────────────
# Discrete camera poses and their prompt phrases. Arbitrary UI values are
# snapped to the nearest key (see snap_to_nearest) before a prompt is built.
# Horizontal orbit angle in degrees, 45Β° steps around the subject.
AZIMUTH_MAP = {
    0: "front view",
    45: "front-right quarter view",
    90: "right side view",
    135: "back-right quarter view",
    180: "back view",
    225: "back-left quarter view",
    270: "left side view",
    315: "front-left quarter view",
}
# Vertical camera angle in degrees relative to eye level.
ELEVATION_MAP = {
    -30: "low-angle shot",
    0: "eye-level shot",
    30: "elevated shot",
    60: "high-angle shot",
}
# Relative subject distance: below 1.0 moves in (close-up), above moves out.
DISTANCE_MAP = {
    0.6: "close-up",
    1.0: "medium shot",
    1.8: "wide shot",
}
# Default viewer state β€” plain dict, no custom class needed.
# Keys mirror what HTML_TEMPLATE/JS_ON_LOAD read: img (data URL), az, el, dist.
DEFAULT_CAM_VALUE = {"img": "", "az": 0.0, "el": 0.0, "dist": 1.0}
def snap_to_nearest(value, steps):
return min(steps, key=lambda x: abs(x - value))
def build_camera_prompt(azimuth, elevation, distance):
    """Snap the raw camera values to known poses and compose the LoRA prompt.

    Returns a string of the form "<sks> <azimuth> <elevation> <distance>"
    using the phrase tables above.
    """
    nearest_az = snap_to_nearest(azimuth, list(AZIMUTH_MAP))
    nearest_el = snap_to_nearest(elevation, list(ELEVATION_MAP))
    nearest_dist = snap_to_nearest(distance, list(DISTANCE_MAP))
    phrases = (
        AZIMUTH_MAP[nearest_az],
        ELEVATION_MAP[nearest_el],
        DISTANCE_MAP[nearest_dist],
    )
    return "<sks> " + " ".join(phrases)
def pil_to_data_url(img: Image.Image) -> str:
    """Serialize *img* into a base64 data URL.

    WEBP sources are re-encoded as WEBP; everything else becomes PNG so the
    browser can always render the result.
    """
    buffer = BytesIO()
    source_format = getattr(img, "format", None) or ""
    if source_format.upper() == "WEBP":
        mime, save_format = "image/webp", "WEBP"
    else:
        mime, save_format = "image/png", "PNG"
    img.save(buffer, format=save_format)
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"data:{mime};base64,{encoded}"
# ── Inference ──────────────────────────────────────────────────────────────────
@spaces.GPU(duration=120)
def infer_camera_edit(
    image, azimuth, elevation, distance,
    seed, randomize_seed, guidance_scale,
    num_inference_steps, height, width,
):
    """Run one camera-view edit on the GPU worker.

    Returns:
        (edited PIL image, seed actually used, prompt actually used).
    """
    # Draw a fresh seed when requested so repeated clicks produce variations.
    actual_seed = random.randint(0, MAX_SEED) if randomize_seed else seed
    prompt = build_camera_prompt(azimuth, elevation, distance)
    rng = torch.Generator(device=device).manual_seed(actual_seed)
    output = pipe(
        image=image,
        prompt=prompt,
        height=height,
        width=width,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        generator=rng,
    )
    return output.images[0], actual_seed, prompt
# ── gr.HTML templates ──────────────────────────────────────────────────────────
# Using plain gr.HTML (no subclass) with a dict value.
#
# Gradio 6 passes the dict as `value` to the template; all keys (img, az, el,
# dist) are accessible as value.img, value.az, etc. in both ${} and {{}} syntax.
#
# Layout: the image (or an empty-state card) fills the wrapper; a hover-only
# HUD shows the az/el/dist readout plus a d-pad and zoom column. Each button's
# data-action attribute is dispatched by the delegated listener in JS_ON_LOAD.
# NOTE(review): the {{#if}}/{{else}}/${} template syntax is assumed to match
# the installed Gradio version's gr.HTML templating β€” confirm before upgrade.
HTML_TEMPLATE = """
<div class="cv-wrap">
{{#if value.img}}
<img class="cv-img" src="{{value.img}}">
{{else}}
<div class="cv-empty">
<svg class="cv-empty-icon" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="1.25">
<path stroke-linecap="round" stroke-linejoin="round" d="M6.827 6.175A2.31 2.31 0 015.186 7.23c-.38.054-.757.112-1.134.175C2.999 7.58 2.25 8.507 2.25 9.574V18a2.25 2.25 0 002.25 2.25h15A2.25 2.25 0 0021.75 18V9.574c0-1.067-.75-1.994-1.802-2.169a47.865 47.865 0 00-1.134-.175 2.31 2.31 0 01-1.64-1.055l-.822-1.316a2.192 2.192 0 00-1.736-1.039 48.774 48.774 0 00-5.232 0 2.192 2.192 0 00-1.736 1.039l-.821 1.316z" />
<path stroke-linecap="round" stroke-linejoin="round" d="M16.5 12.75a4.5 4.5 0 11-9 0 4.5 4.5 0 019 0zM18.75 10.5h.008v.008h-.008V10.5z" />
</svg>
<p class="cv-empty-title">No image loaded</p>
<p class="cv-empty-sub">Upload an image on the left, then hover here to see camera controls</p>
</div>
{{/if}}
<div class="cv-hud">
<div class="cv-readout">
<span class="cv-lbl">Az</span><span class="cv-val">${value.az}&deg;</span>
<span class="cv-sep">/</span>
<span class="cv-lbl">El</span><span class="cv-val">${value.el}&deg;</span>
<span class="cv-sep">/</span>
<span class="cv-lbl">Dist</span><span class="cv-val">${value.dist}&times;</span>
</div>
<div class="cv-controls">
<div class="cv-dpad">
<button class="cv-btn cv-up" data-action="el-plus" title="Elevate">&#9650;</button>
<button class="cv-btn cv-left" data-action="az-minus" title="Rotate Left">&#9664;</button>
<div class="cv-dot"></div>
<button class="cv-btn cv-right" data-action="az-plus" title="Rotate Right">&#9654;</button>
<button class="cv-btn cv-down" data-action="el-minus" title="Lower">&#9660;</button>
</div>
<div class="cv-zoom">
<button class="cv-zbtn" data-action="dist-minus" title="Zoom In">+</button>
<button class="cv-zbtn" data-action="dist-plus" title="Zoom Out">&minus;</button>
</div>
</div>
</div>
</div>
"""
# Stylesheet injected alongside HTML_TEMPLATE. Plain CSS, no templating:
# dark image well, hover-revealed HUD, white readout/controls cards, and the
# orange accent noted below to match the default Gradio theme.
CSS_TEMPLATE = """
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
/* ── Image well ── dark neutral so images pop, same treatment as any
professional image editor / camera app preview area. Not a stylistic
choice but a functional one: images render best against dark. */
.cv-wrap {
position: relative;
width: 100%; height: 500px;
background: #1c1c1e;
border-radius: 12px;
overflow: hidden;
display: flex; align-items: center; justify-content: center;
}
.cv-img {
max-width: 100%; max-height: 100%;
object-fit: contain; display: block;
}
/* empty state */
.cv-empty {
text-align: center; user-select: none;
display: flex; flex-direction: column; align-items: center; gap: 14px;
}
.cv-empty-icon {
width: 52px; height: 52px;
color: rgba(255,255,255,0.2);
}
.cv-empty-title {
font-size: 15px; font-weight: 500; letter-spacing: -0.01em;
color: rgba(255,255,255,0.45);
}
.cv-empty-sub {
font-size: 13px; max-width: 230px; line-height: 1.65;
color: rgba(255,255,255,0.25);
}
/* HUD β€” fades in on hover via CSS, no JS needed */
.cv-hud {
position: absolute; bottom: 16px; right: 16px;
display: flex; flex-direction: column; align-items: flex-end; gap: 8px;
opacity: 0; transition: opacity 0.16s ease; pointer-events: auto;
}
.cv-wrap:hover .cv-hud { opacity: 1; }
/* coordinate readout β€” white card floating over image */
.cv-readout {
display: flex; align-items: center; gap: 8px;
background: rgba(255,255,255,0.96);
border-radius: 7px; padding: 5px 13px;
font-size: 12px; white-space: nowrap;
box-shadow: 0 2px 12px rgba(0,0,0,0.25);
}
.cv-lbl { color: #9ca3af; font-size: 10px; text-transform: uppercase; letter-spacing: 0.04em; }
.cv-val { color: #111827; font-weight: 600; font-variant-numeric: tabular-nums; }
.cv-sep { color: #d1d5db; margin: 0 2px; }
/* controls panel β€” white card, same treatment as readout */
.cv-controls {
display: flex; align-items: center; gap: 8px;
background: rgba(255,255,255,0.96);
border-radius: 10px; padding: 8px 10px;
box-shadow: 0 2px 12px rgba(0,0,0,0.25);
}
/* d-pad */
.cv-dpad {
display: grid;
grid-template-columns: repeat(3, 32px);
grid-template-rows: repeat(3, 32px);
gap: 3px;
}
.cv-btn {
width: 32px; height: 32px;
border: 1px solid #e5e7eb; border-radius: 6px;
background: #ffffff; color: #6b7280;
font-size: 10px; cursor: pointer;
display: flex; align-items: center; justify-content: center;
transition: background 0.1s, border-color 0.1s, color 0.1s, transform 0.08s;
padding: 0; line-height: 1;
}
/* orange matches Gradio Default theme primary */
.cv-btn:hover {
background: #fff7ed; border-color: #f97316; color: #f97316;
transform: scale(1.1);
}
.cv-btn:active { transform: scale(0.92); background: #ffedd5; }
.cv-up { grid-column:2; grid-row:1; }
.cv-left { grid-column:1; grid-row:2; }
.cv-dot {
grid-column:2; grid-row:2;
width:32px; height:32px; border-radius:50%;
background: #f9fafb; border: 1px solid #e5e7eb;
}
.cv-right { grid-column:3; grid-row:2; }
.cv-down { grid-column:2; grid-row:3; }
/* zoom column */
.cv-zoom { display: flex; flex-direction: column; gap: 3px; }
.cv-zbtn {
width: 32px; height: 38px;
border: 1px solid #e5e7eb; border-radius: 6px;
background: #ffffff; color: #6b7280;
font-size: 16px; font-weight: 400; cursor: pointer;
display: flex; align-items: center; justify-content: center;
transition: background 0.1s, border-color 0.1s, color 0.1s, transform 0.08s;
padding: 0; line-height: 1;
}
.cv-zbtn:hover {
background: #fff7ed; border-color: #f97316; color: #f97316;
transform: scale(1.1);
}
.cv-zbtn:active { transform: scale(0.92); background: #ffedd5; }
"""
# Client-side controller for the camera HUD. Clicks on any [data-action]
# button update props.value (az in 45Β° steps with wrap-around, el clamped to
# [-30, 60], dist stepped along DIST_STEPS) and fire the component's 'submit'
# event, which is wired to on_camera_submit in create_app.
# NOTE(review): `element`, `props` and `trigger` are assumed to be provided by
# the gr.HTML js_on_load execution context β€” confirm against the installed
# Gradio version. DIST_STEPS must stay in sync with DISTANCE_MAP above.
JS_ON_LOAD = """
const DIST_STEPS = [0.6, 1.0, 1.8];
function snapDist(d) {
return DIST_STEPS.reduce((p, c) => Math.abs(c - d) < Math.abs(p - d) ? c : p);
}
function shiftDist(d, dir) {
const idx = DIST_STEPS.indexOf(snapDist(Number(d)));
return DIST_STEPS[Math.max(0, Math.min(DIST_STEPS.length - 1, idx + dir))];
}
// Delegated click listener β€” attached once, survives template re-renders.
element.addEventListener('click', function(e) {
const btn = e.target.closest('[data-action]');
if (!btn) return;
const v = Object.assign({}, props.value);
let az = Number(v.az) || 0;
let el = Number(v.el) || 0;
let dist = Number(v.dist) || 1.0;
switch (btn.dataset.action) {
case 'az-minus': az = (az - 45 + 360) % 360; break;
case 'az-plus': az = (az + 45) % 360; break;
case 'el-plus': el = Math.min(60, el + 30); break;
case 'el-minus': el = Math.max(-30, el - 30); break;
case 'dist-minus': dist = shiftDist(dist, -1); break;
case 'dist-plus': dist = shiftDist(dist, +1); break;
}
props.value = { ...v, az, el, dist };
trigger('submit');
});
"""
# ── Global Gradio CSS ──────────────────────────────────────────────────────────
# Page-level styling passed to demo.launch(css=...): header chips, no-wrap
# columns, and monospace treatment for the status/prompt textboxes targeted
# via the elem_classes set in create_app.
GLOBAL_CSS = """
/* ── Row: never let the two columns wrap ── */
/* Gradio 6 renders rows as flex containers with class "flex" */
.gradio-container .flex.flex-row,
.gradio-container .row {
flex-wrap: nowrap !important;
}
/* ── Header ── */
.app-heading { padding: 28px 0 20px; }
.app-heading h1 {
font-size: clamp(24px, 3.5vw, 36px);
font-weight: 700;
letter-spacing: -0.02em;
line-height: 1.1;
color: #111827;
margin: 0 0 10px;
}
.app-heading .chips {
display: flex; flex-wrap: wrap; gap: 6px;
}
.app-heading .chip {
display: inline-flex; align-items: center; gap: 5px;
padding: 3px 10px;
background: #fff7ed;
border: 1px solid #fed7aa;
border-radius: 999px;
font-size: 12px; font-weight: 500;
color: #c2410c;
line-height: 1.5;
}
.app-heading .chip svg {
width: 12px; height: 12px; opacity: 0.7;
}
/* ── Controls column β€” subtle card to separate it from viewer ── */
.controls-col > .block,
.controls-col > .form {
background: #fafafa !important;
}
/* ── Camera viewer column label ── */
.viewer-label {
font-size: 13px; font-weight: 600;
color: #374151;
margin-bottom: 8px;
display: flex; align-items: center; gap: 8px;
}
.viewer-label .hint {
font-weight: 400; color: #9ca3af; font-size: 12px;
}
/* ── Status display ── replaces the plain textbox look */
.status-row {
display: flex; align-items: center; gap: 8px;
padding: 8px 12px;
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 8px;
margin-top: 6px;
font-size: 12px;
font-family: ui-monospace, "Cascadia Code", "Source Code Pro", monospace;
color: #6b7280;
min-height: 38px;
}
/* status textbox β€” reduce visual weight */
.status-box textarea {
font-family: ui-monospace, "Cascadia Code", "Source Code Pro", monospace !important;
font-size: 12px !important;
color: #374151 !important;
background: #f9fafb !important;
border-color: #e5e7eb !important;
resize: none !important;
}
/* ── Prompt box ── */
.prompt-box textarea {
font-family: ui-monospace, "Cascadia Code", "Source Code Pro", monospace !important;
font-size: 12px !important;
color: #6b7280 !important;
}
"""
# Default theme object; applied at launch() together with GLOBAL_CSS.
GRADIO_THEME = gr.themes.Default()
# ── App ────────────────────────────────────────────────────────────────────────
def create_app():
    """Assemble the Gradio Blocks UI and wire all event handlers.

    Returns:
        gr.Blocks: the fully-wired (but not yet launched) application.
        Theme and CSS are intentionally applied in launch(), not here.
    """
    # FIX: theme and css are now passed to launch(), not gr.Blocks()
    with gr.Blocks(title="3D Camera View Generator") as demo:
        # Static page header: title plus chips naming the model and LoRAs.
        gr.HTML("""
<div class="app-heading">
<h1>3D Camera View Generator</h1>
<div class="chips">
<span class="chip">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor"><path d="M9.653 16.915l-.005-.003-.019-.01a20.759 20.759 0 01-1.162-.682 22.045 22.045 0 01-2.582-2.085c-1.034-1.036-2.035-2.329-2.535-3.765-.583-1.683-.322-3.498.985-4.82C5.576 4.29 7.319 3.75 9 3.75c.921 0 1.85.205 2.704.596L13 3.25l1.304 1.304L13 5.858a6.001 6.001 0 010 8.284l-.707.707-2.64-2.64z"/></svg>
Qwen Image Edit 2511
</span>
<span class="chip">⚑ Lightning LoRA</span>
<span class="chip">πŸ“ Multi-Angle LoRA</span>
</div>
</div>
""")
        with gr.Row():
            # ── Left column: source image + generation settings ──────────────
            with gr.Column(scale=4, min_width=200, elem_classes=["controls-col"]):
                image_input = gr.Image(
                    label="Source Image",
                    type="pil",
                    height=320,
                )
                # Read-only echo of the prompt built from the current pose;
                # default value matches DEFAULT_CAM_VALUE (az=0, el=0, dist=1).
                prompt_box = gr.Textbox(
                    label="Active Camera Prompt",
                    value="<sks> front view eye-level shot medium shot",
                    interactive=False,
                    lines=1,
                    elem_classes=["prompt-box"],
                )
                with gr.Accordion("βš™ Generation Settings", open=False):
                    seed_slider = gr.Slider(0, MAX_SEED, value=42, step=1, label="Seed")
                    rand_seed_cb = gr.Checkbox(True, label="Randomise seed each generation")
                    guidance_sl = gr.Slider(1.0, 20.0, value=1.0, step=0.1, label="Guidance Scale (keep ≀1 for Lightning LoRA)")
                    steps_sl = gr.Slider(1, 50, value=4, step=1, label="Inference Steps")
                    # Width/height are auto-filled from the upload via
                    # _auto_dimensions; step=32 matches its rounding.
                    width_sl = gr.Slider(256, 1024, value=1024, step=32, label="Width (px)")
                    height_sl = gr.Slider(256, 1024, value=1024, step=32, label="Height (px)")
            # ── Right column: interactive camera viewer + results ────────────
            with gr.Column(scale=6, min_width=280):
                gr.HTML("""
<div class="viewer-label">
Camera View
<span class="hint">β€” hover to reveal orbit controls</span>
</div>
""")
                # FIX: plain gr.HTML with dict value β€” no subclass, no inspect error
                # NOTE(review): html_template/css_template/js_on_load/
                # apply_default_css are assumed to be gr.HTML kwargs in the
                # pinned Gradio version β€” confirm before upgrading Gradio.
                cam_view = gr.HTML(
                    value=DEFAULT_CAM_VALUE,
                    html_template=HTML_TEMPLATE,
                    css_template=CSS_TEMPLATE,
                    js_on_load=JS_ON_LOAD,
                    apply_default_css=False,
                )
                status_box = gr.Textbox(
                    label="Status",
                    value="Ready β€” upload an image to begin",
                    interactive=False,
                    lines=1,
                    elem_classes=["status-box"],
                )
                # Server-side accumulator of every generated image this session.
                gallery_state = gr.State([])
                with gr.Accordion("πŸ–Ό Generated Views", open=False):
                    gallery = gr.Gallery(
                        label="",
                        show_label=False,
                        columns=4,
                        height="auto",
                        object_fit="cover",
                        allow_preview=True,
                    )
        # ── Helpers ──────────────────────────────────────────────────────────
        def _coerce_view(v):
            """Extract (az, el, dist) safely from a dict or default."""
            if isinstance(v, dict):
                return float(v.get("az", 0)), float(v.get("el", 0)), float(v.get("dist", 1.0))
            return 0.0, 0.0, 1.0
        def _auto_dimensions(img):
            """Derive (width, height) for generation from the source image.

            Fits the longer side to 1024, rounds the other to a multiple of
            32, and clamps both into the sliders' [256, 1024] range.
            """
            if img is None:
                return 1024, 1024
            w, h = img.size
            ar = w / h
            if ar > 1:
                nw = 1024
                nh = round(1024 / ar / 32) * 32
            else:
                nh = 1024
                nw = round(1024 * ar / 32) * 32
            return max(256, min(1024, nw)), max(256, min(1024, nh))
        # ── Event handlers ────────────────────────────────────────────────────
        def on_image_upload(img, current_view):
            """Load the upload into the viewer, preserving the camera pose.

            Returns (viewer value, width, height, status message) β€” the
            dimension sliders are auto-set from the image's aspect ratio.
            """
            nw, nh = _auto_dimensions(img)
            if img is None:
                return DEFAULT_CAM_VALUE.copy(), nw, nh, "No image"
            az, el, dist = _coerce_view(current_view)
            return (
                {"img": pil_to_data_url(img), "az": az, "el": el, "dist": dist},
                nw,
                nh,
                "Image loaded β€” hover the viewer and click an arrow to generate",
            )
        def on_camera_submit(
            current_view, src_img,
            seed_val, rand_seed, guidance, steps, h, w,
            gallery_imgs,
        ):
            """Generate one new view for the pose carried in *current_view*.

            Always edits the ORIGINAL upload (src_img), not the previous
            result, so repeated orbiting does not accumulate artifacts.
            Returns (viewer value, prompt, status, gallery state, gallery).
            """
            try:
                az, el, dist = _coerce_view(current_view)
                prompt = build_camera_prompt(az, el, dist)
                if src_img is None:
                    return current_view, prompt, "⚠ Upload an image first", gallery_imgs, gallery_imgs
                gen_img, final_seed, final_prompt = infer_camera_edit(
                    image=src_img,
                    azimuth=az, elevation=el, distance=dist,
                    seed=seed_val, randomize_seed=rand_seed,
                    guidance_scale=guidance,
                    num_inference_steps=int(steps),
                    height=int(h), width=int(w),
                )
                # Swap the generated frame into the viewer at the same pose.
                new_view = {"img": pil_to_data_url(gen_img), "az": az, "el": el, "dist": dist}
                gallery_imgs = list(gallery_imgs) + [gen_img]
                status = f"βœ“ {final_prompt} | seed {final_seed}"
                return new_view, final_prompt, status, gallery_imgs, gallery_imgs
            except Exception as exc:
                # Deliberately broad: surface any failure in the status box
                # instead of crashing the event queue.
                return current_view, "", f"βœ— {str(exc)}", gallery_imgs, gallery_imgs
        image_input.upload(
            fn=on_image_upload,
            inputs=[image_input, cam_view],
            outputs=[cam_view, width_sl, height_sl, status_box],
        )
        # 'submit' is fired from JS_ON_LOAD after every HUD button click.
        cam_view.submit(
            fn=on_camera_submit,
            inputs=[
                cam_view, image_input,
                seed_slider, rand_seed_cb, guidance_sl, steps_sl,
                height_sl, width_sl,
                gallery_state,
            ],
            outputs=[cam_view, prompt_box, status_box, gallery_state, gallery],
        )
    return demo
if __name__ == "__main__":
    demo = create_app()
    # FIX: theme and css passed to launch() as required by Gradio 6.0
    # NOTE(review): launch(theme=..., css=...) assumes the Gradio 6 API β€”
    # confirm against the version pinned in the Space's requirements.
    demo.launch(
        debug=True,
        theme=GRADIO_THEME,
        css=GLOBAL_CSS,
    )