| from functools import lru_cache |
| import time |
|
|
| import gradio as gr |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
|
|
|
|
# Human-readable dropdown label -> Hugging Face model repo id.
# The labels are used directly as UI choices and as cache keys in
# load_generator, so they must stay unique.
MODEL_OPTIONS = {
    "SmolLM2 360M Instruct (best default)": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2 135M Instruct (fast)": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "distilgpt2 (baseline)": "distilgpt2",
}


# Label preselected in every model dropdown.
DEFAULT_MODEL = "SmolLM2 360M Instruct (best default)"
# Labels whose models receive the long instruction-style prompt in
# build_prompt; any other label (e.g. distilgpt2) gets a short
# completion-style prompt instead.
INSTRUCT_MODEL_LABELS = {
    "SmolLM2 360M Instruct (best default)",
    "SmolLM2 135M Instruct (fast)",
}
|
|
# Viewpoint name -> guidance sentence injected into the instruct prompt.
# The keys populate the "Viewpoint" dropdown and match the entries of
# FIVE_VIEWPOINTS used by the batch test tab.
VIEWPOINT_GUIDES = {
    "close-up": (
        "Focus on nearby detail, texture, facial expression, small objects, and "
        "what is cropped out or hidden by the tight framing."
    ),
    "wide shot": (
        "Focus on layout, background, scale, distance between objects, and how "
        "the whole scene is arranged."
    ),
    "bird's-eye view": (
        "Describe the scene from above. Focus on map-like layout, paths, shapes, "
        "and what becomes visible only from overhead."
    ),
    "low angle": (
        "Describe the scene from below. Focus on height, scale, foreground, "
        "dominance, sky or ceiling, and what is hidden behind tall objects."
    ),
    "over-the-shoulder": (
        "Describe what is visible from behind one character or object. Focus on "
        "foreground shoulder/frame, partial visibility, and what the viewer can "
        "infer but not fully see."
    ),
}
|
|
# Output-mode label -> writing-style guidance injected into the instruct
# prompt. The keys populate the "Output mode" dropdowns in both tabs.
MODE_GUIDES = {
    "cinematic shot description": (
        "Write like a film shot description, emphasizing framing, movement, and "
        "what the viewer sees first."
    ),
    "photography caption": (
        "Write like a precise photography caption, emphasizing composition and "
        "visible details."
    ),
    "storyboard note": (
        "Write like a storyboard note for an artist, naming visual beats and "
        "spatial relationships."
    ),
    "image prompt helper": (
        "Write a detailed image-generation prompt that makes the viewpoint and "
        "composition explicit."
    ),
    "visual analysis paragraph": (
        "Write an analytical paragraph explaining how the viewpoint changes "
        "what is visible and what is hidden."
    ),
}
|
|
# Viewpoints exercised by the "Five-Viewpoint Test" tab, in display order.
# Derived from VIEWPOINT_GUIDES (dict insertion order is guaranteed since
# Python 3.7) instead of repeating the five strings, so the batch list can
# never drift out of sync with the guide table.
FIVE_VIEWPOINTS = list(VIEWPOINT_GUIDES)
|
|
|
|
# Cap PyTorch's intra-op thread count so generation stays well-behaved on
# small shared CPU hosts. Best-effort: any failure is deliberately ignored
# because the app still works with torch's default threading.
try:
    torch.set_num_threads(2)
except Exception:
    pass
|
|
|
|
@lru_cache(maxsize=3)
def load_generator(model_label):
    """Build and memoize a CPU text-generation pipeline for a model label.

    The label is looked up in MODEL_OPTIONS to get the Hugging Face repo id.
    At most three pipelines are kept alive at once (one per entry in
    MODEL_OPTIONS), thanks to the lru_cache bound.
    """
    repo_id = MODEL_OPTIONS[model_label]
    tok = AutoTokenizer.from_pretrained(repo_id)
    if tok.pad_token_id is None:
        # Decoder-only checkpoints often ship without a pad token; reuse EOS.
        tok.pad_token = tok.eos_token
    lm = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.float32)
    lm.eval()
    # device=-1 pins the pipeline to CPU.
    return pipeline("text-generation", model=lm, tokenizer=tok, device=-1)
|
|
|
|
def build_prompt(model_label, scene, viewpoint, output_mode):
    """Compose the text prompt sent to the selected model.

    Instruct-tuned models (those in INSTRUCT_MODEL_LABELS) receive a detailed
    briefing that includes viewpoint and output-mode guidance; any other model
    receives a terse completion-style stub.
    """
    scene = scene.strip()
    # Look both guides up before branching: an unknown viewpoint or mode
    # raises KeyError regardless of which prompt shape is used.
    guide_for_viewpoint = VIEWPOINT_GUIDES[viewpoint]
    guide_for_mode = MODE_GUIDES[output_mode]

    if model_label not in INSTRUCT_MODEL_LABELS:
        # Plain completion models do better with a short cue than with
        # instructions they were never trained to follow.
        return f"{viewpoint.title()} {output_mode}.\nScene: {scene}\nDescription:"

    sections = [
        "You are a careful visual scene description assistant for a student "
        "research project.",
        "Describe the same scene from a selected viewpoint. The important question "
        "is not just camera vocabulary; explain what becomes visible, hidden, "
        "larger, smaller, foregrounded, or backgrounded because of the viewpoint.",
        "",
        f"Viewpoint: {viewpoint}",
        f"Viewpoint guidance: {guide_for_viewpoint}",
        f"Output mode: {output_mode}",
        f"Output guidance: {guide_for_mode}",
        f"Scene: {scene}",
        "",
        "Write the response now:",
    ]
    return "\n".join(sections)
|
|
|
|
def call_model(model_label, final_prompt, temperature, top_p, max_new_tokens):
    """Run one sampled generation and return the stripped completion text.

    Sliders arrive as floats/ints from Gradio; they are coerced here.
    Temperature is clamped to at least 0.05 so sampling stays valid.
    """
    gen = load_generator(model_label)
    eos = gen.tokenizer.eos_token_id
    outputs = gen(
        final_prompt,
        max_new_tokens=int(max_new_tokens),
        temperature=max(float(temperature), 0.05),
        top_p=float(top_p),
        do_sample=True,
        repetition_penalty=1.08,
        # Only the newly generated continuation, not the prompt echo.
        return_full_text=False,
        pad_token_id=eos,
        eos_token_id=eos,
    )
    text = outputs[0]["generated_text"].strip()
    if not text:
        return "(The model returned an empty response. Try more tokens.)"
    return text
|
|
|
|
def generate_viewpoint(
    model_label,
    scene,
    viewpoint,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Handle the single-viewpoint tab.

    Returns a 3-tuple matching the tab's output widgets:
    (generated text, prompt that was sent, run note). On error the first
    element carries the error message and the note carries a recovery hint.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", "", ""

    prompt = build_prompt(model_label, scene, viewpoint, output_mode)
    t0 = time.perf_counter()
    try:
        text = call_model(model_label, prompt, temperature, top_p, max_new_tokens)
    except Exception as exc:
        # Surface the failure in the UI instead of crashing the event handler.
        return (
            f"Error while running the model: {exc}",
            prompt,
            "Try the fast model first, or reduce max tokens.",
        )

    seconds = time.perf_counter() - t0
    note = (
        f"Model: {MODEL_OPTIONS[model_label]}\n"
        f"Elapsed: {seconds:.1f} seconds\n"
        "First use can be slower because the model has to download and load."
    )
    return text, prompt, note
|
|
|
|
def make_paper_notes(scene, outputs_text):
    """Return a fixed findings checklist headed by the scene description.

    A blank or missing scene falls back to the phrase "the tested scene".
    Note: outputs_text is accepted for interface compatibility but is not
    used anywhere in the generated checklist.
    """
    scene_line = (scene or "").strip() or "the tested scene"
    intro = f"Paper notes for: {scene_line}\n\n"
    checks = (
        "Use these checks while reading the outputs:\n\n"
        "1. Visibility: Which objects become visible or hidden in each viewpoint?\n"
        "2. Occlusion: Does the model notice when one object blocks another?\n"
        "3. Scale: Does low angle or close-up change perceived size or importance?\n"
        "4. Layout: Does bird's-eye or wide shot explain spatial relationships?\n"
        "5. Specificity: Does the model describe this scene, or could the paragraph "
        "fit almost any scene?\n"
        "6. Finding sentence: Write one cautious sentence about whether the model "
        "understands viewpoint consequences or only uses camera-angle words.\n\n"
    )
    wording = (
        "Useful wording for the paper:\n"
        "In this small test, the model was strongest when ____. It was weakest "
        "when ____. The clearest limitation was ____."
    )
    return intro + checks + wording
|
|
|
|
def run_five_viewpoints(model_label, scene, output_mode, temperature, top_p, max_new_tokens):
    """Generate the shared scene from all five viewpoints in sequence.

    Returns a 2-tuple matching the tab's output widgets:
    (markdown with one "## Viewpoint" section per angle, paper notes with the
    elapsed time appended). On error the first element is the error message
    and the second a recovery hint.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", ""

    t0 = time.perf_counter()
    rendered = []
    try:
        for vp in FIVE_VIEWPOINTS:
            prompt = build_prompt(model_label, scene, vp, output_mode)
            text = call_model(model_label, prompt, temperature, top_p, max_new_tokens)
            rendered.append(f"## {vp.title()}\n\n{text}")
    except Exception as exc:
        # Abort the whole batch on the first failure and report it in the UI.
        return (
            f"Error while running the five-viewpoint test: {exc}",
            "Try the fast model first, or reduce max tokens.",
        )

    seconds = time.perf_counter() - t0
    combined = "\n\n---\n\n".join(rendered)
    notes = make_paper_notes(scene, combined) + f"\n\nElapsed: {seconds:.1f} seconds."
    return combined, notes
|
|
|
|
def notes_from_pasted_outputs(scene, pasted_outputs):
    """Build paper notes for outputs the user pasted by hand.

    Returns a reminder string when nothing (or only whitespace) was pasted;
    otherwise delegates to make_paper_notes.
    """
    if pasted_outputs and pasted_outputs.strip():
        return make_paper_notes(scene, pasted_outputs)
    return "Paste your generated outputs first."
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: three tabs sharing the helper functions defined above.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Camera Angle Model Lab", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Camera Angle Model Lab\n"
        "CPU-only viewpoint lab for testing how small language models describe "
        "the same scene from different visual perspectives. No API tokens or paid "
        "compute required. The first run may take a minute while the model loads."
    )

    # Tab 1: generate one viewpoint at a time and inspect the exact prompt.
    with gr.Tab("Single Viewpoint Writer"):
        with gr.Row():
            model_one = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            viewpoint_one = gr.Dropdown(
                choices=list(VIEWPOINT_GUIDES.keys()),
                value="close-up",
                label="Viewpoint",
            )
            mode_one = gr.Dropdown(
                choices=list(MODE_GUIDES.keys()),
                value="visual analysis paragraph",
                label="Output mode",
            )

        scene_one = gr.Textbox(
            label="Scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )

        # Sampling controls forwarded straight to call_model.
        with gr.Row():
            temperature_one = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
            top_p_one = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_one = gr.Slider(40, 170, value=100, step=10, label="Max new tokens")

        run_one = gr.Button("Generate", variant="primary")
        output_one = gr.Textbox(label="Generated output", lines=10)
        prompt_sent_one = gr.Textbox(label="Prompt sent to model", lines=8)
        note_one = gr.Textbox(label="Run note", lines=3)

        # Inputs/outputs must stay aligned with generate_viewpoint's
        # parameter order and 3-tuple return.
        run_one.click(
            fn=generate_viewpoint,
            inputs=[
                model_one,
                scene_one,
                viewpoint_one,
                mode_one,
                temperature_one,
                top_p_one,
                max_tokens_one,
            ],
            outputs=[output_one, prompt_sent_one, note_one],
        )

        # One click-to-fill example per viewpoint/mode combination.
        gr.Examples(
            examples=[
                ["A dog hides under a kitchen table while a child looks for it.", "close-up", "visual analysis paragraph"],
                ["A crowded city street after rain reflects neon signs in puddles.", "bird's-eye view", "cinematic shot description"],
                ["A soccer player prepares to take a penalty kick while the goalkeeper waits.", "low angle", "storyboard note"],
                ["A person stands at the edge of a forest path holding a lantern.", "over-the-shoulder", "image prompt helper"],
                ["A museum gallery contains one bright painting at the far end of the room.", "wide shot", "photography caption"],
            ],
            inputs=[scene_one, viewpoint_one, mode_one],
        )

    # Tab 2: run the same scene through all five viewpoints in one batch.
    with gr.Tab("Five-Viewpoint Test"):
        model_grid = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value=DEFAULT_MODEL,
            label="Model",
        )
        scene_grid = gr.Textbox(
            label="Shared scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        mode_grid = gr.Dropdown(
            choices=list(MODE_GUIDES.keys()),
            value="visual analysis paragraph",
            label="Output mode",
        )
        with gr.Row():
            temperature_grid = gr.Slider(0.1, 1.5, value=0.6, step=0.1, label="Temperature")
            top_p_grid = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_grid = gr.Slider(40, 140, value=80, step=10, label="Max new tokens")

        run_grid = gr.Button("Run Five Viewpoints", variant="primary")
        grid_output = gr.Markdown(label="Five-viewpoint output")
        grid_notes = gr.Textbox(label="Paper notes", lines=14)

        run_grid.click(
            fn=run_five_viewpoints,
            inputs=[
                model_grid,
                scene_grid,
                mode_grid,
                temperature_grid,
                top_p_grid,
                max_tokens_grid,
            ],
            outputs=[grid_output, grid_notes],
        )

    # Tab 3: turn manually pasted outputs into a findings checklist
    # (no model call involved).
    with gr.Tab("Paper Notes Helper"):
        scene_notes = gr.Textbox(
            label="Scene being tested",
            lines=3,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        pasted_outputs = gr.Textbox(
            label="Paste generated outputs here",
            lines=12,
            placeholder="Paste close-up, wide shot, bird's-eye, low angle, and over-the-shoulder outputs here.",
        )
        run_notes = gr.Button("Make Paper Notes", variant="primary")
        paper_notes = gr.Textbox(label="Checklist for findings section", lines=14)

        run_notes.click(
            fn=notes_from_pasted_outputs,
            inputs=[scene_notes, pasted_outputs],
            outputs=paper_notes,
        )

    gr.Markdown(
        "### Duplication note\n"
        "This Space uses only local CPU models. No tokens, API keys, or paid "
        "hardware are required. Students can duplicate it and edit the viewpoints, "
        "output modes, examples, or model list."
    )
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server only when run as a script (not when imported).
    demo.launch()
|
|