# Source: Hugging Face Space "profplate/camera-angle-model-lab"
# Space status when captured: Sleeping
# File size: 13,115 bytes; commit 7bb5bc1

from functools import lru_cache
import time

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Dropdown label -> model id handed to ``from_pretrained`` in load_generator().
MODEL_OPTIONS = {
    "SmolLM2 360M Instruct (best default)": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2 135M Instruct (fast)": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "distilgpt2 (baseline)": "distilgpt2",
}

# Label preselected in both model dropdowns; must be a key of MODEL_OPTIONS.
DEFAULT_MODEL = "SmolLM2 360M Instruct (best default)"
# Labels that get the long instruction-style prompt in build_prompt(); every
# other label gets the short completion-style prompt.
INSTRUCT_MODEL_LABELS = {
    "SmolLM2 360M Instruct (best default)",
    "SmolLM2 135M Instruct (fast)",
}

# Viewpoint name -> guidance text. build_prompt() inserts the text into the
# instruct prompt as "Viewpoint guidance", and the keys populate the
# "Viewpoint" dropdown in the UI.
VIEWPOINT_GUIDES = {
    "close-up": (
        "Focus on nearby detail, texture, facial expression, small objects, and "
        "what is cropped out or hidden by the tight framing."
    ),
    "wide shot": (
        "Focus on layout, background, scale, distance between objects, and how "
        "the whole scene is arranged."
    ),
    "bird's-eye view": (
        "Describe the scene from above. Focus on map-like layout, paths, shapes, "
        "and what becomes visible only from overhead."
    ),
    "low angle": (
        "Describe the scene from below. Focus on height, scale, foreground, "
        "dominance, sky or ceiling, and what is hidden behind tall objects."
    ),
    "over-the-shoulder": (
        "Describe what is visible from behind one character or object. Focus on "
        "foreground shoulder/frame, partial visibility, and what the viewer can "
        "infer but not fully see."
    ),
}

# Output mode name -> guidance text. build_prompt() inserts the text into the
# instruct prompt as "Output guidance", and the keys populate the
# "Output mode" dropdowns in the UI.
MODE_GUIDES = {
    "cinematic shot description": (
        "Write like a film shot description, emphasizing framing, movement, and "
        "what the viewer sees first."
    ),
    "photography caption": (
        "Write like a precise photography caption, emphasizing composition and "
        "visible details."
    ),
    "storyboard note": (
        "Write like a storyboard note for an artist, naming visual beats and "
        "spatial relationships."
    ),
    "image prompt helper": (
        "Write a detailed image-generation prompt that makes the viewpoint and "
        "composition explicit."
    ),
    "visual analysis paragraph": (
        "Write an analytical paragraph explaining how the viewpoint changes "
        "what is visible and what is hidden."
    ),
}

# Viewpoints generated, in this order, by run_five_viewpoints(). Each entry
# must also be a key of VIEWPOINT_GUIDES because build_prompt() looks it up
# there.
FIVE_VIEWPOINTS = [
    "close-up",
    "wide shot",
    "bird's-eye view",
    "low angle",
    "over-the-shoulder",
]


# Best-effort CPU tuning: ask torch to use two threads. This step is optional,
# so a failure must never stop the app from starting -- but it is reported
# instead of being swallowed silently, so it shows up in the process output.
try:
    torch.set_num_threads(2)
except Exception as exc:  # deliberately broad: any failure here is non-fatal
    print(f"Warning: could not set the torch thread count: {exc}")


@lru_cache(maxsize=3)
def load_generator(model_label):
    """Build the text-generation pipeline for ``model_label``, cached per label.

    The cache holds up to three pipelines, which matches the three entries in
    ``MODEL_OPTIONS``, so each model is loaded at most once per process.

    Args:
        model_label: A key of ``MODEL_OPTIONS``.

    Returns:
        A ``transformers`` "text-generation" pipeline created with
        ``device=-1`` and float32 weights.

    Raises:
        KeyError: If ``model_label`` is not a key of ``MODEL_OPTIONS``.
    """
    repo_id = MODEL_OPTIONS[model_label]

    tok = AutoTokenizer.from_pretrained(repo_id)
    # Some tokenizers ship without a pad token; reuse EOS in that case.
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token

    language_model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        torch_dtype=torch.float32,
    )
    language_model.eval()

    return pipeline("text-generation", model=language_model, tokenizer=tok, device=-1)


def _title_case_label(label):
    """Capitalize each space- or hyphen-separated word of ``label``.

    ``str.title()`` treats an apostrophe as a word boundary, so
    ``"bird's-eye view".title()`` gives ``"Bird'S-Eye View"``. Splitting only
    on spaces and hyphens gives ``"Bird's-Eye View"`` and matches ``title()``
    for the other viewpoint names defined in this file.
    """
    return " ".join(
        "-".join(part.capitalize() for part in word.split("-"))
        for word in label.split(" ")
    )


def build_prompt(model_label, scene, viewpoint, output_mode):
    """Assemble the prompt text for one scene / viewpoint / output-mode choice.

    Labels in ``INSTRUCT_MODEL_LABELS`` get a long instruction prompt that
    embeds the viewpoint and output-mode guidance. Any other label gets a
    short completion-style prompt that ends with ``Description:``.

    Args:
        model_label: Model dropdown label; only used to pick the prompt style.
        scene: Scene description; surrounding whitespace is stripped.
        viewpoint: A key of ``VIEWPOINT_GUIDES``.
        output_mode: A key of ``MODE_GUIDES``.

    Returns:
        The prompt string.

    Raises:
        KeyError: If ``viewpoint`` or ``output_mode`` is not a known key.
    """
    scene = scene.strip()
    # Both lookups happen before branching, so an unknown viewpoint or mode
    # fails the same way for every model.
    viewpoint_guide = VIEWPOINT_GUIDES[viewpoint]
    mode_guide = MODE_GUIDES[output_mode]

    if model_label not in INSTRUCT_MODEL_LABELS:
        return (
            f"{_title_case_label(viewpoint)} {output_mode}.\n"
            f"Scene: {scene}\n"
            "Description:"
        )

    return (
        "You are a careful visual scene description assistant for a student "
        "research project.\n"
        "Describe the same scene from a selected viewpoint. The important question "
        "is not just camera vocabulary; explain what becomes visible, hidden, "
        "larger, smaller, foregrounded, or backgrounded because of the viewpoint.\n\n"
        f"Viewpoint: {viewpoint}\n"
        f"Viewpoint guidance: {viewpoint_guide}\n"
        f"Output mode: {output_mode}\n"
        f"Output guidance: {mode_guide}\n"
        f"Scene: {scene}\n\n"
        "Write the response now:"
    )


def call_model(model_label, final_prompt, temperature, top_p, max_new_tokens):
    """Sample one completion for ``final_prompt`` and return it as text.

    The prompt is passed to the cached pipeline as a plain string. Only the
    newly generated text is requested (``return_full_text=False``).

    Args:
        model_label: Label passed to ``load_generator``.
        final_prompt: Prompt string, typically from ``build_prompt``.
        temperature: Sampling temperature; values below 0.05 are raised to 0.05.
        top_p: Nucleus-sampling threshold.
        max_new_tokens: Generation length limit; converted to ``int``.

    Returns:
        The stripped generated text, or a fixed placeholder message when the
        stripped text is empty.
    """
    pipe = load_generator(model_label)
    eos_id = pipe.tokenizer.eos_token_id

    generation_options = {
        "max_new_tokens": int(max_new_tokens),
        "temperature": max(float(temperature), 0.05),
        "top_p": float(top_p),
        "do_sample": True,
        "repetition_penalty": 1.08,
        "return_full_text": False,
        # EOS serves as both the padding and the stop token.
        "pad_token_id": eos_id,
        "eos_token_id": eos_id,
    }
    completions = pipe(final_prompt, **generation_options)

    reply = completions[0]["generated_text"].strip()
    if not reply:
        return "(The model returned an empty response. Try more tokens.)"
    return reply


def generate_viewpoint(
    model_label,
    scene,
    viewpoint,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Handle the "Generate" button: one scene, one viewpoint, one model run.

    Returns:
        A 3-tuple ``(output, prompt, note)``:

        * blank or missing scene -> a request to enter a scene and two empty
          strings;
        * model failure -> the error message, the prompt that was built, and
          a retry hint;
        * success -> the generated text, the prompt, and a note holding the
          model id and the elapsed time.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", "", ""

    prompt_text = build_prompt(model_label, scene, viewpoint, output_mode)

    t0 = time.perf_counter()
    try:
        generated = call_model(model_label, prompt_text, temperature, top_p, max_new_tokens)
    except Exception as exc:
        failure = f"Error while running the model: {exc}"
        hint = "Try the fast model first, or reduce max tokens."
        return failure, prompt_text, hint
    seconds = time.perf_counter() - t0

    run_note = "\n".join(
        [
            f"Model: {MODEL_OPTIONS[model_label]}",
            f"Elapsed: {seconds:.1f} seconds",
            "First use can be slower because the model has to download and load.",
        ]
    )
    return generated, prompt_text, run_note


def make_paper_notes(scene, outputs_text):
    """Return the fixed reading checklist, titled with the scene under test.

    Args:
        scene: Scene description. When it is empty, blank, or ``None`` the
            title falls back to "the tested scene".
        outputs_text: Accepted for the callers' convenience; the checklist
            text does not depend on it.

    Returns:
        A multi-line string: a title line, six numbered checks, and a
        fill-in-the-blank sentence for the paper.
    """
    subject = "the tested scene"
    if scene:
        trimmed = scene.strip()
        if trimmed:
            subject = trimmed

    checks = (
        "1. Visibility: Which objects become visible or hidden in each viewpoint?",
        "2. Occlusion: Does the model notice when one object blocks another?",
        "3. Scale: Does low angle or close-up change perceived size or importance?",
        "4. Layout: Does bird's-eye or wide shot explain spatial relationships?",
        (
            "5. Specificity: Does the model describe this scene, or could the "
            "paragraph fit almost any scene?"
        ),
        (
            "6. Finding sentence: Write one cautious sentence about whether the "
            "model understands viewpoint consequences or only uses camera-angle "
            "words."
        ),
    )
    header = (
        f"Paper notes for: {subject}\n\n"
        "Use these checks while reading the outputs:\n\n"
    )
    footer = (
        "\n\n"
        "Useful wording for the paper:\n"
        "In this small test, the model was strongest when ____. It was weakest "
        "when ____. The clearest limitation was ____."
    )
    return header + "\n".join(checks) + footer


def _viewpoint_heading(viewpoint):
    """Return the Markdown ``##`` heading for one viewpoint section.

    Words are capitalized after spaces and hyphens only. ``str.title()`` also
    capitalizes after an apostrophe, which rendered "bird's-eye view" as
    "Bird'S-Eye View".
    """
    words = (
        "-".join(piece.capitalize() for piece in word.split("-"))
        for word in viewpoint.split(" ")
    )
    return "## " + " ".join(words)


def run_five_viewpoints(model_label, scene, output_mode, temperature, top_p, max_new_tokens):
    """Handle "Run Five Viewpoints": describe one scene from every viewpoint.

    Each entry of ``FIVE_VIEWPOINTS`` is prompted and generated in order with
    the same model, output mode, and sampling settings.

    Returns:
        A 2-tuple ``(markdown, notes)``:

        * blank or missing scene -> a request to enter a scene and an empty
          string;
        * failure -> the error message and a retry hint, with any sections
          finished before the failure placed ahead of the error message;
        * success -> all sections joined by horizontal rules, plus the paper
          checklist followed by the total elapsed time.
    """
    if not scene or not scene.strip():
        return "Please enter a scene.", ""

    separator = "\n\n---\n\n"
    started = time.perf_counter()
    sections = []
    try:
        for viewpoint in FIVE_VIEWPOINTS:
            final_prompt = build_prompt(model_label, scene, viewpoint, output_mode)
            output = call_model(
                model_label,
                final_prompt,
                temperature,
                top_p,
                max_new_tokens,
            )
            sections.append(f"{_viewpoint_heading(viewpoint)}\n\n{output}")
    except Exception as exc:
        error_text = f"Error while running the five-viewpoint test: {exc}"
        hint = "Try the fast model first, or reduce max tokens."
        if not sections:
            return error_text, hint
        # Keep the viewpoints that already finished; each one cost a full
        # model run, so discarding them would waste that work.
        return separator.join(sections + [error_text]), hint

    elapsed = time.perf_counter() - started
    outputs_text = separator.join(sections)
    notes = make_paper_notes(scene, outputs_text) + f"\n\nElapsed: {elapsed:.1f} seconds."
    return outputs_text, notes


def notes_from_pasted_outputs(scene, pasted_outputs):
    """Handle "Make Paper Notes": return the checklist once outputs are pasted.

    Args:
        scene: Scene description forwarded to ``make_paper_notes``.
        pasted_outputs: Text pasted by the user.

    Returns:
        The checklist from ``make_paper_notes``, or a reminder to paste the
        outputs when ``pasted_outputs`` is empty, blank, or ``None``.
    """
    has_text = bool(pasted_outputs) and bool(pasted_outputs.strip())
    if has_text:
        return make_paper_notes(scene, pasted_outputs)
    return "Paste your generated outputs first."


# Gradio UI definition. Components appear on the page in the order they are
# created inside each ``with`` block, so statement order here is the layout.
with gr.Blocks(title="Camera Angle Model Lab", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Camera Angle Model Lab\n"
        "CPU-only viewpoint lab for testing how small language models describe "
        "the same scene from different visual perspectives. No API tokens or paid "
        "compute required. The first run may take a minute while the model loads."
    )

    # Tab 1: one scene, one viewpoint, one generation.
    with gr.Tab("Single Viewpoint Writer"):
        with gr.Row():
            model_one = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            viewpoint_one = gr.Dropdown(
                choices=list(VIEWPOINT_GUIDES.keys()),
                value="close-up",
                label="Viewpoint",
            )
            mode_one = gr.Dropdown(
                choices=list(MODE_GUIDES.keys()),
                value="visual analysis paragraph",
                label="Output mode",
            )

        scene_one = gr.Textbox(
            label="Scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )

        # Sampling controls; the values are forwarded unchanged to
        # generate_viewpoint().
        with gr.Row():
            temperature_one = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
            top_p_one = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_one = gr.Slider(40, 170, value=100, step=10, label="Max new tokens")

        run_one = gr.Button("Generate", variant="primary")
        output_one = gr.Textbox(label="Generated output", lines=10)
        prompt_sent_one = gr.Textbox(label="Prompt sent to model", lines=8)
        note_one = gr.Textbox(label="Run note", lines=3)

        # The input order must match generate_viewpoint()'s parameter order;
        # its 3-tuple return fills the three output boxes in order.
        run_one.click(
            fn=generate_viewpoint,
            inputs=[
                model_one,
                scene_one,
                viewpoint_one,
                mode_one,
                temperature_one,
                top_p_one,
                max_tokens_one,
            ],
            outputs=[output_one, prompt_sent_one, note_one],
        )

        # Each example row supplies a scene, a viewpoint, and an output mode
        # for the three inputs listed below.
        gr.Examples(
            examples=[
                ["A dog hides under a kitchen table while a child looks for it.", "close-up", "visual analysis paragraph"],
                ["A crowded city street after rain reflects neon signs in puddles.", "bird's-eye view", "cinematic shot description"],
                ["A soccer player prepares to take a penalty kick while the goalkeeper waits.", "low angle", "storyboard note"],
                ["A person stands at the edge of a forest path holding a lantern.", "over-the-shoulder", "image prompt helper"],
                ["A museum gallery contains one bright painting at the far end of the room.", "wide shot", "photography caption"],
            ],
            inputs=[scene_one, viewpoint_one, mode_one],
        )

    # Tab 2: the same scene run through every entry of FIVE_VIEWPOINTS.
    with gr.Tab("Five-Viewpoint Test"):
        model_grid = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value=DEFAULT_MODEL,
            label="Model",
        )
        scene_grid = gr.Textbox(
            label="Shared scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        mode_grid = gr.Dropdown(
            choices=list(MODE_GUIDES.keys()),
            value="visual analysis paragraph",
            label="Output mode",
        )
        # Lower token ceiling and default than tab 1 -- presumably because
        # this tab runs five generations back to back.
        with gr.Row():
            temperature_grid = gr.Slider(0.1, 1.5, value=0.6, step=0.1, label="Temperature")
            top_p_grid = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_grid = gr.Slider(40, 140, value=80, step=10, label="Max new tokens")

        run_grid = gr.Button("Run Five Viewpoints", variant="primary")
        grid_output = gr.Markdown(label="Five-viewpoint output")
        grid_notes = gr.Textbox(label="Paper notes", lines=14)

        # The input order must match run_five_viewpoints()'s parameter order;
        # its 2-tuple return fills the Markdown area and the notes box.
        run_grid.click(
            fn=run_five_viewpoints,
            inputs=[
                model_grid,
                scene_grid,
                mode_grid,
                temperature_grid,
                top_p_grid,
                max_tokens_grid,
            ],
            outputs=[grid_output, grid_notes],
        )

    # Tab 3: produce the reading checklist for outputs pasted by the user;
    # no model is run here.
    with gr.Tab("Paper Notes Helper"):
        scene_notes = gr.Textbox(
            label="Scene being tested",
            lines=3,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        pasted_outputs = gr.Textbox(
            label="Paste generated outputs here",
            lines=12,
            placeholder="Paste close-up, wide shot, bird's-eye, low angle, and over-the-shoulder outputs here.",
        )
        run_notes = gr.Button("Make Paper Notes", variant="primary")
        paper_notes = gr.Textbox(label="Checklist for findings section", lines=14)

        run_notes.click(
            fn=notes_from_pasted_outputs,
            inputs=[scene_notes, pasted_outputs],
            outputs=paper_notes,
        )

    # Footer shown below the tabs.
    gr.Markdown(
        "### Duplication note\n"
        "This Space uses only local CPU models. No tokens, API keys, or paid "
        "hardware are required. Students can duplicate it and edit the viewpoints, "
        "output modes, examples, or model list."
    )


# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()