# File size: 13,115 Bytes
# 7bb5bc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
from functools import lru_cache
import time

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Each dropdown label is spelled exactly once and reused below, so the mapping,
# the default, and the instruct-model set cannot drift apart through a typo.
_SMOLLM2_360M_LABEL = "SmolLM2 360M Instruct (best default)"
_SMOLLM2_135M_LABEL = "SmolLM2 135M Instruct (fast)"
_DISTILGPT2_LABEL = "distilgpt2 (baseline)"

# UI label -> Hugging Face model id handed to `from_pretrained`.
MODEL_OPTIONS = {
    _SMOLLM2_360M_LABEL: "HuggingFaceTB/SmolLM2-360M-Instruct",
    _SMOLLM2_135M_LABEL: "HuggingFaceTB/SmolLM2-135M-Instruct",
    _DISTILGPT2_LABEL: "distilgpt2",
}

# Label preselected in the model dropdowns.
DEFAULT_MODEL = _SMOLLM2_360M_LABEL

# Labels that receive the long instruction-style prompt in `build_prompt`;
# any other label receives the short completion-style prompt.
INSTRUCT_MODEL_LABELS = {_SMOLLM2_360M_LABEL, _SMOLLM2_135M_LABEL}

# Viewpoint name -> guidance sentence(s) for that camera position.
# The keys populate the "Viewpoint" dropdown and are looked up by
# `build_prompt`; the values are inserted into the instruct-model prompt as
# the "Viewpoint guidance" line, so editing a value changes the prompt text.
VIEWPOINT_GUIDES = {
    "close-up": (
        "Focus on nearby detail, texture, facial expression, small objects, and "
        "what is cropped out or hidden by the tight framing."
    ),
    "wide shot": (
        "Focus on layout, background, scale, distance between objects, and how "
        "the whole scene is arranged."
    ),
    "bird's-eye view": (
        "Describe the scene from above. Focus on map-like layout, paths, shapes, "
        "and what becomes visible only from overhead."
    ),
    "low angle": (
        "Describe the scene from below. Focus on height, scale, foreground, "
        "dominance, sky or ceiling, and what is hidden behind tall objects."
    ),
    "over-the-shoulder": (
        "Describe what is visible from behind one character or object. Focus on "
        "foreground shoulder/frame, partial visibility, and what the viewer can "
        "infer but not fully see."
    ),
}

# Output-mode name -> instruction describing the writing style to produce.
# The keys populate the "Output mode" dropdowns and are looked up by
# `build_prompt`; the values are inserted into the instruct-model prompt as
# the "Output guidance" line.
MODE_GUIDES = {
    "cinematic shot description": (
        "Write like a film shot description, emphasizing framing, movement, and "
        "what the viewer sees first."
    ),
    "photography caption": (
        "Write like a precise photography caption, emphasizing composition and "
        "visible details."
    ),
    "storyboard note": (
        "Write like a storyboard note for an artist, naming visual beats and "
        "spatial relationships."
    ),
    "image prompt helper": (
        "Write a detailed image-generation prompt that makes the viewpoint and "
        "composition explicit."
    ),
    "visual analysis paragraph": (
        "Write an analytical paragraph explaining how the viewpoint changes "
        "what is visible and what is hidden."
    ),
}

# Viewpoints generated, in this order, by `run_five_viewpoints`.
# Each entry is passed to `build_prompt`, which indexes VIEWPOINT_GUIDES with
# it, so every name listed here must also be a key of VIEWPOINT_GUIDES.
FIVE_VIEWPOINTS = [
    "close-up",
    "wide shot",
    "bird's-eye view",
    "low angle",
    "over-the-shoulder",
]


# Cap PyTorch's CPU thread pool at two threads. This is best-effort: a failure
# here must never stop the app from starting, so the exception is still
# caught broadly, but it is reported instead of being discarded silently.
try:
    torch.set_num_threads(2)
except Exception as exc:
    print(f"Warning: could not limit torch to 2 CPU threads: {exc}")


@lru_cache(maxsize=3)
def load_generator(model_label):
    """Build (and memoize) a CPU text-generation pipeline for a UI model label.

    Args:
        model_label: A key of MODEL_OPTIONS; a KeyError is raised otherwise.

    Returns:
        A transformers "text-generation" pipeline wrapping the model and its
        tokenizer. Up to three pipelines are kept by the LRU cache, so
        repeated calls with the same label reuse the loaded model.
    """
    repo_id = MODEL_OPTIONS[model_label]

    tok = AutoTokenizer.from_pretrained(repo_id)
    # Some tokenizers ship without a pad token; fall back to the EOS token.
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.float32)
    lm.eval()

    # device=-1 selects the CPU in the transformers pipeline API.
    text_pipeline = pipeline("text-generation", model=lm, tokenizer=tok, device=-1)
    return text_pipeline


def build_prompt(model_label, scene, viewpoint, output_mode):
    """Assemble the text prompt sent to the model.

    Args:
        model_label: UI label of the selected model. Labels listed in
            INSTRUCT_MODEL_LABELS get the long instruction prompt; all other
            labels get a short completion-style prompt.
        scene: Scene description; surrounding whitespace is stripped.
        viewpoint: A key of VIEWPOINT_GUIDES.
        output_mode: A key of MODE_GUIDES.

    Returns:
        The prompt string.

    Raises:
        KeyError: If `viewpoint` or `output_mode` is unknown. Both lookups
            happen before the model check, so this applies to every model.
    """
    cleaned_scene = scene.strip()
    angle_hint = VIEWPOINT_GUIDES[viewpoint]
    format_hint = MODE_GUIDES[output_mode]

    if model_label in INSTRUCT_MODEL_LABELS:
        preamble = (
            "You are a careful visual scene description assistant for a student "
            "research project.\n"
            "Describe the same scene from a selected viewpoint. The important question "
            "is not just camera vocabulary; explain what becomes visible, hidden, "
            "larger, smaller, foregrounded, or backgrounded because of the viewpoint.\n\n"
        )
        fields = "\n".join(
            [
                f"Viewpoint: {viewpoint}",
                f"Viewpoint guidance: {angle_hint}",
                f"Output mode: {output_mode}",
                f"Output guidance: {format_hint}",
                f"Scene: {cleaned_scene}",
            ]
        )
        return preamble + fields + "\n\n" + "Write the response now:"

    # Base (non-instruct) models: a short text to continue, without the
    # guidance sentences.
    return "\n".join(
        [
            f"{viewpoint.title()} {output_mode}.",
            f"Scene: {cleaned_scene}",
            "Description:",
        ]
    )


def call_model(model_label, final_prompt, temperature, top_p, max_new_tokens):
    """Sample one completion for `final_prompt` from the selected model.

    Args:
        model_label: Key of MODEL_OPTIONS, forwarded to `load_generator`.
        final_prompt: Prompt text to continue.
        temperature: Sampling temperature; values below 0.05 are raised to 0.05.
        top_p: Nucleus-sampling probability mass.
        max_new_tokens: Upper bound on generated tokens (coerced to int).

    Returns:
        The stripped generated text, or a fixed placeholder message when the
        model produced only whitespace.
    """
    pipe = load_generator(model_label)
    tok = pipe.tokenizer

    sampling_options = dict(
        max_new_tokens=int(max_new_tokens),
        temperature=max(float(temperature), 0.05),
        top_p=float(top_p),
        do_sample=True,
        repetition_penalty=1.08,
        # Ask for the continuation only, not prompt + continuation.
        return_full_text=False,
        pad_token_id=tok.eos_token_id,
        eos_token_id=tok.eos_token_id,
    )
    completions = pipe(final_prompt, **sampling_options)

    reply = completions[0]["generated_text"].strip()
    if not reply:
        return "(The model returned an empty response. Try more tokens.)"
    return reply


def generate_viewpoint(
    model_label,
    scene,
    viewpoint,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Generate one viewpoint description for the "Single Viewpoint Writer" tab.

    Args:
        model_label: Key of MODEL_OPTIONS chosen in the model dropdown.
        scene: Scene text typed by the user.
        viewpoint: Key of VIEWPOINT_GUIDES.
        output_mode: Key of MODE_GUIDES.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        A 3-tuple `(output, prompt, note)` of strings matching the three
        output textboxes. For an empty scene the output asks for a scene and
        the other two fields are empty. On any failure the output carries the
        error message, `prompt` holds whatever prompt was built (empty if
        prompt construction itself failed), and `note` holds a hint.
    """
    if not scene or not scene.strip():
        return "Please enter a scene.", "", ""

    # Stays empty when build_prompt itself fails (e.g. unknown viewpoint or
    # output mode), so the error tuple below can always be returned.
    final_prompt = ""
    try:
        # Built inside the try so prompt-construction errors are reported the
        # same way as model errors, as run_five_viewpoints already does.
        final_prompt = build_prompt(model_label, scene, viewpoint, output_mode)
        started = time.perf_counter()
        output = call_model(
            model_label,
            final_prompt,
            temperature,
            top_p,
            max_new_tokens,
        )
    except Exception as exc:
        return (
            f"Error while running the model: {exc}",
            final_prompt,
            "Try the fast model first, or reduce max tokens.",
        )

    elapsed = time.perf_counter() - started
    note = (
        f"Model: {MODEL_OPTIONS[model_label]}\n"
        f"Elapsed: {elapsed:.1f} seconds\n"
        "First use can be slower because the model has to download and load."
    )
    return output, final_prompt, note


def make_paper_notes(scene, outputs_text):
    """Return the fixed findings checklist, headed by the scene under test.

    Args:
        scene: Scene description; when empty, blank, or None the heading
            falls back to "the tested scene".
        outputs_text: Generated outputs. Accepted for interface compatibility
            but not used; the checklist text does not depend on it.

    Returns:
        A multi-line string: heading, six numbered checks, and a
        fill-in-the-blank sentence template.
    """
    heading_scene = (scene or "").strip() or "the tested scene"

    checks = (
        "1. Visibility: Which objects become visible or hidden in each viewpoint?",
        "2. Occlusion: Does the model notice when one object blocks another?",
        "3. Scale: Does low angle or close-up change perceived size or importance?",
        "4. Layout: Does bird's-eye or wide shot explain spatial relationships?",
        "5. Specificity: Does the model describe this scene, or could the paragraph "
        "fit almost any scene?",
        "6. Finding sentence: Write one cautious sentence about whether the model "
        "understands viewpoint consequences or only uses camera-angle words.",
    )
    sentence_template = (
        "In this small test, the model was strongest when ____. It was weakest "
        "when ____. The clearest limitation was ____."
    )

    # Sections are separated by a blank line; checks sit one per line.
    sections = (
        f"Paper notes for: {heading_scene}",
        "Use these checks while reading the outputs:",
        "\n".join(checks),
        "Useful wording for the paper:\n" + sentence_template,
    )
    return "\n\n".join(sections)


def run_five_viewpoints(model_label, scene, output_mode, temperature, top_p, max_new_tokens):
    """Describe one scene from every viewpoint in FIVE_VIEWPOINTS.

    Args:
        model_label: Key of MODEL_OPTIONS.
        scene: Scene text shared by all viewpoints.
        output_mode: Key of MODE_GUIDES.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        max_new_tokens: Upper bound on generated tokens per viewpoint.

    Returns:
        A 2-tuple `(outputs_markdown, notes)`. The first item has one
        "## Viewpoint" section per viewpoint separated by horizontal rules;
        the second is the paper-notes checklist plus the total elapsed time.
        If any viewpoint fails, the first item is the error message and the
        second a hint; sections generated before the failure are discarded.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", ""

    def render_section(angle):
        # One prompt + one generation per viewpoint, formatted as Markdown.
        prompt = build_prompt(model_label, scene, angle, output_mode)
        reply = call_model(model_label, prompt, temperature, top_p, max_new_tokens)
        return f"## {angle.title()}\n\n{reply}"

    t0 = time.perf_counter()
    try:
        rendered = [render_section(angle) for angle in FIVE_VIEWPOINTS]
    except Exception as exc:
        return (
            f"Error while running the five-viewpoint test: {exc}",
            "Try the fast model first, or reduce max tokens.",
        )
    elapsed = time.perf_counter() - t0

    outputs_text = "\n\n---\n\n".join(rendered)
    checklist = make_paper_notes(scene, outputs_text)
    return outputs_text, checklist + f"\n\nElapsed: {elapsed:.1f} seconds."


def notes_from_pasted_outputs(scene, pasted_outputs):
    """Return the paper-notes checklist for outputs the user pasted in.

    Args:
        scene: Scene description used in the checklist heading.
        pasted_outputs: Text pasted by the user; must contain something other
            than whitespace.

    Returns:
        The checklist from `make_paper_notes`, or a reminder message when
        nothing was pasted.
    """
    has_content = bool(pasted_outputs) and bool(pasted_outputs.strip())
    if has_content:
        return make_paper_notes(scene, pasted_outputs)
    return "Paste your generated outputs first."


# Gradio UI definition. `demo` is the app object launched at the bottom of the
# file. Components are created inside nested `with` contexts (Blocks > Tab >
# Row); keep the creation order and nesting intact when editing, since the
# rendered layout follows it.
with gr.Blocks(title="Camera Angle Model Lab", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Camera Angle Model Lab\n"
        "CPU-only viewpoint lab for testing how small language models describe "
        "the same scene from different visual perspectives. No API tokens or paid "
        "compute required. The first run may take a minute while the model loads."
    )

    # Tab 1: one scene, one viewpoint, one output mode -> one generation.
    with gr.Tab("Single Viewpoint Writer"):
        with gr.Row():
            model_one = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            viewpoint_one = gr.Dropdown(
                choices=list(VIEWPOINT_GUIDES.keys()),
                value="close-up",
                label="Viewpoint",
            )
            mode_one = gr.Dropdown(
                choices=list(MODE_GUIDES.keys()),
                value="visual analysis paragraph",
                label="Output mode",
            )

        scene_one = gr.Textbox(
            label="Scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )

        with gr.Row():
            temperature_one = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
            top_p_one = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_one = gr.Slider(40, 170, value=100, step=10, label="Max new tokens")

        run_one = gr.Button("Generate", variant="primary")
        output_one = gr.Textbox(label="Generated output", lines=10)
        prompt_sent_one = gr.Textbox(label="Prompt sent to model", lines=8)
        note_one = gr.Textbox(label="Run note", lines=3)

        # `inputs` order must match generate_viewpoint's positional parameters;
        # `outputs` order must match its (output, prompt, note) return tuple.
        run_one.click(
            fn=generate_viewpoint,
            inputs=[
                model_one,
                scene_one,
                viewpoint_one,
                mode_one,
                temperature_one,
                top_p_one,
                max_tokens_one,
            ],
            outputs=[output_one, prompt_sent_one, note_one],
        )

        # Each example row is [scene, viewpoint, output mode], matching the
        # `inputs` list below. The viewpoint and mode values must be existing
        # keys of VIEWPOINT_GUIDES and MODE_GUIDES respectively.
        gr.Examples(
            examples=[
                ["A dog hides under a kitchen table while a child looks for it.", "close-up", "visual analysis paragraph"],
                ["A crowded city street after rain reflects neon signs in puddles.", "bird's-eye view", "cinematic shot description"],
                ["A soccer player prepares to take a penalty kick while the goalkeeper waits.", "low angle", "storyboard note"],
                ["A person stands at the edge of a forest path holding a lantern.", "over-the-shoulder", "image prompt helper"],
                ["A museum gallery contains one bright painting at the far end of the room.", "wide shot", "photography caption"],
            ],
            inputs=[scene_one, viewpoint_one, mode_one],
        )

    # Tab 2: the same scene run through every entry of FIVE_VIEWPOINTS.
    with gr.Tab("Five-Viewpoint Test"):
        model_grid = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value=DEFAULT_MODEL,
            label="Model",
        )
        scene_grid = gr.Textbox(
            label="Shared scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        mode_grid = gr.Dropdown(
            choices=list(MODE_GUIDES.keys()),
            value="visual analysis paragraph",
            label="Output mode",
        )
        with gr.Row():
            temperature_grid = gr.Slider(0.1, 1.5, value=0.6, step=0.1, label="Temperature")
            top_p_grid = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            # Lower ceiling and default than the single-viewpoint tab (140/80
            # vs 170/100) - presumably because one click runs five generations.
            max_tokens_grid = gr.Slider(40, 140, value=80, step=10, label="Max new tokens")

        run_grid = gr.Button("Run Five Viewpoints", variant="primary")
        # Markdown component: run_five_viewpoints returns "## " headings and
        # "---" separators for each viewpoint section.
        grid_output = gr.Markdown(label="Five-viewpoint output")
        grid_notes = gr.Textbox(label="Paper notes", lines=14)

        # `inputs` order must match run_five_viewpoints' positional parameters;
        # `outputs` order must match its (outputs_text, notes) return tuple.
        run_grid.click(
            fn=run_five_viewpoints,
            inputs=[
                model_grid,
                scene_grid,
                mode_grid,
                temperature_grid,
                top_p_grid,
                max_tokens_grid,
            ],
            outputs=[grid_output, grid_notes],
        )

    # Tab 3: produce the findings checklist without running any model.
    with gr.Tab("Paper Notes Helper"):
        scene_notes = gr.Textbox(
            label="Scene being tested",
            lines=3,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        pasted_outputs = gr.Textbox(
            label="Paste generated outputs here",
            lines=12,
            placeholder="Paste close-up, wide shot, bird's-eye, low angle, and over-the-shoulder outputs here.",
        )
        run_notes = gr.Button("Make Paper Notes", variant="primary")
        paper_notes = gr.Textbox(label="Checklist for findings section", lines=14)

        run_notes.click(
            fn=notes_from_pasted_outputs,
            inputs=[scene_notes, pasted_outputs],
            outputs=paper_notes,
        )

    gr.Markdown(
        "### Duplication note\n"
        "This Space uses only local CPU models. No tokens, API keys, or paid "
        "hardware are required. Students can duplicate it and edit the viewpoints, "
        "output modes, examples, or model list."
    )


# Start the Gradio server only when this file is executed as a script;
# importing the module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()