"""Camera Angle Model Lab.

A Gradio app that asks small local text-generation models to describe the
same scene from different camera viewpoints, plus a checklist generator to
help students write up what they observe.
"""

from functools import lru_cache
import re
import time

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Dropdown label -> Hugging Face model id.
MODEL_OPTIONS = {
    "SmolLM2 360M Instruct (best default)": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2 135M Instruct (fast)": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "distilgpt2 (baseline)": "distilgpt2",
}

DEFAULT_MODEL = "SmolLM2 360M Instruct (best default)"

# Labels that receive the long instruction-style prompt in build_prompt();
# every other label gets the short completion-style prompt.
INSTRUCT_MODEL_LABELS = {
    "SmolLM2 360M Instruct (best default)",
    "SmolLM2 135M Instruct (fast)",
}

# Viewpoint name -> guidance sentence inserted into the instruct prompt.
VIEWPOINT_GUIDES = {
    "close-up": (
        "Focus on nearby detail, texture, facial expression, small objects, and "
        "what is cropped out or hidden by the tight framing."
    ),
    "wide shot": (
        "Focus on layout, background, scale, distance between objects, and how "
        "the whole scene is arranged."
    ),
    "bird's-eye view": (
        "Describe the scene from above. Focus on map-like layout, paths, shapes, "
        "and what becomes visible only from overhead."
    ),
    "low angle": (
        "Describe the scene from below. Focus on height, scale, foreground, "
        "dominance, sky or ceiling, and what is hidden behind tall objects."
    ),
    "over-the-shoulder": (
        "Describe what is visible from behind one character or object. Focus on "
        "foreground shoulder/frame, partial visibility, and what the viewer can "
        "infer but not fully see."
    ),
}

# Output mode name -> guidance sentence inserted into the instruct prompt.
MODE_GUIDES = {
    "cinematic shot description": (
        "Write like a film shot description, emphasizing framing, movement, and "
        "what the viewer sees first."
    ),
    "photography caption": (
        "Write like a precise photography caption, emphasizing composition and "
        "visible details."
    ),
    "storyboard note": (
        "Write like a storyboard note for an artist, naming visual beats and "
        "spatial relationships."
    ),
    "image prompt helper": (
        "Write a detailed image-generation prompt that makes the viewpoint and "
        "composition explicit."
    ),
    "visual analysis paragraph": (
        "Write an analytical paragraph explaining how the viewpoint changes "
        "what is visible and what is hidden."
    ),
}

# Viewpoints run, in this order, by the five-viewpoint test.
FIVE_VIEWPOINTS = [
    "close-up",
    "wide shot",
    "bird's-eye view",
    "low angle",
    "over-the-shoulder",
]

# Matches a run of letters optionally followed by an apostrophe suffix, so
# "bird's" is capitalized as one word.
_WORD_RE = re.compile(r"[A-Za-z]+('[A-Za-z]+)?")

# Limiting torch's thread count is a best-effort tuning step: a failure here
# must never stop the app from starting, so it is deliberately ignored.
try:
    torch.set_num_threads(2)
except Exception:
    pass


def _title_case(label):
    """Capitalize each word of *label* without breaking on apostrophes.

    ``str.title()`` treats an apostrophe as a word boundary, which turns
    "bird's-eye view" into "Bird'S-Eye View". This helper yields
    "Bird's-Eye View" while still giving "Close-Up" and "Over-The-Shoulder".
    """
    return _WORD_RE.sub(lambda match: match.group(0).capitalize(), label)


@lru_cache(maxsize=3)
def load_generator(model_label):
    """Build (and cache) a text-generation pipeline for *model_label*.

    Args:
        model_label: A key of ``MODEL_OPTIONS``.

    Returns:
        A transformers ``text-generation`` pipeline wrapping the model and
        its tokenizer.

    Raises:
        KeyError: If *model_label* is not a key of ``MODEL_OPTIONS``.
    """
    model_id = MODEL_OPTIONS[model_label]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Some tokenizers ship without a pad token; reuse EOS so the pipeline
    # has one available.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
    model.eval()
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=-1,  # -1 selects the CPU in the transformers pipeline API
    )


def build_prompt(model_label, scene, viewpoint, output_mode):
    """Return the prompt text for one scene/viewpoint/output-mode combination.

    Labels in ``INSTRUCT_MODEL_LABELS`` get a long instruction-style prompt
    that includes the viewpoint and output-mode guidance; any other label
    gets a short completion-style prompt.

    Args:
        model_label: Model dropdown label; only used to choose the prompt style.
        scene: Scene description; surrounding whitespace is stripped.
        viewpoint: A key of ``VIEWPOINT_GUIDES``.
        output_mode: A key of ``MODE_GUIDES``.

    Raises:
        KeyError: If *viewpoint* or *output_mode* is not a known key.
    """
    scene = scene.strip()
    # Looked up before branching so an unknown key fails for every model.
    viewpoint_guide = VIEWPOINT_GUIDES[viewpoint]
    mode_guide = MODE_GUIDES[output_mode]

    if model_label not in INSTRUCT_MODEL_LABELS:
        return (
            f"{_title_case(viewpoint)} {output_mode}.\n"
            f"Scene: {scene}\n"
            "Description:"
        )

    return (
        "You are a careful visual scene description assistant for a student "
        "research project.\n"
        "Describe the same scene from a selected viewpoint. The important question "
        "is not just camera vocabulary; explain what becomes visible, hidden, "
        "larger, smaller, foregrounded, or backgrounded because of the viewpoint.\n\n"
        f"Viewpoint: {viewpoint}\n"
        f"Viewpoint guidance: {viewpoint_guide}\n"
        f"Output mode: {output_mode}\n"
        f"Output guidance: {mode_guide}\n"
        f"Scene: {scene}\n\n"
        "Write the response now:"
    )


def call_model(model_label, final_prompt, temperature, top_p, max_new_tokens):
    """Generate a sampled continuation of *final_prompt*.

    Args:
        model_label: A key of ``MODEL_OPTIONS``.
        final_prompt: Prompt text passed to the pipeline as-is.
        temperature: Sampling temperature; values below 0.05 are raised to 0.05.
        top_p: Nucleus-sampling threshold.
        max_new_tokens: Generation budget; converted with ``int()``.

    Returns:
        The stripped generated text without the prompt, or a placeholder
        message when the stripped text is empty.
    """
    generator = load_generator(model_label)
    tokenizer = generator.tokenizer
    result = generator(
        final_prompt,
        max_new_tokens=int(max_new_tokens),
        temperature=max(float(temperature), 0.05),
        top_p=float(top_p),
        do_sample=True,
        repetition_penalty=1.08,
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    text = result[0]["generated_text"].strip()
    return text if text else "(The model returned an empty response. Try more tokens.)"


def generate_viewpoint(
    model_label,
    scene,
    viewpoint,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Run one generation for the "Single Viewpoint Writer" tab.

    Returns:
        A 3-tuple ``(output, prompt_sent, note)``. For a blank scene the
        output asks for a scene and the other two items are empty. If the
        model call raises, the output carries the error text and the note
        carries a troubleshooting hint.
    """
    if not scene or not scene.strip():
        return "Please enter a scene.", "", ""

    final_prompt = build_prompt(model_label, scene, viewpoint, output_mode)
    started = time.perf_counter()
    try:
        output = call_model(
            model_label,
            final_prompt,
            temperature,
            top_p,
            max_new_tokens,
        )
    except Exception as exc:
        # UI boundary: show the failure in the output box rather than raising.
        return (
            f"Error while running the model: {exc}",
            final_prompt,
            "Try the fast model first, or reduce max tokens.",
        )

    elapsed = time.perf_counter() - started
    note = (
        f"Model: {MODEL_OPTIONS[model_label]}\n"
        f"Elapsed: {elapsed:.1f} seconds\n"
        "First use can be slower because the model has to download and load."
    )
    return output, final_prompt, note


def make_paper_notes(scene, outputs_text):
    """Return the fixed findings checklist, headed by the scene under test.

    Args:
        scene: Scene description; when blank or ``None`` the heading falls
            back to "the tested scene".
        outputs_text: Generated outputs. Currently unused by the checklist;
            kept so existing callers keep working.
    """
    scene_line = scene.strip() if scene and scene.strip() else "the tested scene"
    return (
        f"Paper notes for: {scene_line}\n\n"
        "Use these checks while reading the outputs:\n\n"
        "1. Visibility: Which objects become visible or hidden in each viewpoint?\n"
        "2. Occlusion: Does the model notice when one object blocks another?\n"
        "3. Scale: Does low angle or close-up change perceived size or importance?\n"
        "4. Layout: Does bird's-eye or wide shot explain spatial relationships?\n"
        "5. Specificity: Does the model describe this scene, or could the paragraph "
        "fit almost any scene?\n"
        "6. Finding sentence: Write one cautious sentence about whether the model "
        "understands viewpoint consequences or only uses camera-angle words.\n\n"
        "Useful wording for the paper:\n"
        "In this small test, the model was strongest when ____. It was weakest "
        "when ____. The clearest limitation was ____."
    )


def run_five_viewpoints(
    model_label,
    scene,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Generate one section per entry of ``FIVE_VIEWPOINTS`` for a shared scene.

    Returns:
        A 2-tuple ``(outputs_markdown, notes)``. If a viewpoint fails, the
        sections finished so far are returned followed by the error message,
        so completed generations are not thrown away; the notes then hold a
        troubleshooting hint instead of the checklist.
    """
    if not scene or not scene.strip():
        return "Please enter a scene.", ""

    separator = "\n\n---\n\n"
    started = time.perf_counter()
    sections = []
    for viewpoint in FIVE_VIEWPOINTS:
        try:
            final_prompt = build_prompt(model_label, scene, viewpoint, output_mode)
            output = call_model(
                model_label,
                final_prompt,
                temperature,
                top_p,
                max_new_tokens,
            )
        except Exception as exc:
            # UI boundary: report the failure but keep what already finished.
            sections.append(f"Error while running the five-viewpoint test: {exc}")
            return (
                separator.join(sections),
                "Try the fast model first, or reduce max tokens.",
            )
        sections.append(f"## {_title_case(viewpoint)}\n\n{output}")

    elapsed = time.perf_counter() - started
    outputs_text = separator.join(sections)
    notes = make_paper_notes(scene, outputs_text) + f"\n\nElapsed: {elapsed:.1f} seconds."
    return outputs_text, notes


def notes_from_pasted_outputs(scene, pasted_outputs):
    """Return the findings checklist, or a reminder when nothing was pasted."""
    if not pasted_outputs or not pasted_outputs.strip():
        return "Paste your generated outputs first."
    return make_paper_notes(scene, pasted_outputs)


with gr.Blocks(title="Camera Angle Model Lab", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Camera Angle Model Lab\n"
        "CPU-only viewpoint lab for testing how small language models describe "
        "the same scene from different visual perspectives. No API tokens or paid "
        "compute required. The first run may take a minute while the model loads."
    )

    with gr.Tab("Single Viewpoint Writer"):
        with gr.Row():
            model_one = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            viewpoint_one = gr.Dropdown(
                choices=list(VIEWPOINT_GUIDES.keys()),
                value="close-up",
                label="Viewpoint",
            )
            mode_one = gr.Dropdown(
                choices=list(MODE_GUIDES.keys()),
                value="visual analysis paragraph",
                label="Output mode",
            )

        scene_one = gr.Textbox(
            label="Scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )

        with gr.Row():
            temperature_one = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
            top_p_one = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_one = gr.Slider(40, 170, value=100, step=10, label="Max new tokens")

        run_one = gr.Button("Generate", variant="primary")
        output_one = gr.Textbox(label="Generated output", lines=10)
        prompt_sent_one = gr.Textbox(label="Prompt sent to model", lines=8)
        note_one = gr.Textbox(label="Run note", lines=3)

        run_one.click(
            fn=generate_viewpoint,
            inputs=[
                model_one,
                scene_one,
                viewpoint_one,
                mode_one,
                temperature_one,
                top_p_one,
                max_tokens_one,
            ],
            outputs=[output_one, prompt_sent_one, note_one],
        )

        gr.Examples(
            examples=[
                [
                    "A dog hides under a kitchen table while a child looks for it.",
                    "close-up",
                    "visual analysis paragraph",
                ],
                [
                    "A crowded city street after rain reflects neon signs in puddles.",
                    "bird's-eye view",
                    "cinematic shot description",
                ],
                [
                    "A soccer player prepares to take a penalty kick while the goalkeeper waits.",
                    "low angle",
                    "storyboard note",
                ],
                [
                    "A person stands at the edge of a forest path holding a lantern.",
                    "over-the-shoulder",
                    "image prompt helper",
                ],
                [
                    "A museum gallery contains one bright painting at the far end of the room.",
                    "wide shot",
                    "photography caption",
                ],
            ],
            inputs=[scene_one, viewpoint_one, mode_one],
        )

    with gr.Tab("Five-Viewpoint Test"):
        model_grid = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value=DEFAULT_MODEL,
            label="Model",
        )
        scene_grid = gr.Textbox(
            label="Shared scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        mode_grid = gr.Dropdown(
            choices=list(MODE_GUIDES.keys()),
            value="visual analysis paragraph",
            label="Output mode",
        )

        with gr.Row():
            temperature_grid = gr.Slider(0.1, 1.5, value=0.6, step=0.1, label="Temperature")
            top_p_grid = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_grid = gr.Slider(40, 140, value=80, step=10, label="Max new tokens")

        run_grid = gr.Button("Run Five Viewpoints", variant="primary")
        grid_output = gr.Markdown(label="Five-viewpoint output")
        grid_notes = gr.Textbox(label="Paper notes", lines=14)

        run_grid.click(
            fn=run_five_viewpoints,
            inputs=[
                model_grid,
                scene_grid,
                mode_grid,
                temperature_grid,
                top_p_grid,
                max_tokens_grid,
            ],
            outputs=[grid_output, grid_notes],
        )

    with gr.Tab("Paper Notes Helper"):
        scene_notes = gr.Textbox(
            label="Scene being tested",
            lines=3,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        pasted_outputs = gr.Textbox(
            label="Paste generated outputs here",
            lines=12,
            placeholder="Paste close-up, wide shot, bird's-eye, low angle, and over-the-shoulder outputs here.",
        )
        run_notes = gr.Button("Make Paper Notes", variant="primary")
        paper_notes = gr.Textbox(label="Checklist for findings section", lines=14)

        run_notes.click(
            fn=notes_from_pasted_outputs,
            inputs=[scene_notes, pasted_outputs],
            outputs=paper_notes,
        )

    gr.Markdown(
        "### Duplication note\n"
        "This Space uses only local CPU models. No tokens, API keys, or paid "
        "hardware are required. Students can duplicate it and edit the viewpoints, "
        "output modes, examples, or model list."
    )


if __name__ == "__main__":
    demo.launch()