| from functools import lru_cache |
| import time |
|
|
| import gradio as gr |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
|
|
|
|
# Human-readable dropdown label -> Hugging Face model repo id.
# The labels are used directly as UI choices and as cache keys in
# load_generator, so they must stay unique.
MODEL_OPTIONS = {
    "SmolLM2 360M Instruct (best default)": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2 135M Instruct (fast)": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "distilgpt2 (baseline)": "distilgpt2",
}


# Label preselected in every model dropdown.
DEFAULT_MODEL = "SmolLM2 360M Instruct (best default)"
# Labels whose models receive the long instruction-style prompt in
# build_prompt; any other label (e.g. distilgpt2) gets a short
# completion-style prompt instead.
INSTRUCT_MODEL_LABELS = {
    "SmolLM2 360M Instruct (best default)",
    "SmolLM2 135M Instruct (fast)",
}
|
|
# Viewpoint name -> guidance sentence injected into the instruct prompt.
# The keys populate the "Viewpoint" dropdown and match the entries of
# FIVE_VIEWPOINTS used by the batch test tab.
VIEWPOINT_GUIDES = {
    "close-up": (
        "Focus on nearby detail, texture, facial expression, small objects, and "
        "what is cropped out or hidden by the tight framing."
    ),
    "wide shot": (
        "Focus on layout, background, scale, distance between objects, and how "
        "the whole scene is arranged."
    ),
    "bird's-eye view": (
        "Describe the scene from above. Focus on map-like layout, paths, shapes, "
        "and what becomes visible only from overhead."
    ),
    "low angle": (
        "Describe the scene from below. Focus on height, scale, foreground, "
        "dominance, sky or ceiling, and what is hidden behind tall objects."
    ),
    "over-the-shoulder": (
        "Describe what is visible from behind one character or object. Focus on "
        "foreground shoulder/frame, partial visibility, and what the viewer can "
        "infer but not fully see."
    ),
}
|
|
# Output-mode label -> writing-style guidance injected into the instruct
# prompt. The keys populate the "Output mode" dropdowns in both tabs.
MODE_GUIDES = {
    "cinematic shot description": (
        "Write like a film shot description, emphasizing framing, movement, and "
        "what the viewer sees first."
    ),
    "photography caption": (
        "Write like a precise photography caption, emphasizing composition and "
        "visible details."
    ),
    "storyboard note": (
        "Write like a storyboard note for an artist, naming visual beats and "
        "spatial relationships."
    ),
    "image prompt helper": (
        "Write a detailed image-generation prompt that makes the viewpoint and "
        "composition explicit."
    ),
    "visual analysis paragraph": (
        "Write an analytical paragraph explaining how the viewpoint changes "
        "what is visible and what is hidden."
    ),
}
|
|
# Viewpoints exercised by the "Five-Viewpoint Test" tab, in display order.
# Derived from VIEWPOINT_GUIDES (dict insertion order is guaranteed since
# Python 3.7) instead of repeating the five strings, so the batch list can
# never drift out of sync with the guide table.
FIVE_VIEWPOINTS = list(VIEWPOINT_GUIDES)
|
|
|
|
# Cap PyTorch's intra-op thread count so generation stays well-behaved on
# small shared CPU hosts. Best-effort: any failure is deliberately ignored
# because the app still works with torch's default threading.
try:
    torch.set_num_threads(2)
except Exception:
    pass
|
|
|
|
@lru_cache(maxsize=3)
def load_generator(model_label):
    """Build and memoize a CPU text-generation pipeline for a model label.

    The label is looked up in MODEL_OPTIONS to get the Hugging Face repo id.
    At most three pipelines are kept alive at once (one per entry in
    MODEL_OPTIONS), thanks to the lru_cache bound.
    """
    repo_id = MODEL_OPTIONS[model_label]
    tok = AutoTokenizer.from_pretrained(repo_id)
    if tok.pad_token_id is None:
        # Decoder-only checkpoints often ship without a pad token; reuse EOS.
        tok.pad_token = tok.eos_token
    lm = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.float32)
    lm.eval()
    # device=-1 pins the pipeline to CPU.
    return pipeline("text-generation", model=lm, tokenizer=tok, device=-1)
|
|
|
|
def build_prompt(model_label, scene, viewpoint, output_mode):
    """Compose the text prompt sent to the selected model.

    Instruct-tuned models (those in INSTRUCT_MODEL_LABELS) receive a detailed
    briefing that includes viewpoint and output-mode guidance; any other model
    receives a terse completion-style stub.
    """
    scene = scene.strip()
    # Look both guides up before branching: an unknown viewpoint or mode
    # raises KeyError regardless of which prompt shape is used.
    guide_for_viewpoint = VIEWPOINT_GUIDES[viewpoint]
    guide_for_mode = MODE_GUIDES[output_mode]

    if model_label not in INSTRUCT_MODEL_LABELS:
        # Plain completion models do better with a short cue than with
        # instructions they were never trained to follow.
        return f"{viewpoint.title()} {output_mode}.\nScene: {scene}\nDescription:"

    sections = [
        "You are a careful visual scene description assistant for a student "
        "research project.",
        "Describe the same scene from a selected viewpoint. The important question "
        "is not just camera vocabulary; explain what becomes visible, hidden, "
        "larger, smaller, foregrounded, or backgrounded because of the viewpoint.",
        "",
        f"Viewpoint: {viewpoint}",
        f"Viewpoint guidance: {guide_for_viewpoint}",
        f"Output mode: {output_mode}",
        f"Output guidance: {guide_for_mode}",
        f"Scene: {scene}",
        "",
        "Write the response now:",
    ]
    return "\n".join(sections)
|
|
|
|
def call_model(model_label, final_prompt, temperature, top_p, max_new_tokens):
    """Run one sampled generation and return the stripped completion text.

    Sliders arrive as floats/ints from Gradio; they are coerced here.
    Temperature is clamped to at least 0.05 so sampling stays valid.
    """
    gen = load_generator(model_label)
    eos = gen.tokenizer.eos_token_id
    outputs = gen(
        final_prompt,
        max_new_tokens=int(max_new_tokens),
        temperature=max(float(temperature), 0.05),
        top_p=float(top_p),
        do_sample=True,
        repetition_penalty=1.08,
        # Only the newly generated continuation, not the prompt echo.
        return_full_text=False,
        pad_token_id=eos,
        eos_token_id=eos,
    )
    text = outputs[0]["generated_text"].strip()
    if not text:
        return "(The model returned an empty response. Try more tokens.)"
    return text
|
|
|
|
def generate_viewpoint(
    model_label,
    scene,
    viewpoint,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Handle the single-viewpoint tab.

    Returns a 3-tuple matching the tab's output widgets:
    (generated text, prompt that was sent, run note). On error the first
    element carries the error message and the note carries a recovery hint.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", "", ""

    prompt = build_prompt(model_label, scene, viewpoint, output_mode)
    t0 = time.perf_counter()
    try:
        text = call_model(model_label, prompt, temperature, top_p, max_new_tokens)
    except Exception as exc:
        # Surface the failure in the UI instead of crashing the event handler.
        return (
            f"Error while running the model: {exc}",
            prompt,
            "Try the fast model first, or reduce max tokens.",
        )

    seconds = time.perf_counter() - t0
    note = (
        f"Model: {MODEL_OPTIONS[model_label]}\n"
        f"Elapsed: {seconds:.1f} seconds\n"
        "First use can be slower because the model has to download and load."
    )
    return text, prompt, note
|
|
|
|
def make_paper_notes(scene, outputs_text):
    """Return a fixed findings checklist headed by the scene description.

    A blank or missing scene falls back to the phrase "the tested scene".
    Note: outputs_text is accepted for interface compatibility but is not
    used anywhere in the generated checklist.
    """
    scene_line = (scene or "").strip() or "the tested scene"
    intro = f"Paper notes for: {scene_line}\n\n"
    checks = (
        "Use these checks while reading the outputs:\n\n"
        "1. Visibility: Which objects become visible or hidden in each viewpoint?\n"
        "2. Occlusion: Does the model notice when one object blocks another?\n"
        "3. Scale: Does low angle or close-up change perceived size or importance?\n"
        "4. Layout: Does bird's-eye or wide shot explain spatial relationships?\n"
        "5. Specificity: Does the model describe this scene, or could the paragraph "
        "fit almost any scene?\n"
        "6. Finding sentence: Write one cautious sentence about whether the model "
        "understands viewpoint consequences or only uses camera-angle words.\n\n"
    )
    wording = (
        "Useful wording for the paper:\n"
        "In this small test, the model was strongest when ____. It was weakest "
        "when ____. The clearest limitation was ____."
    )
    return intro + checks + wording
|
|
|
|
def run_five_viewpoints(model_label, scene, output_mode, temperature, top_p, max_new_tokens):
    """Generate the shared scene from all five viewpoints in sequence.

    Returns a 2-tuple matching the tab's output widgets:
    (markdown with one "## Viewpoint" section per angle, paper notes with the
    elapsed time appended). On error the first element is the error message
    and the second a recovery hint.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", ""

    t0 = time.perf_counter()
    rendered = []
    try:
        for vp in FIVE_VIEWPOINTS:
            prompt = build_prompt(model_label, scene, vp, output_mode)
            text = call_model(model_label, prompt, temperature, top_p, max_new_tokens)
            rendered.append(f"## {vp.title()}\n\n{text}")
    except Exception as exc:
        # Abort the whole batch on the first failure and report it in the UI.
        return (
            f"Error while running the five-viewpoint test: {exc}",
            "Try the fast model first, or reduce max tokens.",
        )

    seconds = time.perf_counter() - t0
    combined = "\n\n---\n\n".join(rendered)
    notes = make_paper_notes(scene, combined) + f"\n\nElapsed: {seconds:.1f} seconds."
    return combined, notes
|
|
|
|
def notes_from_pasted_outputs(scene, pasted_outputs):
    """Build paper notes for outputs the user pasted by hand.

    Returns a reminder string when nothing (or only whitespace) was pasted;
    otherwise delegates to make_paper_notes.
    """
    if pasted_outputs and pasted_outputs.strip():
        return make_paper_notes(scene, pasted_outputs)
    return "Paste your generated outputs first."
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: three tabs sharing the helper functions defined above.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Camera Angle Model Lab", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Camera Angle Model Lab\n"
        "CPU-only viewpoint lab for testing how small language models describe "
        "the same scene from different visual perspectives. No API tokens or paid "
        "compute required. The first run may take a minute while the model loads."
    )

    # Tab 1: generate one viewpoint at a time and inspect the exact prompt.
    with gr.Tab("Single Viewpoint Writer"):
        with gr.Row():
            model_one = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            viewpoint_one = gr.Dropdown(
                choices=list(VIEWPOINT_GUIDES.keys()),
                value="close-up",
                label="Viewpoint",
            )
            mode_one = gr.Dropdown(
                choices=list(MODE_GUIDES.keys()),
                value="visual analysis paragraph",
                label="Output mode",
            )

        scene_one = gr.Textbox(
            label="Scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )

        # Sampling controls forwarded straight to call_model.
        with gr.Row():
            temperature_one = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
            top_p_one = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_one = gr.Slider(40, 170, value=100, step=10, label="Max new tokens")

        run_one = gr.Button("Generate", variant="primary")
        output_one = gr.Textbox(label="Generated output", lines=10)
        prompt_sent_one = gr.Textbox(label="Prompt sent to model", lines=8)
        note_one = gr.Textbox(label="Run note", lines=3)

        # Inputs/outputs must stay aligned with generate_viewpoint's
        # parameter order and 3-tuple return.
        run_one.click(
            fn=generate_viewpoint,
            inputs=[
                model_one,
                scene_one,
                viewpoint_one,
                mode_one,
                temperature_one,
                top_p_one,
                max_tokens_one,
            ],
            outputs=[output_one, prompt_sent_one, note_one],
        )

        # One click-to-fill example per viewpoint/mode combination.
        gr.Examples(
            examples=[
                ["A dog hides under a kitchen table while a child looks for it.", "close-up", "visual analysis paragraph"],
                ["A crowded city street after rain reflects neon signs in puddles.", "bird's-eye view", "cinematic shot description"],
                ["A soccer player prepares to take a penalty kick while the goalkeeper waits.", "low angle", "storyboard note"],
                ["A person stands at the edge of a forest path holding a lantern.", "over-the-shoulder", "image prompt helper"],
                ["A museum gallery contains one bright painting at the far end of the room.", "wide shot", "photography caption"],
            ],
            inputs=[scene_one, viewpoint_one, mode_one],
        )

    # Tab 2: run the same scene through all five viewpoints in one batch.
    with gr.Tab("Five-Viewpoint Test"):
        model_grid = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value=DEFAULT_MODEL,
            label="Model",
        )
        scene_grid = gr.Textbox(
            label="Shared scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        mode_grid = gr.Dropdown(
            choices=list(MODE_GUIDES.keys()),
            value="visual analysis paragraph",
            label="Output mode",
        )
        with gr.Row():
            temperature_grid = gr.Slider(0.1, 1.5, value=0.6, step=0.1, label="Temperature")
            top_p_grid = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_grid = gr.Slider(40, 140, value=80, step=10, label="Max new tokens")

        run_grid = gr.Button("Run Five Viewpoints", variant="primary")
        grid_output = gr.Markdown(label="Five-viewpoint output")
        grid_notes = gr.Textbox(label="Paper notes", lines=14)

        run_grid.click(
            fn=run_five_viewpoints,
            inputs=[
                model_grid,
                scene_grid,
                mode_grid,
                temperature_grid,
                top_p_grid,
                max_tokens_grid,
            ],
            outputs=[grid_output, grid_notes],
        )

    # Tab 3: turn manually pasted outputs into a findings checklist
    # (no model call involved).
    with gr.Tab("Paper Notes Helper"):
        scene_notes = gr.Textbox(
            label="Scene being tested",
            lines=3,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        pasted_outputs = gr.Textbox(
            label="Paste generated outputs here",
            lines=12,
            placeholder="Paste close-up, wide shot, bird's-eye, low angle, and over-the-shoulder outputs here.",
        )
        run_notes = gr.Button("Make Paper Notes", variant="primary")
        paper_notes = gr.Textbox(label="Checklist for findings section", lines=14)

        run_notes.click(
            fn=notes_from_pasted_outputs,
            inputs=[scene_notes, pasted_outputs],
            outputs=paper_notes,
        )

    gr.Markdown(
        "### Duplication note\n"
        "This Space uses only local CPU models. No tokens, API keys, or paid "
        "hardware are required. Students can duplicate it and edit the viewpoints, "
        "output modes, examples, or model list."
    )
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server only when run as a script (not when imported).
    demo.launch()
|
|