# Hugging Face Space file-page residue, kept as comments so the file parses:
# profplate's picture
# Create app.py
# 7bb5bc1 verified
from functools import lru_cache
import time
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# UI dropdown label -> Hugging Face model repo id for each selectable model.
MODEL_OPTIONS = {
    "SmolLM2 360M Instruct (best default)": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2 135M Instruct (fast)": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "distilgpt2 (baseline)": "distilgpt2",
}
# Label pre-selected in every model dropdown below.
DEFAULT_MODEL = "SmolLM2 360M Instruct (best default)"
# Labels of instruction-tuned models. build_prompt() gives these the detailed
# instruction prompt; any other label gets a short completion-style stub.
INSTRUCT_MODEL_LABELS = {
    "SmolLM2 360M Instruct (best default)",
    "SmolLM2 135M Instruct (fast)",
}
# Viewpoint name -> guidance sentence injected into the instruct prompt.
# Keys also populate the "Viewpoint" dropdown in the first tab.
VIEWPOINT_GUIDES = {
    "close-up": (
        "Focus on nearby detail, texture, facial expression, small objects, and "
        "what is cropped out or hidden by the tight framing."
    ),
    "wide shot": (
        "Focus on layout, background, scale, distance between objects, and how "
        "the whole scene is arranged."
    ),
    "bird's-eye view": (
        "Describe the scene from above. Focus on map-like layout, paths, shapes, "
        "and what becomes visible only from overhead."
    ),
    "low angle": (
        "Describe the scene from below. Focus on height, scale, foreground, "
        "dominance, sky or ceiling, and what is hidden behind tall objects."
    ),
    "over-the-shoulder": (
        "Describe what is visible from behind one character or object. Focus on "
        "foreground shoulder/frame, partial visibility, and what the viewer can "
        "infer but not fully see."
    ),
}
# Output-mode name -> guidance sentence injected into the instruct prompt.
# Keys also populate the "Output mode" dropdowns.
MODE_GUIDES = {
    "cinematic shot description": (
        "Write like a film shot description, emphasizing framing, movement, and "
        "what the viewer sees first."
    ),
    "photography caption": (
        "Write like a precise photography caption, emphasizing composition and "
        "visible details."
    ),
    "storyboard note": (
        "Write like a storyboard note for an artist, naming visual beats and "
        "spatial relationships."
    ),
    "image prompt helper": (
        "Write a detailed image-generation prompt that makes the viewpoint and "
        "composition explicit."
    ),
    "visual analysis paragraph": (
        "Write an analytical paragraph explaining how the viewpoint changes "
        "what is visible and what is hidden."
    ),
}
# Fixed viewpoint order used by the "Five-Viewpoint Test" tab.
FIVE_VIEWPOINTS = [
    "close-up",
    "wide shot",
    "bird's-eye view",
    "low angle",
    "over-the-shoulder",
]
# Keep CPU usage modest on shared Space hardware; ignore failures on builds
# where thread configuration is not supported.
try:
    torch.set_num_threads(2)
except Exception:
    pass
@lru_cache(maxsize=3)
def load_generator(model_label):
    """Load and memoize a CPU text-generation pipeline for *model_label*.

    The label is resolved through MODEL_OPTIONS; at most three pipelines are
    kept alive at once via the lru_cache decorator, so repeated runs with the
    same model skip the expensive download/load step.
    """
    repo_id = MODEL_OPTIONS[model_label]
    tok = AutoTokenizer.from_pretrained(repo_id)
    # Decoder-only models often ship without a pad token; reuse EOS so that
    # padded generation does not fail.
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    lm = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.float32)
    lm.eval()
    # device=-1 pins the pipeline to CPU.
    return pipeline("text-generation", model=lm, tokenizer=tok, device=-1)
def build_prompt(model_label, scene, viewpoint, output_mode):
    """Assemble the text prompt sent to the selected model.

    Instruction-tuned models (INSTRUCT_MODEL_LABELS) receive a detailed
    instruction block; the plain baseline model receives a terse
    completion-style stub instead.
    """
    cleaned_scene = scene.strip()
    # Look up guidance up front so an unknown viewpoint/mode fails loudly
    # regardless of which prompt shape is used.
    viewpoint_note = VIEWPOINT_GUIDES[viewpoint]
    mode_note = MODE_GUIDES[output_mode]
    if model_label in INSTRUCT_MODEL_LABELS:
        instruct_lines = [
            "You are a careful visual scene description assistant for a student "
            "research project.",
            "Describe the same scene from a selected viewpoint. The important question "
            "is not just camera vocabulary; explain what becomes visible, hidden, "
            "larger, smaller, foregrounded, or backgrounded because of the viewpoint.",
            "",
            f"Viewpoint: {viewpoint}",
            f"Viewpoint guidance: {viewpoint_note}",
            f"Output mode: {output_mode}",
            f"Output guidance: {mode_note}",
            f"Scene: {cleaned_scene}",
            "",
            "Write the response now:",
        ]
        return "\n".join(instruct_lines)
    # Plain completion models respond better to a short caption-style stub.
    return "\n".join(
        [
            f"{viewpoint.title()} {output_mode}.",
            f"Scene: {cleaned_scene}",
            "Description:",
        ]
    )
def call_model(model_label, final_prompt, temperature, top_p, max_new_tokens):
    """Run one sampled generation and return the stripped continuation text.

    Returns a placeholder message instead of an empty string when the model
    produces no visible text.
    """
    gen = load_generator(model_label)
    eos_id = gen.tokenizer.eos_token_id
    outputs = gen(
        final_prompt,
        max_new_tokens=int(max_new_tokens),
        # Clamp temperature away from zero: sampling rejects temperature <= 0.
        temperature=max(float(temperature), 0.05),
        top_p=float(top_p),
        do_sample=True,
        repetition_penalty=1.08,
        # Only the continuation is wanted, not the echoed prompt.
        return_full_text=False,
        pad_token_id=eos_id,
        eos_token_id=eos_id,
    )
    generated = outputs[0]["generated_text"].strip()
    if generated:
        return generated
    return "(The model returned an empty response. Try more tokens.)"
def generate_viewpoint(
    model_label,
    scene,
    viewpoint,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Gradio callback for the Single Viewpoint Writer tab.

    Returns a 3-tuple (generated text, prompt sent to the model, run note).
    On model failure the first slot carries the error message and the third a
    troubleshooting hint, so the UI never crashes.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", "", ""
    prompt = build_prompt(model_label, scene, viewpoint, output_mode)
    t0 = time.perf_counter()
    try:
        generated = call_model(
            model_label,
            prompt,
            temperature,
            top_p,
            max_new_tokens,
        )
    except Exception as exc:  # boundary: surface the error in the UI
        return (
            f"Error while running the model: {exc}",
            prompt,
            "Try the fast model first, or reduce max tokens.",
        )
    runtime = time.perf_counter() - t0
    run_note = (
        f"Model: {MODEL_OPTIONS[model_label]}\n"
        f"Elapsed: {runtime:.1f} seconds\n"
        "First use can be slower because the model has to download and load."
    )
    return generated, prompt, run_note
def make_paper_notes(scene, outputs_text):
    """Return the fixed findings checklist, headed by the scene description.

    NOTE(review): *outputs_text* is accepted for call-site compatibility but
    is not currently folded into the checklist text.
    """
    label = "the tested scene"
    if scene and scene.strip():
        label = scene.strip()
    checklist = (
        "Use these checks while reading the outputs:\n\n"
        "1. Visibility: Which objects become visible or hidden in each viewpoint?\n"
        "2. Occlusion: Does the model notice when one object blocks another?\n"
        "3. Scale: Does low angle or close-up change perceived size or importance?\n"
        "4. Layout: Does bird's-eye or wide shot explain spatial relationships?\n"
        "5. Specificity: Does the model describe this scene, or could the paragraph "
        "fit almost any scene?\n"
        "6. Finding sentence: Write one cautious sentence about whether the model "
        "understands viewpoint consequences or only uses camera-angle words.\n\n"
        "Useful wording for the paper:\n"
        "In this small test, the model was strongest when ____. It was weakest "
        "when ____. The clearest limitation was ____."
    )
    return f"Paper notes for: {label}\n\n{checklist}"
def run_five_viewpoints(model_label, scene, output_mode, temperature, top_p, max_new_tokens):
    """Gradio callback for the Five-Viewpoint Test tab.

    Generates one description per canonical viewpoint (FIVE_VIEWPOINTS) for
    the same scene and returns a 2-tuple (markdown of all outputs, paper-notes
    checklist). On failure the second slot carries a troubleshooting hint.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", ""
    t0 = time.perf_counter()
    rendered = []
    try:
        for vp in FIVE_VIEWPOINTS:
            prompt = build_prompt(model_label, scene, vp, output_mode)
            text = call_model(
                model_label,
                prompt,
                temperature,
                top_p,
                max_new_tokens,
            )
            rendered.append(f"## {vp.title()}\n\n{text}")
    except Exception as exc:  # boundary: keep the UI alive on model errors
        return (
            f"Error while running the five-viewpoint test: {exc}",
            "Try the fast model first, or reduce max tokens.",
        )
    runtime = time.perf_counter() - t0
    combined = "\n\n---\n\n".join(rendered)
    notes = make_paper_notes(scene, combined) + f"\n\nElapsed: {runtime:.1f} seconds."
    return combined, notes
def notes_from_pasted_outputs(scene, pasted_outputs):
    """Gradio callback: build the findings checklist from pasted outputs."""
    if pasted_outputs and pasted_outputs.strip():
        return make_paper_notes(scene, pasted_outputs)
    return "Paste your generated outputs first."
# ---------------------------------------------------------------------------
# Gradio UI: three tabs wired to the callbacks defined above.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Camera Angle Model Lab", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Camera Angle Model Lab\n"
        "CPU-only viewpoint lab for testing how small language models describe "
        "the same scene from different visual perspectives. No API tokens or paid "
        "compute required. The first run may take a minute while the model loads."
    )
    # Tab 1: one description for a single viewpoint / output-mode pair.
    with gr.Tab("Single Viewpoint Writer"):
        with gr.Row():
            model_one = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            viewpoint_one = gr.Dropdown(
                choices=list(VIEWPOINT_GUIDES.keys()),
                value="close-up",
                label="Viewpoint",
            )
            mode_one = gr.Dropdown(
                choices=list(MODE_GUIDES.keys()),
                value="visual analysis paragraph",
                label="Output mode",
            )
        scene_one = gr.Textbox(
            label="Scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        # Sampling controls forwarded unchanged to call_model().
        with gr.Row():
            temperature_one = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
            top_p_one = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_one = gr.Slider(40, 170, value=100, step=10, label="Max new tokens")
        run_one = gr.Button("Generate", variant="primary")
        output_one = gr.Textbox(label="Generated output", lines=10)
        prompt_sent_one = gr.Textbox(label="Prompt sent to model", lines=8)
        note_one = gr.Textbox(label="Run note", lines=3)
        # generate_viewpoint returns (output, prompt, note) matching `outputs`.
        run_one.click(
            fn=generate_viewpoint,
            inputs=[
                model_one,
                scene_one,
                viewpoint_one,
                mode_one,
                temperature_one,
                top_p_one,
                max_tokens_one,
            ],
            outputs=[output_one, prompt_sent_one, note_one],
        )
        # Clickable example rows that pre-fill scene, viewpoint, and mode.
        gr.Examples(
            examples=[
                ["A dog hides under a kitchen table while a child looks for it.", "close-up", "visual analysis paragraph"],
                ["A crowded city street after rain reflects neon signs in puddles.", "bird's-eye view", "cinematic shot description"],
                ["A soccer player prepares to take a penalty kick while the goalkeeper waits.", "low angle", "storyboard note"],
                ["A person stands at the edge of a forest path holding a lantern.", "over-the-shoulder", "image prompt helper"],
                ["A museum gallery contains one bright painting at the far end of the room.", "wide shot", "photography caption"],
            ],
            inputs=[scene_one, viewpoint_one, mode_one],
        )
    # Tab 2: run one shared scene through all five canonical viewpoints.
    with gr.Tab("Five-Viewpoint Test"):
        model_grid = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value=DEFAULT_MODEL,
            label="Model",
        )
        scene_grid = gr.Textbox(
            label="Shared scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        mode_grid = gr.Dropdown(
            choices=list(MODE_GUIDES.keys()),
            value="visual analysis paragraph",
            label="Output mode",
        )
        with gr.Row():
            temperature_grid = gr.Slider(0.1, 1.5, value=0.6, step=0.1, label="Temperature")
            top_p_grid = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            # Lower token cap than tab 1: this button triggers five generations.
            max_tokens_grid = gr.Slider(40, 140, value=80, step=10, label="Max new tokens")
        run_grid = gr.Button("Run Five Viewpoints", variant="primary")
        grid_output = gr.Markdown(label="Five-viewpoint output")
        grid_notes = gr.Textbox(label="Paper notes", lines=14)
        # run_five_viewpoints returns (markdown, notes) matching `outputs`.
        run_grid.click(
            fn=run_five_viewpoints,
            inputs=[
                model_grid,
                scene_grid,
                mode_grid,
                temperature_grid,
                top_p_grid,
                max_tokens_grid,
            ],
            outputs=[grid_output, grid_notes],
        )
    # Tab 3: offline helper that builds the findings checklist from pasted text.
    with gr.Tab("Paper Notes Helper"):
        scene_notes = gr.Textbox(
            label="Scene being tested",
            lines=3,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        pasted_outputs = gr.Textbox(
            label="Paste generated outputs here",
            lines=12,
            placeholder="Paste close-up, wide shot, bird's-eye, low angle, and over-the-shoulder outputs here.",
        )
        run_notes = gr.Button("Make Paper Notes", variant="primary")
        paper_notes = gr.Textbox(label="Checklist for findings section", lines=14)
        run_notes.click(
            fn=notes_from_pasted_outputs,
            inputs=[scene_notes, pasted_outputs],
            outputs=paper_notes,
        )
    gr.Markdown(
        "### Duplication note\n"
        "This Space uses only local CPU models. No tokens, API keys, or paid "
        "hardware are required. Students can duplicate it and edit the viewpoints, "
        "output modes, examples, or model list."
    )
# Launch the Gradio server when executed directly (e.g. `python app.py`).
if __name__ == "__main__":
    demo.launch()