# Ovis-Image / app_old.py
# tchung1970's picture
# Consolidate to single app.py entry point
# d41a998
import os
import torch
import gradio as gr
import spaces
import random
import numpy as np
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download
from diffusers.utils import logging
from PIL import Image
from ovis_image.model.tokenizer import build_ovis_tokenizer
from ovis_image.model.autoencoder import load_ae
from ovis_image.model.hf_embedder import OvisEmbedder
from ovis_image.model.model import OvisImageModel
from ovis_image.sampling import generate_image
from ovis_image import ovis_image_configs
# ---------------------------------------------------------------------------
# Module-level setup: everything in this section runs once, at import time.
# It downloads the checkpoints and builds the four objects that infer() uses:
# ovis_image, autoencoder, ovis_tokenizer and ovis_encoder.
# ---------------------------------------------------------------------------

# Set the diffusers logger to error-level verbosity.
logging.set_verbosity_error()

# Largest signed 32-bit integer; used as the upper bound for seeds.
MAX_SEED = np.iinfo(np.int32).max

# NOTE(review): the device is fixed to CUDA with no CPU fallback -- presumably
# because inference always runs under @spaces.GPU; confirm before running this
# file on a machine without a GPU.
device = "cuda"
_dtype = torch.bfloat16

# Hugging Face access token from the environment (None when HF_TOKEN is unset).
hf_token = os.getenv("HF_TOKEN")

# --- Image model -----------------------------------------------------------
print("init ovis_image")
model_config = ovis_image_configs["ovis-image-7b"]
ovis_image = OvisImageModel(model_config)
ovis_image_path = hf_hub_download(
    repo_id="AIDC-AI/Ovis-Image-7B",
    filename="ovis_image.safetensors",
    token=hf_token,
)
model_state_dict = load_file(ovis_image_path)
missing_keys, unexpected_keys = ovis_image.load_state_dict(model_state_dict)
print(f"Load Missing Keys {missing_keys}")
print(f"Load Unexpected Keys {unexpected_keys}")
# The checkpoint dict is not needed once its weights are in the model. Drop
# the module-level reference so the extra copy of the weights can be freed
# instead of staying alive for the lifetime of the process.
del model_state_dict
ovis_image = ovis_image.to(device=device, dtype=_dtype)
ovis_image.eval()

# --- Autoencoder -----------------------------------------------------------
print("init vae")
vae_path = hf_hub_download(
    repo_id="AIDC-AI/Ovis-Image-7B",
    filename="ae.safetensors",
    token=hf_token,
)
autoencoder = load_ae(
    vae_path,
    model_config.autoencoder_params,
    device=device,
    dtype=_dtype,
    random_init=False,
)
autoencoder.eval()

# --- Tokenizer and text encoder --------------------------------------------
# Both are built from the repo id "AIDC-AI/Ovis2.5-2B" rather than from a
# file downloaded here with hf_hub_download.
print("init ovis")
ovis_tokenizer = build_ovis_tokenizer(
    "AIDC-AI/Ovis2.5-2B",
)
# NOTE(review): unlike ovis_image and autoencoder, the encoder is not switched
# to eval mode here -- confirm OvisEmbedder takes care of that itself.
ovis_encoder = OvisEmbedder(
    model_path="AIDC-AI/Ovis2.5-2B",
    random_init=False,
    low_cpu_mem_usage=True,
    torch_dtype=_dtype,
).to(device=device, dtype=_dtype)
# Sample prompts. Within this file only examples[0] is read, as the initial
# value of the prompt textbox.
examples = [
    "Five shimmering goldfish weave through crevices between stones; four are red-and-white, while one is silver-white. By the pond's edge, a golden shaded British Shorthair cat watches them intently, counting on blind luck. Watercolor style.",
    "Solar punk vehicle in a bustling city",
    "An anthropomorphic cat riding a Harley Davidson in Arizona with sunglasses and a leather jacket",
    "An elderly woman poses for a high fashion photoshoot in colorful, patterned clothes with a cyberpunk 2077 vibe",
]
def get_image_size(aspect_ratio):
    """Return the ``(width, height)`` encoded in an aspect-ratio label.

    Labels look like ``"16:9 (1024x576)"``: the pixel size is taken from the
    ``WIDTHxHEIGHT`` text between the first ``(`` and the ``)`` that follows.

    Args:
        aspect_ratio: Label string such as ``"4:3 (1024x768)"``. Values that
            are not strings (for example ``None``) are tolerated.

    Returns:
        A ``(width, height)`` tuple of positive ints. ``(1024, 1024)`` is
        returned whenever the label is not a string, carries no parsable
        ``WIDTHxHEIGHT`` part, or names a non-positive dimension.
    """
    default_size = (1024, 1024)

    # Guard first: the ``in`` checks below would raise TypeError on None.
    if not isinstance(aspect_ratio, str):
        return default_size
    if "(" not in aspect_ratio or "x" not in aspect_ratio:
        return default_size

    # "(" is known to be present, so index 1 always exists.
    resolution = aspect_ratio.split("(")[1].split(")")[0]
    try:
        # ValueError covers both a wrong number of "x"-separated parts and
        # parts that are not integers.
        width_text, height_text = resolution.split("x")
        width, height = int(width_text), int(height_text)
    except ValueError:
        return default_size

    # A zero or negative dimension can never describe an image.
    if width <= 0 or height <= 0:
        return default_size
    return width, height
# Custom stylesheet handed to gr.Blocks(css=...). It restyles typography,
# buttons, the input/output cards and dark mode, and pins a fixed 550px input
# column next to a flexible output column.
# The light-mode `.input-section .main-title` colour is the dark title colour
# (#1d1d1f): the input card has a white background, so white text there is
# unreadable. Dark mode keeps its own, more specific
# `.dark .input-section .main-title` override.
apple_css = """
/* Global Styles */
.gradio-container {
max-width: 85vw !important;
margin: 0 auto !important;
padding: 48px 20px !important;
font-family: -apple-system, BlinkMacSystemFont, 'Inter', 'Segoe UI', 'Roboto', sans-serif !important;
}
/* Disable all transitions globally to prevent layout shifts */
* {
transition: none !important;
animation: none !important;
}
/* Header */
.header-container {
text-align: left;
margin-bottom: 24px;
}
.main-title {
font-size: 32px !important;
font-weight: 600 !important;
letter-spacing: -0.02em !important;
line-height: 1.07 !important;
color: #1d1d1f !important;
margin: 0 0 16px 0 !important;
}
.subtitle {
font-size: 21px !important;
font-weight: 400 !important;
line-height: 1.38 !important;
color: #6e6e73 !important;
margin: 0 0 24px 0 !important;
}
.attribution-link {
display: inline-block;
font-size: 14px !important;
color: #0071e3 !important;
text-decoration: none !important;
font-weight: 400 !important;
transition: color 0.2s ease !important;
}
.attribution-link:hover {
color: #0077ed !important;
text-decoration: underline !important;
}
/* Input Section */
.input-section {
background: #ffffff;
border-radius: 18px;
padding: 32px;
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.08);
}
/* Textbox */
textarea {
font-size: 17px !important;
line-height: 1.47 !important;
border-radius: 12px !important;
border: 1px solid #d2d2d7 !important;
padding: 12px 16px !important;
background: #ffffff !important;
font-family: -apple-system, BlinkMacSystemFont, 'Inter', sans-serif !important;
min-height: 200px !important;
max-height: 400px !important;
height: 200px !important;
resize: vertical !important;
overflow-y: auto !important;
margin-bottom: 16px !important;
}
textarea:focus {
border-color: #0071e3 !important;
box-shadow: 0 0 0 4px rgba(0, 113, 227, 0.15) !important;
outline: none !important;
}
textarea::placeholder {
color: #86868b !important;
}
/* Button */
button.primary {
font-size: 17px !important;
font-weight: 400 !important;
padding: 12px 32px !important;
border-radius: 980px !important;
background: #0071e3 !important;
border: none !important;
color: #ffffff !important;
min-height: 44px !important;
letter-spacing: -0.01em !important;
cursor: pointer !important;
}
button.primary:hover {
background: #0077ed !important;
}
button.primary:active {
opacity: 0.9 !important;
}
/* Output Section */
div.output-section {
background: #ffffff;
border-radius: 18px;
padding: 32px;
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.08);
overflow: hidden;
display: flex;
align-items: center;
justify-content: center;
min-height: 80vh;
max-height: 90vh;
will-change: auto;
position: relative;
}
.output-section * {
transform: none !important;
transition: none !important;
animation: none !important;
}
.output-section img {
border-radius: 12px !important;
max-width: 100% !important;
max-height: 85vh !important;
width: auto !important;
height: auto !important;
object-fit: contain !important;
transform: none !important;
transition: none !important;
animation: none !important;
backface-visibility: hidden;
-webkit-backface-visibility: hidden;
}
/* Make progress/generation area fill more space */
.output-section > div {
width: 100% !important;
min-height: 75vh !important;
max-height: 85vh !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
}
.output-section > div > div {
min-height: 75vh !important;
max-height: 85vh !important;
width: 100% !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
}
.output-section * {
max-width: 100% !important;
}
/* Footer */
.footer-text {
text-align: center;
margin-top: 48px;
font-size: 14px !important;
color: #86868b !important;
line-height: 1.43 !important;
}
/* Progress */
.progress-bar {
background: #0071e3 !important;
border-radius: 4px !important;
}
/* Dark Mode */
.dark .main-title {
color: #ffffff !important;
}
.dark .subtitle {
color: #a1a1a6 !important;
}
.input-section .main-title {
color: #1d1d1f !important;
}
.dark .input-section .main-title {
color: #f5f5f7 !important;
}
.dark .input-section,
.dark .output-section {
background: #1d1d1f;
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.4);
}
.dark textarea {
background: #1d1d1f !important;
border-color: #424245 !important;
color: #f5f5f7 !important;
}
.dark textarea::placeholder {
color: #86868b !important;
}
/* Inline labels */
label.inline-label {
display: flex !important;
align-items: center !important;
min-width: 120px !important;
margin: 0 !important;
padding: 0 12px 0 0 !important;
font-weight: 400 !important;
font-size: 14px !important;
color: #1d1d1f !important;
}
/* Fix column width to prevent shrinking - target Gradio's generated structure */
.input-section {
min-width: 550px !important;
max-width: 550px !important;
width: 550px !important;
flex-shrink: 0 !important;
flex-grow: 0 !important;
}
/* Lock the output section to fill remaining space */
.output-section {
flex-grow: 1 !important;
flex-shrink: 0 !important;
flex-basis: auto !important;
}
/* Prevent Gradio columns from flexing */
.gradio-column {
flex-shrink: 0 !important;
}
/* Stabilize row layout - force horizontal layout with maximum specificity */
.gradio-row,
div.gradio-row,
.gradio-container .gradio-row,
.gradio-container > .gradio-row,
.gradio-container div.gradio-row {
align-items: flex-start !important;
flex-direction: row !important;
display: flex !important;
flex-wrap: nowrap !important;
width: 100% !important;
}
/* Force columns to stay inline */
.gradio-row > .gradio-column,
.gradio-row > div {
display: inline-flex !important;
vertical-align: top !important;
}
/* First column - input section */
.gradio-row > .gradio-column:first-child,
.gradio-row > div:first-child {
width: 550px !important;
min-width: 550px !important;
max-width: 550px !important;
flex: 0 0 550px !important;
}
/* Second column - output section */
.gradio-row > .gradio-column:last-child,
.gradio-row > div:last-child {
flex: 1 1 auto !important;
min-width: 0 !important;
}
/* Lock textbox container size */
.input-section .gr-textbox,
.input-section label[for] {
width: 100% !important;
}
/* Prevent form from expanding */
.input-section form {
width: 100% !important;
max-width: 100% !important;
}
/* Ensure seed input always visible */
.input-section input[type="number"] {
display: block !important;
visibility: visible !important;
}
/* Hide progress indicator in input section - target specific progress elements */
.input-section .progress-container,
.input-section [class*="progress-bar"],
.input-section [class*="progress-text"],
.input-section [class*="progress-level"],
.input-section .progress,
.input-section .eta-bar {
display: none !important;
visibility: hidden !important;
height: 0 !important;
overflow: hidden !important;
}
/* Override ALL responsive behavior - force horizontal layout at ALL viewport sizes */
@media (max-width: 2000px) {
.gradio-row,
div.gradio-row,
.gradio-container .gradio-row,
.gradio-container > .gradio-row {
flex-direction: row !important;
flex-wrap: nowrap !important;
display: flex !important;
}
.gradio-row > .gradio-column,
.gradio-row > div {
display: inline-flex !important;
}
.gradio-row > .gradio-column:first-child,
.gradio-row > div:first-child {
width: 550px !important;
min-width: 550px !important;
max-width: 550px !important;
flex: 0 0 550px !important;
}
.gradio-row > .gradio-column:last-child,
.gradio-row > div:last-child {
flex: 1 1 auto !important;
min-width: 0 !important;
}
}
/* Responsive text sizing only */
@media (max-width: 734px) {
.main-title {
font-size: 40px !important;
}
.subtitle {
font-size: 19px !important;
}
.gradio-container {
padding: 32px 16px !important;
}
.input-section,
.output-section {
padding: 24px !important;
}
/* FORCE horizontal layout even on mobile */
.gradio-row,
div.gradio-row {
flex-direction: row !important;
flex-wrap: nowrap !important;
}
}
/* Remove default Gradio styling */
.contain {
padding: 0 !important;
}
/* Hide Gradio footer */
footer {
display: none !important;
}
.footer {
display: none !important;
}
/* Target main app container */
#root, #app {
width: 100% !important;
max-width: none !important;
}
"""
# JavaScript snippet handed to gr.Blocks(js=...). It defines
# forceHorizontalLayout(), which writes inline styles to keep the page in a
# two-column row: the container is set to 85vw, '#main-row' and every
# '.gradio-row' become non-wrapping flex rows, '#input-column' is pinned to
# 550px and '#output-column' takes the remaining width. The function is called
# immediately, again after 100/500/1000/2000 ms, and from a MutationObserver
# on document.body.
# NOTE(review): the observer listens for 'style' and 'class' attribute changes
# across the whole subtree while its own callback writes inline styles --
# confirm this cannot re-trigger itself continuously in the target browsers.
# NOTE(review): in the "fallback" loop, `index` counts across every matched
# element in the document, not per row, so only the first two matches overall
# receive the width/flex treatment.
js_code = """
function() {
function forceHorizontalLayout() {
// Set container width
const container = document.querySelector('.gradio-container');
if (container) {
container.style.maxWidth = '85vw';
container.style.width = '85vw';
}
// Target the main row specifically
const mainRow = document.getElementById('main-row');
if (mainRow) {
mainRow.style.flexDirection = 'row';
mainRow.style.flexWrap = 'nowrap';
mainRow.style.display = 'flex';
mainRow.style.width = '100%';
}
// Force ALL rows to stay horizontal
const rows = document.querySelectorAll('.gradio-row');
rows.forEach(row => {
row.style.flexDirection = 'row';
row.style.flexWrap = 'nowrap';
row.style.display = 'flex';
});
// Target specific columns
const inputCol = document.getElementById('input-column');
if (inputCol) {
inputCol.style.width = '550px';
inputCol.style.minWidth = '550px';
inputCol.style.maxWidth = '550px';
inputCol.style.flex = '0 0 550px';
inputCol.style.display = 'inline-flex';
inputCol.style.flexDirection = 'column';
}
const outputCol = document.getElementById('output-column');
if (outputCol) {
outputCol.style.flex = '1 1 auto';
outputCol.style.minWidth = '0';
outputCol.style.display = 'inline-flex';
outputCol.style.flexDirection = 'column';
}
// Fallback: force all column children of rows
const columns = document.querySelectorAll('.gradio-row > .gradio-column, .gradio-row > div');
columns.forEach((col, index) => {
if (index === 0) {
col.style.width = '550px';
col.style.minWidth = '550px';
col.style.maxWidth = '550px';
col.style.flex = '0 0 550px';
} else if (index === 1) {
col.style.flex = '1 1 auto';
col.style.minWidth = '0';
}
col.style.display = 'inline-flex';
});
}
// Run immediately
forceHorizontalLayout();
// Run again after delays to override Gradio's dynamic changes
setTimeout(forceHorizontalLayout, 100);
setTimeout(forceHorizontalLayout, 500);
setTimeout(forceHorizontalLayout, 1000);
setTimeout(forceHorizontalLayout, 2000);
// Set up mutation observer to reapply on DOM changes
const observer = new MutationObserver(forceHorizontalLayout);
observer.observe(document.body, { childList: true, subtree: true, attributes: true, attributeFilter: ['style', 'class'] });
}
"""
@spaces.GPU(duration=75)
def infer(
    prompt,
    seed=42,
    randomize_seed=False,
    aspect_ratio="1:1 (1024x1024)",
    guidance_scale=5.0,
    num_inference_steps=50,
    progress=gr.Progress(track_tqdm=True),
):
    """Run one text-to-image generation and return the picture with its seed.

    Args:
        prompt: Text description of the image to generate.
        seed: Seed passed to the sampler; replaced when ``randomize_seed`` is set.
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        aspect_ratio: Label such as ``"16:9 (1024x576)"``; resolved to a pixel
            size by ``get_image_size``.
        guidance_scale: Forwarded to ``generate_image`` as ``cfg_scale``.
        num_inference_steps: Forwarded to ``generate_image`` as ``denoising_steps``.
        progress: Not referenced in the body; presumably present so Gradio can
            mirror tqdm progress -- confirm against the Gradio docs.

    Returns:
        ``(image, seed)``: the first image of the generated batch as a uint8
        NumPy array, and the seed that was actually used.
    """
    seed = random.randint(0, MAX_SEED) if randomize_seed else seed
    width, height = get_image_size(aspect_ratio)
    print(f'inference with prompt: {prompt}, size: {height}x{width}, seed: {seed}, steps: {num_inference_steps}, cfg: {guidance_scale}')

    # Sample on whichever device the model weights currently live on.
    model_device = next(ovis_image.parameters()).device
    batch = generate_image(
        device=model_device,
        dtype=_dtype,
        model=ovis_image,
        prompt=prompt,
        autoencoder=autoencoder,
        ovis_tokenizer=ovis_tokenizer,
        ovis_encoder=ovis_encoder,
        img_height=height,
        img_width=width,
        denoising_steps=num_inference_steps,
        cfg_scale=guidance_scale,
        seed=seed,
    )

    # Convert to a uint8 NumPy array: clamp, move to CPU, move axis 1 to the
    # end (presumably channels-first -> channels-last; confirm), cast to float.
    # NOTE(review): values are clamped to [-1, 1] but then scaled by 255 with
    # no offset; negative values would not map to valid uint8 pixels -- confirm
    # the value range that generate_image returns.
    pixels = batch.clamp(-1, 1).cpu().permute(0, 2, 3, 1).float().numpy()
    frames = (pixels * 255).round().astype("uint8")
    return frames[0], seed
# Static header markup rendered above the prompt box.
_HEADER_HTML = """
<div class="header-container">
<h1 class="main-title">Ovis-Image</h1>
</div>
"""

# Labels offered by the aspect-ratio dropdown; get_image_size() reads the
# WIDTHxHEIGHT part in parentheses.
_ASPECT_RATIO_CHOICES = [
    "1:1 (1024x1024)",
    "4:3 (1024x768)",
    "3:4 (768x1024)",
    "16:9 (1024x576)",
    "9:16 (576x1024)",
]

# Colour, border and shadow overrides applied on top of the Soft theme.
_THEME_OVERRIDES = {
    "body_background_fill": "#f5f5f7",
    "body_background_fill_dark": "#000000",
    "button_primary_background_fill": "#0071e3",
    "button_primary_background_fill_hover": "#0077ed",
    "button_primary_text_color": "#ffffff",
    "block_background_fill": "#ffffff",
    "block_background_fill_dark": "#1d1d1f",
    "block_border_width": "0px",
    "block_shadow": "0 2px 12px rgba(0, 0, 0, 0.08)",
    "block_shadow_dark": "0 2px 12px rgba(0, 0, 0, 0.4)",
    "input_background_fill": "#ffffff",
    "input_background_fill_dark": "#1d1d1f",
    "input_border_width": "1px",
    "input_border_color": "#d2d2d7",
    "input_border_color_dark": "#424245",
    "input_shadow": "none",
    "input_shadow_focus": "0 0 0 4px rgba(0, 113, 227, 0.15)",
}

_THEME = gr.themes.Soft(
    primary_hue=gr.themes.colors.blue,
    secondary_hue=gr.themes.colors.slate,
    neutral_hue=gr.themes.colors.gray,
    spacing_size=gr.themes.sizes.spacing_lg,
    radius_size=gr.themes.sizes.radius_lg,
    text_size=gr.themes.sizes.text_md,
    font=[
        gr.themes.GoogleFont("Inter"),
        "SF Pro Display",
        "-apple-system",
        "BlinkMacSystemFont",
        "system-ui",
        "sans-serif",
    ],
    font_mono=[
        gr.themes.GoogleFont("JetBrains Mono"),
        "SF Mono",
        "ui-monospace",
        "monospace",
    ],
).set(**_THEME_OVERRIDES)

with gr.Blocks(
    title="Ovis-Image",
    fill_height=False,
    theme=_THEME,
    css=apple_css,
    js=js_code,
) as demo:
    # One row, two columns. The original author notes that variant="panel" is
    # meant to keep the columns from stacking responsively.
    with gr.Row(equal_height=False, variant="panel", elem_id="main-row"):
        # Left: input controls in a 550px-minimum, non-growing column.
        with gr.Column(
            scale=0,
            min_width=550,
            elem_classes="input-section",
            elem_id="input-column",
        ):
            gr.HTML(_HEADER_HTML)
            prompt = gr.Textbox(
                label="Prompt",
                show_label=True,
                placeholder="Describe the image you want to create...",
                value=examples[0],
                lines=7,
                max_lines=7,
                container=True,
                autoscroll=False,
            )
            aspect_ratio = gr.Dropdown(
                label="Aspect Ratio",
                show_label=True,
                choices=_ASPECT_RATIO_CHOICES,
                value=_ASPECT_RATIO_CHOICES[0],
                container=True,
            )
            run_button = gr.Button(
                "Generate",
                variant="primary",
                size="lg",
                elem_classes="primary",
            )

            # Advanced settings are created with visible=False: they are not
            # shown, but their values are still passed to infer() as inputs.
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
                visible=False,
            )
            randomize_seed = gr.Checkbox(
                label="Randomize seed",
                value=True,
                visible=False,
            )
            guidance_scale = gr.Slider(
                label="Guidance scale",
                minimum=0.0,
                maximum=14.0,
                step=0.1,
                value=5.0,
                visible=False,
            )
            num_inference_steps = gr.Slider(
                label="Number of inference steps",
                minimum=1,
                maximum=100,
                step=1,
                value=50,
                visible=False,
            )

        # Right: generated image.
        with gr.Column(
            scale=2,
            elem_classes="output-section",
            elem_id="output-column",
        ):
            result = gr.Image(
                label="Result",
                show_label=False,
                type="numpy",
                format="png",
            )

    # Clicking "Generate" or submitting the prompt box both run infer(). Its
    # second return value (the seed actually used) is written back to the
    # hidden seed slider.
    infer_inputs = [
        prompt,
        seed,
        randomize_seed,
        aspect_ratio,
        guidance_scale,
        num_inference_steps,
    ]
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=infer_inputs,
        outputs=[result, seed],
    )
# Launch the Gradio app only when this file is executed as a script.
if __name__ == '__main__':
    demo.launch()