"""Gradio demo app for text-to-image generation with Ovis-Image.

At import time this module loads the diffusion model, the autoencoder and the
text tokenizer/encoder, then builds a two-column Gradio UI (prompt controls on
the left, generated image on the right).
"""

import os
import random

import torch
import gradio as gr
import spaces
import numpy as np
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download
from diffusers.utils import logging
from PIL import Image

from ovis_image.model.tokenizer import build_ovis_tokenizer
from ovis_image.model.autoencoder import load_ae
from ovis_image.model.hf_embedder import OvisEmbedder
from ovis_image.model.model import OvisImageModel
from ovis_image.sampling import generate_image
from ovis_image import ovis_image_configs

logging.set_verbosity_error()

# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Upper bound for the seed slider and for randomly drawn seeds.
MAX_SEED = np.iinfo(np.int32).max

device = "cuda"
_dtype = torch.bfloat16
# Optional access token; os.getenv returns None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")

print("init ovis_image")
model_config = ovis_image_configs["ovis-image-7b"]
ovis_image = OvisImageModel(model_config)
ovis_image_path = hf_hub_download(
    repo_id="AIDC-AI/Ovis-Image-7B",
    filename="ovis_image.safetensors",
    token=hf_token,
)
model_state_dict = load_file(ovis_image_path)
# Report (rather than silently ignore) any key mismatch with the checkpoint.
missing_keys, unexpected_keys = ovis_image.load_state_dict(model_state_dict)
print(f"Load Missing Keys {missing_keys}")
print(f"Load Unexpected Keys {unexpected_keys}")
ovis_image = ovis_image.to(device=device, dtype=_dtype)
ovis_image.eval()

print("init vae")
vae_path = hf_hub_download(
    repo_id="AIDC-AI/Ovis-Image-7B",
    filename="ae.safetensors",
    token=hf_token,
)
autoencoder = load_ae(
    vae_path,
    model_config.autoencoder_params,
    device=device,
    dtype=_dtype,
    random_init=False,
)
autoencoder.eval()

print("init ovis")
# ovis_path = hf_hub_download(
#     repo_id="AIDC-AI/Ovis-Image-7B",
#     subfolder="Ovis2.5-2B",
#     token=hf_token,
# )
ovis_tokenizer = build_ovis_tokenizer(
    "AIDC-AI/Ovis2.5-2B",
)
ovis_encoder = OvisEmbedder(
    model_path="AIDC-AI/Ovis2.5-2B",
    random_init=False,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
).to(device=device, dtype=_dtype)

# Sample prompts; the first one pre-fills the prompt textbox.
examples = [
    "Five shimmering goldfish weave through crevices between stones; four are red-and-white, while one is silver-white. By the pond's edge, a golden shaded British Shorthair cat watches them intently, counting on blind luck. Watercolor style.",
    "Solar punk vehicle in a bustling city",
    "An anthropomorphic cat riding a Harley Davidson in Arizona with sunglasses and a leather jacket",
    "An elderly woman poses for a high fashion photoshoot in colorful, patterned clothes with a cyberpunk 2077 vibe",
]


def get_image_size(aspect_ratio):
    """Parse the pixel size out of an aspect-ratio label.

    Args:
        aspect_ratio: A label such as ``"4:3 (1024x768)"``; the size is read
            from the ``WIDTHxHEIGHT`` text between the first ``(`` and the
            following ``)``.

    Returns:
        A ``(width, height)`` tuple of ints. Falls back to ``(1024, 1024)``
        when the label has no parenthesised size or the size cannot be parsed.
    """
    if "(" in aspect_ratio and "x" in aspect_ratio:
        try:
            res_part = aspect_ratio.split("(")[1].split(")")[0]
            width, height = res_part.split("x")
            return int(width), int(height)
        except ValueError:
            # Wrong number of "x"-separated parts or a non-integer value:
            # deliberately fall through to the default size. Only ValueError
            # is caught so KeyboardInterrupt/SystemExit are not swallowed.
            pass
    return 1024, 1024


apple_css = """
/* Global Styles */
.gradio-container {
    max-width: 85vw !important;
    margin: 0 auto !important;
    padding: 48px 20px !important;
    font-family: -apple-system, BlinkMacSystemFont, 'Inter', 'Segoe UI', 'Roboto', sans-serif !important;
}

/* Disable all transitions globally to prevent layout shifts */
* {
    transition: none !important;
    animation: none !important;
}

/* Header */
.header-container {
    text-align: left;
    margin-bottom: 24px;
}

.main-title {
    font-size: 32px !important;
    font-weight: 600 !important;
    letter-spacing: -0.02em !important;
    line-height: 1.07 !important;
    color: #1d1d1f !important;
    margin: 0 0 16px 0 !important;
}

.subtitle {
    font-size: 21px !important;
    font-weight: 400 !important;
    line-height: 1.38 !important;
    color: #6e6e73 !important;
    margin: 0 0 24px 0 !important;
}

.attribution-link {
    display: inline-block;
    font-size: 14px !important;
    color: #0071e3 !important;
    text-decoration: none !important;
    font-weight: 400 !important;
    transition: color 0.2s ease !important;
}

.attribution-link:hover {
    color: #0077ed !important;
    text-decoration: underline !important;
}

/* Input Section */
.input-section {
    background: #ffffff;
    border-radius: 18px;
    padding: 32px;
    box-shadow: 0 2px 12px rgba(0, 0, 0, 0.08);
}

/* Textbox */
textarea {
    font-size: 17px !important;
    line-height: 1.47 !important;
    border-radius: 12px !important;
    border: 1px solid #d2d2d7 !important;
    padding: 12px 16px !important;
    background: #ffffff !important;
    font-family: -apple-system, BlinkMacSystemFont, 'Inter', sans-serif !important;
    min-height: 200px !important;
    max-height: 400px !important;
    height: 200px !important;
    resize: vertical !important;
    overflow-y: auto !important;
    margin-bottom: 16px !important;
}

textarea:focus {
    border-color: #0071e3 !important;
    box-shadow: 0 0 0 4px rgba(0, 113, 227, 0.15) !important;
    outline: none !important;
}

textarea::placeholder {
    color: #86868b !important;
}

/* Button */
button.primary {
    font-size: 17px !important;
    font-weight: 400 !important;
    padding: 12px 32px !important;
    border-radius: 980px !important;
    background: #0071e3 !important;
    border: none !important;
    color: #ffffff !important;
    min-height: 44px !important;
    letter-spacing: -0.01em !important;
    cursor: pointer !important;
}

button.primary:hover {
    background: #0077ed !important;
}

button.primary:active {
    opacity: 0.9 !important;
}

/* Output Section */
div.output-section {
    background: #ffffff;
    border-radius: 18px;
    padding: 32px;
    box-shadow: 0 2px 12px rgba(0, 0, 0, 0.08);
    overflow: hidden;
    display: flex;
    align-items: center;
    justify-content: center;
    min-height: 80vh;
    max-height: 90vh;
    will-change: auto;
    position: relative;
}

.output-section * {
    transform: none !important;
    transition: none !important;
    animation: none !important;
}

.output-section img {
    border-radius: 12px !important;
    max-width: 100% !important;
    max-height: 85vh !important;
    width: auto !important;
    height: auto !important;
    object-fit: contain !important;
    transform: none !important;
    transition: none !important;
    animation: none !important;
    backface-visibility: hidden;
    -webkit-backface-visibility: hidden;
}

/* Make progress/generation area fill more space */
.output-section > div {
    width: 100% !important;
    min-height: 75vh !important;
    max-height: 85vh !important;
    display: flex !important;
    align-items: center !important;
    justify-content: center !important;
}

.output-section > div > div {
    min-height: 75vh !important;
    max-height: 85vh !important;
    width: 100% !important;
    display: flex !important;
    align-items: center !important;
    justify-content: center !important;
}

.output-section * {
    max-width: 100% !important;
}

/* Footer */
.footer-text {
    text-align: center;
    margin-top: 48px;
    font-size: 14px !important;
    color: #86868b !important;
    line-height: 1.43 !important;
}

/* Progress */
.progress-bar {
    background: #0071e3 !important;
    border-radius: 4px !important;
}

/* Dark Mode */
.dark .main-title {
    color: #ffffff !important;
}

.dark .subtitle {
    color: #a1a1a6 !important;
}

.input-section .main-title {
    color: #ffffff !important;
}

.dark .input-section .main-title {
    color: #f5f5f7 !important;
}

.dark .input-section,
.dark .output-section {
    background: #1d1d1f;
    box-shadow: 0 2px 12px rgba(0, 0, 0, 0.4);
}

.dark textarea {
    background: #1d1d1f !important;
    border-color: #424245 !important;
    color: #f5f5f7 !important;
}

.dark textarea::placeholder {
    color: #86868b !important;
}

/* Inline labels */
label.inline-label {
    display: flex !important;
    align-items: center !important;
    min-width: 120px !important;
    margin: 0 !important;
    padding: 0 12px 0 0 !important;
    font-weight: 400 !important;
    font-size: 14px !important;
    color: #1d1d1f !important;
}

/* Fix column width to prevent shrinking - target Gradio's generated structure */
.input-section {
    min-width: 550px !important;
    max-width: 550px !important;
    width: 550px !important;
    flex-shrink: 0 !important;
    flex-grow: 0 !important;
}

/* Lock the output section to fill remaining space */
.output-section {
    flex-grow: 1 !important;
    flex-shrink: 0 !important;
    flex-basis: auto !important;
}

/* Prevent Gradio columns from flexing */
.gradio-column {
    flex-shrink: 0 !important;
}

/* Stabilize row layout - force horizontal layout with maximum specificity */
.gradio-row,
div.gradio-row,
.gradio-container .gradio-row,
.gradio-container > .gradio-row,
.gradio-container div.gradio-row {
    align-items: flex-start !important;
    flex-direction: row !important;
    display: flex !important;
    flex-wrap: nowrap !important;
    width: 100% !important;
}

/* Force columns to stay inline */
.gradio-row > .gradio-column,
.gradio-row > div {
    display: inline-flex !important;
    vertical-align: top !important;
}

/* First column - input section */
.gradio-row > .gradio-column:first-child,
.gradio-row > div:first-child {
    width: 550px !important;
    min-width: 550px !important;
    max-width: 550px !important;
    flex: 0 0 550px !important;
}

/* Second column - output section */
.gradio-row > .gradio-column:last-child,
.gradio-row > div:last-child {
    flex: 1 1 auto !important;
    min-width: 0 !important;
}

/* Lock textbox container size */
.input-section .gr-textbox,
.input-section label[for] {
    width: 100% !important;
}

/* Prevent form from expanding */
.input-section form {
    width: 100% !important;
    max-width: 100% !important;
}

/* Ensure seed input always visible */
.input-section input[type="number"] {
    display: block !important;
    visibility: visible !important;
}

/* Hide progress indicator in input section - target specific progress elements */
.input-section .progress-container,
.input-section [class*="progress-bar"],
.input-section [class*="progress-text"],
.input-section [class*="progress-level"],
.input-section .progress,
.input-section .eta-bar {
    display: none !important;
    visibility: hidden !important;
    height: 0 !important;
    overflow: hidden !important;
}

/* Override ALL responsive behavior - force horizontal layout at ALL viewport sizes */
@media (max-width: 2000px) {
    .gradio-row,
    div.gradio-row,
    .gradio-container .gradio-row,
    .gradio-container > .gradio-row {
        flex-direction: row !important;
        flex-wrap: nowrap !important;
        display: flex !important;
    }

    .gradio-row > .gradio-column,
    .gradio-row > div {
        display: inline-flex !important;
    }

    .gradio-row > .gradio-column:first-child,
    .gradio-row > div:first-child {
        width: 550px !important;
        min-width: 550px !important;
        max-width: 550px !important;
        flex: 0 0 550px !important;
    }

    .gradio-row > .gradio-column:last-child,
    .gradio-row > div:last-child {
        flex: 1 1 auto !important;
        min-width: 0 !important;
    }
}

/* Responsive text sizing only */
@media (max-width: 734px) {
    .main-title {
        font-size: 40px !important;
    }

    .subtitle {
        font-size: 19px !important;
    }

    .gradio-container {
        padding: 32px 16px !important;
    }

    .input-section,
    .output-section {
        padding: 24px !important;
    }

    /* FORCE horizontal layout even on mobile */
    .gradio-row,
    div.gradio-row {
        flex-direction: row !important;
        flex-wrap: nowrap !important;
    }
}

/* Remove default Gradio styling */
.contain {
    padding: 0 !important;
}

/* Hide Gradio footer */
footer {
    display: none !important;
}

.footer {
    display: none !important;
}

/* Target main app container */
#root,
#app {
    width: 100% !important;
    max-width: none !important;
}
"""

# JavaScript to force horizontal layout
js_code = """
function() {
    function forceHorizontalLayout() {
        // Set container width
        const container = document.querySelector('.gradio-container');
        if (container) {
            container.style.maxWidth = '85vw';
            container.style.width = '85vw';
        }

        // Target the main row specifically
        const mainRow = document.getElementById('main-row');
        if (mainRow) {
            mainRow.style.flexDirection = 'row';
            mainRow.style.flexWrap = 'nowrap';
            mainRow.style.display = 'flex';
            mainRow.style.width = '100%';
        }

        // Force ALL rows to stay horizontal
        const rows = document.querySelectorAll('.gradio-row');
        rows.forEach(row => {
            row.style.flexDirection = 'row';
            row.style.flexWrap = 'nowrap';
            row.style.display = 'flex';
        });

        // Target specific columns
        const inputCol = document.getElementById('input-column');
        if (inputCol) {
            inputCol.style.width = '550px';
            inputCol.style.minWidth = '550px';
            inputCol.style.maxWidth = '550px';
            inputCol.style.flex = '0 0 550px';
            inputCol.style.display = 'inline-flex';
            inputCol.style.flexDirection = 'column';
        }

        const outputCol = document.getElementById('output-column');
        if (outputCol) {
            outputCol.style.flex = '1 1 auto';
            outputCol.style.minWidth = '0';
            outputCol.style.display = 'inline-flex';
            outputCol.style.flexDirection = 'column';
        }

        // Fallback: force all column children of rows
        const columns = document.querySelectorAll('.gradio-row > .gradio-column, .gradio-row > div');
        columns.forEach((col, index) => {
            if (index === 0) {
                col.style.width = '550px';
                col.style.minWidth = '550px';
                col.style.maxWidth = '550px';
                col.style.flex = '0 0 550px';
            } else if (index === 1) {
                col.style.flex = '1 1 auto';
                col.style.minWidth = '0';
            }
            col.style.display = 'inline-flex';
        });
    }

    // Run immediately
    forceHorizontalLayout();

    // Run again after delays to override Gradio's dynamic changes
    setTimeout(forceHorizontalLayout, 100);
    setTimeout(forceHorizontalLayout, 500);
    setTimeout(forceHorizontalLayout, 1000);
    setTimeout(forceHorizontalLayout, 2000);

    // Set up mutation observer to reapply on DOM changes
    const observer = new MutationObserver(forceHorizontalLayout);
    observer.observe(document.body, {
        childList: true,
        subtree: true,
        attributes: true,
        attributeFilter: ['style', 'class']
    });
}
"""


@spaces.GPU(duration=75)
def infer(
    prompt,
    seed=42,
    randomize_seed=False,
    aspect_ratio="1:1 (1024x1024)",
    guidance_scale=5.0,
    num_inference_steps=50,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate one image from a text prompt with the Ovis-Image pipeline.

    Args:
        prompt: Text description of the image to generate.
        seed: Seed passed to the sampler; replaced when ``randomize_seed``
            is true.
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        aspect_ratio: Label parsed by ``get_image_size`` into width/height.
        guidance_scale: Forwarded to ``generate_image`` as ``cfg_scale``.
        num_inference_steps: Forwarded to ``generate_image`` as
            ``denoising_steps``.
        progress: Gradio progress tracker (tqdm tracking enabled); not used
            directly in the body.

    Returns:
        A ``(image, seed)`` tuple: the first image of the batch as a uint8
        NumPy array, and the seed that was actually used.
    """
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    width, height = get_image_size(aspect_ratio)
    print(f'inference with prompt: {prompt}, size: {height}x{width}, seed: {seed}, steps: {num_inference_steps}, cfg: {guidance_scale}')

    image = generate_image(
        # Run on whichever device the model weights currently live on.
        device=next(ovis_image.parameters()).device,
        dtype=_dtype,
        model=ovis_image,
        prompt=prompt,
        autoencoder=autoencoder,
        ovis_tokenizer=ovis_tokenizer,
        ovis_encoder=ovis_encoder,
        img_height=height,
        img_width=width,
        denoising_steps=num_inference_steps,
        cfg_scale=guidance_scale,
        seed=seed,
    )

    # bring into PIL format and save
    # NOTE(review): values are clamped to [-1, 1] and then scaled by 255; if
    # generate_image can return negatives they would wrap on the uint8 cast.
    # Presumably its output is already in [0, 1] - confirm against
    # ovis_image.sampling.
    image = image.clamp(-1, 1)
    # Move axis 1 last (presumably NCHW -> NHWC - TODO confirm); .float() is
    # applied before .numpy() because the pipeline runs in bfloat16.
    image = image.cpu().permute(0, 2, 3, 1).float().numpy()
    image = (image * 255).round().astype("uint8")
    return image[0], seed


with gr.Blocks(
    title="Ovis-Image",
    fill_height=False,
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.blue,
        secondary_hue=gr.themes.colors.slate,
        neutral_hue=gr.themes.colors.gray,
        spacing_size=gr.themes.sizes.spacing_lg,
        radius_size=gr.themes.sizes.radius_lg,
        text_size=gr.themes.sizes.text_md,
        font=[gr.themes.GoogleFont("Inter"), "SF Pro Display", "-apple-system", "BlinkMacSystemFont", "system-ui", "sans-serif"],
        font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "SF Mono", "ui-monospace", "monospace"],
    ).set(
        body_background_fill='#f5f5f7',
        body_background_fill_dark='#000000',
        button_primary_background_fill='#0071e3',
        button_primary_background_fill_hover='#0077ed',
        button_primary_text_color='#ffffff',
        block_background_fill='#ffffff',
        block_background_fill_dark='#1d1d1f',
        block_border_width='0px',
        block_shadow='0 2px 12px rgba(0, 0, 0, 0.08)',
        block_shadow_dark='0 2px 12px rgba(0, 0, 0, 0.4)',
        input_background_fill='#ffffff',
        input_background_fill_dark='#1d1d1f',
        input_border_width='1px',
        input_border_color='#d2d2d7',
        input_border_color_dark='#424245',
        input_shadow='none',
        input_shadow_focus='0 0 0 4px rgba(0, 113, 227, 0.15)',
    ),
    css=apple_css,
    js=js_code,
) as demo:
    # Two-column layout - variant='panel' prevents responsive stacking
    with gr.Row(equal_height=False, variant="panel", elem_id="main-row"):
        # Left column - Input controls (fixed width)
        with gr.Column(scale=0, min_width=550, elem_classes="input-section", elem_id="input-column"):
            # Title above prompt box
            # NOTE(review): this literal contains no markup, yet the CSS
            # defines .header-container / .main-title that nothing in this
            # file uses - the wrapping tags may have been lost; confirm.
            gr.HTML("""

Ovis-Image

""")

            prompt = gr.Textbox(
                placeholder="Describe the image you want to create...",
                value=examples[0],
                lines=7,
                max_lines=7,
                label="Prompt",
                show_label=True,
                container=True,
                autoscroll=False,
            )

            aspect_ratio = gr.Dropdown(
                choices=[
                    "1:1 (1024x1024)",
                    "4:3 (1024x768)",
                    "3:4 (768x1024)",
                    "16:9 (1024x576)",
                    "9:16 (576x1024)",
                ],
                value="1:1 (1024x1024)",
                label="Aspect Ratio",
                show_label=True,
                container=True,
            )

            run_button = gr.Button(
                "Generate",
                variant="primary",
                size="lg",
                elem_classes="primary"
            )

            # Hidden advanced settings (still functional but not visible)
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
                visible=False
            )

            randomize_seed = gr.Checkbox(label="Randomize seed", value=True, visible=False)

            guidance_scale = gr.Slider(
                label="Guidance scale",
                minimum=0.0,
                maximum=14.0,
                step=0.1,
                value=5.0,
                visible=False
            )

            num_inference_steps = gr.Slider(
                label="Number of inference steps",
                minimum=1,
                maximum=100,
                step=1,
                value=50,
                visible=False
            )

        # Right column - Image output
        with gr.Column(scale=2, elem_classes="output-section", elem_id="output-column"):
            result = gr.Image(
                label="Result",
                show_label=False,
                type="numpy",
                format="png",
            )

    # Event handlers - using gr.on() like original Qwen-Image
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            seed,
            randomize_seed,
            aspect_ratio,
            guidance_scale,
            num_inference_steps,
        ],
        # The used seed is written back to the (hidden) seed slider.
        outputs=[result, seed],
    )

if __name__ == '__main__':
    demo.launch()