# "Spaces: Runtime error" -- Hugging Face Spaces page header captured when
# this file was scraped from the Space; not part of the application code.
| import spaces | |
| import gradio as gr | |
| import torch | |
| from transformers import LlavaForConditionalGeneration, AutoProcessor | |
| from PIL import Image | |
| import gc | |
| import time | |
# Model configuration
MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"  # HF Hub repo id

# HTML banner rendered at the top of the Gradio UI via gr.HTML(TITLE).
# NOTE(review): the emoji here are mojibake from an encoding round-trip;
# left byte-for-byte as found because this is runtime-rendered text.
TITLE = """
<div style="text-align: center; margin: 20px 0;">
<h1>π JoyCaption Reliable</h1>
<p><strong>β Ultra-optimized for ZeroGPU - No more stuck generations!</strong></p>
<p><em>Fast loading, aggressive cleanup, guaranteed results</em></p>
</div>
<hr>
"""
print("π Loading reliable JoyCaption system...")

# Load model and processor at startup (ONCE) so per-request latency only
# covers inference, never weight loading.
print("π¦ Loading model and processor at startup...")
# FIX: `low_cpu_mem_usage` is a *model* from_pretrained kwarg; it has no
# meaning for processors/tokenizers and was dropped from this call.
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,  # bf16 halves memory vs fp32
    device_map="auto",           # place weights on GPU when one is available
    low_cpu_mem_usage=True,      # stream weights instead of a full CPU copy
)
model.eval()  # inference only -- disables dropout etc.
print("β Model loaded and ready!")
# Shorter duration since no model loading
# NOTE(review): `spaces` is imported at the top of the file but never used,
# and the comments/UI reference a "45-second GPU limit". On a ZeroGPU Space
# this handler almost certainly needs the `@spaces.GPU(duration=45)`
# decorator to be allocated a GPU -- it looks like the decorator was lost
# (which would explain the "Runtime error" banner). Confirm and restore.
def caption_image_optimized(image, style, length):
    """Generate a caption for an uploaded image with the global JoyCaption model.

    Args:
        image: ``PIL.Image`` from the Gradio component, or ``None``.
        style: "Engaging", "Descriptive", "SEO-Friendly" or "Creative";
            any other value falls back to "Engaging".
        length: "Short", "Medium", or anything else (treated as long);
            controls both ``max_new_tokens`` and the prompt suffix.

    Returns:
        A user-facing string: the timed caption on success, otherwise an
        error message. Never raises -- all exceptions are caught.
    """
    if image is None:
        return "β Please upload an image first."
    start_time = time.time()
    try:
        print(f"π― Starting generation at {time.time() - start_time:.1f}s...")
        # Token budget and prompt suffix for the requested length.
        if length == "Short":
            max_tokens = 100
            prompt_suffix = " Keep it concise and engaging."
        elif length == "Medium":
            max_tokens = 200
            prompt_suffix = " Use about 1-2 sentences."
        else:  # Long
            max_tokens = 300
            prompt_suffix = " Provide detailed description."
        # Style prompts
        base_prompts = {
            "Engaging": f"Write an engaging, creative caption for this image. Avoid 'A photo of'. Make it captivating.{prompt_suffix}",
            "Descriptive": f"Describe this image focusing on people, poses, clothing, and setting.{prompt_suffix}",
            "SEO-Friendly": f"Create an SEO-friendly caption that's engaging and descriptive.{prompt_suffix}",
            "Creative": f"Write a creative, witty caption with interesting language.{prompt_suffix}"
        }
        prompt = base_prompts.get(style, base_prompts["Engaging"])
        print(f"π― Processing image at {time.time() - start_time:.1f}s...")
        # Simple, fast conversation format (matches the model card's usage).
        convo = [
            {"role": "system", "content": "You are a helpful, creative caption writer."},
            {"role": "user", "content": prompt}
        ]
        convo_string = processor.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = processor(
            text=[convo_string],
            images=[image],
            return_tensors="pt"
        )
        # Move tensors to the model's device; pixel values must match the
        # bf16 weights or the vision-tower matmul dtypes will clash.
        device = next(model.parameters()).device
        inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
        if 'pixel_values' in inputs:
            inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
        # FIX: remember the prompt length so we decode ONLY the newly
        # generated tokens below. The previous approach decoded the whole
        # transcript and split on hard-coded role markers ("assistant\n",
        # "ASSISTANT:", ...); when no marker matched, the entire prompt was
        # silently returned as the "caption".
        prompt_len = inputs['input_ids'].shape[1]
        print(f"π Generating at {time.time() - start_time:.1f}s...")
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
                use_cache=True,
                num_return_sequences=1
            )
        print(f"π Decoding at {time.time() - start_time:.1f}s...")
        # Decode only the completion, skipping the echoed prompt tokens.
        result = processor.tokenizer.decode(
            output[0][prompt_len:], skip_special_tokens=True
        ).strip()
        # Clean up inputs and output (but NOT the global model/processor)
        del inputs, output
        torch.cuda.empty_cache()
        gc.collect()
        total_time = time.time() - start_time
        print(f"β Complete in {total_time:.1f}s")
        if not result or len(result.strip()) < 10:
            return "Generated caption but couldn't extract readable text. Please try again."
        return f"β±οΈ Generated in {total_time:.1f}s\n\n{result}"
    except Exception as e:
        # Emergency cleanup -- free whatever tensors were created before the
        # failure so the next request starts with a clean GPU.
        try:
            if 'inputs' in locals():
                del inputs
            if 'output' in locals():
                del output
            torch.cuda.empty_cache()
            gc.collect()
        except Exception:  # FIX: was a bare `except:` (also swallowed SystemExit)
            pass
        error_time = time.time() - start_time
        return f"β Error after {error_time:.1f}s: {str(e)[:200]}..."
# Streamlined interface
with gr.Blocks(title="Reliable JoyCaption", theme=gr.themes.Soft()) as demo:
    gr.HTML(TITLE)  # banner constant defined near the top of the file
    with gr.Row():
        with gr.Column():
            # --- Left column: inputs ---
            image_input = gr.Image(
                type="pil",  # hand the click handler a PIL.Image directly
                label="πΈ Upload Image",
                height=400
            )
            with gr.Row():
                style_input = gr.Dropdown(
                    choices=["Engaging", "Descriptive", "SEO-Friendly", "Creative"],
                    value="Engaging",
                    label="Style",
                    scale=2
                )
                length_input = gr.Dropdown(
                    choices=["Short", "Medium", "Long"],
                    value="Medium",
                    label="Length",
                    scale=1
                )
            submit_btn = gr.Button(
                "π Generate Caption",
                variant="primary",
                size="lg"
            )
            # Static info box; plain HTML, no event wiring.
            gr.HTML("""
<div style="background: #e8f5e8; padding: 10px; border-radius: 5px; margin-top: 10px;">
<strong>π― Optimizations:</strong><br>
β’ 45-second GPU limit<br>
β’ Aggressive memory cleanup<br>
β’ Fast loading & processing<br>
β’ Timeout protection
</div>
""")
        with gr.Column():
            # --- Right column: result ---
            output = gr.Textbox(
                label="π Generated Caption",
                lines=8,
                max_lines=15,
                show_copy_button=True
            )
    # Single event handler
    submit_btn.click(
        caption_image_optimized,
        inputs=[image_input, style_input, length_input],
        outputs=output,
        show_progress=True
    )
    gr.Markdown("""
## π― Ultra-Reliable Features:
β **Fast Loading**: Optimized model loading (5-10 seconds)
β **Short Duration**: 45-second GPU limit prevents timeouts
β **Aggressive Cleanup**: Immediate memory release
β **Progress Tracking**: See exactly how long each step takes
β **Error Protection**: Graceful handling of any issues
β **Multiple Styles**: Engaging, Descriptive, SEO-Friendly, Creative
β **Length Control**: Short, Medium, Long options
**π‘ Why it won't get stuck:**
- Shorter GPU duration prevents ZeroGPU timeouts
- Immediate model cleanup after generation
- Optimized loading with `low_cpu_mem_usage=True`
- Progress timestamps to track performance
- Emergency cleanup on any errors
This version prioritizes **reliability over features** - it should work consistently!
""")
if __name__ == "__main__":
    # Launch the Gradio server (blocking) only when run as a script.
    demo.launch()