Spaces:

JQ66
/

11111

Sleeping

App Files Files Community

JQ66 committed on Apr 22, 2025

Commit

fe5b715

verified ·

1 Parent(s): 4a2e843

Create app.py

Browse files

Files changed (1) hide show

app.py +313 -0

app.py ADDED Viewed

	@@ -0,0 +1,313 @@

+# --- Filename: app.py ---
+import gradio as gr
+import openai
+import torch
+from diffusers import StableDiffusionPipeline, LCMScheduler
+import os
+from PIL import Image
+import io # Required for handling audio file object for OpenAI API
+import time # To estimate generation time
+# --- Configuration ---
+# Load API keys from Hugging Face Secrets or environment variables
+# IMPORTANT: Ensure the secret/variable named OPENAI_API_KEY is set in your environment.
+openai.api_key = os.environ.get("OPENAI_API_KEY")
+hf_token = os.environ.get("HF_TOKEN") # May be needed for model download
+if not openai.api_key:
+    print("\n" + "="*40)
+    print("ERROR: OPENAI_API_KEY environment variable not found.")
+    print("Please set the OPENAI_API_KEY secret/variable.")
+    print("OpenAI features (prompt enhancement, voice input) WILL FAIL.")
+    print("="*40 + "\n")
+    # Optionally raise an error or exit if the key is absolutely critical
+    # raise ValueError("OpenAI API Key not found!")
+else:
+    print("OpenAI API Key found.")
+# Model IDs
+llm_model = "gpt-3.5-turbo"
+sd_model_id = "runwayml/stable-diffusion-v1-5"
+lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5" # LCM LoRA for faster inference
+# Check for GPU availability - WILL BE 'cpu' in your case
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Use float32 for CPU for stability/compatibility
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+print(f"Selected Device: {device.upper()}")
+print(f"Selected PyTorch Dtype: {torch_dtype}")
+# --- Model Loading ---
+pipe = None # Initialize pipe to None
+try:
+    print("Loading Stable Diffusion model... (This might take a while on CPU)")
+    pipe = StableDiffusionPipeline.from_pretrained(
+        sd_model_id,
+        torch_dtype=torch_dtype,
+        # use_auth_token=hf_token # Uncomment if you face download issues
+    )
+    print("Base model loaded. Loading LCM Scheduler and LoRA...")
+    # Using LCM Scheduler and LoRA for faster generation
+    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+    pipe.load_lora_weights(lcm_lora_id)
+    pipe.fuse_lora() # Fuse LoRA for slightly faster inference after loading
+    pipe.to(device) # Move pipe to CPU
+    print("Stable Diffusion model loaded successfully with LCM-LoRA on CPU.")
+    # Perform a small dummy inference run to warm up / check for errors
+    print("Performing a quick warm-up inference...")
+    _ = pipe(prompt="warmup", num_inference_steps=1, guidance_scale=1.0, output_type="pil").images[0]
+    print("Warm-up successful.")
+except Exception as e:
+    print(f"\n{'='*40}\nERROR loading Stable Diffusion model: {e}\n{'='*40}\n")
+    # pipe remains None, generation will fail gracefully later
+# --- Core Functions ---
+def enhance_prompt_openai(short_prompt, add_style_keywords):
+    """Uses OpenAI LLM to enhance the short prompt."""
+    if not openai.api_key:
+        # Should not happen if checked at start, but good practice
+        return "Error: OpenAI API Key not configured."
+    system_message = """You are an expert prompt engineer for text-to-image models like Stable Diffusion.
+    Expand the user's short idea into a detailed, vivid, and structured prompt optimized for Stable Diffusion v1.5.
+    Include details about the subject, scene, style (e.g., photorealistic, oil painting, cinematic),
+    lighting (e.g., soft light, dramatic lighting), composition (e.g., wide shot, close-up),
+    and mood. Add high-quality keywords like 'highly detailed', 'sharp focus', 'masterpiece'.
+    Keep the prompt concise and effective, ideally under 100 words.""" # Slightly shorter for clarity
+    user_message = f"Short idea: \"{short_prompt}\""
+    if add_style_keywords:
+        user_message += "\nPlease specifically add artistic and quality keywords like 'cinematic lighting', 'photorealistic', '8k', 'masterpiece', 'professional photography'."
+    try:
+        response = openai.chat.completions.create(
+            model=llm_model,
+            messages=[
+                {"role": "system", "content": system_message},
+                {"role": "user", "content": user_message},
+            ],
+            temperature=0.7,
+            max_tokens=150 # Reduced max tokens slightly
+        )
+        enhanced_prompt = response.choices[0].message.content.strip()
+        # Basic cleanup
+        enhanced_prompt = enhanced_prompt.replace("Here's a detailed prompt:", "").strip()
+        return enhanced_prompt
+    except Exception as e:
+        print(f"Error calling OpenAI API for prompt enhancement: {e}")
+        # Provide a more user-friendly error message
+        return f"Error: Could not enhance prompt using OpenAI. ({e})"
+def transcribe_audio_openai(audio_path):
+    """Transcribes audio using OpenAI Whisper API."""
+    if not audio_path:
+        return None
+    if not openai.api_key:
+        print("Warning: OpenAI API Key not configured. Cannot transcribe audio.")
+        return "Error: OpenAI API Key needed for transcription."
+    try:
+        with open(audio_path, "rb") as audio_file:
+            transcript = openai.audio.transcriptions.create(
+              model="whisper-1",
+              file=audio_file
+            )
+        return transcript.text
+    except Exception as e:
+        print(f"Error calling OpenAI Whisper API: {e}")
+        return f"Error: Could not transcribe audio using OpenAI. ({e})"
+def generate_image_lcm(prompt, guidance_scale, num_inference_steps=8): # Increased steps slightly for potentially better quality on CPU
+    """Generates an image using the loaded SD+LCM pipeline on CPU."""
+    if pipe is None:
+        print("Error: Stable Diffusion pipeline is not available.")
+        img = Image.new('RGB', (512, 512), color = (128, 128, 128)) # Grey placeholder
+        # Add text to placeholder if possible/easy? For now, just grey.
+        return img, "Error: Image generation model failed to load."
+    print(f"Starting image generation on CPU with prompt: '{prompt}'")
+    print(f"Guidance Scale: {guidance_scale}, Steps: {num_inference_steps}. BE PATIENT, THIS WILL BE SLOW.")
+    # LCM performs best with low guidance scale
+    effective_guidance = max(1.0, min(guidance_scale, 3.0))
+    if effective_guidance != guidance_scale:
+        print(f"Adjusted guidance scale to {effective_guidance} (optimal range for LCM).")
+    negative_prompt = "blurry, low quality, deformed, ugly, text, words, writing, signature, watermark"
+    start_time = time.time()
+    try:
+        # No torch.autocast(device) needed for CPU float32? Check diffusers docs.
+        # inference_mode is still good practice
+        with torch.inference_mode():
+             image = pipe(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                guidance_scale=effective_guidance,
+                num_inference_steps=num_inference_steps # LCM needs few steps
+             ).images[0]
+        end_time = time.time()
+        duration = end_time - start_time
+        print(f"Image generation successful on CPU in {duration:.2f} seconds.")
+        return image, f"Image generated in {duration:.2f}s (CPU)." # Return image and status message
+    except Exception as e:
+        end_time = time.time()
+        duration = end_time - start_time
+        print(f"Error during image generation after {duration:.2f} seconds: {e}")
+        img = Image.new('RGB', (512, 512), color = (255, 100, 100)) # Red-ish placeholder
+        return img, f"Error generating image: {e}"
+# --- Main Processing Function ---
+def process_input(text_input, audio_input, add_style_keywords, guidance_scale):
+    """
+    Main function triggered by the Gradio interface.
+    Handles text/audio input, enhances prompt, generates image.
+    """
+    status_updates = []
+    final_text_input = ""
+    enhanced_prompt = ""
+    generated_image = None
+    # 1. Determine input source
+    if text_input and text_input.strip():
+        final_text_input = text_input.strip()
+        status_updates.append("Using provided text input.")
+    elif audio_input:
+        status_updates.append("Processing audio input...")
+        transcribed_text = transcribe_audio_openai(audio_input)
+        if transcribed_text and not transcribed_text.startswith("Error:"):
+            final_text_input = transcribed_text
+            status_updates.append(f"Transcribed Audio: \"{final_text_input[:100]}...\"" if len(final_text_input) > 100 else f"Transcribed Audio: \"{final_text_input}\"")
+        else:
+            status_updates.append(transcribed_text or "Error: Transcription failed.") # Show the error message
+            final_text_input = "" # Prevent proceeding if transcription fails
+    else:
+        status_updates.append("Error: Please provide a text description or record audio.")
+        # Return current status, empty prompt, no image
+        return "\n".join(status_updates), "", None
+    # If no valid input text after checking both sources
+    if not final_text_input:
+         return "\n".join(status_updates), "", None
+    # 2. Enhance Prompt
+    status_updates.append("Enhancing prompt using OpenAI...")
+    if openai.api_key:
+        enhanced_prompt = enhance_prompt_openai(final_text_input, add_style_keywords)
+        if enhanced_prompt.startswith("Error:"):
+            status_updates.append(enhanced_prompt) # Add error to status
+            # Decide if we should proceed with the *original* prompt or stop? Let's stop.
+            return "\n".join(status_updates), "", None
+        else:
+             status_updates.append("Prompt enhanced successfully.")
+    else:
+        status_updates.append("Warning: OpenAI API Key missing. Using original text as prompt.")
+        enhanced_prompt = final_text_input # Use original text if API key missing
+    # 3. Generate Image
+    status_updates.append(f"Generating image on CPU ({device})... **THIS WILL BE SLOW - PLEASE WAIT**")
+    # Update the UI status *before* starting generation
+    # yield "\n".join(status_updates), enhanced_prompt, None # Requires making the function a generator
+    # Simple update (blocks UI until done):
+    img_gen_result, img_status_msg = generate_image_lcm(enhanced_prompt, guidance_scale)
+    generated_image = img_gen_result
+    if img_status_msg:
+        status_updates.append(img_status_msg)
+    # 4. Return results
+    return "\n".join(status_updates), enhanced_prompt, generated_image
+# --- Gradio UI ---
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Prompt Enhancer & Image Generator 🪄🖼️ (CPU Version)")
+    gr.Markdown(
+        f"**WARNING:** Running on **CPU ({device.upper()})**. Image generation will be **VERY SLOW** (potentially several minutes). Please be patient after clicking Generate."
+        f"\nEnter a short description or record audio. It will be enhanced by `{llm_model}` and an image generated using `{sd_model_id}` + LCM acceleration."
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Input Controls
+            text_input = gr.Textbox(
+                label="Short Description",
+                placeholder="e.g., 'magical treehouse in the sky'",
+                lines=2
+            )
+            audio_input = gr.Audio(
+                sources=["microphone"],
+                type="filepath", # Get file path for OpenAI API
+                label="Or Record Audio Input"
+            )
+            gr.Markdown("---")
+            gr.Markdown("**Generation Options**")
+            add_style_keywords = gr.Checkbox(
+                label="Add Extra Style Keywords (via LLM)?",
+                value=True,
+                info="Asks the LLM to add 'photorealistic', '8k', 'cinematic' etc."
+            )
+            guidance_scale = gr.Slider(
+                minimum=1.0,
+                maximum=3.0, # Keep low for LCM
+                step=0.1,
+                value=1.5, # Good default for LCM
+                label="Guidance Scale",
+                info="How closely the image follows the prompt (1-2 recommended for LCM)."
+            )
+            submit_button = gr.Button("Generate ✨ (Will be slow!)", variant="primary")
+        with gr.Column(scale=2):
+            # Output Area
+            status_output = gr.Textbox(
+                label="Status Log",
+                interactive=False,
+                lines=4 # More lines for verbose status
+            )
+            enhanced_prompt_output = gr.Textbox(
+                label="✨ Enhanced Prompt (from LLM)",
+                interactive=False,
+                lines=4
+            )
+            image_output = gr.Image(
+                label="🖼️ Generated Image (CPU)",
+                type="pil",
+                interactive=False,
+                height=512, # Set fixed height if desired
+                # width=512
+            )
+    # Connect UI elements
+    submit_button.click(
+        fn=process_input,
+        inputs=[
+            text_input,
+            audio_input,
+            add_style_keywords,
+            guidance_scale
+        ],
+        outputs=[
+            status_output,
+            enhanced_prompt_output,
+            image_output
+        ]
+    )
+    # Clear inputs upon submission for better UX
+    submit_button.click(lambda: ("", None), inputs=[], outputs=[text_input, audio_input])
+# --- Launch the App ---
+if __name__ == "__main__":
+    print("\nLaunching Gradio App...")
+    # Enable queue for better handling, especially with slow generation
+    # share=True can create a public link if running locally (use with caution)
+    demo.queue().launch(debug=False, share=False)