# app.py — Advanced Image Description Generator (BLIP-2 + BLIP large).
# NOTE(review): the original file began with non-Python web-page residue
# ("<user>'s picture / Update app.py / <commit> verified / raw / history
# blame / 6.91 kB") copied from the hosting UI; it has been commented out
# here so the module parses.
import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, Blip2Processor, Blip2ForConditionalGeneration
# Initial setup: load both captioning models at import time so the app is
# ready to serve requests as soon as the Gradio UI launches. The first run
# downloads the weights from the Hugging Face Hub (several GB).
print("Loading models...")
# Main model for detailed captions (BLIP-2 with the OPT-2.7B language model)
blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
# Secondary model for emotion and detail detection — used below to produce
# the short baseline caption that anchors the final output.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
# Move models to GPU if available; CPU inference works but is much slower.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
blip2_model.to(device)
blip_model.to(device)
print(f"Models loaded. Using device: {device}")
def generate_advanced_description(image, detail_level, emotion_focus, style_focus):
    """
    Generate an advanced description of the image with varying levels of detail.

    Args:
        image: Input PIL image, or None if nothing was uploaded.
        detail_level: Level of detail (1-5). Higher values lengthen the
            generated text; >= 4 adds a structured-elements breakdown.
        emotion_focus: Focus on emotions (0-5). > 2 adds a mood/atmosphere
            instruction to the BLIP-2 prompt; > 3 tags the suggested prompt.
        style_focus: Focus on artistic style (0-5). > 2 adds a style/lighting
            instruction to the BLIP-2 prompt; > 3 tags the suggested prompt.

    Returns:
        A Markdown-formatted string containing the basic caption, the
        detailed description, optional structured elements, and a suggested
        AI image prompt — or a human-readable error message on failure.
    """
    if image is None:
        return "Please upload an image to generate a description."
    try:
        with torch.no_grad():
            # Short baseline caption from the BLIP-large captioning model.
            inputs = blip_processor(image, return_tensors="pt").to(device)
            basic_outputs = blip_model.generate(**inputs, max_length=50)
            basic_caption = blip_processor.decode(basic_outputs[0], skip_special_tokens=True)

            # Build the BLIP-2 prompt from the slider settings. Only enabled
            # instructions are joined, so a low emotion/style focus no longer
            # leaves stray ". ." separators in the prompt (previous bug).
            prompt_parts = [
                "Describe this image with extreme detail, focus on "
                f"{'all elements including tiny details' if detail_level > 3 else 'main elements'}"
            ]
            if emotion_focus > 2:
                prompt_parts.append("Describe the mood, emotions, and atmosphere conveyed in this image")
            if style_focus > 2:
                prompt_parts.append("Describe the artistic style, lighting, colors, and composition")
            prompt_text = ". ".join(prompt_parts)

            # Process with BLIP-2; higher detail levels allow longer outputs.
            inputs = blip2_processor(image, text=prompt_text, return_tensors="pt").to(device)
            max_length = 150 + (detail_level * 50)
            outputs = blip2_model.generate(
                **inputs,
                max_length=max_length,
                num_beams=5,
                min_length=50,
                top_p=0.9,
                repetition_penalty=1.5,
                length_penalty=1.0
            )
            detailed_description = blip2_processor.decode(outputs[0], skip_special_tokens=True)

        # --- Format results for AI image generation (Markdown) ---
        formatted_result = f"## Basic Caption:\n{basic_caption}\n\n"
        formatted_result += f"## Detailed Description for AI Image Recreation:\n{detailed_description}\n\n"

        if detail_level >= 4:
            # Keyword sniffing over the generated text to decide which
            # structured-element placeholders to list.
            lowered = detailed_description.lower()
            elements = []
            if "person" in lowered or "people" in lowered:
                elements.append("subjects")
            if any(word in lowered for word in ["background", "scene", "setting"]):
                elements.append("setting")
            if any(word in lowered for word in ["light", "shadow", "bright", "dark"]):
                elements.append("lighting")
            if any(word in lowered for word in ["color", "red", "blue", "green", "yellow", "tone"]):
                elements.append("colors")
            formatted_result += "## Structured Elements:\n"
            for element in elements:
                formatted_result += (
                    f"- {element.capitalize()}: "
                    f"[Extract relevant details about {element} from the description]\n"
                )

        # Suggested prompt: skip empty fragments so low emotion/style focus
        # no longer produces dangling ", ," separators (previous bug).
        prompt_fragments = [
            basic_caption,
            ", ".join(detailed_description.split(".")[:3]),
            "high detail" if detail_level > 3 else "moderate detail",
        ]
        if emotion_focus > 3:
            prompt_fragments.append("emotional")
        if style_focus > 3:
            prompt_fragments.append("artistic")
        formatted_result += "\n## Suggested AI Image Prompt:\n"
        formatted_result += ", ".join(fragment for fragment in prompt_fragments if fragment)
        return formatted_result
    except Exception as e:
        # Previous code mislabelled the CUDA device name as a "Traceback";
        # report the actual traceback so failures are debuggable.
        import traceback
        return f"Error generating description: {str(e)}\n\nTraceback: {traceback.format_exc()}"
# Assemble the Gradio interface: image + sliders on the left, description
# output on the right, usage notes underneath.
with gr.Blocks(title="Advanced Image Description Generator") as demo:
    gr.Markdown("# Advanced Image Description Generator for AI Image Recreation")
    gr.Markdown("Upload an image to generate a detailed description that can help AI image generators recreate similar images.")

    with gr.Row():
        # Left half: inputs.
        with gr.Column(scale=1):
            image_input = gr.Image(label="Upload Image", type="pil")
            with gr.Row():
                slider_detail = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Detail Level")
                slider_emotion = gr.Slider(minimum=0, maximum=5, value=3, step=1, label="Emotion Focus")
                slider_style = gr.Slider(minimum=0, maximum=5, value=3, step=1, label="Style/Artistic Focus")
            generate_button = gr.Button("Generate Description")
        # Right half: output.
        with gr.Column(scale=1):
            description_output = gr.Textbox(label="Image Description", lines=20)

    # Wire the button to the generator function.
    generate_button.click(
        fn=generate_advanced_description,
        inputs=[image_input, slider_detail, slider_emotion, slider_style],
        outputs=description_output,
    )

    gr.Markdown("""
    ## How to Use
    1. Upload an image
    2. Adjust the sliders to control description detail:
    - Detail Level: How comprehensive the description should be
    - Emotion Focus: Emphasis on mood and feelings
    - Style Focus: Emphasis on artistic elements
    3. Click "Generate Description"
    4. Use the generated text to prompt AI image generators
    ## About
    This app uses BLIP-2 and BLIP large models to analyze images and generate detailed descriptions
    suitable for recreating similar images with AI image generators like Stable Diffusion, DALL-E, or Midjourney.
    """)

# Launch the app when run as a script.
if __name__ == "__main__":
    demo.launch()