File size: 2,328 Bytes
4dea00e
 
 
 
 
 
 
 
 
 
 
 
 
72d184a
4dea00e
 
 
 
fcda26a
4dea00e
 
 
 
 
 
 
 
 
82b731b
 
 
4dea00e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72d184a
 
4dea00e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# !pip install -q gTTS
# !pip install -qU "google-genai==1.9.0"


# In[3]:


import numpy as np
import pandas as pd
import os

from google import genai
from google.generativeai import types
from IPython.display import display, Image, Markdown, Audio
from IPython.display import display, Image as IPImage
from gtts import gTTS
import IPython.display as ipd
from PIL import Image as PILImage
import io


# In[4]:
import os

# The API key is read from the GOOGLE_API_KEY environment variable (None when
# the variable is unset) and handed to the Gemini client.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
client = genai.Client(api_key=GOOGLE_API_KEY)


# In[ ]:


#!pip install google.api_core


# In[8]:


from google.api_core import retry


def is_retriable(e):
    """Return True when *e* is a genai APIError whose code is 429 or 503."""
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


# Patch generate_content with automatic retries, but only once: re-running this
# cell would otherwise wrap the already-wrapped method in another Retry layer.
# retry.Retry decorates via functools.wraps, which sets `__wrapped__` on the
# wrapper, so its presence marks a method that has already been patched.
if not hasattr(genai.models.Models.generate_content, "__wrapped__"):
    genai.models.Models.generate_content = retry.Retry(
        predicate=is_retriable
    )(genai.models.Models.generate_content)


# In[10]:


# GenerateContentConfig is defined by the google-genai SDK (`google.genai.types`).
# The file-level `types` name is imported from the legacy `google.generativeai`
# package, so the config class is imported here under a distinct name.
from google.genai import types as genai_types

# Prompt for user input
user_prompt = input("Enter your prompt: ")

# Request image generation (the model may answer with text parts, image parts, or both)
generation_response = client.models.generate_content(
    model="gemini-2.0-flash-exp-image-generation",
    contents=user_prompt,
    config=genai_types.GenerateContentConfig(
        response_modalities=['text', 'image']
    )
)

# Collect the response parts defensively: the response may carry no candidates,
# or a candidate without content/parts, in which case there is nothing to show.
response_parts = []
if generation_response.candidates:
    first_content = generation_response.candidates[0].content
    if first_content is not None and first_content.parts:
        response_parts = first_content.parts

# Process and display the image; `image_bytes` keeps the last image returned
# and is reused by the later cells.
image_bytes = None
for part in response_parts:
    if part.text:
        print(part.text)
    elif part.inline_data:
        image_bytes = part.inline_data.data
        display(Image(image_bytes))

if image_bytes is None:
    print("No image was returned for this prompt.")


# In[11]:


if image_bytes:
    # Decode the generated bytes into a PIL image so it can be sent alongside
    # the text instruction.
    image_buffer = io.BytesIO(image_bytes)
    pil_image = PILImage.open(image_buffer)

    # Multimodal request: a text instruction followed by the image itself.
    vision_prompt = ["What is in this image? Describe it in detail.", pil_image]

    vision_response = client.models.generate_content(
        contents=vision_prompt,
        model='gemini-2.0-flash',
    )

    # Show a heading, then the model's description rendered as Markdown.
    description_heading = Markdown("### 🖼️ Image Description:")
    display(description_heading)
    display(Markdown(vision_response.text))


# In[12]:


language = 'en'  # ← change here if you want different language

# `vision_response` is only assigned when an image was generated (the cell that
# creates it runs under `if image_bytes:`), so it is only read in that case;
# reading it unconditionally raises NameError when no image came back.
image_description_text = vision_response.text if image_bytes else None

if image_description_text:
    # Synthesize the description to speech and write it to an MP3 file.
    tts = gTTS(text=image_description_text, lang=language)
    tts.save("description.mp3")

    display(Markdown("### 📝 Image Description (Text):"))
    display(Markdown(image_description_text))

    display(Markdown("### 🔊 Image Description (Audio):"))
    ipd.display(ipd.Audio("description.mp3"))
else:
    # Without text there is nothing to synthesize, so skip speech generation.
    print("No image description available; skipping text-to-speech.")


# In[ ]: