maria355 committed on
Commit
8a97603
·
verified ·
1 Parent(s): d1e7dff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +316 -321
app.py CHANGED
@@ -1,375 +1,370 @@
1
- import gradio as gr
2
- import speech_recognition as sr
3
- import requests
4
- import json
5
  import io
6
  import base64
7
- from PIL import Image
8
  import os
 
 
 
 
9
  from datetime import datetime
10
- import time
11
- import re
12
 
13
# Gemini is an optional dependency: remember whether it could be imported so
# the rest of the module can fall back to plain prompts without it.
try:
    import google.generativeai as genai
except ImportError:
    GEMINI_AVAILABLE = False
    print("Gemini AI not available")
else:
    GEMINI_AVAILABLE = True

# Gemini is configured only when both the library and an API key are present.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if GEMINI_AVAILABLE and GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    try:
        gemini_model = genai.GenerativeModel('gemini-pro')
    except Exception as e:
        print(f"Error initializing Gemini: {e}")
        # Model creation failed, so treat Gemini as unavailable from here on.
        GEMINI_AVAILABLE = False

# Optional Hugging Face token; HUGGINGFACE_TOKEN takes precedence over HF_TOKEN.
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")

# Hosted text models, tried in this order until one yields usable output.
TEXT_MODELS = [
    "microsoft/DialoGPT-medium",
    "gpt2",
    "facebook/blenderbot-400M-distill",
]

# Hosted image models, tried in this order until one returns a decodable image.
IMAGE_MODELS = [
    "stabilityai/stable-diffusion-2-1",
    "runwayml/stable-diffusion-v1-5",
    "CompVis/stable-diffusion-v1-4",
]
 
 
 
 
 
 
42
 
43
def query_huggingface_text(payload, model_name):
    """POST ``payload`` to the hosted inference endpoint of ``model_name``.

    Returns the decoded JSON body on HTTP 200. Any other status, and any
    exception raised while sending or decoding, is reported with ``print``
    and results in ``None``.
    """
    endpoint = f"https://api-inference.huggingface.co/models/{model_name}"
    # Send the bearer token only when one is configured.
    auth_headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

    try:
        reply = requests.post(endpoint, headers=auth_headers, json=payload, timeout=30)
        status = reply.status_code
        if status == 200:
            return reply.json()
        if status == 503:
            print(f"Model {model_name} is loading")
        else:
            print(f"Error {status} with model {model_name}")
    except Exception as e:
        print(f"Error with model {model_name}: {str(e)}")
    return None
66
 
67
def query_huggingface_image(payload, model_name):
    """POST ``payload`` to the hosted image endpoint of ``model_name``.

    Returns the raw response bytes on HTTP 200. Any other status, and any
    exception, is reported with ``print`` and results in ``None``.
    """
    endpoint = f"https://api-inference.huggingface.co/models/{model_name}"
    # Send the bearer token only when one is configured.
    auth_headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

    try:
        reply = requests.post(endpoint, headers=auth_headers, json=payload, timeout=60)
        if reply.status_code != 200:
            print(f"Error with image model {model_name}")
            return None
        return reply.content
    except Exception as e:
        print(f"Error with image model {model_name}: {str(e)}")
        return None
86
 
87
def transcribe_audio(audio_file):
    """Transcribe the audio file at ``audio_file`` with Google speech recognition.

    Always returns a string: the recognized text on success, otherwise a
    human-readable message describing what went wrong. Callers distinguish
    the two cases by the message prefix.
    """
    if audio_file is None:
        return "No audio file provided"

    engine = sr.Recognizer()

    try:
        # Read the whole clip into memory before sending it for recognition.
        with sr.AudioFile(str(audio_file)) as clip:
            recording = engine.record(clip)

        try:
            return engine.recognize_google(recording)
        except sr.UnknownValueError:
            return "Could not understand the audio"
        except sr.RequestError as e:
            return f"Speech recognition error: {str(e)}"

    except Exception as e:
        return f"Error processing audio: {str(e)}"
109
 
110
def enhance_prompt_with_gemini(text):
    """Ask Gemini for improved text and image prompts derived from ``text``.

    Returns an ``(enhanced_text, enhanced_image)`` pair. When Gemini is not
    usable, or the request fails, both elements are the original ``text``;
    when only one section is missing from the reply, that element falls back
    to ``text``.
    """
    gemini_usable = GEMINI_AVAILABLE and GEMINI_API_KEY
    if not gemini_usable:
        return text, text

    try:
        # NOTE(review): the indentation inside this prompt literal is
        # reconstructed; confirm against the original file if it matters.
        prompt = f"""
        Enhance this prompt for content and image generation:
        Original: {text}

        Provide:
        TEXT: [enhanced text prompt]
        IMAGE: [enhanced image prompt]
        """

        enhanced = gemini_model.generate_content(prompt).text

        # TEXT runs up to the IMAGE marker (or end of reply); IMAGE runs to the end.
        text_match = re.search(r'TEXT:\s*(.+?)(?=IMAGE:|$)', enhanced, re.DOTALL)
        image_match = re.search(r'IMAGE:\s*(.+?)$', enhanced, re.DOTALL)

        enhanced_text = text
        if text_match:
            enhanced_text = text_match.group(1).strip()

        enhanced_image = text
        if image_match:
            enhanced_image = image_match.group(1).strip()

        return enhanced_text, enhanced_image

    except Exception as e:
        print(f"Gemini error: {str(e)}")
        return text, text
 
 
 
 
 
 
 
 
 
 
 
137
 
138
def generate_text_content(prompt, content_type="blog"):
    """Generate written content of ``content_type`` about ``prompt``.

    The prompt is first enhanced through Gemini when that is available, then
    each model in ``TEXT_MODELS`` is tried in order. The first reply longer
    than ten characters (after removing an echoed prompt) is returned. If no
    model produces one, a canned fallback for the content type is returned.
    """
    if GEMINI_AVAILABLE and GEMINI_API_KEY:
        prompt, _ = enhance_prompt_with_gemini(prompt)

    content_templates = {
        "blog": f"Write a blog post about: {prompt}\n\nPost:",
        "social": f"Write a social media post about: {prompt}\n\nPost:",
        "caption": f"Write a caption for: {prompt}\n\nCaption:",
        "story": f"Write a story about: {prompt}\n\nStory:",
    }
    # Unknown content types send the bare prompt.
    full_prompt = content_templates.get(content_type, prompt)

    for model in TEXT_MODELS:
        request_body = {
            "inputs": full_prompt,
            "parameters": {"max_length": 200, "temperature": 0.7},
        }
        result = query_huggingface_text(request_body, model)

        if not (result and len(result) > 0):
            continue

        try:
            # The endpoint may answer with a list of candidates or a single dict.
            if isinstance(result, list) and len(result) > 0:
                generated_text = result[0].get("generated_text", "")
            elif isinstance(result, dict):
                generated_text = result.get("generated_text", "")
            else:
                continue

            # Some models echo the prompt; keep only the continuation.
            if generated_text and generated_text.startswith(full_prompt):
                generated_text = generated_text[len(full_prompt):].strip()

            if generated_text and len(generated_text) > 10:
                return generated_text

        except Exception as e:
            print(f"Error processing result: {e}")
            continue

    fallback_content = {
        "blog": f"# About {prompt}\n\nThis is an interesting topic with many aspects to explore. Here are key points:\n\nβ€’ Main concepts and principles\nβ€’ Practical applications\nβ€’ Future possibilities\n\nThis topic offers great potential for discussion.",
        "social": f"Excited to share thoughts about {prompt}! This is such an important topic. What are your thoughts? #inspiration",
        "caption": f"✨ {prompt} ✨ Beautiful moments from simple ideas. #creativity #inspiration",
        "story": f"There was something special about {prompt}. It captured everyone's imagination, leading to wonderful adventures and discoveries.",
    }

    return fallback_content.get(content_type, f"Content about: {prompt}")
190
-
191
def generate_image_from_text(prompt):
    """Generate an RGB PIL image for ``prompt``.

    The prompt is enhanced through Gemini when that is available, then each
    model in ``IMAGE_MODELS`` is tried in order. The first reply that decodes
    as an image is returned; otherwise a plain light-blue 512x512 placeholder.
    """
    if GEMINI_AVAILABLE and GEMINI_API_KEY:
        _, prompt = enhance_prompt_with_gemini(prompt)

    enhanced_prompt = f"{prompt}, high quality, detailed, artistic"

    for model in IMAGE_MODELS:
        raw = query_huggingface_image({"inputs": enhanced_prompt}, model)
        if not raw:
            continue

        try:
            picture = Image.open(io.BytesIO(raw))
            # Normalize every result to RGB.
            return picture if picture.mode == 'RGB' else picture.convert('RGB')
        except Exception as e:
            print(f"Error opening image: {str(e)}")
            continue

    return Image.new('RGB', (512, 512), color='lightblue')
215
 
216
def process_voice_input(audio_file, content_type):
    """Transcribe a recording and generate text plus an image from it.

    Args:
        audio_file: Path of the recorded audio, or ``None`` when nothing was
            recorded.
        content_type: One of the content type keys understood by
            ``generate_text_content``.

    Returns:
        A ``(text_content, image, transcribed_text)`` tuple. When there is no
        recording, or transcription fails, ``image`` is ``None`` and the first
        element carries the message to show the user.
    """
    if audio_file is None:
        return "Please record some audio first", None, ""

    transcribed_text = transcribe_audio(audio_file)

    # transcribe_audio reports failures as strings. Every failure prefix it can
    # return must be listed here, otherwise an error message (for example
    # "Speech recognition error: ...") would be used as the generation prompt.
    failure_prefixes = ("Error", "Could not", "Speech recognition error")
    if transcribed_text.startswith(failure_prefixes):
        return transcribed_text, None, transcribed_text

    try:
        text_content = generate_text_content(transcribed_text, content_type)
    except Exception as e:
        text_content = f"Error generating text: {str(e)}"

    try:
        image = generate_image_from_text(transcribed_text)
    except Exception as e:
        print(f"Error generating image: {str(e)}")
        # Keep the image slot populated so the UI still renders something.
        image = Image.new('RGB', (512, 512), color='lightgray')

    return text_content, image, transcribed_text
237
 
238
def process_text_input(text_input, content_type):
    """Generate text plus an image from a typed idea.

    Args:
        text_input: The user's idea. ``None``, empty and whitespace-only
            values are all treated as "nothing entered".
        content_type: One of the content type keys understood by
            ``generate_text_content``.

    Returns:
        A ``(text_content, image)`` tuple; ``image`` is ``None`` when no text
        was entered.
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise AttributeError instead of showing the prompt to the user.
    if not text_input or not text_input.strip():
        return "Please enter some text", None

    try:
        text_content = generate_text_content(text_input, content_type)
    except Exception as e:
        text_content = f"Error generating text: {str(e)}"

    try:
        image = generate_image_from_text(text_input)
    except Exception as e:
        print(f"Error generating image: {str(e)}")
        # Keep the image slot populated so the UI still renders something.
        image = Image.new('RGB', (512, 512), color='lightgray')

    return text_content, image
254
-
255
def create_interface():
    """Build the Gradio Blocks UI and return it without launching it.

    The UI has three tabs: voice input, text input and an about page. The two
    generate buttons are wired to ``process_voice_input`` and
    ``process_text_input`` respectively.
    """
    # Both tabs offer the same content types; each dropdown gets its own list.
    content_choices = ("blog", "social", "caption", "story")

    with gr.Blocks(title="VociArt - Voice AI Creator", theme=gr.themes.Soft()) as demo:

        gr.Markdown("""
        # πŸŽ™οΈ VociArt - Voice AI Creator

        Transform your voice into AI-generated content and images!

        **Features:** Voice-to-text β€’ Content generation β€’ Image creation
        """)

        with gr.Tab("πŸŽ™οΈ Voice Input"):
            with gr.Row():
                with gr.Column():
                    voice_audio = gr.Audio(sources=["microphone"], type="filepath", label="🎀 Record Your Voice")

                    voice_kind = gr.Dropdown(choices=list(content_choices), value="blog", label="πŸ“ Content Type")

                    voice_button = gr.Button("πŸš€ Generate from Voice", variant="primary")

                with gr.Column():
                    voice_transcript = gr.Textbox(label="πŸ“ What You Said", lines=3)

            with gr.Row():
                with gr.Column():
                    voice_text = gr.Textbox(label="πŸ“„ Generated Content", lines=8)

                with gr.Column():
                    voice_image = gr.Image(label="🎨 Generated Image", type="pil")

        with gr.Tab("⌨️ Text Input"):
            with gr.Row():
                with gr.Column():
                    typed_idea = gr.Textbox(label="πŸ’­ Enter Your Idea", lines=3)

                    typed_kind = gr.Dropdown(choices=list(content_choices), value="blog", label="πŸ“ Content Type")

                    typed_button = gr.Button("πŸš€ Generate from Text", variant="primary")

            with gr.Row():
                with gr.Column():
                    typed_text = gr.Textbox(label="πŸ“„ Generated Content", lines=8)

                with gr.Column():
                    typed_image = gr.Image(label="🎨 Generated Image", type="pil")

        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About VociArt

            Transform spoken ideas into content and visuals using AI!

            ### How to Use:
            1. **Voice**: Record your idea, select content type, generate
            2. **Text**: Type your idea, choose type, generate

            ### Content Types:
            - **Blog**: Articles and posts
            - **Social**: Social media content
            - **Caption**: Image captions
            - **Story**: Short stories

            ### Tips:
            - Speak clearly in a quiet environment
            - Be specific with your ideas
            - Try different content types

            Made with free AI models from Hugging Face!
            """)

        # Output order must match the tuple order returned by each handler.
        voice_button.click(
            fn=process_voice_input,
            inputs=[voice_audio, voice_kind],
            outputs=[voice_text, voice_image, voice_transcript],
        )

        typed_button.click(
            fn=process_text_input,
            inputs=[typed_idea, typed_kind],
            outputs=[typed_text, typed_image],
        )

    return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
 
369
# Script entry point: build the UI and serve it on all interfaces, port 7860.
if __name__ == "__main__":
    print("Starting VociArt...")
    app = create_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ import streamlit as st
2
+ import torch
3
+ import numpy as np
 
4
  import io
5
  import base64
 
6
  import os
7
+ import tempfile
8
+ from PIL import Image
9
+ import requests
10
+ import json
11
  from datetime import datetime
 
 
12
 
13
+ # Hugging Face imports
14
+ from transformers import (
15
+ AutoProcessor,
16
+ AutoModelForSpeechSeq2Seq,
17
+ pipeline
18
+ )
19
+ from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
20
+ import torchaudio
21
+ from scipy.io import wavfile
22
+ import google.generativeai as genai
23
 
24
# Configure page
st.set_page_config(page_title="VoiceCanvas - AI Content Studio", page_icon="🎨", layout="wide")

# Initialize session state: each key is created once and then survives reruns.
st.session_state.setdefault('generated_images', [])
st.session_state.setdefault('generated_text', [])
st.session_state.setdefault('transcription', "")
st.session_state.setdefault('selected_image', None)
40
 
41
@st.cache_resource
def load_whisper_model():
    """Load the Whisper processor and model used for speech-to-text.

    Returns a ``(processor, model)`` pair, or ``(None, None)`` after showing
    an error in the UI when loading fails.
    """
    checkpoint = "openai/whisper-small"
    try:
        # Tuple elements are evaluated left to right: processor first, then model.
        return (
            AutoProcessor.from_pretrained(checkpoint),
            AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint),
        )
    except Exception as e:
        st.error(f"Error loading Whisper model: {e}")
        return None, None
52
 
53
@st.cache_resource
def load_diffusion_model():
    """Load the Stable Diffusion pipeline used for image generation.

    Uses half precision on CUDA and full precision on CPU. Returns the
    pipeline, or ``None`` after showing an error in the UI when loading fails.
    """
    try:
        on_gpu = torch.cuda.is_available()

        # NOTE: the safety checker is explicitly disabled for this pipeline.
        pipe = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5",
            torch_dtype=torch.float16 if on_gpu else torch.float32,
            safety_checker=None,
            requires_safety_checker=False,
        )
        pipe = pipe.to("cuda" if on_gpu else "cpu")

        pipe.enable_attention_slicing()
        return pipe
    except Exception as e:
        st.error(f"Error loading Stable Diffusion model: {e}")
        return None
75
 
76
@st.cache_resource
def load_tts_model():
    """Load the text-to-speech pipeline.

    Returns the pipeline, or ``None`` after showing an error in the UI when
    loading fails.
    """
    # NOTE(review): SpeechT5 normally needs speaker embeddings at inference
    # time; none are supplied anywhere in this file. Confirm synthesis works.
    try:
        return pipeline("text-to-speech", model="microsoft/speecht5_tts")
    except Exception as e:
        st.error(f"Error loading TTS model: {e}")
        return None
85
 
86
def setup_gemini():
    """Configure the Gemini client from the ``GEMINI_API_KEY`` env variable.

    Returns ``True`` when the client was configured, ``False`` (after showing
    an error in the UI) when the key is missing or configuration fails.
    """
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if api_key:
            genai.configure(api_key=api_key)
            return True
        st.error("Gemini API key not found in environment variables")
        return False
    except Exception as e:
        st.error(f"Error setting up Gemini: {e}")
        return False
98
+
99
def transcribe_audio(audio_data, processor, model):
    """Transcribe ``audio_data`` with the given Whisper processor and model.

    Always returns a string: the transcription on success, otherwise a
    message starting with ``"Error"``. The processor is called with a fixed
    sampling rate of 16000, so callers are expected to resample beforehand.
    """
    try:
        if model is None or processor is None:
            return "Error: Whisper model not loaded"

        # Process audio
        features = processor(audio_data, sampling_rate=16000, return_tensors="pt")

        # Generate transcription; no gradients are needed for inference.
        with torch.no_grad():
            token_ids = model.generate(features["input_features"])

        return processor.batch_decode(token_ids, skip_special_tokens=True)[0]
    except Exception as e:
        return f"Error in transcription: {e}"
116
 
117
def _parse_creative_response(raw_text):
    """Extract ``(taglines, image_prompts)`` from Gemini's reply text.

    Raises ValueError, KeyError or TypeError when the reply does not contain
    a JSON object with two non-empty lists of strings.
    """
    # The model often wraps its JSON in prose or Markdown code fences, so
    # decode only the outermost {...} span instead of the whole reply.
    start = raw_text.find("{")
    end = raw_text.rfind("}")
    if start == -1 or end <= start:
        raise ValueError("no JSON object found in response")

    content = json.loads(raw_text[start:end + 1])
    taglines = content["taglines"]
    image_prompts = content["image_prompts"]

    for items in (taglines, image_prompts):
        if not isinstance(items, list) or not items:
            raise ValueError("expected a non-empty list")
        if not all(isinstance(item, str) for item in items):
            raise ValueError("expected a list of strings")

    return taglines, image_prompts


def generate_creative_content(transcription):
    """Generate creative copy and image prompts using Gemini.

    Args:
        transcription: The user's request as text.

    Returns:
        A ``(taglines, image_prompts)`` pair of string lists. When the reply
        cannot be parsed, or the API call fails, templated fallback lists
        built from ``transcription`` are returned instead, so callers always
        receive usable content.
    """
    try:
        model = genai.GenerativeModel('gemini-pro')

        # NOTE(review): the indentation inside this prompt literal is
        # reconstructed; confirm against the original file if it matters.
        prompt = f"""
        Based on this user request: "{transcription}"

        Please generate:
        1. Three marketing taglines/copy variations
        2. Three detailed image prompt variations for AI image generation

        Format your response as JSON:
        {{
        "taglines": ["tagline1", "tagline2", "tagline3"],
        "image_prompts": ["prompt1", "prompt2", "prompt3"]
        }}

        Make the taglines catchy and marketing-focused.
        Make the image prompts detailed and optimized for Stable Diffusion.
        """

        response = model.generate_content(prompt)

        # Try to parse JSON from response. Only parsing and shape problems are
        # handled here; anything else goes to the outer handler and is shown.
        try:
            return _parse_creative_response(response.text)
        except (ValueError, KeyError, TypeError):
            # Fallback if JSON parsing fails
            taglines = [
                f"Creative content based on: {transcription}",
                f"Innovative solution for: {transcription}",
                f"Experience the magic of: {transcription}",
            ]
            image_prompts = [
                f"High quality, detailed illustration of {transcription}, professional art style",
                f"Beautiful artistic rendering of {transcription}, vibrant colors",
                f"Creative visual representation of {transcription}, modern design",
            ]
            return taglines, image_prompts

    except Exception as e:
        st.error(f"Error with Gemini API: {e}")
        # Fallback content
        taglines = [
            f"Discover: {transcription}",
            f"Experience: {transcription}",
            f"Explore: {transcription}",
        ]
        image_prompts = [
            f"Artistic illustration of {transcription}",
            f"Creative visualization of {transcription}",
            f"Beautiful rendering of {transcription}",
        ]
        return taglines, image_prompts
173
 
174
def generate_images(prompts, pipe):
    """Generate one image per prompt with the Stable Diffusion pipeline.

    Returns the list of images produced so far. With no pipeline the list is
    empty; if generation fails part-way, the error is shown in the UI and the
    images already generated are still returned.
    """
    rendered = []
    if pipe is None:
        return rendered

    try:
        for text_prompt in prompts:
            with st.spinner(f"Generating image for: {text_prompt[:50]}..."):
                # Generate image
                output = pipe(
                    text_prompt,
                    num_inference_steps=20,
                    guidance_scale=7.5,
                    height=512,
                    width=512,
                )
                rendered.append(output.images[0])
    except Exception as e:
        st.error(f"Error generating images: {e}")

    return rendered
 
 
 
 
 
 
 
 
 
 
 
197
 
198
def generate_tts(text, tts_pipeline):
    """Synthesize ``text`` to a 16-bit PCM WAV file.

    Args:
        text: The text to speak.
        tts_pipeline: A callable that takes the text and returns a mapping
            with ``"audio"`` (float samples) and ``"sampling_rate"`` keys, or
            ``None`` when no TTS model is loaded.

    Returns:
        The path of a newly created temporary ``.wav`` file, or ``None`` when
        no pipeline is available or synthesis fails (the error is shown in
        the UI). The file is not deleted here; the caller owns its cleanup.
    """
    try:
        if tts_pipeline is None:
            return None

        # Generate speech
        result = tts_pipeline(text)
        sample_rate = result["sampling_rate"]

        # Drop singleton axes so a (1, N) array is written as N mono samples
        # rather than as one frame with N channels.
        # NOTE(review): assumes samples are floats nominally in [-1, 1].
        samples = np.atleast_1d(np.asarray(result["audio"], dtype=np.float32).squeeze())

        # Clip before scaling: out-of-range samples would otherwise overflow
        # the int16 conversion and produce loud wrap-around artifacts.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

        # Reserve a temp path, then write once the handle has been closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            wav_path = tmp_file.name
        wavfile.write(wav_path, sample_rate, pcm)
        return wav_path

    except Exception as e:
        st.error(f"Error generating TTS: {e}")
        return None
 
 
219
 
220
def main():
    """Render the VoiceCanvas Streamlit page.

    The page loads the models, shows their status in the sidebar, takes an
    uploaded audio file or a typed prompt, lets the user edit the resulting
    transcription, generates taglines and images from it, and offers the
    results (voiceover, selected image, taglines) for download.

    Streamlit re-runs this function on every interaction, so everything that
    must survive a rerun is kept in ``st.session_state``.
    """
    st.title("🎨 VoiceCanvas - AI Content Studio")
    st.markdown("*Transform your voice into visual and textual content*")

    # Setup models and APIs
    with st.spinner("Loading AI models..."):
        whisper_processor, whisper_model = load_whisper_model()
        diffusion_pipe = load_diffusion_model()
        tts_pipeline = load_tts_model()
        gemini_ready = setup_gemini()

    # Sidebar for settings
    with st.sidebar:
        st.header("Settings")
        st.info("πŸ’‘ **How to use:**\n1. Record or upload audio\n2. Review transcription\n3. Generate content\n4. Download results")

        # Model status. Loaders return None on failure, so test for None
        # explicitly instead of relying on the truthiness of model objects.
        st.header("Model Status")
        st.write("🎀 Whisper:", "βœ…" if whisper_model is not None else "❌")
        st.write("🎨 Stable Diffusion:", "βœ…" if diffusion_pipe is not None else "❌")
        st.write("πŸ”Š TTS:", "βœ…" if tts_pipeline is not None else "❌")
        st.write("πŸ€– Gemini:", "βœ…" if gemini_ready else "❌")

    # Main interface
    input_col, output_col = st.columns([1, 2])

    with input_col:
        st.header("🎀 Voice Input")

        # Audio input methods
        audio_method = st.radio("Choose input method:", ["Upload Audio File", "Record Audio"])

        audio_data = None

        if audio_method == "Upload Audio File":
            uploaded_file = st.file_uploader("Upload audio file", type=['wav', 'mp3', 'mp4'])
            if uploaded_file:
                # Load audio file
                try:
                    audio_data, sample_rate = torchaudio.load(io.BytesIO(uploaded_file.read()))
                    # Convert to mono and resample to 16kHz
                    if audio_data.shape[0] > 1:
                        audio_data = torch.mean(audio_data, dim=0, keepdim=True)
                    if sample_rate != 16000:
                        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
                        audio_data = resampler(audio_data)
                    audio_data = audio_data.squeeze().numpy()
                except Exception as e:
                    st.error(f"Error loading audio: {e}")

        else:  # Record Audio
            st.info("Audio recording requires browser permissions. Click the record button below.")
            # Note: Streamlit doesn't have built-in audio recording,
            # so we'll provide a text input as alternative
            st.text_area("Or type your prompt directly:", key="direct_prompt", height=100)
            if st.session_state.direct_prompt:
                st.session_state.transcription = st.session_state.direct_prompt

        # Transcription
        if st.button("🎯 Transcribe Audio") and audio_data is not None:
            with st.spinner("Transcribing audio..."):
                transcription = transcribe_audio(audio_data, whisper_processor, whisper_model)
            # transcribe_audio reports failures as strings starting with
            # "Error"; show those instead of using them as the prompt.
            if transcription.startswith("Error"):
                st.error(transcription)
            else:
                st.session_state.transcription = transcription

        # Show transcription
        if st.session_state.transcription:
            st.subheader("πŸ“ Transcription")
            edited_transcription = st.text_area(
                "Edit if needed:",
                value=st.session_state.transcription,
                height=100
            )
            st.session_state.transcription = edited_transcription

    with output_col:
        st.header("πŸš€ Content Generation")

        if st.session_state.transcription and st.button("✨ Generate Content"):
            with st.spinner("Generating creative content..."):
                # Generate taglines and image prompts
                taglines, image_prompts = generate_creative_content(st.session_state.transcription)
                st.session_state.generated_text = taglines

                # Generate images
                images = generate_images(image_prompts, diffusion_pipe)
                st.session_state.generated_images = images

        # Display generated content
        if st.session_state.generated_text:
            st.subheader("✍️ Generated Taglines")
            for i, tagline in enumerate(st.session_state.generated_text):
                st.write(f"**{i+1}.** {tagline}")

        if st.session_state.generated_images:
            st.subheader("🎨 Generated Images")
            cols = st.columns(3)
            for i, img in enumerate(st.session_state.generated_images):
                with cols[i % 3]:
                    st.image(img, caption=f"Variation {i+1}")
                    if st.button(f"Select Image {i+1}", key=f"select_{i}"):
                        st.session_state.selected_image = img

    # Content export section
    if st.session_state.generated_text or st.session_state.generated_images:
        st.header("πŸ“¦ Export Content")

        audio_col, image_col, text_col = st.columns(3)

        with audio_col:
            if st.session_state.generated_text:
                # The selectbox must live outside the button branch. A widget
                # created only inside `if st.button(...)` disappears on the
                # next rerun, so the user could never pick another tagline.
                selected_text = st.selectbox("Choose text for voiceover:", st.session_state.generated_text)
                if st.button("πŸ”Š Generate Voiceover"):
                    with st.spinner("Generating voiceover..."):
                        audio_file = generate_tts(selected_text, tts_pipeline)
                    if audio_file:
                        # Keep the bytes in session state so the player and
                        # download button survive reruns, then remove the
                        # temporary file instead of leaking one per click.
                        with open(audio_file, "rb") as f:
                            st.session_state["voiceover_audio"] = f.read()
                        os.remove(audio_file)

                voiceover_audio = st.session_state.get("voiceover_audio")
                if voiceover_audio:
                    st.audio(voiceover_audio, format="audio/wav")
                    st.download_button(
                        "Download Audio",
                        voiceover_audio,
                        file_name=f"voiceover_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
                        mime="audio/wav"
                    )

        with image_col:
            if st.session_state.selected_image is not None:
                st.write("Selected Image:")
                st.image(st.session_state.selected_image, width=200)

                # Convert image to bytes for download
                img_buffer = io.BytesIO()
                st.session_state.selected_image.save(img_buffer, format="PNG")
                st.download_button(
                    "Download Image",
                    img_buffer.getvalue(),
                    file_name=f"generated_image_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png",
                    mime="image/png"
                )

        with text_col:
            if st.session_state.generated_text:
                # Create text file with all taglines
                text_content = "\n".join(
                    f"{i+1}. {tagline}" for i, tagline in enumerate(st.session_state.generated_text)
                )
                st.download_button(
                    "Download Taglines",
                    text_content,
                    file_name=f"taglines_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                    mime="text/plain"
                )
368
 
369
# Script entry point: render the page when this file is run as the main module.
if __name__ == "__main__":
    main()