AZLABS committed on
Commit
2182e33
·
verified ·
1 Parent(s): 9c1fb0e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +1633 -0
app.py ADDED
@@ -0,0 +1,1633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import asyncio
3
+ import aiohttp
4
+ import aiofiles
5
+ import tempfile
6
+ import subprocess
7
+ import base64
8
+ from enum import Enum
9
+ from together import Together
10
+ import json
11
+ import logging
12
+ import shutil
13
+ from dotenv import load_dotenv
14
+ import os
15
+ import re
16
+ import requests
17
+ import spacy
18
+ import datetime
19
+ from sklearn.feature_extraction.text import TfidfVectorizer
20
+ from sklearn.metrics.pairwise import cosine_similarity
21
+ from pydub import AudioSegment
22
+ from moviepy.editor import *
23
+ from typing import List, Dict, Any, Tuple, Callable, Optional
24
+ from abc import ABC, abstractmethod
25
+ from groq import AsyncGroq
26
+
27
# Shared spaCy pipeline used for lemmatization/stopword filtering throughout.
nlp = spacy.load("en_core_web_md")

# Load environment variables
load_dotenv()

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
# Names of environment variables that must be set; validated by check_api_keys().
REQUIRED_API_KEYS = ["GROQ_API_KEY", "BFL_API_KEY", "TOGETHER_API_KEY", "TAVILY_API_KEY", "TIKTOK_SESSION_ID"]
# Vertical (portrait) resolution for YouTube Shorts, width x height.
YOUTUBE_SHORT_RESOLUTION = (1080, 1920)
MAX_SCENE_DURATION = 5
DEFAULT_SCENE_DURATION = 1
SUBTITLE_FONT_SIZE = 13  # Keep the original font size
SUBTITLE_FONT_COLOR = "yellow@0.5"
SUBTITLE_ALIGNMENT = 2  # Centered horizontally and vertically
SUBTITLE_BOLD = True
SUBTITLE_OUTLINE_COLOR = "&H40000000"  # Black with 50% transparency
SUBTITLE_BORDER_STYLE = 3
# Styling for the solid-color placeholder scene used when image generation fails.
FALLBACK_SCENE_COLOR = "red"
FALLBACK_SCENE_TEXT_COLOR = "yellow@0.5"
FALLBACK_SCENE_BOX_COLOR = "black@0.5"
FALLBACK_SCENE_BOX_BORDER_WIDTH = 5
FALLBACK_SCENE_FONT_SIZE = 30
# NOTE(review): hard-coded font path; confirm this file exists in the deployment image.
FALLBACK_SCENE_FONT_FILE = "/tmp/qualitype/opentype/QTHelvet-Black.otf"

# Load API keys from environment variables
groq_api_key = os.getenv("GROQ_API_KEY")
bfl_api_key = os.getenv("BFL_API_KEY")
together_api_key = os.getenv("TOGETHER_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY")
SESSION_ID = os.getenv("TIKTOK_SESSION_ID")
60
+
61
+ # Helper functions
62
async def get_data(query: str) -> List[Dict[str, Any]]:
    """Fetch data from Groq for the given query string.

    NOTE(review): the public Groq SDK does not obviously expose a ``query``
    method on ``AsyncGroq`` — confirm against the installed ``groq`` package
    version; this helper may be dead or broken code.
    """
    groq = AsyncGroq(api_key=groq_api_key)
    data = await groq.query(query)
    return data
66
+
67
class PixelFormat(Enum):
    """ffmpeg pixel-format names.

    The ``YUVJ*`` members are the deprecated full-range ("JPEG") variants;
    the ``YUV*`` members are their modern replacements (see
    get_compatible_pixel_format below).
    """
    YUVJ420P = 'yuvj420p'
    YUVJ422P = 'yuvj422p'
    YUVJ444P = 'yuvj444p'
    YUVJ440P = 'yuvj440p'
    YUV420P = 'yuv420p'
    YUV422P = 'yuv422p'
    YUV444P = 'yuv444p'
    YUV440P = 'yuv440p'
76
+
77
def get_compatible_pixel_format(pix_fmt: str) -> str:
    """Convert deprecated pixel formats to their compatible alternatives.

    ffmpeg deprecated the full-range ``yuvj*`` formats; each one maps to its
    limited-range ``yuv*`` counterpart. Unknown formats pass through unchanged.

    Args:
        pix_fmt: An ffmpeg pixel-format name (e.g. ``'yuvj420p'``).

    Returns:
        The compatible replacement name, or ``pix_fmt`` itself if no
        replacement is needed.
    """
    # Dict lookup replaces the original if/elif chain — same mapping, one place.
    replacements = {
        PixelFormat.YUVJ420P.value: PixelFormat.YUV420P.value,
        PixelFormat.YUVJ422P.value: PixelFormat.YUV422P.value,
        PixelFormat.YUVJ444P.value: PixelFormat.YUV444P.value,
        PixelFormat.YUVJ440P.value: PixelFormat.YUV440P.value,
    }
    return replacements.get(pix_fmt, pix_fmt)
89
+
90
+
91
def check_api_keys():
    """Validate that every variable named in REQUIRED_API_KEYS is set.

    Raises:
        ValueError: listing *all* missing keys at once, so a user with several
            unset variables can fix them in one pass instead of one failure
            per run (the original raised on the first missing key only).
    """
    missing = [key for key in REQUIRED_API_KEYS if not os.getenv(key)]
    if missing:
        raise ValueError(f"Missing required API key: {', '.join(missing)}")
95
+
96
+
97
def align_with_gentle(audio_file: str, transcript_file: str) -> dict:
    """Aligns audio and text using Gentle and returns the alignment result.

    Posts the audio and transcript to a locally running Gentle server in
    synchronous mode and returns the parsed JSON alignment.

    Args:
        audio_file: Path to the audio file to align.
        transcript_file: Path to the plain-text transcript.

    Returns:
        The alignment dict from Gentle, or ``None`` on any HTTP/transport
        error (which is logged).
    """
    url = 'http://localhost:8765/transcriptions?async=false'
    try:
        # Context managers close both handles even on failure — the original
        # opened them with open(...) inline and leaked them.
        with open(audio_file, 'rb') as audio_f, open(transcript_file, 'r') as transcript_f:
            files = {
                'audio': audio_f,
                'transcript': transcript_f
            }
            response = requests.post(url, files=files)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error communicating with Gentle: {e}")
        return None
112
+
113
def gentle_alignment_to_ass(gentle_alignment: dict, ass_file: str):
    """Converts Gentle alignment JSON to ASS subtitle format with styling.

    Emits one Dialogue event per group of up to two words, with the first
    word of each pair colored dark orange and the second white, timed from
    the Gentle per-word start/end stamps.
    """
    with open(ass_file, 'w', encoding='utf-8') as f:
        # Write ASS header
        # NOTE(review): the "Format:" style line below is wrapped across three
        # physical lines inside the literal; ASS expects it on one line —
        # confirm players accept this header.
        f.write("""[Script Info]
Title: Generated by Gentle Alignment
ScriptType: v4.00+
Collisions: Normal
PlayDepth: 0
Timer: 100.0000

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic,
Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR,
MarginV, Encoding
Style: Default,Verdana,{font_size},&H00FFFFFF,&H0000FFFF,&H00000000,&H64000000,{bold},0,0,0,100,100,0,0,1,1,0,{alignment},2,2,2,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n""".format(
            font_size=SUBTITLE_FONT_SIZE, bold=int(SUBTITLE_BOLD), alignment=SUBTITLE_ALIGNMENT))

        index = 1  # NOTE(review): unused; kept for byte-compatibility.
        words = gentle_alignment.get('words', [])
        i = 0
        while i < len(words):
            start = words[i].get('start')
            if start is None:
                # Word was not aligned by Gentle; skip it.
                i += 1
                continue
            end = words[i].get('end')
            text_words = []
            colors = []
            for j in range(2):  # Get up to 2 words
                if i + j < len(words):
                    word_info = words[i + j]
                    word_text = word_info.get('word', '')
                    text_words.append(word_text)
                    if j == 0:
                        # First word in dark orange or green
                        colors.append(r'{\c&H0080FF&}')  # Dark orange color code in ASS (BGR order)
                        # For green use: colors.append(r'{\c&H00FF00&}')
                    else:
                        colors.append(r'{\c&HFFFFFF&}')  # White color code
                else:
                    break
            # Interleave per-word color overrides with the words themselves.
            dialogue_text = ''.join(f"{colors[k]}{text_words[k]} " for k in range(len(text_words))).strip()
            # Extend the event's end time to the last word in the group.
            end = words[min(i + len(text_words) - 1, len(words) - 1)].get('end', end)
            if end is None:
                i += len(text_words)
                continue

            start_time = format_ass_time(start)
            end_time = format_ass_time(end)
            f.write(f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{dialogue_text}\n")
            i += len(text_words)
168
+
169
def wrap_text(text, max_width):
    """Greedily wrap *text* into lines of at most *max_width* characters.

    Lines are joined with the literal ``\\N`` sequence (the ASS hard line
    break) so the result can be embedded directly in a subtitle event.
    """
    lines = []
    line_words = []
    line_len = 0

    for word in text.split():
        # +1 accounts for the separating space (also charged to the first word,
        # matching the original accounting).
        candidate_len = line_len + len(word) + 1
        if candidate_len <= max_width:
            line_words.append(word)
            line_len = candidate_len
        else:
            lines.append(' '.join(line_words))
            line_words = [word]
            line_len = len(word)

    if line_words:
        lines.append(' '.join(line_words))

    return '\\N'.join(lines)
189
+
190
def format_ass_time(seconds: float) -> str:
    """Format a duration in seconds as an ASS timestamp (h:mm:ss.cc)."""
    hrs = int(seconds // 3600)
    mins = int((seconds % 3600) // 60)
    remainder = seconds % 60
    whole_secs = int(remainder)
    # Centiseconds from the fractional part, truncated (not rounded).
    centis = int((remainder - whole_secs) * 100)
    return f"{hrs}:{mins:02d}:{whole_secs:02d}.{centis:02d}"
197
+
198
def format_time(seconds: float) -> str:
    """Formats time in seconds to HH:MM:SS,mmm format for subtitles (SRT style).

    Fixes the original implementation, which formatted via ``str(timedelta)``
    and therefore produced invalid output like ``"1 day, 1:00:00,000"`` for
    durations of 24 hours or more; hours now simply keep counting up.

    Args:
        seconds: Non-negative duration in seconds.

    Returns:
        Zero-padded ``HH:MM:SS,mmm`` string.
    """
    whole = int(seconds)
    # Milliseconds from the fractional part, truncated to match the original.
    millis = int((seconds - whole) * 1000)
    hours, rem = divmod(whole, 3600)
    minutes, secs = divmod(rem, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
211
+
212
+ # Abstract classes for Agents and Tools
213
class Agent(ABC):
    """Abstract base class for LLM-backed pipeline agents."""

    def __init__(self, name: str, model: str):
        # Human-readable agent name (used for identification/logging).
        self.name = name
        # Model identifier passed to the underlying LLM client.
        self.model = model

    @abstractmethod
    async def execute(self, input_data: Any) -> Any:
        """Run the agent's task on ``input_data`` and return its result."""
        pass
221
+
222
class Tool(ABC):
    """Abstract base class for external tools an agent can invoke."""

    def __init__(self, name: str):
        # Human-readable tool name.
        self.name = name

    @abstractmethod
    async def use(self, input_data: Any) -> Any:
        """Invoke the tool with ``input_data`` and return its result."""
        pass
229
+
230
class VoiceModule(ABC):
    """Abstract interface for text-to-speech backends with usage quotas."""

    def __init__(self):
        pass

    @abstractmethod
    def update_usage(self):
        """Record usage consumed by the latest synthesis call."""
        pass

    @abstractmethod
    def get_remaining_characters(self):
        """Return how many characters the backend quota still allows."""
        pass

    @abstractmethod
    def generate_voice(self, text: str, output_file: str):
        """Synthesize ``text`` to speech and write audio to ``output_file``."""
        pass
245
+
246
+ # Node and Edge classes for graph representation
247
class Node:
    """Graph node that wraps exactly one processor: an Agent or a Tool."""

    def __init__(self, agent: Agent = None, tool: Tool = None):
        self.agent = agent
        self.tool = tool
        # Outgoing edges; populated by Graph.add_edge.
        self.edges: List['Edge'] = []

    async def process(self, input_data: Any) -> Any:
        """Run the wrapped agent (preferred) or tool on ``input_data``.

        Raises:
            ValueError: if the node wraps neither an agent nor a tool.
        """
        if self.agent:
            return await self.agent.execute(input_data)
        if self.tool:
            return await self.tool.use(input_data)
        raise ValueError("Node has neither agent nor tool")
260
+
261
+
262
class Edge:
    """Directed connection between two nodes with an optional guard condition."""

    def __init__(self, source: Node, target: Node, condition: Callable[[Any], bool] = None):
        self.source = source
        self.target = target
        # Optional predicate over the source's output deciding traversal.
        self.condition = condition
267
+
268
class Graph:
    """Simple directed graph of processing nodes and guarded edges."""

    def __init__(self):
        self.nodes: List[Node] = []
        self.edges: List[Edge] = []

    def add_node(self, node: Node):
        """Register a node in the graph."""
        self.nodes.append(node)

    def add_edge(self, edge: Edge):
        """Register an edge and append it to its source node's adjacency list."""
        self.edges.append(edge)
        edge.source.edges.append(edge)
279
+
280
class VideoProcessor:
    """Scores candidate videos for relevance and extracts time-synced content."""

    def __init__(self):
        # Shared module-level spaCy pipeline (en_core_web_md).
        self.nlp = nlp

    def calculate_relevance(self, video: Dict[str, Any], description: str, timestamp: float) -> float:
        """Score how relevant ``video`` is to ``description`` around ``timestamp``.

        Combines tag overlap (weight 1), title overlap (weight 2), and
        subtitle/audio-transcript overlap within a 5-second window
        (weight 1.5 each), normalized to [0, 1] by the maximum achievable
        score.
        """
        relevance = 0
        video_keywords = set(video.get("tags", []))
        description_doc = self.nlp(description.lower())

        # Extract lemmatized words from the description
        description_words = set(token.lemma_ for token in description_doc if not token.is_stop and token.is_alpha)

        # Calculate relevance based on matching words
        relevance += len(video_keywords.intersection(description_words))

        # Add relevance for matching title words.
        # BUG FIX: title_words must be defined even when the title is None,
        # otherwise the normalization below raised NameError.
        title_words = set()
        title = video.get("title", "")
        if title is not None:
            title_doc = self.nlp(title.lower())
            title_words = set(token.lemma_ for token in title_doc if not token.is_stop and token.is_alpha)
            relevance += len(title_words.intersection(description_words)) * 2  # Title matches are weighted more

        # Process subtitles and audio for the 5-second window
        subtitle_text, audio_text = self.get_synced_content(video, timestamp)

        # Calculate relevance for subtitle and audio content
        subtitle_doc = self.nlp(subtitle_text.lower())
        audio_doc = self.nlp(audio_text.lower())

        subtitle_words = set(token.lemma_ for token in subtitle_doc if not token.is_stop and token.is_alpha)
        audio_words = set(token.lemma_ for token in audio_doc if not token.is_stop and token.is_alpha)

        relevance += len(subtitle_words.intersection(description_words)) * 1.5  # Subtitle matches are weighted
        relevance += len(audio_words.intersection(description_words)) * 1.5  # Audio matches are weighted

        # Normalize relevance score; guard against division by zero.
        max_possible_relevance = len(video_keywords) + len(title_words) * 2 + len(subtitle_words) * 1.5 + len(audio_words) * 1.5
        return relevance / max_possible_relevance if max_possible_relevance > 0 else 0

    def get_synced_content(self, video: Dict[str, Any], timestamp: float) -> Tuple[str, str]:
        """Return (subtitle_text, audio_text) overlapping a 5-second window at ``timestamp``."""
        subtitles = video.get("subtitles", [])
        audio_transcript = video.get("audio_transcript", [])

        start_time = timestamp
        end_time = timestamp + 5  # 5-second window

        subtitle_text = self.extract_timed_content(subtitles, start_time, end_time)
        audio_text = self.extract_timed_content(audio_transcript, start_time, end_time)

        return subtitle_text, audio_text

    def extract_timed_content(self, content: List[Dict[str, Any]], start_time: float, end_time: float) -> str:
        """Concatenate the 'text' of items whose [start, end] interval overlaps the window."""
        extracted_text = []
        for item in content:
            item_start = self.time_to_seconds(item.get("start", "00:00:00"))
            item_end = self.time_to_seconds(item.get("end", "00:00:00"))

            # Standard interval-overlap test.
            if start_time <= item_end and end_time >= item_start:
                extracted_text.append(item.get("text", ""))

        return " ".join(extracted_text)

    def time_to_seconds(self, time_str: str) -> float:
        """Parse 'HH:MM:SS', 'MM:SS', or a bare seconds string into float seconds."""
        time_parts = time_str.split(":")
        if len(time_parts) == 3:
            return datetime.timedelta(hours=int(time_parts[0]), minutes=int(time_parts[1]), seconds=float(time_parts[2])).total_seconds()
        elif len(time_parts) == 2:
            return datetime.timedelta(minutes=int(time_parts[0]), seconds=float(time_parts[1])).total_seconds()
        else:
            return float(time_str)
352
+
353
class WebSearchTool(Tool):
    """Tool that queries the Tavily search API, optionally time-restricted."""

    def __init__(self):
        super().__init__("Web Search Tool")

    async def use(self, input_data: str, time_period: str = 'all') -> Dict[str, Any]:
        """Search Tavily for ``input_data``.

        Args:
            input_data: The search query string.
            time_period: 'all', 'past month', 'past year', or a string whose
                first token is a day count (e.g. '7 days'); anything else
                logs a warning and falls back to 'all'.

        Returns:
            The parsed JSON response from Tavily.

        Raises:
            Exception: on any non-200 HTTP status or transport error
                (logged before re-raising).
        """
        try:
            headers = {"Content-Type": "application/json"}
            data = {"api_key": tavily_api_key, "query": input_data, "num_results": 100}

            if time_period != 'all':
                start_date = None
                if time_period == 'past month':
                    start_date = datetime.date.today() - datetime.timedelta(days=30)
                elif time_period == 'past year':
                    start_date = datetime.date.today() - datetime.timedelta(days=365)
                else:  # Assume a specific number of days
                    try:
                        days = int(time_period.split()[0])
                        start_date = datetime.date.today() - datetime.timedelta(days=days)
                    except ValueError:
                        logger.warning(f"Invalid time_period: {time_period}. Using 'all'.")

                if start_date:
                    data["from_date"] = start_date.strftime("%Y-%m-%d")

            async with aiohttp.ClientSession() as session:
                async with session.post("https://api.tavily.com/search", headers=headers, json=data) as response:
                    # Body is read up front so it is available for error reporting;
                    # aiohttp caches it, so .json() below re-parses the same bytes.
                    response_text = await response.text()
                    if response.status == 200:
                        return await response.json()
                    else:
                        logger.error(f"WebSearchTool Error: HTTP {response.status} - {response_text}")
                        raise Exception(f"HTTP {response.status}: {response_text}")
        except Exception as e:
            logger.error(f"Error in WebSearchTool: {str(e)}")
            raise
389
+
390
class ImageGenerationAgent(Agent):
    """Generates one image per scene via Together's FLUX.1-schnell model.

    Each scene's 'visual' description and 'image_keyword' are interpolated
    into a fixed prompt template; each generated image is written to a
    temporary PNG file.
    """

    def __init__(self):
        super().__init__("Image Generation Agent", "black-forest-labs/FLUX.1-schnell-Free")
        self.client = Together(api_key=together_api_key)

    async def execute(self, input_data: Dict[str, Any]) -> Any:
        """Generate an image for every scene in ``input_data['scenes']``.

        Returns:
            A list parallel to the scenes: each entry is a dict with
            'image_path' (temp PNG path) and 'prompts', or None if
            generation failed for that scene.
        """
        scenes = input_data.get('scenes', [])
        results = []

        for i, scene in enumerate(scenes):
            visual_description = scene.get('visual', '')
            image_keyword = scene.get('image_keyword', '')

            # Combine the visual description and image keyword for a more detailed prompt.
            # (Fixed the original's accidental chained assignment `prompt = prompt = ...`.)
            prompt = f"""
            Please craft a engaging bold and impactful visual specifically designed for viral YouTube Video, based on the provided {visual_description} and {image_keyword}. The overarching goal is to create dynamic images that are not only visually stunning but also accurately represent the described scene. Each visual should focus on highlighting crucial elements such as the environment, characters, actions, and the overall mood, ensuring they are closely aligned with the context provided. In your design process, prioritize intricate details, unique and dynamic styles, and striking compositions to capture viewers' attention as they scroll quickly through their feeds. Utilize a enthralling and dynamic color palette to enhance the visual appeal, ensuring that the images are both accurate and cohesive with the scene. Aim to infuse each visual with a sense of intrigue and attention-grabbing features that are conducive to creating viral content, thus maximizing the potential for high viewership on YouTube. Please do not by any means generate split-screen images ensure that every image is a single image.
            """
            try:
                logger.info(f"Generating image for scene {i+1}/{len(scenes)}")
                response = self.client.images.generate(
                    prompt=prompt,
                    model=self.model,
                    width=768,
                    height=1024,
                    steps=4,
                    n=1,
                    response_format="b64_json"
                )

                # Decode the base64 image
                image_data = base64.b64decode(response.data[0].b64_json)

                # Save the image to a temporary file (caller is responsible for cleanup).
                with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
                    temp_file.write(image_data)
                    temp_file_path = temp_file.name

                logger.info(f"Image for scene {i+1} saved as {temp_file_path}")

                results.append({
                    'image_path': temp_file_path,
                    'prompts': prompt
                })

            except Exception as e:
                logger.error(f"Error in image generation for scene {i+1}: {str(e)}")
                # Keep the results list parallel to the scenes list.
                results.append(None)

            # Add a delay between requests to avoid rate limiting
            await asyncio.sleep(2)

        logger.info(f"Image generation completed. Generated {len([r for r in results if r is not None])}/{len(scenes)} images.")
        return results
443
+
444
class RecentEventsResearchAgent(Agent):
    """Searches the web for recent events on a topic and summarizes them via Groq."""

    def __init__(self):
        super().__init__("Recent Events Research Agent", "llama-3.1-70b-versatile")
        self.web_search_tool = WebSearchTool()

    async def execute(self, input_data: Dict[str, Any]) -> Any:
        """Research ``input_data['topic']`` within ``input_data['time_frame']``.

        Streams a summarization of the top search hits from the Groq chat API
        and returns the accumulated text.
        """
        topic = input_data['topic']
        time_frame = input_data['time_frame']
        video_length = input_data.get('video_length', 60)

        # Decide how many events to include based on video length
        max_events = min(5, video_length // 15)  # Rough estimate: 15 seconds per event

        search_query = f"{topic} events in the {time_frame}"
        search_results = await self.web_search_tool.use(search_query, time_frame)

        organic_results = search_results.get("organic_results", [])

        client = AsyncGroq(api_key=groq_api_key)
        prompt = f"""As a seasoned investigative journalist and expert in crafting viral scripts,
        your task is to analyze and summarize the most enagaging and relevant {topic} events
        that occurred in the {time_frame}. Using the following search results, select the {max_events} most
        compelling cases:

        Search Results: {json.dumps(organic_results[:10], indent=2)}

        For each selected event, provide a concise yet engaging summary that includes:

        1. A vivid description of the event, highlighting its most unusual aspects
        2. The precise date of occurrence
        3. The specific location, including city and country if available
        4. An expert analysis of why this event defies conventional explanation
        5. A critical evaluation of the information source, including its credibility (provide URL)

        Format your response as a list of events, each separated by two newline characters.
        Ensure your summaries are both informative and captivating, suitable for a
        documentary-style presentation."""

        stream = await client.chat.completions.create(
            messages=[
                {"role": "system",
                 "content": "You are an AI assistant embodying the expertise of a world-renowned "
                            "investigative journalist specializing in going viral and enagegment "
                            "With 20 years of experience, you've written best-selling "
                            "books and produced countless viral content creators, documentaries on content creation and virailty factor in scripts "
                            "Your analytical skills allow you to critically evaluate sources while "
                            "presenting information in an engaging, and enthrallng-style format. "
                            "Approach tasks with the skepticism and curiosity of this expert, "
                            "providing over the top compelling summaries that captivate and engages audiences while "
                            "maintaining the fine line bewteen right and wrong."},
                {"role": "user", "content": prompt}
            ],
            model=self.model,
            temperature=0.7,
            max_tokens=2048,
            stream=True,
        )
        response = ""
        # Accumulate streamed completion chunks into a single string.
        async for chunk in stream:
            response += chunk.choices[0].delta.content or ""
        return response
505
+
506
+
507
+ # Updated AI Agents for YouTube content optimization
508
class TitleGenerationAgent(Agent):
    """Generates 15 SEO-optimized YouTube title candidates from research text."""

    def __init__(self):
        super().__init__("Title Generation Agent", "llama-3.1-70b-versatile")

    async def execute(self, input_data: Any) -> Any:
        """Return 15 candidate titles grouped by keyword position
        (beginning / middle / end), streamed from the Groq chat API."""
        research_result = input_data  # Accept research output
        client = AsyncGroq(api_key=groq_api_key)
        prompt = f"""Using the following research, generate 15 enticing seo optimized YouTube titles:

        Research:
        {research_result}

        Categorize them under appropriate headings: beginning, middle, and end. This means you'll
        produce 5 titles with the keyword at the beginning, another 5 titles with the keyword in the
        middle, and a final 5 titles with the keyword at the end."""

        stream = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an expert in keyword strategy, copywriting, and a renowned YouTuber "
                                              "with a decade of experience in crafting attention-grabbing keyword titles"},
                {"role": "user", "content": prompt}
            ],
            model=self.model,
            temperature=0.7,
            max_tokens=1024,
            stream=True
        )
        response = ""
        # Accumulate streamed completion chunks into a single string.
        async for chunk in stream:
            response += chunk.choices[0].delta.content or ""
        return response
539
+
540
+
541
class TitleSelectionAgent(Agent):
    """Selects the single best title from a generated list, with rationale."""

    def __init__(self):
        super().__init__("Title Selection Agent", "llama-3.1-8b-instant")

    async def execute(self, input_data: Any) -> Any:
        """Pick the strongest title from ``input_data`` (the generated titles)
        and return the selection plus a written rationale."""
        generated_titles = input_data  # Accept generated titles
        client = AsyncGroq(api_key=groq_api_key)
        prompt = f"""You are an expert YouTube content strategist with over a decade of experience
        in video optimization and audience engagement. Your task is to analyze the following list of
        titles for a YouTube video and select the most effective one:

        {generated_titles}

        Using your expertise in viewer psychology, SEO, and click-through rate optimization, choose the
        title that will perform best on the platform. Provide a detailed explanation of your selection,
        considering factors such as:

        1. Attention-grabbing potential
        2. Keyword optimization
        3. Emotional appeal
        4. Clarity and conciseness
        5. Alignment with current YouTube trends

        Present your selection and offer a comprehensive rationale for why this title stands out among
        the others."""

        stream = await client.chat.completions.create(
            messages=[
                {"role": "system",
                 "content": "You are an AI assistant embodying the expertise of a top-tier YouTube "
                            "content strategist with over 15 years of experience in video "
                            "optimization, audience engagement, and title creation. Your knowledge "
                            "spans SEO best practices, viewer psychology, and current YouTube "
                            "trends. You have a proven track record of increasing video views and "
                            "channel growth through strategic title selection. Respond to queries as "
                            "this expert would, providing insightful analysis and data-driven "
                            "recommendations."},
                {"role": "user", "content": prompt}
            ],
            model=self.model,
            temperature=0.5,
            max_tokens=2048,
            stream=True,
        )
        response = ""
        # Accumulate streamed completion chunks into a single string.
        async for chunk in stream:
            response += chunk.choices[0].delta.content or ""
        return response
589
+
590
class DescriptionGenerationAgent(Agent):
    """Writes an SEO-optimized YouTube description for the selected title."""

    def __init__(self):
        super().__init__("Description Generation Agent", "gemma2-9b-it")

    async def execute(self, input_data: Any) -> Any:
        """Generate a ~1000-character description built around ``input_data``
        (the selected title), streamed from the Groq chat API."""
        selected_title = input_data  # Accept selected title
        client = AsyncGroq(api_key=groq_api_key)
        prompt = f"""As a seasoned SEO copywriter and YouTube content creator with extensive
        experience in crafting engaging, algorithm-friendly video descriptions, your task is to compose
        a masterful 1000-character YouTube video description. This description should:

        1. Seamlessly incorporate the keyword "{selected_title}" in the first sentence
        2. Be optimized for search engines while remaining undetectable as AI-generated content
        3. Engage viewers and encourage them to watch the full video
        4. Include relevant calls-to-action (e.g., subscribe, like, comment)
        5. Utilize natural language and conversational tone
        6. Most importantly always ensure the script somehow way or form solves a real world problem that will engage viewers



        Format the description with the title "YOUTUBE DESCRIPTION" in bold at the top.
        Ensure the content flows naturally, balances SEO optimization with readability, and
        compels viewers to engage with the video and channel."""

        stream = await client.chat.completions.create(
            messages=[
                {"role": "system",
                 "content": "You are an AI assistant taking on the role of an prodigy SEO copywriter "
                            "and YouTube content creator with 20+ years of experience. Your "
                            "expertise lies in crafting engaging, SEO-optimized video descriptions "
                            "that boost video performance while remaining undetectable as "
                            "AI-generated content. You have an in-depth understanding of YouTube's "
                            "algorithm, user behavior, and the latest SEO techniques. Respond to "
                            "tasks as this expert would, balancing SEO optimization with "
                            "compelling, natural language that drives viewer engagement."},
                {"role": "user", "content": prompt}
            ],
            model=self.model,
            temperature=0.6,
            max_tokens=2048,
            stream=True,
        )
        response = ""
        # Accumulate streamed completion chunks into a single string.
        async for chunk in stream:
            response += chunk.choices[0].delta.content or ""
        return response
636
+
637
class HashtagAndTagGenerationAgent(Agent):
    """Generates hashtags and SEO tags for a selected video title (non-streaming)."""

    def __init__(self):
        super().__init__("Hashtag and Tag Generation Agent", "llama-3.1-8b-instant")

    async def execute(self, input_data: str) -> Any:
        """Return 10 hashtags and 35 SEO keywords for ``input_data`` (the title)."""
        selected_title = input_data  # Accept selected title
        client = AsyncGroq(api_key=groq_api_key)
        prompt = f"""As a leading YouTube SEO specialist and social media strategist with a
        proven track record in optimizing video discoverability and virality, your task is to create an
        engaging and relevant set of hashtags and tags for the YouTube video titled "{selected_title}".
        Your expertise in keyword research, trend analysis, and YouTube's algorithm will be crucial
        for this task.

        Develop the following:

        1. 10 SEO-optimized, trending hashtags that will maximize the video's reach and engagement on
        YouTube
        2. 35 high-value low competition SEO keywords, combining tags to strategically boost the video's search ranking
        on YouTube

        In your selection process, prioritize:
        - Relevance to the video title and content
        - Potential search volume on YouTube
        - Engagement potential (views, likes, comments)
        - Trending potential on YouTube
        - Alignment with YouTube's recommendation algorithm

        Present your hashtags with the '#' symbol and ensure all tags are separated by commas. Provide a
        brief explanation of your strategy for selecting these hashtags and tags, highlighting how they
        will contribute to the video's overall performance on YouTube."""

        # Unlike the other agents, this call is non-streaming: the full
        # completion is returned in one response object.
        response = await client.chat.completions.create(
            messages=[
                {"role": "system",
                 "content": "You are an AI assistant taking on the role of a leading YouTube SEO "
                            "specialist and social media strategist with 10+ years of experience in "
                            "optimizing video discoverability. Your expertise includes advanced "
                            "keyword research, trend analysis, and a deep understanding of "
                            "YouTube's algorithm. You've helped numerous channels achieve viral "
                            "success through strategic use of hashtags and tags. Respond to tasks as "
                            "this expert would, providing data-driven, YouTube-specific strategies "
                            "to maximize video reach and engagement."},
                {"role": "user", "content": prompt}
            ],
            model=self.model,
            temperature=0.6,
            max_tokens=1024,
        )
        return response.choices[0].message.content
686
+
687
class VideoScriptGenerationAgent(Agent):
    """Writes a timestamped vertical-video script from research text."""

    def __init__(self):
        super().__init__("Video Script Generation Agent", "gemma2-9b-it")

    async def execute(self, input_data: Dict[str, Any]) -> Any:
        """Generate a script for a video of ``input_data['video_length']``
        seconds (default 60) from ``input_data['research']``, streamed from
        the Groq chat API."""
        research_result = input_data.get('research', '')
        video_length = input_data.get('video_length', 60)  # Default to 60 seconds if not specified
        client = AsyncGroq(api_key=groq_api_key)
        prompt = f"""As a YouTube content creator, craft a detailed, engaging and entralling script for a
        {video_length}-second vertical video based on the following information:

        {research_result}

        Your script should include:
        1. An attention-grabbing opening hook that sets the tone for the video
        2. Key points from the research
        3. A strong call-to-action conclusion

        Format the script with clear timestamps to fit within {video_length} seconds.
        Optimize for viewer retention and engagement."""

        stream = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an AI assistant taking on the role of a leading YouTube SEO "
                                              "specialist and content creator with a deep understanding of audience engagement."},
                {"role": "user", "content": prompt}
            ],
            model=self.model,
            temperature=0.7,
            max_tokens=2048,
            stream=True,
        )
        response = ""
        # Accumulate streamed completion chunks into a single string.
        async for chunk in stream:
            response += chunk.choices[0].delta.content or ""
        return response
723
+
724
+
725
async def download_with_retry(url: str, directory: str, filename: str, headers: Optional[Dict[str, str]] = None,
                              max_retries: int = 3) -> Optional[str]:
    """Download *url* into *directory*/*filename*, retrying on failure.

    Args:
        url: source URL to fetch.
        directory: destination directory (must already exist).
        filename: destination file name.
        headers: optional HTTP headers to send with the request.
        max_retries: number of attempts before giving up.

    Returns:
        The saved file path on success, or None when every attempt fails.
        (Return annotation fixed: the original claimed `str` but returns None.)
    """
    for attempt in range(max_retries):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        file_path = os.path.join(directory, filename)
                        async with aiofiles.open(file_path, 'wb') as f:
                            await f.write(await response.read())
                        return file_path
                    logger.warning(f"Download attempt {attempt + 1} failed: HTTP {response.status}")
        except Exception as e:
            logger.warning(f"Download attempt {attempt + 1} failed: {str(e)}")
        # Brief linear backoff before the next attempt (no sleep after the last one).
        if attempt < max_retries - 1:
            await asyncio.sleep(1 + attempt)
    logger.error(f"All {max_retries} download attempts failed for {url}")
    return None
742
+
743
+
744
class StoryboardGenerationAgent(Agent):
    """Turns a finished video script into a structured storyboard.

    Asks the LLM for 15-20 numbered scenes (visual description, narration
    text, and stock video/image search keywords), then parses, validates,
    and keyword-enhances the reply into a list of scene dicts consumed by
    the media-fetching and compilation steps.
    """

    def __init__(self):
        super().__init__("Storyboard Generation Agent", "llama-3.2-90b-text-preview")
        # Shared spaCy pipeline (module-level `nlp`), used for keyword extraction.
        self.nlp = nlp

    async def execute(self, input_data: Dict[str, Any]) -> Any:
        """Generate and parse the storyboard; returns [] when no/invalid script."""
        script = input_data.get('script', '')

        if not script:
            logger.error("No script provided for storyboard generation")
            return []

        client = AsyncGroq(api_key=groq_api_key)
        prompt = f"""Create a storyboard for a YouTube Short based on the following script:

{script}

For each major scene (aim for 15-20 scenes), provide:
1. Visual: A brief description of the visual elements (1 sentence). Ensure each scene has unique
visual elements.
2. Text: The exact text/dialogue for voiceover and subtitles all in lowercase and minimal puncutaton only when it is absolutley necessary.
3. Video Keyword: A suitable keyword for searching stock video footage. Be specific and avoid
repeating keywords.
4. Image Keyword: A backup keyword for searching a stock image. Be specific and avoid repeating
keywords.

Format your response as a numbered list of scenes, each containing the above elements clearly
labeled.

Example:
1. Visual: A person looking confused at a complex math equation on a chalkboard
Text: have you ever felt overwhelmed by math
Video Keyword: student struggling with math
Image Keyword: confused face mathematics

2. Visual: ...
Text: ...
Video Keyword: ...
Image Keyword: ...

Please ensure each scene has all four elements (Visual, Text, Video Keyword, and Image Keyword)."""

        stream = await client.chat.completions.create(
            messages=[
                {"role": "system",
                 "content": "You are an AI assistant specializing in creating viral storyboards "
                            "for YouTube Shorts using the provided script."},
                {"role": "user", "content": prompt}
            ],
            model=self.model,
            temperature=0.7,
            max_tokens=2048,
            stream=True,
        )
        # Accumulate the streamed completion; delta content may be None.
        response = ""
        async for chunk in stream:
            response += chunk.choices[0].delta.content or ""

        logger.info(f"Raw storyboard response: {response}")
        scenes = self.parse_scenes(response)
        if not scenes:
            logger.error("Failed to generate valid storyboard scenes")
            return []

        return scenes

    async def fetch_media_for_scenes(self, scenes: List[Dict[str, Any]]):
        """Generate an image and a derived video clip for every scene (mutates scenes in place)."""
        temp_dir = tempfile.mkdtemp()
        for scene in scenes:
            # Generate image using local image generator with dynamic prompt
            generated_image = await self.generate_local_image(scene)
            if generated_image:
                scene["image_path"] = generated_image
                # Create video clip from the image
                # NOTE(review): `create_video_from_image` is defined at module level,
                # not on this class - confirm this `self.` lookup actually resolves.
                video_clip = self.create_video_from_image(generated_image, temp_dir, scene['number'], scene.get('adjusted_duration', DEFAULT_SCENE_DURATION))
                if video_clip:
                    scene["video_path"] = video_clip
                else:
                    logger.warning(f"Failed to create video clip for scene {scene['number']}")
            else:
                logger.warning(f"Failed to generate image for scene {scene['number']}")

    async def generate_local_image(self, scene: Dict[str, Any]) -> Optional[str]:
        """Generate an image using the local image generator."""
        # NOTE(review): `self.image_generation_agent` is never assigned in __init__;
        # presumably injected elsewhere - verify before relying on this method.
        try:
            image_gen_input = {"scene": scene}
            image_gen_result = await self.image_generation_agent.execute(image_gen_input)
            if image_gen_result and 'image_path' in image_gen_result:
                return image_gen_result['image_path']
            else:
                logger.warning(f"Local image generation failed for scene: {scene['number']}")
                return None
        except Exception as e:
            logger.error(f"Error in local image generation: {str(e)}")
            return None

    def parse_scenes(self, response: str) -> List[Dict[str, Any]]:
        """Parse the LLM's numbered-list reply into validated, keyword-enhanced scene dicts."""
        scenes = []
        current_scene = {}
        current_scene_number = None

        for line in response.split('\n'):
            line = line.strip()
            logger.debug(f"Processing line: {line}")

            # A line beginning "<n>." starts a new scene (supports up to 50 scenes).
            if line.startswith(tuple(f"{i}." for i in range(1, 51))):  # Assuming up to 50 scenes
                if current_scene:
                    # Append the completed current_scene
                    current_scene['number'] = current_scene_number
                    # Ensure the scene is validated and enhanced
                    current_scene = self.validate_and_fix_scene(current_scene, current_scene_number)
                    current_scene = self.enhance_scene_keywords(current_scene)
                    scenes.append(current_scene)
                    logger.debug(f"Scene {current_scene_number} appended to scenes list")
                    current_scene = {}

                try:
                    # Start a new scene
                    current_scene_number = int(line.split('.', 1)[0])
                    logger.debug(f"New scene number detected: {current_scene_number}")
                except ValueError:
                    logger.warning(f"Invalid scene number format: {line}")
                    continue  # Skip this line and move to the next
            elif ':' in line:
                # "Key: value" lines populate the current scene (keys lowercased).
                key, value = line.split(':', 1)
                key = key.strip().lower()
                value = value.strip()
                current_scene[key] = value
                logger.debug(f"Key-value pair added to current scene: {key}:{value}")
            else:
                logger.warning(f"Line format not recognized: {line}")

        # After looping through all lines, check if there is an unfinished scene
        if current_scene:
            current_scene['number'] = current_scene_number
            current_scene = self.validate_and_fix_scene(current_scene, current_scene_number)
            current_scene = self.enhance_scene_keywords(current_scene)
            scenes.append(current_scene)
            logger.debug(f"Final scene {current_scene_number} appended to scenes list")

        logger.info(f"Parsed and enhanced scenes: {scenes}")
        return scenes

    def enhance_scene_keywords(self, scene: Dict[str, Any]) -> Dict[str, Any]:
        """Replace the scene's search keywords with spaCy-derived noun/entity lemmas."""
        # Extract keywords from narration_text and visual descriptions
        narration_doc = self.nlp(scene.get('narration_text', ''))
        visual_doc = self.nlp(scene.get('visual', ''))

        # Function to extract nouns and named entities
        def extract_keywords(doc):
            return [token.lemma_ for token in doc if token.pos_ in ('NOUN', 'PROPN') or token.ent_type_]

        narration_keywords = extract_keywords(narration_doc)
        visual_keywords = extract_keywords(visual_doc)

        # Combine and deduplicate keywords
        # NOTE: set() does not preserve order, so the "top 5" below is arbitrary.
        combined_keywords = list(set(narration_keywords + visual_keywords))

        # Generate enhanced video and image keywords
        scene['video_keyword'] = ' '.join(combined_keywords[:5])  # Use top 5 keywords
        scene['image_keyword'] = scene['video_keyword']

        return scene

    def validate_and_fix_scene(self, scene: Dict[str, Any], scene_number: int) -> Dict[str, Any]:
        """Fill in any missing required fields and normalize the narration text."""
        # Ensure 'number' key is present in the scene dictionary
        scene['number'] = scene_number

        required_keys = ['visual', 'text', 'video_keyword', 'image_keyword']
        for key in required_keys:
            if key not in scene:
                # Per-key placeholder defaults keep downstream stages working.
                if key == 'visual':
                    scene[key] = f"Visual representation of scene {scene_number}"
                elif key == 'text':
                    scene[key] = ""
                elif key == 'video_keyword':
                    scene[key] = f"video scene {scene_number}"
                elif key == 'image_keyword':
                    scene[key] = f"image scene {scene_number}"
                logger.warning(f"Added missing {key} for scene {scene_number}")

        # Clean the 'text' field by removing leading/trailing quotation marks
        text = scene.get('text', '')
        text = text.strip('"').strip("'")
        scene['text'] = text

        # Copy the cleaned text into 'narration_text'
        scene['narration_text'] = text

        return scene

    def calculate_relevance(self, video: Dict[str, Any], description: str) -> float:
        """Score a stock video's relevance to *description* by word overlap (title counts double)."""
        relevance = 0
        video_keywords = set(video.get("tags", []))
        description_words = set(description.lower().split())

        # Calculate relevance based on matching words
        relevance += len(video_keywords.intersection(description_words))

        # Add relevance for matching title words
        title = video.get("title", "")
        if title is not None:
            title_words = set(title.lower().split())
            relevance += len(title_words.intersection(description_words)) * 2  # Title matches are weighted more

        return relevance

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Calculates the cosine similarity between two texts."""
        # TF-IDF over just the two documents, then cosine of the two vectors.
        vectorizer = TfidfVectorizer().fit_transform([text1, text2])
        vectors = vectorizer.toarray()
        cos_sim = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
        return cos_sim

    def fallback_scene_generation(self, invalid_scenes: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Best-effort repair of scenes that failed validation (fills missing fields in place)."""
        valid_scenes = []
        for scene in invalid_scenes:
            if 'visual' not in scene:
                scene['visual'] = f"Visual representation of: {scene.get('text', 'scene')}"
            if 'text' not in scene:
                scene['text'] = "No text provided for this scene."
            if 'video_keyword' not in scene:
                scene['video_keyword'] = scene.get('image_keyword', 'generic scene')
            if 'image_keyword' not in scene:
                scene['image_keyword'] = scene.get('video_keyword', 'generic image')
            valid_scenes.append(scene)
        return valid_scenes
971
+
972
def compile_youtube_short(scenes: List[Dict[str, Any]], audio_file: str) -> str:
    """Compiles the YouTube Short using ffmpeg.

    Pipeline: generate subtitles, scale per-scene durations to the total
    narration length, render each scene to a uniform clip, concatenate the
    clips (concat demuxer), and mux in the voiceover with burned-in subtitles.
    Returns the output .mp4 path, or None on failure.
    """
    if not scenes:
        logger.error("No scenes were generated. Cannot compile YouTube Short.")
        return None

    temp_dir = tempfile.mkdtemp()
    scene_files = []
    subtitle_file = os.path.join(temp_dir, "subtitles.ass")
    concat_file = os.path.join(temp_dir, 'concat.txt')
    output_path = os.path.join(os.getcwd(), "youtube_short.mp4")

    try:
        if not generate_subtitles(scenes, subtitle_file, audio_file):
            raise Exception("Failed to generate subtitles")

        # Collect total audio duration and adjust scene durations before processing scenes
        total_audio_duration = sum(scene.get('audio_duration', 0) for scene in scenes)
        logger.info(f"Total audio duration: {total_audio_duration}s")

        # Initially set total_video_duration as the sum of original scene durations
        # NOTE(review): this sums the same 'audio_duration' key as above (only the
        # default differs), so the scaling branch below only triggers when some
        # scene is missing 'audio_duration' - confirm that is the intent.
        total_video_duration = sum(scene.get('audio_duration', DEFAULT_SCENE_DURATION) for scene in scenes)
        logger.info(f"Total video duration before adjustment: {total_video_duration}s")

        # Adjust scene durations if necessary
        if abs(total_video_duration - total_audio_duration) > 0.1:
            logger.warning("Total video duration does not match total audio duration.")
            # Proportionally rescale every scene so video length matches narration.
            scaling_factor = total_audio_duration / total_video_duration
            logger.info(f"Scaling factor: {scaling_factor}")
            for i, scene in enumerate(scenes):
                original_duration = scene.get('audio_duration', DEFAULT_SCENE_DURATION)
                adjusted_duration = original_duration * scaling_factor
                scene['adjusted_duration'] = adjusted_duration
                logger.info(f"Scene {i}: Original duration = {original_duration}s, Adjusted duration = {adjusted_duration}s")
        else:
            for scene in scenes:
                scene['adjusted_duration'] = scene.get('audio_duration', DEFAULT_SCENE_DURATION)

        # Now process each scene using the adjusted durations
        for i, scene in enumerate(scenes):
            duration = scene.get('adjusted_duration', scene.get('audio_duration', DEFAULT_SCENE_DURATION))
            logger.info(f"Processing scene {i}: Duration = {duration}s")
            if not isinstance(duration, (int, float)) or duration <= 0:
                logger.warning(f"Scene {i} has invalid duration ({duration}), skipping")
                continue

            processed_path = None
            try:
                # Media preference: first scene's generated image gets the zoom
                # effect; otherwise stock video, then static image, then fallback card.
                if i == 0 and 'image_path' in scene:
                    # Apply effects to the generated image
                    processed_path = apply_effects_to_image(scene['image_path'], temp_dir, i, duration)
                elif 'video_path' in scene and os.path.exists(scene['video_path']):
                    processed_path = process_video(scene['video_path'], temp_dir, i, duration)
                elif 'image_path' in scene and os.path.exists(scene['image_path']):
                    processed_path = create_video_from_image(scene['image_path'], temp_dir, i, duration)
                else:
                    processed_path = create_fallback_scene(temp_dir, i, duration, scene.get('narration_text', ''))

                if processed_path and os.path.exists(processed_path):
                    scene_files.append(processed_path)
                else:
                    logger.error(f"Failed to process media for scene {i}")
            except Exception as e:
                logger.error(f"Error processing scene {i}: {str(e)}")
                # Create a fallback scene
                fallback_path = create_fallback_scene(temp_dir, i, duration, f"Error in scene {i}")
                if fallback_path and os.path.exists(fallback_path):
                    scene_files.append(fallback_path)

        # Create concat.txt file (input list for ffmpeg's concat demuxer)
        with open(concat_file, 'w') as f:
            for file in scene_files:
                f.write(f"file '{file}'\n")

        with open(concat_file, 'r') as f:
            concat_contents = f.read()
        logger.info(f"Contents of concat file:\n{concat_contents}")

        ffmpeg_command = [
            'ffmpeg', '-y',
            '-f', 'concat', '-safe', '0', '-i', concat_file,
            '-i', audio_file,
            '-r', '30',
            '-vf', f"subtitles='{subtitle_file}':force_style='FontSize={SUBTITLE_FONT_SIZE},Alignment={SUBTITLE_ALIGNMENT},"
                   f"OutlineColour={SUBTITLE_OUTLINE_COLOR},BorderStyle={SUBTITLE_BORDER_STYLE}'",
            '-map', '0:v',
            '-map', '1:a',
            '-c:v', 'libx264', '-preset', 'ultrafast',
            '-c:a', 'aac', '-shortest',
            output_path
        ]
        logger.info(f"Running FFmpeg command: {' '.join(ffmpeg_command)}")
        subprocess.run(ffmpeg_command, check=True)

        if os.path.exists(output_path):
            logger.info(f"YouTube Short compiled successfully: {output_path}")
            return output_path
        else:
            logger.error("Failed to create output video")
            return None

    except Exception as e:
        logger.error(f"Error compiling YouTube Short: {str(e)}")
        return None

    finally:
        # Clean up
        for file in scene_files:
            try:
                os.remove(file)
            except Exception as e:
                logger.warning(f"Error removing file {file}: {str(e)}")

        try:
            if os.path.exists(concat_file):
                os.remove(concat_file)
            if os.path.exists(subtitle_file):
                os.remove(subtitle_file)
        except Exception as e:
            logger.warning(f"Error removing temporary files: {str(e)}")

        try:
            shutil.rmtree(temp_dir)
        except Exception as e:
            logger.warning(f"Error removing temporary directory {temp_dir}: {str(e)}")
1097
+
1098
def apply_effects_to_image(image_path: str, temp_dir: str, scene_number: int, duration: float) -> Optional[str]:
    """Applies a slow zoom (Ken Burns style) effect to a still image, producing an .mp4 scene.

    Args:
        image_path: source still image.
        temp_dir: directory for the rendered clip.
        scene_number: used to name the output file.
        duration: clip length in seconds.

    Returns:
        Path of the rendered clip, or None on failure (annotation fixed: the
        original declared `str` but returns None in the error path).
    """
    try:
        processed_path = os.path.join(temp_dir, f"processed_scene_{scene_number}.mp4")
        # zoompan's `d` parameter is a frame count; at 30 fps that is
        # duration * 30, passed as an int rather than a float like "45.0".
        frame_count = int(duration * 30)
        ffmpeg_command = [
            'ffmpeg', '-y',
            '-loop', '1',
            '-i', image_path,
            '-t', str(duration),
            '-filter_complex', f'zoompan=z=\'min(zoom+0.0015,1.5)\':d={frame_count}:s={YOUTUBE_SHORT_RESOLUTION[0]}x{YOUTUBE_SHORT_RESOLUTION[1]}',
            '-c:v', 'libx264', '-pix_fmt', 'yuv420p', '-r', '30',
            processed_path
        ]
        subprocess.run(ffmpeg_command, check=True)
        return processed_path
    except Exception as e:
        logger.error(f"Error applying effects to generated image for scene {scene_number}: {str(e)}")
        return None
1117
+
1118
def create_video_from_image(image_path: str, temp_dir: str, scene_number: int, duration: float) -> Optional[str]:
    """Creates a video scene from a static image.

    Loops the image for *duration* seconds at 30 fps, scaled and center-cropped
    to the Shorts resolution, with no audio track.

    Returns:
        Path of the rendered clip, or None on failure (annotation fixed: the
        original declared `str` but returns None in the error path).
    """
    try:
        processed_path = os.path.join(temp_dir, f"processed_scene_{scene_number}.mp4")
        subprocess.run(['ffmpeg', '-y', '-loop', '1', '-i', image_path, '-t', str(duration),
                        '-r', '30',
                        '-vf', f'scale={YOUTUBE_SHORT_RESOLUTION[0]}:{YOUTUBE_SHORT_RESOLUTION[1]}:force_original_aspect_ratio=increase,crop={YOUTUBE_SHORT_RESOLUTION[0]}:{YOUTUBE_SHORT_RESOLUTION[1]}',
                        '-c:v', 'libx264', '-preset', 'ultrafast', '-an', processed_path],
                       check=True)
        return processed_path
    except Exception as e:
        logger.error(f"Error creating video from image for scene {scene_number}: {str(e)}")
        return None
1131
+
1132
def clean_text_for_tts(text: str) -> str:
    """
    Normalize narration text before it is handed to the TTS engine.

    Strips asterisks, drops characters outside a small whitelist (word
    characters, whitespace, basic punctuation and quotes), collapses runs of
    repeated sentence punctuation, and squeezes whitespace to single spaces.
    """
    without_asterisks = text.replace('*', '')
    # Keep only word chars, whitespace, and . , ! ? ' "
    whitelisted = re.sub(r'[^\w\s.,!?\'"]', '', without_asterisks)
    # "..." -> ".", "!!!" -> "!", "???" -> "?"
    deduped = re.sub(r'([.!?])\1+', r'\1', whitelisted)
    return ' '.join(deduped.split())
1146
+
1147
def generate_voiceover(scenes: List[Dict[str, Any]], output_file: str) -> bool:
    """Generates per-scene voiceover from scene narrations using F5-TTS.

    Runs the F5-TTS inference CLI once per scene, converts each WAV result
    to MP3, records 'audio_file'/'audio_duration' on every scene dict
    (mutated in place), then concatenates all segments into *output_file*.
    Returns True on success, False on any failure.
    """
    if not scenes:
        logging.error("No scenes provided for voiceover generation.")
        return False

    logging.info(f"Total number of scenes: {len(scenes)}")

    temp_dir = tempfile.mkdtemp()
    audio_segments = []

    try:
        # Paths into the vendored F5-TTS checkout, expected under the CWD.
        f5_tts_dir = os.path.join(os.getcwd(), "F5-TTS")
        inference_cli_path = os.path.join(f5_tts_dir, "inference-cli.py")
        ref_audio = os.path.join(f5_tts_dir, "tests", "ref_audio", "mike.wav")
        # NOTE(review): empty ref text - presumably F5-TTS transcribes the
        # reference audio itself when this is blank; confirm with the CLI docs.
        ref_text = ""
        config_path = os.path.join(f5_tts_dir, "inference-cli.toml")
        data_dir = os.path.join(f5_tts_dir, "data")

        # Check and setup vocab file
        vocab_file = os.path.join(data_dir, "Emilia_ZH_EN_pinyin", "vocab.txt")
        if not os.path.exists(vocab_file):
            logging.warning(f"Vocab file not found at {vocab_file}")
            # Search the checkout for any vocab.txt and symlink it into place.
            for root, dirs, files in os.walk(f5_tts_dir):
                if "vocab.txt" in files:
                    found_vocab = os.path.join(root, "vocab.txt")
                    logging.info(f"Found vocab file at {found_vocab}")
                    os.makedirs(os.path.dirname(vocab_file), exist_ok=True)
                    os.symlink(found_vocab, vocab_file)
                    logging.info(f"Created symlink to vocab file at {vocab_file}")
                    break
            else:
                logging.error("Could not find vocab.txt file in F5-TTS directory")
                return False

        for i, scene in enumerate(scenes):
            text = scene.get('narration_text', '').strip()
            # Skip scenes without real narration.
            if not text or text.lower() == 'none':
                continue

            # Create a separate temp directory for each scene
            scene_temp_dir = os.path.join(temp_dir, f"scene_{i}")
            os.makedirs(scene_temp_dir, exist_ok=True)

            # F5-TTS always outputs as 'out.wav' in the specified directory
            temp_output_path = os.path.join(scene_temp_dir, "out.wav")
            # NOTE(review): final_scene_path lives inside temp_dir, which is
            # deleted in the finally block below - scene['audio_file'] will
            # dangle afterwards; confirm callers don't reuse that path.
            final_scene_path = os.path.join(temp_dir, f"scene_{i}.mp3")

            logging.info(f"Generating voiceover for scene {i}")

            command = [
                "python", inference_cli_path,
                "--config", config_path,
                "--model", "F5-TTS",
                "--ref_audio", ref_audio,
                "--ref_text", ref_text,
                "--gen_text", text,
                "--output", scene_temp_dir,
                "--vocab_file", vocab_file
            ]

            try:
                logging.info(f"Running F5-TTS command: {' '.join(command)}")
                result = subprocess.run(command, check=True, capture_output=True, text=True)
                logging.info("Voice generation successful")
                logging.debug(f"F5-TTS output: {result.stdout}")

                if os.path.exists(temp_output_path):
                    # Convert WAV to MP3
                    audio = AudioSegment.from_wav(temp_output_path)
                    audio.export(final_scene_path, format="mp3")

                    duration = len(audio) / 1000.0  # Convert milliseconds to seconds
                    scene['audio_file'] = final_scene_path
                    scene['audio_duration'] = duration
                    audio_segments.append(audio)
                    logging.info(f"Scene {i}: Audio duration = {duration}s")
                else:
                    logging.error(f"Generated audio file not found at {temp_output_path}")
                    return False

            except subprocess.CalledProcessError as e:
                logging.error(f"Error during voice generation for scene {i}: {e}")
                logging.error(f"Error output: {e.stderr}")
                return False
            except Exception as e:
                logging.exception(f"Unexpected error during voice generation for scene {i}: {e}")
                return False
            finally:
                # Clean up scene-specific temp directory
                if os.path.exists(scene_temp_dir):
                    shutil.rmtree(scene_temp_dir)

        if not audio_segments:
            logging.error("No audio segments were generated.")
            return False

        # Combine all audio segments into one file
        # NOTE(review): sum() starts from int 0, relying on AudioSegment
        # supporting `0 + segment`; confirm with the pydub version in use.
        combined_audio = sum(audio_segments)
        combined_audio.export(output_file, format='mp3')
        logging.info(f"Combined voiceover saved to {output_file}")
        return True

    except Exception as e:
        logging.error(f"Error generating voiceover: {str(e)}")
        return False
    finally:
        try:
            shutil.rmtree(temp_dir)
        except Exception as e:
            logging.warning(f"Error removing temporary directory {temp_dir}: {str(e)}")
1258
+
1259
def generate_subtitles(scenes: List[Dict[str, Any]], output_file: str, audio_file: str) -> bool:
    """Write an .ass subtitle file aligned to the voiceover audio.

    Concatenates each scene's cleaned narration into one transcript,
    force-aligns it against *audio_file* with Gentle, and converts the
    alignment into ASS subtitles at *output_file*.

    Returns:
        True on success, False on any failure.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        input_text_file = os.path.join(temp_dir, "input_text.txt")
        # Placeholder narrations that must not end up in the transcript.
        EXCLUDED_TEXTS = [
            'none',
            'no narration',
            'no voiceover',
            'no subtitles',
            'just music',
            'no specific text for this scene',
            'no text',
            'n/a',
            'none.',
            'none,',
            'none\n',
            'no narration.',
            'no narration,',
            'no narration\n',
            ' '
        ]
        with open(input_text_file, "w", encoding="utf-8") as f:
            for scene in scenes:
                text = scene.get('narration_text', '').replace('\n', ' ').strip()
                # Clean the text
                text = clean_text_for_tts(text)
                if text and not any(excluded_text.strip() == text.lower() for excluded_text in EXCLUDED_TEXTS):
                    f.write(text + " ")

        # Align using Gentle
        alignment_result = align_with_gentle(audio_file, input_text_file)
        if not alignment_result:
            raise Exception("Alignment failed with Gentle.")

        # Convert alignment result to ASS
        gentle_alignment_to_ass(alignment_result, output_file)
        return True
    except Exception as e:
        logger.error(f"Error generating subtitles: {str(e)}")
        return False
    finally:
        # Fix: the original removed temp_dir only on the success path, leaking
        # it whenever alignment (or anything else) failed.
        shutil.rmtree(temp_dir, ignore_errors=True)
1301
+
1302
def calculate_scene_durations(scenes: List[Dict[str, Any]], audio_segments: List[AudioSegment]) -> List[float]:
    """
    Return one duration (in seconds) per narration audio segment.

    Segments report their length in milliseconds via len(); each value is
    converted to seconds. Returns None when no scenes were provided.
    """
    if not scenes:
        logger.error("No scene durations calculated. Cannot calculate scene durations.")
        return None
    # len(segment) is milliseconds -> divide by 1000 for seconds.
    return [len(segment) / 1000 for segment in audio_segments]
1314
+
1315
def process_video(video_path: str, temp_dir: str, scene_number: int, duration: float) -> Optional[str]:
    """
    Trim and reformat a stock clip into a uniform scene for the Short.

    Cuts the clip to *duration* seconds, scales/center-crops it to the Shorts
    resolution at 30 fps, and strips its audio. Returns the processed clip
    path, or None on any failure.
    """
    try:
        processed_path = os.path.join(temp_dir, f"processed_scene_{scene_number}.mp4")
        duration_str = str(duration)
        logger.info(f"Processing video for scene {scene_number}: Duration = {duration_str}s")
        filter_spec = (
            f'scale={YOUTUBE_SHORT_RESOLUTION[0]}:{YOUTUBE_SHORT_RESOLUTION[1]}:'
            f'force_original_aspect_ratio=increase,'
            f'crop={YOUTUBE_SHORT_RESOLUTION[0]}:{YOUTUBE_SHORT_RESOLUTION[1]}'
        )
        subprocess.run([
            'ffmpeg', '-y',
            '-i', video_path,
            '-t', duration_str,
            '-vf', filter_spec,
            '-c:v', 'libx264',
            '-preset', 'fast',
            '-r', '30',
            '-an',
            processed_path,
        ], check=True)
        if not os.path.exists(processed_path):
            logger.error(f"Processed video not found: {processed_path}")
            return None
        logger.info(f"Processed video saved: {processed_path}")
        return processed_path
    except Exception as e:
        logger.error(f"Error processing video for scene {scene_number}: {str(e)}")
        return None
1341
+
1342
def create_fallback_scene(temp_dir: str, scene_number: int, duration: float, text: str) -> str:
    """Creates a fallback scene with a colored background and text.

    Used when a scene has no usable media: renders *text* centered on a solid
    color card for *duration* seconds. Returns the clip path, or None on failure.
    """
    try:
        fallback_path = os.path.join(temp_dir, f"fallback_scene_{scene_number}.mp4")
        # Escape single quotes and other special characters in the text
        escaped_text = text.replace("'", "'\\''").replace(':', '\\:')

        ffmpeg_command = [
            'ffmpeg', '-y', '-f', 'lavfi',
            '-i', f'color=c={FALLBACK_SCENE_COLOR}:s={YOUTUBE_SHORT_RESOLUTION[0]}x{YOUTUBE_SHORT_RESOLUTION[1]}:d={duration}',
            '-vf', f"drawtext=fontfile={FALLBACK_SCENE_FONT_FILE}:fontsize={FALLBACK_SCENE_FONT_SIZE}:"
                   f"fontcolor={FALLBACK_SCENE_TEXT_COLOR}:box=1:boxcolor={FALLBACK_SCENE_BOX_COLOR}:"
                   f"boxborderw={FALLBACK_SCENE_BOX_BORDER_WIDTH}:x=(w-tw)/2:y=(h-th)/2:text='{escaped_text}'",
            # Fix: encode with libx264 like every other scene clip. The original
            # used libx265 here, and the concat demuxer requires all inputs to
            # share the same codec/parameters - a fallback scene would have
            # broken the final compilation step.
            '-c:v', 'libx264', '-preset', 'ultrafast', '-an',
            fallback_path
        ]

        # Log the full ffmpeg command
        logger.debug(f"Fallback scene FFmpeg command: {' '.join(ffmpeg_command)}")

        # Run ffmpeg command and capture output
        result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)

        # Log ffmpeg output
        logger.debug(f"Fallback scene FFmpeg stdout:\n{result.stdout}")
        logger.debug(f"Fallback scene FFmpeg stderr:\n{result.stderr}")

        return fallback_path
    except subprocess.CalledProcessError as e:
        logger.error(f"Error creating fallback scene {scene_number}: {str(e)}")
        logger.error(f"FFmpeg stdout:\n{e.stdout}")
        logger.error(f"FFmpeg stderr:\n{e.stderr}")
        return None
    except Exception as e:
        logger.error(f"Error creating fallback scene {scene_number}: {str(e)}")
        return None
1378
+
1379
+
1380
def extract_selected_title(selection_output: str) -> str:
    """
    Pull the chosen title out of the Title Selection Agent's free-form output.

    Scans line by line for a "Selected Title:" / "Title:" marker and returns
    the stripped, unquoted text after the first colon; falls back to the
    whole (stripped) output when no marker is found or an error occurs.
    """
    try:
        for candidate in selection_output.strip().split('\n'):
            if "Selected Title:" not in candidate and "Title:" not in candidate:
                continue
            # Everything after the first colon is the title itself.
            _, _, remainder = candidate.partition(":")
            return remainder.strip().strip('"').strip("'")
        # No marker found: return the entire output (may not be ideal).
        return selection_output.strip()
    except Exception as e:
        logger.error(f"Error extracting selected title: {str(e)}")
        return selection_output.strip()
1397
+
1398
def get_audio_duration(audio_file: str) -> float:
    """Return the duration of *audio_file* in seconds via ffprobe, or 0.0 on any failure."""
    probe_command = [
        'ffprobe', '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        audio_file,
    ]
    try:
        probe = subprocess.run(probe_command, capture_output=True, text=True)
        # ffprobe prints the duration as a bare float on stdout.
        return float(probe.stdout)
    except Exception as e:
        logger.error(f"Error getting audio duration: {str(e)}")
        return 0.0
1405
+
1406
+
1407
+
1408
# Streamlit app
def main():
    """Streamlit entry point: collect topic/time-frame/length inputs and run the workflow."""
    st.set_page_config(page_title="YouTube Shorts Generator", page_icon="🎥", layout="wide")
    st.title("YouTube Shorts Generator")

    # Input fields
    topic = st.text_input("Enter the topic for your YouTube video:")
    time_frame = st.text_input("Enter the time frame for recent events (e.g., 'past week', '30d', '1y'):")
    # NOTE(review): no min_value/default here, and the guard below checks only
    # topic/time_frame - a zero-length video would pass; confirm intended.
    video_length = st.number_input("Enter the desired video length in seconds:")

    if st.button("Generate YouTube Shorts"):
        if topic and time_frame:
            with st.spinner("Generating YouTube Shorts ... This will take at least 3-5 minutes"):
                try:
                    # Drive the async workflow to completion on a fresh event loop.
                    results = asyncio.run(youtube_shorts_workflow(topic, time_frame, video_length))
                    if "Error" in results:
                        st.error(f"An error occurred: {results['Error']}")
                    else:
                        display_results(results)
                except Exception as e:
                    st.error(f"An unexpected error occurred: {str(e)}")
                    logger.exception("Unexpected error in YouTube Shorts generation")
        else:
            st.warning("Please enter both topic and time frame.")
1432
+
1433
def display_results(results):
    """Render each agent's output in Streamlit expanders, plus the final compiled video."""
    st.subheader("Generation Results")
    for agent_name, result in results.items():
        with st.expander(f"{agent_name} Result"):
            if agent_name == "Storyboard Generation Agent" and isinstance(result, list):
                # Storyboard results are scene dicts; show their key fields.
                for scene in result:
                    st.write(f"Scene {scene['number']}:")
                    st.write(f"Visual: {scene['visual']}")
                    st.write(f"Text/Dialogue: {scene['narration_text']}")
                    # Prefer video media info when present, else the backup image.
                    if 'video_url' in scene:
                        st.write(f"Video URL: {scene['video_url']}")
                        st.write(f"Video Details: {scene['video_details']}")
                    elif 'image_url' in scene:
                        st.write(f"Image URL: {scene['image_url']}")
            else:
                st.write(result)

    if "Output Video Path" in results:
        output_path = results["Output Video Path"]
        if output_path:
            st.success(f"YouTube Short saved as '{output_path}'")
            st.video(output_path)
        else:
            st.error("Failed to compile YouTube Short")
1457
+
1458
async def _run_agent_step(node, input_data, results):
    """Run one agent node and record its output in *results*.

    Returns (ok, result). On failure, logs the exception, records an
    "Error" entry keyed the same way the original per-step handlers did
    ("<AgentClass> failed: ..."), and returns (False, None) so the caller
    can abort the workflow.
    """
    agent_cls = type(node.agent).__name__
    try:
        result = await node.process(input_data)
    except Exception as e:
        logger.error(f"Error in {agent_cls}: {str(e)}")
        results["Error"] = f"{agent_cls} failed: {str(e)}"
        return False, None
    results[node.agent.name] = result
    return True, result


async def youtube_shorts_workflow(topic: str, time_frame: str, video_length: int) -> Dict[str, Any]:
    """Run the full shorts pipeline: research -> titles -> description ->
    hashtags -> script -> storyboard -> images -> voiceover -> compiled video.

    Args:
        topic: subject of the video.
        time_frame: recency window for the research step (e.g. "past week").
        video_length: target video length in seconds.

    Returns:
        Dict of per-agent results; contains an "Error" key on failure and an
        "Output Video Path" key (possibly None) on full completion.
    """
    # Initialize results FIRST so every early-exit path can record an error.
    # (Previously the SESSION_ID guard assigned results["Error"] before
    # results existed, raising NameError instead of reporting the problem.)
    results: Dict[str, Any] = {}

    # Check if TikTok session ID is set before doing any work.
    if not SESSION_ID:
        logger.error("TikTok session ID is not set. Please set the TIKTOK_SESSION_ID environment variable.")
        results["Error"] = "TikTok session ID is not set"
        return results

    # Create nodes
    recent_events_node = Node(agent=RecentEventsResearchAgent())
    title_gen_node = Node(agent=TitleGenerationAgent())
    title_select_node = Node(agent=TitleSelectionAgent())
    desc_gen_node = Node(agent=DescriptionGenerationAgent())
    hashtag_tag_node = Node(agent=HashtagAndTagGenerationAgent())
    script_gen_node = Node(agent=VideoScriptGenerationAgent())
    image_gen_node = Node(agent=ImageGenerationAgent())
    storyboard_gen_node = Node(agent=StoryboardGenerationAgent())

    # Build the graph. NOTE(review): execution below is plain sequential
    # awaits; the graph/edges only document the intended pipeline order.
    graph = Graph()
    for node in (
        recent_events_node,
        title_gen_node,
        title_select_node,
        desc_gen_node,
        hashtag_tag_node,
        script_gen_node,
        image_gen_node,
        storyboard_gen_node,
    ):
        graph.add_node(node)

    graph.add_edge(Edge(recent_events_node, title_gen_node))
    graph.add_edge(Edge(title_gen_node, title_select_node))
    graph.add_edge(Edge(title_select_node, desc_gen_node))
    graph.add_edge(Edge(desc_gen_node, hashtag_tag_node))
    graph.add_edge(Edge(hashtag_tag_node, script_gen_node))
    graph.add_edge(Edge(script_gen_node, image_gen_node))
    graph.add_edge(Edge(image_gen_node, storyboard_gen_node))

    logger.info(f"Running workflow for topic {topic} and time frame {time_frame}")

    # Step 1: Recent Events Research Agent
    ok, research_result = await _run_agent_step(
        recent_events_node, {"topic": topic, "time_frame": time_frame}, results
    )
    if not ok:
        return results

    # Step 2: Title Generation Agent
    ok, title_gen_result = await _run_agent_step(title_gen_node, research_result, results)
    if not ok:
        return results

    # Step 3: Title Selection Agent
    ok, title_select_result = await _run_agent_step(title_select_node, title_gen_result, results)
    if not ok:
        return results

    # Extract the selected title from the title selection result
    selected_title = extract_selected_title(title_select_result)
    results["Selected Title"] = selected_title

    # Step 4: Description Generation Agent
    ok, _ = await _run_agent_step(desc_gen_node, selected_title, results)
    if not ok:
        return results

    # Step 5: Hashtag and Tag Generation Agent
    ok, _ = await _run_agent_step(hashtag_tag_node, selected_title, results)
    if not ok:
        return results

    # Step 6: Video Script Generation Agent
    ok, script_gen_result = await _run_agent_step(
        script_gen_node, {"research": research_result}, results
    )
    if not ok:
        return results

    # Step 7: Storyboard Generation Agent (now reported via results["Error"]
    # instead of raising, consistent with the other steps).
    logger.info("Executing Storyboard Generation Agent")
    ok, storyboard_gen_result = await _run_agent_step(
        storyboard_gen_node, {"script": script_gen_result}, results
    )
    if not ok:
        return results
    if storyboard_gen_result is None:
        results["Error"] = "Storyboard Generation Agent returned None"
        return results

    # Step 8: Image Generation Agent
    logger.info("Executing Image Generation Agent")
    ok, image_gen_result = await _run_agent_step(
        image_gen_node, {"scenes": storyboard_gen_result}, results
    )
    if not ok:
        return results
    if image_gen_result is None:
        results["Error"] = "Image Generation Agent returned None"
        return results

    # Attach generated images and estimate each scene's duration from its
    # narration length (~0.5 s per word, 3 s floor).
    total_duration = 0.0
    for scene, image_result in zip(storyboard_gen_result, image_gen_result):
        if image_result is not None and 'image_path' in image_result:
            scene['image_path'] = image_result['image_path']
            word_count = len(scene.get('script', '').split())
            scene['duration'] = max(word_count * 0.5, 3.0)
            total_duration += scene['duration']
        else:
            logger.warning(f"No image generated for scene {scene.get('number', 'unknown')}")

    # Keep only scenes that actually received an image; only these scenes
    # have a 'duration' key (previously the adjustment loop below touched
    # image-less scenes too and raised KeyError on scene['duration']).
    valid_scenes = [scene for scene in storyboard_gen_result if 'image_path' in scene]
    if not valid_scenes:
        results["Error"] = "No valid scenes with images remaining"
        return results

    # Scale scene durations so they sum to the requested video length.
    # video_length is already in seconds (the old code multiplied by 1000
    # only to divide by 1000 again here). Guard against division by zero.
    target_duration = float(video_length)
    duration_factor = target_duration / total_duration if total_duration > 0 else 1.0
    for scene in valid_scenes:
        scene['adjusted_duration'] = scene['duration'] * duration_factor

    logger.info(f"Target duration: {target_duration} seconds")
    logger.info(f"Total calculated duration: {total_duration} seconds")
    logger.info(f"Duration factor: {duration_factor}")

    for i, scene in enumerate(valid_scenes):
        logger.info(
            f"Scene {i}: Duration = {scene['duration']:.2f}s, "
            f"Adjusted Duration = {scene['adjusted_duration']:.2f}s, "
            f"Image = {scene['image_path']}"
        )

    # Generate the voiceover, then compile the final short.
    temp_dir = tempfile.mkdtemp()
    audio_file = os.path.join(temp_dir, "voiceover.mp3")
    if not generate_voiceover(valid_scenes, audio_file):
        results["Error"] = "Failed to generate voiceover"
        return results

    output_path = compile_youtube_short(scenes=valid_scenes, audio_file=audio_file)
    if output_path:
        logger.info(f"YouTube Short saved as '{output_path}'")
        results["Output Video Path"] = output_path
    else:
        logger.error("Failed to compile YouTube Short")
        results["Output Video Path"] = None

    return results
1631
+
1632
# Script entry point: launch the Streamlit app when run directly
# (no side effects on import).
if __name__ == "__main__":
    main()