Update app.py
Browse files
app.py
CHANGED
|
@@ -130,41 +130,61 @@ async def get_video_url(youtube_url: str):
|
|
| 130 |
@app.get("/script")
|
| 131 |
async def get_transcript(youtube_url: str, language: str = None):
|
| 132 |
try:
|
| 133 |
-
#
|
| 134 |
ydl_opts = {
|
| 135 |
'skip_download': True,
|
| 136 |
'writesubtitles': True,
|
| 137 |
'writeautomaticsub': True,
|
| 138 |
-
'subtitleslangs': ['all'] if not language else [language],
|
| 139 |
-
'subtitlesformat': 'best',
|
| 140 |
'outtmpl': '%(id)s.%(ext)s',
|
| 141 |
'noplaylist': True,
|
| 142 |
'cookiefile': "firefox-cookies.txt"
|
| 143 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
env_to_cookies_from_env("firefox-cookies.txt")
|
| 145 |
-
# Show current directory structure before download
|
| 146 |
logger.info(f"Current directory files (before): {os.listdir('.')}")
|
| 147 |
|
| 148 |
-
|
| 149 |
-
|
|
|
|
| 150 |
video_id = info['id']
|
| 151 |
logger.info(f"Video ID: {video_id}")
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
# Check actual downloaded files
|
| 154 |
logger.info(f"Current directory files (after extraction): {os.listdir('.')}")
|
| 155 |
|
| 156 |
-
#
|
| 157 |
-
subtitle_files = [
|
| 158 |
-
|
| 159 |
-
subtitle_files = [f for f in os.listdir('.')
|
| 160 |
-
if f.startswith(video_id) and (language in f)]
|
| 161 |
|
| 162 |
-
# If
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
any(f.endswith(ext) for ext in ['.vtt', '.srt', '.ttml', '.json3'])]
|
| 168 |
|
| 169 |
logger.info(f"Potential subtitle files: {subtitle_files}")
|
| 170 |
|
|
@@ -197,7 +217,15 @@ async def get_transcript(youtube_url: str, language: str = None):
|
|
| 197 |
else:
|
| 198 |
text = f"Unsupported format: {subtitle_file}"
|
| 199 |
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
return {"transcript": f"No subtitle files found for {video_id}", "language": "none"}
|
| 203 |
except Exception as e:
|
|
|
|
| 130 |
@app.get("/script")
|
| 131 |
async def get_transcript(youtube_url: str, language: str = None):
|
| 132 |
try:
|
| 133 |
+
# Set up yt-dlp options
|
| 134 |
ydl_opts = {
|
| 135 |
'skip_download': True,
|
| 136 |
'writesubtitles': True,
|
| 137 |
'writeautomaticsub': True,
|
|
|
|
|
|
|
| 138 |
'outtmpl': '%(id)s.%(ext)s',
|
| 139 |
'noplaylist': True,
|
| 140 |
'cookiefile': "firefox-cookies.txt"
|
| 141 |
}
|
| 142 |
+
|
| 143 |
+
# If a language is specified, only download that language
|
| 144 |
+
# Otherwise, we'll first get video info to determine the original language
|
| 145 |
+
if language:
|
| 146 |
+
ydl_opts['subtitleslangs'] = [language]
|
| 147 |
+
|
| 148 |
env_to_cookies_from_env("firefox-cookies.txt")
|
|
|
|
| 149 |
logger.info(f"Current directory files (before): {os.listdir('.')}")
|
| 150 |
|
| 151 |
+
# First, get video info without downloading anything
|
| 152 |
+
with yt_dlp.YoutubeDL({**ydl_opts, 'skip_download': True, 'writesubtitles': False, 'writeautomaticsub': False}) as ydl:
|
| 153 |
+
info = ydl.extract_info(youtube_url, download=False)
|
| 154 |
video_id = info['id']
|
| 155 |
logger.info(f"Video ID: {video_id}")
|
| 156 |
|
| 157 |
+
# If no language specified, try to use the original language
|
| 158 |
+
if not language:
|
| 159 |
+
# Try to determine the original language if available in the info
|
| 160 |
+
if 'subtitles' in info and info['subtitles']:
|
| 161 |
+
# Use the first available subtitle language
|
| 162 |
+
available_languages = list(info['subtitles'].keys())
|
| 163 |
+
if available_languages:
|
| 164 |
+
language = available_languages[0]
|
| 165 |
+
logger.info(f"Using detected language: {language}")
|
| 166 |
+
ydl_opts['subtitleslangs'] = [language]
|
| 167 |
+
else:
|
| 168 |
+
# Fall back to 'en' if the language can't be determined
|
| 169 |
+
language = 'en'
|
| 170 |
+
ydl_opts['subtitleslangs'] = [language]
|
| 171 |
+
|
| 172 |
+
# Now download the subtitle in the selected language
|
| 173 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 174 |
+
ydl.extract_info(youtube_url, download=True)
|
| 175 |
+
|
| 176 |
# Check actual downloaded files
|
| 177 |
logger.info(f"Current directory files (after extraction): {os.listdir('.')}")
|
| 178 |
|
| 179 |
+
# Collect every downloaded file for this video whose name contains a subtitle extension
|
| 180 |
+
subtitle_files = [f for f in os.listdir('.')
|
| 181 |
+
if f.startswith(video_id) and any(ext in f for ext in ['.vtt', '.srt', '.ttml', '.json3'])]
|
|
|
|
|
|
|
| 182 |
|
| 183 |
+
# If a language is set, prefer files whose name contains it; keep all candidates if none match
|
| 184 |
+
if language:
|
| 185 |
+
lang_subtitle_files = [f for f in subtitle_files if language in f]
|
| 186 |
+
if lang_subtitle_files:
|
| 187 |
+
subtitle_files = lang_subtitle_files
|
|
|
|
| 188 |
|
| 189 |
logger.info(f"Potential subtitle files: {subtitle_files}")
|
| 190 |
|
|
|
|
| 217 |
else:
|
| 218 |
text = f"Unsupported format: {subtitle_file}"
|
| 219 |
|
| 220 |
+
# Clean up files to avoid cluttering the directory
|
| 221 |
+
for f in subtitle_files:
|
| 222 |
+
try:
|
| 223 |
+
os.remove(f)
|
| 224 |
+
except:
|
| 225 |
+
logger.warning(f"Could not remove file: {f}")
|
| 226 |
+
|
| 227 |
+
detected_language = subtitle_file.split('.')[-2] if '.' in subtitle_file else "unknown"
|
| 228 |
+
return {"transcript": text, "language": detected_language}
|
| 229 |
|
| 230 |
return {"transcript": f"No subtitle files found for {video_id}", "language": "none"}
|
| 231 |
except Exception as e:
|