Spaces:
Sleeping
Sleeping
Add get subtitle with type support.
Browse files- fetchYoutubeSubtitle.py +85 -16
- main.py +2 -2
fetchYoutubeSubtitle.py
CHANGED
|
@@ -1,16 +1,51 @@
|
|
| 1 |
import json
|
|
|
|
|
|
|
| 2 |
from typing import Optional
|
|
|
|
|
|
|
| 3 |
import yt_dlp
|
| 4 |
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
langs = item.keys()
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
return subtitle.get("url")
|
| 11 |
return None
|
| 12 |
|
| 13 |
-
async def fetchSubtitle(url: str, lang: Optional[str] = 'en',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
ydl_opts = {
|
| 15 |
"writesubtitles": True,
|
| 16 |
"allsubtitles": True,
|
|
@@ -21,19 +56,53 @@ async def fetchSubtitle(url: str, lang: Optional[str] = 'en', vttType="vtt") ->
|
|
| 21 |
|
| 22 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 23 |
info_dict = ydl.extract_info(url, download=False)
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
subtitle_url = getVttUrlFromSubtitles(info_dict.get("automatic_captions"), lang, vttType)
|
| 32 |
-
if subtitle_url:
|
| 33 |
-
with ydl.urlopen(subtitle_url) as subtitle:
|
| 34 |
-
return subtitle.read().decode()
|
| 35 |
return None
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
async def fetchSubtitleUrls(url: str) -> json:
|
| 38 |
ydl_opts = {
|
| 39 |
"writesubtitles": True,
|
|
|
|
| 1 |
import json
|
| 2 |
+
import math
|
| 3 |
+
import time
|
| 4 |
from typing import Optional
|
| 5 |
+
import xml.etree.ElementTree as ElementTree
|
| 6 |
+
from html import unescape
|
| 7 |
import yt_dlp
|
| 8 |
|
| 9 |
+
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt; "xml" is the YouTube caption URL without its fmt= argument
|
| 10 |
+
|
| 11 |
+
# "subtitles": {
|
| 12 |
+
# "live_chat": [
|
| 13 |
+
# {
|
| 14 |
+
# "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
|
| 15 |
+
# "ext": "json",
|
| 16 |
+
# "video_id": "ANtM2bHRz04",
|
| 17 |
+
# "protocol": "youtube_live_chat_replay"
|
| 18 |
+
# }
|
| 19 |
+
# ]
|
| 20 |
+
# }
|
| 21 |
+
def getUrlFromSubtitles(item, lang='en', subType="vtt"):
    """Pick a subtitle download URL out of a language -> tracks mapping.

    :param item:
        Mapping of language code to a list of track dicts; each track is
        read through its "ext" and "url" keys. ``None`` or an empty mapping
        yields ``None``.
    :param str lang:
        Preferred language. Falls back to 'en', then to the first language
        present in ``item``.
    :param str subType:
        Wanted track extension. The special value "xml" returns the first
        track URL with its ``fmt=<ext>`` argument removed (not applied to
        the "live_chat" entry).
    :returns:
        The matching URL, or ``None`` when nothing suitable is found.
    """
    if not item:
        return None

    if lang in item:
        chosen = lang
    elif 'en' in item:
        chosen = 'en'
    else:
        chosen = next(iter(item))

    want_raw_xml = chosen != "live_chat" and subType == "xml"
    for track in item[chosen]:
        track_url = track.get("url")
        if want_raw_xml:
            if not track_url:
                # A track without a URL cannot be rewritten; try the next one.
                continue
            ext = track.get("ext")
            # Dropping the fmt argument makes the endpoint answer with XML.
            return track_url.replace("fmt=" + ext, "") if ext else track_url
        if track.get("ext") == subType:
            return track_url
    return None
|
| 36 |
|
| 37 |
+
async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt") -> Optional[str]:
    """Fetch a subtitle track for ``url`` as decoded text.

    For every type except "srt" the request is handed straight to
    ``fetchSubtitlebyType``. For "srt" a native srt track is tried first;
    when none is returned, the "xml" track is fetched and converted with
    ``xml_caption_to_srt``.

    :param str url: Video URL passed through to ``fetchSubtitlebyType``.
    :param lang: Preferred subtitle language.
    :param subType: Requested subtitle type.
    :returns: Subtitle text, or ``None`` when no track could be fetched.
    """
    if subType != "srt":
        return await fetchSubtitlebyType(url, lang, subType, True)

    subtitle = await fetchSubtitlebyType(url, lang, subType, True)
    if subtitle:
        return subtitle

    xml_captions = await fetchSubtitlebyType(url, lang, "xml", True)
    if not xml_captions:
        # Nothing to convert; parsing None would raise inside ElementTree.
        return None
    return xml_caption_to_srt(xml_captions)
|
| 47 |
+
|
| 48 |
+
async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType="vtt", decode: bool = False) -> Optional[str]:
|
| 49 |
ydl_opts = {
|
| 50 |
"writesubtitles": True,
|
| 51 |
"allsubtitles": True,
|
|
|
|
| 56 |
|
| 57 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 58 |
info_dict = ydl.extract_info(url, download=False)
|
| 59 |
+
for subtitle_item in ["subtitles", "automatic_captions"]: # "requested_subtitles" item is dict
|
| 60 |
+
if info_dict.get(subtitle_item) :
|
| 61 |
+
subtitle_url = getUrlFromSubtitles(info_dict.get(subtitle_item), lang, subType)
|
| 62 |
+
if subtitle_url:
|
| 63 |
+
with ydl.urlopen(subtitle_url) as subtitle:
|
| 64 |
+
return subtitle.read().decode() if decode else subtitle.read()
|
| 65 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
return None
|
| 67 |
|
| 68 |
+
def float_to_srt_time_format(d: float) -> str:
    """Convert a time offset in seconds into an srt timestamp.

    :param float d:
        Offset in seconds; fractions are rounded to whole milliseconds and
        negative values are clamped to zero.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration, ``HH:MM:SS,mmm``.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    # Work in whole milliseconds so a fraction that rounds up carries into
    # the seconds field (3.9996 -> 00:00:04,000), and so hours are not
    # wrapped at 24 the way a calendar-time conversion would.
    total_ms = max(0, round(d * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
|
| 79 |
+
|
| 80 |
+
def xml_caption_to_srt(xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Each direct child of the document root becomes one numbered srt cue.
    The cue starts at the child's ``start`` attribute and lasts for its
    ``dur`` attribute (zero when ``dur`` is absent).

    :param str xml_captions:
        XML formatted caption tracks.
    :rtype: str
    :returns:
        The cues joined by blank lines, without surrounding whitespace.
    """
    segments = []
    root = ElementTree.fromstring(xml_captions)
    for sequence_number, child in enumerate(root, start=1):
        raw_text = child.text or ""
        # Collapse newlines and runs of whitespace into single spaces
        # before resolving HTML entities.
        caption = unescape(" ".join(raw_text.split()))
        start = float(child.attrib["start"])
        end = start + float(child.attrib.get("dur", 0.0))
        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )
    return "\n".join(segments).strip()
|
| 105 |
+
|
| 106 |
async def fetchSubtitleUrls(url: str) -> json:
|
| 107 |
ydl_opts = {
|
| 108 |
"writesubtitles": True,
|
main.py
CHANGED
|
@@ -15,8 +15,8 @@ def read_json():
|
|
| 15 |
|
| 16 |
|
| 17 |
@app.get("/subtitle/")
|
| 18 |
-
async def get_subtitle(url: str):
|
| 19 |
-
subtitle = await fetchSubtitle(url)
|
| 20 |
return JSONResponse(content=subtitle)
|
| 21 |
|
| 22 |
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
@app.get("/subtitle/")
async def get_subtitle(url: str, subtype: str = "srt"):
    """Return the subtitle for ``url`` in the requested type as JSON.

    ``subtype`` is forwarded to ``fetchSubtitle`` as ``subType``; whatever
    that coroutine returns becomes the JSON response content.
    """
    content = await fetchSubtitle(url, subType=subtype)
    return JSONResponse(content=content)
|
| 21 |
|
| 22 |
|