ytdlp_subtitle_dev

Sleeping

App Files Files Community

lanbogao committed on Aug 4, 2023

Commit

eb92911

1 Parent(s): 5c48e61

1. Add a retry when extracting video info; on retry, use the proxy if one is configured.

Browse files

2. Enable the proxy by default for the subtitle-urls and subtitle-dl URLs; disable the proxy by default for the subtitle URL.

Files changed (3) hide show

fetchYoutubeSubtitle.py +168 -118
main.py +5 -0
requirements.txt +2 -1

fetchYoutubeSubtitle.py CHANGED Viewed

@@ -7,12 +7,27 @@ import traceback
 from typing import Optional
 import xml.etree.ElementTree as ElementTree
 from html import unescape
-import yt_dlp
 debug = os.getenv("DEBUG")
 # yt-dlp subtitle types: json3,srv1,srv2,srv3,ttml,vtt, xml(youtube url without extargs)
 # "subtitles": {
 #     "live_chat": [
 #       {
@@ -63,7 +78,7 @@ def getSubtitleOptions(
     if proxy:
         ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
-    # print(ydl_opts)
     return ydl_opts
@@ -179,72 +194,68 @@ async def fetchAnySubtitle(
     url: str,
     lang: Optional[str] = "en",
     subType: Optional[str] = "vtt",
-    skipEmpty: bool = True,
     proxy: Optional[str] = None,
 ) -> dict:
     # lang-code or lang.* .* is regex
     # reqLang = lang if len(lang.split("-")) > 1 or lang.endswith(".*") else lang + ".*"
-    ydl_opts = getSubtitleOptions(lang, proxy)
     title = "unknow"
     duration = ""
     try:
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            info_dict = ydl.extract_info(url, download=False)
-            # print(json.dumps(info_dict))
-            title = info_dict.get("title", "unknow")
-            seconds = info_dict.get("duration")
-            duration = str(seconds) if seconds else ""
-            thumbnail = info_dict.get("thumbnail")
-            if ".webp" in thumbnail:
-                thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(
-                    info_dict.get("id")
                 )
-            reqType = subType
-            if info_dict.get("extractor") == "youtube" and subType in ["srt", "txt"]:
-                reqType = "xml"
-            if debug:
                 print(
-                    "subtitles.keys(): {} automatic_captions: {} requested_subtitles: {}".format(
-                        info_dict.get("subtitles").keys(),
-                        info_dict.get("automatic_captions").keys(),
-                        (
-                            info_dict.get("requested_subtitles").keys()
-                            if info_dict.get("requested_subtitles")
-                            else {}
-                        ),
                     )
                 )
-            subtitle_funcs = [
-                getRequestedSubtitlesUrl,
-                getSubtitleLangUrl,
-                getSubtitleOtherUrl,
-            ]
-            for index in range(len(subtitle_funcs)):
-                subtitle_url = subtitle_funcs[index](info_dict, lang, reqType)
-                if subtitle_url:
-                    # print("subtitle_url: {}".format(subtitle_url))
-                    subtitle = fetchSubtitlebydlUrl(ydl, subType, subtitle_url)
-                    print(
-                        "function index:{}, url:{}, title:{}, duration:{} len(subtitle): {}".format(
-                            index, url, title, duration, len(subtitle or "")
-                        )
-                    )
-                    if subtitle is not None:
-                        return {
-                            "id": info_dict.get("id"),
-                            "url": url,
-                            "title": title,
-                            "thumbnail": thumbnail,
-                            "duration": duration,
-                            "subtitle": subtitle,
-                            "chapters": info_dict.get("chapters", None),
-                        }
     except Exception as e:
-        print(e)
         traceback.print_exc()
         return {"error": str(e)}
     return {"title": title, "duration": duration, "error": "No subtitles"}
@@ -330,40 +341,49 @@ def xml_caption_to_txt(xml_captions: str, skip_empty: bool = True) -> str:
 async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
-    ydl_opts = getSubtitleOptions(proxy)
-    title = "unknow"
-    duration = ""
     try:
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            info_dict = ydl.extract_info(url, download=False)
-            title = info_dict.get("title", "unknow")
-            seconds = info_dict.get("duration")
-            duration = str(seconds) if seconds else ""
-            thumbnail = info_dict.get("thumbnail")
-            if ".webp" in thumbnail:
-                thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(
-                    info_dict.get("id")
-                )
-            return {
-                "id": info_dict.get("id"),
-                "url": url,
-                "title": title,
-                "thumbnail": thumbnail,
-                "duration": duration,
-                "subtitles": info_dict.get("subtitles"),
-                "automatic_captions": info_dict.get("automatic_captions"),
-            }
     except Exception as e:
         return {"error": str(e)}
-def fetchSubtitlebydlUrl(ydl, subType, dlUrl, skipEmpty=True):
-    dlUrl = dlUrl if subType not in ["srt", "txt"] else re.sub(r"&fmt=[\w]+", "", dlUrl)
     try:
-        with ydl.urlopen(dlUrl) as resp:
             if subType == "srt":
                 return xml_caption_to_srt(resp.read().decode(), skipEmpty)
             elif subType == "txt":
@@ -389,49 +409,79 @@ def getSubtitleUrlByLang(info_dict, lang, subType, isLangKey):
             return subtitle_url
 async def fetchSubtitleByInfo(
     url: str, subType: str, dlInfo, proxy: Optional[str] = None
 ):
     try:
         reqType = "xml" if subType in ["srt", "txt"] else subType
-        ydl_opts = getSubtitleOptions(dlInfo.get("lang", None), proxy)
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            subtitle = None
-            if "dlUrl" in dlInfo:
-                subtitle = fetchSubtitlebydlUrl(
-                    ydl, subType, dlInfo.get("dlUrl"), False
-                )
-                if subtitle is not None:
-                    return subtitle
-            info_dict = ydl.extract_info(url, download=False)
-            if debug:
-                print(
-                    "subtitles.keys(): {} automatic_captions: {} requested_subtitles: {}".format(
-                        info_dict.get("subtitles").keys(),
-                        info_dict.get("automatic_captions").keys(),
-                        (
-                            info_dict.get("requested_subtitles").keys()
-                            if info_dict.get("requested_subtitles")
-                            else {}
-                        ),
-                    )
-                )
-            subtitleUrl = None
-            if "langKey" in dlInfo:
-                subtitleUrl = getSubtitleUrlByLang(
-                    info_dict, dlInfo.get("langKey"), reqType, True
-                )
-            if subtitleUrl is None:
-                subtitleUrl = getSubtitleUrlByLang(
-                    info_dict, dlInfo.get("lang"), reqType, False
                 )
-            print("subtitleUrl: {}".format(subtitleUrl))
-            subtitle = fetchSubtitlebydlUrl(ydl, subType, subtitleUrl, False)
-            return subtitle
     except Exception as e:
-        print(e)
         traceback.print_exc()
         return {"error": str(e)}

 from typing import Optional
 import xml.etree.ElementTree as ElementTree
 from html import unescape
+from yt_dlp import YoutubeDL, DownloadError
+from yt_dlp.networking import Request
+from yt_dlp.utils import sanitize_filename, random_user_agent
+NO_RETRY_STR = [
+    "Sorry about that",
+    "unavailable",
+    "not available",
+]
+RETRY_STR = [
+    "URLError",
+    "429",
+    "IncompleteRead",
+    "Remote end closed connection",
+    # "No video formats found",
+]
 debug = os.getenv("DEBUG")
 # yt-dlp subtitle types: json3,srv1,srv2,srv3,ttml,vtt, xml(youtube url without extargs)
 # "subtitles": {
 #     "live_chat": [
 #       {
     if proxy:
         ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
+    print(ydl_opts)
     return ydl_opts
     url: str,
     lang: Optional[str] = "en",
     subType: Optional[str] = "vtt",
     proxy: Optional[str] = None,
 ) -> dict:
     # lang-code or lang.* .* is regex
     # reqLang = lang if len(lang.split("-")) > 1 or lang.endswith(".*") else lang + ".*"
     title = "unknow"
     duration = ""
     try:
+        ydl, info_dict = extractInfo(url, lang, proxy, False)
+        # print(json.dumps(info_dict))
+        title = sanitize_filename(info_dict.get("title", "unknow"))
+        seconds = info_dict.get("duration")
+        duration = str(seconds) if seconds else ""
+        thumbnail = info_dict.get("thumbnail")
+        if ".webp" in thumbnail:
+            thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(
+                info_dict.get("id")
+            )
+        reqType = subType
+        if info_dict.get("extractor") == "youtube" and subType in ["srt", "txt"]:
+            reqType = "xml"
+        if debug:
+            print(
+                "subtitles.keys(): {} automatic_captions: {} requested_subtitles: {}".format(
+                    info_dict.get("subtitles").keys(),
+                    info_dict.get("automatic_captions").keys(),
+                    (
+                        info_dict.get("requested_subtitles").keys()
+                        if info_dict.get("requested_subtitles")
+                        else {}
+                    ),
                 )
+            )
+        subtitle_funcs = [
+            getRequestedSubtitlesUrl,
+            getSubtitleLangUrl,
+            getSubtitleOtherUrl,
+        ]
+        for index in range(len(subtitle_funcs)):
+            subtitle_url = subtitle_funcs[index](info_dict, lang, reqType)
+            if subtitle_url:
+                # print("subtitle_url: {}".format(subtitle_url))
+                subtitle = fetchSubtitleBydlUrl(subType, subtitle_url, ydl=ydl)
                 print(
+                    "function index:{}, url:{}, title:{}, duration:{} len(subtitle): {}".format(
+                        index, url, title, duration, len(subtitle or "")
                     )
                 )
+                if subtitle is not None:
+                    return {
+                        "id": info_dict.get("id"),
+                        "url": url,
+                        "title": title,
+                        "thumbnail": thumbnail,
+                        "duration": duration,
+                        "subtitle": subtitle,
+                        "chapters": info_dict.get("chapters", None),
+                    }
     except Exception as e:
+        print("{}, {}".format(e, url))
         traceback.print_exc()
         return {"error": str(e)}
     return {"title": title, "duration": duration, "error": "No subtitles"}
 async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
     try:
+        _, info_dict = extractInfo(url, None, proxy, True)
+        title = sanitize_filename(info_dict.get("title", "unknow"))
+        seconds = info_dict.get("duration")
+        duration = str(seconds) if seconds else ""
+        thumbnail = info_dict.get("thumbnail")
+        if ".webp" in thumbnail:
+            thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(
+                info_dict.get("id")
+            )
+        return {
+            "id": info_dict.get("id"),
+            "url": url,
+            "title": title,
+            "thumbnail": thumbnail,
+            "duration": duration,
+            "subtitles": info_dict.get("subtitles"),
+            "automatic_captions": info_dict.get("automatic_captions"),
+        }
     except Exception as e:
+        print("{}, {}".format(e, url))
+        traceback.print_exc()
         return {"error": str(e)}
+def createHeaders():
+    return {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
+        "Accept-Encoding": "gzip, deflate",
+        "Accept-Language": "en-us,en;q=0.5",
+        "User-Agent": random_user_agent(),
+    }
+def fetchSubtitleBydlUrl(subType, dlUrl, skipEmpty=True, ydl=None):
+    dlUrl = dlUrl if subType not in ["srt", "txt"] else re.sub(r"&fmt=[\w]+", "", dlUrl)
+    # If the download failed, info may contain headers and cookies that we could use here.
     try:
+        ydl = ydl if ydl else YoutubeDL(getSubtitleOptions())
+        with ydl.urlopen(Request(dlUrl, headers=createHeaders())) as resp:
             if subType == "srt":
                 return xml_caption_to_srt(resp.read().decode(), skipEmpty)
             elif subType == "txt":
             return subtitle_url
+def extractInfo(url, lang, proxy, forceProxy=False):
+    max_retry = 2
+    retry = 0
+    http_proxy = proxy if forceProxy else None
+    errMsg = None
+    while retry < max_retry:
+        try:
+            ydl_opts = getSubtitleOptions(lang, http_proxy)
+            ydl = YoutubeDL(ydl_opts)
+            return ydl, ydl.extract_info(url, download=False)
+        except DownloadError as e:
+            errMsg = str(e)
+            if "429" in errMsg:
+                http_proxy = proxy
+            if any(s in errMsg for s in NO_RETRY_STR):
+                # print("{}, {}".format(e, url))
+                break
+            if not any(s in errMsg for s in RETRY_STR):
+                # print("{}, {}".format(e, url))
+                break
+            retry += 1
+        except Exception as e:
+            print(e)
+            traceback.print_exc()
+            raise e
+    raise Exception(errMsg)
 async def fetchSubtitleByInfo(
     url: str, subType: str, dlInfo, proxy: Optional[str] = None
 ):
     try:
         reqType = "xml" if subType in ["srt", "txt"] else subType
+        subtitle = None
+        if "dlUrl" in dlInfo:
+            subtitle = fetchSubtitleBydlUrl(subType, dlInfo.get("dlUrl"), False)
+            if subtitle is not None:
+                return subtitle
+        ydl, info_dict = extractInfo(url, dlInfo.get("lang", None), proxy, False)
+        if debug:
+            print(
+                "subtitles.keys(): {} automatic_captions: {} requested_subtitles: {}".format(
+                    info_dict.get("subtitles").keys(),
+                    info_dict.get("automatic_captions").keys(),
+                    (
+                        info_dict.get("requested_subtitles").keys()
+                        if info_dict.get("requested_subtitles")
+                        else {}
+                    ),
                 )
+            )
+        subtitleUrl = None
+        if "langKey" in dlInfo:
+            subtitleUrl = getSubtitleUrlByLang(
+                info_dict, dlInfo.get("langKey"), reqType, True
+            )
+        if subtitleUrl is None:
+            subtitleUrl = getSubtitleUrlByLang(
+                info_dict, dlInfo.get("lang"), reqType, False
+            )
+        print("subtitleUrl: {}".format(subtitleUrl))
+        subtitle = fetchSubtitleBydlUrl(subType, subtitleUrl, False, ydl)
+        return subtitle
     except Exception as e:
+        print("{}, {}".format(e, url))
         traceback.print_exc()
         return {"error": str(e)}

main.py CHANGED Viewed

@@ -68,8 +68,13 @@ async def download(
     if token != x_token:
         raise HTTPException(status_code=401, detail="Invalid token")
     try:
         dlInfo = json.loads(info)
         # print(
         #     "url: {}, fileName: {}, fileType: {}, dlInfo: {}".format(
         #         url, fileName, fileType, dlInfo

     if token != x_token:
         raise HTTPException(status_code=401, detail="Invalid token")
+    dlInfo = None
     try:
         dlInfo = json.loads(info)
+    except Exception:
+        raise HTTPException(status_code=400, detail="Invalid params")
+    try:
         # print(
         #     "url: {}, fileName: {}, fileType: {}, dlInfo: {}".format(
         #         url, fileName, fileType, dlInfo

requirements.txt CHANGED Viewed

@@ -4,4 +4,5 @@ fastapi==0.95.*
 # torch==1.11.*
 # transformers==4.*
 uvicorn[standard]==0.17.*
-yt-dlp==2023.06.22

 # torch==1.11.*
 # transformers==4.*
 uvicorn[standard]==0.17.*
+# yt-dlp==2023.07.06
+yt-dlp @ git+https://github.com/yt-dlp/yt-dlp.git@6014355c6142f68e20c8374e3787e5b5820f19e2 # jul 30