Spaces:

jackmouse
/

videoNote

Running

App Files Files Community

zhoujiaangyao committed 14 days ago

Commit

6cfe55f

0 Parent(s):

deploy videomemo backend to HF Space

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +12 -0
Dockerfile +48 -0
README.md +20 -0
backend/.env.example +12 -0
backend/Dockerfile +42 -0
backend/Dockerfile.gpu +40 -0
backend/__init__.py +0 -0
backend/app/__init__.py +47 -0
backend/app/article_fetchers/__init__.py +3 -0
backend/app/article_fetchers/base.py +36 -0
backend/app/article_fetchers/generic.py +117 -0
backend/app/article_fetchers/wechat.py +142 -0
backend/app/article_fetchers/xiaohongshu.py +218 -0
backend/app/core/__init__.py +0 -0
backend/app/db/__init__.py +0 -0
backend/app/db/article_dao.py +167 -0
backend/app/db/builtin_providers.json +65 -0
backend/app/db/engine.py +45 -0
backend/app/db/init_db.py +34 -0
backend/app/db/model_dao.py +69 -0
backend/app/db/models/__init__.py +0 -0
backend/app/db/models/articles.py +55 -0
backend/app/db/models/models.py +12 -0
backend/app/db/models/providers.py +17 -0
backend/app/db/models/trend_subscription.py +50 -0
backend/app/db/models/video_tasks.py +14 -0
backend/app/db/provider_dao.py +129 -0
backend/app/db/sqlite_client.py +4 -0
backend/app/db/trend_subscription_dao.py +293 -0
backend/app/db/video_task_dao.py +61 -0
backend/app/decorators/__init__.py +0 -0
backend/app/decorators/timeit.py +13 -0
backend/app/downloaders/__init__.py +0 -0
backend/app/downloaders/base.py +52 -0
backend/app/downloaders/bilibili_downloader.py +343 -0
backend/app/downloaders/bilibili_subtitle.py +164 -0
backend/app/downloaders/common.py +1 -0
backend/app/downloaders/douyin_downloader.py +499 -0
backend/app/downloaders/douyin_helper/abogus.py +635 -0
backend/app/downloaders/generic_downloader.py +128 -0
backend/app/downloaders/kuaishou_downloader.py +97 -0
backend/app/downloaders/kuaishou_helper/__init__.py +0 -0
backend/app/downloaders/kuaishou_helper/kuaishou.py +101 -0
backend/app/downloaders/local_downloader.py +137 -0
backend/app/downloaders/xiaohongshu_downloader.py +133 -0
backend/app/downloaders/xiaoyuzhoufm_download.py +25 -0
backend/app/downloaders/youtube_downloader.py +259 -0
backend/app/downloaders/youtube_subtitle.py +113 -0
backend/app/enmus/exception.py +21 -0
backend/app/enmus/note_enums.py +7 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,12 @@

+backend/.venv/
+backend/data/
+backend/models/
+backend/config/
+backend/note_results/
+backend/static/
+backend/uploads/
+backend/*.db
+backend/app/db/*.db
+__pycache__/
+*.pyc
+.env

Dockerfile ADDED Viewed

	@@ -0,0 +1,48 @@

+# VideoMemo backend — Dockerfile for deployment on Hugging Face Spaces (Docker SDK).
+#
+# Usage: an HF Space is a standalone git repository; lay out its root as:
+#   /Dockerfile     <- this file (copied to the Space root and named Dockerfile)
+#   /README.md      <- deploy/hf-space/README.md (contains the frontmatter HF requires)
+#   /backend/...    <- the whole backend directory copied from this project
+# Then git push to the Space; HF builds this file (COPY paths are relative to the Space root).
+#
+# The image is deliberately minimal: only ffmpeg + backend dependencies. Feishu push uses
+# REST by default, so lark-cli is not installed.
+# The database is an external Postgres (Supabase), injected through the DATABASE_URL Secret.
+ARG BASE_REGISTRY=docker.io
+FROM ${BASE_REGISTRY}/library/python:3.11-slim
+# HF builds and runs on huggingface.co's own infrastructure: use the official PyPI index and
+# the default HF endpoint, not mainland-China mirrors (those would be slower or even fail).
+ARG PIP_INDEX=https://pypi.org/simple
+# fonts-liberation provides LiberationSans, which is metric-compatible with Arial and replaces
+# the repository's arial.ttf (HF git rejects binaries, so the font is not committed and is
+# supplied by the image at build time instead).
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ffmpeg curl fonts-liberation && \
+    rm -rf /var/lib/apt/lists/*
+ENV PYTHONUNBUFFERED=1 \
+    BACKEND_HOST=0.0.0.0 \
+    BACKEND_PORT=8483 \
+    STATIC=/static \
+    OUT_DIR=/app/static/screenshots \
+    IMAGE_BASE_URL=/static/screenshots \
+    NOTE_OUTPUT_DIR=/app/data/note_results \
+    DATA_DIR=/app/data
+WORKDIR /app
+# Install dependencies first to take advantage of layer caching.
+COPY backend/requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt
+# Then copy the backend code.
+COPY backend /app
+# Pre-create the writable directories. Docker Spaces run the container as user ID 1000, not
+# root, so directories created here (as root) must be opened up or the app cannot write to
+# them. They live on ephemeral disk and are wiped on restart, so structured data must go to
+# the external DATABASE_URL; notes/screenshots are temporary data that may later move to
+# object storage.
+RUN mkdir -p /app/data/note_results /app/static/screenshots /app/config /app/fonts && \
+    cp /usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf /app/fonts/arial.ttf && \
+    chmod -R a+rwX /app/data /app/static /app/config /app/fonts
+EXPOSE 8483
+CMD ["python", "main.py"]

README.md ADDED Viewed

	@@ -0,0 +1,20 @@

+---
+title: VideoMemo Backend
+emoji: 🎬
+colorFrom: indigo
+colorTo: blue
+sdk: docker
+app_port: 8483
+pinned: false
+---
+# VideoMemo 后端（API）
+AI 视频笔记生成的后端服务。桌面端 / 网页端 / 浏览器插件连接本 Space 的地址使用。
+- **结构化数据**（LLM 供应商配置与 API key、模型、关键词订阅、通知渠道、任务索引）
+  持久化到外接 Postgres（Supabase），通过 `DATABASE_URL` Secret 配置。
+- **本 Space 公开可访问**：务必设置 `WEB_ACCESS_PASSWORD` Secret，否则任何人都能调用你的后端。
+- 笔记正文 / 截图 / 向量库当前仍是容器内临时文件，**重启会清空**（计划后续迁入 Postgres / 对象存储）。
+> 部署步骤见仓库 `deploy/hf-space/DEPLOY.md`。

backend/.env.example ADDED Viewed

	@@ -0,0 +1,12 @@

+# General
+ENV=production
+API_BASE_URL=http://127.0.0.1:8000
+SCREENSHOT_BASE_URL=http://127.0.0.1:8000/static/screenshots
+STATIC=/static # external access path (URL prefix)
+OUT_DIR=./static/screenshots    # local output directory
+IMAGE_BASE_URL=/static/screenshots  # image access URL
+DATA_DIR=data
+# Transcriber settings
+TRANSCRIBER_TYPE=fast-whisper # fast-whisper/bcut/kuaishou
+WHISPER_MODEL_SIZE=base

backend/Dockerfile ADDED Viewed

	@@ -0,0 +1,42 @@

+# BASE_REGISTRY defaults to docker.io; when docker.io cannot be reached from mainland China, switch to daocloud / Aliyun / a self-hosted mirror:
+#   docker-compose build --build-arg BASE_REGISTRY=docker.m.daocloud.io
+# or set it under build.args / environment variables in docker-compose.yml
+ARG BASE_REGISTRY=docker.io
+FROM ${BASE_REGISTRY}/library/python:3.11-slim
+ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn
+ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple
+RUN rm -f /etc/apt/sources.list && \
+    rm -rf /etc/apt/sources.list.d/* && \
+    echo "deb https://${APT_MIRROR}/debian bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \
+    echo "deb https://${APT_MIRROR}/debian bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \
+    echo "deb https://${APT_MIRROR}/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends ffmpeg curl && \
+    rm -rf /var/lib/apt/lists/*
+ENV PATH="/usr/bin:${PATH}"
+ENV HF_ENDPOINT=https://hf-mirror.com
+# When the Feishu "push method" is lark-cli / auto, the official lark CLI is required (npm package @larksuite/cli, binary name lark-cli).
+# It is not needed for direct REST push; remove this section to slim the image if desired.
+# Credentials are injected at runtime through the LARK_APP_ID / LARK_APP_SECRET environment variables (passed in by the backend when it invokes the CLI); nothing is hard-coded here.
+ARG NPM_REGISTRY=https://registry.npmmirror.com
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends nodejs npm && \
+    npm config set registry ${NPM_REGISTRY} && \
+    npm install -g @larksuite/cli && \
+    rm -rf /var/lib/apt/lists/* /root/.npm && \
+    (lark-cli --version || true)
+WORKDIR /app
+# Copy requirements.txt first to take advantage of layer caching.
+COPY ./backend/requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt
+# Then copy the application code (frequent changes do not invalidate the pip layer).
+COPY ./backend /app
+CMD ["python", "main.py"]

backend/Dockerfile.gpu ADDED Viewed

	@@ -0,0 +1,40 @@

+# BASE_REGISTRY defaults to docker.io; in mainland China a daocloud / Aliyun mirror can be used (the chosen mirror must support the nvidia/cuda namespace).
+ARG BASE_REGISTRY=docker.io
+FROM ${BASE_REGISTRY}/nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn
+ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple
+RUN rm -f /etc/apt/sources.list && \
+    rm -rf /etc/apt/sources.list.d/* && \
+    echo "deb https://${APT_MIRROR}/ubuntu jammy main restricted universe multiverse" > /etc/apt/sources.list && \
+    echo "deb https://${APT_MIRROR}/ubuntu jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
+    echo "deb https://${APT_MIRROR}/ubuntu jammy-security main restricted universe multiverse" >> /etc/apt/sources.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends ffmpeg python3-pip curl && \
+    rm -rf /var/lib/apt/lists/*
+ENV HF_ENDPOINT=https://hf-mirror.com
+# When the Feishu "push method" is lark-cli / auto, the official lark CLI is required (npm package @larksuite/cli, binary name lark-cli).
+# The Node shipped by Ubuntu 22.04's apt is too old (v12) to run the current CLI, so Node 20 is installed from NodeSource.
+# It is not needed for direct REST push; remove this section to slim the image if desired. Credentials are injected by the backend at runtime through environment variables; nothing is hard-coded.
+ARG NPM_REGISTRY=https://registry.npmmirror.com
+RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
+    apt-get install -y --no-install-recommends nodejs && \
+    npm config set registry ${NPM_REGISTRY} && \
+    npm install -g @larksuite/cli && \
+    rm -rf /var/lib/apt/lists/* /root/.npm && \
+    (lark-cli --version || true)
+WORKDIR /app
+# Copy requirements.txt first to take advantage of layer caching.
+COPY ./backend/requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt && \
+    pip install --no-cache-dir -i ${PIP_INDEX} 'transformers[torch]>=4.23'
+# Then copy the application code.
+COPY ./backend /app
+CMD ["python3", "main.py"]

backend/__init__.py ADDED Viewed

File without changes

backend/app/__init__.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import os
+from typing import Optional
+from fastapi import Depends, FastAPI, Header, HTTPException, Request
+# 健康/诊断类接口：公网前端在用户尚未填访问密码时，也要能判断后端是否可达、
+# 从而正常加载页面（否则启动探测被密码拦成 401，整页卡在「连接中」无法进入设置去填密码）。
+_AUTH_EXEMPT_PATHS = {"/api/sys_check", "/api/sys_health", "/api/deploy_status"}
+async def verify_web_access_password(
+    request: Request,
+    request_web_access_password: Optional[str] = Header(
+        None, alias="request-web-access-password"
+    )
+):
+    if request.url.path in _AUTH_EXEMPT_PATHS:
+        return True
+    expected = os.getenv("WEB_ACCESS_PASSWORD")
+    if expected and request_web_access_password != expected:
+        raise HTTPException(status_code=401, detail="访问密码错误或未填写")
+    return True
+def create_app(lifespan) -> FastAPI:
+    from .routers import note, notification, provider, model, config, chat, flashcard, hot_videos, article, trend_subscription, feishu
+    from .utils.response import ResponseWrapper as R
+    app = FastAPI(title="VideoMemo",lifespan=lifespan)
+    protected = [Depends(verify_web_access_password)]
+    @app.get("/sys_check")
+    async def root_sys_check():
+        return R.success()
+    app.include_router(note.router, prefix="/api", dependencies=protected)
+    app.include_router(provider.router, prefix="/api", dependencies=protected)
+    app.include_router(model.router, prefix="/api", dependencies=protected)
+    app.include_router(config.router, prefix="/api", dependencies=protected)
+    app.include_router(chat.router, prefix="/api", dependencies=protected)
+    app.include_router(flashcard.router, prefix="/api", dependencies=protected)
+    app.include_router(hot_videos.router, prefix="/api", dependencies=protected)
+    app.include_router(article.router, prefix="/api", dependencies=protected)
+    app.include_router(trend_subscription.router, prefix="/api", dependencies=protected)
+    app.include_router(notification.router, prefix="/api", dependencies=protected)
+    app.include_router(feishu.router, prefix="/api", dependencies=protected)
+    return app

backend/app/article_fetchers/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from app.article_fetchers.base import ArticleContent, ArticleFetcher, ArticleFetchError
2	+
3	+ __all__ = ["ArticleContent", "ArticleFetcher", "ArticleFetchError"]

backend/app/article_fetchers/base.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Protocol
+@dataclass
+class ArticleContent:
+    """Normalized article record returned by an article fetcher."""
+    # Source platform identifier, e.g. "generic_web", "wechat_mp" or "xiaohongshu".
+    platform: str
+    # URL the article was fetched from or can be opened at.
+    url: str
+    # Platform-specific identifier for the article.
+    article_id: str
+    title: str
+    author_name: str = ""
+    author_id: str = ""
+    # Plain-text body (or a short summary for search results).
+    content_text: str = ""
+    image_urls: list[str] = field(default_factory=list)
+    cover_url: str = ""
+    # Publication time as a string; the format is whatever the fetcher produced.
+    published_at: str = ""
+    # Free-form extra data attached by the fetcher.
+    raw_metadata: dict = field(default_factory=dict)
+class ArticleFetchError(Exception):
+    """Raised when an article fetcher cannot complete a fetch or search."""
+    pass
+class ArticleFetcher(Protocol):
+    """Structural interface that every platform fetcher implements."""
+    # Platform identifier the fetcher handles.
+    platform: str
+    def fetch(self, url: str) -> ArticleContent:
+        """Fetch and parse the single article at ``url``."""
+        ...
+    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
+        """Return up to ``limit`` articles matching ``keyword``."""
+        ...
+    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
+        """Return up to ``limit`` articles for the publisher identified by ``query``."""
+        ...

backend/app/article_fetchers/generic.py ADDED Viewed

	@@ -0,0 +1,117 @@

+from __future__ import annotations
+import re
+from urllib.parse import urlparse
+import requests
+from bs4 import BeautifulSoup
+from app.article_fetchers.base import ArticleContent, ArticleFetchError
+from app.utils.url_parser import clean_url
+def _clean_text(value: str) -> str:
+    return re.sub(r"[ \t\r\f\v]+", " ", value or "").strip()
+def _normalize_body(value: str) -> str:
+    lines = [_clean_text(line) for line in (value or "").splitlines()]
+    return "\n".join(line for line in lines if line)
+def _meta_content(soup: BeautifulSoup, *selectors: tuple[str, str]) -> str:
+    for attr, value in selectors:
+        node = soup.find("meta", attrs={attr: value})
+        if node:
+            content = _clean_text(node.get("content") or "")
+            if content:
+                return content
+    return ""
+def _candidate_score(node) -> int:
+    text = _normalize_body(node.get_text("\n"))
+    paragraphs = node.find_all("p")
+    return len(text) + len(paragraphs) * 120
+def parse_generic_article_html(html: str, url: str) -> ArticleContent:
+    soup = BeautifulSoup(html, "html.parser")
+    for tag in soup(["script", "style", "noscript", "svg", "canvas", "iframe"]):
+        tag.decompose()
+    for tag in soup(["nav", "header", "footer", "aside", "form"]):
+        tag.decompose()
+    title = (
+        _meta_content(soup, ("property", "og:title"), ("name", "twitter:title"))
+        or _clean_text(soup.title.get_text(" ")) if soup.title else ""
+    )
+    author = _meta_content(soup, ("name", "author"), ("property", "article:author"))
+    published_at = _meta_content(
+        soup,
+        ("property", "article:published_time"),
+        ("name", "publishdate"),
+        ("name", "date"),
+    )
+    cover = _meta_content(soup, ("property", "og:image"), ("name", "twitter:image"))
+    candidates = []
+    for selector in ("article", "main", "[role='main']", "#content", ".content", ".article", ".post"):
+        candidates.extend(soup.select(selector))
+    if not candidates and soup.body:
+        candidates = [soup.body]
+    best = max(candidates, key=_candidate_score, default=None)
+    body = _normalize_body(best.get_text("\n")) if best else ""
+    if len(body) < 80:
+        description = _meta_content(soup, ("name", "description"), ("property", "og:description"))
+        body = description if len(description) > len(body) else body
+    if len(body) < 40:
+        raise ValueError("网页正文为空或过短，无法生成总结")
+    parsed = urlparse(url)
+    article_id = parsed.netloc + parsed.path
+    return ArticleContent(
+        platform="generic_web",
+        url=url,
+        article_id=article_id or url,
+        title=title or parsed.netloc or "网页文章",
+        author_name=author,
+        content_text=body,
+        image_urls=[cover] if cover else [],
+        cover_url=cover,
+        published_at=published_at,
+        raw_metadata={"source": "generic_web"},
+    )
+class GenericArticleFetcher:
+    platform = "generic_web"
+    def fetch(self, url: str) -> ArticleContent:
+        clean = clean_url(url)
+        try:
+            response = requests.get(
+                clean,
+                timeout=12,
+                allow_redirects=True,
+                headers={
+                    "User-Agent": (
+                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
+                    ),
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+                },
+            )
+            response.raise_for_status()
+            return parse_generic_article_html(response.text, response.url or clean)
+        except ValueError:
+            raise
+        except Exception as exc:
+            raise ArticleFetchError(f"网页文章抓取失败：{exc}") from exc
+    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
+        raise ArticleFetchError("通用网页暂不支持关键字查询，请粘贴具体文章链接")
+    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
+        raise ArticleFetchError("通用网页暂不支持发布者订阅，请粘贴具体文章链接")

backend/app/article_fetchers/wechat.py ADDED Viewed

	@@ -0,0 +1,142 @@

+from __future__ import annotations
+import re
+from urllib.parse import parse_qs, quote, unquote, urljoin, urlparse
+import requests
+from bs4 import BeautifulSoup
+from app.article_fetchers.base import ArticleContent, ArticleFetchError
+def _clean_text(value: str) -> str:
+    return re.sub(r"\s+", " ", value or "").strip()
+def _element_text(element) -> str:
+    return _clean_text(element.get_text(" ")) if element else ""
+def _script_value(html: str, name: str) -> str:
+    patterns = [
+        rf'var\s+{re.escape(name)}\s*=\s*"([^"]*)"',
+        rf"{re.escape(name)}\s*:\s*'([^']*)'",
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, html)
+        if match:
+            return match.group(1).strip()
+    return ""
+def parse_wechat_article_html(html: str, url: str) -> ArticleContent:
+    soup = BeautifulSoup(html, "html.parser")
+    title = _element_text(soup.find(id="activity-name") or soup.find("h1"))
+    author = _element_text(soup.find(id="js_name"))
+    published_at = _element_text(soup.find(id="publish_time"))
+    content = soup.find(id="js_content")
+    body = _clean_text(content.get_text("\n")) if content else ""
+    if not body:
+        raise ValueError("微信公众号文章正文为空，无法生成总结")
+    image_urls: list[str] = []
+    for image in content.find_all("img") if content else []:
+        src = image.get("data-src") or image.get("src") or ""
+        if src and src not in image_urls:
+            image_urls.append(src)
+    biz = _script_value(html, "biz")
+    mid = _script_value(html, "mid")
+    idx = _script_value(html, "idx")
+    sn = _script_value(html, "sn")
+    article_id = ":".join(part for part in [biz, mid, idx, sn] if part) or url
+    return ArticleContent(
+        platform="wechat_mp",
+        url=url,
+        article_id=article_id,
+        title=title or "微信公众号文章",
+        author_name=author,
+        author_id=biz,
+        content_text=body,
+        image_urls=image_urls,
+        cover_url=image_urls[0] if image_urls else "",
+        published_at=published_at,
+        raw_metadata={"biz": biz, "mid": mid, "idx": idx, "sn": sn},
+    )
+def _normalize_wechat_result_url(href: str) -> str:
+    if not href:
+        return ""
+    absolute = urljoin("https://weixin.sogou.com", href)
+    parsed = urlparse(absolute)
+    query = parse_qs(parsed.query)
+    for key in ("url", "target"):
+        if query.get(key):
+            candidate = unquote(query[key][0])
+            if "mp.weixin.qq.com" in candidate:
+                return candidate
+    return absolute if "mp.weixin.qq.com" in absolute else ""
+def parse_wechat_search_html(html: str, keyword: str, limit: int = 20) -> list[ArticleContent]:
+    soup = BeautifulSoup(html, "html.parser")
+    items: list[ArticleContent] = []
+    seen: set[str] = set()
+    for anchor in soup.find_all("a", href=True):
+        url = _normalize_wechat_result_url(anchor.get("href") or "")
+        if not url or url in seen:
+            continue
+        title = _clean_text(anchor.get_text(" "))
+        if not title:
+            continue
+        container = anchor.find_parent(["div", "li"]) or anchor.parent
+        info_nodes = container.find_all(class_=re.compile(r"(txt-info|s-p|account)")) if container else []
+        info = [_clean_text(node.get_text(" ")) for node in info_nodes if _clean_text(node.get_text(" "))]
+        author = info[0] if info else ""
+        summary = info[-1] if len(info) > 1 else title
+        seen.add(url)
+        items.append(
+            ArticleContent(
+                platform="wechat_mp",
+                url=url,
+                article_id=url,
+                title=title,
+                author_name=author,
+                content_text=summary,
+                raw_metadata={"keyword": keyword, "source": "sogou_weixin"},
+            )
+        )
+        if len(items) >= limit:
+            break
+    return items
+class WechatArticleFetcher:
+    platform = "wechat_mp"
+    def fetch(self, url: str) -> ArticleContent:
+        try:
+            response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
+            response.raise_for_status()
+            return parse_wechat_article_html(response.text, url)
+        except ValueError:
+            raise
+        except Exception as exc:
+            raise ArticleFetchError(f"微信公众号文章抓取失败：{exc}") from exc
+    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
+        try:
+            response = requests.get(
+                f"https://weixin.sogou.com/weixin?type=2&query={quote(keyword)}",
+                timeout=10,
+                headers={"User-Agent": "Mozilla/5.0"},
+            )
+            response.raise_for_status()
+            return parse_wechat_search_html(response.text, keyword, limit)
+        except Exception as exc:
+            raise ArticleFetchError(f"微信公众号关键字查询失败：{exc}") from exc
+    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
+        return self.search(query, limit)

backend/app/article_fetchers/xiaohongshu.py ADDED Viewed

	@@ -0,0 +1,218 @@

+from __future__ import annotations
+import json
+import re
+from datetime import datetime
+from urllib.parse import quote, urlparse
+import requests
+from bs4 import BeautifulSoup
+from app.article_fetchers.base import ArticleContent, ArticleFetchError
+from app.services.cookie_manager import CookieConfigManager
+from app.utils.url_parser import clean_url
+def _note_id_from_url(url: str) -> str:
+    path = urlparse(url).path.rstrip("/")
+    return path.split("/")[-1] if path else url
+def _extract_initial_state(html: str) -> dict:
+    match = re.search(r"window\.__INITIAL_STATE__\s*=", html)
+    if not match:
+        return {}
+    start = html.find("{", match.end())
+    if start < 0:
+        return {}
+    depth = 0
+    end = -1
+    for index in range(start, len(html)):
+        char = html[index]
+        if char == "{":
+            depth += 1
+        elif char == "}":
+            depth -= 1
+            if depth == 0:
+                end = index + 1
+                break
+    if end < 0:
+        return {}
+    raw = html[start:end].replace("undefined", "null")
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError:
+        return {}
+def _first_image_url(item: dict) -> str:
+    for key in ("urlDefault", "url", "traceId"):
+        value = item.get(key)
+        if isinstance(value, str) and value.startswith("http"):
+            return value
+    nested = item.get("cover") or item.get("image") or {}
+    if isinstance(nested, dict):
+        for key in ("urlDefault", "url"):
+            value = nested.get(key)
+            if isinstance(value, str) and value.startswith("http"):
+                return value
+    return ""
+def _published_at(value) -> str:
+    try:
+        timestamp = int(value)
+    except (TypeError, ValueError):
+        return ""
+    if timestamp > 10_000_000_000:
+        timestamp = timestamp // 1000
+    return datetime.fromtimestamp(timestamp).isoformat(timespec="seconds")
+def _article_from_note(note: dict, url: str) -> ArticleContent:
+    user = note.get("user") or {}
+    images: list[str] = []
+    for image in note.get("imageList") or note.get("images") or []:
+        src = _first_image_url(image)
+        if src and src not in images:
+            images.append(src)
+    content = str(note.get("desc") or note.get("description") or "").strip()
+    title = str(note.get("title") or "").strip() or content[:40] or "小红书笔记"
+    article_id = str(note.get("noteId") or note.get("id") or _note_id_from_url(url)).strip()
+    if not content:
+        raise ValueError("小红书笔记正文为空，无法生成总结")
+    return ArticleContent(
+        platform="xiaohongshu",
+        url=url,
+        article_id=article_id,
+        title=title,
+        author_name=str(user.get("nickname") or "").strip(),
+        author_id=str(user.get("userId") or user.get("id") or "").strip(),
+        content_text=content,
+        image_urls=images,
+        cover_url=images[0] if images else "",
+        published_at=_published_at(note.get("time") or note.get("lastUpdateTime")),
+        raw_metadata={"raw_note": note},
+    )
+def parse_xiaohongshu_article_html(html: str, url: str) -> ArticleContent:
+    state = _extract_initial_state(html)
+    detail_map = ((state.get("note") or {}).get("noteDetailMap")) or {}
+    for value in detail_map.values():
+        note = value.get("note") if isinstance(value, dict) else None
+        if isinstance(note, dict):
+            return _article_from_note(note, url)
+    soup = BeautifulSoup(html, "html.parser")
+    title_meta = soup.find("meta", attrs={"property": "og:title"})
+    desc_meta = soup.find("meta", attrs={"name": "description"})
+    title = (title_meta.get("content") if title_meta else "") or "小红书笔记"
+    body = (desc_meta.get("content") if desc_meta else "").strip()
+    if not body:
+        raise ValueError("小红书笔记正文为空，无法生成总结")
+    return ArticleContent(
+        platform="xiaohongshu",
+        url=url,
+        article_id=_note_id_from_url(url),
+        title=title.strip(),
+        content_text=body,
+    )
+def _iter_note_like(value):
+    if isinstance(value, dict):
+        note_id = value.get("noteId") or value.get("id")
+        title = value.get("title") or value.get("displayTitle")
+        desc = value.get("desc") or value.get("description")
+        if note_id and (title or desc):
+            yield value
+        for child in value.values():
+            yield from _iter_note_like(child)
+    elif isinstance(value, list):
+        for child in value:
+            yield from _iter_note_like(child)
+def parse_xiaohongshu_discovery_html(
+    html: str,
+    source_url: str,
+    limit: int = 20,
+) -> list[ArticleContent]:
+    state = _extract_initial_state(html)
+    items: list[ArticleContent] = []
+    seen: set[str] = set()
+    for note in _iter_note_like(state):
+        article_id = str(note.get("noteId") or note.get("id") or "").strip()
+        if not article_id or article_id in seen:
+            continue
+        user = note.get("user") or note.get("author") or {}
+        image_url = _first_image_url(note)
+        content = str(note.get("desc") or note.get("description") or note.get("title") or "").strip()
+        title = str(note.get("title") or note.get("displayTitle") or content[:40] or "小红书笔记").strip()
+        seen.add(article_id)
+        items.append(
+            ArticleContent(
+                platform="xiaohongshu",
+                url=f"https://www.xiaohongshu.com/explore/{article_id}",
+                article_id=article_id,
+                title=title,
+                author_name=str(user.get("nickname") or user.get("name") or "").strip(),
+                author_id=str(user.get("userId") or user.get("id") or "").strip(),
+                content_text=content,
+                image_urls=[image_url] if image_url else [],
+                cover_url=image_url,
+                raw_metadata={"source_url": source_url},
+            )
+        )
+        if len(items) >= limit:
+            break
+    return items
+class XiaohongshuArticleFetcher:
+    platform = "xiaohongshu"
+    def __init__(self):
+        self._cookie_mgr = CookieConfigManager()
+    def _headers(self) -> dict:
+        headers = {"User-Agent": "Mozilla/5.0"}
+        cookie = self._cookie_mgr.get("xiaohongshu")
+        if cookie:
+            headers["Cookie"] = cookie
+        return headers
+    def fetch(self, url: str) -> ArticleContent:
+        clean = clean_url(url)
+        try:
+            response = requests.get(clean, timeout=10, headers=self._headers(), allow_redirects=True)
+            response.raise_for_status()
+            return parse_xiaohongshu_article_html(response.text, response.url or clean)
+        except ValueError:
+            raise
+        except Exception as exc:
+            raise ArticleFetchError(f"小红书笔记抓取失败：{exc}") from exc
+    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
+        url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}"
+        try:
+            response = requests.get(url, timeout=10, headers=self._headers())
+            response.raise_for_status()
+            return parse_xiaohongshu_discovery_html(response.text, url, limit)
+        except Exception as exc:
+            raise ArticleFetchError(f"小红书关键字查询失败：{exc}") from exc
+    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
+        url = clean_url(query)
+        if not url.startswith("http"):
+            url = f"https://www.xiaohongshu.com/user/profile/{quote(query)}"
+        try:
+            response = requests.get(url, timeout=10, headers=self._headers(), allow_redirects=True)
+            response.raise_for_status()
+            return parse_xiaohongshu_discovery_html(response.text, response.url or url, limit)
+        except Exception as exc:
+            raise ArticleFetchError(f"小红书发布者订阅刷新失败：{exc}") from exc

backend/app/core/__init__.py ADDED Viewed

File without changes

backend/app/db/__init__.py ADDED Viewed

File without changes

backend/app/db/article_dao.py ADDED Viewed

	@@ -0,0 +1,167 @@

+from __future__ import annotations
+import hashlib
+import json
+from datetime import datetime
+from app.article_fetchers.base import ArticleContent
+from app.db.engine import get_db
+from app.db.models.articles import ArticleItem, ArticleSubscription, ArticleSubscriptionItem
+def url_hash(url: str) -> str:
+    return hashlib.sha256(url.encode("utf-8")).hexdigest()
+def _detach(obj):
+    data = {key: value for key, value in obj.__dict__.items() if not key.startswith("_")}
+    obj.__dict__.clear()
+    obj.__dict__.update(data)
+    return obj
+def upsert_article_item(article: ArticleContent) -> ArticleItem:
+    """Insert or update the stored ``ArticleItem`` that corresponds to ``article``.
+
+    An existing row is looked up first by ``(platform, article_id)`` and then by
+    ``(platform, url_hash)``; a new row is created when neither matches. The
+    row's fields are then overwritten from ``article`` and committed.
+
+    Returns:
+        The saved item, passed through ``_detach`` before the session is closed.
+    """
+    db = next(get_db())
+    try:
+        digest = url_hash(article.url)
+        item = None
+        # Primary lookup: the platform's own article id, when one is available.
+        if article.article_id:
+            item = (
+                db.query(ArticleItem)
+                .filter_by(platform=article.platform, article_id=article.article_id)
+                .first()
+            )
+        # Secondary lookup: same platform and same URL hash.
+        if item is None:
+            item = db.query(ArticleItem).filter_by(platform=article.platform, url_hash=digest).first()
+        if item is None:
+            item = ArticleItem(
+                platform=article.platform,
+                article_id=article.article_id,
+                url_hash=digest,
+                url=article.url,
+                title=article.title,
+            )
+            db.add(item)
+        # Refresh the mutable fields for both new and existing rows.
+        item.url = article.url
+        item.title = article.title
+        item.author_name = article.author_name
+        item.author_id = article.author_id
+        item.cover_url = article.cover_url
+        item.published_at = article.published_at
+        item.content_text = article.content_text
+        # Metadata is stored as a JSON string with non-ASCII characters kept readable.
+        item.raw_metadata = json.dumps(article.raw_metadata or {}, ensure_ascii=False)
+        db.commit()
+        db.refresh(item)
+        return _detach(item)
+    finally:
+        db.close()
+def get_article_item(item_id: int) -> ArticleItem | None:
+    db = next(get_db())
+    try:
+        item = db.query(ArticleItem).filter_by(id=item_id).first()
+        return _detach(item) if item else None
+    finally:
+        db.close()
+def list_article_items(subscription_id: int | None = None) -> list[ArticleItem]:
+    db = next(get_db())
+    try:
+        query = db.query(ArticleItem)
+        if subscription_id is not None:
+            query = query.join(
+                ArticleSubscriptionItem,
+                ArticleSubscriptionItem.article_item_id == ArticleItem.id,
+            ).filter(ArticleSubscriptionItem.subscription_id == subscription_id)
+        return [_detach(item) for item in query.order_by(ArticleItem.id.desc()).all()]
+    finally:
+        db.close()
+def mark_article_summarized(item_id: int, task_id: str) -> None:
+    db = next(get_db())
+    try:
+        item = db.query(ArticleItem).filter_by(id=item_id).first()
+        if item:
+            item.summary_status = "summarized"
+            item.task_id = task_id
+            db.commit()
+    finally:
+        db.close()
+def create_subscription(
+    platform: str,
+    subscription_type: str,
+    query: str,
+    label: str = "",
+) -> ArticleSubscription:
+    db = next(get_db())
+    try:
+        subscription = ArticleSubscription(
+            platform=platform,
+            type=subscription_type,
+            query=query,
+            label=label or query,
+        )
+        db.add(subscription)
+        db.commit()
+        db.refresh(subscription)
+        return _detach(subscription)
+    finally:
+        db.close()
+def list_subscriptions() -> list[ArticleSubscription]:
+    db = next(get_db())
+    try:
+        return [
+            _detach(item)
+            for item in db.query(ArticleSubscription).order_by(ArticleSubscription.id.desc()).all()
+        ]
+    finally:
+        db.close()
+def get_subscription(subscription_id: int) -> ArticleSubscription | None:
+    db = next(get_db())
+    try:
+        item = db.query(ArticleSubscription).filter_by(id=subscription_id).first()
+        return _detach(item) if item else None
+    finally:
+        db.close()
+def update_subscription_refresh(subscription_id: int, error: str = "") -> None:
+    """Stamp a subscription's ``last_refresh_at`` with the local time and store ``error`` in ``last_error``.
+    Does nothing when the subscription does not exist.
+    """
+    session = next(get_db())
+    try:
+        record = session.query(ArticleSubscription).filter_by(id=subscription_id).first()
+        if not record:
+            return
+        record.last_refresh_at = datetime.now()
+        record.last_error = error
+        session.commit()
+    finally:
+        session.close()
+def link_subscription_item(subscription_id: int, article_item_id: int, match_reason: str) -> None:
+    db = next(get_db())
+    try:
+        existing = (
+            db.query(ArticleSubscriptionItem)
+            .filter_by(subscription_id=subscription_id, article_item_id=article_item_id)
+            .first()
+        )
+        if existing is None:
+            db.add(
+                ArticleSubscriptionItem(
+                    subscription_id=subscription_id,
+                    article_item_id=article_item_id,
+                    match_reason=match_reason,
+                )
+            )
+            db.commit()
+    finally:
+        db.close()

backend/app/db/builtin_providers.json ADDED Viewed

	@@ -0,0 +1,65 @@

+[
+  {
+    "id": "openai",
+    "name": "OpenAI",
+    "type": "built-in",
+    "logo": "OpenAI",
+    "api_key": "",
+    "base_url": "https://api.openai.com/v1",
+    "enabled": 0
+  },
+  {
+    "id": "deepseek",
+    "name": "DeepSeek",
+    "type": "built-in",
+    "logo": "DeepSeek",
+    "api_key": "",
+    "base_url": "https://api.deepseek.com",
+    "enabled": 1
+  },
+  {
+    "id": "qwen",
+    "name": "Qwen",
+    "type": "built-in",
+    "logo": "Qwen",
+    "api_key": "",
+    "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+    "enabled": 0
+  },
+  {
+    "id": "Claude",
+    "name": "Claude",
+    "type": "built-in",
+    "logo": "Claude",
+    "api_key": "",
+    "base_url": "https://",
+    "enabled": 0
+  },
+  {
+    "id": "gemini",
+    "name": "Gemini",
+    "type": "built-in",
+    "logo": "Gemini",
+    "api_key": "",
+    "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
+    "enabled": 0
+  },
+  {
+    "id": "groq",
+    "name": "Groq",
+    "type": "built-in",
+    "logo": "Groq",
+    "api_key": "",
+    "base_url": "https://api.groq.com/openai/v1",
+    "enabled": 0
+  },
+  {
+    "id": "ollama",
+    "name": "ollama",
+    "type": "built-in",
+    "logo": "Ollama",
+    "api_key": "",
+    "base_url": "http://127.0.0.1:11434/v1",
+    "enabled": 0
+  }
+]

backend/app/db/engine.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import os
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker, declarative_base
+from dotenv import load_dotenv
+load_dotenv()
+# 默认 SQLite，如果想换 PostgreSQL 或 MySQL，可以直接改 .env
+DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///video_memo.db")
+# SQLite 需要特定连接参数，其他数据库不需要
+engine_args = {}
+if DATABASE_URL.startswith("sqlite"):
+    engine_args["connect_args"] = {"check_same_thread": False}
+_pool_args = {}
+if not DATABASE_URL.startswith("sqlite"):
+    _pool_args = {
+        "pool_size": int(os.getenv("DB_POOL_SIZE", "10")),
+        "max_overflow": int(os.getenv("DB_MAX_OVERFLOW", "20")),
+        "pool_pre_ping": True,
+    }
+engine = create_engine(
+    DATABASE_URL,
+    echo=os.getenv("SQLALCHEMY_ECHO", "false").lower() == "true",
+    **engine_args,
+    **_pool_args,
+)
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+Base = declarative_base()
+def get_engine():
+    return engine
+def get_db():
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()

backend/app/db/init_db.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from app.db.models.articles import ArticleItem, ArticleSubscription, ArticleSubscriptionItem
+from app.db.models.models import Model
+from app.db.models.providers import Provider
+from app.db.models.trend_subscription import (
+    NotificationChannel,
+    TrendSubscription,
+    TrendSubscriptionMatch,
+)
+from app.db.models.video_tasks import VideoTask
+from app.db.engine import get_engine, Base
+from sqlalchemy import inspect, text
+def init_db():
+    engine = get_engine()
+    Base.metadata.create_all(bind=engine)
+    _ensure_article_content_text(engine)
+# 注：原 _ensure_model_columns 为 models.supports_multimodal 做的迁移已删除——
+# 该列在「drop multimodal」重构后已不再被 ORM 使用（纯遗留），且它的
+# `ALTER ... BOOLEAN NOT NULL DEFAULT 0` 在 Postgres 上会因 boolean 默认值类型不符直接报错。
+# 已有 SQLite 库里残留的该列无害，保持不动即可。
+def _ensure_article_content_text(engine):
+    inspector = inspect(engine)
+    if "article_items" not in inspector.get_table_names():
+        return
+    columns = {column["name"] for column in inspector.get_columns("article_items")}
+    if "content_text" in columns:
+        return
+    with engine.begin() as conn:
+        conn.execute(text("ALTER TABLE article_items ADD COLUMN content_text TEXT NOT NULL DEFAULT ''"))

backend/app/db/model_dao.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from app.db.engine import get_db
+from app.db.models.models import Model
+from app.db.models.providers import Provider
+def get_model_by_provider_and_name(provider_id: int, model_name: str):
+    db = next(get_db())
+    try:
+        model = db.query(Model).filter_by(provider_id=provider_id, model_name=model_name).first()
+        if model:
+            return {
+                "id": model.id,
+                "provider_id": model.provider_id,
+                "model_name": model.model_name,
+                "created_at": model.created_at,
+            }
+        return None
+    finally:
+        db.close()
+def insert_model(provider_id: int, model_name: str):
+    db = next(get_db())
+    try:
+        model = Model(provider_id=provider_id, model_name=model_name)
+        db.add(model)
+        db.commit()
+        db.refresh(model)
+        return {
+            "id": model.id,
+            "provider_id": model.provider_id,
+            "model_name": model.model_name,
+            "created_at": model.created_at,
+        }
+    finally:
+        db.close()
+def get_models_by_provider(provider_id: int):
+    db = next(get_db())
+    try:
+        models = db.query(Model).filter_by(provider_id=provider_id).all()
+        return [{"id": m.id, "model_name": m.model_name} for m in models]
+    finally:
+        db.close()
+def delete_model(model_id: int):
+    db = next(get_db())
+    try:
+        model = db.query(Model).filter_by(id=model_id).first()
+        if model:
+            db.delete(model)
+            db.commit()
+    finally:
+        db.close()
+def get_all_models():
+    db = next(get_db())
+    try:
+        # 只查询启用状态供应商的模型
+        models = db.query(Model).join(Provider, Model.provider_id == Provider.id).filter(Provider.enabled == 1).all()
+        return [
+            {"id": m.id, "provider_id": m.provider_id, "model_name": m.model_name}
+            for m in models
+        ]
+    finally:
+        db.close()

backend/app/db/models/__init__.py ADDED Viewed

File without changes

backend/app/db/models/articles.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint, func
+from app.db.engine import Base
+class ArticleItem(Base):
+    __tablename__ = "article_items"
+    __table_args__ = (
+        UniqueConstraint("platform", "article_id", name="uq_article_platform_article_id"),
+        UniqueConstraint("platform", "url_hash", name="uq_article_platform_url_hash"),
+    )
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    platform = Column(String, nullable=False)
+    article_id = Column(String, nullable=False, default="")
+    url = Column(Text, nullable=False)
+    url_hash = Column(String, nullable=False)
+    title = Column(String, nullable=False)
+    author_name = Column(String, nullable=False, default="")
+    author_id = Column(String, nullable=False, default="")
+    summary_status = Column(String, nullable=False, default="pending")
+    task_id = Column(String, nullable=False, default="")
+    cover_url = Column(Text, nullable=False, default="")
+    published_at = Column(String, nullable=False, default="")
+    content_text = Column(Text, nullable=False, default="")
+    discovered_at = Column(DateTime, server_default=func.now())
+    raw_metadata = Column(Text, nullable=False, default="{}")
+class ArticleSubscription(Base):
+    __tablename__ = "article_subscriptions"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    platform = Column(String, nullable=False)
+    type = Column(String, nullable=False)
+    query = Column(Text, nullable=False)
+    label = Column(String, nullable=False, default="")
+    enabled = Column(Boolean, nullable=False, default=True)
+    last_refresh_at = Column(DateTime, nullable=True)
+    last_error = Column(Text, nullable=False, default="")
+    created_at = Column(DateTime, server_default=func.now())
+    updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
+class ArticleSubscriptionItem(Base):
+    __tablename__ = "article_subscription_items"
+    __table_args__ = (
+        UniqueConstraint("subscription_id", "article_item_id", name="uq_subscription_article_item"),
+    )
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    subscription_id = Column(Integer, ForeignKey("article_subscriptions.id"), nullable=False)
+    article_item_id = Column(Integer, ForeignKey("article_items.id"), nullable=False)
+    matched_at = Column(DateTime, server_default=func.now())
+    match_reason = Column(Text, nullable=False, default="")

backend/app/db/models/models.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from sqlalchemy import Column, Integer, String, DateTime, func, ForeignKey
+from app.db.engine import Base
+class Model(Base):
+    __tablename__ = "models"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    provider_id = Column(Integer, nullable=False)
+    model_name = Column(String, nullable=False)
+    created_at = Column(DateTime, server_default=func.now())

backend/app/db/models/providers.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from sqlalchemy import Column, String, Integer, DateTime, func
+from sqlalchemy.orm import declarative_base
+from app.db.engine import Base
+class Provider(Base):
+    __tablename__ = "providers"
+    id = Column(String, primary_key=True)
+    name = Column(String, nullable=False)
+    logo = Column(String, nullable=False)
+    type = Column(String, nullable=False)
+    api_key = Column(String, nullable=False)
+    base_url = Column(String, nullable=False)
+    enabled = Column(Integer, default=1)
+    created_at = Column(DateTime, server_default=func.now())

backend/app/db/models/trend_subscription.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, func
+from app.db.engine import Base
+class TrendSubscription(Base):
+    __tablename__ = "trend_subscriptions"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    name = Column(String, nullable=False)
+    keywords = Column(Text, nullable=False, default="[]")  # JSON array of keyword strings
+    platforms = Column(Text, nullable=False, default='["all"]')  # JSON array of platform ids
+    match_mode = Column(String, nullable=False, default="any")  # "any" | "all"
+    enabled = Column(Boolean, nullable=False, default=True)
+    push_enabled = Column(Boolean, nullable=False, default=False)
+    push_channel_ids = Column(Text, nullable=False, default="[]")  # JSON array of channel ids
+    last_matched_at = Column(DateTime, nullable=True)
+    created_at = Column(DateTime, server_default=func.now())
+    updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
+class TrendSubscriptionMatch(Base):
+    __tablename__ = "trend_subscription_matches"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    subscription_id = Column(Integer, ForeignKey("trend_subscriptions.id"), nullable=False)
+    platform = Column(String, nullable=False)
+    item_id = Column(String, nullable=False)
+    title = Column(String, nullable=False)
+    url = Column(Text, nullable=False, default="")
+    hot_score = Column(String, nullable=False, default="")
+    matched_keywords = Column(Text, nullable=False, default="[]")  # JSON array of matched keywords
+    matched_at = Column(DateTime, server_default=func.now())
+    is_read = Column(Boolean, nullable=False, default=False)
+    # dedup: same subscription + same platform + same item_id
+    __table_args__ = (
+        {"sqlite_autoincrement": True},
+    )
+class NotificationChannel(Base):
+    __tablename__ = "notification_channels"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    name = Column(String, nullable=False)
+    type = Column(String, nullable=False)  # "webhook" | "bark" | "email"
+    config = Column(Text, nullable=False, default="{}")  # JSON object, type-specific
+    enabled = Column(Boolean, nullable=False, default=True)
+    created_at = Column(DateTime, server_default=func.now())
+    updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())

backend/app/db/models/video_tasks.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from sqlalchemy import Column, Integer, String, DateTime, func
+from sqlalchemy.orm import declarative_base
+from app.db.engine import Base
+class VideoTask(Base):
+    __tablename__ = "video_tasks"
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    video_id = Column(String, nullable=False)
+    platform = Column(String, nullable=False)
+    task_id = Column(String, unique=True, nullable=False)
+    created_at = Column(DateTime, server_default=func.now())

backend/app/db/provider_dao.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import json
+import os
+import sys
+from app.db.models.providers import Provider
+from app.utils.logger import get_logger
+from app.db.engine import get_engine, Base, get_db
+logger = get_logger(__name__)
+def get_builtin_providers_path():
+    if getattr(sys, 'frozen', False):
+        base_path = sys._MEIPASS
+    else:
+        base_path = os.path.dirname(__file__)
+    return os.path.join(base_path, 'builtin_providers.json')
+def seed_default_providers():
+    db = next(get_db())
+    try:
+        if db.query(Provider).count() > 0:
+            logger.info("Providers already exist, skipping seed.")
+            return
+        json_path = get_builtin_providers_path()
+        try:
+            with open(json_path, 'r', encoding='utf-8') as f:
+                providers = json.load(f)
+        except Exception as e:
+            logger.error(f"Failed to read builtin_providers.json: {e}")
+            return
+        for p in providers:
+            db.add(Provider(
+                id=p['id'],
+                name=p['name'],
+                api_key=p['api_key'],
+                base_url=p['base_url'],
+                logo=p['logo'],
+                type=p['type'],
+                enabled=p.get('enabled', 1)
+            ))
+        db.commit()
+        logger.info("Default providers seeded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to seed default providers: {e}")
+    finally:
+        db.close()
+def insert_provider(id: str, name: str, api_key: str, base_url: str, logo: str, type_: str, enabled: int = 1):
+    db = next(get_db())
+    try:
+        provider = Provider(id=id, name=name, api_key=api_key, base_url=base_url, logo=logo, type=type_, enabled=enabled)
+        db.add(provider)
+        db.commit()
+        logger.info(f"Provider inserted successfully. id: {id}, name: {name}, type: {type_}")
+        return id
+    except Exception as e:
+        logger.error(f"Failed to insert provider: {e}")
+    finally:
+        db.close()
+def get_enabled_providers():
+    db = next(get_db())
+    try:
+        return db.query(Provider).filter_by(enabled=1).all()
+    finally:
+        db.close()
+def get_provider_by_name(name: str):
+    db = next(get_db())
+    try:
+        return db.query(Provider).filter_by(name=name).first()
+    finally:
+        db.close()
+def get_provider_by_id(id: str):
+    db = next(get_db())
+    try:
+        return db.query(Provider).filter_by(id=id).first()
+    finally:
+        db.close()
+def get_all_providers():
+    db = next(get_db())
+    try:
+        return db.query(Provider).all()
+    finally:
+        db.close()
+def update_provider(id: str, **kwargs):
+    db = next(get_db())
+    try:
+        provider = db.query(Provider).filter_by(id=id).first()
+        if not provider:
+            logger.warning(f"Provider {id} not found for update.")
+            return
+        for key, value in kwargs.items():
+            if hasattr(provider, key):
+                setattr(provider, key, value)
+        db.commit()
+        logger.info(f"Provider updated successfully. id: {id}, updated_fields: {list(kwargs.keys())}")
+    except Exception as e:
+        logger.error(f"Failed to update provider: {e}")
+    finally:
+        db.close()
+def delete_provider(id: str):
+    db = next(get_db())
+    try:
+        provider = db.query(Provider).filter_by(id=id).first()
+        if provider:
+            db.delete(provider)
+            db.commit()
+            logger.info(f"Provider deleted successfully. id: {id}")
+    except Exception as e:
+        logger.error(f"Failed to delete provider: {e}")
+    finally:
+        db.close()

backend/app/db/sqlite_client.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import sqlite3
+def get_connection():
+    return sqlite3.connect("video_memo.db")

backend/app/db/trend_subscription_dao.py ADDED Viewed

	@@ -0,0 +1,293 @@

+from __future__ import annotations
+import json
+from datetime import datetime
+from app.db.engine import get_db
+from app.db.models.trend_subscription import (
+    NotificationChannel,
+    TrendSubscription,
+    TrendSubscriptionMatch,
+)
+def _detach(obj):
+    data = {key: value for key, value in obj.__dict__.items() if not key.startswith("_")}
+    obj.__dict__.clear()
+    obj.__dict__.update(data)
+    return obj
+# ─── Trend Subscriptions ──────────────────────────────────────────────────────────
+def create_subscription(
+    name: str,
+    keywords: list[str],
+    platforms: list[str] | None = None,
+    match_mode: str = "any",
+    push_enabled: bool = False,
+    push_channel_ids: list[int] | None = None,
+) -> TrendSubscription:
+    db = next(get_db())
+    try:
+        sub = TrendSubscription(
+            name=name,
+            keywords=json.dumps(keywords, ensure_ascii=False),
+            platforms=json.dumps(platforms or ["all"], ensure_ascii=False),
+            match_mode=match_mode,
+            push_enabled=push_enabled,
+            push_channel_ids=json.dumps(push_channel_ids or []),
+        )
+        db.add(sub)
+        db.commit()
+        db.refresh(sub)
+        return _detach(sub)
+    finally:
+        db.close()
+def list_subscriptions() -> list[TrendSubscription]:
+    db = next(get_db())
+    try:
+        return [
+            _detach(item)
+            for item in db.query(TrendSubscription).order_by(TrendSubscription.id.desc()).all()
+        ]
+    finally:
+        db.close()
+def get_subscription(subscription_id: int) -> TrendSubscription | None:
+    db = next(get_db())
+    try:
+        item = db.query(TrendSubscription).filter_by(id=subscription_id).first()
+        return _detach(item) if item else None
+    finally:
+        db.close()
+def update_subscription(
+    subscription_id: int,
+    name: str | None = None,
+    keywords: list[str] | None = None,
+    platforms: list[str] | None = None,
+    match_mode: str | None = None,
+    enabled: bool | None = None,
+    push_enabled: bool | None = None,
+    push_channel_ids: list[int] | None = None,
+) -> TrendSubscription | None:
+    db = next(get_db())
+    try:
+        sub = db.query(TrendSubscription).filter_by(id=subscription_id).first()
+        if sub is None:
+            return None
+        if name is not None:
+            sub.name = name
+        if keywords is not None:
+            sub.keywords = json.dumps(keywords, ensure_ascii=False)
+        if platforms is not None:
+            sub.platforms = json.dumps(platforms, ensure_ascii=False)
+        if match_mode is not None:
+            sub.match_mode = match_mode
+        if enabled is not None:
+            sub.enabled = enabled
+        if push_enabled is not None:
+            sub.push_enabled = push_enabled
+        if push_channel_ids is not None:
+            sub.push_channel_ids = json.dumps(push_channel_ids)
+        db.commit()
+        db.refresh(sub)
+        return _detach(sub)
+    finally:
+        db.close()
+def delete_subscription(subscription_id: int) -> bool:
+    db = next(get_db())
+    try:
+        sub = db.query(TrendSubscription).filter_by(id=subscription_id).first()
+        if sub is None:
+            return False
+        # also delete associated matches
+        db.query(TrendSubscriptionMatch).filter_by(subscription_id=subscription_id).delete()
+        db.delete(sub)
+        db.commit()
+        return True
+    finally:
+        db.close()
+def update_subscription_refresh(subscription_id: int) -> None:
+    db = next(get_db())
+    try:
+        sub = db.query(TrendSubscription).filter_by(id=subscription_id).first()
+        if sub:
+            sub.last_matched_at = datetime.now()
+            db.commit()
+    finally:
+        db.close()
+# ─── Trend Subscription Matches ───────────────────────────────────────────────────
+def create_match(
+    subscription_id: int,
+    platform: str,
+    item_id: str,
+    title: str,
+    url: str = "",
+    hot_score: str = "",
+    matched_keywords: list[str] | None = None,
+) -> TrendSubscriptionMatch | None:
+    """Create a match record. Returns None if this (subscription, platform, item_id) already exists."""
+    db = next(get_db())
+    try:
+        existing = (
+            db.query(TrendSubscriptionMatch)
+            .filter_by(subscription_id=subscription_id, platform=platform, item_id=item_id)
+            .first()
+        )
+        if existing is not None:
+            return None  # already matched before
+        match = TrendSubscriptionMatch(
+            subscription_id=subscription_id,
+            platform=platform,
+            item_id=item_id,
+            title=title,
+            url=url,
+            hot_score=hot_score,
+            matched_keywords=json.dumps(matched_keywords or [], ensure_ascii=False),
+        )
+        db.add(match)
+        db.commit()
+        db.refresh(match)
+        return _detach(match)
+    finally:
+        db.close()
+def list_matches(
+    subscription_id: int | None = None,
+    limit: int = 100,
+    unread_only: bool = False,
+) -> list[TrendSubscriptionMatch]:
+    db = next(get_db())
+    try:
+        query = db.query(TrendSubscriptionMatch)
+        if subscription_id is not None:
+            query = query.filter_by(subscription_id=subscription_id)
+        if unread_only:
+            query = query.filter_by(is_read=False)
+        return [
+            _detach(item)
+            for item in query.order_by(TrendSubscriptionMatch.matched_at.desc())
+            .limit(limit)
+            .all()
+        ]
+    finally:
+        db.close()
+def mark_matches_read(subscription_id: int) -> int:
+    """Mark all matches for a subscription as read. Returns count of updated rows."""
+    db = next(get_db())
+    try:
+        count = (
+            db.query(TrendSubscriptionMatch)
+            .filter_by(subscription_id=subscription_id, is_read=False)
+            .update({"is_read": True})
+        )
+        db.commit()
+        return count
+    finally:
+        db.close()
+def count_unread_matches(subscription_id: int) -> int:
+    db = next(get_db())
+    try:
+        return (
+            db.query(TrendSubscriptionMatch)
+            .filter_by(subscription_id=subscription_id, is_read=False)
+            .count()
+        )
+    finally:
+        db.close()
+# ─── Notification Channels ────────────────────────────────────────────────────────
+def create_channel(name: str, channel_type: str, config: dict | None = None) -> NotificationChannel:
+    db = next(get_db())
+    try:
+        channel = NotificationChannel(
+            name=name,
+            type=channel_type,
+            config=json.dumps(config or {}, ensure_ascii=False),
+        )
+        db.add(channel)
+        db.commit()
+        db.refresh(channel)
+        return _detach(channel)
+    finally:
+        db.close()
+def list_channels() -> list[NotificationChannel]:
+    db = next(get_db())
+    try:
+        return [
+            _detach(item)
+            for item in db.query(NotificationChannel).order_by(NotificationChannel.id.desc()).all()
+        ]
+    finally:
+        db.close()
+def get_channel(channel_id: int) -> NotificationChannel | None:
+    db = next(get_db())
+    try:
+        item = db.query(NotificationChannel).filter_by(id=channel_id).first()
+        return _detach(item) if item else None
+    finally:
+        db.close()
+def update_channel(
+    channel_id: int,
+    name: str | None = None,
+    channel_type: str | None = None,
+    config: dict | None = None,
+    enabled: bool | None = None,
+) -> NotificationChannel | None:
+    db = next(get_db())
+    try:
+        channel = db.query(NotificationChannel).filter_by(id=channel_id).first()
+        if channel is None:
+            return None
+        if name is not None:
+            channel.name = name
+        if channel_type is not None:
+            channel.type = channel_type
+        if config is not None:
+            channel.config = json.dumps(config, ensure_ascii=False)
+        if enabled is not None:
+            channel.enabled = enabled
+        db.commit()
+        db.refresh(channel)
+        return _detach(channel)
+    finally:
+        db.close()
+def delete_channel(channel_id: int) -> bool:
+    db = next(get_db())
+    try:
+        channel = db.query(NotificationChannel).filter_by(id=channel_id).first()
+        if channel is None:
+            return False
+        db.delete(channel)
+        db.commit()
+        return True
+    finally:
+        db.close()

backend/app/db/video_task_dao.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from app.db.models.video_tasks import VideoTask
+from app.db.engine import get_db
+from app.utils.logger import get_logger
+logger = get_logger(__name__)
+# 插入任务
+def insert_video_task(video_id: str, platform: str, task_id: str):
+    db = next(get_db())
+    try:
+        task = VideoTask(video_id=video_id, platform=platform, task_id=task_id)
+        db.add(task)
+        db.commit()
+        db.refresh(task)
+        logger.info(f"Video task inserted successfully. video_id: {video_id}, platform: {platform}, task_id: {task_id}")
+    except Exception as e:
+        logger.error(f"Failed to insert video task: {e}")
+    finally:
+        db.close()
+# 查询任务（最新一条）
+def get_task_by_video(video_id: str, platform: str):
+    db = next(get_db())
+    try:
+        task = (
+            db.query(VideoTask)
+            .filter_by(video_id=video_id, platform=platform)
+            .order_by(VideoTask.created_at.desc())
+            .first()
+        )
+        if task:
+            logger.info(f"Task found for video_id: {video_id} and platform: {platform}")
+            return task.task_id
+        else:
+            logger.info(f"No task found for video_id: {video_id} and platform: {platform}")
+            return None
+    except Exception as e:
+        logger.error(f"Failed to get task by video: {e}")
+    finally:
+        db.close()
+# 删除任务
+def delete_task_by_video(video_id: str, platform: str):
+    db = next(get_db())
+    try:
+        tasks = (
+            db.query(VideoTask)
+            .filter_by(video_id=video_id, platform=platform)
+            .all()
+        )
+        for task in tasks:
+            db.delete(task)
+        db.commit()
+        logger.info(f"Task(s) deleted for video_id: {video_id} and platform: {platform}")
+    except Exception as e:
+        logger.error(f"Failed to delete task by video: {e}")
+    finally:
+        db.close()

backend/app/decorators/__init__.py ADDED Viewed

File without changes

backend/app/decorators/timeit.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import time
+import functools
+def timeit(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        start = time.perf_counter()
+        result = func(*args, **kwargs)
+        end = time.perf_counter()
+        duration = end - start
+        print(f"{func.__name__} executed in {duration:.4f} seconds")
+        return result
+    return wrapper

backend/app/downloaders/__init__.py ADDED Viewed

File without changes

backend/app/downloaders/base.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import enum
+from abc import ABC, abstractmethod
+from typing import Optional, Union
+from app.enmus.note_enums import DownloadQuality
+from app.models.notes_model import AudioDownloadResult
+from app.models.transcriber_model import TranscriptResult
+from os import getenv
+QUALITY_MAP = {
+    "fast": "32",
+    "medium": "64",
+    "slow": "128"
+}
+class Downloader(ABC):
+    def __init__(self):
+        #TODO 需要修改为可配置
+        self.quality = QUALITY_MAP.get('fast')
+        self.cache_data=getenv('DATA_DIR')
+    @abstractmethod
+    def download(self, video_url: str, output_dir: str = None,
+                 quality: DownloadQuality = "fast", need_video: Optional[bool] = False,
+                 skip_download: bool = False) -> AudioDownloadResult:
+        '''
+        :param need_video:
+        :param video_url: 资源链接
+        :param output_dir: 输出路径 默认根目录data
+        :param quality: 音频质量 fast | medium | slow
+        :return:返回一个 AudioDownloadResult 类
+        '''
+        pass
+    @staticmethod
+    def download_video(self, video_url: str,
+                       output_dir: Union[str, None] = None) -> str:
+        pass
+    def download_subtitles(self, video_url: str, output_dir: str = None,
+                           langs: list = None) -> Optional[TranscriptResult]:
+        '''
+        尝试获取平台字幕（人工字幕或自动生成字幕）
+        :param video_url: 视频链接
+        :param output_dir: 输出路径
+        :param langs: 优先语言列表，如 ['zh-Hans', 'zh', 'en']
+        :return: TranscriptResult 或 None（无字幕时）
+        '''
+        return None

backend/app/downloaders/bilibili_downloader.py ADDED Viewed

	@@ -0,0 +1,343 @@

+import os
+import json
+import logging
+import tempfile
+from abc import ABC
+from typing import Union, Optional, List
+import yt_dlp
+from app.downloaders.base import Downloader, DownloadQuality, QUALITY_MAP
+from app.downloaders.bilibili_subtitle import BilibiliSubtitleFetcher
+from app.models.notes_model import AudioDownloadResult
+from app.models.transcriber_model import TranscriptResult, TranscriptSegment
+from app.utils.path_helper import get_data_dir
+from app.utils.url_parser import extract_video_id
+from app.services.cookie_manager import CookieConfigManager
+logger = logging.getLogger(__name__)
+class BilibiliDownloader(Downloader, ABC):
+    def __init__(self):
+        """Load the configured Bilibili cookie and write it to a cookie file for yt-dlp."""
+        super().__init__()
+        self._cookie_mgr = CookieConfigManager()
+        # Cookie value stored under the 'bilibili' key; falsy when none is configured.
+        self._cookie = self._cookie_mgr.get('bilibili')
+        # Path of the temporary Netscape cookie file, or None when no cookie is set.
+        self._cookiefile = self._write_netscape_cookie_file()
+    def _write_netscape_cookie_file(self) -> Optional[str]:
+        """将 Cookie 写入 Netscape 格式临时文件，返回文件路径（供 yt-dlp cookiefile 使用）"""
+        if not self._cookie:
+            logger.warning("B站 Cookie 未配置，下载可能失败")
+            return None
+        lines = ["# Netscape HTTP Cookie File\n"]
+        for pair in self._cookie.split("; "):
+            if "=" in pair:
+                key, value = pair.split("=", 1)
+                lines.append(f".bilibili.com\tTRUE\t/\tFALSE\t0\t{key}\t{value}\n")
+        tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8')
+        tmp.writelines(lines)
+        tmp.close()
+        logger.info("已生成 B站 Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1)
+        return tmp.name
+    def download(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+        quality: DownloadQuality = "fast",
+        need_video:Optional[bool]=False
+    ) -> AudioDownloadResult:
+        if output_dir is None:
+            output_dir = get_data_dir()
+        if not output_dir:
+            output_dir=self.cache_data
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
+        ydl_opts = {
+            'format': 'bestaudio[ext=m4a]/bestaudio/best',
+            'outtmpl': output_path,
+            'http_headers': {'Referer': 'https://www.bilibili.com'},
+            'postprocessors': [
+                {
+                    'key': 'FFmpegExtractAudio',
+                    'preferredcodec': 'mp3',
+                    'preferredquality': '64',
+                }
+            ],
+            'noplaylist': True,
+            'quiet': False,
+        }
+        if self._cookiefile:
+            ydl_opts['cookiefile'] = self._cookiefile
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(video_url, download=True)
+            video_id = info.get("id")
+            title = info.get("title")
+            duration = info.get("duration", 0)
+            cover_url = info.get("thumbnail")
+            audio_path = os.path.join(output_dir, f"{video_id}.mp3")
+        return AudioDownloadResult(
+            file_path=audio_path,
+            title=title,
+            duration=duration,
+            cover_url=cover_url,
+            platform="bilibili",
+            video_id=video_id,
+            raw_info=info,
+            video_path=None  # ❗音频下载不包含视频路径
+        )
+    def download_video(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+    ) -> str:
+        """
+        下载视频，返回视频文件路径
+        """
+        if output_dir is None:
+            output_dir = get_data_dir()
+        os.makedirs(output_dir, exist_ok=True)
+        print("video_url",video_url)
+        video_id=extract_video_id(video_url, "bilibili")
+        video_path = os.path.join(output_dir, f"{video_id}.mp4")
+        if os.path.exists(video_path):
+            return video_path
+        # 检查是否已经存在
+        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
+        ydl_opts = {
+            'format': 'bv*[ext=mp4]/bestvideo+bestaudio/best',
+            'outtmpl': output_path,
+            'http_headers': {'Referer': 'https://www.bilibili.com'},
+            'noplaylist': True,
+            'quiet': False,
+            'merge_output_format': 'mp4',  # 确保合并成 mp4
+        }
+        if self._cookiefile:
+            ydl_opts['cookiefile'] = self._cookiefile
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(video_url, download=True)
+            video_id = info.get("id")
+            video_path = os.path.join(output_dir, f"{video_id}.mp4")
+        if not os.path.exists(video_path):
+            raise FileNotFoundError(f"视频文件未找到: {video_path}")
+        return video_path
+    def delete_video(self, video_path: str) -> str:
+        """
+        删除视频文件
+        """
+        if os.path.exists(video_path):
+            os.remove(video_path)
+            return f"视频文件已删除: {video_path}"
+        else:
+            return f"视频文件未找到: {video_path}"
+    def download_subtitles(self, video_url: str, output_dir: str = None,
+                           langs: List[str] = None) -> Optional[TranscriptResult]:
+        """
+        Try to fetch subtitles for a Bilibili video.
+        The player API (BilibiliSubtitleFetcher) is tried first; when it yields no
+        segments or raises, yt-dlp is used as a fallback.
+        :param video_url: URL of the video
+        :param output_dir: output directory for subtitle files written by yt-dlp
+        :param langs: preferred languages in priority order
+        :return: TranscriptResult, or None when no subtitles could be obtained
+        """
+        # 1) Prefer Bilibili's own player API (direct fetch, no video download; AI subtitles need the SESSDATA cookie)
+        try:
+            result = BilibiliSubtitleFetcher().fetch_subtitles(video_url)
+            if result and result.segments:
+                return result
+        except Exception as e:
+            logger.warning(f"player API 直拉字幕异常，回退到 yt-dlp: {e}")
+        # 2) Fallback: the original yt-dlp path (more fragile; fails more often on signature/cookie problems)
+        if output_dir is None:
+            output_dir = get_data_dir()
+        if not output_dir:
+            output_dir = self.cache_data
+        os.makedirs(output_dir, exist_ok=True)
+        if langs is None:
+            langs = ['zh-Hans', 'zh', 'zh-CN', 'ai-zh', 'en', 'en-US']
+        video_id = extract_video_id(video_url, "bilibili")
+        ydl_opts = {
+            'writesubtitles': True,
+            'writeautomaticsub': True,
+            'subtitleslangs': langs,
+            'subtitlesformat': 'srt/json3/best',  # accept several formats
+            'skip_download': True,
+            'outtmpl': os.path.join(output_dir, f'{video_id}.%(ext)s'),
+            'quiet': True,
+        }
+        # Inject the Bilibili cookie obtained via CookieConfigManager (Netscape cookiefile)
+        if self._cookiefile:
+            ydl_opts['cookiefile'] = self._cookiefile
+            ydl_opts['http_headers'] = {'Referer': 'https://www.bilibili.com'}
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(video_url, download=True)
+                # Look up the subtitles yt-dlp selected
+                subtitles = info.get('requested_subtitles') or {}
+                if not subtitles:
+                    logger.info(f"B站视频 {video_id} 没有可用字幕")
+                    return None
+                # Pick a track following the language priority
+                detected_lang = None
+                sub_info = None
+                for lang in langs:
+                    if lang in subtitles:
+                        detected_lang = lang
+                        sub_info = subtitles[lang]
+                        break
+                # Nothing matched the priority list: take the first available track (excluding danmaku)
+                if not detected_lang:
+                    for lang, info_item in subtitles.items():
+                        if lang != 'danmaku':  # skip danmaku (bullet comments)
+                            detected_lang = lang
+                            sub_info = info_item
+                            break
+                if not sub_info:
+                    logger.info(f"B站视频 {video_id} 没有可用字幕（排除弹幕）")
+                    return None
+                # Use embedded data when present (yt-dlp sometimes returns the subtitle content directly)
+                # NOTE(review): embedded data is always parsed as SRT, whatever 'ext' says - confirm that is intended
+                if 'data' in sub_info and sub_info['data']:
+                    logger.info(f"直接从返回数据解析字幕: {detected_lang}")
+                    return self._parse_srt_content(sub_info['data'], detected_lang)
+                # Otherwise locate the subtitle file written to output_dir
+                ext = sub_info.get('ext', 'srt')
+                subtitle_file = os.path.join(output_dir, f"{video_id}.{detected_lang}.{ext}")
+                if not os.path.exists(subtitle_file):
+                    logger.info(f"字幕文件不存在: {subtitle_file}")
+                    return None
+                # Parse the file according to its format
+                if ext == 'json3':
+                    return self._parse_json3_subtitle(subtitle_file, detected_lang)
+                else:
+                    with open(subtitle_file, 'r', encoding='utf-8') as f:
+                        return self._parse_srt_content(f.read(), detected_lang)
+        except Exception as e:
+            # Best effort: any failure is logged and reported as "no subtitles"
+            logger.warning(f"获取B站字幕失败: {e}")
+            return None
+    def _parse_srt_content(self, srt_content: str, language: str) -> Optional[TranscriptResult]:
+        """
+        解析 SRT 格式字幕内容
+        :param srt_content: SRT 字幕文本内容
+        :param language: 语言代码
+        :return: TranscriptResult
+        """
+        import re
+        try:
+            segments = []
+            # SRT 格式: 序号\n时间戳\n文本\n\n
+            pattern = r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?=\n\n|\n\d+\n|$)'
+            matches = re.findall(pattern, srt_content, re.DOTALL)
+            for match in matches:
+                idx, start_time, end_time, text = match
+                text = text.strip()
+                if not text:
+                    continue
+                # 转换时间格式 00:00:00,000 -> 秒
+                def time_to_seconds(t):
+                    parts = t.replace(',', '.').split(':')
+                    return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2])
+                segments.append(TranscriptSegment(
+                    start=time_to_seconds(start_time),
+                    end=time_to_seconds(end_time),
+                    text=text
+                ))
+            if not segments:
+                return None
+            full_text = ' '.join(seg.text for seg in segments)
+            logger.info(f"成功解析B站SRT字幕，共 {len(segments)} 段")
+            return TranscriptResult(
+                language=language,
+                full_text=full_text,
+                segments=segments,
+                raw={'source': 'bilibili_subtitle', 'format': 'srt'}
+            )
+        except Exception as e:
+            logger.warning(f"解析SRT字幕失败: {e}")
+            return None
+    def _parse_json3_subtitle(self, subtitle_file: str, language: str) -> Optional[TranscriptResult]:
+        """
+        解析 json3 格式字幕文件
+        :param subtitle_file: 字幕文件路径
+        :param language: 语言代码
+        :return: TranscriptResult
+        """
+        try:
+            with open(subtitle_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            segments = []
+            events = data.get('events', [])
+            for event in events:
+                # json3 格式中时间单位是毫秒
+                start_ms = event.get('tStartMs', 0)
+                duration_ms = event.get('dDurationMs', 0)
+                # 提取文本
+                segs = event.get('segs', [])
+                text = ''.join(seg.get('utf8', '') for seg in segs).strip()
+                if text:  # 只添加非空文本
+                    segments.append(TranscriptSegment(
+                        start=start_ms / 1000.0,
+                        end=(start_ms + duration_ms) / 1000.0,
+                        text=text
+                    ))
+            if not segments:
+                return None
+            full_text = ' '.join(seg.text for seg in segments)
+            logger.info(f"成功解析B站字幕，共 {len(segments)} 段")
+            return TranscriptResult(
+                language=language,
+                full_text=full_text,
+                segments=segments,
+                raw={'source': 'bilibili_subtitle', 'file': subtitle_file}
+            )
+        except Exception as e:
+            logger.warning(f"解析字幕文件失败: {e}")
+            return None

backend/app/downloaders/bilibili_subtitle.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""
+直接调用 B 站 player API 拿字幕，绕过 yt-dlp。
+流程：
+1. 从 URL 提 BV id（已有 utils.url_parser.extract_video_id）
+2. GET /x/web-interface/view?bvid=BVxxx → 拿 cid
+3. GET /x/player/wbi/v2?bvid=...&cid=... → 返回 data.subtitle.subtitles[]
+   每条带 subtitle_url（B 站后端已经签好 auth_key 的完整地址）
+4. 按优先级（人工 zh-CN > AI zh-CN > 任意 zh > 任意非空）选一条
+5. fetch subtitle_url → JSON {body:[{from,to,content,...}]}
+6. 解析为 TranscriptResult
+AI 字幕需要登录态 cookie（SESSDATA）；通过 CookieConfigManager 注入。
+"""
+from typing import List, Optional
+import requests
+from app.models.transcriber_model import TranscriptResult, TranscriptSegment
+from app.services.cookie_manager import CookieConfigManager
+from app.utils.logger import get_logger
+from app.utils.url_parser import extract_video_id
+logger = get_logger(__name__)
+# Desktop Chrome (macOS) User-Agent string sent with every request made by the fetcher below.
+UA = (
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+)
+class BilibiliSubtitleFetcher:
+    """通过 B 站官方 API 直拉字幕。"""
+    def __init__(self):
+        """Read the configured Bilibili cookie; an empty string is used when none is set."""
+        self._cookie = CookieConfigManager().get("bilibili") or ""
+    def _headers(self) -> dict:
+        h = {
+            "User-Agent": UA,
+            "Referer": "https://www.bilibili.com",
+        }
+        if self._cookie:
+            h["Cookie"] = self._cookie
+        return h
+    def _get_cid(self, bvid: str) -> Optional[int]:
+        url = "https://api.bilibili.com/x/web-interface/view"
+        try:
+            resp = requests.get(url, params={"bvid": bvid}, headers=self._headers(), timeout=10)
+            data = resp.json()
+        except Exception as e:
+            logger.warning(f"获取 cid 失败: {e}")
+            return None
+        if data.get("code") != 0:
+            logger.warning(f"view API 返回错误: code={data.get('code')}, msg={data.get('message')}")
+            return None
+        cid = data.get("data", {}).get("cid")
+        return int(cid) if cid else None
+    def _list_subtitles(self, bvid: str, cid: int) -> List[dict]:
+        url = "https://api.bilibili.com/x/player/wbi/v2"
+        try:
+            resp = requests.get(url, params={"bvid": bvid, "cid": cid}, headers=self._headers(), timeout=10)
+            data = resp.json()
+        except Exception as e:
+            logger.warning(f"获取字幕列表失败: {e}")
+            return []
+        if data.get("code") != 0:
+            logger.warning(f"player API 返回错误: code={data.get('code')}, msg={data.get('message')}")
+            return []
+        subtitles = data.get("data", {}).get("subtitle", {}).get("subtitles", [])
+        return subtitles or []
+    def _pick(self, subtitles: List[dict]) -> Optional[dict]:
+        """优先级：人工中文 > AI 中文 > 任意中文 > 任意非空。"""
+        if not subtitles:
+            return None
+        def is_zh(s: dict) -> bool:
+            lan = (s.get("lan") or "").lower()
+            return lan.startswith("zh") or lan == "ai-zh"
+        # 人工中文（type 0=AI, 1=人工 ；ai_type=0 视为人工）
+        for s in subtitles:
+            if is_zh(s) and not s.get("ai_type"):
+                return s
+        # AI 中文
+        for s in subtitles:
+            if is_zh(s):
+                return s
+        # 任意非空
+        return subtitles[0]
+    @staticmethod
+    def _normalize_url(url: str) -> str:
+        if url.startswith("//"):
+            return "https:" + url
+        return url
+    def _fetch_body(self, subtitle_url: str) -> Optional[List[dict]]:
+        try:
+            resp = requests.get(self._normalize_url(subtitle_url), headers=self._headers(), timeout=15)
+            data = resp.json()
+            return data.get("body") or []
+        except Exception as e:
+            logger.warning(f"下载字幕 JSON 失败: {e}")
+            return None
+    def fetch_subtitles(self, video_url: str) -> Optional[TranscriptResult]:
+        bvid = extract_video_id(video_url, "bilibili")
+        if not bvid:
+            logger.info("无法从 URL 提取 BV id")
+            return None
+        cid = self._get_cid(bvid)
+        if not cid:
+            logger.info(f"{bvid} 没有取到 cid")
+            return None
+        subtitles = self._list_subtitles(bvid, cid)
+        if not subtitles:
+            logger.info(f"{bvid} (cid={cid}) 没有可用字幕轨")
+            return None
+        track = self._pick(subtitles)
+        if not track or not track.get("subtitle_url"):
+            logger.info(f"{bvid} 字幕轨存在但没有 subtitle_url（可能未登录、需要 SESSDATA cookie）")
+            return None
+        lan = track.get("lan") or "zh"
+        body = self._fetch_body(track["subtitle_url"])
+        if not body:
+            return None
+        segments: List[TranscriptSegment] = []
+        for item in body:
+            text = (item.get("content") or "").strip()
+            if not text:
+                continue
+            segments.append(TranscriptSegment(
+                start=float(item.get("from", 0)),
+                end=float(item.get("to", 0)),
+                text=text,
+            ))
+        if not segments:
+            return None
+        full_text = " ".join(s.text for s in segments)
+        logger.info(f"B站直拉字幕成功: {bvid} lan={lan} 共 {len(segments)} 段")
+        return TranscriptResult(
+            language=lan,
+            full_text=full_text,
+            segments=segments,
+            raw={
+                "source": "bilibili_player_api",
+                "bvid": bvid,
+                "cid": cid,
+                "lan": lan,
+                "ai_type": track.get("ai_type"),
+            },
+        )

backend/app/downloaders/common.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # def download():

backend/app/downloaders/douyin_downloader.py ADDED Viewed

	@@ -0,0 +1,499 @@

+from __future__ import annotations
+import json
+import os
+import re
+import subprocess
+from dataclasses import dataclass, field
+from typing import Any, Literal, Optional, Union
+from urllib.parse import parse_qs, unquote, urlparse
+import requests
+from app.downloaders.base import Downloader
+from app.enmus.note_enums import DownloadQuality
+from app.models.audio_model import AudioDownloadResult
+from app.models.transcriber_model import TranscriptResult, TranscriptSegment
+from app.utils.path_helper import get_data_dir
+# Mobile Safari (iPhone) User-Agent string sent when requesting share pages and media files.
+SHARE_PAGE_UA = (
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
+    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
+    "Version/17.0 Mobile/15E148 Safari/604.1"
+)
+# Captures everything from the opening brace of the `window._ROUTER_DATA = {...` assignment to the end of the input.
+ROUTER_DATA_RE = re.compile(r"window\._ROUTER_DATA\s*=\s*(\{.+)", re.DOTALL)
+# Captures the text content of the <script id="RENDER_DATA" type="application/json"> tag.
+RENDER_DATA_RE = re.compile(
+    r'<script id="RENDER_DATA" type="application/json">([^<]+)</script>'
+)
+# Matches a Douyin URL on one of the listed hosts, up to the next whitespace or "]".
+DOUYIN_URL_RE = re.compile(
+    r"https?://(?:v\.douyin\.com|www\.douyin\.com|www\.iesdouyin\.com|m\.douyin\.com)[^\s\]]*"
+)
+# aweme_type values that are treated as image posts.
+IMAGE_AWEME_TYPES = {2, 68}
+class DouyinResolveError(Exception):
+    pass
+@dataclass
+class DouyinContentMeta:
+    """Metadata of one Douyin post (video or image post) as resolved from its share page."""
+    # Id of the post
+    aweme_id: str
+    # Description/caption of the post
+    title: str
+    # Author nickname or unique id ("" when unknown)
+    author: str
+    # The share URL the metadata was resolved from
+    source_url: str
+    # "video" for playable posts, "image" for image posts
+    content_type: Literal["video", "image"] = "video"
+    # Raw aweme_type value of the post, when present
+    aweme_type: Optional[int] = None
+    # Media URL for video posts ("" by default)
+    download_url: str = ""
+    cover_url: Optional[str] = None
+    # Image URLs for image posts
+    image_urls: list[str] = field(default_factory=list)
+    # NOTE(review): presumably seconds - confirm against the code that fills it
+    duration: float = 0
+    tags: list[str] = field(default_factory=list)
+def _session() -> requests.Session:
+    session = requests.Session()
+    session.headers.update(
+        {
+            "User-Agent": SHARE_PAGE_UA,
+            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        }
+    )
+    return session
+def expand_share_url(share_text: str) -> str:
+    """从抖音分享文案中提取可访问链接。"""
+    match = DOUYIN_URL_RE.search((share_text or "").strip())
+    if not match:
+        raise DouyinResolveError("未在输入中找到抖音链接")
+    return match.group(0).rstrip("/.,;)")
+def _extract_aweme_id_from_search_url(url: str) -> Optional[str]:
+    parsed = urlparse(url)
+    if not parsed.netloc.endswith("douyin.com") or not parsed.path.startswith("/search"):
+        return None
+    params = parse_qs(parsed.query)
+    for key in ("modal_id", "item_ids"):
+        for value in params.get(key, []):
+            match = re.search(r"\d{10,}", value)
+            if match:
+                return match.group(0)
+    return None
+def normalize_to_share_page(url: str) -> str:
+    """www.douyin.com 的 video/note 页面转为移动端分享页。"""
+    note = re.search(r"https?://(?:www\.)?douyin\.com/note/(\d+)", url)
+    if note:
+        return f"https://www.iesdouyin.com/share/note/{note.group(1)}/"
+    video = re.search(r"https?://(?:www\.)?douyin\.com/video/(\d+)", url)
+    if video:
+        return f"https://www.iesdouyin.com/share/video/{video.group(1)}/"
+    search_aweme_id = _extract_aweme_id_from_search_url(url)
+    if search_aweme_id:
+        return f"https://www.iesdouyin.com/share/video/{search_aweme_id}/"
+    return url
+def resolve_share_page(session: requests.Session, share_url: str) -> tuple[str, str]:
+    response = session.get(share_url, allow_redirects=True, timeout=30)
+    response.raise_for_status()
+    return str(response.url), response.text
+def extract_aweme_id(page_url: str, html: Optional[str] = None) -> str:
+    patterns = [
+        r"/video/(\d+)",
+        r"/note/(\d+)",
+        r"/share/video/(\d+)",
+        r"/share/note/(\d+)",
+        r"modal_id=(\d+)",
+        r"item_ids=(\d+)",
+        r'"aweme_id"\s*:\s*"?(\d+)"?',
+        r'"itemId"\s*:\s*"?(\d+)"?',
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, page_url)
+        if match:
+            return match.group(1)
+    if html:
+        for pattern in patterns:
+            match = re.search(pattern, html)
+            if match:
+                return match.group(1)
+    raise DouyinResolveError(f"无法从分享页解析作品 ID: {page_url}")
+def _parse_router_data(html: str) -> Optional[dict[str, Any]]:
+    match = ROUTER_DATA_RE.search(html)
+    if not match:
+        return None
+    raw = match.group(1).split("</script>")[0].rstrip().rstrip(";")
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError:
+        return None
+def _parse_render_data(html: str) -> Optional[dict[str, Any]]:
+    match = RENDER_DATA_RE.search(html)
+    if not match:
+        return None
+    try:
+        return json.loads(unquote(match.group(1)))
+    except json.JSONDecodeError:
+        return None
+def _find_item_list(obj: Any) -> list[dict[str, Any]]:
+    if isinstance(obj, dict):
+        item_list = obj.get("item_list")
+        if isinstance(item_list, list) and item_list:
+            first = item_list[0]
+            if isinstance(first, dict) and (
+                "aweme_id" in first or "awemeId" in first or "video" in first or "images" in first
+            ):
+                return item_list
+        for value in obj.values():
+            found = _find_item_list(value)
+            if found:
+                return found
+    elif isinstance(obj, list):
+        for item in obj:
+            found = _find_item_list(item)
+            if found:
+                return found
+    return []
+def _pick_url_from_image_node(image: dict[str, Any]) -> Optional[str]:
+    url_list = image.get("url_list") or []
+    if url_list:
+        return str(url_list[-1])
+    download_list = image.get("download_url_list") or []
+    if download_list:
+        return str(download_list[-1])
+    return None
+def _extract_image_urls(item: dict[str, Any]) -> list[str]:
+    urls: list[str] = []
+    seen: set[str] = set()
+    def add(url: Optional[str]) -> None:
+        if url and url not in seen:
+            seen.add(url)
+            urls.append(url)
+    for image in item.get("images") or []:
+        if isinstance(image, dict):
+            add(_pick_url_from_image_node(image))
+    post = item.get("image_post_info") or {}
+    if isinstance(post, dict):
+        for image in post.get("images") or []:
+            if isinstance(image, dict):
+                add(_pick_url_from_image_node(image))
+    return urls
+def _has_playable_video(item: dict[str, Any]) -> bool:
+    video = item.get("video") or {}
+    if not isinstance(video, dict):
+        return False
+    play_addr = video.get("play_addr") or video.get("playAddr") or {}
+    if not isinstance(play_addr, dict):
+        return False
+    return bool(play_addr.get("uri") or play_addr.get("url_list"))
+def _is_image_note(item: dict[str, Any]) -> bool:
+    aweme_type = item.get("aweme_type")
+    if aweme_type in IMAGE_AWEME_TYPES:
+        return True
+    return bool(_extract_image_urls(item)) and not _has_playable_video(item)
+def _build_no_watermark_url(play_addr: dict[str, Any]) -> str:
+    uri = play_addr.get("uri") or ""
+    url_list = play_addr.get("url_list") or []
+    if uri:
+        return f"https://aweme.snssdk.com/aweme/v1/play/?video_id={uri}&ratio=720p&line=0"
+    if url_list:
+        return str(url_list[0]).replace("playwm", "play")
+    raise DouyinResolveError("分享页内嵌数据中未找到视频播放地址")
+def _extract_tags(item: dict[str, Any]) -> list[str]:
+    tags: list[str] = []
+    seen: set[str] = set()
+    for tag in item.get("text_extra") or item.get("video_tag") or []:
+        if not isinstance(tag, dict):
+            continue
+        name = tag.get("hashtag_name") or tag.get("tag_name") or tag.get("name")
+        if name and name not in seen:
+            seen.add(name)
+            tags.append(str(name))
+    return tags
+def _duration_seconds(raw: Any) -> float:
+    try:
+        value = float(raw or 0)
+    except (TypeError, ValueError):
+        return 0
+    return value / 1000 if value > 10000 else value
+def _meta_from_aweme_item(item: dict[str, Any], source_url: str) -> DouyinContentMeta:
+    """
+    Build a DouyinContentMeta from one post entry of the share-page data.
+    Image posts yield content_type "image" with their image URLs; all other
+    posts are treated as videos and get a download URL and a cover URL.
+    :param item: the post dict taken from the share-page payload
+    :param source_url: URL stored on the result as source_url
+    :return: the metadata of the post
+    :raises DouyinResolveError: when an image post has no image URL, or the video node / play address is unusable
+    """
+    aweme_id = str(item.get("aweme_id") or item.get("awemeId") or "")
+    # Fall back to a synthetic title when the post has no description or caption
+    title = (item.get("desc") or item.get("caption") or "").strip() or f"douyin_{aweme_id}"
+    aweme_type = item.get("aweme_type")
+    tags = _extract_tags(item)
+    author = ""
+    author_info = item.get("author") or {}
+    if isinstance(author_info, dict):
+        author = author_info.get("nickname") or author_info.get("unique_id") or ""
+    duration = _duration_seconds(item.get("duration"))
+    if _is_image_note(item):
+        image_urls = _extract_image_urls(item)
+        if not image_urls:
+            raise DouyinResolveError("识别为图文，但未找到图片地址")
+        return DouyinContentMeta(
+            aweme_id=aweme_id,
+            title=title,
+            author=author,
+            source_url=source_url,
+            content_type="image",
+            aweme_type=aweme_type,
+            # The first image doubles as the cover
+            cover_url=image_urls[0],
+            image_urls=image_urls,
+            duration=duration,
+            tags=tags,
+        )
+    video = item.get("video") or {}
+    if not isinstance(video, dict):
+        raise DouyinResolveError("分享页内嵌数据中未找到视频节点")
+    play_addr = video.get("play_addr") or video.get("playAddr") or {}
+    if not isinstance(play_addr, dict):
+        raise DouyinResolveError("视频节点缺少 play_addr")
+    download_url = _build_no_watermark_url(play_addr)
+    cover_url = None
+    # Take the first cover variant, in this order, that has a non-empty url_list
+    for key in ("cover", "origin_cover", "dynamic_cover", "cover_original_scale"):
+        cover_info = video.get(key) or {}
+        if isinstance(cover_info, dict):
+            covers = cover_info.get("url_list") or []
+            if covers:
+                cover_url = str(covers[0])
+                break
+    # Override the download URL with the first bit_rate entry whose URL has no "playwm"
+    # and contains "douyinvod" or "bytecdn"
+    for bit_rate in video.get("bit_rate") or []:
+        if not isinstance(bit_rate, dict):
+            continue
+        bit_play = bit_rate.get("play_addr") or {}
+        if isinstance(bit_play, dict) and bit_play.get("url_list"):
+            candidate = str(bit_play["url_list"][0])
+            if "playwm" not in candidate and ("douyinvod" in candidate or "bytecdn" in candidate):
+                download_url = candidate
+                break
+    return DouyinContentMeta(
+        aweme_id=aweme_id,
+        title=title,
+        author=author,
+        source_url=source_url,
+        content_type="video",
+        aweme_type=aweme_type,
+        download_url=download_url,
+        cover_url=cover_url,
+        duration=duration,
+        tags=tags,
+    )
+def parse_share_page_html(html: str, page_url: str, original_share: str) -> DouyinContentMeta:
+    for parser in (_parse_router_data, _parse_render_data):
+        payload = parser(html)
+        if not payload:
+            continue
+        items = _find_item_list(payload)
+        if items:
+            meta = _meta_from_aweme_item(items[0], original_share)
+            if meta.aweme_id:
+                return meta
+            return DouyinContentMeta(
+                aweme_id=extract_aweme_id(page_url, html),
+                title=meta.title,
+                author=meta.author,
+                source_url=meta.source_url,
+                content_type=meta.content_type,
+                aweme_type=meta.aweme_type,
+                download_url=meta.download_url,
+                cover_url=meta.cover_url,
+                image_urls=meta.image_urls,
+                duration=meta.duration,
+                tags=meta.tags,
+            )
+    raise DouyinResolveError(
+        "分享页未找到内嵌公开数据（_ROUTER_DATA / RENDER_DATA）。"
+        "请确认链接有效。"
+    )
+def resolve_douyin_share(share_text: str) -> DouyinContentMeta:
+    session = _session()
+    share_url = expand_share_url(share_text)
+    fetch_url = normalize_to_share_page(share_url)
+    page_url, html = resolve_share_page(session, fetch_url)
+    return parse_share_page_html(html, page_url, share_url)
+def _download_file(url: str, dest: str) -> str:
+    os.makedirs(os.path.dirname(dest), exist_ok=True)
+    headers = {"User-Agent": SHARE_PAGE_UA, "Referer": "https://www.iesdouyin.com/"}
+    with requests.get(url, headers=headers, stream=True, timeout=120) as response:
+        response.raise_for_status()
+        with open(dest, "wb") as file:
+            for chunk in response.iter_content(chunk_size=1024 * 256):
+                if chunk:
+                    file.write(chunk)
+    return dest
+def _extract_audio(video_path: str, audio_path: str) -> None:
+    subprocess.run(
+        ["ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "libmp3lame", audio_path],
+        check=True,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+def _build_result(
+    meta: DouyinContentMeta,
+    audio_path: str,
+    video_path: Optional[str],
+) -> AudioDownloadResult:
+    return AudioDownloadResult(
+        file_path=audio_path,
+        title=meta.title,
+        duration=meta.duration,
+        cover_url=meta.cover_url,
+        platform="douyin",
+        video_id=meta.aweme_id,
+        raw_info={
+            "tags": meta.tags,
+            "author": meta.author,
+            "source_url": meta.source_url,
+            "content_type": meta.content_type,
+            "image_urls": meta.image_urls,
+        },
+        video_path=video_path,
+    )
+class DouyinDownloader(Downloader):
+    def __init__(self, cookie=None):
+        """
+        :param cookie: accepted but not used by this constructor
+        """
+        super().__init__()
+    def extract_video_id(self, url: str) -> str:
+        try:
+            return extract_aweme_id(normalize_to_share_page(expand_share_url(url)))
+        except DouyinResolveError:
+            return ""
+    def _resolve_meta(self, video_url: str) -> DouyinContentMeta:
+        try:
+            return resolve_douyin_share(video_url)
+        except DouyinResolveError:
+            raise
+        except Exception as exc:
+            raise DouyinResolveError(f"抖音分享页解析失败：{exc}") from exc
+    def download(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+        quality: DownloadQuality = "fast",
+        need_video: Optional[bool] = False,
+        skip_download: bool = False,
+    ) -> AudioDownloadResult:
+        if output_dir is None:
+            output_dir = get_data_dir()
+        if not output_dir:
+            output_dir = self.cache_data
+        os.makedirs(output_dir, exist_ok=True)
+        meta = self._resolve_meta(video_url)
+        if meta.content_type == "image":
+            return _build_result(meta, "", None)
+        video_path = os.path.join(output_dir, f"{meta.aweme_id}.mp4")
+        audio_path = os.path.join(output_dir, f"{meta.aweme_id}.mp3")
+        if skip_download:
+            return _build_result(meta, "", None)
+        if not os.path.exists(video_path):
+            _download_file(meta.download_url, video_path)
+        if not os.path.exists(audio_path):
+            try:
+                _extract_audio(video_path, audio_path)
+            except subprocess.CalledProcessError as exc:
+                raise RuntimeError("ffmpeg 转换 MP3 失败") from exc
+        return _build_result(
+            meta,
+            audio_path,
+            video_path if need_video or os.path.exists(video_path) else None,
+        )
+    def download_video(self, video_url: str, output_dir: Union[str, None] = None) -> str:
+        if output_dir is None:
+            output_dir = get_data_dir()
+        if not output_dir:
+            output_dir = self.cache_data
+        os.makedirs(output_dir, exist_ok=True)
+        meta = self._resolve_meta(video_url)
+        if meta.content_type == "image":
+            raise DouyinResolveError("抖音图文内容没有可下载的视频文件")
+        video_path = os.path.join(output_dir, f"{meta.aweme_id}.mp4")
+        if not os.path.exists(video_path):
+            _download_file(meta.download_url, video_path)
+        return video_path
+    def download_subtitles(
+        self,
+        video_url: str,
+        output_dir: str = None,
+        langs: list = None,
+    ) -> Optional[TranscriptResult]:
+        meta = self._resolve_meta(video_url)
+        if meta.content_type != "image" or not meta.title:
+            return None
+        return TranscriptResult(
+            language="zh",
+            full_text=meta.title,
+            segments=[
+                TranscriptSegment(
+                    start=0,
+                    end=meta.duration or 0,
+                    text=meta.title,
+                )
+            ],
+        )

backend/app/downloaders/douyin_helper/abogus.py ADDED Viewed

	@@ -0,0 +1,635 @@

+"""
+Original Author:
+This file is from https://github.com/JoeanAmier/TikTokDownloader
+And is licensed under the GNU General Public License v3.0
+If you use this code, please keep this license and the original author information.
+Modified by:
+And this file is now a part of the https://github.com/Evil0ctal/Douyin_TikTok_Download_API open-source project.
+This project is licensed under the Apache License 2.0, and the original author information is kept.
+Purpose:
+This file is used to generate the `a_bogus` parameter for the Douyin Web API.
+Changes Made:
+1. Changed the ua_code to be compatible with the User-Agent string in the current config file: https://github.com/Evil0ctal/Douyin_TikTok_Download_API/blob/main/crawlers/douyin/web/config.yaml
+"""
+from random import choice
+from random import randint
+from random import random
+from re import compile
+from time import time
+from urllib.parse import urlencode
+from urllib.parse import quote
+from gmssl import sm3, func
+__all__ = ["ABogus", ]
+class ABogus:
+    """Builds the ``a_bogus`` signature parameter for Douyin Web API requests.
+    ``get_value`` is the entry point: it concatenates a short random prefix
+    (``generate_string_1``) with an RC4-encrypted payload derived from the
+    query string, HTTP method, timestamps and browser fingerprint
+    (``generate_string_2``), then encodes the result with a custom
+    Base64-style alphabet (``generate_result``).
+    The class also carries a hand-rolled hash (``write``/``fill``/``compress``/
+    ``sum``) whose constants match SM3; the active code path hashes through
+    ``gmssl`` in ``sm3_to_array`` instead.
+    """
+    # Matches %XX percent-escapes (uppercase hex digits only); see decode_string.
+    __filter = compile(r'%([0-9A-F]{2})')
+    # Inputs consumed by generate_args_code.
+    __arguments = [0, 1, 14]
+    # NOTE(review): not referenced anywhere else in this class.
+    __ua_key = "\u0000\u0001\u000e"
+    # Suffix appended to the params/method strings before they are hashed.
+    __end_string = "cus"
+    # NOTE(review): not referenced anywhere else in this class.
+    __version = [1, 0, 1, 5]
+    # Default browser fingerprint, used when __init__ receives no platform.
+    __browser = "1536|742|1536|864|0|0|0|0|1536|864|1536|864|1536|742|24|24|MacIntel"
+    # Initial register state for compress(); these eight words equal the SM3 IV.
+    __reg = [
+        1937774191,
+        1226093241,
+        388252375,
+        3666478592,
+        2842636476,
+        372324522,
+        3817729613,
+        2969243214,
+    ]
+    # Output alphabets selectable in generate_result*; "s0" is the standard
+    # Base64 alphabet, the others are shuffled variants.
+    __str = {
+        "s0": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
+        "s1": "Dkdpgh4ZKsQB80/Mfvw36XI1R25+WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
+        "s2": "Dkdpgh4ZKsQB80/Mfvw36XI1R25-WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
+        "s3": "ckdp1h4ZKsUB80/Mfvw36XIgR25+WQAlEi7NLboqYTOPuzmFjJnryx9HVGDaStCe",
+        "s4": "Dkdpgh2ZmsQB80/MfvV36XI1R45-WUAlEixNLwoqYTOPuzKFjJnry79HbGcaStCe",
+    }
+    def __init__(self,
+                 # user_agent: str = USERAGENT,
+                 platform: str = None, ):
+        """Set up hash state, the fixed UA code and the browser fingerprint.
+        Args:
+            platform: when given, a randomized fingerprint is generated for
+                that platform string; otherwise the class default is used.
+        """
+        self.chunk = []
+        self.size = 0
+        self.reg = self.__reg[:]
+        # The UA code is hard-coded rather than derived at runtime; per the
+        # note below it corresponds to this User-Agent string:
+        # self.ua_code = self.generate_ua_code(user_agent)
+        # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36
+        self.ua_code = [
+            76,
+            98,
+            15,
+            131,
+            97,
+            245,
+            224,
+            133,
+            122,
+            199,
+            241,
+            166,
+            79,
+            34,
+            90,
+            191,
+            128,
+            126,
+            122,
+            98,
+            66,
+            11,
+            14,
+            40,
+            49,
+            110,
+            110,
+            173,
+            67,
+            96,
+            138,
+            252]
+        self.browser = self.generate_browser_info(
+            platform) if platform else self.__browser
+        self.browser_len = len(self.browser)
+        self.browser_code = self.char_code_at(self.browser)
+    @classmethod
+    def list_1(cls, random_num=None, a=170, b=85, c=45, ) -> list:
+        """Return four bytes from ``random_list`` with OR-constants (1, 2, 5, c & a)."""
+        return cls.random_list(
+            random_num,
+            a,
+            b,
+            1,
+            2,
+            5,
+            c & a,
+        )
+    @classmethod
+    def list_2(cls, random_num=None, a=170, b=85, ) -> list:
+        """Return four bytes from ``random_list`` with OR-constants (1, 0, 0, 0)."""
+        return cls.random_list(
+            random_num,
+            a,
+            b,
+            1,
+            0,
+            0,
+            0,
+        )
+    @classmethod
+    def list_3(cls, random_num=None, a=170, b=85, ) -> list:
+        """Return four bytes from ``random_list`` with OR-constants (1, 0, 5, 0)."""
+        return cls.random_list(
+            random_num,
+            a,
+            b,
+            1,
+            0,
+            5,
+            0,
+        )
+    @staticmethod
+    def random_list(
+            a: float = None,
+            b=170,
+            c=85,
+            d=0,
+            e=0,
+            f=0,
+            g=0,
+    ) -> list:
+        """Derive four masked values from ``a`` (or a fresh random number).
+        The integer part of the number is split into its low byte and the
+        remaining high bits; each part is masked with ``b`` and with ``c`` and
+        OR-ed with one of ``d``/``e``/``f``/``g``. A falsy ``a`` (``None`` or
+        0) selects a random value in [0, 10000).
+        """
+        r = a or (random() * 10000)
+        v = [
+            r,
+            int(r) & 255,
+            int(r) >> 8,
+        ]
+        # "&" binds tighter than "|": each line is (part & mask) | constant.
+        s = v[1] & b | d
+        v.append(s)
+        s = v[1] & c | e
+        v.append(s)
+        s = v[2] & b | f
+        v.append(s)
+        s = v[2] & c | g
+        v.append(s)
+        # Only the four derived values are returned.
+        return v[-4:]
+    @staticmethod
+    def from_char_code(*args):
+        """Join the characters whose code points are given as arguments."""
+        return "".join(chr(code) for code in args)
+    @classmethod
+    def generate_string_1(
+            cls,
+            random_num_1=None,
+            random_num_2=None,
+            random_num_3=None,
+    ):
+        """Build the 12-character prefix from list_1, list_2 and list_3."""
+        return cls.from_char_code(*cls.list_1(random_num_1)) + cls.from_char_code(
+            *cls.list_2(random_num_2)) + cls.from_char_code(*cls.list_3(random_num_3))
+    def generate_string_2(
+            self,
+            url_params: str,
+            method="GET",
+            start_time=0,
+            end_time=0,
+    ) -> str:
+        """Build the RC4-encrypted payload part of the signature."""
+        a = self.generate_string_2_list(
+            url_params,
+            method,
+            start_time,
+            end_time,
+        )
+        # The XOR checksum covers only the template list; it is computed
+        # before the browser codes are appended and is stored after them.
+        e = self.end_check_num(a)
+        a.extend(self.browser_code)
+        a.append(e)
+        return self.rc4_encrypt(self.from_char_code(*a), "y")
+    def generate_string_2_list(
+            self,
+            url_params: str,
+            method="GET",
+            start_time=0,
+            end_time=0,
+    ) -> list:
+        """Assemble the template list from timestamps, digests and UA code.
+        Falsy timestamps are replaced: ``start_time`` by the current time in
+        milliseconds, ``end_time`` by ``start_time`` plus 4 to 8 ms.
+        """
+        start_time = start_time or int(time() * 1000)
+        end_time = end_time or (start_time + randint(4, 8))
+        params_array = self.generate_params_code(url_params)
+        method_array = self.generate_method_code(method)
+        return self.list_4(
+            (end_time >> 24) & 255,
+            params_array[21],
+            self.ua_code[23],
+            (end_time >> 16) & 255,
+            params_array[22],
+            self.ua_code[24],
+            (end_time >> 8) & 255,
+            (end_time >> 0) & 255,
+            (start_time >> 24) & 255,
+            (start_time >> 16) & 255,
+            (start_time >> 8) & 255,
+            (start_time >> 0) & 255,
+            method_array[21],
+            method_array[22],
+            # Timestamp bits above the low 32 (division by 2**32).
+            int(end_time / 256 / 256 / 256 / 256) >> 0,
+            int(start_time / 256 / 256 / 256 / 256) >> 0,
+            self.browser_len,
+        )
+    @staticmethod
+    def reg_to_array(a):
+        """Serialize eight 32-bit words into 32 bytes, most significant byte first."""
+        o = [0] * 32
+        for i in range(8):
+            c = a[i]
+            o[4 * i + 3] = (255 & c)
+            c >>= 8
+            o[4 * i + 2] = (255 & c)
+            c >>= 8
+            o[4 * i + 1] = (255 & c)
+            c >>= 8
+            o[4 * i] = (255 & c)
+        return o
+    def compress(self, a):
+        """Mix one 64-byte block ``a`` into ``self.reg`` (64 rounds).
+        The round structure and constants follow the SM3 compression function.
+        """
+        f = self.generate_f(a)
+        i = self.reg[:]
+        for o in range(64):
+            c = self.de(i[0], 12) + i[4] + self.de(self.pe(o), o)
+            c = (c & 0xFFFFFFFF)
+            c = self.de(c, 7)
+            s = (c ^ self.de(i[0], 12)) & 0xFFFFFFFF
+            u = self.he(o, i[0], i[1], i[2])
+            # f[o + 68] is the second expanded-word table, f[o] the first.
+            u = (u + i[3] + s + f[o + 68]) & 0xFFFFFFFF
+            b = self.ve(o, i[4], i[5], i[6])
+            b = (b + i[7] + c + f[o]) & 0xFFFFFFFF
+            i[3] = i[2]
+            i[2] = self.de(i[1], 9)
+            i[1] = i[0]
+            i[0] = u
+            i[7] = i[6]
+            i[6] = self.de(i[5], 19)
+            i[5] = i[4]
+            i[4] = (b ^ self.de(b, 9) ^ self.de(b, 17)) & 0xFFFFFFFF
+        # Feed-forward: XOR the working state back into the registers.
+        for l in range(8):
+            self.reg[l] = (self.reg[l] ^ i[l]) & 0xFFFFFFFF
+    @classmethod
+    def generate_f(cls, e):
+        """Expand a 64-byte block into 132 words used by ``compress``.
+        Indices 0-67 hold the expanded message words; indices 68-131 hold
+        ``r[j] ^ r[j + 4]`` for j in 0..63.
+        """
+        r = [0] * 132
+        # Load sixteen big-endian 32-bit words from the block.
+        for t in range(16):
+            r[t] = (e[4 * t] << 24) | (e[4 * t + 1] <<
+                                       16) | (e[4 * t + 2] << 8) | e[4 * t + 3]
+            r[t] &= 0xFFFFFFFF
+        for n in range(16, 68):
+            a = r[n - 16] ^ r[n - 9] ^ cls.de(r[n - 3], 15)
+            a = a ^ cls.de(a, 15) ^ cls.de(a, 23)
+            r[n] = (a ^ cls.de(r[n - 13], 7) ^ r[n - 6]) & 0xFFFFFFFF
+        for n in range(68, 132):
+            r[n] = (r[n - 68] ^ r[n - 64]) & 0xFFFFFFFF
+        return r
+    @staticmethod
+    def pad_array(arr, length=60):
+        """Append zeros to ``arr`` in place until it has ``length`` items; return it."""
+        while len(arr) < length:
+            arr.append(0)
+        return arr
+    def fill(self, length=60):
+        """Pad ``self.chunk``: 0x80 marker, zeros up to ``length``, then the
+        message bit length as four big-endian bytes."""
+        size = 8 * self.size
+        self.chunk.append(128)
+        self.chunk = self.pad_array(self.chunk, length)
+        for i in range(4):
+            self.chunk.append((size >> 8 * (3 - i)) & 255)
+    @staticmethod
+    def list_4(
+            a: int,
+            b: int,
+            c: int,
+            d: int,
+            e: int,
+            f: int,
+            g: int,
+            h: int,
+            i: int,
+            j: int,
+            k: int,
+            m: int,
+            n: int,
+            o: int,
+            p: int,
+            q: int,
+            r: int,
+    ) -> list:
+        """Place the 17 inputs into a fixed 44-element template of constants."""
+        return [
+            44,
+            a,
+            0,
+            0,
+            0,
+            0,
+            24,
+            b,
+            n,
+            0,
+            c,
+            d,
+            0,
+            0,
+            0,
+            1,
+            0,
+            239,
+            e,
+            o,
+            f,
+            g,
+            0,
+            0,
+            0,
+            0,
+            h,
+            0,
+            0,
+            14,
+            i,
+            j,
+            0,
+            k,
+            m,
+            3,
+            p,
+            1,
+            q,
+            1,
+            r,
+            0,
+            0,
+            0]
+    @staticmethod
+    def end_check_num(a: list):
+        """Return the XOR of all items in ``a``."""
+        r = 0
+        for i in a:
+            r ^= i
+        return r
+    @classmethod
+    def decode_string(cls, url_string, ):
+        """Replace every ``%XX`` escape (uppercase hex) with its character."""
+        decoded = cls.__filter.sub(cls.replace_func, url_string)
+        return decoded
+    @staticmethod
+    def replace_func(match):
+        """Regex callback: turn the captured two hex digits into a character."""
+        return chr(int(match.group(1), 16))
+    @staticmethod
+    def de(e, r):
+        """Rotate the 32-bit value ``e`` left by ``r`` bits (``r`` taken mod 32)."""
+        r %= 32
+        return ((e << r) & 0xFFFFFFFF) | (e >> (32 - r))
+    @staticmethod
+    def pe(e):
+        """Return the round constant: one value for rounds 0-15, another afterwards."""
+        return 2043430169 if 0 <= e < 16 else 2055708042
+    @staticmethod
+    def he(e, r, t, n):
+        """Round-dependent boolean function: XOR for rounds 0-15, majority for 16-63.
+        Raises:
+            ValueError: if ``e`` is outside 0..63.
+        """
+        if 0 <= e < 16:
+            return (r ^ t ^ n) & 0xFFFFFFFF
+        elif 16 <= e < 64:
+            return (r & t | r & n | t & n) & 0xFFFFFFFF
+        raise ValueError
+    @staticmethod
+    def ve(e, r, t, n):
+        """Round-dependent boolean function: XOR for rounds 0-15, choose for 16-63.
+        Raises:
+            ValueError: if ``e`` is outside 0..63.
+        """
+        if 0 <= e < 16:
+            return (r ^ t ^ n) & 0xFFFFFFFF
+        elif 16 <= e < 64:
+            return (r & t | ~r & n) & 0xFFFFFFFF
+        raise ValueError
+    @staticmethod
+    def convert_to_char_code(a):
+        """Return the code point of each character in ``a``."""
+        d = []
+        for i in a:
+            d.append(ord(i))
+        return d
+    @staticmethod
+    def split_array(arr, chunk_size=64):
+        """Split ``arr`` into consecutive slices of at most ``chunk_size`` items."""
+        result = []
+        for i in range(0, len(arr), chunk_size):
+            result.append(arr[i:i + chunk_size])
+        return result
+    @staticmethod
+    def char_code_at(s):
+        """Return the code point of each character in ``s``."""
+        return [ord(char) for char in s]
+    def write(self, e, ):
+        """Absorb input ``e`` (str or list of ints) into the hash state.
+        All complete leading 64-item blocks are compressed immediately; the
+        final block is kept in ``self.chunk`` for ``fill``.
+        """
+        # Note: the size is recorded before any percent-decoding below.
+        self.size = len(e)
+        if isinstance(e, str):
+            e = self.decode_string(e)
+            e = self.char_code_at(e)
+        if len(e) <= 64:
+            self.chunk = e
+        else:
+            chunks = self.split_array(e, 64)
+            for i in chunks[:-1]:
+                self.compress(i)
+            self.chunk = chunks[-1]
+    def reset(self, ):
+        """Restore the hash state to its initial values."""
+        self.chunk = []
+        self.size = 0
+        self.reg = self.__reg[:]
+    def sum(self, e, length=60):
+        """Hash ``e`` with the hand-rolled implementation and return 32 bytes."""
+        self.reset()
+        self.write(e)
+        self.fill(length)
+        self.compress(self.chunk)
+        return self.reg_to_array(self.reg)
+    @classmethod
+    def generate_result_unit(cls, n, s):
+        """Encode the 24-bit value ``n`` as four characters of alphabet ``s``."""
+        r = ""
+        # Shifts 18, 12, 6, 0 paired with the matching 6-bit masks.
+        for i, j in zip(range(18, -1, -6), (16515072, 258048, 4032, 63)):
+            r += cls.__str[s][(n & j) >> i]
+        return r
+    @classmethod
+    def generate_result_end(cls, s, e="s4"):
+        """Encode the single character at index 120 of ``s`` plus ``==`` padding."""
+        r = ""
+        b = ord(s[120]) << 16
+        r += cls.__str[e][(b & 16515072) >> 18]
+        r += cls.__str[e][(b & 258048) >> 12]
+        r += "=="
+        return r
+    @classmethod
+    def generate_result(cls, s, e="s4"):
+        """Base64-style encode string ``s`` using alphabet ``e``.
+        Input is consumed three characters at a time; a trailing group of one
+        or two characters yields two or three output characters followed by
+        ``=`` padding up to a multiple of four.
+        """
+        # Earlier implementation, kept for reference:
+        # r = ""
+        # for i in range(len(s)//4):
+        #     b = ((ord(s[i * 3]) << 16) | (ord(s[i * 3 + 1]))
+        #          << 8) | ord(s[i * 3 + 2])
+        #     r += cls.generate_result_unit(b, e)
+        # return r
+        r = []
+        for i in range(0, len(s), 3):
+            if i + 2 < len(s):
+                n = (
+                    (ord(s[i]) << 16)
+                    | (ord(s[i + 1]) << 8)
+                    | ord(s[i + 2])
+                )
+            elif i + 1 < len(s):
+                n = (ord(s[i]) << 16) | (
+                    ord(s[i + 1]) << 8
+                )
+            else:
+                n = ord(s[i]) << 16
+            for j, k in zip(range(18, -1, -6),
+                            (0xFC0000, 0x03F000, 0x0FC0, 0x3F)):
+                # Stop early for a short final group: no third/fourth symbol.
+                if j == 6 and i + 1 >= len(s):
+                    break
+                if j == 0 and i + 2 >= len(s):
+                    break
+                r.append(cls.__str[e][(n & k) >> j])
+        r.append("=" * ((4 - len(r) % 4) % 4))
+        return "".join(r)
+    @classmethod
+    def generate_args_code(cls):
+        """Derive a 12-byte list from ``__arguments``.
+        NOTE(review): not called anywhere else in this class.
+        """
+        a = []
+        for j in range(24, -1, -8):
+            a.append(cls.__arguments[0] >> j)
+        a.append(cls.__arguments[1] / 256)
+        a.append(cls.__arguments[1] % 256)
+        a.append(cls.__arguments[1] >> 24)
+        a.append(cls.__arguments[1] >> 16)
+        for j in range(24, -1, -8):
+            a.append(cls.__arguments[2] >> j)
+        # Truncate floats and keep the low byte of every entry.
+        return [int(i) & 255 for i in a]
+    def generate_method_code(self, method: str = "GET") -> list[int]:
+        """Return the double SM3 digest of ``method`` plus the end string."""
+        return self.sm3_to_array(self.sm3_to_array(method + self.__end_string))
+        # return self.sum(self.sum(method + self.__end_string))
+    def generate_params_code(self, params: str) -> list[int]:
+        """Return the double SM3 digest of ``params`` plus the end string."""
+        return self.sm3_to_array(self.sm3_to_array(params + self.__end_string))
+        # return self.sum(self.sum(params + self.__end_string))
+    @classmethod
+    def sm3_to_array(cls, data: str | list) -> list[int]:
+        """
+        Code reference: https://github.com/Johnserf-Seed/f2/blob/main/f2/utils/abogus.py
+        Calculate the SM3 hash value of the request body and convert the result to an array of integers
+        Args:
+            data (Union[str, List[int]]): Input data.
+        Returns:
+            List[int]: Array of integers representing the hash value.
+        """
+        if isinstance(data, str):
+            b = data.encode("utf-8")
+        else:
+            b = bytes(data)  # convert List[int] to bytes
+        # Convert the bytes to the list format that sm3.sm3_hash expects
+        h = sm3.sm3_hash(func.bytes_to_list(b))
+        # Convert the hexadecimal digest string to a list of byte values
+        return [int(h[i: i + 2], 16) for i in range(0, len(h), 2)]
+    @classmethod
+    def generate_browser_info(cls, platform: str = "Win32") -> str:
+        """Return a randomized ``|``-separated browser fingerprint string.
+        Inner sizes are drawn from 1280-1920 x 720-1080; outer sizes are at
+        least as large as the inner ones; ``platform`` is the last field.
+        """
+        inner_width = randint(1280, 1920)
+        inner_height = randint(720, 1080)
+        outer_width = randint(inner_width, 1920)
+        outer_height = randint(inner_height, 1080)
+        screen_x = 0
+        screen_y = choice((0, 30))
+        value_list = [
+            inner_width,
+            inner_height,
+            outer_width,
+            outer_height,
+            screen_x,
+            screen_y,
+            0,
+            0,
+            outer_width,
+            outer_height,
+            outer_width,
+            outer_height,
+            inner_width,
+            inner_height,
+            24,
+            24,
+            platform,
+        ]
+        return "|".join(str(i) for i in value_list)
+    @staticmethod
+    def rc4_encrypt(plaintext, key):
+        """RC4-encrypt ``plaintext`` with ``key``; both are strings of code points.
+        Returns a string whose characters carry the cipher byte values.
+        """
+        # Key scheduling.
+        s = list(range(256))
+        j = 0
+        for i in range(256):
+            j = (j + s[i] + ord(key[i % len(key)])) % 256
+            s[i], s[j] = s[j], s[i]
+        # Keystream generation and XOR.
+        i = 0
+        j = 0
+        cipher = []
+        for k in range(len(plaintext)):
+            i = (i + 1) % 256
+            j = (j + s[i]) % 256
+            s[i], s[j] = s[j], s[i]
+            t = (s[i] + s[j]) % 256
+            cipher.append(chr(s[t] ^ ord(plaintext[k])))
+        return ''.join(cipher)
+    def get_value(self,
+                  url_params: dict | str,
+                  method="GET",
+                  start_time=0,
+                  end_time=0,
+                  random_num_1=None,
+                  random_num_2=None,
+                  random_num_3=None,
+                  ) -> str:
+        """Compute the ``a_bogus`` value for a request.
+        Args:
+            url_params: query parameters, as a dict (url-encoded here) or an
+                already encoded string.
+            method: HTTP method name.
+            start_time, end_time: millisecond timestamps; falsy values are
+                filled in by ``generate_string_2_list``.
+            random_num_1..3: optional fixed seeds for the random prefix.
+        Returns:
+            The signature string, encoded with the "s4" alphabet.
+        """
+        string_1 = self.generate_string_1(
+            random_num_1,
+            random_num_2,
+            random_num_3,
+        )
+        string_2 = self.generate_string_2(urlencode(url_params) if isinstance(
+            url_params, dict) else url_params, method, start_time, end_time, )
+        string = string_1 + string_2
+        # return self.generate_result(
+        #     string, "s4") + self.generate_result_end(string, "s4")
+        return self.generate_result(string, "s4")
+if __name__ == "__main__":
+    bogus = ABogus()
+    USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
+    url_str = "https://www.douyin.com/aweme/v1/web/aweme/detail/?device_platform=webapp&aid=6383&channel=channel_pc_web&pc_client_type=1&version_code=190500&version_name=19.5.0&cookie_enabled=true&browser_language=zh-CN&browser_platform=Win32&browser_name=Firefox&browser_online=true&engine_name=Gecko&os_name=Windows&os_version=10&platform=PC&screen_width=1920&screen_height=1080&browser_version=124.0&engine_version=122.0.0.0&cpu_core_num=12&device_memory=8&aweme_id=7345492945006595379"
+    # 将url参数转换为字典
+    url_params = dict([param.split("=")
+                      for param in url_str.split("?")[1].split("&")])
+    print(f"URL参数: {url_params}")
+    a_bogus = bogus.get_value(url_params, )
+    # 使用url编码a_bogus
+    a_bogus = quote(a_bogus, safe='')
+    print(a_bogus)
+    print(USERAGENT)

backend/app/downloaders/generic_downloader.py ADDED Viewed

	@@ -0,0 +1,128 @@

+"""通用 yt-dlp 下载器：用于用户在「下载配置」里登记的自定义平台。
+不做任何站点特定逻辑——完全依赖 yt-dlp 内置 extractor。只把：
+  - 该平台的 Cookie/cookies-from-browser 注入 ydl_opts
+  - 全局代理注入 ydl_opts
+"""
+import logging
+import os
+import tempfile
+from abc import ABC
+from typing import Optional, Union
+import yt_dlp
+from app.downloaders.base import Downloader, DownloadQuality
+from app.models.notes_model import AudioDownloadResult
+from app.services.cookie_manager import CookieConfigManager
+from app.services.proxy_config_manager import ProxyConfigManager
+from app.utils.path_helper import get_data_dir
+logger = logging.getLogger(__name__)
+class GenericYtdlpDownloader(Downloader, ABC):
+    """对任意 yt-dlp 支持站点的薄封装。按平台 key 读取 cookie 配置。"""
+    def __init__(self, platform: str, cookie_domain: Optional[str] = None):
+        super().__init__()
+        self.platform = platform
+        # cookie 文件里 Netscape 格式需要 domain；不知道就用通用 . 让 yt-dlp 自己挑
+        self.cookie_domain = cookie_domain or f".{platform}.com"
+        mgr = CookieConfigManager()
+        self._cookie = mgr.get(platform)
+        self._browser = mgr.get_browser(platform)
+        self._cookiefile = None if self._browser else self._write_netscape_cookie_file()
+    def _write_netscape_cookie_file(self) -> Optional[str]:
+        if not self._cookie:
+            return None
+        lines = ["# Netscape HTTP Cookie File\n"]
+        for pair in self._cookie.split("; "):
+            if "=" in pair:
+                k, v = pair.split("=", 1)
+                lines.append(f"{self.cookie_domain}\tTRUE\t/\tFALSE\t0\t{k}\t{v}\n")
+        tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8')
+        tmp.writelines(lines)
+        tmp.close()
+        logger.info("已生成 [%s] Netscape Cookie 文件: %s", self.platform, tmp.name)
+        return tmp.name
+    def _apply_ydl_extras(self, ydl_opts: dict) -> None:
+        proxy = ProxyConfigManager().get_proxy_url()
+        if proxy:
+            ydl_opts['proxy'] = proxy
+        if self._browser:
+            ydl_opts['cookiesfrombrowser'] = (self._browser,)
+        elif self._cookiefile:
+            ydl_opts['cookiefile'] = self._cookiefile
+    def download(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+        quality: DownloadQuality = "fast",
+        need_video: Optional[bool] = False,
+        skip_download: bool = False,
+    ) -> AudioDownloadResult:
+        if output_dir is None:
+            output_dir = get_data_dir()
+        if not output_dir:
+            output_dir = self.cache_data
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
+        ydl_opts = {
+            'format': 'bestaudio/best',
+            'outtmpl': output_path,
+            'noplaylist': True,
+            'quiet': False,
+        }
+        if skip_download:
+            ydl_opts['skip_download'] = True
+        self._apply_ydl_extras(ydl_opts)
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(video_url, download=not skip_download)
+            video_id = info.get("id") or "unknown"
+            title = info.get("title") or self.platform
+            duration = info.get("duration", 0)
+            cover_url = info.get("thumbnail")
+            ext = info.get("ext", "mp3")
+            audio_path = os.path.join(output_dir, f"{video_id}.{ext}")
+        return AudioDownloadResult(
+            file_path=audio_path,
+            title=title,
+            duration=duration,
+            cover_url=cover_url,
+            platform=self.platform,
+            video_id=video_id,
+            raw_info={'tags': info.get('tags')},
+            video_path=None,
+        )
+    def download_video(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+    ) -> str:
+        if output_dir is None:
+            output_dir = get_data_dir()
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
+        ydl_opts = {
+            'format': 'bestvideo+bestaudio/best',
+            'outtmpl': output_path,
+            'noplaylist': True,
+            'quiet': False,
+            'merge_output_format': 'mp4',
+        }
+        self._apply_ydl_extras(ydl_opts)
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(video_url, download=True)
+            video_id = info.get("id")
+            video_path = os.path.join(output_dir, f"{video_id}.mp4")
+        if not os.path.exists(video_path):
+            raise FileNotFoundError(f"视频文件未找到: {video_path}")
+        return video_path

backend/app/downloaders/kuaishou_downloader.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import os
+import subprocess
+from abc import ABC
+from typing import Union, Optional
+import requests
+from app.downloaders.base import Downloader
+from app.downloaders.kuaishou_helper.kuaishou import KuaiShou
+from app.enmus.note_enums import DownloadQuality
+from app.models.audio_model import AudioDownloadResult
+from app.utils.path_helper import get_data_dir
+class KuaiShouDownloader(Downloader, ABC):
+    def __init__(self):
+        super().__init__()
+    def download(
+            self,
+            video_url: str,
+            output_dir: Union[str, None] = None,
+            quality: str = "fast",
+            need_video: Optional[bool] = False
+    ) -> AudioDownloadResult:
+        if output_dir is None:
+            output_dir = get_data_dir()
+        if not output_dir:
+            output_dir = self.cache_data
+        os.makedirs(output_dir, exist_ok=True)
+        ks = KuaiShou()
+        video_raw_info = ks.run(video_url)
+        print(video_raw_info)
+        photo_info = video_raw_info['visionVideoDetail']['photo']
+        video_id = photo_info['id']
+        title = photo_info['caption'].strip().replace('\n', '').replace(' ', '_')[:50]
+        mp4_path = os.path.join(output_dir, f"{video_id}.mp4")
+        mp3_path = os.path.join(output_dir, f"{video_id}.mp3")
+        if os.path.exists(mp3_path):
+            print(f"[已存在] 跳过下载: {mp3_path}")
+            return AudioDownloadResult(
+                file_path=mp3_path,
+                title=title,
+                duration=photo_info['duration'],
+                cover_url=photo_info['coverUrl'],
+                platform="kuaishou",
+                video_id=video_id,
+                raw_info={
+                    'tags': ','.join(tag['name'] for tag in video_raw_info.get('tags', []) if tag.get('name'))
+                },
+                video_path=mp4_path
+            )
+        # 下载 mp4 视频
+        resp = requests.get(photo_info['photoUrl'], stream=True)
+        if resp.status_code == 200:
+            with open(mp4_path, "wb") as f:
+                for chunk in resp.iter_content(1024 * 1024):
+                    f.write(chunk)
+        else:
+            raise Exception(f"视频下载失败: {resp.status_code}")
+        # 使用 ffmpeg 转换为 mp3
+        try:
+            subprocess.run([
+                "ffmpeg", "-y", "-i", mp4_path, "-vn", "-acodec", "libmp3lame", mp3_path
+            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        except subprocess.CalledProcessError:
+            raise Exception("ffmpeg 转换 MP3 失败")
+        return AudioDownloadResult(
+            file_path=mp3_path,
+            title=photo_info['caption'],
+            duration=photo_info['duration'],
+            cover_url=photo_info['coverUrl'],
+            platform="kuaishou",
+            video_id=video_id,
+            raw_info={
+                'tags': ','.join(tag['name'] for tag in video_raw_info.get('tags', []) if tag.get('name'))
+            },
+            video_path=mp4_path
+        )
+    def download_video(
+            self,
+            video_url: str,
+            output_dir: Union[str, None] = None,
+    ) -> str:
+        print('self.download(video_url, output_dir).video_path',self.download(video_url, output_dir).video_path)
+        return self.download(video_url, output_dir).video_path
+if __name__ == '__main__':
+    ks = KuaiShouDownloader()
+    ks.download('https://v.kuaishou.com/2vBqX74 王宝强携手刘昊然、岳云鹏上演精彩名场面 全程高能 看一遍笑一遍 "唐探1900 "快成长计划 ...更多')

backend/app/downloaders/kuaishou_helper/__init__.py ADDED Viewed

File without changes

backend/app/downloaders/kuaishou_helper/kuaishou.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import logging
+import os
+import re
+import requests
+from dotenv import load_dotenv
+from app.services.cookie_manager import CookieConfigManager
+from app.utils.logger import get_logger
+# GraphQL endpoint that serves the video detail query.
+KUAISHOU_API_BASE = 'https://www.kuaishou.com/graphql'
+# Site home page; requested to obtain cookies when none are configured.
+KUAISHOU_URL = "https://www.kuaishou.com/"
+# Runs at import time: loads variables from a .env file into the environment.
+load_dotenv()
+# Browser-like default request headers; each KuaiShou instance works on a copy.
+headers = {
+    'Accept-Language': 'zh-CN,zh;q=0.9',
+    'Cache-Control': 'no-cache',
+    'Connection': 'keep-alive',
+    # 'Cookie': 'did=web_9e8cfa4403000587b9e7d67233e6b04c; didv=1719811812378; kpf=PC_WEB; clientid=3; kpn=KUAISHOU_VISION',
+    'Origin': 'https://www.kuaishou.com',
+    'Pragma': 'no-cache',
+    'Referer': 'https://www.kuaishou.com/',
+    'Sec-Fetch-Dest': 'empty',
+    'Sec-Fetch-Mode': 'cors',
+    'Sec-Fetch-Site': 'same-origin',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
+    'accept': '*/*',
+    'content-type': 'application/json',
+    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"',
+    # The Cookie header is not set here; KuaiShou.run() adds it per instance.
+    # 'Cookie':cookies.strip()
+}
+logger = get_logger(__name__)
+# Shared cookie configuration manager, created once at import time.
+cfm=CookieConfigManager()
+class KuaiShou:
+    def __init__(self):
+        self.header = headers.copy()
+        self.cookie = None
+    @staticmethod
+    def _extract_kuaishou_link(text):
+        url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
+        return url[0]
+    def get_photo_id(self, url):
+        response = requests.get(url, allow_redirects=True, headers=self.header)
+        real_url = response.url
+        # 提取short—video/后面的id
+        pattern = re.compile(r'short-video/(\w+)')
+        match = pattern.search(real_url)
+        return match.group().split('/')[1]
+    def get_temp_cookies(self):
+        is_exist = cfm.get('kuaishou')
+        print(is_exist)
+        if is_exist:
+            return is_exist
+        res = requests.get(url=KUAISHOU_URL, headers=self.header, allow_redirects=True)
+        cookie_string = '; '.join([f"{k}={v}" for k, v in res.cookies.get_dict().items()])
+        return cookie_string
+    def get_video_details(self, url, photo_id):
+        json_data = {
+            'operationName': 'visionVideoDetail',
+            "variables": {"photoId": photo_id, "page": "detail"},
+            "query": "query visionVideoDetail($photoId: String, $type: String, $page: String, $webPageArea: String) {\n  visionVideoDetail(photoId: $photoId, type: $type, page: $page, webPageArea: $webPageArea) {\n    status\n    type\n    author {\n      id\n      name\n      following\n      headerUrl\n      __typename\n    }\n    photo {\n      id\n      duration\n      caption\n      likeCount\n      realLikeCount\n      coverUrl\n      photoUrl\n      liked\n      timestamp\n      expTag\n      llsid\n      viewCount\n      videoRatio\n      stereoType\n      croppedPhotoUrl\n      manifest {\n        mediaType\n        businessType\n        version\n        adaptationSet {\n          id\n          duration\n          representation {\n            id\n            defaultSelect\n            backupUrl\n            codecs\n            url\n            height\n            width\n            avgBitrate\n            maxBitrate\n            m3u8Slice\n            qualityType\n            qualityLabel\n            frameRate\n            featureP2sp\n            hidden\n            disableAdaptive\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    tags {\n      type\n      name\n      __typename\n    }\n    commentLimit {\n      canAddComment\n      __typename\n    }\n    llsid\n    danmakuSwitch\n    __typename\n  }\n}\n"
+        }
+        response = requests.post(url=KUAISHOU_API_BASE, headers=self.header, json=json_data)
+        if response.status_code == 200:
+            response.raise_for_status()
+            return response.json()
+        else:
+            return None
+    def run(self, url):
+        real_url = self._extract_kuaishou_link(url)
+        if not real_url:
+            logger.error(f"快手视频 URL 解析失败 {url}")
+        cookies = self.get_temp_cookies()
+        if not cookies:
+            logger.error(f"快手视频 cookies 解析失败 {url},请考虑设置环境变量 KUAISHOU_COOKIES")
+        self.header['Cookie'] = cookies.strip()
+        photo_id = self.get_photo_id(real_url)
+        if photo_id is None:
+            logger.error(f"快手视频 ID 解析失败 {url}")
+        video_details = self.get_video_details(real_url, photo_id)
+        print(video_details)
+        if video_details is None:
+            logger.error(f"快手视频详情解析失败 {url}")
+        return video_details['data']
+if __name__ == '__main__':
+    ks = KuaiShou()
+    ks.run(
+        'https://v.kuaishou.com/2vBqX74 王宝强携手刘昊然、岳云鹏上演精彩名场面 全程高能 看一遍笑一遍 "唐探1900 "快成长计划 ...更多')

backend/app/downloaders/local_downloader.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import os
+import subprocess
+from abc import ABC
+from typing import Optional
+from app.downloaders.base import Downloader
+from app.enmus.note_enums import DownloadQuality
+from app.models.audio_model import AudioDownloadResult
+import os
+import subprocess
+from app.utils.video_helper import save_cover_to_static
+class LocalDownloader(Downloader, ABC):
+    def __init__(self):
+        super().__init__()
+    def extract_cover(self, input_path: str, output_dir: Optional[str] = None) -> str:
+        """
+        从本地视频文件中提取一张封面图（默认取第一帧）
+        :param input_path: 输入视频路径
+        :param output_dir: 输出目录，默认和视频同目录
+        :return: 提取出的封面图片路径
+        """
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(f"输入文件不存在: {input_path}")
+        if output_dir is None:
+            output_dir = os.path.dirname(input_path)
+        base_name = os.path.splitext(os.path.basename(input_path))[0]
+        output_path = os.path.join(output_dir, f"{base_name}_cover.jpg")
+        try:
+            command = [
+                'ffmpeg',
+                '-i', input_path,
+                '-ss', '00:00:01',  # 跳到视频第1秒，防止黑屏
+                '-vframes', '1',  # 只截取一帧
+                '-q:v', '2',  # 输出质量高一点（qscale，2是很高）
+                '-y',  # 覆盖
+                output_path
+            ]
+            subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+            if not os.path.exists(output_path):
+                raise RuntimeError(f"封面图片生成失败: {output_path}")
+            return output_path
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"提取封面失败: {output_path}") from e
+    def convert_to_mp3(self,input_path: str, output_path: str = None) -> str:
+        """
+        将本地视频文件转为 MP3 音频文件
+        :param input_path: 输入文件路径（如 .mp4）
+        :param output_path: 输出文件路径（可选，默认同目录同名 .mp3）
+        :return: 生成的 mp3 文件路径
+        """
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(f"输入文件不存在: {input_path}")
+        if output_path is None:
+            base, _ = os.path.splitext(input_path)
+            output_path = base + ".mp3"
+        try:
+        # 调用 ffmpeg 转换
+            command = [
+                'ffmpeg',
+                '-i', input_path,
+                '-vn',  # 不要视频流
+                '-acodec', 'libmp3lame',  # 使用mp3编码
+                '-y',  # 覆盖输出文件
+                output_path
+            ]
+            subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+            if not os.path.exists(output_path):
+                raise RuntimeError(f"mp3 文件生成失败: {output_path}")
+            return output_path
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"mp3 文件生成失败: {output_path}") from e
+    def download_video(self, video_url: str, output_dir: str = None) -> str:
+        """
+        处理本地文件路径，返回视频文件路径
+        """
+        if video_url.startswith('/uploads'):
+            project_root = os.getcwd()
+            video_url = os.path.join(project_root, video_url.lstrip('/'))
+            video_url = os.path.normpath(video_url)
+        if not os.path.exists(video_url):
+            raise FileNotFoundError()
+        return video_url
+    def download(
+            self,
+            video_url: str,
+            output_dir: str = None,
+            quality: DownloadQuality = "fast",
+            need_video: Optional[bool] = False
+    ) -> AudioDownloadResult:
+        """
+        处理本地文件路径，返回音频元信息
+        """
+        if video_url.startswith('/uploads'):
+            project_root = os.getcwd()
+            video_url = os.path.join(project_root, video_url.lstrip('/'))
+            video_url = os.path.normpath(video_url)
+        if not os.path.exists(video_url):
+            raise FileNotFoundError(f"本地文件不存在: {video_url}")
+        file_name = os.path.basename(video_url)
+        title, _ = os.path.splitext(file_name)
+        print(title, file_name,video_url)
+        file_path=self.convert_to_mp3(video_url)
+        cover_path = self.extract_cover(video_url)
+        cover_url = save_cover_to_static(cover_path)
+        print('file——path',file_path)
+        return AudioDownloadResult(
+            file_path=file_path,
+            title=title,
+            duration=0,  # 可选：后续加上读取时长
+            cover_url=cover_url,  # 暂无封面
+            platform="local",
+            video_id=title,
+            raw_info={
+                'path':  file_path
+            },
+            video_path=None
+        )

backend/app/downloaders/xiaohongshu_downloader.py ADDED Viewed

	@@ -0,0 +1,133 @@

+"""小红书下载器：基于 yt-dlp 内置 XiaoHongShu extractor。
+URL 模式：
+  - https://www.xiaohongshu.com/explore/{id}
+  - https://www.xiaohongshu.com/discovery/item/{id}
+  - 短链 xhslink.com/xxx 由 yt-dlp 自行跟随重定向
+小红书很多内容是图文笔记（无视频/音频）。无视频的会触发 yt-dlp 报「请求格式不可用」，
+前端会展示生成失败——这是预期行为，不强行兜底。
+"""
+import os
+import logging
+import tempfile
+from abc import ABC
+from typing import Union, Optional
+import yt_dlp
+from app.downloaders.base import Downloader, DownloadQuality
+from app.models.notes_model import AudioDownloadResult
+from app.services.cookie_manager import CookieConfigManager
+from app.utils.path_helper import get_data_dir
+from app.utils.url_parser import extract_video_id, clean_url
+logger = logging.getLogger(__name__)
+class XiaohongshuDownloader(Downloader, ABC):
+    def __init__(self):
+        super().__init__()
+        self._cookie_mgr = CookieConfigManager()
+        self._cookie = self._cookie_mgr.get('xiaohongshu')
+        self._browser = self._cookie_mgr.get_browser('xiaohongshu')
+        self._cookiefile = None if self._browser else self._write_netscape_cookie_file()
+    def _write_netscape_cookie_file(self) -> Optional[str]:
+        if not self._cookie:
+            logger.warning("小红书 Cookie 未配置，部分内容可能下载失败")
+            return None
+        lines = ["# Netscape HTTP Cookie File\n"]
+        for pair in self._cookie.split("; "):
+            if "=" in pair:
+                key, value = pair.split("=", 1)
+                lines.append(f".xiaohongshu.com\tTRUE\t/\tFALSE\t0\t{key}\t{value}\n")
+        tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8')
+        tmp.writelines(lines)
+        tmp.close()
+        logger.info("已生成小红书 Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1)
+        return tmp.name
+    def _apply_cookie(self, ydl_opts: dict) -> None:
+        if self._browser:
+            ydl_opts['cookiesfrombrowser'] = (self._browser,)
+            logger.info(f"小红书使用 cookies-from-browser: {self._browser}")
+        elif self._cookiefile:
+            ydl_opts['cookiefile'] = self._cookiefile
+    def download(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+        quality: DownloadQuality = "fast",
+        need_video: Optional[bool] = False,
+        skip_download: bool = False,
+    ) -> AudioDownloadResult:
+        # 从分享文案中提取干净链接（标题+不可见字符+短链 整段粘贴也能用）
+        video_url = clean_url(video_url)
+        if output_dir is None:
+            output_dir = get_data_dir()
+        if not output_dir:
+            output_dir = self.cache_data
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
+        ydl_opts = {
+            'format': 'bestaudio/best',
+            'outtmpl': output_path,
+            'noplaylist': True,
+            'quiet': False,
+        }
+        if skip_download:
+            ydl_opts['skip_download'] = True
+        self._apply_cookie(ydl_opts)
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(video_url, download=not skip_download)
+            video_id = info.get("id")
+            title = info.get("title")
+            duration = info.get("duration", 0)
+            cover_url = info.get("thumbnail")
+            ext = info.get("ext", "mp3")
+            audio_path = os.path.join(output_dir, f"{video_id}.{ext}")
+        return AudioDownloadResult(
+            file_path=audio_path,
+            title=title,
+            duration=duration,
+            cover_url=cover_url,
+            platform="xiaohongshu",
+            video_id=video_id,
+            raw_info={'tags': info.get('tags')},
+            video_path=None,
+        )
+    def download_video(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+    ) -> str:
+        video_url = clean_url(video_url)
+        if output_dir is None:
+            output_dir = get_data_dir()
+        video_id = extract_video_id(video_url, "xiaohongshu")
+        video_path = os.path.join(output_dir, f"{video_id}.mp4")
+        if os.path.exists(video_path):
+            return video_path
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
+        ydl_opts = {
+            'format': 'bestvideo+bestaudio/best',
+            'outtmpl': output_path,
+            'noplaylist': True,
+            'quiet': False,
+            'merge_output_format': 'mp4',
+        }
+        self._apply_cookie(ydl_opts)
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(video_url, download=True)
+            video_id = info.get("id")
+            video_path = os.path.join(output_dir, f"{video_id}.mp4")
+        if not os.path.exists(video_path):
+            raise FileNotFoundError(f"视频文件未找到: {video_path}")
+        return video_path

backend/app/downloaders/xiaoyuzhoufm_download.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from typing import Union, Optional
+import requests
+from app.downloaders.base import Downloader
+from app.enmus.note_enums import DownloadQuality
+from app.models.audio_model import AudioDownloadResult
+url='https://www.xiaoyuzhoufm.com/_next/data/5Pvt_oGntgdyBD_XgwBaB/podcast/62382c1103bea1ebfffa1c00.json?id=62382c1103bea1ebfffa1c00'
+header ={
+    'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36'
+}
+response = requests.get(url, headers=header)
+print(response.json())
+class Xiaoyuzhoufm_download(Downloader):
+    def download(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+        quality: DownloadQuality = "fast",
+        need_video:Optional[bool]=False
+    ) -> AudioDownloadResult:
+        pass

backend/app/downloaders/youtube_downloader.py ADDED Viewed

	@@ -0,0 +1,259 @@

+import os
+import logging
+import tempfile
+from abc import ABC
+from typing import Union, Optional, List
+import yt_dlp
+from app.downloaders.base import Downloader, DownloadQuality
+from app.downloaders.youtube_subtitle import YouTubeSubtitleFetcher
+from app.models.notes_model import AudioDownloadResult
+from app.models.transcriber_model import TranscriptResult
+from app.services.cookie_manager import CookieConfigManager
+from app.services.proxy_config_manager import ProxyConfigManager
+from app.utils.path_helper import get_data_dir
+from app.utils.url_parser import extract_video_id
+logger = logging.getLogger(__name__)
+def _apply_proxy(ydl_opts: dict) -> dict:
+    """YouTube 在国内需要代理。配置了全局代理就塞进 yt-dlp opts。"""
+    proxy = ProxyConfigManager().get_proxy_url()
+    if proxy:
+        ydl_opts['proxy'] = proxy
+        logger.info(f"yt-dlp 走代理: {proxy}")
+    return ydl_opts
+def _apply_youtube_extractor_args(ydl_opts: dict) -> dict:
+    """YouTube player_client 选择。
+    默认不再覆盖、交给 yt-dlp 的内置策略：
+    早期为绕开 SSAP 实验（issue #12482）硬编码过 ['tv', 'web_safari']，
+    但 YouTube 后来对 tv 客户端做「全量 DRM」实验（issue #12563），命中的会话
+    所有视频都报 "This video is DRM protected"；而 web 系客户端需要 JS runtime
+    （deno）解 n challenge，装好后 yt-dlp 默认客户端列表即可正常取流。
+    硬编码的客户端列表会随 YouTube 风控变化反复失效，不如跟随 yt-dlp 升级。
+    如需临时指定，可设环境变量 YT_PLAYER_CLIENT（逗号分隔），如
+    YT_PLAYER_CLIENT=web_safari,android_vr。
+    """
+    clients = os.getenv('YT_PLAYER_CLIENT', '').strip()
+    if clients:
+        ydl_opts.setdefault('extractor_args', {})
+        ydl_opts['extractor_args'].setdefault('youtube', {})
+        ydl_opts['extractor_args']['youtube']['player_client'] = [
+            c.strip() for c in clients.split(',') if c.strip()
+        ]
+    return ydl_opts
+class YoutubeDownloader(Downloader, ABC):
+    def __init__(self):
+        super().__init__()
+        self._cookie_mgr = CookieConfigManager()
+        self._cookie = self._cookie_mgr.get('youtube')
+        # 优先级：浏览器实时 cookies > 粘贴的 cookie 字符串。
+        # 配了浏览器就走 yt-dlp `cookiesfrombrowser`，能避开 YouTube 的会话轮换风控。
+        self._browser = self._cookie_mgr.get_browser('youtube')
+        self._cookiefile = None if self._browser else self._write_netscape_cookie_file()
+    def _write_netscape_cookie_file(self) -> Optional[str]:
+        """将 YouTube Cookie 写入 Netscape 格式临时文件，供 yt-dlp cookiefile 使用。
+        没有 Cookie 时返回 None；YouTube 现在没 Cookie 基本会被拦在「Sign in to confirm you're not a bot」。
+        """
+        if not self._cookie:
+            logger.warning("YouTube Cookie 未配置，下载可能会被风控为机器人")
+            return None
+        lines = ["# Netscape HTTP Cookie File\n"]
+        for pair in self._cookie.split("; "):
+            if "=" in pair:
+                key, value = pair.split("=", 1)
+                lines.append(f".youtube.com\tTRUE\t/\tFALSE\t0\t{key}\t{value}\n")
+        tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8')
+        tmp.writelines(lines)
+        tmp.close()
+        logger.info("已生成 YouTube Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1)
+        return tmp.name
+    def download(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+        quality: DownloadQuality = "fast",
+        need_video: Optional[bool] = False,
+        skip_download: bool = False,
+    ) -> AudioDownloadResult:
+        if output_dir is None:
+            output_dir = get_data_dir()
+        if not output_dir:
+            output_dir = self.cache_data
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
+        ydl_opts = {
+            'format': 'bestaudio[ext=m4a]/bestaudio/best',
+            'outtmpl': output_path,
+            'noplaylist': True,
+            'quiet': False,
+        }
+        if skip_download:
+            ydl_opts['skip_download'] = True
+        _apply_proxy(ydl_opts)
+        _apply_youtube_extractor_args(ydl_opts)
+        if self._browser:
+            # (browser_name,) 形式即可；profile/keyring/container 留默认
+            ydl_opts['cookiesfrombrowser'] = (self._browser,)
+            logger.info(f"YouTube 使用 cookies-from-browser: {self._browser}")
+        elif self._cookiefile:
+            ydl_opts['cookiefile'] = self._cookiefile
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(video_url, download=not skip_download)
+                video_id = info.get("id")
+                title = info.get("title")
+                duration = info.get("duration", 0)
+                cover_url = info.get("thumbnail")
+                ext = info.get("ext", "m4a")
+                audio_path = os.path.join(output_dir, f"{video_id}.{ext}")
+            return AudioDownloadResult(
+                file_path=audio_path,
+                title=title,
+                duration=duration,
+                cover_url=cover_url,
+                platform="youtube",
+                video_id=video_id,
+                raw_info={'tags': info.get('tags')},
+                video_path=None,
+            )
+        except Exception as exc:
+            # DRM / 反爬 / 格式不可用等情况下 yt-dlp 拉不动；只要本次仅需要 metadata
+            # （即字幕路径，skip_download=True），就退到 YouTube oEmbed 兜底拿标题+封面，
+            # 让流程能继续走总结。需要下载音视频时只能向上抛。
+            if not skip_download:
+                raise
+            logger.warning(f"yt-dlp 获取元数据失败，回退 oEmbed: {exc}")
+            return self._fallback_metadata(video_url)
+    def _fallback_metadata(self, video_url: str) -> AudioDownloadResult:
+        """yt-dlp 失败时的兜底：用 YouTube 公开的 oEmbed 接口拿基础 metadata。
+        只能拿到 title / thumbnail / author 这几样；duration / tags 拿不到，做空值处理。
+        DRM、bot 拦截等都不影响 oEmbed。
+        """
+        import requests
+        video_id = extract_video_id(video_url, "youtube") or ""
+        title = video_id or "YouTube 视频"
+        cover = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg" if video_id else ""
+        try:
+            proxies = None
+            proxy = ProxyConfigManager().get_proxy_url()
+            if proxy:
+                proxies = {"http": proxy, "https": proxy}
+            resp = requests.get(
+                "https://www.youtube.com/oembed",
+                params={"url": video_url, "format": "json"},
+                proxies=proxies,
+                timeout=10,
+            )
+            resp.raise_for_status()
+            data = resp.json()
+            if data.get("title"):
+                title = data["title"]
+            if data.get("thumbnail_url"):
+                cover = data["thumbnail_url"]
+            logger.info(f"oEmbed 兜底成功：title={title}")
+        except Exception as e:
+            logger.warning(f"oEmbed 兜底也失败，使用最小元数据：{e}")
+        return AudioDownloadResult(
+            file_path="",          # 没下载音视频文件
+            title=title,
+            duration=0,            # oEmbed 不返回时长
+            cover_url=cover,
+            platform="youtube",
+            video_id=video_id,
+            raw_info={"tags": []}, # oEmbed 不返回标签
+            video_path=None,
+        )
+    def download_video(
+        self,
+        video_url: str,
+        output_dir: Union[str, None] = None,
+    ) -> str:
+        """
+        下载视频，返回视频文件路径
+        """
+        if output_dir is None:
+            output_dir = get_data_dir()
+        video_id = extract_video_id(video_url, "youtube")
+        video_path = os.path.join(output_dir, f"{video_id}.mp4")
+        if os.path.exists(video_path):
+            return video_path
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
+        ydl_opts = {
+            # 这里下载的视频只用于截图网格/视频理解抽帧，720p 足够：
+            # 不设上限的话 bestvideo 会选 4K AV1（动辄 300MB+，下载和 ffmpeg
+            # 解码抽帧都极慢）。优先 avc1（解码远快于 av01），同高度再退 av01。
+            'format': (
+                'bestvideo[height<=720][vcodec^=avc1]+bestaudio[ext=m4a]'
+                '/bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]'
+                '/best[height<=720][ext=mp4]/best[ext=mp4]'
+            ),
+            'outtmpl': output_path,
+            'noplaylist': True,
+            'quiet': False,
+            'merge_output_format': 'mp4',  # 确保合并成 mp4
+        }
+        _apply_proxy(ydl_opts)
+        _apply_youtube_extractor_args(ydl_opts)
+        if self._browser:
+            # (browser_name,) 形式即可；profile/keyring/container 留默认
+            ydl_opts['cookiesfrombrowser'] = (self._browser,)
+            logger.info(f"YouTube 使用 cookies-from-browser: {self._browser}")
+        elif self._cookiefile:
+            ydl_opts['cookiefile'] = self._cookiefile
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(video_url, download=True)
+            video_id = info.get("id")
+            video_path = os.path.join(output_dir, f"{video_id}.mp4")
+        if not os.path.exists(video_path):
+            raise FileNotFoundError(f"视频文件未找到: {video_path}")
+        return video_path
+    def download_subtitles(self, video_url: str, output_dir: str = None,
+                           langs: List[str] = None) -> Optional[TranscriptResult]:
+        """
+        通过 YouTube InnerTube API 直接获取字幕（优先人工字幕，其次自动生成）。
+        比 yt_dlp 方式更轻量，无需写临时文件到磁盘。
+        :param video_url: 视频链接
+        :param output_dir: 未使用（保留接口兼容）
+        :param langs: 优先语言列表
+        :return: TranscriptResult 或 None
+        """
+        if langs is None:
+            langs = ['zh-Hans', 'zh', 'zh-CN', 'zh-TW', 'en', 'en-US', 'ja']
+        video_id = extract_video_id(video_url, "youtube")
+        fetcher = YouTubeSubtitleFetcher()
+        print(
+            f"尝试获取字幕，video_id={video_id}, langs={langs}"
+        )
+        return fetcher.fetch_subtitles(video_id, langs)

backend/app/downloaders/youtube_subtitle.py ADDED Viewed

	@@ -0,0 +1,113 @@

+"""
+通过 youtube-transcript-api 获取 YouTube 字幕。
+优先人工字幕，其次自动生成字幕。不依赖 yt_dlp，无需下载任何文件。
+"""
+from typing import Optional, List
+from youtube_transcript_api import YouTubeTranscriptApi
+from app.models.transcriber_model import TranscriptResult, TranscriptSegment
+from app.services.proxy_config_manager import ProxyConfigManager
+from app.utils.logger import get_logger
+logger = get_logger(__name__)
+class YouTubeSubtitleFetcher:
+    """通过 youtube-transcript-api 获取 YouTube 字幕。"""
+    def __init__(self):
+        # 配了全局代理就给 youtube-transcript-api 套一个带 proxies 的 requests.Session，
+        # 否则国内拉字幕同样会超时。代理未配置时退回默认无代理客户端。
+        proxy = ProxyConfigManager().get_proxy_url()
+        if proxy:
+            try:
+                import requests
+                session = requests.Session()
+                session.proxies = {"http": proxy, "https": proxy}
+                self._api = YouTubeTranscriptApi(http_client=session)
+                logger.info(f"YouTube 字幕走代理: {proxy}")
+            except Exception as e:
+                logger.warning(f"为 youtube-transcript-api 注入代理失败，回退无代理: {e}")
+                self._api = YouTubeTranscriptApi()
+        else:
+            self._api = YouTubeTranscriptApi()
+    def fetch_subtitles(
+        self,
+        video_id: str,
+        langs: Optional[List[str]] = None,
+    ) -> Optional[TranscriptResult]:
+        if langs is None:
+            langs = ["zh-Hans", "zh", "zh-CN", "zh-TW", "en", "en-US", "ja"]
+        try:
+            # 1. 列出所有可用字幕
+            transcript_list = self._api.list(video_id)
+            available = []
+            for t in transcript_list:
+                available.append(
+                    f"{t.language_code}({'auto' if t.is_generated else 'manual'})"
+                )
+            logger.info(f"可用字幕轨道: {', '.join(available)}")
+            # 2. 按优先级查找：先人工字幕，再自动字幕
+            transcript = None
+            try:
+                transcript = transcript_list.find_manually_created_transcript(langs)
+                logger.info(f"选中人工字幕: {transcript.language_code} ({transcript.language})")
+            except Exception:
+                try:
+                    transcript = transcript_list.find_generated_transcript(langs)
+                    logger.info(f"选中自动字幕: {transcript.language_code} ({transcript.language})")
+                except Exception:
+                    # 都没匹配，取第一个可用的
+                    for t in transcript_list:
+                        transcript = t
+                        source = "auto" if t.is_generated else "manual"
+                        logger.info(f"使用首个可用字幕: {t.language_code} ({source})")
+                        break
+            if not transcript:
+                logger.info(f"YouTube 视频 {video_id} 没有任何可用字幕")
+                return None
+            # 3. 获取字幕内容
+            fetched = transcript.fetch()
+            segments = []
+            for snippet in fetched:
+                text = snippet.get("text", "").strip() if isinstance(snippet, dict) else str(snippet).strip()
+                if not text:
+                    continue
+                start = snippet.get("start", 0) if isinstance(snippet, dict) else 0
+                duration = snippet.get("duration", 0) if isinstance(snippet, dict) else 0
+                segments.append(TranscriptSegment(
+                    start=float(start),
+                    end=float(start) + float(duration),
+                    text=text,
+                ))
+            if not segments:
+                logger.warning(f"YouTube 字幕内容为空: {video_id}")
+                return None
+            full_text = " ".join(seg.text for seg in segments)
+            logger.info(f"成功获取 YouTube 字幕，共 {len(segments)} 段")
+            return TranscriptResult(
+                language=transcript.language_code,
+                full_text=full_text,
+                segments=segments,
+                raw={
+                    "source": "youtube_transcript_api",
+                    "language": transcript.language,
+                    "language_code": transcript.language_code,
+                    "is_generated": transcript.is_generated,
+                },
+            )
+        except Exception as e:
+            logger.warning(f"YouTube 字幕获取失败: {e}")
+            return None

backend/app/enmus/exception.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import enum
+class ProviderErrorEnum(enum.Enum):
+    CONNECTION_TEST_FAILED = (200101, "供应商连接测试失败")
+    SAVE_FAILED = (200102, "供应商保存失败")
+    CREATE_FAILED = (200103, "供应商创建失败")
+    NOT_FOUND = (200104, "供应商不存在/未保存")
+    WRONG_PARAMETER = (200105, "API / API 地址不正确")
+    UNKNOW_ERROR = (200106, "未知错误")
+    def __init__(self, code, message):
+        self.code = code
+        self.message = message
+class NoteErrorEnum(enum.Enum):
+    PLATFORM_NOT_SUPPORTED = (300101 ,"选择的平台不受支持")
+    def __init__(self, code, message):
+        self.code = code
+        self.message = message

backend/app/enmus/note_enums.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import enum
+class DownloadQuality(str, enum.Enum):
+    """Download quality presets; the ``str`` mixin makes each member equal to its plain string value."""
+    fast = "fast"
+    medium = "medium"
+    slow = "slow"