zhoujiaangyao committed on
Commit
6cfe55f
·
0 Parent(s):

deploy videomemo backend to HF Space

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +12 -0
  2. Dockerfile +48 -0
  3. README.md +20 -0
  4. backend/.env.example +12 -0
  5. backend/Dockerfile +42 -0
  6. backend/Dockerfile.gpu +40 -0
  7. backend/__init__.py +0 -0
  8. backend/app/__init__.py +47 -0
  9. backend/app/article_fetchers/__init__.py +3 -0
  10. backend/app/article_fetchers/base.py +36 -0
  11. backend/app/article_fetchers/generic.py +117 -0
  12. backend/app/article_fetchers/wechat.py +142 -0
  13. backend/app/article_fetchers/xiaohongshu.py +218 -0
  14. backend/app/core/__init__.py +0 -0
  15. backend/app/db/__init__.py +0 -0
  16. backend/app/db/article_dao.py +167 -0
  17. backend/app/db/builtin_providers.json +65 -0
  18. backend/app/db/engine.py +45 -0
  19. backend/app/db/init_db.py +34 -0
  20. backend/app/db/model_dao.py +69 -0
  21. backend/app/db/models/__init__.py +0 -0
  22. backend/app/db/models/articles.py +55 -0
  23. backend/app/db/models/models.py +12 -0
  24. backend/app/db/models/providers.py +17 -0
  25. backend/app/db/models/trend_subscription.py +50 -0
  26. backend/app/db/models/video_tasks.py +14 -0
  27. backend/app/db/provider_dao.py +129 -0
  28. backend/app/db/sqlite_client.py +4 -0
  29. backend/app/db/trend_subscription_dao.py +293 -0
  30. backend/app/db/video_task_dao.py +61 -0
  31. backend/app/decorators/__init__.py +0 -0
  32. backend/app/decorators/timeit.py +13 -0
  33. backend/app/downloaders/__init__.py +0 -0
  34. backend/app/downloaders/base.py +52 -0
  35. backend/app/downloaders/bilibili_downloader.py +343 -0
  36. backend/app/downloaders/bilibili_subtitle.py +164 -0
  37. backend/app/downloaders/common.py +1 -0
  38. backend/app/downloaders/douyin_downloader.py +499 -0
  39. backend/app/downloaders/douyin_helper/abogus.py +635 -0
  40. backend/app/downloaders/generic_downloader.py +128 -0
  41. backend/app/downloaders/kuaishou_downloader.py +97 -0
  42. backend/app/downloaders/kuaishou_helper/__init__.py +0 -0
  43. backend/app/downloaders/kuaishou_helper/kuaishou.py +101 -0
  44. backend/app/downloaders/local_downloader.py +137 -0
  45. backend/app/downloaders/xiaohongshu_downloader.py +133 -0
  46. backend/app/downloaders/xiaoyuzhoufm_download.py +25 -0
  47. backend/app/downloaders/youtube_downloader.py +259 -0
  48. backend/app/downloaders/youtube_subtitle.py +113 -0
  49. backend/app/enmus/exception.py +21 -0
  50. backend/app/enmus/note_enums.py +7 -0
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ backend/.venv/
2
+ backend/data/
3
+ backend/models/
4
+ backend/config/
5
+ backend/note_results/
6
+ backend/static/
7
+ backend/uploads/
8
+ backend/*.db
9
+ backend/app/db/*.db
10
+ __pycache__/
11
+ *.pyc
12
+ .env
Dockerfile ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VideoMemo 后端 —— Hugging Face Spaces(Docker SDK)部署用 Dockerfile。
2
+ #
3
+ # 用法:HF Space 是一个独立 git 仓库,把它的根目录布置成:
4
+ # /Dockerfile ← 本文件(复制到 Space 根目录,重命名为 Dockerfile)
5
+ # /README.md ← deploy/hf-space/README.md(含 HF 必需的 frontmatter)
6
+ # /backend/... ← 从本项目复制整个 backend 目录过去
7
+ # 然后 git push 到 Space,HF 会构建本文件(COPY 路径相对 Space 根目录)。
8
+ #
9
+ # 镜像故意精简:只装 ffmpeg + 后端依赖,默认走 REST 飞书推送,不装 lark-cli。
10
+ # 数据库用外接 Postgres(Supabase),通过 DATABASE_URL Secret 注入。
11
+ ARG BASE_REGISTRY=docker.io
12
+ FROM ${BASE_REGISTRY}/library/python:3.11-slim
13
+
14
+ # HF 在 huggingface.co 自家基础设施上构建/运行:用官方 PyPI 与默认 HF 端点,
15
+ # 不要用国内镜像(那会更慢甚至失败)。
16
+ ARG PIP_INDEX=https://pypi.org/simple
17
+
18
+ # fonts-liberation 提供与 Arial 度量兼容的 LiberationSans,替代仓库里的 arial.ttf
19
+ # (HF git 不收二进制,故字体不进仓库,改由镜像在构建时提供)
20
+ RUN apt-get update && \
21
+ apt-get install -y --no-install-recommends ffmpeg curl fonts-liberation && \
22
+ rm -rf /var/lib/apt/lists/*
23
+
24
+ ENV PYTHONUNBUFFERED=1 \
25
+ BACKEND_HOST=0.0.0.0 \
26
+ BACKEND_PORT=8483 \
27
+ STATIC=/static \
28
+ OUT_DIR=/app/static/screenshots \
29
+ IMAGE_BASE_URL=/static/screenshots \
30
+ NOTE_OUTPUT_DIR=/app/data/note_results \
31
+ DATA_DIR=/app/data
32
+
33
+ WORKDIR /app
34
+
35
+ # 先装依赖利用层缓存
36
+ COPY backend/requirements.txt /app/requirements.txt
37
+ RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt
38
+
39
+ # 再复制后端代码
40
+ COPY backend /app
41
+
42
# Pre-create the directories the backend writes to. Container storage on an HF
# Space is ephemeral (wiped on restart), so structured data must go to the
# external DATABASE_URL; notes/screenshots are temporary data that may later be
# moved to object storage.
# Docker Spaces run the container as UID 1000 rather than root, so ownership of
# /app is handed to that UID; otherwise these root-created directories would
# not be writable at runtime.
RUN mkdir -p /app/data/note_results /app/static/screenshots /app/config /app/fonts && \
    cp /usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf /app/fonts/arial.ttf && \
    chown -R 1000:1000 /app
46
+
47
+ EXPOSE 8483
48
+ CMD ["python", "main.py"]
README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: VideoMemo Backend
3
+ emoji: 🎬
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: docker
7
+ app_port: 8483
8
+ pinned: false
9
+ ---
10
+
11
+ # VideoMemo 后端(API)
12
+
13
+ AI 视频笔记生成的后端服务。桌面端 / 网页端 / 浏览器插件连接本 Space 的地址使用。
14
+
15
+ - **结构化数据**(LLM 供应商配置与 API key、模型、关键词订阅、通知渠道、任务索引)
16
+ 持久化到外接 Postgres(Supabase),通过 `DATABASE_URL` Secret 配置。
17
+ - **本 Space 公开可访问**:务必设置 `WEB_ACCESS_PASSWORD` Secret,否则任何人都能调用你的后端。
18
+ - 笔记正文 / 截图 / 向量库当前仍是容器内临时文件,**重启会清空**(计划后续迁入 Postgres / 对象存储)。
19
+
20
+ > 部署步骤见仓库 `deploy/hf-space/DEPLOY.md`。
backend/.env.example ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # 通用
3
+ ENV=production
4
+ API_BASE_URL=http://127.0.0.1:8000
5
+ SCREENSHOT_BASE_URL=http://127.0.0.1:8000/static/screenshots
6
+ STATIC=/static # 外部访问路径(URL 前缀)
7
+ OUT_DIR=./static/screenshots # 本地输出目录
8
+ IMAGE_BASE_URL=/static/screenshots # 图片访问 URL
9
+ DATA_DIR=data
10
+ # transcriber 相关配置
11
+ TRANSCRIBER_TYPE=fast-whisper # fast-whisper/bcut/kuaishou
12
+ WHISPER_MODEL_SIZE=base
backend/Dockerfile ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BASE_REGISTRY 默认走 docker.io;国内拉不到 docker.io 时可换 daocloud / 阿里云 / 自建镜像源:
2
+ # docker-compose build --build-arg BASE_REGISTRY=docker.m.daocloud.io
3
+ # 或写到 docker-compose.yml 的 build.args / 环境变量里
4
+ ARG BASE_REGISTRY=docker.io
5
+ FROM ${BASE_REGISTRY}/library/python:3.11-slim
6
+
7
+ ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn
8
+ ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple
9
+
10
+ RUN rm -f /etc/apt/sources.list && \
11
+ rm -rf /etc/apt/sources.list.d/* && \
12
+ echo "deb https://${APT_MIRROR}/debian bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \
13
+ echo "deb https://${APT_MIRROR}/debian bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \
14
+ echo "deb https://${APT_MIRROR}/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \
15
+ apt-get update && \
16
+ apt-get install -y --no-install-recommends ffmpeg curl && \
17
+ rm -rf /var/lib/apt/lists/*
18
+
19
+ ENV PATH="/usr/bin:${PATH}"
20
+ ENV HF_ENDPOINT=https://hf-mirror.com
21
+
22
+ # 飞书「推送方式 = lark-cli / auto」时需要官方 lark CLI(npm 包 @larksuite/cli,二进制名 lark-cli)。
23
+ # 走 REST 直连推送则用不到,可按需删除本段以瘦身镜像。
24
+ # 凭证通过 LARK_APP_ID / LARK_APP_SECRET 环境变量在运行时注入(由后端调用时传入),此处不写死。
25
+ ARG NPM_REGISTRY=https://registry.npmmirror.com
26
+ RUN apt-get update && \
27
+ apt-get install -y --no-install-recommends nodejs npm && \
28
+ npm config set registry ${NPM_REGISTRY} && \
29
+ npm install -g @larksuite/cli && \
30
+ rm -rf /var/lib/apt/lists/* /root/.npm && \
31
+ (lark-cli --version || true)
32
+
33
+ WORKDIR /app
34
+
35
+ # 先复制 requirements.txt 利用层缓存
36
+ COPY ./backend/requirements.txt /app/requirements.txt
37
+ RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt
38
+
39
+ # 再复制应用代码(频繁变动不影响 pip 缓存层)
40
+ COPY ./backend /app
41
+
42
+ CMD ["python", "main.py"]
backend/Dockerfile.gpu ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BASE_REGISTRY 默认走 docker.io;国内可换 daocloud / 阿里云镜像(注意所选镜像需支持 nvidia/cuda 命名空间)
2
+ ARG BASE_REGISTRY=docker.io
3
+ FROM ${BASE_REGISTRY}/nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
4
+
5
+ ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn
6
+ ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple
7
+
8
+ RUN rm -f /etc/apt/sources.list && \
9
+ rm -rf /etc/apt/sources.list.d/* && \
10
+ echo "deb https://${APT_MIRROR}/ubuntu jammy main restricted universe multiverse" > /etc/apt/sources.list && \
11
+ echo "deb https://${APT_MIRROR}/ubuntu jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
12
+ echo "deb https://${APT_MIRROR}/ubuntu jammy-security main restricted universe multiverse" >> /etc/apt/sources.list && \
13
+ apt-get update && \
14
+ apt-get install -y --no-install-recommends ffmpeg python3-pip curl && \
15
+ rm -rf /var/lib/apt/lists/*
16
+
17
+ ENV HF_ENDPOINT=https://hf-mirror.com
18
+
19
+ # 飞书「推送方式 = lark-cli / auto」时需要官方 lark CLI(npm 包 @larksuite/cli,二进制名 lark-cli)。
20
+ # Ubuntu 22.04 自带 apt 的 Node 太旧(v12)跑不动新 CLI,这里用 NodeSource 装 Node 20。
21
+ # 走 REST 直连推送则用不到,可按需删除本段以瘦身镜像。凭证由后端运行时经环境变量注入,不写死。
22
+ ARG NPM_REGISTRY=https://registry.npmmirror.com
23
+ RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
24
+ apt-get install -y --no-install-recommends nodejs && \
25
+ npm config set registry ${NPM_REGISTRY} && \
26
+ npm install -g @larksuite/cli && \
27
+ rm -rf /var/lib/apt/lists/* /root/.npm && \
28
+ (lark-cli --version || true)
29
+
30
+ WORKDIR /app
31
+
32
+ # 先复制 requirements.txt 利用层缓存
33
+ COPY ./backend/requirements.txt /app/requirements.txt
34
+ RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt && \
35
+ pip install --no-cache-dir -i ${PIP_INDEX} 'transformers[torch]>=4.23'
36
+
37
+ # 再复制应用代码
38
+ COPY ./backend /app
39
+
40
+ CMD ["python3", "main.py"]
backend/__init__.py ADDED
File without changes
backend/app/__init__.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+
4
+ from fastapi import Depends, FastAPI, Header, HTTPException, Request
5
+
6
+ # 健康/诊断类接口:公网前端在用户尚未填访问密码时,也要能判断后端是否可达、
7
+ # 从而正常加载页面(否则启动探测被密码拦成 401,整页卡在「连接中」无法进入设置去填密码)。
8
+ _AUTH_EXEMPT_PATHS = {"/api/sys_check", "/api/sys_health", "/api/deploy_status"}
9
+
10
+
11
async def verify_web_access_password(
    request: Request,
    request_web_access_password: Optional[str] = Header(
        None, alias="request-web-access-password"
    )
):
    """FastAPI dependency enforcing the optional shared access password.

    Paths listed in ``_AUTH_EXEMPT_PATHS`` are always allowed. When the
    ``WEB_ACCESS_PASSWORD`` environment variable is unset or empty, every
    request is allowed. Otherwise the ``request-web-access-password`` header
    must match the configured value.

    Returns:
        ``True`` when the request may proceed.

    Raises:
        HTTPException: 401 when a password is configured and the header is
            missing or does not match.
    """
    # Imported locally so this block does not depend on a file-level import.
    import hmac

    if request.url.path in _AUTH_EXEMPT_PATHS:
        return True
    expected = os.getenv("WEB_ACCESS_PASSWORD")
    if not expected:
        return True
    supplied = request_web_access_password or ""
    # Constant-time comparison: a plain ``!=`` on a secret leaks, through
    # timing, how many leading characters matched. Encoding to bytes keeps
    # compare_digest usable for non-ASCII passwords.
    if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
        raise HTTPException(status_code=401, detail="访问密码错误或未填写")
    return True
23
+
24
def create_app(lifespan) -> FastAPI:
    """Build the FastAPI application and mount every API router under ``/api``.

    Args:
        lifespan: Lifespan handler passed straight through to ``FastAPI``.

    Returns:
        The configured application. ``/sys_check`` is registered directly on
        the app without the password dependency; all routers carry it.
    """
    from .routers import note, notification, provider, model, config, chat, flashcard, hot_videos, article, trend_subscription, feishu
    from .utils.response import ResponseWrapper as R

    application = FastAPI(title="VideoMemo", lifespan=lifespan)
    auth_guard = [Depends(verify_web_access_password)]

    @application.get("/sys_check")
    async def root_sys_check():
        return R.success()

    # Registration order is kept identical to the original explicit calls.
    api_modules = (
        note,
        provider,
        model,
        config,
        chat,
        flashcard,
        hot_videos,
        article,
        trend_subscription,
        notification,
        feishu,
    )
    for module in api_modules:
        application.include_router(module.router, prefix="/api", dependencies=auth_guard)

    return application
backend/app/article_fetchers/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from app.article_fetchers.base import ArticleContent, ArticleFetcher, ArticleFetchError
2
+
3
+ __all__ = ["ArticleContent", "ArticleFetcher", "ArticleFetchError"]
backend/app/article_fetchers/base.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Protocol
5
+
6
+
7
@dataclass
class ArticleContent:
    """Platform-neutral record describing one article.

    ``platform``, ``url``, ``article_id`` and ``title`` are required; every
    other field defaults to an empty string, list or dict.
    """

    platform: str  # identifier of the source platform
    url: str  # URL of the article
    article_id: str  # identifier of the article; format presumably platform-specific — confirm
    title: str
    author_name: str = ""
    author_id: str = ""
    content_text: str = ""  # article body as plain text
    image_urls: list[str] = field(default_factory=list)  # fresh list per instance
    cover_url: str = ""
    published_at: str = ""  # kept as a string; no format is enforced here
    raw_metadata: dict = field(default_factory=dict)  # fresh dict per instance
20
+
21
+
22
class ArticleFetchError(Exception):
    """Error type raised by article fetchers, e.g. for failed or unsupported requests."""

    pass
24
+
25
+
26
class ArticleFetcher(Protocol):
    """Structural interface that a per-platform article fetcher must satisfy."""

    platform: str  # identifier of the platform this fetcher serves

    def fetch(self, url: str) -> ArticleContent:
        """Fetch the single article located at ``url``."""
        ...

    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
        """Return articles for ``keyword``; ``limit`` presumably caps the result count — confirm."""
        ...

    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
        """Return articles for the publisher named by ``query``; ``limit`` presumably caps the count — confirm."""
        ...
backend/app/article_fetchers/generic.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from urllib.parse import urlparse
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+
9
+ from app.article_fetchers.base import ArticleContent, ArticleFetchError
10
+ from app.utils.url_parser import clean_url
11
+
12
+
13
def _clean_text(value: str) -> str:
    """Collapse runs of spaces, tabs, CR, FF and VT into one space and trim the ends.

    Newlines are not in the character class, so interior line breaks are kept.
    A falsy ``value`` is treated as the empty string.
    """
    source = value if value else ""
    collapsed = re.sub(r"[ \t\r\f\v]+", " ", source)
    return collapsed.strip()
15
+
16
+
17
def _normalize_body(value: str) -> str:
    """Clean each line of ``value`` and drop the lines that end up empty.

    Returns the surviving lines joined with a single newline.
    """
    kept = []
    for raw_line in (value or "").splitlines():
        cleaned = _clean_text(raw_line)
        if cleaned:
            kept.append(cleaned)
    return "\n".join(kept)
20
+
21
+
22
def _meta_content(soup: BeautifulSoup, *selectors: tuple[str, str]) -> str:
    """Return the first non-empty ``content`` of the ``<meta>`` tags named by ``selectors``.

    Each selector is an ``(attribute, value)`` pair, tried in the given order.
    Returns an empty string when none of them yields text.
    """
    # Both generators are lazy, so lookups stop at the first selector that hits.
    nodes = (soup.find("meta", attrs={attr: value}) for attr, value in selectors)
    texts = (_clean_text(node.get("content") or "") for node in nodes if node)
    return next((text for text in texts if text), "")
30
+
31
+
32
def _candidate_score(node) -> int:
    """Score a candidate content node: normalized text length plus 120 per ``<p>`` descendant."""
    text_length = len(_normalize_body(node.get_text("\n")))
    paragraph_bonus = 120 * len(node.find_all("p"))
    return text_length + paragraph_bonus
36
+
37
+
38
def parse_generic_article_html(html: str, url: str) -> ArticleContent:
    """Extract title, metadata and main text from an arbitrary article page.

    Args:
        html: Raw HTML of the page.
        url: URL of the page; stored on the result and used to derive
            ``article_id`` (host + path).

    Returns:
        An ``ArticleContent`` with ``platform="generic_web"``.

    Raises:
        ValueError: If fewer than 40 characters of body text can be recovered.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Remove non-content and page-chrome elements before extracting text.
    for tag in soup(["script", "style", "noscript", "svg", "canvas", "iframe"]):
        tag.decompose()
    for tag in soup(["nav", "header", "footer", "aside", "form"]):
        tag.decompose()

    # Prefer the social-card title and fall back to <title>. The original
    # one-liner `A or B if soup.title else ""` parsed as
    # `(A or B) if soup.title else ""`, which threw away og:title whenever
    # the page had no <title> element.
    title = _meta_content(soup, ("property", "og:title"), ("name", "twitter:title"))
    if not title and soup.title:
        title = _clean_text(soup.title.get_text(" "))
    author = _meta_content(soup, ("name", "author"), ("property", "article:author"))
    published_at = _meta_content(
        soup,
        ("property", "article:published_time"),
        ("name", "publishdate"),
        ("name", "date"),
    )
    cover = _meta_content(soup, ("property", "og:image"), ("name", "twitter:image"))

    # Collect likely content containers; fall back to <body> when none match.
    candidates = []
    for selector in ("article", "main", "[role='main']", "#content", ".content", ".article", ".post"):
        candidates.extend(soup.select(selector))
    if not candidates and soup.body:
        candidates = [soup.body]
    best = max(candidates, key=_candidate_score, default=None)
    body = _normalize_body(best.get_text("\n")) if best else ""
    if len(body) < 80:
        # Very short body: use the meta description when it is longer.
        description = _meta_content(soup, ("name", "description"), ("property", "og:description"))
        body = description if len(description) > len(body) else body
    if len(body) < 40:
        raise ValueError("网页正文为空或过短,无法生成总结")

    parsed = urlparse(url)
    article_id = parsed.netloc + parsed.path
    return ArticleContent(
        platform="generic_web",
        url=url,
        article_id=article_id or url,
        title=title or parsed.netloc or "网页文章",
        author_name=author,
        content_text=body,
        image_urls=[cover] if cover else [],
        cover_url=cover,
        published_at=published_at,
        raw_metadata={"source": "generic_web"},
    )
85
+
86
+
87
class GenericArticleFetcher:
    """Fetcher for arbitrary web pages; only direct URL fetching is supported."""

    platform = "generic_web"

    def fetch(self, url: str) -> ArticleContent:
        """Download ``url`` with browser-like headers and parse it as a generic article.

        ``ValueError`` is re-raised unchanged; any other exception is wrapped
        in ``ArticleFetchError``.
        """
        target = clean_url(url)
        browser_headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        }
        try:
            reply = requests.get(
                target,
                timeout=12,
                allow_redirects=True,
                headers=browser_headers,
            )
            reply.raise_for_status()
            page = reply.text
            # Use the post-redirect URL when the response reports one.
            final_url = reply.url or target
            return parse_generic_article_html(page, final_url)
        except ValueError:
            raise
        except Exception as exc:
            raise ArticleFetchError(f"网页文章抓取失败:{exc}") from exc

    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
        """Always raises: keyword search is not supported for generic pages."""
        raise ArticleFetchError("通用网页暂不支持关键字查询,请粘贴具体文章链接")

    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
        """Always raises: publisher subscription is not supported for generic pages."""
        raise ArticleFetchError("通用网页暂不支持发布者订阅,请粘贴具体文章链接")
backend/app/article_fetchers/wechat.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from urllib.parse import parse_qs, quote, unquote, urljoin, urlparse
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+
9
+ from app.article_fetchers.base import ArticleContent, ArticleFetchError
10
+
11
+
12
def _clean_text(value: str) -> str:
    """Collapse every whitespace run (newlines included) to one space and trim the ends.

    A falsy ``value`` is treated as the empty string.
    """
    text = value if value else ""
    squeezed = re.sub(r"\s+", " ", text)
    return squeezed.strip()
14
+
15
+
16
def _element_text(element) -> str:
    """Return the cleaned text of ``element``, or an empty string when it is falsy."""
    if not element:
        return ""
    return _clean_text(element.get_text(" "))
18
+
19
+
20
def _script_value(html: str, name: str) -> str:
    """Find the string value assigned to ``name`` in the page's inline script text.

    Two forms are tried in order: ``var name = "value"`` and ``name: 'value'``.
    The first form that matches wins, even when its value is empty. Returns an
    empty string when neither form is found.
    """
    escaped = re.escape(name)
    double_quoted_var = rf'var\s+{escaped}\s*=\s*"([^"]*)"'
    single_quoted_key = rf"{escaped}\s*:\s*'([^']*)'"
    for pattern in (double_quoted_var, single_quoted_key):
        found = re.search(pattern, html)
        if found is not None:
            return found.group(1).strip()
    return ""
30
+
31
+
32
def parse_wechat_article_html(html: str, url: str) -> ArticleContent:
    """Parse a WeChat official-account article page into an ``ArticleContent``.

    The title comes from ``#activity-name`` (or the first ``<h1>``), the author
    from ``#js_name``, the publish time from ``#publish_time`` and the body
    from ``#js_content``. The ``biz``/``mid``/``idx``/``sn`` script values are
    joined with ``:`` to form the article id, falling back to ``url``.

    Raises:
        ValueError: If no body text is found.
    """
    soup = BeautifulSoup(html, "html.parser")
    headline = soup.find(id="activity-name") or soup.find("h1")
    title = _element_text(headline)
    author = _element_text(soup.find(id="js_name"))
    published_at = _element_text(soup.find(id="publish_time"))

    content = soup.find(id="js_content")
    body = ""
    if content:
        body = _clean_text(content.get_text("\n"))
    if not body:
        raise ValueError("微信公众号文章正文为空,无法生成总结")

    # Lazily loaded images keep the real address in data-src; dict.fromkeys
    # removes duplicates while preserving first-seen order.
    image_tags = content.find_all("img") if content else []
    sources = (tag.get("data-src") or tag.get("src") or "" for tag in image_tags)
    image_urls: list[str] = list(dict.fromkeys(src for src in sources if src))

    biz, mid, idx, sn = (_script_value(html, key) for key in ("biz", "mid", "idx", "sn"))
    article_id = ":".join(filter(None, (biz, mid, idx, sn))) or url

    return ArticleContent(
        platform="wechat_mp",
        url=url,
        article_id=article_id,
        title=title or "微信公众号文章",
        author_name=author,
        author_id=biz,
        content_text=body,
        image_urls=image_urls,
        cover_url=image_urls[0] if image_urls else "",
        published_at=published_at,
        raw_metadata={"biz": biz, "mid": mid, "idx": idx, "sn": sn},
    )
67
+
68
+
69
def _normalize_wechat_result_url(href: str) -> str:
    """Resolve a search-result link to an ``mp.weixin.qq.com`` article URL.

    ``href`` is made absolute against ``https://weixin.sogou.com``. If its
    ``url`` or ``target`` query parameter holds an ``mp.weixin.qq.com``
    address, that address is returned; otherwise the absolute link itself is
    returned when it contains that host, else an empty string.
    """
    if not href:
        return ""
    absolute = urljoin("https://weixin.sogou.com", href)
    params = parse_qs(urlparse(absolute).query)
    wrapped = (unquote(params[key][0]) for key in ("url", "target") if params.get(key))
    for candidate in wrapped:
        if "mp.weixin.qq.com" in candidate:
            return candidate
    if "mp.weixin.qq.com" in absolute:
        return absolute
    return ""
81
+
82
+
83
def parse_wechat_search_html(html: str, keyword: str, limit: int = 20) -> list[ArticleContent]:
    """Turn a search result page into a list of WeChat article stubs.

    Every anchor that resolves to an ``mp.weixin.qq.com`` URL and has link text
    becomes one entry; duplicate URLs are skipped. Author and summary are taken
    from nodes in the enclosing ``div``/``li`` whose class matches
    ``txt-info``, ``s-p`` or ``account``. Scanning stops once ``limit`` entries
    have been collected.
    """
    soup = BeautifulSoup(html, "html.parser")
    info_class = re.compile(r"(txt-info|s-p|account)")
    results: list[ArticleContent] = []
    visited: set[str] = set()
    for anchor in soup.find_all("a", href=True):
        link = _normalize_wechat_result_url(anchor.get("href") or "")
        if not link or link in visited:
            continue
        title = _clean_text(anchor.get_text(" "))
        if not title:
            continue

        container = anchor.find_parent(["div", "li"]) or anchor.parent
        snippets = []
        if container:
            for node in container.find_all(class_=info_class):
                text = _clean_text(node.get_text(" "))
                if text:
                    snippets.append(text)
        # First snippet is used as the author; the last one as the summary
        # only when there is more than one, otherwise the title stands in.
        author = snippets[0] if snippets else ""
        summary = snippets[-1] if len(snippets) > 1 else title

        visited.add(link)
        results.append(
            ArticleContent(
                platform="wechat_mp",
                url=link,
                article_id=link,
                title=title,
                author_name=author,
                content_text=summary,
                raw_metadata={"keyword": keyword, "source": "sogou_weixin"},
            )
        )
        if len(results) >= limit:
            break
    return results
114
+
115
+
116
class WechatArticleFetcher:
    """Fetcher for WeChat official-account articles."""

    platform = "wechat_mp"

    def fetch(self, url: str) -> ArticleContent:
        """Download ``url`` and parse it as a WeChat article.

        ``ValueError`` is re-raised unchanged; any other exception is wrapped
        in ``ArticleFetchError``.
        """
        try:
            reply = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
            reply.raise_for_status()
            page = reply.text
            return parse_wechat_article_html(page, url)
        except ValueError:
            raise
        except Exception as exc:
            raise ArticleFetchError(f"微信公众号文章抓取失败:{exc}") from exc

    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
        """Query the Sogou WeChat search page for ``keyword`` and parse the results.

        Every exception is wrapped in ``ArticleFetchError``.
        """
        try:
            search_url = f"https://weixin.sogou.com/weixin?type=2&query={quote(keyword)}"
            reply = requests.get(
                search_url,
                timeout=10,
                headers={"User-Agent": "Mozilla/5.0"},
            )
            reply.raise_for_status()
            page = reply.text
            return parse_wechat_search_html(page, keyword, limit)
        except Exception as exc:
            raise ArticleFetchError(f"微信公众号关键字查询失败:{exc}") from exc

    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
        """Delegate to ``search`` with the same arguments."""
        return self.search(query, limit)
backend/app/article_fetchers/xiaohongshu.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from datetime import datetime
6
+ from urllib.parse import quote, urlparse
7
+
8
+ import requests
9
+ from bs4 import BeautifulSoup
10
+
11
+ from app.article_fetchers.base import ArticleContent, ArticleFetchError
12
+ from app.services.cookie_manager import CookieConfigManager
13
+ from app.utils.url_parser import clean_url
14
+
15
+
16
def _note_id_from_url(url: str) -> str:
    """Return the last path segment of ``url``, or ``url`` itself when the path is empty."""
    trimmed = urlparse(url).path.rstrip("/")
    if not trimmed:
        return url
    return trimmed.rsplit("/", 1)[-1]
19
+
20
+
21
def _extract_initial_state(html: str) -> dict:
    """Extract the ``window.__INITIAL_STATE__`` object literal from a page.

    The literal is located by brace matching that skips string contents, so a
    ``{`` or ``}`` inside a string value no longer cuts the object short.
    Bare ``undefined`` tokens, which are not valid JSON, become ``null``;
    the word ``undefined`` inside string values is left untouched.

    Args:
        html: Raw page HTML.

    Returns:
        The decoded state, or ``{}`` when the assignment is missing, the
        braces are unbalanced, or the literal is not valid JSON.
    """
    match = re.search(r"window\.__INITIAL_STATE__\s*=", html)
    if not match:
        return {}
    start = html.find("{", match.end())
    if start < 0:
        return {}

    depth = 0
    end = -1
    in_string = False
    escaped = False
    for index in range(start, len(html)):
        char = html[index]
        if in_string:
            # Inside a string only an unescaped quote is significant.
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char == '"':
            in_string = True
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                end = index + 1
                break
    if end < 0:
        return {}

    # Match whole string literals first so they are copied through verbatim;
    # only a bare `undefined` token outside strings is rewritten.
    raw = re.sub(
        r'"(?:\\.|[^"\\])*"|\bundefined\b',
        lambda token: "null" if token.group(0) == "undefined" else token.group(0),
        html[start:end],
    )
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return {}
46
+
47
+
48
def _first_image_url(item: dict) -> str:
    """Return the first ``http``-prefixed image URL found on ``item``.

    ``urlDefault``, ``url`` and ``traceId`` are checked on ``item`` itself,
    then ``urlDefault`` and ``url`` on its ``cover`` (or ``image``) sub-dict.
    Returns an empty string when nothing qualifies.
    """

    def pick(source: dict, keys: tuple) -> str:
        # First value under `keys` that is a string starting with "http".
        for key in keys:
            candidate = source.get(key)
            if isinstance(candidate, str) and candidate.startswith("http"):
                return candidate
        return ""

    direct = pick(item, ("urlDefault", "url", "traceId"))
    if direct:
        return direct
    nested = item.get("cover") or item.get("image") or {}
    if not isinstance(nested, dict):
        return ""
    return pick(nested, ("urlDefault", "url"))
60
+
61
+
62
def _published_at(value) -> str:
    """Convert an epoch timestamp to an ISO-8601 string with second precision.

    Values above 10_000_000_000 are treated as milliseconds and divided by
    1000. The conversion uses ``datetime.fromtimestamp`` without a timezone,
    so the result is naive local time.

    Args:
        value: Anything ``int()`` accepts.

    Returns:
        The formatted timestamp, or ``""`` when ``value`` cannot be converted
        to an integer or lies outside the range the platform can represent.
    """
    try:
        timestamp = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")).
        return ""
    if timestamp > 10_000_000_000:
        timestamp //= 1000
    try:
        return datetime.fromtimestamp(timestamp).isoformat(timespec="seconds")
    except (OverflowError, OSError, ValueError):
        # A bogus timestamp must not abort parsing of the whole note.
        return ""
70
+
71
+
72
def _article_from_note(note: dict, url: str) -> ArticleContent:
    """Build an ``ArticleContent`` from one note dict of the page state.

    The body comes from ``desc``/``description``. The title falls back to the
    first 40 characters of the body, and the id falls back to the last path
    segment of ``url``.

    Raises:
        ValueError: If the note has no body text.
    """
    author = note.get("user") or {}

    # De-duplicate image URLs while keeping their original order.
    raw_images = note.get("imageList") or note.get("images") or []
    resolved = (_first_image_url(entry) for entry in raw_images)
    images: list[str] = list(dict.fromkeys(src for src in resolved if src))

    content = str(note.get("desc") or note.get("description") or "").strip()
    title = str(note.get("title") or "").strip() or content[:40] or "小红书笔记"
    article_id = str(note.get("noteId") or note.get("id") or _note_id_from_url(url)).strip()
    if not content:
        raise ValueError("小红书笔记正文为空,无法生成总结")

    cover = images[0] if images else ""
    nickname = str(author.get("nickname") or "").strip()
    author_id = str(author.get("userId") or author.get("id") or "").strip()
    return ArticleContent(
        platform="xiaohongshu",
        url=url,
        article_id=article_id,
        title=title,
        author_name=nickname,
        author_id=author_id,
        content_text=content,
        image_urls=images,
        cover_url=cover,
        published_at=_published_at(note.get("time") or note.get("lastUpdateTime")),
        raw_metadata={"raw_note": note},
    )
99
+
100
+
101
def parse_xiaohongshu_article_html(html: str, url: str) -> ArticleContent:
    """Parse a Xiaohongshu note page into an ``ArticleContent``.

    The first note found in ``note.noteDetailMap`` of the embedded page state
    is used. When the state holds no note, the ``og:title`` and
    ``description`` meta tags are used instead.

    Args:
        html: Raw page HTML.
        url: URL of the note; stored on the result and used as the id fallback.

    Raises:
        ValueError: If no body text is available from either source.
    """
    state = _extract_initial_state(html)
    # Check each level's type so an unexpected state shape falls through to
    # the meta-tag path instead of raising AttributeError.
    note_state = state.get("note") if isinstance(state, dict) else None
    detail_map = note_state.get("noteDetailMap") if isinstance(note_state, dict) else None
    if isinstance(detail_map, dict):
        for value in detail_map.values():
            note = value.get("note") if isinstance(value, dict) else None
            if isinstance(note, dict):
                return _article_from_note(note, url)

    soup = BeautifulSoup(html, "html.parser")
    title_meta = soup.find("meta", attrs={"property": "og:title"})
    desc_meta = soup.find("meta", attrs={"name": "description"})
    # A <meta> tag without a `content` attribute yields None; coerce it to ""
    # before stripping so the intended ValueError is raised below.
    title = ((title_meta.get("content") if title_meta else "") or "").strip() or "小红书笔记"
    body = ((desc_meta.get("content") if desc_meta else "") or "").strip()
    if not body:
        raise ValueError("小红书笔记正文为空,无法生成总结")

    return ArticleContent(
        platform="xiaohongshu",
        url=url,
        article_id=_note_id_from_url(url),
        title=title,
        content_text=body,
    )
124
+
125
+
126
def _iter_note_like(value):
    """Walk nested dicts and lists depth-first, yielding every note-like dict.

    A dict is yielded when it has a truthy ``noteId`` or ``id`` together with
    a truthy ``title``, ``displayTitle``, ``desc`` or ``description``. A
    matching dict is yielded before its children are visited; anything that is
    not a dict or list is ignored.
    """
    if isinstance(value, dict):
        identifier = value.get("noteId") or value.get("id")
        headline = value.get("title") or value.get("displayTitle")
        summary = value.get("desc") or value.get("description")
        if identifier and (headline or summary):
            yield value
        children = value.values()
    elif isinstance(value, list):
        children = value
    else:
        return
    for child in children:
        yield from _iter_note_like(child)
138
+
139
+
140
def parse_xiaohongshu_discovery_html(
    html: str,
    source_url: str,
    limit: int = 20,
) -> list[ArticleContent]:
    """Collect note stubs from the state embedded in a listing page.

    Every note-like dict in the page state becomes one ``ArticleContent``
    pointing at ``https://www.xiaohongshu.com/explore/<id>``; repeated ids are
    skipped. Collection stops once ``limit`` entries have been gathered.
    """
    results: list[ArticleContent] = []
    known_ids: set[str] = set()
    for note in _iter_note_like(_extract_initial_state(html)):
        note_id = str(note.get("noteId") or note.get("id") or "").strip()
        if not note_id or note_id in known_ids:
            continue

        owner = note.get("user") or note.get("author") or {}
        image_url = _first_image_url(note)
        content = str(note.get("desc") or note.get("description") or note.get("title") or "").strip()
        title = str(note.get("title") or note.get("displayTitle") or content[:40] or "小红书笔记").strip()
        known_ids.add(note_id)

        stub = ArticleContent(
            platform="xiaohongshu",
            url=f"https://www.xiaohongshu.com/explore/{note_id}",
            article_id=note_id,
            title=title,
            author_name=str(owner.get("nickname") or owner.get("name") or "").strip(),
            author_id=str(owner.get("userId") or owner.get("id") or "").strip(),
            content_text=content,
            image_urls=[image_url] if image_url else [],
            cover_url=image_url,
            raw_metadata={"source_url": source_url},
        )
        results.append(stub)
        if len(results) >= limit:
            break
    return results
174
+
175
+
176
class XiaohongshuArticleFetcher:
    """Fetcher for Xiaohongshu notes, search results and publisher pages."""

    platform = "xiaohongshu"

    def __init__(self):
        self._cookie_mgr = CookieConfigManager()

    def _headers(self) -> dict:
        """Return request headers, adding the stored ``xiaohongshu`` cookie when there is one."""
        cookie = self._cookie_mgr.get("xiaohongshu")
        if cookie:
            return {"User-Agent": "Mozilla/5.0", "Cookie": cookie}
        return {"User-Agent": "Mozilla/5.0"}

    def fetch(self, url: str) -> ArticleContent:
        """Download one note page and parse it.

        ``ValueError`` is re-raised unchanged; any other exception is wrapped
        in ``ArticleFetchError``.
        """
        target = clean_url(url)
        try:
            reply = requests.get(target, timeout=10, headers=self._headers(), allow_redirects=True)
            reply.raise_for_status()
            page = reply.text
            return parse_xiaohongshu_article_html(page, reply.url or target)
        except ValueError:
            raise
        except Exception as exc:
            raise ArticleFetchError(f"小红书笔记抓取失败:{exc}") from exc

    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
        """Load the search result page for ``keyword`` and collect its note stubs.

        Exceptions raised while requesting or parsing are wrapped in
        ``ArticleFetchError``.
        """
        search_url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}"
        try:
            reply = requests.get(search_url, timeout=10, headers=self._headers())
            reply.raise_for_status()
            page = reply.text
            return parse_xiaohongshu_discovery_html(page, search_url, limit)
        except Exception as exc:
            raise ArticleFetchError(f"小红书关键字查询失败:{exc}") from exc

    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
        """Load a publisher page and collect its note stubs.

        ``query`` is used as the URL when, after cleaning, it starts with
        ``http``; otherwise it is treated as a profile id. Exceptions raised
        while requesting or parsing are wrapped in ``ArticleFetchError``.
        """
        profile_url = clean_url(query)
        if not profile_url.startswith("http"):
            profile_url = f"https://www.xiaohongshu.com/user/profile/{quote(query)}"
        try:
            reply = requests.get(profile_url, timeout=10, headers=self._headers(), allow_redirects=True)
            reply.raise_for_status()
            page = reply.text
            return parse_xiaohongshu_discovery_html(page, reply.url or profile_url, limit)
        except Exception as exc:
            raise ArticleFetchError(f"小红书发布者订阅刷新失败:{exc}") from exc
backend/app/core/__init__.py ADDED
File without changes
backend/app/db/__init__.py ADDED
File without changes
backend/app/db/article_dao.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ from datetime import datetime
6
+
7
+ from app.article_fetchers.base import ArticleContent
8
+ from app.db.engine import get_db
9
+ from app.db.models.articles import ArticleItem, ArticleSubscription, ArticleSubscriptionItem
10
+
11
+
12
def url_hash(url: str) -> str:
    """Return the hex-encoded SHA-256 digest of *url* encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(url.encode("utf-8"))
    return hasher.hexdigest()
14
+
15
+
16
def _detach(obj):
    """Remove underscore-prefixed entries from ``obj.__dict__`` in place and return *obj*.

    Presumably used to drop ORM bookkeeping (e.g. ``_sa_instance_state``) so
    the object can be handed out after its session is closed — confirm with
    callers.
    """
    public_state = {}
    for name, value in obj.__dict__.items():
        if name.startswith("_"):
            continue
        public_state[name] = value
    obj.__dict__.clear()
    obj.__dict__.update(public_state)
    return obj
21
+
22
+
23
def upsert_article_item(article: ArticleContent) -> ArticleItem:
    """Insert or refresh the stored row for *article* and return it detached.

    Lookup order: by ``(platform, article_id)`` when the article carries an
    id, then by ``(platform, url_hash)``. When neither matches, a new row is
    created. All mutable fields are then overwritten from *article* and the
    row is committed.

    Args:
        article: Parsed article content to persist.

    Returns:
        The persisted ``ArticleItem``, passed through ``_detach``.
    """
    db = next(get_db())
    try:
        digest = url_hash(article.url)
        item = None
        if article.article_id:
            item = (
                db.query(ArticleItem)
                .filter_by(platform=article.platform, article_id=article.article_id)
                .first()
            )
        if item is None:
            item = db.query(ArticleItem).filter_by(platform=article.platform, url_hash=digest).first()
        if item is None:
            item = ArticleItem(
                platform=article.platform,
                # (platform, article_id) is declared unique on ArticleItem, so storing an
                # empty id for every id-less article would make the second insert on the
                # same platform fail. Fall back to the URL digest, which is unique per URL.
                article_id=article.article_id or digest,
                url_hash=digest,
                url=article.url,
                title=article.title,
            )
            db.add(item)
        item.url = article.url
        item.title = article.title
        item.author_name = article.author_name
        item.author_id = article.author_id
        item.cover_url = article.cover_url
        item.published_at = article.published_at
        item.content_text = article.content_text
        # raw_metadata is stored as a JSON string; a missing value is stored as "{}".
        item.raw_metadata = json.dumps(article.raw_metadata or {}, ensure_ascii=False)
        db.commit()
        # Reload so server-generated columns are populated before detaching.
        db.refresh(item)
        return _detach(item)
    finally:
        db.close()
58
+
59
+
60
def get_article_item(item_id: int) -> ArticleItem | None:
    """Return the detached article row with primary key *item_id*, or ``None`` if absent."""
    session = next(get_db())
    try:
        row = session.query(ArticleItem).filter(ArticleItem.id == item_id).first()
        if not row:
            return None
        return _detach(row)
    finally:
        session.close()
67
+
68
+
69
def list_article_items(subscription_id: int | None = None) -> list[ArticleItem]:
    """List article rows, newest id first, optionally limited to one subscription.

    When *subscription_id* is given, only articles linked to it through
    ``ArticleSubscriptionItem`` are returned.
    """
    session = next(get_db())
    try:
        stmt = session.query(ArticleItem)
        if subscription_id is not None:
            link = ArticleSubscriptionItem
            stmt = stmt.join(link, link.article_item_id == ArticleItem.id)
            stmt = stmt.filter(link.subscription_id == subscription_id)
        rows = stmt.order_by(ArticleItem.id.desc()).all()
        return [_detach(row) for row in rows]
    finally:
        session.close()
81
+
82
+
83
def mark_article_summarized(item_id: int, task_id: str) -> None:
    """Set ``summary_status`` to "summarized" and record *task_id* on the article, if it exists."""
    session = next(get_db())
    try:
        row = session.query(ArticleItem).filter(ArticleItem.id == item_id).first()
        if not row:
            # Unknown id: nothing to update.
            return
        row.summary_status = "summarized"
        row.task_id = task_id
        session.commit()
    finally:
        session.close()
93
+
94
+
95
def create_subscription(
    platform: str,
    subscription_type: str,
    query: str,
    label: str = "",
) -> ArticleSubscription:
    """Persist a new article subscription and return it detached.

    An empty *label* falls back to *query*.
    """
    session = next(get_db())
    try:
        display_label = label if label else query
        record = ArticleSubscription(
            platform=platform,
            type=subscription_type,
            query=query,
            label=display_label,
        )
        session.add(record)
        session.commit()
        session.refresh(record)
        return _detach(record)
    finally:
        session.close()
115
+
116
+
117
def list_subscriptions() -> list[ArticleSubscription]:
    """Return every article subscription, highest id first, each detached."""
    session = next(get_db())
    try:
        rows = session.query(ArticleSubscription).order_by(ArticleSubscription.id.desc()).all()
        return [_detach(row) for row in rows]
    finally:
        session.close()
126
+
127
+
128
def get_subscription(subscription_id: int) -> ArticleSubscription | None:
    """Return the detached subscription with primary key *subscription_id*, or ``None``."""
    session = next(get_db())
    try:
        row = (
            session.query(ArticleSubscription)
            .filter(ArticleSubscription.id == subscription_id)
            .first()
        )
        if not row:
            return None
        return _detach(row)
    finally:
        session.close()
135
+
136
+
137
def update_subscription_refresh(subscription_id: int, error: str = "") -> None:
    """Stamp the subscription with the current local time and the given *error* text.

    Does nothing when the subscription does not exist.
    """
    session = next(get_db())
    try:
        row = (
            session.query(ArticleSubscription)
            .filter(ArticleSubscription.id == subscription_id)
            .first()
        )
        if not row:
            return
        row.last_refresh_at = datetime.now()
        row.last_error = error
        session.commit()
    finally:
        session.close()
147
+
148
+
149
def link_subscription_item(subscription_id: int, article_item_id: int, match_reason: str) -> None:
    """Link an article to a subscription unless that link already exists."""
    session = next(get_db())
    try:
        already_linked = (
            session.query(ArticleSubscriptionItem)
            .filter(
                ArticleSubscriptionItem.subscription_id == subscription_id,
                ArticleSubscriptionItem.article_item_id == article_item_id,
            )
            .first()
        )
        if already_linked is not None:
            return
        link = ArticleSubscriptionItem(
            subscription_id=subscription_id,
            article_item_id=article_item_id,
            match_reason=match_reason,
        )
        session.add(link)
        session.commit()
    finally:
        session.close()
backend/app/db/builtin_providers.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "openai",
4
+ "name": "OpenAI",
5
+ "type": "built-in",
6
+ "logo": "OpenAI",
7
+ "api_key": "",
8
+ "base_url": "https://api.openai.com/v1",
9
+ "enabled": 0
10
+ },
11
+ {
12
+ "id": "deepseek",
13
+ "name": "DeepSeek",
14
+ "type": "built-in",
15
+ "logo": "DeepSeek",
16
+ "api_key": "",
17
+ "base_url": "https://api.deepseek.com",
18
+ "enabled": 1
19
+ },
20
+ {
21
+ "id": "qwen",
22
+ "name": "Qwen",
23
+ "type": "built-in",
24
+ "logo": "Qwen",
25
+ "api_key": "",
26
+ "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
27
+ "enabled": 0
28
+ },
29
+ {
30
+ "id": "Claude",
31
+ "name": "Claude",
32
+ "type": "built-in",
33
+ "logo": "Claude",
34
+ "api_key": "",
35
+ "base_url": "https://",
36
+ "enabled": 0
37
+ },
38
+ {
39
+ "id": "gemini",
40
+ "name": "Gemini",
41
+ "type": "built-in",
42
+ "logo": "Gemini",
43
+ "api_key": "",
44
+ "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
45
+ "enabled": 0
46
+ },
47
+ {
48
+ "id": "groq",
49
+ "name": "Groq",
50
+ "type": "built-in",
51
+ "logo": "Groq",
52
+ "api_key": "",
53
+ "base_url": "https://api.groq.com/openai/v1",
54
+ "enabled": 0
55
+ },
56
+ {
57
+ "id": "ollama",
58
+ "name": "ollama",
59
+ "type": "built-in",
60
+ "logo": "Ollama",
61
+ "api_key": "",
62
+ "base_url": "http://127.0.0.1:11434/v1",
63
+ "enabled": 0
64
+ }
65
+ ]
backend/app/db/engine.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from sqlalchemy import create_engine
3
+ from sqlalchemy.orm import sessionmaker, declarative_base
4
+ from dotenv import load_dotenv
5
+
6
load_dotenv()

# SQLite is the default; set DATABASE_URL in .env to switch to PostgreSQL or MySQL.
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///video_memo.db")

_is_sqlite = DATABASE_URL.startswith("sqlite")

# SQLite needs a driver-specific connect argument; other databases get none.
engine_args = {"connect_args": {"check_same_thread": False}} if _is_sqlite else {}

# Connection-pool tuning is only passed for non-SQLite databases.
_pool_args = (
    {}
    if _is_sqlite
    else {
        "pool_size": int(os.getenv("DB_POOL_SIZE", "10")),
        "max_overflow": int(os.getenv("DB_MAX_OVERFLOW", "20")),
        "pool_pre_ping": True,
    }
)

# SQL echo is enabled only when SQLALCHEMY_ECHO is "true" (case-insensitive).
_echo_sql = os.getenv("SQLALCHEMY_ECHO", "false").lower() == "true"

engine = create_engine(DATABASE_URL, echo=_echo_sql, **engine_args, **_pool_args)

SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()
34
+
35
+
36
def get_engine():
    """Return the module-level SQLAlchemy engine."""
    return engine
38
+
39
+
40
def get_db():
    """Yield a new ``SessionLocal`` session and close it when the generator finishes."""
    # The session's context manager closes it on exit, including when the
    # generator is closed or an exception is thrown into it.
    with SessionLocal() as db:
        yield db
backend/app/db/init_db.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.db.models.articles import ArticleItem, ArticleSubscription, ArticleSubscriptionItem
2
+ from app.db.models.models import Model
3
+ from app.db.models.providers import Provider
4
+ from app.db.models.trend_subscription import (
5
+ NotificationChannel,
6
+ TrendSubscription,
7
+ TrendSubscriptionMatch,
8
+ )
9
+ from app.db.models.video_tasks import VideoTask
10
+ from app.db.engine import get_engine, Base
11
+ from sqlalchemy import inspect, text
12
+
13
def init_db():
    """Create all tables registered on ``Base`` and apply the ``article_items`` column patch."""
    db_engine = get_engine()
    Base.metadata.create_all(bind=db_engine)
    _ensure_article_content_text(db_engine)
18
+
19
+
20
+ # 注:原 _ensure_model_columns 为 models.supports_multimodal 做的迁移已删除——
21
+ # 该列在「drop multimodal」重构后已不再被 ORM 使用(纯遗留),且它的
22
+ # `ALTER ... BOOLEAN NOT NULL DEFAULT 0` 在 Postgres 上会因 boolean 默认值类型不符直接报错。
23
+ # 已有 SQLite 库里残留的该列无害,保持不动即可。
24
+
25
+
26
def _ensure_article_content_text(engine):
    """Add the ``content_text`` column to ``article_items`` when the table exists without it."""
    inspector = inspect(engine)
    table_exists = "article_items" in inspector.get_table_names()
    if not table_exists:
        return
    existing_columns = [col["name"] for col in inspector.get_columns("article_items")]
    if "content_text" not in existing_columns:
        ddl = text("ALTER TABLE article_items ADD COLUMN content_text TEXT NOT NULL DEFAULT ''")
        # engine.begin() commits on success and rolls back on error.
        with engine.begin() as conn:
            conn.execute(ddl)
backend/app/db/model_dao.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.db.engine import get_db
2
+ from app.db.models.models import Model
3
+ from app.db.models.providers import Provider
4
+
5
+
6
def get_model_by_provider_and_name(provider_id: int, model_name: str):
    """Return the first model matching provider and name as a plain dict, or ``None``."""
    session = next(get_db())
    try:
        row = (
            session.query(Model)
            .filter(Model.provider_id == provider_id, Model.model_name == model_name)
            .first()
        )
        if not row:
            return None
        return dict(
            id=row.id,
            provider_id=row.provider_id,
            model_name=row.model_name,
            created_at=row.created_at,
        )
    finally:
        session.close()
20
+
21
+
22
def insert_model(provider_id: int, model_name: str):
    """Insert a model row and return its id, provider id, name and creation time as a dict."""
    session = next(get_db())
    try:
        row = Model(provider_id=provider_id, model_name=model_name)
        session.add(row)
        session.commit()
        # Reload so the generated id and created_at are available.
        session.refresh(row)
        return dict(
            id=row.id,
            provider_id=row.provider_id,
            model_name=row.model_name,
            created_at=row.created_at,
        )
    finally:
        session.close()
37
+
38
+
39
def get_models_by_provider(provider_id: int):
    """Return ``{"id", "model_name"}`` dicts for every model of the given provider."""
    session = next(get_db())
    try:
        rows = session.query(Model).filter(Model.provider_id == provider_id).all()
        result = []
        for row in rows:
            result.append({"id": row.id, "model_name": row.model_name})
        return result
    finally:
        session.close()
46
+
47
+
48
def delete_model(model_id: int):
    """Delete the model with primary key *model_id*; do nothing when it does not exist."""
    session = next(get_db())
    try:
        row = session.query(Model).filter(Model.id == model_id).first()
        if not row:
            return
        session.delete(row)
        session.commit()
    finally:
        session.close()
57
+
58
+
59
def get_all_models():
    """Return id, provider id and name for every model whose provider is enabled."""
    session = next(get_db())
    try:
        # Only models belonging to an enabled provider are returned.
        enabled_models = (
            session.query(Model)
            .join(Provider, Model.provider_id == Provider.id)
            .filter(Provider.enabled == 1)
            .all()
        )
        result = []
        for row in enabled_models:
            result.append({"id": row.id, "provider_id": row.provider_id, "model_name": row.model_name})
        return result
    finally:
        session.close()
backend/app/db/models/__init__.py ADDED
File without changes
backend/app/db/models/articles.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint, func
2
+
3
+ from app.db.engine import Base
4
+
5
+
6
class ArticleItem(Base):
    """ORM row for one fetched article, stored in ``article_items``."""

    __tablename__ = "article_items"
    # An article is unique per platform both by its platform-side id and by the hash of its URL.
    # NOTE(review): article_id defaults to "" yet is part of a unique constraint, so two rows
    # on the same platform cannot both carry an empty article_id.
    __table_args__ = (
        UniqueConstraint("platform", "article_id", name="uq_article_platform_article_id"),
        UniqueConstraint("platform", "url_hash", name="uq_article_platform_url_hash"),
    )

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Identity of the article.
    platform = Column(String, nullable=False)
    article_id = Column(String, nullable=False, default="")
    url = Column(Text, nullable=False)
    url_hash = Column(String, nullable=False)

    # Descriptive fields.
    title = Column(String, nullable=False)
    author_name = Column(String, nullable=False, default="")
    author_id = Column(String, nullable=False, default="")

    # Summarization bookkeeping; new rows start as "pending".
    summary_status = Column(String, nullable=False, default="pending")
    task_id = Column(String, nullable=False, default="")

    cover_url = Column(Text, nullable=False, default="")
    published_at = Column(String, nullable=False, default="")  # kept as a string, not a DateTime
    content_text = Column(Text, nullable=False, default="")
    discovered_at = Column(DateTime, server_default=func.now())
    raw_metadata = Column(Text, nullable=False, default="{}")  # JSON-encoded text
28
+
29
+
30
class ArticleSubscription(Base):
    """ORM row for one article subscription, stored in ``article_subscriptions``."""

    __tablename__ = "article_subscriptions"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # What is being followed.
    platform = Column(String, nullable=False)
    type = Column(String, nullable=False)
    query = Column(Text, nullable=False)
    label = Column(String, nullable=False, default="")
    enabled = Column(Boolean, nullable=False, default=True)

    # Outcome of the most recent refresh; last_refresh_at stays NULL until one is recorded.
    last_refresh_at = Column(DateTime, nullable=True)
    last_error = Column(Text, nullable=False, default="")

    # Timestamps filled in by the database.
    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
43
+
44
+
45
class ArticleSubscriptionItem(Base):
    """Association row linking an article to a subscription that matched it."""

    __tablename__ = "article_subscription_items"
    # Each (subscription, article) pair may be linked at most once.
    __table_args__ = (
        UniqueConstraint("subscription_id", "article_item_id", name="uq_subscription_article_item"),
    )

    id = Column(Integer, primary_key=True, autoincrement=True)
    subscription_id = Column(Integer, ForeignKey("article_subscriptions.id"), nullable=False)
    article_item_id = Column(Integer, ForeignKey("article_items.id"), nullable=False)
    matched_at = Column(DateTime, server_default=func.now())
    match_reason = Column(Text, nullable=False, default="")
backend/app/db/models/models.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, Integer, String, DateTime, func, ForeignKey
2
+
3
+ from app.db.engine import Base
4
+
5
+
6
class Model(Base):
    """ORM row for one model name registered under a provider, stored in ``models``."""

    __tablename__ = "models"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Provider ids are strings (the providers table uses a String primary key and the
    # built-in ids are names such as "openai"), so this column must be a string too.
    # An Integer column only works on SQLite's loose typing and breaks inserts and the
    # provider join on strictly typed databases.
    provider_id = Column(String, nullable=False)
    model_name = Column(String, nullable=False)
    created_at = Column(DateTime, server_default=func.now())
backend/app/db/models/providers.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, String, Integer, DateTime, func
2
+ from sqlalchemy.orm import declarative_base
3
+
4
+ from app.db.engine import Base
5
+
6
+
7
class Provider(Base):
    """ORM row for one model provider, stored in ``providers``."""

    __tablename__ = "providers"

    # String primary key supplied by the caller, not auto-generated.
    id = Column(String, primary_key=True)
    name = Column(String, nullable=False)
    logo = Column(String, nullable=False)
    type = Column(String, nullable=False)

    # Connection settings.
    api_key = Column(String, nullable=False)
    base_url = Column(String, nullable=False)

    # Stored as an integer flag; defaults to 1.
    enabled = Column(Integer, default=1)
    created_at = Column(DateTime, server_default=func.now())
backend/app/db/models/trend_subscription.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, func
2
+
3
+ from app.db.engine import Base
4
+
5
+
6
class TrendSubscription(Base):
    """ORM row for one keyword-based trend subscription, stored in ``trend_subscriptions``."""

    __tablename__ = "trend_subscriptions"

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String, nullable=False)

    # Matching rules; list-valued fields are stored as JSON text.
    keywords = Column(Text, nullable=False, default="[]")  # JSON array of keyword strings
    platforms = Column(Text, nullable=False, default='["all"]')  # JSON array of platform ids
    match_mode = Column(String, nullable=False, default="any")  # "any" | "all"
    enabled = Column(Boolean, nullable=False, default=True)

    # Push notification settings.
    push_enabled = Column(Boolean, nullable=False, default=False)
    push_channel_ids = Column(Text, nullable=False, default="[]")  # JSON array of channel ids

    last_matched_at = Column(DateTime, nullable=True)
    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
20
+
21
+
22
class TrendSubscriptionMatch(Base):
    """ORM row for one trending item that matched a subscription."""

    __tablename__ = "trend_subscription_matches"
    # NOTE: no unique constraint is declared for (subscription_id, platform, item_id);
    # only the SQLite autoincrement option is set here, so de-duplication of matches
    # is not enforced by this table definition.
    __table_args__ = {"sqlite_autoincrement": True}

    id = Column(Integer, primary_key=True, autoincrement=True)
    subscription_id = Column(Integer, ForeignKey("trend_subscriptions.id"), nullable=False)

    # The matched item.
    platform = Column(String, nullable=False)
    item_id = Column(String, nullable=False)
    title = Column(String, nullable=False)
    url = Column(Text, nullable=False, default="")
    hot_score = Column(String, nullable=False, default="")

    matched_keywords = Column(Text, nullable=False, default="[]")  # JSON array of matched keywords
    matched_at = Column(DateTime, server_default=func.now())
    is_read = Column(Boolean, nullable=False, default=False)
39
+
40
+
41
class NotificationChannel(Base):
    """ORM row for one push-notification channel, stored in ``notification_channels``."""

    __tablename__ = "notification_channels"

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String, nullable=False)
    type = Column(String, nullable=False)  # "webhook" | "bark" | "email"
    config = Column(Text, nullable=False, default="{}")  # JSON object, type-specific
    enabled = Column(Boolean, nullable=False, default=True)

    # Timestamps filled in by the database.
    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
backend/app/db/models/video_tasks.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, Integer, String, DateTime, func
2
+ from sqlalchemy.orm import declarative_base
3
+
4
+ from app.db.engine import Base
5
+
6
+
7
class VideoTask(Base):
    """ORM row mapping a platform video to a processing task id, stored in ``video_tasks``."""

    __tablename__ = "video_tasks"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # The video is identified by its id together with its platform.
    video_id = Column(String, nullable=False)
    platform = Column(String, nullable=False)

    # Task ids are unique across the table.
    task_id = Column(String, unique=True, nullable=False)
    created_at = Column(DateTime, server_default=func.now())
backend/app/db/provider_dao.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import sys
4
+ from app.db.models.providers import Provider
5
+ from app.utils.logger import get_logger
6
+ from app.db.engine import get_engine, Base, get_db
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
def get_builtin_providers_path():
    """Return the path of ``builtin_providers.json``.

    When ``sys.frozen`` is set (a bundled executable), the file is looked up
    under ``sys._MEIPASS``; otherwise it is looked up next to this module.
    """
    is_frozen = getattr(sys, 'frozen', False)
    base_path = sys._MEIPASS if is_frozen else os.path.dirname(__file__)
    return os.path.join(base_path, 'builtin_providers.json')
17
+
18
+
19
def seed_default_providers():
    """Populate the providers table from ``builtin_providers.json`` when it is empty.

    Any failure is logged and swallowed; nothing is raised to the caller.
    """
    session = next(get_db())
    try:
        already_seeded = session.query(Provider).count() > 0
        if already_seeded:
            logger.info("Providers already exist, skipping seed.")
            return

        json_path = get_builtin_providers_path()
        try:
            with open(json_path, 'r', encoding='utf-8') as fh:
                entries = json.load(fh)
        except Exception as e:
            logger.error(f"Failed to read builtin_providers.json: {e}")
            return

        for entry in entries:
            record = Provider(
                id=entry['id'],
                name=entry['name'],
                api_key=entry['api_key'],
                base_url=entry['base_url'],
                logo=entry['logo'],
                type=entry['type'],
                # "enabled" is optional in the JSON and defaults to 1.
                enabled=entry.get('enabled', 1),
            )
            session.add(record)
        session.commit()
        logger.info("Default providers seeded successfully.")
    except Exception as e:
        logger.error(f"Failed to seed default providers: {e}")
    finally:
        session.close()
50
+
51
+
52
def insert_provider(id: str, name: str, api_key: str, base_url: str, logo: str, type_: str, enabled: int = 1):
    """Insert a provider row and return its *id*.

    Failures are logged and swallowed, in which case ``None`` is returned.
    """
    session = next(get_db())
    try:
        record = Provider(
            id=id,
            name=name,
            api_key=api_key,
            base_url=base_url,
            logo=logo,
            type=type_,
            enabled=enabled,
        )
        session.add(record)
        session.commit()
        logger.info(f"Provider inserted successfully. id: {id}, name: {name}, type: {type_}")
        return id
    except Exception as e:
        logger.error(f"Failed to insert provider: {e}")
    finally:
        session.close()
64
+
65
+
66
def get_enabled_providers():
    """Return all provider rows whose ``enabled`` flag equals 1."""
    session = next(get_db())
    try:
        enabled_rows = session.query(Provider).filter(Provider.enabled == 1).all()
        return enabled_rows
    finally:
        session.close()
72
+
73
+
74
def get_provider_by_name(name: str):
    """Return the first provider row with the given *name*, or ``None``."""
    session = next(get_db())
    try:
        match = session.query(Provider).filter(Provider.name == name).first()
        return match
    finally:
        session.close()
80
+
81
+
82
def get_provider_by_id(id: str):
    """Return the provider row with primary key *id*, or ``None``."""
    session = next(get_db())
    try:
        match = session.query(Provider).filter(Provider.id == id).first()
        return match
    finally:
        session.close()
88
+
89
+
90
def get_all_providers():
    """Return every provider row."""
    session = next(get_db())
    try:
        every_row = session.query(Provider).all()
        return every_row
    finally:
        session.close()
96
+
97
+
98
def update_provider(id: str, **kwargs):
    """Update the named attributes on the provider with primary key *id*.

    Keyword arguments that are not attributes of the provider are ignored.
    A missing provider is logged as a warning; other failures are logged and
    swallowed.
    """
    session = next(get_db())
    try:
        record = session.query(Provider).filter(Provider.id == id).first()
        if not record:
            logger.warning(f"Provider {id} not found for update.")
            return

        for field, new_value in kwargs.items():
            if not hasattr(record, field):
                continue
            setattr(record, field, new_value)

        session.commit()
        logger.info(f"Provider updated successfully. id: {id}, updated_fields: {list(kwargs.keys())}")
    except Exception as e:
        logger.error(f"Failed to update provider: {e}")
    finally:
        session.close()
116
+
117
+
118
def delete_provider(id: str):
    """Delete the provider with primary key *id*; failures are logged and swallowed."""
    session = next(get_db())
    try:
        record = session.query(Provider).filter(Provider.id == id).first()
        if not record:
            # Nothing to delete and nothing is logged.
            return
        session.delete(record)
        session.commit()
        logger.info(f"Provider deleted successfully. id: {id}")
    except Exception as e:
        logger.error(f"Failed to delete provider: {e}")
    finally:
        session.close()
backend/app/db/sqlite_client.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import sqlite3
2
+
3
def get_connection(db_path: str = "video_memo.db"):
    """Open and return a new ``sqlite3`` connection.

    Args:
        db_path: Database file to open. Defaults to ``"video_memo.db"`` in the
            current working directory, which is what this function always
            opened before the parameter existed. ``":memory:"`` gives an
            in-memory database.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for closing it.
    """
    return sqlite3.connect(db_path)
backend/app/db/trend_subscription_dao.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from datetime import datetime
5
+
6
+ from app.db.engine import get_db
7
+ from app.db.models.trend_subscription import (
8
+ NotificationChannel,
9
+ TrendSubscription,
10
+ TrendSubscriptionMatch,
11
+ )
12
+
13
+
14
def _detach(obj):
    """Keep only the non-underscore entries of ``obj.__dict__`` (in place) and return *obj*.

    Presumably this drops ORM bookkeeping such as ``_sa_instance_state`` so the
    object can be used after its session is closed — confirm with callers.
    """
    state = obj.__dict__
    kept = [(name, value) for name, value in state.items() if not name.startswith("_")]
    state.clear()
    state.update(kept)
    return obj
19
+
20
+
21
+ # ─── Trend Subscriptions ──────────────────────────────────────────────────────────
22
+
23
def create_subscription(
    name: str,
    keywords: list[str],
    platforms: list[str] | None = None,
    match_mode: str = "any",
    push_enabled: bool = False,
    push_channel_ids: list[int] | None = None,
) -> TrendSubscription:
    """Persist a new trend subscription and return it detached.

    List arguments are stored as JSON text. A missing or empty *platforms*
    becomes ``["all"]``; a missing or empty *push_channel_ids* becomes ``[]``.
    """
    session = next(get_db())
    try:
        keywords_json = json.dumps(keywords, ensure_ascii=False)
        platforms_json = json.dumps(platforms or ["all"], ensure_ascii=False)
        channels_json = json.dumps(push_channel_ids or [])
        record = TrendSubscription(
            name=name,
            keywords=keywords_json,
            platforms=platforms_json,
            match_mode=match_mode,
            push_enabled=push_enabled,
            push_channel_ids=channels_json,
        )
        session.add(record)
        session.commit()
        session.refresh(record)
        return _detach(record)
    finally:
        session.close()
47
+
48
+
49
def list_subscriptions() -> list[TrendSubscription]:
    """Return every trend subscription, highest id first, each detached."""
    session = next(get_db())
    try:
        rows = session.query(TrendSubscription).order_by(TrendSubscription.id.desc()).all()
        return [_detach(row) for row in rows]
    finally:
        session.close()
58
+
59
+
60
def get_subscription(subscription_id: int) -> TrendSubscription | None:
    """Return the detached trend subscription with the given id, or ``None``."""
    session = next(get_db())
    try:
        row = (
            session.query(TrendSubscription)
            .filter(TrendSubscription.id == subscription_id)
            .first()
        )
        if not row:
            return None
        return _detach(row)
    finally:
        session.close()
67
+
68
+
69
def update_subscription(
    subscription_id: int,
    name: str | None = None,
    keywords: list[str] | None = None,
    platforms: list[str] | None = None,
    match_mode: str | None = None,
    enabled: bool | None = None,
    push_enabled: bool | None = None,
    push_channel_ids: list[int] | None = None,
) -> TrendSubscription | None:
    """Apply the non-``None`` arguments to a trend subscription.

    Returns the updated, detached row, or ``None`` when no subscription has
    the given id. List arguments are stored as JSON text.
    """
    session = next(get_db())
    try:
        record = (
            session.query(TrendSubscription)
            .filter(TrendSubscription.id == subscription_id)
            .first()
        )
        if record is None:
            return None

        # Column name -> new value; None means "leave this column unchanged".
        changes = {
            "name": name,
            "keywords": None if keywords is None else json.dumps(keywords, ensure_ascii=False),
            "platforms": None if platforms is None else json.dumps(platforms, ensure_ascii=False),
            "match_mode": match_mode,
            "enabled": enabled,
            "push_enabled": push_enabled,
            "push_channel_ids": None if push_channel_ids is None else json.dumps(push_channel_ids),
        }
        for column, new_value in changes.items():
            if new_value is not None:
                setattr(record, column, new_value)

        session.commit()
        session.refresh(record)
        return _detach(record)
    finally:
        session.close()
103
+
104
+
105
def delete_subscription(subscription_id: int) -> bool:
    """Delete a trend subscription together with its match rows.

    Returns ``True`` when a subscription was deleted, ``False`` when none
    has the given id.
    """
    session = next(get_db())
    try:
        record = (
            session.query(TrendSubscription)
            .filter(TrendSubscription.id == subscription_id)
            .first()
        )
        if record is None:
            return False
        # Remove the dependent match rows before the subscription itself.
        matches = session.query(TrendSubscriptionMatch).filter(
            TrendSubscriptionMatch.subscription_id == subscription_id
        )
        matches.delete()
        session.delete(record)
        session.commit()
        return True
    finally:
        session.close()
118
+
119
+
120
def update_subscription_refresh(subscription_id: int) -> None:
    """Set ``last_matched_at`` to the current local time; do nothing for an unknown id."""
    session = next(get_db())
    try:
        record = (
            session.query(TrendSubscription)
            .filter(TrendSubscription.id == subscription_id)
            .first()
        )
        if not record:
            return
        record.last_matched_at = datetime.now()
        session.commit()
    finally:
        session.close()
129
+
130
+
131
+ # ─── Trend Subscription Matches ───────────────────────────────────────────────────
132
+
133
def create_match(
    subscription_id: int,
    platform: str,
    item_id: str,
    title: str,
    url: str = "",
    hot_score: str = "",
    matched_keywords: list[str] | None = None,
) -> TrendSubscriptionMatch | None:
    """Create a match record. Returns None if this (subscription, platform, item_id) already exists."""
    session = next(get_db())
    try:
        duplicate = (
            session.query(TrendSubscriptionMatch)
            .filter(
                TrendSubscriptionMatch.subscription_id == subscription_id,
                TrendSubscriptionMatch.platform == platform,
                TrendSubscriptionMatch.item_id == item_id,
            )
            .first()
        )
        if duplicate is not None:
            # This item was already recorded for the subscription.
            return None

        keywords_json = json.dumps(matched_keywords or [], ensure_ascii=False)
        record = TrendSubscriptionMatch(
            subscription_id=subscription_id,
            platform=platform,
            item_id=item_id,
            title=title,
            url=url,
            hot_score=hot_score,
            matched_keywords=keywords_json,
        )
        session.add(record)
        session.commit()
        session.refresh(record)
        return _detach(record)
    finally:
        session.close()
167
+
168
+
169
def list_matches(
    subscription_id: int | None = None,
    limit: int = 100,
    unread_only: bool = False,
) -> list[TrendSubscriptionMatch]:
    """Return up to *limit* match rows, most recently matched first.

    Args:
        subscription_id: When given, only matches of this subscription.
        limit: Maximum number of rows to return.
        unread_only: When true, only rows with ``is_read`` false.

    Returns:
        Detached ``TrendSubscriptionMatch`` rows.
    """
    db = next(get_db())
    try:
        query = db.query(TrendSubscriptionMatch)
        if subscription_id is not None:
            query = query.filter_by(subscription_id=subscription_id)
        if unread_only:
            query = query.filter_by(is_read=False)
        # Rows recorded with the same timestamp have no defined order by matched_at
        # alone, which makes the LIMIT cut arbitrary; break ties by id (newest first).
        ordered = query.order_by(
            TrendSubscriptionMatch.matched_at.desc(),
            TrendSubscriptionMatch.id.desc(),
        )
        return [_detach(item) for item in ordered.limit(limit).all()]
    finally:
        db.close()
189
+
190
+
191
def mark_matches_read(subscription_id: int) -> int:
    """Mark all matches for a subscription as read. Returns count of updated rows."""
    session = next(get_db())
    try:
        unread = session.query(TrendSubscriptionMatch).filter_by(
            subscription_id=subscription_id, is_read=False
        )
        updated_rows = unread.update({"is_read": True})
        session.commit()
        return updated_rows
    finally:
        session.close()
204
+
205
+
206
def count_unread_matches(subscription_id: int) -> int:
    """Return how many matches of the subscription still have ``is_read`` false."""
    session = next(get_db())
    try:
        unread = session.query(TrendSubscriptionMatch).filter_by(
            subscription_id=subscription_id, is_read=False
        )
        return unread.count()
    finally:
        session.close()
216
+
217
+
218
+ # ─── Notification Channels ────────────────────────────────────────────────────────
219
+
220
def create_channel(name: str, channel_type: str, config: dict | None = None) -> NotificationChannel:
    """Persist a new notification channel and return it detached.

    *config* is stored as JSON text; a missing or empty value is stored as ``{}``.
    """
    session = next(get_db())
    try:
        config_json = json.dumps(config or {}, ensure_ascii=False)
        record = NotificationChannel(name=name, type=channel_type, config=config_json)
        session.add(record)
        session.commit()
        session.refresh(record)
        return _detach(record)
    finally:
        session.close()
234
+
235
+
236
def list_channels() -> list[NotificationChannel]:
    """Return every notification channel, highest id first, each detached."""
    session = next(get_db())
    try:
        rows = session.query(NotificationChannel).order_by(NotificationChannel.id.desc()).all()
        return [_detach(row) for row in rows]
    finally:
        session.close()
245
+
246
+
247
def get_channel(channel_id: int) -> NotificationChannel | None:
    """Return the detached notification channel with the given id, or ``None``."""
    session = next(get_db())
    try:
        row = (
            session.query(NotificationChannel)
            .filter(NotificationChannel.id == channel_id)
            .first()
        )
        if not row:
            return None
        return _detach(row)
    finally:
        session.close()
254
+
255
+
256
def update_channel(
    channel_id: int,
    name: str | None = None,
    channel_type: str | None = None,
    config: dict | None = None,
    enabled: bool | None = None,
) -> NotificationChannel | None:
    """Apply the non-``None`` arguments to a notification channel.

    Returns the updated, detached row, or ``None`` when no channel has the
    given id. *config* is stored as JSON text.
    """
    session = next(get_db())
    try:
        record = (
            session.query(NotificationChannel)
            .filter(NotificationChannel.id == channel_id)
            .first()
        )
        if record is None:
            return None

        # Column name -> new value; None means "leave this column unchanged".
        changes = {
            "name": name,
            "type": channel_type,
            "config": None if config is None else json.dumps(config, ensure_ascii=False),
            "enabled": enabled,
        }
        for column, new_value in changes.items():
            if new_value is not None:
                setattr(record, column, new_value)

        session.commit()
        session.refresh(record)
        return _detach(record)
    finally:
        session.close()
281
+
282
+
283
def delete_channel(channel_id: int) -> bool:
    """Delete a notification channel; return ``False`` when none has the given id."""
    session = next(get_db())
    try:
        record = (
            session.query(NotificationChannel)
            .filter(NotificationChannel.id == channel_id)
            .first()
        )
        if record is None:
            return False
        session.delete(record)
        session.commit()
        return True
    finally:
        session.close()
backend/app/db/video_task_dao.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.db.models.video_tasks import VideoTask
2
+ from app.db.engine import get_db
3
+ from app.utils.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
def insert_video_task(video_id: str, platform: str, task_id: str):
    """Insert a ``VideoTask`` row linking a platform video to a task id.

    Failures are logged and swallowed; the function returns ``None`` either way.
    """
    session = next(get_db())
    try:
        record = VideoTask(video_id=video_id, platform=platform, task_id=task_id)
        session.add(record)
        session.commit()
        session.refresh(record)
        logger.info(f"Video task inserted successfully. video_id: {video_id}, platform: {platform}, task_id: {task_id}")
    except Exception as exc:
        logger.error(f"Failed to insert video task: {exc}")
    finally:
        session.close()
21
+
22
+
23
def get_task_by_video(video_id: str, platform: str):
    """Return the task id of the most recently created task for a video.

    :return: the ``task_id`` of the newest matching row (by ``created_at``), or
        ``None`` when there is no match or the query fails (failures are logged).
    """
    session = next(get_db())
    try:
        newest_first = (
            session.query(VideoTask)
            .filter_by(video_id=video_id, platform=platform)
            .order_by(VideoTask.created_at.desc())
        )
        latest = newest_first.first()
        if not latest:
            logger.info(f"No task found for video_id: {video_id} and platform: {platform}")
            return None
        logger.info(f"Task found for video_id: {video_id} and platform: {platform}")
        return latest.task_id
    except Exception as exc:
        logger.error(f"Failed to get task by video: {exc}")
        return None
    finally:
        session.close()
43
+
44
+
45
def delete_task_by_video(video_id: str, platform: str):
    """Delete every ``VideoTask`` row matching a video id and platform.

    Failures are logged and swallowed; the function always returns ``None``.
    """
    session = next(get_db())
    try:
        matches = session.query(VideoTask).filter_by(video_id=video_id, platform=platform).all()
        for row in matches:
            session.delete(row)
        session.commit()
        logger.info(f"Task(s) deleted for video_id: {video_id} and platform: {platform}")
    except Exception as exc:
        logger.error(f"Failed to delete task by video: {exc}")
    finally:
        session.close()
backend/app/decorators/__init__.py ADDED
File without changes
backend/app/decorators/timeit.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import functools
3
+
4
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The elapsed wall-clock time (``time.perf_counter``) is printed with four
    decimals after the call returns; the wrapped function's result is passed
    through unchanged. Nothing is printed if ``func`` raises.
    """
    @functools.wraps(func)
    def timed(*args, **kwargs):
        started_at = time.perf_counter()
        outcome = func(*args, **kwargs)
        duration = time.perf_counter() - started_at
        print(f"{func.__name__} executed in {duration:.4f} seconds")
        return outcome

    return timed
backend/app/downloaders/__init__.py ADDED
File without changes
backend/app/downloaders/base.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import enum
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Optional, Union
5
+
6
+ from app.enmus.note_enums import DownloadQuality
7
+ from app.models.notes_model import AudioDownloadResult
8
+ from app.models.transcriber_model import TranscriptResult
9
+ from os import getenv
10
+ QUALITY_MAP = {
11
+ "fast": "32",
12
+ "medium": "64",
13
+ "slow": "128"
14
+ }
15
+
16
+
17
class Downloader(ABC):
    """Abstract base class for platform-specific media downloaders."""

    def __init__(self):
        # TODO: make the quality preset configurable.
        self.quality = QUALITY_MAP.get('fast')
        # Fallback data directory, read from the DATA_DIR environment variable.
        self.cache_data = getenv('DATA_DIR')

    @abstractmethod
    def download(self, video_url: str, output_dir: str = None,
                 quality: DownloadQuality = "fast", need_video: Optional[bool] = False,
                 skip_download: bool = False) -> AudioDownloadResult:
        """Download the audio for a resource.

        :param video_url: URL of the resource.
        :param output_dir: output directory; documented default is the root
            ``data`` directory.
        :param quality: audio quality preset: ``fast`` | ``medium`` | ``slow``.
        :param need_video: not described by the original documentation.
        :param skip_download: not described by the original documentation.
        :return: an ``AudioDownloadResult``.
        """
        pass

    # Previously decorated with @staticmethod while still declaring ``self``,
    # which made ``instance.download_video(url)`` bind the URL to ``self`` and
    # fail with a missing-argument TypeError. It is a regular instance method,
    # matching how subclasses override it.
    def download_video(self, video_url: str,
                       output_dir: Union[str, None] = None) -> str:
        """Download the video file; the base implementation does nothing.

        :param video_url: URL of the video.
        :param output_dir: output directory.
        :return: ``None`` in this base implementation; the annotation declares
            a ``str`` for overrides.
        """
        pass

    def download_subtitles(self, video_url: str, output_dir: str = None,
                           langs: list = None) -> Optional[TranscriptResult]:
        """Try to fetch platform subtitles (manual or auto-generated).

        :param video_url: URL of the video.
        :param output_dir: output directory.
        :param langs: preferred languages in priority order, e.g.
            ``['zh-Hans', 'zh', 'en']``.
        :return: a ``TranscriptResult``, or ``None`` when no subtitles are
            available. The base implementation always returns ``None``.
        """
        return None
backend/app/downloaders/bilibili_downloader.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import logging
4
+ import tempfile
5
+ from abc import ABC
6
+ from typing import Union, Optional, List
7
+
8
+ import yt_dlp
9
+
10
+ from app.downloaders.base import Downloader, DownloadQuality, QUALITY_MAP
11
+ from app.downloaders.bilibili_subtitle import BilibiliSubtitleFetcher
12
+ from app.models.notes_model import AudioDownloadResult
13
+ from app.models.transcriber_model import TranscriptResult, TranscriptSegment
14
+ from app.utils.path_helper import get_data_dir
15
+ from app.utils.url_parser import extract_video_id
16
+ from app.services.cookie_manager import CookieConfigManager
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class BilibiliDownloader(Downloader, ABC):
    """Bilibili downloader built on yt-dlp, with a direct player-API subtitle path.

    The Bilibili cookie string obtained from ``CookieConfigManager`` is written
    once, at construction time, to a Netscape-format cookie file that is handed
    to every yt-dlp invocation through the ``cookiefile`` option.
    """

    def __init__(self):
        super().__init__()
        self._cookie_mgr = CookieConfigManager()
        self._cookie = self._cookie_mgr.get('bilibili')
        self._cookiefile = self._write_netscape_cookie_file()

    def _write_netscape_cookie_file(self) -> Optional[str]:
        """Write the cookie string to a Netscape-format temp file for yt-dlp.

        :return: path of the generated file, or ``None`` when no cookie is
            configured.
        """
        if not self._cookie:
            logger.warning("B站 Cookie 未配置,下载可能失败")
            return None
        lines = ["# Netscape HTTP Cookie File\n"]
        # Split on a bare ";" and strip each pair so both "a=1; b=2" and
        # "a=1;b=2" are parsed; splitting on "; " merged the latter form into
        # one bogus cookie.
        for pair in self._cookie.split(";"):
            key, sep, value = pair.strip().partition("=")
            if sep:
                lines.append(f".bilibili.com\tTRUE\t/\tFALSE\t0\t{key}\t{value}\n")
        # NOTE(review): the file is created with delete=False and is never
        # removed by this class.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as tmp:
            tmp.writelines(lines)
        logger.info("已生成 B站 Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1)
        return tmp.name

    def _resolve_output_dir(self, output_dir: Union[str, None]) -> str:
        """Pick the output directory (argument, data dir, then cache dir) and create it."""
        if output_dir is None:
            output_dir = get_data_dir()
        if not output_dir:
            output_dir = self.cache_data
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: DownloadQuality = "fast",
        need_video: Optional[bool] = False,
        skip_download: bool = False,
    ) -> AudioDownloadResult:
        """Download a video's audio track and convert it to MP3.

        :param video_url: URL of the video.
        :param output_dir: output directory; falls back to ``get_data_dir()``
            and then to ``self.cache_data``.
        :param quality: accepted for interface compatibility.
            NOTE(review): it is not read here; the MP3 bitrate is fixed at 64.
        :param need_video: accepted for interface compatibility; not read here.
        :param skip_download: when true, only metadata is extracted and no
            media is downloaded, so ``file_path`` in the result points at a
            file that this call did not create. Added to match the signature
            declared by the ``Downloader`` base class.
        :return: an ``AudioDownloadResult`` whose ``video_path`` is ``None``.
        """
        output_dir = self._resolve_output_dir(output_dir)
        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")

        ydl_opts = {
            'format': 'bestaudio[ext=m4a]/bestaudio/best',
            'outtmpl': output_path,
            'http_headers': {'Referer': 'https://www.bilibili.com'},
            'postprocessors': [
                {
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '64',
                }
            ],
            'noplaylist': True,
            'quiet': False,
        }
        if self._cookiefile:
            ydl_opts['cookiefile'] = self._cookiefile

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=not skip_download)

        video_id = info.get("id")
        return AudioDownloadResult(
            file_path=os.path.join(output_dir, f"{video_id}.mp3"),
            title=info.get("title"),
            duration=info.get("duration", 0),
            cover_url=info.get("thumbnail"),
            platform="bilibili",
            video_id=video_id,
            raw_info=info,
            video_path=None,  # audio-only download: there is no video file
        )

    def download_video(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
    ) -> str:
        """Download the video as MP4 and return the file path.

        An already existing ``<video_id>.mp4`` in the output directory is
        reused without downloading again.

        :raises FileNotFoundError: when the expected MP4 is missing after the
            download.
        """
        output_dir = self._resolve_output_dir(output_dir)
        logger.debug("video_url %s", video_url)

        # Reuse a previously downloaded file when it is already present.
        video_id = extract_video_id(video_url, "bilibili")
        video_path = os.path.join(output_dir, f"{video_id}.mp4")
        if os.path.exists(video_path):
            return video_path

        ydl_opts = {
            'format': 'bv*[ext=mp4]/bestvideo+bestaudio/best',
            'outtmpl': os.path.join(output_dir, "%(id)s.%(ext)s"),
            'http_headers': {'Referer': 'https://www.bilibili.com'},
            'noplaylist': True,
            'quiet': False,
            'merge_output_format': 'mp4',  # make sure merged output is mp4
        }
        if self._cookiefile:
            ydl_opts['cookiefile'] = self._cookiefile

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)

        # The id reported by yt-dlp decides the final file name.
        video_id = info.get("id")
        video_path = os.path.join(output_dir, f"{video_id}.mp4")
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"视频文件未找到: {video_path}")
        return video_path

    def delete_video(self, video_path: str) -> str:
        """Delete a video file and return a status message describing the outcome."""
        if os.path.exists(video_path):
            os.remove(video_path)
            return f"视频文件已删除: {video_path}"
        return f"视频文件未找到: {video_path}"

    def download_subtitles(self, video_url: str, output_dir: str = None,
                           langs: List[str] = None) -> Optional[TranscriptResult]:
        """Try to fetch subtitles for a Bilibili video.

        The player API is tried first through ``BilibiliSubtitleFetcher``; when
        that yields nothing or raises, yt-dlp is used as a fallback.

        :param video_url: URL of the video.
        :param output_dir: directory for subtitle files written by yt-dlp.
        :param langs: preferred languages in priority order.
        :return: a ``TranscriptResult``, or ``None`` when no subtitles could be
            obtained.
        """
        # 1) Preferred path: player API, which needs no media download.
        try:
            result = BilibiliSubtitleFetcher().fetch_subtitles(video_url)
            if result and result.segments:
                return result
        except Exception as e:
            logger.warning(f"player API 直拉字幕异常,回退到 yt-dlp: {e}")

        # 2) Fallback: yt-dlp.
        output_dir = self._resolve_output_dir(output_dir)
        if langs is None:
            langs = ['zh-Hans', 'zh', 'zh-CN', 'ai-zh', 'en', 'en-US']

        video_id = extract_video_id(video_url, "bilibili")

        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': langs,
            'subtitlesformat': 'srt/json3/best',  # accept several formats
            'skip_download': True,
            'outtmpl': os.path.join(output_dir, f'{video_id}.%(ext)s'),
            'quiet': True,
        }
        if self._cookiefile:
            ydl_opts['cookiefile'] = self._cookiefile
            ydl_opts['http_headers'] = {'Referer': 'https://www.bilibili.com'}

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(video_url, download=True)

            subtitles = info.get('requested_subtitles') or {}
            if not subtitles:
                logger.info(f"B站视频 {video_id} 没有可用字幕")
                return None

            # Honour the caller's language priority first.
            detected_lang = next((lang for lang in langs if lang in subtitles), None)
            # Otherwise take the first track that is not the danmaku stream.
            if not detected_lang:
                detected_lang = next((lang for lang in subtitles if lang != 'danmaku'), None)
            sub_info = subtitles[detected_lang] if detected_lang else None

            if not sub_info:
                logger.info(f"B站视频 {video_id} 没有可用字幕(排除弹幕)")
                return None

            # Use inline subtitle content when the returned entry carries it.
            if 'data' in sub_info and sub_info['data']:
                logger.info(f"直接从返回数据解析字幕: {detected_lang}")
                return self._parse_srt_content(sub_info['data'], detected_lang)

            ext = sub_info.get('ext', 'srt')
            subtitle_file = os.path.join(output_dir, f"{video_id}.{detected_lang}.{ext}")
            if not os.path.exists(subtitle_file):
                logger.info(f"字幕文件不存在: {subtitle_file}")
                return None

            if ext == 'json3':
                return self._parse_json3_subtitle(subtitle_file, detected_lang)
            with open(subtitle_file, 'r', encoding='utf-8') as f:
                return self._parse_srt_content(f.read(), detected_lang)

        except Exception as e:
            logger.warning(f"获取B站字幕失败: {e}")
            return None

    @staticmethod
    def _srt_time_to_seconds(stamp: str) -> float:
        """Convert an SRT timestamp such as ``00:01:02,500`` to seconds."""
        hours, minutes, seconds = stamp.replace(',', '.').split(':')
        return float(hours) * 3600 + float(minutes) * 60 + float(seconds)

    def _parse_srt_content(self, srt_content: str, language: str) -> Optional[TranscriptResult]:
        """Parse SRT subtitle text.

        :param srt_content: SRT subtitle text.
        :param language: language code stored on the result.
        :return: a ``TranscriptResult``, or ``None`` when no cue with text is
            found or parsing fails.
        """
        import re
        try:
            # The cue pattern only knows "\n"; normalise CRLF / CR line endings
            # first so such input is not silently parsed as empty.
            srt_content = srt_content.replace('\r\n', '\n').replace('\r', '\n')
            # SRT layout: index\ntimestamps\ntext\n\n
            pattern = r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?=\n\n|\n\d+\n|$)'

            segments = []
            for _idx, start_time, end_time, text in re.findall(pattern, srt_content, re.DOTALL):
                text = text.strip()
                if not text:
                    continue
                segments.append(TranscriptSegment(
                    start=self._srt_time_to_seconds(start_time),
                    end=self._srt_time_to_seconds(end_time),
                    text=text
                ))

            if not segments:
                return None

            full_text = ' '.join(seg.text for seg in segments)
            logger.info(f"成功解析B站SRT字幕,共 {len(segments)} 段")
            return TranscriptResult(
                language=language,
                full_text=full_text,
                segments=segments,
                raw={'source': 'bilibili_subtitle', 'format': 'srt'}
            )

        except Exception as e:
            logger.warning(f"解析SRT字幕失败: {e}")
            return None

    def _parse_json3_subtitle(self, subtitle_file: str, language: str) -> Optional[TranscriptResult]:
        """Parse a json3 subtitle file.

        :param subtitle_file: path of the subtitle file.
        :param language: language code stored on the result.
        :return: a ``TranscriptResult``, or ``None`` when no event carries text
            or parsing fails.
        """
        try:
            with open(subtitle_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            segments = []
            for event in data.get('events') or []:
                # Timing fields are divided by 1000 to obtain seconds; missing
                # or null values count as 0.
                start_ms = event.get('tStartMs') or 0
                duration_ms = event.get('dDurationMs') or 0

                text = ''.join(seg.get('utf8', '') for seg in event.get('segs') or []).strip()
                if text:  # keep only events that carry text
                    segments.append(TranscriptSegment(
                        start=start_ms / 1000.0,
                        end=(start_ms + duration_ms) / 1000.0,
                        text=text
                    ))

            if not segments:
                return None

            full_text = ' '.join(seg.text for seg in segments)
            logger.info(f"成功解析B站字幕,共 {len(segments)} 段")
            return TranscriptResult(
                language=language,
                full_text=full_text,
                segments=segments,
                raw={'source': 'bilibili_subtitle', 'file': subtitle_file}
            )

        except Exception as e:
            logger.warning(f"解析字幕文件失败: {e}")
            return None
backend/app/downloaders/bilibili_subtitle.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 直接调用 B 站 player API 拿字幕,绕过 yt-dlp。
3
+
4
+ 流程:
5
+ 1. 从 URL 提 BV id(已有 utils.url_parser.extract_video_id)
6
+ 2. GET /x/web-interface/view?bvid=BVxxx → 拿 cid
7
+ 3. GET /x/player/wbi/v2?bvid=...&cid=... → 返回 data.subtitle.subtitles[]
8
+ 每条带 subtitle_url(B 站后端已经签好 auth_key 的完整地址)
9
+ 4. 按优先级(人工 zh-CN > AI zh-CN > 任意 zh > 任意非空)选一条
10
+ 5. fetch subtitle_url → JSON {body:[{from,to,content,...}]}
11
+ 6. 解析为 TranscriptResult
12
+
13
+ AI 字幕需要登录态 cookie(SESSDATA);通过 CookieConfigManager 注入。
14
+ """
15
+
16
+ from typing import List, Optional
17
+
18
+ import requests
19
+
20
+ from app.models.transcriber_model import TranscriptResult, TranscriptSegment
21
+ from app.services.cookie_manager import CookieConfigManager
22
+ from app.utils.logger import get_logger
23
+ from app.utils.url_parser import extract_video_id
24
+
25
+ logger = get_logger(__name__)
26
+
27
+ UA = (
28
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
29
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
30
+ )
31
+
32
+
33
class BilibiliSubtitleFetcher:
    """Fetch subtitles directly from Bilibili's web API endpoints."""

    def __init__(self):
        # An empty string means "no cookie configured".
        self._cookie = CookieConfigManager().get("bilibili") or ""

    def _headers(self) -> dict:
        """Build request headers, adding the Cookie header only when one is configured."""
        h = {
            "User-Agent": UA,
            "Referer": "https://www.bilibili.com",
        }
        if self._cookie:
            h["Cookie"] = self._cookie
        return h

    def _get_cid(self, bvid: str) -> Optional[int]:
        """Resolve the ``cid`` of a video through the view API.

        :return: the cid as an int, or ``None`` on a request failure, an API
            error code, or a missing cid.
        """
        url = "https://api.bilibili.com/x/web-interface/view"
        try:
            resp = requests.get(url, params={"bvid": bvid}, headers=self._headers(), timeout=10)
            data = resp.json()
        except Exception as e:
            logger.warning(f"获取 cid 失败: {e}")
            return None
        if data.get("code") != 0:
            logger.warning(f"view API 返回错误: code={data.get('code')}, msg={data.get('message')}")
            return None
        # "data" may be JSON null; ``or {}`` keeps the lookup from raising.
        cid = (data.get("data") or {}).get("cid")
        return int(cid) if cid else None

    def _list_subtitles(self, bvid: str, cid: int) -> List[dict]:
        """List the subtitle tracks reported by the player API.

        :return: the track list; empty on a request failure, an API error code,
            or when no tracks are reported.
        """
        url = "https://api.bilibili.com/x/player/wbi/v2"
        try:
            resp = requests.get(url, params={"bvid": bvid, "cid": cid}, headers=self._headers(), timeout=10)
            data = resp.json()
        except Exception as e:
            logger.warning(f"获取字幕列表失败: {e}")
            return []
        if data.get("code") != 0:
            logger.warning(f"player API 返回错误: code={data.get('code')}, msg={data.get('message')}")
            return []
        # Any level of the payload may be JSON null instead of an object.
        subtitle_info = (data.get("data") or {}).get("subtitle") or {}
        return subtitle_info.get("subtitles") or []

    def _pick(self, subtitles: List[dict]) -> Optional[dict]:
        """Choose one track: manual Chinese > AI Chinese > first track of any language."""
        if not subtitles:
            return None

        def is_zh(s: dict) -> bool:
            lan = (s.get("lan") or "").lower()
            return lan.startswith("zh") or lan == "ai-zh"

        # A Chinese track with a falsy ``ai_type`` is treated as manual.
        for s in subtitles:
            if is_zh(s) and not s.get("ai_type"):
                return s
        # Any remaining Chinese track (AI-generated).
        for s in subtitles:
            if is_zh(s):
                return s
        # Fall back to whatever comes first.
        return subtitles[0]

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Prefix protocol-relative URLs (``//host/...``) with ``https:``."""
        if url.startswith("//"):
            return "https:" + url
        return url

    def _fetch_body(self, subtitle_url: str) -> Optional[List[dict]]:
        """Download the subtitle JSON and return its ``body`` list.

        :return: the list (possibly empty), or ``None`` when the request or
            JSON decoding fails.
        """
        try:
            resp = requests.get(self._normalize_url(subtitle_url), headers=self._headers(), timeout=15)
            data = resp.json()
            return data.get("body") or []
        except Exception as e:
            logger.warning(f"下载字幕 JSON 失败: {e}")
            return None

    def fetch_subtitles(self, video_url: str) -> Optional["TranscriptResult"]:
        """Fetch and parse the best available subtitle track for a video URL.

        :param video_url: Bilibili video URL from which the BV id is extracted.
        :return: a ``TranscriptResult``, or ``None`` when any step (id
            extraction, cid lookup, track listing, download, parsing) yields
            nothing.
        """
        bvid = extract_video_id(video_url, "bilibili")
        if not bvid:
            logger.info("无法从 URL 提取 BV id")
            return None

        cid = self._get_cid(bvid)
        if not cid:
            logger.info(f"{bvid} 没有取到 cid")
            return None

        subtitles = self._list_subtitles(bvid, cid)
        if not subtitles:
            logger.info(f"{bvid} (cid={cid}) 没有可用字幕轨")
            return None

        track = self._pick(subtitles)
        if not track or not track.get("subtitle_url"):
            logger.info(f"{bvid} 字幕轨存在但没有 subtitle_url(可能未登录、需要 SESSDATA cookie)")
            return None

        lan = track.get("lan") or "zh"
        body = self._fetch_body(track["subtitle_url"])
        if not body:
            return None

        segments = []
        for item in body:
            # Skip malformed entries instead of failing the whole fetch.
            if not isinstance(item, dict):
                continue
            text = (item.get("content") or "").strip()
            if not text:
                continue
            # ``or 0`` also covers explicit nulls, which float() would reject.
            segments.append(TranscriptSegment(
                start=float(item.get("from") or 0),
                end=float(item.get("to") or 0),
                text=text,
            ))

        if not segments:
            return None

        full_text = " ".join(s.text for s in segments)
        logger.info(f"B站直拉字幕成功: {bvid} lan={lan} 共 {len(segments)} 段")
        return TranscriptResult(
            language=lan,
            full_text=full_text,
            segments=segments,
            raw={
                "source": "bilibili_player_api",
                "bvid": bvid,
                "cid": cid,
                "lan": lan,
                "ai_type": track.get("ai_type"),
            },
        )
backend/app/downloaders/common.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # def download():
backend/app/downloaders/douyin_downloader.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import subprocess
7
+ from dataclasses import dataclass, field
8
+ from typing import Any, Literal, Optional, Union
9
+ from urllib.parse import parse_qs, unquote, urlparse
10
+
11
+ import requests
12
+
13
+ from app.downloaders.base import Downloader
14
+ from app.enmus.note_enums import DownloadQuality
15
+ from app.models.audio_model import AudioDownloadResult
16
+ from app.models.transcriber_model import TranscriptResult, TranscriptSegment
17
+ from app.utils.path_helper import get_data_dir
18
+
19
+
20
+ SHARE_PAGE_UA = (
21
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
22
+ "AppleWebKit/605.1.15 (KHTML, like Gecko) "
23
+ "Version/17.0 Mobile/15E148 Safari/604.1"
24
+ )
25
+
26
+ ROUTER_DATA_RE = re.compile(r"window\._ROUTER_DATA\s*=\s*(\{.+)", re.DOTALL)
27
+ RENDER_DATA_RE = re.compile(
28
+ r'<script id="RENDER_DATA" type="application/json">([^<]+)</script>'
29
+ )
30
+ DOUYIN_URL_RE = re.compile(
31
+ r"https?://(?:v\.douyin\.com|www\.douyin\.com|www\.iesdouyin\.com|m\.douyin\.com)[^\s\]]*"
32
+ )
33
+ IMAGE_AWEME_TYPES = {2, 68}
34
+
35
+
36
class DouyinResolveError(Exception):
    """Raised when a Douyin share link or share page cannot be resolved."""
38
+
39
+
40
@dataclass
class DouyinContentMeta:
    """Metadata describing one resolved Douyin post (video or image note)."""

    # Required identity fields.
    aweme_id: str
    title: str
    author: str
    source_url: str

    # Either "video" or "image".
    content_type: Literal["video", "image"] = "video"
    aweme_type: Optional[int] = None

    # Media locations; download_url stays empty unless one is supplied.
    download_url: str = ""
    cover_url: Optional[str] = None
    image_urls: list[str] = field(default_factory=list)

    duration: float = 0
    tags: list[str] = field(default_factory=list)
53
+
54
+
55
def _session() -> requests.Session:
    """Create a ``requests`` session preloaded with the share-page request headers."""
    default_headers = {
        "User-Agent": SHARE_PAGE_UA,
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    client = requests.Session()
    client.headers.update(default_headers)
    return client
65
+
66
+
67
def expand_share_url(share_text: str) -> str:
    """Extract the first Douyin link from share text.

    :param share_text: text that may contain a Douyin URL; ``None`` or an
        empty value is treated as empty text.
    :return: the matched URL with trailing ``/ . , ; )`` characters removed.
    :raises DouyinResolveError: when no Douyin link is present.
    """
    text = (share_text or "").strip()
    found = DOUYIN_URL_RE.search(text)
    if not found:
        raise DouyinResolveError("未在输入中找到抖音链接")
    link = found.group(0)
    return link.rstrip("/.,;)")
73
+
74
+
75
+ def _extract_aweme_id_from_search_url(url: str) -> Optional[str]:
76
+ parsed = urlparse(url)
77
+ if not parsed.netloc.endswith("douyin.com") or not parsed.path.startswith("/search"):
78
+ return None
79
+
80
+ params = parse_qs(parsed.query)
81
+ for key in ("modal_id", "item_ids"):
82
+ for value in params.get(key, []):
83
+ match = re.search(r"\d{10,}", value)
84
+ if match:
85
+ return match.group(0)
86
+ return None
87
+
88
+
89
def normalize_to_share_page(url: str) -> str:
    """Rewrite douyin.com note/video/search URLs to the iesdouyin.com share page.

    Note URLs are checked before video URLs, then search URLs carrying a post
    id; any other URL is returned unchanged.
    """
    for kind in ("note", "video"):
        hit = re.search(rf"https?://(?:www\.)?douyin\.com/{kind}/(\d+)", url)
        if hit:
            return f"https://www.iesdouyin.com/share/{kind}/{hit.group(1)}/"

    aweme_id = _extract_aweme_id_from_search_url(url)
    if not aweme_id:
        return url
    return f"https://www.iesdouyin.com/share/video/{aweme_id}/"
101
+
102
+
103
def resolve_share_page(session: requests.Session, share_url: str) -> tuple[str, str]:
    """Fetch a share page, following redirects.

    :return: ``(final_url, html)`` for the response.
    :raises requests.HTTPError: propagated from ``raise_for_status`` when the
        response status indicates an error.
    """
    page = session.get(share_url, allow_redirects=True, timeout=30)
    page.raise_for_status()
    final_url = str(page.url)
    return final_url, page.text
107
+
108
+
109
def extract_aweme_id(page_url: str, html: Optional[str] = None) -> str:
    """Find the post id in the page URL, falling back to the page HTML.

    Every pattern is tried against the URL first; the HTML is searched only
    when the URL yields nothing and ``html`` is non-empty.

    :raises DouyinResolveError: when no pattern matches in either source.
    """
    id_patterns = (
        r"/video/(\d+)",
        r"/note/(\d+)",
        r"/share/video/(\d+)",
        r"/share/note/(\d+)",
        r"modal_id=(\d+)",
        r"item_ids=(\d+)",
        r'"aweme_id"\s*:\s*"?(\d+)"?',
        r'"itemId"\s*:\s*"?(\d+)"?',
    )
    sources = [page_url]
    if html:
        sources.append(html)

    for text in sources:
        for pattern in id_patterns:
            found = re.search(pattern, text)
            if found:
                return found.group(1)
    raise DouyinResolveError(f"无法从分享页解析作品 ID: {page_url}")
130
+
131
+
132
def _parse_router_data(html: str) -> Optional[dict[str, Any]]:
    """Decode the ``window._ROUTER_DATA`` JSON assignment embedded in a page.

    :return: the decoded object, or ``None`` when the assignment is absent or
        its payload is not valid JSON.
    """
    found = ROUTER_DATA_RE.search(html)
    if not found:
        return None
    # Keep only the text before the closing script tag, then drop trailing
    # whitespace and statement-terminating semicolons.
    blob, _, _ = found.group(1).partition("</script>")
    blob = blob.rstrip().rstrip(";")
    try:
        return json.loads(blob)
    except json.JSONDecodeError:
        return None
141
+
142
+
143
def _parse_render_data(html: str) -> Optional[dict[str, Any]]:
    """Decode the URL-encoded JSON stored in the ``RENDER_DATA`` script tag.

    :return: the decoded object, or ``None`` when the tag is absent or its
        payload is not valid JSON.
    """
    found = RENDER_DATA_RE.search(html)
    if not found:
        return None
    decoded = unquote(found.group(1))
    try:
        return json.loads(decoded)
    except json.JSONDecodeError:
        return None
151
+
152
+
153
+ def _find_item_list(obj: Any) -> list[dict[str, Any]]:
154
+ if isinstance(obj, dict):
155
+ item_list = obj.get("item_list")
156
+ if isinstance(item_list, list) and item_list:
157
+ first = item_list[0]
158
+ if isinstance(first, dict) and (
159
+ "aweme_id" in first or "awemeId" in first or "video" in first or "images" in first
160
+ ):
161
+ return item_list
162
+ for value in obj.values():
163
+ found = _find_item_list(value)
164
+ if found:
165
+ return found
166
+ elif isinstance(obj, list):
167
+ for item in obj:
168
+ found = _find_item_list(item)
169
+ if found:
170
+ return found
171
+ return []
172
+
173
+
174
+ def _pick_url_from_image_node(image: dict[str, Any]) -> Optional[str]:
175
+ url_list = image.get("url_list") or []
176
+ if url_list:
177
+ return str(url_list[-1])
178
+ download_list = image.get("download_url_list") or []
179
+ if download_list:
180
+ return str(download_list[-1])
181
+ return None
182
+
183
+
184
def _extract_image_urls(item: dict[str, Any]) -> list[str]:
    """Collect unique image URLs from a post, preserving first-seen order.

    Images are read from ``item["images"]`` first and then from
    ``item["image_post_info"]["images"]``; non-dict entries are ignored.
    """
    image_nodes = list(item.get("images") or [])
    post = item.get("image_post_info") or {}
    if isinstance(post, dict):
        image_nodes.extend(post.get("images") or [])

    # A dict serves as an insertion-ordered set for de-duplication.
    ordered: dict[str, None] = {}
    for node in image_nodes:
        if not isinstance(node, dict):
            continue
        url = _pick_url_from_image_node(node)
        if url:
            ordered.setdefault(url, None)
    return list(ordered)
204
+
205
+
206
+ def _has_playable_video(item: dict[str, Any]) -> bool:
207
+ video = item.get("video") or {}
208
+ if not isinstance(video, dict):
209
+ return False
210
+ play_addr = video.get("play_addr") or video.get("playAddr") or {}
211
+ if not isinstance(play_addr, dict):
212
+ return False
213
+ return bool(play_addr.get("uri") or play_addr.get("url_list"))
214
+
215
+
216
def _is_image_note(item: dict[str, Any]) -> bool:
    """Decide whether a post is an image note rather than a video.

    True when ``aweme_type`` is one of ``IMAGE_AWEME_TYPES``, or when the post
    has image URLs and no playable video.
    """
    if item.get("aweme_type") in IMAGE_AWEME_TYPES:
        return True
    if _has_playable_video(item):
        return False
    return bool(_extract_image_urls(item))
221
+
222
+
223
+ def _build_no_watermark_url(play_addr: dict[str, Any]) -> str:
224
+ uri = play_addr.get("uri") or ""
225
+ url_list = play_addr.get("url_list") or []
226
+ if uri:
227
+ return f"https://aweme.snssdk.com/aweme/v1/play/?video_id={uri}&ratio=720p&line=0"
228
+ if url_list:
229
+ return str(url_list[0]).replace("playwm", "play")
230
+ raise DouyinResolveError("分享页内嵌数据中未找到视频播放地址")
231
+
232
+
233
+ def _extract_tags(item: dict[str, Any]) -> list[str]:
234
+ tags: list[str] = []
235
+ seen: set[str] = set()
236
+ for tag in item.get("text_extra") or item.get("video_tag") or []:
237
+ if not isinstance(tag, dict):
238
+ continue
239
+ name = tag.get("hashtag_name") or tag.get("tag_name") or tag.get("name")
240
+ if name and name not in seen:
241
+ seen.add(name)
242
+ tags.append(str(name))
243
+ return tags
244
+
245
+
246
+ def _duration_seconds(raw: Any) -> float:
247
+ try:
248
+ value = float(raw or 0)
249
+ except (TypeError, ValueError):
250
+ return 0
251
+ return value / 1000 if value > 10000 else value
252
+
253
+
254
def _meta_from_aweme_item(item: dict[str, Any], source_url: str) -> DouyinContentMeta:
    """Build a ``DouyinContentMeta`` from one embedded post item.

    :param item: post dict taken from the page's embedded data.
    :param source_url: stored on the result as ``source_url``.
    :return: metadata with ``content_type`` "image" for image notes and
        "video" otherwise.
    :raises DouyinResolveError: when an image note has no image URLs, or a
        video post lacks a usable video/play-address node.
    """
    aweme_id = str(item.get("aweme_id") or item.get("awemeId") or "")
    # Fall back to a synthetic title when the post has no description/caption.
    title = (item.get("desc") or item.get("caption") or "").strip() or f"douyin_{aweme_id}"

    author_info = item.get("author") or {}
    author = ""
    if isinstance(author_info, dict):
        author = author_info.get("nickname") or author_info.get("unique_id") or ""

    # Fields shared by both the image and the video result.
    common = dict(
        aweme_id=aweme_id,
        title=title,
        author=author,
        source_url=source_url,
        aweme_type=item.get("aweme_type"),
        duration=_duration_seconds(item.get("duration")),
        tags=_extract_tags(item),
    )

    if _is_image_note(item):
        image_urls = _extract_image_urls(item)
        if not image_urls:
            raise DouyinResolveError("识别为图文,但未找到图片地址")
        return DouyinContentMeta(
            content_type="image",
            cover_url=image_urls[0],
            image_urls=image_urls,
            **common,
        )

    video = item.get("video") or {}
    if not isinstance(video, dict):
        raise DouyinResolveError("分享页内嵌数据中未找到视频节点")
    play_addr = video.get("play_addr") or video.get("playAddr") or {}
    if not isinstance(play_addr, dict):
        raise DouyinResolveError("视频节点缺少 play_addr")

    download_url = _build_no_watermark_url(play_addr)

    # First cover variant that actually lists a URL wins.
    cover_url = None
    for key in ("cover", "origin_cover", "dynamic_cover", "cover_original_scale"):
        cover_node = video.get(key) or {}
        if not isinstance(cover_node, dict):
            continue
        cover_candidates = cover_node.get("url_list") or []
        if cover_candidates:
            cover_url = str(cover_candidates[0])
            break

    # Prefer the first bit-rate stream whose URL has no "playwm" marker and
    # points at a douyinvod/bytecdn host.
    for entry in video.get("bit_rate") or []:
        if not isinstance(entry, dict):
            continue
        stream = entry.get("play_addr") or {}
        if not (isinstance(stream, dict) and stream.get("url_list")):
            continue
        candidate = str(stream["url_list"][0])
        if "playwm" in candidate:
            continue
        if "douyinvod" in candidate or "bytecdn" in candidate:
            download_url = candidate
            break

    return DouyinContentMeta(
        content_type="video",
        download_url=download_url,
        cover_url=cover_url,
        **common,
    )
323
+
324
+
325
def parse_share_page_html(html: str, page_url: str, original_share: str) -> DouyinContentMeta:
    """Build content metadata from the data embedded in a Douyin share page.

    The router-data parser is tried first, then the render-data parser. The
    first payload that yields a non-empty item list decides the result.

    Args:
        html: Raw HTML of the share page.
        page_url: Final URL of the share page, used to recover the aweme id
            when the embedded item does not carry one.
        original_share: Share URL recorded as the metadata's source URL.

    Returns:
        Metadata for the first embedded item.

    Raises:
        DouyinResolveError: If neither parser yields any item.
    """
    for extract_payload in (_parse_router_data, _parse_render_data):
        embedded = extract_payload(html)
        aweme_items = _find_item_list(embedded) if embedded else None
        if not aweme_items:
            continue

        parsed = _meta_from_aweme_item(aweme_items[0], original_share)
        if parsed.aweme_id:
            return parsed

        # The item had no id of its own: rebuild the metadata with an id
        # recovered from the page URL / HTML, keeping every other field.
        return DouyinContentMeta(
            aweme_id=extract_aweme_id(page_url, html),
            title=parsed.title,
            author=parsed.author,
            source_url=parsed.source_url,
            content_type=parsed.content_type,
            aweme_type=parsed.aweme_type,
            download_url=parsed.download_url,
            cover_url=parsed.cover_url,
            image_urls=parsed.image_urls,
            duration=parsed.duration,
            tags=parsed.tags,
        )

    raise DouyinResolveError(
        "分享页未找到内嵌公开数据(_ROUTER_DATA / RENDER_DATA)。"
        "请确认链接有效。"
    )
353
+
354
+
355
def resolve_douyin_share(share_text: str) -> DouyinContentMeta:
    """Resolve a Douyin share text/link into content metadata.

    Expands the share URL, normalizes it to the share-page form, fetches the
    page through a fresh session and parses the embedded data.
    """
    http = _session()
    expanded_url = expand_share_url(share_text)
    final_url, page_html = resolve_share_page(http, normalize_to_share_page(expanded_url))
    return parse_share_page_html(page_html, final_url, expanded_url)
361
+
362
+
363
def _download_file(url: str, dest: str) -> str:
    """Stream ``url`` to ``dest`` and return ``dest``.

    The body is written to ``dest + ".part"`` and renamed into place only once
    the whole response has been received. Callers treat an existing ``dest``
    as a finished download, so a truncated file must never be left there.

    Args:
        url: Direct media URL.
        dest: Target file path; its parent directory is created if needed.

    Returns:
        ``dest``.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx status (via ``raise_for_status``).
        OSError: If the file cannot be written or moved into place.
    """
    parent = os.path.dirname(dest)
    if parent:  # os.makedirs("") raises, so skip it for bare filenames
        os.makedirs(parent, exist_ok=True)

    headers = {"User-Agent": SHARE_PAGE_UA, "Referer": "https://www.iesdouyin.com/"}
    partial = f"{dest}.part"
    try:
        with requests.get(url, headers=headers, stream=True, timeout=120) as response:
            response.raise_for_status()
            with open(partial, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 256):
                    if chunk:
                        file.write(chunk)
        os.replace(partial, dest)
    finally:
        # After a successful replace the partial file is gone; on any failure
        # remove whatever was written so it cannot be mistaken for a result.
        if os.path.exists(partial):
            try:
                os.remove(partial)
            except OSError:
                pass
    return dest
373
+
374
+
375
def _extract_audio(video_path: str, audio_path: str) -> None:
    """Transcode the audio track of ``video_path`` to MP3 at ``audio_path``.

    Runs ``ffmpeg`` with its output silenced and overwrites an existing
    target (``-y``).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    command = [
        "ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "libmp3lame", audio_path,
    ]
    subprocess.run(
        command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
382
+
383
+
384
def _build_result(
    meta: DouyinContentMeta,
    audio_path: str,
    video_path: Optional[str],
) -> AudioDownloadResult:
    """Wrap resolved Douyin metadata and file paths in an ``AudioDownloadResult``.

    Tags, author, source URL, content type and image URLs travel in
    ``raw_info``; the platform is always ``"douyin"``.
    """
    extra_info = {
        "tags": meta.tags,
        "author": meta.author,
        "source_url": meta.source_url,
        "content_type": meta.content_type,
        "image_urls": meta.image_urls,
    }
    return AudioDownloadResult(
        file_path=audio_path,
        title=meta.title,
        duration=meta.duration,
        cover_url=meta.cover_url,
        platform="douyin",
        video_id=meta.aweme_id,
        raw_info=extra_info,
        video_path=video_path,
    )
405
+
406
+
407
class DouyinDownloader(Downloader):
    """Downloader for Douyin share links, driven by the share page's embedded data.

    Video posts are downloaded as MP4 and transcoded to MP3 with ffmpeg.
    Image posts have no media file and are reported through metadata only.
    """

    def __init__(self, cookie=None):
        # ``cookie`` is accepted but never read by this class.
        super().__init__()

    def _resolve_output_dir(self, output_dir: Optional[str]) -> str:
        """Choose the output directory and make sure it exists.

        ``None`` means "use ``get_data_dir()``"; a falsy result falls back to
        ``self.cache_data``.
        """
        if output_dir is None:
            output_dir = get_data_dir()
        if not output_dir:
            output_dir = self.cache_data
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    @staticmethod
    def _ensure_video_file(meta: DouyinContentMeta, video_path: str) -> None:
        """Download the video to ``video_path`` unless it is already on disk.

        Raises:
            DouyinResolveError: If a download is needed but the metadata has
                no download URL.
        """
        if os.path.exists(video_path):
            return
        if not meta.download_url:
            # Fail with a domain error instead of passing None/"" to requests.
            raise DouyinResolveError("未解析到可下载的视频地址")
        _download_file(meta.download_url, video_path)

    def extract_video_id(self, url: str) -> str:
        """Return the aweme id for ``url``, or ``""`` when it cannot be resolved."""
        try:
            return extract_aweme_id(normalize_to_share_page(expand_share_url(url)))
        except DouyinResolveError:
            return ""

    def _resolve_meta(self, video_url: str) -> DouyinContentMeta:
        """Resolve ``video_url`` to metadata, normalizing failures.

        Raises:
            DouyinResolveError: Always the error type raised; unexpected
                exceptions are wrapped and chained.
        """
        try:
            return resolve_douyin_share(video_url)
        except DouyinResolveError:
            raise
        except Exception as exc:
            raise DouyinResolveError(f"抖音分享页解析失败:{exc}") from exc

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: DownloadQuality = "fast",
        need_video: Optional[bool] = False,
        skip_download: bool = False,
    ) -> AudioDownloadResult:
        """Resolve a share link and produce an MP3 (plus the source MP4).

        Args:
            video_url: Share text or URL.
            output_dir: Target directory; see ``_resolve_output_dir``.
            quality: Accepted for interface compatibility; not used here.
            need_video: Whether the caller wants the video path reported.
            skip_download: When true, only metadata is returned.

        Returns:
            The download result. For image posts and for ``skip_download``
            the file path is ``""`` and the video path is ``None``.

        Raises:
            DouyinResolveError: If the link cannot be resolved or no
                download URL is available.
            RuntimeError: If ffmpeg fails to produce the MP3.
        """
        output_dir = self._resolve_output_dir(output_dir)

        meta = self._resolve_meta(video_url)
        if meta.content_type == "image" or skip_download:
            return _build_result(meta, "", None)

        video_path = os.path.join(output_dir, f"{meta.aweme_id}.mp4")
        audio_path = os.path.join(output_dir, f"{meta.aweme_id}.mp3")

        self._ensure_video_file(meta, video_path)

        if not os.path.exists(audio_path):
            try:
                _extract_audio(video_path, audio_path)
            except subprocess.CalledProcessError as exc:
                raise RuntimeError("ffmpeg 转换 MP3 失败") from exc

        return _build_result(
            meta,
            audio_path,
            video_path if need_video or os.path.exists(video_path) else None,
        )

    def download_video(self, video_url: str, output_dir: Union[str, None] = None) -> str:
        """Download the MP4 for a video post and return its path.

        Raises:
            DouyinResolveError: If the link is an image post, cannot be
                resolved, or has no download URL.
        """
        output_dir = self._resolve_output_dir(output_dir)

        meta = self._resolve_meta(video_url)
        if meta.content_type == "image":
            raise DouyinResolveError("抖音图文内容没有可下载的视频文件")

        video_path = os.path.join(output_dir, f"{meta.aweme_id}.mp4")
        self._ensure_video_file(meta, video_path)
        return video_path

    def download_subtitles(
        self,
        video_url: str,
        output_dir: Optional[str] = None,
        langs: Optional[list] = None,
    ) -> Optional[TranscriptResult]:
        """Return the caption of an image post as a one-segment transcript.

        ``output_dir`` and ``langs`` are accepted but not used.

        Returns:
            A ``"zh"`` transcript whose only segment is the post title, or
            ``None`` for non-image posts and posts without a title.
        """
        meta = self._resolve_meta(video_url)
        if meta.content_type != "image" or not meta.title:
            return None
        return TranscriptResult(
            language="zh",
            full_text=meta.title,
            segments=[
                TranscriptSegment(
                    start=0,
                    end=meta.duration or 0,
                    text=meta.title,
                )
            ],
        )
backend/app/downloaders/douyin_helper/abogus.py ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Original Author:
3
+ This file is from https://github.com/JoeanAmier/TikTokDownloader
4
+ And is licensed under the GNU General Public License v3.0
5
+ If you use this code, please keep this license and the original author information.
6
+
7
+ Modified by:
8
+ And this file is now a part of the https://github.com/Evil0ctal/Douyin_TikTok_Download_API open-source project.
9
+ This project is licensed under the Apache License 2.0, and the original author information is kept.
10
+
11
+ Purpose:
12
+ This file is used to generate the `a_bogus` parameter for the Douyin Web API.
13
+
14
+ Changes Made:
15
+ 1. Changed the ua_code to be compatible with the User-Agent string in the current config file: https://github.com/Evil0ctal/Douyin_TikTok_Download_API/blob/main/crawlers/douyin/web/config.yaml
16
+ """
17
+
18
+ from random import choice
19
+ from random import randint
20
+ from random import random
21
+ from re import compile
22
+ from time import time
23
+ from urllib.parse import urlencode
24
+ from urllib.parse import quote
25
+ from gmssl import sm3, func
26
+
27
+ __all__ = ["ABogus", ]
28
+
29
+
30
+ class ABogus:
31
+ __filter = compile(r'%([0-9A-F]{2})')
32
+ __arguments = [0, 1, 14]
33
+ __ua_key = "\u0000\u0001\u000e"
34
+ __end_string = "cus"
35
+ __version = [1, 0, 1, 5]
36
+ __browser = "1536|742|1536|864|0|0|0|0|1536|864|1536|864|1536|742|24|24|MacIntel"
37
+ __reg = [
38
+ 1937774191,
39
+ 1226093241,
40
+ 388252375,
41
+ 3666478592,
42
+ 2842636476,
43
+ 372324522,
44
+ 3817729613,
45
+ 2969243214,
46
+ ]
47
+ __str = {
48
+ "s0": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
49
+ "s1": "Dkdpgh4ZKsQB80/Mfvw36XI1R25+WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
50
+ "s2": "Dkdpgh4ZKsQB80/Mfvw36XI1R25-WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
51
+ "s3": "ckdp1h4ZKsUB80/Mfvw36XIgR25+WQAlEi7NLboqYTOPuzmFjJnryx9HVGDaStCe",
52
+ "s4": "Dkdpgh2ZmsQB80/MfvV36XI1R45-WUAlEixNLwoqYTOPuzKFjJnry79HbGcaStCe",
53
+ }
54
+
55
+ def __init__(self,
56
+ # user_agent: str = USERAGENT,
57
+ platform: str = None, ):
58
+ self.chunk = []
59
+ self.size = 0
60
+ self.reg = self.__reg[:]
61
+ # self.ua_code = self.generate_ua_code(user_agent)
62
+ # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36
63
+ self.ua_code = [
64
+ 76,
65
+ 98,
66
+ 15,
67
+ 131,
68
+ 97,
69
+ 245,
70
+ 224,
71
+ 133,
72
+ 122,
73
+ 199,
74
+ 241,
75
+ 166,
76
+ 79,
77
+ 34,
78
+ 90,
79
+ 191,
80
+ 128,
81
+ 126,
82
+ 122,
83
+ 98,
84
+ 66,
85
+ 11,
86
+ 14,
87
+ 40,
88
+ 49,
89
+ 110,
90
+ 110,
91
+ 173,
92
+ 67,
93
+ 96,
94
+ 138,
95
+ 252]
96
+ self.browser = self.generate_browser_info(
97
+ platform) if platform else self.__browser
98
+ self.browser_len = len(self.browser)
99
+ self.browser_code = self.char_code_at(self.browser)
100
+
101
+ @classmethod
102
+ def list_1(cls, random_num=None, a=170, b=85, c=45, ) -> list:
103
+ return cls.random_list(
104
+ random_num,
105
+ a,
106
+ b,
107
+ 1,
108
+ 2,
109
+ 5,
110
+ c & a,
111
+ )
112
+
113
+ @classmethod
114
+ def list_2(cls, random_num=None, a=170, b=85, ) -> list:
115
+ return cls.random_list(
116
+ random_num,
117
+ a,
118
+ b,
119
+ 1,
120
+ 0,
121
+ 0,
122
+ 0,
123
+ )
124
+
125
+ @classmethod
126
+ def list_3(cls, random_num=None, a=170, b=85, ) -> list:
127
+ return cls.random_list(
128
+ random_num,
129
+ a,
130
+ b,
131
+ 1,
132
+ 0,
133
+ 5,
134
+ 0,
135
+ )
136
+
137
+ @staticmethod
138
+ def random_list(
139
+ a: float = None,
140
+ b=170,
141
+ c=85,
142
+ d=0,
143
+ e=0,
144
+ f=0,
145
+ g=0,
146
+ ) -> list:
147
+ r = a or (random() * 10000)
148
+ v = [
149
+ r,
150
+ int(r) & 255,
151
+ int(r) >> 8,
152
+ ]
153
+ s = v[1] & b | d
154
+ v.append(s)
155
+ s = v[1] & c | e
156
+ v.append(s)
157
+ s = v[2] & b | f
158
+ v.append(s)
159
+ s = v[2] & c | g
160
+ v.append(s)
161
+ return v[-4:]
162
+
163
+ @staticmethod
164
+ def from_char_code(*args):
165
+ return "".join(chr(code) for code in args)
166
+
167
+ @classmethod
168
+ def generate_string_1(
169
+ cls,
170
+ random_num_1=None,
171
+ random_num_2=None,
172
+ random_num_3=None,
173
+ ):
174
+ return cls.from_char_code(*cls.list_1(random_num_1)) + cls.from_char_code(
175
+ *cls.list_2(random_num_2)) + cls.from_char_code(*cls.list_3(random_num_3))
176
+
177
+ def generate_string_2(
178
+ self,
179
+ url_params: str,
180
+ method="GET",
181
+ start_time=0,
182
+ end_time=0,
183
+ ) -> str:
184
+ a = self.generate_string_2_list(
185
+ url_params,
186
+ method,
187
+ start_time,
188
+ end_time,
189
+ )
190
+ e = self.end_check_num(a)
191
+ a.extend(self.browser_code)
192
+ a.append(e)
193
+ return self.rc4_encrypt(self.from_char_code(*a), "y")
194
+
195
+ def generate_string_2_list(
196
+ self,
197
+ url_params: str,
198
+ method="GET",
199
+ start_time=0,
200
+ end_time=0,
201
+ ) -> list:
202
+ start_time = start_time or int(time() * 1000)
203
+ end_time = end_time or (start_time + randint(4, 8))
204
+ params_array = self.generate_params_code(url_params)
205
+ method_array = self.generate_method_code(method)
206
+ return self.list_4(
207
+ (end_time >> 24) & 255,
208
+ params_array[21],
209
+ self.ua_code[23],
210
+ (end_time >> 16) & 255,
211
+ params_array[22],
212
+ self.ua_code[24],
213
+ (end_time >> 8) & 255,
214
+ (end_time >> 0) & 255,
215
+ (start_time >> 24) & 255,
216
+ (start_time >> 16) & 255,
217
+ (start_time >> 8) & 255,
218
+ (start_time >> 0) & 255,
219
+ method_array[21],
220
+ method_array[22],
221
+ int(end_time / 256 / 256 / 256 / 256) >> 0,
222
+ int(start_time / 256 / 256 / 256 / 256) >> 0,
223
+ self.browser_len,
224
+ )
225
+
226
+ @staticmethod
227
+ def reg_to_array(a):
228
+ o = [0] * 32
229
+ for i in range(8):
230
+ c = a[i]
231
+ o[4 * i + 3] = (255 & c)
232
+ c >>= 8
233
+ o[4 * i + 2] = (255 & c)
234
+ c >>= 8
235
+ o[4 * i + 1] = (255 & c)
236
+ c >>= 8
237
+ o[4 * i] = (255 & c)
238
+
239
+ return o
240
+
241
+ def compress(self, a):
242
+ f = self.generate_f(a)
243
+ i = self.reg[:]
244
+ for o in range(64):
245
+ c = self.de(i[0], 12) + i[4] + self.de(self.pe(o), o)
246
+ c = (c & 0xFFFFFFFF)
247
+ c = self.de(c, 7)
248
+ s = (c ^ self.de(i[0], 12)) & 0xFFFFFFFF
249
+
250
+ u = self.he(o, i[0], i[1], i[2])
251
+ u = (u + i[3] + s + f[o + 68]) & 0xFFFFFFFF
252
+
253
+ b = self.ve(o, i[4], i[5], i[6])
254
+ b = (b + i[7] + c + f[o]) & 0xFFFFFFFF
255
+
256
+ i[3] = i[2]
257
+ i[2] = self.de(i[1], 9)
258
+ i[1] = i[0]
259
+ i[0] = u
260
+
261
+ i[7] = i[6]
262
+ i[6] = self.de(i[5], 19)
263
+ i[5] = i[4]
264
+ i[4] = (b ^ self.de(b, 9) ^ self.de(b, 17)) & 0xFFFFFFFF
265
+
266
+ for l in range(8):
267
+ self.reg[l] = (self.reg[l] ^ i[l]) & 0xFFFFFFFF
268
+
269
+ @classmethod
270
+ def generate_f(cls, e):
271
+ r = [0] * 132
272
+
273
+ for t in range(16):
274
+ r[t] = (e[4 * t] << 24) | (e[4 * t + 1] <<
275
+ 16) | (e[4 * t + 2] << 8) | e[4 * t + 3]
276
+ r[t] &= 0xFFFFFFFF
277
+
278
+ for n in range(16, 68):
279
+ a = r[n - 16] ^ r[n - 9] ^ cls.de(r[n - 3], 15)
280
+ a = a ^ cls.de(a, 15) ^ cls.de(a, 23)
281
+ r[n] = (a ^ cls.de(r[n - 13], 7) ^ r[n - 6]) & 0xFFFFFFFF
282
+
283
+ for n in range(68, 132):
284
+ r[n] = (r[n - 68] ^ r[n - 64]) & 0xFFFFFFFF
285
+
286
+ return r
287
+
288
+ @staticmethod
289
+ def pad_array(arr, length=60):
290
+ while len(arr) < length:
291
+ arr.append(0)
292
+ return arr
293
+
294
+ def fill(self, length=60):
295
+ size = 8 * self.size
296
+ self.chunk.append(128)
297
+ self.chunk = self.pad_array(self.chunk, length)
298
+ for i in range(4):
299
+ self.chunk.append((size >> 8 * (3 - i)) & 255)
300
+
301
+ @staticmethod
302
+ def list_4(
303
+ a: int,
304
+ b: int,
305
+ c: int,
306
+ d: int,
307
+ e: int,
308
+ f: int,
309
+ g: int,
310
+ h: int,
311
+ i: int,
312
+ j: int,
313
+ k: int,
314
+ m: int,
315
+ n: int,
316
+ o: int,
317
+ p: int,
318
+ q: int,
319
+ r: int,
320
+ ) -> list:
321
+ return [
322
+ 44,
323
+ a,
324
+ 0,
325
+ 0,
326
+ 0,
327
+ 0,
328
+ 24,
329
+ b,
330
+ n,
331
+ 0,
332
+ c,
333
+ d,
334
+ 0,
335
+ 0,
336
+ 0,
337
+ 1,
338
+ 0,
339
+ 239,
340
+ e,
341
+ o,
342
+ f,
343
+ g,
344
+ 0,
345
+ 0,
346
+ 0,
347
+ 0,
348
+ h,
349
+ 0,
350
+ 0,
351
+ 14,
352
+ i,
353
+ j,
354
+ 0,
355
+ k,
356
+ m,
357
+ 3,
358
+ p,
359
+ 1,
360
+ q,
361
+ 1,
362
+ r,
363
+ 0,
364
+ 0,
365
+ 0]
366
+
367
+ @staticmethod
368
+ def end_check_num(a: list):
369
+ r = 0
370
+ for i in a:
371
+ r ^= i
372
+ return r
373
+
374
+ @classmethod
375
+ def decode_string(cls, url_string, ):
376
+ decoded = cls.__filter.sub(cls.replace_func, url_string)
377
+ return decoded
378
+
379
+ @staticmethod
380
+ def replace_func(match):
381
+ return chr(int(match.group(1), 16))
382
+
383
+ @staticmethod
384
+ def de(e, r):
385
+ r %= 32
386
+ return ((e << r) & 0xFFFFFFFF) | (e >> (32 - r))
387
+
388
+ @staticmethod
389
+ def pe(e):
390
+ return 2043430169 if 0 <= e < 16 else 2055708042
391
+
392
+ @staticmethod
393
+ def he(e, r, t, n):
394
+ if 0 <= e < 16:
395
+ return (r ^ t ^ n) & 0xFFFFFFFF
396
+ elif 16 <= e < 64:
397
+ return (r & t | r & n | t & n) & 0xFFFFFFFF
398
+ raise ValueError
399
+
400
+ @staticmethod
401
+ def ve(e, r, t, n):
402
+ if 0 <= e < 16:
403
+ return (r ^ t ^ n) & 0xFFFFFFFF
404
+ elif 16 <= e < 64:
405
+ return (r & t | ~r & n) & 0xFFFFFFFF
406
+ raise ValueError
407
+
408
+ @staticmethod
409
+ def convert_to_char_code(a):
410
+ d = []
411
+ for i in a:
412
+ d.append(ord(i))
413
+ return d
414
+
415
+ @staticmethod
416
+ def split_array(arr, chunk_size=64):
417
+ result = []
418
+ for i in range(0, len(arr), chunk_size):
419
+ result.append(arr[i:i + chunk_size])
420
+ return result
421
+
422
+ @staticmethod
423
+ def char_code_at(s):
424
+ return [ord(char) for char in s]
425
+
426
+ def write(self, e, ):
427
+ self.size = len(e)
428
+ if isinstance(e, str):
429
+ e = self.decode_string(e)
430
+ e = self.char_code_at(e)
431
+ if len(e) <= 64:
432
+ self.chunk = e
433
+ else:
434
+ chunks = self.split_array(e, 64)
435
+ for i in chunks[:-1]:
436
+ self.compress(i)
437
+ self.chunk = chunks[-1]
438
+
439
+ def reset(self, ):
440
+ self.chunk = []
441
+ self.size = 0
442
+ self.reg = self.__reg[:]
443
+
444
+ def sum(self, e, length=60):
445
+ self.reset()
446
+ self.write(e)
447
+ self.fill(length)
448
+ self.compress(self.chunk)
449
+ return self.reg_to_array(self.reg)
450
+
451
+ @classmethod
452
+ def generate_result_unit(cls, n, s):
453
+ r = ""
454
+ for i, j in zip(range(18, -1, -6), (16515072, 258048, 4032, 63)):
455
+ r += cls.__str[s][(n & j) >> i]
456
+ return r
457
+
458
+ @classmethod
459
+ def generate_result_end(cls, s, e="s4"):
460
+ r = ""
461
+ b = ord(s[120]) << 16
462
+ r += cls.__str[e][(b & 16515072) >> 18]
463
+ r += cls.__str[e][(b & 258048) >> 12]
464
+ r += "=="
465
+ return r
466
+
467
+ @classmethod
468
+ def generate_result(cls, s, e="s4"):
469
+ # r = ""
470
+ # for i in range(len(s)//4):
471
+ # b = ((ord(s[i * 3]) << 16) | (ord(s[i * 3 + 1]))
472
+ # << 8) | ord(s[i * 3 + 2])
473
+ # r += cls.generate_result_unit(b, e)
474
+ # return r
475
+
476
+ r = []
477
+
478
+ for i in range(0, len(s), 3):
479
+ if i + 2 < len(s):
480
+ n = (
481
+ (ord(s[i]) << 16)
482
+ | (ord(s[i + 1]) << 8)
483
+ | ord(s[i + 2])
484
+ )
485
+ elif i + 1 < len(s):
486
+ n = (ord(s[i]) << 16) | (
487
+ ord(s[i + 1]) << 8
488
+ )
489
+ else:
490
+ n = ord(s[i]) << 16
491
+
492
+ for j, k in zip(range(18, -1, -6),
493
+ (0xFC0000, 0x03F000, 0x0FC0, 0x3F)):
494
+ if j == 6 and i + 1 >= len(s):
495
+ break
496
+ if j == 0 and i + 2 >= len(s):
497
+ break
498
+ r.append(cls.__str[e][(n & k) >> j])
499
+
500
+ r.append("=" * ((4 - len(r) % 4) % 4))
501
+ return "".join(r)
502
+
503
+ @classmethod
504
+ def generate_args_code(cls):
505
+ a = []
506
+ for j in range(24, -1, -8):
507
+ a.append(cls.__arguments[0] >> j)
508
+ a.append(cls.__arguments[1] / 256)
509
+ a.append(cls.__arguments[1] % 256)
510
+ a.append(cls.__arguments[1] >> 24)
511
+ a.append(cls.__arguments[1] >> 16)
512
+ for j in range(24, -1, -8):
513
+ a.append(cls.__arguments[2] >> j)
514
+ return [int(i) & 255 for i in a]
515
+
516
+ def generate_method_code(self, method: str = "GET") -> list[int]:
517
+ return self.sm3_to_array(self.sm3_to_array(method + self.__end_string))
518
+ # return self.sum(self.sum(method + self.__end_string))
519
+
520
+ def generate_params_code(self, params: str) -> list[int]:
521
+ return self.sm3_to_array(self.sm3_to_array(params + self.__end_string))
522
+ # return self.sum(self.sum(params + self.__end_string))
523
+
524
+ @classmethod
525
+ def sm3_to_array(cls, data: str | list) -> list[int]:
526
+ """
527
+ 代码参考: https://github.com/Johnserf-Seed/f2/blob/main/f2/utils/abogus.py
528
+
529
+ 计算请求体的 SM3 哈希值,并将结果转换为整数数组
530
+ Calculate the SM3 hash value of the request body and convert the result to an array of integers
531
+
532
+ Args:
533
+ data (Union[str, List[int]]): 输入数据 (Input data).
534
+
535
+ Returns:
536
+ List[int]: 哈希值的整数数组 (Array of integers representing the hash value).
537
+ """
538
+
539
+ if isinstance(data, str):
540
+ b = data.encode("utf-8")
541
+ else:
542
+ b = bytes(data) # 将 List[int] 转换为字节数组
543
+
544
+ # 将字节数组转换为适合 sm3.sm3_hash 函数处理的列表格式
545
+ h = sm3.sm3_hash(func.bytes_to_list(b))
546
+
547
+ # 将十六进制字符串结果转换为十进制整数列表
548
+ return [int(h[i: i + 2], 16) for i in range(0, len(h), 2)]
549
+
550
+ @classmethod
551
+ def generate_browser_info(cls, platform: str = "Win32") -> str:
552
+ inner_width = randint(1280, 1920)
553
+ inner_height = randint(720, 1080)
554
+ outer_width = randint(inner_width, 1920)
555
+ outer_height = randint(inner_height, 1080)
556
+ screen_x = 0
557
+ screen_y = choice((0, 30))
558
+ value_list = [
559
+ inner_width,
560
+ inner_height,
561
+ outer_width,
562
+ outer_height,
563
+ screen_x,
564
+ screen_y,
565
+ 0,
566
+ 0,
567
+ outer_width,
568
+ outer_height,
569
+ outer_width,
570
+ outer_height,
571
+ inner_width,
572
+ inner_height,
573
+ 24,
574
+ 24,
575
+ platform,
576
+ ]
577
+ return "|".join(str(i) for i in value_list)
578
+
579
+ @staticmethod
580
+ def rc4_encrypt(plaintext, key):
581
+ s = list(range(256))
582
+ j = 0
583
+
584
+ for i in range(256):
585
+ j = (j + s[i] + ord(key[i % len(key)])) % 256
586
+ s[i], s[j] = s[j], s[i]
587
+
588
+ i = 0
589
+ j = 0
590
+ cipher = []
591
+
592
+ for k in range(len(plaintext)):
593
+ i = (i + 1) % 256
594
+ j = (j + s[i]) % 256
595
+ s[i], s[j] = s[j], s[i]
596
+ t = (s[i] + s[j]) % 256
597
+ cipher.append(chr(s[t] ^ ord(plaintext[k])))
598
+
599
+ return ''.join(cipher)
600
+
601
+ def get_value(self,
602
+ url_params: dict | str,
603
+ method="GET",
604
+ start_time=0,
605
+ end_time=0,
606
+ random_num_1=None,
607
+ random_num_2=None,
608
+ random_num_3=None,
609
+ ) -> str:
610
+ string_1 = self.generate_string_1(
611
+ random_num_1,
612
+ random_num_2,
613
+ random_num_3,
614
+ )
615
+ string_2 = self.generate_string_2(urlencode(url_params) if isinstance(
616
+ url_params, dict) else url_params, method, start_time, end_time, )
617
+ string = string_1 + string_2
618
+ # return self.generate_result(
619
+ # string, "s4") + self.generate_result_end(string, "s4")
620
+ return self.generate_result(string, "s4")
621
+
622
+
623
if __name__ == "__main__":
    # Manual demo: sign a sample aweme-detail request and print the result.
    bogus = ABogus()
    USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    url_str = "https://www.douyin.com/aweme/v1/web/aweme/detail/?device_platform=webapp&aid=6383&channel=channel_pc_web&pc_client_type=1&version_code=190500&version_name=19.5.0&cookie_enabled=true&browser_language=zh-CN&browser_platform=Win32&browser_name=Firefox&browser_online=true&engine_name=Gecko&os_name=Windows&os_version=10&platform=PC&screen_width=1920&screen_height=1080&browser_version=124.0&engine_version=122.0.0.0&cpu_core_num=12&device_memory=8&aweme_id=7345492945006595379"
    # Turn the query string into a dict of parameters.
    query_string = url_str.split("?")[1]
    url_params = dict(pair.split("=") for pair in query_string.split("&"))
    print(f"URL参数: {url_params}")
    # URL-encode the signature so it can be appended to a query string.
    a_bogus = quote(bogus.get_value(url_params, ), safe='')
    print(a_bogus)
    print(USERAGENT)
backend/app/downloaders/generic_downloader.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """通用 yt-dlp 下载器:用于用户在「下载配置」里登记的自定义平台。
2
+
3
+ 不做任何站点特定逻辑——完全依赖 yt-dlp 内置 extractor。只把:
4
+ - 该平台的 Cookie/cookies-from-browser 注入 ydl_opts
5
+ - 全局代理注入 ydl_opts
6
+ """
7
+ import logging
8
+ import os
9
+ import tempfile
10
+ from abc import ABC
11
+ from typing import Optional, Union
12
+
13
+ import yt_dlp
14
+
15
+ from app.downloaders.base import Downloader, DownloadQuality
16
+ from app.models.notes_model import AudioDownloadResult
17
+ from app.services.cookie_manager import CookieConfigManager
18
+ from app.services.proxy_config_manager import ProxyConfigManager
19
+ from app.utils.path_helper import get_data_dir
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class GenericYtdlpDownloader(Downloader, ABC):
    """Thin wrapper around yt-dlp for any site it supports.

    Cookie settings are looked up by platform key. The class adds no
    site-specific logic and only injects cookies and the global proxy into
    the yt-dlp options.
    """

    def __init__(self, platform: str, cookie_domain: Optional[str] = None):
        """Load the platform's cookie configuration.

        Args:
            platform: Platform key used to look up cookie settings.
            cookie_domain: Domain written into the Netscape cookie file;
                defaults to ``".<platform>.com"``.
        """
        super().__init__()
        self.platform = platform
        # The Netscape cookie format needs a domain column; fall back to a
        # generic one when the caller does not know it.
        self.cookie_domain = cookie_domain or f".{platform}.com"
        cookie_config = CookieConfigManager()
        self._cookie = cookie_config.get(platform)
        self._browser = cookie_config.get_browser(platform)
        # Browser-sourced cookies take precedence; no file is written then.
        self._cookiefile = None if self._browser else self._write_netscape_cookie_file()

    def _write_netscape_cookie_file(self) -> Optional[str]:
        """Write the configured cookie string to a Netscape-format temp file.

        Pairs are split on ``;`` with surrounding whitespace ignored, so both
        ``"a=1; b=2"`` and ``"a=1;b=2"`` are accepted. Entries without ``=``
        or without a name are skipped.

        Returns:
            Path of the temp file, or ``None`` when no cookie is configured.
            The file is not deleted by this class.
        """
        if not self._cookie:
            return None

        # The "include subdomains" column must agree with whether the domain
        # starts with a dot, otherwise cookie-jar loaders reject the line.
        subdomain_flag = "TRUE" if self.cookie_domain.startswith(".") else "FALSE"

        lines = ["# Netscape HTTP Cookie File\n"]
        for pair in self._cookie.split(";"):
            name, separator, value = pair.strip().partition("=")
            if not separator or not name:
                continue
            lines.append(
                f"{self.cookie_domain}\t{subdomain_flag}\t/\tFALSE\t0\t{name}\t{value}\n"
            )

        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', delete=False, encoding='utf-8'
        ) as tmp:
            tmp.writelines(lines)
        logger.info("已生成 [%s] Netscape Cookie 文件: %s", self.platform, tmp.name)
        return tmp.name

    def _resolve_output_dir(self, output_dir: Optional[str]) -> str:
        """Choose the output directory and make sure it exists.

        ``None`` means "use ``get_data_dir()``"; a falsy result falls back to
        ``self.cache_data``.
        """
        if output_dir is None:
            output_dir = get_data_dir()
        if not output_dir:
            output_dir = self.cache_data
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def _apply_ydl_extras(self, ydl_opts: dict) -> None:
        """Add the global proxy and this platform's cookies to ``ydl_opts`` in place."""
        proxy = ProxyConfigManager().get_proxy_url()
        if proxy:
            ydl_opts['proxy'] = proxy
        if self._browser:
            ydl_opts['cookiesfrombrowser'] = (self._browser,)
        elif self._cookiefile:
            ydl_opts['cookiefile'] = self._cookiefile

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: DownloadQuality = "fast",
        need_video: Optional[bool] = False,
        skip_download: bool = False,
    ) -> AudioDownloadResult:
        """Fetch the best audio stream (or metadata only) via yt-dlp.

        Args:
            video_url: URL handed to yt-dlp.
            output_dir: Target directory; see ``_resolve_output_dir``.
            quality: Accepted for interface compatibility; not used here.
            need_video: Accepted for interface compatibility; not used here.
            skip_download: When true, only metadata is extracted.

        Returns:
            The download result; ``file_path`` is built from the reported id
            and extension and ``video_path`` is always ``None``.
        """
        output_dir = self._resolve_output_dir(output_dir)

        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': output_path,
            'noplaylist': True,
            'quiet': False,
        }
        if skip_download:
            ydl_opts['skip_download'] = True
        self._apply_ydl_extras(ydl_opts)

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=not skip_download)
            video_id = info.get("id") or "unknown"
            title = info.get("title") or self.platform
            duration = info.get("duration", 0)
            cover_url = info.get("thumbnail")
            ext = info.get("ext", "mp3")
            audio_path = os.path.join(output_dir, f"{video_id}.{ext}")

        return AudioDownloadResult(
            file_path=audio_path,
            title=title,
            duration=duration,
            cover_url=cover_url,
            platform=self.platform,
            video_id=video_id,
            raw_info={'tags': info.get('tags')},
            video_path=None,
        )

    def download_video(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
    ) -> str:
        """Download the best video+audio merged as MP4 and return its path.

        Raises:
            FileNotFoundError: If the expected ``<id>.mp4`` is missing after
                the download.
        """
        # Same directory fallback as ``download``.
        output_dir = self._resolve_output_dir(output_dir)
        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
        ydl_opts = {
            'format': 'bestvideo+bestaudio/best',
            'outtmpl': output_path,
            'noplaylist': True,
            'quiet': False,
            'merge_output_format': 'mp4',
        }
        self._apply_ydl_extras(ydl_opts)
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)
            video_id = info.get("id")
            video_path = os.path.join(output_dir, f"{video_id}.mp4")
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"视频文件未找到: {video_path}")
        return video_path
backend/app/downloaders/kuaishou_downloader.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from abc import ABC
4
+ from typing import Union, Optional
5
+
6
+ import requests
7
+
8
+ from app.downloaders.base import Downloader
9
+ from app.downloaders.kuaishou_helper.kuaishou import KuaiShou
10
+ from app.enmus.note_enums import DownloadQuality
11
+ from app.models.audio_model import AudioDownloadResult
12
+ from app.utils.path_helper import get_data_dir
13
+
14
+
15
class KuaiShouDownloader(Downloader, ABC):
    """Downloader for Kuaishou share links.

    Metadata comes from the ``KuaiShou`` helper. The video is downloaded as
    MP4 and transcoded to MP3 with ffmpeg.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _build_result(
        video_raw_info: dict,
        title: str,
        file_path: str,
        video_path: Optional[str],
    ) -> AudioDownloadResult:
        """Assemble the result object from the helper's raw payload."""
        photo_info = video_raw_info['visionVideoDetail']['photo']
        tags = ','.join(
            tag['name'] for tag in video_raw_info.get('tags', []) if tag.get('name')
        )
        return AudioDownloadResult(
            file_path=file_path,
            title=title,
            duration=photo_info['duration'],
            cover_url=photo_info['coverUrl'],
            platform="kuaishou",
            video_id=photo_info['id'],
            raw_info={'tags': tags},
            video_path=video_path,
        )

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: str = "fast",
        need_video: Optional[bool] = False,
        skip_download: bool = False,
    ) -> AudioDownloadResult:
        """Resolve a share link and produce an MP3 plus the source MP4.

        Args:
            video_url: Share text or URL.
            output_dir: Target directory; ``None`` uses ``get_data_dir()``
                and a falsy result falls back to ``self.cache_data``.
            quality: Accepted for interface compatibility; not used here.
            need_video: Accepted for interface compatibility; not used here.
            skip_download: When true, only metadata is returned, with an
                empty file path and no video path, as the sibling
                downloaders do.

        Returns:
            The download result. When the MP3 is already on disk it is
            reused and no network transfer of the video takes place.

        Raises:
            Exception: If the video request does not return HTTP 200 or
                ffmpeg fails.
        """
        if output_dir is None:
            output_dir = get_data_dir()
        if not output_dir:
            output_dir = self.cache_data
        os.makedirs(output_dir, exist_ok=True)

        video_raw_info = KuaiShou().run(video_url)
        photo_info = video_raw_info['visionVideoDetail']['photo']
        video_id = photo_info['id']

        if skip_download:
            return self._build_result(video_raw_info, photo_info['caption'], "", None)

        mp4_path = os.path.join(output_dir, f"{video_id}.mp4")
        mp3_path = os.path.join(output_dir, f"{video_id}.mp3")

        if os.path.exists(mp3_path):
            print(f"[已存在] 跳过下载: {mp3_path}")
            # NOTE(review): the cached path reports a shortened, sanitized
            # title while a fresh download reports the raw caption. Kept
            # as-is; confirm which one callers expect.
            title = photo_info['caption'].strip().replace('\n', '').replace(' ', '_')[:50]
            return self._build_result(video_raw_info, title, mp3_path, mp4_path)

        # Download the MP4. A timeout keeps a stalled connection from
        # hanging the task, and the context manager releases the connection.
        with requests.get(photo_info['photoUrl'], stream=True, timeout=120) as resp:
            if resp.status_code != 200:
                raise Exception(f"视频下载失败: {resp.status_code}")
            with open(mp4_path, "wb") as f:
                for chunk in resp.iter_content(1024 * 1024):
                    f.write(chunk)

        # Transcode to MP3 with ffmpeg.
        try:
            subprocess.run([
                "ffmpeg", "-y", "-i", mp4_path, "-vn", "-acodec", "libmp3lame", mp3_path
            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError as exc:
            raise Exception("ffmpeg 转换 MP3 失败") from exc

        return self._build_result(video_raw_info, photo_info['caption'], mp3_path, mp4_path)

    def download_video(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
    ) -> str:
        """Return the MP4 path for ``video_url``, downloading if necessary."""
        # Resolve exactly once; every call repeats the metadata lookup.
        return self.download(video_url, output_dir).video_path
93
+
94
+
95
if __name__ == '__main__':
    # Manual smoke test against a sample share text.
    sample_share = 'https://v.kuaishou.com/2vBqX74 王宝强携手刘昊然、岳云鹏上演精彩名场面 全程高能 看一遍笑一遍 "唐探1900 "快成长计划 ...更多'
    KuaiShouDownloader().download(sample_share)
backend/app/downloaders/kuaishou_helper/__init__.py ADDED
File without changes
backend/app/downloaders/kuaishou_helper/kuaishou.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import re
4
+
5
+ import requests
6
+ from dotenv import load_dotenv
7
+
8
+ from app.services.cookie_manager import CookieConfigManager
9
+ from app.utils.logger import get_logger
10
# GraphQL endpoint that the video-detail query is posted to.
KUAISHOU_API_BASE = 'https://www.kuaishou.com/graphql'
# Site home page; fetched to pick up cookies when none are configured.
KUAISHOU_URL = "https://www.kuaishou.com/"
# Load variables from a .env file into the process environment.
load_dotenv()
# Browser-like request headers used as the template for every request;
# each KuaiShou instance works on its own copy.
headers = {
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # No 'Cookie' entry here: KuaiShou.run sets it on the per-instance copy.
    'Origin': 'https://www.kuaishou.com',
    'Pragma': 'no-cache',
    'Referer': 'https://www.kuaishou.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'accept': '*/*',
    'content-type': 'application/json',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

logger = get_logger(__name__)

# Module-wide cookie configuration store, queried for the 'kuaishou' entry.
cfm=CookieConfigManager()
36
class KuaiShou:
    """Resolve a Kuaishou share link into the video-detail GraphQL payload.

    ``run`` is the entry point: it extracts the URL from free-form share text,
    obtains cookies, follows the short link to find the photo id and finally
    posts the ``visionVideoDetail`` query.
    """

    # First http(s) URL embedded in free-form text (same pattern as before,
    # now a raw string so ``\(`` is not an invalid string escape).
    _URL_PATTERN = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # Photo id is the path segment following ``short-video/``.
    _PHOTO_ID_PATTERN = re.compile(r'short-video/(\w+)')

    def __init__(self):
        # Per-instance copy so adding the Cookie header does not leak into
        # the module-level template.
        self.header = headers.copy()
        self.cookie = None

    @staticmethod
    def _extract_kuaishou_link(text):
        """Return the first URL found in ``text``, or ``None`` if there is none."""
        if not text:
            return None
        match = KuaiShou._URL_PATTERN.search(text)
        return match.group(0) if match else None

    def get_photo_id(self, url):
        """Follow redirects from ``url`` and return the photo id, or ``None``.

        The id is read from the ``short-video/<id>`` part of the final URL.
        """
        response = requests.get(url, allow_redirects=True, headers=self.header)
        match = self._PHOTO_ID_PATTERN.search(response.url)
        return match.group(1) if match else None

    def get_temp_cookies(self):
        """Return the configured Kuaishou cookie, else cookies from the home page.

        The cookie value is intentionally not printed or logged.
        """
        configured = cfm.get('kuaishou')
        if configured:
            return configured
        res = requests.get(url=KUAISHOU_URL, headers=self.header, allow_redirects=True)
        return '; '.join(f"{k}={v}" for k, v in res.cookies.get_dict().items())

    def get_video_details(self, url, photo_id):
        """Post the ``visionVideoDetail`` query for ``photo_id``.

        :param url: unused; kept for interface compatibility
        :param photo_id: id sent as the ``photoId`` query variable
        :return: decoded JSON response, or ``None`` when the status is not 200
        """
        json_data = {
            'operationName': 'visionVideoDetail',
            "variables": {"photoId": photo_id, "page": "detail"},
            "query": "query visionVideoDetail($photoId: String, $type: String, $page: String, $webPageArea: String) {\n visionVideoDetail(photoId: $photoId, type: $type, page: $page, webPageArea: $webPageArea) {\n status\n type\n author {\n id\n name\n following\n headerUrl\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n photoUrl\n liked\n timestamp\n expTag\n llsid\n viewCount\n videoRatio\n stereoType\n croppedPhotoUrl\n manifest {\n mediaType\n businessType\n version\n adaptationSet {\n id\n duration\n representation {\n id\n defaultSelect\n backupUrl\n codecs\n url\n height\n width\n avgBitrate\n maxBitrate\n m3u8Slice\n qualityType\n qualityLabel\n frameRate\n featureP2sp\n hidden\n disableAdaptive\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n commentLimit {\n canAddComment\n __typename\n }\n llsid\n danmakuSwitch\n __typename\n }\n}\n"
        }
        response = requests.post(url=KUAISHOU_API_BASE, headers=self.header, json=json_data)
        if response.status_code != 200:
            return None
        return response.json()

    def run(self, url):
        """Resolve share text ``url`` and return the ``data`` part of the detail response.

        :raises ValueError: when no URL can be extracted, the photo id cannot be
            determined, or the detail request does not return status 200.
            Previously these cases were logged and then crashed a few lines
            later with an unrelated IndexError/AttributeError/TypeError.
        """
        real_url = self._extract_kuaishou_link(url)
        if not real_url:
            message = f"快手视频 URL 解析失败 {url}"
            logger.error(message)
            raise ValueError(message)

        cookies = self.get_temp_cookies()
        if not cookies:
            # Best effort, as before: report it and carry on with an empty cookie.
            logger.error(f"快手视频 cookies 解析失败 {url},请考虑设置环境变量 KUAISHOU_COOKIES")
        self.header['Cookie'] = (cookies or '').strip()

        photo_id = self.get_photo_id(real_url)
        if photo_id is None:
            message = f"快手视频 ID 解析失败 {url}"
            logger.error(message)
            raise ValueError(message)

        video_details = self.get_video_details(real_url, photo_id)
        if video_details is None:
            message = f"快手视频详情解析失败 {url}"
            logger.error(message)
            raise ValueError(message)
        return video_details['data']
96
+
97
+
98
if __name__ == '__main__':
    # Manual smoke test: resolve a raw share text (short link plus caption).
    share_text = 'https://v.kuaishou.com/2vBqX74 王宝强携手刘昊然、岳云鹏上演精彩名场面 全程高能 看一遍笑一遍 "唐探1900 "快成长计划 ...更多'
    KuaiShou().run(share_text)
backend/app/downloaders/local_downloader.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from abc import ABC
4
+ from typing import Optional
5
+
6
+ from app.downloaders.base import Downloader
7
+ from app.enmus.note_enums import DownloadQuality
8
+ from app.models.audio_model import AudioDownloadResult
9
+ import os
10
+ import subprocess
11
+
12
+ from app.utils.video_helper import save_cover_to_static
13
+
14
+
15
class LocalDownloader(Downloader, ABC):
    """Downloader for media files that already exist on the local disk.

    Nothing is fetched: ``download`` converts the file to MP3 with ffmpeg,
    extracts a cover frame and returns the resulting metadata.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _resolve_local_path(video_url: str) -> str:
        """Turn ``video_url`` into an existing local path.

        Values starting with ``/uploads`` are resolved relative to the current
        working directory; anything else is used as given.

        :raises FileNotFoundError: if the resulting path does not exist
        """
        if video_url.startswith('/uploads'):
            project_root = os.getcwd()
            video_url = os.path.normpath(os.path.join(project_root, video_url.lstrip('/')))
        if not os.path.exists(video_url):
            raise FileNotFoundError(f"本地文件不存在: {video_url}")
        return video_url

    def extract_cover(self, input_path: str, output_dir: Optional[str] = None) -> str:
        """Extract one frame of a local video as a JPEG cover image.

        :param input_path: path of the input video
        :param output_dir: directory for the image; defaults to the video's directory
        :return: path of the generated ``<name>_cover.jpg``
        :raises FileNotFoundError: if ``input_path`` does not exist
        :raises RuntimeError: if ffmpeg fails or produces no image
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"输入文件不存在: {input_path}")

        if output_dir is None:
            output_dir = os.path.dirname(input_path)

        base_name = os.path.splitext(os.path.basename(input_path))[0]
        output_path = os.path.join(output_dir, f"{base_name}_cover.jpg")

        command = [
            'ffmpeg',
            '-i', input_path,
            '-ss', '00:00:01',  # seek to the 1 s mark to avoid a black opening frame
            '-vframes', '1',    # grab a single frame
            '-q:v', '2',        # qscale 2: high JPEG quality
            '-y',               # overwrite an existing image
            output_path
        ]
        try:
            subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"提取封面失败: {output_path}") from e

        if not os.path.exists(output_path):
            raise RuntimeError(f"封面图片生成失败: {output_path}")
        return output_path

    def convert_to_mp3(self, input_path: str, output_path: Optional[str] = None) -> str:
        """Convert a local media file to MP3 with ffmpeg.

        :param input_path: path of the input file (e.g. ``.mp4``)
        :param output_path: target path; defaults to the input path with ``.mp3``
        :return: path of the MP3 file
        :raises FileNotFoundError: if ``input_path`` does not exist
        :raises RuntimeError: if ffmpeg fails or produces no file
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"输入文件不存在: {input_path}")

        if output_path is None:
            base, _ = os.path.splitext(input_path)
            output_path = base + ".mp3"

        # An input that is already the target file (e.g. an uploaded .mp3)
        # needs no conversion; ffmpeg refuses to write onto its own input.
        if os.path.exists(output_path) and os.path.samefile(input_path, output_path):
            return output_path

        command = [
            'ffmpeg',
            '-i', input_path,
            '-vn',                     # drop the video stream
            '-acodec', 'libmp3lame',   # encode as MP3
            '-y',                      # overwrite the output file
            output_path
        ]
        try:
            subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"mp3 文件生成失败: {output_path}") from e

        if not os.path.exists(output_path):
            raise RuntimeError(f"mp3 文件生成失败: {output_path}")
        return output_path

    def download_video(self, video_url: str, output_dir: str = None) -> str:
        """Resolve a local file reference and return the video file path.

        :param video_url: local path, or a path starting with ``/uploads``
        :param output_dir: unused
        :raises FileNotFoundError: if the file does not exist
        """
        return self._resolve_local_path(video_url)

    def download(
        self,
        video_url: str,
        output_dir: str = None,
        quality: DownloadQuality = "fast",
        need_video: Optional[bool] = False
    ) -> AudioDownloadResult:
        """Convert a local file to MP3 and return its audio metadata.

        :param video_url: local path, or a path starting with ``/uploads``
        :param output_dir: unused
        :param quality: unused
        :param need_video: unused
        :raises FileNotFoundError: if the file does not exist
        :raises RuntimeError: if audio conversion or cover extraction fails
        """
        video_url = self._resolve_local_path(video_url)

        file_name = os.path.basename(video_url)
        title, _ = os.path.splitext(file_name)
        file_path = self.convert_to_mp3(video_url)
        cover_path = self.extract_cover(video_url)
        cover_url = save_cover_to_static(cover_path)

        return AudioDownloadResult(
            file_path=file_path,
            title=title,
            duration=0,  # duration is not read from the file yet
            cover_url=cover_url,
            platform="local",
            video_id=title,
            raw_info={
                'path': file_path
            },
            video_path=None
        )
backend/app/downloaders/xiaohongshu_downloader.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """小红书下载器:基于 yt-dlp 内置 XiaoHongShu extractor。
2
+
3
+ URL 模式:
4
+ - https://www.xiaohongshu.com/explore/{id}
5
+ - https://www.xiaohongshu.com/discovery/item/{id}
6
+ - 短链 xhslink.com/xxx 由 yt-dlp 自行跟随重定向
7
+
8
+ 小红书很多内容是图文笔记(无视频/音频)。无视频的会触发 yt-dlp 报「请求格式不可用」,
9
+ 前端会展示生成失败——这是预期行为,不强行兜底。
10
+ """
11
+ import os
12
+ import logging
13
+ import tempfile
14
+ from abc import ABC
15
+ from typing import Union, Optional
16
+
17
+ import yt_dlp
18
+
19
+ from app.downloaders.base import Downloader, DownloadQuality
20
+ from app.models.notes_model import AudioDownloadResult
21
+ from app.services.cookie_manager import CookieConfigManager
22
+ from app.utils.path_helper import get_data_dir
23
+ from app.utils.url_parser import extract_video_id, clean_url
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class XiaohongshuDownloader(Downloader, ABC):
    """Xiaohongshu downloader built on yt-dlp.

    Cookies come either from a configured browser (``cookiesfrombrowser``) or,
    failing that, from a configured cookie string written to a temporary
    Netscape-format file.
    """

    def __init__(self):
        super().__init__()
        self._cookie_mgr = CookieConfigManager()
        self._cookie = self._cookie_mgr.get('xiaohongshu')
        self._browser = self._cookie_mgr.get_browser('xiaohongshu')
        # A browser source takes precedence; the cookie file is only written without one.
        self._cookiefile = None if self._browser else self._write_netscape_cookie_file()

    def _write_netscape_cookie_file(self) -> Optional[str]:
        """Write the configured cookie string to a Netscape-format temp file.

        :return: path of the temp file, or ``None`` when no cookie is configured
        """
        if not self._cookie:
            logger.warning("小红书 Cookie 未配置,部分内容可能下载失败")
            return None
        lines = ["# Netscape HTTP Cookie File\n"]
        # Split on ';' and trim each piece so both "a=1; b=2" and "a=1;b=2"
        # (and stray newlines from pasting) yield one entry per cookie.
        for pair in self._cookie.split(";"):
            key, sep, value = pair.strip().partition("=")
            key = key.strip()
            if not sep or not key:
                continue
            lines.append(f".xiaohongshu.com\tTRUE\t/\tFALSE\t0\t{key}\t{value.strip()}\n")
        tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8')
        tmp.writelines(lines)
        tmp.close()
        logger.info("已生成小红书 Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1)
        return tmp.name

    def _apply_cookie(self, ydl_opts: dict) -> None:
        """Add the cookie source (browser first, then cookie file) to ``ydl_opts`` in place."""
        if self._browser:
            ydl_opts['cookiesfrombrowser'] = (self._browser,)
            logger.info(f"小红书使用 cookies-from-browser: {self._browser}")
        elif self._cookiefile:
            ydl_opts['cookiefile'] = self._cookiefile

    def _resolve_output_dir(self, output_dir: Union[str, None]) -> str:
        """Pick the output directory: argument, else data dir, else ``self.cache_data``; create it."""
        if output_dir is None:
            output_dir = get_data_dir()
        if not output_dir:
            output_dir = self.cache_data
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: DownloadQuality = "fast",
        need_video: Optional[bool] = False,
        skip_download: bool = False,
    ) -> AudioDownloadResult:
        """Download the audio of a Xiaohongshu note, or only its metadata.

        :param video_url: note URL or pasted share text containing it
        :param output_dir: target directory; falls back to the data dir, then ``self.cache_data``
        :param quality: unused
        :param need_video: unused
        :param skip_download: when true, only metadata is extracted
        :return: metadata with ``file_path`` set to ``<output_dir>/<id>.<ext>``
        """
        # Pull the clean link out of pasted share text (title + short link).
        video_url = clean_url(video_url)
        output_dir = self._resolve_output_dir(output_dir)

        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': output_path,
            'noplaylist': True,
            'quiet': False,
        }
        if skip_download:
            ydl_opts['skip_download'] = True
        self._apply_cookie(ydl_opts)

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=not skip_download)
            video_id = info.get("id")
            title = info.get("title")
            duration = info.get("duration", 0)
            cover_url = info.get("thumbnail")
            ext = info.get("ext", "mp3")
            audio_path = os.path.join(output_dir, f"{video_id}.{ext}")

        return AudioDownloadResult(
            file_path=audio_path,
            title=title,
            duration=duration,
            cover_url=cover_url,
            platform="xiaohongshu",
            video_id=video_id,
            raw_info={'tags': info.get('tags')},
            video_path=None,
        )

    def download_video(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
    ) -> str:
        """Download the note's video as MP4 and return the file path.

        An already existing ``<id>.mp4`` in the output directory is reused.

        :raises FileNotFoundError: if the MP4 is missing after the download
        """
        video_url = clean_url(video_url)
        # Same directory fallback as ``download``; previously a missing data
        # dir reached ``os.path.join`` unchecked.
        output_dir = self._resolve_output_dir(output_dir)
        video_id = extract_video_id(video_url, "xiaohongshu")
        video_path = os.path.join(output_dir, f"{video_id}.mp4")
        if os.path.exists(video_path):
            return video_path
        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
        ydl_opts = {
            'format': 'bestvideo+bestaudio/best',
            'outtmpl': output_path,
            'noplaylist': True,
            'quiet': False,
            'merge_output_format': 'mp4',
        }
        self._apply_cookie(ydl_opts)
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)
            video_id = info.get("id")
            video_path = os.path.join(output_dir, f"{video_id}.mp4")
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"视频文件未找到: {video_path}")
        return video_path
backend/app/downloaders/xiaoyuzhoufm_download.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union, Optional
2
+
3
+ import requests
4
+
5
+ from app.downloaders.base import Downloader
6
+ from app.enmus.note_enums import DownloadQuality
7
+ from app.models.audio_model import AudioDownloadResult
8
+
9
# Sample podcast JSON URL (a `_next/data` path) kept for manual probing.
url='https://www.xiaoyuzhoufm.com/_next/data/5Pvt_oGntgdyBD_XgwBaB/podcast/62382c1103bea1ebfffa1c00.json?id=62382c1103bea1ebfffa1c00'
header ={
    'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36'
}

if __name__ == '__main__':
    # Only hit the network when the file is run directly. At module level this
    # request ran on every import, and a failed request or non-JSON response
    # would make the import itself raise.
    response = requests.get(url, headers=header)
    print(response.json())
16
+
17
class Xiaoyuzhoufm_download(Downloader):
    """Placeholder downloader for xiaoyuzhoufm.com; no download logic exists yet."""

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: DownloadQuality = "fast",
        need_video:Optional[bool]=False
    ) -> AudioDownloadResult:
        """Not implemented.

        :raises NotImplementedError: always. The earlier stub returned ``None``
            despite the declared ``AudioDownloadResult`` return type, so callers
            would only fail later on attribute access.
        """
        raise NotImplementedError("小宇宙 FM 下载功能尚未实现")
backend/app/downloaders/youtube_downloader.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import tempfile
4
+ from abc import ABC
5
+ from typing import Union, Optional, List
6
+
7
+ import yt_dlp
8
+
9
+ from app.downloaders.base import Downloader, DownloadQuality
10
+ from app.downloaders.youtube_subtitle import YouTubeSubtitleFetcher
11
+ from app.models.notes_model import AudioDownloadResult
12
+ from app.models.transcriber_model import TranscriptResult
13
+ from app.services.cookie_manager import CookieConfigManager
14
+ from app.services.proxy_config_manager import ProxyConfigManager
15
+ from app.utils.path_helper import get_data_dir
16
+ from app.utils.url_parser import extract_video_id
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def _apply_proxy(ydl_opts: dict) -> dict:
    """Add the globally configured proxy, if any, to the yt-dlp options.

    The dict is modified in place and also returned.
    """
    proxy_url = ProxyConfigManager().get_proxy_url()
    if not proxy_url:
        return ydl_opts
    ydl_opts['proxy'] = proxy_url
    logger.info(f"yt-dlp 走代理: {proxy_url}")
    return ydl_opts
28
+
29
+
30
def _apply_youtube_extractor_args(ydl_opts: dict) -> dict:
    """Optionally pin the YouTube ``player_client`` list from the environment.

    By default nothing is overridden and yt-dlp's built-in client selection is
    used. Per the original author's notes, a hard-coded ``['tv', 'web_safari']``
    list (added for yt-dlp issue #12482) later ran into "This video is DRM
    protected" errors on the tv client (issue #12563), so following yt-dlp's
    defaults was preferred over maintaining a fixed list.

    Set ``YT_PLAYER_CLIENT`` to a comma-separated list to override temporarily,
    e.g. ``YT_PLAYER_CLIENT=web_safari,android_vr``.

    The dict is modified in place and also returned.
    """
    raw_value = os.getenv('YT_PLAYER_CLIENT', '').strip()
    if not raw_value:
        return ydl_opts

    names = [part.strip() for part in raw_value.split(',')]
    youtube_args = ydl_opts.setdefault('extractor_args', {}).setdefault('youtube', {})
    youtube_args['player_client'] = [name for name in names if name]
    return ydl_opts
51
+
52
+
53
class YoutubeDownloader(Downloader, ABC):
    """YouTube downloader built on yt-dlp, with an oEmbed metadata fallback.

    Cookies come either from a configured browser (``cookiesfrombrowser``) or,
    failing that, from a configured cookie string written to a temporary
    Netscape-format file.
    """

    def __init__(self):
        super().__init__()
        self._cookie_mgr = CookieConfigManager()
        self._cookie = self._cookie_mgr.get('youtube')
        # Precedence: live browser cookies > pasted cookie string. Per the
        # original note, browser cookies avoid YouTube's session-rotation checks.
        self._browser = self._cookie_mgr.get_browser('youtube')
        self._cookiefile = None if self._browser else self._write_netscape_cookie_file()

    def _write_netscape_cookie_file(self) -> Optional[str]:
        """Write the configured cookie string to a Netscape-format temp file.

        :return: path of the temp file for yt-dlp's ``cookiefile`` option, or
            ``None`` when no cookie is configured
        """
        if not self._cookie:
            logger.warning("YouTube Cookie 未配置,下载可能会被风控为机器人")
            return None
        lines = ["# Netscape HTTP Cookie File\n"]
        # Split on ';' and trim each piece so both "a=1; b=2" and "a=1;b=2"
        # (and stray newlines from pasting) yield one entry per cookie.
        for pair in self._cookie.split(";"):
            key, sep, value = pair.strip().partition("=")
            key = key.strip()
            if not sep or not key:
                continue
            lines.append(f".youtube.com\tTRUE\t/\tFALSE\t0\t{key}\t{value.strip()}\n")
        tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8')
        tmp.writelines(lines)
        tmp.close()
        logger.info("已生成 YouTube Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1)
        return tmp.name

    def _apply_cookie(self, ydl_opts: dict) -> None:
        """Add the cookie source (browser first, then cookie file) to ``ydl_opts`` in place."""
        if self._browser:
            # The (browser_name,) form is enough; profile/keyring/container stay default.
            ydl_opts['cookiesfrombrowser'] = (self._browser,)
            logger.info(f"YouTube 使用 cookies-from-browser: {self._browser}")
        elif self._cookiefile:
            ydl_opts['cookiefile'] = self._cookiefile

    def _resolve_output_dir(self, output_dir: Union[str, None]) -> str:
        """Pick the output directory: argument, else data dir, else ``self.cache_data``; create it."""
        if output_dir is None:
            output_dir = get_data_dir()
        if not output_dir:
            output_dir = self.cache_data
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: DownloadQuality = "fast",
        need_video: Optional[bool] = False,
        skip_download: bool = False,
    ) -> AudioDownloadResult:
        """Download the audio track of a YouTube video, or only its metadata.

        :param video_url: video URL
        :param output_dir: target directory; falls back to the data dir, then ``self.cache_data``
        :param quality: unused
        :param need_video: unused
        :param skip_download: when true, only metadata is extracted; if yt-dlp
            fails in this mode the oEmbed fallback is used instead of raising
        :return: metadata with ``file_path`` set to ``<output_dir>/<id>.<ext>``
        """
        output_dir = self._resolve_output_dir(output_dir)
        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")

        ydl_opts = {
            'format': 'bestaudio[ext=m4a]/bestaudio/best',
            'outtmpl': output_path,
            'noplaylist': True,
            'quiet': False,
        }
        if skip_download:
            ydl_opts['skip_download'] = True

        _apply_proxy(ydl_opts)
        _apply_youtube_extractor_args(ydl_opts)
        self._apply_cookie(ydl_opts)

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(video_url, download=not skip_download)
                video_id = info.get("id")
                title = info.get("title")
                duration = info.get("duration", 0)
                cover_url = info.get("thumbnail")
                ext = info.get("ext", "m4a")
                audio_path = os.path.join(output_dir, f"{video_id}.{ext}")

            return AudioDownloadResult(
                file_path=audio_path,
                title=title,
                duration=duration,
                cover_url=cover_url,
                platform="youtube",
                video_id=video_id,
                raw_info={'tags': info.get('tags')},
                video_path=None,
            )
        except Exception as exc:
            # When only metadata is needed (subtitle path, skip_download=True)
            # a yt-dlp failure is recoverable via oEmbed, so the flow can go on
            # to summarisation. If media must be downloaded, re-raise.
            if not skip_download:
                raise
            logger.warning(f"yt-dlp 获取元数据失败,回退 oEmbed: {exc}")
            return self._fallback_metadata(video_url)

    def _fallback_metadata(self, video_url: str) -> AudioDownloadResult:
        """Build minimal metadata from YouTube's oEmbed endpoint.

        Only title and thumbnail are read; duration and tags are left empty.
        If the oEmbed request fails too, the video id (or a generic title) and
        a thumbnail URL derived from the id are used.
        """
        import requests

        video_id = extract_video_id(video_url, "youtube") or ""
        title = video_id or "YouTube 视频"
        cover = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg" if video_id else ""
        try:
            proxies = None
            proxy = ProxyConfigManager().get_proxy_url()
            if proxy:
                proxies = {"http": proxy, "https": proxy}
            resp = requests.get(
                "https://www.youtube.com/oembed",
                params={"url": video_url, "format": "json"},
                proxies=proxies,
                timeout=10,
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get("title"):
                title = data["title"]
            if data.get("thumbnail_url"):
                cover = data["thumbnail_url"]
            logger.info(f"oEmbed 兜底成功:title={title}")
        except Exception as e:
            logger.warning(f"oEmbed 兜底也失败,使用最小元数据:{e}")

        return AudioDownloadResult(
            file_path="",  # no media file was downloaded
            title=title,
            duration=0,  # not read from oEmbed
            cover_url=cover,
            platform="youtube",
            video_id=video_id,
            raw_info={"tags": []},  # not read from oEmbed
            video_path=None,
        )

    def download_video(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
    ) -> str:
        """Download the video as MP4 and return the file path.

        An already existing ``<id>.mp4`` in the output directory is reused.

        :raises FileNotFoundError: if the MP4 is missing after the download
        """
        # Same directory fallback as ``download``; previously a missing data
        # dir reached ``os.path.join`` unchecked.
        output_dir = self._resolve_output_dir(output_dir)
        video_id = extract_video_id(video_url, "youtube")
        video_path = os.path.join(output_dir, f"{video_id}.mp4")
        if os.path.exists(video_path):
            return video_path
        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")

        ydl_opts = {
            # The video is only used for screenshot grids / frame sampling, so
            # 720p is enough. Per the original note, an uncapped bestvideo may
            # pick very large 4K AV1 streams; avc1 is preferred for decode speed.
            'format': (
                'bestvideo[height<=720][vcodec^=avc1]+bestaudio[ext=m4a]'
                '/bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]'
                '/best[height<=720][ext=mp4]/best[ext=mp4]'
            ),
            'outtmpl': output_path,
            'noplaylist': True,
            'quiet': False,
            'merge_output_format': 'mp4',  # make sure the merge result is mp4
        }

        _apply_proxy(ydl_opts)
        _apply_youtube_extractor_args(ydl_opts)
        self._apply_cookie(ydl_opts)
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)
            video_id = info.get("id")
            video_path = os.path.join(output_dir, f"{video_id}.mp4")

        if not os.path.exists(video_path):
            raise FileNotFoundError(f"视频文件未找到: {video_path}")

        return video_path

    def download_subtitles(self, video_url: str, output_dir: str = None,
                           langs: List[str] = None) -> Optional[TranscriptResult]:
        """Fetch subtitles through ``YouTubeSubtitleFetcher``.

        :param video_url: video URL
        :param output_dir: unused (kept for interface compatibility)
        :param langs: preferred language codes, in priority order
        :return: whatever ``YouTubeSubtitleFetcher.fetch_subtitles`` returns
        """
        if langs is None:
            langs = ['zh-Hans', 'zh', 'zh-CN', 'zh-TW', 'en', 'en-US', 'ja']

        video_id = extract_video_id(video_url, "youtube")
        fetcher = YouTubeSubtitleFetcher()
        logger.info(f"尝试获取字幕,video_id={video_id}, langs={langs}")
        return fetcher.fetch_subtitles(video_id, langs)
backend/app/downloaders/youtube_subtitle.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 通过 youtube-transcript-api 获取 YouTube 字幕。
3
+ 优先人工字幕,其次自动生成字幕。不依赖 yt_dlp,无需下载任何文件。
4
+ """
5
+
6
+ from typing import Optional, List
7
+
8
+ from youtube_transcript_api import YouTubeTranscriptApi
9
+
10
+ from app.models.transcriber_model import TranscriptResult, TranscriptSegment
11
+ from app.services.proxy_config_manager import ProxyConfigManager
12
+ from app.utils.logger import get_logger
13
+
14
+ logger = get_logger(__name__)
15
+
16
+
17
class YouTubeSubtitleFetcher:
    """Fetch YouTube subtitles through youtube-transcript-api."""

    def __init__(self):
        # With a global proxy configured, hand the API a requests.Session that
        # carries it; otherwise use the default client.
        proxy = ProxyConfigManager().get_proxy_url()
        if proxy:
            try:
                import requests
                session = requests.Session()
                session.proxies = {"http": proxy, "https": proxy}
                self._api = YouTubeTranscriptApi(http_client=session)
                logger.info(f"YouTube 字幕走代理: {proxy}")
            except Exception as e:
                logger.warning(f"为 youtube-transcript-api 注入代理失败,回退无代理: {e}")
                self._api = YouTubeTranscriptApi()
        else:
            self._api = YouTubeTranscriptApi()

    @staticmethod
    def _snippet_field(snippet, name, default):
        """Read ``name`` from a snippet given as a dict or as an attribute object."""
        if isinstance(snippet, dict):
            return snippet.get(name, default)
        return getattr(snippet, name, default)

    @staticmethod
    def _select_transcript(transcript_list, langs):
        """Pick a transcript: manual in ``langs``, else generated in ``langs``, else the first one."""
        try:
            transcript = transcript_list.find_manually_created_transcript(langs)
            logger.info(f"选中人工字幕: {transcript.language_code} ({transcript.language})")
            return transcript
        except Exception:
            pass
        try:
            transcript = transcript_list.find_generated_transcript(langs)
            logger.info(f"选中自动字幕: {transcript.language_code} ({transcript.language})")
            return transcript
        except Exception:
            pass
        # Nothing matched the preferred languages: take the first available track.
        for t in transcript_list:
            source = "auto" if t.is_generated else "manual"
            logger.info(f"使用首个可用字幕: {t.language_code} ({source})")
            return t
        return None

    def fetch_subtitles(
        self,
        video_id: str,
        langs: Optional[List[str]] = None,
    ) -> Optional[TranscriptResult]:
        """Fetch subtitles for ``video_id``.

        :param video_id: YouTube video id
        :param langs: preferred language codes, in priority order
        :return: a ``TranscriptResult``, or ``None`` when no subtitles exist,
            the content is empty, or any error occurs (errors are logged)
        """
        if langs is None:
            langs = ["zh-Hans", "zh", "zh-CN", "zh-TW", "en", "en-US", "ja"]

        try:
            # 1. List every available subtitle track.
            transcript_list = self._api.list(video_id)

            available = [
                f"{t.language_code}({'auto' if t.is_generated else 'manual'})"
                for t in transcript_list
            ]
            logger.info(f"可用字幕轨道: {', '.join(available)}")

            # 2. Choose by priority: manual first, then auto-generated.
            transcript = self._select_transcript(transcript_list, langs)
            if not transcript:
                logger.info(f"YouTube 视频 {video_id} 没有任何可用字幕")
                return None

            # 3. Fetch the content. Snippets may be dicts or objects exposing
            #    text/start/duration attributes; the old dict-only handling
            #    turned objects into their repr with all timestamps at 0.
            fetched = transcript.fetch()
            segments = []
            for snippet in fetched:
                if isinstance(snippet, dict) or hasattr(snippet, "text"):
                    raw_text = self._snippet_field(snippet, "text", "")
                else:
                    raw_text = snippet  # unknown shape: fall back to its string form
                text = str(raw_text or "").strip()
                if not text:
                    continue
                start = float(self._snippet_field(snippet, "start", 0) or 0)
                duration = float(self._snippet_field(snippet, "duration", 0) or 0)
                segments.append(TranscriptSegment(
                    start=start,
                    end=start + duration,
                    text=text,
                ))

            if not segments:
                logger.warning(f"YouTube 字幕内容为空: {video_id}")
                return None

            full_text = " ".join(seg.text for seg in segments)
            logger.info(f"成功获取 YouTube 字幕,共 {len(segments)} 段")

            return TranscriptResult(
                language=transcript.language_code,
                full_text=full_text,
                segments=segments,
                raw={
                    "source": "youtube_transcript_api",
                    "language": transcript.language,
                    "language_code": transcript.language_code,
                    "is_generated": transcript.is_generated,
                },
            )

        except Exception as e:
            logger.warning(f"YouTube 字幕获取失败: {e}")
            return None
backend/app/enmus/exception.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import enum
2
+
3
+
4
class ProviderErrorEnum(enum.Enum):
    """Provider-related error codes.

    Each member's value is a ``(code, message)`` tuple; the two parts are also
    exposed as the ``code`` and ``message`` attributes.
    """

    CONNECTION_TEST_FAILED = (200101, "供应商连接测试失败")
    SAVE_FAILED = (200102, "供应商保存失败")
    CREATE_FAILED = (200103, "供应商创建失败")
    NOT_FOUND = (200104, "供应商不存在/未保存")
    WRONG_PARAMETER = (200105, "API / API 地址不正确")
    UNKNOW_ERROR = (200106, "未知错误")

    def __init__(self, code, message):
        # Enum passes the unpacked member tuple to __init__.
        self.code, self.message = code, message
15
+
16
class NoteErrorEnum(enum.Enum):
    """Note-related error codes.

    Each member's value is a ``(code, message)`` tuple; the two parts are also
    exposed as the ``code`` and ``message`` attributes.
    """

    PLATFORM_NOT_SUPPORTED = (300101, "选择的平台不受支持")

    def __init__(self, code, message):
        # Enum passes the unpacked member tuple to __init__.
        self.code, self.message = code, message
backend/app/enmus/note_enums.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import enum
2
+
3
+
4
class DownloadQuality(str, enum.Enum):
    """Download quality presets.

    Members are also ``str`` instances, so each compares equal to its plain
    string value (e.g. ``DownloadQuality.fast == "fast"``).
    """

    fast = "fast"
    medium = "medium"
    slow = "slow"