zhoujiaangyao committed on
Commit ·
6cfe55f
0
Parent(s):
deploy videomemo backend to HF Space
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +12 -0
- Dockerfile +48 -0
- README.md +20 -0
- backend/.env.example +12 -0
- backend/Dockerfile +42 -0
- backend/Dockerfile.gpu +40 -0
- backend/__init__.py +0 -0
- backend/app/__init__.py +47 -0
- backend/app/article_fetchers/__init__.py +3 -0
- backend/app/article_fetchers/base.py +36 -0
- backend/app/article_fetchers/generic.py +117 -0
- backend/app/article_fetchers/wechat.py +142 -0
- backend/app/article_fetchers/xiaohongshu.py +218 -0
- backend/app/core/__init__.py +0 -0
- backend/app/db/__init__.py +0 -0
- backend/app/db/article_dao.py +167 -0
- backend/app/db/builtin_providers.json +65 -0
- backend/app/db/engine.py +45 -0
- backend/app/db/init_db.py +34 -0
- backend/app/db/model_dao.py +69 -0
- backend/app/db/models/__init__.py +0 -0
- backend/app/db/models/articles.py +55 -0
- backend/app/db/models/models.py +12 -0
- backend/app/db/models/providers.py +17 -0
- backend/app/db/models/trend_subscription.py +50 -0
- backend/app/db/models/video_tasks.py +14 -0
- backend/app/db/provider_dao.py +129 -0
- backend/app/db/sqlite_client.py +4 -0
- backend/app/db/trend_subscription_dao.py +293 -0
- backend/app/db/video_task_dao.py +61 -0
- backend/app/decorators/__init__.py +0 -0
- backend/app/decorators/timeit.py +13 -0
- backend/app/downloaders/__init__.py +0 -0
- backend/app/downloaders/base.py +52 -0
- backend/app/downloaders/bilibili_downloader.py +343 -0
- backend/app/downloaders/bilibili_subtitle.py +164 -0
- backend/app/downloaders/common.py +1 -0
- backend/app/downloaders/douyin_downloader.py +499 -0
- backend/app/downloaders/douyin_helper/abogus.py +635 -0
- backend/app/downloaders/generic_downloader.py +128 -0
- backend/app/downloaders/kuaishou_downloader.py +97 -0
- backend/app/downloaders/kuaishou_helper/__init__.py +0 -0
- backend/app/downloaders/kuaishou_helper/kuaishou.py +101 -0
- backend/app/downloaders/local_downloader.py +137 -0
- backend/app/downloaders/xiaohongshu_downloader.py +133 -0
- backend/app/downloaders/xiaoyuzhoufm_download.py +25 -0
- backend/app/downloaders/youtube_downloader.py +259 -0
- backend/app/downloaders/youtube_subtitle.py +113 -0
- backend/app/enmus/exception.py +21 -0
- backend/app/enmus/note_enums.py +7 -0
.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
backend/.venv/
|
| 2 |
+
backend/data/
|
| 3 |
+
backend/models/
|
| 4 |
+
backend/config/
|
| 5 |
+
backend/note_results/
|
| 6 |
+
backend/static/
|
| 7 |
+
backend/uploads/
|
| 8 |
+
backend/*.db
|
| 9 |
+
backend/app/db/*.db
|
| 10 |
+
__pycache__/
|
| 11 |
+
*.pyc
|
| 12 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VideoMemo 后端 —— Hugging Face Spaces(Docker SDK)部署用 Dockerfile。
|
| 2 |
+
#
|
| 3 |
+
# 用法:HF Space 是一个独立 git 仓库,把它的根目录布置成:
|
| 4 |
+
# /Dockerfile ← 本文件(复制到 Space 根目录,重命名为 Dockerfile)
|
| 5 |
+
# /README.md ← deploy/hf-space/README.md(含 HF 必需的 frontmatter)
|
| 6 |
+
# /backend/... ← 从本项目复制整个 backend 目录过去
|
| 7 |
+
# 然后 git push 到 Space,HF 会构建本文件(COPY 路径相对 Space 根目录)。
|
| 8 |
+
#
|
| 9 |
+
# 镜像故意精简:只装 ffmpeg + 后端依赖,默认走 REST 飞书推送,不装 lark-cli。
|
| 10 |
+
# 数据库用外接 Postgres(Supabase),通过 DATABASE_URL Secret 注入。
|
| 11 |
+
ARG BASE_REGISTRY=docker.io
|
| 12 |
+
FROM ${BASE_REGISTRY}/library/python:3.11-slim
|
| 13 |
+
|
| 14 |
+
# HF 在 huggingface.co 自家基础设施上构建/运行:用官方 PyPI 与默认 HF 端点,
|
| 15 |
+
# 不要用国内镜像(那会更慢甚至失败)。
|
| 16 |
+
ARG PIP_INDEX=https://pypi.org/simple
|
| 17 |
+
|
| 18 |
+
# fonts-liberation 提供与 Arial 度量兼容的 LiberationSans,替代仓库里的 arial.ttf
|
| 19 |
+
# (HF git 不收二进制,故字体不进仓库,改由镜像在构建时提供)
|
| 20 |
+
RUN apt-get update && \
|
| 21 |
+
apt-get install -y --no-install-recommends ffmpeg curl fonts-liberation && \
|
| 22 |
+
rm -rf /var/lib/apt/lists/*
|
| 23 |
+
|
| 24 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 25 |
+
BACKEND_HOST=0.0.0.0 \
|
| 26 |
+
BACKEND_PORT=8483 \
|
| 27 |
+
STATIC=/static \
|
| 28 |
+
OUT_DIR=/app/static/screenshots \
|
| 29 |
+
IMAGE_BASE_URL=/static/screenshots \
|
| 30 |
+
NOTE_OUTPUT_DIR=/app/data/note_results \
|
| 31 |
+
DATA_DIR=/app/data
|
| 32 |
+
|
| 33 |
+
WORKDIR /app
|
| 34 |
+
|
| 35 |
+
# 先装依赖利用层缓存
|
| 36 |
+
COPY backend/requirements.txt /app/requirements.txt
|
| 37 |
+
RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt
|
| 38 |
+
|
| 39 |
+
# 再复制后端代码
|
| 40 |
+
COPY backend /app
|
| 41 |
+
|
| 42 |
+
# Pre-create the directories the backend writes to and install the Arial
# substitute font. Hugging Face Docker Spaces run the container as user ID 1000
# (not root), so ownership of these directories is handed to that UID;
# otherwise the app cannot write notes/screenshots at runtime.
# The Space disk is ephemeral (wiped on restart), so structured data must go to
# the external DATABASE_URL database; notes/screenshots are temporary data that
# can later be moved to object storage.
RUN mkdir -p /app/data/note_results /app/static/screenshots /app/config /app/fonts && \
    cp /usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf /app/fonts/arial.ttf && \
    chown -R 1000:1000 /app/data /app/static /app/config /app/fonts
|
| 46 |
+
|
| 47 |
+
EXPOSE 8483
|
| 48 |
+
CMD ["python", "main.py"]
|
README.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: VideoMemo Backend
|
| 3 |
+
emoji: 🎬
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8483
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# VideoMemo 后端(API)
|
| 12 |
+
|
| 13 |
+
AI 视频笔记生成的后端服务,供桌面端 / 网页端 / 浏览器插件通过本 Space 的地址连接使用。
|
| 14 |
+
|
| 15 |
+
- **结构化数据**(LLM 供应商配置与 API key、模型、关键词订阅、通知渠道、任务索引)
|
| 16 |
+
持久化到外接 Postgres(Supabase),通过 `DATABASE_URL` Secret 配置。
|
| 17 |
+
- **本 Space 公开可访问**:务必设置 `WEB_ACCESS_PASSWORD` Secret,否则任何人都能调用你的后端。
|
| 18 |
+
- 笔记正文 / 截图 / 向量库当前仍是容器内临时文件,**重启会清空**(计划后续迁入 Postgres / 对象存储)。
|
| 19 |
+
|
| 20 |
+
> 部署步骤见仓库 `deploy/hf-space/DEPLOY.md`。
|
backend/.env.example
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# 通用
|
| 3 |
+
ENV=production
|
| 4 |
+
API_BASE_URL=http://127.0.0.1:8000
|
| 5 |
+
SCREENSHOT_BASE_URL=http://127.0.0.1:8000/static/screenshots
|
| 6 |
+
STATIC=/static # 外部访问路径(URL 前缀)
|
| 7 |
+
OUT_DIR=./static/screenshots # 本地输出目录
|
| 8 |
+
IMAGE_BASE_URL=/static/screenshots # 图片访问 URL
|
| 9 |
+
DATA_DIR=data
|
| 10 |
+
# transcriber 相关配置
|
| 11 |
+
TRANSCRIBER_TYPE=fast-whisper # fast-whisper/bcut/kuaishou
|
| 12 |
+
WHISPER_MODEL_SIZE=base
|
backend/Dockerfile
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# BASE_REGISTRY 默认走 docker.io;国内拉不到 docker.io 时可换 daocloud / 阿里云 / 自建镜像源:
|
| 2 |
+
# docker-compose build --build-arg BASE_REGISTRY=docker.m.daocloud.io
|
| 3 |
+
# 或写到 docker-compose.yml 的 build.args / 环境变量里
|
| 4 |
+
ARG BASE_REGISTRY=docker.io
|
| 5 |
+
FROM ${BASE_REGISTRY}/library/python:3.11-slim
|
| 6 |
+
|
| 7 |
+
ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn
|
| 8 |
+
ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple
|
| 9 |
+
|
| 10 |
+
RUN rm -f /etc/apt/sources.list && \
|
| 11 |
+
rm -rf /etc/apt/sources.list.d/* && \
|
| 12 |
+
echo "deb https://${APT_MIRROR}/debian bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \
|
| 13 |
+
echo "deb https://${APT_MIRROR}/debian bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \
|
| 14 |
+
echo "deb https://${APT_MIRROR}/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \
|
| 15 |
+
apt-get update && \
|
| 16 |
+
apt-get install -y --no-install-recommends ffmpeg curl && \
|
| 17 |
+
rm -rf /var/lib/apt/lists/*
|
| 18 |
+
|
| 19 |
+
ENV PATH="/usr/bin:${PATH}"
|
| 20 |
+
ENV HF_ENDPOINT=https://hf-mirror.com
|
| 21 |
+
|
| 22 |
+
# 飞书「推送方式 = lark-cli / auto」时需要官方 lark CLI(npm 包 @larksuite/cli,二进制名 lark-cli)。
|
| 23 |
+
# 走 REST 直连推送则用不到,可按需删除本段以瘦身镜像。
|
| 24 |
+
# 凭证通过 LARK_APP_ID / LARK_APP_SECRET 环境变量在运行时注入(由后端调用时传入),此处不写死。
|
| 25 |
+
ARG NPM_REGISTRY=https://registry.npmmirror.com
|
| 26 |
+
RUN apt-get update && \
|
| 27 |
+
apt-get install -y --no-install-recommends nodejs npm && \
|
| 28 |
+
npm config set registry ${NPM_REGISTRY} && \
|
| 29 |
+
npm install -g @larksuite/cli && \
|
| 30 |
+
rm -rf /var/lib/apt/lists/* /root/.npm && \
|
| 31 |
+
(lark-cli --version || true)
|
| 32 |
+
|
| 33 |
+
WORKDIR /app
|
| 34 |
+
|
| 35 |
+
# 先复制 requirements.txt 利用层缓存
|
| 36 |
+
COPY ./backend/requirements.txt /app/requirements.txt
|
| 37 |
+
RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt
|
| 38 |
+
|
| 39 |
+
# 再复制应用代码(频繁变动不影响 pip 缓存层)
|
| 40 |
+
COPY ./backend /app
|
| 41 |
+
|
| 42 |
+
CMD ["python", "main.py"]
|
backend/Dockerfile.gpu
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# BASE_REGISTRY 默认走 docker.io;国内可换 daocloud / 阿里云镜像(注意所选镜像需支持 nvidia/cuda 命名空间)
|
| 2 |
+
ARG BASE_REGISTRY=docker.io
|
| 3 |
+
FROM ${BASE_REGISTRY}/nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
| 4 |
+
|
| 5 |
+
ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn
|
| 6 |
+
ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple
|
| 7 |
+
|
| 8 |
+
RUN rm -f /etc/apt/sources.list && \
|
| 9 |
+
rm -rf /etc/apt/sources.list.d/* && \
|
| 10 |
+
echo "deb https://${APT_MIRROR}/ubuntu jammy main restricted universe multiverse" > /etc/apt/sources.list && \
|
| 11 |
+
echo "deb https://${APT_MIRROR}/ubuntu jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
|
| 12 |
+
echo "deb https://${APT_MIRROR}/ubuntu jammy-security main restricted universe multiverse" >> /etc/apt/sources.list && \
|
| 13 |
+
apt-get update && \
|
| 14 |
+
apt-get install -y --no-install-recommends ffmpeg python3-pip curl && \
|
| 15 |
+
rm -rf /var/lib/apt/lists/*
|
| 16 |
+
|
| 17 |
+
ENV HF_ENDPOINT=https://hf-mirror.com
|
| 18 |
+
|
| 19 |
+
# 飞书「推送方式 = lark-cli / auto」时需要官方 lark CLI(npm 包 @larksuite/cli,二进制名 lark-cli)。
|
| 20 |
+
# Ubuntu 22.04 自带 apt 的 Node 太旧(v12)跑不动新 CLI,这里用 NodeSource 装 Node 20。
|
| 21 |
+
# 走 REST 直连推送则用不到,可按需删除本段以瘦身镜像。凭证由后端运行时经环境变量注入,不写死。
|
| 22 |
+
ARG NPM_REGISTRY=https://registry.npmmirror.com
|
| 23 |
+
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
|
| 24 |
+
apt-get install -y --no-install-recommends nodejs && \
|
| 25 |
+
npm config set registry ${NPM_REGISTRY} && \
|
| 26 |
+
npm install -g @larksuite/cli && \
|
| 27 |
+
rm -rf /var/lib/apt/lists/* /root/.npm && \
|
| 28 |
+
(lark-cli --version || true)
|
| 29 |
+
|
| 30 |
+
WORKDIR /app
|
| 31 |
+
|
| 32 |
+
# 先复制 requirements.txt 利用层缓存
|
| 33 |
+
COPY ./backend/requirements.txt /app/requirements.txt
|
| 34 |
+
RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt && \
|
| 35 |
+
pip install --no-cache-dir -i ${PIP_INDEX} 'transformers[torch]>=4.23'
|
| 36 |
+
|
| 37 |
+
# 再复制应用代码
|
| 38 |
+
COPY ./backend /app
|
| 39 |
+
|
| 40 |
+
CMD ["python3", "main.py"]
|
backend/__init__.py
ADDED
|
File without changes
|
backend/app/__init__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Optional
|
| 3 |
+
|
| 4 |
+
from fastapi import Depends, FastAPI, Header, HTTPException, Request
|
| 5 |
+
|
| 6 |
+
# 健康/诊断类接口:公网前端在用户尚未填访问密码时,也要能判断后端是否可达、
|
| 7 |
+
# 从而正常加载页面(否则启动探测被密码拦成 401,整页卡在「连接中」无法进入设置去填密码)。
|
| 8 |
+
_AUTH_EXEMPT_PATHS = {"/api/sys_check", "/api/sys_health", "/api/deploy_status"}
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
async def verify_web_access_password(
    request: Request,
    request_web_access_password: Optional[str] = Header(
        None, alias="request-web-access-password"
    ),
):
    """FastAPI dependency enforcing the optional shared access password.

    Requests whose path is listed in ``_AUTH_EXEMPT_PATHS`` are always allowed.
    When the ``WEB_ACCESS_PASSWORD`` environment variable is unset or empty,
    no check is performed. Otherwise the ``request-web-access-password``
    header must equal that value.

    Returns:
        True when the request may proceed.

    Raises:
        HTTPException: 401 when a password is configured and the header is
            missing or does not match.
    """
    # Local import so the block does not depend on a file-level import.
    import hmac

    if request.url.path in _AUTH_EXEMPT_PATHS:
        return True

    expected = os.getenv("WEB_ACCESS_PASSWORD")
    if not expected:
        return True

    supplied = request_web_access_password or ""
    # Constant-time comparison: a plain `!=` on a secret leaks how many leading
    # characters matched through response timing. Compare bytes because
    # compare_digest rejects non-ASCII str arguments.
    matches = hmac.compare_digest(
        supplied.encode("utf-8", "surrogateescape"),
        expected.encode("utf-8", "surrogateescape"),
    )
    if not matches:
        raise HTTPException(status_code=401, detail="访问密码错误或未填写")
    return True
|
| 23 |
+
|
| 24 |
+
def create_app(lifespan) -> FastAPI:
    """Build the VideoMemo FastAPI application.

    Registers an unauthenticated ``/sys_check`` route on the app itself and
    mounts every feature router under ``/api`` behind the
    ``verify_web_access_password`` dependency.

    Args:
        lifespan: Lifespan handler passed straight through to ``FastAPI``.

    Returns:
        The configured ``FastAPI`` instance.
    """
    from .routers import note, notification, provider, model, config, chat, flashcard, hot_videos, article, trend_subscription, feishu
    from .utils.response import ResponseWrapper as R

    application = FastAPI(title="VideoMemo", lifespan=lifespan)
    guard = [Depends(verify_web_access_password)]

    @application.get("/sys_check")
    async def root_sys_check():
        return R.success()

    # Registration order is kept exactly as before.
    guarded_modules = (
        note,
        provider,
        model,
        config,
        chat,
        flashcard,
        hot_videos,
        article,
        trend_subscription,
        notification,
        feishu,
    )
    for module in guarded_modules:
        application.include_router(module.router, prefix="/api", dependencies=guard)

    return application
|
backend/app/article_fetchers/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.article_fetchers.base import ArticleContent, ArticleFetcher, ArticleFetchError
|
| 2 |
+
|
| 3 |
+
__all__ = ["ArticleContent", "ArticleFetcher", "ArticleFetchError"]
|
backend/app/article_fetchers/base.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
from typing import Protocol
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@dataclass
class ArticleContent:
    """Normalized article record produced by the article fetchers.

    The first four fields are required; every other field defaults to an
    empty value, and the mutable defaults are created per instance.
    """

    # Required identification fields.
    platform: str
    url: str
    article_id: str
    title: str

    # Optional author information.
    author_name: str = ""
    author_id: str = ""

    # Optional content and media.
    content_text: str = ""
    image_urls: list[str] = field(default_factory=list)
    cover_url: str = ""
    published_at: str = ""

    # Free-form, fetcher-specific extras.
    raw_metadata: dict = field(default_factory=dict)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ArticleFetchError(Exception):
    """Error raised by article fetchers when a fetch or query cannot be served."""
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ArticleFetcher(Protocol):
    """Structural interface that every platform-specific fetcher satisfies."""

    platform: str

    def fetch(self, url: str) -> ArticleContent:
        """Return the article located at *url*."""
        ...

    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
        """Return up to *limit* articles for *keyword*."""
        ...

    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
        """Return up to *limit* articles for the publisher described by *query*."""
        ...
|
backend/app/article_fetchers/generic.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from urllib.parse import urlparse
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
|
| 9 |
+
from app.article_fetchers.base import ArticleContent, ArticleFetchError
|
| 10 |
+
from app.utils.url_parser import clean_url
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _clean_text(value: str) -> str:
|
| 14 |
+
return re.sub(r"[ \t\r\f\v]+", " ", value or "").strip()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _normalize_body(value: str) -> str:
    """Clean every line of *value* and drop the lines that end up empty."""
    kept = []
    for raw_line in (value or "").splitlines():
        cleaned = _clean_text(raw_line)
        if cleaned:
            kept.append(cleaned)
    return "\n".join(kept)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _meta_content(soup: BeautifulSoup, *selectors: tuple[str, str]) -> str:
    """Return the first non-empty ``content`` of a matching ``<meta>`` tag.

    Each selector is an ``(attribute, value)`` pair; selectors are tried in
    order and ``""`` is returned when none yields text.
    """
    for attr_name, attr_value in selectors:
        tag = soup.find("meta", attrs={attr_name: attr_value})
        if not tag:
            continue
        text = _clean_text(tag.get("content") or "")
        if text:
            return text
    return ""
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _candidate_score(node) -> int:
    """Score a candidate body node: normalized text length plus 120 per ``<p>``."""
    text_length = len(_normalize_body(node.get_text("\n")))
    paragraph_bonus = 120 * len(node.find_all("p"))
    return text_length + paragraph_bonus
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def parse_generic_article_html(html: str, url: str) -> ArticleContent:
    """Extract an ``ArticleContent`` from the HTML of an arbitrary web page.

    Args:
        html: Raw page markup.
        url: Final URL of the page; used for the article id and as a title
            fallback (its host).

    Returns:
        An ``ArticleContent`` with platform ``generic_web``.

    Raises:
        ValueError: If the extracted body (or the meta-description fallback)
            is shorter than 40 characters.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Remove non-textual elements first, then page chrome, so neither
    # contributes to the extracted body text.
    for tag in soup(["script", "style", "noscript", "svg", "canvas", "iframe"]):
        tag.decompose()
    for tag in soup(["nav", "header", "footer", "aside", "form"]):
        tag.decompose()

    # Prefer social-card titles and fall back to <title>. The previous
    # one-liner `A or B if soup.title else ""` parsed as
    # `(A or B) if soup.title else ""`, which threw away og:title whenever the
    # page had no <title> element.
    title = _meta_content(soup, ("property", "og:title"), ("name", "twitter:title"))
    if not title and soup.title:
        title = _clean_text(soup.title.get_text(" "))

    author = _meta_content(soup, ("name", "author"), ("property", "article:author"))
    published_at = _meta_content(
        soup,
        ("property", "article:published_time"),
        ("name", "publishdate"),
        ("name", "date"),
    )
    cover = _meta_content(soup, ("property", "og:image"), ("name", "twitter:image"))

    # Collect likely content containers; fall back to <body> if none match.
    candidates = []
    for selector in ("article", "main", "[role='main']", "#content", ".content", ".article", ".post"):
        candidates.extend(soup.select(selector))
    if not candidates and soup.body:
        candidates = [soup.body]
    best = max(candidates, key=_candidate_score, default=None)
    body = _normalize_body(best.get_text("\n")) if best else ""

    # A very short body is replaced by the meta description when that is longer.
    if len(body) < 80:
        description = _meta_content(soup, ("name", "description"), ("property", "og:description"))
        body = description if len(description) > len(body) else body
    if len(body) < 40:
        raise ValueError("网页正文为空或过短,无法生成总结")

    parsed = urlparse(url)
    article_id = parsed.netloc + parsed.path
    return ArticleContent(
        platform="generic_web",
        url=url,
        article_id=article_id or url,
        title=title or parsed.netloc or "网页文章",
        author_name=author,
        content_text=body,
        image_urls=[cover] if cover else [],
        cover_url=cover,
        published_at=published_at,
        raw_metadata={"source": "generic_web"},
    )
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class GenericArticleFetcher:
    """Fetcher for arbitrary web pages (platform ``generic_web``).

    Only direct URL fetching is supported; keyword search and publisher
    listing always raise ``ArticleFetchError``.
    """

    platform = "generic_web"

    def fetch(self, url: str) -> ArticleContent:
        """Download *url* and parse it into an ``ArticleContent``.

        Raises:
            ArticleFetchError: On any download failure, or on an unexpected
                (non-``ValueError``) parsing failure.
            ValueError: Propagated from the parser when the page body is
                empty or too short.
        """
        clean = clean_url(url)

        # Network phase. It is kept separate from parsing because several
        # requests exceptions (InvalidURL, MissingSchema, ...) subclass
        # ValueError; sharing one try block with `except ValueError: raise`
        # let them escape unwrapped.
        try:
            response = requests.get(
                clean,
                timeout=12,
                allow_redirects=True,
                headers={
                    "User-Agent": (
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
                    ),
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                },
            )
            response.raise_for_status()
        except Exception as exc:
            raise ArticleFetchError(f"网页文章抓取失败:{exc}") from exc

        # Parsing phase: the parser's ValueError passes through unchanged,
        # anything else is wrapped.
        try:
            return parse_generic_article_html(response.text, response.url or clean)
        except ValueError:
            raise
        except Exception as exc:
            raise ArticleFetchError(f"网页文章抓取失败:{exc}") from exc

    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
        """Unsupported for generic pages; always raises ``ArticleFetchError``."""
        raise ArticleFetchError("通用网页暂不支持关键字查询,请粘贴具体文章链接")

    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
        """Unsupported for generic pages; always raises ``ArticleFetchError``."""
        raise ArticleFetchError("通用网页暂不支持发布者订阅,请粘贴具体文章链接")
|
backend/app/article_fetchers/wechat.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from urllib.parse import parse_qs, quote, unquote, urljoin, urlparse
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
|
| 9 |
+
from app.article_fetchers.base import ArticleContent, ArticleFetchError
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _clean_text(value: str) -> str:
|
| 13 |
+
return re.sub(r"\s+", " ", value or "").strip()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _element_text(element) -> str:
    """Return the cleaned text of *element*, or ``""`` when it is falsy."""
    if not element:
        return ""
    return _clean_text(element.get_text(" "))
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _script_value(html: str, name: str) -> str:
|
| 21 |
+
patterns = [
|
| 22 |
+
rf'var\s+{re.escape(name)}\s*=\s*"([^"]*)"',
|
| 23 |
+
rf"{re.escape(name)}\s*:\s*'([^']*)'",
|
| 24 |
+
]
|
| 25 |
+
for pattern in patterns:
|
| 26 |
+
match = re.search(pattern, html)
|
| 27 |
+
if match:
|
| 28 |
+
return match.group(1).strip()
|
| 29 |
+
return ""
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def parse_wechat_article_html(html: str, url: str) -> ArticleContent:
    """Parse a WeChat official-account article page into an ``ArticleContent``.

    Raises:
        ValueError: When the ``js_content`` element is missing or has no text.
    """
    soup = BeautifulSoup(html, "html.parser")
    heading_node = soup.find(id="activity-name") or soup.find("h1")
    title = _element_text(heading_node)
    author = _element_text(soup.find(id="js_name"))
    published_at = _element_text(soup.find(id="publish_time"))

    content_node = soup.find(id="js_content")
    body_text = _clean_text(content_node.get_text("\n")) if content_node else ""
    if not body_text:
        raise ValueError("微信公众号文章正文为空,无法生成总结")

    # Collect image sources in document order, without duplicates.
    image_urls: list[str] = []
    image_nodes = content_node.find_all("img") if content_node else []
    for node in image_nodes:
        source = node.get("data-src") or node.get("src") or ""
        if not source or source in image_urls:
            continue
        image_urls.append(source)

    # Identifier fragments embedded in the page scripts.
    biz, mid, idx, sn = (_script_value(html, key) for key in ("biz", "mid", "idx", "sn"))
    fragments = [part for part in (biz, mid, idx, sn) if part]
    article_id = ":".join(fragments) or url

    cover = image_urls[0] if image_urls else ""
    return ArticleContent(
        platform="wechat_mp",
        url=url,
        article_id=article_id,
        title=title or "微信公众号文章",
        author_name=author,
        author_id=biz,
        content_text=body_text,
        image_urls=image_urls,
        cover_url=cover,
        published_at=published_at,
        raw_metadata={"biz": biz, "mid": mid, "idx": idx, "sn": sn},
    )
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _normalize_wechat_result_url(href: str) -> str:
|
| 70 |
+
if not href:
|
| 71 |
+
return ""
|
| 72 |
+
absolute = urljoin("https://weixin.sogou.com", href)
|
| 73 |
+
parsed = urlparse(absolute)
|
| 74 |
+
query = parse_qs(parsed.query)
|
| 75 |
+
for key in ("url", "target"):
|
| 76 |
+
if query.get(key):
|
| 77 |
+
candidate = unquote(query[key][0])
|
| 78 |
+
if "mp.weixin.qq.com" in candidate:
|
| 79 |
+
return candidate
|
| 80 |
+
return absolute if "mp.weixin.qq.com" in absolute else ""
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def parse_wechat_search_html(html: str, keyword: str, limit: int = 20) -> list[ArticleContent]:
    """Turn a search results page into a list of ``ArticleContent`` stubs.

    Every anchor that resolves to a WeChat article URL and has link text
    becomes one entry; duplicate URLs are skipped. Collection stops once the
    list length reaches *limit* (checked after each append).
    """
    soup = BeautifulSoup(html, "html.parser")
    info_class = re.compile(r"(txt-info|s-p|account)")
    results: list[ArticleContent] = []
    visited: set[str] = set()

    for link in soup.find_all("a", href=True):
        target = _normalize_wechat_result_url(link.get("href") or "")
        if not target or target in visited:
            continue
        heading = _clean_text(link.get_text(" "))
        if not heading:
            continue

        # Look for author/summary snippets in the enclosing result container.
        scope = link.find_parent(["div", "li"]) or link.parent
        snippets = []
        if scope:
            for node in scope.find_all(class_=info_class):
                snippet = _clean_text(node.get_text(" "))
                if snippet:
                    snippets.append(snippet)
        author = snippets[0] if snippets else ""
        summary = snippets[-1] if len(snippets) > 1 else heading

        visited.add(target)
        results.append(
            ArticleContent(
                platform="wechat_mp",
                url=target,
                article_id=target,
                title=heading,
                author_name=author,
                content_text=summary,
                raw_metadata={"keyword": keyword, "source": "sogou_weixin"},
            )
        )
        if len(results) >= limit:
            break
    return results
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class WechatArticleFetcher:
    """Fetcher for WeChat official-account articles (platform ``wechat_mp``)."""

    platform = "wechat_mp"

    def fetch(self, url: str) -> ArticleContent:
        """Download the article at *url* and parse it.

        Raises:
            ArticleFetchError: On any download failure, or on an unexpected
                (non-``ValueError``) parsing failure.
            ValueError: Propagated from the parser when the article body is
                empty.
        """
        # Network phase. It is kept separate from parsing because several
        # requests exceptions (InvalidURL, MissingSchema, ...) subclass
        # ValueError; sharing one try block with `except ValueError: raise`
        # let them escape unwrapped.
        try:
            response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
            response.raise_for_status()
        except Exception as exc:
            raise ArticleFetchError(f"微信公众号文章抓取失败:{exc}") from exc

        # Parsing phase: the parser's ValueError passes through unchanged,
        # anything else is wrapped.
        try:
            return parse_wechat_article_html(response.text, url)
        except ValueError:
            raise
        except Exception as exc:
            raise ArticleFetchError(f"微信公众号文章抓取失败:{exc}") from exc

    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
        """Query the Sogou WeChat search page for *keyword*.

        Raises:
            ArticleFetchError: On any request or parsing failure.
        """
        try:
            response = requests.get(
                f"https://weixin.sogou.com/weixin?type=2&query={quote(keyword)}",
                timeout=10,
                headers={"User-Agent": "Mozilla/5.0"},
            )
            response.raise_for_status()
            return parse_wechat_search_html(response.text, keyword, limit)
        except Exception as exc:
            raise ArticleFetchError(f"微信公众号关键字查询失败:{exc}") from exc

    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
        """Delegate to :meth:`search` with the same arguments."""
        return self.search(query, limit)
|
backend/app/article_fetchers/xiaohongshu.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import re
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from urllib.parse import quote, urlparse
|
| 7 |
+
|
| 8 |
+
import requests
|
| 9 |
+
from bs4 import BeautifulSoup
|
| 10 |
+
|
| 11 |
+
from app.article_fetchers.base import ArticleContent, ArticleFetchError
|
| 12 |
+
from app.services.cookie_manager import CookieConfigManager
|
| 13 |
+
from app.utils.url_parser import clean_url
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _note_id_from_url(url: str) -> str:
|
| 17 |
+
path = urlparse(url).path.rstrip("/")
|
| 18 |
+
return path.split("/")[-1] if path else url
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _extract_initial_state(html: str) -> dict:
|
| 22 |
+
match = re.search(r"window\.__INITIAL_STATE__\s*=", html)
|
| 23 |
+
if not match:
|
| 24 |
+
return {}
|
| 25 |
+
start = html.find("{", match.end())
|
| 26 |
+
if start < 0:
|
| 27 |
+
return {}
|
| 28 |
+
depth = 0
|
| 29 |
+
end = -1
|
| 30 |
+
for index in range(start, len(html)):
|
| 31 |
+
char = html[index]
|
| 32 |
+
if char == "{":
|
| 33 |
+
depth += 1
|
| 34 |
+
elif char == "}":
|
| 35 |
+
depth -= 1
|
| 36 |
+
if depth == 0:
|
| 37 |
+
end = index + 1
|
| 38 |
+
break
|
| 39 |
+
if end < 0:
|
| 40 |
+
return {}
|
| 41 |
+
raw = html[start:end].replace("undefined", "null")
|
| 42 |
+
try:
|
| 43 |
+
return json.loads(raw)
|
| 44 |
+
except json.JSONDecodeError:
|
| 45 |
+
return {}
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _first_image_url(item: dict) -> str:
|
| 49 |
+
for key in ("urlDefault", "url", "traceId"):
|
| 50 |
+
value = item.get(key)
|
| 51 |
+
if isinstance(value, str) and value.startswith("http"):
|
| 52 |
+
return value
|
| 53 |
+
nested = item.get("cover") or item.get("image") or {}
|
| 54 |
+
if isinstance(nested, dict):
|
| 55 |
+
for key in ("urlDefault", "url"):
|
| 56 |
+
value = nested.get(key)
|
| 57 |
+
if isinstance(value, str) and value.startswith("http"):
|
| 58 |
+
return value
|
| 59 |
+
return ""
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _published_at(value) -> str:
|
| 63 |
+
try:
|
| 64 |
+
timestamp = int(value)
|
| 65 |
+
except (TypeError, ValueError):
|
| 66 |
+
return ""
|
| 67 |
+
if timestamp > 10_000_000_000:
|
| 68 |
+
timestamp = timestamp // 1000
|
| 69 |
+
return datetime.fromtimestamp(timestamp).isoformat(timespec="seconds")
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _article_from_note(note: dict, url: str) -> ArticleContent:
    """Build an ``ArticleContent`` from a note dict of the page state.

    Args:
        note: Note payload; the keys read are ``user``, ``imageList`` /
            ``images``, ``desc`` / ``description``, ``title``,
            ``noteId`` / ``id`` and ``time`` / ``lastUpdateTime``.
        url: URL the note was fetched from. It is also the fallback source
            for the article id.

    Raises:
        ValueError: If the note has no body text.
    """
    author = note.get("user") or {}
    raw_images = note.get("imageList") or note.get("images") or []
    # dict.fromkeys keeps first-seen order while dropping duplicate URLs.
    candidates = (_first_image_url(entry) for entry in raw_images)
    images: list[str] = list(dict.fromkeys(src for src in candidates if src))

    content = str(note.get("desc") or note.get("description") or "").strip()
    # Title falls back to the first 40 characters of the body, then a default.
    title = str(note.get("title") or "").strip() or content[:40] or "小红书笔记"
    article_id = str(note.get("noteId") or note.get("id") or _note_id_from_url(url)).strip()
    if not content:
        raise ValueError("小红书笔记正文为空,无法生成总结")

    cover = images[0] if images else ""
    return ArticleContent(
        platform="xiaohongshu",
        url=url,
        article_id=article_id,
        title=title,
        author_name=str(author.get("nickname") or "").strip(),
        author_id=str(author.get("userId") or author.get("id") or "").strip(),
        content_text=content,
        image_urls=images,
        cover_url=cover,
        published_at=_published_at(note.get("time") or note.get("lastUpdateTime")),
        raw_metadata={"raw_note": note},
    )
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def parse_xiaohongshu_article_html(html: str, url: str) -> ArticleContent:
    """Parse a Xiaohongshu note page into an ``ArticleContent``.

    The embedded ``window.__INITIAL_STATE__`` is preferred: the first dict
    found under ``note.noteDetailMap[*].note`` is converted. If no such note
    exists, the ``og:title`` and ``description`` meta tags are used instead.

    Args:
        html: Raw page HTML.
        url: URL of the page; stored on the result and used to derive the
            article id in the meta-tag fallback.

    Raises:
        ValueError: If no body text can be extracted.
    """
    state = _extract_initial_state(html)
    detail_map = ((state.get("note") or {}).get("noteDetailMap")) or {}
    for value in detail_map.values():
        note = value.get("note") if isinstance(value, dict) else None
        if isinstance(note, dict):
            return _article_from_note(note, url)

    soup = BeautifulSoup(html, "html.parser")
    title_meta = soup.find("meta", attrs={"property": "og:title"})
    desc_meta = soup.find("meta", attrs={"name": "description"})
    # A meta tag can exist without a ``content`` attribute, in which case
    # ``.get`` returns None; coerce to "" before stripping.
    title = ((title_meta.get("content") if title_meta else "") or "").strip() or "小红书笔记"
    body = ((desc_meta.get("content") if desc_meta else "") or "").strip()
    if not body:
        raise ValueError("小红书笔记正文为空,无法生成总结")

    return ArticleContent(
        platform="xiaohongshu",
        url=url,
        article_id=_note_id_from_url(url),
        title=title,
        content_text=body,
    )
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def _iter_note_like(value):
|
| 127 |
+
if isinstance(value, dict):
|
| 128 |
+
note_id = value.get("noteId") or value.get("id")
|
| 129 |
+
title = value.get("title") or value.get("displayTitle")
|
| 130 |
+
desc = value.get("desc") or value.get("description")
|
| 131 |
+
if note_id and (title or desc):
|
| 132 |
+
yield value
|
| 133 |
+
for child in value.values():
|
| 134 |
+
yield from _iter_note_like(child)
|
| 135 |
+
elif isinstance(value, list):
|
| 136 |
+
for child in value:
|
| 137 |
+
yield from _iter_note_like(child)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def parse_xiaohongshu_discovery_html(
    html: str,
    source_url: str,
    limit: int = 20,
) -> list[ArticleContent]:
    """Collect note summaries from a search or profile page.

    Every note-like dict found in the page's ``window.__INITIAL_STATE__`` is
    converted to an ``ArticleContent``, de-duplicated by note id.

    Args:
        html: Raw page HTML.
        source_url: URL the page was loaded from; recorded in each item's
            ``raw_metadata``.
        limit: Maximum number of items to return. Non-positive values
            yield an empty list.

    Returns:
        At most ``limit`` items in the order they appear in the page state.
    """
    if limit <= 0:
        # The limit check below runs after appending, so a non-positive
        # limit would otherwise still return one item.
        return []
    state = _extract_initial_state(html)
    items: list[ArticleContent] = []
    seen: set[str] = set()
    for note in _iter_note_like(state):
        article_id = str(note.get("noteId") or note.get("id") or "").strip()
        if not article_id or article_id in seen:
            continue
        user = note.get("user") or note.get("author") or {}
        if not isinstance(user, dict):
            # Note-like dicts are matched loosely, so "author" may be a
            # plain string; treat it as having no author fields.
            user = {}
        image_url = _first_image_url(note)
        content = str(note.get("desc") or note.get("description") or note.get("title") or "").strip()
        title = str(note.get("title") or note.get("displayTitle") or content[:40] or "小红书笔记").strip()
        seen.add(article_id)
        items.append(
            ArticleContent(
                platform="xiaohongshu",
                url=f"https://www.xiaohongshu.com/explore/{article_id}",
                article_id=article_id,
                title=title,
                author_name=str(user.get("nickname") or user.get("name") or "").strip(),
                author_id=str(user.get("userId") or user.get("id") or "").strip(),
                content_text=content,
                image_urls=[image_url] if image_url else [],
                cover_url=image_url,
                raw_metadata={"source_url": source_url},
            )
        )
        if len(items) >= limit:
            break
    return items
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
class XiaohongshuArticleFetcher:
    """Fetch Xiaohongshu notes, keyword searches and publisher pages over HTTP."""

    platform = "xiaohongshu"

    def __init__(self):
        self._cookie_mgr = CookieConfigManager()

    def _headers(self) -> dict:
        """Return request headers, adding the stored cookie when one exists."""
        headers = {"User-Agent": "Mozilla/5.0"}
        cookie = self._cookie_mgr.get("xiaohongshu")
        if cookie:
            headers["Cookie"] = cookie
        return headers

    def fetch(self, url: str) -> ArticleContent:
        """Download and parse a single note page.

        Raises:
            ValueError: Propagated from the parser (e.g. empty note body).
            ArticleFetchError: For any HTTP failure or other parsing error.
        """
        clean = clean_url(url)
        # The request is guarded separately from parsing because requests'
        # MissingSchema / InvalidSchema / InvalidURL also subclass
        # ValueError; they must be wrapped rather than re-raised as if they
        # were the parser's "empty body" ValueError.
        try:
            response = requests.get(clean, timeout=10, headers=self._headers(), allow_redirects=True)
            response.raise_for_status()
        except Exception as exc:
            raise ArticleFetchError(f"小红书笔记抓取失败:{exc}") from exc
        try:
            return parse_xiaohongshu_article_html(response.text, response.url or clean)
        except ValueError:
            raise
        except Exception as exc:
            raise ArticleFetchError(f"小红书笔记抓取失败:{exc}") from exc

    def search(self, keyword: str, limit: int = 20) -> list[ArticleContent]:
        """Return up to ``limit`` notes from the search-result page for ``keyword``.

        Raises:
            ArticleFetchError: For any failure while requesting or parsing.
        """
        url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}"
        try:
            response = requests.get(url, timeout=10, headers=self._headers())
            response.raise_for_status()
            return parse_xiaohongshu_discovery_html(response.text, url, limit)
        except Exception as exc:
            raise ArticleFetchError(f"小红书关键字查询失败:{exc}") from exc

    def fetch_publisher(self, query: str, limit: int = 20) -> list[ArticleContent]:
        """Return up to ``limit`` notes from a publisher's profile page.

        ``query`` may be a full URL. Anything that does not start with
        ``http`` after cleaning is treated as a user id and turned into a
        profile URL.

        Raises:
            ArticleFetchError: For any failure while requesting or parsing.
        """
        url = clean_url(query)
        if not url.startswith("http"):
            url = f"https://www.xiaohongshu.com/user/profile/{quote(query)}"
        try:
            response = requests.get(url, timeout=10, headers=self._headers(), allow_redirects=True)
            response.raise_for_status()
            return parse_xiaohongshu_discovery_html(response.text, response.url or url, limit)
        except Exception as exc:
            raise ArticleFetchError(f"小红书发布者订阅刷新失败:{exc}") from exc
|
backend/app/core/__init__.py
ADDED
|
File without changes
|
backend/app/db/__init__.py
ADDED
|
File without changes
|
backend/app/db/article_dao.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import hashlib
|
| 4 |
+
import json
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
|
| 7 |
+
from app.article_fetchers.base import ArticleContent
|
| 8 |
+
from app.db.engine import get_db
|
| 9 |
+
from app.db.models.articles import ArticleItem, ArticleSubscription, ArticleSubscriptionItem
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def url_hash(url: str) -> str:
    """Return the hex SHA-256 digest of ``url`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(url.encode("utf-8"))
    return hasher.hexdigest()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _detach(obj):
|
| 17 |
+
data = {key: value for key, value in obj.__dict__.items() if not key.startswith("_")}
|
| 18 |
+
obj.__dict__.clear()
|
| 19 |
+
obj.__dict__.update(data)
|
| 20 |
+
return obj
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def upsert_article_item(article: ArticleContent) -> ArticleItem:
    """Insert or update the ``ArticleItem`` row matching ``article``.

    An existing row is looked up by ``(platform, article_id)`` when the
    article has an id, then by ``(platform, url_hash)``. If neither lookup
    matches, a new row is created. The mutable fields are then overwritten
    from ``article`` and the row is committed.

    Returns:
        The refreshed row, passed through ``_detach``.
    """
    session = next(get_db())
    try:
        digest = url_hash(article.url)

        # Lookup criteria in priority order; the id lookup is skipped
        # for articles without an id.
        lookups = []
        if article.article_id:
            lookups.append({"platform": article.platform, "article_id": article.article_id})
        lookups.append({"platform": article.platform, "url_hash": digest})

        record = None
        for criteria in lookups:
            record = session.query(ArticleItem).filter_by(**criteria).first()
            if record is not None:
                break

        if record is None:
            record = ArticleItem(
                platform=article.platform,
                article_id=article.article_id,
                url_hash=digest,
                url=article.url,
                title=article.title,
            )
            session.add(record)

        for field in (
            "url",
            "title",
            "author_name",
            "author_id",
            "cover_url",
            "published_at",
            "content_text",
        ):
            setattr(record, field, getattr(article, field))
        record.raw_metadata = json.dumps(article.raw_metadata or {}, ensure_ascii=False)

        session.commit()
        session.refresh(record)
        return _detach(record)
    finally:
        session.close()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def get_article_item(item_id: int) -> ArticleItem | None:
    """Return the detached ``ArticleItem`` with primary key ``item_id``, or None."""
    session = next(get_db())
    try:
        found = session.query(ArticleItem).filter_by(id=item_id).first()
        if not found:
            return None
        return _detach(found)
    finally:
        session.close()
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def list_article_items(subscription_id: int | None = None) -> list[ArticleItem]:
    """List article items, newest id first.

    Args:
        subscription_id: When given, only items linked to that subscription
            through ``ArticleSubscriptionItem`` are returned.

    Returns:
        Detached ``ArticleItem`` rows ordered by descending id.
    """
    session = next(get_db())
    try:
        selection = session.query(ArticleItem)
        if subscription_id is not None:
            joined = selection.join(
                ArticleSubscriptionItem,
                ArticleSubscriptionItem.article_item_id == ArticleItem.id,
            )
            selection = joined.filter(ArticleSubscriptionItem.subscription_id == subscription_id)
        rows = selection.order_by(ArticleItem.id.desc()).all()
        return [_detach(row) for row in rows]
    finally:
        session.close()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def mark_article_summarized(item_id: int, task_id: str) -> None:
    """Set an article's ``summary_status`` to "summarized" and store ``task_id``.

    Does nothing when no row has the given ``item_id``.
    """
    session = next(get_db())
    try:
        target = session.query(ArticleItem).filter_by(id=item_id).first()
        if not target:
            return
        target.summary_status = "summarized"
        target.task_id = task_id
        session.commit()
    finally:
        session.close()
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def create_subscription(
    platform: str,
    subscription_type: str,
    query: str,
    label: str = "",
) -> ArticleSubscription:
    """Create and persist a new ``ArticleSubscription``.

    Args:
        platform: Platform identifier stored on the row.
        subscription_type: Value stored in the row's ``type`` column.
        query: Query text for the subscription.
        label: Display label; falls back to ``query`` when empty.

    Returns:
        The refreshed row, passed through ``_detach``.
    """
    session = next(get_db())
    try:
        display_label = label or query
        created = ArticleSubscription(
            platform=platform,
            type=subscription_type,
            query=query,
            label=display_label,
        )
        session.add(created)
        session.commit()
        session.refresh(created)
        return _detach(created)
    finally:
        session.close()
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def list_subscriptions() -> list[ArticleSubscription]:
    """Return all subscriptions as detached rows, newest id first."""
    session = next(get_db())
    try:
        ordered = session.query(ArticleSubscription).order_by(ArticleSubscription.id.desc())
        detached = []
        for row in ordered.all():
            detached.append(_detach(row))
        return detached
    finally:
        session.close()
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def get_subscription(subscription_id: int) -> ArticleSubscription | None:
    """Return the detached subscription with the given id, or None if absent."""
    session = next(get_db())
    try:
        found = session.query(ArticleSubscription).filter_by(id=subscription_id).first()
        if not found:
            return None
        return _detach(found)
    finally:
        session.close()
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def update_subscription_refresh(subscription_id: int, error: str = "") -> None:
    """Record the outcome of a subscription refresh.

    Sets ``last_refresh_at`` to the current ``datetime.now()`` and
    ``last_error`` to ``error``. Does nothing when the subscription does
    not exist.
    """
    session = next(get_db())
    try:
        target = session.query(ArticleSubscription).filter_by(id=subscription_id).first()
        if not target:
            return
        target.last_refresh_at = datetime.now()
        target.last_error = error
        session.commit()
    finally:
        session.close()
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def link_subscription_item(subscription_id: int, article_item_id: int, match_reason: str) -> None:
    """Link an article item to a subscription unless the link already exists.

    Args:
        subscription_id: Id of the subscription.
        article_item_id: Id of the article item.
        match_reason: Text stored on a newly created link row.
    """
    session = next(get_db())
    try:
        link_key = {"subscription_id": subscription_id, "article_item_id": article_item_id}
        already_linked = session.query(ArticleSubscriptionItem).filter_by(**link_key).first()
        if already_linked is not None:
            return
        session.add(ArticleSubscriptionItem(match_reason=match_reason, **link_key))
        session.commit()
    finally:
        session.close()
|
backend/app/db/builtin_providers.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "openai",
|
| 4 |
+
"name": "OpenAI",
|
| 5 |
+
"type": "built-in",
|
| 6 |
+
"logo": "OpenAI",
|
| 7 |
+
"api_key": "",
|
| 8 |
+
"base_url": "https://api.openai.com/v1",
|
| 9 |
+
"enabled": 0
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"id": "deepseek",
|
| 13 |
+
"name": "DeepSeek",
|
| 14 |
+
"type": "built-in",
|
| 15 |
+
"logo": "DeepSeek",
|
| 16 |
+
"api_key": "",
|
| 17 |
+
"base_url": "https://api.deepseek.com",
|
| 18 |
+
"enabled": 1
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"id": "qwen",
|
| 22 |
+
"name": "Qwen",
|
| 23 |
+
"type": "built-in",
|
| 24 |
+
"logo": "Qwen",
|
| 25 |
+
"api_key": "",
|
| 26 |
+
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
|
| 27 |
+
"enabled": 0
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"id": "Claude",
|
| 31 |
+
"name": "Claude",
|
| 32 |
+
"type": "built-in",
|
| 33 |
+
"logo": "Claude",
|
| 34 |
+
"api_key": "",
|
| 35 |
+
"base_url": "https://",
|
| 36 |
+
"enabled": 0
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"id": "gemini",
|
| 40 |
+
"name": "Gemini",
|
| 41 |
+
"type": "built-in",
|
| 42 |
+
"logo": "Gemini",
|
| 43 |
+
"api_key": "",
|
| 44 |
+
"base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
|
| 45 |
+
"enabled": 0
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"id": "groq",
|
| 49 |
+
"name": "Groq",
|
| 50 |
+
"type": "built-in",
|
| 51 |
+
"logo": "Groq",
|
| 52 |
+
"api_key": "",
|
| 53 |
+
"base_url": "https://api.groq.com/openai/v1",
|
| 54 |
+
"enabled": 0
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"id": "ollama",
|
| 58 |
+
"name": "ollama",
|
| 59 |
+
"type": "built-in",
|
| 60 |
+
"logo": "Ollama",
|
| 61 |
+
"api_key": "",
|
| 62 |
+
"base_url": "http://127.0.0.1:11434/v1",
|
| 63 |
+
"enabled": 0
|
| 64 |
+
}
|
| 65 |
+
]
|
backend/app/db/engine.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from sqlalchemy import create_engine
|
| 3 |
+
from sqlalchemy.orm import sessionmaker, declarative_base
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
load_dotenv()

# SQLite is the default; to switch to PostgreSQL or MySQL, change DATABASE_URL in .env.
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///video_memo.db")

# SQLite needs a specific connection argument; the other databases get
# connection-pool settings instead.
engine_args = {}
_pool_args = {}
if DATABASE_URL.startswith("sqlite"):
    engine_args["connect_args"] = {"check_same_thread": False}
else:
    _pool_args = {
        "pool_size": int(os.getenv("DB_POOL_SIZE", "10")),
        "max_overflow": int(os.getenv("DB_MAX_OVERFLOW", "20")),
        "pool_pre_ping": True,
    }

# SQL echo is enabled only when SQLALCHEMY_ECHO is "true" (case-insensitive).
_echo_sql = os.getenv("SQLALCHEMY_ECHO", "false").lower() == "true"

engine = create_engine(
    DATABASE_URL,
    echo=_echo_sql,
    **engine_args,
    **_pool_args,
)

SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def get_engine():
    """Return the module-level SQLAlchemy engine."""
    return engine
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def get_db():
    """Yield a new ``SessionLocal`` session and close it when the generator finishes."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
backend/app/db/init_db.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.db.models.articles import ArticleItem, ArticleSubscription, ArticleSubscriptionItem
|
| 2 |
+
from app.db.models.models import Model
|
| 3 |
+
from app.db.models.providers import Provider
|
| 4 |
+
from app.db.models.trend_subscription import (
|
| 5 |
+
NotificationChannel,
|
| 6 |
+
TrendSubscription,
|
| 7 |
+
TrendSubscriptionMatch,
|
| 8 |
+
)
|
| 9 |
+
from app.db.models.video_tasks import VideoTask
|
| 10 |
+
from app.db.engine import get_engine, Base
|
| 11 |
+
from sqlalchemy import inspect, text
|
| 12 |
+
|
| 13 |
+
def init_db():
    """Create all tables registered on ``Base`` and apply the in-place column migration."""
    db_engine = get_engine()
    Base.metadata.create_all(bind=db_engine)
    # Adds article_items.content_text when it is missing.
    _ensure_article_content_text(db_engine)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
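# Note: the former _ensure_model_columns migration for models.supports_multimodal has been removed.
# After the "drop multimodal" refactor the ORM no longer uses that column (pure legacy), and its
# `ALTER ... BOOLEAN NOT NULL DEFAULT 0` fails outright on Postgres because the default does not match the boolean type.
# A leftover copy of the column in existing SQLite databases is harmless and can be left as is.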
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _ensure_article_content_text(engine):
    """Add the ``content_text`` column to ``article_items`` if it is missing.

    Does nothing when the table does not exist or already has the column.
    """
    inspector = inspect(engine)
    table_exists = "article_items" in inspector.get_table_names()
    if not table_exists:
        return
    existing = {meta["name"] for meta in inspector.get_columns("article_items")}
    if "content_text" not in existing:
        with engine.begin() as conn:
            conn.execute(text("ALTER TABLE article_items ADD COLUMN content_text TEXT NOT NULL DEFAULT ''"))
|
backend/app/db/model_dao.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.db.engine import get_db
|
| 2 |
+
from app.db.models.models import Model
|
| 3 |
+
from app.db.models.providers import Provider
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def get_model_by_provider_and_name(provider_id: int, model_name: str):
    """Look up a model by provider id and model name.

    Returns:
        A dict with ``id``, ``provider_id``, ``model_name`` and
        ``created_at``, or None when no row matches.
    """
    session = next(get_db())
    try:
        record = (
            session.query(Model)
            .filter_by(provider_id=provider_id, model_name=model_name)
            .first()
        )
        if not record:
            return None
        exported = ("id", "provider_id", "model_name", "created_at")
        return {field: getattr(record, field) for field in exported}
    finally:
        session.close()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def insert_model(provider_id: int, model_name: str):
    """Insert a model row and return it as a dict.

    Returns:
        A dict with ``id``, ``provider_id``, ``model_name`` and
        ``created_at`` of the newly committed row.
    """
    session = next(get_db())
    try:
        record = Model(provider_id=provider_id, model_name=model_name)
        session.add(record)
        session.commit()
        # Refresh so the database-generated values are loaded.
        session.refresh(record)
        exported = ("id", "provider_id", "model_name", "created_at")
        return {field: getattr(record, field) for field in exported}
    finally:
        session.close()
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def get_models_by_provider(provider_id: int):
    """Return ``{"id", "model_name"}`` dicts for every model of ``provider_id``."""
    session = next(get_db())
    try:
        rows = session.query(Model).filter_by(provider_id=provider_id).all()
        summaries = []
        for row in rows:
            summaries.append({"id": row.id, "model_name": row.model_name})
        return summaries
    finally:
        session.close()
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def delete_model(model_id: int):
    """Delete the model with primary key ``model_id``; do nothing if it is absent."""
    session = next(get_db())
    try:
        doomed = session.query(Model).filter_by(id=model_id).first()
        if not doomed:
            return
        session.delete(doomed)
        session.commit()
    finally:
        session.close()
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def get_all_models():
    """Return ``{"id", "provider_id", "model_name"}`` dicts for models of enabled providers."""
    session = next(get_db())
    try:
        # Only query models whose provider is enabled.
        enabled_models = (
            session.query(Model)
            .join(Provider, Model.provider_id == Provider.id)
            .filter(Provider.enabled == 1)
            .all()
        )
        listing = []
        for row in enabled_models:
            listing.append(
                {"id": row.id, "provider_id": row.provider_id, "model_name": row.model_name}
            )
        return listing
    finally:
        session.close()
|
backend/app/db/models/__init__.py
ADDED
|
File without changes
|
backend/app/db/models/articles.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint, func
|
| 2 |
+
|
| 3 |
+
from app.db.engine import Base
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ArticleItem(Base):
    """ORM row for one fetched article, unique per platform by id and by URL hash."""

    __tablename__ = "article_items"
    # NOTE(review): article_id defaults to "" and is part of a unique
    # constraint, so two rows with an empty article_id on one platform
    # would collide — confirm fetchers always supply an id.
    __table_args__ = (
        UniqueConstraint(
            "platform",
            "article_id",
            name="uq_article_platform_article_id",
        ),
        UniqueConstraint(
            "platform",
            "url_hash",
            name="uq_article_platform_url_hash",
        ),
    )

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Identity of the article on its source platform.
    platform = Column(String, nullable=False)
    article_id = Column(String, nullable=False, default="")
    url = Column(Text, nullable=False)
    url_hash = Column(String, nullable=False)

    # Descriptive fields.
    title = Column(String, nullable=False)
    author_name = Column(String, nullable=False, default="")
    author_id = Column(String, nullable=False, default="")

    # Summary state; new rows start as "pending".
    summary_status = Column(String, nullable=False, default="pending")
    task_id = Column(String, nullable=False, default="")

    cover_url = Column(Text, nullable=False, default="")
    published_at = Column(String, nullable=False, default="")
    content_text = Column(Text, nullable=False, default="")
    discovered_at = Column(DateTime, server_default=func.now())
    # Stored as text; the default is an empty JSON object.
    raw_metadata = Column(Text, nullable=False, default="{}")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ArticleSubscription(Base):
    """ORM row for an article subscription: a platform, a type and a query."""

    __tablename__ = "article_subscriptions"

    id = Column(Integer, primary_key=True, autoincrement=True)
    platform = Column(String, nullable=False)
    type = Column(String, nullable=False)
    query = Column(Text, nullable=False)
    label = Column(String, nullable=False, default="")
    enabled = Column(Boolean, nullable=False, default=True)

    # Outcome of the most recent refresh.
    last_refresh_at = Column(DateTime, nullable=True)
    last_error = Column(Text, nullable=False, default="")

    # Database-side timestamps.
    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class ArticleSubscriptionItem(Base):
    """Link row between a subscription and an article item (unique per pair)."""

    __tablename__ = "article_subscription_items"
    __table_args__ = (
        UniqueConstraint(
            "subscription_id",
            "article_item_id",
            name="uq_subscription_article_item",
        ),
    )

    id = Column(Integer, primary_key=True, autoincrement=True)
    subscription_id = Column(
        Integer,
        ForeignKey("article_subscriptions.id"),
        nullable=False,
    )
    article_item_id = Column(
        Integer,
        ForeignKey("article_items.id"),
        nullable=False,
    )
    matched_at = Column(DateTime, server_default=func.now())
    match_reason = Column(Text, nullable=False, default="")
|
backend/app/db/models/models.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import Column, Integer, String, DateTime, func, ForeignKey
|
| 2 |
+
|
| 3 |
+
from app.db.engine import Base
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Model(Base):
    """ORM row for a model name registered under a provider."""

    __tablename__ = "models"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Holds a Provider.id value. Provider.id is a String primary key, so
    # this column is String as well; an Integer column cannot store or be
    # joined against those ids on strictly typed databases such as
    # PostgreSQL.
    provider_id = Column(String, nullable=False)
    model_name = Column(String, nullable=False)
    created_at = Column(DateTime, server_default=func.now())
|
backend/app/db/models/providers.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import Column, String, Integer, DateTime, func
|
| 2 |
+
from sqlalchemy.orm import declarative_base
|
| 3 |
+
|
| 4 |
+
from app.db.engine import Base
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Provider(Base):
    """ORM row for a model provider, keyed by a string id."""

    __tablename__ = "providers"

    id = Column(String, primary_key=True)
    name = Column(String, nullable=False)
    logo = Column(String, nullable=False)
    type = Column(String, nullable=False)

    # Connection settings for the provider's API.
    api_key = Column(String, nullable=False)
    base_url = Column(String, nullable=False)

    # Integer flag; new rows default to 1.
    enabled = Column(Integer, default=1)
    created_at = Column(DateTime, server_default=func.now())
|
backend/app/db/models/trend_subscription.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, func
|
| 2 |
+
|
| 3 |
+
from app.db.engine import Base
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TrendSubscription(Base):
    """ORM row for a keyword-based trend subscription."""

    __tablename__ = "trend_subscriptions"

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String, nullable=False)

    # JSON array of keyword strings
    keywords = Column(Text, nullable=False, default="[]")
    # JSON array of platform ids
    platforms = Column(Text, nullable=False, default='["all"]')
    # "any" | "all"
    match_mode = Column(String, nullable=False, default="any")

    enabled = Column(Boolean, nullable=False, default=True)
    push_enabled = Column(Boolean, nullable=False, default=False)
    # JSON array of channel ids
    push_channel_ids = Column(Text, nullable=False, default="[]")

    last_matched_at = Column(DateTime, nullable=True)
    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class TrendSubscriptionMatch(Base):
    """ORM row for one trending item matched by a trend subscription."""

    __tablename__ = "trend_subscription_matches"

    id = Column(Integer, primary_key=True, autoincrement=True)
    subscription_id = Column(
        Integer,
        ForeignKey("trend_subscriptions.id"),
        nullable=False,
    )
    platform = Column(String, nullable=False)
    item_id = Column(String, nullable=False)
    title = Column(String, nullable=False)
    url = Column(Text, nullable=False, default="")
    hot_score = Column(String, nullable=False, default="")
    # JSON array of matched keywords
    matched_keywords = Column(Text, nullable=False, default="[]")
    matched_at = Column(DateTime, server_default=func.now())
    is_read = Column(Boolean, nullable=False, default=False)

    # Intended dedup key: same subscription + same platform + same item_id.
    # NOTE(review): no unique constraint is declared here, so dedup is
    # presumably enforced by the calling code — confirm.
    __table_args__ = (
        {"sqlite_autoincrement": True},
    )
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class NotificationChannel(Base):
    """ORM row for a configured notification channel."""

    __tablename__ = "notification_channels"

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String, nullable=False)
    # "webhook" | "bark" | "email"
    type = Column(String, nullable=False)
    # JSON object, type-specific
    config = Column(Text, nullable=False, default="{}")
    enabled = Column(Boolean, nullable=False, default=True)

    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
|
backend/app/db/models/video_tasks.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import Column, Integer, String, DateTime, func
|
| 2 |
+
from sqlalchemy.orm import declarative_base
|
| 3 |
+
|
| 4 |
+
from app.db.engine import Base
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class VideoTask(Base):
    """ORM row mapping a platform video id to a unique task id."""

    __tablename__ = "video_tasks"

    id = Column(Integer, primary_key=True, autoincrement=True)
    video_id = Column(String, nullable=False)
    platform = Column(String, nullable=False)
    # Unique across the table.
    task_id = Column(String, unique=True, nullable=False)
    created_at = Column(DateTime, server_default=func.now())
|
backend/app/db/provider_dao.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
from app.db.models.providers import Provider
|
| 5 |
+
from app.utils.logger import get_logger
|
| 6 |
+
from app.db.engine import get_engine, Base, get_db
|
| 7 |
+
|
| 8 |
+
logger = get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def get_builtin_providers_path():
    """Return the path of ``builtin_providers.json``.

    When ``sys.frozen`` is set, the file is resolved under ``sys._MEIPASS``;
    otherwise it is resolved next to this module.
    """
    is_frozen = getattr(sys, 'frozen', False)
    base_path = sys._MEIPASS if is_frozen else os.path.dirname(__file__)
    return os.path.join(base_path, 'builtin_providers.json')
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def seed_default_providers():
    """Fill the providers table from ``builtin_providers.json`` when it is empty.

    Nothing is written if at least one provider row already exists. Errors
    while reading the JSON file or writing to the database are logged rather
    than raised, and the session is closed in every case.
    """
    session = next(get_db())
    try:
        already_seeded = session.query(Provider).count() > 0
        if already_seeded:
            logger.info("Providers already exist, skipping seed.")
            return

        json_path = get_builtin_providers_path()
        try:
            with open(json_path, 'r', encoding='utf-8') as handle:
                builtin_entries = json.load(handle)
        except Exception as e:
            logger.error(f"Failed to read builtin_providers.json: {e}")
            return

        for entry in builtin_entries:
            record = Provider(
                id=entry['id'],
                name=entry['name'],
                api_key=entry['api_key'],
                base_url=entry['base_url'],
                logo=entry['logo'],
                type=entry['type'],
                # "enabled" is the only optional key; it defaults to 1.
                enabled=entry.get('enabled', 1),
            )
            session.add(record)
        session.commit()
        logger.info("Default providers seeded successfully.")
    except Exception as e:
        logger.error(f"Failed to seed default providers: {e}")
    finally:
        session.close()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def insert_provider(id: str, name: str, api_key: str, base_url: str, logo: str, type_: str, enabled: int = 1):
    """Insert a new provider row.

    :param type_: stored in the ``type`` column of the provider.
    :return: ``id`` on success; ``None`` when the insert fails (the error is
        logged, not raised).
    """
    session = next(get_db())
    try:
        session.add(
            Provider(
                id=id,
                name=name,
                api_key=api_key,
                base_url=base_url,
                logo=logo,
                type=type_,
                enabled=enabled,
            )
        )
        session.commit()
        logger.info(f"Provider inserted successfully. id: {id}, name: {name}, type: {type_}")
        return id
    except Exception as e:
        logger.error(f"Failed to insert provider: {e}")
        return None
    finally:
        session.close()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_enabled_providers():
    """Return every provider whose ``enabled`` column equals 1."""
    session = next(get_db())
    try:
        enabled_query = session.query(Provider).filter_by(enabled=1)
        return enabled_query.all()
    finally:
        session.close()
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def get_provider_by_name(name: str):
    """Return the first provider with the given ``name``, or ``None`` if there is none."""
    session = next(get_db())
    try:
        by_name = session.query(Provider).filter_by(name=name)
        return by_name.first()
    finally:
        session.close()
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def get_provider_by_id(id: str):
    """Return the provider with the given ``id``, or ``None`` if there is none."""
    session = next(get_db())
    try:
        by_id = session.query(Provider).filter_by(id=id)
        return by_id.first()
    finally:
        session.close()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def get_all_providers():
    """Return all provider rows."""
    session = next(get_db())
    try:
        every_provider = session.query(Provider)
        return every_provider.all()
    finally:
        session.close()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def update_provider(id: str, **kwargs):
    """Update attributes of the provider identified by ``id``.

    Each keyword argument is assigned to the attribute of the same name;
    keywords that do not match an attribute of the provider are ignored.
    A missing provider only produces a warning, and database errors are
    logged rather than raised.
    """
    session = next(get_db())
    try:
        target = session.query(Provider).filter_by(id=id).first()
        if not target:
            logger.warning(f"Provider {id} not found for update.")
            return

        for field, new_value in kwargs.items():
            if not hasattr(target, field):
                continue
            setattr(target, field, new_value)

        session.commit()
        logger.info(f"Provider updated successfully. id: {id}, updated_fields: {list(kwargs.keys())}")
    except Exception as e:
        logger.error(f"Failed to update provider: {e}")
    finally:
        session.close()
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def delete_provider(id: str):
    """Delete the provider identified by ``id`` if it exists.

    A missing provider is silently ignored; database errors are logged
    rather than raised.
    """
    session = next(get_db())
    try:
        doomed = session.query(Provider).filter_by(id=id).first()
        if not doomed:
            return
        session.delete(doomed)
        session.commit()
        logger.info(f"Provider deleted successfully. id: {id}")
    except Exception as e:
        logger.error(f"Failed to delete provider: {e}")
    finally:
        session.close()
|
backend/app/db/sqlite_client.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlite3
|
| 2 |
+
|
| 3 |
+
def get_connection():
    """Open and return a new ``sqlite3`` connection to ``video_memo.db``.

    The path is relative, so the file is resolved against the current
    working directory.
    """
    database_file = "video_memo.db"
    return sqlite3.connect(database_file)
|
backend/app/db/trend_subscription_dao.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
from app.db.engine import get_db
|
| 7 |
+
from app.db.models.trend_subscription import (
|
| 8 |
+
NotificationChannel,
|
| 9 |
+
TrendSubscription,
|
| 10 |
+
TrendSubscriptionMatch,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _detach(obj):
    """Strip every underscore-prefixed entry from ``obj.__dict__`` in place.

    Only the public attributes are kept; the same object is returned.
    """
    public_attrs = {}
    for attr_name, attr_value in obj.__dict__.items():
        if attr_name.startswith("_"):
            continue
        public_attrs[attr_name] = attr_value
    obj.__dict__.clear()
    obj.__dict__.update(public_attrs)
    return obj
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ─── Trend Subscriptions ──────────────────────────────────────────────────────────
|
| 22 |
+
|
| 23 |
+
def create_subscription(
    name: str,
    keywords: list[str],
    platforms: list[str] | None = None,
    match_mode: str = "any",
    push_enabled: bool = False,
    push_channel_ids: list[int] | None = None,
) -> TrendSubscription:
    """Create and persist a trend subscription.

    List arguments are stored as JSON strings. ``platforms`` falls back to
    ``["all"]`` and ``push_channel_ids`` to ``[]`` when empty or omitted.

    :return: the stored subscription, passed through ``_detach``.
    """
    session = next(get_db())
    try:
        column_values = {
            "name": name,
            "keywords": json.dumps(keywords, ensure_ascii=False),
            "platforms": json.dumps(platforms or ["all"], ensure_ascii=False),
            "match_mode": match_mode,
            "push_enabled": push_enabled,
            "push_channel_ids": json.dumps(push_channel_ids or []),
        }
        subscription = TrendSubscription(**column_values)
        session.add(subscription)
        session.commit()
        # Reload so database-generated values are present before detaching.
        session.refresh(subscription)
        return _detach(subscription)
    finally:
        session.close()
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def list_subscriptions() -> list[TrendSubscription]:
    """Return all subscriptions ordered by descending id, each passed through ``_detach``."""
    session = next(get_db())
    try:
        newest_first = session.query(TrendSubscription).order_by(TrendSubscription.id.desc())
        detached = []
        for row in newest_first.all():
            detached.append(_detach(row))
        return detached
    finally:
        session.close()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def get_subscription(subscription_id: int) -> TrendSubscription | None:
    """Return the subscription with the given id (detached), or ``None`` if not found."""
    session = next(get_db())
    try:
        found = session.query(TrendSubscription).filter_by(id=subscription_id).first()
        if not found:
            return None
        return _detach(found)
    finally:
        session.close()
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def update_subscription(
    subscription_id: int,
    name: str | None = None,
    keywords: list[str] | None = None,
    platforms: list[str] | None = None,
    match_mode: str | None = None,
    enabled: bool | None = None,
    push_enabled: bool | None = None,
    push_channel_ids: list[int] | None = None,
) -> TrendSubscription | None:
    """Update the given fields of a subscription.

    Arguments left as ``None`` are not touched. List arguments are stored as
    JSON strings.

    :return: the updated subscription (detached), or ``None`` if the id does
        not exist.
    """

    def unchanged(value):
        return value

    def as_unicode_json(value):
        return json.dumps(value, ensure_ascii=False)

    # (attribute, new value, encoder), applied in this order; encoders run
    # lazily so nothing is serialized for arguments that were not supplied.
    field_updates = (
        ("name", name, unchanged),
        ("keywords", keywords, as_unicode_json),
        ("platforms", platforms, as_unicode_json),
        ("match_mode", match_mode, unchanged),
        ("enabled", enabled, unchanged),
        ("push_enabled", push_enabled, unchanged),
        ("push_channel_ids", push_channel_ids, json.dumps),
    )

    session = next(get_db())
    try:
        subscription = session.query(TrendSubscription).filter_by(id=subscription_id).first()
        if subscription is None:
            return None
        for attr, value, encode in field_updates:
            if value is not None:
                setattr(subscription, attr, encode(value))
        session.commit()
        session.refresh(subscription)
        return _detach(subscription)
    finally:
        session.close()
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def delete_subscription(subscription_id: int) -> bool:
    """Delete a subscription together with its match records.

    :return: ``True`` if the subscription existed and was deleted, ``False``
        otherwise.
    """
    session = next(get_db())
    try:
        subscription = session.query(TrendSubscription).filter_by(id=subscription_id).first()
        if subscription is None:
            return False
        # Remove the dependent match rows first, then the subscription itself.
        related_matches = session.query(TrendSubscriptionMatch).filter_by(
            subscription_id=subscription_id
        )
        related_matches.delete()
        session.delete(subscription)
        session.commit()
        return True
    finally:
        session.close()
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def update_subscription_refresh(subscription_id: int) -> None:
    """Set ``last_matched_at`` of the subscription to the current local time.

    Does nothing when the subscription does not exist.
    """
    session = next(get_db())
    try:
        subscription = session.query(TrendSubscription).filter_by(id=subscription_id).first()
        if not subscription:
            return
        subscription.last_matched_at = datetime.now()
        session.commit()
    finally:
        session.close()
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# ─── Trend Subscription Matches ───────────────────────────────────────────────────
|
| 132 |
+
|
| 133 |
+
def create_match(
    subscription_id: int,
    platform: str,
    item_id: str,
    title: str,
    url: str = "",
    hot_score: str = "",
    matched_keywords: list[str] | None = None,
) -> TrendSubscriptionMatch | None:
    """Create a match record. Returns None if this (subscription, platform, item_id) already exists."""
    session = next(get_db())
    try:
        identity = {
            "subscription_id": subscription_id,
            "platform": platform,
            "item_id": item_id,
        }
        duplicate = session.query(TrendSubscriptionMatch).filter_by(**identity).first()
        if duplicate is not None:
            # This item was already recorded for the subscription.
            return None

        record = TrendSubscriptionMatch(
            title=title,
            url=url,
            hot_score=hot_score,
            matched_keywords=json.dumps(matched_keywords or [], ensure_ascii=False),
            **identity,
        )
        session.add(record)
        session.commit()
        session.refresh(record)
        return _detach(record)
    finally:
        session.close()
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def list_matches(
    subscription_id: int | None = None,
    limit: int = 100,
    unread_only: bool = False,
) -> list[TrendSubscriptionMatch]:
    """Return match records, newest ``matched_at`` first.

    :param subscription_id: restrict to one subscription when given.
    :param limit: maximum number of rows returned.
    :param unread_only: when true, only rows with ``is_read`` false are returned.
    """
    session = next(get_db())
    try:
        query = session.query(TrendSubscriptionMatch)
        if subscription_id is not None:
            query = query.filter_by(subscription_id=subscription_id)
        if unread_only:
            query = query.filter_by(is_read=False)
        newest_first = query.order_by(TrendSubscriptionMatch.matched_at.desc())
        rows = newest_first.limit(limit).all()
        return [_detach(row) for row in rows]
    finally:
        session.close()
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def mark_matches_read(subscription_id: int) -> int:
    """Mark all matches for a subscription as read. Returns count of updated rows."""
    session = next(get_db())
    try:
        unread = session.query(TrendSubscriptionMatch).filter_by(
            subscription_id=subscription_id, is_read=False
        )
        updated_rows = unread.update({"is_read": True})
        session.commit()
        return updated_rows
    finally:
        session.close()
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def count_unread_matches(subscription_id: int) -> int:
    """Return the number of unread match records of a subscription."""
    session = next(get_db())
    try:
        unread = session.query(TrendSubscriptionMatch).filter_by(
            subscription_id=subscription_id, is_read=False
        )
        return unread.count()
    finally:
        session.close()
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
# ─── Notification Channels ────────────────────────────────────────────────────────
|
| 219 |
+
|
| 220 |
+
def create_channel(name: str, channel_type: str, config: dict | None = None) -> NotificationChannel:
    """Create and persist a notification channel.

    :param channel_type: stored in the ``type`` column.
    :param config: stored as a JSON string; ``{}`` when empty or omitted.
    :return: the stored channel, passed through ``_detach``.
    """
    session = next(get_db())
    try:
        serialized_config = json.dumps(config or {}, ensure_ascii=False)
        new_channel = NotificationChannel(name=name, type=channel_type, config=serialized_config)
        session.add(new_channel)
        session.commit()
        session.refresh(new_channel)
        return _detach(new_channel)
    finally:
        session.close()
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def list_channels() -> list[NotificationChannel]:
    """Return all notification channels ordered by descending id, each detached."""
    session = next(get_db())
    try:
        newest_first = session.query(NotificationChannel).order_by(NotificationChannel.id.desc())
        detached = []
        for row in newest_first.all():
            detached.append(_detach(row))
        return detached
    finally:
        session.close()
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def get_channel(channel_id: int) -> NotificationChannel | None:
    """Return the notification channel with the given id (detached), or ``None`` if not found."""
    session = next(get_db())
    try:
        found = session.query(NotificationChannel).filter_by(id=channel_id).first()
        if not found:
            return None
        return _detach(found)
    finally:
        session.close()
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def update_channel(
    channel_id: int,
    name: str | None = None,
    channel_type: str | None = None,
    config: dict | None = None,
    enabled: bool | None = None,
) -> NotificationChannel | None:
    """Update the given fields of a notification channel.

    Arguments left as ``None`` are not touched. ``channel_type`` is written
    to the ``type`` column and ``config`` is stored as a JSON string.

    :return: the updated channel (detached), or ``None`` if the id does not
        exist.
    """

    def unchanged(value):
        return value

    def as_unicode_json(value):
        return json.dumps(value, ensure_ascii=False)

    # (attribute, new value, encoder), applied in this order.
    field_updates = (
        ("name", name, unchanged),
        ("type", channel_type, unchanged),
        ("config", config, as_unicode_json),
        ("enabled", enabled, unchanged),
    )

    session = next(get_db())
    try:
        channel = session.query(NotificationChannel).filter_by(id=channel_id).first()
        if channel is None:
            return None
        for attr, value, encode in field_updates:
            if value is not None:
                setattr(channel, attr, encode(value))
        session.commit()
        session.refresh(channel)
        return _detach(channel)
    finally:
        session.close()
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def delete_channel(channel_id: int) -> bool:
    """Delete a notification channel.

    :return: ``True`` if the channel existed and was deleted, ``False`` otherwise.
    """
    session = next(get_db())
    try:
        doomed = session.query(NotificationChannel).filter_by(id=channel_id).first()
        existed = doomed is not None
        if existed:
            session.delete(doomed)
            session.commit()
        return existed
    finally:
        session.close()
|
backend/app/db/video_task_dao.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.db.models.video_tasks import VideoTask
|
| 2 |
+
from app.db.engine import get_db
|
| 3 |
+
from app.utils.logger import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Insert a task record.
def insert_video_task(video_id: str, platform: str, task_id: str):
    """Store a new (video_id, platform, task_id) row.

    Failures are logged rather than raised; nothing is returned.
    """
    session = next(get_db())
    try:
        new_task = VideoTask(video_id=video_id, platform=platform, task_id=task_id)
        session.add(new_task)
        session.commit()
        session.refresh(new_task)
        logger.info(f"Video task inserted successfully. video_id: {video_id}, platform: {platform}, task_id: {task_id}")
    except Exception as e:
        logger.error(f"Failed to insert video task: {e}")
    finally:
        session.close()
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Look up a task (the most recent one).
def get_task_by_video(video_id: str, platform: str):
    """Return the ``task_id`` of the newest task for a video on a platform.

    :return: the task id, or ``None`` when no row matches or the query fails
        (failures are logged rather than raised).
    """
    session = next(get_db())
    try:
        for_video = session.query(VideoTask).filter_by(video_id=video_id, platform=platform)
        latest = for_video.order_by(VideoTask.created_at.desc()).first()
        if not latest:
            logger.info(f"No task found for video_id: {video_id} and platform: {platform}")
            return None
        logger.info(f"Task found for video_id: {video_id} and platform: {platform}")
        return latest.task_id
    except Exception as e:
        logger.error(f"Failed to get task by video: {e}")
    finally:
        session.close()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Delete tasks.
def delete_task_by_video(video_id: str, platform: str):
    """Delete every task row of a video on a platform.

    Failures are logged rather than raised; nothing is returned.
    """
    session = next(get_db())
    try:
        for_video = session.query(VideoTask).filter_by(video_id=video_id, platform=platform)
        for row in for_video.all():
            session.delete(row)
        session.commit()
        logger.info(f"Task(s) deleted for video_id: {video_id} and platform: {platform}")
    except Exception as e:
        logger.error(f"Failed to delete task by video: {e}")
    finally:
        session.close()
|
backend/app/decorators/__init__.py
ADDED
|
File without changes
|
backend/app/decorators/timeit.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import functools
|
| 3 |
+
|
| 4 |
+
def timeit(func):
    """Decorator that prints how long each call of ``func`` took.

    The wrapped function's return value is passed through unchanged. Nothing
    is printed when ``func`` raises.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        started_at = time.perf_counter()
        outcome = func(*args, **kwargs)
        duration = time.perf_counter() - started_at
        print(f"{func.__name__} executed in {duration:.4f} seconds")
        return outcome

    return wrapper
|
backend/app/downloaders/__init__.py
ADDED
|
File without changes
|
backend/app/downloaders/base.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import enum
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from typing import Optional, Union
|
| 5 |
+
|
| 6 |
+
from app.enmus.note_enums import DownloadQuality
|
| 7 |
+
from app.models.notes_model import AudioDownloadResult
|
| 8 |
+
from app.models.transcriber_model import TranscriptResult
|
| 9 |
+
from os import getenv
|
| 10 |
+
# Maps an audio quality preset name ("fast" | "medium" | "slow") to its value
# string. NOTE(review): the values look like audio bitrates in kbps - confirm.
QUALITY_MAP = {
    "fast": "32",
    "medium": "64",
    "slow": "128"
}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class Downloader(ABC):
    """Abstract base class for the platform-specific downloaders.

    Subclasses must implement :meth:`download`; :meth:`download_video` and
    :meth:`download_subtitles` are optional hooks with no-op defaults.
    """

    def __init__(self):
        # TODO: make the default quality configurable.
        self.quality = QUALITY_MAP.get('fast')
        # Fallback output directory taken from the DATA_DIR environment
        # variable; None when the variable is not set.
        self.cache_data = getenv('DATA_DIR')

    @abstractmethod
    def download(self, video_url: str, output_dir: str = None,
                 quality: DownloadQuality = "fast", need_video: Optional[bool] = False,
                 skip_download: bool = False) -> AudioDownloadResult:
        """Download the audio of a resource.

        :param video_url: link of the resource
        :param output_dir: output path; defaults to the root ``data`` directory
        :param quality: audio quality, one of fast | medium | slow
        :param need_video: not documented by the original author
        :param skip_download: not documented by the original author
        :return: an AudioDownloadResult instance
        """
        pass

    # This is an instance method: the original ``@staticmethod`` decorator
    # combined with a ``self`` parameter made ``instance.download_video(url)``
    # fail with a TypeError on subclasses that do not override it.
    def download_video(self, video_url: str,
                       output_dir: Union[str, None] = None) -> str:
        """Download the video file; the default implementation does nothing and returns None."""
        pass

    def download_subtitles(self, video_url: str, output_dir: str = None,
                           langs: list = None) -> Optional[TranscriptResult]:
        """Try to fetch platform subtitles (manual or auto-generated).

        :param video_url: link of the video
        :param output_dir: output path
        :param langs: preferred languages in priority order, e.g. ['zh-Hans', 'zh', 'en']
        :return: a TranscriptResult, or None when no subtitles are available
            (the default implementation always returns None)
        """
        return None
|
backend/app/downloaders/bilibili_downloader.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import tempfile
|
| 5 |
+
from abc import ABC
|
| 6 |
+
from typing import Union, Optional, List
|
| 7 |
+
|
| 8 |
+
import yt_dlp
|
| 9 |
+
|
| 10 |
+
from app.downloaders.base import Downloader, DownloadQuality, QUALITY_MAP
|
| 11 |
+
from app.downloaders.bilibili_subtitle import BilibiliSubtitleFetcher
|
| 12 |
+
from app.models.notes_model import AudioDownloadResult
|
| 13 |
+
from app.models.transcriber_model import TranscriptResult, TranscriptSegment
|
| 14 |
+
from app.utils.path_helper import get_data_dir
|
| 15 |
+
from app.utils.url_parser import extract_video_id
|
| 16 |
+
from app.services.cookie_manager import CookieConfigManager
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class BilibiliDownloader(Downloader, ABC):
|
| 22 |
+
def __init__(self):
    """Initialise the base downloader and prepare the Bilibili cookie file.

    The cookie string is read from ``CookieConfigManager`` under the
    ``'bilibili'`` key before the cookie file is written from it.
    """
    super().__init__()
    cookie_manager = CookieConfigManager()
    self._cookie_mgr = cookie_manager
    self._cookie = cookie_manager.get('bilibili')
    self._cookiefile = self._write_netscape_cookie_file()
|
| 27 |
+
|
| 28 |
+
def _write_netscape_cookie_file(self) -> Optional[str]:
    """Write the cookie string to a Netscape-format temp file for yt-dlp.

    ``self._cookie`` is expected to be a ``name=value`` list separated by
    ``;``. Whitespace around the separators is optional, and fragments
    without ``=`` or with an empty name are skipped.

    :return: path of the temporary cookie file (not deleted automatically),
        or ``None`` when no cookie is configured.
    """
    if not self._cookie:
        logger.warning("B站 Cookie 未配置,下载可能失败")
        return None

    lines = ["# Netscape HTTP Cookie File\n"]
    # Split on ";" and strip, so both "a=1; b=2" and "a=1;b=2" produce one
    # entry per cookie instead of one entry holding the rest of the string.
    for pair in self._cookie.split(";"):
        name, separator, value = pair.strip().partition("=")
        name = name.strip()
        if not separator or not name:
            continue
        # Fields: domain, include-subdomains, path, secure, expiry, name, value.
        lines.append(f".bilibili.com\tTRUE\t/\tFALSE\t0\t{name}\t{value.strip()}\n")

    # delete=False keeps the file on disk after the handle is closed.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as tmp:
        tmp.writelines(lines)
    logger.info("已生成 B站 Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1)
    return tmp.name
|
| 43 |
+
|
| 44 |
+
def download(
    self,
    video_url: str,
    output_dir: Union[str, None] = None,
    quality: DownloadQuality = "fast",
    need_video: Optional[bool] = False,
    skip_download: bool = False,
) -> AudioDownloadResult:
    """Download the audio track of a Bilibili video as MP3 through yt-dlp.

    :param video_url: link of the video
    :param output_dir: target directory; defaults to ``get_data_dir()`` and
        falls back to ``self.cache_data`` when that is empty
    :param quality: accepted for interface compatibility; currently not
        applied, the MP3 quality is fixed at ``'64'``
    :param need_video: accepted for interface compatibility; unused here
    :param skip_download: when true only the metadata is extracted and no
        file is downloaded, so ``file_path`` may point to a file that does
        not exist. Added to match the abstract ``Downloader.download``
        signature.
    :return: an AudioDownloadResult describing the (expected) MP3 file
    """
    if output_dir is None:
        output_dir = get_data_dir()
    if not output_dir:
        output_dir = self.cache_data
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, "%(id)s.%(ext)s")

    ydl_opts = {
        'format': 'bestaudio[ext=m4a]/bestaudio/best',
        'outtmpl': output_path,
        'http_headers': {'Referer': 'https://www.bilibili.com'},
        'postprocessors': [
            {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '64',
            }
        ],
        'noplaylist': True,
        'quiet': False,
    }
    if self._cookiefile:
        ydl_opts['cookiefile'] = self._cookiefile

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=not skip_download)

    video_id = info.get("id")
    title = info.get("title")
    duration = info.get("duration", 0)
    cover_url = info.get("thumbnail")
    # The FFmpegExtractAudio post-processor above always produces an .mp3.
    audio_path = os.path.join(output_dir, f"{video_id}.mp3")

    return AudioDownloadResult(
        file_path=audio_path,
        title=title,
        duration=duration,
        cover_url=cover_url,
        platform="bilibili",
        video_id=video_id,
        raw_info=info,
        video_path=None  # an audio download carries no video path
    )
|
| 94 |
+
|
| 95 |
+
def download_video(
    self,
    video_url: str,
    output_dir: Union[str, None] = None,
) -> str:
    """Download a Bilibili video as MP4 and return the path of the file.

    If ``<video_id>.mp4`` (id parsed from the URL) already exists in the
    output directory, that path is returned without downloading again.

    :param video_url: link of the video
    :param output_dir: target directory; defaults to ``get_data_dir()`` and
        falls back to ``self.cache_data`` when that is empty
    :return: path of the MP4 file
    :raises FileNotFoundError: if the expected MP4 file is missing after
        yt-dlp has finished
    """
    if output_dir is None:
        output_dir = get_data_dir()
    if not output_dir:
        # Same fallback download() uses, instead of failing on an empty path.
        output_dir = self.cache_data
    os.makedirs(output_dir, exist_ok=True)

    # Debug trace through the module logger instead of a bare print().
    logger.debug("video_url %s", video_url)

    # Reuse a previously downloaded file when there is one.
    video_id = extract_video_id(video_url, "bilibili")
    video_path = os.path.join(output_dir, f"{video_id}.mp4")
    if os.path.exists(video_path):
        return video_path

    output_path = os.path.join(output_dir, "%(id)s.%(ext)s")

    ydl_opts = {
        'format': 'bv*[ext=mp4]/bestvideo+bestaudio/best',
        'outtmpl': output_path,
        'http_headers': {'Referer': 'https://www.bilibili.com'},
        'noplaylist': True,
        'quiet': False,
        'merge_output_format': 'mp4',  # make sure the merged result is an mp4
    }
    if self._cookiefile:
        ydl_opts['cookiefile'] = self._cookiefile

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=True)

    # yt-dlp names the file after the id it extracted, which is
    # authoritative over the id parsed from the URL.
    video_id = info.get("id")
    video_path = os.path.join(output_dir, f"{video_id}.mp4")

    if not os.path.exists(video_path):
        raise FileNotFoundError(f"视频文件未找到: {video_path}")

    return video_path
|
| 138 |
+
|
| 139 |
+
def delete_video(self, video_path: str) -> str:
    """Delete a video file and return a message describing the outcome.

    :param video_path: path of the file to delete
    :return: a "deleted" message if the file existed, otherwise a
        "not found" message
    """
    if not os.path.exists(video_path):
        return f"视频文件未找到: {video_path}"
    os.remove(video_path)
    return f"视频文件已删除: {video_path}"
|
| 148 |
+
|
| 149 |
+
def download_subtitles(self, video_url: str, output_dir: str = None,
                       langs: List[str] = None) -> Optional[TranscriptResult]:
    """Try to fetch the subtitles of a Bilibili video.

    The official player API is tried first; yt-dlp is used as a fallback.

    :param video_url: link of the video
    :param output_dir: output path; defaults to ``get_data_dir()`` and falls
        back to ``self.cache_data`` when that is empty
    :param langs: preferred languages in priority order
    :return: a TranscriptResult, or None when no subtitles could be obtained
    """
    # 1) Prefer Bilibili's official player API (direct fetch, no video
    #    download needed; AI subtitles require the SESSDATA cookie).
    try:
        result = BilibiliSubtitleFetcher().fetch_subtitles(video_url)
        if result and result.segments:
            return result
    except Exception as e:
        logger.warning(f"player API 直拉字幕异常,回退到 yt-dlp: {e}")

    # 2) Fallback: the yt-dlp path (more fragile; more likely to fail on
    #    signature or cookie problems).
    if output_dir is None:
        output_dir = get_data_dir()
    if not output_dir:
        output_dir = self.cache_data
    os.makedirs(output_dir, exist_ok=True)

    if langs is None:
        langs = ['zh-Hans', 'zh', 'zh-CN', 'ai-zh', 'en', 'en-US']

    video_id = extract_video_id(video_url, "bilibili")

    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': langs,
        'subtitlesformat': 'srt/json3/best',  # accept several formats
        'skip_download': True,
        'outtmpl': os.path.join(output_dir, f'{video_id}.%(ext)s'),
        'quiet': True,
        # Always send the Referer, as download() and download_video() do,
        # whether or not a cookie file is configured.
        'http_headers': {'Referer': 'https://www.bilibili.com'},
    }

    # Inject the Bilibili cookie (Netscape cookie file) when one is configured.
    if self._cookiefile:
        ydl_opts['cookiefile'] = self._cookiefile

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)

        subtitles = info.get('requested_subtitles') or {}
        if not subtitles:
            logger.info(f"B站视频 {video_id} 没有可用字幕")
            return None

        # Pick the first preferred language that is available; otherwise
        # the first available track that is not the danmaku track.
        detected_lang = next((lang for lang in langs if lang in subtitles), None)
        if not detected_lang:
            detected_lang = next((lang for lang in subtitles if lang != 'danmaku'), None)
        sub_info = subtitles.get(detected_lang) if detected_lang else None

        if not sub_info:
            logger.info(f"B站视频 {video_id} 没有可用字幕(排除弹幕)")
            return None

        # yt-dlp sometimes returns the subtitle content inline.
        if 'data' in sub_info and sub_info['data']:
            logger.info(f"直接从返回数据解析字幕: {detected_lang}")
            return self._parse_srt_content(sub_info['data'], detected_lang)

        # Otherwise read the subtitle file yt-dlp wrote to disk.
        ext = sub_info.get('ext', 'srt')
        subtitle_file = os.path.join(output_dir, f"{video_id}.{detected_lang}.{ext}")

        if not os.path.exists(subtitle_file):
            logger.info(f"字幕文件不存在: {subtitle_file}")
            return None

        # Parse according to the file format.
        if ext == 'json3':
            return self._parse_json3_subtitle(subtitle_file, detected_lang)
        with open(subtitle_file, 'r', encoding='utf-8') as f:
            return self._parse_srt_content(f.read(), detected_lang)

    except Exception as e:
        logger.warning(f"获取B站字幕失败: {e}")
        return None
|
| 248 |
+
|
| 249 |
+
def _parse_srt_content(self, srt_content: str, language: str) -> Optional[TranscriptResult]:
    """
    Parse SRT-formatted subtitle text into a transcript.

    :param srt_content: raw SRT subtitle text
    :param language: language code stored on the returned result
    :return: TranscriptResult, or None when no non-empty cue is found or
             parsing raises
    """
    import re

    def time_to_seconds(stamp):
        # Convert "HH:MM:SS,mmm" into seconds.
        hours, minutes, seconds = stamp.replace(',', '.').split(':')
        return float(hours) * 3600 + float(minutes) * 60 + float(seconds)

    try:
        # The cue pattern below only recognises "\n" line breaks, so CRLF / CR
        # input is normalised first; otherwise no cue would match and the
        # subtitle would be silently reported as missing.
        normalized = srt_content.replace('\r\n', '\n').replace('\r', '\n')

        # SRT layout: index\ntimestamps\ntext\n\n
        pattern = r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?=\n\n|\n\d+\n|$)'

        segments = []
        for _index, start_time, end_time, text in re.findall(pattern, normalized, re.DOTALL):
            text = text.strip()
            if not text:
                continue
            segments.append(TranscriptSegment(
                start=time_to_seconds(start_time),
                end=time_to_seconds(end_time),
                text=text
            ))

        if not segments:
            return None

        full_text = ' '.join(seg.text for seg in segments)
        logger.info(f"成功解析B站SRT字幕，共 {len(segments)} 段")
        return TranscriptResult(
            language=language,
            full_text=full_text,
            segments=segments,
            raw={'source': 'bilibili_subtitle', 'format': 'srt'}
        )

    except Exception as e:
        # Best effort: a parse failure is logged and reported as "no subtitles".
        logger.warning(f"解析SRT字幕失败: {e}")
        return None
|
| 296 |
+
|
| 297 |
+
def _parse_json3_subtitle(self, subtitle_file: str, language: str) -> Optional[TranscriptResult]:
    """
    Parse a json3 subtitle file into a transcript.

    :param subtitle_file: path of the subtitle file to read
    :param language: language code stored on the returned result
    :return: TranscriptResult, or None when the file holds no non-empty
             event or reading/parsing raises
    """
    try:
        with open(subtitle_file, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        segments = []
        for event in payload.get('events', []):
            # Event times are divided by 1000 below (milliseconds -> seconds).
            begin_ms = event.get('tStartMs', 0)
            length_ms = event.get('dDurationMs', 0)

            # An event's text is the concatenation of its "utf8" pieces.
            pieces = [piece.get('utf8', '') for piece in event.get('segs', [])]
            text = ''.join(pieces).strip()
            if not text:
                continue  # keep only events that carry text

            segments.append(TranscriptSegment(
                start=begin_ms / 1000.0,
                end=(begin_ms + length_ms) / 1000.0,
                text=text
            ))

        if not segments:
            return None

        joined = ' '.join(seg.text for seg in segments)

        logger.info(f"成功解析B站字幕，共 {len(segments)} 段")
        return TranscriptResult(
            language=language,
            full_text=joined,
            segments=segments,
            raw={'source': 'bilibili_subtitle', 'file': subtitle_file}
        )

    except Exception as e:
        logger.warning(f"解析字幕文件失败: {e}")
        return None
|
backend/app/downloaders/bilibili_subtitle.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
直接调用 B 站 player API 拿字幕,绕过 yt-dlp。
|
| 3 |
+
|
| 4 |
+
流程:
|
| 5 |
+
1. 从 URL 提 BV id(已有 utils.url_parser.extract_video_id)
|
| 6 |
+
2. GET /x/web-interface/view?bvid=BVxxx → 拿 cid
|
| 7 |
+
3. GET /x/player/wbi/v2?bvid=...&cid=... → 返回 data.subtitle.subtitles[]
|
| 8 |
+
每条带 subtitle_url(B 站后端已经签好 auth_key 的完整地址)
|
| 9 |
+
4. 按优先级(人工 zh-CN > AI zh-CN > 任意 zh > 任意非空)选一条
|
| 10 |
+
5. fetch subtitle_url → JSON {body:[{from,to,content,...}]}
|
| 11 |
+
6. 解析为 TranscriptResult
|
| 12 |
+
|
| 13 |
+
AI 字幕需要登录态 cookie(SESSDATA);通过 CookieConfigManager 注入。
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from typing import List, Optional
|
| 17 |
+
|
| 18 |
+
import requests
|
| 19 |
+
|
| 20 |
+
from app.models.transcriber_model import TranscriptResult, TranscriptSegment
|
| 21 |
+
from app.services.cookie_manager import CookieConfigManager
|
| 22 |
+
from app.utils.logger import get_logger
|
| 23 |
+
from app.utils.url_parser import extract_video_id
|
| 24 |
+
|
| 25 |
+
logger = get_logger(__name__)
|
| 26 |
+
|
| 27 |
+
# Desktop Chrome user-agent string sent with every request made by this module.
UA = " ".join((
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class BilibiliSubtitleFetcher:
    """Fetch subtitles straight from Bilibili's web API.

    The lookup resolves the video's ``cid`` through the view API, lists the
    subtitle tracks through the player API, picks one track by priority and
    downloads that track's JSON body. The cookie configured for "bilibili"
    (if any) is attached to every request.
    """

    def __init__(self):
        # Cookie string for the "bilibili" platform; empty when none is configured.
        self._cookie = CookieConfigManager().get("bilibili") or ""

    @staticmethod
    def _as_dict(value) -> dict:
        """Return *value* when it is a dict, otherwise an empty dict.

        API payloads may carry ``null`` (or another type) where an object is
        expected; this keeps chained ``.get`` lookups from raising.
        """
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _to_seconds(value) -> float:
        """Convert a subtitle timestamp to float, falling back to 0.0."""
        try:
            return float(value or 0)
        except (TypeError, ValueError):
            return 0.0

    def _headers(self) -> dict:
        """Build the request headers, adding the cookie only when one is set."""
        headers = {
            "User-Agent": UA,
            "Referer": "https://www.bilibili.com",
        }
        if self._cookie:
            headers["Cookie"] = self._cookie
        return headers

    def _get_cid(self, bvid: str) -> Optional[int]:
        """Look up the ``cid`` reported by the view API for *bvid*.

        :return: the cid as int, or None on request failure, API error code
                 or missing/invalid cid
        """
        url = "https://api.bilibili.com/x/web-interface/view"
        try:
            resp = requests.get(url, params={"bvid": bvid}, headers=self._headers(), timeout=10)
            data = self._as_dict(resp.json())
        except Exception as e:
            logger.warning(f"获取 cid 失败: {e}")
            return None
        if data.get("code") != 0:
            logger.warning(f"view API 返回错误: code={data.get('code')}, msg={data.get('message')}")
            return None
        cid = self._as_dict(data.get("data")).get("cid")
        try:
            return int(cid) if cid else None
        except (TypeError, ValueError):
            return None

    def _list_subtitles(self, bvid: str, cid: int) -> List[dict]:
        """Return the subtitle track descriptors listed by the player API.

        :return: list of track dicts; empty on request failure, API error
                 code or when no track is listed
        """
        url = "https://api.bilibili.com/x/player/wbi/v2"
        try:
            resp = requests.get(url, params={"bvid": bvid, "cid": cid}, headers=self._headers(), timeout=10)
            data = self._as_dict(resp.json())
        except Exception as e:
            logger.warning(f"获取字幕列表失败: {e}")
            return []
        if data.get("code") != 0:
            logger.warning(f"player API 返回错误: code={data.get('code')}, msg={data.get('message')}")
            return []
        # "data" and "subtitle" may be null in the response; guard each level.
        subtitle = self._as_dict(self._as_dict(data.get("data")).get("subtitle"))
        tracks = subtitle.get("subtitles") or []
        if not isinstance(tracks, list):
            return []
        return [track for track in tracks if isinstance(track, dict)]

    def _pick(self, subtitles: List[dict]) -> Optional[dict]:
        """Choose one track: human Chinese > AI Chinese > first track of any language."""
        if not subtitles:
            return None

        def lan_of(track: dict) -> str:
            return (track.get("lan") or "").lower()

        def is_zh(track: dict) -> bool:
            lan = lan_of(track)
            return lan.startswith("zh") or lan == "ai-zh"

        def is_ai(track: dict) -> bool:
            # A truthy ai_type marks an AI track. A lan code with the "ai-"
            # prefix is also treated as AI even when ai_type is falsy, so an
            # "ai-zh" track cannot win the human pass.
            # NOTE(review): prefix rule inferred from the "ai-zh" code handled
            # above - confirm against the API's field semantics.
            return bool(track.get("ai_type")) or lan_of(track).startswith("ai-")

        # Pass 1: human-made Chinese.
        for track in subtitles:
            if is_zh(track) and not is_ai(track):
                return track
        # Pass 2: any Chinese (AI included).
        for track in subtitles:
            if is_zh(track):
                return track
        # Fallback: whatever comes first.
        return subtitles[0]

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Prefix protocol-relative URLs ("//host/...") with ``https:``."""
        if url.startswith("//"):
            return "https:" + url
        return url

    def _fetch_body(self, subtitle_url: str) -> Optional[List[dict]]:
        """Download a track's JSON and return its ``body`` list.

        :return: the body list (possibly empty), or None when the request or
                 JSON decoding fails
        """
        try:
            resp = requests.get(self._normalize_url(subtitle_url), headers=self._headers(), timeout=15)
            data = self._as_dict(resp.json())
            return data.get("body") or []
        except Exception as e:
            logger.warning(f"下载字幕 JSON 失败: {e}")
            return None

    def fetch_subtitles(self, video_url: str) -> Optional[TranscriptResult]:
        """Fetch and parse the preferred subtitle track of *video_url*.

        :param video_url: Bilibili video URL the BV id is extracted from
        :return: TranscriptResult, or None when any step yields nothing
                 (no BV id, no cid, no track, no subtitle_url, empty body)
        """
        bvid = extract_video_id(video_url, "bilibili")
        if not bvid:
            logger.info("无法从 URL 提取 BV id")
            return None

        cid = self._get_cid(bvid)
        if not cid:
            logger.info(f"{bvid} 没有取到 cid")
            return None

        subtitles = self._list_subtitles(bvid, cid)
        if not subtitles:
            logger.info(f"{bvid} (cid={cid}) 没有可用字幕轨")
            return None

        track = self._pick(subtitles)
        if not track or not track.get("subtitle_url"):
            logger.info(f"{bvid} 字幕轨存在但没有 subtitle_url（可能未登录、需要 SESSDATA cookie）")
            return None

        lan = track.get("lan") or "zh"
        body = self._fetch_body(track["subtitle_url"])
        if not body:
            return None

        segments: List[TranscriptSegment] = []
        for item in body:
            if not isinstance(item, dict):
                continue
            text = (item.get("content") or "").strip()
            if not text:
                continue
            segments.append(TranscriptSegment(
                # null / malformed timestamps fall back to 0.0 instead of raising
                start=self._to_seconds(item.get("from")),
                end=self._to_seconds(item.get("to")),
                text=text,
            ))

        if not segments:
            return None

        full_text = " ".join(s.text for s in segments)
        logger.info(f"B站直拉字幕成功: {bvid} lan={lan} 共 {len(segments)} 段")
        return TranscriptResult(
            language=lan,
            full_text=full_text,
            segments=segments,
            raw={
                "source": "bilibili_player_api",
                "bvid": bvid,
                "cid": cid,
                "lan": lan,
                "ai_type": track.get("ai_type"),
            },
        )
|
backend/app/downloaders/common.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# def download():
|
backend/app/downloaders/douyin_downloader.py
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
import subprocess
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from typing import Any, Literal, Optional, Union
|
| 9 |
+
from urllib.parse import parse_qs, unquote, urlparse
|
| 10 |
+
|
| 11 |
+
import requests
|
| 12 |
+
|
| 13 |
+
from app.downloaders.base import Downloader
|
| 14 |
+
from app.enmus.note_enums import DownloadQuality
|
| 15 |
+
from app.models.audio_model import AudioDownloadResult
|
| 16 |
+
from app.models.transcriber_model import TranscriptResult, TranscriptSegment
|
| 17 |
+
from app.utils.path_helper import get_data_dir
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# iPhone Safari user-agent string used for share-page requests and file downloads.
SHARE_PAGE_UA = " ".join((
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X)",
    "AppleWebKit/605.1.15 (KHTML, like Gecko)",
    "Version/17.0 Mobile/15E148 Safari/604.1",
))

# Captures everything from the opening "{" of the `window._ROUTER_DATA = {...`
# assignment to the end of the input; callers trim the tail themselves.
ROUTER_DATA_RE = re.compile(r"window\._ROUTER_DATA\s*=\s*(\{.+)", flags=re.DOTALL)

# Captures the text content of the <script id="RENDER_DATA"> element.
RENDER_DATA_RE = re.compile(r'<script id="RENDER_DATA" type="application/json">([^<]+)</script>')

# Matches a link on one of the listed Douyin hosts, up to whitespace or "]".
DOUYIN_URL_RE = re.compile(r"https?://(?:v\.douyin\.com|www\.douyin\.com|www\.iesdouyin\.com|m\.douyin\.com)[^\s\]]*")

# aweme_type values that are classified as image posts.
IMAGE_AWEME_TYPES = {2, 68}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class DouyinResolveError(Exception):
    """Raised when a Douyin share link or its share page cannot be resolved."""
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@dataclass
class DouyinContentMeta:
    """Metadata extracted from a Douyin share page for one post."""

    # Required identity fields.
    aweme_id: str
    title: str
    author: str
    source_url: str

    # "video" or "image"; defaults to a video post.
    content_type: Literal["video", "image"] = "video"
    # Raw aweme_type value taken from the page data, when present.
    aweme_type: Optional[int] = None

    # Media locations; download_url stays empty for image posts.
    download_url: str = ""
    cover_url: Optional[str] = None
    image_urls: list[str] = field(default_factory=list)

    duration: float = 0
    tags: list[str] = field(default_factory=list)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _session() -> requests.Session:
    """Create a requests session preloaded with the share-page request headers."""
    default_headers = {
        "User-Agent": SHARE_PAGE_UA,
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    client = requests.Session()
    client.headers.update(default_headers)
    return client
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def expand_share_url(share_text: str) -> str:
    """Extract the first Douyin link from a piece of share text.

    Trailing "/", ".", ",", ";" and ")" characters are stripped from the match.

    :raises DouyinResolveError: when the text contains no Douyin link
    """
    cleaned = (share_text or "").strip()
    found = DOUYIN_URL_RE.search(cleaned)
    if found is None:
        raise DouyinResolveError("未在输入中找到抖音链接")
    link = found.group(0)
    return link.rstrip("/.,;)")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _extract_aweme_id_from_search_url(url: str) -> Optional[str]:
    """Pull the post ID out of a Douyin search-page URL.

    The ID is read from the ``modal_id`` or ``item_ids`` query parameter
    (first run of at least ten digits).

    :param url: candidate URL
    :return: the ID string, or None when the URL is not a search page on a
             Douyin host or carries no such ID
    """
    parsed = urlparse(url)
    # Compare the bare hostname (no port, case-insensitive) against whole
    # domain labels; a plain suffix check would also accept look-alike hosts
    # such as "evildouyin.com".
    host = (parsed.hostname or "").lower()
    douyin_domains = ("douyin.com", "iesdouyin.com")
    on_douyin = any(host == domain or host.endswith("." + domain) for domain in douyin_domains)
    if not on_douyin or not parsed.path.startswith("/search"):
        return None

    params = parse_qs(parsed.query)
    for key in ("modal_id", "item_ids"):
        for value in params.get(key, []):
            match = re.search(r"\d{10,}", value)
            if match:
                return match.group(0)
    return None
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def normalize_to_share_page(url: str) -> str:
    """Rewrite douyin.com note/video/search URLs to the iesdouyin.com share page.

    URLs that match none of the known shapes are returned unchanged.
    """
    rewrites = (
        (r"https?://(?:www\.)?douyin\.com/note/(\d+)", "note"),
        (r"https?://(?:www\.)?douyin\.com/video/(\d+)", "video"),
    )
    for pattern, kind in rewrites:
        hit = re.search(pattern, url)
        if hit:
            return f"https://www.iesdouyin.com/share/{kind}/{hit.group(1)}/"

    # Search pages carry the post ID in the query string.
    aweme_id = _extract_aweme_id_from_search_url(url)
    if not aweme_id:
        return url
    return f"https://www.iesdouyin.com/share/video/{aweme_id}/"
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def resolve_share_page(session: requests.Session, share_url: str) -> tuple[str, str]:
    """Fetch *share_url*, following redirects.

    :return: ``(final_url, html_text)`` of the response
    :raises: whatever ``raise_for_status`` raises for an error status
    """
    reply = session.get(share_url, allow_redirects=True, timeout=30)
    reply.raise_for_status()
    final_url = str(reply.url)
    return final_url, reply.text
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def extract_aweme_id(page_url: str, html: Optional[str] = None) -> str:
    """Find the post ID in the page URL, falling back to the page HTML.

    Every pattern is tried against the URL first; the HTML is searched only
    when it is non-empty and the URL produced no match.

    :raises DouyinResolveError: when no pattern matches either source
    """
    patterns = (
        r"/video/(\d+)",
        r"/note/(\d+)",
        r"/share/video/(\d+)",
        r"/share/note/(\d+)",
        r"modal_id=(\d+)",
        r"item_ids=(\d+)",
        r'"aweme_id"\s*:\s*"?(\d+)"?',
        r'"itemId"\s*:\s*"?(\d+)"?',
    )
    sources = [page_url]
    if html:
        sources.append(html)

    for text in sources:
        for pattern in patterns:
            hit = re.search(pattern, text)
            if hit:
                return hit.group(1)
    raise DouyinResolveError(f"无法从分享页解析作品 ID: {page_url}")
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _parse_router_data(html: str) -> Optional[dict[str, Any]]:
    """Decode the JSON assigned to ``window._ROUTER_DATA`` in the page.

    :return: the decoded JSON, or None when the assignment is absent or the
             text is not valid JSON
    """
    found = ROUTER_DATA_RE.search(html)
    if found is None:
        return None
    # Keep only what precedes the closing script tag, then drop trailing
    # whitespace and any ";" terminators.
    head, _, _ = found.group(1).partition("</script>")
    candidate = head.rstrip().rstrip(";")
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return None
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def _parse_render_data(html: str) -> Optional[dict[str, Any]]:
    """Decode the percent-encoded JSON inside the ``RENDER_DATA`` script tag.

    :return: the decoded JSON, or None when the tag is absent or the decoded
             text is not valid JSON
    """
    found = RENDER_DATA_RE.search(html)
    if found is None:
        return None
    decoded = unquote(found.group(1))
    try:
        return json.loads(decoded)
    except json.JSONDecodeError:
        return None
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _find_item_list(obj: Any) -> list[dict[str, Any]]:
    """Depth-first search for the first ``item_list`` that looks like post data.

    A dict's ``item_list`` qualifies when it is a non-empty list whose first
    element is a dict holding one of ``aweme_id``, ``awemeId``, ``video`` or
    ``images``. Returns an empty list when nothing qualifies.
    """
    if isinstance(obj, dict):
        candidate = obj.get("item_list")
        if isinstance(candidate, list) and candidate:
            head = candidate[0]
            marker_keys = ("aweme_id", "awemeId", "video", "images")
            if isinstance(head, dict) and any(key in head for key in marker_keys):
                return candidate
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        return []

    for child in children:
        nested = _find_item_list(child)
        if nested:
            return nested
    return []
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def _pick_url_from_image_node(image: dict[str, Any]) -> Optional[str]:
    """Return the last URL of ``url_list``, else of ``download_url_list``, else None."""
    for key in ("url_list", "download_url_list"):
        candidates = image.get(key) or []
        if candidates:
            return str(candidates[-1])
    return None
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def _extract_image_urls(item: dict[str, Any]) -> list[str]:
    """Collect de-duplicated image URLs from a post, in first-seen order.

    Images are read from ``item["images"]`` and then from
    ``item["image_post_info"]["images"]``; non-dict entries are ignored.
    """
    ordered: dict[str, None] = {}

    def collect(nodes: Any) -> None:
        for node in nodes:
            if not isinstance(node, dict):
                continue
            url = _pick_url_from_image_node(node)
            if url:
                # dict keys keep insertion order and reject duplicates
                ordered.setdefault(url, None)

    collect(item.get("images") or [])

    post = item.get("image_post_info") or {}
    if isinstance(post, dict):
        collect(post.get("images") or [])

    return list(ordered)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def _has_playable_video(item: dict[str, Any]) -> bool:
    """Tell whether the post's video node carries a play address (uri or url_list)."""
    video = item.get("video") or {}
    if isinstance(video, dict):
        play_addr = video.get("play_addr") or video.get("playAddr") or {}
        if isinstance(play_addr, dict):
            return bool(play_addr.get("uri") or play_addr.get("url_list"))
    return False
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def _is_image_note(item: dict[str, Any]) -> bool:
    """Classify a post as an image note.

    True when its ``aweme_type`` is one of ``IMAGE_AWEME_TYPES``, or when it
    has image URLs and no playable video.
    """
    if item.get("aweme_type") in IMAGE_AWEME_TYPES:
        return True
    has_images = bool(_extract_image_urls(item))
    return has_images and not _has_playable_video(item)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def _build_no_watermark_url(play_addr: dict[str, Any]) -> str:
    """Derive a download URL from a ``play_addr`` node.

    A non-empty ``uri`` is placed into the aweme.snssdk.com play endpoint;
    otherwise the first ``url_list`` entry is used with "playwm" replaced by
    "play".

    :raises DouyinResolveError: when the node has neither uri nor url_list
    """
    video_uri = play_addr.get("uri") or ""
    if video_uri:
        return f"https://aweme.snssdk.com/aweme/v1/play/?video_id={video_uri}&ratio=720p&line=0"

    candidates = play_addr.get("url_list") or []
    if not candidates:
        raise DouyinResolveError("分享页内嵌数据中未找到视频播放地址")
    return str(candidates[0]).replace("playwm", "play")
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def _extract_tags(item: dict[str, Any]) -> list[str]:
    """Collect unique tag names from ``text_extra`` (or ``video_tag`` as fallback).

    Each dict entry contributes its ``hashtag_name``, ``tag_name`` or ``name``
    (first non-empty one); non-dict entries and repeats are skipped.
    """
    raw_tags = item.get("text_extra") or item.get("video_tag") or []
    collected: list[str] = []
    known: set[str] = set()
    for entry in raw_tags:
        if isinstance(entry, dict):
            label = entry.get("hashtag_name") or entry.get("tag_name") or entry.get("name")
            if label and label not in known:
                known.add(label)
                collected.append(str(label))
    return collected
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def _duration_seconds(raw: Any) -> float:
    """Convert a raw duration to seconds.

    Values above 10000 are divided by 1000 (treated as milliseconds); values
    that cannot be converted to float yield 0.
    """
    try:
        seconds = float(raw or 0)
    except (TypeError, ValueError):
        return 0
    if seconds > 10000:
        return seconds / 1000
    return seconds
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def _meta_from_aweme_item(item: dict[str, Any], source_url: str) -> DouyinContentMeta:
    """Build a ``DouyinContentMeta`` from one post dict of the share-page data.

    Image notes get their image URLs (the first one doubles as cover); videos
    get a download URL and, when available, a cover URL.

    :param item: post dict taken from the page's ``item_list``
    :param source_url: stored unchanged as the result's ``source_url``
    :raises DouyinResolveError: when an image note has no image URL, or a
        video has no usable video / play-address node
    """
    aweme_id = str(item.get("aweme_id") or item.get("awemeId") or "")
    description = (item.get("desc") or item.get("caption") or "").strip()
    title = description or f"douyin_{aweme_id}"  # placeholder when the post has no text
    aweme_type = item.get("aweme_type")
    tags = _extract_tags(item)

    author_node = item.get("author") or {}
    if isinstance(author_node, dict):
        author = author_node.get("nickname") or author_node.get("unique_id") or ""
    else:
        author = ""

    # Fields shared by both content types.
    shared = dict(
        aweme_id=aweme_id,
        title=title,
        author=author,
        source_url=source_url,
        aweme_type=aweme_type,
        duration=_duration_seconds(item.get("duration")),
        tags=tags,
    )

    if _is_image_note(item):
        image_urls = _extract_image_urls(item)
        if not image_urls:
            raise DouyinResolveError("识别为图文，但未找到图片地址")
        return DouyinContentMeta(
            content_type="image",
            cover_url=image_urls[0],
            image_urls=image_urls,
            **shared,
        )

    video = item.get("video") or {}
    if not isinstance(video, dict):
        raise DouyinResolveError("分享页内嵌数据中未找到视频节点")
    play_addr = video.get("play_addr") or video.get("playAddr") or {}
    if not isinstance(play_addr, dict):
        raise DouyinResolveError("视频节点缺少 play_addr")

    download_url = _build_no_watermark_url(play_addr)

    # First cover variant that offers a URL wins.
    cover_url = None
    for key in ("cover", "origin_cover", "dynamic_cover", "cover_original_scale"):
        cover_node = video.get(key) or {}
        if not isinstance(cover_node, dict):
            continue
        cover_candidates = cover_node.get("url_list") or []
        if cover_candidates:
            cover_url = str(cover_candidates[0])
            break

    # Override the download URL with the first bit-rate variant whose URL has
    # no "playwm" and points at a "douyinvod" or "bytecdn" address.
    for variant in video.get("bit_rate") or []:
        if not isinstance(variant, dict):
            continue
        variant_addr = variant.get("play_addr") or {}
        if not (isinstance(variant_addr, dict) and variant_addr.get("url_list")):
            continue
        candidate = str(variant_addr["url_list"][0])
        if "playwm" in candidate:
            continue
        if "douyinvod" in candidate or "bytecdn" in candidate:
            download_url = candidate
            break

    return DouyinContentMeta(
        content_type="video",
        download_url=download_url,
        cover_url=cover_url,
        **shared,
    )
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def parse_share_page_html(html: str, page_url: str, original_share: str) -> DouyinContentMeta:
    """Extract post metadata from the embedded data of a share page.

    The ``_ROUTER_DATA`` payload is tried first, then ``RENDER_DATA``; the
    first payload that contains an item list decides the result.

    :param html: share page HTML
    :param page_url: final page URL, used to recover a missing post ID
    :param original_share: link stored as the result's ``source_url``
    :return: metadata of the first item found
    :raises DouyinResolveError: when neither payload yields an item list, or
        when item parsing / ID recovery fails
    """
    # Local import: only this function needs it.
    from dataclasses import replace

    for parser in (_parse_router_data, _parse_render_data):
        payload = parser(html)
        if not payload:
            continue
        items = _find_item_list(payload)
        if not items:
            continue

        meta = _meta_from_aweme_item(items[0], original_share)
        if meta.aweme_id:
            return meta

        # The item carried no ID: recover it from the URL / HTML and copy the
        # rest of the metadata over unchanged.
        aweme_id = extract_aweme_id(page_url, html)
        title = meta.title
        if title == "douyin_":
            # The placeholder title was built from the empty ID; rebuild it
            # now that the real ID is known.
            title = f"douyin_{aweme_id}"
        return replace(meta, aweme_id=aweme_id, title=title)

    raise DouyinResolveError(
        "分享页未找到内嵌公开数据（_ROUTER_DATA / RENDER_DATA）。"
        "请确认链接有效。"
    )
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def resolve_douyin_share(share_text: str) -> DouyinContentMeta:
    """Resolve share text (or a bare link) into post metadata.

    Steps: extract the link, rewrite it to the share page when applicable,
    fetch that page and parse its embedded data.

    :param share_text: text containing a Douyin link
    :raises DouyinResolveError: when no link is found or the page data
        cannot be parsed
    """
    share_url = expand_share_url(share_text)
    fetch_url = normalize_to_share_page(share_url)
    # The session is opened only once a link exists and is closed as soon as
    # the page is fetched, so its connections are not left open.
    with _session() as session:
        page_url, html = resolve_share_page(session, fetch_url)
    return parse_share_page_html(html, page_url, share_url)
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
def _download_file(url: str, dest: str) -> str:
    """Stream *url* to *dest* and return *dest*.

    The data is written to ``<dest>.part`` and moved into place only after the
    download finished, so a failed or interrupted transfer never leaves a
    truncated file at *dest* that a later existence check would mistake for a
    complete download.

    :param url: file URL
    :param dest: destination path; its parent directory is created if needed
    :raises: request errors (including error statuses) and file-system errors
    """
    parent = os.path.dirname(dest)
    if parent:  # a bare filename has no directory to create
        os.makedirs(parent, exist_ok=True)

    headers = {"User-Agent": SHARE_PAGE_UA, "Referer": "https://www.iesdouyin.com/"}
    partial = f"{dest}.part"
    try:
        with requests.get(url, headers=headers, stream=True, timeout=120) as response:
            response.raise_for_status()
            with open(partial, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 256):
                    if chunk:
                        file.write(chunk)
        os.replace(partial, dest)
    finally:
        # After a successful move the partial file no longer exists; after a
        # failure it is discarded.
        if os.path.exists(partial):
            try:
                os.remove(partial)
            except OSError:
                pass
    return dest
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def _extract_audio(video_path: str, audio_path: str) -> None:
    """Run ffmpeg to write the audio track of *video_path* as MP3 to *audio_path*.

    ffmpeg output is discarded. When ffmpeg exits with an error, any output
    file it left behind is removed so that a later existence check does not
    treat it as a finished conversion.

    :raises subprocess.CalledProcessError: when ffmpeg exits non-zero
    """
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "libmp3lame", audio_path],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError:
        try:
            os.remove(audio_path)
        except OSError:
            pass  # nothing was written, or it cannot be removed
        raise
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
def _build_result(
    meta: DouyinContentMeta,
    audio_path: str,
    video_path: Optional[str],
) -> AudioDownloadResult:
    """Wrap resolved metadata and local file paths into an ``AudioDownloadResult``.

    :param meta: resolved post metadata
    :param audio_path: stored as the result's ``file_path``
    :param video_path: stored as the result's ``video_path``
    """
    extra = {
        "tags": meta.tags,
        "author": meta.author,
        "source_url": meta.source_url,
        "content_type": meta.content_type,
        "image_urls": meta.image_urls,
    }
    return AudioDownloadResult(
        file_path=audio_path,
        title=meta.title,
        duration=meta.duration,
        cover_url=meta.cover_url,
        platform="douyin",
        video_id=meta.aweme_id,
        raw_info=extra,
        video_path=video_path,
    )
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
class DouyinDownloader(Downloader):
|
| 408 |
+
def __init__(self, cookie=None):
|
| 409 |
+
super().__init__()
|
| 410 |
+
|
| 411 |
+
def extract_video_id(self, url: str) -> str:
|
| 412 |
+
try:
|
| 413 |
+
return extract_aweme_id(normalize_to_share_page(expand_share_url(url)))
|
| 414 |
+
except DouyinResolveError:
|
| 415 |
+
return ""
|
| 416 |
+
|
| 417 |
+
def _resolve_meta(self, video_url: str) -> DouyinContentMeta:
|
| 418 |
+
try:
|
| 419 |
+
return resolve_douyin_share(video_url)
|
| 420 |
+
except DouyinResolveError:
|
| 421 |
+
raise
|
| 422 |
+
except Exception as exc:
|
| 423 |
+
raise DouyinResolveError(f"抖音分享页解析失败:{exc}") from exc
|
| 424 |
+
|
| 425 |
+
def download(
|
| 426 |
+
self,
|
| 427 |
+
video_url: str,
|
| 428 |
+
output_dir: Union[str, None] = None,
|
| 429 |
+
quality: DownloadQuality = "fast",
|
| 430 |
+
need_video: Optional[bool] = False,
|
| 431 |
+
skip_download: bool = False,
|
| 432 |
+
) -> AudioDownloadResult:
|
| 433 |
+
if output_dir is None:
|
| 434 |
+
output_dir = get_data_dir()
|
| 435 |
+
if not output_dir:
|
| 436 |
+
output_dir = self.cache_data
|
| 437 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 438 |
+
|
| 439 |
+
meta = self._resolve_meta(video_url)
|
| 440 |
+
if meta.content_type == "image":
|
| 441 |
+
return _build_result(meta, "", None)
|
| 442 |
+
|
| 443 |
+
video_path = os.path.join(output_dir, f"{meta.aweme_id}.mp4")
|
| 444 |
+
audio_path = os.path.join(output_dir, f"{meta.aweme_id}.mp3")
|
| 445 |
+
|
| 446 |
+
if skip_download:
|
| 447 |
+
return _build_result(meta, "", None)
|
| 448 |
+
|
| 449 |
+
if not os.path.exists(video_path):
|
| 450 |
+
_download_file(meta.download_url, video_path)
|
| 451 |
+
|
| 452 |
+
if not os.path.exists(audio_path):
|
| 453 |
+
try:
|
| 454 |
+
_extract_audio(video_path, audio_path)
|
| 455 |
+
except subprocess.CalledProcessError as exc:
|
| 456 |
+
raise RuntimeError("ffmpeg 转换 MP3 失败") from exc
|
| 457 |
+
|
| 458 |
+
return _build_result(
|
| 459 |
+
meta,
|
| 460 |
+
audio_path,
|
| 461 |
+
video_path if need_video or os.path.exists(video_path) else None,
|
| 462 |
+
)
|
| 463 |
+
|
| 464 |
+
def download_video(self, video_url: str, output_dir: Union[str, None] = None) -> str:
|
| 465 |
+
if output_dir is None:
|
| 466 |
+
output_dir = get_data_dir()
|
| 467 |
+
if not output_dir:
|
| 468 |
+
output_dir = self.cache_data
|
| 469 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 470 |
+
|
| 471 |
+
meta = self._resolve_meta(video_url)
|
| 472 |
+
if meta.content_type == "image":
|
| 473 |
+
raise DouyinResolveError("抖音图文内容没有可下载的视频文件")
|
| 474 |
+
|
| 475 |
+
video_path = os.path.join(output_dir, f"{meta.aweme_id}.mp4")
|
| 476 |
+
if not os.path.exists(video_path):
|
| 477 |
+
_download_file(meta.download_url, video_path)
|
| 478 |
+
return video_path
|
| 479 |
+
|
| 480 |
+
def download_subtitles(
    self,
    video_url: str,
    output_dir: str = None,
    langs: list = None,
) -> Optional[TranscriptResult]:
    """Return the caption of an image post as a single-segment transcript.

    ``output_dir`` and ``langs`` are accepted but not used by this
    implementation.

    Returns:
        ``None`` unless the resolved post is an image post with a non-empty
        title; otherwise a ``TranscriptResult`` whose only segment spans
        ``0`` to ``meta.duration`` (``0`` when the duration is falsy) and
        carries the title as text.
    """
    meta = self._resolve_meta(video_url)
    has_caption = meta.content_type == "image" and meta.title
    if not has_caption:
        return None

    caption = meta.title
    only_segment = TranscriptSegment(
        start=0,
        end=meta.duration or 0,
        text=caption,
    )
    return TranscriptResult(
        language="zh",
        full_text=caption,
        segments=[only_segment],
    )
|
backend/app/downloaders/douyin_helper/abogus.py
ADDED
|
@@ -0,0 +1,635 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Original Author:
|
| 3 |
+
This file is from https://github.com/JoeanAmier/TikTokDownloader
|
| 4 |
+
And is licensed under the GNU General Public License v3.0
|
| 5 |
+
If you use this code, please keep this license and the original author information.
|
| 6 |
+
|
| 7 |
+
Modified by:
|
| 8 |
+
And this file is now a part of the https://github.com/Evil0ctal/Douyin_TikTok_Download_API open-source project.
|
| 9 |
+
This project is licensed under the Apache License 2.0, and the original author information is kept.
|
| 10 |
+
|
| 11 |
+
Purpose:
|
| 12 |
+
This file is used to generate the `a_bogus` parameter for the Douyin Web API.
|
| 13 |
+
|
| 14 |
+
Changes Made:
|
| 15 |
+
1. Changed the ua_code to be compatible with the User-Agent string in the current config file: https://github.com/Evil0ctal/Douyin_TikTok_Download_API/blob/main/crawlers/douyin/web/config.yaml
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from random import choice
|
| 19 |
+
from random import randint
|
| 20 |
+
from random import random
|
| 21 |
+
from re import compile
|
| 22 |
+
from time import time
|
| 23 |
+
from urllib.parse import urlencode
|
| 24 |
+
from urllib.parse import quote
|
| 25 |
+
from gmssl import sm3, func
|
| 26 |
+
|
| 27 |
+
__all__ = ["ABogus", ]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ABogus:
|
| 31 |
+
__filter = compile(r'%([0-9A-F]{2})')
|
| 32 |
+
__arguments = [0, 1, 14]
|
| 33 |
+
__ua_key = "\u0000\u0001\u000e"
|
| 34 |
+
__end_string = "cus"
|
| 35 |
+
__version = [1, 0, 1, 5]
|
| 36 |
+
__browser = "1536|742|1536|864|0|0|0|0|1536|864|1536|864|1536|742|24|24|MacIntel"
|
| 37 |
+
__reg = [
|
| 38 |
+
1937774191,
|
| 39 |
+
1226093241,
|
| 40 |
+
388252375,
|
| 41 |
+
3666478592,
|
| 42 |
+
2842636476,
|
| 43 |
+
372324522,
|
| 44 |
+
3817729613,
|
| 45 |
+
2969243214,
|
| 46 |
+
]
|
| 47 |
+
__str = {
|
| 48 |
+
"s0": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
|
| 49 |
+
"s1": "Dkdpgh4ZKsQB80/Mfvw36XI1R25+WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
|
| 50 |
+
"s2": "Dkdpgh4ZKsQB80/Mfvw36XI1R25-WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
|
| 51 |
+
"s3": "ckdp1h4ZKsUB80/Mfvw36XIgR25+WQAlEi7NLboqYTOPuzmFjJnryx9HVGDaStCe",
|
| 52 |
+
"s4": "Dkdpgh2ZmsQB80/MfvV36XI1R45-WUAlEixNLwoqYTOPuzKFjJnry79HbGcaStCe",
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
def __init__(self,
|
| 56 |
+
# user_agent: str = USERAGENT,
|
| 57 |
+
platform: str = None, ):
|
| 58 |
+
self.chunk = []
|
| 59 |
+
self.size = 0
|
| 60 |
+
self.reg = self.__reg[:]
|
| 61 |
+
# self.ua_code = self.generate_ua_code(user_agent)
|
| 62 |
+
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36
|
| 63 |
+
self.ua_code = [
|
| 64 |
+
76,
|
| 65 |
+
98,
|
| 66 |
+
15,
|
| 67 |
+
131,
|
| 68 |
+
97,
|
| 69 |
+
245,
|
| 70 |
+
224,
|
| 71 |
+
133,
|
| 72 |
+
122,
|
| 73 |
+
199,
|
| 74 |
+
241,
|
| 75 |
+
166,
|
| 76 |
+
79,
|
| 77 |
+
34,
|
| 78 |
+
90,
|
| 79 |
+
191,
|
| 80 |
+
128,
|
| 81 |
+
126,
|
| 82 |
+
122,
|
| 83 |
+
98,
|
| 84 |
+
66,
|
| 85 |
+
11,
|
| 86 |
+
14,
|
| 87 |
+
40,
|
| 88 |
+
49,
|
| 89 |
+
110,
|
| 90 |
+
110,
|
| 91 |
+
173,
|
| 92 |
+
67,
|
| 93 |
+
96,
|
| 94 |
+
138,
|
| 95 |
+
252]
|
| 96 |
+
self.browser = self.generate_browser_info(
|
| 97 |
+
platform) if platform else self.__browser
|
| 98 |
+
self.browser_len = len(self.browser)
|
| 99 |
+
self.browser_code = self.char_code_at(self.browser)
|
| 100 |
+
|
| 101 |
+
@classmethod
|
| 102 |
+
def list_1(cls, random_num=None, a=170, b=85, c=45, ) -> list:
|
| 103 |
+
return cls.random_list(
|
| 104 |
+
random_num,
|
| 105 |
+
a,
|
| 106 |
+
b,
|
| 107 |
+
1,
|
| 108 |
+
2,
|
| 109 |
+
5,
|
| 110 |
+
c & a,
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
@classmethod
|
| 114 |
+
def list_2(cls, random_num=None, a=170, b=85, ) -> list:
|
| 115 |
+
return cls.random_list(
|
| 116 |
+
random_num,
|
| 117 |
+
a,
|
| 118 |
+
b,
|
| 119 |
+
1,
|
| 120 |
+
0,
|
| 121 |
+
0,
|
| 122 |
+
0,
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
@classmethod
|
| 126 |
+
def list_3(cls, random_num=None, a=170, b=85, ) -> list:
|
| 127 |
+
return cls.random_list(
|
| 128 |
+
random_num,
|
| 129 |
+
a,
|
| 130 |
+
b,
|
| 131 |
+
1,
|
| 132 |
+
0,
|
| 133 |
+
5,
|
| 134 |
+
0,
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
@staticmethod
|
| 138 |
+
def random_list(
|
| 139 |
+
a: float = None,
|
| 140 |
+
b=170,
|
| 141 |
+
c=85,
|
| 142 |
+
d=0,
|
| 143 |
+
e=0,
|
| 144 |
+
f=0,
|
| 145 |
+
g=0,
|
| 146 |
+
) -> list:
|
| 147 |
+
r = a or (random() * 10000)
|
| 148 |
+
v = [
|
| 149 |
+
r,
|
| 150 |
+
int(r) & 255,
|
| 151 |
+
int(r) >> 8,
|
| 152 |
+
]
|
| 153 |
+
s = v[1] & b | d
|
| 154 |
+
v.append(s)
|
| 155 |
+
s = v[1] & c | e
|
| 156 |
+
v.append(s)
|
| 157 |
+
s = v[2] & b | f
|
| 158 |
+
v.append(s)
|
| 159 |
+
s = v[2] & c | g
|
| 160 |
+
v.append(s)
|
| 161 |
+
return v[-4:]
|
| 162 |
+
|
| 163 |
+
@staticmethod
|
| 164 |
+
def from_char_code(*args):
|
| 165 |
+
return "".join(chr(code) for code in args)
|
| 166 |
+
|
| 167 |
+
@classmethod
|
| 168 |
+
def generate_string_1(
|
| 169 |
+
cls,
|
| 170 |
+
random_num_1=None,
|
| 171 |
+
random_num_2=None,
|
| 172 |
+
random_num_3=None,
|
| 173 |
+
):
|
| 174 |
+
return cls.from_char_code(*cls.list_1(random_num_1)) + cls.from_char_code(
|
| 175 |
+
*cls.list_2(random_num_2)) + cls.from_char_code(*cls.list_3(random_num_3))
|
| 176 |
+
|
| 177 |
+
def generate_string_2(
|
| 178 |
+
self,
|
| 179 |
+
url_params: str,
|
| 180 |
+
method="GET",
|
| 181 |
+
start_time=0,
|
| 182 |
+
end_time=0,
|
| 183 |
+
) -> str:
|
| 184 |
+
a = self.generate_string_2_list(
|
| 185 |
+
url_params,
|
| 186 |
+
method,
|
| 187 |
+
start_time,
|
| 188 |
+
end_time,
|
| 189 |
+
)
|
| 190 |
+
e = self.end_check_num(a)
|
| 191 |
+
a.extend(self.browser_code)
|
| 192 |
+
a.append(e)
|
| 193 |
+
return self.rc4_encrypt(self.from_char_code(*a), "y")
|
| 194 |
+
|
| 195 |
+
def generate_string_2_list(
|
| 196 |
+
self,
|
| 197 |
+
url_params: str,
|
| 198 |
+
method="GET",
|
| 199 |
+
start_time=0,
|
| 200 |
+
end_time=0,
|
| 201 |
+
) -> list:
|
| 202 |
+
start_time = start_time or int(time() * 1000)
|
| 203 |
+
end_time = end_time or (start_time + randint(4, 8))
|
| 204 |
+
params_array = self.generate_params_code(url_params)
|
| 205 |
+
method_array = self.generate_method_code(method)
|
| 206 |
+
return self.list_4(
|
| 207 |
+
(end_time >> 24) & 255,
|
| 208 |
+
params_array[21],
|
| 209 |
+
self.ua_code[23],
|
| 210 |
+
(end_time >> 16) & 255,
|
| 211 |
+
params_array[22],
|
| 212 |
+
self.ua_code[24],
|
| 213 |
+
(end_time >> 8) & 255,
|
| 214 |
+
(end_time >> 0) & 255,
|
| 215 |
+
(start_time >> 24) & 255,
|
| 216 |
+
(start_time >> 16) & 255,
|
| 217 |
+
(start_time >> 8) & 255,
|
| 218 |
+
(start_time >> 0) & 255,
|
| 219 |
+
method_array[21],
|
| 220 |
+
method_array[22],
|
| 221 |
+
int(end_time / 256 / 256 / 256 / 256) >> 0,
|
| 222 |
+
int(start_time / 256 / 256 / 256 / 256) >> 0,
|
| 223 |
+
self.browser_len,
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
@staticmethod
|
| 227 |
+
def reg_to_array(a):
|
| 228 |
+
o = [0] * 32
|
| 229 |
+
for i in range(8):
|
| 230 |
+
c = a[i]
|
| 231 |
+
o[4 * i + 3] = (255 & c)
|
| 232 |
+
c >>= 8
|
| 233 |
+
o[4 * i + 2] = (255 & c)
|
| 234 |
+
c >>= 8
|
| 235 |
+
o[4 * i + 1] = (255 & c)
|
| 236 |
+
c >>= 8
|
| 237 |
+
o[4 * i] = (255 & c)
|
| 238 |
+
|
| 239 |
+
return o
|
| 240 |
+
|
| 241 |
+
def compress(self, a):
|
| 242 |
+
f = self.generate_f(a)
|
| 243 |
+
i = self.reg[:]
|
| 244 |
+
for o in range(64):
|
| 245 |
+
c = self.de(i[0], 12) + i[4] + self.de(self.pe(o), o)
|
| 246 |
+
c = (c & 0xFFFFFFFF)
|
| 247 |
+
c = self.de(c, 7)
|
| 248 |
+
s = (c ^ self.de(i[0], 12)) & 0xFFFFFFFF
|
| 249 |
+
|
| 250 |
+
u = self.he(o, i[0], i[1], i[2])
|
| 251 |
+
u = (u + i[3] + s + f[o + 68]) & 0xFFFFFFFF
|
| 252 |
+
|
| 253 |
+
b = self.ve(o, i[4], i[5], i[6])
|
| 254 |
+
b = (b + i[7] + c + f[o]) & 0xFFFFFFFF
|
| 255 |
+
|
| 256 |
+
i[3] = i[2]
|
| 257 |
+
i[2] = self.de(i[1], 9)
|
| 258 |
+
i[1] = i[0]
|
| 259 |
+
i[0] = u
|
| 260 |
+
|
| 261 |
+
i[7] = i[6]
|
| 262 |
+
i[6] = self.de(i[5], 19)
|
| 263 |
+
i[5] = i[4]
|
| 264 |
+
i[4] = (b ^ self.de(b, 9) ^ self.de(b, 17)) & 0xFFFFFFFF
|
| 265 |
+
|
| 266 |
+
for l in range(8):
|
| 267 |
+
self.reg[l] = (self.reg[l] ^ i[l]) & 0xFFFFFFFF
|
| 268 |
+
|
| 269 |
+
@classmethod
|
| 270 |
+
def generate_f(cls, e):
|
| 271 |
+
r = [0] * 132
|
| 272 |
+
|
| 273 |
+
for t in range(16):
|
| 274 |
+
r[t] = (e[4 * t] << 24) | (e[4 * t + 1] <<
|
| 275 |
+
16) | (e[4 * t + 2] << 8) | e[4 * t + 3]
|
| 276 |
+
r[t] &= 0xFFFFFFFF
|
| 277 |
+
|
| 278 |
+
for n in range(16, 68):
|
| 279 |
+
a = r[n - 16] ^ r[n - 9] ^ cls.de(r[n - 3], 15)
|
| 280 |
+
a = a ^ cls.de(a, 15) ^ cls.de(a, 23)
|
| 281 |
+
r[n] = (a ^ cls.de(r[n - 13], 7) ^ r[n - 6]) & 0xFFFFFFFF
|
| 282 |
+
|
| 283 |
+
for n in range(68, 132):
|
| 284 |
+
r[n] = (r[n - 68] ^ r[n - 64]) & 0xFFFFFFFF
|
| 285 |
+
|
| 286 |
+
return r
|
| 287 |
+
|
| 288 |
+
@staticmethod
|
| 289 |
+
def pad_array(arr, length=60):
|
| 290 |
+
while len(arr) < length:
|
| 291 |
+
arr.append(0)
|
| 292 |
+
return arr
|
| 293 |
+
|
| 294 |
+
def fill(self, length=60):
|
| 295 |
+
size = 8 * self.size
|
| 296 |
+
self.chunk.append(128)
|
| 297 |
+
self.chunk = self.pad_array(self.chunk, length)
|
| 298 |
+
for i in range(4):
|
| 299 |
+
self.chunk.append((size >> 8 * (3 - i)) & 255)
|
| 300 |
+
|
| 301 |
+
@staticmethod
|
| 302 |
+
def list_4(
|
| 303 |
+
a: int,
|
| 304 |
+
b: int,
|
| 305 |
+
c: int,
|
| 306 |
+
d: int,
|
| 307 |
+
e: int,
|
| 308 |
+
f: int,
|
| 309 |
+
g: int,
|
| 310 |
+
h: int,
|
| 311 |
+
i: int,
|
| 312 |
+
j: int,
|
| 313 |
+
k: int,
|
| 314 |
+
m: int,
|
| 315 |
+
n: int,
|
| 316 |
+
o: int,
|
| 317 |
+
p: int,
|
| 318 |
+
q: int,
|
| 319 |
+
r: int,
|
| 320 |
+
) -> list:
|
| 321 |
+
return [
|
| 322 |
+
44,
|
| 323 |
+
a,
|
| 324 |
+
0,
|
| 325 |
+
0,
|
| 326 |
+
0,
|
| 327 |
+
0,
|
| 328 |
+
24,
|
| 329 |
+
b,
|
| 330 |
+
n,
|
| 331 |
+
0,
|
| 332 |
+
c,
|
| 333 |
+
d,
|
| 334 |
+
0,
|
| 335 |
+
0,
|
| 336 |
+
0,
|
| 337 |
+
1,
|
| 338 |
+
0,
|
| 339 |
+
239,
|
| 340 |
+
e,
|
| 341 |
+
o,
|
| 342 |
+
f,
|
| 343 |
+
g,
|
| 344 |
+
0,
|
| 345 |
+
0,
|
| 346 |
+
0,
|
| 347 |
+
0,
|
| 348 |
+
h,
|
| 349 |
+
0,
|
| 350 |
+
0,
|
| 351 |
+
14,
|
| 352 |
+
i,
|
| 353 |
+
j,
|
| 354 |
+
0,
|
| 355 |
+
k,
|
| 356 |
+
m,
|
| 357 |
+
3,
|
| 358 |
+
p,
|
| 359 |
+
1,
|
| 360 |
+
q,
|
| 361 |
+
1,
|
| 362 |
+
r,
|
| 363 |
+
0,
|
| 364 |
+
0,
|
| 365 |
+
0]
|
| 366 |
+
|
| 367 |
+
@staticmethod
|
| 368 |
+
def end_check_num(a: list):
|
| 369 |
+
r = 0
|
| 370 |
+
for i in a:
|
| 371 |
+
r ^= i
|
| 372 |
+
return r
|
| 373 |
+
|
| 374 |
+
@classmethod
|
| 375 |
+
def decode_string(cls, url_string, ):
|
| 376 |
+
decoded = cls.__filter.sub(cls.replace_func, url_string)
|
| 377 |
+
return decoded
|
| 378 |
+
|
| 379 |
+
@staticmethod
|
| 380 |
+
def replace_func(match):
|
| 381 |
+
return chr(int(match.group(1), 16))
|
| 382 |
+
|
| 383 |
+
@staticmethod
|
| 384 |
+
def de(e, r):
|
| 385 |
+
r %= 32
|
| 386 |
+
return ((e << r) & 0xFFFFFFFF) | (e >> (32 - r))
|
| 387 |
+
|
| 388 |
+
@staticmethod
|
| 389 |
+
def pe(e):
|
| 390 |
+
return 2043430169 if 0 <= e < 16 else 2055708042
|
| 391 |
+
|
| 392 |
+
@staticmethod
|
| 393 |
+
def he(e, r, t, n):
|
| 394 |
+
if 0 <= e < 16:
|
| 395 |
+
return (r ^ t ^ n) & 0xFFFFFFFF
|
| 396 |
+
elif 16 <= e < 64:
|
| 397 |
+
return (r & t | r & n | t & n) & 0xFFFFFFFF
|
| 398 |
+
raise ValueError
|
| 399 |
+
|
| 400 |
+
@staticmethod
|
| 401 |
+
def ve(e, r, t, n):
|
| 402 |
+
if 0 <= e < 16:
|
| 403 |
+
return (r ^ t ^ n) & 0xFFFFFFFF
|
| 404 |
+
elif 16 <= e < 64:
|
| 405 |
+
return (r & t | ~r & n) & 0xFFFFFFFF
|
| 406 |
+
raise ValueError
|
| 407 |
+
|
| 408 |
+
@staticmethod
|
| 409 |
+
def convert_to_char_code(a):
|
| 410 |
+
d = []
|
| 411 |
+
for i in a:
|
| 412 |
+
d.append(ord(i))
|
| 413 |
+
return d
|
| 414 |
+
|
| 415 |
+
@staticmethod
|
| 416 |
+
def split_array(arr, chunk_size=64):
|
| 417 |
+
result = []
|
| 418 |
+
for i in range(0, len(arr), chunk_size):
|
| 419 |
+
result.append(arr[i:i + chunk_size])
|
| 420 |
+
return result
|
| 421 |
+
|
| 422 |
+
@staticmethod
|
| 423 |
+
def char_code_at(s):
|
| 424 |
+
return [ord(char) for char in s]
|
| 425 |
+
|
| 426 |
+
def write(self, e, ):
|
| 427 |
+
self.size = len(e)
|
| 428 |
+
if isinstance(e, str):
|
| 429 |
+
e = self.decode_string(e)
|
| 430 |
+
e = self.char_code_at(e)
|
| 431 |
+
if len(e) <= 64:
|
| 432 |
+
self.chunk = e
|
| 433 |
+
else:
|
| 434 |
+
chunks = self.split_array(e, 64)
|
| 435 |
+
for i in chunks[:-1]:
|
| 436 |
+
self.compress(i)
|
| 437 |
+
self.chunk = chunks[-1]
|
| 438 |
+
|
| 439 |
+
def reset(self, ):
|
| 440 |
+
self.chunk = []
|
| 441 |
+
self.size = 0
|
| 442 |
+
self.reg = self.__reg[:]
|
| 443 |
+
|
| 444 |
+
def sum(self, e, length=60):
|
| 445 |
+
self.reset()
|
| 446 |
+
self.write(e)
|
| 447 |
+
self.fill(length)
|
| 448 |
+
self.compress(self.chunk)
|
| 449 |
+
return self.reg_to_array(self.reg)
|
| 450 |
+
|
| 451 |
+
@classmethod
|
| 452 |
+
def generate_result_unit(cls, n, s):
|
| 453 |
+
r = ""
|
| 454 |
+
for i, j in zip(range(18, -1, -6), (16515072, 258048, 4032, 63)):
|
| 455 |
+
r += cls.__str[s][(n & j) >> i]
|
| 456 |
+
return r
|
| 457 |
+
|
| 458 |
+
@classmethod
|
| 459 |
+
def generate_result_end(cls, s, e="s4"):
|
| 460 |
+
r = ""
|
| 461 |
+
b = ord(s[120]) << 16
|
| 462 |
+
r += cls.__str[e][(b & 16515072) >> 18]
|
| 463 |
+
r += cls.__str[e][(b & 258048) >> 12]
|
| 464 |
+
r += "=="
|
| 465 |
+
return r
|
| 466 |
+
|
| 467 |
+
@classmethod
|
| 468 |
+
def generate_result(cls, s, e="s4"):
|
| 469 |
+
# r = ""
|
| 470 |
+
# for i in range(len(s)//4):
|
| 471 |
+
# b = ((ord(s[i * 3]) << 16) | (ord(s[i * 3 + 1]))
|
| 472 |
+
# << 8) | ord(s[i * 3 + 2])
|
| 473 |
+
# r += cls.generate_result_unit(b, e)
|
| 474 |
+
# return r
|
| 475 |
+
|
| 476 |
+
r = []
|
| 477 |
+
|
| 478 |
+
for i in range(0, len(s), 3):
|
| 479 |
+
if i + 2 < len(s):
|
| 480 |
+
n = (
|
| 481 |
+
(ord(s[i]) << 16)
|
| 482 |
+
| (ord(s[i + 1]) << 8)
|
| 483 |
+
| ord(s[i + 2])
|
| 484 |
+
)
|
| 485 |
+
elif i + 1 < len(s):
|
| 486 |
+
n = (ord(s[i]) << 16) | (
|
| 487 |
+
ord(s[i + 1]) << 8
|
| 488 |
+
)
|
| 489 |
+
else:
|
| 490 |
+
n = ord(s[i]) << 16
|
| 491 |
+
|
| 492 |
+
for j, k in zip(range(18, -1, -6),
|
| 493 |
+
(0xFC0000, 0x03F000, 0x0FC0, 0x3F)):
|
| 494 |
+
if j == 6 and i + 1 >= len(s):
|
| 495 |
+
break
|
| 496 |
+
if j == 0 and i + 2 >= len(s):
|
| 497 |
+
break
|
| 498 |
+
r.append(cls.__str[e][(n & k) >> j])
|
| 499 |
+
|
| 500 |
+
r.append("=" * ((4 - len(r) % 4) % 4))
|
| 501 |
+
return "".join(r)
|
| 502 |
+
|
| 503 |
+
@classmethod
|
| 504 |
+
def generate_args_code(cls):
|
| 505 |
+
a = []
|
| 506 |
+
for j in range(24, -1, -8):
|
| 507 |
+
a.append(cls.__arguments[0] >> j)
|
| 508 |
+
a.append(cls.__arguments[1] / 256)
|
| 509 |
+
a.append(cls.__arguments[1] % 256)
|
| 510 |
+
a.append(cls.__arguments[1] >> 24)
|
| 511 |
+
a.append(cls.__arguments[1] >> 16)
|
| 512 |
+
for j in range(24, -1, -8):
|
| 513 |
+
a.append(cls.__arguments[2] >> j)
|
| 514 |
+
return [int(i) & 255 for i in a]
|
| 515 |
+
|
| 516 |
+
def generate_method_code(self, method: str = "GET") -> list[int]:
|
| 517 |
+
return self.sm3_to_array(self.sm3_to_array(method + self.__end_string))
|
| 518 |
+
# return self.sum(self.sum(method + self.__end_string))
|
| 519 |
+
|
| 520 |
+
def generate_params_code(self, params: str) -> list[int]:
|
| 521 |
+
return self.sm3_to_array(self.sm3_to_array(params + self.__end_string))
|
| 522 |
+
# return self.sum(self.sum(params + self.__end_string))
|
| 523 |
+
|
| 524 |
+
@classmethod
|
| 525 |
+
def sm3_to_array(cls, data: str | list) -> list[int]:
|
| 526 |
+
"""
|
| 527 |
+
代码参考: https://github.com/Johnserf-Seed/f2/blob/main/f2/utils/abogus.py
|
| 528 |
+
|
| 529 |
+
计算请求体的 SM3 哈希值,并将结果转换为整数数组
|
| 530 |
+
Calculate the SM3 hash value of the request body and convert the result to an array of integers
|
| 531 |
+
|
| 532 |
+
Args:
|
| 533 |
+
data (Union[str, List[int]]): 输入数据 (Input data).
|
| 534 |
+
|
| 535 |
+
Returns:
|
| 536 |
+
List[int]: 哈希值的整数数组 (Array of integers representing the hash value).
|
| 537 |
+
"""
|
| 538 |
+
|
| 539 |
+
if isinstance(data, str):
|
| 540 |
+
b = data.encode("utf-8")
|
| 541 |
+
else:
|
| 542 |
+
b = bytes(data) # 将 List[int] 转换为字节数组
|
| 543 |
+
|
| 544 |
+
# 将字节数组转换为适合 sm3.sm3_hash 函数处理的列表格式
|
| 545 |
+
h = sm3.sm3_hash(func.bytes_to_list(b))
|
| 546 |
+
|
| 547 |
+
# 将十六进制字符串结果转换为十进制整数列表
|
| 548 |
+
return [int(h[i: i + 2], 16) for i in range(0, len(h), 2)]
|
| 549 |
+
|
| 550 |
+
@classmethod
|
| 551 |
+
def generate_browser_info(cls, platform: str = "Win32") -> str:
|
| 552 |
+
inner_width = randint(1280, 1920)
|
| 553 |
+
inner_height = randint(720, 1080)
|
| 554 |
+
outer_width = randint(inner_width, 1920)
|
| 555 |
+
outer_height = randint(inner_height, 1080)
|
| 556 |
+
screen_x = 0
|
| 557 |
+
screen_y = choice((0, 30))
|
| 558 |
+
value_list = [
|
| 559 |
+
inner_width,
|
| 560 |
+
inner_height,
|
| 561 |
+
outer_width,
|
| 562 |
+
outer_height,
|
| 563 |
+
screen_x,
|
| 564 |
+
screen_y,
|
| 565 |
+
0,
|
| 566 |
+
0,
|
| 567 |
+
outer_width,
|
| 568 |
+
outer_height,
|
| 569 |
+
outer_width,
|
| 570 |
+
outer_height,
|
| 571 |
+
inner_width,
|
| 572 |
+
inner_height,
|
| 573 |
+
24,
|
| 574 |
+
24,
|
| 575 |
+
platform,
|
| 576 |
+
]
|
| 577 |
+
return "|".join(str(i) for i in value_list)
|
| 578 |
+
|
| 579 |
+
@staticmethod
|
| 580 |
+
def rc4_encrypt(plaintext, key):
|
| 581 |
+
s = list(range(256))
|
| 582 |
+
j = 0
|
| 583 |
+
|
| 584 |
+
for i in range(256):
|
| 585 |
+
j = (j + s[i] + ord(key[i % len(key)])) % 256
|
| 586 |
+
s[i], s[j] = s[j], s[i]
|
| 587 |
+
|
| 588 |
+
i = 0
|
| 589 |
+
j = 0
|
| 590 |
+
cipher = []
|
| 591 |
+
|
| 592 |
+
for k in range(len(plaintext)):
|
| 593 |
+
i = (i + 1) % 256
|
| 594 |
+
j = (j + s[i]) % 256
|
| 595 |
+
s[i], s[j] = s[j], s[i]
|
| 596 |
+
t = (s[i] + s[j]) % 256
|
| 597 |
+
cipher.append(chr(s[t] ^ ord(plaintext[k])))
|
| 598 |
+
|
| 599 |
+
return ''.join(cipher)
|
| 600 |
+
|
| 601 |
+
def get_value(self,
|
| 602 |
+
url_params: dict | str,
|
| 603 |
+
method="GET",
|
| 604 |
+
start_time=0,
|
| 605 |
+
end_time=0,
|
| 606 |
+
random_num_1=None,
|
| 607 |
+
random_num_2=None,
|
| 608 |
+
random_num_3=None,
|
| 609 |
+
) -> str:
|
| 610 |
+
string_1 = self.generate_string_1(
|
| 611 |
+
random_num_1,
|
| 612 |
+
random_num_2,
|
| 613 |
+
random_num_3,
|
| 614 |
+
)
|
| 615 |
+
string_2 = self.generate_string_2(urlencode(url_params) if isinstance(
|
| 616 |
+
url_params, dict) else url_params, method, start_time, end_time, )
|
| 617 |
+
string = string_1 + string_2
|
| 618 |
+
# return self.generate_result(
|
| 619 |
+
# string, "s4") + self.generate_result_end(string, "s4")
|
| 620 |
+
return self.generate_result(string, "s4")
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
if __name__ == "__main__":
    # Manual smoke test: sign a sample detail request and print the result.
    bogus = ABogus()
    USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    url_str = "https://www.douyin.com/aweme/v1/web/aweme/detail/?device_platform=webapp&aid=6383&channel=channel_pc_web&pc_client_type=1&version_code=190500&version_name=19.5.0&cookie_enabled=true&browser_language=zh-CN&browser_platform=Win32&browser_name=Firefox&browser_online=true&engine_name=Gecko&os_name=Windows&os_version=10&platform=PC&screen_width=1920&screen_height=1080&browser_version=124.0&engine_version=122.0.0.0&cpu_core_num=12&device_memory=8&aweme_id=7345492945006595379"
    # Turn the query string into a dict. partition() splits on the first "="
    # only, so a value that itself contains "=" no longer raises.
    query = url_str.partition("?")[2]
    url_params = {
        key: value
        for key, _, value in (param.partition("=") for param in query.split("&"))
    }
    print(f"URL参数: {url_params}")
    a_bogus = bogus.get_value(url_params, )
    # URL-encode a_bogus before it is placed in a request.
    a_bogus = quote(a_bogus, safe='')
    print(a_bogus)
    print(USERAGENT)
|
backend/app/downloaders/generic_downloader.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""通用 yt-dlp 下载器:用于用户在「下载配置」里登记的自定义平台。
|
| 2 |
+
|
| 3 |
+
不做任何站点特定逻辑——完全依赖 yt-dlp 内置 extractor。只把:
|
| 4 |
+
- 该平台的 Cookie/cookies-from-browser 注入 ydl_opts
|
| 5 |
+
- 全局代理注入 ydl_opts
|
| 6 |
+
"""
|
| 7 |
+
import logging
|
| 8 |
+
import os
|
| 9 |
+
import tempfile
|
| 10 |
+
from abc import ABC
|
| 11 |
+
from typing import Optional, Union
|
| 12 |
+
|
| 13 |
+
import yt_dlp
|
| 14 |
+
|
| 15 |
+
from app.downloaders.base import Downloader, DownloadQuality
|
| 16 |
+
from app.models.notes_model import AudioDownloadResult
|
| 17 |
+
from app.services.cookie_manager import CookieConfigManager
|
| 18 |
+
from app.services.proxy_config_manager import ProxyConfigManager
|
| 19 |
+
from app.utils.path_helper import get_data_dir
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class GenericYtdlpDownloader(Downloader, ABC):
    """Thin yt-dlp wrapper for user-registered custom platforms.

    No site-specific logic lives here; extraction is left to yt-dlp. The
    class only injects the platform's cookie settings (a cookie string or
    cookies-from-browser) and the global proxy into the yt-dlp options.
    """

    def __init__(self, platform: str, cookie_domain: Optional[str] = None):
        """Load the cookie configuration stored under ``platform``.

        Args:
            platform: Platform key used to look up cookie settings; also
                reported as ``platform`` in download results.
            cookie_domain: Domain written into the Netscape cookie file.
                Defaults to ``.<platform>.com``.
        """
        super().__init__()
        self.platform = platform
        # The Netscape cookie format needs a domain column; guess one from
        # the platform key when the caller does not supply it.
        self.cookie_domain = cookie_domain or f".{platform}.com"
        mgr = CookieConfigManager()
        self._cookie = mgr.get(platform)
        self._browser = mgr.get_browser(platform)
        # A configured browser takes precedence, so no cookie file is written.
        self._cookiefile = None if self._browser else self._write_netscape_cookie_file()

    def _write_netscape_cookie_file(self) -> Optional[str]:
        """Write ``self._cookie`` to a Netscape-format temp file.

        Returns:
            Path of the file, or ``None`` when no cookie string is configured.
            The file is created with ``delete=False`` and is not removed here.
        """
        if not self._cookie:
            return None
        lines = ["# Netscape HTTP Cookie File\n"]
        # Split on ";" and strip each pair so "a=1;b=2", "a=1; b=2" and a
        # trailing ";" are all handled.
        for pair in self._cookie.split(";"):
            name, sep, value = pair.strip().partition("=")
            if not sep:
                continue
            lines.append(f"{self.cookie_domain}\tTRUE\t/\tFALSE\t0\t{name}\t{value}\n")
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', delete=False, encoding='utf-8'
        ) as tmp:
            tmp.writelines(lines)
        logger.info("已生成 [%s] Netscape Cookie 文件: %s", self.platform, tmp.name)
        return tmp.name

    def _apply_ydl_extras(self, ydl_opts: dict) -> None:
        """Add the proxy and cookie options to ``ydl_opts`` in place."""
        proxy = ProxyConfigManager().get_proxy_url()
        if proxy:
            ydl_opts['proxy'] = proxy
        if self._browser:
            ydl_opts['cookiesfrombrowser'] = (self._browser,)
        elif self._cookiefile:
            ydl_opts['cookiefile'] = self._cookiefile

    def _resolve_output_dir(self, output_dir: Union[str, None]) -> str:
        """Pick the output directory (data dir, then cache dir) and create it."""
        if output_dir is None:
            output_dir = get_data_dir()
        if not output_dir:
            output_dir = self.cache_data
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: DownloadQuality = "fast",
        need_video: Optional[bool] = False,
        skip_download: bool = False,
    ) -> AudioDownloadResult:
        """Fetch metadata and, unless skipped, the best audio stream.

        ``quality`` and ``need_video`` are accepted but not used by this
        implementation.

        Args:
            video_url: URL handed to yt-dlp.
            output_dir: Target directory; ``None`` uses ``get_data_dir()``,
                falling back to ``self.cache_data``.
            skip_download: When true, only metadata is extracted.

        Returns:
            An ``AudioDownloadResult`` whose ``file_path`` is
            ``<output_dir>/<id>.<ext>`` as reported by yt-dlp, and whose
            ``video_path`` is always ``None``.
        """
        output_dir = self._resolve_output_dir(output_dir)

        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': output_path,
            'noplaylist': True,
            'quiet': False,
        }
        if skip_download:
            ydl_opts['skip_download'] = True
        self._apply_ydl_extras(ydl_opts)

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=not skip_download)

        video_id = info.get("id") or "unknown"
        title = info.get("title") or self.platform
        # The key may be present with a None value; report 0 in that case too.
        duration = info.get("duration") or 0
        cover_url = info.get("thumbnail")
        ext = info.get("ext", "mp3")
        audio_path = os.path.join(output_dir, f"{video_id}.{ext}")

        return AudioDownloadResult(
            file_path=audio_path,
            title=title,
            duration=duration,
            cover_url=cover_url,
            platform=self.platform,
            video_id=video_id,
            raw_info={'tags': info.get('tags')},
            video_path=None,
        )

    def download_video(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
    ) -> str:
        """Download the best video+audio, merged to MP4, and return its path.

        Args:
            video_url: URL handed to yt-dlp.
            output_dir: Target directory; resolved the same way as in
                :meth:`download`.

        Raises:
            FileNotFoundError: If ``<output_dir>/<id>.mp4`` does not exist
                after the download.
        """
        output_dir = self._resolve_output_dir(output_dir)
        output_path = os.path.join(output_dir, "%(id)s.%(ext)s")
        ydl_opts = {
            'format': 'bestvideo+bestaudio/best',
            'outtmpl': output_path,
            'noplaylist': True,
            'quiet': False,
            'merge_output_format': 'mp4',
        }
        self._apply_ydl_extras(ydl_opts)
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)

        video_id = info.get("id")
        video_path = os.path.join(output_dir, f"{video_id}.mp4")
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"视频文件未找到: {video_path}")
        return video_path
|
backend/app/downloaders/kuaishou_downloader.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
from abc import ABC
|
| 4 |
+
from typing import Union, Optional
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
|
| 8 |
+
from app.downloaders.base import Downloader
|
| 9 |
+
from app.downloaders.kuaishou_helper.kuaishou import KuaiShou
|
| 10 |
+
from app.enmus.note_enums import DownloadQuality
|
| 11 |
+
from app.models.audio_model import AudioDownloadResult
|
| 12 |
+
from app.utils.path_helper import get_data_dir
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class KuaiShouDownloader(Downloader, ABC):
    """Downloader for Kuaishou short videos.

    The share link is resolved through the ``KuaiShou`` helper, the MP4 is
    fetched over HTTP and its audio track is extracted to MP3 with ffmpeg.
    Files are named after the Kuaishou photo id; an existing MP3 is treated
    as a cache hit and skips both the download and the conversion.
    """

    # Seconds allowed for connecting to the CDN and for each read of the stream.
    _HTTP_TIMEOUT = 30

    def __init__(self):
        super().__init__()

    @staticmethod
    def _join_tag_names(video_raw_info: dict) -> str:
        """Return the tag names of the payload joined with commas ('' if none)."""
        # The GraphQL query requests ``tags`` inside ``visionVideoDetail``; the
        # original top-level lookup is kept as a fallback.
        detail = video_raw_info.get('visionVideoDetail') or {}
        tags = detail.get('tags') or video_raw_info.get('tags') or []
        return ','.join(tag['name'] for tag in tags if tag.get('name'))

    @staticmethod
    def _build_result(photo_info, title, video_id, mp3_path, mp4_path, tags) -> AudioDownloadResult:
        """Assemble the result object shared by the cached and the fresh path."""
        return AudioDownloadResult(
            file_path=mp3_path,
            title=title,
            duration=photo_info['duration'],
            cover_url=photo_info['coverUrl'],
            platform="kuaishou",
            video_id=video_id,
            raw_info={'tags': tags},
            video_path=mp4_path
        )

    def download(
            self,
            video_url: str,
            output_dir: Union[str, None] = None,
            quality: str = "fast",
            need_video: Optional[bool] = False
    ) -> AudioDownloadResult:
        """Download a Kuaishou video and extract its audio track.

        :param video_url: Share text or URL; the helper extracts the link from it.
        :param output_dir: Target directory. ``None`` uses ``get_data_dir()``,
            and a falsy result falls back to ``self.cache_data``.
        :param quality: Accepted for interface compatibility; not used here.
        :param need_video: Accepted for interface compatibility; the MP4 is
            always written because the MP3 is produced from it.
        :return: Metadata with ``file_path`` (MP3) and ``video_path`` (MP4).
        :raises Exception: If the video request does not return HTTP 200 or
            ffmpeg exits with a non-zero status.
        """
        if output_dir is None:
            output_dir = get_data_dir()
        if not output_dir:
            output_dir = self.cache_data
        os.makedirs(output_dir, exist_ok=True)

        video_raw_info = KuaiShou().run(video_url)
        photo_info = video_raw_info['visionVideoDetail']['photo']
        video_id = photo_info['id']
        mp4_path = os.path.join(output_dir, f"{video_id}.mp4")
        mp3_path = os.path.join(output_dir, f"{video_id}.mp3")
        tags = self._join_tag_names(video_raw_info)

        if os.path.exists(mp3_path):
            print(f"[已存在] 跳过下载: {mp3_path}")
            # NOTE(review): the cached path reports a sanitized, 50-character
            # title while the fresh path reports the raw caption. Kept as-is.
            title = photo_info['caption'].strip().replace('\n', '').replace(' ', '_')[:50]
            return self._build_result(photo_info, title, video_id, mp3_path, mp4_path, tags)

        # Download the MP4; the context manager releases the connection even
        # when writing the file fails part-way.
        with requests.get(photo_info['photoUrl'], stream=True, timeout=self._HTTP_TIMEOUT) as resp:
            if resp.status_code != 200:
                raise Exception(f"视频下载失败: {resp.status_code}")
            with open(mp4_path, "wb") as f:
                for chunk in resp.iter_content(1024 * 1024):
                    f.write(chunk)

        # Convert to MP3 with ffmpeg.
        try:
            subprocess.run([
                "ffmpeg", "-y", "-i", mp4_path, "-vn", "-acodec", "libmp3lame", mp3_path
            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError as exc:
            # A partial MP3 would be mistaken for a cache hit on the next call.
            if os.path.exists(mp3_path):
                os.remove(mp3_path)
            raise Exception("ffmpeg 转换 MP3 失败") from exc

        return self._build_result(
            photo_info, photo_info['caption'], video_id, mp3_path, mp4_path, tags
        )

    def download_video(
            self,
            video_url: str,
            output_dir: Union[str, None] = None,
    ) -> str:
        """Return the MP4 path for ``video_url``, downloading it if necessary."""
        # Call download() once: every call queries the Kuaishou API again.
        return self.download(video_url, output_dir).video_path
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
if __name__ == '__main__':
    # Manual smoke test: feed a full share text (short link plus caption)
    # through the downloader.
    sample_share_text = 'https://v.kuaishou.com/2vBqX74 王宝强携手刘昊然、岳云鹏上演精彩名场面 全程高能 看一遍笑一遍 "唐探1900 "快成长计划 ...更多'
    KuaiShouDownloader().download(sample_share_text)
|
backend/app/downloaders/kuaishou_helper/__init__.py
ADDED
|
File without changes
|
backend/app/downloaders/kuaishou_helper/kuaishou.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
from app.services.cookie_manager import CookieConfigManager
|
| 9 |
+
from app.utils.logger import get_logger
|
| 10 |
+
# Kuaishou web endpoints: the GraphQL API and the landing page that is
# requested to obtain anonymous cookies.
KUAISHOU_API_BASE = 'https://www.kuaishou.com/graphql'
KUAISHOU_URL = "https://www.kuaishou.com/"

# Runs before the cookie manager below is constructed.
load_dotenv()

# Browser-like request headers shared by every request. No Cookie header is
# set here; KuaiShou.run() adds one to its own copy of this dict.
headers = {
    # Content negotiation / caching
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # Same-origin markers expected from the web client
    'Origin': 'https://www.kuaishou.com',
    'Pragma': 'no-cache',
    'Referer': 'https://www.kuaishou.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    # Desktop Chrome 126 fingerprint
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'accept': '*/*',
    'content-type': 'application/json',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

logger = get_logger(__name__)

# Module-wide cookie configuration accessor used by KuaiShou.get_temp_cookies().
cfm = CookieConfigManager()
|
| 36 |
+
class KuaiShou:
    """Minimal client for Kuaishou's web GraphQL endpoint.

    Turns a share text or short link into the ``data`` object of the
    ``visionVideoDetail`` query.
    """

    # Same pattern as before, compiled once and written as a raw string so
    # that ``\(`` is no longer an invalid string escape.
    _URL_PATTERN = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # Captures the id that follows ``short-video/`` in the resolved URL.
    _PHOTO_ID_PATTERN = re.compile(r'short-video/(\w+)')
    # Seconds before an HTTP request is abandoned instead of hanging forever.
    _REQUEST_TIMEOUT = 15

    def __init__(self):
        # Per-instance copy so the Cookie header set in run() does not leak
        # into the shared module-level dict.
        self.header = headers.copy()
        self.cookie = None

    @staticmethod
    def _extract_kuaishou_link(text):
        """Return the first http(s) URL found in ``text``, or ``None``."""
        match = KuaiShou._URL_PATTERN.search(text or '')
        return match.group(0) if match else None

    def get_photo_id(self, url):
        """Follow redirects from ``url`` and return the photo id, or ``None``.

        ``None`` is returned when the final URL has no ``short-video/<id>`` part.
        """
        response = requests.get(
            url, allow_redirects=True, headers=self.header, timeout=self._REQUEST_TIMEOUT
        )
        match = self._PHOTO_ID_PATTERN.search(response.url)
        return match.group(1) if match else None

    def get_temp_cookies(self):
        """Return the configured Kuaishou cookie, or anonymous cookies from the site.

        The cookie value is deliberately not printed: it is a credential.
        """
        configured = cfm.get('kuaishou')
        if configured:
            return configured
        res = requests.get(
            url=KUAISHOU_URL, headers=self.header, allow_redirects=True,
            timeout=self._REQUEST_TIMEOUT,
        )
        return '; '.join(f"{k}={v}" for k, v in res.cookies.get_dict().items())

    def get_video_details(self, url, photo_id):
        """Query ``visionVideoDetail`` for ``photo_id``.

        :param url: Unused; kept so existing callers keep working.
        :param photo_id: Id as returned by ``get_photo_id``.
        :return: Decoded JSON response, or ``None`` when the status is not 200.
        """
        json_data = {
            'operationName': 'visionVideoDetail',
            "variables": {"photoId": photo_id, "page": "detail"},
            "query": "query visionVideoDetail($photoId: String, $type: String, $page: String, $webPageArea: String) {\n visionVideoDetail(photoId: $photoId, type: $type, page: $page, webPageArea: $webPageArea) {\n status\n type\n author {\n id\n name\n following\n headerUrl\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n photoUrl\n liked\n timestamp\n expTag\n llsid\n viewCount\n videoRatio\n stereoType\n croppedPhotoUrl\n manifest {\n mediaType\n businessType\n version\n adaptationSet {\n id\n duration\n representation {\n id\n defaultSelect\n backupUrl\n codecs\n url\n height\n width\n avgBitrate\n maxBitrate\n m3u8Slice\n qualityType\n qualityLabel\n frameRate\n featureP2sp\n hidden\n disableAdaptive\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n commentLimit {\n canAddComment\n __typename\n }\n llsid\n danmakuSwitch\n __typename\n }\n}\n"
        }
        response = requests.post(
            url=KUAISHOU_API_BASE, headers=self.header, json=json_data,
            timeout=self._REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            return None
        return response.json()

    def run(self, url):
        """Resolve a share text to the ``data`` object of the detail query.

        :param url: Share text containing a Kuaishou link.
        :return: The ``data`` object of the GraphQL response.
        :raises ValueError: If no link or no photo id can be extracted.
        :raises RuntimeError: If the detail query fails or returns no data.
        """
        real_url = self._extract_kuaishou_link(url)
        if not real_url:
            message = f"快手视频 URL 解析失败 {url}"
            logger.error(message)
            raise ValueError(message)

        cookies = self.get_temp_cookies()
        if not cookies:
            # Best effort: continue without cookies, as an empty string did before.
            logger.error(f"快手视频 cookies 解析失败 {url},请考虑设置环境变量 KUAISHOU_COOKIES")
        self.header['Cookie'] = (cookies or '').strip()

        photo_id = self.get_photo_id(real_url)
        if photo_id is None:
            message = f"快手视频 ID 解析失败 {url}"
            logger.error(message)
            raise ValueError(message)

        video_details = self.get_video_details(real_url, photo_id)
        data = video_details.get('data') if video_details else None
        if data is None:
            message = f"快手视频详情解析失败 {url}"
            logger.error(message)
            raise RuntimeError(message)
        return data
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
if __name__ == '__main__':
    # Manual smoke test: resolve a full share text (short link plus caption).
    sample_share_text = 'https://v.kuaishou.com/2vBqX74 王宝强携手刘昊然、岳云鹏上演精彩名场面 全程高能 看一遍笑一遍 "唐探1900 "快成长计划 ...更多'
    client = KuaiShou()
    client.run(sample_share_text)
|
backend/app/downloaders/local_downloader.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
from abc import ABC
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from app.downloaders.base import Downloader
|
| 7 |
+
from app.enmus.note_enums import DownloadQuality
|
| 8 |
+
from app.models.audio_model import AudioDownloadResult
|
| 9 |
+
import os
|
| 10 |
+
import subprocess
|
| 11 |
+
|
| 12 |
+
from app.utils.video_helper import save_cover_to_static
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class LocalDownloader(Downloader, ABC):
    """Adapter that treats a file already on disk as a "downloaded" video.

    Nothing is fetched over the network: the class resolves the local path,
    extracts an MP3 and a cover frame with ffmpeg and reports them in the
    common ``AudioDownloadResult`` shape.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _resolve_local_path(video_url: str) -> str:
        """Resolve ``video_url`` to an existing local path.

        Paths starting with ``/uploads`` are taken relative to the current
        working directory; anything else is used unchanged.

        :raises FileNotFoundError: If the resolved path does not exist.
        """
        if video_url.startswith('/uploads'):
            video_url = os.path.normpath(os.path.join(os.getcwd(), video_url.lstrip('/')))
        if not os.path.exists(video_url):
            raise FileNotFoundError(f"本地文件不存在: {video_url}")
        return video_url

    def extract_cover(self, input_path: str, output_dir: Optional[str] = None) -> str:
        """Grab one frame of a local video as a JPEG cover image.

        :param input_path: Path of the input video.
        :param output_dir: Output directory; defaults to the video's directory.
        :return: Path of the generated ``<name>_cover.jpg``.
        :raises FileNotFoundError: If ``input_path`` does not exist.
        :raises RuntimeError: If ffmpeg fails or produces no file.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"输入文件不存在: {input_path}")

        if output_dir is None:
            output_dir = os.path.dirname(input_path)
        if output_dir:
            # '' means "current directory", which os.makedirs rejects.
            os.makedirs(output_dir, exist_ok=True)

        base_name = os.path.splitext(os.path.basename(input_path))[0]
        output_path = os.path.join(output_dir, f"{base_name}_cover.jpg")

        command = [
            'ffmpeg',
            '-i', input_path,
            '-ss', '00:00:01',  # seek to second 1 to avoid a black first frame
            '-vframes', '1',    # capture a single frame
            '-q:v', '2',        # qscale 2: high JPEG quality
            '-y',               # overwrite an existing file
            output_path
        ]
        try:
            subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"提取封面失败: {output_path}") from e

        if not os.path.exists(output_path):
            raise RuntimeError(f"封面图片生成失败: {output_path}")
        return output_path

    def convert_to_mp3(self, input_path: str, output_path: str = None) -> str:
        """Convert a local media file to MP3.

        :param input_path: Path of the input file (e.g. ``.mp4``).
        :param output_path: Output path; defaults to the input path with an
            ``.mp3`` extension.
        :return: Path of the generated MP3 file.
        :raises FileNotFoundError: If ``input_path`` does not exist.
        :raises RuntimeError: If ffmpeg fails or produces no file.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"输入文件不存在: {input_path}")

        if output_path is None:
            base, _ = os.path.splitext(input_path)
            output_path = base + ".mp3"

        command = [
            'ffmpeg',
            '-i', input_path,
            '-vn',                    # drop the video stream
            '-acodec', 'libmp3lame',  # encode as MP3
            '-y',                     # overwrite an existing file
            output_path
        ]
        try:
            subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"mp3 文件生成失败: {output_path}") from e

        if not os.path.exists(output_path):
            raise RuntimeError(f"mp3 文件生成失败: {output_path}")
        return output_path

    def download_video(self, video_url: str, output_dir: str = None) -> str:
        """Resolve a local file reference and return the video path.

        :param video_url: Local path, or an ``/uploads/...`` URL path.
        :param output_dir: Unused; kept for interface compatibility.
        :raises FileNotFoundError: If the file does not exist.
        """
        return self._resolve_local_path(video_url)

    def download(
            self,
            video_url: str,
            output_dir: str = None,
            quality: DownloadQuality = "fast",
            need_video: Optional[bool] = False
    ) -> AudioDownloadResult:
        """Produce audio metadata for a local file.

        The MP3 and the cover frame are written next to the source file.

        :param video_url: Local path, or an ``/uploads/...`` URL path.
        :param output_dir: Unused; kept for interface compatibility.
        :param quality: Unused; kept for interface compatibility.
        :param need_video: Unused; kept for interface compatibility.
        :raises FileNotFoundError: If the file does not exist.
        :raises RuntimeError: If ffmpeg fails.
        """
        video_url = self._resolve_local_path(video_url)

        # The file name without its extension doubles as title and video id.
        title, _ = os.path.splitext(os.path.basename(video_url))
        file_path = self.convert_to_mp3(video_url)
        cover_url = save_cover_to_static(self.extract_cover(video_url))

        return AudioDownloadResult(
            file_path=file_path,
            title=title,
            duration=0,  # duration is not read yet
            cover_url=cover_url,
            platform="local",
            video_id=title,
            raw_info={
                'path': file_path
            },
            video_path=None
        )
|
backend/app/downloaders/xiaohongshu_downloader.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""小红书下载器:基于 yt-dlp 内置 XiaoHongShu extractor。
|
| 2 |
+
|
| 3 |
+
URL 模式:
|
| 4 |
+
- https://www.xiaohongshu.com/explore/{id}
|
| 5 |
+
- https://www.xiaohongshu.com/discovery/item/{id}
|
| 6 |
+
- 短链 xhslink.com/xxx 由 yt-dlp 自行跟随重定向
|
| 7 |
+
|
| 8 |
+
小红书很多内容是图文笔记(无视频/音频)。无视频的会触发 yt-dlp 报「请求格式不可用」,
|
| 9 |
+
前端会展示生成失败——这是预期行为,不强行兜底。
|
| 10 |
+
"""
|
| 11 |
+
import os
|
| 12 |
+
import logging
|
| 13 |
+
import tempfile
|
| 14 |
+
from abc import ABC
|
| 15 |
+
from typing import Union, Optional
|
| 16 |
+
|
| 17 |
+
import yt_dlp
|
| 18 |
+
|
| 19 |
+
from app.downloaders.base import Downloader, DownloadQuality
|
| 20 |
+
from app.models.notes_model import AudioDownloadResult
|
| 21 |
+
from app.services.cookie_manager import CookieConfigManager
|
| 22 |
+
from app.utils.path_helper import get_data_dir
|
| 23 |
+
from app.utils.url_parser import extract_video_id, clean_url
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class XiaohongshuDownloader(Downloader, ABC):
    """Xiaohongshu downloader built on yt-dlp.

    Cookies come either live from a configured browser (preferred) or from a
    pasted cookie string that is converted into a Netscape cookie file.
    """

    def __init__(self):
        super().__init__()
        self._cookie_mgr = CookieConfigManager()
        self._cookie = self._cookie_mgr.get('xiaohongshu')
        self._browser = self._cookie_mgr.get_browser('xiaohongshu')
        # A configured browser wins; the cookie file is only written otherwise.
        self._cookiefile = None if self._browser else self._write_netscape_cookie_file()

    def _write_netscape_cookie_file(self) -> Optional[str]:
        """Write the configured cookie string to a Netscape-format temp file.

        :return: Path of the temp file, or ``None`` when no cookie is configured.
        """
        if not self._cookie:
            logger.warning("小红书 Cookie 未配置,部分内容可能下载失败")
            return None
        lines = ["# Netscape HTTP Cookie File\n"]
        # Split on ';' and strip so that "a=b;c=d", extra spaces and a trailing
        # ';' are all parsed correctly.
        for pair in self._cookie.split(";"):
            key, sep, value = pair.strip().partition("=")
            if sep and key:
                lines.append(f".xiaohongshu.com\tTRUE\t/\tFALSE\t0\t{key}\t{value}\n")
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as tmp:
            tmp.writelines(lines)
        logger.info("已生成小红书 Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1)
        return tmp.name

    def _apply_cookie(self, ydl_opts: dict) -> None:
        """Add the cookie source (browser or cookie file) to ``ydl_opts`` in place."""
        if self._browser:
            ydl_opts['cookiesfrombrowser'] = (self._browser,)
            logger.info(f"小红书使用 cookies-from-browser: {self._browser}")
        elif self._cookiefile:
            ydl_opts['cookiefile'] = self._cookiefile

    def _resolve_output_dir(self, output_dir: Union[str, None]) -> str:
        """Pick the output directory (argument, data dir, then cache) and create it."""
        if output_dir is None:
            output_dir = get_data_dir()
        if not output_dir:
            output_dir = self.cache_data
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: DownloadQuality = "fast",
        need_video: Optional[bool] = False,
        skip_download: bool = False,
    ) -> AudioDownloadResult:
        """Download the audio of a Xiaohongshu note, or only its metadata.

        :param video_url: Note URL or a pasted share text containing it.
        :param output_dir: Target directory; see ``_resolve_output_dir``.
        :param quality: Unused; kept for interface compatibility.
        :param need_video: Unused; kept for interface compatibility.
        :param skip_download: When true only metadata is extracted.
        :return: Metadata; ``file_path`` is the expected audio path.
        """
        # Extract a clean link from the share text (title, invisible
        # characters and short link pasted as one blob).
        video_url = clean_url(video_url)
        output_dir = self._resolve_output_dir(output_dir)

        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': os.path.join(output_dir, "%(id)s.%(ext)s"),
            'noplaylist': True,
            'quiet': False,
        }
        if skip_download:
            ydl_opts['skip_download'] = True
        self._apply_cookie(ydl_opts)

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=not skip_download)
            video_id = info.get("id")
            ext = info.get("ext", "mp3")
            audio_path = os.path.join(output_dir, f"{video_id}.{ext}")

        return AudioDownloadResult(
            file_path=audio_path,
            title=info.get("title"),
            duration=info.get("duration", 0),
            cover_url=info.get("thumbnail"),
            platform="xiaohongshu",
            video_id=video_id,
            raw_info={'tags': info.get('tags')},
            video_path=None,
        )

    def download_video(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
    ) -> str:
        """Download the note's video as MP4 and return its path.

        An existing ``<id>.mp4`` in the output directory is reused.

        :raises FileNotFoundError: If yt-dlp finishes without producing the MP4.
        """
        video_url = clean_url(video_url)
        # Same directory fallback as download(), so a missing data dir cannot
        # reach os.path.join as None.
        output_dir = self._resolve_output_dir(output_dir)

        # Only probe the cache when the id could be parsed; otherwise the
        # probe would look for "None.mp4".
        cached_id = extract_video_id(video_url, "xiaohongshu")
        if cached_id:
            cached_path = os.path.join(output_dir, f"{cached_id}.mp4")
            if os.path.exists(cached_path):
                return cached_path

        ydl_opts = {
            'format': 'bestvideo+bestaudio/best',
            'outtmpl': os.path.join(output_dir, "%(id)s.%(ext)s"),
            'noplaylist': True,
            'quiet': False,
            'merge_output_format': 'mp4',
        }
        self._apply_cookie(ydl_opts)
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)
            video_id = info.get("id")
            video_path = os.path.join(output_dir, f"{video_id}.mp4")
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"视频文件未找到: {video_path}")
        return video_path
|
backend/app/downloaders/xiaoyuzhoufm_download.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Union, Optional
|
| 2 |
+
|
| 3 |
+
import requests
|
| 4 |
+
|
| 5 |
+
from app.downloaders.base import Downloader
|
| 6 |
+
from app.enmus.note_enums import DownloadQuality
|
| 7 |
+
from app.models.audio_model import AudioDownloadResult
|
| 8 |
+
|
| 9 |
+
# Sample Xiaoyuzhou podcast data endpoint used while exploring the API.
# NOTE(review): the path segment after ``_next/data/`` is presumably a
# build-specific id that goes stale when the site is redeployed; confirm.
url = 'https://www.xiaoyuzhoufm.com/_next/data/5Pvt_oGntgdyBD_XgwBaB/podcast/62382c1103bea1ebfffa1c00.json?id=62382c1103bea1ebfffa1c00'
header = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36'
}


if __name__ == '__main__':
    # Ad-hoc probe. It used to run at import time, so merely importing this
    # module performed a network request and failed whenever the request or
    # the JSON decoding failed. It now only runs when executed as a script.
    response = requests.get(url, headers=header, timeout=10)
    print(response.json())
|
| 16 |
+
|
| 17 |
+
class Xiaoyuzhoufm_download(Downloader):
    """Placeholder downloader for Xiaoyuzhou FM podcasts (not implemented yet)."""

    def download(
        self,
        video_url: str,
        output_dir: Union[str, None] = None,
        quality: DownloadQuality = "fast",
        need_video: Optional[bool] = False
    ) -> AudioDownloadResult:
        """Download a podcast episode.

        :raises NotImplementedError: Always. The previous stub returned
            ``None`` despite the declared return type, which only surfaced
            later as an attribute error in the caller.
        """
        raise NotImplementedError("Xiaoyuzhoufm_download.download is not implemented yet")
|
backend/app/downloaders/youtube_downloader.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import tempfile
|
| 4 |
+
from abc import ABC
|
| 5 |
+
from typing import Union, Optional, List
|
| 6 |
+
|
| 7 |
+
import yt_dlp
|
| 8 |
+
|
| 9 |
+
from app.downloaders.base import Downloader, DownloadQuality
|
| 10 |
+
from app.downloaders.youtube_subtitle import YouTubeSubtitleFetcher
|
| 11 |
+
from app.models.notes_model import AudioDownloadResult
|
| 12 |
+
from app.models.transcriber_model import TranscriptResult
|
| 13 |
+
from app.services.cookie_manager import CookieConfigManager
|
| 14 |
+
from app.services.proxy_config_manager import ProxyConfigManager
|
| 15 |
+
from app.utils.path_helper import get_data_dir
|
| 16 |
+
from app.utils.url_parser import extract_video_id
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _mask_proxy_credentials(proxy_url: str) -> str:
    """Return ``proxy_url`` with any ``user:password@`` part replaced by ``***@``."""
    from urllib.parse import urlsplit, urlunsplit

    try:
        parts = urlsplit(proxy_url)
    except ValueError:
        return "<unparseable proxy url>"
    if parts.username is None and parts.password is None:
        return proxy_url
    host = parts.netloc.rpartition('@')[2]
    return urlunsplit(parts._replace(netloc=f"***@{host}"))


def _apply_proxy(ydl_opts: dict) -> dict:
    """Add the globally configured proxy, if any, to the yt-dlp options.

    YouTube needs a proxy from mainland China, so a configured global proxy
    is passed to yt-dlp.

    :param ydl_opts: yt-dlp options; modified in place.
    :return: The same ``ydl_opts`` object.
    """
    proxy = ProxyConfigManager().get_proxy_url()
    if proxy:
        ydl_opts['proxy'] = proxy
        # Proxy URLs may embed credentials; keep them out of the log.
        logger.info("yt-dlp 走代理: %s", _mask_proxy_credentials(proxy))
    return ydl_opts
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _apply_youtube_extractor_args(ydl_opts: dict) -> dict:
    """Optionally pin YouTube's ``player_client`` list from the environment.

    By default nothing is overridden and yt-dlp's built-in client selection
    is used. An earlier version hard-coded ['tv', 'web_safari'] to dodge the
    SSAP experiment (issue #12482), but YouTube later ran an "all DRM"
    experiment on the tv client (issue #12563): affected sessions report
    "This video is DRM protected" for every video. The web clients need a JS
    runtime (deno) to solve the n challenge; once that is installed, yt-dlp's
    default client list works. Hard-coded client lists keep breaking as
    YouTube's risk controls change, so following yt-dlp upgrades is preferred.

    For a temporary override set the environment variable YT_PLAYER_CLIENT
    (comma separated), e.g. ``YT_PLAYER_CLIENT=web_safari,android_vr``.

    :param ydl_opts: yt-dlp options; modified in place.
    :return: The same ``ydl_opts`` object.
    """
    raw = os.getenv('YT_PLAYER_CLIENT', '')
    clients = [name.strip() for name in raw.split(',') if name.strip()]
    # Only override when at least one real name remains: a value such as ","
    # must not install an empty client list.
    if clients:
        extractor_args = ydl_opts.setdefault('extractor_args', {})
        extractor_args.setdefault('youtube', {})['player_client'] = clients
    return ydl_opts
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class YoutubeDownloader(Downloader, ABC):
|
| 54 |
+
def __init__(self):
    """Load the YouTube cookie settings and decide how yt-dlp receives them."""
    super().__init__()
    manager = CookieConfigManager()
    self._cookie_mgr = manager
    self._cookie = manager.get('youtube')
    # Precedence: live browser cookies beat a pasted cookie string. With a
    # browser configured, yt-dlp's `cookiesfrombrowser` is used, which avoids
    # YouTube's session-rotation risk control.
    self._browser = manager.get_browser('youtube')
    if self._browser:
        self._cookiefile = None
    else:
        self._cookiefile = self._write_netscape_cookie_file()
|
| 63 |
+
|
| 64 |
+
def _write_netscape_cookie_file(self) -> Optional[str]:
    """Write the YouTube cookie string to a Netscape-format temp file for yt-dlp.

    Returns ``None`` when no cookie is configured; without cookies YouTube
    now mostly answers "Sign in to confirm you're not a bot".

    :return: Path of the temp file, or ``None``.
    """
    if not self._cookie:
        logger.warning("YouTube Cookie 未配置,下载可能会被风控为机器人")
        return None
    lines = ["# Netscape HTTP Cookie File\n"]
    # Split on ';' and strip so that "a=b;c=d", extra spaces and a trailing
    # ';' are all parsed correctly.
    for pair in self._cookie.split(";"):
        key, sep, value = pair.strip().partition("=")
        if sep and key:
            lines.append(f".youtube.com\tTRUE\t/\tFALSE\t0\t{key}\t{value}\n")
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as tmp:
        tmp.writelines(lines)
    logger.info("已生成 YouTube Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1)
    return tmp.name
|
| 82 |
+
|
| 83 |
+
def download(
    self,
    video_url: str,
    output_dir: Union[str, None] = None,
    quality: DownloadQuality = "fast",
    need_video: Optional[bool] = False,
    skip_download: bool = False,
) -> AudioDownloadResult:
    """Download a YouTube video's audio, or only its metadata.

    :param video_url: YouTube video URL.
    :param output_dir: Target directory. ``None`` uses ``get_data_dir()``,
        and a falsy result falls back to ``self.cache_data``.
    :param quality: Not used by this method.
    :param need_video: Not used by this method.
    :param skip_download: When true only metadata is extracted; a yt-dlp
        failure then falls back to ``_fallback_metadata`` instead of raising.
    :return: Metadata; ``file_path`` is the expected audio path.
    """
    target_dir = get_data_dir() if output_dir is None else output_dir
    target_dir = target_dir or self.cache_data
    os.makedirs(target_dir, exist_ok=True)

    options = {
        'format': 'bestaudio[ext=m4a]/bestaudio/best',
        'outtmpl': os.path.join(target_dir, "%(id)s.%(ext)s"),
        'noplaylist': True,
        'quiet': False,
    }
    if skip_download:
        options['skip_download'] = True

    _apply_proxy(options)
    _apply_youtube_extractor_args(options)
    if self._browser:
        # The (browser_name,) form is enough; profile/keyring/container keep
        # their defaults.
        options['cookiesfrombrowser'] = (self._browser,)
        logger.info(f"YouTube 使用 cookies-from-browser: {self._browser}")
    elif self._cookiefile:
        options['cookiefile'] = self._cookiefile

    try:
        with yt_dlp.YoutubeDL(options) as ydl:
            info = ydl.extract_info(video_url, download=not skip_download)
            video_id = info.get("id")
            extension = info.get("ext", "m4a")
            result = AudioDownloadResult(
                file_path=os.path.join(target_dir, f"{video_id}.{extension}"),
                title=info.get("title"),
                duration=info.get("duration", 0),
                cover_url=info.get("thumbnail"),
                platform="youtube",
                video_id=video_id,
                raw_info={'tags': info.get('tags')},
                video_path=None,
            )
        return result
    except Exception as exc:
        # yt-dlp cannot fetch under DRM, anti-bot measures, unavailable
        # formats and the like. When only metadata is needed (the subtitle
        # path, skip_download=True), fall back to YouTube oEmbed for title and
        # cover so summarisation can continue. When media is needed, re-raise.
        if skip_download:
            logger.warning(f"yt-dlp 获取元数据失败,回退 oEmbed: {exc}")
            return self._fallback_metadata(video_url)
        raise
|
| 146 |
+
|
| 147 |
+
def _fallback_metadata(self, video_url: str) -> AudioDownloadResult:
    """Fallback when yt-dlp fails: basic metadata from YouTube's public oEmbed API.

    oEmbed only yields title, thumbnail and author; duration and tags are not
    available and are left empty. DRM and bot blocking do not affect oEmbed.
    """
    import requests

    video_id = extract_video_id(video_url, "youtube") or ""
    # Defaults that survive when the oEmbed request itself fails.
    title = video_id if video_id else "YouTube 视频"
    cover = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg" if video_id else ""

    try:
        proxy = ProxyConfigManager().get_proxy_url()
        proxies = {"http": proxy, "https": proxy} if proxy else None
        resp = requests.get(
            "https://www.youtube.com/oembed",
            params={"url": video_url, "format": "json"},
            proxies=proxies,
            timeout=10,
        )
        resp.raise_for_status()
        payload = resp.json()
        title = payload.get("title") or title
        cover = payload.get("thumbnail_url") or cover
        logger.info(f"oEmbed 兜底成功:title={title}")
    except Exception as e:
        logger.warning(f"oEmbed 兜底也失败,使用最小元数据:{e}")

    return AudioDownloadResult(
        file_path="",            # no media file was downloaded
        title=title,
        duration=0,              # oEmbed does not return the duration
        cover_url=cover,
        platform="youtube",
        video_id=video_id,
        raw_info={"tags": []},   # oEmbed does not return tags
        video_path=None,
    )
|
| 189 |
+
|
| 190 |
+
def download_video(
    self,
    video_url: str,
    output_dir: Union[str, None] = None,
) -> str:
    """Download the video as mp4 and return the path of the file.

    An already existing ``<video_id>.mp4`` in the output directory is
    reused without downloading again.

    :param video_url: URL of the video to download.
    :param output_dir: Target directory; ``get_data_dir()`` is used when None.
    :return: Path of the mp4 file.
    :raises FileNotFoundError: If the expected mp4 is missing after yt-dlp ran.
    """
    if output_dir is None:
        output_dir = get_data_dir()

    # Reuse a previous download when the id parsed from the URL matches a file.
    cached_id = extract_video_id(video_url, "youtube")
    cached_path = os.path.join(output_dir, f"{cached_id}.mp4")
    if os.path.exists(cached_path):
        return cached_path

    os.makedirs(output_dir, exist_ok=True)

    # The video is only used for screenshot grids / frame extraction, so 720p
    # is enough. Without a cap, bestvideo picks 4K AV1 (often 300MB+, slow to
    # download and to decode with ffmpeg). avc1 is preferred because it
    # decodes much faster than av01; other mp4 variants are the fallback.
    format_selector = (
        'bestvideo[height<=720][vcodec^=avc1]+bestaudio[ext=m4a]'
        '/bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]'
        '/best[height<=720][ext=mp4]/best[ext=mp4]'
    )
    ydl_opts = {
        'format': format_selector,
        'outtmpl': os.path.join(output_dir, "%(id)s.%(ext)s"),
        'noplaylist': True,
        'quiet': False,
        'merge_output_format': 'mp4',  # make sure the merged result is mp4
    }

    _apply_proxy(ydl_opts)
    _apply_youtube_extractor_args(ydl_opts)

    # Browser cookies take precedence over a cookie file.
    if self._browser:
        # A (browser_name,) tuple is enough; profile/keyring/container stay default.
        ydl_opts['cookiesfrombrowser'] = (self._browser,)
        logger.info(f"YouTube 使用 cookies-from-browser: {self._browser}")
    elif self._cookiefile:
        ydl_opts['cookiefile'] = self._cookiefile

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=True)
        # Trust the id reported by yt-dlp, since outtmpl names the file by it.
        downloaded_id = info.get("id")
        video_path = os.path.join(output_dir, f"{downloaded_id}.mp4")

    if os.path.exists(video_path):
        return video_path
    raise FileNotFoundError(f"视频文件未找到: {video_path}")
|
| 239 |
+
|
| 240 |
+
def download_subtitles(self, video_url: str, output_dir: Optional[str] = None,
                       langs: Optional[List[str]] = None) -> Optional[TranscriptResult]:
    """Fetch subtitles for a YouTube video without downloading any media.

    Delegates to ``YouTubeSubtitleFetcher.fetch_subtitles`` with the video
    id parsed from ``video_url``.

    :param video_url: URL of the video.
    :param output_dir: Unused; kept only for interface compatibility.
    :param langs: Preferred language codes in priority order. Defaults to
        Chinese variants, then English, then Japanese.
    :return: TranscriptResult, or None when no subtitles could be obtained.
    """
    if langs is None:
        langs = ['zh-Hans', 'zh', 'zh-CN', 'zh-TW', 'en', 'en-US', 'ja']

    video_id = extract_video_id(video_url, "youtube")
    if not video_id:
        # Without an id there is nothing to look up; skip building the
        # fetcher and the doomed request.
        logger.warning(f"无法从链接解析 YouTube video_id,跳过字幕获取: {video_url}")
        return None

    # Use the module logger (not print) so this line follows the same
    # handlers/levels as the other methods of this downloader.
    logger.info(f"尝试获取字幕,video_id={video_id}, langs={langs}")
    return YouTubeSubtitleFetcher().fetch_subtitles(video_id, langs)
|
backend/app/downloaders/youtube_subtitle.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
通过 youtube-transcript-api 获取 YouTube 字幕。
|
| 3 |
+
优先人工字幕,其次自动生成字幕。不依赖 yt_dlp,无需下载任何文件。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Optional, List
|
| 7 |
+
|
| 8 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 9 |
+
|
| 10 |
+
from app.models.transcriber_model import TranscriptResult, TranscriptSegment
|
| 11 |
+
from app.services.proxy_config_manager import ProxyConfigManager
|
| 12 |
+
from app.utils.logger import get_logger
|
| 13 |
+
|
| 14 |
+
logger = get_logger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class YouTubeSubtitleFetcher:
    """Fetch YouTube subtitles through youtube-transcript-api."""

    def __init__(self):
        # When a global proxy is configured, give youtube-transcript-api a
        # requests.Session that carries it; otherwise use the default
        # proxy-less client.
        proxy = ProxyConfigManager().get_proxy_url()
        if proxy:
            try:
                import requests
                session = requests.Session()
                session.proxies = {"http": proxy, "https": proxy}
                self._api = YouTubeTranscriptApi(http_client=session)
                # NOTE(review): if the proxy URL can embed credentials they
                # end up in the log here — confirm and mask if needed.
                logger.info(f"YouTube 字幕走代理: {proxy}")
            except Exception as e:
                logger.warning(f"为 youtube-transcript-api 注入代理失败,回退无代理: {e}")
                self._api = YouTubeTranscriptApi()
        else:
            self._api = YouTubeTranscriptApi()

    @staticmethod
    def _snippet_fields(snippet):
        """Return ``(text, start, duration)`` for one transcript snippet.

        Handles both shapes a snippet can take: a dict with
        ``text``/``start``/``duration`` keys, and an object exposing the same
        names as attributes (what ``Transcript.fetch()`` yields in
        youtube-transcript-api 1.x). Anything else is stringified with zero
        timing, matching the previous fallback.
        """
        if isinstance(snippet, dict):
            text = snippet.get("text", "")
            start = snippet.get("start", 0)
            duration = snippet.get("duration", 0)
        else:
            text = getattr(snippet, "text", None)
            if text is None:
                text = str(snippet)
            start = getattr(snippet, "start", 0)
            duration = getattr(snippet, "duration", 0)
        return str(text or "").strip(), float(start or 0), float(duration or 0)

    @staticmethod
    def _pick_transcript(transcript_list, langs):
        """Choose a track: manual in ``langs``, then generated, then the first one."""
        try:
            transcript = transcript_list.find_manually_created_transcript(langs)
            logger.info(f"选中人工字幕: {transcript.language_code} ({transcript.language})")
            return transcript
        except Exception:
            pass

        try:
            transcript = transcript_list.find_generated_transcript(langs)
            logger.info(f"选中自动字幕: {transcript.language_code} ({transcript.language})")
            return transcript
        except Exception:
            pass

        # No preferred language matched: fall back to the first available track.
        first = next(iter(transcript_list), None)
        if first is not None:
            source = "auto" if first.is_generated else "manual"
            logger.info(f"使用首个可用字幕: {first.language_code} ({source})")
        return first

    def fetch_subtitles(
        self,
        video_id: str,
        langs: Optional[List[str]] = None,
    ) -> "Optional[TranscriptResult]":
        """Fetch and normalise the subtitles of one video.

        :param video_id: YouTube video id.
        :param langs: Preferred language codes in priority order.
        :return: TranscriptResult, or None when no track exists, the track is
            empty, or any error occurs (errors are logged, not raised).
        """
        if langs is None:
            langs = ["zh-Hans", "zh", "zh-CN", "zh-TW", "en", "en-US", "ja"]

        try:
            # 1. List every available subtitle track.
            transcript_list = self._api.list(video_id)
            available = [
                f"{t.language_code}({'auto' if t.is_generated else 'manual'})"
                for t in transcript_list
            ]
            logger.info(f"可用字幕轨道: {', '.join(available)}")

            # 2. Pick a track by priority.
            transcript = self._pick_transcript(transcript_list, langs)
            if not transcript:
                logger.info(f"YouTube 视频 {video_id} 没有任何可用字幕")
                return None

            # 3. Fetch the content and convert it into segments.
            segments = []
            for snippet in transcript.fetch():
                text, start, duration = self._snippet_fields(snippet)
                if not text:
                    continue
                segments.append(TranscriptSegment(
                    start=start,
                    end=start + duration,
                    text=text,
                ))

            if not segments:
                logger.warning(f"YouTube 字幕内容为空: {video_id}")
                return None

            full_text = " ".join(seg.text for seg in segments)
            logger.info(f"成功获取 YouTube 字幕,共 {len(segments)} 段")

            return TranscriptResult(
                language=transcript.language_code,
                full_text=full_text,
                segments=segments,
                raw={
                    "source": "youtube_transcript_api",
                    "language": transcript.language,
                    "language_code": transcript.language_code,
                    "is_generated": transcript.is_generated,
                },
            )

        except Exception as e:
            logger.warning(f"YouTube 字幕获取失败: {e}")
            return None
|
backend/app/enmus/exception.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import enum
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class ProviderErrorEnum(enum.Enum):
    """Provider-related errors; each member's value is a ``(code, message)`` pair."""

    def __init__(self, code, message):
        # Enum unpacks each member's tuple value into these arguments, so
        # the parts are also reachable as ``member.code`` / ``member.message``.
        self.code, self.message = code, message

    CONNECTION_TEST_FAILED = (200101, "供应商连接测试失败")
    SAVE_FAILED = (200102, "供应商保存失败")
    CREATE_FAILED = (200103, "供应商创建失败")
    NOT_FOUND = (200104, "供应商不存在/未保存")
    WRONG_PARAMETER = (200105, "API / API 地址不正确")
    UNKNOW_ERROR = (200106, "未知错误")
|
| 15 |
+
|
| 16 |
+
class NoteErrorEnum(enum.Enum):
    """Note-related errors; each member's value is a ``(code, message)`` pair."""

    def __init__(self, code, message):
        # Enum unpacks the member's tuple value into these arguments.
        self.code, self.message = code, message

    PLATFORM_NOT_SUPPORTED = (300101, "选择的平台不受支持")
|
backend/app/enmus/note_enums.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import enum
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class DownloadQuality(str, enum.Enum):
    """Download quality presets.

    Mixing in ``str`` makes every member compare equal to its plain string
    value.
    """

    fast = "fast"
    medium = "medium"
    slow = "slow"
|