Upload 8 files
Browse files- Dockerfile +18 -0
- accounts.json +9 -0
- app.py +198 -0
- data/output.xlsx +0 -0
- docker-compose.yml +12 -0
- history.json +1 -0
- requirements.txt +4 -0
- templates/index.html +61 -0
Dockerfile
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
|
| 7 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 8 |
+
tzdata && \
|
| 9 |
+
pip install --no-cache-dir -r requirements.txt && \
|
| 10 |
+
apt-get clean && rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
ENV TZ=Asia/Shanghai
|
| 13 |
+
COPY . .
|
| 14 |
+
|
| 15 |
+
ENV FLASK_RUN_HOST=0.0.0.0
|
| 16 |
+
EXPOSE 5000
|
| 17 |
+
|
| 18 |
+
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app", "--workers", "1", "--timeout", "180"]
|
accounts.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tiktok": [
|
| 3 |
+
"https://www.tiktok.com/@bayashi.tiktok"
|
| 4 |
+
],
|
| 5 |
+
"youtube": [
|
| 6 |
+
"https://www.youtube.com/@AdrenalineRushReviews/shorts"
|
| 7 |
+
],
|
| 8 |
+
"schedule_hours": 6
|
| 9 |
+
}
|
app.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, render_template, request, redirect, url_for, send_file, flash
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from yt_dlp import YoutubeDL
|
| 7 |
+
|
| 8 |
+
app = Flask(__name__)
# Key used by Flask to sign the session cookie that carries flash() messages.
# Prefer the SECRET_KEY environment variable; the literal fallback only keeps
# existing deployments working and must not be considered secret.
app.secret_key = os.environ.get("SECRET_KEY", "change_this_to_something_random")

HISTORY_FILE = "history.json"
OUTPUT_FILE = "output.xlsx"
MAX_NEW = 30  # collect at most the newest 30 videos per run

# Make sure the history file exists so it can be opened for reading later on.
if not os.path.exists(HISTORY_FILE):
    with open(HISTORY_FILE, "w", encoding="utf-8") as _history_fh:
        json.dump({}, _history_fh)

# Optional periodic scraping, enabled with RUN_SCHEDULER=true.
# NOTE(review): nothing visible in this module imports BackgroundScheduler or
# defines run_scheduled_scrape / get_schedule_hours before this point, so
# starting the scheduler unconditionally raises NameError at import time and
# takes the whole web UI down. Until the scheduler is wired up, log a warning
# and keep serving the manual scrape page instead of crashing.
if os.environ.get("RUN_SCHEDULER", "false") == "true":
    _scheduler_names = (
        "BackgroundScheduler",
        "run_scheduled_scrape",
        "get_schedule_hours",
    )
    _missing_names = [name for name in _scheduler_names if name not in globals()]
    if _missing_names:
        app.logger.warning(
            "RUN_SCHEDULER=true but %s not available; scheduler not started",
            ", ".join(_missing_names),
        )
    else:
        scheduler = BackgroundScheduler()
        scheduler.add_job(run_scheduled_scrape, 'interval', hours=get_schedule_hours())
        scheduler.start()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_history():
    """Load the scrape history stored in HISTORY_FILE.

    Returns:
        dict: the decoded JSON object. An empty dict is returned when the
        file does not exist, does not contain valid JSON (for example it is
        empty or was truncated), or holds something other than a JSON object.
    """
    try:
        with open(HISTORY_FILE, "r", encoding="utf-8") as f:
            history = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been recorded yet.
        return {}
    except json.JSONDecodeError:
        # An unreadable history must not make every request fail; start over.
        return {}
    return history if isinstance(history, dict) else {}
|
| 25 |
+
|
| 26 |
+
def save_history(history):
    """Write *history* to HISTORY_FILE as indented UTF-8 JSON.

    The data is first written to a sibling temporary file and then moved over
    HISTORY_FILE with os.replace, so a crash or a serialization error in the
    middle of the write cannot leave a truncated history file behind.

    Args:
        history: JSON-serializable object to persist.

    Raises:
        TypeError: if *history* contains values json cannot serialize.
        OSError: if the file cannot be written or replaced.
    """
    tmp_path = f"{HISTORY_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(history, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, HISTORY_FILE)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the leftover of a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
|
| 29 |
+
|
| 30 |
+
def append_to_excel(rows):
    """Write newly scraped records to two Excel files.

    1. ``output_YYYYMMDD_HHMMSS.xlsx`` - a standalone file holding only this
       batch.
    2. ``OUTPUT_FILE`` - the cumulative file; the batch is appended to whatever
       it already contains, or the file is created when it does not exist.

    Args:
        rows: sequence of dicts, one per record; passed to ``pd.DataFrame``.

    Returns:
        The name of the per-batch file, or ``None`` when *rows* is empty (in
        which case nothing is written).
    """
    if rows is None or len(rows) == 0:
        # Nothing to record: do not create an empty batch workbook.
        return None

    df = pd.DataFrame(rows)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    time_file = f"output_{timestamp}.xlsx"

    # Standalone file for the current batch.
    df.to_excel(time_file, index=False)

    # Cumulative file: existing rows first, then the new batch.
    if os.path.exists(OUTPUT_FILE):
        combined = pd.concat([pd.read_excel(OUTPUT_FILE), df], ignore_index=True)
    else:
        combined = df
    combined.to_excel(OUTPUT_FILE, index=False)

    return time_file
|
| 52 |
+
|
| 53 |
+
def _iter_video_entries(info):
    """Yield the video-level dicts contained in an extract_info() result.

    A result without a non-empty "entries" value is yielded as-is (a single
    video). Falsy entries are skipped. An entry that itself carries an
    "entries" value is a nested playlist rather than a video, so only its
    children are yielded.
    """
    children = info.get("entries")
    if not children:
        yield info
        return
    for child in children:
        if not child:
            continue
        nested = child.get("entries")
        if nested is None:
            yield child
            continue
        for grandchild in nested:
            if grandchild:
                yield from _iter_video_entries(grandchild)


def _entry_date(entry):
    """Return the entry's date as 'YYYY-MM-DD', or '' when it has none.

    "timestamp" (interpreted as UTC) wins over "upload_date" (YYYYMMDD). An
    unusable timestamp falls back to upload_date instead of raising; an
    upload_date in an unexpected format is returned unchanged as a string.
    """
    timestamp = entry.get("timestamp")
    if timestamp:
        try:
            return datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d")
        except (OverflowError, OSError, ValueError, TypeError):
            pass  # out-of-range or non-numeric value: try upload_date below
    upload_date = entry.get("upload_date")
    if not upload_date:
        return ""
    try:
        return datetime.strptime(str(upload_date), "%Y%m%d").strftime("%Y-%m-%d")
    except ValueError:
        return str(upload_date)


def _first_not_none(entry, primary, fallback):
    """Return entry[primary] unless it is missing/None, else entry.get(fallback)."""
    value = entry.get(primary)
    return value if value is not None else entry.get(fallback)


def _normalize_entry(entry):
    """Reduce one raw entry dict to the id/title/url/date/views/likes fields."""
    return {
        "id": entry.get("id") or entry.get("display_id") or entry.get("url"),
        "title": entry.get("title", ""),
        "url": entry.get("webpage_url") or entry.get("url"),
        "date": _entry_date(entry),
        "views": _first_not_none(entry, "view_count", "views"),
        "likes": _first_not_none(entry, "like_count", "likes"),
    }


def _date_sort_key(row):
    """Sort key: the parsed date, or datetime.min when it cannot be parsed."""
    try:
        return datetime.strptime(row["date"], "%Y-%m-%d")
    except (ValueError, TypeError):
        return datetime.min


def fetch_latest_videos(profile_url, max_items=10):
    """Fetch metadata for the most recent videos of a profile via yt-dlp.

    Nothing is downloaded; only metadata is extracted.

    Args:
        profile_url: URL handed to ``YoutubeDL.extract_info``.
        max_items: maximum number of records to return.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``url``, ``date``
        ('YYYY-MM-DD' or ''), ``views`` and ``likes``, sorted by date with the
        newest first (undated records last) and cut to *max_items*. An empty
        list is returned when extraction yields nothing.
    """
    ydl_opts = {
        "ignoreerrors": True,
        "quiet": True,
        "skip_download": True,
    }
    # NOTE(review): every entry of the profile is extracted before the list is
    # trimmed to max_items; for large profiles this may be slow. Limiting the
    # extraction itself would change which items are considered - confirm
    # before changing.
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(profile_url, download=False)
        if not info:
            return []
        normalized = [_normalize_entry(e) for e in _iter_video_entries(info)]

    normalized.sort(key=_date_sort_key, reverse=True)
    return normalized[:max_items]
|
| 106 |
+
|
| 107 |
+
def list_output_files(directory="."):
    """List the ``output*.xlsx`` files found directly inside *directory*.

    Args:
        directory: folder to scan; defaults to the current working directory.

    Returns:
        A list of dicts with the keys ``name`` (file name without directory),
        ``mtime`` (last modification time, 'YYYY-MM-DD HH:MM:SS') and ``size``
        (e.g. '8.5 KB'), ordered from most to least recently modified.
        Directories and files that disappear while scanning are left out.
    """
    found = []
    for name in os.listdir(directory):
        if not (name.startswith("output") and name.endswith(".xlsx")):
            continue
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        try:
            info = os.stat(path)
        except OSError:
            # Removed between listdir() and stat(): just leave it out.
            continue
        record = {
            "name": name,
            "mtime": datetime.fromtimestamp(info.st_mtime).strftime("%Y-%m-%d %H:%M:%S"),
            "size": f"{info.st_size / 1024:.1f} KB",
        }
        found.append((info.st_mtime, record))

    # Newest first; sort on the raw timestamp rather than its formatted string.
    found.sort(key=lambda pair: pair[0], reverse=True)
    return [record for _, record in found]
|
| 122 |
+
|
| 123 |
+
def _collect_new_rows(entries, seen, profile_url):
    """Build spreadsheet rows for the entries whose id is not in *seen*.

    *seen* is updated in place with every id that produces a row. Entries with
    neither an "id" nor a "url" are ignored and not counted.

    Returns:
        (new_rows, skipped): the rows to store and the number of entries
        skipped because they had already been collected.
    """
    new_rows = []
    skipped = 0
    for e in entries:
        vid = e.get("id") or e.get("url")
        if not vid:
            continue
        if vid in seen:
            skipped += 1
            continue
        new_rows.append({
            "source_profile": profile_url,
            "video_id": vid,
            "date": e.get("date", ""),
            "title": e.get("title", ""),
            "video_url": e.get("url", ""),
            "views": e.get("views"),
            "likes": e.get("likes"),
        })
        seen.add(vid)
    return new_rows, skipped


@app.route("/", methods=["GET", "POST"])
def index():
    """Main page.

    GET renders the form together with the list of existing output files.
    POST scrapes the submitted profile URL, stores the videos that were not
    collected before (Excel files + history) and redirects back to the page
    with a flash message describing the outcome.
    """
    if request.method != "POST":
        return render_template("index.html", files=list_output_files())

    profile_url = request.form.get("profile_url", "").strip()
    if not profile_url:
        flash("请先输入 TikTok 或 YouTube 的主页链接。")
        return redirect(url_for("index"))

    try:
        entries = fetch_latest_videos(profile_url, max_items=MAX_NEW)
    except Exception as ex:
        flash(f"抓取失败:{ex}")
        return redirect(url_for("index"))

    if not entries:
        flash("未能提取到视频信息,请确认主页链接是否有效。")
        return redirect(url_for("index"))

    try:
        history = load_history()
        # History is keyed by the profile URL exactly as submitted.
        seen = set(history.get(profile_url, []))
        new_rows, skipped = _collect_new_rows(entries, seen, profile_url)

        time_file = append_to_excel(new_rows) if new_rows else None

        history[profile_url] = list(seen)
        save_history(history)
    except Exception as ex:
        # Request boundary: report storage problems to the user instead of
        # answering with a bare 500, and keep the traceback in the log.
        app.logger.exception("failed to store scrape results for %s", profile_url)
        flash(f"保存失败:{ex}")
        return redirect(url_for("index"))

    if new_rows:
        added = len(new_rows)
        flash(f"抓取完成:新增 {added} 条,跳过 {skipped} 条。"
              f"已保存至 output.xlsx 及 {time_file}。")
    else:
        flash(f"没有发现新视频,跳过 {skipped} 条。")

    return redirect(url_for("index"))
|
| 188 |
+
|
| 189 |
+
@app.route("/download/<filename>")
def download(filename):
    """Send one of the generated ``output*.xlsx`` files as an attachment.

    *filename* comes straight from the URL, so it is only honoured when it is
    a bare file name matching the pattern of the files this app produces;
    anything else (source code, history.json, accounts.json, ...) is treated
    as not found. Unknown files redirect to the main page with a flash
    message.
    """
    is_output_file = (
        os.path.basename(filename) == filename
        and filename.startswith("output")
        and filename.endswith(".xlsx")
    )
    if is_output_file and os.path.isfile(filename):
        # Pass an absolute path so the file that is sent is exactly the one
        # checked above, however send_file resolves relative paths.
        return send_file(os.path.abspath(filename), as_attachment=True)

    flash("文件不存在。")
    return redirect(url_for("index"))
|
| 196 |
+
|
| 197 |
+
if __name__ == '__main__':
    # Development entry point (not used when the app is served by a WSGI
    # server). Listens on all interfaces; the port defaults to 7860 and can be
    # overridden with the PORT environment variable.
    app.run(host='0.0.0.0', port=int(os.environ.get("PORT", 7860)), debug=False)
|
data/output.xlsx
ADDED
|
Binary file (8.55 kB). View file
|
|
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
  scraper:
    build: .
    container_name: tiktok_scraper
    ports:
      # host 7860 -> container 5000, the port gunicorn binds in the Dockerfile CMD
      - "7860:5000"
    volumes:
      - ./data:/app/data  # mount only the data directory so the code is not shadowed
    environment:
      - TZ=Asia/Shanghai
      # NOTE(review): app.py never imports BackgroundScheduler or defines the
      # job it schedules - confirm the scheduler actually starts with this set.
      - RUN_SCHEDULER=true
    restart: unless-stopped
|
history.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{}
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask
yt-dlp
pandas
openpyxl
# WSGI server started by the Dockerfile CMD
gunicorn
|
templates/index.html
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="zh-CN">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8">
|
| 5 |
+
<title>视频抓取器 — TikTok / YouTube</title>
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 7 |
+
<style>
|
| 8 |
+
body { font-family: Arial, sans-serif; margin: 30px; background:#f7f7f7 }
|
| 9 |
+
.card { background:white; padding:20px; border-radius:8px; max-width:800px; margin:auto; box-shadow:0 4px 12px rgba(0,0,0,0.06)}
|
| 10 |
+
input[type=text]{width:100%; padding:10px; margin:8px 0; box-sizing:border-box}
|
| 11 |
+
button{padding:10px 18px; border:none; background:#007bff; color:white; border-radius:6px; cursor:pointer}
|
| 12 |
+
button:hover{background:#0056cc}
|
| 13 |
+
.note{color:#555; font-size:0.9em}
|
| 14 |
+
.flash{background:#fffae6; padding:10px; border-left:4px solid #ffd24d; margin-bottom:12px}
|
| 15 |
+
table{width:100%; border-collapse:collapse; margin-top:20px; font-size:0.95em}
|
| 16 |
+
th,td{padding:8px; border-bottom:1px solid #ddd; text-align:left}
|
| 17 |
+
th{background:#f0f0f0}
|
| 18 |
+
a.download-link{text-decoration:none; color:#007bff}
|
| 19 |
+
a.download-link:hover{text-decoration:underline}
|
| 20 |
+
</style>
|
| 21 |
+
</head>
|
| 22 |
+
<body>
|
| 23 |
+
<div class="card">
|
| 24 |
+
<h2>视频抓取器(TikTok / YouTube)</h2>
|
| 25 |
+
|
| 26 |
+
{% with messages = get_flashed_messages() %}
|
| 27 |
+
{% if messages %}
|
| 28 |
+
<div class="flash">
|
| 29 |
+
{% for msg in messages %}
|
| 30 |
+
<div>{{msg}}</div>
|
| 31 |
+
{% endfor %}
|
| 32 |
+
</div>
|
| 33 |
+
{% endif %}
|
| 34 |
+
{% endwith %}
|
| 35 |
+
|
| 36 |
+
<form method="post">
|
| 37 |
+
<label>主页链接(TikTok 或 YouTube 主页 / Shorts 页面):</label>
|
| 38 |
+
<input type="text" name="profile_url" placeholder="例如:https://www.tiktok.com/@bayashi.tiktok 或 https://www.youtube.com/@AdrenalineRushReviews/shorts">
|
| 39 |
+
<button type="submit">抓取最新 30 条</button>
|
| 40 |
+
<p class="note">说明:每次抓取会跳过已采集过的视频。结果追加到 output.xlsx,并另存时间戳文件。</p>
|
| 41 |
+
</form>
|
| 42 |
+
|
| 43 |
+
<h3 style="margin-top:30px;">📂 当前目录中的数据文件:</h3>
|
| 44 |
+
{% if files %}
|
| 45 |
+
<table>
|
| 46 |
+
<tr><th>文件名</th><th>最后修改时间</th><th>大小</th><th>下载</th></tr>
|
| 47 |
+
{% for f in files %}
|
| 48 |
+
<tr>
|
| 49 |
+
<td>{{ f.name }}</td>
|
| 50 |
+
<td>{{ f.mtime }}</td>
|
| 51 |
+
<td>{{ f.size }}</td>
|
| 52 |
+
<td><a class="download-link" href="{{ url_for('download', filename=f.name) }}">下载</a></td>
|
| 53 |
+
</tr>
|
| 54 |
+
{% endfor %}
|
| 55 |
+
</table>
|
| 56 |
+
{% else %}
|
| 57 |
+
<p>暂无抓取结果文件。</p>
|
| 58 |
+
{% endif %}
|
| 59 |
+
</div>
|
| 60 |
+
</body>
|
| 61 |
+
</html>
|