alige committed on
Commit
5efe4e4
·
verified ·
1 Parent(s): 31e0396

Upload 8 files

Browse files
Files changed (8) hide show
  1. Dockerfile +18 -0
  2. accounts.json +9 -0
  3. app.py +198 -0
  4. data/output.xlsx +0 -0
  5. docker-compose.yml +12 -0
  6. history.json +1 -0
  7. requirements.txt +4 -0
  8. templates/index.html +61 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+
7
+ RUN apt-get update && apt-get install -y --no-install-recommends \
8
+ tzdata && \
9
+ pip install --no-cache-dir -r requirements.txt && \
10
+ apt-get clean && rm -rf /var/lib/apt/lists/*
11
+
12
+ ENV TZ=Asia/Shanghai
13
+ COPY . .
14
+
15
+ ENV FLASK_RUN_HOST=0.0.0.0
16
+ EXPOSE 5000
17
+
18
+ CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app", "--workers", "1", "--timeout", "180"]
accounts.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tiktok": [
3
+ "https://www.tiktok.com/@bayashi.tiktok"
4
+ ],
5
+ "youtube": [
6
+ "https://www.youtube.com/@AdrenalineRushReviews/shorts"
7
+ ],
8
+ "schedule_hours": 6
9
+ }
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import logging
import os
import secrets
from datetime import datetime, timezone

import pandas as pd
from flask import Flask, render_template, request, redirect, url_for, send_file, flash
from yt_dlp import YoutubeDL
7
+
8
app = Flask(__name__)
# Session-signing key. Taken from the SECRET_KEY environment variable when it
# is set; otherwise a random key is generated for this process so that no
# publicly known key ships with the source. A per-process key is only suitable
# for a single worker process; set SECRET_KEY when running several workers.
app.secret_key = os.environ.get("SECRET_KEY") or secrets.token_hex(32)

HISTORY_FILE = "history.json"
OUTPUT_FILE = "output.xlsx"
MAX_NEW = 30  # collect at most the 30 newest videos per run

# Make sure the history file exists before any request tries to read it.
if not os.path.exists(HISTORY_FILE):
    with open(HISTORY_FILE, "w", encoding="utf-8") as _history_fp:
        json.dump({}, _history_fp)

# Scheduled scraping is not implemented in this module: it defines no scrape
# job and imports no scheduler. Warn instead of failing at import time when
# the deployment asks for it via RUN_SCHEDULER=true.
if os.environ.get("RUN_SCHEDULER", "false") == "true":
    logging.getLogger(__name__).warning(
        "RUN_SCHEDULER=true was requested, but scheduled scraping is not "
        "implemented; only manual scraping through the web form is available."
    )
20
+
21
+
22
def load_history():
    """Return the scrape history stored in ``HISTORY_FILE``.

    Returns:
        dict: The decoded JSON object. An empty dict is returned when the
        file is missing, is empty or malformed, or does not contain a JSON
        object, so callers can always use ``.get()`` on the result.
    """
    try:
        with open(HISTORY_FILE, "r", encoding="utf-8") as f:
            history = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been scraped yet.
        return {}
    except json.JSONDecodeError:
        # An empty or truncated file should not take the whole page down.
        logging.getLogger(__name__).warning(
            "%s is not valid JSON; starting with an empty history", HISTORY_FILE
        )
        return {}
    return history if isinstance(history, dict) else {}
25
+
26
def save_history(history):
    """Overwrite ``HISTORY_FILE`` with *history* as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as escape sequences.
    """
    with open(HISTORY_FILE, mode="w", encoding="utf-8") as out:
        json.dump(history, out, indent=2, ensure_ascii=False)
29
+
30
def append_to_excel(rows):
    """Persist freshly scraped records to two workbooks.

    The batch is written on its own to ``output_YYYYMMDD_HHMMSS.xlsx`` and is
    also appended to the cumulative ``OUTPUT_FILE`` (which is created when it
    does not exist yet).

    Args:
        rows: Records accepted by ``pandas.DataFrame`` (the caller passes a
            list of dicts).

    Returns:
        str: Name of the timestamped batch file.
    """
    batch = pd.DataFrame(rows)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    batch_name = f"output_{timestamp}.xlsx"

    # Stand-alone file for this batch only.
    batch.to_excel(batch_name, index=False)

    # Cumulative file: previous contents (if any) followed by this batch.
    cumulative = batch
    if os.path.exists(OUTPUT_FILE):
        previous = pd.read_excel(OUTPUT_FILE)
        cumulative = pd.concat([previous, batch], ignore_index=True)
    cumulative.to_excel(OUTPUT_FILE, index=False)

    return batch_name
52
+
53
def _entry_date(entry):
    """Return the entry's date as ``YYYY-MM-DD``, or ``""`` when unknown."""
    timestamp = entry.get("timestamp")
    if timestamp:
        try:
            moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
            return moment.strftime("%Y-%m-%d")
        except (OverflowError, OSError, ValueError, TypeError):
            # Unusable timestamp: fall back to upload_date below instead of
            # aborting the whole scrape.
            pass
    upload_date = entry.get("upload_date")
    if upload_date:
        try:
            parsed = datetime.strptime(str(upload_date), "%Y%m%d")
        except ValueError:
            # Keep the raw value when it is not in YYYYMMDD form.
            return str(upload_date)
        return parsed.strftime("%Y-%m-%d")
    return ""


def _date_sort_key(video):
    """Sort key: the parsed date, with unparseable dates ordered oldest."""
    try:
        return datetime.strptime(video["date"], "%Y-%m-%d")
    except ValueError:
        return datetime.min


def fetch_latest_videos(profile_url, max_items=10):
    """Extract metadata for the newest videos behind *profile_url*.

    Metadata is read with yt-dlp; nothing is downloaded.

    Args:
        profile_url: URL handed to ``YoutubeDL.extract_info``.
        max_items: Maximum number of videos to return.

    Returns:
        list[dict]: Up to *max_items* dicts with the keys ``id``, ``title``,
        ``url``, ``date`` (``YYYY-MM-DD`` or ``""``), ``views`` and ``likes``,
        sorted by date, newest first. Empty when nothing could be extracted.
    """
    ydl_opts = {
        "ignoreerrors": True,
        "quiet": True,
        "skip_download": True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(profile_url, download=False)
    if not info:
        return []

    if "entries" in info:
        # Playlist-like result. Failed items show up as None because of
        # "ignoreerrors". An empty playlist yields no videos rather than
        # being mistaken for a single video.
        entries = [e for e in (info["entries"] or []) if e]
    else:
        # A single video was returned directly.
        entries = [info]

    normalized = []
    for e in entries:
        view_count = e.get("view_count")
        if view_count is None:
            view_count = e.get("views")
        like_count = e.get("like_count")
        if like_count is None:
            like_count = e.get("likes")
        normalized.append({
            "id": e.get("id") or e.get("display_id") or e.get("url"),
            "title": e.get("title", ""),
            "url": e.get("webpage_url") or e.get("url"),
            "date": _entry_date(e),
            "views": view_count,
            "likes": like_count,
        })

    normalized.sort(key=_date_sort_key, reverse=True)
    return normalized[:max_items]
106
+
107
def list_output_files():
    """List every ``output*.xlsx`` file in the current working directory.

    Returns:
        list[dict]: One dict per file with ``name``, ``mtime`` (formatted as
        ``YYYY-MM-DD HH:MM:SS``) and ``size`` (kilobytes, one decimal, e.g.
        ``"8.5 KB"``), ordered by modification time, newest first.
    """
    def describe(name):
        size_kb = os.path.getsize(name) / 1024
        modified = datetime.fromtimestamp(os.path.getmtime(name))
        return {
            "name": name,
            "mtime": modified.strftime("%Y-%m-%d %H:%M:%S"),
            "size": f"{size_kb:.1f} KB",
        }

    matches = [
        describe(name)
        for name in os.listdir(".")
        if name.startswith("output") and name.endswith(".xlsx")
    ]
    # The formatted timestamp sorts chronologically as plain text.
    return sorted(matches, key=lambda item: item["mtime"], reverse=True)
122
+
123
@app.route("/", methods=["GET", "POST"])
def index():
    """Render the main page (GET) or run one scrape (POST).

    A POST reads ``profile_url`` from the form, fetches up to ``MAX_NEW``
    videos, drops the ones already recorded for that profile in the history,
    writes the remaining rows to the Excel files, stores the updated history
    and redirects back to this page with a flash message describing the
    outcome.
    """
    if request.method != "POST":
        return render_template("index.html", files=list_output_files())

    profile_url = request.form.get("profile_url", "").strip()
    if not profile_url:
        flash("请先输入 TikTok 或 YouTube 的主页链接。")
        return redirect(url_for("index"))

    try:
        entries = fetch_latest_videos(profile_url, max_items=MAX_NEW)
    except Exception as ex:
        flash(f"抓取失败:{ex}")
        return redirect(url_for("index"))

    if not entries:
        flash("未能提取到视频信息,请确认主页链接是否有效。")
        return redirect(url_for("index"))

    history = load_history()
    key = profile_url
    seen = set(history.get(key, []))

    new_rows = []
    skipped = 0
    for entry in entries:
        vid = entry.get("id") or entry.get("url")
        if not vid:
            # Nothing to de-duplicate on; ignore the entry.
            continue
        if vid in seen:
            skipped += 1
            continue
        new_rows.append({
            "source_profile": profile_url,
            "video_id": vid,
            "date": entry.get("date", ""),
            "title": entry.get("title", ""),
            "video_url": entry.get("url", ""),
            "views": entry.get("views"),
            "likes": entry.get("likes"),
        })
        seen.add(vid)
    added = len(new_rows)

    # Write the workbooks first, then record the ids as seen.
    time_file = append_to_excel(new_rows) if new_rows else None

    history[key] = list(seen)
    save_history(history)

    if new_rows:
        flash(f"抓取完成:新增 {added} 条,跳过 {skipped} 条。"
              f"已保存至 output.xlsx 及 {time_file}。")
    else:
        flash(f"没有发现新视频,跳过 {skipped} 条。")

    return redirect(url_for("index"))
188
+
189
@app.route("/download/<filename>")
def download(filename):
    """Send one generated workbook as an attachment.

    Only plain file names of the form ``output*.xlsx`` that exist as regular
    files in the current working directory are served (the same files the
    index page lists). Any other name, including the application's own source
    and JSON state files, is treated as missing.

    Args:
        filename: File name taken from the URL.

    Returns:
        The file as an attachment, or a redirect to the index page with a
        flash message when the file is not available.
    """
    is_output_workbook = (
        filename == os.path.basename(filename)
        and filename.startswith("output")
        and filename.endswith(".xlsx")
    )
    if is_output_workbook and os.path.isfile(filename):
        # Pass an absolute path so the file that was checked is exactly the
        # file that gets sent.
        return send_file(os.path.abspath(filename), as_attachment=True)

    flash("文件不存在。")
    return redirect(url_for("index"))
196
+
197
# Direct execution (``python app.py``): serve on all interfaces, port 7860,
# with the Flask debugger disabled.
if __name__ == "__main__":
    app.run(debug=False, port=7860, host="0.0.0.0")
data/output.xlsx ADDED
Binary file (8.55 kB). View file
 
docker-compose.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
services:
  scraper:
    build: .
    container_name: tiktok_scraper
    ports:
      # The image starts gunicorn bound to container port 5000 (Dockerfile
      # CMD / EXPOSE), so publish host port 7860 onto container port 5000.
      - "7860:5000"
    volumes:
      - ./data:/app/data  # mount only the data directory so the code is not overwritten
    environment:
      - TZ=Asia/Shanghai
      - RUN_SCHEDULER=true
    restart: unless-stopped
history.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
Flask
yt-dlp
pandas
openpyxl
# The Dockerfile CMD starts the app with gunicorn, so it must be installed.
gunicorn
templates/index.html ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="zh-CN">
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>视频抓取器 — TikTok / YouTube</title>
6
+ <meta name="viewport" content="width=device-width, initial-scale=1">
7
+ <style>
8
+ body { font-family: Arial, sans-serif; margin: 30px; background:#f7f7f7 }
9
+ .card { background:white; padding:20px; border-radius:8px; max-width:800px; margin:auto; box-shadow:0 4px 12px rgba(0,0,0,0.06)}
10
+ input[type=text]{width:100%; padding:10px; margin:8px 0; box-sizing:border-box}
11
+ button{padding:10px 18px; border:none; background:#007bff; color:white; border-radius:6px; cursor:pointer}
12
+ button:hover{background:#0056cc}
13
+ .note{color:#555; font-size:0.9em}
14
+ .flash{background:#fffae6; padding:10px; border-left:4px solid #ffd24d; margin-bottom:12px}
15
+ table{width:100%; border-collapse:collapse; margin-top:20px; font-size:0.95em}
16
+ th,td{padding:8px; border-bottom:1px solid #ddd; text-align:left}
17
+ th{background:#f0f0f0}
18
+ a.download-link{text-decoration:none; color:#007bff}
19
+ a.download-link:hover{text-decoration:underline}
20
+ </style>
21
+ </head>
22
+ <body>
23
+ <div class="card">
24
+ <h2>视频抓取器(TikTok / YouTube)</h2>
25
+
26
+ {% with messages = get_flashed_messages() %}
27
+ {% if messages %}
28
+ <div class="flash">
29
+ {% for msg in messages %}
30
+ <div>{{msg}}</div>
31
+ {% endfor %}
32
+ </div>
33
+ {% endif %}
34
+ {% endwith %}
35
+
36
+ <form method="post">
37
+ <label>主页链接(TikTok 或 YouTube 主页 / Shorts 页面):</label>
38
+ <input type="text" name="profile_url" placeholder="例如:https://www.tiktok.com/@bayashi.tiktok 或 https://www.youtube.com/@AdrenalineRushReviews/shorts">
39
+ <button type="submit">抓取最新 30 条</button>
40
+ <p class="note">说明:每次抓取会跳过已采集过的视频。结果追加到 output.xlsx,并另存时间戳文件。</p>
41
+ </form>
42
+
43
+ <h3 style="margin-top:30px;">📂 当前目录中的数据文件:</h3>
44
+ {% if files %}
45
+ <table>
46
+ <tr><th>文件名</th><th>最后修改时间</th><th>大小</th><th>下载</th></tr>
47
+ {% for f in files %}
48
+ <tr>
49
+ <td>{{ f.name }}</td>
50
+ <td>{{ f.mtime }}</td>
51
+ <td>{{ f.size }}</td>
52
+ <td><a class="download-link" href="{{ url_for('download', filename=f.name) }}">下载</a></td>
53
+ </tr>
54
+ {% endfor %}
55
+ </table>
56
+ {% else %}
57
+ <p>暂无抓取结果文件。</p>
58
+ {% endif %}
59
+ </div>
60
+ </body>
61
+ </html>