Spaces:

alige
/

Dk

Sleeping

App Files Files Community

alige committed on Jan 12

Commit

5efe4e4

verified ·

1 Parent(s): 31e0396

Upload 8 files

Browse files

Files changed (8) hide show

Dockerfile +18 -0
accounts.json +9 -0
app.py +198 -0
data/output.xlsx +0 -0
docker-compose.yml +12 -0
history.json +1 -0
requirements.txt +4 -0
templates/index.html +61 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,18 @@

+FROM python:3.11-slim
+WORKDIR /app
+COPY requirements.txt .
+# gunicorn is installed explicitly so the CMD below always finds it,
+# independent of what requirements.txt happens to list.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    tzdata && \
+    pip install --no-cache-dir -r requirements.txt gunicorn && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+ENV TZ=Asia/Shanghai
+COPY . .
+ENV FLASK_RUN_HOST=0.0.0.0
+# Listen on 7860: the port published by docker-compose.yml ("7860:7860")
+# and the one app.py uses when started directly.
+EXPOSE 7860
+CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app", "--workers", "1", "--timeout", "180"]

accounts.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "tiktok": [
+    "https://www.tiktok.com/@bayashi.tiktok"
+  ],
+  "youtube": [
+    "https://www.youtube.com/@AdrenalineRushReviews/shorts"
+  ],
+  "schedule_hours": 6
+}

app.py ADDED Viewed

	@@ -0,0 +1,198 @@

+import json
+import logging
+import os
+import threading
+from datetime import datetime, timezone
+
+import pandas as pd
+from flask import Flask, render_template, request, redirect, url_for, send_file, flash
+from yt_dlp import YoutubeDL
+app = Flask(__name__)
+app.secret_key = "change_this_to_something_random"
+HISTORY_FILE = "history.json"
+OUTPUT_FILE = "output.xlsx"
+MAX_NEW = 30  # 每次最多采集最新 30 个视频
+# 确保 history 文件存在
+if os.environ.get("RUN_SCHEDULER", "false") == "true":
+    scheduler = BackgroundScheduler()
+    scheduler.add_job(run_scheduled_scrape, 'interval', hours=get_schedule_hours())
+    scheduler.start()
+def load_history():
+    with open(HISTORY_FILE, "r", encoding="utf-8") as f:
+        return json.load(f)
+def save_history(history):
+    with open(HISTORY_FILE, "w", encoding="utf-8") as f:
+        json.dump(history, f, ensure_ascii=False, indent=2)
+def append_to_excel(rows):
+    """
+    将新抓取的记录写入两个文件：
+      1. output.xlsx（累计追加）
+      2. output_YYYYMMDD_HHMMSS.xlsx（当前批次独立保存）
+    """
+    df = pd.DataFrame(rows)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    time_file = f"output_{timestamp}.xlsx"
+    # 写出当前批次独立文件
+    df.to_excel(time_file, index=False)
+    # 同时更新主文件 output.xlsx（追加）
+    if os.path.exists(OUTPUT_FILE):
+        old = pd.read_excel(OUTPUT_FILE)
+        combined = pd.concat([old, df], ignore_index=True)
+        combined.to_excel(OUTPUT_FILE, index=False)
+    else:
+        df.to_excel(OUTPUT_FILE, index=False)
+    return time_file
+def fetch_latest_videos(profile_url, max_items=10):
+    ydl_opts = {
+        "ignoreerrors": True,
+        "quiet": True,
+        "skip_download": True,
+    }
+    with YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(profile_url, download=False)
+    entries = []
+    if not info:
+        return []
+    if "entries" in info and info["entries"]:
+        for e in info["entries"]:
+            if e:
+                entries.append(e)
+    else:
+        entries = [info]
+    normalized = []
+    for e in entries:
+        video_id = e.get("id") or e.get("display_id") or e.get("url")
+        title = e.get("title", "")
+        webpage_url = e.get("webpage_url") or e.get("url")
+        upload_date = e.get("upload_date")
+        timestamp = e.get("timestamp")
+        if timestamp:
+            dt = datetime.utcfromtimestamp(timestamp)
+            date_str = dt.strftime("%Y-%m-%d")
+        elif upload_date:
+            try:
+                date_str = datetime.strptime(str(upload_date), "%Y%m%d").strftime("%Y-%m-%d")
+            except Exception:
+                date_str = str(upload_date)
+        else:
+            date_str = ""
+        view_count = e.get("view_count") if e.get("view_count") is not None else e.get("views")
+        like_count = e.get("like_count") if e.get("like_count") is not None else e.get("likes")
+        normalized.append({
+            "id": video_id,
+            "title": title,
+            "url": webpage_url,
+            "date": date_str,
+            "views": view_count,
+            "likes": like_count
+        })
+    def sort_key(x):
+        try:
+            return datetime.strptime(x["date"], "%Y-%m-%d")
+        except:
+            return datetime.min
+    normalized.sort(key=sort_key, reverse=True)
+    return normalized[:max_items]
+def list_output_files():
+    """列出当前目录下所有 output*.xlsx 文件（带时间和大小）"""
+    files = []
+    for f in os.listdir("."):
+        if f.startswith("output") and f.endswith(".xlsx"):
+            size_kb = os.path.getsize(f) / 1024
+            mtime = datetime.fromtimestamp(os.path.getmtime(f)).strftime("%Y-%m-%d %H:%M:%S")
+            files.append({
+                "name": f,
+                "mtime": mtime,
+                "size": f"{size_kb:.1f} KB"
+            })
+    # 按时间倒序
+    files.sort(key=lambda x: x["mtime"], reverse=True)
+    return files
+@app.route("/", methods=["GET", "POST"])
+def index():
+    if request.method == "POST":
+        profile_url = request.form.get("profile_url", "").strip()
+        if not profile_url:
+            flash("请先输入 TikTok 或 YouTube 的主页链接。")
+            return redirect(url_for("index"))
+        try:
+            entries = fetch_latest_videos(profile_url, max_items=MAX_NEW)
+        except Exception as ex:
+            flash(f"抓取失败：{ex}")
+            return redirect(url_for("index"))
+        if not entries:
+            flash("未能提取到视频信息，请确认主页链接是否有效。")
+            return redirect(url_for("index"))
+        history = load_history()
+        key = profile_url
+        seen = set(history.get(key, []))
+        new_rows = []
+        new_ids = []
+        skipped = 0
+        added = 0
+        for e in entries:
+            vid = e.get("id") or e.get("url")
+            if not vid:
+                continue
+            if vid in seen:
+                skipped += 1
+                continue
+            row = {
+                "source_profile": profile_url,
+                "video_id": vid,
+                "date": e.get("date", ""),
+                "title": e.get("title", ""),
+                "video_url": e.get("url", ""),
+                "views": e.get("views"),
+                "likes": e.get("likes")
+            }
+            new_rows.append(row)
+            new_ids.append(vid)
+            seen.add(vid)
+            added += 1
+        if new_rows:
+            time_file = append_to_excel(new_rows)
+        else:
+            time_file = None
+        history[key] = list(seen)
+        save_history(history)
+        if new_rows:
+            flash(f"抓取完成：新增 {added} 条，跳过 {skipped} 条。"
+                  f"已保存至 output.xlsx 及 {time_file}。")
+        else:
+            flash(f"没有发现新视频，跳过 {skipped} 条。")
+        return redirect(url_for("index"))
+    files = list_output_files()
+    return render_template("index.html", files=files)
+@app.route("/download/<filename>")
+def download(filename):
+    if os.path.exists(filename):
+        return send_file(filename, as_attachment=True)
+    else:
+        flash("文件不存在。")
+        return redirect(url_for("index"))
+# Development entry point: runs only when the file is executed directly.
+# Importing the module as "app:app" (as the gunicorn command in the
+# Dockerfile does) skips this block.
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860, debug=False)

data/output.xlsx ADDED Viewed

Binary file (8.55 kB). View file

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,12 @@

+# Single-service stack for the scraper web app.
+services:
+  scraper:
+    build: .
+    container_name: tiktok_scraper
+    ports:
+        - "7860:7860"
+    volumes:
+      - ./data:/app/data  # ✅ Corrected mount path so the volume does not shadow the application code
+      # NOTE(review): app.py writes output*.xlsx and history.json with relative
+      # paths and the image's WORKDIR is /app, so those files presumably land
+      # outside this volume; confirm whether they are meant to persist.
+    environment:
+      - TZ=Asia/Shanghai
+      - RUN_SCHEDULER=true  # app.py enables its periodic scrape when this is "true"
+    restart: unless-stopped

history.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {}

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+Flask
+yt-dlp
+pandas
+openpyxl
+# WSGI server launched by the Dockerfile CMD
+gunicorn

templates/index.html ADDED Viewed

	@@ -0,0 +1,61 @@

+<!doctype html>
+<html lang="zh-CN">
+<head>
+  <meta charset="utf-8">
+  <title>视频抓取器 — TikTok / YouTube</title>
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <style>
+    body { font-family: Arial, sans-serif; margin: 30px; background:#f7f7f7 }
+    .card { background:white; padding:20px; border-radius:8px; max-width:800px; margin:auto; box-shadow:0 4px 12px rgba(0,0,0,0.06)}
+    input[type=text]{width:100%; padding:10px; margin:8px 0; box-sizing:border-box}
+    button{padding:10px 18px; border:none; background:#007bff; color:white; border-radius:6px; cursor:pointer}
+    button:hover{background:#0056cc}
+    .note{color:#555; font-size:0.9em}
+    .flash{background:#fffae6; padding:10px; border-left:4px solid #ffd24d; margin-bottom:12px}
+    table{width:100%; border-collapse:collapse; margin-top:20px; font-size:0.95em}
+    th,td{padding:8px; border-bottom:1px solid #ddd; text-align:left}
+    th{background:#f0f0f0}
+    a.download-link{text-decoration:none; color:#007bff}
+    a.download-link:hover{text-decoration:underline}
+  </style>
+</head>
+<body>
+  <div class="card">
+    <h2>视频抓取器（TikTok / YouTube）</h2>
+    {# Status messages queued by the view with flash(). #}
+    {% with messages = get_flashed_messages() %}
+      {% if messages %}
+        <div class="flash">
+          {% for msg in messages %}
+            <div>{{msg}}</div>
+          {% endfor %}
+        </div>
+      {% endif %}
+    {% endwith %}
+    {# Scrape form: posts profile_url back to the same page. The label is
+       bound to the input through for/id so assistive technology announces
+       it and clicking the label focuses the field. #}
+    <form method="post">
+      <label for="profile_url">主页链接（TikTok 或 YouTube 主页 / Shorts 页面）：</label>
+      <input type="text" id="profile_url" name="profile_url" placeholder="例如：https://www.tiktok.com/@bayashi.tiktok 或 https://www.youtube.com/@AdrenalineRushReviews/shorts">
+      <button type="submit">抓取最新 30 条</button>
+      <p class="note">说明：每次抓取会跳过已采集过的视频。结果追加到 output.xlsx，并另存时间戳文件。</p>
+    </form>
+    <h3 style="margin-top:30px;">📂 当前目录中的数据文件：</h3>
+    {# `files` is the list passed by the view; each item exposes name, mtime and size. #}
+    {% if files %}
+      <table>
+        <tr><th>文件名</th><th>最后修改时间</th><th>大小</th><th>下载</th></tr>
+        {% for f in files %}
+        <tr>
+          <td>{{ f.name }}</td>
+          <td>{{ f.mtime }}</td>
+          <td>{{ f.size }}</td>
+          <td><a class="download-link" href="{{ url_for('download', filename=f.name) }}">下载</a></td>
+        </tr>
+        {% endfor %}
+      </table>
+    {% else %}
+      <p>暂无抓取结果文件。</p>
+    {% endif %}
+  </div>
+</body>
+</html>