HoneyTian committed on
Commit
2d22311
·
1 Parent(s): 13f0937
toolbox/hf_netdisk/netdisk/netdisk_client.py CHANGED
@@ -1,6 +1,8 @@
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import argparse
 
 
4
  from typing import List
5
 
6
  from huggingface_hub.hf_api import CommitInfo
@@ -34,10 +36,22 @@ class NetdiskClient(RepoClient):
34
  result: CommitInfo = self.hf_api.delete_file(
35
  path_in_repo=path_in_repo,
36
  repo_id=repo_id,
37
- repo_type="dataset"
38
  )
39
  return result
40
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  def get_args():
43
  parser = argparse.ArgumentParser()
@@ -86,6 +100,13 @@ def main():
86
  )
87
  print(result)
88
 
 
 
 
 
 
 
 
89
  result = client.delete_file(
90
  path_in_repo=args.tgt_file,
91
  repo_id="tianxing1994/hf_netdisk",
 
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  import argparse
4
+ from pathlib import Path
5
+ import shutil
6
  from typing import List
7
 
8
  from huggingface_hub.hf_api import CommitInfo
 
36
  result: CommitInfo = self.hf_api.delete_file(
37
  path_in_repo=path_in_repo,
38
  repo_id=repo_id,
39
+ repo_type="dataset",
40
  )
41
  return result
42
 
43
def download_file(self, remote_file: str, local_file: str, repo_id: str, cache_dir: str = None) -> Path:
    """
    Download one file from a dataset repo and place a copy at ``local_file``.

    The file is fetched with ``self.hf_api.hf_hub_download`` (repo type
    ``"dataset"``), the parent directory of ``local_file`` is created when
    missing, and the downloaded content is copied to ``local_file``.

    :param remote_file: path of the file inside the repo.
    :param local_file: destination path on the local disk.
    :param repo_id: repo identifier passed through to ``hf_hub_download``.
    :param cache_dir: optional cache directory passed through to ``hf_hub_download``.
    :return: the destination as a ``pathlib.Path``.
    """
    cached_path: str = self.hf_api.hf_hub_download(
        filename=remote_file,
        repo_id=repo_id,
        repo_type="dataset",
        cache_dir=cache_dir,
    )

    target = Path(local_file)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Copy instead of move: the returned path lives inside the download cache
    # and may be a relative symlink to the real blob (NOTE(review): this is the
    # documented default cache layout of huggingface_hub - confirm for the
    # version in use). Moving it would relocate the link itself, leaving a
    # dangling link at the destination and a damaged cache entry behind.
    # copyfile follows symlinks, so the destination is always a regular file.
    shutil.copyfile(cached_path, target.as_posix())
    return target
54
+
55
 
56
  def get_args():
57
  parser = argparse.ArgumentParser()
 
100
  )
101
  print(result)
102
 
103
+ # result = client.download_file(
104
+ # remote_file="xianyu/customers/陈杰森/20251226/焦虑个锤子(20251225_165801直播).mp4",
105
+ # local_file="temp.mp4",
106
+ # repo_id="tianxing1994/hf_netdisk",
107
+ # )
108
+ # print(result)
109
+
110
  result = client.delete_file(
111
  path_in_repo=args.tgt_file,
112
  repo_id="tianxing1994/hf_netdisk",
toolbox/youtube_spider/video/video_download.py CHANGED
@@ -54,11 +54,17 @@ class YoutubeVideoDownloadSpider(YoutubeSpiderClient):
54
  target_file = Path(target_file)
55
  target_file.parent.mkdir(parents=True, exist_ok=True)
56
 
 
 
 
 
 
 
57
  ydl_opts = {
58
  'outtmpl': target_file.as_posix(),
59
- 'format': 'bestvideo+bestaudio/best', # 下载最佳视频+音频
60
- 'merge_output_format': 'mp4', # 合并成 mp4
61
- 'noplaylist': True, # 只下载单个视频,不下载播放列表
62
  }
63
 
64
  self.delay_before_download()
@@ -78,5 +84,44 @@ def main():
78
  return
79
 
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  if __name__ == "__main__":
82
- main()
 
54
  target_file = Path(target_file)
55
  target_file.parent.mkdir(parents=True, exist_ok=True)
56
 
57
+ # ydl_opts = {
58
+ # 'outtmpl': target_file.as_posix(),
59
+ # 'format': 'bestvideo+bestaudio/best', # 下载最佳视频+音频
60
+ # 'merge_output_format': 'mp4', # 合并成 mp4
61
+ # 'noplaylist': True, # 只下载单个视频,不下载播放列表
62
+ # }
63
  ydl_opts = {
64
  'outtmpl': target_file.as_posix(),
65
+ "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
66
+ 'merge_output_format': 'mp4',
67
+ 'noplaylist': True,
68
  }
69
 
70
  self.delay_before_download()
 
84
  return
85
 
86
 
87
def main2():
    """
    Download every video listed in ``temp/filename.jsonl`` that is not yet
    present in ``temp/陈杰森``.

    Each non-empty line of the JSONL file is a JSON object with the keys
    ``video_id`` and ``title``. Rows are processed in reverse file order.
    A row counts as finished when ``<title>.mp4`` already exists in the
    target directory. Between two downloads the script pauses for 1800
    seconds.
    """
    import json
    import time  # imported locally so this entry point does not rely on a module-level import
    from project_settings import project_path

    client = YoutubeVideoDownloadSpider()

    filename = project_path / "temp/filename.jsonl"
    target_dir = project_path / "temp/陈杰森"
    target_dir.mkdir(parents=True, exist_ok=True)

    # finished
    finished_set = {video_file.stem for video_file in target_dir.glob("*.mp4")}
    print(f"finished_count: {len(finished_set)}")

    rows = list()
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads
                continue
            rows.append(json.loads(line))

    pending = list()
    for row in reversed(rows):
        title = row["title"]
        # A "/" in the title would turn the file name into a sub-directory path,
        # so the result would never match the "*.mp4" glob above and the video
        # would be downloaded again on every run.
        file_stem = title.replace("/", "_")
        if file_stem in finished_set:
            continue
        pending.append((row["video_id"], title, file_stem))

    for idx, (video_id, title, file_stem) in enumerate(pending):
        if idx > 0:
            # pause only between downloads; no pointless wait after the last one
            print("sleeping ...")
            time.sleep(1800)

        print(f"download; video_id: {video_id}, title: {title}")
        client.download_by_video_id_by_yt_dlp(
            video_id=video_id,
            target_file=(target_dir / f"{file_stem}.mp4").as_posix()
        )

    return
124
+
125
+
126
  if __name__ == "__main__":
127
+ main2()
toolbox/youtube_spider/video/video_list.py CHANGED
@@ -170,8 +170,11 @@ class YoutubeVideoListSpider(YoutubeSpiderClient):
170
  })
171
 
172
  # print(json.dumps(continuation_grid_renderer, ensure_ascii=False, indent=4))
173
- continuation_item_renderer = continuation_grid_renderer["continuationItemRenderer"]
174
- continuation_token = continuation_item_renderer["continuationEndpoint"]["continuationCommand"]["token"]
 
 
 
175
  return video_list, continuation_token
176
 
177
 
@@ -192,5 +195,54 @@ def main():
192
  return
193
 
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  if __name__ == "__main__":
196
- main()
 
170
  })
171
 
172
  # print(json.dumps(continuation_grid_renderer, ensure_ascii=False, indent=4))
173
+
174
+ continuation_token = None
175
+ continuation_item_renderer = continuation_grid_renderer.get("continuationItemRenderer")
176
+ if continuation_item_renderer is not None:
177
+ continuation_token = continuation_item_renderer["continuationEndpoint"]["continuationCommand"]["token"]
178
  return video_list, continuation_token
179
 
180
 
 
195
  return
196
 
197
 
198
def main2():
    """
    Crawl the video list of the channel ``JasonBear131`` and append every
    video whose title contains a ``YYYYMMDD_HHMMSS``-shaped timestamp
    (8 digits, underscore, 6 digits) to ``filename.jsonl``.

    One JSON object per line is written, with the keys ``video_id`` and
    ``title``. The front page is fetched first, then up to 1000 continuation
    pages. Crawling stops early as soon as no continuation token is returned.
    """
    import json  # imported locally so this entry point is self-contained
    import re

    client = YoutubeVideoListSpider()

    # compiled once; it is applied to every title on every page
    pattern = re.compile(r"(\d{8})_(\d{6})", flags=re.IGNORECASE)

    filename = "filename.jsonl"

    with open(filename, "a+", encoding="utf-8") as f:

        def save_matching(video_list):
            # Append each video whose title matches the pattern as one JSON line.
            for video in video_list:
                video_id = video["video_id"]
                title = video["title"]
                if pattern.search(title) is None:
                    continue
                print(title)

                row = json.dumps({
                    "video_id": video_id,
                    "title": title,
                }, ensure_ascii=False)
                f.write(f"{row}\n")
                # flush per row so progress survives an interrupted crawl
                f.flush()

        video_list, continuation_token = client.get_front_page_video_list_pretty("JasonBear131")
        save_matching(video_list)

        for _ in range(1000):
            if continuation_token is None:
                # No further page. NOTE(review): the page parser appears to return
                # None as the token when no continuation item exists - confirm.
                break
            video_list, continuation_token = client.get_continuation_page_video_list_pretty(continuation_token)
            save_matching(video_list)

    return
245
+
246
+
247
  if __name__ == "__main__":
248
+ main2()