bharatverse11 committed on
Commit
ea0c32b
·
verified ·
1 Parent(s): 6e54837

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +50 -15
scraper.py CHANGED
@@ -8,7 +8,15 @@ from dejavu import Dejavu
8
  djv = Dejavu(dburl="sqlite:///fingerprints.db")
9
 
10
  # ============================================================
11
- # ADD ARTIST NAMES — scraper will find top songs automatically
 
 
 
 
 
 
 
 
12
  # ============================================================
13
  ARTISTS = [
14
  "Arijit Singh",
@@ -31,11 +39,10 @@ ARTISTS = [
31
  "Ed Sheeran",
32
  "The Weeknd",
33
  "Drake",
34
- # Add more artists...
35
  ]
36
 
37
- SONGS_PER_ARTIST = 25 # 25 songs x 200 artists = 5000 total
38
- BATCH_SIZE = 500 # 500 songs per daily run
39
  PROGRESS_FILE = "scraper_progress.json"
40
 
41
  # ============================================================
@@ -51,12 +58,11 @@ def save_progress(progress):
51
  json.dump(progress, f, indent=2)
52
 
53
  def build_queue(progress):
54
- """Search YouTube for each artist and build a queue of video URLs."""
55
  if progress["queue"]:
56
  print(f"πŸ“‹ Resuming existing queue ({len(progress['queue'])} songs left)")
57
  return progress
58
 
59
- print("πŸ” Building song queue from artist names...")
60
  queue = []
61
  done_set = set(progress["done"])
62
 
@@ -66,27 +72,58 @@ def build_queue(progress):
66
  'extract_flat': True,
67
  }
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  for artist in ARTISTS:
70
  search_query = f"ytsearch{SONGS_PER_ARTIST}:{artist} songs"
71
  try:
72
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
73
  result = ydl.extract_info(search_query, download=False)
74
  entries = result.get('entries', [])
 
75
  for entry in entries:
 
 
76
  url = f"https://www.youtube.com/watch?v={entry['id']}"
77
  title = entry.get('title', artist)
78
  if url not in done_set:
79
  queue.append({"url": url, "title": title, "artist": artist})
80
- print(f" βœ… Found songs for: {artist}")
 
81
  except Exception as e:
82
  print(f" ❌ Failed to search {artist}: {e}")
83
 
84
  progress["queue"] = queue
85
  save_progress(progress)
86
- print(f"πŸ“‹ Total queue built: {len(queue)} songs")
87
  return progress
88
 
89
  def download_and_fingerprint(url, title):
 
 
 
 
 
 
90
  ydl_opts = {
91
  'format': 'bestaudio/best',
92
  'outtmpl': '/tmp/song.%(ext)s',
@@ -97,24 +134,24 @@ def download_and_fingerprint(url, title):
97
  'quiet': True,
98
  'no_warnings': True,
99
  }
 
100
  try:
101
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
102
  ydl.extract_info(url, download=True)
103
 
104
- wav_path = '/tmp/song.wav'
105
  if os.path.exists(wav_path):
106
  djv.fingerprint_file(wav_path, song_name=title)
107
  print(f" βœ… Fingerprinted: {title}")
108
  os.remove(wav_path)
109
  return True
110
  else:
111
- print(f" ❌ WAV not found: {title}")
112
  return False
 
113
  except Exception as e:
114
  print(f" ❌ Failed: {title} β†’ {e}")
115
- # Clean up if file exists
116
- if os.path.exists('/tmp/song.wav'):
117
- os.remove('/tmp/song.wav')
118
  return False
119
 
120
  def run_batch():
@@ -128,7 +165,6 @@ def run_batch():
128
  print("πŸŽ‰ All songs have been fingerprinted!")
129
  return
130
 
131
- # Take next BATCH_SIZE from queue
132
  batch = queue[:BATCH_SIZE]
133
  remaining = queue[BATCH_SIZE:]
134
 
@@ -147,7 +183,6 @@ def run_batch():
147
  else:
148
  failed += 1
149
 
150
- # Update progress
151
  progress["queue"] = remaining
152
  progress["done"] = done
153
  save_progress(progress)
 
8
  djv = Dejavu(dburl="sqlite:///fingerprints.db")
9
 
10
  # ============================================================
11
+ # PLAYLISTS — songs will be scraped from these directly
12
+ # ============================================================
13
+ PLAYLISTS = [
14
+ "https://www.youtube.com/playlist?list=PLr7xQC-cXWL9EZ3dqpu8E_Xf_4nhS6xEJ",
15
+ "https://www.youtube.com/playlist?list=PLxA687tYuMWjrFhZTNBtk13YUL2TkwUnU",
16
+ ]
17
+
18
+ # ============================================================
19
+ # ARTISTS — scraper will find top songs automatically
20
  # ============================================================
21
  ARTISTS = [
22
  "Arijit Singh",
 
39
  "Ed Sheeran",
40
  "The Weeknd",
41
  "Drake",
 
42
  ]
43
 
44
+ SONGS_PER_ARTIST = 25
45
+ BATCH_SIZE = 50
46
  PROGRESS_FILE = "scraper_progress.json"
47
 
48
  # ============================================================
 
58
  json.dump(progress, f, indent=2)
59
 
60
  def build_queue(progress):
 
61
  if progress["queue"]:
62
  print(f"πŸ“‹ Resuming existing queue ({len(progress['queue'])} songs left)")
63
  return progress
64
 
65
+ print("πŸ” Building song queue from playlists and artist names...")
66
  queue = []
67
  done_set = set(progress["done"])
68
 
 
72
  'extract_flat': True,
73
  }
74
 
75
+ # ── Playlists ──────────────────────────────────────────
76
+ for playlist_url in PLAYLISTS:
77
+ try:
78
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
79
+ result = ydl.extract_info(playlist_url, download=False)
80
+ playlist_title = result.get('title', playlist_url)
81
+ entries = result.get('entries', [])
82
+ added = 0
83
+ for entry in entries:
84
+ if not entry or not entry.get('id'):
85
+ continue
86
+ url = f"https://www.youtube.com/watch?v={entry['id']}"
87
+ title = entry.get('title', 'Unknown')
88
+ if url not in done_set:
89
+ queue.append({"url": url, "title": title, "artist": "playlist"})
90
+ added += 1
91
+ print(f" βœ… Playlist '{playlist_title}': {added} songs added")
92
+ except Exception as e:
93
+ print(f" ❌ Failed to load playlist {playlist_url}: {e}")
94
+
95
+ # ── Artists ────────────────────────────────────────────
96
  for artist in ARTISTS:
97
  search_query = f"ytsearch{SONGS_PER_ARTIST}:{artist} songs"
98
  try:
99
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
100
  result = ydl.extract_info(search_query, download=False)
101
  entries = result.get('entries', [])
102
+ added = 0
103
  for entry in entries:
104
+ if not entry or not entry.get('id'):
105
+ continue
106
  url = f"https://www.youtube.com/watch?v={entry['id']}"
107
  title = entry.get('title', artist)
108
  if url not in done_set:
109
  queue.append({"url": url, "title": title, "artist": artist})
110
+ added += 1
111
+ print(f" βœ… Artist '{artist}': {added} songs added")
112
  except Exception as e:
113
  print(f" ❌ Failed to search {artist}: {e}")
114
 
115
  progress["queue"] = queue
116
  save_progress(progress)
117
+ print(f"\nπŸ“‹ Total queue built: {len(queue)} songs")
118
  return progress
119
 
120
  def download_and_fingerprint(url, title):
121
+ wav_path = '/tmp/song.wav'
122
+
123
+ # Clean up any leftover file from previous run
124
+ if os.path.exists(wav_path):
125
+ os.remove(wav_path)
126
+
127
  ydl_opts = {
128
  'format': 'bestaudio/best',
129
  'outtmpl': '/tmp/song.%(ext)s',
 
134
  'quiet': True,
135
  'no_warnings': True,
136
  }
137
+
138
  try:
139
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
140
  ydl.extract_info(url, download=True)
141
 
 
142
  if os.path.exists(wav_path):
143
  djv.fingerprint_file(wav_path, song_name=title)
144
  print(f" βœ… Fingerprinted: {title}")
145
  os.remove(wav_path)
146
  return True
147
  else:
148
+ print(f" ❌ WAV not found after download: {title}")
149
  return False
150
+
151
  except Exception as e:
152
  print(f" ❌ Failed: {title} β†’ {e}")
153
+ if os.path.exists(wav_path):
154
+ os.remove(wav_path)
 
155
  return False
156
 
157
  def run_batch():
 
165
  print("πŸŽ‰ All songs have been fingerprinted!")
166
  return
167
 
 
168
  batch = queue[:BATCH_SIZE]
169
  remaining = queue[BATCH_SIZE:]
170
 
 
183
  else:
184
  failed += 1
185
 
 
186
  progress["queue"] = remaining
187
  progress["done"] = done
188
  save_progress(progress)