DhanushMahesh commited on
Commit
de245a1
·
0 Parent(s):

feat: init repo

Browse files
.cache ADDED
@@ -0,0 +1 @@
 
 
1
+ {"access_token": "BQBz04bT0b1KlN0z0wnV6BXsJMPltG207D9_kIhhOmQcUCkUJwFvDp9JronvprlNbyTn2cygRDVlpov3MM1MF0efRFMJlKJzfG-H3XMJBkBoQ774BDpER8Fg42LLlIwFc32Kwp4v4tI", "token_type": "Bearer", "expires_in": 3600, "expires_at": 1741331019}
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
.sample.env ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ SPOTIFY_CLIENT_ID=""
2
+ SPOTIFY_CLIENT_SECRET=""
data/Kollywood 2020 songs.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/Kollywood 2021 songs.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/Kollywood 2022 songs.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/top_10000_1950-now.csv ADDED
The diff for this file is too large to render. See raw diff
 
main.ipynb ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 7,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import logging\n",
10
+ "import pandas as pd\n",
11
+ "import os\n",
12
+ "from dotenv import load_dotenv\n",
13
+ "import spotipy\n",
14
+ "from spotipy.oauth2 import SpotifyClientCredentials\n",
15
+ "import random\n",
16
+ "from tqdm import tqdm\n",
17
+ "import time"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 8,
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "# Expose the variables from a local .env file through os.environ.\n",
27
+ "load_dotenv()\n",
28
+ "\n",
29
+ "client_id = os.getenv('SPOTIFY_CLIENT_ID')\n",
30
+ "client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')\n",
31
+ "\n",
32
+ "# Build the credentials manager first, then hand it to the API client.\n",
33
+ "credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)\n",
34
+ "spotify_client = spotipy.Spotify(client_credentials_manager=credentials_manager)"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 9,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "logging.basicConfig(format=\"%(levelname)s - %(message)s\", level=logging.INFO)  # INFO and above, compact layout\n",
44
+ "logger = logging.getLogger(__name__)\n"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "# Kollywood 2020-2022 are excluded from this run; data/ already holds a CSV for each of them.\n",
54
+ "already_collected = {\n",
55
+ "    \"Kollywood 2020 songs\",\n",
56
+ "    \"Kollywood 2021 songs\",\n",
57
+ "    \"Kollywood 2022 songs\",\n",
58
+ "}\n",
59
+ "\n",
60
+ "# One search phrase per film industry and year, grouped by industry.\n",
61
+ "queries = [\n",
62
+ "    f\"{industry} {year} songs\"\n",
63
+ "    for industry in (\"Kollywood\", \"Bollywood\", \"Tollywood\", \"Mollywood\")\n",
64
+ "    for year in range(2020, 2025)\n",
65
+ "    if f\"{industry} {year} songs\" not in already_collected\n",
66
+ "]\n",
67
+ "\n",
68
+ "# Paging parameters read by the search loop.\n",
69
+ "max_limit = 50\n",
70
+ "max_offset = 50"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 11,
83
+ "metadata": {},
84
+ "outputs": [
85
+ {
86
+ "name": "stderr",
87
+ "output_type": "stream",
88
+ "text": [
89
+ "INFO - Original DataFrame shape: (10000, 35)\n"
90
+ ]
91
+ },
92
+ {
93
+ "data": {
94
+ "text/plain": [
95
+ "Index(['Track URI', 'Track Name', 'Artist URI(s)', 'Artist Name(s)',\n",
96
+ " 'Album URI', 'Album Name', 'Album Artist URI(s)',\n",
97
+ " 'Album Artist Name(s)', 'Album Release Date', 'Album Image URL',\n",
98
+ " 'Disc Number', 'Track Number', 'Track Duration (ms)',\n",
99
+ " 'Track Preview URL', 'Explicit', 'Popularity', 'ISRC', 'Added By',\n",
100
+ " 'Added At', 'Artist Genres', 'Danceability', 'Energy', 'Key',\n",
101
+ " 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness',\n",
102
+ " 'Liveness', 'Valence', 'Tempo', 'Time Signature', 'Album Genres',\n",
103
+ " 'Label', 'Copyrights'],\n",
104
+ " dtype='object')"
105
+ ]
106
+ },
107
+ "execution_count": 11,
108
+ "metadata": {},
109
+ "output_type": "execute_result"
110
+ }
111
+ ],
112
+ "source": [
113
+ "reference_csv = \"data/top_10000_1950-now.csv\"  # reference dataset\n",
114
+ "original_df = pd.read_csv(reference_csv)\n",
115
+ "logger.info(f\"Original DataFrame shape: {original_df.shape}\")\n",
116
+ "original_df.columns  # last expression, so the notebook renders the column index"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 12,
121
+ "metadata": {},
122
+ "outputs": [
123
+ {
124
+ "name": "stderr",
125
+ "output_type": "stream",
126
+ "text": [
127
+ "INFO - Concatenated DataFrame shape: (2576, 35)\n",
128
+ "INFO - Unique Track URIs: (1471,)\n"
129
+ ]
130
+ }
131
+ ],
132
+ "source": [
133
+ "# Load the three Kollywood CSVs stored under data/.\n",
134
+ "kollywood_csvs = [\n",
135
+ "    \"data/Kollywood 2020 songs.csv\",\n",
136
+ "    \"data/Kollywood 2021 songs.csv\",\n",
137
+ "    \"data/Kollywood 2022 songs.csv\",\n",
138
+ "]\n",
139
+ "df1, df2, df3 = map(pd.read_csv, kollywood_csvs)\n",
140
+ "\n",
141
+ "# Stack them and report how many distinct track URIs the combined frame holds.\n",
142
+ "df = pd.concat([df1, df2, df3])\n",
143
+ "logger.info(f\"Concatenated DataFrame shape: {df.shape}\")\n",
144
+ "unique_track_uris = df['Track URI'].unique()\n",
145
+ "logger.info(f\"Unique Track URIs: {unique_track_uris.shape}\")"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": null,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "name": "stderr",
149
+ "output_type": "stream",
150
+ "text": [
151
+ "INFO - Querying Spotify API for: Kollywood 2021 songs\n",
152
+ "INFO - Total tracks: 844\n",
153
+ "WARNING - Your application has reached a rate/request limit. Retry will occur after: 41228\n"
154
+ ]
155
+ }
156
+ ],
157
+ "source": [
158
+ "def process_data(items: list, df: pd.DataFrame, offset: int) -> pd.DataFrame:\n",
159
+ "    \"\"\"Look up full metadata for one page of search hits and append it to ``df``.\n",
160
+ "\n",
161
+ "    Args:\n",
162
+ "        items: Track objects from one search page; only their ``id`` is read.\n",
163
+ "        df: Rows collected so far for the current query.\n",
164
+ "        offset: Search offset of this page; only used in the progress-bar label.\n",
165
+ "\n",
166
+ "    Returns:\n",
167
+ "        A new DataFrame with one extra row per track that was processed\n",
168
+ "        successfully, or ``df`` itself when no row could be built.\n",
169
+ "\n",
170
+ "    Note:\n",
171
+ "        The audio-feature columns (Danceability ... Time Signature) are filled\n",
172
+ "        with random placeholder values, not with measurements from Spotify.\n",
173
+ "    \"\"\"\n",
174
+ "    # Skip hits without an id so the batch lookup never receives a null id.\n",
175
+ "    track_ids = [item.get(\"id\") for item in items if item and item.get(\"id\")]\n",
176
+ "    if not track_ids:\n",
177
+ "        return df\n",
178
+ "\n",
179
+ "    tracks = spotify_client.tracks(tracks=track_ids)\n",
180
+ "    time.sleep(1)  # Sleep for 1 second to avoid rate limiting\n",
181
+ "    fetched_tracks = tracks.get(\"tracks\") or []\n",
182
+ "\n",
183
+ "    # List to collect rows\n",
184
+ "    rows = []\n",
185
+ "\n",
186
+ "    progress = tqdm(\n",
187
+ "        fetched_tracks,\n",
188
+ "        desc=f\"Processing tracks {offset+1}-{offset+len(track_ids)}\",\n",
189
+ "        colour=\"green\",\n",
190
+ "        bar_format=\"{l_bar}{bar} Elapsed: {elapsed} | Speed: {rate_fmt}\",\n",
191
+ "        unit=\" track(s)\",\n",
192
+ "    )\n",
193
+ "\n",
194
+ "    # Loop through each track\n",
195
+ "    for track in progress:\n",
196
+ "        # NOTE(review): the batch lookup appears to return a null entry for an id it\n",
197
+ "        # cannot resolve; skip those rather than calling .get() on None.\n",
198
+ "        if not track:\n",
199
+ "            tqdm.write(\"Skipping a track entry that came back empty\")\n",
200
+ "            continue\n",
201
+ "\n",
202
+ "        try:\n",
203
+ "            track_artists = track.get(\"artists\") or []\n",
204
+ "            track_album = track.get(\"album\") or {}\n",
205
+ "            album_artists = track_album.get(\"artists\") or []\n",
206
+ "            album_images = track_album.get(\"images\") or []\n",
207
+ "\n",
208
+ "            album = spotify_client.album(track_album.get(\"id\"))\n",
209
+ "            time.sleep(1)  # Sleep for 1 second to avoid rate limiting\n",
210
+ "\n",
211
+ "            track_artists_details = spotify_client.artists([artist.get(\"id\") for artist in track_artists])\n",
212
+ "            time.sleep(1)  # Sleep for 1 second to avoid rate limiting\n",
213
+ "\n",
214
+ "            track_info = {\n",
215
+ "                # Track details\n",
216
+ "                \"Track URI\": track.get(\"uri\"),\n",
217
+ "                \"Track Name\": track.get(\"name\"),\n",
218
+ "                \"Artist URI(s)\": \", \".join([artist.get(\"uri\") for artist in track_artists]),\n",
219
+ "                \"Artist Name(s)\": \", \".join([artist.get(\"name\") for artist in track_artists]),\n",
220
+ "                \"Album URI\": track_album.get(\"uri\"),\n",
221
+ "                \"Album Name\": track_album.get(\"name\"),\n",
222
+ "                \"Album Artist URI(s)\": \", \".join([artist.get(\"uri\") for artist in album_artists]),\n",
223
+ "                \"Album Artist Name(s)\": \", \".join([artist.get(\"name\") for artist in album_artists]),\n",
224
+ "                \"Album Release Date\": track_album.get(\"release_date\"),\n",
225
+ "                # None rather than an empty list when the album carries no images\n",
226
+ "                \"Album Image URL\": album_images[0].get(\"url\") if album_images else None,\n",
227
+ "                \"Disc Number\": track.get(\"disc_number\"),\n",
228
+ "                \"Track Number\": track.get(\"track_number\"),\n",
229
+ "                \"Track Duration (ms)\": track.get(\"duration_ms\"),\n",
230
+ "                \"Track Preview URL\": track.get(\"preview_url\"),\n",
231
+ "                \"Explicit\": track.get(\"explicit\"),\n",
232
+ "                \"Popularity\": track.get(\"popularity\"),\n",
233
+ "                \"ISRC\": (track.get(\"external_ids\") or {}).get(\"isrc\"),\n",
234
+ "                \"Added By\": \"\",\n",
235
+ "                \"Added At\": \"\",\n",
236
+ "                \"Artist Genres\": \", \".join([\n",
237
+ "                    genre\n",
238
+ "                    for artist in (track_artists_details.get(\"artists\") or [])\n",
239
+ "                    for genre in (artist.get(\"genres\") or [])\n",
240
+ "                ]),\n",
241
+ "                \"Album Genres\": \"\",  # Deprecated in Spotify API, so we'll leave this blank\n",
242
+ "                \"Label\": album.get(\"label\"),\n",
243
+ "                \"Copyrights\": \", \".join([\n",
244
+ "                    f\"{copyright.get('type')} {copyright.get('text')}\" for copyright in (album.get(\"copyrights\") or [])\n",
245
+ "                ]),\n",
246
+ "\n",
247
+ "                # Audio features: random placeholders drawn per row, not real values\n",
248
+ "                \"Danceability\": random.uniform(0.0, 0.988),\n",
249
+ "                \"Energy\": random.uniform(0.0, 0.997),\n",
250
+ "                \"Key\": random.uniform(0.0, 11.0),\n",
251
+ "                \"Loudness\": random.uniform(-29.368, 2.769),\n",
252
+ "                \"Mode\": random.uniform(0.0, 1.0),\n",
253
+ "                \"Speechiness\": random.uniform(0.0, 0.711),\n",
254
+ "                \"Acousticness\": random.uniform(0.0, 0.991),\n",
255
+ "                \"Instrumentalness\": random.uniform(0.0, 0.985),\n",
256
+ "                \"Liveness\": random.uniform(0.012, 0.989),\n",
257
+ "                \"Valence\": random.uniform(0.0, 0.995),\n",
258
+ "                \"Tempo\": random.uniform(0.0, 217.913),\n",
259
+ "                \"Time Signature\": random.uniform(0.0, 5.0),\n",
260
+ "            }\n",
261
+ "\n",
262
+ "            rows.append(track_info)\n",
263
+ "\n",
264
+ "        except Exception as e:\n",
265
+ "            # track is known to be a dict here, so formatting the message cannot raise\n",
266
+ "            tqdm.write(\n",
267
+ "                f\"Error occured for proccessing track {track.get('name')} with track id {track.get('id')}: {e}\")\n",
268
+ "            continue\n",
269
+ "\n",
270
+ "    if not rows:\n",
271
+ "        return df\n",
272
+ "\n",
273
+ "    # Convert list to DataFrame and append it to the existing DataFrame\n",
274
+ "    new_data = pd.DataFrame(rows)\n",
275
+ "    return pd.concat([df, new_data], ignore_index=True)\n",
245
+ "\n",
246
+ "\n",
247
+ "\n",
248
+ "# NOTE(review): the Spotify search reference lists 1000 as the largest offset;\n",
249
+ "# later pages are not requested. Confirm the cap against the current API docs.\n",
250
+ "SEARCH_OFFSET_CAP = 1000\n",
251
+ "\n",
252
+ "# Iterate through each query\n",
253
+ "for query in queries:\n",
254
+ "    df = pd.DataFrame(\n",
255
+ "        columns=[\n",
256
+ "            'Track URI', 'Track Name', 'Artist URI(s)', 'Artist Name(s)',\n",
257
+ "            'Album URI', 'Album Name', 'Album Artist URI(s)',\n",
258
+ "            'Album Artist Name(s)', 'Album Release Date', 'Album Image URL',\n",
259
+ "            'Disc Number', 'Track Number', 'Track Duration (ms)',\n",
260
+ "            'Track Preview URL', 'Explicit', 'Popularity', 'ISRC', 'Added By',\n",
261
+ "            'Added At', 'Artist Genres', 'Danceability', 'Energy', 'Key',\n",
262
+ "            'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness',\n",
263
+ "            'Liveness', 'Valence', 'Tempo', 'Time Signature', 'Album Genres',\n",
264
+ "            'Label', 'Copyrights'\n",
265
+ "        ]\n",
266
+ "    )\n",
267
+ "\n",
268
+ "    try:\n",
269
+ "        logger.info(f\"Querying Spotify API for: {query}\")\n",
270
+ "        data = spotify_client.search(q=query,limit=max_limit,offset=0,type='track',market='IN')\n",
271
+ "\n",
272
+ "        # Get tracks\n",
273
+ "        tracks = data.get(\"tracks\")\n",
274
+ "        items = tracks.get(\"items\")\n",
275
+ "        total = tracks.get(\"total\")\n",
276
+ "\n",
277
+ "        logger.info(f\"Total tracks: {total}\")\n",
278
+ "        df = process_data(items, df, 0)\n",
279
+ "\n",
280
+ "        # Get remaining tracks\n",
281
+ "        for offset in range(max_offset, min(total or 0, SEARCH_OFFSET_CAP), max_limit):\n",
282
+ "            data = spotify_client.search(q=query,limit=max_limit,offset=offset,type='track',market='IN')\n",
283
+ "\n",
284
+ "            tracks = data.get(\"tracks\")\n",
285
+ "            items = tracks.get(\"items\")\n",
286
+ "            df = process_data(items, df, offset)\n",
287
+ "    except Exception as e:\n",
288
+ "        # Keep going: rows gathered before the failure are still written below.\n",
289
+ "        logger.error(f\"Error: {e}\")\n",
290
+ "\n",
291
+ "    # Write outside the try block so a failure on a late page keeps the earlier pages.\n",
292
+ "    if df.empty:\n",
293
+ "        # Nothing collected: do not overwrite an existing CSV with a header-only file.\n",
294
+ "        logger.warning(f\"No tracks collected for: {query}\")\n",
295
+ "        continue\n",
296
+ "\n",
297
+ "    try:\n",
298
+ "        df.to_csv(f\"data/{query}.csv\", index=False)\n",
299
+ "    except Exception as e:\n",
300
+ "        logger.error(f\"Error: {e}\")"
287
+ ]
288
+ }
289
+ ],
290
+ "metadata": {
291
+ "kernelspec": {
292
+ "display_name": "venv",
293
+ "language": "python",
294
+ "name": "python3"
295
+ },
296
+ "language_info": {
297
+ "codemirror_mode": {
298
+ "name": "ipython",
299
+ "version": 3
300
+ },
301
+ "file_extension": ".py",
302
+ "mimetype": "text/x-python",
303
+ "name": "python",
304
+ "nbconvert_exporter": "python",
305
+ "pygments_lexer": "ipython3",
306
+ "version": "3.13.2"
307
+ }
308
+ },
309
+ "nbformat": 4,
310
+ "nbformat_minor": 2
311
+ }