Avik Rao committed on
Commit
2193f98
·
1 Parent(s): 5dd4ae5

Add NLP model and tag list

Browse files
Files changed (2) hide show
  1. nlp/nlp.py +60 -0
  2. nlp/tags.json +700 -0
nlp/nlp.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Semantically picks most similar tags in training space of
2
+ generative model """
3
+
4
+ # IMPORTS
5
+ import json
6
+ import torch
7
+ from typing import List
8
+ from transformers import AutoTokenizer, AutoModel
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+
11
+
12
+ # FUNCTIONS
13
+ # create embeddings
14
def get_embeddings(text, token_length, tokenizer, model):
    """Embed *text* as the mean of the model's last hidden layer.

    The text is tokenized to exactly ``token_length`` positions (padded or
    truncated), run through ``model`` as a batch of one, and the last entry
    of ``hidden_states`` is averaged over the token axis.

    Only positions whose attention-mask entry is non-zero contribute to the
    average.  With ``padding='max_length'`` a short input is mostly padding,
    and averaging over those positions as well would let the padding vectors
    dominate the embedding.

    Args:
        text: The string to embed.
        token_length: Number of token positions to pad/truncate to.
        tokenizer: Callable returning an object exposing ``input_ids`` and
            ``attention_mask`` for a single text.
        model: Callable accepting ``(input_ids, attention_mask=...)`` and
            returning an object exposing ``hidden_states``.

    Returns:
        A NumPy array holding the pooled embedding for the single input
        (one row; presumably ``(1, hidden_size)`` for a BERT-style model).
    """
    tokens = tokenizer(text, max_length=token_length,
                       padding='max_length', truncation=True)
    # Add a leading batch axis: the model is called on a batch of one.
    input_ids = torch.tensor(tokens.input_ids).unsqueeze(0)
    attention_mask = torch.tensor(tokens.attention_mask).unsqueeze(0)

    # Inference only: skip building an autograd graph we would throw away.
    with torch.no_grad():
        hidden = model(input_ids,
                       attention_mask=attention_mask).hidden_states[-1]

    # Mask-weighted mean over the token axis so padding does not contribute.
    weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * weights).sum(dim=1)
    # clamp guards against a division by zero if the mask were all zeros.
    counts = weights.sum(dim=1).clamp(min=1)
    return (summed / counts).detach().numpy()
22
+
23
+
24
+ # get doc with highest similarity to query
25
def nearest_doc(doc_list: List[str],
                query: str,
                tokenizer,
                model,
                token_length: int = 50):
    """Return the document in *doc_list* most similar to *query*.

    Every document and the query are embedded with ``get_embeddings`` and
    compared by cosine similarity; the document with the highest score is
    returned.  Ties on score fall back to string comparison of the
    documents (the larger string wins), as a consequence of taking ``max``
    over ``(score, doc)`` pairs.

    Args:
        doc_list: Candidate documents; must not be empty.
        query: Text to match against the candidates.
        tokenizer: Tokenizer forwarded to ``get_embeddings``.
        model: Model forwarded to ``get_embeddings``.
        token_length: Pad/truncate length forwarded to ``get_embeddings``.

    Returns:
        The element of ``doc_list`` with the highest cosine similarity.

    Raises:
        ValueError: If ``doc_list`` is empty.
    """
    if not doc_list:
        raise ValueError("doc_list must contain at least one document")

    # get embeddings for each document
    doc_embeddings = [
        get_embeddings(doc, token_length, tokenizer, model)
        for doc in doc_list
    ]
    # get embeddings for query; tokenizer and model are required by
    # get_embeddings and must be passed through here as well
    query_embedding = get_embeddings(query, token_length, tokenizer, model)
    # get similarity of each document embedding to query embedding
    sims = [
        cosine_similarity(embedding, query_embedding)[0][0]
        for embedding in doc_embeddings
    ]
    return max(zip(sims, doc_list))[1]
39
+
40
+
41
+ # MAIN
42
def get_nearest_tags(user_tags: List[str]):
    """Map user-supplied tags onto the closest tags from ``tags.json``.

    Loads the pretrained ``bert-base-uncased`` tokenizer and model, reads
    the known tag lists from the ``tags.json`` file that sits next to this
    module, and for each user tag picks the most similar known tag of the
    same type via ``nearest_doc``.

    Args:
        user_tags: Exactly three strings, in the order
            ``(genre, mood, instrument)``.

    Returns:
        A 3-tuple ``(genre, mood, instrument)`` of tags taken from
        ``tags.json``.

    Raises:
        ValueError: If ``user_tags`` does not unpack into three items.
    """
    # Local import: keeps this fix self-contained within the function.
    from pathlib import Path

    # separate user tags by type first, so malformed input fails before
    # the (slow) model download/load
    user_genre, user_mood, user_instr = user_tags

    # download pretrained model
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased",
                                      output_hidden_states=True)

    # get tag lists from the json file shipped alongside this module;
    # resolving from __file__ makes this independent of the caller's
    # current working directory
    tags_path = Path(__file__).resolve().parent / "tags.json"
    with open(tags_path, "r", encoding="utf-8") as jf:
        tags = json.load(jf)

    genres, moods, instrs = tags["genre"], tags["mood"], tags["instrument"]

    return (
        nearest_doc(genres, user_genre, tokenizer, model),
        nearest_doc(moods, user_mood, tokenizer, model),
        nearest_doc(instrs, user_instr, tokenizer, model)
    )
nlp/tags.json ADDED
@@ -0,0 +1,700 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "genre": [
3
+ "countryrock",
4
+ "dub",
5
+ "classicalmusic",
6
+ "indierock",
7
+ "dubelectro",
8
+ "technopop",
9
+ "rocksteady",
10
+ "trancetechno",
11
+ "easylistening",
12
+ "trancemelodique",
13
+ "eletronica",
14
+ "artrock",
15
+ "ambiant",
16
+ "edm",
17
+ "trip",
18
+ "dreampop",
19
+ "acid",
20
+ "christian",
21
+ "symphonic",
22
+ "electrojazz",
23
+ "chansonatexte",
24
+ "downbeat",
25
+ "electronicmusic",
26
+ "dancemusic",
27
+ "cabaret",
28
+ "electrorock",
29
+ "dubstep",
30
+ "instrumentalpop",
31
+ "electronic",
32
+ "death",
33
+ "instrumentalhiphop",
34
+ "dancehouse",
35
+ "noise",
36
+ "housemusic",
37
+ "smoothjazz",
38
+ "experimentalrock",
39
+ "glitch",
40
+ "groove",
41
+ "postrock",
42
+ "ethnicrock",
43
+ "funk",
44
+ "experimental",
45
+ "progressiverock",
46
+ "darkwave",
47
+ "alternative",
48
+ "soundtrack",
49
+ "metal",
50
+ "techno",
51
+ "alternativerock",
52
+ "goth",
53
+ "gospel",
54
+ "blackmetal",
55
+ "trap",
56
+ "rnb",
57
+ "heavymetal",
58
+ "poppunk",
59
+ "house",
60
+ "hardstyle",
61
+ "electronical",
62
+ "downtempo",
63
+ "hardcore",
64
+ "emocore",
65
+ "newage",
66
+ "waltz",
67
+ "gipsy",
68
+ "jazzrock",
69
+ "trance",
70
+ "deephouse",
71
+ "score",
72
+ "industrial",
73
+ "indiepop",
74
+ "dancepop",
75
+ "lounge",
76
+ "electronika",
77
+ "elektro",
78
+ "powerpop",
79
+ "glam",
80
+ "rockpop",
81
+ "samba",
82
+ "rhythmandblues",
83
+ "freejazz",
84
+ "popelectro",
85
+ "electropunk",
86
+ "loungemusic",
87
+ "rockalternatif",
88
+ "krautrock",
89
+ "jungle",
90
+ "atmospheric",
91
+ "worldmusik",
92
+ "hiphop",
93
+ "ambient",
94
+ "rockfrancais",
95
+ "dance",
96
+ "latino",
97
+ "worldfusion",
98
+ "hardtrance",
99
+ "synthpop",
100
+ "fusion",
101
+ "newwave",
102
+ "punk",
103
+ "8bit",
104
+ "computermusic",
105
+ "middleeastern",
106
+ "punkrock",
107
+ "christianrap",
108
+ "soundtracks",
109
+ "acidjazz",
110
+ "darkambient",
111
+ "disco",
112
+ "triphop",
113
+ "chansonfrancaise",
114
+ "90s",
115
+ "tribal",
116
+ "choral",
117
+ "reggae",
118
+ "asian",
119
+ "electropop",
120
+ "jazzfusion",
121
+ "spacerock",
122
+ "chillout",
123
+ "club",
124
+ "swing",
125
+ "country",
126
+ "garage",
127
+ "psytrance",
128
+ "techhouse",
129
+ "electronik",
130
+ "lofi",
131
+ "rave",
132
+ "minimal",
133
+ "ragga",
134
+ "indietronica",
135
+ "hard",
136
+ "technoindustrial",
137
+ "bebop",
138
+ "postpunk",
139
+ "rapfrancais",
140
+ "tango",
141
+ "technohouse",
142
+ "rocknroll",
143
+ "rap",
144
+ "poprock",
145
+ "80s",
146
+ "progressivehouse",
147
+ "popfunk",
148
+ "hardrock",
149
+ "metalcore",
150
+ "rapcore",
151
+ "hiphopinstrumental",
152
+ "reggaeton",
153
+ "minimalism",
154
+ "houseelectro",
155
+ "gothic",
156
+ "thrashmetal",
157
+ "gangstarap",
158
+ "celtic",
159
+ "souljazz",
160
+ "classicrock",
161
+ "droneambient",
162
+ "classical",
163
+ "rockandroll",
164
+ "ska",
165
+ "pop",
166
+ "70s",
167
+ "acidhouse",
168
+ "minimaltechno",
169
+ "southernrock",
170
+ "breakcore",
171
+ "indie",
172
+ "island",
173
+ "deutschrock",
174
+ "raps",
175
+ "neoclassical",
176
+ "electroambient",
177
+ "nujazz",
178
+ "drone",
179
+ "ebm",
180
+ "medieval",
181
+ "improvisation",
182
+ "chilloutlounge",
183
+ "ambientmusic",
184
+ "electrohouse",
185
+ "flamenco",
186
+ "prog",
187
+ "ambiente",
188
+ "dancehall",
189
+ "grindcore",
190
+ "tecno",
191
+ "ethno",
192
+ "variete",
193
+ "deathmetal",
194
+ "bossanova",
195
+ "african",
196
+ "folkpop",
197
+ "acidrock",
198
+ "darkelectro",
199
+ "popfolk",
200
+ "salsa",
201
+ "drumnbass",
202
+ "indierockpop",
203
+ "emo",
204
+ "orchestral",
205
+ "synthwave",
206
+ "rock",
207
+ "electrodance",
208
+ "gangsta",
209
+ "chanson",
210
+ "abstracthiphop",
211
+ "electronicambient",
212
+ "folk",
213
+ "psychedelic",
214
+ "contemporary",
215
+ "avantgarde",
216
+ "britpop",
217
+ "trancemusic",
218
+ "jazz",
219
+ "numetal",
220
+ "singersongwriter",
221
+ "powermetal",
222
+ "bluesrock",
223
+ "ragtime",
224
+ "progressive",
225
+ "soundscapes",
226
+ "jrock",
227
+ "60s",
228
+ "oriental",
229
+ "breakbeat",
230
+ "soul",
231
+ "americana",
232
+ "chiptune",
233
+ "stoner",
234
+ "grunge",
235
+ "tekno",
236
+ "shoegaze",
237
+ "eurodance",
238
+ "progressivemetal",
239
+ "doom",
240
+ "baroque",
241
+ "choir",
242
+ "jazzfunk",
243
+ "hardtek",
244
+ "latin",
245
+ "instrumentalrock",
246
+ "opera",
247
+ "guitarrock",
248
+ "world",
249
+ "manouche",
250
+ "blues",
251
+ "idm",
252
+ "guitarras",
253
+ "acousticrock",
254
+ "march"
255
+ ],
256
+ "mood": [
257
+ "crime",
258
+ "peaceful",
259
+ "dirty",
260
+ "light",
261
+ "determined",
262
+ "relaxed",
263
+ "motivational",
264
+ "filmmusic",
265
+ "fantasy",
266
+ "france",
267
+ "yoga",
268
+ "funny",
269
+ "travel",
270
+ "confused",
271
+ "holiday",
272
+ "weird",
273
+ "laidback",
274
+ "kids",
275
+ "documentary",
276
+ "fast",
277
+ "technology",
278
+ "fun",
279
+ "ocean",
280
+ "powerful",
281
+ "cold",
282
+ "sex",
283
+ "brazil",
284
+ "mystery",
285
+ "communication",
286
+ "cinema",
287
+ "youtube",
288
+ "optimistic",
289
+ "christmas",
290
+ "drunk",
291
+ "uplifting",
292
+ "chilled",
293
+ "playful",
294
+ "trailermusic",
295
+ "contemplative",
296
+ "oldschool",
297
+ "video",
298
+ "drama",
299
+ "water",
300
+ "aggressive",
301
+ "dream",
302
+ "silentfilm",
303
+ "videogame",
304
+ "trippy",
305
+ "commercial",
306
+ "epic",
307
+ "sport",
308
+ "slow",
309
+ "relaxation",
310
+ "workout",
311
+ "rain",
312
+ "night",
313
+ "darkness",
314
+ "society",
315
+ "sadness",
316
+ "heavy",
317
+ "india",
318
+ "excited",
319
+ "melancholy",
320
+ "western",
321
+ "battle",
322
+ "zen",
323
+ "productive",
324
+ "filmscore",
325
+ "loud",
326
+ "mexico",
327
+ "curious",
328
+ "happy",
329
+ "cheerful",
330
+ "strong",
331
+ "argentina",
332
+ "suspense",
333
+ "motivate",
334
+ "wedding",
335
+ "war",
336
+ "sleepy",
337
+ "gloomy",
338
+ "entertainment",
339
+ "dynamic",
340
+ "busy",
341
+ "future",
342
+ "child",
343
+ "nervous",
344
+ "disappointed",
345
+ "colombia",
346
+ "motivation",
347
+ "silly",
348
+ "loved",
349
+ "romantic",
350
+ "health",
351
+ "presentation",
352
+ "lazy",
353
+ "party",
354
+ "xmas",
355
+ "retro",
356
+ "heroic",
357
+ "dreamy",
358
+ "joy",
359
+ "animation",
360
+ "soft",
361
+ "urban",
362
+ "hopeful",
363
+ "hope",
364
+ "corporate",
365
+ "russia",
366
+ "education",
367
+ "bright",
368
+ "chaos",
369
+ "emotional",
370
+ "nostalgic",
371
+ "tv",
372
+ "lovesong",
373
+ "extreme",
374
+ "melodic",
375
+ "space",
376
+ "advertising",
377
+ "positive",
378
+ "trailer",
379
+ "indian",
380
+ "discontent",
381
+ "arabic",
382
+ "mellow",
383
+ "business",
384
+ "mysterious",
385
+ "festive",
386
+ "japan",
387
+ "beach",
388
+ "thriller",
389
+ "inspirational",
390
+ "eclectic",
391
+ "sentimental",
392
+ "tropical",
393
+ "sensible",
394
+ "sad",
395
+ "happiness",
396
+ "joyful",
397
+ "dancefloor",
398
+ "movie",
399
+ "guilty",
400
+ "magic",
401
+ "impressed",
402
+ "scifi",
403
+ "drive",
404
+ "comedy",
405
+ "lovemusic",
406
+ "abstract",
407
+ "sexy",
408
+ "violence",
409
+ "driving",
410
+ "relaxingmusic",
411
+ "bouncy",
412
+ "adult",
413
+ "beautiful",
414
+ "meditative",
415
+ "berlin",
416
+ "life",
417
+ "groovy",
418
+ "thankful",
419
+ "energic",
420
+ "film",
421
+ "calm",
422
+ "catchy",
423
+ "computers",
424
+ "lullaby",
425
+ "success",
426
+ "horror",
427
+ "poesia",
428
+ "ethereal",
429
+ "humor",
430
+ "soulful",
431
+ "love",
432
+ "clubbing",
433
+ "awake",
434
+ "dark",
435
+ "hypnotic",
436
+ "anxious",
437
+ "romance",
438
+ "freedom",
439
+ "tribe",
440
+ "cool",
441
+ "africa",
442
+ "sea",
443
+ "tension",
444
+ "scary",
445
+ "robot",
446
+ "warm",
447
+ "spacey",
448
+ "history",
449
+ "news",
450
+ "dancing",
451
+ "tech",
452
+ "sciencefiction",
453
+ "game",
454
+ "psycho",
455
+ "pensive",
456
+ "inspiring",
457
+ "emotion",
458
+ "soundscape",
459
+ "halloween",
460
+ "traditional",
461
+ "action",
462
+ "adventure",
463
+ "jingle",
464
+ "melancolic",
465
+ "crazy",
466
+ "irish",
467
+ "angry",
468
+ "creative",
469
+ "poetry",
470
+ "energetic",
471
+ "science",
472
+ "children",
473
+ "depressed",
474
+ "religious",
475
+ "vintage",
476
+ "sports",
477
+ "nature",
478
+ "melancholic",
479
+ "spiritual",
480
+ "conscient",
481
+ "strange",
482
+ "ambiental",
483
+ "reflective",
484
+ "cosmic",
485
+ "upbeat",
486
+ "winter",
487
+ "nice",
488
+ "epicmusic",
489
+ "relaxing",
490
+ "spy",
491
+ "deep",
492
+ "social",
493
+ "fashion",
494
+ "energy",
495
+ "folkrock",
496
+ "festif",
497
+ "thoughtful",
498
+ "culture",
499
+ "silence",
500
+ "political",
501
+ "quiet",
502
+ "high",
503
+ "summer",
504
+ "north",
505
+ "planant",
506
+ "surf",
507
+ "ballad",
508
+ "sweet",
509
+ "celebration",
510
+ "dramatic",
511
+ "background"
512
+ ],
513
+ "instrument": [
514
+ "tambourine",
515
+ "shekere",
516
+ "mandolin",
517
+ "synthetiseurs",
518
+ "gobletdrum",
519
+ "kora",
520
+ "violins",
521
+ "pianoforte",
522
+ "musicalsaw",
523
+ "cimbalom",
524
+ "teclados",
525
+ "harpsichord",
526
+ "guiro",
527
+ "organ",
528
+ "chapmanstick",
529
+ "keyboard",
530
+ "acousticbassguitar",
531
+ "mandola",
532
+ "sarrusophone",
533
+ "electricpiano",
534
+ "harp",
535
+ "sousaphone",
536
+ "woodblock",
537
+ "continuum",
538
+ "bombarde",
539
+ "bongo",
540
+ "electricguitar",
541
+ "celesta",
542
+ "koto",
543
+ "metallophone",
544
+ "synth\u00e9",
545
+ "orchestra",
546
+ "classicalguitar",
547
+ "triangle",
548
+ "clavichord",
549
+ "bajo",
550
+ "sampler",
551
+ "slideguitar",
552
+ "drummachine",
553
+ "synthesizers",
554
+ "computer",
555
+ "carillon",
556
+ "percussions",
557
+ "lyre",
558
+ "synthetizer",
559
+ "berimbau",
560
+ "hang",
561
+ "castanets",
562
+ "basso",
563
+ "glassharmonica",
564
+ "didgeridoo",
565
+ "pad",
566
+ "guitars",
567
+ "cymbal",
568
+ "pipeorgan",
569
+ "horn",
570
+ "trombone",
571
+ "drums",
572
+ "sitar",
573
+ "trumpet",
574
+ "drum",
575
+ "kaosspad",
576
+ "washboard",
577
+ "woodwind",
578
+ "bandoneon",
579
+ "chitarra",
580
+ "bells",
581
+ "strings",
582
+ "balalaika",
583
+ "cvp509",
584
+ "guitar",
585
+ "piano",
586
+ "flute",
587
+ "euphonium",
588
+ "clarinet",
589
+ "acousticguitar",
590
+ "accordeon",
591
+ "oboe",
592
+ "guitare",
593
+ "timpani",
594
+ "bagpipes",
595
+ "tubax",
596
+ "bassoon",
597
+ "doublebass",
598
+ "kalimba",
599
+ "bassguitar",
600
+ "batterie",
601
+ "choirs",
602
+ "tabla",
603
+ "ocarina",
604
+ "erhu",
605
+ "bass",
606
+ "shamisen",
607
+ "gimbri",
608
+ "saxophone",
609
+ "bandura",
610
+ "pianosolo",
611
+ "mellotron",
612
+ "piccolo",
613
+ "violons",
614
+ "steelpan",
615
+ "balafon",
616
+ "tambura",
617
+ "sax",
618
+ "bell",
619
+ "synthetiseur",
620
+ "tuba",
621
+ "oud",
622
+ "accordion",
623
+ "panflute",
624
+ "synths",
625
+ "cello",
626
+ "harmonica",
627
+ "ukulele",
628
+ "theremin",
629
+ "singingbowl",
630
+ "batteria",
631
+ "guitareelectrique",
632
+ "bateria",
633
+ "alboka",
634
+ "bodhran",
635
+ "guitares",
636
+ "synthesizer",
637
+ "brass",
638
+ "cajon",
639
+ "cabasa",
640
+ "dunun",
641
+ "clavier",
642
+ "synth",
643
+ "pandeiro",
644
+ "ewi",
645
+ "guitarra",
646
+ "melodica",
647
+ "surdo",
648
+ "pads",
649
+ "cowbell",
650
+ "bongos",
651
+ "jewsharp",
652
+ "voice",
653
+ "handbell",
654
+ "concertina",
655
+ "tamtam",
656
+ "djembe",
657
+ "lute",
658
+ "claviers",
659
+ "lapsteelguitar",
660
+ "banjo",
661
+ "tinwhistle",
662
+ "santur",
663
+ "percussion",
664
+ "vocoder",
665
+ "string",
666
+ "keys",
667
+ "fiddle",
668
+ "violon",
669
+ "harmonium",
670
+ "taiko",
671
+ "vibraphone",
672
+ "xylophone",
673
+ "cuica",
674
+ "mbira",
675
+ "marimba",
676
+ "hammond",
677
+ "rhodes",
678
+ "alphorn",
679
+ "kazoo",
680
+ "glockenspiel",
681
+ "flugelhorn",
682
+ "keyboards",
683
+ "electronicorgan",
684
+ "udu",
685
+ "charango",
686
+ "mellophone",
687
+ "horns",
688
+ "tubularbell",
689
+ "pianos",
690
+ "beats",
691
+ "violin",
692
+ "electricguitars",
693
+ "viola",
694
+ "sarangi",
695
+ "flutes",
696
+ "beat",
697
+ "conga",
698
+ "ney"
699
+ ]
700
+ }