| const axios = require('axios'); |
| const cheerio = require('cheerio'); |
| const fs = require('fs'); |
| const path = require('path'); |
|
|
/**
 * Plain data holder for one artist.
 *
 * Fields and their defaults:
 *  - name        {string}  artist name ('' by default)
 *  - url         {string}  artist page URL ('' by default)
 *  - biography   {string}  biography text ('' by default)
 *  - discography {Array}   records belonging to the artist ([] by default)
 */
class Artist {
  /**
   * @param {Object} [data] - Optional initial values. Every own enumerable
   *   property of `data` is copied onto the instance, overriding the defaults
   *   (properties other than the four documented fields are copied as well).
   */
  constructor(data) {
    this.name = '';
    this.url = '';
    this.biography = '';
    this.discography = [];

    if (data) {
      // Object.keys() yields own enumerable string keys only, so inherited
      // properties are skipped without calling data.hasOwnProperty(), which
      // does not exist on null-prototype objects and may be shadowed.
      for (const prop of Object.keys(data)) {
        this[prop] = data[prop];
      }
    }
  }
}
|
|
/**
 * Plain data holder for one record (album).
 *
 * Fields and their defaults:
 *  - artist, title, label, url, year, genre, text  {string}  '' by default
 *  - rating                                        {number}  0 by default
 *  - styles, tracks                                {Array}   [] by default
 */
class Record {
  /**
   * @param {Object} [data] - Optional initial values. Every own enumerable
   *   property of `data` is copied onto the instance, overriding the defaults.
   */
  constructor(data) {
    this.artist = '';
    this.title = '';
    this.label = '';
    this.url = '';
    this.rating = 0;
    this.year = '';
    this.genre = '';
    this.text = '';
    this.styles = [];
    this.tracks = [];

    if (data) {
      // Object.keys() yields own enumerable string keys only, so inherited
      // properties are skipped without calling data.hasOwnProperty(), which
      // does not exist on null-prototype objects and may be shadowed.
      for (const prop of Object.keys(data)) {
        this[prop] = data[prop];
      }
    }
  }
}
|
|
/**
 * Scrapes artist biographies, discographies and record reviews, starting
 * from one hard-coded artist page (see run()) and continuing with the
 * artists listed as related to it, then writes the results as text files
 * into a timestamped directory next to this script.
 *
 * NOTE(review): the CSS selectors used below presumably match the markup of
 * the site the start URL points at (allmusic.com) — confirm against live pages.
 */
class MusicScraper {
  /**
   * Initialises the scraper state and installs a SIGINT handler that writes
   * whatever has been collected so far before exiting.
   */
  constructor() {
    // Maximum number of artists to scrape in one run (starting artist included).
    this.artistsToGet = 5;
    // Names of artists for which scraping has been started (used for de-duplication).
    this.artistsFetched = [];
    // Queue of related artists discovered from the starting artist.
    this.relatedArtists = [];
    // Fully scraped artists, in scrape order.
    this.artists = [];

    this.timestampedDirectoryName = this.getTimestampedDirectoryName();

    process.on('SIGINT', () => {
      console.log('Exiting the script...');
      this.done();
      process.exit(0);
    });
  }

  /**
   * Builds the output directory name from the current time.
   *
   * @returns {string} ISO timestamp with ':' replaced by '-' and the
   *   fractional seconds/zone suffix removed, followed by `_data`
   *   (for example `2024-01-31T12-30-00_data`).
   */
  getTimestampedDirectoryName() {
    const timestamp = new Date().toISOString().replace(/:/g, '-').replace(/\..+/, '');
    return `${timestamp}_data`;
  }

  /**
   * Downloads a URL with axios.
   *
   * @param {string} url - Address to fetch.
   * @returns {Promise<*|false>} The response body, or `false` when `url` is falsy.
   * @throws {Error} When the request fails; the message wraps the original
   *   error's message and the original error is attached as `cause`.
   */
  async fetchData(url) {
    if (!url) {
      console.log(`Can't fetch URL since it's null`);
      return false;
    }
    try {
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      throw new Error(`Error fetching data: ${error.message}`, { cause: error });
    }
  }

  /**
   * Appends a sub-page name to a base URL.
   *
   * @param {string} baseUrl - Base page URL.
   * @param {string} type - Sub-page name (for example 'related', 'biography').
   * @returns {string} `${baseUrl}/${type}`.
   */
  generateUrl(baseUrl, type) {
    return `${baseUrl}/${type}`;
  }

  /**
   * Normalises scraped text: collapses every whitespace run to one space,
   * trims the ends, then removes all backslashes and double quotes.
   *
   * @param {string} text - Raw text.
   * @returns {string} Cleaned text.
   */
  cleanText(text) {
    const collapsed = text.replace(/\s+/g, ' ').trim();
    return collapsed.replace(/[\\"]/g, '');
  }

  /**
   * Fetches and parses the "related" page of an artist.
   *
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<Artist[]|undefined>} Related artists, or `undefined`
   *   when fetching or parsing failed (the error is logged).
   */
  async getRelatedArtists(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'related'));
      return this.scrapeRelatedArtistData(html);
    } catch (error) {
      console.error(error);
      return undefined;
    }
  }

  /**
   * Fetches the "biography" page of an artist and extracts its text.
   *
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<string|false>} Cleaned biography text, or `false` on
   *   failure (the error is logged).
   */
  async getArtistBiography(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'biography'));
      const $ = cheerio.load(html);
      return this.cleanText($('section.biography .text').text());
    } catch (error) {
      console.error(`Error scraping ${artist.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Fetches the "discography" page of an artist and builds one Record per
   * table row, filled with year, title, label and the link found in the
   * title cell.
   *
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<Record[]|false>} Listed records, or `false` on failure
   *   (the error is logged).
   */
  async getArtistDiscography(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'discography'));
      const $ = cheerio.load(html);
      const records = [];
      $('.discography table tbody tr').each((index, row) => {
        const cells = $(row);
        records.push(
          new Record({
            year: cells.find('.year').text().trim(),
            title: cells.find('.title').text().trim(),
            label: cells.find('.label').text().trim(),
            url: cells.find('.title a').attr('href'),
          })
        );
      });
      return records;
    } catch (error) {
      console.error(`Error scraping ${artist.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Fetches a record's own page and returns a new Record that carries the
   * listing data (title, year, label, url) plus rating, genre, review text
   * and track titles scraped from the page.
   *
   * @param {Record} record - Record from the discography listing.
   * @returns {Promise<Record|false>} The detailed record, or `false` when the
   *   record has no URL or when fetching/parsing failed (the error is logged).
   */
  async getSingleRecordData(record) {
    try {
      console.log(`Getting record ${record.title}`);
      const html = await this.fetchData(record.url);
      if (!html) {
        // fetchData() already logged why nothing was fetched.
        return false;
      }
      const $ = cheerio.load(html);
      const recordData = {
        title: record.title,
        year: record.year,
        label: record.label,
        url: record.url,
        rating: this.cleanText($('.allmusic-rating').text()),
        genre: this.cleanText($('.basic-info .genre div a').text()),
        text: this.cleanText($('section.review .text').text()),
        tracks: [],
      };
      $('.track-listing table tbody tr').each((index, row) => {
        recordData.tracks.push($(row).find('.title a').text().trim());
      });
      return new Record(recordData);
    } catch (error) {
      // Report the URL of the record being scraped (the parameter), not a
      // property of the Record class.
      console.error(`Error scraping ${record.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Parses a "related" page and extracts the listed artists.
   *
   * @param {string} html - Page markup.
   * @returns {Artist[]} One Artist per list item that has both a non-empty
   *   name and a link.
   */
  scrapeRelatedArtistData(html) {
    const $ = cheerio.load(html);
    const relatedArtists = [];

    $('.related.similars ul li').each((index, element) => {
      const artist = new Artist({
        name: $(element).text().trim(),
        url: $(element).find('a').attr('href'),
      });

      if (artist.name && artist.url) {
        relatedArtists.push(artist);
      }
    });
    return relatedArtists;
  }

  /**
   * Tells whether the configured number of artists has already been started.
   *
   * @returns {boolean} `true` once `artistsFetched` holds `artistsToGet` names.
   */
  hasReachedArtistLimit() {
    return this.artistsFetched.length >= this.artistsToGet;
  }

  /**
   * Entry point: scrapes the starting artist, then its related artists until
   * the artist limit is reached, and finally calls done(), which writes the
   * results and exits the process.
   *
   * @returns {Promise<void>}
   */
  async run() {
    console.log(' ');
    console.log(' ');
    console.log(' ');
    console.log(' ');
    console.log('------------------------');

    const initialArtist = new Artist({
      name: 'The Abyssinians',
      url: 'https://www.allmusic.com/artist/the-abyssinians-mn0000588943',
    });
    const initialData = await this.getArtistData(initialArtist);
    if (initialData) {
      this.artists.push(initialData);
    }

    const relatedArtists = await this.getRelatedArtists(initialArtist);
    if (relatedArtists) {
      this.relatedArtists = this.relatedArtists.concat(relatedArtists);
    }

    for (const artist of this.relatedArtists) {
      if (this.hasReachedArtistLimit()) {
        console.log(`Reached the limit of artists to fetch.`);
        break;
      }
      const data = await this.getArtistData(artist);
      // A falsy result means the artist was skipped (invalid or duplicate);
      // move on to the next one instead of ending the whole run.
      if (data) {
        this.artists.push(data);
      }
    }

    this.done();
  }

  /**
   * Scrapes biography and discography (with per-record details) for one
   * artist and stores them on the given artist object.
   *
   * @param {Artist} artist - Artist to scrape; must have `name` and `url`.
   *   The object is mutated: `biography` and `discography` are filled in.
   * @returns {Promise<Artist|false>} The same artist object, or `false` when
   *   the artist is invalid, was already fetched, or the artist limit has
   *   been reached.
   */
  async getArtistData(artist) {
    if (!artist || !artist.name || !artist.url) {
      return false;
    }

    if (this.artistsFetched.includes(artist.name)) {
      console.log(`Artist already fetched.`);
      return false;
    }

    if (this.hasReachedArtistLimit()) {
      console.log(`Reached the limit of artists to fetch.`);
      return false;
    }

    this.artistsFetched.push(artist.name);
    const biography = await this.getArtistBiography(artist);
    const discography = await this.getArtistDiscography(artist);

    if (biography) {
      artist.biography = biography;
    }
    if (discography) {
      const records = [];
      for (const record of discography) {
        console.log(record);
        const fullRecord = await this.getSingleRecordData(record);
        // Keep the listing entry when the detail page could not be scraped,
        // so the discography never contains `false`.
        records.push(fullRecord || record);
      }
      artist.discography = records;
    }
    return artist;
  }

  /**
   * Logs the collected artists, writes them to disk and exits the process
   * with status 0.
   */
  done() {
    console.log(this.artists);
    // Nothing may have been collected yet (e.g. early SIGINT).
    if (this.artists.length > 0) {
      console.log(this.artists[0].discography);
    }
    this.writeToDisk();
    process.exit(0);
  }

  /**
   * Removes every character except ASCII letters, digits, spaces and the
   * en dash (–); used to build safe file names.
   *
   * @param {string} string - Input text.
   * @returns {string} Text without the removed characters.
   */
  removeSpecialCharacters(string) {
    return string.replace(/[^a-zA-Z0-9– ]/g, '');
  }

  /**
   * Builds the content of an artist's biography file.
   *
   * @param {Artist} artist - Artist providing `name` and `biography`.
   * @returns {string} A heading line followed by the biography text.
   */
  formatBiography(artist) {
    return `${artist.name} biography\n${artist.biography}`;
  }

  /**
   * Builds the content of a record's review file.
   *
   * @param {Artist} artist - Artist providing `name`.
   * @param {Record} record - Record providing title, year, label, genre,
   *   rating, tracks and text. The rating line and the review paragraph are
   *   only included when the respective value is truthy.
   * @returns {string} The formatted review text.
   */
  formatReview(artist, record) {
    let reviewText = `Review of ${record.title} by ${artist.name}\n`;
    reviewText += `Artist: ${artist.name}\n`;
    reviewText += `Album title: ${record.title}\n`;
    reviewText += `Release year: ${record.year}\n`;
    reviewText += `Label: ${record.label}\n`;
    reviewText += `Genre: ${record.genre}\n`;
    if (record.rating) {
      reviewText += `Rating: ${record.rating} out of 10\n`;
    }
    reviewText += `\nTrack listing:\n`;
    for (const track of record.tracks) {
      reviewText += `${track}\n`;
    }
    if (record.text) {
      reviewText += `\nReview: ${record.text}\n`;
    }
    return reviewText;
  }

  /**
   * Writes one biography file per artist and one review file per record into
   * `<script directory>/<timestampedDirectoryName>`.
   *
   * The directory is created synchronously at the same path the files are
   * written to, so it is guaranteed to exist before the first write and
   * before done() exits the process. If it cannot be created the error is
   * logged and nothing is written.
   */
  writeToDisk() {
    const outputDir = path.join(__dirname, this.timestampedDirectoryName);
    try {
      fs.mkdirSync(outputDir, { recursive: true });
      console.log('Folder created successfully');
    } catch (err) {
      console.error('Error creating folder:', err);
      return;
    }

    for (const artist of this.artists) {
      if (!artist) {
        continue;
      }
      // Sanitise the artist name the same way review file names are
      // sanitised, so names containing e.g. '/' still yield a valid path.
      const bioFileName = `${this.removeSpecialCharacters(artist.name)} biography.txt`;
      fs.writeFileSync(path.join(outputDir, bioFileName), this.formatBiography(artist));
      console.log(`Data written to ${bioFileName}`);

      for (const record of artist.discography) {
        if (!record) {
          continue;
        }
        const artistAndTitle = this.removeSpecialCharacters(`${artist.name} – ${record.title}`);
        const reviewFileName = `${artistAndTitle} review.txt`;
        fs.writeFileSync(path.join(outputDir, reviewFileName), this.formatReview(artist, record));
        console.log(`Data written to ${reviewFileName}`);
      }
    }
  }
}
|
|
// Script entry point: create the scraper and start the run. run() returns a
// promise, so attach a rejection handler; otherwise an unexpected failure
// (for example a file write error) would surface as an unhandled rejection.
const scraper = new MusicScraper();
scraper.run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
|