| const axios = require('axios'); |
| const cheerio = require('cheerio'); |
| const fs = require('fs'); |
| const path = require('path'); |
|
|
/**
 * Scraper for AllMusic artist pages.
 *
 * Starting from one artist's "/related" page it collects the artists listed
 * under `.related.similars`, follows their "/related" pages (bounded by
 * `relatedNumberToGet`), downloads each collected artist's biography, and
 * writes one JSON file per artist into a timestamped directory.
 *
 * Constructing an instance registers a SIGINT handler that flushes the data
 * collected so far and terminates the process.
 */
class MusicScraper {
  /**
   * @param {object} [options]
   * @param {string} [options.startUrl] - "/related" page the crawl starts from.
   * @param {number} [options.relatedNumberToGet=50] - Budget checked by
   *   scrapeUrls(); see that method for the exact counting rule.
   * @param {string} [options.outputDir=__dirname] - Directory under which the
   *   output files and the timestamped directory are created.
   */
  constructor({
    startUrl = 'https://www.allmusic.com/artist/johnny-osbourne-mn0000248916/related',
    relatedNumberToGet = 50,
    outputDir = __dirname,
  } = {}) {
    this.startUrl = startUrl;
    this.outputDir = outputDir;
    this.relatedFetched = 0;
    this.relatedNumberToGet = relatedNumberToGet;
    this.urls = [];
    this.data = [];
    // Kept for compatibility; no method of this class reads or writes it.
    this.fetchedUrls = [];

    // Computed once so every file of a run lands in the same directory.
    this.timestampedDirectoryName = this.getTimestampedDirectoryName();

    process.on('SIGINT', () => {
      // done() terminates the process, so the message must be printed first.
      console.log('Exiting the script...');
      this.done();
    });
  }

  /**
   * Downloads a URL and returns the response body.
   *
   * @param {string} url - Address to fetch.
   * @returns {Promise<*>} The `data` field of the axios response.
   * @throws {Error} Wrapping the underlying failure (kept as `cause`).
   */
  async fetchData(url) {
    console.log(`Fetching ${url}`);
    try {
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      throw new Error(`Error fetching data: ${error.message}`, { cause: error });
    }
  }

  /**
   * Extracts the related-artist entries from a "/related" page and appends
   * them to `this.data`. Entries without a link or without text are skipped.
   *
   * @param {string} html - Markup of a "/related" page.
   */
  scrapeRelatedData(html) {
    const $ = cheerio.load(html);

    $('.related.similars ul li').each((_, element) => {
      const entry = $(element);
      const link = entry.find('a').attr('href');
      const text = entry.text().trim();

      // Check before deriving the sub-page URLs so no "undefined/..." strings are built.
      if (!link || !text) {
        return;
      }

      this.data.push({
        link,
        text,
        relatedLink: `${link}/related`,
        biographyLink: `${link}/biography`,
        discographyLink: `${link}/discography`,
        discographies: [],
      });
    });
  }

  /**
   * Fetches the pages in `this.urls` and scrapes each one for related artists.
   *
   * `relatedFetched` is incremented before it is compared with
   * `relatedNumberToGet`, so at most `relatedNumberToGet - 1` pages are
   * fetched over the lifetime of the instance. Once the budget is exhausted
   * the loop stops instead of walking the remaining URLs. A failure on one
   * URL is logged and does not stop the others.
   */
  async scrapeUrls() {
    for (const url of this.urls) {
      this.relatedFetched++;
      if (this.relatedFetched >= this.relatedNumberToGet) {
        break;
      }

      try {
        const html = await this.fetchData(url);
        this.scrapeRelatedData(html);
      } catch (error) {
        console.error(`Error scraping ${url}: ${error.message}`);
      }
    }
  }

  /**
   * Collapses every whitespace run to a single space, trims the result and
   * then removes all backslashes and double quotes.
   *
   * @param {string} text - Raw text.
   * @returns {string} The cleaned text.
   */
  cleanText(text) {
    const collapsed = text.replace(/\s+/g, ' ').trim();
    return collapsed.replace(/[\\"]/g, '');
  }

  /**
   * Downloads the biography page of every collected artist and stores the
   * cleaned text on the item as `biography`. Failures are logged per artist
   * and leave that item without a `biography` property.
   */
  async scrapeBiographies() {
    for (const item of this.data) {
      console.log(`Scraping biography for ${item.text}`);
      try {
        const html = await this.fetchData(item.biographyLink);
        const $ = cheerio.load(html);
        item.biography = this.cleanText($('section.biography .text').text());
      } catch (error) {
        console.error(`Error scraping ${item.biographyLink}: ${error.message}`);
      }
    }
  }

  /**
   * Downloads the discography page of every collected artist and appends one
   * `{ year }` record per table row to the item's `discographies` array.
   * Failures are logged per artist. Not invoked by run().
   */
  async scrapeDiscographies() {
    for (const item of this.data) {
      console.log(`Scraping discographies for ${item.text}`);
      try {
        const html = await this.fetchData(item.discographyLink);
        const $ = cheerio.load(html);

        // The row element has to come from the callback arguments: inside an
        // arrow function `this` is the scraper, not the current row.
        $('.discography').find('tr').each((_, row) => {
          // .text() on an empty selection is '', which covers rows without a year cell.
          item.discographies.push({ year: $(row).find('.year').text() });
        });
      } catch (error) {
        console.error(`Error scraping ${item.discographyLink}: ${error.message}`);
      }
    }
  }

  /**
   * Runs the crawl: start page, related pages, biographies, then done().
   * Any error that escapes the individual steps is logged; in that case
   * done() is not called.
   */
  async run() {
    try {
      const initialHtml = await this.fetchData(this.startUrl);
      this.scrapeRelatedData(initialHtml);

      // Snapshot the links now; scrapeUrls() appends to this.data while it runs.
      this.urls = this.data.map((item) => item.relatedLink);

      await this.scrapeUrls();
      await this.scrapeBiographies();

      this.done();
    } catch (error) {
      console.error(error);
    }
  }

  /**
   * @returns {string} Current UTC time as `YYYY-MM-DDTHH-MM-SS`: the ISO
   *   string with colons replaced by dashes and the fractional part removed.
   */
  getTimestamp() {
    return new Date().toISOString().replace(/:/g, '-').replace(/\..+/, '');
  }

  /**
   * @returns {string} Directory name of the form `<timestamp>_data`.
   */
  getTimestampedDirectoryName() {
    return `${this.getTimestamp()}_data`;
  }

  /**
   * @returns {string} File name of the form `<timestamp>_data.txt`.
   */
  getTimestampedFileName() {
    return `${this.getTimestamp()}_data.txt`;
  }

  /**
   * Writes all collected data as one pretty-printed JSON document to a
   * timestamped file inside the output directory.
   */
  writeDataToDisk() {
    const jsonFileName = this.getTimestampedFileName();
    const jsonFilePath = path.join(this.outputDir, jsonFileName);

    fs.writeFileSync(jsonFilePath, JSON.stringify(this.data, null, 2));
    console.log(`Data written to ${jsonFileName}`);
  }

  /**
   * Replaces path separators and other characters that are not allowed in
   * file names on common file systems with an underscore.
   *
   * @param {string} name - Arbitrary text, e.g. an artist name.
   * @returns {string} Text usable as a single path segment.
   */
  toSafeFileName(name) {
    return String(name).replace(/[<>:"\/\\|?*]/g, '_');
  }

  /**
   * Writes every collected item as pretty-printed JSON to
   * `<outputDir>/<timestamped directory>/<artist name>.txt`.
   *
   * The directory is created synchronously, at the same path the files are
   * written to, so it exists before the first write.
   *
   * @throws {Error} If the directory or one of the files cannot be written.
   */
  writeBiographyDataToDisk() {
    const directoryPath = path.join(this.outputDir, this.timestampedDirectoryName);
    fs.mkdirSync(directoryPath, { recursive: true });
    console.log('Folder created successfully');

    for (const item of this.data) {
      const jsonFileName = `${this.toSafeFileName(item.text)}.txt`;
      const jsonFilePath = path.join(directoryPath, jsonFileName);
      fs.writeFileSync(jsonFilePath, JSON.stringify(item, null, 2));
      console.log(`Data written to ${jsonFileName}`);
    }
  }

  /**
   * Prints the collected data, writes it to disk and terminates the process.
   * A write failure is logged and reported through a non-zero exit code.
   */
  done() {
    console.log(this.data);
    try {
      this.writeBiographyDataToDisk();
    } catch (error) {
      console.error(`Error writing data to disk: ${error.message}`);
      process.exitCode = 1;
    }
    // With no argument, exit() uses process.exitCode (0 unless set above).
    process.exit();
  }
}
|
|
// Script entry point: build a scraper and start the crawl.
// run() catches and logs its own errors, so its promise is not awaited here.
const scraper = new MusicScraper();
scraper.run();
|
|