rkwyu
committed on
Commit
·
58293b7
1
Parent(s):
3a25a20
Support everand podcast
Browse files- README.md +21 -7
- run.js +1 -1
- src/App.js +4 -0
- src/const/EverandRegex.js +6 -0
- src/service/EverandDownloader.js +112 -0
README.md
CHANGED
|
@@ -5,7 +5,9 @@
|
|
| 5 |
</a>
|
| 6 |
|
| 7 |
## About ##
|
| 8 |
-
Scribd-dl helps downloading
|
|
|
|
|
|
|
| 9 |
|
| 10 |
## Prerequisites ##
|
| 11 |
To use Scribd-dl, you need to install [Node.js](https://nodejs.org/en/download/). It is recommended that you use the latest LTS version available.
|
|
@@ -39,40 +41,52 @@ rendertime=100
|
|
| 39 |
[DIRECTORY]
|
| 40 |
output=output
|
| 41 |
```
|
| 42 |
-
`rendertime` is the waiting time in millisecond for single page rendering
|
| 43 |
`output` is the output directory for generated .pdf files.
|
| 44 |
|
| 45 |
## Usage (CLI) ##
|
| 46 |
```console
|
| 47 |
Usage: npm start [options] url
|
| 48 |
Options:
|
| 49 |
-
/i image-based: generated by image snapshots taken for pages
|
| 50 |
```
|
| 51 |
|
| 52 |
-
#### Example 1: Download 《The Minds of Billy Milligan》 ####
|
| 53 |
```console
|
| 54 |
npm start "https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes"
|
| 55 |
```
|
| 56 |
|
| 57 |
-
#### Example 2: Download 《The Minds of Billy Milligan》 using `image-based` method ####
|
| 58 |
```console
|
| 59 |
npm start /i "https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes"
|
| 60 |
```
|
| 61 |
|
| 62 |
-
#### Example 3: Download 《Everything You Need To Know About ChatGPT》 ####
|
| 63 |
```console
|
| 64 |
npm start "https://www.slideshare.net/slideshow/everything-you-need-to-know-about-chatgpt-8ba3/266783915"
|
| 65 |
```
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
## Support URL Format ##
|
| 68 |
- https://www.scribd.com/doc/**
|
| 69 |
- https://www.scribd.com/embeds/**
|
| 70 |
- https://www.slideshare.net/**
|
| 71 |
- https://www.slideshare.net/slideshow/**
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
## Development Plan ##
|
| 74 |
|
| 75 |
-
- Support [everand.com](https://www.everand.com/)
|
| 76 |
- Scribd obfuscates the .pdf files, so text copied from the documents may come out garbled. De-obfuscating them is planned for a future release.
|
| 77 |
|
| 78 |
## License ##
|
|
|
|
| 5 |
</a>
|
| 6 |
|
| 7 |
## About ##
|
| 8 |
+
Scribd-dl helps you download:
|
| 9 |
+
- documents on [scribd.com](https://www.scribd.com/) and [slideshare.net](https://www.slideshare.net/) without membership / sign-in
|
| 10 |
+
- podcast audio on [everand.com](https://www.everand.com/podcasts)
|
| 11 |
|
| 12 |
## Prerequisites ##
|
| 13 |
To use Scribd-dl, you need to install [Node.js](https://nodejs.org/en/download/). It is recommended that you use the latest LTS version available.
|
|
|
|
| 41 |
[DIRECTORY]
|
| 42 |
output=output
|
| 43 |
```
|
| 44 |
+
`rendertime` is the waiting time in milliseconds for rendering a single page on [scribd.com](https://www.scribd.com/); it only applies to `default` mode.
|
| 45 |
`output` is the output directory for generated .pdf files.
|
| 46 |
|
| 47 |
## Usage (CLI) ##
|
| 48 |
```console
|
| 49 |
Usage: npm start [options] url
|
| 50 |
Options:
|
| 51 |
+
/i image-based: generated by image snapshots taken for pages on scribd.com
|
| 52 |
```
|
| 53 |
|
| 54 |
+
#### Example 1: Download 《The Minds of Billy Milligan》 on scribd.com ####
|
| 55 |
```console
|
| 56 |
npm start "https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes"
|
| 57 |
```
|
| 58 |
|
| 59 |
+
#### Example 2: Download 《The Minds of Billy Milligan》 using `image-based` method on scribd.com ####
|
| 60 |
```console
|
| 61 |
npm start /i "https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes"
|
| 62 |
```
|
| 63 |
|
| 64 |
+
#### Example 3: Download 《Everything You Need To Know About ChatGPT》 on slideshare.net ####
|
| 65 |
```console
|
| 66 |
npm start "https://www.slideshare.net/slideshow/everything-you-need-to-know-about-chatgpt-8ba3/266783915"
|
| 67 |
```
|
| 68 |
|
| 69 |
+
#### Example 4: Download all 《TED Talks Daily》 episodes on everand.com ####
|
| 70 |
+
```console
|
| 71 |
+
npm start "https://www.everand.com/podcast-show/414106971/TED-Talks-Daily"
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
#### Example 5: Download 《Sunday Pick: How to care for the people who take care of us (w/ Ai-jen Poo)》 on everand.com ####
|
| 75 |
+
```console
|
| 76 |
+
npm start "https://www.everand.com/listen/podcast/731670963"
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
## Support URL Format ##
|
| 80 |
- https://www.scribd.com/doc/**
|
| 81 |
- https://www.scribd.com/embeds/**
|
| 82 |
- https://www.slideshare.net/**
|
| 83 |
- https://www.slideshare.net/slideshow/**
|
| 84 |
+
- https://www.everand.com/podcast-show/**
|
| 85 |
+
- https://www.everand.com/podcast/**
|
| 86 |
+
- https://www.everand.com/listen/podcast/**
|
| 87 |
|
| 88 |
## Development Plan ##
|
| 89 |
|
|
|
|
| 90 |
- Scribd obfuscates the .pdf files, so text copied from the documents may come out garbled. De-obfuscating them is planned for a future release.
|
| 91 |
|
| 92 |
## License ##
|
run.js
CHANGED
|
@@ -18,6 +18,6 @@ if (process.argv.length >= 3) {
|
|
| 18 |
console.error(`
|
| 19 |
Usage: npm start [options] url
|
| 20 |
Options:
|
| 21 |
-
/i image-based: generated by image snapshots taken for pages
|
| 22 |
`)
|
| 23 |
}
|
|
|
|
| 18 |
console.error(`
|
| 19 |
Usage: npm start [options] url
|
| 20 |
Options:
|
| 21 |
+
/i image-based: generated by image snapshots taken for pages on scribd.com
|
| 22 |
`)
|
| 23 |
}
|
src/App.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
| 1 |
import { scribdDownloader } from "./service/ScribdDownloader.js"
|
| 2 |
import { slideshareDownloader } from "./service/SlideshareDownloader.js"
|
|
|
|
| 3 |
import * as scribdRegex from "./const/ScribdRegex.js"
|
| 4 |
import * as slideshareRegex from "./const/SlideshareRegex.js"
|
|
|
|
| 5 |
|
| 6 |
class App {
|
| 7 |
constructor() {
|
|
@@ -16,6 +18,8 @@ class App {
|
|
| 16 |
await scribdDownloader.execute(url, flag)
|
| 17 |
} else if (url.match(slideshareRegex.DOMAIN)) {
|
| 18 |
await slideshareDownloader.execute(url)
|
|
|
|
|
|
|
| 19 |
} else {
|
| 20 |
throw new Error(`Unsupported URL: ${url}`)
|
| 21 |
}
|
|
|
|
| 1 |
import { scribdDownloader } from "./service/ScribdDownloader.js"
|
| 2 |
import { slideshareDownloader } from "./service/SlideshareDownloader.js"
|
| 3 |
+
import { everandDownloader } from "./service/EverandDownloader.js"
|
| 4 |
import * as scribdRegex from "./const/ScribdRegex.js"
|
| 5 |
import * as slideshareRegex from "./const/SlideshareRegex.js"
|
| 6 |
+
import * as everandRegex from "./const/EverandRegex.js"
|
| 7 |
|
| 8 |
class App {
|
| 9 |
constructor() {
|
|
|
|
| 18 |
await scribdDownloader.execute(url, flag)
|
| 19 |
} else if (url.match(slideshareRegex.DOMAIN)) {
|
| 20 |
await slideshareDownloader.execute(url)
|
| 21 |
+
} else if (url.match(everandRegex.DOMAIN)) {
|
| 22 |
+
await everandDownloader.execute(url)
|
| 23 |
} else {
|
| 24 |
throw new Error(`Unsupported URL: ${url}`)
|
| 25 |
}
|
src/const/EverandRegex.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// URL patterns for everand.com. All patterns are anchored at the start of the
// string only, so trailing path segments or query strings are tolerated.

// Any URL on the www.everand.com host.
const DOMAIN = /^https:\/\/www\.everand\.com/

// Podcast series page: group 1 is the numeric series id, group 2 the slug.
// The slug class is [a-zA-Z0-9_-]; the previous `A-z` range was a typo that
// also admitted the ASCII characters between `Z` and `a` ("[", "\", "]", "^", "`").
const PODCAST_SERIES = /^https:\/\/www\.everand\.com\/podcast-show\/([0-9]+)\/([a-zA-Z0-9_-]+)/

// Podcast episode page: group 1 is the numeric episode id, group 2 the slug.
const PODCAST_EPISODE = /^https:\/\/www\.everand\.com\/podcast\/([0-9]+)\/([a-zA-Z0-9_-]+)/

// Podcast player ("listen") page: group 1 is the numeric episode id.
const PODCAST_LISTEN = /^https:\/\/www\.everand\.com\/listen\/podcast\/([0-9]+)/

export { DOMAIN, PODCAST_SERIES, PODCAST_EPISODE, PODCAST_LISTEN }
|
src/service/EverandDownloader.js
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cliProgress from "cli-progress"
|
| 2 |
+
import { puppeteerSg } from "../utils/request/PuppeteerSg.js";
|
| 3 |
+
import { pdfGenerator } from "../utils/io/PdfGenerator.js";
|
| 4 |
+
import { configLoader } from "../utils/io/ConfigLoader.js";
|
| 5 |
+
import { directoryIo } from "../utils/io/DirectoryIo.js"
|
| 6 |
+
import * as everandRegex from "../const/EverandRegex.js"
|
| 7 |
+
import { Image } from "../object/Image.js"
|
| 8 |
+
import sharp from "sharp";
|
| 9 |
+
import axios from "axios";
|
| 10 |
+
import fs from "fs"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
// Base output directory, loaded from the "DIRECTORY" section / "output" key of the configuration.
const output = configLoader.load("DIRECTORY", "output")
|
| 14 |
+
|
| 15 |
+
/**
 * Downloads podcast audio from everand.com, either a single episode or every
 * episode of a series. Audio files are written to
 * `<output>/<seriesId>/<episodeId>_<title>.mp3`.
 *
 * The constructor implements a singleton: every `new EverandDownloader()`
 * returns the first instance that was created.
 */
class EverandDownloader {
    constructor() {
        if (!EverandDownloader.instance) {
            EverandDownloader.instance = this
        }
        return EverandDownloader.instance
    }

    /**
     * Dispatches an everand.com URL to the matching download routine.
     *
     * @param {string} url - a podcast-show, podcast or listen/podcast URL
     * @returns {Promise<void>}
     * @throws {Error} when the URL matches none of the supported formats
     */
    async execute(url) {
        if (url.match(everandRegex.PODCAST_SERIES)) {
            await this.series(url)
        } else if (url.match(everandRegex.PODCAST_EPISODE)) {
            // episode pages are downloaded through their player ("listen") page
            await this.listen(`https://www.everand.com/listen/podcast/${everandRegex.PODCAST_EPISODE.exec(url)[1]}`)
        } else if (url.match(everandRegex.PODCAST_LISTEN)) {
            await this.listen(url)
        } else {
            throw new Error(`Unsupported URL: ${url}`)
        }
    }

    /**
     * Downloads the audio of one episode from its player page and resolves
     * once the file has been completely written.
     *
     * @param {string} url - a `https://www.everand.com/listen/podcast/<id>` URL
     * @param {boolean} [isEpisode=true] - true for a stand-alone download: a
     *   one-step progress bar is shown and the browser is closed afterwards.
     *   `series()` passes false because it owns the progress bar and browser.
     * @returns {Promise<void>}
     * @throws {Error} when the audio source or the series link cannot be
     *   found on the page, or when the download / file write fails
     */
    async listen(url, isEpisode = true) {
        const episodeId = everandRegex.PODCAST_LISTEN.exec(url)[1]

        // navigate to everand
        const page = await puppeteerSg.getPage(url)
        let bar = null
        try {
            // wait rendering
            await new Promise(resolve => setTimeout(resolve, 1000))

            // get title, audio-url, series-url
            // `Scribd` is a global of the visited page; the callback runs in the browser, not in Node
            const title = await page.evaluate(() => Scribd.current_doc.short_title)
            const audioUrl = await page.evaluate(() => document.querySelector('audio#audioplayer')?.src)
            const seriesUrl = await page.evaluate(() => document.querySelector('a[href^="https://www.everand.com/podcast-show/"]')?.href)
            if (!audioUrl) {
                throw new Error(`Audio source not found: ${url}`)
            }
            const seriesMatch = everandRegex.PODCAST_SERIES.exec(seriesUrl ?? "")
            if (!seriesMatch) {
                throw new Error(`Series link not found: ${url}`)
            }

            // prepare output dir
            const dir = `${output}/${seriesMatch[1]}`
            await directoryIo.create(dir)

            // download audio
            if (isEpisode) {
                bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic)
                bar.start(1, 0)
            }
            // titles may contain characters that are invalid in file names (e.g. "w/ ...")
            const safeTitle = String(title).replace(/[\\/:*?"<>|]/g, "_").trim()
            const path = `${dir}/${episodeId}_${safeTitle}.mp3`
            const resp = await axios.get(audioUrl, { responseType: 'stream' })
            // wait until the file is fully flushed before reporting progress or closing the browser
            await new Promise((resolve, reject) => {
                const writer = fs.createWriteStream(path)
                writer.on("finish", resolve)
                writer.on("error", reject)
                resp.data.on("error", reject)
                resp.data.pipe(writer)
            })
            if (bar) {
                bar.update(1)
            }
        } finally {
            // release the progress bar, page and (for stand-alone downloads) the browser even on failure
            if (bar) {
                bar.stop()
            }
            await page.close()
            if (isEpisode) {
                await puppeteerSg.close()
            }
        }
    }

    /**
     * Downloads every episode of a podcast series by walking all listing
     * pages and delegating each episode to `listen()`.
     *
     * @param {string} url - a `https://www.everand.com/podcast-show/<id>/<slug>` URL
     * @returns {Promise<void>}
     */
    async series(url) {
        // matched part of the URL only, so an existing query string does not break the page links
        const baseUrl = everandRegex.PODCAST_SERIES.exec(url)[0]

        // navigate to everand
        const page = await puppeteerSg.getPage(url)
        const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic)
        try {
            // wait rendering
            await new Promise(resolve => setTimeout(resolve, 1000))

            // get number-of-episodes
            const totalEpisode = await page.evaluate(() => Number.parseInt(document.querySelector('span[data-e2e="podcast-series-header-total-episodes"]').textContent.replace("episodes", "").trim(), 10))

            // get pages; assume a single page when no pagination links are rendered
            const totalPage = await page.evaluate(() => {
                const lastLink = [...document.querySelectorAll('div[data-e2e="pagination"] a[aria-label^="Page"]')].at(-1)
                return lastLink ? Number.parseInt(lastLink.textContent, 10) : 1
            })

            bar.start(totalEpisode, 0)
            let downloaded = 0
            for (let i = 1; i <= totalPage; i++) {
                await page.goto(`${baseUrl}?page=${i}&sort=desc`, { waitUntil: "load" })
                await new Promise(resolve => setTimeout(resolve, 1000))

                const episodes = await page.evaluate(() => [...document.querySelectorAll('div.breakpoint_hide.below a[data-e2e="podcast-episode-player-button"]')].map(x => x.href))
                for (const episode of episodes) {
                    await this.listen(episode, false)
                    // count real downloads instead of assuming a fixed number of episodes per page
                    downloaded += 1
                    bar.update(downloaded)
                }
            }
        } finally {
            bar.stop()
            await page.close()
            await puppeteerSg.close()
        }
    }
}
|
| 111 |
+
|
| 112 |
+
// Shared EverandDownloader instance used by importers of this module.
export const everandDownloader = new EverandDownloader()
|