| package scraper |
|
|
| import ( |
| "net/url" |
| "time" |
|
|
| "github.com/gocolly/colly/v2" |
| "go.uber.org/atomic" |
| "golang.org/x/text/language" |
|
|
| "github.com/metatube-community/metatube-sdk-go/provider" |
| ) |
|
|
| var ( |
| _ provider.Provider = (*Scraper)(nil) |
| _ provider.ProxySetter = (*Scraper)(nil) |
| _ provider.RequestTimeoutSetter = (*Scraper)(nil) |
| ) |
|
|
| |
| type Scraper struct { |
| name string |
| baseURL *url.URL |
| priority *atomic.Float64 |
| language language.Tag |
| c *colly.Collector |
| } |
|
|
| |
| func NewScraper(name, base string, priority float64, lang language.Tag, opts ...Option) *Scraper { |
| baseURL, err := url.Parse(base) |
| if err != nil { |
| panic(err) |
| } |
| s := &Scraper{ |
| name: name, |
| baseURL: baseURL, |
| priority: atomic.NewFloat64(priority), |
| language: lang, |
| c: colly.NewCollector(), |
| } |
| for _, opt := range opts { |
| |
| if err := opt(s); err != nil { |
| panic(err) |
| } |
| } |
| return s |
| } |
|
|
| |
| func NewDefaultScraper(name, baseURL string, priority float64, lang language.Tag, opts ...Option) *Scraper { |
| return NewScraper(name, baseURL, priority, lang, append([]Option{ |
| WithAllowURLRevisit(), |
| WithIgnoreRobotsTxt(), |
| WithRandomUserAgent(), |
| }, opts...)...) |
| } |
|
|
| func (s *Scraper) Name() string { return s.name } |
|
|
| func (s *Scraper) URL() *url.URL { return s.baseURL } |
|
|
| func (s *Scraper) Priority() float64 { return s.priority.Load() } |
|
|
| func (s *Scraper) SetPriority(v float64) { s.priority.Store(v) } |
|
|
| func (s *Scraper) Language() language.Tag { return s.language } |
|
|
| func (s *Scraper) NormalizeMovieID(id string) string { return id } |
|
|
| func (s *Scraper) ParseMovieIDFromURL(string) (string, error) { panic("unimplemented") } |
|
|
| func (s *Scraper) NormalizeActorID(id string) string { return id } |
|
|
| func (s *Scraper) ParseActorIDFromURL(string) (string, error) { panic("unimplemented") } |
|
|
| |
| func (s *Scraper) ClonedCollector() *colly.Collector { return s.c.Clone() } |
|
|
| |
| func (s *Scraper) SetProxy(proxyURL string) error { return s.c.SetProxy(proxyURL) } |
|
|
| |
| func (s *Scraper) SetRequestTimeout(timeout time.Duration) { s.c.SetRequestTimeout(timeout) } |
|
|