kankur0007 committed on
Commit
ef497f6
·
2 Parent(s): a49a636 d0f6abe
Files changed (1) hide show
  1. scraper/jutsu_data_scraper.py +0 -45
scraper/jutsu_data_scraper.py DELETED
@@ -1,45 +0,0 @@
1
- import scrapy
2
- from bs4 import BeautifulSoup
3
-
4
class BlogSpider(scrapy.Spider):
    """Crawl the Naruto fandom wiki's Jutsu listing and scrape each jutsu page.

    ``parse`` walks the paginated listing and schedules one request per linked
    page; ``parse_jutsu`` converts each of those pages into an item with the
    keys ``jutsu_name``, ``jutsu_type`` and ``jutsu_description``.
    """

    name = 'narutospider'
    start_urls = ['https://naruto.fandom.com/wiki/Special:BrowseData/Jutsu?limit=250&offset=0&_cat=Jutsu']

    def parse(self, response):
        """Yield a request for every jutsu link on a listing page, then paginate.

        Args:
            response: Scrapy response for one page of the jutsu listing.

        Yields:
            Requests handled by ``parse_jutsu`` for each link found in the
            first ``.smw-columnlist-container`` element, followed by requests
            handled by ``parse`` for every ``a.mw-nextlink`` element.
        """
        # Only the first container is read. Slicing (rather than indexing with
        # [0]) means a page without the container is skipped instead of
        # raising IndexError, so the next-page links below are still followed.
        for container in response.css('.smw-columnlist-container')[:1]:
            for href in container.css("a::attr(href)").extract():
                # response.follow resolves the href against the current page
                # URL, matching how the pagination links are handled below.
                yield response.follow(href, callback=self.parse_jutsu)

        for next_page in response.css('a.mw-nextlink'):
            yield response.follow(next_page, self.parse)

    def parse_jutsu(self, response):
        """Extract the name, classification and description from a jutsu page.

        Args:
            response: Scrapy response for a single jutsu article.

        Returns:
            A dict with ``jutsu_name``, ``jutsu_type`` (empty string when the
            infobox has no "Classification" row) and ``jutsu_description``
            (the article text with the infobox removed, cut at the first
            occurrence of "Trivia"). Returns ``None`` when the page has no
            title span or no ``div.mw-parser-output`` block.
        """
        title = response.css("span.mw-page-title-main::text").extract_first()
        content_html = response.css("div.mw-parser-output").extract_first()
        if title is None or content_html is None:
            self.logger.warning(
                "Skipping %s: page title or content block not found",
                response.url,
            )
            return None

        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (and to avoid bs4's "guessed parser"
        # warning). lxml is what bs4 prefers by default when it is available,
        # and Scrapy itself requires it.
        soup = BeautifulSoup(content_html, "lxml").find('div')

        jutsu_type = ""
        aside = soup.find('aside')
        if aside is not None:
            for cell in aside.find_all('div', {'class': 'pi-data'}):
                label = cell.find('h3')
                value = cell.find('div')
                if label is None or value is None:
                    continue
                if label.text.strip() == "Classification":
                    jutsu_type = value.text.strip()

            # Drop the infobox so its text does not leak into the description.
            aside.decompose()

        # Everything from the first "Trivia" onwards is discarded.
        jutsu_description = soup.text.strip().split('Trivia')[0].strip()

        return {
            'jutsu_name': title.strip(),
            'jutsu_type': jutsu_type,
            'jutsu_description': jutsu_description,
        }