Spaces:
Sleeping
Sleeping
scraper/jutsu_data_scraper.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
import scrapy
|
| 2 |
-
from bs4 import BeautifulSoup
|
| 3 |
-
|
| 4 |
-
class BlogSpider(scrapy.Spider):
    """Spider that walks the Naruto fandom wiki jutsu index.

    ``parse`` handles the paginated index pages and schedules one request
    per linked jutsu page; ``parse_jutsu`` turns each jutsu page into a
    record with the keys ``jutsu_name``, ``jutsu_type`` and
    ``jutsu_description``.
    """

    name = 'narutospider'
    start_urls = ['https://naruto.fandom.com/wiki/Special:BrowseData/Jutsu?limit=250&offset=0&_cat=Jutsu']

    def parse(self, response):
        """Yield a request for every jutsu link on an index page, then paginate.

        Links are read from the first ``.smw-columnlist-container`` element.
        When that element is absent the page is logged and skipped instead of
        raising ``IndexError``; pagination links are still followed.
        """
        containers = response.css('.smw-columnlist-container')
        if containers:
            for href in containers[0].css("a::attr(href)").getall():
                # response.follow resolves relative hrefs against the page
                # URL, matching how the pagination links below are handled.
                yield response.follow(href, callback=self.parse_jutsu)
        else:
            self.logger.warning(
                "No jutsu list container found on %s", response.url
            )

        for next_page in response.css('a.mw-nextlink'):
            yield response.follow(next_page, self.parse)

    def parse_jutsu(self, response):
        """Build the record for a single jutsu page.

        Returns a dict with ``jutsu_name``, ``jutsu_type`` (empty string when
        the infobox has no "Classification" entry) and ``jutsu_description``.
        Returns ``None`` (no item is emitted) when the page has no title span
        or no ``div.mw-parser-output`` content block.
        """
        title = response.css("span.mw-page-title-main::text").get()
        if title is None:
            self.logger.warning("No page title found on %s", response.url)
            return None
        jutsu_name = title.strip()

        content_html = response.css("div.mw-parser-output").get()
        if content_html is None:
            self.logger.warning("No article content found on %s", response.url)
            return None

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(content_html, "html.parser").find('div')

        jutsu_type = ""
        aside = soup.find('aside')
        if aside is not None:
            jutsu_type = self._read_classification(aside)
            # Drop the infobox so its text does not end up in the description.
            aside.decompose()

        # Keep only the text that precedes the first occurrence of "Trivia".
        jutsu_description = soup.text.strip().split('Trivia')[0].strip()

        return dict(
            jutsu_name=jutsu_name,
            jutsu_type=jutsu_type,
            jutsu_description=jutsu_description,
        )

    @staticmethod
    def _read_classification(aside):
        """Return the text of the infobox cell labelled "Classification".

        Scans every ``div.pi-data`` cell inside ``aside``. If several cells
        carry that label the last one wins. Cells without an ``h3`` label or
        without a nested ``div`` are ignored. Returns ``""`` when nothing
        matches.
        """
        classification = ""
        for cell in aside.find_all('div', {'class': 'pi-data'}):
            label = cell.find('h3')
            if label is None or label.text.strip() != "Classification":
                continue
            value = cell.find('div')
            if value is not None:
                classification = value.text.strip()
        return classification
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|