Upload 2 files
Browse files- notebooks/parc_lover.ipynb +98 -0
- notebooks/pars.ipynb +0 -0
notebooks/parc_lover.ipynb
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"from bs4 import BeautifulSoup\n",
|
| 10 |
+
"import requests\n",
|
| 11 |
+
"import csv\n",
|
| 12 |
+
"import time\n",
|
| 13 |
+
"from urllib.parse import urlparse, parse_qs, urlencode, urlunparse\n",
|
| 14 |
+
"\n",
|
| 15 |
+
"main_url = 'http://loveread.ec/'\n",
|
| 16 |
+
"base_url = 'http://loveread.ec/index_book.php?id_genre=1&p=85'\n",
|
def increment_page(url):
    """Return *url* with its ``p`` query parameter increased by one.

    All other query parameters are preserved, including parameters with
    an empty value (e.g. ``a=``), which a plain ``parse_qs`` call would
    silently drop when the query string is rebuilt.

    Args:
        url: Absolute or relative URL whose query string may contain ``p``.

    Returns:
        The URL pointing at the next page, or *url* unchanged when it has
        no non-empty ``p`` parameter.

    Raises:
        ValueError: If the first non-empty ``p`` value is not an integer.
    """
    parsed_url = urlparse(url)
    # keep_blank_values=True so that "a=" style parameters survive the
    # parse -> encode round trip.
    query_params = parse_qs(parsed_url.query, keep_blank_values=True)

    # Ignore blank "p" values so that "?p=" is treated as "no page number".
    page_values = [value for value in query_params.get('p', []) if value]
    if not page_values:
        # No page parameter found: return the original URL untouched.
        return url

    next_page = int(page_values[0]) + 1
    query_params['p'] = [str(next_page)]
    new_query_string = urlencode(query_params, doseq=True)
    return urlunparse(parsed_url._replace(query=new_query_string))
|
| 29 |
+
" \n",
|
def _extract_book_row(book_info_block, book_annotation_block):
    """Build one CSV row from a pair of catalogue table rows.

    Args:
        book_info_block: ``<tr>`` tag holding the title link, author link,
            cover image and book page link.
        book_annotation_block: ``<tr>`` tag holding the annotation ``<p>``.

    Returns:
        ``[page_url, image_url, author, title, annotation]``, or ``None``
        when any of the expected tags or attributes is missing.
    """
    # Local import: the module-level import list does not include urljoin.
    from urllib.parse import urljoin

    title_tag = book_info_block.find('a', title=True)
    author_tag = book_info_block.find('a', href=lambda x: x and 'biography-author' in x)
    image_tag = book_info_block.find('img', class_='margin-right_8')
    book_url_tag = book_info_block.find('a', href=lambda x: x and 'view_global.php?' in x)
    annotation_tag = book_annotation_block.find('p')

    required_tags = (title_tag, author_tag, image_tag, book_url_tag, annotation_tag)
    # Compare with None explicitly; tag truthiness is not a reliable
    # "was it found" signal.
    if any(tag is None for tag in required_tags):
        return None

    image_src = image_tag.get('src')
    if not image_src:
        return None

    # urljoin gives the same result as plain concatenation for relative
    # paths and also copes with paths that start with "/".
    page_url = urljoin(main_url, book_url_tag['href'])
    image_url = urljoin(main_url, image_src)
    return [
        page_url,
        image_url,
        author_tag.text.strip(),
        title_tag['title'],
        annotation_tag.text.strip(),
    ]


def parcing(num_books, output_csv='books.csv'):
    """Scrape book records from the catalogue pages into a CSV file.

    Starts at ``base_url`` and follows pages via ``increment_page`` until
    *num_books* rows have been written or the catalogue runs out of books.
    Every 10 written books the function sleeps for 10 seconds and prints a
    progress message.

    Args:
        num_books: Maximum number of book rows to write.
        output_csv: Path of the CSV file to create (overwritten if present).

    Raises:
        requests.RequestException: On a network failure, a timeout or a
            non-success HTTP status.
    """
    count = 0
    current_url = base_url
    headers = {
        "Accept": "image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"
    }
    # Open the CSV file for writing.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['page_url', 'image_url', 'author', 'title', 'annotation'])
        while count < num_books:
            # A timeout keeps a stalled connection from hanging the run;
            # raise_for_status stops us from parsing an error page.
            response = requests.get(current_url, headers=headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')

            # Find all book rows on the page.
            book_blocks = soup.find_all('tr', class_='td_center_color')
            if not book_blocks:
                # Without this check an empty page (e.g. past the last
                # catalogue page) would make the loop spin forever.
                print(f'Книги закончились: страница {current_url} пуста')
                break

            # Rows are consumed in (info, annotation) pairs; zip drops a
            # trailing unpaired row instead of raising IndexError.
            for book_info_block, book_annotation_block in zip(book_blocks[::2], book_blocks[1::2]):
                if count >= num_books:
                    break
                row = _extract_book_row(book_info_block, book_annotation_block)
                if row is None:
                    # Incomplete markup for this entry: skip it.
                    continue
                # Write the record to the CSV file.
                writer.writerow(row)
                count += 1
                # Pause for 10 seconds after every 10 books.
                if count % 10 == 0:
                    time.sleep(10)
                    print(f'Парсинг в процессе, спарсил {count} книг')

            # Move on to the next page; stop if the URL cannot advance.
            next_url = increment_page(current_url)
            if next_url == current_url:
                break
            current_url = next_url
    print('Парсинг окончен')
|
| 86 |
+
" \n",
|
| 87 |
+
"parcing(1000, 'books_1000.csv')\n"
|
| 88 |
+
]
|
| 89 |
+
}
|
| 90 |
+
],
|
| 91 |
+
"metadata": {
|
| 92 |
+
"language_info": {
|
| 93 |
+
"name": "python"
|
| 94 |
+
}
|
| 95 |
+
},
|
| 96 |
+
"nbformat": 4,
|
| 97 |
+
"nbformat_minor": 2
|
| 98 |
+
}
|
notebooks/pars.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|