OverSide88 committed on
Commit
f930925
·
verified ·
1 Parent(s): 9c52fcd

Upload 2 files

Browse files
Files changed (2) hide show
  1. notebooks/parc_lover.ipynb +98 -0
  2. notebooks/pars.ipynb +0 -0
notebooks/parc_lover.ipynb ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from bs4 import BeautifulSoup\n",
10
+ "import requests\n",
11
+ "import csv\n",
12
+ "import time\n",
13
+ "from urllib.parse import urlparse, parse_qs, urlencode, urlunparse\n",
14
+ "\n",
15
main_url = 'http://loveread.ec/'
base_url = 'http://loveread.ec/index_book.php?id_genre=1&p=85'


def increment_page(url):
    """Return *url* with its ``p`` query parameter advanced to the next page.

    The URL is parsed, the integer value of the ``p`` parameter is
    incremented by one, and the URL is reassembled.  If the URL carries
    no ``p`` parameter it is returned unchanged.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)
    if 'p' not in params:
        # No page parameter present — nothing to advance.
        return url
    params['p'] = str(int(params['p'][0]) + 1)
    new_query = urlencode(params, doseq=True)
    return urlunparse(parts._replace(query=new_query))
30
def parcing(num_books, output_csv='books.csv'):
    """Scrape book listings from loveread.ec and write them to a CSV file.

    Walks the paginated genre listing starting at ``base_url`` and extracts
    up to ``num_books`` records, writing one CSV row per book with columns
    ``page_url``, ``image_url``, ``author``, ``title``, ``annotation``.

    Parameters
    ----------
    num_books : int
        Maximum number of books to scrape.
    output_csv : str
        Path of the CSV file to create (overwritten if it already exists).
    """
    count = 0
    current_url = base_url
    headers = {
        "Accept": "image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"
    }
    # Open the CSV file for writing.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['page_url', 'image_url', 'author', 'title', 'annotation'])
        while count < num_books:
            response = requests.get(current_url, headers=headers)
            # Fail fast on HTTP errors instead of silently parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')
            # Each book is rendered as a PAIR of <tr> rows: an info row
            # followed by an annotation row.  The upper bound len-1 guards
            # against an IndexError on i+1 when the row count is odd.
            book_blocks = soup.find_all('tr', class_='td_center_color')
            for i in range(0, len(book_blocks) - 1, 2):
                if count >= num_books:
                    break
                book_info_block = book_blocks[i]
                book_annotation_block = book_blocks[i + 1]
                title_tag = book_info_block.find('a', title=True)
                author_tag = book_info_block.find('a', href=lambda x: x and 'biography-author' in x)
                annotation_tag = book_annotation_block.find('p')
                image_tag = book_info_block.find('img', class_='margin-right_8')
                book_url_tag = book_info_block.find('a', href=lambda x: x and 'view_global.php?' in x)
                # Skip malformed listings instead of crashing with AttributeError
                # (the original guards were commented out).
                if not (title_tag and author_tag and annotation_tag and image_tag and book_url_tag):
                    continue
                title = title_tag['title']
                author = author_tag.text.strip()
                annotation = annotation_tag.text.strip()
                image_url = main_url + image_tag['src']
                page_url = main_url + book_url_tag['href']
                # Write one record to the CSV file.
                writer.writerow([page_url, image_url, author, title, annotation])
                count += 1
                # Throttle: pause 10 seconds after every 10 books to be
                # polite to the server.
                if count % 10 == 0:
                    time.sleep(10)
                    print(f'Парсинг в процессе, спарсил {count} книг')
            # Advance to the next listing page.
            current_url = increment_page(current_url)
    print('Парсинг окончен')

parcing(1000, 'books_1000.csv')
88
+ ]
89
+ }
90
+ ],
91
+ "metadata": {
92
+ "language_info": {
93
+ "name": "python"
94
+ }
95
+ },
96
+ "nbformat": 4,
97
+ "nbformat_minor": 2
98
+ }
notebooks/pars.ipynb ADDED
The diff for this file is too large to render. See raw diff