Kdnv committed on
Commit
2045dbf
·
1 Parent(s): 7de79a5

add more files to repo

Browse files
Files changed (4) hide show
  1. Dockerfile +11 -0
  2. data/links.txt +0 -0
  3. notebooks/embeddings.ipynb +374 -0
  4. notebooks/parsing.ipynb +367 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY . /app
6
+
7
+ RUN pip install --no-cache-dir -r requirements.txt
8
+
9
+ EXPOSE 8501
10
+
11
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
data/links.txt ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/embeddings.ipynb ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "metadata": {
5
+ "ExecuteTime": {
6
+ "end_time": "2024-08-15T13:02:38.555535Z",
7
+ "start_time": "2024-08-15T13:02:37.106569Z"
8
+ }
9
+ },
10
+ "cell_type": "code",
11
+ "source": [
12
+ "import pandas as pd\n",
13
+ "import numpy as np\n",
14
+ "import warnings\n",
15
+ "warnings.filterwarnings('ignore')\n",
16
+ "pd.options.display.float_format = '{:,.2f}'.format\n",
17
+ "import torch\n",
18
+ "import faiss"
19
+ ],
20
+ "id": "ae2dfff5aa25ae18",
21
+ "outputs": [],
22
+ "execution_count": 1
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "id": "initial_id",
27
+ "metadata": {
28
+ "collapsed": true,
29
+ "ExecuteTime": {
30
+ "end_time": "2024-08-15T13:02:42.684945Z",
31
+ "start_time": "2024-08-15T13:02:41.287350Z"
32
+ }
33
+ },
34
+ "source": "from sentence_transformers import SentenceTransformer",
35
+ "outputs": [],
36
+ "execution_count": 2
37
+ },
38
+ {
39
+ "metadata": {
40
+ "ExecuteTime": {
41
+ "end_time": "2024-08-14T12:54:42.598181Z",
42
+ "start_time": "2024-08-14T12:54:39.398163Z"
43
+ }
44
+ },
45
+ "cell_type": "code",
46
+ "source": "model = SentenceTransformer(\"cointegrated/rubert-tiny2\")",
47
+ "id": "989af6384d4e5a8e",
48
+ "outputs": [
49
+ {
50
+ "data": {
51
+ "text/plain": [
52
+ "modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]"
53
+ ],
54
+ "application/vnd.jupyter.widget-view+json": {
55
+ "version_major": 2,
56
+ "version_minor": 0,
57
+ "model_id": "dd56bb4ebdc84557b8dee2060631e9d3"
58
+ }
59
+ },
60
+ "metadata": {},
61
+ "output_type": "display_data"
62
+ },
63
+ {
64
+ "data": {
65
+ "text/plain": [
66
+ "README.md: 0%| | 0.00/2.19k [00:00<?, ?B/s]"
67
+ ],
68
+ "application/vnd.jupyter.widget-view+json": {
69
+ "version_major": 2,
70
+ "version_minor": 0,
71
+ "model_id": "14e4f50b6d6c481e9d76ca68ef5bbef5"
72
+ }
73
+ },
74
+ "metadata": {},
75
+ "output_type": "display_data"
76
+ },
77
+ {
78
+ "data": {
79
+ "text/plain": [
80
+ "sentence_bert_config.json: 0%| | 0.00/54.0 [00:00<?, ?B/s]"
81
+ ],
82
+ "application/vnd.jupyter.widget-view+json": {
83
+ "version_major": 2,
84
+ "version_minor": 0,
85
+ "model_id": "d11b7d08f8d7464c8f4e3b6509d01d1f"
86
+ }
87
+ },
88
+ "metadata": {},
89
+ "output_type": "display_data"
90
+ },
91
+ {
92
+ "data": {
93
+ "text/plain": [
94
+ "1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]"
95
+ ],
96
+ "application/vnd.jupyter.widget-view+json": {
97
+ "version_major": 2,
98
+ "version_minor": 0,
99
+ "model_id": "b943691acd3343d199d063bc87c1baaf"
100
+ }
101
+ },
102
+ "metadata": {},
103
+ "output_type": "display_data"
104
+ }
105
+ ],
106
+ "execution_count": 10
107
+ },
108
+ {
109
+ "metadata": {
110
+ "ExecuteTime": {
111
+ "end_time": "2024-08-14T12:55:31.493814Z",
112
+ "start_time": "2024-08-14T12:55:31.471146Z"
113
+ }
114
+ },
115
+ "cell_type": "code",
116
+ "source": "df = pd.read_csv('../data/data.csv')",
117
+ "id": "1a0b35d6e0fbb3c1",
118
+ "outputs": [],
119
+ "execution_count": 13
120
+ },
121
+ {
122
+ "metadata": {
123
+ "ExecuteTime": {
124
+ "end_time": "2024-08-14T12:56:16.355759Z",
125
+ "start_time": "2024-08-14T12:56:16.353642Z"
126
+ }
127
+ },
128
+ "cell_type": "code",
129
+ "source": "sentences = df['description'].tolist()",
130
+ "id": "23b45de76e63745f",
131
+ "outputs": [],
132
+ "execution_count": 15
133
+ },
134
+ {
135
+ "metadata": {
136
+ "ExecuteTime": {
137
+ "end_time": "2024-08-14T12:56:33.698140Z",
138
+ "start_time": "2024-08-14T12:56:26.610169Z"
139
+ }
140
+ },
141
+ "cell_type": "code",
142
+ "source": "embeddings = model.encode(sentences)",
143
+ "id": "d1b4f38e54c2c927",
144
+ "outputs": [],
145
+ "execution_count": 16
146
+ },
147
+ {
148
+ "metadata": {
149
+ "ExecuteTime": {
150
+ "end_time": "2024-08-14T12:56:36.694104Z",
151
+ "start_time": "2024-08-14T12:56:36.690947Z"
152
+ }
153
+ },
154
+ "cell_type": "code",
155
+ "source": "print(embeddings.shape)",
156
+ "id": "9724188f0fc24546",
157
+ "outputs": [
158
+ {
159
+ "name": "stdout",
160
+ "output_type": "stream",
161
+ "text": [
162
+ "(1049, 312)\n"
163
+ ]
164
+ }
165
+ ],
166
+ "execution_count": 17
167
+ },
168
+ {
169
+ "metadata": {
170
+ "ExecuteTime": {
171
+ "end_time": "2024-08-14T13:02:23.235437Z",
172
+ "start_time": "2024-08-14T13:02:23.233630Z"
173
+ }
174
+ },
175
+ "cell_type": "code",
176
+ "source": "text = 'Сериал про джедаев'",
177
+ "id": "1d44f85a9a1b5b85",
178
+ "outputs": [],
179
+ "execution_count": 48
180
+ },
181
+ {
182
+ "metadata": {
183
+ "ExecuteTime": {
184
+ "end_time": "2024-08-14T13:02:23.758117Z",
185
+ "start_time": "2024-08-14T13:02:23.689210Z"
186
+ }
187
+ },
188
+ "cell_type": "code",
189
+ "source": "text_embedding = model.encode(text)",
190
+ "id": "e98b8b855ab1fdcf",
191
+ "outputs": [],
192
+ "execution_count": 49
193
+ },
194
+ {
195
+ "metadata": {},
196
+ "cell_type": "code",
197
+ "outputs": [],
198
+ "execution_count": 60,
199
+ "source": "search = model.similarity(embeddings, text_embedding)",
200
+ "id": "5b23c0aef0e3022"
201
+ },
202
+ {
203
+ "metadata": {
204
+ "ExecuteTime": {
205
+ "end_time": "2024-08-14T13:05:08.343582Z",
206
+ "start_time": "2024-08-14T13:05:08.340619Z"
207
+ }
208
+ },
209
+ "cell_type": "code",
210
+ "source": "search.shape",
211
+ "id": "d2059301c9734464",
212
+ "outputs": [
213
+ {
214
+ "data": {
215
+ "text/plain": [
216
+ "torch.Size([1049, 1])"
217
+ ]
218
+ },
219
+ "execution_count": 61,
220
+ "metadata": {},
221
+ "output_type": "execute_result"
222
+ }
223
+ ],
224
+ "execution_count": 61
225
+ },
226
+ {
227
+ "metadata": {
228
+ "ExecuteTime": {
229
+ "end_time": "2024-08-14T13:38:59.864081Z",
230
+ "start_time": "2024-08-14T13:38:59.860326Z"
231
+ }
232
+ },
233
+ "cell_type": "code",
234
+ "source": "torch.argmax(search).item()",
235
+ "id": "e696066aab627853",
236
+ "outputs": [
237
+ {
238
+ "data": {
239
+ "text/plain": [
240
+ "905"
241
+ ]
242
+ },
243
+ "execution_count": 65,
244
+ "metadata": {},
245
+ "output_type": "execute_result"
246
+ }
247
+ ],
248
+ "execution_count": 65
249
+ },
250
+ {
251
+ "metadata": {
252
+ "ExecuteTime": {
253
+ "end_time": "2024-08-14T20:45:20.736400Z",
254
+ "start_time": "2024-08-14T20:45:20.734292Z"
255
+ }
256
+ },
257
+ "cell_type": "code",
258
+ "source": [
259
+ "def load_model():\n",
260
+ " return SentenceTransformer(\"paraphrase-multilingual-mpnet-base-v2\")"
261
+ ],
262
+ "id": "9e2ed7e66abc6a49",
263
+ "outputs": [],
264
+ "execution_count": 2
265
+ },
266
+ {
267
+ "metadata": {
268
+ "ExecuteTime": {
269
+ "end_time": "2024-08-15T13:03:08.428996Z",
270
+ "start_time": "2024-08-15T13:03:00.332695Z"
271
+ }
272
+ },
273
+ "cell_type": "code",
274
+ "source": "model = load_model()",
275
+ "id": "532e538411ab573f",
276
+ "outputs": [],
277
+ "execution_count": 4
278
+ },
279
+ {
280
+ "metadata": {
281
+ "ExecuteTime": {
282
+ "end_time": "2024-08-15T13:03:11.008048Z",
283
+ "start_time": "2024-08-15T13:03:11.003802Z"
284
+ }
285
+ },
286
+ "cell_type": "code",
287
+ "source": [
288
+ "def compute_index(_model, sentences):\n",
289
+ " embeddings = _model.encode(sentences).astype('float32')\n",
290
+ " normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)\n",
291
+ "\n",
292
+ " index_l2 = faiss.IndexFlatL2(embeddings.shape[1])\n",
293
+ " index_dot = faiss.IndexFlatIP(embeddings.shape[1])\n",
294
+ " index_cosine = faiss.IndexFlatIP(embeddings.shape[1])\n",
295
+ "\n",
296
+ " index_l2.add(embeddings)\n",
297
+ " index_dot.add(embeddings)\n",
298
+ " index_cosine.add(normalized_embeddings)\n",
299
+ "\n",
300
+ " return index_l2, index_dot, index_cosine"
301
+ ],
302
+ "id": "e7af30a720365bd5",
303
+ "outputs": [],
304
+ "execution_count": 5
305
+ },
306
+ {
307
+ "metadata": {
308
+ "ExecuteTime": {
309
+ "end_time": "2024-08-15T13:03:13.039314Z",
310
+ "start_time": "2024-08-15T13:03:12.964545Z"
311
+ }
312
+ },
313
+ "cell_type": "code",
314
+ "source": [
315
+ "data = pd.read_csv('../data/data.csv')\n",
316
+ "sentences = data['description'].tolist()"
317
+ ],
318
+ "id": "ed466f547af86849",
319
+ "outputs": [],
320
+ "execution_count": 6
321
+ },
322
+ {
323
+ "metadata": {
324
+ "ExecuteTime": {
325
+ "end_time": "2024-08-15T13:05:33.501428Z",
326
+ "start_time": "2024-08-15T13:03:14.381677Z"
327
+ }
328
+ },
329
+ "cell_type": "code",
330
+ "source": "index_l2, index_dot, index_cosine = compute_index(model, sentences)",
331
+ "id": "1ae620dfaa1177d7",
332
+ "outputs": [],
333
+ "execution_count": 7
334
+ },
335
+ {
336
+ "metadata": {
337
+ "ExecuteTime": {
338
+ "end_time": "2024-08-15T13:05:33.537496Z",
339
+ "start_time": "2024-08-15T13:05:33.503326Z"
340
+ }
341
+ },
342
+ "cell_type": "code",
343
+ "source": [
344
+ "faiss.write_index(index_l2, \"../models/index_l2.faiss\")\n",
345
+ "faiss.write_index(index_dot, \"../models/index_dot.faiss\")\n",
346
+ "faiss.write_index(index_cosine, \"../models/index_cosine.faiss\")"
347
+ ],
348
+ "id": "cf07f22fb88110ed",
349
+ "outputs": [],
350
+ "execution_count": 8
351
+ }
352
+ ],
353
+ "metadata": {
354
+ "kernelspec": {
355
+ "display_name": "Python 3",
356
+ "language": "python",
357
+ "name": "python3"
358
+ },
359
+ "language_info": {
360
+ "codemirror_mode": {
361
+ "name": "ipython",
362
+ "version": 2
363
+ },
364
+ "file_extension": ".py",
365
+ "mimetype": "text/x-python",
366
+ "name": "python",
367
+ "nbconvert_exporter": "python",
368
+ "pygments_lexer": "ipython2",
369
+ "version": "2.7.6"
370
+ }
371
+ },
372
+ "nbformat": 4,
373
+ "nbformat_minor": 5
374
+ }
notebooks/parsing.ipynb ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "initial_id",
6
+ "metadata": {
7
+ "collapsed": true,
8
+ "ExecuteTime": {
9
+ "end_time": "2024-08-14T17:49:15.441508Z",
10
+ "start_time": "2024-08-14T17:49:14.960186Z"
11
+ }
12
+ },
13
+ "source": [
14
+ "import pandas as pd\n",
15
+ "import requests\n",
16
+ "from bs4 import BeautifulSoup\n",
17
+ "import re\n",
18
+ "from selenium import webdriver\n",
19
+ "from selenium.webdriver.common.by import By\n",
20
+ "from selenium.webdriver.common.keys import Keys\n",
21
+ "from selenium.webdriver.chrome.service import Service\n",
22
+ "from webdriver_manager.chrome import ChromeDriverManager\n",
23
+ "import time\n",
24
+ "from tqdm import tqdm\n",
25
+ "\n",
26
+ "import pandas as pd"
27
+ ],
28
+ "outputs": [],
29
+ "execution_count": 1
30
+ },
31
+ {
32
+ "metadata": {
33
+ "ExecuteTime": {
34
+ "end_time": "2024-08-14T17:49:16.227697Z",
35
+ "start_time": "2024-08-14T17:49:16.224674Z"
36
+ }
37
+ },
38
+ "cell_type": "code",
39
+ "source": [
40
+ "url_list = []\n",
41
+ "image_list = []\n",
42
+ "title_list = []\n",
43
+ "info_list = []\n",
44
+ "description_list = []\n",
45
+ "rating_list = []"
46
+ ],
47
+ "id": "3a5a63adab6eb73d",
48
+ "outputs": [],
49
+ "execution_count": 2
50
+ },
51
+ {
52
+ "metadata": {
53
+ "ExecuteTime": {
54
+ "end_time": "2024-08-14T17:49:18.720761Z",
55
+ "start_time": "2024-08-14T17:49:18.718782Z"
56
+ }
57
+ },
58
+ "cell_type": "code",
59
+ "source": [
60
+ "url = 'https://www.film.ru/serials/vo-vse-tyazhkie'\n",
61
+ "headers = {\n",
62
+ " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'\n",
63
+ "}"
64
+ ],
65
+ "id": "201dd8c0282195e6",
66
+ "outputs": [],
67
+ "execution_count": 3
68
+ },
69
+ {
70
+ "metadata": {
71
+ "ExecuteTime": {
72
+ "end_time": "2024-08-14T17:49:27.912392Z",
73
+ "start_time": "2024-08-14T17:49:27.908710Z"
74
+ }
75
+ },
76
+ "cell_type": "code",
77
+ "source": [
78
+ "def add_info(url):\n",
79
+ " response = requests.get(url, headers=headers)\n",
80
+ " soup = BeautifulSoup(response.text, 'lxml')\n",
81
+ " description = soup.find(class_='wrapper_movies_text').text.replace('\\n', '')\n",
82
+ " title = soup.find('h1').text\n",
83
+ " info = soup.find('div', class_='block_table').text.replace('\\n', ' ').strip()\n",
84
+ " img = soup.find('a', class_='wrapper_block_stack wrapper_movies_poster').get('data-src')\n",
85
+ " ratings = soup.find_all('div', class_='wrapper_movies_scores_score')\n",
86
+ " \n",
87
+ " imdb_score = '-'\n",
88
+ " \n",
89
+ " for rating in ratings:\n",
90
+ " try:\n",
91
+ " rating = rating.text.split()\n",
92
+ " if rating[1] == 'IMDb':\n",
93
+ " imdb_score = rating[0]\n",
94
+ " break\n",
95
+ " except:\n",
96
+ " pass\n",
97
+ " \n",
98
+ " match = re.match(r'([^()]+)', title)\n",
99
+ " if match:\n",
100
+ " title = match.group(1).strip()\n",
101
+ " \n",
102
+ " info = info.replace('длительность ', '')\n",
103
+ " \n",
104
+ " url_list.append(url)\n",
105
+ " image_list.append('https://www.film.ru' + img)\n",
106
+ " title_list.append(title)\n",
107
+ " info_list.append(info)\n",
108
+ " description_list.append(description)\n",
109
+ " rating_list.append(imdb_score)"
110
+ ],
111
+ "id": "c6cefa2a540572b1",
112
+ "outputs": [],
113
+ "execution_count": 4
114
+ },
115
+ {
116
+ "metadata": {
117
+ "ExecuteTime": {
118
+ "end_time": "2024-08-14T15:04:04.815204Z",
119
+ "start_time": "2024-08-14T15:03:59.311545Z"
120
+ }
121
+ },
122
+ "cell_type": "code",
123
+ "source": [
124
+ "driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))\n",
125
+ "driver.get('https://www.film.ru/a-z/serials')"
126
+ ],
127
+ "id": "69dbfb4122848980",
128
+ "outputs": [],
129
+ "execution_count": 5
130
+ },
131
+ {
132
+ "metadata": {
133
+ "ExecuteTime": {
134
+ "end_time": "2024-08-14T15:30:19.653512Z",
135
+ "start_time": "2024-08-14T15:04:17.495683Z"
136
+ }
137
+ },
138
+ "cell_type": "code",
139
+ "source": [
140
+ "base_url = 'https://www.film.ru'\n",
141
+ "\n",
142
+ "top_url = 'https://www.film.ru/a-z/serials'\n",
143
+ "\n",
144
+ "response = requests.get(top_url, headers=headers)\n",
145
+ "soup = BeautifulSoup(response.text, 'lxml')\n",
146
+ "\n",
147
+ "links = set()\n",
148
+ "while len(links) < 5000:\n",
149
+ " driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)\n",
150
+ " time.sleep(2)\n",
151
+ " \n",
152
+ " elements = driver.find_elements(By.CSS_SELECTOR, 'a.redesign_afisha_movie_main_title')\n",
153
+ "\n",
154
+ " for element in elements:\n",
155
+ " link = element.get_attribute('href')\n",
156
+ " if link and link.startswith('https://www.film.ru/'):\n",
157
+ " links.add(link)\n",
158
+ "\n",
159
+ "driver.quit()"
160
+ ],
161
+ "id": "28b12c8e79a361b",
162
+ "outputs": [],
163
+ "execution_count": 6
164
+ },
165
+ {
166
+ "metadata": {
167
+ "ExecuteTime": {
168
+ "end_time": "2024-08-14T15:30:31.175426Z",
169
+ "start_time": "2024-08-14T15:30:31.172648Z"
170
+ }
171
+ },
172
+ "cell_type": "code",
173
+ "source": "links = list(links)",
174
+ "id": "8ff5f0471f4c761c",
175
+ "outputs": [],
176
+ "execution_count": 7
177
+ },
178
+ {
179
+ "metadata": {
180
+ "ExecuteTime": {
181
+ "end_time": "2024-08-14T17:50:57.870245Z",
182
+ "start_time": "2024-08-14T17:50:57.865304Z"
183
+ }
184
+ },
185
+ "cell_type": "code",
186
+ "source": [
187
+ "links = []\n",
188
+ "with open('../data/links.txt', 'r') as f:\n",
189
+ " for line in f:\n",
190
+ " links.append(line.strip())"
191
+ ],
192
+ "id": "ed6bf0739d9afda2",
193
+ "outputs": [],
194
+ "execution_count": 10
195
+ },
196
+ {
197
+ "metadata": {
198
+ "ExecuteTime": {
199
+ "end_time": "2024-08-14T17:50:59.134056Z",
200
+ "start_time": "2024-08-14T17:50:59.130899Z"
201
+ }
202
+ },
203
+ "cell_type": "code",
204
+ "source": "len(links)",
205
+ "id": "ebc31a62f4d06d31",
206
+ "outputs": [
207
+ {
208
+ "data": {
209
+ "text/plain": [
210
+ "5040"
211
+ ]
212
+ },
213
+ "execution_count": 11,
214
+ "metadata": {},
215
+ "output_type": "execute_result"
216
+ }
217
+ ],
218
+ "execution_count": 11
219
+ },
220
+ {
221
+ "metadata": {
222
+ "ExecuteTime": {
223
+ "end_time": "2024-08-14T15:31:34.654081Z",
224
+ "start_time": "2024-08-14T15:31:34.650167Z"
225
+ }
226
+ },
227
+ "cell_type": "code",
228
+ "source": [
229
+ "# with open('../data/links.txt', 'w') as f:\n",
230
+ "# for link in links:\n",
231
+ "# f.write(link + '\\n')"
232
+ ],
233
+ "id": "32b00a8e9ba45c05",
234
+ "outputs": [],
235
+ "execution_count": 9
236
+ },
237
+ {
238
+ "metadata": {
239
+ "ExecuteTime": {
240
+ "end_time": "2024-08-14T18:19:29.984692Z",
241
+ "start_time": "2024-08-14T17:51:14.351523Z"
242
+ }
243
+ },
244
+ "cell_type": "code",
245
+ "source": [
246
+ "for link in tqdm(links, desc=\"Парсинг ссылок\"):\n",
247
+ " try:\n",
248
+ " add_info(link)\n",
249
+ " except:\n",
250
+ " pass"
251
+ ],
252
+ "id": "78799c96450db456",
253
+ "outputs": [
254
+ {
255
+ "name": "stderr",
256
+ "output_type": "stream",
257
+ "text": [
258
+ "Парсинг ссылок: 100%|██████████| 5040/5040 [28:15<00:00, 2.97it/s] \n"
259
+ ]
260
+ }
261
+ ],
262
+ "execution_count": 13
263
+ },
264
+ {
265
+ "metadata": {
266
+ "ExecuteTime": {
267
+ "end_time": "2024-08-14T18:19:35.016156Z",
268
+ "start_time": "2024-08-14T18:19:35.012126Z"
269
+ }
270
+ },
271
+ "cell_type": "code",
272
+ "source": "len(description_list) == len(title_list) == len(info_list) == len(image_list) == len(url_list)==len(rating_list)",
273
+ "id": "a058eaa1450a171d",
274
+ "outputs": [
275
+ {
276
+ "data": {
277
+ "text/plain": [
278
+ "True"
279
+ ]
280
+ },
281
+ "execution_count": 14,
282
+ "metadata": {},
283
+ "output_type": "execute_result"
284
+ }
285
+ ],
286
+ "execution_count": 14
287
+ },
288
+ {
289
+ "metadata": {
290
+ "ExecuteTime": {
291
+ "end_time": "2024-08-14T18:19:47.473578Z",
292
+ "start_time": "2024-08-14T18:19:47.469096Z"
293
+ }
294
+ },
295
+ "cell_type": "code",
296
+ "source": "data = pd.DataFrame({'page_url': url_list, 'image_url': image_list, 'tvshow_title': title_list, 'info': info_list, 'description': description_list, 'rating': rating_list})",
297
+ "id": "301b579630a1ef56",
298
+ "outputs": [],
299
+ "execution_count": 16
300
+ },
301
+ {
302
+ "metadata": {
303
+ "ExecuteTime": {
304
+ "end_time": "2024-08-14T18:19:47.847888Z",
305
+ "start_time": "2024-08-14T18:19:47.845311Z"
306
+ }
307
+ },
308
+ "cell_type": "code",
309
+ "source": [
310
+ "def clean_description(text):\n",
311
+ " cleaned_text = re.sub(r'Сезон \\d+', '', text)\n",
312
+ " cleaned_text = re.sub(r'\\s+', ' ', cleaned_text).strip()\n",
313
+ " return cleaned_text"
314
+ ],
315
+ "id": "94a061cf7812343e",
316
+ "outputs": [],
317
+ "execution_count": 17
318
+ },
319
+ {
320
+ "metadata": {
321
+ "ExecuteTime": {
322
+ "end_time": "2024-08-14T18:19:48.380448Z",
323
+ "start_time": "2024-08-14T18:19:48.317552Z"
324
+ }
325
+ },
326
+ "cell_type": "code",
327
+ "source": "data['description'] = data['description'].apply(clean_description)",
328
+ "id": "8597746104442cc7",
329
+ "outputs": [],
330
+ "execution_count": 18
331
+ },
332
+ {
333
+ "metadata": {
334
+ "ExecuteTime": {
335
+ "end_time": "2024-08-14T18:19:49.497581Z",
336
+ "start_time": "2024-08-14T18:19:49.438608Z"
337
+ }
338
+ },
339
+ "cell_type": "code",
340
+ "source": "data.to_csv('../data/data.csv')",
341
+ "id": "4423b3bffd94bbbf",
342
+ "outputs": [],
343
+ "execution_count": 19
344
+ }
345
+ ],
346
+ "metadata": {
347
+ "kernelspec": {
348
+ "display_name": "Python 3",
349
+ "language": "python",
350
+ "name": "python3"
351
+ },
352
+ "language_info": {
353
+ "codemirror_mode": {
354
+ "name": "ipython",
355
+ "version": 2
356
+ },
357
+ "file_extension": ".py",
358
+ "mimetype": "text/x-python",
359
+ "name": "python",
360
+ "nbconvert_exporter": "python",
361
+ "pygments_lexer": "ipython2",
362
+ "version": "2.7.6"
363
+ }
364
+ },
365
+ "nbformat": 4,
366
+ "nbformat_minor": 5
367
+ }