EddyGiusepe committed on
Commit
f3c1697
·
1 Parent(s): cf73b55

pyspark estudo

Browse files
Files changed (1) hide show
  1. 4_spark/spark4.ipynb +765 -0
4_spark/spark4.ipynb ADDED
@@ -0,0 +1,765 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# <h1 align=\"center\"><font color=\"yellow\">spark</font></h1>"
9
+ ]
10
+ },
11
+ {
12
+ "attachments": {},
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "<font color=\"yellow\">Data Scientist.: Dr.Eddy Giusepe Chirinos Isidro</font>"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 1,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "import findspark\n",
26
+ "\n",
27
+ "findspark.init()"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 2,
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "name": "stderr",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "23/07/26 11:36:59 WARN Utils: Your hostname, eddygiusepe resolves to a loopback address: 127.0.1.1; using 192.168.0.141 instead (on interface wlp0s20f3)\n",
40
+ "23/07/26 11:36:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
41
+ "Setting default log level to \"WARN\".\n",
42
+ "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43
+ "23/07/26 11:37:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
44
+ ]
45
+ },
46
+ {
47
+ "data": {
48
+ "text/html": [
49
+ "\n",
50
+ " <div>\n",
51
+ " <p><b>SparkSession - in-memory</b></p>\n",
52
+ " \n",
53
+ " <div>\n",
54
+ " <p><b>SparkContext</b></p>\n",
55
+ "\n",
56
+ " <p><a href=\"http://192.168.0.141:4040\">Spark UI</a></p>\n",
57
+ "\n",
58
+ " <dl>\n",
59
+ " <dt>Version</dt>\n",
60
+ " <dd><code>v3.4.1</code></dd>\n",
61
+ " <dt>Master</dt>\n",
62
+ " <dd><code>local</code></dd>\n",
63
+ " <dt>AppName</dt>\n",
64
+ " <dd><code>spark_Eddy4</code></dd>\n",
65
+ " </dl>\n",
66
+ " </div>\n",
67
+ " \n",
68
+ " </div>\n",
69
+ " "
70
+ ],
71
+ "text/plain": [
72
+ "<pyspark.sql.session.SparkSession at 0x7f46d41b71c0>"
73
+ ]
74
+ },
75
+ "execution_count": 2,
76
+ "metadata": {},
77
+ "output_type": "execute_result"
78
+ }
79
+ ],
80
+ "source": [
81
+ "from pyspark.sql import SparkSession\n",
82
+ "\n",
83
+ "# Inicializa o SparkSession:\n",
84
+ "spark = SparkSession.builder.master('local')\\\n",
85
+ " .appName(\"spark_Eddy4\")\\\n",
86
+ " .getOrCreate()\n",
87
+ "\n",
88
+ "spark"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 3,
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "name": "stdout",
98
+ "output_type": "stream",
99
+ "text": [
100
+ "+--------+-------------+---------+---------+\n",
101
+ "|id_carro| modelo_carro| preco|cod_marca|\n",
102
+ "+--------+-------------+---------+---------+\n",
103
+ "| 1| Avalon|$78401.95| 54|\n",
104
+ "| 1| Avalon|$78401.95| 54|\n",
105
+ "| 2| RDX|$95987.38| 1|\n",
106
+ "| 3| Golf|$61274.55| 55|\n",
107
+ "| 4| EX|$84981.12| 23|\n",
108
+ "| 5| Escort|$77466.89| 17|\n",
109
+ "| 6| Expedition|$84698.71| 17|\n",
110
+ "| 7| Voyager|$95567.75| 42|\n",
111
+ "| 8| Civic|$84749.22| 20|\n",
112
+ "| 9| Defender|$98600.79| 29|\n",
113
+ "| 10| V8 Vantage S|$94791.61| 2|\n",
114
+ "| 11| C70|$97874.76| 56|\n",
115
+ "| 12|G-Series 1500|$71638.24| 10|\n",
116
+ "| 13| Legacy|$95850.12| 52|\n",
117
+ "| 14| DB9|$86707.30| 2|\n",
118
+ "| 15| Mulsanne|$70453.70| 6|\n",
119
+ "| 16| RX|$46752.60| 30|\n",
120
+ "| 17| Rabbit|$78048.08| 55|\n",
121
+ "| 18| Q|$65193.95| 23|\n",
122
+ "| 19| S60|$65396.98| 56|\n",
123
+ "+--------+-------------+---------+---------+\n",
124
+ "only showing top 20 rows\n",
125
+ "\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
130
+ "# Lendo arquivos:\n",
131
+ "\n",
132
+ "df_carros= spark.read.format(\"csv\").option(\"header\", True).option(\"encoding\", \"utf-8\").load(\"./data/modelo_carro.csv\", sep=\",\")\n",
133
+ "\n",
134
+ "df_carros.show()"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 4,
140
+ "metadata": {},
141
+ "outputs": [
142
+ {
143
+ "name": "stderr",
144
+ "output_type": "stream",
145
+ "text": [
146
+ " \r"
147
+ ]
148
+ }
149
+ ],
150
+ "source": [
151
+ "# Salvando arquivo:\n",
152
+ "#df_carros.write.format(\"parquet\").save(\"modelo_carro_parquet\")\n",
153
+ "\n",
154
+ "# Para sobrescrever:\n",
155
+ "df_carros.write.format(\"parquet\").mode(\"overwrite\").save(\"modelo_carro_parquet\")"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 5,
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "# Em outros formatos:\n",
165
+ "\n",
166
+ "# df_carros.write.format(\"avro\").save(\"modelo_carro_avro\") --> Não consegui instalar!\n",
167
+ " \n",
168
+ "df_carros.write.format(\"json\").save(\"modelo_carro_json\")"
169
+ ]
170
+ },
171
+ {
172
+ "attachments": {},
173
+ "cell_type": "markdown",
174
+ "metadata": {},
175
+ "source": [
176
+ "# <font color=\"red\">Select</font>"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 6,
182
+ "metadata": {},
183
+ "outputs": [
184
+ {
185
+ "name": "stdout",
186
+ "output_type": "stream",
187
+ "text": [
188
+ "+--------+-------------+---------+---------+\n",
189
+ "|id_carro| modelo_carro| preco|cod_marca|\n",
190
+ "+--------+-------------+---------+---------+\n",
191
+ "| 1| Avalon|$78401.95| 54|\n",
192
+ "| 1| Avalon|$78401.95| 54|\n",
193
+ "| 2| RDX|$95987.38| 1|\n",
194
+ "| 3| Golf|$61274.55| 55|\n",
195
+ "| 4| EX|$84981.12| 23|\n",
196
+ "| 5| Escort|$77466.89| 17|\n",
197
+ "| 6| Expedition|$84698.71| 17|\n",
198
+ "| 7| Voyager|$95567.75| 42|\n",
199
+ "| 8| Civic|$84749.22| 20|\n",
200
+ "| 9| Defender|$98600.79| 29|\n",
201
+ "| 10| V8 Vantage S|$94791.61| 2|\n",
202
+ "| 11| C70|$97874.76| 56|\n",
203
+ "| 12|G-Series 1500|$71638.24| 10|\n",
204
+ "| 13| Legacy|$95850.12| 52|\n",
205
+ "| 14| DB9|$86707.30| 2|\n",
206
+ "| 15| Mulsanne|$70453.70| 6|\n",
207
+ "| 16| RX|$46752.60| 30|\n",
208
+ "| 17| Rabbit|$78048.08| 55|\n",
209
+ "| 18| Q|$65193.95| 23|\n",
210
+ "| 19| S60|$65396.98| 56|\n",
211
+ "+--------+-------------+---------+---------+\n",
212
+ "only showing top 20 rows\n",
213
+ "\n"
214
+ ]
215
+ }
216
+ ],
217
+ "source": [
218
+ "df_carros= spark.read.format(\"csv\").option(\"header\", True).load(\"./data/modelo_carro.csv\")\n",
219
+ "\n",
220
+ "df_carros.show()"
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "execution_count": 7,
226
+ "metadata": {},
227
+ "outputs": [
228
+ {
229
+ "name": "stdout",
230
+ "output_type": "stream",
231
+ "text": [
232
+ "root\n",
233
+ " |-- id_carro: string (nullable = true)\n",
234
+ " |-- modelo_carro: string (nullable = true)\n",
235
+ " |-- preco: string (nullable = true)\n",
236
+ " |-- cod_marca: string (nullable = true)\n",
237
+ "\n"
238
+ ]
239
+ }
240
+ ],
241
+ "source": [
242
+ "df_carros.printSchema()"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 8,
248
+ "metadata": {},
249
+ "outputs": [
250
+ {
251
+ "name": "stdout",
252
+ "output_type": "stream",
253
+ "text": [
254
+ "+-------------+--------+\n",
255
+ "| modelo_carro|id_carro|\n",
256
+ "+-------------+--------+\n",
257
+ "| Avalon| 1|\n",
258
+ "| Avalon| 1|\n",
259
+ "| RDX| 2|\n",
260
+ "| Golf| 3|\n",
261
+ "| EX| 4|\n",
262
+ "| Escort| 5|\n",
263
+ "| Expedition| 6|\n",
264
+ "| Voyager| 7|\n",
265
+ "| Civic| 8|\n",
266
+ "| Defender| 9|\n",
267
+ "| V8 Vantage S| 10|\n",
268
+ "| C70| 11|\n",
269
+ "|G-Series 1500| 12|\n",
270
+ "| Legacy| 13|\n",
271
+ "| DB9| 14|\n",
272
+ "| Mulsanne| 15|\n",
273
+ "| RX| 16|\n",
274
+ "| Rabbit| 17|\n",
275
+ "| Q| 18|\n",
276
+ "| S60| 19|\n",
277
+ "+-------------+--------+\n",
278
+ "only showing top 20 rows\n",
279
+ "\n"
280
+ ]
281
+ }
282
+ ],
283
+ "source": [
284
+ "df_carros_spark = df_carros.select(\"modelo_carro\", \"id_carro\")\n",
285
+ "\n",
286
+ "df_carros_spark.show()"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": 9,
292
+ "metadata": {},
293
+ "outputs": [
294
+ {
295
+ "name": "stdout",
296
+ "output_type": "stream",
297
+ "text": [
298
+ "+-------------+--------+\n",
299
+ "| eddy_modelo|id_carro|\n",
300
+ "+-------------+--------+\n",
301
+ "| Avalon| 1|\n",
302
+ "| Avalon| 1|\n",
303
+ "| RDX| 2|\n",
304
+ "| Golf| 3|\n",
305
+ "| EX| 4|\n",
306
+ "| Escort| 5|\n",
307
+ "| Expedition| 6|\n",
308
+ "| Voyager| 7|\n",
309
+ "| Civic| 8|\n",
310
+ "| Defender| 9|\n",
311
+ "| V8 Vantage S| 10|\n",
312
+ "| C70| 11|\n",
313
+ "|G-Series 1500| 12|\n",
314
+ "| Legacy| 13|\n",
315
+ "| DB9| 14|\n",
316
+ "| Mulsanne| 15|\n",
317
+ "| RX| 16|\n",
318
+ "| Rabbit| 17|\n",
319
+ "| Q| 18|\n",
320
+ "| S60| 19|\n",
321
+ "+-------------+--------+\n",
322
+ "only showing top 20 rows\n",
323
+ "\n"
324
+ ]
325
+ }
326
+ ],
327
+ "source": [
328
+ "# Posso colocar um Alias:\n",
329
+ "from pyspark.sql.functions import col \n",
330
+ "\n",
331
+ "df_carros_spark = df_carros.select(col(\"modelo_carro\").alias(\"eddy_modelo\"), \"id_carro\")\n",
332
+ "\n",
333
+ "df_carros_spark.show()"
334
+ ]
335
+ },
336
+ {
337
+ "attachments": {},
338
+ "cell_type": "markdown",
339
+ "metadata": {},
340
+ "source": [
341
+ "# <font color=\"red\">Filtros</font>"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 10,
347
+ "metadata": {},
348
+ "outputs": [
349
+ {
350
+ "name": "stdout",
351
+ "output_type": "stream",
352
+ "text": [
353
+ "+--------+------------+---------+---------+\n",
354
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
355
+ "+--------+------------+---------+---------+\n",
356
+ "| 1| Avalon|$78401.95| 54|\n",
357
+ "| 1| Avalon|$78401.95| 54|\n",
358
+ "| 2| RDX|$95987.38| 1|\n",
359
+ "| 3| Golf|$61274.55| 55|\n",
360
+ "| 4| EX|$84981.12| 23|\n",
361
+ "+--------+------------+---------+---------+\n",
362
+ "only showing top 5 rows\n",
363
+ "\n"
364
+ ]
365
+ }
366
+ ],
367
+ "source": [
368
+ "df_carros= spark.read.format(\"csv\").option(\"header\", True).load(\"./data/modelo_carro.csv\")\n",
369
+ "\n",
370
+ "df_carros.show(5)"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": 11,
376
+ "metadata": {},
377
+ "outputs": [
378
+ {
379
+ "name": "stdout",
380
+ "output_type": "stream",
381
+ "text": [
382
+ "+--------+------------+---------+---------+\n",
383
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
384
+ "+--------+------------+---------+---------+\n",
385
+ "| 1| Avalon|$78401.95| 54|\n",
386
+ "| 1| Avalon|$78401.95| 54|\n",
387
+ "+--------+------------+---------+---------+\n",
388
+ "\n"
389
+ ]
390
+ }
391
+ ],
392
+ "source": [
393
+ "df_carros.where(\"id_carro = '1'\").show()"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": 12,
399
+ "metadata": {},
400
+ "outputs": [
401
+ {
402
+ "name": "stdout",
403
+ "output_type": "stream",
404
+ "text": [
405
+ "+--------+------------+---------+---------+\n",
406
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
407
+ "+--------+------------+---------+---------+\n",
408
+ "| 1| Avalon|$78401.95| 54|\n",
409
+ "| 1| Avalon|$78401.95| 54|\n",
410
+ "+--------+------------+---------+---------+\n",
411
+ "\n"
412
+ ]
413
+ }
414
+ ],
415
+ "source": [
416
+ "# ou\n",
417
+ "df_carros.filter(\"id_carro = '1'\").show()"
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": 13,
423
+ "metadata": {},
424
+ "outputs": [
425
+ {
426
+ "name": "stdout",
427
+ "output_type": "stream",
428
+ "text": [
429
+ "+--------+------------+---------+---------+\n",
430
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
431
+ "+--------+------------+---------+---------+\n",
432
+ "| 1| Avalon|$78401.95| 54|\n",
433
+ "| 1| Avalon|$78401.95| 54|\n",
434
+ "+--------+------------+---------+---------+\n",
435
+ "\n"
436
+ ]
437
+ }
438
+ ],
439
+ "source": [
440
+ "from pyspark.sql.functions import *\n",
441
+ "\n",
442
+ "df_carros.where(col(\"id_carro\") == '1').show()"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 14,
448
+ "metadata": {},
449
+ "outputs": [
450
+ {
451
+ "name": "stdout",
452
+ "output_type": "stream",
453
+ "text": [
454
+ "+--------+------------+-----+---------+\n",
455
+ "|id_carro|modelo_carro|preco|cod_marca|\n",
456
+ "+--------+------------+-----+---------+\n",
457
+ "+--------+------------+-----+---------+\n",
458
+ "\n"
459
+ ]
460
+ }
461
+ ],
462
+ "source": [
463
+ "df_carros.where((col(\"id_carro\") == '1') & (col(\"modelo_carro\") == 'Golf')).show()\n"
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "code",
468
+ "execution_count": 15,
469
+ "metadata": {},
470
+ "outputs": [
471
+ {
472
+ "name": "stdout",
473
+ "output_type": "stream",
474
+ "text": [
475
+ "+--------+------------+---------+---------+\n",
476
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
477
+ "+--------+------------+---------+---------+\n",
478
+ "| 1| Avalon|$78401.95| 54|\n",
479
+ "| 1| Avalon|$78401.95| 54|\n",
480
+ "| 3| Golf|$61274.55| 55|\n",
481
+ "| 237| Golf|$66249.75| 55|\n",
482
+ "| 330| Golf|$82099.83| 55|\n",
483
+ "+--------+------------+---------+---------+\n",
484
+ "\n"
485
+ ]
486
+ }
487
+ ],
488
+ "source": [
489
+ "df_carros.where((col(\"id_carro\") == '1') | (col(\"modelo_carro\") == 'Golf')).show()"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": 16,
495
+ "metadata": {},
496
+ "outputs": [
497
+ {
498
+ "name": "stdout",
499
+ "output_type": "stream",
500
+ "text": [
501
+ "+--------+------------+---------+---------+\n",
502
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
503
+ "+--------+------------+---------+---------+\n",
504
+ "| 1| Avalon|$78401.95| 54|\n",
505
+ "| 1| Avalon|$78401.95| 54|\n",
506
+ "+--------+------------+---------+---------+\n",
507
+ "\n"
508
+ ]
509
+ }
510
+ ],
511
+ "source": [
512
+ "# Também:\n",
513
+ "\n",
514
+ "df_carros.where(df_carros[\"id_carro\"]== \"1\").show()\n"
515
+ ]
516
+ },
517
+ {
518
+ "cell_type": "code",
519
+ "execution_count": 17,
520
+ "metadata": {},
521
+ "outputs": [
522
+ {
523
+ "name": "stdout",
524
+ "output_type": "stream",
525
+ "text": [
526
+ "+--------+-------------------+---------+---------+\n",
527
+ "|id_carro| modelo_carro| preco|cod_marca|\n",
528
+ "+--------+-------------------+---------+---------+\n",
529
+ "| 5| Escort|$77466.89| 17|\n",
530
+ "| 6| Expedition|$84698.71| 17|\n",
531
+ "| 33| Mustang|$49088.78| 17|\n",
532
+ "| 46| Crown Victoria|$91605.58| 17|\n",
533
+ "| 52| Freestar|$52971.00| 17|\n",
534
+ "| 59| LTD Crown Victoria|$94694.22| 17|\n",
535
+ "| 94| E-Series|$54544.30| 17|\n",
536
+ "| 97|Explorer Sport Trac|$77874.85| 17|\n",
537
+ "| 118| Taurus|$80544.37| 17|\n",
538
+ "| 126| Taurus|$66074.65| 17|\n",
539
+ "| 141| Mustang|$51926.66| 17|\n",
540
+ "| 168| Escort|$53343.40| 17|\n",
541
+ "| 176| Thunderbird|$62453.21| 17|\n",
542
+ "| 199| Econoline E250|$62348.12| 17|\n",
543
+ "| 206| Mustang|$93492.30| 17|\n",
544
+ "| 209| Bronco|$67474.88| 17|\n",
545
+ "| 215| Five Hundred|$69267.39| 17|\n",
546
+ "| 221| F250|$98421.52| 17|\n",
547
+ "| 230| Thunderbird|$78179.55| 17|\n",
548
+ "| 231| Aerostar|$83764.41| 17|\n",
549
+ "+--------+-------------------+---------+---------+\n",
550
+ "only showing top 20 rows\n",
551
+ "\n"
552
+ ]
553
+ }
554
+ ],
555
+ "source": [
556
+ "# Posso até criar um DataFrame:\n",
557
+ "\n",
558
+ "df_carros_marca = df_carros.where(df_carros[\"cod_marca\"] == \"17\").show()"
559
+ ]
560
+ },
561
+ {
562
+ "attachments": {},
563
+ "cell_type": "markdown",
564
+ "metadata": {},
565
+ "source": [
566
+ "# <font color=\"red\">Duplicados e replace</font>"
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "code",
571
+ "execution_count": 3,
572
+ "metadata": {},
573
+ "outputs": [
574
+ {
575
+ "name": "stdout",
576
+ "output_type": "stream",
577
+ "text": [
578
+ "+--------+------------+---------+---------+\n",
579
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
580
+ "+--------+------------+---------+---------+\n",
581
+ "| 1| Avalon|$78401.95| 54|\n",
582
+ "| 1| Avalon|$78401.95| 54|\n",
583
+ "| 2| RDX|$95987.38| 1|\n",
584
+ "| 3| Golf|$61274.55| 55|\n",
585
+ "| 4| EX|$84981.12| 23|\n",
586
+ "+--------+------------+---------+---------+\n",
587
+ "only showing top 5 rows\n",
588
+ "\n"
589
+ ]
590
+ }
591
+ ],
592
+ "source": [
593
+ "df_carros= spark.read.format(\"csv\").option(\"header\", True).load(\"./data/modelo_carro.csv\")\n",
594
+ "\n",
595
+ "df_carros.show(5)"
596
+ ]
597
+ },
598
+ {
599
+ "cell_type": "code",
600
+ "execution_count": 25,
601
+ "metadata": {},
602
+ "outputs": [
603
+ {
604
+ "name": "stdout",
605
+ "output_type": "stream",
606
+ "text": [
607
+ "+--------+------------+---------+---------+\n",
608
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
609
+ "+--------+------------+---------+---------+\n",
610
+ "| 1| Avalon|$78401.95| 54|\n",
611
+ "| 1| Avalon|$78401.95| 54|\n",
612
+ "+--------+------------+---------+---------+\n",
613
+ "\n"
614
+ ]
615
+ }
616
+ ],
617
+ "source": [
618
+ "df_carros.where(df_carros['id_carro'] == '1').show()"
619
+ ]
620
+ },
621
+ {
622
+ "cell_type": "code",
623
+ "execution_count": 26,
624
+ "metadata": {},
625
+ "outputs": [
626
+ {
627
+ "name": "stdout",
628
+ "output_type": "stream",
629
+ "text": [
630
+ "+--------+------------+---------+---------+\n",
631
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
632
+ "+--------+------------+---------+---------+\n",
633
+ "| 81| 928|$75144.55| 44|\n",
634
+ "| 87| Truck|$57007.15| 39|\n",
635
+ "| 319| Vision|$80349.11| 15|\n",
636
+ "+--------+------------+---------+---------+\n",
637
+ "only showing top 3 rows\n",
638
+ "\n"
639
+ ]
640
+ }
641
+ ],
642
+ "source": [
643
+ "df_carros_duplicates = df_carros.distinct() # Com isto removemos as linhas duplicadas \n",
644
+ "\n",
645
+ "# Posso usar também (fazem a mesma coisa) --> df_carros_duplicates = df_carros.dropDuplicates()\n",
646
+ "df_carros_duplicates.show(3)\n"
647
+ ]
648
+ },
649
+ {
650
+ "cell_type": "code",
651
+ "execution_count": 27,
652
+ "metadata": {},
653
+ "outputs": [
654
+ {
655
+ "name": "stdout",
656
+ "output_type": "stream",
657
+ "text": [
658
+ "+--------+------------+---------+---------+\n",
659
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
660
+ "+--------+------------+---------+---------+\n",
661
+ "| 1| Avalon|$78401.95| 54|\n",
662
+ "+--------+------------+---------+---------+\n",
663
+ "\n"
664
+ ]
665
+ }
666
+ ],
667
+ "source": [
668
+ "df_carros_duplicates.where(df_carros_duplicates['id_carro'] == '1').show()"
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "code",
673
+ "execution_count": 15,
674
+ "metadata": {},
675
+ "outputs": [
676
+ {
677
+ "name": "stdout",
678
+ "output_type": "stream",
679
+ "text": [
680
+ "+--------+------------+--------+---------+\n",
681
+ "|id_carro|modelo_carro| preco|cod_marca|\n",
682
+ "+--------+------------+--------+---------+\n",
683
+ "| 1| Avalon|78401.95| 54|\n",
684
+ "| 1| Avalon|78401.95| 54|\n",
685
+ "| 2| RDX|95987.38| 1|\n",
686
+ "+--------+------------+--------+---------+\n",
687
+ "only showing top 3 rows\n",
688
+ "\n"
689
+ ]
690
+ }
691
+ ],
692
+ "source": [
693
+ "# Vamos remover o \"$\":\n",
694
+ "from pyspark.sql.functions import * # posso usar * ou regexp_replace\n",
695
+ "\n",
696
+ "df_carros_cifrao = df_carros\n",
697
+ "\n",
698
+ "df_carros_cifrao = df_carros_cifrao.withColumn(\"preco\", regexp_replace(\"preco\", \"\\\\$\", \"\")) # Tem que colocar a contrabarra \"\\\" para reconhecer o caractere especial\n",
699
+ "\n",
700
+ "df_carros_cifrao = df_carros_cifrao.withColumn(\"preco\", col(\"preco\").cast(\"float\")) # Troquei a coluna \"preco\" de str para float (32bits - 7 dígitos decimais) ou double (64bits - 15 ou 16 dígitos decimais).\n",
701
+ "\n",
702
+ "df_carros_cifrao.show(3)"
703
+ ]
704
+ },
705
+ {
706
+ "cell_type": "code",
707
+ "execution_count": 16,
708
+ "metadata": {},
709
+ "outputs": [
710
+ {
711
+ "name": "stdout",
712
+ "output_type": "stream",
713
+ "text": [
714
+ "root\n",
715
+ " |-- id_carro: string (nullable = true)\n",
716
+ " |-- modelo_carro: string (nullable = true)\n",
717
+ " |-- preco: float (nullable = true)\n",
718
+ " |-- cod_marca: string (nullable = true)\n",
719
+ "\n"
720
+ ]
721
+ }
722
+ ],
723
+ "source": [
724
+ "df_carros_cifrao.printSchema()"
725
+ ]
726
+ },
727
+ {
728
+ "attachments": {},
729
+ "cell_type": "markdown",
730
+ "metadata": {},
731
+ "source": [
732
+ "# <font color=\"red\">Tipagem de Dados</font>"
733
+ ]
734
+ },
735
+ {
736
+ "cell_type": "code",
737
+ "execution_count": null,
738
+ "metadata": {},
739
+ "outputs": [],
740
+ "source": []
741
+ }
742
+ ],
743
+ "metadata": {
744
+ "kernelspec": {
745
+ "display_name": "venv_spark",
746
+ "language": "python",
747
+ "name": "python3"
748
+ },
749
+ "language_info": {
750
+ "codemirror_mode": {
751
+ "name": "ipython",
752
+ "version": 3
753
+ },
754
+ "file_extension": ".py",
755
+ "mimetype": "text/x-python",
756
+ "name": "python",
757
+ "nbconvert_exporter": "python",
758
+ "pygments_lexer": "ipython3",
759
+ "version": "3.10.6"
760
+ },
761
+ "orig_nbformat": 4
762
+ },
763
+ "nbformat": 4,
764
+ "nbformat_minor": 2
765
+ }