pn23 committed on
Commit
20a50c3
·
verified ·
1 Parent(s): b8adbb9

Create scraping.py

Browse files
Files changed (1) hide show
  1. scraping.py +118 -0
scraping.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Download one song from suno.com with Selenium and store it in a Delta table.

The script signs in to suno.com through the Discord social-login button,
opens the song "Samba Kickoff", downloads its audio into ``./music`` and
appends the MP3 bytes to the ``mp3_table`` Delta table via Spark.
"""

import os
import tempfile
import time

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def _wait_for_download(path, timeout=60.0, poll_interval=0.5):
    """Block until *path* exists as a non-empty file.

    Chrome is expected to write downloads to a temporary ``.crdownload``
    file and rename it when finished, so the final name appearing is taken
    as the completion signal.

    Raises:
        TimeoutError: if the file has not appeared within *timeout* seconds.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if os.path.isfile(path) and os.path.getsize(path) > 0:
            return
        time.sleep(poll_interval)
    raise TimeoutError(f"Download did not finish within {timeout} s: {path}")


# Set the download directory to the "music" subfolder within the current directory
download_directory = os.path.join(os.getcwd(), "music")
os.makedirs(download_directory, exist_ok=True)

# Path the downloaded MP3 is expected to be saved under.
# NOTE: a copy left over from a previous run satisfies the download wait
# immediately (Chrome would save the new file under a "(1)" suffix), so clear
# the folder between runs if a fresh download is required.
retrieve_directory = os.path.join(download_directory, "Samba Kickoff.mp3")

# Credentials for the Discord login (account: applebottom_12).
# NOTE(review): secrets should not live in source control. The environment
# variables take precedence; the literals remain only as a fallback so the
# script behaves as before when the variables are unset.
login_email = os.environ.get("SUNO_DISCORD_EMAIL", "asfasfasfgasdfasgfsag@gmail.com")
login_password = os.environ.get("SUNO_DISCORD_PASSWORD", "AppleBottom12")

# Set up Chrome options so downloads land in download_directory without a prompt
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_directory,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
})

# Set up the Selenium WebDriver (e.g., Chrome)
driver = webdriver.Chrome(options=chrome_options)
try:
    # One explicit wait (10 s timeout) shared by every step below.
    wait = WebDriverWait(driver, 10)

    # Navigate to the website
    driver.get("https://suno.com/me")

    # Wait for the Discord sign-in button to be clickable and click it
    sign_in_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#__next > div > div > div > div > div.cl-main.🔒️.cl-internal-xk295g > div > button.cl-socialButtonsIconButton.cl-socialButtonsIconButton__discord.🔒️.cl-internal-855i1h"))
    )
    sign_in_button.click()

    # Wait for the username field to be visible and enter the username
    username_field = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "#uid_8"))
    )
    username_field.send_keys(login_email)

    # Find the password field and enter the password
    password_field = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "#uid_10"))
    )
    password_field.send_keys(login_password)

    # Find the login (submit) button and click it
    password_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#app-mount > div.appAsidePanelWrapper__5e6e2 > div.notAppAsidePanel__95814 > div.app_b1f720 > div > div > div > div > form > div.centeringWrapper__5e247 > div > div.mainLoginContainer_f58870 > div.block__681fa.marginTop20__7e0ad > button.marginBottom8_ce1fb9.button__5573c.button__581d0.lookFilled__950dd.colorBrand__27d57.sizeLarge_b395a7.fullWidth_fdb23d.grow__4c8a4"))
    )
    password_button.click()

    # Wait for the page to load after signing in
    wait.until(EC.url_contains("https://suno.com/me"))

    # Click on the specific song
    song_link = wait.until(
        EC.element_to_be_clickable((By.LINK_TEXT, "Samba Kickoff"))
    )
    song_link.click()

    # Play the song
    play_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "body > div.css-fhtuey > div.css-bhm5u7 > div > div.css-l9hfgy > div.css-144pizt > button.chakra-button.css-15rci1t"))
    )
    play_button.click()

    time.sleep(3)

    # Open the "More Actions" (three dots) menu
    three_dots = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="More Actions"]'))
    )
    three_dots.click()

    # Wait for the Download button to be clickable and click on it
    download_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//div[@role="menuitem" and contains(text(), "Download")]'))
    )
    download_button.click()

    # Wait for the Audio element to be clickable and click on it
    audio_element = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//div[@role="menuitem" and contains(text(), "Audio")]'))
    )
    audio_element.click()

    # Wait for the file itself rather than a fixed sleep, so the MP3 is
    # complete before Spark reads it.
    _wait_for_download(retrieve_directory)

    print("Successfully signed in!")
finally:
    # Always close the browser, including when a wait above times out.
    driver.quit()

# Create a SparkSession with Delta Lake configuration
builder = SparkSession.builder.appName("SaveMP3ToDatabricks") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Read the downloaded MP3 file as binary; binaryFiles yields (path, content)
# pairs, so take the content of the single matching file.
mp3_data = spark.sparkContext.binaryFiles(retrieve_directory).collect()[0][1]

# Create a DataFrame with the MP3 data
df = spark.createDataFrame([("Samba Kickoff.mp3", mp3_data,)], ["song_name", "mp3_data"])

# Save the DataFrame to a Databricks table
df.write.format("delta").mode("append").saveAsTable("mp3_table")

print("MP3 file saved to Databricks table.")