Spaces:
Sleeping
Sleeping
File size: 6,081 Bytes
30a3a04 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import json
import re
import time
from urllib.parse import quote, unquote, urljoin

import requests
from bs4 import BeautifulSoup
class SoundgasmScraper:
    """Find and describe audio posts hosted on Soundgasm.net.

    All HTTP traffic goes through a single ``requests.Session`` that sends a
    desktop-browser User-Agent header.  The public methods are best-effort:
    on failure they print a message and return an empty result (``[]`` or
    ``None``) instead of raising.
    """

    # Seconds allowed for each HTTP request; without a timeout a stalled
    # server would block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    # A quoted string ending in ".m4a", as it appears in inline JavaScript.
    _AUDIO_URL_RE = re.compile(r'["\']([^"\']*\.m4a)["\']')

    # Username and optional audio slug of a "/u/<user>/<slug>" URL; the query
    # string and fragment are excluded from both groups.
    _AUDIO_PATH_RE = re.compile(r'/u/([^/?#]+)(?:/([^?#]*))?')

    def __init__(self):
        """Create the HTTP session and set the browser-like User-Agent."""
        self.base_url = "https://soundgasm.net"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def search_audio(self, query, max_results=10):
        """
        Search for audio on Soundgasm.net

        Since Soundgasm doesn't have a built-in search, a Google
        ``site:soundgasm.net`` query is issued and the Soundgasm links found
        in the result page are looked up one by one.

        Args:
            query: Free-text search terms.
            max_results: Maximum number of result links to look up.

        Returns:
            A list of dicts as produced by ``get_audio_info``, in the order
            the links appeared in the search page.  Empty if the search
            failed or nothing was found.
        """
        results = []
        search_query = f"site:soundgasm.net {query}"
        google_url = f"https://www.google.com/search?q={quote(search_query)}"
        try:
            response = self.session.get(google_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            hrefs = [anchor.get('href') for anchor in soup.find_all('a', href=True)]
            for link in self._select_audio_links(hrefs, max_results):
                audio_info = self.get_audio_info(link)
                if audio_info:
                    results.append(audio_info)
        except Exception as e:
            # Deliberate best-effort boundary: report and return what we have.
            print(f"Search error: {e}")
        return results

    def get_audio_info(self, url):
        """
        Extract audio information from a Soundgasm page

        Args:
            url: Address of the audio page.

        Returns:
            A dict with the keys ``title``, ``audio_title``, ``username``,
            ``description``, ``url``, ``audio_url`` and ``duration`` (always
            ``None``), or ``None`` if the page could not be fetched or parsed.
        """
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            # An error page must not be reported as if it were an audio post.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            title_elem = soup.find('title')
            title = title_elem.text.strip() if title_elem else "Unknown Title"

            desc_elem = soup.find('div', class_='jp-description')
            description = desc_elem.get_text(strip=True) if desc_elem else ""

            # The media file address is only present inside inline JavaScript.
            audio_url = self._find_audio_url(
                script.string for script in soup.find_all('script')
            )
            username, audio_title = self._parse_audio_url(url)
            return {
                'title': title,
                'audio_title': audio_title,
                'username': username,
                'description': description,
                'url': url,
                'audio_url': audio_url,
                'duration': None  # Would need to download file to get duration
            }
        except Exception as e:
            # Deliberate best-effort boundary: callers skip ``None`` results.
            print(f"Error getting audio info for {url}: {e}")
            return None

    def search_by_username(self, username):
        """
        Get all audios from a specific user

        Args:
            username: Soundgasm account name as it appears in ``/u/<name>``.

        Returns:
            A list of dicts as produced by ``get_audio_info``, one per
            distinct audio link on the user's page; empty on failure.
        """
        user_url = f"{self.base_url}/u/{username}"
        try:
            response = self.session.get(user_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            hrefs = [anchor.get('href') for anchor in soup.find_all('a', href=True)]
            results = []
            for link in self._select_user_audio_links(hrefs, username):
                audio_info = self.get_audio_info(link)
                if audio_info:
                    results.append(audio_info)
            return results
        except Exception as e:
            # Deliberate best-effort boundary: report and return nothing.
            print(f"Error searching by username {username}: {e}")
            return []

    @staticmethod
    def _select_audio_links(hrefs, max_results):
        """Return up to *max_results* distinct Soundgasm URLs from *hrefs*, in input order."""
        links = []
        for href in hrefs:
            if not href:
                continue
            if href.startswith('/url?q='):
                # Google wraps targets as "/url?q=<percent-encoded url>&...";
                # unwrap and decode before testing the host.
                href = unquote(href[len('/url?q='):].split('&', 1)[0])
            if href.startswith('http') and 'soundgasm.net/u/' in href:
                links.append(href)
        # dict.fromkeys drops duplicates while keeping first-seen order, so
        # the max_results cut keeps the top-ranked links.
        return list(dict.fromkeys(links))[:max_results]

    def _select_user_audio_links(self, hrefs, username):
        """Return the distinct absolute audio URLs of *username* found in *hrefs*, in input order."""
        marker = f'/u/{username}/'
        profile_suffix = f'/u/{username}'
        links = []
        for href in hrefs:
            if not href or marker not in href:
                continue
            # "/u/<name>/" is the profile page itself, not an audio post.
            if href.rstrip('/').endswith(profile_suffix):
                continue
            links.append(urljoin(self.base_url, href))
        return list(dict.fromkeys(links))

    def _find_audio_url(self, script_texts):
        """Return the first ``.m4a`` URL quoted in *script_texts* as an absolute URL, or ``None``."""
        for text in script_texts:
            if not text:
                continue
            match = self._AUDIO_URL_RE.search(text)
            if match:
                audio_url = match.group(1)
                if not audio_url.startswith('http'):
                    audio_url = urljoin(self.base_url, audio_url)
                return audio_url
        return None

    @classmethod
    def _parse_audio_url(cls, url):
        """Return ``(username, audio_title)`` taken from a ``/u/<user>/<slug>`` URL; missing parts are ``""``."""
        match = cls._AUDIO_PATH_RE.search(url)
        if not match:
            return "", ""
        username = match.group(1)
        slug = (match.group(2) or "").rstrip('/')
        # Decode percent-escapes and turn slug separators into spaces so the
        # title reads naturally.
        audio_title = unquote(slug).replace('-', ' ').replace('_', ' ')
        return username, audio_title
# Manual smoke test: run one live search and print every hit that came back.
if __name__ == "__main__":
    scraper = SoundgasmScraper()

    print("Testing search functionality...")
    hits = scraper.search_audio("ASMR", max_results=3)

    for position, hit in enumerate(hits, start=1):
        # One print per hit for the fixed fields; the output is line-for-line
        # the same as printing each field separately.
        print("\n".join([
            f"\n--- Result {position} ---",
            f"Title: {hit['title']}",
            f"Audio Title: {hit['audio_title']}",
            f"Username: {hit['username']}",
            f"URL: {hit['url']}",
            f"Audio URL: {hit['audio_url']}",
        ]))

        # Show at most the first 100 characters of the description.
        description = hit['description']
        if description:
            print(f"Description: {description[:100]}...")
        else:
            print("No description")
|