# NOTE(review): removed file-viewer artifacts ("File size: 6,081 Bytes" and a
# commit hash) that were pasted into the source and would not parse as Python.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# NOTE(review): removed a stray line-number gutter (bare integers 1-164) that
# was copied in from a file viewer alongside the code below.
import requests
from bs4 import BeautifulSoup
import re
import json
from urllib.parse import urljoin, quote
import time

class SoundgasmScraper:
    """Scrape audio metadata from Soundgasm.net.

    Soundgasm has no public API or on-site search, so discovery goes through
    a Google "site:soundgasm.net" query, and per-audio metadata is extracted
    from each audio page's HTML and inline JavaScript.
    """

    def __init__(self, timeout=10):
        """
        Create a scraper with a shared HTTP session.

        timeout: per-request timeout in seconds applied to every HTTP call
                 (without it, a stalled server would hang the scraper forever).
        """
        self.base_url = "https://soundgasm.net"
        self.timeout = timeout
        self.session = requests.Session()
        # A browser-like User-Agent reduces the chance of being served a
        # bot-detection page instead of real results.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def search_audio(self, query, max_results=10):
        """
        Search for audio on Soundgasm.net
        Since Soundgasm doesn't have a built-in search, we'll use external search engines

        Returns a list of metadata dicts (see get_audio_info); empty on error.
        """
        results = []

        # Use Google to search for Soundgasm content
        search_query = f"site:soundgasm.net {query}"
        google_url = f"https://www.google.com/search?q={quote(search_query)}"

        try:
            response = self.session.get(google_url, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract Soundgasm links from Google search results
            soundgasm_links = []
            for link in soup.find_all('a', href=True):
                href = link.get('href')
                if not href or 'soundgasm.net/u/' not in href:
                    continue
                # Google wraps targets as /url?q=<target>&...; unwrap them.
                if href.startswith('/url?q='):
                    href = href.split('/url?q=')[1].split('&')[0]
                if href.startswith('http') and 'soundgasm.net/u/' in href:
                    soundgasm_links.append(href)

            # Deduplicate while preserving first-seen order — list(set(...))
            # would shuffle the results nondeterministically — then cap.
            soundgasm_links = list(dict.fromkeys(soundgasm_links))[:max_results]

            # Get details for each audio
            for link in soundgasm_links:
                audio_info = self.get_audio_info(link)
                if audio_info:
                    results.append(audio_info)

        except Exception as e:
            print(f"Search error: {e}")

        return results

    def get_audio_info(self, url):
        """
        Extract audio information from a Soundgasm page

        Returns a dict with title, audio_title, username, description, url,
        audio_url and duration (always None), or None on any failure.
        """
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Page <title> doubles as the display title.
            title_elem = soup.find('title')
            title = title_elem.text.strip() if title_elem else "Unknown Title"

            # Author-provided description, when present.
            description = ""
            desc_elem = soup.find('div', class_='jp-description')
            if desc_elem:
                description = desc_elem.get_text(strip=True)

            # The actual .m4a file URL lives in an inline <script>, not in
            # the HTML markup, so scan script bodies for it.
            audio_url = None
            for script in soup.find_all('script'):
                if not script.string:
                    continue
                match = re.search(r'["\']([^"\']*\.m4a)["\']', script.string)
                if match:
                    audio_url = match.group(1)
                    if not audio_url.startswith('http'):
                        audio_url = urljoin(self.base_url, audio_url)
                    break

            # Username and audio slug are encoded in the page URL itself.
            username = ""
            url_match = re.search(r'/u/([^/]+)/', url)
            if url_match:
                username = url_match.group(1)

            audio_title = ""
            title_match = re.search(r'/u/[^/]+/(.+)$', url)
            if title_match:
                audio_title = title_match.group(1).replace('-', ' ').replace('_', ' ')

            return {
                'title': title,
                'audio_title': audio_title,
                'username': username,
                'description': description,
                'url': url,
                'audio_url': audio_url,
                'duration': None  # Would need to download file to get duration
            }

        except Exception as e:
            print(f"Error getting audio info for {url}: {e}")
            return None

    def search_by_username(self, username):
        """
        Get all audios from a specific user

        Returns a list of metadata dicts (see get_audio_info); empty on error.
        """
        user_url = f"{self.base_url}/u/{username}"
        try:
            response = self.session.get(user_url, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Keep only links *below* the user page (individual audios),
            # not the profile link itself.
            audio_links = []
            for link in soup.find_all('a', href=True):
                href = link.get('href')
                if href and f'/u/{username}/' in href and href != f'/u/{username}':
                    audio_links.append(urljoin(self.base_url, href))

            # Deduplicate so each audio page is fetched exactly once.
            results = []
            for link in dict.fromkeys(audio_links):
                audio_info = self.get_audio_info(link)
                if audio_info:
                    results.append(audio_info)

            return results

        except Exception as e:
            print(f"Error searching by username {username}: {e}")
            return []

# Manual smoke test: exercises the Google-backed search path end to end.
# Requires network access; prints results rather than asserting.
if __name__ == "__main__":
    scraper = SoundgasmScraper()

    # Test search
    print("Testing search functionality...")
    results = scraper.search_audio("ASMR", max_results=3)

    for i, result in enumerate(results, 1):
        print(f"\n--- Result {i} ---")
        print(f"Title: {result['title']}")
        print(f"Audio Title: {result['audio_title']}")
        print(f"Username: {result['username']}")
        print(f"URL: {result['url']}")
        print(f"Audio URL: {result['audio_url']}")
        # Explicit branch instead of a ternary buried inside print(): the
        # original one-liner read as if the whole print were conditional.
        if result['description']:
            print(f"Description: {result['description'][:100]}...")
        else:
            print("No description")