BowoZZZ committed on
Commit
e318dbf
·
verified ·
1 Parent(s): 7c81c68

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +27 -0
  2. main.py +281 -0
  3. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a lightweight Python base image
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Copy requirements first and install dependencies (better layer caching)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# --- THIS SECTION WAS MISSING FROM THE ORIGINAL FILE ---

# 1. Copy the whole project (including main.py) into the container
COPY . .

# 2. Create a non-root user to comply with the Hugging Face security policy
#    (required to avoid "Permission Denied" errors)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# 3. Expose port 7860 (mandatory port for HF Spaces)
EXPOSE 7860

# 4. Main command that starts the server when the Space launches
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, Query
2
+ import httpx
3
+ from bs4 import BeautifulSoup
4
+ import uvicorn
5
+ import os
6
+ from urllib.parse import unquote, urlparse, parse_qs
7
+ from contextlib import asynccontextmanager
8
+ import asyncio
9
+ import re
10
+
11
# Shared async HTTP client; created at startup by the lifespan handler below.
client = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared httpx client on startup; close it on shutdown.

    Fix: the original file defined ``lifespan`` twice. The first copy had no
    ``yield``/``aclose`` (dead code shadowed by the second); only the complete
    definition is kept here.
    """
    global client
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }
    # Timeout disabled so slow connections do not raise.
    # NOTE(review): verify=False disables TLS certificate checks — insecure;
    # confirm this is intentional for the scraped site.
    client = httpx.AsyncClient(headers=headers, verify=False, follow_redirects=True, timeout=None)
    yield
    await client.aclose()

app = FastAPI(title="Simontok Scraper", lifespan=lifespan)

# Canonical site domain used to resolve relative links.
BASE_DOMAIN = "https://simontokx.tv"
38
+
39
def unwrap_google_url(url: str) -> str:
    """Strip the Google Translate proxy wrapper from *url*.

    Handles three wrapper forms:
    - ``.../website?u=<real-url>`` redirect wrappers (unwrapped recursively),
    - the ``simontokx-tv.translate.goog`` proxy hostname,
    - trailing ``_x_tr_*`` translate query parameters.
    Relative URLs (leading ``/``) are resolved against BASE_DOMAIN.
    Returns "" for a falsy input.
    """
    if not url:
        return ""
    clean = unquote(url)

    # Decode URLs wrapped in the /website?u=... redirect format
    if "google" in clean and "/website" in clean and "u=" in clean:
        try:
            parsed = urlparse(clean)
            qs = parse_qs(parsed.query)
            if 'u' in qs:
                return unwrap_google_url(qs['u'][0])
        except Exception:
            # Fix: was a bare `except:` which also swallowed SystemExit and
            # KeyboardInterrupt. A malformed wrapper simply falls through to
            # the in-place cleanup below.
            pass

    # Normalize the translate-proxy hostname back to the real domain
    clean = clean.replace("simontokx-tv.translate.goog", "simontokx.tv")

    # Drop Google Translate query parameters
    clean = clean.split("?_x_tr_")[0]
    clean = clean.split("&_x_tr_")[0]

    # Resolve relative URLs against the site root
    if clean.startswith("/"):
        clean = BASE_DOMAIN + clean
    return clean
65
+
66
async def fetch_until_success(url: str, validator_func) -> BeautifulSoup:
    """Request *url* repeatedly until ``validator_func(soup)`` returns True.

    If the translate proxy answers 429 (Too Many Requests), retry immediately
    against the unwrapped direct URL. Network errors and invalid pages are
    retried after a short pause.

    NOTE(review): this never gives up — if the validator can never pass, the
    coroutine loops forever; confirm an upper retry bound is not wanted.
    """
    current_url = url

    while True:
        try:
            res = await client.get(current_url)

            # Rate-limited on the translate proxy: switch to the direct URL
            # and retry at once (no pause — it is a different host).
            if res.status_code == 429 and "translate.goog" in current_url:
                current_url = unwrap_google_url(current_url)
                continue

            soup = BeautifulSoup(res.text, 'html.parser')
            if validator_func(soup):
                return soup
        except Exception:
            pass
        # Fix: the original retried in a tight loop with no delay, hammering
        # the host on persistent failure. Pause briefly between attempts.
        await asyncio.sleep(1)
87
+
88
async def process_item_fully(name, detail_url, image):
    """Process one video item: fetch its detail page via the translate proxy,
    then extract the download link and tags.

    Returns a result dict, or None if scraping the detail page fails.

    Fix: the original wrapped this body in ``while True:`` although every path
    returns on the first iteration (success returns the dict, the ``except``
    returns None) — the loop was dead code and has been removed.
    """
    try:
        # Route the detail page through the Google Translate proxy
        # (simontokx.tv -> simontokx-tv.translate.goog)
        target_detail_url = detail_url.replace("https://simontokx.tv", "https://simontokx-tv.translate.goog")
        if "?" not in target_detail_url:
            target_detail_url += "?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"
        else:
            target_detail_url += "&_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"

        # 1. Fetch the detail page
        def detail_page_valid(s):
            # The page is valid if it has either a title or a video player
            title_exist = bool(s.select('h3.single-title'))
            player_exist = bool(s.select('#main-video-player'))
            return title_exist or player_exist

        app_soup = await fetch_until_success(target_detail_url, detail_page_valid)

        # 2. Extract the download link:
        #    look for an <a> whose text contains DOWNLOAD
        download_link = ""
        download_btn = app_soup.find('a', string=re.compile(r'DOWNLOAD', re.IGNORECASE))

        # Fallback selector if the text does not match exactly
        if not download_btn:
            download_btn = app_soup.select_one('a.btn.btn-primary[href^="http"]')

        if download_btn and download_btn.get('href'):
            download_link = unwrap_google_url(download_btn['href'])

        # 3. Extract tags
        #    Located under <h4 class="sidebar-title"><span class="highlight">Tags</span></h4>,
        #    rendered as <a href="/?id=..." class="btn btn-default">tagname</a>
        tags_list = []
        tags_header = app_soup.find('h4', class_='sidebar-title')
        if tags_header:
            # Safe approach: select every a.btn.btn-default on the page and
            # keep only those whose href carries an id= query parameter.
            potential_tags = app_soup.select('a.btn.btn-default')
            for tag in potential_tags:
                if "id=" in tag.get('href', ''):
                    tag_text = tag.get_text(strip=True)
                    if tag_text:
                        tags_list.append(tag_text)

        tags_str = ", ".join(tags_list)

        return {
            "name": name,
            "link": unwrap_google_url(detail_url),  # return the original (unproxied) link
            "image": image,
            "download": download_link,
            "tags": tags_str
        }

    except Exception:
        # On failure we skip the item rather than retrying, to avoid being
        # stuck forever on a broken page.
        return None
155
+
156
@app.get("/")
async def root():
    """Landing endpoint: short API description plus a sample query URL."""
    info = {
        "message": "Search API for Simontokx.tv by Bowo",
        "example_usage": "/search?query=minecraft&limit=5",
    }
    return info
162
+
163
@app.get("/search")
async def search_apps(
    query: str = Query(..., description="Video query"),
    limit: int = Query(5, description="Limit results")
):
    """Search videos by keyword, following pagination until *limit* items are
    collected, then fetch every item's detail page concurrently.

    Returns a JSON envelope: success flag, echoed query/limit, result count,
    and the list of result dicts produced by process_item_fully.
    """
    tasks = []

    # Simontok pagination pattern:
    #   Page 1: /?id={query}
    #   Page 2: /page/tags/{query}/pages/1
    #   Page 3: /page/tags/{query}/pages/2
    # 'current_page' counts from 0 for the first page.
    current_page = 0

    while True:
        # Build the URL through the Google Translate proxy
        # base: https://simontokx-tv.translate.goog

        if current_page == 0:
            path_query = f"?id={query}"
        else:
            # Per the site's HTML, page 2 is .../pages/1,
            # so the path number equals current_page.
            path_query = f"/page/tags/{query}/pages/{current_page}"

        search_url = f"https://simontokx-tv.translate.goog/{path_query}"

        # Append the translate-proxy params if the path does not carry them yet
        if "?" in search_url:
            search_url += "&_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"
        else:
            search_url += "?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"

        # Validator for the search results page
        def search_page_valid(s):
            # Valid when result items exist...
            has_items = bool(s.select('.thumb-block'))
            text_content = s.get_text().lower()
            # ...or an explicit empty/not-found message is shown
            no_result = "not found" in text_content or "tidak ditemukan" in text_content
            return has_items or no_result

        soup = await fetch_until_success(search_url, search_page_valid)

        # Stop when the page has no results at all
        items = soup.select('.thumb-block')
        if not items:
            if current_page == 0:
                return {
                    "success": True,
                    "query": query,
                    "limit": limit,
                    "count": 0,
                    "results": []
                }
            else:
                break

        # 1. Collect items from this page
        for item in items:
            # Anchor element inside the thumbnail
            link_el = item.select_one('.thumb a')
            if not link_el: continue

            detail_link_raw = link_el.get('href')
            detail_link = unwrap_google_url(detail_link_raw)

            # Thumbnail image
            img_el = link_el.select_one('img')
            image = ""
            name = "Unknown"

            if img_el:
                # Simontok lazy-loads images (data-src); fall back to src
                image = img_el.get('data-src') or img_el.get('src')
                image = unwrap_google_url(image)
                # The title lives in the img 'title' attribute
                name = img_el.get('title') or "No Title"

            # Skip fake "Page Not Found" entries and ads
            if "/watch/" not in detail_link:
                continue

            tasks.append(process_item_fully(name, detail_link, image))

            # Stop collecting once we have enough tasks
            if len(tasks) >= limit:
                break

        # Fix: the original only broke the inner loop when the limit was
        # reached, so the outer loop kept following "Next" pages — fetching
        # one extra page (and queuing one extra task) per page until
        # pagination ran out. Stop paginating as soon as we have enough.
        if len(tasks) >= limit:
            break

        # 2. Smart navigation check (Next button):
        #    <a href="..." class="no-page next-page">Next</a>
        next_btn = soup.select_one('a.next-page')

        if next_btn:
            # Increment manually because the URL is constructed above
            current_page += 1
        else:
            break

    # Run every detail-fetching task concurrently
    raw_results = await asyncio.gather(*tasks)

    # Drop failed items (None) and trim to the requested limit
    results = [res for res in raw_results if res is not None]
    results = results[:limit]

    return {
        "success": True,
        "query": query,
        "limit": limit,
        "count": len(results),
        "results": results
    }
278
+
279
# Local development entry point (HF Spaces starts the server via the
# Dockerfile CMD instead).
if __name__ == "__main__":
    serve_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=serve_port)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ httpx
4
+ beautifulsoup4