BowoZZZ committed on
Commit
1ba1a41
·
verified ·
1 Parent(s): 06e86c9

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +27 -0
  2. main.py +225 -0
  3. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a lightweight Python base image
FROM python:3.9-slim

# Set the working directory
WORKDIR /app

# Copy the requirements file and install dependencies first (better layer caching)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the whole project (including main.py) into the container
COPY . .

# Create a non-root user, as required by Hugging Face Spaces' security policy
# (prevents "Permission Denied" errors at runtime)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Expose port 7860 (the mandatory port for HF Spaces)
EXPOSE 7860

# Main command: start the server when the Space launches
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import os
from contextlib import asynccontextmanager
from urllib.parse import parse_qs, quote_plus, unquote, urlparse

import httpx
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException, Query

10
# Shared async HTTP client; created on app startup inside `lifespan`.
client = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Open the shared httpx client on startup, close it on shutdown."""
    global client
    # Timeout is disabled (None) so slow connections wait indefinitely
    # instead of raising — the scraper retries forever anyway.
    # NOTE(review): verify=False disables TLS certificate checks — confirm intentional.
    client = httpx.AsyncClient(
        headers={
            "User-Agent": "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        },
        follow_redirects=True,
        verify=False,
        timeout=None,
    )
    yield
    await client.aclose()

app = FastAPI(title="AN1.com Aggressive Scraper (No-Fail Mode)", lifespan=lifespan)

BASE_DOMAIN = "https://an1.com"
28
+
29
def unwrap_google_url(url: str) -> str:
    """Strip the Google Translate proxy wrapper from *url*.

    Handles three layers of wrapping:
      1. ``.../website?...u=<real-url>`` redirector links (unwrapped recursively),
      2. ``*.translate.goog`` hostnames rewritten back to the real hosts,
      3. trailing ``_x_tr_*`` translation query parameters.

    Site-relative paths are resolved against BASE_DOMAIN; empty input yields "".
    """
    if not url:
        return ""
    clean = unquote(url)

    # Layer 1: Google redirector links carry the real URL in the `u` parameter.
    if "google" in clean and "/website" in clean and "u=" in clean:
        try:
            qs = parse_qs(urlparse(clean).query)
            if 'u' in qs:
                return unwrap_google_url(qs['u'][0])
        except Exception:
            # FIX: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt. Malformed URLs fall through to plain cleaning.
            pass

    # Layer 2: rewrite proxied hostnames back to the real domains.
    clean = clean.replace("an1-com.translate.goog", "an1.com")
    clean = clean.replace("files-an1-net.translate.goog", "files.an1.net")
    clean = clean.replace("file-an1-co.translate.goog", "file.an1.co")

    # Layer 3: drop translation query parameters.
    clean = clean.split("?_x_tr_")[0]
    clean = clean.split("&_x_tr_")[0]

    # Resolve site-relative paths.
    if clean.startswith("/"):
        clean = BASE_DOMAIN + clean
    return clean
52
+
53
async def fetch_until_success(url: str, validator_func) -> BeautifulSoup:
    """Fetch *url* repeatedly until the parsed page satisfies *validator_func*.

    Aggressive no-fail core: any connection/SSL error — or a page that fails
    validation — triggers an immediate retry, with no delay and no attempt cap.
    """
    while True:
        try:
            response = await client.get(url)
            page = BeautifulSoup(response.text, 'html.parser')
            # Only hand the page back once the caller's criteria are met.
            if validator_func(page):
                return page
        except Exception:
            # Swallow transport errors and loop straight back around.
            continue
69
+
70
async def scan_intermediate_page_loop(intermediate_url: str) -> str:
    """Scrape the intermediate download page until the real file link appears.

    Returns the unwrapped direct-download URL, or "" when no candidate passes
    the host filter (the caller treats "" as a signal to retry).
    """
    # Route the request through the Google Translate proxy.
    proxied = intermediate_url.replace("https://an1.com", "https://an1-com.translate.goog")
    if "?" not in proxied:
        proxied += "?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"

    def has_download_button(soup):
        # Valid when the #pre_download button, or any green button, has an href.
        primary = soup.select_one('a#pre_download')
        if primary and primary.get('href'):
            return True
        return any(b.get('href') for b in soup.select('a.btn-green'))

    # Retry loop for fetching the page (guaranteed valid when it returns).
    page = await fetch_until_success(proxied, has_download_button)

    # Collect every candidate href, #pre_download first.
    hrefs = []
    primary = page.select_one('a#pre_download')
    if primary:
        hrefs.append(primary['href'])
    hrefs.extend(b['href'] for b in page.select('a.btn-green'))

    # The first candidate pointing at the real file host wins.
    for candidate in hrefs:
        unwrapped = unwrap_google_url(candidate)
        if "an1store" in unwrapped:
            continue
        if "file.an1" in unwrapped or "files.an1" in unwrapped:
            return unwrapped

    # Should not happen (validator guarantees a button); "" makes the caller retry.
    return ""
105
+
106
async def process_item_fully(name, raw_link, image):
    """Process one search-result item until ALL of its data is collected.

    Loops forever until both the file size and every direct download link
    have been scraped successfully — never returns partial data.
    """
    def has_download_buttons(s):
        return bool(s.select('a.download_line.green'))

    while True:
        try:
            # 1. Fetch the detail page (validated to contain download buttons).
            detail = await fetch_until_success(raw_link, has_download_buttons)

            # 2. File size — a missing size means a broken page, so start over.
            size_node = detail.select_one('[itemprop="fileSize"]')
            size = size_node.get_text(strip=True) if size_node else "Unknown"
            if size == "Unknown":
                continue

            # 3. Resolve every download button into a direct link.
            direct_links = []
            failed = False
            for button in detail.select('a.download_line.green'):
                href = button.get('href')
                if not href:
                    continue
                # Inner retry loop for the intermediate page.
                resolved = await scan_intermediate_page_loop(unwrap_google_url(href))
                if not resolved:
                    # One unresolvable link invalidates this whole attempt.
                    failed = True
                    break
                direct_links.append(resolved)

            if failed or not direct_links:
                continue  # retry from the detail page

            # Size AND links are both present — done.
            return {
                "name": name,
                "link": unwrap_google_url(raw_link),
                "image": image,
                "download": ", ".join(direct_links),
                "size": size
            }

        except Exception:
            continue
163
+
164
@app.get("/")
async def root():
    """Landing endpoint: a brief usage hint for the API."""
    info = {
        "message": "Search API for An1.com by Bowo",
        "example_usage": "/search?query=minecraft&limit=5"
    }
    return info
170
+
171
@app.get("/search")
async def search_apps(
    query: str = Query(..., description="App name"),
    limit: int = Query(5)
):
    """Search an1.com (via the Google Translate proxy) and return fully
    resolved results: name, page link, image, direct download links, size.

    Each item is processed in parallel and retried internally until complete,
    so the response never contains partial entries.
    """
    # FIX: URL-encode the user's query so spaces/&/# don't corrupt the URL
    # (previously interpolated raw into the f-string).
    story = quote_plus(query)
    search_url = f"https://an1-com.translate.goog/index.php?do=search&subaction=search&story={story}&_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"

    # 1. Fetch the search page until it shows either items OR a "no results" message.
    def search_page_valid(s):
        has_items = bool(s.select('.search-result .item, .item'))
        text = s.get_text()
        no_result_msg = "yielded no results" in text or "did not match any documents" in text
        return has_items or no_result_msg

    soup = await fetch_until_success(search_url, search_page_valid)

    # Genuine empty result set.
    page_text = soup.get_text()
    if "yielded no results" in page_text or "did not match any documents" in page_text:
        return {
            "success": True,
            "query": query,
            "count": 0,
            "results": []
        }

    # 2. Build one async task per result item (bounded by `limit`).
    items = soup.select('.search-result .item, .item')
    tasks = []
    for item in items[:limit]:
        name_el = item.select_one('.title a') or item.select_one('a[href*=".html"]')
        if not name_el:
            continue

        name = name_el.get_text(strip=True)
        raw_link = name_el['href']
        img_el = item.select_one('img')
        image = unwrap_google_url(img_el['src']) if img_el else ""

        tasks.append(process_item_fully(name, raw_link, image))

    # 3. Run all items in parallel; each task retries internally until it succeeds.
    results = await asyncio.gather(*tasks)

    return {
        "success": True,
        "query": query,
        "count": len(results),
        "results": results
    }
222
+
223
# Local development entry point (the Docker image invokes uvicorn directly).
if __name__ == "__main__":
    serve_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=serve_port)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ httpx
4
+ beautifulsoup4