from fastapi import FastAPI, HTTPException, Query
import httpx
from bs4 import BeautifulSoup
import uvicorn
import os
from contextlib import asynccontextmanager
import asyncio
import re
import logging
import sys
from urllib.parse import unquote, urlparse, parse_qs  # Added for proxy handling
# --- Configuration for Professional Logging ---
# Format: [Time] [Level] [Logger Name]: Message
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger("PlayModsScraper")

# Setup Async Client
client = None
BASE_DOMAIN = "https://playmods.net"
@asynccontextmanager
async def lifespan(app: FastAPI):
    global client
    logger.info("Initializing HTTP Client and starting application lifespan...")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        # Referer removed so it does not conflict when switching between proxy and direct
    }
    # Timeout is set to None, as in the original code
    client = httpx.AsyncClient(headers=headers, verify=False, follow_redirects=True, timeout=None)
    logger.info("HTTP Client initialized successfully.")
    yield
    logger.info("Closing HTTP Client and shutting down application...")
    await client.aclose()
    logger.info("HTTP Client closed.")

app = FastAPI(title="PlayMods Scraper", lifespan=lifespan)
# --- NEW HELPER: Unwrap Google URL ---
def unwrap_google_url(url: str) -> str:
    """Strip the Google Translate wrapper from a URL."""
    if not url:
        return ""
    clean = unquote(url)
    # Decode if the URL is wrapped in the /website?u=... format
    if "google" in clean and "/website" in clean and "u=" in clean:
        try:
            parsed = urlparse(clean)
            qs = parse_qs(parsed.query)
            if 'u' in qs:
                return unwrap_google_url(qs['u'][0])
        except Exception:
            pass
    # Clean the translate domain (playmods-net.translate.goog -> playmods.net)
    clean = clean.replace("playmods-net.translate.goog", "playmods.net")
    # Strip Google Translate query parameters
    clean = clean.split("?_x_tr_")[0]
    clean = clean.split("&_x_tr_")[0]
    return clean
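
# Illustrative example of the helper above (the path is hypothetical):
#   unwrap_google_url("https://playmods-net.translate.goog/some/page?_x_tr_sl=auto&_x_tr_tl=en")
#   -> "https://playmods.net/some/page"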
def ensure_absolute_url(url: str) -> str:
    """Ensure the URL is absolute (has a domain) and free of proxy artifacts."""
    if not url:
        return ""
    # 1. Strip the Google wrapper first, if present
    url = unwrap_google_url(url)
    # 2. Ensure the URL is absolute
    if url.startswith("/"):
        return BASE_DOMAIN + url
    return url
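
# Illustrative example (the path is hypothetical):
#   ensure_absolute_url("/id/app/example") -> "https://playmods.net/id/app/example"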
async def fetch_until_success(url: str, validator_func) -> BeautifulSoup:
    """
    Request a URL with the strategy: Google proxy first -> fall back to direct on 429.
    """
    # 1. Construct Proxy URL (Default Strategy)
    # Convert: https://playmods.net/xxx -> https://playmods-net.translate.goog/xxx
    proxy_url = url.replace("https://playmods.net", "https://playmods-net.translate.goog")
    if "?" not in proxy_url:
        proxy_url += "?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"
    else:
        proxy_url += "&_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"
    current_url = proxy_url  # Start with the proxy
    using_proxy = True
    logger.info(f"Initiating request. Target: {url}")
    logger.info(f"Trying via Proxy first: {current_url}")
    while True:
        try:
            res = await client.get(current_url)
            # --- MODIFICATION: fallback strategy ---
            # If the proxy is rate-limited (429), switch to the direct URL
            if res.status_code == 429 and using_proxy:
                logger.warning("Switching to DIRECT connection.")
                current_url = url  # Revert to the original URL
                using_proxy = False
                continue  # Retry the loop with the new URL
            # -------------------------------------
            logger.info(f"Status Code: {res.status_code} | Proxy: {using_proxy}")
            # --- MODIFICATION: stop retrying on critical status codes ---
            # Either the direct connection failed fatally, or the proxy failed
            # fatally (other than the 429 case already handled above)
            if res.status_code in [403, 404, 429, 451, 500]:
                logger.error(f"Critical error {res.status_code} encountered. Aborting fetch for: {current_url}")
                return None
            # ------------------------------------------------------
            if res.status_code != 200:
                logger.warning(f"Received non-200 status code: {res.status_code}. Retrying...")
                continue
            soup = BeautifulSoup(res.text, 'html.parser')
            if validator_func(soup):
                logger.info("Page validation successful.")
                return soup
            else:
                logger.warning("Page validation failed. Structure might be different or content empty.")
                return soup
        except Exception as e:
            logger.error(f"Exception during request: {str(e)}")
            # On a connection error while using the proxy, switch to direct as a last resort
            if using_proxy:
                logger.warning("Connection error with Proxy. Switching to DIRECT connection.")
                current_url = url
                using_proxy = False
                continue
            else:
                return None
    logger.error(f"Failed to fetch or validate URL: {url}")
    return None
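
# Illustrative call (the query is hypothetical; the selector matches the
# search-page validator used further below):
#   soup = await fetch_until_success(
#       f"{BASE_DOMAIN}/id/search/minecraft?page=1",
#       lambda s: bool(s.select('.media-list .media-item')),
#   )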
async def get_real_download_link(intermediate_url: str) -> str:
    """
    Logic for steps 3 & 4: obtain the final download link from the intermediate page.
    """
    logger.info(f"Resolving real download link from intermediate URL: {intermediate_url}")
    def is_valid_intermediate(s):
        return bool(s.select_one('#downloadStatejs_id'))
    soup = await fetch_until_success(intermediate_url, is_valid_intermediate)
    if not soup:
        logger.error("Failed to retrieve intermediate download page.")
        return ""
    script_tag = soup.select_one('#downloadStatejs_id')
    if not script_tag:
        logger.warning("Script tag #downloadStatejs_id not found in intermediate page.")
        return ""
    version_id = script_tag.get('versionid')
    if not version_id:
        logger.warning("Version ID not found in script tag.")
        return ""
    logger.info(f"Found Version ID: {version_id}")
    # Step 4: hit the redirect URL
    # Use the direct URL for this redirect endpoint for reliability (it is usually an API)
    redirect_endpoint = f"{BASE_DOMAIN}/id/download/version/{version_id}?scheme=https"
    try:
        logger.info(f"Hitting redirect endpoint: {redirect_endpoint}")
        async with httpx.AsyncClient(headers=client.headers, verify=False, follow_redirects=True) as temp_client:
            async with temp_client.stream("GET", redirect_endpoint) as response:
                final_url = str(response.url)
                # Clean the final URL (in case the redirect went through the Google proxy)
                final_url = unwrap_google_url(final_url)
                logger.info(f"Redirect resolved to: {final_url}")
                # --- MODIFICATION: filter out PlayMods installer links ---
                # Ignore links that point to the PlayMods installer (an ad)
                if "qn-resource.pmapkdown.shop/apk/playmods" in final_url:
                    logger.warning(f"Ignoring PlayMods installer link (Ad): {final_url}")
                    return ""
                if "apk" in final_url or "resource" in final_url:
                    return final_url
                else:
                    logger.warning("Final URL does not appear to be an APK or resource.")
    except Exception as e:
        logger.error(f"Error fetching redirect endpoint: {str(e)}")
    return ""
async def process_item_fully(name, detail_url, image):
    """
    Fully process a single app item.
    """
    logger.info(f"Processing item task started: {name}")
    try:
        # Ensure the detail URL is absolute and clean
        detail_url = ensure_absolute_url(detail_url)
        # 1. Fetch the detail page
        def detail_page_valid(s):
            # Valid if there is a download button or an "all-download" link
            return bool(s.select('a.btn-download1.ptn')) or bool(s.select('a[href*="/all-download"]'))
        app_soup = await fetch_until_success(detail_url, detail_page_valid)
        if not app_soup:
            logger.error(f"Failed to fetch detail page for: {name}")
            return None
        # 2. Grab the download elements
        # Check for an "all-download" link (usually present when there are multiple versions)
        all_download_link = app_soup.select_one('a[href*="/all-download"]')
        intermediate_links = []
        sizes = []
        if all_download_link:
            logger.info(f"Multiple download links detected for '{name}'. Fetching all-download page...")
            all_download_url = ensure_absolute_url(all_download_link.get('href'))
            def all_download_page_valid(s):
                # Valid if the all-download page contains download links
                return bool(s.select('a[href*="/download"]'))
            all_soup = await fetch_until_success(all_download_url, all_download_page_valid)
            if all_soup:
                # Collect every link containing /download, excluding general navigation links
                download_links = all_soup.select('a[href*="/download"]')
                for link in download_links:
                    raw_href = link.get('href')
                    if raw_href and not any(x in raw_href.lower() for x in ["/category/", "/app/", "/all-download"]):
                        abs_url = ensure_absolute_url(raw_href)
                        if abs_url not in intermediate_links:
                            intermediate_links.append(abs_url)
                            # Try to extract the size from the surrounding element, if present
                            parent_text = link.find_parent().get_text() if link.find_parent() else ""
                            size_match = re.search(r'(\d+(\.\d+)?\s*(GB|MB|KB))', parent_text, re.IGNORECASE)
                            sizes.append(size_match.group(1) if size_match else "Unknown")
        # If there is no all-download page, or nothing was extracted from it, use the main button
        if not intermediate_links:
            download_btn = app_soup.select_one('a.btn-download1.ptn')
            if download_btn:
                raw_intermediate_link = download_btn.get('href')
                if raw_intermediate_link:
                    intermediate_links.append(ensure_absolute_url(raw_intermediate_link))
                    # Extract the size
                    btn_text = download_btn.get_text(strip=True)
                    size_match = re.search(r'\((.*?)\)', btn_text)
                    sizes.append(size_match.group(1) if size_match else "Unknown")
        if not intermediate_links:
            logger.warning(f"No download links found for: {name}")
            return None
        logger.info(f"Found {len(intermediate_links)} download links for '{name}'. Resolving final links...")
        # 3. Resolve the final APK link for every link found
        final_links = []
        valid_sizes = []
        for i, link in enumerate(intermediate_links):
            final_apk_link = await get_real_download_link(link)
            if final_apk_link:
                final_links.append(final_apk_link)
                if i < len(sizes):
                    valid_sizes.append(sizes[i])
        if not final_links:
            logger.error(f"Failed to get any final APK links for: {name}")
            return None
        # Join the links with commas
        combined_download_links = ", ".join(final_links)
        # Join the sizes with commas
        combined_sizes = ", ".join(valid_sizes) if valid_sizes else "Unknown"
        logger.info(f"Successfully processed item: {name} with {len(final_links)} links.")
        return {
            "name": name,
            "link": detail_url,
            "image": ensure_absolute_url(image),
            "download": combined_download_links,
            "size": combined_sizes
        }
    except Exception as e:
        logger.error(f"Exception while processing item '{name}': {str(e)}")
        return None
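
# Illustrative result shape returned on success (all values are hypothetical):
#   {
#       "name": "Example App",
#       "link": "https://playmods.net/id/app/example",
#       "image": "https://playmods.net/img/example.png",
#       "download": "https://cdn.example/a.apk, https://cdn.example/b.apk",
#       "size": "120.5 MB, 98 MB"
#   }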
@app.get("/")
async def root():
    logger.info("Root endpoint accessed.")
    return {
        "message": "Search API for PlayMods.net by Bowo",
        "github": "https://github.com/SaptaZ",
        "example_usage": "/search?query=minecraft&limit=5"
    }
@app.get("/search")
async def search_apps(
    query: str = Query(..., description="App name"),
    limit: int = Query(5, description="Limit results")
):
    logger.info(f"Incoming search request - Query: '{query}', Limit: {limit}")
    tasks = []
    seen_urls = set()  # 1. Set used to prevent duplicate items
    current_page = 1
    last_page_content_hash = None  # 2. Marker used to detect repeated pages
    while True:
        search_url = f"{BASE_DOMAIN}/id/search/{query}?page={current_page}"
        logger.info(f"Scraping search page {current_page}: {search_url}")
        # Simple validator
        def search_page_valid(s):
            return bool(s.select('.media-list .media-item')) or "Found 0" in s.get_text()
        soup = await fetch_until_success(search_url, search_page_valid)
        if not soup:
            break
        # 3. Is this page exactly the same as the previous one? (infinite-loop protection)
        content_container = soup.select_one('.media-list')
        if content_container:
            # Use the first item's text for more accurate duplicate-page detection,
            # since a hash of the whole container can change due to ads or other dynamic elements.
            first_item = content_container.select_one('.media-item-content h2 a')
            current_content_marker = first_item.get_text(strip=True) if first_item else content_container.get_text(strip=True)
            if current_content_marker == last_page_content_hash:
                logger.info("Page content identical to previous page. Stopping pagination.")
                break
            last_page_content_hash = current_content_marker
        else:
            # No list container: probably zero results or a layout error
            logger.info("No .media-list found. End of search results.")
            break
        items = soup.select('.media-list .media-item')
        if not items:
            logger.info("No items found on current page.")
            break
        logger.info(f"Found {len(items)} items on page {current_page}.")
        items_added_on_this_page = 0
        for item in items:
            # Check the limit before processing
            if len(tasks) >= limit:
                break
            title_el = item.select_one('.media-item-content h2 a')
            if not title_el:
                continue
            # Extract the link and make sure it is free of proxy artifacts
            raw_link = title_el['href']
            detail_link = ensure_absolute_url(raw_link)
            # 4. Duplicate filter: skip links that have already been collected.
            if detail_link in seen_urls:
                continue
            seen_urls.add(detail_link)
            name = title_el.get_text(" ", strip=True)
            img_el = item.select_one('.media-item-pic img')
            raw_image = (img_el.get('data-src') or img_el.get('src')) if img_el else ""
            image = ensure_absolute_url(raw_image)
            tasks.append(process_item_fully(name, detail_link, image))
            items_added_on_this_page += 1
        # If the limit is reached, stop the main loop
        if len(tasks) >= limit:
            logger.info(f"Limit reached ({len(tasks)} tasks). Stopping pagination.")
            break
        # If this page added no new items at all (all duplicates), force a stop
        if items_added_on_this_page == 0:
            logger.info("No new unique items found on this page. Stopping.")
            break
        # 5. Check next-page navigation
        # Use a more specific selector for the 'next' button
        next_btn = soup.select_one('a.next-page, .icon-page-right, a[href*="page="]')
        # Verify that the next-button link really points to the next page
        is_real_next = False
        if next_btn:
            href = next_btn.get('href')
            if href:
                parsed_href = urlparse(href)
                qs = parse_qs(parsed_href.query)
                next_page_num = qs.get('page', [None])[0]
                if next_page_num and next_page_num.isdigit():
                    if int(next_page_num) > current_page:
                        is_real_next = True
                elif f"page={current_page + 1}" in href:
                    is_real_next = True
            else:
                # No href but the next-page class is present: assume the button is valid
                is_real_next = True
        if is_real_next:
            current_page += 1
        else:
            logger.info("No valid next page indicator found. End of search results.")
            break
    logger.info(f"Executing {len(tasks)} detail scraping tasks concurrently...")
    if not tasks:
        return {
            "success": True,
            "query": query,
            "limit": limit,
            "count": 0,
            "results": []
        }
    raw_results = await asyncio.gather(*tasks)
    results = [res for res in raw_results if res is not None]
    # Trim the final results in case extra tasks slipped through
    results = results[:limit]
    return {
        "success": True,
        "query": query,
        "limit": limit,
        "count": len(results),
        "results": results
    }
| if __name__ == "__main__": | |
| port = int(os.environ.get("PORT", 7860)) | |
| logger.info(f"Starting server on port {port}...") | |
| uvicorn.run(app, host="0.0.0.0", port=port) |
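
# Example request once the server is running (port 7860 unless PORT is set),
# mirroring the example_usage string advertised by the root endpoint:
#   curl "http://localhost:7860/search?query=minecraft&limit=5"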