# Tendokudl / main.py
from fastapi import FastAPI, HTTPException, Query
import httpx
from bs4 import BeautifulSoup
import uvicorn
import os
from urllib.parse import unquote, urlparse, parse_qs, quote_plus
from contextlib import asynccontextmanager
import asyncio
import re
# Shared async HTTP client (created in the lifespan handler)
client = None
@asynccontextmanager
async def lifespan(app: FastAPI):
global client
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}
    # Timeout disabled (set to None) so slow connections don't raise errors
client = httpx.AsyncClient(headers=headers, verify=False, follow_redirects=True, timeout=None)
yield
await client.aclose()
app = FastAPI(title="Tendoku Scraper", lifespan=lifespan)
BASE_DOMAIN = "https://www.tendoku.com"
def unwrap_google_url(url: str) -> str:
    """Strip the Google Translate proxy wrapper from a URL."""
    if not url:
        return ""
    clean = unquote(url)
    # Decode URLs wrapped in the /website?u=... format
    if "google" in clean and "/website" in clean and "u=" in clean:
        try:
            parsed = urlparse(clean)
            qs = parse_qs(parsed.query)
            if 'u' in qs:
                return unwrap_google_url(qs['u'][0])
        except Exception:
            pass
    # Rewrite the translate proxy domain back to the real Tendoku domain
    clean = clean.replace("www-tendoku-com.translate.goog", "www.tendoku.com")
    # Drop Google Translate query parameters
    clean = clean.split("?_x_tr_")[0]
    clean = clean.split("&_x_tr_")[0]
    # Resolve relative URLs against the base domain
    if clean.startswith("/"):
        clean = BASE_DOMAIN + clean
    return clean
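# Expected behavior, shown doctest-style (the slug "game-x" is a hypothetical example):
#   >>> unwrap_google_url("https://www-tendoku-com.translate.goog/game-x/?_x_tr_sl=auto&_x_tr_tl=en")
#   'https://www.tendoku.com/game-x/'
#   >>> unwrap_google_url("/some-page/")
#   'https://www.tendoku.com/some-page/'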
def format_bytes(size):
    """Convert a byte count into a human-readable string (KB, MB, GB, ...)."""
    try:
        power = 2**10
        n = 0
        power_labels = {0: 'B', 1: 'KB', 2: 'MB', 3: 'GB', 4: 'TB'}
        while size >= power and n < 4:
            size /= power
            n += 1
        return f"{size:.2f} {power_labels[n]}"
    except Exception:
        return "Unknown"
async def get_remote_file_size(url: str) -> str:
"""
(FITUR DISIMPAN TAPI TIDAK DIPAKAI UNTUK TENDOKU SESUAI REQUEST)
Mengambil ukuran file dari Header tanpa mendownload body.
"""
try:
headers = {"Range": "bytes=0-0"}
req = client.build_request("GET", url, headers=headers)
r = await client.send(req, stream=True)
total_size = 0
content_range = r.headers.get("Content-Range")
if content_range and "/" in content_range:
try:
total_size = int(content_range.split("/")[-1])
            except Exception:
pass
if total_size == 0:
content_length = r.headers.get("Content-Length")
if content_length and content_length.isdigit():
total_size = int(content_length)
await r.aclose()
if total_size > 0:
return format_bytes(total_size)
return "Unknown"
except Exception:
return "Unknown"
async def fetch_until_success(url: str, validator_func, max_attempts: int = 10) -> BeautifulSoup:
    """
    Core logic: keep requesting the URL until validator_func returns True.
    Modification: on a 429 (Too Many Requests) from the translate proxy, switch to the direct URL.
    Gives up after max_attempts and returns an empty soup so callers can bail out.
    """
    current_url = url
    for _ in range(max_attempts):
        try:
            res = await client.get(current_url)
            # Rate-limited (429) while going through the translate proxy: retry via the direct URL
            if res.status_code == 429 and "translate.goog" in current_url:
                current_url = unwrap_google_url(current_url)
                continue
            soup = BeautifulSoup(res.text, 'html.parser')
            if validator_func(soup):
                return soup
        except Exception:
            await asyncio.sleep(1)  # brief back-off before the next attempt
    return BeautifulSoup("", "html.parser")
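# Minimal usage sketch (hypothetical URL and validator):
#   soup = await fetch_until_success(
#       "https://www.tendoku.com/", lambda s: s.select_one("title") is not None
#   )
#   if not soup.text:
#       ...  # empty soup: every attempt failed or the page never validated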
async def process_item_fully(name, detail_url, image, max_attempts: int = 3):
    """
    Process a single app item:
    1. Open the detail page (via the translate proxy).
    2. Parse the size from the HTML text (<li>Size : ...</li>).
    3. Parse the download links from the 'download_disini' class.
    """
    for _ in range(max_attempts):
        try:
            # Convert to the proxy URL for the Tendoku detail page
            # Format: https://www-tendoku-com.translate.goog/...
            target_detail_url = detail_url.replace("https://www.tendoku.com", "https://www-tendoku-com.translate.goog")
            # Append the Google Translate params if not already present
            if "?" not in target_detail_url:
                target_detail_url += "?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"
            else:
                target_detail_url += "&_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"
            # 1. Fetch the detail page
            def detail_page_valid(s):
                # Simple validation: make sure the article content exists
                return bool(s.select('article.post')) or bool(s.select('.entry-content'))
            app_soup = await fetch_until_success(target_detail_url, detail_page_valid)
            if not app_soup.text:
                return None
            # 2. Read the size straight from the HTML (per the request: no Content-Length checks)
            # Look for the text 'Size :' inside list items or paragraphs of the content
            final_size_list = []
            content_area = app_soup.select_one('.entry-content')
            if content_area:
                # Check <li> entries such as: <li>Size : 17GB</li>
                for li in content_area.select('li'):
                    text = li.get_text(strip=True)
                    if "Size" in text and ":" in text:
                        # Split on ':' and take the right-hand part
                        try:
                            size_part = text.split(":", 1)[1].strip()
                            final_size_list.append(size_part)
                        except Exception:
                            pass
                # Fallback: check <p> tags if nothing was found in <li> (in case the markup differs slightly)
                if not final_size_list:
                    for p in content_area.select('p'):
                        text = p.get_text(strip=True)
                        if "Size" in text and ":" in text:
                            try:
                                size_part = text.split(":", 1)[1].strip()
                                final_size_list.append(size_part)
                            except Exception:
                                pass
            # If no size was found, report Unknown (don't force a remote check)
            size_result = ", ".join(final_size_list) if final_size_list else "Unknown"
            # 3. Collect download links (class: download_disini)
            download_elements = app_soup.select('a.download_disini')
            final_link_list = []
            for btn in download_elements:
                raw_link = btn.get('href')
                if not raw_link:
                    continue
                # Strip the Google wrapper from the link
                clean_link = unwrap_google_url(raw_link)
                final_link_list.append(clean_link)
            # If there are no download links at all, skip this item
            if not final_link_list:
                return None
            return {
                "name": name,
                "link": unwrap_google_url(detail_url),
                "image": image,
                "download": ", ".join(final_link_list),  # comma-joined to support multiple links
                "size": size_result  # comma-joined if multiple sizes were detected
            }
        except Exception:
            # Connection retries are handled inside fetch_until_success;
            # this catches parsing errors and retries up to max_attempts times
            continue
    return None
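# On success the coroutine resolves to a dict shaped like (values hypothetical):
#   {
#       "name": "Game X",
#       "link": "https://www.tendoku.com/game-x/",
#       "image": "https://www.tendoku.com/wp-content/uploads/cover.jpg",
#       "download": "https://example-host/part1, https://example-host/part2",
#       "size": "17GB"
#   }
# It resolves to None when no download link is found or the page never validates.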
@app.get("/")
async def root():
    # Credits kept as instructed
return {
"message": "Search API for Tendoku.com by Bowo",
"github": "https://github.com/SaptaZ",
"example_usage": "/search?query=minecraft&limit=5"
}
@app.get("/search")
async def search_apps(
query: str = Query(..., description="App name"),
limit: int = Query(5, description="Limit results")
):
collected_results = []
# WordPress Pagination Logic (Page 1, 2, 3...)
current_page = 1
while True:
        # Build the search URL through the Google Translate proxy
        # Base: https://www-tendoku-com.translate.goog
        # WordPress search pattern: /page/N/?s=KEYWORD
        # URL-encode the query so spaces and special characters survive
        encoded_query = quote_plus(query)
        if current_page == 1:
            search_url = f"https://www-tendoku-com.translate.goog/?s={encoded_query}&_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"
        else:
            search_url = f"https://www-tendoku-com.translate.goog/page/{current_page}/?s={encoded_query}&_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en"
        # Search-page validator
def search_page_valid(s):
            # Tendoku uses <article class="post"> for result items
has_items = bool(s.select('article.post'))
text_content = s.get_text()
if not text_content: return False
            # Check for "Nothing Found" or "Sorry, but nothing matched" (standard WP strings)
no_result = "Nothing Found" in text_content or "nothing matched" in text_content
return has_items or no_result
soup = await fetch_until_success(search_url, search_page_valid)
        # Bail out if the soup is empty (fetch failed)
if not soup.text:
break
        # Check whether the first page returned no results at all
text_content = soup.get_text()
if ("Nothing Found" in text_content or "nothing matched" in text_content) and current_page == 1:
return {
"success": True,
"query": query,
"limit": limit,
"count": 0,
"results": []
}
        # 1. Collect the items from this page
        # Selector matched to Tendoku's markup: article.post
items = soup.select('article.post')
if not items:
break
page_tasks = []
for item in items:
            # Title & link: <h2 class="entry-title"><a href="...">...</a></h2>
title_el = item.select_one('.entry-title a')
if not title_el: continue
name = title_el.get_text(strip=True)
            # Clean the detail link
detail_link = unwrap_google_url(title_el['href'])
            # Image: <div class="content-thumbnail"> ... <img src="...">
img_el = item.select_one('.content-thumbnail img')
image = unwrap_google_url(img_el['src']) if img_el else ""
            # Queue the item for processing
page_tasks.append(process_item_fully(name, detail_link, image))
if not page_tasks:
break
        # Run the tasks for THIS page only
page_results = await asyncio.gather(*page_tasks)
        # Filter out None entries (dead links / parse failures)
valid_page_results = [res for res in page_results if res is not None]
        # Append to the main collection
collected_results.extend(valid_page_results)
        # Stop if the limit has been reached
if len(collected_results) >= limit:
break
        # 2. Check for a next-page link (WordPress pagination)
        # Usually inside <div class="nav-links"> ... <a class="next page-numbers">
next_link_el = soup.select_one('a.next.page-numbers')
        # Or match the text "Next" / "Older posts" if the class has changed
if not next_link_el:
            # Case-insensitive regex for the next button when the class isn't found
next_link_el = soup.find('a', string=re.compile(r'Next|Older', re.I))
if next_link_el:
current_page += 1
else:
break
    # Trim the results to the requested limit
results = collected_results[:limit]
return {
"success": True,
"query": query,
"limit": limit,
"count": len(results),
"results": results
}
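# Example request/response shape (live values depend on site content):
#   GET /search?query=minecraft&limit=5
#   -> {"success": true, "query": "minecraft", "limit": 5,
#       "count": 5, "results": [ ...dicts from process_item_fully... ]}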
if __name__ == "__main__":
port = int(os.environ.get("PORT", 7860))
uvicorn.run(app, host="0.0.0.0", port=port)
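# Local run: python main.py
# Roughly equivalent (assuming the default port): uvicorn main:app --host 0.0.0.0 --port 7860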