Spaces:

ziffir
/

x-scraper

Sleeping

App Files Files Community

x-scraper / app.py

ziffir

Upload 4 files

7cb594e verified 29 days ago

raw

history blame contribute delete

5.13 kB

	import os
	import asyncio
	from fastapi import FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel
	from typing import Optional
	import uvicorn

	app = FastAPI(version="1.0.0", title="X Scraper API")

	# Allow requests coming from the extension: every origin, HTTP method,
	# and request header is accepted.
	app.add_middleware(
	    CORSMiddleware,
	    allow_headers=["*"],
	    allow_methods=["*"],
	    allow_origins=["*"],
	)

	# --- Models ---
	class ScrapeRequest(BaseModel):
	    """Request body for the URL-based scrape endpoints."""

	    # Address of the page to scrape.
	    url: str
	    # X login cookie string (optional).
	    cookies: Optional[str] = None

	class SearchRequest(BaseModel):
	    """Request body for the search endpoint."""

	    # Search text; it is URL-quoted before being placed in the search URL.
	    query: str
	    # Optional cookie string handed on to the scrape request.
	    cookies: Optional[str] = None

	# --- Scrapling lazy import (heavy library) ---
	def get_fetcher(cookies: Optional[str] = None):
	    """Return Scrapling's ``StealthyFetcher`` class, importing it on demand.

	    The import lives inside the function so the library is only loaded
	    when this function is actually called.

	    Args:
	        cookies: Accepted for interface compatibility; not used here.

	    Returns:
	        The ``StealthyFetcher`` class itself (not an instance).
	    """
	    import scrapling.fetchers as fetchers

	    return fetchers.StealthyFetcher

	# --- Endpoints ---

	@app.get("/")
	def root():
	    """Return the service name and version together with an ok status."""
	    service_info = dict(status="ok", service="X Scraper API", version="1.0.0")
	    return service_info

	@app.get("/health")
	def health():
	    """Health-check endpoint; always reports a healthy status."""
	    return dict(status="healthy")

	@app.post("/scrape/tweet")
	async def scrape_tweet(req: ScrapeRequest):
	    """Scrape a single tweet URL.

	    The URL must use http(s) and its host must be x.com / twitter.com or
	    one of their subdomains. The page is fetched with Scrapling's
	    ``StealthyFetcher`` through ``asyncio.to_thread`` so the event loop is
	    not held up while the fetch runs.

	    Args:
	        req: Target URL plus an optional cookie string, which is forwarded
	            as the ``Cookie`` request header.

	    Returns:
	        ``{"success": True, "data": {...}}`` where ``data`` holds ``text``,
	        ``user``, ``timestamp``, ``likes``, ``retweets``, ``images``,
	        ``has_video``, ``url`` and ``source``. ``likes`` and ``retweets``
	        are the raw ``aria-label`` strings ("0" when absent), not numbers.

	    Raises:
	        HTTPException: 400 when the URL is not an X/Twitter URL, 500 when
	            fetching or extraction fails.
	    """
	    from urllib.parse import urlsplit

	    # Validate the parsed hostname instead of searching the raw string, so
	    # that URLs such as "https://evil.example/?x.com" or
	    # "https://x.com@evil.example/" are rejected and never receive the
	    # caller's cookies.
	    try:
	        target = urlsplit(req.url)
	        scheme, host = target.scheme, target.hostname or ""
	    except ValueError:  # malformed URL (e.g. unbalanced IPv6 brackets)
	        scheme = host = ""
	    host_allowed = host in ("x.com", "twitter.com") or host.endswith(
	        (".x.com", ".twitter.com")
	    )
	    if scheme not in ("http", "https") or not host_allowed:
	        raise HTTPException(400, "Sadece X.com URL'leri kabul edilir")

	    try:
	        from scrapling.fetchers import StealthyFetcher

	        headers = {}
	        if req.cookies:
	            headers["Cookie"] = req.cookies

	        # Run the fetch in a worker thread.
	        page = await asyncio.to_thread(
	            lambda: StealthyFetcher.fetch(
	                req.url,
	                headless=True,
	                network_idle=True,
	                extra_headers=headers,
	            )
	        )

	        # NOTE(review): the results of page.css() are used below as if they
	        # expose .get_text() / .attrib directly - confirm this against the
	        # installed Scrapling version.
	        tweet_data = {}

	        # Text
	        text_el = page.css('[data-testid="tweetText"]')
	        tweet_data["text"] = text_el.get_text("\n") if text_el else ""

	        # User
	        user_el = page.css('[data-testid="User-Name"]')
	        tweet_data["user"] = user_el.get_text() if user_el else ""

	        # Timestamp
	        time_el = page.css("time")
	        tweet_data["timestamp"] = (
	            time_el.attrib.get("datetime", "") if time_el else ""
	        )

	        # Engagement: the aria-label text is returned as-is.
	        like_el = page.css('[data-testid="like"]')
	        tweet_data["likes"] = (
	            like_el.attrib.get("aria-label", "0") if like_el else "0"
	        )

	        retweet_el = page.css('[data-testid="retweet"]')
	        tweet_data["retweets"] = (
	            retweet_el.attrib.get("aria-label", "0") if retweet_el else "0"
	        )

	        # Media
	        tweet_data["images"] = [
	            img.attrib.get("src", "")
	            for img in page.css('[data-testid="tweetPhoto"] img')
	        ]
	        tweet_data["has_video"] = bool(page.css('[data-testid="videoPlayer"]'))

	        tweet_data["url"] = req.url
	        tweet_data["source"] = "scrapling"

	        return {"success": True, "data": tweet_data}

	    except Exception as e:
	        # Surface any fetch/extraction failure as a 500, keeping the cause
	        # chained for server-side tracebacks.
	        raise HTTPException(500, f"Scrape hatası: {e}") from e


	@app.post("/scrape/profile")
	async def scrape_profile(req: ScrapeRequest):
	    """Scrape the tweets listed on a profile page.

	    The URL must use http(s) and its host must be x.com / twitter.com or
	    one of their subdomains. Every ``article[data-testid="tweet"]`` element
	    on the fetched page that links to a ``/status/<id>`` URL becomes one
	    entry in the result. The search endpoint reuses this function for
	    search-result pages.

	    Args:
	        req: Target URL plus an optional cookie string, which is forwarded
	            as the ``Cookie`` request header.

	    Returns:
	        ``{"success": True, "count": <n>, "data": [...]}`` where each item
	        has ``id``, ``text``, ``timestamp`` and ``url``.

	    Raises:
	        HTTPException: 400 when the URL is not an X/Twitter URL, 500 when
	            fetching or extraction fails.
	    """
	    import logging
	    import re
	    from urllib.parse import urljoin, urlsplit

	    # Validate the parsed hostname instead of searching the raw string, so
	    # that URLs such as "https://evil.example/?x.com" or
	    # "https://x.com@evil.example/" are rejected and never receive the
	    # caller's cookies.
	    try:
	        target = urlsplit(req.url)
	        scheme, host = target.scheme, target.hostname or ""
	    except ValueError:  # malformed URL (e.g. unbalanced IPv6 brackets)
	        scheme = host = ""
	    host_allowed = host in ("x.com", "twitter.com") or host.endswith(
	        (".x.com", ".twitter.com")
	    )
	    if scheme not in ("http", "https") or not host_allowed:
	        raise HTTPException(400, "Sadece X.com URL'leri kabul edilir")

	    # Compiled once instead of importing/searching from scratch per article.
	    status_id_re = re.compile(r'/status/(\d+)')

	    try:
	        from scrapling.fetchers import StealthyFetcher

	        headers = {}
	        if req.cookies:
	            headers["Cookie"] = req.cookies

	        # Run the fetch in a worker thread.
	        page = await asyncio.to_thread(
	            lambda: StealthyFetcher.fetch(
	                req.url,
	                headless=True,
	                network_idle=True,
	                extra_headers=headers,
	            )
	        )

	        tweets = []
	        # NOTE(review): the results of .css() are used below as if they
	        # expose .get_text() / .attrib directly - confirm this against the
	        # installed Scrapling version.
	        for article in page.css('article[data-testid="tweet"]'):
	            try:
	                link = article.css('a[href*="/status/"]')
	                if not link:
	                    continue

	                href = link.attrib.get("href", "")
	                match = status_id_re.search(href)
	                if not match:
	                    continue

	                text_el = article.css('[data-testid="tweetText"]')
	                time_el = article.css("time")

	                tweets.append(
	                    {
	                        "id": match.group(1),
	                        "text": text_el.get_text() if text_el else "",
	                        "timestamp": (
	                            time_el.attrib.get("datetime", "") if time_el else ""
	                        ),
	                        # urljoin keeps absolute hrefs intact and resolves
	                        # relative ones against x.com.
	                        "url": urljoin("https://x.com", href),
	                    }
	                )
	            except Exception:
	                # Best effort: one broken article must not fail the whole
	                # page, but leave a trace instead of skipping silently.
	                logging.getLogger(__name__).warning(
	                    "Skipping tweet article that could not be parsed",
	                    exc_info=True,
	                )
	                continue

	        return {"success": True, "count": len(tweets), "data": tweets}

	    except Exception as e:
	        # Surface any fetch/extraction failure as a 500, keeping the cause
	        # chained for server-side tracebacks.
	        raise HTTPException(500, f"Scrape hatası: {e}") from e


	@app.post("/scrape/search")
	async def scrape_search(req: SearchRequest):
	    """Scrape search results for a query.

	    Percent-encodes the query, places it in an x.com search URL (with the
	    ``src=typed_query`` and ``f=live`` parameters) and hands that URL,
	    along with the caller's cookies, to ``scrape_profile``.

	    Args:
	        req: Search text plus an optional cookie string.

	    Returns:
	        Whatever ``scrape_profile`` returns for the search page.
	    """
	    from urllib.parse import quote

	    search_url = (
	        "https://x.com/search?q=" + quote(req.query) + "&src=typed_query&f=live"
	    )
	    return await scrape_profile(ScrapeRequest(url=search_url, cookies=req.cookies))


	if __name__ == "__main__":
	    # Start the server on all interfaces when the file is run as a script.
	    # NOTE(review): 7860 is presumably the port the hosting Space expects -
	    # confirm before changing it.
	    uvicorn.run(app, port=7860, host="0.0.0.0")