Spaces:

ziffir
/

x-scraper

Sleeping

App Files Files Community

ziffir commited on 30 days ago

Commit

7cb594e

verified ·

1 Parent(s): 5896c90

Upload 4 files

Browse files

Files changed (4) hide show

Dockerfile +26 -0
README.md +28 -7
app.py +171 -0
requirements.txt +4 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+FROM python:3.11-slim
+WORKDIR /app
+# System dependencies (for Playwright/Chromium)
+RUN apt-get update && apt-get install -y \
+    wget curl gnupg \
+    libglib2.0-0 libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 \
+    libcups2 libdrm2 libdbus-1-3 libxkbcommon0 libx11-6 libxcomposite1 \
+    libxdamage1 libxext6 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 \
+    libcairo2 libasound2 libatspi2.0-0 libxshmfence1 \
+    && rm -rf /var/lib/apt/lists/*
+# Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Scrapling browser installation
+RUN scrapling install
+COPY app.py .
+# HF Spaces port 7860
+EXPOSE 7860
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,12 +1,33 @@
 ---
-title: X Scraper
-emoji: ⚡
-colorFrom: red
-colorTo: yellow
 sdk: docker
 pinned: false
-license: apache-2.0
-short_description: x-scraper
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: X Tweet Scraper API
+emoji: 🐦
+colorFrom: gray
+colorTo: blue
 sdk: docker
 pinned: false
 ---
+# X Tweet Scraper API
+FastAPI + Scrapling tabanlı X (Twitter) scraper.
+## Endpoints
+| Method | Path | Açıklama |
+|--------|------|----------|
+| GET | `/` | Sağlık kontrolü |
+| POST | `/scrape/tweet` | Tek tweet detayı |
+| POST | `/scrape/profile` | Profil tweetleri |
+| POST | `/scrape/search` | Arama sonuçları |
+## Kullanım
+```json
+POST /scrape/tweet
+{
+  "url": "https://x.com/username/status/123456",
+  "cookies": "auth_token=xxx; ct0=yyy"
+}
+```
+> **Not:** X.com artık login gerektiriyor. `cookies` alanına tarayıcıdan kopyaladığın X cookie'lerini yapıştır.

app.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import os
+import asyncio
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import Optional
+import uvicorn
+# FastAPI application object; the route decorators below register on it.
+app = FastAPI(title="X Scraper API", version="1.0.0")
+# Allow requests coming from the extension: every origin, method and header
+# is accepted by the CORS middleware.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# --- Models ---
+class ScrapeRequest(BaseModel):
+    # Request body shared by the /scrape/tweet and /scrape/profile endpoints.
+    # `url` is the page to fetch; the endpoints check that it mentions
+    # x.com or twitter.com before fetching it.
+    url: str
+    cookies: Optional[str] = None  # X login cookie (optional)
+class SearchRequest(BaseModel):
+    # Request body for the /scrape/search endpoint.
+    # `query` is URL-quoted into the x.com search URL by scrape_search.
+    query: str
+    # Optional cookie string, forwarded unchanged to the scrape request.
+    cookies: Optional[str] = None
+# --- Scrapling lazy import (ağır kütüphane) ---
+def get_fetcher(cookies: Optional[str] = None):
+    from scrapling.fetchers import StealthyFetcher
+    return StealthyFetcher
+# --- Endpoints ---
+@app.get("/")
+def root():
+    return {"status": "ok", "service": "X Scraper API", "version": "1.0.0"}
+@app.get("/health")
+def health():
+    return {"status": "healthy"}
+@app.post("/scrape/tweet")
+async def scrape_tweet(req: ScrapeRequest):
+    """Tek tweet URL'sini scrape et"""
+    if "x.com" not in req.url and "twitter.com" not in req.url:
+        raise HTTPException(400, "Sadece X.com URL'leri kabul edilir")
+    try:
+        from scrapling.fetchers import StealthyFetcher
+        headers = {}
+        if req.cookies:
+            headers["Cookie"] = req.cookies
+        page = await asyncio.to_thread(
+            lambda: StealthyFetcher.fetch(
+                req.url,
+                headless=True,
+                network_idle=True,
+                extra_headers=headers,
+            )
+        )
+        # Tweet verilerini çek
+        tweet_data = {}
+        # Metin
+        text_el = page.css('[data-testid="tweetText"]')
+        tweet_data["text"] = text_el.get_text("\n") if text_el else ""
+        # Kullanıcı
+        user_el = page.css('[data-testid="User-Name"]')
+        tweet_data["user"] = user_el.get_text() if user_el else ""
+        # Zaman
+        time_el = page.css("time")
+        tweet_data["timestamp"] = time_el.attrib.get("datetime", "") if time_el else ""
+        # Etkileşimler - aria-label'dan parse et
+        like_el = page.css('[data-testid="like"]')
+        tweet_data["likes"] = like_el.attrib.get("aria-label", "0") if like_el else "0"
+        retweet_el = page.css('[data-testid="retweet"]')
+        tweet_data["retweets"] = retweet_el.attrib.get("aria-label", "0") if retweet_el else "0"
+        # Medya
+        tweet_data["images"] = [
+            img.attrib.get("src", "")
+            for img in page.css('[data-testid="tweetPhoto"] img')
+        ]
+        tweet_data["has_video"] = bool(page.css('[data-testid="videoPlayer"]'))
+        tweet_data["url"] = req.url
+        tweet_data["source"] = "scrapling"
+        return {"success": True, "data": tweet_data}
+    except Exception as e:
+        raise HTTPException(500, f"Scrape hatası: {str(e)}")
+@app.post("/scrape/profile")
+async def scrape_profile(req: ScrapeRequest):
+    """Profil sayfasından tweetleri çek"""
+    if "x.com" not in req.url and "twitter.com" not in req.url:
+        raise HTTPException(400, "Sadece X.com URL'leri kabul edilir")
+    try:
+        from scrapling.fetchers import StealthyFetcher
+        headers = {}
+        if req.cookies:
+            headers["Cookie"] = req.cookies
+        page = await asyncio.to_thread(
+            lambda: StealthyFetcher.fetch(
+                req.url,
+                headless=True,
+                network_idle=True,
+                extra_headers=headers,
+            )
+        )
+        tweets = []
+        articles = page.css('article[data-testid="tweet"]')
+        for article in articles:
+            try:
+                link = article.css('a[href*="/status/"]')
+                if not link:
+                    continue
+                href = link.attrib.get("href", "")
+                import re
+                match = re.search(r'/status/(\d+)', href)
+                if not match:
+                    continue
+                text_el = article.css('[data-testid="tweetText"]')
+                time_el = article.css("time")
+                tweet = {
+                    "id": match.group(1),
+                    "text": text_el.get_text() if text_el else "",
+                    "timestamp": time_el.attrib.get("datetime", "") if time_el else "",
+                    "url": f"https://x.com{href}",
+                }
+                tweets.append(tweet)
+            except Exception:
+                continue
+        return {"success": True, "count": len(tweets), "data": tweets}
+    except Exception as e:
+        raise HTTPException(500, f"Scrape hatası: {str(e)}")
+@app.post("/scrape/search")
+async def scrape_search(req: SearchRequest):
+    """Arama sonuçlarını scrape et"""
+    import urllib.parse
+    query_encoded = urllib.parse.quote(req.query)
+    url = f"https://x.com/search?q={query_encoded}&src=typed_query&f=live"
+    scrape_req = ScrapeRequest(url=url, cookies=req.cookies)
+    return await scrape_profile(scrape_req)
+if __name__ == "__main__":
+    # When run as a script, serve the app with uvicorn on all interfaces,
+    # port 7860.
+    uvicorn.run(app, host="0.0.0.0", port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+scrapling[fetchers]
+fastapi
+uvicorn[standard]
+pydantic