ziffir committed on
Commit
7cb594e
·
verified ·
1 Parent(s): 5896c90

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +26 -0
  2. README.md +28 -7
  3. app.py +171 -0
  4. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# System dependencies (for Playwright/Chromium)
RUN apt-get update && apt-get install -y \
        curl \
        gnupg \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libatspi2.0-0 \
        libcairo2 \
        libcups2 \
        libdbus-1-3 \
        libdrm2 \
        libgbm1 \
        libglib2.0-0 \
        libnspr4 \
        libnss3 \
        libpango-1.0-0 \
        libx11-6 \
        libxcomposite1 \
        libxdamage1 \
        libxext6 \
        libxfixes3 \
        libxkbcommon0 \
        libxrandr2 \
        libxshmfence1 \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies (copied first so this layer is reused when only app.py changes)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Scrapling browser installation
RUN scrapling install

COPY app.py .

# HF Spaces port 7860
EXPOSE 7860

CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,33 @@
1
  ---
2
- title: X Scraper
3
- emoji:
4
- colorFrom: red
5
- colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
- short_description: x-scraper
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: X Tweet Scraper API
3
+ emoji: 🐦
4
+ colorFrom: gray
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
 
 
8
  ---
9
 
10
+ # X Tweet Scraper API
11
+
12
+ FastAPI + Scrapling tabanlı X (Twitter) scraper.
13
+
14
+ ## Endpoints
15
+
16
+ | Method | Path | Açıklama |
17
+ |--------|------|----------|
18
+ | GET | `/` | Sağlık kontrolü |
19
+ | POST | `/scrape/tweet` | Tek tweet detayı |
20
+ | POST | `/scrape/profile` | Profil tweetleri |
21
+ | POST | `/scrape/search` | Arama sonuçları |
22
+
23
+ ## Kullanım
24
+
25
+ ```json
26
+ POST /scrape/tweet
27
+ {
28
+ "url": "https://x.com/username/status/123456",
29
+ "cookies": "auth_token=xxx; ct0=yyy"
30
+ }
31
+ ```
32
+
33
+ > **Not:** X.com artık login gerektiriyor. `cookies` alanına tarayıcıdan kopyaladığın X cookie'lerini yapıştır.
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ from fastapi import FastAPI, HTTPException
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from pydantic import BaseModel
6
+ from typing import Optional
7
+ import uvicorn
8
+
9
# The FastAPI application object that the route decorators register on.
app = FastAPI(
    title="X Scraper API",
    version="1.0.0",
)

# Allow requests coming from the extension: every origin, method and header
# is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
18
+
19
+ # --- Models ---
20
class ScrapeRequest(BaseModel):
    """Request body for the endpoints that scrape a given URL."""

    # Address of the page to fetch.
    url: str
    # X login cookie string (optional).
    cookies: Optional[str] = None
23
+
24
class SearchRequest(BaseModel):
    """Request body for the search endpoint."""

    # Search text; it is percent-encoded into the search URL by the endpoint.
    query: str
    # X login cookie string (optional).
    cookies: Optional[str] = None
27
+
28
+ # --- Scrapling lazy import (ağır kütüphane) ---
29
def get_fetcher(cookies: Optional[str] = None):
    """Return Scrapling's ``StealthyFetcher`` class.

    The import is done inside the function (lazy import) because the library
    is heavy.

    Args:
        cookies: Currently ignored; the function does not read it.

    Returns:
        The ``StealthyFetcher`` class itself, not an instance.
    """
    from scrapling.fetchers import StealthyFetcher as fetcher_cls

    return fetcher_cls
32
+
33
+ # --- Endpoints ---
34
+
35
@app.get("/")
def root():
    """Return a static payload with the service status, name and version."""
    payload = {
        "status": "ok",
        "service": "X Scraper API",
        "version": "1.0.0",
    }
    return payload
38
+
39
@app.get("/health")
def health():
    """Return a static "healthy" status payload."""
    status = "healthy"
    return {"status": status}
42
+
43
@app.post("/scrape/tweet")
async def scrape_tweet(req: ScrapeRequest):
    """Scrape a single tweet URL.

    Args:
        req: Request body carrying the tweet ``url`` and an optional raw
            ``cookies`` string, which is forwarded as the ``Cookie`` header.

    Returns:
        ``{"success": True, "data": {...}}`` where ``data`` holds the keys
        ``text``, ``user``, ``timestamp``, ``likes``, ``retweets``,
        ``images``, ``has_video``, ``url`` and ``source``.

    Raises:
        HTTPException: 400 when ``url`` is not an http(s) URL whose host is
            x.com / twitter.com (or a subdomain of either); 500 when
            fetching or parsing fails.
    """
    from urllib.parse import urlsplit

    # Validate the parsed host instead of searching the raw string: a
    # substring test would accept e.g. "https://evil.example/?x.com" and make
    # the server-side browser fetch an arbitrary site.
    try:
        parts = urlsplit(req.url)
        scheme, host = parts.scheme, (parts.hostname or "").lower()
    except ValueError:
        # urlsplit rejects some malformed inputs (e.g. bad IPv6 brackets).
        scheme, host = "", ""
    host_allowed = host in ("x.com", "twitter.com") or host.endswith(
        (".x.com", ".twitter.com")
    )
    if scheme not in ("http", "https") or not host_allowed:
        raise HTTPException(400, "Sadece X.com URL'leri kabul edilir")

    try:
        from scrapling.fetchers import StealthyFetcher

        headers = {}
        if req.cookies:
            headers["Cookie"] = req.cookies

        # The fetch call is synchronous; run it in a worker thread so the
        # event loop is not blocked while the browser loads the page.
        page = await asyncio.to_thread(
            StealthyFetcher.fetch,
            req.url,
            headless=True,
            network_idle=True,
            extra_headers=headers,
        )

        # NOTE(review): the css() results below are used both as a single
        # element (.get_text / .attrib) and as an iterable of elements (the
        # image loop). Confirm against the installed Scrapling version that
        # both usages are supported.
        tweet_data = {}

        # Text
        text_el = page.css('[data-testid="tweetText"]')
        tweet_data["text"] = text_el.get_text("\n") if text_el else ""

        # User
        user_el = page.css('[data-testid="User-Name"]')
        tweet_data["user"] = user_el.get_text() if user_el else ""

        # Time
        time_el = page.css("time")
        tweet_data["timestamp"] = time_el.attrib.get("datetime", "") if time_el else ""

        # Interactions - taken from the aria-label attribute (raw label text)
        like_el = page.css('[data-testid="like"]')
        tweet_data["likes"] = like_el.attrib.get("aria-label", "0") if like_el else "0"

        retweet_el = page.css('[data-testid="retweet"]')
        tweet_data["retweets"] = retweet_el.attrib.get("aria-label", "0") if retweet_el else "0"

        # Media
        tweet_data["images"] = [
            img.attrib.get("src", "")
            for img in page.css('[data-testid="tweetPhoto"] img')
        ]
        tweet_data["has_video"] = bool(page.css('[data-testid="videoPlayer"]'))

        tweet_data["url"] = req.url
        tweet_data["source"] = "scrapling"

        return {"success": True, "data": tweet_data}

    except Exception as e:
        # Top-level boundary: report any failure as a 500 and keep the cause
        # chained for server-side tracebacks.
        raise HTTPException(500, f"Scrape hatası: {str(e)}") from e
101
+
102
+
103
@app.post("/scrape/profile")
async def scrape_profile(req: ScrapeRequest):
    """Collect the tweets found on a profile page.

    Args:
        req: Request body carrying the page ``url`` and an optional raw
            ``cookies`` string, which is forwarded as the ``Cookie`` header.

    Returns:
        ``{"success": True, "count": <int>, "data": [...]}`` where each item
        has the keys ``id``, ``text``, ``timestamp`` and ``url``.

    Raises:
        HTTPException: 400 when ``url`` is not an http(s) URL whose host is
            x.com / twitter.com (or a subdomain of either); 500 when
            fetching or parsing fails.
    """
    import re
    from urllib.parse import urljoin, urlsplit

    # Validate the parsed host instead of searching the raw string: a
    # substring test would accept e.g. "https://evil.example/?x.com" and make
    # the server-side browser fetch an arbitrary site.
    try:
        parts = urlsplit(req.url)
        scheme, host = parts.scheme, (parts.hostname or "").lower()
    except ValueError:
        # urlsplit rejects some malformed inputs (e.g. bad IPv6 brackets).
        scheme, host = "", ""
    host_allowed = host in ("x.com", "twitter.com") or host.endswith(
        (".x.com", ".twitter.com")
    )
    if scheme not in ("http", "https") or not host_allowed:
        raise HTTPException(400, "Sadece X.com URL'leri kabul edilir")

    # Compiled once here rather than re-imported/re-looked-up per article.
    status_re = re.compile(r'/status/(\d+)')

    try:
        from scrapling.fetchers import StealthyFetcher

        headers = {}
        if req.cookies:
            headers["Cookie"] = req.cookies

        # The fetch call is synchronous; run it in a worker thread so the
        # event loop is not blocked while the browser loads the page.
        page = await asyncio.to_thread(
            StealthyFetcher.fetch,
            req.url,
            headless=True,
            network_idle=True,
            extra_headers=headers,
        )

        tweets = []
        articles = page.css('article[data-testid="tweet"]')

        # NOTE(review): inside the loop, css() results are used as a single
        # element (.attrib / .get_text). Confirm against the installed
        # Scrapling version that this is supported.
        for article in articles:
            try:
                link = article.css('a[href*="/status/"]')
                if not link:
                    continue

                href = link.attrib.get("href", "")
                match = status_re.search(href)
                if not match:
                    continue

                text_el = article.css('[data-testid="tweetText"]')
                time_el = article.css("time")

                tweets.append(
                    {
                        "id": match.group(1),
                        "text": text_el.get_text() if text_el else "",
                        "timestamp": time_el.attrib.get("datetime", "") if time_el else "",
                        # urljoin keeps an already-absolute href intact and
                        # prefixes a relative one with the site origin.
                        "url": urljoin("https://x.com", href),
                    }
                )
            except Exception:
                # Deliberate best-effort: one malformed article must not
                # discard the rest of the page.
                continue

        return {"success": True, "count": len(tweets), "data": tweets}

    except Exception as e:
        # Top-level boundary: report any failure as a 500 and keep the cause
        # chained for server-side tracebacks.
        raise HTTPException(500, f"Scrape hatası: {str(e)}") from e
157
+
158
+
159
@app.post("/scrape/search")
async def scrape_search(req: SearchRequest):
    """Scrape the search results for ``req.query``.

    The query is percent-encoded into an x.com search URL (``f=live``) and the
    work is delegated to ``scrape_profile``, so the response has the same
    shape as that endpoint's.
    """
    from urllib.parse import quote

    search_url = (
        "https://x.com/search?q="
        + quote(req.query)
        + "&src=typed_query&f=live"
    )
    return await scrape_profile(ScrapeRequest(url=search_url, cookies=req.cookies))
168
+
169
+
170
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 7860.
    uvicorn.run(app, port=7860, host="0.0.0.0")
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ scrapling[fetchers]
2
+ fastapi
3
+ uvicorn[standard]
4
+ pydantic