tecuts committed on
Commit
4c6320f
·
verified ·
1 Parent(s): 64a4922

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +337 -0
app.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import time
4
+ import asyncio
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from typing import List, Optional, Dict, Any
7
+ from urllib.parse import urlparse
8
+ from fastapi import FastAPI, HTTPException, Query, Request, BackgroundTasks
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from fastapi.responses import JSONResponse
11
+ from pydantic import BaseModel
12
+ from selenium import webdriver
13
+ from selenium.webdriver.common.by import By
14
+ from selenium.webdriver.support.ui import WebDriverWait
15
+ from selenium.webdriver.support import expected_conditions as EC
16
+ from selenium.webdriver.chrome.options import Options
17
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
18
+ import uvicorn
19
+
20
# FastAPI application instance; this metadata appears in the generated OpenAPI docs.
app = FastAPI(
    title="Threads Media Extractor API",
    description="Extract media URLs from Threads posts - Optimized version",
    version="2.1.0"
)

# Add CORS middleware
# Wide-open CORS policy: any origin, method, and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global driver pool for reuse
# NOTE(review): plain list shared across worker threads; list append/pop are
# GIL-atomic in CPython, but nothing else guards concurrent access — confirm
# this is acceptable at the configured concurrency (2 workers).
driver_pool = []
# Thread pool for the blocking Selenium work; two workers caps concurrent extractions.
executor = ThreadPoolExecutor(max_workers=2)
38
+
39
class MediaItem(BaseModel):
    """One entry of the ``picker`` list in a multi-media response."""
    # Direct http(s) URL of the media asset
    url: str
41
+
42
class ThreadsResponse(BaseModel):
    """Response payload for a successful extraction.

    At most one of ``url`` (exactly one media item) or ``picker`` (several
    items) is populated; fields left as ``None`` are dropped at serialization
    time by the endpoint calling ``model_dump(exclude_none=True)``.

    Fix: the previous inner ``class Config: exclude_none = True`` was removed —
    ``exclude_none`` is a serialization argument to ``.dict()``/``.model_dump()``,
    not a recognized pydantic Config option, so it had no effect.
    """
    post_url: str                           # original Threads post URL
    url: Optional[str] = None               # set only when media_count == 1
    picker: Optional[List[MediaItem]] = None  # set only when media_count > 1
    media_count: int
    post_text: Optional[str] = None         # best-effort caption text
    author: Optional[str] = None            # currently never populated by the scraper
    success: bool
    processing_time: Optional[float] = None  # seconds spent scraping
55
+
56
class ErrorResponse(BaseModel):
    """Error payload shape. NOTE(review): not referenced elsewhere in this file;
    the custom exception handler builds a similar dict by hand."""
    # Human-readable error description
    error: str
    # Always False for error responses
    success: bool = False
59
+
60
def create_optimized_driver():
    """Build a headless Chrome WebDriver tuned for fast, low-footprint scraping.

    Raises:
        HTTPException: 500 if the browser cannot be started or configured.
    """
    user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    # Flags grouped as: headless/stability, performance, network.
    chrome_flags = (
        '--headless=new',  # Use new headless mode
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
        '--disable-plugins',
        '--disable-default-apps',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-features=TranslateUI',
        '--disable-ipc-flooding-protection',
        # Performance optimizations
        '--memory-pressure-off',
        '--max_old_space_size=4096',
        '--window-size=1280,720',  # Smaller window
        # Network optimizations
        '--aggressive-cache-discard',
        '--disable-background-networking',
    )

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    # Reduce the automation fingerprint exposed to the page.
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_argument('--disable-blink-features=AutomationControlled')

    # Desktop Linux Chrome user agent.
    options.add_argument('--user-agent=' + user_agent)

    try:
        driver = webdriver.Chrome(options=options)
        driver.implicitly_wait(5)          # short implicit wait
        driver.set_page_load_timeout(15)   # short page-load timeout

        # Mirror the user agent at the CDP level as well.
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {
            "userAgent": user_agent
        })

        return driver
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to create browser driver: {str(e)}")
106
+
107
def get_driver():
    """Hand out a pooled driver when one is available, otherwise start a new one."""
    return driver_pool.pop() if driver_pool else create_optimized_driver()
112
+
113
def return_driver(driver):
    """Return a driver to the reuse pool, or dispose of it when the pool is full.

    Args:
        driver: The WebDriver instance to recycle.
    """
    if len(driver_pool) < 2:  # Keep max 2 drivers in pool
        driver_pool.append(driver)
    else:
        try:
            driver.quit()
        except Exception:
            # Fix: was a bare `except:` which also swallowed SystemExit /
            # KeyboardInterrupt. Quit is best-effort; a dead driver can't quit.
            pass
122
+
123
def extract_post_id_from_url(url: str) -> Optional[str]:
    """Return the post ID embedded in a Threads URL, or None if no pattern matches.

    Both threads.net and threads.com host posts under ``/@user/post/<id>``
    and the short form ``/t/<id>``.
    """
    patterns = (
        r'threads\.net/@[^/]+/post/([A-Za-z0-9_-]+)',
        r'threads\.net/t/([A-Za-z0-9_-]+)',
        r'threads\.com/@[^/]+/post/([A-Za-z0-9_-]+)',
        r'threads\.com/t/([A-Za-z0-9_-]+)',
    )
    # First pattern that matches wins; fall back to None.
    return next(
        (m.group(1) for m in (re.search(p, url) for p in patterns) if m),
        None,
    )
138
+
139
def is_valid_threads_url(url: str) -> bool:
    """Return True if *url* points at a Threads post on a known Threads host.

    Args:
        url: Candidate URL string.

    Returns:
        True when the host is threads.net/threads.com (with or without www)
        and the path contains a post segment (``/post/`` or ``/t/``).
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # Fix: was a bare `except:`. urlparse raises ValueError for malformed
        # URLs (e.g. invalid IPv6 bracket notation).
        return False
    valid_hosts = {'threads.net', 'www.threads.net', 'threads.com', 'www.threads.com'}
    return parsed.netloc in valid_hosts and ('/post/' in parsed.path or '/t/' in parsed.path)
149
+
150
def fast_extract_media(driver: webdriver.Chrome, url: str) -> Dict[str, Any]:
    """Load a Threads post and scrape its media URLs plus best-effort caption text.

    Args:
        driver: A ready Chrome WebDriver (typically reused from the pool).
        url: The Threads post URL to scrape.

    Returns:
        Dict with keys ``media_urls`` (deduplicated, order preserved),
        ``post_text``, ``author`` (currently never populated — the scraper has
        no author selector) and ``processing_time`` in seconds.

    Raises:
        HTTPException: 500 wrapping any scraping failure.
    """
    media_urls = []
    post_text = None
    author = None  # NOTE(review): never assigned below; always returned as None

    try:
        start_time = time.time()

        # Navigate to the URL
        driver.get(url)

        # Wait for the document to finish loading, but continue on timeout:
        # partially-loaded pages often still contain the media elements.
        try:
            WebDriverWait(driver, 8).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except TimeoutException:
            pass  # Continue even if timeout

        # Brief pause so JS-rendered media can attach to the DOM.
        time.sleep(1.5)

        # Videos take priority over images.
        video_elements = driver.find_elements(By.TAG_NAME, 'video')
        for video in video_elements:
            src = video.get_attribute('src')
            if src and src.startswith('http'):
                media_urls.append(src)

            # <source> children may carry the actual stream URL.
            for source in video.find_elements(By.TAG_NAME, 'source'):
                src = source.get_attribute('src')
                if src and src.startswith('http'):
                    media_urls.append(src)

        # Fall back to images only when no video was found.
        if not media_urls:
            img_elements = driver.find_elements(By.TAG_NAME, 'img')[:10]  # cap the scan
            for img in img_elements:
                src = img.get_attribute('src')
                if src and src.startswith('http') and any(ext in src.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp']):
                    # Skip obvious non-content images (profile pictures, UI assets).
                    if not any(exclude in src.lower() for exclude in ['profile', 'avatar', 'icon', 'logo']):
                        media_urls.append(src)

        # Best-effort caption extraction; ignore failures entirely.
        try:
            text_elements = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"] span, article span')[:5]
            for element in text_elements:
                text = element.text.strip()
                if text and len(text) > 10:
                    post_text = text
                    break
        except Exception:
            # Fix: was a bare `except:` which also swallowed SystemExit etc.
            pass

        # Order-preserving de-duplication.
        # Fix: the previous manual loop reused the name `url`, shadowing the
        # function parameter.
        unique_media_urls = list(dict.fromkeys(media_urls))

        processing_time = time.time() - start_time

        return {
            "media_urls": unique_media_urls,
            "post_text": post_text,
            "author": author,
            "processing_time": processing_time
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error extracting media: {str(e)}")
226
+
227
def extract_media_sync(url: str) -> Dict[str, Any]:
    """Blocking extraction entry point, intended to run inside the thread pool.

    Checks out a driver, scrapes the post, and always recycles the driver —
    even when extraction raises.
    """
    driver = get_driver()
    try:
        return fast_extract_media(driver, url)
    finally:
        return_driver(driver)
237
+
238
+
239
+
240
@app.get("/")
async def health_check():
    """Health check endpoint reporting service identity and pool occupancy."""
    payload = {
        "status": "healthy",
        "service": "extractor",
        "version": "2.1.0",
        "driver_pool_size": len(driver_pool),
    }
    return payload
249
+
250
+
251
+
252
@app.get("/extract")
async def extract_media(url: str = Query(..., description="Threads post URL")):
    """
    Extract media URLs from a Threads post - Optimized version

    Args:
        url: The Threads post URL to extract media from

    Returns:
        JSON body of ThreadsResponse with media URLs and metadata; None-valued
        fields (including whichever of url/picker is unused) are omitted.

    Raises:
        HTTPException: 400 for invalid input, 500 for extraction failures.
    """
    # Validate URL (Query(...) already enforces presence; keep the guard as a
    # defensive check against empty strings).
    if not url:
        raise HTTPException(status_code=400, detail="URL parameter is required")

    if not is_valid_threads_url(url):
        raise HTTPException(status_code=400, detail="Invalid Threads URL format")

    # Extract post ID — used purely as a second validation step here.
    post_id = extract_post_id_from_url(url)
    if not post_id:
        raise HTTPException(status_code=400, detail="Could not extract post ID from URL")

    try:
        # Run blocking Selenium work in the thread pool so the event loop stays free.
        # Fix: asyncio.get_running_loop() is the supported call inside a
        # coroutine; get_event_loop() is deprecated in this context.
        loop = asyncio.get_running_loop()
        extracted_data = await loop.run_in_executor(executor, extract_media_sync, url)

        media_urls = extracted_data["media_urls"]
        media_count = len(media_urls)

        # Base response data
        response_data = {
            "post_url": url,
            "media_count": media_count,
            "post_text": extracted_data["post_text"],
            "author": extracted_data["author"],
            "success": True,
            "processing_time": extracted_data.get("processing_time")
        }

        # Exactly one item -> flat "url"; several -> "picker"; zero -> neither.
        if media_count == 1:
            response_data["url"] = media_urls[0]
        elif media_count > 1:
            # Fix: loop variable renamed so it no longer shadows the `url` parameter.
            response_data["picker"] = [{"url": media_url} for media_url in media_urls]

        # exclude_none drops all unset Optional fields from the JSON body.
        response = ThreadsResponse(**response_data)
        return JSONResponse(content=response.model_dump(exclude_none=True))

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
311
+
312
+
313
@app.on_event("shutdown")
async def shutdown_event():
    """Clean up the thread pool and all pooled browser drivers on shutdown."""
    executor.shutdown(wait=False)
    for driver in driver_pool:
        try:
            driver.quit()
        except Exception:
            # Fix: was a bare `except:`; quitting is best-effort during shutdown.
            pass
322
+
323
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Render every HTTPException as a uniform JSON error envelope."""
    body = {
        "error": exc.detail,
        "success": False,
        "status_code": exc.status_code
    }
    return JSONResponse(status_code=exc.status_code, content=body)
334
+
335
if __name__ == "__main__":
    # Bind to all interfaces; port defaults to 7860 unless $PORT overrides it.
    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)