AdarshJi committed on
Commit
5927681
·
verified ·
1 Parent(s): daa47af

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +488 -0
app.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ from __future__ import annotations
4
+ import time
5
+ import asyncio
6
+ import json
7
+ import argparse
8
+ import threading
9
+ from typing import Optional, Dict, Any, Tuple
10
+ from concurrent.futures import ThreadPoolExecutor
11
+ from urllib.parse import quote_plus
12
+
13
+ from fastapi import FastAPI, HTTPException, Query, Body
14
+ from pydantic import BaseModel
15
+ from starlette.responses import JSONResponse
16
+
17
+ # --- bring in your BrowserManager and EXTRACT_DATA functions (adapted) ---
18
+ # For brevity we import them inline. If you already have them in a module,
19
+ # replace with: from your_module import BrowserManager, EXTRACT_DATA
20
+
21
+ import threading
22
+ from selenium import webdriver
23
+ from selenium.webdriver.chrome.service import Service
24
+ from selenium.webdriver.chrome.options import Options
25
+ from selenium.webdriver.common.by import By
26
+ from selenium.webdriver.support.ui import WebDriverWait
27
+ from selenium.webdriver.support import expected_conditions as EC
28
+ from selenium.common.exceptions import TimeoutException, WebDriverException
29
+ from webdriver_manager.chrome import ChromeDriverManager
30
+ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
31
+ from time import sleep
32
+ from bs4 import BeautifulSoup
33
+ from urllib.parse import urljoin
34
+
35
+ # ---------------- BrowserManager (same as your code but small adjustments) ----------------
36
class BrowserManager:
    """Owns a single Selenium Chrome driver and serialises access to it.

    One Chrome instance is started eagerly in __init__ and reused for every
    fetch_html() call; a lock guarantees only one page load at a time, so use
    several managers (see BrowserPool) for concurrency.
    """

    def __init__(
        self,
        headless: bool = True,
        user_agent: Optional[str] = None,
        window_size: str = "1366,768",
        disable_images: bool = True,
        block_resource_urls: Optional[list[str]] = None,
        page_load_strategy: str = "eager",
    ):
        self.headless = headless
        self.user_agent = user_agent
        self.window_size = window_size
        self.disable_images = disable_images
        # Default blocklist targets common ad/analytics hosts to speed up loads.
        self.block_resource_urls = block_resource_urls or [
            "*.doubleclick.net/*",
            "*.google-analytics.com/*",
            "*.googlesyndication.com/*",
            "*.adservice.google.com/*",
        ]
        self.page_load_strategy = page_load_strategy
        self._driver_lock = threading.Lock()
        self._driver: Optional[webdriver.Chrome] = None
        self._start_driver()

    def _start_driver(self):
        """Create the Chrome driver with the configured options."""
        opts = Options()
        if self.headless:
            # "--headless=new" is the modern headless mode; the original also
            # passed the legacy "--headless" flag, which is redundant and can
            # conflict with the new mode on recent Chrome builds.
            opts.add_argument("--headless=new")
        opts.add_argument(f"--window-size={self.window_size}")
        opts.add_argument("--no-sandbox")
        opts.add_argument("--disable-dev-shm-usage")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--disable-extensions")
        opts.add_argument("--disable-blink-features=AutomationControlled")
        if self.user_agent:
            opts.add_argument(f"--user-agent={self.user_agent}")

        if self.disable_images:
            # Skip downloading images / stylesheets / fonts for faster loads.
            prefs = {
                "profile.managed_default_content_settings.images": 2,
                "profile.managed_default_content_settings.stylesheets": 2,
                "profile.managed_default_content_settings.fonts": 2,
            }
            opts.add_experimental_option("prefs", prefs)

        opts.add_experimental_option("excludeSwitches", ["enable-logging"])

        # Apply the configured page-load strategy via Selenium 4 Options
        # (the original stored this setting but never used it).
        opts.page_load_strategy = self.page_load_strategy

        service = Service(ChromeDriverManager().install())
        self._driver = webdriver.Chrome(service=service, options=opts)

        # quick defaults
        try:
            self._driver.set_page_load_timeout(60)
            # Enable CDP to block some resource URLs (best-effort).
            self._driver.execute_cdp_cmd("Network.enable", {})
            if self.block_resource_urls:
                self._driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": self.block_resource_urls})
        except Exception:
            # Ignore CDP failures gracefully; blocking is an optimisation only.
            pass

    def fetch_html(
        self,
        url: str,
        wait_seconds: Optional[float] = 10.0,
        wait_for_selector: Optional[str] = None,
        force_reload: bool = True,
    ) -> str:
        """Navigate to *url* and return the rendered page source.

        wait_for_selector: CSS selector to wait for (best-effort, up to
            wait_seconds); without it we briefly wait for document.readyState.
        force_reload: accepted for interface compatibility; currently unused.
        Raises RuntimeError on WebDriver failure; the driver is restarted
        first so subsequent calls can succeed.
        """
        with self._driver_lock:
            # Check/restart the driver under the lock: the original checked
            # outside it, so two threads could each start a driver and leak one.
            if self._driver is None:
                self._start_driver()
            driver = self._driver
            try:
                driver.get(url)

                if wait_for_selector and wait_seconds:
                    try:
                        WebDriverWait(driver, wait_seconds).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_selector))
                        )
                    except TimeoutException:
                        # Selector never appeared; return whatever loaded.
                        pass
                else:
                    if wait_seconds:
                        try:
                            # Short readyState wait (capped at 3s) as a fallback.
                            WebDriverWait(driver, min(wait_seconds, 3)).until(
                                lambda d: d.execute_script("return document.readyState") == "complete"
                            )
                        except Exception:
                            time.sleep(0.5)

                return driver.page_source
            except WebDriverException as e:
                # Recover: discard the broken driver and start a fresh one so
                # the next call works, then surface the failure to the caller.
                try:
                    self._safe_quit_driver()
                except Exception:
                    pass
                self._start_driver()
                # Chain the cause so the original WebDriver traceback survives.
                raise RuntimeError(f"WebDriver error during fetch: {e}") from e

    def _safe_quit_driver(self):
        """Quit the driver, swallowing errors, and clear the reference."""
        if self._driver:
            try:
                self._driver.quit()
            except Exception:
                pass
        self._driver = None

    def close(self):
        """Shut down the underlying browser."""
        self._safe_quit_driver()
152
+
153
+ # ---------------- EXTRACT_DATA (same logic you wrote, returns dict) ----------------
154
def EXTRACT_DATA(html: str) -> Dict[str, Any]:
    """Parse a Google results page and return the structured sections found.

    Returns a dict containing only the non-empty sections among:
    web_results, image_results, video_results, news_results,
    knowledge_panel, ai_overview, all_thumbnail_urls.

    NOTE(review): relies on Google's obfuscated CSS class names
    (.tF2Cxc, .eA0Zlc, .KYaZsb, ...), which change periodically — expect
    this to need selector maintenance.
    """
    soup = BeautifulSoup(html, "html.parser")
    BASE_URL = "https://www.google.com"

    def safe_text(el):
        # Text content of a tag, or "" when the selector matched nothing.
        return el.get_text(strip=True) if el else ""

    def safe_attr(el, attr):
        # Attribute value, or "" when the tag or attribute is missing.
        return el.get(attr) if el and el.has_attr(attr) else ""

    def abs_url(url):
        # Resolve relative hrefs against google.com.
        return urljoin(BASE_URL, url) if url else ""

    def clean_thumb(src):
        # Drop inline data: URIs (lazy-load placeholders); absolutize the rest.
        if src and not src.startswith("data:"):
            return abs_url(src)
        return None

    def is_ad_element(element):
        # A result is an ad if any ancestor is one of Google's ad containers.
        for parent in element.parents:
            if parent.get("id") in ["tads", "tadsb"] or "ads-ad" in parent.get("class", []):
                return True
        return False

    # --- Organic web results ---
    web_results = []
    for result in soup.select(".tF2Cxc"):
        if is_ad_element(result):
            continue
        title_tag = result.select_one("h3")
        link_tag = result.select_one("a")
        cite_tag = result.select_one("cite")
        snippet_tag = result.select_one(".VwiC3b")
        read_more_tag = result.select_one(".vzmbzf")

        # Only count entries that have both a title and a link.
        if title_tag and link_tag:
            entry = {
                "no": len(web_results) + 1,
                "title": safe_text(title_tag),
                "link": abs_url(safe_attr(link_tag, "href")),
                "displayed_url": safe_text(cite_tag),
                "snippet": safe_text(snippet_tag)
            }
            extra = []
            if read_more_tag:
                read_more_url = abs_url(safe_attr(read_more_tag, "href"))
                if read_more_url:
                    extra.append({"read_more": read_more_url})
            if extra:
                entry["extra"] = extra
            web_results.append(entry)

    # --- Image pack ---
    image_results = []
    for img_item in soup.select(".eA0Zlc"):
        img_tag = img_item.select_one("img")
        link_tag = img_item.select_one("a")
        source_tag = img_item.select_one(".s0fJje span")
        # Lazy-loaded images keep the real URL in data-src.
        src = safe_attr(img_tag, "data-src") or safe_attr(img_tag, "src")
        thumb = clean_thumb(src)
        if thumb:
            image_results.append({
                "thumbnail": thumb,
                "alt": safe_attr(img_tag, "alt"),
                "source": safe_text(source_tag),
                "link": abs_url(safe_attr(link_tag, "href"))
            })

    # --- Video carousel ---
    video_results = []
    for video in soup.select(".KYaZsb"):
        title_tag = video.select_one(".tNxQIb.ynAwRc")
        link_tag = video.select_one("a.rIRoqf")
        thumb_img = video.select_one(".AZJdrc img")
        duration_tag = video.select_one(".c8rnLc")
        channel_tag = video.select_one(".Sg4azc span:first-child")
        date_tag = video.select_one(".rbYSKb span")
        desc_tag = video.select_one(".wNifxf .p4wth")
        thumb_src = safe_attr(thumb_img, "data-src") or safe_attr(thumb_img, "src")
        thumb = clean_thumb(thumb_src)
        if title_tag and link_tag:
            video_results.append({
                "title": safe_text(title_tag),
                "link": abs_url(safe_attr(link_tag, "href")),
                "thumbnail": thumb,
                "duration": safe_text(duration_tag),
                "channel": safe_text(channel_tag),
                "date": safe_text(date_tag),
                "description_snippet": safe_text(desc_tag)
            })

    # --- News box ---
    news_results = []
    for news in soup.select(".m7jPZ"):
        title_tag = news.select_one(".n0jPhd")
        link_tag = news.select_one("a")
        source_tag = news.select_one(".MgUUmf span")
        time_tag = news.select_one(".rbYSKb span")
        thumb_img = news.select_one(".uhHOwf img")
        thumb_src = safe_attr(thumb_img, "data-src") or safe_attr(thumb_img, "src")
        thumb = clean_thumb(thumb_src)
        if title_tag and link_tag:
            news_results.append({
                "title": safe_text(title_tag),
                "link": abs_url(safe_attr(link_tag, "href")),
                "source": safe_text(source_tag),
                "time": safe_text(time_tag),
                "thumbnail": thumb
            })

    # --- Knowledge panel (right-hand sidebar, id="rhs") ---
    knowledge_panel = {}
    rhs = soup.find(id="rhs")
    if rhs:
        title_tag = rhs.select_one(".PZPZlf.ssJ7i")
        subtitle_tag = rhs.select_one(".iAIpCb span")
        if title_tag:
            knowledge_panel["title"] = safe_text(title_tag)
        if subtitle_tag:
            knowledge_panel["subtitle"] = safe_text(subtitle_tag)

        desc_tag = rhs.select_one(".kno-rdesc span")
        if desc_tag:
            knowledge_panel["description"] = safe_text(desc_tag)

        # Label/value fact rows; values with multiple links become name lists.
        facts = {}
        for fact in rhs.select(".zloOqf"):
            label_tag = fact.select_one(".w8qArf")
            value_tag = fact.select_one(".LrzXr")
            if label_tag and value_tag:
                label = safe_text(label_tag).replace(":", "").strip()
                links = value_tag.find_all("a")
                if links and len(links) > 1:
                    names = [safe_text(a) for a in links if safe_text(a)]
                    if names:
                        facts[label] = names
                else:
                    text = safe_text(value_tag)
                    if text:
                        facts[label] = text
        if facts:
            knowledge_panel["facts"] = facts

        # Social-profile chips under the panel.
        profiles = []
        for profile in rhs.select(".dRrfkf a"):
            name_tag = profile.select_one(".CtCigf")
            link = safe_attr(profile, "href")
            if name_tag and link:
                profiles.append({
                    "platform": safe_text(name_tag),
                    "link": abs_url(link)
                })
        if profiles:
            knowledge_panel["profiles"] = profiles

    # Collapse an empty panel to None so it is omitted from the output.
    if not knowledge_panel:
        knowledge_panel = None

    # --- AI overview text, when present ---
    ai_overview = None
    ai_container = soup.select_one(".p2M1Qe .f5cPye")
    if ai_container:
        text = safe_text(ai_container)
        if text:
            ai_overview = text

    # --- Every non-data: image URL on the page, deduplicated ---
    thumbnails = set()
    for img in soup.select("img[data-src], img[src]"):
        src = safe_attr(img, "data-src") or safe_attr(img, "src")
        clean = clean_thumb(src)
        if clean:
            thumbnails.add(clean)

    all_thumbnails = sorted(thumbnails) if thumbnails else None

    # Assemble the output dict, including only non-empty sections.
    data = {}
    if web_results:
        data["web_results"] = web_results
    if image_results:
        data["image_results"] = image_results
    if video_results:
        data["video_results"] = video_results
    if news_results:
        data["news_results"] = news_results
    if knowledge_panel:
        data["knowledge_panel"] = knowledge_panel
    if ai_overview:
        data["ai_overview"] = ai_overview
    if all_thumbnails:
        data["all_thumbnail_urls"] = all_thumbnails

    return data
340
+
341
+ # ---------------- BrowserPool and API glue ----------------
342
class BrowserPool:
    """A fixed-size set of BrowserManager instances handed out round-robin."""

    def __init__(self, pool_size: int = 2, headless: bool = True):
        # Never allow an empty pool; clamp to at least one browser.
        self.pool_size = max(1, pool_size)
        self.managers = [BrowserManager(headless=headless) for _ in range(self.pool_size)]
        self._rr_index = 0
        self._rr_lock = threading.Lock()

    def pick_manager(self) -> BrowserManager:
        """Return the next manager in round-robin order (thread-safe)."""
        with self._rr_lock:
            chosen = self.managers[self._rr_index]
            self._rr_index = (self._rr_index + 1) % self.pool_size
            return chosen

    def close_all(self):
        """Best-effort shutdown of every pooled browser."""
        for mgr in self.managers:
            try:
                mgr.close()
            except Exception:
                pass
361
+
362
+ # Simple TTL cache to speed repeated queries
363
class SimpleTTLCache:
    """Minimal thread-safe in-memory cache whose entries expire after a TTL.

    Expired entries are evicted lazily on access; there is no background
    sweeper, so unread expired entries linger until their key is queried.
    """

    def __init__(self, ttl_seconds: int = 20):
        self.ttl = ttl_seconds
        # key -> (insertion timestamp from time.monotonic(), value)
        self._cache: Dict[str, Tuple[float, Any]] = {}
        self._lock = threading.Lock()

    def get(self, key: str):
        """Return the cached value for *key*, or None if absent or expired."""
        with self._lock:
            item = self._cache.get(key)
            if not item:
                return None
            ts, value = item
            # time.monotonic() is immune to wall-clock adjustments (NTP/DST),
            # unlike time.time(), so TTLs cannot be skewed by clock changes.
            if time.monotonic() - ts > self.ttl:
                del self._cache[key]
                return None
            return value

    def set(self, key: str, value: Any):
        """Store *value* under *key*, stamped with the current monotonic time."""
        with self._lock:
            self._cache[key] = (time.monotonic(), value)
383
+
384
+ # API models
385
class SearchRequest(BaseModel):
    """Payload for POST /search: supply either a Google `query` or a direct `url`."""
    # Free-text Google query; used to build a search URL when `url` is absent.
    query: Optional[str] = None
    # Explicit URL to fetch instead of a Google search results page.
    url: Optional[str] = None
    # CSS selector to wait for before scraping (best-effort).
    wait_for_selector: Optional[str] = None
    # NOTE(review): accepted but not consumed by the POST handler in this file;
    # the pool's headless mode is fixed at startup.
    headless: Optional[bool] = True
390
+
391
+ # FastAPI app
392
app = FastAPI(title="fast_fetcher_api", version="0.1")

# Global shared objects (initialized on startup)
# POOL and EXECUTOR are populated by startup_event and torn down by
# shutdown_event; CACHE is usable immediately.
POOL: Optional[BrowserPool] = None  # round-robin pool of Chrome browsers
EXECUTOR: Optional[ThreadPoolExecutor] = None  # runs blocking Selenium work off the event loop
CACHE = SimpleTTLCache(ttl_seconds=25)  # short-lived cache for repeated identical queries
398
+
399
+ @app.on_event("startup")
400
+ async def startup_event():
401
+ global POOL, EXECUTOR
402
+ # tune pool_size to your system. Default 2 is a reasonable start.
403
+ POOL = BrowserPool(pool_size=2, headless=False)
404
+ EXECUTOR = ThreadPoolExecutor(max_workers=4)
405
+ app.state.executor = EXECUTOR
406
+ app.state.pool = POOL
407
+ print("Startup: browser pool created (size=2).")
408
+
409
+ @app.on_event("shutdown")
410
+ async def shutdown_event():
411
+ global POOL, EXECUTOR
412
+ if POOL:
413
+ POOL.close_all()
414
+ if EXECUTOR:
415
+ EXECUTOR.shutdown(wait=True)
416
+ print("Shutdown: browsers closed and executor stopped.")
417
+
418
+ # Helper to run blocking fetch in threadpool
419
def _blocking_fetch_and_extract(manager: BrowserManager, url: str, wait_for_selector: Optional[str], wait_seconds: Optional[float]):
    """Fetch *url* through *manager* and run extraction (blocking; call in a thread)."""
    started_at = time.time()
    page_html = manager.fetch_html(url, wait_seconds=wait_seconds, wait_for_selector=wait_for_selector)
    parsed = EXTRACT_DATA(page_html)
    return {"url": url, "duration": time.time() - started_at, "data": parsed}
425
+
426
+ # Routes
427
+ @app.get("/health")
428
+ async def health():
429
+ return {"status": "ok"}
430
+
431
+ @app.get("/search")
432
+ async def search(query: str = Query(..., min_length=1), wait_for_selector: Optional[str] = None):
433
+ """
434
+ Search endpoint. Example:
435
+ /search?query=python web scraping
436
+ """
437
+ q = query.strip()
438
+ if not q:
439
+ raise HTTPException(status_code=400, detail="query parameter required")
440
+
441
+ url = f"https://www.google.com/search?q={quote_plus(q)}"
442
+ cache_key = f"search:{q}:{wait_for_selector}"
443
+
444
+ cached = CACHE.get(cache_key)
445
+ if cached:
446
+ return JSONResponse(content={"cached": True, **cached})
447
+
448
+ manager = app.state.pool.pick_manager()
449
+ # run in threadpool
450
+ loop = asyncio.get_event_loop()
451
+ fut = loop.run_in_executor(app.state.executor, _blocking_fetch_and_extract, manager, url, wait_for_selector, 5.0)
452
+ result = await fut
453
+ CACHE.set(cache_key, result)
454
+ return JSONResponse(content={"cached": False, **result})
455
+
456
+ @app.get("/fetch")
457
+ async def fetch(url: str = Query(..., min_length=5), wait_for_selector: Optional[str] = None):
458
+ """
459
+ Fetch an arbitrary URL and return extracted JSON.
460
+ /fetch?url=https://example.com
461
+ """
462
+ manager = app.state.pool.pick_manager()
463
+ loop = asyncio.get_event_loop()
464
+ fut = loop.run_in_executor(app.state.executor, _blocking_fetch_and_extract, manager, url, wait_for_selector, 6.0)
465
+ result = await fut
466
+ return JSONResponse(content=result)
467
+
468
+ @app.post("/search")
469
+ async def post_search(body: SearchRequest = Body(...)):
470
+ if not (body.query or body.url):
471
+ raise HTTPException(status_code=400, detail="Either query or url must be provided")
472
+ if body.url:
473
+ target = body.url
474
+ else:
475
+ target = f"https://www.google.com/search?q={quote_plus(body.query)}"
476
+
477
+ cache_key = f"search_post:{target}:{body.wait_for_selector}"
478
+ cached = CACHE.get(cache_key)
479
+ if cached:
480
+ return JSONResponse(content={"cached": True, **cached})
481
+
482
+ manager = app.state.pool.pick_manager()
483
+ loop = asyncio.get_event_loop()
484
+ fut = loop.run_in_executor(app.state.executor, _blocking_fetch_and_extract, manager, target, body.wait_for_selector, 6.0)
485
+ result = await fut
486
+ CACHE.set(cache_key, result)
487
+ return JSONResponse(content={"cached": False, **result})
488
+