mdnazib963 committed on
Commit
e6068cc
·
verified ·
1 Parent(s): da612bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -14
app.py CHANGED
@@ -17,34 +17,39 @@ MAX_LINKS = 3
17
 
18
  class CloudResearchEngine:
19
  def __init__(self):
20
- # Browser config optimized for Docker/Cloud containers
21
- # FIX: Used 'extra_args' instead of 'args'
22
  self.browser_conf = BrowserConfig(
23
  headless=True,
24
  verbose=False,
25
- extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"]
 
 
 
 
 
26
  )
 
 
27
  self.run_conf = CrawlerRunConfig(
28
- cache_mode=CacheMode.BYPASS,
29
- # Stealth headers to try and bypass simple bot detection
30
- headers={
31
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
32
- }
33
  )
 
34
  self.ai_client = Client(AI_CLIENT_URL)
35
 
36
  def search_google_url(self, query):
37
  """Generates the Google Search URL."""
38
  encoded_query = urllib.parse.quote_plus(query)
39
- # We add 'gl=us' (GeoLocation US) and 'hl=en' (Language English)
40
  return f"https://www.google.com/search?q={encoded_query}&num=10&hl=en&gl=us"
41
 
42
  async def crawl_single_page(self, url):
43
  """Crawls a URL with error handling for the cloud environment."""
 
44
  async with AsyncWebCrawler(config=self.browser_conf) as crawler:
45
  try:
46
- # Add a small delay to be polite and avoid immediate blocks
47
  await asyncio.sleep(1)
 
48
  result = await crawler.arun(url=url, config=self.run_conf)
49
 
50
  if result.success:
@@ -56,18 +61,15 @@ class CloudResearchEngine:
56
 
57
  def extract_links(self, markdown_text):
58
  """Finds links in the markdown. Handles Google's messy redirection links."""
59
- # Standard markdown links [text](url)
60
  links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
61
 
62
  clean_urls = []
63
  for text, url in links:
64
- # Filter out Google internal links and tiny links
65
  if "google.com" in url or "youtube.com" in url:
66
  continue
67
  if len(url) < 15:
68
  continue
69
 
70
- # De-duplicate
71
  domain = urllib.parse.urlparse(url).netloc
72
  if not any(domain in u for u in clean_urls):
73
  clean_urls.append(url)
@@ -96,7 +98,7 @@ class CloudResearchEngine:
96
 
97
  # --- GRADIO INTERFACE ---
98
 
99
- # Initialize engine globally to persist settings
100
  engine = CloudResearchEngine()
101
 
102
  async def run_process(topic):
 
17
 
18
  class CloudResearchEngine:
19
  def __init__(self):
20
+ # 1. SETUP BROWSER (Headers & User Agent go here)
 
21
  self.browser_conf = BrowserConfig(
22
  headless=True,
23
  verbose=False,
24
+ # 'extra_args' is the correct parameter for passing flags
25
+ extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
26
+ # User Agent matches a real Chrome browser to avoid blocks
27
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
28
+ # Optional: You can pass other headers here if needed, but user_agent is usually enough
29
+ # headers={"Accept-Language": "en-US,en;q=0.9"}
30
  )
31
+
32
+ # 2. SETUP RUN CONFIG (Cache & Execution rules go here)
33
  self.run_conf = CrawlerRunConfig(
34
+ cache_mode=CacheMode.BYPASS
35
+ # removed 'headers' from here as it caused the crash
 
 
 
36
  )
37
+
38
  self.ai_client = Client(AI_CLIENT_URL)
39
 
40
  def search_google_url(self, query):
41
  """Generates the Google Search URL."""
42
  encoded_query = urllib.parse.quote_plus(query)
 
43
  return f"https://www.google.com/search?q={encoded_query}&num=10&hl=en&gl=us"
44
 
45
  async def crawl_single_page(self, url):
46
  """Crawls a URL with error handling for the cloud environment."""
47
+ # We pass the browser config here to initialize the browser with our headers
48
  async with AsyncWebCrawler(config=self.browser_conf) as crawler:
49
  try:
50
+ # Add a small delay to be polite
51
  await asyncio.sleep(1)
52
+ # We pass the run config here
53
  result = await crawler.arun(url=url, config=self.run_conf)
54
 
55
  if result.success:
 
61
 
62
  def extract_links(self, markdown_text):
63
  """Finds links in the markdown. Handles Google's messy redirection links."""
 
64
  links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
65
 
66
  clean_urls = []
67
  for text, url in links:
 
68
  if "google.com" in url or "youtube.com" in url:
69
  continue
70
  if len(url) < 15:
71
  continue
72
 
 
73
  domain = urllib.parse.urlparse(url).netloc
74
  if not any(domain in u for u in clean_urls):
75
  clean_urls.append(url)
 
98
 
99
  # --- GRADIO INTERFACE ---
100
 
101
+ # Initialize engine globally
102
  engine = CloudResearchEngine()
103
 
104
  async def run_process(topic):