Spaces:

mdnazib963
/

Crawl4AI

Running

App Files Community

mdnazib963 committed on Jan 21

Commit

e6068cc

verified ·

1 Parent(s): da612bc

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -14

app.py CHANGED Viewed

@@ -17,34 +17,39 @@ MAX_LINKS = 3
 class CloudResearchEngine:
     def __init__(self):
-        # Browser config optimized for Docker/Cloud containers
-        # FIX: Used 'extra_args' instead of 'args'
         self.browser_conf = BrowserConfig(
             headless=True,
             verbose=False,
-            extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"]
         )
         self.run_conf = CrawlerRunConfig(
-            cache_mode=CacheMode.BYPASS,
-            # Stealth headers to try and bypass simple bot detection
-            headers={
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-            }
         )
         self.ai_client = Client(AI_CLIENT_URL)
     def search_google_url(self, query):
         """Generates the Google Search URL."""
         encoded_query = urllib.parse.quote_plus(query)
-        # We add 'gl=us' (GeoLocation US) and 'hl=en' (Language English)
         return f"https://www.google.com/search?q={encoded_query}&num=10&hl=en&gl=us"
     async def crawl_single_page(self, url):
         """Crawls a URL with error handling for the cloud environment."""
         async with AsyncWebCrawler(config=self.browser_conf) as crawler:
             try:
-                # Add a small delay to be polite and avoid immediate blocks
                 await asyncio.sleep(1)
                 result = await crawler.arun(url=url, config=self.run_conf)
                 if result.success:
@@ -56,18 +61,15 @@ class CloudResearchEngine:
     def extract_links(self, markdown_text):
         """Finds links in the markdown. Handles Google's messy redirection links."""
-        # Standard markdown links [text](url)
         links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
         clean_urls = []
         for text, url in links:
-            # Filter out Google internal links and tiny links
             if "google.com" in url or "youtube.com" in url:
                 continue
             if len(url) < 15:
                 continue
-            # De-duplicate
             domain = urllib.parse.urlparse(url).netloc
             if not any(domain in u for u in clean_urls):
                 clean_urls.append(url)
@@ -96,7 +98,7 @@ class CloudResearchEngine:
 # --- GRADIO INTERFACE ---
-# Initialize engine globally to persist settings
 engine = CloudResearchEngine()
 async def run_process(topic):

 class CloudResearchEngine:
     def __init__(self):
+        # 1. SETUP BROWSER (Headers & User Agent go here)
         self.browser_conf = BrowserConfig(
             headless=True,
             verbose=False,
+            # 'extra_args' is the correct parameter for passing flags
+            extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
+            # User Agent matches a real Chrome browser to avoid blocks
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+            # Optional: You can pass other headers here if needed, but user_agent is usually enough
+            # headers={"Accept-Language": "en-US,en;q=0.9"}
         )
+        # 2. SETUP RUN CONFIG (Cache & Execution rules go here)
         self.run_conf = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS
+            # removed 'headers' from here as it caused the crash
         )
         self.ai_client = Client(AI_CLIENT_URL)
     def search_google_url(self, query):
         """Generates the Google Search URL."""
         encoded_query = urllib.parse.quote_plus(query)
         return f"https://www.google.com/search?q={encoded_query}&num=10&hl=en&gl=us"
     async def crawl_single_page(self, url):
         """Crawls a URL with error handling for the cloud environment."""
+        # We pass the browser config here to initialize the browser with our headers
         async with AsyncWebCrawler(config=self.browser_conf) as crawler:
             try:
+                # Add a small delay to be polite
                 await asyncio.sleep(1)
+                # We pass the run config here
                 result = await crawler.arun(url=url, config=self.run_conf)
                 if result.success:
     def extract_links(self, markdown_text):
         """Finds links in the markdown. Handles Google's messy redirection links."""
         links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
         clean_urls = []
         for text, url in links:
             if "google.com" in url or "youtube.com" in url:
                 continue
             if len(url) < 15:
                 continue
             domain = urllib.parse.urlparse(url).netloc
             if not any(domain in u for u in clean_urls):
                 clean_urls.append(url)
 # --- GRADIO INTERFACE ---
+# Initialize engine globally
 engine = CloudResearchEngine()
 async def run_process(topic):