Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,34 +17,39 @@ MAX_LINKS = 3
|
|
| 17 |
|
| 18 |
class CloudResearchEngine:
|
| 19 |
def __init__(self):
|
| 20 |
-
#
|
| 21 |
-
# FIX: Used 'extra_args' instead of 'args'
|
| 22 |
self.browser_conf = BrowserConfig(
|
| 23 |
headless=True,
|
| 24 |
verbose=False,
|
| 25 |
-
extra_args
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
)
|
|
|
|
|
|
|
| 27 |
self.run_conf = CrawlerRunConfig(
|
| 28 |
-
cache_mode=CacheMode.BYPASS
|
| 29 |
-
#
|
| 30 |
-
headers={
|
| 31 |
-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
| 32 |
-
}
|
| 33 |
)
|
|
|
|
| 34 |
self.ai_client = Client(AI_CLIENT_URL)
|
| 35 |
|
| 36 |
def search_google_url(self, query):
|
| 37 |
"""Generates the Google Search URL."""
|
| 38 |
encoded_query = urllib.parse.quote_plus(query)
|
| 39 |
-
# We add 'gl=us' (GeoLocation US) and 'hl=en' (Language English)
|
| 40 |
return f"https://www.google.com/search?q={encoded_query}&num=10&hl=en&gl=us"
|
| 41 |
|
| 42 |
async def crawl_single_page(self, url):
|
| 43 |
"""Crawls a URL with error handling for the cloud environment."""
|
|
|
|
| 44 |
async with AsyncWebCrawler(config=self.browser_conf) as crawler:
|
| 45 |
try:
|
| 46 |
-
# Add a small delay to be polite
|
| 47 |
await asyncio.sleep(1)
|
|
|
|
| 48 |
result = await crawler.arun(url=url, config=self.run_conf)
|
| 49 |
|
| 50 |
if result.success:
|
|
@@ -56,18 +61,15 @@ class CloudResearchEngine:
|
|
| 56 |
|
| 57 |
def extract_links(self, markdown_text):
|
| 58 |
"""Finds links in the markdown. Handles Google's messy redirection links."""
|
| 59 |
-
# Standard markdown links [text](url)
|
| 60 |
links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
|
| 61 |
|
| 62 |
clean_urls = []
|
| 63 |
for text, url in links:
|
| 64 |
-
# Filter out Google internal links and tiny links
|
| 65 |
if "google.com" in url or "youtube.com" in url:
|
| 66 |
continue
|
| 67 |
if len(url) < 15:
|
| 68 |
continue
|
| 69 |
|
| 70 |
-
# De-duplicate
|
| 71 |
domain = urllib.parse.urlparse(url).netloc
|
| 72 |
if not any(domain in u for u in clean_urls):
|
| 73 |
clean_urls.append(url)
|
|
@@ -96,7 +98,7 @@ class CloudResearchEngine:
|
|
| 96 |
|
| 97 |
# --- GRADIO INTERFACE ---
|
| 98 |
|
| 99 |
-
# Initialize engine globally
|
| 100 |
engine = CloudResearchEngine()
|
| 101 |
|
| 102 |
async def run_process(topic):
|
|
|
|
| 17 |
|
| 18 |
class CloudResearchEngine:
|
| 19 |
def __init__(self):
    """Create the browser config, the crawl run config and the AI client.

    Sets three attributes:
        browser_conf: ``BrowserConfig`` for a headless, non-verbose browser
            with extra launch flags and a desktop Chrome user agent.
        run_conf: ``CrawlerRunConfig`` that bypasses the cache.
        ai_client: ``Client`` bound to ``AI_CLIENT_URL``.
    """
    # Browser-level settings (launch flags, user agent) belong on
    # BrowserConfig; flags are passed through its 'extra_args' parameter.
    launch_flags = ["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"]
    # Matches a real Chrome browser's user agent to avoid blocks.
    chrome_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    # NOTE: further headers (e.g. Accept-Language) could presumably be added
    # to BrowserConfig as well; the user agent alone is usually enough.
    self.browser_conf = BrowserConfig(
        headless=True,
        verbose=False,
        extra_args=launch_flags,
        user_agent=chrome_user_agent,
    )

    # Per-crawl settings: only the cache mode. 'headers' is deliberately not
    # passed to CrawlerRunConfig -- doing so caused a crash.
    self.run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    self.ai_client = Client(AI_CLIENT_URL)
|
| 39 |
|
| 40 |
def search_google_url(self, query, *, num_results=10, language="en", region="us"):
    """Build the Google Search URL for *query*.

    Args:
        query: Search terms; form-encoded into the ``q`` parameter.
        num_results: Value sent as ``num``. Defaults to 10.
        language: Value sent as ``hl`` (interface language). Defaults to
            ``"en"``.
        region: Value sent as ``gl`` (geolocation). Defaults to ``"us"``.

    Returns:
        The search URL as a string. With the default keyword arguments the
        result ends in ``&num=10&hl=en&gl=us``, exactly as when those
        values were hard-coded.
    """
    params = {"q": query, "num": num_results, "hl": language, "gl": region}
    # urlencode applies quote_plus to every value, so the query is encoded
    # the same way as before and any overridden value is escaped as well.
    return f"https://www.google.com/search?{urllib.parse.urlencode(params)}"
|
| 44 |
|
| 45 |
async def crawl_single_page(self, url):
|
| 46 |
"""Crawls a URL with error handling for the cloud environment."""
|
| 47 |
+
# We pass the browser config here to initialize the browser with our headers
|
| 48 |
async with AsyncWebCrawler(config=self.browser_conf) as crawler:
|
| 49 |
try:
|
| 50 |
+
# Add a small delay to be polite
|
| 51 |
await asyncio.sleep(1)
|
| 52 |
+
# We pass the run config here
|
| 53 |
result = await crawler.arun(url=url, config=self.run_conf)
|
| 54 |
|
| 55 |
if result.success:
|
|
|
|
| 61 |
|
| 62 |
def extract_links(self, markdown_text):
|
| 63 |
"""Finds links in the markdown. Handles Google's messy redirection links."""
|
|
|
|
| 64 |
links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
|
| 65 |
|
| 66 |
clean_urls = []
|
| 67 |
for text, url in links:
|
|
|
|
| 68 |
if "google.com" in url or "youtube.com" in url:
|
| 69 |
continue
|
| 70 |
if len(url) < 15:
|
| 71 |
continue
|
| 72 |
|
|
|
|
| 73 |
domain = urllib.parse.urlparse(url).netloc
|
| 74 |
if not any(domain in u for u in clean_urls):
|
| 75 |
clean_urls.append(url)
|
|
|
|
| 98 |
|
| 99 |
# --- GRADIO INTERFACE ---
|
| 100 |
|
| 101 |
+
# Initialize engine globally
|
| 102 |
engine = CloudResearchEngine()
|
| 103 |
|
| 104 |
async def run_process(topic):
|