# Crawl4AI research agent — app.py
# Hugging Face Space by mdnazib963 (commit 1a46212, verified)
import gradio as gr
import asyncio
import nest_asyncio
import re
import urllib.parse
import os
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
from gradio_client import Client
# Apply nest_asyncio to handle the event loop in the cloud
# (allows awaiting crawler coroutines inside Gradio's already-running loop)
nest_asyncio.apply()
# --- CONFIGURATIONS ---
# Hugging Face Space id of the GLM-4.5 chat endpoint used for summarization.
AI_CLIENT_URL = "zai-org/GLM-4.5-Space"
# Maximum number of search-result links to deep-crawl per research query.
MAX_LINKS = 3
class CloudResearchEngine:
    """Research pipeline: Bing search -> Crawl4AI page scraping -> GLM-4.5 summary.

    Holds the browser/run configuration for Crawl4AI and a gradio_client
    handle to the GLM-4.5 Space used for the final synthesis step.
    """

    def __init__(self):
        # 1. SETUP BROWSER
        self.browser_conf = BrowserConfig(
            headless=True,
            verbose=False,
            # Specific args for Docker/Cloud
            extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
            # Random real user agent
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        # 2. SETUP RUN CONFIG — always bypass the cache so results are fresh
        self.run_conf = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS
        )
        # NOTE(review): Client(...) contacts the Space when constructed — this
        # runs a network call at engine-creation time; confirm that is intended.
        self.ai_client = Client(AI_CLIENT_URL)

    def search_bing_url(self, query):
        """Generates the Bing Search URL (Better for Bots)."""
        encoded_query = urllib.parse.quote_plus(query)
        # Using Bing instead of Google
        return f"https://www.bing.com/search?q={encoded_query}"

    async def crawl_single_page(self, url):
        """Crawls a URL with error handling.

        Returns the page markdown on success, otherwise a bracketed error
        string (never raises), so callers can embed the result directly.
        """
        async with AsyncWebCrawler(config=self.browser_conf) as crawler:
            try:
                # Small delay to be polite
                await asyncio.sleep(1)
                result = await crawler.arun(url=url, config=self.run_conf)
                if result.success:
                    return result.markdown
                return f"[Error: Could not read {url} - {result.error_message}]"
            except Exception as e:
                return f"[System Error reading {url}: {str(e)}]"

    def extract_links(self, markdown_text):
        """Finds links in the markdown and filters out Bing/Microsoft junk.

        Returns at most MAX_LINKS URLs, at most one per domain.
        """
        # Standard markdown links [text](url)
        links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
        # Search-engine internals plus common junk destinations
        junk_domains = ("bing.com", "microsoft.com", "msn.com",
                        "google.com", "youtube.com")
        clean_urls = []
        seen_domains = set()
        for _text, url in links:
            # 1./2. Exclude Bing/Microsoft internals and other common junk
            if any(junk in url for junk in junk_domains):
                continue
            # Very short URLs are almost always navigation chrome, not articles
            if len(url) < 15:
                continue
            # 3. De-duplicate by exact netloc. FIX: the previous check
            # (`domain in u` over full URLs) substring-matched the domain
            # against paths/queries too, wrongly dropping valid URLs whose
            # path happened to contain an already-seen domain string.
            domain = urllib.parse.urlparse(url).netloc
            if domain and domain not in seen_domains:
                seen_domains.add(domain)
                clean_urls.append(url)
        return clean_urls[:MAX_LINKS]

    def analyze_with_ai(self, prompt, context):
        """Sends data to the GLM-4.5 Space.

        Returns the model's answer as a string, or a warning string on failure
        (never raises).
        """
        full_msg = (
            f"RESEARCH QUERY: {prompt}\n\n"
            f"EXTRACTED WEB DATA:\n{context}\n\n"
            f"TASK: Synthesize this information into a clear summary answer."
        )
        try:
            result = self.ai_client.predict(
                msg=full_msg,
                sys_prompt="You are a helpful research assistant. Summarize the web data accurately.",
                thinking_enabled=True,
                temperature=0.7,
                api_name="/chat_wrapper"
            )
            return str(result)
        except Exception as e:
            return f"⚠️ AI API Failed: {str(e)}"
# --- GRADIO INTERFACE ---
# Initialize engine globally
# NOTE(review): constructing the engine here runs at import time, which also
# creates the gradio_client Client (a network handshake with the AI Space) —
# confirm startup-time network access is acceptable for this deployment.
engine = CloudResearchEngine()
async def run_process(topic):
    """Drive the research pipeline for *topic*, streaming progress.

    Async generator consumed by Gradio: each yield is a
    (log_text, report_text) pair, so the UI updates after every stage.
    """
    progress = f"πŸš€ Starting Research on: {topic}\n"
    yield progress, "..."

    # Stage 1: build the Bing search URL for the topic
    search_url = engine.search_bing_url(topic)
    progress += f"πŸ”Ž Search URL generated: {search_url}\n"
    yield progress, "..."

    # Stage 2: crawl the search-results page itself
    progress += "πŸ•·οΈ Scanning Bing Results...\n"
    yield progress, "..."
    serp_markdown = await engine.crawl_single_page(search_url)

    # Stage 3: pull candidate article links out of the SERP markdown
    links = engine.extract_links(serp_markdown)
    if not links:
        progress += "❌ No links found. Even Bing might be blocking the IP, or the page loaded empty.\n"
        progress += f"Debug - Raw Content Length: {len(serp_markdown)}\n"
        yield progress, "Failed to find links."
        return
    progress += f"βœ… Found {len(links)} Links: {links}\n"
    yield progress, "..."

    # Stage 4: deep-crawl each link, keeping a truncated excerpt per source
    total = len(links)
    source_chunks = []
    for position, link in enumerate(links, start=1):
        progress += f"πŸ“₯ Reading ({position}/{total}): {link}...\n"
        yield progress, "..."
        page_text = await engine.crawl_single_page(link)
        # Cap each source at 10k chars to keep the AI prompt bounded
        source_chunks.append(f"\n--- SOURCE: {link} ---\n{page_text[:10000]}\n")

    # Stage 5: hand the collected excerpts to the AI for the final report
    progress += "🧠 Sending data to AI for final report...\n"
    yield progress, "Thinking..."
    summary = engine.analyze_with_ai(topic, "".join(source_chunks))
    progress += "🏁 Done!"
    yield progress, summary
with gr.Blocks(title="AI Research Agent") as demo:
    # Page header
    gr.Markdown("# πŸ€– AI Research Agent (Bing + Crawl4AI)")
    # Input row: research topic plus the trigger button
    with gr.Row():
        topic_box = gr.Textbox(label="Topic", placeholder="Enter research topic...")
        research_btn = gr.Button("Research", variant="primary")
    # Output row: streaming log pane and the final markdown report
    with gr.Row():
        log_area = gr.TextArea(label="System Logs", lines=10)
        report_view = gr.Markdown(label="Final Report")
    # run_process is an async generator; both outputs refresh on each yield
    research_btn.click(run_process, inputs=topic_box, outputs=[log_area, report_view])

if __name__ == "__main__":
    # Bind on all interfaces at the standard Spaces port, with queuing enabled
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)