import asyncio
import os
import re
import urllib.parse

import gradio as gr
import nest_asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CacheMode, CrawlerRunConfig
from gradio_client import Client

# Patch the already-running event loop so our own asyncio usage works
# inside hosted environments (Gradio cloud / notebooks).
nest_asyncio.apply()

# --- CONFIGURATIONS ---
AI_CLIENT_URL = "zai-org/GLM-4.5-Space"  # Hugging Face Space used for synthesis
MAX_LINKS = 3  # how many organic results to deep-crawl per query

# Domains filtered out of search results (search-engine internals + junk).
_BLOCKED_DOMAINS = ("bing.com", "microsoft.com", "msn.com", "google.com", "youtube.com")


class CloudResearchEngine:
    """Bing-search -> crawl4ai scrape -> GLM-4.5 summarize pipeline."""

    def __init__(self):
        # 1. SETUP BROWSER: headless Chromium tuned for Docker/cloud sandboxes.
        self.browser_conf = BrowserConfig(
            headless=True,
            verbose=False,
            # Specific args for Docker/Cloud
            extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
            # Plausible real-browser user agent to reduce bot blocking.
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
        )
        # 2. SETUP RUN CONFIG: always fetch fresh, never serve from cache.
        self.run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        self.ai_client = Client(AI_CLIENT_URL)

    def search_bing_url(self, query):
        """Generate the Bing search URL for *query* (Bing is friendlier to bots)."""
        encoded_query = urllib.parse.quote_plus(query)
        return f"https://www.bing.com/search?q={encoded_query}"

    async def crawl_single_page(self, url):
        """Crawl *url* and return its markdown text, or a bracketed error string.

        Never raises: both crawl failures and unexpected exceptions are
        folded into the returned string so callers can keep going.
        """
        async with AsyncWebCrawler(config=self.browser_conf) as crawler:
            try:
                # Small delay to be polite.
                await asyncio.sleep(1)
                result = await crawler.arun(url=url, config=self.run_conf)
                if result.success:
                    # result.markdown may be None or a markdown-result object;
                    # coerce to plain str so callers can slice/len() it safely.
                    return str(result.markdown or "")
                return f"[Error: Could not read {url} - {result.error_message}]"
            except Exception as e:
                return f"[System Error reading {url}: {str(e)}]"

    def extract_links(self, markdown_text):
        """Extract up to MAX_LINKS external result URLs from SERP markdown.

        Filters out search-engine/junk domains and very short URLs, and keeps
        at most one URL per domain (set-based de-duplication; the original
        substring test against whole URLs dropped valid links whenever
        urlparse returned an empty netloc).
        """
        # Standard markdown links: [text](url)
        links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
        clean_urls = []
        seen_domains = set()
        for _text, url in links:
            # 1. Exclude search-engine internal links and common junk.
            if any(junk in url for junk in _BLOCKED_DOMAINS):
                continue
            # 2. Too short to be a real article URL.
            if len(url) < 15:
                continue
            # 3. De-duplicate by domain.
            domain = urllib.parse.urlparse(url).netloc
            if not domain or domain in seen_domains:
                continue
            seen_domains.add(domain)
            clean_urls.append(url)
        return clean_urls[:MAX_LINKS]

    def analyze_with_ai(self, prompt, context):
        """Send the scraped *context* to the GLM-4.5 Space; return its summary.

        API failures are returned as a warning string rather than raised.
        """
        full_msg = (
            f"RESEARCH QUERY: {prompt}\n\n"
            f"EXTRACTED WEB DATA:\n{context}\n\n"
            f"TASK: Synthesize this information into a clear summary answer."
        )
        try:
            result = self.ai_client.predict(
                msg=full_msg,
                sys_prompt="You are a helpful research assistant. Summarize the web data accurately.",
                thinking_enabled=True,
                temperature=0.7,
                api_name="/chat_wrapper",
            )
            return str(result)
        except Exception as e:
            return f"⚠️ AI API Failed: {str(e)}"


# --- GRADIO INTERFACE ---

# Initialize engine globally (one shared browser/AI-client configuration).
engine = CloudResearchEngine()


async def run_process(topic):
    """Async generator driving the UI: yields (log_text, report_text) pairs."""
    log = f"🚀 Starting Research on: {topic}\n"
    yield log, "..."

    # 1. Search Bing
    search_url = engine.search_bing_url(topic)
    log += f"🔎 Search URL generated: {search_url}\n"
    yield log, "..."

    # 2. Get Search Results
    log += "🕷️ Scanning Bing Results...\n"
    yield log, "..."
    serp_markdown = await engine.crawl_single_page(search_url)

    # 3. Extract Links
    links = engine.extract_links(serp_markdown)
    if not links:
        log += "❌ No links found. Even Bing might be blocking the IP, or the page loaded empty.\n"
        log += f"Debug - Raw Content Length: {len(serp_markdown)}\n"
        yield log, "Failed to find links."
        return

    log += f"✅ Found {len(links)} Links: {links}\n"
    yield log, "..."

    # 4. Deep Crawl: concatenate truncated page text per source.
    context_data = ""
    for i, link in enumerate(links):
        log += f"📥 Reading ({i+1}/{len(links)}): {link}...\n"
        yield log, "..."
        page_text = await engine.crawl_single_page(link)
        # Truncate each page to keep the AI prompt bounded.
        context_data += f"\n--- SOURCE: {link} ---\n{page_text[:10000]}\n"

    # 5. AI Analysis
    log += "🧠 Sending data to AI for final report...\n"
    yield log, "Thinking..."
    summary = engine.analyze_with_ai(topic, context_data)
    log += "🏁 Done!"
    yield log, summary


with gr.Blocks(title="AI Research Agent") as demo:
    gr.Markdown("# 🤖 AI Research Agent (Bing + Crawl4AI)")
    with gr.Row():
        inp = gr.Textbox(label="Topic", placeholder="Enter research topic...")
        btn = gr.Button("Research", variant="primary")
    with gr.Row():
        logs = gr.TextArea(label="System Logs", lines=10)
        out = gr.Markdown(label="Final Report")
    btn.click(run_process, inputs=inp, outputs=[logs, out])

if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)