# (Hugging Face Spaces listing residue — "Spaces: Sleeping" status header; not program code)
import gradio as gr
import asyncio
import nest_asyncio
import re
import urllib.parse
import os
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
from gradio_client import Client

# Patch the already-running event loop so crawl4ai's asyncio coroutines can be
# awaited from inside the hosting server process (needed on cloud/Spaces).
nest_asyncio.apply()

# --- CONFIGURATIONS ---
AI_CLIENT_URL = "zai-org/GLM-4.5-Space"  # gradio_client target Space used for the final summary
MAX_LINKS = 3  # maximum number of result links crawled per query
class CloudResearchEngine:
    """Research pipeline: build a Bing search URL, crawl pages with crawl4ai,
    and send the collected markdown to a remote GLM Space for summarization.
    """

    def __init__(self):
        # 1. SETUP BROWSER
        self.browser_conf = BrowserConfig(
            headless=True,
            verbose=False,
            # Flags required to run Chromium inside Docker/cloud sandboxes.
            extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
            # Desktop Chrome user agent to look less like a bot.
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        # 2. SETUP RUN CONFIG — bypass the cache so every crawl fetches fresh content.
        self.run_conf = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS
        )
        # NOTE(review): this opens a connection to the remote Space at
        # construction time, so instantiation requires network access.
        self.ai_client = Client(AI_CLIENT_URL)

    def search_bing_url(self, query):
        """Return the Bing search-results URL for *query* (Bing is more bot-tolerant)."""
        encoded_query = urllib.parse.quote_plus(query)
        # Using Bing instead of Google
        return f"https://www.bing.com/search?q={encoded_query}"

    async def crawl_single_page(self, url):
        """Crawl *url* and return its markdown, or a bracketed error string.

        Never raises: crawl failures are folded into the returned text so the
        caller's streaming log keeps flowing.
        """
        async with AsyncWebCrawler(config=self.browser_conf) as crawler:
            try:
                # Small delay to be polite
                await asyncio.sleep(1)
                result = await crawler.arun(url=url, config=self.run_conf)
                if result.success:
                    return result.markdown
                return f"[Error: Could not read {url} - {result.error_message}]"
            except Exception as e:
                return f"[System Error reading {url}: {str(e)}]"

    def extract_links(self, markdown_text, limit=None):
        """Extract up to *limit* external URLs from SERP markdown.

        Filters out search-engine/Microsoft/Google/YouTube links and very short
        URLs, and keeps at most one URL per domain.  *limit* defaults to the
        module-level MAX_LINKS for backward compatibility.
        """
        if limit is None:
            limit = MAX_LINKS
        # Standard markdown links [text](url)
        links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
        blocked = ("bing.com", "microsoft.com", "msn.com", "google.com", "youtube.com")
        clean_urls = []
        seen_domains = set()
        for _text, url in links:
            # 1. Exclude search-engine / common junk hosts.
            if any(junk in url for junk in blocked):
                continue
            # 2. Skip stubs too short to be real article URLs.
            if len(url) < 15:
                continue
            # 3. De-duplicate by exact domain.  (The previous substring test
            #    `domain in collected_url` falsely skipped e.g. "example.com"
            #    after "notexample.com" had been collected.)
            domain = urllib.parse.urlparse(url).netloc
            if domain in seen_domains:
                continue
            seen_domains.add(domain)
            clean_urls.append(url)
            if len(clean_urls) == limit:
                break
        return clean_urls

    def analyze_with_ai(self, prompt, context):
        """Send the research query + crawled context to the GLM Space and
        return its reply as a string; API failures are returned as text, not raised."""
        full_msg = (
            f"RESEARCH QUERY: {prompt}\n\n"
            f"EXTRACTED WEB DATA:\n{context}\n\n"
            f"TASK: Synthesize this information into a clear summary answer."
        )
        try:
            result = self.ai_client.predict(
                msg=full_msg,
                sys_prompt="You are a helpful research assistant. Summarize the web data accurately.",
                thinking_enabled=True,
                temperature=0.7,
                api_name="/chat_wrapper"
            )
            return str(result)
        except Exception as e:
            return f"⚠️ AI API Failed: {str(e)}"
# --- GRADIO INTERFACE ---
# Initialize engine globally; note CloudResearchEngine.__init__ connects to the
# remote AI Space, so this performs network I/O at import time.
engine = CloudResearchEngine()
async def run_process(topic):
    """Streaming research pipeline.

    Async generator wired to the Gradio button: each yield is a
    (log_text, report_markdown) pair so the UI updates progressively.
    """
    pieces = [f"🚀 Starting Research on: {topic}\n"]
    yield "".join(pieces), "..."

    # 1. Build the Bing SERP URL for the topic.
    serp_url = engine.search_bing_url(topic)
    pieces.append(f"🔗 Search URL generated: {serp_url}\n")
    yield "".join(pieces), "..."

    # 2. Crawl the results page itself.
    pieces.append("🕷️ Scanning Bing Results...\n")
    yield "".join(pieces), "..."
    serp_markdown = await engine.crawl_single_page(serp_url)

    # 3. Pull candidate article links out of the SERP markdown.
    links = engine.extract_links(serp_markdown)
    if not links:
        pieces.append("❌ No links found. Even Bing might be blocking the IP, or the page loaded empty.\n")
        pieces.append(f"Debug - Raw Content Length: {len(serp_markdown)}\n")
        yield "".join(pieces), "Failed to find links."
        return
    pieces.append(f"✅ Found {len(links)} Links: {links}\n")
    yield "".join(pieces), "..."

    # 4. Deep-crawl each link, keeping a truncated excerpt per source.
    total = len(links)
    sources = []
    for idx, link in enumerate(links, start=1):
        pieces.append(f"📥 Reading ({idx}/{total}): {link}...\n")
        yield "".join(pieces), "..."
        page_text = await engine.crawl_single_page(link)
        sources.append(f"\n--- SOURCE: {link} ---\n{page_text[:10000]}\n")

    # 5. Hand the accumulated context to the AI for the final report.
    pieces.append("🧠 Sending data to AI for final report...\n")
    yield "".join(pieces), "Thinking..."
    summary = engine.analyze_with_ai(topic, "".join(sources))
    pieces.append("🎉 Done!")
    yield "".join(pieces), summary
# UI layout: topic input + button on one row, streaming logs + final report below.
with gr.Blocks(title="AI Research Agent") as demo:
    gr.Markdown("# 🤖 AI Research Agent (Bing + Crawl4AI)")
    with gr.Row():
        inp = gr.Textbox(label="Topic", placeholder="Enter research topic...")
        btn = gr.Button("Research", variant="primary")
    with gr.Row():
        logs = gr.TextArea(label="System Logs", lines=10)
        out = gr.Markdown(label="Final Report")
    # run_process is an async generator, so each yield streams into (logs, out).
    btn.click(run_process, inputs=inp, outputs=[logs, out])

if __name__ == "__main__":
    # queue() is required for generator (streaming) event handlers;
    # 0.0.0.0:7860 is the standard bind for containerized Gradio apps.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)