File size: 5,650 Bytes
f91ccd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a46212
f91ccd4
 
 
1a46212
e6068cc
1a46212
 
f91ccd4
e6068cc
1a46212
f91ccd4
e6068cc
f91ccd4
e6068cc
f91ccd4
 
1a46212
 
f91ccd4
1a46212
 
f91ccd4
 
1a46212
f91ccd4
 
1a46212
f91ccd4
 
 
 
 
 
 
 
 
 
 
1a46212
 
f91ccd4
 
 
 
1a46212
 
 
 
f91ccd4
 
 
 
 
1a46212
f91ccd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6068cc
f91ccd4
 
 
 
 
 
1a46212
 
f91ccd4
 
 
 
1a46212
f91ccd4
 
 
 
 
 
 
 
1a46212
f91ccd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a46212
f91ccd4
 
 
 
 
 
 
 
 
 
 
 
da612bc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import gradio as gr
import asyncio
import nest_asyncio
import re
import urllib.parse
import os
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
from gradio_client import Client

# Apply nest_asyncio to handle the event loop in the cloud
# (permits re-entrant asyncio usage inside an already-running loop,
# which hosted notebook/Space environments typically have).
nest_asyncio.apply()

# --- CONFIGURATIONS ---
AI_CLIENT_URL = "zai-org/GLM-4.5-Space"  # Hugging Face Space id used by gradio_client
MAX_LINKS = 3  # max unique result pages to deep-crawl per query

class CloudResearchEngine:
    """Web-research helper built on crawl4ai and a hosted GLM-4.5 Space.

    Pipeline: build a Bing search URL -> crawl the results page ->
    extract external result links -> crawl each link -> send the
    collected markdown to the AI client for summarization.
    """

    def __init__(self):
        # 1. SETUP BROWSER — headless Chromium tuned for Docker/Cloud hosts.
        self.browser_conf = BrowserConfig(
            headless=True,
            verbose=False,
            # Specific args for Docker/Cloud
            extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
            # Fixed real-looking user agent to reduce bot blocking
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        # 2. SETUP RUN CONFIG — always bypass the cache so results are fresh.
        self.run_conf = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS
        )

        # NOTE(review): Client(...) presumably contacts the Space on
        # construction — confirm startup behaves without network access.
        self.ai_client = Client(AI_CLIENT_URL)

    def search_bing_url(self, query):
        """Return the Bing search URL for *query* (Bing is friendlier to bots)."""
        encoded_query = urllib.parse.quote_plus(query)
        # Using Bing instead of Google
        return f"https://www.bing.com/search?q={encoded_query}"

    async def crawl_single_page(self, url):
        """Crawl *url* and return its markdown, or an error string on failure.

        Never raises: callers embed the returned text directly in the report,
        so failures are reported in-band as "[Error...]" / "[System Error...]".
        """
        async with AsyncWebCrawler(config=self.browser_conf) as crawler:
            try:
                # Small delay to be polite
                await asyncio.sleep(1)
                result = await crawler.arun(url=url, config=self.run_conf)

                if result.success:
                    return result.markdown
                else:
                    return f"[Error: Could not read {url} - {result.error_message}]"
            except Exception as e:
                return f"[System Error reading {url}: {str(e)}]"

    def extract_links(self, markdown_text):
        """Extract up to MAX_LINKS unique external result URLs from SERP markdown.

        Filters out search-engine/self-referential domains and very short
        URLs, and keeps at most one URL per domain.
        """
        # Standard markdown links [text](url)
        links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)

        # Search-engine chrome and common junk rather than actual results.
        junk_domains = ("bing.com", "microsoft.com", "msn.com",
                        "google.com", "youtube.com")

        clean_urls = []
        seen_domains = set()
        for _text, url in links:
            # 1. Exclude Bing/Microsoft internals and other common junk
            if any(junk in url for junk in junk_domains):
                continue
            # 2. Very short URLs are usually navigation, not articles
            if len(url) < 15:
                continue

            # 3. De-duplicate by exact netloc. (The previous substring test
            # `domain in u` could wrongly drop distinct domains, e.g. "a.co"
            # matching any URL containing "a.com".)
            domain = urllib.parse.urlparse(url).netloc
            if domain and domain not in seen_domains:
                seen_domains.add(domain)
                clean_urls.append(url)

        return clean_urls[:MAX_LINKS]

    def analyze_with_ai(self, prompt, context):
        """Send *prompt* plus crawled *context* to the GLM-4.5 Space.

        Returns the model's reply as a string, or a "⚠️ AI API Failed"
        message in-band on any client error.
        """
        full_msg = (
            f"RESEARCH QUERY: {prompt}\n\n"
            f"EXTRACTED WEB DATA:\n{context}\n\n"
            f"TASK: Synthesize this information into a clear summary answer."
        )

        try:
            result = self.ai_client.predict(
                msg=full_msg,
                sys_prompt="You are a helpful research assistant. Summarize the web data accurately.",
                thinking_enabled=True,
                temperature=0.7,
                api_name="/chat_wrapper"
            )
            return str(result)
        except Exception as e:
            return f"⚠️ AI API Failed: {str(e)}"

# --- GRADIO INTERFACE ---

# Initialize engine globally: one shared instance serves all requests.
# Note this runs at import time and instantiates the gradio_client Client.
engine = CloudResearchEngine()

async def run_process(topic):
    """Drive the full research pipeline for *topic*.

    Async generator wired to the Gradio button: every ``yield`` streams an
    updated ``(log_text, report_text)`` pair to the UI. The report stays a
    placeholder until the final AI summary (or a failure message) is ready.
    """
    progress = f"πŸš€ Starting Research on: {topic}\n"
    yield progress, "..."

    # Step 1: build the Bing query URL.
    search_url = engine.search_bing_url(topic)
    progress += f"πŸ”Ž Search URL generated: {search_url}\n"
    yield progress, "..."

    # Step 2: crawl the results page itself.
    progress += "πŸ•·οΈ Scanning Bing Results...\n"
    yield progress, "..."

    serp_markdown = await engine.crawl_single_page(search_url)

    # Step 3: pull candidate result links out of the SERP markdown.
    links = engine.extract_links(serp_markdown)

    if not links:
        progress += "❌ No links found. Even Bing might be blocking the IP, or the page loaded empty.\n"
        progress += f"Debug - Raw Content Length: {len(serp_markdown)}\n"
        yield progress, "Failed to find links."
        return

    progress += f"βœ… Found {len(links)} Links: {links}\n"
    yield progress, "..."

    # Step 4: deep-crawl each link, accumulating (truncated) page text.
    context_data = ""
    total = len(links)
    for position, link in enumerate(links, start=1):
        progress += f"πŸ“₯ Reading ({position}/{total}): {link}...\n"
        yield progress, "..."
        page_text = await engine.crawl_single_page(link)
        context_data += f"\n--- SOURCE: {link} ---\n{page_text[:10000]}\n"

    # Step 5: hand everything to the AI for the final report.
    progress += "🧠 Sending data to AI for final report...\n"
    yield progress, "Thinking..."

    summary = engine.analyze_with_ai(topic, context_data)

    progress += "🏁 Done!"
    yield progress, summary

# UI layout: a topic input + button on top, streaming logs and the final
# markdown report below.
with gr.Blocks(title="AI Research Agent") as demo:
    gr.Markdown("# πŸ€– AI Research Agent (Bing + Crawl4AI)")

    with gr.Row():
        inp = gr.Textbox(label="Topic", placeholder="Enter research topic...")
        btn = gr.Button("Research", variant="primary")

    with gr.Row():
        logs = gr.TextArea(label="System Logs", lines=10)
        out = gr.Markdown(label="Final Report")

    # run_process is an async generator, so each yield updates (logs, out)
    # incrementally while the research runs.
    btn.click(run_process, inputs=inp, outputs=[logs, out])

if __name__ == "__main__":
    # queue() is required for streaming generator outputs; bind to all
    # interfaces on the conventional Spaces port 7860.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)