Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import asyncio
|
| 3 |
+
import nest_asyncio
|
| 4 |
+
import re
|
| 5 |
+
import urllib.parse
|
| 6 |
+
import os
|
| 7 |
+
from crawl4ai import AsyncWebCrawler
|
| 8 |
+
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
|
| 9 |
+
from gradio_client import Client
|
| 10 |
+
|
| 11 |
+
# Apply nest_asyncio to handle the event loop in the cloud.
# Crawl4AI runs its own asyncio work; nest_asyncio lets that coexist with the
# event loop already running in the hosting environment (Gradio / HF Spaces).
nest_asyncio.apply()

# --- CONFIGURATIONS ---
# Hugging Face Space that exposes the GLM-4.5 chat endpoint used for synthesis.
AI_CLIENT_URL = "zai-org/GLM-4.5-Space"
# Maximum number of search-result pages to deep-crawl per query.
MAX_LINKS = 3
|
| 17 |
+
|
| 18 |
+
class CloudResearchEngine:
    """Research pipeline: build a Google search URL, crawl pages with Crawl4AI,
    and synthesize the collected text via a remote GLM-4.5 Gradio Space."""

    def __init__(self):
        # Browser config optimized for Docker/Cloud containers
        self.browser_conf = BrowserConfig(
            headless=True,
            verbose=False,
            # Specific args to run safely in Docker
            args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"]
        )
        self.run_conf = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            # Stealth headers to try and bypass simple bot detection
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
        )
        self.ai_client = Client(AI_CLIENT_URL)

    def search_google_url(self, query):
        """Return the Google Search results URL for *query*.

        Adds 'gl=us' (GeoLocation US) and 'hl=en' (Language English) so the
        results page layout is predictable for the link extractor.
        """
        encoded_query = urllib.parse.quote_plus(query)
        return f"https://www.google.com/search?q={encoded_query}&num=10&hl=en&gl=us"

    async def crawl_single_page(self, url):
        """Crawl *url* and return its markdown, or an error string on failure.

        Never raises: all failures are folded into a bracketed error string so
        the caller can embed them in the log/context without special-casing.
        """
        async with AsyncWebCrawler(config=self.browser_conf) as crawler:
            try:
                # Add a small delay to be polite and avoid immediate blocks
                await asyncio.sleep(1)
                result = await crawler.arun(url=url, config=self.run_conf)

                if result.success:
                    return result.markdown
                return f"[Error: Could not read {url} - {result.error_message}]"
            except Exception as e:
                return f"[System Error reading {url}: {str(e)}]"

    def extract_links(self, markdown_text):
        """Return up to MAX_LINKS external result URLs from search-page markdown.

        Skips Google/YouTube links and very short URLs, and keeps at most one
        URL per domain.
        """
        # Standard markdown links [text](url)
        links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)

        clean_urls = []
        # Exact-domain de-duplication. The previous substring test
        # (`domain in u` over collected URLs) could wrongly drop a URL whose
        # domain is a substring of an already-seen one (e.g. "bc.com" vs "abc.com").
        seen_domains = set()
        for _text, url in links:
            # Filter out Google internal links and tiny links
            if "google.com" in url or "youtube.com" in url:
                continue
            if len(url) < 15:
                continue

            domain = urllib.parse.urlparse(url).netloc
            if domain not in seen_domains:
                seen_domains.add(domain)
                clean_urls.append(url)

        return clean_urls[:MAX_LINKS]

    def analyze_with_ai(self, prompt, context):
        """Send the crawled *context* to the GLM-4.5 Space and return its answer.

        Returns a warning string instead of raising if the remote API fails.
        """
        full_msg = (
            f"RESEARCH QUERY: {prompt}\n\n"
            f"EXTRACTED WEB DATA:\n{context}\n\n"
            f"TASK: Synthesize this information into a clear summary answer."
        )

        try:
            result = self.ai_client.predict(
                msg=full_msg,
                sys_prompt="You are a helpful research assistant. Summarize the web data accurately.",
                thinking_enabled=True,
                temperature=0.7,
                api_name="/chat_wrapper"
            )
            return str(result)
        except Exception as e:
            return f"⚠️ AI API Failed: {str(e)}"
|
| 96 |
+
|
| 97 |
+
# --- GRADIO INTERFACE ---

# Single shared engine instance: one gradio_client connection and one browser
# configuration reused across all UI requests.
engine = CloudResearchEngine()
|
| 100 |
+
|
| 101 |
+
async def run_process(topic):
    """Gradio streaming callback: research *topic* end-to-end.

    Yields (log_text, report_text) tuples so the UI shows live progress;
    the final yield carries the AI-generated summary.

    NOTE(review): the emoji log prefixes were mojibake in the recovered source
    and have been restored to plausible originals; the success-log f-string was
    split across lines (invalid syntax) and is reconstructed as one line.
    """
    log = f"🚀 Starting Research on: {topic}\n"
    yield log, "..."

    # 1. Search Google
    search_url = engine.search_google_url(topic)
    log += f"🔎 Search URL generated: {search_url}\n"
    yield log, "..."

    # 2. Get Search Results
    log += "🕷️ Scanning Search Results (this may take 10s)...\n"
    yield log, "..."

    serp_markdown = await engine.crawl_single_page(search_url)

    # 3. Extract Links
    links = engine.extract_links(serp_markdown)

    if not links:
        log += "❌ No links found. Google might have blocked the Cloud IP. Try a more specific query.\n"
        log += f"Debug - Raw Content Length: {len(serp_markdown)}\n"
        yield log, "Failed to find links."
        return

    log += f"✅ Found {len(links)} Links: {links}\n"
    yield log, "..."

    # 4. Deep Crawl each result and accumulate context for the AI
    context_data = ""
    for i, link in enumerate(links):
        log += f"📥 Reading ({i+1}/{len(links)}): {link}...\n"
        yield log, "..."
        page_text = await engine.crawl_single_page(link)
        # Cap each page at 10k chars to keep the AI prompt a manageable size.
        context_data += f"\n--- SOURCE: {link} ---\n{page_text[:10000]}\n"

    # 5. AI Analysis
    log += "🧠 Sending data to AI for final report...\n"
    yield log, "Thinking..."

    summary = engine.analyze_with_ai(topic, context_data)

    log += "🎉 Done!"
    yield log, summary
|
| 144 |
+
|
| 145 |
+
# --- UI LAYOUT ---
# NOTE(review): the heading emoji was mojibake ("π€") in the recovered source;
# restored to 🤖.
with gr.Blocks(title="AI Research Agent") as demo:
    gr.Markdown("# 🤖 AI Research Agent (Docker/Crawl4AI)")

    with gr.Row():
        inp = gr.Textbox(label="Topic", placeholder="Enter research topic...")
        btn = gr.Button("Research", variant="primary")

    with gr.Row():
        logs = gr.TextArea(label="System Logs", lines=10)
        out = gr.Markdown(label="Final Report")

    # run_process is an async generator: each yield streams into (logs, out).
    btn.click(run_process, inputs=inp, outputs=[logs, out])

if __name__ == "__main__":
    # queue() is required for streaming generator outputs;
    # bind 0.0.0.0:7860 so the app is reachable from outside the container.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
|