import gradio as gr
import asyncio
import nest_asyncio
import re
import urllib.parse
import os
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
from gradio_client import Client
# Apply nest_asyncio to handle the event loop in the cloud
# (Gradio already runs an event loop; nest_asyncio lets crawl4ai's
# asyncio code run inside it without "loop already running" errors).
nest_asyncio.apply()
# --- CONFIGURATIONS ---
AI_CLIENT_URL = "zai-org/GLM-4.5-Space"  # Gradio Space id handed to gradio_client.Client
MAX_LINKS = 3  # default cap on result pages deep-crawled per query
class CloudResearchEngine:
    """Web-research helper.

    Crawls a Bing results page with crawl4ai, extracts candidate result
    links, crawls those pages, and asks a hosted GLM-4.5 Gradio Space to
    synthesize the scraped text into a summary.
    """

    def __init__(self):
        # 1. SETUP BROWSER — headless Chromium; the extra args are needed
        # for running inside Docker / cloud sandboxes.
        self.browser_conf = BrowserConfig(
            headless=True,
            verbose=False,
            extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
            # Plain desktop-Chrome UA so requests look like a regular visitor.
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        # 2. SETUP RUN CONFIG — bypass cache so every query is fresh.
        self.run_conf = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS
        )
        self.ai_client = Client(AI_CLIENT_URL)

    def search_bing_url(self, query):
        """Return the Bing search URL for *query* (Bing is friendlier to bots)."""
        encoded_query = urllib.parse.quote_plus(query)
        return f"https://www.bing.com/search?q={encoded_query}"

    async def crawl_single_page(self, url):
        """Crawl *url* and return its markdown, or an error string on failure.

        Never raises: crawl failures are folded into the returned text so
        the calling pipeline can keep going.
        """
        async with AsyncWebCrawler(config=self.browser_conf) as crawler:
            try:
                # Small politeness delay between requests.
                await asyncio.sleep(1)
                result = await crawler.arun(url=url, config=self.run_conf)
                if result.success:
                    return result.markdown
                return f"[Error: Could not read {url} - {result.error_message}]"
            except Exception as e:
                return f"[System Error reading {url}: {str(e)}]"

    def extract_links(self, markdown_text, max_links=None):
        """Extract candidate result URLs from SERP markdown.

        Filters out search-engine/internal domains and very short URLs,
        keeps only the first URL seen per domain, and returns at most
        *max_links* URLs (defaults to the module-level MAX_LINKS).
        """
        if max_links is None:
            max_links = MAX_LINKS
        # Standard markdown links [text](url)
        links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)
        blocked = ("bing.com", "microsoft.com", "msn.com", "google.com", "youtube.com")
        clean_urls = []
        seen_domains = set()
        for _text, url in links:
            # 1.+2. Exclude Bing/Microsoft internals and common junk hosts.
            if any(host in url for host in blocked):
                continue
            # Too short to be a real result link.
            if len(url) < 15:
                continue
            # 3. De-duplicate domains. Fix: the original used a substring
            # test (`domain in u`) against already-kept URLs, which could
            # wrongly drop a distinct domain whose name happens to appear
            # inside another URL; compare exact netlocs instead.
            domain = urllib.parse.urlparse(url).netloc
            if domain not in seen_domains:
                seen_domains.add(domain)
                clean_urls.append(url)
        return clean_urls[:max_links]

    def analyze_with_ai(self, prompt, context):
        """Send the research query + scraped text to the GLM-4.5 Space.

        Returns the model's answer as a string, or an error string if the
        remote call fails (never raises).
        """
        full_msg = (
            f"RESEARCH QUERY: {prompt}\n\n"
            f"EXTRACTED WEB DATA:\n{context}\n\n"
            f"TASK: Synthesize this information into a clear summary answer."
        )
        try:
            result = self.ai_client.predict(
                msg=full_msg,
                sys_prompt="You are a helpful research assistant. Summarize the web data accurately.",
                thinking_enabled=True,
                temperature=0.7,
                api_name="/chat_wrapper"
            )
            return str(result)
        except Exception as e:
            return f"⚠️ AI API Failed: {str(e)}"
# --- GRADIO INTERFACE ---
# Initialize engine globally: one shared instance is reused by every
# Gradio request (it holds the browser config and the AI client).
engine = CloudResearchEngine()
async def run_process(topic):
    """Gradio streaming handler: search -> crawl -> summarize.

    Yields (log_text, report_markdown) pairs so the UI updates live while
    each stage runs.
    """
    log = f"🔍 Starting Research on: {topic}\n"
    yield log, "..."

    # 1. Build the Bing SERP URL.
    search_url = engine.search_bing_url(topic)
    log += f"🌐 Search URL generated: {search_url}\n"
    yield log, "..."

    # 2. Crawl the search-results page.
    log += "🕷️ Scanning Bing Results...\n"
    yield log, "..."
    serp_markdown = await engine.crawl_single_page(search_url)

    # 3. Pull candidate result links out of the SERP markdown.
    links = engine.extract_links(serp_markdown)
    if not links:
        log += "❌ No links found. Even Bing might be blocking the IP, or the page loaded empty.\n"
        log += f"Debug - Raw Content Length: {len(serp_markdown)}\n"
        yield log, "Failed to find links."
        return

    # Fix: the original source had a literal newline inside this f-string
    # (mangled paste); reconstructed as a single line.
    log += f"✅ Found {len(links)} Links: {links}\n"
    yield log, "..."

    # 4. Deep-crawl each link; cap each page at 10k chars to bound the prompt.
    context_data = ""
    for i, link in enumerate(links):
        log += f"📥 Reading ({i+1}/{len(links)}): {link}...\n"
        yield log, "..."
        page_text = await engine.crawl_single_page(link)
        context_data += f"\n--- SOURCE: {link} ---\n{page_text[:10000]}\n"

    # 5. Ask the AI Space for the final synthesis.
    log += "🧠 Sending data to AI for final report...\n"
    yield log, "Thinking..."
    summary = engine.analyze_with_ai(topic, context_data)
    log += "🏁 Done!"
    yield log, summary
# Build the UI: topic input + button on top, live logs and the final
# markdown report below.
with gr.Blocks(title="AI Research Agent") as demo:
    gr.Markdown("# 🤖 AI Research Agent (Bing + Crawl4AI)")
    with gr.Row():
        inp = gr.Textbox(label="Topic", placeholder="Enter research topic...")
        btn = gr.Button("Research", variant="primary")
    with gr.Row():
        logs = gr.TextArea(label="System Logs", lines=10)
        out = gr.Markdown(label="Final Report")
    # run_process is an async generator, so its yields stream into the UI.
    btn.click(run_process, inputs=inp, outputs=[logs, out])

if __name__ == "__main__":
    # queue() is required for generator/streaming handlers; 0.0.0.0:7860
    # is the standard bind for Hugging Face Spaces containers.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)