Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import asyncio
|
| 3 |
+
import nest_asyncio
|
| 4 |
+
import re
|
| 5 |
+
import urllib.parse
|
| 6 |
+
import os
|
| 7 |
+
from crawl4ai import AsyncWebCrawler
|
| 8 |
+
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
|
| 9 |
+
from gradio_client import Client
|
| 10 |
+
|
| 11 |
+
# Apply nest_asyncio to handle the event loop in the cloud.
# Crawl4AI runs its own asyncio work; nest_asyncio lets that coexist with the
# event loop already running in the hosting environment (Gradio / HF Spaces).
nest_asyncio.apply()

# --- CONFIGURATIONS ---
# Hugging Face Space that exposes the GLM-4.5 chat endpoint used for synthesis.
AI_CLIENT_URL = "zai-org/GLM-4.5-Space"
# Maximum number of search-result pages to deep-crawl per query.
MAX_LINKS = 3
|
| 17 |
+
|
| 18 |
+
class CloudResearchEngine:
    """Research pipeline: build a Google search URL, crawl pages with Crawl4AI,
    and synthesize the collected text via a remote GLM-4.5 Gradio Space."""

    def __init__(self):
        # Browser config optimized for Docker/Cloud containers
        self.browser_conf = BrowserConfig(
            headless=True,
            verbose=False,
            # Specific args to run safely in Docker
            args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"]
        )
        self.run_conf = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            # Stealth headers to try and bypass simple bot detection
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
        )
        self.ai_client = Client(AI_CLIENT_URL)

    def search_google_url(self, query):
        """Return the Google Search results URL for *query*.

        Adds 'gl=us' (GeoLocation US) and 'hl=en' (Language English) so the
        results page layout is predictable for the link extractor.
        """
        encoded_query = urllib.parse.quote_plus(query)
        return f"https://www.google.com/search?q={encoded_query}&num=10&hl=en&gl=us"

    async def crawl_single_page(self, url):
        """Crawl *url* and return its markdown, or an error string on failure.

        Never raises: all failures are folded into a bracketed error string so
        the caller can embed them in the log/context without special-casing.
        """
        async with AsyncWebCrawler(config=self.browser_conf) as crawler:
            try:
                # Add a small delay to be polite and avoid immediate blocks
                await asyncio.sleep(1)
                result = await crawler.arun(url=url, config=self.run_conf)

                if result.success:
                    return result.markdown
                return f"[Error: Could not read {url} - {result.error_message}]"
            except Exception as e:
                return f"[System Error reading {url}: {str(e)}]"

    def extract_links(self, markdown_text):
        """Return up to MAX_LINKS external result URLs from search-page markdown.

        Skips Google/YouTube links and very short URLs, and keeps at most one
        URL per domain.
        """
        # Standard markdown links [text](url)
        links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)

        clean_urls = []
        # Exact-domain de-duplication. The previous substring test
        # (`domain in u` over collected URLs) could wrongly drop a URL whose
        # domain is a substring of an already-seen one (e.g. "bc.com" vs "abc.com").
        seen_domains = set()
        for _text, url in links:
            # Filter out Google internal links and tiny links
            if "google.com" in url or "youtube.com" in url:
                continue
            if len(url) < 15:
                continue

            domain = urllib.parse.urlparse(url).netloc
            if domain not in seen_domains:
                seen_domains.add(domain)
                clean_urls.append(url)

        return clean_urls[:MAX_LINKS]

    def analyze_with_ai(self, prompt, context):
        """Send the crawled *context* to the GLM-4.5 Space and return its answer.

        Returns a warning string instead of raising if the remote API fails.
        """
        full_msg = (
            f"RESEARCH QUERY: {prompt}\n\n"
            f"EXTRACTED WEB DATA:\n{context}\n\n"
            f"TASK: Synthesize this information into a clear summary answer."
        )

        try:
            result = self.ai_client.predict(
                msg=full_msg,
                sys_prompt="You are a helpful research assistant. Summarize the web data accurately.",
                thinking_enabled=True,
                temperature=0.7,
                api_name="/chat_wrapper"
            )
            return str(result)
        except Exception as e:
            return f"⚠️ AI API Failed: {str(e)}"
|
| 96 |
+
|
| 97 |
+
# --- GRADIO INTERFACE ---

# Single shared engine instance: one gradio_client connection and one browser
# configuration reused across all UI requests.
engine = CloudResearchEngine()
|
| 100 |
+
|
| 101 |
+
async def run_process(topic):
    """Gradio streaming callback: research *topic* end-to-end.

    Yields (log_text, report_text) tuples so the UI shows live progress;
    the final yield carries the AI-generated summary.

    NOTE(review): the emoji log prefixes were mojibake in the recovered source
    and have been restored to plausible originals; the success-log f-string was
    split across lines (invalid syntax) and is reconstructed as one line.
    """
    log = f"🚀 Starting Research on: {topic}\n"
    yield log, "..."

    # 1. Search Google
    search_url = engine.search_google_url(topic)
    log += f"🔎 Search URL generated: {search_url}\n"
    yield log, "..."

    # 2. Get Search Results
    log += "🕷️ Scanning Search Results (this may take 10s)...\n"
    yield log, "..."

    serp_markdown = await engine.crawl_single_page(search_url)

    # 3. Extract Links
    links = engine.extract_links(serp_markdown)

    if not links:
        log += "❌ No links found. Google might have blocked the Cloud IP. Try a more specific query.\n"
        log += f"Debug - Raw Content Length: {len(serp_markdown)}\n"
        yield log, "Failed to find links."
        return

    log += f"✅ Found {len(links)} Links: {links}\n"
    yield log, "..."

    # 4. Deep Crawl each result and accumulate context for the AI
    context_data = ""
    for i, link in enumerate(links):
        log += f"📥 Reading ({i+1}/{len(links)}): {link}...\n"
        yield log, "..."
        page_text = await engine.crawl_single_page(link)
        # Cap each page at 10k chars to keep the AI prompt a manageable size.
        context_data += f"\n--- SOURCE: {link} ---\n{page_text[:10000]}\n"

    # 5. AI Analysis
    log += "🧠 Sending data to AI for final report...\n"
    yield log, "Thinking..."

    summary = engine.analyze_with_ai(topic, context_data)

    log += "🎉 Done!"
    yield log, summary
|
| 144 |
+
|
| 145 |
+
# --- UI LAYOUT ---
# NOTE(review): the heading emoji was mojibake ("π€") in the recovered source;
# restored to 🤖.
with gr.Blocks(title="AI Research Agent") as demo:
    gr.Markdown("# 🤖 AI Research Agent (Docker/Crawl4AI)")

    with gr.Row():
        inp = gr.Textbox(label="Topic", placeholder="Enter research topic...")
        btn = gr.Button("Research", variant="primary")

    with gr.Row():
        logs = gr.TextArea(label="System Logs", lines=10)
        out = gr.Markdown(label="Final Report")

    # run_process is an async generator: each yield streams into (logs, out).
    btn.click(run_process, inputs=inp, outputs=[logs, out])

if __name__ == "__main__":
    # queue() is required for streaming generator outputs;
    # bind 0.0.0.0:7860 so the app is reachable from outside the container.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
|