File size: 5,650 Bytes
f91ccd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a46212
f91ccd4
 
 
1a46212
e6068cc
1a46212
 
f91ccd4
e6068cc
1a46212
f91ccd4
e6068cc
f91ccd4
e6068cc
f91ccd4
 
1a46212
 
f91ccd4
1a46212
 
f91ccd4
 
1a46212
f91ccd4
 
1a46212
f91ccd4
 
 
 
 
 
 
 
 
 
 
1a46212
 
f91ccd4
 
 
 
1a46212
 
 
 
f91ccd4
 
 
 
 
1a46212
f91ccd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6068cc
f91ccd4
 
 
 
 
 
1a46212
 
f91ccd4
 
 
 
1a46212
f91ccd4
 
 
 
 
 
 
 
1a46212
f91ccd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a46212
f91ccd4
 
 
 
 
 
 
 
 
 
 
 
da612bc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import gradio as gr
import asyncio
import nest_asyncio
import re
import urllib.parse
import os
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
from gradio_client import Client

# Apply nest_asyncio to handle the event loop in the cloud
# (permits re-entrant asyncio usage inside an already-running loop,
# which hosted notebook/Space environments typically have).
nest_asyncio.apply()

# --- CONFIGURATIONS ---
AI_CLIENT_URL = "zai-org/GLM-4.5-Space"  # Hugging Face Space id used by gradio_client
MAX_LINKS = 3  # max unique result pages to deep-crawl per query

class CloudResearchEngine:
    """Web-research helper built on crawl4ai and a hosted GLM-4.5 Space.

    Pipeline: build a Bing search URL -> crawl the results page ->
    extract external result links -> crawl each link -> send the
    collected markdown to the AI client for summarization.
    """

    def __init__(self):
        # 1. SETUP BROWSER — headless Chromium tuned for Docker/Cloud hosts.
        self.browser_conf = BrowserConfig(
            headless=True,
            verbose=False,
            # Specific args for Docker/Cloud
            extra_args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
            # Fixed real-looking user agent to reduce bot blocking
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        # 2. SETUP RUN CONFIG — always bypass the cache so results are fresh.
        self.run_conf = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS
        )

        # NOTE(review): Client(...) presumably contacts the Space on
        # construction — confirm startup behaves without network access.
        self.ai_client = Client(AI_CLIENT_URL)

    def search_bing_url(self, query):
        """Return the Bing search URL for *query* (Bing is friendlier to bots)."""
        encoded_query = urllib.parse.quote_plus(query)
        # Using Bing instead of Google
        return f"https://www.bing.com/search?q={encoded_query}"

    async def crawl_single_page(self, url):
        """Crawl *url* and return its markdown, or an error string on failure.

        Never raises: callers embed the returned text directly in the report,
        so failures are reported in-band as "[Error...]" / "[System Error...]".
        """
        async with AsyncWebCrawler(config=self.browser_conf) as crawler:
            try:
                # Small delay to be polite
                await asyncio.sleep(1)
                result = await crawler.arun(url=url, config=self.run_conf)

                if result.success:
                    return result.markdown
                else:
                    return f"[Error: Could not read {url} - {result.error_message}]"
            except Exception as e:
                return f"[System Error reading {url}: {str(e)}]"

    def extract_links(self, markdown_text):
        """Extract up to MAX_LINKS unique external result URLs from SERP markdown.

        Filters out search-engine/self-referential domains and very short
        URLs, and keeps at most one URL per domain.
        """
        # Standard markdown links [text](url)
        links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)

        # Search-engine chrome and common junk rather than actual results.
        junk_domains = ("bing.com", "microsoft.com", "msn.com",
                        "google.com", "youtube.com")

        clean_urls = []
        seen_domains = set()
        for _text, url in links:
            # 1. Exclude Bing/Microsoft internals and other common junk
            if any(junk in url for junk in junk_domains):
                continue
            # 2. Very short URLs are usually navigation, not articles
            if len(url) < 15:
                continue

            # 3. De-duplicate by exact netloc. (The previous substring test
            # `domain in u` could wrongly drop distinct domains, e.g. "a.co"
            # matching any URL containing "a.com".)
            domain = urllib.parse.urlparse(url).netloc
            if domain and domain not in seen_domains:
                seen_domains.add(domain)
                clean_urls.append(url)

        return clean_urls[:MAX_LINKS]

    def analyze_with_ai(self, prompt, context):
        """Send *prompt* plus crawled *context* to the GLM-4.5 Space.

        Returns the model's reply as a string, or a "⚠️ AI API Failed"
        message in-band on any client error.
        """
        full_msg = (
            f"RESEARCH QUERY: {prompt}\n\n"
            f"EXTRACTED WEB DATA:\n{context}\n\n"
            f"TASK: Synthesize this information into a clear summary answer."
        )

        try:
            result = self.ai_client.predict(
                msg=full_msg,
                sys_prompt="You are a helpful research assistant. Summarize the web data accurately.",
                thinking_enabled=True,
                temperature=0.7,
                api_name="/chat_wrapper"
            )
            return str(result)
        except Exception as e:
            return f"⚠️ AI API Failed: {str(e)}"

# --- GRADIO INTERFACE ---

# Initialize engine globally: one shared instance serves all requests.
# Note this runs at import time and instantiates the gradio_client Client.
engine = CloudResearchEngine()

async def run_process(topic):
    """Drive the full research pipeline for *topic*.

    Async generator wired to the Gradio button: every ``yield`` streams an
    updated ``(log_text, report_text)`` pair to the UI. The report stays a
    placeholder until the final AI summary (or a failure message) is ready.
    """
    progress = f"πŸš€ Starting Research on: {topic}\n"
    yield progress, "..."

    # Step 1: build the Bing query URL.
    search_url = engine.search_bing_url(topic)
    progress += f"πŸ”Ž Search URL generated: {search_url}\n"
    yield progress, "..."

    # Step 2: crawl the results page itself.
    progress += "πŸ•·οΈ Scanning Bing Results...\n"
    yield progress, "..."

    serp_markdown = await engine.crawl_single_page(search_url)

    # Step 3: pull candidate result links out of the SERP markdown.
    links = engine.extract_links(serp_markdown)

    if not links:
        progress += "❌ No links found. Even Bing might be blocking the IP, or the page loaded empty.\n"
        progress += f"Debug - Raw Content Length: {len(serp_markdown)}\n"
        yield progress, "Failed to find links."
        return

    progress += f"βœ… Found {len(links)} Links: {links}\n"
    yield progress, "..."

    # Step 4: deep-crawl each link, accumulating (truncated) page text.
    context_data = ""
    total = len(links)
    for position, link in enumerate(links, start=1):
        progress += f"πŸ“₯ Reading ({position}/{total}): {link}...\n"
        yield progress, "..."
        page_text = await engine.crawl_single_page(link)
        context_data += f"\n--- SOURCE: {link} ---\n{page_text[:10000]}\n"

    # Step 5: hand everything to the AI for the final report.
    progress += "🧠 Sending data to AI for final report...\n"
    yield progress, "Thinking..."

    summary = engine.analyze_with_ai(topic, context_data)

    progress += "🏁 Done!"
    yield progress, summary

# UI layout: a topic input + button on top, streaming logs and the final
# markdown report below.
with gr.Blocks(title="AI Research Agent") as demo:
    gr.Markdown("# πŸ€– AI Research Agent (Bing + Crawl4AI)")

    with gr.Row():
        inp = gr.Textbox(label="Topic", placeholder="Enter research topic...")
        btn = gr.Button("Research", variant="primary")

    with gr.Row():
        logs = gr.TextArea(label="System Logs", lines=10)
        out = gr.Markdown(label="Final Report")

    # run_process is an async generator, so each yield updates (logs, out)
    # incrementally while the research runs.
    btn.click(run_process, inputs=inp, outputs=[logs, out])

if __name__ == "__main__":
    # queue() is required for streaming generator outputs; bind to all
    # interfaces on the conventional Spaces port 7860.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)