mdnazib963 committed on
Commit
f91ccd4
·
verified ·
1 Parent(s): 8781d8e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -0
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import asyncio
3
+ import nest_asyncio
4
+ import re
5
+ import urllib.parse
6
+ import os
7
+ from crawl4ai import AsyncWebCrawler
8
+ from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
9
+ from gradio_client import Client
10
+
11
+ # Apply nest_asyncio to handle the event loop in the cloud
12
+ nest_asyncio.apply()
13
+
14
+ # --- CONFIGURATIONS ---
15
+ AI_CLIENT_URL = "zai-org/GLM-4.5-Space"
16
+ MAX_LINKS = 3
17
+
class CloudResearchEngine:
    """Search Google, crawl the top results, and summarize them via a remote AI Space.

    Pipeline helpers: build a search URL, crawl pages to markdown with
    Crawl4AI, extract external result links, and send the aggregated text
    to the GLM-4.5 Space for synthesis.
    """

    def __init__(self):
        """Prepare crawler configs and connect the gradio_client to the AI Space."""
        # Browser config optimized for Docker/Cloud containers
        self.browser_conf = BrowserConfig(
            headless=True,
            verbose=False,
            # Specific args to run safely in Docker
            args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
        )
        self.run_conf = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            # Stealth headers to try and bypass simple bot detection
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            },
        )
        # NOTE(review): constructing Client performs a network handshake with the Space
        self.ai_client = Client(AI_CLIENT_URL)

    def search_google_url(self, query):
        """Return the Google Search URL for *query* (English UI, US geolocation)."""
        encoded_query = urllib.parse.quote_plus(query)
        # We add 'gl=us' (GeoLocation US) and 'hl=en' (Language English)
        return f"https://www.google.com/search?q={encoded_query}&num=10&hl=en&gl=us"

    async def crawl_single_page(self, url):
        """Crawl *url* and return its markdown, or a bracketed error string on failure.

        Never raises: both crawl failures and unexpected exceptions are folded
        into the returned text so the calling pipeline keeps going.
        """
        async with AsyncWebCrawler(config=self.browser_conf) as crawler:
            try:
                # Add a small delay to be polite and avoid immediate blocks
                await asyncio.sleep(1)
                result = await crawler.arun(url=url, config=self.run_conf)

                if result.success:
                    return result.markdown
                return f"[Error: Could not read {url} - {result.error_message}]"
            except Exception as e:
                return f"[System Error reading {url}: {str(e)}]"

    def extract_links(self, markdown_text):
        """Return up to MAX_LINKS external result URLs found in SERP markdown.

        Skips Google/YouTube links and very short URLs, and keeps at most one
        URL per domain. FIX: de-duplication now compares exact netlocs via a
        set; the previous substring test (``domain in collected_url``) could
        wrongly discard a different site whose URL merely contained an earlier
        domain as a substring.
        """
        # Standard markdown links [text](url)
        links = re.findall(r'\[(.*?)\]\((https?://.*?)\)', markdown_text)

        clean_urls = []
        seen_domains = set()
        for _text, url in links:
            # Filter out Google internal links and tiny links
            if "google.com" in url or "youtube.com" in url:
                continue
            if len(url) < 15:
                continue

            # De-duplicate by exact domain
            domain = urllib.parse.urlparse(url).netloc
            if domain not in seen_domains:
                seen_domains.add(domain)
                clean_urls.append(url)

        return clean_urls[:MAX_LINKS]

    def analyze_with_ai(self, prompt, context):
        """Send the research query plus crawled context to the GLM-4.5 Space.

        Returns the model's answer as a string, or a warning string if the
        remote API call fails (never raises).
        """
        full_msg = (
            f"RESEARCH QUERY: {prompt}\n\n"
            f"EXTRACTED WEB DATA:\n{context}\n\n"
            f"TASK: Synthesize this information into a clear summary answer."
        )

        try:
            result = self.ai_client.predict(
                msg=full_msg,
                sys_prompt="You are a helpful research assistant. Summarize the web data accurately.",
                thinking_enabled=True,
                temperature=0.7,
                api_name="/chat_wrapper"
            )
            return str(result)
        except Exception as e:
            return f"⚠️ AI API Failed: {str(e)}"
+
97
+ # --- GRADIO INTERFACE ---
98
+
99
+ engine = CloudResearchEngine()
100
+
async def run_process(topic):
    """Run the full research pipeline for *topic*.

    Async generator yielding ``(log_text, report_text)`` pairs so Gradio can
    stream progress into the log and report widgets as each stage completes.
    """
    trace = f"πŸš€ Starting Research on: {topic}\n"
    yield trace, "..."

    # Stage 1: build the Google search URL for the topic
    search_url = engine.search_google_url(topic)
    trace += f"πŸ”Ž Search URL generated: {search_url}\n"
    yield trace, "..."

    # Stage 2: crawl the search-results page itself
    trace += "πŸ•·οΈ Scanning Search Results (this may take 10s)...\n"
    yield trace, "..."
    serp_markdown = await engine.crawl_single_page(search_url)

    # Stage 3: pull candidate result links out of the SERP markdown
    links = engine.extract_links(serp_markdown)
    if not links:
        # Bail out early — likely bot-blocked, nothing to crawl
        trace += "❌ No links found. Google might have blocked the Cloud IP. Try a more specific query.\n"
        trace += f"Debug - Raw Content Length: {len(serp_markdown)}\n"
        yield trace, "Failed to find links."
        return

    trace += f"βœ… Found {len(links)} Links: {links}\n"
    yield trace, "..."

    # Stage 4: deep-crawl each link, truncating each page to 10k characters
    context_data = ""
    for position, link in enumerate(links, start=1):
        trace += f"πŸ“₯ Reading ({position}/{len(links)}): {link}...\n"
        yield trace, "..."
        page_text = await engine.crawl_single_page(link)
        context_data += f"\n--- SOURCE: {link} ---\n{page_text[:10000]}\n"

    # Stage 5: hand the aggregated context to the AI for the final report
    trace += "🧠 Sending data to AI for final report...\n"
    yield trace, "Thinking..."
    summary = engine.analyze_with_ai(topic, context_data)

    trace += "🏁 Done!"
    yield trace, summary
+
145
+ with gr.Blocks(title="AI Research Agent") as demo:
146
+ gr.Markdown("# πŸ€– AI Research Agent (Docker/Crawl4AI)")
147
+
148
+ with gr.Row():
149
+ inp = gr.Textbox(label="Topic", placeholder="Enter research topic...")
150
+ btn = gr.Button("Research", variant="primary")
151
+
152
+ with gr.Row():
153
+ logs = gr.TextArea(label="System Logs", lines=10)
154
+ out = gr.Markdown(label="Final Report")
155
+
156
+ btn.click(run_process, inputs=inp, outputs=[logs, out])
157
+
158
+ if __name__ == "__main__":
159
+ demo.queue().launch(server_name="0.0.0.0", server_port=7860)