1qwsd committed on
Commit
f9141cd
Β·
verified Β·
1 Parent(s): bb7338c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +358 -0
app.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""
Research Agent - Web Search and Summarization Tool
Deployed on Hugging Face Spaces with Gradio
"""

import re
import urllib.parse
from ddgs import DDGS
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import numpy as np
import time
import gradio as gr

# Configuration
SEARCH_RESULTS = 6        # number of search hits requested from DuckDuckGo
PASSAGES_PER_PAGE = 4     # max passages kept per fetched page
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # sentence-transformers checkpoint
TOP_PASSAGES = 5          # passages surfaced after similarity ranking
SUMMARY_SENTENCES = 3     # sentences selected for the extractive summary
TIMEOUT = 8               # per-request HTTP timeout, in seconds
24
+
25
+
26
def unwrap_ddg(url):
    """Resolve a DuckDuckGo redirect wrapper to the target URL.

    DuckDuckGo sometimes wraps result links as
    ``https://duckduckgo.com/l/?uddg=<percent-encoded-target>``. When that
    shape is detected, the decoded target is returned; any other URL (or any
    parsing failure) yields the input unchanged.
    """
    try:
        pieces = urllib.parse.urlparse(url)
        if "duckduckgo.com" not in pieces.netloc:
            return url
        wrapped = urllib.parse.parse_qs(pieces.query).get("uddg")
        if wrapped:
            return urllib.parse.unquote(wrapped[0])
    except Exception:
        # Malformed input is passed through untouched rather than raising.
        pass
    return url
38
+
39
+
40
def search_web(query, max_results=SEARCH_RESULTS):
    """Run a DuckDuckGo text search and return a list of result URLs.

    Redirect wrappers are unwrapped via ``unwrap_ddg``; any search failure
    degrades to whatever URLs were collected so far (possibly an empty list).
    """
    found = []
    try:
        with DDGS() as session:
            for hit in session.text(query, max_results=max_results):
                link = hit.get("href") or hit.get("url")
                if link:
                    found.append(unwrap_ddg(link))
    except Exception as e:
        # Network / rate-limit errors: log and fall through with partial results.
        print(f"Search error: {e}")
    return found
54
+
55
+
56
def fetch_text(url, timeout=TIMEOUT):
    """Download a page and return its readable text content.

    Extraction tries, in order: concatenated <p> text, the description /
    og:description meta tag, and finally the <title>. Returns "" for non-200
    responses, non-HTML content types, or any request/parsing failure.
    """
    try:
        resp = requests.get(
            url,
            timeout=timeout,
            headers={"User-Agent": "Mozilla/5.0 (research-agent)"},
            allow_redirects=True,
        )
        if resp.status_code != 200:
            return ""
        if "html" not in resp.headers.get("content-type", "").lower():
            return ""

        soup = BeautifulSoup(resp.text, "html.parser")

        # Drop page chrome and non-content elements before extracting text.
        for node in soup(["script", "style", "noscript", "header", "footer",
                          "svg", "iframe", "nav", "aside"]):
            node.extract()

        # Primary source: visible paragraph text, whitespace-normalized.
        pieces = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
        body = " ".join(filter(None, pieces))
        if body.strip():
            return re.sub(r"\s+", " ", body).strip()

        # Fallback 1: meta description (plain or OpenGraph).
        meta = (soup.find("meta", attrs={"name": "description"})
                or soup.find("meta", attrs={"property": "og:description"}))
        if meta and meta.get("content"):
            return meta["content"].strip()

        # Fallback 2: the page title.
        if soup.title and soup.title.string:
            return soup.title.string.strip()

    except Exception as e:
        print(f"Fetch error for {url}: {e}")
    return ""
93
+
94
+
95
def chunk_passages(text, max_words=120):
    """Break *text* into word-bounded passages of at most *max_words* words.

    Empty or whitespace-only input yields an empty list.
    """
    words = text.split()
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]
107
+
108
+
109
def split_sentences(text):
    """Naively split *text* into sentences.

    A boundary is any run of whitespace preceded by '.', '!' or '?'.
    Empty fragments are discarded; each sentence is stripped.
    """
    return [
        fragment.strip()
        for fragment in re.split(r'(?<=[.!?])\s+', text)
        if fragment.strip()
    ]
113
+
114
+
115
class ShortResearchAgent:
    """Minimal research pipeline: search -> fetch -> embed -> rank -> summarize.

    Uses a sentence-transformers model to rank fetched passages (and then
    individual sentences) by cosine similarity to the query, producing an
    extractive, source-linked summary.
    """

    def __init__(self, embed_model=EMBEDDING_MODEL):
        print(f"Loading embedder: {embed_model}...")
        # Shared embedder for passages, sentences, and the query itself.
        self.embedder = SentenceTransformer(embed_model)

    def run(self, query, progress=gr.Progress()):
        """Run the research agent pipeline.

        Args:
            query: Free-text research question.
            progress: Gradio progress tracker (the default-argument form is
                Gradio's documented injection pattern, not a shared mutable).

        Returns:
            dict with keys: "query" (str), "passages" (list of
            {"url", "passage", "score"} dicts), "summary" (markdown str),
            "time" (elapsed seconds, float), "num_urls" (int).
        """
        start = time.time()

        # Step 1: Search
        progress(0.1, desc="πŸ” Searching the web...")
        urls = search_web(query)

        if not urls:
            elapsed = time.time() - start
            return {
                "query": query,
                "passages": [],
                "summary": "⚠️ No search results found. Please try a different query.",
                "time": elapsed,
                "num_urls": 0
            }

        # Step 2: Fetch & Chunk — cap passages per page so one long page
        # cannot dominate the candidate pool.
        progress(0.3, desc=f"πŸ“₯ Fetching content from {len(urls)} URLs...")
        docs = []
        for u in urls:
            txt = fetch_text(u)
            if not txt:
                continue
            chunks = chunk_passages(txt, max_words=120)
            for c in chunks[:PASSAGES_PER_PAGE]:
                docs.append({"url": u, "passage": c})

        if not docs:
            elapsed = time.time() - start
            return {
                "query": query,
                "passages": [],
                "summary": "⚠️ No content could be extracted from the search results.",
                "time": elapsed,
                "num_urls": len(urls)
            }

        # Step 3: Embed all candidate passages plus the query.
        progress(0.5, desc="🧠 Analyzing content with AI...")
        texts = [d["passage"] for d in docs]
        emb_texts = self.embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
        q_emb = self.embedder.encode([query], convert_to_numpy=True)[0]

        # Step 4: Rank passages by cosine similarity to the query.
        progress(0.7, desc="πŸ“Š Ranking relevant passages...")
        def cosine(a, b):
            # 1e-10 guards against division by zero for degenerate embeddings.
            return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)

        sims = [cosine(e, q_emb) for e in emb_texts]
        top_idx = np.argsort(sims)[::-1][:TOP_PASSAGES]
        top_passages = [
            {
                "url": docs[i]["url"],
                "passage": docs[i]["passage"],
                "score": float(sims[i])
            }
            for i in top_idx
        ]

        # Step 5: Extractive summary — re-rank individual sentences from the
        # top passages and keep the best SUMMARY_SENTENCES of them.
        progress(0.9, desc="✍️ Generating summary...")
        if not top_passages:
            summary = "⚠️ No relevant passages found for summarization."
        else:
            sentences = []
            for tp in top_passages:
                for s in split_sentences(tp["passage"]):
                    sentences.append({"sent": s, "url": tp["url"]})

            if not sentences:
                summary = "⚠️ No sentences found in relevant passages."
            else:
                sent_texts = [s["sent"] for s in sentences]
                sent_embs = self.embedder.encode(sent_texts, convert_to_numpy=True,
                                                 show_progress_bar=False)
                sent_sims = [cosine(e, q_emb) for e in sent_embs]
                top_sent_idx = np.argsort(sent_sims)[::-1][:SUMMARY_SENTENCES]
                chosen = [sentences[idx] for idx in top_sent_idx]

                # De-duplicate (first 80 lowercase chars as the key) and
                # format each kept sentence with a markdown source link.
                seen = set()
                lines = []
                for s in chosen:
                    key = s["sent"].lower()[:80]
                    if key in seen:
                        continue
                    seen.add(key)
                    lines.append(f"{s['sent']} [(Source)]({s['url']})")

                summary = "\n\n".join(lines)

        elapsed = time.time() - start
        progress(1.0, desc="βœ… Complete!")

        return {
            "query": query,
            "passages": top_passages,
            "summary": summary,
            "time": elapsed,
            "num_urls": len(urls)
        }
223
+
224
+
225
# Initialize the agent globally.
# NOTE: this loads the embedding model at import time, so the Space is warm
# and ready to serve as soon as the UI comes up.
print("Initializing Research Agent...")
agent = ShortResearchAgent()
228
+
229
+
230
def research_interface(query):
    """Gradio interface function.

    Validates the query, runs the global agent, and returns a pair of
    markdown strings: (summary panel, detailed-passages panel). On validation
    failure or any pipeline error, the first element carries the error
    message and the second is empty.
    """
    if not query or len(query.strip()) < 3:
        return "❌ Please enter a valid query (at least 3 characters).", ""

    try:
        result = agent.run(query.strip())

        # Format summary panel as markdown.
        summary_md = f"""# πŸ“ Research Summary

**Query:** {result['query']}

**Time taken:** {result['time']:.2f} seconds
**URLs searched:** {result['num_urls']}

---

## Summary

{result['summary']}
"""

        # Format detailed passages, one section per ranked passage.
        passages_md = "# πŸ” Top Relevant Passages\n\n"
        if result['passages']:
            for i, p in enumerate(result['passages'], 1):
                passages_md += f"""### Passage {i} (Relevance: {p['score']:.2%})

**Source:** [{p['url']}]({p['url']})

{p['passage']}

---

"""
        else:
            passages_md += "No passages found."

        return summary_md, passages_md

    except Exception as e:
        # Top-level UI boundary: surface the error to the user instead of
        # letting the request fail opaquely.
        error_msg = f"❌ **Error:** {str(e)}\n\nPlease try again with a different query."
        return error_msg, ""
274
+
275
+
276
# Create Gradio interface: header, query row, example buttons, output panels,
# and event wiring. All components live inside one Blocks context.
with gr.Blocks(theme=gr.themes.Soft(), title="AI Research Agent") as demo:
    gr.Markdown("""
# πŸ€– AI Research Agent

### Intelligent Web Search & Summarization Tool

This tool searches the web, analyzes multiple sources, and provides you with:
- **AI-generated summary** of the most relevant information
- **Top passages** ranked by relevance with sources
- **Fast results** powered by semantic search

Simply enter your question below and let the AI do the research for you!
""")

    # Query input next to the submit button.
    with gr.Row():
        with gr.Column(scale=4):
            query_input = gr.Textbox(
                label="πŸ” Enter your research query",
                placeholder="e.g., What causes urban heat islands and how can cities reduce them?",
                lines=2
            )
        with gr.Column(scale=1):
            search_btn = gr.Button("πŸš€ Research", variant="primary", size="lg")

    # One-click example queries (wired to handlers further below).
    gr.Markdown("### πŸ’‘ Example Queries")
    with gr.Row():
        example_btns = [
            gr.Button("🌑️ Urban heat islands", size="sm"),
            gr.Button("πŸ€– Latest AI developments", size="sm"),
            gr.Button("🌱 Sustainable energy solutions", size="sm"),
            gr.Button("🧬 CRISPR gene editing", size="sm")
        ]

    gr.Markdown("---")

    # Output panels: summary is always visible, passages are collapsible.
    with gr.Row():
        with gr.Column():
            summary_output = gr.Markdown(label="Summary")

    with gr.Accordion("πŸ“š Detailed Passages", open=False):
        passages_output = gr.Markdown(label="Top Passages")

    # Event handlers: button click and textbox Enter both trigger a search.
    search_btn.click(
        fn=research_interface,
        inputs=[query_input],
        outputs=[summary_output, passages_output]
    )

    query_input.submit(
        fn=research_interface,
        inputs=[query_input],
        outputs=[summary_output, passages_output]
    )

    # Example button handlers: each button fills the query textbox with the
    # matching full query text (order-aligned with example_btns above).
    example_queries = [
        "What causes urban heat islands and how can cities reduce them?",
        "What are the latest developments in artificial intelligence?",
        "What are the most promising sustainable energy solutions?",
        "How does CRISPR gene editing work and what are its applications?"
    ]

    for btn, query in zip(example_btns, example_queries):
        btn.click(
            # q=query binds the loop variable as a default, avoiding the
            # late-binding-closure pitfall (all lambdas seeing the last query).
            fn=lambda q=query: q,
            outputs=[query_input]
        )

    gr.Markdown("""
---
### πŸ“Œ Tips
- Be specific with your queries for better results
- The tool analyzes 6 web sources by default
- Results typically take 10-30 seconds depending on query complexity

**Built with:** DuckDuckGo Search, Sentence Transformers, Gradio
""")
355
+
356
# Launch the app only when executed directly (Spaces runs this as a script).
if __name__ == "__main__":
    demo.launch()