"""
Deep Research Tool - teaching edition

Implements multi-source deep research on top of existing tools.

Design goals:
1. Add no new external dependencies
2. Compose the existing wiki_search, web_search, and arxiv_search tools
3. Perform simple content-based deduplication and result merging
4. Provide clear source tracking
"""
import hashlib
from typing import Dict, List

from langchain_community.document_loaders import ArxivLoader, WikipediaLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.tools import tool
@tool
def deep_research(query: str) -> str:
    """Perform comprehensive multi-source research on a topic.

    This tool combines Wikipedia, Web Search, and Academic Papers to provide
    a thorough analysis from multiple perspectives.

    Use this when:
    - You need to verify information from multiple sources
    - The question requires both general knowledge and current information
    - You want academic backing for factual claims

    Args:
        query: The research question or topic

    Returns:
        Structured research report with information from multiple sources
    """
    # NOTE: the ``@tool`` decorator is required — callers invoke this as a
    # LangChain tool via ``deep_research.invoke({"query": ...})``.
    print(f"\n{'='*60}")
    print(f"🔍 Deep Research: {query}")
    print(f"{'='*60}\n")

    results: Dict[str, List[Dict]] = {
        "wikipedia": [],
        "web": [],
        "academic": []
    }

    # === Phase 1: multi-source search ===
    print("📚 Phase 1: Multi-source search...")

    # 1.1 Wikipedia — authoritative background knowledge.
    # Each source is best-effort: a failure is logged and the run continues.
    try:
        print(" → Searching Wikipedia...")
        wiki_docs = WikipediaLoader(query=query, load_max_docs=2).load()
        for doc in wiki_docs:
            results["wikipedia"].append({
                "source": doc.metadata.get("source", "Wikipedia"),
                "content": doc.page_content[:2000],  # keep the first 2000 characters
                "full_content": doc.page_content
            })
        print(f" ✓ Found {len(results['wikipedia'])} Wikipedia articles")
    except Exception as e:
        print(f" ✗ Wikipedia search failed: {e}")

    # 1.2 Web search (Tavily) — current information.
    try:
        print(" → Searching Web (Tavily)...")
        tavily_tool = TavilySearchResults(max_results=10)
        web_docs = tavily_tool.invoke({"query": query})  # Tavily expects a dict payload
        for doc in web_docs:
            # Tavily returns a list of dicts; skip any other element defensively.
            if isinstance(doc, dict):
                results["web"].append({
                    "source": doc.get("url", "Web"),
                    "content": doc.get("content", "")[:2000],
                    "full_content": doc.get("content", "")
                })
        print(f" ✓ Found {len(results['web'])} web results")
    except Exception as e:
        print(f" ✗ Web search failed: {e}")

    # 1.3 Arxiv — academic papers.
    try:
        print(" → Searching Arxiv...")
        arxiv_docs = ArxivLoader(query=query, load_max_docs=5).load()
        for doc in arxiv_docs:
            results["academic"].append({
                "source": doc.metadata.get("source", "Arxiv"),
                "content": doc.page_content[:2000],
                "full_content": doc.page_content
            })
        print(f" ✓ Found {len(results['academic'])} academic papers")
    except Exception as e:
        print(f" ✗ Arxiv search failed: {e}")

    # === Phase 2: deduplication ===
    print("\n🧹 Phase 2: Deduplication...")
    unique_results = deduplicate_results(results)
    print(f" → Removed duplicates, kept {sum(len(v) for v in unique_results.values())} unique results")

    # === Phase 3: formatted output ===
    print("\n📝 Phase 3: Formatting report...\n")
    report = format_research_report(query, unique_results)

    return report
def deduplicate_results(results: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
    """Drop near-duplicate entries across all result categories.

    Each entry is fingerprinted by a simple hash:
    - take the first 100 characters of its "content"
    - lower-case and strip them
    - hash with MD5
    Only the first entry carrying a given fingerprint is kept; the seen-set
    is shared across categories, so a web copy of a Wikipedia snippet is
    removed too.
    """
    fingerprints_seen = set()
    deduped: Dict[str, List[Dict]] = {
        "wikipedia": [],
        "web": [],
        "academic": []
    }

    for category, entries in results.items():
        for entry in entries:
            sample = entry["content"][:100].lower().strip()
            fingerprint = hashlib.md5(sample.encode()).hexdigest()
            if fingerprint in fingerprints_seen:
                continue
            fingerprints_seen.add(fingerprint)
            deduped[category].append(entry)

    return deduped
def _append_findings(lines: List[str], heading: str, items: List[Dict]) -> None:
    """Append one findings section (heading + numbered entries) to *lines*.

    Does nothing when *items* is empty, so empty sections are omitted
    from the report entirely.
    """
    if not items:
        return
    lines.append(heading)
    lines.append("-" * 70)
    for i, item in enumerate(items, 1):
        lines.append(f"\n[{i}] Source: {item['source']}")
        lines.append(f"Content: {item['content']}...")
    lines.append("")


def format_research_report(query: str, results: Dict[str, List[Dict]]) -> str:
    """Format the research results as a plain-text report.

    Report structure:
    1. Overview (source counts)
    2. Wikipedia findings
    3. Web findings
    4. Academic findings
    5. Numbered list of all sources

    Args:
        query: The research question, echoed in the report title.
        results: Per-category result entries, each a dict with at least
            "source" and "content" keys.

    Returns:
        The full report as a single newline-joined string.
    """
    report_lines: List[str] = []

    # === Title ===
    report_lines.append(f"DEEP RESEARCH REPORT: {query}")
    report_lines.append("=" * 70)
    report_lines.append("")

    # === Overview ===
    total_sources = sum(len(v) for v in results.values())
    report_lines.append("📊 OVERVIEW")
    report_lines.append(f"Total sources found: {total_sources}")
    report_lines.append(f" - Wikipedia: {len(results['wikipedia'])} articles")
    report_lines.append(f" - Web: {len(results['web'])} pages")
    report_lines.append(f" - Academic: {len(results['academic'])} papers")
    report_lines.append("")

    # === Findings (the three sections share one formatting helper) ===
    _append_findings(report_lines, "📚 WIKIPEDIA FINDINGS", results["wikipedia"])
    _append_findings(report_lines, "🌐 WEB FINDINGS", results["web"])
    _append_findings(report_lines, "🎓 ACADEMIC FINDINGS", results["academic"])

    # === Source list (numbered continuously across categories) ===
    report_lines.append("📋 ALL SOURCES")
    report_lines.append("-" * 70)
    source_count = 1
    for items in results.values():
        for item in items:
            report_lines.append(f"[{source_count}] {item['source']}")
            source_count += 1

    report_lines.append("")
    report_lines.append("=" * 70)
    report_lines.append("End of Deep Research Report")

    return "\n".join(report_lines)
# ============================================================================
# Advanced version: with simple information synthesis
# ============================================================================
def deep_research_with_synthesis(query: str) -> str:
    """Advanced deep research with AI-powered synthesis.

    This version not only collects information but also:
    1. Identifies common themes across sources
    2. Highlights contradictions
    3. Provides a synthesized summary

    Args:
        query: The research question

    Returns:
        Research report with synthesis
    """
    # Run the basic deep-research pipeline first.
    report = deep_research.invoke({"query": query})

    # TODO: wire in LLM-driven synthesis later — e.g. have an LLM compare
    # all collected sources to surface agreements and contradictions.
    synthesis_note = """
🤖 SYNTHESIS (To be implemented):
This section will use an LLM to:
- Identify key themes across all sources
- Highlight agreements and contradictions
- Provide confidence scores for different claims
- Suggest follow-up questions
For now, please manually review the findings above.
"""

    return report + synthesis_note
# ============================================================================
# Test code
# ============================================================================
if __name__ == "__main__":
    # Smoke-test the deep research tool with a sample question.
    demo_query = "What is quantum computing?"

    banner = "=" * 70
    print("\n" + banner)
    print("Testing Deep Research Tool")
    print(banner + "\n")

    report = deep_research.invoke({"query": demo_query})
    print(report)