"""Deep Research Tool - teaching edition.

Implements a multi-source "deep research" feature on top of existing tools.

Design notes:
1. Add no new external dependencies.
2. Combine the existing wiki_search, web_search and arxiv_search capabilities.
3. De-duplicate and merge the results.
4. Provide clear source tracking.
"""

import hashlib
from typing import Callable, Dict, List

from langchain_community.document_loaders import ArxivLoader, WikipediaLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.tools import tool

# Maximum number of characters of each document kept as the report snippet.
_SNIPPET_CHARS = 2000
# Number of leading characters of a snippet used as the de-duplication key.
_DEDUP_SAMPLE_CHARS = 100
# Per-source result limits.
_WIKI_MAX_DOCS = 2
_WEB_MAX_RESULTS = 10
_ARXIV_MAX_DOCS = 5

# Categories that are always present in result dictionaries.
_DEFAULT_CATEGORIES = ("wikipedia", "web", "academic")

# (category key, overview label, overview unit, section heading), in report order.
_REPORT_SECTIONS = (
    ("wikipedia", "Wikipedia", "articles", "📚 WIKIPEDIA FINDINGS"),
    ("web", "Web", "pages", "🌐 WEB FINDINGS"),
    ("academic", "Academic", "papers", "🎓 ACADEMIC FINDINGS"),
)

# Placeholder appended by deep_research_with_synthesis.
# NOTE(review): the line breaks of this text were reconstructed from a
# whitespace-collapsed source; confirm against the intended layout.
_SYNTHESIS_NOTE = (
    "\n\n"
    "🤖 SYNTHESIS (To be implemented):\n"
    "This section will use an LLM to:\n"
    "- Identify key themes across all sources\n"
    "- Highlight agreements and contradictions\n"
    "- Provide confidence scores for different claims\n"
    "- Suggest follow-up questions\n"
    "\n"
    "For now, please manually review the findings above.\n"
)


def _make_entry(source: str, text: str) -> Dict[str, str]:
    """Build one result entry with a truncated snippet and the full text.

    A missing (``None``) text is normalized to an empty string so that one
    malformed hit cannot abort the processing of the remaining hits.
    """
    text = text or ""
    return {
        "source": source,
        "content": text[:_SNIPPET_CHARS],
        "full_content": text,
    }


def _search_wikipedia(query: str) -> List[Dict[str, str]]:
    """Load Wikipedia documents for *query* and convert them to entries."""
    docs = WikipediaLoader(query=query, load_max_docs=_WIKI_MAX_DOCS).load()
    return [
        _make_entry(doc.metadata.get("source", "Wikipedia"), doc.page_content)
        for doc in docs
    ]


def _search_web(query: str) -> List[Dict[str, str]]:
    """Run a Tavily web search for *query* and convert the hits to entries."""
    hits = TavilySearchResults(max_results=_WEB_MAX_RESULTS).invoke({"query": query})
    # Only dictionary hits are usable; anything else in the response is skipped.
    return [
        _make_entry(hit.get("url", "Web"), hit.get("content"))
        for hit in hits
        if isinstance(hit, dict)
    ]


def _search_arxiv(query: str) -> List[Dict[str, str]]:
    """Load Arxiv documents for *query* and convert them to entries."""
    docs = ArxivLoader(query=query, load_max_docs=_ARXIV_MAX_DOCS).load()
    return [
        _make_entry(doc.metadata.get("source", "Arxiv"), doc.page_content)
        for doc in docs
    ]


def _safe_search(
    name: str,
    searching_label: str,
    found_label: str,
    fetch: Callable[[str], List[Dict[str, str]]],
    query: str,
) -> List[Dict[str, str]]:
    """Run one source search, reporting progress and tolerating failure.

    Each source is best-effort: if *fetch* raises, the error is printed and
    an empty list is returned so the other sources can still contribute.
    """
    print(f" → Searching {searching_label}...")
    try:
        entries = fetch(query)
    except Exception as e:  # deliberate: one failing source must not abort the report
        print(f" ✗ {name} search failed: {e}")
        return []
    print(f" ✓ Found {len(entries)} {found_label}")
    return entries


@tool
def deep_research(query: str) -> str:
    """Perform comprehensive multi-source research on a topic.

    This tool combines Wikipedia, Web Search, and Academic Papers to provide
    a thorough analysis from multiple perspectives.

    Use this when:
    - You need to verify information from multiple sources
    - The question requires both general knowledge and current information
    - You want academic backing for factual claims

    Args:
        query: The research question or topic

    Returns:
        Structured research report with information from multiple sources
    """
    print(f"\n{'='*60}")
    print(f"🔍 Deep Research: {query}")
    print(f"{'='*60}\n")

    # === Phase 1: multi-source search (sources are queried one after another) ===
    print("📚 Phase 1: Multi-source search...")
    results = {
        # 1.1 Wikipedia - authoritative background knowledge
        "wikipedia": _safe_search(
            "Wikipedia", "Wikipedia", "Wikipedia articles", _search_wikipedia, query
        ),
        # 1.2 Web search - most recent information
        "web": _safe_search(
            "Web", "Web (Tavily)", "web results", _search_web, query
        ),
        # 1.3 Arxiv - academic papers
        "academic": _safe_search(
            "Arxiv", "Arxiv", "academic papers", _search_arxiv, query
        ),
    }

    # === Phase 2: de-duplication ===
    print("\n🧹 Phase 2: Deduplication...")
    unique_results = deduplicate_results(results)
    print(
        f" → Removed duplicates, kept "
        f"{sum(len(v) for v in unique_results.values())} unique results"
    )

    # === Phase 3: report formatting ===
    print("\n📝 Phase 3: Formatting report...\n")
    return format_research_report(query, unique_results)


def deduplicate_results(results: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
    """Remove results whose snippet starts with the same text as an earlier one.

    The first ``_DEDUP_SAMPLE_CHARS`` characters of each entry's ``"content"``
    are lower-cased, stripped and hashed; an entry is kept only if its hash has
    not been seen before. Hashes are shared across categories, so a duplicate
    is dropped even when it comes from a different source than the original.

    Args:
        results: Mapping of category name to a list of entries. Every entry
            must provide a ``"content"`` string.

    Returns:
        A new mapping with the surviving entries in their original order. The
        keys "wikipedia", "web" and "academic" are always present; any other
        category found in *results* is preserved as well.
    """
    seen_hashes = set()
    unique_results: Dict[str, List[Dict]] = {
        category: [] for category in _DEFAULT_CATEGORIES
    }

    for category, items in results.items():
        # setdefault keeps categories beyond the default three instead of failing.
        bucket = unique_results.setdefault(category, [])
        for item in items:
            sample = item["content"][:_DEDUP_SAMPLE_CHARS].lower().strip()
            # surrogatepass: scraped text may contain lone surrogates, which a
            # strict UTF-8 encode would reject.
            digest = hashlib.md5(sample.encode("utf-8", "surrogatepass")).hexdigest()
            if digest not in seen_hashes:
                seen_hashes.add(digest)
                bucket.append(item)

    return unique_results


def _snippet_suffix(item: Dict) -> str:
    """Return "..." unless the entry's snippet is known to be the full text."""
    full_text = item.get("full_content")
    if full_text is not None and len(full_text) <= len(item["content"]):
        return ""
    return "..."


def format_research_report(query: str, results: Dict[str, List[Dict]]) -> str:
    """Render the collected results as a plain-text research report.

    Report layout:
    1. Overview
    2. Wikipedia findings
    3. Web findings
    4. Academic findings
    5. Source list

    Args:
        query: The research question shown in the report title.
        results: Mapping of category name to a list of entries. Every entry
            must provide ``"source"`` and ``"content"``; ``"full_content"``
            is optional and only used to decide whether the snippet was
            truncated. Missing categories are treated as empty.

    Returns:
        The report as a single newline-joined string.
    """
    report_lines = []

    # === Title ===
    report_lines.append(f"DEEP RESEARCH REPORT: {query}")
    report_lines.append("=" * 70)
    report_lines.append("")

    # === Overview ===
    total_sources = sum(len(v) for v in results.values())
    report_lines.append("📊 OVERVIEW")
    report_lines.append(f"Total sources found: {total_sources}")
    for key, label, unit, _heading in _REPORT_SECTIONS:
        report_lines.append(f" - {label}: {len(results.get(key, []))} {unit}")
    report_lines.append("")

    # === Per-category findings (empty categories are omitted) ===
    for key, _label, _unit, heading in _REPORT_SECTIONS:
        items = results.get(key, [])
        if not items:
            continue
        report_lines.append(heading)
        report_lines.append("-" * 70)
        for i, item in enumerate(items, 1):
            report_lines.append(f"\n[{i}] Source: {item['source']}")
            report_lines.append(f"Content: {item['content']}{_snippet_suffix(item)}")
        report_lines.append("")

    # === Source list (numbered continuously across all categories) ===
    report_lines.append("📋 ALL SOURCES")
    report_lines.append("-" * 70)
    source_count = 1
    for items in results.values():
        for item in items:
            report_lines.append(f"[{source_count}] {item['source']}")
            source_count += 1

    report_lines.append("")
    report_lines.append("=" * 70)
    report_lines.append("End of Deep Research Report")

    return "\n".join(report_lines)


# ============================================================================
# Advanced version: adds a simple information-synthesis section
# ============================================================================

@tool
def deep_research_with_synthesis(query: str) -> str:
    """Advanced deep research with AI-powered synthesis.

    This version not only collects information but also:
    1. Identifies common themes across sources
    2. Highlights contradictions
    3. Provides a synthesized summary

    Args:
        query: The research question

    Returns:
        Research report with synthesis
    """
    # Run the basic deep research first.
    basic_report = deep_research.invoke({"query": query})

    # TODO: add LLM-driven synthesis in the future, e.g. have an LLM analyse
    # all sources and point out what they agree and disagree on. Until then a
    # static placeholder note is appended to the basic report.
    return basic_report + _SYNTHESIS_NOTE


# ============================================================================
# Manual test driver
# ============================================================================

if __name__ == "__main__":
    # Exercise the deep research tool with a sample question.
    test_query = "What is quantum computing?"
    print("\n" + "=" * 70)
    print("Testing Deep Research Tool")
    print("=" * 70 + "\n")

    result = deep_research.invoke({"query": test_query})
    print(result)