# GAIA_Agent_DeepResearch / deep_research_tool.py
"""
Deep Research Tool - 教学版本
基于现有工具实现多源信息整合的深度研究功能
设计思路:
1. 不添加新的外部依赖
2. 组合现有的 wiki_search, web_search, arxiv_search
3. 实现智能去重和结果整合
4. 提供清晰的来源追踪
"""
from langchain_core.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
from typing import Dict, List
import hashlib
@tool
def deep_research(query: str) -> str:
    """Perform comprehensive multi-source research on a topic.

    This tool combines Wikipedia, Web Search, and Academic Papers to provide
    a thorough analysis from multiple perspectives.

    Use this when:
    - You need to verify information from multiple sources
    - The question requires both general knowledge and current information
    - You want academic backing for factual claims

    Args:
        query: The research question or topic

    Returns:
        Structured research report with information from multiple sources
    """
    print(f"\n{'='*60}")
    print(f"🔍 Deep Research: {query}")
    print(f"{'='*60}\n")

    results: Dict[str, List[Dict]] = {
        "wikipedia": [],
        "web": [],
        "academic": [],
    }

    # === Phase 1: multi-source search ===
    # Each source is fetched independently; a failure in one must not
    # abort the others, so each helper swallows and reports its own errors.
    print("📚 Phase 1: Multi-source search...")
    _search_wikipedia(query, results["wikipedia"])
    _search_web(query, results["web"])
    _search_arxiv(query, results["academic"])

    # === Phase 2: deduplication ===
    print("\n🧹 Phase 2: Deduplication...")
    unique_results = deduplicate_results(results)
    print(f" → Removed duplicates, kept {sum(len(v) for v in unique_results.values())} unique results")

    # === Phase 3: format the report ===
    print("\n📝 Phase 3: Formatting report...\n")
    report = format_research_report(query, unique_results)
    return report


def _search_wikipedia(query: str, bucket: List[Dict]) -> None:
    """Fetch up to 2 Wikipedia articles (authoritative background) into bucket."""
    try:
        print(" → Searching Wikipedia...")
        for doc in WikipediaLoader(query=query, load_max_docs=2).load():
            bucket.append({
                "source": doc.metadata.get("source", "Wikipedia"),
                "content": doc.page_content[:2000],  # preview: first 2000 characters
                "full_content": doc.page_content,
            })
        print(f" ✓ Found {len(bucket)} Wikipedia articles")
    except Exception as e:
        # Best-effort: log and continue with the remaining sources.
        print(f" ✗ Wikipedia search failed: {e}")


def _search_web(query: str, bucket: List[Dict]) -> None:
    """Fetch up to 10 current web results via Tavily into bucket."""
    try:
        print(" → Searching Web (Tavily)...")
        tavily_tool = TavilySearchResults(max_results=10)
        web_docs = tavily_tool.invoke({"query": query})  # correct invocation style
        for doc in web_docs:
            # Tavily returns a list of dicts; skip anything else defensively.
            if isinstance(doc, dict):
                bucket.append({
                    "source": doc.get("url", "Web"),
                    "content": doc.get("content", "")[:2000],
                    "full_content": doc.get("content", ""),
                })
        print(f" ✓ Found {len(bucket)} web results")
    except Exception as e:
        # Best-effort: log and continue with the remaining sources.
        print(f" ✗ Web search failed: {e}")


def _search_arxiv(query: str, bucket: List[Dict]) -> None:
    """Fetch up to 5 Arxiv papers (academic backing) into bucket."""
    try:
        print(" → Searching Arxiv...")
        for doc in ArxivLoader(query=query, load_max_docs=5).load():
            bucket.append({
                "source": doc.metadata.get("source", "Arxiv"),
                "content": doc.page_content[:2000],  # preview: first 2000 characters
                "full_content": doc.page_content,
            })
        print(f" ✓ Found {len(bucket)} academic papers")
    except Exception as e:
        # Best-effort: log and continue with the remaining sources.
        print(f" ✗ Arxiv search failed: {e}")
def deduplicate_results(results: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
    """Remove near-duplicate results across all categories.

    Strategy (simple hashing):
    - Take the first 100 characters of each result's content, lowercased
      and stripped, as a cheap similarity key.
    - MD5-hash the sample and keep only the first occurrence of each hash.
    - Deduplication is global across categories, so identical text under
      two categories survives only in the category seen first.

    Args:
        results: Mapping of category name -> list of result dicts; each
            dict must have a "content" key.

    Returns:
        A new mapping with the same category keys, each holding only the
        first occurrence of each distinct content sample.
    """
    seen_hashes = set()
    # Mirror whatever categories the caller supplied instead of hardcoding
    # "wikipedia"/"web"/"academic", so extra categories don't raise KeyError.
    unique_results: Dict[str, List[Dict]] = {category: [] for category in results}
    for category, items in results.items():
        for item in items:
            # Hash the normalized 100-char sample of the content.
            content_sample = item["content"][:100].lower().strip()
            content_hash = hashlib.md5(content_sample.encode()).hexdigest()
            if content_hash not in seen_hashes:
                seen_hashes.add(content_hash)
                unique_results[category].append(item)
    return unique_results
def format_research_report(query: str, results: Dict[str, List[Dict]]) -> str:
    """Render the collected research results as a plain-text report.

    Report layout:
    1. Title banner
    2. Overview with per-category counts
    3. One findings section per non-empty category
    4. Flat numbered list of all sources
    5. Footer
    """
    lines: List[str] = []
    add = lines.append

    # Title banner
    add(f"DEEP RESEARCH REPORT: {query}")
    add("=" * 70)
    add("")

    # Overview: total plus per-category counts
    add("📊 OVERVIEW")
    add(f"Total sources found: {sum(len(v) for v in results.values())}")
    add(f" - Wikipedia: {len(results['wikipedia'])} articles")
    add(f" - Web: {len(results['web'])} pages")
    add(f" - Academic: {len(results['academic'])} papers")
    add("")

    # The three findings sections share the same layout, so drive them
    # from a (category, heading) table instead of repeating the code.
    sections = (
        ("wikipedia", "📚 WIKIPEDIA FINDINGS"),
        ("web", "🌐 WEB FINDINGS"),
        ("academic", "🎓 ACADEMIC FINDINGS"),
    )
    for key, heading in sections:
        if not results[key]:
            continue
        add(heading)
        add("-" * 70)
        for idx, entry in enumerate(results[key], 1):
            add(f"\n[{idx}] Source: {entry['source']}")
            add(f"Content: {entry['content']}...")
        add("")

    # Flat numbered list of every source across all categories
    add("📋 ALL SOURCES")
    add("-" * 70)
    counter = 1
    for entries in results.values():
        for entry in entries:
            add(f"[{counter}] {entry['source']}")
            counter += 1
    add("")
    add("=" * 70)
    add("End of Deep Research Report")
    return "\n".join(lines)
# ============================================================================
# 进阶版本:带有简单的信息综合
# ============================================================================
@tool
def deep_research_with_synthesis(query: str) -> str:
    """Advanced deep research with AI-powered synthesis.

    This version not only collects information but also:
    1. Identifies common themes across sources
    2. Highlights contradictions
    3. Provides a synthesized summary

    Args:
        query: The research question

    Returns:
        Research report with synthesis
    """
    # Run the basic multi-source research first.
    report = deep_research.invoke({"query": query})

    # TODO: add LLM-driven synthesis in the future — e.g. have an LLM
    # analyze all sources and surface agreements and differences.
    synthesis_note = """
🤖 SYNTHESIS (To be implemented):
This section will use an LLM to:
- Identify key themes across all sources
- Highlight agreements and contradictions
- Provide confidence scores for different claims
- Suggest follow-up questions
For now, please manually review the findings above.
"""
    return report + synthesis_note
# ============================================================================
# 测试代码
# ============================================================================
if __name__ == "__main__":
# 测试 deep research
test_query = "What is quantum computing?"
print("\n" + "="*70)
print("Testing Deep Research Tool")
print("="*70 + "\n")
result = deep_research.invoke({"query": test_query})
print(result)