"""
Deep Research Tool - teaching edition

Implements multi-source deep research on top of existing tools.

Design goals:
1. Add no new external dependencies
2. Compose the existing wiki_search, web_search, and arxiv_search tools
3. Perform simple content-based deduplication and result merging
4. Provide clear source tracking
"""
import hashlib
from typing import Dict, List

from langchain_community.document_loaders import ArxivLoader, WikipediaLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.tools import tool
@tool
def deep_research(query: str) -> str:
    """Perform comprehensive multi-source research on a topic.

    This tool combines Wikipedia, Web Search, and Academic Papers to provide
    a thorough analysis from multiple perspectives.

    Use this when:
    - You need to verify information from multiple sources
    - The question requires both general knowledge and current information
    - You want academic backing for factual claims

    Args:
        query: The research question or topic

    Returns:
        Structured research report with information from multiple sources
    """
    # NOTE: the ``@tool`` decorator is required — callers invoke this as a
    # LangChain tool via ``deep_research.invoke({"query": ...})``.
    print(f"\n{'='*60}")
    print(f"🔍 Deep Research: {query}")
    print(f"{'='*60}\n")

    results: Dict[str, List[Dict]] = {
        "wikipedia": [],
        "web": [],
        "academic": []
    }

    # === Phase 1: multi-source search ===
    print("📚 Phase 1: Multi-source search...")

    # 1.1 Wikipedia — authoritative background knowledge.
    # Each source is best-effort: a failure is logged and the run continues.
    try:
        print(" → Searching Wikipedia...")
        wiki_docs = WikipediaLoader(query=query, load_max_docs=2).load()
        for doc in wiki_docs:
            results["wikipedia"].append({
                "source": doc.metadata.get("source", "Wikipedia"),
                "content": doc.page_content[:2000],  # keep the first 2000 characters
                "full_content": doc.page_content
            })
        print(f" ✓ Found {len(results['wikipedia'])} Wikipedia articles")
    except Exception as e:
        print(f" ✗ Wikipedia search failed: {e}")

    # 1.2 Web search (Tavily) — current information.
    try:
        print(" → Searching Web (Tavily)...")
        tavily_tool = TavilySearchResults(max_results=10)
        web_docs = tavily_tool.invoke({"query": query})  # Tavily expects a dict payload
        for doc in web_docs:
            # Tavily returns a list of dicts; skip any other element defensively.
            if isinstance(doc, dict):
                results["web"].append({
                    "source": doc.get("url", "Web"),
                    "content": doc.get("content", "")[:2000],
                    "full_content": doc.get("content", "")
                })
        print(f" ✓ Found {len(results['web'])} web results")
    except Exception as e:
        print(f" ✗ Web search failed: {e}")

    # 1.3 Arxiv — academic papers.
    try:
        print(" → Searching Arxiv...")
        arxiv_docs = ArxivLoader(query=query, load_max_docs=5).load()
        for doc in arxiv_docs:
            results["academic"].append({
                "source": doc.metadata.get("source", "Arxiv"),
                "content": doc.page_content[:2000],
                "full_content": doc.page_content
            })
        print(f" ✓ Found {len(results['academic'])} academic papers")
    except Exception as e:
        print(f" ✗ Arxiv search failed: {e}")

    # === Phase 2: deduplication ===
    print("\n🧹 Phase 2: Deduplication...")
    unique_results = deduplicate_results(results)
    print(f" → Removed duplicates, kept {sum(len(v) for v in unique_results.values())} unique results")

    # === Phase 3: formatted output ===
    print("\n📝 Phase 3: Formatting report...\n")
    report = format_research_report(query, unique_results)

    return report
def deduplicate_results(results: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
    """Drop near-duplicate entries across all result categories.

    Each entry is fingerprinted by a simple hash:
    - take the first 100 characters of its "content"
    - lower-case and strip them
    - hash with MD5
    Only the first entry carrying a given fingerprint is kept; the seen-set
    is shared across categories, so a web copy of a Wikipedia snippet is
    removed too.
    """
    fingerprints_seen = set()
    deduped: Dict[str, List[Dict]] = {
        "wikipedia": [],
        "web": [],
        "academic": []
    }

    for category, entries in results.items():
        for entry in entries:
            sample = entry["content"][:100].lower().strip()
            fingerprint = hashlib.md5(sample.encode()).hexdigest()
            if fingerprint in fingerprints_seen:
                continue
            fingerprints_seen.add(fingerprint)
            deduped[category].append(entry)

    return deduped
def _append_findings(lines: List[str], heading: str, items: List[Dict]) -> None:
    """Append one findings section (heading + numbered entries) to *lines*.

    Does nothing when *items* is empty, so empty sections are omitted
    from the report entirely.
    """
    if not items:
        return
    lines.append(heading)
    lines.append("-" * 70)
    for i, item in enumerate(items, 1):
        lines.append(f"\n[{i}] Source: {item['source']}")
        lines.append(f"Content: {item['content']}...")
    lines.append("")


def format_research_report(query: str, results: Dict[str, List[Dict]]) -> str:
    """Format the research results as a plain-text report.

    Report structure:
    1. Overview (source counts)
    2. Wikipedia findings
    3. Web findings
    4. Academic findings
    5. Numbered list of all sources

    Args:
        query: The research question, echoed in the report title.
        results: Per-category result entries, each a dict with at least
            "source" and "content" keys.

    Returns:
        The full report as a single newline-joined string.
    """
    report_lines: List[str] = []

    # === Title ===
    report_lines.append(f"DEEP RESEARCH REPORT: {query}")
    report_lines.append("=" * 70)
    report_lines.append("")

    # === Overview ===
    total_sources = sum(len(v) for v in results.values())
    report_lines.append("📊 OVERVIEW")
    report_lines.append(f"Total sources found: {total_sources}")
    report_lines.append(f" - Wikipedia: {len(results['wikipedia'])} articles")
    report_lines.append(f" - Web: {len(results['web'])} pages")
    report_lines.append(f" - Academic: {len(results['academic'])} papers")
    report_lines.append("")

    # === Findings (the three sections share one formatting helper) ===
    _append_findings(report_lines, "📚 WIKIPEDIA FINDINGS", results["wikipedia"])
    _append_findings(report_lines, "🌐 WEB FINDINGS", results["web"])
    _append_findings(report_lines, "🎓 ACADEMIC FINDINGS", results["academic"])

    # === Source list (numbered continuously across categories) ===
    report_lines.append("📋 ALL SOURCES")
    report_lines.append("-" * 70)
    source_count = 1
    for items in results.values():
        for item in items:
            report_lines.append(f"[{source_count}] {item['source']}")
            source_count += 1

    report_lines.append("")
    report_lines.append("=" * 70)
    report_lines.append("End of Deep Research Report")

    return "\n".join(report_lines)
# ============================================================================
# Advanced version: with simple information synthesis
# ============================================================================
def deep_research_with_synthesis(query: str) -> str:
    """Advanced deep research with AI-powered synthesis.

    This version not only collects information but also:
    1. Identifies common themes across sources
    2. Highlights contradictions
    3. Provides a synthesized summary

    Args:
        query: The research question

    Returns:
        Research report with synthesis
    """
    # Run the basic deep-research pipeline first.
    report = deep_research.invoke({"query": query})

    # TODO: wire in LLM-driven synthesis later — e.g. have an LLM compare
    # all collected sources to surface agreements and contradictions.
    synthesis_note = """
🤖 SYNTHESIS (To be implemented):
This section will use an LLM to:
- Identify key themes across all sources
- Highlight agreements and contradictions
- Provide confidence scores for different claims
- Suggest follow-up questions
For now, please manually review the findings above.
"""

    return report + synthesis_note
# ============================================================================
# Test code
# ============================================================================
if __name__ == "__main__":
    # Smoke-test the deep research tool with a sample question.
    demo_query = "What is quantum computing?"

    banner = "=" * 70
    print("\n" + banner)
    print("Testing Deep Research Tool")
    print(banner + "\n")

    report = deep_research.invoke({"query": demo_query})
    print(report)