Spaces:

Humbleguava
/

GAIA_Agent_DeepResearch

Sleeping

File size: 8,578 Bytes

176a845

"""
Deep Research Tool - 教学版本
基于现有工具实现多源信息整合的深度研究功能

设计思路：
1. 不添加新的外部依赖
2. 组合现有的 wiki_search, web_search, arxiv_search
3. 实现智能去重和结果整合
4. 提供清晰的来源追踪
"""

from langchain_core.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
from typing import Dict, List
import hashlib


# Maximum number of characters of each document that is shown in the report.
_SNIPPET_CHARS = 2000


def _make_entry(source, text) -> Dict:
    """Build one result record; missing/None text becomes an empty string."""
    text = "" if text is None else str(text)
    return {
        "source": source,
        "content": text[:_SNIPPET_CHARS],  # truncated snippet for the report
        "full_content": text,
    }


# NOTE: the docstring below is kept verbatim on purpose: the @tool decorator
# exposes it as the tool description, so rewording it changes what the agent
# is told about this tool.
@tool
def deep_research(query: str) -> str:
    """Perform comprehensive multi-source research on a topic.

    This tool combines Wikipedia, Web Search, and Academic Papers to provide
    a thorough analysis from multiple perspectives.

    Use this when:
    - You need to verify information from multiple sources
    - The question requires both general knowledge and current information
    - You want academic backing for factual claims

    Args:
        query: The research question or topic

    Returns:
        Structured research report with information from multiple sources
    """

    print(f"\n{'='*60}")
    print(f"🔍 Deep Research: {query}")
    print(f"{'='*60}\n")

    results = {
        "wikipedia": [],
        "web": [],
        "academic": []
    }

    # === Phase 1: query every source, one after another ===
    # Each source has its own try/except so that one failing backend is
    # reported and skipped instead of aborting the whole research run.
    print("📚 Phase 1: Multi-source search...")

    # 1.1 Wikipedia - background knowledge
    try:
        print("  → Searching Wikipedia...")
        wiki_docs = WikipediaLoader(query=query, load_max_docs=2).load()
        for doc in wiki_docs:
            results["wikipedia"].append(
                _make_entry(doc.metadata.get("source", "Wikipedia"), doc.page_content)
            )
        print(f"    ✓ Found {len(results['wikipedia'])} Wikipedia articles")
    except Exception as e:
        print(f"    ✗ Wikipedia search failed: {e}")

    # 1.2 Web search - up-to-date information
    try:
        print("  → Searching Web (Tavily)...")
        tavily_tool = TavilySearchResults(max_results=10)
        web_docs = tavily_tool.invoke({"query": query})
        if isinstance(web_docs, str):
            # A plain string would otherwise be iterated character by
            # character and silently reported as "Found 0 web results".
            # NOTE(review): presumably the tool returns an error message as a
            # string on failure - confirm against the installed version.
            print(f"    ✗ Web search failed: {web_docs}")
        else:
            for doc in web_docs:
                # Only dict-shaped hits are usable; anything else is skipped.
                if isinstance(doc, dict):
                    # `or` also covers keys that are present but None, which
                    # previously raised on slicing and aborted the web phase.
                    results["web"].append(
                        _make_entry(doc.get("url") or "Web", doc.get("content"))
                    )
            print(f"    ✓ Found {len(results['web'])} web results")
    except Exception as e:
        print(f"    ✗ Web search failed: {e}")

    # 1.3 Arxiv - academic papers
    try:
        print("  → Searching Arxiv...")
        arxiv_docs = ArxivLoader(query=query, load_max_docs=5).load()
        for doc in arxiv_docs:
            results["academic"].append(
                _make_entry(doc.metadata.get("source", "Arxiv"), doc.page_content)
            )
        print(f"    ✓ Found {len(results['academic'])} academic papers")
    except Exception as e:
        print(f"    ✗ Arxiv search failed: {e}")

    # === Phase 2: deduplication ===
    print("\n🧹 Phase 2: Deduplication...")
    unique_results = deduplicate_results(results)
    print(f"  → Removed duplicates, kept {sum(len(v) for v in unique_results.values())} unique results")

    # === Phase 3: format the output ===
    print("\n📝 Phase 3: Formatting report...\n")
    report = format_research_report(query, unique_results)

    return report


def deduplicate_results(
    results: Dict[str, List[Dict]], sample_chars: int = 100
) -> Dict[str, List[Dict]]:
    """Drop results whose leading text was already seen in an earlier result.

    Two results count as duplicates when the first ``sample_chars`` characters
    of their ``content`` are equal after collapsing whitespace and lowercasing.
    The comparison runs across all categories, and the first occurrence (in
    iteration order of ``results``) is the one that is kept.

    Results without any content cannot be compared by text, so they are
    deduplicated by their ``source`` instead of all being collapsed into one.

    Args:
        results: Mapping of category name to a list of result dicts.
        sample_chars: Length of the content prefix used as the fingerprint.

    Returns:
        A new mapping with the same categories (the three standard ones are
        always present) holding only the first occurrence of each result.
    """
    unique_results: Dict[str, List[Dict]] = {
        "wikipedia": [],
        "web": [],
        "academic": []
    }
    prefix_len = max(sample_chars, 0)  # a negative slice bound would cut from the end
    seen = set()

    for category, items in results.items():
        # setdefault keeps categories beyond the three standard ones instead
        # of failing with a KeyError.
        kept = unique_results.setdefault(category, [])
        for item in items:
            raw = item.get("content")
            text = "" if raw is None else str(raw)
            # Normalise whitespace before slicing so that indentation or line
            # breaks do not change the fingerprint. The short sample itself is
            # the set key, so no digest is needed.
            sample = " ".join(text.split())[:prefix_len].lower()
            if sample:
                key = ("content", sample)
            else:
                key = ("source", item.get("source"))

            if key in seen:
                continue
            seen.add(key)
            kept.append(item)

    return unique_results


def format_research_report(query: str, results: Dict[str, List[Dict]]) -> str:
    """Render the collected results as a plain-text research report.

    Report layout:
    1. Title and overview with per-category counts
    2. Wikipedia findings
    3. Web findings
    4. Academic findings
    5. Flat list of all sources

    A findings section is omitted when its category is empty or missing.
    The trailing "..." after a snippet is only added when the item's
    ``full_content`` is longer than its ``content``, i.e. when the snippet
    really was truncated.

    Args:
        query: The research question, shown in the report title.
        results: Mapping of category name to a list of result dicts with
            ``source`` and ``content`` (and optionally ``full_content``).

    Returns:
        The report as a single newline-joined string.
    """
    # (results key, section heading, overview label, overview noun)
    sections = (
        ("wikipedia", "📚 WIKIPEDIA FINDINGS", "Wikipedia", "articles"),
        ("web", "🌐 WEB FINDINGS", "Web", "pages"),
        ("academic", "🎓 ACADEMIC FINDINGS", "Academic", "papers"),
    )
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    # === Title ===
    report_lines = [f"DEEP RESEARCH REPORT: {query}", heavy_rule, ""]

    # === Overview ===
    total_sources = sum(len(items) for items in results.values())
    report_lines.append("📊 OVERVIEW")
    report_lines.append(f"Total sources found: {total_sources}")
    for key, _heading, label, noun in sections:
        # .get() so that a category missing from `results` counts as empty.
        report_lines.append(f"  - {label}: {len(results.get(key) or [])} {noun}")
    report_lines.append("")

    # === Findings, one section per non-empty category ===
    for key, heading, _label, _noun in sections:
        items = results.get(key)
        if not items:
            continue
        report_lines.append(heading)
        report_lines.append(light_rule)
        for i, item in enumerate(items, 1):
            content = item.get("content") or ""
            full_content = item.get("full_content") or ""
            # Only signal truncation when text was actually cut off.
            ellipsis = "..." if len(full_content) > len(content) else ""
            report_lines.append(f"\n[{i}] Source: {item['source']}")
            report_lines.append(f"Content: {content}{ellipsis}")
            report_lines.append("")

    # === Source list (every category present in `results`) ===
    report_lines.append("📋 ALL SOURCES")
    report_lines.append(light_rule)
    every_item = (item for items in results.values() for item in items)
    for source_count, item in enumerate(every_item, 1):
        report_lines.append(f"[{source_count}] {item['source']}")

    report_lines.append("")
    report_lines.append(heavy_rule)
    report_lines.append("End of Deep Research Report")

    return "\n".join(report_lines)


# ============================================================================
# Advanced version: adds a simple information-synthesis step
# ============================================================================

# NOTE: the @tool decorator exposes the docstring as the tool description, so
# it must describe what the tool does today rather than the planned features;
# otherwise an agent would select this tool expecting a synthesis it will not
# receive.
@tool
def deep_research_with_synthesis(query: str) -> str:
    """Run deep research and append a placeholder synthesis section.

    The synthesis step is not implemented yet. This tool returns the same
    multi-source report as deep_research, followed by a static note that
    describes the planned LLM-based analysis (common themes, contradictions,
    confidence scores, follow-up questions). The findings still have to be
    reviewed manually.

    Args:
        query: The research question

    Returns:
        The deep research report followed by the synthesis placeholder note
    """

    # Run the basic deep research first.
    basic_report = deep_research.invoke({"query": query})

    # TODO: add an LLM-driven synthesis step, e.g. have an LLM analyse all
    # sources and point out where they agree and where they differ.

    synthesis_note = """

🤖 SYNTHESIS (To be implemented):
This section will use an LLM to:
- Identify key themes across all sources
- Highlight agreements and contradictions
- Provide confidence scores for different claims
- Suggest follow-up questions

For now, please manually review the findings above.
"""

    return basic_report + synthesis_note


# ============================================================================
# Test code
# ============================================================================

if __name__ == "__main__":
    # Manual smoke test: run one sample query through the tool and print the
    # resulting report.
    banner = "=" * 70
    sample_query = "What is quantum computing?"

    print(f"\n{banner}")
    print("Testing Deep Research Tool")
    print(f"{banner}\n")

    print(deep_research.invoke({"query": sample_query}))