File size: 3,816 Bytes
ee85a4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import re

import wikipedia
from langchain_core.tools import tool


@tool
def wikipedia_tool(
    title: str, action: str = "summary", section_name: str | None = None, sentences: int = 3
) -> str | list[str]:
    """
    Retrieve information from Wikipedia pages with flexible content extraction.

    This tool provides four main operations for Wikipedia content:
    - Extract summaries of varying lengths
    - Retrieve complete page content including all sections
    - List all section titles to understand page structure
    - Extract specific sections by name with fuzzy matching

    Sections are detected from Wikipedia's plain-text markup
    (== Section ==, === Subsection ===, etc.). Every header, at any depth, starts
    a new section, so the text returned for a section stops at the next header,
    even when that header is one of its own subsections. Query a subsection by
    its own title to read it.

    Section lookup is case-insensitive. An exact title match is preferred; if
    there is none, the first section whose title contains ``section_name`` is
    used.

    Args:
        title: Wikipedia page title (supports auto-suggestion for typos)
        action: Operation type - "summary", "full", "sections", or "section"
        section_name: Name of specific section (required when action="section")
        sentences: Number of sentences for summary (default: 3)

    Returns:
        - "summary": Summary text (str)
        - "full": Full page content including all sections (str)
        - "sections": List of all section titles (List[str]); a page without
          any header yields the single placeholder title "Full Content"
        - "section": Content of the matching section (str). If no section
          matches, a message listing the available section titles instead.

    Raises:
        ValueError: If ``action`` is not one of the supported values, or if
            action="section" and ``section_name`` is missing or blank.

    Examples:
        wikipedia_tool("Python programming language", "summary")
        wikipedia_tool("Albert Einstein", "full")
        wikipedia_tool("Climate change", "sections")
        wikipedia_tool("Machine learning", "section", section_name="History")
    """

    def parse_sections(content: str) -> list[tuple[str, str]]:
        """Split page text into (title, body) pairs using the == header markers."""
        # The backreference \1 forces the closing run of "=" to be as long as the
        # opening one, so "== A ==" and "=== B ===" are both recognised as headers.
        section_pattern = r"^(={2,})\s*([^=]+?)\s*\1\s*$"
        matches = list(re.finditer(section_pattern, content, re.MULTILINE))

        if not matches:
            return [("Full Content", content.strip())]

        # Each body runs from the end of its header to the start of the next
        # header (of any level), or to the end of the text for the last one.
        body_ends = [m.start() for m in matches[1:]] + [len(content)]
        return [
            (m.group(2).strip(), content[m.end() : body_end].strip())
            for m, body_end in zip(matches, body_ends)
        ]

    # Validate arguments first so bad input fails before any global
    # configuration change or network request happens.
    if action not in ("summary", "full", "sections", "section"):
        raise ValueError(f"Invalid action: {action}. Must be one of: summary, full, sections, section")

    if action == "section" and (not section_name or not section_name.strip()):
        raise ValueError("section_name is required when action='section'")

    # Configure wikipedia settings
    wikipedia.set_lang("en")
    wikipedia.set_rate_limiting(True)

    if action == "summary":
        return wikipedia.summary(title, sentences=sentences, auto_suggest=True)

    page = wikipedia.page(title, auto_suggest=True)

    if action == "full":
        return page.content

    sections = parse_sections(page.content)
    section_titles = [section_title for section_title, _ in sections]

    if action == "sections":
        return section_titles

    # action == "section": an exact (case-insensitive) title match wins so that
    # asking for "History" is not answered with "Prehistory" or "Early history";
    # otherwise fall back to the first title that contains the query.
    wanted = section_name.strip().casefold()
    partial_match: str | None = None
    for section_title, section_content in sections:
        folded_title = section_title.casefold()
        if folded_title == wanted:
            return section_content
        if partial_match is None and wanted in folded_title:
            partial_match = section_content

    # Compare against None explicitly: a matched section may legitimately have
    # an empty body (e.g. a header followed directly by a subsection header).
    if partial_match is not None:
        return partial_match

    # Nothing matched: tell the caller which section titles exist.
    return "No matching section found. Available sections: " + ", ".join(section_titles)