File size: 3,069 Bytes
ddd81fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os

import requests
from camel.toolkits import BaseToolkit


class JinaBrowsingToolkit(BaseToolkit):
    r"""Toolkit that reads web pages through the r.jina.ai service.

    Request failures are not raised; they are reported as a returned string
    that starts with ``_FETCH_ERROR_PREFIX``.

    Environment variables read on every fetch:
        JINA_PROXY_URL: If set, sent as the ``X-Proxy-Url`` request header.
        JINA_AUTH_TOKEN: If set, sent as a ``Bearer`` authorization token.
    """

    # Seconds to wait for r.jina.ai before giving up. Without a timeout
    # `requests.get` may block indefinitely on an unresponsive server.
    _REQUEST_TIMEOUT_SECONDS = 60

    # Leading text of the message returned when a fetch fails. Shared so the
    # code that produces the message and the code that detects it cannot
    # drift apart.
    _FETCH_ERROR_PREFIX = "Error fetching URL content"

    def get_url_content(self, url: str) -> str:
        r"""Fetch the content of a URL using the r.jina.ai service.

        The page is always requested over HTTPS: a leading ``http://`` or
        ``https://`` scheme (in any letter case) is replaced by
        ``https://``, and a URL without one of those schemes gets
        ``https://`` prepended.

        Args:
            url (str): The URL to fetch content from.

        Returns:
            str: The markdown content of the URL, or a message starting
                with "Error fetching URL content" if the request fails.
        """
        # Force the https scheme. Only an exact scheme prefix is removed:
        # `str.lstrip` takes a *set of characters*, not a prefix, so using
        # it here would also eat leading host characters (for example
        # "http://stackoverflow.com" would become "ackoverflow.com").
        url = url.strip()
        lowered_url = url.lower()
        for scheme in ("https://", "http://"):
            if lowered_url.startswith(scheme):
                url = url[len(scheme):]
                break
        url = "https://" + url

        jina_url = f"https://r.jina.ai/{url}"

        headers = {}
        proxy_url = os.environ.get('JINA_PROXY_URL')
        if proxy_url:
            headers['X-Proxy-Url'] = proxy_url
        auth_token = os.environ.get('JINA_AUTH_TOKEN')
        if auth_token:
            headers['Authorization'] = f'Bearer {auth_token}'

        try:
            response = requests.get(
                jina_url,
                headers=headers,
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            # Timeouts, connection errors and HTTP error statuses all end up
            # here and are reported to the caller as text.
            return f"{self._FETCH_ERROR_PREFIX}: {e!s}"

    def get_url_content_with_context(
        self,
        url: str,
        search_string: str,
        context_chars: int = 700,
        max_instances: int = 3,
    ) -> str:
        r"""Fetch the content of a URL and return context around all
        instances of a specific string.

        The search is case-insensitive and matches do not overlap.

        Args:
            url (str): The URL to fetch content from.
            search_string (str): The string to search for in the content.
                Must not be empty.
            context_chars (int): Number of characters to return before and
                after each found string. Negative values are treated as 0.
                (default: :obj:`700`)
            max_instances (int): Maximum number of instances to return.
                (default: :obj:`3`)

        Returns:
            str: The context around all found instances of the string, or
                an error message if the string is empty, the page cannot be
                fetched, or the string is not found.

        If there are no results, try again with a more likely search string.
        Start with a more likely string and only use a less likely string if
        the first one has too many results.
        """
        # An empty string matches at every offset, which would only repeat
        # the start of the page; reject it before making a network request.
        if not search_string:
            return "Search string must not be empty."

        content = self.get_url_content(url)
        if content.startswith(self._FETCH_ERROR_PREFIX):
            return content

        context_chars = max(0, context_chars)

        # Lower-case both sides once instead of on every loop iteration.
        # NOTE: `str.lower` can change the length of a few non-ASCII
        # characters, in which case offsets found in `haystack` may be
        # slightly off when applied to `content`.
        haystack = content.lower()
        needle = search_string.lower()

        instances = []
        start = 0
        while len(instances) < max_instances:
            index = haystack.find(needle, start)
            if index == -1:
                break

            context_start = max(0, index - context_chars)
            context_end = min(
                len(content), index + len(search_string) + context_chars
            )
            instance_context = content[context_start:context_end]
            instances.append(
                f"Instance {len(instances) + 1}:\n{instance_context}\n"
            )

            # Resume after this match so that matches never overlap.
            start = index + len(needle)

        if not instances:
            return f"Search string '{search_string}' not found in the content."

        return (
            f"Found {len(instances)} instance(s) of '{search_string}':\n\n"
            + '\n'.join(instances)
        )