File size: 2,433 Bytes
c8440e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import nest_asyncio
nest_asyncio.apply()

from scrapegraphai.graphs import SmartScraperMultiGraph
from scrapegraphai.nodes import FetchNode, ParseNode
from langchain.schema import Document

# Custom graph class whose run() returns the combined raw text of its sources
class CustomSmartScraperMultiGraph(SmartScraperMultiGraph):
    """Multi-source graph whose ``run`` returns combined raw text.

    ``run`` is overridden to fetch and parse the first HTTP(S) URL found in
    ``self.source`` and to concatenate that text with the ``page_content`` of
    every ``Document`` in ``self.source``.
    """

    def run(self):
        """Return the Document text followed by the first parsed web chunk.

        Returns:
            str: The ``page_content`` of every ``Document`` in
            ``self.source`` (in source order), followed by the first chunk
            produced by ``ParseNode`` for the first source string that
            starts with ``"http"``. When no such URL is present, or parsing
            yields no chunks, only the Document text is returned.
        """
        # Stays empty when no URL source exists, so the combination step
        # below never touches an unbound name.
        parsed_chunks = []

        for source in self.source:
            if not (isinstance(source, str) and source.startswith("http")):
                continue

            fetch_node = FetchNode(
                input="url | local_dir",
                output=["doc", "link_urls", "img_urls"],
                node_config={
                    "verbose": True,
                    "headless": True,
                },
            )
            fetched_state = fetch_node.execute({"url": source})

            parse_node = ParseNode(
                input="doc",
                output=["parsed_doc"],
                node_config={
                    "chunk_size": 4096,
                    "verbose": True,
                },
            )
            parsed_state = parse_node.execute({"doc": fetched_state["doc"]})
            parsed_chunks = parsed_state["parsed_doc"]

            break  # Only the first URL is fetched.

        # Document text first, then the web text, matching the original order.
        parts = [
            source.page_content
            for source in self.source
            if isinstance(source, Document)
        ]
        # Only the first parsed chunk is used; skip it when nothing was parsed.
        if parsed_chunks:
            parts.append(parsed_chunks[0])

        return "".join(parts)


def get_data(pdf_doc, web_url, openai_key):
    """Build the custom graph for a web URL plus local content and run it.

    Args:
        pdf_doc: Content stored as the ``page_content`` of a ``Document``
            source tagged with ``{"source": "local_content"}``.
        web_url: Web address passed to the graph as its first source.
        openai_key: API key placed in the graph's ``llm`` configuration.

    Returns:
        Whatever ``CustomSmartScraperMultiGraph.run()`` returns for the two
        sources.
    """
    llm_settings = {"api_key": openai_key, "model": "gpt-4o"}
    settings = {"llm": llm_settings, "verbose": True}

    local_document = Document(
        page_content=pdf_doc,
        metadata={"source": "local_content"},
    )

    # The URL comes first, followed by the wrapped local content.
    scraper = CustomSmartScraperMultiGraph(
        prompt="give an indepth analysis",
        source=[web_url, local_document],
        config=settings,
    )

    # The result is handed back to the caller; nothing is printed here.
    return scraper.run()