# Spaces: Sleeping — Hugging Face Space status header left over from the page scrape.
# Allow nested asyncio event loops: scrapegraphai's fetch nodes start their
# own loop, which fails inside environments (Jupyter/Streamlit) that already
# run one unless nest_asyncio patches the running loop first.
import nest_asyncio

nest_asyncio.apply()

from langchain.schema import Document
from scrapegraphai.graphs import SmartScraperMultiGraph
from scrapegraphai.nodes import FetchNode, ParseNode
# Custom graph that bypasses the stock multi-source pipeline: it fetches and
# parses the first URL source itself, then concatenates the result with the
# page content of any locally supplied Document sources.
class CustomSmartScraperMultiGraph(SmartScraperMultiGraph):
    """SmartScraperMultiGraph variant whose ``run`` combines web and local text.

    ``run`` fetches the first source that looks like a URL, splits it into
    chunks with ``ParseNode``, and returns the concatenation of every
    ``Document`` source's ``page_content`` followed by the first parsed chunk
    of the fetched page.
    """

    def run(self):
        """Fetch/parse the first URL source and merge it with Document sources.

        Returns:
            str: combined text. Sources of a kind that is absent simply
            contribute nothing (empty string).
        """
        # Fetch and parse the first URL-like source, if any.
        parsed_doc = None  # stays None when no URL source is present
        for source in self.source:
            if isinstance(source, str) and source.startswith("http"):
                fetch_node = FetchNode(
                    input="url | local_dir",
                    output=["doc", "link_urls", "img_urls"],
                    node_config={
                        "verbose": True,
                        "headless": True,
                    },
                )
                url_data = fetch_node.execute({"url": source})
                parse_node = ParseNode(
                    input="doc",
                    output=["parsed_doc"],
                    node_config={
                        "chunk_size": 4096,
                        "verbose": True,
                    },
                )
                parsed_doc = parse_node.execute({"doc": url_data["doc"]})
                break  # only the first URL source is fetched
        # Combine Document page content with the parsed URL content.
        combined_data = ""
        for source in self.source:
            if isinstance(source, Document):
                combined_data += source.page_content
        # BUG FIX: the original read ``parsed_doc`` unconditionally, raising
        # NameError whenever the sources contained no URL.
        if parsed_doc is not None:
            combined_data += parsed_doc["parsed_doc"][0]
        return combined_data
def get_data(pdf_doc, web_url, openai_key):
    """Run the custom scraper graph over a web page plus local PDF text.

    Args:
        pdf_doc: extracted text of the local PDF, wrapped as a Document.
        web_url: URL to fetch and parse alongside the PDF text.
        openai_key: OpenAI API key placed in the graph's LLM config.

    Returns:
        The combined text produced by ``CustomSmartScraperMultiGraph.run``.
    """
    llm_settings = {
        "llm": {
            "api_key": openai_key,
            "model": "gpt-4o",
        },
        "verbose": True,
    }
    scrape_sources = [
        web_url,
        Document(page_content=pdf_doc, metadata={"source": "local_content"}),
    ]
    # Build and execute the custom graph over both sources.
    analysis_graph = CustomSmartScraperMultiGraph(
        prompt="give an indepth analysis",
        source=scrape_sources,
        config=llm_settings,
    )
    return analysis_graph.run()