# NOTE(review): lines above the code were web-scrape residue (HF Spaces UI text,
# file size "2,433 Bytes", commit c8440e8, and a fused line-number gutter).
# Preserved here as a comment so the file parses as Python.
import nest_asyncio
nest_asyncio.apply()
from scrapegraphai.graphs import SmartScraperMultiGraph
from scrapegraphai.nodes import FetchNode, ParseNode
from langchain.schema import Document
class CustomSmartScraperMultiGraph(SmartScraperMultiGraph):
    """SmartScraperMultiGraph variant that merges fetched-URL content with
    in-memory ``Document`` sources into one combined text string.

    ``self.source`` is expected to hold a mix of URL strings and
    ``langchain.schema.Document`` objects (see ``get_data`` below).
    """

    def run(self):
        """Fetch and parse the first URL source, then concatenate every
        Document source's ``page_content`` plus the first parsed chunk.

        Returns:
            str: combined text. If no URL source is present, only the
            Document contents are returned (original code raised NameError
            in that case because ``parsed_doc`` was never assigned).
        """
        parsed_doc = None  # guard: stays None when self.source has no URL
        for source in self.source:
            if isinstance(source, str) and source.startswith("http"):
                fetch_node = FetchNode(
                    input="url | local_dir",
                    output=["doc", "link_urls", "img_urls"],
                    node_config={
                        "verbose": True,
                        "headless": True,
                    },
                )
                url_data = fetch_node.execute({"url": source})
                parse_node = ParseNode(
                    input="doc",
                    output=["parsed_doc"],
                    node_config={
                        "chunk_size": 4096,
                        "verbose": True,
                    },
                )
                parsed_doc = parse_node.execute({"doc": url_data["doc"]})
                break  # assuming only one URL needs to be fetched

        # Combine URL data with Document data.
        combined_data = ""
        for source in self.source:
            if isinstance(source, Document):
                combined_data += source.page_content
        if parsed_doc is not None:
            # First chunk of the parsed URL document.
            # NOTE(review): assumes parsed_doc["parsed_doc"][0] is str-like —
            # confirm against ParseNode's output schema.
            combined_data += parsed_doc["parsed_doc"][0]
        return combined_data
def get_data(pdf_doc, web_url, openai_key):
    """Run the custom multi-source scraper over a web URL plus local text.

    Args:
        pdf_doc: extracted text of a local document (stored as a
            ``Document`` with ``source: local_content`` metadata).
        web_url: URL string to fetch and parse.
        openai_key: OpenAI API key for the graph's LLM config.

    Returns:
        The combined text produced by ``CustomSmartScraperMultiGraph.run``.
    """
    graph_config = {
        "llm": {
            "api_key": openai_key,
            "model": "gpt-4o",
        },
        "verbose": True,
    }
    sources = [
        web_url,
        Document(page_content=pdf_doc, metadata={"source": "local_content"}),
    ]
    prompt = "give an indepth analysis"

    # Instantiate the custom graph and run it.
    # (A stray "|" scrape artifact after the original return was removed.)
    multiple_search_graph = CustomSmartScraperMultiGraph(
        prompt=prompt,
        source=sources,
        config=graph_config,
    )
    return multiple_search_graph.run()