File size: 3,342 Bytes
7072a6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import json

import chromadb

from firecrawl import FirecrawlApp
from dotenv import load_dotenv
from llama_index.core import Document
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding


# Load variables from a .env file (if present) into the environment before
# the lookups below read them.
load_dotenv()
# Indexing os.environ (rather than .get) raises KeyError at import time when
# a key is missing, so a misconfigured environment fails immediately.
# NOTE(review): OPENAI_API_KEY is not referenced again in the visible code;
# presumably the OpenAI clients read it from the environment — confirm.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
FIRECRAWL_API_KEY = os.environ["FIRECRAWL_API_KEY"]

# Shared Firecrawl client used by crawl_websites().
app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)



# Sites to crawl. crawl_websites() reads "URL" (the address passed to the
# crawler) and "Limit" (maximum number of pages to scrape for that site); the
# whole dict is also saved next to the scraped data under "csv_data".
# NOTE(review): "infrormation" in the first Description is a typo in runtime
# data; left unchanged here because it is part of the stored output.
website_list = [
    {
        "Category": "BSA Website",
        "Description": "Scouts BSA website that contains all infrormation about scouting",
        "Name": "Scouting.org",
        "URL": "https://scouting.org",
        "Limit": 9999
    },
    {
        "Category": "BSA Blog",
        "Description": "Scouting magazine's blog for articles about scouting.",
        "Name": "Scouting Magazine",
        "URL": "https://blog.scoutingmagazine.org",
        "Limit": 1000
    }
]

# Example (not executed): build an index from a list of documents and query it.
# index = VectorStoreIndex.from_documents(documents)
# query_engine = index.as_query_engine()

def crawl_websites(websites, output_path='scout_information.json'):
    """Crawl each configured website with Firecrawl and save the results as JSON.

    Args:
        websites: Iterable of dicts describing the sites to crawl. Each dict
            is read for a 'URL' key (the address to crawl) and an optional
            'Limit' key (maximum number of pages to scrape for that site).
        output_path: Path of the JSON file to write. Defaults to
            'scout_information.json', the file create_vector_store() reads.

    Returns:
        A dict mapping every successfully crawled URL to
        ``{"scraped_data": <the response's "data" value>,
        "csv_data": <the site's config dict>}``. The same mapping is what is
        written to ``output_path``.
    """
    url_response = {}

    for website_dict in websites:
        url = website_dict.get('URL')
        if not url:
            # Without a URL there is nothing to crawl; skip the entry rather
            # than sending None to the API.
            print(f"Skipping entry without a URL: {website_dict}")
            continue

        print(f"Crawling: {url}")

        params = {'scrapeOptions': {'formats': ['markdown', 'html']}}
        limit = website_dict.get('Limit')
        if limit is not None:
            # Limit pages to scrape per site. Omitted when not configured so
            # the API applies its own default instead of receiving null.
            params['limit'] = limit

        try:
            response = app.crawl_url(url, params=params)
        except Exception as exc:
            # Deliberately broad: one failing site must not discard the
            # results already collected for the others.
            print(f"Failed to fetch {url} -> {exc}")
            continue

        # Store the scraped data and associated info in the response dict.
        url_response[url] = {
            "scraped_data": response.get("data"),
            "csv_data": website_dict
        }

    # Explicit encoding so the file is written the same way on every platform.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(url_response, json_file)

    return url_response


def _documents_from_crawl(data):
    """Build one Document per scraped page that has markdown content.

    ``data`` is the mapping written by crawl_websites():
    ``{site_url: {"scraped_data": ..., "csv_data": {...}}}``.
    Assumes "scraped_data" is a list of page dicts carrying a "markdown" key
    and a "metadata" dict with "sourceURL", as described for Firecrawl crawl
    results — TODO confirm against the installed SDK version.
    """
    documents = []
    for site_url, entry in data.items():
        site_info = entry.get("csv_data") or {}
        # "scraped_data" is null in the JSON when the response had no "data".
        for page in entry.get("scraped_data") or []:
            if not isinstance(page, dict):
                continue
            text = page.get("markdown")
            if not text:
                continue  # Nothing to embed for this page.
            page_meta = page.get("metadata") or {}
            metadata = {
                "site_url": site_url,
                "page_url": page_meta.get("sourceURL"),
                "site_name": site_info.get("Name"),
                "category": site_info.get("Category"),
            }
            # Drop missing values so only plain strings reach the vector store.
            metadata = {k: v for k, v in metadata.items() if v is not None}
            documents.append(Document(text=text, metadata=metadata))
    return documents


def create_vector_store():
    """Index the crawled pages from 'scout_information.json' into Chroma.

    Reads the JSON file written by crawl_websites(), turns every scraped page
    into a Document (iterating the loaded dict directly would only yield its
    keys, i.e. the site URLs), and builds a VectorStoreIndex backed by the
    persistent "scout_data" Chroma collection under ./data/scout_data.

    Returns:
        The VectorStoreIndex that was built.
    """
    with open('scout_information.json', encoding='utf-8') as file:
        data = json.load(file)

    documents = _documents_from_crawl(data)

    llm = OpenAI(model="gpt-4o-mini")
    embed_model = OpenAIEmbedding(model="text-embedding-3-large")
    text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=30)

    Settings.llm = llm
    Settings.embed_model = embed_model
    Settings.text_splitter = text_splitter

    chroma_client = chromadb.PersistentClient(path="./data/scout_data")
    # create_collection() raises when the collection already exists, which
    # made every run after the first fail. Note that re-running now adds to
    # the existing collection rather than replacing it.
    chroma_collection = chroma_client.get_or_create_collection("scout_data")

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    index = VectorStoreIndex.from_documents(
        documents,
        embed_model=embed_model,  # Reuse the instance configured above.
        storage_context=storage_context,
        show_progress=True
    )
    return index

if __name__ == '__main__':
    # Crawl first: create_vector_store() reads the JSON file that
    # crawl_websites() writes.
    crawl_websites(website_list)
    create_vector_store()