import json
import os
import time

import chromadb
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore
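
# Keys are loaded from a local .env file. OPENAI_API_KEY is read from the
# environment by the llama_index OpenAI integrations; FIRECRAWL_API_KEY is
# passed to the Firecrawl client explicitly.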
load_dotenv()

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
FIRECRAWL_API_KEY = os.environ["FIRECRAWL_API_KEY"]

app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
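
# Seed sites to crawl. "Limit" caps how many pages Firecrawl fetches per site.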
website_list = [
    {
        "Category": "BSA Website",
        "Description": "Scouts BSA website that contains all information about scouting",
        "Name": "Scouting.org",
        "URL": "https://scouting.org",
        "Limit": 9999
    },
    {
        "Category": "BSA Blog",
        "Description": "Scouting magazine's blog for articles about scouting.",
        "Name": "Scouting Magazine",
        "URL": "https://blog.scoutingmagazine.org",
        "Limit": 1000
    }
]


def crawl_websites(websites):
    """Crawl each seed site with Firecrawl and save the raw results, keyed by
    URL, to scout_information.json."""
    url_response = {}
    crawl_per_min = 3  # self-imposed cap on crawl requests per minute

    crawled_websites = 0
    scraped_pages = 0
    for i, website_dict in enumerate(websites):
        # Throttle: after every `crawl_per_min` crawls, pause for a minute so
        # the loop stays under the rate cap declared above.
        if i > 0 and i % crawl_per_min == 0:
            time.sleep(60)

        url = website_dict.get('URL')
        print(f"Crawling: {url}")

        try:
            response = app.crawl_url(
                url,
                params={
                    'limit': website_dict.get('Limit'),
                    'scrapeOptions': {'formats': ['markdown', 'html']}
                }
            )
            crawled_websites += 1
            scraped_pages += len(response.get("data") or [])
        except Exception as exc:
            print(f"Failed to fetch {url} -> {exc}")
            continue

        url_response[url] = {
            "scraped_data": response.get("data"),
            "csv_data": website_dict
        }

    print(f"Crawled {crawled_websites} sites ({scraped_pages} pages total)")
    with open('scout_information.json', 'w') as json_file:
        json.dump(url_response, json_file)
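
# Note: this script assumes firecrawl-py's crawl_url returns a dict whose
# "data" key holds the list of scraped pages; newer client versions may return
# a response object instead, in which case the .get("data") calls above would
# need adjusting.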


def create_vector_store():
    """Chunk and embed the crawled pages into a persistent Chroma collection."""
    with open('scout_information.json') as file:
        data = json.load(file)
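
    # `data` maps URL -> {"scraped_data": [pages], "csv_data": {...}}, so
    # iterating over it directly would only yield the URL keys. Build one
    # Document per scraped page from its markdown body instead (present
    # because the crawl requested the 'markdown' format).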
    documents = [
        Document(text=page["markdown"])
        for site in data.values()
        for page in (site.get("scraped_data") or [])
        if page.get("markdown")
    ]

    llm = OpenAI(model="gpt-4o-mini")
    embed_model = OpenAIEmbedding(model="text-embedding-3-large")
    text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=30)

    # Register defaults so the index build (and any later query engines) share
    # the same LLM, embedding model, and chunking strategy.
    Settings.llm = llm
    Settings.embed_model = embed_model
    Settings.text_splitter = text_splitter
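
    # Persist embeddings on disk. get_or_create_collection keeps re-runs from
    # failing, whereas create_collection raises if "scout_data" already exists.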
    chroma_client = chromadb.PersistentClient(path="./data/scout_data")
    chroma_collection = chroma_client.get_or_create_collection("scout_data")

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Settings already supplies the embedding model and splitter configured
    # above, so they are not passed again here.
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True
    )
    return index


if __name__ == '__main__':
    crawl_websites(website_list)
    create_vector_store()
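
# Sketch (illustrative, not executed here): to query the persisted collection
# later without re-embedding, rebuild the index from the existing vector
# store, assuming the same Settings as in create_vector_store():
#
#   client = chromadb.PersistentClient(path="./data/scout_data")
#   collection = client.get_or_create_collection("scout_data")
#   store = ChromaVectorStore(chroma_collection=collection)
#   index = VectorStoreIndex.from_vector_store(store)
#   print(index.as_query_engine().query("What are the ranks in Scouts BSA?"))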