"""Ingest atomcamp web pages into the Qdrant knowledge-base collection.

Running this script loads the pages listed in ``WEB_PATHS``, splits them
into overlapping text chunks, embeds them with ``get_embedding()`` and
uploads them to the ``COLLECTION_NAME`` collection at ``QDRANT_URL``.

By default the chunks are ADDED to whatever the collection already holds,
so repeated runs store repeated content. Set the environment variable
``QDRANT_RECREATE_COLLECTION`` to ``1``, ``true`` or ``yes`` to have the
existing collection dropped and rebuilt from this run instead.
"""

import os

import bs4  # Make sure to 'pip install beautifulsoup4'
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore

from embeddings.embedding import get_embedding
from config.config import QDRANT_URL, QDRANT_API_KEY

COLLECTION_NAME = "atomcamp_knowledge_base"
BATCH_SIZE = 20

# Opt-in switch for a clean rebuild; anything other than 1/true/yes
# (case-insensitive) keeps the append-only behaviour.
RECREATE_COLLECTION = os.getenv(
    "QDRANT_RECREATE_COLLECTION", ""
).strip().lower() in {"1", "true", "yes"}

WEB_PATHS = [
    "https://atomcamp.com/",
    "https://www.atomcamp.com/course/",
    "https://www.atomcamp.com/about-us/",
    "https://www.atomcamp.com/events/",
    "https://www.atomcamp.com/blogs/",
    "https://www.atomcamp.com/news/",
    "https://www.atomcamp.com/webinars/",
    "https://www.atomcamp.com/publications/",
    "https://www.atomcamp.com/ai-solutions/",
]

# 1. Smarter loading: restrict parsing to the content-bearing tags below so
#    that site menus, headers and footers are not repeated in every chunk.
loader = WebBaseLoader(
    web_paths=WEB_PATHS,
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(["main", "article", "h1", "h2", "p"])
    ),
)
data = loader.load()

# 2. Split into clean chunks (1000 characters each, 150 shared with the
#    neighbouring chunk).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(data)

# Stop before touching Qdrant when the scrape produced nothing: otherwise the
# script would report success, and with RECREATE_COLLECTION set it would also
# replace the existing collection with an empty one.
if not docs:
    raise SystemExit(
        "No content was loaded from the configured pages; nothing to ingest."
    )

# 3. Safe upload to cloud
if RECREATE_COLLECTION:
    print(f"Cleaning old data and starting ingestion of {len(docs)} documents...")
else:
    # NOTE: without force_recreate, .from_documents ADDS to the collection.
    # To remove repetition left by previous runs, re-run with
    # QDRANT_RECREATE_COLLECTION=1 (or delete the collection in the Qdrant
    # dashboard first).
    print(
        f"Starting ingestion of {len(docs)} documents "
        "(appending to existing data)..."
    )

qdrant = QdrantVectorStore.from_documents(
    docs,
    get_embedding(),
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    collection_name=COLLECTION_NAME,
    batch_size=BATCH_SIZE,
    timeout=120,
    force_recreate=RECREATE_COLLECTION,
)

if RECREATE_COLLECTION:
    print("Ingestion complete! atomcamp knowledge base is clean and ready.")
else:
    print("Ingestion complete! atomcamp knowledge base is ready.")