# rag-chatbot/app/ingest.py
# Commit 6256536 (Al1Abdullah): Initial clean deploy without binary files
import os
import bs4 # Make sure to 'pip install beautifulsoup4'
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from embeddings.embedding import get_embedding
from config.config import QDRANT_URL, QDRANT_API_KEY
# 1. Load the site pages, keeping only content-bearing tags so that
# navigation menus, headers, and footers are mostly left out of the chunks.
SOURCE_URLS = [
    "https://atomcamp.com/",
    "https://www.atomcamp.com/course/",
    "https://www.atomcamp.com/about-us/",
    "https://www.atomcamp.com/events/",
    "https://www.atomcamp.com/blogs/",
    "https://www.atomcamp.com/news/",
    "https://www.atomcamp.com/webinars/",
    "https://www.atomcamp.com/publications/",
    "https://www.atomcamp.com/ai-solutions/",
]

# BeautifulSoup parses only these tags (and whatever is nested inside them).
content_filter = bs4.SoupStrainer(["main", "article", "h1", "h2", "p"])

loader = WebBaseLoader(
    web_paths=SOURCE_URLS,
    bs_kwargs={"parse_only": content_filter},
)
data = loader.load()
# 2. Break the loaded pages into overlapping chunks ready for embedding.
CHUNK_SIZE = 1000  # chunk_size handed to the splitter
CHUNK_OVERLAP = 150  # chunk_overlap handed to the splitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.split_documents(data)
# 3. Safe upload to the Qdrant cloud collection.
BATCH_SIZE = 20  # passed as batch_size to the vector-store upload

# Abort before touching the collection: recreating it with nothing to upload
# would leave the knowledge base empty.
if not docs:
    raise SystemExit(
        "No documents were produced from the source pages; aborting ingestion."
    )

print(f"Cleaning old data and starting ingestion of {len(docs)} documents...")

# NOTE: by default .from_documents only ADDS to an existing collection, so
# every re-run would duplicate the knowledge base. force_recreate=True makes
# the vector store rebuild the collection on each run, so the stored content
# always reflects exactly this scrape.
qdrant = QdrantVectorStore.from_documents(
    docs,
    get_embedding(),
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    collection_name="atomcamp_knowledge_base",
    batch_size=BATCH_SIZE,
    timeout=120,
    force_recreate=True,
)
print("Ingestion complete! atomcamp knowledge base is clean and ready.")