Spaces:
Runtime error
Runtime error
| import os | |
| import bs4 # Make sure to 'pip install beautifulsoup4' | |
| from langchain_community.document_loaders import WebBaseLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_qdrant import QdrantVectorStore | |
| from embeddings.embedding import get_embedding | |
| from config.config import QDRANT_URL, QDRANT_API_KEY | |
# 1. Load the site pages. Parsing is restricted to <main>, <article>, <h1>,
#    <h2> and <p> tags; the intent is to keep repeated headers, footers and
#    menus out of every chunk.
#    NOTE(review): <h1>/<h2>/<p> match anywhere on a page, including inside
#    footers or menus - confirm this filter is as tight as intended.
loader = WebBaseLoader(
    web_paths=[
        "https://atomcamp.com/",
        "https://www.atomcamp.com/course/",
        "https://www.atomcamp.com/about-us/",
        "https://www.atomcamp.com/events/",
        "https://www.atomcamp.com/blogs/",
        "https://www.atomcamp.com/news/",
        "https://www.atomcamp.com/webinars/",
        "https://www.atomcamp.com/publications/",
        "https://www.atomcamp.com/ai-solutions/",
    ],
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(["main", "article", "h1", "h2", "p"]),
    },
)

# Fetch and parse every URL listed above.
data = loader.load()
# 2. Break the loaded pages into chunks (chunk_size=1000) that overlap their
#    neighbours (chunk_overlap=150).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=1000,
)
docs = text_splitter.split_documents(data)
# 3. Safe upload to Qdrant Cloud, sent in batches of BATCH_SIZE documents.
BATCH_SIZE = 20

# Stop before touching the collection when there is nothing to upload: the
# collection is recreated below, so an empty scrape would otherwise replace
# the existing knowledge base with an empty one.
if not docs:
    raise SystemExit(
        "No documents were produced from the configured pages; "
        "aborting ingestion."
    )

print(f"Cleaning old data and starting ingestion of {len(docs)} documents...")

# NOTE: by default from_documents only ADDS to an existing collection, so
# each re-run duplicated the whole site. force_recreate=True makes the store
# drop and recreate the collection first, so the "cleaning old data" and
# "clean and ready" messages are actually true and no manual deletion in the
# Qdrant dashboard is needed.
qdrant = QdrantVectorStore.from_documents(
    docs,
    get_embedding(),
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    collection_name="atomcamp_knowledge_base",
    batch_size=BATCH_SIZE,
    timeout=120,  # passed through to the Qdrant client
    force_recreate=True,
)
print("Ingestion complete! atomcamp knowledge base is clean and ready.")