MindBot-v0 / ingest.py
Chirag20's picture
added knowledge
edabb92
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ebooklib import epub
from bs4 import BeautifulSoup
import pdfplumber
import logging
# Only let ERROR-and-above records through the "pdfminer" logger so lower
# severity messages are not emitted while PDFs are parsed.
# NOTE(review): presumably pdfplumber logs through pdfminer -- confirm.
logging.getLogger("pdfminer").setLevel(logging.ERROR)
from embed_store import get_embeddings, store_embeddings, get_qdrant_client
# --------------------------
# LOAD PDF
# --------------------------
def load_pdf(file_path):
    """Extract the text of a PDF as one document per page.

    Extraction is best-effort: a page whose text cannot be extracted is
    reported and skipped, and a PDF that cannot be opened at all yields
    whatever was collected so far (possibly nothing). Errors are printed,
    never raised.

    Args:
        file_path: Path to the ``.pdf`` file.

    Returns:
        A list of dicts with the keys ``content`` (page text), ``source``
        (the given path), ``book`` (the file's base name) and ``type``
        (always ``"book"``). Pages without any text are omitted.
    """
    docs = []
    try:
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)
            print(f" β†’ PDF has {total_pages} pages")
            for i, page in enumerate(pdf.pages):
                # Progress line every 20 pages keeps long books from looking hung.
                if i % 20 == 0:
                    print(f" Processing page {i+1}/{total_pages}")
                try:
                    text = page.extract_text()
                except Exception as e:
                    # One unreadable page must not discard the rest of the
                    # book (mirrors the per-section handling in load_epub).
                    print(f"❌ Error reading page {i+1} of {file_path}: {e}")
                    continue
                # Skip pages with no text at all or only whitespace.
                if text and text.strip():
                    docs.append({
                        "content": text,
                        "source": file_path,
                        "book": os.path.basename(file_path),
                        "type": "book",
                    })
    except Exception as e:
        print(f"❌ Error reading PDF {file_path}: {e}")
    print(f" β†’ Extracted {len(docs)} pages from PDF")
    return docs
# --------------------------
# LOAD EPUB
# --------------------------
def load_epub(file_path):
    """Extract the readable text of an EPUB as one document per section.

    Every document item of the book is parsed as HTML, stripped of
    ``<script>``/``<style>`` elements and flattened to plain text. Sections
    of 50 characters or fewer are dropped as junk. Extraction is
    best-effort: a section that cannot be processed is reported and
    skipped, and a book that cannot be opened yields an empty list. Errors
    are printed, never raised.

    Args:
        file_path: Path to the ``.epub`` file.

    Returns:
        A list of dicts with the keys ``content`` (section text),
        ``source`` (the given path), ``book`` (the file's base name) and
        ``type`` (always ``"book"``).
    """
    docs = []
    try:
        # The item-type constants are documented on the top-level ``ebooklib``
        # package. Looking ITEM_DOCUMENT up there (instead of on the ``epub``
        # submodule) avoids an AttributeError that the per-item handler below
        # would otherwise hit for every single item.
        import ebooklib

        book = epub.read_epub(file_path)
        for item in book.get_items():
            try:
                if item.get_type() != ebooklib.ITEM_DOCUMENT:
                    continue
                soup = BeautifulSoup(item.get_content(), "lxml")
                # Remove scripts/styles so only readable text remains.
                for tag in soup(["script", "style"]):
                    tag.decompose()
                text = soup.get_text(separator=" ", strip=True)
            except Exception as e:
                # Keep going, but say so: silently skipping here can hide a
                # systematic failure (e.g. a missing HTML parser).
                print(f" Skipped unreadable EPUB section in {file_path}: {e}")
                continue
            if len(text) > 50:  # filter junk
                docs.append({
                    "content": text,
                    "source": file_path,
                    "book": os.path.basename(file_path),
                    "type": "book",
                })
        print(f" β†’ Extracted {len(docs)} sections from EPUB")
    except Exception as e:
        print(f"❌ Failed EPUB {file_path}: {e}")
    return docs
# --------------------------
# LOAD ALL BOOKS
# --------------------------
def load_books(folder_path="knowledge"):
    """Load every supported book found directly inside ``folder_path``.

    ``.epub`` files are handed to ``load_epub`` and ``.pdf`` files to
    ``load_pdf``; the extension check ignores case, so ``Book.PDF`` is
    loaded too. Anything else is reported as skipped. Entries are
    processed in sorted name order so repeated runs ingest in the same
    order (``os.listdir`` makes no ordering promise).

    Args:
        folder_path: Directory containing the book files.

    Returns:
        The concatenated list of document dicts produced by the loaders.

    Raises:
        OSError: If ``folder_path`` cannot be listed (for example it does
            not exist); the listing call is not guarded.
    """
    all_docs = []
    files = sorted(os.listdir(folder_path))
    print(f"πŸ“š Found {len(files)} files in '{folder_path}'")
    for i, file in enumerate(files):
        full_path = os.path.join(folder_path, file)
        print(f"\nπŸ“– [{i+1}/{len(files)}] Loading: {file}")
        lowered = file.lower()
        if lowered.endswith(".epub"):
            docs = load_epub(full_path)
        elif lowered.endswith(".pdf"):
            docs = load_pdf(full_path)
        else:
            print(" β†’ Skipped (unsupported)")
            continue
        all_docs.extend(docs)
    print(f"\nβœ… Total extracted documents: {len(all_docs)}")
    return all_docs
# --------------------------
# CHUNKING
# --------------------------
def chunk_documents(documents, chunk_size=500, chunk_overlap=100, min_chunk_chars=50):
    """Split each document's text into overlapping chunks.

    The text is split with ``RecursiveCharacterTextSplitter``. When a
    document produces more than one chunk and the last one is shorter than
    ``min_chunk_chars``, that tail is appended (space-separated) to the
    previous chunk instead of becoming a tiny chunk of its own; a document
    whose only chunk is short keeps it unchanged.

    Args:
        documents: Sequence of dicts with the keys ``content``, ``source``,
            ``book`` and ``type``.
        chunk_size: ``chunk_size`` passed to the splitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
        min_chunk_chars: Trailing chunks shorter than this are merged into
            their predecessor.

    Returns:
        A list of dicts, one per chunk, carrying the chunk text under
        ``content`` plus the parent document's ``source``, ``book`` and
        ``type``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = []
    total = len(documents)
    print(f"Chunking {total} documents...")
    for position, doc in enumerate(documents, start=1):
        split_texts = splitter.split_text(doc["content"])
        # Fold a short leftover into the previous chunk so it is kept
        # without becoming a near-empty chunk of its own.
        if len(split_texts) > 1 and len(split_texts[-1]) < min_chunk_chars:
            tail = split_texts.pop()
            split_texts[-1] += " " + tail
        print(f"β†’ Processing doc {position}/{total} | chunks: {len(split_texts)}")
        chunks.extend(
            {
                "content": piece,
                "source": doc["source"],
                "book": doc["book"],
                "type": doc["type"],
            }
            for piece in split_texts
        )
    print(f"Total chunks created: {len(chunks)}")
    return chunks
# --------------------------
# MAIN INGEST FUNCTION
# --------------------------
def ingest_books(folder_path="knowledge", collection_name="psychology_books"):
    """Load, chunk and store the books of ``folder_path`` in Qdrant.

    The ingest is skipped when the target collection already reports at
    least one point. If the collection cannot be inspected (for example
    because it does not exist yet) the reason is printed and the ingest
    runs anyway.

    Args:
        folder_path: Directory containing the book files.
        collection_name: Name of the collection to check and to store the
            chunks in.

    Returns:
        None.
    """
    client = get_qdrant_client()
    # Skip if already ingested.
    try:
        info = client.get_collection(collection_name)
        # points_count may be unset; treat that as "no points".
        if (info.points_count or 0) > 0:
            print("Embeddings already exist. Skipping ingest.")
            return
    except Exception as e:
        # Best-effort check: still ingest, but do not hide why the check
        # failed (a missing collection and a dead connection look the same
        # when the error is swallowed).
        print(f"Could not inspect collection '{collection_name}' ({e}); running ingest.")
    docs = load_books(folder_path)
    chunks = chunk_documents(docs)
    embeddings = get_embeddings()
    store_embeddings(chunks, embeddings, collection_name)
    print(f"Ingested {len(chunks)} chunks from books.")
# Script entry point: ingest the default "knowledge" folder when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    ingest_books(folder_path="knowledge")