"""Load PDF/EPUB books from a folder, chunk their text, and store embeddings."""

import logging
import os

import ebooklib
import pdfplumber
from bs4 import BeautifulSoup
from ebooklib import epub
from langchain_text_splitters import RecursiveCharacterTextSplitter

from embed_store import get_embeddings, store_embeddings, get_qdrant_client

# Only surface ERROR-and-above records from the "pdfminer" logger.
logging.getLogger("pdfminer").setLevel(logging.ERROR)


def _make_doc(text, file_path):
    """Wrap extracted text in the document dict shape shared by all loaders."""
    return {
        "content": text,
        "source": file_path,
        "book": os.path.basename(file_path),
        "type": "book",
    }


# --------------------------
# LOAD PDF
# --------------------------
def load_pdf(file_path):
    """Extract text from a PDF, producing one document per page with text.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of document dicts with keys "content", "source", "book" and
        "type". Pages whose extracted text is empty are omitted. If reading
        fails part-way, the pages extracted so far are still returned.
    """
    docs = []
    try:
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)
            print(f" → PDF has {total_pages} pages")
            for i, page in enumerate(pdf.pages):
                # Progress line every 20 pages keeps large books from
                # flooding the console.
                if i % 20 == 0:
                    print(f" Processing page {i+1}/{total_pages}")
                text = page.extract_text()
                if text:
                    docs.append(_make_doc(text, file_path))
    except Exception as e:
        # Best-effort loader: one unreadable book must not abort the whole
        # ingest run, so report the problem and return what we have.
        print(f"❌ Error reading PDF {file_path}: {e}")
    print(f" → Extracted {len(docs)} pages from PDF")
    return docs


# --------------------------
# LOAD EPUB
# --------------------------
def load_epub(file_path):
    """Extract text from an EPUB, producing one document per content section.

    Args:
        file_path: Path of the EPUB file to read.

    Returns:
        A list of document dicts with keys "content", "source", "book" and
        "type". Sections with 50 characters of text or fewer are dropped as
        junk. Returns whatever was collected if the book cannot be fully read.
    """
    docs = []
    try:
        book = epub.read_epub(file_path)
        for item in book.get_items():
            try:
                # ITEM_DOCUMENT is defined on the top-level ``ebooklib``
                # package, not on ``ebooklib.epub``.
                if item.get_type() != ebooklib.ITEM_DOCUMENT:
                    continue
                soup = BeautifulSoup(item.get_content(), "lxml")
                # Remove scripts/styles so their source does not end up in
                # the extracted text.
                for tag in soup(["script", "style"]):
                    tag.decompose()
                text = soup.get_text(separator=" ", strip=True)
            except Exception as e:
                # Skip a single broken section but say so; silently
                # continuing would hide systematic failures.
                print(f" ⚠️ Skipped EPUB section in {file_path}: {e}")
                continue
            if len(text) > 50:  # filter junk
                docs.append(_make_doc(text, file_path))
        print(f" → Extracted {len(docs)} sections from EPUB")
    except Exception as e:
        print(f"❌ Failed EPUB {file_path}: {e}")
    return docs


# Maps a lower-cased file extension to the loader that handles it.
_LOADERS = {
    ".epub": load_epub,
    ".pdf": load_pdf,
}


# --------------------------
# LOAD ALL BOOKS
# --------------------------
def load_books(folder_path="knowledge"):
    """Load every supported book found directly inside ``folder_path``.

    Entries are processed in sorted name order so repeated runs are
    deterministic, and extensions are matched case-insensitively.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The concatenated list of document dicts from all loaded books.

    Raises:
        OSError: If ``folder_path`` cannot be listed (e.g. it does not exist).
    """
    all_docs = []
    files = sorted(os.listdir(folder_path))
    print(f"📚 Found {len(files)} files in '{folder_path}'")
    for position, name in enumerate(files, start=1):
        print(f"\n📖 [{position}/{len(files)}] Loading: {name}")
        loader = _LOADERS.get(os.path.splitext(name)[1].lower())
        if loader is None:
            print(" → Skipped (unsupported)")
            continue
        all_docs.extend(loader(os.path.join(folder_path, name)))
    print(f"\n✅ Total extracted documents: {len(all_docs)}")
    return all_docs


# --------------------------
# CHUNKING
# --------------------------
def chunk_documents(documents, chunk_size=500, chunk_overlap=100,
                    min_tail_chars=50):
    """Split documents into overlapping text chunks.

    Args:
        documents: Iterable-with-len of dicts holding "content", "source",
            "book" and "type".
        chunk_size: Target maximum characters per chunk.
        chunk_overlap: Characters of overlap requested between chunks.
        min_tail_chars: A final chunk shorter than this is appended to the
            previous chunk instead of being emitted on its own.

    Returns:
        A list of chunk dicts with the same four keys as the input documents,
        where "content" is the chunk text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = []
    print(f"Chunking {len(documents)} documents...")
    for position, doc in enumerate(documents, start=1):
        pieces = splitter.split_text(doc["content"])
        # Keep a small leftover by folding it into the previous chunk. Only
        # do this when a previous chunk exists: a document whose single chunk
        # is short must be kept as-is, not discarded.
        if len(pieces) > 1 and len(pieces[-1]) < min_tail_chars:
            tail = pieces.pop()
            pieces[-1] += " " + tail
        print(f"→ Processing doc {position}/{len(documents)} | chunks: {len(pieces)}")
        chunks.extend(
            {
                "content": piece,
                "source": doc["source"],
                "book": doc["book"],
                "type": doc["type"],
            }
            for piece in pieces
        )
    print(f"Total chunks created: {len(chunks)}")
    return chunks


# --------------------------
# MAIN INGEST FUNCTION
# --------------------------
def ingest_books(folder_path="knowledge"):
    """Ingest all books in ``folder_path`` into the "psychology_books" collection.

    Does nothing if the collection already reports at least one point, or if
    no text could be extracted from the folder.

    Args:
        folder_path: Directory containing the PDF/EPUB files.
    """
    client = get_qdrant_client()
    collection_name = "psychology_books"

    # Skip if already ingested.
    try:
        info = client.get_collection(collection_name)
        # points_count may be None; treat that as "empty".
        if (info.points_count or 0) > 0:
            print("Embeddings already exist. Skipping ingest.")
            return
    except Exception:
        # NOTE(review): presumably raised when the collection does not exist
        # yet. Any failure of this check deliberately falls through to a
        # full ingest rather than aborting.
        pass

    docs = load_books(folder_path)
    chunks = chunk_documents(docs)
    if not chunks:
        # Nothing to embed; avoid handing an empty batch to the store helper.
        print("No text extracted from books. Nothing to ingest.")
        return

    embeddings = get_embeddings()
    store_embeddings(chunks, embeddings, collection_name)
    print(f"Ingested {len(chunks)} chunks from books.")


if __name__ == "__main__":
    ingest_books("knowledge")