Spaces:
Sleeping
Sleeping
import logging
import os

from bs4 import BeautifulSoup
import ebooklib
from ebooklib import epub
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pdfplumber

logging.getLogger("pdfminer").setLevel(logging.ERROR)

from embed_store import get_embeddings, store_embeddings, get_qdrant_client
# --------------------------
# LOAD PDF
# --------------------------
def load_pdf(file_path):
    """Extract the text of a PDF, one record per page that yields text.

    Each record is a dict with the keys ``content`` (the page text),
    ``source`` (``file_path`` as given), ``book`` (the file's base name)
    and ``type`` (always ``"book"``).

    Any exception raised while opening or reading the file is printed
    instead of propagated; records collected before the failure are still
    returned, so the result may be partial or empty.
    """
    page_records = []
    try:
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)
            print(f" β PDF has {total_pages} pages")
            for page_number, page in enumerate(pdf.pages, start=1):
                # Progress line on pages 1, 21, 41, ...
                if page_number % 20 == 1:
                    print(f" Processing page {page_number}/{total_pages}")
                page_text = page.extract_text()
                if not page_text:
                    # Nothing extractable on this page; no record for it.
                    continue
                page_records.append({
                    "content": page_text,
                    "source": file_path,
                    "book": os.path.basename(file_path),
                    "type": "book",
                })
    except Exception as e:
        print(f"β Error reading PDF {file_path}: {e}")
    print(f" β Extracted {len(page_records)} pages from PDF")
    return page_records
# --------------------------
# LOAD EPUB
# --------------------------
def load_epub(file_path):
    """Extract the visible text of an EPUB, one record per document item.

    Each record is a dict with the keys ``content`` (the item's text with
    ``<script>``/``<style>`` removed), ``source`` (``file_path`` as given),
    ``book`` (the file's base name) and ``type`` (always ``"book"``).
    Items whose text is 50 characters or shorter are dropped as junk.

    Failures are reported with ``print`` and never propagated: a failure on
    a single item skips that item only, while a failure to open the book
    returns whatever was collected so far (possibly an empty list).
    """
    docs = []
    try:
        book = epub.read_epub(file_path)
        for item in book.get_items():
            try:
                # The ITEM_* constants are documented on the top-level
                # ``ebooklib`` package, so take ITEM_DOCUMENT from there
                # rather than from the ``epub`` submodule.
                if item.get_type() != ebooklib.ITEM_DOCUMENT:
                    continue
                soup = BeautifulSoup(item.get_content(), "lxml")
                # Scripts and styles are markup, not readable book text.
                for tag in soup(["script", "style"]):
                    tag.decompose()
                text = soup.get_text(separator=" ", strip=True)
                if text and len(text) > 50:  # filter junk
                    docs.append({
                        "content": text,
                        "source": file_path,
                        "book": os.path.basename(file_path),
                        "type": "book",
                    })
            except Exception as e:
                # Still best-effort per item, but say so: a silent skip here
                # can hide a systematic error that empties every book.
                print(f" Skipped unreadable EPUB item: {e}")
                continue
        print(f" β Extracted {len(docs)} sections from EPUB")
    except Exception as e:
        print(f"β Failed EPUB {file_path}: {e}")
    return docs
| # -------------------------- | |
| # LOAD ALL BOOKS | |
| # -------------------------- | |
def load_books(folder_path="knowledge"):
    """Load every supported book found directly inside ``folder_path``.

    Entries are visited in sorted name order. Files ending in ``.epub`` or
    ``.pdf`` (matched case-insensitively) are passed to ``load_epub`` /
    ``load_pdf``; directories and all other entries are reported as skipped.

    Returns the concatenated list of records produced by the loaders.
    Raises whatever ``os.listdir`` raises when ``folder_path`` cannot be
    listed (for example ``FileNotFoundError``).
    """
    all_docs = []
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs process (and later chunk) the books in the same sequence.
    files = sorted(os.listdir(folder_path))
    print(f"π Found {len(files)} files in '{folder_path}'")
    for i, file in enumerate(files, start=1):
        full_path = os.path.join(folder_path, file)
        print(f"\nπ [{i}/{len(files)}] Loading: {file}")
        # Compare on a lowercased name so "Book.PDF" / "Book.EPUB" are not
        # silently treated as unsupported.
        lowered = file.lower()
        is_dir = os.path.isdir(full_path)
        if not is_dir and lowered.endswith(".epub"):
            docs = load_epub(full_path)
        elif not is_dir and lowered.endswith(".pdf"):
            docs = load_pdf(full_path)
        else:
            # Directories land here too, even when their name ends in a
            # supported extension.
            print(" β Skipped (unsupported)")
            continue
        all_docs.extend(docs)
    print(f"\nβ Total extracted documents: {len(all_docs)}")
    return all_docs
| # -------------------------- | |
| # CHUNKING | |
| # -------------------------- | |
def _merge_short_tail(pieces, min_chars):
    """Fold a final piece shorter than ``min_chars`` into the piece before it.

    A lone piece is returned as-is regardless of its length, so no text is
    ever dropped.
    """
    if len(pieces) > 1 and len(pieces[-1]) < min_chars:
        return pieces[:-2] + [pieces[-2] + " " + pieces[-1]]
    return pieces


def chunk_documents(documents, chunk_size=500, chunk_overlap=100, min_chunk_chars=50):
    """Split each document's ``content`` into overlapping text chunks.

    Args:
        documents: sequence of dicts with the keys ``content``, ``source``,
            ``book`` and ``type``.
        chunk_size: passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: passed to ``RecursiveCharacterTextSplitter``.
        min_chunk_chars: a trailing chunk shorter than this is appended
            (space-separated) to the previous chunk instead of being
            emitted on its own.

    Returns:
        A list of dicts, one per chunk, with ``content`` set to the chunk
        text and ``source``/``book``/``type`` copied from the parent
        document.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = []
    total_docs = len(documents)
    print(f"Chunking {total_docs} documents...")
    for doc_number, doc in enumerate(documents, start=1):
        pieces = list(splitter.split_text(doc["content"]))
        # Keep a small leftover by merging it into its predecessor.
        pieces = _merge_short_tail(pieces, min_chunk_chars)
        print(f"β Processing doc {doc_number}/{total_docs} | chunks: {len(pieces)}")
        chunks.extend(
            {
                "content": piece,
                "source": doc["source"],
                "book": doc["book"],
                "type": doc["type"],
            }
            for piece in pieces
        )
    print(f"Total chunks created: {len(chunks)}")
    return chunks
| # -------------------------- | |
| # MAIN INGEST FUNCTION | |
| # -------------------------- | |
def ingest_books(folder_path="knowledge", collection_name="psychology_books"):
    """Load, chunk and store every book in ``folder_path``.

    The work is skipped when the target collection already reports at least
    one point. Otherwise the books are loaded with ``load_books``, split
    with ``chunk_documents`` and handed to ``store_embeddings`` together
    with the object returned by ``get_embeddings``.

    Args:
        folder_path: directory scanned for books.
        collection_name: name of the collection to check and write to.
    """
    client = get_qdrant_client()
    # Skip if already ingested.
    try:
        info = client.get_collection(collection_name)
    except Exception as e:
        # Best effort: a failed lookup is treated as "nothing ingested yet",
        # but the reason is reported instead of being discarded.
        # NOTE(review): presumably this is mostly "collection not found";
        # connection errors land here too and will resurface during storing.
        print(f"Could not read collection '{collection_name}' ({e}); running ingest.")
    else:
        # points_count may be missing or None (assumed optional in the
        # client's model - confirm for the installed version); count it as 0
        # rather than letting a None comparison raise.
        existing_points = getattr(info, "points_count", None) or 0
        if existing_points > 0:
            print("Embeddings already exist. Skipping ingest.")
            return
    docs = load_books(folder_path)
    chunks = chunk_documents(docs)
    embeddings = get_embeddings()
    store_embeddings(chunks, embeddings, collection_name)
    print(f"Ingested {len(chunks)} chunks from books.")
# Script entry point: ingest the default "knowledge" folder when this file
# is executed directly (not when it is imported).
if __name__ == "__main__":
    ingest_books(folder_path="knowledge")