Spaces:

Chirag20
/

MindBot-v0

Sleeping

App Files Files Community

MindBot-v0 / ingest.py

Chirag20

added knowledge

edabb92 about 1 month ago

raw

history blame contribute delete

4.83 kB

	import os
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from ebooklib import epub
	from bs4 import BeautifulSoup
	import pdfplumber
	import logging
	logging.getLogger("pdfminer").setLevel(logging.ERROR)

	from embed_store import get_embeddings, store_embeddings, get_qdrant_client


	# --------------------------
	# LOAD PDF
	# --------------------------

	def load_pdf(file_path):
	    """Extract the text of each page of a PDF as a list of document dicts.

	    Args:
	        file_path: Path of the PDF file, handed to ``pdfplumber.open``.

	    Returns:
	        A list with one dict per page that yielded text. Each dict has the
	        keys ``content`` (the page text), ``source`` (``file_path``),
	        ``book`` (base name of ``file_path``) and ``type`` (always
	        ``"book"``). Pages with no extractable text are omitted. If the
	        file cannot be opened, the error is printed and whatever was
	        collected so far (possibly nothing) is returned.
	    """
	    docs = []
	    try:
	        with pdfplumber.open(file_path) as pdf:
	            total_pages = len(pdf.pages)
	            print(f" → PDF has {total_pages} pages")

	            for page_no, page in enumerate(pdf.pages, start=1):
	                # Progress line on pages 1, 21, 41, ...
	                if (page_no - 1) % 20 == 0:
	                    print(f" Processing page {page_no}/{total_pages}")

	                # Isolate failures per page: one unreadable page must not
	                # throw away every page that follows it.
	                try:
	                    text = page.extract_text()
	                except Exception as e:
	                    print(f"❌ Error reading page {page_no} of {file_path}: {e}")
	                    continue

	                if text:
	                    docs.append({
	                        "content": text,
	                        "source": file_path,
	                        "book": os.path.basename(file_path),
	                        "type": "book",
	                    })
	    except Exception as e:
	        # Best effort: report the failure and return what was collected.
	        print(f"❌ Error reading PDF {file_path}: {e}")

	    print(f" → Extracted {len(docs)} pages from PDF")
	    return docs


	# --------------------------
	# LOAD EPUB
	# --------------------------

	def load_epub(file_path):
	    """Extract the text of each document item of an EPUB.

	    Args:
	        file_path: Path of the EPUB file, handed to ``epub.read_epub``.

	    Returns:
	        A list of dicts with the keys ``content``, ``source``
	        (``file_path``), ``book`` (base name of ``file_path``) and ``type``
	        (always ``"book"``). Only items of type ``epub.ITEM_DOCUMENT`` whose
	        stripped text is longer than 50 characters are kept. Items that
	        raise while being processed are skipped; if the book itself cannot
	        be read, the error is printed and the list collected so far is
	        returned.
	    """
	    sections = []
	    try:
	        parsed_book = epub.read_epub(file_path)

	        for entry in parsed_book.get_items():
	            try:
	                if entry.get_type() != epub.ITEM_DOCUMENT:
	                    continue

	                markup = BeautifulSoup(entry.get_content(), "lxml")

	                # Drop script/style elements so their contents do not end
	                # up in the extracted text.
	                for unwanted in markup.find_all(["script", "style"]):
	                    unwanted.decompose()

	                body_text = markup.get_text(separator=" ", strip=True)

	                # Anything of 50 characters or fewer is treated as junk.
	                if len(body_text) <= 50:
	                    continue

	                sections.append({
	                    "content": body_text,
	                    "source": file_path,
	                    "book": os.path.basename(file_path),
	                    "type": "book",
	                })
	            except Exception:
	                # A broken item is skipped; the rest of the book is kept.
	                continue

	        print(f" → Extracted {len(sections)} sections from EPUB")

	    except Exception as e:
	        print(f"❌ Failed EPUB {file_path}: {e}")

	    return sections

	# --------------------------
	# LOAD ALL BOOKS
	# --------------------------

	def load_books(folder_path="knowledge"):
	    """Load every supported book found directly inside ``folder_path``.

	    Files ending in ``.epub`` are passed to ``load_epub`` and files ending
	    in ``.pdf`` to ``load_pdf``; the extension check ignores case. Any
	    other entry is reported as unsupported and skipped.

	    Args:
	        folder_path: Directory to scan (not recursive).

	    Returns:
	        The concatenated list of document dicts returned by the loaders, in
	        sorted file-name order.

	    Raises:
	        OSError: Whatever ``os.listdir`` raises, e.g. ``FileNotFoundError``
	            when ``folder_path`` does not exist.
	    """
	    all_docs = []
	    # os.listdir returns entries in arbitrary order; sort so that repeated
	    # runs process the books in the same order.
	    files = sorted(os.listdir(folder_path))

	    print(f"📚 Found {len(files)} files in '{folder_path}'")

	    for position, name in enumerate(files, start=1):
	        full_path = os.path.join(folder_path, name)

	        print(f"\n📖 [{position}/{len(files)}] Loading: {name}")

	        # Compare in lower case so that "Book.PDF" or "Book.EPUB" are not
	        # silently treated as unsupported.
	        lowered = name.lower()
	        if lowered.endswith(".epub"):
	            docs = load_epub(full_path)
	        elif lowered.endswith(".pdf"):
	            docs = load_pdf(full_path)
	        else:
	            print(" → Skipped (unsupported)")
	            continue

	        all_docs.extend(docs)

	    print(f"\n✅ Total extracted documents: {len(all_docs)}")
	    return all_docs

	# --------------------------
	# CHUNKING
	# --------------------------

	def chunk_documents(documents, chunk_size=500, chunk_overlap=100, min_tail_chars=50):
	    """Split document dicts into smaller chunk dicts.

	    Args:
	        documents: Sequence of dicts with the keys ``content``, ``source``,
	            ``book`` and ``type``.
	        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
	            ``chunk_size``.
	        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
	            ``chunk_overlap``.
	        min_tail_chars: If a document yields more than one piece and the
	            last piece is shorter than this many characters, that piece is
	            appended (after a space) to the piece before it instead of
	            becoming its own chunk.

	    Returns:
	        A list of dicts, one per chunk, with ``content`` set to the chunk
	        text and ``source``, ``book`` and ``type`` copied from the
	        originating document.
	    """
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )

	    chunks = []
	    total = len(documents)
	    print(f"Chunking {total} documents...")

	    for position, doc in enumerate(documents, start=1):
	        pieces = splitter.split_text(doc["content"])

	        # Fold a very short trailing piece into its predecessor. A lone
	        # short piece is kept as it is, since there is nothing to merge
	        # it into.
	        if len(pieces) > 1 and len(pieces[-1]) < min_tail_chars:
	            tail = pieces.pop()
	            pieces[-1] += " " + tail

	        # Plain "|" here: "\|" is not a valid escape sequence and would
	        # print a stray backslash.
	        print(f"→ Processing doc {position}/{total} | chunks: {len(pieces)}")

	        chunks.extend(
	            {
	                "content": piece,
	                "source": doc["source"],
	                "book": doc["book"],
	                "type": doc["type"],
	            }
	            for piece in pieces
	        )

	    print(f"Total chunks created: {len(chunks)}")
	    return chunks

	# --------------------------
	# MAIN INGEST FUNCTION
	# --------------------------

	def ingest_books(folder_path="knowledge", collection_name="psychology_books"):
	    """Load, chunk and store the books of ``folder_path`` in Qdrant.

	    The collection is inspected first; when it already reports a positive
	    ``points_count`` nothing is done. Otherwise the books are loaded with
	    ``load_books``, split with ``chunk_documents`` and handed to
	    ``store_embeddings`` together with the object returned by
	    ``get_embeddings()``.

	    Args:
	        folder_path: Directory containing the book files.
	        collection_name: Name of the collection to check and to store into.

	    Returns:
	        None.
	    """
	    client = get_qdrant_client()

	    # Skip if already ingested.
	    try:
	        info = client.get_collection(collection_name)
	    except Exception as exc:
	        # Best effort: the check also fails when the collection simply does
	        # not exist yet, so report the reason and carry on with the ingest.
	        print(f"Could not read collection '{collection_name}' ({exc}); running ingest.")
	    else:
	        # A missing or None points_count is treated as "no points stored".
	        if (getattr(info, "points_count", None) or 0) > 0:
	            print("Embeddings already exist. Skipping ingest.")
	            return

	    docs = load_books(folder_path)
	    chunks = chunk_documents(docs)

	    # With no chunks there is nothing to embed or store.
	    if not chunks:
	        print("No chunks produced; nothing to ingest.")
	        return

	    embeddings = get_embeddings()
	    store_embeddings(chunks, embeddings, collection_name)

	    print(f"Ingested {len(chunks)} chunks from books.")



	# Script entry point: ingest the default "knowledge" folder when this file
	# is executed directly rather than imported.
	if __name__ == "__main__":
	    ingest_books(folder_path="knowledge")