"""Ingest PDFs into Chroma vector stores.

Text PDFs are read with PyMuPDF and stored under ``CHROMA_DIR``; scanned
(image-only) PDFs are OCR'd with Tesseract and stored under
``CHROMA_IMG_DIR``.
"""

import os

import pytesseract
from langchain.docstore.document import Document
from langchain.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path
from PIL import Image

CHROMA_DIR = "data/chroma_db"
CHROMA_IMG_DIR = "data/image_db"

# Shared by every store function so both collections are embedded with the
# same model and split into chunks of the same size.
_EMBEDDING_MODEL_NAME = 'thenlper/gte-large'
_CHUNK_SIZE = 500


def _index_documents(docs, persist_directory, *, chunk_overlap):
    """Split *docs* into chunks, embed them and write them to a Chroma store.

    Args:
        docs: List of LangChain ``Document`` objects to index.
        persist_directory: Directory of the Chroma store to write to.
        chunk_overlap: Overlap between consecutive chunks, passed to
            ``RecursiveCharacterTextSplitter``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=_CHUNK_SIZE,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(docs)
    embeddings = SentenceTransformerEmbeddings(model_name=_EMBEDDING_MODEL_NAME)
    # NOTE(review): older chromadb releases (< 0.4) needed an explicit
    # ``.persist()`` on the returned store to flush to disk -- confirm
    # against the installed version.
    Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)


def store_pdf(pdf_path):
    """Load a PDF with PyMuPDF and index its text under ``CHROMA_DIR``.

    Args:
        pdf_path: Filesystem path of the PDF to ingest.
    """
    docs = PyMuPDFLoader(pdf_path).load()
    _index_documents(docs, CHROMA_DIR, chunk_overlap=100)


def store_pdf_image(pdf_path):
    """OCR a scanned PDF and index the recognised text under ``CHROMA_IMG_DIR``.

    Args:
        pdf_path: Filesystem path of the scanned PDF to ingest.
    """
    store_pdf_image_text(extract_text_from_scanned_pdf(pdf_path))


def store_pdf_image_text(text: str) -> None:
    """Index already-extracted OCR text under ``CHROMA_IMG_DIR``.

    Args:
        text: Full text to wrap in a single ``Document`` and index.
    """
    _index_documents(
        [Document(page_content=text)],
        CHROMA_IMG_DIR,
        chunk_overlap=50,
    )


def extract_text_from_scanned_pdf(
    pdf_path, *, dpi=300, lang="eng", poppler_path=None
) -> str:
    """Rasterise each page of a PDF and OCR it with Tesseract.

    Args:
        pdf_path: Filesystem path of the PDF to OCR.
        dpi: Resolution used when rendering pages to images.
        lang: Tesseract language code passed to ``image_to_string``.
        poppler_path: Directory containing the Poppler binaries, for systems
            where they are not on ``PATH`` (e.g. ``"/opt/homebrew/bin"``).
            ``None`` lets pdf2image locate them itself.

    Returns:
        The text of all pages concatenated in order, each page preceded by a
        ``"\\n--- Page N ---\\n"`` marker (N starts at 1). An empty string if
        the PDF yields no pages.
    """
    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
    sections = []
    for number, page in enumerate(pages, start=1):
        page_text = pytesseract.image_to_string(page, lang=lang)
        sections.append(f"\n--- Page {number} ---\n{page_text}")
    # Join once instead of repeated ``+=`` to avoid quadratic copying.
    return "".join(sections)