from openai import OpenAI
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import SentenceSplitter
from dotenv import load_dotenv

# Load OPENAI_API_KEY (and any other settings) from a local .env file
# before the OpenAI client is constructed below.
load_dotenv()

client = OpenAI()

# Embedding model and its fixed output dimensionality. EMBED_DIM must
# match wherever these vectors are stored — presumably a vector-store
# schema elsewhere in the project (TODO confirm against the consumer).
EMBED_MODEL = "text-embedding-3-large"
EMBED_DIM = 3072

# Shared splitter: ~1000-character chunks with 200 characters of overlap
# so text spanning a chunk boundary appears in both neighbouring chunks.
splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)


def load_and_chunk_pdf(path: str) -> list[str]:
    """Load a PDF and return its text as a list of non-empty chunks.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        Sentence-aware text chunks produced by the module-level
        ``splitter``; empty or whitespace-only chunks are dropped.
    """
    docs = PDFReader().load_data(file=path)
    # Some reader pages may carry no extractable text; skip those.
    texts = [d.text for d in docs if getattr(d, "text", None)]
    chunks: list[str] = []
    for text in texts:
        # Filter out empty chunks — the embeddings API rejects empty
        # input strings downstream.
        chunks.extend(c for c in splitter.split_text(text) if c.strip())
    return chunks


def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed *texts* with the configured OpenAI embedding model.

    Empty or whitespace-only entries are filtered out before the API
    call, so the result may be SHORTER than the input: do not assume a
    one-to-one index correspondence with *texts*. Surviving entries keep
    their relative order.

    Args:
        texts: Candidate strings to embed.

    Returns:
        One embedding vector (length ``EMBED_DIM``) per surviving input
        string; ``[]`` when nothing survives filtering.
    """
    # Double-check that we don't have empty texts — the API errors on "".
    texts = [text for text in texts if text and text.strip()]
    if not texts:
        # Avoid an API round-trip (and an invalid empty-input request).
        return []
    response = client.embeddings.create(
        model=EMBED_MODEL,
        input=texts,
    )
    return [item.embedding for item in response.data]