# Spaces: Sleeping
# File size: 1,028 Bytes
# Revision: 869f31e
from openai import OpenAI
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import SentenceSplitter
from dotenv import load_dotenv
# Module-level setup shared by the helpers below.
# load_dotenv() runs before the OpenAI client is constructed; presumably this
# makes credentials from a local .env file visible to the client — confirm.
load_dotenv()
# Single client instance reused for every embeddings request in this module.
client = OpenAI()
# Model name passed to client.embeddings.create() in embed_texts().
EMBED_MODEL = "text-embedding-3-large"
# Not referenced in the visible code; presumably the vector length produced by
# EMBED_MODEL, consumed by a vector store elsewhere — TODO confirm.
EMBED_DIM = 3072
# Shared splitter used by load_and_chunk_pdf(). NOTE(review): chunk_size and
# chunk_overlap are presumably measured in tokens — verify against the
# SentenceSplitter documentation.
splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)
def load_and_chunk_pdf(path: str):
    """Read a PDF and return its text as a flat list of non-blank chunks.

    The file at ``path`` is loaded with ``PDFReader``; each loaded document
    that carries a non-empty ``text`` attribute is split with the module-level
    ``splitter``. Chunks consisting only of whitespace are discarded.

    Args:
        path: Location of the PDF file, passed to ``PDFReader.load_data``.

    Returns:
        A list of chunk strings, in document order.
    """
    documents = PDFReader().load_data(file=path)
    pieces = []
    for document in documents:
        body = getattr(document, "text", None)
        if not body:
            # Documents without usable text contribute nothing.
            continue
        for piece in splitter.split_text(body):
            # Drop chunks that are empty or whitespace-only.
            if piece.strip():
                pieces.append(piece)
    return pieces
def embed_texts(texts: list[str], batch_size: int = 100) -> list[list[float]]:
    """Embed the non-blank entries of ``texts`` with ``EMBED_MODEL``.

    Requests are sent in batches instead of one call for the whole list, so
    that a large document does not exceed the per-request input limits of the
    embeddings endpoint (NOTE(review): the API reference documents a cap on
    the input array size per request — confirm the current values and tune
    ``batch_size`` accordingly).

    Entries that are empty or whitespace-only are dropped before embedding,
    so the result can be shorter than ``texts``; callers that pair embeddings
    with their source strings must apply the same filter.

    Args:
        texts: Strings to embed.
        batch_size: Maximum number of strings sent per API request.

    Returns:
        One embedding per non-blank input, in input order. An empty list when
        nothing is left to embed (no request is made in that case).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Filter out blank strings before sending anything to the API.
    non_empty = [text for text in texts if text and text.strip()]

    embeddings: list[list[float]] = []
    # With no inputs the range is empty, so no request is issued.
    for start in range(0, len(non_empty), batch_size):
        response = client.embeddings.create(
            model=EMBED_MODEL,
            input=non_empty[start:start + batch_size],
        )
        embeddings.extend(item.embedding for item in response.data)
    return embeddings