# GradioApps / data_loader.py
# Uploaded by nrigheriu
# Commit message: "added app files"
# Commit: 869f31e (verified)
from openai import OpenAI
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import SentenceSplitter
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# This must run before the client below is constructed -- presumably so that
# OpenAI() can pick up its API key from the environment (verify deployment).
load_dotenv()
# Shared OpenAI client used by embed_texts().
client = OpenAI()
# Embedding model name passed to every embeddings request.
EMBED_MODEL = "text-embedding-3-large"
# Presumably the vector length produced by EMBED_MODEL; not referenced in the
# visible code, so it is likely consumed by a vector-store module elsewhere.
EMBED_DIM = 3072
# Shared text splitter used by load_and_chunk_pdf(); consecutive chunks overlap
# by chunk_overlap so context is not lost at chunk boundaries.
splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)
def load_and_chunk_pdf(path: str):
    """Read a PDF and return its text as a flat list of non-blank chunks.

    The file at *path* is loaded with ``PDFReader``; every resulting document
    that carries non-empty text is split with the module-level ``splitter``.

    Args:
        path: Location of the PDF file to read.

    Returns:
        A list of chunk strings, in document order, with whitespace-only
        chunks removed.
    """
    documents = PDFReader().load_data(file=path)
    pieces = []
    for document in documents:
        body = getattr(document, "text", None)
        # Skip documents with a missing or empty text payload.
        if not body:
            continue
        for piece in splitter.split_text(body):
            # Drop chunks that contain nothing but whitespace.
            if piece.strip():
                pieces.append(piece)
    return pieces
def embed_texts(texts: list[str], *, batch_size: int = 128) -> list[list[float]]:
    """Embed *texts* with ``EMBED_MODEL`` and return one vector per kept text.

    Empty and whitespace-only strings are dropped before embedding, so the
    result can be shorter than *texts*; callers that pair vectors with their
    source strings should filter the inputs the same way first.

    The remaining texts are sent in batches rather than in one request, because
    the embeddings endpoint limits how much input a single request may carry.
    A long PDF would otherwise fail outright.

    Args:
        texts: Strings to embed.
        batch_size: Maximum number of texts sent per API request.

    Returns:
        Embedding vectors in the same order as the non-blank inputs, or an
        empty list when there is nothing to embed.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # The API rejects empty strings, so filter them out up front.
    cleaned = [text for text in texts if text and text.strip()]

    embeddings: list[list[float]] = []
    # With no usable input the loop body never runs and no request is made.
    for start in range(0, len(cleaned), batch_size):
        response = client.embeddings.create(
            model=EMBED_MODEL,
            input=cleaned[start:start + batch_size],
        )
        embeddings.extend(item.embedding for item in response.data)
    return embeddings