Spaces:

nrigheriu
/

GradioApps

Sleeping

GradioApps / data_loader.py

added app files

869f31e verified 5 months ago

1.03 kB

	from openai import OpenAI
	from llama_index.readers.file import PDFReader
	from llama_index.core.node_parser import SentenceSplitter
	from dotenv import load_dotenv

	# Load variables from a local .env file into the process environment.
	# Keep this call ahead of OpenAI() below: the client presumably reads its
	# API key from the environment when it is constructed (confirm).
	load_dotenv()

	# Single OpenAI client created at import time and shared by embed_texts.
	client = OpenAI()
	# Model name passed to client.embeddings.create in embed_texts.
	EMBED_MODEL = "text-embedding-3-large"
	# Vector length expected from EMBED_MODEL. Not referenced in the visible
	# code; presumably consumed by a vector store elsewhere (verify).
	EMBED_DIM = 3072

	# Splitter shared by load_and_chunk_pdf. chunk_size/chunk_overlap are in
	# SentenceSplitter's own units (presumably tokens; confirm).
	splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)

	def load_and_chunk_pdf(path: str):
	    """Read the PDF at *path* and return its text as a flat list of chunks.

	    Each document returned by ``PDFReader`` that carries non-empty text is
	    split with the module-level ``splitter``; chunks that are empty or
	    whitespace-only are discarded. Chunk order follows document order.
	    """
	    documents = PDFReader().load_data(file=path)

	    chunks = []
	    for document in documents:
	        body = getattr(document, "text", None)
	        if not body:
	            # Nothing extractable in this document; skip it.
	            continue
	        for piece in splitter.split_text(body):
	            if piece.strip():
	                chunks.append(piece)
	    return chunks


	def embed_texts(texts: list[str], *, batch_size: int = 100) -> list[list[float]]:
	    """Return one embedding vector per non-blank entry of *texts*.

	    Blank entries (empty or whitespace-only) are dropped before the request,
	    so the result can be shorter than *texts*; vectors are in the order of
	    the surviving inputs.

	    The inputs are sent to the embeddings endpoint in slices of at most
	    *batch_size* items instead of one request for everything, so a large
	    document does not run into the API's per-request input limits.

	    Args:
	        texts: Strings to embed.
	        batch_size: Maximum number of inputs per API request.

	    Returns:
	        The embedding vectors, or ``[]`` when no non-blank text remains.

	    Raises:
	        ValueError: If *batch_size* is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError("batch_size must be a positive integer")

	    # Guard against blank inputs even if the caller already filtered them.
	    cleaned = [text for text in texts if text and text.strip()]

	    embeddings: list[list[float]] = []
	    for start in range(0, len(cleaned), batch_size):
	        response = client.embeddings.create(
	            model=EMBED_MODEL,
	            input=cleaned[start:start + batch_size],
	        )
	        embeddings.extend(item.embedding for item in response.data)
	    return embeddings