Spaces:
Sleeping
Sleeping
from dotenv import load_dotenv
from llama_index.core.node_parser import SentenceSplitter
from llama_index.readers.file import PDFReader
from openai import OpenAI

# Pull credentials (e.g. OPENAI_API_KEY) from a local .env file before
# the OpenAI client is constructed.
load_dotenv()

client = OpenAI()

# Embedding model configuration; text-embedding-3-large emits 3072-dim vectors.
EMBED_MODEL = "text-embedding-3-large"
EMBED_DIM = 3072

# Sentence-aware chunker: ~1000-character chunks with a 200-character
# overlap between neighbours so context spans chunk boundaries.
splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)
def load_and_chunk_pdf(path: str):
    """Read the PDF at *path* and return its text split into overlapping chunks.

    Documents with no extractable text are skipped, and chunks that are
    empty after stripping whitespace are dropped.
    """
    documents = PDFReader().load_data(file=path)
    chunks = []
    for document in documents:
        text = getattr(document, "text", None)
        if not text:
            # No extractable text on this page/document — skip it.
            continue
        chunks.extend(piece for piece in splitter.split_text(text) if piece.strip())
    return chunks
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed *texts* with the configured OpenAI embedding model.

    Blank and empty strings are dropped before calling the API (the
    embeddings endpoint rejects empty inputs), so the returned list may be
    shorter than *texts*; vectors align with the filtered inputs in order.

    Returns:
        One embedding vector (length EMBED_DIM) per surviving text, or an
        empty list when nothing remains after filtering.
    """
    # The embeddings endpoint rejects empty strings; filter them up front.
    texts = [text for text in texts if text and text.strip()]
    if not texts:
        return []

    # A single embeddings request is capped at 2048 inputs, so send large
    # lists in batches instead of failing on the whole request.
    max_batch = 2048
    embeddings: list[list[float]] = []
    for start in range(0, len(texts), max_batch):
        batch = texts[start : start + max_batch]
        response = client.embeddings.create(
            model=EMBED_MODEL,
            input=batch,
        )
        embeddings.extend(item.embedding for item in response.data)
    return embeddings