"""Ingest TD annual-report PDFs, embed the chunks with Cohere, and upsert
them into a Pinecone index for retrieval.

Pipeline: load() -> embed() -> update_index().
Requires COHERE_API_KEY and PINECONE_API_KEY in the environment (or .env).
"""

import os
from typing import Dict, List

import cohere
import pinecone
from dotenv import load_dotenv
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf

load_dotenv()

co = cohere.Client(os.environ["COHERE_API_KEY"])
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("td-sec-embeddings")

# Each source: report title (stored as chunk metadata), the public URL, and
# the local path of the downloaded PDF.
sources = [
    {
        "title": "2023",
        "url": "https://www.td.com/content/dam/tdcom/canada/about-td/pdf/quarterly-results/2023/2023-annual-report-e.pdf",
        "filename": "/Users/clemensadolphs/git-personal/secsplorer/2023-annual-report-e.pdf",
    },
    {
        "title": "2022",
        "url": "https://www.td.com/document/PDF/ar2022/ar2022-Complete-Report.pdf",
        # BUG FIX: this previously pointed at the 2023 PDF, so the 2022 report
        # was never ingested and the 2023 report was indexed twice under the
        # wrong metadata. Path derived from the URL basename — NOTE(review):
        # confirm the 2022 PDF actually exists at this local path.
        "filename": "/Users/clemensadolphs/git-personal/secsplorer/ar2022-Complete-Report.pdf",
    },
]


def load() -> List[Dict[str, str]]:
    """Partition each source PDF and chunk it by title.

    Returns:
        One dict per chunk with keys "title", "text", and "url".
    """
    print("Loading documents...")
    docs: List[Dict[str, str]] = []
    for source in sources:
        elements = partition_pdf(filename=source["filename"])
        docs.extend(
            {
                "title": source["title"],
                "text": str(chunk),
                "url": source["url"],
            }
            for chunk in chunk_by_title(elements)
        )
    return docs


def embed(docs: List[Dict[str, str]]) -> List[List[float]]:
    """Embed the documents' text with the Cohere API, in batches.

    Args:
        docs: Chunk dicts produced by load(); only the "text" field is embedded.

    Returns:
        One embedding vector per document, in the same order as ``docs``.
    """
    print("Embedding documents...")
    # Cohere's embed endpoint caps the number of texts per call; 90 stays
    # safely under the documented limit of 96.
    batch_size = 90
    docs_embs: List[List[float]] = []
    for i in range(0, len(docs), batch_size):
        # Slicing clamps at the end of the list, so no explicit min() needed.
        texts = [item["text"] for item in docs[i : i + batch_size]]
        response = co.embed(
            texts=texts, model="embed-english-v3.0", input_type="search_document"
        )
        docs_embs.extend(response.embeddings)
    return docs_embs


def update_index(
    index: pinecone.Index, docs: List[Dict[str, str]], docs_embs: List[List[float]]
) -> None:
    """Upsert the documents and their embeddings into Pinecone.

    Vectors are upserted as (id, values, metadata) tuples — the chunk dict
    itself serves as the metadata — in batches of 100.

    Args:
        index: Target Pinecone index.
        docs: Chunk dicts (metadata), aligned with ``docs_embs``.
        docs_embs: Embedding vectors, one per document.
    """
    print("Indexing documents in Pinecone")
    batch_size = 100
    # Positional ids; stable only for a single full re-ingest of the corpus.
    ids = [str(i) for i in range(len(docs))]
    to_upsert = list(zip(ids, docs_embs, docs))
    for i in range(0, len(docs), batch_size):
        index.upsert(vectors=to_upsert[i : i + batch_size])


if __name__ == "__main__":
    docs = load()
    docs_embeds = embed(docs)
    update_index(index, docs=docs, docs_embs=docs_embeds)