Spaces:
Runtime error
Runtime error
| import cohere | |
| import os | |
| import pinecone | |
| from typing import List, Dict | |
| from unstructured.chunking.title import chunk_by_title | |
| from unstructured.partition.pdf import partition_pdf | |
| from dotenv import load_dotenv | |
# Load environment variables (python-dotenv reads them from a .env file when
# one is present) before any API key is looked up below.
load_dotenv()

# Cohere client used to embed the document chunks. Indexing os.environ
# raises KeyError right away when COHERE_API_KEY is not set.
co = cohere.Client(os.environ["COHERE_API_KEY"])

# Pinecone client, plus a handle to the index that receives the embeddings.
# PINECONE_API_KEY is required in the same way.
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("td-sec-embeddings")
# Annual reports to ingest. Keys of each entry:
#   title    - label attached to every chunk of the report (the report year)
#   url      - public location of the PDF, attached to every chunk
#   filename - local copy of the PDF; this is the file that actually gets parsed
sources = [
    {
        "title": "2023",
        "url": "https://www.td.com/content/dam/tdcom/canada/about-td/pdf/quarterly-results/2023/2023-annual-report-e.pdf",
        "filename": "/Users/clemensadolphs/git-personal/secsplorer/2023-annual-report-e.pdf",
    },
    {
        "title": "2022",
        "url": "https://www.td.com/document/PDF/ar2022/ar2022-Complete-Report.pdf",
        # NOTE(review): this entry used to repeat the 2023 PDF path, which
        # would index the 2023 report a second time under the 2022 title and
        # url. The path below follows the 2023 entry's convention (same
        # directory, basename taken from the url) - confirm the local file
        # name.
        "filename": "/Users/clemensadolphs/git-personal/secsplorer/ar2022-Complete-Report.pdf",
    },
]
def load() -> List[Dict[str, str]]:
    """
    Parses every PDF listed in ``sources`` and splits it into chunks.

    Each file is partitioned with ``partition_pdf`` and the resulting
    elements are grouped with ``chunk_by_title``. Every chunk becomes one
    dict with the keys ``title`` and ``url`` (copied from the source entry)
    and ``text`` (the chunk converted with ``str``).
    """
    print("Loading documents...")
    documents = []
    for src in sources:
        pieces = chunk_by_title(partition_pdf(filename=src["filename"]))
        documents.extend(
            {"title": src["title"], "text": str(piece), "url": src["url"]}
            for piece in pieces
        )
    return documents
def embed(docs: List[Dict[str, str]]) -> List[List[float]]:
    """
    Computes one embedding per document through the Cohere client ``co``.

    The ``text`` field of each document is sent to ``co.embed`` in batches of
    at most 90 texts, using the ``embed-english-v3.0`` model with input type
    ``search_document``. The returned list holds the embeddings in the same
    order as ``docs``; an empty ``docs`` yields an empty list without any
    API call.
    """
    print("Embedding documents...")
    batch_size = 90
    embeddings = []
    for start in range(0, len(docs), batch_size):
        # Slicing stops at the end of the list, so the last batch may be
        # shorter than batch_size.
        response = co.embed(
            texts=[doc["text"] for doc in docs[start : start + batch_size]],
            model="embed-english-v3.0",
            input_type="search_document",
        )
        embeddings.extend(response.embeddings)
    return embeddings
def update_index(
    index: pinecone.Index, docs: List[Dict[str, str]], docs_embs: List[List[float]]
) -> None:
    """
    Upserts the documents and their embeddings into the given index.

    Every document is sent as an ``(id, embedding, metadata)`` tuple: the id
    is the document's position in ``docs`` rendered as a string, and the
    metadata is the document dict itself. Tuples are passed to
    ``index.upsert`` in batches of at most 100.

    Args:
        index: Object whose ``upsert(vectors=...)`` method receives the batches.
        docs: Documents to store; each one is used as the vector's metadata.
        docs_embs: Embeddings, one per document, in the same order as ``docs``.

    Raises:
        ValueError: If ``docs`` and ``docs_embs`` differ in length.
    """
    # zip() stops at the shorter input, so a length mismatch would silently
    # drop documents (or pair them with nothing); fail loudly instead.
    if len(docs) != len(docs_embs):
        raise ValueError(
            f"docs and docs_embs must have the same length, "
            f"got {len(docs)} and {len(docs_embs)}"
        )

    print("Indexing documents in Pinecone")
    batch_size = 100
    to_upsert = [
        (str(position), embedding, doc)
        for position, (embedding, doc) in enumerate(zip(docs_embs, docs))
    ]
    for start in range(0, len(to_upsert), batch_size):
        # Slicing stops at the end of the list, so the last batch may be
        # shorter than batch_size.
        index.upsert(vectors=to_upsert[start : start + batch_size])
if __name__ == "__main__":
    # Pipeline: parse and chunk the PDFs, embed every chunk, then upsert the
    # chunks together with their embeddings into the Pinecone index.
    docs = load()
    update_index(index, docs=docs, docs_embs=embed(docs))