# Source: Hugging Face Spaces file view (Space status: Runtime error).
# File size: 2,616 bytes. Revisions: 99e964c, bcf7c58, 99e964c.
import cohere
import os
import pinecone
from typing import List, Dict
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf
from dotenv import load_dotenv
# Load settings via python-dotenv before the API keys are read below
# (presumably from a local .env file — confirm where it is expected to live).
load_dotenv()
# Cohere client used by embed(); a missing COHERE_API_KEY raises KeyError here,
# at import time.
co = cohere.Client(os.environ["COHERE_API_KEY"])
# Pinecone client plus a handle to the "td-sec-embeddings" index, which is
# passed to update_index() by the script entry point.
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("td-sec-embeddings")
# Annual reports to ingest. For every entry:
#   "title"    - label copied onto each chunk produced from the report
#   "url"      - public download location, copied onto each chunk
#   "filename" - local PDF that load() actually parses
sources = [
    {
        "title": "2023",
        "url": "https://www.td.com/content/dam/tdcom/canada/about-td/pdf/quarterly-results/2023/2023-annual-report-e.pdf",
        "filename": "/Users/clemensadolphs/git-personal/secsplorer/2023-annual-report-e.pdf",
    },
    {
        "title": "2022",
        "url": "https://www.td.com/document/PDF/ar2022/ar2022-Complete-Report.pdf",
        # This entry used to reuse the 2023 PDF path, so the 2023 report was
        # parsed twice and stored under the 2022 title/URL.
        # NOTE(review): the local file name is assumed to match the URL's
        # basename, as it does for the 2023 entry — confirm it exists on disk.
        "filename": "/Users/clemensadolphs/git-personal/secsplorer/ar2022-Complete-Report.pdf",
    },
]
def load() -> List[Dict[str, str]]:
    """
    Parses every PDF listed in ``sources`` and splits it into title-based chunks.

    Returns:
        One dict per chunk, in source order, with the keys ``"title"`` and
        ``"url"`` (copied from the source entry) and ``"text"`` (the chunk
        rendered with ``str``).
    """
    print("Loading documents...")
    records = []
    for entry in sources:
        # Partition the local PDF into elements, then group them by title.
        sections = chunk_by_title(partition_pdf(filename=entry["filename"]))
        records.extend(
            {
                "title": entry["title"],
                "text": str(section),
                "url": entry["url"],
            }
            for section in sections
        )
    return records
def embed(docs: List[Dict[str, str]]) -> List[List[float]]:
    """
    Computes one Cohere embedding per document, preserving input order.

    Args:
        docs: Documents as produced by ``load``; only the ``"text"`` value of
            each one is sent to the API.

    Returns:
        The embeddings returned by the API, concatenated batch by batch.
    """
    print("Embedding documents...")
    # NOTE(review): 90 is presumably chosen to stay below the embed endpoint's
    # per-request text limit — confirm against the Cohere API reference.
    chunk_size = 90
    vectors = []
    for start in range(0, len(docs), chunk_size):
        # Slicing clamps at the end of the list, so the last batch may be short.
        window = docs[start : start + chunk_size]
        response = co.embed(
            texts=[entry["text"] for entry in window],
            model="embed-english-v3.0",
            input_type="search_document",
        )
        vectors.extend(response.embeddings)
    return vectors
def update_index(
    index: "pinecone.Index", docs: List[Dict[str, str]], docs_embs: List[List[float]]
) -> None:
    """
    Upserts the documents and their embeddings into the given index.

    Every document is sent as an ``(id, embedding, document)`` tuple, where the
    id is the document's position in ``docs`` rendered as a string. Vectors are
    sent in batches of 100 through ``index.upsert``.

    Args:
        index: Object exposing ``upsert(vectors=...)``.
        docs: Documents to store alongside their vectors.
        docs_embs: Embeddings, where ``docs_embs[i]`` belongs to ``docs[i]``.

    Raises:
        ValueError: If ``docs`` and ``docs_embs`` differ in length. Pairing
            them anyway would silently drop the unmatched tail.
    """
    if len(docs) != len(docs_embs):
        raise ValueError(
            f"docs and docs_embs must have the same length, "
            f"got {len(docs)} and {len(docs_embs)}"
        )
    print("Indexing documents in Pinecone")
    batch_size = 100
    to_upsert = [
        (str(position), embedding, doc)
        for position, (embedding, doc) in enumerate(zip(docs_embs, docs))
    ]
    # Batch over what was actually built so no empty batch is ever sent.
    for start in range(0, len(to_upsert), batch_size):
        index.upsert(vectors=to_upsert[start : start + batch_size])
if __name__ == "__main__":
    # Full ingestion run: parse the PDFs, embed every chunk, then push the
    # chunks and their vectors into the module-level Pinecone index.
    documents = load()
    embeddings = embed(documents)
    update_index(index, docs=documents, docs_embs=embeddings)
|