File size: 2,616 Bytes
99e964c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bcf7c58
 
99e964c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import cohere
import os
import pinecone

from typing import List, Dict

from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf

from dotenv import load_dotenv

# Load API keys from a local .env file into the process environment.
load_dotenv()

# Cohere client used for embedding; requires COHERE_API_KEY to be set
# (os.environ[...] raises KeyError if it is missing).
co = cohere.Client(os.environ["COHERE_API_KEY"])

# Pinecone client and the target index; requires PINECONE_API_KEY.
# NOTE(review): assumes the "td-sec-embeddings" index already exists —
# this script does not create it.
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("td-sec-embeddings")


# Annual-report PDFs to ingest. Each entry carries a display title, the
# public source URL, and the local path of the downloaded PDF.
# NOTE(review): paths are hard-coded to one developer's machine — consider
# making them configurable (env var / CLI arg).
sources = [
    {
        "title": "2023",
        "url": "https://www.td.com/content/dam/tdcom/canada/about-td/pdf/quarterly-results/2023/2023-annual-report-e.pdf",
        "filename": "/Users/clemensadolphs/git-personal/secsplorer/2023-annual-report-e.pdf",
    },
    {
        "title": "2022",
        "url": "https://www.td.com/document/PDF/ar2022/ar2022-Complete-Report.pdf",
        # Fix: previously pointed at the 2023 PDF (copy-paste error), so the
        # 2022 report was silently ingested as a duplicate of 2023.
        "filename": "/Users/clemensadolphs/git-personal/secsplorer/ar2022-Complete-Report.pdf",
    },
]


def load() -> List[Dict[str, str]]:
    """
    Partition each source PDF into elements, chunk the elements by title,
    and return one record per chunk.

    Returns:
        A list of dicts, each with the chunk's ``text`` plus the source's
        ``title`` and ``url`` attached for use as vector metadata.
    """
    print("Loading documents...")
    # One record per (source, chunk) pair; chunk objects are stringified
    # to get their plain-text content.
    return [
        {"title": src["title"], "text": str(section), "url": src["url"]}
        for src in sources
        for section in chunk_by_title(partition_pdf(filename=src["filename"]))
    ]


def embed(docs: List[Dict[str, str]]) -> List[List[float]]:
    """
    Compute Cohere embeddings for every document's ``text`` field.

    Requests are sent in batches of 90 (kept under the Cohere API's
    per-call text limit). Output order matches the input order.

    Returns:
        One embedding vector per input document.
    """
    print("Embedding documents...")

    batch_size = 90
    embeddings: List[List[float]] = []

    for start in range(0, len(docs), batch_size):
        # Slicing past the end is safe in Python; no explicit min() needed.
        batch_texts = [doc["text"] for doc in docs[start : start + batch_size]]
        response = co.embed(
            texts=batch_texts,
            model="embed-english-v3.0",
            input_type="search_document",
        )
        embeddings.extend(response.embeddings)

    return embeddings


def update_index(
    index: pinecone.Index, docs: List[Dict[str, str]], docs_embs: List[List[float]]
) -> None:
    """
    Upsert the embedded documents into the given Pinecone index.

    Each vector is a ``(id, embedding, metadata)`` tuple where the id is the
    document's positional index as a string and the doc dict itself is the
    metadata. Upserts are issued in batches of 100.
    """
    print("Indexing documents in Pinecone")
    batch_size = 100

    # Positional string ids; zip pairs each embedding with its source doc.
    vectors = [
        (str(pos), emb, doc) for pos, (emb, doc) in enumerate(zip(docs_embs, docs))
    ]

    for start in range(0, len(docs), batch_size):
        index.upsert(vectors=vectors[start : start + batch_size])


if __name__ == "__main__":
    # Pipeline entry point: load + chunk the source PDFs, embed each chunk
    # with Cohere, then upsert (id, embedding, metadata) vectors into the
    # module-level Pinecone index.
    docs = load()
    docs_embeds = embed(docs)
    update_index(index, docs=docs, docs_embs=docs_embeds)