File size: 4,760 Bytes
0001f12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import hashlib
import os
from glob import glob
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from runtime_config import CHROMA_DB_DIR, KNOWLEDGE_BASE_PATH


def build_vector_db(
    persist_directory: str | None = None,
    knowledge_base_path: str | None = None,
):
    """
    Build or load a PDF-backed vector database and return a retriever.

    Args:
        persist_directory: Where the Chroma DB is stored; defaults to
            ``CHROMA_DB_DIR`` from runtime_config.
        knowledge_base_path: Directory scanned (recursively) for ``*.pdf``
            files; defaults to ``KNOWLEDGE_BASE_PATH``.

    Returns:
        A retriever over the vector store (top-5 similarity search), or a
        no-op retriever when no PDFs are available.

    Raises:
        ValueError: If PDFs were found but none could be loaded.
    """

    persist_path = Path(persist_directory).resolve() if persist_directory else CHROMA_DB_DIR
    knowledge_path = (
        Path(knowledge_base_path).resolve()
        if knowledge_base_path
        else KNOWLEDGE_BASE_PATH
    )

    # Missing knowledge base: create it and hand back an empty retriever so
    # the caller can still run.
    if not knowledge_path.exists():
        knowledge_path.mkdir(parents=True, exist_ok=True)
        print(f"Knowledge base directory '{knowledge_path}' was created but has no PDF files.")
        print("Place your PDF files there and restart the application.")
        return _empty_retriever()

    pdf_files = glob(os.path.join(str(knowledge_path), "**/*.pdf"), recursive=True)
    if not pdf_files:
        print(f"No PDF files found in '{knowledge_path}'.")
        return _empty_retriever()

    print(f"Found {len(pdf_files)} PDF files for the knowledge base.")

    all_docs = []
    processed_hashes = set()

    for pdf_file in pdf_files:
        try:
            print(f"Loading {os.path.basename(pdf_file)}...")
            loader = PyPDFLoader(pdf_file)
            docs = loader.load()

            for doc in docs:
                # md5 is used purely for duplicate-page detection, not security.
                content_hash = hashlib.md5(doc.page_content.encode()).hexdigest()
                if content_hash in processed_hashes:
                    continue

                processed_hashes.add(content_hash)
                doc.metadata.update(
                    {
                        "source": os.path.basename(pdf_file),
                        "source_path": pdf_file,
                        "file_size": f"{os.path.getsize(pdf_file) / 1024:.1f}KB",
                        "content_hash": content_hash,
                    }
                )
                all_docs.append(doc)
        except Exception as error:
            # Best-effort: a corrupt PDF should not abort the whole build.
            print(f"Failed to load {os.path.basename(pdf_file)}: {error}")

    if not all_docs:
        raise ValueError("No valid PDF content was successfully loaded.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", ".", "!", "?", ";", ",", " ", ""],
    )
    splits = text_splitter.split_documents(all_docs)

    persist_path.mkdir(parents=True, exist_ok=True)
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    if any(persist_path.iterdir()):
        print(f"Loading existing vector database from {persist_path}...")
        vectorstore = Chroma(
            persist_directory=str(persist_path),
            embedding_function=embeddings,
        )

        existing_sources = vectorstore.get().get("metadatas", [])
        existing_files = {
            metadata.get("source_path")
            for metadata in existing_sources
            if isinstance(metadata, dict) and metadata.get("source_path")
        }

        # BUG FIX: previously ALL splits were re-added whenever any new PDF
        # appeared, duplicating chunks from already-indexed files on every
        # run. Only index the splits that belong to genuinely new files.
        new_files = {pdf_file for pdf_file in pdf_files if pdf_file not in existing_files}
        if new_files:
            print("New PDF files detected. Updating vector store...")
            new_splits = [
                split for split in splits
                if split.metadata.get("source_path") in new_files
            ]
            if new_splits:
                vectorstore.add_documents(new_splits)
    else:
        print(f"Creating new vector database at {persist_path}...")
        vectorstore = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory=str(persist_path),
        )

    print(f"Vector database ready at {persist_path}.")
    return vectorstore.as_retriever(search_kwargs={"k": 5})


def _empty_retriever():
    """Return a fallback retriever whose searches always come back empty.

    Used when the knowledge base directory is missing or holds no PDFs,
    so callers still receive a valid retriever object.
    """
    # Imported lazily so the module can load even without langchain_core.
    from langchain_core.retrievers import BaseRetriever

    class _NoResultsRetriever(BaseRetriever):
        """Retriever stub that yields no documents for any query."""

        def _get_relevant_documents(self, query):
            return []

        async def _aget_relevant_documents(self, query):
            return []

    return _NoResultsRetriever()


if __name__ == "__main__":
    # Smoke test: build the retriever and run a few sample queries.
    retriever = build_vector_db()
    sample_queries = [
        "What is P/E ratio?",
        "What are the principles of value investing?",
        "How to analyze financial statements?",
    ]
    for query in sample_queries:
        results = retriever.invoke(query)
        print(f"\nQuery: {query}")
        if not results:
            print("No relevant content found.")
            continue
        # Show at most the top two hits per query.
        for rank, doc in enumerate(results[:2], start=1):
            source = doc.metadata.get('source', 'Unknown')
            print(f"{rank}. [{source}] {doc.page_content[:100]}...")