File size: 1,826 Bytes
6d12932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import shutil
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) at import time so the
# os.getenv() lookups for the AZURE_OPENAI_* settings in ingest() can find them.
load_dotenv()

# Directory that ingest() scans for source PDF files.
DATA_FOLDER = "fons_knowledge_base"
# Directory used as the Chroma persist_directory; ingest() deletes it on start.
DB_FOLDER = "chroma_db_fons"
# Passed to AzureOpenAIEmbeddings as `azure_deployment`, i.e. it is used as the
# Azure deployment name (NOTE(review): confirm the deployment is named after the model).
EMBEDDING_MODEL = "text-embedding-ada-002"


def ingest():
    """Rebuild the Chroma vector store from the PDFs in ``DATA_FOLDER``.

    The existing ``DB_FOLDER`` is removed and recreated from scratch. Each PDF
    is loaded, split into overlapping chunks, embedded through the Azure
    OpenAI deployment named by ``EMBEDDING_MODEL`` and added to the store.

    Ingestion is best-effort per file: a PDF that fails to load or embed is
    reported and skipped, and the remaining files are still processed. A
    summary of indexed and skipped files is printed at the end.

    Raises:
        OSError: if ``DATA_FOLDER`` cannot be listed (e.g. it does not
            exist). This is raised *before* the existing database is
            deleted, so a misconfigured data folder does not destroy the
            current index.
    """
    print("🧠 Building Knowledge Graph...")

    # List the inputs before touching the old database: if DATA_FOLDER is
    # missing, os.listdir raises here and the existing index is left intact.
    # Sorting makes the processing order (and progress output) deterministic;
    # the case-insensitive match also picks up files named "*.PDF".
    files = sorted(
        name for name in os.listdir(DATA_FOLDER)
        if name.lower().endswith('.pdf')
    )

    if os.path.exists(DB_FOLDER):
        shutil.rmtree(DB_FOLDER)

    embeddings = AzureOpenAIEmbeddings(
        azure_deployment=EMBEDDING_MODEL,
        openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    )

    vector_store = Chroma(
        persist_directory=DB_FOLDER,
        embedding_function=embeddings
    )
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )

    total = len(files)
    print(f"   Processing {total} PDFs...")

    indexed = 0
    failed = []  # names of PDFs that raised while loading/embedding
    for position, name in enumerate(files, start=1):
        try:
            pages = PyPDFLoader(os.path.join(DATA_FOLDER, name)).load()
            chunks = text_splitter.split_documents(pages) if pages else []
            # Skip files that produced no text; there is nothing to embed.
            if chunks:
                vector_store.add_documents(chunks)
                indexed += 1
        except Exception as exc:
            # Deliberately broad: one bad PDF (or one failed embedding call)
            # must not abort the whole rebuild, but it must be visible.
            failed.append(name)
            print(f"   ⚠️  Skipped {name}: {exc!r}")

        # Progress is reported for every 10th file regardless of its outcome.
        if position % 10 == 0:
            print(
                f"   [{position}/{total}] Indexed...",
                end="\r"
            )

    if failed:
        print(
            f"\n⚠️  Database rebuilt with errors: indexed {indexed}/{total} "
            f"PDFs, {len(failed)} failed."
        )
    else:
        print("\n🎉 Database Rebuilt Successfully!")
        print(f"   Indexed {indexed}/{total} PDFs.")


# Run the ingestion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    ingest()