File size: 4,437 Bytes
3975d30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# --- Configurations ---
# Folder where scraped JSON articles are saved by scrape_kb.py
SCRAPED_ARTICLES_DIR = "scraped_kb_articles"
# Path to the existing FAISS vector database created by prepare_data.py (from the PDF)
VECTOR_DB_PATH = "vector_db"
# Must match the embedding model name used in prepare_data.py, or the new
# vectors would live in a different embedding space than the existing index
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# --- 1. Load Scraped Articles from JSON ---
def load_scraped_articles(directory):
    """
    Load scraped KB articles saved as JSON files and convert them into
    LangChain Documents.

    Each JSON file is expected to hold 'title', 'content', and 'url' keys
    (missing keys fall back to defaults). Title and content are combined
    to form the Document's page_content; the URL (or filename) becomes
    the 'source' metadata.

    Args:
        directory: Path of the folder containing the *.json article files.

    Returns:
        list[Document]: One Document per successfully parsed JSON file;
        an empty list if the directory does not exist or has no JSON files.
    """
    articles = []
    print(f"Searching for JSON articles in folder: {directory}")
    if not os.path.exists(directory):
        print(f"Warning: Scraped articles directory not found: {directory}")
        return articles

    # sorted() makes the load order deterministic across filesystems.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(directory, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Combine title and content for the Document's page_content
            full_content = f"Title: {data.get('title', 'N/A')}\n\n{data.get('content', '')}"
            articles.append(Document(
                page_content=full_content,
                metadata={"source": data.get('url', filename), "title": data.get('title', '')}
            ))
        except Exception as e:
            # Deliberately best-effort: one bad file must not abort the run.
            # Bug fix: report WHICH file failed (was hard-coded "(unknown)").
            print(f"Error loading or processing file {filepath}: {e}")
    print(f"Loaded {len(articles)} scraped KB articles.")
    return articles

# --- 2. Split New Documents into Chunks ---
def split_documents_into_chunks(documents):
    """
    Break a list of LangChain Documents into smaller, overlapping chunks.

    Mirrors the chunking configuration used for the PDF pipeline
    (1000-character chunks with a 200-character overlap) so that all
    entries in the shared vector database are chunked consistently.

    Args:
        documents: The LangChain Documents to split.

    Returns:
        The resulting list of chunked Documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    pieces = splitter.split_documents(documents)
    print(f"Documents split into {len(pieces)} new chunks.")
    return pieces

# --- Main Vector Database Update Logic ---
if __name__ == "__main__":
    print("Starting the process of updating the vector database with KB articles...")

    # Load the embedding model (the same one used for the PDF)
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Embedding model loaded.")

    # 1. Load the scraped JSON articles
    new_documents = load_scraped_articles(SCRAPED_ARTICLES_DIR)

    if not new_documents:
        print("No new articles found in the scraped data folder to add to the database. Exiting.")
        exit()

    # 2. Split the new documents into chunks
    new_chunks = split_documents_into_chunks(new_documents)

    # 3. Load the existing FAISS vector database (from the PDF)
    print(f"Loading existing FAISS vector database from: {VECTOR_DB_PATH}...")
    try:
        # Ensure the 'vector_db' was created with 'prepare_data.py' first
        vector_db = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
        print("Existing FAISS vector database loaded successfully.")
    except Exception as e:
        print(f"Error loading existing FAISS vector database: {e}")
        print("Please ensure the 'vector_db' database was created with 'prepare_data.py' BEFORE running this script.")
        exit()

    # 4. Add the new chunks to the existing database
    print(f"Adding {len(new_chunks)} new chunks to the FAISS database...")
    # The add_documents method adds the new documents and their embeddings to the existing index
    vector_db.add_documents(new_chunks)
    print("New chunks added to the database.")

    # 5. Save the updated FAISS vector database
    print(f"Saving the updated FAISS vector database to: {VECTOR_DB_PATH}...")
    vector_db.save_local(VECTOR_DB_PATH)
    print("FAISS vector database updated and saved successfully!")
    print("\nNow, run your Streamlit application ('streamlit run app.py') to see your assistant with the new knowledge!")