Spaces:

felipelemes
/

databricks-rag-assistant

Runtime error

App Files Files Community

Felipe Lemes committed on Jul 30, 2025

Commit

e1b25d2

1 Parent(s): b0ce1fa

Update from GitHub push

Browse files

Files changed (10) hide show

.github/workflows/deploy-to-hf.yml +25 -0
hf-space/.gitattributes +2 -0
hf-space/.github/workflows/main.yml +19 -0
hf-space/.gitignore +169 -0
hf-space/LICENSE +21 -0
hf-space/app.py +148 -0
hf-space/prepare_data.py +44 -0
hf-space/requirements.txt +0 -0
hf-space/scrape_kb.py +147 -0
hf-space/update_vector_db_with_kb.py +100 -0

.github/workflows/deploy-to-hf.yml ADDED Viewed

	@@ -0,0 +1,25 @@

+name: Deploy to Hugging Face Space
+on:
+  push:
+    branches: [ master ]  # or "main" if your GitHub repository uses that name
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+      - name: Push to Hugging Face Space
+        run: |
+          git config --global user.email "felipe@gmail.com"
+          git config --global user.name "Felipe Lemes"
+          # Clone OUTSIDE the workspace. Cloning into ./hf-space and then rsync-ing ./ into it
+          # copies the clone into itself, nesting the Space's own files under hf-space/.
+          HF_DIR="$RUNNER_TEMP/hf-space"
+          git clone "https://felipelemes:${HF_TOKEN}@huggingface.co/spaces/felipelemes/databricks-rag-assistant" "$HF_DIR"
+          rsync -av --exclude='.git' ./ "$HF_DIR/"
+          cd "$HF_DIR"
+          git add .
+          # Commit only when something is staged: "git commit" exits non-zero on a clean
+          # tree, which would fail the job for pushes that change nothing in the Space.
+          git diff --cached --quiet || git commit -m "Update from GitHub push"
+          git push
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}

hf-space/.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Auto detect text files and perform LF normalization
2	+ * text=auto

hf-space/.github/workflows/main.yml ADDED Viewed

	@@ -0,0 +1,19 @@

+# Pushes the checked-out repository to the Hugging Face Space on every push to main.
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          # fetch-depth 0 checks out the full history; lfs also downloads Git LFS objects.
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          # Read from the repository secret named HF_TOKEN.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # The token is embedded in the remote URL; the push targets the Space's main branch.
+        run: git push https://felipelemes:$HF_TOKEN@huggingface.co/spaces/felipelemes/databricks-rag-assistant main

hf-space/.gitignore ADDED Viewed

	@@ -0,0 +1,169 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Ignore large data directories
+venv/
+data/
+scraped_kb_articles/
+vector_db/
+models/

hf-space/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Felipe Lemes
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

hf-space/app.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import streamlit as st
+import os
+from langchain.embeddings import SentenceTransformerEmbeddings
+from langchain.vectorstores import FAISS
+from langchain_openai import ChatOpenAI
+from langchain.chains import RetrievalQA
+from langchain.prompts import ChatPromptTemplate
+# --- Path Configurations ---
+VECTOR_DB_PATH = "vector_db"
+EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
+# --- 1. Load Resources (Vector Database and Embedding Model) ---
+# @st.cache_resource loads these components only once when the Streamlit app starts
+@st.cache_resource
+def load_resources():
+    st.spinner("Loading embedding model...")
+    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
+    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
+    print("Embedding model loaded.")
+    st.spinner("Loading vector database...")
+    print(f"Loading FAISS vector database from: {VECTOR_DB_PATH}...")
+    # allow_dangerous_deserialization=True is needed for FAISS.load_local
+    # It's safe to use if you generated the database yourself.
+    vector_db = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
+    print("Vector database loaded.")
+    return embeddings, vector_db
+embeddings, vector_db = load_resources()
+# --- 2. Load and Configure the OpenAI LLM (GPT-4o) ---
+openai_api_key = os.getenv("OPENAI_API_KEY")
+if openai_api_key:
+    try:
+        llm = ChatOpenAI(
+            temperature=0.85, # Controls creativity/randomness (0.0 to 1.0)
+            api_key=openai_api_key,
+            model_name="gpt-4o",
+            model_kwargs={"top_p": 0.9} # Controls diversity of output
+        )
+        st.success("OpenAI model (gpt-4o) loaded successfully!")
+    except Exception as e:
+        st.error(f"Error initializing OpenAI model. Check your API key, "
+                 f"model name, and plan/quotas: {e}")
+        st.stop() # Stop the app if LLM cannot be initialized
+else:
+    st.error("OpenAI API Key (OPENAI_API_KEY) not found in environment variables.")
+    st.stop() # Stop the app if API key is not found
+# --- 3. Define the System Prompt for Assistant Behavior ---
+SYSTEM_PROMPT_TEMPLATE = """
+You are a friendly, experienced, and patient study tutor specializing in Databricks.
+Your goal is to help the user deeply understand topics from Databricks documentation to prepare for Databricks certifications.
+Follow these guidelines:
+1.  **Always respond in the same language as the user's question.** If the question is in Portuguese, reply in Portuguese. If it's in English, reply in English.
+2.  **Explain clearly and concisely:** Use accessible language and avoid unnecessary jargon where possible.
+3.  **Go beyond simple retrieval:** Do not just reproduce information. Interpret it, reorganize it, and present it in a didactic way.
+4.  **Provide practical examples:** If appropriate, create small examples or analogies to illustrate the concept within the context of Databricks or data engineering scenarios.
+5.  **Maintain an encouraging and motivating tone:** Encourage the user in their learning.
+6.  **Use the provided "Context Documents" to answer the question.** Prioritize information from these documents.
+7.  **If the answer is not in the context documents, be honest:** State that you could not find the information and suggest the user search other sources or rephrase the question. Do not invent information.
+8.  Format your responses legibly, using lists, bold text, or code blocks when appropriate.
+Context Documents:
+{context}
+User Question:
+{question}
+"""
+# Create a ChatPromptTemplate from the System Prompt
+qa_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", SYSTEM_PROMPT_TEMPLATE),
+        ("human", "{question}") # Where the user's question will be inserted
+    ]
+)
+# --- 4. Configure the RAG Chain (RetrievalQA) ---
+print("Configuring the RAG chain...")
+qa_chain = RetrievalQA.from_chain_type(
+    llm=llm, # <-- THIS IS THE CORRECTED LINE!
+    chain_type="stuff", # 'stuff' strategy puts all retrieved documents directly into the LLM's prompt
+    retriever=vector_db.as_retriever(search_kwargs={"k": 4}), # Configure FAISS as the retriever
+                                                              # k=4 means it retrieves the 4 most relevant chunks
+    return_source_documents=True, # Optional: returns the documents that were used for the answer
+    chain_type_kwargs={"prompt": qa_prompt} # Pass the custom prompt to the chain
+)
+print("RAG chain configured.")
+# --- 5. Streamlit Interface ---
+st.set_page_config(
+    page_title="📚 Databricks Study Assistant with RAG",
+    layout="wide",
+    initial_sidebar_state="collapsed"
+)
+st.title("📚 Databricks Study Assistant with RAG") # Updated title text
+# Updated objective description
+st.markdown("""
+    This assistant is designed to provide you with precise, context-aware answers directly sourced from the official Azure Databricks documentation.
+    It aims to significantly aid your studies for Databricks certifications and streamline the process of resolving technical challenges by offering a more fluid and natural consultation experience.
+""")
+# Updated context description
+st.markdown("""
+    This assistant's knowledge base is built upon the official Azure Databricks documentation
+    ([https://learn.microsoft.com/en-us/azure/databricks/](https://learn.microsoft.com/en-us/azure/databricks/))
+    and the official Databricks Azure Knowledge Base
+    ([https://kb.databricks.com/](https://kb.databricks.com/)).
+""")
+user_query = st.text_input(
+    "Your question about Databricks documentation:",
+    placeholder="Ex: How to configure Auto Loader in Databricks?"
+)
+if st.button("Get Answer", type="primary"):
+    if user_query:
+        with st.spinner("Searching and generating response..."):
+            try:
+                response = qa_chain({"query": user_query})
+                st.subheader("Answer:")
+                st.markdown(response["result"]) # Use markdown for formatting the response
+                st.subheader("Source Documents:")
+                if response["source_documents"]:
+                    for i, doc in enumerate(response["source_documents"]):
+                        st.write(f"**Page/Source {i+1}:**")
+                        st.info(doc.page_content) # Content of the chunk
+                        if 'page' in doc.metadata: # If the PDF loader added the page number
+                            st.write(f"*(Page: {doc.metadata['page'] + 1})*") # +1 because it's 0-based
+                        st.markdown("---")
+                else:
+                    st.info("No relevant source documents found for this question.")
+            except Exception as e:
+                st.error(f"An error occurred while processing your question: {e}")
+                st.info("Please check your OpenAI API key, model name, and plan/quotas.")
+    else:
+        st.warning("Please type your question before submitting.")
+st.markdown("---")
+st.caption("Developed by you, with LangChain, Streamlit, and LLMs.")

hf-space/prepare_data.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import os
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import SentenceTransformerEmbeddings
+from langchain.vectorstores import FAISS
+# --- Configurations ---
+PDF_PATH = "data/azure-databricks.pdf" # Path to PDF file
+EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Embedding model name to be used
+VECTOR_DB_PATH = "vector_db" # Folder where the vector database will be saved
+# --- 1. Load the PDF ---
+print(f"Loading PDF from: {PDF_PATH}...")
+try:
+    loader = PyPDFLoader(PDF_PATH)
+    documents = loader.load()
+    print(f"PDF loaded successfully! Total of {len(documents)} pages.")
+except Exception as e:
+    print(f"Error loading PDF: {e}")
+    print("Please ensure the PDF file exists and the path is correct.")
+    exit() # Stop the script if an error occurs
+# --- 2. Split the text into chunks ---
+print("Splitting text into chunks...")
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1000,       # Maximum size of each chunk (in characters)
+    chunk_overlap=200,     # How many characters chunks can overlap (to maintain context)
+    length_function=len    # Function to calculate chunk length
+)
+chunks = text_splitter.split_documents(documents)
+print(f"Text split into {len(chunks)} chunks.")
+# --- 3. Create Embeddings and Store in FAISS ---
+print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
+embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
+print("Generating embeddings and creating the FAISS vector database...")
+# Create the vector database from the chunks and embeddings
+vector_db = FAISS.from_documents(chunks, embeddings)
+# --- 4. Save the Vector Database ---
+print(f"Saving the vector database to: {VECTOR_DB_PATH}...")
+vector_db.save_local(VECTOR_DB_PATH)
+print("Vector database created and saved successfully!")

hf-space/requirements.txt ADDED Viewed

Binary file (302 Bytes). View file

hf-space/scrape_kb.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import requests
+from bs4 import BeautifulSoup
+import os
+import time
+from urllib.parse import urljoin
+import json
+# --- Configurations ---
+# Base used with urljoin() to turn the relative links found on the listing page into full URLs
+BASE_URL = "https://kb.databricks.com"
+START_URL = "https://kb.databricks.com/en_US/azure" # URL of the main listing page
+OUTPUT_DIR = "scraped_kb_articles" # Folder to save extracted articles (in JSON format)
+# Request headers passed to every requests.get() call made by fetch_page_content()
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'
+}
+# Create output directory if it doesn't exist
+# NOTE(review): this runs at import time, not only when the script is executed directly.
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+def fetch_page_content(url, delay=1):
+    """Function to fetch HTML content from a URL with error handling and delay."""
+    print(f"Fetching: {url}")
+    try:
+        response = requests.get(url, headers=headers, timeout=30) # Increased timeout to 30s
+        response.raise_for_status() # Raises an HTTPError for bad status codes (4xx or 5xx)
+        time.sleep(delay) # Pause to be polite to the website server
+        return response.text
+    except requests.exceptions.RequestException as e:
+        print(f"Error accessing {url}: {e}")
+        return None
+def parse_listing_page(html_content):
+    """
+    Function to parse the listing page and extract links and titles of ALL articles.
+    Returns a list of dictionaries: [{'title': '...', 'url': '...'}]
+    """
+    soup = BeautifulSoup(html_content, 'html.parser')
+    articles_data = []
+    # Find all article containers on the main page
+    # The selector is 'div.row[data-helpjuice-element="SubCategory Article"]'
+    # meaning: div that has class 'row' AND attribute 'data-helpjuice-element' equal to "SubCategory Article"
+    article_containers = soup.find_all('div', class_='row', attrs={'data-helpjuice-element': 'SubCategory Article'})
+    if not article_containers:
+        print("Warning: No article containers found on the listing page with the specified selector.")
+        print("This might indicate that the HTML has changed or content is loaded dynamically via JavaScript.")
+        return articles_data
+    for container in article_containers:
+        # Try to find the main article link (the first <a> inside the container)
+        link_tag = container.find('a', href=True)
+        if link_tag:
+            relative_url = link_tag['href']
+            full_url = urljoin(BASE_URL, relative_url) # Constructs the full URL
+            # Try to find the article title (h3 inside the link)
+            title_tag = container.find('h3', attrs={'data-helpjuice-element': 'SubCategory Article Title'})
+            title = title_tag.get_text(strip=True) if title_tag else "Unknown Title"
+            # Add article data to the list
+            articles_data.append({'title': title, 'url': full_url})
+        else:
+            print(f"Warning: Article container with no valid main link found: {container.prettify()[:200]}...")
+    return articles_data
+def scrape_article_content(article_url):
+    """
+    Function to fetch the content of an individual article page, using the validated selector.
+    Returns a dictionary with 'url', 'title', and 'content'.
+    """
+    html_content = fetch_page_content(article_url, delay=2) # Pause a bit more for individual article content
+    if not html_content:
+        return None
+    soup = BeautifulSoup(html_content, 'html.parser')
+    # Extract title (Main article question/title)
+    # Validated for h1 with class article-title
+    title_tag = soup.find('h1', class_='article-title')
+    title = title_tag.get_text(strip=True) if title_tag else "Unknown Article Title"
+    # Extract article body (Answer/Main content)
+    # Validated for div with class 'helpjuice-article-body-content'
+    body_content_div = soup.find('div', class_='helpjuice-article-body-content')
+    content = ""
+    if body_content_div:
+        # Extracts all text within this div, using '\n' to separate blocks and strip whitespace
+        content = body_content_div.get_text(separator='\n', strip=True)
+    else:
+        print(f"Warning: Article body with class 'helpjuice-article-body-content' NOT found for {article_url}")
+        print("This might be a JavaScript loading issue or a different HTML structure for this article.")
+    return {'url': article_url, 'title': title, 'content': content}
+# --- Main Scraping Logic ---
+if __name__ == "__main__":
+    print(f"Starting scraping process from: {START_URL}")
+    # 1. Fetch HTML from the main listing page
+    list_page_html = fetch_page_content(START_URL, delay=3) # Increased pause for the main page
+    all_article_links = []
+    if list_page_html:
+        # 2. Parse the listing page and collect ALL article links
+        articles_on_main_page = parse_listing_page(list_page_html)
+        all_article_links.extend(articles_on_main_page)
+        print(f"Total of {len(all_article_links)} article links collected from the main page.")
+    else:
+        print("Could not proceed, error fetching the initial listing page.")
+        exit() # Stop the script if the initial page cannot be accessed
+    scraped_articles_data = []
+    # 3. Iterate over each article link and scrape the full content
+    for i, article_link_info in enumerate(all_article_links):
+        print(f"Scraping article {i+1}/{len(all_article_links)}: {article_link_info['title']}")
+        # Check if the JSON file for this article already exists
+        file_name_hash = article_link_info['url'].split('/')[-1] # Base filename from URL
+        output_filepath = os.path.join(OUTPUT_DIR, f"{file_name_hash}.json")
+        if os.path.exists(output_filepath):
+            print(f"  Article already scraped and saved: {output_filepath}. Skipping.")
+            try: # Try to load to include in total if it exists
+                with open(output_filepath, 'r', encoding='utf-8') as f:
+                    scraped_articles_data.append(json.load(f))
+            except Exception as e:
+                print(f"  Error loading existing file {output_filepath}: {e}")
+            continue # Skip to the next article
+        article_content = scrape_article_content(article_link_info['url'])
+        if article_content:
+            scraped_articles_data.append(article_content)
+            # Save the content as JSON in a file for reference and debugging
+            try:
+                with open(output_filepath, 'w', encoding='utf-8') as f:
+                    json.dump(article_content, f, ensure_ascii=False, indent=4)
+            except Exception as e:
+                print(f"  Error saving JSON file {output_filepath}: {e}")
+    print(f"\nScraping of {len(scraped_articles_data)} articles completed and saved/loaded from '{OUTPUT_DIR}'.")
+    print("\n--- Next Steps ---")
+    print("1. Knowledge Base articles scraped and saved as JSONs in the 'scraped_kb_articles' folder.")
+    print("2. Now, run the 'update_vector_db_with_kb.py' script to integrate this data into your FAISS vector database.")
+    print("   `python update_vector_db_with_kb.py`")

hf-space/update_vector_db_with_kb.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import os
+import json
+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import SentenceTransformerEmbeddings
+from langchain.vectorstores import FAISS
+# --- Configurations ---
+# Folder where scraped JSON articles are saved by scrape_kb.py
+# (same value as OUTPUT_DIR in that script)
+SCRAPED_ARTICLES_DIR = "scraped_kb_articles"
+# Path to your existing FAISS vector database (from PDF); it is loaded from this folder
+# and saved back to it after the new chunks are added
+VECTOR_DB_PATH = "vector_db"
+# Same embedding model name used in prepare_data.py; the new chunks are embedded with it
+# before being added to that index
+EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
+# --- 1. Load Scraped Articles from JSON ---
+def load_scraped_articles(directory):
+    """
+    Loads articles saved as JSONs and converts them into LangChain Documents.
+    Combines title and content to form 'page_content'.
+    """
+    articles = []
+    print(f"Searching for JSON articles in folder: {directory}")
+    if not os.path.exists(directory):
+        print(f"Warning: Scraped articles directory not found: {directory}")
+        return articles
+    for filename in os.listdir(directory):
+        if filename.endswith(".json"):
+            filepath = os.path.join(directory, filename)
+            try:
+                with open(filepath, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                    # Combine title and content for the Document's page_content
+                    full_content = f"Title: {data.get('title', 'N/A')}\n\n{data.get('content', '')}"
+                    articles.append(Document(
+                        page_content=full_content,
+                        metadata={"source": data.get('url', filename), "title": data.get('title', '')}
+                    ))
+            except Exception as e:
+                print(f"Error loading or processing file {filename}: {e}")
+    print(f"Loaded {len(articles)} scraped KB articles.")
+    return articles
+# --- 2. Split New Documents into Chunks ---
+def split_documents_into_chunks(documents):
+    """
+    Splits a list of LangChain Documents into smaller chunks.
+    Uses the same chunk_size and chunk_overlap settings as the PDF processing.
+    """
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len
+    )
+    chunks = text_splitter.split_documents(documents)
+    print(f"Documents split into {len(chunks)} new chunks.")
+    return chunks
+# --- Main Vector Database Update Logic ---
+if __name__ == "__main__":
+    print("Starting the process of updating the vector database with KB articles...")
+    # Load the embedding model (the same one used for the PDF)
+    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
+    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
+    print("Embedding model loaded.")
+    # 1. Load the scraped JSON articles
+    new_documents = load_scraped_articles(SCRAPED_ARTICLES_DIR)
+    if not new_documents:
+        print("No new articles found in the scraped data folder to add to the database. Exiting.")
+        exit()
+    # 2. Split the new documents into chunks
+    new_chunks = split_documents_into_chunks(new_documents)
+    # 3. Load the existing FAISS vector database (from the PDF)
+    print(f"Loading existing FAISS vector database from: {VECTOR_DB_PATH}...")
+    try:
+        # Ensure the 'vector_db' was created with 'prepare_data.py' first
+        vector_db = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
+        print("Existing FAISS vector database loaded successfully.")
+    except Exception as e:
+        print(f"Error loading existing FAISS vector database: {e}")
+        print("Please ensure the 'vector_db' database was created with 'prepare_data.py' BEFORE running this script.")
+        exit()
+    # 4. Add the new chunks to the existing database
+    print(f"Adding {len(new_chunks)} new chunks to the FAISS database...")
+    # The add_documents method adds the new documents and their embeddings to the existing index
+    vector_db.add_documents(new_chunks)
+    print("New chunks added to the database.")
+    # 5. Save the updated FAISS vector database
+    print(f"Saving the updated FAISS vector database to: {VECTOR_DB_PATH}...")
+    vector_db.save_local(VECTOR_DB_PATH)
+    print("FAISS vector database updated and saved successfully!")
+    print("\nNow, run your Streamlit application ('streamlit run app.py') to see your assistant with the new knowledge!")