Spaces:
Runtime error
Runtime error
Felipe Lemes committed on
Commit ·
e1b25d2
1
Parent(s): b0ce1fa
Update from GitHub push
Browse files- .github/workflows/deploy-to-hf.yml +25 -0
- hf-space/.gitattributes +2 -0
- hf-space/.github/workflows/main.yml +19 -0
- hf-space/.gitignore +169 -0
- hf-space/LICENSE +21 -0
- hf-space/app.py +148 -0
- hf-space/prepare_data.py +44 -0
- hf-space/requirements.txt +0 -0
- hf-space/scrape_kb.py +147 -0
- hf-space/update_vector_db_with_kb.py +100 -0
.github/workflows/deploy-to-hf.yml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deploys the repository contents to the Hugging Face Space on every push.
name: Deploy to Hugging Face Space

on:
  push:
    branches: [ master ]  # or "main" if your GitHub repository uses that name

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Push to Hugging Face Space
        run: |
          git config --global user.email "felipe@gmail.com"
          git config --global user.name "Felipe Lemes"
          git clone https://felipelemes:$HF_TOKEN@huggingface.co/spaces/felipelemes/databricks-rag-assistant hf-space
          # Exclude both the Space's own .git and the clone directory itself;
          # without the second exclude, rsync copies hf-space/ into
          # hf-space/hf-space/ on every run.
          rsync -av --exclude='.git' --exclude='hf-space' ./ hf-space/
          cd hf-space
          git add .
          # "git commit" exits non-zero when there is nothing to commit,
          # which would fail the workflow on no-op pushes; only commit and
          # push when the index actually changed.
          if ! git diff --cached --quiet; then
            git commit -m "Update from GitHub push"
            git push
          fi
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
hf-space/.gitattributes
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Auto detect text files and perform LF normalization
|
| 2 |
+
* text=auto
|
hf-space/.github/workflows/main.yml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Mirrors the repository to the Hugging Face Space on every push to main.
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]
  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0  # full history is required to push the branch
          lfs: true       # fetch LFS objects so large files sync too
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # --force keeps the sync working when the Space history diverges
        # from GitHub (e.g. after edits made directly on the Hub); this is
        # the pattern shown in the Hugging Face "Managing Spaces with
        # GitHub Actions" guide. Without it the push is rejected as
        # non-fast-forward and the workflow fails.
        run: git push --force https://felipelemes:$HF_TOKEN@huggingface.co/spaces/felipelemes/databricks-rag-assistant main
|
hf-space/.gitignore
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# poetry
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 102 |
+
#poetry.lock
|
| 103 |
+
|
| 104 |
+
# pdm
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 106 |
+
#pdm.lock
|
| 107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 108 |
+
# in version control.
|
| 109 |
+
# https://pdm.fming.dev/#use-with-ide
|
| 110 |
+
.pdm.toml
|
| 111 |
+
|
| 112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 113 |
+
__pypackages__/
|
| 114 |
+
|
| 115 |
+
# Celery stuff
|
| 116 |
+
celerybeat-schedule
|
| 117 |
+
celerybeat.pid
|
| 118 |
+
|
| 119 |
+
# SageMath parsed files
|
| 120 |
+
*.sage.py
|
| 121 |
+
|
| 122 |
+
# Environments
|
| 123 |
+
.env
|
| 124 |
+
.venv
|
| 125 |
+
env/
|
| 126 |
+
venv/
|
| 127 |
+
ENV/
|
| 128 |
+
env.bak/
|
| 129 |
+
venv.bak/
|
| 130 |
+
|
| 131 |
+
# Spyder project settings
|
| 132 |
+
.spyderproject
|
| 133 |
+
.spyproject
|
| 134 |
+
|
| 135 |
+
# Rope project settings
|
| 136 |
+
.ropeproject
|
| 137 |
+
|
| 138 |
+
# mkdocs documentation
|
| 139 |
+
/site
|
| 140 |
+
|
| 141 |
+
# mypy
|
| 142 |
+
.mypy_cache/
|
| 143 |
+
.dmypy.json
|
| 144 |
+
dmypy.json
|
| 145 |
+
|
| 146 |
+
# Pyre type checker
|
| 147 |
+
.pyre/
|
| 148 |
+
|
| 149 |
+
# pytype static type analyzer
|
| 150 |
+
.pytype/
|
| 151 |
+
|
| 152 |
+
# Cython debug symbols
|
| 153 |
+
cython_debug/
|
| 154 |
+
|
| 155 |
+
# PyCharm
|
| 156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 160 |
+
#.idea/
|
| 161 |
+
|
| 162 |
+
# Ignore large data directories
|
| 163 |
+
venv/
|
| 164 |
+
data/
|
| 165 |
+
scraped_kb_articles/
|
| 166 |
+
vector_db/
|
| 167 |
+
models/
|
| 168 |
+
|
| 169 |
+
|
hf-space/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Felipe Lemes
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
hf-space/app.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
import os
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate

# --- Path Configurations ---
VECTOR_DB_PATH = "vector_db"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# --- 0. Streamlit Page Setup ---
# st.set_page_config() must be the FIRST Streamlit command executed in the
# script. It originally ran near the bottom, after st.success()/st.error(),
# which makes Streamlit raise StreamlitAPIException and crash the app on
# startup — it has been moved here to fix that.
st.set_page_config(
    page_title="📚 Databricks Study Assistant with RAG",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# --- 1. Load Resources (Vector Database and Embedding Model) ---
# @st.cache_resource loads these components only once when the Streamlit app starts
@st.cache_resource
def load_resources():
    """Load the embedding model and the FAISS index.

    Cached by Streamlit so the heavy model load happens only once per
    app process.

    Returns:
        tuple: (embeddings, vector_db)
    """
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Embedding model loaded.")

    print(f"Loading FAISS vector database from: {VECTOR_DB_PATH}...")
    # allow_dangerous_deserialization=True is needed for FAISS.load_local.
    # It's safe to use if you generated the database yourself.
    vector_db = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
    print("Vector database loaded.")

    return embeddings, vector_db

# The original called st.spinner(...) as a bare statement inside
# load_resources(); st.spinner only displays when used as a context
# manager, so the spinner now wraps the actual loading call.
with st.spinner("Loading embedding model and vector database..."):
    embeddings, vector_db = load_resources()

# --- 2. Load and Configure the OpenAI LLM (GPT-4o) ---
openai_api_key = os.getenv("OPENAI_API_KEY")

if openai_api_key:
    try:
        llm = ChatOpenAI(
            temperature=0.85,  # Controls creativity/randomness (0.0 to 1.0)
            api_key=openai_api_key,
            model_name="gpt-4o",
            model_kwargs={"top_p": 0.9}  # Controls diversity of output
        )
        st.success("OpenAI model (gpt-4o) loaded successfully!")
    except Exception as e:
        st.error(f"Error initializing OpenAI model. Check your API key, "
                 f"model name, and plan/quotas: {e}")
        st.stop()  # Stop the app if LLM cannot be initialized
else:
    st.error("OpenAI API Key (OPENAI_API_KEY) not found in environment variables.")
    st.stop()  # Stop the app if API key is not found

# --- 3. Define the System Prompt for Assistant Behavior ---
SYSTEM_PROMPT_TEMPLATE = """
You are a friendly, experienced, and patient study tutor specializing in Databricks.
Your goal is to help the user deeply understand topics from Databricks documentation to prepare for Databricks certifications.

Follow these guidelines:
1. **Always respond in the same language as the user's question.** If the question is in Portuguese, reply in Portuguese. If it's in English, reply in English.
2. **Explain clearly and concisely:** Use accessible language and avoid unnecessary jargon where possible.
3. **Go beyond simple retrieval:** Do not just reproduce information. Interpret it, reorganize it, and present it in a didactic way.
4. **Provide practical examples:** If appropriate, create small examples or analogies to illustrate the concept within the context of Databricks or data engineering scenarios.
5. **Maintain an encouraging and motivating tone:** Encourage the user in their learning.
6. **Use the provided "Context Documents" to answer the question.** Prioritize information from these documents.
7. **If the answer is not in the context documents, be honest:** State that you could not find the information and suggest the user search other sources or rephrase the question. Do not invent information.
8. Format your responses legibly, using lists, bold text, or code blocks when appropriate.

Context Documents:
{context}

User Question:
{question}
"""

# Create a ChatPromptTemplate from the System Prompt
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_PROMPT_TEMPLATE),
        ("human", "{question}")  # Where the user's question will be inserted
    ]
)

# --- 4. Configure the RAG Chain (RetrievalQA) ---
print("Configuring the RAG chain...")
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # 'stuff' puts all retrieved documents directly into the LLM's prompt
    retriever=vector_db.as_retriever(search_kwargs={"k": 4}),  # retrieve the 4 most relevant chunks
    return_source_documents=True,  # also return the documents that were used for the answer
    chain_type_kwargs={"prompt": qa_prompt}  # Pass the custom prompt to the chain
)
print("RAG chain configured.")

# --- 5. Streamlit Interface ---
st.title("📚 Databricks Study Assistant with RAG")

# Objective description
st.markdown("""
This assistant is designed to provide you with precise, context-aware answers directly sourced from the official Azure Databricks documentation.
It aims to significantly aid your studies for Databricks certifications and streamline the process of resolving technical challenges by offering a more fluid and natural consultation experience.
""")

# Knowledge-base description
st.markdown("""
This assistant's knowledge base is built upon the official Azure Databricks documentation
([https://learn.microsoft.com/en-us/azure/databricks/](https://learn.microsoft.com/en-us/azure/databricks/))
and the official Databricks Azure Knowledge Base
([https://kb.databricks.com/](https://kb.databricks.com/)).
""")

user_query = st.text_input(
    "Your question about Databricks documentation:",
    placeholder="Ex: How to configure Auto Loader in Databricks?"
)

if st.button("Get Answer", type="primary"):
    if user_query:
        with st.spinner("Searching and generating response..."):
            try:
                response = qa_chain({"query": user_query})
                st.subheader("Answer:")
                st.markdown(response["result"])  # Use markdown for formatting the response

                st.subheader("Source Documents:")
                if response["source_documents"]:
                    for i, doc in enumerate(response["source_documents"]):
                        st.write(f"**Page/Source {i+1}:**")
                        st.info(doc.page_content)  # Content of the chunk
                        if 'page' in doc.metadata:  # If the PDF loader added the page number
                            st.write(f"*(Page: {doc.metadata['page'] + 1})*")  # +1 because it's 0-based
                        st.markdown("---")
                else:
                    st.info("No relevant source documents found for this question.")
            except Exception as e:
                st.error(f"An error occurred while processing your question: {e}")
                st.info("Please check your OpenAI API key, model name, and plan/quotas.")
    else:
        st.warning("Please type your question before submitting.")

st.markdown("---")
st.caption("Developed by you, with LangChain, Streamlit, and LLMs.")
|
hf-space/prepare_data.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# --- Configurations ---
PDF_PATH = "data/azure-databricks.pdf"  # Path to PDF file
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Embedding model name to be used
VECTOR_DB_PATH = "vector_db"  # Folder where the vector database will be saved


def main():
    """Build the FAISS vector database from the Azure Databricks PDF.

    Steps: load the PDF, split it into overlapping chunks, embed the
    chunks, and persist the resulting FAISS index to VECTOR_DB_PATH.

    Raises:
        SystemExit: when the PDF cannot be loaded.
    """
    # --- 1. Load the PDF ---
    print(f"Loading PDF from: {PDF_PATH}...")
    try:
        loader = PyPDFLoader(PDF_PATH)
        documents = loader.load()
        print(f"PDF loaded successfully! Total of {len(documents)} pages.")
    except Exception as e:
        print(f"Error loading PDF: {e}")
        print("Please ensure the PDF file exists and the path is correct.")
        # raise SystemExit instead of the site-provided exit(), which is
        # not guaranteed to exist when Python runs without the site module.
        raise SystemExit(1)

    # --- 2. Split the text into chunks ---
    print("Splitting text into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,     # Maximum size of each chunk (in characters)
        chunk_overlap=200,   # How many characters chunks can overlap (to maintain context)
        length_function=len  # Function to calculate chunk length
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Text split into {len(chunks)} chunks.")

    # --- 3. Create Embeddings and Store in FAISS ---
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    print("Generating embeddings and creating the FAISS vector database...")
    # Create the vector database from the chunks and embeddings
    vector_db = FAISS.from_documents(chunks, embeddings)

    # --- 4. Save the Vector Database ---
    print(f"Saving the vector database to: {VECTOR_DB_PATH}...")
    vector_db.save_local(VECTOR_DB_PATH)
    print("Vector database created and saved successfully!")


if __name__ == "__main__":
    # Guard so importing this module (e.g. to reuse its constants) does not
    # trigger the expensive indexing run at import time.
    main()
|
hf-space/requirements.txt
ADDED
|
Binary file (302 Bytes). View file
|
|
|
hf-space/scrape_kb.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from urllib.parse import urljoin
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
# --- Configurations ---
|
| 9 |
+
BASE_URL = "https://kb.databricks.com"
|
| 10 |
+
START_URL = "https://kb.databricks.com/en_US/azure" # URL of the main listing page
|
| 11 |
+
OUTPUT_DIR = "scraped_kb_articles" # Folder to save extracted articles (in JSON format)
|
| 12 |
+
|
| 13 |
+
headers = {
|
| 14 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
# Create output directory if it doesn't exist
|
| 18 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 19 |
+
|
| 20 |
+
def fetch_page_content(url, delay=1):
    """Download the HTML of *url*, pausing *delay* seconds afterwards.

    Returns the page body as text on success, or None when the request
    fails for any network/HTTP reason (the error is printed, not raised).
    """
    print(f"Fetching: {url}")
    try:
        resp = requests.get(url, headers=headers, timeout=30)  # generous 30s timeout
        resp.raise_for_status()  # treat 4xx/5xx responses as failures
    except requests.exceptions.RequestException as exc:
        print(f"Error accessing {url}: {exc}")
        return None
    # Be polite to the server: wait before the caller issues the next request.
    time.sleep(delay)
    return resp.text
|
| 31 |
+
|
| 32 |
+
def parse_listing_page(html_content):
    """Extract title/URL pairs for every article on the listing page.

    Returns a (possibly empty) list of dicts: [{'title': ..., 'url': ...}].
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Each article lives in a container matching
    # <div class="row" data-helpjuice-element="SubCategory Article">.
    containers = soup.find_all(
        'div', class_='row', attrs={'data-helpjuice-element': 'SubCategory Article'}
    )
    if not containers:
        print("Warning: No article containers found on the listing page with the specified selector.")
        print("This might indicate that the HTML has changed or content is loaded dynamically via JavaScript.")
        return []

    articles_data = []
    for box in containers:
        # The first <a href=...> inside the container is the article link.
        anchor = box.find('a', href=True)
        if anchor is None:
            print(f"Warning: Article container with no valid main link found: {box.prettify()[:200]}...")
            continue

        # The title sits in an <h3> tagged as the article-title element.
        heading = box.find('h3', attrs={'data-helpjuice-element': 'SubCategory Article Title'})
        articles_data.append({
            'title': heading.get_text(strip=True) if heading else "Unknown Title",
            'url': urljoin(BASE_URL, anchor['href']),  # resolve relative links
        })

    return articles_data
|
| 67 |
+
|
| 68 |
+
def scrape_article_content(article_url):
    """Fetch one article page and return {'url', 'title', 'content'}.

    Returns None when the page could not be downloaded; 'content' is an
    empty string when the article body element is missing.
    """
    html_content = fetch_page_content(article_url, delay=2)  # longer pause for article pages
    if not html_content:
        return None

    soup = BeautifulSoup(html_content, 'html.parser')

    # Article title: <h1 class="article-title"> (validated selector).
    heading = soup.find('h1', class_='article-title')
    title = heading.get_text(strip=True) if heading else "Unknown Article Title"

    # Article body: <div class="helpjuice-article-body-content"> (validated selector).
    body = soup.find('div', class_='helpjuice-article-body-content')
    if body:
        # One newline between blocks, surrounding whitespace stripped.
        content = body.get_text(separator='\n', strip=True)
    else:
        content = ""
        print(f"Warning: Article body with class 'helpjuice-article-body-content' NOT found for {article_url}")
        print("This might be a JavaScript loading issue or a different HTML structure for this article.")

    return {'url': article_url, 'title': title, 'content': content}
|
| 96 |
+
|
| 97 |
+
# --- Main Scraping Logic ---
if __name__ == "__main__":
    print(f"Starting scraping process from: {START_URL}")

    # 1. Fetch the main listing page (extra delay for the entry point).
    list_page_html = fetch_page_content(START_URL, delay=3)
    if not list_page_html:
        print("Could not proceed, error fetching the initial listing page.")
        exit()  # Stop the script if the initial page cannot be accessed

    # 2. Collect every article link advertised on the listing page.
    all_article_links = []
    all_article_links.extend(parse_listing_page(list_page_html))
    print(f"Total of {len(all_article_links)} article links collected from the main page.")

    # 3. Download each article, reusing anything already cached on disk.
    scraped_articles_data = []
    total = len(all_article_links)
    for idx, link_info in enumerate(all_article_links, start=1):
        print(f"Scraping article {idx}/{total}: {link_info['title']}")

        # The cache filename is derived from the last URL path segment.
        slug = link_info['url'].split('/')[-1]
        output_filepath = os.path.join(OUTPUT_DIR, f"{slug}.json")

        if os.path.exists(output_filepath):
            print(f"  Article already scraped and saved: {output_filepath}. Skipping.")
            try:  # Load the cached copy so it still counts toward the total
                with open(output_filepath, 'r', encoding='utf-8') as f:
                    scraped_articles_data.append(json.load(f))
            except Exception as e:
                print(f"  Error loading existing file {output_filepath}: {e}")
            continue  # Skip to the next article

        article = scrape_article_content(link_info['url'])
        if article:
            scraped_articles_data.append(article)
            # Persist the article as JSON for reference and debugging.
            try:
                with open(output_filepath, 'w', encoding='utf-8') as f:
                    json.dump(article, f, ensure_ascii=False, indent=4)
            except Exception as e:
                print(f"  Error saving JSON file {output_filepath}: {e}")

    print(f"\nScraping of {len(scraped_articles_data)} articles completed and saved/loaded from '{OUTPUT_DIR}'.")

    print("\n--- Next Steps ---")
    print("1. Knowledge Base articles scraped and saved as JSONs in the 'scraped_kb_articles' folder.")
    print("2. Now, run the 'update_vector_db_with_kb.py' script to integrate this data into your FAISS vector database.")
    print("   `python update_vector_db_with_kb.py`")
|
hf-space/update_vector_db_with_kb.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from langchain.docstore.document import Document
|
| 4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 5 |
+
from langchain.embeddings import SentenceTransformerEmbeddings
|
| 6 |
+
from langchain.vectorstores import FAISS
|
| 7 |
+
|
| 8 |
+
# --- Configurations ---
|
| 9 |
+
# Folder where scraped JSON articles are saved by scrape_kb.py
|
| 10 |
+
SCRAPED_ARTICLES_DIR = "scraped_kb_articles"
|
| 11 |
+
# Path to your existing FAISS vector database (from PDF)
|
| 12 |
+
VECTOR_DB_PATH = "vector_db"
|
| 13 |
+
# Same embedding model name used in prepare_data.py
|
| 14 |
+
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
|
| 15 |
+
|
| 16 |
+
# --- 1. Load Scraped Articles from JSON ---
def load_scraped_articles(directory):
    """Load scraped KB articles saved as JSON and wrap them as Documents.

    Each JSON file is expected to hold the {'url', 'title', 'content'}
    structure written by scrape_kb.py; title and content are combined to
    form the Document's page_content.

    Args:
        directory: Folder containing the per-article .json files.

    Returns:
        list[Document]: one Document per readable article; empty when the
        directory is missing or holds no valid files.
    """
    articles = []
    print(f"Searching for JSON articles in folder: {directory}")
    if not os.path.exists(directory):
        print(f"Warning: Scraped articles directory not found: {directory}")
        return articles

    # sorted() makes the load order (and therefore the vector-db build)
    # deterministic across runs and filesystems.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(directory, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Combine title and content for the Document's page_content
            full_content = f"Title: {data.get('title', 'N/A')}\n\n{data.get('content', '')}"
            articles.append(Document(
                page_content=full_content,
                metadata={"source": data.get('url', filename), "title": data.get('title', '')}
            ))
        except Exception as e:
            # BUG FIX: the original message printed "(unknown)" even though
            # the offending path is known here — include it so failures can
            # actually be diagnosed.
            print(f"Error loading or processing file {filepath}: {e}")
    print(f"Loaded {len(articles)} scraped KB articles.")
    return articles
|
| 44 |
+
|
| 45 |
+
# --- 2. Split New Documents into Chunks ---
def split_documents_into_chunks(documents):
    """
    Break a list of LangChain Documents into overlapping text chunks.

    The splitter mirrors the PDF-ingestion settings (1000-character chunks
    with a 200-character overlap) so KB and PDF content are chunked
    consistently within the same vector database.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    result = splitter.split_documents(documents)
    print(f"Documents split into {len(result)} new chunks.")
    return result
|
| 59 |
+
|
| 60 |
+
# --- Main Vector Database Update Logic ---
if __name__ == "__main__":
    print("Starting the process of updating the vector database with KB articles...")

    # Load the embedding model. This MUST be the same model used by
    # prepare_data.py, otherwise the new vectors would live in an
    # incompatible embedding space and retrieval quality would collapse.
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Embedding model loaded.")

    # 1. Load the scraped JSON articles
    new_documents = load_scraped_articles(SCRAPED_ARTICLES_DIR)

    if not new_documents:
        print("No new articles found in the scraped data folder to add to the database. Exiting.")
        # Having nothing to add is not a failure: terminate with status 0.
        raise SystemExit(0)

    # 2. Split the new documents into chunks
    new_chunks = split_documents_into_chunks(new_documents)

    # 3. Load the existing FAISS vector database (from the PDF)
    print(f"Loading existing FAISS vector database from: {VECTOR_DB_PATH}...")
    try:
        # allow_dangerous_deserialization is required because FAISS docstore
        # metadata is pickled; this is acceptable only because 'vector_db' is
        # produced locally by prepare_data.py. Never enable it for an index
        # obtained from an untrusted source.
        vector_db = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
        print("Existing FAISS vector database loaded successfully.")
    except Exception as e:
        print(f"Error loading existing FAISS vector database: {e}")
        print("Please ensure the 'vector_db' database was created with 'prepare_data.py' BEFORE running this script.")
        # Fix: the original called exit() with no argument here, which
        # terminates with status 0 and hides the failure from CI/automation.
        # Signal the error with a non-zero exit status instead.
        raise SystemExit(1)

    # 4. Add the new chunks to the existing database
    print(f"Adding {len(new_chunks)} new chunks to the FAISS database...")
    # add_documents embeds the new chunks and appends them to the existing index
    vector_db.add_documents(new_chunks)
    print("New chunks added to the database.")

    # 5. Save the updated FAISS vector database (overwrites the files in place)
    print(f"Saving the updated FAISS vector database to: {VECTOR_DB_PATH}...")
    vector_db.save_local(VECTOR_DB_PATH)
    print("FAISS vector database updated and saved successfully!")
    print("\nNow, run your Streamlit application ('streamlit run app.py') to see your assistant with the new knowledge!")
|