Felipe Lemes committed on
Commit
e1b25d2
·
1 Parent(s): b0ce1fa

Update from GitHub push

Browse files
.github/workflows/deploy-to-hf.yml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Mirrors the GitHub repository into the Hugging Face Space on every push.
name: Deploy to Hugging Face Space

on:
  push:
    branches: [ master ]  # or "main" if your GitHub repository uses that name

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      - name: Push to Hugging Face Space
        run: |
          git config --global user.email "felipe@gmail.com"
          git config --global user.name "Felipe Lemes"
          # Clone the Space OUTSIDE the workspace. Cloning into ./hf-space and
          # then rsync-ing ./ into it copies the clone into itself, which adds
          # a nested hf-space/ directory to the Space on every run.
          HF_DIR="$RUNNER_TEMP/hf-space"
          git clone "https://felipelemes:${HF_TOKEN}@huggingface.co/spaces/felipelemes/databricks-rag-assistant" "$HF_DIR"
          rsync -av --exclude='.git' ./ "$HF_DIR/"
          cd "$HF_DIR"
          git add .
          # `git commit` exits non-zero when nothing is staged, which would
          # fail the job for pushes that do not change any deployed file.
          if git diff --cached --quiet; then
            echo "No changes to deploy."
          else
            git commit -m "Update from GitHub push"
            git push
          fi
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
hf-space/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
hf-space/.github/workflows/main.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Pushes the checked-out `main` branch to the Hugging Face Space repository.
name: Sync to Hugging Face hub

on:
  push:
    branches:
      - main
  # Lets the workflow be started by hand from the Actions tab.
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          # Full history (not a shallow clone) plus LFS objects, so the push
          # below can send complete commits.
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push https://felipelemes:$HF_TOKEN@huggingface.co/spaces/felipelemes/databricks-rag-assistant main
hf-space/.gitignore ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ # Ignore large data directories
163
+ venv/
164
+ data/
165
+ scraped_kb_articles/
166
+ vector_db/
167
+ models/
168
+
169
+
hf-space/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Felipe Lemes
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
hf-space/app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from langchain.embeddings import SentenceTransformerEmbeddings
4
+ from langchain.vectorstores import FAISS
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain.chains import RetrievalQA
7
+ from langchain.prompts import ChatPromptTemplate
8
+
# --- Page Configuration ---
# st.set_page_config is issued before every other Streamlit command (cached
# resource spinner, st.success, st.error, ...). Streamlit releases that enforce
# the documented "must be the first Streamlit command" rule raise
# StreamlitAPIException when it is called later in the script.
st.set_page_config(
    page_title="📚 Databricks Study Assistant with RAG",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# --- Path Configurations ---
VECTOR_DB_PATH = "vector_db"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"


# --- 1. Load Resources (Vector Database and Embedding Model) ---
# @st.cache_resource caches the returned objects so they are built only once.
# The show_spinner text replaces the bare st.spinner(...) calls, which never
# displayed anything because they were not used as context managers.
@st.cache_resource(show_spinner="Loading embedding model and vector database...")
def load_resources():
    """Load the embedding model and the FAISS vector database from disk.

    Returns:
        A ``(embeddings, vector_db)`` tuple: the SentenceTransformerEmbeddings
        instance built from EMBEDDING_MODEL_NAME and the FAISS store loaded
        from VECTOR_DB_PATH.

    Raises:
        Whatever the embedding constructor or ``FAISS.load_local`` raises,
        e.g. when the vector database folder is missing.
    """
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Embedding model loaded.")

    print(f"Loading FAISS vector database from: {VECTOR_DB_PATH}...")
    # allow_dangerous_deserialization=True is needed for FAISS.load_local.
    # It's safe to use only if you generated the database yourself.
    vector_db = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
    print("Vector database loaded.")

    return embeddings, vector_db


try:
    embeddings, vector_db = load_resources()
except Exception as e:
    # Show a readable message instead of a raw traceback, mirroring how the
    # LLM initialization failure is reported below.
    st.error(f"Error loading the embedding model or the vector database from '{VECTOR_DB_PATH}': {e}")
    st.stop()

# --- 2. Load and Configure the OpenAI LLM (GPT-4o) ---
openai_api_key = os.getenv("OPENAI_API_KEY")

if openai_api_key:
    try:
        llm = ChatOpenAI(
            temperature=0.85,  # Controls creativity/randomness (0.0 to 1.0)
            api_key=openai_api_key,
            model_name="gpt-4o",
            model_kwargs={"top_p": 0.9}  # Controls diversity of output
        )
        st.success("OpenAI model (gpt-4o) loaded successfully!")
    except Exception as e:
        st.error(f"Error initializing OpenAI model. Check your API key, "
                 f"model name, and plan/quotas: {e}")
        st.stop()  # Stop the app if LLM cannot be initialized
else:
    st.error("OpenAI API Key (OPENAI_API_KEY) not found in environment variables.")
    st.stop()  # Stop the app if API key is not found

# --- 3. Define the System Prompt for Assistant Behavior ---
SYSTEM_PROMPT_TEMPLATE = """
You are a friendly, experienced, and patient study tutor specializing in Databricks.
Your goal is to help the user deeply understand topics from Databricks documentation to prepare for Databricks certifications.

Follow these guidelines:
1. **Always respond in the same language as the user's question.** If the question is in Portuguese, reply in Portuguese. If it's in English, reply in English.
2. **Explain clearly and concisely:** Use accessible language and avoid unnecessary jargon where possible.
3. **Go beyond simple retrieval:** Do not just reproduce information. Interpret it, reorganize it, and present it in a didactic way.
4. **Provide practical examples:** If appropriate, create small examples or analogies to illustrate the concept within the context of Databricks or data engineering scenarios.
5. **Maintain an encouraging and motivating tone:** Encourage the user in their learning.
6. **Use the provided "Context Documents" to answer the question.** Prioritize information from these documents.
7. **If the answer is not in the context documents, be honest:** State that you could not find the information and suggest the user search other sources or rephrase the question. Do not invent information.
8. Format your responses legibly, using lists, bold text, or code blocks when appropriate.

Context Documents:
{context}

User Question:
{question}
"""

# Create a ChatPromptTemplate from the System Prompt
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_PROMPT_TEMPLATE),
        ("human", "{question}")  # Where the user's question will be inserted
    ]
)

# --- 4. Configure the RAG Chain (RetrievalQA) ---
print("Configuring the RAG chain...")
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # 'stuff' strategy puts all retrieved documents directly into the LLM's prompt
    retriever=vector_db.as_retriever(search_kwargs={"k": 4}),  # k=4: retrieve the 4 most relevant chunks
    return_source_documents=True,  # Also return the documents that were used for the answer
    chain_type_kwargs={"prompt": qa_prompt}  # Pass the custom prompt to the chain
)
print("RAG chain configured.")

# --- 5. Streamlit Interface ---
st.title("📚 Databricks Study Assistant with RAG")

# Objective description
st.markdown("""
This assistant is designed to provide you with precise, context-aware answers directly sourced from the official Azure Databricks documentation.
It aims to significantly aid your studies for Databricks certifications and streamline the process of resolving technical challenges by offering a more fluid and natural consultation experience.
""")

# Knowledge-base description
st.markdown("""
This assistant's knowledge base is built upon the official Azure Databricks documentation
([https://learn.microsoft.com/en-us/azure/databricks/](https://learn.microsoft.com/en-us/azure/databricks/))
and the official Databricks Azure Knowledge Base
([https://kb.databricks.com/](https://kb.databricks.com/)).
""")

user_query = st.text_input(
    "Your question about Databricks documentation:",
    placeholder="Ex: How to configure Auto Loader in Databricks?"
)

if st.button("Get Answer", type="primary"):
    # .strip() so a whitespace-only input is treated like an empty one.
    if user_query and user_query.strip():
        with st.spinner("Searching and generating response..."):
            try:
                # .invoke replaces the deprecated direct call qa_chain({...}).
                response = qa_chain.invoke({"query": user_query})
                st.subheader("Answer:")
                st.markdown(response["result"])  # Use markdown for formatting the response

                st.subheader("Source Documents:")
                if response["source_documents"]:
                    for i, doc in enumerate(response["source_documents"]):
                        st.write(f"**Page/Source {i+1}:**")
                        st.info(doc.page_content)  # Content of the chunk
                        if 'page' in doc.metadata:  # If the PDF loader added the page number
                            st.write(f"*(Page: {doc.metadata['page'] + 1})*")  # +1 because it's 0-based
                        st.markdown("---")
                else:
                    st.info("No relevant source documents found for this question.")
            except Exception as e:
                st.error(f"An error occurred while processing your question: {e}")
                st.info("Please check your OpenAI API key, model name, and plan/quotas.")
    else:
        st.warning("Please type your question before submitting.")

st.markdown("---")
st.caption("Developed by you, with LangChain, Streamlit, and LLMs.")
hf-space/prepare_data.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain.document_loaders import PyPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.embeddings import SentenceTransformerEmbeddings
5
+ from langchain.vectorstores import FAISS
6
+
# --- Configurations ---
PDF_PATH = "data/azure-databricks.pdf"  # Path to PDF file
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Embedding model name to be used
VECTOR_DB_PATH = "vector_db"  # Folder where the vector database will be saved

# --- 1. Load the PDF ---
print(f"Loading PDF from: {PDF_PATH}...")
try:
    loader = PyPDFLoader(PDF_PATH)
    documents = loader.load()
except Exception as e:
    print(f"Error loading PDF: {e}")
    print("Please ensure the PDF file exists and the path is correct.")
    # Non-zero status so callers (shell scripts, CI) can detect the failure;
    # a bare exit() reports success and depends on the `site` builtin.
    raise SystemExit(1)
print(f"PDF loaded successfully! Total of {len(documents)} pages.")

# --- 2. Split the text into chunks ---
print("Splitting text into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,     # Maximum size of each chunk (in characters)
    chunk_overlap=200,   # How many characters chunks can overlap (to maintain context)
    length_function=len  # Function to calculate chunk length
)
chunks = text_splitter.split_documents(documents)
print(f"Text split into {len(chunks)} chunks.")

if not chunks:
    # Nothing to embed (e.g. a PDF without extractable text); stop with a
    # clear message rather than handing an empty list to FAISS.
    print("No text chunks were produced from the PDF; the vector database was not created.")
    raise SystemExit(1)

# --- 3. Create Embeddings and Store in FAISS ---
print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print("Generating embeddings and creating the FAISS vector database...")
# Create the vector database from the chunks and embeddings
vector_db = FAISS.from_documents(chunks, embeddings)

# --- 4. Save the Vector Database ---
print(f"Saving the vector database to: {VECTOR_DB_PATH}...")
vector_db.save_local(VECTOR_DB_PATH)
print("Vector database created and saved successfully!")
hf-space/requirements.txt ADDED
Binary file (302 Bytes). View file
 
hf-space/scrape_kb.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import os
4
+ import time
5
+ from urllib.parse import urljoin
6
+ import json
7
+
# --- Configurations ---
# Site root; article links found on the listing page are joined onto it.
BASE_URL = "https://kb.databricks.com"
# URL of the main listing page where scraping starts.
START_URL = "https://kb.databricks.com/en_US/azure"
# Folder that receives the extracted articles (one JSON file each).
OUTPUT_DIR = "scraped_kb_articles"

# Request headers sent with every HTTP call (browser-like User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/100.0.4896.60 Safari/537.36"
    ),
}

# Make sure the output folder is present before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
19
+
def fetch_page_content(url, delay=1):
    """Download *url* and return its HTML text, or None when the request fails.

    After a successful request the function sleeps for *delay* seconds so that
    consecutive calls stay polite to the website server.
    """
    print(f"Fetching: {url}")
    try:
        reply = requests.get(url, headers=headers, timeout=30)
        # Converts 4xx/5xx status codes into an HTTPError handled just below.
        reply.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error accessing {url}: {err}")
        return None

    time.sleep(delay)
    return reply.text
31
+
def parse_listing_page(html_content):
    """
    Collect the link and title of every article found on the listing page.

    Returns a list of dictionaries: [{'title': '...', 'url': '...'}]; the list
    is empty when no article container matches the selector.
    """
    page = BeautifulSoup(html_content, 'html.parser')

    # Article entries are <div> elements with class 'row' AND the attribute
    # data-helpjuice-element="SubCategory Article".
    containers = page.find_all(
        'div',
        class_='row',
        attrs={'data-helpjuice-element': 'SubCategory Article'},
    )
    if not containers:
        print("Warning: No article containers found on the listing page with the specified selector.")
        print("This might indicate that the HTML has changed or content is loaded dynamically via JavaScript.")
        return []

    found = []
    for entry in containers:
        # The first <a href=...> inside the container is the article link.
        anchor = entry.find('a', href=True)
        if not anchor:
            print(f"Warning: Article container with no valid main link found: {entry.prettify()[:200]}...")
            continue

        # The title lives in an <h3> tagged as the article title element.
        heading = entry.find('h3', attrs={'data-helpjuice-element': 'SubCategory Article Title'})
        found.append({
            'title': heading.get_text(strip=True) if heading else "Unknown Title",
            'url': urljoin(BASE_URL, anchor['href']),  # resolve relative links
        })

    return found
67
+
def scrape_article_content(article_url):
    """
    Download one article page and pull out its title and body text.

    Returns a dictionary with 'url', 'title', and 'content', or None when the
    page could not be fetched. 'content' is an empty string when the body
    container is not present in the page.
    """
    # Individual articles are fetched with a slightly longer pause.
    page_html = fetch_page_content(article_url, delay=2)
    if not page_html:
        return None

    page = BeautifulSoup(page_html, 'html.parser')

    # Title: <h1 class="article-title">
    heading = page.find('h1', class_='article-title')
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = "Unknown Article Title"

    # Body: <div class="helpjuice-article-body-content">
    body = page.find('div', class_='helpjuice-article-body-content')
    if body:
        # One text block per line, each stripped of surrounding whitespace.
        content = body.get_text(separator='\n', strip=True)
    else:
        content = ""
        print(f"Warning: Article body with class 'helpjuice-article-body-content' NOT found for {article_url}")
        print("This might be a JavaScript loading issue or a different HTML structure for this article.")

    return {'url': article_url, 'title': title, 'content': content}
96
+
# --- Main Scraping Logic ---
# Fetches the listing page, scrapes every linked article and stores each one
# as a JSON file in OUTPUT_DIR. Articles whose JSON file already exists and
# loads correctly are reused instead of being downloaded again.
if __name__ == "__main__":
    print(f"Starting scraping process from: {START_URL}")

    # 1. Fetch HTML from the main listing page (longer pause for the main page)
    list_page_html = fetch_page_content(START_URL, delay=3)
    if not list_page_html:
        print("Could not proceed, error fetching the initial listing page.")
        # Non-zero status so the failure is visible to shells/CI; a bare
        # exit() reports success.
        raise SystemExit(1)

    # 2. Parse the listing page and collect ALL article links
    all_article_links = list(parse_listing_page(list_page_html))
    print(f"Total of {len(all_article_links)} article links collected from the main page.")

    scraped_articles_data = []
    # 3. Iterate over each article link and scrape the full content
    for i, article_link_info in enumerate(all_article_links, start=1):
        print(f"Scraping article {i}/{len(all_article_links)}: {article_link_info['title']}")

        # Base filename from the last URL path segment. rstrip('/') keeps a
        # trailing-slash URL from yielding an empty name (a file called ".json"
        # shared by every such URL); other URLs map to the same name as before.
        file_name_hash = article_link_info['url'].rstrip('/').split('/')[-1]
        output_filepath = os.path.join(OUTPUT_DIR, f"{file_name_hash}.json")

        if os.path.exists(output_filepath):
            try:
                with open(output_filepath, 'r', encoding='utf-8') as f:
                    scraped_articles_data.append(json.load(f))
            except (OSError, ValueError) as e:
                # Unreadable or truncated cache file: fall through and scrape
                # the article again instead of skipping it on every run.
                print(f"  Error loading existing file {output_filepath}: {e}")
                print("  Re-scraping the article to replace the unreadable file.")
            else:
                print(f"  Article already scraped and saved: {output_filepath}. Skipping.")
                continue  # Skip to the next article

        article_content = scrape_article_content(article_link_info['url'])
        if article_content:
            scraped_articles_data.append(article_content)
            # Save the content as JSON in a file for reference and debugging
            try:
                with open(output_filepath, 'w', encoding='utf-8') as f:
                    json.dump(article_content, f, ensure_ascii=False, indent=4)
            except (OSError, ValueError) as e:
                print(f"  Error saving JSON file {output_filepath}: {e}")

    print(f"\nScraping of {len(scraped_articles_data)} articles completed and saved/loaded from '{OUTPUT_DIR}'.")

    print("\n--- Next Steps ---")
    print("1. Knowledge Base articles scraped and saved as JSONs in the 'scraped_kb_articles' folder.")
    print("2. Now, run the 'update_vector_db_with_kb.py' script to integrate this data into your FAISS vector database.")
    print("   `python update_vector_db_with_kb.py`")
hf-space/update_vector_db_with_kb.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from langchain.docstore.document import Document
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.embeddings import SentenceTransformerEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+
# --- Configurations ---
# Directory holding the JSON articles written by scrape_kb.py.
SCRAPED_ARTICLES_DIR = "scraped_kb_articles"
# Location of the existing FAISS vector database (the one built from the PDF).
VECTOR_DB_PATH = "vector_db"
# Embedding model name; the same value prepare_data.py uses.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
15
+
# --- 1. Load Scraped Articles from JSON ---
def load_scraped_articles(directory):
    """
    Load the articles saved as JSON files and convert them into LangChain Documents.

    Each Document's page_content is "Title: <title>" followed by a blank line
    and the article content; metadata carries 'source' (the article URL, or the
    file name when no URL is stored) and 'title'.

    Args:
        directory: Folder containing the ``*.json`` article files.

    Returns:
        A list of Documents in file-name order. The list is empty when
        *directory* is missing or is not a directory. Files that cannot be
        read or parsed are reported and skipped.
    """
    articles = []
    print(f"Searching for JSON articles in folder: {directory}")
    # isdir (not exists) so a path that points at a regular file is reported
    # as missing instead of crashing in os.listdir.
    if not os.path.isdir(directory):
        print(f"Warning: Scraped articles directory not found: {directory}")
        return articles

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # in which documents are later added to the index reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(directory, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Combine title and content for the Document's page_content
            full_content = f"Title: {data.get('title', 'N/A')}\n\n{data.get('content', '')}"
            articles.append(Document(
                page_content=full_content,
                metadata={"source": data.get('url', filename), "title": data.get('title', '')}
            ))
        except Exception as e:
            # Best effort: one bad file must not abort the whole load.
            print(f"Error loading or processing file {filename}: {e}")
    print(f"Loaded {len(articles)} scraped KB articles.")
    return articles
44
+
# --- 2. Split New Documents into Chunks ---
def split_documents_into_chunks(documents):
    """
    Break a list of LangChain Documents into smaller chunks.

    The splitter is configured with chunk_size=1000 and chunk_overlap=200
    (measured with len), the same settings used for the PDF processing.
    Returns the list of chunks.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
    pieces = splitter.split_documents(documents)
    print(f"Documents split into {len(pieces)} new chunks.")
    return pieces
59
+
# --- Main Vector Database Update Logic ---
# Loads the scraped KB articles, splits them into chunks and appends them to
# the FAISS vector database stored in VECTOR_DB_PATH.
if __name__ == "__main__":
    print("Starting the process of updating the vector database with KB articles...")

    # 1. Load the scraped JSON articles first: this is cheap, so an empty
    #    folder is detected before the embedding model is loaded.
    new_documents = load_scraped_articles(SCRAPED_ARTICLES_DIR)

    if not new_documents:
        print("No new articles found in the scraped data folder to add to the database. Exiting.")
        # Nothing to do is not an error, so the exit status stays 0.
        raise SystemExit(0)

    # 2. Split the new documents into chunks
    new_chunks = split_documents_into_chunks(new_documents)

    # Load the embedding model (the same one used for the PDF)
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Embedding model loaded.")

    # 3. Load the existing FAISS vector database (from the PDF)
    print(f"Loading existing FAISS vector database from: {VECTOR_DB_PATH}...")
    try:
        # The 'vector_db' folder must have been created with 'prepare_data.py' first
        vector_db = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        print(f"Error loading existing FAISS vector database: {e}")
        print("Please ensure the 'vector_db' database was created with 'prepare_data.py' BEFORE running this script.")
        # Non-zero status so the failure is visible to shells/CI; a bare
        # exit() reports success.
        raise SystemExit(1)
    print("Existing FAISS vector database loaded successfully.")

    # 4. Add the new chunks to the existing database
    # NOTE(review): add_documents is called unconditionally, so running this
    # script twice on the same JSON folder presumably stores the KB chunks
    # twice - confirm, and consider rebuilding the index from prepare_data.py
    # before re-running.
    print(f"Adding {len(new_chunks)} new chunks to the FAISS database...")
    vector_db.add_documents(new_chunks)
    print("New chunks added to the database.")

    # 5. Save the updated FAISS vector database
    print(f"Saving the updated FAISS vector database to: {VECTOR_DB_PATH}...")
    vector_db.save_local(VECTOR_DB_PATH)
    print("FAISS vector database updated and saved successfully!")
    print("\nNow, run your Streamlit application ('streamlit run app.py') to see your assistant with the new knowledge!")