# Student_Agent / knowledge_base.py
# Rohitface's picture
# Create knowledge_base.py
# a2f9184 verified
import os
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# --- Configuration ---
# Root URL of the college website. create_knowledge_base() passes it to
# get_all_website_links() as the crawl start, and its host defines the
# "same domain" that the crawler stays within.
BASE_URL = "https://ggits.org/"
# Directory the FAISS index is saved to. create_knowledge_base() checks this
# location to decide whether an index has already been built, so it should
# point to a persistent storage location.
SAVE_PATH = "/data/faiss_index"
# Model name handed to HuggingFaceEmbeddings to vectorize the text chunks.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# --- Web Scraping Functions ---
def is_valid_url(url, base_domain):
    """
    Check that a URL is absolute and points at the given domain.

    Args:
        url: The URL string to test.
        base_domain: The network location (host, optionally with port) the
            URL must match, e.g. ``"ggits.org"``.

    Returns:
        True when ``url`` has a network location and it equals
        ``base_domain``; False otherwise (relative URLs, ``mailto:`` links,
        other hosts, empty input).
    """
    # Host names are case-insensitive, so compare both sides lower-cased;
    # urlparse() preserves the original case of the netloc.
    netloc = urlparse(url).netloc.lower()
    return bool(netloc) and netloc == (base_domain or "").lower()
def get_all_website_links(url):
    """
    Crawl a website breadth-first and collect the same-domain links it exposes.

    Starting from ``url``, each page is fetched, its ``<a href>`` targets are
    resolved against the page URL, reduced to ``scheme://host/path`` (query
    string and fragment are dropped), and followed when they stay on the
    start URL's domain.

    Args:
        url: The URL to start crawling from.

    Returns:
        A list of the unique, normalized links discovered during the crawl.
        The start URL itself is not part of the result. Order is unspecified.

    Pages that cannot be fetched are reported on stdout and skipped; they
    never abort the crawl.
    """
    domain_name = urlparse(url).netloc
    discovered = set()
    visited = {url}
    # deque gives O(1) pops from the front; list.pop(0) shifts every element.
    queue = deque([url])

    print("Starting website crawl...")
    while queue:
        current_url = queue.popleft()
        print(f"Crawling: {current_url}")
        try:
            response = requests.get(current_url, timeout=5)
            response.raise_for_status()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Could not process URL {current_url}: {e}")
            continue

        # Only HTML documents can contain anchors; do not run the HTML parser
        # over PDFs, images and other binary downloads. A missing header is
        # treated as HTML so such pages are still crawled.
        content_type = response.headers.get("Content-Type", "")
        if not content_type or "html" in content_type.lower():
            soup = BeautifulSoup(response.content, "html.parser")
            for a_tag in soup.find_all("a"):
                href = a_tag.attrs.get("href")
                if not href:
                    continue
                try:
                    parsed_href = urlparse(urljoin(current_url, href))
                except ValueError as e:
                    # A single malformed href must not discard the remaining
                    # links on this page.
                    print(f"Skipping malformed link {href!r} on {current_url}: {e}")
                    continue
                # mailto:, tel:, javascript: etc. are not crawlable, and
                # rebuilding them below could fabricate a bogus host.
                if parsed_href.scheme not in ("http", "https"):
                    continue
                link = f"{parsed_href.scheme}://{parsed_href.netloc}{parsed_href.path}"
                if link not in visited and is_valid_url(link, domain_name):
                    discovered.add(link)
                    visited.add(link)
                    queue.append(link)

        # A small delay to avoid overwhelming the server
        time.sleep(0.1)

    print(f"Crawl finished. Found {len(discovered)} unique links.")
    return list(discovered)
def scrape_page(url):
    """
    Scrape the visible text content from a single web page.

    Args:
        url: Address of the page to download.

    Returns:
        The page text with ``<script>``/``<style>`` content removed, one
        phrase per line and blank lines dropped. Returns None when the
        request fails (the error is printed) or when the response is not an
        HTML document.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None

    # Crawled links may point at PDFs, images, etc.; running those through
    # the HTML parser would only add garbage to the knowledge base. A missing
    # header is treated as HTML.
    content_type = response.headers.get("Content-Type", "")
    if content_type and "html" not in content_type.lower():
        print(f"Skipping non-HTML content at {url}: {content_type}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    # Remove script and style elements
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    # Break each line into phrases at runs of TWO spaces (written as " " * 2
    # so the separator cannot be visually mistaken for one space). Splitting
    # on a single space would put every word on its own line.
    phrase_separator = " " * 2
    lines = (line.strip() for line in soup.get_text().splitlines())
    phrases = (
        phrase.strip()
        for line in lines
        for phrase in line.split(phrase_separator)
    )
    return "\n".join(phrase for phrase in phrases if phrase)
# --- Knowledge Base Creation Functions ---
def create_knowledge_base():
    """
    Build the FAISS knowledge base from the website, unless one already exists.

    Steps: crawl BASE_URL for same-domain links, scrape the text of every
    page, split the combined text into overlapping chunks, embed the chunks
    with EMBEDDING_MODEL and save the resulting FAISS index under SAVE_PATH.

    Returns:
        None. Progress is reported on stdout. The function returns early when
        an index is already present or when no text could be scraped.
    """
    # Check for the index file itself rather than the directory: an empty or
    # half-written SAVE_PATH directory must not block the build forever.
    index_name = "index"
    if os.path.isfile(os.path.join(SAVE_PATH, f"{index_name}.faiss")):
        print("Knowledge base already exists. Skipping creation.")
        return

    print("Creating new knowledge base...")

    # 1. Scrape all website content
    links = get_all_website_links(BASE_URL)
    # The crawler does not report its own start URL, so add the start page
    # explicitly. The rest is sorted so the build order is reproducible.
    pages = [BASE_URL] + sorted(link for link in set(links) if link != BASE_URL)

    print(f"Scraping content from {len(pages)} pages...")
    page_texts = []
    seen_texts = set()
    for link in pages:
        page_content = scrape_page(link)
        # Skip failed/empty pages and URL variants that serve identical text.
        if not page_content or page_content in seen_texts:
            continue
        seen_texts.add(page_content)
        page_texts.append(page_content)

    if not page_texts:
        print("No text was scraped. Aborting knowledge base creation.")
        return
    all_text = "".join(text + "\n\n" for text in page_texts)

    # 2. Split the text into chunks
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(all_text)
    print(f"Created {len(chunks)} text chunks.")

    # 3. Create embeddings and FAISS index
    print(f"Initializing embedding model: {EMBEDDING_MODEL}")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    print("Creating FAISS vector store from chunks...")
    vector_store = FAISS.from_texts(chunks, embeddings)

    # 4. Save the index to persistent storage
    # Create SAVE_PATH itself; os.path.dirname() would be "" for a bare
    # relative path and make os.makedirs() fail.
    os.makedirs(SAVE_PATH, exist_ok=True)
    vector_store.save_local(SAVE_PATH, index_name=index_name)
    print(f"Knowledge base created and saved to {SAVE_PATH}")
if __name__ == "__main__":
    # Running this file as a script builds the knowledge base; importing the
    # module does not trigger a build.
    create_knowledge_base()