"""Build a FAISS knowledge base from the text of a college website.

The script crawls every same-domain page reachable from ``BASE_URL``,
extracts the visible text of each page, splits the text into overlapping
chunks, embeds the chunks and saves the resulting FAISS index under
``SAVE_PATH``.
"""

import os
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# --- Configuration ---
# The root URL of the college website to be scraped.
BASE_URL = "https://ggits.org/"
# Directory to save the FAISS index. This should point to a persistent storage location.
SAVE_PATH = "/data/faiss_index"
# The embedding model to use for vectorizing the text.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# File that FAISS.save_local() writes inside SAVE_PATH when the default
# index name ("index") is used; its presence marks a completed build.
_INDEX_FILE = "index.faiss"
# Runs of two or more whitespace characters separate phrases within a line.
_PHRASE_BREAK = re.compile(r"\s{2,}")


# --- Web Scraping Functions ---
def is_valid_url(url, base_domain):
    """Checks if a URL is valid and belongs to the same domain.

    Args:
        url: Absolute URL to check.
        base_domain: Network location (host[:port]) the URL must match.

    Returns:
        True when the URL has a network location equal to ``base_domain``.
    """
    netloc = urlparse(url).netloc
    return bool(netloc) and netloc == base_domain


def _is_html(response):
    """Return True unless the response declares a non-HTML content type."""
    content_type = response.headers.get("Content-Type", "")
    # A missing header is treated as HTML so such pages are still processed.
    return not content_type or "html" in content_type.lower()


def _normalize_link(page_url, href):
    """Resolve ``href`` against ``page_url`` and strip query and fragment.

    Returns:
        ``scheme://netloc/path`` for http(s) links, or None when the href is
        empty, malformed, or uses another scheme (mailto:, tel:, ...).
    """
    href = href.strip()
    if not href:
        return None
    try:
        parsed = urlparse(urljoin(page_url, href))
    except ValueError:
        # Malformed hrefs (e.g. a broken IPv6 literal) must not abort the page.
        return None
    if parsed.scheme not in ("http", "https"):
        return None
    # "https://host" and "https://host/" are the same page; use one spelling.
    path = parsed.path or "/"
    return f"{parsed.scheme}://{parsed.netloc}{path}"


def _fetch_hrefs(page_url):
    """Fetch ``page_url`` and return the raw href values of its anchor tags.

    Returns an empty list when the request fails or the response is not HTML.
    """
    try:
        response = requests.get(page_url, timeout=5)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Could not process URL {page_url}: {e}")
        return []
    if not _is_html(response):
        return []
    soup = BeautifulSoup(response.content, "html.parser")
    return [a_tag["href"] for a_tag in soup.find_all("a", href=True)]


def get_all_website_links(url):
    """
    Crawls a website starting from the given URL and returns all unique
    links within the same domain.

    The crawl is breadth-first. Query strings and fragments are dropped, so
    each path is visited once.

    Args:
        url: Absolute URL at which the crawl starts.

    Returns:
        List of unique same-domain URLs in discovery order, beginning with
        the start URL itself.
    """
    domain_name = urlparse(url).netloc
    # The start page belongs to the site as well, so it is part of the result.
    discovered = [url]
    seen = {url}
    queue = deque([url])
    print("Starting website crawl...")
    while queue:
        current_url = queue.popleft()
        print(f"Crawling: {current_url}")
        hrefs = _fetch_hrefs(current_url)
        # A small delay to avoid overwhelming the server
        time.sleep(0.1)
        for href in hrefs:
            link = _normalize_link(current_url, href)
            if link is None or link in seen:
                continue
            if not is_valid_url(link, domain_name):
                continue
            seen.add(link)
            discovered.append(link)
            queue.append(link)
    print(f"Crawl finished. Found {len(discovered)} unique links.")
    return discovered


def scrape_page(url):
    """Scrapes the text content from a single web page.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The visible text with one phrase per line, or None when the request
        fails or the response is not HTML.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None
    if not _is_html(response):
        # Binary payloads (PDFs, images) would only yield garbage text.
        print(f"Skipping non-HTML content at {url}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    # Remove script and style elements
    for element in soup(["script", "style"]):
        element.decompose()

    # Get text and clean it up: trim every line, break lines at wide gaps
    # and drop the empty pieces.
    lines = (line.strip() for line in soup.get_text().splitlines())
    phrases = (
        phrase.strip() for line in lines for phrase in _PHRASE_BREAK.split(line)
    )
    return "\n".join(phrase for phrase in phrases if phrase)


# --- Knowledge Base Creation Functions ---
def create_knowledge_base():
    """
    Orchestrates the creation of the knowledge base by scraping the website,
    processing the text, and saving it to a FAISS vector store.

    Does nothing when a saved index is already present in ``SAVE_PATH``, and
    aborts without writing anything when no text could be scraped.
    """
    # Check for the index file rather than the directory so that an empty or
    # half-written directory left by a failed run does not block a rebuild.
    if os.path.exists(os.path.join(SAVE_PATH, _INDEX_FILE)):
        print("Knowledge base already exists. Skipping creation.")
        return

    print("Creating new knowledge base...")

    # 1. Scrape all website content
    links = get_all_website_links(BASE_URL)
    print(f"Scraping content from {len(links)} pages...")
    pages = [text for text in map(scrape_page, links) if text]
    if not pages:
        print("No text was scraped. Aborting knowledge base creation.")
        return
    all_text = "\n\n".join(pages)

    # 2. Split the text into chunks
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(all_text)
    print(f"Created {len(chunks)} text chunks.")

    # 3. Create embeddings and FAISS index
    print(f"Initializing embedding model: {EMBEDDING_MODEL}")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    print("Creating FAISS vector store from chunks...")
    vector_store = FAISS.from_texts(chunks, embeddings)

    # 4. Save the index to persistent storage
    os.makedirs(SAVE_PATH, exist_ok=True)
    vector_store.save_local(SAVE_PATH)
    print(f"Knowledge base created and saved to {SAVE_PATH}")


if __name__ == "__main__":
    # This allows the script to be run directly to build the knowledge base.
    create_knowledge_base()