Spaces:

Rohitface
/

Student_Agent

Runtime error

App Files Files Community

Student_Agent / knowledge_base.py

Rohitface

Create knowledge_base.py

a2f9184 verified 5 months ago

raw

history blame contribute delete

4.86 kB

	import os
	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin, urlparse
	from langchain_community.vectorstores import FAISS
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import time

	# --- Configuration ---
	# Root of the college website; the crawl starts from this URL.
	BASE_URL = "https://ggits.org/"
	# Folder the FAISS index is written to. Point this at persistent storage
	# so the index survives restarts.
	SAVE_PATH = "/data/faiss_index"
	# Name of the sentence-embedding model used to vectorize text chunks.
	EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

	# --- Web Scraping Functions ---

	def is_valid_url(url, base_domain):
	    """Return True if *url* is an absolute HTTP(S) URL on *base_domain*.

	    Args:
	        url: The URL string to check.
	        base_domain: The network location (``host[:port]``) the URL must
	            match, e.g. the ``netloc`` of the crawl's start URL.

	    Returns:
	        bool: True only when the URL has an ``http``/``https`` scheme and a
	        netloc equal to ``base_domain``. Hosts are compared
	        case-insensitively. Relative URLs, other schemes (``ftp:``,
	        ``mailto:``, ``javascript:``), empty input and unparsable URLs all
	        yield False.
	    """
	    if not url or not base_domain:
	        return False
	    try:
	        parsed_url = urlparse(url)
	    except ValueError:
	        # urlparse rejects some malformed inputs (e.g. an unterminated
	        # IPv6 literal); such a URL is simply not valid.
	        return False
	    # Only HTTP(S) pages can be fetched by the crawler.
	    if parsed_url.scheme not in ("http", "https"):
	        return False
	    return parsed_url.netloc.lower() == base_domain.lower()

	def get_all_website_links(url):
	    """Crawl a website breadth-first and return its same-domain page URLs.

	    Starting from ``url``, every fetched page is parsed for ``<a href>``
	    links. Each link is resolved against the page it was found on, stripped
	    of its query string and fragment, and queued for crawling if it belongs
	    to the start URL's domain and has not been seen before.

	    Args:
	        url: Absolute URL at which to start the crawl.

	    Returns:
	        list[str]: The unique URLs seen during the crawl, including the
	        start URL itself. Order is unspecified.
	    """
	    from collections import deque

	    domain_name = urlparse(url).netloc
	    # deque gives O(1) pops from the front of the crawl frontier.
	    queue = deque([url])
	    # The start URL is part of the result: callers scrape every returned
	    # link, and leaving it out would silently drop the landing page.
	    visited = {url}

	    print("Starting website crawl...")
	    while queue:
	        current_url = queue.popleft()
	        print(f"Crawling: {current_url}")
	        try:
	            response = requests.get(current_url, timeout=5)
	            response.raise_for_status()
	            soup = BeautifulSoup(response.content, "html.parser")
	        except (requests.exceptions.RequestException, ValueError) as e:
	            print(f"Could not process URL {current_url}: {e}")
	            continue

	        for a_tag in soup.find_all("a"):
	            href = a_tag.attrs.get("href")
	            if not href:
	                continue

	            try:
	                parsed_href = urlparse(urljoin(current_url, href))
	            except ValueError as e:
	                # A single malformed href must not discard the remaining
	                # links on this page.
	                print(f"Could not process URL {href}: {e}")
	                continue

	            # Drop query and fragment, and treat an empty path as "/", so
	            # the same page is not crawled under several spellings.
	            path = parsed_href.path or "/"
	            href = parsed_href.scheme + "://" + parsed_href.netloc + path

	            if is_valid_url(href, domain_name) and href not in visited:
	                visited.add(href)
	                queue.append(href)

	        # A small delay to avoid overwhelming the server
	        time.sleep(0.1)

	    print(f"Crawl finished. Found {len(visited)} unique links.")
	    return list(visited)

	def scrape_page(url):
	    """Fetch a web page and return its visible text.

	    Args:
	        url: Absolute URL of the page to download.

	    Returns:
	        str | None: The page text with ``<script>``/``<style>`` content
	        removed, whitespace runs collapsed to single spaces and blank lines
	        dropped (one output line per non-empty source line). ``None`` if
	        the request fails or the response declares a non-HTML content type.
	    """
	    try:
	        response = requests.get(url, timeout=10)
	        response.raise_for_status()
	    except requests.exceptions.RequestException as e:
	        print(f"Error scraping {url}: {e}")
	        return None

	    # The crawler returns every same-domain link, including PDFs and
	    # images; parsing those as HTML would only yield garbage text.
	    content_type = response.headers.get("Content-Type", "")
	    if content_type and "html" not in content_type.lower():
	        print(f"Skipping non-HTML content at {url}: {content_type}")
	        return None

	    soup = BeautifulSoup(response.content, "html.parser")

	    # Remove script and style elements
	    for script_or_style in soup(["script", "style"]):
	        script_or_style.decompose()

	    # Normalize whitespace inside each line but keep the words of a line
	    # together, so sentences stay intact for chunking and embedding.
	    cleaned_lines = (
	        " ".join(line.split()) for line in soup.get_text().splitlines()
	    )
	    return "\n".join(line for line in cleaned_lines if line)

	# --- Knowledge Base Creation Functions ---

	def create_knowledge_base():
	    """Build the FAISS knowledge base from the website unless one exists.

	    Crawls ``BASE_URL``, scrapes every discovered page, splits the combined
	    text into overlapping chunks, embeds them with ``EMBEDDING_MODEL`` and
	    saves the resulting FAISS index under ``SAVE_PATH``.

	    Returns:
	        None. Progress is reported with ``print``. The function returns
	        early if a saved index is already present or if no text could be
	        scraped.
	    """
	    # Look for the files save_local() writes rather than just the folder:
	    # an empty or half-written directory must not count as a finished
	    # index, or the build would be skipped forever.
	    index_files = ("index.faiss", "index.pkl")
	    if all(os.path.exists(os.path.join(SAVE_PATH, f)) for f in index_files):
	        print("Knowledge base already exists. Skipping creation.")
	        return

	    print("Creating new knowledge base...")

	    # 1. Scrape all website content
	    links = get_all_website_links(BASE_URL)
	    print(f"Scraping content from {len(links)} pages...")
	    page_texts = []
	    for link in links:
	        page_content = scrape_page(link)
	        if page_content:
	            page_texts.append(page_content)

	    if not page_texts:
	        print("No text was scraped. Aborting knowledge base creation.")
	        return

	    # Join once instead of growing a string inside the loop.
	    all_text = "".join(f"{page}\n\n" for page in page_texts)

	    # 2. Split the text into chunks
	    print("Splitting documents into chunks...")
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=1000,
	        chunk_overlap=200,
	        length_function=len,
	    )
	    chunks = text_splitter.split_text(all_text)
	    print(f"Created {len(chunks)} text chunks.")

	    # 3. Create embeddings and FAISS index
	    print(f"Initializing embedding model: {EMBEDDING_MODEL}")
	    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

	    print("Creating FAISS vector store from chunks...")
	    vector_store = FAISS.from_texts(chunks, embeddings)

	    # 4. Save the index to persistent storage
	    # dirname() is "" for a bare relative SAVE_PATH, and makedirs("") raises.
	    parent_dir = os.path.dirname(SAVE_PATH)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)
	    vector_store.save_local(SAVE_PATH)
	    print(f"Knowledge base created and saved to {SAVE_PATH}")

	if __name__ == "__main__":
	    # Executing this file as a script builds the knowledge base; importing
	    # it as a module does not.
	    create_knowledge_base()