Spaces:
Runtime error
Runtime error
import os
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
| # --- Configuration --- | |
| # The root URL of the college website to be scraped. | |
| BASE_URL = "https://ggits.org/" | |
| # Directory to save the FAISS index. This should point to a persistent storage location. | |
| SAVE_PATH = "/data/faiss_index" | |
| # The embedding model to use for vectorizing the text. | |
| EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" | |
| # --- Web Scraping Functions --- | |
| def is_valid_url(url, base_domain): | |
| """Checks if a URL is valid and belongs to the same domain.""" | |
| parsed_url = urlparse(url) | |
| return bool(parsed_url.netloc) and parsed_url.netloc == base_domain | |
def get_all_website_links(url):
    """
    Breadth-first crawl of the site rooted at *url*.

    Returns a list of unique same-domain URLs, normalized to
    scheme://netloc/path (query strings and fragments stripped).
    Pages that fail to download are reported and skipped.
    """
    # Local import: deque gives O(1) popleft; list.pop(0) is O(n) per dequeue.
    from collections import deque

    urls = set()
    domain_name = urlparse(url).netloc
    queue = deque([url])
    visited = {url}
    print("Starting website crawl...")
    while queue:
        current_url = queue.popleft()
        print(f"Crawling: {current_url}")
        try:
            response = requests.get(current_url, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # find_all replaces the deprecated findAll alias (BeautifulSoup 4).
            for a_tag in soup.find_all("a"):
                href = a_tag.attrs.get("href")
                if not href:
                    continue
                # Resolve relative links, then drop query string / fragment so
                # the same page is not queued under multiple URLs.
                href = urljoin(current_url, href)
                parsed_href = urlparse(href)
                href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
                if is_valid_url(href, domain_name) and href not in visited:
                    urls.add(href)
                    visited.add(href)
                    queue.append(href)
            # A small delay to avoid overwhelming the server
            time.sleep(0.1)
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Could not process URL {current_url}: {e}")
    print(f"Crawl finished. Found {len(urls)} unique links.")
    return list(urls)
def scrape_page(url):
    """
    Download *url* and return its visible text, or None on any request error.

    Script/style elements are removed and whitespace is collapsed so the
    result is suitable for chunking into a knowledge base.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        # Remove non-visible content before extracting text.
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()
        # Get text and clean it up.
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        # Split on DOUBLE spaces (layout gaps), not single spaces: splitting
        # on " " put every word on its own line, destroying the sentence
        # structure the text splitter relies on.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None
| # --- Knowledge Base Creation Functions --- | |
def create_knowledge_base():
    """
    Build and persist the FAISS knowledge base for the college website.

    Does nothing when an index already exists at SAVE_PATH. Otherwise it
    crawls the site, scrapes every page, splits the text into overlapping
    chunks, embeds them, and saves the resulting FAISS index.
    """
    if os.path.exists(SAVE_PATH):
        print("Knowledge base already exists. Skipping creation.")
        return
    print("Creating new knowledge base...")

    # 1. Scrape all website content.
    links = get_all_website_links(BASE_URL)
    print(f"Scraping content from {len(links)} pages...")
    scraped = (scrape_page(link) for link in links)
    all_text = "".join(content + "\n\n" for content in scraped if content)
    if not all_text:
        print("No text was scraped. Aborting knowledge base creation.")
        return

    # 2. Split the text into overlapping chunks for retrieval.
    print("Splitting documents into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = splitter.split_text(all_text)
    print(f"Created {len(chunks)} text chunks.")

    # 3. Embed the chunks and build the FAISS vector store.
    print(f"Initializing embedding model: {EMBEDDING_MODEL}")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    print("Creating FAISS vector store from chunks...")
    vector_store = FAISS.from_texts(chunks, embeddings)

    # 4. Save the index to persistent storage.
    os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
    vector_store.save_local(SAVE_PATH)
    print(f"Knowledge base created and saved to {SAVE_PATH}")
if __name__ == "__main__":
    # Running this module directly builds the knowledge base.
    create_knowledge_base()