Spaces:

Rohitface
/

Student_Agent

Runtime error

File size: 4,862 Bytes

a2f9184

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time

# --- Configuration ---
# Root URL of the college website; the crawl starts here and stays on this
# URL's domain.
BASE_URL = "https://ggits.org/"
# Directory the FAISS index is saved to. This should point to a persistent
# storage location, because an existing index causes the build to be skipped.
SAVE_PATH = "/data/faiss_index"
# Name of the embedding model passed to HuggingFaceEmbeddings to vectorize
# the text chunks.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# --- Web Scraping Functions ---

def is_valid_url(url, base_domain):
    """Checks if a URL is a fetchable web URL that belongs to the given domain.

    Args:
        url: The URL string to check.
        base_domain: The network location (host, plus port if any) the URL
            must belong to, e.g. ``"ggits.org"``.

    Returns:
        True if the URL uses the ``http`` or ``https`` scheme and its network
        location equals ``base_domain``; False otherwise. The comparison is
        case-insensitive because host names are not case-sensitive.
    """
    parsed_url = urlparse(url)
    # Only HTTP(S) pages can be crawled; reject ftp:, mailto:, javascript:, etc.
    # (urlparse already lower-cases the scheme.)
    if parsed_url.scheme not in ("http", "https"):
        return False
    netloc = parsed_url.netloc.lower()
    return bool(netloc) and netloc == base_domain.lower()

def get_all_website_links(url):
    """
    Crawls a website breadth-first starting from the given URL and returns
    all unique links discovered within the same domain.

    Every link is resolved against the page it was found on and stripped of
    its query string and fragment before being recorded, so URLs differing
    only in those parts count as one page. Pages that cannot be downloaded
    are reported on stdout and skipped; the crawl continues.

    Args:
        url: The URL at which the crawl starts. Its network location defines
            the domain the crawl is restricted to.

    Returns:
        A list of the normalized links found, in no particular order. Links
        equal to the starting URL are not included, because that URL is
        marked as visited before the crawl begins.
    """
    # Imported locally so this function does not depend on a file-level import.
    from collections import deque

    urls = set()
    domain_name = urlparse(url).netloc
    # deque gives O(1) removal from the front; list.pop(0) is O(n).
    queue = deque([url])
    visited = {url}

    print("Starting website crawl...")
    while queue:
        current_url = queue.popleft()
        print(f"Crawling: {current_url}")
        try:
            response = requests.get(current_url, timeout=5)
            response.raise_for_status()

            # PDFs, images and other binary files contain no anchors worth
            # parsing; skip them instead of feeding them to the HTML parser.
            content_type = response.headers.get("Content-Type", "")
            if content_type and "html" not in content_type.lower():
                continue

            soup = BeautifulSoup(response.content, "html.parser")

            for a_tag in soup.find_all("a"):
                href = a_tag.attrs.get("href")
                if not href:
                    continue

                # Resolve relative links, then drop the query and fragment.
                href = urljoin(current_url, href)
                parsed_href = urlparse(href)
                href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

                if is_valid_url(href, domain_name) and href not in visited:
                    urls.add(href)
                    visited.add(href)
                    queue.append(href)
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Could not process URL {current_url}: {e}")
        finally:
            # A small delay to avoid overwhelming the server; applied after
            # failed requests too, so an erroring site is not hit in a burst.
            time.sleep(0.1)
    print(f"Crawl finished. Found {len(urls)} unique links.")
    return list(urls)

def scrape_page(url):
    """Scrapes the text content from a single web page.

    Args:
        url: Address of the page to download.

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed,
        surrounding whitespace trimmed and empty fragments dropped, one
        fragment per line. This may be an empty string for a page without
        text. Returns None if the request fails or the response is not HTML.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # The crawler also collects links to PDFs, images, etc. Decoding
        # those as HTML would add binary garbage to the corpus, so skip them.
        content_type = response.headers.get("Content-Type", "")
        if content_type and "html" not in content_type.lower():
            print(f"Skipping non-HTML content at {url}: {content_type}")
            return None

        soup = BeautifulSoup(response.content, "html.parser")

        # Remove script and style elements
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()

        # Get text and clean it up: trim each line, break lines on runs of
        # two spaces, and keep only the non-empty pieces.
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None

# --- Knowledge Base Creation Functions ---

def create_knowledge_base():
    """
    Orchestrates the creation of the knowledge base by scraping the website,
    processing the text, and saving it to a FAISS vector store.

    Steps: crawl BASE_URL for same-domain links, scrape the text of every
    page, split the combined text into overlapping chunks, embed the chunks
    with EMBEDDING_MODEL, and save the resulting FAISS index to SAVE_PATH.

    Returns:
        None. The function returns early, after printing a message, if an
        index already exists at SAVE_PATH or if no text could be scraped.
    """
    # Check for the index file rather than just the directory, so an empty or
    # half-written SAVE_PATH left by a failed run does not block a rebuild.
    # save_local() writes "index.faiss" with its default index name.
    if os.path.exists(os.path.join(SAVE_PATH, "index.faiss")):
        print("Knowledge base already exists. Skipping creation.")
        return

    print("Creating new knowledge base...")

    # 1. Scrape all website content
    links = get_all_website_links(BASE_URL)
    pages = list(links)
    # The crawler marks its start URL as visited up front and so does not
    # return it. Add the home page explicitly, unless a trailing-slash
    # variant of it is already present.
    known = {link.rstrip("/") for link in links}
    if BASE_URL.rstrip("/") not in known:
        pages.insert(0, BASE_URL)

    print(f"Scraping content from {len(pages)} pages...")
    page_texts = []
    for link in pages:
        page_content = scrape_page(link)
        if page_content:
            page_texts.append(page_content)

    if not page_texts:
        print("No text was scraped. Aborting knowledge base creation.")
        return

    # Join once instead of repeated "+=" to avoid quadratic string building;
    # every page is followed by a blank line.
    all_text = "".join(page_text + "\n\n" for page_text in page_texts)

    # 2. Split the text into chunks
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(all_text)
    print(f"Created {len(chunks)} text chunks.")

    # 3. Create embeddings and FAISS index
    print(f"Initializing embedding model: {EMBEDDING_MODEL}")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    print("Creating FAISS vector store from chunks...")
    vector_store = FAISS.from_texts(chunks, embeddings)

    # 4. Save the index to persistent storage
    os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
    vector_store.save_local(SAVE_PATH)
    print(f"Knowledge base created and saved to {SAVE_PATH}")

if __name__ == "__main__":
    # Build the knowledge base when this file is executed as a script;
    # merely importing the module does not start a crawl.
    create_knowledge_base()