File size: 8,490 Bytes
0df7bd3
 
 
 
 
 
 
 
1fdc232
0df7bd3
 
 
 
 
 
 
 
 
1fdc232
 
1085917
 
0df7bd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1085917
0df7bd3
 
1085917
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0df7bd3
1085917
 
 
0df7bd3
44569c0
 
 
 
 
 
0df7bd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8f11c7
 
 
 
 
 
 
 
 
 
0df7bd3
 
 
1fdc232
0df7bd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1fdc232
 
 
 
f8f11c7
1fdc232
 
 
 
649efae
1fdc232
 
 
 
 
 
 
 
 
 
 
f8f11c7
1fdc232
 
 
73962a1
 
fa5a94c
73962a1
 
 
 
 
fa5a94c
 
 
 
 
 
 
 
 
 
 
73962a1
 
 
 
 
649efae
1fdc232
 
 
 
207e62f
1fdc232
649efae
846ec25
1fdc232
f7867c7
1fdc232
73962a1
1fdc232
 
 
 
649efae
f8f11c7
1fdc232
 
 
 
27cf46e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import requests
from bs4 import BeautifulSoup
from uuid import uuid4
from urllib.parse import urljoin, urlparse
from collections import deque
import tldextract
from typing import List, Dict

from app.config import qdrant_client, embedding_model, demo_chatbot_configs

from qdrant_client.models import VectorParams, Distance

from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from qdrant_client import QdrantClient

from app.ingestion.models import ChatbotIngest

import re


def scrape_website(
    start_url: str,
    timeout: int = 10,
    max_pages: int = 150,
) -> List[Dict[str, str]]:
    """
    Crawl and extract cleaned text from all pages and subdomains
    under the same registered domain as ``start_url``.

    Args:
        start_url: Entry URL (e.g. https://example.com)
        timeout: Per-request timeout in seconds
        max_pages: Hard cap on fetched pages to prevent crawl explosion

    Returns:
        List of dicts: [{ "url": str, "text": str }]

    Raises:
        ValueError: If no readable content could be extracted from any page.
    """
    print(f"Scraping website starting at {start_url}")

    def registered_domain(url: str) -> str:
        # e.g. "blog.example.co.uk" -> "example.co.uk"
        ext = tldextract.extract(url)
        return f"{ext.domain}.{ext.suffix}"

    # Compiled once instead of per page: strips e-commerce boilerplate phrases.
    noise_re = re.compile(r"(Out of stock|Add to cart|Select Title Default Title)", re.I)

    base_domain = registered_domain(start_url)

    visited = set()        # URLs already fetched (or attempted)
    enqueued = {start_url} # every URL ever queued — prevents duplicate queue entries
    queue = deque([start_url])
    results: List[Dict[str, str]] = []

    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue

        visited.add(url)

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            # Unreachable or erroring pages are skipped, not fatal.
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # Remove non-content elements before text extraction.
        for elem in soup(["script", "style", "noscript", "header", "footer", "aside", "nav", "iframe"]):
            elem.decompose()

        # Discover internal links (same registered domain, incl. subdomains)
        # BEFORE the content check, so link-only hub pages still feed the crawl.
        for link in soup.find_all("a", href=True):
            next_url = urljoin(url, link["href"])
            parsed = urlparse(next_url)

            if parsed.scheme not in ("http", "https"):
                continue

            # Drop the #fragment so the same page isn't crawled once per anchor.
            next_url = parsed._replace(fragment="").geturl()

            if registered_domain(next_url) != base_domain:
                continue

            if next_url not in enqueued:
                enqueued.add(next_url)
                queue.append(next_url)

        # Semantic-aware content extraction: prefer article/main/section.
        content_tags = soup.find_all(["article", "main", "section"])
        text_blocks = []

        if content_tags:
            for tag in content_tags:
                for elem in tag.find_all(["p", "li", "span"]):
                    txt = elem.get_text(strip=True)
                    if len(txt) > 20:  # skip short/noisy fragments
                        text_blocks.append(txt)
        else:
            # Fallback: each heading plus the paragraphs that follow it.
            for h in soup.find_all(["h1", "h2", "h3"]):
                section_text = [
                    p.get_text(strip=True)
                    for p in h.find_all_next("p")
                    if len(p.get_text(strip=True)) > 20
                ]
                if section_text:
                    text_blocks.append(h.get_text(strip=True) + "\n" + " ".join(section_text))

        if not text_blocks:
            continue

        text = "\n".join(text_blocks)

        # Remove repeated/noisy boilerplate and collapse blank lines.
        text = noise_re.sub("", text)
        text = re.sub(r"\n\s*\n", "\n", text)

        results.append({
            "url": url,
            "text": text,
        })

    if not results:
        raise ValueError(
            f"Website scraping failed for {start_url}. No readable content found.\n\n"
            "Possible reasons:\n"
            "1) The URL is incorrect or unreachable.\n"
            "2) The site requires login or JavaScript to display content.\n"
            "3) The page contains only images/media without text.\n\n"
            "Please check the URL and try again."
        )

    return results


def chunk_and_embed(chatbot_id: str, pages: List[Dict[str, str]]):
    """
    Convert scraped website pages into embedded chunks and store them in a
    chatbot-scoped Qdrant collection named ``chatbot_<chatbot_id>``.

    Raises:
        ValueError: If ``pages`` is empty or contains no non-empty text.
    """
    if not pages:
        raise ValueError("No pages to chunk and embed")

    collection_name = f"chatbot_{chatbot_id}"

    # Create the per-chatbot collection on first use (768-dim cosine vectors).
    if not qdrant_client.collection_exists(collection_name):
        vectors_config = VectorParams(size=768, distance=Distance.COSINE)
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=vectors_config,
        )

    # Convert pages → LangChain Documents, skipping pages with no text.
    documents: List[Document] = []
    for page in pages:
        if not page.get("text"):
            continue
        metadata = {
            "chatbot_id": chatbot_id,
            "source": "website",
            "url": page["url"],
        }
        documents.append(Document(page_content=page["text"], metadata=metadata))

    if not documents:
        raise ValueError("No valid documents extracted from pages")

    # Chunk into overlapping windows.
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
    chunks = splitter.split_documents(documents)

    # Embed and store in Qdrant under fresh UUIDs.
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=collection_name,
        embedding=embeddings,
    )
    vector_store.add_documents(chunks, ids=[str(uuid4()) for _ in chunks])

    print(f"Stored {len(chunks)} chunks in Qdrant collection {collection_name}")


def build_demo_prompt(ingest: ChatbotIngest) -> str:
    """
    Build the system-prompt template for a demo chatbot from its ingest config.

    Args:
        ingest: Chatbot ingest configuration (name, company, purpose, tone, ...).

    Returns:
        The prompt template string with strict grounding rules baked in.
    """
    chatbot_name = ingest.chatbot_name
    company_name = ingest.company_name
    # Guard against a None purpose list, matching build_welcome_message.
    allowed_topics = ", ".join(ingest.chatbot_purpose or []) or "general questions"
    banned_topics = ingest.sensitive_topics or "sensitive topics"
    response_style = ", ".join(ingest.tone_style) if ingest.tone_style else "clear and concise"
    fallback_message = "Sorry, I cannot answer that question. Please call or email for further assistance. Information can be found on the website"

    template = f"""
You are {chatbot_name}, an assistant for {company_name}.
Answer ONLY using the provided context from {company_name}'s approved content.

STRICT RULES:
1. If the Contextual Knowledge section is empty, say: "{fallback_message}"
2. Do NOT use your own general knowledge. Only reference the Contextual Knowledge.
3. Only reference topics explicitly allowed: {allowed_topics}.
4. Do NOT discuss banned topics: {banned_topics}.
5. Keep responses {response_style}.
6. Keep the answers clear and concise in 1-3 sentences
"""
    return template

def build_welcome_message(ingest: ChatbotIngest) -> str:
    """
    Build a flexible and user-friendly welcome message for the chatbot using its ingest config.
    """
    # Fall back to "<Company> Assistant" when no explicit name was given.
    bot_name = ingest.chatbot_name or f"{ingest.company_name} Assistant"

    # Lead capture is internal plumbing — don't advertise it to the visitor.
    advertised = [
        purpose
        for purpose in (ingest.chatbot_purpose or [])
        if purpose.lower() != "capture leads (email, phone)"
    ]

    parts = [f"Hello! 👋 I'm {bot_name}, your virtual assistant for {ingest.company_name}.\n"]

    if advertised:
        bullets = "\n".join(f"- {purpose}" for purpose in advertised)
        parts.append(f"I can help you with:\n{bullets}\n")

    # Flexible closing line.
    parts.append("Just type your question below and I'll do my best to help!")

    return "".join(parts)


def store_demo_rag_config(chatbot_id, company_id, ingest: ChatbotIngest) -> None:
    """
    Stores the RAG configuration prompt for the demo chatbot in MongoDB.
    """
    config_doc = {
        "submission_id": ingest.submission_id,
        "chatbot_id": chatbot_id,
        "company_id": company_id,
        "chatbot_name": ingest.chatbot_name,
        "company_name": ingest.company_name,
        "pricing_plan": ingest.pricing_plan,
        "prompt_template": build_demo_prompt(ingest),
        "welcome_message": build_welcome_message(ingest),
        # Single retriever over the chatbot-scoped Qdrant collection.
        "retrievers": [
            {
                "name": "all",
                "collection": f"chatbot_{chatbot_id}",
                "top_k": 25,
                "filter_score": 0.7,
            }
        ],
    }

    inserted = demo_chatbot_configs.insert_one(config_doc)
    print(f"Inserted RAG config for {ingest.company_name}, _id={inserted.inserted_id}")