Spaces:
Sleeping
Sleeping
File size: 8,490 Bytes
import requests
from bs4 import BeautifulSoup
from uuid import uuid4
from urllib.parse import urljoin, urlparse
from collections import deque
import tldextract
from typing import List, Dict
from app.config import qdrant_client, embedding_model, demo_chatbot_configs
from qdrant_client.models import VectorParams, Distance
from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from app.ingestion.models import ChatbotIngest
import re
def scrape_website(
    start_url: str,
    timeout: int = 10,
    max_pages: int = 150,
) -> List[Dict[str, str]]:
    """
    Crawl and extract cleaned text from all pages and subdomains
    under the same registered domain as ``start_url``.

    Args:
        start_url: Entry URL (e.g. https://example.com)
        timeout: Per-request timeout in seconds
        max_pages: Hard cap on pages fetched to prevent crawl explosion

    Returns:
        List of dicts: [{ "url": str, "text": str }]

    Raises:
        ValueError: If no readable content could be extracted from any page.
    """
    print(f"Scraping website starting at {start_url}")

    def registered_domain(url: str) -> str:
        ext = tldextract.extract(url)
        return f"{ext.domain}.{ext.suffix}"

    def canonical(url: str) -> str:
        # Drop the #fragment so URLs differing only by anchor count as one
        # page and don't burn the max_pages budget on duplicates.
        return urlparse(url)._replace(fragment="").geturl()

    base_domain = registered_domain(start_url)
    visited = set()
    start = canonical(start_url)
    queue = deque([start])
    # Everything ever enqueued — prevents the queue growing without bound
    # when many pages link to the same not-yet-visited URL.
    seen = {start}
    results: List[Dict[str, str]] = []

    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            # Unreachable / non-2xx pages are skipped, not fatal.
            continue
        soup = BeautifulSoup(response.text, "lxml")
        # Remove non-content elements
        for elem in soup(["script", "style", "noscript", "header", "footer", "aside", "nav", "iframe"]):
            elem.decompose()
        # Semantic-aware content extraction: prefer structural content tags.
        content_tags = soup.find_all(["article", "main", "section"])
        text_blocks = []
        if content_tags:
            for tag in content_tags:
                for elem in tag.find_all(["p", "li", "span"]):
                    txt = elem.get_text(strip=True)
                    if len(txt) > 20:  # skip short/noisy text
                        text_blocks.append(txt)
        else:
            # fallback: headings + the paragraphs that follow them
            for h in soup.find_all(["h1", "h2", "h3"]):
                section_text = [
                    p.get_text(strip=True) for p in h.find_all_next("p") if len(p.get_text(strip=True)) > 20
                ]
                if section_text:
                    text_blocks.append(h.get_text(strip=True) + "\n" + " ".join(section_text))
        # Discover internal links + subdomains BEFORE skipping textless pages,
        # so navigation-only pages still contribute their outgoing links
        # (previously the crawl dead-ended on such pages).
        for link in soup.find_all("a", href=True):
            next_url = canonical(urljoin(url, link["href"]))
            parsed = urlparse(next_url)
            if parsed.scheme not in ("http", "https"):
                continue
            if registered_domain(next_url) != base_domain:
                continue
            if next_url not in seen:
                seen.add(next_url)
                queue.append(next_url)
        if not text_blocks:
            continue
        text = "\n".join(text_blocks)
        # Remove repeated/noisy e-commerce boilerplate
        text = re.sub(r"(Out of stock|Add to cart|Select Title Default Title)", "", text, flags=re.I)
        text = re.sub(r"\n\s*\n", "\n", text)
        results.append({
            "url": url,
            "text": text
        })
    if not results:
        raise ValueError(
            f"""Website scraping failed for {start_url}. No readable content found.\n\n
            Possible reasons:\n
            1) The URL is incorrect or unreachable.\n
            2) The site requires login or JavaScript to display content.\n
            3) The page contains only images/media without text.\n\n
            Please check the URL and try again.
            """)
    return results
def chunk_and_embed(
    chatbot_id: str,
    pages: List[Dict[str, str]],
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
):
    """
    Convert scraped website pages into embedded chunks and store them in a
    chatbot-scoped Qdrant collection named ``chatbot_<chatbot_id>``.

    Args:
        chatbot_id: Unique id used to scope the Qdrant collection.
        pages: Scraped pages as [{ "url": str, "text": str }].
        chunk_size: Maximum characters per chunk (default preserves
            previous hard-coded behavior).
        chunk_overlap: Character overlap between consecutive chunks.

    Raises:
        ValueError: If ``pages`` is empty or contains no usable text.
    """
    if not pages:
        raise ValueError("No pages to chunk and embed")

    collection_name = f"chatbot_{chatbot_id}"
    if not qdrant_client.collection_exists(collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                # NOTE(review): 768 must match the output dimensionality of
                # `embedding_model` — confirm if the model is ever swapped.
                size=768,
                distance=Distance.COSINE,
            ),
        )

    # Convert pages → LangChain Documents, dropping pages with no text.
    documents: List[Document] = [
        Document(
            page_content=page["text"],
            metadata={
                "chatbot_id": chatbot_id,
                "source": "website",
                "url": page["url"],
            },
        )
        for page in pages
        if page.get("text")
    ]
    if not documents:
        raise ValueError("No valid documents extracted from pages")

    # Chunk
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(documents)

    # Embed + Store
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=collection_name,
        embedding=embeddings,
    )
    ids = [str(uuid4()) for _ in chunks]
    vector_store.add_documents(chunks, ids=ids)
    print(f"Stored {len(chunks)} chunks in Qdrant collection {collection_name}")
def build_demo_prompt(ingest: "ChatbotIngest") -> str:
    """
    Build the strict RAG system-prompt template for a demo chatbot.

    Args:
        ingest: Chatbot ingest config providing names, allowed purposes,
            banned topics and tone preferences.

    Returns:
        A prompt template string that restricts answers to retrieved context.
    """
    chatbot_name = ingest.chatbot_name
    company_name = ingest.company_name
    # join() on an empty purpose list yields "", which is falsy → fallback.
    allowed_topics = ", ".join(ingest.chatbot_purpose) or "general questions"
    banned_topics = ingest.sensitive_topics or "sensitive topics"
    response_style = ", ".join(ingest.tone_style) if ingest.tone_style else "clear and concise"
    fallback_message = "Sorry, I cannot answer that question. Please call or email for further assistance. Information can be found on the website"
    template = f"""
You are {chatbot_name}, an assistant for {company_name}.
Answer ONLY using the provided context from {company_name}'s approved content.
STRICT RULES:
1. If the Contextual Knowledge section is empty, say: "{fallback_message}"
2. Do NOT use your own general knowledge. Only reference the Contextual Knowledge.
3. Only reference topics explicitly allowed: {allowed_topics}.
4. Do NOT discuss banned topics: {banned_topics}.
5. Keep responses {response_style}.
6. Keep the answers clear and concise in 1-3 sentences
"""
    return template
def build_welcome_message(ingest: ChatbotIngest) -> str:
    """
    Compose a friendly, user-facing greeting for the chatbot from its
    ingest config.

    The lead-capture purpose is internal-only, so it is filtered out of
    the capability list shown to visitors.
    """
    # Fall back to "<Company> Assistant" when no explicit name was given.
    bot_name = ingest.chatbot_name or f"{ingest.company_name} Assistant"

    visible_purposes = [
        purpose
        for purpose in (ingest.chatbot_purpose or [])
        if purpose.lower() != "capture leads (email, phone)"
    ]

    parts = [f"Hello! 👋 I'm {bot_name}, your virtual assistant for {ingest.company_name}.\n"]
    if visible_purposes:
        bullet_list = "\n".join(f"- {purpose}" for purpose in visible_purposes)
        parts.append("I can help you with:\n" + bullet_list + "\n")
    parts.append("Just type your question below and I'll do my best to help!")
    return "".join(parts)
def store_demo_rag_config(chatbot_id, company_id, ingest: ChatbotIngest) -> None:
    """
    Persist the demo chatbot's RAG configuration document to MongoDB.

    The stored document bundles the generated prompt template, welcome
    message and the retriever spec pointing at the chatbot-scoped Qdrant
    collection.
    """
    retriever_spec = {
        "name": "all",
        "collection": f"chatbot_{chatbot_id}",
        "top_k": 25,
        "filter_score": 0.7,
    }
    config_doc = {
        "submission_id": ingest.submission_id,
        "chatbot_id": chatbot_id,
        "company_id": company_id,
        "chatbot_name": ingest.chatbot_name,
        "company_name": ingest.company_name,
        "pricing_plan": ingest.pricing_plan,
        "prompt_template": build_demo_prompt(ingest),
        "welcome_message": build_welcome_message(ingest),
        "retrievers": [retriever_spec],
    }
    result = demo_chatbot_configs.insert_one(config_doc)
    print(f"Inserted RAG config for {ingest.company_name}, _id={result.inserted_id}")