File size: 5,091 Bytes
aa8691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import re
import sys
from dataclasses import dataclass
from pathlib import Path

import bs4
import numpy as np
import requests
from sambanova import SambaNova
import yaml
from langchain_huggingface import HuggingFaceEmbeddings

@dataclass
class DocumentChunk:
    """A single embedded slice of scraped page text, used for retrieval."""
    text: str  # chunk text produced by split_into_chunks()
    source: str  # URL the text was scraped from
    vector: np.ndarray  # embedding of `text` (np.array of the model output)


def load_config(path: Path) -> dict:
    """Parse the YAML file at *path* and return its contents as a dict."""
    return yaml.safe_load(path.read_text(encoding="utf-8"))


def scrape_website(url: str) -> str:
    """Fetch *url* and return its visible text with blank-line runs collapsed.

    Boilerplate containers (scripts, styles, headers, footers, nav, asides)
    are removed before text extraction.
    """
    resp = requests.get(url, timeout=15)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.text, "html.parser")
    # Drop non-content elements so get_text() only sees the page body.
    boilerplate = ["script", "style", "header", "footer", "nav", "aside"]
    for element in soup(boilerplate):
        element.decompose()
    raw = soup.get_text(separator="\n")
    return re.sub(r"\n{2,}", "\n", raw).strip()


def split_into_chunks(text: str, chunk_size: int = 400, overlap: int = 100) -> list[str]:
    """Split *text* into roughly chunk_size-character chunks on sentence boundaries.

    Consecutive chunks share up to *overlap* trailing characters of context so
    information straddling a boundary stays retrievable.

    Args:
        text: Raw text to split.
        chunk_size: Soft upper bound on chunk length in characters; a single
            sentence longer than this still becomes one oversized chunk.
        overlap: Trailing characters carried over into the next chunk;
            0 (or negative) means no carry-over.

    Returns:
        List of non-empty, stripped chunk strings ([] for empty input).
    """
    # Split on whitespace that follows sentence-ending punctuation.
    sentences = [s.strip() for s in re.split(r"(?<=[\.\?\!])\s+", text) if s.strip()]
    chunks: list[str] = []
    current = ""
    for sentence in sentences:
        if current and len(current) + len(sentence) + 1 > chunk_size:
            chunks.append(current.strip())
            # BUG FIX: the original used `current[-overlap:]` for the carried
            # tail, but `current[-0:]` is the WHOLE string, so overlap=0
            # duplicated every chunk into the next. Keep a tail only when
            # overlap is positive (slicing already caps it at len(current)).
            current = current[-overlap:] if overlap > 0 else ""
        current += " " + sentence
    if current.strip():
        chunks.append(current.strip())
    return chunks


def embed_texts(texts: list[str], embed_model: "HuggingFaceEmbeddings | None" = None) -> list[np.ndarray]:
    """Embed *texts* with *embed_model*.

    Args:
        texts: Strings to embed.
        embed_model: Embedding backend; when None, no embedding is possible.

    Returns:
        One vector per input text (as returned by embed_documents), or an
        empty list when *texts* is empty or no model was supplied.
    """
    if not texts:
        return []
    if embed_model is not None:
        return embed_model.embed_documents(texts)
    # BUG FIX: the original fell off the end here and implicitly returned
    # None, which crashed callers that zip() the result (build_rag_corpus).
    return []


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors *a* and *b*, in [-1.0, 1.0].

    A zero vector has no direction, so similarity involving one is defined
    as 0.0 (this also avoids division by zero).
    """
    # Hoist the norms: the original recomputed np.linalg.norm up to twice
    # per vector (once in the guard, again in the denominator).
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))


def build_rag_corpus(config: dict, embed_model: HuggingFaceEmbeddings, url: str) -> list[DocumentChunk]:
    """Scrape *url*, chunk its text, embed each chunk, and wrap the results.

    *config* is accepted for interface parity but is not read here.
    """
    print(f"Scraping website: {url}")
    page_text = scrape_website(url)
    pieces = split_into_chunks(page_text)
    print(f"Split content into {len(pieces)} chunks")
    vectors = embed_texts(pieces, embed_model)
    corpus = []
    for piece, vec in zip(pieces, vectors):
        corpus.append(DocumentChunk(text=piece, source=url, vector=np.array(vec)))
    return corpus


def retrieve_relevant_chunks(chunks: list[DocumentChunk], question: str, embed_model: HuggingFaceEmbeddings, top_k: int = 4) -> list[DocumentChunk]:
    """Return the *top_k* chunks most cosine-similar to *question*.

    Falls back to the first *top_k* chunks when the question could not be
    embedded (e.g. no vectors came back from the model).
    """
    q_vectors = embed_texts([question], embed_model)
    if not q_vectors:
        return chunks[:top_k]
    q_vec = np.array(q_vectors[0])
    # sorted() is stable, so ties keep their original corpus order — same
    # ordering as sorting (chunk, score) pairs by score.
    ranked = sorted(
        chunks,
        key=lambda chunk: cosine_similarity(q_vec, chunk.vector),
        reverse=True,
    )
    return ranked[:top_k]


def build_prompt(system_prompt: str, question: str, context_chunks: list[DocumentChunk]) -> str:
    """Assemble the final LLM prompt: system text, retrieved context, question."""
    context_text = "\n---\n".join(chunk.text for chunk in context_chunks)
    parts = [
        system_prompt,
        "",
        "Use the following extracted website text to answer the question clearly.",
        f"Context:\n{context_text}",
        "",
        f"Question: {question}",
        "",
    ]
    return "\n".join(parts)


def create_llm_client(config: dict) -> SambaNova:
    """Build a SambaNova API client using the key from *config*."""
    api_key = config.get("sambanova_api_key")
    client = SambaNova(
        api_key=api_key,
        base_url="https://api.sambanova.ai/v1",
        timeout=30,
    )
    return client


def ask_model(prompt: str, client: SambaNova) -> str:
    """Send *prompt* as a single user message and return the stripped reply."""
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="DeepSeek-V3.1",
        messages=messages,
        max_tokens=1056,
        temperature=0.2,
    )
    answer = completion.choices[0].message.content
    return answer.strip()


def format_answer(raw: str, chunks: list[DocumentChunk]) -> str:
    """Return the model's answer unchanged.

    Placeholder hook for post-processing; *chunks* (the retrieved context)
    is accepted so source citations could be appended here later, but is
    currently unused.
    """
    return raw


def main() -> int:
    """Interactive RAG loop: scrape the configured site, then answer questions.

    Reads config.yaml next to this script, builds the chunk corpus and LLM
    client, then answers questions from stdin until EOF, Ctrl-C, or
    "exit"/"quit".

    Returns:
        Process exit code: 0 on normal exit, 1 on configuration errors.
    """
    config_path = Path(__file__).parent / "config.yaml"
    if not config_path.exists():
        print(f"Missing config file: {config_path}")
        return 1

    config = load_config(config_path)
    llm_api_key = config.get("sambanova_api_key")
    website = config.get("website")
    system_prompt = config.get("system_prompt", "You are a helpful assistant.")

    if not llm_api_key or not website:
        print("Please set sambanova_api_key and website in config.yaml")
        return 1

    embed_model = HuggingFaceEmbeddings(model_name=config.get("embedding_model"))
    chunks = build_rag_corpus(config, embed_model, website)
    client = create_llm_client(config)
    print("RAG corpus ready. Ask a question or type 'exit'.")

    while True:
        try:
            question = input("Question> ").strip()
        # BUG FIX: Ctrl-C previously escaped as an unhandled KeyboardInterrupt
        # traceback; treat it like EOF and leave the loop cleanly.
        except (EOFError, KeyboardInterrupt):
            break
        if not question:
            continue
        if question.lower() in {"exit", "quit"}:
            break

        selected = retrieve_relevant_chunks(chunks, question, embed_model)
        prompt = build_prompt(system_prompt, question, selected)
        raw_answer = ask_model(prompt, client)
        response = format_answer(raw_answer, selected)

        print(response)
        print()

    return 0


if __name__ == "__main__":
    # SystemExit carries main()'s return value out as the process exit code,
    # exactly like sys.exit(main()).
    raise SystemExit(main())