File size: 3,644 Bytes
f25282e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d979a23
f25282e
 
 
 
 
 
 
 
 
3d12ae8
f25282e
 
3d12ae8
f25282e
3d12ae8
 
 
 
f25282e
3d12ae8
f25282e
 
 
 
 
3d12ae8
f25282e
3d12ae8
 
 
 
 
 
 
 
 
 
 
 
f25282e
 
3d12ae8
f25282e
3d12ae8
 
 
 
 
 
 
f25282e
3d12ae8
f25282e
 
 
3d12ae8
f25282e
3d12ae8
f25282e
 
 
 
 
3d12ae8
f25282e
 
3d12ae8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import os

# --- Shared RAG state (module-level; every slot is None until populated) ---
# Chat model used to generate answers; assigned by initialize_rag_components().
llm = None
# FAISS index built from the most recently processed page; assigned by
# scrape_and_process_url().
vector_store = None
# Retriever + answer chain; assigned by scrape_and_process_url() and invoked
# by answer_question().
retrieval_chain = None

def initialize_rag_components(model: str = "gpt-3.5-turbo", temperature: float = 0.3) -> None:
    """Create the shared chat model and store it in the module-level ``llm``.

    Args:
        model: Name of the OpenAI chat model to use. Defaults to the value the
            module has always used, so calling with no arguments is unchanged.
        temperature: Sampling temperature passed to the chat model.

    Side effects:
        Rebinds the module-level ``llm``. Chains that were already built keep
        the model they were created with.
    """
    global llm
    llm = ChatOpenAI(model=model, temperature=temperature)

def _build_qa_prompt():
    """Return the chat prompt used to answer questions from retrieved context.

    The system message exposes a ``{context}`` placeholder and the user
    message an ``{input}`` placeholder; both are filled in by the chain the
    prompt is handed to.
    """
    return ChatPromptTemplate.from_messages([
        ("system", """You are a knowledgeable and friendly assistant helping users understand documentation. Answer questions naturally and conversationally, as if you're explaining to a colleague.

Your task:
- Read the context carefully and provide clear, helpful answers based on what's there
- Explain concepts in a simple, approachable way that anyone can understand
- If you find the answer in the context, explain it thoroughly with examples when available
- Be direct and confident in your responses - act like an expert who knows this documentation well
- If the information isn't in the context, simply say "I don't see that information in this documentation"
- Use a warm, professional tone - like a helpful coworker, not a robot

Context from documentation:
{context}"""),
        ("user", "{input}"),
    ])


def scrape_and_process_url(url: str) -> str:
    """Load a web page, index its text, and prepare the question-answering chain.

    The page is fetched with ``WebBaseLoader``, split into overlapping chunks,
    embedded into a FAISS index, and wrapped in a retrieval chain that uses
    the module-level ``llm``.

    Args:
        url: Address of the page to ingest. Surrounding whitespace is ignored;
            the address must start with ``http://`` or ``https://``.

    Returns:
        A human-readable status message. Problems (bad input, load failure,
        any exception raised while processing) are reported in the returned
        string rather than raised.

    Side effects:
        On success, rebinds the module-level ``vector_store`` and
        ``retrieval_chain``. On any failure both are left untouched, so a
        previously processed page stays usable.
    """
    global vector_store, retrieval_chain

    # Reject unusable input before doing any network or API work, so the user
    # gets a clear message instead of a raw exception string from the loader.
    cleaned_url = url.strip() if isinstance(url, str) else ""
    if not cleaned_url:
        return "⚠️ Please enter a URL to scrape."
    if not cleaned_url.lower().startswith(("http://", "https://")):
        return "⚠️ Invalid URL. Please enter a full URL starting with http:// or https://."

    try:
        # Scrape content using WebBaseLoader for simplicity and robustness
        docs = WebBaseLoader(cleaned_url).load()
        if not docs:
            return "Failed to load content from the URL. Please check the URL or try another one."

        # Split documents into smaller, overlapping chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        chunks = text_splitter.split_documents(docs)
        if not chunks:
            # NOTE(review): a page with no extractable text yields no chunks,
            # and building a FAISS index from an empty list is expected to
            # fail with an opaque error - stop here with a clear message.
            return "No readable text was found at the URL. Please check the URL or try another one."

        # Create embeddings and vector store
        # Ensure OPENAI_API_KEY is set as an environment variable in Hugging Face Spaces
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        new_store = FAISS.from_documents(chunks, embeddings)

        # Build everything into locals first; the globals are only updated
        # once the whole pipeline has succeeded.
        document_chain = create_stuff_documents_chain(llm, _build_qa_prompt())
        new_chain = create_retrieval_chain(
            new_store.as_retriever(search_kwargs={"k": 4}),
            document_chain,
        )
    except Exception as e:
        return f"❌ An error occurred during scraping or processing: {e}"

    # Publish the store and the chain together so they always describe the
    # same page.
    vector_store = new_store
    retrieval_chain = new_chain

    return (
        f"✅ Successfully scraped and processed content from {cleaned_url}."
        f"\n\nDocument chunks created: {len(chunks)}"
        "\n\nYou can now ask questions about the documentation!"
    )

def answer_question(question: str) -> str:
    """Answer a question using the chain built by ``scrape_and_process_url``.

    Args:
        question: The user's question. Blank or missing questions are rejected
            without calling the chain.

    Returns:
        The chain's ``"answer"`` text on success; otherwise a human-readable
        message explaining what is missing or what went wrong. Exceptions
        raised while invoking the chain are reported in the returned string
        rather than propagated.
    """
    # Validate the input first: an empty question cannot produce a useful
    # answer and would only cost a round-trip to the model.
    cleaned_question = question.strip() if isinstance(question, str) else ""
    if not cleaned_question:
        return "⚠️ Please enter a question."

    if retrieval_chain is None:
        return "⚠️ Please scrape and process a URL first before asking questions."

    try:
        response = retrieval_chain.invoke({"input": cleaned_question})
        return response["answer"]
    except Exception as e:
        return f"❌ An error occurred while answering the question: {e}"

# Initialize the LLM when the module is imported, so the module-level `llm`
# is already set by the time scrape_and_process_url() builds its chain.
# NOTE(review): this runs as an import side effect; presumably it requires
# OPENAI_API_KEY to be set in the environment, otherwise the import itself
# may fail - confirm.
initialize_rag_components()