File size: 2,810 Bytes
f25282e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import os

# --- Shared RAG state (module-level, rebound by the functions below) ---
# vector_store:    FAISS index built by scrape_and_process_url()
# llm:             chat model created by initialize_rag_components()
# retrieval_chain: question-answering chain built by scrape_and_process_url()
vector_store = llm = retrieval_chain = None

def initialize_rag_components():
    """Create the chat model and store it in the module-level ``llm``.

    Calling this again replaces ``llm`` with a freshly constructed client.

    NOTE(review): the model name is a Gemini identifier handed to the
    OpenAI chat client; presumably the deployment points the client at an
    OpenAI-compatible endpoint via environment configuration -- confirm.
    """
    global llm
    chat_model = ChatOpenAI(
        model="gemini-2.5-flash",
        temperature=0.3,
    )
    llm = chat_model

def scrape_and_process_url(url: str) -> str:
    """Load a web page, index it, and install a retrieval chain over it.

    The page is fetched with ``WebBaseLoader``, split into overlapping
    chunks, embedded into a FAISS index, and wrapped in a retrieval chain
    that answers with the module-level ``llm``. On success the module-level
    ``vector_store`` and ``retrieval_chain`` are replaced together. On any
    failure both are left exactly as they were, so a previously processed
    page stays usable and the two globals never describe different pages.

    Args:
        url: Address of the page to process. Surrounding whitespace is
            ignored.

    Returns:
        A human-readable status message. Failures are reported through the
        returned string rather than raised.
    """
    global vector_store, retrieval_chain

    url = url.strip() if url else ""
    if not url:
        return "Please provide a URL to scrape."

    try:
        # NOTE(security): the URL is fetched server-side exactly as given.
        # If this is exposed to untrusted users, consider restricting the
        # allowed schemes/hosts before loading.
        docs = WebBaseLoader(url).load()
        if not docs:
            return "Failed to load content from the URL. Please check the URL or try another one."

        # Split documents into smaller, overlapping chunks for retrieval.
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_documents(docs)
        if not chunks:
            # A page with no extractable text yields zero chunks; building an
            # index from nothing fails inside the vector store with an
            # unhelpful error, so report the real cause instead.
            return "No readable text was found at the URL. Please try another one."

        # Ensure OPENAI_API_KEY is set as an environment variable in
        # Hugging Face Spaces.
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        new_store = FAISS.from_documents(chunks, embeddings)

        prompt = ChatPromptTemplate.from_messages([
            ("system", "Answer the user's questions based on the provided context only. "
             "If you don't know the answer, just say that you don't know, don't make up an answer.\n\n{context}"),
            ("user", "{input}")
        ])
        document_chain = create_stuff_documents_chain(llm, prompt)
        new_chain = create_retrieval_chain(new_store.as_retriever(), document_chain)
    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        return f"An error occurred during scraping or processing: {str(e)}"

    # Commit both globals only after every step succeeded, so the store and
    # the chain always refer to the same page.
    vector_store, retrieval_chain = new_store, new_chain
    return f"Successfully scraped and processed content from {url}. You can now ask questions."

def answer_question(question: str) -> str:
    """Answer ``question`` using the currently installed retrieval chain.

    Args:
        question: The user's question, passed to the chain under the
            ``"input"`` key.

    Returns:
        The chain's ``"answer"`` value, a prompt to process a URL first when
        no chain has been built yet, or an error message if the chain call
        fails. Failures are reported through the returned string rather
        than raised.
    """
    chain = retrieval_chain
    if chain is None:
        return "Please scrape and process a URL first before asking questions."

    try:
        return chain.invoke({"input": question})["answer"]
    except Exception as error:
        return f"An error occurred while answering the question: {str(error)}"

# Build the shared chat model once at import time so that `llm` is already
# set when scrape_and_process_url() constructs its document chain.
# Note: this means importing the module constructs the ChatOpenAI client.
initialize_rag_components()