Spaces:

Rahaf2001
/

RAG-Project

Sleeping

File size: 3,644 Bytes

f25282e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d979a23
f25282e
 
 
 
 
 
 
 
 
3d12ae8
f25282e
 
3d12ae8
f25282e
3d12ae8
 
 
 
f25282e
3d12ae8
f25282e
 
 
 
 
3d12ae8
f25282e
3d12ae8
 
 
 
 
 
 
 
 
 
 
 
f25282e
 
3d12ae8
f25282e
3d12ae8
 
 
 
 
 
 
f25282e
3d12ae8
f25282e
 
 
3d12ae8
f25282e
3d12ae8
f25282e
 
 
 
 
3d12ae8
f25282e
 
3d12ae8

import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import os

# --- Module-level RAG state shared by the functions in this file ---
# Chat model; assigned by initialize_rag_components().
llm = None
# Retrieval chain invoked by answer_question(); stays None until a URL has
# been processed by scrape_and_process_url().
retrieval_chain = None
# FAISS index built by scrape_and_process_url() from the scraped page.
vector_store = None

def initialize_rag_components(model: str = "gpt-3.5-turbo", temperature: float = 0.3) -> None:
    """Create the chat model and store it in the module-level ``llm``.

    Calling this with no arguments reproduces the original fixed
    configuration (``gpt-3.5-turbo`` at temperature ``0.3``).

    Args:
        model: Model name forwarded to ``ChatOpenAI``.
        temperature: Sampling temperature forwarded to ``ChatOpenAI``.
    """
    global llm
    llm = ChatOpenAI(model=model, temperature=temperature)

def scrape_and_process_url(url: str) -> str:
    """Scrape a web page and rebuild the module-level RAG pipeline from it.

    The page is loaded with ``WebBaseLoader``, split into overlapping chunks,
    embedded into a FAISS index, and wrapped in a retrieval chain that
    ``answer_question`` invokes. The globals ``vector_store`` and
    ``retrieval_chain`` are replaced together, and only after every step has
    succeeded, so a failed run leaves the previously built pipeline intact.

    Args:
        url: Address of the page to index. Surrounding whitespace is ignored.

    Returns:
        A human-readable status message: a success summary including the
        number of chunks, or a warning/error description. Any ``Exception``
        raised while loading, embedding, or building the chain is caught and
        reported in the returned string rather than propagated.
    """
    global vector_store, retrieval_chain

    # Reject missing or blank input up front instead of surfacing a low-level
    # loader exception to the user.
    if not isinstance(url, str) or not url.strip():
        return "⚠️ Please enter a URL to scrape and process."
    url = url.strip()

    try:
        # Scrape content using WebBaseLoader
        loader = WebBaseLoader(url)
        docs = loader.load()

        if not docs:
            return "Failed to load content from the URL. Please check the URL or try another one."

        # Split documents into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        chunks = text_splitter.split_documents(docs)

        # A page with no extractable text produces no chunks; building a
        # FAISS index from an empty list fails with an unhelpful error, so
        # report the situation explicitly instead.
        if not chunks:
            return "Failed to extract any text from the URL. Please check the URL or try another one."

        # Create embeddings and vector store
        # Ensure OPENAI_API_KEY is set as an environment variable in Hugging Face Spaces
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        new_vector_store = FAISS.from_documents(chunks, embeddings)

        # Create RAG chain with polished prompt
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a knowledgeable and friendly assistant helping users understand documentation. Answer questions naturally and conversationally, as if you're explaining to a colleague.

Your task:
- Read the context carefully and provide clear, helpful answers based on what's there
- Explain concepts in a simple, approachable way that anyone can understand
- If you find the answer in the context, explain it thoroughly with examples when available
- Be direct and confident in your responses - act like an expert who knows this documentation well
- If the information isn't in the context, simply say "I don't see that information in this documentation"
- Use a warm, professional tone - like a helpful coworker, not a robot

Context from documentation:
{context}"""),
            ("user", "{input}")
        ])

        document_chain = create_stuff_documents_chain(llm, prompt)
        new_retrieval_chain = create_retrieval_chain(
            new_vector_store.as_retriever(search_kwargs={"k": 4}),
            document_chain,
        )
    except Exception as e:
        return f"❌ An error occurred during scraping or processing: {e}"

    # Publish both globals only now that everything succeeded, so the index
    # and the chain always describe the same page.
    vector_store = new_vector_store
    retrieval_chain = new_retrieval_chain

    return f"✅ Successfully scraped and processed content from {url}.\n\nDocument chunks created: {len(chunks)}\n\nYou can now ask questions about the documentation!"

def answer_question(question: str) -> str:
    """Answer a question with the retrieval chain built from the processed URL.

    Args:
        question: The user's question. Surrounding whitespace is ignored.

    Returns:
        The ``"answer"`` entry of the chain's response on success. Otherwise
        a warning (blank question, or no URL processed yet) or an error
        description; any ``Exception`` raised while invoking the chain is
        caught and reported in the returned string rather than propagated.
    """
    # Validate the input first: there is nothing to retrieve for a blank
    # question, so skip the chain call entirely.
    if not isinstance(question, str) or not question.strip():
        return "⚠️ Please enter a question."

    # The chain only exists after scrape_and_process_url() has succeeded.
    if retrieval_chain is None:
        return "⚠️ Please scrape and process a URL first before asking questions."

    try:
        response = retrieval_chain.invoke({"input": question.strip()})
        return response["answer"]
    except Exception as e:
        return f"❌ An error occurred while answering the question: {e}"

# Build the shared chat model eagerly at import time so the module-level
# `llm` is populated before scrape_and_process_url() uses it to build a chain.
# NOTE(review): ChatOpenAI presumably needs OPENAI_API_KEY to be set when this
# runs, in which case importing the module without it would fail — confirm.
initialize_rag_components()