"""Retrieval-augmented Q&A over a single scraped web page.

Workflow: ``scrape_and_process_url`` loads a page, splits it into chunks,
embeds the chunks into a FAISS index and builds a retrieval chain;
``answer_question`` then runs questions through that chain. Both functions
return user-facing status/answer strings instead of raising.
"""
import logging
import os

import requests
from bs4 import BeautifulSoup
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

_logger = logging.getLogger(__name__)

# --- Global variables for RAG components ---
vector_store = None      # FAISS index built from the most recently processed URL
llm = None               # chat model, created by initialize_rag_components()
retrieval_chain = None   # retrieval + answer chain; None until a URL is processed

# Returned when the page yields nothing that can be indexed.
_LOAD_FAILED_MESSAGE = (
    "Failed to load content from the URL. "
    "Please check the URL or try another one."
)

# System prompt for the answer chain. The text is assembled from adjacent
# literals purely for readability; the resulting string is unchanged.
# ``{context}`` is filled in by the stuff-documents chain.
_SYSTEM_PROMPT = (
    "You are a knowledgeable and friendly assistant helping users "
    "understand documentation. Answer questions naturally and "
    "conversationally, as if you're explaining to a colleague. \n"
    "Your task: "
    "- Read the context carefully and provide clear, helpful answers "
    "based on what's there "
    "- Explain concepts in a simple, approachable way that anyone can "
    "understand "
    "- If you find the answer in the context, explain it thoroughly with "
    "examples when available "
    "- Be direct and confident in your responses - act like an expert who "
    "knows this documentation well "
    "- If the information isn't in the context, simply say "
    "\"I don't see that information in this documentation\" "
    "- Use a warm, professional tone - like a helpful coworker, not a robot "
    "Context from documentation: {context}"
)


def initialize_rag_components():
    """Create the chat model and store it in the module-level ``llm``."""
    global llm
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3)


def scrape_and_process_url(url: str) -> str:
    """Load ``url``, index its text and build the retrieval chain.

    On success the module-level ``vector_store`` and ``retrieval_chain`` are
    replaced together; on any failure both keep their previous values.

    Args:
        url: Address of the page to scrape. Surrounding whitespace is ignored.

    Returns:
        A human-readable status message describing success or failure. This
        function does not raise; errors are logged and reported in the
        returned string.
    """
    global vector_store, retrieval_chain

    if not url or not url.strip():
        return "⚠️ Please provide a URL to scrape."
    url = url.strip()

    # NOTE(review): the URL is fetched server-side as given. If it comes from
    # untrusted users, consider restricting schemes/hosts before loading.
    try:
        if llm is None:
            initialize_rag_components()

        # WebBaseLoader fetches the page and extracts its text.
        docs = WebBaseLoader(url).load()
        if not docs:
            return _LOAD_FAILED_MESSAGE

        # Split documents into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        chunks = text_splitter.split_documents(docs)
        if not chunks:
            # Documents without any text produce no chunks, and an empty
            # list cannot be turned into a FAISS index.
            return _LOAD_FAILED_MESSAGE

        # Create embeddings and vector store
        # Ensure OPENAI_API_KEY is set as an environment variable in Hugging Face Spaces
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        new_store = FAISS.from_documents(chunks, embeddings)

        # Create RAG chain with polished prompt
        prompt = ChatPromptTemplate.from_messages([
            ("system", _SYSTEM_PROMPT),
            ("user", "{input}"),
        ])
        document_chain = create_stuff_documents_chain(llm, prompt)
        new_chain = create_retrieval_chain(
            new_store.as_retriever(search_kwargs={"k": 4}),
            document_chain,
        )
    except Exception as e:
        # UI boundary: report the failure as text, but keep the traceback in
        # the logs so it can still be diagnosed.
        _logger.exception("Failed to scrape or process %s", url)
        return f"❌ An error occurred during scraping or processing: {e}"

    # Publish both globals only after every step succeeded, so the store and
    # the chain always refer to the same page.
    vector_store = new_store
    retrieval_chain = new_chain
    return (
        f"✅ Successfully scraped and processed content from {url}."
        f"\n\nDocument chunks created: {len(chunks)}"
        "\n\nYou can now ask questions about the documentation!"
    )


def answer_question(question: str) -> str:
    """Answer ``question`` using the chain built by ``scrape_and_process_url``.

    Args:
        question: The user's question about the processed page.

    Returns:
        The chain's answer, or a human-readable message when no page has been
        processed yet, the question is blank, or the chain call fails. This
        function does not raise.
    """
    chain = retrieval_chain
    if chain is None:
        return "⚠️ Please scrape and process a URL first before asking questions."
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    try:
        response = chain.invoke({"input": question})
        return response["answer"]
    except Exception as e:
        # UI boundary: surface the error text to the user and log the details.
        _logger.exception("Failed to answer question")
        return f"❌ An error occurred while answering the question: {e}"


# Initialize LLM when the module is imported
initialize_rag_components()