Rahaf2001 committed on
Commit
f25282e
·
verified ·
1 Parent(s): 5aa3790

Upload rag_core.py

Browse files
Files changed (1) hide show
  1. rag_core.py +70 -0
rag_core.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from langchain_community.document_loaders import WebBaseLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_community.embeddings import OpenAIEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain.chains import create_retrieval_chain
8
+ from langchain.chains.combine_documents import create_stuff_documents_chain
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain_core.prompts import ChatPromptTemplate
11
+ import os
12
+
13
# --- Global variables for RAG components ---
# All three start as None and are filled in at runtime:
#   vector_store    - FAISS index built by scrape_and_process_url()
#   llm             - chat model created by initialize_rag_components()
#   retrieval_chain - chain built by scrape_and_process_url() and read by
#                     answer_question()
vector_store = None
llm = None
retrieval_chain = None
17
+
18
def initialize_rag_components(model: str = "gemini-2.5-flash", temperature: float = 0.3) -> None:
    """Create the shared chat model and store it in the module-level ``llm``.

    Calling this with no arguments reproduces the original hard-coded
    configuration.

    Args:
        model: Model identifier forwarded to ``ChatOpenAI``.
        temperature: Sampling temperature forwarded to ``ChatOpenAI``.
    """
    global llm
    # NOTE(review): the default is a Gemini model name handed to the OpenAI
    # client; presumably the deployment points the client at an
    # OpenAI-compatible endpoint via environment configuration - confirm.
    llm = ChatOpenAI(model=model, temperature=temperature)
21
+
22
def scrape_and_process_url(url: str) -> str:
    """Load a web page, index its text, and build the question-answering chain.

    On success the module-level ``vector_store`` and ``retrieval_chain`` are
    replaced together; on any failure both keep their previous values, so a
    previously processed page stays usable.

    Args:
        url: Address of the page to load. Must start with ``http://`` or
            ``https://``; surrounding whitespace is ignored.

    Returns:
        A human-readable status message. Errors are reported through the
        returned string rather than raised.
    """
    global vector_store, retrieval_chain

    # Reject unusable input up front so the user gets a clear message instead
    # of a low-level error from the loader.
    cleaned_url = (url or "").strip()
    if not cleaned_url:
        return "Please provide a URL to scrape."
    if not cleaned_url.lower().startswith(("http://", "https://")):
        return "Invalid URL. Please enter a full URL starting with http:// or https://."

    # The chain cannot be built without a chat model.
    if llm is None:
        return "The language model is not initialized. Please restart the application."

    try:
        # Scrape content using WebBaseLoader for simplicity and robustness
        docs = WebBaseLoader(cleaned_url).load()
        if not docs:
            return "Failed to load content from the URL. Please check the URL or try another one."

        # Split documents into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = text_splitter.split_documents(docs)
        if not chunks:
            # Building a FAISS index from zero chunks fails with an opaque
            # error, so report the real cause instead.
            return "No readable text content was found at the URL. Please try another one."

        # Create embeddings and vector store
        # Ensure OPENAI_API_KEY is set as an environment variable in Hugging Face Spaces
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        new_store = FAISS.from_documents(chunks, embeddings)

        # Create RAG chain
        prompt = ChatPromptTemplate.from_messages([
            ("system", "Answer the user's questions based on the provided context only. "
                       "If you don't know the answer, just say that you don't know, don't make up an answer.\n\n{context}"),
            ("user", "{input}")
        ])
        document_chain = create_stuff_documents_chain(llm, prompt)
        new_chain = create_retrieval_chain(new_store.as_retriever(), document_chain)
    except Exception as e:
        # Boundary handler: callers expect a status string, not an exception.
        return f"An error occurred during scraping or processing: {str(e)}"

    # Publish both globals only after everything succeeded, so the store and
    # the chain never refer to different pages.
    vector_store = new_store
    retrieval_chain = new_chain
    return f"Successfully scraped and processed content from {cleaned_url}. You can now ask questions."
56
+
57
def answer_question(question: str) -> str:
    """Answer a question using the chain built by ``scrape_and_process_url``.

    Args:
        question: The user's question. Surrounding whitespace is ignored.

    Returns:
        The chain's ``"answer"`` value, or a human-readable message when the
        question is empty, no page has been processed yet, or the chain
        raises. Errors are reported through the returned string rather than
        raised.
    """
    # An empty query has nothing to retrieve against; stop before calling the chain.
    cleaned_question = (question or "").strip()
    if not cleaned_question:
        return "Please enter a question."

    if retrieval_chain is None:
        return "Please scrape and process a URL first before asking questions."

    try:
        response = retrieval_chain.invoke({"input": cleaned_question})
        return response["answer"]
    except Exception as e:
        # Boundary handler: callers expect a string, not an exception.
        return f"An error occurred while answering the question: {str(e)}"
67
+
68
# Initialize LLM when the module is imported
# NOTE(review): this is an import-time side effect; presumably the chat client
# needs its API credentials in the environment at this point - confirm for the
# deployment.
initialize_rag_components()
70
+