claytonsds's picture
Update app.py
c69af1e verified
import os
import pickle
import gradio as gr
from transformers import pipeline
from langchain_classic.chains import RetrievalQAWithSourcesChain
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_google_genai import ChatGoogleGenerativeAI
# Read the API key for the Google Generative AI (Gemini) client from the
# API_TOKEN environment variable; os.environ.get yields None when it is unset.
token = os.environ.get("API_TOKEN")
# ------------------------
# LLM
# ------------------------
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.7,
    api_key=token,
    # The response-length cap must be spelled `max_output_tokens`; a keyword
    # such as `max_token` is not a field of this class and limits nothing.
    # NOTE(review): with the cap actually enforced, 100 tokens may be too low
    # for this model/chain (answers plus sources) -- confirm and raise if
    # answers come back truncated or empty.
    max_output_tokens=100,
)
# Global variable to store the QA chain; it stays None until
# process_urls_with_logs assigns a chain to it.
chain = None
# ------------------------
# Paths to save FAISS and URLs
# ------------------------
# Both are relative paths written and read with pickle by process_urls_with_logs.
FAISS_FILE = "vectorstore.pkl"
URLS_FILE = "urls.pkl"
# ------------------------
# Function to process URLs with logging and FAISS management
# ------------------------
def _clean_urls(raw_urls):
    """Return the non-blank entries of *raw_urls*, stripped and de-duplicated in order."""
    stripped = (u.strip() for u in raw_urls if u)
    return list(dict.fromkeys(u for u in stripped if u))


def _load_saved_urls():
    """Return the URL list recorded for the cached index, or [] when the cache is incomplete."""
    if not (os.path.exists(FAISS_FILE) and os.path.exists(URLS_FILE)):
        return []
    # SECURITY: pickle.load executes arbitrary code from the file. These files
    # are produced by this app only; never point the paths at untrusted data.
    with open(URLS_FILE, "rb") as f:
        return pickle.load(f)


def process_urls_with_logs(url1, url2, url3):
    """Build (or reload) the FAISS index for the given URLs and set up the QA chain.

    The index is rebuilt whenever the set of submitted URLs differs from the
    set stored in URLS_FILE (or when either cache file is missing); otherwise
    the pickled index in FAISS_FILE is reused. On success the module-level
    ``chain`` is replaced with a RetrievalQAWithSourcesChain over the index.

    Args:
        url1, url2, url3: URL strings from the UI; blank or None entries are
            ignored and duplicates are collapsed.

    Returns:
        A human-readable status message.
    """
    global chain

    urls = _clean_urls([url1, url2, url3])
    if not urls:
        return "Please provide at least one URL."

    # If there are new URLs, recreate FAISS
    if set(urls) != set(_load_saved_urls()):
        print("New URLs detected or FAISS does not exist. Recreating FAISS...")

        print("Loading URLs...")
        documents = UnstructuredURLLoader(urls=urls).load()
        if not documents:
            # Building an index from zero documents fails inside FAISS, so
            # stop here and keep whatever chain was already active.
            return "No content could be loaded from the provided URLs."

        print("Splitting documents into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=200)
        splits = text_splitter.split_documents(documents)

        # Release the previous chain (and the vector store it references)
        # before allocating the new embeddings model and index, to free RAM.
        chain = None

        print("Creating embeddings...")
        embeddings = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

        print("Creating vector database (FAISS)...")
        vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

        # Save FAISS first and the URL list last, so the URL record is only
        # updated once the index dump has completed.
        with open(FAISS_FILE, "wb") as f:
            pickle.dump(vectorstore, f)
        with open(URLS_FILE, "wb") as f:
            pickle.dump(urls, f)

        print("Initializing LLM chain...")
        status = "FAISS successfully created/recreated!"
    else:
        print("No new URLs. Using existing FAISS.")
        # Load existing FAISS (see the pickle security note in _load_saved_urls).
        with open(FAISS_FILE, "rb") as f:
            vectorstore = pickle.load(f)
        status = "Existing FAISS loaded."

    chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
    return status
# ------------------------
# Function to answer questions
# ------------------------
def ask_question(question):
    """Answer *question* using the QA chain built from the processed URLs.

    The click handler for this function is wired to two output components
    (answer and sources), so every code path returns a 2-tuple; a guard
    message goes into the answer slot with an empty sources slot.

    Args:
        question: The user's question text; None or whitespace-only input is
            rejected without calling the chain.

    Returns:
        A ``(answer, sources)`` pair of strings.
    """
    if not question or not question.strip():
        return "Please enter a question.", ""
    if chain is None:
        return "Please process URLs first.", ""

    result = chain.invoke({"question": question})
    return result.get("answer", ""), result.get("sources", "")
# ------------------------
# Gradio Interface
# ------------------------
with gr.Blocks() as app:
    with gr.Row():
        # Narrow column: the three URL fields, the processing trigger and
        # the status read-out.
        with gr.Column(scale=1):
            gr.Markdown("## Insert URLs")
            url1 = gr.Textbox(label="URL 1")
            url2 = gr.Textbox(label="URL 2")
            url3 = gr.Textbox(label="URL 3")
            process_btn = gr.Button("Process URLs")
            status_output = gr.Textbox(label="Status", lines=8)

        # Wide column: the question field, the ask trigger, and the answer
        # and sources read-outs.
        with gr.Column(scale=2):
            gr.Markdown("## Write your question")
            question_box = gr.Textbox(
                label="Your Question",
                placeholder="Type your question based on the URLs...",
                lines=4,
            )
            ask_btn = gr.Button("Ask")
            answer_output = gr.Textbox(label="Answer", lines=8)
            sources_output = gr.Textbox(label="Sources", lines=4)

    # Connect each button to its handler function.
    process_btn.click(
        fn=process_urls_with_logs,
        inputs=[url1, url2, url3],
        outputs=status_output,
    )
    ask_btn.click(
        fn=ask_question,
        inputs=question_box,
        outputs=[answer_output, sources_output],
    )

# Start the Gradio server for the interface defined above.
app.launch()