import os
import pickle
import gradio as gr
from transformers import pipeline
from langchain_classic.chains import RetrievalQAWithSourcesChain
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_google_genai import ChatGoogleGenerativeAI
# Read the API key from the environment.
# NOTE(review): the variable is named API_TOKEN but is passed to Google
# GenAI as the API key — confirm it holds a Google API key, not an HF token.
token = os.environ.get("API_TOKEN")

# ------------------------
# LLM
# ------------------------
# Bug fix: the keyword is `max_output_tokens`; the original `max_token`
# is not a parameter ChatGoogleGenerativeAI accepts.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.7,
    api_key=token,
    max_output_tokens=100,
)

# Global QA chain; populated by process_urls_with_logs().
chain = None

# ------------------------
# Paths used to persist the FAISS index and the URL set it was built from.
# ------------------------
FAISS_FILE = "vectorstore.pkl"
URLS_FILE = "urls.pkl"
# ------------------------
# Function to process URLs with logging and FAISS management
# ------------------------
def process_urls_with_logs(url1, url2, url3):
    """Build (or reuse) the FAISS index for the given URLs and init the QA chain.

    The index is rebuilt only when the set of non-empty URLs differs from the
    set saved alongside the existing index; otherwise the pickled index is
    loaded from disk. Returns a status string for the Gradio status box.
    """
    global chain

    # Gradio may hand back None for an untouched textbox, so guard before strip().
    urls = [u.strip() for u in (url1, url2, url3) if u and u.strip()]
    if not urls:
        return "Please provide at least one URL."

    # Load the URL set the existing index was built from, if any.
    if os.path.exists(FAISS_FILE) and os.path.exists(URLS_FILE):
        with open(URLS_FILE, "rb") as f:
            saved_urls = pickle.load(f)
    else:
        saved_urls = []

    # Rebuild only when the URL set changed (or no index exists yet).
    # (The original `del globals()['vectorstore']` was dead code: vectorstore
    # is function-local and never present in globals(), so it was removed.)
    if set(urls) != set(saved_urls):
        print("New URLs detected or FAISS does not exist. Recreating FAISS...")

        print("Loading URLs...")
        loader = UnstructuredURLLoader(urls=urls)
        documents = loader.load()

        print("Splitting documents into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=200)
        splits = text_splitter.split_documents(documents)

        print("Creating embeddings...")
        embeddings = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

        print("Creating vector database (FAISS)...")
        vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

        # Persist the index and its source URLs.
        # NOTE(security): pickle is only acceptable here because these files
        # are written and read solely by this app; never unpickle files from
        # an untrusted source.
        with open(FAISS_FILE, "wb") as f:
            pickle.dump(vectorstore, f)
        with open(URLS_FILE, "wb") as f:
            pickle.dump(urls, f)

        print("Initializing LLM chain...")
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
        return "FAISS successfully created/recreated!"
    else:
        print("No new URLs. Using existing FAISS.")
        # Reload the previously built index and rebuild the chain around it.
        with open(FAISS_FILE, "rb") as f:
            vectorstore = pickle.load(f)
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
        return "Existing FAISS loaded."
# ------------------------
# Function to answer questions
# ------------------------
def ask_question(question):
    """Answer *question* with the retrieval QA chain.

    Always returns an ``(answer, sources)`` tuple, because the Ask button is
    wired to TWO output components. The original early return handed Gradio a
    single bare string for two outputs, which fails at runtime — that is the
    bug fixed here.
    """
    global chain
    if chain is None:
        # Fill both output slots consistently.
        return "Please process URLs first.", ""
    result = chain.invoke({'question': question})
    answer = result.get("answer", "")
    sources = result.get("sources", "")
    return answer, sources
# ------------------------
# Gradio Interface
# ------------------------
with gr.Blocks() as app:
    with gr.Row():
        # Sidebar: URL input and processing
        with gr.Column(scale=1):
            gr.Markdown("## Insert URLs")
            url1 = gr.Textbox(label="URL 1")
            url2 = gr.Textbox(label="URL 2")
            url3 = gr.Textbox(label="URL 3")
            process_btn = gr.Button("Process URLs")
            status_output = gr.Textbox(label="Status", lines=8)

        # Main area: question input and answer output
        with gr.Column(scale=2):
            gr.Markdown("## Write your question")
            question_box = gr.Textbox(
                label="Your Question",
                placeholder="Type your question based on the URLs...",
                lines=4
            )
            ask_btn = gr.Button("Ask")
            answer_output = gr.Textbox(label="Answer", lines=8)
            sources_output = gr.Textbox(label="Sources", lines=4)

    # Connect the buttons to their handler functions.
    process_btn.click(
        process_urls_with_logs,
        inputs=[url1, url2, url3],
        outputs=status_output
    )
    # ask_question returns (answer, sources) — matches the two outputs below.
    ask_btn.click(
        ask_question,
        inputs=question_box,
        outputs=[answer_output, sources_output]
    )

# Launch the Gradio app. (A stray trailing "|" — residue from the page the
# file was scraped from — was removed here; it made this line a syntax error.)
app.launch()