Spaces:
Sleeping
Sleeping
File size: 3,754 Bytes
b08b892 24a6253 e7e52d0 24a6253 e7e52d0 24a6253 5184502 e7e52d0 24a6253 5be1825 e7e52d0 5be1825 0b16223 e7e52d0 5be1825 e7e52d0 0b16223 e7e52d0 5be1825 e7e52d0 5be1825 e7e52d0 5be1825 e7e52d0 0b16223 e7e52d0 5be1825 3f252a6 e7e52d0 5be1825 e7e52d0 5be1825 e7e52d0 f43b632 5be1825 f43b632 5be1825 0b16223 5be1825 0b16223 5be1825 e7e52d0 0b16223 e7e52d0 5be1825 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import (
PyPDFLoader, TextLoader, Docx2txtLoader, UnstructuredHTMLLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
import os
import gradio as gr
# Load environment variables from .env (expects GOOGLE_API_KEY for the Gemini APIs).
load_dotenv()
# Gemini chat model used to generate answers; moderate temperature for balanced output.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-001", temperature=0.5)
# Google embedding model used to vectorize document chunks for FAISS retrieval.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# On-disk directory where the FAISS index is persisted between the two tabs.
# NOTE(review): the name says "openai" but the embeddings are Google's — misleading,
# though harmless since every reference goes through this constant.
VECTOR_STORE_PATH = "faiss_store_openai"
def load_any_file(file_path):
    """Load a local document using the loader that matches its extension.

    Supported extensions: .pdf, .txt, .docx, .html, .htm (case-insensitive).

    Args:
        file_path: Path to the document on disk.

    Returns:
        The list of Documents produced by the matching LangChain loader.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == ".pdf":
        return PyPDFLoader(file_path).load()
    if suffix == ".txt":
        return TextLoader(file_path).load()
    if suffix == ".docx":
        return Docx2txtLoader(file_path).load()
    if suffix in (".html", ".htm"):
        return UnstructuredHTMLLoader(file_path).load()
    raise ValueError(f"Unsupported file type: {suffix}")
def process_inputs(url, file):
    """Build (or rebuild) the persisted FAISS index from a URL and/or a file.

    Args:
        url: Optional web-page URL, scraped with UnstructuredURLLoader.
        file: Optional Gradio file object; its ``.name`` is the temp-file path.

    Returns:
        A 3-tuple of (status message, gr.update for the hidden notice textbox,
        gr.update for the question textbox). Visibility is enabled only when
        documents were successfully indexed.
    """
    data = []
    if url:
        data.extend(UnstructuredURLLoader(urls=[url]).load())
    if file:
        # Gradio's File component exposes the uploaded temp file's path via .name.
        try:
            data.extend(load_any_file(file.name))
        except ValueError as err:
            # Unsupported extension: surface the message in the status box
            # instead of letting the handler crash with a traceback.
            return f"❌ {err}", gr.update(visible=False), gr.update(visible=False)
    if not data:
        return "Please provide a URL or a file to process.", gr.update(visible=False), gr.update(visible=False)
    # Split on progressively finer separators so chunks stay near 1000 chars.
    splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    docs = splitter.split_documents(data)
    # Embed the chunks and persist the index so the QA tab can reload it later.
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(VECTOR_STORE_PATH)
    return "✅ Documents processed successfully! Please switch to the 'Ask a Question' tab.", gr.update(visible=True), gr.update(visible=True)
def answer_question(query):
    """Answer *query* from the persisted FAISS index.

    Returns:
        A (answer, sources) pair of strings; a "no data" message with empty
        sources when no index has been built yet.
    """
    index_file = os.path.join(VECTOR_STORE_PATH, "index.faiss")
    if not os.path.exists(index_file):
        return "No Data found. Please upload a document or URL first.", ""
    # Reload the saved index; the deserialization flag is required by FAISS.load_local
    # for pickle-backed stores (safe here — we wrote the file ourselves).
    store = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
    qa_chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=store.as_retriever())
    outputs = qa_chain({"question": query}, return_only_outputs=True)
    return outputs.get("answer", "No answer generated."), outputs.get("sources", "No sources found.")
# Two-tab UI: ingest a document/URL, then ask questions against the built index.
with gr.Blocks(title="InfoSEARCH") as demo:
    gr.Markdown("""
# 🧾 InfoSEARCH
Upload a document or provide a URL. Ask anything from the content.
""")
    # Tab 1: ingestion — collect a URL and/or a file, then build the FAISS index.
    with gr.Tab("📄 Upload or Link"):
        with gr.Row():
            url_input = gr.Textbox(label="Upload URL", placeholder="Paste a news article URL")
            file_input = gr.File(label="Upload Document", file_types=[".pdf", ".txt", ".docx", ".html", ".htm"])
        process_btn = gr.Button("📥 Process Input")
        process_status = gr.Textbox(label="Status", interactive=False)
        # Hidden textbox whose visibility process_inputs toggles on success.
        jump_notice = gr.Textbox(visible=False, interactive=False)
    # Tab 2: question answering against the persisted index.
    with gr.Tab("❓ Ask a Question"):
        query_input = gr.Textbox(label="Ask a question", placeholder="Type your question here and hit Enter")
        answer_output = gr.Textbox(label="🧾 Answer", lines=4)
        sources_output = gr.Textbox(label="🔗 Sources", lines=3)
    # Wire events inside the Blocks context so Gradio registers the listeners.
    process_btn.click(
        fn=process_inputs,
        inputs=[url_input, file_input],
        outputs=[process_status, jump_notice, query_input]
    )
    query_input.submit(fn=answer_question, inputs=query_input, outputs=[answer_output, sources_output])

# Guard the launch so importing this module (e.g. from tests or tooling)
# does not start the web server as a side effect.
if __name__ == "__main__":
    demo.launch()
|