from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import (
    PyPDFLoader, TextLoader, Docx2txtLoader, UnstructuredHTMLLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
import os
import gradio as gr

# Load GOOGLE_API_KEY (and any other settings) from a local .env file.
load_dotenv()

# Gemini chat model used for answer generation and Google embedding model
# used to vectorize document chunks for FAISS retrieval.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-001", temperature=0.5)
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Directory where the FAISS index is persisted between the "process" and
# "ask" steps. NOTE(review): the name says "openai" but the store is built
# with Google embeddings — consider renaming (changes the on-disk path).
VECTOR_STORE_PATH = "faiss_store_openai"


def load_any_file(file_path):
    """Load *file_path* with the loader matching its extension.

    Supports .pdf, .txt, .docx, and .html/.htm. Returns the list of
    documents produced by the loader's ``load()``.

    Raises:
        ValueError: if the extension is not one of the supported types.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix in (".html", ".htm"):
        loader = UnstructuredHTMLLoader(file_path)
    elif suffix == ".docx":
        loader = Docx2txtLoader(file_path)
    elif suffix == ".txt":
        loader = TextLoader(file_path)
    elif suffix == ".pdf":
        loader = PyPDFLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {suffix}")
    return loader.load()



def process_inputs(url, file):
    """Ingest a URL and/or an uploaded file, chunk the text, and persist a FAISS index.

    Returns a 3-tuple matching the Gradio outputs wired to the Process
    button: (status message, jump-notice visibility update, query-box
    visibility update).
    """
    documents = []

    if url:
        # Fetch and parse the remote page content.
        documents += UnstructuredURLLoader(urls=[url]).load()

    if file:
        # Gradio supplies a temp-file wrapper; .name is its path on disk.
        documents += load_any_file(file.name)

    if not documents:
        # Nothing was provided — keep the question-tab widgets hidden.
        return (
            "Please provide a URL or a file to process.",
            gr.update(visible=False),
            gr.update(visible=False),
        )

    # Split into ~1000-character chunks, preferring paragraph/sentence breaks.
    chunks = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000,
    ).split_documents(documents)

    # Embed the chunks and persist the index for answer_question() to reload.
    FAISS.from_documents(chunks, embeddings).save_local(VECTOR_STORE_PATH)

    return (
        "✅ Documents processed successfully! Please switch to the 'Ask a Question' tab.",
        gr.update(visible=True),
        gr.update(visible=True),
    )


def answer_question(query):
    """Answer *query* from the persisted FAISS index.

    Returns an (answer, sources) pair of strings for the two output
    textboxes; returns a "no data" message when no index has been built yet.
    """
    index_file = f"{VECTOR_STORE_PATH}/index.faiss"
    if not os.path.exists(index_file):
        return "No Data found. Please upload a document or URL first.", ""

    # Reload the saved index; deserialization is opted into because the
    # store was written locally by process_inputs(), not untrusted input.
    store = FAISS.load_local(
        VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True
    )
    qa_chain = RetrievalQAWithSourcesChain.from_llm(
        llm=llm, retriever=store.as_retriever()
    )
    result = qa_chain({"question": query}, return_only_outputs=True)

    answer = result.get("answer", "No answer generated.")
    sources = result.get("sources", "No sources found.")
    return answer, sources


# --- Gradio UI: one tab for ingesting content, one for asking questions. ---
with gr.Blocks(title="InfoSEARCH") as demo:
    gr.Markdown("""
    # 🧾 InfoSEARCH
    Upload a document or provide a URL. Ask anything from the content.
    """)

    with gr.Tab("📄 Upload or Link"):
        with gr.Row():
            url_input = gr.Textbox(label="Upload URL", placeholder="Paste a news article URL")
            file_input = gr.File(label="Upload Document", file_types=[".pdf", ".txt", ".docx", ".html", ".htm"])
        process_btn = gr.Button("📥 Process Input")
        process_status = gr.Textbox(label="Status", interactive=False)
        # Hidden textbox that process_inputs() reveals after a successful ingest.
        jump_notice = gr.Textbox(visible=False, interactive=False)

    with gr.Tab("❓ Ask a Question"):
        query_input = gr.Textbox(label="Ask a question", placeholder="Type your question here and hit Enter")
        answer_output = gr.Textbox(label="🧾 Answer", lines=4)
        sources_output = gr.Textbox(label="🔗 Sources", lines=3)

    # Button click runs ingestion; process_inputs returns
    # (status message, jump_notice update, query_input update) in this order.
    process_btn.click(
        fn=process_inputs,
        inputs=[url_input, file_input],
        outputs=[process_status, jump_notice, query_input]
    )
    # Pressing Enter in the question box triggers retrieval + answering.
    query_input.submit(fn=answer_question, inputs=query_input, outputs=[answer_output, sources_output])


demo.launch()