from langchain_community.document_loaders import (
    UnstructuredURLLoader,
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader,
    UnstructuredHTMLLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
import os
import gradio as gr

# Load GOOGLE_API_KEY (and any other settings) from a local .env file.
load_dotenv()

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-001", temperature=0.5)
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

VECTOR_STORE_PATH = "faiss_store_openai"


def load_any_file(file_path):
    """Pick a loader based on the file extension and return its documents."""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif ext == ".txt":
        loader = TextLoader(file_path)
    elif ext == ".docx":
        loader = Docx2txtLoader(file_path)
    elif ext in [".html", ".htm"]:
        loader = UnstructuredHTMLLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")
    return loader.load()


def process_inputs(url, file):
    """Load the URL and/or uploaded file, split it into chunks, and build the FAISS index."""
    data = []
    if url:
        loader = UnstructuredURLLoader(urls=[url])
        data.extend(loader.load())
    if file:
        upload_file_path = file.name
        data.extend(load_any_file(upload_file_path))
    if not data:
        return (
            "Please provide a URL or a file to process.",
            gr.update(visible=False),
            gr.update(visible=False),
        )

    # Split on paragraph, line, sentence, then clause boundaries.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", ","],
        chunk_size=1000,
    )
    docs = splitter.split_documents(data)

    # Embed the chunks and persist the FAISS index to disk.
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(VECTOR_STORE_PATH)

    return (
        "✅ Documents processed successfully! Please switch to the 'Ask a Question' tab.",
        gr.update(visible=True),
        gr.update(visible=True),
    )


def answer_question(query):
    """Answer a question against the saved FAISS index, returning the answer and its sources."""
    if not os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
        return "No data found. Please upload a document or URL first.", ""
    vectorstore = FAISS.load_local(
        VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True
    )
    chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
    result = chain({"question": query}, return_only_outputs=True)
    return (
        result.get("answer", "No answer generated."),
        result.get("sources", "No sources found."),
    )


with gr.Blocks(title="InfoSEARCH") as demo:
    gr.Markdown(
        """
        # 🧾 InfoSEARCH
        Upload a document or provide a URL, then ask anything about its content.
        """
    )

    with gr.Tab("📄 Upload or Link"):
        with gr.Row():
            url_input = gr.Textbox(label="URL", placeholder="Paste a web page or article URL")
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".txt", ".docx", ".html", ".htm"],
            )
        process_btn = gr.Button("📥 Process Input")
        process_status = gr.Textbox(label="Status", interactive=False)
        # Hidden helper textbox that becomes visible once processing succeeds.
        jump_notice = gr.Textbox(visible=False, interactive=False)

    with gr.Tab("❓ Ask a Question"):
        query_input = gr.Textbox(
            label="Ask a question", placeholder="Type your question here and hit Enter"
        )
        answer_output = gr.Textbox(label="🧾 Answer", lines=4)
        sources_output = gr.Textbox(label="🔗 Sources", lines=3)

    process_btn.click(
        fn=process_inputs,
        inputs=[url_input, file_input],
        outputs=[process_status, jump_notice, query_input],
    )
    query_input.submit(
        fn=answer_question, inputs=query_input, outputs=[answer_output, sources_output]
    )

if __name__ == "__main__":
    demo.launch()