Spaces:

Nigz
/

Semantic_Search

Sleeping

App Files Files Community

Semantic_Search / app.py

Nigz

Update app.py

0b16223 verified 7 months ago

raw

history blame contribute delete

3.75 kB

	from langchain_community.document_loaders import UnstructuredURLLoader
	from langchain_community.document_loaders import (
	PyPDFLoader, TextLoader, Docx2txtLoader, UnstructuredHTMLLoader
	)
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
	from langchain_google_genai import ChatGoogleGenerativeAI
	from langchain_google_genai import GoogleGenerativeAIEmbeddings
	from langchain_community.vectorstores import FAISS
	from dotenv import load_dotenv
	import os
	import gradio as gr

	# Runs before the model clients below are constructed.
	# NOTE(review): presumably this loads the Google API key from a .env file; confirm.
	load_dotenv()

	# Chat model handed to the question-answering chain.
	llm = ChatGoogleGenerativeAI(
	    model="gemini-2.0-flash-001",
	    temperature=0.5,
	)

	# Embedding model used both to build the FAISS index and to reload it.
	embeddings = GoogleGenerativeAIEmbeddings(
	    model="models/embedding-001",
	)

	# Directory the FAISS index is saved to and loaded from. The name mentions
	# OpenAI, but the vectors stored there come from the Google model above.
	VECTOR_STORE_PATH = "faiss_store_openai"


	def load_any_file(file_path):
	    """Load a local document with the loader matching its file extension.

	    The extension is compared case-insensitively. ``.pdf``, ``.txt``,
	    ``.docx``, ``.html`` and ``.htm`` are supported; any other extension
	    raises ``ValueError``. Returns the result of the chosen loader's
	    ``load()`` call.
	    """
	    suffix = os.path.splitext(file_path)[1].lower()

	    if suffix == ".pdf":
	        return PyPDFLoader(file_path).load()
	    if suffix == ".txt":
	        return TextLoader(file_path).load()
	    if suffix == ".docx":
	        return Docx2txtLoader(file_path).load()
	    if suffix in (".html", ".htm"):
	        return UnstructuredHTMLLoader(file_path).load()

	    raise ValueError(f"Unsupported file type: {suffix}")



	def process_inputs(url, file):
	    """Build and persist the FAISS index from a URL and/or an uploaded file.

	    Args:
	        url: Web page address typed by the user; blank or missing is ignored.
	        file: Upload handed over by the file component (an object exposing
	            ``.name`` or a plain path string), or ``None``.

	    Returns:
	        A 3-tuple ``(status_message, update, update)``. Both updates are
	        ``gr.update(visible=True)`` after a successful indexing run and
	        ``gr.update(visible=False)`` in every other case.
	    """
	    data = []

	    # A whitespace-only URL is treated the same as no URL at all.
	    url = (url or "").strip()
	    if url:
	        loader = UnstructuredURLLoader(urls=[url])
	        data.extend(loader.load())

	    if file:
	        # Accept both an upload object with ``.name`` and a bare path string.
	        upload_file_path = getattr(file, "name", file)
	        try:
	            data.extend(load_any_file(upload_file_path))
	        except ValueError as err:
	            # e.g. an unsupported extension: report it in the status box
	            # instead of letting the callback crash.
	            return str(err), gr.update(visible=False), gr.update(visible=False)

	    if not data:
	        return "Please provide a URL or a file to process.", gr.update(visible=False), gr.update(visible=False)

	    splitter = RecursiveCharacterTextSplitter(
	        separators=['\n\n', '\n', '.', ','],
	        chunk_size=1000,
	    )
	    docs = splitter.split_documents(data)

	    if not docs:
	        # Documents were loaded but contained no text to split (for example a
	        # PDF without a text layer); an index cannot be built from zero chunks.
	        return "No readable text was found in the provided input.", gr.update(visible=False), gr.update(visible=False)

	    vectorstore = FAISS.from_documents(docs, embeddings)
	    vectorstore.save_local(VECTOR_STORE_PATH)

	    return "✅ Documents processed successfully! Please switch to the 'Ask a Question' tab.", gr.update(visible=True), gr.update(visible=True)


	def answer_question(query):
	    """Answer *query* from the FAISS index stored under ``VECTOR_STORE_PATH``.

	    Args:
	        query: Question text entered by the user.

	    Returns:
	        A 2-tuple ``(answer, sources)`` of strings. When no index has been
	        saved yet, or the question is blank, the first element is a hint
	        for the user and the second is an empty string.
	    """
	    if not os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
	        return "No Data found. Please upload a document or URL first.", ""

	    # Skip the retrieval and LLM round trip when there is nothing to ask.
	    if not query or not query.strip():
	        return "Please enter a question.", ""

	    # NOTE(review): load_local unpickles the stored docstore. That is only
	    # safe while the index directory is written exclusively by this app;
	    # never point it at an index obtained from an untrusted source.
	    vectorstore = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
	    chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())

	    # ``invoke`` replaces the deprecated direct call on the chain object.
	    result = chain.invoke({"question": query}, return_only_outputs=True)

	    # ``or`` makes the fallback text apply to empty strings as well as to
	    # missing keys.
	    answer = result.get("answer") or "No answer generated."
	    sources = result.get("sources") or "No sources found."
	    return answer, sources


	# Gradio UI: one tab to ingest a URL or document, one tab to query the result.
	# NOTE(review): the nesting below is reconstructed (the original indentation
	# was lost); in particular, confirm which components belong inside the Row.
	with gr.Blocks(title="InfoSEARCH") as demo:
	    gr.Markdown("""
	# 🧾 InfoSEARCH
	Upload a document or provide a URL. Ask anything from the content.
	""")

	    with gr.Tab("📄 Upload or Link"):
	        with gr.Row():
	            url_input = gr.Textbox(label="Upload URL", placeholder="Paste a news article URL")
	            file_input = gr.File(label="Upload Document", file_types=[".pdf", ".txt", ".docx", ".html", ".htm"])
	        process_btn = gr.Button("📥 Process Input")
	        process_status = gr.Textbox(label="Status", interactive=False)
	        # Starts hidden; its visibility is driven by the second value returned
	        # by process_inputs.
	        jump_notice = gr.Textbox(visible=False, interactive=False)

	    with gr.Tab("❓ Ask a Question"):
	        query_input = gr.Textbox(label="Ask a question", placeholder="Type your question here and hit Enter")
	        answer_output = gr.Textbox(label="🧾 Answer", lines=4)
	        sources_output = gr.Textbox(label="🔗 Sources", lines=3)

	    # process_inputs returns (status text, visibility update, visibility update),
	    # mapped in order onto the three outputs listed here.
	    process_btn.click(
	        fn=process_inputs,
	        inputs=[url_input, file_input],
	        outputs=[process_status, jump_notice, query_input]
	    )
	    # Submitting the question box runs answer_question and fills both result boxes.
	    query_input.submit(fn=answer_question, inputs=query_input, outputs=[answer_output, sources_output])


	# Runs at module level, without a __main__ guard.
	demo.launch()