Spaces:

SalmaHeshamSalem
/

MLvectorDB

Sleeping

App Files Files Community

MLvectorDB / app.py

SalmaHeshamSalem

Update app.py

823e02b verified over 1 year ago

raw

history blame contribute delete

3.48 kB

	import gradio as gr
	import os

	# 1. Import your ingestion or retrieval code
	import chromadb
	from sentence_transformers import SentenceTransformer

	# Module-level vector-store handle and collection shared by every query.
	# NOTE(review): `chromadb.Client()` is called with no arguments; per the
	# Chroma docs this is presumably an in-memory (ephemeral) client, in which
	# case the collection starts empty unless ingestion runs in this same
	# process -- confirm, or point a persistent client at the ingested data.
	client = chromadb.Client()
	collection = client.get_or_create_collection("ml_basics_collection")

	# Sentence-embedding model used to encode user queries. It is meant to be
	# the same model that produced the stored vectors (TODO confirm against the
	# ingestion script, which is not visible here).
	embed_model = SentenceTransformer('all-MiniLM-L6-v2')

	def query_db(user_query):
	    """Search the Chroma collection and render the best matches as HTML.

	    The query text is embedded with the module-level ``embed_model``, the
	    module-level ``collection`` is asked for the three nearest chunks, and
	    every hit is rendered as a heading plus an excerpt of at most 300
	    characters. The first time a given PDF path shows up, an ``<iframe>``
	    pointing at it is appended (or a red notice when the file does not
	    exist on disk). All text taken from the stored metadata/documents is
	    HTML-escaped before being placed in the markup.

	    Args:
	        user_query: Free-text question typed by the user.

	    Returns:
	        An HTML fragment as a string. A short notice is returned instead
	        of results when the query is blank or nothing matches.
	    """
	    import html  # local import keeps this block self-contained

	    # A blank query carries no information; skip the embedding and lookup.
	    if not user_query or not user_query.strip():
	        return "<h4>Please enter a query.</h4>"

	    # A) Embed the query
	    q_vec = embed_model.encode(user_query).tolist()

	    # B) Query top results
	    results = collection.query(query_embeddings=[q_vec], n_results=3)

	    # One inner list per query; we sent exactly one query. `or [[]]` also
	    # covers a key that is present but holds None or an empty list.
	    metadatas_list = (results.get("metadatas") or [[]])[0]
	    documents_list = (results.get("documents") or [[]])[0]

	    if not metadatas_list:
	        return "<h4>No results found!</h4>"

	    # C) Build the HTML: collect fragments and join once at the end.
	    parts = ["<h2>Search Results</h2>"]
	    displayed_pdfs = set()  # each PDF is embedded only once

	    for meta, chunk_text in zip(metadatas_list, documents_list):
	        # A hit may come back without metadata or without document text.
	        meta = meta or {}
	        chunk_text = chunk_text or ""

	        pdf_path = meta.get("file_path")
	        title = html.escape(str(meta.get("title", "No Title")))
	        chunk_index = html.escape(str(meta.get("chunk_index")))

	        # Only advertise truncation when the excerpt really was cut.
	        excerpt = html.escape(chunk_text[:300])
	        suffix = " ..." if len(chunk_text) > 300 else ""

	        parts.append(f"<h3>{title} - chunk {chunk_index}</h3>")
	        parts.append(f"<p><b>Excerpt:</b> {excerpt}{suffix}</p>")

	        if pdf_path and pdf_path not in displayed_pdfs:
	            displayed_pdfs.add(pdf_path)
	            safe_path = html.escape(str(pdf_path), quote=True)

	            if os.path.exists(pdf_path):
	                # NOTE(review): a relative "./<path>" URL only loads if the
	                # hosting server actually serves that file; confirm against
	                # the Gradio file-serving rules for the deployed version.
	                parts.append(
	                    f'<iframe src="./{safe_path}" width="600" height="400">'
	                    "</iframe>"
	                )
	            else:
	                parts.append(
	                    f"<p style='color:red'>PDF not found at {safe_path}</p>"
	                )

	        # Visual separator between hits
	        parts.append("<hr>")

	    return "".join(parts)


	########################
	# Gradio UI
	########################
	def build_app():
	    """Assemble the Gradio Blocks interface and return it.

	    The layout is a Markdown heading, a single-line query textbox and an
	    HTML panel. Submitting the textbox (pressing Enter) calls ``query_db``
	    with the typed text and shows the returned HTML in the panel.

	    Returns:
	        The constructed ``gr.Blocks`` object; it is not launched here.
	    """
	    with gr.Blocks() as ui:
	        gr.Markdown("## Ask Me About Machine Learning Basics")

	        query_box = gr.Textbox(
	            placeholder="e.g. What is supervised learning?",
	            label="Enter your query",
	        )
	        results_panel = gr.HTML(label="Results")

	        # Enter in the textbox -> run the search -> fill the results panel.
	        query_box.submit(query_db, query_box, results_panel)

	    return ui

	# Build the interface at import time so `demo` exists as a module-level name.
	demo = build_app()

	# Start the Gradio server only when this file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	demo.launch()