# MLvectorDB / app.py
# SalmaHeshamSalem's picture
# Update app.py
# 823e02b verified
import html
import os

# 1. Import your ingestion or retrieval code
import chromadb
import gradio as gr
from sentence_transformers import SentenceTransformer
# Identifiers for the vector collection and the embedding model.
# Presumably these must match what the ingestion step used -- verify there.
COLLECTION_NAME = "ml_basics_collection"
EMBED_MODEL_NAME = 'all-MiniLM-L6-v2'

# The Chroma client is created inline here rather than via an init helper.
# NOTE(review): per the Chroma docs, chromadb.Client() with no settings looks
# like an in-memory client -- confirm the collection is actually populated in
# this process, otherwise every query will come back empty.
client = chromadb.Client()
collection = client.get_or_create_collection(COLLECTION_NAME)

# Query vectors are produced with the same model as the ingestion side so
# that they are comparable with the stored document vectors.
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
def query_db(user_query):
    """
    Search the Chroma collection for ``user_query`` and render the hits as HTML.

    1) Convert user query to embedding
    2) Query the Chroma collection
    3) Build HTML that shows chunk previews & embedded PDFs

    Args:
        user_query: Free-text query typed by the user.

    Returns:
        An HTML string. For a blank query this is a short prompt to enter a
        query; when the search yields nothing it is a "No results found!"
        message; otherwise it is a "Search Results" heading followed by one
        section per hit (title, chunk index, a 300-character excerpt and, the
        first time each PDF path is seen, either an <iframe> pointing at it or
        a red "PDF not found" notice).
    """
    # A blank query carries no signal; skip the embedding and the search.
    if not user_query or not user_query.strip():
        return "<h4>Please enter a query.</h4>"

    # A) Embed the query
    q_vec = embed_model.encode(user_query).tolist()

    # B) Query top results
    results = collection.query(
        query_embeddings=[q_vec],
        n_results=3,  # or however many you want
    )

    # results is a dict with "documents", "metadatas", "ids", ... where each
    # value holds one list per query. Guard against missing/None/empty values
    # so an unexpected result shape yields "no results" instead of a crash.
    metadatas_list = (results.get("metadatas") or [[]])[0] or []
    documents_list = (results.get("documents") or [[]])[0] or []

    if not metadatas_list:
        return "<h4>No results found!</h4>"

    # C) Build the HTML. Pieces are collected in a list and joined once.
    parts = ["<h2>Search Results</h2>"]

    # Each PDF is embedded only once, next to the first chunk that cites it.
    displayed_pdfs = set()

    for meta, chunk_text in zip(metadatas_list, documents_list):
        meta = meta or {}  # a chunk stored without metadata comes back as None
        pdf_path = meta.get("file_path")
        title = meta.get("title", "No Title")
        chunk_index = meta.get("chunk_index")

        # Everything below comes from ingested documents/metadata, so it is
        # escaped before being placed into markup.
        safe_title = html.escape(str(title))
        safe_index = html.escape(str(chunk_index))
        # Truncate first, then escape, so an entity is never cut in half.
        safe_excerpt = html.escape(str(chunk_text or "")[:300])

        parts.append(f"<h3>{safe_title} - chunk {safe_index}</h3>")
        parts.append(f"<p><b>Excerpt:</b> {safe_excerpt} ...</p>")

        if pdf_path and pdf_path not in displayed_pdfs:
            displayed_pdfs.add(pdf_path)
            safe_path = html.escape(str(pdf_path), quote=True)
            # The 'pdf_path' must be accessible in your HF Space,
            # for example: pdfs/1234.5678.pdf
            if os.path.exists(pdf_path):
                # Relative URL for the iframe, e.g. "./pdfs/1234.5678.pdf".
                # NOTE(review): whether this URL is actually served depends on
                # how Gradio exposes files for this Space -- confirm, or use a
                # direct URL such as
                # https://<space-id>.hf.space/file/{pdf_path}
                parts.append(
                    f'<iframe src="./{safe_path}" width="600" height="400">'
                    "</iframe>"
                )
            else:
                parts.append(
                    f"<p style='color:red'>PDF not found at {safe_path}</p>"
                )

        # Horizontal rule between results
        parts.append("<hr>")

    return "".join(parts)
########################
# Gradio UI
########################
def build_app():
    """
    Assemble the Gradio Blocks interface.

    The layout is a Markdown heading, a textbox for the query and an HTML
    component for the results; submitting the textbox calls `query_db` with
    the textbox value and writes its return value into the HTML component.

    Returns:
        The constructed gr.Blocks object (not yet launched).
    """
    with gr.Blocks() as app:
        gr.Markdown("## Ask Me About Machine Learning Basics")

        query_box = gr.Textbox(
            placeholder="e.g. What is supervised learning?",
            label="Enter your query",
        )
        results_pane = gr.HTML(label="Results")

        # Pressing Enter in the query box triggers the search.
        query_box.submit(
            outputs=results_pane,
            inputs=query_box,
            fn=query_db,
        )

    return app
# Built at import time so the Blocks object is available as a module-level
# name, not only when the file is run as a script.
demo = build_app()


def main():
    """Start the Gradio server for the module-level `demo` app."""
    demo.launch()


if __name__ == "__main__":
    main()