# final_try_SS / app.py — uploaded by jaydeep123423 (Hugging Face Spaces, commit 6b07c43)
"""
Semantic Quote Search Engine, 510, Jaideep, week 13
"""
import gradio as gr
from sentence_transformers import SentenceTransformer
import chromadb
from datasets import load_dataset
import pandas as pd
import os
# ---------------------------------------------------------------------------
# INITIALIZATION
# Runs once at import time: load the embedding model, open (or build) the
# persistent ChromaDB index of quotes.
# ---------------------------------------------------------------------------
print(" Starting Semantic Search Engine...")

# Load embedding model once; it is reused for both indexing and every query.
print(" Loading embedding model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded!")

# Persistent ChromaDB store so the index survives process restarts.
chroma_path = "./chroma_db"
os.makedirs(chroma_path, exist_ok=True)
client = chromadb.PersistentClient(path=chroma_path)

# Reuse an existing collection when present; otherwise build it from the
# dataset. Fix: catch Exception instead of a bare `except:` so that
# KeyboardInterrupt / SystemExit still propagate.
try:
    collection = client.get_collection("quotes_collection")
    print(f"Loaded existing collection with {collection.count()} documents")
except Exception:
    print("Creating new collection from dataset...")
    dataset = load_dataset("Abirate/english_quotes", split="train")
    df = pd.DataFrame(dataset)

    MAX_QUOTES = 500  # cap startup time and index size
    texts = []
    metadata = []
    for idx, row in df.iterrows():
        quote = row['quote']
        author = row['author']
        # Tags arrive as a list; flatten to a display string.
        tags = ', '.join(row['tags']) if row['tags'] else 'No tags'
        texts.append(f"{quote} - {author}")
        metadata.append({
            'quote': quote,
            'author': author,
            'tags': tags,
        })
        if idx >= MAX_QUOTES - 1:
            break

    print("🔢 Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)

    collection = client.create_collection(
        name="quotes_collection",
        metadata={"description": "Famous quotes collection"}
    )

    # Add documents in batches to stay within ChromaDB per-call limits.
    ids = [f"quote_{i}" for i in range(len(texts))]
    batch_size = 100
    for i in range(0, len(texts), batch_size):
        end_idx = min(i + batch_size, len(texts))
        collection.add(
            documents=texts[i:end_idx],
            embeddings=embeddings[i:end_idx].tolist(),
            ids=ids[i:end_idx],
            metadatas=metadata[i:end_idx]
        )
    print(f" Collection created with {collection.count()} documents!")
# SEARCH FUNCTION
def semantic_search(query, n_results=5):
    """
    Run a semantic search over the quotes collection.

    Args:
        query: Free-text search string.
        n_results: Number of nearest quotes to return.

    Returns:
        A Markdown string with one formatted section per hit, sections
        separated by horizontal rules.
    """
    # Embed the query with the same model used to index the quotes.
    query_vector = model.encode([query])

    hits = collection.query(
        query_embeddings=query_vector.tolist(),
        n_results=n_results,
        include=['documents', 'metadatas', 'distances'],
    )

    # ChromaDB returns one inner list per query; we issued exactly one.
    metadatas = hits['metadatas'][0]
    distances = hits['distances'][0]

    sections = []
    for rank, (meta, distance) in enumerate(zip(metadatas, distances), start=1):
        # Map distance to a rough similarity score.
        # NOTE(review): assumes distances lie in [0, 2] (e.g. unit-norm
        # embeddings) — TODO confirm against the collection's distance metric.
        similarity = 1 - (distance / 2)
        sections.append(f"""
### Result {rank} (Similarity: {similarity:.1%})
> "{meta['quote']}"
**— {meta['author']}**
🏷️ *Tags: {meta['tags']}*
""")
    return "\n---\n".join(sections)
def search_quotes(query, num_results):
    """Gradio callback: validate the query, then delegate to semantic_search."""
    cleaned = query.strip()
    if cleaned:
        # The slider may hand us a float; the backend expects an int count.
        return semantic_search(query, n_results=int(num_results))
    # Blank or whitespace-only queries cannot be searched meaningfully.
    return "⚠ Please enter a search query!"
# ---------------------------------------------------------------------------
# GRADIO INTERFACE
# Components are named first, then wired into a single Interface.
# ---------------------------------------------------------------------------
query_box = gr.Textbox(
    label="🔍 Search Query",
    placeholder="Try: 'love', 'success', 'wisdom', 'courage'...",
    lines=2,
)
count_slider = gr.Slider(
    minimum=1,
    maximum=10,
    value=5,
    step=1,
    label=" Number of Results",
)

demo = gr.Interface(
    fn=search_quotes,
    inputs=[query_box, count_slider],
    outputs=gr.Markdown(label="Search Results"),
    title=" Semantic Quote Search Engine",
    description="""
## Search through famous quotes using AI-powered semantic similarity!
Unlike traditional keyword search, this understands the **meaning** of your query.
**How it works:**
1. Your query is converted to a vector using a transformer model
2. We find quotes with the most similar meaning
3. Results are ranked by semantic similarity
*Built with SentenceTransformers, ChromaDB, and Gradio*
""",
    examples=[
        ["finding happiness in life", 5],
        ["overcoming fear and challenges", 5],
        ["the importance of friendship", 3],
        ["learning from mistakes", 5],
        ["believing in yourself", 3],
    ],
)

# Launch only when executed as a script (Spaces may import this module).
if __name__ == "__main__":
    demo.launch()