Spaces:
Sleeping
Sleeping
| """ | |
| Semantic Quote Search Engine, 510, Jaideep, week 13 | |
| """ | |
| import gradio as gr | |
| from sentence_transformers import SentenceTransformer | |
| import chromadb | |
| from datasets import load_dataset | |
| import pandas as pd | |
| import os | |
# INITIALIZATION
print(" Starting Semantic Search Engine...")

# Load embedding model
print(" Loading embedding model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded!")

# Initialize ChromaDB
chroma_path = "./chroma_db"
os.makedirs(chroma_path, exist_ok=True)
client = chromadb.PersistentClient(path=chroma_path)

MAX_QUOTES = 500  # Only the first MAX_QUOTES rows of the dataset are indexed
BATCH_SIZE = 100  # Number of documents sent to ChromaDB per add() call


def _load_quotes(limit):
    """Load the quotes dataset and return ``(texts, metadata)`` for its first rows.

    Args:
        limit: Maximum number of dataset rows to use.

    Returns:
        A pair of parallel lists: the "quote - author" strings that get
        embedded, and one metadata dict per quote with the keys ``quote``,
        ``author`` and ``tags``.
    """
    dataset = load_dataset("Abirate/english_quotes", split="train")
    df = pd.DataFrame(dataset).head(limit)
    texts = []
    metadata = []
    for _, row in df.iterrows():
        quote = row['quote']
        author = row['author']
        # Tags are flattened into one string for storage as a metadata value
        tags = ', '.join(row['tags']) if row['tags'] else 'No tags'
        texts.append(f"{quote} - {author}")
        metadata.append({
            'quote': quote,
            'author': author,
            'tags': tags,
        })
    return texts, metadata


def _populate_collection(target):
    """Embed the quotes and add them to ``target`` in batches of BATCH_SIZE."""
    texts, metadata = _load_quotes(MAX_QUOTES)
    # Generate embeddings
    print("🔢 Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)
    # Add documents in batches
    ids = [f"quote_{i}" for i in range(len(texts))]
    for start in range(0, len(texts), BATCH_SIZE):
        stop = min(start + BATCH_SIZE, len(texts))
        target.add(
            documents=texts[start:stop],
            embeddings=embeddings[start:stop].tolist(),
            ids=ids[start:stop],
            metadatas=metadata[start:stop],
        )


# Fetch the collection, creating it when it does not exist yet. This avoids
# catching every exception just to detect a missing collection, so unrelated
# failures (for example an unreadable database directory) are no longer hidden.
collection = client.get_or_create_collection(
    name="quotes_collection",
    metadata={"description": "Famous quotes collection"},
)
if collection.count() > 0:
    print(f"Loaded existing collection with {collection.count()} documents")
else:
    # An empty collection is either brand new or left over from a run that
    # stopped before any documents were added; populate it in both cases.
    print("Creating new collection from dataset...")
    _populate_collection(collection)
    print(f" Collection created with {collection.count()} documents!")
# SEARCH FUNCTION
def semantic_search(query, n_results=5):
    """
    Perform semantic search over the quotes collection.

    Args:
        query: Search text; it is embedded with the module-level ``model``.
        n_results: Number of nearest quotes requested from the collection.

    Returns:
        A Markdown string with one section per hit (rank, similarity, quote,
        author and tags), sections separated by horizontal rules. The string
        is empty when the query yields no hits.
    """
    # Encode query
    query_embedding = model.encode([query])
    # Query ChromaDB
    results = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=n_results,
        include=['documents', 'metadatas', 'distances'],
    )
    # A single query was sent, so index 0 holds its hits
    metadatas = results['metadatas'][0]
    distances = results['distances'][0]

    # Format results
    sections = []
    for rank, (meta, distance) in enumerate(zip(metadatas, distances), start=1):
        # Convert distance to similarity.
        # NOTE(review): 1 - d/2 equals cosine similarity only if the collection
        # reports squared-L2 distances over unit-length embeddings — confirm.
        similarity = 1 - (distance / 2)
        quote = meta['quote']
        author = meta['author']
        tags = meta['tags']
        parts = [
            f"### Result {rank} (Similarity: {similarity:.1%})",
            f'> "{quote}"',
            f"**— {author}**",
            f"🏷️ *Tags: {tags}*",
        ]
        # Blank lines between the parts keep the attribution and tags out of
        # the blockquote (Markdown would otherwise continue the quote lazily).
        # The leading and trailing newline leave an empty line around the
        # "---" separator so it renders as a rule, not as a heading underline.
        sections.append("\n" + "\n\n".join(parts) + "\n")
    return "\n---\n".join(sections)
def search_quotes(query, num_results):
    """Wrapper for Gradio interface.

    Args:
        query: Text from the search box.
        num_results: Requested number of results; converted to ``int``
            before it is passed on.

    Returns:
        A warning message when the query is missing or blank, otherwise the
        Markdown produced by ``semantic_search``.
    """
    # Check for None as well as blank text so a missing value yields the
    # warning instead of raising AttributeError on .strip()
    if not query or not query.strip():
        return "⚠ Please enter a search query!"
    return semantic_search(query, n_results=int(num_results))
# GRADIO INTERFACE
# Description shown under the title. The empty first and last entries
# reproduce the newline that opens and closes the text.
_description_lines = [
    "",
    "## Search through famous quotes using AI-powered semantic similarity!",
    "Unlike traditional keyword search, this understands the **meaning** of your query.",
    "**How it works:**",
    "1. Your query is converted to a vector using a transformer model",
    "2. We find quotes with the most similar meaning",
    "3. Results are ranked by semantic similarity",
    "*Built with SentenceTransformers, ChromaDB, and Gradio*",
    "",
]

# Input widgets: free-text query plus a slider for the result count
query_box = gr.Textbox(
    label="🔍 Search Query",
    placeholder="Try: 'love', 'success', 'wisdom', 'courage'...",
    lines=2,
)
result_count = gr.Slider(
    minimum=1,
    maximum=10,
    value=5,
    step=1,
    label=" Number of Results",
)

# Each example row is [query, number of results]
sample_queries = [
    ["finding happiness in life", 5],
    ["overcoming fear and challenges", 5],
    ["the importance of friendship", 3],
    ["learning from mistakes", 5],
    ["believing in yourself", 3],
]

demo = gr.Interface(
    fn=search_quotes,
    inputs=[query_box, result_count],
    outputs=gr.Markdown(label="Search Results"),
    title=" Semantic Quote Search Engine",
    description="\n".join(_description_lines),
    examples=sample_queries,
)

# Launch
if __name__ == "__main__":
    demo.launch()