jaydeep123423 committed on
Commit
1f2ac71
·
verified ·
1 Parent(s): d3bebf0

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -0
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Semantic Quote Search Engine
3
+ Deploy this to Hugging Face Spaces!
4
+ """
5
+
6
+ import gradio as gr
7
+ from sentence_transformers import SentenceTransformer
8
+ import chromadb
9
+ from datasets import load_dataset
10
+ import pandas as pd
11
+ import os
12
+
13
# ============== INITIALIZATION ==============
# Everything in this section runs at import time: it loads the embedding
# model, opens the on-disk ChromaDB store, and makes sure the quotes
# collection is populated before the UI is built.
print("🚀 Starting Semantic Search Engine...")

# Load embedding model
print("📦 Loading embedding model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("✅ Model loaded!")

# Initialize ChromaDB
chroma_path = "./chroma_db"
os.makedirs(chroma_path, exist_ok=True)
client = chromadb.PersistentClient(path=chroma_path)

# Fetch the collection, creating it if it is missing. This replaces a bare
# `except:` around get_collection(), which treated *any* failure (including
# KeyboardInterrupt or a genuine storage error) as "collection not found".
collection = client.get_or_create_collection(
    name="quotes_collection",
    metadata={"description": "Famous quotes collection"},
)

if collection.count() > 0:
    print(f"✅ Loaded existing collection with {collection.count()} documents")
else:
    # Either a brand-new collection or one left empty by an interrupted
    # earlier run; in both cases it has to be filled from the dataset.
    print("📊 Creating new collection from dataset...")

    # Load dataset
    max_quotes = 500  # Limit to 500 quotes
    dataset = load_dataset("Abirate/english_quotes", split="train")
    df = pd.DataFrame(dataset).head(max_quotes)

    texts = []
    metadata = []

    for row in df.itertuples(index=False):
        quote = row.quote
        author = row.author
        tags = ', '.join(row.tags) if row.tags else 'No tags'
        # The embedded document is the quote plus its author; the separate
        # fields are kept as metadata for display.
        texts.append(f"{quote} - {author}")
        metadata.append({
            'quote': quote,
            'author': author,
            'tags': tags,
        })

    # Generate embeddings
    print("🔢 Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)

    # Add documents in batches
    ids = [f"quote_{i}" for i in range(len(texts))]
    batch_size = 100

    for start in range(0, len(texts), batch_size):
        end_idx = min(start + batch_size, len(texts))
        collection.add(
            documents=texts[start:end_idx],
            embeddings=embeddings[start:end_idx].tolist(),
            ids=ids[start:end_idx],
            metadatas=metadata[start:end_idx],
        )

    print(f"✅ Collection created with {collection.count()} documents!")
78
+
79
+
80
+ # ============== SEARCH FUNCTION ==============
81
def semantic_search(query, n_results=5):
    """
    Perform semantic search over the quotes collection.

    Encodes ``query`` with the module-level ``model``, asks the module-level
    ``collection`` for the ``n_results`` nearest entries, and renders them
    as Markdown.

    Args:
        query: Free-text search string.
        n_results: Maximum number of quotes to return.

    Returns:
        A Markdown string with one section per hit, separated by horizontal
        rules, or a short notice when the collection returns no hits.
    """
    # Encode query
    query_embedding = model.encode([query])

    # Query ChromaDB
    results = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=n_results,
        include=['documents', 'metadatas', 'distances'],
    )

    # A single query was sent, so only the first result list is relevant.
    metadatas = results['metadatas'][0]
    distances = results['distances'][0]

    if not metadatas:
        # Previously an empty hit list produced a blank output panel.
        return "No matching quotes found."

    # Format results
    output = []
    for rank, (meta, distance) in enumerate(zip(metadatas, distances), start=1):
        # Convert distance to similarity.
        # NOTE(review): this presumes the collection uses Chroma's default
        # squared-L2 space and unit-length embeddings, where
        # d = 2 - 2*cos and hence 1 - d/2 is the cosine similarity -- confirm.
        similarity = 1 - (distance / 2)

        quote = meta['quote']
        author = meta['author']
        tags = meta['tags']
        output.append(
            f"\n### Result {rank} (Similarity: {similarity:.1%})\n"
            f"\n"
            f"> \"{quote}\"\n"
            f"\n"
            f"**— {author}**\n"
            f"\n"
            f"🏷️ *Tags: {tags}*\n"
        )

    return "\n---\n".join(output)
114
+
115
+
116
def search_quotes(query, num_results):
    """Wrapper for Gradio interface.

    Args:
        query: Text from the search box; may be empty or None.
        num_results: Requested number of results (the slider delivers a
            number that is converted to int here).

    Returns:
        A Markdown string: either a warning when no query was given, or the
        output of ``semantic_search``.
    """
    # Guard against None as well as blank input: calling .strip() on None
    # would raise AttributeError.
    if not query or not query.strip():
        return "⚠️ Please enter a search query!"
    # The slider enforces a minimum of 1, but direct API callers can bypass
    # it, so keep the count at least 1.
    return semantic_search(query, n_results=max(1, int(num_results)))
121
+
122
+
123
+ # ============== GRADIO INTERFACE ==============
124
# Input widgets: a two-line query box and a slider for the result count.
query_box = gr.Textbox(
    label="🔍 Search Query",
    placeholder="Try: 'love', 'success', 'wisdom', 'courage'...",
    lines=2,
)
result_count_slider = gr.Slider(
    minimum=1,
    maximum=10,
    value=5,
    step=1,
    label="📊 Number of Results",
)

# Markdown blurb shown under the title.
app_description = """
## Search through famous quotes using AI-powered semantic similarity!

Unlike traditional keyword search, this understands the **meaning** of your query.

**How it works:**
1. Your query is converted to a vector using a transformer model
2. We find quotes with the most similar meaning
3. Results are ranked by semantic similarity

*Built with SentenceTransformers, ChromaDB, and Gradio*
"""

# Each example is a [query, number-of-results] pair matching the two inputs.
example_queries = [
    ["finding happiness in life", 5],
    ["overcoming fear and challenges", 5],
    ["the importance of friendship", 3],
    ["learning from mistakes", 5],
    ["believing in yourself", 3],
]

# Wire search_quotes to the widgets above; results are rendered as Markdown.
demo = gr.Interface(
    fn=search_quotes,
    inputs=[query_box, result_count_slider],
    outputs=gr.Markdown(label="📚 Search Results"),
    title="📚 Semantic Quote Search Engine",
    description=app_description,
    examples=example_queries,
)

# Launch
if __name__ == "__main__":
    demo.launch()