jaydeep123423 committed on
Commit
cdb89d7
·
verified ·
1 Parent(s): 4c9bcc3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -0
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ from sentence_transformers import SentenceTransformer
4
+ import chromadb
5
+ from datasets import load_dataset
6
+ import pandas as pd
7
+ import os
8
+
# Initialize model
print("Loading model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Initialize ChromaDB, persisted on disk under chroma_path
chroma_path = "./chroma_db"
os.makedirs(chroma_path, exist_ok=True)

client = chromadb.PersistentClient(path=chroma_path)

# Fetch the collection, creating it when it does not exist yet. This avoids a
# bare `except:` that would treat *any* failure (including Ctrl-C) as
# "collection missing" and then try to rebuild it.
collection = client.get_or_create_collection("quotes_collection")
existing_count = collection.count()

if existing_count > 0:
    print(f"Loaded existing collection with {existing_count} documents")
else:
    # Either a first run, or an earlier run stopped before any documents were
    # added; in both cases the collection has to be populated.
    print("Creating new collection...")

    # Load and prepare data. Only the first 500 rows are indexed; head() makes
    # that cap positional rather than dependent on the DataFrame index labels.
    dataset = load_dataset("Abirate/english_quotes", split="train")
    df = pd.DataFrame(dataset).head(500)

    texts = []
    metadata = []
    for quote, author, row_tags in zip(df['quote'], df['author'], df['tags']):
        # len() instead of truthiness so a list-like (e.g. an array) of tags
        # cannot raise on an ambiguous boolean test.
        has_tags = row_tags is not None and len(row_tags) > 0
        tags = ', '.join(row_tags) if has_tags else 'No tags'
        texts.append(f"{quote} - {author}")
        metadata.append({'quote': quote, 'author': author, 'tags': tags})

    # Generate embeddings
    print("Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)

    # Add data to the collection in fixed-size batches
    ids = [f"quote_{i}" for i in range(len(texts))]

    batch_size = 100
    for start in range(0, len(texts), batch_size):
        end_idx = start + batch_size  # slicing clamps past-the-end indices
        collection.add(
            documents=texts[start:end_idx],
            embeddings=embeddings[start:end_idx].tolist(),
            ids=ids[start:end_idx],
            metadatas=metadata[start:end_idx],
        )
    print(f"Collection created with {collection.count()} documents!")
60
+
def semantic_search(query, n_results=5):
    """Return the stored quotes closest in meaning to *query*, as Markdown.

    Args:
        query: Free-text search string; it is embedded with the module-level
            ``model`` before being sent to the module-level ``collection``.
        n_results: Maximum number of matches to request.

    Returns:
        One Markdown section per match (rank, similarity, quote, author and
        tags), joined by ``---`` separators. An empty string when the query
        yields no matches.
    """
    query_embedding = model.encode([query])
    results = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=n_results,
        include=['documents', 'metadatas', 'distances'],
    )

    # A single query was submitted, so its matches sit at index 0 of each list.
    metadatas = results['metadatas'][0]
    distances = results['distances'][0]

    output = []
    for rank, (meta, distance) in enumerate(zip(metadatas, distances), start=1):
        # Maps the distance onto a similarity score.
        # NOTE(review): this equals cosine similarity only if the collection
        # reports squared-L2 distance over unit-length embeddings - confirm.
        similarity = 1 - (distance / 2)

        quote = meta['quote']
        author = meta['author']
        tags = meta['tags']

        # Lines are assembled explicitly (instead of an indented triple-quoted
        # literal) so no stray leading whitespace reaches the Markdown renderer.
        result_text = "\n".join([
            "",
            f"**Result {rank}** (Similarity: {similarity:.2%})",
            f'📝 "{quote}"',
            f"✍️ — {author}",
            f"🏷️ Tags: {tags}",
            "",
        ])
        output.append(result_text)

    return "\n---\n".join(output)
84
+
def search_quotes(query, num_results):
    """Validate the UI inputs and run the semantic search.

    Args:
        query: Text typed into the search box; may be empty or ``None``.
        num_results: Requested number of results; converted with ``int()``
            before being passed on.

    Returns:
        A prompt asking for a query when *query* is missing or only
        whitespace, otherwise the Markdown produced by ``semantic_search``.
    """
    # Guard against None as well as blank input: calling .strip() on None
    # would raise AttributeError instead of showing the prompt.
    if not query or not query.strip():
        return "Please enter a search query!"
    return semantic_search(query, n_results=int(num_results))
89
+
# Gradio UI: a query box plus a result-count slider feeding search_quotes,
# with the Markdown it returns rendered as the output.
demo = gr.Interface(
    fn=search_quotes,
    inputs=[
        gr.Textbox(
            label="🔍 Search Query",
            placeholder="Enter your search...",
            lines=2,
        ),
        gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Number of Results"),
    ],
    outputs=gr.Markdown(label="Search Results"),
    title="📚 Semantic Quote Search Engine",
    description="Search through famous quotes using semantic similarity!",
    # Each example is [query, number of results], matching the inputs order.
    examples=[
        ["finding inner peace", 5],
        ["never giving up", 3],
        ["the meaning of life", 5],
    ],
    theme=gr.themes.Soft(),
)

demo.launch()