Spaces:
Sleeping
Sleeping
| # app.py — robust version (works with quotes_1000.csv, quotes.csv, or a public fallback) | |
| import os | |
| import pandas as pd | |
| import gradio as gr | |
| from sentence_transformers import SentenceTransformer, util | |
| from datasets import load_dataset | |
| import torch | |
| # ---- 1) Load quotes from available source ---- | |
def load_quotes():
    """Load the quote corpus and return it as a list of unique, stripped strings.

    Sources are tried in priority order:

    1. ``quotes_1000.csv`` in the current working directory.
    2. ``quotes.csv`` in the current working directory.
    3. The first 1000 rows of the public ``Abirate/english_quotes`` dataset.

    Column labels are matched case-insensitively and ignoring surrounding
    whitespace. A ``quote`` column is preferred, then ``text``; a frame with
    exactly one column uses that column whatever it is called.

    Returns:
        list[str]: Quotes with surrounding whitespace removed, blank entries
        dropped, and duplicates removed while keeping first-seen order.

    Raises:
        ValueError: If no usable column can be identified, or if fewer than
            10 unique quotes remain after cleaning.
    """
    if os.path.exists("quotes_1000.csv"):
        # Priority 1: the newer 1000-quote file.
        df = pd.read_csv("quotes_1000.csv")
    elif os.path.exists("quotes.csv"):
        # Priority 2: the older file, if present.
        df = pd.read_csv("quotes.csv")
    else:
        # Priority 3 (fallback): take 1000 quotes from the public dataset.
        ds = load_dataset("Abirate/english_quotes", split="train")
        df = pd.DataFrame({"quote": [row["quote"] for row in ds.select(range(1000))]})

    # Normalize labels so "Quote", " quote " and "QUOTE" are all recognized.
    df.columns = [str(c).strip().lower() for c in df.columns]

    if "quote" in df.columns:
        series = df["quote"]
    elif "text" in df.columns:
        series = df["text"]
    elif len(df.columns) == 1:
        # A single unnamed/arbitrarily named column is unambiguous: use it.
        series = df.iloc[:, 0]
    else:
        raise ValueError("CSV must have a 'quote' or 'text' column, or a single column.")

    # Strip BEFORE deduplicating: comparing the raw value against a set of
    # stripped values would let padded duplicates ("  x  " twice) slip through.
    stripped = (q.strip() for q in series.dropna().astype(str))
    # dict.fromkeys keeps first-seen order while discarding repeats.
    unique_quotes = list(dict.fromkeys(q for q in stripped if q))

    if len(unique_quotes) < 10:
        raise ValueError("Not enough quotes found. Please check your CSV.")
    return unique_quotes
# Build the searchable corpus once, at import time.
quotes = load_quotes()

# ---- 2) Load embedding model and build embeddings ----
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Embed every quote up front (as a tensor) so that each query only needs a
# cosine-similarity comparison against this precomputed corpus.
corpus_emb = model.encode(
    quotes,
    show_progress_bar=True,
    convert_to_tensor=True,
)
| # ---- 3) Search function (top-3 similar) ---- | |
def get_top3(user_input: str):
    """Return the three quotes most similar to ``user_input`` as numbered text.

    Blank input (``None``, empty, or whitespace-only) short-circuits with a
    message asking the user to type something. Otherwise the input is encoded
    with the module-level ``model``, scored against ``corpus_emb`` by cosine
    similarity, and the three top-scoring entries of ``quotes`` are returned
    as ``"1. ..."``, ``"2. ..."``, ``"3. ..."`` separated by blank lines.
    """
    has_text = bool(user_input) and bool(user_input.strip())
    if not has_text:
        return "Please type something (e.g., 'happiness', 'overcoming failure', 'creativity')."

    query_vec = model.encode(user_input, convert_to_tensor=True)
    # Row 0 of the similarity matrix holds the scores for this query.
    scores = util.cos_sim(query_vec, corpus_emb)[0]
    _, best = torch.topk(scores, k=3)

    numbered = []
    for rank, quote_idx in enumerate(best.tolist(), start=1):
        numbered.append(f"{rank}. {quotes[quote_idx]}")
    return "\n\n".join(numbered)
| # ---- 4) Gradio UI ---- | |
# One text box in, one text box out; get_top3 does the actual search.
demo = gr.Interface(
    title="Quote Finder (Semantic Search)",
    description="Enter any phrase to get 3 semantically similar quotes. Works with your uploaded CSV or a public fallback.",
    fn=get_top3,
    inputs=gr.Textbox(placeholder="Type a theme or sentence...", lines=2),
    outputs=gr.Textbox(label="Top 3 similar quotes"),
    # Each example is a one-element row because the interface has one input.
    examples=[[topic] for topic in ("happiness", "overcoming failure", "friendship")],
)

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()