Spaces:

TutuAwad
/

HarmoniFind

Sleeping

App Files Community

TutuAwad committed on Nov 28, 2025

Commit

91018f0

verified ·

1 Parent(s): 3f33df2

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -36

app.py CHANGED Viewed

@@ -16,7 +16,7 @@ from langchain_community.llms import HuggingFaceEndpoint
 # 1. SETUP & AUTHENTICATION
 # ---------------------------------------------------------
-# Load Environment Variables (Set these in Space Settings)
 SPOTIPY_CLIENT_ID = os.getenv("SPOTIPY_CLIENT_ID")
 SPOTIPY_CLIENT_SECRET = os.getenv("SPOTIPY_CLIENT_SECRET")
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -25,9 +25,8 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 auth_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID, client_secret=SPOTIPY_CLIENT_SECRET)
 sp = spotipy.Spotify(auth_manager=auth_manager)
-# Setup LLM (Serverless Inference - No massive GPU needed locally)
-# We use Mistral or Zephyr (faster/better than Llama 2 for this) or Llama 2 via API
-repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
 llm = HuggingFaceEndpoint(
     repo_id=repo_id,
@@ -37,34 +36,46 @@ llm = HuggingFaceEndpoint(
 )
 # ---------------------------------------------------------
-# 2. DATA LOADING & VECTOR INDEXING
 # ---------------------------------------------------------
 print("⏳ Loading Data...")
-df = pd.read_csv("data.csv")
-# Data Cleaning (Same as your notebook)
-df = df.replace(r"^\s*$", np.nan, regex=True)
-df['text'] = df['text'].astype(str).str.replace(r"\r|\n", " ", regex=True)
-df['song'] = df['song'].astype(str).str.replace(r"\r|\n", " ", regex=True)
-df['artist'] = df['artist'].astype(str).str.replace(r"\r|\n", " ", regex=True)
-df['combined'] = (
-    "Title: " + df['song'].str.strip() +
-    "; Artist: " + df['artist'].str.strip() +
-    "; Lyrics: " + df['text'].str.strip()
-).str.lower().str.replace(r"[^a-z0-9\s]", "", regex=True)
-print("⏳ Loading Embedding Model...")
 embedder = SentenceTransformer('all-mpnet-base-v2')
-print("⏳ Creating FAISS Index (This runs once on startup)...")
-# We rebuild the index on startup to ensure compatibility with CPU environment
-df_embeddings = embedder.encode(df['combined'].tolist(), show_progress_bar=True)
-d = df_embeddings.shape[1]
-index = faiss.IndexFlatL2(d)
-index.add(df_embeddings)
-print(f"✅ Index built with {index.ntotal} songs.")
 GENERIC_ARTISTS = ["religious music", "christmas songs", "various artists", "soundtrack", "unknown", "traditional"]
 # ---------------------------------------------------------
@@ -80,7 +91,6 @@ def normalize_text(text):
     return re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
 def get_best_spotify_match(artist, title):
-    """Finds the best Spotify link/image for a song"""
     artist_clean = clean_metadata(artist)
     title_clean = clean_metadata(title)
     query = f"{artist_clean} {title_clean}"
@@ -100,8 +110,14 @@ def get_best_spotify_match(artist, title):
     for item in items:
         track_artists = " ".join([normalize_text(a['name']) for a in item['artists']])
         score = difflib.SequenceMatcher(None, target_artist, track_artists).ratio()
-        if score > best_score:
-            best_score = score
             best_match = item
     if best_match:
@@ -111,7 +127,6 @@ def get_best_spotify_match(artist, title):
     return None, None
 def get_theme_colors(query):
-    """Generates a color theme based on the query hash"""
     palettes = [
         {"name": "Spotify Classic", "accent": "#1DB954", "bg_grad": "linear-gradient(135deg, #103018 0%, #000000 100%)", "text": "#1DB954", "btn_text": "#000000"},
         {"name": "Midnight Purple", "accent": "#D0BCFF", "bg_grad": "linear-gradient(135deg, #240046 0%, #000000 100%)", "text": "#D0BCFF", "btn_text": "#000000"},
@@ -143,7 +158,7 @@ def harmonifind_search(user_query, k=7, use_llama=True):
     if use_llama:
         try:
-            # We use the inference API here
             prompt = f"User Query: '{user_query}'\nOutput exactly 5 descriptive keywords regarding the mood, instruments, or genre. Do not output full sentences. Keywords:"
             raw_response = llm.invoke(prompt)
             keywords = raw_response.replace("\n", " ").strip()
@@ -152,15 +167,16 @@ def harmonifind_search(user_query, k=7, use_llama=True):
         except Exception as e:
             print(f"⚠️ AI skipped: {e}")
     q_vec = embedder.encode([search_query])
     distances, indices = index.search(q_vec, k)
-    results_df = df.iloc[indices[0]].copy()
-    # Calculate match %
     scores = []
     for dist in distances[0]:
-        # Simple heuristic to convert L2 distance to percentage
         scores.append(int(max(0, min(100, (1 - (dist / 1.5)) * 100))))
     results_df['match_score'] = scores
@@ -183,7 +199,6 @@ def gradio_interface_fn(query):
     df_results = harmonifind_search(query, k=7, use_llama=True)
     theme = get_theme_colors(query)
-    # Prepare Share Links
     share_text = urllib.parse.quote(f"Listening to '{query}' via HarmoniFind 🎵")
     share_url_x = f"https://twitter.com/intent/tweet?text={share_text}"

 # 1. SETUP & AUTHENTICATION
 # ---------------------------------------------------------
+# Load Environment Variables from Space Settings
 SPOTIPY_CLIENT_ID = os.getenv("SPOTIPY_CLIENT_ID")
 SPOTIPY_CLIENT_SECRET = os.getenv("SPOTIPY_CLIENT_SECRET")
 HF_TOKEN = os.getenv("HF_TOKEN")
 auth_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID, client_secret=SPOTIPY_CLIENT_SECRET)
 sp = spotipy.Spotify(auth_manager=auth_manager)
+# Setup LLM (Using Mistral-7B via Inference API - fast and free)
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
 llm = HuggingFaceEndpoint(
     repo_id=repo_id,
 )
 # ---------------------------------------------------------
+# 2. DATA LOADING (The Safe Way)
 # ---------------------------------------------------------
 print("⏳ Loading Data...")
+# 1. Load CSV
+try:
+    df_combined = pd.read_csv("data.csv")
+    # Ensure text columns are strings to prevent errors
+    df_combined['text'] = df_combined['text'].astype(str)
+    df_combined['song'] = df_combined['song'].astype(str)
+    df_combined['artist'] = df_combined['artist'].astype(str)
+    print("✅ CSV Loaded")
+except Exception as e:
+    print(f"❌ Error loading data.csv: {e}")
+# 2. Load Embeddings (Crucial Step)
+print("⏳ Loading Embeddings from .npz...")
+try:
+    # Load the file you uploaded
+    data = np.load("df_embed.npz")
+    df_embeddings = data['df_embeddings']
+    print(f"✅ Embeddings Loaded. Shape: {df_embeddings.shape}")
+    # Create FAISS Index on CPU
+    # We use IndexFlatL2 which is exact, simple, and works everywhere
+    d = df_embeddings.shape[1]
+    index = faiss.IndexFlatL2(d)
+    index.add(df_embeddings)
+    print(f"✅ FAISS Index ready with {index.ntotal} vectors.")
+except Exception as e:
+    print(f"❌ Error loading df_embed.npz: {e}")
+    print("CRITICAL: Make sure you uploaded 'df_embed.npz' to the Files tab.")
+    # Create a dummy index so the app doesn't crash immediately, but search won't work
+    index = faiss.IndexFlatL2(768)
+# 3. Load Model (Only needed to encode the USER query, not the database)
+print("⏳ Loading Sentence Transformer...")
 embedder = SentenceTransformer('all-mpnet-base-v2')
 GENERIC_ARTISTS = ["religious music", "christmas songs", "various artists", "soundtrack", "unknown", "traditional"]
 # ---------------------------------------------------------
     return re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
 def get_best_spotify_match(artist, title):
     artist_clean = clean_metadata(artist)
     title_clean = clean_metadata(title)
     query = f"{artist_clean} {title_clean}"
     for item in items:
         track_artists = " ".join([normalize_text(a['name']) for a in item['artists']])
         score = difflib.SequenceMatcher(None, target_artist, track_artists).ratio()
+        found_title = normalize_text(item['name'])
+        t_score = difflib.SequenceMatcher(None, normalize_text(title), found_title).ratio()
+        final_score = (score * 0.6) + (t_score * 0.4)
+        if final_score > best_score:
+            best_score = final_score
             best_match = item
     if best_match:
     return None, None
 def get_theme_colors(query):
     palettes = [
         {"name": "Spotify Classic", "accent": "#1DB954", "bg_grad": "linear-gradient(135deg, #103018 0%, #000000 100%)", "text": "#1DB954", "btn_text": "#000000"},
         {"name": "Midnight Purple", "accent": "#D0BCFF", "bg_grad": "linear-gradient(135deg, #240046 0%, #000000 100%)", "text": "#D0BCFF", "btn_text": "#000000"},
     if use_llama:
         try:
+            # We use the inference API here - Safe for CPU spaces
             prompt = f"User Query: '{user_query}'\nOutput exactly 5 descriptive keywords regarding the mood, instruments, or genre. Do not output full sentences. Keywords:"
             raw_response = llm.invoke(prompt)
             keywords = raw_response.replace("\n", " ").strip()
         except Exception as e:
             print(f"⚠️ AI skipped: {e}")
+    # Encode user query using the local CPU model
     q_vec = embedder.encode([search_query])
+    # Search the Pre-loaded Index
     distances, indices = index.search(q_vec, k)
+    results_df = df_combined.iloc[indices[0]].copy()
     scores = []
     for dist in distances[0]:
         scores.append(int(max(0, min(100, (1 - (dist / 1.5)) * 100))))
     results_df['match_score'] = scores
     df_results = harmonifind_search(query, k=7, use_llama=True)
     theme = get_theme_colors(query)
     share_text = urllib.parse.quote(f"Listening to '{query}' via HarmoniFind 🎵")
     share_url_x = f"https://twitter.com/intent/tweet?text={share_text}"