TutuAwad committed on
Commit
91018f0
·
verified ·
1 Parent(s): 3f33df2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -36
app.py CHANGED
@@ -16,7 +16,7 @@ from langchain_community.llms import HuggingFaceEndpoint
16
  # 1. SETUP & AUTHENTICATION
17
  # ---------------------------------------------------------
18
 
19
- # Load Environment Variables (Set these in Space Settings)
20
  SPOTIPY_CLIENT_ID = os.getenv("SPOTIPY_CLIENT_ID")
21
  SPOTIPY_CLIENT_SECRET = os.getenv("SPOTIPY_CLIENT_SECRET")
22
  HF_TOKEN = os.getenv("HF_TOKEN")
@@ -25,9 +25,8 @@ HF_TOKEN = os.getenv("HF_TOKEN")
25
  auth_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID, client_secret=SPOTIPY_CLIENT_SECRET)
26
  sp = spotipy.Spotify(auth_manager=auth_manager)
27
 
28
- # Setup LLM (Serverless Inference - No massive GPU needed locally)
29
- # We use Mistral or Zephyr (faster/better than Llama 2 for this) or Llama 2 via API
30
- repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
31
 
32
  llm = HuggingFaceEndpoint(
33
  repo_id=repo_id,
@@ -37,34 +36,46 @@ llm = HuggingFaceEndpoint(
37
  )
38
 
39
  # ---------------------------------------------------------
40
- # 2. DATA LOADING & VECTOR INDEXING
41
  # ---------------------------------------------------------
42
  print("⏳ Loading Data...")
43
- df = pd.read_csv("data.csv")
44
 
45
- # Data Cleaning (Same as your notebook)
46
- df = df.replace(r"^\s*$", np.nan, regex=True)
47
- df['text'] = df['text'].astype(str).str.replace(r"\r|\n", " ", regex=True)
48
- df['song'] = df['song'].astype(str).str.replace(r"\r|\n", " ", regex=True)
49
- df['artist'] = df['artist'].astype(str).str.replace(r"\r|\n", " ", regex=True)
50
-
51
- df['combined'] = (
52
- "Title: " + df['song'].str.strip() +
53
- "; Artist: " + df['artist'].str.strip() +
54
- "; Lyrics: " + df['text'].str.strip()
55
- ).str.lower().str.replace(r"[^a-z0-9\s]", "", regex=True)
56
-
57
- print("⏳ Loading Embedding Model...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  embedder = SentenceTransformer('all-mpnet-base-v2')
59
 
60
- print("⏳ Creating FAISS Index (This runs once on startup)...")
61
- # We rebuild the index on startup to ensure compatibility with CPU environment
62
- df_embeddings = embedder.encode(df['combined'].tolist(), show_progress_bar=True)
63
- d = df_embeddings.shape[1]
64
- index = faiss.IndexFlatL2(d)
65
- index.add(df_embeddings)
66
- print(f"✅ Index built with {index.ntotal} songs.")
67
-
68
  GENERIC_ARTISTS = ["religious music", "christmas songs", "various artists", "soundtrack", "unknown", "traditional"]
69
 
70
  # ---------------------------------------------------------
@@ -80,7 +91,6 @@ def normalize_text(text):
80
  return re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
81
 
82
  def get_best_spotify_match(artist, title):
83
- """Finds the best Spotify link/image for a song"""
84
  artist_clean = clean_metadata(artist)
85
  title_clean = clean_metadata(title)
86
  query = f"{artist_clean} {title_clean}"
@@ -100,8 +110,14 @@ def get_best_spotify_match(artist, title):
100
  for item in items:
101
  track_artists = " ".join([normalize_text(a['name']) for a in item['artists']])
102
  score = difflib.SequenceMatcher(None, target_artist, track_artists).ratio()
103
- if score > best_score:
104
- best_score = score
 
 
 
 
 
 
105
  best_match = item
106
 
107
  if best_match:
@@ -111,7 +127,6 @@ def get_best_spotify_match(artist, title):
111
  return None, None
112
 
113
  def get_theme_colors(query):
114
- """Generates a color theme based on the query hash"""
115
  palettes = [
116
  {"name": "Spotify Classic", "accent": "#1DB954", "bg_grad": "linear-gradient(135deg, #103018 0%, #000000 100%)", "text": "#1DB954", "btn_text": "#000000"},
117
  {"name": "Midnight Purple", "accent": "#D0BCFF", "bg_grad": "linear-gradient(135deg, #240046 0%, #000000 100%)", "text": "#D0BCFF", "btn_text": "#000000"},
@@ -143,7 +158,7 @@ def harmonifind_search(user_query, k=7, use_llama=True):
143
 
144
  if use_llama:
145
  try:
146
- # We use the inference API here
147
  prompt = f"User Query: '{user_query}'\nOutput exactly 5 descriptive keywords regarding the mood, instruments, or genre. Do not output full sentences. Keywords:"
148
  raw_response = llm.invoke(prompt)
149
  keywords = raw_response.replace("\n", " ").strip()
@@ -152,15 +167,16 @@ def harmonifind_search(user_query, k=7, use_llama=True):
152
  except Exception as e:
153
  print(f"⚠️ AI skipped: {e}")
154
 
 
155
  q_vec = embedder.encode([search_query])
 
 
156
  distances, indices = index.search(q_vec, k)
157
 
158
- results_df = df.iloc[indices[0]].copy()
159
 
160
- # Calculate match %
161
  scores = []
162
  for dist in distances[0]:
163
- # Simple heuristic to convert L2 distance to percentage
164
  scores.append(int(max(0, min(100, (1 - (dist / 1.5)) * 100))))
165
  results_df['match_score'] = scores
166
 
@@ -183,7 +199,6 @@ def gradio_interface_fn(query):
183
  df_results = harmonifind_search(query, k=7, use_llama=True)
184
  theme = get_theme_colors(query)
185
 
186
- # Prepare Share Links
187
  share_text = urllib.parse.quote(f"Listening to '{query}' via HarmoniFind 🎵")
188
  share_url_x = f"https://twitter.com/intent/tweet?text={share_text}"
189
 
 
16
  # 1. SETUP & AUTHENTICATION
17
  # ---------------------------------------------------------
18
 
19
+ # Load Environment Variables from Space Settings
20
  SPOTIPY_CLIENT_ID = os.getenv("SPOTIPY_CLIENT_ID")
21
  SPOTIPY_CLIENT_SECRET = os.getenv("SPOTIPY_CLIENT_SECRET")
22
  HF_TOKEN = os.getenv("HF_TOKEN")
 
25
  auth_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID, client_secret=SPOTIPY_CLIENT_SECRET)
26
  sp = spotipy.Spotify(auth_manager=auth_manager)
27
 
28
+ # Setup LLM (Using Mistral-7B via Inference API - fast and free)
29
+ repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
 
30
 
31
  llm = HuggingFaceEndpoint(
32
  repo_id=repo_id,
 
36
  )
37
 
38
  # ---------------------------------------------------------
39
+ # 2. DATA LOADING (The Safe Way)
40
  # ---------------------------------------------------------
41
  print("⏳ Loading Data...")
 
42
 
43
# 1. Load CSV
# Bind df_combined up front so that a failed load leaves an empty frame with
# the expected columns instead of an undefined name (which would surface as a
# NameError inside the search function rather than at startup).
df_combined = pd.DataFrame(columns=["text", "song", "artist"])
try:
    df_combined = pd.read_csv("data.csv")
    # Ensure text columns are strings to prevent errors
    df_combined['text'] = df_combined['text'].astype(str)
    df_combined['song'] = df_combined['song'].astype(str)
    df_combined['artist'] = df_combined['artist'].astype(str)
    print(" CSV Loaded")
except Exception as e:
    # Best-effort startup: report the problem and keep the app importable.
    print(f" Error loading data.csv: {e}")

# 2. Load Embeddings (Crucial Step)
print("⏳ Loading Embeddings from .npz...")
try:
    # np.load on an .npz returns a lazily-read archive; the context manager
    # closes the underlying file once the array has been materialized.
    with np.load("df_embed.npz") as data:
        # FAISS only accepts C-contiguous float32 matrices, so normalise the
        # dtype/layout here instead of relying on how the file was saved.
        df_embeddings = np.ascontiguousarray(data['df_embeddings'], dtype=np.float32)
    print(f"✅ Embeddings Loaded. Shape: {df_embeddings.shape}")

    # Create FAISS Index on CPU
    # We use IndexFlatL2 which is exact, simple, and works everywhere
    d = df_embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(df_embeddings)
    print(f"✅ FAISS Index ready with {index.ntotal} vectors.")

    # Search results are mapped back to rows with df_combined.iloc[...], so the
    # two sources must line up one-to-one; flag it loudly when they do not.
    if index.ntotal != len(df_combined):
        print(f"⚠️ Embedding count ({index.ntotal}) does not match CSV rows ({len(df_combined)}); results may be wrong.")

except Exception as e:
    print(f"❌ Error loading df_embed.npz: {e}")
    print("CRITICAL: Make sure you uploaded 'df_embed.npz' to the Files tab.")
    # Create a dummy index so the app doesn't crash immediately, but search won't work.
    # NOTE(review): 768 is presumably the output width of 'all-mpnet-base-v2' — confirm
    # against the model card if the embedder below is ever changed.
    index = faiss.IndexFlatL2(768)

# 3. Load Model (Only needed to encode the USER query, not the database)
print("⏳ Loading Sentence Transformer...")
embedder = SentenceTransformer('all-mpnet-base-v2')
78
 
 
 
 
 
 
 
 
 
79
  GENERIC_ARTISTS = ["religious music", "christmas songs", "various artists", "soundtrack", "unknown", "traditional"]
80
 
81
  # ---------------------------------------------------------
 
91
  return re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
92
 
93
  def get_best_spotify_match(artist, title):
 
94
  artist_clean = clean_metadata(artist)
95
  title_clean = clean_metadata(title)
96
  query = f"{artist_clean} {title_clean}"
 
110
  for item in items:
111
  track_artists = " ".join([normalize_text(a['name']) for a in item['artists']])
112
  score = difflib.SequenceMatcher(None, target_artist, track_artists).ratio()
113
+
114
+ found_title = normalize_text(item['name'])
115
+ t_score = difflib.SequenceMatcher(None, normalize_text(title), found_title).ratio()
116
+
117
+ final_score = (score * 0.6) + (t_score * 0.4)
118
+
119
+ if final_score > best_score:
120
+ best_score = final_score
121
  best_match = item
122
 
123
  if best_match:
 
127
  return None, None
128
 
129
  def get_theme_colors(query):
 
130
  palettes = [
131
  {"name": "Spotify Classic", "accent": "#1DB954", "bg_grad": "linear-gradient(135deg, #103018 0%, #000000 100%)", "text": "#1DB954", "btn_text": "#000000"},
132
  {"name": "Midnight Purple", "accent": "#D0BCFF", "bg_grad": "linear-gradient(135deg, #240046 0%, #000000 100%)", "text": "#D0BCFF", "btn_text": "#000000"},
 
158
 
159
  if use_llama:
160
  try:
161
+ # We use the inference API here - Safe for CPU spaces
162
  prompt = f"User Query: '{user_query}'\nOutput exactly 5 descriptive keywords regarding the mood, instruments, or genre. Do not output full sentences. Keywords:"
163
  raw_response = llm.invoke(prompt)
164
  keywords = raw_response.replace("\n", " ").strip()
 
167
  except Exception as e:
168
  print(f"⚠️ AI skipped: {e}")
169
 
170
+ # Encode user query using the local CPU model
171
  q_vec = embedder.encode([search_query])
172
+
173
+ # Search the Pre-loaded Index
174
  distances, indices = index.search(q_vec, k)
175
 
176
+ results_df = df_combined.iloc[indices[0]].copy()
177
 
 
178
  scores = []
179
  for dist in distances[0]:
 
180
  scores.append(int(max(0, min(100, (1 - (dist / 1.5)) * 100))))
181
  results_df['match_score'] = scores
182
 
 
199
  df_results = harmonifind_search(query, k=7, use_llama=True)
200
  theme = get_theme_colors(query)
201
 
 
202
  share_text = urllib.parse.quote(f"Listening to '{query}' via HarmoniFind 🎵")
203
  share_url_x = f"https://twitter.com/intent/tweet?text={share_text}"
204