Rob Learsch committed on
Commit
c460746
·
1 Parent(s): ca9437b

Rewrite to save memory

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
Kendrick_Lamar_scraper.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Radiohead_scraper.ipynb ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from lyricsgenius import Genius\n",
10
+ "import os\n",
11
+ "import time\n",
12
+ "\n",
13
+ "# Read the Genius API token from the GENIUS_TOKEN environment variable\n",
14
+ "GENIUS_API_KEY = os.environ.get(\"GENIUS_TOKEN\")\n",
15
+ "\n",
16
+ "#https://lyricsgenius.readthedocs.io/en/master/usage.html\n",
17
+ "genius = Genius(GENIUS_API_KEY) \n",
18
+ "genius.remove_section_headers = True # Removes section headers from lyrics\n",
19
+ "genius.excluded_terms = [\"(Remix)\", \"(Live)\", \"(Acoustic)\", \"(Demo)\"] # Exclude these terms from song titles\n",
20
+ "#albums_list = genius.artist_albums(1421, per_page=None, page=None)"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "album_list = [\"Kid A\", \"OK Computer\", \"In Rainbows\", \"A Moon Shaped Pool\", \"Hail to the Thief\", \"The King of Limbs\",'The Bends',]\n",
30
+ "album_list_2= [\"Pablo Honey\", \"Amnesiac\",]\n",
31
+ " # \"A Brief History of Love\", \"The Eraser\", \"Tomorrow's Modern Boxes\",\n",
32
+ " #\"Anima\", \"The Smile\", \"Kid A Mnesia\", \"I Might Be Wrong\", \"Live in Berlin\", \"Com Lag (2plus2isfive)\"]"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 3,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "genius.sleep_time=3\n",
42
+ "genius.timeout = 15 # Allow each request up to 15 seconds before timing out"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 4,
48
+ "metadata": {},
49
+ "outputs": [
50
+ {
51
+ "name": "stdout",
52
+ "output_type": "stream",
53
+ "text": [
54
+ "Searching for \"Pablo Honey\" by Radiohead...\n",
55
+ "Wrote lyrics for Pablo Honey.\n",
56
+ "Searching for \"Amnesiac\" by Radiohead...\n",
57
+ "Wrote lyrics for Amnesiac.\n",
58
+ "Searching for \"A Brief History of Love\" by Radiohead...\n",
59
+ "No results found for: 'A Brief History of Love Radiohead'\n"
60
+ ]
61
+ },
62
+ {
63
+ "ename": "AttributeError",
64
+ "evalue": "'NoneType' object has no attribute 'to_dict'",
65
+ "output_type": "error",
66
+ "traceback": [
67
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
68
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
69
+ "Cell \u001b[0;32mIn[4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m album_name \u001b[38;5;129;01min\u001b[39;00m album_list_2:\n\u001b[1;32m 2\u001b[0m album \u001b[38;5;241m=\u001b[39m genius\u001b[38;5;241m.\u001b[39msearch_album(album_name, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRadiohead\u001b[39m\u001b[38;5;124m\"\u001b[39m,) \u001b[38;5;66;03m# Search for the album by title and artist\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m album_dict \u001b[38;5;241m=\u001b[39m \u001b[43malbum\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_dict\u001b[49m() \u001b[38;5;66;03m# Convert the album object to a dictionary\u001b[39;00m\n\u001b[1;32m 4\u001b[0m album_length \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(album_dict[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtracks\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mradiohead_lyrics.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ma\u001b[39m\u001b[38;5;124m\"\u001b[39m, encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file:\n",
70
+ "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'to_dict'"
71
+ ]
72
+ }
73
+ ],
74
+ "source": [
75
"# Append the lyrics of every track on each listed Radiohead album to radiohead_lyrics.txt.\n",
"for album_name in album_list_2:\n",
"    album = genius.search_album(album_name, \"Radiohead\")  # Search for the album by title and artist\n",
"    if album is None:\n",
"        # search_album gives back None when nothing matches; calling\n",
"        # to_dict() on it raised AttributeError and aborted the whole run.\n",
"        print(f\"Skipping {album_name}: no album found.\")\n",
"        continue\n",
"    tracks = album.to_dict()['tracks']  # Convert the album object to a dictionary\n",
"    with open(\"radiohead_lyrics.txt\", \"a\", encoding=\"utf-8\") as file:\n",
"        for track in tracks:\n",
"            # Keep only the text after the last 'Lyrics\\n' marker in the scraped lyrics.\n",
"            lyrics = track['song']['lyrics'].split('Lyrics\\n')[-1]\n",
"            file.write(lyrics + \"\\n\\n\" + \"=\" * 50 + \"\\n\\n\")  # Lyrics + divider\n",
"    print(f\"Wrote lyrics for {album_name}.\")\n"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": []
93
+ }
94
+ ],
95
+ "metadata": {
96
+ "kernelspec": {
97
+ "display_name": "3.10.16",
98
+ "language": "python",
99
+ "name": "python3"
100
+ },
101
+ "language_info": {
102
+ "codemirror_mode": {
103
+ "name": "ipython",
104
+ "version": 3
105
+ },
106
+ "file_extension": ".py",
107
+ "mimetype": "text/x-python",
108
+ "name": "python",
109
+ "nbconvert_exporter": "python",
110
+ "pygments_lexer": "ipython3",
111
+ "version": "3.10.16"
112
+ }
113
+ },
114
+ "nbformat": 4,
115
+ "nbformat_minor": 2
116
+ }
app.py CHANGED
@@ -99,12 +99,10 @@ def artist_response(gemma_response, artist):
99
  lyric_list = all_phrases_grateful_dead
100
  if artist == "Google Gemma":
101
  return gemma_response
102
-
103
- encoded_gemma = encoder_model.encode(gemma_response)
104
- similarity_result = encoder_model.similarity(
105
- encoded_gemma,
106
- artist_embeddings,
107
- )
108
  result_max_index = np.argmax(similarity_result)
109
  lyric_response = lyric_list[result_max_index]
110
 
@@ -146,7 +144,7 @@ def chat_with_musician(user_input, history, artist):
146
  try:
147
  response = client.chat_completion(
148
  messages=messages,
149
- model="google/gemma-2-2b-it",
150
  max_tokens=256,
151
  temperature=0.75,
152
  #top_p=0.9
@@ -162,36 +160,39 @@ def chat_with_musician(user_input, history, artist):
162
  artist_history[:] = artist_history[-10:] # Keep only last 10 entries
163
  return lyric_response
164
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  HF_API_KEY = os.environ["HF_API_KEY"]
166
 
167
- radiohead_lyrics = load_lyrics("radiohead_lyrics.txt")
168
- kendrick_lyrics = load_lyrics("kendrick_lamar_lyrics.txt")
169
- grateful_dead_lyrics = load_lyrics('grateful_dead_lyrics.txt')
170
 
171
- all_phrases_radiohead = generate_cumulative_phrases(songs_from_text(radiohead_lyrics))
172
- all_phrases_kendrick = generate_cumulative_phrases(songs_from_text(kendrick_lyrics))
173
- all_phrases_grateful_dead = generate_cumulative_phrases(songs_from_text(grateful_dead_lyrics))
174
 
175
- encoder_model = SentenceTransformer('all-MiniLM-L6-v2',
 
 
 
 
 
176
  #'sentence-transformers/all-MiniLM-L6-v2',
177
- backend='openvino',
178
- model_kwargs={"file_name": "openvino/openvino_model_qint8_quantized.xml"},
179
  #to increase speed:
180
  #similarity_function=SimilarityFunction.DOT_PRODUCT,
181
  )
182
- radiohead_embeddings = encoder_model.encode(all_phrases_radiohead,
183
- precision="int8",
184
- show_progress_bar=True,)
185
- radiohead_embeddings = radiohead_embeddings.astype(np.float32)
186
- kendrick_embeddings = encoder_model.encode(all_phrases_kendrick,
187
- precision="int8",
188
- show_progress_bar=True,)
189
- kendrick_embeddings = kendrick_embeddings.astype(np.float32)
190
- grateful_dead_embeddings = encoder_model.encode(all_phrases_grateful_dead,
191
- precision='int8',
192
- show_progress_bar=True,)
193
- grateful_dead_embeddings = grateful_dead_embeddings.astype(np.float32)
194
 
 
 
 
195
 
196
 
197
  size = 350 #256
 
99
  lyric_list = all_phrases_grateful_dead
100
  if artist == "Google Gemma":
101
  return gemma_response
102
+ encoder = get_encoder()
103
+ encoded_gemma = encoder.encode(gemma_response, precision="int8")
104
+ #encoded_gemma = encoder_model.encode(gemma_response)
105
+ similarity_result = cosine_similarity_int8(encoded_gemma, artist_embeddings)
 
 
106
  result_max_index = np.argmax(similarity_result)
107
  lyric_response = lyric_list[result_max_index]
108
 
 
144
  try:
145
  response = client.chat_completion(
146
  messages=messages,
147
+ #model="google/gemma-2-2b-it",
148
  max_tokens=256,
149
  temperature=0.75,
150
  #top_p=0.9
 
160
  artist_history[:] = artist_history[-10:] # Keep only last 10 entries
161
  return lyric_response
162
 
163
def cosine_similarity_int8(query, embeddings, chunk_size=4096):
    """Return the cosine similarity between a query and every embedding row.

    Args:
        query: Integer-quantised query vector of shape ``(d,)``. A ``(1, d)``
            array is accepted and flattened.
        embeddings: Integer-quantised matrix of shape ``(n, d)``. It may be a
            memory-mapped array; only ``chunk_size`` rows are widened at once.
        chunk_size: Number of embedding rows converted to int32 per step.

    Returns:
        A float64 array of shape ``(n,)`` holding one similarity per row.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Widen to int32 before multiplying so int8 products cannot wrap around.
    query = np.asarray(query).reshape(-1).astype(np.int32)
    query_norm = np.linalg.norm(query)

    n_rows = embeddings.shape[0]
    scores = np.empty(n_rows, dtype=np.float64)

    # Converting the whole matrix in one go allocates an int32 copy of every
    # row on each call (four times the size of int8 input), which defeats
    # memory-mapped loading. Working on row slices bounds the peak memory.
    for start in range(0, n_rows, chunk_size):
        stop = start + chunk_size
        block = np.asarray(embeddings[start:stop]).astype(np.int32)
        dots = block @ query
        block_norms = np.linalg.norm(block, axis=1)
        # The small epsilon keeps all-zero vectors from dividing by zero.
        scores[start:stop] = dots / (block_norms * query_norm + 1e-8)

    return scores
174
+
175
  HF_API_KEY = os.environ["HF_API_KEY"]
176
 
 
 
 
177
 
 
 
 
178
 
179
# Process-wide cache for the sentence encoder; filled on the first get_encoder() call.
_encoder_model = None


def get_encoder():
    """Return the shared SentenceTransformer, constructing it on first use.

    The model is built lazily so that importing this module does not load it;
    later calls reuse the instance stored in ``_encoder_model``.
    """
    global _encoder_model
    if _encoder_model is not None:
        return _encoder_model

    # Options currently left disabled: the OpenVINO backend with the
    # "openvino/openvino_model_qint8_quantized.xml" file, and
    # similarity_function=SimilarityFunction.DOT_PRODUCT (noted as a speed-up).
    _encoder_model = SentenceTransformer('all-MiniLM-L6-v2')
    return _encoder_model
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ radiohead_embeddings = np.load("radiohead_embeddings.npy", mmap_mode="r")
194
+ kendrick_embeddings = np.load("kendrick_embeddings.npy", mmap_mode="r")
195
+ grateful_dead_embeddings = np.load("grateful_dead_embeddings.npy", mmap_mode="r")
196
 
197
 
198
  size = 350 #256
grateful_dead_embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0134ba4610693e490146b62ac478e93ad351d1e07b0ef09560b229d2cb4aad10
3
+ size 18111104
kendrick_embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a333db77a592551e63fcdfd92d4f0529b60cdb4e99c3936220068a876ce28593
3
+ size 9623168
radiohead_embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da9a1b03efd746fa27fb30ea9e401c46fe87bc5b592bb3ebbdacd1b4b7843b0f
3
+ size 6394496