Nikita Makarov committed on
Commit
27ca72c
·
1 Parent(s): 0df1705
Files changed (4) hide show
  1. requirements.txt +4 -1
  2. src/app.py +23 -10
  3. src/radio_agent.py +116 -13
  4. src/rag_system.py +455 -38
requirements.txt CHANGED
@@ -1,8 +1,11 @@
1
- gradio==4.44.0
2
  openai>=1.0.0
3
  elevenlabs==1.10.0
4
  llama-index==0.11.20
5
  llama-index-llms-openai>=0.1.5
 
 
 
6
  requests==2.32.3
7
  python-dotenv==1.0.1
8
  pydantic==2.9.2
 
1
+ gradio==4.44.1
2
  openai>=1.0.0
3
  elevenlabs==1.10.0
4
  llama-index==0.11.20
5
  llama-index-llms-openai>=0.1.5
6
+ llama-index-embeddings-openai>=0.1.5
7
+ llama-index-embeddings-huggingface>=0.1.0
8
+ sentence-transformers>=2.2.0
9
  requests==2.32.3
10
  python-dotenv==1.0.1
11
  pydantic==2.9.2
src/app.py CHANGED
@@ -80,12 +80,15 @@ def save_preferences(name: str, favorite_genres: List[str], interests: List[str]
80
  radio_state["content_filter"] = preferences["content_filter"]
81
 
82
  radio_state["user_preferences"] = preferences
83
- agent.rag_system.store_user_preferences(preferences)
 
 
 
84
 
85
  # Save to user memory for persistence
86
- if radio_state.get("user_id"):
87
- user_memory.save_user_preferences(radio_state["user_id"], preferences)
88
- print(f"πŸ’Ύ Preferences saved for user {radio_state['user_id']}")
89
 
90
  return f"βœ… Preferences saved! Welcome, {preferences['name']}! Your personalized radio is ready."
91
 
@@ -151,7 +154,9 @@ def start_radio_stream():
151
  return "πŸ“» Radio is already playing!", None, None, None, ""
152
 
153
  # Plan the show with content filter
 
154
  show_plan = agent.plan_radio_show(
 
155
  user_preferences=radio_state["user_preferences"],
156
  duration_minutes=30,
157
  content_filter=radio_state["content_filter"]
@@ -227,7 +232,9 @@ def start_and_play_first_segment():
227
 
228
  # Step 1: Plan skeleton show (YouTube searches happen here)
229
  print(" [1/3] Planning show (searching YouTube)...")
 
230
  show_plan = agent.plan_radio_show(
 
231
  user_preferences=radio_state["user_preferences"],
232
  duration_minutes=30,
233
  content_filter=radio_state["content_filter"]
@@ -315,11 +322,15 @@ def play_next_segment():
315
  radio_state["current_segment_index"] += 1
316
 
317
  # Lazily generate LLM content for this segment
318
- segment = agent.enrich_segment(segment, radio_state["user_preferences"])
 
 
 
 
319
  radio_state["planned_show"][segment_index] = segment
320
 
321
  # Execute segment (log to RAG)
322
- agent.execute_segment(segment)
323
  radio_state["last_segment"] = segment
324
 
325
  # Generate content display and underlying script text
@@ -404,7 +415,7 @@ def play_next_segment():
404
  youtube_id = track["url"].split("v=")[-1].split("&")[0]
405
 
406
  if youtube_id:
407
- # YouTube embed with direct iframe (Gradio sanitizes script tags)
408
  music_player_html = f"""
409
  <div style="padding: 1rem; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 12px; margin: 1rem 0; box-shadow: 0 4px 15px rgba(0,0,0,0.3);">
410
  <h4 style="margin: 0 0 0.75rem 0; color: #fff; font-size: 1.1em;">🎡 {track.get('title', 'Unknown')}</h4>
@@ -1197,14 +1208,16 @@ with gr.Blocks(css=custom_css, title="AI Radio 🎡", theme=gr.themes.Soft()) as
1197
  dislike_btn = gr.Button("πŸ‘Ž Don't Play Again", variant="secondary", size="sm")
1198
  like_status = gr.Textbox(label="Rating Status", value="Rate the current song!", interactive=False, scale=2)
1199
 
1200
- progress_text = gr.Textbox(label="Progress", value="Ready to start", interactive=False)
1201
 
1202
  segment_info = gr.Markdown("**Welcome!** Set your preferences and start the radio.", elem_id="segment-display")
 
1203
  llm_script = gr.Textbox(
1204
  label="🧠 Model Script (LLM answer)",
1205
  value="Model-generated script will appear here for each segment.",
1206
  interactive=False,
1207
- lines=6
 
1208
  )
1209
 
1210
  gr.Markdown("**πŸ’‘ Tip:** Host speech plays first, then music/podcasts will stream automatically!")
@@ -1212,7 +1225,7 @@ with gr.Blocks(css=custom_css, title="AI Radio 🎡", theme=gr.themes.Soft()) as
1212
  audio_output = gr.Audio(label="πŸ”Š Host Speech", autoplay=True, type="filepath", elem_id="host_audio")
1213
  music_player = gr.HTML(label="🎡 Music/Podcast Player (streaming)")
1214
 
1215
- status_text = gr.Textbox(label="Status", value="Ready", interactive=False)
1216
 
1217
  # Connect buttons
1218
  start_btn.click(
 
80
  radio_state["content_filter"] = preferences["content_filter"]
81
 
82
  radio_state["user_preferences"] = preferences
83
+
84
+ # Save to RAG system with user_id
85
+ user_id = radio_state.get("user_id")
86
+ agent.rag_system.store_user_preferences(preferences, user_id=user_id)
87
 
88
  # Save to user memory for persistence
89
+ if user_id:
90
+ user_memory.save_user_preferences(user_id, preferences)
91
+ print(f"πŸ’Ύ Preferences saved for user {user_id}")
92
 
93
  return f"βœ… Preferences saved! Welcome, {preferences['name']}! Your personalized radio is ready."
94
 
 
154
  return "πŸ“» Radio is already playing!", None, None, None, ""
155
 
156
  # Plan the show with content filter
157
+ user_id = radio_state.get("user_id")
158
  show_plan = agent.plan_radio_show(
159
+ user_id=user_id,
160
  user_preferences=radio_state["user_preferences"],
161
  duration_minutes=30,
162
  content_filter=radio_state["content_filter"]
 
232
 
233
  # Step 1: Plan skeleton show (YouTube searches happen here)
234
  print(" [1/3] Planning show (searching YouTube)...")
235
+ user_id = radio_state.get("user_id")
236
  show_plan = agent.plan_radio_show(
237
+ user_id=user_id,
238
  user_preferences=radio_state["user_preferences"],
239
  duration_minutes=30,
240
  content_filter=radio_state["content_filter"]
 
322
  radio_state["current_segment_index"] += 1
323
 
324
  # Lazily generate LLM content for this segment
325
+ user_prefs = radio_state["user_preferences"].copy()
326
+ user_id = radio_state.get("user_id")
327
+ if user_id:
328
+ user_prefs["_user_id"] = user_id # Pass user_id in preferences for RAG queries
329
+ segment = agent.enrich_segment(segment, user_prefs)
330
  radio_state["planned_show"][segment_index] = segment
331
 
332
  # Execute segment (log to RAG)
333
+ agent.execute_segment(segment, user_id=user_id)
334
  radio_state["last_segment"] = segment
335
 
336
  # Generate content display and underlying script text
 
415
  youtube_id = track["url"].split("v=")[-1].split("&")[0]
416
 
417
  if youtube_id:
418
+ # Simple YouTube iframe embed with autoplay
419
  music_player_html = f"""
420
  <div style="padding: 1rem; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 12px; margin: 1rem 0; box-shadow: 0 4px 15px rgba(0,0,0,0.3);">
421
  <h4 style="margin: 0 0 0.75rem 0; color: #fff; font-size: 1.1em;">🎡 {track.get('title', 'Unknown')}</h4>
 
1208
  dislike_btn = gr.Button("πŸ‘Ž Don't Play Again", variant="secondary", size="sm")
1209
  like_status = gr.Textbox(label="Rating Status", value="Rate the current song!", interactive=False, scale=2)
1210
 
1211
+ progress_text = gr.Textbox(label="Progress", value="Ready to start", interactive=False, visible=False)
1212
 
1213
  segment_info = gr.Markdown("**Welcome!** Set your preferences and start the radio.", elem_id="segment-display")
1214
+
1215
  llm_script = gr.Textbox(
1216
  label="🧠 Model Script (LLM answer)",
1217
  value="Model-generated script will appear here for each segment.",
1218
  interactive=False,
1219
+ lines=6,
1220
+ visible=False
1221
  )
1222
 
1223
  gr.Markdown("**πŸ’‘ Tip:** Host speech plays first, then music/podcasts will stream automatically!")
 
1225
  audio_output = gr.Audio(label="πŸ”Š Host Speech", autoplay=True, type="filepath", elem_id="host_audio")
1226
  music_player = gr.HTML(label="🎡 Music/Podcast Player (streaming)")
1227
 
1228
+ status_text = gr.Textbox(label="Status", value="Ready", interactive=False, visible=False)
1229
 
1230
  # Connect buttons
1231
  start_btn.click(
src/radio_agent.py CHANGED
@@ -61,9 +61,10 @@ class RadioAgent:
61
  self.is_streaming = False
62
  self.current_segment = None
63
  self.segment_history = []
 
64
 
65
  def plan_radio_show(self, user_preferences: Dict[str, Any], duration_minutes: int = 30,
66
- content_filter: Dict[str, bool] = None) -> List[Dict[str, Any]]:
67
  """
68
  Plan a personalized radio show based on user preferences
69
  This demonstrates autonomous planning behavior
@@ -72,10 +73,14 @@ class RadioAgent:
72
  user_preferences: User's preferences and mood
73
  duration_minutes: Total duration of the show
74
  content_filter: Dictionary with content type filters (music, news, podcasts, stories)
 
75
 
76
  Returns:
77
  List of planned segments
78
  """
 
 
 
79
  segments = []
80
 
81
  # Default filter - all enabled
@@ -87,8 +92,8 @@ class RadioAgent:
87
  "stories": True
88
  }
89
 
90
- # Get user preferences from RAG
91
- stored_prefs = self.rag_system.get_user_preferences()
92
  merged_prefs = {**stored_prefs, **user_preferences}
93
 
94
  # Calculate segment distribution based on filter
@@ -320,7 +325,7 @@ class RadioAgent:
320
  'duration': 2
321
  }
322
 
323
- def _generate_music_commentary(self, track: Dict[str, Any], preferences: Dict[str, Any]) -> str:
324
  """Generate longer, more engaging commentary for music track with jokes/facts"""
325
  if not track or not self.client:
326
  return f"Here's a great track for you!"
@@ -328,12 +333,31 @@ class RadioAgent:
328
  mood = preferences.get('mood', 'happy')
329
  interests = preferences.get('interests', [])
330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  try:
332
  prompt = f"""You are an energetic, entertaining radio DJ named Lera. This is a middle of a show. Introduce this song in a fun, engaging way:
333
  Title: {track['title']}
334
  Artist: {track['artist']}
335
  Genre: {track['genre']}
336
- Listener mood: {mood}
337
 
338
  Make it:
339
  - About 10-15 seconds of speech, max_tokens = 400
@@ -341,7 +365,11 @@ class RadioAgent:
341
  - Add a light joke or witty comment
342
  - Be enthusiastic and engaging
343
  - Sound natural, like a real radio host
344
- - Connect it to the listener's mood if possible"""
 
 
 
 
345
 
346
  response = self.client.chat.completions.create(
347
  model=self.config.nebius_model,
@@ -350,10 +378,31 @@ class RadioAgent:
350
  max_tokens=400 #400
351
  )
352
  text = response.choices[0].message.content.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  llm_logger.info(
354
- "MUSIC | model=%s | prompt=%r | response=%r",
355
  self.config.nebius_model,
356
- prompt,
 
357
  text[:1000],
358
  )
359
  return text
@@ -528,15 +577,20 @@ Don't use emojis. Just speak naturally like a real DJ."""
528
  else:
529
  return "evening"
530
 
531
- def execute_segment(self, segment: Dict[str, Any]) -> Dict[str, Any]:
532
  """Execute a planned segment and log to RAG"""
533
  self.current_segment = segment
534
  self.segment_history.append(segment)
535
 
536
- # Store in RAG system
 
 
 
 
537
  self.rag_system.store_listening_history(
538
  item_type=segment['type'],
539
- item_data=segment
 
540
  )
541
 
542
  return segment
@@ -567,10 +621,42 @@ Don't use emojis. Just speak naturally like a real DJ."""
567
  elif seg_type == "music":
568
  # Lazy fetch track if not yet fetched
569
  if not segment.get("track"):
 
570
  genres = prefs.get('favorite_genres', ['pop'])
571
  mood = prefs.get('mood', 'happy')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  tracks = self.music_server.search_free_music(
573
- genre=random.choice(genres),
574
  mood=mood,
575
  limit=1
576
  )
@@ -578,7 +664,24 @@ Don't use emojis. Just speak naturally like a real DJ."""
578
 
579
  track = segment.get("track")
580
  if track and not segment.get("commentary"):
581
- segment["commentary"] = self._generate_music_commentary(track, prefs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
582
 
583
  elif seg_type == "news":
584
  # Lazy fetch news if not yet fetched
 
61
  self.is_streaming = False
62
  self.current_segment = None
63
  self.segment_history = []
64
+ self.current_user_id = None # Track current user for RAG queries
65
 
66
  def plan_radio_show(self, user_preferences: Dict[str, Any], duration_minutes: int = 30,
67
+ content_filter: Dict[str, bool] = None, user_id: str = None) -> List[Dict[str, Any]]:
68
  """
69
  Plan a personalized radio show based on user preferences
70
  This demonstrates autonomous planning behavior
 
73
  user_preferences: User's preferences and mood
74
  duration_minutes: Total duration of the show
75
  content_filter: Dictionary with content type filters (music, news, podcasts, stories)
76
+ user_id: User ID for RAG queries
77
 
78
  Returns:
79
  List of planned segments
80
  """
81
+ # Store user_id for later use
82
+ self.current_user_id = user_id
83
+
84
  segments = []
85
 
86
  # Default filter - all enabled
 
92
  "stories": True
93
  }
94
 
95
+ # Get user preferences from RAG (filtered by user_id)
96
+ stored_prefs = self.rag_system.get_user_preferences(user_id=user_id)
97
  merged_prefs = {**stored_prefs, **user_preferences}
98
 
99
  # Calculate segment distribution based on filter
 
325
  'duration': 2
326
  }
327
 
328
+ def _generate_music_commentary(self, track: Dict[str, Any], preferences: Dict[str, Any], rag_context: List[Dict[str, Any]] = None) -> str:
329
  """Generate longer, more engaging commentary for music track with jokes/facts"""
330
  if not track or not self.client:
331
  return f"Here's a great track for you!"
 
333
  mood = preferences.get('mood', 'happy')
334
  interests = preferences.get('interests', [])
335
 
336
+ # Add RAG context to prompt if available
337
+ rag_context_text = ""
338
+ if rag_context:
339
+ rag_context_text = "\n\nUser's listening history context (use this to personalize your commentary):\n"
340
+ for i, ctx in enumerate(rag_context[:2]): # Use top 2 most relevant
341
+ ctx_text = ctx.get('text', '')[:200]
342
+ score = ctx.get('score', 0)
343
+ rag_context_text += f"- Context #{i+1} (relevance: {score:.4f}): {ctx_text}\n"
344
+
345
+ llm_logger.info(
346
+ "RAG_CONTEXT | track=%s | context_docs=%d | context_preview=%s",
347
+ track.get('title', 'Unknown'),
348
+ len(rag_context),
349
+ rag_context_text[:300]
350
+ )
351
+ print(f" πŸ“ [RAG] Added {len(rag_context)} context documents to LLM prompt")
352
+ else:
353
+ print(f" ℹ️ [RAG] No context available - using standard prompt")
354
+
355
  try:
356
  prompt = f"""You are an energetic, entertaining radio DJ named Lera. This is a middle of a show. Introduce this song in a fun, engaging way:
357
  Title: {track['title']}
358
  Artist: {track['artist']}
359
  Genre: {track['genre']}
360
+ Listener mood: {mood}{rag_context_text}
361
 
362
  Make it:
363
  - About 10-15 seconds of speech, max_tokens = 400
 
365
  - Add a light joke or witty comment
366
  - Be enthusiastic and engaging
367
  - Sound natural, like a real radio host
368
+ - Connect it to the listener's mood if possible
369
+ - Reference their listening history if relevant (from context above)"""
370
+
371
+ if rag_context:
372
+ print(f" βœ… [RAG] LLM prompt includes RAG context - model will use personalized history")
373
 
374
  response = self.client.chat.completions.create(
375
  model=self.config.nebius_model,
 
378
  max_tokens=400 #400
379
  )
380
  text = response.choices[0].message.content.strip()
381
+
382
+ # Check if response references RAG context
383
+ rag_used = False
384
+ if rag_context:
385
+ # Check if response mentions anything from context
386
+ text_lower = text.lower()
387
+ for ctx in rag_context:
388
+ ctx_text = ctx.get('text', '').lower()
389
+ # Look for key terms from context in response
390
+ key_terms = ['listened', 'enjoyed', 'prefer', 'favorite', 'history', 'before', 'previous', 'similar']
391
+ if any(term in ctx_text and term in text_lower for term in key_terms):
392
+ rag_used = True
393
+ break
394
+
395
+ if rag_used:
396
+ print(f" βœ… [RAG] LLM response appears to reference RAG context!")
397
+ rag_logger.info(f"βœ… RAG CONTEXT USED in commentary for {track.get('title')}")
398
+ else:
399
+ print(f" ℹ️ [RAG] LLM response doesn't explicitly reference context (may still be influenced)")
400
+
401
  llm_logger.info(
402
+ "MUSIC | model=%s | rag_used=%s | prompt_length=%d | response=%r",
403
  self.config.nebius_model,
404
+ rag_used,
405
+ len(prompt),
406
  text[:1000],
407
  )
408
  return text
 
577
  else:
578
  return "evening"
579
 
580
+ def execute_segment(self, segment: Dict[str, Any], user_id: str = None) -> Dict[str, Any]:
581
  """Execute a planned segment and log to RAG"""
582
  self.current_segment = segment
583
  self.segment_history.append(segment)
584
 
585
+ # Use stored user_id if not provided
586
+ if not user_id:
587
+ user_id = self.current_user_id
588
+
589
+ # Store in RAG system with user_id
590
  self.rag_system.store_listening_history(
591
  item_type=segment['type'],
592
+ item_data=segment,
593
+ user_id=user_id
594
  )
595
 
596
  return segment
 
621
  elif seg_type == "music":
622
  # Lazy fetch track if not yet fetched
623
  if not segment.get("track"):
624
+ # Use RAG to get personalized recommendations
625
  genres = prefs.get('favorite_genres', ['pop'])
626
  mood = prefs.get('mood', 'happy')
627
+
628
+ # Use stored user_id if available
629
+ user_id = prefs.get('_user_id') or self.current_user_id
630
+
631
+ # Query RAG for music preferences based on listening history
632
+ rag_query = f"What music genres and styles does the user prefer? Favorite genres: {', '.join(genres)}, Mood: {mood}"
633
+ print(f"\n🎡 [RAG] Querying for genre recommendations for user {user_id}...")
634
+ rag_context = self.rag_system.query_user_context(rag_query, user_id=user_id, top_k=3)
635
+
636
+ original_genres = genres.copy()
637
+ # Extract genre suggestions from RAG if available
638
+ if rag_context:
639
+ print(f"🎡 [RAG] Analyzing {len(rag_context)} context documents for genre suggestions...")
640
+ for i, ctx in enumerate(rag_context):
641
+ ctx_text = ctx.get('text', '').lower()
642
+ score = ctx.get('score', 0)
643
+ # Look for genre mentions in context
644
+ for genre in ['pop', 'rock', 'jazz', 'classical', 'electronic', 'hip-hop', 'country', 'indie', 'rap', 'blues', 'folk', 'metal', 'reggae', 'soul', 'r&b']:
645
+ if genre in ctx_text and genre not in genres:
646
+ genres.append(genre)
647
+ print(f" βœ… [RAG] Suggested genre from context #{i+1} (score: {score:.4f}): {genre}")
648
+ else:
649
+ print(f" ℹ️ [RAG] No context found, using original genres: {original_genres}")
650
+
651
+ # Use RAG-enhanced genre selection
652
+ selected_genre = random.choice(genres) if genres else 'pop'
653
+ if selected_genre not in original_genres:
654
+ print(f"🎡 [RAG] Using RAG-suggested genre: {selected_genre} (was: {original_genres})")
655
+ else:
656
+ print(f"🎡 [RAG] Using genre: {selected_genre} (from preferences)")
657
+
658
  tracks = self.music_server.search_free_music(
659
+ genre=selected_genre,
660
  mood=mood,
661
  limit=1
662
  )
 
664
 
665
  track = segment.get("track")
666
  if track and not segment.get("commentary"):
667
+ # Use stored user_id if available
668
+ user_id = prefs.get('_user_id') or self.current_user_id
669
+
670
+ # Use RAG context for more personalized commentary
671
+ rag_query = f"User's music listening history and preferences for {track.get('title', 'this track')} by {track.get('artist', 'this artist')}"
672
+ print(f"\n🎀 [RAG] Querying for commentary context for user {user_id}: {track.get('title')} by {track.get('artist')}")
673
+ rag_context = self.rag_system.query_user_context(rag_query, user_id=user_id, top_k=2)
674
+
675
+ if rag_context:
676
+ print(f"🎀 [RAG] Using {len(rag_context)} context documents for personalized commentary")
677
+ for i, ctx in enumerate(rag_context):
678
+ score = ctx.get('score', 0)
679
+ preview = ctx.get('text', '')[:80].replace('\n', ' ')
680
+ print(f" πŸ“„ Context #{i+1} (score: {score:.4f}): {preview}...")
681
+ else:
682
+ print(f" ℹ️ [RAG] No context found, generating standard commentary")
683
+
684
+ segment["commentary"] = self._generate_music_commentary(track, prefs, rag_context=rag_context)
685
 
686
  elif seg_type == "news":
687
  # Lazy fetch news if not yet fetched
src/rag_system.py CHANGED
@@ -1,15 +1,29 @@
1
  """RAG System for User Preferences and History using LlamaIndex"""
2
  import json
3
  import os
 
4
  from typing import Dict, Any, List
5
  from datetime import datetime
6
  from llama_index.core import VectorStoreIndex, Document, Settings
7
  from llama_index.core.storage.storage_context import StorageContext
8
  from llama_index.core.vector_stores import SimpleVectorStore
 
9
  from llama_index.llms.openai import OpenAI as LlamaOpenAI
 
10
 
11
  # Get project root directory (parent of src/)
12
  PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  class RadioRAGSystem:
15
  """RAG system for storing and retrieving user preferences and listening history"""
@@ -32,19 +46,88 @@ class RadioRAGSystem:
32
  temperature=0.7
33
  )
34
  self.llm_available = True
 
35
  except Exception as e:
36
  print(f"Warning: Could not initialize Nebius/OpenAI LLM: {e}")
37
  print("RAG system will work in fallback mode without LLM features")
38
  self.llm_available = False
39
 
40
- # For embeddings, we'll use a simple approach or skip for now
41
- # Embeddings can be added later if needed
42
- self.embedding_available = False # Disabled for now - can add OpenAI embeddings later
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  # Initialize vector store
45
  self.vector_store = SimpleVectorStore()
46
  self.storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
47
 
 
 
 
 
 
 
 
48
  # Load existing index or create new one
49
  self.index = None
50
  self.documents = []
@@ -53,44 +136,188 @@ class RadioRAGSystem:
53
  self._load_user_data()
54
 
55
  def _load_user_data(self):
56
- """Load user data from file"""
57
  if os.path.exists(self.user_data_file):
58
  try:
59
  with open(self.user_data_file, 'r') as f:
60
  data = json.load(f)
61
- self.documents = [Document(text=json.dumps(d)) for d in data]
62
- if self.documents and self.embedding_available:
63
- try:
64
- self.index = VectorStoreIndex.from_documents(
65
- self.documents,
66
- storage_context=self.storage_context
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  )
68
- except Exception as e:
69
- print(f"Warning: Could not build vector index: {e}")
70
- self.index = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  except Exception as e:
 
72
  print(f"Error loading user data: {e}")
73
 
74
  def _save_user_data(self):
75
  """Save user data to file"""
76
  try:
77
- data = [json.loads(doc.text) for doc in self.documents]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  with open(self.user_data_file, 'w') as f:
79
  json.dump(data, f, indent=2)
 
80
  except Exception as e:
 
 
 
81
  print(f"Error saving user data: {e}")
82
 
83
- def store_user_preferences(self, preferences: Dict[str, Any]):
84
- """Store user preferences in RAG system"""
 
 
 
 
85
  pref_doc = {
86
  "type": "preferences",
 
87
  "timestamp": datetime.now().isoformat(),
88
  "data": preferences
89
  }
90
 
91
- doc = Document(text=json.dumps(pref_doc))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  self.documents.append(doc)
93
 
 
 
 
94
  # Rebuild index if embeddings are available
95
  if self.embedding_available:
96
  try:
@@ -98,68 +325,258 @@ class RadioRAGSystem:
98
  self.documents,
99
  storage_context=self.storage_context
100
  )
 
 
101
  except Exception as e:
 
102
  print(f"Warning: Could not rebuild index: {e}")
103
  self.index = None
 
 
 
104
 
105
  self._save_user_data()
106
 
107
- def store_listening_history(self, item_type: str, item_data: Dict[str, Any], user_feedback: str = None):
108
- """Store listening history with optional feedback"""
 
 
 
 
109
  history_doc = {
110
  "type": "history",
 
111
  "item_type": item_type, # music, news, podcast, story
112
  "timestamp": datetime.now().isoformat(),
113
  "data": item_data,
114
  "feedback": user_feedback
115
  }
116
 
117
- doc = Document(text=json.dumps(history_doc))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  self.documents.append(doc)
119
 
120
- # Rebuild index if embeddings are available
121
- if self.embedding_available:
 
122
  try:
123
  self.index = VectorStoreIndex.from_documents(
124
  self.documents,
125
  storage_context=self.storage_context
126
  )
 
 
 
127
  except Exception as e:
 
128
  print(f"Warning: Could not rebuild index: {e}")
129
  self.index = None
130
 
131
  self._save_user_data()
132
 
133
- def get_user_preferences(self) -> Dict[str, Any]:
134
- """Retrieve latest user preferences"""
135
  preferences = {}
136
  for doc in reversed(self.documents):
137
  try:
138
- data = json.loads(doc.text)
139
- if data.get("type") == "preferences":
140
- preferences = data.get("data", {})
141
- break
142
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  continue
144
 
 
 
 
145
  return preferences
146
 
147
- def get_recommendations(self, query: str) -> Dict[str, Any]:
148
- """Get personalized recommendations based on user history and preferences"""
149
  if not self.index or not self.llm_available:
150
- return self._get_default_recommendations()
 
 
151
 
152
  try:
153
- query_engine = self.index.as_query_engine()
 
 
 
 
 
 
 
154
  response = query_engine.query(query)
155
 
156
- return {
157
- "recommendations": str(response),
158
- "source": "RAG"
 
 
 
 
 
 
159
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  except Exception as e:
161
- print(f"Error getting recommendations: {e}")
162
- return self._get_default_recommendations()
 
 
 
163
 
164
  def _get_default_recommendations(self) -> Dict[str, Any]:
165
  """Return default recommendations when RAG is not available"""
 
1
  """RAG System for User Preferences and History using LlamaIndex"""
2
  import json
3
  import os
4
+ import logging
5
  from typing import Dict, Any, List
6
  from datetime import datetime
7
  from llama_index.core import VectorStoreIndex, Document, Settings
8
  from llama_index.core.storage.storage_context import StorageContext
9
  from llama_index.core.vector_stores import SimpleVectorStore
10
+ from llama_index.core.node_parser import SimpleNodeParser
11
  from llama_index.llms.openai import OpenAI as LlamaOpenAI
12
+ from llama_index.embeddings.openai import OpenAIEmbedding
13
 
14
  # Get project root directory (parent of src/)
15
  PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
16
+ LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
17
+ os.makedirs(LOG_DIR, exist_ok=True)
18
+
19
+ # Setup RAG logger
20
+ rag_logger = logging.getLogger("ai_radio.rag")
21
+ if not rag_logger.handlers:
22
+ rag_logger.setLevel(logging.INFO)
23
+ _fh = logging.FileHandler(os.path.join(LOG_DIR, "rag_system.log"), encoding="utf-8")
24
+ _fmt = logging.Formatter("%(asctime)s [RAG] %(levelname)s: %(message)s")
25
+ _fh.setFormatter(_fmt)
26
+ rag_logger.addHandler(_fh)
27
 
28
  class RadioRAGSystem:
29
  """RAG system for storing and retrieving user preferences and listening history"""
 
46
  temperature=0.7
47
  )
48
  self.llm_available = True
49
+ print("βœ… RAG LLM initialized (Nebius GPT-OSS-120B)")
50
  except Exception as e:
51
  print(f"Warning: Could not initialize Nebius/OpenAI LLM: {e}")
52
  print("RAG system will work in fallback mode without LLM features")
53
  self.llm_available = False
54
 
55
+ # Enable embeddings - try local model first, then OpenAI
56
+ # Note: Nebius doesn't support embeddings, so we use local or OpenAI
57
+ self.embedding_available = False
58
+ try:
59
+ # First, try local sentence-transformers (no API key needed)
60
+ try:
61
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
62
+ Settings.embed_model = HuggingFaceEmbedding(
63
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
64
+ )
65
+ self.embedding_available = True
66
+ print("βœ… RAG Embeddings enabled (local sentence-transformers/all-MiniLM-L6-v2)")
67
+ rag_logger.info("βœ… Using local HuggingFace embeddings: sentence-transformers/all-MiniLM-L6-v2")
68
+ except ImportError:
69
+ # Fallback: Try OpenAI embeddings endpoint (requires OpenAI API key)
70
+ openai_key = os.environ.get("OPENAI_API_KEY")
71
+ if openai_key:
72
+ try:
73
+ Settings.embed_model = OpenAIEmbedding(
74
+ api_key=openai_key,
75
+ api_base="https://api.openai.com/v1",
76
+ model="text-embedding-3-small"
77
+ )
78
+ self.embedding_available = True
79
+ print("βœ… RAG Embeddings enabled (OpenAI text-embedding-3-small via OPENAI_API_KEY)")
80
+ rag_logger.info("βœ… Using OpenAI embeddings: text-embedding-3-small")
81
+ except Exception as e:
82
+ print(f"⚠️ OpenAI embeddings failed: {e}")
83
+ print("ℹ️ Embeddings disabled. RAG will use fallback mode.")
84
+ rag_logger.warning(f"⚠️ OpenAI embeddings failed: {e}")
85
+ self.embedding_available = False
86
+ else:
87
+ print("ℹ️ Embeddings disabled: No local model or OPENAI_API_KEY found.")
88
+ print(" Install: pip install sentence-transformers")
89
+ print(" Or set OPENAI_API_KEY environment variable.")
90
+ rag_logger.warning("⚠️ No embeddings available - install sentence-transformers or set OPENAI_API_KEY")
91
+ self.embedding_available = False
92
+ except Exception as e:
93
+ print(f"⚠️ Local embeddings failed: {e}")
94
+ # Try OpenAI as last resort
95
+ openai_key = os.environ.get("OPENAI_API_KEY")
96
+ if openai_key:
97
+ try:
98
+ Settings.embed_model = OpenAIEmbedding(
99
+ api_key=openai_key,
100
+ api_base="https://api.openai.com/v1",
101
+ model="text-embedding-ada-002" # Try older model
102
+ )
103
+ self.embedding_available = True
104
+ print("βœ… RAG Embeddings enabled (OpenAI text-embedding-ada-002)")
105
+ rag_logger.info("βœ… Using OpenAI embeddings: text-embedding-ada-002")
106
+ except:
107
+ print("ℹ️ All embedding options failed. RAG will use fallback mode.")
108
+ rag_logger.warning("⚠️ All embedding options failed")
109
+ self.embedding_available = False
110
+ else:
111
+ print("ℹ️ Embeddings disabled. RAG will use fallback mode.")
112
+ rag_logger.warning("⚠️ Embeddings disabled - no fallback available")
113
+ self.embedding_available = False
114
+ except Exception as e:
115
+ print(f"Warning: Could not initialize embeddings: {e}")
116
+ print("RAG will work without vector search (fallback mode)")
117
+ rag_logger.error(f"❌ Embedding initialization error: {e}")
118
+ self.embedding_available = False
119
 
120
  # Initialize vector store
121
  self.vector_store = SimpleVectorStore()
122
  self.storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
123
 
124
+ # Configure chunk size to handle larger metadata
125
+ # Increase chunk size to 4096 to accommodate metadata (metadata can be up to 1438 chars)
126
+ Settings.chunk_size = 4096
127
+ Settings.chunk_overlap = 400
128
+ rag_logger.info(f"πŸ“ RAG chunk size set to {Settings.chunk_size} (overlap: {Settings.chunk_overlap})")
129
+ print(f"πŸ“ [RAG] Chunk size: {Settings.chunk_size}, Overlap: {Settings.chunk_overlap}")
130
+
131
  # Load existing index or create new one
132
  self.index = None
133
  self.documents = []
 
136
  self._load_user_data()
137
 
138
def _load_user_data(self):
    """Load persisted user data from disk and (re)build the vector index.

    Each JSON record is converted into a llama-index ``Document`` whose text
    is a human-readable summary (this embeds/retrieves better than raw JSON).

    BUGFIX: the original record is now also kept in ``metadata["raw_data"]``.
    ``_save_user_data`` serializes documents from ``raw_data`` (or from JSON
    document text); without it, every document loaded here was silently
    dropped on the next save, losing all prior preferences/history.
    ``Settings.chunk_size`` is already raised to 4096 in ``__init__`` to
    accommodate the larger metadata.
    """
    if not os.path.exists(self.user_data_file):
        return
    try:
        with open(self.user_data_file, 'r') as f:
            data = json.load(f)

        self.documents = []
        for d in data:
            try:
                if d.get("type") == "preferences":
                    # Descriptive summary text for RAG retrieval.
                    prefs = d.get("data", {})
                    pref_text = f"""User Preferences:
Name: {prefs.get('name', 'Unknown')}
Favorite Genres: {', '.join(prefs.get('favorite_genres', []))}
Interests: {', '.join(prefs.get('interests', []))}
Podcast Interests: {', '.join(prefs.get('podcast_interests', []))}
Mood: {prefs.get('mood', 'neutral')}
Content Filter: Music={prefs.get('content_filter', {}).get('music', True)}, News={prefs.get('content_filter', {}).get('news', True)}, Podcasts={prefs.get('content_filter', {}).get('podcasts', True)}, Stories={prefs.get('content_filter', {}).get('stories', True)}
"""
                    doc = Document(
                        text=pref_text,
                        metadata={
                            "type": "preferences",
                            "user_id": d.get("user_id"),
                            "timestamp": d.get("timestamp", datetime.now().isoformat()),
                            # Keep the original record so _save_user_data can round-trip it.
                            "raw_data": json.dumps(d)
                        }
                    )
                elif d.get("type") == "history":
                    item_type = d.get("item_type", "")
                    item_data = d.get("data", {})
                    user_feedback = d.get("feedback")

                    if item_type == "music":
                        track = item_data.get("track", {})
                        history_text = f"""Music Listening History:
Title: {track.get('title', 'Unknown')}
Artist: {track.get('artist', 'Unknown')}
Genre: {track.get('genre', 'Unknown')}
Source: {track.get('source', 'Unknown')}
Feedback: {user_feedback or 'No feedback'}
"""
                    elif item_type == "news":
                        history_text = f"""News Listening History:
Items: {len(item_data.get('news_items', []))} news items
Topics: {', '.join([item.get('category', '') for item in item_data.get('news_items', [])[:3]])}
Feedback: {user_feedback or 'No feedback'}
"""
                    elif item_type == "podcast":
                        podcast = item_data.get("podcast", {})
                        history_text = f"""Podcast Listening History:
Title: {podcast.get('title', 'Unknown')}
Host: {podcast.get('host', 'Unknown')}
Category: {podcast.get('category', 'Unknown')}
Feedback: {user_feedback or 'No feedback'}
"""
                    else:
                        history_text = f"""Story Listening History:
Type: {item_type}
Feedback: {user_feedback or 'No feedback'}
"""

                    doc = Document(
                        text=history_text,
                        metadata={
                            "type": "history",
                            "user_id": d.get("user_id"),
                            "item_type": item_type,
                            "timestamp": d.get("timestamp", datetime.now().isoformat()),
                            "feedback": user_feedback or "",
                            # Keep the original record so _save_user_data can round-trip it.
                            "raw_data": json.dumps(d)
                        }
                    )
                else:
                    # Unknown type: store raw JSON as text, which
                    # _save_user_data can parse back directly.
                    doc = Document(
                        text=json.dumps(d),
                        metadata={
                            "type": d.get("type", "unknown"),
                            "user_id": d.get("user_id"),
                            "timestamp": d.get("timestamp", datetime.now().isoformat())
                        }
                    )

                self.documents.append(doc)
            except Exception as e:
                rag_logger.warning(f"⚠️ Skipping invalid document during load: {e}")
                continue

        rag_logger.info(f"πŸ“‚ Loaded {len(self.documents)} documents from RAG storage")
        print(f"πŸ“‚ [RAG] Loaded {len(self.documents)} documents from storage")

        # Build the vector index only when an embedding model is configured.
        if self.documents and self.embedding_available:
            try:
                rag_logger.info(f"πŸ”¨ Building vector index from {len(self.documents)} documents...")
                print(f"πŸ”¨ [RAG] Building vector index from {len(self.documents)} documents...")
                self.index = VectorStoreIndex.from_documents(
                    self.documents,
                    storage_context=self.storage_context
                )
                rag_logger.info(f"βœ… Vector index built successfully with {len(self.documents)} documents")
                print(f"βœ… [RAG] Vector index built with {len(self.documents)} documents")
            except Exception as e:
                rag_logger.error(f"❌ Failed to build vector index: {e}")
                print(f"Warning: Could not build vector index: {e}")
                self.index = None
        elif self.documents:
            rag_logger.info(f"ℹ️ {len(self.documents)} documents loaded but embeddings disabled - using fallback mode")
            print(f"ℹ️ [RAG] {len(self.documents)} documents loaded but embeddings disabled - using fallback mode")
    except Exception as e:
        rag_logger.error(f"❌ Error loading user data: {e}")
        print(f"Error loading user data: {e}")
256
 
257
def _save_user_data(self):
    """Persist the in-memory documents back to the JSON data file."""
    try:
        records = []
        for entry in self.documents:
            # Prefer the serialized record kept in metadata (new format);
            # otherwise assume the document text itself is JSON (old format).
            try:
                serialized = entry.metadata.get("raw_data")
                payload = json.loads(serialized) if serialized else json.loads(entry.text)
            except (json.JSONDecodeError, KeyError) as e:
                # Documents without a parseable record are skipped, not fatal.
                rag_logger.warning(f"⚠️ Skipping document that couldn't be parsed: {e}")
                continue
            records.append(payload)

        with open(self.user_data_file, 'w') as f:
            json.dump(records, f, indent=2)
        rag_logger.info(f"πŸ’Ύ Saved {len(records)} documents to {self.user_data_file}")
    except Exception as e:
        rag_logger.error(f"❌ Error saving user data: {e}")
        import traceback
        rag_logger.error(traceback.format_exc())
        print(f"Error saving user data: {e}")
283
 
284
def store_user_preferences(self, preferences: Dict[str, Any], user_id: str = None):
    """Record a user's preferences as a retrievable document.

    The preferences are summarized into descriptive text for embedding, the
    original record is kept in metadata["raw_data"] for persistence, and the
    vector index is rebuilt immediately when embeddings are available.
    """
    if not user_id:
        rag_logger.warning("⚠️ Storing preferences without user_id - data will not be user-specific")
        print("⚠️ [RAG] Warning: Storing preferences without user_id")

    # Canonical record that _save_user_data() writes to disk.
    pref_record = {
        "type": "preferences",
        "user_id": user_id,
        "timestamp": datetime.now().isoformat(),
        "data": preferences
    }

    # Human-readable summary embeds/retrieves better than raw JSON.
    content_filter = preferences.get('content_filter', {})
    pref_text = f"""User Preferences:
Name: {preferences.get('name', 'Unknown')}
Favorite Genres: {', '.join(preferences.get('favorite_genres', []))}
Interests: {', '.join(preferences.get('interests', []))}
Podcast Interests: {', '.join(preferences.get('podcast_interests', []))}
Mood: {preferences.get('mood', 'neutral')}
Content Filter: Music={content_filter.get('music', True)}, News={content_filter.get('news', True)}, Podcasts={content_filter.get('podcasts', True)}, Stories={content_filter.get('stories', True)}
"""

    self.documents.append(Document(
        text=pref_text,
        metadata={
            "type": "preferences",
            "user_id": user_id,
            "timestamp": datetime.now().isoformat(),
            "raw_data": json.dumps(pref_record)
        }
    ))

    rag_logger.info(f"πŸ“ STORING PREFERENCES: user_id={user_id}, Name={preferences.get('name')}, Genres={preferences.get('favorite_genres')}, Mood={preferences.get('mood')}")
    print(f"πŸ“ [RAG] Storing preferences for user {user_id} ({preferences.get('name', 'user')})")

    if not self.embedding_available:
        rag_logger.info(f"ℹ️ Preferences stored (embeddings disabled, {len(self.documents)} total documents)")
        print(f"ℹ️ [RAG] Preferences stored (embeddings disabled)")
    else:
        # Rebuild the whole index so the new document is searchable at once.
        try:
            self.index = VectorStoreIndex.from_documents(
                self.documents,
                storage_context=self.storage_context
            )
            rag_logger.info(f"βœ… Vector index rebuilt with {len(self.documents)} documents (embeddings enabled)")
            print(f"βœ… [RAG] Index updated with {len(self.documents)} documents")
        except Exception as e:
            rag_logger.error(f"❌ Failed to rebuild index: {e}")
            print(f"Warning: Could not rebuild index: {e}")
            self.index = None

    self._save_user_data()
339
 
340
def store_listening_history(self, item_type: str, item_data: Dict[str, Any], user_id: str = None, user_feedback: str = None):
    """Record one listened item (music/news/podcast/story) with optional feedback.

    A descriptive summary becomes the document text for retrieval; the raw
    record is kept in metadata["raw_data"] for persistence.  The vector index
    is rebuilt lazily (when missing, or on every 5th document) to avoid
    paying the full rebuild cost per item.
    """
    if not user_id:
        rag_logger.warning(f"⚠️ Storing {item_type} history without user_id - data will not be user-specific")
        print(f"⚠️ [RAG] Warning: Storing {item_type} history without user_id")

    history_record = {
        "type": "history",
        "user_id": user_id,
        "item_type": item_type,  # music, news, podcast, story
        "timestamp": datetime.now().isoformat(),
        "data": item_data,
        "feedback": user_feedback
    }

    feedback_label = user_feedback or 'None'
    if item_type == "music":
        track = item_data.get("track", {})
        history_text = f"""Music Listening History:
Title: {track.get('title', 'Unknown')}
Artist: {track.get('artist', 'Unknown')}
Genre: {track.get('genre', 'Unknown')}
Source: {track.get('source', 'Unknown')}
Feedback: {user_feedback or 'No feedback'}
"""
        rag_logger.info(f"🎡 STORING MUSIC HISTORY: user_id={user_id}, {track.get('title', 'Unknown')} by {track.get('artist', 'Unknown')} ({track.get('genre', 'Unknown')}) - Feedback: {feedback_label}")
        print(f"🎡 [RAG] Storing music for user {user_id}: {track.get('title', 'Unknown')} by {track.get('artist', 'Unknown')}")
    elif item_type == "news":
        news_items = item_data.get('news_items', [])
        history_text = f"""News Listening History:
Items: {len(news_items)} news items
Topics: {', '.join([item.get('category', '') for item in news_items[:3]])}
Feedback: {user_feedback or 'No feedback'}
"""
        rag_logger.info(f"πŸ“° STORING NEWS HISTORY: user_id={user_id}, {len(news_items)} items - Feedback: {feedback_label}")
        print(f"πŸ“° [RAG] Storing news history for user {user_id}: {len(news_items)} items")
    elif item_type == "podcast":
        podcast = item_data.get("podcast", {})
        history_text = f"""Podcast Listening History:
Title: {podcast.get('title', 'Unknown')}
Host: {podcast.get('host', 'Unknown')}
Category: {podcast.get('category', 'Unknown')}
Feedback: {user_feedback or 'No feedback'}
"""
        rag_logger.info(f"πŸŽ™οΈ STORING PODCAST HISTORY: user_id={user_id}, {podcast.get('title', 'Unknown')} - Feedback: {feedback_label}")
        print(f"πŸŽ™οΈ [RAG] Storing podcast for user {user_id}: {podcast.get('title', 'Unknown')}")
    else:
        history_text = f"""Story Listening History:
Type: {item_type}
Feedback: {user_feedback or 'No feedback'}
"""
        rag_logger.info(f"πŸ“– STORING STORY HISTORY: user_id={user_id}, {item_type} - Feedback: {feedback_label}")
        print(f"πŸ“– [RAG] Storing story history for user {user_id}: {item_type}")

    self.documents.append(Document(
        text=history_text,
        metadata={
            "type": "history",
            "user_id": user_id,
            "item_type": item_type,
            "timestamp": datetime.now().isoformat(),
            "raw_data": json.dumps(history_record)
        }
    ))

    # Periodic rebuild: missing index, or every 5 documents.
    needs_rebuild = not self.index or len(self.documents) % 5 == 0
    if self.embedding_available and needs_rebuild:
        try:
            self.index = VectorStoreIndex.from_documents(
                self.documents,
                storage_context=self.storage_context
            )
            if len(self.documents) % 5 == 0:
                rag_logger.info(f"βœ… Vector index rebuilt (total documents: {len(self.documents)})")
                print(f"βœ… [RAG] Index updated (total documents: {len(self.documents)})")
        except Exception as e:
            rag_logger.error(f"❌ Failed to rebuild index: {e}")
            print(f"Warning: Could not rebuild index: {e}")
            self.index = None

    self._save_user_data()
422
 
423
def get_user_preferences(self, user_id: str = None) -> Dict[str, Any]:
    """Return the most recently stored preferences, optionally for one user.

    Walks the document list newest-first; the first "preferences" record
    (matching user_id when given) wins.  Returns {} when none is found.
    """
    found: Dict[str, Any] = {}
    for document in reversed(self.documents):
        try:
            owner = document.metadata.get("user_id")
            if user_id and owner != user_id:
                continue  # belongs to a different user

            # New-format docs keep the original record in metadata["raw_data"];
            # old-format docs stored raw JSON directly as the document text.
            serialized = document.metadata.get("raw_data")
            record = json.loads(serialized if serialized else document.text)
            if record.get("type") != "preferences":
                continue
            if not user_id or record.get("user_id") == user_id:
                found = record.get("data", {})
                break
        except (json.JSONDecodeError, KeyError, AttributeError) as e:
            rag_logger.debug(f"Skipping document in get_user_preferences: {e}")
            continue

    if user_id and not found:
        rag_logger.warning(f"⚠️ No preferences found for user_id={user_id}")

    return found
456
 
457
def get_recommendations(self, query: str, user_id: str = None) -> Dict[str, Any]:
    """Get personalized recommendations from user history/preferences via RAG.

    Returns a dict with the LLM response text under "recommendations" plus
    "source" and "query", and any genre names spotted in the response text
    under "suggested_genres".  Falls back to default recommendations when
    the index/LLM is unavailable or the query fails.
    """
    if not self.index or not self.llm_available:
        rag_logger.warning(f"⚠️ RAG RECOMMENDATIONS UNAVAILABLE: user_id={user_id}, query='{query}' (no index or LLM)")
        print("ℹ️ RAG recommendations unavailable (no index or LLM) - using defaults")
        # BUGFIX: _get_default_recommendations() takes no user_id parameter,
        # so passing user_id=... raised TypeError on every fallback.
        return self._get_default_recommendations()

    try:
        rag_logger.info(f"πŸ” RAG RECOMMENDATIONS QUERY: '{query}'")
        print(f"πŸ” [RAG] Getting recommendations for: '{query[:60]}...'")

        # Query user history and preferences through the vector index.
        query_engine = self.index.as_query_engine(
            similarity_top_k=5,  # top 5 relevant documents
            response_mode="compact"  # compact response synthesis
        )
        response = query_engine.query(query)

        response_text = str(response)
        rag_logger.info(f"βœ… RAG RECOMMENDATIONS RESPONSE: {response_text[:200]}...")
        print(f"βœ… [RAG] LLM generated recommendations: {response_text[:150]}...")

        recommendations = {
            "recommendations": response_text,
            "source": "RAG",
            "query": query
        }

        # Best-effort extraction of structured genre hints from free text.
        response_lower = response_text.lower()
        if "genre" in response_lower or "music" in response_lower:
            for genre in ["pop", "rock", "jazz", "classical", "electronic", "hip-hop", "country", "indie", "rap", "blues", "folk"]:
                if genre in response_lower:
                    recommendations.setdefault("suggested_genres", []).append(genre)
                    rag_logger.info(f" 🎡 Extracted genre from RAG: {genre}")

        return recommendations
    except Exception as e:
        rag_logger.error(f"❌ RAG RECOMMENDATIONS ERROR: {e}")
        import traceback
        rag_logger.error(traceback.format_exc())
        print(f"Error getting RAG recommendations: {e}")
        traceback.print_exc()
        # BUGFIX: same TypeError as above - previously the bad kwarg raised
        # *inside* this except block and escaped unhandled.
        return self._get_default_recommendations()
503
+
504
def query_user_context(self, query: str, user_id: str = None, top_k: int = 3) -> List[Dict[str, Any]]:
    """Vector-search user context, returning documents filtered by user_id.

    Retrieves up to 3x top_k candidates when filtering by user (so enough
    survive the filter), then keeps at most top_k results.  Each result dict
    has "text", "score", "metadata" and - when metadata["raw_data"] parses -
    a "data" key with the original record.  Returns [] on any failure.

    BUGFIX: the log lines used `{score:.4f if score else 'N/A'}` inside
    f-strings.  That whole suffix is treated as a (invalid) format
    specifier, so formatting raised ValueError for every retrieved node,
    the outer except fired, and every query silently returned [].  The
    score is now formatted up front.
    """
    if not self.index or not self.embedding_available:
        rag_logger.warning(f"⚠️ RAG QUERY SKIPPED (no index/embeddings): '{query}'")
        print(f"⚠️ [RAG] Query skipped - embeddings not available")
        return []

    try:
        rag_logger.info(f"πŸ” RAG QUERY: user_id={user_id}, query='{query}' (top_k={top_k})")
        print(f"πŸ” [RAG] Querying for user {user_id}: '{query[:60]}...'")

        # Over-retrieve when filtering by user so top_k results survive.
        retrieve_count = top_k * 3 if user_id else top_k
        retriever = self.index.as_retriever(similarity_top_k=retrieve_count)
        nodes = retriever.retrieve(query)

        results = []
        for node in nodes:
            node_user_id = node.metadata.get("user_id")
            if user_id and node_user_id != user_id:
                continue  # skip documents from other users

            score = node.score if hasattr(node, 'score') else None
            # Pre-format once; a missing score renders as "N/A".
            score_str = f"{score:.4f}" if score is not None else "N/A"
            try:
                node_type = node.metadata.get("type", "unknown")
                item_type = node.metadata.get("item_type", "")
                data = json.loads(node.metadata.get("raw_data", "{}"))
                results.append({
                    "text": node.text,
                    "score": score,
                    "metadata": node.metadata,
                    "data": data
                })
                preview = node.text[:100].replace('\n', ' ')
                rag_logger.info(f" πŸ“„ Retrieved #{len(results)}: user_id={node_user_id}, type={node_type}, item_type={item_type}, score={score_str}, preview='{preview}...'")
                print(f" πŸ“„ [RAG] Retrieved #{len(results)}: {node_type} (user: {node_user_id}, score: {score_str}) - {preview}...")
            except Exception as parse_error:
                # raw_data unparseable: keep the node, just without "data".
                results.append({
                    "text": node.text,
                    "score": score,
                    "metadata": node.metadata
                })
                rag_logger.warning(f" ⚠️ Retrieved #{len(results)} (parse error): user_id={node_user_id}, score={score}, text_preview='{node.text[:50]}...'")
                print(f" ⚠️ [RAG] Retrieved #{len(results)} (parse error, user: {node_user_id}, score: {score_str})")

            # Stop once we have enough results for this user.
            if len(results) >= top_k:
                break

        rag_logger.info(f"βœ… RAG QUERY COMPLETE: Retrieved {len(results)} documents for user_id={user_id}")
        print(f"βœ… [RAG] Query complete: {len(results)} documents retrieved for user {user_id}")

        return results
    except Exception as e:
        rag_logger.error(f"❌ RAG QUERY ERROR: {e}")
        import traceback
        rag_logger.error(traceback.format_exc())
        print(f"❌ [RAG] Query error: {e}")
        return []
580
 
581
  def _get_default_recommendations(self) -> Dict[str, Any]:
582
  """Return default recommendations when RAG is not available"""