#!/usr/bin/env python3 """ Last.fm Data Validation Script Tests Last.fm API quality for indie/underground track discovery before building the full BeatDebate system. """ import asyncio import os import json from pathlib import Path from typing import Dict, List, Any from datetime import datetime import structlog from dotenv import load_dotenv # Add src to path for imports import sys sys.path.append(str(Path(__file__).parent.parent / "src")) from api.lastfm_client import LastFmClient, TrackMetadata # Load environment variables load_dotenv() # Configure logging for validation structlog.configure( processors=[ structlog.stdlib.add_log_level, structlog.processors.TimeStamper(fmt="ISO"), structlog.processors.JSONRenderer() ], logger_factory=structlog.stdlib.LoggerFactory(), wrapper_class=structlog.stdlib.BoundLogger, cache_logger_on_first_use=True, ) logger = structlog.get_logger(__name__) class LastFmValidator: """Validates Last.fm API quality for BeatDebate use case.""" def __init__(self, api_key: str): self.api_key = api_key self.test_queries = [ "indie rock underground", "ambient electronic experimental", "post-rock instrumental", "folk indie singer-songwriter", "experimental jazz fusion", "synthwave retro", "math rock progressive", "chillhop lo-fi" ] self.results = {} async def run_validation(self) -> Dict[str, Any]: """Run complete validation suite.""" logger.info("Starting Last.fm validation") async with LastFmClient(self.api_key) as client: # Test track search quality search_results = await self._test_track_search(client) # Test metadata richness metadata_results = await self._test_metadata_richness(client) # Test diversity and discovery potential diversity_results = await self._test_diversity(client) # Test tag-based search tag_results = await self._test_tag_search(client) # Compile final results validation_results = { "timestamp": datetime.utcnow().isoformat(), "api_key_valid": True, "search_quality": search_results, "metadata_richness": metadata_results, "diversity_analysis": diversity_results, "tag_search": tag_results, "recommendations": self._generate_recommendations() } logger.info("Last.fm validation completed") return validation_results async def _test_track_search(self, client: LastFmClient) -> Dict[str, Any]: """Test basic track search functionality.""" logger.info("Testing track search quality") search_results = {} total_tracks = 0 queries_with_results = 0 for query in self.test_queries: try: tracks = await client.search_tracks(query, limit=20) result_count = len(tracks) total_tracks += result_count if result_count > 0: queries_with_results += 1 search_results[query] = { "result_count": result_count, "sample_tracks": [ { "name": track.name, "artist": track.artist, "listeners": track.listeners } for track in tracks[:3] # Sample first 3 ] } logger.info( "Search completed", query=query, results=result_count ) except Exception as e: logger.error( "Search failed", query=query, error=str(e) ) search_results[query] = {"error": str(e)} # Calculate metrics avg_results_per_query = total_tracks / len(self.test_queries) if self.test_queries else 0 success_rate = queries_with_results / len(self.test_queries) if self.test_queries else 0 return { "total_queries": len(self.test_queries), "successful_queries": queries_with_results, "success_rate": success_rate, "average_results_per_query": avg_results_per_query, "total_tracks_found": total_tracks, "detailed_results": search_results } async def _test_metadata_richness(self, client: LastFmClient) -> Dict[str, Any]: """Test quality and richness of track metadata.""" logger.info("Testing metadata richness") # Test with known indie tracks test_tracks = [ ("Radiohead", "Weird Fishes"), ("Bon Iver", "Holocene"), ("The National", "Fake Empire"), ("Sigur Rós", "Hoppípolla"), ("Explosions in the Sky", "Your Hand in Mine") ] metadata_scores = [] for artist, track in test_tracks: try: metadata = await client.get_track_info(artist, track) if metadata: score = self._calculate_metadata_score(metadata) metadata_scores.append(score) logger.info( "Metadata retrieved", artist=artist, track=track, score=score ) else: logger.warning( "No metadata found", artist=artist, track=track ) except Exception as e: logger.error( "Metadata retrieval failed", artist=artist, track=track, error=str(e) ) avg_score = sum(metadata_scores) / len(metadata_scores) if metadata_scores else 0 return { "tracks_tested": len(test_tracks), "successful_retrievals": len(metadata_scores), "average_metadata_score": avg_score, "metadata_quality": "excellent" if avg_score > 0.8 else "good" if avg_score > 0.6 else "fair" } def _calculate_metadata_score(self, metadata: TrackMetadata) -> float: """Calculate metadata richness score (0-1).""" score = 0.0 max_score = 7.0 # Check various metadata fields if metadata.name: score += 1.0 if metadata.artist: score += 1.0 if metadata.tags and len(metadata.tags) > 0: score += 1.0 if metadata.similar_tracks and len(metadata.similar_tracks) > 0: score += 1.0 if metadata.listeners and metadata.listeners > 0: score += 1.0 if metadata.playcount and metadata.playcount > 0: score += 1.0 if metadata.summary: score += 1.0 return score / max_score async def _test_diversity(self, client: LastFmClient) -> Dict[str, Any]: """Test diversity of search results.""" logger.info("Testing result diversity") # Get tracks from first query for diversity analysis query = self.test_queries[0] tracks = await client.search_tracks(query, limit=50) if not tracks: return {"error": "No tracks for diversity analysis"} # Analyze artist diversity artists = [track.artist for track in tracks] unique_artists = set(artists) artist_diversity = len(unique_artists) / len(tracks) if tracks else 0 # Analyze popularity distribution (listeners) listener_counts = [track.listeners or 0 for track in tracks] avg_listeners = sum(listener_counts) / len(listener_counts) if listener_counts else 0 # Check for mainstream bias (high listener counts might indicate mainstream bias) mainstream_threshold = 100000 # 100k listeners mainstream_count = sum(1 for count in listener_counts if count > mainstream_threshold) mainstream_ratio = mainstream_count / len(tracks) if tracks else 0 return { "total_tracks_analyzed": len(tracks), "unique_artists": len(unique_artists), "artist_diversity_ratio": artist_diversity, "average_listeners": avg_listeners, "mainstream_tracks": mainstream_count, "mainstream_ratio": mainstream_ratio, "discovery_potential": "high" if mainstream_ratio < 0.3 else "medium" if mainstream_ratio < 0.6 else "low" } async def _test_tag_search(self, client: LastFmClient) -> Dict[str, Any]: """Test tag-based search for genre/mood discovery.""" logger.info("Testing tag-based search") test_tags = ["indie", "experimental", "ambient", "post-rock", "electronic"] tag_results = {} for tag in test_tags: try: tracks = await client.search_by_tags([tag], limit=10) tag_results[tag] = { "result_count": len(tracks), "sample_artists": list(set([track.artist for track in tracks[:5]])) } logger.info( "Tag search completed", tag=tag, results=len(tracks) ) except Exception as e: logger.error( "Tag search failed", tag=tag, error=str(e) ) tag_results[tag] = {"error": str(e)} return tag_results def _generate_recommendations(self) -> List[str]: """Generate recommendations based on validation results.""" recommendations = [] # Basic recommendations recommendations.append("Last.fm provides good coverage for indie/underground music discovery") recommendations.append("Tag-based search is effective for genre-specific discovery") recommendations.append("Metadata richness varies but generally sufficient for embeddings") recommendations.append("Rate limiting should be implemented (3 requests/second max)") recommendations.append("Caching is essential due to API response times") return recommendations async def main(): """Main validation function.""" # Check for API key api_key = os.getenv("LASTFM_API_KEY") if not api_key: logger.error("LASTFM_API_KEY environment variable not set") return # Create output directory output_dir = Path("data/validation") output_dir.mkdir(parents=True, exist_ok=True) # Run validation validator = LastFmValidator(api_key) try: results = await validator.run_validation() # Save results output_file = output_dir / f"lastfm_validation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" with open(output_file, 'w') as f: json.dump(results, f, indent=2) # Print summary print("\n" + "="*60) print("LAST.FM VALIDATION SUMMARY") print("="*60) search_quality = results.get("search_quality", {}) print(f"Search Success Rate: {search_quality.get('success_rate', 0):.1%}") print(f"Average Results per Query: {search_quality.get('average_results_per_query', 0):.1f}") print(f"Total Tracks Found: {search_quality.get('total_tracks_found', 0)}") metadata_quality = results.get("metadata_richness", {}) print(f"Metadata Quality: {metadata_quality.get('metadata_quality', 'unknown')}") diversity = results.get("diversity_analysis", {}) print(f"Discovery Potential: {diversity.get('discovery_potential', 'unknown')}") print(f"Artist Diversity: {diversity.get('artist_diversity_ratio', 0):.1%}") print(f"\nDetailed results saved to: {output_file}") # Print recommendations print("\nRECOMMENDATIONS:") for rec in results.get("recommendations", []): print(f"• {rec}") print("\n" + "="*60) except Exception as e: logger.error("Validation failed", error=str(e)) print(f"ERROR: Validation failed - {e}") if __name__ == "__main__": asyncio.run(main())