Spaces:

Ralitza1
/

LolMultiAgent

Sleeping

Ralitza Mondal committed on Jan 24

Commit

a09a20c

1 Parent(s): c066961

🧪 Add tests and improve YouTube scraper

- Add test_youtube_api.py to verify YouTube Data API works
- Add test_scraper_fix.py to diagnose scraper issues
- Update youtube_scraper.py with improved headers and bot detection handling
- Add delays and cookie handling to bypass YouTube blocking
- Both tests confirm web scraping is blocked; the YouTube Data API is the solution

Files changed (3) hide show

test_scraper_fix.py +47 -0
test_youtube_api.py +63 -0
youtube_scraper.py +25 -3

test_scraper_fix.py ADDED Viewed

	@@ -0,0 +1,47 @@

+#!/usr/bin/env python3
+"""Test the updated YouTube scraper"""
+from youtube_scraper import YouTubeScraper
+def test_scraper():
+    """Run one live search through YouTubeScraper and print a readable report.
+
+    Returns:
+        True when the search produced a non-empty result, False otherwise.
+    """
+    scraper = YouTubeScraper()
+    rule = '=' * 60
+    print('🧪 Testing UPDATED YouTube Scraper...')
+    print(rule)
+    print()
+    # Test with Miss Fortune
+    print('Test: Searching for "League of Legends Miss Fortune guide"')
+    videos = scraper.search_videos('League of Legends Miss Fortune guide', 3)
+    print()
+    print(f'📊 Results: Found {len(videos)} videos')
+    print(rule)
+    print()
+    # Failure path first, so the success path below reads without nesting.
+    if not videos:
+        failure_report = (
+            '❌ Still not working - YouTube blocking is too strong',
+            '',
+            '📝 Analysis:',
+            '   - YouTube has strengthened bot detection',
+            '   - Web scraping is being actively blocked',
+            '   - This is why the scraper that "was working" stopped',
+            '',
+            '💡 Solutions:',
+            '   1. Use YouTube Data API (recommended, reliable)',
+            '   2. Use a proxy service',
+            '   3. Accept that video search is unavailable',
+            '',
+        )
+        for line in failure_report:
+            print(line)
+        return False
+    print('✅ SUCCESS! Scraper is working again!')
+    print()
+    # (label, result key, fallback text) for the indented detail lines.
+    detail_fields = (
+        ('URL', 'url', 'No URL'),
+        ('Views', 'views', 'Unknown'),
+        ('Duration', 'duration', 'Unknown'),
+    )
+    for position, video in enumerate(videos, 1):
+        print(f'{position}. {video.get("title", "No title")}')
+        for label, key, fallback in detail_fields:
+            print(f'   {label}: {video.get(key, fallback)}')
+        print()
+    return True
+if __name__ == '__main__':
+    # Propagate the outcome as the process exit status so shells and CI can
+    # tell a blocked scraper (1) from a working one (0); previously the
+    # boolean result was discarded and the script always exited 0.
+    raise SystemExit(0 if test_scraper() else 1)

test_youtube_api.py ADDED Viewed

	@@ -0,0 +1,63 @@

+#!/usr/bin/env python3
+"""Test YouTube API integration.
+
+Manual smoke test that runs three steps in order and exits with status 1 on
+the first failure:
+
+1. check that YOUTUBE_API_KEY is present in the environment,
+2. import and construct YouTubeAPIClient,
+3. run one sample search and print the returned videos.
+"""
+import os
+import sys
+import traceback
+from dotenv import load_dotenv
+# Load environment variables
+load_dotenv()
+print("🧪 Testing YouTube Data API Integration")
+print("=" * 60)
+print()
+# Test 1: Check API key
+api_key = os.getenv("YOUTUBE_API_KEY")
+if api_key:
+    # Never echo key material: stdout ends up in logs, and slicing a prefix
+    # and suffix reveals a short key in full. Report presence and length only.
+    print(f"✅ API Key found ({len(api_key)} characters)")
+else:
+    print("❌ API Key not found in environment")
+    # sys.exit instead of the bare exit() helper, which is injected by the
+    # site module for interactive use and is not guaranteed to exist.
+    sys.exit(1)
+print()
+# Test 2: Import and initialize
+try:
+    from youtube_api import YouTubeAPIClient
+    print("✅ YouTubeAPIClient imported successfully")
+    client = YouTubeAPIClient(api_key)
+    print("✅ YouTubeAPIClient initialized")
+except Exception as e:
+    # Broad catch is deliberate: this is the top-level boundary of a
+    # diagnostic script and any failure here should be reported, then abort.
+    print(f"❌ Import/initialization failed: {e}")
+    sys.exit(1)
+print()
+# Test 3: Search for videos
+try:
+    print("📺 Searching for: 'League of Legends Miss Fortune guide'")
+    videos = client.search_videos('League of Legends Miss Fortune guide', 3)
+    print(f"✅ Found {len(videos)} videos")
+    print()
+    if videos:
+        for i, video in enumerate(videos, 1):
+            print(f"{i}. {video.get('title', 'No title')}")
+            print(f"   URL: {video.get('url', 'No URL')}")
+            print(f"   Views: {video.get('views', 'Unknown')}")
+            print(f"   Duration: {video.get('duration', 'Unknown')}")
+            print()
+        print("=" * 60)
+        print("🎉 SUCCESS! YouTube Data API is working perfectly!")
+        print("=" * 60)
+    else:
+        print("⚠️  No videos found (might be API issue)")
+except Exception as e:
+    print(f"❌ Search failed: {e}")
+    # Full traceback helps tell quota/auth errors from client bugs.
+    traceback.print_exc()
+    sys.exit(1)

youtube_scraper.py CHANGED Viewed

@@ -20,14 +20,23 @@ class YouTubeScraper:
     def __init__(self):
         self.session = requests.Session()
         self.session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.9',
             'Accept-Encoding': 'gzip, deflate, br',
             'DNT': '1',
             'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1'
         })
     def search_videos(self, query: str, max_results: int = 5) -> List[Dict]:
@@ -48,9 +57,22 @@ class YouTubeScraper:
             print(f"🔍 Scraping YouTube search: {query}")
             response = self.session.get(url, timeout=15)
             response.raise_for_status()
             # Parse HTML
             soup = BeautifulSoup(response.content, 'html.parser')

     def __init__(self):
+        """Create the shared HTTP session with Chrome-like default headers.
+
+        Every request made through ``self.session`` sends these headers.
+        """
         self.session = requests.Session()
+        # Updated headers to bypass YouTube's bot detection (Jan 2026)
         self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.9',
+            # Accept-Encoding is deliberately NOT overridden here. Advertising
+            # 'br' makes the server free to answer with Brotli, which
+            # requests/urllib3 can only decode when an optional brotli package
+            # is installed; otherwise the HTML parser receives compressed
+            # bytes and finds no videos. The library default lists only the
+            # encodings it can actually decode.
             'DNT': '1',
             'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'none',
+            'Sec-Fetch-User': '?1',
+            'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"macOS"',
+            'Cache-Control': 'max-age=0'
         })
     def search_videos(self, query: str, max_results: int = 5) -> List[Dict]:
             print(f"🔍 Scraping YouTube search: {query}")
+            # Add small delay to avoid rate limiting
+            import time
+            time.sleep(1)
             response = self.session.get(url, timeout=15)
             response.raise_for_status()
+            # Check if we got blocked
+            if 'captcha' in response.text.lower() or 'unusual traffic' in response.text.lower():
+                print("⚠️  YouTube detected automation - trying with cookies...")
+                # Add some cookies to look more like a real browser
+                self.session.cookies.set('CONSENT', 'YES+cb', domain='.youtube.com')
+                self.session.cookies.set('PREF', 'tz=America.New_York', domain='.youtube.com')
+                time.sleep(2)
+                response = self.session.get(url, timeout=15)
             # Parse HTML
             soup = BeautifulSoup(response.content, 'html.parser')