Ralitza Mondal commited on
Commit
a09a20c
ยท
1 Parent(s): c066961

๐Ÿงช Add tests and improve YouTube scraper

Browse files

- Add test_youtube_api.py to verify YouTube Data API works
- Add test_scraper_fix.py to diagnose scraper issues
- Update youtube_scraper.py with improved headers and bot detection handling
- Add delays and cookie handling to bypass YouTube blocking
- Both tests confirm web scraping is blocked, API is the solution

Files changed (3) hide show
  1. test_scraper_fix.py +47 -0
  2. test_youtube_api.py +63 -0
  3. youtube_scraper.py +25 -3
test_scraper_fix.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Test the updated YouTube scraper"""
3
+
4
+ from youtube_scraper import YouTubeScraper
5
+
6
+ def test_scraper():
7
+ scraper = YouTubeScraper()
8
+ print('๐Ÿงช Testing UPDATED YouTube Scraper...')
9
+ print('=' * 60)
10
+ print()
11
+
12
+ # Test with Miss Fortune
13
+ print('Test: Searching for "League of Legends Miss Fortune guide"')
14
+ videos = scraper.search_videos('League of Legends Miss Fortune guide', 3)
15
+
16
+ print()
17
+ print(f'๐Ÿ“Š Results: Found {len(videos)} videos')
18
+ print('=' * 60)
19
+ print()
20
+
21
+ if videos:
22
+ print('โœ… SUCCESS! Scraper is working again!')
23
+ print()
24
+ for i, v in enumerate(videos, 1):
25
+ print(f'{i}. {v.get("title", "No title")}')
26
+ print(f' URL: {v.get("url", "No URL")}')
27
+ print(f' Views: {v.get("views", "Unknown")}')
28
+ print(f' Duration: {v.get("duration", "Unknown")}')
29
+ print()
30
+ return True
31
+ else:
32
+ print('โŒ Still not working - YouTube blocking is too strong')
33
+ print()
34
+ print('๐Ÿ“ Analysis:')
35
+ print(' - YouTube has strengthened bot detection')
36
+ print(' - Web scraping is being actively blocked')
37
+ print(' - This is why the scraper that "was working" stopped')
38
+ print()
39
+ print('๐Ÿ’ก Solutions:')
40
+ print(' 1. Use YouTube Data API (recommended, reliable)')
41
+ print(' 2. Use a proxy service')
42
+ print(' 3. Accept that video search is unavailable')
43
+ print()
44
+ return False
45
+
46
+ if __name__ == '__main__':
47
+ test_scraper()
test_youtube_api.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Test YouTube API integration"""
3
+
4
+ import os
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables
8
+ load_dotenv()
9
+
10
+ print("๐Ÿงช Testing YouTube Data API Integration")
11
+ print("=" * 60)
12
+ print()
13
+
14
+ # Test 1: Check API key
15
+ api_key = os.getenv("YOUTUBE_API_KEY")
16
+ if api_key:
17
+ print(f"โœ… API Key found: {api_key[:10]}...{api_key[-5:]}")
18
+ else:
19
+ print("โŒ API Key not found in environment")
20
+ exit(1)
21
+
22
+ print()
23
+
24
+ # Test 2: Import and initialize
25
+ try:
26
+ from youtube_api import YouTubeAPIClient
27
+ print("โœ… YouTubeAPIClient imported successfully")
28
+
29
+ client = YouTubeAPIClient(api_key)
30
+ print("โœ… YouTubeAPIClient initialized")
31
+ except Exception as e:
32
+ print(f"โŒ Import/initialization failed: {e}")
33
+ exit(1)
34
+
35
+ print()
36
+
37
+ # Test 3: Search for videos
38
+ try:
39
+ print("๐Ÿ“บ Searching for: 'League of Legends Miss Fortune guide'")
40
+ videos = client.search_videos('League of Legends Miss Fortune guide', 3)
41
+
42
+ print(f"โœ… Found {len(videos)} videos")
43
+ print()
44
+
45
+ if videos:
46
+ for i, video in enumerate(videos, 1):
47
+ print(f"{i}. {video.get('title', 'No title')}")
48
+ print(f" URL: {video.get('url', 'No URL')}")
49
+ print(f" Views: {video.get('views', 'Unknown')}")
50
+ print(f" Duration: {video.get('duration', 'Unknown')}")
51
+ print()
52
+
53
+ print("=" * 60)
54
+ print("๐ŸŽ‰ SUCCESS! YouTube Data API is working perfectly!")
55
+ print("=" * 60)
56
+ else:
57
+ print("โš ๏ธ No videos found (might be API issue)")
58
+
59
+ except Exception as e:
60
+ print(f"โŒ Search failed: {e}")
61
+ import traceback
62
+ traceback.print_exc()
63
+ exit(1)
youtube_scraper.py CHANGED
@@ -20,14 +20,23 @@ class YouTubeScraper:
20
 
21
  def __init__(self):
22
  self.session = requests.Session()
 
23
  self.session.headers.update({
24
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
25
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
26
  'Accept-Language': 'en-US,en;q=0.9',
27
  'Accept-Encoding': 'gzip, deflate, br',
28
  'DNT': '1',
29
  'Connection': 'keep-alive',
30
- 'Upgrade-Insecure-Requests': '1'
 
 
 
 
 
 
 
 
31
  })
32
 
33
  def search_videos(self, query: str, max_results: int = 5) -> List[Dict]:
@@ -48,9 +57,22 @@ class YouTubeScraper:
48
 
49
  print(f"๐Ÿ” Scraping YouTube search: {query}")
50
 
 
 
 
 
51
  response = self.session.get(url, timeout=15)
52
  response.raise_for_status()
53
 
 
 
 
 
 
 
 
 
 
54
  # Parse HTML
55
  soup = BeautifulSoup(response.content, 'html.parser')
56
 
 
20
 
21
  def __init__(self):
22
  self.session = requests.Session()
23
+ # Updated headers to bypass YouTube's bot detection (Jan 2026)
24
  self.session.headers.update({
25
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
26
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
27
  'Accept-Language': 'en-US,en;q=0.9',
28
  'Accept-Encoding': 'gzip, deflate, br',
29
  'DNT': '1',
30
  'Connection': 'keep-alive',
31
+ 'Upgrade-Insecure-Requests': '1',
32
+ 'Sec-Fetch-Dest': 'document',
33
+ 'Sec-Fetch-Mode': 'navigate',
34
+ 'Sec-Fetch-Site': 'none',
35
+ 'Sec-Fetch-User': '?1',
36
+ 'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
37
+ 'sec-ch-ua-mobile': '?0',
38
+ 'sec-ch-ua-platform': '"macOS"',
39
+ 'Cache-Control': 'max-age=0'
40
  })
41
 
42
  def search_videos(self, query: str, max_results: int = 5) -> List[Dict]:
 
57
 
58
  print(f"๐Ÿ” Scraping YouTube search: {query}")
59
 
60
+ # Add small delay to avoid rate limiting
61
+ import time
62
+ time.sleep(1)
63
+
64
  response = self.session.get(url, timeout=15)
65
  response.raise_for_status()
66
 
67
+ # Check if we got blocked
68
+ if 'captcha' in response.text.lower() or 'unusual traffic' in response.text.lower():
69
+ print("โš ๏ธ YouTube detected automation - trying with cookies...")
70
+ # Add some cookies to look more like a real browser
71
+ self.session.cookies.set('CONSENT', 'YES+cb', domain='.youtube.com')
72
+ self.session.cookies.set('PREF', 'tz=America.New_York', domain='.youtube.com')
73
+ time.sleep(2)
74
+ response = self.session.get(url, timeout=15)
75
+
76
  # Parse HTML
77
  soup = BeautifulSoup(response.content, 'html.parser')
78