Spaces:
Sleeping
Sleeping
Ralitza Mondal committed on
Commit ·
a09a20c
1
Parent(s): c066961
🧪 Add tests and improve YouTube scraper
Browse files
- Add test_youtube_api.py to verify YouTube Data API works
- Add test_scraper_fix.py to diagnose scraper issues
- Update youtube_scraper.py with improved headers and bot detection handling
- Add delays and cookie handling to bypass YouTube blocking
- Both tests confirm web scraping is blocked, API is the solution
- test_scraper_fix.py +47 -0
- test_youtube_api.py +63 -0
- youtube_scraper.py +25 -3
test_scraper_fix.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Test the updated YouTube scraper"""
|
| 3 |
+
|
| 4 |
+
from youtube_scraper import YouTubeScraper
|
| 5 |
+
|
| 6 |
+
def test_scraper():
|
| 7 |
+
scraper = YouTubeScraper()
|
| 8 |
+
print('🧪 Testing UPDATED YouTube Scraper...')
|
| 9 |
+
print('=' * 60)
|
| 10 |
+
print()
|
| 11 |
+
|
| 12 |
+
# Test with Miss Fortune
|
| 13 |
+
print('Test: Searching for "League of Legends Miss Fortune guide"')
|
| 14 |
+
videos = scraper.search_videos('League of Legends Miss Fortune guide', 3)
|
| 15 |
+
|
| 16 |
+
print()
|
| 17 |
+
print(f'๐ Results: Found {len(videos)} videos')
|
| 18 |
+
print('=' * 60)
|
| 19 |
+
print()
|
| 20 |
+
|
| 21 |
+
if videos:
|
| 22 |
+
print('✅ SUCCESS! Scraper is working again!')
|
| 23 |
+
print()
|
| 24 |
+
for i, v in enumerate(videos, 1):
|
| 25 |
+
print(f'{i}. {v.get("title", "No title")}')
|
| 26 |
+
print(f' URL: {v.get("url", "No URL")}')
|
| 27 |
+
print(f' Views: {v.get("views", "Unknown")}')
|
| 28 |
+
print(f' Duration: {v.get("duration", "Unknown")}')
|
| 29 |
+
print()
|
| 30 |
+
return True
|
| 31 |
+
else:
|
| 32 |
+
print('❌ Still not working - YouTube blocking is too strong')
|
| 33 |
+
print()
|
| 34 |
+
print('๐ Analysis:')
|
| 35 |
+
print(' - YouTube has strengthened bot detection')
|
| 36 |
+
print(' - Web scraping is being actively blocked')
|
| 37 |
+
print(' - This is why the scraper that "was working" stopped')
|
| 38 |
+
print()
|
| 39 |
+
print('💡 Solutions:')
|
| 40 |
+
print(' 1. Use YouTube Data API (recommended, reliable)')
|
| 41 |
+
print(' 2. Use a proxy service')
|
| 42 |
+
print(' 3. Accept that video search is unavailable')
|
| 43 |
+
print()
|
| 44 |
+
return False
|
| 45 |
+
|
| 46 |
+
if __name__ == '__main__':
|
| 47 |
+
test_scraper()
|
test_youtube_api.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Test YouTube API integration"""
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
# Load environment variables
|
| 8 |
+
load_dotenv()
|
| 9 |
+
|
| 10 |
+
print("🧪 Testing YouTube Data API Integration")
|
| 11 |
+
print("=" * 60)
|
| 12 |
+
print()
|
| 13 |
+
|
| 14 |
+
# Test 1: Check API key
|
| 15 |
+
api_key = os.getenv("YOUTUBE_API_KEY")
|
| 16 |
+
if api_key:
|
| 17 |
+
print(f"✅ API Key found: {api_key[:10]}...{api_key[-5:]}")
|
| 18 |
+
else:
|
| 19 |
+
print("❌ API Key not found in environment")
|
| 20 |
+
exit(1)
|
| 21 |
+
|
| 22 |
+
print()
|
| 23 |
+
|
| 24 |
+
# Test 2: Import and initialize
|
| 25 |
+
try:
|
| 26 |
+
from youtube_api import YouTubeAPIClient
|
| 27 |
+
print("✅ YouTubeAPIClient imported successfully")
|
| 28 |
+
|
| 29 |
+
client = YouTubeAPIClient(api_key)
|
| 30 |
+
print("✅ YouTubeAPIClient initialized")
|
| 31 |
+
except Exception as e:
|
| 32 |
+
print(f"❌ Import/initialization failed: {e}")
|
| 33 |
+
exit(1)
|
| 34 |
+
|
| 35 |
+
print()
|
| 36 |
+
|
| 37 |
+
# Test 3: Search for videos
|
| 38 |
+
try:
|
| 39 |
+
print("📺 Searching for: 'League of Legends Miss Fortune guide'")
|
| 40 |
+
videos = client.search_videos('League of Legends Miss Fortune guide', 3)
|
| 41 |
+
|
| 42 |
+
print(f"✅ Found {len(videos)} videos")
|
| 43 |
+
print()
|
| 44 |
+
|
| 45 |
+
if videos:
|
| 46 |
+
for i, video in enumerate(videos, 1):
|
| 47 |
+
print(f"{i}. {video.get('title', 'No title')}")
|
| 48 |
+
print(f" URL: {video.get('url', 'No URL')}")
|
| 49 |
+
print(f" Views: {video.get('views', 'Unknown')}")
|
| 50 |
+
print(f" Duration: {video.get('duration', 'Unknown')}")
|
| 51 |
+
print()
|
| 52 |
+
|
| 53 |
+
print("=" * 60)
|
| 54 |
+
print("๐ SUCCESS! YouTube Data API is working perfectly!")
|
| 55 |
+
print("=" * 60)
|
| 56 |
+
else:
|
| 57 |
+
print("⚠️ No videos found (might be API issue)")
|
| 58 |
+
|
| 59 |
+
except Exception as e:
|
| 60 |
+
print(f"❌ Search failed: {e}")
|
| 61 |
+
import traceback
|
| 62 |
+
traceback.print_exc()
|
| 63 |
+
exit(1)
|
youtube_scraper.py
CHANGED
|
@@ -20,14 +20,23 @@ class YouTubeScraper:
|
|
| 20 |
|
| 21 |
def __init__(self):
|
| 22 |
self.session = requests.Session()
|
|
|
|
| 23 |
self.session.headers.update({
|
| 24 |
-
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
| 25 |
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
| 26 |
'Accept-Language': 'en-US,en;q=0.9',
|
| 27 |
'Accept-Encoding': 'gzip, deflate, br',
|
| 28 |
'DNT': '1',
|
| 29 |
'Connection': 'keep-alive',
|
| 30 |
-
'Upgrade-Insecure-Requests': '1'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
})
|
| 32 |
|
| 33 |
def search_videos(self, query: str, max_results: int = 5) -> List[Dict]:
|
|
@@ -48,9 +57,22 @@ class YouTubeScraper:
|
|
| 48 |
|
| 49 |
print(f"๐ Scraping YouTube search: {query}")
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
response = self.session.get(url, timeout=15)
|
| 52 |
response.raise_for_status()
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
# Parse HTML
|
| 55 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 56 |
|
|
|
|
| 20 |
|
| 21 |
def __init__(self):
|
| 22 |
self.session = requests.Session()
|
| 23 |
+
# Updated headers to bypass YouTube's bot detection (Jan 2026)
|
| 24 |
self.session.headers.update({
|
| 25 |
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
| 26 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
| 27 |
'Accept-Language': 'en-US,en;q=0.9',
|
| 28 |
'Accept-Encoding': 'gzip, deflate, br',
|
| 29 |
'DNT': '1',
|
| 30 |
'Connection': 'keep-alive',
|
| 31 |
+
'Upgrade-Insecure-Requests': '1',
|
| 32 |
+
'Sec-Fetch-Dest': 'document',
|
| 33 |
+
'Sec-Fetch-Mode': 'navigate',
|
| 34 |
+
'Sec-Fetch-Site': 'none',
|
| 35 |
+
'Sec-Fetch-User': '?1',
|
| 36 |
+
'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
|
| 37 |
+
'sec-ch-ua-mobile': '?0',
|
| 38 |
+
'sec-ch-ua-platform': '"macOS"',
|
| 39 |
+
'Cache-Control': 'max-age=0'
|
| 40 |
})
|
| 41 |
|
| 42 |
def search_videos(self, query: str, max_results: int = 5) -> List[Dict]:
|
|
|
|
| 57 |
|
| 58 |
print(f"๐ Scraping YouTube search: {query}")
|
| 59 |
|
| 60 |
+
# Add small delay to avoid rate limiting
|
| 61 |
+
import time
|
| 62 |
+
time.sleep(1)
|
| 63 |
+
|
| 64 |
response = self.session.get(url, timeout=15)
|
| 65 |
response.raise_for_status()
|
| 66 |
|
| 67 |
+
# Check if we got blocked
|
| 68 |
+
if 'captcha' in response.text.lower() or 'unusual traffic' in response.text.lower():
|
| 69 |
+
print("⚠️ YouTube detected automation - trying with cookies...")
|
| 70 |
+
# Add some cookies to look more like a real browser
|
| 71 |
+
self.session.cookies.set('CONSENT', 'YES+cb', domain='.youtube.com')
|
| 72 |
+
self.session.cookies.set('PREF', 'tz=America.New_York', domain='.youtube.com')
|
| 73 |
+
time.sleep(2)
|
| 74 |
+
response = self.session.get(url, timeout=15)
|
| 75 |
+
|
| 76 |
# Parse HTML
|
| 77 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 78 |
|