# Source: Hugging Face Space "NitinBot001/PhoneArena" (status: Sleeping)
# File size: 28,770 bytes; revision a6c126b

import requests
from bs4 import BeautifulSoup
import json
import re
import time
from urllib.parse import urljoin, quote
import logging
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Silence urllib3's InsecureRequestWarning for the whole process.
# PhoneDBScraper switches certificate verification off on its session, which
# is what would otherwise trigger this warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Configure logging at INFO level as an import-time side effect and create
# the module-level logger used by both scraper classes in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PhoneDBScraper:
    def __init__(self):
        """Create a requests session configured for scraping phonedb.net.

        The session sends browser-like headers, retries selected HTTP
        failures with backoff, and has TLS certificate verification
        switched off.
        """
        self.base_url = "https://phonedb.net"
        self.session = requests.Session()

        # Browser-like headers. Only encodings that requests can always
        # decode are advertised: "br" requires the optional brotli package,
        # and without it a Brotli-compressed body would reach the HTML parser
        # still compressed.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

        # Retry HEAD/GET/OPTIONS up to three times when the server answers
        # with one of the listed status codes. `allowed_methods` is the
        # current urllib3 name of the former `method_whitelist` parameter.
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
            backoff_factor=1,
        )

        # One adapter instance serves both schemes.
        adapter = HTTPAdapter(max_retries=retry_strategy)
        for scheme in ("http://", "https://"):
            self.session.mount(scheme, adapter)

        # SECURITY NOTE: certificate verification is disabled, so HTTPS
        # responses from this session are not authenticated. Kept as-is to
        # preserve existing behavior; enable verification if the target
        # site's certificate chain validates.
        self.session.verify = False
        
    def search_phone(self, phone_name):
        """Search for a phone by name and return search results"""
        # Try different search approaches
        search_urls = [
            f"{self.base_url}/index.php?m=device&s=query&q={quote(phone_name)}",
            f"{self.base_url}/search?q={quote(phone_name)}",
            f"{self.base_url}/index.php?m=device&s=list&q={quote(phone_name)}"
        ]
        
        for search_url in search_urls:
            try:
                logger.info(f"Trying search URL: {search_url}")
                response = self.session.get(search_url, timeout=30)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Find search results with multiple selectors
                results = []
                
                # Look for various possible result containers
                selectors = [
                    'div.device-item',
                    'div.device',
                    'div.phone-item', 
                    'tr[onclick*="device"]',
                    'a[href*="device"]',
                    'a[href*="phone"]',
                    'td a[href*="index.php"]'
                ]
                
                search_results = []
                for selector in selectors:
                    found = soup.select(selector)
                    if found:
                        search_results.extend(found)
                        break
                
                # Also try finding links with device IDs
                if not search_results:
                    search_results = soup.find_all('a', href=re.compile(r'(device|phone|id=\d+)'))
                
                for result in search_results[:10]:  # Limit to first 10 results
                    title = ""
                    link = ""
                    
                    if result.name == 'a':
                        link = result.get('href', '')
                        title = result.get_text(strip=True) or result.get('title', '')
                    elif result.name in ['div', 'tr']:
                        link_elem = result.find('a')
                        if link_elem:
                            link = link_elem.get('href', '')
                            title = link_elem.get_text(strip=True) or result.get_text(strip=True)
                        else:
                            # Check for onclick events with device info
                            onclick = result.get('onclick', '')
                            if 'device' in onclick:
                                # Extract device ID from onclick
                                device_match = re.search(r'id=(\d+)', onclick)
                                if device_match:
                                    link = f"/index.php?m=device&id={device_match.group(1)}"
                                    title = result.get_text(strip=True)
                    
                    # Clean up the link and title
                    if link and title:
                        # Clean title
                        title = re.sub(r'\s+', ' ', title).strip()
                        
                        # Ensure absolute URL
                        if link.startswith('/'):
                            link = self.base_url + link
                        elif not link.startswith('http'):
                            link = f"{self.base_url}/{link}"
                        
                        # Filter relevant results
                        if any(word.lower() in title.lower() for word in phone_name.split()):
                            results.append({
                                'title': title,
                                'url': link
                            })
                
                if results:
                    logger.info(f"Found {len(results)} results using URL: {search_url}")
                    return results
                    
            except Exception as e:
                logger.warning(f"Search URL failed {search_url}: {e}")
                continue
        
        logger.error(f"All search methods failed for: {phone_name}")
        return []
    
    def get_phone_specs(self, phone_url):
        """Extract detailed specifications from a phone page"""
        try:
            logger.info(f"Fetching specs from: {phone_url}")
            response = self.session.get(phone_url, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract phone data
            phone_data = {
                'name': '',
                'brand': '',
                'images': [],
                'specifications': {},
                'source_url': phone_url
            }
            
            # Get phone name from multiple possible locations
            title_candidates = [
                soup.find('h1'),
                soup.find('h2'), 
                soup.find('title'),
                soup.find('div', class_=re.compile(r'title|name|header')),
                soup.find('td', string=re.compile(r'Model|Name', re.I))
            ]
            
            for candidate in title_candidates:
                if candidate:
                    title = candidate.get_text(strip=True)
                    if title and len(title) > 3:
                        phone_data['name'] = title
                        break
            
            # Extract brand from title or URL
            if phone_data['name']:
                phone_data['brand'] = phone_data['name'].split()[0]
            
            # Get images with multiple approaches
            images = []
            
            # Look for images in various containers
            img_selectors = [
                'img[src*="phone"]',
                'img[src*="device"]', 
                'img[src*="mobile"]',
                'img[alt*="phone"]',
                'img[alt*="device"]',
                '.device-image img',
                '.phone-image img',
                'td img',
                'div img'
            ]
            
            for selector in img_selectors:
                imgs = soup.select(selector)
                for img in imgs:
                    src = img.get('src', '')
                    if src:
                        # Convert relative URLs to absolute
                        if src.startswith('/'):
                            img_url = self.base_url + src
                        elif not src.startswith('http'):
                            img_url = f"{self.base_url}/{src}"
                        else:
                            img_url = src
                        
                        # Avoid duplicates and filter out tiny images
                        if img_url not in images and not any(x in src.lower() for x in ['icon', 'logo', 'button', 'spacer']):
                            images.append(img_url)
            
            phone_data['images'] = images[:5]  # Limit to 5 images
            
            # Extract specifications using multiple methods
            specs = {}
            
            # Method 1: PhoneDB specific table structure
            spec_tables = soup.find_all('table')
            for table in spec_tables:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True)
                        value = cells[1].get_text(strip=True)
                        
                        # Clean up key and value
                        key = re.sub(r'[^\w\s]', '', key).strip()
                        value = re.sub(r'\s+', ' ', value).strip()
                        
                        if key and value and len(key) < 100 and len(value) < 500:
                            specs[key] = value
            
            # Method 2: Look for labeled specifications
            labeled_specs = soup.find_all(['dt', 'label', 'b', 'strong'])
            for label in labeled_specs:
                label_text = label.get_text(strip=True)
                if ':' in label_text:
                    key, value = label_text.split(':', 1)
                    specs[key.strip()] = value.strip()
                else:
                    # Look for value in next sibling
                    sibling = label.find_next_sibling()
                    if sibling:
                        value = sibling.get_text(strip=True)
                        if value:
                            specs[label_text] = value
            
            # Method 3: Extract common phone specifications from text
            text_content = soup.get_text()
            
            # Updated patterns for better matching
            spec_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|"|″)',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)',
                'RAM': r'(\d+)\s*GB\s*(?:RAM|Memory)',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal|ROM)',
                'Battery': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP(?:\s+main|\s+primary|\s+rear)?',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:front|selfie|secondary)',
                'Operating System': r'(Android|iOS)\s*[\d\.]*',
                'Processor': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*[\w\d\s]*',
                'Network': r'(2G|3G|4G|5G|LTE)',
                'Weight': r'(\d+)\s*(?:g|gram)',
                'Dimensions': r'(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*mm'
            }
            
            for spec_name, pattern in spec_patterns.items():
                if spec_name not in specs:  # Don't override existing specs
                    matches = re.findall(pattern, text_content, re.IGNORECASE)
                    if matches:
                        if spec_name == 'Display Resolution':
                            specs[spec_name] = f"{matches[0][0]}x{matches[0][1]}"
                        elif spec_name == 'Dimensions':
                            specs[spec_name] = f"{matches[0][0]}×{matches[0][1]}×{matches[0][2]} mm"
                        else:
                            specs[spec_name] = matches[0] if isinstance(matches[0], str) else str(matches[0])
            
            phone_data['specifications'] = specs
            
            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
            return phone_data
            
        except Exception as e:
            logger.error(f"Error extracting specs from {phone_url}: {e}")
            return None
    
    def scrape_phone_by_name(self, phone_name, get_first_result=True):
        """Main method to scrape phone specs by name"""
        logger.info(f"Searching for: {phone_name}")
        
        # Search for the phone
        search_results = self.search_phone(phone_name)
        
        if not search_results:
            logger.warning(f"No results found for: {phone_name}")
            return None
        
        results = []
        
        # Process results
        targets = [search_results[0]] if get_first_result else search_results
        
        for result in targets:
            logger.info(f"Scraping: {result['title']}")
            
            phone_data = self.get_phone_specs(result['url'])
            if phone_data:
                results.append(phone_data)
                
            # Be respectful with requests
            time.sleep(1)
        
        return results[0] if get_first_result and results else results

    def scrape_multiple_phones(self, phone_names):
        """Scrape multiple phones and return structured JSON"""
        all_phones = []
        
        for phone_name in phone_names:
            try:
                phone_data = self.scrape_phone_by_name(phone_name)
                if phone_data:
                    all_phones.append(phone_data)
                time.sleep(2)  # Be respectful between requests
            except Exception as e:
                logger.error(f"Error scraping {phone_name}: {e}")
                continue
        
        return all_phones

    def save_to_json(self, data, filename):
        """Save data to JSON file"""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Data saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving to JSON: {e}")

# Example usage with error handling and alternative sites
def main():
    """Demo run against PhoneDB: one phone first, then a small batch.

    NOTE: this module defines another ``main`` further down; that later
    definition rebinds the name, so this version is not the one reached
    through ``main`` once the module has finished loading.
    """
    scraper = PhoneDBScraper()

    # Example 1: a single phone.
    phone_name = "iPhone 15 Pro"
    print(f"Attempting to scrape: {phone_name}")

    result = scraper.scrape_phone_by_name(phone_name)

    if not result:
        failure_lines = (
            f"❌ Failed to scrape {phone_name}",
            "This might be due to:",
            "1. PhoneDB.net blocking automated requests",
            "2. Phone not found in their database",
            "3. Site structure changes",
            "\nAlternative solutions:",
            "- Try with a different phone name",
            "- Use a VPN if blocked by IP",
            "- Consider using alternative sites like GSMArena",
        )
        for line in failure_lines:
            print(line)
    else:
        print(f"✅ Successfully scraped {result['name']}")
        print(f"Found {len(result['specifications'])} specifications")
        print(f"Found {len(result['images'])} images")
        print(json.dumps(result, indent=2))
        output_name = f"{phone_name.replace(' ', '_')}_specs.json"
        scraper.save_to_json(result, output_name)

    # Example 2: several phones in one go.
    phone_list = ["Samsung Galaxy S24", "Google Pixel 8", "OnePlus 12"]

    print(f"\nTesting multiple phones: {phone_list}")
    results = scraper.scrape_multiple_phones(phone_list)

    if not results:
        print("❌ No phones were successfully scraped")
        return

    scraper.save_to_json(results, "multiple_phones_specs.json")
    print(f"✅ Successfully scraped {len(results)}/{len(phone_list)} phones")

    for phone in results:
        print(f"- {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")

# Enhanced GSMArena scraper as main alternative
class GSMArenaScraperAlternative:
    """Enhanced GSMArena scraper with full functionality"""
    
    def __init__(self):
        """Set the GSMArena base URL and open a session with browser-like headers."""
        self.base_url = "https://www.gsmarena.com"

        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        session = requests.Session()
        session.headers.update(browser_headers)
        self.session = session
    
    def search_phone(self, phone_name):
        """Search GSMArena for phone"""
        search_url = f"{self.base_url}/results.php3"
        params = {'sQuickSearch': 'yes', 'sName': phone_name}
        
        try:
            logger.info(f"Searching GSMArena for: {phone_name}")
            response = self.session.get(search_url, params=params, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            results = []
            
            # Find search results in makers section
            makers = soup.find_all('div', class_='makers')
            for maker in makers:
                links = maker.find_all('a')
                for link in links[:5]:  # Limit results
                    href = link.get('href', '')
                    title = link.get_text(strip=True)
                    
                    if href and title and phone_name.lower().replace(' ', '') in title.lower().replace(' ', ''):
                        full_url = self.base_url + '/' + href if not href.startswith('http') else href
                        results.append({
                            'title': title,
                            'url': full_url
                        })
            
            logger.info(f"Found {len(results)} results on GSMArena")
            return results
            
        except Exception as e:
            logger.error(f"GSMArena search failed: {e}")
            return []
    
    def get_phone_specs(self, phone_url):
        """Extract detailed specifications from GSMArena phone page"""
        try:
            logger.info(f"Fetching specs from GSMArena: {phone_url}")
            response = self.session.get(phone_url, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            phone_data = {
                'name': '',
                'brand': '',
                'images': [],
                'specifications': {},
                'source_url': phone_url
            }
            
            # Get phone name
            title_elem = soup.find('h1', class_='specs-phone-name-title')
            if not title_elem:
                title_elem = soup.find('h1') or soup.find('title')
            
            if title_elem:
                phone_data['name'] = title_elem.get_text(strip=True)
                phone_data['brand'] = phone_data['name'].split()[0] if phone_data['name'] else ''
            
            # Get images
            images = []
            
            # Main phone image
            main_img_container = soup.find('div', class_='specs-photo-main')
            if main_img_container:
                img = main_img_container.find('img')
                if img and img.get('src'):
                    img_url = urljoin(phone_url, img['src'])
                    images.append(img_url)
            
            # Additional images from carousel or gallery
            carousel = soup.find('div', class_='carousel-item') or soup.find('div', class_='specs-photos')
            if carousel:
                for img in carousel.find_all('img'):
                    src = img.get('src', '')
                    if src:
                        img_url = urljoin(phone_url, src)
                        if img_url not in images:
                            images.append(img_url)
            
            phone_data['images'] = images[:5]
            
            # Extract specifications from GSMArena's table structure
            specs = {}
            
            # GSMArena uses specific table structure
            spec_tables = soup.find_all('table', cellspacing='0')
            
            for table in spec_tables:
                # Get category header
                category = ''
                category_elem = table.find_previous('th') or table.find_previous('h2')
                if category_elem:
                    category = category_elem.get_text(strip=True)
                
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True)
                        value = cells[1].get_text(strip=True)
                        
                        # Clean up the key and value
                        key = re.sub(r'[^\w\s]', '', key).strip()
                        value = re.sub(r'\s+', ' ', value).strip()
                        
                        if key and value and len(key) < 100:
                            # Add category prefix if available
                            final_key = f"{category} - {key}" if category and len(category) < 30 else key
                            specs[final_key] = value
            
            # Also extract from the detailed specs list structure
            detail_lists = soup.find_all(['ul', 'li'], class_=re.compile(r'spec|detail'))
            for detail_list in detail_lists:
                items = detail_list.find_all('li') if detail_list.name == 'ul' else [detail_list]
                for item in items:
                    text = item.get_text(strip=True)
                    if ':' in text:
                        parts = text.split(':', 1)
                        if len(parts) == 2:
                            key, value = parts
                            specs[key.strip()] = value.strip()
            
            # Extract key specs using patterns from page text
            page_text = soup.get_text()
            
            key_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|")\s*display',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)\s*pixels',
                'RAM': r'(\d+)\s*GB\s*RAM',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal)',
                'Battery Capacity': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:main|primary|rear)',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*front',
                'Operating System': r'(Android|iOS)\s*([\d\.]+)?',
                'Chipset': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*([\w\d\s]+)?',
                'Weight': r'(\d+)\s*g\s*weight',
                'Launch Date': r'(January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})'
            }
            
            for spec_name, pattern in key_patterns.items():
                if spec_name not in specs:
                    match = re.search(pattern, page_text, re.IGNORECASE)
                    if match:
                        if spec_name == 'Display Resolution':
                            specs[spec_name] = f"{match.group(1)}×{match.group(2)}"
                        elif spec_name == 'Launch Date':
                            specs[spec_name] = f"{match.group(1)} {match.group(2)}"
                        else:
                            specs[spec_name] = match.group(0)
            
            phone_data['specifications'] = specs
            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
            
            return phone_data
            
        except Exception as e:
            logger.error(f"Error extracting GSMArena specs from {phone_url}: {e}")
            return None
    
    def scrape_phone_by_name(self, phone_name, get_first_result=True):
        """Main method to scrape phone specs by name from GSMArena"""
        search_results = self.search_phone(phone_name)
        
        if not search_results:
            logger.warning(f"No results found for: {phone_name}")
            return None
        
        results = []
        targets = [search_results[0]] if get_first_result else search_results
        
        for result in targets:
            logger.info(f"Scraping: {result['title']}")
            phone_data = self.get_phone_specs(result['url'])
            if phone_data:
                results.append(phone_data)
            time.sleep(2)  # Be respectful
        
        return results[0] if get_first_result and results else results
    
    def scrape_multiple_phones(self, phone_names):
        """Scrape multiple phones from GSMArena"""
        all_phones = []
        
        for phone_name in phone_names:
            try:
                phone_data = self.scrape_phone_by_name(phone_name)
                if phone_data:
                    all_phones.append(phone_data)
                time.sleep(3)  # Be respectful between requests
            except Exception as e:
                logger.error(f"Error scraping {phone_name}: {e}")
                continue
        
        return all_phones
    
    def save_to_json(self, data, filename):
        """Save data to JSON file"""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Data saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving to JSON: {e}")

def test_alternative_scraper():
    """Run a manual demo of the GSMArena scraper: one phone, then a batch.

    Progress is printed to stdout and successful results are saved as JSON
    files in the current working directory.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Testing Enhanced GSMArena Scraper")
    print(banner)

    gsm_scraper = GSMArenaScraperAlternative()

    # --- Single phone ---------------------------------------------------
    phone_name = "iPhone 15 Pro"
    print(f"Testing single phone: {phone_name}")

    result = gsm_scraper.scrape_phone_by_name(phone_name)

    if not result:
        print(f"❌ Failed to scrape {phone_name}")
    else:
        print(f"✅ Successfully scraped: {result['name']}")
        print(f"📱 Found {len(result['specifications'])} specifications")
        print(f"🖼️ Found {len(result['images'])} images")

        # For each headline spec, show the first entry whose key contains it.
        print("\n📋 Key Specifications:")
        for wanted in ('Display Size', 'RAM', 'Storage', 'Battery Capacity', 'Main Camera'):
            first_match = next(
                (
                    (key, value)
                    for key, value in result['specifications'].items()
                    if wanted.lower() in key.lower()
                ),
                None,
            )
            if first_match is not None:
                print(f"  • {first_match[0]}: {first_match[1]}")

        gsm_scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_gsmarena_specs.json")

    # --- Several phones ---------------------------------------------------
    rule = "-" * 40
    print("\n" + rule)
    print("Testing Multiple Phones")
    print(rule)

    phone_list = ["Samsung Galaxy S24", "Google Pixel 8"]
    results = gsm_scraper.scrape_multiple_phones(phone_list)

    if not results:
        print("❌ No phones were successfully scraped")
        return

    print(f"✅ Successfully scraped {len(results)}/{len(phone_list)} phones")
    gsm_scraper.save_to_json(results, "multiple_phones_gsmarena_specs.json")

    for phone in results:
        print(f"📱 {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")

# Main function with both scrapers
def main():
    """Try the PhoneDB scraper first and fall back to the GSMArena demo.

    Returns early when PhoneDB yields a result (after saving it as JSON);
    in every other case, including an exception from the PhoneDB attempt,
    ``test_alternative_scraper`` is run.
    """
    print("🚀 Phone Specifications Scraper")
    print("=" * 50)

    try:
        print("Attempting PhoneDB scraper...")
        phonedb = PhoneDBScraper()
        target = "iPhone 15 Pro"
        record = phonedb.scrape_phone_by_name(target)

        if not record:
            print("❌ PhoneDB scraper failed, trying GSMArena...")
        else:
            print(f"✅ PhoneDB: Successfully scraped {record['name']}")
            phonedb.save_to_json(record, f"{target.replace(' ', '_')}_phonedb_specs.json")
            return

    except Exception as exc:
        print(f"❌ PhoneDB initialization failed: {str(exc)}")
        print("🔄 Switching to GSMArena scraper...")

    # Reached whenever PhoneDB produced nothing or raised.
    test_alternative_scraper()

if __name__ == "__main__":
    # Runs the GSMArena demo directly. Call main() here instead to try
    # PhoneDB first with GSMArena as the fallback.
    test_alternative_scraper()