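"""Phone specification scrapers.

Two scrapers that share the same output shape (a dict with 'name', 'brand',
'images', 'specifications' and 'source_url'):

  * PhoneDBScraper             - scrapes phonedb.net
  * GSMArenaScraperAlternative - scrapes gsmarena.com, used as a fallback

Both can persist results to JSON via their save_to_json() method.
"""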
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from urllib.parse import urljoin, quote
import logging
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PhoneDBScraper:
    def __init__(self):
        self.base_url = "https://phonedb.net"
        self.session = requests.Session()
        
        # Configure session with better headers and SSL handling
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        
        # Retry transient failures (HTTP 429 and 5xx) with exponential backoff
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],  # named method_whitelist before urllib3 1.26
            backoff_factor=1
        )
        
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        
        # Disable SSL verification (insecure; only done to tolerate certificate
        # errors on the target site)
        self.session.verify = False
        
    def search_phone(self, phone_name):
        """Search for a phone by name and return search results"""
        # Try different search approaches
        search_urls = [
            f"{self.base_url}/index.php?m=device&s=query&q={quote(phone_name)}",
            f"{self.base_url}/search?q={quote(phone_name)}",
            f"{self.base_url}/index.php?m=device&s=list&q={quote(phone_name)}"
        ]
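        # These URL patterns are best-effort guesses at the site's search
        # endpoints; each one is tried in turn until a request yields results.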
        
        for search_url in search_urls:
            try:
                logger.info(f"Trying search URL: {search_url}")
                response = self.session.get(search_url, timeout=30)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Find search results with multiple selectors
                results = []
                
                # Look for various possible result containers
                selectors = [
                    'div.device-item',
                    'div.device',
                    'div.phone-item', 
                    'tr[onclick*="device"]',
                    'a[href*="device"]',
                    'a[href*="phone"]',
                    'td a[href*="index.php"]'
                ]
                
                search_results = []
                for selector in selectors:
                    found = soup.select(selector)
                    if found:
                        search_results.extend(found)
                        break
                
                # Also try finding links with device IDs
                if not search_results:
                    search_results = soup.find_all('a', href=re.compile(r'(device|phone|id=\d+)'))
                
                for result in search_results[:10]:  # Limit to first 10 results
                    title = ""
                    link = ""
                    
                    if result.name == 'a':
                        link = result.get('href', '')
                        title = result.get_text(strip=True) or result.get('title', '')
                    elif result.name in ['div', 'tr']:
                        link_elem = result.find('a')
                        if link_elem:
                            link = link_elem.get('href', '')
                            title = link_elem.get_text(strip=True) or result.get_text(strip=True)
                        else:
                            # Check for onclick events with device info
                            onclick = result.get('onclick', '')
                            if 'device' in onclick:
                                # Extract device ID from onclick
                                device_match = re.search(r'id=(\d+)', onclick)
                                if device_match:
                                    link = f"/index.php?m=device&id={device_match.group(1)}"
                                    title = result.get_text(strip=True)
                    
                    # Clean up the link and title
                    if link and title:
                        # Clean title
                        title = re.sub(r'\s+', ' ', title).strip()
                        
                        # Ensure absolute URL
                        if link.startswith('/'):
                            link = self.base_url + link
                        elif not link.startswith('http'):
                            link = f"{self.base_url}/{link}"
                        
                        # Filter relevant results
                        if any(word.lower() in title.lower() for word in phone_name.split()):
                            results.append({
                                'title': title,
                                'url': link
                            })
                
                if results:
                    logger.info(f"Found {len(results)} results using URL: {search_url}")
                    return results
                    
            except Exception as e:
                logger.warning(f"Search URL failed {search_url}: {e}")
                continue
        
        logger.error(f"All search methods failed for: {phone_name}")
        return []
    
    def get_phone_specs(self, phone_url):
        """Extract detailed specifications from a phone page"""
        try:
            logger.info(f"Fetching specs from: {phone_url}")
            response = self.session.get(phone_url, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract phone data
            phone_data = {
                'name': '',
                'brand': '',
                'images': [],
                'specifications': {},
                'source_url': phone_url
            }
            
            # Get phone name from multiple possible locations
            title_candidates = [
                soup.find('h1'),
                soup.find('h2'), 
                soup.find('title'),
                soup.find('div', class_=re.compile(r'title|name|header')),
                soup.find('td', string=re.compile(r'Model|Name', re.I))
            ]
            
            for candidate in title_candidates:
                if candidate:
                    title = candidate.get_text(strip=True)
                    if title and len(title) > 3:
                        phone_data['name'] = title
                        break
            
            # Take the brand as the first word of the title
            if phone_data['name']:
                phone_data['brand'] = phone_data['name'].split()[0]
            
            # Get images with multiple approaches
            images = []
            
            # Look for images in various containers
            img_selectors = [
                'img[src*="phone"]',
                'img[src*="device"]', 
                'img[src*="mobile"]',
                'img[alt*="phone"]',
                'img[alt*="device"]',
                '.device-image img',
                '.phone-image img',
                'td img',
                'div img'
            ]
            
            for selector in img_selectors:
                imgs = soup.select(selector)
                for img in imgs:
                    src = img.get('src', '')
                    if src:
                        # Convert relative URLs to absolute
                        if src.startswith('/'):
                            img_url = self.base_url + src
                        elif not src.startswith('http'):
                            img_url = f"{self.base_url}/{src}"
                        else:
                            img_url = src
                        
                        # Avoid duplicates and skip icons, logos, and other page furniture
                        if img_url not in images and not any(x in src.lower() for x in ['icon', 'logo', 'button', 'spacer']):
                            images.append(img_url)
            
            phone_data['images'] = images[:5]  # Limit to 5 images
            
            # Extract specifications using multiple methods
            specs = {}
            
            # Method 1: PhoneDB-specific table structure
            spec_tables = soup.find_all('table')
            for table in spec_tables:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True)
                        value = cells[1].get_text(strip=True)
                        
                        # Clean up key and value
                        key = re.sub(r'[^\w\s]', '', key).strip()
                        value = re.sub(r'\s+', ' ', value).strip()
                        
                        if key and value and len(key) < 100 and len(value) < 500:
                            specs[key] = value
            
            # Method 2: Look for labeled specifications
            labeled_specs = soup.find_all(['dt', 'label', 'b', 'strong'])
            for label in labeled_specs:
                label_text = label.get_text(strip=True)
                if ':' in label_text:
                    key, value = label_text.split(':', 1)
                    specs[key.strip()] = value.strip()
                else:
                    # Look for value in next sibling
                    sibling = label.find_next_sibling()
                    if sibling:
                        value = sibling.get_text(strip=True)
                        if value:
                            specs[label_text] = value
            
            # Method 3: Extract common phone specifications from text
            text_content = soup.get_text()
            
            # Fallback patterns for common phone specifications
            spec_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|"|″)',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)',
                'RAM': r'(\d+)\s*GB\s*(?:RAM|Memory)',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal|ROM)',
                'Battery': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP(?:\s+main|\s+primary|\s+rear)?',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:front|selfie|secondary)',
                'Operating System': r'(Android|iOS)\s*[\d\.]*',
                'Processor': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*[\w\d\s]*',
                'Network': r'(2G|3G|4G|5G|LTE)',
                'Weight': r'(\d+)\s*(?:g|gram)',
                'Dimensions': r'(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*mm'
            }
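            # These regexes are loose heuristics over the full page text and can
            # match marketing copy; the loop below only fills in keys that the
            # table- and label-based methods above did not already provide.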
            
            for spec_name, pattern in spec_patterns.items():
                if spec_name not in specs:  # Don't override existing specs
                    matches = re.findall(pattern, text_content, re.IGNORECASE)
                    if matches:
                        if spec_name == 'Display Resolution':
                            specs[spec_name] = f"{matches[0][0]}x{matches[0][1]}"
                        elif spec_name == 'Dimensions':
                            specs[spec_name] = f"{matches[0][0]}×{matches[0][1]}×{matches[0][2]} mm"
                        else:
                            # findall returns tuples when a pattern has multiple groups
                            first = matches[0]
                            specs[spec_name] = first if isinstance(first, str) else ' '.join(first)
            
            phone_data['specifications'] = specs
            
            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
            return phone_data
            
        except Exception as e:
            logger.error(f"Error extracting specs from {phone_url}: {e}")
            return None
    
    def scrape_phone_by_name(self, phone_name, get_first_result=True):
        """Main method to scrape phone specs by name"""
        logger.info(f"Searching for: {phone_name}")
        
        # Search for the phone
        search_results = self.search_phone(phone_name)
        
        if not search_results:
            logger.warning(f"No results found for: {phone_name}")
            return None
        
        results = []
        
        # Process results
        targets = [search_results[0]] if get_first_result else search_results
        
        for result in targets:
            logger.info(f"Scraping: {result['title']}")
            
            phone_data = self.get_phone_specs(result['url'])
            if phone_data:
                results.append(phone_data)
                
            # Be respectful with requests
            time.sleep(1)
        
        return results[0] if get_first_result and results else results

    def scrape_multiple_phones(self, phone_names):
        """Scrape multiple phones and return structured JSON"""
        all_phones = []
        
        for phone_name in phone_names:
            try:
                phone_data = self.scrape_phone_by_name(phone_name)
                if phone_data:
                    all_phones.append(phone_data)
                time.sleep(2)  # Be respectful between requests
            except Exception as e:
                logger.error(f"Error scraping {phone_name}: {e}")
                continue
        
        return all_phones

    def save_to_json(self, data, filename):
        """Save data to JSON file"""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Data saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving to JSON: {e}")

# Standalone demo of the PhoneDB scraper with error handling and pointers to
# alternative sites; the combined entry point that also falls back to
# GSMArena is main(), defined near the end of the file.
def demo_phonedb():
    scraper = PhoneDBScraper()
    
    # Example 1: Scrape a single phone
    phone_name = "iPhone 15 Pro"
    print(f"Attempting to scrape: {phone_name}")
    
    result = scraper.scrape_phone_by_name(phone_name)
    
    if result:
        print(f"βœ… Successfully scraped {result['name']}")
        print(f"Found {len(result['specifications'])} specifications")
        print(f"Found {len(result['images'])} images")
        print(json.dumps(result, indent=2))
        scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_specs.json")
    else:
        print(f"❌ Failed to scrape {phone_name}")
        print("This might be due to:")
        print("1. PhoneDB.net blocking automated requests")
        print("2. Phone not found in their database")
        print("3. Site structure changes")
        print("\nAlternative solutions:")
        print("- Try with a different phone name")
        print("- Use a VPN if blocked by IP")
        print("- Consider using alternative sites like GSMArena")
    
    # Example 2: Test with multiple phones
    phone_list = [
        "Samsung Galaxy S24",
        "Google Pixel 8", 
        "OnePlus 12"
    ]
    
    print(f"\nTesting multiple phones: {phone_list}")
    results = scraper.scrape_multiple_phones(phone_list)
    
    if results:
        scraper.save_to_json(results, "multiple_phones_specs.json")
        print(f"βœ… Successfully scraped {len(results)}/{len(phone_list)} phones")
        
        for phone in results:
            print(f"- {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
    else:
        print("❌ No phones were successfully scraped")

# Enhanced GSMArena scraper as main alternative
class GSMArenaScraperAlternative:
    """Enhanced GSMArena scraper with full functionality"""
    
    def __init__(self):
        self.base_url = "https://www.gsmarena.com"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
    
    def search_phone(self, phone_name):
        """Search GSMArena for phone"""
        search_url = f"{self.base_url}/results.php3"
        params = {'sQuickSearch': 'yes', 'sName': phone_name}
        
        try:
            logger.info(f"Searching GSMArena for: {phone_name}")
            response = self.session.get(search_url, params=params, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            results = []
            
            # Find search results in makers section
            makers = soup.find_all('div', class_='makers')
            for maker in makers:
                links = maker.find_all('a')
                for link in links[:5]:  # Limit results
                    href = link.get('href', '')
                    title = link.get_text(strip=True)
                    
                    if href and title and phone_name.lower().replace(' ', '') in title.lower().replace(' ', ''):
                        full_url = self.base_url + '/' + href if not href.startswith('http') else href
                        results.append({
                            'title': title,
                            'url': full_url
                        })
            
            logger.info(f"Found {len(results)} results on GSMArena")
            return results
            
        except Exception as e:
            logger.error(f"GSMArena search failed: {e}")
            return []
    
    def get_phone_specs(self, phone_url):
        """Extract detailed specifications from GSMArena phone page"""
        try:
            logger.info(f"Fetching specs from GSMArena: {phone_url}")
            response = self.session.get(phone_url, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            phone_data = {
                'name': '',
                'brand': '',
                'images': [],
                'specifications': {},
                'source_url': phone_url
            }
            
            # Get phone name
            title_elem = soup.find('h1', class_='specs-phone-name-title')
            if not title_elem:
                title_elem = soup.find('h1') or soup.find('title')
            
            if title_elem:
                phone_data['name'] = title_elem.get_text(strip=True)
                phone_data['brand'] = phone_data['name'].split()[0] if phone_data['name'] else ''
            
            # Get images
            images = []
            
            # Main phone image
            main_img_container = soup.find('div', class_='specs-photo-main')
            if main_img_container:
                img = main_img_container.find('img')
                if img and img.get('src'):
                    img_url = urljoin(phone_url, img['src'])
                    images.append(img_url)
            
            # Additional images from carousel or gallery
            carousel = soup.find('div', class_='carousel-item') or soup.find('div', class_='specs-photos')
            if carousel:
                for img in carousel.find_all('img'):
                    src = img.get('src', '')
                    if src:
                        img_url = urljoin(phone_url, src)
                        if img_url not in images:
                            images.append(img_url)
            
            phone_data['images'] = images[:5]
            
            # Extract specifications from GSMArena's table structure
            specs = {}
            
            # GSMArena uses specific table structure
            spec_tables = soup.find_all('table', cellspacing='0')
            
            for table in spec_tables:
                # Get category header
                category = ''
                category_elem = table.find_previous('th') or table.find_previous('h2')
                if category_elem:
                    category = category_elem.get_text(strip=True)
                
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True)
                        value = cells[1].get_text(strip=True)
                        
                        # Clean up the key and value
                        key = re.sub(r'[^\w\s]', '', key).strip()
                        value = re.sub(r'\s+', ' ', value).strip()
                        
                        if key and value and len(key) < 100:
                            # Add category prefix if available
                            final_key = f"{category} - {key}" if category and len(category) < 30 else key
                            specs[final_key] = value
            
            # Also extract from the detailed specs list structure
            detail_lists = soup.find_all(['ul', 'li'], class_=re.compile(r'spec|detail'))
            for detail_list in detail_lists:
                items = detail_list.find_all('li') if detail_list.name == 'ul' else [detail_list]
                for item in items:
                    text = item.get_text(strip=True)
                    if ':' in text:
                        parts = text.split(':', 1)
                        if len(parts) == 2:
                            key, value = parts
                            specs[key.strip()] = value.strip()
            
            # Extract key specs using patterns from page text
            page_text = soup.get_text()
            
            key_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|")\s*display',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)\s*pixels',
                'RAM': r'(\d+)\s*GB\s*RAM',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal)',
                'Battery Capacity': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:main|primary|rear)',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*front',
                'Operating System': r'(Android|iOS)\s*([\d\.]+)?',
                'Chipset': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*([\w\d\s]+)?',
                'Weight': r'Weight\s*:?\s*(\d+(?:\.\d+)?)\s*g',  # matches text like "Weight 187 g"
                'Launch Date': r'(January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})'
            }
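            # As in PhoneDBScraper, regex-derived values only fill in specs
            # that the table extraction above did not already provide.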
            
            for spec_name, pattern in key_patterns.items():
                if spec_name not in specs:
                    match = re.search(pattern, page_text, re.IGNORECASE)
                    if match:
                        if spec_name == 'Display Resolution':
                            specs[spec_name] = f"{match.group(1)}Γ—{match.group(2)}"
                        elif spec_name == 'Launch Date':
                            specs[spec_name] = f"{match.group(1)} {match.group(2)}"
                        else:
                            specs[spec_name] = match.group(0)
            
            phone_data['specifications'] = specs
            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
            
            return phone_data
            
        except Exception as e:
            logger.error(f"Error extracting GSMArena specs from {phone_url}: {e}")
            return None
    
    def scrape_phone_by_name(self, phone_name, get_first_result=True):
        """Main method to scrape phone specs by name from GSMArena"""
        search_results = self.search_phone(phone_name)
        
        if not search_results:
            logger.warning(f"No results found for: {phone_name}")
            return None
        
        results = []
        targets = [search_results[0]] if get_first_result else search_results
        
        for result in targets:
            logger.info(f"Scraping: {result['title']}")
            phone_data = self.get_phone_specs(result['url'])
            if phone_data:
                results.append(phone_data)
            time.sleep(2)  # Be respectful
        
        return results[0] if get_first_result and results else results
    
    def scrape_multiple_phones(self, phone_names):
        """Scrape multiple phones from GSMArena"""
        all_phones = []
        
        for phone_name in phone_names:
            try:
                phone_data = self.scrape_phone_by_name(phone_name)
                if phone_data:
                    all_phones.append(phone_data)
                time.sleep(3)  # Be respectful between requests
            except Exception as e:
                logger.error(f"Error scraping {phone_name}: {e}")
                continue
        
        return all_phones
    
    def save_to_json(self, data, filename):
        """Save data to JSON file"""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Data saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving to JSON: {e}")

def test_alternative_scraper():
    """Test the enhanced GSMArena scraper"""
    print("\n" + "="*50)
    print("Testing Enhanced GSMArena Scraper")
    print("="*50)
    
    gsm_scraper = GSMArenaScraperAlternative()
    
    # Test single phone
    phone_name = "iPhone 15 Pro"
    print(f"Testing single phone: {phone_name}")
    
    result = gsm_scraper.scrape_phone_by_name(phone_name)
    
    if result:
        print(f"βœ… Successfully scraped: {result['name']}")
        print(f"πŸ“± Found {len(result['specifications'])} specifications")
        print(f"πŸ–ΌοΈ Found {len(result['images'])} images")
        
        # Show some key specs
        key_specs = ['Display Size', 'RAM', 'Storage', 'Battery Capacity', 'Main Camera']
        print("\nπŸ“‹ Key Specifications:")
        for spec in key_specs:
            for key, value in result['specifications'].items():
                if spec.lower() in key.lower():
                    print(f"  β€’ {key}: {value}")
                    break
        
        # Save result
        gsm_scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_gsmarena_specs.json")
        
    else:
        print(f"❌ Failed to scrape {phone_name}")
    
    # Test multiple phones
    print(f"\n" + "-"*40)
    print("Testing Multiple Phones")
    print("-"*40)
    
    phone_list = ["Samsung Galaxy S24", "Google Pixel 8"]
    results = gsm_scraper.scrape_multiple_phones(phone_list)
    
    if results:
        print(f"βœ… Successfully scraped {len(results)}/{len(phone_list)} phones")
        gsm_scraper.save_to_json(results, "multiple_phones_gsmarena_specs.json")
        
        for phone in results:
            print(f"πŸ“± {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
    else:
        print("❌ No phones were successfully scraped")

# Main function with both scrapers
def main():
    print("πŸš€ Phone Specifications Scraper")
    print("="*50)
    
    # Try PhoneDB first
    try:
        print("Attempting PhoneDB scraper...")
        scraper = PhoneDBScraper()
        phone_name = "iPhone 15 Pro"
        result = scraper.scrape_phone_by_name(phone_name)
        
        if result:
            print(f"βœ… PhoneDB: Successfully scraped {result['name']}")
            scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_phonedb_specs.json")
            return
        else:
            print("❌ PhoneDB scraper failed, trying GSMArena...")
            
    except Exception as e:
        print(f"❌ PhoneDB initialization failed: {str(e)}")
        print("πŸ”„ Switching to GSMArena scraper...")
    
    # Use GSMArena as fallback
    test_alternative_scraper()

if __name__ == "__main__":
    # main()            # try PhoneDB first, fall back to GSMArena
    # demo_phonedb()    # run only the PhoneDB demo
    test_alternative_scraper()  # run only the GSMArena scraper