"""Scraper for copyist authority records on MANUS (manus.iccu.sbn.it).

Discovers copyist IDs from the site's browse pages (falling back to random
ID probing), scrapes each detail page, and collects the results into a
pandas DataFrame.
"""

import io
import json
import random
import re
import time
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup


class ManusCopistaRequestsScraper:
    """Scrapes copyist authority records via plain HTTP requests (no browser)."""

    def __init__(self):
        self.base_url = "https://manus.iccu.sbn.it"
        self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
        self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"

        # Browser-like headers reduce the chance of being served an
        # error/anti-bot page instead of real content.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
        })

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch ``url`` and return a parsed document, or None on any failure.

        Failures are printed and swallowed so one bad page cannot abort a
        long scrape run.
        """
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=15)
            # Raises for any non-2xx status, so no manual status check is
            # needed afterwards.
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"Request error for {url}: {e}")
            return None
        except Exception as e:
            # Catch-all boundary: log and continue rather than crash the run.
            print(f"Unexpected error for {url}: {e}")
            return None

    def discover_copyist_ids(self) -> List[str]:
        """Discover copyist IDs from the browse page.

        Tries several browse URL variants, follows pagination, and falls
        back to random-range probing if nothing is found.

        Returns:
            Sorted list of ID strings.
        """
        print("Discovering copyist IDs...")

        # Try different approaches to get the data
        urls_to_try = [
            self.browse_url,
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse",
            "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse?delta=50",
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse?delta=100",
        ]

        all_ids = set()
        for url in urls_to_try:
            soup = self.get_page(url)
            if soup:
                ids = self.extract_ids_from_page(soup)
                all_ids.update(ids)
                print(f"Found {len(ids)} IDs from {url}")

                # If we found IDs, try to get more from pagination
                if ids:
                    pagination_ids = self.handle_pagination(soup, url)
                    all_ids.update(pagination_ids)

        # If no IDs found from browse page, try a range-based approach
        if not all_ids:
            print("No IDs found from browse page, trying range-based discovery...")
            all_ids = self.discover_ids_by_range()

        return sorted(list(all_ids))

    def extract_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
        """Extract candidate copyist IDs from a parsed page.

        Looks for ``detail/<digits>`` hrefs first, then scans the page text
        for 6-7 digit numbers that look like IDs.
        """
        ids = set()

        # Look for links that contain detail/ followed by numbers
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            match = re.search(r'detail/(\d+)', href)
            if match:
                copyist_id = match.group(1)
                if len(copyist_id) >= 5:  # Valid ID length
                    ids.add(copyist_id)

        # Also look for any numbers that might be IDs in the page
        text = soup.get_text()
        for num in re.findall(r'\b\d{6,7}\b', text):
            if self.is_valid_id_format(num):
                ids.add(num)

        return list(ids)

    def handle_pagination(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """Follow pagination links found in ``soup`` and collect IDs from them.

        Only site-relative links are followed, at most 10 pages, with a 1s
        delay between requests.
        """
        all_ids = set()

        # Collect candidate pagination links (de-duplicated, order kept).
        pagination_links = []
        seen = set()
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True).lower()

            # Look for next page or numbered pages
            if any(word in text for word in ['next', 'seguente', 'page', 'pagina']) or text.isdigit():
                if href and href.startswith('/'):
                    full_url = self.base_url + href
                    if full_url not in seen:
                        seen.add(full_url)
                        pagination_links.append(full_url)

        # Visit pagination pages (limit to prevent infinite loops)
        for page_url in pagination_links[:10]:
            print(f"Checking pagination page: {page_url}")
            page_soup = self.get_page(page_url)
            if page_soup:
                all_ids.update(self.extract_ids_from_page(page_soup))
            time.sleep(1)  # Be respectful

        return list(all_ids)

    def discover_ids_by_range(self, start_id: int = 100000, end_id: int = 999999,
                              sample_size: int = 1000) -> List[str]:
        """Discover IDs by probing a random sample of the numeric ID range.

        Args:
            start_id: Inclusive lower bound of the candidate range.
            end_id: Exclusive upper bound of the candidate range.
            sample_size: Maximum number of IDs to probe.

        Returns:
            List of ID strings whose detail URL responded with HTTP 200.
        """
        print(f"Testing range-based discovery with {sample_size} samples...")
        valid_ids = []

        test_ids = random.sample(range(start_id, end_id),
                                 min(sample_size, end_id - start_id))

        for i, test_id in enumerate(test_ids):
            if i % 100 == 0:
                print(f"Tested {i}/{len(test_ids)} IDs, found {len(valid_ids)} valid")

            if self.test_id_exists(str(test_id)):
                valid_ids.append(str(test_id))

            time.sleep(0.1)  # Small delay

        return valid_ids

    def test_id_exists(self, copyist_id: str) -> bool:
        """Return True if the detail page for ``copyist_id`` answers HTTP 200."""
        url = f"{self.detail_base_url}{copyist_id}"
        try:
            # HEAD is enough to test existence and avoids downloading the body.
            response = self.session.head(url, timeout=5)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            return False

    def is_valid_id_format(self, id_str: str) -> bool:
        """Return True if ``id_str`` is all digits and 5-7 characters long."""
        if not id_str.isdigit():
            return False
        return 5 <= len(id_str) <= 7

    def scrape_copyist_detail(self, copyist_id: str) -> Dict:
        """Scrape detailed information for a single copyist.

        Returns a dict of extracted fields, or ``{'error': ...}`` when the
        page could not be fetched.
        """
        url = f"{self.detail_base_url}{copyist_id}"
        soup = self.get_page(url)

        if not soup:
            return {'error': f'Could not fetch page for ID {copyist_id}'}

        data = {
            'copyist_id': copyist_id,
            'detail_url': url,
            'scrape_timestamp': datetime.now().isoformat(),
        }

        # Extract title
        title = soup.find('title')
        if title:
            data['page_title'] = title.get_text(strip=True)

        # Extract main content (mutates `data` in place)
        self.extract_copyist_data(soup, data)

        return data

    def extract_copyist_data(self, soup: BeautifulSoup, data: Dict):
        """Extract copyist fields from a detail page into ``data`` (in place)."""
        # Try to find the main content table
        table = soup.find('table', class_='table')
        if not table:
            table = soup.find('table')

        if table:
            self.extract_table_data(table, data)

        # Try to extract name from various locations
        name_candidates = []

        # Look in headings
        for heading in soup.find_all(['h1', 'h2', 'h3']):
            text = heading.get_text(strip=True)
            if text and len(text) > 2:
                name_candidates.append(text)

        # Look in title
        if 'page_title' in data:
            title_parts = data['page_title'].split(' - ')
            for part in title_parts:
                if part.strip() and len(part.strip()) > 2:
                    name_candidates.append(part.strip())

        # Set the most likely name (first candidate wins: headings first)
        if name_candidates:
            data['copyist_name'] = name_candidates[0]

    def extract_table_data(self, table, data: Dict):
        """Map key/value rows of the detail table into named ``data`` fields."""
        for row in table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if len(cells) < 2:
                continue

            key = cells[0].get_text(strip=True).lower()
            value_cell = cells[1]
            value = value_cell.get_text(strip=True)

            # Map common fields; Italian labels are matched alongside English.
            if 'cnmn' in key:
                data['cnmn_code'] = value
            elif 'sbn' in key:
                data['vid_sbn'] = value
                link = value_cell.find('a')
                if link:
                    data['vid_sbn_url'] = link.get('href', '')
            elif 'isni' in key:
                data['isni_code'] = value
                link = value_cell.find('a')
                if link:
                    data['isni_url'] = link.get('href', '')
            elif 'biographical' in key or 'biografica' in key:
                data['biographical_note'] = value
            elif 'bibliographical' in key or 'bibliografia' in key:
                if 'source' in key:
                    data['bibliographical_sources'] = value
                else:
                    data['bibliographical_notes'] = value
            elif 'name' in key and 'manuscript' in key:
                data['names_in_manuscript'] = value
            elif 'creation' in key or 'creazione' in key:
                data['date_of_creation'] = value
            elif 'modification' in key or 'modifica' in key:
                data['last_modification'] = value
            elif 'identifier' in key:
                data['other_identifiers'] = value

    def scrape_all_copyists(self, delay: float = 1.0,
                            max_entries: Optional[int] = None) -> pd.DataFrame:
        """Discover and scrape all copyists.

        Args:
            delay: Seconds to sleep between detail-page requests.
            max_entries: If set and positive, scrape at most this many IDs.

        Returns:
            DataFrame with one row per successfully scraped copyist
            (empty if discovery found nothing).
        """
        print("Starting full scrape...")

        copyist_ids = self.discover_copyist_ids()
        print(f"Found {len(copyist_ids)} copyist IDs")

        if not copyist_ids:
            print("No copyist IDs found!")
            return pd.DataFrame()

        # Limit if requested
        if max_entries and max_entries > 0:
            copyist_ids = copyist_ids[:max_entries]
            print(f"Limited to {max_entries} entries")

        # Scrape each copyist
        all_data = []
        for i, copyist_id in enumerate(copyist_ids, 1):
            print(f"Scraping {i}/{len(copyist_ids)}: ID {copyist_id}")

            data = self.scrape_copyist_detail(copyist_id)
            if 'error' not in data:
                data['scrape_order'] = i
                all_data.append(data)
            else:
                print(f"Error scraping {copyist_id}: {data['error']}")

            # Delay between requests
            if delay > 0:
                time.sleep(delay)

        df = pd.DataFrame(all_data)
        print(f"Successfully scraped {len(df)} copyists")
        return df


def main():
    """Run a small test scrape (10 entries) and save the result to CSV."""
    scraper = ManusCopistaRequestsScraper()

    # Test with a small number first
    print("Testing with 10 entries...")
    df = scraper.scrape_all_copyists(delay=1.0, max_entries=10)

    if not df.empty:
        print(f"Successfully scraped {len(df)} copyists")
        print("\nColumns:", df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())

        # Save to CSV
        filename = f"manus_copyists_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False)
        # Fixed: the message previously did not interpolate the filename.
        print(f"\nSaved to {filename}")
    else:
        print("No data scraped!")


if __name__ == "__main__":
    main()