Spaces:

kambris
/

Soup

Runtime error

File size: 12,514 Bytes

2bd6de2
 
 
 
 
 
 
 
3bd337f
d1c8665
3bd337f
 
 
d1c8665
 
 
3bd337f
 
 
 
 
 
 
 
 
 
 
 
 
d1c8665
3bd337f
 
d1c8665
3bd337f
 
 
d1c8665
3bd337f
 
 
 
d1c8665
3bd337f
d1c8665
3bd337f
 
d1c8665
 
3bd337f
d1c8665
 
3bd337f
 
 
d1c8665
3bd337f
 
 
 
 
 
 
d1c8665
3bd337f
d1c8665
3bd337f
 
 
 
 
 
 
 
 
 
 
d1c8665
3bd337f
 
 
 
d1c8665
3bd337f
d1c8665
3bd337f
 
d1c8665
 
3bd337f
 
d1c8665
3bd337f
 
 
 
 
 
 
 
 
 
 
 
 
d1c8665
 
 
3bd337f
 
 
d1c8665
3bd337f
 
 
d1c8665
3bd337f
 
 
d1c8665
3bd337f
 
 
 
 
d1c8665
3bd337f
 
 
 
 
 
 
 
2bd6de2
3bd337f
d1c8665
3bd337f
 
 
55fd359
3bd337f
d1c8665
3bd337f
 
 
33a28ec
3bd337f
 
 
 
 
 
 
 
33a28ec
3bd337f
2bd6de2
3bd337f
 
 
33a28ec
3bd337f
33a28ec
 
 
2bd6de2
3bd337f
 
 
 
 
 
 
 
 
 
2bd6de2
 
3bd337f
2bd6de2
3bd337f
 
 
 
 
 
2bd6de2
3bd337f
 
 
 
2bd6de2
3bd337f
 
5852f69
3bd337f
5852f69
3bd337f
 
 
 
 
 
5852f69
3bd337f
 
5852f69
3bd337f
 
5852f69
3bd337f
 
 
 
 
5852f69
3bd337f
 
 
 
 
 
5852f69
3bd337f
 
 
2bd6de2
3bd337f
 
 
2bd6de2
 
3bd337f
 
 
 
2bd6de2
3bd337f
 
2bd6de2
3bd337f
 
 
 
 
 
 
 
 
 
 
2bd6de2
3bd337f
 
 
 
 
 
2bd6de2
3bd337f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1c8665
 
3bd337f
d1c8665
3bd337f
d1c8665
3bd337f
 
 
d1c8665
3bd337f
d1c8665
 
 
 
 
3bd337f
 
 
2bd6de2
b15e2d6
3bd337f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1c8665
1e189cd
2bd6de2
3bd337f

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from typing import Dict, List, Optional
import json
from datetime import datetime
import io

class ManusCopistaRequestsScraper:
    def __init__(self):
        self.base_url = "https://manus.iccu.sbn.it"
        self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
        self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"
        
        # Setup session with proper headers
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
        })
    
    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch a page and return BeautifulSoup object"""
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            
            # Check if we got a proper response
            if response.status_code != 200:
                print(f"Bad status code: {response.status_code}")
                return None
            
            return BeautifulSoup(response.text, 'html.parser')
            
        except requests.exceptions.RequestException as e:
            print(f"Request error for {url}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error for {url}: {e}")
            return None
    
    def discover_copyist_ids(self) -> List[str]:
        """Discover copyist IDs from the browse page"""
        print("Discovering copyist IDs...")
        
        # Try different approaches to get the data
        urls_to_try = [
            self.browse_url,
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse",
            "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse?delta=50",
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse?delta=100"
        ]
        
        all_ids = set()
        
        for url in urls_to_try:
            soup = self.get_page(url)
            if soup:
                ids = self.extract_ids_from_page(soup)
                all_ids.update(ids)
                print(f"Found {len(ids)} IDs from {url}")
                
                # If we found IDs, try to get more from pagination
                if ids:
                    pagination_ids = self.handle_pagination(soup, url)
                    all_ids.update(pagination_ids)
        
        # If no IDs found from browse page, try a range-based approach
        if not all_ids:
            print("No IDs found from browse page, trying range-based discovery...")
            all_ids = self.discover_ids_by_range()
        
        return sorted(list(all_ids))
    
    def extract_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
        """Extract copyist IDs from a page"""
        ids = set()
        
        # Look for links that contain detail/ followed by numbers
        links = soup.find_all('a', href=True)
        for link in links:
            href = link.get('href', '')
            match = re.search(r'detail/(\d+)', href)
            if match:
                copyist_id = match.group(1)
                if len(copyist_id) >= 5:  # Valid ID length
                    ids.add(copyist_id)
        
        # Also look for any numbers that might be IDs in the page
        text = soup.get_text()
        numbers = re.findall(r'\b\d{6,7}\b', text)
        for num in numbers:
            if self.is_valid_id_format(num):
                ids.add(num)
        
        return list(ids)
    
    def handle_pagination(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """Handle pagination to get more IDs"""
        all_ids = set()
        
        # Look for pagination links
        pagination_links = []
        links = soup.find_all('a', href=True)
        
        for link in links:
            href = link.get('href', '')
            text = link.get_text(strip=True).lower()
            
            # Look for next page or numbered pages
            if any(word in text for word in ['next', 'seguente', 'page', 'pagina']) or text.isdigit():
                if href and href.startswith('/'):
                    full_url = self.base_url + href
                    pagination_links.append(full_url)
        
        # Visit pagination pages
        for page_url in pagination_links[:10]:  # Limit to prevent infinite loops
            print(f"Checking pagination page: {page_url}")
            page_soup = self.get_page(page_url)
            if page_soup:
                page_ids = self.extract_ids_from_page(page_soup)
                all_ids.update(page_ids)
                time.sleep(1)  # Be respectful
        
        return list(all_ids)
    
    def discover_ids_by_range(self, start_id: int = 100000, end_id: int = 999999, sample_size: int = 1000) -> List[str]:
        """Discover IDs by testing a range of potential IDs"""
        print(f"Testing range-based discovery with {sample_size} samples...")
        
        valid_ids = []
        
        # Test a sample of IDs in the range
        import random
        test_ids = random.sample(range(start_id, end_id), min(sample_size, end_id - start_id))
        
        for i, test_id in enumerate(test_ids):
            if i % 100 == 0:
                print(f"Tested {i}/{len(test_ids)} IDs, found {len(valid_ids)} valid")
            
            if self.test_id_exists(str(test_id)):
                valid_ids.append(str(test_id))
            
            time.sleep(0.1)  # Small delay
        
        return valid_ids
    
    def test_id_exists(self, copyist_id: str) -> bool:
        """Test if a copyist ID exists by making a HEAD request"""
        url = f"{self.detail_base_url}{copyist_id}"
        try:
            response = self.session.head(url, timeout=5)
            return response.status_code == 200
        except:
            return False
    
    def is_valid_id_format(self, id_str: str) -> bool:
        """Check if string looks like a valid copyist ID"""
        if not id_str.isdigit():
            return False
        return 5 <= len(id_str) <= 7
    
    def scrape_copyist_detail(self, copyist_id: str) -> Dict:
        """Scrape detailed information for a single copyist"""
        url = f"{self.detail_base_url}{copyist_id}"
        soup = self.get_page(url)
        
        if not soup:
            return {'error': f'Could not fetch page for ID {copyist_id}'}
        
        # Extract basic info
        data = {
            'copyist_id': copyist_id,
            'detail_url': url,
            'scrape_timestamp': datetime.now().isoformat()
        }
        
        # Extract title
        title = soup.find('title')
        if title:
            data['page_title'] = title.get_text(strip=True)
        
        # Extract main content
        self.extract_copyist_data(soup, data)
        
        return data
    
    def extract_copyist_data(self, soup: BeautifulSoup, data: Dict):
        """Extract copyist data from the page"""
        # Try to find the main content table
        table = soup.find('table', class_='table')
        if not table:
            table = soup.find('table')
        
        if table:
            self.extract_table_data(table, data)
        
        # Try to extract name from various locations
        name_candidates = []
        
        # Look in headings
        for heading in soup.find_all(['h1', 'h2', 'h3']):
            text = heading.get_text(strip=True)
            if text and len(text) > 2:
                name_candidates.append(text)
        
        # Look in title
        if 'page_title' in data:
            title_parts = data['page_title'].split(' - ')
            for part in title_parts:
                if part.strip() and len(part.strip()) > 2:
                    name_candidates.append(part.strip())
        
        # Set the most likely name
        if name_candidates:
            data['copyist_name'] = name_candidates[0]
    
    def extract_table_data(self, table, data: Dict):
        """Extract data from the main table"""
        rows = table.find_all('tr')
        
        for row in rows:
            cells = row.find_all(['td', 'th'])
            if len(cells) >= 2:
                key_cell = cells[0]
                value_cell = cells[1]
                
                key = key_cell.get_text(strip=True).lower()
                value = value_cell.get_text(strip=True)
                
                # Map common fields
                if 'cnmn' in key:
                    data['cnmn_code'] = value
                elif 'sbn' in key:
                    data['vid_sbn'] = value
                    link = value_cell.find('a')
                    if link:
                        data['vid_sbn_url'] = link.get('href', '')
                elif 'isni' in key:
                    data['isni_code'] = value
                    link = value_cell.find('a')
                    if link:
                        data['isni_url'] = link.get('href', '')
                elif 'biographical' in key or 'biografica' in key:
                    data['biographical_note'] = value
                elif 'bibliographical' in key or 'bibliografia' in key:
                    if 'source' in key:
                        data['bibliographical_sources'] = value
                    else:
                        data['bibliographical_notes'] = value
                elif 'name' in key and 'manuscript' in key:
                    data['names_in_manuscript'] = value
                elif 'creation' in key or 'creazione' in key:
                    data['date_of_creation'] = value
                elif 'modification' in key or 'modifica' in key:
                    data['last_modification'] = value
                elif 'identifier' in key:
                    data['other_identifiers'] = value
    
    def scrape_all_copyists(self, delay: float = 1.0, max_entries: int = None) -> pd.DataFrame:
        """Scrape all copyists"""
        print("Starting full scrape...")
        
        # Discover IDs
        copyist_ids = self.discover_copyist_ids()
        print(f"Found {len(copyist_ids)} copyist IDs")
        
        if not copyist_ids:
            print("No copyist IDs found!")
            return pd.DataFrame()
        
        # Limit if requested
        if max_entries and max_entries > 0:
            copyist_ids = copyist_ids[:max_entries]
            print(f"Limited to {max_entries} entries")
        
        # Scrape each copyist
        all_data = []
        
        for i, copyist_id in enumerate(copyist_ids, 1):
            print(f"Scraping {i}/{len(copyist_ids)}: ID {copyist_id}")
            
            data = self.scrape_copyist_detail(copyist_id)
            
            if 'error' not in data:
                data['scrape_order'] = i
                all_data.append(data)
            else:
                print(f"Error scraping {copyist_id}: {data['error']}")
            
            # Delay between requests
            if delay > 0:
                time.sleep(delay)
        
        df = pd.DataFrame(all_data)
        print(f"Successfully scraped {len(df)} copyists")
        return df


# Simple usage example
def main():
    """Main function to run the scraper"""
    scraper = ManusCopistaRequestsScraper()
    
    # Test with a small number first
    print("Testing with 10 entries...")
    df = scraper.scrape_all_copyists(delay=1.0, max_entries=10)
    
    if not df.empty:
        print(f"Successfully scraped {len(df)} copyists")
        print("\nColumns:", df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())
        
        # Save to CSV
        filename = f"manus_copyists_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False)
        print(f"\nSaved to {filename}")
    else:
        print("No data scraped!")


if __name__ == "__main__":
    main()