"""Scraper for copyist authority records on MANUS (manus.iccu.sbn.it).

Discovers copyist IDs from the site's browse pages (falling back to random
ID probing), scrapes each detail page, and collects the results into a
pandas DataFrame.
"""

import io
import json
import random
import re
import time
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup


class ManusCopistaRequestsScraper:
    """Scrapes copyist authority records via plain HTTP requests (no browser)."""

    def __init__(self):
        self.base_url = "https://manus.iccu.sbn.it"
        self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
        self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"

        # Browser-like headers reduce the chance of being served an
        # error/anti-bot page instead of real content.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
        })

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch ``url`` and return a parsed document, or None on any failure.

        Failures are printed and swallowed so one bad page cannot abort a
        long scrape run.
        """
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=15)
            # Raises for any non-2xx status, so no manual status check is
            # needed afterwards.
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"Request error for {url}: {e}")
            return None
        except Exception as e:
            # Catch-all boundary: log and continue rather than crash the run.
            print(f"Unexpected error for {url}: {e}")
            return None

    def discover_copyist_ids(self) -> List[str]:
        """Discover copyist IDs from the browse page.

        Tries several browse URL variants, follows pagination, and falls
        back to random-range probing if nothing is found.

        Returns:
            Sorted list of ID strings.
        """
        print("Discovering copyist IDs...")

        # Try different approaches to get the data
        urls_to_try = [
            self.browse_url,
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse",
            "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse?delta=50",
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse?delta=100",
        ]

        all_ids = set()
        for url in urls_to_try:
            soup = self.get_page(url)
            if soup:
                ids = self.extract_ids_from_page(soup)
                all_ids.update(ids)
                print(f"Found {len(ids)} IDs from {url}")

                # If we found IDs, try to get more from pagination
                if ids:
                    pagination_ids = self.handle_pagination(soup, url)
                    all_ids.update(pagination_ids)

        # If no IDs found from browse page, try a range-based approach
        if not all_ids:
            print("No IDs found from browse page, trying range-based discovery...")
            all_ids = self.discover_ids_by_range()

        return sorted(list(all_ids))

    def extract_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
        """Extract candidate copyist IDs from a parsed page.

        Looks for ``detail/<digits>`` hrefs first, then scans the page text
        for 6-7 digit numbers that look like IDs.
        """
        ids = set()

        # Look for links that contain detail/ followed by numbers
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            match = re.search(r'detail/(\d+)', href)
            if match:
                copyist_id = match.group(1)
                if len(copyist_id) >= 5:  # Valid ID length
                    ids.add(copyist_id)

        # Also look for any numbers that might be IDs in the page
        text = soup.get_text()
        for num in re.findall(r'\b\d{6,7}\b', text):
            if self.is_valid_id_format(num):
                ids.add(num)

        return list(ids)

    def handle_pagination(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """Follow pagination links found in ``soup`` and collect IDs from them.

        Only site-relative links are followed, at most 10 pages, with a 1s
        delay between requests.
        """
        all_ids = set()

        # Collect candidate pagination links (de-duplicated, order kept).
        pagination_links = []
        seen = set()
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True).lower()

            # Look for next page or numbered pages
            if any(word in text for word in ['next', 'seguente', 'page', 'pagina']) or text.isdigit():
                if href and href.startswith('/'):
                    full_url = self.base_url + href
                    if full_url not in seen:
                        seen.add(full_url)
                        pagination_links.append(full_url)

        # Visit pagination pages (limit to prevent infinite loops)
        for page_url in pagination_links[:10]:
            print(f"Checking pagination page: {page_url}")
            page_soup = self.get_page(page_url)
            if page_soup:
                all_ids.update(self.extract_ids_from_page(page_soup))
            time.sleep(1)  # Be respectful

        return list(all_ids)

    def discover_ids_by_range(self, start_id: int = 100000, end_id: int = 999999,
                              sample_size: int = 1000) -> List[str]:
        """Discover IDs by probing a random sample of the numeric ID range.

        Args:
            start_id: Inclusive lower bound of the candidate range.
            end_id: Exclusive upper bound of the candidate range.
            sample_size: Maximum number of IDs to probe.

        Returns:
            List of ID strings whose detail URL responded with HTTP 200.
        """
        print(f"Testing range-based discovery with {sample_size} samples...")
        valid_ids = []

        test_ids = random.sample(range(start_id, end_id),
                                 min(sample_size, end_id - start_id))

        for i, test_id in enumerate(test_ids):
            if i % 100 == 0:
                print(f"Tested {i}/{len(test_ids)} IDs, found {len(valid_ids)} valid")

            if self.test_id_exists(str(test_id)):
                valid_ids.append(str(test_id))

            time.sleep(0.1)  # Small delay

        return valid_ids

    def test_id_exists(self, copyist_id: str) -> bool:
        """Return True if the detail page for ``copyist_id`` answers HTTP 200."""
        url = f"{self.detail_base_url}{copyist_id}"
        try:
            # HEAD is enough to test existence and avoids downloading the body.
            response = self.session.head(url, timeout=5)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            return False

    def is_valid_id_format(self, id_str: str) -> bool:
        """Return True if ``id_str`` is all digits and 5-7 characters long."""
        if not id_str.isdigit():
            return False
        return 5 <= len(id_str) <= 7

    def scrape_copyist_detail(self, copyist_id: str) -> Dict:
        """Scrape detailed information for a single copyist.

        Returns a dict of extracted fields, or ``{'error': ...}`` when the
        page could not be fetched.
        """
        url = f"{self.detail_base_url}{copyist_id}"
        soup = self.get_page(url)

        if not soup:
            return {'error': f'Could not fetch page for ID {copyist_id}'}

        data = {
            'copyist_id': copyist_id,
            'detail_url': url,
            'scrape_timestamp': datetime.now().isoformat(),
        }

        # Extract title
        title = soup.find('title')
        if title:
            data['page_title'] = title.get_text(strip=True)

        # Extract main content (mutates `data` in place)
        self.extract_copyist_data(soup, data)

        return data

    def extract_copyist_data(self, soup: BeautifulSoup, data: Dict):
        """Extract copyist fields from a detail page into ``data`` (in place)."""
        # Try to find the main content table
        table = soup.find('table', class_='table')
        if not table:
            table = soup.find('table')

        if table:
            self.extract_table_data(table, data)

        # Try to extract name from various locations
        name_candidates = []

        # Look in headings
        for heading in soup.find_all(['h1', 'h2', 'h3']):
            text = heading.get_text(strip=True)
            if text and len(text) > 2:
                name_candidates.append(text)

        # Look in title
        if 'page_title' in data:
            title_parts = data['page_title'].split(' - ')
            for part in title_parts:
                if part.strip() and len(part.strip()) > 2:
                    name_candidates.append(part.strip())

        # Set the most likely name (first candidate wins: headings first)
        if name_candidates:
            data['copyist_name'] = name_candidates[0]

    def extract_table_data(self, table, data: Dict):
        """Map key/value rows of the detail table into named ``data`` fields."""
        for row in table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if len(cells) < 2:
                continue

            key = cells[0].get_text(strip=True).lower()
            value_cell = cells[1]
            value = value_cell.get_text(strip=True)

            # Map common fields; Italian labels are matched alongside English.
            if 'cnmn' in key:
                data['cnmn_code'] = value
            elif 'sbn' in key:
                data['vid_sbn'] = value
                link = value_cell.find('a')
                if link:
                    data['vid_sbn_url'] = link.get('href', '')
            elif 'isni' in key:
                data['isni_code'] = value
                link = value_cell.find('a')
                if link:
                    data['isni_url'] = link.get('href', '')
            elif 'biographical' in key or 'biografica' in key:
                data['biographical_note'] = value
            elif 'bibliographical' in key or 'bibliografia' in key:
                if 'source' in key:
                    data['bibliographical_sources'] = value
                else:
                    data['bibliographical_notes'] = value
            elif 'name' in key and 'manuscript' in key:
                data['names_in_manuscript'] = value
            elif 'creation' in key or 'creazione' in key:
                data['date_of_creation'] = value
            elif 'modification' in key or 'modifica' in key:
                data['last_modification'] = value
            elif 'identifier' in key:
                data['other_identifiers'] = value

    def scrape_all_copyists(self, delay: float = 1.0,
                            max_entries: Optional[int] = None) -> pd.DataFrame:
        """Discover and scrape all copyists.

        Args:
            delay: Seconds to sleep between detail-page requests.
            max_entries: If set and positive, scrape at most this many IDs.

        Returns:
            DataFrame with one row per successfully scraped copyist
            (empty if discovery found nothing).
        """
        print("Starting full scrape...")

        copyist_ids = self.discover_copyist_ids()
        print(f"Found {len(copyist_ids)} copyist IDs")

        if not copyist_ids:
            print("No copyist IDs found!")
            return pd.DataFrame()

        # Limit if requested
        if max_entries and max_entries > 0:
            copyist_ids = copyist_ids[:max_entries]
            print(f"Limited to {max_entries} entries")

        # Scrape each copyist
        all_data = []
        for i, copyist_id in enumerate(copyist_ids, 1):
            print(f"Scraping {i}/{len(copyist_ids)}: ID {copyist_id}")

            data = self.scrape_copyist_detail(copyist_id)
            if 'error' not in data:
                data['scrape_order'] = i
                all_data.append(data)
            else:
                print(f"Error scraping {copyist_id}: {data['error']}")

            # Delay between requests
            if delay > 0:
                time.sleep(delay)

        df = pd.DataFrame(all_data)
        print(f"Successfully scraped {len(df)} copyists")
        return df


def main():
    """Run a small test scrape (10 entries) and save the result to CSV."""
    scraper = ManusCopistaRequestsScraper()

    # Test with a small number first
    print("Testing with 10 entries...")
    df = scraper.scrape_all_copyists(delay=1.0, max_entries=10)

    if not df.empty:
        print(f"Successfully scraped {len(df)} copyists")
        print("\nColumns:", df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())

        # Save to CSV
        filename = f"manus_copyists_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False)
        # Fixed: the message previously did not interpolate the filename.
        print(f"\nSaved to {filename}")
    else:
        print("No data scraped!")


if __name__ == "__main__":
    main()