| | import requests |
| | from bs4 import BeautifulSoup |
| | import pandas as pd |
| | import time |
| | import re |
| | from typing import Dict, List, Optional |
| | import json |
| | from datetime import datetime |
| | import io |
| |
|
class ManusCopistaRequestsScraper:
    """Scraper for copyist authority records on MANUS (manus.iccu.sbn.it).

    Uses a plain ``requests`` session with browser-like headers to discover
    copyist IDs from the browse pages and then fetch each detail page,
    returning the collected records as a pandas DataFrame.
    """

    def __init__(self):
        self.base_url = "https://manus.iccu.sbn.it"
        self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
        self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"

        # Browser-like headers reduce the chance the server rejects the
        # scraper as a bot; the session also reuses connections (keep-alive).
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
        })

    def get_page(self, url: str) -> "Optional[BeautifulSoup]":
        """Fetch ``url`` and return the parsed document, or ``None`` on any failure."""
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=15)
            # raise_for_status() raises for any 4xx/5xx response, so a
            # separate status_code re-check afterwards would be dead code.
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"Request error for {url}: {e}")
            return None
        except Exception as e:
            # Boundary catch-all: parser or encoding errors are logged, not fatal.
            print(f"Unexpected error for {url}: {e}")
            return None

    def discover_copyist_ids(self) -> List[str]:
        """Discover copyist IDs from the browse pages.

        Tries several browse-URL variants, follows pagination, and falls back
        to random range probing if nothing is found. Returns sorted ID strings.
        """
        print("Discovering copyist IDs...")

        urls_to_try = [
            self.browse_url,
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse",
            "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse?delta=50",
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse?delta=100"
        ]

        all_ids = set()

        for url in urls_to_try:
            soup = self.get_page(url)
            if soup:
                ids = self.extract_ids_from_page(soup)
                all_ids.update(ids)
                print(f"Found {len(ids)} IDs from {url}")

                # Only follow pagination from pages that yielded at least one ID.
                if ids:
                    pagination_ids = self.handle_pagination(soup, url)
                    all_ids.update(pagination_ids)

        if not all_ids:
            print("No IDs found from browse page, trying range-based discovery...")
            all_ids = self.discover_ids_by_range()

        return sorted(all_ids)

    def extract_ids_from_page(self, soup: "BeautifulSoup") -> List[str]:
        """Extract candidate copyist IDs from a parsed page.

        Harvests IDs both from ``detail/<id>`` hrefs and from bare 6-7 digit
        numbers in the page text.
        """
        ids = set()

        # Primary source: anchor hrefs pointing at detail pages.
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            match = re.search(r'detail/(\d+)', href)
            if match:
                copyist_id = match.group(1)
                if len(copyist_id) >= 5:
                    ids.add(copyist_id)

        # Secondary source: plausible ID-like numbers in the visible text.
        text = soup.get_text()
        for num in re.findall(r'\b\d{6,7}\b', text):
            if self.is_valid_id_format(num):
                ids.add(num)

        return list(ids)

    def handle_pagination(self, soup: "BeautifulSoup", base_url: str) -> List[str]:
        """Follow up to 10 pagination links found on ``soup`` and collect IDs."""
        all_ids = set()

        pagination_links = []
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True).lower()

            # Heuristic: "next"-style labels (EN/IT) or bare page numbers.
            if any(word in text for word in ['next', 'seguente', 'page', 'pagina']) or text.isdigit():
                if href and href.startswith('/'):
                    pagination_links.append(self.base_url + href)

        # Cap at 10 pages to bound the crawl; pause 1s between requests.
        for page_url in pagination_links[:10]:
            print(f"Checking pagination page: {page_url}")
            page_soup = self.get_page(page_url)
            if page_soup:
                all_ids.update(self.extract_ids_from_page(page_soup))
            time.sleep(1)

        return list(all_ids)

    def discover_ids_by_range(self, start_id: int = 100000, end_id: int = 999999, sample_size: int = 1000) -> List[str]:
        """Probe a random sample of numeric IDs and return those that exist.

        Fallback discovery when the browse pages yield nothing; issues a HEAD
        request per candidate with a 0.1s delay between probes.
        """
        print(f"Testing range-based discovery with {sample_size} samples...")

        valid_ids = []

        import random  # local import: only needed for this fallback path
        test_ids = random.sample(range(start_id, end_id), min(sample_size, end_id - start_id))

        for i, test_id in enumerate(test_ids):
            if i % 100 == 0:
                print(f"Tested {i}/{len(test_ids)} IDs, found {len(valid_ids)} valid")

            if self.test_id_exists(str(test_id)):
                valid_ids.append(str(test_id))

            time.sleep(0.1)

        return valid_ids

    def test_id_exists(self, copyist_id: str) -> bool:
        """Return True if a copyist detail page exists (HEAD request, 200 OK)."""
        url = f"{self.detail_base_url}{copyist_id}"
        try:
            response = self.session.head(url, timeout=5)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; any network failure means "absent".
            return False

    def is_valid_id_format(self, id_str: str) -> bool:
        """Return True if ``id_str`` is all digits and 5-7 characters long."""
        if not id_str.isdigit():
            return False
        return 5 <= len(id_str) <= 7

    def scrape_copyist_detail(self, copyist_id: str) -> Dict:
        """Scrape one copyist detail page into a flat dict.

        On fetch failure returns ``{'error': ...}``; otherwise the dict always
        carries ``copyist_id``, ``detail_url`` and ``scrape_timestamp``.
        """
        url = f"{self.detail_base_url}{copyist_id}"
        soup = self.get_page(url)

        if not soup:
            return {'error': f'Could not fetch page for ID {copyist_id}'}

        data = {
            'copyist_id': copyist_id,
            'detail_url': url,
            'scrape_timestamp': datetime.now().isoformat()
        }

        title = soup.find('title')
        if title:
            data['page_title'] = title.get_text(strip=True)

        # Populates `data` in place with table fields and a best-guess name.
        self.extract_copyist_data(soup, data)

        return data

    def extract_copyist_data(self, soup: "BeautifulSoup", data: Dict):
        """Extract table fields and a copyist name from the page into ``data``."""
        # Prefer the Bootstrap-styled table; fall back to any table on the page.
        table = soup.find('table', class_='table')
        if not table:
            table = soup.find('table')

        if table:
            self.extract_table_data(table, data)

        # Name heuristics: headings first, then segments of the page title.
        name_candidates = []

        for heading in soup.find_all(['h1', 'h2', 'h3']):
            text = heading.get_text(strip=True)
            if text and len(text) > 2:
                name_candidates.append(text)

        if 'page_title' in data:
            title_parts = data['page_title'].split(' - ')
            for part in title_parts:
                if part.strip() and len(part.strip()) > 2:
                    name_candidates.append(part.strip())

        # First candidate wins (headings take priority over title parts).
        if name_candidates:
            data['copyist_name'] = name_candidates[0]

    def extract_table_data(self, table, data: Dict):
        """Map key/value rows of the detail table onto canonical field names.

        Key matching is substring-based and bilingual (English/Italian).
        """
        for row in table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if len(cells) >= 2:
                key_cell = cells[0]
                value_cell = cells[1]

                key = key_cell.get_text(strip=True).lower()
                value = value_cell.get_text(strip=True)

                if 'cnmn' in key:
                    data['cnmn_code'] = value
                elif 'sbn' in key:
                    data['vid_sbn'] = value
                    link = value_cell.find('a')
                    if link:
                        data['vid_sbn_url'] = link.get('href', '')
                elif 'isni' in key:
                    data['isni_code'] = value
                    link = value_cell.find('a')
                    if link:
                        data['isni_url'] = link.get('href', '')
                elif 'biographical' in key or 'biografica' in key:
                    data['biographical_note'] = value
                elif 'bibliographical' in key or 'bibliografia' in key:
                    # "Bibliographical sources" vs plain "bibliographical notes".
                    if 'source' in key:
                        data['bibliographical_sources'] = value
                    else:
                        data['bibliographical_notes'] = value
                elif 'name' in key and 'manuscript' in key:
                    data['names_in_manuscript'] = value
                elif 'creation' in key or 'creazione' in key:
                    data['date_of_creation'] = value
                elif 'modification' in key or 'modifica' in key:
                    data['last_modification'] = value
                elif 'identifier' in key:
                    data['other_identifiers'] = value

    def scrape_all_copyists(self, delay: float = 1.0, max_entries: Optional[int] = None) -> pd.DataFrame:
        """Discover and scrape all copyists, returning a DataFrame.

        Args:
            delay: seconds to sleep between detail requests (politeness).
            max_entries: optional cap on how many IDs to scrape
                (annotation fixed from ``int`` to ``Optional[int]``).
        """
        print("Starting full scrape...")

        copyist_ids = self.discover_copyist_ids()
        print(f"Found {len(copyist_ids)} copyist IDs")

        if not copyist_ids:
            print("No copyist IDs found!")
            return pd.DataFrame()

        if max_entries and max_entries > 0:
            copyist_ids = copyist_ids[:max_entries]
            print(f"Limited to {max_entries} entries")

        all_data = []

        for i, copyist_id in enumerate(copyist_ids, 1):
            print(f"Scraping {i}/{len(copyist_ids)}: ID {copyist_id}")

            data = self.scrape_copyist_detail(copyist_id)

            if 'error' not in data:
                data['scrape_order'] = i
                all_data.append(data)
            else:
                print(f"Error scraping {copyist_id}: {data['error']}")

            if delay > 0:
                time.sleep(delay)

        df = pd.DataFrame(all_data)
        print(f"Successfully scraped {len(df)} copyists")
        return df
| |
|
| |
|
| | |
def main():
    """Run a small test scrape (10 entries) and save the results to CSV."""
    scraper = ManusCopistaRequestsScraper()

    print("Testing with 10 entries...")
    df = scraper.scrape_all_copyists(delay=1.0, max_entries=10)

    if not df.empty:
        print(f"Successfully scraped {len(df)} copyists")
        print("\nColumns:", df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())

        # Timestamped filename avoids clobbering earlier runs.
        filename = f"manus_copyists_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False)
        # Bug fix: previously printed the literal text "(unknown)" instead of
        # the actual output path.
        print(f"\nSaved to {filename}")
    else:
        print("No data scraped!")


if __name__ == "__main__":
    main()