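"""Scraper for copyist authority records on the Manus portal (manus.iccu.sbn.it).

Discovers copyist IDs from the portal's browse pages (falling back to
range-based probing), fetches each detail page, and collects the parsed
records into a pandas DataFrame.
"""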
import random
import re
import time
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
class ManusCopistaRequestsScraper:
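    """Requests-based scraper for the Manus copyist authority pages.

    All network access goes through get_page(), which uses a persistent
    requests.Session with browser-like headers and returns a parsed
    BeautifulSoup document, or None on failure.
    """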
def __init__(self):
self.base_url = "https://manus.iccu.sbn.it"
self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"
        # Set up a persistent session with browser-like headers so requests
        # look like ordinary page views rather than an automated client
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
})
def get_page(self, url: str) -> Optional[BeautifulSoup]:
"""Fetch a page and return BeautifulSoup object"""
try:
print(f"Fetching: {url}")
            response = self.session.get(url, timeout=15)
            # raise_for_status() raises for any 4xx/5xx response, so no
            # separate status-code check is needed afterwards
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
except requests.exceptions.RequestException as e:
print(f"Request error for {url}: {e}")
return None
except Exception as e:
print(f"Unexpected error for {url}: {e}")
return None
def discover_copyist_ids(self) -> List[str]:
"""Discover copyist IDs from the browse page"""
print("Discovering copyist IDs...")
# Try different approaches to get the data
urls_to_try = [
self.browse_url,
"https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse",
"https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse?delta=50",
"https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse?delta=100"
]
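        # Collect into a set so IDs found under more than one browse URL are deduplicated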
all_ids = set()
for url in urls_to_try:
soup = self.get_page(url)
if soup:
ids = self.extract_ids_from_page(soup)
all_ids.update(ids)
print(f"Found {len(ids)} IDs from {url}")
# If we found IDs, try to get more from pagination
if ids:
pagination_ids = self.handle_pagination(soup, url)
all_ids.update(pagination_ids)
# If no IDs found from browse page, try a range-based approach
if not all_ids:
print("No IDs found from browse page, trying range-based discovery...")
all_ids = self.discover_ids_by_range()
        return sorted(all_ids)
def extract_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
"""Extract copyist IDs from a page"""
ids = set()
# Look for links that contain detail/ followed by numbers
links = soup.find_all('a', href=True)
for link in links:
href = link.get('href', '')
match = re.search(r'detail/(\d+)', href)
            if match:
                copyist_id = match.group(1)
                # Apply the same validity check used elsewhere (5-7 digit IDs)
                if self.is_valid_id_format(copyist_id):
                    ids.add(copyist_id)
        # Also scan the page text for 6-7 digit numbers that might be IDs;
        # this is a heuristic and may pick up false positives
text = soup.get_text()
numbers = re.findall(r'\b\d{6,7}\b', text)
for num in numbers:
if self.is_valid_id_format(num):
ids.add(num)
return list(ids)
def handle_pagination(self, soup: BeautifulSoup, base_url: str) -> List[str]:
"""Handle pagination to get more IDs"""
all_ids = set()
# Look for pagination links
pagination_links = []
links = soup.find_all('a', href=True)
for link in links:
href = link.get('href', '')
text = link.get_text(strip=True).lower()
# Look for next page or numbered pages
if any(word in text for word in ['next', 'seguente', 'page', 'pagina']) or text.isdigit():
if href and href.startswith('/'):
full_url = self.base_url + href
pagination_links.append(full_url)
# Visit pagination pages
        for page_url in pagination_links[:10]:  # Cap pages visited to keep the crawl bounded
print(f"Checking pagination page: {page_url}")
page_soup = self.get_page(page_url)
if page_soup:
page_ids = self.extract_ids_from_page(page_soup)
all_ids.update(page_ids)
time.sleep(1) # Be respectful
return list(all_ids)
def discover_ids_by_range(self, start_id: int = 100000, end_id: int = 999999, sample_size: int = 1000) -> List[str]:
"""Discover IDs by testing a range of potential IDs"""
print(f"Testing range-based discovery with {sample_size} samples...")
valid_ids = []
        # Test a random sample of IDs in the range (random is imported at module level)
        test_ids = random.sample(range(start_id, end_id), min(sample_size, end_id - start_id))
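        # With the default 100000-999999 range, 1000 samples cover roughly 0.1%
        # of the ID space, so hits may be rare; raise sample_size if discovery fails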
for i, test_id in enumerate(test_ids):
if i % 100 == 0:
print(f"Tested {i}/{len(test_ids)} IDs, found {len(valid_ids)} valid")
if self.test_id_exists(str(test_id)):
valid_ids.append(str(test_id))
time.sleep(0.1) # Small delay
return valid_ids
def test_id_exists(self, copyist_id: str) -> bool:
"""Test if a copyist ID exists by making a HEAD request"""
url = f"{self.detail_base_url}{copyist_id}"
try:
response = self.session.head(url, timeout=5)
return response.status_code == 200
        except requests.exceptions.RequestException:
            return False
def is_valid_id_format(self, id_str: str) -> bool:
"""Check if string looks like a valid copyist ID"""
if not id_str.isdigit():
return False
return 5 <= len(id_str) <= 7
def scrape_copyist_detail(self, copyist_id: str) -> Dict:
"""Scrape detailed information for a single copyist"""
url = f"{self.detail_base_url}{copyist_id}"
soup = self.get_page(url)
if not soup:
return {'error': f'Could not fetch page for ID {copyist_id}'}
# Extract basic info
data = {
'copyist_id': copyist_id,
'detail_url': url,
'scrape_timestamp': datetime.now().isoformat()
}
# Extract title
title = soup.find('title')
if title:
data['page_title'] = title.get_text(strip=True)
# Extract main content
self.extract_copyist_data(soup, data)
return data
def extract_copyist_data(self, soup: BeautifulSoup, data: Dict):
"""Extract copyist data from the page"""
# Try to find the main content table
table = soup.find('table', class_='table')
if not table:
table = soup.find('table')
if table:
self.extract_table_data(table, data)
# Try to extract name from various locations
name_candidates = []
# Look in headings
for heading in soup.find_all(['h1', 'h2', 'h3']):
text = heading.get_text(strip=True)
if text and len(text) > 2:
name_candidates.append(text)
# Look in title
if 'page_title' in data:
title_parts = data['page_title'].split(' - ')
for part in title_parts:
if part.strip() and len(part.strip()) > 2:
name_candidates.append(part.strip())
# Set the most likely name
if name_candidates:
data['copyist_name'] = name_candidates[0]
def extract_table_data(self, table, data: Dict):
"""Extract data from the main table"""
rows = table.find_all('tr')
for row in rows:
cells = row.find_all(['td', 'th'])
if len(cells) >= 2:
key_cell = cells[0]
value_cell = cells[1]
key = key_cell.get_text(strip=True).lower()
value = value_cell.get_text(strip=True)
# Map common fields
if 'cnmn' in key:
data['cnmn_code'] = value
elif 'sbn' in key:
data['vid_sbn'] = value
link = value_cell.find('a')
if link:
data['vid_sbn_url'] = link.get('href', '')
elif 'isni' in key:
data['isni_code'] = value
link = value_cell.find('a')
if link:
data['isni_url'] = link.get('href', '')
elif 'biographical' in key or 'biografica' in key:
data['biographical_note'] = value
elif 'bibliographical' in key or 'bibliografia' in key:
if 'source' in key:
data['bibliographical_sources'] = value
else:
data['bibliographical_notes'] = value
elif 'name' in key and 'manuscript' in key:
data['names_in_manuscript'] = value
elif 'creation' in key or 'creazione' in key:
data['date_of_creation'] = value
elif 'modification' in key or 'modifica' in key:
data['last_modification'] = value
elif 'identifier' in key:
data['other_identifiers'] = value
    def scrape_all_copyists(self, delay: float = 1.0, max_entries: Optional[int] = None) -> pd.DataFrame:
"""Scrape all copyists"""
print("Starting full scrape...")
# Discover IDs
copyist_ids = self.discover_copyist_ids()
print(f"Found {len(copyist_ids)} copyist IDs")
if not copyist_ids:
print("No copyist IDs found!")
return pd.DataFrame()
# Limit if requested
if max_entries and max_entries > 0:
copyist_ids = copyist_ids[:max_entries]
print(f"Limited to {max_entries} entries")
# Scrape each copyist
all_data = []
for i, copyist_id in enumerate(copyist_ids, 1):
print(f"Scraping {i}/{len(copyist_ids)}: ID {copyist_id}")
data = self.scrape_copyist_detail(copyist_id)
if 'error' not in data:
data['scrape_order'] = i
all_data.append(data)
else:
print(f"Error scraping {copyist_id}: {data['error']}")
# Delay between requests
if delay > 0:
time.sleep(delay)
df = pd.DataFrame(all_data)
print(f"Successfully scraped {len(df)} copyists")
return df
# Simple usage example
def main():
"""Main function to run the scraper"""
scraper = ManusCopistaRequestsScraper()
# Test with a small number first
print("Testing with 10 entries...")
df = scraper.scrape_all_copyists(delay=1.0, max_entries=10)
if not df.empty:
print(f"Successfully scraped {len(df)} copyists")
print("\nColumns:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
# Save to CSV
filename = f"manus_copyists_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df.to_csv(filename, index=False)
print(f"\nSaved to {filename}")
else:
print("No data scraped!")
if __name__ == "__main__":
main()
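
# For a full run over all discovered IDs, something like the following should
# work -- a sketch, assuming the portal tolerates the request rate:
#
#   scraper = ManusCopistaRequestsScraper()
#   df = scraper.scrape_all_copyists(delay=2.0)
#   df.to_csv("manus_copyists_full.csv", index=False)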