Update app.py
app.py
CHANGED
@@ -1,1092 +1,334 @@
-import gradio as gr
import requests
from bs4 import BeautifulSoup
import pandas as pd
-from urllib.parse import urljoin, urlparse
import time
import re
from typing import Dict, List, Optional
import json
-import io
from datetime import datetime
-import
-
-# Selenium imports
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import TimeoutException, WebDriverException
-from webdriver_manager.chrome import ChromeDriverManager

-class
-    def __init__(self
-        self.base_url =
        self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
        self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"
-        self.driver = None
-        self.setup_driver()
-
-    def setup_driver(self):
-        """Setup Chrome driver with appropriate options"""
-        chrome_options = Options()
-        chrome_options.add_argument("--headless")
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument("--disable-dev-shm-usage")
-        chrome_options.add_argument("--disable-gpu")
-        chrome_options.add_argument("--window-size=1920,1080")
-        chrome_options.add_argument("--disable-extensions")
-        chrome_options.add_argument("--disable-plugins")
-        chrome_options.add_argument("--disable-images")
-        chrome_options.add_argument("--disable-javascript")
-        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

-
-
-
-
-
-
-
-
-
-
-
-

-    def
-        """
        try:
-

-            #
-            if
-
-
-            )
-            else:
-                # Default wait for page to load
-                time.sleep(3)

-
-            page_source = self.driver.page_source
-            return BeautifulSoup(page_source, 'html.parser')

-        except
-            print(f"
-            return None
-        except WebDriverException as e:
-            print(f"WebDriver error on {url}: {e}")
            return None
        except Exception as e:
-            print(f"
            return None

-    def
-        """Discover
-

-
-
-
-
-
-
-
-            timeout=15
-        )
-
-        if not soup:
-            if progress_callback:
-                progress_callback("Failed to fetch the copyist list page.")
-            return []
-
-        # Extract IDs from the table
-        page_ids = self.extract_copyist_ids_from_table(soup)
-        all_ids.update(page_ids)

-
-            progress_callback(f"Found {len(all_ids)} copyist IDs from main page.")

-
-

-
-

-        return sorted(list(all_ids)

-    def
-        """Extract copyist IDs from
        ids = set()

-
-
-
-        # Look for the specific table body
-        table_body = soup.find('tbody', id='authorities-results-content')
-        if not table_body:
-            # Fallback: look for any table with copyist links
-            table_body = soup.find('tbody')
-
-        if not table_body:
-            return []
-
-        # Find all links in the table
-        links = table_body.find_all('a', href=True)
        for link in links:
-            href = link
-
-
-
-

        return list(ids)

-    def handle_pagination(self, soup: BeautifulSoup,
-        """Handle pagination to get
-
-            # Look for pagination controls
-            pagination_links = soup.find_all('a', href=True)
-            next_page_found = False
-
-            for link in pagination_links:
-                link_text = link.get_text(strip=True).lower()
-                href = link.get('href', '')
-
-                # Look for "next" or page numbers
-                if ('next' in link_text or 'seguente' in link_text or
-                    (link_text.isdigit() and int(link_text) > 1)):
-
-                    next_page_found = True
-                    if progress_callback:
-                        progress_callback(f"Found pagination link: {link_text}")
-
-                    # Navigate to next page
-                    full_url = urljoin(self.base_url, href)
-                    next_soup = self.get_page_with_selenium(
-                        full_url,
-                        wait_for_element="tbody#authorities-results-content",
-                        timeout=15
-                    )
-
-                    if next_soup:
-                        new_ids = self.extract_copyist_ids_from_table(next_soup)
-                        all_ids.update(new_ids)
-                        if progress_callback:
-                            progress_callback(f"Added {len(new_ids)} IDs from pagination page")
-
-                        # Recursively handle more pagination
-                        self.handle_pagination(next_soup, all_ids, progress_callback)
-                    break
-
-            return next_page_found
-
-        except Exception as e:
-            if progress_callback:
-                progress_callback(f"Error handling pagination: {e}")
-            return False
-
| 191 |
-
def extract_metadata_from_table(self, soup: BeautifulSoup) -> Dict:
|
| 192 |
-
"""Extract metadata from the copyist detail page"""
|
| 193 |
-
metadata = {
|
| 194 |
-
'cnmn_code': '',
|
| 195 |
-
'vid_sbn': '',
|
| 196 |
-
'vid_sbn_url': '',
|
| 197 |
-
'isni_code': '',
|
| 198 |
-
'isni_url': '',
|
| 199 |
-
'other_identifiers': '',
|
| 200 |
-
'biographical_note': '',
|
| 201 |
-
'bibliographical_sources': '',
|
| 202 |
-
'bibliographical_notes': '',
|
| 203 |
-
'names_in_manuscript': '',
|
| 204 |
-
'date_of_creation': '',
|
| 205 |
-
'last_modification': '',
|
| 206 |
-
'page_title': '',
|
| 207 |
-
'copyist_name': ''
|
| 208 |
-
}
|
| 209 |
-
|
| 210 |
-
if not soup:
|
| 211 |
-
return metadata
|
| 212 |
-
|
| 213 |
-
# Extract page title
|
| 214 |
-
title_tag = soup.find('title')
|
| 215 |
-
if title_tag:
|
| 216 |
-
metadata['page_title'] = title_tag.get_text(strip=True)
|
| 217 |
-
|
| 218 |
-
# Try to extract copyist name
|
| 219 |
-
name_selectors = [
|
| 220 |
-
'h1', 'h2', '.title', '.copyist-name',
|
| 221 |
-
'[class*="name"]', '[class*="title"]'
|
| 222 |
-
]
|
| 223 |
-
|
| 224 |
-
for selector in name_selectors:
|
| 225 |
-
element = soup.select_one(selector)
|
| 226 |
-
if element:
|
| 227 |
-
name_text = element.get_text(strip=True)
|
| 228 |
-
if name_text and len(name_text) > 2:
|
| 229 |
-
metadata['copyist_name'] = name_text
|
| 230 |
-
break
|
| 231 |
-
|
| 232 |
-
# Find the main data table
|
| 233 |
-
main_table = soup.find('table', class_=['table', 'table-1', 'table-sm'])
|
| 234 |
-
if not main_table:
|
| 235 |
-
main_table = soup.find('table')
|
| 236 |
-
|
| 237 |
-
if not main_table:
|
| 238 |
-
return metadata
|
| 239 |
|
| 240 |
-
#
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
try:
|
| 244 |
-
title_cell = row.find('td', class_='table-title')
|
| 245 |
-
if not title_cell:
|
| 246 |
-
continue
|
| 247 |
-
|
| 248 |
-
title_div = title_cell.find('div', class_='table-title-item')
|
| 249 |
-
if not title_div:
|
| 250 |
-
continue
|
| 251 |
-
|
| 252 |
-
field_name = title_div.get_text(strip=True)
|
| 253 |
-
|
| 254 |
-
data_cells = row.find_all('td')
|
| 255 |
-
data_cell = data_cells[1] if len(data_cells) > 1 else None
|
| 256 |
-
if not data_cell:
|
| 257 |
-
continue
|
| 258 |
-
|
| 259 |
-
self.extract_cell_data(field_name, data_cell, metadata)
|
| 260 |
-
|
| 261 |
-
except (AttributeError, IndexError):
|
| 262 |
-
continue
|
| 263 |
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
"""Extract data from table cells"""
|
| 268 |
-
try:
|
| 269 |
-
cell_classes = data_cell.get('class', [])
|
| 270 |
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
if
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
elif 'table-link' in cell_classes:
|
| 278 |
-
text_item = data_cell.find('div', class_='table-text-item')
|
| 279 |
-
if text_item:
|
| 280 |
-
link = text_item.find('a')
|
| 281 |
-
if link:
|
| 282 |
-
link_text = link.get_text(strip=True)
|
| 283 |
-
link_url = link.get('href', '')
|
| 284 |
-
self.map_field_link(field_name, link_text, link_url, metadata)
|
| 285 |
-
else:
|
| 286 |
-
value = text_item.get_text(strip=True)
|
| 287 |
-
self.map_field_value(field_name, value, metadata)
|
| 288 |
-
|
| 289 |
-
elif 'table-list' in cell_classes:
|
| 290 |
-
values = []
|
| 291 |
-
list_containers = data_cell.find_all('div', class_='table-list-item')
|
| 292 |
-
|
| 293 |
-
if list_containers:
|
| 294 |
-
for container in list_containers:
|
| 295 |
-
text_items = container.find_all('div', class_='table-text-item')
|
| 296 |
-
for item in text_items:
|
| 297 |
-
try:
|
| 298 |
-
link = item.find('a')
|
| 299 |
-
if link:
|
| 300 |
-
link_text = link.get_text(strip=True)
|
| 301 |
-
link_url = link.get('href', '')
|
| 302 |
-
if link_url:
|
| 303 |
-
values.append(f"{link_text} ({link_url})")
|
| 304 |
-
else:
|
| 305 |
-
values.append(link_text)
|
| 306 |
-
else:
|
| 307 |
-
text = item.get_text(strip=True)
|
| 308 |
-
if text:
|
| 309 |
-
values.append(text)
|
| 310 |
-
except AttributeError:
|
| 311 |
-
continue
|
| 312 |
-
else:
|
| 313 |
-
text_items = data_cell.find_all('div', class_='table-text-item')
|
| 314 |
-
for item in text_items:
|
| 315 |
-
try:
|
| 316 |
-
link = item.find('a')
|
| 317 |
-
if link:
|
| 318 |
-
link_text = link.get_text(strip=True)
|
| 319 |
-
link_url = link.get('href', '')
|
| 320 |
-
if link_url:
|
| 321 |
-
values.append(f"{link_text} ({link_url})")
|
| 322 |
-
else:
|
| 323 |
-
values.append(link_text)
|
| 324 |
-
else:
|
| 325 |
-
text = item.get_text(strip=True)
|
| 326 |
-
if text:
|
| 327 |
-
values.append(text)
|
| 328 |
-
except AttributeError:
|
| 329 |
-
continue
|
| 330 |
-
|
| 331 |
-
self.map_field_list(field_name, values, metadata)
|
| 332 |
-
|
| 333 |
-
elif 'table-text-html' in cell_classes:
|
| 334 |
-
text_item = data_cell.find('div', class_='table-text-item')
|
| 335 |
-
if text_item:
|
| 336 |
-
value = ' '.join(text_item.get_text(strip=True).split())
|
| 337 |
-
self.map_field_value(field_name, value, metadata)
|
| 338 |
-
|
| 339 |
-
except (AttributeError, TypeError):
|
| 340 |
-
pass
|
| 341 |
-
|
| 342 |
-
def map_field_value(self, field_name: str, value: str, metadata: Dict):
|
| 343 |
-
"""Map field values to metadata keys"""
|
| 344 |
-
field_mapping = {
|
| 345 |
-
'CNMN code': 'cnmn_code',
|
| 346 |
-
'Date of creation': 'date_of_creation',
|
| 347 |
-
'Last modification': 'last_modification',
|
| 348 |
-
'Biographical note': 'biographical_note',
|
| 349 |
-
'Bibliographical notes': 'bibliographical_notes'
|
| 350 |
-
}
|
| 351 |
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
metadata['vid_sbn_url'] = link_url
|
| 361 |
-
elif field_name == 'Codice ISNI':
|
| 362 |
-
metadata['isni_code'] = link_text
|
| 363 |
-
metadata['isni_url'] = link_url
|
| 364 |
-
|
| 365 |
-
def map_field_list(self, field_name: str, values: List, metadata: Dict):
|
| 366 |
-
"""Map field lists to metadata"""
|
| 367 |
-
joined_values = '; '.join(str(v) for v in values if v)
|
| 368 |
-
|
| 369 |
-
if field_name == 'Other identifiers':
|
| 370 |
-
metadata['other_identifiers'] = joined_values
|
| 371 |
-
elif field_name == 'Bibliographical sources':
|
| 372 |
-
metadata['bibliographical_sources'] = joined_values
|
| 373 |
-
elif field_name == 'Names in manuscript':
|
| 374 |
-
metadata['names_in_manuscript'] = joined_values
|
| 375 |
-
|
| 376 |
-
def scrape_all_copyists_with_progress(self, delay: float = 1.0, max_entries: int = None, progress_callback=None):
|
| 377 |
-
"""Scrape all copyists with Selenium"""
|
| 378 |
-
try:
|
| 379 |
-
# Discover all copyist IDs
|
| 380 |
-
copyist_ids = self.discover_all_copyist_ids(progress_callback)
|
| 381 |
-
|
| 382 |
-
if not copyist_ids:
|
| 383 |
-
return pd.DataFrame(), "No copyist IDs found"
|
| 384 |
-
|
| 385 |
-
if progress_callback:
|
| 386 |
-
progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
|
| 387 |
-
|
| 388 |
-
# Limit entries if specified
|
| 389 |
-
if max_entries and max_entries > 0:
|
| 390 |
-
copyist_ids = copyist_ids[:max_entries]
|
| 391 |
-
if progress_callback:
|
| 392 |
-
progress_callback(f"Limited to first {max_entries} entries for testing")
|
| 393 |
-
|
| 394 |
-
# Process each copyist
|
| 395 |
-
all_metadata = []
|
| 396 |
-
total_ids = len(copyist_ids)
|
| 397 |
-
successful_scrapes = 0
|
| 398 |
-
failed_scrapes = 0
|
| 399 |
-
|
| 400 |
-
for i, copyist_id in enumerate(copyist_ids, 1):
|
| 401 |
-
if progress_callback:
|
| 402 |
-
progress_callback(f"Processing {i}/{total_ids}: Copyist ID {copyist_id}")
|
| 403 |
-
|
| 404 |
-
detail_url = f"{self.detail_base_url}{copyist_id}"
|
| 405 |
-
|
| 406 |
-
# Get detailed metadata using Selenium
|
| 407 |
-
detail_soup = self.get_page_with_selenium(
|
| 408 |
-
detail_url,
|
| 409 |
-
wait_for_element="table",
|
| 410 |
-
timeout=10
|
| 411 |
-
)
|
| 412 |
-
|
| 413 |
-
if detail_soup:
|
| 414 |
-
metadata = self.extract_metadata_from_table(detail_soup)
|
| 415 |
-
|
| 416 |
-
combined_data = {
|
| 417 |
-
'copyist_id': copyist_id,
|
| 418 |
-
'detail_url': detail_url,
|
| 419 |
-
'scrape_order': i,
|
| 420 |
-
'scrape_timestamp': datetime.now().isoformat(),
|
| 421 |
-
**metadata
|
| 422 |
-
}
|
| 423 |
-
|
| 424 |
-
all_metadata.append(combined_data)
|
| 425 |
-
successful_scrapes += 1
|
| 426 |
-
else:
|
| 427 |
-
failed_scrapes += 1
|
| 428 |
-
if progress_callback:
|
| 429 |
-
progress_callback(f"Failed to fetch data for copyist ID {copyist_id}")
|
| 430 |
-
|
| 431 |
-
# Progress update
|
| 432 |
-
if i % 50 == 0 and progress_callback:
|
| 433 |
-
progress_callback(f"Progress: {i}/{total_ids} processed. Success: {successful_scrapes}, Failed: {failed_scrapes}")
|
| 434 |
-
|
| 435 |
-
# Delay between requests
|
| 436 |
-
if delay > 0:
|
| 437 |
-
time.sleep(delay)
|
| 438 |
-
|
| 439 |
-
df = pd.DataFrame(all_metadata)
|
| 440 |
-
success_msg = f"Successfully scraped {successful_scrapes} copyist records. Failed: {failed_scrapes}. Total discovered: {total_ids}"
|
| 441 |
-
return df, success_msg
|
| 442 |
-
|
| 443 |
-
except Exception as e:
|
| 444 |
-
return pd.DataFrame(), f"Error during scraping: {str(e)}"
|
| 445 |
-
|
| 446 |
-
def cleanup(self):
|
| 447 |
-
"""Clean up resources"""
|
| 448 |
-
if self.driver:
|
| 449 |
-
self.driver.quit()
|
| 450 |
-
self.driver = None
|
| 451 |
-
|
| 452 |
-
def __del__(self):
|
| 453 |
-
"""Destructor to ensure cleanup"""
|
| 454 |
-
self.cleanup()
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
class ManusCopistaMetadataScraper:
|
| 458 |
-
def __init__(self, base_url: str = "https://manus.iccu.sbn.it/en/copisti2"):
|
| 459 |
-
self.base_url = base_url
|
| 460 |
-
self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
|
| 461 |
-
self.session = requests.Session()
|
| 462 |
-
# Add headers to mimic a real browser
|
| 463 |
-
self.session.headers.update({
|
| 464 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 465 |
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
| 466 |
-
'Accept-Language': 'en-US,en;q=0.9,it;q=0.8',
|
| 467 |
-
'Accept-Encoding': 'gzip, deflate, br',
|
| 468 |
-
'Connection': 'keep-alive',
|
| 469 |
-
'Upgrade-Insecure-Requests': '1',
|
| 470 |
-
'Sec-Fetch-Dest': 'document',
|
| 471 |
-
'Sec-Fetch-Mode': 'navigate',
|
| 472 |
-
'Sec-Fetch-Site': 'none',
|
| 473 |
-
'Cache-Control': 'max-age=0',
|
| 474 |
-
})
|
| 475 |
|
| 476 |
-
|
| 477 |
-
"""Fetch and parse a web page with error handling"""
|
| 478 |
-
try:
|
| 479 |
-
response = self.session.get(url, timeout=15)
|
| 480 |
-
response.raise_for_status()
|
| 481 |
-
|
| 482 |
-
# Handle different encodings
|
| 483 |
-
if response.encoding and response.encoding.lower() in ['iso-8859-1', 'windows-1252']:
|
| 484 |
-
response.encoding = 'utf-8'
|
| 485 |
-
|
| 486 |
-
return BeautifulSoup(response.text, 'html.parser')
|
| 487 |
-
except requests.RequestException as e:
|
| 488 |
-
print(f"Error fetching {url}: {e}")
|
| 489 |
-
return None
|
| 490 |
-
|
| 491 |
-
def discover_all_copyist_ids(self, progress_callback=None) -> List[str]:
|
| 492 |
-
"""Discover all available copyist IDs from the browse page"""
|
| 493 |
-
all_ids = set()
|
| 494 |
-
|
| 495 |
-
# This is the key page where the real data table appears
|
| 496 |
-
url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"
|
| 497 |
-
|
| 498 |
-
if progress_callback:
|
| 499 |
-
progress_callback(f"Fetching copyist list from: {url}")
|
| 500 |
-
|
| 501 |
-
soup = self.get_page_content(url)
|
| 502 |
-
if not soup:
|
| 503 |
-
if progress_callback:
|
| 504 |
-
progress_callback("Failed to fetch the copyist list page.")
|
| 505 |
-
return []
|
| 506 |
-
|
| 507 |
-
page_ids = self.extract_copyist_ids_from_page(soup)
|
| 508 |
-
all_ids.update(page_ids)
|
| 509 |
-
|
| 510 |
-
if progress_callback:
|
| 511 |
-
progress_callback(f"Found {len(all_ids)} copyist IDs.")
|
| 512 |
-
|
| 513 |
-
return sorted(list(all_ids), key=lambda x: int(x) if x.isdigit() else 0)
|
| 514 |
-
|
| 515 |
-
def extract_copyist_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
|
| 516 |
-
"""Extract copyist IDs from the table with id 'authorities-results-content'"""
|
| 517 |
-
ids = set()
|
| 518 |
-
|
| 519 |
-
if not soup:
|
| 520 |
-
return []
|
| 521 |
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
links = table_body.find_all('a', href=True)
|
| 527 |
-
for link in links:
|
| 528 |
-
href = link['href']
|
| 529 |
-
if 'detail/' in href:
|
| 530 |
-
match = re.search(r'detail/(\d+)', href)
|
| 531 |
-
if match:
|
| 532 |
-
ids.add(match.group(1))
|
| 533 |
-
|
| 534 |
-
return list(ids)
|
| 535 |
-
|
| 536 |
-
def extract_copyist_id_from_url(self, url: str) -> Optional[str]:
|
| 537 |
-
"""Extract copyist ID from a URL"""
|
| 538 |
-
patterns = [
|
| 539 |
-
r'/manus-authorities/detail/(\d+)',
|
| 540 |
-
r'copisti2.*?detail/(\d+)',
|
| 541 |
-
r'/detail/(\d+)',
|
| 542 |
-
r'authorities.*?(\d{5,7})',
|
| 543 |
-
r'copista.*?(\d{5,7})'
|
| 544 |
-
]
|
| 545 |
|
| 546 |
-
|
| 547 |
-
match = re.search(pattern, url, re.IGNORECASE)
|
| 548 |
-
if match:
|
| 549 |
-
return match.group(1)
|
| 550 |
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
"""Check if a string looks like a copyist ID"""
|
| 555 |
-
if not id_str or not id_str.isdigit():
|
| 556 |
-
return False
|
| 557 |
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
|
| 562 |
-
|
| 563 |
-
try:
|
| 564 |
-
id_num = int(id_str)
|
| 565 |
-
return 100000 <= id_num <= 999999
|
| 566 |
-
except ValueError:
|
| 567 |
-
return False
|
| 568 |
|
| 569 |
-
def
|
| 570 |
-
"""
|
| 571 |
-
|
| 572 |
-
return False
|
| 573 |
-
|
| 574 |
-
# IDs are typically 5-7 digits
|
| 575 |
-
if len(id_str) < 5 or len(id_str) > 7:
|
| 576 |
-
return False
|
| 577 |
-
|
| 578 |
-
# Quick HEAD request to check if page exists
|
| 579 |
try:
|
| 580 |
-
|
| 581 |
-
response = self.session.head(detail_url, timeout=3)
|
| 582 |
return response.status_code == 200
|
| 583 |
except:
|
| 584 |
return False
|
| 585 |
|
| 586 |
-
def
|
| 587 |
-
"""
|
| 588 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
|
| 590 |
if not soup:
|
| 591 |
-
return
|
| 592 |
-
|
| 593 |
-
# Look for pagination elements with more specific selectors
|
| 594 |
-
pagination_selectors = [
|
| 595 |
-
'nav[aria-label*="pagination"]',
|
| 596 |
-
'nav[class*="pagination"]',
|
| 597 |
-
'.pagination',
|
| 598 |
-
'.pager',
|
| 599 |
-
'.page-navigation',
|
| 600 |
-
'[class*="page-"]'
|
| 601 |
-
]
|
| 602 |
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
if href and href not in ['#', 'javascript:void(0)']:
|
| 610 |
-
full_url = urljoin(self.base_url, href)
|
| 611 |
-
if full_url not in pagination_urls and full_url != self.base_url:
|
| 612 |
-
# Avoid duplicate URLs and navigation loops
|
| 613 |
-
if not any(existing_url in full_url or full_url in existing_url for existing_url in pagination_urls):
|
| 614 |
-
pagination_urls.append(full_url)
|
| 615 |
|
| 616 |
-
#
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
href = link.get('href', '')
|
| 621 |
-
|
| 622 |
-
# Look for pagination indicators
|
| 623 |
-
pagination_keywords = ['next', 'seguente', 'avanti', 'previous', 'precedente', 'indietro']
|
| 624 |
-
if (any(keyword in link_text for keyword in pagination_keywords) or
|
| 625 |
-
(link_text.isdigit() and int(link_text) <= 100)): # Reasonable page number
|
| 626 |
-
|
| 627 |
-
if href and href not in ['#', 'javascript:void(0)']:
|
| 628 |
-
full_url = urljoin(self.base_url, href)
|
| 629 |
-
if (full_url not in pagination_urls and
|
| 630 |
-
full_url != self.base_url and
|
| 631 |
-
'copisti' in full_url): # Ensure it's still in the copyist section
|
| 632 |
-
pagination_urls.append(full_url)
|
| 633 |
|
| 634 |
-
#
|
| 635 |
-
|
| 636 |
-
for url in pagination_urls:
|
| 637 |
-
if url not in unique_urls:
|
| 638 |
-
unique_urls.append(url)
|
| 639 |
|
| 640 |
-
return
|
| 641 |
|
| 642 |
-
def
|
| 643 |
-
"""
|
| 644 |
-
|
| 645 |
-
|
|
|
|
|
|
|
| 646 |
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
return {"error": "Could not fetch main page"}
|
| 650 |
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
"total_links": len(main_soup.find_all('a', href=True)),
|
| 654 |
-
"copyist_links": [],
|
| 655 |
-
"pagination_links": [],
|
| 656 |
-
"page_structure": []
|
| 657 |
-
}
|
| 658 |
|
| 659 |
-
#
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
if 'detail' in href or 'copista' in href.lower() or 'authorities' in href:
|
| 666 |
-
copyist_id = self.extract_copyist_id_from_url(href)
|
| 667 |
-
results["copyist_links"].append({
|
| 668 |
-
"href": href,
|
| 669 |
-
"text": text,
|
| 670 |
-
"extracted_id": copyist_id
|
| 671 |
-
})
|
| 672 |
|
| 673 |
-
#
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
for
|
| 677 |
-
|
| 678 |
-
|
| 679 |
|
| 680 |
-
|
|
|
|
|
|
|
| 681 |
|
| 682 |
-
def
|
| 683 |
-
"""Extract
|
| 684 |
-
|
| 685 |
-
'cnmn_code': '',
|
| 686 |
-
'vid_sbn': '',
|
| 687 |
-
'vid_sbn_url': '',
|
| 688 |
-
'isni_code': '',
|
| 689 |
-
'isni_url': '',
|
| 690 |
-
'other_identifiers': '',
|
| 691 |
-
'biographical_note': '',
|
| 692 |
-
'bibliographical_sources': '',
|
| 693 |
-
'bibliographical_notes': '',
|
| 694 |
-
'names_in_manuscript': '',
|
| 695 |
-
'date_of_creation': '',
|
| 696 |
-
'last_modification': '',
|
| 697 |
-
'page_title': '',
|
| 698 |
-
'copyist_name': ''
|
| 699 |
-
}
|
| 700 |
-
|
| 701 |
-
if not soup:
|
| 702 |
-
return metadata
|
| 703 |
-
|
| 704 |
-
# Extract page title
|
| 705 |
-
title_tag = soup.find('title')
|
| 706 |
-
if title_tag:
|
| 707 |
-
metadata['page_title'] = title_tag.get_text(strip=True)
|
| 708 |
-
|
| 709 |
-
# Try to extract copyist name from various possible locations
|
| 710 |
-
name_selectors = [
|
| 711 |
-
'h1', 'h2', '.title', '.copyist-name',
|
| 712 |
-
'[class*="name"]', '[class*="title"]'
|
| 713 |
-
]
|
| 714 |
-
|
| 715 |
-
for selector in name_selectors:
|
| 716 |
-
element = soup.select_one(selector)
|
| 717 |
-
if element:
|
| 718 |
-
name_text = element.get_text(strip=True)
|
| 719 |
-
if name_text and len(name_text) > 2:
|
| 720 |
-
metadata['copyist_name'] = name_text
|
| 721 |
-
break
|
| 722 |
-
|
| 723 |
-
# Find the main data table - look for the specific table structure
|
| 724 |
-
main_table = soup.find('table', class_=['table', 'table-1', 'table-sm'])
|
| 725 |
-
|
| 726 |
-
if not main_table:
|
| 727 |
-
# Fallback: look for any table
|
| 728 |
-
main_table = soup.find('table')
|
| 729 |
-
|
| 730 |
-
if not main_table:
|
| 731 |
-
return metadata
|
| 732 |
-
|
| 733 |
-
# Process each row in the table
|
| 734 |
-
rows = main_table.find_all('tr')
|
| 735 |
|
| 736 |
for row in rows:
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
continue
|
| 742 |
-
|
| 743 |
-
title_div = title_cell.find('div', class_='table-title-item')
|
| 744 |
-
if not title_div:
|
| 745 |
-
continue
|
| 746 |
-
|
| 747 |
-
field_name = title_div.get_text(strip=True)
|
| 748 |
-
|
| 749 |
-
# Get the data cell (should be the second td in the row)
|
| 750 |
-
data_cells = row.find_all('td')
|
| 751 |
-
if len(data_cells) < 2:
|
| 752 |
-
continue
|
| 753 |
-
|
| 754 |
-
data_cell = data_cells[1]
|
| 755 |
-
|
| 756 |
-
# Extract data based on cell type
|
| 757 |
-
self.extract_cell_data(field_name, data_cell, metadata)
|
| 758 |
|
| 759 |
-
|
| 760 |
-
|
| 761 |
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
value = text_item.get_text(strip=True)
|
| 774 |
-
self.map_field_value(field_name, value, metadata)
|
| 775 |
-
|
| 776 |
-
# Handle link cells
|
| 777 |
-
elif 'table-link' in cell_classes:
|
| 778 |
-
text_item = data_cell.find('div', class_='table-text-item')
|
| 779 |
-
if text_item:
|
| 780 |
-
link = text_item.find('a')
|
| 781 |
if link:
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
|
|
|
|
|
|
|
|
|
| 785 |
else:
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
continue
|
| 816 |
-
else:
|
| 817 |
-
# Fallback: look for text items directly
|
| 818 |
-
text_items = data_cell.find_all('div', class_='table-text-item')
|
| 819 |
-
for item in text_items:
|
| 820 |
-
try:
|
| 821 |
-
link = item.find('a')
|
| 822 |
-
if link:
|
| 823 |
-
link_text = link.get_text(strip=True)
|
| 824 |
-
link_url = link.get('href', '')
|
| 825 |
-
if link_url:
|
| 826 |
-
values.append(f"{link_text} ({link_url})")
|
| 827 |
-
else:
|
| 828 |
-
values.append(link_text)
|
| 829 |
-
else:
|
| 830 |
-
text = item.get_text(strip=True)
|
| 831 |
-
if text:
|
| 832 |
-
values.append(text)
|
| 833 |
-
except AttributeError:
|
| 834 |
-
continue
|
| 835 |
-
|
| 836 |
-
self.map_field_list(field_name, values, metadata)
|
| 837 |
-
|
| 838 |
-
# Handle HTML text cells
|
| 839 |
-
elif 'table-text-html' in cell_classes:
|
| 840 |
-
text_item = data_cell.find('div', class_='table-text-item')
|
| 841 |
-
if text_item:
|
| 842 |
-
# Clean HTML and get text
|
| 843 |
-
value = ' '.join(text_item.get_text(strip=True).split())
|
| 844 |
-
self.map_field_value(field_name, value, metadata)
|
| 845 |
-
|
| 846 |
-
except (AttributeError, TypeError):
|
| 847 |
-
pass
|
| 848 |
-
|
| 849 |
-
def map_field_value(self, field_name: str, value: str, metadata: Dict):
|
| 850 |
-
"""Map field values to the appropriate metadata keys"""
|
| 851 |
-
field_mapping = {
|
| 852 |
-
'CNMN code': 'cnmn_code',
|
| 853 |
-
'Date of creation': 'date_of_creation',
|
| 854 |
-
'Last modification': 'last_modification',
|
| 855 |
-
'Biographical note': 'biographical_note',
|
| 856 |
-
'Bibliographical notes': 'bibliographical_notes'
|
| 857 |
-
}
|
| 858 |
-
|
| 859 |
-
mapped_key = field_mapping.get(field_name)
|
| 860 |
-
if mapped_key and mapped_key in metadata:
|
| 861 |
-
metadata[mapped_key] = value
|
| 862 |
-
|
| 863 |
-
def map_field_link(self, field_name: str, link_text: str, link_url: str, metadata: Dict):
|
| 864 |
-
"""Map field links to metadata"""
|
| 865 |
-
if field_name == 'VID SBN':
|
| 866 |
-
metadata['vid_sbn'] = link_text
|
| 867 |
-
metadata['vid_sbn_url'] = link_url
|
| 868 |
-
elif field_name == 'Codice ISNI':
|
| 869 |
-
metadata['isni_code'] = link_text
|
| 870 |
-
metadata['isni_url'] = link_url
|
| 871 |
-
|
| 872 |
-
def map_field_list(self, field_name: str, values: List, metadata: Dict):
|
| 873 |
-
"""Map field lists to metadata"""
|
| 874 |
-
joined_values = '; '.join(str(v) for v in values if v)
|
| 875 |
-
|
| 876 |
-
if field_name == 'Other identifiers':
|
| 877 |
-
metadata['other_identifiers'] = joined_values
|
| 878 |
-
elif field_name == 'Bibliographical sources':
|
| 879 |
-
metadata['bibliographical_sources'] = joined_values
|
| 880 |
-
elif field_name == 'Names in manuscript':
|
| 881 |
-
metadata['names_in_manuscript'] = joined_values
|
| 882 |
-
|
| 883 |
-
def scrape_copyist_by_id(self, copyist_id: str) -> Dict:
|
| 884 |
-
"""Scrape a single copyist by ID"""
|
| 885 |
-
detail_url = f"{self.detail_base_url}{copyist_id}"
|
| 886 |
-
|
| 887 |
-
# Get the detail page
|
| 888 |
-
detail_soup = self.get_page_content(detail_url)
|
| 889 |
-
if not detail_soup:
|
| 890 |
-
return {'error': f'Could not fetch data for copyist ID {copyist_id}'}
|
| 891 |
-
|
| 892 |
-
# Extract metadata
|
| 893 |
-
metadata = self.extract_metadata_from_table(detail_soup)
|
| 894 |
-
|
| 895 |
-
# Add basic info
|
| 896 |
-
metadata['copyist_id'] = copyist_id
|
| 897 |
-
metadata['detail_url'] = detail_url
|
| 898 |
-
metadata['scrape_timestamp'] = datetime.now().isoformat()
|
| 899 |
-
|
| 900 |
-
return metadata
|
| 901 |
-
|
| 902 |
-
def scrape_multiple_copyists(self, copyist_ids: List[str], delay: float = 1.0, progress_callback=None) -> pd.DataFrame:
|
| 903 |
-
"""Scrape multiple copyists by their IDs"""
|
| 904 |
-
all_metadata = []
|
| 905 |
|
| 906 |
for i, copyist_id in enumerate(copyist_ids, 1):
|
| 907 |
-
|
| 908 |
-
progress_callback(f"Processing {i}/{len(copyist_ids)}: Copyist ID {copyist_id}")
|
| 909 |
|
| 910 |
-
|
| 911 |
|
| 912 |
-
if 'error' not in
|
| 913 |
-
|
| 914 |
-
|
| 915 |
else:
|
| 916 |
-
|
| 917 |
-
progress_callback(f"Failed to scrape copyist ID {copyist_id}: {metadata['error']}")
|
| 918 |
|
| 919 |
# Delay between requests
|
| 920 |
if delay > 0:
|
| 921 |
time.sleep(delay)
|
| 922 |
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
"""Scrape all copyists with progress updates"""
|
| 927 |
-
try:
|
| 928 |
-
# Discover all copyist IDs
|
| 929 |
-
copyist_ids = self.discover_all_copyist_ids(progress_callback)
|
| 930 |
-
|
| 931 |
-
if not copyist_ids:
|
| 932 |
-
return pd.DataFrame(), "No copyist IDs found"
|
| 933 |
-
|
| 934 |
-
if progress_callback:
|
| 935 |
-
progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
|
| 936 |
-
|
| 937 |
-
# Limit entries if specified
|
| 938 |
-
if max_entries and max_entries > 0:
|
| 939 |
-
copyist_ids = copyist_ids[:max_entries]
|
| 940 |
-
if progress_callback:
|
| 941 |
-
progress_callback(f"Limited to first {max_entries} entries for testing")
|
| 942 |
-
|
| 943 |
-
# Scrape the copyists
|
| 944 |
-
df = self.scrape_multiple_copyists(copyist_ids, delay, progress_callback)
|
| 945 |
-
|
| 946 |
-
success_msg = f"Successfully scraped {len(df)} copyist records out of {len(copyist_ids)} discovered IDs"
|
| 947 |
-
return df, success_msg
|
| 948 |
-
|
| 949 |
-
except Exception as e:
|
| 950 |
-
return pd.DataFrame(), f"Error during scraping: {str(e)}"
|
| 951 |
|
| 952 |
|
| 953 |
-
#
|
| 954 |
-
def
|
| 955 |
-
"""
|
| 956 |
-
|
| 957 |
-
|
| 958 |
-
|
| 959 |
-
|
| 960 |
-
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
| 965 |
-
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
-
|
| 971 |
-
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
# Create CSV output
|
| 975 |
-
csv_output = io.StringIO()
|
| 976 |
-
df.to_csv(csv_output, index=False)
|
| 977 |
-
csv_content = csv_output.getvalue()
|
| 978 |
-
|
| 979 |
-
return csv_content, f"Success! {status}"
|
| 980 |
-
|
| 981 |
-
except Exception as e:
|
| 982 |
-
return None, f"Error: {str(e)}"
|
| 983 |
-
finally:
|
| 984 |
-
if scraper:
|
| 985 |
-
scraper.cleanup()
|
| 986 |
-
|
| 987 |
-
def run_scraper_requests(delay, max_entries, progress=gr.Progress()):
|
| 988 |
-
"""Run the requests-based scraper with progress updates"""
|
| 989 |
-
try:
|
| 990 |
-
def update_progress(message):
|
| 991 |
-
progress(message)
|
| 992 |
-
|
| 993 |
-
scraper = ManusCopistaMetadataScraper()
|
| 994 |
-
df, status = scraper.scrape_all_copyists_with_progress(
|
| 995 |
-
delay=delay,
|
| 996 |
-
max_entries=max_entries if max_entries > 0 else None,
|
| 997 |
-
progress_callback=update_progress
|
| 998 |
-
)
|
| 999 |
-
|
| 1000 |
-
if df.empty:
|
| 1001 |
-
return None, f"No data scraped. Status: {status}"
|
| 1002 |
-
|
| 1003 |
-
# Create CSV output
|
| 1004 |
-
csv_output = io.StringIO()
|
| 1005 |
-
df.to_csv(csv_output, index=False)
|
| 1006 |
-
csv_content = csv_output.getvalue()
|
| 1007 |
-
|
| 1008 |
-
return csv_content, f"Success! {status}"
|
| 1009 |
-
|
| 1010 |
-
except Exception as e:
|
| 1011 |
-
return None, f"Error: {str(e)}"
|
| 1012 |
-
|
| 1013 |
-
def test_discovery(progress=gr.Progress()):
|
| 1014 |
-
"""Test the discovery method"""
|
| 1015 |
-
try:
|
| 1016 |
-
def update_progress(message):
|
| 1017 |
-
progress(message)
|
| 1018 |
-
|
| 1019 |
-
scraper = ManusCopistaMetadataScraper()
|
| 1020 |
-
results = scraper.test_discovery_method(progress_callback=update_progress)
|
| 1021 |
-
|
| 1022 |
-
return json.dumps(results, indent=2), "Discovery test completed"
|
| 1023 |
-
|
| 1024 |
-
except Exception as e:
|
| 1025 |
-
return None, f"Error: {str(e)}"
|
| 1026 |
-
|
| 1027 |
-
with gr.Blocks(title="Manus Copista Scraper") as interface:
|
| 1028 |
-
gr.Markdown("# Manus Copista Metadata Scraper")
|
| 1029 |
-
gr.Markdown("Scrape copyist metadata from the Manus database using either Selenium or requests.")
|
| 1030 |
-
|
| 1031 |
-
with gr.Tab("Selenium Scraper (Recommended)"):
|
| 1032 |
-
gr.Markdown("### Selenium-based scraper (handles JavaScript)")
|
| 1033 |
-
|
| 1034 |
-
with gr.Row():
|
| 1035 |
-
selenium_delay = gr.Number(label="Delay between requests (seconds)", value=1.0, minimum=0.1)
|
| 1036 |
-
selenium_max_entries = gr.Number(label="Max entries (0 = all)", value=0, minimum=0)
|
| 1037 |
-
|
| 1038 |
-
selenium_run_btn = gr.Button("Run Selenium Scraper", variant="primary")
|
| 1039 |
-
selenium_status = gr.Textbox(label="Status", lines=3)
|
| 1040 |
-
selenium_output = gr.File(label="Download CSV")
|
| 1041 |
-
|
| 1042 |
-
selenium_run_btn.click(
|
| 1043 |
-
run_scraper_selenium,
|
| 1044 |
-
inputs=[selenium_delay, selenium_max_entries],
|
| 1045 |
-
outputs=[selenium_output, selenium_status]
|
| 1046 |
-
)
|
| 1047 |
-
|
| 1048 |
-
with gr.Tab("Requests Scraper"):
|
| 1049 |
-
gr.Markdown("### Requests-based scraper (faster, may miss JavaScript content)")
|
| 1050 |
-
|
| 1051 |
-
with gr.Row():
|
| 1052 |
-
requests_delay = gr.Number(label="Delay between requests (seconds)", value=1.0, minimum=0.1)
|
| 1053 |
-
requests_max_entries = gr.Number(label="Max entries (0 = all)", value=0, minimum=0)
|
| 1054 |
-
|
| 1055 |
-
requests_run_btn = gr.Button("Run Requests Scraper", variant="primary")
|
| 1056 |
-
requests_status = gr.Textbox(label="Status", lines=3)
|
| 1057 |
-
requests_output = gr.File(label="Download CSV")
|
| 1058 |
-
|
| 1059 |
-
requests_run_btn.click(
|
| 1060 |
-
run_scraper_requests,
|
| 1061 |
-
inputs=[requests_delay, requests_max_entries],
|
| 1062 |
-
outputs=[requests_output, requests_status]
|
| 1063 |
-
)
|
| 1064 |
-
|
| 1065 |
-
with gr.Tab("Discovery Test"):
|
| 1066 |
-
gr.Markdown("### Test the ID discovery process")
|
| 1067 |
-
|
| 1068 |
-
test_btn = gr.Button("Test Discovery Method", variant="secondary")
|
| 1069 |
-
test_status = gr.Textbox(label="Status", lines=2)
|
| 1070 |
-
test_output = gr.Textbox(label="Test Results", lines=20)
|
| 1071 |
-
|
| 1072 |
-
test_btn.click(
|
| 1073 |
-
test_discovery,
|
| 1074 |
-
outputs=[test_output, test_status]
|
| 1075 |
-
)
|
| 1076 |
-
|
| 1077 |
-
gr.Markdown("---")
|
| 1078 |
-
gr.Markdown("**Note:** The Selenium scraper is recommended as it can handle JavaScript content. The requests scraper is faster but may miss some data.")
|
| 1079 |
-
|
| 1080 |
-
return interface
|
| 1081 |
|
| 1082 |
|
| 1083 |
-
# Main execution
|
| 1084 |
if __name__ == "__main__":
|
| 1085 |
-
|
| 1086 |
-
interface = create_gradio_interface()
|
| 1087 |
-
interface.launch(
|
| 1088 |
-
server_name="0.0.0.0",
|
| 1089 |
-
server_port=7860,
|
| 1090 |
-
share=False,
|
| 1091 |
-
debug=True
|
| 1092 |
-
)
|
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from typing import Dict, List, Optional
import json
from datetime import datetime
+import io

+class ManusCopistaRequestsScraper:
+    def __init__(self):
+        self.base_url = "https://manus.iccu.sbn.it"
        self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
        self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"

+        # Setup session with proper headers
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'none',
+        })

+    def get_page(self, url: str) -> Optional[BeautifulSoup]:
+        """Fetch a page and return BeautifulSoup object"""
        try:
+            print(f"Fetching: {url}")
+            response = self.session.get(url, timeout=15)
+            response.raise_for_status()

+            # Check if we got a proper response
+            if response.status_code != 200:
+                print(f"Bad status code: {response.status_code}")
+                return None

+            return BeautifulSoup(response.text, 'html.parser')

+        except requests.exceptions.RequestException as e:
+            print(f"Request error for {url}: {e}")
            return None
        except Exception as e:
+            print(f"Unexpected error for {url}: {e}")
            return None

+    def discover_copyist_ids(self) -> List[str]:
+        """Discover copyist IDs from the browse page"""
+        print("Discovering copyist IDs...")

+        # Try different approaches to get the data
+        urls_to_try = [
+            self.browse_url,
+            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse",
+            "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse?delta=50",
+            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse?delta=100"
+        ]

+        all_ids = set()

+        for url in urls_to_try:
+            soup = self.get_page(url)
+            if soup:
+                ids = self.extract_ids_from_page(soup)
+                all_ids.update(ids)
+                print(f"Found {len(ids)} IDs from {url}")
+
+                # If we found IDs, try to get more from pagination
+                if ids:
+                    pagination_ids = self.handle_pagination(soup, url)
+                    all_ids.update(pagination_ids)

+        # If no IDs found from browse page, try a range-based approach
+        if not all_ids:
+            print("No IDs found from browse page, trying range-based discovery...")
+            all_ids = self.discover_ids_by_range()

+        return sorted(list(all_ids))

+    def extract_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
+        """Extract copyist IDs from a page"""
        ids = set()

+        # Look for links that contain detail/ followed by numbers
+        links = soup.find_all('a', href=True)
        for link in links:
+            href = link.get('href', '')
+            match = re.search(r'detail/(\d+)', href)
+            if match:
+                copyist_id = match.group(1)
+                if len(copyist_id) >= 5:  # Valid ID length
+                    ids.add(copyist_id)
+
+        # Also look for any numbers that might be IDs in the page
+        text = soup.get_text()
+        numbers = re.findall(r'\b\d{6,7}\b', text)
+        for num in numbers:
+            if self.is_valid_id_format(num):
+                ids.add(num)

        return list(ids)

+    def handle_pagination(self, soup: BeautifulSoup, base_url: str) -> List[str]:
+        """Handle pagination to get more IDs"""
+        all_ids = set()

+        # Look for pagination links
+        pagination_links = []
+        links = soup.find_all('a', href=True)

+        for link in links:
+            href = link.get('href', '')
+            text = link.get_text(strip=True).lower()

+            # Look for next page or numbered pages
+            if any(word in text for word in ['next', 'seguente', 'page', 'pagina']) or text.isdigit():
+                if href and href.startswith('/'):
+                    full_url = self.base_url + href
+                    pagination_links.append(full_url)

+        # Visit pagination pages
+        for page_url in pagination_links[:10]:  # Limit to prevent infinite loops
+            print(f"Checking pagination page: {page_url}")
+            page_soup = self.get_page(page_url)
+            if page_soup:
+                page_ids = self.extract_ids_from_page(page_soup)
+                all_ids.update(page_ids)
+            time.sleep(1)  # Be respectful

+        return list(all_ids)

+    def discover_ids_by_range(self, start_id: int = 100000, end_id: int = 999999, sample_size: int = 1000) -> List[str]:
+        """Discover IDs by testing a range of potential IDs"""
+        print(f"Testing range-based discovery with {sample_size} samples...")

+        valid_ids = []

+        # Test a sample of IDs in the range
+        import random
+        test_ids = random.sample(range(start_id, end_id), min(sample_size, end_id - start_id))

+        for i, test_id in enumerate(test_ids):
+            if i % 100 == 0:
+                print(f"Tested {i}/{len(test_ids)} IDs, found {len(valid_ids)} valid")
+
+            if self.test_id_exists(str(test_id)):
+                valid_ids.append(str(test_id))
+
+            time.sleep(0.1)  # Small delay

+        return valid_ids

+    def test_id_exists(self, copyist_id: str) -> bool:
+        """Test if a copyist ID exists by making a HEAD request"""
+        url = f"{self.detail_base_url}{copyist_id}"
        try:
+            response = self.session.head(url, timeout=5)
            return response.status_code == 200
        except:
            return False

+    def is_valid_id_format(self, id_str: str) -> bool:
+        """Check if string looks like a valid copyist ID"""
+        if not id_str.isdigit():
+            return False
+        return 5 <= len(id_str) <= 7
+
+    def scrape_copyist_detail(self, copyist_id: str) -> Dict:
+        """Scrape detailed information for a single copyist"""
+        url = f"{self.detail_base_url}{copyist_id}"
+        soup = self.get_page(url)

        if not soup:
+            return {'error': f'Could not fetch page for ID {copyist_id}'}

+        # Extract basic info
+        data = {
+            'copyist_id': copyist_id,
+            'detail_url': url,
+            'scrape_timestamp': datetime.now().isoformat()
+        }

+        # Extract title
+        title = soup.find('title')
+        if title:
+            data['page_title'] = title.get_text(strip=True)

+        # Extract main content
+        self.extract_copyist_data(soup, data)

+        return data

+    def extract_copyist_data(self, soup: BeautifulSoup, data: Dict):
+        """Extract copyist data from the page"""
+        # Try to find the main content table
+        table = soup.find('table', class_='table')
+        if not table:
+            table = soup.find('table')

+        if table:
+            self.extract_table_data(table, data)

+        # Try to extract name from various locations
+        name_candidates = []

+        # Look in headings
+        for heading in soup.find_all(['h1', 'h2', 'h3']):
+            text = heading.get_text(strip=True)
+            if text and len(text) > 2:
+                name_candidates.append(text)

+        # Look in title
+        if 'page_title' in data:
+            title_parts = data['page_title'].split(' - ')
+            for part in title_parts:
+                if part.strip() and len(part.strip()) > 2:
+                    name_candidates.append(part.strip())

+        # Set the most likely name
+        if name_candidates:
+            data['copyist_name'] = name_candidates[0]

+    def extract_table_data(self, table, data: Dict):
+        """Extract data from the main table"""
+        rows = table.find_all('tr')

        for row in rows:
+            cells = row.find_all(['td', 'th'])
+            if len(cells) >= 2:
+                key_cell = cells[0]
+                value_cell = cells[1]

+                key = key_cell.get_text(strip=True).lower()
+                value = value_cell.get_text(strip=True)

+                # Map common fields
+                if 'cnmn' in key:
+                    data['cnmn_code'] = value
+                elif 'sbn' in key:
+                    data['vid_sbn'] = value
+                    link = value_cell.find('a')
+                    if link:
+                        data['vid_sbn_url'] = link.get('href', '')
+                elif 'isni' in key:
+                    data['isni_code'] = value
+                    link = value_cell.find('a')
                    if link:
+                        data['isni_url'] = link.get('href', '')
+                elif 'biographical' in key or 'biografica' in key:
+                    data['biographical_note'] = value
+                elif 'bibliographical' in key or 'bibliografia' in key:
+                    if 'source' in key:
+                        data['bibliographical_sources'] = value
                    else:
+                        data['bibliographical_notes'] = value
+                elif 'name' in key and 'manuscript' in key:
+                    data['names_in_manuscript'] = value
+                elif 'creation' in key or 'creazione' in key:
+                    data['date_of_creation'] = value
+                elif 'modification' in key or 'modifica' in key:
+                    data['last_modification'] = value
+                elif 'identifier' in key:
+                    data['other_identifiers'] = value
+
+    def scrape_all_copyists(self, delay: float = 1.0, max_entries: int = None) -> pd.DataFrame:
+        """Scrape all copyists"""
+        print("Starting full scrape...")
+
+        # Discover IDs
+        copyist_ids = self.discover_copyist_ids()
+        print(f"Found {len(copyist_ids)} copyist IDs")
+
+        if not copyist_ids:
+            print("No copyist IDs found!")
+            return pd.DataFrame()
+
+        # Limit if requested
+        if max_entries and max_entries > 0:
+            copyist_ids = copyist_ids[:max_entries]
+            print(f"Limited to {max_entries} entries")
+
+        # Scrape each copyist
+        all_data = []

        for i, copyist_id in enumerate(copyist_ids, 1):
+            print(f"Scraping {i}/{len(copyist_ids)}: ID {copyist_id}")

+            data = self.scrape_copyist_detail(copyist_id)

+            if 'error' not in data:
+                data['scrape_order'] = i
+                all_data.append(data)
            else:
+                print(f"Error scraping {copyist_id}: {data['error']}")

            # Delay between requests
            if delay > 0:
                time.sleep(delay)

+        df = pd.DataFrame(all_data)
+        print(f"Successfully scraped {len(df)} copyists")
+        return df


+# Simple usage example
+def main():
+    """Main function to run the scraper"""
+    scraper = ManusCopistaRequestsScraper()
+
+    # Test with a small number first
+    print("Testing with 10 entries...")
+    df = scraper.scrape_all_copyists(delay=1.0, max_entries=10)
+
+    if not df.empty:
+        print(f"Successfully scraped {len(df)} copyists")
+        print("\nColumns:", df.columns.tolist())
+        print("\nFirst few rows:")
+        print(df.head())
+
+        # Save to CSV
+        filename = f"manus_copyists_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+        df.to_csv(filename, index=False)
+        print(f"\nSaved to {filename}")
+    else:
+        print("No data scraped!")


if __name__ == "__main__":
+    main()
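Because the rewritten file is now a plain module built around ManusCopistaRequestsScraper, it can also be driven from another script instead of the bundled main(). A minimal sketch of that usage follows; the import path app and the ID "123456" are illustrative assumptions, not values taken from the diff.

# Minimal usage sketch -- assumes the new file above is saved as app.py next to this script.
from app import ManusCopistaRequestsScraper

scraper = ManusCopistaRequestsScraper()

# Sanity-check the field mapping on a single record first ("123456" is a placeholder ID).
record = scraper.scrape_copyist_detail("123456")
print(record.get('copyist_name'), record.get('cnmn_code'))

# Then scrape a small batch with a polite delay before attempting the full set.
df = scraper.scrape_all_copyists(delay=1.0, max_entries=25)
df.to_csv("manus_copyists_sample.csv", index=False)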