File size: 12,514 Bytes
2bd6de2
 
 
 
 
 
 
 
3bd337f
d1c8665
3bd337f
 
 
d1c8665
 
 
3bd337f
 
 
 
 
 
 
 
 
 
 
 
 
d1c8665
3bd337f
 
d1c8665
3bd337f
 
 
d1c8665
3bd337f
 
 
 
d1c8665
3bd337f
d1c8665
3bd337f
 
d1c8665
 
3bd337f
d1c8665
 
3bd337f
 
 
d1c8665
3bd337f
 
 
 
 
 
 
d1c8665
3bd337f
d1c8665
3bd337f
 
 
 
 
 
 
 
 
 
 
d1c8665
3bd337f
 
 
 
d1c8665
3bd337f
d1c8665
3bd337f
 
d1c8665
 
3bd337f
 
d1c8665
3bd337f
 
 
 
 
 
 
 
 
 
 
 
 
d1c8665
 
 
3bd337f
 
 
d1c8665
3bd337f
 
 
d1c8665
3bd337f
 
 
d1c8665
3bd337f
 
 
 
 
d1c8665
3bd337f
 
 
 
 
 
 
 
2bd6de2
3bd337f
d1c8665
3bd337f
 
 
55fd359
3bd337f
d1c8665
3bd337f
 
 
33a28ec
3bd337f
 
 
 
 
 
 
 
33a28ec
3bd337f
2bd6de2
3bd337f
 
 
33a28ec
3bd337f
33a28ec
 
 
2bd6de2
3bd337f
 
 
 
 
 
 
 
 
 
2bd6de2
 
3bd337f
2bd6de2
3bd337f
 
 
 
 
 
2bd6de2
3bd337f
 
 
 
2bd6de2
3bd337f
 
5852f69
3bd337f
5852f69
3bd337f
 
 
 
 
 
5852f69
3bd337f
 
5852f69
3bd337f
 
5852f69
3bd337f
 
 
 
 
5852f69
3bd337f
 
 
 
 
 
5852f69
3bd337f
 
 
2bd6de2
3bd337f
 
 
2bd6de2
 
3bd337f
 
 
 
2bd6de2
3bd337f
 
2bd6de2
3bd337f
 
 
 
 
 
 
 
 
 
 
2bd6de2
3bd337f
 
 
 
 
 
2bd6de2
3bd337f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1c8665
 
3bd337f
d1c8665
3bd337f
d1c8665
3bd337f
 
 
d1c8665
3bd337f
d1c8665
 
 
 
 
3bd337f
 
 
2bd6de2
b15e2d6
3bd337f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1c8665
1e189cd
2bd6de2
3bd337f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
import io
import json
import random
import re
import time
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup

class ManusCopistaRequestsScraper:
    """Scrape copyist authority records from the MANUS (ICCU) catalogue.

    Copyist IDs are discovered from the browse pages (following pagination,
    with a random range-probing fallback when the browse pages yield
    nothing); each detail page is then fetched and parsed, and the
    collected records are returned as a pandas DataFrame.
    """

    def __init__(self):
        self.base_url = "https://manus.iccu.sbn.it"
        self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
        self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"

        # A shared session gives connection pooling and persistent cookies;
        # browser-like headers reduce the chance of the server rejecting us.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
        })

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch *url* and return it parsed with BeautifulSoup, or None on any error."""
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=15)
            # raise_for_status() raises HTTPError for any 4xx/5xx response,
            # so no separate status-code check is needed afterwards (the
            # old `!= 200` branch was unreachable).
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"Request error for {url}: {e}")
            return None
        except Exception as e:
            # Parsing or other unexpected failures: log and skip this page.
            print(f"Unexpected error for {url}: {e}")
            return None

    def discover_copyist_ids(self) -> List[str]:
        """Return a sorted list of copyist IDs discovered from the browse pages.

        Tries several browse-URL variants, follows pagination on pages that
        yielded IDs, and falls back to random range probing when the browse
        pages produce nothing at all.
        """
        print("Discovering copyist IDs...")

        # URL variants: localized vs. non-localized paths, plus larger page
        # sizes (`delta`) to surface more entries per request.
        urls_to_try = [
            self.browse_url,
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse",
            "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse?delta=50",
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse?delta=100"
        ]

        all_ids = set()

        for url in urls_to_try:
            soup = self.get_page(url)
            if not soup:
                continue
            ids = self.extract_ids_from_page(soup)
            all_ids.update(ids)
            print(f"Found {len(ids)} IDs from {url}")

            # Only follow pagination when the page actually listed IDs.
            if ids:
                all_ids.update(self.handle_pagination(soup, url))

        if not all_ids:
            print("No IDs found from browse page, trying range-based discovery...")
            all_ids = self.discover_ids_by_range()

        return sorted(list(all_ids))

    def extract_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
        """Extract candidate copyist IDs from one parsed page."""
        ids = set()

        # Primary source: anchor hrefs of the form .../detail/<id>.
        for link in soup.find_all('a', href=True):
            match = re.search(r'detail/(\d+)', link.get('href', ''))
            if match:
                copyist_id = match.group(1)
                if len(copyist_id) >= 5:  # plausible ID length
                    ids.add(copyist_id)

        # Secondary source: any 6-7 digit number in the page text that
        # passes the ID-format check.
        for num in re.findall(r'\b\d{6,7}\b', soup.get_text()):
            if self.is_valid_id_format(num):
                ids.add(num)

        return list(ids)

    def handle_pagination(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """Follow pagination links found on *soup* and collect IDs from them.

        *base_url* is kept for interface compatibility; links are resolved
        against the site root.
        """
        all_ids = set()

        # Collect candidate pagination links: "next"-style labels (English
        # or Italian) or purely numeric page links, restricted to
        # site-relative hrefs.
        pagination_links = []
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True).lower()
            looks_like_page = (
                any(word in text for word in ['next', 'seguente', 'page', 'pagina'])
                or text.isdigit()
            )
            if looks_like_page and href.startswith('/'):
                pagination_links.append(self.base_url + href)

        # Cap at 10 pages to prevent runaway crawls.
        for page_url in pagination_links[:10]:
            print(f"Checking pagination page: {page_url}")
            page_soup = self.get_page(page_url)
            if page_soup:
                all_ids.update(self.extract_ids_from_page(page_soup))
                time.sleep(1)  # Be respectful

        return list(all_ids)

    def discover_ids_by_range(self, start_id: int = 100000, end_id: int = 999999, sample_size: int = 1000) -> List[str]:
        """Probe a random sample of IDs in [start_id, end_id) and return those that exist."""
        print(f"Testing range-based discovery with {sample_size} samples...")

        valid_ids = []
        # Sample without replacement; cap at the size of the range.
        test_ids = random.sample(range(start_id, end_id), min(sample_size, end_id - start_id))

        for i, test_id in enumerate(test_ids):
            if i % 100 == 0:
                print(f"Tested {i}/{len(test_ids)} IDs, found {len(valid_ids)} valid")

            if self.test_id_exists(str(test_id)):
                valid_ids.append(str(test_id))

            time.sleep(0.1)  # Small delay between probes

        return valid_ids

    def test_id_exists(self, copyist_id: str) -> bool:
        """Return True if a HEAD request for the detail page answers 200."""
        url = f"{self.detail_base_url}{copyist_id}"
        try:
            response = self.session.head(url, timeout=5)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            # Narrowed from a bare `except:`, which would also have
            # swallowed KeyboardInterrupt/SystemExit.
            return False

    def is_valid_id_format(self, id_str: str) -> bool:
        """Return True if *id_str* is all digits and 5-7 characters long."""
        if not id_str.isdigit():
            return False
        return 5 <= len(id_str) <= 7

    def scrape_copyist_detail(self, copyist_id: str) -> Dict:
        """Fetch and parse the detail page for one copyist.

        Returns a dict of extracted fields, or a dict containing only an
        'error' key when the page could not be fetched.
        """
        url = f"{self.detail_base_url}{copyist_id}"
        soup = self.get_page(url)
        if not soup:
            return {'error': f'Could not fetch page for ID {copyist_id}'}

        data = {
            'copyist_id': copyist_id,
            'detail_url': url,
            'scrape_timestamp': datetime.now().isoformat()
        }

        title = soup.find('title')
        if title:
            data['page_title'] = title.get_text(strip=True)

        # Populate remaining fields from the page body (mutates data).
        self.extract_copyist_data(soup, data)
        return data

    def extract_copyist_data(self, soup: BeautifulSoup, data: Dict):
        """Extract name and tabular fields from the page into *data* (in place)."""
        # Prefer the styled content table; fall back to any table.
        table = soup.find('table', class_='table') or soup.find('table')
        if table:
            self.extract_table_data(table, data)

        # Gather name candidates: headings first, then title segments.
        name_candidates = []
        for heading in soup.find_all(['h1', 'h2', 'h3']):
            text = heading.get_text(strip=True)
            if text and len(text) > 2:
                name_candidates.append(text)

        if 'page_title' in data:
            for part in data['page_title'].split(' - '):
                part = part.strip()
                if part and len(part) > 2:
                    name_candidates.append(part)

        # The first candidate (normally the main heading) is the best guess.
        if name_candidates:
            data['copyist_name'] = name_candidates[0]

    def extract_table_data(self, table, data: Dict):
        """Map key/value rows of the detail table into *data* (in place)."""
        for row in table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if len(cells) < 2:
                continue
            key = cells[0].get_text(strip=True).lower()
            value_cell = cells[1]
            value = value_cell.get_text(strip=True)

            # Map the English/Italian row labels onto stable column names.
            if 'cnmn' in key:
                data['cnmn_code'] = value
            elif 'sbn' in key:
                data['vid_sbn'] = value
                link = value_cell.find('a')
                if link:
                    data['vid_sbn_url'] = link.get('href', '')
            elif 'isni' in key:
                data['isni_code'] = value
                link = value_cell.find('a')
                if link:
                    data['isni_url'] = link.get('href', '')
            elif 'biographical' in key or 'biografica' in key:
                data['biographical_note'] = value
            elif 'bibliographical' in key or 'bibliografia' in key:
                if 'source' in key:
                    data['bibliographical_sources'] = value
                else:
                    data['bibliographical_notes'] = value
            elif 'name' in key and 'manuscript' in key:
                data['names_in_manuscript'] = value
            elif 'creation' in key or 'creazione' in key:
                data['date_of_creation'] = value
            elif 'modification' in key or 'modifica' in key:
                data['last_modification'] = value
            elif 'identifier' in key:
                data['other_identifiers'] = value

    def scrape_all_copyists(self, delay: float = 1.0, max_entries: Optional[int] = None) -> pd.DataFrame:
        """Discover and scrape all copyists; return the records as a DataFrame.

        Args:
            delay: Seconds to sleep between detail-page requests.
            max_entries: If set (> 0), scrape at most this many IDs.
        """
        print("Starting full scrape...")

        copyist_ids = self.discover_copyist_ids()
        print(f"Found {len(copyist_ids)} copyist IDs")

        if not copyist_ids:
            print("No copyist IDs found!")
            return pd.DataFrame()

        if max_entries and max_entries > 0:
            copyist_ids = copyist_ids[:max_entries]
            print(f"Limited to {max_entries} entries")

        all_data = []
        for i, copyist_id in enumerate(copyist_ids, 1):
            print(f"Scraping {i}/{len(copyist_ids)}: ID {copyist_id}")

            data = self.scrape_copyist_detail(copyist_id)
            if 'error' not in data:
                data['scrape_order'] = i
                all_data.append(data)
            else:
                print(f"Error scraping {copyist_id}: {data['error']}")

            # Throttle between detail requests.
            if delay > 0:
                time.sleep(delay)

        df = pd.DataFrame(all_data)
        print(f"Successfully scraped {len(df)} copyists")
        return df


# Simple usage example
def main():
    """Run the scraper on a small sample and save the results to CSV."""
    scraper = ManusCopistaRequestsScraper()

    # Start with a small batch to validate the site structure before a full run.
    print("Testing with 10 entries...")
    df = scraper.scrape_all_copyists(delay=1.0, max_entries=10)

    if not df.empty:
        print(f"Successfully scraped {len(df)} copyists")
        print("\nColumns:", df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())

        # Timestamped filename avoids clobbering earlier runs.
        filename = f"manus_copyists_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False)
        # BUG FIX: previously printed a literal placeholder instead of the
        # actual output filename.
        print(f"\nSaved to {filename}")
    else:
        print("No data scraped!")


# Entry-point guard: run the scraper only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()