saim1309 committed on
Commit
407925d
·
verified ·
1 Parent(s): 5412674

Delete scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +0 -331
scraper.py DELETED
@@ -1,331 +0,0 @@
1
- import requests
2
- import json
3
- import re
4
- from bs4 import BeautifulSoup
5
- from typing import List, Dict, Any, Tuple
6
- from utils import clean_time
7
-
8
def scrape_workshops_from_squarespace(url: str) -> List[Dict[str, str]]:
    """
    Scrape workshop listings from a Squarespace-hosted page.

    Strategy: first hit the Squarespace ``?format=json`` endpoint and parse
    the embedded content; if that yields nothing (or isn't valid JSON),
    fall back to fetching and parsing the rendered HTML page.

    Args:
        url: Page URL to scrape.

    Returns:
        A list of workshop dicts (possibly empty); never raises — all
        errors are logged and an empty list is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Preferred path: the Squarespace JSON API.
        json_url = f"{url}?format=json"
        print(f"🔍 Trying Squarespace JSON API: {json_url}")

        api_response = requests.get(json_url, headers=headers, timeout=10)
        if api_response.status_code == 200:
            try:
                payload = api_response.json()
                found = extract_workshops_from_json(payload, json_url)
                if found:
                    print(f"✅ Extracted {len(found)} workshops from JSON API")
                    return found
                print("❌ No workshops found in JSON, falling back to HTML")
            except json.JSONDecodeError:
                print("❌ Invalid JSON response, falling back to HTML")

        # Fallback path: scrape the rendered HTML page.
        print(f"📄 Falling back to HTML scraping for {url}")
        page_response = requests.get(url, headers=headers, timeout=10)
        page_response.raise_for_status()

        page_soup = BeautifulSoup(page_response.content, 'html.parser')
        found = parse_workshops_from_html(page_soup, url)

        if found:
            print(f"✅ Extracted {len(found)} workshops from HTML parsing")
            return found
        print("❌ No workshops found in HTML")
        return []

    except Exception as e:
        # Network failures, HTTP errors, parser errors: log and return nothing.
        print(f"❌ Error scraping workshops from {url}: {e}")
        return []
52
-
53
def extract_workshops_from_json(data: Any, source_url: str) -> List[Dict[str, str]]:
    """Extract workshop information from Squarespace JSON data.

    Squarespace's JSON payload embeds the page body as an HTML string under
    the 'mainContent' key; when present, that HTML is parsed for workshops.

    Args:
        data: Decoded JSON payload (any shape is tolerated).
        source_url: URL recorded on each extracted workshop dict.

    Returns:
        List of workshop dicts; empty when no usable content is found.
    """
    results: List[Dict[str, str]] = []

    if isinstance(data, dict):
        html_blob = data.get('mainContent')
        if isinstance(html_blob, str):
            print(f"🎯 Found mainContent HTML! Length: {len(html_blob)} characters")

            parsed = BeautifulSoup(html_blob, 'html.parser')
            results = parse_workshops_from_html(parsed, source_url)

            if results:
                return results

    return results
70
-
71
def parse_workshops_from_html(soup, source_url: str) -> List[Dict[str, str]]:
    """Enhanced HTML parsing specifically for workshop content.

    Two extraction passes:
      1. Container scan — div/section/article elements whose class names
         look like listing items are parsed individually.
      2. Pattern scan — workshop regexes are run over the page's full text.

    Args:
        soup: BeautifulSoup document or fragment to scan.
        source_url: URL recorded on each extracted workshop dict.

    Returns:
        List of unique workshop dicts (may be empty).
    """
    workshops = []
    workshop_texts = set()

    print(f"🔍 ENHANCED HTML PARSING:")

    # Method 1: Find individual workshop containers
    potential_containers = soup.find_all(['div', 'section', 'article'],
        attrs={'class': re.compile(r'(item|card|product|workshop|class)', re.I)})

    print(f" Found {len(potential_containers)} potential workshop containers")

    for container in potential_containers:
        workshop_text = container.get_text(strip=True)

        # Skip tiny fragments and exact-text duplicates.
        if len(workshop_text) < 30 or workshop_text in workshop_texts:
            continue

        if any(keyword in workshop_text.lower() for keyword in ['with', 'casting', 'director', 'agent', 'perfect submission', 'crush the callback', 'get scene']):
            workshop = extract_single_workshop_from_text(workshop_text, source_url)
            if workshop and not is_duplicate_workshop(workshop, workshops):
                workshops.append(workshop)
                workshop_texts.add(workshop_text)

    # Method 2: Pattern-based extraction from full text
    all_text = soup.get_text()

    workshop_patterns = [
        # Pattern 1: "Workshop Title with Professional Title Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 2: "Professional Title Name, Workshop Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Manager|Director|Producer|Agent)\s+[A-Za-z\s]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 3: "Casting Director Name, Date @ Time"
        r'(Casting\s+Director)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',
    ]

    for i, pattern in enumerate(workshop_patterns):
        # BUG FIX: re.findall() returns tuples of group strings, but the
        # downstream parser (parse_pattern_match) calls .group() on its
        # argument, so every findall result raised AttributeError (silently
        # swallowed by its try/except) and Method 2 never produced results.
        # finditer() yields real re.Match objects.
        for match in re.finditer(pattern, all_text, re.IGNORECASE):
            workshop = parse_refined_workshop_match(match, i+1, source_url)
            if workshop and not is_duplicate_workshop(workshop, workshops):
                workshops.append(workshop)

    print(f"🎯 TOTAL UNIQUE WORKSHOPS FOUND: {len(workshops)}")
    return workshops
119
-
120
def extract_single_workshop_from_text(text: str, source_url: str) -> Dict[str, str]:
    """Extract workshop info from a single container's text block.

    The text is normalised (prices and "Featured"/"Sold Out" badges removed,
    whitespace collapsed), then matched against a prioritised list of
    workshop-description patterns. Returns a workshop dict for the first
    pattern that matches, or None when nothing matches.
    """

    # Normalise the raw container text before pattern matching.
    cleaned = re.sub(r'\$[0-9,]+\.00', '', text)
    cleaned = re.sub(r'Featured|Sold Out', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = re.sub(r'\n+', ' ', cleaned)

    patterns = [
        # Pattern A: "Title with Professional Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|CD|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer|Atlanta\s+Models\s+&\s+Talent\s+President)\s+[A-Za-z\s\-]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern B: "Professional Name, Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Casting\s+Associate|Manager|Director|Producer|Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s\-]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern C: "Casting Director Name, Date at Time"
        r'(Casting\s+Director|Casting\s+Associate)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',

        # Pattern D: "Company Executive Producer Name on Date"
        r"([A-Za-z']+\s+(?:Executive\s+Casting\s+Producer|Studios\s+Casting\s+Associate))\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?",

        # Pattern E: "Company Agent Name Date" (fixed "on" issue)
        r'([A-Za-z\s]+)\s+(Agent|Talent)\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern F: "Company, Person, Title on Date"
        r'([A-Za-z\s]+\s+Talent),\s+([A-Za-z\s\.]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern G: Flexible fallback
        r'^([A-Za-z\s&\']{3,25}(?:Director|Agent|Manager|Producer|President|Coach))\s+([A-Za-z\s\-]{3,30}?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?$'
    ]

    # First pattern wins; index selects the decode branch downstream.
    for idx, candidate in enumerate(patterns):
        hit = re.search(candidate, cleaned, re.IGNORECASE)
        if hit:
            return parse_pattern_match(hit, idx, source_url)

    return None
158
-
159
def parse_pattern_match(match, pattern_index: int, source_url: str) -> Dict[str, str]:
    """Parse a regex match based on pattern type.

    Decodes a re.Match object into a workshop dict. The branch taken is
    selected by `pattern_index`, whose meaning corresponds to the pattern
    lists built by the callers (extract_single_workshop_from_text's
    Patterns A-G, and — via parse_refined_workshop_match — the page-level
    workshop_patterns). Group layout therefore differs per branch.

    Args:
        match: re.Match whose groups follow the pattern at `pattern_index`.
        pattern_index: 0-based pattern selector (0=A, 1=B, ..., else G).
        source_url: URL recorded in the resulting dict.

    Returns:
        Workshop dict with title/instructor/date/time/full_text/source_url,
        or None when the match lacks an instructor name or date, when the
        fallback Pattern G candidate is rejected, or when any branch raises
        (the exception is logged and swallowed).
    """
    # Initialize variables to avoid UnboundLocalError
    workshop_title = ""
    instructor_title = ""
    instructor_name = ""
    date_str = ""
    time_str = ""

    try:
        if pattern_index == 0: # Pattern A: "Title with Professional Name on Date @ Time"
            workshop_title = match.group(1).strip()
            professional_full = match.group(2).strip()
            date_str = match.group(3).strip()
            # Group 4 (time) is optional in every pattern; default to "".
            time_str = match.group(4).strip() if match.group(4) else ""

            # Expand the "CD" abbreviation so parse_professional_info can
            # recognise the full "Casting Director" title.
            if professional_full.startswith('CD '):
                professional_full = 'Casting Director ' + professional_full[3:]

            instructor_title, instructor_name = parse_professional_info(professional_full)

        elif pattern_index == 1: # Pattern B: "Professional Name, Title on Date @ Time"
            professional_full = match.group(1).strip()
            workshop_title = match.group(2).strip()
            date_str = match.group(3).strip()
            time_str = match.group(4).strip() if match.group(4) else ""

            instructor_title, instructor_name = parse_professional_info(professional_full)

        elif pattern_index == 2: # Pattern C: "Casting Director Name, Date at Time"
            instructor_title = match.group(1).strip()
            instructor_name = match.group(2).strip()
            date_str = match.group(3).strip()
            time_str = match.group(4).strip() if match.group(4) else ""
            # Pattern C carries no explicit title; use a generic one.
            workshop_title = "Casting Workshop"

        elif pattern_index == 3: # Pattern D: "Company Executive Producer Name on Date"
            instructor_title = match.group(1).strip()
            instructor_name = match.group(2).strip()
            date_str = match.group(3).strip()
            time_str = match.group(4).strip() if match.group(4) else ""
            workshop_title = "Industry Workshop"

        elif pattern_index == 4: # Pattern E: "Company Agent Name Date" (5 groups)
            company_name = match.group(1).strip()
            agent_type = match.group(2).strip()
            instructor_name = match.group(3).strip()
            date_str = match.group(4).strip()
            # Guard on group count: only Patterns E/F capture a 5th group.
            time_str = match.group(5).strip() if len(match.groups()) > 4 and match.group(5) else ""

            instructor_title = f"{company_name} {agent_type}"
            workshop_title = "Industry Workshop"

        elif pattern_index == 5: # Pattern F: "Company, Person, Title on Date" (5 groups)
            company_name = match.group(1).strip()
            instructor_name = match.group(2).strip()
            workshop_title = match.group(3).strip()
            date_str = match.group(4).strip()
            time_str = match.group(5).strip() if len(match.groups()) > 4 and match.group(5) else ""

            # The company stands in for the instructor's title here.
            instructor_title = company_name

        else: # Pattern G: flexible fallback "Title-ish words Name Date"
            professional_full = match.group(1).strip() + " " + match.group(2).strip()
            date_str = match.group(3).strip()
            time_str = match.group(4).strip() if match.group(4) else ""
            workshop_title = "Industry Workshop"

            # Fallback matches are noisy; reject over-long or multi-line hits.
            if len(professional_full) > 50 or '\n' in professional_full:
                return None

            instructor_title, instructor_name = parse_professional_info(professional_full)

        # A usable workshop needs at least an instructor and a date.
        if instructor_name and date_str:
            # Create full_text for embedding (required by existing Flask API)
            full_text = f"{workshop_title} with {instructor_title} {instructor_name}"
            if date_str:
                full_text += f" on {date_str}"
            if time_str:
                # clean_time comes from utils (imported at module top);
                # presumably normalises the time string — see utils.clean_time.
                full_text += f" at {clean_time(time_str)}"

            return {
                'title': workshop_title,
                'instructor_name': instructor_name,
                'instructor_title': instructor_title,
                'date': date_str,
                'time': clean_time(time_str),
                'full_text': full_text, # Required for existing embedding system
                'source_url': source_url
            }

    except Exception as e:
        # Malformed matches (e.g. wrong group count) are logged, not raised.
        print(f"Error parsing pattern match: {e}")

    return None
254
-
255
def parse_professional_info(professional_full: str) -> tuple:
    """Split a combined 'Title Name' string into (title, name).

    Resolution order:
      1. known multi-word industry titles (the name is whatever sits on the
         other side of the title in the string);
      2. single-word titles, optionally merged with a recognised one-word
         modifier in front of them (e.g. "Casting" + "Director");
      3. fallback: first word is the title, the rest is the name.

    Returns ('', original string) when nothing can be split off.
    """

    normalized = re.sub(r'\s+', ' ', professional_full).strip()

    # Handle specific multi-word titles
    known_titles = [
        'Atlanta Models & Talent President',
        'Executive Casting Producer',
        'Casting Director',
        'Casting Associate',
        'DDO Agent',
        'Talent Agent',
        'Acting Coach'
    ]

    for known in known_titles:
        pos = normalized.find(known)
        if pos == -1:
            continue
        if pos == 0:
            # "Title Name ..." -> the name follows the title.
            return known, normalized[len(known):].strip()
        # "Name, Title" -> the name precedes the title.
        return known, normalized[:pos].strip().rstrip(',')

    # Fallback for single-word titles
    generic_titles = ['Manager', 'Director', 'Producer', 'Agent', 'Coach', 'President']

    tokens = normalized.split()
    for i, token in enumerate(tokens):
        if token not in generic_titles:
            continue
        if i > 0 and tokens[i - 1] in ['Casting', 'Talent', 'Executive', 'DDO', 'Acting']:
            # Fold the modifier into a two-word title.
            title = f"{tokens[i - 1]} {token}"
            remainder = tokens[:i - 1] + tokens[i + 1:]
        else:
            title = token
            remainder = tokens[:i] + tokens[i + 1:]
        return title, ' '.join(remainder).strip()

    # Final fallback
    if len(tokens) >= 2:
        return tokens[0], ' '.join(tokens[1:])

    return '', normalized
303
-
304
def parse_refined_workshop_match(match, pattern_num: int, source_url: str) -> Dict[str, str]:
    """Parse a regex match into a clean workshop dictionary.

    Thin adapter for the page-level pattern loop, which counts patterns
    from 1 while parse_pattern_match expects 0-based indices.
    """
    zero_based_index = pattern_num - 1
    return parse_pattern_match(match, zero_based_index, source_url)
307
-
308
def is_duplicate_workshop(new_workshop: Dict, existing_workshops: List[Dict]) -> bool:
    """Enhanced duplicate detection.

    A candidate is a duplicate of an existing entry when instructor name
    and date match case-insensitively AND the titles are equal, both
    mention 'workshop', or one title contains the other.
    """
    candidate_name = new_workshop.get('instructor_name', '').strip().lower()
    candidate_date = new_workshop.get('date', '').strip().lower()
    candidate_title = new_workshop.get('title', '').strip().lower()

    for existing in existing_workshops:
        if existing.get('instructor_name', '').strip().lower() != candidate_name:
            continue
        if existing.get('date', '').strip().lower() != candidate_date:
            continue

        existing_title = existing.get('title', '').strip().lower()

        same_title = existing_title == candidate_title
        both_generic = 'workshop' in existing_title and 'workshop' in candidate_title
        one_contains_other = existing_title in candidate_title or candidate_title in existing_title
        if same_title or both_generic or one_contains_other:
            return True

    return False
322
-
323
def calculate_workshop_confidence(w: Dict) -> float:
    """Calculate confidence score of retrieved workshop data.

    Each populated (truthy) field contributes a fixed weight; the total
    is rounded to two decimal places. Maximum score is 1.0.
    """
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )
    score = sum(weight for field, weight in field_weights if w.get(field))
    return round(score, 2)